1 // SPDX-License-Identifier: GPL-2.0 2 /* Multipath TCP 3 * 4 * Copyright (c) 2017 - 2019, Intel Corporation. 5 */ 6 7 #define pr_fmt(fmt) "MPTCP: " fmt 8 9 #include <linux/kernel.h> 10 #include <linux/module.h> 11 #include <linux/netdevice.h> 12 #include <crypto/sha2.h> 13 #include <crypto/utils.h> 14 #include <net/sock.h> 15 #include <net/inet_common.h> 16 #include <net/inet_hashtables.h> 17 #include <net/protocol.h> 18 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 19 #include <net/ip6_route.h> 20 #include <net/transp_v6.h> 21 #endif 22 #include <net/mptcp.h> 23 #include "protocol.h" 24 #include "mib.h" 25 26 #include <trace/events/mptcp.h> 27 #include <trace/events/sock.h> 28 29 static void mptcp_subflow_ops_undo_override(struct sock *ssk); 30 31 static void SUBFLOW_REQ_INC_STATS(struct request_sock *req, 32 enum linux_mptcp_mib_field field) 33 { 34 MPTCP_INC_STATS(sock_net(req_to_sk(req)), field); 35 } 36 37 static void subflow_req_destructor(struct request_sock *req) 38 { 39 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 40 41 pr_debug("subflow_req=%p", subflow_req); 42 43 if (subflow_req->msk) 44 sock_put((struct sock *)subflow_req->msk); 45 46 mptcp_token_destroy_request(req); 47 } 48 49 static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, 50 void *hmac) 51 { 52 u8 msg[8]; 53 54 put_unaligned_be32(nonce1, &msg[0]); 55 put_unaligned_be32(nonce2, &msg[4]); 56 57 mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); 58 } 59 60 static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk) 61 { 62 return mptcp_is_fully_established((void *)msk) && 63 ((mptcp_pm_is_userspace(msk) && 64 mptcp_userspace_pm_active(msk)) || 65 READ_ONCE(msk->pm.accept_subflow)); 66 } 67 68 /* validate received token and create truncated hmac and nonce for SYN-ACK */ 69 static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req) 70 { 71 struct mptcp_sock *msk = subflow_req->msk; 72 u8 hmac[SHA256_DIGEST_SIZE]; 73 74 get_random_bytes(&subflow_req->local_nonce, sizeof(u32)); 75 76 subflow_generate_hmac(READ_ONCE(msk->local_key), 77 READ_ONCE(msk->remote_key), 78 subflow_req->local_nonce, 79 subflow_req->remote_nonce, hmac); 80 81 subflow_req->thmac = get_unaligned_be64(hmac); 82 } 83 84 static struct mptcp_sock *subflow_token_join_request(struct request_sock *req) 85 { 86 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 87 struct mptcp_sock *msk; 88 int local_id; 89 90 msk = mptcp_token_get_sock(sock_net(req_to_sk(req)), subflow_req->token); 91 if (!msk) { 92 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN); 93 return NULL; 94 } 95 96 local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req); 97 if (local_id < 0) { 98 sock_put((struct sock *)msk); 99 return NULL; 100 } 101 subflow_req->local_id = local_id; 102 103 return msk; 104 } 105 106 static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener) 107 { 108 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 109 110 subflow_req->mp_capable = 0; 111 subflow_req->mp_join = 0; 112 subflow_req->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk_listener)); 113 subflow_req->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk_listener)); 114 subflow_req->msk = NULL; 115 mptcp_token_init_request(req); 116 } 117 118 static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk) 119 { 120 return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport; 121 } 122 123 static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason) 124 { 125 struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP); 126 127 if (mpext) { 128 memset(mpext, 0, sizeof(*mpext)); 129 mpext->reset_reason = reason; 130 } 131 } 132 133 /* Init mptcp request socket. 134 * 135 * Returns an error code if a JOIN has failed and a TCP reset 136 * should be sent. 137 */ 138 static int subflow_check_req(struct request_sock *req, 139 const struct sock *sk_listener, 140 struct sk_buff *skb) 141 { 142 struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); 143 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 144 struct mptcp_options_received mp_opt; 145 bool opt_mp_capable, opt_mp_join; 146 147 pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); 148 149 #ifdef CONFIG_TCP_MD5SIG 150 /* no MPTCP if MD5SIG is enabled on this socket or we may run out of 151 * TCP option space. 152 */ 153 if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) 154 return -EINVAL; 155 #endif 156 157 mptcp_get_options(skb, &mp_opt); 158 159 opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYN); 160 opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYN); 161 if (opt_mp_capable) { 162 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); 163 164 if (opt_mp_join) 165 return 0; 166 } else if (opt_mp_join) { 167 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); 168 } 169 170 if (opt_mp_capable && listener->request_mptcp) { 171 int err, retries = MPTCP_TOKEN_MAX_RETRIES; 172 173 subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; 174 again: 175 do { 176 get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key)); 177 } while (subflow_req->local_key == 0); 178 179 if (unlikely(req->syncookie)) { 180 mptcp_crypto_key_sha(subflow_req->local_key, 181 &subflow_req->token, 182 &subflow_req->idsn); 183 if (mptcp_token_exists(subflow_req->token)) { 184 if (retries-- > 0) 185 goto again; 186 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT); 187 } else { 188 subflow_req->mp_capable = 1; 189 } 190 return 0; 191 } 192 193 err = mptcp_token_new_request(req); 194 if (err == 0) 195 subflow_req->mp_capable = 1; 196 else if (retries-- > 0) 197 goto again; 198 else 199 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT); 200 201 } else if (opt_mp_join && listener->request_mptcp) { 202 subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; 203 subflow_req->mp_join = 1; 204 subflow_req->backup = mp_opt.backup; 205 subflow_req->remote_id = mp_opt.join_id; 206 subflow_req->token = mp_opt.token; 207 subflow_req->remote_nonce = mp_opt.nonce; 208 subflow_req->msk = subflow_token_join_request(req); 209 210 /* Can't fall back to TCP in this case. */ 211 if (!subflow_req->msk) { 212 subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); 213 return -EPERM; 214 } 215 216 if (subflow_use_different_sport(subflow_req->msk, sk_listener)) { 217 pr_debug("syn inet_sport=%d %d", 218 ntohs(inet_sk(sk_listener)->inet_sport), 219 ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport)); 220 if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) { 221 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX); 222 return -EPERM; 223 } 224 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX); 225 } 226 227 subflow_req_create_thmac(subflow_req); 228 229 if (unlikely(req->syncookie)) { 230 if (mptcp_can_accept_new_subflow(subflow_req->msk)) 231 subflow_init_req_cookie_join_save(subflow_req, skb); 232 else 233 return -EPERM; 234 } 235 236 pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token, 237 subflow_req->remote_nonce, subflow_req->msk); 238 } 239 240 return 0; 241 } 242 243 int mptcp_subflow_init_cookie_req(struct request_sock *req, 244 const struct sock *sk_listener, 245 struct sk_buff *skb) 246 { 247 struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); 248 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 249 struct mptcp_options_received mp_opt; 250 bool opt_mp_capable, opt_mp_join; 251 int err; 252 253 subflow_init_req(req, sk_listener); 254 mptcp_get_options(skb, &mp_opt); 255 256 opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_ACK); 257 opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK); 258 if (opt_mp_capable && opt_mp_join) 259 return -EINVAL; 260 261 if (opt_mp_capable && listener->request_mptcp) { 262 if (mp_opt.sndr_key == 0) 263 return -EINVAL; 264 265 subflow_req->local_key = mp_opt.rcvr_key; 266 err = mptcp_token_new_request(req); 267 if (err) 268 return err; 269 270 subflow_req->mp_capable = 1; 271 subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; 272 } else if (opt_mp_join && listener->request_mptcp) { 273 if (!mptcp_token_join_cookie_init_state(subflow_req, skb)) 274 return -EINVAL; 275 276 subflow_req->mp_join = 1; 277 subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; 278 } 279 280 return 0; 281 } 282 EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req); 283 284 static struct dst_entry *subflow_v4_route_req(const struct sock *sk, 285 struct sk_buff *skb, 286 struct flowi *fl, 287 struct request_sock *req) 288 { 289 struct dst_entry *dst; 290 int err; 291 292 tcp_rsk(req)->is_mptcp = 1; 293 subflow_init_req(req, sk); 294 295 dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req); 296 if (!dst) 297 return NULL; 298 299 err = subflow_check_req(req, sk, skb); 300 if (err == 0) 301 return dst; 302 303 dst_release(dst); 304 if (!req->syncookie) 305 tcp_request_sock_ops.send_reset(sk, skb); 306 return NULL; 307 } 308 309 static void subflow_prep_synack(const struct sock *sk, struct request_sock *req, 310 struct tcp_fastopen_cookie *foc, 311 enum tcp_synack_type synack_type) 312 { 313 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 314 struct inet_request_sock *ireq = inet_rsk(req); 315 316 /* clear tstamp_ok, as needed depending on cookie */ 317 if (foc && foc->len > -1) 318 ireq->tstamp_ok = 0; 319 320 if (synack_type == TCP_SYNACK_FASTOPEN) 321 mptcp_fastopen_subflow_synack_set_params(subflow, req); 322 } 323 324 static int subflow_v4_send_synack(const struct sock *sk, struct dst_entry *dst, 325 struct flowi *fl, 326 struct request_sock *req, 327 struct tcp_fastopen_cookie *foc, 328 enum tcp_synack_type synack_type, 329 struct sk_buff *syn_skb) 330 { 331 subflow_prep_synack(sk, req, foc, synack_type); 332 333 return tcp_request_sock_ipv4_ops.send_synack(sk, dst, fl, req, foc, 334 synack_type, syn_skb); 335 } 336 337 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 338 static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst, 339 struct flowi *fl, 340 struct request_sock *req, 341 struct tcp_fastopen_cookie *foc, 342 enum tcp_synack_type synack_type, 343 struct sk_buff *syn_skb) 344 { 345 subflow_prep_synack(sk, req, foc, synack_type); 346 347 return tcp_request_sock_ipv6_ops.send_synack(sk, dst, fl, req, foc, 348 synack_type, syn_skb); 349 } 350 351 static struct dst_entry *subflow_v6_route_req(const struct sock *sk, 352 struct sk_buff *skb, 353 struct flowi *fl, 354 struct request_sock *req) 355 { 356 struct dst_entry *dst; 357 int err; 358 359 tcp_rsk(req)->is_mptcp = 1; 360 subflow_init_req(req, sk); 361 362 dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req); 363 if (!dst) 364 return NULL; 365 366 err = subflow_check_req(req, sk, skb); 367 if (err == 0) 368 return dst; 369 370 dst_release(dst); 371 if (!req->syncookie) 372 tcp6_request_sock_ops.send_reset(sk, skb); 373 return NULL; 374 } 375 #endif 376 377 /* validate received truncated hmac and create hmac for third ACK */ 378 static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow) 379 { 380 u8 hmac[SHA256_DIGEST_SIZE]; 381 u64 thmac; 382 383 subflow_generate_hmac(subflow->remote_key, subflow->local_key, 384 subflow->remote_nonce, subflow->local_nonce, 385 hmac); 386 387 thmac = get_unaligned_be64(hmac); 388 pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n", 389 subflow, subflow->token, thmac, subflow->thmac); 390 391 return thmac == subflow->thmac; 392 } 393 394 void mptcp_subflow_reset(struct sock *ssk) 395 { 396 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 397 struct sock *sk = subflow->conn; 398 399 /* mptcp_mp_fail_no_response() can reach here on an already closed 400 * socket 401 */ 402 if (ssk->sk_state == TCP_CLOSE) 403 return; 404 405 /* must hold: tcp_done() could drop last reference on parent */ 406 sock_hold(sk); 407 408 tcp_send_active_reset(ssk, GFP_ATOMIC); 409 tcp_done(ssk); 410 if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags)) 411 mptcp_schedule_work(sk); 412 413 sock_put(sk); 414 } 415 416 static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct sock *sk) 417 { 418 return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport; 419 } 420 421 void __mptcp_sync_state(struct sock *sk, int state) 422 { 423 struct mptcp_subflow_context *subflow; 424 struct mptcp_sock *msk = mptcp_sk(sk); 425 struct sock *ssk = msk->first; 426 427 subflow = mptcp_subflow_ctx(ssk); 428 __mptcp_propagate_sndbuf(sk, ssk); 429 if (!msk->rcvspace_init) 430 mptcp_rcv_space_init(msk, ssk); 431 432 if (sk->sk_state == TCP_SYN_SENT) { 433 /* subflow->idsn is always available is TCP_SYN_SENT state, 434 * even for the FASTOPEN scenarios 435 */ 436 WRITE_ONCE(msk->write_seq, subflow->idsn + 1); 437 WRITE_ONCE(msk->snd_nxt, msk->write_seq); 438 mptcp_set_state(sk, state); 439 sk->sk_state_change(sk); 440 } 441 } 442 443 static void subflow_set_remote_key(struct mptcp_sock *msk, 444 struct mptcp_subflow_context *subflow, 445 const struct mptcp_options_received *mp_opt) 446 { 447 /* active MPC subflow will reach here multiple times: 448 * at subflow_finish_connect() time and at 4th ack time 449 */ 450 if (subflow->remote_key_valid) 451 return; 452 453 subflow->remote_key_valid = 1; 454 subflow->remote_key = mp_opt->sndr_key; 455 mptcp_crypto_key_sha(subflow->remote_key, NULL, &subflow->iasn); 456 subflow->iasn++; 457 458 WRITE_ONCE(msk->remote_key, subflow->remote_key); 459 WRITE_ONCE(msk->ack_seq, subflow->iasn); 460 WRITE_ONCE(msk->can_ack, true); 461 atomic64_set(&msk->rcv_wnd_sent, subflow->iasn); 462 } 463 464 static void mptcp_propagate_state(struct sock *sk, struct sock *ssk, 465 struct mptcp_subflow_context *subflow, 466 const struct mptcp_options_received *mp_opt) 467 { 468 struct mptcp_sock *msk = mptcp_sk(sk); 469 470 mptcp_data_lock(sk); 471 if (mp_opt) { 472 /* Options are available only in the non fallback cases 473 * avoid updating rx path fields otherwise 474 */ 475 WRITE_ONCE(msk->snd_una, subflow->idsn + 1); 476 WRITE_ONCE(msk->wnd_end, subflow->idsn + 1 + tcp_sk(ssk)->snd_wnd); 477 subflow_set_remote_key(msk, subflow, mp_opt); 478 } 479 480 if (!sock_owned_by_user(sk)) { 481 __mptcp_sync_state(sk, ssk->sk_state); 482 } else { 483 msk->pending_state = ssk->sk_state; 484 __set_bit(MPTCP_SYNC_STATE, &msk->cb_flags); 485 } 486 mptcp_data_unlock(sk); 487 } 488 489 static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) 490 { 491 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 492 struct mptcp_options_received mp_opt; 493 struct sock *parent = subflow->conn; 494 struct mptcp_sock *msk; 495 496 subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); 497 498 /* be sure no special action on any packet other than syn-ack */ 499 if (subflow->conn_finished) 500 return; 501 502 msk = mptcp_sk(parent); 503 subflow->rel_write_seq = 1; 504 subflow->conn_finished = 1; 505 subflow->ssn_offset = TCP_SKB_CB(skb)->seq; 506 pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); 507 508 mptcp_get_options(skb, &mp_opt); 509 if (subflow->request_mptcp) { 510 if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYNACK)) { 511 MPTCP_INC_STATS(sock_net(sk), 512 MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); 513 mptcp_do_fallback(sk); 514 pr_fallback(msk); 515 goto fallback; 516 } 517 518 if (mp_opt.suboptions & OPTION_MPTCP_CSUMREQD) 519 WRITE_ONCE(msk->csum_enabled, true); 520 if (mp_opt.deny_join_id0) 521 WRITE_ONCE(msk->pm.remote_deny_join_id0, true); 522 subflow->mp_capable = 1; 523 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); 524 mptcp_finish_connect(sk); 525 mptcp_propagate_state(parent, sk, subflow, &mp_opt); 526 } else if (subflow->request_join) { 527 u8 hmac[SHA256_DIGEST_SIZE]; 528 529 if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYNACK)) { 530 subflow->reset_reason = MPTCP_RST_EMPTCP; 531 goto do_reset; 532 } 533 534 subflow->backup = mp_opt.backup; 535 subflow->thmac = mp_opt.thmac; 536 subflow->remote_nonce = mp_opt.nonce; 537 WRITE_ONCE(subflow->remote_id, mp_opt.join_id); 538 pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d", 539 subflow, subflow->thmac, subflow->remote_nonce, 540 subflow->backup); 541 542 if (!subflow_thmac_valid(subflow)) { 543 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); 544 subflow->reset_reason = MPTCP_RST_EMPTCP; 545 goto do_reset; 546 } 547 548 if (!mptcp_finish_join(sk)) 549 goto do_reset; 550 551 subflow_generate_hmac(subflow->local_key, subflow->remote_key, 552 subflow->local_nonce, 553 subflow->remote_nonce, 554 hmac); 555 memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN); 556 557 subflow->mp_join = 1; 558 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); 559 560 if (subflow_use_different_dport(msk, sk)) { 561 pr_debug("synack inet_dport=%d %d", 562 ntohs(inet_sk(sk)->inet_dport), 563 ntohs(inet_sk(parent)->inet_dport)); 564 MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX); 565 } 566 } else if (mptcp_check_fallback(sk)) { 567 fallback: 568 mptcp_propagate_state(parent, sk, subflow, NULL); 569 } 570 return; 571 572 do_reset: 573 subflow->reset_transient = 0; 574 mptcp_subflow_reset(sk); 575 } 576 577 static void subflow_set_local_id(struct mptcp_subflow_context *subflow, int local_id) 578 { 579 WARN_ON_ONCE(local_id < 0 || local_id > 255); 580 WRITE_ONCE(subflow->local_id, local_id); 581 } 582 583 static int subflow_chk_local_id(struct sock *sk) 584 { 585 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 586 struct mptcp_sock *msk = mptcp_sk(subflow->conn); 587 int err; 588 589 if (likely(subflow->local_id >= 0)) 590 return 0; 591 592 err = mptcp_pm_get_local_id(msk, (struct sock_common *)sk); 593 if (err < 0) 594 return err; 595 596 subflow_set_local_id(subflow, err); 597 return 0; 598 } 599 600 static int subflow_rebuild_header(struct sock *sk) 601 { 602 int err = subflow_chk_local_id(sk); 603 604 if (unlikely(err < 0)) 605 return err; 606 607 return inet_sk_rebuild_header(sk); 608 } 609 610 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 611 static int subflow_v6_rebuild_header(struct sock *sk) 612 { 613 int err = subflow_chk_local_id(sk); 614 615 if (unlikely(err < 0)) 616 return err; 617 618 return inet6_sk_rebuild_header(sk); 619 } 620 #endif 621 622 static struct request_sock_ops mptcp_subflow_v4_request_sock_ops __ro_after_init; 623 static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init; 624 625 static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) 626 { 627 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 628 629 pr_debug("subflow=%p", subflow); 630 631 /* Never answer to SYNs sent to broadcast or multicast */ 632 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) 633 goto drop; 634 635 return tcp_conn_request(&mptcp_subflow_v4_request_sock_ops, 636 &subflow_request_sock_ipv4_ops, 637 sk, skb); 638 drop: 639 tcp_listendrop(sk); 640 return 0; 641 } 642 643 static void subflow_v4_req_destructor(struct request_sock *req) 644 { 645 subflow_req_destructor(req); 646 tcp_request_sock_ops.destructor(req); 647 } 648 649 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 650 static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init; 651 static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init; 652 static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init; 653 static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init; 654 static struct proto tcpv6_prot_override __ro_after_init; 655 656 static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) 657 { 658 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 659 660 pr_debug("subflow=%p", subflow); 661 662 if (skb->protocol == htons(ETH_P_IP)) 663 return subflow_v4_conn_request(sk, skb); 664 665 if (!ipv6_unicast_destination(skb)) 666 goto drop; 667 668 if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { 669 __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); 670 return 0; 671 } 672 673 return tcp_conn_request(&mptcp_subflow_v6_request_sock_ops, 674 &subflow_request_sock_ipv6_ops, sk, skb); 675 676 drop: 677 tcp_listendrop(sk); 678 return 0; /* don't send reset */ 679 } 680 681 static void subflow_v6_req_destructor(struct request_sock *req) 682 { 683 subflow_req_destructor(req); 684 tcp6_request_sock_ops.destructor(req); 685 } 686 #endif 687 688 struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, 689 struct sock *sk_listener, 690 bool attach_listener) 691 { 692 if (ops->family == AF_INET) 693 ops = &mptcp_subflow_v4_request_sock_ops; 694 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 695 else if (ops->family == AF_INET6) 696 ops = &mptcp_subflow_v6_request_sock_ops; 697 #endif 698 699 return inet_reqsk_alloc(ops, sk_listener, attach_listener); 700 } 701 EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc); 702 703 /* validate hmac received in third ACK */ 704 static bool subflow_hmac_valid(const struct request_sock *req, 705 const struct mptcp_options_received *mp_opt) 706 { 707 const struct mptcp_subflow_request_sock *subflow_req; 708 u8 hmac[SHA256_DIGEST_SIZE]; 709 struct mptcp_sock *msk; 710 711 subflow_req = mptcp_subflow_rsk(req); 712 msk = subflow_req->msk; 713 if (!msk) 714 return false; 715 716 subflow_generate_hmac(READ_ONCE(msk->remote_key), 717 READ_ONCE(msk->local_key), 718 subflow_req->remote_nonce, 719 subflow_req->local_nonce, hmac); 720 721 return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN); 722 } 723 724 static void subflow_ulp_fallback(struct sock *sk, 725 struct mptcp_subflow_context *old_ctx) 726 { 727 struct inet_connection_sock *icsk = inet_csk(sk); 728 729 mptcp_subflow_tcp_fallback(sk, old_ctx); 730 icsk->icsk_ulp_ops = NULL; 731 rcu_assign_pointer(icsk->icsk_ulp_data, NULL); 732 tcp_sk(sk)->is_mptcp = 0; 733 734 mptcp_subflow_ops_undo_override(sk); 735 } 736 737 void mptcp_subflow_drop_ctx(struct sock *ssk) 738 { 739 struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); 740 741 if (!ctx) 742 return; 743 744 list_del(&mptcp_subflow_ctx(ssk)->node); 745 if (inet_csk(ssk)->icsk_ulp_ops) { 746 subflow_ulp_fallback(ssk, ctx); 747 if (ctx->conn) 748 sock_put(ctx->conn); 749 } 750 751 kfree_rcu(ctx, rcu); 752 } 753 754 void __mptcp_subflow_fully_established(struct mptcp_sock *msk, 755 struct mptcp_subflow_context *subflow, 756 const struct mptcp_options_received *mp_opt) 757 { 758 subflow_set_remote_key(msk, subflow, mp_opt); 759 subflow->fully_established = 1; 760 WRITE_ONCE(msk->fully_established, true); 761 762 if (subflow->is_mptfo) 763 __mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt); 764 } 765 766 static struct sock *subflow_syn_recv_sock(const struct sock *sk, 767 struct sk_buff *skb, 768 struct request_sock *req, 769 struct dst_entry *dst, 770 struct request_sock *req_unhash, 771 bool *own_req) 772 { 773 struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); 774 struct mptcp_subflow_request_sock *subflow_req; 775 struct mptcp_options_received mp_opt; 776 bool fallback, fallback_is_fatal; 777 struct mptcp_sock *owner; 778 struct sock *child; 779 780 pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); 781 782 /* After child creation we must look for MPC even when options 783 * are not parsed 784 */ 785 mp_opt.suboptions = 0; 786 787 /* hopefully temporary handling for MP_JOIN+syncookie */ 788 subflow_req = mptcp_subflow_rsk(req); 789 fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join; 790 fallback = !tcp_rsk(req)->is_mptcp; 791 if (fallback) 792 goto create_child; 793 794 /* if the sk is MP_CAPABLE, we try to fetch the client key */ 795 if (subflow_req->mp_capable) { 796 /* we can receive and accept an in-window, out-of-order pkt, 797 * which may not carry the MP_CAPABLE opt even on mptcp enabled 798 * paths: always try to extract the peer key, and fallback 799 * for packets missing it. 800 * Even OoO DSS packets coming legitly after dropped or 801 * reordered MPC will cause fallback, but we don't have other 802 * options. 803 */ 804 mptcp_get_options(skb, &mp_opt); 805 if (!(mp_opt.suboptions & 806 (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_ACK))) 807 fallback = true; 808 809 } else if (subflow_req->mp_join) { 810 mptcp_get_options(skb, &mp_opt); 811 if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK) || 812 !subflow_hmac_valid(req, &mp_opt) || 813 !mptcp_can_accept_new_subflow(subflow_req->msk)) { 814 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); 815 fallback = true; 816 } 817 } 818 819 create_child: 820 child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, 821 req_unhash, own_req); 822 823 if (child && *own_req) { 824 struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); 825 826 tcp_rsk(req)->drop_req = false; 827 828 /* we need to fallback on ctx allocation failure and on pre-reqs 829 * checking above. In the latter scenario we additionally need 830 * to reset the context to non MPTCP status. 831 */ 832 if (!ctx || fallback) { 833 if (fallback_is_fatal) { 834 subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); 835 goto dispose_child; 836 } 837 goto fallback; 838 } 839 840 /* ssk inherits options of listener sk */ 841 ctx->setsockopt_seq = listener->setsockopt_seq; 842 843 if (ctx->mp_capable) { 844 ctx->conn = mptcp_sk_clone_init(listener->conn, &mp_opt, child, req); 845 if (!ctx->conn) 846 goto fallback; 847 848 ctx->subflow_id = 1; 849 owner = mptcp_sk(ctx->conn); 850 mptcp_pm_new_connection(owner, child, 1); 851 852 /* with OoO packets we can reach here without ingress 853 * mpc option 854 */ 855 if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) { 856 mptcp_pm_fully_established(owner, child); 857 ctx->pm_notified = 1; 858 } 859 } else if (ctx->mp_join) { 860 owner = subflow_req->msk; 861 if (!owner) { 862 subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); 863 goto dispose_child; 864 } 865 866 /* move the msk reference ownership to the subflow */ 867 subflow_req->msk = NULL; 868 ctx->conn = (struct sock *)owner; 869 870 if (subflow_use_different_sport(owner, sk)) { 871 pr_debug("ack inet_sport=%d %d", 872 ntohs(inet_sk(sk)->inet_sport), 873 ntohs(inet_sk((struct sock *)owner)->inet_sport)); 874 if (!mptcp_pm_sport_in_anno_list(owner, sk)) { 875 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX); 876 goto dispose_child; 877 } 878 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX); 879 } 880 881 if (!mptcp_finish_join(child)) 882 goto dispose_child; 883 884 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); 885 tcp_rsk(req)->drop_req = true; 886 } 887 } 888 889 /* check for expected invariant - should never trigger, just help 890 * catching eariler subtle bugs 891 */ 892 WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp && 893 (!mptcp_subflow_ctx(child) || 894 !mptcp_subflow_ctx(child)->conn)); 895 return child; 896 897 dispose_child: 898 mptcp_subflow_drop_ctx(child); 899 tcp_rsk(req)->drop_req = true; 900 inet_csk_prepare_for_destroy_sock(child); 901 tcp_done(child); 902 req->rsk_ops->send_reset(sk, skb); 903 904 /* The last child reference will be released by the caller */ 905 return child; 906 907 fallback: 908 if (fallback) 909 SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); 910 mptcp_subflow_drop_ctx(child); 911 return child; 912 } 913 914 static struct inet_connection_sock_af_ops subflow_specific __ro_after_init; 915 static struct proto tcp_prot_override __ro_after_init; 916 917 enum mapping_status { 918 MAPPING_OK, 919 MAPPING_INVALID, 920 MAPPING_EMPTY, 921 MAPPING_DATA_FIN, 922 MAPPING_DUMMY, 923 MAPPING_BAD_CSUM 924 }; 925 926 static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) 927 { 928 pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d", 929 ssn, subflow->map_subflow_seq, subflow->map_data_len); 930 } 931 932 static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb) 933 { 934 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 935 unsigned int skb_consumed; 936 937 skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq; 938 if (WARN_ON_ONCE(skb_consumed >= skb->len)) 939 return true; 940 941 return skb->len - skb_consumed <= subflow->map_data_len - 942 mptcp_subflow_get_map_offset(subflow); 943 } 944 945 static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) 946 { 947 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 948 u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; 949 950 if (unlikely(before(ssn, subflow->map_subflow_seq))) { 951 /* Mapping covers data later in the subflow stream, 952 * currently unsupported. 953 */ 954 dbg_bad_map(subflow, ssn); 955 return false; 956 } 957 if (unlikely(!before(ssn, subflow->map_subflow_seq + 958 subflow->map_data_len))) { 959 /* Mapping does covers past subflow data, invalid */ 960 dbg_bad_map(subflow, ssn); 961 return false; 962 } 963 return true; 964 } 965 966 static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *skb, 967 bool csum_reqd) 968 { 969 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 970 u32 offset, seq, delta; 971 __sum16 csum; 972 int len; 973 974 if (!csum_reqd) 975 return MAPPING_OK; 976 977 /* mapping already validated on previous traversal */ 978 if (subflow->map_csum_len == subflow->map_data_len) 979 return MAPPING_OK; 980 981 /* traverse the receive queue, ensuring it contains a full 982 * DSS mapping and accumulating the related csum. 983 * Preserve the accoumlate csum across multiple calls, to compute 984 * the csum only once 985 */ 986 delta = subflow->map_data_len - subflow->map_csum_len; 987 for (;;) { 988 seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len; 989 offset = seq - TCP_SKB_CB(skb)->seq; 990 991 /* if the current skb has not been accounted yet, csum its contents 992 * up to the amount covered by the current DSS 993 */ 994 if (offset < skb->len) { 995 __wsum csum; 996 997 len = min(skb->len - offset, delta); 998 csum = skb_checksum(skb, offset, len, 0); 999 subflow->map_data_csum = csum_block_add(subflow->map_data_csum, csum, 1000 subflow->map_csum_len); 1001 1002 delta -= len; 1003 subflow->map_csum_len += len; 1004 } 1005 if (delta == 0) 1006 break; 1007 1008 if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) { 1009 /* if this subflow is closed, the partial mapping 1010 * will be never completed; flush the pending skbs, so 1011 * that subflow_sched_work_if_closed() can kick in 1012 */ 1013 if (unlikely(ssk->sk_state == TCP_CLOSE)) 1014 while ((skb = skb_peek(&ssk->sk_receive_queue))) 1015 sk_eat_skb(ssk, skb); 1016 1017 /* not enough data to validate the csum */ 1018 return MAPPING_EMPTY; 1019 } 1020 1021 /* the DSS mapping for next skbs will be validated later, 1022 * when a get_mapping_status call will process such skb 1023 */ 1024 skb = skb->next; 1025 } 1026 1027 /* note that 'map_data_len' accounts only for the carried data, does 1028 * not include the eventual seq increment due to the data fin, 1029 * while the pseudo header requires the original DSS data len, 1030 * including that 1031 */ 1032 csum = __mptcp_make_csum(subflow->map_seq, 1033 subflow->map_subflow_seq, 1034 subflow->map_data_len + subflow->map_data_fin, 1035 subflow->map_data_csum); 1036 if (unlikely(csum)) { 1037 MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); 1038 return MAPPING_BAD_CSUM; 1039 } 1040 1041 subflow->valid_csum_seen = 1; 1042 return MAPPING_OK; 1043 } 1044 1045 static enum mapping_status get_mapping_status(struct sock *ssk, 1046 struct mptcp_sock *msk) 1047 { 1048 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 1049 bool csum_reqd = READ_ONCE(msk->csum_enabled); 1050 struct mptcp_ext *mpext; 1051 struct sk_buff *skb; 1052 u16 data_len; 1053 u64 map_seq; 1054 1055 skb = skb_peek(&ssk->sk_receive_queue); 1056 if (!skb) 1057 return MAPPING_EMPTY; 1058 1059 if (mptcp_check_fallback(ssk)) 1060 return MAPPING_DUMMY; 1061 1062 mpext = mptcp_get_ext(skb); 1063 if (!mpext || !mpext->use_map) { 1064 if (!subflow->map_valid && !skb->len) { 1065 /* the TCP stack deliver 0 len FIN pkt to the receive 1066 * queue, that is the only 0len pkts ever expected here, 1067 * and we can admit no mapping only for 0 len pkts 1068 */ 1069 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) 1070 WARN_ONCE(1, "0len seq %d:%d flags %x", 1071 TCP_SKB_CB(skb)->seq, 1072 TCP_SKB_CB(skb)->end_seq, 1073 TCP_SKB_CB(skb)->tcp_flags); 1074 sk_eat_skb(ssk, skb); 1075 return MAPPING_EMPTY; 1076 } 1077 1078 if (!subflow->map_valid) 1079 return MAPPING_INVALID; 1080 1081 goto validate_seq; 1082 } 1083 1084 trace_get_mapping_status(mpext); 1085 1086 data_len = mpext->data_len; 1087 if (data_len == 0) { 1088 pr_debug("infinite mapping received"); 1089 MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); 1090 subflow->map_data_len = 0; 1091 return MAPPING_INVALID; 1092 } 1093 1094 if (mpext->data_fin == 1) { 1095 if (data_len == 1) { 1096 bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq, 1097 mpext->dsn64); 1098 pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq); 1099 if (subflow->map_valid) { 1100 /* A DATA_FIN might arrive in a DSS 1101 * option before the previous mapping 1102 * has been fully consumed. Continue 1103 * handling the existing mapping. 1104 */ 1105 skb_ext_del(skb, SKB_EXT_MPTCP); 1106 return MAPPING_OK; 1107 } else { 1108 if (updated) 1109 mptcp_schedule_work((struct sock *)msk); 1110 1111 return MAPPING_DATA_FIN; 1112 } 1113 } else { 1114 u64 data_fin_seq = mpext->data_seq + data_len - 1; 1115 1116 /* If mpext->data_seq is a 32-bit value, data_fin_seq 1117 * must also be limited to 32 bits. 1118 */ 1119 if (!mpext->dsn64) 1120 data_fin_seq &= GENMASK_ULL(31, 0); 1121 1122 mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64); 1123 pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d", 1124 data_fin_seq, mpext->dsn64); 1125 } 1126 1127 /* Adjust for DATA_FIN using 1 byte of sequence space */ 1128 data_len--; 1129 } 1130 1131 map_seq = mptcp_expand_seq(READ_ONCE(msk->ack_seq), mpext->data_seq, mpext->dsn64); 1132 WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64); 1133 1134 if (subflow->map_valid) { 1135 /* Allow replacing only with an identical map */ 1136 if (subflow->map_seq == map_seq && 1137 subflow->map_subflow_seq == mpext->subflow_seq && 1138 subflow->map_data_len == data_len && 1139 subflow->map_csum_reqd == mpext->csum_reqd) { 1140 skb_ext_del(skb, SKB_EXT_MPTCP); 1141 goto validate_csum; 1142 } 1143 1144 /* If this skb data are fully covered by the current mapping, 1145 * the new map would need caching, which is not supported 1146 */ 1147 if (skb_is_fully_mapped(ssk, skb)) { 1148 MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH); 1149 return MAPPING_INVALID; 1150 } 1151 1152 /* will validate the next map after consuming the current one */ 1153 goto validate_csum; 1154 } 1155 1156 subflow->map_seq = map_seq; 1157 subflow->map_subflow_seq = mpext->subflow_seq; 1158 subflow->map_data_len = data_len; 1159 subflow->map_valid = 1; 1160 subflow->map_data_fin = mpext->data_fin; 1161 subflow->mpc_map = mpext->mpc_map; 1162 subflow->map_csum_reqd = mpext->csum_reqd; 1163 subflow->map_csum_len = 0; 1164 subflow->map_data_csum = csum_unfold(mpext->csum); 1165 1166 /* Cfr RFC 8684 Section 3.3.0 */ 1167 if (unlikely(subflow->map_csum_reqd != csum_reqd)) 1168 return MAPPING_INVALID; 1169 1170 pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", 1171 subflow->map_seq, subflow->map_subflow_seq, 1172 subflow->map_data_len, subflow->map_csum_reqd, 1173 subflow->map_data_csum); 1174 1175 validate_seq: 1176 /* we revalidate valid mapping on new skb, because we must ensure 1177 * the current skb is completely covered by the available mapping 1178 */ 1179 if (!validate_mapping(ssk, skb)) { 1180 MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSTCPMISMATCH); 1181 return MAPPING_INVALID; 1182 } 1183 1184 skb_ext_del(skb, SKB_EXT_MPTCP); 1185 1186 validate_csum: 1187 return validate_data_csum(ssk, skb, csum_reqd); 1188 } 1189 1190 static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, 1191 u64 limit) 1192 { 1193 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 1194 bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; 1195 u32 incr; 1196 1197 incr = limit >= skb->len ? skb->len + fin : limit; 1198 1199 pr_debug("discarding=%d len=%d seq=%d", incr, skb->len, 1200 subflow->map_subflow_seq); 1201 MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA); 1202 tcp_sk(ssk)->copied_seq += incr; 1203 if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq)) 1204 sk_eat_skb(ssk, skb); 1205 if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) 1206 subflow->map_valid = 0; 1207 } 1208 1209 /* sched mptcp worker to remove the subflow if no more data is pending */ 1210 static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) 1211 { 1212 if (likely(ssk->sk_state != TCP_CLOSE)) 1213 return; 1214 1215 if (skb_queue_empty(&ssk->sk_receive_queue) && 1216 !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) 1217 mptcp_schedule_work((struct sock *)msk); 1218 } 1219 1220 static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) 1221 { 1222 struct mptcp_sock *msk = mptcp_sk(subflow->conn); 1223 1224 if (subflow->mp_join) 1225 return false; 1226 else if (READ_ONCE(msk->csum_enabled)) 1227 return !subflow->valid_csum_seen; 1228 else 1229 return !subflow->fully_established; 1230 } 1231 1232 static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) 1233 { 1234 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 1235 unsigned long fail_tout; 1236 1237 /* greceful failure can happen only on the MPC subflow */ 1238 if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first))) 1239 return; 1240 1241 /* since the close timeout take precedence on the fail one, 1242 * no need to start the latter when the first is already set 1243 */ 1244 if (sock_flag((struct sock *)msk, SOCK_DEAD)) 1245 return; 1246 1247 /* we don't need extreme accuracy here, use a zero fail_tout as special 1248 * value meaning no fail timeout at all; 1249 */ 1250 fail_tout = jiffies + TCP_RTO_MAX; 1251 if (!fail_tout) 1252 fail_tout = 1; 1253 WRITE_ONCE(subflow->fail_tout, fail_tout); 1254 tcp_send_ack(ssk); 1255 1256 mptcp_reset_tout_timer(msk, subflow->fail_tout); 1257 } 1258 1259 static bool subflow_check_data_avail(struct sock *ssk) 1260 { 1261 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 1262 enum mapping_status status; 1263 struct mptcp_sock *msk; 1264 struct sk_buff *skb; 1265 1266 if (!skb_peek(&ssk->sk_receive_queue)) 1267 WRITE_ONCE(subflow->data_avail, false); 1268 if (subflow->data_avail) 1269 return true; 1270 1271 msk = mptcp_sk(subflow->conn); 1272 for (;;) { 1273 u64 ack_seq; 1274 u64 old_ack; 1275 1276 status = get_mapping_status(ssk, msk); 1277 trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue)); 1278 if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY || 1279 status == MAPPING_BAD_CSUM)) 1280 goto fallback; 1281 1282 if (status != MAPPING_OK) 1283 goto no_data; 1284 1285 skb = skb_peek(&ssk->sk_receive_queue); 1286 if (WARN_ON_ONCE(!skb)) 1287 goto no_data; 1288 1289 if (unlikely(!READ_ONCE(msk->can_ack))) 1290 goto fallback; 1291 1292 old_ack = READ_ONCE(msk->ack_seq); 1293 ack_seq = mptcp_subflow_get_mapped_dsn(subflow); 1294 pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack, 1295 ack_seq); 1296 if (unlikely(before64(ack_seq, old_ack))) { 1297 mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq); 1298 continue; 1299 } 1300 1301 WRITE_ONCE(subflow->data_avail, true); 1302 break; 1303 } 1304 return true; 1305 1306 no_data: 1307 subflow_sched_work_if_closed(msk, ssk); 1308 return false; 1309 1310 fallback: 1311 if (!__mptcp_check_fallback(msk)) { 1312 /* RFC 8684 section 3.7. */ 1313 if (status == MAPPING_BAD_CSUM && 1314 (subflow->mp_join || subflow->valid_csum_seen)) { 1315 subflow->send_mp_fail = 1; 1316 1317 if (!READ_ONCE(msk->allow_infinite_fallback)) { 1318 subflow->reset_transient = 0; 1319 subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; 1320 goto reset; 1321 } 1322 mptcp_subflow_fail(msk, ssk); 1323 WRITE_ONCE(subflow->data_avail, true); 1324 return true; 1325 } 1326 1327 if (!subflow_can_fallback(subflow) && subflow->map_data_len) { 1328 /* fatal protocol error, close the socket. 1329 * subflow_error_report() will introduce the appropriate barriers 1330 */ 1331 subflow->reset_transient = 0; 1332 subflow->reset_reason = MPTCP_RST_EMPTCP; 1333 1334 reset: 1335 WRITE_ONCE(ssk->sk_err, EBADMSG); 1336 tcp_set_state(ssk, TCP_CLOSE); 1337 while ((skb = skb_peek(&ssk->sk_receive_queue))) 1338 sk_eat_skb(ssk, skb); 1339 tcp_send_active_reset(ssk, GFP_ATOMIC); 1340 WRITE_ONCE(subflow->data_avail, false); 1341 return false; 1342 } 1343 1344 mptcp_do_fallback(ssk); 1345 } 1346 1347 skb = skb_peek(&ssk->sk_receive_queue); 1348 subflow->map_valid = 1; 1349 subflow->map_seq = READ_ONCE(msk->ack_seq); 1350 subflow->map_data_len = skb->len; 1351 subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; 1352 WRITE_ONCE(subflow->data_avail, true); 1353 return true; 1354 } 1355 1356 bool mptcp_subflow_data_available(struct sock *sk) 1357 { 1358 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 1359 1360 /* check if current mapping is still valid */ 1361 if (subflow->map_valid && 1362 mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) { 1363 subflow->map_valid = 0; 1364 WRITE_ONCE(subflow->data_avail, false); 1365 1366 pr_debug("Done with mapping: seq=%u data_len=%u", 1367 subflow->map_subflow_seq, 1368 subflow->map_data_len); 1369 } 1370 1371 return subflow_check_data_avail(sk); 1372 } 1373 1374 /* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy, 1375 * not the ssk one. 1376 * 1377 * In mptcp, rwin is about the mptcp-level connection data. 1378 * 1379 * Data that is still on the ssk rx queue can thus be ignored, 1380 * as far as mptcp peer is concerned that data is still inflight. 1381 * DSS ACK is updated when skb is moved to the mptcp rx queue. 1382 */ 1383 void mptcp_space(const struct sock *ssk, int *space, int *full_space) 1384 { 1385 const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 1386 const struct sock *sk = subflow->conn; 1387 1388 *space = __mptcp_space(sk); 1389 *full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); 1390 } 1391 1392 static void subflow_error_report(struct sock *ssk) 1393 { 1394 struct sock *sk = mptcp_subflow_ctx(ssk)->conn; 1395 1396 /* bail early if this is a no-op, so that we avoid introducing a 1397 * problematic lockdep dependency between TCP accept queue lock 1398 * and msk socket spinlock 1399 */ 1400 if (!sk->sk_socket) 1401 return; 1402 1403 mptcp_data_lock(sk); 1404 if (!sock_owned_by_user(sk)) 1405 __mptcp_error_report(sk); 1406 else 1407 __set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->cb_flags); 1408 mptcp_data_unlock(sk); 1409 } 1410 1411 static void subflow_data_ready(struct sock *sk) 1412 { 1413 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 1414 u16 state = 1 << inet_sk_state_load(sk); 1415 struct sock *parent = subflow->conn; 1416 struct mptcp_sock *msk; 1417 1418 trace_sk_data_ready(sk); 1419 1420 msk = mptcp_sk(parent); 1421 if (state & TCPF_LISTEN) { 1422 /* MPJ subflow are removed from accept queue before reaching here, 1423 * avoid stray wakeups 1424 */ 1425 if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue)) 1426 return; 1427 1428 parent->sk_data_ready(parent); 1429 return; 1430 } 1431 1432 WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable && 1433 !subflow->mp_join && !(state & TCPF_CLOSE)); 1434 1435 if (mptcp_subflow_data_available(sk)) { 1436 mptcp_data_ready(parent, sk); 1437 1438 /* subflow-level lowat test are not relevant. 1439 * respect the msk-level threshold eventually mandating an immediate ack 1440 */ 1441 if (mptcp_data_avail(msk) < parent->sk_rcvlowat && 1442 (tcp_sk(sk)->rcv_nxt - tcp_sk(sk)->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss) 1443 inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; 1444 } else if (unlikely(sk->sk_err)) { 1445 subflow_error_report(sk); 1446 } 1447 } 1448 1449 static void subflow_write_space(struct sock *ssk) 1450 { 1451 struct sock *sk = mptcp_subflow_ctx(ssk)->conn; 1452 1453 mptcp_propagate_sndbuf(sk, ssk); 1454 mptcp_write_space(sk); 1455 } 1456 1457 static const struct inet_connection_sock_af_ops * 1458 subflow_default_af_ops(struct sock *sk) 1459 { 1460 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 1461 if (sk->sk_family == AF_INET6) 1462 return &subflow_v6_specific; 1463 #endif 1464 return &subflow_specific; 1465 } 1466 1467 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 1468 void mptcpv6_handle_mapped(struct sock *sk, bool mapped) 1469 { 1470 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 1471 struct inet_connection_sock *icsk = inet_csk(sk); 1472 const struct inet_connection_sock_af_ops *target; 1473 1474 target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk); 1475 1476 pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d", 1477 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped); 1478 1479 if (likely(icsk->icsk_af_ops == target)) 1480 return; 1481 1482 subflow->icsk_af_ops = icsk->icsk_af_ops; 1483 icsk->icsk_af_ops = target; 1484 } 1485 #endif 1486 1487 void mptcp_info2sockaddr(const struct mptcp_addr_info *info, 1488 struct sockaddr_storage *addr, 1489 unsigned short family) 1490 { 1491 memset(addr, 0, sizeof(*addr)); 1492 addr->ss_family = family; 1493 if (addr->ss_family == AF_INET) { 1494 struct sockaddr_in *in_addr = (struct sockaddr_in *)addr; 1495 1496 if (info->family == AF_INET) 1497 in_addr->sin_addr = info->addr; 1498 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 1499 else if (ipv6_addr_v4mapped(&info->addr6)) 1500 in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3]; 1501 #endif 1502 in_addr->sin_port = info->port; 1503 } 1504 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 1505 else if (addr->ss_family == AF_INET6) { 1506 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr; 1507 1508 if (info->family == AF_INET) 1509 ipv6_addr_set_v4mapped(info->addr.s_addr, 1510 &in6_addr->sin6_addr); 1511 else 1512 in6_addr->sin6_addr = info->addr6; 1513 in6_addr->sin6_port = info->port; 1514 } 1515 #endif 1516 } 1517 1518 int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, 1519 const struct mptcp_addr_info *remote) 1520 { 1521 struct mptcp_sock *msk = mptcp_sk(sk); 1522 struct mptcp_subflow_context *subflow; 1523 struct sockaddr_storage addr; 1524 int remote_id = remote->id; 1525 int local_id = loc->id; 1526 int err = -ENOTCONN; 1527 struct socket *sf; 1528 struct sock *ssk; 1529 u32 remote_token; 1530 int addrlen; 1531 int ifindex; 1532 u8 flags; 1533 1534 if (!mptcp_is_fully_established(sk)) 1535 goto err_out; 1536 1537 err = mptcp_subflow_create_socket(sk, loc->family, &sf); 1538 if (err) 1539 goto err_out; 1540 1541 ssk = sf->sk; 1542 subflow = mptcp_subflow_ctx(ssk); 1543 do { 1544 get_random_bytes(&subflow->local_nonce, sizeof(u32)); 1545 } while (!subflow->local_nonce); 1546 1547 if (local_id) 1548 subflow_set_local_id(subflow, local_id); 1549 1550 mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id, 1551 &flags, &ifindex); 1552 subflow->remote_key_valid = 1; 1553 subflow->remote_key = READ_ONCE(msk->remote_key); 1554 subflow->local_key = READ_ONCE(msk->local_key); 1555 subflow->token = msk->token; 1556 mptcp_info2sockaddr(loc, &addr, ssk->sk_family); 1557 1558 addrlen = sizeof(struct sockaddr_in); 1559 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 1560 if (addr.ss_family == AF_INET6) 1561 addrlen = sizeof(struct sockaddr_in6); 1562 #endif 1563 ssk->sk_bound_dev_if = ifindex; 1564 err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); 1565 if (err) 1566 goto failed; 1567 1568 mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL); 1569 pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk, 1570 remote_token, local_id, remote_id); 1571 subflow->remote_token = remote_token; 1572 WRITE_ONCE(subflow->remote_id, remote_id); 1573 subflow->request_join = 1; 1574 subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP); 1575 subflow->subflow_id = msk->subflow_id++; 1576 mptcp_info2sockaddr(remote, &addr, ssk->sk_family); 1577 1578 sock_hold(ssk); 1579 list_add_tail(&subflow->node, &msk->conn_list); 1580 err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); 1581 if (err && err != -EINPROGRESS) 1582 goto failed_unlink; 1583 1584 /* discard the subflow socket */ 1585 mptcp_sock_graft(ssk, sk->sk_socket); 1586 iput(SOCK_INODE(sf)); 1587 WRITE_ONCE(msk->allow_infinite_fallback, false); 1588 mptcp_stop_tout_timer(sk); 1589 return 0; 1590 1591 failed_unlink: 1592 list_del(&subflow->node); 1593 sock_put(mptcp_subflow_tcp_sock(subflow)); 1594 1595 failed: 1596 subflow->disposable = 1; 1597 sock_release(sf); 1598 1599 err_out: 1600 /* we account subflows before the creation, and this failures will not 1601 * be caught by sk_state_change() 1602 */ 1603 mptcp_pm_close_subflow(msk); 1604 return err; 1605 } 1606 1607 static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) 1608 { 1609 #ifdef CONFIG_SOCK_CGROUP_DATA 1610 struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data, 1611 *child_skcd = &child->sk_cgrp_data; 1612 1613 /* only the additional subflows created by kworkers have to be modified */ 1614 if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != 1615 cgroup_id(sock_cgroup_ptr(child_skcd))) { 1616 #ifdef CONFIG_MEMCG 1617 struct mem_cgroup *memcg = parent->sk_memcg; 1618 1619 mem_cgroup_sk_free(child); 1620 if (memcg && css_tryget(&memcg->css)) 1621 child->sk_memcg = memcg; 1622 #endif /* CONFIG_MEMCG */ 1623 1624 cgroup_sk_free(child_skcd); 1625 *child_skcd = *parent_skcd; 1626 cgroup_sk_clone(child_skcd); 1627 } 1628 #endif /* CONFIG_SOCK_CGROUP_DATA */ 1629 } 1630 1631 static void mptcp_subflow_ops_override(struct sock *ssk) 1632 { 1633 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 1634 if (ssk->sk_prot == &tcpv6_prot) 1635 ssk->sk_prot = &tcpv6_prot_override; 1636 else 1637 #endif 1638 ssk->sk_prot = &tcp_prot_override; 1639 } 1640 1641 static void mptcp_subflow_ops_undo_override(struct sock *ssk) 1642 { 1643 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 1644 if (ssk->sk_prot == &tcpv6_prot_override) 1645 ssk->sk_prot = &tcpv6_prot; 1646 else 1647 #endif 1648 ssk->sk_prot = &tcp_prot; 1649 } 1650 1651 int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, 1652 struct socket **new_sock) 1653 { 1654 struct mptcp_subflow_context *subflow; 1655 struct net *net = sock_net(sk); 1656 struct socket *sf; 1657 int err; 1658 1659 /* un-accepted server sockets can reach here - on bad configuration 1660 * bail early to avoid greater trouble later 1661 */ 1662 if (unlikely(!sk->sk_socket)) 1663 return -EINVAL; 1664 1665 err = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &sf); 1666 if (err) 1667 return err; 1668 1669 lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING); 1670 1671 err = security_mptcp_add_subflow(sk, sf->sk); 1672 if (err) 1673 goto err_free; 1674 1675 /* the newly created socket has to be in the same cgroup as its parent */ 1676 mptcp_attach_cgroup(sk, sf->sk); 1677 1678 /* kernel sockets do not by default acquire net ref, but TCP timer 1679 * needs it. 1680 * Update ns_tracker to current stack trace and refcounted tracker. 1681 */ 1682 __netns_tracker_free(net, &sf->sk->ns_tracker, false); 1683 sf->sk->sk_net_refcnt = 1; 1684 get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL); 1685 sock_inuse_add(net, 1); 1686 err = tcp_set_ulp(sf->sk, "mptcp"); 1687 if (err) 1688 goto err_free; 1689 1690 mptcp_sockopt_sync_locked(mptcp_sk(sk), sf->sk); 1691 release_sock(sf->sk); 1692 1693 /* the newly created socket really belongs to the owning MPTCP master 1694 * socket, even if for additional subflows the allocation is performed 1695 * by a kernel workqueue. Adjust inode references, so that the 1696 * procfs/diag interfaces really show this one belonging to the correct 1697 * user. 1698 */ 1699 SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino; 1700 SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid; 1701 SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid; 1702 1703 subflow = mptcp_subflow_ctx(sf->sk); 1704 pr_debug("subflow=%p", subflow); 1705 1706 *new_sock = sf; 1707 sock_hold(sk); 1708 subflow->conn = sk; 1709 mptcp_subflow_ops_override(sf->sk); 1710 1711 return 0; 1712 1713 err_free: 1714 release_sock(sf->sk); 1715 sock_release(sf); 1716 return err; 1717 } 1718 1719 static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, 1720 gfp_t priority) 1721 { 1722 struct inet_connection_sock *icsk = inet_csk(sk); 1723 struct mptcp_subflow_context *ctx; 1724 1725 ctx = kzalloc(sizeof(*ctx), priority); 1726 if (!ctx) 1727 return NULL; 1728 1729 rcu_assign_pointer(icsk->icsk_ulp_data, ctx); 1730 INIT_LIST_HEAD(&ctx->node); 1731 INIT_LIST_HEAD(&ctx->delegated_node); 1732 1733 pr_debug("subflow=%p", ctx); 1734 1735 ctx->tcp_sock = sk; 1736 WRITE_ONCE(ctx->local_id, -1); 1737 1738 return ctx; 1739 } 1740 1741 static void __subflow_state_change(struct sock *sk) 1742 { 1743 struct socket_wq *wq; 1744 1745 rcu_read_lock(); 1746 wq = rcu_dereference(sk->sk_wq); 1747 if (skwq_has_sleeper(wq)) 1748 wake_up_interruptible_all(&wq->wait); 1749 rcu_read_unlock(); 1750 } 1751 1752 static bool subflow_is_done(const struct sock *sk) 1753 { 1754 return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; 1755 } 1756 1757 static void subflow_state_change(struct sock *sk) 1758 { 1759 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 1760 struct sock *parent = subflow->conn; 1761 struct mptcp_sock *msk; 1762 1763 __subflow_state_change(sk); 1764 1765 msk = mptcp_sk(parent); 1766 if (subflow_simultaneous_connect(sk)) { 1767 mptcp_do_fallback(sk); 1768 pr_fallback(msk); 1769 subflow->conn_finished = 1; 1770 mptcp_propagate_state(parent, sk, subflow, NULL); 1771 } 1772 1773 /* as recvmsg() does not acquire the subflow socket for ssk selection 1774 * a fin packet carrying a DSS can be unnoticed if we don't trigger 1775 * the data available machinery here. 1776 */ 1777 if (mptcp_subflow_data_available(sk)) 1778 mptcp_data_ready(parent, sk); 1779 else if (unlikely(sk->sk_err)) 1780 subflow_error_report(sk); 1781 1782 subflow_sched_work_if_closed(mptcp_sk(parent), sk); 1783 1784 /* when the fallback subflow closes the rx side, trigger a 'dummy' 1785 * ingress data fin, so that the msk state will follow along 1786 */ 1787 if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk && 1788 mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) 1789 mptcp_schedule_work(parent); 1790 } 1791 1792 void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) 1793 { 1794 struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; 1795 struct request_sock *req, *head, *tail; 1796 struct mptcp_subflow_context *subflow; 1797 struct sock *sk, *ssk; 1798 1799 /* Due to lock dependencies no relevant lock can be acquired under rskq_lock. 1800 * Splice the req list, so that accept() can not reach the pending ssk after 1801 * the listener socket is released below. 1802 */ 1803 spin_lock_bh(&queue->rskq_lock); 1804 head = queue->rskq_accept_head; 1805 tail = queue->rskq_accept_tail; 1806 queue->rskq_accept_head = NULL; 1807 queue->rskq_accept_tail = NULL; 1808 spin_unlock_bh(&queue->rskq_lock); 1809 1810 if (!head) 1811 return; 1812 1813 /* can't acquire the msk socket lock under the subflow one, 1814 * or will cause ABBA deadlock 1815 */ 1816 release_sock(listener_ssk); 1817 1818 for (req = head; req; req = req->dl_next) { 1819 ssk = req->sk; 1820 if (!sk_is_mptcp(ssk)) 1821 continue; 1822 1823 subflow = mptcp_subflow_ctx(ssk); 1824 if (!subflow || !subflow->conn) 1825 continue; 1826 1827 sk = subflow->conn; 1828 sock_hold(sk); 1829 1830 lock_sock_nested(sk, SINGLE_DEPTH_NESTING); 1831 __mptcp_unaccepted_force_close(sk); 1832 release_sock(sk); 1833 1834 /* lockdep will report a false positive ABBA deadlock 1835 * between cancel_work_sync and the listener socket. 1836 * The involved locks belong to different sockets WRT 1837 * the existing AB chain. 1838 * Using a per socket key is problematic as key 1839 * deregistration requires process context and must be 1840 * performed at socket disposal time, in atomic 1841 * context. 1842 * Just tell lockdep to consider the listener socket 1843 * released here. 1844 */ 1845 mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_); 1846 mptcp_cancel_work(sk); 1847 mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_); 1848 1849 sock_put(sk); 1850 } 1851 1852 /* we are still under the listener msk socket lock */ 1853 lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING); 1854 1855 /* restore the listener queue, to let the TCP code clean it up */ 1856 spin_lock_bh(&queue->rskq_lock); 1857 WARN_ON_ONCE(queue->rskq_accept_head); 1858 queue->rskq_accept_head = head; 1859 queue->rskq_accept_tail = tail; 1860 spin_unlock_bh(&queue->rskq_lock); 1861 } 1862 1863 static int subflow_ulp_init(struct sock *sk) 1864 { 1865 struct inet_connection_sock *icsk = inet_csk(sk); 1866 struct mptcp_subflow_context *ctx; 1867 struct tcp_sock *tp = tcp_sk(sk); 1868 int err = 0; 1869 1870 /* disallow attaching ULP to a socket unless it has been 1871 * created with sock_create_kern() 1872 */ 1873 if (!sk->sk_kern_sock) { 1874 err = -EOPNOTSUPP; 1875 goto out; 1876 } 1877 1878 ctx = subflow_create_ctx(sk, GFP_KERNEL); 1879 if (!ctx) { 1880 err = -ENOMEM; 1881 goto out; 1882 } 1883 1884 pr_debug("subflow=%p, family=%d", ctx, sk->sk_family); 1885 1886 tp->is_mptcp = 1; 1887 ctx->icsk_af_ops = icsk->icsk_af_ops; 1888 icsk->icsk_af_ops = subflow_default_af_ops(sk); 1889 ctx->tcp_state_change = sk->sk_state_change; 1890 ctx->tcp_error_report = sk->sk_error_report; 1891 1892 WARN_ON_ONCE(sk->sk_data_ready != sock_def_readable); 1893 WARN_ON_ONCE(sk->sk_write_space != sk_stream_write_space); 1894 1895 sk->sk_data_ready = subflow_data_ready; 1896 sk->sk_write_space = subflow_write_space; 1897 sk->sk_state_change = subflow_state_change; 1898 sk->sk_error_report = subflow_error_report; 1899 out: 1900 return err; 1901 } 1902 1903 static void subflow_ulp_release(struct sock *ssk) 1904 { 1905 struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); 1906 bool release = true; 1907 struct sock *sk; 1908 1909 if (!ctx) 1910 return; 1911 1912 sk = ctx->conn; 1913 if (sk) { 1914 /* if the msk has been orphaned, keep the ctx 1915 * alive, will be freed by __mptcp_close_ssk(), 1916 * when the subflow is still unaccepted 1917 */ 1918 release = ctx->disposable || list_empty(&ctx->node); 1919 1920 /* inet_child_forget() does not call sk_state_change(), 1921 * explicitly trigger the socket close machinery 1922 */ 1923 if (!release && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, 1924 &mptcp_sk(sk)->flags)) 1925 mptcp_schedule_work(sk); 1926 sock_put(sk); 1927 } 1928 1929 mptcp_subflow_ops_undo_override(ssk); 1930 if (release) 1931 kfree_rcu(ctx, rcu); 1932 } 1933 1934 static void subflow_ulp_clone(const struct request_sock *req, 1935 struct sock *newsk, 1936 const gfp_t priority) 1937 { 1938 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 1939 struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk); 1940 struct mptcp_subflow_context *new_ctx; 1941 1942 if (!tcp_rsk(req)->is_mptcp || 1943 (!subflow_req->mp_capable && !subflow_req->mp_join)) { 1944 subflow_ulp_fallback(newsk, old_ctx); 1945 return; 1946 } 1947 1948 new_ctx = subflow_create_ctx(newsk, priority); 1949 if (!new_ctx) { 1950 subflow_ulp_fallback(newsk, old_ctx); 1951 return; 1952 } 1953 1954 new_ctx->conn_finished = 1; 1955 new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; 1956 new_ctx->tcp_state_change = old_ctx->tcp_state_change; 1957 new_ctx->tcp_error_report = old_ctx->tcp_error_report; 1958 new_ctx->rel_write_seq = 1; 1959 new_ctx->tcp_sock = newsk; 1960 1961 if (subflow_req->mp_capable) { 1962 /* see comments in subflow_syn_recv_sock(), MPTCP connection 1963 * is fully established only after we receive the remote key 1964 */ 1965 new_ctx->mp_capable = 1; 1966 new_ctx->local_key = subflow_req->local_key; 1967 new_ctx->token = subflow_req->token; 1968 new_ctx->ssn_offset = subflow_req->ssn_offset; 1969 new_ctx->idsn = subflow_req->idsn; 1970 1971 /* this is the first subflow, id is always 0 */ 1972 subflow_set_local_id(new_ctx, 0); 1973 } else if (subflow_req->mp_join) { 1974 new_ctx->ssn_offset = subflow_req->ssn_offset; 1975 new_ctx->mp_join = 1; 1976 new_ctx->fully_established = 1; 1977 new_ctx->remote_key_valid = 1; 1978 new_ctx->backup = subflow_req->backup; 1979 WRITE_ONCE(new_ctx->remote_id, subflow_req->remote_id); 1980 new_ctx->token = subflow_req->token; 1981 new_ctx->thmac = subflow_req->thmac; 1982 1983 /* the subflow req id is valid, fetched via subflow_check_req() 1984 * and subflow_token_join_request() 1985 */ 1986 subflow_set_local_id(new_ctx, subflow_req->local_id); 1987 } 1988 } 1989 1990 static void tcp_release_cb_override(struct sock *ssk) 1991 { 1992 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 1993 long status; 1994 1995 /* process and clear all the pending actions, but leave the subflow into 1996 * the napi queue. To respect locking, only the same CPU that originated 1997 * the action can touch the list. mptcp_napi_poll will take care of it. 1998 */ 1999 status = set_mask_bits(&subflow->delegated_status, MPTCP_DELEGATE_ACTIONS_MASK, 0); 2000 if (status) 2001 mptcp_subflow_process_delegated(ssk, status); 2002 2003 tcp_release_cb(ssk); 2004 } 2005 2006 static int tcp_abort_override(struct sock *ssk, int err) 2007 { 2008 /* closing a listener subflow requires a great deal of care. 2009 * keep it simple and just prevent such operation 2010 */ 2011 if (inet_sk_state_load(ssk) == TCP_LISTEN) 2012 return -EINVAL; 2013 2014 return tcp_abort(ssk, err); 2015 } 2016 2017 static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { 2018 .name = "mptcp", 2019 .owner = THIS_MODULE, 2020 .init = subflow_ulp_init, 2021 .release = subflow_ulp_release, 2022 .clone = subflow_ulp_clone, 2023 }; 2024 2025 static int subflow_ops_init(struct request_sock_ops *subflow_ops) 2026 { 2027 subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock); 2028 2029 subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name, 2030 subflow_ops->obj_size, 0, 2031 SLAB_ACCOUNT | 2032 SLAB_TYPESAFE_BY_RCU, 2033 NULL); 2034 if (!subflow_ops->slab) 2035 return -ENOMEM; 2036 2037 return 0; 2038 } 2039 2040 void __init mptcp_subflow_init(void) 2041 { 2042 mptcp_subflow_v4_request_sock_ops = tcp_request_sock_ops; 2043 mptcp_subflow_v4_request_sock_ops.slab_name = "request_sock_subflow_v4"; 2044 mptcp_subflow_v4_request_sock_ops.destructor = subflow_v4_req_destructor; 2045 2046 if (subflow_ops_init(&mptcp_subflow_v4_request_sock_ops) != 0) 2047 panic("MPTCP: failed to init subflow v4 request sock ops\n"); 2048 2049 subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; 2050 subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req; 2051 subflow_request_sock_ipv4_ops.send_synack = subflow_v4_send_synack; 2052 2053 subflow_specific = ipv4_specific; 2054 subflow_specific.conn_request = subflow_v4_conn_request; 2055 subflow_specific.syn_recv_sock = subflow_syn_recv_sock; 2056 subflow_specific.sk_rx_dst_set = subflow_finish_connect; 2057 subflow_specific.rebuild_header = subflow_rebuild_header; 2058 2059 tcp_prot_override = tcp_prot; 2060 tcp_prot_override.release_cb = tcp_release_cb_override; 2061 tcp_prot_override.diag_destroy = tcp_abort_override; 2062 2063 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 2064 /* In struct mptcp_subflow_request_sock, we assume the TCP request sock 2065 * structures for v4 and v6 have the same size. It should not changed in 2066 * the future but better to make sure to be warned if it is no longer 2067 * the case. 2068 */ 2069 BUILD_BUG_ON(sizeof(struct tcp_request_sock) != sizeof(struct tcp6_request_sock)); 2070 2071 mptcp_subflow_v6_request_sock_ops = tcp6_request_sock_ops; 2072 mptcp_subflow_v6_request_sock_ops.slab_name = "request_sock_subflow_v6"; 2073 mptcp_subflow_v6_request_sock_ops.destructor = subflow_v6_req_destructor; 2074 2075 if (subflow_ops_init(&mptcp_subflow_v6_request_sock_ops) != 0) 2076 panic("MPTCP: failed to init subflow v6 request sock ops\n"); 2077 2078 subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; 2079 subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req; 2080 subflow_request_sock_ipv6_ops.send_synack = subflow_v6_send_synack; 2081 2082 subflow_v6_specific = ipv6_specific; 2083 subflow_v6_specific.conn_request = subflow_v6_conn_request; 2084 subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; 2085 subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; 2086 subflow_v6_specific.rebuild_header = subflow_v6_rebuild_header; 2087 2088 subflow_v6m_specific = subflow_v6_specific; 2089 subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; 2090 subflow_v6m_specific.send_check = ipv4_specific.send_check; 2091 subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; 2092 subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; 2093 subflow_v6m_specific.rebuild_header = subflow_rebuild_header; 2094 2095 tcpv6_prot_override = tcpv6_prot; 2096 tcpv6_prot_override.release_cb = tcp_release_cb_override; 2097 tcpv6_prot_override.diag_destroy = tcp_abort_override; 2098 #endif 2099 2100 mptcp_diag_subflow_init(&subflow_ulp_ops); 2101 2102 if (tcp_register_ulp(&subflow_ulp_ops) != 0) 2103 panic("MPTCP: failed to register subflows to ULP\n"); 2104 } 2105