// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2019, Intel Corporation.
 */
#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/rculist.h>
#include <linux/spinlock.h>
#include "protocol.h"
#include "mib.h"

#define ADD_ADDR_RETRANS_MAX	3

struct mptcp_pm_add_entry {
	struct list_head	list;
	struct mptcp_addr_info	addr;
	u8			retrans_times;
	struct timer_list	add_timer;
	struct mptcp_sock	*sock;
	struct rcu_head		rcu;
};

static DEFINE_SPINLOCK(mptcp_pm_list_lock);
static LIST_HEAD(mptcp_pm_list);

/* path manager helpers */

/* If sk is IPv4, or is IPv6 but bound v6-only, allow only local and remote
 * addresses of that same family (v4-mapped v6 addresses count as IPv4);
 * otherwise allow any local/remote pair with matching effective families.
 */
bool mptcp_pm_addr_families_match(const struct sock *sk,
				  const struct mptcp_addr_info *loc,
				  const struct mptcp_addr_info *rem)
{
	bool mptcp_is_v4 = sk->sk_family == AF_INET;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	bool loc_is_v4 = loc->family == AF_INET || ipv6_addr_v4mapped(&loc->addr6);
	bool rem_is_v4 = rem->family == AF_INET || ipv6_addr_v4mapped(&rem->addr6);

	if (mptcp_is_v4)
		return loc_is_v4 && rem_is_v4;

	if (ipv6_only_sock(sk))
		return !loc_is_v4 && !rem_is_v4;

	return loc_is_v4 == rem_is_v4;
#else
	return mptcp_is_v4 && loc->family == AF_INET && rem->family == AF_INET;
#endif
}

bool mptcp_addresses_equal(const struct mptcp_addr_info *a,
			   const struct mptcp_addr_info *b, bool use_port)
{
	bool addr_equals = false;

	if (a->family == b->family) {
		if (a->family == AF_INET)
			addr_equals = a->addr.s_addr == b->addr.s_addr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
		else
			addr_equals = ipv6_addr_equal(&a->addr6, &b->addr6);
	} else if (a->family == AF_INET) {
		if (ipv6_addr_v4mapped(&b->addr6))
			addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3];
	} else if (b->family == AF_INET) {
		if (ipv6_addr_v4mapped(&a->addr6))
			addr_equals = a->addr6.s6_addr32[3] == b->addr.s_addr;
#endif
	}

	if (!addr_equals)
		return false;
	if (!use_port)
		return true;

	return a->port == b->port;
}

void mptcp_local_address(const struct sock_common *skc,
			 struct mptcp_addr_info *addr)
{
	addr->family = skc->skc_family;
	addr->port = htons(skc->skc_num);
	if (addr->family == AF_INET)
		addr->addr.s_addr = skc->skc_rcv_saddr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->family == AF_INET6)
		addr->addr6 = skc->skc_v6_rcv_saddr;
#endif
}

void mptcp_remote_address(const struct sock_common *skc,
			  struct mptcp_addr_info *addr)
{
	addr->family = skc->skc_family;
	addr->port = skc->skc_dport;
	if (addr->family == AF_INET)
		addr->addr.s_addr = skc->skc_daddr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->family == AF_INET6)
		addr->addr6 = skc->skc_v6_daddr;
#endif
}

static bool mptcp_pm_is_init_remote_addr(struct mptcp_sock *msk,
					 const struct mptcp_addr_info *remote)
{
	struct mptcp_addr_info mpc_remote;

	mptcp_remote_address((struct sock_common *)msk, &mpc_remote);
	return mptcp_addresses_equal(&mpc_remote, remote, remote->port);
}

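/* Walk a conn_list of subflow contexts and report whether any subflow is
 * bound to the given local address; the port is compared only when @saddr
 * carries a non-zero one.
 */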
bool mptcp_lookup_subflow_by_saddr(const struct list_head *list,
				   const struct mptcp_addr_info *saddr)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_addr_info cur;
	struct sock_common *skc;

	list_for_each_entry(subflow, list, node) {
		skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow);

		mptcp_local_address(skc, &cur);
		if (mptcp_addresses_equal(&cur, saddr, saddr->port))
			return true;
	}

	return false;
}

static struct mptcp_pm_add_entry *
mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk,
				const struct mptcp_addr_info *addr)
{
	struct mptcp_pm_add_entry *entry;

	lockdep_assert_held(&msk->pm.lock);

	list_for_each_entry(entry, &msk->pm.anno_list, list) {
		if (mptcp_addresses_equal(&entry->addr, addr, true))
			return entry;
	}

	return NULL;
}

bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk,
				     const struct mptcp_addr_info *addr)
{
	struct mptcp_pm_add_entry *entry;
	bool ret;

	entry = mptcp_pm_del_add_timer(msk, addr, false);
	ret = entry;
	kfree_rcu(entry, rcu);

	return ret;
}

bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk)
{
	struct mptcp_pm_add_entry *entry;
	struct mptcp_addr_info saddr;
	bool ret = false;

	mptcp_local_address((struct sock_common *)sk, &saddr);

	spin_lock_bh(&msk->pm.lock);
	list_for_each_entry(entry, &msk->pm.anno_list, list) {
		if (mptcp_addresses_equal(&entry->addr, &saddr, true)) {
			ret = true;
			goto out;
		}
	}

out:
	spin_unlock_bh(&msk->pm.lock);
	return ret;
}

static void __mptcp_pm_send_ack(struct mptcp_sock *msk,
				struct mptcp_subflow_context *subflow,
				bool prio, bool backup)
{
	struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
	bool slow;

	pr_debug("send ack for %s\n",
		 prio ? "mp_prio" :
		 (mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr"));

	slow = lock_sock_fast(ssk);
	if (prio) {
		subflow->send_mp_prio = 1;
		subflow->request_bkup = backup;
	}

	__mptcp_subflow_send_ack(ssk);
	unlock_sock_fast(ssk, slow);
}

void mptcp_pm_send_ack(struct mptcp_sock *msk,
		       struct mptcp_subflow_context *subflow,
		       bool prio, bool backup)
{
	spin_unlock_bh(&msk->pm.lock);
	__mptcp_pm_send_ack(msk, subflow, prio, backup);
	spin_lock_bh(&msk->pm.lock);
}

static bool subflow_in_rm_list(const struct mptcp_subflow_context *subflow,
			       const struct mptcp_rm_list *rm_list)
{
	u8 i, id = subflow_get_local_id(subflow);

	for (i = 0; i < rm_list->nr; i++) {
		if (rm_list->ids[i] == id)
			return true;
	}

	return false;
}

static void
mptcp_pm_addr_send_ack_avoid_list(struct mptcp_sock *msk,
				  const struct mptcp_rm_list *rm_list)
{
	struct mptcp_subflow_context *subflow, *stale = NULL, *same_id = NULL;

	msk_owned_by_me(msk);
	lockdep_assert_held(&msk->pm.lock);

	if (!mptcp_pm_should_add_signal(msk) &&
	    !mptcp_pm_should_rm_signal(msk))
		return;

	mptcp_for_each_subflow(msk, subflow) {
		if (!__mptcp_subflow_active(subflow))
			continue;

		if (unlikely(subflow->stale)) {
			if (!stale)
				stale = subflow;
		} else if (unlikely(rm_list &&
				    subflow_in_rm_list(subflow, rm_list))) {
			if (!same_id)
				same_id = subflow;
		} else {
			goto send_ack;
		}
	}

	if (same_id)
		subflow = same_id;
	else if (stale)
		subflow = stale;
	else
		return;

send_ack:
	mptcp_pm_send_ack(msk, subflow, false, false);
}

void mptcp_pm_addr_send_ack(struct mptcp_sock *msk)
{
	mptcp_pm_addr_send_ack_avoid_list(msk, NULL);
}

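/* Find the subflow matching the given local address @addr (and, when @rem is
 * a specific address, the given remote one too) and ask the peer to change
 * its priority via an MP_PRIO ack. Returns -EINVAL when no subflow matches.
 */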
int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk,
			      struct mptcp_addr_info *addr,
			      struct mptcp_addr_info *rem,
			      u8 bkup)
{
	struct mptcp_subflow_context *subflow;

	pr_debug("bkup=%d\n", bkup);

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		struct mptcp_addr_info local, remote;

		mptcp_local_address((struct sock_common *)ssk, &local);
		if (!mptcp_addresses_equal(&local, addr, addr->port))
			continue;

		if (rem && rem->family != AF_UNSPEC) {
			mptcp_remote_address((struct sock_common *)ssk, &remote);
			if (!mptcp_addresses_equal(&remote, rem, rem->port))
				continue;
		}

		__mptcp_pm_send_ack(msk, subflow, true, bkup);
		return 0;
	}

	return -EINVAL;
}

static unsigned int mptcp_adjust_add_addr_timeout(struct mptcp_sock *msk)
{
	const struct net *net = sock_net((struct sock *)msk);
	unsigned int rto = mptcp_get_add_addr_timeout(net);
	struct mptcp_subflow_context *subflow;
	unsigned int max = 0;

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		struct inet_connection_sock *icsk = inet_csk(ssk);

		if (icsk->icsk_rto > max)
			max = icsk->icsk_rto;
	}

	if (max && max < rto)
		rto = max;

	return rto;
}

static void mptcp_pm_add_timer(struct timer_list *timer)
{
	struct mptcp_pm_add_entry *entry = timer_container_of(entry, timer,
							       add_timer);
	struct mptcp_sock *msk = entry->sock;
	struct sock *sk = (struct sock *)msk;
	unsigned int timeout;

	pr_debug("msk=%p\n", msk);

	if (!msk)
		return;

	if (inet_sk_state_load(sk) == TCP_CLOSE)
		return;

	if (!entry->addr.id)
		return;

	if (mptcp_pm_should_add_signal_addr(msk)) {
		sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8);
		goto out;
	}

	timeout = mptcp_adjust_add_addr_timeout(msk);
	if (!timeout)
		goto out;

	spin_lock_bh(&msk->pm.lock);

	if (!mptcp_pm_should_add_signal_addr(msk)) {
		pr_debug("retransmit ADD_ADDR id=%d\n", entry->addr.id);
		mptcp_pm_announce_addr(msk, &entry->addr, false);
		mptcp_pm_add_addr_send_ack(msk);
		entry->retrans_times++;
	}

	if (entry->retrans_times < ADD_ADDR_RETRANS_MAX)
		sk_reset_timer(sk, timer,
			       jiffies + (timeout << entry->retrans_times));

	spin_unlock_bh(&msk->pm.lock);

	if (entry->retrans_times == ADD_ADDR_RETRANS_MAX)
		mptcp_pm_subflow_established(msk);

out:
	__sock_put(sk);
}

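/* Stop the ADD_ADDR retransmission timer for @addr, if any. When @check_id
 * is false the entry is also unlinked from the anno_list; the caller then
 * owns it and is expected to free it via kfree_rcu(), as
 * mptcp_remove_anno_list_by_saddr() does above.
 */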
struct mptcp_pm_add_entry *
mptcp_pm_del_add_timer(struct mptcp_sock *msk,
		       const struct mptcp_addr_info *addr, bool check_id)
{
	struct mptcp_pm_add_entry *entry;
	struct sock *sk = (struct sock *)msk;
	bool stop_timer = false;

	rcu_read_lock();

	spin_lock_bh(&msk->pm.lock);
	entry = mptcp_lookup_anno_list_by_saddr(msk, addr);
	if (entry && (!check_id || entry->addr.id == addr->id)) {
		entry->retrans_times = ADD_ADDR_RETRANS_MAX;
		stop_timer = true;
	}
	if (!check_id && entry)
		list_del(&entry->list);
	spin_unlock_bh(&msk->pm.lock);

	/* Note: entry might have been removed by another thread.
	 * We hold rcu_read_lock() to ensure it is not freed under us.
	 */
	if (stop_timer)
		sk_stop_timer_sync(sk, &entry->add_timer);

	rcu_read_unlock();
	return entry;
}

bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
			      const struct mptcp_addr_info *addr)
{
	struct mptcp_pm_add_entry *add_entry = NULL;
	struct sock *sk = (struct sock *)msk;
	unsigned int timeout;

	lockdep_assert_held(&msk->pm.lock);

	add_entry = mptcp_lookup_anno_list_by_saddr(msk, addr);

	if (add_entry) {
		if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk)))
			return false;

		goto reset_timer;
	}

	add_entry = kmalloc_obj(*add_entry, GFP_ATOMIC);
	if (!add_entry)
		return false;

	list_add(&add_entry->list, &msk->pm.anno_list);

	add_entry->addr = *addr;
	add_entry->sock = msk;
	add_entry->retrans_times = 0;

	timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0);
reset_timer:
	timeout = mptcp_adjust_add_addr_timeout(msk);
	if (timeout)
		sk_reset_timer(sk, &add_entry->add_timer, jiffies + timeout);

	return true;
}

static void mptcp_pm_free_anno_list(struct mptcp_sock *msk)
{
	struct mptcp_pm_add_entry *entry, *tmp;
	struct sock *sk = (struct sock *)msk;
	LIST_HEAD(free_list);

	pr_debug("msk=%p\n", msk);

	spin_lock_bh(&msk->pm.lock);
	list_splice_init(&msk->pm.anno_list, &free_list);
	spin_unlock_bh(&msk->pm.lock);

	list_for_each_entry_safe(entry, tmp, &free_list, list) {
		sk_stop_timer_sync(sk, &entry->add_timer);
		kfree_rcu(entry, rcu);
	}
}

/* path manager command handlers */

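/* Queue an ADD_ADDR (echo=false) or its ADD_ADDR echo (echo=true) for
 * transmission; the option itself is emitted later from the option-writing
 * path via mptcp_pm_add_addr_signal(). Returns -EINVAL and bumps the
 * relevant TX-drop MIB counter when a signal of the same kind is already
 * pending.
 */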
int mptcp_pm_announce_addr(struct mptcp_sock *msk,
			   const struct mptcp_addr_info *addr,
			   bool echo)
{
	u8 add_addr = READ_ONCE(msk->pm.addr_signal);

	pr_debug("msk=%p, local_id=%d, echo=%d\n", msk, addr->id, echo);

	lockdep_assert_held(&msk->pm.lock);

	if (add_addr &
	    (echo ? BIT(MPTCP_ADD_ADDR_ECHO) : BIT(MPTCP_ADD_ADDR_SIGNAL))) {
		MPTCP_INC_STATS(sock_net((struct sock *)msk),
				echo ? MPTCP_MIB_ECHOADDTXDROP : MPTCP_MIB_ADDADDRTXDROP);
		return -EINVAL;
	}

	if (echo) {
		msk->pm.remote = *addr;
		add_addr |= BIT(MPTCP_ADD_ADDR_ECHO);
	} else {
		msk->pm.local = *addr;
		add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL);
	}
	WRITE_ONCE(msk->pm.addr_signal, add_addr);
	return 0;
}

int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list)
{
	u8 rm_addr = READ_ONCE(msk->pm.addr_signal);

	pr_debug("msk=%p, rm_list_nr=%d\n", msk, rm_list->nr);

	if (rm_addr) {
		MPTCP_ADD_STATS(sock_net((struct sock *)msk),
				MPTCP_MIB_RMADDRTXDROP, rm_list->nr);
		return -EINVAL;
	}

	msk->pm.rm_list_tx = *rm_list;
	rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL);
	WRITE_ONCE(msk->pm.addr_signal, rm_addr);
	mptcp_pm_addr_send_ack_avoid_list(msk, rm_list);
	return 0;
}

/* path manager event handlers */

void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side)
{
	struct mptcp_pm_data *pm = &msk->pm;

	pr_debug("msk=%p, token=%u side=%d\n", msk, READ_ONCE(msk->token), server_side);

	WRITE_ONCE(pm->server_side, server_side);
	mptcp_event(MPTCP_EVENT_CREATED, msk, ssk, GFP_ATOMIC);
}

bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
{
	struct mptcp_pm_data *pm = &msk->pm;
	unsigned int limit_extra_subflows;
	int ret = 0;

	if (mptcp_pm_is_userspace(msk)) {
		if (mptcp_userspace_pm_active(msk)) {
			spin_lock_bh(&pm->lock);
			pm->extra_subflows++;
			spin_unlock_bh(&pm->lock);
			return true;
		}
		return false;
	}

	limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);

	pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk,
		 pm->extra_subflows, limit_extra_subflows,
		 READ_ONCE(pm->accept_subflow));

	/* try to avoid acquiring the lock below */
	if (!READ_ONCE(pm->accept_subflow))
		return false;

	spin_lock_bh(&pm->lock);
	if (READ_ONCE(pm->accept_subflow)) {
		ret = pm->extra_subflows < limit_extra_subflows;
		if (ret && ++pm->extra_subflows == limit_extra_subflows)
			WRITE_ONCE(pm->accept_subflow, false);
	}
	spin_unlock_bh(&pm->lock);

	return ret;
}

/* return true if the new status bit is currently cleared, that is, this event
 * can be served, possibly by an already scheduled work
 */
static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
				   enum mptcp_pm_status new_status)
{
	pr_debug("msk=%p status=%x new=%lx\n", msk, msk->pm.status,
		 BIT(new_status));
	if (msk->pm.status & BIT(new_status))
		return false;

	msk->pm.status |= BIT(new_status);
	mptcp_schedule_work((struct sock *)msk);
	return true;
}

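/* Invoked once the MPTCP connection becomes fully established: schedule the
 * PM worker if needed, and emit the MPTCP_EVENT_ESTABLISHED netlink event
 * exactly once per connection.
 */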
void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk)
{
	struct mptcp_pm_data *pm = &msk->pm;
	bool announce = false;

	pr_debug("msk=%p\n", msk);

	spin_lock_bh(&pm->lock);

	/* mptcp_pm_fully_established() can be invoked by multiple
	 * racing paths - accept() and check_fully_established();
	 * be sure to serve this event only once.
	 */
	if (READ_ONCE(pm->work_pending) &&
	    !(pm->status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)))
		mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED);

	if ((pm->status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0)
		announce = true;

	pm->status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED);
	spin_unlock_bh(&pm->lock);

	if (announce)
		mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, GFP_ATOMIC);
}

void mptcp_pm_connection_closed(struct mptcp_sock *msk)
{
	pr_debug("msk=%p\n", msk);

	if (msk->token)
		mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
}

void mptcp_pm_subflow_established(struct mptcp_sock *msk)
{
	struct mptcp_pm_data *pm = &msk->pm;

	pr_debug("msk=%p\n", msk);

	if (!READ_ONCE(pm->work_pending))
		return;

	spin_lock_bh(&pm->lock);

	if (READ_ONCE(pm->work_pending))
		mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);

	spin_unlock_bh(&pm->lock);
}

void mptcp_pm_subflow_check_next(struct mptcp_sock *msk,
				 const struct mptcp_subflow_context *subflow)
{
	struct sock *sk = (struct sock *)msk;
	struct mptcp_pm_data *pm = &msk->pm;
	bool update_subflows;

	update_subflows = subflow->request_join || subflow->mp_join;
	if (mptcp_pm_is_userspace(msk)) {
		if (update_subflows) {
			spin_lock_bh(&pm->lock);
			pm->extra_subflows--;
			spin_unlock_bh(&pm->lock);
		}
		return;
	}

	if (!READ_ONCE(pm->work_pending) && !update_subflows)
		return;

	spin_lock_bh(&pm->lock);
	if (update_subflows)
		__mptcp_pm_close_subflow(msk);

	/* Even if this subflow is not really established, tell the PM to try
	 * to pick the next ones, if possible.
	 */
	if (mptcp_is_fully_established(sk) &&
	    mptcp_pm_nl_check_work_pending(msk))
		mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);

	spin_unlock_bh(&pm->lock);
}

void mptcp_pm_add_addr_received(const struct sock *ssk,
				const struct mptcp_addr_info *addr)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
	struct mptcp_pm_data *pm = &msk->pm;

	pr_debug("msk=%p remote_id=%d accept=%d\n", msk, addr->id,
		 READ_ONCE(pm->accept_addr));

	mptcp_event_addr_announced(ssk, addr);

	spin_lock_bh(&pm->lock);

	if (mptcp_pm_is_userspace(msk)) {
		if (mptcp_userspace_pm_active(msk)) {
			mptcp_pm_announce_addr(msk, addr, true);
			mptcp_pm_add_addr_send_ack(msk);
		} else {
			__MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP);
		}
	/* - id0 should not have a different address
	 * - special case for C-flag: linked to fill_local_addresses_vec()
	 */
	} else if ((addr->id == 0 && !mptcp_pm_is_init_remote_addr(msk, addr)) ||
		   (addr->id > 0 && !READ_ONCE(pm->accept_addr) &&
		    !mptcp_pm_add_addr_c_flag_case(msk))) {
		mptcp_pm_announce_addr(msk, addr, true);
		mptcp_pm_add_addr_send_ack(msk);
	} else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) {
		pm->remote = *addr;
	} else {
		__MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP);
	}

	spin_unlock_bh(&pm->lock);
}

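/* Handle the echo of a previously announced address: if the echoed address
 * is still on the anno_list, let the worker try to establish more subflows.
 */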
void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
			      const struct mptcp_addr_info *addr)
{
	struct mptcp_pm_data *pm = &msk->pm;

	pr_debug("msk=%p\n", msk);

	if (!READ_ONCE(pm->work_pending))
		return;

	spin_lock_bh(&pm->lock);

	if (mptcp_lookup_anno_list_by_saddr(msk, addr) && READ_ONCE(pm->work_pending))
		mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);

	spin_unlock_bh(&pm->lock);
}

void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk)
{
	if (!mptcp_pm_should_add_signal(msk))
		return;

	mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK);
}

static void mptcp_pm_rm_addr_or_subflow(struct mptcp_sock *msk,
					const struct mptcp_rm_list *rm_list,
					enum linux_mptcp_mib_field rm_type)
{
	struct mptcp_subflow_context *subflow, *tmp;
	struct sock *sk = (struct sock *)msk;
	u8 i;

	pr_debug("%s rm_list_nr %d\n",
		 rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", rm_list->nr);

	msk_owned_by_me(msk);

	if (sk->sk_state == TCP_LISTEN)
		return;

	if (!rm_list->nr)
		return;

	if (list_empty(&msk->conn_list))
		return;

	for (i = 0; i < rm_list->nr; i++) {
		u8 rm_id = rm_list->ids[i];
		bool removed = false;

		mptcp_for_each_subflow_safe(msk, subflow, tmp) {
			struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
			u8 remote_id = READ_ONCE(subflow->remote_id);
			int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
			u8 id = subflow_get_local_id(subflow);

			if ((1 << inet_sk_state_load(ssk)) &
			    (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | TCPF_CLOSE))
				continue;
			if (rm_type == MPTCP_MIB_RMADDR && remote_id != rm_id)
				continue;
			if (rm_type == MPTCP_MIB_RMSUBFLOW && id != rm_id)
				continue;

			pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u\n",
				 rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow",
				 i, rm_id, id, remote_id, msk->mpc_endpoint_id);
			spin_unlock_bh(&msk->pm.lock);
			mptcp_subflow_shutdown(sk, ssk, how);
			removed |= subflow->request_join;

			/* the following takes care of updating the subflows counter */
			mptcp_close_ssk(sk, ssk, subflow);
			spin_lock_bh(&msk->pm.lock);

			if (rm_type == MPTCP_MIB_RMSUBFLOW)
				__MPTCP_INC_STATS(sock_net(sk), rm_type);
		}

		if (rm_type == MPTCP_MIB_RMADDR) {
			__MPTCP_INC_STATS(sock_net(sk), rm_type);
			if (removed && mptcp_pm_is_kernel(msk))
				mptcp_pm_nl_rm_addr(msk, rm_id);
		}
	}
}

static void mptcp_pm_rm_addr_recv(struct mptcp_sock *msk)
{
	mptcp_pm_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR);
}

void mptcp_pm_rm_subflow(struct mptcp_sock *msk,
			 const struct mptcp_rm_list *rm_list)
{
	mptcp_pm_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW);
}

void mptcp_pm_rm_addr_received(struct mptcp_sock *msk,
			       const struct mptcp_rm_list *rm_list)
{
	struct mptcp_pm_data *pm = &msk->pm;
	u8 i;

	pr_debug("msk=%p remote_ids_nr=%d\n", msk, rm_list->nr);

	for (i = 0; i < rm_list->nr; i++)
		mptcp_event_addr_removed(msk, rm_list->ids[i]);

	spin_lock_bh(&pm->lock);
	if (mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED))
		pm->rm_list_rx = *rm_list;
	else
		__MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_RMADDRDROP);
	spin_unlock_bh(&pm->lock);
}

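/* An MP_PRIO option has been received on this subflow: record the new backup
 * flag and notify userspace via an MPTCP_EVENT_SUB_PRIORITY event.
 */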
void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = subflow->conn;
	struct mptcp_sock *msk;

	pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup);
	msk = mptcp_sk(sk);
	if (subflow->backup != bkup)
		subflow->backup = bkup;

	mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC);
}

void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);

	pr_debug("fail_seq=%llu\n", fail_seq);

	/* After accepting the fail, we can't create any other subflows */
	spin_lock_bh(&msk->fallback_lock);
	if (!msk->allow_infinite_fallback) {
		spin_unlock_bh(&msk->fallback_lock);
		return;
	}
	msk->allow_subflows = false;
	spin_unlock_bh(&msk->fallback_lock);

	if (!subflow->fail_tout) {
		pr_debug("send MP_FAIL response and infinite map\n");

		subflow->send_mp_fail = 1;
		subflow->send_infinite_map = 1;
		tcp_send_ack(sk);
	} else {
		pr_debug("MP_FAIL response received\n");
		WRITE_ONCE(subflow->fail_tout, 0);
	}
}

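/* Option-writing helpers: called from the TCP option emission path to fetch
 * the pending ADD_ADDR/RM_ADDR data, provided @remaining bytes of option
 * space are enough to fit it; the corresponding addr_signal bit is cleared
 * only when the option is actually scheduled for transmission.
 */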
bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
			      unsigned int opt_size, unsigned int remaining,
			      struct mptcp_addr_info *addr, bool *echo,
			      bool *drop_other_suboptions)
{
	int ret = false;
	u8 add_addr;
	u8 family;
	bool port;

	spin_lock_bh(&msk->pm.lock);

	/* double check after the lock is acquired */
	if (!mptcp_pm_should_add_signal(msk))
		goto out_unlock;

	/* always drop all other options for pure ack ADD_ADDR; this is a
	 * plain dup-ack from TCP perspective. The other MPTCP-relevant info,
	 * if any, will be carried by the 'original' TCP ack
	 */
	if (skb && skb_is_tcp_pure_ack(skb)) {
		remaining += opt_size;
		*drop_other_suboptions = true;
	}

	*echo = mptcp_pm_should_add_signal_echo(msk);
	port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port);

	family = *echo ? msk->pm.remote.family : msk->pm.local.family;
	if (remaining < mptcp_add_addr_len(family, *echo, port))
		goto out_unlock;

	if (*echo) {
		*addr = msk->pm.remote;
		add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_ECHO);
	} else {
		*addr = msk->pm.local;
		add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_SIGNAL);
	}
	WRITE_ONCE(msk->pm.addr_signal, add_addr);
	ret = true;

out_unlock:
	spin_unlock_bh(&msk->pm.lock);
	return ret;
}

bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
			     struct mptcp_rm_list *rm_list)
{
	int ret = false, len;
	u8 rm_addr;

	spin_lock_bh(&msk->pm.lock);

	/* double check after the lock is acquired */
	if (!mptcp_pm_should_rm_signal(msk))
		goto out_unlock;

	rm_addr = msk->pm.addr_signal & ~BIT(MPTCP_RM_ADDR_SIGNAL);
	len = mptcp_rm_addr_len(&msk->pm.rm_list_tx);
	if (len < 0) {
		WRITE_ONCE(msk->pm.addr_signal, rm_addr);
		goto out_unlock;
	}
	if (remaining < len)
		goto out_unlock;

	*rm_list = msk->pm.rm_list_tx;
	WRITE_ONCE(msk->pm.addr_signal, rm_addr);
	ret = true;

out_unlock:
	spin_unlock_bh(&msk->pm.lock);
	return ret;
}

int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
{
	struct mptcp_pm_addr_entry skc_local = { 0 };
	struct mptcp_addr_info msk_local;

	if (WARN_ON_ONCE(!msk))
		return -1;

	/* The 0 ID mapping is defined by the first subflow, copied into the msk
	 * addr
	 */
	mptcp_local_address((struct sock_common *)msk, &msk_local);
	mptcp_local_address((struct sock_common *)skc, &skc_local.addr);
	if (mptcp_addresses_equal(&msk_local, &skc_local.addr, false))
		return 0;

	skc_local.addr.id = 0;
	skc_local.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT;

	if (mptcp_pm_is_userspace(msk))
		return mptcp_userspace_pm_get_local_id(msk, &skc_local);
	return mptcp_pm_nl_get_local_id(msk, &skc_local);
}

bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc)
{
	struct mptcp_addr_info skc_local;

	mptcp_local_address((struct sock_common *)skc, &skc_local);

	if (mptcp_pm_is_userspace(msk))
		return mptcp_userspace_pm_is_backup(msk, &skc_local);

	return mptcp_pm_nl_is_backup(msk, &skc_local);
}

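/* Staleness heuristic: mptcp_pm_subflow_chk_stale() below counts consecutive
 * retransmission periods with no progress (rcv_tstamp unchanged) and, past
 * the configurable stale_loss_cnt threshold, tries to mark the subflow as
 * stale, as long as an active alternative subflow exists.
 */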
static void mptcp_pm_subflows_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
{
	struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = (struct sock *)msk;
	unsigned int active_max_loss_cnt;
	struct net *net = sock_net(sk);
	unsigned int stale_loss_cnt;
	bool slow;

	stale_loss_cnt = mptcp_stale_loss_cnt(net);
	if (subflow->stale || !stale_loss_cnt || subflow->stale_count <= stale_loss_cnt)
		return;

	/* look for another available subflow not in loss state */
	active_max_loss_cnt = max_t(int, stale_loss_cnt - 1, 1);
	mptcp_for_each_subflow(msk, iter) {
		if (iter != subflow && mptcp_subflow_active(iter) &&
		    iter->stale_count < active_max_loss_cnt) {
			/* we have some alternatives, try to mark this subflow as idle ... */
			slow = lock_sock_fast(ssk);
			if (!tcp_rtx_and_write_queues_empty(ssk)) {
				subflow->stale = 1;
				__mptcp_retransmit_pending_data(sk);
				MPTCP_INC_STATS(net, MPTCP_MIB_SUBFLOWSTALE);
			}
			unlock_sock_fast(ssk, slow);

			/* always try to push the pending data regardless of re-injections:
			 * we can possibly use backup subflows now, and subflow selection
			 * is cheap under the msk socket lock
			 */
			__mptcp_push_pending(sk, 0);
			return;
		}
	}
}

void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 rcv_tstamp = READ_ONCE(tcp_sk(ssk)->rcv_tstamp);

	/* keep track of rtx periods with no progress */
	if (!subflow->stale_count) {
		subflow->stale_rcv_tstamp = rcv_tstamp;
		subflow->stale_count++;
	} else if (subflow->stale_rcv_tstamp == rcv_tstamp) {
		if (subflow->stale_count < U8_MAX)
			subflow->stale_count++;
		mptcp_pm_subflows_chk_stale(msk, ssk);
	} else {
		subflow->stale_count = 0;
		mptcp_subflow_set_active(subflow);
	}
}

void mptcp_pm_worker(struct mptcp_sock *msk)
{
	struct mptcp_pm_data *pm = &msk->pm;

	msk_owned_by_me(msk);

	if (!(pm->status & MPTCP_PM_WORK_MASK))
		return;

	spin_lock_bh(&msk->pm.lock);

	pr_debug("msk=%p status=%x\n", msk, pm->status);
	if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) {
		pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK);
		mptcp_pm_addr_send_ack(msk);
	}
	if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) {
		pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED);
		mptcp_pm_rm_addr_recv(msk);
	}
	__mptcp_pm_kernel_worker(msk);

	spin_unlock_bh(&msk->pm.lock);
}

void mptcp_pm_destroy(struct mptcp_sock *msk)
{
	mptcp_pm_free_anno_list(msk);

	if (mptcp_pm_is_userspace(msk))
		mptcp_userspace_pm_free_local_addr_list(msk);
}

void mptcp_pm_data_reset(struct mptcp_sock *msk)
{
	u8 pm_type = mptcp_get_pm_type(sock_net((struct sock *)msk));
	struct mptcp_pm_data *pm = &msk->pm;

	memset(&pm->reset, 0, sizeof(pm->reset));
	pm->rm_list_tx.nr = 0;
	pm->rm_list_rx.nr = 0;
	WRITE_ONCE(pm->pm_type, pm_type);

	if (pm_type == MPTCP_PM_TYPE_KERNEL) {
		bool subflows_allowed = !!mptcp_pm_get_limit_extra_subflows(msk);

		/* pm->work_pending must only be set to 'true' when
		 * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL
		 */
		WRITE_ONCE(pm->work_pending,
			   (!!mptcp_pm_get_endp_subflow_max(msk) &&
			    subflows_allowed) ||
			   !!mptcp_pm_get_endp_signal_max(msk));
		WRITE_ONCE(pm->accept_addr,
			   !!mptcp_pm_get_limit_add_addr_accepted(msk) &&
			   subflows_allowed);
		WRITE_ONCE(pm->accept_subflow, subflows_allowed);

		bitmap_fill(pm->id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
	}
}

void mptcp_pm_data_init(struct mptcp_sock *msk)
{
	spin_lock_init(&msk->pm.lock);
	INIT_LIST_HEAD(&msk->pm.anno_list);
	INIT_LIST_HEAD(&msk->pm.userspace_pm_local_addr_list);
	mptcp_pm_data_reset(msk);
}

void __init mptcp_pm_init(void)
{
	mptcp_pm_kernel_register();
	mptcp_pm_userspace_register();
	mptcp_pm_nl_init();
}

/* Must be called with rcu read lock held */
struct mptcp_pm_ops *mptcp_pm_find(const char *name)
{
	struct mptcp_pm_ops *pm_ops;

	list_for_each_entry_rcu(pm_ops, &mptcp_pm_list, list) {
		if (!strcmp(pm_ops->name, name))
			return pm_ops;
	}

	return NULL;
}

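/* Registration API for alternative path managers. A minimal out-of-tree
 * usage sketch; any field other than .name and .list is hypothetical here:
 *
 *	static struct mptcp_pm_ops my_pm_ops = {
 *		.name = "my_pm",
 *	};
 *
 *	err = mptcp_pm_register(&my_pm_ops);	// -EEXIST if the name is taken
 *	...
 *	mptcp_pm_unregister(&my_pm_ops);	// never for mptcp_pm_kernel
 */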
int mptcp_pm_validate(struct mptcp_pm_ops *pm_ops)
{
	return 0;
}

int mptcp_pm_register(struct mptcp_pm_ops *pm_ops)
{
	int ret;

	ret = mptcp_pm_validate(pm_ops);
	if (ret)
		return ret;

	spin_lock(&mptcp_pm_list_lock);
	if (mptcp_pm_find(pm_ops->name)) {
		spin_unlock(&mptcp_pm_list_lock);
		return -EEXIST;
	}
	list_add_tail_rcu(&pm_ops->list, &mptcp_pm_list);
	spin_unlock(&mptcp_pm_list_lock);

	pr_debug("%s registered\n", pm_ops->name);
	return 0;
}

void mptcp_pm_unregister(struct mptcp_pm_ops *pm_ops)
{
	/* skip unregistering the default path manager */
	if (WARN_ON_ONCE(pm_ops == &mptcp_pm_kernel))
		return;

	spin_lock(&mptcp_pm_list_lock);
	list_del_rcu(&pm_ops->list);
	spin_unlock(&mptcp_pm_list_lock);
}

/* Build a string with the list of available path manager values.
 * Similar to tcp_get_available_congestion_control()
 */
void mptcp_pm_get_available(char *buf, size_t maxlen)
{
	struct mptcp_pm_ops *pm_ops;
	size_t offs = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(pm_ops, &mptcp_pm_list, list) {
		offs += snprintf(buf + offs, maxlen - offs, "%s%s",
				 offs == 0 ? "" : " ", pm_ops->name);

		if (WARN_ON_ONCE(offs >= maxlen))
			break;
	}
	rcu_read_unlock();
}