1 // SPDX-License-Identifier: GPL-2.0 2 /* Multipath TCP 3 * 4 * Copyright (c) 2019, Intel Corporation. 5 */ 6 #define pr_fmt(fmt) "MPTCP: " fmt 7 8 #include <linux/kernel.h> 9 #include <net/mptcp.h> 10 #include "protocol.h" 11 12 #include "mib.h" 13 14 /* path manager command handlers */ 15 16 int mptcp_pm_announce_addr(struct mptcp_sock *msk, 17 const struct mptcp_addr_info *addr, 18 bool echo) 19 { 20 u8 add_addr = READ_ONCE(msk->pm.addr_signal); 21 22 pr_debug("msk=%p, local_id=%d, echo=%d\n", msk, addr->id, echo); 23 24 lockdep_assert_held(&msk->pm.lock); 25 26 if (add_addr & 27 (echo ? BIT(MPTCP_ADD_ADDR_ECHO) : BIT(MPTCP_ADD_ADDR_SIGNAL))) { 28 MPTCP_INC_STATS(sock_net((struct sock *)msk), 29 echo ? MPTCP_MIB_ECHOADDTXDROP : MPTCP_MIB_ADDADDRTXDROP); 30 return -EINVAL; 31 } 32 33 if (echo) { 34 msk->pm.remote = *addr; 35 add_addr |= BIT(MPTCP_ADD_ADDR_ECHO); 36 } else { 37 msk->pm.local = *addr; 38 add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL); 39 } 40 WRITE_ONCE(msk->pm.addr_signal, add_addr); 41 return 0; 42 } 43 44 int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) 45 { 46 u8 rm_addr = READ_ONCE(msk->pm.addr_signal); 47 48 pr_debug("msk=%p, rm_list_nr=%d\n", msk, rm_list->nr); 49 50 if (rm_addr) { 51 MPTCP_ADD_STATS(sock_net((struct sock *)msk), 52 MPTCP_MIB_RMADDRTXDROP, rm_list->nr); 53 return -EINVAL; 54 } 55 56 msk->pm.rm_list_tx = *rm_list; 57 rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL); 58 WRITE_ONCE(msk->pm.addr_signal, rm_addr); 59 mptcp_pm_nl_addr_send_ack(msk); 60 return 0; 61 } 62 63 /* path manager event handlers */ 64 65 void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side) 66 { 67 struct mptcp_pm_data *pm = &msk->pm; 68 69 pr_debug("msk=%p, token=%u side=%d\n", msk, READ_ONCE(msk->token), server_side); 70 71 WRITE_ONCE(pm->server_side, server_side); 72 mptcp_event(MPTCP_EVENT_CREATED, msk, ssk, GFP_ATOMIC); 73 } 74 75 bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) 76 { 77 struct mptcp_pm_data *pm = &msk->pm; 78 unsigned int subflows_max; 79 int ret = 0; 80 81 if (mptcp_pm_is_userspace(msk)) { 82 if (mptcp_userspace_pm_active(msk)) { 83 spin_lock_bh(&pm->lock); 84 pm->subflows++; 85 spin_unlock_bh(&pm->lock); 86 return true; 87 } 88 return false; 89 } 90 91 subflows_max = mptcp_pm_get_subflows_max(msk); 92 93 pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, pm->subflows, 94 subflows_max, READ_ONCE(pm->accept_subflow)); 95 96 /* try to avoid acquiring the lock below */ 97 if (!READ_ONCE(pm->accept_subflow)) 98 return false; 99 100 spin_lock_bh(&pm->lock); 101 if (READ_ONCE(pm->accept_subflow)) { 102 ret = pm->subflows < subflows_max; 103 if (ret && ++pm->subflows == subflows_max) 104 WRITE_ONCE(pm->accept_subflow, false); 105 } 106 spin_unlock_bh(&pm->lock); 107 108 return ret; 109 } 110 111 /* return true if the new status bit is currently cleared, that is, this event 112 * can be server, eventually by an already scheduled work 113 */ 114 static bool mptcp_pm_schedule_work(struct mptcp_sock *msk, 115 enum mptcp_pm_status new_status) 116 { 117 pr_debug("msk=%p status=%x new=%lx\n", msk, msk->pm.status, 118 BIT(new_status)); 119 if (msk->pm.status & BIT(new_status)) 120 return false; 121 122 msk->pm.status |= BIT(new_status); 123 mptcp_schedule_work((struct sock *)msk); 124 return true; 125 } 126 127 void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk) 128 { 129 struct mptcp_pm_data *pm = &msk->pm; 130 bool announce = false; 131 132 pr_debug("msk=%p\n", msk); 133 134 spin_lock_bh(&pm->lock); 135 136 /* mptcp_pm_fully_established() can be invoked by multiple 137 * racing paths - accept() and check_fully_established() 138 * be sure to serve this event only once. 139 */ 140 if (READ_ONCE(pm->work_pending) && 141 !(msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED))) 142 mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED); 143 144 if ((msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0) 145 announce = true; 146 147 msk->pm.status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED); 148 spin_unlock_bh(&pm->lock); 149 150 if (announce) 151 mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, GFP_ATOMIC); 152 } 153 154 void mptcp_pm_connection_closed(struct mptcp_sock *msk) 155 { 156 pr_debug("msk=%p\n", msk); 157 158 if (msk->token) 159 mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); 160 } 161 162 void mptcp_pm_subflow_established(struct mptcp_sock *msk) 163 { 164 struct mptcp_pm_data *pm = &msk->pm; 165 166 pr_debug("msk=%p\n", msk); 167 168 if (!READ_ONCE(pm->work_pending)) 169 return; 170 171 spin_lock_bh(&pm->lock); 172 173 if (READ_ONCE(pm->work_pending)) 174 mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED); 175 176 spin_unlock_bh(&pm->lock); 177 } 178 179 void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, 180 const struct mptcp_subflow_context *subflow) 181 { 182 struct mptcp_pm_data *pm = &msk->pm; 183 bool update_subflows; 184 185 update_subflows = subflow->request_join || subflow->mp_join; 186 if (mptcp_pm_is_userspace(msk)) { 187 if (update_subflows) { 188 spin_lock_bh(&pm->lock); 189 pm->subflows--; 190 spin_unlock_bh(&pm->lock); 191 } 192 return; 193 } 194 195 if (!READ_ONCE(pm->work_pending) && !update_subflows) 196 return; 197 198 spin_lock_bh(&pm->lock); 199 if (update_subflows) 200 __mptcp_pm_close_subflow(msk); 201 202 /* Even if this subflow is not really established, tell the PM to try 203 * to pick the next ones, if possible. 204 */ 205 if (mptcp_pm_nl_check_work_pending(msk)) 206 mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED); 207 208 spin_unlock_bh(&pm->lock); 209 } 210 211 void mptcp_pm_add_addr_received(const struct sock *ssk, 212 const struct mptcp_addr_info *addr) 213 { 214 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 215 struct mptcp_sock *msk = mptcp_sk(subflow->conn); 216 struct mptcp_pm_data *pm = &msk->pm; 217 218 pr_debug("msk=%p remote_id=%d accept=%d\n", msk, addr->id, 219 READ_ONCE(pm->accept_addr)); 220 221 mptcp_event_addr_announced(ssk, addr); 222 223 spin_lock_bh(&pm->lock); 224 225 if (mptcp_pm_is_userspace(msk)) { 226 if (mptcp_userspace_pm_active(msk)) { 227 mptcp_pm_announce_addr(msk, addr, true); 228 mptcp_pm_add_addr_send_ack(msk); 229 } else { 230 __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); 231 } 232 /* id0 should not have a different address */ 233 } else if ((addr->id == 0 && !mptcp_pm_nl_is_init_remote_addr(msk, addr)) || 234 (addr->id > 0 && !READ_ONCE(pm->accept_addr))) { 235 mptcp_pm_announce_addr(msk, addr, true); 236 mptcp_pm_add_addr_send_ack(msk); 237 } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { 238 pm->remote = *addr; 239 } else { 240 __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); 241 } 242 243 spin_unlock_bh(&pm->lock); 244 } 245 246 void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, 247 const struct mptcp_addr_info *addr) 248 { 249 struct mptcp_pm_data *pm = &msk->pm; 250 251 pr_debug("msk=%p\n", msk); 252 253 spin_lock_bh(&pm->lock); 254 255 if (mptcp_lookup_anno_list_by_saddr(msk, addr) && READ_ONCE(pm->work_pending)) 256 mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED); 257 258 spin_unlock_bh(&pm->lock); 259 } 260 261 void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk) 262 { 263 if (!mptcp_pm_should_add_signal(msk)) 264 return; 265 266 mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK); 267 } 268 269 void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, 270 const struct mptcp_rm_list *rm_list) 271 { 272 struct mptcp_pm_data *pm = &msk->pm; 273 u8 i; 274 275 pr_debug("msk=%p remote_ids_nr=%d\n", msk, rm_list->nr); 276 277 for (i = 0; i < rm_list->nr; i++) 278 mptcp_event_addr_removed(msk, rm_list->ids[i]); 279 280 spin_lock_bh(&pm->lock); 281 if (mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED)) 282 pm->rm_list_rx = *rm_list; 283 else 284 __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_RMADDRDROP); 285 spin_unlock_bh(&pm->lock); 286 } 287 288 void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup) 289 { 290 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 291 struct sock *sk = subflow->conn; 292 struct mptcp_sock *msk; 293 294 pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup); 295 msk = mptcp_sk(sk); 296 if (subflow->backup != bkup) 297 subflow->backup = bkup; 298 299 mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC); 300 } 301 302 void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) 303 { 304 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 305 struct mptcp_sock *msk = mptcp_sk(subflow->conn); 306 307 pr_debug("fail_seq=%llu\n", fail_seq); 308 309 if (!READ_ONCE(msk->allow_infinite_fallback)) 310 return; 311 312 if (!subflow->fail_tout) { 313 pr_debug("send MP_FAIL response and infinite map\n"); 314 315 subflow->send_mp_fail = 1; 316 subflow->send_infinite_map = 1; 317 tcp_send_ack(sk); 318 } else { 319 pr_debug("MP_FAIL response received\n"); 320 WRITE_ONCE(subflow->fail_tout, 0); 321 } 322 } 323 324 /* path manager helpers */ 325 326 bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, 327 unsigned int opt_size, unsigned int remaining, 328 struct mptcp_addr_info *addr, bool *echo, 329 bool *drop_other_suboptions) 330 { 331 int ret = false; 332 u8 add_addr; 333 u8 family; 334 bool port; 335 336 spin_lock_bh(&msk->pm.lock); 337 338 /* double check after the lock is acquired */ 339 if (!mptcp_pm_should_add_signal(msk)) 340 goto out_unlock; 341 342 /* always drop every other options for pure ack ADD_ADDR; this is a 343 * plain dup-ack from TCP perspective. The other MPTCP-relevant info, 344 * if any, will be carried by the 'original' TCP ack 345 */ 346 if (skb && skb_is_tcp_pure_ack(skb)) { 347 remaining += opt_size; 348 *drop_other_suboptions = true; 349 } 350 351 *echo = mptcp_pm_should_add_signal_echo(msk); 352 port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port); 353 354 family = *echo ? msk->pm.remote.family : msk->pm.local.family; 355 if (remaining < mptcp_add_addr_len(family, *echo, port)) 356 goto out_unlock; 357 358 if (*echo) { 359 *addr = msk->pm.remote; 360 add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_ECHO); 361 } else { 362 *addr = msk->pm.local; 363 add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_SIGNAL); 364 } 365 WRITE_ONCE(msk->pm.addr_signal, add_addr); 366 ret = true; 367 368 out_unlock: 369 spin_unlock_bh(&msk->pm.lock); 370 return ret; 371 } 372 373 bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, 374 struct mptcp_rm_list *rm_list) 375 { 376 int ret = false, len; 377 u8 rm_addr; 378 379 spin_lock_bh(&msk->pm.lock); 380 381 /* double check after the lock is acquired */ 382 if (!mptcp_pm_should_rm_signal(msk)) 383 goto out_unlock; 384 385 rm_addr = msk->pm.addr_signal & ~BIT(MPTCP_RM_ADDR_SIGNAL); 386 len = mptcp_rm_addr_len(&msk->pm.rm_list_tx); 387 if (len < 0) { 388 WRITE_ONCE(msk->pm.addr_signal, rm_addr); 389 goto out_unlock; 390 } 391 if (remaining < len) 392 goto out_unlock; 393 394 *rm_list = msk->pm.rm_list_tx; 395 WRITE_ONCE(msk->pm.addr_signal, rm_addr); 396 ret = true; 397 398 out_unlock: 399 spin_unlock_bh(&msk->pm.lock); 400 return ret; 401 } 402 403 int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) 404 { 405 struct mptcp_addr_info skc_local; 406 struct mptcp_addr_info msk_local; 407 408 if (WARN_ON_ONCE(!msk)) 409 return -1; 410 411 /* The 0 ID mapping is defined by the first subflow, copied into the msk 412 * addr 413 */ 414 mptcp_local_address((struct sock_common *)msk, &msk_local); 415 mptcp_local_address((struct sock_common *)skc, &skc_local); 416 if (mptcp_addresses_equal(&msk_local, &skc_local, false)) 417 return 0; 418 419 if (mptcp_pm_is_userspace(msk)) 420 return mptcp_userspace_pm_get_local_id(msk, &skc_local); 421 return mptcp_pm_nl_get_local_id(msk, &skc_local); 422 } 423 424 bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc) 425 { 426 struct mptcp_addr_info skc_local; 427 428 mptcp_local_address((struct sock_common *)skc, &skc_local); 429 430 if (mptcp_pm_is_userspace(msk)) 431 return mptcp_userspace_pm_is_backup(msk, &skc_local); 432 433 return mptcp_pm_nl_is_backup(msk, &skc_local); 434 } 435 436 int mptcp_pm_get_addr(struct sk_buff *skb, struct genl_info *info) 437 { 438 if (info->attrs[MPTCP_PM_ATTR_TOKEN]) 439 return mptcp_userspace_pm_get_addr(skb, info); 440 return mptcp_pm_nl_get_addr(skb, info); 441 } 442 443 int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb) 444 { 445 const struct genl_info *info = genl_info_dump(cb); 446 447 if (info->attrs[MPTCP_PM_ATTR_TOKEN]) 448 return mptcp_userspace_pm_dump_addr(msg, cb); 449 return mptcp_pm_nl_dump_addr(msg, cb); 450 } 451 452 int mptcp_pm_set_flags(struct sk_buff *skb, struct genl_info *info) 453 { 454 if (info->attrs[MPTCP_PM_ATTR_TOKEN]) 455 return mptcp_userspace_pm_set_flags(skb, info); 456 return mptcp_pm_nl_set_flags(skb, info); 457 } 458 459 void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) 460 { 461 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); 462 u32 rcv_tstamp = READ_ONCE(tcp_sk(ssk)->rcv_tstamp); 463 464 /* keep track of rtx periods with no progress */ 465 if (!subflow->stale_count) { 466 subflow->stale_rcv_tstamp = rcv_tstamp; 467 subflow->stale_count++; 468 } else if (subflow->stale_rcv_tstamp == rcv_tstamp) { 469 if (subflow->stale_count < U8_MAX) 470 subflow->stale_count++; 471 mptcp_pm_nl_subflow_chk_stale(msk, ssk); 472 } else { 473 subflow->stale_count = 0; 474 mptcp_subflow_set_active(subflow); 475 } 476 } 477 478 /* if sk is ipv4 or ipv6_only allows only same-family local and remote addresses, 479 * otherwise allow any matching local/remote pair 480 */ 481 bool mptcp_pm_addr_families_match(const struct sock *sk, 482 const struct mptcp_addr_info *loc, 483 const struct mptcp_addr_info *rem) 484 { 485 bool mptcp_is_v4 = sk->sk_family == AF_INET; 486 487 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 488 bool loc_is_v4 = loc->family == AF_INET || ipv6_addr_v4mapped(&loc->addr6); 489 bool rem_is_v4 = rem->family == AF_INET || ipv6_addr_v4mapped(&rem->addr6); 490 491 if (mptcp_is_v4) 492 return loc_is_v4 && rem_is_v4; 493 494 if (ipv6_only_sock(sk)) 495 return !loc_is_v4 && !rem_is_v4; 496 497 return loc_is_v4 == rem_is_v4; 498 #else 499 return mptcp_is_v4 && loc->family == AF_INET && rem->family == AF_INET; 500 #endif 501 } 502 503 void mptcp_pm_data_reset(struct mptcp_sock *msk) 504 { 505 u8 pm_type = mptcp_get_pm_type(sock_net((struct sock *)msk)); 506 struct mptcp_pm_data *pm = &msk->pm; 507 508 pm->add_addr_signaled = 0; 509 pm->add_addr_accepted = 0; 510 pm->local_addr_used = 0; 511 pm->subflows = 0; 512 pm->rm_list_tx.nr = 0; 513 pm->rm_list_rx.nr = 0; 514 WRITE_ONCE(pm->pm_type, pm_type); 515 516 if (pm_type == MPTCP_PM_TYPE_KERNEL) { 517 bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk); 518 519 /* pm->work_pending must be only be set to 'true' when 520 * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL 521 */ 522 WRITE_ONCE(pm->work_pending, 523 (!!mptcp_pm_get_local_addr_max(msk) && 524 subflows_allowed) || 525 !!mptcp_pm_get_add_addr_signal_max(msk)); 526 WRITE_ONCE(pm->accept_addr, 527 !!mptcp_pm_get_add_addr_accept_max(msk) && 528 subflows_allowed); 529 WRITE_ONCE(pm->accept_subflow, subflows_allowed); 530 } else { 531 WRITE_ONCE(pm->work_pending, 0); 532 WRITE_ONCE(pm->accept_addr, 0); 533 WRITE_ONCE(pm->accept_subflow, 0); 534 } 535 536 WRITE_ONCE(pm->addr_signal, 0); 537 WRITE_ONCE(pm->remote_deny_join_id0, false); 538 pm->status = 0; 539 bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); 540 } 541 542 void mptcp_pm_data_init(struct mptcp_sock *msk) 543 { 544 spin_lock_init(&msk->pm.lock); 545 INIT_LIST_HEAD(&msk->pm.anno_list); 546 INIT_LIST_HEAD(&msk->pm.userspace_pm_local_addr_list); 547 mptcp_pm_data_reset(msk); 548 } 549 550 void __init mptcp_pm_init(void) 551 { 552 mptcp_pm_nl_init(); 553 } 554