1 // SPDX-License-Identifier: GPL-2.0 2 /* Multipath TCP 3 * 4 * Copyright (c) 2019, Tessares SA. 5 */ 6 7 #ifdef CONFIG_SYSCTL 8 #include <linux/sysctl.h> 9 #endif 10 11 #include <net/net_namespace.h> 12 #include <net/netns/generic.h> 13 14 #include "protocol.h" 15 #include "mib.h" 16 17 #define MPTCP_SYSCTL_PATH "net/mptcp" 18 19 static int mptcp_pernet_id; 20 21 #ifdef CONFIG_SYSCTL 22 static int mptcp_pm_type_max = __MPTCP_PM_TYPE_MAX; 23 #endif 24 25 struct mptcp_pernet { 26 #ifdef CONFIG_SYSCTL 27 struct ctl_table_header *ctl_table_hdr; 28 #endif 29 30 unsigned int add_addr_timeout; 31 unsigned int blackhole_timeout; 32 unsigned int close_timeout; 33 unsigned int stale_loss_cnt; 34 atomic_t active_disable_times; 35 unsigned long active_disable_stamp; 36 u8 syn_retrans_before_tcp_fallback; 37 u8 mptcp_enabled; 38 u8 checksum_enabled; 39 u8 allow_join_initial_addr_port; 40 u8 pm_type; 41 u8 add_addr_v6_port_drop_ts; 42 char scheduler[MPTCP_SCHED_NAME_MAX]; 43 char path_manager[MPTCP_PM_NAME_MAX]; 44 }; 45 46 static struct mptcp_pernet *mptcp_get_pernet(const struct net *net) 47 { 48 return net_generic(net, mptcp_pernet_id); 49 } 50 51 int mptcp_is_enabled(const struct net *net) 52 { 53 return mptcp_get_pernet(net)->mptcp_enabled; 54 } 55 56 unsigned int mptcp_get_add_addr_timeout(const struct net *net) 57 { 58 return mptcp_get_pernet(net)->add_addr_timeout; 59 } 60 61 int mptcp_is_checksum_enabled(const struct net *net) 62 { 63 return mptcp_get_pernet(net)->checksum_enabled; 64 } 65 66 int mptcp_allow_join_id0(const struct net *net) 67 { 68 return mptcp_get_pernet(net)->allow_join_initial_addr_port; 69 } 70 71 unsigned int mptcp_stale_loss_cnt(const struct net *net) 72 { 73 return mptcp_get_pernet(net)->stale_loss_cnt; 74 } 75 76 unsigned int mptcp_close_timeout(const struct sock *sk) 77 { 78 if (sock_flag(sk, SOCK_DEAD)) 79 return TCP_TIMEWAIT_LEN; 80 return mptcp_get_pernet(sock_net(sk))->close_timeout; 81 } 82 83 int mptcp_get_pm_type(const struct net *net) 84 { 85 return mptcp_get_pernet(net)->pm_type; 86 } 87 88 const char *mptcp_get_path_manager(const struct net *net) 89 { 90 return mptcp_get_pernet(net)->path_manager; 91 } 92 93 const char *mptcp_get_scheduler(const struct net *net) 94 { 95 return mptcp_get_pernet(net)->scheduler; 96 } 97 98 unsigned int mptcp_add_addr_v6_port_drop_ts(const struct net *net) 99 { 100 return READ_ONCE(mptcp_get_pernet(net)->add_addr_v6_port_drop_ts); 101 } 102 103 static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) 104 { 105 pernet->mptcp_enabled = 1; 106 pernet->add_addr_timeout = TCP_RTO_MAX; 107 pernet->blackhole_timeout = 3600; 108 pernet->syn_retrans_before_tcp_fallback = 2; 109 atomic_set(&pernet->active_disable_times, 0); 110 pernet->close_timeout = TCP_TIMEWAIT_LEN; 111 pernet->checksum_enabled = 0; 112 pernet->allow_join_initial_addr_port = 1; 113 pernet->stale_loss_cnt = 4; 114 pernet->pm_type = MPTCP_PM_TYPE_KERNEL; 115 strscpy(pernet->scheduler, "default", sizeof(pernet->scheduler)); 116 strscpy(pernet->path_manager, "kernel", sizeof(pernet->path_manager)); 117 pernet->add_addr_v6_port_drop_ts = 1; 118 } 119 120 #ifdef CONFIG_SYSCTL 121 static int mptcp_set_scheduler(char *scheduler, const char *name) 122 { 123 struct mptcp_sched_ops *sched; 124 int ret = 0; 125 126 rcu_read_lock(); 127 sched = mptcp_sched_find(name); 128 if (sched) 129 strscpy(scheduler, name, MPTCP_SCHED_NAME_MAX); 130 else 131 ret = -ENOENT; 132 rcu_read_unlock(); 133 134 return ret; 135 } 136 137 static int proc_scheduler(const struct ctl_table *ctl, int write, 138 void *buffer, size_t *lenp, loff_t *ppos) 139 { 140 char (*scheduler)[MPTCP_SCHED_NAME_MAX] = ctl->data; 141 char val[MPTCP_SCHED_NAME_MAX]; 142 struct ctl_table tbl = { 143 .data = val, 144 .maxlen = MPTCP_SCHED_NAME_MAX, 145 }; 146 int ret; 147 148 strscpy(val, *scheduler, MPTCP_SCHED_NAME_MAX); 149 150 ret = proc_dostring(&tbl, write, buffer, lenp, ppos); 151 if (write && ret == 0) 152 ret = mptcp_set_scheduler(*scheduler, val); 153 154 return ret; 155 } 156 157 static int proc_available_schedulers(const struct ctl_table *ctl, 158 int write, void *buffer, 159 size_t *lenp, loff_t *ppos) 160 { 161 struct ctl_table tbl = { .maxlen = MPTCP_SCHED_BUF_MAX, }; 162 int ret; 163 164 tbl.data = kmalloc(tbl.maxlen, GFP_USER); 165 if (!tbl.data) 166 return -ENOMEM; 167 168 mptcp_get_available_schedulers(tbl.data, MPTCP_SCHED_BUF_MAX); 169 ret = proc_dostring(&tbl, write, buffer, lenp, ppos); 170 kfree(tbl.data); 171 172 return ret; 173 } 174 175 static int proc_blackhole_detect_timeout(const struct ctl_table *table, 176 int write, void *buffer, size_t *lenp, 177 loff_t *ppos) 178 { 179 struct mptcp_pernet *pernet = container_of(table->data, 180 struct mptcp_pernet, 181 blackhole_timeout); 182 int ret; 183 184 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 185 if (write && ret == 0) 186 atomic_set(&pernet->active_disable_times, 0); 187 188 return ret; 189 } 190 191 static int mptcp_set_path_manager(char *path_manager, const char *name) 192 { 193 struct mptcp_pm_ops *pm_ops; 194 int ret = 0; 195 196 rcu_read_lock(); 197 pm_ops = mptcp_pm_find(name); 198 if (pm_ops) 199 strscpy(path_manager, name, MPTCP_PM_NAME_MAX); 200 else 201 ret = -ENOENT; 202 rcu_read_unlock(); 203 204 return ret; 205 } 206 207 static int proc_path_manager(const struct ctl_table *ctl, int write, 208 void *buffer, size_t *lenp, loff_t *ppos) 209 { 210 struct mptcp_pernet *pernet = container_of(ctl->data, 211 struct mptcp_pernet, 212 path_manager); 213 char (*path_manager)[MPTCP_PM_NAME_MAX] = ctl->data; 214 char pm_name[MPTCP_PM_NAME_MAX]; 215 const struct ctl_table tbl = { 216 .data = pm_name, 217 .maxlen = MPTCP_PM_NAME_MAX, 218 }; 219 int ret; 220 221 strscpy(pm_name, *path_manager, MPTCP_PM_NAME_MAX); 222 223 ret = proc_dostring(&tbl, write, buffer, lenp, ppos); 224 if (write && ret == 0) { 225 ret = mptcp_set_path_manager(*path_manager, pm_name); 226 if (ret == 0) { 227 u8 pm_type = __MPTCP_PM_TYPE_NR; 228 229 if (strncmp(pm_name, "kernel", MPTCP_PM_NAME_MAX) == 0) 230 pm_type = MPTCP_PM_TYPE_KERNEL; 231 else if (strncmp(pm_name, "userspace", MPTCP_PM_NAME_MAX) == 0) 232 pm_type = MPTCP_PM_TYPE_USERSPACE; 233 pernet->pm_type = pm_type; 234 } 235 } 236 237 return ret; 238 } 239 240 static int proc_pm_type(const struct ctl_table *ctl, int write, 241 void *buffer, size_t *lenp, loff_t *ppos) 242 { 243 struct mptcp_pernet *pernet = container_of(ctl->data, 244 struct mptcp_pernet, 245 pm_type); 246 int ret; 247 248 ret = proc_dou8vec_minmax(ctl, write, buffer, lenp, ppos); 249 if (write && ret == 0) { 250 u8 pm_type = READ_ONCE(*(u8 *)ctl->data); 251 char *pm_name = ""; 252 253 if (pm_type == MPTCP_PM_TYPE_KERNEL) 254 pm_name = "kernel"; 255 else if (pm_type == MPTCP_PM_TYPE_USERSPACE) 256 pm_name = "userspace"; 257 mptcp_set_path_manager(pernet->path_manager, pm_name); 258 } 259 260 return ret; 261 } 262 263 static int proc_available_path_managers(const struct ctl_table *ctl, 264 int write, void *buffer, 265 size_t *lenp, loff_t *ppos) 266 { 267 struct ctl_table tbl = { .maxlen = MPTCP_PM_BUF_MAX, }; 268 int ret; 269 270 tbl.data = kmalloc(tbl.maxlen, GFP_USER); 271 if (!tbl.data) 272 return -ENOMEM; 273 274 mptcp_pm_get_available(tbl.data, MPTCP_PM_BUF_MAX); 275 ret = proc_dostring(&tbl, write, buffer, lenp, ppos); 276 kfree(tbl.data); 277 278 return ret; 279 } 280 281 static struct ctl_table mptcp_sysctl_table[] = { 282 { 283 .procname = "enabled", 284 .maxlen = sizeof(u8), 285 .mode = 0644, 286 /* users with CAP_NET_ADMIN or root (not and) can change this 287 * value, same as other sysctl or the 'net' tree. 288 */ 289 .proc_handler = proc_dou8vec_minmax, 290 .extra1 = SYSCTL_ZERO, 291 .extra2 = SYSCTL_ONE 292 }, 293 { 294 .procname = "add_addr_timeout", 295 .maxlen = sizeof(unsigned int), 296 .mode = 0644, 297 .proc_handler = proc_dointvec_jiffies, 298 }, 299 { 300 .procname = "checksum_enabled", 301 .maxlen = sizeof(u8), 302 .mode = 0644, 303 .proc_handler = proc_dou8vec_minmax, 304 .extra1 = SYSCTL_ZERO, 305 .extra2 = SYSCTL_ONE 306 }, 307 { 308 .procname = "allow_join_initial_addr_port", 309 .maxlen = sizeof(u8), 310 .mode = 0644, 311 .proc_handler = proc_dou8vec_minmax, 312 .extra1 = SYSCTL_ZERO, 313 .extra2 = SYSCTL_ONE 314 }, 315 { 316 .procname = "stale_loss_cnt", 317 .maxlen = sizeof(unsigned int), 318 .mode = 0644, 319 .proc_handler = proc_douintvec_minmax, 320 }, 321 { 322 .procname = "pm_type", 323 .maxlen = sizeof(u8), 324 .mode = 0644, 325 .proc_handler = proc_pm_type, 326 .extra1 = SYSCTL_ZERO, 327 .extra2 = &mptcp_pm_type_max 328 }, 329 { 330 .procname = "scheduler", 331 .maxlen = MPTCP_SCHED_NAME_MAX, 332 .mode = 0644, 333 .proc_handler = proc_scheduler, 334 }, 335 { 336 .procname = "available_schedulers", 337 .maxlen = MPTCP_SCHED_BUF_MAX, 338 .mode = 0444, 339 .proc_handler = proc_available_schedulers, 340 }, 341 { 342 .procname = "close_timeout", 343 .maxlen = sizeof(unsigned int), 344 .mode = 0644, 345 .proc_handler = proc_dointvec_jiffies, 346 }, 347 { 348 .procname = "blackhole_timeout", 349 .maxlen = sizeof(unsigned int), 350 .mode = 0644, 351 .proc_handler = proc_blackhole_detect_timeout, 352 .extra1 = SYSCTL_ZERO, 353 }, 354 { 355 .procname = "syn_retrans_before_tcp_fallback", 356 .maxlen = sizeof(u8), 357 .mode = 0644, 358 .proc_handler = proc_dou8vec_minmax, 359 }, 360 { 361 .procname = "path_manager", 362 .maxlen = MPTCP_PM_NAME_MAX, 363 .mode = 0644, 364 .proc_handler = proc_path_manager, 365 }, 366 { 367 .procname = "available_path_managers", 368 .maxlen = MPTCP_PM_BUF_MAX, 369 .mode = 0444, 370 .proc_handler = proc_available_path_managers, 371 }, 372 { 373 .procname = "add_addr_v6_port_drop_ts", 374 .maxlen = sizeof(u8), 375 .mode = 0644, 376 .proc_handler = proc_dou8vec_minmax, 377 .extra1 = SYSCTL_ZERO, 378 .extra2 = SYSCTL_ONE 379 }, 380 }; 381 382 static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) 383 { 384 struct ctl_table_header *hdr; 385 struct ctl_table *table; 386 387 table = mptcp_sysctl_table; 388 if (!net_eq(net, &init_net)) { 389 table = kmemdup(table, sizeof(mptcp_sysctl_table), GFP_KERNEL); 390 if (!table) 391 goto err_alloc; 392 } 393 394 table[0].data = &pernet->mptcp_enabled; 395 table[1].data = &pernet->add_addr_timeout; 396 table[2].data = &pernet->checksum_enabled; 397 table[3].data = &pernet->allow_join_initial_addr_port; 398 table[4].data = &pernet->stale_loss_cnt; 399 table[5].data = &pernet->pm_type; 400 table[6].data = &pernet->scheduler; 401 /* table[7] is for available_schedulers which is read-only info */ 402 table[8].data = &pernet->close_timeout; 403 table[9].data = &pernet->blackhole_timeout; 404 table[10].data = &pernet->syn_retrans_before_tcp_fallback; 405 table[11].data = &pernet->path_manager; 406 /* table[12] is for available_path_managers which is read-only info */ 407 table[13].data = &pernet->add_addr_v6_port_drop_ts; 408 409 hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table, 410 ARRAY_SIZE(mptcp_sysctl_table)); 411 if (!hdr) 412 goto err_reg; 413 414 pernet->ctl_table_hdr = hdr; 415 416 return 0; 417 418 err_reg: 419 if (!net_eq(net, &init_net)) 420 kfree(table); 421 err_alloc: 422 return -ENOMEM; 423 } 424 425 static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) 426 { 427 const struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg; 428 429 unregister_net_sysctl_table(pernet->ctl_table_hdr); 430 431 kfree(table); 432 } 433 434 #else 435 436 static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) 437 { 438 return 0; 439 } 440 441 static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) {} 442 443 #endif /* CONFIG_SYSCTL */ 444 445 /* The following code block is to deal with middle box issues with MPTCP, 446 * similar to what is done with TFO. 447 * The proposed solution is to disable active MPTCP globally when SYN+MPC are 448 * dropped, while SYN without MPC aren't. In this case, active side MPTCP is 449 * disabled globally for 1hr at first. Then if it happens again, it is disabled 450 * for 2h, then 4h, 8h, ... 451 * The timeout is reset back to 1hr when a successful active MPTCP connection is 452 * fully established. 453 */ 454 455 /* Disable active MPTCP and record current jiffies and active_disable_times */ 456 void mptcp_active_disable(struct sock *sk) 457 { 458 struct net *net = sock_net(sk); 459 struct mptcp_pernet *pernet; 460 461 pernet = mptcp_get_pernet(net); 462 463 if (!READ_ONCE(pernet->blackhole_timeout)) 464 return; 465 466 /* Paired with READ_ONCE() in mptcp_active_should_disable() */ 467 WRITE_ONCE(pernet->active_disable_stamp, jiffies); 468 469 /* Paired with smp_rmb() in mptcp_active_should_disable(). 470 * We want pernet->active_disable_stamp to be updated first. 471 */ 472 smp_mb__before_atomic(); 473 atomic_inc(&pernet->active_disable_times); 474 475 MPTCP_INC_STATS(net, MPTCP_MIB_BLACKHOLE); 476 } 477 478 /* Calculate timeout for MPTCP active disable 479 * Return true if we are still in the active MPTCP disable period 480 * Return false if timeout already expired and we should use active MPTCP 481 */ 482 bool mptcp_active_should_disable(struct sock *ssk) 483 { 484 struct net *net = sock_net(ssk); 485 unsigned int blackhole_timeout; 486 struct mptcp_pernet *pernet; 487 unsigned long timeout; 488 int disable_times; 489 int multiplier; 490 491 pernet = mptcp_get_pernet(net); 492 blackhole_timeout = READ_ONCE(pernet->blackhole_timeout); 493 494 if (!blackhole_timeout) 495 return false; 496 497 disable_times = atomic_read(&pernet->active_disable_times); 498 if (!disable_times) 499 return false; 500 501 /* Paired with smp_mb__before_atomic() in mptcp_active_disable() */ 502 smp_rmb(); 503 504 /* Limit timeout to max: 2^6 * initial timeout */ 505 multiplier = 1 << min(disable_times - 1, 6); 506 507 /* Paired with the WRITE_ONCE() in mptcp_active_disable(). */ 508 timeout = READ_ONCE(pernet->active_disable_stamp) + 509 multiplier * blackhole_timeout * HZ; 510 511 return time_before(jiffies, timeout); 512 } 513 514 /* Enable active MPTCP and reset active_disable_times if needed */ 515 void mptcp_active_enable(struct sock *sk) 516 { 517 struct mptcp_pernet *pernet = mptcp_get_pernet(sock_net(sk)); 518 519 if (atomic_read(&pernet->active_disable_times)) { 520 struct net_device *dev; 521 struct dst_entry *dst; 522 523 rcu_read_lock(); 524 dst = __sk_dst_get(sk); 525 dev = dst ? dst_dev_rcu(dst) : NULL; 526 if (!(dev && (dev->flags & IFF_LOOPBACK))) 527 atomic_set(&pernet->active_disable_times, 0); 528 rcu_read_unlock(); 529 } 530 } 531 532 /* Check the number of retransmissions, and fallback to TCP if needed */ 533 void mptcp_active_detect_blackhole(struct sock *ssk, bool expired) 534 { 535 struct mptcp_subflow_context *subflow; 536 u8 timeouts, to_max; 537 struct net *net; 538 539 /* Only check MPTCP SYN ... */ 540 if (likely(!sk_is_mptcp(ssk) || ssk->sk_state != TCP_SYN_SENT)) 541 return; 542 543 subflow = mptcp_subflow_ctx(ssk); 544 545 /* ... + MP_CAPABLE */ 546 if (!subflow->request_mptcp) { 547 /* Mark as blackhole iif the 1st non-MPTCP SYN is accepted */ 548 subflow->mpc_drop = 0; 549 return; 550 } 551 552 net = sock_net(ssk); 553 timeouts = inet_csk(ssk)->icsk_retransmits; 554 to_max = mptcp_get_pernet(net)->syn_retrans_before_tcp_fallback; 555 556 if (timeouts == to_max || (timeouts < to_max && expired)) { 557 subflow->mpc_drop = 1; 558 mptcp_early_fallback(mptcp_sk(subflow->conn), subflow, 559 MPTCP_MIB_MPCAPABLEACTIVEDROP); 560 } 561 } 562 563 static int __net_init mptcp_net_init(struct net *net) 564 { 565 struct mptcp_pernet *pernet = mptcp_get_pernet(net); 566 567 mptcp_pernet_set_defaults(pernet); 568 569 return mptcp_pernet_new_table(net, pernet); 570 } 571 572 /* Note: the callback will only be called per extra netns */ 573 static void __net_exit mptcp_net_exit(struct net *net) 574 { 575 struct mptcp_pernet *pernet = mptcp_get_pernet(net); 576 577 mptcp_pernet_del_table(pernet); 578 } 579 580 static struct pernet_operations mptcp_pernet_ops = { 581 .init = mptcp_net_init, 582 .exit = mptcp_net_exit, 583 .id = &mptcp_pernet_id, 584 .size = sizeof(struct mptcp_pernet), 585 }; 586 587 void __init mptcp_init(void) 588 { 589 mptcp_join_cookie_init(); 590 mptcp_proto_init(); 591 592 if (register_pernet_subsys(&mptcp_pernet_ops) < 0) 593 panic("Failed to register MPTCP pernet subsystem.\n"); 594 } 595 596 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 597 int __init mptcpv6_init(void) 598 { 599 int err; 600 601 err = mptcp_proto_v6_init(); 602 603 return err; 604 } 605 #endif 606