// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2019, Tessares SA.
 */

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#include <net/net_namespace.h>
#include <net/netns/generic.h>

#include "protocol.h"
#include "mib.h"

#define MPTCP_SYSCTL_PATH "net/mptcp"

static int mptcp_pernet_id;

#ifdef CONFIG_SYSCTL
static int mptcp_pm_type_max = __MPTCP_PM_TYPE_MAX;
#endif

struct mptcp_pernet {
#ifdef CONFIG_SYSCTL
	struct ctl_table_header *ctl_table_hdr;
#endif

	unsigned int add_addr_timeout;
	unsigned int blackhole_timeout;
	unsigned int close_timeout;
	unsigned int stale_loss_cnt;
	atomic_t active_disable_times;
	u8 syn_retrans_before_tcp_fallback;
	unsigned long active_disable_stamp;
	u8 mptcp_enabled;
	u8 checksum_enabled;
	u8 allow_join_initial_addr_port;
	u8 pm_type;
	char scheduler[MPTCP_SCHED_NAME_MAX];
	char path_manager[MPTCP_PM_NAME_MAX];
};

static struct mptcp_pernet *mptcp_get_pernet(const struct net *net)
{
	return net_generic(net, mptcp_pernet_id);
}

int mptcp_is_enabled(const struct net *net)
{
	return mptcp_get_pernet(net)->mptcp_enabled;
}

unsigned int mptcp_get_add_addr_timeout(const struct net *net)
{
	return mptcp_get_pernet(net)->add_addr_timeout;
}

int mptcp_is_checksum_enabled(const struct net *net)
{
	return mptcp_get_pernet(net)->checksum_enabled;
}

int mptcp_allow_join_id0(const struct net *net)
{
	return mptcp_get_pernet(net)->allow_join_initial_addr_port;
}

unsigned int mptcp_stale_loss_cnt(const struct net *net)
{
	return mptcp_get_pernet(net)->stale_loss_cnt;
}

unsigned int mptcp_close_timeout(const struct sock *sk)
{
	if (sock_flag(sk, SOCK_DEAD))
		return TCP_TIMEWAIT_LEN;
	return mptcp_get_pernet(sock_net(sk))->close_timeout;
}

int mptcp_get_pm_type(const struct net *net)
{
	return mptcp_get_pernet(net)->pm_type;
}

const char *mptcp_get_path_manager(const struct net *net)
{
	return mptcp_get_pernet(net)->path_manager;
}

const char *mptcp_get_scheduler(const struct net *net)
{
	return mptcp_get_pernet(net)->scheduler;
}

static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
	pernet->mptcp_enabled = 1;
	pernet->add_addr_timeout = TCP_RTO_MAX;
	pernet->blackhole_timeout = 3600;
	pernet->syn_retrans_before_tcp_fallback = 2;
	atomic_set(&pernet->active_disable_times, 0);
	pernet->close_timeout = TCP_TIMEWAIT_LEN;
	pernet->checksum_enabled = 0;
	pernet->allow_join_initial_addr_port = 1;
	pernet->stale_loss_cnt = 4;
	pernet->pm_type = MPTCP_PM_TYPE_KERNEL;
	strscpy(pernet->scheduler, "default", sizeof(pernet->scheduler));
	strscpy(pernet->path_manager, "kernel", sizeof(pernet->path_manager));
}
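/* Unit note (editorial, illustrative): add_addr_timeout and close_timeout
 * are stored in jiffies (TCP_RTO_MAX and TCP_TIMEWAIT_LEN above, exposed
 * via proc_dointvec_jiffies, which converts from seconds at the sysctl
 * boundary), while blackhole_timeout is stored in plain seconds and only
 * multiplied by HZ where it is consumed, in mptcp_active_should_disable().
 * E.g. assuming HZ == 1000, the defaults are close_timeout == 60 * HZ
 * jiffies but blackhole_timeout == 3600 seconds.
 */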
#ifdef CONFIG_SYSCTL
static int mptcp_set_scheduler(char *scheduler, const char *name)
{
	struct mptcp_sched_ops *sched;
	int ret = 0;

	rcu_read_lock();
	sched = mptcp_sched_find(name);
	if (sched)
		strscpy(scheduler, name, MPTCP_SCHED_NAME_MAX);
	else
		ret = -ENOENT;
	rcu_read_unlock();

	return ret;
}

static int proc_scheduler(const struct ctl_table *ctl, int write,
			  void *buffer, size_t *lenp, loff_t *ppos)
{
	char (*scheduler)[MPTCP_SCHED_NAME_MAX] = ctl->data;
	char val[MPTCP_SCHED_NAME_MAX];
	struct ctl_table tbl = {
		.data = val,
		.maxlen = MPTCP_SCHED_NAME_MAX,
	};
	int ret;

	strscpy(val, *scheduler, MPTCP_SCHED_NAME_MAX);

	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
	if (write && ret == 0)
		ret = mptcp_set_scheduler(*scheduler, val);

	return ret;
}
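/* Write-path sketch (editorial): a write such as
 * `echo default > /proc/sys/net/mptcp/scheduler` lands in proc_scheduler(),
 * proc_dostring() copies the user string into the val[] scratch buffer, and
 * mptcp_set_scheduler() commits it to the pernet setting only if
 * mptcp_sched_find() knows the name; an unknown name leaves the previous
 * value untouched and the write fails with -ENOENT.
 */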
static int proc_available_schedulers(const struct ctl_table *ctl,
				     int write, void *buffer,
				     size_t *lenp, loff_t *ppos)
{
	struct ctl_table tbl = { .maxlen = MPTCP_SCHED_BUF_MAX, };
	int ret;

	tbl.data = kmalloc(tbl.maxlen, GFP_USER);
	if (!tbl.data)
		return -ENOMEM;

	mptcp_get_available_schedulers(tbl.data, MPTCP_SCHED_BUF_MAX);
	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
	kfree(tbl.data);

	return ret;
}

static int proc_blackhole_detect_timeout(const struct ctl_table *table,
					 int write, void *buffer, size_t *lenp,
					 loff_t *ppos)
{
	struct mptcp_pernet *pernet = container_of(table->data,
						   struct mptcp_pernet,
						   blackhole_timeout);
	int ret;

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (write && ret == 0)
		atomic_set(&pernet->active_disable_times, 0);

	return ret;
}

static int mptcp_set_path_manager(char *path_manager, const char *name)
{
	struct mptcp_pm_ops *pm_ops;
	int ret = 0;

	rcu_read_lock();
	pm_ops = mptcp_pm_find(name);
	if (pm_ops)
		strscpy(path_manager, name, MPTCP_PM_NAME_MAX);
	else
		ret = -ENOENT;
	rcu_read_unlock();

	return ret;
}

static int proc_path_manager(const struct ctl_table *ctl, int write,
			     void *buffer, size_t *lenp, loff_t *ppos)
{
	struct mptcp_pernet *pernet = container_of(ctl->data,
						   struct mptcp_pernet,
						   path_manager);
	char (*path_manager)[MPTCP_PM_NAME_MAX] = ctl->data;
	char pm_name[MPTCP_PM_NAME_MAX];
	const struct ctl_table tbl = {
		.data = pm_name,
		.maxlen = MPTCP_PM_NAME_MAX,
	};
	int ret;

	strscpy(pm_name, *path_manager, MPTCP_PM_NAME_MAX);

	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
	if (write && ret == 0) {
		ret = mptcp_set_path_manager(*path_manager, pm_name);
		if (ret == 0) {
			u8 pm_type = __MPTCP_PM_TYPE_NR;

			if (strncmp(pm_name, "kernel", MPTCP_PM_NAME_MAX) == 0)
				pm_type = MPTCP_PM_TYPE_KERNEL;
			else if (strncmp(pm_name, "userspace", MPTCP_PM_NAME_MAX) == 0)
				pm_type = MPTCP_PM_TYPE_USERSPACE;
			pernet->pm_type = pm_type;
		}
	}

	return ret;
}

static int proc_pm_type(const struct ctl_table *ctl, int write,
			void *buffer, size_t *lenp, loff_t *ppos)
{
	struct mptcp_pernet *pernet = container_of(ctl->data,
						   struct mptcp_pernet,
						   pm_type);
	int ret;

	ret = proc_dou8vec_minmax(ctl, write, buffer, lenp, ppos);
	if (write && ret == 0) {
		u8 pm_type = READ_ONCE(*(u8 *)ctl->data);
		char *pm_name = "";

		if (pm_type == MPTCP_PM_TYPE_KERNEL)
			pm_name = "kernel";
		else if (pm_type == MPTCP_PM_TYPE_USERSPACE)
			pm_name = "userspace";
		mptcp_set_path_manager(pernet->path_manager, pm_name);
	}

	return ret;
}

static int proc_available_path_managers(const struct ctl_table *ctl,
					int write, void *buffer,
					size_t *lenp, loff_t *ppos)
{
	struct ctl_table tbl = { .maxlen = MPTCP_PM_BUF_MAX, };
	int ret;

	tbl.data = kmalloc(tbl.maxlen, GFP_USER);
	if (!tbl.data)
		return -ENOMEM;

	mptcp_pm_get_available(tbl.data, MPTCP_PM_BUF_MAX);
	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
	kfree(tbl.data);

	return ret;
}

static struct ctl_table mptcp_sysctl_table[] = {
	{
		.procname = "enabled",
		.maxlen = sizeof(u8),
		.mode = 0644,
		/* users with CAP_NET_ADMIN or root (either one is enough,
		 * both are not required) can change this value, the same as
		 * other sysctls in the 'net' tree.
		 */
		.proc_handler = proc_dou8vec_minmax,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE
	},
	{
		.procname = "add_addr_timeout",
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "checksum_enabled",
		.maxlen = sizeof(u8),
		.mode = 0644,
		.proc_handler = proc_dou8vec_minmax,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE
	},
	{
		.procname = "allow_join_initial_addr_port",
		.maxlen = sizeof(u8),
		.mode = 0644,
		.proc_handler = proc_dou8vec_minmax,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE
	},
	{
		.procname = "stale_loss_cnt",
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = proc_douintvec_minmax,
	},
	{
		.procname = "pm_type",
		.maxlen = sizeof(u8),
		.mode = 0644,
		.proc_handler = proc_pm_type,
		.extra1 = SYSCTL_ZERO,
		.extra2 = &mptcp_pm_type_max
	},
	{
		.procname = "scheduler",
		.maxlen = MPTCP_SCHED_NAME_MAX,
		.mode = 0644,
		.proc_handler = proc_scheduler,
	},
	{
		.procname = "available_schedulers",
		.maxlen = MPTCP_SCHED_BUF_MAX,
		.mode = 0444,
		.proc_handler = proc_available_schedulers,
	},
	{
		.procname = "close_timeout",
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "blackhole_timeout",
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = proc_blackhole_detect_timeout,
		.extra1 = SYSCTL_ZERO,
	},
	{
		.procname = "syn_retrans_before_tcp_fallback",
		.maxlen = sizeof(u8),
		.mode = 0644,
		.proc_handler = proc_dou8vec_minmax,
	},
	{
		.procname = "path_manager",
		.maxlen = MPTCP_PM_NAME_MAX,
		.mode = 0644,
		.proc_handler = proc_path_manager,
	},
	{
		.procname = "available_path_managers",
		.maxlen = MPTCP_PM_BUF_MAX,
		.mode = 0444,
		.proc_handler = proc_available_path_managers,
	},
};

static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
{
	struct ctl_table_header *hdr;
	struct ctl_table *table;

	table = mptcp_sysctl_table;
	if (!net_eq(net, &init_net)) {
		table = kmemdup(table, sizeof(mptcp_sysctl_table), GFP_KERNEL);
		if (!table)
			goto err_alloc;
	}

	table[0].data = &pernet->mptcp_enabled;
	table[1].data = &pernet->add_addr_timeout;
	table[2].data = &pernet->checksum_enabled;
	table[3].data = &pernet->allow_join_initial_addr_port;
	table[4].data = &pernet->stale_loss_cnt;
	table[5].data = &pernet->pm_type;
	table[6].data = &pernet->scheduler;
	/* table[7] is for available_schedulers which is read-only info */
	table[8].data = &pernet->close_timeout;
	table[9].data = &pernet->blackhole_timeout;
	table[10].data = &pernet->syn_retrans_before_tcp_fallback;
	table[11].data = &pernet->path_manager;
	/* table[12] is for available_path_managers which is read-only info */

	hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table,
				     ARRAY_SIZE(mptcp_sysctl_table));
	if (!hdr)
		goto err_reg;

	pernet->ctl_table_hdr = hdr;

	return 0;

err_reg:
	if (!net_eq(net, &init_net))
		kfree(table);
err_alloc:
	return -ENOMEM;
}
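/* Editorial note: the table[i].data assignments above are coupled by index
 * to the entries of mptcp_sysctl_table[]; adding, removing or reordering an
 * entry requires updating both places together. A compile-time guard along
 * these lines (a sketch, not part of the original code) would catch a size
 * mismatch when the table grows:
 *
 *	BUILD_BUG_ON(ARRAY_SIZE(mptcp_sysctl_table) != 13);
 */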
static void mptcp_pernet_del_table(struct mptcp_pernet *pernet)
{
	const struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg;

	unregister_net_sysctl_table(pernet->ctl_table_hdr);

	kfree(table);
}

#else

static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
{
	return 0;
}

static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) {}

#endif /* CONFIG_SYSCTL */

/* The following code block deals with middlebox issues with MPTCP, similar
 * to what is done with TFO.
 * The idea is to disable active MPTCP globally when SYNs carrying the
 * MP_CAPABLE option (SYN+MPC) are dropped while plain SYNs are not. In that
 * case, active-side MPTCP is first disabled globally for 1h. If it happens
 * again, it is disabled for 2h, then 4h, 8h, ...
 * The timeout is reset back to 1h when a successful active MPTCP connection
 * is fully established.
 */

/* Disable active MPTCP and record current jiffies and active_disable_times */
void mptcp_active_disable(struct sock *sk)
{
	struct net *net = sock_net(sk);
	struct mptcp_pernet *pernet;

	pernet = mptcp_get_pernet(net);

	if (!READ_ONCE(pernet->blackhole_timeout))
		return;

	/* Paired with READ_ONCE() in mptcp_active_should_disable() */
	WRITE_ONCE(pernet->active_disable_stamp, jiffies);

	/* Paired with smp_rmb() in mptcp_active_should_disable().
	 * We want pernet->active_disable_stamp to be updated first.
	 */
	smp_mb__before_atomic();
	atomic_inc(&pernet->active_disable_times);

	MPTCP_INC_STATS(net, MPTCP_MIB_BLACKHOLE);
}

/* Calculate timeout for MPTCP active disable
 * Return true if we are still in the active MPTCP disable period
 * Return false if timeout already expired and we should use active MPTCP
 */
bool mptcp_active_should_disable(struct sock *ssk)
{
	struct net *net = sock_net(ssk);
	unsigned int blackhole_timeout;
	struct mptcp_pernet *pernet;
	unsigned long timeout;
	int disable_times;
	int multiplier;

	pernet = mptcp_get_pernet(net);
	blackhole_timeout = READ_ONCE(pernet->blackhole_timeout);

	if (!blackhole_timeout)
		return false;

	disable_times = atomic_read(&pernet->active_disable_times);
	if (!disable_times)
		return false;

	/* Paired with smp_mb__before_atomic() in mptcp_active_disable() */
	smp_rmb();

	/* Limit timeout to max: 2^6 * initial timeout */
	multiplier = 1 << min(disable_times - 1, 6);

	/* Paired with the WRITE_ONCE() in mptcp_active_disable() */
	timeout = READ_ONCE(pernet->active_disable_stamp) +
		  multiplier * blackhole_timeout * HZ;

	return time_before(jiffies, timeout);
}
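/* Worked example (editorial, assuming the default blackhole_timeout of
 * 3600s): after the third detection, disable_times == 3, so
 * multiplier == 1 << min(3 - 1, 6) == 4 and active MPTCP stays disabled for
 * 4 * 3600s == 4h counted from the last active_disable_stamp. The shift
 * saturates at 1 << 6 == 64, i.e. at most 64h with the default timeout, no
 * matter how many detections accumulate.
 */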
/* Enable active MPTCP and reset active_disable_times if needed */
void mptcp_active_enable(struct sock *sk)
{
	struct mptcp_pernet *pernet = mptcp_get_pernet(sock_net(sk));

	if (atomic_read(&pernet->active_disable_times)) {
		struct dst_entry *dst = sk_dst_get(sk);

		if (dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))
			atomic_set(&pernet->active_disable_times, 0);

		/* sk_dst_get() took a reference on the dst entry */
		dst_release(dst);
	}
}

/* Check the number of retransmissions, and fallback to TCP if needed */
void mptcp_active_detect_blackhole(struct sock *ssk, bool expired)
{
	struct mptcp_subflow_context *subflow;
	u8 timeouts, to_max;
	struct net *net;

	/* Only check MPTCP SYN ... */
	if (likely(!sk_is_mptcp(ssk) || ssk->sk_state != TCP_SYN_SENT))
		return;

	subflow = mptcp_subflow_ctx(ssk);

	/* ... + MP_CAPABLE */
	if (!subflow->request_mptcp) {
		/* Mark as blackhole only if the 1st non-MPTCP SYN is accepted */
		subflow->mpc_drop = 0;
		return;
	}

	net = sock_net(ssk);
	timeouts = inet_csk(ssk)->icsk_retransmits;
	to_max = mptcp_get_pernet(net)->syn_retrans_before_tcp_fallback;

	if (timeouts == to_max || (timeouts < to_max && expired)) {
		MPTCP_INC_STATS(net, MPTCP_MIB_MPCAPABLEACTIVEDROP);
		subflow->mpc_drop = 1;
		mptcp_subflow_early_fallback(mptcp_sk(subflow->conn), subflow);
	}
}

static int __net_init mptcp_net_init(struct net *net)
{
	struct mptcp_pernet *pernet = mptcp_get_pernet(net);

	mptcp_pernet_set_defaults(pernet);

	return mptcp_pernet_new_table(net, pernet);
}

/* Note: the callback will only be called once per extra netns */
static void __net_exit mptcp_net_exit(struct net *net)
{
	struct mptcp_pernet *pernet = mptcp_get_pernet(net);

	mptcp_pernet_del_table(pernet);
}

static struct pernet_operations mptcp_pernet_ops = {
	.init = mptcp_net_init,
	.exit = mptcp_net_exit,
	.id = &mptcp_pernet_id,
	.size = sizeof(struct mptcp_pernet),
};

void __init mptcp_init(void)
{
	mptcp_join_cookie_init();
	mptcp_proto_init();

	if (register_pernet_subsys(&mptcp_pernet_ops) < 0)
		panic("Failed to register MPTCP pernet subsystem.\n");
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
int __init mptcpv6_init(void)
{
	return mptcp_proto_v6_init();
}
#endif