1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 
28 * 29 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 30 */ 31 32 #include <sys/cdefs.h> 33 __FBSDID("$FreeBSD$"); 34 35 #include "opt_compat.h" 36 #include "opt_inet.h" 37 #include "opt_inet6.h" 38 #include "opt_ipsec.h" 39 #include "opt_tcpdebug.h" 40 41 #include <sys/param.h> 42 #include <sys/systm.h> 43 #include <sys/callout.h> 44 #include <sys/eventhandler.h> 45 #include <sys/hhook.h> 46 #include <sys/kernel.h> 47 #include <sys/khelp.h> 48 #include <sys/sysctl.h> 49 #include <sys/jail.h> 50 #include <sys/malloc.h> 51 #include <sys/refcount.h> 52 #include <sys/mbuf.h> 53 #ifdef INET6 54 #include <sys/domain.h> 55 #endif 56 #include <sys/priv.h> 57 #include <sys/proc.h> 58 #include <sys/sdt.h> 59 #include <sys/socket.h> 60 #include <sys/socketvar.h> 61 #include <sys/protosw.h> 62 #include <sys/random.h> 63 64 #include <vm/uma.h> 65 66 #include <net/route.h> 67 #include <net/if.h> 68 #include <net/if_var.h> 69 #include <net/vnet.h> 70 71 #include <netinet/in.h> 72 #include <netinet/in_fib.h> 73 #include <netinet/in_kdtrace.h> 74 #include <netinet/in_pcb.h> 75 #include <netinet/in_systm.h> 76 #include <netinet/in_var.h> 77 #include <netinet/ip.h> 78 #include <netinet/ip_icmp.h> 79 #include <netinet/ip_var.h> 80 #ifdef INET6 81 #include <netinet/ip6.h> 82 #include <netinet6/in6_fib.h> 83 #include <netinet6/in6_pcb.h> 84 #include <netinet6/ip6_var.h> 85 #include <netinet6/scope6_var.h> 86 #include <netinet6/nd6.h> 87 #endif 88 89 #ifdef TCP_RFC7413 90 #include <netinet/tcp_fastopen.h> 91 #endif 92 #include <netinet/tcp.h> 93 #include <netinet/tcp_fsm.h> 94 #include <netinet/tcp_seq.h> 95 #include <netinet/tcp_timer.h> 96 #include <netinet/tcp_var.h> 97 #include <netinet/tcp_syncache.h> 98 #include <netinet/cc/cc.h> 99 #ifdef INET6 100 #include <netinet6/tcp6_var.h> 101 #endif 102 #include <netinet/tcpip.h> 103 #ifdef TCPPCAP 104 #include <netinet/tcp_pcap.h> 105 #endif 106 #ifdef TCPDEBUG 107 #include <netinet/tcp_debug.h> 108 #endif 109 #ifdef INET6 110 #include 
<netinet6/ip6protosw.h> 111 #endif 112 #ifdef TCP_OFFLOAD 113 #include <netinet/tcp_offload.h> 114 #endif 115 116 #ifdef IPSEC 117 #include <netipsec/ipsec.h> 118 #include <netipsec/xform.h> 119 #ifdef INET6 120 #include <netipsec/ipsec6.h> 121 #endif 122 #include <netipsec/key.h> 123 #include <sys/syslog.h> 124 #endif /*IPSEC*/ 125 126 #include <machine/in_cksum.h> 127 #include <sys/md5.h> 128 129 #include <security/mac/mac_framework.h> 130 131 VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS; 132 #ifdef INET6 133 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; 134 #endif 135 136 struct rwlock tcp_function_lock; 137 138 static int 139 sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) 140 { 141 int error, new; 142 143 new = V_tcp_mssdflt; 144 error = sysctl_handle_int(oidp, &new, 0, req); 145 if (error == 0 && req->newptr) { 146 if (new < TCP_MINMSS) 147 error = EINVAL; 148 else 149 V_tcp_mssdflt = new; 150 } 151 return (error); 152 } 153 154 SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, 155 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0, 156 &sysctl_net_inet_tcp_mss_check, "I", 157 "Default TCP Maximum Segment Size"); 158 159 #ifdef INET6 160 static int 161 sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) 162 { 163 int error, new; 164 165 new = V_tcp_v6mssdflt; 166 error = sysctl_handle_int(oidp, &new, 0, req); 167 if (error == 0 && req->newptr) { 168 if (new < TCP_MINMSS) 169 error = EINVAL; 170 else 171 V_tcp_v6mssdflt = new; 172 } 173 return (error); 174 } 175 176 SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, 177 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0, 178 &sysctl_net_inet_tcp_mss_v6_check, "I", 179 "Default TCP Maximum Segment Size for IPv6"); 180 #endif /* INET6 */ 181 182 /* 183 * Minimum MSS we accept and use. This prevents DoS attacks where 184 * we are forced to a ridiculous low MSS like 20 and send hundreds 185 * of packets instead of one. 
The effect scales with the available 186 * bandwidth and quickly saturates the CPU and network interface 187 * with packet generation and sending. Set to zero to disable MINMSS 188 * checking. This setting prevents us from sending too small packets. 189 */ 190 VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; 191 SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW, 192 &VNET_NAME(tcp_minmss), 0, 193 "Minimum TCP Maximum Segment Size"); 194 195 VNET_DEFINE(int, tcp_do_rfc1323) = 1; 196 SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW, 197 &VNET_NAME(tcp_do_rfc1323), 0, 198 "Enable rfc1323 (high performance TCP) extensions"); 199 200 static int tcp_log_debug = 0; 201 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, 202 &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); 203 204 static int tcp_tcbhashsize; 205 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 206 &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); 207 208 static int do_tcpdrain = 1; 209 SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, 210 "Enable tcp_drain routine for extra help when low on mbufs"); 211 212 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD, 213 &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs"); 214 215 static VNET_DEFINE(int, icmp_may_rst) = 1; 216 #define V_icmp_may_rst VNET(icmp_may_rst) 217 SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW, 218 &VNET_NAME(icmp_may_rst), 0, 219 "Certain ICMP unreachable messages may abort connections in SYN_SENT"); 220 221 static VNET_DEFINE(int, tcp_isn_reseed_interval) = 0; 222 #define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval) 223 SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW, 224 &VNET_NAME(tcp_isn_reseed_interval), 0, 225 "Seconds between reseeding of ISN secret"); 226 227 static int tcp_soreceive_stream; 
228 SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, 229 &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); 230 231 #ifdef TCP_SIGNATURE 232 static int tcp_sig_checksigs = 1; 233 SYSCTL_INT(_net_inet_tcp, OID_AUTO, signature_verify_input, CTLFLAG_RW, 234 &tcp_sig_checksigs, 0, "Verify RFC2385 digests on inbound traffic"); 235 #endif 236 237 VNET_DEFINE(uma_zone_t, sack_hole_zone); 238 #define V_sack_hole_zone VNET(sack_hole_zone) 239 240 VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); 241 242 static struct inpcb *tcp_notify(struct inpcb *, int); 243 static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); 244 static void tcp_mtudisc(struct inpcb *, int); 245 static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, 246 void *ip4hdr, const void *ip6hdr); 247 248 249 static struct tcp_function_block tcp_def_funcblk = { 250 "default", 251 tcp_output, 252 tcp_do_segment, 253 tcp_default_ctloutput, 254 NULL, 255 NULL, 256 NULL, 257 NULL, 258 NULL, 259 NULL, 260 0, 261 0 262 }; 263 264 int t_functions_inited = 0; 265 struct tcp_funchead t_functions; 266 static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk; 267 268 static void 269 init_tcp_functions(void) 270 { 271 if (t_functions_inited == 0) { 272 TAILQ_INIT(&t_functions); 273 rw_init_flags(&tcp_function_lock, "tcp_func_lock" , 0); 274 t_functions_inited = 1; 275 } 276 } 277 278 static struct tcp_function_block * 279 find_tcp_functions_locked(struct tcp_function_set *fs) 280 { 281 struct tcp_function *f; 282 struct tcp_function_block *blk=NULL; 283 284 TAILQ_FOREACH(f, &t_functions, tf_next) { 285 if (strcmp(f->tf_fb->tfb_tcp_block_name, fs->function_set_name) == 0) { 286 blk = f->tf_fb; 287 break; 288 } 289 } 290 return(blk); 291 } 292 293 static struct tcp_function_block * 294 find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s) 295 { 296 struct tcp_function_block *rblk=NULL; 297 struct tcp_function *f; 298 299 
TAILQ_FOREACH(f, &t_functions, tf_next) { 300 if (f->tf_fb == blk) { 301 rblk = blk; 302 if (s) { 303 *s = f; 304 } 305 break; 306 } 307 } 308 return (rblk); 309 } 310 311 struct tcp_function_block * 312 find_and_ref_tcp_functions(struct tcp_function_set *fs) 313 { 314 struct tcp_function_block *blk; 315 316 rw_rlock(&tcp_function_lock); 317 blk = find_tcp_functions_locked(fs); 318 if (blk) 319 refcount_acquire(&blk->tfb_refcnt); 320 rw_runlock(&tcp_function_lock); 321 return(blk); 322 } 323 324 struct tcp_function_block * 325 find_and_ref_tcp_fb(struct tcp_function_block *blk) 326 { 327 struct tcp_function_block *rblk; 328 329 rw_rlock(&tcp_function_lock); 330 rblk = find_tcp_fb_locked(blk, NULL); 331 if (rblk) 332 refcount_acquire(&rblk->tfb_refcnt); 333 rw_runlock(&tcp_function_lock); 334 return(rblk); 335 } 336 337 338 static int 339 sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) 340 { 341 int error=ENOENT; 342 struct tcp_function_set fs; 343 struct tcp_function_block *blk; 344 345 memset(&fs, 0, sizeof(fs)); 346 rw_rlock(&tcp_function_lock); 347 blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL); 348 if (blk) { 349 /* Found him */ 350 strcpy(fs.function_set_name, blk->tfb_tcp_block_name); 351 fs.pcbcnt = blk->tfb_refcnt; 352 } 353 rw_runlock(&tcp_function_lock); 354 error = sysctl_handle_string(oidp, fs.function_set_name, 355 sizeof(fs.function_set_name), req); 356 357 /* Check for error or no change */ 358 if (error != 0 || req->newptr == NULL) 359 return(error); 360 361 rw_wlock(&tcp_function_lock); 362 blk = find_tcp_functions_locked(&fs); 363 if ((blk == NULL) || 364 (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) { 365 error = ENOENT; 366 goto done; 367 } 368 tcp_func_set_ptr = blk; 369 done: 370 rw_wunlock(&tcp_function_lock); 371 return (error); 372 } 373 374 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default, 375 CTLTYPE_STRING | CTLFLAG_RW, 376 NULL, 0, sysctl_net_inet_default_tcp_functions, "A", 377 "Set/get the default TCP functions"); 378 
379 static int 380 sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS) 381 { 382 int error, cnt, linesz; 383 struct tcp_function *f; 384 char *buffer, *cp; 385 size_t bufsz, outsz; 386 387 cnt = 0; 388 rw_rlock(&tcp_function_lock); 389 TAILQ_FOREACH(f, &t_functions, tf_next) { 390 cnt++; 391 } 392 rw_runlock(&tcp_function_lock); 393 394 bufsz = (cnt+2) * (TCP_FUNCTION_NAME_LEN_MAX + 12) + 1; 395 buffer = malloc(bufsz, M_TEMP, M_WAITOK); 396 397 error = 0; 398 cp = buffer; 399 400 linesz = snprintf(cp, bufsz, "\n%-32s%c %s\n", "Stack", 'D', "PCB count"); 401 cp += linesz; 402 bufsz -= linesz; 403 outsz = linesz; 404 405 rw_rlock(&tcp_function_lock); 406 TAILQ_FOREACH(f, &t_functions, tf_next) { 407 linesz = snprintf(cp, bufsz, "%-32s%c %u\n", 408 f->tf_fb->tfb_tcp_block_name, 409 (f->tf_fb == tcp_func_set_ptr) ? '*' : ' ', 410 f->tf_fb->tfb_refcnt); 411 if (linesz >= bufsz) { 412 error = EOVERFLOW; 413 break; 414 } 415 cp += linesz; 416 bufsz -= linesz; 417 outsz += linesz; 418 } 419 rw_runlock(&tcp_function_lock); 420 if (error == 0) 421 error = sysctl_handle_string(oidp, buffer, outsz + 1, req); 422 free(buffer, M_TEMP); 423 return (error); 424 } 425 426 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available, 427 CTLTYPE_STRING|CTLFLAG_RD, 428 NULL, 0, sysctl_net_inet_list_available, "A", 429 "list available TCP Function sets"); 430 431 /* 432 * Target size of TCP PCB hash tables. Must be a power of two. 433 * 434 * Note that this can be overridden by the kernel environment 435 * variable net.inet.tcp.tcbhashsize 436 */ 437 #ifndef TCBHASHSIZE 438 #define TCBHASHSIZE 0 439 #endif 440 441 /* 442 * XXX 443 * Callouts should be moved into struct tcp directly. They are currently 444 * separate because the tcpcb structure is exported to userland for sysctl 445 * parsing purposes, which do not know about callouts. 
446 */ 447 struct tcpcb_mem { 448 struct tcpcb tcb; 449 struct tcp_timer tt; 450 struct cc_var ccv; 451 struct osd osd; 452 }; 453 454 static VNET_DEFINE(uma_zone_t, tcpcb_zone); 455 #define V_tcpcb_zone VNET(tcpcb_zone) 456 457 MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); 458 MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory"); 459 460 static struct mtx isn_mtx; 461 462 #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) 463 #define ISN_LOCK() mtx_lock(&isn_mtx) 464 #define ISN_UNLOCK() mtx_unlock(&isn_mtx) 465 466 /* 467 * TCP initialization. 468 */ 469 static void 470 tcp_zone_change(void *tag) 471 { 472 473 uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); 474 uma_zone_set_max(V_tcpcb_zone, maxsockets); 475 tcp_tw_zone_change(); 476 } 477 478 static int 479 tcp_inpcb_init(void *mem, int size, int flags) 480 { 481 struct inpcb *inp = mem; 482 483 INP_LOCK_INIT(inp, "inp", "tcpinp"); 484 return (0); 485 } 486 487 /* 488 * Take a value and get the next power of 2 that doesn't overflow. 489 * Used to size the tcp_inpcb hash buckets. 490 */ 491 static int 492 maketcp_hashsize(int size) 493 { 494 int hashsize; 495 496 /* 497 * auto tune. 498 * get the next power of 2 higher than maxsockets. 499 */ 500 hashsize = 1 << fls(size); 501 /* catch overflow, and just go one power of 2 smaller */ 502 if (hashsize < size) { 503 hashsize = 1 << (fls(size) - 1); 504 } 505 return (hashsize); 506 } 507 508 int 509 register_tcp_functions(struct tcp_function_block *blk, int wait) 510 { 511 struct tcp_function_block *lblk; 512 struct tcp_function *n; 513 struct tcp_function_set fs; 514 515 if (t_functions_inited == 0) { 516 init_tcp_functions(); 517 } 518 if ((blk->tfb_tcp_output == NULL) || 519 (blk->tfb_tcp_do_segment == NULL) || 520 (blk->tfb_tcp_ctloutput == NULL) || 521 (strlen(blk->tfb_tcp_block_name) == 0)) { 522 /* 523 * These functions are required and you 524 * need a name. 
525 */ 526 return (EINVAL); 527 } 528 if (blk->tfb_tcp_timer_stop_all || 529 blk->tfb_tcp_timer_activate || 530 blk->tfb_tcp_timer_active || 531 blk->tfb_tcp_timer_stop) { 532 /* 533 * If you define one timer function you 534 * must have them all. 535 */ 536 if ((blk->tfb_tcp_timer_stop_all == NULL) || 537 (blk->tfb_tcp_timer_activate == NULL) || 538 (blk->tfb_tcp_timer_active == NULL) || 539 (blk->tfb_tcp_timer_stop == NULL)) { 540 return (EINVAL); 541 } 542 } 543 n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); 544 if (n == NULL) { 545 return (ENOMEM); 546 } 547 n->tf_fb = blk; 548 strcpy(fs.function_set_name, blk->tfb_tcp_block_name); 549 rw_wlock(&tcp_function_lock); 550 lblk = find_tcp_functions_locked(&fs); 551 if (lblk) { 552 /* Duplicate name space not allowed */ 553 rw_wunlock(&tcp_function_lock); 554 free(n, M_TCPFUNCTIONS); 555 return (EALREADY); 556 } 557 refcount_init(&blk->tfb_refcnt, 0); 558 blk->tfb_flags = 0; 559 TAILQ_INSERT_TAIL(&t_functions, n, tf_next); 560 rw_wunlock(&tcp_function_lock); 561 return(0); 562 } 563 564 int 565 deregister_tcp_functions(struct tcp_function_block *blk) 566 { 567 struct tcp_function_block *lblk; 568 struct tcp_function *f; 569 int error=ENOENT; 570 571 if (strcmp(blk->tfb_tcp_block_name, "default") == 0) { 572 /* You can't un-register the default */ 573 return (EPERM); 574 } 575 rw_wlock(&tcp_function_lock); 576 if (blk == tcp_func_set_ptr) { 577 /* You can't free the current default */ 578 rw_wunlock(&tcp_function_lock); 579 return (EBUSY); 580 } 581 if (blk->tfb_refcnt) { 582 /* Still tcb attached, mark it. 
*/ 583 blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; 584 rw_wunlock(&tcp_function_lock); 585 return (EBUSY); 586 } 587 lblk = find_tcp_fb_locked(blk, &f); 588 if (lblk) { 589 /* Found */ 590 TAILQ_REMOVE(&t_functions, f, tf_next); 591 f->tf_fb = NULL; 592 free(f, M_TCPFUNCTIONS); 593 error = 0; 594 } 595 rw_wunlock(&tcp_function_lock); 596 return (error); 597 } 598 599 void 600 tcp_init(void) 601 { 602 const char *tcbhash_tuneable; 603 int hashsize; 604 605 tcbhash_tuneable = "net.inet.tcp.tcbhashsize"; 606 607 if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, 608 &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) 609 printf("%s: WARNING: unable to register helper hook\n", __func__); 610 if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, 611 &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) 612 printf("%s: WARNING: unable to register helper hook\n", __func__); 613 hashsize = TCBHASHSIZE; 614 TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize); 615 if (hashsize == 0) { 616 /* 617 * Auto tune the hash size based on maxsockets. 618 * A perfect hash would have a 1:1 mapping 619 * (hashsize = maxsockets) however it's been 620 * suggested that O(2) average is better. 621 */ 622 hashsize = maketcp_hashsize(maxsockets / 4); 623 /* 624 * Our historical default is 512, 625 * do not autotune lower than this. 626 */ 627 if (hashsize < 512) 628 hashsize = 512; 629 if (bootverbose && IS_DEFAULT_VNET(curvnet)) 630 printf("%s: %s auto tuned to %d\n", __func__, 631 tcbhash_tuneable, hashsize); 632 } 633 /* 634 * We require a hashsize to be a power of two. 635 * Previously if it was not a power of two we would just reset it 636 * back to 512, which could be a nasty surprise if you did not notice 637 * the error message. 638 * Instead what we do is clip it to the closest power of two lower 639 * than the specified hash value. 
640 */ 641 if (!powerof2(hashsize)) { 642 int oldhashsize = hashsize; 643 644 hashsize = maketcp_hashsize(hashsize); 645 /* prevent absurdly low value */ 646 if (hashsize < 16) 647 hashsize = 16; 648 printf("%s: WARNING: TCB hash size not a power of 2, " 649 "clipped from %d to %d.\n", __func__, oldhashsize, 650 hashsize); 651 } 652 in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, 653 "tcp_inpcb", tcp_inpcb_init, NULL, 0, IPI_HASHFIELDS_4TUPLE); 654 655 /* 656 * These have to be type stable for the benefit of the timers. 657 */ 658 V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), 659 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 660 uma_zone_set_max(V_tcpcb_zone, maxsockets); 661 uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached"); 662 663 tcp_tw_init(); 664 syncache_init(); 665 tcp_hc_init(); 666 667 TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); 668 V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), 669 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 670 671 /* Skip initialization of globals for non-default instances. */ 672 if (!IS_DEFAULT_VNET(curvnet)) 673 return; 674 675 tcp_reass_global_init(); 676 677 /* XXX virtualize those bellow? 
*/ 678 tcp_delacktime = TCPTV_DELACK; 679 tcp_keepinit = TCPTV_KEEP_INIT; 680 tcp_keepidle = TCPTV_KEEP_IDLE; 681 tcp_keepintvl = TCPTV_KEEPINTVL; 682 tcp_maxpersistidle = TCPTV_KEEP_IDLE; 683 tcp_msl = TCPTV_MSL; 684 tcp_rexmit_min = TCPTV_MIN; 685 if (tcp_rexmit_min < 1) 686 tcp_rexmit_min = 1; 687 tcp_persmin = TCPTV_PERSMIN; 688 tcp_persmax = TCPTV_PERSMAX; 689 tcp_rexmit_slop = TCPTV_CPU_VAR; 690 tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; 691 tcp_tcbhashsize = hashsize; 692 /* Setup the tcp function block list */ 693 init_tcp_functions(); 694 register_tcp_functions(&tcp_def_funcblk, M_WAITOK); 695 696 if (tcp_soreceive_stream) { 697 #ifdef INET 698 tcp_usrreqs.pru_soreceive = soreceive_stream; 699 #endif 700 #ifdef INET6 701 tcp6_usrreqs.pru_soreceive = soreceive_stream; 702 #endif /* INET6 */ 703 } 704 705 #ifdef INET6 706 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) 707 #else /* INET6 */ 708 #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) 709 #endif /* INET6 */ 710 if (max_protohdr < TCP_MINPROTOHDR) 711 max_protohdr = TCP_MINPROTOHDR; 712 if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) 713 panic("tcp_init"); 714 #undef TCP_MINPROTOHDR 715 716 ISN_LOCK_INIT(); 717 EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, 718 SHUTDOWN_PRI_DEFAULT); 719 EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, 720 EVENTHANDLER_PRI_ANY); 721 #ifdef TCPPCAP 722 tcp_pcap_init(); 723 #endif 724 725 #ifdef TCP_RFC7413 726 tcp_fastopen_init(); 727 #endif 728 } 729 730 #ifdef VIMAGE 731 void 732 tcp_destroy(void) 733 { 734 int error; 735 736 /* 737 * All our processes are gone, all our sockets should be cleaned 738 * up, which means, we should be past the tcp_discardcb() calls. 739 * Sleep to let all tcpcb timers really disappear and then cleanup. 740 * Timewait will cleanup its queue and will be ready to go. 741 * XXX-BZ In theory a few ticks should be good enough to make sure 742 * the timers are all really gone. 
We should see if we could use a 743 * better metric here and, e.g., check a tcbcb count as an optimization? 744 */ 745 DELAY(1000000 / hz); 746 tcp_hc_destroy(); 747 syncache_destroy(); 748 tcp_tw_destroy(); 749 in_pcbinfo_destroy(&V_tcbinfo); 750 /* tcp_discardcb() clears the sack_holes up. */ 751 uma_zdestroy(V_sack_hole_zone); 752 uma_zdestroy(V_tcpcb_zone); 753 754 #ifdef TCP_RFC7413 755 /* 756 * Cannot free the zone until all tcpcbs are released as we attach 757 * the allocations to them. 758 */ 759 tcp_fastopen_destroy(); 760 #endif 761 762 error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]); 763 if (error != 0) { 764 printf("%s: WARNING: unable to deregister helper hook " 765 "type=%d, id=%d: error %d returned\n", __func__, 766 HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error); 767 } 768 error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]); 769 if (error != 0) { 770 printf("%s: WARNING: unable to deregister helper hook " 771 "type=%d, id=%d: error %d returned\n", __func__, 772 HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error); 773 } 774 } 775 #endif 776 777 void 778 tcp_fini(void *xtp) 779 { 780 781 } 782 783 /* 784 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. 785 * tcp_template used to store this data in mbufs, but we now recopy it out 786 * of the tcpcb each time to conserve mbufs. 
787 */ 788 void 789 tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) 790 { 791 struct tcphdr *th = (struct tcphdr *)tcp_ptr; 792 793 INP_WLOCK_ASSERT(inp); 794 795 #ifdef INET6 796 if ((inp->inp_vflag & INP_IPV6) != 0) { 797 struct ip6_hdr *ip6; 798 799 ip6 = (struct ip6_hdr *)ip_ptr; 800 ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | 801 (inp->inp_flow & IPV6_FLOWINFO_MASK); 802 ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | 803 (IPV6_VERSION & IPV6_VERSION_MASK); 804 ip6->ip6_nxt = IPPROTO_TCP; 805 ip6->ip6_plen = htons(sizeof(struct tcphdr)); 806 ip6->ip6_src = inp->in6p_laddr; 807 ip6->ip6_dst = inp->in6p_faddr; 808 } 809 #endif /* INET6 */ 810 #if defined(INET6) && defined(INET) 811 else 812 #endif 813 #ifdef INET 814 { 815 struct ip *ip; 816 817 ip = (struct ip *)ip_ptr; 818 ip->ip_v = IPVERSION; 819 ip->ip_hl = 5; 820 ip->ip_tos = inp->inp_ip_tos; 821 ip->ip_len = 0; 822 ip->ip_id = 0; 823 ip->ip_off = 0; 824 ip->ip_ttl = inp->inp_ip_ttl; 825 ip->ip_sum = 0; 826 ip->ip_p = IPPROTO_TCP; 827 ip->ip_src = inp->inp_laddr; 828 ip->ip_dst = inp->inp_faddr; 829 } 830 #endif /* INET */ 831 th->th_sport = inp->inp_lport; 832 th->th_dport = inp->inp_fport; 833 th->th_seq = 0; 834 th->th_ack = 0; 835 th->th_x2 = 0; 836 th->th_off = 5; 837 th->th_flags = 0; 838 th->th_win = 0; 839 th->th_urp = 0; 840 th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ 841 } 842 843 /* 844 * Create template to be used to send tcp packets on a connection. 845 * Allocates an mbuf and fills in a skeletal tcp/ip header. The only 846 * use for this function is in keepalives, which use tcp_respond. 
847 */ 848 struct tcptemp * 849 tcpip_maketemplate(struct inpcb *inp) 850 { 851 struct tcptemp *t; 852 853 t = malloc(sizeof(*t), M_TEMP, M_NOWAIT); 854 if (t == NULL) 855 return (NULL); 856 tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t); 857 return (t); 858 } 859 860 /* 861 * Send a single message to the TCP at address specified by 862 * the given TCP/IP header. If m == NULL, then we make a copy 863 * of the tcpiphdr at th and send directly to the addressed host. 864 * This is used to force keep alive messages out using the TCP 865 * template for a connection. If flags are given then we send 866 * a message back to the TCP which originated the segment th, 867 * and discard the mbuf containing it and any other attached mbufs. 868 * 869 * In any case the ack and sequence number of the transmitted 870 * segment are as specified by the parameters. 871 * 872 * NOTE: If m != NULL, then th must point to *inside* the mbuf. 873 */ 874 void 875 tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, 876 tcp_seq ack, tcp_seq seq, int flags) 877 { 878 struct tcpopt to; 879 struct inpcb *inp; 880 struct ip *ip; 881 struct mbuf *optm; 882 struct tcphdr *nth; 883 u_char *optp; 884 #ifdef INET6 885 struct ip6_hdr *ip6; 886 int isipv6; 887 #endif /* INET6 */ 888 int optlen, tlen, win; 889 bool incl_opts; 890 891 KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); 892 893 #ifdef INET6 894 isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4); 895 ip6 = ipgen; 896 #endif /* INET6 */ 897 ip = ipgen; 898 899 if (tp != NULL) { 900 inp = tp->t_inpcb; 901 KASSERT(inp != NULL, ("tcp control block w/o inpcb")); 902 INP_WLOCK_ASSERT(inp); 903 } else 904 inp = NULL; 905 906 incl_opts = false; 907 win = 0; 908 if (tp != NULL) { 909 if (!(flags & TH_RST)) { 910 win = sbspace(&inp->inp_socket->so_rcv); 911 if (win > (long)TCP_MAXWIN << tp->rcv_scale) 912 win = (long)TCP_MAXWIN << tp->rcv_scale; 913 } 914 if ((tp->t_flags & TF_NOOPT) 
== 0) 915 incl_opts = true; 916 } 917 if (m == NULL) { 918 m = m_gethdr(M_NOWAIT, MT_DATA); 919 if (m == NULL) 920 return; 921 m->m_data += max_linkhdr; 922 #ifdef INET6 923 if (isipv6) { 924 bcopy((caddr_t)ip6, mtod(m, caddr_t), 925 sizeof(struct ip6_hdr)); 926 ip6 = mtod(m, struct ip6_hdr *); 927 nth = (struct tcphdr *)(ip6 + 1); 928 } else 929 #endif /* INET6 */ 930 { 931 bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); 932 ip = mtod(m, struct ip *); 933 nth = (struct tcphdr *)(ip + 1); 934 } 935 bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); 936 flags = TH_ACK; 937 } else if (!M_WRITABLE(m)) { 938 struct mbuf *n; 939 940 /* Can't reuse 'm', allocate a new mbuf. */ 941 n = m_gethdr(M_NOWAIT, MT_DATA); 942 if (n == NULL) { 943 m_freem(m); 944 return; 945 } 946 947 if (!m_dup_pkthdr(n, m, M_NOWAIT)) { 948 m_freem(m); 949 m_freem(n); 950 return; 951 } 952 953 n->m_data += max_linkhdr; 954 /* m_len is set later */ 955 #define xchg(a,b,type) { type t; t=a; a=b; b=t; } 956 #ifdef INET6 957 if (isipv6) { 958 bcopy((caddr_t)ip6, mtod(n, caddr_t), 959 sizeof(struct ip6_hdr)); 960 ip6 = mtod(n, struct ip6_hdr *); 961 xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); 962 nth = (struct tcphdr *)(ip6 + 1); 963 } else 964 #endif /* INET6 */ 965 { 966 bcopy((caddr_t)ip, mtod(n, caddr_t), sizeof(struct ip)); 967 ip = mtod(n, struct ip *); 968 xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); 969 nth = (struct tcphdr *)(ip + 1); 970 } 971 bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); 972 xchg(nth->th_dport, nth->th_sport, uint16_t); 973 th = nth; 974 m_freem(m); 975 m = n; 976 } else { 977 /* 978 * reuse the mbuf. 979 * XXX MRT We inherit the FIB, which is lucky. 
980 */ 981 m_freem(m->m_next); 982 m->m_next = NULL; 983 m->m_data = (caddr_t)ipgen; 984 /* m_len is set later */ 985 #ifdef INET6 986 if (isipv6) { 987 xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); 988 nth = (struct tcphdr *)(ip6 + 1); 989 } else 990 #endif /* INET6 */ 991 { 992 xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); 993 nth = (struct tcphdr *)(ip + 1); 994 } 995 if (th != nth) { 996 /* 997 * this is usually a case when an extension header 998 * exists between the IPv6 header and the 999 * TCP header. 1000 */ 1001 nth->th_sport = th->th_sport; 1002 nth->th_dport = th->th_dport; 1003 } 1004 xchg(nth->th_dport, nth->th_sport, uint16_t); 1005 #undef xchg 1006 } 1007 tlen = 0; 1008 #ifdef INET6 1009 if (isipv6) 1010 tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); 1011 #endif 1012 #if defined(INET) && defined(INET6) 1013 else 1014 #endif 1015 #ifdef INET 1016 tlen = sizeof (struct tcpiphdr); 1017 #endif 1018 #ifdef INVARIANTS 1019 m->m_len = 0; 1020 KASSERT(M_TRAILINGSPACE(m) >= tlen, 1021 ("Not enough trailing space for message (m=%p, need=%d, have=%ld)", 1022 m, tlen, (long)M_TRAILINGSPACE(m))); 1023 #endif 1024 m->m_len = tlen; 1025 to.to_flags = 0; 1026 if (incl_opts) { 1027 /* Make sure we have room. */ 1028 if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) { 1029 m->m_next = m_get(M_NOWAIT, MT_DATA); 1030 if (m->m_next) { 1031 optp = mtod(m->m_next, u_char *); 1032 optm = m->m_next; 1033 } else 1034 incl_opts = false; 1035 } else { 1036 optp = (u_char *) (nth + 1); 1037 optm = m; 1038 } 1039 } 1040 if (incl_opts) { 1041 /* Timestamps. */ 1042 if (tp->t_flags & TF_RCVD_TSTMP) { 1043 to.to_tsval = tcp_ts_getticks() + tp->ts_offset; 1044 to.to_tsecr = tp->ts_recent; 1045 to.to_flags |= TOF_TS; 1046 } 1047 #ifdef TCP_SIGNATURE 1048 /* TCP-MD5 (RFC2385). */ 1049 if (tp->t_flags & TF_SIGNATURE) 1050 to.to_flags |= TOF_SIGNATURE; 1051 #endif 1052 1053 /* Add the options. 
*/ 1054 tlen += optlen = tcp_addoptions(&to, optp); 1055 1056 /* Update m_len in the correct mbuf. */ 1057 optm->m_len += optlen; 1058 } else 1059 optlen = 0; 1060 #ifdef INET6 1061 if (isipv6) { 1062 ip6->ip6_flow = 0; 1063 ip6->ip6_vfc = IPV6_VERSION; 1064 ip6->ip6_nxt = IPPROTO_TCP; 1065 ip6->ip6_plen = htons(tlen - sizeof(*ip6)); 1066 } 1067 #endif 1068 #if defined(INET) && defined(INET6) 1069 else 1070 #endif 1071 #ifdef INET 1072 { 1073 ip->ip_len = htons(tlen); 1074 ip->ip_ttl = V_ip_defttl; 1075 if (V_path_mtu_discovery) 1076 ip->ip_off |= htons(IP_DF); 1077 } 1078 #endif 1079 m->m_pkthdr.len = tlen; 1080 m->m_pkthdr.rcvif = NULL; 1081 #ifdef MAC 1082 if (inp != NULL) { 1083 /* 1084 * Packet is associated with a socket, so allow the 1085 * label of the response to reflect the socket label. 1086 */ 1087 INP_WLOCK_ASSERT(inp); 1088 mac_inpcb_create_mbuf(inp, m); 1089 } else { 1090 /* 1091 * Packet is not associated with a socket, so possibly 1092 * update the label in place. 1093 */ 1094 mac_netinet_tcp_reply(m); 1095 } 1096 #endif 1097 nth->th_seq = htonl(seq); 1098 nth->th_ack = htonl(ack); 1099 nth->th_x2 = 0; 1100 nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2; 1101 nth->th_flags = flags; 1102 if (tp != NULL) 1103 nth->th_win = htons((u_short) (win >> tp->rcv_scale)); 1104 else 1105 nth->th_win = htons((u_short)win); 1106 nth->th_urp = 0; 1107 1108 #ifdef TCP_SIGNATURE 1109 if (to.to_flags & TOF_SIGNATURE) { 1110 tcp_signature_compute(m, 0, 0, optlen, to.to_signature, 1111 IPSEC_DIR_OUTBOUND); 1112 } 1113 #endif 1114 1115 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 1116 #ifdef INET6 1117 if (isipv6) { 1118 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 1119 nth->th_sum = in6_cksum_pseudo(ip6, 1120 tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0); 1121 ip6->ip6_hlim = in6_selecthlim(tp != NULL ? 
tp->t_inpcb : 1122 NULL, NULL); 1123 } 1124 #endif /* INET6 */ 1125 #if defined(INET6) && defined(INET) 1126 else 1127 #endif 1128 #ifdef INET 1129 { 1130 m->m_pkthdr.csum_flags = CSUM_TCP; 1131 nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 1132 htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); 1133 } 1134 #endif /* INET */ 1135 #ifdef TCPDEBUG 1136 if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) 1137 tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); 1138 #endif 1139 TCP_PROBE3(debug__output, tp, th, mtod(m, const char *)); 1140 if (flags & TH_RST) 1141 TCP_PROBE5(accept__refused, NULL, NULL, mtod(m, const char *), 1142 tp, nth); 1143 1144 TCP_PROBE5(send, NULL, tp, mtod(m, const char *), tp, nth); 1145 #ifdef INET6 1146 if (isipv6) 1147 (void) ip6_output(m, NULL, NULL, 0, NULL, NULL, inp); 1148 #endif /* INET6 */ 1149 #if defined(INET) && defined(INET6) 1150 else 1151 #endif 1152 #ifdef INET 1153 (void) ip_output(m, NULL, NULL, 0, NULL, inp); 1154 #endif 1155 } 1156 1157 /* 1158 * Create a new TCP control block, making an 1159 * empty reassembly queue and hooking it to the argument 1160 * protocol control block. The `inp' parameter must have 1161 * come from the zone allocator set up in tcp_init(). 1162 */ 1163 struct tcpcb * 1164 tcp_newtcpcb(struct inpcb *inp) 1165 { 1166 struct tcpcb_mem *tm; 1167 struct tcpcb *tp; 1168 #ifdef INET6 1169 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 1170 #endif /* INET6 */ 1171 1172 tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO); 1173 if (tm == NULL) 1174 return (NULL); 1175 tp = &tm->tcb; 1176 1177 /* Initialise cc_var struct for this tcpcb. 
*/ 1178 tp->ccv = &tm->ccv; 1179 tp->ccv->type = IPPROTO_TCP; 1180 tp->ccv->ccvc.tcp = tp; 1181 rw_rlock(&tcp_function_lock); 1182 tp->t_fb = tcp_func_set_ptr; 1183 refcount_acquire(&tp->t_fb->tfb_refcnt); 1184 rw_runlock(&tcp_function_lock); 1185 if (tp->t_fb->tfb_tcp_fb_init) { 1186 (*tp->t_fb->tfb_tcp_fb_init)(tp); 1187 } 1188 /* 1189 * Use the current system default CC algorithm. 1190 */ 1191 CC_LIST_RLOCK(); 1192 KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!")); 1193 CC_ALGO(tp) = CC_DEFAULT(); 1194 CC_LIST_RUNLOCK(); 1195 1196 if (CC_ALGO(tp)->cb_init != NULL) 1197 if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) { 1198 if (tp->t_fb->tfb_tcp_fb_fini) 1199 (*tp->t_fb->tfb_tcp_fb_fini)(tp); 1200 refcount_release(&tp->t_fb->tfb_refcnt); 1201 uma_zfree(V_tcpcb_zone, tm); 1202 return (NULL); 1203 } 1204 1205 tp->osd = &tm->osd; 1206 if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) { 1207 if (tp->t_fb->tfb_tcp_fb_fini) 1208 (*tp->t_fb->tfb_tcp_fb_fini)(tp); 1209 refcount_release(&tp->t_fb->tfb_refcnt); 1210 uma_zfree(V_tcpcb_zone, tm); 1211 return (NULL); 1212 } 1213 1214 #ifdef VIMAGE 1215 tp->t_vnet = inp->inp_vnet; 1216 #endif 1217 tp->t_timers = &tm->tt; 1218 /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ 1219 tp->t_maxseg = 1220 #ifdef INET6 1221 isipv6 ? V_tcp_v6mssdflt : 1222 #endif /* INET6 */ 1223 V_tcp_mssdflt; 1224 1225 /* Set up our timeouts. */ 1226 callout_init(&tp->t_timers->tt_rexmt, 1); 1227 callout_init(&tp->t_timers->tt_persist, 1); 1228 callout_init(&tp->t_timers->tt_keep, 1); 1229 callout_init(&tp->t_timers->tt_2msl, 1); 1230 callout_init(&tp->t_timers->tt_delack, 1); 1231 1232 if (V_tcp_do_rfc1323) 1233 tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); 1234 if (V_tcp_do_sack) 1235 tp->t_flags |= TF_SACK_PERMIT; 1236 TAILQ_INIT(&tp->snd_holes); 1237 /* 1238 * The tcpcb will hold a reference on its inpcb until tcp_discardcb() 1239 * is called. 
1240 */ 1241 in_pcbref(inp); /* Reference for tcpcb */ 1242 tp->t_inpcb = inp; 1243 1244 /* 1245 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no 1246 * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives 1247 * reasonable initial retransmit time. 1248 */ 1249 tp->t_srtt = TCPTV_SRTTBASE; 1250 tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; 1251 tp->t_rttmin = tcp_rexmit_min; 1252 tp->t_rxtcur = TCPTV_RTOBASE; 1253 tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; 1254 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; 1255 tp->t_rcvtime = ticks; 1256 /* 1257 * IPv4 TTL initialization is necessary for an IPv6 socket as well, 1258 * because the socket may be bound to an IPv6 wildcard address, 1259 * which may match an IPv4-mapped IPv6 address. 1260 */ 1261 inp->inp_ip_ttl = V_ip_defttl; 1262 inp->inp_ppcb = tp; 1263 #ifdef TCPPCAP 1264 /* 1265 * Init the TCP PCAP queues. 1266 */ 1267 tcp_pcap_tcpcb_init(tp); 1268 #endif 1269 return (tp); /* XXX */ 1270 } 1271 1272 /* 1273 * Switch the congestion control algorithm back to NewReno for any active 1274 * control blocks using an algorithm which is about to go away. 1275 * This ensures the CC framework can allow the unload to proceed without leaving 1276 * any dangling pointers which would trigger a panic. 1277 * Returning non-zero would inform the CC framework that something went wrong 1278 * and it would be unsafe to allow the unload to proceed. However, there is no 1279 * way for this to occur with this implementation so we always return zero. 1280 */ 1281 int 1282 tcp_ccalgounload(struct cc_algo *unload_algo) 1283 { 1284 struct cc_algo *tmpalgo; 1285 struct inpcb *inp; 1286 struct tcpcb *tp; 1287 VNET_ITERATOR_DECL(vnet_iter); 1288 1289 /* 1290 * Check all active control blocks across all network stacks and change 1291 * any that are using "unload_algo" back to NewReno. If "unload_algo" 1292 * requires cleanup code to be run, call it. 
1293 */ 1294 VNET_LIST_RLOCK(); 1295 VNET_FOREACH(vnet_iter) { 1296 CURVNET_SET(vnet_iter); 1297 INP_INFO_WLOCK(&V_tcbinfo); 1298 /* 1299 * New connections already part way through being initialised 1300 * with the CC algo we're removing will not race with this code 1301 * because the INP_INFO_WLOCK is held during initialisation. We 1302 * therefore don't enter the loop below until the connection 1303 * list has stabilised. 1304 */ 1305 LIST_FOREACH(inp, &V_tcb, inp_list) { 1306 INP_WLOCK(inp); 1307 /* Important to skip tcptw structs. */ 1308 if (!(inp->inp_flags & INP_TIMEWAIT) && 1309 (tp = intotcpcb(inp)) != NULL) { 1310 /* 1311 * By holding INP_WLOCK here, we are assured 1312 * that the connection is not currently 1313 * executing inside the CC module's functions 1314 * i.e. it is safe to make the switch back to 1315 * NewReno. 1316 */ 1317 if (CC_ALGO(tp) == unload_algo) { 1318 tmpalgo = CC_ALGO(tp); 1319 /* NewReno does not require any init. */ 1320 CC_ALGO(tp) = &newreno_cc_algo; 1321 if (tmpalgo->cb_destroy != NULL) 1322 tmpalgo->cb_destroy(tp->ccv); 1323 } 1324 } 1325 INP_WUNLOCK(inp); 1326 } 1327 INP_INFO_WUNLOCK(&V_tcbinfo); 1328 CURVNET_RESTORE(); 1329 } 1330 VNET_LIST_RUNLOCK(); 1331 1332 return (0); 1333 } 1334 1335 /* 1336 * Drop a TCP connection, reporting 1337 * the specified error. If connection is synchronized, 1338 * then send a RST to peer. 
1339 */ 1340 struct tcpcb * 1341 tcp_drop(struct tcpcb *tp, int errno) 1342 { 1343 struct socket *so = tp->t_inpcb->inp_socket; 1344 1345 INP_INFO_LOCK_ASSERT(&V_tcbinfo); 1346 INP_WLOCK_ASSERT(tp->t_inpcb); 1347 1348 if (TCPS_HAVERCVDSYN(tp->t_state)) { 1349 tcp_state_change(tp, TCPS_CLOSED); 1350 (void) tp->t_fb->tfb_tcp_output(tp); 1351 TCPSTAT_INC(tcps_drops); 1352 } else 1353 TCPSTAT_INC(tcps_conndrops); 1354 if (errno == ETIMEDOUT && tp->t_softerror) 1355 errno = tp->t_softerror; 1356 so->so_error = errno; 1357 return (tcp_close(tp)); 1358 } 1359 1360 void 1361 tcp_discardcb(struct tcpcb *tp) 1362 { 1363 struct inpcb *inp = tp->t_inpcb; 1364 struct socket *so = inp->inp_socket; 1365 #ifdef INET6 1366 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 1367 #endif /* INET6 */ 1368 int released; 1369 1370 INP_WLOCK_ASSERT(inp); 1371 1372 /* 1373 * Make sure that all of our timers are stopped before we delete the 1374 * PCB. 1375 * 1376 * If stopping a timer fails, we schedule a discard function in same 1377 * callout, and the last discard function called will take care of 1378 * deleting the tcpcb. 1379 */ 1380 tp->t_timers->tt_draincnt = 0; 1381 tcp_timer_stop(tp, TT_REXMT); 1382 tcp_timer_stop(tp, TT_PERSIST); 1383 tcp_timer_stop(tp, TT_KEEP); 1384 tcp_timer_stop(tp, TT_2MSL); 1385 tcp_timer_stop(tp, TT_DELACK); 1386 if (tp->t_fb->tfb_tcp_timer_stop_all) { 1387 /* 1388 * Call the stop-all function of the methods, 1389 * this function should call the tcp_timer_stop() 1390 * method with each of the function specific timeouts. 1391 * That stop will be called via the tfb_tcp_timer_stop() 1392 * which should use the async drain function of the 1393 * callout system (see tcp_var.h). 1394 */ 1395 tp->t_fb->tfb_tcp_timer_stop_all(tp); 1396 } 1397 1398 /* 1399 * If we got enough samples through the srtt filter, 1400 * save the rtt and rttvar in the routing entry. 1401 * 'Enough' is arbitrarily defined as 4 rtt samples. 
1402 * 4 samples is enough for the srtt filter to converge 1403 * to within enough % of the correct value; fewer samples 1404 * and we could save a bogus rtt. The danger is not high 1405 * as tcp quickly recovers from everything. 1406 * XXX: Works very well but needs some more statistics! 1407 */ 1408 if (tp->t_rttupdated >= 4) { 1409 struct hc_metrics_lite metrics; 1410 u_long ssthresh; 1411 1412 bzero(&metrics, sizeof(metrics)); 1413 /* 1414 * Update the ssthresh always when the conditions below 1415 * are satisfied. This gives us better new start value 1416 * for the congestion avoidance for new connections. 1417 * ssthresh is only set if packet loss occurred on a session. 1418 * 1419 * XXXRW: 'so' may be NULL here, and/or socket buffer may be 1420 * being torn down. Ideally this code would not use 'so'. 1421 */ 1422 ssthresh = tp->snd_ssthresh; 1423 if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { 1424 /* 1425 * convert the limit from user data bytes to 1426 * packets then to packet data bytes. 1427 */ 1428 ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; 1429 if (ssthresh < 2) 1430 ssthresh = 2; 1431 ssthresh *= (u_long)(tp->t_maxseg + 1432 #ifdef INET6 1433 (isipv6 ? sizeof (struct ip6_hdr) + 1434 sizeof (struct tcphdr) : 1435 #endif 1436 sizeof (struct tcpiphdr) 1437 #ifdef INET6 1438 ) 1439 #endif 1440 ); 1441 } else 1442 ssthresh = 0; 1443 metrics.rmx_ssthresh = ssthresh; 1444 1445 metrics.rmx_rtt = tp->t_srtt; 1446 metrics.rmx_rttvar = tp->t_rttvar; 1447 metrics.rmx_cwnd = tp->snd_cwnd; 1448 metrics.rmx_sendpipe = 0; 1449 metrics.rmx_recvpipe = 0; 1450 1451 tcp_hc_update(&inp->inp_inc, &metrics); 1452 } 1453 1454 /* free the reassembly queue, if any */ 1455 tcp_reass_flush(tp); 1456 1457 #ifdef TCP_OFFLOAD 1458 /* Disconnect offload device, if any. */ 1459 if (tp->t_flags & TF_TOE) 1460 tcp_offload_detach(tp); 1461 #endif 1462 1463 tcp_free_sackholes(tp); 1464 1465 #ifdef TCPPCAP 1466 /* Free the TCP PCAP queues. 
*/ 1467 tcp_pcap_drain(&(tp->t_inpkts)); 1468 tcp_pcap_drain(&(tp->t_outpkts)); 1469 #endif 1470 1471 /* Allow the CC algorithm to clean up after itself. */ 1472 if (CC_ALGO(tp)->cb_destroy != NULL) 1473 CC_ALGO(tp)->cb_destroy(tp->ccv); 1474 1475 khelp_destroy_osd(tp->osd); 1476 1477 CC_ALGO(tp) = NULL; 1478 inp->inp_ppcb = NULL; 1479 if (tp->t_timers->tt_draincnt == 0) { 1480 /* We own the last reference on tcpcb, let's free it. */ 1481 if (tp->t_fb->tfb_tcp_fb_fini) 1482 (*tp->t_fb->tfb_tcp_fb_fini)(tp); 1483 refcount_release(&tp->t_fb->tfb_refcnt); 1484 tp->t_inpcb = NULL; 1485 uma_zfree(V_tcpcb_zone, tp); 1486 released = in_pcbrele_wlocked(inp); 1487 KASSERT(!released, ("%s: inp %p should not have been released " 1488 "here", __func__, inp)); 1489 } 1490 } 1491 1492 void 1493 tcp_timer_discard(void *ptp) 1494 { 1495 struct inpcb *inp; 1496 struct tcpcb *tp; 1497 1498 tp = (struct tcpcb *)ptp; 1499 CURVNET_SET(tp->t_vnet); 1500 INP_INFO_RLOCK(&V_tcbinfo); 1501 inp = tp->t_inpcb; 1502 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", 1503 __func__, tp)); 1504 INP_WLOCK(inp); 1505 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) != 0, 1506 ("%s: tcpcb has to be stopped here", __func__)); 1507 tp->t_timers->tt_draincnt--; 1508 if (tp->t_timers->tt_draincnt == 0) { 1509 /* We own the last reference on this tcpcb, let's free it. */ 1510 if (tp->t_fb->tfb_tcp_fb_fini) 1511 (*tp->t_fb->tfb_tcp_fb_fini)(tp); 1512 refcount_release(&tp->t_fb->tfb_refcnt); 1513 tp->t_inpcb = NULL; 1514 uma_zfree(V_tcpcb_zone, tp); 1515 if (in_pcbrele_wlocked(inp)) { 1516 INP_INFO_RUNLOCK(&V_tcbinfo); 1517 CURVNET_RESTORE(); 1518 return; 1519 } 1520 } 1521 INP_WUNLOCK(inp); 1522 INP_INFO_RUNLOCK(&V_tcbinfo); 1523 CURVNET_RESTORE(); 1524 } 1525 1526 /* 1527 * Attempt to close a TCP control block, marking it as dropped, and freeing 1528 * the socket if we hold the only reference. 
1529 */ 1530 struct tcpcb * 1531 tcp_close(struct tcpcb *tp) 1532 { 1533 struct inpcb *inp = tp->t_inpcb; 1534 struct socket *so; 1535 1536 INP_INFO_LOCK_ASSERT(&V_tcbinfo); 1537 INP_WLOCK_ASSERT(inp); 1538 1539 #ifdef TCP_OFFLOAD 1540 if (tp->t_state == TCPS_LISTEN) 1541 tcp_offload_listen_stop(tp); 1542 #endif 1543 #ifdef TCP_RFC7413 1544 /* 1545 * This releases the TFO pending counter resource for TFO listen 1546 * sockets as well as passively-created TFO sockets that transition 1547 * from SYN_RECEIVED to CLOSED. 1548 */ 1549 if (tp->t_tfo_pending) { 1550 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 1551 tp->t_tfo_pending = NULL; 1552 } 1553 #endif 1554 in_pcbdrop(inp); 1555 TCPSTAT_INC(tcps_closed); 1556 TCPSTATES_DEC(tp->t_state); 1557 KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); 1558 so = inp->inp_socket; 1559 soisdisconnected(so); 1560 if (inp->inp_flags & INP_SOCKREF) { 1561 KASSERT(so->so_state & SS_PROTOREF, 1562 ("tcp_close: !SS_PROTOREF")); 1563 inp->inp_flags &= ~INP_SOCKREF; 1564 INP_WUNLOCK(inp); 1565 ACCEPT_LOCK(); 1566 SOCK_LOCK(so); 1567 so->so_state &= ~SS_PROTOREF; 1568 sofree(so); 1569 return (NULL); 1570 } 1571 return (tp); 1572 } 1573 1574 void 1575 tcp_drain(void) 1576 { 1577 VNET_ITERATOR_DECL(vnet_iter); 1578 1579 if (!do_tcpdrain) 1580 return; 1581 1582 VNET_LIST_RLOCK_NOSLEEP(); 1583 VNET_FOREACH(vnet_iter) { 1584 CURVNET_SET(vnet_iter); 1585 struct inpcb *inpb; 1586 struct tcpcb *tcpb; 1587 1588 /* 1589 * Walk the tcpbs, if existing, and flush the reassembly queue, 1590 * if there is one... 1591 * XXX: The "Net/3" implementation doesn't imply that the TCP 1592 * reassembly queue should be flushed, but in a situation 1593 * where we're really low on mbufs, this is potentially 1594 * useful. 
1595 */ 1596 INP_INFO_WLOCK(&V_tcbinfo); 1597 LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { 1598 if (inpb->inp_flags & INP_TIMEWAIT) 1599 continue; 1600 INP_WLOCK(inpb); 1601 if ((tcpb = intotcpcb(inpb)) != NULL) { 1602 tcp_reass_flush(tcpb); 1603 tcp_clean_sackreport(tcpb); 1604 } 1605 INP_WUNLOCK(inpb); 1606 } 1607 INP_INFO_WUNLOCK(&V_tcbinfo); 1608 CURVNET_RESTORE(); 1609 } 1610 VNET_LIST_RUNLOCK_NOSLEEP(); 1611 } 1612 1613 /* 1614 * Notify a tcp user of an asynchronous error; 1615 * store error as soft error, but wake up user 1616 * (for now, won't do anything until can select for soft error). 1617 * 1618 * Do not wake up user since there currently is no mechanism for 1619 * reporting soft errors (yet - a kqueue filter may be added). 1620 */ 1621 static struct inpcb * 1622 tcp_notify(struct inpcb *inp, int error) 1623 { 1624 struct tcpcb *tp; 1625 1626 INP_INFO_LOCK_ASSERT(&V_tcbinfo); 1627 INP_WLOCK_ASSERT(inp); 1628 1629 if ((inp->inp_flags & INP_TIMEWAIT) || 1630 (inp->inp_flags & INP_DROPPED)) 1631 return (inp); 1632 1633 tp = intotcpcb(inp); 1634 KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); 1635 1636 /* 1637 * Ignore some errors if we are hooked up. 1638 * If connection hasn't completed, has retransmitted several times, 1639 * and receives a second error, give up now. This is better 1640 * than waiting a long time to establish a connection that 1641 * can never complete. 
1642 */ 1643 if (tp->t_state == TCPS_ESTABLISHED && 1644 (error == EHOSTUNREACH || error == ENETUNREACH || 1645 error == EHOSTDOWN)) { 1646 if (inp->inp_route.ro_rt) { 1647 RTFREE(inp->inp_route.ro_rt); 1648 inp->inp_route.ro_rt = (struct rtentry *)NULL; 1649 } 1650 return (inp); 1651 } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && 1652 tp->t_softerror) { 1653 tp = tcp_drop(tp, error); 1654 if (tp != NULL) 1655 return (inp); 1656 else 1657 return (NULL); 1658 } else { 1659 tp->t_softerror = error; 1660 return (inp); 1661 } 1662 #if 0 1663 wakeup( &so->so_timeo); 1664 sorwakeup(so); 1665 sowwakeup(so); 1666 #endif 1667 } 1668 1669 static int 1670 tcp_pcblist(SYSCTL_HANDLER_ARGS) 1671 { 1672 int error, i, m, n, pcb_count; 1673 struct inpcb *inp, **inp_list; 1674 inp_gen_t gencnt; 1675 struct xinpgen xig; 1676 1677 /* 1678 * The process of preparing the TCB list is too time-consuming and 1679 * resource-intensive to repeat twice on every request. 1680 */ 1681 if (req->oldptr == NULL) { 1682 n = V_tcbinfo.ipi_count + 1683 counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); 1684 n += imax(n / 8, 10); 1685 req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); 1686 return (0); 1687 } 1688 1689 if (req->newptr != NULL) 1690 return (EPERM); 1691 1692 /* 1693 * OK, now we're committed to doing something. 
1694 */ 1695 INP_LIST_RLOCK(&V_tcbinfo); 1696 gencnt = V_tcbinfo.ipi_gencnt; 1697 n = V_tcbinfo.ipi_count; 1698 INP_LIST_RUNLOCK(&V_tcbinfo); 1699 1700 m = counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); 1701 1702 error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) 1703 + (n + m) * sizeof(struct xtcpcb)); 1704 if (error != 0) 1705 return (error); 1706 1707 xig.xig_len = sizeof xig; 1708 xig.xig_count = n + m; 1709 xig.xig_gen = gencnt; 1710 xig.xig_sogen = so_gencnt; 1711 error = SYSCTL_OUT(req, &xig, sizeof xig); 1712 if (error) 1713 return (error); 1714 1715 error = syncache_pcblist(req, m, &pcb_count); 1716 if (error) 1717 return (error); 1718 1719 inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); 1720 1721 INP_INFO_WLOCK(&V_tcbinfo); 1722 for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; 1723 inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { 1724 INP_WLOCK(inp); 1725 if (inp->inp_gencnt <= gencnt) { 1726 /* 1727 * XXX: This use of cr_cansee(), introduced with 1728 * TCP state changes, is not quite right, but for 1729 * now, better than nothing. 1730 */ 1731 if (inp->inp_flags & INP_TIMEWAIT) { 1732 if (intotw(inp) != NULL) 1733 error = cr_cansee(req->td->td_ucred, 1734 intotw(inp)->tw_cred); 1735 else 1736 error = EINVAL; /* Skip this inp. 
*/ 1737 } else 1738 error = cr_canseeinpcb(req->td->td_ucred, inp); 1739 if (error == 0) { 1740 in_pcbref(inp); 1741 inp_list[i++] = inp; 1742 } 1743 } 1744 INP_WUNLOCK(inp); 1745 } 1746 INP_INFO_WUNLOCK(&V_tcbinfo); 1747 n = i; 1748 1749 error = 0; 1750 for (i = 0; i < n; i++) { 1751 inp = inp_list[i]; 1752 INP_RLOCK(inp); 1753 if (inp->inp_gencnt <= gencnt) { 1754 struct xtcpcb xt; 1755 void *inp_ppcb; 1756 1757 bzero(&xt, sizeof(xt)); 1758 xt.xt_len = sizeof xt; 1759 /* XXX should avoid extra copy */ 1760 bcopy(inp, &xt.xt_inp, sizeof *inp); 1761 inp_ppcb = inp->inp_ppcb; 1762 if (inp_ppcb == NULL) 1763 bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); 1764 else if (inp->inp_flags & INP_TIMEWAIT) { 1765 bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); 1766 xt.xt_tp.t_state = TCPS_TIME_WAIT; 1767 } else { 1768 bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); 1769 if (xt.xt_tp.t_timers) 1770 tcp_timer_to_xtimer(&xt.xt_tp, xt.xt_tp.t_timers, &xt.xt_timer); 1771 } 1772 if (inp->inp_socket != NULL) 1773 sotoxsocket(inp->inp_socket, &xt.xt_socket); 1774 else { 1775 bzero(&xt.xt_socket, sizeof xt.xt_socket); 1776 xt.xt_socket.xso_protocol = IPPROTO_TCP; 1777 } 1778 xt.xt_inp.inp_gencnt = inp->inp_gencnt; 1779 INP_RUNLOCK(inp); 1780 error = SYSCTL_OUT(req, &xt, sizeof xt); 1781 } else 1782 INP_RUNLOCK(inp); 1783 } 1784 INP_INFO_RLOCK(&V_tcbinfo); 1785 for (i = 0; i < n; i++) { 1786 inp = inp_list[i]; 1787 INP_RLOCK(inp); 1788 if (!in_pcbrele_rlocked(inp)) 1789 INP_RUNLOCK(inp); 1790 } 1791 INP_INFO_RUNLOCK(&V_tcbinfo); 1792 1793 if (!error) { 1794 /* 1795 * Give the user an updated idea of our state. 1796 * If the generation differs from what we told 1797 * her before, she knows that something happened 1798 * while we were processing this request, and it 1799 * might be necessary to retry. 
1800 */ 1801 INP_LIST_RLOCK(&V_tcbinfo); 1802 xig.xig_gen = V_tcbinfo.ipi_gencnt; 1803 xig.xig_sogen = so_gencnt; 1804 xig.xig_count = V_tcbinfo.ipi_count + pcb_count; 1805 INP_LIST_RUNLOCK(&V_tcbinfo); 1806 error = SYSCTL_OUT(req, &xig, sizeof xig); 1807 } 1808 free(inp_list, M_TEMP); 1809 return (error); 1810 } 1811 1812 SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, 1813 CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, 1814 tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); 1815 1816 #ifdef INET 1817 static int 1818 tcp_getcred(SYSCTL_HANDLER_ARGS) 1819 { 1820 struct xucred xuc; 1821 struct sockaddr_in addrs[2]; 1822 struct inpcb *inp; 1823 int error; 1824 1825 error = priv_check(req->td, PRIV_NETINET_GETCRED); 1826 if (error) 1827 return (error); 1828 error = SYSCTL_IN(req, addrs, sizeof(addrs)); 1829 if (error) 1830 return (error); 1831 inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, 1832 addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL); 1833 if (inp != NULL) { 1834 if (inp->inp_socket == NULL) 1835 error = ENOENT; 1836 if (error == 0) 1837 error = cr_canseeinpcb(req->td->td_ucred, inp); 1838 if (error == 0) 1839 cru2x(inp->inp_cred, &xuc); 1840 INP_RUNLOCK(inp); 1841 } else 1842 error = ENOENT; 1843 if (error == 0) 1844 error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); 1845 return (error); 1846 } 1847 1848 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, 1849 CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, 1850 tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); 1851 #endif /* INET */ 1852 1853 #ifdef INET6 1854 static int 1855 tcp6_getcred(SYSCTL_HANDLER_ARGS) 1856 { 1857 struct xucred xuc; 1858 struct sockaddr_in6 addrs[2]; 1859 struct inpcb *inp; 1860 int error; 1861 #ifdef INET 1862 int mapped = 0; 1863 #endif 1864 1865 error = priv_check(req->td, PRIV_NETINET_GETCRED); 1866 if (error) 1867 return (error); 1868 error = SYSCTL_IN(req, addrs, sizeof(addrs)); 1869 if (error) 1870 return (error); 1871 if 
((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || 1872 (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { 1873 return (error); 1874 } 1875 if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { 1876 #ifdef INET 1877 if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) 1878 mapped = 1; 1879 else 1880 #endif 1881 return (EINVAL); 1882 } 1883 1884 #ifdef INET 1885 if (mapped == 1) 1886 inp = in_pcblookup(&V_tcbinfo, 1887 *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], 1888 addrs[1].sin6_port, 1889 *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], 1890 addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); 1891 else 1892 #endif 1893 inp = in6_pcblookup(&V_tcbinfo, 1894 &addrs[1].sin6_addr, addrs[1].sin6_port, 1895 &addrs[0].sin6_addr, addrs[0].sin6_port, 1896 INPLOOKUP_RLOCKPCB, NULL); 1897 if (inp != NULL) { 1898 if (inp->inp_socket == NULL) 1899 error = ENOENT; 1900 if (error == 0) 1901 error = cr_canseeinpcb(req->td->td_ucred, inp); 1902 if (error == 0) 1903 cru2x(inp->inp_cred, &xuc); 1904 INP_RUNLOCK(inp); 1905 } else 1906 error = ENOENT; 1907 if (error == 0) 1908 error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); 1909 return (error); 1910 } 1911 1912 SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, 1913 CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, 1914 tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); 1915 #endif /* INET6 */ 1916 1917 1918 #ifdef INET 1919 void 1920 tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) 1921 { 1922 struct ip *ip = vip; 1923 struct tcphdr *th; 1924 struct in_addr faddr; 1925 struct inpcb *inp; 1926 struct tcpcb *tp; 1927 struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; 1928 struct icmp *icp; 1929 struct in_conninfo inc; 1930 tcp_seq icmp_tcp_seq; 1931 int mtu; 1932 1933 faddr = ((struct sockaddr_in *)sa)->sin_addr; 1934 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 1935 return; 1936 1937 if (cmd == PRC_MSGSIZE) 1938 notify = tcp_mtudisc_notify; 1939 else if (V_icmp_may_rst && 
(cmd == PRC_UNREACH_ADMIN_PROHIB || 1940 cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) 1941 notify = tcp_drop_syn_sent; 1942 else if (PRC_IS_REDIRECT(cmd)) { 1943 /* signal EHOSTDOWN, as it flushes the cached route */ 1944 in_pcbnotifyall(&V_tcbinfo, faddr, EHOSTDOWN, notify); 1945 return; 1946 } 1947 /* 1948 * Hostdead is ugly because it goes linearly through all PCBs. 1949 * XXX: We never get this from ICMP, otherwise it makes an 1950 * excellent DoS attack on machines with many connections. 1951 */ 1952 else if (cmd == PRC_HOSTDEAD) 1953 ip = NULL; 1954 else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) 1955 return; 1956 1957 if (ip == NULL) { 1958 in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); 1959 return; 1960 } 1961 1962 icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip)); 1963 th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); 1964 INP_INFO_RLOCK(&V_tcbinfo); 1965 inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, ip->ip_src, 1966 th->th_sport, INPLOOKUP_WLOCKPCB, NULL); 1967 if (inp != NULL) { 1968 if (!(inp->inp_flags & INP_TIMEWAIT) && 1969 !(inp->inp_flags & INP_DROPPED) && 1970 !(inp->inp_socket == NULL)) { 1971 icmp_tcp_seq = ntohl(th->th_seq); 1972 tp = intotcpcb(inp); 1973 if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && 1974 SEQ_LT(icmp_tcp_seq, tp->snd_max)) { 1975 if (cmd == PRC_MSGSIZE) { 1976 /* 1977 * MTU discovery: 1978 * If we got a needfrag set the MTU 1979 * in the route to the suggested new 1980 * value (if given) and then notify. 1981 */ 1982 mtu = ntohs(icp->icmp_nextmtu); 1983 /* 1984 * If no alternative MTU was 1985 * proposed, try the next smaller 1986 * one. 1987 */ 1988 if (!mtu) 1989 mtu = ip_next_mtu( 1990 ntohs(ip->ip_len), 1); 1991 if (mtu < V_tcp_minmss + 1992 sizeof(struct tcpiphdr)) 1993 mtu = V_tcp_minmss + 1994 sizeof(struct tcpiphdr); 1995 /* 1996 * Only process the offered MTU if it 1997 * is smaller than the current one. 
1998 */ 1999 if (mtu < tp->t_maxseg + 2000 sizeof(struct tcpiphdr)) { 2001 bzero(&inc, sizeof(inc)); 2002 inc.inc_faddr = faddr; 2003 inc.inc_fibnum = 2004 inp->inp_inc.inc_fibnum; 2005 tcp_hc_updatemtu(&inc, mtu); 2006 tcp_mtudisc(inp, mtu); 2007 } 2008 } else 2009 inp = (*notify)(inp, 2010 inetctlerrmap[cmd]); 2011 } 2012 } 2013 if (inp != NULL) 2014 INP_WUNLOCK(inp); 2015 } else { 2016 bzero(&inc, sizeof(inc)); 2017 inc.inc_fport = th->th_dport; 2018 inc.inc_lport = th->th_sport; 2019 inc.inc_faddr = faddr; 2020 inc.inc_laddr = ip->ip_src; 2021 syncache_unreach(&inc, th); 2022 } 2023 INP_INFO_RUNLOCK(&V_tcbinfo); 2024 } 2025 #endif /* INET */ 2026 2027 #ifdef INET6 2028 void 2029 tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) 2030 { 2031 struct tcphdr th; 2032 struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; 2033 struct ip6_hdr *ip6; 2034 struct mbuf *m; 2035 struct ip6ctlparam *ip6cp = NULL; 2036 const struct sockaddr_in6 *sa6_src = NULL; 2037 int off; 2038 struct tcp_portonly { 2039 u_int16_t th_sport; 2040 u_int16_t th_dport; 2041 } *thp; 2042 2043 if (sa->sa_family != AF_INET6 || 2044 sa->sa_len != sizeof(struct sockaddr_in6)) 2045 return; 2046 2047 if (cmd == PRC_MSGSIZE) 2048 notify = tcp_mtudisc_notify; 2049 else if (!PRC_IS_REDIRECT(cmd) && 2050 ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) 2051 return; 2052 2053 /* if the parameter is from icmp6, decode it. */ 2054 if (d != NULL) { 2055 ip6cp = (struct ip6ctlparam *)d; 2056 m = ip6cp->ip6c_m; 2057 ip6 = ip6cp->ip6c_ip6; 2058 off = ip6cp->ip6c_off; 2059 sa6_src = ip6cp->ip6c_src; 2060 } else { 2061 m = NULL; 2062 ip6 = NULL; 2063 off = 0; /* fool gcc */ 2064 sa6_src = &sa6_any; 2065 } 2066 2067 if (ip6 != NULL) { 2068 struct in_conninfo inc; 2069 /* 2070 * XXX: We assume that when IPV6 is non NULL, 2071 * M and OFF are valid. 
2072 */ 2073 2074 /* check if we can safely examine src and dst ports */ 2075 if (m->m_pkthdr.len < off + sizeof(*thp)) 2076 return; 2077 2078 bzero(&th, sizeof(th)); 2079 m_copydata(m, off, sizeof(*thp), (caddr_t)&th); 2080 2081 in6_pcbnotify(&V_tcbinfo, sa, th.th_dport, 2082 (struct sockaddr *)ip6cp->ip6c_src, 2083 th.th_sport, cmd, NULL, notify); 2084 2085 bzero(&inc, sizeof(inc)); 2086 inc.inc_fport = th.th_dport; 2087 inc.inc_lport = th.th_sport; 2088 inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; 2089 inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; 2090 inc.inc_flags |= INC_ISIPV6; 2091 INP_INFO_RLOCK(&V_tcbinfo); 2092 syncache_unreach(&inc, &th); 2093 INP_INFO_RUNLOCK(&V_tcbinfo); 2094 } else 2095 in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 2096 0, cmd, NULL, notify); 2097 } 2098 #endif /* INET6 */ 2099 2100 2101 /* 2102 * Following is where TCP initial sequence number generation occurs. 2103 * 2104 * There are two places where we must use initial sequence numbers: 2105 * 1. In SYN-ACK packets. 2106 * 2. In SYN packets. 2107 * 2108 * All ISNs for SYN-ACK packets are generated by the syncache. See 2109 * tcp_syncache.c for details. 2110 * 2111 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling 2112 * depends on this property. In addition, these ISNs should be 2113 * unguessable so as to prevent connection hijacking. To satisfy 2114 * the requirements of this situation, the algorithm outlined in 2115 * RFC 1948 is used, with only small modifications. 2116 * 2117 * Implementation details: 2118 * 2119 * Time is based off the system timer, and is corrected so that it 2120 * increases by one megabyte per second. This allows for proper 2121 * recycling on high speed LANs while still leaving over an hour 2122 * before rollover. 2123 * 2124 * As reading the *exact* system time is too expensive to be done 2125 * whenever setting up a TCP connection, we increment the time 2126 * offset in two ways. 
First, a small random positive increment
 * is added to isn_offset for each connection that is set up.
 * Second, whenever tcp_new_isn() finds that the tick counter has moved
 * on since its previous call, it increments isn_offset as necessary so
 * that sequence numbers are incremented at approximately
 * ISN_BYTES_PER_SECOND.  (NOTE(review): this text used to attribute the
 * second step to a per-tick tcp_isn_tick() function; no such function is
 * visible in this part of the file - confirm it is gone.)  The
 * random positive increments serve only to ensure that the same
 * exact sequence number is never sent out twice (as could otherwise
 * happen when a port is recycled in less than the system tick
 * interval.)
 *
 * net.inet.tcp.isn_reseed_interval controls the number of seconds
 * between seeding of isn_secret.  This is normally set to zero,
 * as reseeding should not be necessary.
 *
 * Locking: tcp_new_isn() reads and updates isn_secret, isn_last,
 * isn_last_reseed, isn_offset and isn_offset_old only between ISN_LOCK()
 * and ISN_UNLOCK(); the MD5 context is a local variable of that
 * function.  (NOTE(review): this text used to name the TCP pcbinfo lock;
 * the code below does not take it.)
 */

#define ISN_BYTES_PER_SECOND 1048576
#define ISN_STATIC_INCREMENT 4096
#define ISN_RANDOM_INCREMENT (4096 - 1)

static VNET_DEFINE(u_char, isn_secret[32]);
static VNET_DEFINE(int, isn_last);
static VNET_DEFINE(int, isn_last_reseed);
static VNET_DEFINE(u_int32_t, isn_offset);
static VNET_DEFINE(u_int32_t, isn_offset_old);

#define	V_isn_secret			VNET(isn_secret)
#define	V_isn_last			VNET(isn_last)
#define	V_isn_last_reseed		VNET(isn_last_reseed)
#define	V_isn_offset			VNET(isn_offset)
#define	V_isn_offset_old		VNET(isn_offset_old)

/*
 * Return a new initial sequence number for the connection `tp': the
 * first 32-bit word of an MD5 hash over the foreign port, local port,
 * foreign address, local address and isn_secret, plus the running
 * isn_offset.  The caller must hold the inpcb write lock.
 */
tcp_seq
tcp_new_isn(struct tcpcb *tp)
{
	MD5_CTX isn_ctx;
	u_int32_t md5_buffer[4];
	tcp_seq new_isn;
	u_int32_t projected_offset;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	ISN_LOCK();
	/*
	 * Seed if this is the first use, reseed if requested.
	 *
	 * NOTE(review): isn_last_reseed == 0 doubles as the "never seeded"
	 * marker, and the interval test adds to the stored tick value
	 * instead of subtracting it from the current one, so it may
	 * misbehave around tick-counter wraparound - confirm whether that
	 * matters for the configured intervals.
	 */
	if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) &&
	     (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz)
		< (u_int)ticks))) {
		read_random(&V_isn_secret, sizeof(V_isn_secret));
		V_isn_last_reseed = ticks;
	}

	/* Compute the md5 hash and return the ISN. */
	MD5Init(&isn_ctx);
	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
#ifdef INET6
	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
			  sizeof(struct in6_addr));
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
			  sizeof(struct in6_addr));
	} else
#endif
	{
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
			  sizeof(struct in_addr));
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
			  sizeof(struct in_addr));
	}
	MD5Update(&isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret));
	MD5Final((u_char *) &md5_buffer, &isn_ctx);
	new_isn = (tcp_seq) md5_buffer[0];
	/* Per-call step: 4096 plus a random value in [0, 4095]. */
	V_isn_offset += ISN_STATIC_INCREMENT +
		(arc4random() & ISN_RANDOM_INCREMENT);
	if (ticks != V_isn_last) {
		/*
		 * The tick counter moved since the last call: never let the
		 * offset fall behind ISN_BYTES_PER_SECOND worth of elapsed
		 * ticks, measured from the offset recorded at that call.
		 */
		projected_offset = V_isn_offset_old +
		    ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last);
		if (SEQ_GT(projected_offset, V_isn_offset))
			V_isn_offset = projected_offset;
		V_isn_offset_old = V_isn_offset;
		V_isn_last = ticks;
	}
	new_isn += V_isn_offset;
	ISN_UNLOCK();
	return (new_isn);
}

/*
 * When a specific ICMP unreachable message is received and the
 * connection state is SYN-SENT, drop the connection.  This behavior
 * is controlled by the icmp_may_rst sysctl.
2220 */ 2221 struct inpcb * 2222 tcp_drop_syn_sent(struct inpcb *inp, int errno) 2223 { 2224 struct tcpcb *tp; 2225 2226 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 2227 INP_WLOCK_ASSERT(inp); 2228 2229 if ((inp->inp_flags & INP_TIMEWAIT) || 2230 (inp->inp_flags & INP_DROPPED)) 2231 return (inp); 2232 2233 tp = intotcpcb(inp); 2234 if (tp->t_state != TCPS_SYN_SENT) 2235 return (inp); 2236 2237 tp = tcp_drop(tp, errno); 2238 if (tp != NULL) 2239 return (inp); 2240 else 2241 return (NULL); 2242 } 2243 2244 /* 2245 * When `need fragmentation' ICMP is received, update our idea of the MSS 2246 * based on the new value. Also nudge TCP to send something, since we 2247 * know the packet we just sent was dropped. 2248 * This duplicates some code in the tcp_mss() function in tcp_input.c. 2249 */ 2250 static struct inpcb * 2251 tcp_mtudisc_notify(struct inpcb *inp, int error) 2252 { 2253 2254 tcp_mtudisc(inp, -1); 2255 return (inp); 2256 } 2257 2258 static void 2259 tcp_mtudisc(struct inpcb *inp, int mtuoffer) 2260 { 2261 struct tcpcb *tp; 2262 struct socket *so; 2263 2264 INP_WLOCK_ASSERT(inp); 2265 if ((inp->inp_flags & INP_TIMEWAIT) || 2266 (inp->inp_flags & INP_DROPPED)) 2267 return; 2268 2269 tp = intotcpcb(inp); 2270 KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); 2271 2272 tcp_mss_update(tp, -1, mtuoffer, NULL, NULL); 2273 2274 so = inp->inp_socket; 2275 SOCKBUF_LOCK(&so->so_snd); 2276 /* If the mss is larger than the socket buffer, decrease the mss. */ 2277 if (so->so_snd.sb_hiwat < tp->t_maxseg) 2278 tp->t_maxseg = so->so_snd.sb_hiwat; 2279 SOCKBUF_UNLOCK(&so->so_snd); 2280 2281 TCPSTAT_INC(tcps_mturesent); 2282 tp->t_rtttime = 0; 2283 tp->snd_nxt = tp->snd_una; 2284 tcp_free_sackholes(tp); 2285 tp->snd_recover = tp->snd_max; 2286 if (tp->t_flags & TF_SACK_PERMIT) 2287 EXIT_FASTRECOVERY(tp->t_flags); 2288 tp->t_fb->tfb_tcp_output(tp); 2289 } 2290 2291 #ifdef INET 2292 /* 2293 * Look-up the routing entry to the peer of this inpcb. 
If no route 2294 * is found and it cannot be allocated, then return 0. This routine 2295 * is called by TCP routines that access the rmx structure and by 2296 * tcp_mss_update to get the peer/interface MTU. 2297 */ 2298 u_long 2299 tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) 2300 { 2301 struct nhop4_extended nh4; 2302 struct ifnet *ifp; 2303 u_long maxmtu = 0; 2304 2305 KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); 2306 2307 if (inc->inc_faddr.s_addr != INADDR_ANY) { 2308 2309 if (fib4_lookup_nh_ext(inc->inc_fibnum, inc->inc_faddr, 2310 NHR_REF, 0, &nh4) != 0) 2311 return (0); 2312 2313 ifp = nh4.nh_ifp; 2314 maxmtu = nh4.nh_mtu; 2315 2316 /* Report additional interface capabilities. */ 2317 if (cap != NULL) { 2318 if (ifp->if_capenable & IFCAP_TSO4 && 2319 ifp->if_hwassist & CSUM_TSO) { 2320 cap->ifcap |= CSUM_TSO; 2321 cap->tsomax = ifp->if_hw_tsomax; 2322 cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; 2323 cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; 2324 } 2325 } 2326 fib4_free_nh_ext(inc->inc_fibnum, &nh4); 2327 } 2328 return (maxmtu); 2329 } 2330 #endif /* INET */ 2331 2332 #ifdef INET6 2333 u_long 2334 tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) 2335 { 2336 struct nhop6_extended nh6; 2337 struct in6_addr dst6; 2338 uint32_t scopeid; 2339 struct ifnet *ifp; 2340 u_long maxmtu = 0; 2341 2342 KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); 2343 2344 if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { 2345 in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid); 2346 if (fib6_lookup_nh_ext(inc->inc_fibnum, &dst6, scopeid, 0, 2347 0, &nh6) != 0) 2348 return (0); 2349 2350 ifp = nh6.nh_ifp; 2351 maxmtu = nh6.nh_mtu; 2352 2353 /* Report additional interface capabilities. 
*/ 2354 if (cap != NULL) { 2355 if (ifp->if_capenable & IFCAP_TSO6 && 2356 ifp->if_hwassist & CSUM_TSO) { 2357 cap->ifcap |= CSUM_TSO; 2358 cap->tsomax = ifp->if_hw_tsomax; 2359 cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; 2360 cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; 2361 } 2362 } 2363 fib6_free_nh_ext(inc->inc_fibnum, &nh6); 2364 } 2365 2366 return (maxmtu); 2367 } 2368 #endif /* INET6 */ 2369 2370 /* 2371 * Calculate effective SMSS per RFC5681 definition for a given TCP 2372 * connection at its current state, taking into account SACK and etc. 2373 */ 2374 u_int 2375 tcp_maxseg(const struct tcpcb *tp) 2376 { 2377 u_int optlen; 2378 2379 if (tp->t_flags & TF_NOOPT) 2380 return (tp->t_maxseg); 2381 2382 /* 2383 * Here we have a simplified code from tcp_addoptions(), 2384 * without a proper loop, and having most of paddings hardcoded. 2385 * We might make mistakes with padding here in some edge cases, 2386 * but this is harmless, since result of tcp_maxseg() is used 2387 * only in cwnd and ssthresh estimations. 
2388 */ 2389 #define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) 2390 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 2391 if (tp->t_flags & TF_RCVD_TSTMP) 2392 optlen = TCPOLEN_TSTAMP_APPA; 2393 else 2394 optlen = 0; 2395 #ifdef TCP_SIGNATURE 2396 if (tp->t_flags & TF_SIGNATURE) 2397 optlen += PAD(TCPOLEN_SIGNATURE); 2398 #endif 2399 if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { 2400 optlen += TCPOLEN_SACKHDR; 2401 optlen += tp->rcv_numsacks * TCPOLEN_SACK; 2402 optlen = PAD(optlen); 2403 } 2404 } else { 2405 if (tp->t_flags & TF_REQ_TSTMP) 2406 optlen = TCPOLEN_TSTAMP_APPA; 2407 else 2408 optlen = PAD(TCPOLEN_MAXSEG); 2409 if (tp->t_flags & TF_REQ_SCALE) 2410 optlen += PAD(TCPOLEN_WINDOW); 2411 #ifdef TCP_SIGNATURE 2412 if (tp->t_flags & TF_SIGNATURE) 2413 optlen += PAD(TCPOLEN_SIGNATURE); 2414 #endif 2415 if (tp->t_flags & TF_SACK_PERMIT) 2416 optlen += PAD(TCPOLEN_SACK_PERMITTED); 2417 } 2418 #undef PAD 2419 optlen = min(optlen, TCP_MAXOLEN); 2420 return (tp->t_maxseg - optlen); 2421 } 2422 2423 #ifdef IPSEC 2424 /* compute ESP/AH header size for TCP, including outer IP header. 
*/ 2425 size_t 2426 ipsec_hdrsiz_tcp(struct tcpcb *tp) 2427 { 2428 struct inpcb *inp; 2429 struct mbuf *m; 2430 size_t hdrsiz; 2431 struct ip *ip; 2432 #ifdef INET6 2433 struct ip6_hdr *ip6; 2434 #endif 2435 struct tcphdr *th; 2436 2437 if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL) || 2438 (!key_havesp(IPSEC_DIR_OUTBOUND))) 2439 return (0); 2440 m = m_gethdr(M_NOWAIT, MT_DATA); 2441 if (!m) 2442 return (0); 2443 2444 #ifdef INET6 2445 if ((inp->inp_vflag & INP_IPV6) != 0) { 2446 ip6 = mtod(m, struct ip6_hdr *); 2447 th = (struct tcphdr *)(ip6 + 1); 2448 m->m_pkthdr.len = m->m_len = 2449 sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 2450 tcpip_fillheaders(inp, ip6, th); 2451 hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); 2452 } else 2453 #endif /* INET6 */ 2454 { 2455 ip = mtod(m, struct ip *); 2456 th = (struct tcphdr *)(ip + 1); 2457 m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); 2458 tcpip_fillheaders(inp, ip, th); 2459 hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); 2460 } 2461 2462 m_free(m); 2463 return (hdrsiz); 2464 } 2465 #endif /* IPSEC */ 2466 2467 #ifdef TCP_SIGNATURE 2468 /* 2469 * Callback function invoked by m_apply() to digest TCP segment data 2470 * contained within an mbuf chain. 2471 */ 2472 static int 2473 tcp_signature_apply(void *fstate, void *data, u_int len) 2474 { 2475 2476 MD5Update(fstate, (u_char *)data, len); 2477 return (0); 2478 } 2479 2480 /* 2481 * XXX The key is retrieved from the system's PF_KEY SADB, by keying a 2482 * search with the destination IP address, and a 'magic SPI' to be 2483 * determined by the application. This is hardcoded elsewhere to 1179 2484 */ 2485 struct secasvar * 2486 tcp_get_sav(struct mbuf *m, u_int direction) 2487 { 2488 union sockaddr_union dst; 2489 struct secasvar *sav; 2490 struct ip *ip; 2491 #ifdef INET6 2492 struct ip6_hdr *ip6; 2493 char ip6buf[INET6_ADDRSTRLEN]; 2494 #endif 2495 2496 /* Extract the destination from the IP header in the mbuf. 
*/ 2497 bzero(&dst, sizeof(union sockaddr_union)); 2498 ip = mtod(m, struct ip *); 2499 #ifdef INET6 2500 ip6 = NULL; /* Make the compiler happy. */ 2501 #endif 2502 switch (ip->ip_v) { 2503 #ifdef INET 2504 case IPVERSION: 2505 dst.sa.sa_len = sizeof(struct sockaddr_in); 2506 dst.sa.sa_family = AF_INET; 2507 dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ? 2508 ip->ip_src : ip->ip_dst; 2509 break; 2510 #endif 2511 #ifdef INET6 2512 case (IPV6_VERSION >> 4): 2513 ip6 = mtod(m, struct ip6_hdr *); 2514 dst.sa.sa_len = sizeof(struct sockaddr_in6); 2515 dst.sa.sa_family = AF_INET6; 2516 dst.sin6.sin6_addr = (direction == IPSEC_DIR_INBOUND) ? 2517 ip6->ip6_src : ip6->ip6_dst; 2518 break; 2519 #endif 2520 default: 2521 return (NULL); 2522 /* NOTREACHED */ 2523 break; 2524 } 2525 2526 /* Look up an SADB entry which matches the address of the peer. */ 2527 sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI)); 2528 if (sav == NULL) { 2529 ipseclog((LOG_ERR, "%s: SADB lookup failed for %s\n", __func__, 2530 (ip->ip_v == IPVERSION) ? inet_ntoa(dst.sin.sin_addr) : 2531 #ifdef INET6 2532 (ip->ip_v == (IPV6_VERSION >> 4)) ? 2533 ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) : 2534 #endif 2535 "(unsupported)")); 2536 } 2537 2538 return (sav); 2539 } 2540 2541 /* 2542 * Compute TCP-MD5 hash of a TCP segment. (RFC2385) 2543 * 2544 * Parameters: 2545 * m pointer to head of mbuf chain 2546 * len length of TCP segment data, excluding options 2547 * optlen length of TCP segment options 2548 * buf pointer to storage for computed MD5 digest 2549 * sav pointer to security assosiation 2550 * 2551 * We do this over ip, tcphdr, segment data, and the key in the SADB. 2552 * When called from tcp_input(), we can be sure that th_sum has been 2553 * zeroed out and verified already. 2554 * 2555 * Releases reference to SADB key before return. 2556 * 2557 * Return 0 if successful, otherwise return -1. 
2558 * 2559 */ 2560 int 2561 tcp_signature_do_compute(struct mbuf *m, int len, int optlen, 2562 u_char *buf, struct secasvar *sav) 2563 { 2564 #ifdef INET 2565 struct ippseudo ippseudo; 2566 #endif 2567 MD5_CTX ctx; 2568 int doff; 2569 struct ip *ip; 2570 #ifdef INET 2571 struct ipovly *ipovly; 2572 #endif 2573 struct tcphdr *th; 2574 #ifdef INET6 2575 struct ip6_hdr *ip6; 2576 struct in6_addr in6; 2577 uint32_t plen; 2578 uint16_t nhdr; 2579 #endif 2580 u_short savecsum; 2581 2582 KASSERT(m != NULL, ("NULL mbuf chain")); 2583 KASSERT(buf != NULL, ("NULL signature pointer")); 2584 2585 /* Extract the destination from the IP header in the mbuf. */ 2586 ip = mtod(m, struct ip *); 2587 #ifdef INET6 2588 ip6 = NULL; /* Make the compiler happy. */ 2589 #endif 2590 2591 MD5Init(&ctx); 2592 /* 2593 * Step 1: Update MD5 hash with IP(v6) pseudo-header. 2594 * 2595 * XXX The ippseudo header MUST be digested in network byte order, 2596 * or else we'll fail the regression test. Assume all fields we've 2597 * been doing arithmetic on have been in host byte order. 2598 * XXX One cannot depend on ipovly->ih_len here. When called from 2599 * tcp_output(), the underlying ip_len member has not yet been set. 
2600 */ 2601 switch (ip->ip_v) { 2602 #ifdef INET 2603 case IPVERSION: 2604 ipovly = (struct ipovly *)ip; 2605 ippseudo.ippseudo_src = ipovly->ih_src; 2606 ippseudo.ippseudo_dst = ipovly->ih_dst; 2607 ippseudo.ippseudo_pad = 0; 2608 ippseudo.ippseudo_p = IPPROTO_TCP; 2609 ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) + 2610 optlen); 2611 MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo)); 2612 2613 th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip)); 2614 doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen; 2615 break; 2616 #endif 2617 #ifdef INET6 2618 /* 2619 * RFC 2385, 2.0 Proposal 2620 * For IPv6, the pseudo-header is as described in RFC 2460, namely the 2621 * 128-bit source IPv6 address, 128-bit destination IPv6 address, zero- 2622 * extended next header value (to form 32 bits), and 32-bit segment 2623 * length. 2624 * Note: Upper-Layer Packet Length comes before Next Header. 2625 */ 2626 case (IPV6_VERSION >> 4): 2627 in6 = ip6->ip6_src; 2628 in6_clearscope(&in6); 2629 MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr)); 2630 in6 = ip6->ip6_dst; 2631 in6_clearscope(&in6); 2632 MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr)); 2633 plen = htonl(len + sizeof(struct tcphdr) + optlen); 2634 MD5Update(&ctx, (char *)&plen, sizeof(uint32_t)); 2635 nhdr = 0; 2636 MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); 2637 MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); 2638 MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); 2639 nhdr = IPPROTO_TCP; 2640 MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); 2641 2642 th = (struct tcphdr *)((u_char *)ip6 + sizeof(struct ip6_hdr)); 2643 doff = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + optlen; 2644 break; 2645 #endif 2646 default: 2647 KEY_FREESAV(&sav); 2648 return (-1); 2649 /* NOTREACHED */ 2650 break; 2651 } 2652 2653 2654 /* 2655 * Step 2: Update MD5 hash with TCP header, excluding options. 2656 * The TCP checksum must be set to zero. 
2657 */ 2658 savecsum = th->th_sum; 2659 th->th_sum = 0; 2660 MD5Update(&ctx, (char *)th, sizeof(struct tcphdr)); 2661 th->th_sum = savecsum; 2662 2663 /* 2664 * Step 3: Update MD5 hash with TCP segment data. 2665 * Use m_apply() to avoid an early m_pullup(). 2666 */ 2667 if (len > 0) 2668 m_apply(m, doff, len, tcp_signature_apply, &ctx); 2669 2670 /* 2671 * Step 4: Update MD5 hash with shared secret. 2672 */ 2673 MD5Update(&ctx, sav->key_auth->key_data, _KEYLEN(sav->key_auth)); 2674 MD5Final(buf, &ctx); 2675 2676 key_sa_recordxfer(sav, m); 2677 KEY_FREESAV(&sav); 2678 return (0); 2679 } 2680 2681 /* 2682 * Compute TCP-MD5 hash of a TCP segment. (RFC2385) 2683 * 2684 * Return 0 if successful, otherwise return -1. 2685 */ 2686 int 2687 tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, 2688 u_char *buf, u_int direction) 2689 { 2690 struct secasvar *sav; 2691 2692 if ((sav = tcp_get_sav(m, direction)) == NULL) 2693 return (-1); 2694 2695 return (tcp_signature_do_compute(m, len, optlen, buf, sav)); 2696 } 2697 2698 /* 2699 * Verify the TCP-MD5 hash of a TCP segment. (RFC2385) 2700 * 2701 * Parameters: 2702 * m pointer to head of mbuf chain 2703 * len length of TCP segment data, excluding options 2704 * optlen length of TCP segment options 2705 * buf pointer to storage for computed MD5 digest 2706 * direction direction of flow (IPSEC_DIR_INBOUND or OUTBOUND) 2707 * 2708 * Return 1 if successful, otherwise return 0. 2709 */ 2710 int 2711 tcp_signature_verify(struct mbuf *m, int off0, int tlen, int optlen, 2712 struct tcpopt *to, struct tcphdr *th, u_int tcpbflag) 2713 { 2714 char tmpdigest[TCP_SIGLEN]; 2715 2716 if (tcp_sig_checksigs == 0) 2717 return (1); 2718 if ((tcpbflag & TF_SIGNATURE) == 0) { 2719 if ((to->to_flags & TOF_SIGNATURE) != 0) { 2720 2721 /* 2722 * If this socket is not expecting signature but 2723 * the segment contains signature just fail. 
2724 */ 2725 TCPSTAT_INC(tcps_sig_err_sigopt); 2726 TCPSTAT_INC(tcps_sig_rcvbadsig); 2727 return (0); 2728 } 2729 2730 /* Signature is not expected, and not present in segment. */ 2731 return (1); 2732 } 2733 2734 /* 2735 * If this socket is expecting signature but the segment does not 2736 * contain any just fail. 2737 */ 2738 if ((to->to_flags & TOF_SIGNATURE) == 0) { 2739 TCPSTAT_INC(tcps_sig_err_nosigopt); 2740 TCPSTAT_INC(tcps_sig_rcvbadsig); 2741 return (0); 2742 } 2743 if (tcp_signature_compute(m, off0, tlen, optlen, &tmpdigest[0], 2744 IPSEC_DIR_INBOUND) == -1) { 2745 TCPSTAT_INC(tcps_sig_err_buildsig); 2746 TCPSTAT_INC(tcps_sig_rcvbadsig); 2747 return (0); 2748 } 2749 2750 if (bcmp(to->to_signature, &tmpdigest[0], TCP_SIGLEN) != 0) { 2751 TCPSTAT_INC(tcps_sig_rcvbadsig); 2752 return (0); 2753 } 2754 TCPSTAT_INC(tcps_sig_rcvgoodsig); 2755 return (1); 2756 } 2757 #endif /* TCP_SIGNATURE */ 2758 2759 static int 2760 sysctl_drop(SYSCTL_HANDLER_ARGS) 2761 { 2762 /* addrs[0] is a foreign socket, addrs[1] is a local one. 
*/ 2763 struct sockaddr_storage addrs[2]; 2764 struct inpcb *inp; 2765 struct tcpcb *tp; 2766 struct tcptw *tw; 2767 struct sockaddr_in *fin, *lin; 2768 #ifdef INET6 2769 struct sockaddr_in6 *fin6, *lin6; 2770 #endif 2771 int error; 2772 2773 inp = NULL; 2774 fin = lin = NULL; 2775 #ifdef INET6 2776 fin6 = lin6 = NULL; 2777 #endif 2778 error = 0; 2779 2780 if (req->oldptr != NULL || req->oldlen != 0) 2781 return (EINVAL); 2782 if (req->newptr == NULL) 2783 return (EPERM); 2784 if (req->newlen < sizeof(addrs)) 2785 return (ENOMEM); 2786 error = SYSCTL_IN(req, &addrs, sizeof(addrs)); 2787 if (error) 2788 return (error); 2789 2790 switch (addrs[0].ss_family) { 2791 #ifdef INET6 2792 case AF_INET6: 2793 fin6 = (struct sockaddr_in6 *)&addrs[0]; 2794 lin6 = (struct sockaddr_in6 *)&addrs[1]; 2795 if (fin6->sin6_len != sizeof(struct sockaddr_in6) || 2796 lin6->sin6_len != sizeof(struct sockaddr_in6)) 2797 return (EINVAL); 2798 if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { 2799 if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) 2800 return (EINVAL); 2801 in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); 2802 in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); 2803 fin = (struct sockaddr_in *)&addrs[0]; 2804 lin = (struct sockaddr_in *)&addrs[1]; 2805 break; 2806 } 2807 error = sa6_embedscope(fin6, V_ip6_use_defzone); 2808 if (error) 2809 return (error); 2810 error = sa6_embedscope(lin6, V_ip6_use_defzone); 2811 if (error) 2812 return (error); 2813 break; 2814 #endif 2815 #ifdef INET 2816 case AF_INET: 2817 fin = (struct sockaddr_in *)&addrs[0]; 2818 lin = (struct sockaddr_in *)&addrs[1]; 2819 if (fin->sin_len != sizeof(struct sockaddr_in) || 2820 lin->sin_len != sizeof(struct sockaddr_in)) 2821 return (EINVAL); 2822 break; 2823 #endif 2824 default: 2825 return (EINVAL); 2826 } 2827 INP_INFO_RLOCK(&V_tcbinfo); 2828 switch (addrs[0].ss_family) { 2829 #ifdef INET6 2830 case AF_INET6: 2831 inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, 2832 fin6->sin6_port, &lin6->sin6_addr, 
lin6->sin6_port, 2833 INPLOOKUP_WLOCKPCB, NULL); 2834 break; 2835 #endif 2836 #ifdef INET 2837 case AF_INET: 2838 inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, 2839 lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); 2840 break; 2841 #endif 2842 } 2843 if (inp != NULL) { 2844 if (inp->inp_flags & INP_TIMEWAIT) { 2845 /* 2846 * XXXRW: There currently exists a state where an 2847 * inpcb is present, but its timewait state has been 2848 * discarded. For now, don't allow dropping of this 2849 * type of inpcb. 2850 */ 2851 tw = intotw(inp); 2852 if (tw != NULL) 2853 tcp_twclose(tw, 0); 2854 else 2855 INP_WUNLOCK(inp); 2856 } else if (!(inp->inp_flags & INP_DROPPED) && 2857 !(inp->inp_socket->so_options & SO_ACCEPTCONN)) { 2858 tp = intotcpcb(inp); 2859 tp = tcp_drop(tp, ECONNABORTED); 2860 if (tp != NULL) 2861 INP_WUNLOCK(inp); 2862 } else 2863 INP_WUNLOCK(inp); 2864 } else 2865 error = ESRCH; 2866 INP_INFO_RUNLOCK(&V_tcbinfo); 2867 return (error); 2868 } 2869 2870 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, 2871 CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP, NULL, 2872 0, sysctl_drop, "", "Drop TCP connection"); 2873 2874 /* 2875 * Generate a standardized TCP log line for use throughout the 2876 * tcp subsystem. Memory allocation is done with M_NOWAIT to 2877 * allow use in the interrupt context. 2878 * 2879 * NB: The caller MUST free(s, M_TCPLOG) the returned string. 2880 * NB: The function may return NULL if memory allocation failed. 2881 * 2882 * Due to header inclusion and ordering limitations the struct ip 2883 * and ip6_hdr pointers have to be passed as void pointers. 2884 */ 2885 char * 2886 tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, 2887 const void *ip6hdr) 2888 { 2889 2890 /* Is logging enabled? 
*/ 2891 if (tcp_log_in_vain == 0) 2892 return (NULL); 2893 2894 return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); 2895 } 2896 2897 char * 2898 tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, 2899 const void *ip6hdr) 2900 { 2901 2902 /* Is logging enabled? */ 2903 if (tcp_log_debug == 0) 2904 return (NULL); 2905 2906 return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); 2907 } 2908 2909 static char * 2910 tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, 2911 const void *ip6hdr) 2912 { 2913 char *s, *sp; 2914 size_t size; 2915 struct ip *ip; 2916 #ifdef INET6 2917 const struct ip6_hdr *ip6; 2918 2919 ip6 = (const struct ip6_hdr *)ip6hdr; 2920 #endif /* INET6 */ 2921 ip = (struct ip *)ip4hdr; 2922 2923 /* 2924 * The log line looks like this: 2925 * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2<SYN>" 2926 */ 2927 size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + 2928 sizeof(PRINT_TH_FLAGS) + 1 + 2929 #ifdef INET6 2930 2 * INET6_ADDRSTRLEN; 2931 #else 2932 2 * INET_ADDRSTRLEN; 2933 #endif /* INET6 */ 2934 2935 s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); 2936 if (s == NULL) 2937 return (NULL); 2938 2939 strcat(s, "TCP: ["); 2940 sp = s + strlen(s); 2941 2942 if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) { 2943 inet_ntoa_r(inc->inc_faddr, sp); 2944 sp = s + strlen(s); 2945 sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); 2946 sp = s + strlen(s); 2947 inet_ntoa_r(inc->inc_laddr, sp); 2948 sp = s + strlen(s); 2949 sprintf(sp, "]:%i", ntohs(inc->inc_lport)); 2950 #ifdef INET6 2951 } else if (inc) { 2952 ip6_sprintf(sp, &inc->inc6_faddr); 2953 sp = s + strlen(s); 2954 sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); 2955 sp = s + strlen(s); 2956 ip6_sprintf(sp, &inc->inc6_laddr); 2957 sp = s + strlen(s); 2958 sprintf(sp, "]:%i", ntohs(inc->inc_lport)); 2959 } else if (ip6 && th) { 2960 ip6_sprintf(sp, &ip6->ip6_src); 2961 sp = s + strlen(s); 2962 sprintf(sp, "]:%i to [", ntohs(th->th_sport)); 2963 sp = s + strlen(s); 
2964 ip6_sprintf(sp, &ip6->ip6_dst); 2965 sp = s + strlen(s); 2966 sprintf(sp, "]:%i", ntohs(th->th_dport)); 2967 #endif /* INET6 */ 2968 #ifdef INET 2969 } else if (ip && th) { 2970 inet_ntoa_r(ip->ip_src, sp); 2971 sp = s + strlen(s); 2972 sprintf(sp, "]:%i to [", ntohs(th->th_sport)); 2973 sp = s + strlen(s); 2974 inet_ntoa_r(ip->ip_dst, sp); 2975 sp = s + strlen(s); 2976 sprintf(sp, "]:%i", ntohs(th->th_dport)); 2977 #endif /* INET */ 2978 } else { 2979 free(s, M_TCPLOG); 2980 return (NULL); 2981 } 2982 sp = s + strlen(s); 2983 if (th) 2984 sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS); 2985 if (*(s + size - 1) != '\0') 2986 panic("%s: string too long", __func__); 2987 return (s); 2988 } 2989 2990 /* 2991 * A subroutine which makes it easy to track TCP state changes with DTrace. 2992 * This function shouldn't be called for t_state initializations that don't 2993 * correspond to actual TCP state transitions. 2994 */ 2995 void 2996 tcp_state_change(struct tcpcb *tp, int newstate) 2997 { 2998 #if defined(KDTRACE_HOOKS) 2999 int pstate = tp->t_state; 3000 #endif 3001 3002 TCPSTATES_DEC(tp->t_state); 3003 TCPSTATES_INC(newstate); 3004 tp->t_state = newstate; 3005 TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate); 3006 } 3007