/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_kern_tls.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/arb.h>
#include <sys/callout.h>
#include <sys/eventhandler.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/khelp.h>
#endif
#ifdef KERN_TLS
#include <sys/ktls.h>
#endif
#include <sys/qmath.h>
#include <sys/stats.h>
#include <sys/sysctl.h>
#include <sys/jail.h>
#include <sys/malloc.h>
#include <sys/refcount.h>
#include <sys/mbuf.h>
#include <sys/priv.h>
#include <sys/sdt.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/random.h>

#include <vm/uma.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_private.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_fib.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip_var.h>
#include <netinet/icmp_var.h>
#ifdef INET6
#include <netinet/icmp6.h>
#include <netinet/ip6.h>
#include <netinet6/in6_fib.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/nd6.h>
#endif

#include <netinet/tcp.h>
#ifdef INVARIANTS
#define	TCPSTATES
#endif
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_ecn.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_lro.h>
#include <netinet/cc/cc.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_accounting.h>
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#include <netipsec/ipsec_support.h>

#include <machine/in_cksum.h>
#include <crypto/siphash/siphash.h>

#include <security/mac/mac_framework.h>

#ifdef INET6
static ip6proto_ctlinput_t tcp6_ctlinput;
static udp_tun_icmp_t tcp6_ctlinput_viaudp;
#endif

VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS;
#ifdef INET6
VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS;
#endif

VNET_DEFINE(uint32_t, tcp_ack_war_time_window) = 1000;
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_timewindow,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ack_war_time_window), 0,
    "Time interval in ms used to limit the number (ack_war_cnt) of challenge ACKs sent per TCP connection");
VNET_DEFINE(uint32_t, tcp_ack_war_cnt) = 5;
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_cnt, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_ack_war_cnt), 0,
    "Maximum number of challenge ACKs sent per TCP connection during the time interval (ack_war_timewindow)");

struct rwlock tcp_function_lock;

static int
sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS)
{
	int error, new;

	new = V_tcp_mssdflt;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr) {
		if (new < TCP_MINMSS)
			error = EINVAL;
		else
			V_tcp_mssdflt = new;
	}
	return (error);
}

SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(tcp_mssdflt), 0, &sysctl_net_inet_tcp_mss_check, "I",
    "Default TCP Maximum Segment Size");

#ifdef INET6
static int
sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS)
{
	int error, new;

	new = V_tcp_v6mssdflt;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr) {
		if (new < TCP_MINMSS)
			error = EINVAL;
		else
			V_tcp_v6mssdflt = new;
	}
	return (error);
}

SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I",
    "Default TCP Maximum Segment Size for IPv6");
#endif /* INET6 */

/*
 * Minimum MSS we accept and use. This prevents DoS attacks where
 * we are forced to a ridiculously low MSS like 20 and send hundreds
 * of packets instead of one. The effect scales with the available
 * bandwidth and quickly saturates the CPU and network interface
 * with packet generation and sending. Set to zero to disable MINMSS
 * checking. This setting prevents us from sending too small packets.
 */
VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_minmss), 0,
    "Minimum TCP Maximum Segment Size");

VNET_DEFINE(int, tcp_do_rfc1323) = 1;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_rfc1323), 0,
    "Enable rfc1323 (high performance TCP) extensions");

/*
 * As of June 2021, several TCP stacks violate RFC 7323 from September 2014.
 * Some stacks negotiate TS, but never send them after connection setup. Some
 * stacks negotiate TS, but don't send them when sending keep-alive segments.
 * These include modern widely deployed TCP stacks.
 * Therefore tolerating violations for now...
 */
VNET_DEFINE(int, tcp_tolerate_missing_ts) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tolerate_missing_ts, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_tolerate_missing_ts), 0,
    "Tolerate missing TCP timestamps");

VNET_DEFINE(int, tcp_ts_offset_per_conn) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, ts_offset_per_conn, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_ts_offset_per_conn), 0,
    "Initialize TCP timestamps per connection instead of per host pair");

/* How many connections are pacing */
static volatile uint32_t number_of_tcp_connections_pacing = 0;
static uint32_t shadow_num_connections = 0;
static counter_u64_t tcp_pacing_failures;
static counter_u64_t tcp_dgp_failures;
static uint32_t shadow_tcp_pacing_dgp = 0;
static volatile uint32_t number_of_dgp_connections = 0;

static int tcp_pacing_limit = 10000;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pacing_limit, CTLFLAG_RW,
    &tcp_pacing_limit, 1000,
    "If the TCP stack does pacing, is there a limit (-1 = no, 0 = no pacing, N = number of connections)");

static int tcp_dgp_limit = -1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, dgp_limit, CTLFLAG_RW,
    &tcp_dgp_limit, -1,
    "If the TCP stack does DGP, is there a limit (-1 = no, 0 = no dgp, N = number of connections)");

SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pacing_count, CTLFLAG_RD,
    &shadow_num_connections, 0, "Number of TCP connections being paced");

SYSCTL_COUNTER_U64(_net_inet_tcp, OID_AUTO, pacing_failures, CTLFLAG_RD,
    &tcp_pacing_failures, "Number of times we failed to enable pacing to avoid exceeding the limit");

SYSCTL_COUNTER_U64(_net_inet_tcp, OID_AUTO, dgp_failures, CTLFLAG_RD,
    &tcp_dgp_failures, "Number of times we failed to enable dgp to avoid exceeding the limit");

static int tcp_log_debug = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
    &tcp_log_debug, 0, "Log errors caused by incoming TCP segments");

/*
 * Target size of TCP PCB hash tables. Must be a power of two.
 *
 * Note that this can be overridden by the kernel environment
 * variable net.inet.tcp.tcbhashsize
 */
#ifndef TCBHASHSIZE
#define	TCBHASHSIZE	0
#endif
static int tcp_tcbhashsize = TCBHASHSIZE;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN,
    &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");

static int do_tcpdrain = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
    "Enable tcp_drain routine for extra help when low on mbufs");

SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD,
    &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs");

VNET_DEFINE_STATIC(int, icmp_may_rst) = 1;
#define	V_icmp_may_rst		VNET(icmp_may_rst)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(icmp_may_rst), 0,
    "Certain ICMP unreachable messages may abort connections in SYN_SENT");

VNET_DEFINE_STATIC(int, tcp_isn_reseed_interval) = 0;
#define	V_tcp_isn_reseed_interval	VNET(tcp_isn_reseed_interval)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_isn_reseed_interval), 0,
    "Seconds between reseeding of ISN secret");

static int tcp_soreceive_stream;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN,
    &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets");

VNET_DEFINE(uma_zone_t, sack_hole_zone);
#define	V_sack_hole_zone	VNET(sack_hole_zone)
VNET_DEFINE(uint32_t, tcp_map_entries_limit) = 0;	/* unlimited */
static int
sysctl_net_inet_tcp_map_limit_check(SYSCTL_HANDLER_ARGS)
{
	int error;
	uint32_t new;

	new = V_tcp_map_entries_limit;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr) {
		/* only allow "0" and value > minimum */
		if (new > 0 && new < TCP_MIN_MAP_ENTRIES_LIMIT)
			error = EINVAL;
		else
			V_tcp_map_entries_limit = new;
	}
	return (error);
}
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, map_limit,
    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(tcp_map_entries_limit), 0,
    &sysctl_net_inet_tcp_map_limit_check, "IU",
    "Total sendmap entries limit");

VNET_DEFINE(uint32_t, tcp_map_split_limit) = 0;	/* unlimited */
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, split_limit, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_map_split_limit), 0,
    "Total sendmap split entries limit");

#ifdef TCP_HHOOK
VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]);
#endif

#define	TS_OFFSET_SECRET_LENGTH	SIPHASH_KEY_LENGTH
VNET_DEFINE_STATIC(u_char, ts_offset_secret[TS_OFFSET_SECRET_LENGTH]);
#define	V_ts_offset_secret	VNET(ts_offset_secret)

static int	tcp_default_fb_init(struct tcpcb *tp, void **ptr);
static void	tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged);
static int	tcp_default_handoff_ok(struct tcpcb *tp);
static struct inpcb *tcp_notify(struct inpcb *, int);
static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int);
static struct inpcb *tcp_mtudisc(struct inpcb *, int);
static struct inpcb *tcp_drop_syn_sent(struct inpcb *, int);
static char *	tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
    const void *ip4hdr, const void *ip6hdr);
static void	tcp_default_switch_failed(struct tcpcb *tp);
static ipproto_ctlinput_t tcp_ctlinput;
static udp_tun_icmp_t tcp_ctlinput_viaudp;

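/*
 * The function block for the base "freebsd" stack below.  Alternate TCP
 * stacks (typically loadable modules) describe themselves with a structure
 * like this and attach it with register_tcp_functions().  A minimal, purely
 * illustrative sketch (the "example_*" callbacks are hypothetical, not a
 * real stack):
 *
 *	static struct tcp_function_block example_funcblk = {
 *		.tfb_tcp_block_name = "example",
 *		.tfb_tcp_output = example_output,
 *		.tfb_tcp_do_segment = example_do_segment,
 *		.tfb_tcp_ctloutput = example_ctloutput,
 *		.tfb_tcp_handoff_ok = example_handoff_ok,
 *	};
 *	error = register_tcp_functions(&example_funcblk, M_WAITOK);
 *
 * The members checked as mandatory in register_tcp_functions_as_names()
 * below must be non-NULL and the block name must be non-empty.
 */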
static struct tcp_function_block tcp_def_funcblk = {
	.tfb_tcp_block_name = "freebsd",
	.tfb_tcp_output = tcp_default_output,
	.tfb_tcp_do_segment = tcp_do_segment,
	.tfb_tcp_ctloutput = tcp_default_ctloutput,
	.tfb_tcp_handoff_ok = tcp_default_handoff_ok,
	.tfb_tcp_fb_init = tcp_default_fb_init,
	.tfb_tcp_fb_fini = tcp_default_fb_fini,
	.tfb_switch_failed = tcp_default_switch_failed,
	.tfb_flags = TCP_FUNC_DEFAULT_OK,
};

static int tcp_fb_cnt = 0;
struct tcp_funchead t_functions;
VNET_DEFINE_STATIC(struct tcp_function_block *, tcp_func_set_ptr) = &tcp_def_funcblk;
#define	V_tcp_func_set_ptr VNET(tcp_func_set_ptr)

void
tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp)
{
	TCPSTAT_INC(tcps_dsack_count);
	tp->t_dsack_pack++;
	if (tlp == 0) {
		if (SEQ_GT(end, start)) {
			tp->t_dsack_bytes += (end - start);
			TCPSTAT_ADD(tcps_dsack_bytes, (end - start));
		} else {
			tp->t_dsack_tlp_bytes += (start - end);
			TCPSTAT_ADD(tcps_dsack_bytes, (start - end));
		}
	} else {
		if (SEQ_GT(end, start)) {
			tp->t_dsack_bytes += (end - start);
			TCPSTAT_ADD(tcps_dsack_tlp_bytes, (end - start));
		} else {
			tp->t_dsack_tlp_bytes += (start - end);
			TCPSTAT_ADD(tcps_dsack_tlp_bytes, (start - end));
		}
	}
}

static struct tcp_function_block *
find_tcp_functions_locked(struct tcp_function_set *fs)
{
	struct tcp_function *f;
	struct tcp_function_block *blk = NULL;

	rw_assert(&tcp_function_lock, RA_LOCKED);
	TAILQ_FOREACH(f, &t_functions, tf_next) {
		if (strcmp(f->tf_name, fs->function_set_name) == 0) {
			blk = f->tf_fb;
			break;
		}
	}
	return (blk);
}

static struct tcp_function_block *
find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s)
{
	struct tcp_function_block *rblk = NULL;
	struct tcp_function *f;

	rw_assert(&tcp_function_lock, RA_LOCKED);
	TAILQ_FOREACH(f, &t_functions, tf_next) {
		if (f->tf_fb == blk) {
			rblk = blk;
			if (s) {
				*s = f;
			}
			break;
		}
	}
	return (rblk);
}

struct tcp_function_block *
find_and_ref_tcp_functions(struct tcp_function_set *fs)
{
	struct tcp_function_block *blk;

	rw_rlock(&tcp_function_lock);
	blk = find_tcp_functions_locked(fs);
	if (blk)
		refcount_acquire(&blk->tfb_refcnt);
	rw_runlock(&tcp_function_lock);
	return (blk);
}

struct tcp_function_block *
find_and_ref_tcp_fb(struct tcp_function_block *blk)
{
	struct tcp_function_block *rblk;

	rw_rlock(&tcp_function_lock);
	rblk = find_tcp_fb_locked(blk, NULL);
	if (rblk)
		refcount_acquire(&rblk->tfb_refcnt);
	rw_runlock(&tcp_function_lock);
	return (rblk);
}

/* Find a matching alias for the given tcp_function_block. */
int
find_tcp_function_alias(struct tcp_function_block *blk,
    struct tcp_function_set *fs)
{
	struct tcp_function *f;
	int found;

	found = 0;
	rw_rlock(&tcp_function_lock);
	TAILQ_FOREACH(f, &t_functions, tf_next) {
		if ((f->tf_fb == blk) &&
		    (strncmp(f->tf_name, blk->tfb_tcp_block_name,
		     TCP_FUNCTION_NAME_LEN_MAX) != 0)) {
			/* Matching function block with different name. */
			strncpy(fs->function_set_name, f->tf_name,
			    TCP_FUNCTION_NAME_LEN_MAX);
			found = 1;
			break;
		}
	}
	/* Null terminate the string appropriately. */
	if (found) {
		fs->function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
	} else {
		fs->function_set_name[0] = '\0';
	}
	rw_runlock(&tcp_function_lock);
	return (found);
}

static struct tcp_function_block *
find_and_ref_tcp_default_fb(void)
{
	struct tcp_function_block *rblk;

	rw_rlock(&tcp_function_lock);
	rblk = V_tcp_func_set_ptr;
	refcount_acquire(&rblk->tfb_refcnt);
	rw_runlock(&tcp_function_lock);
	return (rblk);
}

void
tcp_switch_back_to_default(struct tcpcb *tp)
{
	struct tcp_function_block *tfb;
	void *ptr = NULL;

	KASSERT(tp->t_fb != &tcp_def_funcblk,
	    ("%s: called by the built-in default stack", __func__));

	if (tp->t_fb->tfb_tcp_timer_stop_all != NULL)
		tp->t_fb->tfb_tcp_timer_stop_all(tp);

	/*
	 * Now, we'll find a new function block to use.
	 * Start by trying the current user-selected
	 * default, unless this stack is the user-selected
	 * default.
	 */
	tfb = find_and_ref_tcp_default_fb();
	if (tfb == tp->t_fb) {
		refcount_release(&tfb->tfb_refcnt);
		tfb = NULL;
	}
	/* Does the stack accept this connection? */
	if (tfb != NULL && (*tfb->tfb_tcp_handoff_ok)(tp)) {
		refcount_release(&tfb->tfb_refcnt);
		tfb = NULL;
	}
	/* Try to use that stack. */
	if (tfb != NULL) {
		/* Initialize the new stack. If it succeeds, we are done. */
		if (tfb->tfb_tcp_fb_init == NULL ||
		    (*tfb->tfb_tcp_fb_init)(tp, &ptr) == 0) {
			/* Release the old stack */
			if (tp->t_fb->tfb_tcp_fb_fini != NULL)
				(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
			refcount_release(&tp->t_fb->tfb_refcnt);
			/* Now set in all the pointers */
			tp->t_fb = tfb;
			tp->t_fb_ptr = ptr;
			return;
		}
		/*
		 * Initialization failed. Release the reference count on
		 * the looked up default stack.
		 */
		refcount_release(&tfb->tfb_refcnt);
	}

	/*
	 * If that wasn't feasible, use the built-in default
	 * stack which is not allowed to reject anyone.
	 */
	tfb = find_and_ref_tcp_fb(&tcp_def_funcblk);
	if (tfb == NULL) {
		/* there always should be a default */
		panic("Can't refer to tcp_def_funcblk");
	}
	if ((*tfb->tfb_tcp_handoff_ok)(tp)) {
		/* The default stack cannot say no */
		panic("Default stack rejects a new session?");
	}
	if (tfb->tfb_tcp_fb_init != NULL &&
	    (*tfb->tfb_tcp_fb_init)(tp, &ptr)) {
		/* The default stack cannot fail */
		panic("Default stack initialization failed");
	}
	/* Now release the old stack */
	if (tp->t_fb->tfb_tcp_fb_fini != NULL)
		(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
	refcount_release(&tp->t_fb->tfb_refcnt);
	/* And set in the pointers to the new */
	tp->t_fb = tfb;
	tp->t_fb_ptr = ptr;
}

static bool
tcp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp,
    const struct sockaddr *sa, void *ctx)
{
	struct ip *iph;
#ifdef INET6
	struct ip6_hdr *ip6;
#endif
	struct udphdr *uh;
	struct tcphdr *th;
	int thlen;
	uint16_t port;

	TCPSTAT_INC(tcps_tunneled_pkts);
	if ((m->m_flags & M_PKTHDR) == 0) {
		/* Can't handle one that is not a pkt hdr */
		TCPSTAT_INC(tcps_tunneled_errs);
		goto out;
	}
	thlen = sizeof(struct tcphdr);
	if (m->m_len < off + sizeof(struct udphdr) + thlen &&
	    (m = m_pullup(m, off + sizeof(struct udphdr) + thlen)) == NULL) {
		TCPSTAT_INC(tcps_tunneled_errs);
		goto out;
	}
	iph = mtod(m, struct ip *);
	uh = (struct udphdr *)((caddr_t)iph + off);
	th = (struct tcphdr *)(uh + 1);
	thlen = th->th_off << 2;
	if (m->m_len < off + sizeof(struct udphdr) + thlen) {
		m = m_pullup(m, off + sizeof(struct udphdr) + thlen);
		if (m == NULL) {
			TCPSTAT_INC(tcps_tunneled_errs);
			goto out;
		} else {
			iph = mtod(m, struct ip *);
			uh = (struct udphdr *)((caddr_t)iph + off);
			th = (struct tcphdr *)(uh + 1);
		}
	}
	m->m_pkthdr.tcp_tun_port = port = uh->uh_sport;
	bcopy(th, uh, m->m_len - off);
	m->m_len -= sizeof(struct udphdr);
	m->m_pkthdr.len -= sizeof(struct udphdr);
	/*
	 * We use the same algorithm for
	 * both UDP and TCP for c-sum. So
	 * the code in tcp_input will skip
	 * the checksum. So we do nothing
	 * with the flag (m->m_pkthdr.csum_flags).
	 */
	switch (iph->ip_v) {
#ifdef INET
	case IPVERSION:
		iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr));
		tcp_input_with_port(&m, &off, IPPROTO_TCP, port);
		break;
#endif
#ifdef INET6
	case IPV6_VERSION >> 4:
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct udphdr));
		tcp6_input_with_port(&m, &off, IPPROTO_TCP, port);
		break;
#endif
	default:
		goto out;
		break;
	}
	return (true);
out:
	m_freem(m);

	return (true);
}

static int
sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS)
{
	int error = ENOENT;
	struct tcp_function_set fs;
	struct tcp_function_block *blk;

	memset(&fs, 0, sizeof(fs));
	rw_rlock(&tcp_function_lock);
	blk = find_tcp_fb_locked(V_tcp_func_set_ptr, NULL);
	if (blk) {
		/* Found him */
		strcpy(fs.function_set_name, blk->tfb_tcp_block_name);
		fs.pcbcnt = blk->tfb_refcnt;
	}
	rw_runlock(&tcp_function_lock);
	error = sysctl_handle_string(oidp, fs.function_set_name,
	    sizeof(fs.function_set_name), req);

	/* Check for error or no change */
	if (error != 0 || req->newptr == NULL)
		return (error);

	rw_wlock(&tcp_function_lock);
	blk = find_tcp_functions_locked(&fs);
	if ((blk == NULL) ||
	    (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) {
		error = ENOENT;
		goto done;
	}
	if ((blk->tfb_flags & TCP_FUNC_DEFAULT_OK) == 0) {
		error = EINVAL;
		goto done;
	}
	V_tcp_func_set_ptr = blk;
done:
	rw_wunlock(&tcp_function_lock);
	return (error);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default,
    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    NULL, 0, sysctl_net_inet_default_tcp_functions, "A",
    "Set/get the default TCP functions");

static int
sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS)
{
	int error, cnt, linesz;
	struct tcp_function *f;
	char *buffer, *cp;
	size_t bufsz, outsz;
	bool alias;

	cnt = 0;
	rw_rlock(&tcp_function_lock);
	TAILQ_FOREACH(f, &t_functions, tf_next) {
		cnt++;
	}
	rw_runlock(&tcp_function_lock);

	bufsz = (cnt+2) * ((TCP_FUNCTION_NAME_LEN_MAX * 2) + 13) + 1;
	buffer = malloc(bufsz, M_TEMP, M_WAITOK);

	error = 0;
	cp = buffer;

	linesz = snprintf(cp, bufsz, "\n%-32s%c %-32s %s\n", "Stack", 'D',
	    "Alias", "PCB count");
	cp += linesz;
	bufsz -= linesz;
	outsz = linesz;

	rw_rlock(&tcp_function_lock);
	TAILQ_FOREACH(f, &t_functions, tf_next) {
		alias = (f->tf_name != f->tf_fb->tfb_tcp_block_name);
		linesz = snprintf(cp, bufsz, "%-32s%c %-32s %u\n",
		    f->tf_fb->tfb_tcp_block_name,
		    (f->tf_fb == V_tcp_func_set_ptr) ? '*' : ' ',
		    alias ? f->tf_name : "-",
		    f->tf_fb->tfb_refcnt);
		if (linesz >= bufsz) {
			error = EOVERFLOW;
			break;
		}
		cp += linesz;
		bufsz -= linesz;
		outsz += linesz;
	}
	rw_runlock(&tcp_function_lock);
	if (error == 0)
		error = sysctl_handle_string(oidp, buffer, outsz + 1, req);
	free(buffer, M_TEMP);
	return (error);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available,
    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
    NULL, 0, sysctl_net_inet_list_available, "A",
    "list available TCP Function sets");

VNET_DEFINE(int, tcp_udp_tunneling_port) = TCP_TUNNELING_PORT_DEFAULT;

#ifdef INET
VNET_DEFINE(struct socket *, udp4_tun_socket) = NULL;
#define	V_udp4_tun_socket	VNET(udp4_tun_socket)
#endif
#ifdef INET6
VNET_DEFINE(struct socket *, udp6_tun_socket) = NULL;
#define	V_udp6_tun_socket	VNET(udp6_tun_socket)
#endif

static struct sx tcpoudp_lock;

static void
tcp_over_udp_stop(void)
{

	sx_assert(&tcpoudp_lock, SA_XLOCKED);

#ifdef INET
	if (V_udp4_tun_socket != NULL) {
		soclose(V_udp4_tun_socket);
		V_udp4_tun_socket = NULL;
	}
#endif
#ifdef INET6
	if (V_udp6_tun_socket != NULL) {
		soclose(V_udp6_tun_socket);
		V_udp6_tun_socket = NULL;
	}
#endif
}

static int
tcp_over_udp_start(void)
{
	uint16_t port;
	int ret;
#ifdef INET
	struct sockaddr_in sin;
#endif
#ifdef INET6
	struct sockaddr_in6 sin6;
#endif

	sx_assert(&tcpoudp_lock, SA_XLOCKED);

	port = V_tcp_udp_tunneling_port;
	if (ntohs(port) == 0) {
		/* Must have a port set */
		return (EINVAL);
	}
#ifdef INET
	if (V_udp4_tun_socket != NULL) {
		/* Already running -- must stop first */
		return (EALREADY);
	}
#endif
#ifdef INET6
	if (V_udp6_tun_socket != NULL) {
		/* Already running -- must stop first */
		return (EALREADY);
	}
#endif
#ifdef INET
	if ((ret = socreate(PF_INET, &V_udp4_tun_socket,
	    SOCK_DGRAM, IPPROTO_UDP,
	    curthread->td_ucred, curthread))) {
		tcp_over_udp_stop();
		return (ret);
	}
	/* Call the special UDP hook. */
	if ((ret = udp_set_kernel_tunneling(V_udp4_tun_socket,
	    tcp_recv_udp_tunneled_packet,
	    tcp_ctlinput_viaudp,
	    NULL))) {
		tcp_over_udp_stop();
		return (ret);
	}
	/* Ok, we have a socket, bind it to the port. */
	memset(&sin, 0, sizeof(struct sockaddr_in));
	sin.sin_len = sizeof(struct sockaddr_in);
	sin.sin_family = AF_INET;
	sin.sin_port = htons(port);
	if ((ret = sobind(V_udp4_tun_socket,
	    (struct sockaddr *)&sin, curthread))) {
		tcp_over_udp_stop();
		return (ret);
	}
#endif
#ifdef INET6
	if ((ret = socreate(PF_INET6, &V_udp6_tun_socket,
	    SOCK_DGRAM, IPPROTO_UDP,
	    curthread->td_ucred, curthread))) {
		tcp_over_udp_stop();
		return (ret);
	}
	/* Call the special UDP hook. */
	if ((ret = udp_set_kernel_tunneling(V_udp6_tun_socket,
	    tcp_recv_udp_tunneled_packet,
	    tcp6_ctlinput_viaudp,
	    NULL))) {
		tcp_over_udp_stop();
		return (ret);
	}
	/* Ok, we have a socket, bind it to the port. */
	memset(&sin6, 0, sizeof(struct sockaddr_in6));
	sin6.sin6_len = sizeof(struct sockaddr_in6);
	sin6.sin6_family = AF_INET6;
	sin6.sin6_port = htons(port);
	if ((ret = sobind(V_udp6_tun_socket,
	    (struct sockaddr *)&sin6, curthread))) {
		tcp_over_udp_stop();
		return (ret);
	}
#endif
	return (0);
}

static int
sysctl_net_inet_tcp_udp_tunneling_port_check(SYSCTL_HANDLER_ARGS)
{
	int error;
	uint32_t old, new;

	old = V_tcp_udp_tunneling_port;
	new = old;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if ((error == 0) &&
	    (req->newptr != NULL)) {
		if ((new < TCP_TUNNELING_PORT_MIN) ||
		    (new > TCP_TUNNELING_PORT_MAX)) {
			error = EINVAL;
		} else {
			sx_xlock(&tcpoudp_lock);
			V_tcp_udp_tunneling_port = new;
			if (old != 0) {
				tcp_over_udp_stop();
			}
			if (new != 0) {
				error = tcp_over_udp_start();
				if (error != 0) {
					V_tcp_udp_tunneling_port = 0;
				}
			}
			sx_xunlock(&tcpoudp_lock);
		}
	}
	return (error);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_port,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    &VNET_NAME(tcp_udp_tunneling_port),
    0, &sysctl_net_inet_tcp_udp_tunneling_port_check, "IU",
    "Tunneling port for tcp over udp");

VNET_DEFINE(int, tcp_udp_tunneling_overhead) = TCP_TUNNELING_OVERHEAD_DEFAULT;

static int
sysctl_net_inet_tcp_udp_tunneling_overhead_check(SYSCTL_HANDLER_ARGS)
{
	int error, new;

	new = V_tcp_udp_tunneling_overhead;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr) {
		if ((new < TCP_TUNNELING_OVERHEAD_MIN) ||
		    (new > TCP_TUNNELING_OVERHEAD_MAX))
			error = EINVAL;
		else
			V_tcp_udp_tunneling_overhead = new;
	}
	return (error);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_overhead,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    &VNET_NAME(tcp_udp_tunneling_overhead),
    0, &sysctl_net_inet_tcp_udp_tunneling_overhead_check, "IU",
    "MSS reduction when using tcp over udp");

/*
 * Exports one (struct tcp_function_info) for each alias/name.
 */
static int
sysctl_net_inet_list_func_info(SYSCTL_HANDLER_ARGS)
{
	int cnt, error;
	struct tcp_function *f;
	struct tcp_function_info tfi;

	/*
	 * We don't allow writes.
	 */
	if (req->newptr != NULL)
		return (EINVAL);

	/*
	 * Wire the old buffer so we can directly copy the functions to
	 * user space without dropping the lock.
	 */
	if (req->oldptr != NULL) {
		error = sysctl_wire_old_buffer(req, 0);
		if (error)
			return (error);
	}

	/*
	 * Walk the list and copy out matching entries. If INVARIANTS
	 * is compiled in, also walk the list to verify the length of
	 * the list matches what we have recorded.
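	 *
	 * (Userland normally follows the standard two-step sysctl(3)
	 * pattern: a first call with a NULL buffer obtains the size
	 * estimate produced below, and a second call copies the records
	 * out.)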
	 */
	rw_rlock(&tcp_function_lock);

	cnt = 0;
#ifndef INVARIANTS
	if (req->oldptr == NULL) {
		cnt = tcp_fb_cnt;
		goto skip_loop;
	}
#endif
	TAILQ_FOREACH(f, &t_functions, tf_next) {
#ifdef INVARIANTS
		cnt++;
#endif
		if (req->oldptr != NULL) {
			bzero(&tfi, sizeof(tfi));
			tfi.tfi_refcnt = f->tf_fb->tfb_refcnt;
			tfi.tfi_id = f->tf_fb->tfb_id;
			(void)strlcpy(tfi.tfi_alias, f->tf_name,
			    sizeof(tfi.tfi_alias));
			(void)strlcpy(tfi.tfi_name,
			    f->tf_fb->tfb_tcp_block_name, sizeof(tfi.tfi_name));
			error = SYSCTL_OUT(req, &tfi, sizeof(tfi));
			/*
			 * Don't stop on error, as that is the
			 * mechanism we use to accumulate length
			 * information if the buffer was too short.
			 */
		}
	}
	KASSERT(cnt == tcp_fb_cnt,
	    ("%s: cnt (%d) != tcp_fb_cnt (%d)", __func__, cnt, tcp_fb_cnt));
#ifndef INVARIANTS
skip_loop:
#endif
	rw_runlock(&tcp_function_lock);
	if (req->oldptr == NULL)
		error = SYSCTL_OUT(req, NULL,
		    (cnt + 1) * sizeof(struct tcp_function_info));

	return (error);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_info,
    CTLTYPE_OPAQUE | CTLFLAG_SKIP | CTLFLAG_RD | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_net_inet_list_func_info, "S,tcp_function_info",
    "List TCP function block name-to-ID mappings");

/*
 * tfb_tcp_handoff_ok() function for the default stack.
 * Note that we'll basically try to take all comers.
 */
static int
tcp_default_handoff_ok(struct tcpcb *tp)
{

	return (0);
}

/*
 * tfb_tcp_fb_init() function for the default stack.
 *
 * This handles making sure we have appropriate timers set if you are
 * transitioning a socket that has some amount of setup done.
 *
 * The init() function from the default can *never* return non-zero, i.e.,
 * it is required to always succeed since it is the stack of last resort!
 */
static int
tcp_default_fb_init(struct tcpcb *tp, void **ptr)
{
	struct socket *so = tptosocket(tp);
	int rexmt;

	INP_WLOCK_ASSERT(tptoinpcb(tp));
	/* We don't use the pointer */
	*ptr = NULL;

	/* Make sure we get no interesting mbuf queuing behavior */
	/* All mbuf queue/ack compress flags should be off */
	tcp_lro_features_off(tp);

	/* Cancel the GP measurement in progress */
	tp->t_flags &= ~TF_GPUTINPROG;
	/* Validate the timers are not in usec, if they are convert */
	tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS);
	if ((tp->t_state == TCPS_SYN_SENT) ||
	    (tp->t_state == TCPS_SYN_RECEIVED))
		rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
	else
		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
	if (tp->t_rxtshift == 0)
		tp->t_rxtcur = rexmt;
	else
		TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin,
		    tcp_rexmit_max);

	/*
	 * Nothing to do for ESTABLISHED or LISTEN states. And, we don't
	 * know what to do for unexpected states (which includes TIME_WAIT).
	 */
	if (tp->t_state <= TCPS_LISTEN || tp->t_state >= TCPS_TIME_WAIT)
		return (0);

	/*
	 * Make sure some kind of transmission timer is set if there is
	 * outstanding data.
	 */
	if ((!TCPS_HAVEESTABLISHED(tp->t_state) || sbavail(&so->so_snd) ||
	    tp->snd_una != tp->snd_max) && !(tcp_timer_active(tp, TT_REXMT) ||
	    tcp_timer_active(tp, TT_PERSIST))) {
		/*
		 * If the session is established and it looks like it should
		 * be in the persist state, set the persist timer. Otherwise,
		 * set the retransmit timer.
		 */
		if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->snd_wnd == 0 &&
		    (int32_t)(tp->snd_nxt - tp->snd_una) <
		    (int32_t)sbavail(&so->so_snd))
			tcp_setpersist(tp);
		else
			tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp));
	}

	/* All non-embryonic sessions get a keepalive timer. */
	if (!tcp_timer_active(tp, TT_KEEP))
		tcp_timer_activate(tp, TT_KEEP,
		    TCPS_HAVEESTABLISHED(tp->t_state) ? TP_KEEPIDLE(tp) :
		    TP_KEEPINIT(tp));

	/*
	 * Make sure critical variables are initialized
	 * if transitioning while in Recovery.
	 */
	if (IN_FASTRECOVERY(tp->t_flags)) {
		if (tp->sackhint.recover_fs == 0)
			tp->sackhint.recover_fs = max(1,
			    tp->snd_nxt - tp->snd_una);
	}

	return (0);
}

/*
 * tfb_tcp_fb_fini() function for the default stack.
 *
 * This changes state as necessary (or prudent) to prepare for another stack
 * to assume responsibility for the connection.
 */
static void
tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged)
{

	INP_WLOCK_ASSERT(tptoinpcb(tp));

#ifdef TCP_BLACKBOX
	tcp_log_flowend(tp);
#endif
	tp->t_acktime = 0;
	return;
}

MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory");

static struct mtx isn_mtx;

#define	ISN_LOCK_INIT()	mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
#define	ISN_LOCK()	mtx_lock(&isn_mtx)
#define	ISN_UNLOCK()	mtx_unlock(&isn_mtx)

INPCBSTORAGE_DEFINE(tcpcbstor, tcpcb, "tcpinp", "tcp_inpcb", "tcp", "tcphash");

/*
 * Take a value and get the next power of 2 that doesn't overflow.
 * Used to size the tcp_inpcb hash buckets.
 */
static int
maketcp_hashsize(int size)
{
	int hashsize;

	/*
	 * auto tune.
	 * get the next power of 2 higher than maxsockets.
	 */
	hashsize = 1 << fls(size);
	/* catch overflow, and just go one power of 2 smaller */
	if (hashsize < size) {
		hashsize = 1 << (fls(size) - 1);
	}
	return (hashsize);
}

static volatile int next_tcp_stack_id = 1;

/*
 * Register a TCP function block with the name provided in the names
 * array. (Note that this function does NOT automatically register
 * blk->tfb_tcp_block_name as a stack name. Therefore, you should
 * explicitly include blk->tfb_tcp_block_name in the list of names if
 * you wish to register the stack with that name.)
 *
 * Either all name registrations will succeed or all will fail. If
 * a name registration fails, the function will update the num_names
 * argument to point to the array index of the name that encountered
 * the failure.
 *
 * Returns 0 on success, or an error code on failure.
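 *
 * A purely illustrative call registering one stack under two names (the
 * function block and both names below are hypothetical, not real stacks):
 *
 *	const char *names[] = { "example", "example_alias" };
 *	int num_names = nitems(names);
 *
 *	error = register_tcp_functions_as_names(&example_funcblk, M_WAITOK,
 *	    names, &num_names);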
 */
int
register_tcp_functions_as_names(struct tcp_function_block *blk, int wait,
    const char *names[], int *num_names)
{
	struct tcp_function *f[TCP_FUNCTION_NAME_NUM_MAX];
	struct tcp_function_set fs;
	int error, i, num_registered;

	KASSERT(names != NULL, ("%s: Called with NULL name list", __func__));
	KASSERT(*num_names > 0,
	    ("%s: Called with non-positive length of name list", __func__));
	KASSERT(rw_initialized(&tcp_function_lock),
	    ("%s: called too early", __func__));

	if (*num_names > TCP_FUNCTION_NAME_NUM_MAX) {
		/* Too many names. */
		*num_names = 0;
		return (E2BIG);
	}
	if ((blk->tfb_tcp_output == NULL) ||
	    (blk->tfb_tcp_do_segment == NULL) ||
	    (blk->tfb_tcp_ctloutput == NULL) ||
	    (blk->tfb_tcp_handoff_ok == NULL) ||
	    (strlen(blk->tfb_tcp_block_name) == 0)) {
		/* These functions are required and a name is needed. */
		*num_names = 0;
		return (EINVAL);
	}

	for (i = 0; i < *num_names; i++) {
		f[i] = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait);
		if (f[i] == NULL) {
			while (--i >= 0)
				free(f[i], M_TCPFUNCTIONS);
			*num_names = 0;
			return (ENOMEM);
		}
	}

	num_registered = 0;
	rw_wlock(&tcp_function_lock);
	if (find_tcp_fb_locked(blk, NULL) != NULL) {
		/* A TCP function block can only be registered once. */
		error = EALREADY;
		goto cleanup;
	}
	if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
		error = EINVAL;
		goto cleanup;
	}
	refcount_init(&blk->tfb_refcnt, 0);
	blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1);
	for (i = 0; i < *num_names; i++) {
		(void)strlcpy(fs.function_set_name, names[i],
		    sizeof(fs.function_set_name));
		if (find_tcp_functions_locked(&fs) != NULL) {
			/* Duplicate name space not allowed */
			error = EALREADY;
			goto cleanup;
		}
		f[i]->tf_fb = blk;
		(void)strlcpy(f[i]->tf_name, names[i], sizeof(f[i]->tf_name));
		TAILQ_INSERT_TAIL(&t_functions, f[i], tf_next);
		tcp_fb_cnt++;
		num_registered++;
	}
	rw_wunlock(&tcp_function_lock);
	return (0);

cleanup:
	/* Remove the entries just added. */
	for (i = 0; i < *num_names; i++) {
		if (i < num_registered) {
			TAILQ_REMOVE(&t_functions, f[i], tf_next);
			tcp_fb_cnt--;
		}
		f[i]->tf_fb = NULL;
		free(f[i], M_TCPFUNCTIONS);
	}
	rw_wunlock(&tcp_function_lock);
	*num_names = num_registered;
	return (error);
}

/*
 * Register a TCP function block using the name provided in the name
 * argument.
 *
 * Returns 0 on success, or an error code on failure.
 */
int
register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name,
    int wait)
{
	const char *name_list[1];
	int num_names, rv;

	num_names = 1;
	if (name != NULL)
		name_list[0] = name;
	else
		name_list[0] = blk->tfb_tcp_block_name;
	rv = register_tcp_functions_as_names(blk, wait, name_list, &num_names);
	return (rv);
}

/*
 * Register a TCP function block using the name defined in
 * blk->tfb_tcp_block_name.
 *
 * Returns 0 on success, or an error code on failure.
 */
int
register_tcp_functions(struct tcp_function_block *blk, int wait)
{

	return (register_tcp_functions_as_name(blk, NULL, wait));
}

/*
 * Deregister all names associated with a function block. This
 * functionally removes the function block from use within the system.
 *
 * When called with a true quiesce argument, mark the function block
 * as being removed so no more stacks will use it and determine
 * whether the removal would succeed.
 *
 * When called with a false quiesce argument, actually attempt the
 * removal.
 *
 * When called with a force argument, attempt to switch all TCBs to
 * use the default stack instead of returning EBUSY.
 *
 * Returns 0 on success (or if the removal would succeed), or an error
 * code on failure.
 */
int
deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
    bool force)
{
	struct tcp_function *f;
	VNET_ITERATOR_DECL(vnet_iter);

	if (blk == &tcp_def_funcblk) {
		/* You can't un-register the default */
		return (EPERM);
	}
	rw_wlock(&tcp_function_lock);
	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		if (blk == V_tcp_func_set_ptr) {
			/* You can't free the current default in some vnet. */
			CURVNET_RESTORE();
			VNET_LIST_RUNLOCK_NOSLEEP();
			rw_wunlock(&tcp_function_lock);
			return (EBUSY);
		}
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
	/* Mark the block so no more stacks can use it. */
	blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
	/*
	 * If TCBs are still attached to the stack, attempt to switch them
	 * to the default stack.
	 */
	if (force && blk->tfb_refcnt) {
		struct inpcb *inp;
		struct tcpcb *tp;
		VNET_ITERATOR_DECL(vnet_iter);

		rw_wunlock(&tcp_function_lock);

		VNET_LIST_RLOCK();
		VNET_FOREACH(vnet_iter) {
			CURVNET_SET(vnet_iter);
			struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
			    INPLOOKUP_WLOCKPCB);

			while ((inp = inp_next(&inpi)) != NULL) {
				tp = intotcpcb(inp);
				if (tp == NULL || tp->t_fb != blk)
					continue;
				tcp_switch_back_to_default(tp);
			}
			CURVNET_RESTORE();
		}
		VNET_LIST_RUNLOCK();

		rw_wlock(&tcp_function_lock);
	}
	if (blk->tfb_refcnt) {
		/* TCBs still attached. */
		rw_wunlock(&tcp_function_lock);
		return (EBUSY);
	}
	if (quiesce) {
		/* Skip removal. */
		rw_wunlock(&tcp_function_lock);
		return (0);
	}
	/* Remove any function names that map to this function block. */
	while (find_tcp_fb_locked(blk, &f) != NULL) {
		TAILQ_REMOVE(&t_functions, f, tf_next);
		tcp_fb_cnt--;
		f->tf_fb = NULL;
		free(f, M_TCPFUNCTIONS);
	}
	rw_wunlock(&tcp_function_lock);
	return (0);
}

static void
tcp_drain(void *ctx __unused, int flags __unused)
{
	struct epoch_tracker et;
	VNET_ITERATOR_DECL(vnet_iter);

	if (!do_tcpdrain)
		return;

	NET_EPOCH_ENTER(et);
	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
		    INPLOOKUP_WLOCKPCB);
		struct inpcb *inpb;
		struct tcpcb *tcpb;

		/*
		 * Walk the tcpbs, if existing, and flush the reassembly queue,
		 * if there is one...
		 * XXX: The "Net/3" implementation doesn't imply that the TCP
		 * reassembly queue should be flushed, but in a situation
		 * where we're really low on mbufs, this is potentially
		 * useful.
		 */
		while ((inpb = inp_next(&inpi)) != NULL) {
			if ((tcpb = intotcpcb(inpb)) != NULL) {
				tcp_reass_flush(tcpb);
				tcp_clean_sackreport(tcpb);
#ifdef TCP_BLACKBOX
				tcp_log_drain(tcpb);
#endif
			}
		}
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
	NET_EPOCH_EXIT(et);
}

static void
tcp_vnet_init(void *arg __unused)
{

#ifdef TCP_HHOOK
	if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN,
	    &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
		printf("%s: WARNING: unable to register helper hook\n", __func__);
	if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT,
	    &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
		printf("%s: WARNING: unable to register helper hook\n", __func__);
#endif
#ifdef STATS
	if (tcp_stats_init())
		printf("%s: WARNING: unable to initialise TCP stats\n",
		    __func__);
#endif
	in_pcbinfo_init(&V_tcbinfo, &tcpcbstor, tcp_tcbhashsize,
	    tcp_tcbhashsize);

	syncache_init();
	tcp_hc_init();

	TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack);
	V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);

	tcp_fastopen_init();

	COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK);
	VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK);

	V_tcp_msl = TCPTV_MSL;
	V_tcp_msl_local = TCPTV_MSL_LOCAL;
	arc4rand(&V_ts_offset_secret, sizeof(V_ts_offset_secret), 0);
}
VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
    tcp_vnet_init, NULL);

static void
tcp_init(void *arg __unused)
{
	int hashsize;

	tcp_reass_global_init();

	/* XXX virtualize those below? */
	tcp_delacktime = TCPTV_DELACK;
	tcp_keepinit = TCPTV_KEEP_INIT;
	tcp_keepidle = TCPTV_KEEP_IDLE;
	tcp_keepintvl = TCPTV_KEEPINTVL;
	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
	tcp_rexmit_initial = TCPTV_RTOBASE;
	tcp_rexmit_min = TCPTV_MIN;
	tcp_rexmit_max = TCPTV_REXMTMAX;
	tcp_persmin = TCPTV_PERSMIN;
	tcp_persmax = TCPTV_PERSMAX;
	tcp_rexmit_slop = TCPTV_CPU_VAR;
	tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;

	/* Setup the tcp function block list */
	TAILQ_INIT(&t_functions);
	rw_init(&tcp_function_lock, "tcp_func_lock");
	register_tcp_functions(&tcp_def_funcblk, M_WAITOK);
	sx_init(&tcpoudp_lock, "TCP over UDP configuration");
#ifdef TCP_BLACKBOX
	/* Initialize the TCP logging data. */
	tcp_log_init();
#endif

	if (tcp_soreceive_stream) {
#ifdef INET
		tcp_protosw.pr_soreceive = soreceive_stream;
#endif
#ifdef INET6
		tcp6_protosw.pr_soreceive = soreceive_stream;
#endif /* INET6 */
	}

#ifdef INET6
	max_protohdr_grow(sizeof(struct ip6_hdr) + sizeof(struct tcphdr));
#else /* INET6 */
	max_protohdr_grow(sizeof(struct tcpiphdr));
#endif /* INET6 */

	ISN_LOCK_INIT();
	EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
	    SHUTDOWN_PRI_DEFAULT);
	EVENTHANDLER_REGISTER(vm_lowmem, tcp_drain, NULL, LOWMEM_PRI_DEFAULT);
	EVENTHANDLER_REGISTER(mbuf_lowmem, tcp_drain, NULL, LOWMEM_PRI_DEFAULT);

	tcp_inp_lro_direct_queue = counter_u64_alloc(M_WAITOK);
	tcp_inp_lro_wokeup_queue = counter_u64_alloc(M_WAITOK);
	tcp_inp_lro_compressed = counter_u64_alloc(M_WAITOK);
	tcp_inp_lro_locks_taken = counter_u64_alloc(M_WAITOK);
	tcp_extra_mbuf = counter_u64_alloc(M_WAITOK);
	tcp_would_have_but = counter_u64_alloc(M_WAITOK);
	tcp_comp_total = counter_u64_alloc(M_WAITOK);
	tcp_uncomp_total = counter_u64_alloc(M_WAITOK);
	tcp_bad_csums = counter_u64_alloc(M_WAITOK);
	tcp_pacing_failures = counter_u64_alloc(M_WAITOK);
	tcp_dgp_failures = counter_u64_alloc(M_WAITOK);

	hashsize = tcp_tcbhashsize;
	if (hashsize == 0) {
		/*
		 * Auto tune the hash size based on maxsockets.
		 * A perfect hash would have a 1:1 mapping
		 * (hashsize = maxsockets) however it's been
		 * suggested that O(2) average is better.
		 */
		hashsize = maketcp_hashsize(maxsockets / 4);
		/*
		 * Our historical default is 512,
		 * do not autotune lower than this.
		 */
		if (hashsize < 512)
			hashsize = 512;
		if (bootverbose)
			printf("%s: %s auto tuned to %d\n", __func__,
			    "net.inet.tcp.tcbhashsize", hashsize);
	}
	/*
	 * We require a hashsize to be a power of two.
	 * Previously if it was not a power of two we would just reset it
	 * back to 512, which could be a nasty surprise if you did not notice
	 * the error message.
	 * Instead what we do is round it to the nearest power of two that
	 * maketcp_hashsize() produces for the specified hash value.
	 */
	if (!powerof2(hashsize)) {
		int oldhashsize = hashsize;

		hashsize = maketcp_hashsize(hashsize);
		/* prevent absurdly low value */
		if (hashsize < 16)
			hashsize = 16;
		printf("%s: WARNING: TCB hash size not a power of 2, "
		    "clipped from %d to %d.\n", __func__, oldhashsize,
		    hashsize);
	}
	tcp_tcbhashsize = hashsize;

#ifdef INET
	IPPROTO_REGISTER(IPPROTO_TCP, tcp_input, tcp_ctlinput);
#endif
#ifdef INET6
	IP6PROTO_REGISTER(IPPROTO_TCP, tcp6_input, tcp6_ctlinput);
#endif
}
SYSINIT(tcp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, tcp_init, NULL);

#ifdef VIMAGE
static void
tcp_destroy(void *unused __unused)
{
#ifdef TCP_HHOOK
	int error;
#endif

	tcp_hc_destroy();
	syncache_destroy();
	in_pcbinfo_destroy(&V_tcbinfo);
	/* tcp_discardcb() clears the sack_holes up. */
	uma_zdestroy(V_sack_hole_zone);

	/*
	 * Cannot free the zone until all tcpcbs are released as we attach
	 * the allocations to them.
	 */
	tcp_fastopen_destroy();

	COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES);
	VNET_PCPUSTAT_FREE(tcpstat);

#ifdef TCP_HHOOK
	error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]);
	if (error != 0) {
		printf("%s: WARNING: unable to deregister helper hook "
		    "type=%d, id=%d: error %d returned\n", __func__,
		    HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error);
	}
	error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]);
	if (error != 0) {
		printf("%s: WARNING: unable to deregister helper hook "
		    "type=%d, id=%d: error %d returned\n", __func__,
		    HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error);
	}
#endif
}
VNET_SYSUNINIT(tcp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_destroy, NULL);
#endif

void
tcp_fini(void *xtp)
{

}

/*
 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
 * tcp_template used to store this data in mbufs, but we now recopy it out
 * of the tcpcb each time to conserve mbufs.
 */
void
tcpip_fillheaders(struct inpcb *inp, uint16_t port, void *ip_ptr, void *tcp_ptr)
{
	struct tcphdr *th = (struct tcphdr *)tcp_ptr;

	INP_WLOCK_ASSERT(inp);

#ifdef INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		struct ip6_hdr *ip6;

		ip6 = (struct ip6_hdr *)ip_ptr;
		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
		    (inp->inp_flow & IPV6_FLOWINFO_MASK);
		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
		    (IPV6_VERSION & IPV6_VERSION_MASK);
		if (port == 0)
			ip6->ip6_nxt = IPPROTO_TCP;
		else
			ip6->ip6_nxt = IPPROTO_UDP;
		ip6->ip6_plen = htons(sizeof(struct tcphdr));
		ip6->ip6_src = inp->in6p_laddr;
		ip6->ip6_dst = inp->in6p_faddr;
	}
#endif /* INET6 */
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET
	{
		struct ip *ip;

		ip = (struct ip *)ip_ptr;
		ip->ip_v = IPVERSION;
		ip->ip_hl = 5;
		ip->ip_tos = inp->inp_ip_tos;
		ip->ip_len = 0;
		ip->ip_id = 0;
		ip->ip_off = 0;
		ip->ip_ttl = inp->inp_ip_ttl;
		ip->ip_sum = 0;
		if (port == 0)
			ip->ip_p = IPPROTO_TCP;
		else
			ip->ip_p = IPPROTO_UDP;
		ip->ip_src = inp->inp_laddr;
		ip->ip_dst = inp->inp_faddr;
	}
#endif /* INET */
	th->th_sport = inp->inp_lport;
	th->th_dport = inp->inp_fport;
	th->th_seq = 0;
	th->th_ack = 0;
	th->th_off = 5;
	tcp_set_flags(th, 0);
	th->th_win = 0;
	th->th_urp = 0;
	th->th_sum = 0;		/* in_pseudo() is called later for ipv4 */
}

/*
 * Create template to be used to send tcp packets on a connection.
 * Allocates an mbuf and fills in a skeletal tcp/ip header. The only
 * use for this function is in keepalives, which use tcp_respond.
 */
struct tcptemp *
tcpip_maketemplate(struct inpcb *inp)
{
	struct tcptemp *t;

	t = malloc(sizeof(*t), M_TEMP, M_NOWAIT);
	if (t == NULL)
		return (NULL);
	tcpip_fillheaders(inp, 0, (void *)&t->tt_ipgen, (void *)&t->tt_t);
	return (t);
}

/*
 * Send a single message to the TCP at address specified by
 * the given TCP/IP header. If m == NULL, then we make a copy
 * of the tcpiphdr at th and send directly to the addressed host.
 * This is used to force keep alive messages out using the TCP
 * template for a connection.  If flags are given then we send
 * a message back to the TCP which originated the segment th,
 * and discard the mbuf containing it and any other attached mbufs.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 *
 * NOTE: If m != NULL, then th must point to *inside* the mbuf.
 */

void
tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
    tcp_seq ack, tcp_seq seq, uint16_t flags)
{
	struct tcpopt to;
	struct inpcb *inp;
	struct ip *ip;
	struct mbuf *optm;
	struct udphdr *uh = NULL;
	struct tcphdr *nth;
	struct tcp_log_buffer *lgb;
	u_char *optp;
#ifdef INET6
	struct ip6_hdr *ip6;
	int isipv6;
#endif /* INET6 */
	int optlen, tlen, win, ulen;
	int ect = 0;
	bool incl_opts;
	uint16_t port;
	int output_ret;
#ifdef INVARIANTS
	int thflags = tcp_get_flags(th);
#endif

	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
	NET_EPOCH_ASSERT();

#ifdef INET6
	isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4);
	ip6 = ipgen;
#endif /* INET6 */
	ip = ipgen;

	if (tp != NULL) {
		inp = tptoinpcb(tp);
		INP_LOCK_ASSERT(inp);
	} else
		inp = NULL;

	if (m != NULL) {
#ifdef INET6
		if (isipv6 && ip6 && (ip6->ip6_nxt == IPPROTO_UDP))
			port = m->m_pkthdr.tcp_tun_port;
		else
#endif
		if (ip && (ip->ip_p == IPPROTO_UDP))
			port = m->m_pkthdr.tcp_tun_port;
		else
			port = 0;
	} else
		port = tp->t_port;

	incl_opts = false;
	win = 0;
	if (tp != NULL) {
		if (!(flags & TH_RST)) {
			win = sbspace(&inp->inp_socket->so_rcv);
			if (win > TCP_MAXWIN << tp->rcv_scale)
				win = TCP_MAXWIN << tp->rcv_scale;
		}
		if ((tp->t_flags & TF_NOOPT) == 0)
			incl_opts = true;
	}
	if (m == NULL) {
		m = m_gethdr(M_NOWAIT, MT_DATA);
		if (m == NULL)
			return;
		m->m_data += max_linkhdr;
#ifdef INET6
		if (isipv6) {
			bcopy((caddr_t)ip6, mtod(m, caddr_t),
			    sizeof(struct ip6_hdr));
			ip6 = mtod(m, struct ip6_hdr *);
			nth = (struct tcphdr *)(ip6 + 1);
			if (port) {
				/* Insert a UDP header */
				uh = (struct udphdr *)nth;
				uh->uh_sport = htons(V_tcp_udp_tunneling_port);
				uh->uh_dport = port;
				nth = (struct tcphdr *)(uh + 1);
			}
		} else
#endif /* INET6 */
		{
			bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
			ip = mtod(m, struct ip *);
			nth = (struct tcphdr *)(ip + 1);
			if (port) {
				/* Insert a UDP header */
				uh = (struct udphdr *)nth;
				uh->uh_sport = htons(V_tcp_udp_tunneling_port);
				uh->uh_dport = port;
				nth = (struct tcphdr *)(uh + 1);
			}
		}
		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
		flags = TH_ACK;
	} else if ((!M_WRITABLE(m)) || (port != 0)) {
		struct mbuf *n;

		/* Can't reuse 'm', allocate a new mbuf. */
		n = m_gethdr(M_NOWAIT, MT_DATA);
		if (n == NULL) {
			m_freem(m);
			return;
		}

		if (!m_dup_pkthdr(n, m, M_NOWAIT)) {
			m_freem(m);
			m_freem(n);
			return;
		}

		n->m_data += max_linkhdr;
		/* m_len is set later */
#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
#ifdef INET6
		if (isipv6) {
			bcopy((caddr_t)ip6, mtod(n, caddr_t),
			    sizeof(struct ip6_hdr));
			ip6 = mtod(n, struct ip6_hdr *);
			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
			nth = (struct tcphdr *)(ip6 + 1);
			if (port) {
				/* Insert a UDP header */
				uh = (struct udphdr *)nth;
				uh->uh_sport = htons(V_tcp_udp_tunneling_port);
				uh->uh_dport = port;
				nth = (struct tcphdr *)(uh + 1);
			}
		} else
#endif /* INET6 */
		{
			bcopy((caddr_t)ip, mtod(n, caddr_t), sizeof(struct ip));
			ip = mtod(n, struct ip *);
			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
			nth = (struct tcphdr *)(ip + 1);
			if (port) {
				/* Insert a UDP header */
				uh = (struct udphdr *)nth;
				uh->uh_sport = htons(V_tcp_udp_tunneling_port);
				uh->uh_dport = port;
				nth = (struct tcphdr *)(uh + 1);
			}
		}
		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
		xchg(nth->th_dport, nth->th_sport, uint16_t);
		th = nth;
		m_freem(m);
		m = n;
	} else {
		/*
		 * reuse the mbuf.
		 * XXX MRT We inherit the FIB, which is lucky.
		 */
		m_freem(m->m_next);
		m->m_next = NULL;
		m->m_data = (caddr_t)ipgen;
		/* clear any receive flags for proper bpf timestamping */
		m->m_flags &= ~(M_TSTMP | M_TSTMP_LRO);
		/* m_len is set later */
#ifdef INET6
		if (isipv6) {
			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
		{
			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
			nth = (struct tcphdr *)(ip + 1);
		}
		if (th != nth) {
			/*
			 * this is usually a case when an extension header
			 * exists between the IPv6 header and the
			 * TCP header.
			 */
			nth->th_sport = th->th_sport;
			nth->th_dport = th->th_dport;
		}
		xchg(nth->th_dport, nth->th_sport, uint16_t);
#undef xchg
	}
	tlen = 0;
#ifdef INET6
	if (isipv6)
		tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
		tlen = sizeof (struct tcpiphdr);
#endif
	if (port)
		tlen += sizeof (struct udphdr);
#ifdef INVARIANTS
	m->m_len = 0;
	KASSERT(M_TRAILINGSPACE(m) >= tlen,
	    ("Not enough trailing space for message (m=%p, need=%d, have=%ld)",
	    m, tlen, (long)M_TRAILINGSPACE(m)));
#endif
	m->m_len = tlen;
	to.to_flags = 0;
	if (incl_opts) {
		ect = tcp_ecn_output_established(tp, &flags, 0, false);
		/* Make sure we have room. */
		if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) {
			m->m_next = m_get(M_NOWAIT, MT_DATA);
			if (m->m_next) {
				optp = mtod(m->m_next, u_char *);
				optm = m->m_next;
			} else
				incl_opts = false;
		} else {
			optp = (u_char *) (nth + 1);
			optm = m;
		}
	}
	if (incl_opts) {
		/* Timestamps. */
		if (tp->t_flags & TF_RCVD_TSTMP) {
			to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
			to.to_tsecr = tp->ts_recent;
			to.to_flags |= TOF_TS;
		}
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
		/* TCP-MD5 (RFC2385). */
*/ 1947 if (tp->t_flags & TF_SIGNATURE) 1948 to.to_flags |= TOF_SIGNATURE; 1949 #endif 1950 /* Add the options. */ 1951 tlen += optlen = tcp_addoptions(&to, optp); 1952 1953 /* Update m_len in the correct mbuf. */ 1954 optm->m_len += optlen; 1955 } else 1956 optlen = 0; 1957 #ifdef INET6 1958 if (isipv6) { 1959 if (uh) { 1960 ulen = tlen - sizeof(struct ip6_hdr); 1961 uh->uh_ulen = htons(ulen); 1962 } 1963 ip6->ip6_flow = htonl(ect << IPV6_FLOWLABEL_LEN); 1964 ip6->ip6_vfc = IPV6_VERSION; 1965 if (port) 1966 ip6->ip6_nxt = IPPROTO_UDP; 1967 else 1968 ip6->ip6_nxt = IPPROTO_TCP; 1969 ip6->ip6_plen = htons(tlen - sizeof(*ip6)); 1970 } 1971 #endif 1972 #if defined(INET) && defined(INET6) 1973 else 1974 #endif 1975 #ifdef INET 1976 { 1977 if (uh) { 1978 ulen = tlen - sizeof(struct ip); 1979 uh->uh_ulen = htons(ulen); 1980 } 1981 ip->ip_len = htons(tlen); 1982 if (inp != NULL) { 1983 ip->ip_tos = inp->inp_ip_tos & ~IPTOS_ECN_MASK; 1984 ip->ip_ttl = inp->inp_ip_ttl; 1985 } else { 1986 ip->ip_tos = 0; 1987 ip->ip_ttl = V_ip_defttl; 1988 } 1989 ip->ip_tos |= ect; 1990 if (port) { 1991 ip->ip_p = IPPROTO_UDP; 1992 } else { 1993 ip->ip_p = IPPROTO_TCP; 1994 } 1995 if (V_path_mtu_discovery) 1996 ip->ip_off |= htons(IP_DF); 1997 } 1998 #endif 1999 m->m_pkthdr.len = tlen; 2000 m->m_pkthdr.rcvif = NULL; 2001 #ifdef MAC 2002 if (inp != NULL) { 2003 /* 2004 * Packet is associated with a socket, so allow the 2005 * label of the response to reflect the socket label. 2006 */ 2007 INP_LOCK_ASSERT(inp); 2008 mac_inpcb_create_mbuf(inp, m); 2009 } else { 2010 /* 2011 * Packet is not associated with a socket, so possibly 2012 * update the label in place. 2013 */ 2014 mac_netinet_tcp_reply(m); 2015 } 2016 #endif 2017 nth->th_seq = htonl(seq); 2018 nth->th_ack = htonl(ack); 2019 nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2; 2020 tcp_set_flags(nth, flags); 2021 if (tp && (flags & TH_RST)) { 2022 /* Log the reset */ 2023 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 2024 } 2025 if (tp != NULL) 2026 nth->th_win = htons((u_short) (win >> tp->rcv_scale)); 2027 else 2028 nth->th_win = htons((u_short)win); 2029 nth->th_urp = 0; 2030 2031 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 2032 if (to.to_flags & TOF_SIGNATURE) { 2033 if (!TCPMD5_ENABLED() || 2034 TCPMD5_OUTPUT(m, nth, to.to_signature) != 0) { 2035 m_freem(m); 2036 return; 2037 } 2038 } 2039 #endif 2040 2041 #ifdef INET6 2042 if (isipv6) { 2043 if (port) { 2044 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 2045 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 2046 uh->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 2047 nth->th_sum = 0; 2048 } else { 2049 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 2050 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 2051 nth->th_sum = in6_cksum_pseudo(ip6, 2052 tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0); 2053 } 2054 ip6->ip6_hlim = in6_selecthlim(inp, NULL); 2055 } 2056 #endif /* INET6 */ 2057 #if defined(INET6) && defined(INET) 2058 else 2059 #endif 2060 #ifdef INET 2061 { 2062 if (port) { 2063 uh->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 2064 htons(ulen + IPPROTO_UDP)); 2065 m->m_pkthdr.csum_flags = CSUM_UDP; 2066 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 2067 nth->th_sum = 0; 2068 } else { 2069 m->m_pkthdr.csum_flags = CSUM_TCP; 2070 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 2071 nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 2072 htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); 2073 } 2074 } 2075 #endif /* INET */ 2076 
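/*
 * Only the pseudo-header checksum has been filled in above; the
 * csum_flags request that the remaining TCP or UDP checksum be
 * completed later, in software or by the transmitting interface.
 */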
TCP_PROBE3(debug__output, tp, th, m); 2077 if (flags & TH_RST) 2078 TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth); 2079 lgb = NULL; 2080 if ((tp != NULL) && tcp_bblogging_on(tp)) { 2081 if (INP_WLOCKED(inp)) { 2082 union tcp_log_stackspecific log; 2083 struct timeval tv; 2084 2085 memset(&log, 0, sizeof(log)); 2086 log.u_bbr.inhpts = tcp_in_hpts(tp); 2087 log.u_bbr.flex8 = 4; 2088 log.u_bbr.pkts_out = tp->t_maxseg; 2089 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2090 log.u_bbr.delivered = 0; 2091 lgb = tcp_log_event(tp, nth, NULL, NULL, TCP_LOG_OUT, 2092 ERRNO_UNK, 0, &log, false, NULL, NULL, 0, &tv); 2093 } else { 2094 /* 2095 * We can not log the packet, since we only own the 2096 * read lock, but a write lock is needed. The read lock 2097 * is not upgraded to a write lock, since only getting 2098 * the read lock was done intentionally to improve the 2099 * handling of SYN flooding attacks. 2100 * This happens only for pure SYN segments received in 2101 * the initial CLOSED state, or received in a more 2102 * advanced state than listen and the UDP encapsulation 2103 * port is unexpected. 2104 * The incoming SYN segments do not really belong to 2105 * the TCP connection and the handling does not change 2106 * the state of the TCP connection. Therefore, the 2107 * sending of the RST segments is not logged. Please 2108 * note that also the incoming SYN segments are not 2109 * logged. 2110 * 2111 * The following code ensures that the above description 2112 * is and stays correct. 2113 */ 2114 KASSERT((thflags & (TH_ACK|TH_SYN)) == TH_SYN && 2115 (tp->t_state == TCPS_CLOSED || 2116 (tp->t_state > TCPS_LISTEN && tp->t_port != port)), 2117 ("%s: Logging of TCP segment with flags 0x%b and " 2118 "UDP encapsulation port %u skipped in state %s", 2119 __func__, thflags, PRINT_TH_FLAGS, 2120 ntohs(port), tcpstates[tp->t_state])); 2121 } 2122 } 2123 2124 if (flags & TH_ACK) 2125 TCPSTAT_INC(tcps_sndacks); 2126 else if (flags & (TH_SYN|TH_FIN|TH_RST)) 2127 TCPSTAT_INC(tcps_sndctrl); 2128 TCPSTAT_INC(tcps_sndtotal); 2129 2130 #ifdef INET6 2131 if (isipv6) { 2132 TCP_PROBE5(send, NULL, tp, ip6, tp, nth); 2133 output_ret = ip6_output(m, inp ? inp->in6p_outputopts : NULL, 2134 NULL, 0, NULL, NULL, inp); 2135 } 2136 #endif /* INET6 */ 2137 #if defined(INET) && defined(INET6) 2138 else 2139 #endif 2140 #ifdef INET 2141 { 2142 TCP_PROBE5(send, NULL, tp, ip, tp, nth); 2143 output_ret = ip_output(m, NULL, NULL, 0, NULL, inp); 2144 } 2145 #endif 2146 if (lgb != NULL) 2147 lgb->tlb_errno = output_ret; 2148 } 2149 2150 /* 2151 * Send a challenge ack (no data, no SACK option), but not more than 2152 * V_tcp_ack_war_cnt per V_tcp_ack_war_time_window (per TCP connection). 2153 */ 2154 void 2155 tcp_send_challenge_ack(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m) 2156 { 2157 sbintime_t now; 2158 bool send_challenge_ack; 2159 2160 /* 2161 * The sending of a challenge ACK could be triggered by a blind attacker 2162 * to detect an existing TCP connection. To mitigate that, increment 2163 * also the global counter which would be incremented if the attacker 2164 * would have guessed wrongly. 2165 */ 2166 (void)badport_bandlim(BANDLIM_TCP_RST); 2167 if (V_tcp_ack_war_time_window == 0 || V_tcp_ack_war_cnt == 0) { 2168 /* ACK war protection is disabled. */ 2169 send_challenge_ack = true; 2170 } else { 2171 /* Start new epoch, if the previous one is already over. 
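An epoch lasts V_tcp_ack_war_time_window milliseconds, and at most V_tcp_ack_war_cnt challenge ACKs are sent within it.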
*/ 2172 now = getsbinuptime(); 2173 if (tp->t_challenge_ack_end < now) { 2174 tp->t_challenge_ack_cnt = 0; 2175 tp->t_challenge_ack_end = now + 2176 V_tcp_ack_war_time_window * SBT_1MS; 2177 } 2178 /* 2179 * Send a challenge ACK, if less than tcp_ack_war_cnt have been 2180 * sent in the current epoch. 2181 */ 2182 if (tp->t_challenge_ack_cnt < V_tcp_ack_war_cnt) { 2183 send_challenge_ack = true; 2184 tp->t_challenge_ack_cnt++; 2185 } else { 2186 send_challenge_ack = false; 2187 } 2188 } 2189 if (send_challenge_ack) { 2190 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, 2191 tp->snd_nxt, TH_ACK); 2192 tp->last_ack_sent = tp->rcv_nxt; 2193 } 2194 } 2195 2196 /* 2197 * Create a new TCP control block, making an empty reassembly queue and hooking 2198 * it to the argument protocol control block. The `inp' parameter must have 2199 * come from the zone allocator set up by tcpcbstor declaration. 2200 * The caller can provide a pointer to a tcpcb of the listener to inherit the 2201 * TCP function block from the listener. 2202 */ 2203 struct tcpcb * 2204 tcp_newtcpcb(struct inpcb *inp, struct tcpcb *listening_tcb) 2205 { 2206 struct tcpcb *tp = intotcpcb(inp); 2207 #ifdef INET6 2208 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 2209 #endif /* INET6 */ 2210 2211 /* 2212 * Historically allocation was done with M_ZERO. There is a lot of 2213 * code that rely on that. For now take safe approach and zero whole 2214 * tcpcb. This definitely can be optimized. 2215 */ 2216 bzero(&tp->t_start_zero, t_zero_size); 2217 2218 /* Initialise cc_var struct for this tcpcb. */ 2219 tp->t_ccv.tp = tp; 2220 rw_rlock(&tcp_function_lock); 2221 if (listening_tcb != NULL) { 2222 INP_LOCK_ASSERT(tptoinpcb(listening_tcb)); 2223 KASSERT(listening_tcb->t_fb != NULL, 2224 ("tcp_newtcpcb: listening_tcb->t_fb is NULL")); 2225 if (listening_tcb->t_fb->tfb_flags & TCP_FUNC_BEING_REMOVED) { 2226 rw_runlock(&tcp_function_lock); 2227 return (NULL); 2228 } 2229 tp->t_fb = listening_tcb->t_fb; 2230 } else { 2231 tp->t_fb = V_tcp_func_set_ptr; 2232 } 2233 refcount_acquire(&tp->t_fb->tfb_refcnt); 2234 KASSERT((tp->t_fb->tfb_flags & TCP_FUNC_BEING_REMOVED) == 0, 2235 ("tcp_newtcpcb: using TFB being removed")); 2236 rw_runlock(&tcp_function_lock); 2237 CC_LIST_RLOCK(); 2238 if (listening_tcb != NULL) { 2239 if (CC_ALGO(listening_tcb)->flags & CC_MODULE_BEING_REMOVED) { 2240 CC_LIST_RUNLOCK(); 2241 if (tp->t_fb->tfb_tcp_fb_fini) 2242 (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); 2243 refcount_release(&tp->t_fb->tfb_refcnt); 2244 return (NULL); 2245 } 2246 CC_ALGO(tp) = CC_ALGO(listening_tcb); 2247 } else 2248 CC_ALGO(tp) = CC_DEFAULT_ALGO(); 2249 cc_refer(CC_ALGO(tp)); 2250 CC_LIST_RUNLOCK(); 2251 if (CC_ALGO(tp)->cb_init != NULL) 2252 if (CC_ALGO(tp)->cb_init(&tp->t_ccv, NULL) > 0) { 2253 cc_detach(tp); 2254 if (tp->t_fb->tfb_tcp_fb_fini) 2255 (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); 2256 refcount_release(&tp->t_fb->tfb_refcnt); 2257 return (NULL); 2258 } 2259 2260 #ifdef TCP_HHOOK 2261 if (khelp_init_osd(HELPER_CLASS_TCP, &tp->t_osd)) { 2262 if (CC_ALGO(tp)->cb_destroy != NULL) 2263 CC_ALGO(tp)->cb_destroy(&tp->t_ccv); 2264 CC_DATA(tp) = NULL; 2265 cc_detach(tp); 2266 if (tp->t_fb->tfb_tcp_fb_fini) 2267 (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); 2268 refcount_release(&tp->t_fb->tfb_refcnt); 2269 return (NULL); 2270 } 2271 #endif 2272 2273 TAILQ_INIT(&tp->t_segq); 2274 STAILQ_INIT(&tp->t_inqueue); 2275 tp->t_maxseg = 2276 #ifdef INET6 2277 isipv6 ? 
V_tcp_v6mssdflt : 2278 #endif /* INET6 */ 2279 V_tcp_mssdflt; 2280 2281 /* All mbuf queue/ack compress flags should be off */ 2282 tcp_lro_features_off(tp); 2283 2284 tp->t_hpts_cpu = HPTS_CPU_NONE; 2285 tp->t_lro_cpu = HPTS_CPU_NONE; 2286 2287 callout_init_rw(&tp->t_callout, &inp->inp_lock, 2288 CALLOUT_TRYLOCK | CALLOUT_RETURNUNLOCKED); 2289 for (int i = 0; i < TT_N; i++) 2290 tp->t_timers[i] = SBT_MAX; 2291 2292 switch (V_tcp_do_rfc1323) { 2293 case 0: 2294 break; 2295 default: 2296 case 1: 2297 tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); 2298 break; 2299 case 2: 2300 tp->t_flags = TF_REQ_SCALE; 2301 break; 2302 case 3: 2303 tp->t_flags = TF_REQ_TSTMP; 2304 break; 2305 } 2306 if (V_tcp_do_sack) 2307 tp->t_flags |= TF_SACK_PERMIT; 2308 TAILQ_INIT(&tp->snd_holes); 2309 2310 /* 2311 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no 2312 * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives 2313 * reasonable initial retransmit time. 2314 */ 2315 tp->t_srtt = TCPTV_SRTTBASE; 2316 tp->t_rttvar = ((tcp_rexmit_initial - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; 2317 tp->t_rttmin = tcp_rexmit_min; 2318 tp->t_rxtcur = tcp_rexmit_initial; 2319 tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; 2320 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; 2321 tp->t_rcvtime = ticks; 2322 /* We always start with ticks granularity */ 2323 tp->t_tmr_granularity = TCP_TMR_GRANULARITY_TICKS; 2324 /* 2325 * IPv4 TTL initialization is necessary for an IPv6 socket as well, 2326 * because the socket may be bound to an IPv6 wildcard address, 2327 * which may match an IPv4-mapped IPv6 address. 2328 */ 2329 inp->inp_ip_ttl = V_ip_defttl; 2330 #ifdef TCP_BLACKBOX 2331 /* Initialize the per-TCPCB log data. */ 2332 tcp_log_tcpcbinit(tp); 2333 #endif 2334 tp->t_pacing_rate = -1; 2335 if (tp->t_fb->tfb_tcp_fb_init) { 2336 if ((*tp->t_fb->tfb_tcp_fb_init)(tp, &tp->t_fb_ptr)) { 2337 if (CC_ALGO(tp)->cb_destroy != NULL) 2338 CC_ALGO(tp)->cb_destroy(&tp->t_ccv); 2339 CC_DATA(tp) = NULL; 2340 cc_detach(tp); 2341 #ifdef TCP_HHOOK 2342 khelp_destroy_osd(&tp->t_osd); 2343 #endif 2344 refcount_release(&tp->t_fb->tfb_refcnt); 2345 return (NULL); 2346 } 2347 } 2348 #ifdef STATS 2349 if (V_tcp_perconn_stats_enable == 1) 2350 tp->t_stats = stats_blob_alloc(V_tcp_perconn_stats_dflt_tpl, 0); 2351 #endif 2352 if (V_tcp_do_lrd) 2353 tp->t_flags |= TF_LRD; 2354 2355 return (tp); 2356 } 2357 2358 /* 2359 * Drop a TCP connection, reporting 2360 * the specified error. If connection is synchronized, 2361 * then send a RST to peer. 2362 */ 2363 struct tcpcb * 2364 tcp_drop(struct tcpcb *tp, int errno) 2365 { 2366 struct socket *so = tptosocket(tp); 2367 2368 NET_EPOCH_ASSERT(); 2369 INP_WLOCK_ASSERT(tptoinpcb(tp)); 2370 2371 if (TCPS_HAVERCVDSYN(tp->t_state)) { 2372 tcp_state_change(tp, TCPS_CLOSED); 2373 /* Don't use tcp_output() here due to possible recursion. 
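tcp_output() may itself drop the connection when the stack reports a fatal error, which would re-enter this function; tcp_output_nodrop() leaves that decision to the caller.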
*/ 2374 (void)tcp_output_nodrop(tp); 2375 TCPSTAT_INC(tcps_drops); 2376 } else 2377 TCPSTAT_INC(tcps_conndrops); 2378 if (errno == ETIMEDOUT && tp->t_softerror) 2379 errno = tp->t_softerror; 2380 so->so_error = errno; 2381 return (tcp_close(tp)); 2382 } 2383 2384 void 2385 tcp_discardcb(struct tcpcb *tp) 2386 { 2387 struct inpcb *inp = tptoinpcb(tp); 2388 struct socket *so = tptosocket(tp); 2389 struct mbuf *m; 2390 #ifdef INET6 2391 bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 2392 #endif 2393 2394 INP_WLOCK_ASSERT(inp); 2395 MPASS(!callout_active(&tp->t_callout)); 2396 MPASS(TAILQ_EMPTY(&tp->snd_holes)); 2397 2398 /* free the reassembly queue, if any */ 2399 tcp_reass_flush(tp); 2400 2401 #ifdef TCP_OFFLOAD 2402 /* Disconnect offload device, if any. */ 2403 if (tp->t_flags & TF_TOE) 2404 tcp_offload_detach(tp); 2405 #endif 2406 2407 /* Allow the CC algorithm to clean up after itself. */ 2408 if (CC_ALGO(tp)->cb_destroy != NULL) 2409 CC_ALGO(tp)->cb_destroy(&tp->t_ccv); 2410 CC_DATA(tp) = NULL; 2411 /* Detach from the CC algorithm */ 2412 cc_detach(tp); 2413 2414 #ifdef TCP_HHOOK 2415 khelp_destroy_osd(&tp->t_osd); 2416 #endif 2417 #ifdef STATS 2418 stats_blob_destroy(tp->t_stats); 2419 #endif 2420 2421 CC_ALGO(tp) = NULL; 2422 if ((m = STAILQ_FIRST(&tp->t_inqueue)) != NULL) { 2423 struct mbuf *prev; 2424 2425 STAILQ_INIT(&tp->t_inqueue); 2426 STAILQ_FOREACH_FROM_SAFE(m, &tp->t_inqueue, m_stailqpkt, prev) 2427 m_freem(m); 2428 } 2429 TCPSTATES_DEC(tp->t_state); 2430 2431 if (tp->t_fb->tfb_tcp_fb_fini) 2432 (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); 2433 MPASS(!tcp_in_hpts(tp)); 2434 #ifdef TCP_BLACKBOX 2435 tcp_log_tcpcbfini(tp); 2436 #endif 2437 2438 /* 2439 * If we got enough samples through the srtt filter, 2440 * save the rtt and rttvar in the routing entry. 2441 * 'Enough' is arbitrarily defined as 4 rtt samples. 2442 * 4 samples is enough for the srtt filter to converge 2443 * to within enough % of the correct value; fewer samples 2444 * and we could save a bogus rtt. The danger is not high 2445 * as tcp quickly recovers from everything. 2446 * XXX: Works very well but needs some more statistics! 2447 * 2448 * XXXRRS: Updating must be after the stack fini() since 2449 * that may be converting some internal representation of 2450 * say srtt etc into the general one used by other stacks. 2451 */ 2452 if (tp->t_rttupdated >= 4) { 2453 struct hc_metrics_lite metrics; 2454 uint32_t ssthresh; 2455 2456 bzero(&metrics, sizeof(metrics)); 2457 /* 2458 * Update the ssthresh always when the conditions below 2459 * are satisfied. This gives us better new start value 2460 * for the congestion avoidance for new connections. 2461 * ssthresh is only set if packet loss occurred on a session. 2462 */ 2463 ssthresh = tp->snd_ssthresh; 2464 if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { 2465 /* 2466 * convert the limit from user data bytes to 2467 * packets then to packet data bytes. 2468 */ 2469 ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; 2470 if (ssthresh < 2) 2471 ssthresh = 2; 2472 ssthresh *= (tp->t_maxseg + 2473 #ifdef INET6 2474 (isipv6 ? 
sizeof (struct ip6_hdr) + 2475 sizeof (struct tcphdr) : 2476 #endif 2477 sizeof (struct tcpiphdr) 2478 #ifdef INET6 2479 ) 2480 #endif 2481 ); 2482 } else 2483 ssthresh = 0; 2484 metrics.hc_ssthresh = ssthresh; 2485 2486 metrics.hc_rtt = tp->t_srtt; 2487 metrics.hc_rttvar = tp->t_rttvar; 2488 metrics.hc_cwnd = tp->snd_cwnd; 2489 metrics.hc_sendpipe = 0; 2490 metrics.hc_recvpipe = 0; 2491 2492 tcp_hc_update(&inp->inp_inc, &metrics); 2493 } 2494 2495 refcount_release(&tp->t_fb->tfb_refcnt); 2496 } 2497 2498 /* 2499 * Attempt to close a TCP control block, marking it as dropped, and freeing 2500 * the socket if we hold the only reference. 2501 */ 2502 struct tcpcb * 2503 tcp_close(struct tcpcb *tp) 2504 { 2505 struct inpcb *inp = tptoinpcb(tp); 2506 struct socket *so = tptosocket(tp); 2507 2508 INP_WLOCK_ASSERT(inp); 2509 2510 #ifdef TCP_OFFLOAD 2511 if (tp->t_state == TCPS_LISTEN) 2512 tcp_offload_listen_stop(tp); 2513 #endif 2514 /* 2515 * This releases the TFO pending counter resource for TFO listen 2516 * sockets as well as passively-created TFO sockets that transition 2517 * from SYN_RECEIVED to CLOSED. 2518 */ 2519 if (tp->t_tfo_pending) { 2520 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 2521 tp->t_tfo_pending = NULL; 2522 } 2523 tcp_timer_stop(tp); 2524 if (tp->t_fb->tfb_tcp_timer_stop_all != NULL) 2525 tp->t_fb->tfb_tcp_timer_stop_all(tp); 2526 in_pcbdrop(inp); 2527 TCPSTAT_INC(tcps_closed); 2528 if (tp->t_state != TCPS_CLOSED) 2529 tcp_state_change(tp, TCPS_CLOSED); 2530 KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); 2531 tcp_free_sackholes(tp); 2532 soisdisconnected(so); 2533 if (inp->inp_flags & INP_SOCKREF) { 2534 inp->inp_flags &= ~INP_SOCKREF; 2535 INP_WUNLOCK(inp); 2536 sorele(so); 2537 return (NULL); 2538 } 2539 return (tp); 2540 } 2541 2542 /* 2543 * Notify a tcp user of an asynchronous error; 2544 * store error as soft error, but wake up user 2545 * (for now, won't do anything until can select for soft error). 2546 * 2547 * Do not wake up user since there currently is no mechanism for 2548 * reporting soft errors (yet - a kqueue filter may be added). 2549 */ 2550 static struct inpcb * 2551 tcp_notify(struct inpcb *inp, int error) 2552 { 2553 struct tcpcb *tp; 2554 2555 INP_WLOCK_ASSERT(inp); 2556 2557 tp = intotcpcb(inp); 2558 KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); 2559 2560 /* 2561 * Ignore some errors if we are hooked up. 2562 * If connection hasn't completed, has retransmitted several times, 2563 * and receives a second error, give up now. This is better 2564 * than waiting a long time to establish a connection that 2565 * can never complete. 
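Otherwise the error is only recorded in t_softerror for later reporting.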
2566 */ 2567 if (tp->t_state == TCPS_ESTABLISHED && 2568 (error == EHOSTUNREACH || error == ENETUNREACH || 2569 error == EHOSTDOWN)) { 2570 if (inp->inp_route.ro_nh) { 2571 NH_FREE(inp->inp_route.ro_nh); 2572 inp->inp_route.ro_nh = (struct nhop_object *)NULL; 2573 } 2574 return (inp); 2575 } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && 2576 tp->t_softerror) { 2577 tp = tcp_drop(tp, error); 2578 if (tp != NULL) 2579 return (inp); 2580 else 2581 return (NULL); 2582 } else { 2583 tp->t_softerror = error; 2584 return (inp); 2585 } 2586 #if 0 2587 wakeup( &so->so_timeo); 2588 sorwakeup(so); 2589 sowwakeup(so); 2590 #endif 2591 } 2592 2593 static int 2594 tcp_pcblist(SYSCTL_HANDLER_ARGS) 2595 { 2596 struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, 2597 INPLOOKUP_RLOCKPCB); 2598 struct xinpgen xig; 2599 struct inpcb *inp; 2600 int error; 2601 2602 if (req->newptr != NULL) 2603 return (EPERM); 2604 2605 if (req->oldptr == NULL) { 2606 int n; 2607 2608 n = V_tcbinfo.ipi_count + 2609 counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); 2610 n += imax(n / 8, 10); 2611 req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); 2612 return (0); 2613 } 2614 2615 if ((error = sysctl_wire_old_buffer(req, 0)) != 0) 2616 return (error); 2617 2618 bzero(&xig, sizeof(xig)); 2619 xig.xig_len = sizeof xig; 2620 xig.xig_count = V_tcbinfo.ipi_count + 2621 counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); 2622 xig.xig_gen = V_tcbinfo.ipi_gencnt; 2623 xig.xig_sogen = so_gencnt; 2624 error = SYSCTL_OUT(req, &xig, sizeof xig); 2625 if (error) 2626 return (error); 2627 2628 error = syncache_pcblist(req); 2629 if (error) 2630 return (error); 2631 2632 while ((inp = inp_next(&inpi)) != NULL) { 2633 if (inp->inp_gencnt <= xig.xig_gen && 2634 cr_canseeinpcb(req->td->td_ucred, inp) == 0) { 2635 struct xtcpcb xt; 2636 2637 tcp_inptoxtp(inp, &xt); 2638 error = SYSCTL_OUT(req, &xt, sizeof xt); 2639 if (error) { 2640 INP_RUNLOCK(inp); 2641 break; 2642 } else 2643 continue; 2644 } 2645 } 2646 2647 if (!error) { 2648 /* 2649 * Give the user an updated idea of our state. 2650 * If the generation differs from what we told 2651 * her before, she knows that something happened 2652 * while we were processing this request, and it 2653 * might be necessary to retry. 
2654 */ 2655 xig.xig_gen = V_tcbinfo.ipi_gencnt; 2656 xig.xig_sogen = so_gencnt; 2657 xig.xig_count = V_tcbinfo.ipi_count + 2658 counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); 2659 error = SYSCTL_OUT(req, &xig, sizeof xig); 2660 } 2661 2662 return (error); 2663 } 2664 2665 SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, 2666 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_NEEDGIANT, 2667 NULL, 0, tcp_pcblist, "S,xtcpcb", 2668 "List of active TCP connections"); 2669 2670 #define SND_TAG_STATUS_MAXLEN 128 2671 2672 #ifdef KERN_TLS 2673 2674 static struct sx ktlslist_lock; 2675 SX_SYSINIT(ktlslistlock, &ktlslist_lock, "ktlslist"); 2676 static uint64_t ktls_glob_gen = 1; 2677 2678 static int 2679 tcp_ktlslist_locked(SYSCTL_HANDLER_ARGS, bool export_keys) 2680 { 2681 struct xinpgen xig; 2682 struct inpcb *inp; 2683 struct socket *so; 2684 struct ktls_session *ksr, *kss; 2685 char *buf; 2686 struct xktls_session *xktls; 2687 uint64_t ipi_gencnt; 2688 size_t buflen, len, sz; 2689 u_int cnt; 2690 int error; 2691 bool ek, p; 2692 2693 sx_assert(&ktlslist_lock, SA_XLOCKED); 2694 if (req->newptr != NULL) 2695 return (EPERM); 2696 2697 len = 0; 2698 cnt = 0; 2699 ipi_gencnt = V_tcbinfo.ipi_gencnt; 2700 bzero(&xig, sizeof(xig)); 2701 xig.xig_len = sizeof(xig); 2702 xig.xig_gen = ktls_glob_gen++; 2703 xig.xig_sogen = so_gencnt; 2704 2705 struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, 2706 INPLOOKUP_RLOCKPCB); 2707 while ((inp = inp_next(&inpi)) != NULL) { 2708 if (inp->inp_gencnt > ipi_gencnt || 2709 cr_canseeinpcb(req->td->td_ucred, inp) != 0) 2710 continue; 2711 2712 so = inp->inp_socket; 2713 if (so != NULL && so->so_gencnt <= xig.xig_sogen) { 2714 p = false; 2715 ek = export_keys && cr_canexport_ktlskeys( 2716 req->td, inp); 2717 ksr = so->so_rcv.sb_tls_info; 2718 if (ksr != NULL) { 2719 ksr->gen = xig.xig_gen; 2720 p = true; 2721 if (ek) { 2722 sz = SIZE_T_MAX; 2723 ktls_session_copy_keys(ksr, 2724 NULL, &sz); 2725 len += sz; 2726 } 2727 if (ksr->snd_tag != NULL && 2728 ksr->snd_tag->sw->snd_tag_status_str != 2729 NULL) { 2730 sz = SND_TAG_STATUS_MAXLEN; 2731 in_pcbref(inp); 2732 INP_RUNLOCK(inp); 2733 error = ksr->snd_tag->sw-> 2734 snd_tag_status_str( 2735 ksr->snd_tag, NULL, &sz); 2736 if (in_pcbrele_rlock(inp)) 2737 return (EDEADLK); 2738 if (error == 0) 2739 len += sz; 2740 } 2741 } 2742 kss = so->so_snd.sb_tls_info; 2743 if (kss != NULL) { 2744 kss->gen = xig.xig_gen; 2745 p = true; 2746 if (ek) { 2747 sz = SIZE_T_MAX; 2748 ktls_session_copy_keys(kss, 2749 NULL, &sz); 2750 len += sz; 2751 } 2752 if (kss->snd_tag != NULL && 2753 kss->snd_tag->sw->snd_tag_status_str != 2754 NULL) { 2755 sz = SND_TAG_STATUS_MAXLEN; 2756 in_pcbref(inp); 2757 INP_RUNLOCK(inp); 2758 error = kss->snd_tag->sw-> 2759 snd_tag_status_str( 2760 kss->snd_tag, NULL, &sz); 2761 if (in_pcbrele_rlock(inp)) 2762 return (EDEADLK); 2763 if (error == 0) 2764 len += sz; 2765 } 2766 } 2767 if (p) { 2768 len += sizeof(*xktls); 2769 len = roundup2(len, __alignof(struct 2770 xktls_session)); 2771 } 2772 } 2773 } 2774 if (req->oldptr == NULL) { 2775 len += 2 * sizeof(xig); 2776 len += 3 * len / 4; 2777 req->oldidx = len; 2778 return (0); 2779 } 2780 2781 if ((error = sysctl_wire_old_buffer(req, 0)) != 0) 2782 return (error); 2783 2784 error = SYSCTL_OUT(req, &xig, sizeof xig); 2785 if (error != 0) 2786 return (error); 2787 2788 buflen = roundup2(sizeof(*xktls) + 2 * TLS_MAX_PARAM_SIZE + 2789 2 * SND_TAG_STATUS_MAXLEN, __alignof(struct xktls_session)); 2790 buf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); 2791 struct 
inpcb_iterator inpi1 = INP_ALL_ITERATOR(&V_tcbinfo, 2792 INPLOOKUP_RLOCKPCB); 2793 while ((inp = inp_next(&inpi1)) != NULL) { 2794 if (inp->inp_gencnt > ipi_gencnt || 2795 cr_canseeinpcb(req->td->td_ucred, inp) != 0) 2796 continue; 2797 2798 so = inp->inp_socket; 2799 if (so == NULL) 2800 continue; 2801 2802 p = false; 2803 ek = export_keys && cr_canexport_ktlskeys(req->td, inp); 2804 ksr = so->so_rcv.sb_tls_info; 2805 kss = so->so_snd.sb_tls_info; 2806 xktls = (struct xktls_session *)buf; 2807 if (ksr != NULL && ksr->gen == xig.xig_gen) { 2808 p = true; 2809 ktls_session_to_xktls_onedir(ksr, ek, &xktls->rcv); 2810 } 2811 if (kss != NULL && kss->gen == xig.xig_gen) { 2812 p = true; 2813 ktls_session_to_xktls_onedir(kss, ek, &xktls->snd); 2814 } 2815 if (!p) 2816 continue; 2817 2818 xktls->inp_gencnt = inp->inp_gencnt; 2819 xktls->so_pcb = (kvaddr_t)inp; 2820 memcpy(&xktls->coninf, &inp->inp_inc, sizeof(xktls->coninf)); 2821 len = sizeof(*xktls); 2822 if (ksr != NULL && ksr->gen == xig.xig_gen) { 2823 if (ek) { 2824 sz = buflen - len; 2825 ktls_session_copy_keys(ksr, buf + len, &sz); 2826 len += sz; 2827 } else { 2828 xktls->rcv.cipher_key_len = 0; 2829 xktls->rcv.auth_key_len = 0; 2830 } 2831 if (ksr->snd_tag != NULL && 2832 ksr->snd_tag->sw->snd_tag_status_str != NULL) { 2833 sz = SND_TAG_STATUS_MAXLEN; 2834 in_pcbref(inp); 2835 INP_RUNLOCK(inp); 2836 error = ksr->snd_tag->sw->snd_tag_status_str( 2837 ksr->snd_tag, buf + len, &sz); 2838 if (in_pcbrele_rlock(inp)) 2839 return (EDEADLK); 2840 if (error == 0) { 2841 xktls->rcv.drv_st_len = sz; 2842 len += sz; 2843 } 2844 } 2845 } 2846 if (kss != NULL && kss->gen == xig.xig_gen) { 2847 if (ek) { 2848 sz = buflen - len; 2849 ktls_session_copy_keys(kss, buf + len, &sz); 2850 len += sz; 2851 } else { 2852 xktls->snd.cipher_key_len = 0; 2853 xktls->snd.auth_key_len = 0; 2854 } 2855 if (kss->snd_tag != NULL && 2856 kss->snd_tag->sw->snd_tag_status_str != NULL) { 2857 sz = SND_TAG_STATUS_MAXLEN; 2858 in_pcbref(inp); 2859 INP_RUNLOCK(inp); 2860 error = kss->snd_tag->sw->snd_tag_status_str( 2861 kss->snd_tag, buf + len, &sz); 2862 if (in_pcbrele_rlock(inp)) 2863 return (EDEADLK); 2864 if (error == 0) { 2865 xktls->snd.drv_st_len = sz; 2866 len += sz; 2867 } 2868 } 2869 } 2870 len = roundup2(len, __alignof(*xktls)); 2871 xktls->tsz = len; 2872 xktls->fsz = sizeof(*xktls); 2873 2874 error = SYSCTL_OUT(req, xktls, len); 2875 if (error != 0) { 2876 INP_RUNLOCK(inp); 2877 break; 2878 } 2879 cnt++; 2880 } 2881 2882 if (error == 0) { 2883 xig.xig_sogen = so_gencnt; 2884 xig.xig_count = cnt; 2885 error = SYSCTL_OUT(req, &xig, sizeof(xig)); 2886 } 2887 2888 zfree(buf, M_TEMP); 2889 return (error); 2890 } 2891 2892 static int 2893 tcp_ktlslist1(SYSCTL_HANDLER_ARGS, bool export_keys) 2894 { 2895 int repeats, error; 2896 2897 for (repeats = 0; repeats < 100; repeats++) { 2898 if (sx_xlock_sig(&ktlslist_lock)) 2899 return (EINTR); 2900 error = tcp_ktlslist_locked(oidp, arg1, arg2, req, 2901 export_keys); 2902 sx_xunlock(&ktlslist_lock); 2903 if (error != EDEADLK) 2904 break; 2905 if (sig_intr() != 0) { 2906 error = EINTR; 2907 break; 2908 } 2909 req->oldidx = 0; 2910 } 2911 return (error); 2912 } 2913 2914 static int 2915 tcp_ktlslist_nokeys(SYSCTL_HANDLER_ARGS) 2916 { 2917 return (tcp_ktlslist1(oidp, arg1, arg2, req, false)); 2918 } 2919 2920 static int 2921 tcp_ktlslist_wkeys(SYSCTL_HANDLER_ARGS) 2922 { 2923 return (tcp_ktlslist1(oidp, arg1, arg2, req, true)); 2924 } 2925 2926 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KTLSLIST, ktlslist, 2927 CTLTYPE_OPAQUE | 
CTLFLAG_RD | CTLFLAG_MPSAFE, 2928 NULL, 0, tcp_ktlslist_nokeys, "S,xktls_session", 2929 "List of active kTLS sessions for TCP connections"); 2930 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KTLSLIST_WKEYS, ktlslist_wkeys, 2931 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 2932 NULL, 0, tcp_ktlslist_wkeys, "S,xktls_session", 2933 "List of active kTLS sessions for TCP connections with keys"); 2934 #endif /* KERN_TLS */ 2935 2936 #ifdef INET 2937 static int 2938 tcp_getcred(SYSCTL_HANDLER_ARGS) 2939 { 2940 struct xucred xuc; 2941 struct sockaddr_in addrs[2]; 2942 struct epoch_tracker et; 2943 struct inpcb *inp; 2944 int error; 2945 2946 if (req->newptr == NULL) 2947 return (EINVAL); 2948 error = priv_check(req->td, PRIV_NETINET_GETCRED); 2949 if (error) 2950 return (error); 2951 error = SYSCTL_IN(req, addrs, sizeof(addrs)); 2952 if (error) 2953 return (error); 2954 NET_EPOCH_ENTER(et); 2955 inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, 2956 addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL); 2957 NET_EPOCH_EXIT(et); 2958 if (inp != NULL) { 2959 if (error == 0) 2960 error = cr_canseeinpcb(req->td->td_ucred, inp); 2961 if (error == 0) 2962 cru2x(inp->inp_cred, &xuc); 2963 INP_RUNLOCK(inp); 2964 } else 2965 error = ENOENT; 2966 if (error == 0) 2967 error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); 2968 return (error); 2969 } 2970 2971 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, 2972 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT, 2973 0, 0, tcp_getcred, "S,xucred", 2974 "Get the xucred of a TCP connection"); 2975 #endif /* INET */ 2976 2977 #ifdef INET6 2978 static int 2979 tcp6_getcred(SYSCTL_HANDLER_ARGS) 2980 { 2981 struct epoch_tracker et; 2982 struct xucred xuc; 2983 struct sockaddr_in6 addrs[2]; 2984 struct inpcb *inp; 2985 int error; 2986 #ifdef INET 2987 int mapped = 0; 2988 #endif 2989 2990 if (req->newptr == NULL) 2991 return (EINVAL); 2992 error = priv_check(req->td, PRIV_NETINET_GETCRED); 2993 if (error) 2994 return (error); 2995 error = SYSCTL_IN(req, addrs, sizeof(addrs)); 2996 if (error) 2997 return (error); 2998 if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || 2999 (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { 3000 return (error); 3001 } 3002 if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { 3003 #ifdef INET 3004 if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) 3005 mapped = 1; 3006 else 3007 #endif 3008 return (EINVAL); 3009 } 3010 3011 NET_EPOCH_ENTER(et); 3012 #ifdef INET 3013 if (mapped == 1) 3014 inp = in_pcblookup(&V_tcbinfo, 3015 *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], 3016 addrs[1].sin6_port, 3017 *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], 3018 addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); 3019 else 3020 #endif 3021 inp = in6_pcblookup(&V_tcbinfo, 3022 &addrs[1].sin6_addr, addrs[1].sin6_port, 3023 &addrs[0].sin6_addr, addrs[0].sin6_port, 3024 INPLOOKUP_RLOCKPCB, NULL); 3025 NET_EPOCH_EXIT(et); 3026 if (inp != NULL) { 3027 if (error == 0) 3028 error = cr_canseeinpcb(req->td->td_ucred, inp); 3029 if (error == 0) 3030 cru2x(inp->inp_cred, &xuc); 3031 INP_RUNLOCK(inp); 3032 } else 3033 error = ENOENT; 3034 if (error == 0) 3035 error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); 3036 return (error); 3037 } 3038 3039 SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, 3040 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT, 3041 0, 0, tcp6_getcred, "S,xucred", 3042 "Get the xucred of a TCP6 connection"); 3043 #endif /* INET6 */ 3044 3045 #ifdef INET 3046 /* Path 
MTU to try next when a fragmentation-needed message is received. */ 3047 static inline int 3048 tcp_next_pmtu(const struct icmp *icp, const struct ip *ip) 3049 { 3050 int mtu = ntohs(icp->icmp_nextmtu); 3051 3052 /* If no alternative MTU was proposed, try the next smaller one. */ 3053 if (!mtu) 3054 mtu = ip_next_mtu(ntohs(ip->ip_len), 1); 3055 if (mtu < V_tcp_minmss + sizeof(struct tcpiphdr)) 3056 mtu = V_tcp_minmss + sizeof(struct tcpiphdr); 3057 3058 return (mtu); 3059 } 3060 3061 static void 3062 tcp_ctlinput_with_port(struct icmp *icp, uint16_t port) 3063 { 3064 struct ip *ip; 3065 struct tcphdr *th; 3066 struct inpcb *inp; 3067 struct tcpcb *tp; 3068 struct inpcb *(*notify)(struct inpcb *, int); 3069 struct in_conninfo inc; 3070 tcp_seq icmp_tcp_seq; 3071 int errno, mtu; 3072 3073 errno = icmp_errmap(icp); 3074 switch (errno) { 3075 case 0: 3076 return; 3077 case EMSGSIZE: 3078 notify = tcp_mtudisc_notify; 3079 break; 3080 case ECONNREFUSED: 3081 if (V_icmp_may_rst) 3082 notify = tcp_drop_syn_sent; 3083 else 3084 notify = tcp_notify; 3085 break; 3086 case EHOSTUNREACH: 3087 if (V_icmp_may_rst && icp->icmp_type == ICMP_TIMXCEED) 3088 notify = tcp_drop_syn_sent; 3089 else 3090 notify = tcp_notify; 3091 break; 3092 default: 3093 notify = tcp_notify; 3094 } 3095 3096 ip = &icp->icmp_ip; 3097 th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); 3098 icmp_tcp_seq = th->th_seq; 3099 inp = in_pcblookup(&V_tcbinfo, ip->ip_dst, th->th_dport, ip->ip_src, 3100 th->th_sport, INPLOOKUP_WLOCKPCB, NULL); 3101 if (inp != NULL) { 3102 tp = intotcpcb(inp); 3103 #ifdef TCP_OFFLOAD 3104 if (tp->t_flags & TF_TOE && errno == EMSGSIZE) { 3105 /* 3106 * MTU discovery for offloaded connections. Let 3107 * the TOE driver verify seq# and process it. 3108 */ 3109 mtu = tcp_next_pmtu(icp, ip); 3110 tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu); 3111 goto out; 3112 } 3113 #endif 3114 if (tp->t_port != port) 3115 goto out; 3116 if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) && 3117 SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) { 3118 if (errno == EMSGSIZE) { 3119 /* 3120 * MTU discovery: we got a needfrag and 3121 * will potentially try a lower MTU. 3122 */ 3123 mtu = tcp_next_pmtu(icp, ip); 3124 3125 /* 3126 * Only process the offered MTU if it 3127 * is smaller than the current one. 
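A larger value adds no new constraint on what we already send and is simply ignored.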
3128 */ 3129 if (mtu < tp->t_maxseg + 3130 sizeof(struct tcpiphdr)) { 3131 bzero(&inc, sizeof(inc)); 3132 inc.inc_faddr = ip->ip_dst; 3133 inc.inc_fibnum = 3134 inp->inp_inc.inc_fibnum; 3135 tcp_hc_updatemtu(&inc, mtu); 3136 inp = tcp_mtudisc(inp, mtu); 3137 } 3138 } else 3139 inp = (*notify)(inp, errno); 3140 } 3141 } else { 3142 bzero(&inc, sizeof(inc)); 3143 inc.inc_fport = th->th_dport; 3144 inc.inc_lport = th->th_sport; 3145 inc.inc_faddr = ip->ip_dst; 3146 inc.inc_laddr = ip->ip_src; 3147 syncache_unreach(&inc, icmp_tcp_seq, port); 3148 } 3149 out: 3150 if (inp != NULL) 3151 INP_WUNLOCK(inp); 3152 } 3153 3154 static void 3155 tcp_ctlinput(struct icmp *icmp) 3156 { 3157 tcp_ctlinput_with_port(icmp, htons(0)); 3158 } 3159 3160 static void 3161 tcp_ctlinput_viaudp(udp_tun_icmp_param_t param) 3162 { 3163 /* Its a tunneled TCP over UDP icmp */ 3164 struct icmp *icmp = param.icmp; 3165 struct ip *outer_ip, *inner_ip; 3166 struct udphdr *udp; 3167 struct tcphdr *th, ttemp; 3168 int i_hlen, o_len; 3169 uint16_t port; 3170 3171 outer_ip = (struct ip *)((caddr_t)icmp - sizeof(struct ip)); 3172 inner_ip = &icmp->icmp_ip; 3173 i_hlen = inner_ip->ip_hl << 2; 3174 o_len = ntohs(outer_ip->ip_len); 3175 if (o_len < 3176 (sizeof(struct ip) + 8 + i_hlen + sizeof(struct udphdr) + offsetof(struct tcphdr, th_ack))) { 3177 /* Not enough data present */ 3178 return; 3179 } 3180 /* Ok lets strip out the inner udphdr header by copying up on top of it the tcp hdr */ 3181 udp = (struct udphdr *)(((caddr_t)inner_ip) + i_hlen); 3182 if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) { 3183 return; 3184 } 3185 port = udp->uh_dport; 3186 th = (struct tcphdr *)(udp + 1); 3187 memcpy(&ttemp, th, sizeof(struct tcphdr)); 3188 memcpy(udp, &ttemp, sizeof(struct tcphdr)); 3189 /* Now adjust down the size of the outer IP header */ 3190 o_len -= sizeof(struct udphdr); 3191 outer_ip->ip_len = htons(o_len); 3192 /* Now call in to the normal handling code */ 3193 tcp_ctlinput_with_port(icmp, port); 3194 } 3195 #endif /* INET */ 3196 3197 #ifdef INET6 3198 static inline int 3199 tcp6_next_pmtu(const struct icmp6_hdr *icmp6) 3200 { 3201 int mtu = ntohl(icmp6->icmp6_mtu); 3202 3203 /* 3204 * If no alternative MTU was proposed, or the proposed MTU was too 3205 * small, set to the min. 
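IPV6_MMTU (1280 bytes) is the smallest MTU every IPv6 link is required to support.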
3206 */ 3207 if (mtu < IPV6_MMTU) 3208 mtu = IPV6_MMTU; 3209 return (mtu); 3210 } 3211 3212 static void 3213 tcp6_ctlinput_with_port(struct ip6ctlparam *ip6cp, uint16_t port) 3214 { 3215 struct in6_addr *dst; 3216 struct inpcb *(*notify)(struct inpcb *, int); 3217 struct ip6_hdr *ip6; 3218 struct mbuf *m; 3219 struct inpcb *inp; 3220 struct tcpcb *tp; 3221 struct icmp6_hdr *icmp6; 3222 struct in_conninfo inc; 3223 struct tcp_ports { 3224 uint16_t th_sport; 3225 uint16_t th_dport; 3226 } t_ports; 3227 tcp_seq icmp_tcp_seq; 3228 unsigned int mtu; 3229 unsigned int off; 3230 int errno; 3231 3232 icmp6 = ip6cp->ip6c_icmp6; 3233 m = ip6cp->ip6c_m; 3234 ip6 = ip6cp->ip6c_ip6; 3235 off = ip6cp->ip6c_off; 3236 dst = &ip6cp->ip6c_finaldst->sin6_addr; 3237 3238 errno = icmp6_errmap(icmp6); 3239 switch (errno) { 3240 case 0: 3241 return; 3242 case EMSGSIZE: 3243 notify = tcp_mtudisc_notify; 3244 break; 3245 case ECONNREFUSED: 3246 if (V_icmp_may_rst) 3247 notify = tcp_drop_syn_sent; 3248 else 3249 notify = tcp_notify; 3250 break; 3251 case EHOSTUNREACH: 3252 /* 3253 * There are only four ICMPs that may reset connection: 3254 * - administratively prohibited 3255 * - port unreachable 3256 * - time exceeded in transit 3257 * - unknown next header 3258 */ 3259 if (V_icmp_may_rst && 3260 ((icmp6->icmp6_type == ICMP6_DST_UNREACH && 3261 (icmp6->icmp6_code == ICMP6_DST_UNREACH_ADMIN || 3262 icmp6->icmp6_code == ICMP6_DST_UNREACH_NOPORT)) || 3263 (icmp6->icmp6_type == ICMP6_TIME_EXCEEDED && 3264 icmp6->icmp6_code == ICMP6_TIME_EXCEED_TRANSIT) || 3265 (icmp6->icmp6_type == ICMP6_PARAM_PROB && 3266 icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER))) 3267 notify = tcp_drop_syn_sent; 3268 else 3269 notify = tcp_notify; 3270 break; 3271 default: 3272 notify = tcp_notify; 3273 } 3274 3275 /* Check if we can safely get the ports from the tcp hdr */ 3276 if (m == NULL || 3277 (m->m_pkthdr.len < 3278 (int32_t) (off + sizeof(struct tcp_ports)))) { 3279 return; 3280 } 3281 bzero(&t_ports, sizeof(struct tcp_ports)); 3282 m_copydata(m, off, sizeof(struct tcp_ports), (caddr_t)&t_ports); 3283 inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, t_ports.th_dport, 3284 &ip6->ip6_src, t_ports.th_sport, INPLOOKUP_WLOCKPCB, NULL); 3285 off += sizeof(struct tcp_ports); 3286 if (m->m_pkthdr.len < (int32_t) (off + sizeof(tcp_seq))) { 3287 goto out; 3288 } 3289 m_copydata(m, off, sizeof(tcp_seq), (caddr_t)&icmp_tcp_seq); 3290 if (inp != NULL) { 3291 tp = intotcpcb(inp); 3292 #ifdef TCP_OFFLOAD 3293 if (tp->t_flags & TF_TOE && errno == EMSGSIZE) { 3294 /* MTU discovery for offloaded connections. */ 3295 mtu = tcp6_next_pmtu(icmp6); 3296 tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu); 3297 goto out; 3298 } 3299 #endif 3300 if (tp->t_port != port) 3301 goto out; 3302 if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) && 3303 SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) { 3304 if (errno == EMSGSIZE) { 3305 /* 3306 * MTU discovery: 3307 * If we got a needfrag set the MTU 3308 * in the route to the suggested new 3309 * value (if given) and then notify. 3310 */ 3311 mtu = tcp6_next_pmtu(icmp6); 3312 3313 bzero(&inc, sizeof(inc)); 3314 inc.inc_fibnum = M_GETFIB(m); 3315 inc.inc_flags |= INC_ISIPV6; 3316 inc.inc6_faddr = *dst; 3317 if (in6_setscope(&inc.inc6_faddr, 3318 m->m_pkthdr.rcvif, NULL)) 3319 goto out; 3320 /* 3321 * Only process the offered MTU if it 3322 * is smaller than the current one. 
3323 */ 3324 if (mtu < tp->t_maxseg + 3325 sizeof (struct tcphdr) + 3326 sizeof (struct ip6_hdr)) { 3327 tcp_hc_updatemtu(&inc, mtu); 3328 tcp_mtudisc(inp, mtu); 3329 ICMP6STAT_INC(icp6s_pmtuchg); 3330 } 3331 } else 3332 inp = (*notify)(inp, errno); 3333 } 3334 } else { 3335 bzero(&inc, sizeof(inc)); 3336 inc.inc_fibnum = M_GETFIB(m); 3337 inc.inc_flags |= INC_ISIPV6; 3338 inc.inc_fport = t_ports.th_dport; 3339 inc.inc_lport = t_ports.th_sport; 3340 inc.inc6_faddr = *dst; 3341 inc.inc6_laddr = ip6->ip6_src; 3342 syncache_unreach(&inc, icmp_tcp_seq, port); 3343 } 3344 out: 3345 if (inp != NULL) 3346 INP_WUNLOCK(inp); 3347 } 3348 3349 static void 3350 tcp6_ctlinput(struct ip6ctlparam *ctl) 3351 { 3352 tcp6_ctlinput_with_port(ctl, htons(0)); 3353 } 3354 3355 static void 3356 tcp6_ctlinput_viaudp(udp_tun_icmp_param_t param) 3357 { 3358 struct ip6ctlparam *ip6cp = param.ip6cp; 3359 struct mbuf *m; 3360 struct udphdr *udp; 3361 uint16_t port; 3362 3363 m = m_pulldown(ip6cp->ip6c_m, ip6cp->ip6c_off, sizeof(struct udphdr), NULL); 3364 if (m == NULL) { 3365 return; 3366 } 3367 udp = mtod(m, struct udphdr *); 3368 if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) { 3369 return; 3370 } 3371 port = udp->uh_dport; 3372 m_adj(m, sizeof(struct udphdr)); 3373 if ((m->m_flags & M_PKTHDR) == 0) { 3374 ip6cp->ip6c_m->m_pkthdr.len -= sizeof(struct udphdr); 3375 } 3376 /* Now call in to the normal handling code */ 3377 tcp6_ctlinput_with_port(ip6cp, port); 3378 } 3379 3380 #endif /* INET6 */ 3381 3382 static uint32_t 3383 tcp_keyed_hash(struct in_conninfo *inc, u_char *key, u_int len) 3384 { 3385 SIPHASH_CTX ctx; 3386 uint32_t hash[2]; 3387 3388 KASSERT(len >= SIPHASH_KEY_LENGTH, 3389 ("%s: keylen %u too short ", __func__, len)); 3390 SipHash24_Init(&ctx); 3391 SipHash_SetKey(&ctx, (uint8_t *)key); 3392 SipHash_Update(&ctx, &inc->inc_fport, sizeof(uint16_t)); 3393 SipHash_Update(&ctx, &inc->inc_lport, sizeof(uint16_t)); 3394 switch (inc->inc_flags & INC_ISIPV6) { 3395 #ifdef INET 3396 case 0: 3397 SipHash_Update(&ctx, &inc->inc_faddr, sizeof(struct in_addr)); 3398 SipHash_Update(&ctx, &inc->inc_laddr, sizeof(struct in_addr)); 3399 break; 3400 #endif 3401 #ifdef INET6 3402 case INC_ISIPV6: 3403 SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(struct in6_addr)); 3404 SipHash_Update(&ctx, &inc->inc6_laddr, sizeof(struct in6_addr)); 3405 break; 3406 #endif 3407 } 3408 SipHash_Final((uint8_t *)hash, &ctx); 3409 3410 return (hash[0] ^ hash[1]); 3411 } 3412 3413 uint32_t 3414 tcp_new_ts_offset(struct in_conninfo *inc) 3415 { 3416 struct in_conninfo inc_store, *local_inc; 3417 3418 if (!V_tcp_ts_offset_per_conn) { 3419 memcpy(&inc_store, inc, sizeof(struct in_conninfo)); 3420 inc_store.inc_lport = 0; 3421 inc_store.inc_fport = 0; 3422 local_inc = &inc_store; 3423 } else { 3424 local_inc = inc; 3425 } 3426 return (tcp_keyed_hash(local_inc, V_ts_offset_secret, 3427 sizeof(V_ts_offset_secret))); 3428 } 3429 3430 /* 3431 * Following is where TCP initial sequence number generation occurs. 3432 * 3433 * There are two places where we must use initial sequence numbers: 3434 * 1. In SYN-ACK packets. 3435 * 2. In SYN packets. 3436 * 3437 * All ISNs for SYN-ACK packets are generated by the syncache. See 3438 * tcp_syncache.c for details. 3439 * 3440 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling 3441 * depends on this property. In addition, these ISNs should be 3442 * unguessable so as to prevent connection hijacking. 
To satisfy 3443 * the requirements of this situation, the algorithm outlined in 3444 * RFC 1948 is used, with only small modifications. 3445 * 3446 * Implementation details: 3447 * 3448 * Time is based off the system timer, and is corrected so that it 3449 * increases by one megabyte per second. This allows for proper 3450 * recycling on high speed LANs while still leaving over an hour 3451 * before rollover. 3452 * 3453 * As reading the *exact* system time is too expensive to be done 3454 * whenever setting up a TCP connection, we increment the time 3455 * offset in two ways. First, a small random positive increment 3456 * is added to isn_offset for each connection that is set up. 3457 * Second, the function tcp_isn_tick fires once per clock tick 3458 * and increments isn_offset as necessary so that sequence numbers 3459 * are incremented at approximately ISN_BYTES_PER_SECOND. The 3460 * random positive increments serve only to ensure that the same 3461 * exact sequence number is never sent out twice (as could otherwise 3462 * happen when a port is recycled in less than the system tick 3463 * interval.) 3464 * 3465 * net.inet.tcp.isn_reseed_interval controls the number of seconds 3466 * between seeding of isn_secret. This is normally set to zero, 3467 * as reseeding should not be necessary. 3468 * 3469 * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, 3470 * isn_offset_old, and isn_ctx is performed using the ISN lock. In 3471 * general, this means holding an exclusive (write) lock. 3472 */ 3473 3474 #define ISN_BYTES_PER_SECOND 1048576 3475 #define ISN_STATIC_INCREMENT 4096 3476 #define ISN_RANDOM_INCREMENT (4096 - 1) 3477 #define ISN_SECRET_LENGTH SIPHASH_KEY_LENGTH 3478 3479 VNET_DEFINE_STATIC(u_char, isn_secret[ISN_SECRET_LENGTH]); 3480 VNET_DEFINE_STATIC(int, isn_last); 3481 VNET_DEFINE_STATIC(int, isn_last_reseed); 3482 VNET_DEFINE_STATIC(u_int32_t, isn_offset); 3483 VNET_DEFINE_STATIC(u_int32_t, isn_offset_old); 3484 3485 #define V_isn_secret VNET(isn_secret) 3486 #define V_isn_last VNET(isn_last) 3487 #define V_isn_last_reseed VNET(isn_last_reseed) 3488 #define V_isn_offset VNET(isn_offset) 3489 #define V_isn_offset_old VNET(isn_offset_old) 3490 3491 tcp_seq 3492 tcp_new_isn(struct in_conninfo *inc) 3493 { 3494 tcp_seq new_isn; 3495 u_int32_t projected_offset; 3496 3497 ISN_LOCK(); 3498 /* Seed if this is the first use, reseed if requested. */ 3499 if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && 3500 (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) 3501 < (u_int)ticks))) { 3502 arc4rand(&V_isn_secret, sizeof(V_isn_secret), 0); 3503 V_isn_last_reseed = ticks; 3504 } 3505 3506 /* Compute the hash and return the ISN. */ 3507 new_isn = (tcp_seq)tcp_keyed_hash(inc, V_isn_secret, 3508 sizeof(V_isn_secret)); 3509 V_isn_offset += ISN_STATIC_INCREMENT + 3510 (arc4random() & ISN_RANDOM_INCREMENT); 3511 if (ticks != V_isn_last) { 3512 projected_offset = V_isn_offset_old + 3513 ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last); 3514 if (SEQ_GT(projected_offset, V_isn_offset)) 3515 V_isn_offset = projected_offset; 3516 V_isn_offset_old = V_isn_offset; 3517 V_isn_last = ticks; 3518 } 3519 new_isn += V_isn_offset; 3520 ISN_UNLOCK(); 3521 return (new_isn); 3522 } 3523 3524 /* 3525 * When a specific ICMP unreachable message is received and the 3526 * connection state is SYN-SENT, drop the connection. This behavior 3527 * is controlled by the icmp_may_rst sysctl. 
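Dropping right away spares the caller from waiting out the full retransmission schedule for a connection attempt the network has already refused.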
3528 */ 3529 static struct inpcb * 3530 tcp_drop_syn_sent(struct inpcb *inp, int errno) 3531 { 3532 struct tcpcb *tp; 3533 3534 NET_EPOCH_ASSERT(); 3535 INP_WLOCK_ASSERT(inp); 3536 3537 tp = intotcpcb(inp); 3538 if (tp->t_state != TCPS_SYN_SENT) 3539 return (inp); 3540 3541 if (tp->t_flags & TF_FASTOPEN) 3542 tcp_fastopen_disable_path(tp); 3543 3544 tp = tcp_drop(tp, errno); 3545 if (tp != NULL) 3546 return (inp); 3547 else 3548 return (NULL); 3549 } 3550 3551 /* 3552 * When `need fragmentation' ICMP is received, update our idea of the MSS 3553 * based on the new value. Also nudge TCP to send something, since we 3554 * know the packet we just sent was dropped. 3555 * This duplicates some code in the tcp_mss() function in tcp_input.c. 3556 */ 3557 static struct inpcb * 3558 tcp_mtudisc_notify(struct inpcb *inp, int error) 3559 { 3560 3561 return (tcp_mtudisc(inp, -1)); 3562 } 3563 3564 static struct inpcb * 3565 tcp_mtudisc(struct inpcb *inp, int mtuoffer) 3566 { 3567 struct tcpcb *tp; 3568 struct socket *so; 3569 3570 INP_WLOCK_ASSERT(inp); 3571 3572 tp = intotcpcb(inp); 3573 KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); 3574 3575 tcp_mss_update(tp, -1, mtuoffer, NULL, NULL); 3576 3577 so = inp->inp_socket; 3578 SOCK_SENDBUF_LOCK(so); 3579 /* If the mss is larger than the socket buffer, decrease the mss. */ 3580 if (so->so_snd.sb_hiwat < tp->t_maxseg) { 3581 tp->t_maxseg = so->so_snd.sb_hiwat; 3582 if (tp->t_maxseg < V_tcp_mssdflt) { 3583 /* 3584 * The MSS is so small we should not process incoming 3585 * SACK's since we are subject to attack in such a 3586 * case. 3587 */ 3588 tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; 3589 } else { 3590 tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; 3591 } 3592 } 3593 SOCK_SENDBUF_UNLOCK(so); 3594 3595 TCPSTAT_INC(tcps_mturesent); 3596 tp->t_rtttime = 0; 3597 tp->snd_nxt = tp->snd_una; 3598 tcp_free_sackholes(tp); 3599 tp->snd_recover = tp->snd_max; 3600 if (tp->t_flags & TF_SACK_PERMIT) 3601 EXIT_FASTRECOVERY(tp->t_flags); 3602 if (tp->t_fb->tfb_tcp_mtu_chg != NULL) { 3603 /* 3604 * Conceptually the snd_nxt setting 3605 * and freeing sack holes should 3606 * be done by the default stacks 3607 * own tfb_tcp_mtu_chg(). 3608 */ 3609 tp->t_fb->tfb_tcp_mtu_chg(tp); 3610 } 3611 if (tcp_output(tp) < 0) 3612 return (NULL); 3613 else 3614 return (inp); 3615 } 3616 3617 #ifdef INET 3618 /* 3619 * Look-up the routing entry to the peer of this inpcb. If no route 3620 * is found and it cannot be allocated, then return 0. This routine 3621 * is called by TCP routines that access the rmx structure and by 3622 * tcp_mss_update to get the peer/interface MTU. 3623 */ 3624 uint32_t 3625 tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) 3626 { 3627 struct nhop_object *nh; 3628 struct ifnet *ifp; 3629 uint32_t maxmtu = 0; 3630 3631 KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); 3632 3633 if (inc->inc_faddr.s_addr != INADDR_ANY) { 3634 nh = fib4_lookup(inc->inc_fibnum, inc->inc_faddr, 0, NHR_NONE, 0); 3635 if (nh == NULL) 3636 return (0); 3637 3638 ifp = nh->nh_ifp; 3639 maxmtu = nh->nh_mtu; 3640 3641 /* Report additional interface capabilities. 
*/ 3642 if (cap != NULL) { 3643 if (ifp->if_capenable & IFCAP_TSO4 && 3644 ifp->if_hwassist & CSUM_TSO) { 3645 cap->ifcap |= CSUM_TSO; 3646 cap->tsomax = ifp->if_hw_tsomax; 3647 cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; 3648 cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; 3649 /* XXXKIB IFCAP2_IPSEC_OFFLOAD_TSO */ 3650 cap->ipsec_tso = (ifp->if_capenable2 & 3651 IFCAP2_BIT(IFCAP2_IPSEC_OFFLOAD)) != 0; 3652 } 3653 } 3654 } 3655 return (maxmtu); 3656 } 3657 #endif /* INET */ 3658 3659 #ifdef INET6 3660 uint32_t 3661 tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) 3662 { 3663 struct nhop_object *nh; 3664 struct in6_addr dst6; 3665 uint32_t scopeid; 3666 struct ifnet *ifp; 3667 uint32_t maxmtu = 0; 3668 3669 KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); 3670 3671 if (inc->inc_flags & INC_IPV6MINMTU) 3672 return (IPV6_MMTU); 3673 3674 if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { 3675 in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid); 3676 nh = fib6_lookup(inc->inc_fibnum, &dst6, scopeid, NHR_NONE, 0); 3677 if (nh == NULL) 3678 return (0); 3679 3680 ifp = nh->nh_ifp; 3681 maxmtu = nh->nh_mtu; 3682 3683 /* Report additional interface capabilities. */ 3684 if (cap != NULL) { 3685 if (ifp->if_capenable & IFCAP_TSO6 && 3686 ifp->if_hwassist & CSUM_TSO) { 3687 cap->ifcap |= CSUM_TSO; 3688 cap->tsomax = ifp->if_hw_tsomax; 3689 cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; 3690 cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; 3691 cap->ipsec_tso = false; /* XXXKIB */ 3692 } 3693 } 3694 } 3695 3696 return (maxmtu); 3697 } 3698 3699 /* 3700 * Handle setsockopt(IPV6_USE_MIN_MTU) by a TCP stack. 3701 * 3702 * XXXGL: we are updating inpcb here with INC_IPV6MINMTU flag. 3703 * The right place to do that is ip6_setpktopt() that has just been 3704 * executed. By the way it just filled ip6po_minmtu for us. 3705 */ 3706 void 3707 tcp6_use_min_mtu(struct tcpcb *tp) 3708 { 3709 struct inpcb *inp = tptoinpcb(tp); 3710 3711 INP_WLOCK_ASSERT(inp); 3712 /* 3713 * In case of the IPV6_USE_MIN_MTU socket 3714 * option, the INC_IPV6MINMTU flag to announce 3715 * a corresponding MSS during the initial 3716 * handshake. If the TCP connection is not in 3717 * the front states, just reduce the MSS being 3718 * used. This avoids the sending of TCP 3719 * segments which will be fragmented at the 3720 * IPv6 layer. 3721 */ 3722 inp->inp_inc.inc_flags |= INC_IPV6MINMTU; 3723 if ((tp->t_state >= TCPS_SYN_SENT) && 3724 (inp->inp_inc.inc_flags & INC_ISIPV6)) { 3725 struct ip6_pktopts *opt; 3726 3727 opt = inp->in6p_outputopts; 3728 if (opt != NULL && opt->ip6po_minmtu == IP6PO_MINMTU_ALL && 3729 tp->t_maxseg > TCP6_MSS) { 3730 tp->t_maxseg = TCP6_MSS; 3731 if (tp->t_maxseg < V_tcp_mssdflt) { 3732 /* 3733 * The MSS is so small we should not process incoming 3734 * SACK's since we are subject to attack in such a 3735 * case. 3736 */ 3737 tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; 3738 } else { 3739 tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; 3740 } 3741 } 3742 } 3743 } 3744 #endif /* INET6 */ 3745 3746 /* 3747 * Calculate effective SMSS per RFC5681 definition for a given TCP 3748 * connection at its current state, taking into account SACK and etc. 3749 */ 3750 u_int 3751 tcp_maxseg(const struct tcpcb *tp) 3752 { 3753 u_int optlen; 3754 3755 if (tp->t_flags & TF_NOOPT) 3756 return (tp->t_maxseg); 3757 3758 /* 3759 * Here we have a simplified code from tcp_addoptions(), 3760 * without a proper loop, and having most of paddings hardcoded. 
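For example, on an established connection with timestamps in use and no SACK blocks to send, optlen is TCPOLEN_TSTAMP_APPA (12 bytes), so the value returned is t_maxseg minus 12.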
3761 * We might make mistakes with padding here in some edge cases, 3762 * but this is harmless, since result of tcp_maxseg() is used 3763 * only in cwnd and ssthresh estimations. 3764 */ 3765 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 3766 if (tp->t_flags & TF_RCVD_TSTMP) 3767 optlen = TCPOLEN_TSTAMP_APPA; 3768 else 3769 optlen = 0; 3770 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 3771 if (tp->t_flags & TF_SIGNATURE) 3772 optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); 3773 #endif 3774 if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { 3775 optlen += TCPOLEN_SACKHDR; 3776 optlen += tp->rcv_numsacks * TCPOLEN_SACK; 3777 optlen = PADTCPOLEN(optlen); 3778 } 3779 } else { 3780 if (tp->t_flags & TF_REQ_TSTMP) 3781 optlen = TCPOLEN_TSTAMP_APPA; 3782 else 3783 optlen = PADTCPOLEN(TCPOLEN_MAXSEG); 3784 if (tp->t_flags & TF_REQ_SCALE) 3785 optlen += PADTCPOLEN(TCPOLEN_WINDOW); 3786 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 3787 if (tp->t_flags & TF_SIGNATURE) 3788 optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); 3789 #endif 3790 if (tp->t_flags & TF_SACK_PERMIT) 3791 optlen += PADTCPOLEN(TCPOLEN_SACK_PERMITTED); 3792 } 3793 optlen = min(optlen, TCP_MAXOLEN); 3794 return (tp->t_maxseg - optlen); 3795 } 3796 3797 3798 u_int 3799 tcp_fixed_maxseg(const struct tcpcb *tp) 3800 { 3801 int optlen; 3802 3803 if (tp->t_flags & TF_NOOPT) 3804 return (tp->t_maxseg); 3805 3806 /* 3807 * Here we have a simplified code from tcp_addoptions(), 3808 * without a proper loop, and having most of paddings hardcoded. 3809 * We only consider fixed options that we would send every 3810 * time I.e. SACK is not considered. This is important 3811 * for cc modules to figure out what the modulo of the 3812 * cwnd should be. 3813 */ 3814 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 3815 if (tp->t_flags & TF_RCVD_TSTMP) 3816 optlen = TCPOLEN_TSTAMP_APPA; 3817 else 3818 optlen = 0; 3819 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 3820 if (tp->t_flags & TF_SIGNATURE) 3821 optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); 3822 #endif 3823 } else { 3824 if (tp->t_flags & TF_REQ_TSTMP) 3825 optlen = TCPOLEN_TSTAMP_APPA; 3826 else 3827 optlen = PADTCPOLEN(TCPOLEN_MAXSEG); 3828 if (tp->t_flags & TF_REQ_SCALE) 3829 optlen += PADTCPOLEN(TCPOLEN_WINDOW); 3830 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 3831 if (tp->t_flags & TF_SIGNATURE) 3832 optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); 3833 #endif 3834 if (tp->t_flags & TF_SACK_PERMIT) 3835 optlen += PADTCPOLEN(TCPOLEN_SACK_PERMITTED); 3836 } 3837 optlen = min(optlen, TCP_MAXOLEN); 3838 return (tp->t_maxseg - optlen); 3839 } 3840 3841 3842 3843 static int 3844 sysctl_drop(SYSCTL_HANDLER_ARGS) 3845 { 3846 /* addrs[0] is a foreign socket, addrs[1] is a local one. 
*/ 3847 struct sockaddr_storage addrs[2]; 3848 struct inpcb *inp; 3849 struct tcpcb *tp; 3850 #ifdef INET 3851 struct sockaddr_in *fin = NULL, *lin = NULL; 3852 #endif 3853 struct epoch_tracker et; 3854 #ifdef INET6 3855 struct sockaddr_in6 *fin6, *lin6; 3856 #endif 3857 int error; 3858 3859 inp = NULL; 3860 #ifdef INET6 3861 fin6 = lin6 = NULL; 3862 #endif 3863 error = 0; 3864 3865 if (req->oldptr != NULL || req->oldlen != 0) 3866 return (EINVAL); 3867 if (req->newptr == NULL) 3868 return (EPERM); 3869 if (req->newlen < sizeof(addrs)) 3870 return (ENOMEM); 3871 error = SYSCTL_IN(req, &addrs, sizeof(addrs)); 3872 if (error) 3873 return (error); 3874 3875 switch (addrs[0].ss_family) { 3876 #ifdef INET6 3877 case AF_INET6: 3878 fin6 = (struct sockaddr_in6 *)&addrs[0]; 3879 lin6 = (struct sockaddr_in6 *)&addrs[1]; 3880 if (fin6->sin6_len != sizeof(struct sockaddr_in6) || 3881 lin6->sin6_len != sizeof(struct sockaddr_in6)) 3882 return (EINVAL); 3883 if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { 3884 if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) 3885 return (EINVAL); 3886 in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); 3887 in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); 3888 #ifdef INET 3889 fin = (struct sockaddr_in *)&addrs[0]; 3890 lin = (struct sockaddr_in *)&addrs[1]; 3891 #endif 3892 break; 3893 } 3894 error = sa6_embedscope(fin6, V_ip6_use_defzone); 3895 if (error) 3896 return (error); 3897 error = sa6_embedscope(lin6, V_ip6_use_defzone); 3898 if (error) 3899 return (error); 3900 break; 3901 #endif 3902 #ifdef INET 3903 case AF_INET: 3904 fin = (struct sockaddr_in *)&addrs[0]; 3905 lin = (struct sockaddr_in *)&addrs[1]; 3906 if (fin->sin_len != sizeof(struct sockaddr_in) || 3907 lin->sin_len != sizeof(struct sockaddr_in)) 3908 return (EINVAL); 3909 break; 3910 #endif 3911 default: 3912 return (EINVAL); 3913 } 3914 NET_EPOCH_ENTER(et); 3915 switch (addrs[0].ss_family) { 3916 #ifdef INET6 3917 case AF_INET6: 3918 inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, 3919 fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, 3920 INPLOOKUP_WLOCKPCB, NULL); 3921 break; 3922 #endif 3923 #ifdef INET 3924 case AF_INET: 3925 inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, 3926 lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); 3927 break; 3928 #endif 3929 } 3930 if (inp != NULL) { 3931 if (!SOLISTENING(inp->inp_socket)) { 3932 tp = intotcpcb(inp); 3933 tp = tcp_drop(tp, ECONNABORTED); 3934 if (tp != NULL) 3935 INP_WUNLOCK(inp); 3936 } else 3937 INP_WUNLOCK(inp); 3938 } else 3939 error = ESRCH; 3940 NET_EPOCH_EXIT(et); 3941 return (error); 3942 } 3943 3944 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, 3945 CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | 3946 CTLFLAG_NEEDGIANT, NULL, 0, sysctl_drop, "", 3947 "Drop TCP connection"); 3948 3949 static int 3950 tcp_sysctl_setsockopt(SYSCTL_HANDLER_ARGS) 3951 { 3952 return (sysctl_setsockopt(oidp, arg1, arg2, req, &V_tcbinfo, 3953 &tcp_ctloutput_set)); 3954 } 3955 3956 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, setsockopt, 3957 CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | 3958 CTLFLAG_MPSAFE, NULL, 0, tcp_sysctl_setsockopt, "", 3959 "Set socket option for TCP endpoint"); 3960 3961 #ifdef KERN_TLS 3962 static int 3963 sysctl_switch_tls(SYSCTL_HANDLER_ARGS) 3964 { 3965 /* addrs[0] is a foreign socket, addrs[1] is a local one. 
*/ 3966 struct sockaddr_storage addrs[2]; 3967 struct inpcb *inp; 3968 #ifdef INET 3969 struct sockaddr_in *fin = NULL, *lin = NULL; 3970 #endif 3971 struct epoch_tracker et; 3972 #ifdef INET6 3973 struct sockaddr_in6 *fin6, *lin6; 3974 #endif 3975 int error; 3976 3977 inp = NULL; 3978 #ifdef INET6 3979 fin6 = lin6 = NULL; 3980 #endif 3981 error = 0; 3982 3983 if (req->oldptr != NULL || req->oldlen != 0) 3984 return (EINVAL); 3985 if (req->newptr == NULL) 3986 return (EPERM); 3987 if (req->newlen < sizeof(addrs)) 3988 return (ENOMEM); 3989 error = SYSCTL_IN(req, &addrs, sizeof(addrs)); 3990 if (error) 3991 return (error); 3992 3993 switch (addrs[0].ss_family) { 3994 #ifdef INET6 3995 case AF_INET6: 3996 fin6 = (struct sockaddr_in6 *)&addrs[0]; 3997 lin6 = (struct sockaddr_in6 *)&addrs[1]; 3998 if (fin6->sin6_len != sizeof(struct sockaddr_in6) || 3999 lin6->sin6_len != sizeof(struct sockaddr_in6)) 4000 return (EINVAL); 4001 if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { 4002 if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) 4003 return (EINVAL); 4004 in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); 4005 in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); 4006 #ifdef INET 4007 fin = (struct sockaddr_in *)&addrs[0]; 4008 lin = (struct sockaddr_in *)&addrs[1]; 4009 #endif 4010 break; 4011 } 4012 error = sa6_embedscope(fin6, V_ip6_use_defzone); 4013 if (error) 4014 return (error); 4015 error = sa6_embedscope(lin6, V_ip6_use_defzone); 4016 if (error) 4017 return (error); 4018 break; 4019 #endif 4020 #ifdef INET 4021 case AF_INET: 4022 fin = (struct sockaddr_in *)&addrs[0]; 4023 lin = (struct sockaddr_in *)&addrs[1]; 4024 if (fin->sin_len != sizeof(struct sockaddr_in) || 4025 lin->sin_len != sizeof(struct sockaddr_in)) 4026 return (EINVAL); 4027 break; 4028 #endif 4029 default: 4030 return (EINVAL); 4031 } 4032 NET_EPOCH_ENTER(et); 4033 switch (addrs[0].ss_family) { 4034 #ifdef INET6 4035 case AF_INET6: 4036 inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, 4037 fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, 4038 INPLOOKUP_WLOCKPCB, NULL); 4039 break; 4040 #endif 4041 #ifdef INET 4042 case AF_INET: 4043 inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, 4044 lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); 4045 break; 4046 #endif 4047 } 4048 NET_EPOCH_EXIT(et); 4049 if (inp != NULL) { 4050 struct socket *so; 4051 4052 so = inp->inp_socket; 4053 soref(so); 4054 error = ktls_set_tx_mode(so, 4055 arg2 == 0 ? TCP_TLS_MODE_SW : TCP_TLS_MODE_IFNET); 4056 INP_WUNLOCK(inp); 4057 sorele(so); 4058 } else 4059 error = ESRCH; 4060 return (error); 4061 } 4062 4063 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_sw_tls, 4064 CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | 4065 CTLFLAG_NEEDGIANT, NULL, 0, sysctl_switch_tls, "", 4066 "Switch TCP connection to SW TLS"); 4067 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_ifnet_tls, 4068 CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | 4069 CTLFLAG_NEEDGIANT, NULL, 1, sysctl_switch_tls, "", 4070 "Switch TCP connection to ifnet TLS"); 4071 #endif 4072 4073 /* 4074 * Generate a standardized TCP log line for use throughout the 4075 * tcp subsystem. Memory allocation is done with M_NOWAIT to 4076 * allow use in the interrupt context. 4077 * 4078 * NB: The caller MUST free(s, M_TCPLOG) the returned string. 4079 * NB: The function may return NULL if memory allocation failed. 4080 * 4081 * Due to header inclusion and ordering limitations the struct ip 4082 * and ip6_hdr pointers have to be passed as void pointers. 
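* A typical line produced by tcp_log_addr() looks like (addresses
* are illustrative): "TCP: [192.0.2.1]:12345 to [192.0.2.2]:80 tcpflags 0x2<SYN>".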
4083 */ 4084 char * 4085 tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, 4086 const void *ip6hdr) 4087 { 4088 4089 /* Is logging enabled? */ 4090 if (V_tcp_log_in_vain == 0) 4091 return (NULL); 4092 4093 return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); 4094 } 4095 4096 char * 4097 tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, 4098 const void *ip6hdr) 4099 { 4100 4101 /* Is logging enabled? */ 4102 if (tcp_log_debug == 0) 4103 return (NULL); 4104 4105 return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); 4106 } 4107 4108 static char * 4109 tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, 4110 const void *ip6hdr) 4111 { 4112 char *s, *sp; 4113 size_t size; 4114 #ifdef INET 4115 const struct ip *ip = (const struct ip *)ip4hdr; 4116 #endif 4117 #ifdef INET6 4118 const struct ip6_hdr *ip6 = (const struct ip6_hdr *)ip6hdr; 4119 #endif /* INET6 */ 4120 4121 /* 4122 * The log line looks like this: 4123 * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2<SYN>" 4124 */ 4125 size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + 4126 sizeof(PRINT_TH_FLAGS) + 1 + 4127 #ifdef INET6 4128 2 * INET6_ADDRSTRLEN; 4129 #else 4130 2 * INET_ADDRSTRLEN; 4131 #endif /* INET6 */ 4132 4133 s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); 4134 if (s == NULL) 4135 return (NULL); 4136 4137 strcat(s, "TCP: ["); 4138 sp = s + strlen(s); 4139 4140 if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) { 4141 inet_ntoa_r(inc->inc_faddr, sp); 4142 sp = s + strlen(s); 4143 sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); 4144 sp = s + strlen(s); 4145 inet_ntoa_r(inc->inc_laddr, sp); 4146 sp = s + strlen(s); 4147 sprintf(sp, "]:%i", ntohs(inc->inc_lport)); 4148 #ifdef INET6 4149 } else if (inc) { 4150 ip6_sprintf(sp, &inc->inc6_faddr); 4151 sp = s + strlen(s); 4152 sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); 4153 sp = s + strlen(s); 4154 ip6_sprintf(sp, &inc->inc6_laddr); 4155 sp = s + strlen(s); 4156 sprintf(sp, "]:%i", ntohs(inc->inc_lport)); 4157 } else if (ip6 && th) { 4158 ip6_sprintf(sp, &ip6->ip6_src); 4159 sp = s + strlen(s); 4160 sprintf(sp, "]:%i to [", ntohs(th->th_sport)); 4161 sp = s + strlen(s); 4162 ip6_sprintf(sp, &ip6->ip6_dst); 4163 sp = s + strlen(s); 4164 sprintf(sp, "]:%i", ntohs(th->th_dport)); 4165 #endif /* INET6 */ 4166 #ifdef INET 4167 } else if (ip && th) { 4168 inet_ntoa_r(ip->ip_src, sp); 4169 sp = s + strlen(s); 4170 sprintf(sp, "]:%i to [", ntohs(th->th_sport)); 4171 sp = s + strlen(s); 4172 inet_ntoa_r(ip->ip_dst, sp); 4173 sp = s + strlen(s); 4174 sprintf(sp, "]:%i", ntohs(th->th_dport)); 4175 #endif /* INET */ 4176 } else { 4177 free(s, M_TCPLOG); 4178 return (NULL); 4179 } 4180 sp = s + strlen(s); 4181 if (th) 4182 sprintf(sp, " tcpflags 0x%b", tcp_get_flags(th), PRINT_TH_FLAGS); 4183 if (*(s + size - 1) != '\0') 4184 panic("%s: string too long", __func__); 4185 return (s); 4186 } 4187 4188 /* 4189 * A subroutine which makes it easy to track TCP state changes with DTrace. 4190 * This function shouldn't be called for t_state initializations that don't 4191 * correspond to actual TCP state transitions. 
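* The TCP_PROBE6() call below fires the DTrace tcp:::state-change
* probe, so transitions can be watched from userland, e.g. with the
* one-liner "dtrace -n tcp:::state-change" (illustrative).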
4192 */ 4193 void 4194 tcp_state_change(struct tcpcb *tp, int newstate) 4195 { 4196 #if defined(KDTRACE_HOOKS) 4197 int pstate = tp->t_state; 4198 #endif 4199 4200 TCPSTATES_DEC(tp->t_state); 4201 TCPSTATES_INC(newstate); 4202 tp->t_state = newstate; 4203 TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate); 4204 } 4205 4206 /* 4207 * Create an external-format (``xtcpcb'') structure using the information in 4208 * the kernel-format tcpcb structure pointed to by tp. This is done to 4209 * reduce the spew of irrelevant information over this interface, to isolate 4210 * user code from changes in the kernel structure, and potentially to provide 4211 * information-hiding if we decide that some of this information should be 4212 * hidden from users. 4213 */ 4214 void 4215 tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt) 4216 { 4217 struct tcpcb *tp = intotcpcb(inp); 4218 sbintime_t now; 4219 4220 bzero(xt, sizeof(*xt)); 4221 xt->t_state = tp->t_state; 4222 xt->t_logstate = tcp_get_bblog_state(tp); 4223 xt->t_flags = tp->t_flags; 4224 xt->t_sndzerowin = tp->t_sndzerowin; 4225 xt->t_sndrexmitpack = tp->t_sndrexmitpack; 4226 xt->t_rcvoopack = tp->t_rcvoopack; 4227 xt->t_rcv_wnd = tp->rcv_wnd; 4228 xt->t_snd_wnd = tp->snd_wnd; 4229 xt->t_snd_cwnd = tp->snd_cwnd; 4230 xt->t_snd_ssthresh = tp->snd_ssthresh; 4231 xt->t_dsack_bytes = tp->t_dsack_bytes; 4232 xt->t_dsack_tlp_bytes = tp->t_dsack_tlp_bytes; 4233 xt->t_dsack_pack = tp->t_dsack_pack; 4234 xt->t_maxseg = tp->t_maxseg; 4235 xt->xt_ecn = (tp->t_flags2 & TF2_ECN_PERMIT) ? 1 : 0 + 4236 (tp->t_flags2 & TF2_ACE_PERMIT) ? 2 : 0; 4237 4238 now = getsbinuptime(); 4239 #define COPYTIMER(which,where) do { \ 4240 if (tp->t_timers[which] != SBT_MAX) \ 4241 xt->where = (tp->t_timers[which] - now) / SBT_1MS; \ 4242 else \ 4243 xt->where = 0; \ 4244 } while (0) 4245 COPYTIMER(TT_DELACK, tt_delack); 4246 COPYTIMER(TT_REXMT, tt_rexmt); 4247 COPYTIMER(TT_PERSIST, tt_persist); 4248 COPYTIMER(TT_KEEP, tt_keep); 4249 COPYTIMER(TT_2MSL, tt_2msl); 4250 #undef COPYTIMER 4251 xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz; 4252 4253 xt->xt_encaps_port = tp->t_port; 4254 bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack, 4255 TCP_FUNCTION_NAME_LEN_MAX); 4256 bcopy(CC_ALGO(tp)->name, xt->xt_cc, TCP_CA_NAME_MAX); 4257 #ifdef TCP_BLACKBOX 4258 (void)tcp_log_get_id(tp, xt->xt_logid); 4259 #endif 4260 4261 xt->xt_len = sizeof(struct xtcpcb); 4262 in_pcbtoxinpcb(inp, &xt->xt_inp); 4263 } 4264 4265 void 4266 tcp_log_end_status(struct tcpcb *tp, uint8_t status) 4267 { 4268 uint32_t bit, i; 4269 4270 if ((tp == NULL) || 4271 (status > TCP_EI_STATUS_MAX_VALUE) || 4272 (status == 0)) { 4273 /* Invalid */ 4274 return; 4275 } 4276 if (status > (sizeof(uint32_t) * 8)) { 4277 /* Should this be a KASSERT? 
*/ 4278 return; 4279 } 4280 bit = 1U << (status - 1); 4281 if (bit & tp->t_end_info_status) { 4282 /* already logged */ 4283 return; 4284 } 4285 for (i = 0; i < TCP_END_BYTE_INFO; i++) { 4286 if (tp->t_end_info_bytes[i] == TCP_EI_EMPTY_SLOT) { 4287 tp->t_end_info_bytes[i] = status; 4288 tp->t_end_info_status |= bit; 4289 break; 4290 } 4291 } 4292 } 4293 4294 int 4295 tcp_can_enable_pacing(void) 4296 { 4297 4298 if ((tcp_pacing_limit == -1) || 4299 (tcp_pacing_limit > number_of_tcp_connections_pacing)) { 4300 atomic_fetchadd_int(&number_of_tcp_connections_pacing, 1); 4301 shadow_num_connections = number_of_tcp_connections_pacing; 4302 return (1); 4303 } else { 4304 counter_u64_add(tcp_pacing_failures, 1); 4305 return (0); 4306 } 4307 } 4308 4309 int 4310 tcp_incr_dgp_pacing_cnt(void) 4311 { 4312 if ((tcp_dgp_limit == -1) || 4313 (tcp_dgp_limit > number_of_dgp_connections)) { 4314 atomic_fetchadd_int(&number_of_dgp_connections, 1); 4315 shadow_tcp_pacing_dgp = number_of_dgp_connections; 4316 return (1); 4317 } else { 4318 counter_u64_add(tcp_dgp_failures, 1); 4319 return (0); 4320 } 4321 } 4322 4323 static uint8_t tcp_dgp_warning = 0; 4324 4325 void 4326 tcp_dec_dgp_pacing_cnt(void) 4327 { 4328 uint32_t ret; 4329 4330 ret = atomic_fetchadd_int(&number_of_dgp_connections, -1); 4331 shadow_tcp_pacing_dgp = number_of_dgp_connections; 4332 KASSERT(ret != 0, ("number_of_dgp_connections -1 would cause wrap?")); 4333 if (ret == 0) { 4334 if (tcp_dgp_limit != -1) { 4335 printf("Warning all DGP is now disabled, count decrements invalidly!\n"); 4336 tcp_dgp_limit = 0; 4337 tcp_dgp_warning = 1; 4338 } else if (tcp_dgp_warning == 0) { 4339 printf("Warning DGP pacing is invalid, invalid decrement\n"); 4340 tcp_dgp_warning = 1; 4341 } 4342 } 4343 4344 } 4345 4346 static uint8_t tcp_pacing_warning = 0; 4347 4348 void 4349 tcp_decrement_paced_conn(void) 4350 { 4351 uint32_t ret; 4352 4353 ret = atomic_fetchadd_int(&number_of_tcp_connections_pacing, -1); 4354 shadow_num_connections = number_of_tcp_connections_pacing; 4355 KASSERT(ret != 0, ("tcp_paced_connection_exits -1 would cause wrap?")); 4356 if (ret == 0) { 4357 if (tcp_pacing_limit != -1) { 4358 printf("Warning all pacing is now disabled, count decrements invalidly!\n"); 4359 tcp_pacing_limit = 0; 4360 } else if (tcp_pacing_warning == 0) { 4361 printf("Warning pacing count is invalid, invalid decrement\n"); 4362 tcp_pacing_warning = 1; 4363 } 4364 } 4365 } 4366 4367 static void 4368 tcp_default_switch_failed(struct tcpcb *tp) 4369 { 4370 /* 4371 * If a switch fails we only need to 4372 * care about two things: 4373 * a) The t_flags2 4374 * and 4375 * b) The timer granularity. 4376 * Timeouts, at least for now, don't use the 4377 * old callout system in the other stacks so 4378 * those are hopefully safe. 4379 */ 4380 tcp_lro_features_off(tp); 4381 tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS); 4382 } 4383 4384 #ifdef TCP_ACCOUNTING 4385 int 4386 tcp_do_ack_accounting(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t tiwin, int mss) 4387 { 4388 if (SEQ_LT(th->th_ack, tp->snd_una)) { 4389 /* Do we have a SACK? */ 4390 if (to->to_flags & TOF_SACK) { 4391 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 4392 tp->tcp_cnt_counters[ACK_SACK]++; 4393 } 4394 return (ACK_SACK); 4395 } else { 4396 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 4397 tp->tcp_cnt_counters[ACK_BEHIND]++; 4398 } 4399 return (ACK_BEHIND); 4400 } 4401 } else if (th->th_ack == tp->snd_una) { 4402 /* Do we have a SACK? 
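* An ACK sitting exactly at snd_una is classified below as ACK_SACK
* when it carries SACK blocks, ACK_RWND when it only updates the
* peer's advertised window, and ACK_DUPACK otherwise.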
*/
4403 if (to->to_flags & TOF_SACK) {
4404 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
4405 tp->tcp_cnt_counters[ACK_SACK]++;
4406 }
4407 return (ACK_SACK);
4408 } else if (tiwin != tp->snd_wnd) {
4409 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
4410 tp->tcp_cnt_counters[ACK_RWND]++;
4411 }
4412 return (ACK_RWND);
4413 } else {
4414 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
4415 tp->tcp_cnt_counters[ACK_DUPACK]++;
4416 }
4417 return (ACK_DUPACK);
4418 }
4419 } else {
4420 if (!SEQ_GT(th->th_ack, tp->snd_max)) {
4421 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
4422 tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((th->th_ack - tp->snd_una) + mss - 1)/mss);
4423 }
4424 }
4425 if (to->to_flags & TOF_SACK) {
4426 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
4427 tp->tcp_cnt_counters[ACK_CUMACK_SACK]++;
4428 }
4429 return (ACK_CUMACK_SACK);
4430 } else {
4431 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
4432 tp->tcp_cnt_counters[ACK_CUMACK]++;
4433 }
4434 return (ACK_CUMACK);
4435 }
4436 }
4437 }
4438 #endif
4439
4440 void
4441 tcp_change_time_units(struct tcpcb *tp, int granularity)
4442 {
4443 if (tp->t_tmr_granularity == granularity) {
4444 /* Already at the requested granularity. */
4445 return;
4446 }
4447 if (granularity == TCP_TMR_GRANULARITY_USEC) {
4448 KASSERT((tp->t_tmr_granularity == TCP_TMR_GRANULARITY_TICKS),
4449 ("Granularity is not TICKS its %u in tp:%p",
4450 tp->t_tmr_granularity, tp));
4451 tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow);
4452 if (tp->t_srtt > 1) {
4453 uint32_t val, frac;
4454
4455 val = tp->t_srtt >> TCP_RTT_SHIFT;
4456 frac = tp->t_srtt & 0x1f;
4457 tp->t_srtt = TICKS_2_USEC(val);
4458 /*
4459 * frac is the fractional part of the srtt (if any),
4460 * but it's in ticks and every bit represents
4461 * 1/32nd of a tick.
4462 */
4463 if (frac) {
4464 if (hz == 1000) {
4465 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
4466 } else {
4467 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
4468 }
4469 tp->t_srtt += frac;
4470 }
4471 }
4472 if (tp->t_rttvar) {
4473 uint32_t val, frac;
4474
4475 val = tp->t_rttvar >> TCP_RTTVAR_SHIFT;
4476 frac = tp->t_rttvar & 0x1f;
4477 tp->t_rttvar = TICKS_2_USEC(val);
4478 /*
4479 * frac is the fractional part of the rttvar (if any),
4480 * but it's in ticks and every bit represents
4481 * 1/32nd of a tick.
4482 */
4483 if (frac) {
4484 if (hz == 1000) {
4485 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
4486 } else {
4487 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
4488 }
4489 tp->t_rttvar += frac;
4490 }
4491 }
4492 tp->t_tmr_granularity = TCP_TMR_GRANULARITY_USEC;
4493 } else if (granularity == TCP_TMR_GRANULARITY_TICKS) {
4494 /* Convert back to ticks, preserving any fractional part. */
4495 KASSERT((tp->t_tmr_granularity == TCP_TMR_GRANULARITY_USEC),
4496 ("Granularity is not USEC its %u in tp:%p",
4497 tp->t_tmr_granularity, tp));
4498 if (tp->t_srtt > 1) {
4499 uint32_t val, frac;
4500
4501 val = USEC_2_TICKS(tp->t_srtt);
4502 frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz);
4503 tp->t_srtt = val << TCP_RTT_SHIFT;
4504 /*
4505 * frac here is the fractional part left over from
4506 * converting to ticks and shifting; we need to convert
4507 * it back into the 5 bit remainder (e.g. with hz == 1000
4508 * a leftover of 500 usec is half a tick and becomes 16/32).
4509 */
4510 if (frac) {
4511 if (hz == 1000) {
4512 frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
4513 } else {
4514 frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
4515 }
4516 tp->t_srtt += frac;
4517 }
4518 }
4519 if (tp->t_rttvar) {
4520 uint32_t val, frac;
4521
4522 val = USEC_2_TICKS(tp->t_rttvar);
4523 frac = tp->t_rttvar % (HPTS_USEC_IN_SEC / hz);
4524 tp->t_rttvar = val << TCP_RTTVAR_SHIFT;
4525 /*
4526 * frac here is the fractional part left over from
4527 * converting to ticks and shifting; we need to
4528 * convert it back into the 4 bit
4529 * remainder.
4530 */
4531 if (frac) {
4532 if (hz == 1000) {
4533 frac = (((uint64_t)frac * (uint64_t)TCP_RTTVAR_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
4534 } else {
4535 frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTTVAR_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
4536 }
4537 tp->t_rttvar += frac;
4538 }
4539 }
4540 tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow);
4541 tp->t_tmr_granularity = TCP_TMR_GRANULARITY_TICKS;
4542 }
4543 #ifdef INVARIANTS
4544 else {
4545 panic("Unknown granularity:%d tp:%p",
4546 granularity, tp);
4547 }
4548 #endif
4549 }
4550
4551 void
4552 tcp_handle_orphaned_packets(struct tcpcb *tp)
4553 {
4554 struct mbuf *save, *m, *prev;
4555 /*
4556 * Called when a stack switch is occurring from the fini()
4557 * of the old stack. We assume the init() of the new stack
4558 * has already been run and that it has set t_flags2 to
4559 * what it supports. This function then deals with any
4560 * differences, i.e. it cleans up any queued packets that
4561 * the new stack does not support.
4562 */
4563
4564 if (tp->t_flags2 & TF2_MBUF_L_ACKS)
4565 return;
4566 if ((tp->t_flags2 & TF2_SUPPORTS_MBUFQ) == 0 &&
4567 !STAILQ_EMPTY(&tp->t_inqueue)) {
4568 /*
4569 * It is unsafe to process the packets since a
4570 * reset may be lurking in them (it's rare but it
4571 * can occur). If we were to find a RST, then we
4572 * would end up dropping the connection and the
4573 * INP lock, so when we return the caller (tcp_usrreq)
4574 * will blow up when it tries to unlock the inp.
4575 * This new stack does not do any fancy LRO features
4576 * so all we can do is toss the packets.
4577 */
4578 m = STAILQ_FIRST(&tp->t_inqueue);
4579 STAILQ_INIT(&tp->t_inqueue);
4580 STAILQ_FOREACH_FROM_SAFE(m, &tp->t_inqueue, m_stailqpkt, save)
4581 m_freem(m);
4582 } else {
4583 /*
4584 * Here we have a stack that does mbuf queuing but
4585 * does not support compressed ACKs. We must
4586 * walk all the mbufs and discard any compressed ACKs.
4587 */ 4588 STAILQ_FOREACH_SAFE(m, &tp->t_inqueue, m_stailqpkt, save) { 4589 if (m->m_flags & M_ACKCMP) { 4590 if (m == STAILQ_FIRST(&tp->t_inqueue)) 4591 STAILQ_REMOVE_HEAD(&tp->t_inqueue, 4592 m_stailqpkt); 4593 else 4594 STAILQ_REMOVE_AFTER(&tp->t_inqueue, 4595 prev, m_stailqpkt); 4596 m_freem(m); 4597 } else 4598 prev = m; 4599 } 4600 } 4601 } 4602 4603 #ifdef TCP_REQUEST_TRK 4604 uint32_t 4605 tcp_estimate_tls_overhead(struct socket *so, uint64_t tls_usr_bytes) 4606 { 4607 #ifdef KERN_TLS 4608 struct ktls_session *tls; 4609 uint32_t rec_oh, records; 4610 4611 tls = so->so_snd.sb_tls_info; 4612 if (tls == NULL) 4613 return (0); 4614 4615 rec_oh = tls->params.tls_hlen + tls->params.tls_tlen; 4616 records = ((tls_usr_bytes + tls->params.max_frame_len - 1)/tls->params.max_frame_len); 4617 return (records * rec_oh); 4618 #else 4619 return (0); 4620 #endif 4621 } 4622 4623 extern uint32_t tcp_stale_entry_time; 4624 uint32_t tcp_stale_entry_time = 250000; 4625 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, usrlog_stale, CTLFLAG_RW, 4626 &tcp_stale_entry_time, 250000, "Time that a tcpreq entry without a sendfile ages out"); 4627 4628 void 4629 tcp_req_log_req_info(struct tcpcb *tp, struct tcp_sendfile_track *req, 4630 uint16_t slot, uint8_t val, uint64_t offset, uint64_t nbytes) 4631 { 4632 if (tcp_bblogging_on(tp)) { 4633 union tcp_log_stackspecific log; 4634 struct timeval tv; 4635 4636 memset(&log, 0, sizeof(log)); 4637 log.u_bbr.inhpts = tcp_in_hpts(tp); 4638 log.u_bbr.flex8 = val; 4639 log.u_bbr.rttProp = req->timestamp; 4640 log.u_bbr.delRate = req->start; 4641 log.u_bbr.cur_del_rate = req->end; 4642 log.u_bbr.flex1 = req->start_seq; 4643 log.u_bbr.flex2 = req->end_seq; 4644 log.u_bbr.flex3 = req->flags; 4645 log.u_bbr.flex4 = ((req->localtime >> 32) & 0x00000000ffffffff); 4646 log.u_bbr.flex5 = (req->localtime & 0x00000000ffffffff); 4647 log.u_bbr.flex7 = slot; 4648 log.u_bbr.bw_inuse = offset; 4649 /* nbytes = flex6 | epoch */ 4650 log.u_bbr.flex6 = ((nbytes >> 32) & 0x00000000ffffffff); 4651 log.u_bbr.epoch = (nbytes & 0x00000000ffffffff); 4652 /* cspr = lt_epoch | pkts_out */ 4653 log.u_bbr.lt_epoch = ((req->cspr >> 32) & 0x00000000ffffffff); 4654 log.u_bbr.pkts_out |= (req->cspr & 0x00000000ffffffff); 4655 log.u_bbr.applimited = tp->t_tcpreq_closed; 4656 log.u_bbr.applimited <<= 8; 4657 log.u_bbr.applimited |= tp->t_tcpreq_open; 4658 log.u_bbr.applimited <<= 8; 4659 log.u_bbr.applimited |= tp->t_tcpreq_req; 4660 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 4661 TCP_LOG_EVENTP(tp, NULL, 4662 &tptosocket(tp)->so_rcv, 4663 &tptosocket(tp)->so_snd, 4664 TCP_LOG_REQ_T, 0, 4665 0, &log, false, &tv); 4666 } 4667 } 4668 4669 void 4670 tcp_req_free_a_slot(struct tcpcb *tp, struct tcp_sendfile_track *ent) 4671 { 4672 if (tp->t_tcpreq_req > 0) 4673 tp->t_tcpreq_req--; 4674 if (ent->flags & TCP_TRK_TRACK_FLG_OPEN) { 4675 if (tp->t_tcpreq_open > 0) 4676 tp->t_tcpreq_open--; 4677 } else { 4678 if (tp->t_tcpreq_closed > 0) 4679 tp->t_tcpreq_closed--; 4680 } 4681 ent->flags = TCP_TRK_TRACK_FLG_EMPTY; 4682 } 4683 4684 static void 4685 tcp_req_check_for_stale_entries(struct tcpcb *tp, uint64_t ts, int rm_oldest) 4686 { 4687 struct tcp_sendfile_track *ent; 4688 uint64_t time_delta, oldest_delta; 4689 int i, oldest, oldest_set = 0, cnt_rm = 0; 4690 4691 for (i = 0; i < MAX_TCP_TRK_REQ; i++) { 4692 ent = &tp->t_tcpreq_info[i]; 4693 if (ent->flags != TCP_TRK_TRACK_FLG_USED) { 4694 /* 4695 * We only care about closed end ranges 4696 * that are allocated and have no sendfile 4697 * ever touching them. 
They would be in
4698 * state USED.
4699 */
4700 continue;
4701 }
4702 if (ts >= ent->localtime)
4703 time_delta = ts - ent->localtime;
4704 else
4705 time_delta = 0;
4706 if (time_delta &&
4707 ((oldest_delta < time_delta) || (oldest_set == 0))) {
4708 oldest_set = 1;
4709 oldest = i;
4710 oldest_delta = time_delta;
4711 }
4712 if (tcp_stale_entry_time && (time_delta >= tcp_stale_entry_time)) {
4713 /*
4714 * No sendfile within our time limit; time
4715 * to purge it.
4716 */
4717 cnt_rm++;
4718 tcp_req_log_req_info(tp, &tp->t_tcpreq_info[i], i, TCP_TRK_REQ_LOG_STALE,
4719 time_delta, 0);
4720 tcp_req_free_a_slot(tp, ent);
4721 }
4722 }
4723 if ((cnt_rm == 0) && rm_oldest && oldest_set) {
4724 ent = &tp->t_tcpreq_info[oldest];
4725 tcp_req_log_req_info(tp, ent, oldest, TCP_TRK_REQ_LOG_STALE,
4726 oldest_delta, 1);
4727 tcp_req_free_a_slot(tp, ent);
4728 }
4729 }
4730
4731 int
4732 tcp_req_check_for_comp(struct tcpcb *tp, tcp_seq ack_point)
4733 {
4734 int i, ret = 0;
4735 struct tcp_sendfile_track *ent;
4736
4737 /* Clean up any old closed end requests that are now completed */
4738 if (tp->t_tcpreq_req == 0)
4739 return (0);
4740 if (tp->t_tcpreq_closed == 0)
4741 return (0);
4742 for (i = 0; i < MAX_TCP_TRK_REQ; i++) {
4743 ent = &tp->t_tcpreq_info[i];
4744 /* Skip empty ones */
4745 if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY)
4746 continue;
4747 /* Skip open ones */
4748 if (ent->flags & TCP_TRK_TRACK_FLG_OPEN)
4749 continue;
4750 if (SEQ_GEQ(ack_point, ent->end_seq)) {
4751 /* We are past it -- free it */
4752 tcp_req_log_req_info(tp, ent,
4753 i, TCP_TRK_REQ_LOG_FREED, 0, 0);
4754 tcp_req_free_a_slot(tp, ent);
4755 ret++;
4756 }
4757 }
4758 return (ret);
4759 }
4760
4761 int
4762 tcp_req_is_entry_comp(struct tcpcb *tp, struct tcp_sendfile_track *ent, tcp_seq ack_point)
4763 {
4764 if (tp->t_tcpreq_req == 0)
4765 return (-1);
4766 if (tp->t_tcpreq_closed == 0)
4767 return (-1);
4768 if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY)
4769 return (-1);
4770 if (SEQ_GEQ(ack_point, ent->end_seq)) {
4771 return (1);
4772 }
4773 return (0);
4774 }
4775
4776 struct tcp_sendfile_track *
4777 tcp_req_find_a_req_that_is_completed_by(struct tcpcb *tp, tcp_seq th_ack, int *ip)
4778 {
4779 /*
4780 * Given an ack point (th_ack) walk through our entries and
4781 * return the first one found that th_ack goes past the
4782 * end_seq.
4783 */
4784 struct tcp_sendfile_track *ent;
4785 int i;
4786
4787 if (tp->t_tcpreq_req == 0) {
4788 /* none open */
4789 return (NULL);
4790 }
4791 for (i = 0; i < MAX_TCP_TRK_REQ; i++) {
4792 ent = &tp->t_tcpreq_info[i];
4793 if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY)
4794 continue;
4795 if ((ent->flags & TCP_TRK_TRACK_FLG_OPEN) == 0) {
4796 if (SEQ_GEQ(th_ack, ent->end_seq)) {
4797 *ip = i;
4798 return (ent);
4799 }
4800 }
4801 }
4802 return (NULL);
4803 }
4804
4805 struct tcp_sendfile_track *
4806 tcp_req_find_req_for_seq(struct tcpcb *tp, tcp_seq seq)
4807 {
4808 struct tcp_sendfile_track *ent;
4809 int i;
4810
4811 if (tp->t_tcpreq_req == 0) {
4812 /* none open */
4813 return (NULL);
4814 }
4815 for (i = 0; i < MAX_TCP_TRK_REQ; i++) {
4816 ent = &tp->t_tcpreq_info[i];
4817 tcp_req_log_req_info(tp, ent, i, TCP_TRK_REQ_LOG_SEARCH,
4818 (uint64_t)seq, 0);
4819 if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) {
4820 continue;
4821 }
4822 if (ent->flags & TCP_TRK_TRACK_FLG_OPEN) {
4823 /*
4824 * An open end request only needs to
4825 * match the beginning seq or be
4826 * all we have (once we keep going on
4827 * an open end request we may have a seq
4828 * wrap).
4829 */
4830 if ((SEQ_GEQ(seq, ent->start_seq)) ||
4831 (tp->t_tcpreq_closed == 0))
4832 return (ent);
4833 } else {
4834 /*
4835 * For this one we need to
4836 * be a bit more careful if it's
4837 * completed at least.
4838 */
4839 if ((SEQ_GEQ(seq, ent->start_seq)) &&
4840 (SEQ_LT(seq, ent->end_seq))) {
4841 return (ent);
4842 }
4843 }
4844 }
4845 return (NULL);
4846 }
4847
4848 /* Should this be in its own file tcp_req.c ? */
4849 struct tcp_sendfile_track *
4850 tcp_req_alloc_req_full(struct tcpcb *tp, struct tcp_snd_req *req, uint64_t ts, int rec_dups)
4851 {
4852 struct tcp_sendfile_track *fil;
4853 int i, allocated;
4854
4855 /* In case the stack does not check for completions do so now */
4856 tcp_req_check_for_comp(tp, tp->snd_una);
4857 /* Check for stale entries */
4858 if (tp->t_tcpreq_req)
4859 tcp_req_check_for_stale_entries(tp, ts,
4860 (tp->t_tcpreq_req >= MAX_TCP_TRK_REQ));
4861 /* Check to see if this is a duplicate of one not started */
4862 if (tp->t_tcpreq_req) {
4863 for (i = 0, allocated = 0; i < MAX_TCP_TRK_REQ; i++) {
4864 fil = &tp->t_tcpreq_info[i];
4865 if ((fil->flags & TCP_TRK_TRACK_FLG_USED) == 0)
4866 continue;
4867 if ((fil->timestamp == req->timestamp) &&
4868 (fil->start == req->start) &&
4869 ((fil->flags & TCP_TRK_TRACK_FLG_OPEN) ||
4870 (fil->end == req->end))) {
4871 /*
4872 * We already have this request
4873 * and it has not been started with sendfile.
4874 * This probably means the user was returned
4875 * a 4xx of some sort and it's going to age
4876 * out; let's not duplicate it.
4877 */
4878 return (fil);
4879 }
4880 }
4881 }
4882 /* Ok if there is no room at the inn we are in trouble */
4883 if (tp->t_tcpreq_req >= MAX_TCP_TRK_REQ) {
4884 tcp_trace_point(tp, TCP_TP_REQ_LOG_FAIL);
4885 for (i = 0; i < MAX_TCP_TRK_REQ; i++) {
4886 tcp_req_log_req_info(tp, &tp->t_tcpreq_info[i],
4887 i, TCP_TRK_REQ_LOG_ALLOCFAIL, 0, 0);
4888 }
4889 return (NULL);
4890 }
4891 for (i = 0, allocated = 0; i < MAX_TCP_TRK_REQ; i++) {
4892 fil = &tp->t_tcpreq_info[i];
4893 if (fil->flags == TCP_TRK_TRACK_FLG_EMPTY) {
4894 allocated = 1;
4895 fil->flags = TCP_TRK_TRACK_FLG_USED;
4896 fil->timestamp = req->timestamp;
4897 fil->playout_ms = req->playout_ms;
4898 fil->localtime = ts;
4899 fil->start = req->start;
4900 if (req->flags & TCP_LOG_HTTPD_RANGE_END) {
4901 fil->end = req->end;
4902 } else {
4903 fil->end = 0;
4904 fil->flags |= TCP_TRK_TRACK_FLG_OPEN;
4905 }
4906 /*
4907 * We can set the min boundaries to the TCP Sequence space,
4908 * but it might be found to be further up when sendfile
4909 * actually runs on this range (if it ever does).
4910 */
4911 fil->sbcc_at_s = tptosocket(tp)->so_snd.sb_ccc;
4912 fil->start_seq = tp->snd_una +
4913 tptosocket(tp)->so_snd.sb_ccc;
4914 if (req->flags & TCP_LOG_HTTPD_RANGE_END)
4915 fil->end_seq = (fil->start_seq + ((uint32_t)(fil->end - fil->start)));
4916 else
4917 fil->end_seq = 0;
4918 if (tptosocket(tp)->so_snd.sb_tls_info) {
4919 /*
4920 * This session is doing TLS. Take a swag guess
4921 * at the overhead.
4922 */ 4923 fil->end_seq += tcp_estimate_tls_overhead( 4924 tptosocket(tp), (fil->end - fil->start)); 4925 } 4926 tp->t_tcpreq_req++; 4927 if (fil->flags & TCP_TRK_TRACK_FLG_OPEN) 4928 tp->t_tcpreq_open++; 4929 else 4930 tp->t_tcpreq_closed++; 4931 tcp_req_log_req_info(tp, fil, i, 4932 TCP_TRK_REQ_LOG_NEW, 0, 0); 4933 break; 4934 } else 4935 fil = NULL; 4936 } 4937 return (fil); 4938 } 4939 4940 void 4941 tcp_req_alloc_req(struct tcpcb *tp, union tcp_log_userdata *user, uint64_t ts) 4942 { 4943 (void)tcp_req_alloc_req_full(tp, &user->tcp_req, ts, 1); 4944 } 4945 #endif 4946 4947 void 4948 tcp_log_socket_option(struct tcpcb *tp, uint32_t option_num, uint32_t option_val, int err) 4949 { 4950 if (tcp_bblogging_on(tp)) { 4951 struct tcp_log_buffer *l; 4952 4953 l = tcp_log_event(tp, NULL, 4954 &tptosocket(tp)->so_rcv, 4955 &tptosocket(tp)->so_snd, 4956 TCP_LOG_SOCKET_OPT, 4957 err, 0, NULL, 1, 4958 NULL, NULL, 0, NULL); 4959 if (l) { 4960 l->tlb_flex1 = option_num; 4961 l->tlb_flex2 = option_val; 4962 } 4963 } 4964 } 4965 4966 uint32_t 4967 tcp_get_srtt(struct tcpcb *tp, int granularity) 4968 { 4969 uint32_t srtt; 4970 4971 KASSERT(granularity == TCP_TMR_GRANULARITY_USEC || 4972 granularity == TCP_TMR_GRANULARITY_TICKS, 4973 ("%s: called with unexpected granularity %d", __func__, 4974 granularity)); 4975 4976 srtt = tp->t_srtt; 4977 4978 /* 4979 * We only support two granularities. If the stored granularity 4980 * does not match the granularity requested by the caller, 4981 * convert the stored value to the requested unit of granularity. 4982 */ 4983 if (tp->t_tmr_granularity != granularity) { 4984 if (granularity == TCP_TMR_GRANULARITY_USEC) 4985 srtt = TICKS_2_USEC(srtt); 4986 else 4987 srtt = USEC_2_TICKS(srtt); 4988 } 4989 4990 /* 4991 * If the srtt is stored with ticks granularity, we need to 4992 * unshift to get the actual value. We do this after the 4993 * conversion above (if one was necessary) in order to maximize 4994 * precision. 4995 */ 4996 if (tp->t_tmr_granularity == TCP_TMR_GRANULARITY_TICKS) 4997 srtt = srtt >> TCP_RTT_SHIFT; 4998 4999 return (srtt); 5000 } 5001 5002 void 5003 tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt, 5004 uint8_t is_tlp, bool hw_tls) 5005 { 5006 5007 if (is_tlp) { 5008 tp->t_sndtlppack++; 5009 tp->t_sndtlpbyte += len; 5010 } 5011 /* To get total bytes sent you must add t_snd_rxt_bytes to t_sndbytes */ 5012 if (is_rxt) 5013 tp->t_snd_rxt_bytes += len; 5014 else 5015 tp->t_sndbytes += len; 5016 5017 #ifdef KERN_TLS 5018 if (hw_tls && is_rxt && len != 0) { 5019 uint64_t rexmit_percent; 5020 5021 rexmit_percent = (1000ULL * tp->t_snd_rxt_bytes) / 5022 (10ULL * (tp->t_snd_rxt_bytes + tp->t_sndbytes)); 5023 if (rexmit_percent > ktls_ifnet_max_rexmit_pct) 5024 ktls_disable_ifnet(tp); 5025 } 5026 #endif 5027 } 5028