/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
 * $FreeBSD$
 */

#include "opt_compat.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#ifdef INET6
#include <sys/domain.h>
#endif
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/random.h>

#include <vm/uma.h>

#include <net/route.h>
#include <net/if.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#ifdef INET6
#include <netinet/ip6.h>
#endif
#include <netinet/in_pcb.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#include <netinet6/ip6protosw.h>

#ifdef IPSEC
#include <netinet6/ipsec.h>
#ifdef INET6
#include <netinet6/ipsec6.h>
#endif
#endif /* IPSEC */

#ifdef FAST_IPSEC
#include <netipsec/ipsec.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#define	IPSEC
#endif /* FAST_IPSEC */

#include <machine/in_cksum.h>
#include <sys/md5.h>

int	tcp_mssdflt = TCP_MSS;
SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
    &tcp_mssdflt, 0, "Default TCP Maximum Segment Size");

#ifdef INET6
int	tcp_v6mssdflt = TCP6_MSS;
SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
    CTLFLAG_RW, &tcp_v6mssdflt, 0,
    "Default TCP Maximum Segment Size for IPv6");
#endif

#if 0
static int	tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
    &tcp_rttdflt, 0, "Default maximum TCP Round Trip Time");
#endif

int	tcp_do_rfc1323 = 1;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
    &tcp_do_rfc1323, 0, "Enable rfc1323 (high performance TCP) extensions");

int	tcp_do_rfc1644 = 0;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW,
    &tcp_do_rfc1644, 0, "Enable rfc1644 (TTCP) extensions");

static int	tcp_tcbhashsize = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD,
    &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");

static int	do_tcpdrain = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
    "Enable tcp_drain routine for extra help when low on mbufs");

SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
    &tcbinfo.ipi_count, 0, "Number of active PCBs");

static int	icmp_may_rst = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0,
    "Certain ICMP unreachable messages may abort connections in SYN_SENT");

static int	tcp_isn_reseed_interval = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
    &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");

/*
 * TCP bandwidth limiting sysctls.  Note that the default lower bound of
 * 1024 exists only for debugging.  A good production default would be
 * something like 6100.
 */
static int	tcp_inflight_enable = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW,
    &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");

static int	tcp_inflight_debug = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW,
    &tcp_inflight_debug, 0, "Debug TCP inflight calculations");

static int	tcp_inflight_min = 6144;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW,
    &tcp_inflight_min, 0, "Lower bound for TCP inflight window");

static int	tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW,
    &tcp_inflight_max, 0, "Upper bound for TCP inflight window");

static int	tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
    &tcp_inflight_stab, 0,
    "Inflight algorithm stabilization, in tenths of a segment (20 = 2 segments)");

static void	tcp_cleartaocache(void);
static struct inpcb *tcp_notify(struct inpcb *, int);
static void	tcp_discardcb(struct tcpcb *);

/*
 * Target size of TCP PCB hash tables.  Must be a power of two.
 *
 * Note that this can be overridden by the kernel environment
 * variable net.inet.tcp.tcbhashsize
 */
#ifndef TCBHASHSIZE
#define	TCBHASHSIZE	512
#endif

/*
 * XXX
 * Callouts should be moved into struct tcp directly.  They are currently
 * separate because the tcpcb structure is exported to userland for sysctl
 * parsing purposes, which do not know about callouts.
 */
struct	tcpcb_mem {
	struct	tcpcb tcb;
	struct	callout tcpcb_mem_rexmt, tcpcb_mem_persist, tcpcb_mem_keep;
	struct	callout tcpcb_mem_2msl, tcpcb_mem_delack;
};
struct	tcptw_mem {
	struct	tcptw tw;
	struct	callout tcptw_mem_2msl;
};

static uma_zone_t tcpcb_zone;
static uma_zone_t tcptw_zone;
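
/*
 * Example (illustrative, not part of the original file): because
 * tcp_init() fetches the tunable before the hash tables are created,
 * the hash size can be set from the boot loader, e.g.
 *
 *	# /boot/loader.conf
 *	net.inet.tcp.tcbhashsize="2048"
 *
 * Non-power-of-2 values are rejected in tcp_init() below, which then
 * falls back to the safe default of 512.
 */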

/*
 * Tcp initialization
 */
void
tcp_init()
{
	int hashsize = TCBHASHSIZE;

	tcp_ccgen = 1;
	tcp_cleartaocache();

	tcp_delacktime = TCPTV_DELACK;
	tcp_keepinit = TCPTV_KEEP_INIT;
	tcp_keepidle = TCPTV_KEEP_IDLE;
	tcp_keepintvl = TCPTV_KEEPINTVL;
	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
	tcp_msl = TCPTV_MSL;
	tcp_rexmit_min = TCPTV_MIN;
	tcp_rexmit_slop = TCPTV_CPU_VAR;

	INP_INFO_LOCK_INIT(&tcbinfo, "tcp");
	LIST_INIT(&tcb);
	tcbinfo.listhead = &tcb;
	TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
	if (!powerof2(hashsize)) {
		printf("WARNING: TCB hash size not a power of 2\n");
		hashsize = 512; /* safe default */
	}
	tcp_tcbhashsize = hashsize;
	tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
	tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
	    &tcbinfo.porthashmask);
	tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(tcbinfo.ipi_zone, maxsockets);
#ifdef INET6
#define	TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
#else /* INET6 */
#define	TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
#endif /* INET6 */
	if (max_protohdr < TCP_MINPROTOHDR)
		max_protohdr = TCP_MINPROTOHDR;
	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
		panic("tcp_init");
#undef TCP_MINPROTOHDR
	/*
	 * These have to be type stable for the benefit of the timers.
	 */
	tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(tcpcb_zone, maxsockets);
	tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw_mem),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(tcptw_zone, maxsockets);

	syncache_init();
}

/*
 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
 * tcp_template used to store this data in mbufs, but we now recopy it out
 * of the tcpcb each time to conserve mbufs.
 */
void
tcpip_fillheaders(inp, ip_ptr, tcp_ptr)
	struct inpcb *inp;
	void *ip_ptr;
	void *tcp_ptr;
{
	struct tcphdr *th = (struct tcphdr *)tcp_ptr;

#ifdef INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		struct ip6_hdr *ip6;

		ip6 = (struct ip6_hdr *)ip_ptr;
		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
		    (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
		    (IPV6_VERSION & IPV6_VERSION_MASK);
		ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_plen = sizeof(struct tcphdr);
		ip6->ip6_src = inp->in6p_laddr;
		ip6->ip6_dst = inp->in6p_faddr;
	} else
#endif
	{
		struct ip *ip;

		ip = (struct ip *)ip_ptr;
		ip->ip_v = IPVERSION;
		ip->ip_hl = 5;
		ip->ip_tos = inp->inp_ip_tos;
		ip->ip_len = 0;
		ip->ip_id = 0;
		ip->ip_off = 0;
		ip->ip_ttl = inp->inp_ip_ttl;
		ip->ip_sum = 0;
		ip->ip_p = IPPROTO_TCP;
		ip->ip_src = inp->inp_laddr;
		ip->ip_dst = inp->inp_faddr;
	}
	th->th_sport = inp->inp_lport;
	th->th_dport = inp->inp_fport;
	th->th_seq = 0;
	th->th_ack = 0;
	th->th_x2 = 0;
	th->th_off = 5;
	th->th_flags = 0;
	th->th_win = 0;
	th->th_urp = 0;
	th->th_sum = 0;		/* in_pseudo() is called later for ipv4 */
}

/*
 * Create template to be used to send tcp packets on a connection.
 * Allocates an mbuf and fills in a skeletal tcp/ip header.  The only
 * use for this function is in keepalives, which use tcp_respond.
 */
struct tcptemp *
tcpip_maketemplate(inp)
	struct inpcb *inp;
{
	struct mbuf *m;
	struct tcptemp *n;

	m = m_get(M_DONTWAIT, MT_HEADER);
	if (m == NULL)
		return (0);
	m->m_len = sizeof(struct tcptemp);
	n = mtod(m, struct tcptemp *);

	tcpip_fillheaders(inp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
	return (n);
}
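
/*
 * Usage sketch (a hypothetical caller, mirroring how the keepalive
 * timer is expected to drive this): allocate a template, let
 * tcp_respond() emit a bare keepalive ACK from it, then release the
 * backing mbuf:
 *
 *	struct tcptemp *t = tcpip_maketemplate(tp->t_inpcb);
 *	if (t != NULL) {
 *		tcp_respond(tp, (void *)&t->tt_ipgen, &t->tt_t,
 *		    (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0);
 *		(void) m_free(dtom(t));
 *	}
 */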

/*
 * Send a single message to the TCP at address specified by
 * the given TCP/IP header.  If m == 0, then we make a copy
 * of the tcpiphdr at ti and send directly to the addressed host.
 * This is used to force keep alive messages out using the TCP
 * template for a connection.  If flags are given then we send
 * a message back to the TCP which originated the segment ti,
 * and discard the mbuf containing it and any other attached mbufs.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 *
 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
 */
void
tcp_respond(tp, ipgen, th, m, ack, seq, flags)
	struct tcpcb *tp;
	void *ipgen;
	register struct tcphdr *th;
	register struct mbuf *m;
	tcp_seq ack, seq;
	int flags;
{
	register int tlen;
	int win = 0;
	struct route *ro = 0;
	struct route sro;
	struct ip *ip;
	struct tcphdr *nth;
#ifdef INET6
	struct route_in6 *ro6 = 0;
	struct route_in6 sro6;
	struct ip6_hdr *ip6;
	int isipv6;
#endif /* INET6 */
	int ipflags = 0;

	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));

#ifdef INET6
	isipv6 = ((struct ip *)ipgen)->ip_v == 6;
	ip6 = ipgen;
#endif /* INET6 */
	ip = ipgen;

	if (tp) {
		if (!(flags & TH_RST)) {
			win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
			if (win > (long)TCP_MAXWIN << tp->rcv_scale)
				win = (long)TCP_MAXWIN << tp->rcv_scale;
		}
#ifdef INET6
		if (isipv6)
			ro6 = &tp->t_inpcb->in6p_route;
		else
#endif /* INET6 */
		ro = &tp->t_inpcb->inp_route;
	} else {
#ifdef INET6
		if (isipv6) {
			ro6 = &sro6;
			bzero(ro6, sizeof *ro6);
		} else
#endif /* INET6 */
		{
			ro = &sro;
			bzero(ro, sizeof *ro);
		}
	}
	if (m == 0) {
		m = m_gethdr(M_DONTWAIT, MT_HEADER);
		if (m == NULL)
			return;
		tlen = 0;
		m->m_data += max_linkhdr;
#ifdef INET6
		if (isipv6) {
			bcopy((caddr_t)ip6, mtod(m, caddr_t),
			    sizeof(struct ip6_hdr));
			ip6 = mtod(m, struct ip6_hdr *);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
		{
			bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
			ip = mtod(m, struct ip *);
			nth = (struct tcphdr *)(ip + 1);
		}
		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
		flags = TH_ACK;
	} else {
		m_freem(m->m_next);
		m->m_next = 0;
		m->m_data = (caddr_t)ipgen;
		/* m_len is set later */
		tlen = 0;
#define	xchg(a,b,type) { type t; t=a; a=b; b=t; }
#ifdef INET6
		if (isipv6) {
			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
		{
			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
			nth = (struct tcphdr *)(ip + 1);
		}
		if (th != nth) {
			/*
			 * this is usually a case when an extension header
			 * exists between the IPv6 header and the
			 * TCP header.
			 */
			nth->th_sport = th->th_sport;
			nth->th_dport = th->th_dport;
		}
		xchg(nth->th_dport, nth->th_sport, n_short);
#undef xchg
	}
#ifdef INET6
	if (isipv6) {
		ip6->ip6_flow = 0;
		ip6->ip6_vfc = IPV6_VERSION;
		ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
		    tlen));
		tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
	} else
#endif
	{
		tlen += sizeof (struct tcpiphdr);
		ip->ip_len = tlen;
		ip->ip_ttl = ip_defttl;
	}
	m->m_len = tlen;
	m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = (struct ifnet *) 0;
#ifdef MAC
	if (tp != NULL && tp->t_inpcb != NULL) {
		/*
		 * Packet is associated with a socket, so allow the
		 * label of the response to reflect the socket label.
		 */
		mac_create_mbuf_from_socket(tp->t_inpcb->inp_socket, m);
	} else {
		/*
		 * XXXMAC: This will need to call a mac function that
		 * modifies the mbuf label in place for TCP datagrams
		 * not associated with a PCB.
		 */
	}
#endif
	nth->th_seq = htonl(seq);
	nth->th_ack = htonl(ack);
	nth->th_x2 = 0;
	nth->th_off = sizeof (struct tcphdr) >> 2;
	nth->th_flags = flags;
	if (tp)
		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
	else
		nth->th_win = htons((u_short)win);
	nth->th_urp = 0;
#ifdef INET6
	if (isipv6) {
		nth->th_sum = 0;
		nth->th_sum = in6_cksum(m, IPPROTO_TCP,
		    sizeof(struct ip6_hdr),
		    tlen - sizeof(struct ip6_hdr));
		ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
		    ro6 && ro6->ro_rt ?
		    ro6->ro_rt->rt_ifp :
		    NULL);
	} else
#endif /* INET6 */
	{
		nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
		m->m_pkthdr.csum_flags = CSUM_TCP;
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
	}
#ifdef TCPDEBUG
	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
#ifdef INET6
	if (isipv6) {
		(void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL,
		    tp ? tp->t_inpcb : NULL);
		if (ro6 == &sro6 && ro6->ro_rt) {
			RTFREE(ro6->ro_rt);
			ro6->ro_rt = NULL;
		}
	} else
#endif /* INET6 */
	{
		(void) ip_output(m, NULL, ro, ipflags, NULL,
		    tp ? tp->t_inpcb : NULL);
		if (ro == &sro && ro->ro_rt) {
			RTFREE(ro->ro_rt);
			ro->ro_rt = NULL;
		}
	}
}

/*
 * Create a new TCP control block, making an
 * empty reassembly queue and hooking it to the argument
 * protocol control block.  The `inp' parameter must have
 * come from the zone allocator set up in tcp_init().
 */
struct tcpcb *
tcp_newtcpcb(inp)
	struct inpcb *inp;
{
	struct tcpcb_mem *tm;
	struct tcpcb *tp;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */

	tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO);
	if (tm == NULL)
		return (NULL);
	tp = &tm->tcb;
	/* LIST_INIT(&tp->t_segq); */	/* XXX covered by M_ZERO */
	tp->t_maxseg = tp->t_maxopd =
#ifdef INET6
		isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
		tcp_mssdflt;

	/* Set up our timeouts. */
	callout_init(tp->tt_rexmt = &tm->tcpcb_mem_rexmt, 0);
	callout_init(tp->tt_persist = &tm->tcpcb_mem_persist, 0);
	callout_init(tp->tt_keep = &tm->tcpcb_mem_keep, 0);
	callout_init(tp->tt_2msl = &tm->tcpcb_mem_2msl, 0);
	callout_init(tp->tt_delack = &tm->tcpcb_mem_delack, 0);

	if (tcp_do_rfc1323)
		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
	if (tcp_do_rfc1644)
		tp->t_flags |= TF_REQ_CC;
	tp->t_inpcb = inp;	/* XXX */
	/*
	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
	 * reasonable initial retransmit time.
	 */
	tp->t_srtt = TCPTV_SRTTBASE;
	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
	tp->t_rttmin = tcp_rexmit_min;
	tp->t_rxtcur = TCPTV_RTOBASE;
	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->t_rcvtime = ticks;
	tp->t_bw_rtttime = ticks;
	/*
	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
	 * because the socket may be bound to an IPv6 wildcard address,
	 * which may match an IPv4-mapped IPv6 address.
	 */
	inp->inp_ip_ttl = ip_defttl;
	inp->inp_ppcb = (caddr_t)tp;
	return (tp);		/* XXX */
}
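
/*
 * Worked example (illustrative): t_rttvar above is kept scaled by
 * 2^TCP_RTTVAR_SHIFT, so its unscaled value is
 *
 *	rttvar = (TCPTV_RTOBASE - TCPTV_SRTTBASE) / 4 = (3*hz - 0) / 4
 *
 * ticks, i.e. 0.75 seconds.  With srtt = 0, the classical
 * srtt + 4 * rttvar estimate comes out to 3 seconds, agreeing with the
 * explicit t_rxtcur = TCPTV_RTOBASE initialization above.
 */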

/*
 * Drop a TCP connection, reporting
 * the specified error.  If connection is synchronized,
 * then send a RST to peer.
 */
struct tcpcb *
tcp_drop(tp, errno)
	register struct tcpcb *tp;
	int errno;
{
	struct socket *so = tp->t_inpcb->inp_socket;

	if (TCPS_HAVERCVDSYN(tp->t_state)) {
		tp->t_state = TCPS_CLOSED;
		(void) tcp_output(tp);
		tcpstat.tcps_drops++;
	} else
		tcpstat.tcps_conndrops++;
	if (errno == ETIMEDOUT && tp->t_softerror)
		errno = tp->t_softerror;
	so->so_error = errno;
	return (tcp_close(tp));
}

static void
tcp_discardcb(tp)
	struct tcpcb *tp;
{
	struct tseg_qent *q;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
	struct rtentry *rt;
	int dosavessthresh;

	/*
	 * Make sure that all of our timers are stopped before we
	 * delete the PCB.
	 */
	callout_stop(tp->tt_rexmt);
	callout_stop(tp->tt_persist);
	callout_stop(tp->tt_keep);
	callout_stop(tp->tt_2msl);
	callout_stop(tp->tt_delack);

	/*
	 * If we got enough samples through the srtt filter,
	 * save the rtt and rttvar in the routing entry.
	 * 'Enough' is arbitrarily defined as 16 samples.
	 * 16 samples is enough for the srtt filter to converge
	 * to within 5% of the correct value; fewer samples and
	 * we could save a very bogus rtt.
	 *
	 * Don't update the default route's characteristics and don't
	 * update anything that the user "locked".
	 */
	if (tp->t_rttupdated >= 16) {
		register u_long i = 0;
#ifdef INET6
		if (isipv6) {
			struct sockaddr_in6 *sin6;

			if ((rt = inp->in6p_route.ro_rt) == NULL)
				goto no_valid_rt;
			sin6 = (struct sockaddr_in6 *)rt_key(rt);
			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
				goto no_valid_rt;
		}
		else
#endif /* INET6 */
		if ((rt = inp->inp_route.ro_rt) == NULL ||
		    ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
		    == INADDR_ANY)
			goto no_valid_rt;

		if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
			i = tp->t_srtt *
			    (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
			if (rt->rt_rmx.rmx_rtt && i)
				/*
				 * filter this update to half the old & half
				 * the new values, converting scale.
				 * See route.h and tcp_var.h for a
				 * description of the scaling constants.
				 */
				rt->rt_rmx.rmx_rtt =
				    (rt->rt_rmx.rmx_rtt + i) / 2;
			else
				rt->rt_rmx.rmx_rtt = i;
			tcpstat.tcps_cachedrtt++;
		}
		if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
			i = tp->t_rttvar *
			    (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
			if (rt->rt_rmx.rmx_rttvar && i)
				rt->rt_rmx.rmx_rttvar =
				    (rt->rt_rmx.rmx_rttvar + i) / 2;
			else
				rt->rt_rmx.rmx_rttvar = i;
			tcpstat.tcps_cachedrttvar++;
		}
		/*
		 * The old comment here said:
		 * update the pipelimit (ssthresh) if it has been updated
		 * already or if a pipesize was specified & the threshold
		 * got below half the pipesize.  I.e., wait for bad news
		 * before we start updating, then update on both good
		 * and bad news.
		 *
		 * But we want to save the ssthresh even if no pipesize is
		 * specified explicitly in the route, because such
		 * connections still have an implicit pipesize specified
		 * by the global tcp_sendspace.
		 * In the absence of a reliable way to calculate the
		 * pipesize, it will have to do.
		 */
		i = tp->snd_ssthresh;
		if (rt->rt_rmx.rmx_sendpipe != 0)
			dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
		else
			dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
		if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
		    i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
		    || dosavessthresh) {
			/*
			 * convert the limit from user data bytes to
			 * packets then to packet data bytes.
			 */
			i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
			if (i < 2)
				i = 2;
			i *= (u_long)(tp->t_maxseg +
#ifdef INET6
			    (isipv6 ? sizeof (struct ip6_hdr) +
			    sizeof (struct tcphdr) :
#endif
			    sizeof (struct tcpiphdr)
#ifdef INET6
			    )
#endif
			    );
			if (rt->rt_rmx.rmx_ssthresh)
				rt->rt_rmx.rmx_ssthresh =
				    (rt->rt_rmx.rmx_ssthresh + i) / 2;
			else
				rt->rt_rmx.rmx_ssthresh = i;
			tcpstat.tcps_cachedssthresh++;
		}
	}
no_valid_rt:
	/* free the reassembly queue, if any */
	while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		FREE(q, M_TSEGQ);
	}
	inp->inp_ppcb = NULL;
	tp->t_inpcb = NULL;
	uma_zfree(tcpcb_zone, tp);
	soisdisconnected(so);
}

/*
 * Close a TCP control block:
 *	discard all space held by the tcp
 *	discard internet protocol block
 *	wake up any sleepers
 */
struct tcpcb *
tcp_close(tp)
	struct tcpcb *tp;
{
	struct inpcb *inp = tp->t_inpcb;
#ifdef INET6
	struct socket *so = inp->inp_socket;
#endif

	tcp_discardcb(tp);
#ifdef INET6
	if (INP_CHECK_SOCKAF(so, AF_INET6))
		in6_pcbdetach(inp);
	else
#endif
		in_pcbdetach(inp);
	tcpstat.tcps_closed++;
	return ((struct tcpcb *)0);
}

void
tcp_drain()
{
	if (do_tcpdrain)
	{
		struct inpcb *inpb;
		struct tcpcb *tcpb;
		struct tseg_qent *te;

		/*
		 * Walk the tcpbs, if existing, and flush the reassembly queue,
		 * if there is one...
		 * XXX: The "Net/3" implementation doesn't imply that the TCP
		 *	reassembly queue should be flushed, but in a situation
		 *	where we're really low on mbufs, this is potentially
		 *	useful.
		 */
		INP_INFO_RLOCK(&tcbinfo);
		LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) {
			if (inpb->inp_vflag & INP_TIMEWAIT)
				continue;
			INP_LOCK(inpb);
			if ((tcpb = intotcpcb(inpb))) {
				while ((te = LIST_FIRST(&tcpb->t_segq))
				    != NULL) {
					LIST_REMOVE(te, tqe_q);
					m_freem(te->tqe_m);
					FREE(te, M_TSEGQ);
				}
			}
			INP_UNLOCK(inpb);
		}
		INP_INFO_RUNLOCK(&tcbinfo);
	}
}

/*
 * Notify a tcp user of an asynchronous error;
 * store error as soft error, but wake up user
 * (for now, won't do anything until can select for soft error).
 *
 * Do not wake up user since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
 */
static struct inpcb *
tcp_notify(inp, error)
	struct inpcb *inp;
	int error;
{
	struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;

	/*
	 * Ignore some errors if we are hooked up.
	 * If connection hasn't completed, has retransmitted several times,
	 * and receives a second error, give up now.  This is better
	 * than waiting a long time to establish a connection that
	 * can never complete.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
	    (error == EHOSTUNREACH || error == ENETUNREACH ||
	    error == EHOSTDOWN)) {
		return inp;
	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
	    tp->t_softerror) {
		tcp_drop(tp, error);
		return (struct inpcb *)0;
	} else {
		tp->t_softerror = error;
		return inp;
	}
#if 0
	wakeup( &so->so_timeo);
	sorwakeup(so);
	sowwakeup(so);
#endif
}

static int
tcp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, i, n, s;
	struct inpcb *inp, **inp_list;
	inp_gen_t gencnt;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == 0) {
		n = tcbinfo.ipi_count;
		req->oldidx = 2 * (sizeof xig)
			+ (n + n/8) * sizeof(struct xtcpcb);
		return 0;
	}

	if (req->newptr != 0)
		return EPERM;

	/*
	 * OK, now we're committed to doing something.
	 */
	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	gencnt = tcbinfo.ipi_gencnt;
	n = tcbinfo.ipi_count;
	INP_INFO_RUNLOCK(&tcbinfo);
	splx(s);

	sysctl_wire_old_buffer(req, 2 * (sizeof xig)
		+ n * sizeof(struct xtcpcb));

	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = gencnt;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return error;

	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
	if (inp_list == 0)
		return ENOMEM;

	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
	     inp = LIST_NEXT(inp, inp_list)) {
		INP_LOCK(inp);
		if (inp->inp_gencnt <= gencnt &&
		    (((inp->inp_vflag & INP_TIMEWAIT) &&
		    cr_cansee(req->td->td_ucred, intotw(inp)->tw_cred) == 0) ||
		    cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0))
			inp_list[i++] = inp;
		INP_UNLOCK(inp);
	}
	INP_INFO_RUNLOCK(&tcbinfo);
	splx(s);
	n = i;

	error = 0;
	for (i = 0; i < n; i++) {
		inp = inp_list[i];
		if (inp->inp_gencnt <= gencnt) {
			struct xtcpcb xt;
			caddr_t inp_ppcb;
			xt.xt_len = sizeof xt;
			/* XXX should avoid extra copy */
			bcopy(inp, &xt.xt_inp, sizeof *inp);
			inp_ppcb = inp->inp_ppcb;
			if (inp_ppcb == NULL)
				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
			else if (inp->inp_vflag & INP_TIMEWAIT) {
				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
				xt.xt_tp.t_state = TCPS_TIME_WAIT;
			} else
				bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
			if (inp->inp_socket)
				sotoxsocket(inp->inp_socket, &xt.xt_socket);
			else {
				bzero(&xt.xt_socket, sizeof xt.xt_socket);
				xt.xt_socket.xso_protocol = IPPROTO_TCP;
			}
			xt.xt_inp.inp_gencnt = inp->inp_gencnt;
			error = SYSCTL_OUT(req, &xt, sizeof xt);
		}
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		s = splnet();
		INP_INFO_RLOCK(&tcbinfo);
		xig.xig_gen = tcbinfo.ipi_gencnt;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = tcbinfo.ipi_count;
		INP_INFO_RUNLOCK(&tcbinfo);
		splx(s);
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	free(inp_list, M_TEMP);
	return error;
}

SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
    tcp_pcblist, "S,xtcpcb", "List of active TCP connections");

static int
tcp_getcred(SYSCTL_HANDLER_ARGS)
{
	struct xucred xuc;
	struct sockaddr_in addrs[2];
	struct inpcb *inp;
	int error, s;

	error = suser_cred(req->td->td_ucred, PRISON_ROOT);
	if (error)
		return (error);
	error = SYSCTL_IN(req, addrs, sizeof(addrs));
	if (error)
		return (error);
	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
	    addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
	if (inp == NULL) {
		error = ENOENT;
		goto outunlocked;
	}
	INP_LOCK(inp);
	if (inp->inp_socket == NULL) {
		error = ENOENT;
		goto out;
	}
	error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
	if (error)
		goto out;
	cru2x(inp->inp_socket->so_cred, &xuc);
out:
	INP_UNLOCK(inp);
outunlocked:
	INP_INFO_RUNLOCK(&tcbinfo);
	splx(s);
	if (error == 0)
		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
	return (error);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
    CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
    tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");

#ifdef INET6
static int
tcp6_getcred(SYSCTL_HANDLER_ARGS)
{
	struct xucred xuc;
	struct sockaddr_in6 addrs[2];
	struct inpcb *inp;
	int error, s, mapped = 0;

	error = suser_cred(req->td->td_ucred, PRISON_ROOT);
	if (error)
		return (error);
	error = SYSCTL_IN(req, addrs, sizeof(addrs));
	if (error)
		return (error);
	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
		if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
			mapped = 1;
		else
			return (EINVAL);
	}
	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	if (mapped == 1)
		inp = in_pcblookup_hash(&tcbinfo,
		    *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
		    addrs[1].sin6_port,
		    *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
		    addrs[0].sin6_port,
		    0, NULL);
	else
		inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr,
		    addrs[1].sin6_port,
		    &addrs[0].sin6_addr, addrs[0].sin6_port,
		    0, NULL);
	if (inp == NULL) {
		error = ENOENT;
		goto outunlocked;
	}
	INP_LOCK(inp);
	if (inp->inp_socket == NULL) {
		error = ENOENT;
		goto out;
	}
	error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
	if (error)
		goto out;
	cru2x(inp->inp_socket->so_cred, &xuc);
out:
	INP_UNLOCK(inp);
outunlocked:
	INP_INFO_RUNLOCK(&tcbinfo);
	splx(s);
	if (error == 0)
		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
	return (error);
}

SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
    CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
    tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
#endif
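
/*
 * Userland sketch (illustrative, not part of the kernel): the getcred
 * handlers above are driven by writing the connection's two endpoint
 * addresses into the sysctl and reading a struct xucred back, roughly:
 *
 *	struct sockaddr_in addrs[2];	-- [0] = local, [1] = foreign
 *	struct xucred xuc;
 *	size_t len = sizeof(xuc);
 *	...fill in addrs...
 *	if (sysctlbyname("net.inet.tcp.getcred", &xuc, &len,
 *	    addrs, sizeof(addrs)) == 0)
 *		printf("owner uid %d\n", xuc.cr_uid);
 *
 * Note the lookup order: addrs[1] supplies the foreign address/port and
 * addrs[0] the local, matching the in_pcblookup_hash() calls above.
 */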

void
tcp_ctlinput(cmd, sa, vip)
	int cmd;
	struct sockaddr *sa;
	void *vip;
{
	struct ip *ip = vip;
	struct tcphdr *th;
	struct in_addr faddr;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
	tcp_seq icmp_seq;
	int s;

	faddr = ((struct sockaddr_in *)sa)->sin_addr;
	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
		return;

	if (cmd == PRC_QUENCH)
		notify = tcp_quench;
	else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
		cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
		notify = tcp_drop_syn_sent;
	else if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc;
	else if (PRC_IS_REDIRECT(cmd)) {
		ip = 0;
		notify = in_rtchange;
	} else if (cmd == PRC_HOSTDEAD)
		ip = 0;
	else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)
		return;
	if (ip) {
		s = splnet();
		th = (struct tcphdr *)((caddr_t)ip
		    + (ip->ip_hl << 2));
		INP_INFO_WLOCK(&tcbinfo);
		inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
		    ip->ip_src, th->th_sport, 0, NULL);
		if (inp != NULL) {
			INP_LOCK(inp);
			if (inp->inp_socket != NULL) {
				icmp_seq = htonl(th->th_seq);
				tp = intotcpcb(inp);
				if (SEQ_GEQ(icmp_seq, tp->snd_una) &&
				    SEQ_LT(icmp_seq, tp->snd_max))
					inp = (*notify)(inp,
					    inetctlerrmap[cmd]);
			}
			if (inp)
				INP_UNLOCK(inp);
		} else {
			struct in_conninfo inc;

			inc.inc_fport = th->th_dport;
			inc.inc_lport = th->th_sport;
			inc.inc_faddr = faddr;
			inc.inc_laddr = ip->ip_src;
#ifdef INET6
			inc.inc_isipv6 = 0;
#endif
			syncache_unreach(&inc, th);
		}
		INP_INFO_WUNLOCK(&tcbinfo);
		splx(s);
	} else
		in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
}

#ifdef INET6
void
tcp6_ctlinput(cmd, sa, d)
	int cmd;
	struct sockaddr *sa;
	void *d;
{
	struct tcphdr th;
	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
	struct ip6_hdr *ip6;
	struct mbuf *m;
	struct ip6ctlparam *ip6cp = NULL;
	const struct sockaddr_in6 *sa6_src = NULL;
	int off;
	struct tcp_portonly {
		u_int16_t th_sport;
		u_int16_t th_dport;
	} *thp;

	if (sa->sa_family != AF_INET6 ||
	    sa->sa_len != sizeof(struct sockaddr_in6))
		return;

	if (cmd == PRC_QUENCH)
		notify = tcp_quench;
	else if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc;
	else if (!PRC_IS_REDIRECT(cmd) &&
		 ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
		return;

	/* if the parameter is from icmp6, decode it. */
	if (d != NULL) {
		ip6cp = (struct ip6ctlparam *)d;
		m = ip6cp->ip6c_m;
		ip6 = ip6cp->ip6c_ip6;
		off = ip6cp->ip6c_off;
		sa6_src = ip6cp->ip6c_src;
	} else {
		m = NULL;
		ip6 = NULL;
		off = 0;	/* fool gcc */
		sa6_src = &sa6_any;
	}

	if (ip6) {
		struct in_conninfo inc;
		/*
		 * XXX: We assume that when ip6 is non-NULL,
		 * m and off are valid.
		 */

		/* check if we can safely examine src and dst ports */
		if (m->m_pkthdr.len < off + sizeof(*thp))
			return;

		bzero(&th, sizeof(th));
		m_copydata(m, off, sizeof(*thp), (caddr_t)&th);

		in6_pcbnotify(&tcb, sa, th.th_dport,
		    (struct sockaddr *)ip6cp->ip6c_src,
		    th.th_sport, cmd, notify);

		inc.inc_fport = th.th_dport;
		inc.inc_lport = th.th_sport;
		inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
		inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
		inc.inc_isipv6 = 1;
		syncache_unreach(&inc, &th);
	} else
		in6_pcbnotify(&tcb, sa, 0, (const struct sockaddr *)sa6_src,
		    0, cmd, notify);
}
#endif /* INET6 */


/*
 * Following is where TCP initial sequence number generation occurs.
 *
 * There are two places where we must use initial sequence numbers:
 * 1.  In SYN-ACK packets.
 * 2.  In SYN packets.
 *
 * All ISNs for SYN-ACK packets are generated by the syncache.  See
 * tcp_syncache.c for details.
 *
 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
 * depends on this property.  In addition, these ISNs should be
 * unguessable so as to prevent connection hijacking.  To satisfy
 * the requirements of this situation, the algorithm outlined in
 * RFC 1948 is used to generate sequence numbers.
 *
 * Implementation details:
 *
 * Time is based off the system timer, and is corrected so that it
 * increases by one megabyte per second.  This allows for proper
 * recycling on high speed LANs while still leaving over an hour
 * before rollover.
 *
 * net.inet.tcp.isn_reseed_interval controls the number of seconds
 * between seeding of isn_secret.  This is normally set to zero,
 * as reseeding should not be necessary.
 *
 */

#define	ISN_BYTES_PER_SECOND 1048576

u_char isn_secret[32];
int isn_last_reseed;
MD5_CTX isn_ctx;

tcp_seq
tcp_new_isn(tp)
	struct tcpcb *tp;
{
	u_int32_t md5_buffer[4];
	tcp_seq new_isn;

	/* Seed if this is the first use, reseed if requested. */
	if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) &&
	    (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
		< (u_int)ticks))) {
		read_random(&isn_secret, sizeof(isn_secret));
		isn_last_reseed = ticks;
	}

	/* Compute the md5 hash and return the ISN. */
	MD5Init(&isn_ctx);
	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
#ifdef INET6
	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
		    sizeof(struct in6_addr));
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
		    sizeof(struct in6_addr));
	} else
#endif
	{
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
		    sizeof(struct in_addr));
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
		    sizeof(struct in_addr));
	}
	MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
	MD5Final((u_char *) &md5_buffer, &isn_ctx);
	new_isn = (tcp_seq) md5_buffer[0];
	new_isn += ticks * (ISN_BYTES_PER_SECOND / hz);
	return new_isn;
}
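
/*
 * Illustrative summary (not additional logic): the ISN computed above is
 * effectively
 *
 *	ISN = MD5(fport, lport, faddr, laddr, secret)[31:0]
 *	      + ticks * (ISN_BYTES_PER_SECOND / hz)
 *
 * Assuming hz = 100, each tick advances the ISN space by 10485 bytes,
 * i.e. one megabyte per second, so the 2^32 sequence space rolls over
 * in about 4096 seconds (roughly 68 minutes), matching the "over an
 * hour before rollover" claim in the comment above.
 */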

/*
 * When a source quench is received, close congestion window
 * to one segment.  We will gradually open it again as we proceed.
 */
struct inpcb *
tcp_quench(inp, errno)
	struct inpcb *inp;
	int errno;
{
	struct tcpcb *tp = intotcpcb(inp);

	if (tp)
		tp->snd_cwnd = tp->t_maxseg;
	return (inp);
}

/*
 * When a specific ICMP unreachable message is received and the
 * connection state is SYN-SENT, drop the connection.  This behavior
 * is controlled by the icmp_may_rst sysctl.
 */
struct inpcb *
tcp_drop_syn_sent(inp, errno)
	struct inpcb *inp;
	int errno;
{
	struct tcpcb *tp = intotcpcb(inp);

	if (tp && tp->t_state == TCPS_SYN_SENT) {
		tcp_drop(tp, errno);
		return (struct inpcb *)0;
	}
	return inp;
}

/*
 * When `need fragmentation' ICMP is received, update our idea of the MSS
 * based on the new value in the route.  Also nudge TCP to send something,
 * since we know the packet we just sent was dropped.
 * This duplicates some code in the tcp_mss() function in tcp_input.c.
 */
struct inpcb *
tcp_mtudisc(inp, errno)
	struct inpcb *inp;
	int errno;
{
	struct tcpcb *tp = intotcpcb(inp);
	struct rtentry *rt;
	struct rmxp_tao *taop;
	struct socket *so = inp->inp_socket;
	int offered;
	int mss;
#ifdef INET6
	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */

	if (tp) {
#ifdef INET6
		if (isipv6)
			rt = tcp_rtlookup6(&inp->inp_inc);
		else
#endif /* INET6 */
		rt = tcp_rtlookup(&inp->inp_inc);
		if (!rt || !rt->rt_rmx.rmx_mtu) {
			tp->t_maxopd = tp->t_maxseg =
#ifdef INET6
				isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
				tcp_mssdflt;
			return inp;
		}
		taop = rmx_taop(rt->rt_rmx);
		offered = taop->tao_mssopt;
		mss = rt->rt_rmx.rmx_mtu -
#ifdef INET6
			(isipv6 ?
			 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
#endif /* INET6 */
			 sizeof(struct tcpiphdr)
#ifdef INET6
			)
#endif /* INET6 */
			;

		if (offered)
			mss = min(mss, offered);
		/*
		 * XXX - The above conditional probably violates the TCP
		 * spec.  The problem is that, since we don't know the
		 * other end's MSS, we are supposed to use a conservative
		 * default.  But, if we do that, then MTU discovery will
		 * never actually take place, because the conservative
		 * default is much less than the MTUs typically seen
		 * on the Internet today.  For the moment, we'll sweep
		 * this under the carpet.
		 *
		 * The conservative default might not actually be a problem
		 * if the only case this occurs is when sending an initial
		 * SYN with options and data to a host we've never talked
		 * to before.  Then, they will reply with an MSS value which
		 * will get recorded and the new parameters should get
		 * recomputed.  For Further Study.
		 */
		if (tp->t_maxopd <= mss)
			return inp;
		tp->t_maxopd = mss;

		if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
		    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
			mss -= TCPOLEN_TSTAMP_APPA;
		if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
		    (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)
			mss -= TCPOLEN_CC_APPA;
#if	(MCLBYTES & (MCLBYTES - 1)) == 0
		if (mss > MCLBYTES)
			mss &= ~(MCLBYTES-1);
#else
		if (mss > MCLBYTES)
			mss = mss / MCLBYTES * MCLBYTES;
#endif
		if (so->so_snd.sb_hiwat < mss)
			mss = so->so_snd.sb_hiwat;

		tp->t_maxseg = mss;

		tcpstat.tcps_mturesent++;
		tp->t_rtttime = 0;
		tp->snd_nxt = tp->snd_una;
		tcp_output(tp);
	}
	return inp;
}
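
/*
 * Worked example (illustrative): on an Ethernet path with
 * rmx_mtu = 1500, the IPv4 branch above yields
 *
 *	mss = 1500 - sizeof(struct tcpiphdr) = 1500 - 40 = 1460
 *
 * and if both sides negotiated RFC 1323 timestamps, another
 * TCPOLEN_TSTAMP_APPA (12) bytes are shaved off, leaving 1448 bytes of
 * user data per segment.  1460 is below MCLBYTES, so the mbuf-cluster
 * rounding leaves it untouched.
 */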
1426 */ 1427 if (tp->t_maxopd <= mss) 1428 return inp; 1429 tp->t_maxopd = mss; 1430 1431 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 1432 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) 1433 mss -= TCPOLEN_TSTAMP_APPA; 1434 if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC && 1435 (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC) 1436 mss -= TCPOLEN_CC_APPA; 1437 #if (MCLBYTES & (MCLBYTES - 1)) == 0 1438 if (mss > MCLBYTES) 1439 mss &= ~(MCLBYTES-1); 1440 #else 1441 if (mss > MCLBYTES) 1442 mss = mss / MCLBYTES * MCLBYTES; 1443 #endif 1444 if (so->so_snd.sb_hiwat < mss) 1445 mss = so->so_snd.sb_hiwat; 1446 1447 tp->t_maxseg = mss; 1448 1449 tcpstat.tcps_mturesent++; 1450 tp->t_rtttime = 0; 1451 tp->snd_nxt = tp->snd_una; 1452 tcp_output(tp); 1453 } 1454 return inp; 1455 } 1456 1457 /* 1458 * Look-up the routing entry to the peer of this inpcb. If no route 1459 * is found and it cannot be allocated, then return NULL. This routine 1460 * is called by TCP routines that access the rmx structure and by tcp_mss 1461 * to get the interface MTU. 1462 */ 1463 struct rtentry * 1464 tcp_rtlookup(inc) 1465 struct in_conninfo *inc; 1466 { 1467 struct route *ro; 1468 struct rtentry *rt; 1469 1470 ro = &inc->inc_route; 1471 rt = ro->ro_rt; 1472 if (rt == NULL || !(rt->rt_flags & RTF_UP)) { 1473 /* No route yet, so try to acquire one */ 1474 if (inc->inc_faddr.s_addr != INADDR_ANY) { 1475 ro->ro_dst.sa_family = AF_INET; 1476 ro->ro_dst.sa_len = sizeof(struct sockaddr_in); 1477 ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = 1478 inc->inc_faddr; 1479 rtalloc(ro); 1480 rt = ro->ro_rt; 1481 } 1482 } 1483 return rt; 1484 } 1485 1486 #ifdef INET6 1487 struct rtentry * 1488 tcp_rtlookup6(inc) 1489 struct in_conninfo *inc; 1490 { 1491 struct route_in6 *ro6; 1492 struct rtentry *rt; 1493 1494 ro6 = &inc->inc6_route; 1495 rt = ro6->ro_rt; 1496 if (rt == NULL || !(rt->rt_flags & RTF_UP)) { 1497 /* No route yet, so try to acquire one */ 1498 if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { 1499 ro6->ro_dst.sin6_family = AF_INET6; 1500 ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6); 1501 ro6->ro_dst.sin6_addr = inc->inc6_faddr; 1502 rtalloc((struct route *)ro6); 1503 rt = ro6->ro_rt; 1504 } 1505 } 1506 return rt; 1507 } 1508 #endif /* INET6 */ 1509 1510 #ifdef IPSEC 1511 /* compute ESP/AH header size for TCP, including outer IP header. */ 1512 size_t 1513 ipsec_hdrsiz_tcp(tp) 1514 struct tcpcb *tp; 1515 { 1516 struct inpcb *inp; 1517 struct mbuf *m; 1518 size_t hdrsiz; 1519 struct ip *ip; 1520 #ifdef INET6 1521 struct ip6_hdr *ip6; 1522 #endif 1523 struct tcphdr *th; 1524 1525 if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) 1526 return 0; 1527 MGETHDR(m, M_DONTWAIT, MT_DATA); 1528 if (!m) 1529 return 0; 1530 1531 #ifdef INET6 1532 if ((inp->inp_vflag & INP_IPV6) != 0) { 1533 ip6 = mtod(m, struct ip6_hdr *); 1534 th = (struct tcphdr *)(ip6 + 1); 1535 m->m_pkthdr.len = m->m_len = 1536 sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 1537 tcpip_fillheaders(inp, ip6, th); 1538 hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); 1539 } else 1540 #endif /* INET6 */ 1541 { 1542 ip = mtod(m, struct ip *); 1543 th = (struct tcphdr *)(ip + 1); 1544 m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); 1545 tcpip_fillheaders(inp, ip, th); 1546 hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); 1547 } 1548 1549 m_free(m); 1550 return hdrsiz; 1551 } 1552 #endif /*IPSEC*/ 1553 1554 /* 1555 * Return a pointer to the cached information about the remote host. 
 * The cached information is stored in the protocol specific part of
 * the route metrics.
 */
struct rmxp_tao *
tcp_gettaocache(inc)
	struct in_conninfo *inc;
{
	struct rtentry *rt;

#ifdef INET6
	if (inc->inc_isipv6)
		rt = tcp_rtlookup6(inc);
	else
#endif /* INET6 */
	rt = tcp_rtlookup(inc);

	/* Make sure this is a host route and is up. */
	if (rt == NULL ||
	    (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
		return NULL;

	return rmx_taop(rt->rt_rmx);
}

/*
 * Clear all the TAO cache entries, called from tcp_init.
 *
 * XXX
 * This routine is just an empty one, because we assume that the
 * routing tables are initialized at the same time as TCP, so there
 * is nothing in the cache left over.
 */
static void
tcp_cleartaocache()
{
}

/*
 * Move a TCP connection into TIME_WAIT state.
 *	tcbinfo is unlocked.
 *	inp is locked, and is unlocked before returning.
 */
void
tcp_twstart(tp)
	struct tcpcb *tp;
{
	struct tcptw_mem *tm;
	struct tcptw *tw;
	struct inpcb *inp;
	int tw_time, acknow;
	struct socket *so;

	tm = uma_zalloc(tcptw_zone, M_NOWAIT);
	if (tm == NULL)
		/* EEEK! -- preserve old structure or just kill everything? */
		/* must obtain tcbinfo lock in order to drop the structure. */
		panic("uma_zalloc(tcptw)");
	tw = &tm->tw;
	inp = tp->t_inpcb;
	tw->tw_inpcb = inp;

	/*
	 * Recover last window size sent.
	 */
	tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale;

	/*
	 * Set t_recent if timestamps are used on the connection.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
	    (TF_REQ_TSTMP|TF_RCVD_TSTMP))
		tw->t_recent = tp->ts_recent;
	else
		tw->t_recent = 0;

	tw->snd_nxt = tp->snd_nxt;
	tw->rcv_nxt = tp->rcv_nxt;
	tw->cc_recv = tp->cc_recv;
	tw->cc_send = tp->cc_send;
	tw->t_starttime = tp->t_starttime;
	callout_init(tw->tt_2msl = &tm->tcptw_mem_2msl, 0);

	/* XXX
	 * If this code will
	 * be used for fin-wait-2 state also, then we may need
	 * a ts_recent from the last segment.
	 */
	/* Shorten TIME_WAIT [RFC-1644, p.28] */
	if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) {
		tw_time = tp->t_rxtcur * TCPTV_TWTRUNC;
		/* For T/TCP client, force ACK now. */
		acknow = 1;
	} else {
		tw_time = 2 * tcp_msl;
		acknow = tp->t_flags & TF_ACKNOW;
	}
	tcp_discardcb(tp);
	so = inp->inp_socket;
	so->so_pcb = NULL;
	tw->tw_cred = crhold(so->so_cred);
	tw->tw_so_options = so->so_options;
	sotryfree(so);
	inp->inp_socket = NULL;
	inp->inp_ppcb = (caddr_t)tw;
	inp->inp_vflag |= INP_TIMEWAIT;
	callout_reset(tw->tt_2msl, tw_time, tcp_timer_2msl_tw, tw);
	if (acknow)
		tcp_twrespond(tw, TH_ACK);
	INP_UNLOCK(inp);
}

void
tcp_twclose(tw)
	struct tcptw *tw;
{
	struct inpcb *inp;

	inp = tw->tw_inpcb;
	tw->tw_inpcb = NULL;
	callout_stop(tw->tt_2msl);
	inp->inp_ppcb = NULL;
	uma_zfree(tcptw_zone, tw);
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO)
		in6_pcbdetach(inp);
	else
#endif
		in_pcbdetach(inp);
	tcpstat.tcps_closed++;
}
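
/*
 * For reference (an illustration, not new protocol logic): the
 * timestamp block that tcp_twrespond() below emits is the 12-byte
 * "appendix A" layout from RFC 1323,
 *
 *	NOP  NOP  kind=8  len=10  TSval(4)  TSecr(4)
 *
 * which is why TCPOLEN_TSTAMP_APPA bytes, rather than the bare
 * TCPOLEN_TIMESTAMP, are added to optp after the three 32-bit stores.
 */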
int
tcp_twrespond(struct tcptw *tw, int flags)
{
	struct inpcb *inp = tw->tw_inpcb;
	struct tcphdr *th;
	struct mbuf *m;
	struct ip *ip = NULL;
	u_int8_t *optp;
	u_int hdrlen, optlen;
	int error;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
	int isipv6 = inp->inp_inc.inc_isipv6;
#endif

	m = m_gethdr(M_DONTWAIT, MT_HEADER);
	if (m == NULL)
		return (ENOBUFS);
	m->m_data += max_linkhdr;

#ifdef INET6
	if (isipv6) {
		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)(ip6 + 1);
		tcpip_fillheaders(inp, ip6, th);
	} else
#endif
	{
		hdrlen = sizeof(struct tcpiphdr);
		ip = mtod(m, struct ip *);
		th = (struct tcphdr *)(ip + 1);
		tcpip_fillheaders(inp, ip, th);
	}
	optp = (u_int8_t *)(th + 1);

	/*
	 * Send a timestamp and echo-reply if both our side and our peer
	 * have sent timestamps in our SYN's and this is not a RST.
	 */
	if (tw->t_recent && flags == TH_ACK) {
		u_int32_t *lp = (u_int32_t *)optp;

		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(ticks);
		*lp   = htonl(tw->t_recent);
		optp += TCPOLEN_TSTAMP_APPA;
	}

	/*
	 * Send `CC-family' options if needed, and it's not a RST.
	 */
	if (tw->cc_recv != 0 && flags == TH_ACK) {
		u_int32_t *lp = (u_int32_t *)optp;

		*lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CC));
		*lp   = htonl(tw->cc_send);
		optp += TCPOLEN_CC_APPA;
	}
	optlen = optp - (u_int8_t *)(th + 1);

	m->m_len = hdrlen + optlen;
	m->m_pkthdr.len = m->m_len;

	KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small"));

	th->th_seq = htonl(tw->snd_nxt);
	th->th_ack = htonl(tw->rcv_nxt);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = flags;
	th->th_win = htons(tw->last_win);

#ifdef INET6
	if (isipv6) {
		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
		    sizeof(struct tcphdr) + optlen);
		ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ?
		    inp->in6p_route.ro_rt->rt_ifp : NULL);
		error = ip6_output(m, inp->in6p_outputopts, &inp->in6p_route,
		    (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
	} else
#endif
	{
		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP));
		m->m_pkthdr.csum_flags = CSUM_TCP;
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
		ip->ip_len = m->m_pkthdr.len;
		error = ip_output(m, inp->inp_options, &inp->inp_route,
		    (tw->tw_so_options & SO_DONTROUTE), NULL, inp);
	}
	if (flags & TH_ACK)
		tcpstat.tcps_sndacks++;
	else
		tcpstat.tcps_sndctrl++;
	tcpstat.tcps_sndtotal++;
	return (error);
}

/*
 * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
 *
 * This code attempts to calculate the bandwidth-delay product as a
 * means of determining the optimal window size to maximize bandwidth,
 * minimize RTT, and avoid the over-allocation of buffers on interfaces and
 * routers.  This code also does a fairly good job keeping RTTs in check
 * across slow links like modems.  We implement an algorithm which is very
 * similar to (but not meant to be identical to) TCP/Vegas.  The code
 * operates on the transmitter side of a TCP connection and so only affects
 * the transmit side of the connection.
 *
 * BACKGROUND: TCP makes no provision for the management of buffer space
 * at the end points or at the intermediate routers and switches.  A TCP
 * stream, whether using NewReno or not, will eventually buffer as
 * many packets as it is able and the only reason this typically works is
 * due to the fairly small default buffers made available for a connection
 * (typically 16K or 32K).  As machines use larger windows and/or window
 * scaling it is now fairly easy for even a single TCP connection to blow-out
 * all available buffer space not only on the local interface, but on
 * intermediate routers and switches as well.  NewReno makes a misguided
 * attempt to 'solve' this problem by waiting for an actual failure to occur,
 * then backing off, then steadily increasing the window again until another
 * failure occurs, ad-infinitum.  This results in terrible oscillation that
 * is only made worse as network loads increase and the idea of intentionally
 * blowing out network buffers is, frankly, a terrible way to manage network
 * resources.
 *
 * It is far better to limit the transmit window prior to the failure
 * condition being achieved.  There are two general ways to do this:  First
 * you can 'scan' through different transmit window sizes and locate the
 * point where the RTT stops increasing, indicating that you have filled the
 * pipe, then scan backwards until you note that RTT stops decreasing, then
 * repeat ad-infinitum.  This method works in principle but has severe
 * implementation issues due to RTT variances, timer granularity, and
 * instability in the algorithm which can lead to many false positives and
 * create oscillations as well as interact badly with other TCP streams
 * implementing the same algorithm.
 *
 * The second method is to limit the window to the bandwidth delay product
 * of the link.  This is the method we implement.  RTT variances and our
 * own manipulation of the congestion window, bwnd, can potentially
 * destabilize the algorithm.
 * For this reason we have to stabilize the elements used to calculate
 * the window.  We do this by using the minimum observed RTT, the long
 * term average of the observed bandwidth, and by adding two segments
 * worth of slop.  It isn't perfect but it is able to react to changing
 * conditions and gives us a very stable basis on which to extend the
 * algorithm.
 */
void
tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
{
	u_long bw;
	u_long bwnd;
	int save_ticks;

	/*
	 * If inflight_enable is disabled in the middle of a tcp connection,
	 * make sure snd_bwnd is effectively disabled.
	 */
	if (tcp_inflight_enable == 0) {
		tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
		tp->snd_bandwidth = 0;
		return;
	}

	/*
	 * Figure out the bandwidth.  Due to the tick granularity this
	 * is a very rough number and it MUST be averaged over a fairly
	 * long period of time.  XXX we need to take into account a link
	 * that is not using all available bandwidth, but for now our
	 * slop will ramp us up if this case occurs and the bandwidth later
	 * increases.
	 *
	 * Note: if ticks rollover 'bw' may wind up negative.  We must
	 * effectively reset t_bw_rtttime for this case.
	 */
	save_ticks = ticks;
	if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
		return;

	bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
	    (save_ticks - tp->t_bw_rtttime);
	tp->t_bw_rtttime = save_ticks;
	tp->t_bw_rtseq = ack_seq;
	if (tp->t_bw_rtttime == 0 || (int)bw < 0)
		return;
	bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;

	tp->snd_bandwidth = bw;

	/*
	 * Calculate the semi-static bandwidth delay product, plus two maximal
	 * segments.  The additional slop puts us squarely in the sweet
	 * spot and also handles the bandwidth run-up case and stabilization.
	 * Without the slop we could be locking ourselves into a lower
	 * bandwidth.
	 *
	 * Situations Handled:
	 *	(1) Prevents over-queueing of packets on LANs, especially on
	 *	    high speed LANs, allowing larger TCP buffers to be
	 *	    specified, and also does a good job preventing
	 *	    over-queueing of packets over choke points like modems
	 *	    (at least for the transmit side).
	 *
	 *	(2) Is able to handle changing network loads (bandwidth
	 *	    drops so bwnd drops, bandwidth increases so bwnd
	 *	    increases).
	 *
	 *	(3) Theoretically should stabilize in the face of multiple
	 *	    connections implementing the same algorithm (this may need
	 *	    a little work).
	 *
	 *	(4) Stability value (defaults to 20 = 2 maximal packets) can
	 *	    be adjusted with a sysctl but typically only needs to be
	 *	    adjusted on very slow connections.  A value no smaller
	 *	    than 5 should be used, but only reduce this default if
	 *	    you have no other choice.
	 */
#define	USERTT	((tp->t_srtt + tp->t_rttbest) / 2)
	bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) +
	    tcp_inflight_stab * tp->t_maxseg / 10;
#undef USERTT

	if (tcp_inflight_debug > 0) {
		static int ltime;
		if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
			ltime = ticks;
			printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
			    tp,
			    bw,
			    tp->t_rttbest,
			    tp->t_srtt,
			    bwnd
			);
		}
	}
	if ((long)bwnd < tcp_inflight_min)
		bwnd = tcp_inflight_min;
	if (bwnd > tcp_inflight_max)
		bwnd = tcp_inflight_max;
	if ((long)bwnd < tp->t_maxseg * 2)
		bwnd = tp->t_maxseg * 2;
	tp->snd_bwnd = bwnd;
}
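
/*
 * Worked example for the bwnd computation above (illustrative, assuming
 * hz = 100 and TCP_RTT_SHIFT = 5): with a long-term bandwidth estimate
 * bw of 1,000,000 bytes/sec, a USERTT corresponding to 100 ms (10 ticks,
 * or 320 in scaled units), t_maxseg = 1460, and the default
 * tcp_inflight_stab of 20:
 *
 *	bwnd = 1000000 * 320 / (100 << 5) + 20 * 1460 / 10
 *	     = 100000 + 2920
 *	     = 102920 bytes
 *
 * i.e. the classical bandwidth-delay product (100 KB) plus two maximal
 * segments of slop, exactly as the comment block above describes.
 */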