/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
 * $FreeBSD$
 */

#include "opt_compat.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#ifdef INET6
#include <sys/domain.h>
#endif
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/random.h>

#include <vm/uma.h>

#include <net/route.h>
#include <net/if.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#ifdef INET6
#include <netinet/ip6.h>
#endif
#include <netinet/in_pcb.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#include <netinet6/ip6protosw.h>

#ifdef IPSEC
#include <netinet6/ipsec.h>
#ifdef INET6
#include <netinet6/ipsec6.h>
#endif
#endif /* IPSEC */

#ifdef FAST_IPSEC
#include <netipsec/ipsec.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#define	IPSEC
#endif /* FAST_IPSEC */

#include <machine/in_cksum.h>
#include <sys/md5.h>

int	tcp_mssdflt = TCP_MSS;
SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
    &tcp_mssdflt, 0, "Default TCP Maximum Segment Size");

#ifdef INET6
int	tcp_v6mssdflt = TCP6_MSS;
SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
    CTLFLAG_RW, &tcp_v6mssdflt, 0,
    "Default TCP Maximum Segment Size for IPv6");
#endif

#if 0
static int	tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
    &tcp_rttdflt, 0, "Default maximum TCP Round Trip Time");
#endif

int	tcp_do_rfc1323 = 1;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
    &tcp_do_rfc1323, 0, "Enable rfc1323 (high performance TCP) extensions");

int	tcp_do_rfc1644 = 0;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW,
    &tcp_do_rfc1644, 0, "Enable rfc1644 (TTCP) extensions");

static int	tcp_tcbhashsize = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD,
    &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");

static int	do_tcpdrain = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
    "Enable tcp_drain routine for extra help when low on mbufs");

SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
    &tcbinfo.ipi_count, 0, "Number of active PCBs");

static int	icmp_may_rst = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0,
    "Certain ICMP unreachable messages may abort connections in SYN_SENT");

static int	tcp_isn_reseed_interval = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
    &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
/*
 * TCP bandwidth limiting sysctls.  Note that the default lower bound of
 * 1024 exists only for debugging.  A good production default would be
 * something like 6100.
 */
static int	tcp_inflight_enable = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW,
    &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");

static int	tcp_inflight_debug = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW,
    &tcp_inflight_debug, 0, "Debug TCP inflight calculations");

static int	tcp_inflight_min = 6144;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW,
    &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");

static int	tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW,
    &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");

static int	tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
    &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");

static void	tcp_cleartaocache(void);
static struct inpcb *tcp_notify(struct inpcb *, int);

/*
 * Target size of TCP PCB hash tables.  Must be a power of two.
 *
 * Note that this can be overridden by the kernel environment
 * variable net.inet.tcp.tcbhashsize.
 */
#ifndef TCBHASHSIZE
#define	TCBHASHSIZE	512
#endif
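/*
 * Illustrative note (added for clarity, not part of the original source):
 * since tcp_init() fetches this value with TUNABLE_INT_FETCH(), it can be
 * set from the loader before the kernel starts, e.g. in /boot/loader.conf:
 *
 *	net.inet.tcp.tcbhashsize="2048"
 *
 * The value must be a power of two; otherwise tcp_init() warns and falls
 * back to 512.
 */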
/*
 * This is the actual shape of what we allocate using the zone
 * allocator.  Doing it this way allows us to protect both structures
 * using the same generation count, and also eliminates the overhead
 * of allocating tcpcbs separately.  By hiding the structure here,
 * we avoid changing most of the rest of the code (although it needs
 * to be changed, eventually, for greater efficiency).
 */
#define	ALIGNMENT	32
#define	ALIGNM1		(ALIGNMENT - 1)
struct	inp_tp {
	union {
		struct	inpcb inp;
		char	align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1];
	} inp_tp_u;
	struct	tcpcb tcb;
	struct	callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl;
	struct	callout inp_tp_delack;
};
#undef ALIGNMENT
#undef ALIGNM1

/*
 * Tcp initialization
 */
void
tcp_init()
{
	int hashsize = TCBHASHSIZE;

	tcp_ccgen = 1;
	tcp_cleartaocache();

	tcp_delacktime = TCPTV_DELACK;
	tcp_keepinit = TCPTV_KEEP_INIT;
	tcp_keepidle = TCPTV_KEEP_IDLE;
	tcp_keepintvl = TCPTV_KEEPINTVL;
	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
	tcp_msl = TCPTV_MSL;
	tcp_rexmit_min = TCPTV_MIN;
	tcp_rexmit_slop = TCPTV_CPU_VAR;

	INP_INFO_LOCK_INIT(&tcbinfo, "tcp");
	LIST_INIT(&tcb);
	tcbinfo.listhead = &tcb;
	TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
	if (!powerof2(hashsize)) {
		printf("WARNING: TCB hash size not a power of 2\n");
		hashsize = 512; /* safe default */
	}
	tcp_tcbhashsize = hashsize;
	tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
	tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
	    &tcbinfo.porthashmask);
	tcbinfo.ipi_zone = uma_zcreate("tcpcb", sizeof(struct inp_tp),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(tcbinfo.ipi_zone, maxsockets);
#ifdef INET6
#define	TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
#else /* INET6 */
#define	TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
#endif /* INET6 */
	if (max_protohdr < TCP_MINPROTOHDR)
		max_protohdr = TCP_MINPROTOHDR;
	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
		panic("tcp_init");
#undef TCP_MINPROTOHDR

	syncache_init();
}
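/*
 * Worked example of the TCP_MINPROTOHDR check above (added for clarity,
 * not in the original source): an ip6_hdr is 40 bytes and a tcphdr is 20,
 * so with INET6 the reserved minimum is 60 bytes; without INET6, a
 * tcpiphdr (the overlaid IPv4 + TCP header pair) is 40 bytes.  The panic
 * fires only if a link-layer header plus that minimum cannot fit in a
 * single packet-header mbuf (MHLEN).
 */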
/*
 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
 * tcp_template used to store this data in mbufs, but we now recopy it out
 * of the tcpcb each time to conserve mbufs.
 */
void
tcp_fillheaders(tp, ip_ptr, tcp_ptr)
	struct tcpcb *tp;
	void *ip_ptr;
	void *tcp_ptr;
{
	struct inpcb *inp = tp->t_inpcb;
	struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr;

#ifdef INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		struct ip6_hdr *ip6;

		ip6 = (struct ip6_hdr *)ip_ptr;
		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
		    (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
		    (IPV6_VERSION & IPV6_VERSION_MASK);
		ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_plen = sizeof(struct tcphdr);
		ip6->ip6_src = inp->in6p_laddr;
		ip6->ip6_dst = inp->in6p_faddr;
		tcp_hdr->th_sum = 0;
	} else
#endif
	{
		struct ip *ip = (struct ip *)ip_ptr;

		ip->ip_v = IPVERSION;
		ip->ip_hl = 5;
		ip->ip_tos = 0;
		ip->ip_len = 0;
		ip->ip_id = 0;
		ip->ip_off = 0;
		ip->ip_ttl = 0;
		ip->ip_sum = 0;
		ip->ip_p = IPPROTO_TCP;
		ip->ip_src = inp->inp_laddr;
		ip->ip_dst = inp->inp_faddr;
		tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr,
		    htons(sizeof(struct tcphdr) + IPPROTO_TCP));
	}

	tcp_hdr->th_sport = inp->inp_lport;
	tcp_hdr->th_dport = inp->inp_fport;
	tcp_hdr->th_seq = 0;
	tcp_hdr->th_ack = 0;
	tcp_hdr->th_x2 = 0;
	tcp_hdr->th_off = 5;
	tcp_hdr->th_flags = 0;
	tcp_hdr->th_win = 0;
	tcp_hdr->th_urp = 0;
}

/*
 * Create template to be used to send tcp packets on a connection.
 * Allocates an mbuf and fills in a skeletal tcp/ip header.  The only
 * use for this function is in keepalives, which use tcp_respond.
 */
struct tcptemp *
tcp_maketemplate(tp)
	struct tcpcb *tp;
{
	struct mbuf *m;
	struct tcptemp *n;

	m = m_get(M_NOWAIT, MT_HEADER);
	if (m == NULL)
		return (0);
	m->m_len = sizeof(struct tcptemp);
	n = mtod(m, struct tcptemp *);

	tcp_fillheaders(tp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
	return (n);
}
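/*
 * Usage sketch (illustrative, not part of the original file): the
 * keepalive timer builds a template and then forces a probe out with
 * tcp_respond(), roughly:
 *
 *	struct tcptemp *t = tcp_maketemplate(tp);
 *	if (t != NULL)
 *		tcp_respond(tp, t->tt_ipgen, &t->tt_t, NULL,
 *		    tp->rcv_nxt, tp->snd_una - 1, 0);
 *
 * The snd_una - 1 sequence number provokes an ACK from the peer without
 * carrying any new data.
 */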
/*
 * Send a single message to the TCP at address specified by
 * the given TCP/IP header.  If m == 0, then we make a copy
 * of the tcpiphdr at ti and send directly to the addressed host.
 * This is used to force keep alive messages out using the TCP
 * template for a connection.  If flags are given then we send
 * a message back to the TCP which originated the segment ti,
 * and discard the mbuf containing it and any other attached mbufs.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 *
 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
 */
void
tcp_respond(tp, ipgen, th, m, ack, seq, flags)
	struct tcpcb *tp;
	void *ipgen;
	register struct tcphdr *th;
	register struct mbuf *m;
	tcp_seq ack, seq;
	int flags;
{
	register int tlen;
	int win = 0;
	struct route *ro = 0;
	struct route sro;
	struct ip *ip;
	struct tcphdr *nth;
#ifdef INET6
	struct route_in6 *ro6 = 0;
	struct route_in6 sro6;
	struct ip6_hdr *ip6;
	int isipv6;
#endif /* INET6 */
	int ipflags = 0;

	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));

#ifdef INET6
	isipv6 = ((struct ip *)ipgen)->ip_v == 6;
	ip6 = ipgen;
#endif /* INET6 */
	ip = ipgen;

	if (tp) {
		if (!(flags & TH_RST)) {
			win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
			if (win > (long)TCP_MAXWIN << tp->rcv_scale)
				win = (long)TCP_MAXWIN << tp->rcv_scale;
		}
#ifdef INET6
		if (isipv6)
			ro6 = &tp->t_inpcb->in6p_route;
		else
#endif /* INET6 */
		ro = &tp->t_inpcb->inp_route;
	} else {
#ifdef INET6
		if (isipv6) {
			ro6 = &sro6;
			bzero(ro6, sizeof *ro6);
		} else
#endif /* INET6 */
		{
			ro = &sro;
			bzero(ro, sizeof *ro);
		}
	}
	if (m == 0) {
		m = m_gethdr(M_NOWAIT, MT_HEADER);
		if (m == NULL)
			return;
		tlen = 0;
		m->m_data += max_linkhdr;
#ifdef INET6
		if (isipv6) {
			bcopy((caddr_t)ip6, mtod(m, caddr_t),
			    sizeof(struct ip6_hdr));
			ip6 = mtod(m, struct ip6_hdr *);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
		{
			bcopy((caddr_t)ip, mtod(m, caddr_t),
			    sizeof(struct ip));
			ip = mtod(m, struct ip *);
			nth = (struct tcphdr *)(ip + 1);
		}
		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
		flags = TH_ACK;
	} else {
		m_freem(m->m_next);
		m->m_next = 0;
		m->m_data = (caddr_t)ipgen;
		/* m_len is set later */
		tlen = 0;
#define	xchg(a, b, type) { type t; t = a; a = b; b = t; }
#ifdef INET6
		if (isipv6) {
			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
		{
			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
			nth = (struct tcphdr *)(ip + 1);
		}
		if (th != nth) {
			/*
			 * this is usually a case when an extension header
			 * exists between the IPv6 header and the
			 * TCP header.
			 */
			nth->th_sport = th->th_sport;
			nth->th_dport = th->th_dport;
		}
		xchg(nth->th_dport, nth->th_sport, n_short);
#undef xchg
	}
#ifdef INET6
	if (isipv6) {
		ip6->ip6_flow = 0;
		ip6->ip6_vfc = IPV6_VERSION;
		ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
		    tlen));
		tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
	} else
#endif
	{
		tlen += sizeof (struct tcpiphdr);
		ip->ip_len = tlen;
		ip->ip_ttl = ip_defttl;
	}
	m->m_len = tlen;
	m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = (struct ifnet *)0;
#ifdef MAC
	if (tp != NULL) {
		/*
		 * Packet is associated with a socket, so allow the
		 * label of the response to reflect the socket label.
		 */
		mac_create_mbuf_from_socket(tp->t_inpcb->inp_socket, m);
	} else {
		/*
		 * XXXMAC: This will need to call a mac function that
		 * modifies the mbuf label in place for TCP datagrams
		 * not associated with a PCB.
		 */
	}
#endif
	nth->th_seq = htonl(seq);
	nth->th_ack = htonl(ack);
	nth->th_x2 = 0;
	nth->th_off = sizeof (struct tcphdr) >> 2;
	nth->th_flags = flags;
	if (tp)
		nth->th_win = htons((u_short)(win >> tp->rcv_scale));
	else
		nth->th_win = htons((u_short)win);
	nth->th_urp = 0;
#ifdef INET6
	if (isipv6) {
		nth->th_sum = 0;
		nth->th_sum = in6_cksum(m, IPPROTO_TCP,
		    sizeof(struct ip6_hdr),
		    tlen - sizeof(struct ip6_hdr));
		ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
		    ro6 && ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL);
	} else
#endif /* INET6 */
	{
		nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
		m->m_pkthdr.csum_flags = CSUM_TCP;
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
	}
#ifdef TCPDEBUG
	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
#ifdef INET6
	if (isipv6) {
		(void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL,
		    tp ? tp->t_inpcb : NULL);
		if (ro6 == &sro6 && ro6->ro_rt) {
			RTFREE(ro6->ro_rt);
			ro6->ro_rt = NULL;
		}
	} else
#endif /* INET6 */
	{
		(void)ip_output(m, NULL, ro, ipflags, NULL,
		    tp ? tp->t_inpcb : NULL);
		if (ro == &sro && ro->ro_rt) {
			RTFREE(ro->ro_rt);
			ro->ro_rt = NULL;
		}
	}
}

/*
 * Create a new TCP control block, making an
 * empty reassembly queue and hooking it to the argument
 * protocol control block.  The `inp' parameter must have
 * come from the zone allocator set up in tcp_init().
 */
struct tcpcb *
tcp_newtcpcb(inp)
	struct inpcb *inp;
{
	struct inp_tp *it;
	register struct tcpcb *tp;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */

	it = (struct inp_tp *)inp;
	tp = &it->tcb;
	bzero((char *)tp, sizeof(struct tcpcb));
	LIST_INIT(&tp->t_segq);
	tp->t_maxseg = tp->t_maxopd =
#ifdef INET6
	    isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
	    tcp_mssdflt;

	/* Set up our timeouts. */
	callout_init(tp->tt_rexmt = &it->inp_tp_rexmt, 0);
	callout_init(tp->tt_persist = &it->inp_tp_persist, 0);
	callout_init(tp->tt_keep = &it->inp_tp_keep, 0);
	callout_init(tp->tt_2msl = &it->inp_tp_2msl, 0);
	callout_init(tp->tt_delack = &it->inp_tp_delack, 0);

	if (tcp_do_rfc1323)
		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
	if (tcp_do_rfc1644)
		tp->t_flags |= TF_REQ_CC;
	tp->t_inpcb = inp;	/* XXX */
	/*
	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
	 * reasonable initial retransmit time.
	 */
	tp->t_srtt = TCPTV_SRTTBASE;
	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) <<
	    TCP_RTTVAR_SHIFT) / 4;
	tp->t_rttmin = tcp_rexmit_min;
	tp->t_rxtcur = TCPTV_RTOBASE;
	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->t_rcvtime = ticks;
	tp->t_bw_rtttime = ticks;
	/*
	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
	 * because the socket may be bound to an IPv6 wildcard address,
	 * which may match an IPv4-mapped IPv6 address.
	 */
	inp->inp_ip_ttl = ip_defttl;
	inp->inp_ppcb = (caddr_t)tp;
	return (tp);		/* XXX */
}

/*
 * Drop a TCP connection, reporting
 * the specified error.  If connection is synchronized,
 * then send a RST to peer.
 */
struct tcpcb *
tcp_drop(tp, errno)
	register struct tcpcb *tp;
	int errno;
{
	struct socket *so = tp->t_inpcb->inp_socket;

	if (TCPS_HAVERCVDSYN(tp->t_state)) {
		tp->t_state = TCPS_CLOSED;
		(void) tcp_output(tp);
		tcpstat.tcps_drops++;
	} else
		tcpstat.tcps_conndrops++;
	if (errno == ETIMEDOUT && tp->t_softerror)
		errno = tp->t_softerror;
	so->so_error = errno;
	return (tcp_close(tp));
}

/*
 * Close a TCP control block:
 *	discard all space held by the tcp
 *	discard internet protocol block
 *	wake up any sleepers
 */
struct tcpcb *
tcp_close(tp)
	register struct tcpcb *tp;
{
	register struct tseg_qent *q;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
	register struct rtentry *rt;
	int dosavessthresh;

	/*
	 * Make sure that all of our timers are stopped before we
	 * delete the PCB.
	 */
	callout_stop(tp->tt_rexmt);
	callout_stop(tp->tt_persist);
	callout_stop(tp->tt_keep);
	callout_stop(tp->tt_2msl);
	callout_stop(tp->tt_delack);

	/*
	 * If we got enough samples through the srtt filter,
	 * save the rtt and rttvar in the routing entry.
	 * 'Enough' is arbitrarily defined as 16 samples.
	 * 16 samples is enough for the srtt filter to converge
	 * to within 5% of the correct value; fewer samples and
	 * we could save a very bogus rtt.
	 *
	 * Don't update the default route's characteristics and don't
	 * update anything that the user "locked".
	 */
	if (tp->t_rttupdated >= 16) {
		register u_long i = 0;
#ifdef INET6
		if (isipv6) {
			struct sockaddr_in6 *sin6;

			if ((rt = inp->in6p_route.ro_rt) == NULL)
				goto no_valid_rt;
			sin6 = (struct sockaddr_in6 *)rt_key(rt);
			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
				goto no_valid_rt;
		} else
#endif /* INET6 */
		if ((rt = inp->inp_route.ro_rt) == NULL ||
		    ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
		    == INADDR_ANY)
			goto no_valid_rt;

		if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
			i = tp->t_srtt *
			    (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
			if (rt->rt_rmx.rmx_rtt && i)
				/*
				 * filter this update to half the old & half
				 * the new values, converting scale.
				 * See route.h and tcp_var.h for a
				 * description of the scaling constants.
				 */
				rt->rt_rmx.rmx_rtt =
				    (rt->rt_rmx.rmx_rtt + i) / 2;
			else
				rt->rt_rmx.rmx_rtt = i;
			tcpstat.tcps_cachedrtt++;
		}
		if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
			i = tp->t_rttvar *
			    (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
			if (rt->rt_rmx.rmx_rttvar && i)
				rt->rt_rmx.rmx_rttvar =
				    (rt->rt_rmx.rmx_rttvar + i) / 2;
			else
				rt->rt_rmx.rmx_rttvar = i;
			tcpstat.tcps_cachedrttvar++;
		}
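		/*
		 * Worked example of the conversions above (illustrative,
		 * not in the original source): t_srtt is kept in units of
		 * ticks * TCP_RTT_SCALE, while rmx_rtt is expressed in
		 * units of 1/RTM_RTTUNIT seconds (microseconds, since
		 * RTM_RTTUNIT is 1000000).  With hz = 100 and
		 * TCP_RTT_SCALE = 32, a 100 ms smoothed RTT is stored as
		 * t_srtt = 320, and 320 * (1000000 / 3200) recovers
		 * roughly 100000 usec (99840 after integer division).
		 */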
		/*
		 * The old comment here said:
		 * update the pipelimit (ssthresh) if it has been updated
		 * already or if a pipesize was specified & the threshold
		 * got below half the pipesize.  I.e., wait for bad news
		 * before we start updating, then update on both good
		 * and bad news.
		 *
		 * But we want to save the ssthresh even if no pipesize is
		 * specified explicitly in the route, because such
		 * connections still have an implicit pipesize specified
		 * by the global tcp_sendspace.  In the absence of a
		 * reliable way to calculate the pipesize, it will have
		 * to do.
		 */
		i = tp->snd_ssthresh;
		if (rt->rt_rmx.rmx_sendpipe != 0)
			dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
		else
			dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
		if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
		    i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
		    || dosavessthresh) {
			/*
			 * convert the limit from user data bytes to
			 * packets then to packet data bytes.
			 */
			i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
			if (i < 2)
				i = 2;
			i *= (u_long)(tp->t_maxseg +
#ifdef INET6
			    (isipv6 ? sizeof (struct ip6_hdr) +
			    sizeof (struct tcphdr) :
#endif
			    sizeof (struct tcpiphdr)
#ifdef INET6
			    )
#endif
			    );
			if (rt->rt_rmx.rmx_ssthresh)
				rt->rt_rmx.rmx_ssthresh =
				    (rt->rt_rmx.rmx_ssthresh + i) / 2;
			else
				rt->rt_rmx.rmx_ssthresh = i;
			tcpstat.tcps_cachedssthresh++;
		}
	}
no_valid_rt:
	/* free the reassembly queue, if any */
	while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		FREE(q, M_TSEGQ);
	}
	inp->inp_ppcb = NULL;
	tp->t_inpcb = NULL;
	soisdisconnected(so);
#ifdef INET6
	if (INP_CHECK_SOCKAF(so, AF_INET6))
		in6_pcbdetach(inp);
	else
#endif /* INET6 */
	in_pcbdetach(inp);
	tcpstat.tcps_closed++;
	return ((struct tcpcb *)0);
}

void
tcp_drain()
{
	if (do_tcpdrain) {
		struct inpcb *inpb;
		struct tcpcb *tcpb;
		struct tseg_qent *te;

		/*
		 * Walk the tcpbs, if existing, and flush the reassembly
		 * queue, if there is one...
		 * XXX: The "Net/3" implementation doesn't imply that the
		 * TCP reassembly queue should be flushed, but in a
		 * situation where we're really low on mbufs, this is
		 * potentially useful.
		 */
		INP_INFO_RLOCK(&tcbinfo);
		LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) {
			INP_LOCK(inpb);
			if ((tcpb = intotcpcb(inpb))) {
				while ((te = LIST_FIRST(&tcpb->t_segq))
				    != NULL) {
					LIST_REMOVE(te, tqe_q);
					m_freem(te->tqe_m);
					FREE(te, M_TSEGQ);
				}
			}
			INP_UNLOCK(inpb);
		}
		INP_INFO_RUNLOCK(&tcbinfo);
	}
}

/*
 * Notify a tcp user of an asynchronous error;
 * store error as soft error, but wake up user
 * (for now, won't do anything until can select for soft error).
 *
 * Do not wake up user since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
 */
static struct inpcb *
tcp_notify(inp, error)
	struct inpcb *inp;
	int error;
{
	struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;

	/*
	 * Ignore some errors if we are hooked up.
	 * If connection hasn't completed, has retransmitted several times,
	 * and receives a second error, give up now.  This is better
	 * than waiting a long time to establish a connection that
	 * can never complete.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
	    (error == EHOSTUNREACH || error == ENETUNREACH ||
	    error == EHOSTDOWN)) {
		return inp;
	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
	    tp->t_softerror) {
		tcp_drop(tp, error);
		return (struct inpcb *)0;
	} else {
		tp->t_softerror = error;
		return inp;
	}
#if 0
	wakeup((caddr_t) &so->so_timeo);
	sorwakeup(so);
	sowwakeup(so);
#endif
}

static int
tcp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, i, n, s;
	struct inpcb *inp, **inp_list;
	inp_gen_t gencnt;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == 0) {
		n = tcbinfo.ipi_count;
		req->oldidx = 2 * (sizeof xig)
		    + (n + n/8) * sizeof(struct xtcpcb);
		return 0;
	}

	if (req->newptr != 0)
		return EPERM;

	/*
	 * OK, now we're committed to doing something.
	 */
	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	gencnt = tcbinfo.ipi_gencnt;
	n = tcbinfo.ipi_count;
	INP_INFO_RUNLOCK(&tcbinfo);
	splx(s);

	sysctl_wire_old_buffer(req, 2 * (sizeof xig)
	    + n * sizeof(struct xtcpcb));

	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = gencnt;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return error;

	inp_list = malloc(n * sizeof *inp_list, M_TEMP, 0);
	if (inp_list == 0)
		return ENOMEM;

	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
	    inp = LIST_NEXT(inp, inp_list)) {
		INP_LOCK(inp);
		if (inp->inp_gencnt <= gencnt &&
		    cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0)
			inp_list[i++] = inp;
		INP_UNLOCK(inp);
	}
	INP_INFO_RUNLOCK(&tcbinfo);
	splx(s);
	n = i;

	error = 0;
	for (i = 0; i < n; i++) {
		inp = inp_list[i];
		INP_LOCK(inp);
		if (inp->inp_gencnt <= gencnt) {
			struct xtcpcb xt;
			caddr_t inp_ppcb;

			xt.xt_len = sizeof xt;
			/* XXX should avoid extra copy */
			bcopy(inp, &xt.xt_inp, sizeof *inp);
			inp_ppcb = inp->inp_ppcb;
			if (inp_ppcb != NULL)
				bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
			else
				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
			if (inp->inp_socket)
				sotoxsocket(inp->inp_socket, &xt.xt_socket);
			error = SYSCTL_OUT(req, &xt, sizeof xt);
		}
		INP_UNLOCK(inp);
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		s = splnet();
		INP_INFO_RLOCK(&tcbinfo);
		xig.xig_gen = tcbinfo.ipi_gencnt;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = tcbinfo.ipi_count;
		INP_INFO_RUNLOCK(&tcbinfo);
		splx(s);
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	free(inp_list, M_TEMP);
	return error;
}

SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
    tcp_pcblist, "S,xtcpcb", "List of active TCP connections");

static int
tcp_getcred(SYSCTL_HANDLER_ARGS)
{
	struct xucred xuc;
	struct sockaddr_in addrs[2];
	struct inpcb *inp;
	int error, s;

	error = suser_cred(req->td->td_ucred, PRISON_ROOT);
	if (error)
		return (error);
	error = SYSCTL_IN(req, addrs, sizeof(addrs));
	if (error)
		return (error);
	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
	    addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
	if (inp == NULL) {
		error = ENOENT;
		goto outunlocked;
	}
	INP_LOCK(inp);
	if (inp->inp_socket == NULL) {
		error = ENOENT;
		goto out;
	}
	error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
	if (error)
		goto out;
	cru2x(inp->inp_socket->so_cred, &xuc);
out:
	INP_UNLOCK(inp);
outunlocked:
	INP_INFO_RUNLOCK(&tcbinfo);
	splx(s);
	if (error == 0)
		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
	return (error);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
    CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
    tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");

#ifdef INET6
static int
tcp6_getcred(SYSCTL_HANDLER_ARGS)
{
	struct xucred xuc;
	struct sockaddr_in6 addrs[2];
	struct inpcb *inp;
	int error, s, mapped = 0;

	error = suser_cred(req->td->td_ucred, PRISON_ROOT);
	if (error)
		return (error);
	error = SYSCTL_IN(req, addrs, sizeof(addrs));
	if (error)
		return (error);
	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
		if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
			mapped = 1;
		else
			return (EINVAL);
	}
	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	if (mapped == 1)
		inp = in_pcblookup_hash(&tcbinfo,
		    *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
		    addrs[1].sin6_port,
		    *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
		    addrs[0].sin6_port,
		    0, NULL);
	else
		inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr,
		    addrs[1].sin6_port,
		    &addrs[0].sin6_addr, addrs[0].sin6_port,
		    0, NULL);
	if (inp == NULL) {
		error = ENOENT;
		goto outunlocked;
	}
	INP_LOCK(inp);
	if (inp->inp_socket == NULL) {
		error = ENOENT;
		goto out;
	}
	error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
	if (error)
		goto out;
	cru2x(inp->inp_socket->so_cred, &xuc);
out:
	INP_UNLOCK(inp);
outunlocked:
	INP_INFO_RUNLOCK(&tcbinfo);
	splx(s);
	if (error == 0)
		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
	return (error);
}

SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
    CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
    tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
#endif
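/*
 * Userland usage sketch (illustrative, not part of the original file):
 * an identd-style daemon can recover the credentials of a connection's
 * owner by passing the address pair in via the sysctl "new" buffer:
 *
 *	struct sockaddr_in addrs[2];	(addrs[0] = local, addrs[1] =
 *	struct xucred xuc;		 foreign, an ordering inferred
 *	size_t len = sizeof(xuc);	 from the in_pcblookup_hash()
 *					 call above)
 *	sysctlbyname("net.inet.tcp.getcred", &xuc, &len,
 *	    addrs, sizeof(addrs));
 */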

void
tcp_ctlinput(cmd, sa, vip)
	int cmd;
	struct sockaddr *sa;
	void *vip;
{
	struct ip *ip = vip;
	struct tcphdr *th;
	struct in_addr faddr;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
	tcp_seq icmp_seq;
	int s;

	faddr = ((struct sockaddr_in *)sa)->sin_addr;
	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
		return;

	if (cmd == PRC_QUENCH)
		notify = tcp_quench;
	else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
	    cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
		notify = tcp_drop_syn_sent;
	else if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc;
	else if (PRC_IS_REDIRECT(cmd)) {
		ip = 0;
		notify = in_rtchange;
	} else if (cmd == PRC_HOSTDEAD)
		ip = 0;
	else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)
		return;
	if (ip) {
		s = splnet();
		th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
		INP_INFO_WLOCK(&tcbinfo);
		inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
		    ip->ip_src, th->th_sport, 0, NULL);
		if (inp != NULL) {
			INP_LOCK(inp);
			if (inp->inp_socket != NULL) {
				icmp_seq = htonl(th->th_seq);
				tp = intotcpcb(inp);
				if (SEQ_GEQ(icmp_seq, tp->snd_una) &&
				    SEQ_LT(icmp_seq, tp->snd_max))
					inp = (*notify)(inp,
					    inetctlerrmap[cmd]);
			}
			if (inp)
				INP_UNLOCK(inp);
		} else {
			struct in_conninfo inc;

			inc.inc_fport = th->th_dport;
			inc.inc_lport = th->th_sport;
			inc.inc_faddr = faddr;
			inc.inc_laddr = ip->ip_src;
#ifdef INET6
			inc.inc_isipv6 = 0;
#endif
			syncache_unreach(&inc, th);
		}
		INP_INFO_WUNLOCK(&tcbinfo);
		splx(s);
	} else
		in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
}

#ifdef INET6
void
tcp6_ctlinput(cmd, sa, d)
	int cmd;
	struct sockaddr *sa;
	void *d;
{
	struct tcphdr th;
	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
	struct ip6_hdr *ip6;
	struct mbuf *m;
	struct ip6ctlparam *ip6cp = NULL;
	const struct sockaddr_in6 *sa6_src = NULL;
	int off;
	struct tcp_portonly {
		u_int16_t th_sport;
		u_int16_t th_dport;
	} *thp;

	if (sa->sa_family != AF_INET6 ||
	    sa->sa_len != sizeof(struct sockaddr_in6))
		return;

	if (cmd == PRC_QUENCH)
		notify = tcp_quench;
	else if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc;
	else if (!PRC_IS_REDIRECT(cmd) &&
	    ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
		return;

	/* if the parameter is from icmp6, decode it. */
	if (d != NULL) {
		ip6cp = (struct ip6ctlparam *)d;
		m = ip6cp->ip6c_m;
		ip6 = ip6cp->ip6c_ip6;
		off = ip6cp->ip6c_off;
		sa6_src = ip6cp->ip6c_src;
	} else {
		m = NULL;
		ip6 = NULL;
		off = 0;	/* fool gcc */
		sa6_src = &sa6_any;
	}

	if (ip6) {
		struct in_conninfo inc;
		/*
		 * XXX: We assume that when IPV6 is non NULL,
		 * M and OFF are valid.
		 */

		/* check if we can safely examine src and dst ports */
		if (m->m_pkthdr.len < off + sizeof(*thp))
			return;

		bzero(&th, sizeof(th));
		m_copydata(m, off, sizeof(*thp), (caddr_t)&th);

		in6_pcbnotify(&tcb, sa, th.th_dport,
		    (struct sockaddr *)ip6cp->ip6c_src,
		    th.th_sport, cmd, notify);

		inc.inc_fport = th.th_dport;
		inc.inc_lport = th.th_sport;
		inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
		inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
		inc.inc_isipv6 = 1;
		syncache_unreach(&inc, &th);
	} else
		in6_pcbnotify(&tcb, sa, 0, (const struct sockaddr *)sa6_src,
		    0, cmd, notify);
}
#endif /* INET6 */


/*
 * Following is where TCP initial sequence number generation occurs.
 *
 * There are two places where we must use initial sequence numbers:
 * 1.  In SYN-ACK packets.
 * 2.  In SYN packets.
 *
 * All ISNs for SYN-ACK packets are generated by the syncache.  See
 * tcp_syncache.c for details.
 *
 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
 * depends on this property.  In addition, these ISNs should be
 * unguessable so as to prevent connection hijacking.  To satisfy
 * the requirements of this situation, the algorithm outlined in
 * RFC 1948 is used to generate sequence numbers.
 *
 * Implementation details:
 *
 * Time is based off the system timer, and is corrected so that it
 * increases by one megabyte per second.  This allows for proper
 * recycling on high speed LANs while still leaving over an hour
 * before rollover.
 *
 * net.inet.tcp.isn_reseed_interval controls the number of seconds
 * between seeding of isn_secret.  This is normally set to zero,
 * as reseeding should not be necessary.
 */

#define	ISN_BYTES_PER_SECOND 1048576

u_char	isn_secret[32];
int	isn_last_reseed;
MD5_CTX	isn_ctx;

tcp_seq
tcp_new_isn(tp)
	struct tcpcb *tp;
{
	u_int32_t md5_buffer[4];
	tcp_seq new_isn;

	/* Seed if this is the first use, reseed if requested. */
	if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) &&
	    (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
	    < (u_int)ticks))) {
		read_random(&isn_secret, sizeof(isn_secret));
		isn_last_reseed = ticks;
	}

	/* Compute the md5 hash and return the ISN. */
	MD5Init(&isn_ctx);
	MD5Update(&isn_ctx, (u_char *)&tp->t_inpcb->inp_fport,
	    sizeof(u_short));
	MD5Update(&isn_ctx, (u_char *)&tp->t_inpcb->inp_lport,
	    sizeof(u_short));
#ifdef INET6
	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
		MD5Update(&isn_ctx, (u_char *)&tp->t_inpcb->in6p_faddr,
		    sizeof(struct in6_addr));
		MD5Update(&isn_ctx, (u_char *)&tp->t_inpcb->in6p_laddr,
		    sizeof(struct in6_addr));
	} else
#endif
	{
		MD5Update(&isn_ctx, (u_char *)&tp->t_inpcb->inp_faddr,
		    sizeof(struct in_addr));
		MD5Update(&isn_ctx, (u_char *)&tp->t_inpcb->inp_laddr,
		    sizeof(struct in_addr));
	}
	MD5Update(&isn_ctx, (u_char *)&isn_secret, sizeof(isn_secret));
	MD5Final((u_char *)&md5_buffer, &isn_ctx);
	new_isn = (tcp_seq)md5_buffer[0];
	new_isn += ticks * (ISN_BYTES_PER_SECOND / hz);
	return new_isn;
}
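/*
 * In RFC 1948 terms (summary added for clarity, not in the original
 * source), the function above computes
 *
 *	ISN = M(t) + F(laddr, lport, faddr, fport, secret)
 *
 * where F is MD5 and M(t) advances by ISN_BYTES_PER_SECOND / hz bytes
 * per tick (one megabyte per second), keeping ISNs for a given
 * connection 4-tuple monotonic while remaining unpredictable across
 * tuples.
 */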
/*
 * When a source quench is received, close congestion window
 * to one segment.  We will gradually open it again as we proceed.
 */
struct inpcb *
tcp_quench(inp, errno)
	struct inpcb *inp;
	int errno;
{
	struct tcpcb *tp = intotcpcb(inp);

	if (tp)
		tp->snd_cwnd = tp->t_maxseg;
	return (inp);
}

/*
 * When a specific ICMP unreachable message is received and the
 * connection state is SYN-SENT, drop the connection.  This behavior
 * is controlled by the icmp_may_rst sysctl.
 */
struct inpcb *
tcp_drop_syn_sent(inp, errno)
	struct inpcb *inp;
	int errno;
{
	struct tcpcb *tp = intotcpcb(inp);

	if (tp && tp->t_state == TCPS_SYN_SENT) {
		tcp_drop(tp, errno);
		return (struct inpcb *)0;
	}
	return inp;
}

/*
 * When a `need fragmentation' ICMP is received, update our idea of the
 * MSS based on the new value in the route.  Also nudge TCP to send
 * something, since we know the packet we just sent was dropped.
 * This duplicates some code in the tcp_mss() function in tcp_input.c.
 */
struct inpcb *
tcp_mtudisc(inp, errno)
	struct inpcb *inp;
	int errno;
{
	struct tcpcb *tp = intotcpcb(inp);
	struct rtentry *rt;
	struct rmxp_tao *taop;
	struct socket *so = inp->inp_socket;
	int offered;
	int mss;
#ifdef INET6
	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */

	if (tp) {
#ifdef INET6
		if (isipv6)
			rt = tcp_rtlookup6(&inp->inp_inc);
		else
#endif /* INET6 */
		rt = tcp_rtlookup(&inp->inp_inc);
		if (!rt || !rt->rt_rmx.rmx_mtu) {
			tp->t_maxopd = tp->t_maxseg =
#ifdef INET6
			    isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
			    tcp_mssdflt;
			return inp;
		}
		taop = rmx_taop(rt->rt_rmx);
		offered = taop->tao_mssopt;
		mss = rt->rt_rmx.rmx_mtu -
#ifdef INET6
		    (isipv6 ?
		    sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
#endif /* INET6 */
		    sizeof(struct tcpiphdr)
#ifdef INET6
		    )
#endif /* INET6 */
		    ;

		if (offered)
			mss = min(mss, offered);
		/*
		 * XXX - The above conditional probably violates the TCP
		 * spec.  The problem is that, since we don't know the
		 * other end's MSS, we are supposed to use a conservative
		 * default.  But, if we do that, then MTU discovery will
		 * never actually take place, because the conservative
		 * default is much less than the MTUs typically seen
		 * on the Internet today.  For the moment, we'll sweep
		 * this under the carpet.
		 *
		 * The conservative default might not actually be a problem
		 * if the only case this occurs is when sending an initial
		 * SYN with options and data to a host we've never talked
		 * to before.  Then, they will reply with an MSS value which
		 * will get recorded and the new parameters should get
		 * recomputed.  For Further Study.
		 */
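		/*
		 * Worked example of the adjustments below (illustrative,
		 * not in the original source): an ICMP report of a
		 * 1500-byte path MTU gives mss = 1500 - 40 = 1460 for
		 * IPv4.  If both sides negotiated timestamps, another
		 * TCPOLEN_TSTAMP_APPA (12) bytes come off, leaving 1448,
		 * which is below MCLBYTES (2048 on most platforms) and
		 * so escapes the cluster-rounding step.
		 */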
		if (tp->t_maxopd <= mss)
			return inp;
		tp->t_maxopd = mss;

		if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
		    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
			mss -= TCPOLEN_TSTAMP_APPA;
		if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
		    (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)
			mss -= TCPOLEN_CC_APPA;
#if (MCLBYTES & (MCLBYTES - 1)) == 0
		if (mss > MCLBYTES)
			mss &= ~(MCLBYTES - 1);
#else
		if (mss > MCLBYTES)
			mss = mss / MCLBYTES * MCLBYTES;
#endif
		if (so->so_snd.sb_hiwat < mss)
			mss = so->so_snd.sb_hiwat;

		tp->t_maxseg = mss;

		tcpstat.tcps_mturesent++;
		tp->t_rtttime = 0;
		tp->snd_nxt = tp->snd_una;
		tcp_output(tp);
	}
	return inp;
}

/*
 * Look up the routing entry to the peer of this inpcb.  If no route
 * is found and it cannot be allocated, then return NULL.  This routine
 * is called by TCP routines that access the rmx structure and by tcp_mss
 * to get the interface MTU.
 */
struct rtentry *
tcp_rtlookup(inc)
	struct in_conninfo *inc;
{
	struct route *ro;
	struct rtentry *rt;

	ro = &inc->inc_route;
	rt = ro->ro_rt;
	if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
		/* No route yet, so try to acquire one */
		if (inc->inc_faddr.s_addr != INADDR_ANY) {
			ro->ro_dst.sa_family = AF_INET;
			ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
			((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
			    inc->inc_faddr;
			rtalloc(ro);
			rt = ro->ro_rt;
		}
	}
	return rt;
}

#ifdef INET6
struct rtentry *
tcp_rtlookup6(inc)
	struct in_conninfo *inc;
{
	struct route_in6 *ro6;
	struct rtentry *rt;

	ro6 = &inc->inc6_route;
	rt = ro6->ro_rt;
	if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
		/* No route yet, so try to acquire one */
		if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
			ro6->ro_dst.sin6_family = AF_INET6;
			ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6);
			ro6->ro_dst.sin6_addr = inc->inc6_faddr;
			rtalloc((struct route *)ro6);
			rt = ro6->ro_rt;
		}
	}
	return rt;
}
#endif /* INET6 */

#ifdef IPSEC
/* compute ESP/AH header size for TCP, including outer IP header. */
size_t
ipsec_hdrsiz_tcp(tp)
	struct tcpcb *tp;
{
	struct inpcb *inp;
	struct mbuf *m;
	size_t hdrsiz;
	struct ip *ip;
#ifdef INET6
	struct ip6_hdr *ip6;
#endif /* INET6 */
	struct tcphdr *th;

	if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
		return 0;
	MGETHDR(m, M_NOWAIT, MT_DATA);
	if (!m)
		return 0;

#ifdef INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)(ip6 + 1);
		m->m_pkthdr.len = m->m_len =
		    sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
		tcp_fillheaders(tp, ip6, th);
		hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
	} else
#endif /* INET6 */
	{
		ip = mtod(m, struct ip *);
		th = (struct tcphdr *)(ip + 1);
		m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
		tcp_fillheaders(tp, ip, th);
		hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
	}

	m_free(m);
	return hdrsiz;
}
#endif /* IPSEC */

/*
 * Return a pointer to the cached information about the remote host.
 * The cached information is stored in the protocol specific part of
 * the route metrics.
 */
struct rmxp_tao *
tcp_gettaocache(inc)
	struct in_conninfo *inc;
{
	struct rtentry *rt;

#ifdef INET6
	if (inc->inc_isipv6)
		rt = tcp_rtlookup6(inc);
	else
#endif /* INET6 */
	rt = tcp_rtlookup(inc);

	/* Make sure this is a host route and is up. */
	if (rt == NULL ||
	    (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
		return NULL;

	return rmx_taop(rt->rt_rmx);
}

/*
 * Clear all the TAO cache entries, called from tcp_init.
 *
 * XXX
 * This routine is just an empty one, because we assume that the
 * routing tables are initialized at the same time as TCP, so there is
 * nothing in the cache left over.
 */
static void
tcp_cleartaocache()
{
}

/*
 * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
 *
 * This code attempts to calculate the bandwidth-delay product as a
 * means of determining the optimal window size to maximize bandwidth,
 * minimize RTT, and avoid the over-allocation of buffers on interfaces
 * and routers.  This code also does a fairly good job keeping RTTs in
 * check across slow links like modems.  We implement an algorithm which
 * is very similar to (but not meant to be) TCP/Vegas.  The code operates
 * on the transmitter side of a TCP connection and so only affects the
 * transmit side of the connection.
 *
 * BACKGROUND: TCP makes no provision for the management of buffer space
 * at the end points or at the intermediate routers and switches.  A TCP
 * stream, whether using NewReno or not, will eventually buffer as
 * many packets as it is able and the only reason this typically works is
 * due to the fairly small default buffers made available for a connection
 * (typically 16K or 32K).  As machines use larger windows and/or window
 * scaling it is now fairly easy for even a single TCP connection to blow
 * out all available buffer space not only on the local interface, but on
 * intermediate routers and switches as well.  NewReno makes a misguided
 * attempt to 'solve' this problem by waiting for an actual failure to
 * occur, then backing off, then steadily increasing the window again
 * until another failure occurs, ad infinitum.  This results in terrible
 * oscillation that is only made worse as network loads increase and the
 * idea of intentionally blowing out network buffers is, frankly, a
 * terrible way to manage network resources.
 *
 * It is far better to limit the transmit window prior to the failure
 * condition being achieved.  There are two general ways to do this:
 * First you can 'scan' through different transmit window sizes and
 * locate the point where the RTT stops increasing, indicating that you
 * have filled the pipe, then scan backwards until you note that RTT
 * stops decreasing, then repeat ad infinitum.  This method works in
 * principle but has severe implementation issues due to RTT variances,
 * timer granularity, and instability in the algorithm which can lead to
 * many false positives and create oscillations as well as interact
 * badly with other TCP streams implementing the same algorithm.
 *
 * The second method is to limit the window to the bandwidth delay
 * product of the link.  This is the method we implement.
 * RTT variances and our own manipulation of the congestion window,
 * bwnd, can potentially destabilize the algorithm.  For this reason we
 * have to stabilize the elements used to calculate the window.  We do
 * this by using the minimum observed RTT, the long term average of the
 * observed bandwidth, and by adding two segments worth of slop.  It
 * isn't perfect but it is able to react to changing conditions and
 * gives us a very stable basis on which to extend the algorithm.
 */
void
tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
{
	u_long bw;
	u_long bwnd;
	int save_ticks;

	/*
	 * If inflight_enable is disabled in the middle of a tcp connection,
	 * make sure snd_bwnd is effectively disabled.
	 */
	if (tcp_inflight_enable == 0) {
		tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
		tp->snd_bandwidth = 0;
		return;
	}

	/*
	 * Figure out the bandwidth.  Due to the tick granularity this
	 * is a very rough number and it MUST be averaged over a fairly
	 * long period of time.  XXX we need to take into account a link
	 * that is not using all available bandwidth, but for now our
	 * slop will ramp us up if this case occurs and the bandwidth later
	 * increases.
	 *
	 * Note: if ticks rollover 'bw' may wind up negative.  We must
	 * effectively reset t_bw_rtttime for this case.
	 */
	save_ticks = ticks;
	if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
		return;

	bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
	    (save_ticks - tp->t_bw_rtttime);
	tp->t_bw_rtttime = save_ticks;
	tp->t_bw_rtseq = ack_seq;
	if (tp->t_bw_rtttime == 0 || (int)bw < 0)
		return;
	bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;

	tp->snd_bandwidth = bw;

	/*
	 * Calculate the semi-static bandwidth delay product, plus two
	 * maximal segments.  The additional slop puts us squarely in the
	 * sweet spot and also handles the bandwidth run-up case and
	 * stabilization.  Without the slop we could be locking ourselves
	 * into a lower bandwidth.
	 *
	 * Situations Handled:
	 *	(1) Prevents over-queueing of packets on LANs, especially
	 *	    on high speed LANs, allowing larger TCP buffers to be
	 *	    specified, and also does a good job preventing
	 *	    over-queueing of packets over choke points like modems
	 *	    (at least for the transmit side).
	 *
	 *	(2) Is able to handle changing network loads (bandwidth
	 *	    drops so bwnd drops, bandwidth increases so bwnd
	 *	    increases).
	 *
	 *	(3) Theoretically should stabilize in the face of multiple
	 *	    connections implementing the same algorithm (this may
	 *	    need a little work).
	 *
	 *	(4) Stability value (defaults to 20 = 2 maximal packets)
	 *	    can be adjusted with a sysctl, but typically only needs
	 *	    to be adjusted on very slow connections.  A value no
	 *	    smaller than 5 should be used, but only reduce this
	 *	    default if you have no other choice.
	 */
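	/*
	 * Worked example (illustrative numbers, not in the original
	 * source): with bw = 125000 bytes/sec (a 1 Mbit/sec link),
	 * hz = 100 and a 50 ms smoothed RTT, USERTT is about
	 * 5 ticks << TCP_RTT_SHIFT = 160, so the first term below is
	 * 125000 * 160 / (100 << 5) = 6250 bytes, the true
	 * bandwidth-delay product.  With t_maxseg = 1460 the default
	 * stab of 20 adds 2 * 1460 = 2920 bytes of slop, giving
	 * bwnd of about 9170 before the min/max clamps at the end.
	 */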
#define	USERTT	((tp->t_srtt + tp->t_rttbest) / 2)
	bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) +
	    tcp_inflight_stab * tp->t_maxseg / 10;
#undef USERTT

	if (tcp_inflight_debug > 0) {
		static int ltime;
		if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
			ltime = ticks;
			printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
			    tp, bw, tp->t_rttbest, tp->t_srtt, bwnd);
		}
	}
	if ((long)bwnd < tcp_inflight_min)
		bwnd = tcp_inflight_min;
	if (bwnd > tcp_inflight_max)
		bwnd = tcp_inflight_max;
	if ((long)bwnd < tp->t_maxseg * 2)
		bwnd = tp->t_maxseg * 2;
	tp->snd_bwnd = bwnd;
}