xref: /freebsd/sys/netinet/tcp_subr.c (revision f9218d3d4fd34f082473b3a021c6d4d109fb47cf)
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
34  * $FreeBSD$
35  */
36 
37 #include "opt_compat.h"
38 #include "opt_inet6.h"
39 #include "opt_ipsec.h"
40 #include "opt_mac.h"
41 #include "opt_tcpdebug.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/callout.h>
46 #include <sys/kernel.h>
47 #include <sys/sysctl.h>
48 #include <sys/mac.h>
49 #include <sys/malloc.h>
50 #include <sys/mbuf.h>
51 #ifdef INET6
52 #include <sys/domain.h>
53 #endif
54 #include <sys/proc.h>
55 #include <sys/socket.h>
56 #include <sys/socketvar.h>
57 #include <sys/protosw.h>
58 #include <sys/random.h>
59 
60 #include <vm/uma.h>
61 
62 #include <net/route.h>
63 #include <net/if.h>
64 
65 #include <netinet/in.h>
66 #include <netinet/in_systm.h>
67 #include <netinet/ip.h>
68 #ifdef INET6
69 #include <netinet/ip6.h>
70 #endif
71 #include <netinet/in_pcb.h>
72 #ifdef INET6
73 #include <netinet6/in6_pcb.h>
74 #endif
75 #include <netinet/in_var.h>
76 #include <netinet/ip_var.h>
77 #ifdef INET6
78 #include <netinet6/ip6_var.h>
79 #endif
80 #include <netinet/tcp.h>
81 #include <netinet/tcp_fsm.h>
82 #include <netinet/tcp_seq.h>
83 #include <netinet/tcp_timer.h>
84 #include <netinet/tcp_var.h>
85 #ifdef INET6
86 #include <netinet6/tcp6_var.h>
87 #endif
88 #include <netinet/tcpip.h>
89 #ifdef TCPDEBUG
90 #include <netinet/tcp_debug.h>
91 #endif
92 #include <netinet6/ip6protosw.h>
93 
94 #ifdef IPSEC
95 #include <netinet6/ipsec.h>
96 #ifdef INET6
97 #include <netinet6/ipsec6.h>
98 #endif
99 #endif /*IPSEC*/
100 
101 #ifdef FAST_IPSEC
102 #include <netipsec/ipsec.h>
103 #ifdef INET6
104 #include <netipsec/ipsec6.h>
105 #endif
106 #define	IPSEC
107 #endif /*FAST_IPSEC*/
108 
109 #include <machine/in_cksum.h>
110 #include <sys/md5.h>
111 
/* Default MSS assumed for IPv4 peers when no better path information exists. */
int 	tcp_mssdflt = TCP_MSS;
SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
    &tcp_mssdflt , 0, "Default TCP Maximum Segment Size");

#ifdef INET6
/* Default MSS for IPv6 peers; the larger v6 header leaves less room. */
int	tcp_v6mssdflt = TCP6_MSS;
SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
	CTLFLAG_RW, &tcp_v6mssdflt , 0,
	"Default TCP Maximum Segment Size for IPv6");
#endif

#if 0
static int 	tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
    &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time");
#endif

/* RFC 1323 (window scaling / timestamps) extensions, enabled by default. */
int	tcp_do_rfc1323 = 1;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
    &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions");

/* RFC 1644 (T/TCP) extensions, disabled by default. */
int	tcp_do_rfc1644 = 0;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW,
    &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions");

/* Actual TCB hash size chosen in tcp_init(); exported read-only. */
static int	tcp_tcbhashsize = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD,
     &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");

static int	do_tcpdrain = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
     "Enable tcp_drain routine for extra help when low on mbufs");

SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
    &tcbinfo.ipi_count, 0, "Number of active PCBs");

static int	icmp_may_rst = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0,
    "Certain ICMP unreachable messages may abort connections in SYN_SENT");

/* 0 disables periodic reseeding of the ISN secret. */
static int	tcp_isn_reseed_interval = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
    &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");

/*
 * TCP bandwidth limiting sysctls.  Note that the default lower bound of
 * 1024 exists only for debugging.  A good production default would be
 * something like 6100.
 */
static int	tcp_inflight_enable = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW,
    &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");

static int	tcp_inflight_debug = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW,
    &tcp_inflight_debug, 0, "Debug TCP inflight calculations");

static int	tcp_inflight_min = 6144;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW,
    &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");

static int	tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW,
    &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");
static int	tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
    &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");

/* Forward declarations for file-local helpers defined below. */
static void	tcp_cleartaocache(void);
static struct inpcb *tcp_notify(struct inpcb *, int);
static void	tcp_discardcb(struct tcpcb *);
183 
/*
 * Target size of TCP PCB hash tables. Must be a power of two.
 *
 * Note that this can be overridden by the kernel environment
 * variable net.inet.tcp.tcbhashsize
 */
#ifndef TCBHASHSIZE
#define TCBHASHSIZE	512
#endif

/*
 * XXX
 * Callouts should be moved into struct tcp directly.  They are currently
 * separate because the tcpcb structure is exported to userland for sysctl
 * parsing purposes, which do not know about callouts.
 */
/* A tcpcb plus its five per-connection timers, allocated as one unit. */
struct	tcpcb_mem {
	struct	tcpcb tcb;
	struct	callout tcpcb_mem_rexmt, tcpcb_mem_persist, tcpcb_mem_keep;
	struct	callout tcpcb_mem_2msl, tcpcb_mem_delack;
};
/* A TIME_WAIT control block plus its single 2MSL timer. */
struct	tcptw_mem {
	struct	tcptw tw;
	struct	callout tcptw_mem_2msl;
};

/* UMA zones backing the two structures above; created in tcp_init(). */
static uma_zone_t tcpcb_zone;
static uma_zone_t tcptw_zone;
212 
/*
 * Tcp initialization
 */
void
tcp_init()
{
	int hashsize = TCBHASHSIZE;

	tcp_ccgen = 1;
	tcp_cleartaocache();

	/* Convert the compile-time timer constants into run-time values. */
	tcp_delacktime = TCPTV_DELACK;
	tcp_keepinit = TCPTV_KEEP_INIT;
	tcp_keepidle = TCPTV_KEEP_IDLE;
	tcp_keepintvl = TCPTV_KEEPINTVL;
	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
	tcp_msl = TCPTV_MSL;
	tcp_rexmit_min = TCPTV_MIN;
	tcp_rexmit_slop = TCPTV_CPU_VAR;

	INP_INFO_LOCK_INIT(&tcbinfo, "tcp");
	LIST_INIT(&tcb);
	tcbinfo.listhead = &tcb;
	/*
	 * The hash size may be overridden by the loader tunable; it must be
	 * a power of two because it is used as a mask by the hash functions.
	 */
	TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
	if (!powerof2(hashsize)) {
		printf("WARNING: TCB hash size not a power of 2\n");
		hashsize = 512; /* safe default */
	}
	tcp_tcbhashsize = hashsize;
	tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
	tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
					&tcbinfo.porthashmask);
	tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(tcbinfo.ipi_zone, maxsockets);
#ifdef INET6
#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
#else /* INET6 */
#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
#endif /* INET6 */
	/* Ensure an mbuf header can always hold link + IP + TCP headers. */
	if (max_protohdr < TCP_MINPROTOHDR)
		max_protohdr = TCP_MINPROTOHDR;
	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
		panic("tcp_init");
#undef TCP_MINPROTOHDR
	/*
	 * These have to be type stable for the benefit of the timers.
	 */
	tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(tcpcb_zone, maxsockets);
	tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw_mem),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(tcptw_zone, maxsockets);

	syncache_init();
}
270 
271 /*
272  * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
273  * tcp_template used to store this data in mbufs, but we now recopy it out
274  * of the tcpcb each time to conserve mbufs.
275  */
276 void
277 tcpip_fillheaders(inp, ip_ptr, tcp_ptr)
278 	struct inpcb *inp;
279 	void *ip_ptr;
280 	void *tcp_ptr;
281 {
282 	struct tcphdr *th = (struct tcphdr *)tcp_ptr;
283 
284 #ifdef INET6
285 	if ((inp->inp_vflag & INP_IPV6) != 0) {
286 		struct ip6_hdr *ip6;
287 
288 		ip6 = (struct ip6_hdr *)ip_ptr;
289 		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
290 			(inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
291 		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
292 			(IPV6_VERSION & IPV6_VERSION_MASK);
293 		ip6->ip6_nxt = IPPROTO_TCP;
294 		ip6->ip6_plen = sizeof(struct tcphdr);
295 		ip6->ip6_src = inp->in6p_laddr;
296 		ip6->ip6_dst = inp->in6p_faddr;
297 	} else
298 #endif
299 	{
300 		struct ip *ip;
301 
302 		ip = (struct ip *)ip_ptr;
303 		ip->ip_v = IPVERSION;
304 		ip->ip_hl = 5;
305 		ip->ip_tos = inp->inp_ip_tos;
306 		ip->ip_len = 0;
307 		ip->ip_id = 0;
308 		ip->ip_off = 0;
309 		ip->ip_ttl = inp->inp_ip_ttl;
310 		ip->ip_sum = 0;
311 		ip->ip_p = IPPROTO_TCP;
312 		ip->ip_src = inp->inp_laddr;
313 		ip->ip_dst = inp->inp_faddr;
314 	}
315 	th->th_sport = inp->inp_lport;
316 	th->th_dport = inp->inp_fport;
317 	th->th_seq = 0;
318 	th->th_ack = 0;
319 	th->th_x2 = 0;
320 	th->th_off = 5;
321 	th->th_flags = 0;
322 	th->th_win = 0;
323 	th->th_urp = 0;
324 	th->th_sum = 0;		/* in_pseudo() is called later for ipv4 */
325 }
326 
327 /*
328  * Create template to be used to send tcp packets on a connection.
329  * Allocates an mbuf and fills in a skeletal tcp/ip header.  The only
330  * use for this function is in keepalives, which use tcp_respond.
331  */
332 struct tcptemp *
333 tcpip_maketemplate(inp)
334 	struct inpcb *inp;
335 {
336 	struct mbuf *m;
337 	struct tcptemp *n;
338 
339 	m = m_get(M_DONTWAIT, MT_HEADER);
340 	if (m == NULL)
341 		return (0);
342 	m->m_len = sizeof(struct tcptemp);
343 	n = mtod(m, struct tcptemp *);
344 
345 	tcpip_fillheaders(inp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
346 	return (n);
347 }
348 
/*
 * Send a single message to the TCP at address specified by
 * the given TCP/IP header.  If m == 0, then we make a copy
 * of the tcpiphdr at ti and send directly to the addressed host.
 * This is used to force keep alive messages out using the TCP
 * template for a connection.  If flags are given then we send
 * a message back to the TCP which originated the * segment ti,
 * and discard the mbuf containing it and any other attached mbufs.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 *
 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
 */
void
tcp_respond(tp, ipgen, th, m, ack, seq, flags)
	struct tcpcb *tp;
	void *ipgen;
	register struct tcphdr *th;
	register struct mbuf *m;
	tcp_seq ack, seq;
	int flags;
{
	register int tlen;
	int win = 0;
	struct route *ro = 0;
	struct route sro;
	struct ip *ip;
	struct tcphdr *nth;
#ifdef INET6
	struct route_in6 *ro6 = 0;
	struct route_in6 sro6;
	struct ip6_hdr *ip6;
	int isipv6;
#endif /* INET6 */
	int ipflags = 0;

	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));

#ifdef INET6
	/* The address family is deduced from the version nibble of ipgen. */
	isipv6 = ((struct ip *)ipgen)->ip_v == 6;
	ip6 = ipgen;
#endif /* INET6 */
	ip = ipgen;

	if (tp) {
		if (!(flags & TH_RST)) {
			/*
			 * Advertise the current receive-buffer space, capped
			 * at the maximum window for our scale factor.
			 */
			win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
			if (win > (long)TCP_MAXWIN << tp->rcv_scale)
				win = (long)TCP_MAXWIN << tp->rcv_scale;
		}
		/* With a pcb, reuse its cached route. */
#ifdef INET6
		if (isipv6)
			ro6 = &tp->t_inpcb->in6p_route;
		else
#endif /* INET6 */
		ro = &tp->t_inpcb->inp_route;
	} else {
		/* No pcb: use a zeroed route on the stack for this one send. */
#ifdef INET6
		if (isipv6) {
			ro6 = &sro6;
			bzero(ro6, sizeof *ro6);
		} else
#endif /* INET6 */
	      {
		ro = &sro;
		bzero(ro, sizeof *ro);
	      }
	}
	if (m == 0) {
		/* Keepalive case: build a fresh packet from the template. */
		m = m_gethdr(M_DONTWAIT, MT_HEADER);
		if (m == NULL)
			return;
		tlen = 0;
		m->m_data += max_linkhdr;
#ifdef INET6
		if (isipv6) {
			bcopy((caddr_t)ip6, mtod(m, caddr_t),
			      sizeof(struct ip6_hdr));
			ip6 = mtod(m, struct ip6_hdr *);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
	      {
		bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
		ip = mtod(m, struct ip *);
		nth = (struct tcphdr *)(ip + 1);
	      }
		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
		flags = TH_ACK;
	} else {
		/*
		 * Responding to a received segment: reuse its mbuf, drop any
		 * chained data, and swap source/destination in place.
		 */
		m_freem(m->m_next);
		m->m_next = 0;
		m->m_data = (caddr_t)ipgen;
		/* m_len is set later */
		tlen = 0;
#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
#ifdef INET6
		if (isipv6) {
			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
	      {
		xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
		nth = (struct tcphdr *)(ip + 1);
	      }
		if (th != nth) {
			/*
			 * this is usually a case when an extension header
			 * exists between the IPv6 header and the
			 * TCP header.
			 */
			nth->th_sport = th->th_sport;
			nth->th_dport = th->th_dport;
		}
		xchg(nth->th_dport, nth->th_sport, n_short);
#undef xchg
	}
#ifdef INET6
	if (isipv6) {
		ip6->ip6_flow = 0;
		ip6->ip6_vfc = IPV6_VERSION;
		ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
						tlen));
		tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
	} else
#endif
      {
	tlen += sizeof (struct tcpiphdr);
	ip->ip_len = tlen;
	ip->ip_ttl = ip_defttl;
      }
	m->m_len = tlen;
	m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = (struct ifnet *) 0;
#ifdef MAC
	if (tp != NULL && tp->t_inpcb != NULL) {
		/*
		 * Packet is associated with a socket, so allow the
		 * label of the response to reflect the socket label.
		 */
		mac_create_mbuf_from_socket(tp->t_inpcb->inp_socket, m);
	} else {
		/*
		 * XXXMAC: This will need to call a mac function that
		 * modifies the mbuf label in place for TCP datagrams
		 * not associated with a PCB.
		 */
	}
#endif
	/* Fill in the caller-specified sequence/ack and window. */
	nth->th_seq = htonl(seq);
	nth->th_ack = htonl(ack);
	nth->th_x2 = 0;
	nth->th_off = sizeof (struct tcphdr) >> 2;
	nth->th_flags = flags;
	if (tp)
		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
	else
		nth->th_win = htons((u_short)win);
	nth->th_urp = 0;
#ifdef INET6
	if (isipv6) {
		/* v6 checksum is computed in full here; v4 is deferred. */
		nth->th_sum = 0;
		nth->th_sum = in6_cksum(m, IPPROTO_TCP,
					sizeof(struct ip6_hdr),
					tlen - sizeof(struct ip6_hdr));
		ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
					       ro6 && ro6->ro_rt ?
					       ro6->ro_rt->rt_ifp :
					       NULL);
	} else
#endif /* INET6 */
      {
	/* Pseudo-header sum now; hardware/ip_output finishes the TCP sum. */
        nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
	    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
        m->m_pkthdr.csum_flags = CSUM_TCP;
        m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
      }
#ifdef TCPDEBUG
	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
#ifdef INET6
	if (isipv6) {
		(void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL,
			tp ? tp->t_inpcb : NULL);
		/* Release any route the stack-local sro6 picked up. */
		if (ro6 == &sro6 && ro6->ro_rt) {
			RTFREE(ro6->ro_rt);
			ro6->ro_rt = NULL;
		}
	} else
#endif /* INET6 */
      {
	(void) ip_output(m, NULL, ro, ipflags, NULL, tp ? tp->t_inpcb : NULL);
	/* Release any route the stack-local sro picked up. */
	if (ro == &sro && ro->ro_rt) {
		RTFREE(ro->ro_rt);
		ro->ro_rt = NULL;
	}
      }
}
551 
/*
 * Create a new TCP control block, making an
 * empty reassembly queue and hooking it to the argument
 * protocol control block.  The `inp' parameter must have
 * come from the zone allocator set up in tcp_init().
 */
struct tcpcb *
tcp_newtcpcb(inp)
	struct inpcb *inp;
{
	struct tcpcb_mem *tm;
	struct tcpcb *tp;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */

	/* Allocate tcpcb and its timers as one zeroed unit. */
	tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO);
	if (tm == NULL)
		return (NULL);
	tp = &tm->tcb;
	/*	LIST_INIT(&tp->t_segq); */	/* XXX covered by M_ZERO */
	tp->t_maxseg = tp->t_maxopd =
#ifdef INET6
		isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
		tcp_mssdflt;

	/* Set up our timeouts. */
	callout_init(tp->tt_rexmt = &tm->tcpcb_mem_rexmt, 0);
	callout_init(tp->tt_persist = &tm->tcpcb_mem_persist, 0);
	callout_init(tp->tt_keep = &tm->tcpcb_mem_keep, 0);
	callout_init(tp->tt_2msl = &tm->tcpcb_mem_2msl, 0);
	callout_init(tp->tt_delack = &tm->tcpcb_mem_delack, 0);

	if (tcp_do_rfc1323)
		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
	if (tcp_do_rfc1644)
		tp->t_flags |= TF_REQ_CC;
	tp->t_inpcb = inp;	/* XXX */
	/*
	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
	 * reasonable initial retransmit time.
	 */
	tp->t_srtt = TCPTV_SRTTBASE;
	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
	tp->t_rttmin = tcp_rexmit_min;
	tp->t_rxtcur = TCPTV_RTOBASE;
	/* Start window variables wide open; slow start narrows them. */
	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->t_rcvtime = ticks;
	tp->t_bw_rtttime = ticks;
	/*
	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
	 * because the socket may be bound to an IPv6 wildcard address,
	 * which may match an IPv4-mapped IPv6 address.
	 */
	inp->inp_ip_ttl = ip_defttl;
	inp->inp_ppcb = (caddr_t)tp;
	return (tp);		/* XXX */
}
614 
615 /*
616  * Drop a TCP connection, reporting
617  * the specified error.  If connection is synchronized,
618  * then send a RST to peer.
619  */
620 struct tcpcb *
621 tcp_drop(tp, errno)
622 	register struct tcpcb *tp;
623 	int errno;
624 {
625 	struct socket *so = tp->t_inpcb->inp_socket;
626 
627 	if (TCPS_HAVERCVDSYN(tp->t_state)) {
628 		tp->t_state = TCPS_CLOSED;
629 		(void) tcp_output(tp);
630 		tcpstat.tcps_drops++;
631 	} else
632 		tcpstat.tcps_conndrops++;
633 	if (errno == ETIMEDOUT && tp->t_softerror)
634 		errno = tp->t_softerror;
635 	so->so_error = errno;
636 	return (tcp_close(tp));
637 }
638 
/*
 * Tear down the TCP state hanging off an inpcb: stop all timers, cache
 * RTT/ssthresh metrics into the routing entry when we have enough samples,
 * flush the reassembly queue, and free the tcpcb.  The inpcb itself is
 * detached by the caller (see tcp_close()).
 */
static void
tcp_discardcb(tp)
	struct tcpcb *tp;
{
	struct tseg_qent *q;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
	struct rtentry *rt;
	int dosavessthresh;

	/*
	 * Make sure that all of our timers are stopped before we
	 * delete the PCB.
	 */
	callout_stop(tp->tt_rexmt);
	callout_stop(tp->tt_persist);
	callout_stop(tp->tt_keep);
	callout_stop(tp->tt_2msl);
	callout_stop(tp->tt_delack);

	/*
	 * If we got enough samples through the srtt filter,
	 * save the rtt and rttvar in the routing entry.
	 * 'Enough' is arbitrarily defined as the 16 samples.
	 * 16 samples is enough for the srtt filter to converge
	 * to within 5% of the correct value; fewer samples and
	 * we could save a very bogus rtt.
	 *
	 * Don't update the default route's characteristics and don't
	 * update anything that the user "locked".
	 */
	if (tp->t_rttupdated >= 16) {
		register u_long i = 0;
#ifdef INET6
		if (isipv6) {
			struct sockaddr_in6 *sin6;

			if ((rt = inp->in6p_route.ro_rt) == NULL)
				goto no_valid_rt;
			sin6 = (struct sockaddr_in6 *)rt_key(rt);
			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
				goto no_valid_rt;
		}
		else
#endif /* INET6 */
		if ((rt = inp->inp_route.ro_rt) == NULL ||
		    ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
		    == INADDR_ANY)
			goto no_valid_rt;

		if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
			i = tp->t_srtt *
			    (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
			if (rt->rt_rmx.rmx_rtt && i)
				/*
				 * filter this update to half the old & half
				 * the new values, converting scale.
				 * See route.h and tcp_var.h for a
				 * description of the scaling constants.
				 */
				rt->rt_rmx.rmx_rtt =
				    (rt->rt_rmx.rmx_rtt + i) / 2;
			else
				rt->rt_rmx.rmx_rtt = i;
			tcpstat.tcps_cachedrtt++;
		}
		if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
			i = tp->t_rttvar *
			    (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
			if (rt->rt_rmx.rmx_rttvar && i)
				rt->rt_rmx.rmx_rttvar =
				    (rt->rt_rmx.rmx_rttvar + i) / 2;
			else
				rt->rt_rmx.rmx_rttvar = i;
			tcpstat.tcps_cachedrttvar++;
		}
		/*
		 * The old comment here said:
		 * update the pipelimit (ssthresh) if it has been updated
		 * already or if a pipesize was specified & the threshold
		 * got below half the pipesize.  I.e., wait for bad news
		 * before we start updating, then update on both good
		 * and bad news.
		 *
		 * But we want to save the ssthresh even if no pipesize is
		 * specified explicitly in the route, because such
		 * connections still have an implicit pipesize specified
		 * by the global tcp_sendspace.  In the absence of a reliable
		 * way to calculate the pipesize, it will have to do.
		 */
		i = tp->snd_ssthresh;
		if (rt->rt_rmx.rmx_sendpipe != 0)
			dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
		else
			dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
		if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
		     i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
		    || dosavessthresh) {
			/*
			 * convert the limit from user data bytes to
			 * packets then to packet data bytes.
			 */
			i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
			if (i < 2)
				i = 2;
			i *= (u_long)(tp->t_maxseg +
#ifdef INET6
				      (isipv6 ? sizeof (struct ip6_hdr) +
					       sizeof (struct tcphdr) :
#endif
				       sizeof (struct tcpiphdr)
#ifdef INET6
				       )
#endif
				      );
			if (rt->rt_rmx.rmx_ssthresh)
				rt->rt_rmx.rmx_ssthresh =
				    (rt->rt_rmx.rmx_ssthresh + i) / 2;
			else
				rt->rt_rmx.rmx_ssthresh = i;
			tcpstat.tcps_cachedssthresh++;
		}
	}
    no_valid_rt:
	/* free the reassembly queue, if any */
	while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		FREE(q, M_TSEGQ);
	}
	inp->inp_ppcb = NULL;
	tp->t_inpcb = NULL;
	uma_zfree(tcpcb_zone, tp);
	soisdisconnected(so);
}
777 
/*
 * Close a TCP control block:
 *    discard all space held by the tcp
 *    discard internet protocol block
 *    wake up any sleepers
 *
 * Always returns NULL so callers can write "tp = tcp_close(tp);".
 */
struct tcpcb *
tcp_close(tp)
	struct tcpcb *tp;
{
	struct inpcb *inp = tp->t_inpcb;
#ifdef INET6
	/* Needed only to pick the address-family-specific detach below. */
	struct socket *so = inp->inp_socket;
#endif

	tcp_discardcb(tp);
#ifdef INET6
	if (INP_CHECK_SOCKAF(so, AF_INET6))
		in6_pcbdetach(inp);
	else
#endif
		in_pcbdetach(inp);
	tcpstat.tcps_closed++;
	return ((struct tcpcb *)0);
}
803 
804 void
805 tcp_drain()
806 {
807 	if (do_tcpdrain)
808 	{
809 		struct inpcb *inpb;
810 		struct tcpcb *tcpb;
811 		struct tseg_qent *te;
812 
813 	/*
814 	 * Walk the tcpbs, if existing, and flush the reassembly queue,
815 	 * if there is one...
816 	 * XXX: The "Net/3" implementation doesn't imply that the TCP
817 	 *      reassembly queue should be flushed, but in a situation
818 	 * 	where we're really low on mbufs, this is potentially
819 	 *  	usefull.
820 	 */
821 		INP_INFO_RLOCK(&tcbinfo);
822 		LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) {
823 			if (inpb->inp_vflag & INP_TIMEWAIT)
824 				continue;
825 			INP_LOCK(inpb);
826 			if ((tcpb = intotcpcb(inpb))) {
827 				while ((te = LIST_FIRST(&tcpb->t_segq))
828 			            != NULL) {
829 					LIST_REMOVE(te, tqe_q);
830 					m_freem(te->tqe_m);
831 					FREE(te, M_TSEGQ);
832 				}
833 			}
834 			INP_UNLOCK(inpb);
835 		}
836 		INP_INFO_RUNLOCK(&tcbinfo);
837 	}
838 }
839 
840 /*
841  * Notify a tcp user of an asynchronous error;
842  * store error as soft error, but wake up user
843  * (for now, won't do anything until can select for soft error).
844  *
845  * Do not wake up user since there currently is no mechanism for
846  * reporting soft errors (yet - a kqueue filter may be added).
847  */
848 static struct inpcb *
849 tcp_notify(inp, error)
850 	struct inpcb *inp;
851 	int error;
852 {
853 	struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
854 
855 	/*
856 	 * Ignore some errors if we are hooked up.
857 	 * If connection hasn't completed, has retransmitted several times,
858 	 * and receives a second error, give up now.  This is better
859 	 * than waiting a long time to establish a connection that
860 	 * can never complete.
861 	 */
862 	if (tp->t_state == TCPS_ESTABLISHED &&
863 	    (error == EHOSTUNREACH || error == ENETUNREACH ||
864 	     error == EHOSTDOWN)) {
865 		return inp;
866 	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
867 	    tp->t_softerror) {
868 		tcp_drop(tp, error);
869 		return (struct inpcb *)0;
870 	} else {
871 		tp->t_softerror = error;
872 		return inp;
873 	}
874 #if 0
875 	wakeup( &so->so_timeo);
876 	sorwakeup(so);
877 	sowwakeup(so);
878 #endif
879 }
880 
/*
 * Sysctl handler exporting the list of active TCP pcbs as a stream of
 * struct xtcpcb, bracketed by two struct xinpgen generation markers so
 * userland can detect concurrent modification and retry.
 */
static int
tcp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, i, n, s;
	struct inpcb *inp, **inp_list;
	inp_gen_t gencnt;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == 0) {
		/* Size probe only: report a generous estimate (n + n/8). */
		n = tcbinfo.ipi_count;
		req->oldidx = 2 * (sizeof xig)
			+ (n + n/8) * sizeof(struct xtcpcb);
		return 0;
	}

	/* This sysctl is read-only. */
	if (req->newptr != 0)
		return EPERM;

	/*
	 * OK, now we're committed to doing something.
	 */
	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	/* Snapshot the generation count and pcb count under the lock. */
	gencnt = tcbinfo.ipi_gencnt;
	n = tcbinfo.ipi_count;
	INP_INFO_RUNLOCK(&tcbinfo);
	splx(s);

	sysctl_wire_old_buffer(req, 2 * (sizeof xig)
		+ n * sizeof(struct xtcpcb));

	/* Leading generation marker. */
	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = gencnt;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return error;

	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
	if (inp_list == 0)
		return ENOMEM;

	/*
	 * First pass: collect pointers to the pcbs the requester is
	 * permitted to see, skipping anything newer than our snapshot.
	 */
	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
	     inp = LIST_NEXT(inp, inp_list)) {
		INP_LOCK(inp);
		if (inp->inp_gencnt <= gencnt &&
		    (((inp->inp_vflag & INP_TIMEWAIT) &&
		    cr_cansee(req->td->td_ucred, intotw(inp)->tw_cred) == 0) ||
		    cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0))
			inp_list[i++] = inp;
		INP_UNLOCK(inp);
	}
	INP_INFO_RUNLOCK(&tcbinfo);
	splx(s);
	n = i;

	/* Second pass: copy each collected pcb out as an xtcpcb. */
	error = 0;
	for (i = 0; i < n; i++) {
		inp = inp_list[i];
		if (inp->inp_gencnt <= gencnt) {
			struct xtcpcb xt;
			caddr_t inp_ppcb;
			xt.xt_len = sizeof xt;
			/* XXX should avoid extra copy */
			bcopy(inp, &xt.xt_inp, sizeof *inp);
			inp_ppcb = inp->inp_ppcb;
			if (inp_ppcb == NULL)
				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
			else if (inp->inp_vflag & INP_TIMEWAIT) {
				/* TIME_WAIT pcbs have a tcptw, not a tcpcb. */
				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
				xt.xt_tp.t_state = TCPS_TIME_WAIT;
			} else
				bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
			if (inp->inp_socket)
				sotoxsocket(inp->inp_socket, &xt.xt_socket);
			else {
				bzero(&xt.xt_socket, sizeof xt.xt_socket);
				xt.xt_socket.xso_protocol = IPPROTO_TCP;
			}
			xt.xt_inp.inp_gencnt = inp->inp_gencnt;
			error = SYSCTL_OUT(req, &xt, sizeof xt);
		}
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		s = splnet();
		INP_INFO_RLOCK(&tcbinfo);
		xig.xig_gen = tcbinfo.ipi_gencnt;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = tcbinfo.ipi_count;
		INP_INFO_RUNLOCK(&tcbinfo);
		splx(s);
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	free(inp_list, M_TEMP);
	return error;
}

SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
	    tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
994 
/*
 * sysctl handler for net.inet.tcp.getcred: given a pair of sockaddr_in's
 * (addrs[0] = local, addrs[1] = foreign) copied in from userland, look up
 * the matching TCP connection and copy out the credentials of its socket.
 * Caller must be (prison) root and pass a cr_canseesocket() check.
 */
static int
tcp_getcred(SYSCTL_HANDLER_ARGS)
{
	struct xucred xuc;
	struct sockaddr_in addrs[2];	/* [0] local, [1] foreign */
	struct inpcb *inp;
	int error, s;

	error = suser_cred(req->td->td_ucred, PRISON_ROOT);
	if (error)
		return (error);
	error = SYSCTL_IN(req, addrs, sizeof(addrs));
	if (error)
		return (error);
	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	/* Hash lookup takes foreign addr/port first, then local. */
	inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
	    addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
	if (inp == NULL) {
		error = ENOENT;
		goto outunlocked;
	}
	INP_LOCK(inp);
	if (inp->inp_socket == NULL) {
		/* pcb exists but is no longer attached to a socket. */
		error = ENOENT;
		goto out;
	}
	error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
	if (error)
		goto out;
	cru2x(inp->inp_socket->so_cred, &xuc);
out:
	INP_UNLOCK(inp);
outunlocked:
	INP_INFO_RUNLOCK(&tcbinfo);
	splx(s);
	/* Copy the result out only after all locks have been dropped. */
	if (error == 0)
		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
	return (error);
}
1035 
/* net.inet.tcp.getcred: credential lookup for an IPv4 TCP connection. */
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
    CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
    tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
1039 
1040 #ifdef INET6
/*
 * sysctl handler for net.inet6.tcp6.getcred: IPv6 counterpart of
 * tcp_getcred().  Takes a pair of sockaddr_in6's (addrs[0] = local,
 * addrs[1] = foreign); if both addresses are V4-mapped the lookup is
 * redirected to the IPv4 pcb hash.
 */
static int
tcp6_getcred(SYSCTL_HANDLER_ARGS)
{
	struct xucred xuc;
	struct sockaddr_in6 addrs[2];	/* [0] local, [1] foreign */
	struct inpcb *inp;
	int error, s, mapped = 0;

	error = suser_cred(req->td->td_ucred, PRISON_ROOT);
	if (error)
		return (error);
	error = SYSCTL_IN(req, addrs, sizeof(addrs));
	if (error)
		return (error);
	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
		/* Either both addresses are V4-mapped, or neither. */
		if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
			mapped = 1;
		else
			return (EINVAL);
	}
	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	if (mapped == 1)
		/* The embedded IPv4 address occupies s6_addr bytes 12-15. */
		inp = in_pcblookup_hash(&tcbinfo,
			*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
			addrs[1].sin6_port,
			*(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
			addrs[0].sin6_port,
			0, NULL);
	else
		inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr,
				 addrs[1].sin6_port,
				 &addrs[0].sin6_addr, addrs[0].sin6_port,
				 0, NULL);
	if (inp == NULL) {
		error = ENOENT;
		goto outunlocked;
	}
	INP_LOCK(inp);
	if (inp->inp_socket == NULL) {
		/* pcb exists but is no longer attached to a socket. */
		error = ENOENT;
		goto out;
	}
	error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
	if (error)
		goto out;
	cru2x(inp->inp_socket->so_cred, &xuc);
out:
	INP_UNLOCK(inp);
outunlocked:
	INP_INFO_RUNLOCK(&tcbinfo);
	splx(s);
	/* Copy the result out only after all locks have been dropped. */
	if (error == 0)
		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
	return (error);
}
1097 
/* net.inet6.tcp6.getcred: credential lookup for an IPv6 TCP connection. */
SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
    CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
    tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
1101 #endif
1102 
1103 
1104 void
1105 tcp_ctlinput(cmd, sa, vip)
1106 	int cmd;
1107 	struct sockaddr *sa;
1108 	void *vip;
1109 {
1110 	struct ip *ip = vip;
1111 	struct tcphdr *th;
1112 	struct in_addr faddr;
1113 	struct inpcb *inp;
1114 	struct tcpcb *tp;
1115 	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
1116 	tcp_seq icmp_seq;
1117 	int s;
1118 
1119 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
1120 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
1121 		return;
1122 
1123 	if (cmd == PRC_QUENCH)
1124 		notify = tcp_quench;
1125 	else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
1126 		cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
1127 		notify = tcp_drop_syn_sent;
1128 	else if (cmd == PRC_MSGSIZE)
1129 		notify = tcp_mtudisc;
1130 	else if (PRC_IS_REDIRECT(cmd)) {
1131 		ip = 0;
1132 		notify = in_rtchange;
1133 	} else if (cmd == PRC_HOSTDEAD)
1134 		ip = 0;
1135 	else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)
1136 		return;
1137 	if (ip) {
1138 		s = splnet();
1139 		th = (struct tcphdr *)((caddr_t)ip
1140 				       + (ip->ip_hl << 2));
1141 		INP_INFO_WLOCK(&tcbinfo);
1142 		inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
1143 		    ip->ip_src, th->th_sport, 0, NULL);
1144 		if (inp != NULL)  {
1145 			INP_LOCK(inp);
1146 			if (inp->inp_socket != NULL) {
1147 				icmp_seq = htonl(th->th_seq);
1148 				tp = intotcpcb(inp);
1149 				if (SEQ_GEQ(icmp_seq, tp->snd_una) &&
1150 			    		SEQ_LT(icmp_seq, tp->snd_max))
1151 					inp = (*notify)(inp, inetctlerrmap[cmd]);
1152 			}
1153 			if (inp)
1154 				INP_UNLOCK(inp);
1155 		} else {
1156 			struct in_conninfo inc;
1157 
1158 			inc.inc_fport = th->th_dport;
1159 			inc.inc_lport = th->th_sport;
1160 			inc.inc_faddr = faddr;
1161 			inc.inc_laddr = ip->ip_src;
1162 #ifdef INET6
1163 			inc.inc_isipv6 = 0;
1164 #endif
1165 			syncache_unreach(&inc, th);
1166 		}
1167 		INP_INFO_WUNLOCK(&tcbinfo);
1168 		splx(s);
1169 	} else
1170 		in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
1171 }
1172 
1173 #ifdef INET6
1174 void
1175 tcp6_ctlinput(cmd, sa, d)
1176 	int cmd;
1177 	struct sockaddr *sa;
1178 	void *d;
1179 {
1180 	struct tcphdr th;
1181 	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
1182 	struct ip6_hdr *ip6;
1183 	struct mbuf *m;
1184 	struct ip6ctlparam *ip6cp = NULL;
1185 	const struct sockaddr_in6 *sa6_src = NULL;
1186 	int off;
1187 	struct tcp_portonly {
1188 		u_int16_t th_sport;
1189 		u_int16_t th_dport;
1190 	} *thp;
1191 
1192 	if (sa->sa_family != AF_INET6 ||
1193 	    sa->sa_len != sizeof(struct sockaddr_in6))
1194 		return;
1195 
1196 	if (cmd == PRC_QUENCH)
1197 		notify = tcp_quench;
1198 	else if (cmd == PRC_MSGSIZE)
1199 		notify = tcp_mtudisc;
1200 	else if (!PRC_IS_REDIRECT(cmd) &&
1201 		 ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
1202 		return;
1203 
1204 	/* if the parameter is from icmp6, decode it. */
1205 	if (d != NULL) {
1206 		ip6cp = (struct ip6ctlparam *)d;
1207 		m = ip6cp->ip6c_m;
1208 		ip6 = ip6cp->ip6c_ip6;
1209 		off = ip6cp->ip6c_off;
1210 		sa6_src = ip6cp->ip6c_src;
1211 	} else {
1212 		m = NULL;
1213 		ip6 = NULL;
1214 		off = 0;	/* fool gcc */
1215 		sa6_src = &sa6_any;
1216 	}
1217 
1218 	if (ip6) {
1219 		struct in_conninfo inc;
1220 		/*
1221 		 * XXX: We assume that when IPV6 is non NULL,
1222 		 * M and OFF are valid.
1223 		 */
1224 
1225 		/* check if we can safely examine src and dst ports */
1226 		if (m->m_pkthdr.len < off + sizeof(*thp))
1227 			return;
1228 
1229 		bzero(&th, sizeof(th));
1230 		m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
1231 
1232 		in6_pcbnotify(&tcb, sa, th.th_dport,
1233 		    (struct sockaddr *)ip6cp->ip6c_src,
1234 		    th.th_sport, cmd, notify);
1235 
1236 		inc.inc_fport = th.th_dport;
1237 		inc.inc_lport = th.th_sport;
1238 		inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
1239 		inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
1240 		inc.inc_isipv6 = 1;
1241 		syncache_unreach(&inc, &th);
1242 	} else
1243 		in6_pcbnotify(&tcb, sa, 0, (const struct sockaddr *)sa6_src,
1244 			      0, cmd, notify);
1245 }
1246 #endif /* INET6 */
1247 
1248 
1249 /*
1250  * Following is where TCP initial sequence number generation occurs.
1251  *
1252  * There are two places where we must use initial sequence numbers:
1253  * 1.  In SYN-ACK packets.
1254  * 2.  In SYN packets.
1255  *
1256  * All ISNs for SYN-ACK packets are generated by the syncache.  See
1257  * tcp_syncache.c for details.
1258  *
1259  * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
1260  * depends on this property.  In addition, these ISNs should be
1261  * unguessable so as to prevent connection hijacking.  To satisfy
1262  * the requirements of this situation, the algorithm outlined in
1263  * RFC 1948 is used to generate sequence numbers.
1264  *
1265  * Implementation details:
1266  *
1267  * Time is based off the system timer, and is corrected so that it
1268  * increases by one megabyte per second.  This allows for proper
1269  * recycling on high speed LANs while still leaving over an hour
1270  * before rollover.
1271  *
1272  * net.inet.tcp.isn_reseed_interval controls the number of seconds
1273  * between seeding of isn_secret.  This is normally set to zero,
1274  * as reseeding should not be necessary.
1275  *
1276  */
1277 
#define ISN_BYTES_PER_SECOND 1048576

/*
 * RFC 1948 ISN generation state (see tcp_new_isn()).
 * NOTE(review): these are not static and not obviously locked; confirm
 * whether other files reference them before narrowing linkage.
 */
u_char isn_secret[32];	/* secret mixed into every ISN hash */
int isn_last_reseed;	/* 'ticks' value at the last (re)seeding */
MD5_CTX isn_ctx;	/* scratch MD5 context reused by tcp_new_isn() */
1283 
/*
 * Generate a new initial sequence number for a SYN, per the RFC 1948
 * scheme described in the comment above: MD5 over (foreign port, local
 * port, foreign address, local address, secret), plus a monotonically
 * increasing time-based offset of ISN_BYTES_PER_SECOND per second.
 */
tcp_seq
tcp_new_isn(tp)
	struct tcpcb *tp;
{
	u_int32_t md5_buffer[4];
	tcp_seq new_isn;

	/* Seed if this is the first use, reseed if requested. */
	if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) &&
	     (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
		< (u_int)ticks))) {
		read_random(&isn_secret, sizeof(isn_secret));
		isn_last_reseed = ticks;
	}

	/* Compute the md5 hash and return the ISN. */
	/*
	 * NOTE(review): isn_ctx is a shared global MD5 context; this
	 * presumably relies on callers being serialized -- confirm
	 * before invoking from concurrent contexts.
	 */
	MD5Init(&isn_ctx);
	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
#ifdef INET6
	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
			  sizeof(struct in6_addr));
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
			  sizeof(struct in6_addr));
	} else
#endif
	{
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
			  sizeof(struct in_addr));
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
			  sizeof(struct in_addr));
	}
	MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
	MD5Final((u_char *) &md5_buffer, &isn_ctx);
	new_isn = (tcp_seq) md5_buffer[0];
	/* Advance by ISN_BYTES_PER_SECOND per second of uptime. */
	new_isn += ticks * (ISN_BYTES_PER_SECOND / hz);
	return new_isn;
}
1323 
1324 /*
1325  * When a source quench is received, close congestion window
1326  * to one segment.  We will gradually open it again as we proceed.
1327  */
1328 struct inpcb *
1329 tcp_quench(inp, errno)
1330 	struct inpcb *inp;
1331 	int errno;
1332 {
1333 	struct tcpcb *tp = intotcpcb(inp);
1334 
1335 	if (tp)
1336 		tp->snd_cwnd = tp->t_maxseg;
1337 	return (inp);
1338 }
1339 
1340 /*
1341  * When a specific ICMP unreachable message is received and the
1342  * connection state is SYN-SENT, drop the connection.  This behavior
1343  * is controlled by the icmp_may_rst sysctl.
1344  */
1345 struct inpcb *
1346 tcp_drop_syn_sent(inp, errno)
1347 	struct inpcb *inp;
1348 	int errno;
1349 {
1350 	struct tcpcb *tp = intotcpcb(inp);
1351 
1352 	if (tp && tp->t_state == TCPS_SYN_SENT) {
1353 		tcp_drop(tp, errno);
1354 		return (struct inpcb *)0;
1355 	}
1356 	return inp;
1357 }
1358 
1359 /*
1360  * When `need fragmentation' ICMP is received, update our idea of the MSS
1361  * based on the new value in the route.  Also nudge TCP to send something,
1362  * since we know the packet we just sent was dropped.
1363  * This duplicates some code in the tcp_mss() function in tcp_input.c.
1364  */
1365 struct inpcb *
1366 tcp_mtudisc(inp, errno)
1367 	struct inpcb *inp;
1368 	int errno;
1369 {
1370 	struct tcpcb *tp = intotcpcb(inp);
1371 	struct rtentry *rt;
1372 	struct rmxp_tao *taop;
1373 	struct socket *so = inp->inp_socket;
1374 	int offered;
1375 	int mss;
1376 #ifdef INET6
1377 	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
1378 #endif /* INET6 */
1379 
1380 	if (tp) {
1381 #ifdef INET6
1382 		if (isipv6)
1383 			rt = tcp_rtlookup6(&inp->inp_inc);
1384 		else
1385 #endif /* INET6 */
1386 		rt = tcp_rtlookup(&inp->inp_inc);
1387 		if (!rt || !rt->rt_rmx.rmx_mtu) {
1388 			tp->t_maxopd = tp->t_maxseg =
1389 #ifdef INET6
1390 				isipv6 ? tcp_v6mssdflt :
1391 #endif /* INET6 */
1392 				tcp_mssdflt;
1393 			return inp;
1394 		}
1395 		taop = rmx_taop(rt->rt_rmx);
1396 		offered = taop->tao_mssopt;
1397 		mss = rt->rt_rmx.rmx_mtu -
1398 #ifdef INET6
1399 			(isipv6 ?
1400 			 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
1401 #endif /* INET6 */
1402 			 sizeof(struct tcpiphdr)
1403 #ifdef INET6
1404 			 )
1405 #endif /* INET6 */
1406 			;
1407 
1408 		if (offered)
1409 			mss = min(mss, offered);
1410 		/*
1411 		 * XXX - The above conditional probably violates the TCP
1412 		 * spec.  The problem is that, since we don't know the
1413 		 * other end's MSS, we are supposed to use a conservative
1414 		 * default.  But, if we do that, then MTU discovery will
1415 		 * never actually take place, because the conservative
1416 		 * default is much less than the MTUs typically seen
1417 		 * on the Internet today.  For the moment, we'll sweep
1418 		 * this under the carpet.
1419 		 *
1420 		 * The conservative default might not actually be a problem
1421 		 * if the only case this occurs is when sending an initial
1422 		 * SYN with options and data to a host we've never talked
1423 		 * to before.  Then, they will reply with an MSS value which
1424 		 * will get recorded and the new parameters should get
1425 		 * recomputed.  For Further Study.
1426 		 */
1427 		if (tp->t_maxopd <= mss)
1428 			return inp;
1429 		tp->t_maxopd = mss;
1430 
1431 		if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
1432 		    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
1433 			mss -= TCPOLEN_TSTAMP_APPA;
1434 		if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
1435 		    (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)
1436 			mss -= TCPOLEN_CC_APPA;
1437 #if	(MCLBYTES & (MCLBYTES - 1)) == 0
1438 		if (mss > MCLBYTES)
1439 			mss &= ~(MCLBYTES-1);
1440 #else
1441 		if (mss > MCLBYTES)
1442 			mss = mss / MCLBYTES * MCLBYTES;
1443 #endif
1444 		if (so->so_snd.sb_hiwat < mss)
1445 			mss = so->so_snd.sb_hiwat;
1446 
1447 		tp->t_maxseg = mss;
1448 
1449 		tcpstat.tcps_mturesent++;
1450 		tp->t_rtttime = 0;
1451 		tp->snd_nxt = tp->snd_una;
1452 		tcp_output(tp);
1453 	}
1454 	return inp;
1455 }
1456 
1457 /*
1458  * Look-up the routing entry to the peer of this inpcb.  If no route
1459  * is found and it cannot be allocated, then return NULL.  This routine
1460  * is called by TCP routines that access the rmx structure and by tcp_mss
1461  * to get the interface MTU.
1462  */
1463 struct rtentry *
1464 tcp_rtlookup(inc)
1465 	struct in_conninfo *inc;
1466 {
1467 	struct route *ro;
1468 	struct rtentry *rt;
1469 
1470 	ro = &inc->inc_route;
1471 	rt = ro->ro_rt;
1472 	if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
1473 		/* No route yet, so try to acquire one */
1474 		if (inc->inc_faddr.s_addr != INADDR_ANY) {
1475 			ro->ro_dst.sa_family = AF_INET;
1476 			ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
1477 			((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
1478 			    inc->inc_faddr;
1479 			rtalloc(ro);
1480 			rt = ro->ro_rt;
1481 		}
1482 	}
1483 	return rt;
1484 }
1485 
1486 #ifdef INET6
1487 struct rtentry *
1488 tcp_rtlookup6(inc)
1489 	struct in_conninfo *inc;
1490 {
1491 	struct route_in6 *ro6;
1492 	struct rtentry *rt;
1493 
1494 	ro6 = &inc->inc6_route;
1495 	rt = ro6->ro_rt;
1496 	if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
1497 		/* No route yet, so try to acquire one */
1498 		if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
1499 			ro6->ro_dst.sin6_family = AF_INET6;
1500 			ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6);
1501 			ro6->ro_dst.sin6_addr = inc->inc6_faddr;
1502 			rtalloc((struct route *)ro6);
1503 			rt = ro6->ro_rt;
1504 		}
1505 	}
1506 	return rt;
1507 }
1508 #endif /* INET6 */
1509 
1510 #ifdef IPSEC
/* compute ESP/AH header size for TCP, including outer IP header. */
/*
 * Builds a throwaway mbuf containing a template IP(6)+TCP header for
 * this connection and asks the IPsec layer how much header space the
 * applicable policy would add.  Returns 0 on any failure.
 */
size_t
ipsec_hdrsiz_tcp(tp)
	struct tcpcb *tp;
{
	struct inpcb *inp;
	struct mbuf *m;
	size_t hdrsiz;
	struct ip *ip;
#ifdef INET6
	struct ip6_hdr *ip6;
#endif
	struct tcphdr *th;

	if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
		return 0;
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (!m)
		return 0;

#ifdef INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)(ip6 + 1);
		m->m_pkthdr.len = m->m_len =
			sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
		tcpip_fillheaders(inp, ip6, th);
		hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
	} else
#endif /* INET6 */
      {
	ip = mtod(m, struct ip *);
	th = (struct tcphdr *)(ip + 1);
	m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
	tcpip_fillheaders(inp, ip, th);
	hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
      }

	m_free(m);
	return hdrsiz;
}
1552 #endif /*IPSEC*/
1553 
1554 /*
1555  * Return a pointer to the cached information about the remote host.
1556  * The cached information is stored in the protocol specific part of
1557  * the route metrics.
1558  */
1559 struct rmxp_tao *
1560 tcp_gettaocache(inc)
1561 	struct in_conninfo *inc;
1562 {
1563 	struct rtentry *rt;
1564 
1565 #ifdef INET6
1566 	if (inc->inc_isipv6)
1567 		rt = tcp_rtlookup6(inc);
1568 	else
1569 #endif /* INET6 */
1570 	rt = tcp_rtlookup(inc);
1571 
1572 	/* Make sure this is a host route and is up. */
1573 	if (rt == NULL ||
1574 	    (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
1575 		return NULL;
1576 
1577 	return rmx_taop(rt->rt_rmx);
1578 }
1579 
1580 /*
1581  * Clear all the TAO cache entries, called from tcp_init.
1582  *
1583  * XXX
 * This routine is just an empty one, because we assume that the
 * routing tables are initialized at the same time as TCP, so there is
 * nothing in the cache left over.
1587  */
static void
tcp_cleartaocache()
{
	/* Intentionally empty; see the rationale in the comment above. */
}
1592 
/*
 * Move a TCP connection into TIME_WAIT state.
 *    tcbinfo is unlocked.
 *    inp is locked, and is unlocked before returning.
 *
 * The full tcpcb is discarded and replaced (via inp_ppcb) by a compact
 * tcptw holding just the state needed to answer segments in TIME_WAIT.
 */
void
tcp_twstart(tp)
	struct tcpcb *tp;
{
	struct tcptw_mem *tm;
	struct tcptw *tw;
	struct inpcb *inp;
	int tw_time, acknow;
	struct socket *so;

	tm = uma_zalloc(tcptw_zone, M_NOWAIT);
	if (tm == NULL)
		/* EEEK! -- preserve old structure or just kill everything? */
		/* must obtain tcbinfo lock in order to drop the structure. */
		panic("uma_zalloc(tcptw)");
	tw = &tm->tw;
	inp = tp->t_inpcb;
	tw->tw_inpcb = inp;

	/*
	 * Recover last window size sent.
	 */
	tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale;

	/*
	 * Set t_recent if timestamps are used on the connection.
	 */
        if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
            (TF_REQ_TSTMP|TF_RCVD_TSTMP))
		tw->t_recent = tp->ts_recent;
	else
		tw->t_recent = 0;

	/* Carry over the state needed to respond while in TIME_WAIT. */
	tw->snd_nxt = tp->snd_nxt;
	tw->rcv_nxt = tp->rcv_nxt;
	tw->cc_recv = tp->cc_recv;
	tw->cc_send = tp->cc_send;
	tw->t_starttime = tp->t_starttime;
	callout_init(tw->tt_2msl = &tm->tcptw_mem_2msl, 0);

/* XXX
 * If this code will
 * be used for fin-wait-2 state also, then we may need
 * a ts_recent from the last segment.
 */
	/* Shorten TIME_WAIT [RFC-1644, p.28] */
	if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) {
		tw_time = tp->t_rxtcur * TCPTV_TWTRUNC;
		/* For T/TCP client, force ACK now. */
		acknow = 1;
	} else {
		tw_time = 2 * tcp_msl;
		acknow = tp->t_flags & TF_ACKNOW;
	}
	/*
	 * Discard the tcpcb and detach the socket; from here on the
	 * connection is represented solely by the tcptw hung off
	 * inp_ppcb.  Hold a reference on the socket's credentials
	 * before the socket is freed.
	 */
	tcp_discardcb(tp);
	so = inp->inp_socket;
	so->so_pcb = NULL;
	tw->tw_cred = crhold(so->so_cred);
	tw->tw_so_options = so->so_options;
	sotryfree(so);
	inp->inp_socket = NULL;
	inp->inp_ppcb = (caddr_t)tw;
	inp->inp_vflag |= INP_TIMEWAIT;
	callout_reset(tw->tt_2msl, tw_time, tcp_timer_2msl_tw, tw);
	if (acknow)
		tcp_twrespond(tw, TH_ACK);
	INP_UNLOCK(inp);
}
1666 
/*
 * Tear down a TIME_WAIT connection: stop its 2MSL timer, free the
 * tcptw, and detach the underlying inpcb (v4 or v6 as appropriate).
 */
void
tcp_twclose(tw)
	struct tcptw *tw;
{
	struct inpcb *inp;

	inp = tw->tw_inpcb;
	/* Break the inp <-> tw linkage before freeing the tcptw. */
	tw->tw_inpcb = NULL;
	callout_stop(tw->tt_2msl);
	inp->inp_ppcb = NULL;
	uma_zfree(tcptw_zone, tw);
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO)
		in6_pcbdetach(inp);
	else
#endif
		in_pcbdetach(inp);
	tcpstat.tcps_closed++;
}
1686 
/*
 * Send a bare segment (no data) on behalf of a connection in TIME_WAIT,
 * which no longer has a full tcpcb.  Timestamp and CC options are
 * appended from the state saved in the tcptw when flags == TH_ACK.
 * Returns 0 or an errno from ip_output()/ip6_output().
 */
int
tcp_twrespond(struct tcptw *tw, int flags)
{
	struct inpcb *inp = tw->tw_inpcb;
	struct tcphdr *th;
	struct mbuf *m;
	struct ip *ip = NULL;
	u_int8_t *optp;
	u_int hdrlen, optlen;
	int error;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
	int isipv6 = inp->inp_inc.inc_isipv6;
#endif

	m = m_gethdr(M_DONTWAIT, MT_HEADER);
	if (m == NULL)
		return (ENOBUFS);
	/* Leave room for the link-layer header. */
	m->m_data += max_linkhdr;

#ifdef INET6
	if (isipv6) {
		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)(ip6 + 1);
		tcpip_fillheaders(inp, ip6, th);
	} else
#endif
	{
		hdrlen = sizeof(struct tcpiphdr);
		ip = mtod(m, struct ip *);
		th = (struct tcphdr *)(ip + 1);
		tcpip_fillheaders(inp, ip, th);
	}
	/* TCP options, if any, go right after the fixed header. */
	optp = (u_int8_t *)(th + 1);

 	/*
	 * Send a timestamp and echo-reply if both our side and our peer
	 * have sent timestamps in our SYN's and this is not a RST.
 	 */
	if (tw->t_recent && flags == TH_ACK) {
		u_int32_t *lp = (u_int32_t *)optp;

 		/* Form timestamp option as shown in appendix A of RFC 1323. */
 		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
 		*lp++ = htonl(ticks);
 		*lp   = htonl(tw->t_recent);
 		optp += TCPOLEN_TSTAMP_APPA;
 	}

 	/*
	 * Send `CC-family' options if needed, and it's not a RST.
 	 */
	if (tw->cc_recv != 0 && flags == TH_ACK) {
		u_int32_t *lp = (u_int32_t *)optp;

		*lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CC));
		*lp   = htonl(tw->cc_send);
		optp += TCPOLEN_CC_APPA;
 	}
	optlen = optp - (u_int8_t *)(th + 1);

	m->m_len = hdrlen + optlen;
	m->m_pkthdr.len = m->m_len;

	KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small"));

	/* Fill in the TCP header from the saved TIME_WAIT state. */
	th->th_seq = htonl(tw->snd_nxt);
	th->th_ack = htonl(tw->rcv_nxt);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = flags;
	th->th_win = htons(tw->last_win);

#ifdef INET6
	if (isipv6) {
		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
		    sizeof(struct tcphdr) + optlen);
		ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ?
		    inp->in6p_route.ro_rt->rt_ifp : NULL);
		error = ip6_output(m, inp->in6p_outputopts, &inp->in6p_route,
		    (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
	} else
#endif
	{
		/* Pseudo-header checksum; hardware finishes it (CSUM_TCP). */
		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
                    htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP));
		m->m_pkthdr.csum_flags = CSUM_TCP;
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
		ip->ip_len = m->m_pkthdr.len;
		error = ip_output(m, inp->inp_options, &inp->inp_route,
		    (tw->tw_so_options & SO_DONTROUTE), NULL, inp);
	}
	if (flags & TH_ACK)
		tcpstat.tcps_sndacks++;
	else
		tcpstat.tcps_sndctrl++;
	tcpstat.tcps_sndtotal++;
	return (error);
}
1786 
1787 /*
1788  * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
1789  *
1790  * This code attempts to calculate the bandwidth-delay product as a
1791  * means of determining the optimal window size to maximize bandwidth,
1792  * minimize RTT, and avoid the over-allocation of buffers on interfaces and
1793  * routers.  This code also does a fairly good job keeping RTTs in check
1794  * across slow links like modems.  We implement an algorithm which is very
 * similar (but not identical) to TCP/Vegas.  The code operates on the
 * transmitter side of a TCP connection and so only affects the transmit
 * side of the connection.
1798  *
1799  * BACKGROUND:  TCP makes no provision for the management of buffer space
1800  * at the end points or at the intermediate routers and switches.  A TCP
1801  * stream, whether using NewReno or not, will eventually buffer as
1802  * many packets as it is able and the only reason this typically works is
1803  * due to the fairly small default buffers made available for a connection
 * (typically 16K or 32K).  As machines use larger windows and/or window
1805  * scaling it is now fairly easy for even a single TCP connection to blow-out
1806  * all available buffer space not only on the local interface, but on
1807  * intermediate routers and switches as well.  NewReno makes a misguided
1808  * attempt to 'solve' this problem by waiting for an actual failure to occur,
1809  * then backing off, then steadily increasing the window again until another
1810  * failure occurs, ad-infinitum.  This results in terrible oscillation that
1811  * is only made worse as network loads increase and the idea of intentionally
1812  * blowing out network buffers is, frankly, a terrible way to manage network
1813  * resources.
1814  *
1815  * It is far better to limit the transmit window prior to the failure
1816  * condition being achieved.  There are two general ways to do this:  First
1817  * you can 'scan' through different transmit window sizes and locate the
1818  * point where the RTT stops increasing, indicating that you have filled the
1819  * pipe, then scan backwards until you note that RTT stops decreasing, then
1820  * repeat ad-infinitum.  This method works in principle but has severe
1821  * implementation issues due to RTT variances, timer granularity, and
1822  * instability in the algorithm which can lead to many false positives and
1823  * create oscillations as well as interact badly with other TCP streams
1824  * implementing the same algorithm.
1825  *
1826  * The second method is to limit the window to the bandwidth delay product
1827  * of the link.  This is the method we implement.  RTT variances and our
1828  * own manipulation of the congestion window, bwnd, can potentially
1829  * destabilize the algorithm.  For this reason we have to stabilize the
1830  * elements used to calculate the window.  We do this by using the minimum
1831  * observed RTT, the long term average of the observed bandwidth, and
1832  * by adding two segments worth of slop.  It isn't perfect but it is able
1833  * to react to changing conditions and gives us a very stable basis on
1834  * which to extend the algorithm.
1835  */
/*
 * Update the bandwidth-delay-product transmit window limit (snd_bwnd)
 * for this connection; called with a newly acknowledged sequence number.
 * See the long comment above for the rationale and algorithm.
 */
void
tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
{
	u_long bw;
	u_long bwnd;
	int save_ticks;

	/*
	 * If inflight_enable is disabled in the middle of a tcp connection,
	 * make sure snd_bwnd is effectively disabled.
	 */
	if (tcp_inflight_enable == 0) {
		tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
		tp->snd_bandwidth = 0;
		return;
	}

	/*
	 * Figure out the bandwidth.  Due to the tick granularity this
	 * is a very rough number and it MUST be averaged over a fairly
	 * long period of time.  XXX we need to take into account a link
	 * that is not using all available bandwidth, but for now our
	 * slop will ramp us up if this case occurs and the bandwidth later
	 * increases.
	 *
	 * Note: if ticks rollover 'bw' may wind up negative.  We must
	 * effectively reset t_bw_rtttime for this case.
	 */
	save_ticks = ticks;
	/* Take at most one sample per tick. */
	if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
		return;

	/* Bytes acked since the last sample, scaled to bytes/second. */
	bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
	    (save_ticks - tp->t_bw_rtttime);
	tp->t_bw_rtttime = save_ticks;
	tp->t_bw_rtseq = ack_seq;
	/* Discard the sample on a zero timestamp or negative (rollover) bw. */
	if (tp->t_bw_rtttime == 0 || (int)bw < 0)
		return;
	/* 15/16 exponential moving average of the measured bandwidth. */
	bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;

	tp->snd_bandwidth = bw;

	/*
	 * Calculate the semi-static bandwidth delay product, plus two maximal
	 * segments.  The additional slop puts us squarely in the sweet
	 * spot and also handles the bandwidth run-up case and stabilization.
	 * Without the slop we could be locking ourselves into a lower
	 * bandwidth.
	 *
	 * Situations Handled:
	 *	(1) Prevents over-queueing of packets on LANs, especially on
	 *	    high speed LANs, allowing larger TCP buffers to be
	 *	    specified, and also does a good job preventing
	 *	    over-queueing of packets over choke points like modems
	 *	    (at least for the transmit side).
	 *
	 *	(2) Is able to handle changing network loads (bandwidth
	 *	    drops so bwnd drops, bandwidth increases so bwnd
	 *	    increases).
	 *
	 *	(3) Theoretically should stabilize in the face of multiple
	 *	    connections implementing the same algorithm (this may need
	 *	    a little work).
	 *
	 *	(4) Stability value (defaults to 20 = 2 maximal packets) can
	 *	    be adjusted with a sysctl but typically only needs to be
	 *	    on very slow connections.  A value no smaller then 5
	 *	    should be used, but only reduce this default if you have
	 *	    no other choice.
	 */
#define USERTT	((tp->t_srtt + tp->t_rttbest) / 2)
	bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10;
#undef USERTT

	if (tcp_inflight_debug > 0) {
		static int ltime;
		/* Rate-limit the debug printf to tcp_inflight_debug per sec. */
		if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
			ltime = ticks;
			printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
			    tp,
			    bw,
			    tp->t_rttbest,
			    tp->t_srtt,
			    bwnd
			);
		}
	}
	/* Clamp bwnd into [max(tcp_inflight_min, 2*maxseg), tcp_inflight_max]. */
	if ((long)bwnd < tcp_inflight_min)
		bwnd = tcp_inflight_min;
	if (bwnd > tcp_inflight_max)
		bwnd = tcp_inflight_max;
	if ((long)bwnd < tp->t_maxseg * 2)
		bwnd = tp->t_maxseg * 2;
	tp->snd_bwnd = bwnd;
}
1931 
1932