xref: /freebsd/sys/netinet/tcp_subr.c (revision 56d9e932072f81ebaa7bb1bf5995a46813bc91c4)
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
34  * $FreeBSD$
35  */
36 
37 #include "opt_compat.h"
38 #include "opt_inet6.h"
39 #include "opt_ipsec.h"
40 #include "opt_mac.h"
41 #include "opt_tcpdebug.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/callout.h>
46 #include <sys/kernel.h>
47 #include <sys/sysctl.h>
48 #include <sys/mac.h>
49 #include <sys/malloc.h>
50 #include <sys/mbuf.h>
51 #ifdef INET6
52 #include <sys/domain.h>
53 #endif
54 #include <sys/proc.h>
55 #include <sys/socket.h>
56 #include <sys/socketvar.h>
57 #include <sys/protosw.h>
58 #include <sys/random.h>
59 
60 #include <vm/uma.h>
61 
62 #include <net/route.h>
63 #include <net/if.h>
64 
65 #include <netinet/in.h>
66 #include <netinet/in_systm.h>
67 #include <netinet/ip.h>
68 #ifdef INET6
69 #include <netinet/ip6.h>
70 #endif
71 #include <netinet/in_pcb.h>
72 #ifdef INET6
73 #include <netinet6/in6_pcb.h>
74 #endif
75 #include <netinet/in_var.h>
76 #include <netinet/ip_var.h>
77 #ifdef INET6
78 #include <netinet6/ip6_var.h>
79 #include <netinet6/nd6.h>
80 #endif
81 #include <netinet/tcp.h>
82 #include <netinet/tcp_fsm.h>
83 #include <netinet/tcp_seq.h>
84 #include <netinet/tcp_timer.h>
85 #include <netinet/tcp_var.h>
86 #ifdef INET6
87 #include <netinet6/tcp6_var.h>
88 #endif
89 #include <netinet/tcpip.h>
90 #ifdef TCPDEBUG
91 #include <netinet/tcp_debug.h>
92 #endif
93 #include <netinet6/ip6protosw.h>
94 
95 #ifdef IPSEC
96 #include <netinet6/ipsec.h>
97 #ifdef INET6
98 #include <netinet6/ipsec6.h>
99 #endif
100 #endif /*IPSEC*/
101 
102 #ifdef FAST_IPSEC
103 #include <netipsec/ipsec.h>
104 #ifdef INET6
105 #include <netipsec/ipsec6.h>
106 #endif
107 #define	IPSEC
108 #endif /*FAST_IPSEC*/
109 
110 #include <machine/in_cksum.h>
111 #include <sys/md5.h>
112 
113 int 	tcp_mssdflt = TCP_MSS;
114 SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
115     &tcp_mssdflt , 0, "Default TCP Maximum Segment Size");
116 
117 #ifdef INET6
118 int	tcp_v6mssdflt = TCP6_MSS;
119 SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
120 	CTLFLAG_RW, &tcp_v6mssdflt , 0,
121 	"Default TCP Maximum Segment Size for IPv6");
122 #endif
123 
124 #if 0
125 static int 	tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
126 SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
127     &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time");
128 #endif
129 
130 int	tcp_do_rfc1323 = 1;
131 SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
132     &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions");
133 
134 int	tcp_do_rfc1644 = 0;
135 SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW,
136     &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions");
137 
138 static int	tcp_tcbhashsize = 0;
139 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN,
140      &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
141 
142 static int	do_tcpdrain = 1;
143 SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
144      "Enable tcp_drain routine for extra help when low on mbufs");
145 
146 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
147     &tcbinfo.ipi_count, 0, "Number of active PCBs");
148 
149 static int	icmp_may_rst = 1;
150 SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0,
151     "Certain ICMP unreachable messages may abort connections in SYN_SENT");
152 
153 static int	tcp_isn_reseed_interval = 0;
154 SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
155     &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
156 
157 /*
158  * TCP bandwidth limiting sysctls.  A lower bound as small as 1024 is
159  * useful only for debugging; the default of 6144 below is a reasonable
160  * production value.
161  */
162 static int	tcp_inflight_enable = 0;
163 SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW,
164     &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");
165 
166 static int	tcp_inflight_debug = 0;
167 SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW,
168     &tcp_inflight_debug, 0, "Debug TCP inflight calculations");
169 
170 static int	tcp_inflight_min = 6144;
171 SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW,
172     &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");
173 
174 static int	tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
175 SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW,
176     &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");
177 static int	tcp_inflight_stab = 20;
178 SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
179     &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
180 
181 static struct inpcb *tcp_notify(struct inpcb *, int);
182 static void	tcp_discardcb(struct tcpcb *);
183 
184 /*
185  * Target size of TCP PCB hash tables. Must be a power of two.
186  *
187  * Note that this can be overridden by the kernel environment
188  * variable net.inet.tcp.tcbhashsize
189  */
190 #ifndef TCBHASHSIZE
191 #define TCBHASHSIZE	512
192 #endif
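/*
 * A minimal sketch of the override (illustrative value): adding
 *
 *	net.inet.tcp.tcbhashsize="2048"
 *
 * to loader.conf(5) is picked up by the TUNABLE_INT_FETCH() call in
 * tcp_init() below; non-power-of-2 values fall back to the 512 default.
 */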
193 
194 /*
195  * XXX
196  * Callouts should be moved into struct tcpcb directly.  They are currently
197  * separate because the tcpcb structure is exported to userland for sysctl
198  * parsing, which does not know about callouts.
199  */
200 struct	tcpcb_mem {
201 	struct	tcpcb tcb;
202 	struct	callout tcpcb_mem_rexmt, tcpcb_mem_persist, tcpcb_mem_keep;
203 	struct	callout tcpcb_mem_2msl, tcpcb_mem_delack;
204 };
205 
206 static uma_zone_t tcpcb_zone;
207 static uma_zone_t tcptw_zone;
208 
209 /*
210  * Tcp initialization
211  */
212 void
213 tcp_init()
214 {
215 	int hashsize = TCBHASHSIZE;
216 
217 	tcp_ccgen = 1;
218 
219 	tcp_delacktime = TCPTV_DELACK;
220 	tcp_keepinit = TCPTV_KEEP_INIT;
221 	tcp_keepidle = TCPTV_KEEP_IDLE;
222 	tcp_keepintvl = TCPTV_KEEPINTVL;
223 	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
224 	tcp_msl = TCPTV_MSL;
225 	tcp_rexmit_min = TCPTV_MIN;
226 	tcp_rexmit_slop = TCPTV_CPU_VAR;
227 
228 	INP_INFO_LOCK_INIT(&tcbinfo, "tcp");
229 	LIST_INIT(&tcb);
230 	tcbinfo.listhead = &tcb;
231 	TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
232 	if (!powerof2(hashsize)) {
233 		printf("WARNING: TCB hash size not a power of 2\n");
234 		hashsize = 512; /* safe default */
235 	}
236 	tcp_tcbhashsize = hashsize;
237 	tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
238 	tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
239 					&tcbinfo.porthashmask);
240 	tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb),
241 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
242 	uma_zone_set_max(tcbinfo.ipi_zone, maxsockets);
243 #ifdef INET6
244 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
245 #else /* INET6 */
246 #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
247 #endif /* INET6 */
248 	if (max_protohdr < TCP_MINPROTOHDR)
249 		max_protohdr = TCP_MINPROTOHDR;
250 	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
251 		panic("tcp_init");
252 #undef TCP_MINPROTOHDR
253 	/*
254 	 * These have to be type stable for the benefit of the timers.
255 	 */
256 	tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
257 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
258 	uma_zone_set_max(tcpcb_zone, maxsockets);
259 	tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw),
260 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
261 	uma_zone_set_max(tcptw_zone, maxsockets / 5);
262 	tcp_timer_init();
263 	syncache_init();
264 	tcp_hc_init();
265 }
266 
267 /*
268  * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
269  * tcp_template used to store this data in mbufs, but we now recopy it out
270  * of the tcpcb each time to conserve mbufs.
271  */
272 void
273 tcpip_fillheaders(inp, ip_ptr, tcp_ptr)
274 	struct inpcb *inp;
275 	void *ip_ptr;
276 	void *tcp_ptr;
277 {
278 	struct tcphdr *th = (struct tcphdr *)tcp_ptr;
279 
280 #ifdef INET6
281 	if ((inp->inp_vflag & INP_IPV6) != 0) {
282 		struct ip6_hdr *ip6;
283 
284 		ip6 = (struct ip6_hdr *)ip_ptr;
285 		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
286 			(inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
287 		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
288 			(IPV6_VERSION & IPV6_VERSION_MASK);
289 		ip6->ip6_nxt = IPPROTO_TCP;
290 		ip6->ip6_plen = sizeof(struct tcphdr);
291 		ip6->ip6_src = inp->in6p_laddr;
292 		ip6->ip6_dst = inp->in6p_faddr;
293 	} else
294 #endif
295 	{
296 		struct ip *ip;
297 
298 		ip = (struct ip *)ip_ptr;
299 		ip->ip_v = IPVERSION;
300 		ip->ip_hl = 5;
301 		ip->ip_tos = inp->inp_ip_tos;
302 		ip->ip_len = 0;
303 		ip->ip_id = 0;
304 		ip->ip_off = 0;
305 		ip->ip_ttl = inp->inp_ip_ttl;
306 		ip->ip_sum = 0;
307 		ip->ip_p = IPPROTO_TCP;
308 		ip->ip_src = inp->inp_laddr;
309 		ip->ip_dst = inp->inp_faddr;
310 	}
311 	th->th_sport = inp->inp_lport;
312 	th->th_dport = inp->inp_fport;
313 	th->th_seq = 0;
314 	th->th_ack = 0;
315 	th->th_x2 = 0;
316 	th->th_off = 5;
317 	th->th_flags = 0;
318 	th->th_win = 0;
319 	th->th_urp = 0;
320 	th->th_sum = 0;		/* in_pseudo() is called later for ipv4 */
321 }
322 
323 /*
324  * Create template to be used to send tcp packets on a connection.
325  * Allocates an mbuf and fills in a skeletal tcp/ip header.  The only
326  * use for this function is in keepalives, which use tcp_respond.
327  */
328 struct tcptemp *
329 tcpip_maketemplate(inp)
330 	struct inpcb *inp;
331 {
332 	struct mbuf *m;
333 	struct tcptemp *n;
334 
335 	m = m_get(M_DONTWAIT, MT_HEADER);
336 	if (m == NULL)
337 		return (0);
338 	m->m_len = sizeof(struct tcptemp);
339 	n = mtod(m, struct tcptemp *);
340 
341 	tcpip_fillheaders(inp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
342 	return (n);
343 }
344 
345 /*
346  * Send a single message to the TCP at address specified by
347  * the given TCP/IP header.  If m == 0, then we make a copy
348  * of the tcpiphdr at ti and send directly to the addressed host.
349  * This is used to force keep alive messages out using the TCP
350  * template for a connection.  If flags are given then we send
351  * a message back to the TCP which originated the segment ti,
352  * and discard the mbuf containing it and any other attached mbufs.
353  *
354  * In any case the ack and sequence number of the transmitted
355  * segment are as specified by the parameters.
356  *
357  * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
358  */
359 void
360 tcp_respond(tp, ipgen, th, m, ack, seq, flags)
361 	struct tcpcb *tp;
362 	void *ipgen;
363 	register struct tcphdr *th;
364 	register struct mbuf *m;
365 	tcp_seq ack, seq;
366 	int flags;
367 {
368 	register int tlen;
369 	int win = 0;
370 	struct ip *ip;
371 	struct tcphdr *nth;
372 #ifdef INET6
373 	struct ip6_hdr *ip6;
374 	int isipv6;
375 #endif /* INET6 */
376 	int ipflags = 0;
377 	struct inpcb *inp = NULL;
378 
379 	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
380 
381 #ifdef INET6
382 	isipv6 = ((struct ip *)ipgen)->ip_v == 6;
383 	ip6 = ipgen;
384 #endif /* INET6 */
385 	ip = ipgen;
386 
387 	if (tp) {
388 		inp = tp->t_inpcb;
389 		KASSERT(inp != NULL, ("tcp control block w/o inpcb"));
390 		INP_INFO_WLOCK_ASSERT(&tcbinfo);
391 		INP_LOCK_ASSERT(inp);
392 		if (!(flags & TH_RST)) {
393 			win = sbspace(&inp->inp_socket->so_rcv);
394 			if (win > (long)TCP_MAXWIN << tp->rcv_scale)
395 				win = (long)TCP_MAXWIN << tp->rcv_scale;
396 		}
397 	}
398 	if (m == 0) {
399 		m = m_gethdr(M_DONTWAIT, MT_HEADER);
400 		if (m == NULL)
401 			return;
402 		tlen = 0;
403 		m->m_data += max_linkhdr;
404 #ifdef INET6
405 		if (isipv6) {
406 			bcopy((caddr_t)ip6, mtod(m, caddr_t),
407 			      sizeof(struct ip6_hdr));
408 			ip6 = mtod(m, struct ip6_hdr *);
409 			nth = (struct tcphdr *)(ip6 + 1);
410 		} else
411 #endif /* INET6 */
412 	      {
413 		bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
414 		ip = mtod(m, struct ip *);
415 		nth = (struct tcphdr *)(ip + 1);
416 	      }
417 		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
418 		flags = TH_ACK;
419 	} else {
420 		m_freem(m->m_next);
421 		m->m_next = 0;
422 		m->m_data = (caddr_t)ipgen;
423 		/* m_len is set later */
424 		tlen = 0;
425 #define xchg(a,b,type) { type t; t=a; a=b; b=t; }
426 #ifdef INET6
427 		if (isipv6) {
428 			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
429 			nth = (struct tcphdr *)(ip6 + 1);
430 		} else
431 #endif /* INET6 */
432 	      {
433 		xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
434 		nth = (struct tcphdr *)(ip + 1);
435 	      }
436 		if (th != nth) {
437 			/*
438 			 * this is usually the case when an extension header
439 			 * exists between the IPv6 header and the
440 			 * TCP header.
441 			 */
442 			nth->th_sport = th->th_sport;
443 			nth->th_dport = th->th_dport;
444 		}
445 		xchg(nth->th_dport, nth->th_sport, n_short);
446 #undef xchg
447 	}
448 #ifdef INET6
449 	if (isipv6) {
450 		ip6->ip6_flow = 0;
451 		ip6->ip6_vfc = IPV6_VERSION;
452 		ip6->ip6_nxt = IPPROTO_TCP;
453 		ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
454 						tlen));
455 		tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
456 	} else
457 #endif
458       {
459 	tlen += sizeof (struct tcpiphdr);
460 	ip->ip_len = tlen;
461 	ip->ip_ttl = ip_defttl;
462       }
463 	m->m_len = tlen;
464 	m->m_pkthdr.len = tlen;
465 	m->m_pkthdr.rcvif = (struct ifnet *) 0;
466 #ifdef MAC
467 	if (inp != NULL) {
468 		/*
469 		 * Packet is associated with a socket, so allow the
470 		 * label of the response to reflect the socket label.
471 		 */
472 		mac_create_mbuf_from_socket(inp->inp_socket, m);
473 	} else {
474 		/*
475 		 * Packet is not associated with a socket, so possibly
476 		 * update the label in place.
477 		 */
478 		mac_reflect_mbuf_tcp(m);
479 	}
480 #endif
481 	nth->th_seq = htonl(seq);
482 	nth->th_ack = htonl(ack);
483 	nth->th_x2 = 0;
484 	nth->th_off = sizeof (struct tcphdr) >> 2;
485 	nth->th_flags = flags;
486 	if (tp)
487 		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
488 	else
489 		nth->th_win = htons((u_short)win);
490 	nth->th_urp = 0;
491 #ifdef INET6
492 	if (isipv6) {
493 		nth->th_sum = 0;
494 		nth->th_sum = in6_cksum(m, IPPROTO_TCP,
495 					sizeof(struct ip6_hdr),
496 					tlen - sizeof(struct ip6_hdr));
497 		ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, NULL);
498 	} else
499 #endif /* INET6 */
500       {
501         nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
502 	    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
503         m->m_pkthdr.csum_flags = CSUM_TCP;
504         m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
505       }
506 #ifdef TCPDEBUG
507 	if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG))
508 		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
509 #endif
510 #ifdef INET6
511 	if (isipv6)
512 		(void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
513 	else
514 #endif /* INET6 */
515 	(void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
516 }
517 
518 /*
519  * Create a new TCP control block, making an
520  * empty reassembly queue and hooking it to the argument
521  * protocol control block.  The `inp' parameter must have
522  * come from the zone allocator set up in tcp_init().
523  */
524 struct tcpcb *
525 tcp_newtcpcb(inp)
526 	struct inpcb *inp;
527 {
528 	struct tcpcb_mem *tm;
529 	struct tcpcb *tp;
530 #ifdef INET6
531 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
532 #endif /* INET6 */
533 
534 	tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO);
535 	if (tm == NULL)
536 		return (NULL);
537 	tp = &tm->tcb;
538 	/*	LIST_INIT(&tp->t_segq); */	/* XXX covered by M_ZERO */
539 	tp->t_maxseg = tp->t_maxopd =
540 #ifdef INET6
541 		isipv6 ? tcp_v6mssdflt :
542 #endif /* INET6 */
543 		tcp_mssdflt;
544 
545 	/* Set up our timeouts. */
546 	callout_init(tp->tt_rexmt = &tm->tcpcb_mem_rexmt, 0);
547 	callout_init(tp->tt_persist = &tm->tcpcb_mem_persist, 0);
548 	callout_init(tp->tt_keep = &tm->tcpcb_mem_keep, 0);
549 	callout_init(tp->tt_2msl = &tm->tcpcb_mem_2msl, 0);
550 	callout_init(tp->tt_delack = &tm->tcpcb_mem_delack, 0);
551 
552 	if (tcp_do_rfc1323)
553 		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
554 	if (tcp_do_rfc1644)
555 		tp->t_flags |= TF_REQ_CC;
556 	tp->t_inpcb = inp;	/* XXX */
557 	/*
558 	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
559 	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
560 	 * reasonable initial retransmit time.
561 	 */
562 	tp->t_srtt = TCPTV_SRTTBASE;
563 	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
564 	tp->t_rttmin = tcp_rexmit_min;
565 	tp->t_rxtcur = TCPTV_RTOBASE;
566 	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
567 	tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
568 	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
569 	tp->t_rcvtime = ticks;
570 	tp->t_bw_rtttime = ticks;
571 	/*
572 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
573 	 * because the socket may be bound to an IPv6 wildcard address,
574 	 * which may match an IPv4-mapped IPv6 address.
575 	 */
576 	inp->inp_ip_ttl = ip_defttl;
577 	inp->inp_ppcb = (caddr_t)tp;
578 	return (tp);		/* XXX */
579 }
580 
581 /*
582  * Drop a TCP connection, reporting
583  * the specified error.  If connection is synchronized,
584  * then send a RST to peer.
585  */
586 struct tcpcb *
587 tcp_drop(tp, errno)
588 	register struct tcpcb *tp;
589 	int errno;
590 {
591 	struct socket *so = tp->t_inpcb->inp_socket;
592 
593 	if (TCPS_HAVERCVDSYN(tp->t_state)) {
594 		tp->t_state = TCPS_CLOSED;
595 		(void) tcp_output(tp);
596 		tcpstat.tcps_drops++;
597 	} else
598 		tcpstat.tcps_conndrops++;
599 	if (errno == ETIMEDOUT && tp->t_softerror)
600 		errno = tp->t_softerror;
601 	so->so_error = errno;
602 	return (tcp_close(tp));
603 }
604 
605 static void
606 tcp_discardcb(tp)
607 	struct tcpcb *tp;
608 {
609 	struct tseg_qent *q;
610 	struct inpcb *inp = tp->t_inpcb;
611 	struct socket *so = inp->inp_socket;
612 #ifdef INET6
613 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
614 #endif /* INET6 */
615 
616 	/*
617 	 * Make sure that all of our timers are stopped before we
618 	 * delete the PCB.
619 	 */
620 	callout_stop(tp->tt_rexmt);
621 	callout_stop(tp->tt_persist);
622 	callout_stop(tp->tt_keep);
623 	callout_stop(tp->tt_2msl);
624 	callout_stop(tp->tt_delack);
625 
626 	/*
627 	 * If we got enough samples through the srtt filter,
628 	 * save the rtt and rttvar in the host cache.
629 	 * 'Enough' is arbitrarily defined as 4 rtt samples.
630 	 * 4 samples is enough for the srtt filter to converge
631 	 * close enough to the correct value; with fewer samples
632 	 * we could save a bogus rtt.  The danger is not high,
633 	 * as tcp quickly recovers from everything.
634 	 * XXX: Works very well but needs some more statistics!
635 	 */
636 	if (tp->t_rttupdated >= 4) {
637 		struct hc_metrics_lite metrics;
638 		u_long ssthresh;
639 
640 		bzero(&metrics, sizeof(metrics));
641 		/*
642 		 * Always update ssthresh when the conditions below are
643 		 * satisfied.  This gives us a better starting value for
644 		 * congestion avoidance on new connections.  ssthresh is
645 		 * only set if packet loss occurred on a session.
646 		 */
647 		ssthresh = tp->snd_ssthresh;
648 		if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
649 			/*
650 			 * convert the limit from user data bytes to
651 			 * packets then to packet data bytes.
652 			 */
653 			ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
654 			if (ssthresh < 2)
655 				ssthresh = 2;
656 			ssthresh *= (u_long)(tp->t_maxseg +
657 #ifdef INET6
658 				      (isipv6 ? sizeof (struct ip6_hdr) +
659 					       sizeof (struct tcphdr) :
660 #endif
661 				       sizeof (struct tcpiphdr)
662 #ifdef INET6
663 				       )
664 #endif
665 				      );
666 		} else
667 			ssthresh = 0;
668 		metrics.rmx_ssthresh = ssthresh;
669 
670 		metrics.rmx_rtt = tp->t_srtt;
671 		metrics.rmx_rttvar = tp->t_rttvar;
672 		/* XXX: This wraps if the pipe is more than 4 Gbit per second */
673 		metrics.rmx_bandwidth = tp->snd_bandwidth;
674 		metrics.rmx_cwnd = tp->snd_cwnd;
675 		metrics.rmx_sendpipe = 0;
676 		metrics.rmx_recvpipe = 0;
677 
678 		tcp_hc_update(&inp->inp_inc, &metrics);
679 	}
680 
681 	/* free the reassembly queue, if any */
682 	while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
683 		LIST_REMOVE(q, tqe_q);
684 		m_freem(q->tqe_m);
685 		FREE(q, M_TSEGQ);
686 	}
687 	inp->inp_ppcb = NULL;
688 	tp->t_inpcb = NULL;
689 	uma_zfree(tcpcb_zone, tp);
690 	soisdisconnected(so);
691 }
692 
693 /*
694  * Close a TCP control block:
695  *    discard all space held by the tcp
696  *    discard internet protocol block
697  *    wake up any sleepers
698  */
699 struct tcpcb *
700 tcp_close(tp)
701 	struct tcpcb *tp;
702 {
703 	struct inpcb *inp = tp->t_inpcb;
704 #ifdef INET6
705 	struct socket *so = inp->inp_socket;
706 #endif
707 
708 	tcp_discardcb(tp);
709 #ifdef INET6
710 	if (INP_CHECK_SOCKAF(so, AF_INET6))
711 		in6_pcbdetach(inp);
712 	else
713 #endif
714 		in_pcbdetach(inp);
715 	tcpstat.tcps_closed++;
716 	return ((struct tcpcb *)0);
717 }
718 
719 void
720 tcp_drain()
721 {
722 	if (do_tcpdrain)
723 	{
724 		struct inpcb *inpb;
725 		struct tcpcb *tcpb;
726 		struct tseg_qent *te;
727 
728 	/*
729 	 * Walk the tcpcbs, if any, and flush the reassembly queue,
730 	 * if there is one...
731 	 * XXX: The "Net/3" implementation doesn't imply that the TCP
732 	 *      reassembly queue should be flushed, but in a situation
733 	 *      where we're really low on mbufs, this is potentially
734 	 *      useful.
735 	 */
736 		INP_INFO_RLOCK(&tcbinfo);
737 		LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) {
738 			if (inpb->inp_vflag & INP_TIMEWAIT)
739 				continue;
740 			INP_LOCK(inpb);
741 			if ((tcpb = intotcpcb(inpb))) {
742 				while ((te = LIST_FIRST(&tcpb->t_segq))
743 			            != NULL) {
744 					LIST_REMOVE(te, tqe_q);
745 					m_freem(te->tqe_m);
746 					FREE(te, M_TSEGQ);
747 				}
748 			}
749 			INP_UNLOCK(inpb);
750 		}
751 		INP_INFO_RUNLOCK(&tcbinfo);
752 	}
753 }
754 
755 /*
756  * Notify a tcp user of an asynchronous error;
757  * store the error as a soft error for later retrieval.
758  *
759  * Do not wake up the user, since there currently is no
760  * mechanism for reporting soft errors (yet - a kqueue
761  * filter may be added).
762  */
763 static struct inpcb *
764 tcp_notify(inp, error)
765 	struct inpcb *inp;
766 	int error;
767 {
768 	struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
769 
770 	/*
771 	 * Ignore some errors if we are hooked up.
772 	 * If connection hasn't completed, has retransmitted several times,
773 	 * and receives a second error, give up now.  This is better
774 	 * than waiting a long time to establish a connection that
775 	 * can never complete.
776 	 */
777 	if (tp->t_state == TCPS_ESTABLISHED &&
778 	    (error == EHOSTUNREACH || error == ENETUNREACH ||
779 	     error == EHOSTDOWN)) {
780 		return inp;
781 	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
782 	    tp->t_softerror) {
783 		tcp_drop(tp, error);
784 		return (struct inpcb *)0;
785 	} else {
786 		tp->t_softerror = error;
787 		return inp;
788 	}
789 #if 0
790 	wakeup( &so->so_timeo);
791 	sorwakeup(so);
792 	sowwakeup(so);
793 #endif
794 }
795 
796 static int
797 tcp_pcblist(SYSCTL_HANDLER_ARGS)
798 {
799 	int error, i, n, s;
800 	struct inpcb *inp, **inp_list;
801 	inp_gen_t gencnt;
802 	struct xinpgen xig;
803 
804 	/*
805 	 * The process of preparing the TCB list is too time-consuming and
806 	 * resource-intensive to repeat twice on every request.
807 	 */
808 	if (req->oldptr == 0) {
809 		n = tcbinfo.ipi_count;
810 		req->oldidx = 2 * (sizeof xig)
811 			+ (n + n/8) * sizeof(struct xtcpcb);
812 		return 0;
813 	}
814 
815 	if (req->newptr != 0)
816 		return EPERM;
817 
818 	/*
819 	 * OK, now we're committed to doing something.
820 	 */
821 	s = splnet();
822 	INP_INFO_RLOCK(&tcbinfo);
823 	gencnt = tcbinfo.ipi_gencnt;
824 	n = tcbinfo.ipi_count;
825 	INP_INFO_RUNLOCK(&tcbinfo);
826 	splx(s);
827 
828 	sysctl_wire_old_buffer(req, 2 * (sizeof xig)
829 		+ n * sizeof(struct xtcpcb));
830 
831 	xig.xig_len = sizeof xig;
832 	xig.xig_count = n;
833 	xig.xig_gen = gencnt;
834 	xig.xig_sogen = so_gencnt;
835 	error = SYSCTL_OUT(req, &xig, sizeof xig);
836 	if (error)
837 		return error;
838 
839 	inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
840 	if (inp_list == 0)
841 		return ENOMEM;
842 
843 	s = splnet();
844 	INP_INFO_RLOCK(&tcbinfo);
845 	for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
846 	     inp = LIST_NEXT(inp, inp_list)) {
847 		INP_LOCK(inp);
848 		if (inp->inp_gencnt <= gencnt) {
849 			/*
850 			 * XXX: This use of cr_cansee(), introduced with
851 			 * TCP state changes, is not quite right, but for
852 			 * now, better than nothing.
853 			 */
854 			if (inp->inp_vflag & INP_TIMEWAIT)
855 				error = cr_cansee(req->td->td_ucred,
856 				    intotw(inp)->tw_cred);
857 			else
858 				error = cr_canseesocket(req->td->td_ucred,
859 				    inp->inp_socket);
860 			if (error == 0)
861 				inp_list[i++] = inp;
862 		}
863 		INP_UNLOCK(inp);
864 	}
865 	INP_INFO_RUNLOCK(&tcbinfo);
866 	splx(s);
867 	n = i;
868 
869 	error = 0;
870 	for (i = 0; i < n; i++) {
871 		inp = inp_list[i];
872 		if (inp->inp_gencnt <= gencnt) {
873 			struct xtcpcb xt;
874 			caddr_t inp_ppcb;
875 			xt.xt_len = sizeof xt;
876 			/* XXX should avoid extra copy */
877 			bcopy(inp, &xt.xt_inp, sizeof *inp);
878 			inp_ppcb = inp->inp_ppcb;
879 			if (inp_ppcb == NULL)
880 				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
881 			else if (inp->inp_vflag & INP_TIMEWAIT) {
882 				bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
883 				xt.xt_tp.t_state = TCPS_TIME_WAIT;
884 			} else
885 				bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
886 			if (inp->inp_socket)
887 				sotoxsocket(inp->inp_socket, &xt.xt_socket);
888 			else {
889 				bzero(&xt.xt_socket, sizeof xt.xt_socket);
890 				xt.xt_socket.xso_protocol = IPPROTO_TCP;
891 			}
892 			xt.xt_inp.inp_gencnt = inp->inp_gencnt;
893 			error = SYSCTL_OUT(req, &xt, sizeof xt);
894 		}
895 	}
896 	if (!error) {
897 		/*
898 		 * Give the user an updated idea of our state.
899 		 * If the generation differs from what we told
900 		 * her before, she knows that something happened
901 		 * while we were processing this request, and it
902 		 * might be necessary to retry.
903 		 */
904 		s = splnet();
905 		INP_INFO_RLOCK(&tcbinfo);
906 		xig.xig_gen = tcbinfo.ipi_gencnt;
907 		xig.xig_sogen = so_gencnt;
908 		xig.xig_count = tcbinfo.ipi_count;
909 		INP_INFO_RUNLOCK(&tcbinfo);
910 		splx(s);
911 		error = SYSCTL_OUT(req, &xig, sizeof xig);
912 	}
913 	free(inp_list, M_TEMP);
914 	return error;
915 }
916 
917 SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
918 	    tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
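/*
 * Userland readers such as netstat(1) retrieve this list via sysctl(3).
 * An illustrative sketch, error handling omitted:
 *
 *	size_t len = 0;
 *	sysctlbyname("net.inet.tcp.pcblist", NULL, &len, NULL, 0);
 *	char *buf = malloc(len);
 *	sysctlbyname("net.inet.tcp.pcblist", buf, &len, NULL, 0);
 *
 * The buffer then holds an xinpgen followed by xtcpcb records and a
 * trailing xinpgen; if the two generation counts differ, retry the read.
 */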
919 
920 static int
921 tcp_getcred(SYSCTL_HANDLER_ARGS)
922 {
923 	struct xucred xuc;
924 	struct sockaddr_in addrs[2];
925 	struct inpcb *inp;
926 	int error, s;
927 
928 	error = suser_cred(req->td->td_ucred, PRISON_ROOT);
929 	if (error)
930 		return (error);
931 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
932 	if (error)
933 		return (error);
934 	s = splnet();
935 	INP_INFO_RLOCK(&tcbinfo);
936 	inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
937 	    addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
938 	if (inp == NULL) {
939 		error = ENOENT;
940 		goto outunlocked;
941 	}
942 	INP_LOCK(inp);
943 	if (inp->inp_socket == NULL) {
944 		error = ENOENT;
945 		goto out;
946 	}
947 	error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
948 	if (error)
949 		goto out;
950 	cru2x(inp->inp_socket->so_cred, &xuc);
951 out:
952 	INP_UNLOCK(inp);
953 outunlocked:
954 	INP_INFO_RUNLOCK(&tcbinfo);
955 	splx(s);
956 	if (error == 0)
957 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
958 	return (error);
959 }
960 
961 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
962     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
963     tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
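/*
 * An identd-style daemon can map one of its connections to the owning
 * user with this sysctl.  Hypothetical sketch (addrs[0] is the local
 * endpoint, addrs[1] the foreign one, as consumed by tcp_getcred() above):
 *
 *	struct sockaddr_in addrs[2];	// filled in by the caller
 *	struct xucred xuc;
 *	size_t len = sizeof(xuc);
 *	sysctlbyname("net.inet.tcp.getcred", &xuc, &len, addrs, sizeof(addrs));
 *	// on success, xuc.cr_uid identifies the connection's owner
 */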
964 
965 #ifdef INET6
966 static int
967 tcp6_getcred(SYSCTL_HANDLER_ARGS)
968 {
969 	struct xucred xuc;
970 	struct sockaddr_in6 addrs[2];
971 	struct inpcb *inp;
972 	int error, s, mapped = 0;
973 
974 	error = suser_cred(req->td->td_ucred, PRISON_ROOT);
975 	if (error)
976 		return (error);
977 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
978 	if (error)
979 		return (error);
980 	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
981 		if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
982 			mapped = 1;
983 		else
984 			return (EINVAL);
985 	}
986 	s = splnet();
987 	INP_INFO_RLOCK(&tcbinfo);
988 	if (mapped == 1)
989 		inp = in_pcblookup_hash(&tcbinfo,
990 			*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
991 			addrs[1].sin6_port,
992 			*(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
993 			addrs[0].sin6_port,
994 			0, NULL);
995 	else
996 		inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr,
997 				 addrs[1].sin6_port,
998 				 &addrs[0].sin6_addr, addrs[0].sin6_port,
999 				 0, NULL);
1000 	if (inp == NULL) {
1001 		error = ENOENT;
1002 		goto outunlocked;
1003 	}
1004 	INP_LOCK(inp);
1005 	if (inp->inp_socket == NULL) {
1006 		error = ENOENT;
1007 		goto out;
1008 	}
1009 	error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
1010 	if (error)
1011 		goto out;
1012 	cru2x(inp->inp_socket->so_cred, &xuc);
1013 out:
1014 	INP_UNLOCK(inp);
1015 outunlocked:
1016 	INP_INFO_RUNLOCK(&tcbinfo);
1017 	splx(s);
1018 	if (error == 0)
1019 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
1020 	return (error);
1021 }
1022 
1023 SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
1024     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
1025     tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
1026 #endif
1027 
1028 
1029 void
1030 tcp_ctlinput(cmd, sa, vip)
1031 	int cmd;
1032 	struct sockaddr *sa;
1033 	void *vip;
1034 {
1035 	struct ip *ip = vip;
1036 	struct tcphdr *th;
1037 	struct in_addr faddr;
1038 	struct inpcb *inp;
1039 	struct tcpcb *tp;
1040 	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
1041 	tcp_seq icmp_seq;
1042 	int s;
1043 
1044 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
1045 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
1046 		return;
1047 
1048 	if (cmd == PRC_QUENCH)
1049 		notify = tcp_quench;
1050 	else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
1051 		cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
1052 		notify = tcp_drop_syn_sent;
1053 	else if (cmd == PRC_MSGSIZE)
1054 		notify = tcp_mtudisc;
1055 	/*
1056 	 * Redirects don't need to be handled up here.
1057 	 */
1058 	else if (PRC_IS_REDIRECT(cmd))
1059 		return;
1060 	/*
1061 	 * Hostdead is ugly because it goes linearly through all PCBs.
1062 	 * XXX: We never get this from ICMP, otherwise it would make an
1063 	 * excellent DoS attack on machines with many connections.
1064 	 */
1065 	else if (cmd == PRC_HOSTDEAD)
1066 		ip = 0;
1067 	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
1068 		return;
1069 	if (ip) {
1070 		s = splnet();
1071 		th = (struct tcphdr *)((caddr_t)ip
1072 				       + (ip->ip_hl << 2));
1073 		INP_INFO_WLOCK(&tcbinfo);
1074 		inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
1075 		    ip->ip_src, th->th_sport, 0, NULL);
1076 		if (inp != NULL)  {
1077 			INP_LOCK(inp);
1078 			if (inp->inp_socket != NULL) {
1079 				icmp_seq = htonl(th->th_seq);
1080 				tp = intotcpcb(inp);
1081 				if (SEQ_GEQ(icmp_seq, tp->snd_una) &&
1082 			    		SEQ_LT(icmp_seq, tp->snd_max))
1083 					inp = (*notify)(inp, inetctlerrmap[cmd]);
1084 			}
1085 			if (inp)
1086 				INP_UNLOCK(inp);
1087 		} else {
1088 			struct in_conninfo inc;
1089 
1090 			inc.inc_fport = th->th_dport;
1091 			inc.inc_lport = th->th_sport;
1092 			inc.inc_faddr = faddr;
1093 			inc.inc_laddr = ip->ip_src;
1094 #ifdef INET6
1095 			inc.inc_isipv6 = 0;
1096 #endif
1097 			syncache_unreach(&inc, th);
1098 		}
1099 		INP_INFO_WUNLOCK(&tcbinfo);
1100 		splx(s);
1101 	} else
1102 		in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
1103 }
1104 
1105 #ifdef INET6
1106 void
1107 tcp6_ctlinput(cmd, sa, d)
1108 	int cmd;
1109 	struct sockaddr *sa;
1110 	void *d;
1111 {
1112 	struct tcphdr th;
1113 	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
1114 	struct ip6_hdr *ip6;
1115 	struct mbuf *m;
1116 	struct ip6ctlparam *ip6cp = NULL;
1117 	const struct sockaddr_in6 *sa6_src = NULL;
1118 	int off;
1119 	struct tcp_portonly {
1120 		u_int16_t th_sport;
1121 		u_int16_t th_dport;
1122 	} *thp;
1123 
1124 	if (sa->sa_family != AF_INET6 ||
1125 	    sa->sa_len != sizeof(struct sockaddr_in6))
1126 		return;
1127 
1128 	if (cmd == PRC_QUENCH)
1129 		notify = tcp_quench;
1130 	else if (cmd == PRC_MSGSIZE)
1131 		notify = tcp_mtudisc;
1132 	else if (!PRC_IS_REDIRECT(cmd) &&
1133 		 ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
1134 		return;
1135 
1136 	/* if the parameter is from icmp6, decode it. */
1137 	if (d != NULL) {
1138 		ip6cp = (struct ip6ctlparam *)d;
1139 		m = ip6cp->ip6c_m;
1140 		ip6 = ip6cp->ip6c_ip6;
1141 		off = ip6cp->ip6c_off;
1142 		sa6_src = ip6cp->ip6c_src;
1143 	} else {
1144 		m = NULL;
1145 		ip6 = NULL;
1146 		off = 0;	/* fool gcc */
1147 		sa6_src = &sa6_any;
1148 	}
1149 
1150 	if (ip6) {
1151 		struct in_conninfo inc;
1152 		/*
1153 		 * XXX: We assume that when ip6 is non-NULL,
1154 		 * m and off are valid.
1155 		 */
1156 
1157 		/* check if we can safely examine src and dst ports */
1158 		if (m->m_pkthdr.len < off + sizeof(*thp))
1159 			return;
1160 
1161 		bzero(&th, sizeof(th));
1162 		m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
1163 
1164 		in6_pcbnotify(&tcb, sa, th.th_dport,
1165 		    (struct sockaddr *)ip6cp->ip6c_src,
1166 		    th.th_sport, cmd, notify);
1167 
1168 		inc.inc_fport = th.th_dport;
1169 		inc.inc_lport = th.th_sport;
1170 		inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
1171 		inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
1172 		inc.inc_isipv6 = 1;
1173 		syncache_unreach(&inc, &th);
1174 	} else
1175 		in6_pcbnotify(&tcb, sa, 0, (const struct sockaddr *)sa6_src,
1176 			      0, cmd, notify);
1177 }
1178 #endif /* INET6 */
1179 
1180 
1181 /*
1182  * Following is where TCP initial sequence number generation occurs.
1183  *
1184  * There are two places where we must use initial sequence numbers:
1185  * 1.  In SYN-ACK packets.
1186  * 2.  In SYN packets.
1187  *
1188  * All ISNs for SYN-ACK packets are generated by the syncache.  See
1189  * tcp_syncache.c for details.
1190  *
1191  * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
1192  * depends on this property.  In addition, these ISNs should be
1193  * unguessable so as to prevent connection hijacking.  To satisfy
1194  * the requirements of this situation, the algorithm outlined in
1195  * RFC 1948 is used to generate sequence numbers.
1196  *
1197  * Implementation details:
1198  *
1199  * Time is based off the system timer, and is corrected so that it
1200  * increases by one megabyte per second.  This allows for proper
1201  * recycling on high speed LANs while still leaving over an hour before
1202  * rollover (2^32 bytes at 2^20 bytes/sec wraps in 4096s, ~68 minutes).
1203  *
1204  * net.inet.tcp.isn_reseed_interval controls the number of seconds
1205  * between seeding of isn_secret.  This is normally set to zero,
1206  * as reseeding should not be necessary.
1207  *
1208  */
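/*
 * In outline, tcp_new_isn() below computes
 *
 *	ISN = M(ticks) + MD5(fport, lport, faddr, laddr, isn_secret)
 *
 * where M(ticks) advances at ISN_BYTES_PER_SECOND, matching the RFC 1948
 * construction.
 */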
1209 
1210 #define ISN_BYTES_PER_SECOND 1048576
1211 
1212 u_char isn_secret[32];
1213 int isn_last_reseed;
1214 MD5_CTX isn_ctx;
1215 
1216 tcp_seq
1217 tcp_new_isn(tp)
1218 	struct tcpcb *tp;
1219 {
1220 	u_int32_t md5_buffer[4];
1221 	tcp_seq new_isn;
1222 
1223 	/* Seed if this is the first use, reseed if requested. */
1224 	if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) &&
1225 	     (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
1226 		< (u_int)ticks))) {
1227 		read_random(&isn_secret, sizeof(isn_secret));
1228 		isn_last_reseed = ticks;
1229 	}
1230 
1231 	/* Compute the md5 hash and return the ISN. */
1232 	MD5Init(&isn_ctx);
1233 	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
1234 	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
1235 #ifdef INET6
1236 	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
1237 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
1238 			  sizeof(struct in6_addr));
1239 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
1240 			  sizeof(struct in6_addr));
1241 	} else
1242 #endif
1243 	{
1244 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
1245 			  sizeof(struct in_addr));
1246 		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
1247 			  sizeof(struct in_addr));
1248 	}
1249 	MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
1250 	MD5Final((u_char *) &md5_buffer, &isn_ctx);
1251 	new_isn = (tcp_seq) md5_buffer[0];
1252 	new_isn += ticks * (ISN_BYTES_PER_SECOND / hz);
1253 	return new_isn;
1254 }
1255 
1256 /*
1257  * When a source quench is received, close congestion window
1258  * to one segment.  We will gradually open it again as we proceed.
1259  */
1260 struct inpcb *
1261 tcp_quench(inp, errno)
1262 	struct inpcb *inp;
1263 	int errno;
1264 {
1265 	struct tcpcb *tp = intotcpcb(inp);
1266 
1267 	if (tp)
1268 		tp->snd_cwnd = tp->t_maxseg;
1269 	return (inp);
1270 }
1271 
1272 /*
1273  * When a specific ICMP unreachable message is received and the
1274  * connection state is SYN-SENT, drop the connection.  This behavior
1275  * is controlled by the icmp_may_rst sysctl.
1276  */
1277 struct inpcb *
1278 tcp_drop_syn_sent(inp, errno)
1279 	struct inpcb *inp;
1280 	int errno;
1281 {
1282 	struct tcpcb *tp = intotcpcb(inp);
1283 
1284 	if (tp && tp->t_state == TCPS_SYN_SENT) {
1285 		tcp_drop(tp, errno);
1286 		return (struct inpcb *)0;
1287 	}
1288 	return inp;
1289 }
1290 
1291 /*
1292  * When `need fragmentation' ICMP is received, update our idea of the MSS
1293  * based on the new value in the route.  Also nudge TCP to send something,
1294  * since we know the packet we just sent was dropped.
1295  * This duplicates some code in the tcp_mss() function in tcp_input.c.
1296  */
1297 struct inpcb *
1298 tcp_mtudisc(inp, errno)
1299 	struct inpcb *inp;
1300 	int errno;
1301 {
1302 	struct tcpcb *tp = intotcpcb(inp);
1303 	struct rmxp_tao tao;
1304 	struct socket *so = inp->inp_socket;
1305 	u_int maxmtu;
1306 	u_int romtu;
1307 	int mss;
1308 #ifdef INET6
1309 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;	/* tp may be NULL */
1310 #endif /* INET6 */
1311 	bzero(&tao, sizeof(tao));
1312 
1313 	if (tp) {
1314 		maxmtu = tcp_hc_getmtu(&inp->inp_inc); /* IPv4 and IPv6 */
1315 		romtu =
1316 #ifdef INET6
1317 		    isipv6 ? tcp_maxmtu6(&inp->inp_inc) :
1318 #endif /* INET6 */
1319 		    tcp_maxmtu(&inp->inp_inc);
1320 		if (!maxmtu)
1321 			maxmtu = romtu;
1322 		else
1323 			maxmtu = min(maxmtu, romtu);
1324 		if (!maxmtu) {
1325 			tp->t_maxopd = tp->t_maxseg =
1326 #ifdef INET6
1327 				isipv6 ? tcp_v6mssdflt :
1328 #endif /* INET6 */
1329 				tcp_mssdflt;
1330 			return inp;
1331 		}
1332 		mss = maxmtu -
1333 #ifdef INET6
1334 			(isipv6 ?
1335 			 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
1336 #endif /* INET6 */
1337 			 sizeof(struct tcpiphdr)
1338 #ifdef INET6
1339 			 )
1340 #endif /* INET6 */
1341 			;
1342 
1343 		if (tcp_do_rfc1644) {
1344 			tcp_hc_gettao(&inp->inp_inc, &tao);
1345 			if (tao.tao_mssopt)
1346 				mss = min(mss, tao.tao_mssopt);
1347 		}
1348 		/*
1349 		 * XXX - The above conditional probably violates the TCP
1350 		 * spec.  The problem is that, since we don't know the
1351 		 * other end's MSS, we are supposed to use a conservative
1352 		 * default.  But, if we do that, then MTU discovery will
1353 		 * never actually take place, because the conservative
1354 		 * default is much less than the MTUs typically seen
1355 		 * on the Internet today.  For the moment, we'll sweep
1356 		 * this under the carpet.
1357 		 *
1358 		 * The conservative default might not actually be a problem
1359 		 * if the only case this occurs is when sending an initial
1360 		 * SYN with options and data to a host we've never talked
1361 		 * to before.  Then, they will reply with an MSS value which
1362 		 * will get recorded and the new parameters should get
1363 		 * recomputed.  For Further Study.
1364 		 */
1365 		if (tp->t_maxopd <= mss)
1366 			return inp;
1367 		tp->t_maxopd = mss;
1368 
1369 		if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
1370 		    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
1371 			mss -= TCPOLEN_TSTAMP_APPA;
1372 		if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
1373 		    (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)
1374 			mss -= TCPOLEN_CC_APPA;
1375 #if	(MCLBYTES & (MCLBYTES - 1)) == 0
1376 		if (mss > MCLBYTES)
1377 			mss &= ~(MCLBYTES-1);
1378 #else
1379 		if (mss > MCLBYTES)
1380 			mss = mss / MCLBYTES * MCLBYTES;
1381 #endif
1382 		if (so->so_snd.sb_hiwat < mss)
1383 			mss = so->so_snd.sb_hiwat;
1384 
1385 		tp->t_maxseg = mss;
1386 
1387 		tcpstat.tcps_mturesent++;
1388 		tp->t_rtttime = 0;
1389 		tp->snd_nxt = tp->snd_una;
1390 		tcp_output(tp);
1391 	}
1392 	return inp;
1393 }
1394 
1395 /*
1396  * Look-up the routing entry to the peer of this inpcb.  If no route
1397  * is found and it cannot be allocated, then return NULL.  This routine
1398  * is called by TCP routines that access the rmx structure and by tcp_mss
1399  * to get the interface MTU.
1400  */
1401 u_long
1402 tcp_maxmtu(inc)
1403 	struct in_conninfo *inc;
1404 {
1405 	struct route sro;
1406 	struct sockaddr_in *dst;
1407 	struct ifnet *ifp;
1408 	u_long maxmtu = 0;
1409 
1410 	KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
1411 
1412 	bzero(&sro, sizeof(sro));
1413 	if (inc->inc_faddr.s_addr != INADDR_ANY) {
1414 	        dst = (struct sockaddr_in *)&sro.ro_dst;
1415 		dst->sin_family = AF_INET;
1416 		dst->sin_len = sizeof(*dst);
1417 		dst->sin_addr = inc->inc_faddr;
1418 		rtalloc_ign(&sro, RTF_CLONING);
1419 	}
1420 	if (sro.ro_rt != NULL) {
1421 		ifp = sro.ro_rt->rt_ifp;
1422 		if (sro.ro_rt->rt_rmx.rmx_mtu == 0)
1423 			maxmtu = ifp->if_mtu;
1424 		else
1425 			maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
1426 		RTFREE(sro.ro_rt);
1427 	}
1428 	return (maxmtu);
1429 }
1430 
1431 #ifdef INET6
1432 u_long
1433 tcp_maxmtu6(inc)
1434 	struct in_conninfo *inc;
1435 {
1436 	struct route_in6 sro6;
1437 	struct ifnet *ifp;
1438 	u_long maxmtu = 0;
1439 
1440 	KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
1441 
1442 	bzero(&sro6, sizeof(sro6));
1443 	if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
1444 		sro6.ro_dst.sin6_family = AF_INET6;
1445 		sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
1446 		sro6.ro_dst.sin6_addr = inc->inc6_faddr;
1447 		rtalloc_ign((struct route *)&sro6, RTF_CLONING);
1448 	}
1449 	if (sro6.ro_rt != NULL) {
1450 		ifp = sro6.ro_rt->rt_ifp;
1451 		if (sro6.ro_rt->rt_rmx.rmx_mtu == 0)
1452 			maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp);
1453 		else
1454 			maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu,
1455 				     IN6_LINKMTU(sro6.ro_rt->rt_ifp));
1456 		RTFREE(sro6.ro_rt);
1457 	}
1458 
1459 	return (maxmtu);
1460 }
1461 #endif /* INET6 */
1462 
1463 #ifdef IPSEC
1464 /* compute ESP/AH header size for TCP, including outer IP header. */
1465 size_t
1466 ipsec_hdrsiz_tcp(tp)
1467 	struct tcpcb *tp;
1468 {
1469 	struct inpcb *inp;
1470 	struct mbuf *m;
1471 	size_t hdrsiz;
1472 	struct ip *ip;
1473 #ifdef INET6
1474 	struct ip6_hdr *ip6;
1475 #endif
1476 	struct tcphdr *th;
1477 
1478 	if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
1479 		return 0;
1480 	MGETHDR(m, M_DONTWAIT, MT_DATA);
1481 	if (!m)
1482 		return 0;
1483 
1484 #ifdef INET6
1485 	if ((inp->inp_vflag & INP_IPV6) != 0) {
1486 		ip6 = mtod(m, struct ip6_hdr *);
1487 		th = (struct tcphdr *)(ip6 + 1);
1488 		m->m_pkthdr.len = m->m_len =
1489 			sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1490 		tcpip_fillheaders(inp, ip6, th);
1491 		hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
1492 	} else
1493 #endif /* INET6 */
1494       {
1495 	ip = mtod(m, struct ip *);
1496 	th = (struct tcphdr *)(ip + 1);
1497 	m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
1498 	tcpip_fillheaders(inp, ip, th);
1499 	hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
1500       }
1501 
1502 	m_free(m);
1503 	return hdrsiz;
1504 }
1505 #endif /*IPSEC*/
1506 
1507 /*
1508  * Move a TCP connection into TIME_WAIT state.
1509  *    tcbinfo is unlocked.
1510  *    inp is locked, and is unlocked before returning.
1511  */
1512 void
1513 tcp_twstart(tp)
1514 	struct tcpcb *tp;
1515 {
1516 	struct tcptw *tw;
1517 	struct inpcb *inp;
1518 	int tw_time, acknow;
1519 	struct socket *so;
1520 
1521 	tw = uma_zalloc(tcptw_zone, M_NOWAIT);
1522 	if (tw == NULL) {
1523 		tw = tcp_timer_2msl_tw(1);
1524 		if (tw == NULL) {
1525 			tcp_close(tp);
1526 			return;
1527 		}
1528 	}
1529 	inp = tp->t_inpcb;
1530 	tw->tw_inpcb = inp;
1531 
1532 	/*
1533 	 * Recover last window size sent.
1534 	 */
1535 	tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale;
1536 
1537 	/*
1538 	 * Set t_recent if timestamps are used on the connection.
1539 	 */
1540         if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
1541             (TF_REQ_TSTMP|TF_RCVD_TSTMP))
1542 		tw->t_recent = tp->ts_recent;
1543 	else
1544 		tw->t_recent = 0;
1545 
1546 	tw->snd_nxt = tp->snd_nxt;
1547 	tw->rcv_nxt = tp->rcv_nxt;
1548 	tw->iss     = tp->iss;
1549 	tw->irs     = tp->irs;
1550 	tw->cc_recv = tp->cc_recv;
1551 	tw->cc_send = tp->cc_send;
1552 	tw->t_starttime = tp->t_starttime;
1553 	tw->tw_time = 0;
1554 
1555 /* XXX
1556  * If this code is ever used for the
1557  * FIN_WAIT_2 state as well, then we may need
1558  * a ts_recent from the last segment.
1559  */
1560 	/* Shorten TIME_WAIT [RFC-1644, p.28] */
1561 	if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) {
1562 		tw_time = tp->t_rxtcur * TCPTV_TWTRUNC;
1563 		/* For T/TCP client, force ACK now. */
1564 		acknow = 1;
1565 	} else {
1566 		tw_time = 2 * tcp_msl;
1567 		acknow = tp->t_flags & TF_ACKNOW;
1568 	}
1569 	tcp_discardcb(tp);
1570 	so = inp->inp_socket;
1571 	so->so_pcb = NULL;
1572 	tw->tw_cred = crhold(so->so_cred);
1573 	tw->tw_so_options = so->so_options;
1574 	if (acknow)
1575 		tcp_twrespond(tw, so, NULL, TH_ACK);
1576 	sotryfree(so);
1577 	inp->inp_socket = NULL;
1578 	inp->inp_ppcb = (caddr_t)tw;
1579 	inp->inp_vflag |= INP_TIMEWAIT;
1580 	tcp_timer_2msl_reset(tw, tw_time);
1581 	INP_UNLOCK(inp);
1582 }
1583 
1584 /*
1585  * The approximate rate of ISN increase of Microsoft TCP stacks;
1586  * the actual rate is slightly higher due to the addition of
1587  * random positive increments.
1588  *
1589  * Most other new OSes use semi-randomized ISN values, so we
1590  * do not need to worry about them.
1591  */
1592 #define MS_ISN_BYTES_PER_SECOND		250000
1593 
1594 /*
1595  * Determine if the ISN we will generate has advanced beyond the last
1596  * sequence number used by the previous connection.  If so, indicate
1597  * that it is safe to recycle this tw socket by returning 1.
1598  */
1599 int
1600 tcp_twrecycleable(struct tcptw *tw)
1601 {
1602 	tcp_seq new_iss = tw->iss;
1603 	tcp_seq new_irs = tw->irs;
1604 
1605 	new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz);
1606 	new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz);
1607 
1608 	if (SEQ_GT(new_iss, tw->snd_nxt) && SEQ_GT(new_irs, tw->rcv_nxt))
1609 		return 1;
1610 	else
1611 		return 0;
1612 }
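/*
 * Worked example for tcp_twrecycleable() (illustrative numbers): one
 * second after entering TIME_WAIT, new_iss has advanced 1MB past tw->iss
 * and new_irs 250KB past tw->irs; only once both estimates pass snd_nxt
 * and rcv_nxt respectively is early recycling considered safe.
 */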
1613 
1614 struct tcptw *
1615 tcp_twclose(struct tcptw *tw, int reuse)
1616 {
1617 	struct inpcb *inp;
1618 
1619 	inp = tw->tw_inpcb;
1620 	tw->tw_inpcb = NULL;
1621 	tcp_timer_2msl_stop(tw);
1622 	inp->inp_ppcb = NULL;
1623 #ifdef INET6
1624 	if (inp->inp_vflag & INP_IPV6PROTO)
1625 		in6_pcbdetach(inp);
1626 	else
1627 #endif
1628 		in_pcbdetach(inp);
1629 	tcpstat.tcps_closed++;
1630 	if (reuse)
1631 		return (tw);
1632 	uma_zfree(tcptw_zone, tw);
1633 	return (NULL);
1634 }
1635 
1636 /*
1637  * One of so and msrc must be non-NULL for use by the MAC Framework to
1638  * construct a label for any resulting packet.
1639  */
1640 int
1641 tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc,
1642     int flags)
1643 {
1644 	struct inpcb *inp = tw->tw_inpcb;
1645 	struct tcphdr *th;
1646 	struct mbuf *m;
1647 	struct ip *ip = NULL;
1648 	u_int8_t *optp;
1649 	u_int hdrlen, optlen;
1650 	int error;
1651 #ifdef INET6
1652 	struct ip6_hdr *ip6 = NULL;
1653 	int isipv6 = inp->inp_inc.inc_isipv6;
1654 #endif
1655 
1656 	KASSERT(so != NULL || msrc != NULL,
1657 	    ("tcp_twrespond: so and msrc NULL"));
1658 
1659 	m = m_gethdr(M_DONTWAIT, MT_HEADER);
1660 	if (m == NULL)
1661 		return (ENOBUFS);
1662 	m->m_data += max_linkhdr;
1663 
1664 #ifdef MAC
1665 	if (so != NULL)
1666 		mac_create_mbuf_from_socket(so, m);
1667 	else
1668 		mac_create_mbuf_netlayer(msrc, m);
1669 #endif
1670 
1671 #ifdef INET6
1672 	if (isipv6) {
1673 		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1674 		ip6 = mtod(m, struct ip6_hdr *);
1675 		th = (struct tcphdr *)(ip6 + 1);
1676 		tcpip_fillheaders(inp, ip6, th);
1677 	} else
1678 #endif
1679 	{
1680 		hdrlen = sizeof(struct tcpiphdr);
1681 		ip = mtod(m, struct ip *);
1682 		th = (struct tcphdr *)(ip + 1);
1683 		tcpip_fillheaders(inp, ip, th);
1684 	}
1685 	optp = (u_int8_t *)(th + 1);
1686 
1687  	/*
1688 	 * Send a timestamp and echo-reply if both our side and our peer
1689 	 * have sent timestamps in our SYNs and this is not a RST.
1690  	 */
1691 	if (tw->t_recent && flags == TH_ACK) {
1692 		u_int32_t *lp = (u_int32_t *)optp;
1693 
1694  		/* Form timestamp option as shown in appendix A of RFC 1323. */
1695  		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
1696  		*lp++ = htonl(ticks);
1697  		*lp   = htonl(tw->t_recent);
1698  		optp += TCPOLEN_TSTAMP_APPA;
1699  	}
1700 
1701  	/*
1702 	 * Send `CC-family' options if needed, and it's not a RST.
1703  	 */
1704 	if (tw->cc_recv != 0 && flags == TH_ACK) {
1705 		u_int32_t *lp = (u_int32_t *)optp;
1706 
1707 		*lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CC));
1708 		*lp   = htonl(tw->cc_send);
1709 		optp += TCPOLEN_CC_APPA;
1710  	}
1711 	optlen = optp - (u_int8_t *)(th + 1);
1712 
1713 	m->m_len = hdrlen + optlen;
1714 	m->m_pkthdr.len = m->m_len;
1715 
1716 	KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small"));
1717 
1718 	th->th_seq = htonl(tw->snd_nxt);
1719 	th->th_ack = htonl(tw->rcv_nxt);
1720 	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
1721 	th->th_flags = flags;
1722 	th->th_win = htons(tw->last_win);
1723 
1724 #ifdef INET6
1725 	if (isipv6) {
1726 		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
1727 		    sizeof(struct tcphdr) + optlen);
1728 		ip6->ip6_hlim = in6_selecthlim(inp, NULL);
1729 		error = ip6_output(m, inp->in6p_outputopts, NULL,
1730 		    (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
1731 	} else
1732 #endif
1733 	{
1734 		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
1735                     htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP));
1736 		m->m_pkthdr.csum_flags = CSUM_TCP;
1737 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1738 		ip->ip_len = m->m_pkthdr.len;
1739 		error = ip_output(m, inp->inp_options, NULL,
1740 		    (tw->tw_so_options & SO_DONTROUTE), NULL, inp);
1741 	}
1742 	if (flags & TH_ACK)
1743 		tcpstat.tcps_sndacks++;
1744 	else
1745 		tcpstat.tcps_sndctrl++;
1746 	tcpstat.tcps_sndtotal++;
1747 	return (error);
1748 }
1749 
1750 /*
1751  * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
1752  *
1753  * This code attempts to calculate the bandwidth-delay product as a
1754  * means of determining the optimal window size to maximize bandwidth,
1755  * minimize RTT, and avoid the over-allocation of buffers on interfaces and
1756  * routers.  This code also does a fairly good job keeping RTTs in check
1757  * across slow links like modems.  We implement an algorithm which is very
1758  * similar to (but not meant to be) TCP/Vegas.  The code operates on the
1759  * transmitter side of a TCP connection and so only affects the transmit
1760  * side of the connection.
1761  *
1762  * BACKGROUND:  TCP makes no provision for the management of buffer space
1763  * at the end points or at the intermediate routers and switches.  A TCP
1764  * stream, whether using NewReno or not, will eventually buffer as
1765  * many packets as it is able and the only reason this typically works is
1766  * due to the fairly small default buffers made available for a connection
1767  * (typically 16K or 32K).  As machines use larger windows and/or window
1768  * scaling it is now fairly easy for even a single TCP connection to blow-out
1769  * all available buffer space not only on the local interface, but on
1770  * intermediate routers and switches as well.  NewReno makes a misguided
1771  * attempt to 'solve' this problem by waiting for an actual failure to occur,
1772  * then backing off, then steadily increasing the window again until another
1773  * failure occurs, ad-infinitum.  This results in terrible oscillation that
1774  * is only made worse as network loads increase and the idea of intentionally
1775  * blowing out network buffers is, frankly, a terrible way to manage network
1776  * resources.
1777  *
1778  * It is far better to limit the transmit window prior to the failure
1779  * condition being achieved.  There are two general ways to do this:  First
1780  * you can 'scan' through different transmit window sizes and locate the
1781  * point where the RTT stops increasing, indicating that you have filled the
1782  * pipe, then scan backwards until you note that RTT stops decreasing, then
1783  * repeat ad-infinitum.  This method works in principle but has severe
1784  * implementation issues due to RTT variances, timer granularity, and
1785  * instability in the algorithm which can lead to many false positives and
1786  * create oscillations as well as interact badly with other TCP streams
1787  * implementing the same algorithm.
1788  *
1789  * The second method is to limit the window to the bandwidth delay product
1790  * of the link.  This is the method we implement.  RTT variances and our
1791  * own manipulation of the congestion window, bwnd, can potentially
1792  * destabilize the algorithm.  For this reason we have to stabilize the
1793  * elements used to calculate the window.  We do this by using the minimum
1794  * observed RTT, the long term average of the observed bandwidth, and
1795  * by adding two segments worth of slop.  It isn't perfect but it is able
1796  * to react to changing conditions and gives us a very stable basis on
1797  * which to extend the algorithm.
1798  */
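/*
 * Worked example with illustrative figures, assuming TCP_RTT_SHIFT == 5:
 * at hz = 100, a 50ms RTT is 5 ticks, so t_srtt == t_rttbest == 5 << 5
 * == 160.  With bw = 1250000 bytes/sec (10Mbps) and t_maxseg = 1460:
 *
 *	bwnd = 1250000 * 160 / (100 << 5) + 20 * 1460 / 10
 *	     = 62500 + 2920 = 65420 bytes
 *
 * i.e. the 62.5KB bandwidth-delay product plus two segments of slop.
 */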
1799 void
1800 tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
1801 {
1802 	u_long bw;
1803 	u_long bwnd;
1804 	int save_ticks;
1805 
1806 	/*
1807 	 * If inflight_enable is disabled in the middle of a tcp connection,
1808 	 * make sure snd_bwnd is effectively disabled.
1809 	 */
1810 	if (tcp_inflight_enable == 0) {
1811 		tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
1812 		tp->snd_bandwidth = 0;
1813 		return;
1814 	}
1815 
1816 	/*
1817 	 * Figure out the bandwidth.  Due to the tick granularity this
1818 	 * is a very rough number and it MUST be averaged over a fairly
1819 	 * long period of time.  XXX we need to take into account a link
1820 	 * that is not using all available bandwidth, but for now our
1821 	 * slop will ramp us up if this case occurs and the bandwidth later
1822 	 * increases.
1823 	 *
1824 	 * Note: if ticks rollover 'bw' may wind up negative.  We must
1825 	 * effectively reset t_bw_rtttime for this case.
1826 	 */
1827 	save_ticks = ticks;
1828 	if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
1829 		return;
1830 
1831 	bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
1832 	    (save_ticks - tp->t_bw_rtttime);
1833 	tp->t_bw_rtttime = save_ticks;
1834 	tp->t_bw_rtseq = ack_seq;
1835 	if (tp->t_bw_rtttime == 0 || (int)bw < 0)
1836 		return;
1837 	bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
1838 
1839 	tp->snd_bandwidth = bw;
1840 
1841 	/*
1842 	 * Calculate the semi-static bandwidth delay product, plus two maximal
1843 	 * segments.  The additional slop puts us squarely in the sweet
1844 	 * spot and also handles the bandwidth run-up case and stabilization.
1845 	 * Without the slop we could be locking ourselves into a lower
1846 	 * bandwidth.
1847 	 *
1848 	 * Situations Handled:
1849 	 *	(1) Prevents over-queueing of packets on LANs, especially on
1850 	 *	    high speed LANs, allowing larger TCP buffers to be
1851 	 *	    specified, and also does a good job preventing
1852 	 *	    over-queueing of packets over choke points like modems
1853 	 *	    (at least for the transmit side).
1854 	 *
1855 	 *	(2) Is able to handle changing network loads (bandwidth
1856 	 *	    drops so bwnd drops, bandwidth increases so bwnd
1857 	 *	    increases).
1858 	 *
1859 	 *	(3) Theoretically should stabilize in the face of multiple
1860 	 *	    connections implementing the same algorithm (this may need
1861 	 *	    a little work).
1862 	 *
1863 	 *	(4) Stability value (defaults to 20 = 2 maximal packets) can
1864 	 *	    be adjusted with a sysctl but typically only needs tuning
1865 	 *	    on very slow connections.  A value no smaller than 5
1866 	 *	    should be used, but only reduce this default if you have
1867 	 *	    no other choice.
1868 	 */
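	/*
	 * Note on units: t_srtt and t_rttbest are kept in ticks scaled
	 * by 2^TCP_RTT_SHIFT, so dividing by (hz << TCP_RTT_SHIFT)
	 * reduces bw (bytes/sec) * rtt to plain bytes.
	 */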
1869 #define USERTT	((tp->t_srtt + tp->t_rttbest) / 2)
1870 	bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10;
1871 #undef USERTT
1872 
1873 	if (tcp_inflight_debug > 0) {
1874 		static int ltime;
1875 		if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
1876 			ltime = ticks;
1877 			printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
1878 			    tp,
1879 			    bw,
1880 			    tp->t_rttbest,
1881 			    tp->t_srtt,
1882 			    bwnd
1883 			);
1884 		}
1885 	}
1886 	if ((long)bwnd < tcp_inflight_min)
1887 		bwnd = tcp_inflight_min;
1888 	if (bwnd > tcp_inflight_max)
1889 		bwnd = tcp_inflight_max;
1890 	if ((long)bwnd < tp->t_maxseg * 2)
1891 		bwnd = tp->t_maxseg * 2;
1892 	tp->snd_bwnd = bwnd;
1893 }
1894 
1895