xref: /freebsd/sys/netinet/tcp_output.c (revision 4cf49a43559ed9fdad601bdcccd2c55963008675)
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)tcp_output.c	8.4 (Berkeley) 5/24/95
34  * $FreeBSD$
35  */
36 
37 #include "opt_tcpdebug.h"
38 
39 #include <stddef.h>
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/kernel.h>
44 #include <sys/sysctl.h>
45 #include <sys/mbuf.h>
46 #include <sys/protosw.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 
50 #include <net/route.h>
51 
52 #include <netinet/in.h>
53 #include <netinet/in_systm.h>
54 #include <netinet/ip.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/ip_var.h>
57 #include <netinet/tcp.h>
58 #define	TCPOUTFLAGS
59 #include <netinet/tcp_fsm.h>
60 #include <netinet/tcp_seq.h>
61 #include <netinet/tcp_timer.h>
62 #include <netinet/tcp_var.h>
63 #include <netinet/tcpip.h>
64 #ifdef TCPDEBUG
65 #include <netinet/tcp_debug.h>
66 #endif
67 
68 #ifdef notyet
69 extern struct mbuf *m_copypack();
70 #endif
71 
72 static int path_mtu_discovery = 1;
73 SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
74 	&path_mtu_discovery, 1, "Enable Path MTU Discovery");
75 
76 int ss_fltsz = 1;
77 SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW,
78 	&ss_fltsz, 1, "Slow start flight size");
79 
80 int ss_fltsz_local = TCP_MAXWIN;               /* something large */
81 SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW,
82 	&ss_fltsz_local, 1, "Slow start flight size for local networks");
83 
84 /*
85  * Tcp output routine: figure out what should be sent and send it.
86  */
87 int
88 tcp_output(tp)
89 	register struct tcpcb *tp;
90 {
91 	register struct socket *so = tp->t_inpcb->inp_socket;
92 	register long len, win;
93 	int off, flags, error;
94 	register struct mbuf *m;
95 	register struct tcpiphdr *ti;
96 	u_char opt[TCP_MAXOLEN];
97 	unsigned ipoptlen, optlen, hdrlen;
98 	int idle, sendalot;
99 	struct rmxp_tao *taop;
100 	struct rmxp_tao tao_noncached;
101 
102 	/*
103 	 * Determine length of data that should be transmitted,
104 	 * and flags that will be used.
105 	 * If there is some data or critical controls (SYN, RST)
106 	 * to send, then transmit; otherwise, investigate further.
107 	 */
108 	idle = (tp->snd_max == tp->snd_una);
109 	if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
110 		/*
111 		 * We have been idle for "a while" and no acks are
112 		 * expected to clock out any data we send --
113 		 * slow start to get ack "clock" running again.
114 		 *
115 		 * Set the slow-start flight size depending on whether
116 		 * this is a local network or not.
117 		 */
118 		if (in_localaddr(tp->t_inpcb->inp_faddr))
119 			tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local;
120 		else
121 			tp->snd_cwnd = tp->t_maxseg * ss_fltsz;
122 	}
123 again:
124 	sendalot = 0;
125 	off = tp->snd_nxt - tp->snd_una;
126 	win = min(tp->snd_wnd, tp->snd_cwnd);
127 
128 	flags = tcp_outflags[tp->t_state];
129 	/*
130 	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
131 	 * state flags.
132 	 */
133 	if (tp->t_flags & TF_NEEDFIN)
134 		flags |= TH_FIN;
135 	if (tp->t_flags & TF_NEEDSYN)
136 		flags |= TH_SYN;
137 
138 	/*
139 	 * If in persist timeout with window of 0, send 1 byte.
140 	 * Otherwise, if window is small but nonzero
141 	 * and timer expired, we will send what we can
142 	 * and go to transmit state.
143 	 */
144 	if (tp->t_force) {
145 		if (win == 0) {
146 			/*
147 			 * If we still have some data to send, then
148 			 * clear the FIN bit.  Usually this would
149 			 * happen below when it realizes that we
150 			 * aren't sending all the data.  However,
151 			 * if we have exactly 1 byte of unsent data,
152 			 * then it won't clear the FIN bit below,
153 			 * and if we are in persist state, we wind
154 			 * up sending the packet without recording
155 			 * that we sent the FIN bit.
156 			 *
157 			 * We can't just blindly clear the FIN bit,
158 			 * because if we don't have any more data
159 			 * to send then the probe will be the FIN
160 			 * itself.
161 			 */
162 			if (off < so->so_snd.sb_cc)
163 				flags &= ~TH_FIN;
164 			win = 1;
165 		} else {
166 			callout_stop(tp->tt_persist);
167 			tp->t_rxtshift = 0;
168 		}
169 	}
170 
171 	len = (long)ulmin(so->so_snd.sb_cc, win) - off;
172 
173 	if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
174 		taop = &tao_noncached;
175 		bzero(taop, sizeof(*taop));
176 	}
177 
178 	/*
179 	 * Lop off SYN bit if it has already been sent.  However, if this
180 	 * is SYN-SENT state and if segment contains data and if we don't
181 	 * know that foreign host supports TAO, suppress sending segment.
182 	 */
183 	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
184 		flags &= ~TH_SYN;
185 		off--, len++;
186 		if (len > 0 && tp->t_state == TCPS_SYN_SENT &&
187 		    taop->tao_ccsent == 0)
188 			return 0;
189 	}
190 
191 	/*
192 	 * Be careful not to send data and/or FIN on SYN segments
193 	 * in cases when no CC option will be sent.
194 	 * This measure is needed to prevent interoperability problems
195 	 * with not fully conformant TCP implementations.
196 	 */
197 	if ((flags & TH_SYN) &&
198 	    ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) ||
199 	     ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) {
200 		len = 0;
201 		flags &= ~TH_FIN;
202 	}
203 
204 	if (len < 0) {
205 		/*
206 		 * If FIN has been sent but not acked,
207 		 * but we haven't been called to retransmit,
208 		 * len will be -1.  Otherwise, window shrank
209 		 * after we sent into it.  If window shrank to 0,
210 		 * cancel pending retransmit, pull snd_nxt back
211 		 * to (closed) window, and set the persist timer
212 		 * if it isn't already going.  If the window didn't
213 		 * close completely, just wait for an ACK.
214 		 */
215 		len = 0;
216 		if (win == 0) {
217 			callout_stop(tp->tt_rexmt);
218 			tp->t_rxtshift = 0;
219 			tp->snd_nxt = tp->snd_una;
220 			if (!callout_active(tp->tt_persist))
221 				tcp_setpersist(tp);
222 		}
223 	}
224 	if (len > tp->t_maxseg) {
225 		len = tp->t_maxseg;
226 		sendalot = 1;
227 	}
228 	if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
229 		flags &= ~TH_FIN;
230 
231 	win = sbspace(&so->so_rcv);
232 
233 	/*
234 	 * Sender silly window avoidance.  If connection is idle
235 	 * and can send all data, a maximum segment,
236 	 * at least a maximum default-size segment do it,
237 	 * or are forced, do it; otherwise don't bother.
238 	 * If peer's buffer is tiny, then send
239 	 * when window is at least half open.
240 	 * If retransmitting (possibly after persist timer forced us
241 	 * to send into a small window), then must resend.
242 	 */
243 	if (len) {
244 		if (len == tp->t_maxseg)
245 			goto send;
246 		if (!(tp->t_flags & TF_MORETOCOME) &&
247 		    (idle || tp->t_flags & TF_NODELAY) &&
248 		    (tp->t_flags & TF_NOPUSH) == 0 &&
249 		    len + off >= so->so_snd.sb_cc)
250 			goto send;
251 		if (tp->t_force)
252 			goto send;
253 		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
254 			goto send;
255 		if (SEQ_LT(tp->snd_nxt, tp->snd_max))
256 			goto send;
257 	}
258 
259 	/*
260 	 * Compare available window to amount of window
261 	 * known to peer (as advertised window less
262 	 * next expected input).  If the difference is at least two
263 	 * max size segments, or at least 50% of the maximum possible
264 	 * window, then want to send a window update to peer.
265 	 */
266 	if (win > 0) {
267 		/*
268 		 * "adv" is the amount we can increase the window,
269 		 * taking into account that we are limited by
270 		 * TCP_MAXWIN << tp->rcv_scale.
271 		 */
272 		long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) -
273 			(tp->rcv_adv - tp->rcv_nxt);
274 
275 		if (adv >= (long) (2 * tp->t_maxseg))
276 			goto send;
277 		if (2 * adv >= (long) so->so_rcv.sb_hiwat)
278 			goto send;
279 	}
280 
281 	/*
282 	 * Send if we owe peer an ACK.
283 	 */
284 	if (tp->t_flags & TF_ACKNOW)
285 		goto send;
286 	if ((flags & TH_RST) ||
287 	    ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
288 		goto send;
289 	if (SEQ_GT(tp->snd_up, tp->snd_una))
290 		goto send;
291 	/*
292 	 * If our state indicates that FIN should be sent
293 	 * and we have not yet done so, or we're retransmitting the FIN,
294 	 * then we need to send.
295 	 */
296 	if (flags & TH_FIN &&
297 	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
298 		goto send;
299 
300 	/*
301 	 * TCP window updates are not reliable, rather a polling protocol
302 	 * using ``persist'' packets is used to insure receipt of window
303 	 * updates.  The three ``states'' for the output side are:
304 	 *	idle			not doing retransmits or persists
305 	 *	persisting		to move a small or zero window
306 	 *	(re)transmitting	and thereby not persisting
307 	 *
308 	 * callout_active(tp->tt_persist)
309 	 *	is true when we are in persist state.
310 	 * tp->t_force
311 	 *	is set when we are called to send a persist packet.
312 	 * callout_active(tp->tt_rexmt)
313 	 *	is set when we are retransmitting
314 	 * The output side is idle when both timers are zero.
315 	 *
316 	 * If send window is too small, there is data to transmit, and no
317 	 * retransmit or persist is pending, then go to persist state.
318 	 * If nothing happens soon, send when timer expires:
319 	 * if window is nonzero, transmit what we can,
320 	 * otherwise force out a byte.
321 	 */
322 	if (so->so_snd.sb_cc && !callout_active(tp->tt_rexmt) &&
323 	    !callout_active(tp->tt_persist)) {
324 		tp->t_rxtshift = 0;
325 		tcp_setpersist(tp);
326 	}
327 
328 	/*
329 	 * No reason to send a segment, just return.
330 	 */
331 	return (0);
332 
333 send:
334 	/*
335 	 * Before ESTABLISHED, force sending of initial options
336 	 * unless TCP set not to do any options.
337 	 * NOTE: we assume that the IP/TCP header plus TCP options
338 	 * always fit in a single mbuf, leaving room for a maximum
339 	 * link header, i.e.
340 	 *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN
341 	 */
342 	optlen = 0;
343 	hdrlen = sizeof (struct tcpiphdr);
344 	if (flags & TH_SYN) {
345 		tp->snd_nxt = tp->iss;
346 		if ((tp->t_flags & TF_NOOPT) == 0) {
347 			u_short mss;
348 
349 			opt[0] = TCPOPT_MAXSEG;
350 			opt[1] = TCPOLEN_MAXSEG;
351 			mss = htons((u_short) tcp_mssopt(tp));
352 			(void)memcpy(opt + 2, &mss, sizeof(mss));
353 			optlen = TCPOLEN_MAXSEG;
354 
355 			if ((tp->t_flags & TF_REQ_SCALE) &&
356 			    ((flags & TH_ACK) == 0 ||
357 			    (tp->t_flags & TF_RCVD_SCALE))) {
358 				*((u_int32_t *)(opt + optlen)) = htonl(
359 					TCPOPT_NOP << 24 |
360 					TCPOPT_WINDOW << 16 |
361 					TCPOLEN_WINDOW << 8 |
362 					tp->request_r_scale);
363 				optlen += 4;
364 			}
365 		}
366  	}
367 
368  	/*
369 	 * Send a timestamp and echo-reply if this is a SYN and our side
370 	 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
371 	 * and our peer have sent timestamps in our SYN's.
372  	 */
373  	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
374  	    (flags & TH_RST) == 0 &&
375 	    ((flags & TH_ACK) == 0 ||
376 	     (tp->t_flags & TF_RCVD_TSTMP))) {
377 		u_int32_t *lp = (u_int32_t *)(opt + optlen);
378 
379  		/* Form timestamp option as shown in appendix A of RFC 1323. */
380  		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
381  		*lp++ = htonl(ticks);
382  		*lp   = htonl(tp->ts_recent);
383  		optlen += TCPOLEN_TSTAMP_APPA;
384  	}
385 
386  	/*
387 	 * Send `CC-family' options if our side wants to use them (TF_REQ_CC),
388 	 * options are allowed (!TF_NOOPT) and it's not a RST.
389  	 */
390  	if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
391  	     (flags & TH_RST) == 0) {
392 		switch (flags & (TH_SYN|TH_ACK)) {
393 		/*
394 		 * This is a normal ACK, send CC if we received CC before
395 		 * from our peer.
396 		 */
397 		case TH_ACK:
398 			if (!(tp->t_flags & TF_RCVD_CC))
399 				break;
400 			/*FALLTHROUGH*/
401 
402 		/*
403 		 * We can only get here in T/TCP's SYN_SENT* state, when
404 		 * we're a sending a non-SYN segment without waiting for
405 		 * the ACK of our SYN.  A check above assures that we only
406 		 * do this if our peer understands T/TCP.
407 		 */
408 		case 0:
409 			opt[optlen++] = TCPOPT_NOP;
410 			opt[optlen++] = TCPOPT_NOP;
411 			opt[optlen++] = TCPOPT_CC;
412 			opt[optlen++] = TCPOLEN_CC;
413 			*(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);
414 
415 			optlen += 4;
416 			break;
417 
418 		/*
419 		 * This is our initial SYN, check whether we have to use
420 		 * CC or CC.new.
421 		 */
422 		case TH_SYN:
423 			opt[optlen++] = TCPOPT_NOP;
424 			opt[optlen++] = TCPOPT_NOP;
425 			opt[optlen++] = tp->t_flags & TF_SENDCCNEW ?
426 						TCPOPT_CCNEW : TCPOPT_CC;
427 			opt[optlen++] = TCPOLEN_CC;
428 			*(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);
429  			optlen += 4;
430 			break;
431 
432 		/*
433 		 * This is a SYN,ACK; send CC and CC.echo if we received
434 		 * CC from our peer.
435 		 */
436 		case (TH_SYN|TH_ACK):
437 			if (tp->t_flags & TF_RCVD_CC) {
438 				opt[optlen++] = TCPOPT_NOP;
439 				opt[optlen++] = TCPOPT_NOP;
440 				opt[optlen++] = TCPOPT_CC;
441 				opt[optlen++] = TCPOLEN_CC;
442 				*(u_int32_t *)&opt[optlen] =
443 					htonl(tp->cc_send);
444 				optlen += 4;
445 				opt[optlen++] = TCPOPT_NOP;
446 				opt[optlen++] = TCPOPT_NOP;
447 				opt[optlen++] = TCPOPT_CCECHO;
448 				opt[optlen++] = TCPOLEN_CC;
449 				*(u_int32_t *)&opt[optlen] =
450 					htonl(tp->cc_recv);
451 				optlen += 4;
452 			}
453 			break;
454 		}
455  	}
456 
457  	hdrlen += optlen;
458 
459 	if (tp->t_inpcb->inp_options) {
460 		ipoptlen = tp->t_inpcb->inp_options->m_len -
461 				offsetof(struct ipoption, ipopt_list);
462 	} else {
463 		ipoptlen = 0;
464 	}
465 
466 	/*
467 	 * Adjust data length if insertion of options will
468 	 * bump the packet length beyond the t_maxopd length.
469 	 * Clear the FIN bit because we cut off the tail of
470 	 * the segment.
471 	 */
472 	if (len + optlen + ipoptlen > tp->t_maxopd) {
473 		/*
474 		 * If there is still more to send, don't close the connection.
475 		 */
476 		flags &= ~TH_FIN;
477 		len = tp->t_maxopd - optlen - ipoptlen;
478 		sendalot = 1;
479 	}
480 
481 /*#ifdef DIAGNOSTIC*/
482  	if (max_linkhdr + hdrlen > MHLEN)
483 		panic("tcphdr too big");
484 /*#endif*/
485 
486 	/*
487 	 * Grab a header mbuf, attaching a copy of data to
488 	 * be transmitted, and initialize the header from
489 	 * the template for sends on this connection.
490 	 */
491 	if (len) {
492 		if (tp->t_force && len == 1)
493 			tcpstat.tcps_sndprobe++;
494 		else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
495 			tcpstat.tcps_sndrexmitpack++;
496 			tcpstat.tcps_sndrexmitbyte += len;
497 		} else {
498 			tcpstat.tcps_sndpack++;
499 			tcpstat.tcps_sndbyte += len;
500 		}
501 #ifdef notyet
502 		if ((m = m_copypack(so->so_snd.sb_mb, off,
503 		    (int)len, max_linkhdr + hdrlen)) == 0) {
504 			error = ENOBUFS;
505 			goto out;
506 		}
507 		/*
508 		 * m_copypack left space for our hdr; use it.
509 		 */
510 		m->m_len += hdrlen;
511 		m->m_data -= hdrlen;
512 #else
513 		MGETHDR(m, M_DONTWAIT, MT_HEADER);
514 		if (m == NULL) {
515 			error = ENOBUFS;
516 			goto out;
517 		}
518 		m->m_data += max_linkhdr;
519 		m->m_len = hdrlen;
520 		if (len <= MHLEN - hdrlen - max_linkhdr) {
521 			m_copydata(so->so_snd.sb_mb, off, (int) len,
522 			    mtod(m, caddr_t) + hdrlen);
523 			m->m_len += len;
524 		} else {
525 			m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
526 			if (m->m_next == 0) {
527 				(void) m_free(m);
528 				error = ENOBUFS;
529 				goto out;
530 			}
531 		}
532 #endif
533 		/*
534 		 * If we're sending everything we've got, set PUSH.
535 		 * (This will keep happy those implementations which only
536 		 * give data to the user when a buffer fills or
537 		 * a PUSH comes in.)
538 		 */
539 		if (off + len == so->so_snd.sb_cc)
540 			flags |= TH_PUSH;
541 	} else {
542 		if (tp->t_flags & TF_ACKNOW)
543 			tcpstat.tcps_sndacks++;
544 		else if (flags & (TH_SYN|TH_FIN|TH_RST))
545 			tcpstat.tcps_sndctrl++;
546 		else if (SEQ_GT(tp->snd_up, tp->snd_una))
547 			tcpstat.tcps_sndurg++;
548 		else
549 			tcpstat.tcps_sndwinup++;
550 
551 		MGETHDR(m, M_DONTWAIT, MT_HEADER);
552 		if (m == NULL) {
553 			error = ENOBUFS;
554 			goto out;
555 		}
556 		m->m_data += max_linkhdr;
557 		m->m_len = hdrlen;
558 	}
559 	m->m_pkthdr.rcvif = (struct ifnet *)0;
560 	ti = mtod(m, struct tcpiphdr *);
561 	if (tp->t_template == 0)
562 		panic("tcp_output");
563 	(void)memcpy(ti, tp->t_template, sizeof (struct tcpiphdr));
564 
565 	/*
566 	 * Fill in fields, remembering maximum advertised
567 	 * window for use in delaying messages about window sizes.
568 	 * If resending a FIN, be sure not to use a new sequence number.
569 	 */
570 	if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
571 	    tp->snd_nxt == tp->snd_max)
572 		tp->snd_nxt--;
573 	/*
574 	 * If we are doing retransmissions, then snd_nxt will
575 	 * not reflect the first unsent octet.  For ACK only
576 	 * packets, we do not want the sequence number of the
577 	 * retransmitted packet, we want the sequence number
578 	 * of the next unsent octet.  So, if there is no data
579 	 * (and no SYN or FIN), use snd_max instead of snd_nxt
580 	 * when filling in ti_seq.  But if we are in persist
581 	 * state, snd_max might reflect one byte beyond the
582 	 * right edge of the window, so use snd_nxt in that
583 	 * case, since we know we aren't doing a retransmission.
584 	 * (retransmit and persist are mutually exclusive...)
585 	 */
586 	if (len || (flags & (TH_SYN|TH_FIN))
587 	    || callout_active(tp->tt_persist))
588 		ti->ti_seq = htonl(tp->snd_nxt);
589 	else
590 		ti->ti_seq = htonl(tp->snd_max);
591 	ti->ti_ack = htonl(tp->rcv_nxt);
592 	if (optlen) {
593 		bcopy(opt, ti + 1, optlen);
594 		ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2;
595 	}
596 	ti->ti_flags = flags;
597 	/*
598 	 * Calculate receive window.  Don't shrink window,
599 	 * but avoid silly window syndrome.
600 	 */
601 	if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
602 		win = 0;
603 	if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
604 		win = (long)(tp->rcv_adv - tp->rcv_nxt);
605 	if (win > (long)TCP_MAXWIN << tp->rcv_scale)
606 		win = (long)TCP_MAXWIN << tp->rcv_scale;
607 	ti->ti_win = htons((u_short) (win>>tp->rcv_scale));
608 	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
609 		ti->ti_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
610 		ti->ti_flags |= TH_URG;
611 	} else
612 		/*
613 		 * If no urgent pointer to send, then we pull
614 		 * the urgent pointer to the left edge of the send window
615 		 * so that it doesn't drift into the send window on sequence
616 		 * number wraparound.
617 		 */
618 		tp->snd_up = tp->snd_una;		/* drag it along */
619 
620 	/*
621 	 * Put TCP length in extended header, and then
622 	 * checksum extended header and data.
623 	 */
624 	if (len + optlen)
625 		ti->ti_len = htons((u_short)(sizeof (struct tcphdr) +
626 		    optlen + len));
627 	ti->ti_sum = in_cksum(m, (int)(hdrlen + len));
628 
629 	/*
630 	 * In transmit state, time the transmission and arrange for
631 	 * the retransmit.  In persist state, just set snd_max.
632 	 */
633 	if (tp->t_force == 0 || !callout_active(tp->tt_persist)) {
634 		tcp_seq startseq = tp->snd_nxt;
635 
636 		/*
637 		 * Advance snd_nxt over sequence space of this segment.
638 		 */
639 		if (flags & (TH_SYN|TH_FIN)) {
640 			if (flags & TH_SYN)
641 				tp->snd_nxt++;
642 			if (flags & TH_FIN) {
643 				tp->snd_nxt++;
644 				tp->t_flags |= TF_SENTFIN;
645 			}
646 		}
647 		tp->snd_nxt += len;
648 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
649 			tp->snd_max = tp->snd_nxt;
650 			/*
651 			 * Time this transmission if not a retransmission and
652 			 * not currently timing anything.
653 			 */
654 			if (tp->t_rtttime == 0) {
655 				tp->t_rtttime = ticks;
656 				tp->t_rtseq = startseq;
657 				tcpstat.tcps_segstimed++;
658 			}
659 		}
660 
661 		/*
662 		 * Set retransmit timer if not currently set,
663 		 * and not doing an ack or a keep-alive probe.
664 		 * Initial value for retransmit timer is smoothed
665 		 * round-trip time + 2 * round-trip time variance.
666 		 * Initialize shift counter which is used for backoff
667 		 * of retransmit time.
668 		 */
669 		if (!callout_active(tp->tt_rexmt) &&
670 		    tp->snd_nxt != tp->snd_una) {
671 			callout_reset(tp->tt_rexmt, tp->t_rxtcur,
672 				      tcp_timer_rexmt, tp);
673 			if (callout_active(tp->tt_persist)) {
674 				callout_stop(tp->tt_persist);
675 				tp->t_rxtshift = 0;
676 			}
677 		}
678 	} else
679 		if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
680 			tp->snd_max = tp->snd_nxt + len;
681 
682 #ifdef TCPDEBUG
683 	/*
684 	 * Trace.
685 	 */
686 	if (so->so_options & SO_DEBUG)
687 		tcp_trace(TA_OUTPUT, tp->t_state, tp, ti, 0);
688 #endif
689 
690 	/*
691 	 * Fill in IP length and desired time to live and
692 	 * send to IP level.  There should be a better way
693 	 * to handle ttl and tos; we could keep them in
694 	 * the template, but need a way to checksum without them.
695 	 */
696 	m->m_pkthdr.len = hdrlen + len;
697     {
698 	struct rtentry *rt;
699 	((struct ip *)ti)->ip_len = m->m_pkthdr.len;
700 	((struct ip *)ti)->ip_ttl = tp->t_inpcb->inp_ip_ttl;	/* XXX */
701 	((struct ip *)ti)->ip_tos = tp->t_inpcb->inp_ip_tos;	/* XXX */
702 	/*
703 	 * See if we should do MTU discovery.  We do it only if the following
704 	 * are true:
705 	 *	1) we have a valid route to the destination
706 	 *	2) the MTU is not locked (if it is, then discovery has been
707 	 *	   disabled)
708 	 */
709 	if (path_mtu_discovery
710 	    && (rt = tp->t_inpcb->inp_route.ro_rt)
711 	    && rt->rt_flags & RTF_UP
712 	    && !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
713 		((struct ip *)ti)->ip_off |= IP_DF;
714 	}
715 	error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
716 	    so->so_options & SO_DONTROUTE, 0);
717     }
718 	if (error) {
719 out:
720 		if (error == ENOBUFS) {
721 			tcp_quench(tp->t_inpcb, 0);
722 			return (0);
723 		}
724 		if (error == EMSGSIZE) {
725 			/*
726 			 * ip_output() will have already fixed the route
727 			 * for us.  tcp_mtudisc() will, as its last action,
728 			 * initiate retransmission, so it is important to
729 			 * not do so here.
730 			 */
731 			tcp_mtudisc(tp->t_inpcb, 0);
732 			return 0;
733 		}
734 		if ((error == EHOSTUNREACH || error == ENETDOWN)
735 		    && TCPS_HAVERCVDSYN(tp->t_state)) {
736 			tp->t_softerror = error;
737 			return (0);
738 		}
739 		return (error);
740 	}
741 	tcpstat.tcps_sndtotal++;
742 
743 	/*
744 	 * Data sent (as far as we can tell).
745 	 * If this advertises a larger window than any other segment,
746 	 * then remember the size of the advertised window.
747 	 * Any pending ACK has now been sent.
748 	 */
749 	if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
750 		tp->rcv_adv = tp->rcv_nxt + win;
751 	tp->last_ack_sent = tp->rcv_nxt;
752 	tp->t_flags &= ~TF_ACKNOW;
753 	if (tcp_delack_enabled)
754 		callout_stop(tp->tt_delack);
755 	if (sendalot)
756 		goto again;
757 	return (0);
758 }
759 
760 void
761 tcp_setpersist(tp)
762 	register struct tcpcb *tp;
763 {
764 	int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
765 	int tt;
766 
767 	if (callout_active(tp->tt_rexmt))
768 		panic("tcp_setpersist: retransmit pending");
769 	/*
770 	 * Start/restart persistance timer.
771 	 */
772 	TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
773 		      TCPTV_PERSMIN, TCPTV_PERSMAX);
774 	callout_reset(tp->tt_persist, tt, tcp_timer_persist, tp);
775 	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
776 		tp->t_rxtshift++;
777 }
778