xref: /freebsd/sys/netinet/tcp_output.c (revision 6e8394b8baa7d5d9153ab90de6824bcd19b3b4e1)
1 /*
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)tcp_output.c	8.4 (Berkeley) 5/24/95
34  *	$Id: tcp_output.c,v 1.33 1999/04/07 22:22:06 julian Exp $
35  */
36 
37 #include "opt_tcpdebug.h"
38 
39 #include <stddef.h>
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/kernel.h>
44 #include <sys/sysctl.h>
45 #include <sys/mbuf.h>
46 #include <sys/protosw.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 
50 #include <net/route.h>
51 
52 #include <netinet/in.h>
53 #include <netinet/in_systm.h>
54 #include <netinet/ip.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/ip_var.h>
57 #include <netinet/tcp.h>
58 #define	TCPOUTFLAGS
59 #include <netinet/tcp_fsm.h>
60 #include <netinet/tcp_seq.h>
61 #include <netinet/tcp_timer.h>
62 #include <netinet/tcp_var.h>
63 #include <netinet/tcpip.h>
64 #ifdef TCPDEBUG
65 #include <netinet/tcp_debug.h>
66 #endif
67 
68 #ifdef notyet
69 extern struct mbuf *m_copypack();
70 #endif
71 
72 static int path_mtu_discovery = 1;
73 SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
74 	&path_mtu_discovery, 1, "Enable Path MTU Discovery");
75 
76 
77 /*
78  * Tcp output routine: figure out what should be sent and send it.
79  */
80 int
81 tcp_output(tp)
82 	register struct tcpcb *tp;
83 {
84 	register struct socket *so = tp->t_inpcb->inp_socket;
85 	register long len, win;
86 	int off, flags, error;
87 	register struct mbuf *m;
88 	register struct tcpiphdr *ti;
89 	u_char opt[TCP_MAXOLEN];
90 	unsigned ipoptlen, optlen, hdrlen;
91 	int idle, sendalot;
92 	struct rmxp_tao *taop;
93 	struct rmxp_tao tao_noncached;
94 
95 	/*
96 	 * Determine length of data that should be transmitted,
97 	 * and flags that will be used.
98 	 * If there is some data or critical controls (SYN, RST)
99 	 * to send, then transmit; otherwise, investigate further.
100 	 */
101 	idle = (tp->snd_max == tp->snd_una);
102 	if (idle && tp->t_idle >= tp->t_rxtcur)
103 		/*
104 		 * We have been idle for "a while" and no acks are
105 		 * expected to clock out any data we send --
106 		 * slow start to get ack "clock" running again.
107 		 */
108 		tp->snd_cwnd = tp->t_maxseg;
109 again:
110 	sendalot = 0;
111 	off = tp->snd_nxt - tp->snd_una;
112 	win = min(tp->snd_wnd, tp->snd_cwnd);
113 
114 	flags = tcp_outflags[tp->t_state];
115 	/*
116 	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
117 	 * state flags.
118 	 */
119 	if (tp->t_flags & TF_NEEDFIN)
120 		flags |= TH_FIN;
121 	if (tp->t_flags & TF_NEEDSYN)
122 		flags |= TH_SYN;
123 
124 	/*
125 	 * If in persist timeout with window of 0, send 1 byte.
126 	 * Otherwise, if window is small but nonzero
127 	 * and timer expired, we will send what we can
128 	 * and go to transmit state.
129 	 */
130 	if (tp->t_force) {
131 		if (win == 0) {
132 			/*
133 			 * If we still have some data to send, then
134 			 * clear the FIN bit.  Usually this would
135 			 * happen below when it realizes that we
136 			 * aren't sending all the data.  However,
137 			 * if we have exactly 1 byte of unsent data,
138 			 * then it won't clear the FIN bit below,
139 			 * and if we are in persist state, we wind
140 			 * up sending the packet without recording
141 			 * that we sent the FIN bit.
142 			 *
143 			 * We can't just blindly clear the FIN bit,
144 			 * because if we don't have any more data
145 			 * to send then the probe will be the FIN
146 			 * itself.
147 			 */
148 			if (off < so->so_snd.sb_cc)
149 				flags &= ~TH_FIN;
150 			win = 1;
151 		} else {
152 			tp->t_timer[TCPT_PERSIST] = 0;
153 			tp->t_rxtshift = 0;
154 		}
155 	}
156 
157 	len = (long)ulmin(so->so_snd.sb_cc, win) - off;
158 
159 	if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
160 		taop = &tao_noncached;
161 		bzero(taop, sizeof(*taop));
162 	}
163 
164 	/*
165 	 * Lop off SYN bit if it has already been sent.  However, if this
166 	 * is SYN-SENT state and if segment contains data and if we don't
167 	 * know that foreign host supports TAO, suppress sending segment.
168 	 */
169 	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
170 		flags &= ~TH_SYN;
171 		off--, len++;
172 		if (len > 0 && tp->t_state == TCPS_SYN_SENT &&
173 		    taop->tao_ccsent == 0)
174 			return 0;
175 	}
176 
177 	/*
178 	 * Be careful not to send data and/or FIN on SYN segments
179 	 * in cases when no CC option will be sent.
180 	 * This measure is needed to prevent interoperability problems
181 	 * with not fully conformant TCP implementations.
182 	 */
183 	if ((flags & TH_SYN) &&
184 	    ((tp->t_flags & TF_NOOPT) || !(tp->t_flags & TF_REQ_CC) ||
185 	     ((flags & TH_ACK) && !(tp->t_flags & TF_RCVD_CC)))) {
186 		len = 0;
187 		flags &= ~TH_FIN;
188 	}
189 
190 	if (len < 0) {
191 		/*
192 		 * If FIN has been sent but not acked,
193 		 * but we haven't been called to retransmit,
194 		 * len will be -1.  Otherwise, window shrank
195 		 * after we sent into it.  If window shrank to 0,
196 		 * cancel pending retransmit, pull snd_nxt back
197 		 * to (closed) window, and set the persist timer
198 		 * if it isn't already going.  If the window didn't
199 		 * close completely, just wait for an ACK.
200 		 */
201 		len = 0;
202 		if (win == 0) {
203 			tp->t_timer[TCPT_REXMT] = 0;
204 			tp->t_rxtshift = 0;
205 			tp->snd_nxt = tp->snd_una;
206 			if (tp->t_timer[TCPT_PERSIST] == 0)
207 				tcp_setpersist(tp);
208 		}
209 	}
210 	if (len > tp->t_maxseg) {
211 		len = tp->t_maxseg;
212 		sendalot = 1;
213 	}
214 	if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
215 		flags &= ~TH_FIN;
216 
217 	win = sbspace(&so->so_rcv);
218 
219 	/*
220 	 * Sender silly window avoidance.  If connection is idle
221 	 * and can send all data, a maximum segment,
222 	 * at least a maximum default-size segment do it,
223 	 * or are forced, do it; otherwise don't bother.
224 	 * If peer's buffer is tiny, then send
225 	 * when window is at least half open.
226 	 * If retransmitting (possibly after persist timer forced us
227 	 * to send into a small window), then must resend.
228 	 */
229 	if (len) {
230 		if (len == tp->t_maxseg)
231 			goto send;
232 		if (!(tp->t_flags & TF_MORETOCOME) &&
233 		    (idle || tp->t_flags & TF_NODELAY) &&
234 		    (tp->t_flags & TF_NOPUSH) == 0 &&
235 		    len + off >= so->so_snd.sb_cc)
236 			goto send;
237 		if (tp->t_force)
238 			goto send;
239 		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
240 			goto send;
241 		if (SEQ_LT(tp->snd_nxt, tp->snd_max))
242 			goto send;
243 	}
244 
245 	/*
246 	 * Compare available window to amount of window
247 	 * known to peer (as advertised window less
248 	 * next expected input).  If the difference is at least two
249 	 * max size segments, or at least 50% of the maximum possible
250 	 * window, then want to send a window update to peer.
251 	 */
252 	if (win > 0) {
253 		/*
254 		 * "adv" is the amount we can increase the window,
255 		 * taking into account that we are limited by
256 		 * TCP_MAXWIN << tp->rcv_scale.
257 		 */
258 		long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) -
259 			(tp->rcv_adv - tp->rcv_nxt);
260 
261 		if (adv >= (long) (2 * tp->t_maxseg))
262 			goto send;
263 		if (2 * adv >= (long) so->so_rcv.sb_hiwat)
264 			goto send;
265 	}
266 
267 	/*
268 	 * Send if we owe peer an ACK.
269 	 */
270 	if (tp->t_flags & TF_ACKNOW)
271 		goto send;
272 	if ((flags & TH_RST) ||
273 	    ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0))
274 		goto send;
275 	if (SEQ_GT(tp->snd_up, tp->snd_una))
276 		goto send;
277 	/*
278 	 * If our state indicates that FIN should be sent
279 	 * and we have not yet done so, or we're retransmitting the FIN,
280 	 * then we need to send.
281 	 */
282 	if (flags & TH_FIN &&
283 	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
284 		goto send;
285 
286 	/*
287 	 * TCP window updates are not reliable, rather a polling protocol
288 	 * using ``persist'' packets is used to insure receipt of window
289 	 * updates.  The three ``states'' for the output side are:
290 	 *	idle			not doing retransmits or persists
291 	 *	persisting		to move a small or zero window
292 	 *	(re)transmitting	and thereby not persisting
293 	 *
294 	 * tp->t_timer[TCPT_PERSIST]
295 	 *	is set when we are in persist state.
296 	 * tp->t_force
297 	 *	is set when we are called to send a persist packet.
298 	 * tp->t_timer[TCPT_REXMT]
299 	 *	is set when we are retransmitting
300 	 * The output side is idle when both timers are zero.
301 	 *
302 	 * If send window is too small, there is data to transmit, and no
303 	 * retransmit or persist is pending, then go to persist state.
304 	 * If nothing happens soon, send when timer expires:
305 	 * if window is nonzero, transmit what we can,
306 	 * otherwise force out a byte.
307 	 */
308 	if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
309 	    tp->t_timer[TCPT_PERSIST] == 0) {
310 		tp->t_rxtshift = 0;
311 		tcp_setpersist(tp);
312 	}
313 
314 	/*
315 	 * No reason to send a segment, just return.
316 	 */
317 	return (0);
318 
319 send:
320 	/*
321 	 * Before ESTABLISHED, force sending of initial options
322 	 * unless TCP set not to do any options.
323 	 * NOTE: we assume that the IP/TCP header plus TCP options
324 	 * always fit in a single mbuf, leaving room for a maximum
325 	 * link header, i.e.
326 	 *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN
327 	 */
328 	optlen = 0;
329 	hdrlen = sizeof (struct tcpiphdr);
330 	if (flags & TH_SYN) {
331 		tp->snd_nxt = tp->iss;
332 		if ((tp->t_flags & TF_NOOPT) == 0) {
333 			u_short mss;
334 
335 			opt[0] = TCPOPT_MAXSEG;
336 			opt[1] = TCPOLEN_MAXSEG;
337 			mss = htons((u_short) tcp_mssopt(tp));
338 			(void)memcpy(opt + 2, &mss, sizeof(mss));
339 			optlen = TCPOLEN_MAXSEG;
340 
341 			if ((tp->t_flags & TF_REQ_SCALE) &&
342 			    ((flags & TH_ACK) == 0 ||
343 			    (tp->t_flags & TF_RCVD_SCALE))) {
344 				*((u_int32_t *)(opt + optlen)) = htonl(
345 					TCPOPT_NOP << 24 |
346 					TCPOPT_WINDOW << 16 |
347 					TCPOLEN_WINDOW << 8 |
348 					tp->request_r_scale);
349 				optlen += 4;
350 			}
351 		}
352  	}
353 
354  	/*
355 	 * Send a timestamp and echo-reply if this is a SYN and our side
356 	 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
357 	 * and our peer have sent timestamps in our SYN's.
358  	 */
359  	if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
360  	    (flags & TH_RST) == 0 &&
361 	    ((flags & TH_ACK) == 0 ||
362 	     (tp->t_flags & TF_RCVD_TSTMP))) {
363 		u_int32_t *lp = (u_int32_t *)(opt + optlen);
364 
365  		/* Form timestamp option as shown in appendix A of RFC 1323. */
366  		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
367  		*lp++ = htonl(tcp_now);
368  		*lp   = htonl(tp->ts_recent);
369  		optlen += TCPOLEN_TSTAMP_APPA;
370  	}
371 
372  	/*
373 	 * Send `CC-family' options if our side wants to use them (TF_REQ_CC),
374 	 * options are allowed (!TF_NOOPT) and it's not a RST.
375  	 */
376  	if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
377  	     (flags & TH_RST) == 0) {
378 		switch (flags & (TH_SYN|TH_ACK)) {
379 		/*
380 		 * This is a normal ACK, send CC if we received CC before
381 		 * from our peer.
382 		 */
383 		case TH_ACK:
384 			if (!(tp->t_flags & TF_RCVD_CC))
385 				break;
386 			/*FALLTHROUGH*/
387 
388 		/*
389 		 * We can only get here in T/TCP's SYN_SENT* state, when
390 		 * we're a sending a non-SYN segment without waiting for
391 		 * the ACK of our SYN.  A check above assures that we only
392 		 * do this if our peer understands T/TCP.
393 		 */
394 		case 0:
395 			opt[optlen++] = TCPOPT_NOP;
396 			opt[optlen++] = TCPOPT_NOP;
397 			opt[optlen++] = TCPOPT_CC;
398 			opt[optlen++] = TCPOLEN_CC;
399 			*(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);
400 
401 			optlen += 4;
402 			break;
403 
404 		/*
405 		 * This is our initial SYN, check whether we have to use
406 		 * CC or CC.new.
407 		 */
408 		case TH_SYN:
409 			opt[optlen++] = TCPOPT_NOP;
410 			opt[optlen++] = TCPOPT_NOP;
411 			opt[optlen++] = tp->t_flags & TF_SENDCCNEW ?
412 						TCPOPT_CCNEW : TCPOPT_CC;
413 			opt[optlen++] = TCPOLEN_CC;
414 			*(u_int32_t *)&opt[optlen] = htonl(tp->cc_send);
415  			optlen += 4;
416 			break;
417 
418 		/*
419 		 * This is a SYN,ACK; send CC and CC.echo if we received
420 		 * CC from our peer.
421 		 */
422 		case (TH_SYN|TH_ACK):
423 			if (tp->t_flags & TF_RCVD_CC) {
424 				opt[optlen++] = TCPOPT_NOP;
425 				opt[optlen++] = TCPOPT_NOP;
426 				opt[optlen++] = TCPOPT_CC;
427 				opt[optlen++] = TCPOLEN_CC;
428 				*(u_int32_t *)&opt[optlen] =
429 					htonl(tp->cc_send);
430 				optlen += 4;
431 				opt[optlen++] = TCPOPT_NOP;
432 				opt[optlen++] = TCPOPT_NOP;
433 				opt[optlen++] = TCPOPT_CCECHO;
434 				opt[optlen++] = TCPOLEN_CC;
435 				*(u_int32_t *)&opt[optlen] =
436 					htonl(tp->cc_recv);
437 				optlen += 4;
438 			}
439 			break;
440 		}
441  	}
442 
443  	hdrlen += optlen;
444 
445 	if (tp->t_inpcb->inp_options) {
446 		ipoptlen = tp->t_inpcb->inp_options->m_len -
447 				offsetof(struct ipoption, ipopt_list);
448 	} else {
449 		ipoptlen = 0;
450 	}
451 
452 	/*
453 	 * Adjust data length if insertion of options will
454 	 * bump the packet length beyond the t_maxopd length.
455 	 * Clear the FIN bit because we cut off the tail of
456 	 * the segment.
457 	 */
458 	if (len + optlen + ipoptlen > tp->t_maxopd) {
459 		/*
460 		 * If there is still more to send, don't close the connection.
461 		 */
462 		flags &= ~TH_FIN;
463 		len = tp->t_maxopd - optlen - ipoptlen;
464 		sendalot = 1;
465 	}
466 
467 /*#ifdef DIAGNOSTIC*/
468  	if (max_linkhdr + hdrlen > MHLEN)
469 		panic("tcphdr too big");
470 /*#endif*/
471 
472 	/*
473 	 * Grab a header mbuf, attaching a copy of data to
474 	 * be transmitted, and initialize the header from
475 	 * the template for sends on this connection.
476 	 */
477 	if (len) {
478 		if (tp->t_force && len == 1)
479 			tcpstat.tcps_sndprobe++;
480 		else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
481 			tcpstat.tcps_sndrexmitpack++;
482 			tcpstat.tcps_sndrexmitbyte += len;
483 		} else {
484 			tcpstat.tcps_sndpack++;
485 			tcpstat.tcps_sndbyte += len;
486 		}
487 #ifdef notyet
488 		if ((m = m_copypack(so->so_snd.sb_mb, off,
489 		    (int)len, max_linkhdr + hdrlen)) == 0) {
490 			error = ENOBUFS;
491 			goto out;
492 		}
493 		/*
494 		 * m_copypack left space for our hdr; use it.
495 		 */
496 		m->m_len += hdrlen;
497 		m->m_data -= hdrlen;
498 #else
499 		MGETHDR(m, M_DONTWAIT, MT_HEADER);
500 		if (m == NULL) {
501 			error = ENOBUFS;
502 			goto out;
503 		}
504 		m->m_data += max_linkhdr;
505 		m->m_len = hdrlen;
506 		if (len <= MHLEN - hdrlen - max_linkhdr) {
507 			m_copydata(so->so_snd.sb_mb, off, (int) len,
508 			    mtod(m, caddr_t) + hdrlen);
509 			m->m_len += len;
510 		} else {
511 			m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
512 			if (m->m_next == 0) {
513 				(void) m_free(m);
514 				error = ENOBUFS;
515 				goto out;
516 			}
517 		}
518 #endif
519 		/*
520 		 * If we're sending everything we've got, set PUSH.
521 		 * (This will keep happy those implementations which only
522 		 * give data to the user when a buffer fills or
523 		 * a PUSH comes in.)
524 		 */
525 		if (off + len == so->so_snd.sb_cc)
526 			flags |= TH_PUSH;
527 	} else {
528 		if (tp->t_flags & TF_ACKNOW)
529 			tcpstat.tcps_sndacks++;
530 		else if (flags & (TH_SYN|TH_FIN|TH_RST))
531 			tcpstat.tcps_sndctrl++;
532 		else if (SEQ_GT(tp->snd_up, tp->snd_una))
533 			tcpstat.tcps_sndurg++;
534 		else
535 			tcpstat.tcps_sndwinup++;
536 
537 		MGETHDR(m, M_DONTWAIT, MT_HEADER);
538 		if (m == NULL) {
539 			error = ENOBUFS;
540 			goto out;
541 		}
542 		m->m_data += max_linkhdr;
543 		m->m_len = hdrlen;
544 	}
545 	m->m_pkthdr.rcvif = (struct ifnet *)0;
546 	ti = mtod(m, struct tcpiphdr *);
547 	if (tp->t_template == 0)
548 		panic("tcp_output");
549 	(void)memcpy(ti, tp->t_template, sizeof (struct tcpiphdr));
550 
551 	/*
552 	 * Fill in fields, remembering maximum advertised
553 	 * window for use in delaying messages about window sizes.
554 	 * If resending a FIN, be sure not to use a new sequence number.
555 	 */
556 	if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
557 	    tp->snd_nxt == tp->snd_max)
558 		tp->snd_nxt--;
559 	/*
560 	 * If we are doing retransmissions, then snd_nxt will
561 	 * not reflect the first unsent octet.  For ACK only
562 	 * packets, we do not want the sequence number of the
563 	 * retransmitted packet, we want the sequence number
564 	 * of the next unsent octet.  So, if there is no data
565 	 * (and no SYN or FIN), use snd_max instead of snd_nxt
566 	 * when filling in ti_seq.  But if we are in persist
567 	 * state, snd_max might reflect one byte beyond the
568 	 * right edge of the window, so use snd_nxt in that
569 	 * case, since we know we aren't doing a retransmission.
570 	 * (retransmit and persist are mutually exclusive...)
571 	 */
572 	if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST])
573 		ti->ti_seq = htonl(tp->snd_nxt);
574 	else
575 		ti->ti_seq = htonl(tp->snd_max);
576 	ti->ti_ack = htonl(tp->rcv_nxt);
577 	if (optlen) {
578 		bcopy(opt, ti + 1, optlen);
579 		ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2;
580 	}
581 	ti->ti_flags = flags;
582 	/*
583 	 * Calculate receive window.  Don't shrink window,
584 	 * but avoid silly window syndrome.
585 	 */
586 	if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
587 		win = 0;
588 	if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
589 		win = (long)(tp->rcv_adv - tp->rcv_nxt);
590 	if (win > (long)TCP_MAXWIN << tp->rcv_scale)
591 		win = (long)TCP_MAXWIN << tp->rcv_scale;
592 	ti->ti_win = htons((u_short) (win>>tp->rcv_scale));
593 	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
594 		ti->ti_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
595 		ti->ti_flags |= TH_URG;
596 	} else
597 		/*
598 		 * If no urgent pointer to send, then we pull
599 		 * the urgent pointer to the left edge of the send window
600 		 * so that it doesn't drift into the send window on sequence
601 		 * number wraparound.
602 		 */
603 		tp->snd_up = tp->snd_una;		/* drag it along */
604 
605 	/*
606 	 * Put TCP length in extended header, and then
607 	 * checksum extended header and data.
608 	 */
609 	if (len + optlen)
610 		ti->ti_len = htons((u_short)(sizeof (struct tcphdr) +
611 		    optlen + len));
612 	ti->ti_sum = in_cksum(m, (int)(hdrlen + len));
613 
614 	/*
615 	 * In transmit state, time the transmission and arrange for
616 	 * the retransmit.  In persist state, just set snd_max.
617 	 */
618 	if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
619 		tcp_seq startseq = tp->snd_nxt;
620 
621 		/*
622 		 * Advance snd_nxt over sequence space of this segment.
623 		 */
624 		if (flags & (TH_SYN|TH_FIN)) {
625 			if (flags & TH_SYN)
626 				tp->snd_nxt++;
627 			if (flags & TH_FIN) {
628 				tp->snd_nxt++;
629 				tp->t_flags |= TF_SENTFIN;
630 			}
631 		}
632 		tp->snd_nxt += len;
633 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
634 			tp->snd_max = tp->snd_nxt;
635 			/*
636 			 * Time this transmission if not a retransmission and
637 			 * not currently timing anything.
638 			 */
639 			if (tp->t_rtt == 0) {
640 				tp->t_rtt = 1;
641 				tp->t_rtseq = startseq;
642 				tcpstat.tcps_segstimed++;
643 			}
644 		}
645 
646 		/*
647 		 * Set retransmit timer if not currently set,
648 		 * and not doing an ack or a keep-alive probe.
649 		 * Initial value for retransmit timer is smoothed
650 		 * round-trip time + 2 * round-trip time variance.
651 		 * Initialize shift counter which is used for backoff
652 		 * of retransmit time.
653 		 */
654 		if (tp->t_timer[TCPT_REXMT] == 0 &&
655 		    tp->snd_nxt != tp->snd_una) {
656 			tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
657 			if (tp->t_timer[TCPT_PERSIST]) {
658 				tp->t_timer[TCPT_PERSIST] = 0;
659 				tp->t_rxtshift = 0;
660 			}
661 		}
662 	} else
663 		if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
664 			tp->snd_max = tp->snd_nxt + len;
665 
666 #ifdef TCPDEBUG
667 	/*
668 	 * Trace.
669 	 */
670 	if (so->so_options & SO_DEBUG)
671 		tcp_trace(TA_OUTPUT, tp->t_state, tp, ti, 0);
672 #endif
673 
674 	/*
675 	 * Fill in IP length and desired time to live and
676 	 * send to IP level.  There should be a better way
677 	 * to handle ttl and tos; we could keep them in
678 	 * the template, but need a way to checksum without them.
679 	 */
680 	m->m_pkthdr.len = hdrlen + len;
681     {
682 	struct rtentry *rt;
683 	((struct ip *)ti)->ip_len = m->m_pkthdr.len;
684 	((struct ip *)ti)->ip_ttl = tp->t_inpcb->inp_ip_ttl;	/* XXX */
685 	((struct ip *)ti)->ip_tos = tp->t_inpcb->inp_ip_tos;	/* XXX */
686 	/*
687 	 * See if we should do MTU discovery.  We do it only if the following
688 	 * are true:
689 	 *	1) we have a valid route to the destination
690 	 *	2) the MTU is not locked (if it is, then discovery has been
691 	 *	   disabled)
692 	 */
693 	if (path_mtu_discovery
694 	    && (rt = tp->t_inpcb->inp_route.ro_rt)
695 	    && rt->rt_flags & RTF_UP
696 	    && !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
697 		((struct ip *)ti)->ip_off |= IP_DF;
698 	}
699 	error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
700 	    so->so_options & SO_DONTROUTE, 0);
701     }
702 	if (error) {
703 out:
704 		if (error == ENOBUFS) {
705 			tcp_quench(tp->t_inpcb, 0);
706 			return (0);
707 		}
708 		if (error == EMSGSIZE) {
709 			/*
710 			 * ip_output() will have already fixed the route
711 			 * for us.  tcp_mtudisc() will, as its last action,
712 			 * initiate retransmission, so it is important to
713 			 * not do so here.
714 			 */
715 			tcp_mtudisc(tp->t_inpcb, 0);
716 			return 0;
717 		}
718 		if ((error == EHOSTUNREACH || error == ENETDOWN)
719 		    && TCPS_HAVERCVDSYN(tp->t_state)) {
720 			tp->t_softerror = error;
721 			return (0);
722 		}
723 		return (error);
724 	}
725 	tcpstat.tcps_sndtotal++;
726 
727 	/*
728 	 * Data sent (as far as we can tell).
729 	 * If this advertises a larger window than any other segment,
730 	 * then remember the size of the advertised window.
731 	 * Any pending ACK has now been sent.
732 	 */
733 	if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
734 		tp->rcv_adv = tp->rcv_nxt + win;
735 	tp->last_ack_sent = tp->rcv_nxt;
736 	tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
737 	if (sendalot)
738 		goto again;
739 	return (0);
740 }
741 
742 void
743 tcp_setpersist(tp)
744 	register struct tcpcb *tp;
745 {
746 	register int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
747 
748 	if (tp->t_timer[TCPT_REXMT])
749 		panic("tcp_output REXMT");
750 	/*
751 	 * Start/restart persistance timer.
752 	 */
753 	TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
754 	    t * tcp_backoff[tp->t_rxtshift],
755 	    TCPTV_PERSMIN, TCPTV_PERSMAX);
756 	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
757 		tp->t_rxtshift++;
758 }
759