xref: /titanic_41/usr/src/uts/common/inet/tcp/tcp_output.c (revision 82629e3015252bf18319ba3815c773df23e21436)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /* This file contains all TCP output processing functions. */
28 
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/strsun.h>
32 #include <sys/strsubr.h>
33 #include <sys/stropts.h>
34 #include <sys/strlog.h>
35 #define	_SUN_TPI_VERSION 2
36 #include <sys/tihdr.h>
37 #include <sys/suntpi.h>
38 #include <sys/xti_inet.h>
39 #include <sys/timod.h>
40 #include <sys/pattr.h>
41 #include <sys/squeue_impl.h>
42 #include <sys/squeue.h>
43 #include <sys/sockio.h>
44 #include <sys/tsol/tnet.h>
45 
46 #include <inet/common.h>
47 #include <inet/ip.h>
48 #include <inet/tcp.h>
49 #include <inet/tcp_impl.h>
50 #include <inet/snmpcom.h>
51 #include <inet/proto_set.h>
52 #include <inet/ipsec_impl.h>
53 #include <inet/ip_ndp.h>
54 
55 static mblk_t	*tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *);
56 static void	tcp_wput_cmdblk(queue_t *, mblk_t *);
57 static void	tcp_wput_flush(tcp_t *, mblk_t *);
58 static void	tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
59 static int	tcp_xmit_end(tcp_t *);
60 static int	tcp_send(tcp_t *, const int, const int, const int,
61 		    const int, int *, uint_t *, int *, mblk_t **, mblk_t *);
62 static void	tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t,
63 		    int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
64 static boolean_t	tcp_send_rst_chk(tcp_stack_t *);
65 static void	tcp_process_shrunk_swnd(tcp_t *, uint32_t);
66 static void	tcp_fill_header(tcp_t *, uchar_t *, clock_t, int);
67 
68 /*
69  * Functions called directly via squeue having a prototype of edesc_t.
70  */
71 static void	tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *);
72 static void	tcp_wput_ioctl(void *, mblk_t *, void *, ip_recv_attr_t *);
73 static void	tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *);
74 
75 /*
76  * This controls how tiny a write must be before we try to copy it
77  * into the mblk on the tail of the transmit queue.  Not much
78  * speedup is observed for values larger than sixteen.  Zero will
79  * disable the optimisation.
80  */
81 static int tcp_tx_pull_len = 16;
82 
83 void
84 tcp_wput(queue_t *q, mblk_t *mp)
85 {
86 	conn_t	*connp = Q_TO_CONN(q);
87 	tcp_t	*tcp;
88 	void (*output_proc)();
89 	t_scalar_t type;
90 	uchar_t *rptr;
91 	struct iocblk	*iocp;
92 	size_t size;
93 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
94 
95 	ASSERT(connp->conn_ref >= 2);
96 
97 	switch (DB_TYPE(mp)) {
98 	case M_DATA:
99 		tcp = connp->conn_tcp;
100 		ASSERT(tcp != NULL);
101 
102 		size = msgdsize(mp);
103 
104 		mutex_enter(&tcp->tcp_non_sq_lock);
105 		tcp->tcp_squeue_bytes += size;
106 		if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
107 			tcp_setqfull(tcp);
108 		}
109 		mutex_exit(&tcp->tcp_non_sq_lock);
110 
111 		CONN_INC_REF(connp);
112 		SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp,
113 		    NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
114 		return;
115 
116 	case M_CMD:
117 		tcp_wput_cmdblk(q, mp);
118 		return;
119 
120 	case M_PROTO:
121 	case M_PCPROTO:
122 		/*
123 		 * if it is a snmp message, don't get behind the squeue
124 		 */
125 		tcp = connp->conn_tcp;
126 		rptr = mp->b_rptr;
127 		if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) {
128 			type = ((union T_primitives *)rptr)->type;
129 		} else {
130 			if (connp->conn_debug) {
131 				(void) strlog(TCP_MOD_ID, 0, 1,
132 				    SL_ERROR|SL_TRACE,
133 				    "tcp_wput_proto, dropping one...");
134 			}
135 			freemsg(mp);
136 			return;
137 		}
138 		if (type == T_SVR4_OPTMGMT_REQ) {
139 			/*
140 			 * All Solaris components should pass a db_credp
141 			 * for this TPI message, hence we ASSERT.
142 			 * But in case there is some other M_PROTO that looks
143 			 * like a TPI message sent by some other kernel
144 			 * component, we check and return an error.
145 			 */
146 			cred_t	*cr = msg_getcred(mp, NULL);
147 
148 			ASSERT(cr != NULL);
149 			if (cr == NULL) {
150 				tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
151 				return;
152 			}
153 			if (snmpcom_req(q, mp, tcp_snmp_set, ip_snmp_get,
154 			    cr)) {
155 				/*
156 				 * This was a SNMP request
157 				 */
158 				return;
159 			} else {
160 				output_proc = tcp_wput_proto;
161 			}
162 		} else {
163 			output_proc = tcp_wput_proto;
164 		}
165 		break;
166 	case M_IOCTL:
167 		/*
168 		 * Most ioctls can be processed right away without going via
169 		 * squeues - process them right here. Those that do require
170 		 * squeue (currently _SIOCSOCKFALLBACK)
171 		 * are processed by tcp_wput_ioctl().
172 		 */
173 		iocp = (struct iocblk *)mp->b_rptr;
174 		tcp = connp->conn_tcp;
175 
176 		switch (iocp->ioc_cmd) {
177 		case TCP_IOC_ABORT_CONN:
178 			tcp_ioctl_abort_conn(q, mp);
179 			return;
180 		case TI_GETPEERNAME:
181 		case TI_GETMYNAME:
182 			mi_copyin(q, mp, NULL,
183 			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
184 			return;
185 		case ND_SET:
186 			/* nd_getset does the necessary checks */
187 		case ND_GET:
188 			if (nd_getset(q, tcps->tcps_g_nd, mp)) {
189 				qreply(q, mp);
190 				return;
191 			}
192 			CONN_INC_IOCTLREF(connp);
193 			ip_wput_nondata(q, mp);
194 			CONN_DEC_IOCTLREF(connp);
195 			return;
196 
197 		default:
198 			output_proc = tcp_wput_ioctl;
199 			break;
200 		}
201 		break;
202 	default:
203 		output_proc = tcp_wput_nondata;
204 		break;
205 	}
206 
207 	CONN_INC_REF(connp);
208 	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, output_proc, connp,
209 	    NULL, tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER);
210 }
211 
212 /*
213  * The TCP normal data output path.
214  * NOTE: the logic of the fast path is duplicated from this function.
215  */
216 void
217 tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
218 {
219 	int		len;
220 	mblk_t		*local_time;
221 	mblk_t		*mp1;
222 	uint32_t	snxt;
223 	int		tail_unsent;
224 	int		tcpstate;
225 	int		usable = 0;
226 	mblk_t		*xmit_tail;
227 	int32_t		mss;
228 	int32_t		num_sack_blk = 0;
229 	int32_t		total_hdr_len;
230 	int32_t		tcp_hdr_len;
231 	int		rc;
232 	tcp_stack_t	*tcps = tcp->tcp_tcps;
233 	conn_t		*connp = tcp->tcp_connp;
234 	clock_t		now = LBOLT_FASTPATH;
235 
236 	tcpstate = tcp->tcp_state;
237 	if (mp == NULL) {
238 		/*
239 		 * tcp_wput_data() with NULL mp should only be called when
240 		 * there is unsent data.
241 		 */
242 		ASSERT(tcp->tcp_unsent > 0);
243 		/* Really tacky... but we need this for detached closes. */
244 		len = tcp->tcp_unsent;
245 		goto data_null;
246 	}
247 
248 	ASSERT(mp->b_datap->db_type == M_DATA);
249 	/*
250 	 * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ,
251 	 * or before a connection attempt has begun.
252 	 */
253 	if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT ||
254 	    (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) {
255 		if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) {
256 #ifdef DEBUG
257 			cmn_err(CE_WARN,
258 			    "tcp_wput_data: data after ordrel, %s",
259 			    tcp_display(tcp, NULL,
260 			    DISP_ADDR_AND_PORT));
261 #else
262 			if (connp->conn_debug) {
263 				(void) strlog(TCP_MOD_ID, 0, 1,
264 				    SL_TRACE|SL_ERROR,
265 				    "tcp_wput_data: data after ordrel, %s\n",
266 				    tcp_display(tcp, NULL,
267 				    DISP_ADDR_AND_PORT));
268 			}
269 #endif /* DEBUG */
270 		}
271 		if (tcp->tcp_snd_zcopy_aware &&
272 		    (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
273 			tcp_zcopy_notify(tcp);
274 		freemsg(mp);
275 		mutex_enter(&tcp->tcp_non_sq_lock);
276 		if (tcp->tcp_flow_stopped &&
277 		    TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
278 			tcp_clrqfull(tcp);
279 		}
280 		mutex_exit(&tcp->tcp_non_sq_lock);
281 		return;
282 	}
283 
284 	/* Strip empties */
285 	for (;;) {
286 		ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
287 		    (uintptr_t)INT_MAX);
288 		len = (int)(mp->b_wptr - mp->b_rptr);
289 		if (len > 0)
290 			break;
291 		mp1 = mp;
292 		mp = mp->b_cont;
293 		freeb(mp1);
294 		if (mp == NULL) {
295 			return;
296 		}
297 	}
298 
299 	/* If we are the first on the list ... */
300 	if (tcp->tcp_xmit_head == NULL) {
301 		tcp->tcp_xmit_head = mp;
302 		tcp->tcp_xmit_tail = mp;
303 		tcp->tcp_xmit_tail_unsent = len;
304 	} else {
305 		/* If tiny tx and room in txq tail, pullup to save mblks. */
306 		struct datab *dp;
307 
308 		mp1 = tcp->tcp_xmit_last;
309 		if (len < tcp_tx_pull_len &&
310 		    (dp = mp1->b_datap)->db_ref == 1 &&
311 		    dp->db_lim - mp1->b_wptr >= len) {
312 			ASSERT(len > 0);
313 			ASSERT(!mp1->b_cont);
314 			if (len == 1) {
315 				*mp1->b_wptr++ = *mp->b_rptr;
316 			} else {
317 				bcopy(mp->b_rptr, mp1->b_wptr, len);
318 				mp1->b_wptr += len;
319 			}
320 			if (mp1 == tcp->tcp_xmit_tail)
321 				tcp->tcp_xmit_tail_unsent += len;
322 			mp1->b_cont = mp->b_cont;
323 			if (tcp->tcp_snd_zcopy_aware &&
324 			    (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
325 				mp1->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
326 			freeb(mp);
327 			mp = mp1;
328 		} else {
329 			tcp->tcp_xmit_last->b_cont = mp;
330 		}
331 		len += tcp->tcp_unsent;
332 	}
333 
334 	/* Tack on however many more positive length mblks we have */
335 	if ((mp1 = mp->b_cont) != NULL) {
336 		do {
337 			int tlen;
338 			ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <=
339 			    (uintptr_t)INT_MAX);
340 			tlen = (int)(mp1->b_wptr - mp1->b_rptr);
341 			if (tlen <= 0) {
342 				mp->b_cont = mp1->b_cont;
343 				freeb(mp1);
344 			} else {
345 				len += tlen;
346 				mp = mp1;
347 			}
348 		} while ((mp1 = mp->b_cont) != NULL);
349 	}
350 	tcp->tcp_xmit_last = mp;
351 	tcp->tcp_unsent = len;
352 
353 	if (urgent)
354 		usable = 1;
355 
356 data_null:
357 	snxt = tcp->tcp_snxt;
358 	xmit_tail = tcp->tcp_xmit_tail;
359 	tail_unsent = tcp->tcp_xmit_tail_unsent;
360 
361 	/*
362 	 * Note that tcp_mss has been adjusted to take into account the
363 	 * timestamp option if applicable.  Because SACK options do not
364 	 * appear in every TCP segments and they are of variable lengths,
365 	 * they cannot be included in tcp_mss.  Thus we need to calculate
366 	 * the actual segment length when we need to send a segment which
367 	 * includes SACK options.
368 	 */
369 	if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
370 		int32_t	opt_len;
371 
372 		num_sack_blk = MIN(tcp->tcp_max_sack_blk,
373 		    tcp->tcp_num_sack_blk);
374 		opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN *
375 		    2 + TCPOPT_HEADER_LEN;
376 		mss = tcp->tcp_mss - opt_len;
377 		total_hdr_len = connp->conn_ht_iphc_len + opt_len;
378 		tcp_hdr_len = connp->conn_ht_ulp_len + opt_len;
379 	} else {
380 		mss = tcp->tcp_mss;
381 		total_hdr_len = connp->conn_ht_iphc_len;
382 		tcp_hdr_len = connp->conn_ht_ulp_len;
383 	}
384 
385 	if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
386 	    (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
387 		TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
388 	}
389 	if (tcpstate == TCPS_SYN_RCVD) {
390 		/*
391 		 * The three-way connection establishment handshake is not
392 		 * complete yet. We want to queue the data for transmission
393 		 * after entering ESTABLISHED state (RFC793). A jump to
394 		 * "done" label effectively leaves data on the queue.
395 		 */
396 		goto done;
397 	} else {
398 		int usable_r;
399 
400 		/*
401 		 * In the special case when cwnd is zero, which can only
402 		 * happen if the connection is ECN capable, return now.
403 		 * New segments is sent using tcp_timer().  The timer
404 		 * is set in tcp_input_data().
405 		 */
406 		if (tcp->tcp_cwnd == 0) {
407 			/*
408 			 * Note that tcp_cwnd is 0 before 3-way handshake is
409 			 * finished.
410 			 */
411 			ASSERT(tcp->tcp_ecn_ok ||
412 			    tcp->tcp_state < TCPS_ESTABLISHED);
413 			return;
414 		}
415 
416 		/* NOTE: trouble if xmitting while SYN not acked? */
417 		usable_r = snxt - tcp->tcp_suna;
418 		usable_r = tcp->tcp_swnd - usable_r;
419 
420 		/*
421 		 * Check if the receiver has shrunk the window.  If
422 		 * tcp_wput_data() with NULL mp is called, tcp_fin_sent
423 		 * cannot be set as there is unsent data, so FIN cannot
424 		 * be sent out.  Otherwise, we need to take into account
425 		 * of FIN as it consumes an "invisible" sequence number.
426 		 */
427 		ASSERT(tcp->tcp_fin_sent == 0);
428 		if (usable_r < 0) {
429 			/*
430 			 * The receiver has shrunk the window and we have sent
431 			 * -usable_r date beyond the window, re-adjust.
432 			 *
433 			 * If TCP window scaling is enabled, there can be
434 			 * round down error as the advertised receive window
435 			 * is actually right shifted n bits.  This means that
436 			 * the lower n bits info is wiped out.  It will look
437 			 * like the window is shrunk.  Do a check here to
438 			 * see if the shrunk amount is actually within the
439 			 * error in window calculation.  If it is, just
440 			 * return.  Note that this check is inside the
441 			 * shrunk window check.  This makes sure that even
442 			 * though tcp_process_shrunk_swnd() is not called,
443 			 * we will stop further processing.
444 			 */
445 			if ((-usable_r >> tcp->tcp_snd_ws) > 0) {
446 				tcp_process_shrunk_swnd(tcp, -usable_r);
447 			}
448 			return;
449 		}
450 
451 		/* usable = MIN(swnd, cwnd) - unacked_bytes */
452 		if (tcp->tcp_swnd > tcp->tcp_cwnd)
453 			usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd;
454 
455 		/* usable = MIN(usable, unsent) */
456 		if (usable_r > len)
457 			usable_r = len;
458 
459 		/* usable = MAX(usable, {1 for urgent, 0 for data}) */
460 		if (usable_r > 0) {
461 			usable = usable_r;
462 		} else {
463 			/* Bypass all other unnecessary processing. */
464 			goto done;
465 		}
466 	}
467 
468 	local_time = (mblk_t *)now;
469 
470 	/*
471 	 * "Our" Nagle Algorithm.  This is not the same as in the old
472 	 * BSD.  This is more in line with the true intent of Nagle.
473 	 *
474 	 * The conditions are:
475 	 * 1. The amount of unsent data (or amount of data which can be
476 	 *    sent, whichever is smaller) is less than Nagle limit.
477 	 * 2. The last sent size is also less than Nagle limit.
478 	 * 3. There is unack'ed data.
479 	 * 4. Urgent pointer is not set.  Send urgent data ignoring the
480 	 *    Nagle algorithm.  This reduces the probability that urgent
481 	 *    bytes get "merged" together.
482 	 * 5. The app has not closed the connection.  This eliminates the
483 	 *    wait time of the receiving side waiting for the last piece of
484 	 *    (small) data.
485 	 *
486 	 * If all are satisified, exit without sending anything.  Note
487 	 * that Nagle limit can be smaller than 1 MSS.  Nagle limit is
488 	 * the smaller of 1 MSS and global tcp_naglim_def (default to be
489 	 * 4095).
490 	 */
491 	if (usable < (int)tcp->tcp_naglim &&
492 	    tcp->tcp_naglim > tcp->tcp_last_sent_len &&
493 	    snxt != tcp->tcp_suna &&
494 	    !(tcp->tcp_valid_bits & TCP_URG_VALID) &&
495 	    !(tcp->tcp_valid_bits & TCP_FSS_VALID)) {
496 		goto done;
497 	}
498 
499 	/*
500 	 * If tcp_zero_win_probe is not set and the tcp->tcp_cork option
501 	 * is set, then we have to force TCP not to send partial segment
502 	 * (smaller than MSS bytes). We are calculating the usable now
503 	 * based on full mss and will save the rest of remaining data for
504 	 * later. When tcp_zero_win_probe is set, TCP needs to send out
505 	 * something to do zero window probe.
506 	 */
507 	if (tcp->tcp_cork && !tcp->tcp_zero_win_probe) {
508 		if (usable < mss)
509 			goto done;
510 		usable = (usable / mss) * mss;
511 	}
512 
513 	/* Update the latest receive window size in TCP header. */
514 	tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
515 
516 	/* Send the packet. */
517 	rc = tcp_send(tcp, mss, total_hdr_len, tcp_hdr_len,
518 	    num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail,
519 	    local_time);
520 
521 	/* Pretend that all we were trying to send really got sent */
522 	if (rc < 0 && tail_unsent < 0) {
523 		do {
524 			xmit_tail = xmit_tail->b_cont;
525 			xmit_tail->b_prev = local_time;
526 			ASSERT((uintptr_t)(xmit_tail->b_wptr -
527 			    xmit_tail->b_rptr) <= (uintptr_t)INT_MAX);
528 			tail_unsent += (int)(xmit_tail->b_wptr -
529 			    xmit_tail->b_rptr);
530 		} while (tail_unsent < 0);
531 	}
532 done:;
533 	tcp->tcp_xmit_tail = xmit_tail;
534 	tcp->tcp_xmit_tail_unsent = tail_unsent;
535 	len = tcp->tcp_snxt - snxt;
536 	if (len) {
537 		/*
538 		 * If new data was sent, need to update the notsack
539 		 * list, which is, afterall, data blocks that have
540 		 * not been sack'ed by the receiver.  New data is
541 		 * not sack'ed.
542 		 */
543 		if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
544 			/* len is a negative value. */
545 			tcp->tcp_pipe -= len;
546 			tcp_notsack_update(&(tcp->tcp_notsack_list),
547 			    tcp->tcp_snxt, snxt,
548 			    &(tcp->tcp_num_notsack_blk),
549 			    &(tcp->tcp_cnt_notsack_list));
550 		}
551 		tcp->tcp_snxt = snxt + tcp->tcp_fin_sent;
552 		tcp->tcp_rack = tcp->tcp_rnxt;
553 		tcp->tcp_rack_cnt = 0;
554 		if ((snxt + len) == tcp->tcp_suna) {
555 			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
556 		}
557 	} else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) {
558 		/*
559 		 * Didn't send anything. Make sure the timer is running
560 		 * so that we will probe a zero window.
561 		 */
562 		TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
563 	}
564 	/* Note that len is the amount we just sent but with a negative sign */
565 	tcp->tcp_unsent += len;
566 	mutex_enter(&tcp->tcp_non_sq_lock);
567 	if (tcp->tcp_flow_stopped) {
568 		if (TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
569 			tcp_clrqfull(tcp);
570 		}
571 	} else if (TCP_UNSENT_BYTES(tcp) >= connp->conn_sndbuf) {
572 		if (!(tcp->tcp_detached))
573 			tcp_setqfull(tcp);
574 	}
575 	mutex_exit(&tcp->tcp_non_sq_lock);
576 }
577 
578 /*
579  * Initial STREAMS write side put() procedure for sockets. It tries to
580  * handle the T_CAPABILITY_REQ which sockfs sends down while setting
581  * up the socket without using the squeue. Non T_CAPABILITY_REQ messages
582  * are handled by tcp_wput() as usual.
583  *
584  * All further messages will also be handled by tcp_wput() because we cannot
585  * be sure that the above short cut is safe later.
586  */
587 void
588 tcp_wput_sock(queue_t *wq, mblk_t *mp)
589 {
590 	conn_t			*connp = Q_TO_CONN(wq);
591 	tcp_t			*tcp = connp->conn_tcp;
592 	struct T_capability_req	*car = (struct T_capability_req *)mp->b_rptr;
593 
594 	ASSERT(wq->q_qinfo == &tcp_sock_winit);
595 	wq->q_qinfo = &tcp_winit;
596 
597 	ASSERT(IPCL_IS_TCP(connp));
598 	ASSERT(TCP_IS_SOCKET(tcp));
599 
600 	if (DB_TYPE(mp) == M_PCPROTO &&
601 	    MBLKL(mp) == sizeof (struct T_capability_req) &&
602 	    car->PRIM_type == T_CAPABILITY_REQ) {
603 		tcp_capability_req(tcp, mp);
604 		return;
605 	}
606 
607 	tcp_wput(wq, mp);
608 }
609 
610 /* ARGSUSED */
611 void
612 tcp_wput_fallback(queue_t *wq, mblk_t *mp)
613 {
614 #ifdef DEBUG
615 	cmn_err(CE_CONT, "tcp_wput_fallback: Message during fallback \n");
616 #endif
617 	freemsg(mp);
618 }
619 
620 /*
621  * Call by tcp_wput() to handle misc non M_DATA messages.
622  */
623 /* ARGSUSED */
624 static void
625 tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
626 {
627 	conn_t	*connp = (conn_t *)arg;
628 	tcp_t	*tcp = connp->conn_tcp;
629 
630 	ASSERT(DB_TYPE(mp) != M_IOCTL);
631 	/*
632 	 * TCP is D_MP and qprocsoff() is done towards the end of the tcp_close.
633 	 * Once the close starts, streamhead and sockfs will not let any data
634 	 * packets come down (close ensures that there are no threads using the
635 	 * queue and no new threads will come down) but since qprocsoff()
636 	 * hasn't happened yet, a M_FLUSH or some non data message might
637 	 * get reflected back (in response to our own FLUSHRW) and get
638 	 * processed after tcp_close() is done. The conn would still be valid
639 	 * because a ref would have added but we need to check the state
640 	 * before actually processing the packet.
641 	 */
642 	if (TCP_IS_DETACHED(tcp) || (tcp->tcp_state == TCPS_CLOSED)) {
643 		freemsg(mp);
644 		return;
645 	}
646 
647 	switch (DB_TYPE(mp)) {
648 	case M_IOCDATA:
649 		tcp_wput_iocdata(tcp, mp);
650 		break;
651 	case M_FLUSH:
652 		tcp_wput_flush(tcp, mp);
653 		break;
654 	default:
655 		ip_wput_nondata(connp->conn_wq, mp);
656 		break;
657 	}
658 }
659 
660 /* tcp_wput_flush is called by tcp_wput_nondata to handle M_FLUSH messages. */
661 static void
662 tcp_wput_flush(tcp_t *tcp, mblk_t *mp)
663 {
664 	uchar_t	fval = *mp->b_rptr;
665 	mblk_t	*tail;
666 	conn_t	*connp = tcp->tcp_connp;
667 	queue_t	*q = connp->conn_wq;
668 
669 	/* TODO: How should flush interact with urgent data? */
670 	if ((fval & FLUSHW) && tcp->tcp_xmit_head != NULL &&
671 	    !(tcp->tcp_valid_bits & TCP_URG_VALID)) {
672 		/*
673 		 * Flush only data that has not yet been put on the wire.  If
674 		 * we flush data that we have already transmitted, life, as we
675 		 * know it, may come to an end.
676 		 */
677 		tail = tcp->tcp_xmit_tail;
678 		tail->b_wptr -= tcp->tcp_xmit_tail_unsent;
679 		tcp->tcp_xmit_tail_unsent = 0;
680 		tcp->tcp_unsent = 0;
681 		if (tail->b_wptr != tail->b_rptr)
682 			tail = tail->b_cont;
683 		if (tail) {
684 			mblk_t **excess = &tcp->tcp_xmit_head;
685 			for (;;) {
686 				mblk_t *mp1 = *excess;
687 				if (mp1 == tail)
688 					break;
689 				tcp->tcp_xmit_tail = mp1;
690 				tcp->tcp_xmit_last = mp1;
691 				excess = &mp1->b_cont;
692 			}
693 			*excess = NULL;
694 			tcp_close_mpp(&tail);
695 			if (tcp->tcp_snd_zcopy_aware)
696 				tcp_zcopy_notify(tcp);
697 		}
698 		/*
699 		 * We have no unsent data, so unsent must be less than
700 		 * conn_sndlowat, so re-enable flow.
701 		 */
702 		mutex_enter(&tcp->tcp_non_sq_lock);
703 		if (tcp->tcp_flow_stopped) {
704 			tcp_clrqfull(tcp);
705 		}
706 		mutex_exit(&tcp->tcp_non_sq_lock);
707 	}
708 	/*
709 	 * TODO: you can't just flush these, you have to increase rwnd for one
710 	 * thing.  For another, how should urgent data interact?
711 	 */
712 	if (fval & FLUSHR) {
713 		*mp->b_rptr = fval & ~FLUSHW;
714 		/* XXX */
715 		qreply(q, mp);
716 		return;
717 	}
718 	freemsg(mp);
719 }
720 
721 /*
722  * tcp_wput_iocdata is called by tcp_wput_nondata to handle all M_IOCDATA
723  * messages.
724  */
725 static void
726 tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp)
727 {
728 	mblk_t		*mp1;
729 	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
730 	STRUCT_HANDLE(strbuf, sb);
731 	uint_t		addrlen;
732 	conn_t		*connp = tcp->tcp_connp;
733 	queue_t 	*q = connp->conn_wq;
734 
735 	/* Make sure it is one of ours. */
736 	switch (iocp->ioc_cmd) {
737 	case TI_GETMYNAME:
738 	case TI_GETPEERNAME:
739 		break;
740 	default:
741 		/*
742 		 * If the conn is closing, then error the ioctl here. Otherwise
743 		 * use the CONN_IOCTLREF_* macros to hold off tcp_close until
744 		 * we're done here.
745 		 */
746 		mutex_enter(&connp->conn_lock);
747 		if (connp->conn_state_flags & CONN_CLOSING) {
748 			mutex_exit(&connp->conn_lock);
749 			iocp->ioc_error = EINVAL;
750 			mp->b_datap->db_type = M_IOCNAK;
751 			iocp->ioc_count = 0;
752 			qreply(q, mp);
753 			return;
754 		}
755 
756 		CONN_INC_IOCTLREF_LOCKED(connp);
757 		ip_wput_nondata(q, mp);
758 		CONN_DEC_IOCTLREF(connp);
759 		return;
760 	}
761 	switch (mi_copy_state(q, mp, &mp1)) {
762 	case -1:
763 		return;
764 	case MI_COPY_CASE(MI_COPY_IN, 1):
765 		break;
766 	case MI_COPY_CASE(MI_COPY_OUT, 1):
767 		/* Copy out the strbuf. */
768 		mi_copyout(q, mp);
769 		return;
770 	case MI_COPY_CASE(MI_COPY_OUT, 2):
771 		/* All done. */
772 		mi_copy_done(q, mp, 0);
773 		return;
774 	default:
775 		mi_copy_done(q, mp, EPROTO);
776 		return;
777 	}
778 	/* Check alignment of the strbuf */
779 	if (!OK_32PTR(mp1->b_rptr)) {
780 		mi_copy_done(q, mp, EINVAL);
781 		return;
782 	}
783 
784 	STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr);
785 
786 	if (connp->conn_family == AF_INET)
787 		addrlen = sizeof (sin_t);
788 	else
789 		addrlen = sizeof (sin6_t);
790 
791 	if (STRUCT_FGET(sb, maxlen) < addrlen) {
792 		mi_copy_done(q, mp, EINVAL);
793 		return;
794 	}
795 
796 	switch (iocp->ioc_cmd) {
797 	case TI_GETMYNAME:
798 		break;
799 	case TI_GETPEERNAME:
800 		if (tcp->tcp_state < TCPS_SYN_RCVD) {
801 			mi_copy_done(q, mp, ENOTCONN);
802 			return;
803 		}
804 		break;
805 	}
806 	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
807 	if (!mp1)
808 		return;
809 
810 	STRUCT_FSET(sb, len, addrlen);
811 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
812 	case TI_GETMYNAME:
813 		(void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
814 		    &addrlen);
815 		break;
816 	case TI_GETPEERNAME:
817 		(void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
818 		    &addrlen);
819 		break;
820 	}
821 	mp1->b_wptr += addrlen;
822 	/* Copy out the address */
823 	mi_copyout(q, mp);
824 }
825 
826 /*
827  * tcp_wput_ioctl is called by tcp_wput_nondata() to handle all M_IOCTL
828  * messages.
829  */
830 /* ARGSUSED */
831 static void
832 tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
833 {
834 	conn_t 		*connp = (conn_t *)arg;
835 	tcp_t		*tcp = connp->conn_tcp;
836 	queue_t		*q = connp->conn_wq;
837 	struct iocblk	*iocp;
838 
839 	ASSERT(DB_TYPE(mp) == M_IOCTL);
840 	/*
841 	 * Try and ASSERT the minimum possible references on the
842 	 * conn early enough. Since we are executing on write side,
843 	 * the connection is obviously not detached and that means
844 	 * there is a ref each for TCP and IP. Since we are behind
845 	 * the squeue, the minimum references needed are 3. If the
846 	 * conn is in classifier hash list, there should be an
847 	 * extra ref for that (we check both the possibilities).
848 	 */
849 	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
850 	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
851 
852 	iocp = (struct iocblk *)mp->b_rptr;
853 	switch (iocp->ioc_cmd) {
854 	case _SIOCSOCKFALLBACK:
855 		/*
856 		 * Either sockmod is about to be popped and the socket
857 		 * would now be treated as a plain stream, or a module
858 		 * is about to be pushed so we could no longer use read-
859 		 * side synchronous streams for fused loopback tcp.
860 		 * Drain any queued data and disable direct sockfs
861 		 * interface from now on.
862 		 */
863 		if (!tcp->tcp_issocket) {
864 			DB_TYPE(mp) = M_IOCNAK;
865 			iocp->ioc_error = EINVAL;
866 		} else {
867 			tcp_use_pure_tpi(tcp);
868 			DB_TYPE(mp) = M_IOCACK;
869 			iocp->ioc_error = 0;
870 		}
871 		iocp->ioc_count = 0;
872 		iocp->ioc_rval = 0;
873 		qreply(q, mp);
874 		return;
875 	}
876 
877 	/*
878 	 * If the conn is closing, then error the ioctl here. Otherwise bump the
879 	 * conn_ioctlref to hold off tcp_close until we're done here.
880 	 */
881 	mutex_enter(&(connp)->conn_lock);
882 	if ((connp)->conn_state_flags & CONN_CLOSING) {
883 		mutex_exit(&(connp)->conn_lock);
884 		iocp->ioc_error = EINVAL;
885 		mp->b_datap->db_type = M_IOCNAK;
886 		iocp->ioc_count = 0;
887 		qreply(q, mp);
888 		return;
889 	}
890 
891 	CONN_INC_IOCTLREF_LOCKED(connp);
892 	ip_wput_nondata(q, mp);
893 	CONN_DEC_IOCTLREF(connp);
894 }
895 
896 /*
897  * This routine is called by tcp_wput() to handle all TPI requests.
898  */
899 /* ARGSUSED */
900 static void
901 tcp_wput_proto(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
902 {
903 	conn_t		*connp = (conn_t *)arg;
904 	tcp_t		*tcp = connp->conn_tcp;
905 	union T_primitives *tprim = (union T_primitives *)mp->b_rptr;
906 	uchar_t		*rptr;
907 	t_scalar_t	type;
908 	cred_t		*cr;
909 
910 	/*
911 	 * Try and ASSERT the minimum possible references on the
912 	 * conn early enough. Since we are executing on write side,
913 	 * the connection is obviously not detached and that means
914 	 * there is a ref each for TCP and IP. Since we are behind
915 	 * the squeue, the minimum references needed are 3. If the
916 	 * conn is in classifier hash list, there should be an
917 	 * extra ref for that (we check both the possibilities).
918 	 */
919 	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
920 	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
921 
922 	rptr = mp->b_rptr;
923 	ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
924 	if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) {
925 		type = ((union T_primitives *)rptr)->type;
926 		if (type == T_EXDATA_REQ) {
927 			tcp_output_urgent(connp, mp, arg2, NULL);
928 		} else if (type != T_DATA_REQ) {
929 			goto non_urgent_data;
930 		} else {
931 			/* TODO: options, flags, ... from user */
932 			/* Set length to zero for reclamation below */
933 			tcp_wput_data(tcp, mp->b_cont, B_TRUE);
934 			freeb(mp);
935 		}
936 		return;
937 	} else {
938 		if (connp->conn_debug) {
939 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
940 			    "tcp_wput_proto, dropping one...");
941 		}
942 		freemsg(mp);
943 		return;
944 	}
945 
946 non_urgent_data:
947 
948 	switch ((int)tprim->type) {
949 	case T_SSL_PROXY_BIND_REQ:	/* an SSL proxy endpoint bind request */
950 		/*
951 		 * save the kssl_ent_t from the next block, and convert this
952 		 * back to a normal bind_req.
953 		 */
954 		if (mp->b_cont != NULL) {
955 			ASSERT(MBLKL(mp->b_cont) >= sizeof (kssl_ent_t));
956 
957 			if (tcp->tcp_kssl_ent != NULL) {
958 				kssl_release_ent(tcp->tcp_kssl_ent, NULL,
959 				    KSSL_NO_PROXY);
960 				tcp->tcp_kssl_ent = NULL;
961 			}
962 			bcopy(mp->b_cont->b_rptr, &tcp->tcp_kssl_ent,
963 			    sizeof (kssl_ent_t));
964 			kssl_hold_ent(tcp->tcp_kssl_ent);
965 			freemsg(mp->b_cont);
966 			mp->b_cont = NULL;
967 		}
968 		tprim->type = T_BIND_REQ;
969 
970 	/* FALLTHROUGH */
971 	case O_T_BIND_REQ:	/* bind request */
972 	case T_BIND_REQ:	/* new semantics bind request */
973 		tcp_tpi_bind(tcp, mp);
974 		break;
975 	case T_UNBIND_REQ:	/* unbind request */
976 		tcp_tpi_unbind(tcp, mp);
977 		break;
978 	case O_T_CONN_RES:	/* old connection response XXX */
979 	case T_CONN_RES:	/* connection response */
980 		tcp_tli_accept(tcp, mp);
981 		break;
982 	case T_CONN_REQ:	/* connection request */
983 		tcp_tpi_connect(tcp, mp);
984 		break;
985 	case T_DISCON_REQ:	/* disconnect request */
986 		tcp_disconnect(tcp, mp);
987 		break;
988 	case T_CAPABILITY_REQ:
989 		tcp_capability_req(tcp, mp);	/* capability request */
990 		break;
991 	case T_INFO_REQ:	/* information request */
992 		tcp_info_req(tcp, mp);
993 		break;
994 	case T_SVR4_OPTMGMT_REQ:	/* manage options req */
995 	case T_OPTMGMT_REQ:
996 		/*
997 		 * Note:  no support for snmpcom_req() through new
998 		 * T_OPTMGMT_REQ. See comments in ip.c
999 		 */
1000 
1001 		/*
1002 		 * All Solaris components should pass a db_credp
1003 		 * for this TPI message, hence we ASSERT.
1004 		 * But in case there is some other M_PROTO that looks
1005 		 * like a TPI message sent by some other kernel
1006 		 * component, we check and return an error.
1007 		 */
1008 		cr = msg_getcred(mp, NULL);
1009 		ASSERT(cr != NULL);
1010 		if (cr == NULL) {
1011 			tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
1012 			return;
1013 		}
1014 		/*
1015 		 * If EINPROGRESS is returned, the request has been queued
1016 		 * for subsequent processing by ip_restart_optmgmt(), which
1017 		 * will do the CONN_DEC_REF().
1018 		 */
1019 		if ((int)tprim->type == T_SVR4_OPTMGMT_REQ) {
1020 			svr4_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj);
1021 		} else {
1022 			tpi_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj);
1023 		}
1024 		break;
1025 
1026 	case T_UNITDATA_REQ:	/* unitdata request */
1027 		tcp_err_ack(tcp, mp, TNOTSUPPORT, 0);
1028 		break;
1029 	case T_ORDREL_REQ:	/* orderly release req */
1030 		freemsg(mp);
1031 
1032 		if (tcp->tcp_fused)
1033 			tcp_unfuse(tcp);
1034 
1035 		if (tcp_xmit_end(tcp) != 0) {
1036 			/*
1037 			 * We were crossing FINs and got a reset from
1038 			 * the other side. Just ignore it.
1039 			 */
1040 			if (connp->conn_debug) {
1041 				(void) strlog(TCP_MOD_ID, 0, 1,
1042 				    SL_ERROR|SL_TRACE,
1043 				    "tcp_wput_proto, T_ORDREL_REQ out of "
1044 				    "state %s",
1045 				    tcp_display(tcp, NULL,
1046 				    DISP_ADDR_AND_PORT));
1047 			}
1048 		}
1049 		break;
1050 	case T_ADDR_REQ:
1051 		tcp_addr_req(tcp, mp);
1052 		break;
1053 	default:
1054 		if (connp->conn_debug) {
1055 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
1056 			    "tcp_wput_proto, bogus TPI msg, type %d",
1057 			    tprim->type);
1058 		}
1059 		/*
1060 		 * We used to M_ERROR.  Sending TNOTSUPPORT gives the user
1061 		 * to recover.
1062 		 */
1063 		tcp_err_ack(tcp, mp, TNOTSUPPORT, 0);
1064 		break;
1065 	}
1066 }
1067 
1068 /*
1069  * Handle special out-of-band ioctl requests (see PSARC/2008/265).
1070  */
1071 static void
1072 tcp_wput_cmdblk(queue_t *q, mblk_t *mp)
1073 {
1074 	void	*data;
1075 	mblk_t	*datamp = mp->b_cont;
1076 	conn_t	*connp = Q_TO_CONN(q);
1077 	tcp_t	*tcp = connp->conn_tcp;
1078 	cmdblk_t *cmdp = (cmdblk_t *)mp->b_rptr;
1079 
1080 	if (datamp == NULL || MBLKL(datamp) < cmdp->cb_len) {
1081 		cmdp->cb_error = EPROTO;
1082 		qreply(q, mp);
1083 		return;
1084 	}
1085 
1086 	data = datamp->b_rptr;
1087 
1088 	switch (cmdp->cb_cmd) {
1089 	case TI_GETPEERNAME:
1090 		if (tcp->tcp_state < TCPS_SYN_RCVD)
1091 			cmdp->cb_error = ENOTCONN;
1092 		else
1093 			cmdp->cb_error = conn_getpeername(connp, data,
1094 			    &cmdp->cb_len);
1095 		break;
1096 	case TI_GETMYNAME:
1097 		cmdp->cb_error = conn_getsockname(connp, data, &cmdp->cb_len);
1098 		break;
1099 	default:
1100 		cmdp->cb_error = EINVAL;
1101 		break;
1102 	}
1103 
1104 	qreply(q, mp);
1105 }
1106 
1107 /*
1108  * The TCP fast path write put procedure.
1109  * NOTE: the logic of the fast path is duplicated from tcp_wput_data()
1110  */
1111 /* ARGSUSED */
1112 void
1113 tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1114 {
1115 	int		len;
1116 	int		hdrlen;
1117 	int		plen;
1118 	mblk_t		*mp1;
1119 	uchar_t		*rptr;
1120 	uint32_t	snxt;
1121 	tcpha_t		*tcpha;
1122 	struct datab	*db;
1123 	uint32_t	suna;
1124 	uint32_t	mss;
1125 	ipaddr_t	*dst;
1126 	ipaddr_t	*src;
1127 	uint32_t	sum;
1128 	int		usable;
1129 	conn_t		*connp = (conn_t *)arg;
1130 	tcp_t		*tcp = connp->conn_tcp;
1131 	uint32_t	msize;
1132 	tcp_stack_t	*tcps = tcp->tcp_tcps;
1133 	ip_xmit_attr_t	*ixa;
1134 	clock_t		now;
1135 
1136 	/*
1137 	 * Try and ASSERT the minimum possible references on the
1138 	 * conn early enough. Since we are executing on write side,
1139 	 * the connection is obviously not detached and that means
1140 	 * there is a ref each for TCP and IP. Since we are behind
1141 	 * the squeue, the minimum references needed are 3. If the
1142 	 * conn is in classifier hash list, there should be an
1143 	 * extra ref for that (we check both the possibilities).
1144 	 */
1145 	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
1146 	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
1147 
1148 	ASSERT(DB_TYPE(mp) == M_DATA);
1149 	msize = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
1150 
1151 	mutex_enter(&tcp->tcp_non_sq_lock);
1152 	tcp->tcp_squeue_bytes -= msize;
1153 	mutex_exit(&tcp->tcp_non_sq_lock);
1154 
1155 	/* Bypass tcp protocol for fused tcp loopback */
1156 	if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize))
1157 		return;
1158 
1159 	mss = tcp->tcp_mss;
1160 	/*
1161 	 * If ZEROCOPY has turned off, try not to send any zero-copy message
1162 	 * down. Do backoff, now.
1163 	 */
1164 	if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on)
1165 		mp = tcp_zcopy_backoff(tcp, mp, B_FALSE);
1166 
1167 
1168 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
1169 	len = (int)(mp->b_wptr - mp->b_rptr);
1170 
1171 	/*
1172 	 * Criteria for fast path:
1173 	 *
1174 	 *   1. no unsent data
1175 	 *   2. single mblk in request
1176 	 *   3. connection established
1177 	 *   4. data in mblk
1178 	 *   5. len <= mss
1179 	 *   6. no tcp_valid bits
1180 	 */
1181 	if ((tcp->tcp_unsent != 0) ||
1182 	    (tcp->tcp_cork) ||
1183 	    (mp->b_cont != NULL) ||
1184 	    (tcp->tcp_state != TCPS_ESTABLISHED) ||
1185 	    (len == 0) ||
1186 	    (len > mss) ||
1187 	    (tcp->tcp_valid_bits != 0)) {
1188 		tcp_wput_data(tcp, mp, B_FALSE);
1189 		return;
1190 	}
1191 
1192 	ASSERT(tcp->tcp_xmit_tail_unsent == 0);
1193 	ASSERT(tcp->tcp_fin_sent == 0);
1194 
1195 	/* queue new packet onto retransmission queue */
1196 	if (tcp->tcp_xmit_head == NULL) {
1197 		tcp->tcp_xmit_head = mp;
1198 	} else {
1199 		tcp->tcp_xmit_last->b_cont = mp;
1200 	}
1201 	tcp->tcp_xmit_last = mp;
1202 	tcp->tcp_xmit_tail = mp;
1203 
1204 	/* find out how much we can send */
1205 	/* BEGIN CSTYLED */
1206 	/*
1207 	 *    un-acked	   usable
1208 	 *  |--------------|-----------------|
1209 	 *  tcp_suna       tcp_snxt	  tcp_suna+tcp_swnd
1210 	 */
1211 	/* END CSTYLED */
1212 
1213 	/* start sending from tcp_snxt */
1214 	snxt = tcp->tcp_snxt;
1215 
1216 	/*
1217 	 * Check to see if this connection has been idled for some
1218 	 * time and no ACK is expected.  If it is, we need to slow
1219 	 * start again to get back the connection's "self-clock" as
1220 	 * described in VJ's paper.
1221 	 *
1222 	 * Reinitialize tcp_cwnd after idle.
1223 	 */
1224 	now = LBOLT_FASTPATH;
1225 	if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
1226 	    (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
1227 		TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
1228 	}
1229 
1230 	usable = tcp->tcp_swnd;		/* tcp window size */
1231 	if (usable > tcp->tcp_cwnd)
1232 		usable = tcp->tcp_cwnd;	/* congestion window smaller */
1233 	usable -= snxt;		/* subtract stuff already sent */
1234 	suna = tcp->tcp_suna;
1235 	usable += suna;
1236 	/* usable can be < 0 if the congestion window is smaller */
1237 	if (len > usable) {
1238 		/* Can't send complete M_DATA in one shot */
1239 		goto slow;
1240 	}
1241 
1242 	mutex_enter(&tcp->tcp_non_sq_lock);
1243 	if (tcp->tcp_flow_stopped &&
1244 	    TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
1245 		tcp_clrqfull(tcp);
1246 	}
1247 	mutex_exit(&tcp->tcp_non_sq_lock);
1248 
1249 	/*
1250 	 * determine if anything to send (Nagle).
1251 	 *
1252 	 *   1. len < tcp_mss (i.e. small)
1253 	 *   2. unacknowledged data present
1254 	 *   3. len < nagle limit
1255 	 *   4. last packet sent < nagle limit (previous packet sent)
1256 	 */
1257 	if ((len < mss) && (snxt != suna) &&
1258 	    (len < (int)tcp->tcp_naglim) &&
1259 	    (tcp->tcp_last_sent_len < tcp->tcp_naglim)) {
1260 		/*
1261 		 * This was the first unsent packet and normally
1262 		 * mss < xmit_hiwater so there is no need to worry
1263 		 * about flow control. The next packet will go
1264 		 * through the flow control check in tcp_wput_data().
1265 		 */
1266 		/* leftover work from above */
1267 		tcp->tcp_unsent = len;
1268 		tcp->tcp_xmit_tail_unsent = len;
1269 
1270 		return;
1271 	}
1272 
1273 	/*
1274 	 * len <= tcp->tcp_mss && len == unsent so no sender silly window.  Can
1275 	 * send now.
1276 	 */
1277 
1278 	if (snxt == suna) {
1279 		TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
1280 	}
1281 
1282 	/* we have always sent something */
1283 	tcp->tcp_rack_cnt = 0;
1284 
1285 	tcp->tcp_snxt = snxt + len;
1286 	tcp->tcp_rack = tcp->tcp_rnxt;
1287 
1288 	if ((mp1 = dupb(mp)) == 0)
1289 		goto no_memory;
1290 	mp->b_prev = (mblk_t *)(uintptr_t)now;
1291 	mp->b_next = (mblk_t *)(uintptr_t)snxt;
1292 
1293 	/* adjust tcp header information */
1294 	tcpha = tcp->tcp_tcpha;
1295 	tcpha->tha_flags = (TH_ACK|TH_PUSH);
1296 
1297 	sum = len + connp->conn_ht_ulp_len + connp->conn_sum;
1298 	sum = (sum >> 16) + (sum & 0xFFFF);
1299 	tcpha->tha_sum = htons(sum);
1300 
1301 	tcpha->tha_seq = htonl(snxt);
1302 
1303 	TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1304 	TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1305 	BUMP_LOCAL(tcp->tcp_obsegs);
1306 
1307 	/* Update the latest receive window size in TCP header. */
1308 	tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
1309 
1310 	tcp->tcp_last_sent_len = (ushort_t)len;
1311 
1312 	plen = len + connp->conn_ht_iphc_len;
1313 
1314 	ixa = connp->conn_ixa;
1315 	ixa->ixa_pktlen = plen;
1316 
1317 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
1318 		tcp->tcp_ipha->ipha_length = htons(plen);
1319 	} else {
1320 		tcp->tcp_ip6h->ip6_plen = htons(plen - IPV6_HDR_LEN);
1321 	}
1322 
1323 	/* see if we need to allocate a mblk for the headers */
1324 	hdrlen = connp->conn_ht_iphc_len;
1325 	rptr = mp1->b_rptr - hdrlen;
1326 	db = mp1->b_datap;
1327 	if ((db->db_ref != 2) || rptr < db->db_base ||
1328 	    (!OK_32PTR(rptr))) {
1329 		/* NOTE: we assume allocb returns an OK_32PTR */
1330 		mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED);
1331 		if (!mp) {
1332 			freemsg(mp1);
1333 			goto no_memory;
1334 		}
1335 		mp->b_cont = mp1;
1336 		mp1 = mp;
1337 		/* Leave room for Link Level header */
1338 		rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra];
1339 		mp1->b_wptr = &rptr[hdrlen];
1340 	}
1341 	mp1->b_rptr = rptr;
1342 
1343 	/* Fill in the timestamp option. */
1344 	if (tcp->tcp_snd_ts_ok) {
1345 		uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
1346 
1347 		U32_TO_BE32(llbolt,
1348 		    (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
1349 		U32_TO_BE32(tcp->tcp_ts_recent,
1350 		    (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
1351 	} else {
1352 		ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
1353 	}
1354 
1355 	/* copy header into outgoing packet */
1356 	dst = (ipaddr_t *)rptr;
1357 	src = (ipaddr_t *)connp->conn_ht_iphc;
1358 	dst[0] = src[0];
1359 	dst[1] = src[1];
1360 	dst[2] = src[2];
1361 	dst[3] = src[3];
1362 	dst[4] = src[4];
1363 	dst[5] = src[5];
1364 	dst[6] = src[6];
1365 	dst[7] = src[7];
1366 	dst[8] = src[8];
1367 	dst[9] = src[9];
1368 	if (hdrlen -= 40) {
1369 		hdrlen >>= 2;
1370 		dst += 10;
1371 		src += 10;
1372 		do {
1373 			*dst++ = *src++;
1374 		} while (--hdrlen);
1375 	}
1376 
1377 	/*
1378 	 * Set the ECN info in the TCP header.  Note that this
1379 	 * is not the template header.
1380 	 */
1381 	if (tcp->tcp_ecn_ok) {
1382 		TCP_SET_ECT(tcp, rptr);
1383 
1384 		tcpha = (tcpha_t *)(rptr + ixa->ixa_ip_hdr_length);
1385 		if (tcp->tcp_ecn_echo_on)
1386 			tcpha->tha_flags |= TH_ECE;
1387 		if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
1388 			tcpha->tha_flags |= TH_CWR;
1389 			tcp->tcp_ecn_cwr_sent = B_TRUE;
1390 		}
1391 	}
1392 
1393 	if (tcp->tcp_ip_forward_progress) {
1394 		tcp->tcp_ip_forward_progress = B_FALSE;
1395 		connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF;
1396 	} else {
1397 		connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF;
1398 	}
1399 	tcp_send_data(tcp, mp1);
1400 	return;
1401 
1402 	/*
1403 	 * If we ran out of memory, we pretend to have sent the packet
1404 	 * and that it was lost on the wire.
1405 	 */
1406 no_memory:
1407 	return;
1408 
1409 slow:
1410 	/* leftover work from above */
1411 	tcp->tcp_unsent = len;
1412 	tcp->tcp_xmit_tail_unsent = len;
1413 	tcp_wput_data(tcp, NULL, B_FALSE);
1414 }
1415 
1416 /* ARGSUSED2 */
1417 void
1418 tcp_output_urgent(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1419 {
1420 	int len;
1421 	uint32_t msize;
1422 	conn_t *connp = (conn_t *)arg;
1423 	tcp_t *tcp = connp->conn_tcp;
1424 
1425 	msize = msgdsize(mp);
1426 
1427 	len = msize - 1;
1428 	if (len < 0) {
1429 		freemsg(mp);
1430 		return;
1431 	}
1432 
1433 	/*
1434 	 * Try to force urgent data out on the wire. Even if we have unsent
1435 	 * data this will at least send the urgent flag.
1436 	 * XXX does not handle more flag correctly.
1437 	 */
1438 	len += tcp->tcp_unsent;
1439 	len += tcp->tcp_snxt;
1440 	tcp->tcp_urg = len;
1441 	tcp->tcp_valid_bits |= TCP_URG_VALID;
1442 
1443 	/* Bypass tcp protocol for fused tcp loopback */
1444 	if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize))
1445 		return;
1446 
1447 	/* Strip off the T_EXDATA_REQ if the data is from TPI */
1448 	if (DB_TYPE(mp) != M_DATA) {
1449 		mblk_t *mp1 = mp;
1450 		ASSERT(!IPCL_IS_NONSTR(connp));
1451 		mp = mp->b_cont;
1452 		freeb(mp1);
1453 	}
1454 	tcp_wput_data(tcp, mp, B_TRUE);
1455 }
1456 
1457 /*
1458  * Called by streams close routine via squeues when our client blows off her
1459  * descriptor, we take this to mean: "close the stream state NOW, close the tcp
1460  * connection politely" When SO_LINGER is set (with a non-zero linger time and
1461  * it is not a nonblocking socket) then this routine sleeps until the FIN is
1462  * acked.
1463  *
1464  * NOTE: tcp_close potentially returns error when lingering.
1465  * However, the stream head currently does not pass these errors
1466  * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK
1467  * errors to the application (from tsleep()) and not errors
1468  * like ECONNRESET caused by receiving a reset packet.
1469  */
1470 
1471 /* ARGSUSED */
1472 void
1473 tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1474 {
1475 	char	*msg;
1476 	conn_t	*connp = (conn_t *)arg;
1477 	tcp_t	*tcp = connp->conn_tcp;
1478 	clock_t	delta = 0;
1479 	tcp_stack_t	*tcps = tcp->tcp_tcps;
1480 
1481 	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
1482 	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
1483 
1484 	mutex_enter(&tcp->tcp_eager_lock);
1485 	if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
1486 		/* Cleanup for listener */
1487 		tcp_eager_cleanup(tcp, 0);
1488 		tcp->tcp_wait_for_eagers = 1;
1489 	}
1490 	mutex_exit(&tcp->tcp_eager_lock);
1491 
1492 	tcp->tcp_lso = B_FALSE;
1493 
1494 	msg = NULL;
1495 	switch (tcp->tcp_state) {
1496 	case TCPS_CLOSED:
1497 	case TCPS_IDLE:
1498 	case TCPS_BOUND:
1499 	case TCPS_LISTEN:
1500 		break;
1501 	case TCPS_SYN_SENT:
1502 		msg = "tcp_close, during connect";
1503 		break;
1504 	case TCPS_SYN_RCVD:
1505 		/*
1506 		 * Close during the connect 3-way handshake
1507 		 * but here there may or may not be pending data
1508 		 * already on queue. Process almost same as in
1509 		 * the ESTABLISHED state.
1510 		 */
1511 		/* FALLTHRU */
1512 	default:
1513 		if (tcp->tcp_fused)
1514 			tcp_unfuse(tcp);
1515 
1516 		/*
1517 		 * If SO_LINGER has set a zero linger time, abort the
1518 		 * connection with a reset.
1519 		 */
1520 		if (connp->conn_linger && connp->conn_lingertime == 0) {
1521 			msg = "tcp_close, zero lingertime";
1522 			break;
1523 		}
1524 
1525 		/*
1526 		 * Abort connection if there is unread data queued.
1527 		 */
1528 		if (tcp->tcp_rcv_list || tcp->tcp_reass_head) {
1529 			msg = "tcp_close, unread data";
1530 			break;
1531 		}
1532 		/*
1533 		 * We have done a qwait() above which could have possibly
1534 		 * drained more messages in turn causing transition to a
1535 		 * different state. Check whether we have to do the rest
1536 		 * of the processing or not.
1537 		 */
1538 		if (tcp->tcp_state <= TCPS_LISTEN)
1539 			break;
1540 
1541 		/*
1542 		 * Transmit the FIN before detaching the tcp_t.
1543 		 * After tcp_detach returns this queue/perimeter
1544 		 * no longer owns the tcp_t thus others can modify it.
1545 		 */
1546 		(void) tcp_xmit_end(tcp);
1547 
1548 		/*
1549 		 * If lingering on close then wait until the fin is acked,
1550 		 * the SO_LINGER time passes, or a reset is sent/received.
1551 		 */
1552 		if (connp->conn_linger && connp->conn_lingertime > 0 &&
1553 		    !(tcp->tcp_fin_acked) &&
1554 		    tcp->tcp_state >= TCPS_ESTABLISHED) {
1555 			if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) {
1556 				tcp->tcp_client_errno = EWOULDBLOCK;
1557 			} else if (tcp->tcp_client_errno == 0) {
1558 
1559 				ASSERT(tcp->tcp_linger_tid == 0);
1560 
1561 				tcp->tcp_linger_tid = TCP_TIMER(tcp,
1562 				    tcp_close_linger_timeout,
1563 				    connp->conn_lingertime * hz);
1564 
1565 				/* tcp_close_linger_timeout will finish close */
1566 				if (tcp->tcp_linger_tid == 0)
1567 					tcp->tcp_client_errno = ENOSR;
1568 				else
1569 					return;
1570 			}
1571 
1572 			/*
1573 			 * Check if we need to detach or just close
1574 			 * the instance.
1575 			 */
1576 			if (tcp->tcp_state <= TCPS_LISTEN)
1577 				break;
1578 		}
1579 
1580 		/*
1581 		 * Make sure that no other thread will access the conn_rq of
1582 		 * this instance (through lookups etc.) as conn_rq will go
1583 		 * away shortly.
1584 		 */
1585 		tcp_acceptor_hash_remove(tcp);
1586 
1587 		mutex_enter(&tcp->tcp_non_sq_lock);
1588 		if (tcp->tcp_flow_stopped) {
1589 			tcp_clrqfull(tcp);
1590 		}
1591 		mutex_exit(&tcp->tcp_non_sq_lock);
1592 
1593 		if (tcp->tcp_timer_tid != 0) {
1594 			delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
1595 			tcp->tcp_timer_tid = 0;
1596 		}
1597 		/*
1598 		 * Need to cancel those timers which will not be used when
1599 		 * TCP is detached.  This has to be done before the conn_wq
1600 		 * is set to NULL.
1601 		 */
1602 		tcp_timers_stop(tcp);
1603 
1604 		tcp->tcp_detached = B_TRUE;
1605 		if (tcp->tcp_state == TCPS_TIME_WAIT) {
1606 			tcp_time_wait_append(tcp);
1607 			TCP_DBGSTAT(tcps, tcp_detach_time_wait);
1608 			ASSERT(connp->conn_ref >= 3);
1609 			goto finish;
1610 		}
1611 
1612 		/*
1613 		 * If delta is zero the timer event wasn't executed and was
1614 		 * successfully canceled. In this case we need to restart it
1615 		 * with the minimal delta possible.
1616 		 */
1617 		if (delta >= 0)
1618 			tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
1619 			    delta ? delta : 1);
1620 
1621 		ASSERT(connp->conn_ref >= 3);
1622 		goto finish;
1623 	}
1624 
1625 	/* Detach did not complete. Still need to remove q from stream. */
1626 	if (msg) {
1627 		if (tcp->tcp_state == TCPS_ESTABLISHED ||
1628 		    tcp->tcp_state == TCPS_CLOSE_WAIT)
1629 			TCPS_BUMP_MIB(tcps, tcpEstabResets);
1630 		if (tcp->tcp_state == TCPS_SYN_SENT ||
1631 		    tcp->tcp_state == TCPS_SYN_RCVD)
1632 			TCPS_BUMP_MIB(tcps, tcpAttemptFails);
1633 		tcp_xmit_ctl(msg, tcp,  tcp->tcp_snxt, 0, TH_RST);
1634 	}
1635 
1636 	tcp_closei_local(tcp);
1637 	CONN_DEC_REF(connp);
1638 	ASSERT(connp->conn_ref >= 2);
1639 
1640 finish:
1641 	mutex_enter(&tcp->tcp_closelock);
1642 	/*
1643 	 * Don't change the queues in the case of a listener that has
1644 	 * eagers in its q or q0. It could surprise the eagers.
1645 	 * Instead wait for the eagers outside the squeue.
1646 	 */
1647 	if (!tcp->tcp_wait_for_eagers) {
1648 		tcp->tcp_detached = B_TRUE;
1649 		connp->conn_rq = NULL;
1650 		connp->conn_wq = NULL;
1651 	}
1652 
1653 	/* Signal tcp_close() to finish closing. */
1654 	tcp->tcp_closed = 1;
1655 	cv_signal(&tcp->tcp_closecv);
1656 	mutex_exit(&tcp->tcp_closelock);
1657 }
1658 
1659 /* ARGSUSED */
1660 void
1661 tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1662 {
1663 	conn_t 	*connp = (conn_t *)arg;
1664 	tcp_t	*tcp = connp->conn_tcp;
1665 
1666 	freemsg(mp);
1667 
1668 	if (tcp->tcp_fused)
1669 		tcp_unfuse(tcp);
1670 
1671 	if (tcp_xmit_end(tcp) != 0) {
1672 		/*
1673 		 * We were crossing FINs and got a reset from
1674 		 * the other side. Just ignore it.
1675 		 */
1676 		if (connp->conn_debug) {
1677 			(void) strlog(TCP_MOD_ID, 0, 1,
1678 			    SL_ERROR|SL_TRACE,
1679 			    "tcp_shutdown_output() out of state %s",
1680 			    tcp_display(tcp, NULL, DISP_ADDR_AND_PORT));
1681 		}
1682 	}
1683 }
1684 
1685 #pragma inline(tcp_send_data)
1686 
1687 void
1688 tcp_send_data(tcp_t *tcp, mblk_t *mp)
1689 {
1690 	conn_t		*connp = tcp->tcp_connp;
1691 
1692 	/*
1693 	 * Check here to avoid sending zero-copy message down to IP when
1694 	 * ZEROCOPY capability has turned off. We only need to deal with
1695 	 * the race condition between sockfs and the notification here.
1696 	 * Since we have tried to backoff the tcp_xmit_head when turning
1697 	 * zero-copy off and new messages in tcp_output(), we simply drop
1698 	 * the dup'ed packet here and let tcp retransmit, if tcp_xmit_zc_clean
1699 	 * is not true.
1700 	 */
1701 	if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on &&
1702 	    !tcp->tcp_xmit_zc_clean) {
1703 		ip_drop_output("TCP ZC was disabled but not clean", mp, NULL);
1704 		freemsg(mp);
1705 		return;
1706 	}
1707 
1708 	ASSERT(connp->conn_ixa->ixa_notify_cookie == connp->conn_tcp);
1709 	(void) conn_ip_output(mp, connp->conn_ixa);
1710 }
1711 
1712 /* ARGSUSED2 */
1713 void
1714 tcp_send_synack(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1715 {
1716 	conn_t	*econnp = (conn_t *)arg;
1717 	tcp_t	*tcp = econnp->conn_tcp;
1718 
1719 	/* Guard against a RST having blown it away while on the squeue */
1720 	if (tcp->tcp_state == TCPS_CLOSED) {
1721 		freemsg(mp);
1722 		return;
1723 	}
1724 
1725 	(void) conn_ip_output(mp, econnp->conn_ixa);
1726 }
1727 
1728 /*
1729  * tcp_send() is called by tcp_wput_data() and returns one of the following:
1730  *
1731  * -1 = failed allocation.
1732  *  0 = success; burst count reached, or usable send window is too small,
1733  *      and that we'd rather wait until later before sending again.
1734  */
1735 static int
1736 tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
1737     const int tcp_hdr_len, const int num_sack_blk, int *usable,
1738     uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
1739 {
1740 	int		num_burst_seg = tcp->tcp_snd_burst;
1741 	int		num_lso_seg = 1;
1742 	uint_t		lso_usable;
1743 	boolean_t	do_lso_send = B_FALSE;
1744 	tcp_stack_t	*tcps = tcp->tcp_tcps;
1745 	conn_t		*connp = tcp->tcp_connp;
1746 	ip_xmit_attr_t	*ixa = connp->conn_ixa;
1747 
1748 	/*
1749 	 * Check LSO possibility. The value of tcp->tcp_lso indicates whether
1750 	 * the underlying connection is LSO capable. Will check whether having
1751 	 * enough available data to initiate LSO transmission in the for(){}
1752 	 * loops.
1753 	 */
1754 	if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0)
1755 		do_lso_send = B_TRUE;
1756 
1757 	for (;;) {
1758 		struct datab	*db;
1759 		tcpha_t		*tcpha;
1760 		uint32_t	sum;
1761 		mblk_t		*mp, *mp1;
1762 		uchar_t		*rptr;
1763 		int		len;
1764 
1765 		/*
1766 		 * Burst count reached, return successfully.
1767 		 */
1768 		if (num_burst_seg == 0)
1769 			break;
1770 
1771 		/*
1772 		 * Calculate the maximum payload length we can send at one
1773 		 * time.
1774 		 */
1775 		if (do_lso_send) {
1776 			/*
1777 			 * Check whether be able to to do LSO for the current
1778 			 * available data.
1779 			 */
1780 			if (num_burst_seg >= 2 && (*usable - 1) / mss >= 1) {
1781 				lso_usable = MIN(tcp->tcp_lso_max, *usable);
1782 				lso_usable = MIN(lso_usable,
1783 				    num_burst_seg * mss);
1784 
1785 				num_lso_seg = lso_usable / mss;
1786 				if (lso_usable % mss) {
1787 					num_lso_seg++;
1788 					tcp->tcp_last_sent_len = (ushort_t)
1789 					    (lso_usable % mss);
1790 				} else {
1791 					tcp->tcp_last_sent_len = (ushort_t)mss;
1792 				}
1793 			} else {
1794 				do_lso_send = B_FALSE;
1795 				num_lso_seg = 1;
1796 				lso_usable = mss;
1797 			}
1798 		}
1799 
1800 		ASSERT(num_lso_seg <= IP_MAXPACKET / mss + 1);
1801 #ifdef DEBUG
1802 		DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, boolean_t,
1803 		    do_lso_send);
1804 #endif
1805 		/*
1806 		 * Adjust num_burst_seg here.
1807 		 */
1808 		num_burst_seg -= num_lso_seg;
1809 
1810 		len = mss;
1811 		if (len > *usable) {
1812 			ASSERT(do_lso_send == B_FALSE);
1813 
1814 			len = *usable;
1815 			if (len <= 0) {
1816 				/* Terminate the loop */
1817 				break;	/* success; too small */
1818 			}
1819 			/*
1820 			 * Sender silly-window avoidance.
1821 			 * Ignore this if we are going to send a
1822 			 * zero window probe out.
1823 			 *
1824 			 * TODO: force data into microscopic window?
1825 			 *	==> (!pushed || (unsent > usable))
1826 			 */
1827 			if (len < (tcp->tcp_max_swnd >> 1) &&
1828 			    (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) > len &&
1829 			    !((tcp->tcp_valid_bits & TCP_URG_VALID) &&
1830 			    len == 1) && (! tcp->tcp_zero_win_probe)) {
1831 				/*
1832 				 * If the retransmit timer is not running
1833 				 * we start it so that we will retransmit
1834 				 * in the case when the receiver has
1835 				 * decremented the window.
1836 				 */
1837 				if (*snxt == tcp->tcp_snxt &&
1838 				    *snxt == tcp->tcp_suna) {
1839 					/*
1840 					 * We are not supposed to send
1841 					 * anything.  So let's wait a little
1842 					 * bit longer before breaking SWS
1843 					 * avoidance.
1844 					 *
1845 					 * What should the value be?
1846 					 * Suggestion: MAX(init rexmit time,
1847 					 * tcp->tcp_rto)
1848 					 */
1849 					TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
1850 				}
1851 				break;	/* success; too small */
1852 			}
1853 		}
1854 
1855 		tcpha = tcp->tcp_tcpha;
1856 
1857 		/*
1858 		 * The reason to adjust len here is that we need to set flags
1859 		 * and calculate checksum.
1860 		 */
1861 		if (do_lso_send)
1862 			len = lso_usable;
1863 
1864 		*usable -= len; /* Approximate - can be adjusted later */
1865 		if (*usable > 0)
1866 			tcpha->tha_flags = TH_ACK;
1867 		else
1868 			tcpha->tha_flags = (TH_ACK | TH_PUSH);
1869 
1870 		/*
1871 		 * Prime pump for IP's checksumming on our behalf.
1872 		 * Include the adjustment for a source route if any.
1873 		 * In case of LSO, the partial pseudo-header checksum should
1874 		 * exclusive TCP length, so zero tha_sum before IP calculate
1875 		 * pseudo-header checksum for partial checksum offload.
1876 		 */
1877 		if (do_lso_send) {
1878 			sum = 0;
1879 		} else {
1880 			sum = len + tcp_hdr_len + connp->conn_sum;
1881 			sum = (sum >> 16) + (sum & 0xFFFF);
1882 		}
1883 		tcpha->tha_sum = htons(sum);
1884 		tcpha->tha_seq = htonl(*snxt);
1885 
1886 		/*
1887 		 * Branch off to tcp_xmit_mp() if any of the VALID bits is
1888 		 * set.  For the case when TCP_FSS_VALID is the only valid
1889 		 * bit (normal active close), branch off only when we think
1890 		 * that the FIN flag needs to be set.  Note for this case,
1891 		 * that (snxt + len) may not reflect the actual seg_len,
1892 		 * as len may be further reduced in tcp_xmit_mp().  If len
1893 		 * gets modified, we will end up here again.
1894 		 */
1895 		if (tcp->tcp_valid_bits != 0 &&
1896 		    (tcp->tcp_valid_bits != TCP_FSS_VALID ||
1897 		    ((*snxt + len) == tcp->tcp_fss))) {
1898 			uchar_t		*prev_rptr;
1899 			uint32_t	prev_snxt = tcp->tcp_snxt;
1900 
1901 			if (*tail_unsent == 0) {
1902 				ASSERT((*xmit_tail)->b_cont != NULL);
1903 				*xmit_tail = (*xmit_tail)->b_cont;
1904 				prev_rptr = (*xmit_tail)->b_rptr;
1905 				*tail_unsent = (int)((*xmit_tail)->b_wptr -
1906 				    (*xmit_tail)->b_rptr);
1907 			} else {
1908 				prev_rptr = (*xmit_tail)->b_rptr;
1909 				(*xmit_tail)->b_rptr = (*xmit_tail)->b_wptr -
1910 				    *tail_unsent;
1911 			}
1912 			mp = tcp_xmit_mp(tcp, *xmit_tail, len, NULL, NULL,
1913 			    *snxt, B_FALSE, (uint32_t *)&len, B_FALSE);
1914 			/* Restore tcp_snxt so we get amount sent right. */
1915 			tcp->tcp_snxt = prev_snxt;
1916 			if (prev_rptr == (*xmit_tail)->b_rptr) {
1917 				/*
1918 				 * If the previous timestamp is still in use,
1919 				 * don't stomp on it.
1920 				 */
1921 				if ((*xmit_tail)->b_next == NULL) {
1922 					(*xmit_tail)->b_prev = local_time;
1923 					(*xmit_tail)->b_next =
1924 					    (mblk_t *)(uintptr_t)(*snxt);
1925 				}
1926 			} else
1927 				(*xmit_tail)->b_rptr = prev_rptr;
1928 
1929 			if (mp == NULL) {
1930 				return (-1);
1931 			}
1932 			mp1 = mp->b_cont;
1933 
1934 			if (len <= mss) /* LSO is unusable (!do_lso_send) */
1935 				tcp->tcp_last_sent_len = (ushort_t)len;
1936 			while (mp1->b_cont) {
1937 				*xmit_tail = (*xmit_tail)->b_cont;
1938 				(*xmit_tail)->b_prev = local_time;
1939 				(*xmit_tail)->b_next =
1940 				    (mblk_t *)(uintptr_t)(*snxt);
1941 				mp1 = mp1->b_cont;
1942 			}
1943 			*snxt += len;
1944 			*tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr;
1945 			BUMP_LOCAL(tcp->tcp_obsegs);
1946 			TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1947 			TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1948 			tcp_send_data(tcp, mp);
1949 			continue;
1950 		}
1951 
1952 		*snxt += len;	/* Adjust later if we don't send all of len */
1953 		TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
1954 		TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len);
1955 
1956 		if (*tail_unsent) {
1957 			/* Are the bytes above us in flight? */
1958 			rptr = (*xmit_tail)->b_wptr - *tail_unsent;
1959 			if (rptr != (*xmit_tail)->b_rptr) {
1960 				*tail_unsent -= len;
1961 				if (len <= mss) /* LSO is unusable */
1962 					tcp->tcp_last_sent_len = (ushort_t)len;
1963 				len += total_hdr_len;
1964 				ixa->ixa_pktlen = len;
1965 
1966 				if (ixa->ixa_flags & IXAF_IS_IPV4) {
1967 					tcp->tcp_ipha->ipha_length = htons(len);
1968 				} else {
1969 					tcp->tcp_ip6h->ip6_plen =
1970 					    htons(len - IPV6_HDR_LEN);
1971 				}
1972 
1973 				mp = dupb(*xmit_tail);
1974 				if (mp == NULL) {
1975 					return (-1);	/* out_of_mem */
1976 				}
1977 				mp->b_rptr = rptr;
1978 				/*
1979 				 * If the old timestamp is no longer in use,
1980 				 * sample a new timestamp now.
1981 				 */
1982 				if ((*xmit_tail)->b_next == NULL) {
1983 					(*xmit_tail)->b_prev = local_time;
1984 					(*xmit_tail)->b_next =
1985 					    (mblk_t *)(uintptr_t)(*snxt-len);
1986 				}
1987 				goto must_alloc;
1988 			}
1989 		} else {
1990 			*xmit_tail = (*xmit_tail)->b_cont;
1991 			ASSERT((uintptr_t)((*xmit_tail)->b_wptr -
1992 			    (*xmit_tail)->b_rptr) <= (uintptr_t)INT_MAX);
1993 			*tail_unsent = (int)((*xmit_tail)->b_wptr -
1994 			    (*xmit_tail)->b_rptr);
1995 		}
1996 
1997 		(*xmit_tail)->b_prev = local_time;
1998 		(*xmit_tail)->b_next = (mblk_t *)(uintptr_t)(*snxt - len);
1999 
2000 		*tail_unsent -= len;
2001 		if (len <= mss) /* LSO is unusable (!do_lso_send) */
2002 			tcp->tcp_last_sent_len = (ushort_t)len;
2003 
2004 		len += total_hdr_len;
2005 		ixa->ixa_pktlen = len;
2006 
2007 		if (ixa->ixa_flags & IXAF_IS_IPV4) {
2008 			tcp->tcp_ipha->ipha_length = htons(len);
2009 		} else {
2010 			tcp->tcp_ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2011 		}
2012 
2013 		mp = dupb(*xmit_tail);
2014 		if (mp == NULL) {
2015 			return (-1);	/* out_of_mem */
2016 		}
2017 
2018 		len = total_hdr_len;
2019 		/*
2020 		 * There are four reasons to allocate a new hdr mblk:
2021 		 *  1) The bytes above us are in use by another packet
2022 		 *  2) We don't have good alignment
2023 		 *  3) The mblk is being shared
2024 		 *  4) We don't have enough room for a header
2025 		 */
2026 		rptr = mp->b_rptr - len;
2027 		if (!OK_32PTR(rptr) ||
2028 		    ((db = mp->b_datap), db->db_ref != 2) ||
2029 		    rptr < db->db_base) {
2030 			/* NOTE: we assume allocb returns an OK_32PTR */
2031 
2032 		must_alloc:;
2033 			mp1 = allocb(connp->conn_ht_iphc_allocated +
2034 			    tcps->tcps_wroff_xtra, BPRI_MED);
2035 			if (mp1 == NULL) {
2036 				freemsg(mp);
2037 				return (-1);	/* out_of_mem */
2038 			}
2039 			mp1->b_cont = mp;
2040 			mp = mp1;
2041 			/* Leave room for Link Level header */
2042 			len = total_hdr_len;
2043 			rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
2044 			mp->b_wptr = &rptr[len];
2045 		}
2046 
2047 		/*
2048 		 * Fill in the header using the template header, and add
2049 		 * options such as time-stamp, ECN and/or SACK, as needed.
2050 		 */
2051 		tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk);
2052 
2053 		mp->b_rptr = rptr;
2054 
2055 		if (*tail_unsent) {
2056 			int spill = *tail_unsent;
2057 
2058 			mp1 = mp->b_cont;
2059 			if (mp1 == NULL)
2060 				mp1 = mp;
2061 
2062 			/*
2063 			 * If we're a little short, tack on more mblks until
2064 			 * there is no more spillover.
2065 			 */
2066 			while (spill < 0) {
2067 				mblk_t *nmp;
2068 				int nmpsz;
2069 
2070 				nmp = (*xmit_tail)->b_cont;
2071 				nmpsz = MBLKL(nmp);
2072 
2073 				/*
2074 				 * Excess data in mblk; can we split it?
2075 				 * If LSO is enabled for the connection,
2076 				 * keep on splitting as this is a transient
2077 				 * send path.
2078 				 */
2079 				if (!do_lso_send && (spill + nmpsz > 0)) {
2080 					/*
2081 					 * Don't split if stream head was
2082 					 * told to break up larger writes
2083 					 * into smaller ones.
2084 					 */
2085 					if (tcp->tcp_maxpsz_multiplier > 0)
2086 						break;
2087 
2088 					/*
2089 					 * Next mblk is less than SMSS/2
2090 					 * rounded up to nearest 64-byte;
2091 					 * let it get sent as part of the
2092 					 * next segment.
2093 					 */
2094 					if (tcp->tcp_localnet &&
2095 					    !tcp->tcp_cork &&
2096 					    (nmpsz < roundup((mss >> 1), 64)))
2097 						break;
2098 				}
2099 
2100 				*xmit_tail = nmp;
2101 				ASSERT((uintptr_t)nmpsz <= (uintptr_t)INT_MAX);
2102 				/* Stash for rtt use later */
2103 				(*xmit_tail)->b_prev = local_time;
2104 				(*xmit_tail)->b_next =
2105 				    (mblk_t *)(uintptr_t)(*snxt - len);
2106 				mp1->b_cont = dupb(*xmit_tail);
2107 				mp1 = mp1->b_cont;
2108 
2109 				spill += nmpsz;
2110 				if (mp1 == NULL) {
2111 					*tail_unsent = spill;
2112 					freemsg(mp);
2113 					return (-1);	/* out_of_mem */
2114 				}
2115 			}
2116 
2117 			/* Trim back any surplus on the last mblk */
2118 			if (spill >= 0) {
2119 				mp1->b_wptr -= spill;
2120 				*tail_unsent = spill;
2121 			} else {
2122 				/*
2123 				 * We did not send everything we could in
2124 				 * order to remain within the b_cont limit.
2125 				 */
2126 				*usable -= spill;
2127 				*snxt += spill;
2128 				tcp->tcp_last_sent_len += spill;
2129 				TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, spill);
2130 				/*
2131 				 * Adjust the checksum
2132 				 */
2133 				tcpha = (tcpha_t *)(rptr +
2134 				    ixa->ixa_ip_hdr_length);
2135 				sum += spill;
2136 				sum = (sum >> 16) + (sum & 0xFFFF);
2137 				tcpha->tha_sum = htons(sum);
2138 				if (connp->conn_ipversion == IPV4_VERSION) {
2139 					sum = ntohs(
2140 					    ((ipha_t *)rptr)->ipha_length) +
2141 					    spill;
2142 					((ipha_t *)rptr)->ipha_length =
2143 					    htons(sum);
2144 				} else {
2145 					sum = ntohs(
2146 					    ((ip6_t *)rptr)->ip6_plen) +
2147 					    spill;
2148 					((ip6_t *)rptr)->ip6_plen =
2149 					    htons(sum);
2150 				}
2151 				ixa->ixa_pktlen += spill;
2152 				*tail_unsent = 0;
2153 			}
2154 		}
2155 		if (tcp->tcp_ip_forward_progress) {
2156 			tcp->tcp_ip_forward_progress = B_FALSE;
2157 			ixa->ixa_flags |= IXAF_REACH_CONF;
2158 		} else {
2159 			ixa->ixa_flags &= ~IXAF_REACH_CONF;
2160 		}
2161 
2162 		if (do_lso_send) {
2163 			/* Append LSO information to the mp. */
2164 			lso_info_set(mp, mss, HW_LSO);
2165 			ixa->ixa_fragsize = IP_MAXPACKET;
2166 			ixa->ixa_extra_ident = num_lso_seg - 1;
2167 
2168 			DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg,
2169 			    boolean_t, B_TRUE);
2170 
2171 			tcp_send_data(tcp, mp);
2172 
2173 			/*
2174 			 * Restore values of ixa_fragsize and ixa_extra_ident.
2175 			 */
2176 			ixa->ixa_fragsize = ixa->ixa_pmtu;
2177 			ixa->ixa_extra_ident = 0;
2178 			tcp->tcp_obsegs += num_lso_seg;
2179 			TCP_STAT(tcps, tcp_lso_times);
2180 			TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg);
2181 		} else {
2182 			/*
2183 			 * Make sure to clean up LSO information. Wherever a
2184 			 * new mp uses the prepended header room after dupb(),
2185 			 * lso_info_cleanup() should be called.
2186 			 */
2187 			lso_info_cleanup(mp);
2188 			tcp_send_data(tcp, mp);
2189 			BUMP_LOCAL(tcp->tcp_obsegs);
2190 		}
2191 	}
2192 
2193 	return (0);
2194 }
2195 
2196 /*
2197  * Initiate closedown sequence on an active connection.  (May be called as
2198  * writer.)  Return value zero for OK return, non-zero for error return.
2199  */
2200 static int
2201 tcp_xmit_end(tcp_t *tcp)
2202 {
2203 	mblk_t		*mp;
2204 	tcp_stack_t	*tcps = tcp->tcp_tcps;
2205 	iulp_t		uinfo;
2206 	ip_stack_t	*ipst = tcps->tcps_netstack->netstack_ip;
2207 	conn_t		*connp = tcp->tcp_connp;
2208 
2209 	if (tcp->tcp_state < TCPS_SYN_RCVD ||
2210 	    tcp->tcp_state > TCPS_CLOSE_WAIT) {
2211 		/*
2212 		 * Invalid state, only states TCPS_SYN_RCVD,
2213 		 * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid
2214 		 */
2215 		return (-1);
2216 	}
2217 
2218 	tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent;
2219 	tcp->tcp_valid_bits |= TCP_FSS_VALID;
2220 	/*
2221 	 * If there is nothing more unsent, send the FIN now.
2222 	 * Otherwise, it will go out with the last segment.
2223 	 */
2224 	if (tcp->tcp_unsent == 0) {
2225 		mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL,
2226 		    tcp->tcp_fss, B_FALSE, NULL, B_FALSE);
2227 
2228 		if (mp) {
2229 			tcp_send_data(tcp, mp);
2230 		} else {
2231 			/*
2232 			 * Couldn't allocate msg.  Pretend we got it out.
2233 			 * Wait for rexmit timeout.
2234 			 */
2235 			tcp->tcp_snxt = tcp->tcp_fss + 1;
2236 			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
2237 		}
2238 
2239 		/*
2240 		 * If needed, update tcp_rexmit_snxt as tcp_snxt is
2241 		 * changed.
2242 		 */
2243 		if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) {
2244 			tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
2245 		}
2246 	} else {
2247 		/*
2248 		 * If tcp->tcp_cork is set, then the data will not get sent,
2249 		 * so we have to check that and unset it first.
2250 		 */
2251 		if (tcp->tcp_cork)
2252 			tcp->tcp_cork = B_FALSE;
2253 		tcp_wput_data(tcp, NULL, B_FALSE);
2254 	}
2255 
2256 	/*
2257 	 * If TCP does not get enough samples of RTT or tcp_rtt_updates
2258 	 * is 0, don't update the cache.
2259 	 */
2260 	if (tcps->tcps_rtt_updates == 0 ||
2261 	    tcp->tcp_rtt_update < tcps->tcps_rtt_updates)
2262 		return (0);
2263 
2264 	/*
2265 	 * We do not have a good algorithm to update ssthresh at this time.
2266 	 * So don't do any update.
2267 	 */
2268 	bzero(&uinfo, sizeof (uinfo));
2269 	uinfo.iulp_rtt = tcp->tcp_rtt_sa;
2270 	uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd;
2271 
2272 	/*
2273 	 * Note that uinfo is kept for conn_faddr in the DCE. Could update even
2274 	 * if source routed but we don't.
2275 	 */
2276 	if (connp->conn_ipversion == IPV4_VERSION) {
2277 		if (connp->conn_faddr_v4 !=  tcp->tcp_ipha->ipha_dst) {
2278 			return (0);
2279 		}
2280 		(void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst);
2281 	} else {
2282 		uint_t ifindex;
2283 
2284 		if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
2285 		    &tcp->tcp_ip6h->ip6_dst))) {
2286 			return (0);
2287 		}
2288 		ifindex = 0;
2289 		if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) {
2290 			ip_xmit_attr_t *ixa = connp->conn_ixa;
2291 
2292 			/*
2293 			 * If we are going to create a DCE we'd better have
2294 			 * an ifindex
2295 			 */
2296 			if (ixa->ixa_nce != NULL) {
2297 				ifindex = ixa->ixa_nce->nce_common->ncec_ill->
2298 				    ill_phyint->phyint_ifindex;
2299 			} else {
2300 				return (0);
2301 			}
2302 		}
2303 
2304 		(void) dce_update_uinfo(&connp->conn_faddr_v6, ifindex, &uinfo,
2305 		    ipst);
2306 	}
2307 	return (0);
2308 }
2309 
2310 /*
2311  * Send out a control packet on the tcp connection specified.  This routine
2312  * is typically called where we need a simple ACK or RST generated.
2313  */
2314 void
2315 tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl)
2316 {
2317 	uchar_t		*rptr;
2318 	tcpha_t		*tcpha;
2319 	ipha_t		*ipha = NULL;
2320 	ip6_t		*ip6h = NULL;
2321 	uint32_t	sum;
2322 	int		total_hdr_len;
2323 	int		ip_hdr_len;
2324 	mblk_t		*mp;
2325 	tcp_stack_t	*tcps = tcp->tcp_tcps;
2326 	conn_t		*connp = tcp->tcp_connp;
2327 	ip_xmit_attr_t	*ixa = connp->conn_ixa;
2328 
2329 	/*
2330 	 * Save sum for use in source route later.
2331 	 */
2332 	sum = connp->conn_ht_ulp_len + connp->conn_sum;
2333 	total_hdr_len = connp->conn_ht_iphc_len;
2334 	ip_hdr_len = ixa->ixa_ip_hdr_length;
2335 
2336 	/* If a text string is passed in with the request, pass it to strlog. */
2337 	if (str != NULL && connp->conn_debug) {
2338 		(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
2339 		    "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x",
2340 		    str, seq, ack, ctl);
2341 	}
2342 	mp = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra,
2343 	    BPRI_MED);
2344 	if (mp == NULL) {
2345 		return;
2346 	}
2347 	rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
2348 	mp->b_rptr = rptr;
2349 	mp->b_wptr = &rptr[total_hdr_len];
2350 	bcopy(connp->conn_ht_iphc, rptr, total_hdr_len);
2351 
2352 	ixa->ixa_pktlen = total_hdr_len;
2353 
2354 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
2355 		ipha = (ipha_t *)rptr;
2356 		ipha->ipha_length = htons(total_hdr_len);
2357 	} else {
2358 		ip6h = (ip6_t *)rptr;
2359 		ip6h->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN);
2360 	}
2361 	tcpha = (tcpha_t *)&rptr[ip_hdr_len];
2362 	tcpha->tha_flags = (uint8_t)ctl;
2363 	if (ctl & TH_RST) {
2364 		TCPS_BUMP_MIB(tcps, tcpOutRsts);
2365 		TCPS_BUMP_MIB(tcps, tcpOutControl);
2366 		/*
2367 		 * Don't send TSopt w/ TH_RST packets per RFC 1323.
2368 		 */
2369 		if (tcp->tcp_snd_ts_ok &&
2370 		    tcp->tcp_state > TCPS_SYN_SENT) {
2371 			mp->b_wptr = &rptr[total_hdr_len - TCPOPT_REAL_TS_LEN];
2372 			*(mp->b_wptr) = TCPOPT_EOL;
2373 
2374 			ixa->ixa_pktlen = total_hdr_len - TCPOPT_REAL_TS_LEN;
2375 
2376 			if (connp->conn_ipversion == IPV4_VERSION) {
2377 				ipha->ipha_length = htons(total_hdr_len -
2378 				    TCPOPT_REAL_TS_LEN);
2379 			} else {
2380 				ip6h->ip6_plen = htons(total_hdr_len -
2381 				    IPV6_HDR_LEN - TCPOPT_REAL_TS_LEN);
2382 			}
2383 			tcpha->tha_offset_and_reserved -= (3 << 4);
2384 			sum -= TCPOPT_REAL_TS_LEN;
2385 		}
2386 	}
2387 	if (ctl & TH_ACK) {
2388 		if (tcp->tcp_snd_ts_ok) {
2389 			uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
2390 
2391 			U32_TO_BE32(llbolt,
2392 			    (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
2393 			U32_TO_BE32(tcp->tcp_ts_recent,
2394 			    (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
2395 		}
2396 
2397 		/* Update the latest receive window size in TCP header. */
2398 		tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
2399 		/* Track what we sent to the peer */
2400 		tcp->tcp_tcpha->tha_win = tcpha->tha_win;
2401 		tcp->tcp_rack = ack;
2402 		tcp->tcp_rack_cnt = 0;
2403 		TCPS_BUMP_MIB(tcps, tcpOutAck);
2404 	}
2405 	BUMP_LOCAL(tcp->tcp_obsegs);
2406 	tcpha->tha_seq = htonl(seq);
2407 	tcpha->tha_ack = htonl(ack);
2408 	/*
2409 	 * Include the adjustment for a source route if any.
2410 	 */
2411 	sum = (sum >> 16) + (sum & 0xFFFF);
2412 	tcpha->tha_sum = htons(sum);
2413 	tcp_send_data(tcp, mp);
2414 }
2415 
2416 /*
2417  * Generate a reset based on an inbound packet, connp is set by caller
2418  * when RST is in response to an unexpected inbound packet for which
2419  * there is active tcp state in the system.
2420  *
2421  * IPSEC NOTE : Try to send the reply with the same protection as it came
2422  * in.  We have the ip_recv_attr_t which is reversed to form the ip_xmit_attr_t.
2423  * That way the packet will go out at the same level of protection as it
2424  * came in with.
2425  */
2426 static void
2427 tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, uint32_t ack, int ctl,
2428     ip_recv_attr_t *ira, ip_stack_t *ipst, conn_t *connp)
2429 {
2430 	ipha_t		*ipha = NULL;
2431 	ip6_t		*ip6h = NULL;
2432 	ushort_t	len;
2433 	tcpha_t		*tcpha;
2434 	int		i;
2435 	ipaddr_t	v4addr;
2436 	in6_addr_t	v6addr;
2437 	netstack_t	*ns = ipst->ips_netstack;
2438 	tcp_stack_t	*tcps = ns->netstack_tcp;
2439 	ip_xmit_attr_t	ixas, *ixa;
2440 	uint_t		ip_hdr_len = ira->ira_ip_hdr_length;
2441 	boolean_t	need_refrele = B_FALSE;		/* ixa_refrele(ixa) */
2442 	ushort_t	port;
2443 
2444 	if (!tcp_send_rst_chk(tcps)) {
2445 		TCP_STAT(tcps, tcp_rst_unsent);
2446 		freemsg(mp);
2447 		return;
2448 	}
2449 
2450 	/*
2451 	 * If connp != NULL we use conn_ixa to keep IP_NEXTHOP and other
2452 	 * options from the listener. In that case the caller must ensure that
2453 	 * we are running on the listener = connp squeue.
2454 	 *
2455 	 * We get a safe copy of conn_ixa so we don't need to restore anything
2456 	 * we or ip_output_simple might change in the ixa.
2457 	 */
2458 	if (connp != NULL) {
2459 		ASSERT(connp->conn_on_sqp);
2460 
2461 		ixa = conn_get_ixa_exclusive(connp);
2462 		if (ixa == NULL) {
2463 			TCP_STAT(tcps, tcp_rst_unsent);
2464 			freemsg(mp);
2465 			return;
2466 		}
2467 		need_refrele = B_TRUE;
2468 	} else {
2469 		bzero(&ixas, sizeof (ixas));
2470 		ixa = &ixas;
2471 		/*
2472 		 * IXAF_VERIFY_SOURCE is overkill since we know the
2473 		 * packet was for us.
2474 		 */
2475 		ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE;
2476 		ixa->ixa_protocol = IPPROTO_TCP;
2477 		ixa->ixa_zoneid = ira->ira_zoneid;
2478 		ixa->ixa_ifindex = 0;
2479 		ixa->ixa_ipst = ipst;
2480 		ixa->ixa_cred = kcred;
2481 		ixa->ixa_cpid = NOPID;
2482 	}
2483 
2484 	if (str && tcps->tcps_dbg) {
2485 		(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
2486 		    "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, "
2487 		    "flags 0x%x",
2488 		    str, seq, ack, ctl);
2489 	}
2490 	if (mp->b_datap->db_ref != 1) {
2491 		mblk_t *mp1 = copyb(mp);
2492 		freemsg(mp);
2493 		mp = mp1;
2494 		if (mp == NULL)
2495 			goto done;
2496 	} else if (mp->b_cont) {
2497 		freemsg(mp->b_cont);
2498 		mp->b_cont = NULL;
2499 		DB_CKSUMFLAGS(mp) = 0;
2500 	}
2501 	/*
2502 	 * We skip reversing source route here.
2503 	 * (for now we replace all IP options with EOL)
2504 	 */
2505 	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
2506 		ipha = (ipha_t *)mp->b_rptr;
2507 		for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++)
2508 			mp->b_rptr[i] = IPOPT_EOL;
2509 		/*
2510 		 * Make sure that src address isn't flagrantly invalid.
2511 		 * Not all broadcast address checking for the src address
2512 		 * is possible, since we don't know the netmask of the src
2513 		 * addr.  No check for destination address is done, since
2514 		 * IP will not pass up a packet with a broadcast dest
2515 		 * address to TCP.  Similar checks are done below for IPv6.
2516 		 */
2517 		if (ipha->ipha_src == 0 || ipha->ipha_src == INADDR_BROADCAST ||
2518 		    CLASSD(ipha->ipha_src)) {
2519 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
2520 			ip_drop_input("ipIfStatsInDiscards", mp, NULL);
2521 			freemsg(mp);
2522 			goto done;
2523 		}
2524 	} else {
2525 		ip6h = (ip6_t *)mp->b_rptr;
2526 
2527 		if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) ||
2528 		    IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) {
2529 			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards);
2530 			ip_drop_input("ipIfStatsInDiscards", mp, NULL);
2531 			freemsg(mp);
2532 			goto done;
2533 		}
2534 
2535 		/* Remove any extension headers assuming partial overlay */
2536 		if (ip_hdr_len > IPV6_HDR_LEN) {
2537 			uint8_t *to;
2538 
2539 			to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN;
2540 			ovbcopy(ip6h, to, IPV6_HDR_LEN);
2541 			mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN;
2542 			ip_hdr_len = IPV6_HDR_LEN;
2543 			ip6h = (ip6_t *)mp->b_rptr;
2544 			ip6h->ip6_nxt = IPPROTO_TCP;
2545 		}
2546 	}
2547 	tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len];
2548 	if (tcpha->tha_flags & TH_RST) {
2549 		freemsg(mp);
2550 		goto done;
2551 	}
2552 	tcpha->tha_offset_and_reserved = (5 << 4);
2553 	len = ip_hdr_len + sizeof (tcpha_t);
2554 	mp->b_wptr = &mp->b_rptr[len];
2555 	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
2556 		ipha->ipha_length = htons(len);
2557 		/* Swap addresses */
2558 		v4addr = ipha->ipha_src;
2559 		ipha->ipha_src = ipha->ipha_dst;
2560 		ipha->ipha_dst = v4addr;
2561 		ipha->ipha_ident = 0;
2562 		ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl;
2563 		ixa->ixa_flags |= IXAF_IS_IPV4;
2564 		ixa->ixa_ip_hdr_length = ip_hdr_len;
2565 	} else {
2566 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2567 		/* Swap addresses */
2568 		v6addr = ip6h->ip6_src;
2569 		ip6h->ip6_src = ip6h->ip6_dst;
2570 		ip6h->ip6_dst = v6addr;
2571 		ip6h->ip6_hops = (uchar_t)tcps->tcps_ipv6_hoplimit;
2572 		ixa->ixa_flags &= ~IXAF_IS_IPV4;
2573 
2574 		if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_dst)) {
2575 			ixa->ixa_flags |= IXAF_SCOPEID_SET;
2576 			ixa->ixa_scopeid = ira->ira_ruifindex;
2577 		}
2578 		ixa->ixa_ip_hdr_length = IPV6_HDR_LEN;
2579 	}
2580 	ixa->ixa_pktlen = len;
2581 
2582 	/* Swap the ports */
2583 	port = tcpha->tha_fport;
2584 	tcpha->tha_fport = tcpha->tha_lport;
2585 	tcpha->tha_lport = port;
2586 
2587 	tcpha->tha_ack = htonl(ack);
2588 	tcpha->tha_seq = htonl(seq);
2589 	tcpha->tha_win = 0;
2590 	tcpha->tha_sum = htons(sizeof (tcpha_t));
2591 	tcpha->tha_flags = (uint8_t)ctl;
2592 	if (ctl & TH_RST) {
2593 		TCPS_BUMP_MIB(tcps, tcpOutRsts);
2594 		TCPS_BUMP_MIB(tcps, tcpOutControl);
2595 	}
2596 
2597 	/* Discard any old label */
2598 	if (ixa->ixa_free_flags & IXA_FREE_TSL) {
2599 		ASSERT(ixa->ixa_tsl != NULL);
2600 		label_rele(ixa->ixa_tsl);
2601 		ixa->ixa_free_flags &= ~IXA_FREE_TSL;
2602 	}
2603 	ixa->ixa_tsl = ira->ira_tsl;	/* Behave as a multi-level responder */
2604 
2605 	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
2606 		/*
2607 		 * Apply IPsec based on how IPsec was applied to
2608 		 * the packet that caused the RST.
2609 		 */
2610 		if (!ipsec_in_to_out(ira, ixa, mp, ipha, ip6h)) {
2611 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
2612 			/* Note: mp already consumed and ip_drop_packet done */
2613 			goto done;
2614 		}
2615 	} else {
2616 		/*
2617 		 * This is in clear. The RST message we are building
2618 		 * here should go out in clear, independent of our policy.
2619 		 */
2620 		ixa->ixa_flags |= IXAF_NO_IPSEC;
2621 	}
2622 
2623 	/*
2624 	 * NOTE:  one might consider tracing a TCP packet here, but
2625 	 * this function has no active TCP state and no tcp structure
2626 	 * that has a trace buffer.  If we traced here, we would have
2627 	 * to keep a local trace buffer in tcp_record_trace().
2628 	 */
2629 
2630 	(void) ip_output_simple(mp, ixa);
2631 done:
2632 	ixa_cleanup(ixa);
2633 	if (need_refrele) {
2634 		ASSERT(ixa != &ixas);
2635 		ixa_refrele(ixa);
2636 	}
2637 }
2638 
2639 /*
2640  * Generate a "no listener here" RST in response to an "unknown" segment.
2641  * connp is set by caller when RST is in response to an unexpected
2642  * inbound packet for which there is active tcp state in the system.
2643  * Note that we are reusing the incoming mp to construct the outgoing RST.
2644  */
2645 void
2646 tcp_xmit_listeners_reset(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst,
2647     conn_t *connp)
2648 {
2649 	uchar_t		*rptr;
2650 	uint32_t	seg_len;
2651 	tcpha_t		*tcpha;
2652 	uint32_t	seg_seq;
2653 	uint32_t	seg_ack;
2654 	uint_t		flags;
2655 	ipha_t 		*ipha;
2656 	ip6_t 		*ip6h;
2657 	boolean_t	policy_present;
2658 	netstack_t	*ns = ipst->ips_netstack;
2659 	tcp_stack_t	*tcps = ns->netstack_tcp;
2660 	ipsec_stack_t	*ipss = tcps->tcps_netstack->netstack_ipsec;
2661 	uint_t		ip_hdr_len = ira->ira_ip_hdr_length;
2662 
2663 	TCP_STAT(tcps, tcp_no_listener);
2664 
2665 	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
2666 		policy_present = ipss->ipsec_inbound_v4_policy_present;
2667 		ipha = (ipha_t *)mp->b_rptr;
2668 		ip6h = NULL;
2669 	} else {
2670 		policy_present = ipss->ipsec_inbound_v6_policy_present;
2671 		ipha = NULL;
2672 		ip6h = (ip6_t *)mp->b_rptr;
2673 	}
2674 
2675 	if (policy_present) {
2676 		/*
2677 		 * The conn_t parameter is NULL because we already know
2678 		 * nobody's home.
2679 		 */
2680 		mp = ipsec_check_global_policy(mp, (conn_t *)NULL, ipha, ip6h,
2681 		    ira, ns);
2682 		if (mp == NULL)
2683 			return;
2684 	}
2685 	if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) {
2686 		DTRACE_PROBE2(
2687 		    tx__ip__log__error__nolistener__tcp,
2688 		    char *, "Could not reply with RST to mp(1)",
2689 		    mblk_t *, mp);
2690 		ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n"));
2691 		freemsg(mp);
2692 		return;
2693 	}
2694 
2695 	rptr = mp->b_rptr;
2696 
2697 	tcpha = (tcpha_t *)&rptr[ip_hdr_len];
2698 	seg_seq = ntohl(tcpha->tha_seq);
2699 	seg_ack = ntohl(tcpha->tha_ack);
2700 	flags = tcpha->tha_flags;
2701 
2702 	seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcpha) + ip_hdr_len);
2703 	if (flags & TH_RST) {
2704 		freemsg(mp);
2705 	} else if (flags & TH_ACK) {
2706 		tcp_xmit_early_reset("no tcp, reset", mp, seg_ack, 0, TH_RST,
2707 		    ira, ipst, connp);
2708 	} else {
2709 		if (flags & TH_SYN) {
2710 			seg_len++;
2711 		} else {
2712 			/*
2713 			 * Here we violate the RFC.  Note that a normal
2714 			 * TCP will never send a segment without the ACK
2715 			 * flag, except for RST or SYN segment.  This
2716 			 * segment is neither.  Just drop it on the
2717 			 * floor.
2718 			 */
2719 			freemsg(mp);
2720 			TCP_STAT(tcps, tcp_rst_unsent);
2721 			return;
2722 		}
2723 
2724 		tcp_xmit_early_reset("no tcp, reset/ack", mp, 0,
2725 		    seg_seq + seg_len, TH_RST | TH_ACK, ira, ipst, connp);
2726 	}
2727 }
2728 
2729 /*
2730  * tcp_xmit_mp is called to return a pointer to an mblk chain complete with
2731  * ip and tcp header ready to pass down to IP.  If the mp passed in is
2732  * non-NULL, then up to max_to_send bytes of data will be dup'ed off that
2733  * mblk. (If sendall is not set the dup'ing will stop at an mblk boundary
2734  * otherwise it will dup partial mblks.)
2735  * Otherwise, an appropriate ACK packet will be generated.  This
2736  * routine is not usually called to send new data for the first time.  It
2737  * is mostly called out of the timer for retransmits, and to generate ACKs.
2738  *
2739  * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will
2740  * be adjusted by *offset.  And after dupb(), the offset and the ending mblk
2741  * of the original mblk chain will be returned in *offset and *end_mp.
2742  */
2743 mblk_t *
2744 tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
2745     mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len,
2746     boolean_t rexmit)
2747 {
2748 	int	data_length;
2749 	int32_t	off = 0;
2750 	uint_t	flags;
2751 	mblk_t	*mp1;
2752 	mblk_t	*mp2;
2753 	uchar_t	*rptr;
2754 	tcpha_t	*tcpha;
2755 	int32_t	num_sack_blk = 0;
2756 	int32_t	sack_opt_len = 0;
2757 	tcp_stack_t	*tcps = tcp->tcp_tcps;
2758 	conn_t		*connp = tcp->tcp_connp;
2759 	ip_xmit_attr_t	*ixa = connp->conn_ixa;
2760 
2761 	/* Allocate for our maximum TCP header + link-level */
2762 	mp1 = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra,
2763 	    BPRI_MED);
2764 	if (!mp1)
2765 		return (NULL);
2766 	data_length = 0;
2767 
2768 	/*
2769 	 * Note that tcp_mss has been adjusted to take into account the
2770 	 * timestamp option if applicable.  Because SACK options do not
2771 	 * appear in every TCP segments and they are of variable lengths,
2772 	 * they cannot be included in tcp_mss.  Thus we need to calculate
2773 	 * the actual segment length when we need to send a segment which
2774 	 * includes SACK options.
2775 	 */
2776 	if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
2777 		num_sack_blk = MIN(tcp->tcp_max_sack_blk,
2778 		    tcp->tcp_num_sack_blk);
2779 		sack_opt_len = num_sack_blk * sizeof (sack_blk_t) +
2780 		    TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN;
2781 		if (max_to_send + sack_opt_len > tcp->tcp_mss)
2782 			max_to_send -= sack_opt_len;
2783 	}
2784 
2785 	if (offset != NULL) {
2786 		off = *offset;
2787 		/* We use offset as an indicator that end_mp is not NULL. */
2788 		*end_mp = NULL;
2789 	}
2790 	for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) {
2791 		/* This could be faster with cooperation from downstream */
2792 		if (mp2 != mp1 && !sendall &&
2793 		    data_length + (int)(mp->b_wptr - mp->b_rptr) >
2794 		    max_to_send)
2795 			/*
2796 			 * Don't send the next mblk since the whole mblk
2797 			 * does not fit.
2798 			 */
2799 			break;
2800 		mp2->b_cont = dupb(mp);
2801 		mp2 = mp2->b_cont;
2802 		if (!mp2) {
2803 			freemsg(mp1);
2804 			return (NULL);
2805 		}
2806 		mp2->b_rptr += off;
2807 		ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <=
2808 		    (uintptr_t)INT_MAX);
2809 
2810 		data_length += (int)(mp2->b_wptr - mp2->b_rptr);
2811 		if (data_length > max_to_send) {
2812 			mp2->b_wptr -= data_length - max_to_send;
2813 			data_length = max_to_send;
2814 			off = mp2->b_wptr - mp->b_rptr;
2815 			break;
2816 		} else {
2817 			off = 0;
2818 		}
2819 	}
2820 	if (offset != NULL) {
2821 		*offset = off;
2822 		*end_mp = mp;
2823 	}
2824 	if (seg_len != NULL) {
2825 		*seg_len = data_length;
2826 	}
2827 
2828 	/* Update the latest receive window size in TCP header. */
2829 	tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
2830 
2831 	rptr = mp1->b_rptr + tcps->tcps_wroff_xtra;
2832 	mp1->b_rptr = rptr;
2833 	mp1->b_wptr = rptr + connp->conn_ht_iphc_len + sack_opt_len;
2834 	bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len);
2835 	tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length];
2836 	tcpha->tha_seq = htonl(seq);
2837 
2838 	/*
2839 	 * Use tcp_unsent to determine if the PUSH bit should be used assumes
2840 	 * that this function was called from tcp_wput_data. Thus, when called
2841 	 * to retransmit data the setting of the PUSH bit may appear some
2842 	 * what random in that it might get set when it should not. This
2843 	 * should not pose any performance issues.
2844 	 */
2845 	if (data_length != 0 && (tcp->tcp_unsent == 0 ||
2846 	    tcp->tcp_unsent == data_length)) {
2847 		flags = TH_ACK | TH_PUSH;
2848 	} else {
2849 		flags = TH_ACK;
2850 	}
2851 
2852 	if (tcp->tcp_ecn_ok) {
2853 		if (tcp->tcp_ecn_echo_on)
2854 			flags |= TH_ECE;
2855 
2856 		/*
2857 		 * Only set ECT bit and ECN_CWR if a segment contains new data.
2858 		 * There is no TCP flow control for non-data segments, and
2859 		 * only data segment is transmitted reliably.
2860 		 */
2861 		if (data_length > 0 && !rexmit) {
2862 			TCP_SET_ECT(tcp, rptr);
2863 			if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
2864 				flags |= TH_CWR;
2865 				tcp->tcp_ecn_cwr_sent = B_TRUE;
2866 			}
2867 		}
2868 	}
2869 
2870 	if (tcp->tcp_valid_bits) {
2871 		uint32_t u1;
2872 
2873 		if ((tcp->tcp_valid_bits & TCP_ISS_VALID) &&
2874 		    seq == tcp->tcp_iss) {
2875 			uchar_t	*wptr;
2876 
2877 			/*
2878 			 * If TCP_ISS_VALID and the seq number is tcp_iss,
2879 			 * TCP can only be in SYN-SENT, SYN-RCVD or
2880 			 * FIN-WAIT-1 state.  It can be FIN-WAIT-1 if
2881 			 * our SYN is not ack'ed but the app closes this
2882 			 * TCP connection.
2883 			 */
2884 			ASSERT(tcp->tcp_state == TCPS_SYN_SENT ||
2885 			    tcp->tcp_state == TCPS_SYN_RCVD ||
2886 			    tcp->tcp_state == TCPS_FIN_WAIT_1);
2887 
2888 			/*
2889 			 * Tack on the MSS option.  It is always needed
2890 			 * for both active and passive open.
2891 			 *
2892 			 * MSS option value should be interface MTU - MIN
2893 			 * TCP/IP header according to RFC 793 as it means
2894 			 * the maximum segment size TCP can receive.  But
2895 			 * to get around some broken middle boxes/end hosts
2896 			 * out there, we allow the option value to be the
2897 			 * same as the MSS option size on the peer side.
2898 			 * In this way, the other side will not send
2899 			 * anything larger than they can receive.
2900 			 *
2901 			 * Note that for SYN_SENT state, the ndd param
2902 			 * tcp_use_smss_as_mss_opt has no effect as we
2903 			 * don't know the peer's MSS option value. So
2904 			 * the only case we need to take care of is in
2905 			 * SYN_RCVD state, which is done later.
2906 			 */
2907 			wptr = mp1->b_wptr;
2908 			wptr[0] = TCPOPT_MAXSEG;
2909 			wptr[1] = TCPOPT_MAXSEG_LEN;
2910 			wptr += 2;
2911 			u1 = tcp->tcp_initial_pmtu -
2912 			    (connp->conn_ipversion == IPV4_VERSION ?
2913 			    IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) -
2914 			    TCP_MIN_HEADER_LENGTH;
2915 			U16_TO_BE16(u1, wptr);
2916 			mp1->b_wptr = wptr + 2;
2917 			/* Update the offset to cover the additional word */
2918 			tcpha->tha_offset_and_reserved += (1 << 4);
2919 
2920 			/*
2921 			 * Note that the following way of filling in
2922 			 * TCP options are not optimal.  Some NOPs can
2923 			 * be saved.  But there is no need at this time
2924 			 * to optimize it.  When it is needed, we will
2925 			 * do it.
2926 			 */
2927 			switch (tcp->tcp_state) {
2928 			case TCPS_SYN_SENT:
2929 				flags = TH_SYN;
2930 
2931 				if (tcp->tcp_snd_ts_ok) {
2932 					uint32_t llbolt =
2933 					    (uint32_t)LBOLT_FASTPATH;
2934 
2935 					wptr = mp1->b_wptr;
2936 					wptr[0] = TCPOPT_NOP;
2937 					wptr[1] = TCPOPT_NOP;
2938 					wptr[2] = TCPOPT_TSTAMP;
2939 					wptr[3] = TCPOPT_TSTAMP_LEN;
2940 					wptr += 4;
2941 					U32_TO_BE32(llbolt, wptr);
2942 					wptr += 4;
2943 					ASSERT(tcp->tcp_ts_recent == 0);
2944 					U32_TO_BE32(0L, wptr);
2945 					mp1->b_wptr += TCPOPT_REAL_TS_LEN;
2946 					tcpha->tha_offset_and_reserved +=
2947 					    (3 << 4);
2948 				}
2949 
2950 				/*
2951 				 * Set up all the bits to tell other side
2952 				 * we are ECN capable.
2953 				 */
2954 				if (tcp->tcp_ecn_ok) {
2955 					flags |= (TH_ECE | TH_CWR);
2956 				}
2957 				break;
2958 			case TCPS_SYN_RCVD:
2959 				flags |= TH_SYN;
2960 
2961 				/*
2962 				 * Reset the MSS option value to be SMSS
2963 				 * We should probably add back the bytes
2964 				 * for timestamp option and IPsec.  We
2965 				 * don't do that as this is a workaround
2966 				 * for broken middle boxes/end hosts, it
2967 				 * is better for us to be more cautious.
2968 				 * They may not take these things into
2969 				 * account in their SMSS calculation.  Thus
2970 				 * the peer's calculated SMSS may be smaller
2971 				 * than what it can be.  This should be OK.
2972 				 */
2973 				if (tcps->tcps_use_smss_as_mss_opt) {
2974 					u1 = tcp->tcp_mss;
2975 					U16_TO_BE16(u1, wptr);
2976 				}
2977 
2978 				/*
2979 				 * If the other side is ECN capable, reply
2980 				 * that we are also ECN capable.
2981 				 */
2982 				if (tcp->tcp_ecn_ok)
2983 					flags |= TH_ECE;
2984 				break;
2985 			default:
2986 				/*
2987 				 * The above ASSERT() makes sure that this
2988 				 * must be FIN-WAIT-1 state.  Our SYN has
2989 				 * not been ack'ed so retransmit it.
2990 				 */
2991 				flags |= TH_SYN;
2992 				break;
2993 			}
2994 
2995 			if (tcp->tcp_snd_ws_ok) {
2996 				wptr = mp1->b_wptr;
2997 				wptr[0] =  TCPOPT_NOP;
2998 				wptr[1] =  TCPOPT_WSCALE;
2999 				wptr[2] =  TCPOPT_WS_LEN;
3000 				wptr[3] = (uchar_t)tcp->tcp_rcv_ws;
3001 				mp1->b_wptr += TCPOPT_REAL_WS_LEN;
3002 				tcpha->tha_offset_and_reserved += (1 << 4);
3003 			}
3004 
3005 			if (tcp->tcp_snd_sack_ok) {
3006 				wptr = mp1->b_wptr;
3007 				wptr[0] = TCPOPT_NOP;
3008 				wptr[1] = TCPOPT_NOP;
3009 				wptr[2] = TCPOPT_SACK_PERMITTED;
3010 				wptr[3] = TCPOPT_SACK_OK_LEN;
3011 				mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN;
3012 				tcpha->tha_offset_and_reserved += (1 << 4);
3013 			}
3014 
3015 			/* allocb() of adequate mblk assures space */
3016 			ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <=
3017 			    (uintptr_t)INT_MAX);
3018 			u1 = (int)(mp1->b_wptr - mp1->b_rptr);
3019 			/*
3020 			 * Get IP set to checksum on our behalf
3021 			 * Include the adjustment for a source route if any.
3022 			 */
3023 			u1 += connp->conn_sum;
3024 			u1 = (u1 >> 16) + (u1 & 0xFFFF);
3025 			tcpha->tha_sum = htons(u1);
3026 			TCPS_BUMP_MIB(tcps, tcpOutControl);
3027 		}
3028 		if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
3029 		    (seq + data_length) == tcp->tcp_fss) {
3030 			if (!tcp->tcp_fin_acked) {
3031 				flags |= TH_FIN;
3032 				TCPS_BUMP_MIB(tcps, tcpOutControl);
3033 			}
3034 			if (!tcp->tcp_fin_sent) {
3035 				tcp->tcp_fin_sent = B_TRUE;
3036 				switch (tcp->tcp_state) {
3037 				case TCPS_SYN_RCVD:
3038 				case TCPS_ESTABLISHED:
3039 					tcp->tcp_state = TCPS_FIN_WAIT_1;
3040 					break;
3041 				case TCPS_CLOSE_WAIT:
3042 					tcp->tcp_state = TCPS_LAST_ACK;
3043 					break;
3044 				}
3045 				if (tcp->tcp_suna == tcp->tcp_snxt)
3046 					TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3047 				tcp->tcp_snxt = tcp->tcp_fss + 1;
3048 			}
3049 		}
3050 		/*
3051 		 * Note the trick here.  u1 is unsigned.  When tcp_urg
3052 		 * is smaller than seq, u1 will become a very huge value.
3053 		 * So the comparison will fail.  Also note that tcp_urp
3054 		 * should be positive, see RFC 793 page 17.
3055 		 */
3056 		u1 = tcp->tcp_urg - seq + TCP_OLD_URP_INTERPRETATION;
3057 		if ((tcp->tcp_valid_bits & TCP_URG_VALID) && u1 != 0 &&
3058 		    u1 < (uint32_t)(64 * 1024)) {
3059 			flags |= TH_URG;
3060 			TCPS_BUMP_MIB(tcps, tcpOutUrg);
3061 			tcpha->tha_urp = htons(u1);
3062 		}
3063 	}
3064 	tcpha->tha_flags = (uchar_t)flags;
3065 	tcp->tcp_rack = tcp->tcp_rnxt;
3066 	tcp->tcp_rack_cnt = 0;
3067 
3068 	if (tcp->tcp_snd_ts_ok) {
3069 		if (tcp->tcp_state != TCPS_SYN_SENT) {
3070 			uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
3071 
3072 			U32_TO_BE32(llbolt,
3073 			    (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
3074 			U32_TO_BE32(tcp->tcp_ts_recent,
3075 			    (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
3076 		}
3077 	}
3078 
3079 	if (num_sack_blk > 0) {
3080 		uchar_t *wptr = (uchar_t *)tcpha + connp->conn_ht_ulp_len;
3081 		sack_blk_t *tmp;
3082 		int32_t	i;
3083 
3084 		wptr[0] = TCPOPT_NOP;
3085 		wptr[1] = TCPOPT_NOP;
3086 		wptr[2] = TCPOPT_SACK;
3087 		wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk *
3088 		    sizeof (sack_blk_t);
3089 		wptr += TCPOPT_REAL_SACK_LEN;
3090 
3091 		tmp = tcp->tcp_sack_list;
3092 		for (i = 0; i < num_sack_blk; i++) {
3093 			U32_TO_BE32(tmp[i].begin, wptr);
3094 			wptr += sizeof (tcp_seq);
3095 			U32_TO_BE32(tmp[i].end, wptr);
3096 			wptr += sizeof (tcp_seq);
3097 		}
3098 		tcpha->tha_offset_and_reserved += ((num_sack_blk * 2 + 1) << 4);
3099 	}
3100 	ASSERT((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX);
3101 	data_length += (int)(mp1->b_wptr - rptr);
3102 
3103 	ixa->ixa_pktlen = data_length;
3104 
3105 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
3106 		((ipha_t *)rptr)->ipha_length = htons(data_length);
3107 	} else {
3108 		ip6_t *ip6 = (ip6_t *)rptr;
3109 
3110 		ip6->ip6_plen = htons(data_length - IPV6_HDR_LEN);
3111 	}
3112 
3113 	/*
3114 	 * Prime pump for IP
3115 	 * Include the adjustment for a source route if any.
3116 	 */
3117 	data_length -= ixa->ixa_ip_hdr_length;
3118 	data_length += connp->conn_sum;
3119 	data_length = (data_length >> 16) + (data_length & 0xFFFF);
3120 	tcpha->tha_sum = htons(data_length);
3121 	if (tcp->tcp_ip_forward_progress) {
3122 		tcp->tcp_ip_forward_progress = B_FALSE;
3123 		connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF;
3124 	} else {
3125 		connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF;
3126 	}
3127 	return (mp1);
3128 }
3129 
3130 /*
3131  * If this routine returns B_TRUE, TCP can generate a RST in response
3132  * to a segment.  If it returns B_FALSE, TCP should not respond.
3133  */
3134 static boolean_t
3135 tcp_send_rst_chk(tcp_stack_t *tcps)
3136 {
3137 	int64_t	now;
3138 
3139 	/*
3140 	 * TCP needs to protect itself from generating too many RSTs.
3141 	 * This can be a DoS attack by sending us random segments
3142 	 * soliciting RSTs.
3143 	 *
3144 	 * What we do here is to have a limit of tcp_rst_sent_rate RSTs
3145 	 * in each 1 second interval.  In this way, TCP still generate
3146 	 * RSTs in normal cases but when under attack, the impact is
3147 	 * limited.
3148 	 */
3149 	if (tcps->tcps_rst_sent_rate_enabled != 0) {
3150 		now = ddi_get_lbolt64();
3151 		if (TICK_TO_MSEC(now - tcps->tcps_last_rst_intrvl) >
3152 		    1*SECONDS) {
3153 			tcps->tcps_last_rst_intrvl = now;
3154 			tcps->tcps_rst_cnt = 1;
3155 		} else if (++tcps->tcps_rst_cnt > tcps->tcps_rst_sent_rate) {
3156 			return (B_FALSE);
3157 		}
3158 	}
3159 	return (B_TRUE);
3160 }
3161 
3162 /*
3163  * This function handles all retransmissions if SACK is enabled for this
3164  * connection.  First it calculates how many segments can be retransmitted
3165  * based on tcp_pipe.  Then it goes thru the notsack list to find eligible
3166  * segments.  A segment is eligible if sack_cnt for that segment is greater
3167  * than or equal tcp_dupack_fast_retransmit.  After it has retransmitted
3168  * all eligible segments, it checks to see if TCP can send some new segments
3169  * (fast recovery).  If it can, set the appropriate flag for tcp_input_data().
3170  *
3171  * Parameters:
3172  *	tcp_t *tcp: the tcp structure of the connection.
3173  *	uint_t *flags: in return, appropriate value will be set for
3174  *	tcp_input_data().
3175  */
3176 void
3177 tcp_sack_rexmit(tcp_t *tcp, uint_t *flags)
3178 {
3179 	notsack_blk_t	*notsack_blk;
3180 	int32_t		usable_swnd;
3181 	int32_t		mss;
3182 	uint32_t	seg_len;
3183 	mblk_t		*xmit_mp;
3184 	tcp_stack_t	*tcps = tcp->tcp_tcps;
3185 
3186 	ASSERT(tcp->tcp_sack_info != NULL);
3187 	ASSERT(tcp->tcp_notsack_list != NULL);
3188 	ASSERT(tcp->tcp_rexmit == B_FALSE);
3189 
3190 	/* Defensive coding in case there is a bug... */
3191 	if (tcp->tcp_notsack_list == NULL) {
3192 		return;
3193 	}
3194 	notsack_blk = tcp->tcp_notsack_list;
3195 	mss = tcp->tcp_mss;
3196 
3197 	/*
3198 	 * Limit the num of outstanding data in the network to be
3199 	 * tcp_cwnd_ssthresh, which is half of the original congestion wnd.
3200 	 */
3201 	usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe;
3202 
3203 	/* At least retransmit 1 MSS of data. */
3204 	if (usable_swnd <= 0) {
3205 		usable_swnd = mss;
3206 	}
3207 
3208 	/* Make sure no new RTT samples will be taken. */
3209 	tcp->tcp_csuna = tcp->tcp_snxt;
3210 
3211 	notsack_blk = tcp->tcp_notsack_list;
3212 	while (usable_swnd > 0) {
3213 		mblk_t		*snxt_mp, *tmp_mp;
3214 		tcp_seq		begin = tcp->tcp_sack_snxt;
3215 		tcp_seq		end;
3216 		int32_t		off;
3217 
3218 		for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) {
3219 			if (SEQ_GT(notsack_blk->end, begin) &&
3220 			    (notsack_blk->sack_cnt >=
3221 			    tcps->tcps_dupack_fast_retransmit)) {
3222 				end = notsack_blk->end;
3223 				if (SEQ_LT(begin, notsack_blk->begin)) {
3224 					begin = notsack_blk->begin;
3225 				}
3226 				break;
3227 			}
3228 		}
3229 		/*
3230 		 * All holes are filled.  Manipulate tcp_cwnd to send more
3231 		 * if we can.  Note that after the SACK recovery, tcp_cwnd is
3232 		 * set to tcp_cwnd_ssthresh.
3233 		 */
3234 		if (notsack_blk == NULL) {
3235 			usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe;
3236 			if (usable_swnd <= 0 || tcp->tcp_unsent == 0) {
3237 				tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna;
3238 				ASSERT(tcp->tcp_cwnd > 0);
3239 				return;
3240 			} else {
3241 				usable_swnd = usable_swnd / mss;
3242 				tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna +
3243 				    MAX(usable_swnd * mss, mss);
3244 				*flags |= TH_XMIT_NEEDED;
3245 				return;
3246 			}
3247 		}
3248 
3249 		/*
3250 		 * Note that we may send more than usable_swnd allows here
3251 		 * because of round off, but no more than 1 MSS of data.
3252 		 */
3253 		seg_len = end - begin;
3254 		if (seg_len > mss)
3255 			seg_len = mss;
3256 		snxt_mp = tcp_get_seg_mp(tcp, begin, &off);
3257 		ASSERT(snxt_mp != NULL);
3258 		/* This should not happen.  Defensive coding again... */
3259 		if (snxt_mp == NULL) {
3260 			return;
3261 		}
3262 
3263 		xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off,
3264 		    &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE);
3265 		if (xmit_mp == NULL)
3266 			return;
3267 
3268 		usable_swnd -= seg_len;
3269 		tcp->tcp_pipe += seg_len;
3270 		tcp->tcp_sack_snxt = begin + seg_len;
3271 
3272 		tcp_send_data(tcp, xmit_mp);
3273 
3274 		/*
3275 		 * Update the send timestamp to avoid false retransmission.
3276 		 */
3277 		snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
3278 
3279 		TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3280 		TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
3281 		TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs);
3282 		/*
3283 		 * Update tcp_rexmit_max to extend this SACK recovery phase.
3284 		 * This happens when new data sent during fast recovery is
3285 		 * also lost.  If TCP retransmits those new data, it needs
3286 		 * to extend SACK recover phase to avoid starting another
3287 		 * fast retransmit/recovery unnecessarily.
3288 		 */
3289 		if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) {
3290 			tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
3291 		}
3292 	}
3293 }
3294 
3295 /*
3296  * tcp_ss_rexmit() is called to do slow start retransmission after a timeout
3297  * or ICMP errors.
3298  *
3299  * To limit the number of duplicate segments, we limit the number of segment
3300  * to be sent in one time to tcp_snd_burst, the burst variable.
3301  */
3302 void
3303 tcp_ss_rexmit(tcp_t *tcp)
3304 {
3305 	uint32_t	snxt;
3306 	uint32_t	smax;
3307 	int32_t		win;
3308 	int32_t		mss;
3309 	int32_t		off;
3310 	int32_t		burst = tcp->tcp_snd_burst;
3311 	mblk_t		*snxt_mp;
3312 	tcp_stack_t	*tcps = tcp->tcp_tcps;
3313 
3314 	/*
3315 	 * Note that tcp_rexmit can be set even though TCP has retransmitted
3316 	 * all unack'ed segments.
3317 	 */
3318 	if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) {
3319 		smax = tcp->tcp_rexmit_max;
3320 		snxt = tcp->tcp_rexmit_nxt;
3321 		if (SEQ_LT(snxt, tcp->tcp_suna)) {
3322 			snxt = tcp->tcp_suna;
3323 		}
3324 		win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd);
3325 		win -= snxt - tcp->tcp_suna;
3326 		mss = tcp->tcp_mss;
3327 		snxt_mp = tcp_get_seg_mp(tcp, snxt, &off);
3328 
3329 		while (SEQ_LT(snxt, smax) && (win > 0) &&
3330 		    (burst > 0) && (snxt_mp != NULL)) {
3331 			mblk_t	*xmit_mp;
3332 			mblk_t	*old_snxt_mp = snxt_mp;
3333 			uint32_t cnt = mss;
3334 
3335 			if (win < cnt) {
3336 				cnt = win;
3337 			}
3338 			if (SEQ_GT(snxt + cnt, smax)) {
3339 				cnt = smax - snxt;
3340 			}
3341 			xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
3342 			    &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);
3343 			if (xmit_mp == NULL)
3344 				return;
3345 
3346 			tcp_send_data(tcp, xmit_mp);
3347 
3348 			snxt += cnt;
3349 			win -= cnt;
3350 			/*
3351 			 * Update the send timestamp to avoid false
3352 			 * retransmission.
3353 			 */
3354 			old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
3355 			TCPS_BUMP_MIB(tcps, tcpRetransSegs);
3356 			TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
3357 
3358 			tcp->tcp_rexmit_nxt = snxt;
3359 			burst--;
3360 		}
3361 		/*
3362 		 * If we have transmitted all we have at the time
3363 		 * we started the retranmission, we can leave
3364 		 * the rest of the job to tcp_wput_data().  But we
3365 		 * need to check the send window first.  If the
3366 		 * win is not 0, go on with tcp_wput_data().
3367 		 */
3368 		if (SEQ_LT(snxt, smax) || win == 0) {
3369 			return;
3370 		}
3371 	}
3372 	/* Only call tcp_wput_data() if there is data to be sent. */
3373 	if (tcp->tcp_unsent) {
3374 		tcp_wput_data(tcp, NULL, B_FALSE);
3375 	}
3376 }
3377 
3378 /*
3379  * Do slow start retransmission after ICMP errors of PMTU changes.
3380  */
3381 void
3382 tcp_rexmit_after_error(tcp_t *tcp)
3383 {
3384 	/*
3385 	 * All sent data has been acknowledged or no data left to send, just
3386 	 * to return.
3387 	 */
3388 	if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) ||
3389 	    (tcp->tcp_xmit_head == NULL))
3390 		return;
3391 
3392 	if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0))
3393 		tcp->tcp_rexmit_max = tcp->tcp_fss;
3394 	else
3395 		tcp->tcp_rexmit_max = tcp->tcp_snxt;
3396 
3397 	tcp->tcp_rexmit_nxt = tcp->tcp_suna;
3398 	tcp->tcp_rexmit = B_TRUE;
3399 	tcp->tcp_dupack_cnt = 0;
3400 	tcp->tcp_snd_burst = TCP_CWND_SS;
3401 	tcp_ss_rexmit(tcp);
3402 }
3403 
3404 /*
3405  * tcp_get_seg_mp() is called to get the pointer to a segment in the
3406  * send queue which starts at the given sequence number. If the given
3407  * sequence number is equal to last valid sequence number (tcp_snxt), the
3408  * returned mblk is the last valid mblk, and off is set to the length of
3409  * that mblk.
3410  *
3411  * send queue which starts at the given seq. no.
3412  *
3413  * Parameters:
3414  *	tcp_t *tcp: the tcp instance pointer.
3415  *	uint32_t seq: the starting seq. no of the requested segment.
3416  *	int32_t *off: after the execution, *off will be the offset to
3417  *		the returned mblk which points to the requested seq no.
3418  *		It is the caller's responsibility to send in a non-null off.
3419  *
3420  * Return:
3421  *	A mblk_t pointer pointing to the requested segment in send queue.
3422  */
3423 static mblk_t *
3424 tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off)
3425 {
3426 	int32_t	cnt;
3427 	mblk_t	*mp;
3428 
3429 	/* Defensive coding.  Make sure we don't send incorrect data. */
3430 	if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GT(seq, tcp->tcp_snxt))
3431 		return (NULL);
3432 
3433 	cnt = seq - tcp->tcp_suna;
3434 	mp = tcp->tcp_xmit_head;
3435 	while (cnt > 0 && mp != NULL) {
3436 		cnt -= mp->b_wptr - mp->b_rptr;
3437 		if (cnt <= 0) {
3438 			cnt += mp->b_wptr - mp->b_rptr;
3439 			break;
3440 		}
3441 		mp = mp->b_cont;
3442 	}
3443 	ASSERT(mp != NULL);
3444 	*off = cnt;
3445 	return (mp);
3446 }
3447 
3448 /*
3449  * This routine adjusts next-to-send sequence number variables, in the
3450  * case where the reciever has shrunk it's window.
3451  */
3452 void
3453 tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt)
3454 {
3455 	mblk_t *xmit_tail;
3456 	int32_t offset;
3457 
3458 	tcp->tcp_snxt = snxt;
3459 
3460 	/* Get the mblk, and the offset in it, as per the shrunk window */
3461 	xmit_tail = tcp_get_seg_mp(tcp, snxt, &offset);
3462 	ASSERT(xmit_tail != NULL);
3463 	tcp->tcp_xmit_tail = xmit_tail;
3464 	tcp->tcp_xmit_tail_unsent = xmit_tail->b_wptr -
3465 	    xmit_tail->b_rptr - offset;
3466 }
3467 
3468 /*
3469  * This handles the case when the receiver has shrunk its win. Per RFC 1122
3470  * if the receiver shrinks the window, i.e. moves the right window to the
3471  * left, the we should not send new data, but should retransmit normally the
3472  * old unacked data between suna and suna + swnd. We might has sent data
3473  * that is now outside the new window, pretend that we didn't send  it.
3474  */
3475 static void
3476 tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_count)
3477 {
3478 	uint32_t	snxt = tcp->tcp_snxt;
3479 
3480 	ASSERT(shrunk_count > 0);
3481 
3482 	if (!tcp->tcp_is_wnd_shrnk) {
3483 		tcp->tcp_snxt_shrunk = snxt;
3484 		tcp->tcp_is_wnd_shrnk = B_TRUE;
3485 	} else if (SEQ_GT(snxt, tcp->tcp_snxt_shrunk)) {
3486 		tcp->tcp_snxt_shrunk = snxt;
3487 	}
3488 
3489 	/* Pretend we didn't send the data outside the window */
3490 	snxt -= shrunk_count;
3491 
3492 	/* Reset all the values per the now shrunk window */
3493 	tcp_update_xmit_tail(tcp, snxt);
3494 	tcp->tcp_unsent += shrunk_count;
3495 
3496 	/*
3497 	 * If the SACK option is set, delete the entire list of
3498 	 * notsack'ed blocks.
3499 	 */
3500 	if (tcp->tcp_sack_info != NULL) {
3501 		if (tcp->tcp_notsack_list != NULL)
3502 			TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
3503 	}
3504 
3505 	if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0)
3506 		/*
3507 		 * Make sure the timer is running so that we will probe a zero
3508 		 * window.
3509 		 */
3510 		TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
3511 }
3512 
3513 /*
3514  * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header
3515  * with the template header, as well as other options such as time-stamp,
3516  * ECN and/or SACK.
3517  */
3518 static void
3519 tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk)
3520 {
3521 	tcpha_t *tcp_tmpl, *tcpha;
3522 	uint32_t *dst, *src;
3523 	int hdrlen;
3524 	conn_t *connp = tcp->tcp_connp;
3525 
3526 	ASSERT(OK_32PTR(rptr));
3527 
3528 	/* Template header */
3529 	tcp_tmpl = tcp->tcp_tcpha;
3530 
3531 	/* Header of outgoing packet */
3532 	tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length);
3533 
3534 	/* dst and src are opaque 32-bit fields, used for copying */
3535 	dst = (uint32_t *)rptr;
3536 	src = (uint32_t *)connp->conn_ht_iphc;
3537 	hdrlen = connp->conn_ht_iphc_len;
3538 
3539 	/* Fill time-stamp option if needed */
3540 	if (tcp->tcp_snd_ts_ok) {
3541 		U32_TO_BE32((uint32_t)now,
3542 		    (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4);
3543 		U32_TO_BE32(tcp->tcp_ts_recent,
3544 		    (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
3545 	} else {
3546 		ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
3547 	}
3548 
3549 	/*
3550 	 * Copy the template header; is this really more efficient than
3551 	 * calling bcopy()?  For simple IPv4/TCP, it may be the case,
3552 	 * but perhaps not for other scenarios.
3553 	 */
3554 	dst[0] = src[0];
3555 	dst[1] = src[1];
3556 	dst[2] = src[2];
3557 	dst[3] = src[3];
3558 	dst[4] = src[4];
3559 	dst[5] = src[5];
3560 	dst[6] = src[6];
3561 	dst[7] = src[7];
3562 	dst[8] = src[8];
3563 	dst[9] = src[9];
3564 	if (hdrlen -= 40) {
3565 		hdrlen >>= 2;
3566 		dst += 10;
3567 		src += 10;
3568 		do {
3569 			*dst++ = *src++;
3570 		} while (--hdrlen);
3571 	}
3572 
3573 	/*
3574 	 * Set the ECN info in the TCP header if it is not a zero
3575 	 * window probe.  Zero window probe is only sent in
3576 	 * tcp_wput_data() and tcp_timer().
3577 	 */
3578 	if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) {
3579 		TCP_SET_ECT(tcp, rptr);
3580 
3581 		if (tcp->tcp_ecn_echo_on)
3582 			tcpha->tha_flags |= TH_ECE;
3583 		if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
3584 			tcpha->tha_flags |= TH_CWR;
3585 			tcp->tcp_ecn_cwr_sent = B_TRUE;
3586 		}
3587 	}
3588 
3589 	/* Fill in SACK options */
3590 	if (num_sack_blk > 0) {
3591 		uchar_t *wptr = rptr + connp->conn_ht_iphc_len;
3592 		sack_blk_t *tmp;
3593 		int32_t	i;
3594 
3595 		wptr[0] = TCPOPT_NOP;
3596 		wptr[1] = TCPOPT_NOP;
3597 		wptr[2] = TCPOPT_SACK;
3598 		wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk *
3599 		    sizeof (sack_blk_t);
3600 		wptr += TCPOPT_REAL_SACK_LEN;
3601 
3602 		tmp = tcp->tcp_sack_list;
3603 		for (i = 0; i < num_sack_blk; i++) {
3604 			U32_TO_BE32(tmp[i].begin, wptr);
3605 			wptr += sizeof (tcp_seq);
3606 			U32_TO_BE32(tmp[i].end, wptr);
3607 			wptr += sizeof (tcp_seq);
3608 		}
3609 		tcpha->tha_offset_and_reserved +=
3610 		    ((num_sack_blk * 2 + 1) << 4);
3611 	}
3612 }
3613