xref: /illumos-gate/usr/src/uts/common/inet/tcp/tcp_tpi.c (revision 3e95bd4ab92abca814bd28e854607d1975c7dc88)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /* This file contains all TCP TLI/TPI related functions */
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/strsun.h>
31 #include <sys/strsubr.h>
32 #include <sys/stropts.h>
33 #include <sys/strlog.h>
34 #define	_SUN_TPI_VERSION 2
35 #include <sys/tihdr.h>
36 #include <sys/suntpi.h>
37 #include <sys/xti_inet.h>
38 #include <sys/squeue_impl.h>
39 #include <sys/squeue.h>
40 
41 #include <inet/common.h>
42 #include <inet/ip.h>
43 #include <inet/tcp.h>
44 #include <inet/tcp_impl.h>
45 #include <inet/proto_set.h>
46 
47 static void	tcp_accept_swap(tcp_t *, tcp_t *, tcp_t *);
48 static int	tcp_conprim_opt_process(tcp_t *, mblk_t *, int *, int *, int *);
49 
50 void
51 tcp_use_pure_tpi(tcp_t *tcp)
52 {
53 	conn_t		*connp = tcp->tcp_connp;
54 
55 #ifdef	_ILP32
56 	tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq;
57 #else
58 	tcp->tcp_acceptor_id = connp->conn_dev;
59 #endif
60 	/*
61 	 * Insert this socket into the acceptor hash.
62 	 * We might need it for T_CONN_RES message
63 	 */
64 	tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
65 
66 	tcp->tcp_issocket = B_FALSE;
67 	TCP_STAT(tcp->tcp_tcps, tcp_sock_fallback);
68 }
69 
70 /* Shorthand to generate and send TPI error acks to our client */
71 void
72 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error)
73 {
74 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
75 		putnext(tcp->tcp_connp->conn_rq, mp);
76 }
77 
78 /* Shorthand to generate and send TPI error acks to our client */
79 void
80 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
81     int t_error, int sys_error)
82 {
83 	struct T_error_ack	*teackp;
84 
85 	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
86 	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
87 		teackp = (struct T_error_ack *)mp->b_rptr;
88 		teackp->ERROR_prim = primitive;
89 		teackp->TLI_error = t_error;
90 		teackp->UNIX_error = sys_error;
91 		putnext(tcp->tcp_connp->conn_rq, mp);
92 	}
93 }
94 
95 /*
96  * TCP routine to get the values of options.
97  */
98 int
99 tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
100 {
101 	return (tcp_opt_get(Q_TO_CONN(q), level, name, ptr));
102 }
103 
104 /* ARGSUSED */
105 int
106 tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
107     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
108     void *thisdg_attrs, cred_t *cr)
109 {
110 	conn_t	*connp =  Q_TO_CONN(q);
111 
112 	return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp,
113 	    outlenp, outvalp, thisdg_attrs, cr));
114 }
115 
116 static int
117 tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp,
118     int *t_errorp, int *sys_errorp)
119 {
120 	int error;
121 	int is_absreq_failure;
122 	t_scalar_t *opt_lenp;
123 	t_scalar_t opt_offset;
124 	int prim_type;
125 	struct T_conn_req *tcreqp;
126 	struct T_conn_res *tcresp;
127 	cred_t *cr;
128 
129 	/*
130 	 * All Solaris components should pass a db_credp
131 	 * for this TPI message, hence we ASSERT.
132 	 * But in case there is some other M_PROTO that looks
133 	 * like a TPI message sent by some other kernel
134 	 * component, we check and return an error.
135 	 */
136 	cr = msg_getcred(mp, NULL);
137 	ASSERT(cr != NULL);
138 	if (cr == NULL)
139 		return (-1);
140 
141 	prim_type = ((union T_primitives *)mp->b_rptr)->type;
142 	ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES ||
143 	    prim_type == T_CONN_RES);
144 
145 	switch (prim_type) {
146 	case T_CONN_REQ:
147 		tcreqp = (struct T_conn_req *)mp->b_rptr;
148 		opt_offset = tcreqp->OPT_offset;
149 		opt_lenp = (t_scalar_t *)&tcreqp->OPT_length;
150 		break;
151 	case O_T_CONN_RES:
152 	case T_CONN_RES:
153 		tcresp = (struct T_conn_res *)mp->b_rptr;
154 		opt_offset = tcresp->OPT_offset;
155 		opt_lenp = (t_scalar_t *)&tcresp->OPT_length;
156 		break;
157 	}
158 
159 	*t_errorp = 0;
160 	*sys_errorp = 0;
161 	*do_disconnectp = 0;
162 
163 	error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp,
164 	    opt_offset, cr, &tcp_opt_obj,
165 	    NULL, &is_absreq_failure);
166 
167 	switch (error) {
168 	case  0:		/* no error */
169 		ASSERT(is_absreq_failure == 0);
170 		return (0);
171 	case ENOPROTOOPT:
172 		*t_errorp = TBADOPT;
173 		break;
174 	case EACCES:
175 		*t_errorp = TACCES;
176 		break;
177 	default:
178 		*t_errorp = TSYSERR; *sys_errorp = error;
179 		break;
180 	}
181 	if (is_absreq_failure != 0) {
182 		/*
183 		 * The connection request should get the local ack
184 		 * T_OK_ACK and then a T_DISCON_IND.
185 		 */
186 		*do_disconnectp = 1;
187 	}
188 	return (-1);
189 }
190 
191 void
192 tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
193 {
194 	int	error;
195 	conn_t	*connp = tcp->tcp_connp;
196 	struct sockaddr	*sa;
197 	mblk_t  *mp1;
198 	struct T_bind_req *tbr;
199 	int	backlog;
200 	socklen_t	len;
201 	sin_t	*sin;
202 	sin6_t	*sin6;
203 	cred_t		*cr;
204 
205 	/*
206 	 * All Solaris components should pass a db_credp
207 	 * for this TPI message, hence we ASSERT.
208 	 * But in case there is some other M_PROTO that looks
209 	 * like a TPI message sent by some other kernel
210 	 * component, we check and return an error.
211 	 */
212 	cr = msg_getcred(mp, NULL);
213 	ASSERT(cr != NULL);
214 	if (cr == NULL) {
215 		tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
216 		return;
217 	}
218 
219 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
220 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
221 		if (connp->conn_debug) {
222 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
223 			    "tcp_tpi_bind: bad req, len %u",
224 			    (uint_t)(mp->b_wptr - mp->b_rptr));
225 		}
226 		tcp_err_ack(tcp, mp, TPROTO, 0);
227 		return;
228 	}
229 	/* Make sure the largest address fits */
230 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
231 	if (mp1 == NULL) {
232 		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
233 		return;
234 	}
235 	mp = mp1;
236 	tbr = (struct T_bind_req *)mp->b_rptr;
237 
238 	backlog = tbr->CONIND_number;
239 	len = tbr->ADDR_length;
240 
241 	switch (len) {
242 	case 0:		/* request for a generic port */
243 		tbr->ADDR_offset = sizeof (struct T_bind_req);
244 		if (connp->conn_family == AF_INET) {
245 			tbr->ADDR_length = sizeof (sin_t);
246 			sin = (sin_t *)&tbr[1];
247 			*sin = sin_null;
248 			sin->sin_family = AF_INET;
249 			sa = (struct sockaddr *)sin;
250 			len = sizeof (sin_t);
251 			mp->b_wptr = (uchar_t *)&sin[1];
252 		} else {
253 			ASSERT(connp->conn_family == AF_INET6);
254 			tbr->ADDR_length = sizeof (sin6_t);
255 			sin6 = (sin6_t *)&tbr[1];
256 			*sin6 = sin6_null;
257 			sin6->sin6_family = AF_INET6;
258 			sa = (struct sockaddr *)sin6;
259 			len = sizeof (sin6_t);
260 			mp->b_wptr = (uchar_t *)&sin6[1];
261 		}
262 		break;
263 
264 	case sizeof (sin_t):    /* Complete IPv4 address */
265 		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
266 		    sizeof (sin_t));
267 		break;
268 
269 	case sizeof (sin6_t): /* Complete IPv6 address */
270 		sa = (struct sockaddr *)mi_offset_param(mp,
271 		    tbr->ADDR_offset, sizeof (sin6_t));
272 		break;
273 
274 	default:
275 		if (connp->conn_debug) {
276 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
277 			    "tcp_tpi_bind: bad address length, %d",
278 			    tbr->ADDR_length);
279 		}
280 		tcp_err_ack(tcp, mp, TBADADDR, 0);
281 		return;
282 	}
283 
284 	if (backlog > 0) {
285 		error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp),
286 		    tbr->PRIM_type != O_T_BIND_REQ);
287 	} else {
288 		error = tcp_do_bind(connp, sa, len, DB_CRED(mp),
289 		    tbr->PRIM_type != O_T_BIND_REQ);
290 	}
291 done:
292 	if (error > 0) {
293 		tcp_err_ack(tcp, mp, TSYSERR, error);
294 	} else if (error < 0) {
295 		tcp_err_ack(tcp, mp, -error, 0);
296 	} else {
297 		/*
298 		 * Update port information as sockfs/tpi needs it for checking
299 		 */
300 		if (connp->conn_family == AF_INET) {
301 			sin = (sin_t *)sa;
302 			sin->sin_port = connp->conn_lport;
303 		} else {
304 			sin6 = (sin6_t *)sa;
305 			sin6->sin6_port = connp->conn_lport;
306 		}
307 		mp->b_datap->db_type = M_PCPROTO;
308 		tbr->PRIM_type = T_BIND_ACK;
309 		putnext(connp->conn_rq, mp);
310 	}
311 }
312 
313 /* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */
314 void
315 tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp)
316 {
317 	conn_t *connp = tcp->tcp_connp;
318 	int error;
319 
320 	error = tcp_do_unbind(connp);
321 	if (error > 0) {
322 		tcp_err_ack(tcp, mp, TSYSERR, error);
323 	} else if (error < 0) {
324 		tcp_err_ack(tcp, mp, -error, 0);
325 	} else {
326 		/* Send M_FLUSH according to TPI */
327 		(void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW);
328 
329 		mp = mi_tpi_ok_ack_alloc(mp);
330 		if (mp != NULL)
331 			putnext(connp->conn_rq, mp);
332 	}
333 }
334 
335 int
336 tcp_tpi_close(queue_t *q, int flags)
337 {
338 	conn_t		*connp;
339 
340 	ASSERT(WR(q)->q_next == NULL);
341 
342 	if (flags & SO_FALLBACK) {
343 		/*
344 		 * stream is being closed while in fallback
345 		 * simply free the resources that were allocated
346 		 */
347 		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
348 		qprocsoff(q);
349 		goto done;
350 	}
351 
352 	connp = Q_TO_CONN(q);
353 	/*
354 	 * We are being closed as /dev/tcp or /dev/tcp6.
355 	 */
356 	tcp_close_common(connp, flags);
357 
358 	qprocsoff(q);
359 	inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
360 
361 	/*
362 	 * Drop IP's reference on the conn. This is the last reference
363 	 * on the connp if the state was less than established. If the
364 	 * connection has gone into timewait state, then we will have
365 	 * one ref for the TCP and one more ref (total of two) for the
366 	 * classifier connected hash list (a timewait connections stays
367 	 * in connected hash till closed).
368 	 *
369 	 * We can't assert the references because there might be other
370 	 * transient reference places because of some walkers or queued
371 	 * packets in squeue for the timewait state.
372 	 */
373 	CONN_DEC_REF(connp);
374 done:
375 	q->q_ptr = WR(q)->q_ptr = NULL;
376 	return (0);
377 }
378 
379 int
380 tcp_tpi_close_accept(queue_t *q)
381 {
382 	vmem_t	*minor_arena;
383 	dev_t	conn_dev;
384 	extern struct qinit tcp_acceptor_winit;
385 
386 	ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit);
387 
388 	/*
389 	 * We had opened an acceptor STREAM for sockfs which is
390 	 * now being closed due to some error.
391 	 */
392 	qprocsoff(q);
393 
394 	minor_arena = (vmem_t *)WR(q)->q_ptr;
395 	conn_dev = (dev_t)RD(q)->q_ptr;
396 	ASSERT(minor_arena != NULL);
397 	ASSERT(conn_dev != 0);
398 	inet_minor_free(minor_arena, conn_dev);
399 	q->q_ptr = WR(q)->q_ptr = NULL;
400 	return (0);
401 }
402 
403 /*
404  * Put a connection confirmation message upstream built from the
405  * address/flowid information with the conn and iph. Report our success or
406  * failure.
407  */
408 boolean_t
409 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp,
410     mblk_t **defermp, ip_recv_attr_t *ira)
411 {
412 	sin_t	sin;
413 	sin6_t	sin6;
414 	mblk_t	*mp;
415 	char	*optp = NULL;
416 	int	optlen = 0;
417 	conn_t	*connp = tcp->tcp_connp;
418 
419 	if (defermp != NULL)
420 		*defermp = NULL;
421 
422 	if (tcp->tcp_conn.tcp_opts_conn_req != NULL) {
423 		/*
424 		 * Return in T_CONN_CON results of option negotiation through
425 		 * the T_CONN_REQ. Note: If there is an real end-to-end option
426 		 * negotiation, then what is received from remote end needs
427 		 * to be taken into account but there is no such thing (yet?)
428 		 * in our TCP/IP.
429 		 * Note: We do not use mi_offset_param() here as
430 		 * tcp_opts_conn_req contents do not directly come from
431 		 * an application and are either generated in kernel or
432 		 * from user input that was already verified.
433 		 */
434 		mp = tcp->tcp_conn.tcp_opts_conn_req;
435 		optp = (char *)(mp->b_rptr +
436 		    ((struct T_conn_req *)mp->b_rptr)->OPT_offset);
437 		optlen = (int)
438 		    ((struct T_conn_req *)mp->b_rptr)->OPT_length;
439 	}
440 
441 	if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) {
442 
443 		/* packet is IPv4 */
444 		if (connp->conn_family == AF_INET) {
445 			sin = sin_null;
446 			sin.sin_addr.s_addr = connp->conn_faddr_v4;
447 			sin.sin_port = connp->conn_fport;
448 			sin.sin_family = AF_INET;
449 			mp = mi_tpi_conn_con(NULL, (char *)&sin,
450 			    (int)sizeof (sin_t), optp, optlen);
451 		} else {
452 			sin6 = sin6_null;
453 			sin6.sin6_addr = connp->conn_faddr_v6;
454 			sin6.sin6_port = connp->conn_fport;
455 			sin6.sin6_family = AF_INET6;
456 			mp = mi_tpi_conn_con(NULL, (char *)&sin6,
457 			    (int)sizeof (sin6_t), optp, optlen);
458 
459 		}
460 	} else {
461 		ip6_t	*ip6h = (ip6_t *)iphdr;
462 
463 		ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION);
464 		ASSERT(connp->conn_family == AF_INET6);
465 		sin6 = sin6_null;
466 		sin6.sin6_addr = connp->conn_faddr_v6;
467 		sin6.sin6_port = connp->conn_fport;
468 		sin6.sin6_family = AF_INET6;
469 		sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
470 		mp = mi_tpi_conn_con(NULL, (char *)&sin6,
471 		    (int)sizeof (sin6_t), optp, optlen);
472 	}
473 
474 	if (!mp)
475 		return (B_FALSE);
476 
477 	mblk_copycred(mp, idmp);
478 
479 	if (defermp == NULL) {
480 		conn_t *connp = tcp->tcp_connp;
481 		if (IPCL_IS_NONSTR(connp)) {
482 			(*connp->conn_upcalls->su_connected)
483 			    (connp->conn_upper_handle, tcp->tcp_connid,
484 			    ira->ira_cred, ira->ira_cpid);
485 			freemsg(mp);
486 		} else {
487 			if (ira->ira_cred != NULL) {
488 				/* So that getpeerucred works for TPI sockfs */
489 				mblk_setcred(mp, ira->ira_cred, ira->ira_cpid);
490 			}
491 			putnext(connp->conn_rq, mp);
492 		}
493 	} else {
494 		*defermp = mp;
495 	}
496 
497 	if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
498 		tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
499 	return (B_TRUE);
500 }
501 
502 /*
503  * Successful connect request processing begins when our client passes
504  * a T_CONN_REQ message into tcp_wput(), which performs function calls into
505  * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream).
506  *
507  * After various error checks are completed, tcp_tpi_connect() lays
508  * the target address and port into the composite header template.
509  * Then we ask IP for information, including a source address if we didn't
510  * already have one. Finally we prepare to send the SYN packet, and then
511  * send up the T_OK_ACK reply message.
512  */
513 void
514 tcp_tpi_connect(tcp_t *tcp, mblk_t *mp)
515 {
516 	sin_t		*sin;
517 	struct T_conn_req	*tcr;
518 	struct sockaddr	*sa;
519 	socklen_t	len;
520 	int		error;
521 	cred_t		*cr;
522 	pid_t		cpid;
523 	conn_t		*connp = tcp->tcp_connp;
524 	queue_t		*q = connp->conn_wq;
525 
526 	/*
527 	 * All Solaris components should pass a db_credp
528 	 * for this TPI message, hence we ASSERT.
529 	 * But in case there is some other M_PROTO that looks
530 	 * like a TPI message sent by some other kernel
531 	 * component, we check and return an error.
532 	 */
533 	cr = msg_getcred(mp, &cpid);
534 	ASSERT(cr != NULL);
535 	if (cr == NULL) {
536 		tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
537 		return;
538 	}
539 
540 	tcr = (struct T_conn_req *)mp->b_rptr;
541 
542 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
543 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
544 		tcp_err_ack(tcp, mp, TPROTO, 0);
545 		return;
546 	}
547 
548 	/*
549 	 * Pre-allocate the T_ordrel_ind mblk so that at close time, we
550 	 * will always have that to send up.  Otherwise, we need to do
551 	 * special handling in case the allocation fails at that time.
552 	 * If the end point is TPI, the tcp_t can be reused and the
553 	 * tcp_ordrel_mp may be allocated already.
554 	 */
555 	if (tcp->tcp_ordrel_mp == NULL) {
556 		if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) {
557 			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
558 			return;
559 		}
560 	}
561 
562 	/*
563 	 * Determine packet type based on type of address passed in
564 	 * the request should contain an IPv4 or IPv6 address.
565 	 * Make sure that address family matches the type of
566 	 * family of the address passed down.
567 	 */
568 	switch (tcr->DEST_length) {
569 	default:
570 		tcp_err_ack(tcp, mp, TBADADDR, 0);
571 		return;
572 
573 	case (sizeof (sin_t) - sizeof (sin->sin_zero)): {
574 		/*
575 		 * XXX: The check for valid DEST_length was not there
576 		 * in earlier releases and some buggy
577 		 * TLI apps (e.g Sybase) got away with not feeding
578 		 * in sin_zero part of address.
579 		 * We allow that bug to keep those buggy apps humming.
580 		 * Test suites require the check on DEST_length.
581 		 * We construct a new mblk with valid DEST_length
582 		 * free the original so the rest of the code does
583 		 * not have to keep track of this special shorter
584 		 * length address case.
585 		 */
586 		mblk_t *nmp;
587 		struct T_conn_req *ntcr;
588 		sin_t *nsin;
589 
590 		nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) +
591 		    tcr->OPT_length, BPRI_HI);
592 		if (nmp == NULL) {
593 			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
594 			return;
595 		}
596 		ntcr = (struct T_conn_req *)nmp->b_rptr;
597 		bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */
598 		ntcr->PRIM_type = T_CONN_REQ;
599 		ntcr->DEST_length = sizeof (sin_t);
600 		ntcr->DEST_offset = sizeof (struct T_conn_req);
601 
602 		nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset);
603 		*nsin = sin_null;
604 		/* Get pointer to shorter address to copy from original mp */
605 		sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
606 		    tcr->DEST_length); /* extract DEST_length worth of sin_t */
607 		if (sin == NULL || !OK_32PTR((char *)sin)) {
608 			freemsg(nmp);
609 			tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
610 			return;
611 		}
612 		nsin->sin_family = sin->sin_family;
613 		nsin->sin_port = sin->sin_port;
614 		nsin->sin_addr = sin->sin_addr;
615 		/* Note:nsin->sin_zero zero-fill with sin_null assign above */
616 		nmp->b_wptr = (uchar_t *)&nsin[1];
617 		if (tcr->OPT_length != 0) {
618 			ntcr->OPT_length = tcr->OPT_length;
619 			ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr;
620 			bcopy((uchar_t *)tcr + tcr->OPT_offset,
621 			    (uchar_t *)ntcr + ntcr->OPT_offset,
622 			    tcr->OPT_length);
623 			nmp->b_wptr += tcr->OPT_length;
624 		}
625 		freemsg(mp);	/* original mp freed */
626 		mp = nmp;	/* re-initialize original variables */
627 		tcr = ntcr;
628 	}
629 	/* FALLTHRU */
630 
631 	case sizeof (sin_t):
632 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
633 		    sizeof (sin_t));
634 		len = sizeof (sin_t);
635 		break;
636 
637 	case sizeof (sin6_t):
638 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
639 		    sizeof (sin6_t));
640 		len = sizeof (sin6_t);
641 		break;
642 	}
643 
644 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
645 	if (error != 0) {
646 		tcp_err_ack(tcp, mp, TSYSERR, error);
647 		return;
648 	}
649 
650 	/*
651 	 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we
652 	 * should key on their sequence number and cut them loose.
653 	 */
654 
655 	/*
656 	 * If options passed in, feed it for verification and handling
657 	 */
658 	if (tcr->OPT_length != 0) {
659 		mblk_t	*ok_mp;
660 		mblk_t	*discon_mp;
661 		mblk_t  *conn_opts_mp;
662 		int t_error, sys_error, do_disconnect;
663 
664 		conn_opts_mp = NULL;
665 
666 		if (tcp_conprim_opt_process(tcp, mp,
667 		    &do_disconnect, &t_error, &sys_error) < 0) {
668 			if (do_disconnect) {
669 				ASSERT(t_error == 0 && sys_error == 0);
670 				discon_mp = mi_tpi_discon_ind(NULL,
671 				    ECONNREFUSED, 0);
672 				if (!discon_mp) {
673 					tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
674 					    TSYSERR, ENOMEM);
675 					return;
676 				}
677 				ok_mp = mi_tpi_ok_ack_alloc(mp);
678 				if (!ok_mp) {
679 					tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
680 					    TSYSERR, ENOMEM);
681 					return;
682 				}
683 				qreply(q, ok_mp);
684 				qreply(q, discon_mp); /* no flush! */
685 			} else {
686 				ASSERT(t_error != 0);
687 				tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error,
688 				    sys_error);
689 			}
690 			return;
691 		}
692 		/*
693 		 * Success in setting options, the mp option buffer represented
694 		 * by OPT_length/offset has been potentially modified and
695 		 * contains results of option processing. We copy it in
696 		 * another mp to save it for potentially influencing returning
697 		 * it in T_CONN_CONN.
698 		 */
699 		if (tcr->OPT_length != 0) { /* there are resulting options */
700 			conn_opts_mp = copyb(mp);
701 			if (!conn_opts_mp) {
702 				tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
703 				    TSYSERR, ENOMEM);
704 				return;
705 			}
706 			ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL);
707 			tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp;
708 			/*
709 			 * Note:
710 			 * These resulting option negotiation can include any
711 			 * end-to-end negotiation options but there no such
712 			 * thing (yet?) in our TCP/IP.
713 			 */
714 		}
715 	}
716 
717 	/* call the non-TPI version */
718 	error = tcp_do_connect(tcp->tcp_connp, sa, len, cr, cpid);
719 	if (error < 0) {
720 		mp = mi_tpi_err_ack_alloc(mp, -error, 0);
721 	} else if (error > 0) {
722 		mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
723 	} else {
724 		mp = mi_tpi_ok_ack_alloc(mp);
725 	}
726 
727 	/*
728 	 * Note: Code below is the "failure" case
729 	 */
730 	/* return error ack and blow away saved option results if any */
731 connect_failed:
732 	if (mp != NULL)
733 		putnext(connp->conn_rq, mp);
734 	else {
735 		tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
736 		    TSYSERR, ENOMEM);
737 	}
738 }
739 
740 /* Return the TPI/TLI equivalent of our current tcp_state */
741 static int
742 tcp_tpistate(tcp_t *tcp)
743 {
744 	switch (tcp->tcp_state) {
745 	case TCPS_IDLE:
746 		return (TS_UNBND);
747 	case TCPS_LISTEN:
748 		/*
749 		 * Return whether there are outstanding T_CONN_IND waiting
750 		 * for the matching T_CONN_RES. Therefore don't count q0.
751 		 */
752 		if (tcp->tcp_conn_req_cnt_q > 0)
753 			return (TS_WRES_CIND);
754 		else
755 			return (TS_IDLE);
756 	case TCPS_BOUND:
757 		return (TS_IDLE);
758 	case TCPS_SYN_SENT:
759 		return (TS_WCON_CREQ);
760 	case TCPS_SYN_RCVD:
761 		/*
762 		 * Note: assumption: this has to the active open SYN_RCVD.
763 		 * The passive instance is detached in SYN_RCVD stage of
764 		 * incoming connection processing so we cannot get request
765 		 * for T_info_ack on it.
766 		 */
767 		return (TS_WACK_CRES);
768 	case TCPS_ESTABLISHED:
769 		return (TS_DATA_XFER);
770 	case TCPS_CLOSE_WAIT:
771 		return (TS_WREQ_ORDREL);
772 	case TCPS_FIN_WAIT_1:
773 		return (TS_WIND_ORDREL);
774 	case TCPS_FIN_WAIT_2:
775 		return (TS_WIND_ORDREL);
776 
777 	case TCPS_CLOSING:
778 	case TCPS_LAST_ACK:
779 	case TCPS_TIME_WAIT:
780 	case TCPS_CLOSED:
781 		/*
782 		 * Following TS_WACK_DREQ7 is a rendition of "not
783 		 * yet TS_IDLE" TPI state. There is no best match to any
784 		 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we
785 		 * choose a value chosen that will map to TLI/XTI level
786 		 * state of TSTATECHNG (state is process of changing) which
787 		 * captures what this dummy state represents.
788 		 */
789 		return (TS_WACK_DREQ7);
790 	default:
791 		cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s",
792 		    tcp->tcp_state, tcp_display(tcp, NULL,
793 		    DISP_PORT_ONLY));
794 		return (TS_UNBND);
795 	}
796 }
797 
798 static void
799 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
800 {
801 	tcp_stack_t	*tcps = tcp->tcp_tcps;
802 	conn_t		*connp = tcp->tcp_connp;
803 	extern struct T_info_ack tcp_g_t_info_ack;
804 	extern struct T_info_ack tcp_g_t_info_ack_v6;
805 
806 	if (connp->conn_family == AF_INET6)
807 		*tia = tcp_g_t_info_ack_v6;
808 	else
809 		*tia = tcp_g_t_info_ack;
810 	tia->CURRENT_state = tcp_tpistate(tcp);
811 	tia->OPT_size = tcp_max_optsize;
812 	if (tcp->tcp_mss == 0) {
813 		/* Not yet set - tcp_open does not set mss */
814 		if (connp->conn_ipversion == IPV4_VERSION)
815 			tia->TIDU_size = tcps->tcps_mss_def_ipv4;
816 		else
817 			tia->TIDU_size = tcps->tcps_mss_def_ipv6;
818 	} else {
819 		tia->TIDU_size = tcp->tcp_mss;
820 	}
821 	/* TODO: Default ETSDU is 1.  Is that correct for tcp? */
822 }
823 
824 void
825 tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap,
826     t_uscalar_t cap_bits1)
827 {
828 	tcap->CAP_bits1 = 0;
829 
830 	if (cap_bits1 & TC1_INFO) {
831 		tcp_copy_info(&tcap->INFO_ack, tcp);
832 		tcap->CAP_bits1 |= TC1_INFO;
833 	}
834 
835 	if (cap_bits1 & TC1_ACCEPTOR_ID) {
836 		tcap->ACCEPTOR_id = tcp->tcp_acceptor_id;
837 		tcap->CAP_bits1 |= TC1_ACCEPTOR_ID;
838 	}
839 
840 }
841 
842 /*
843  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
844  * tcp_wput.  Much of the T_CAPABILITY_ACK information is copied from
845  * tcp_g_t_info_ack.  The current state of the stream is copied from
846  * tcp_state.
847  */
848 void
849 tcp_capability_req(tcp_t *tcp, mblk_t *mp)
850 {
851 	t_uscalar_t		cap_bits1;
852 	struct T_capability_ack	*tcap;
853 
854 	if (MBLKL(mp) < sizeof (struct T_capability_req)) {
855 		freemsg(mp);
856 		return;
857 	}
858 
859 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
860 
861 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
862 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
863 	if (mp == NULL)
864 		return;
865 
866 	tcap = (struct T_capability_ack *)mp->b_rptr;
867 	tcp_do_capability_ack(tcp, tcap, cap_bits1);
868 
869 	putnext(tcp->tcp_connp->conn_rq, mp);
870 }
871 
872 /*
873  * This routine responds to T_INFO_REQ messages.  It is called by tcp_wput.
874  * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack.
875  * The current state of the stream is copied from tcp_state.
876  */
877 void
878 tcp_info_req(tcp_t *tcp, mblk_t *mp)
879 {
880 	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
881 	    T_INFO_ACK);
882 	if (!mp) {
883 		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
884 		return;
885 	}
886 	tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp);
887 	putnext(tcp->tcp_connp->conn_rq, mp);
888 }
889 
890 /* Respond to the TPI addr request */
891 void
892 tcp_addr_req(tcp_t *tcp, mblk_t *mp)
893 {
894 	struct sockaddr *sa;
895 	mblk_t	*ackmp;
896 	struct T_addr_ack *taa;
897 	conn_t	*connp = tcp->tcp_connp;
898 	uint_t	addrlen;
899 
900 	/* Make it large enough for worst case */
901 	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
902 	    2 * sizeof (sin6_t), 1);
903 	if (ackmp == NULL) {
904 		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
905 		return;
906 	}
907 
908 	taa = (struct T_addr_ack *)ackmp->b_rptr;
909 
910 	bzero(taa, sizeof (struct T_addr_ack));
911 	ackmp->b_wptr = (uchar_t *)&taa[1];
912 
913 	taa->PRIM_type = T_ADDR_ACK;
914 	ackmp->b_datap->db_type = M_PCPROTO;
915 
916 	if (connp->conn_family == AF_INET)
917 		addrlen = sizeof (sin_t);
918 	else
919 		addrlen = sizeof (sin6_t);
920 
921 	/*
922 	 * Note: Following code assumes 32 bit alignment of basic
923 	 * data structures like sin_t and struct T_addr_ack.
924 	 */
925 	if (tcp->tcp_state >= TCPS_BOUND) {
926 		/*
927 		 * Fill in local address first
928 		 */
929 		taa->LOCADDR_offset = sizeof (*taa);
930 		taa->LOCADDR_length = addrlen;
931 		sa = (struct sockaddr *)&taa[1];
932 		(void) conn_getsockname(connp, sa, &addrlen);
933 		ackmp->b_wptr += addrlen;
934 	}
935 	if (tcp->tcp_state >= TCPS_SYN_RCVD) {
936 		/*
937 		 * Fill in Remote address
938 		 */
939 		taa->REMADDR_length = addrlen;
940 		/* assumed 32-bit alignment */
941 		taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
942 		sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
943 		(void) conn_getpeername(connp, sa, &addrlen);
944 		ackmp->b_wptr += addrlen;
945 	}
946 	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
947 	putnext(tcp->tcp_connp->conn_rq, ackmp);
948 }
949 
950 /*
951  * Swap information between the eager and acceptor for a TLI/XTI client.
952  * The sockfs accept is done on the acceptor stream and control goes
953  * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not
954  * called. In either case, both the eager and listener are in their own
955  * perimeter (squeue) and the code has to deal with potential race.
956  *
957  * See the block comment on top of tcp_accept() and tcp_tli_accept().
958  */
959 static void
960 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
961 {
962 	conn_t	*econnp, *aconnp;
963 
964 	ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq);
965 	ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
966 	ASSERT(!TCP_IS_SOCKET(acceptor));
967 	ASSERT(!TCP_IS_SOCKET(eager));
968 	ASSERT(!TCP_IS_SOCKET(listener));
969 
970 	/*
971 	 * Trusted Extensions may need to use a security label that is
972 	 * different from the acceptor's label on MLP and MAC-Exempt
973 	 * sockets. If this is the case, the required security label
974 	 * already exists in econnp->conn_ixa->ixa_tsl. Since we make the
975 	 * acceptor stream refer to econnp we atomatically get that label.
976 	 */
977 
978 	acceptor->tcp_detached = B_TRUE;
979 	/*
980 	 * To permit stream re-use by TLI/XTI, the eager needs a copy of
981 	 * the acceptor id.
982 	 */
983 	eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;
984 
985 	/* remove eager from listen list... */
986 	mutex_enter(&listener->tcp_eager_lock);
987 	tcp_eager_unlink(eager);
988 	ASSERT(eager->tcp_eager_next_q == NULL &&
989 	    eager->tcp_eager_last_q == NULL);
990 	ASSERT(eager->tcp_eager_next_q0 == NULL &&
991 	    eager->tcp_eager_prev_q0 == NULL);
992 	mutex_exit(&listener->tcp_eager_lock);
993 
994 	econnp = eager->tcp_connp;
995 	aconnp = acceptor->tcp_connp;
996 	econnp->conn_rq = aconnp->conn_rq;
997 	econnp->conn_wq = aconnp->conn_wq;
998 	econnp->conn_rq->q_ptr = econnp;
999 	econnp->conn_wq->q_ptr = econnp;
1000 
1001 	/*
1002 	 * In the TLI/XTI loopback case, we are inside the listener's squeue,
1003 	 * which might be a different squeue from our peer TCP instance.
1004 	 * For TCP Fusion, the peer expects that whenever tcp_detached is
1005 	 * clear, our TCP queues point to the acceptor's queues.  Thus, use
1006 	 * membar_producer() to ensure that the assignments of conn_rq/conn_wq
1007 	 * above reach global visibility prior to the clearing of tcp_detached.
1008 	 */
1009 	membar_producer();
1010 	eager->tcp_detached = B_FALSE;
1011 
1012 	ASSERT(eager->tcp_ack_tid == 0);
1013 
1014 	econnp->conn_dev = aconnp->conn_dev;
1015 	econnp->conn_minor_arena = aconnp->conn_minor_arena;
1016 
1017 	ASSERT(econnp->conn_minor_arena != NULL);
1018 	if (econnp->conn_cred != NULL)
1019 		crfree(econnp->conn_cred);
1020 	econnp->conn_cred = aconnp->conn_cred;
1021 	ASSERT(!(econnp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
1022 	econnp->conn_ixa->ixa_cred = econnp->conn_cred;
1023 	aconnp->conn_cred = NULL;
1024 	econnp->conn_cpid = aconnp->conn_cpid;
1025 	ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
1026 	ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);
1027 
1028 	econnp->conn_zoneid = aconnp->conn_zoneid;
1029 	econnp->conn_allzones = aconnp->conn_allzones;
1030 	econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid;
1031 
1032 	econnp->conn_mac_mode = aconnp->conn_mac_mode;
1033 	econnp->conn_zone_is_global = aconnp->conn_zone_is_global;
1034 	aconnp->conn_mac_mode = CONN_MAC_DEFAULT;
1035 
1036 	/* Do the IPC initialization */
1037 	CONN_INC_REF(econnp);
1038 
1039 	/* Done with old IPC. Drop its ref on its connp */
1040 	CONN_DEC_REF(aconnp);
1041 }
1042 
1043 /*
1044  * This runs at the tail end of accept processing on the squeue of the
1045  * new connection.
1046  */
1047 /* ARGSUSED */
1048 static void
1049 tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1050 {
1051 	conn_t			*connp = (conn_t *)arg;
1052 	tcp_t			*tcp = connp->conn_tcp;
1053 	queue_t			*q = connp->conn_rq;
1054 	tcp_stack_t		*tcps = tcp->tcp_tcps;
1055 	struct stroptions 	*stropt;
1056 	struct sock_proto_props sopp;
1057 
1058 	/* Should never be called for non-STREAMS sockets */
1059 	ASSERT(!IPCL_IS_NONSTR(connp));
1060 
1061 	/* We should just receive a single mblk that fits a T_discon_ind */
1062 	ASSERT(mp->b_cont == NULL);
1063 
1064 	/*
1065 	 * Drop the eager's ref on the listener, that was placed when
1066 	 * this eager began life in tcp_input_listener.
1067 	 */
1068 	CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
1069 
1070 	tcp->tcp_detached = B_FALSE;
1071 
1072 	if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) {
1073 		/*
1074 		 * Someone blewoff the eager before we could finish
1075 		 * the accept.
1076 		 *
1077 		 * The only reason eager exists it because we put in
1078 		 * a ref on it when conn ind went up. We need to send
1079 		 * a disconnect indication up while the last reference
1080 		 * on the eager will be dropped by the squeue when we
1081 		 * return.
1082 		 */
1083 		ASSERT(tcp->tcp_listener == NULL);
1084 		if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) {
1085 			struct	T_discon_ind	*tdi;
1086 
1087 			(void) putnextctl1(q, M_FLUSH, FLUSHRW);
1088 			/*
1089 			 * Let us reuse the incoming mblk to avoid
1090 			 * memory allocation failure problems. We know
1091 			 * that the size of the incoming mblk i.e.
1092 			 * stroptions is greater than sizeof
1093 			 * T_discon_ind.
1094 			 */
1095 			ASSERT(DB_REF(mp) == 1);
1096 			ASSERT(MBLKSIZE(mp) >=
1097 			    sizeof (struct T_discon_ind));
1098 
1099 			DB_TYPE(mp) = M_PROTO;
1100 			((union T_primitives *)mp->b_rptr)->type =
1101 			    T_DISCON_IND;
1102 			tdi = (struct T_discon_ind *)mp->b_rptr;
1103 			if (tcp->tcp_issocket) {
1104 				tdi->DISCON_reason = ECONNREFUSED;
1105 				tdi->SEQ_number = 0;
1106 			} else {
1107 				tdi->DISCON_reason = ENOPROTOOPT;
1108 				tdi->SEQ_number =
1109 				    tcp->tcp_conn_req_seqnum;
1110 			}
1111 			mp->b_wptr = mp->b_rptr +
1112 			    sizeof (struct T_discon_ind);
1113 			putnext(q, mp);
1114 		}
1115 		tcp->tcp_hard_binding = B_FALSE;
1116 		return;
1117 	}
1118 
1119 	/*
1120 	 * This is the first time we run on the correct
1121 	 * queue after tcp_accept. So fix all the q parameters
1122 	 * here.
1123 	 *
1124 	 * Let us reuse the incoming mblk to avoid
1125 	 * memory allocation failure problems. We know
1126 	 * that the size of the incoming mblk is at least
1127 	 * stroptions
1128 	 */
1129 	tcp_get_proto_props(tcp, &sopp);
1130 
1131 	ASSERT(DB_REF(mp) == 1);
1132 	ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions));
1133 
1134 	DB_TYPE(mp) = M_SETOPTS;
1135 	stropt = (struct stroptions *)mp->b_rptr;
1136 	mp->b_wptr = mp->b_rptr + sizeof (struct stroptions);
1137 	stropt = (struct stroptions *)mp->b_rptr;
1138 	ASSERT(sopp.sopp_flags & (SO_HIWAT|SO_WROFF|SO_MAXBLK));
1139 	stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
1140 	stropt->so_hiwat = sopp.sopp_rxhiwat;
1141 	stropt->so_wroff = sopp.sopp_wroff;
1142 	stropt->so_maxblk = sopp.sopp_maxblk;
1143 
1144 	if (sopp.sopp_flags & SOCKOPT_TAIL) {
1145 		ASSERT(tcp->tcp_kssl_ctx != NULL);
1146 
1147 		stropt->so_flags |= SO_TAIL | SO_COPYOPT;
1148 		stropt->so_tail = sopp.sopp_tail;
1149 		stropt->so_copyopt = sopp.sopp_zcopyflag;
1150 	}
1151 
1152 	/* Send the options up */
1153 	putnext(q, mp);
1154 
1155 	/*
1156 	 * Pass up any data and/or a fin that has been received.
1157 	 *
1158 	 * Adjust receive window in case it had decreased
1159 	 * (because there is data <=> tcp_rcv_list != NULL)
1160 	 * while the connection was detached. Note that
1161 	 * in case the eager was flow-controlled, w/o this
1162 	 * code, the rwnd may never open up again!
1163 	 */
1164 	if (tcp->tcp_rcv_list != NULL) {
1165 		/* We drain directly in case of fused tcp loopback */
1166 
1167 		if (!tcp->tcp_fused && canputnext(q)) {
1168 			tcp->tcp_rwnd = connp->conn_rcvbuf;
1169 			if (tcp->tcp_state >= TCPS_ESTABLISHED &&
1170 			    tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
1171 				tcp_xmit_ctl(NULL,
1172 				    tcp, (tcp->tcp_swnd == 0) ?
1173 				    tcp->tcp_suna : tcp->tcp_snxt,
1174 				    tcp->tcp_rnxt, TH_ACK);
1175 			}
1176 		}
1177 
1178 		(void) tcp_rcv_drain(tcp);
1179 
1180 		/*
1181 		 * For fused tcp loopback, back-enable peer endpoint
1182 		 * if it's currently flow-controlled.
1183 		 */
1184 		if (tcp->tcp_fused) {
1185 			tcp_t *peer_tcp = tcp->tcp_loopback_peer;
1186 
1187 			ASSERT(peer_tcp != NULL);
1188 			ASSERT(peer_tcp->tcp_fused);
1189 
1190 			mutex_enter(&peer_tcp->tcp_non_sq_lock);
1191 			if (peer_tcp->tcp_flow_stopped) {
1192 				tcp_clrqfull(peer_tcp);
1193 				TCP_STAT(tcps, tcp_fusion_backenabled);
1194 			}
1195 			mutex_exit(&peer_tcp->tcp_non_sq_lock);
1196 		}
1197 	}
1198 	ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
1199 	if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
1200 		tcp->tcp_ordrel_done = B_TRUE;
1201 		mp = tcp->tcp_ordrel_mp;
1202 		tcp->tcp_ordrel_mp = NULL;
1203 		putnext(q, mp);
1204 	}
1205 	tcp->tcp_hard_binding = B_FALSE;
1206 
1207 	if (connp->conn_keepalive) {
1208 		tcp->tcp_ka_last_intrvl = 0;
1209 		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1210 		    tcp->tcp_ka_interval);
1211 	}
1212 
1213 	/*
1214 	 * At this point, eager is fully established and will
1215 	 * have the following references -
1216 	 *
1217 	 * 2 references for connection to exist (1 for TCP and 1 for IP).
1218 	 * 1 reference for the squeue which will be dropped by the squeue as
1219 	 *	soon as this function returns.
1220 	 * There will be 1 additonal reference for being in classifier
1221 	 *	hash list provided something bad hasn't happened.
1222 	 */
1223 	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
1224 	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
1225 }
1226 
1227 
1228 /*
1229  * Reply to a clients T_CONN_RES TPI message. This function
1230  * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
1231  * on the acceptor STREAM and processed in tcp_accept_common().
1232  * Read the block comment on top of tcp_input_listener().
1233  */
1234 void
1235 tcp_tli_accept(tcp_t *listener, mblk_t *mp)
1236 {
1237 	tcp_t		*acceptor;
1238 	tcp_t		*eager;
1239 	tcp_t   	*tcp;
1240 	struct T_conn_res	*tcr;
1241 	t_uscalar_t	acceptor_id;
1242 	t_scalar_t	seqnum;
1243 	mblk_t		*discon_mp = NULL;
1244 	mblk_t		*ok_mp;
1245 	mblk_t		*mp1;
1246 	tcp_stack_t	*tcps = listener->tcp_tcps;
1247 	conn_t		*econnp;
1248 
1249 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
1250 		tcp_err_ack(listener, mp, TPROTO, 0);
1251 		return;
1252 	}
1253 	tcr = (struct T_conn_res *)mp->b_rptr;
1254 
1255 	/*
1256 	 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
1257 	 * read side queue of the streams device underneath us i.e. the
1258 	 * read side queue of 'ip'. Since we can't deference QUEUE_ptr we
1259 	 * look it up in the queue_hash.  Under LP64 it sends down the
1260 	 * minor_t of the accepting endpoint.
1261 	 *
1262 	 * Once the acceptor/eager are modified (in tcp_accept_swap) the
1263 	 * fanout hash lock is held.
1264 	 * This prevents any thread from entering the acceptor queue from
1265 	 * below (since it has not been hard bound yet i.e. any inbound
1266 	 * packets will arrive on the listener conn_t and
1267 	 * go through the classifier).
1268 	 * The CONN_INC_REF will prevent the acceptor from closing.
1269 	 *
1270 	 * XXX It is still possible for a tli application to send down data
1271 	 * on the accepting stream while another thread calls t_accept.
1272 	 * This should not be a problem for well-behaved applications since
1273 	 * the T_OK_ACK is sent after the queue swapping is completed.
1274 	 *
1275 	 * If the accepting fd is the same as the listening fd, avoid
1276 	 * queue hash lookup since that will return an eager listener in a
1277 	 * already established state.
1278 	 */
1279 	acceptor_id = tcr->ACCEPTOR_id;
1280 	mutex_enter(&listener->tcp_eager_lock);
1281 	if (listener->tcp_acceptor_id == acceptor_id) {
1282 		eager = listener->tcp_eager_next_q;
1283 		/* only count how many T_CONN_INDs so don't count q0 */
1284 		if ((listener->tcp_conn_req_cnt_q != 1) ||
1285 		    (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) {
1286 			mutex_exit(&listener->tcp_eager_lock);
1287 			tcp_err_ack(listener, mp, TBADF, 0);
1288 			return;
1289 		}
1290 		if (listener->tcp_conn_req_cnt_q0 != 0) {
1291 			/* Throw away all the eagers on q0. */
1292 			tcp_eager_cleanup(listener, 1);
1293 		}
1294 		if (listener->tcp_syn_defense) {
1295 			listener->tcp_syn_defense = B_FALSE;
1296 			if (listener->tcp_ip_addr_cache != NULL) {
1297 				kmem_free(listener->tcp_ip_addr_cache,
1298 				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1299 				listener->tcp_ip_addr_cache = NULL;
1300 			}
1301 		}
1302 		/*
1303 		 * Transfer tcp_conn_req_max to the eager so that when
1304 		 * a disconnect occurs we can revert the endpoint to the
1305 		 * listen state.
1306 		 */
1307 		eager->tcp_conn_req_max = listener->tcp_conn_req_max;
1308 		ASSERT(listener->tcp_conn_req_cnt_q0 == 0);
1309 		/*
1310 		 * Get a reference on the acceptor just like the
1311 		 * tcp_acceptor_hash_lookup below.
1312 		 */
1313 		acceptor = listener;
1314 		CONN_INC_REF(acceptor->tcp_connp);
1315 	} else {
1316 		acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps);
1317 		if (acceptor == NULL) {
1318 			if (listener->tcp_connp->conn_debug) {
1319 				(void) strlog(TCP_MOD_ID, 0, 1,
1320 				    SL_ERROR|SL_TRACE,
1321 				    "tcp_accept: did not find acceptor 0x%x\n",
1322 				    acceptor_id);
1323 			}
1324 			mutex_exit(&listener->tcp_eager_lock);
1325 			tcp_err_ack(listener, mp, TPROVMISMATCH, 0);
1326 			return;
1327 		}
1328 		/*
1329 		 * Verify acceptor state. The acceptable states for an acceptor
1330 		 * include TCPS_IDLE and TCPS_BOUND.
1331 		 */
1332 		switch (acceptor->tcp_state) {
1333 		case TCPS_IDLE:
1334 			/* FALLTHRU */
1335 		case TCPS_BOUND:
1336 			break;
1337 		default:
1338 			CONN_DEC_REF(acceptor->tcp_connp);
1339 			mutex_exit(&listener->tcp_eager_lock);
1340 			tcp_err_ack(listener, mp, TOUTSTATE, 0);
1341 			return;
1342 		}
1343 	}
1344 
1345 	/* The listener must be in TCPS_LISTEN */
1346 	if (listener->tcp_state != TCPS_LISTEN) {
1347 		CONN_DEC_REF(acceptor->tcp_connp);
1348 		mutex_exit(&listener->tcp_eager_lock);
1349 		tcp_err_ack(listener, mp, TOUTSTATE, 0);
1350 		return;
1351 	}
1352 
1353 	/*
1354 	 * Rendezvous with an eager connection request packet hanging off
1355 	 * 'tcp' that has the 'seqnum' tag.  We tagged the detached open
1356 	 * tcp structure when the connection packet arrived in
1357 	 * tcp_input_listener().
1358 	 */
1359 	seqnum = tcr->SEQ_number;
1360 	eager = listener;
1361 	do {
1362 		eager = eager->tcp_eager_next_q;
1363 		if (eager == NULL) {
1364 			CONN_DEC_REF(acceptor->tcp_connp);
1365 			mutex_exit(&listener->tcp_eager_lock);
1366 			tcp_err_ack(listener, mp, TBADSEQ, 0);
1367 			return;
1368 		}
1369 	} while (eager->tcp_conn_req_seqnum != seqnum);
1370 	mutex_exit(&listener->tcp_eager_lock);
1371 
1372 	/*
1373 	 * At this point, both acceptor and listener have 2 ref
1374 	 * that they begin with. Acceptor has one additional ref
1375 	 * we placed in lookup while listener has 3 additional
1376 	 * ref for being behind the squeue (tcp_accept() is
1377 	 * done on listener's squeue); being in classifier hash;
1378 	 * and eager's ref on listener.
1379 	 */
1380 	ASSERT(listener->tcp_connp->conn_ref >= 5);
1381 	ASSERT(acceptor->tcp_connp->conn_ref >= 3);
1382 
1383 	/*
1384 	 * The eager at this point is set in its own squeue and
1385 	 * could easily have been killed (tcp_accept_finish will
1386 	 * deal with that) because of a TH_RST so we can only
1387 	 * ASSERT for a single ref.
1388 	 */
1389 	ASSERT(eager->tcp_connp->conn_ref >= 1);
1390 
1391 	/*
1392 	 * Pre allocate the discon_ind mblk also. tcp_accept_finish will
1393 	 * use it if something failed.
1394 	 */
1395 	discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
1396 	    sizeof (struct stroptions)), BPRI_HI);
1397 	if (discon_mp == NULL) {
1398 		CONN_DEC_REF(acceptor->tcp_connp);
1399 		CONN_DEC_REF(eager->tcp_connp);
1400 		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
1401 		return;
1402 	}
1403 
1404 	econnp = eager->tcp_connp;
1405 
1406 	/* Hold a copy of mp, in case reallocb fails */
1407 	if ((mp1 = copymsg(mp)) == NULL) {
1408 		CONN_DEC_REF(acceptor->tcp_connp);
1409 		CONN_DEC_REF(eager->tcp_connp);
1410 		freemsg(discon_mp);
1411 		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
1412 		return;
1413 	}
1414 
1415 	tcr = (struct T_conn_res *)mp1->b_rptr;
1416 
1417 	/*
1418 	 * This is an expanded version of mi_tpi_ok_ack_alloc()
1419 	 * which allocates a larger mblk and appends the new
1420 	 * local address to the ok_ack.  The address is copied by
1421 	 * soaccept() for getsockname().
1422 	 */
1423 	{
1424 		int extra;
1425 
1426 		extra = (econnp->conn_family == AF_INET) ?
1427 		    sizeof (sin_t) : sizeof (sin6_t);
1428 
1429 		/*
1430 		 * Try to re-use mp, if possible.  Otherwise, allocate
1431 		 * an mblk and return it as ok_mp.  In any case, mp
1432 		 * is no longer usable upon return.
1433 		 */
1434 		if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
1435 			CONN_DEC_REF(acceptor->tcp_connp);
1436 			CONN_DEC_REF(eager->tcp_connp);
1437 			freemsg(discon_mp);
1438 			/* Original mp has been freed by now, so use mp1 */
1439 			tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
1440 			return;
1441 		}
1442 
1443 		mp = NULL;	/* We should never use mp after this point */
1444 
1445 		switch (extra) {
1446 		case sizeof (sin_t): {
1447 			sin_t *sin = (sin_t *)ok_mp->b_wptr;
1448 
1449 			ok_mp->b_wptr += extra;
1450 			sin->sin_family = AF_INET;
1451 			sin->sin_port = econnp->conn_lport;
1452 			sin->sin_addr.s_addr = econnp->conn_laddr_v4;
1453 			break;
1454 		}
1455 		case sizeof (sin6_t): {
1456 			sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;
1457 
1458 			ok_mp->b_wptr += extra;
1459 			sin6->sin6_family = AF_INET6;
1460 			sin6->sin6_port = econnp->conn_lport;
1461 			sin6->sin6_addr = econnp->conn_laddr_v6;
1462 			sin6->sin6_flowinfo = econnp->conn_flowinfo;
1463 			if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
1464 			    (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
1465 				sin6->sin6_scope_id =
1466 				    econnp->conn_ixa->ixa_scopeid;
1467 			} else {
1468 				sin6->sin6_scope_id = 0;
1469 			}
1470 			sin6->__sin6_src_id = 0;
1471 			break;
1472 		}
1473 		default:
1474 			break;
1475 		}
1476 		ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim);
1477 	}
1478 
1479 	/*
1480 	 * If there are no options we know that the T_CONN_RES will
1481 	 * succeed. However, we can't send the T_OK_ACK upstream until
1482 	 * the tcp_accept_swap is done since it would be dangerous to
1483 	 * let the application start using the new fd prior to the swap.
1484 	 */
1485 	tcp_accept_swap(listener, acceptor, eager);
1486 
1487 	/*
1488 	 * tcp_accept_swap unlinks eager from listener but does not drop
1489 	 * the eager's reference on the listener.
1490 	 */
1491 	ASSERT(eager->tcp_listener == NULL);
1492 	ASSERT(listener->tcp_connp->conn_ref >= 5);
1493 
1494 	/*
1495 	 * The eager is now associated with its own queue. Insert in
1496 	 * the hash so that the connection can be reused for a future
1497 	 * T_CONN_RES.
1498 	 */
1499 	tcp_acceptor_hash_insert(acceptor_id, eager);
1500 
1501 	/*
1502 	 * We now do the processing of options with T_CONN_RES.
1503 	 * We delay till now since we wanted to have queue to pass to
1504 	 * option processing routines that points back to the right
1505 	 * instance structure which does not happen until after
1506 	 * tcp_accept_swap().
1507 	 *
1508 	 * Note:
1509 	 * The sanity of the logic here assumes that whatever options
1510 	 * are appropriate to inherit from listner=>eager are done
1511 	 * before this point, and whatever were to be overridden (or not)
1512 	 * in transfer logic from eager=>acceptor in tcp_accept_swap().
1513 	 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
1514 	 *   before its ACCEPTOR_id comes down in T_CONN_RES ]
1515 	 * This may not be true at this point in time but can be fixed
1516 	 * independently. This option processing code starts with
1517 	 * the instantiated acceptor instance and the final queue at
1518 	 * this point.
1519 	 */
1520 
1521 	if (tcr->OPT_length != 0) {
1522 		/* Options to process */
1523 		int t_error = 0;
1524 		int sys_error = 0;
1525 		int do_disconnect = 0;
1526 
1527 		if (tcp_conprim_opt_process(eager, mp1,
1528 		    &do_disconnect, &t_error, &sys_error) < 0) {
1529 			eager->tcp_accept_error = 1;
1530 			if (do_disconnect) {
1531 				/*
1532 				 * An option failed which does not allow
1533 				 * connection to be accepted.
1534 				 *
1535 				 * We allow T_CONN_RES to succeed and
1536 				 * put a T_DISCON_IND on the eager queue.
1537 				 */
1538 				ASSERT(t_error == 0 && sys_error == 0);
1539 				eager->tcp_send_discon_ind = 1;
1540 			} else {
1541 				ASSERT(t_error != 0);
1542 				freemsg(ok_mp);
1543 				/*
1544 				 * Original mp was either freed or set
1545 				 * to ok_mp above, so use mp1 instead.
1546 				 */
1547 				tcp_err_ack(listener, mp1, t_error, sys_error);
1548 				goto finish;
1549 			}
1550 		}
1551 		/*
1552 		 * Most likely success in setting options (except if
1553 		 * eager->tcp_send_discon_ind set).
1554 		 * mp1 option buffer represented by OPT_length/offset
1555 		 * potentially modified and contains results of setting
1556 		 * options at this point
1557 		 */
1558 	}
1559 
1560 	/* We no longer need mp1, since all options processing has passed */
1561 	freemsg(mp1);
1562 
1563 	putnext(listener->tcp_connp->conn_rq, ok_mp);
1564 
1565 	mutex_enter(&listener->tcp_eager_lock);
1566 	if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
1567 		tcp_t	*tail;
1568 		mblk_t	*conn_ind;
1569 
1570 		/*
1571 		 * This path should not be executed if listener and
1572 		 * acceptor streams are the same.
1573 		 */
1574 		ASSERT(listener != acceptor);
1575 
1576 		tcp = listener->tcp_eager_prev_q0;
1577 		/*
1578 		 * listener->tcp_eager_prev_q0 points to the TAIL of the
1579 		 * deferred T_conn_ind queue. We need to get to the head of
1580 		 * the queue in order to send up T_conn_ind the same order as
1581 		 * how the 3WHS is completed.
1582 		 */
1583 		while (tcp != listener) {
1584 			if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0)
1585 				break;
1586 			else
1587 				tcp = tcp->tcp_eager_prev_q0;
1588 		}
1589 		ASSERT(tcp != listener);
1590 		conn_ind = tcp->tcp_conn.tcp_eager_conn_ind;
1591 		ASSERT(conn_ind != NULL);
1592 		tcp->tcp_conn.tcp_eager_conn_ind = NULL;
1593 
1594 		/* Move from q0 to q */
1595 		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1596 		listener->tcp_conn_req_cnt_q0--;
1597 		listener->tcp_conn_req_cnt_q++;
1598 		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
1599 		    tcp->tcp_eager_prev_q0;
1600 		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
1601 		    tcp->tcp_eager_next_q0;
1602 		tcp->tcp_eager_prev_q0 = NULL;
1603 		tcp->tcp_eager_next_q0 = NULL;
1604 		tcp->tcp_conn_def_q0 = B_FALSE;
1605 
1606 		/* Make sure the tcp isn't in the list of droppables */
1607 		ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
1608 		    tcp->tcp_eager_prev_drop_q0 == NULL);
1609 
1610 		/*
1611 		 * Insert at end of the queue because sockfs sends
1612 		 * down T_CONN_RES in chronological order. Leaving
1613 		 * the older conn indications at front of the queue
1614 		 * helps reducing search time.
1615 		 */
1616 		tail = listener->tcp_eager_last_q;
1617 		if (tail != NULL)
1618 			tail->tcp_eager_next_q = tcp;
1619 		else
1620 			listener->tcp_eager_next_q = tcp;
1621 		listener->tcp_eager_last_q = tcp;
1622 		tcp->tcp_eager_next_q = NULL;
1623 		mutex_exit(&listener->tcp_eager_lock);
1624 		putnext(tcp->tcp_connp->conn_rq, conn_ind);
1625 	} else {
1626 		mutex_exit(&listener->tcp_eager_lock);
1627 	}
1628 
1629 	/*
1630 	 * Done with the acceptor - free it
1631 	 *
1632 	 * Note: from this point on, no access to listener should be made
1633 	 * as listener can be equal to acceptor.
1634 	 */
1635 finish:
1636 	ASSERT(acceptor->tcp_detached);
1637 	acceptor->tcp_connp->conn_rq = NULL;
1638 	ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp));
1639 	acceptor->tcp_connp->conn_wq = NULL;
1640 	(void) tcp_clean_death(acceptor, 0);
1641 	CONN_DEC_REF(acceptor->tcp_connp);
1642 
1643 	/*
1644 	 * We pass discon_mp to tcp_accept_finish to get on the right squeue.
1645 	 *
1646 	 * It will update the setting for sockfs/stream head and also take
1647 	 * care of any data that arrived before accept() wad called.
1648 	 * In case we already received a FIN then tcp_accept_finish will send up
1649 	 * the ordrel. It will also send up a window update if the window
1650 	 * has opened up.
1651 	 */
1652 
1653 	/*
1654 	 * XXX: we currently have a problem if XTI application closes the
1655 	 * acceptor stream in between. This problem exists in on10-gate also
1656 	 * and is well know but nothing can be done short of major rewrite
1657 	 * to fix it. Now it is possible to take care of it by assigning TLI/XTI
1658 	 * eager same squeue as listener (we can distinguish non socket
1659 	 * listeners at the time of handling a SYN in tcp_input_listener)
1660 	 * and do most of the work that tcp_accept_finish does here itself
1661 	 * and then get behind the acceptor squeue to access the acceptor
1662 	 * queue.
1663 	 */
1664 	/*
1665 	 * We already have a ref on tcp so no need to do one before squeue_enter
1666 	 */
1667 	SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp,
1668 	    tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL,
1669 	    SQTAG_TCP_ACCEPT_FINISH);
1670 }
1671 
1672 
1673 /*
1674  * This is the STREAMS entry point for T_CONN_RES coming down on
1675  * Acceptor STREAM when  sockfs listener does accept processing.
1676  * Read the block comment on top of tcp_input_listener().
1677  */
1678 void
1679 tcp_tpi_accept(queue_t *q, mblk_t *mp)
1680 {
1681 	queue_t *rq = RD(q);
1682 	struct T_conn_res *conn_res;
1683 	tcp_t *eager;
1684 	tcp_t *listener;
1685 	struct T_ok_ack *ok;
1686 	t_scalar_t PRIM_type;
1687 	mblk_t *discon_mp;
1688 	conn_t *econnp;
1689 	cred_t *cr;
1690 
1691 	ASSERT(DB_TYPE(mp) == M_PROTO);
1692 
1693 	/*
1694 	 * All Solaris components should pass a db_credp
1695 	 * for this TPI message, hence we ASSERT.
1696 	 * But in case there is some other M_PROTO that looks
1697 	 * like a TPI message sent by some other kernel
1698 	 * component, we check and return an error.
1699 	 */
1700 	cr = msg_getcred(mp, NULL);
1701 	ASSERT(cr != NULL);
1702 	if (cr == NULL) {
1703 		mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL);
1704 		if (mp != NULL)
1705 			putnext(rq, mp);
1706 		return;
1707 	}
1708 	conn_res = (struct T_conn_res *)mp->b_rptr;
1709 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
1710 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) {
1711 		mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
1712 		if (mp != NULL)
1713 			putnext(rq, mp);
1714 		return;
1715 	}
1716 	switch (conn_res->PRIM_type) {
1717 	case O_T_CONN_RES:
1718 	case T_CONN_RES:
1719 		/*
1720 		 * We pass up an err ack if allocb fails. This will
1721 		 * cause sockfs to issue a T_DISCON_REQ which will cause
1722 		 * tcp_eager_blowoff to be called. sockfs will then call
1723 		 * rq->q_qinfo->qi_qclose to cleanup the acceptor stream.
1724 		 * we need to do the allocb up here because we have to
1725 		 * make sure rq->q_qinfo->qi_qclose still points to the
1726 		 * correct function (tcp_tpi_close_accept) in case allocb
1727 		 * fails.
1728 		 */
1729 		bcopy(mp->b_rptr + conn_res->OPT_offset,
1730 		    &eager, conn_res->OPT_length);
1731 		PRIM_type = conn_res->PRIM_type;
1732 		mp->b_datap->db_type = M_PCPROTO;
1733 		mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack);
1734 		ok = (struct T_ok_ack *)mp->b_rptr;
1735 		ok->PRIM_type = T_OK_ACK;
1736 		ok->CORRECT_prim = PRIM_type;
1737 		econnp = eager->tcp_connp;
1738 		econnp->conn_dev = (dev_t)RD(q)->q_ptr;
1739 		econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr);
1740 		econnp->conn_rq = rq;
1741 		econnp->conn_wq = q;
1742 		rq->q_ptr = econnp;
1743 		rq->q_qinfo = &tcp_rinitv4;	/* No open - same as rinitv6 */
1744 		q->q_ptr = econnp;
1745 		q->q_qinfo = &tcp_winit;
1746 		listener = eager->tcp_listener;
1747 
1748 		/*
1749 		 * Pre allocate the discon_ind mblk also. tcp_accept_finish will
1750 		 * use it if something failed.
1751 		 */
1752 		discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
1753 		    sizeof (struct stroptions)), BPRI_HI);
1754 
1755 		if (discon_mp == NULL) {
1756 			mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
1757 			if (mp != NULL)
1758 				putnext(rq, mp);
1759 			return;
1760 		}
1761 
1762 		eager->tcp_issocket = B_TRUE;
1763 
1764 		ASSERT(econnp->conn_netstack ==
1765 		    listener->tcp_connp->conn_netstack);
1766 		ASSERT(eager->tcp_tcps == listener->tcp_tcps);
1767 
1768 		/* Put the ref for IP */
1769 		CONN_INC_REF(econnp);
1770 
1771 		/*
1772 		 * We should have minimum of 3 references on the conn
1773 		 * at this point. One each for TCP and IP and one for
1774 		 * the T_conn_ind that was sent up when the 3-way handshake
1775 		 * completed. In the normal case we would also have another
1776 		 * reference (making a total of 4) for the conn being in the
1777 		 * classifier hash list. However the eager could have received
1778 		 * an RST subsequently and tcp_closei_local could have removed
1779 		 * the eager from the classifier hash list, hence we can't
1780 		 * assert that reference.
1781 		 */
1782 		ASSERT(econnp->conn_ref >= 3);
1783 
1784 		mutex_enter(&listener->tcp_eager_lock);
1785 		if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
1786 
1787 			tcp_t *tail;
1788 			tcp_t *tcp;
1789 			mblk_t *mp1;
1790 
1791 			tcp = listener->tcp_eager_prev_q0;
1792 			/*
1793 			 * listener->tcp_eager_prev_q0 points to the TAIL of the
1794 			 * deferred T_conn_ind queue. We need to get to the head
1795 			 * of the queue in order to send up T_conn_ind the same
1796 			 * order as how the 3WHS is completed.
1797 			 */
1798 			while (tcp != listener) {
1799 				if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0 &&
1800 				    !tcp->tcp_kssl_pending)
1801 					break;
1802 				else
1803 					tcp = tcp->tcp_eager_prev_q0;
1804 			}
1805 			/* None of the pending eagers can be sent up now */
1806 			if (tcp == listener)
1807 				goto no_more_eagers;
1808 
1809 			mp1 = tcp->tcp_conn.tcp_eager_conn_ind;
1810 			tcp->tcp_conn.tcp_eager_conn_ind = NULL;
1811 			/* Move from q0 to q */
1812 			ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1813 			listener->tcp_conn_req_cnt_q0--;
1814 			listener->tcp_conn_req_cnt_q++;
1815 			tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
1816 			    tcp->tcp_eager_prev_q0;
1817 			tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
1818 			    tcp->tcp_eager_next_q0;
1819 			tcp->tcp_eager_prev_q0 = NULL;
1820 			tcp->tcp_eager_next_q0 = NULL;
1821 			tcp->tcp_conn_def_q0 = B_FALSE;
1822 
1823 			/* Make sure the tcp isn't in the list of droppables */
1824 			ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
1825 			    tcp->tcp_eager_prev_drop_q0 == NULL);
1826 
1827 			/*
1828 			 * Insert at end of the queue because sockfs sends
1829 			 * down T_CONN_RES in chronological order. Leaving
1830 			 * the older conn indications at front of the queue
1831 			 * helps reducing search time.
1832 			 */
1833 			tail = listener->tcp_eager_last_q;
1834 			if (tail != NULL) {
1835 				tail->tcp_eager_next_q = tcp;
1836 			} else {
1837 				listener->tcp_eager_next_q = tcp;
1838 			}
1839 			listener->tcp_eager_last_q = tcp;
1840 			tcp->tcp_eager_next_q = NULL;
1841 
1842 			/* Need to get inside the listener perimeter */
1843 			CONN_INC_REF(listener->tcp_connp);
1844 			SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1,
1845 			    tcp_send_pending, listener->tcp_connp, NULL,
1846 			    SQ_FILL, SQTAG_TCP_SEND_PENDING);
1847 		}
1848 no_more_eagers:
1849 		tcp_eager_unlink(eager);
1850 		mutex_exit(&listener->tcp_eager_lock);
1851 
1852 		/*
1853 		 * At this point, the eager is detached from the listener
1854 		 * but we still have an extra refs on eager (apart from the
1855 		 * usual tcp references). The ref was placed in tcp_input_data
1856 		 * before sending the conn_ind in tcp_send_conn_ind.
1857 		 * The ref will be dropped in tcp_accept_finish().
1858 		 */
1859 		SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish,
1860 		    econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0);
1861 
1862 		/*
1863 		 * Send the new local address also up to sockfs. There
1864 		 * should already be enough space in the mp that came
1865 		 * down from soaccept().
1866 		 */
1867 		if (econnp->conn_family == AF_INET) {
1868 			sin_t *sin;
1869 
1870 			ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
1871 			    (sizeof (struct T_ok_ack) + sizeof (sin_t)));
1872 			sin = (sin_t *)mp->b_wptr;
1873 			mp->b_wptr += sizeof (sin_t);
1874 			sin->sin_family = AF_INET;
1875 			sin->sin_port = econnp->conn_lport;
1876 			sin->sin_addr.s_addr = econnp->conn_laddr_v4;
1877 		} else {
1878 			sin6_t *sin6;
1879 
1880 			ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
1881 			    sizeof (struct T_ok_ack) + sizeof (sin6_t));
1882 			sin6 = (sin6_t *)mp->b_wptr;
1883 			mp->b_wptr += sizeof (sin6_t);
1884 			sin6->sin6_family = AF_INET6;
1885 			sin6->sin6_port = econnp->conn_lport;
1886 			sin6->sin6_addr = econnp->conn_laddr_v6;
1887 			if (econnp->conn_ipversion == IPV4_VERSION)
1888 				sin6->sin6_flowinfo = 0;
1889 			else
1890 				sin6->sin6_flowinfo = econnp->conn_flowinfo;
1891 			if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
1892 			    (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
1893 				sin6->sin6_scope_id =
1894 				    econnp->conn_ixa->ixa_scopeid;
1895 			} else {
1896 				sin6->sin6_scope_id = 0;
1897 			}
1898 			sin6->__sin6_src_id = 0;
1899 		}
1900 
1901 		putnext(rq, mp);
1902 		return;
1903 	default:
1904 		mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0);
1905 		if (mp != NULL)
1906 			putnext(rq, mp);
1907 		return;
1908 	}
1909 }
1910 
1911 /*
1912  * The function called through squeue to get behind listener's perimeter to
1913  * send a deferred conn_ind.
1914  */
1915 /* ARGSUSED */
1916 void
1917 tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1918 {
1919 	conn_t	*lconnp = (conn_t *)arg;
1920 	tcp_t *listener = lconnp->conn_tcp;
1921 	struct T_conn_ind *conn_ind;
1922 	tcp_t *tcp;
1923 
1924 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1925 	bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
1926 	    conn_ind->OPT_length);
1927 
1928 	if (listener->tcp_state != TCPS_LISTEN) {
1929 		/*
1930 		 * If listener has closed, it would have caused a
1931 		 * a cleanup/blowoff to happen for the eager, so
1932 		 * we don't need to do anything more.
1933 		 */
1934 		freemsg(mp);
1935 		return;
1936 	}
1937 
1938 	putnext(lconnp->conn_rq, mp);
1939 }
1940 
1941 /*
1942  * Sends the T_CONN_IND to the listener. The caller calls this
1943  * functions via squeue to get inside the listener's perimeter
1944  * once the 3 way hand shake is done a T_CONN_IND needs to be
1945  * sent. As an optimization, the caller can call this directly
1946  * if listener's perimeter is same as eager's.
1947  */
1948 /* ARGSUSED */
1949 void
1950 tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2)
1951 {
1952 	conn_t			*lconnp = (conn_t *)arg;
1953 	tcp_t			*listener = lconnp->conn_tcp;
1954 	tcp_t			*tcp;
1955 	struct T_conn_ind	*conn_ind;
1956 	ipaddr_t 		*addr_cache;
1957 	boolean_t		need_send_conn_ind = B_FALSE;
1958 	tcp_stack_t		*tcps = listener->tcp_tcps;
1959 
1960 	/* retrieve the eager */
1961 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1962 	ASSERT(conn_ind->OPT_offset != 0 &&
1963 	    conn_ind->OPT_length == sizeof (intptr_t));
1964 	bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
1965 	    conn_ind->OPT_length);
1966 
1967 	/*
1968 	 * TLI/XTI applications will get confused by
1969 	 * sending eager as an option since it violates
1970 	 * the option semantics. So remove the eager as
1971 	 * option since TLI/XTI app doesn't need it anyway.
1972 	 */
1973 	if (!TCP_IS_SOCKET(listener)) {
1974 		conn_ind->OPT_length = 0;
1975 		conn_ind->OPT_offset = 0;
1976 	}
1977 	if (listener->tcp_state != TCPS_LISTEN) {
1978 		/*
1979 		 * If listener has closed, it would have caused a
1980 		 * a cleanup/blowoff to happen for the eager. We
1981 		 * just need to return.
1982 		 */
1983 		freemsg(mp);
1984 		return;
1985 	}
1986 
1987 
1988 	/*
1989 	 * if the conn_req_q is full defer passing up the
1990 	 * T_CONN_IND until space is availabe after t_accept()
1991 	 * processing
1992 	 */
1993 	mutex_enter(&listener->tcp_eager_lock);
1994 
1995 	/*
1996 	 * Take the eager out, if it is in the list of droppable eagers
1997 	 * as we are here because the 3W handshake is over.
1998 	 */
1999 	MAKE_UNDROPPABLE(tcp);
2000 
2001 	if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) {
2002 		tcp_t *tail;
2003 
2004 		/*
2005 		 * The eager already has an extra ref put in tcp_input_data
2006 		 * so that it stays till accept comes back even though it
2007 		 * might get into TCPS_CLOSED as a result of a TH_RST etc.
2008 		 */
2009 		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
2010 		listener->tcp_conn_req_cnt_q0--;
2011 		listener->tcp_conn_req_cnt_q++;
2012 
2013 		/* Move from SYN_RCVD to ESTABLISHED list  */
2014 		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
2015 		    tcp->tcp_eager_prev_q0;
2016 		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
2017 		    tcp->tcp_eager_next_q0;
2018 		tcp->tcp_eager_prev_q0 = NULL;
2019 		tcp->tcp_eager_next_q0 = NULL;
2020 
2021 		/*
2022 		 * Insert at end of the queue because sockfs
2023 		 * sends down T_CONN_RES in chronological
2024 		 * order. Leaving the older conn indications
2025 		 * at front of the queue helps reducing search
2026 		 * time.
2027 		 */
2028 		tail = listener->tcp_eager_last_q;
2029 		if (tail != NULL)
2030 			tail->tcp_eager_next_q = tcp;
2031 		else
2032 			listener->tcp_eager_next_q = tcp;
2033 		listener->tcp_eager_last_q = tcp;
2034 		tcp->tcp_eager_next_q = NULL;
2035 		/*
2036 		 * Delay sending up the T_conn_ind until we are
2037 		 * done with the eager. Once we have have sent up
2038 		 * the T_conn_ind, the accept can potentially complete
2039 		 * any time and release the refhold we have on the eager.
2040 		 */
2041 		need_send_conn_ind = B_TRUE;
2042 	} else {
2043 		/*
2044 		 * Defer connection on q0 and set deferred
2045 		 * connection bit true
2046 		 */
2047 		tcp->tcp_conn_def_q0 = B_TRUE;
2048 
2049 		/* take tcp out of q0 ... */
2050 		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
2051 		    tcp->tcp_eager_next_q0;
2052 		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
2053 		    tcp->tcp_eager_prev_q0;
2054 
2055 		/* ... and place it at the end of q0 */
2056 		tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0;
2057 		tcp->tcp_eager_next_q0 = listener;
2058 		listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp;
2059 		listener->tcp_eager_prev_q0 = tcp;
2060 		tcp->tcp_conn.tcp_eager_conn_ind = mp;
2061 	}
2062 
2063 	/* we have timed out before */
2064 	if (tcp->tcp_syn_rcvd_timeout != 0) {
2065 		tcp->tcp_syn_rcvd_timeout = 0;
2066 		listener->tcp_syn_rcvd_timeout--;
2067 		if (listener->tcp_syn_defense &&
2068 		    listener->tcp_syn_rcvd_timeout <=
2069 		    (tcps->tcps_conn_req_max_q0 >> 5) &&
2070 		    10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
2071 		    listener->tcp_last_rcv_lbolt)) {
2072 			/*
2073 			 * Turn off the defense mode if we
2074 			 * believe the SYN attack is over.
2075 			 */
2076 			listener->tcp_syn_defense = B_FALSE;
2077 			if (listener->tcp_ip_addr_cache) {
2078 				kmem_free((void *)listener->tcp_ip_addr_cache,
2079 				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
2080 				listener->tcp_ip_addr_cache = NULL;
2081 			}
2082 		}
2083 	}
2084 	addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
2085 	if (addr_cache != NULL) {
2086 		/*
2087 		 * We have finished a 3-way handshake with this
2088 		 * remote host. This proves the IP addr is good.
2089 		 * Cache it!
2090 		 */
2091 		addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
2092 		    tcp->tcp_connp->conn_faddr_v4;
2093 	}
2094 	mutex_exit(&listener->tcp_eager_lock);
2095 	if (need_send_conn_ind)
2096 		putnext(lconnp->conn_rq, mp);
2097 }
2098