xref: /illumos-gate/usr/src/uts/common/inet/tcp/tcp_tpi.c (revision b1d7ec75953cd517f5b7c3d9cb427ff8ec5d7d07)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
/* This file contains all TCP TLI/TPI related functions */
28 
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/strsun.h>
32 #include <sys/strsubr.h>
33 #include <sys/stropts.h>
34 #include <sys/strlog.h>
35 #define	_SUN_TPI_VERSION 2
36 #include <sys/tihdr.h>
37 #include <sys/suntpi.h>
38 #include <sys/xti_inet.h>
39 #include <sys/squeue_impl.h>
40 #include <sys/squeue.h>
41 
42 #include <inet/common.h>
43 #include <inet/ip.h>
44 #include <inet/tcp.h>
45 #include <inet/tcp_impl.h>
46 #include <inet/proto_set.h>
47 
48 static void	tcp_accept_swap(tcp_t *, tcp_t *, tcp_t *);
49 static int	tcp_conprim_opt_process(tcp_t *, mblk_t *, int *, int *, int *);
50 static void	tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *);
51 
52 void
53 tcp_use_pure_tpi(tcp_t *tcp)
54 {
55 	conn_t		*connp = tcp->tcp_connp;
56 
57 #ifdef	_ILP32
58 	tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq;
59 #else
60 	tcp->tcp_acceptor_id = connp->conn_dev;
61 #endif
62 	/*
63 	 * Insert this socket into the acceptor hash.
64 	 * We might need it for T_CONN_RES message
65 	 */
66 	tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
67 
68 	tcp->tcp_issocket = B_FALSE;
69 	TCP_STAT(tcp->tcp_tcps, tcp_sock_fallback);
70 }
71 
72 /* Shorthand to generate and send TPI error acks to our client */
73 void
74 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error)
75 {
76 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
77 		putnext(tcp->tcp_connp->conn_rq, mp);
78 }
79 
80 /* Shorthand to generate and send TPI error acks to our client */
81 void
82 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
83     int t_error, int sys_error)
84 {
85 	struct T_error_ack	*teackp;
86 
87 	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
88 	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
89 		teackp = (struct T_error_ack *)mp->b_rptr;
90 		teackp->ERROR_prim = primitive;
91 		teackp->TLI_error = t_error;
92 		teackp->UNIX_error = sys_error;
93 		putnext(tcp->tcp_connp->conn_rq, mp);
94 	}
95 }
96 
97 /*
98  * TCP routine to get the values of options.
99  */
100 int
101 tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
102 {
103 	return (tcp_opt_get(Q_TO_CONN(q), level, name, ptr));
104 }
105 
106 /* ARGSUSED */
107 int
108 tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
109     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
110     void *thisdg_attrs, cred_t *cr)
111 {
112 	conn_t	*connp =  Q_TO_CONN(q);
113 
114 	return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp,
115 	    outlenp, outvalp, thisdg_attrs, cr));
116 }
117 
118 static int
119 tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp,
120     int *t_errorp, int *sys_errorp)
121 {
122 	int error;
123 	int is_absreq_failure;
124 	t_scalar_t *opt_lenp;
125 	t_scalar_t opt_offset;
126 	int prim_type;
127 	struct T_conn_req *tcreqp;
128 	struct T_conn_res *tcresp;
129 	cred_t *cr;
130 
131 	/*
132 	 * All Solaris components should pass a db_credp
133 	 * for this TPI message, hence we ASSERT.
134 	 * But in case there is some other M_PROTO that looks
135 	 * like a TPI message sent by some other kernel
136 	 * component, we check and return an error.
137 	 */
138 	cr = msg_getcred(mp, NULL);
139 	ASSERT(cr != NULL);
140 	if (cr == NULL)
141 		return (-1);
142 
143 	prim_type = ((union T_primitives *)mp->b_rptr)->type;
144 	ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES ||
145 	    prim_type == T_CONN_RES);
146 
147 	switch (prim_type) {
148 	case T_CONN_REQ:
149 		tcreqp = (struct T_conn_req *)mp->b_rptr;
150 		opt_offset = tcreqp->OPT_offset;
151 		opt_lenp = (t_scalar_t *)&tcreqp->OPT_length;
152 		break;
153 	case O_T_CONN_RES:
154 	case T_CONN_RES:
155 		tcresp = (struct T_conn_res *)mp->b_rptr;
156 		opt_offset = tcresp->OPT_offset;
157 		opt_lenp = (t_scalar_t *)&tcresp->OPT_length;
158 		break;
159 	}
160 
161 	*t_errorp = 0;
162 	*sys_errorp = 0;
163 	*do_disconnectp = 0;
164 
165 	error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp,
166 	    opt_offset, cr, &tcp_opt_obj,
167 	    NULL, &is_absreq_failure);
168 
169 	switch (error) {
170 	case  0:		/* no error */
171 		ASSERT(is_absreq_failure == 0);
172 		return (0);
173 	case ENOPROTOOPT:
174 		*t_errorp = TBADOPT;
175 		break;
176 	case EACCES:
177 		*t_errorp = TACCES;
178 		break;
179 	default:
180 		*t_errorp = TSYSERR; *sys_errorp = error;
181 		break;
182 	}
183 	if (is_absreq_failure != 0) {
184 		/*
185 		 * The connection request should get the local ack
186 		 * T_OK_ACK and then a T_DISCON_IND.
187 		 */
188 		*do_disconnectp = 1;
189 	}
190 	return (-1);
191 }
192 
193 void
194 tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
195 {
196 	int	error;
197 	conn_t	*connp = tcp->tcp_connp;
198 	struct sockaddr	*sa;
199 	mblk_t  *mp1;
200 	struct T_bind_req *tbr;
201 	int	backlog;
202 	socklen_t	len;
203 	sin_t	*sin;
204 	sin6_t	*sin6;
205 	cred_t		*cr;
206 
207 	/*
208 	 * All Solaris components should pass a db_credp
209 	 * for this TPI message, hence we ASSERT.
210 	 * But in case there is some other M_PROTO that looks
211 	 * like a TPI message sent by some other kernel
212 	 * component, we check and return an error.
213 	 */
214 	cr = msg_getcred(mp, NULL);
215 	ASSERT(cr != NULL);
216 	if (cr == NULL) {
217 		tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
218 		return;
219 	}
220 
221 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
222 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
223 		if (connp->conn_debug) {
224 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
225 			    "tcp_tpi_bind: bad req, len %u",
226 			    (uint_t)(mp->b_wptr - mp->b_rptr));
227 		}
228 		tcp_err_ack(tcp, mp, TPROTO, 0);
229 		return;
230 	}
231 	/* Make sure the largest address fits */
232 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
233 	if (mp1 == NULL) {
234 		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
235 		return;
236 	}
237 	mp = mp1;
238 	tbr = (struct T_bind_req *)mp->b_rptr;
239 
240 	backlog = tbr->CONIND_number;
241 	len = tbr->ADDR_length;
242 
243 	switch (len) {
244 	case 0:		/* request for a generic port */
245 		tbr->ADDR_offset = sizeof (struct T_bind_req);
246 		if (connp->conn_family == AF_INET) {
247 			tbr->ADDR_length = sizeof (sin_t);
248 			sin = (sin_t *)&tbr[1];
249 			*sin = sin_null;
250 			sin->sin_family = AF_INET;
251 			sa = (struct sockaddr *)sin;
252 			len = sizeof (sin_t);
253 			mp->b_wptr = (uchar_t *)&sin[1];
254 		} else {
255 			ASSERT(connp->conn_family == AF_INET6);
256 			tbr->ADDR_length = sizeof (sin6_t);
257 			sin6 = (sin6_t *)&tbr[1];
258 			*sin6 = sin6_null;
259 			sin6->sin6_family = AF_INET6;
260 			sa = (struct sockaddr *)sin6;
261 			len = sizeof (sin6_t);
262 			mp->b_wptr = (uchar_t *)&sin6[1];
263 		}
264 		break;
265 
266 	case sizeof (sin_t):    /* Complete IPv4 address */
267 		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
268 		    sizeof (sin_t));
269 		break;
270 
271 	case sizeof (sin6_t): /* Complete IPv6 address */
272 		sa = (struct sockaddr *)mi_offset_param(mp,
273 		    tbr->ADDR_offset, sizeof (sin6_t));
274 		break;
275 
276 	default:
277 		if (connp->conn_debug) {
278 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
279 			    "tcp_tpi_bind: bad address length, %d",
280 			    tbr->ADDR_length);
281 		}
282 		tcp_err_ack(tcp, mp, TBADADDR, 0);
283 		return;
284 	}
285 
286 	if (backlog > 0) {
287 		error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp),
288 		    tbr->PRIM_type != O_T_BIND_REQ);
289 	} else {
290 		error = tcp_do_bind(connp, sa, len, DB_CRED(mp),
291 		    tbr->PRIM_type != O_T_BIND_REQ);
292 	}
293 done:
294 	if (error > 0) {
295 		tcp_err_ack(tcp, mp, TSYSERR, error);
296 	} else if (error < 0) {
297 		tcp_err_ack(tcp, mp, -error, 0);
298 	} else {
299 		/*
300 		 * Update port information as sockfs/tpi needs it for checking
301 		 */
302 		if (connp->conn_family == AF_INET) {
303 			sin = (sin_t *)sa;
304 			sin->sin_port = connp->conn_lport;
305 		} else {
306 			sin6 = (sin6_t *)sa;
307 			sin6->sin6_port = connp->conn_lport;
308 		}
309 		mp->b_datap->db_type = M_PCPROTO;
310 		tbr->PRIM_type = T_BIND_ACK;
311 		putnext(connp->conn_rq, mp);
312 	}
313 }
314 
315 /* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */
316 void
317 tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp)
318 {
319 	conn_t *connp = tcp->tcp_connp;
320 	int error;
321 
322 	error = tcp_do_unbind(connp);
323 	if (error > 0) {
324 		tcp_err_ack(tcp, mp, TSYSERR, error);
325 	} else if (error < 0) {
326 		tcp_err_ack(tcp, mp, -error, 0);
327 	} else {
328 		/* Send M_FLUSH according to TPI */
329 		(void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW);
330 
331 		mp = mi_tpi_ok_ack_alloc(mp);
332 		if (mp != NULL)
333 			putnext(connp->conn_rq, mp);
334 	}
335 }
336 
337 int
338 tcp_tpi_close(queue_t *q, int flags)
339 {
340 	conn_t		*connp;
341 
342 	ASSERT(WR(q)->q_next == NULL);
343 
344 	if (flags & SO_FALLBACK) {
345 		/*
346 		 * stream is being closed while in fallback
347 		 * simply free the resources that were allocated
348 		 */
349 		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
350 		qprocsoff(q);
351 		goto done;
352 	}
353 
354 	connp = Q_TO_CONN(q);
355 	/*
356 	 * We are being closed as /dev/tcp or /dev/tcp6.
357 	 */
358 	tcp_close_common(connp, flags);
359 
360 	qprocsoff(q);
361 	inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
362 
363 	/*
364 	 * Drop IP's reference on the conn. This is the last reference
365 	 * on the connp if the state was less than established. If the
366 	 * connection has gone into timewait state, then we will have
367 	 * one ref for the TCP and one more ref (total of two) for the
368 	 * classifier connected hash list (a timewait connections stays
369 	 * in connected hash till closed).
370 	 *
371 	 * We can't assert the references because there might be other
372 	 * transient reference places because of some walkers or queued
373 	 * packets in squeue for the timewait state.
374 	 */
375 	CONN_DEC_REF(connp);
376 done:
377 	q->q_ptr = WR(q)->q_ptr = NULL;
378 	return (0);
379 }
380 
381 int
382 tcp_tpi_close_accept(queue_t *q)
383 {
384 	vmem_t	*minor_arena;
385 	dev_t	conn_dev;
386 	extern struct qinit tcp_acceptor_winit;
387 
388 	ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit);
389 
390 	/*
391 	 * We had opened an acceptor STREAM for sockfs which is
392 	 * now being closed due to some error.
393 	 */
394 	qprocsoff(q);
395 
396 	minor_arena = (vmem_t *)WR(q)->q_ptr;
397 	conn_dev = (dev_t)RD(q)->q_ptr;
398 	ASSERT(minor_arena != NULL);
399 	ASSERT(conn_dev != 0);
400 	inet_minor_free(minor_arena, conn_dev);
401 	q->q_ptr = WR(q)->q_ptr = NULL;
402 	return (0);
403 }
404 
405 /*
406  * Put a connection confirmation message upstream built from the
407  * address/flowid information with the conn and iph. Report our success or
408  * failure.
409  */
410 boolean_t
411 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp,
412     mblk_t **defermp, ip_recv_attr_t *ira)
413 {
414 	sin_t	sin;
415 	sin6_t	sin6;
416 	mblk_t	*mp;
417 	char	*optp = NULL;
418 	int	optlen = 0;
419 	conn_t	*connp = tcp->tcp_connp;
420 
421 	if (defermp != NULL)
422 		*defermp = NULL;
423 
424 	if (tcp->tcp_conn.tcp_opts_conn_req != NULL) {
425 		/*
426 		 * Return in T_CONN_CON results of option negotiation through
427 		 * the T_CONN_REQ. Note: If there is an real end-to-end option
428 		 * negotiation, then what is received from remote end needs
429 		 * to be taken into account but there is no such thing (yet?)
430 		 * in our TCP/IP.
431 		 * Note: We do not use mi_offset_param() here as
432 		 * tcp_opts_conn_req contents do not directly come from
433 		 * an application and are either generated in kernel or
434 		 * from user input that was already verified.
435 		 */
436 		mp = tcp->tcp_conn.tcp_opts_conn_req;
437 		optp = (char *)(mp->b_rptr +
438 		    ((struct T_conn_req *)mp->b_rptr)->OPT_offset);
439 		optlen = (int)
440 		    ((struct T_conn_req *)mp->b_rptr)->OPT_length;
441 	}
442 
443 	if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) {
444 
445 		/* packet is IPv4 */
446 		if (connp->conn_family == AF_INET) {
447 			sin = sin_null;
448 			sin.sin_addr.s_addr = connp->conn_faddr_v4;
449 			sin.sin_port = connp->conn_fport;
450 			sin.sin_family = AF_INET;
451 			mp = mi_tpi_conn_con(NULL, (char *)&sin,
452 			    (int)sizeof (sin_t), optp, optlen);
453 		} else {
454 			sin6 = sin6_null;
455 			sin6.sin6_addr = connp->conn_faddr_v6;
456 			sin6.sin6_port = connp->conn_fport;
457 			sin6.sin6_family = AF_INET6;
458 			mp = mi_tpi_conn_con(NULL, (char *)&sin6,
459 			    (int)sizeof (sin6_t), optp, optlen);
460 
461 		}
462 	} else {
463 		ip6_t	*ip6h = (ip6_t *)iphdr;
464 
465 		ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION);
466 		ASSERT(connp->conn_family == AF_INET6);
467 		sin6 = sin6_null;
468 		sin6.sin6_addr = connp->conn_faddr_v6;
469 		sin6.sin6_port = connp->conn_fport;
470 		sin6.sin6_family = AF_INET6;
471 		sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
472 		mp = mi_tpi_conn_con(NULL, (char *)&sin6,
473 		    (int)sizeof (sin6_t), optp, optlen);
474 	}
475 
476 	if (!mp)
477 		return (B_FALSE);
478 
479 	mblk_copycred(mp, idmp);
480 
481 	if (defermp == NULL) {
482 		conn_t *connp = tcp->tcp_connp;
483 		if (IPCL_IS_NONSTR(connp)) {
484 			(*connp->conn_upcalls->su_connected)
485 			    (connp->conn_upper_handle, tcp->tcp_connid,
486 			    ira->ira_cred, ira->ira_cpid);
487 			freemsg(mp);
488 		} else {
489 			if (ira->ira_cred != NULL) {
490 				/* So that getpeerucred works for TPI sockfs */
491 				mblk_setcred(mp, ira->ira_cred, ira->ira_cpid);
492 			}
493 			putnext(connp->conn_rq, mp);
494 		}
495 	} else {
496 		*defermp = mp;
497 	}
498 
499 	if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
500 		tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
501 	return (B_TRUE);
502 }
503 
504 /*
505  * Successful connect request processing begins when our client passes
506  * a T_CONN_REQ message into tcp_wput(), which performs function calls into
507  * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream).
508  *
509  * After various error checks are completed, tcp_tpi_connect() lays
510  * the target address and port into the composite header template.
511  * Then we ask IP for information, including a source address if we didn't
512  * already have one. Finally we prepare to send the SYN packet, and then
513  * send up the T_OK_ACK reply message.
514  */
515 void
516 tcp_tpi_connect(tcp_t *tcp, mblk_t *mp)
517 {
518 	sin_t		*sin;
519 	struct T_conn_req	*tcr;
520 	struct sockaddr	*sa;
521 	socklen_t	len;
522 	int		error;
523 	cred_t		*cr;
524 	pid_t		cpid;
525 	conn_t		*connp = tcp->tcp_connp;
526 	queue_t		*q = connp->conn_wq;
527 
528 	/*
529 	 * All Solaris components should pass a db_credp
530 	 * for this TPI message, hence we ASSERT.
531 	 * But in case there is some other M_PROTO that looks
532 	 * like a TPI message sent by some other kernel
533 	 * component, we check and return an error.
534 	 */
535 	cr = msg_getcred(mp, &cpid);
536 	ASSERT(cr != NULL);
537 	if (cr == NULL) {
538 		tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
539 		return;
540 	}
541 
542 	tcr = (struct T_conn_req *)mp->b_rptr;
543 
544 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
545 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
546 		tcp_err_ack(tcp, mp, TPROTO, 0);
547 		return;
548 	}
549 
550 	/*
551 	 * Pre-allocate the T_ordrel_ind mblk so that at close time, we
552 	 * will always have that to send up.  Otherwise, we need to do
553 	 * special handling in case the allocation fails at that time.
554 	 * If the end point is TPI, the tcp_t can be reused and the
555 	 * tcp_ordrel_mp may be allocated already.
556 	 */
557 	if (tcp->tcp_ordrel_mp == NULL) {
558 		if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) {
559 			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
560 			return;
561 		}
562 	}
563 
564 	/*
565 	 * Determine packet type based on type of address passed in
566 	 * the request should contain an IPv4 or IPv6 address.
567 	 * Make sure that address family matches the type of
568 	 * family of the address passed down.
569 	 */
570 	switch (tcr->DEST_length) {
571 	default:
572 		tcp_err_ack(tcp, mp, TBADADDR, 0);
573 		return;
574 
575 	case (sizeof (sin_t) - sizeof (sin->sin_zero)): {
576 		/*
577 		 * XXX: The check for valid DEST_length was not there
578 		 * in earlier releases and some buggy
579 		 * TLI apps (e.g Sybase) got away with not feeding
580 		 * in sin_zero part of address.
581 		 * We allow that bug to keep those buggy apps humming.
582 		 * Test suites require the check on DEST_length.
583 		 * We construct a new mblk with valid DEST_length
584 		 * free the original so the rest of the code does
585 		 * not have to keep track of this special shorter
586 		 * length address case.
587 		 */
588 		mblk_t *nmp;
589 		struct T_conn_req *ntcr;
590 		sin_t *nsin;
591 
592 		nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) +
593 		    tcr->OPT_length, BPRI_HI);
594 		if (nmp == NULL) {
595 			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
596 			return;
597 		}
598 		ntcr = (struct T_conn_req *)nmp->b_rptr;
599 		bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */
600 		ntcr->PRIM_type = T_CONN_REQ;
601 		ntcr->DEST_length = sizeof (sin_t);
602 		ntcr->DEST_offset = sizeof (struct T_conn_req);
603 
604 		nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset);
605 		*nsin = sin_null;
606 		/* Get pointer to shorter address to copy from original mp */
607 		sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
608 		    tcr->DEST_length); /* extract DEST_length worth of sin_t */
609 		if (sin == NULL || !OK_32PTR((char *)sin)) {
610 			freemsg(nmp);
611 			tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
612 			return;
613 		}
614 		nsin->sin_family = sin->sin_family;
615 		nsin->sin_port = sin->sin_port;
616 		nsin->sin_addr = sin->sin_addr;
617 		/* Note:nsin->sin_zero zero-fill with sin_null assign above */
618 		nmp->b_wptr = (uchar_t *)&nsin[1];
619 		if (tcr->OPT_length != 0) {
620 			ntcr->OPT_length = tcr->OPT_length;
621 			ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr;
622 			bcopy((uchar_t *)tcr + tcr->OPT_offset,
623 			    (uchar_t *)ntcr + ntcr->OPT_offset,
624 			    tcr->OPT_length);
625 			nmp->b_wptr += tcr->OPT_length;
626 		}
627 		freemsg(mp);	/* original mp freed */
628 		mp = nmp;	/* re-initialize original variables */
629 		tcr = ntcr;
630 	}
631 	/* FALLTHRU */
632 
633 	case sizeof (sin_t):
634 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
635 		    sizeof (sin_t));
636 		len = sizeof (sin_t);
637 		break;
638 
639 	case sizeof (sin6_t):
640 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
641 		    sizeof (sin6_t));
642 		len = sizeof (sin6_t);
643 		break;
644 	}
645 
646 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
647 	if (error != 0) {
648 		tcp_err_ack(tcp, mp, TSYSERR, error);
649 		return;
650 	}
651 
652 	/*
653 	 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we
654 	 * should key on their sequence number and cut them loose.
655 	 */
656 
657 	/*
658 	 * If options passed in, feed it for verification and handling
659 	 */
660 	if (tcr->OPT_length != 0) {
661 		mblk_t	*ok_mp;
662 		mblk_t	*discon_mp;
663 		mblk_t  *conn_opts_mp;
664 		int t_error, sys_error, do_disconnect;
665 
666 		conn_opts_mp = NULL;
667 
668 		if (tcp_conprim_opt_process(tcp, mp,
669 		    &do_disconnect, &t_error, &sys_error) < 0) {
670 			if (do_disconnect) {
671 				ASSERT(t_error == 0 && sys_error == 0);
672 				discon_mp = mi_tpi_discon_ind(NULL,
673 				    ECONNREFUSED, 0);
674 				if (!discon_mp) {
675 					tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
676 					    TSYSERR, ENOMEM);
677 					return;
678 				}
679 				ok_mp = mi_tpi_ok_ack_alloc(mp);
680 				if (!ok_mp) {
681 					tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
682 					    TSYSERR, ENOMEM);
683 					return;
684 				}
685 				qreply(q, ok_mp);
686 				qreply(q, discon_mp); /* no flush! */
687 			} else {
688 				ASSERT(t_error != 0);
689 				tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error,
690 				    sys_error);
691 			}
692 			return;
693 		}
694 		/*
695 		 * Success in setting options, the mp option buffer represented
696 		 * by OPT_length/offset has been potentially modified and
697 		 * contains results of option processing. We copy it in
698 		 * another mp to save it for potentially influencing returning
699 		 * it in T_CONN_CONN.
700 		 */
701 		if (tcr->OPT_length != 0) { /* there are resulting options */
702 			conn_opts_mp = copyb(mp);
703 			if (!conn_opts_mp) {
704 				tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
705 				    TSYSERR, ENOMEM);
706 				return;
707 			}
708 			ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL);
709 			tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp;
710 			/*
711 			 * Note:
712 			 * These resulting option negotiation can include any
713 			 * end-to-end negotiation options but there no such
714 			 * thing (yet?) in our TCP/IP.
715 			 */
716 		}
717 	}
718 
719 	/* call the non-TPI version */
720 	error = tcp_do_connect(tcp->tcp_connp, sa, len, cr, cpid);
721 	if (error < 0) {
722 		mp = mi_tpi_err_ack_alloc(mp, -error, 0);
723 	} else if (error > 0) {
724 		mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
725 	} else {
726 		mp = mi_tpi_ok_ack_alloc(mp);
727 	}
728 
729 	/*
730 	 * Note: Code below is the "failure" case
731 	 */
732 	/* return error ack and blow away saved option results if any */
733 connect_failed:
734 	if (mp != NULL)
735 		putnext(connp->conn_rq, mp);
736 	else {
737 		tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
738 		    TSYSERR, ENOMEM);
739 	}
740 }
741 
742 /* Return the TPI/TLI equivalent of our current tcp_state */
743 static int
744 tcp_tpistate(tcp_t *tcp)
745 {
746 	switch (tcp->tcp_state) {
747 	case TCPS_IDLE:
748 		return (TS_UNBND);
749 	case TCPS_LISTEN:
750 		/*
751 		 * Return whether there are outstanding T_CONN_IND waiting
752 		 * for the matching T_CONN_RES. Therefore don't count q0.
753 		 */
754 		if (tcp->tcp_conn_req_cnt_q > 0)
755 			return (TS_WRES_CIND);
756 		else
757 			return (TS_IDLE);
758 	case TCPS_BOUND:
759 		return (TS_IDLE);
760 	case TCPS_SYN_SENT:
761 		return (TS_WCON_CREQ);
762 	case TCPS_SYN_RCVD:
763 		/*
764 		 * Note: assumption: this has to the active open SYN_RCVD.
765 		 * The passive instance is detached in SYN_RCVD stage of
766 		 * incoming connection processing so we cannot get request
767 		 * for T_info_ack on it.
768 		 */
769 		return (TS_WACK_CRES);
770 	case TCPS_ESTABLISHED:
771 		return (TS_DATA_XFER);
772 	case TCPS_CLOSE_WAIT:
773 		return (TS_WREQ_ORDREL);
774 	case TCPS_FIN_WAIT_1:
775 		return (TS_WIND_ORDREL);
776 	case TCPS_FIN_WAIT_2:
777 		return (TS_WIND_ORDREL);
778 
779 	case TCPS_CLOSING:
780 	case TCPS_LAST_ACK:
781 	case TCPS_TIME_WAIT:
782 	case TCPS_CLOSED:
783 		/*
784 		 * Following TS_WACK_DREQ7 is a rendition of "not
785 		 * yet TS_IDLE" TPI state. There is no best match to any
786 		 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we
787 		 * choose a value chosen that will map to TLI/XTI level
788 		 * state of TSTATECHNG (state is process of changing) which
789 		 * captures what this dummy state represents.
790 		 */
791 		return (TS_WACK_DREQ7);
792 	default:
793 		cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s",
794 		    tcp->tcp_state, tcp_display(tcp, NULL,
795 		    DISP_PORT_ONLY));
796 		return (TS_UNBND);
797 	}
798 }
799 
800 static void
801 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
802 {
803 	tcp_stack_t	*tcps = tcp->tcp_tcps;
804 	conn_t		*connp = tcp->tcp_connp;
805 	extern struct T_info_ack tcp_g_t_info_ack;
806 	extern struct T_info_ack tcp_g_t_info_ack_v6;
807 
808 	if (connp->conn_family == AF_INET6)
809 		*tia = tcp_g_t_info_ack_v6;
810 	else
811 		*tia = tcp_g_t_info_ack;
812 	tia->CURRENT_state = tcp_tpistate(tcp);
813 	tia->OPT_size = tcp_max_optsize;
814 	if (tcp->tcp_mss == 0) {
815 		/* Not yet set - tcp_open does not set mss */
816 		if (connp->conn_ipversion == IPV4_VERSION)
817 			tia->TIDU_size = tcps->tcps_mss_def_ipv4;
818 		else
819 			tia->TIDU_size = tcps->tcps_mss_def_ipv6;
820 	} else {
821 		tia->TIDU_size = tcp->tcp_mss;
822 	}
823 	/* TODO: Default ETSDU is 1.  Is that correct for tcp? */
824 }
825 
826 static void
827 tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap,
828     t_uscalar_t cap_bits1)
829 {
830 	tcap->CAP_bits1 = 0;
831 
832 	if (cap_bits1 & TC1_INFO) {
833 		tcp_copy_info(&tcap->INFO_ack, tcp);
834 		tcap->CAP_bits1 |= TC1_INFO;
835 	}
836 
837 	if (cap_bits1 & TC1_ACCEPTOR_ID) {
838 		tcap->ACCEPTOR_id = tcp->tcp_acceptor_id;
839 		tcap->CAP_bits1 |= TC1_ACCEPTOR_ID;
840 	}
841 
842 }
843 
844 /*
845  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
846  * tcp_wput.  Much of the T_CAPABILITY_ACK information is copied from
847  * tcp_g_t_info_ack.  The current state of the stream is copied from
848  * tcp_state.
849  */
850 void
851 tcp_capability_req(tcp_t *tcp, mblk_t *mp)
852 {
853 	t_uscalar_t		cap_bits1;
854 	struct T_capability_ack	*tcap;
855 
856 	if (MBLKL(mp) < sizeof (struct T_capability_req)) {
857 		freemsg(mp);
858 		return;
859 	}
860 
861 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
862 
863 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
864 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
865 	if (mp == NULL)
866 		return;
867 
868 	tcap = (struct T_capability_ack *)mp->b_rptr;
869 	tcp_do_capability_ack(tcp, tcap, cap_bits1);
870 
871 	putnext(tcp->tcp_connp->conn_rq, mp);
872 }
873 
874 /*
875  * This routine responds to T_INFO_REQ messages.  It is called by tcp_wput.
876  * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack.
877  * The current state of the stream is copied from tcp_state.
878  */
879 void
880 tcp_info_req(tcp_t *tcp, mblk_t *mp)
881 {
882 	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
883 	    T_INFO_ACK);
884 	if (!mp) {
885 		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
886 		return;
887 	}
888 	tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp);
889 	putnext(tcp->tcp_connp->conn_rq, mp);
890 }
891 
892 /* Respond to the TPI addr request */
893 void
894 tcp_addr_req(tcp_t *tcp, mblk_t *mp)
895 {
896 	struct sockaddr *sa;
897 	mblk_t	*ackmp;
898 	struct T_addr_ack *taa;
899 	conn_t	*connp = tcp->tcp_connp;
900 	uint_t	addrlen;
901 
902 	/* Make it large enough for worst case */
903 	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
904 	    2 * sizeof (sin6_t), 1);
905 	if (ackmp == NULL) {
906 		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
907 		return;
908 	}
909 
910 	taa = (struct T_addr_ack *)ackmp->b_rptr;
911 
912 	bzero(taa, sizeof (struct T_addr_ack));
913 	ackmp->b_wptr = (uchar_t *)&taa[1];
914 
915 	taa->PRIM_type = T_ADDR_ACK;
916 	ackmp->b_datap->db_type = M_PCPROTO;
917 
918 	if (connp->conn_family == AF_INET)
919 		addrlen = sizeof (sin_t);
920 	else
921 		addrlen = sizeof (sin6_t);
922 
923 	/*
924 	 * Note: Following code assumes 32 bit alignment of basic
925 	 * data structures like sin_t and struct T_addr_ack.
926 	 */
927 	if (tcp->tcp_state >= TCPS_BOUND) {
928 		/*
929 		 * Fill in local address first
930 		 */
931 		taa->LOCADDR_offset = sizeof (*taa);
932 		taa->LOCADDR_length = addrlen;
933 		sa = (struct sockaddr *)&taa[1];
934 		(void) conn_getsockname(connp, sa, &addrlen);
935 		ackmp->b_wptr += addrlen;
936 	}
937 	if (tcp->tcp_state >= TCPS_SYN_RCVD) {
938 		/*
939 		 * Fill in Remote address
940 		 */
941 		taa->REMADDR_length = addrlen;
942 		/* assumed 32-bit alignment */
943 		taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
944 		sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
945 		(void) conn_getpeername(connp, sa, &addrlen);
946 		ackmp->b_wptr += addrlen;
947 	}
948 	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
949 	putnext(tcp->tcp_connp->conn_rq, ackmp);
950 }
951 
952 /*
953  * tcp_fallback
954  *
955  * A direct socket is falling back to using STREAMS. The queue
956  * that is being passed down was created using tcp_open() with
957  * the SO_FALLBACK flag set. As a result, the queue is not
958  * associated with a conn, and the q_ptrs instead contain the
959  * dev and minor area that should be used.
960  *
961  * The 'issocket' flag indicates whether the FireEngine
962  * optimizations should be used. The common case would be that
963  * optimizations are enabled, and they might be subsequently
964  * disabled using the _SIOCSOCKFALLBACK ioctl.
965  */
966 
967 /*
968  * An active connection is falling back to TPI. Gather all the information
969  * required by the STREAM head and TPI sonode and send it up.
970  */
971 void
972 tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
973     boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb)
974 {
975 	conn_t			*connp = tcp->tcp_connp;
976 	struct stroptions	*stropt;
977 	struct T_capability_ack tca;
978 	struct sockaddr_in6	laddr, faddr;
979 	socklen_t 		laddrlen, faddrlen;
980 	short			opts;
981 	int			error;
982 	mblk_t			*mp;
983 
984 	connp->conn_dev = (dev_t)RD(q)->q_ptr;
985 	connp->conn_minor_arena = WR(q)->q_ptr;
986 
987 	RD(q)->q_ptr = WR(q)->q_ptr = connp;
988 
989 	connp->conn_rq = RD(q);
990 	connp->conn_wq = WR(q);
991 
992 	WR(q)->q_qinfo = &tcp_sock_winit;
993 
994 	if (!issocket)
995 		tcp_use_pure_tpi(tcp);
996 
997 	/*
998 	 * free the helper stream
999 	 */
1000 	ip_free_helper_stream(connp);
1001 
1002 	/*
1003 	 * Notify the STREAM head about options
1004 	 */
1005 	DB_TYPE(stropt_mp) = M_SETOPTS;
1006 	stropt = (struct stroptions *)stropt_mp->b_rptr;
1007 	stropt_mp->b_wptr += sizeof (struct stroptions);
1008 	stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
1009 
1010 	stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
1011 	    tcp->tcp_tcps->tcps_wroff_xtra);
1012 	if (tcp->tcp_snd_sack_ok)
1013 		stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
1014 	stropt->so_hiwat = connp->conn_rcvbuf;
1015 	stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
1016 
1017 	putnext(RD(q), stropt_mp);
1018 
1019 	/*
1020 	 * Collect the information needed to sync with the sonode
1021 	 */
1022 	tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
1023 
1024 	laddrlen = faddrlen = sizeof (sin6_t);
1025 	(void) tcp_getsockname((sock_lower_handle_t)connp,
1026 	    (struct sockaddr *)&laddr, &laddrlen, CRED());
1027 	error = tcp_getpeername((sock_lower_handle_t)connp,
1028 	    (struct sockaddr *)&faddr, &faddrlen, CRED());
1029 	if (error != 0)
1030 		faddrlen = 0;
1031 
1032 	opts = 0;
1033 	if (connp->conn_oobinline)
1034 		opts |= SO_OOBINLINE;
1035 	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
1036 		opts |= SO_DONTROUTE;
1037 
1038 	/*
1039 	 * Notify the socket that the protocol is now quiescent,
1040 	 * and it's therefore safe to move data from the socket
1041 	 * to the stream head.
1042 	 */
1043 	(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
1044 	    (struct sockaddr *)&laddr, laddrlen,
1045 	    (struct sockaddr *)&faddr, faddrlen, opts);
1046 
1047 	while ((mp = tcp->tcp_rcv_list) != NULL) {
1048 		tcp->tcp_rcv_list = mp->b_next;
1049 		mp->b_next = NULL;
1050 		/* We never do fallback for kernel RPC */
1051 		putnext(q, mp);
1052 	}
1053 	tcp->tcp_rcv_last_head = NULL;
1054 	tcp->tcp_rcv_last_tail = NULL;
1055 	tcp->tcp_rcv_cnt = 0;
1056 }
1057 
1058 /*
1059  * An eager is falling back to TPI. All we have to do is send
1060  * up a T_CONN_IND.
1061  */
1062 void
1063 tcp_fallback_eager(tcp_t *eager, boolean_t direct_sockfs)
1064 {
1065 	tcp_t *listener = eager->tcp_listener;
1066 	mblk_t *mp = eager->tcp_conn.tcp_eager_conn_ind;
1067 
1068 	ASSERT(listener != NULL);
1069 	ASSERT(mp != NULL);
1070 
1071 	eager->tcp_conn.tcp_eager_conn_ind = NULL;
1072 
1073 	/*
1074 	 * TLI/XTI applications will get confused by
1075 	 * sending eager as an option since it violates
1076 	 * the option semantics. So remove the eager as
1077 	 * option since TLI/XTI app doesn't need it anyway.
1078 	 */
1079 	if (!direct_sockfs) {
1080 		struct T_conn_ind *conn_ind;
1081 
1082 		conn_ind = (struct T_conn_ind *)mp->b_rptr;
1083 		conn_ind->OPT_length = 0;
1084 		conn_ind->OPT_offset = 0;
1085 	}
1086 
1087 	/*
1088 	 * Sockfs guarantees that the listener will not be closed
1089 	 * during fallback. So we can safely use the listener's queue.
1090 	 */
1091 	putnext(listener->tcp_connp->conn_rq, mp);
1092 }
1093 
1094 /*
1095  * Swap information between the eager and acceptor for a TLI/XTI client.
1096  * The sockfs accept is done on the acceptor stream and control goes
1097  * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not
1098  * called. In either case, both the eager and listener are in their own
1099  * perimeter (squeue) and the code has to deal with potential race.
1100  *
1101  * See the block comment on top of tcp_accept() and tcp_tli_accept().
1102  */
1103 static void
1104 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
1105 {
1106 	conn_t	*econnp, *aconnp;
1107 
1108 	ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq);
1109 	ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
1110 	ASSERT(!TCP_IS_SOCKET(acceptor));
1111 	ASSERT(!TCP_IS_SOCKET(eager));
1112 	ASSERT(!TCP_IS_SOCKET(listener));
1113 
1114 	/*
1115 	 * Trusted Extensions may need to use a security label that is
1116 	 * different from the acceptor's label on MLP and MAC-Exempt
1117 	 * sockets. If this is the case, the required security label
1118 	 * already exists in econnp->conn_ixa->ixa_tsl. Since we make the
1119 	 * acceptor stream refer to econnp we atomatically get that label.
1120 	 */
1121 
1122 	acceptor->tcp_detached = B_TRUE;
1123 	/*
1124 	 * To permit stream re-use by TLI/XTI, the eager needs a copy of
1125 	 * the acceptor id.
1126 	 */
1127 	eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;
1128 
1129 	/* remove eager from listen list... */
1130 	mutex_enter(&listener->tcp_eager_lock);
1131 	tcp_eager_unlink(eager);
1132 	ASSERT(eager->tcp_eager_next_q == NULL &&
1133 	    eager->tcp_eager_last_q == NULL);
1134 	ASSERT(eager->tcp_eager_next_q0 == NULL &&
1135 	    eager->tcp_eager_prev_q0 == NULL);
1136 	mutex_exit(&listener->tcp_eager_lock);
1137 
1138 	econnp = eager->tcp_connp;
1139 	aconnp = acceptor->tcp_connp;
1140 	econnp->conn_rq = aconnp->conn_rq;
1141 	econnp->conn_wq = aconnp->conn_wq;
1142 	econnp->conn_rq->q_ptr = econnp;
1143 	econnp->conn_wq->q_ptr = econnp;
1144 
1145 	/*
1146 	 * In the TLI/XTI loopback case, we are inside the listener's squeue,
1147 	 * which might be a different squeue from our peer TCP instance.
1148 	 * For TCP Fusion, the peer expects that whenever tcp_detached is
1149 	 * clear, our TCP queues point to the acceptor's queues.  Thus, use
1150 	 * membar_producer() to ensure that the assignments of conn_rq/conn_wq
1151 	 * above reach global visibility prior to the clearing of tcp_detached.
1152 	 */
1153 	membar_producer();
1154 	eager->tcp_detached = B_FALSE;
1155 
1156 	ASSERT(eager->tcp_ack_tid == 0);
1157 
1158 	econnp->conn_dev = aconnp->conn_dev;
1159 	econnp->conn_minor_arena = aconnp->conn_minor_arena;
1160 
1161 	ASSERT(econnp->conn_minor_arena != NULL);
1162 	if (econnp->conn_cred != NULL)
1163 		crfree(econnp->conn_cred);
1164 	econnp->conn_cred = aconnp->conn_cred;
1165 	ASSERT(!(econnp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
1166 	econnp->conn_ixa->ixa_cred = econnp->conn_cred;
1167 	aconnp->conn_cred = NULL;
1168 	econnp->conn_cpid = aconnp->conn_cpid;
1169 	ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
1170 	ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);
1171 
1172 	econnp->conn_zoneid = aconnp->conn_zoneid;
1173 	econnp->conn_allzones = aconnp->conn_allzones;
1174 	econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid;
1175 
1176 	econnp->conn_mac_mode = aconnp->conn_mac_mode;
1177 	econnp->conn_zone_is_global = aconnp->conn_zone_is_global;
1178 	aconnp->conn_mac_mode = CONN_MAC_DEFAULT;
1179 
1180 	/* Do the IPC initialization */
1181 	CONN_INC_REF(econnp);
1182 
1183 	/* Done with old IPC. Drop its ref on its connp */
1184 	CONN_DEC_REF(aconnp);
1185 }
1186 
1187 /*
1188  * Reply to a client's T_CONN_RES TPI message. This function
1189  * is used only for a TLI/XTI listener. Sockfs sends T_CONN_RES
1190  * on the acceptor STREAM and it is processed in tcp_accept_common().
1191  * Read the block comment on top of tcp_input_listener().
1192  */
1193 void
1194 tcp_tli_accept(tcp_t *listener, mblk_t *mp)
1195 {
1196 	tcp_t		*acceptor;
1197 	tcp_t		*eager;
1198 	tcp_t   	*tcp;
1199 	struct T_conn_res	*tcr;
1200 	t_uscalar_t	acceptor_id;
1201 	t_scalar_t	seqnum;
1202 	mblk_t		*discon_mp = NULL;
1203 	mblk_t		*ok_mp;
1204 	mblk_t		*mp1;
1205 	tcp_stack_t	*tcps = listener->tcp_tcps;
1206 	conn_t		*econnp;
1207 
1208 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
1209 		tcp_err_ack(listener, mp, TPROTO, 0);
1210 		return;
1211 	}
1212 	tcr = (struct T_conn_res *)mp->b_rptr;
1213 
1214 	/*
1215 	 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
1216 	 * read side queue of the streams device underneath us i.e. the
1217 	 * read side queue of 'ip'. Since we can't dereference QUEUE_ptr we
1218 	 * look it up in the queue_hash.  Under LP64 it sends down the
1219 	 * minor_t of the accepting endpoint.
1220 	 *
1221 	 * Once the acceptor/eager are modified (in tcp_accept_swap) the
1222 	 * fanout hash lock is held.
1223 	 * This prevents any thread from entering the acceptor queue from
1224 	 * below (since it has not been hard bound yet i.e. any inbound
1225 	 * packets will arrive on the listener conn_t and
1226 	 * go through the classifier).
1227 	 * The CONN_INC_REF will prevent the acceptor from closing.
1228 	 *
1229 	 * XXX It is still possible for a tli application to send down data
1230 	 * on the accepting stream while another thread calls t_accept.
1231 	 * This should not be a problem for well-behaved applications since
1232 	 * the T_OK_ACK is sent after the queue swapping is completed.
1233 	 *
1234 	 * If the accepting fd is the same as the listening fd, avoid
1235 	 * queue hash lookup since that will return an eager listener in an
1236 	 * already established state.
1237 	 */
1238 	acceptor_id = tcr->ACCEPTOR_id;
1239 	mutex_enter(&listener->tcp_eager_lock);
1240 	if (listener->tcp_acceptor_id == acceptor_id) {
1241 		eager = listener->tcp_eager_next_q;
1242 		/* only count how many T_CONN_INDs so don't count q0 */
1243 		if ((listener->tcp_conn_req_cnt_q != 1) ||
1244 		    (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) {
1245 			mutex_exit(&listener->tcp_eager_lock);
1246 			tcp_err_ack(listener, mp, TBADF, 0);
1247 			return;
1248 		}
1249 		if (listener->tcp_conn_req_cnt_q0 != 0) {
1250 			/* Throw away all the eagers on q0. */
1251 			tcp_eager_cleanup(listener, 1);
1252 		}
1253 		if (listener->tcp_syn_defense) {
1254 			listener->tcp_syn_defense = B_FALSE;
1255 			if (listener->tcp_ip_addr_cache != NULL) {
1256 				kmem_free(listener->tcp_ip_addr_cache,
1257 				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1258 				listener->tcp_ip_addr_cache = NULL;
1259 			}
1260 		}
1261 		/*
1262 		 * Transfer tcp_conn_req_max to the eager so that when
1263 		 * a disconnect occurs we can revert the endpoint to the
1264 		 * listen state.
1265 		 */
1266 		eager->tcp_conn_req_max = listener->tcp_conn_req_max;
1267 		ASSERT(listener->tcp_conn_req_cnt_q0 == 0);
1268 		/*
1269 		 * Get a reference on the acceptor just like the
1270 		 * tcp_acceptor_hash_lookup below.
1271 		 */
1272 		acceptor = listener;
1273 		CONN_INC_REF(acceptor->tcp_connp);
1274 	} else {
1275 		acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps);
1276 		if (acceptor == NULL) {
1277 			if (listener->tcp_connp->conn_debug) {
1278 				(void) strlog(TCP_MOD_ID, 0, 1,
1279 				    SL_ERROR|SL_TRACE,
1280 				    "tcp_accept: did not find acceptor 0x%x\n",
1281 				    acceptor_id);
1282 			}
1283 			mutex_exit(&listener->tcp_eager_lock);
1284 			tcp_err_ack(listener, mp, TPROVMISMATCH, 0);
1285 			return;
1286 		}
1287 		/*
1288 		 * Verify acceptor state. The acceptable states for an acceptor
1289 		 * include TCPS_IDLE and TCPS_BOUND.
1290 		 */
1291 		switch (acceptor->tcp_state) {
1292 		case TCPS_IDLE:
1293 			/* FALLTHRU */
1294 		case TCPS_BOUND:
1295 			break;
1296 		default:
1297 			CONN_DEC_REF(acceptor->tcp_connp);
1298 			mutex_exit(&listener->tcp_eager_lock);
1299 			tcp_err_ack(listener, mp, TOUTSTATE, 0);
1300 			return;
1301 		}
1302 	}
1303 
1304 	/* The listener must be in TCPS_LISTEN */
1305 	if (listener->tcp_state != TCPS_LISTEN) {
1306 		CONN_DEC_REF(acceptor->tcp_connp);
1307 		mutex_exit(&listener->tcp_eager_lock);
1308 		tcp_err_ack(listener, mp, TOUTSTATE, 0);
1309 		return;
1310 	}
1311 
1312 	/*
1313 	 * Rendezvous with an eager connection request packet hanging off
1314 	 * 'tcp' that has the 'seqnum' tag.  We tagged the detached open
1315 	 * tcp structure when the connection packet arrived in
1316 	 * tcp_input_listener().
1317 	 */
1318 	seqnum = tcr->SEQ_number;
1319 	eager = listener;
1320 	do {
1321 		eager = eager->tcp_eager_next_q;
1322 		if (eager == NULL) {
1323 			CONN_DEC_REF(acceptor->tcp_connp);
1324 			mutex_exit(&listener->tcp_eager_lock);
1325 			tcp_err_ack(listener, mp, TBADSEQ, 0);
1326 			return;
1327 		}
1328 	} while (eager->tcp_conn_req_seqnum != seqnum);
1329 	mutex_exit(&listener->tcp_eager_lock);
1330 
1331 	/*
1332 	 * At this point, both acceptor and listener have 2 ref
1333 	 * that they begin with. Acceptor has one additional ref
1334 	 * we placed in lookup while listener has 3 additional
1335 	 * ref for being behind the squeue (tcp_accept() is
1336 	 * done on listener's squeue); being in classifier hash;
1337 	 * and eager's ref on listener.
1338 	 */
1339 	ASSERT(listener->tcp_connp->conn_ref >= 5);
1340 	ASSERT(acceptor->tcp_connp->conn_ref >= 3);
1341 
1342 	/*
1343 	 * The eager at this point is set in its own squeue and
1344 	 * could easily have been killed (tcp_accept_finish will
1345 	 * deal with that) because of a TH_RST so we can only
1346 	 * ASSERT for a single ref.
1347 	 */
1348 	ASSERT(eager->tcp_connp->conn_ref >= 1);
1349 
1350 	/*
1351 	 * Pre-allocate the discon_ind mblk also. tcp_accept_finish will
1352 	 * use it if something failed.
1353 	 */
1354 	discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
1355 	    sizeof (struct stroptions)), BPRI_HI);
1356 	if (discon_mp == NULL) {
1357 		CONN_DEC_REF(acceptor->tcp_connp);
1358 		CONN_DEC_REF(eager->tcp_connp);
1359 		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
1360 		return;
1361 	}
1362 
1363 	econnp = eager->tcp_connp;
1364 
1365 	/* Hold a copy of mp, in case reallocb fails */
1366 	if ((mp1 = copymsg(mp)) == NULL) {
1367 		CONN_DEC_REF(acceptor->tcp_connp);
1368 		CONN_DEC_REF(eager->tcp_connp);
1369 		freemsg(discon_mp);
1370 		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
1371 		return;
1372 	}
1373 
1374 	tcr = (struct T_conn_res *)mp1->b_rptr;
1375 
1376 	/*
1377 	 * This is an expanded version of mi_tpi_ok_ack_alloc()
1378 	 * which allocates a larger mblk and appends the new
1379 	 * local address to the ok_ack.  The address is copied by
1380 	 * soaccept() for getsockname().
1381 	 */
1382 	{
1383 		int extra;
1384 
1385 		extra = (econnp->conn_family == AF_INET) ?
1386 		    sizeof (sin_t) : sizeof (sin6_t);
1387 
1388 		/*
1389 		 * Try to re-use mp, if possible.  Otherwise, allocate
1390 		 * an mblk and return it as ok_mp.  In any case, mp
1391 		 * is no longer usable upon return.
1392 		 */
1393 		if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
1394 			CONN_DEC_REF(acceptor->tcp_connp);
1395 			CONN_DEC_REF(eager->tcp_connp);
1396 			freemsg(discon_mp);
1397 			/* Original mp has been freed by now, so use mp1 */
1398 			tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
1399 			return;
1400 		}
1401 
1402 		mp = NULL;	/* We should never use mp after this point */
1403 
1404 		switch (extra) {
1405 		case sizeof (sin_t): {
1406 			sin_t *sin = (sin_t *)ok_mp->b_wptr;
1407 
1408 			ok_mp->b_wptr += extra;
1409 			sin->sin_family = AF_INET;
1410 			sin->sin_port = econnp->conn_lport;
1411 			sin->sin_addr.s_addr = econnp->conn_laddr_v4;
1412 			break;
1413 		}
1414 		case sizeof (sin6_t): {
1415 			sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;
1416 
1417 			ok_mp->b_wptr += extra;
1418 			sin6->sin6_family = AF_INET6;
1419 			sin6->sin6_port = econnp->conn_lport;
1420 			sin6->sin6_addr = econnp->conn_laddr_v6;
1421 			sin6->sin6_flowinfo = econnp->conn_flowinfo;
1422 			if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
1423 			    (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
1424 				sin6->sin6_scope_id =
1425 				    econnp->conn_ixa->ixa_scopeid;
1426 			} else {
1427 				sin6->sin6_scope_id = 0;
1428 			}
1429 			sin6->__sin6_src_id = 0;
1430 			break;
1431 		}
1432 		default:
1433 			break;
1434 		}
1435 		ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim);
1436 	}
1437 
1438 	/*
1439 	 * If there are no options we know that the T_CONN_RES will
1440 	 * succeed. However, we can't send the T_OK_ACK upstream until
1441 	 * the tcp_accept_swap is done since it would be dangerous to
1442 	 * let the application start using the new fd prior to the swap.
1443 	 */
1444 	tcp_accept_swap(listener, acceptor, eager);
1445 
1446 	/*
1447 	 * tcp_accept_swap unlinks eager from listener but does not drop
1448 	 * the eager's reference on the listener.
1449 	 */
1450 	ASSERT(eager->tcp_listener == NULL);
1451 	ASSERT(listener->tcp_connp->conn_ref >= 5);
1452 
1453 	/*
1454 	 * The eager is now associated with its own queue. Insert in
1455 	 * the hash so that the connection can be reused for a future
1456 	 * T_CONN_RES.
1457 	 */
1458 	tcp_acceptor_hash_insert(acceptor_id, eager);
1459 
1460 	/*
1461 	 * We now do the processing of options with T_CONN_RES.
1462 	 * We delay till now since we wanted to have queue to pass to
1463 	 * option processing routines that points back to the right
1464 	 * instance structure which does not happen until after
1465 	 * tcp_accept_swap().
1466 	 *
1467 	 * Note:
1468 	 * The sanity of the logic here assumes that whatever options
1469 	 * are appropriate to inherit from listener=>eager are done
1470 	 * before this point, and whatever were to be overridden (or not)
1471 	 * in transfer logic from eager=>acceptor in tcp_accept_swap().
1472 	 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
1473 	 *   before its ACCEPTOR_id comes down in T_CONN_RES ]
1474 	 * This may not be true at this point in time but can be fixed
1475 	 * independently. This option processing code starts with
1476 	 * the instantiated acceptor instance and the final queue at
1477 	 * this point.
1478 	 */
1479 
1480 	if (tcr->OPT_length != 0) {
1481 		/* Options to process */
1482 		int t_error = 0;
1483 		int sys_error = 0;
1484 		int do_disconnect = 0;
1485 
1486 		if (tcp_conprim_opt_process(eager, mp1,
1487 		    &do_disconnect, &t_error, &sys_error) < 0) {
1488 			eager->tcp_accept_error = 1;
1489 			if (do_disconnect) {
1490 				/*
1491 				 * An option failed which does not allow
1492 				 * connection to be accepted.
1493 				 *
1494 				 * We allow T_CONN_RES to succeed and
1495 				 * put a T_DISCON_IND on the eager queue.
1496 				 */
1497 				ASSERT(t_error == 0 && sys_error == 0);
1498 				eager->tcp_send_discon_ind = 1;
1499 			} else {
1500 				ASSERT(t_error != 0);
1501 				freemsg(ok_mp);
1502 				/*
1503 				 * Original mp was either freed or set
1504 				 * to ok_mp above, so use mp1 instead.
1505 				 */
1506 				tcp_err_ack(listener, mp1, t_error, sys_error);
1507 				goto finish;
1508 			}
1509 		}
1510 		/*
1511 		 * Most likely success in setting options (except if
1512 		 * eager->tcp_send_discon_ind set).
1513 		 * mp1 option buffer represented by OPT_length/offset
1514 		 * potentially modified and contains results of setting
1515 		 * options at this point
1516 		 */
1517 	}
1518 
1519 	/* We no longer need mp1, since all options processing has passed */
1520 	freemsg(mp1);
1521 
1522 	putnext(listener->tcp_connp->conn_rq, ok_mp);
1523 
1524 	mutex_enter(&listener->tcp_eager_lock);
1525 	if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
1526 		tcp_t	*tail;
1527 		mblk_t	*conn_ind;
1528 
1529 		/*
1530 		 * This path should not be executed if listener and
1531 		 * acceptor streams are the same.
1532 		 */
1533 		ASSERT(listener != acceptor);
1534 
1535 		tcp = listener->tcp_eager_prev_q0;
1536 		/*
1537 		 * listener->tcp_eager_prev_q0 points to the TAIL of the
1538 		 * deferred T_conn_ind queue. We need to get to the head of
1539 		 * the queue in order to send up T_conn_ind the same order as
1540 		 * how the 3WHS is completed.
1541 		 */
1542 		while (tcp != listener) {
1543 			if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0)
1544 				break;
1545 			else
1546 				tcp = tcp->tcp_eager_prev_q0;
1547 		}
1548 		ASSERT(tcp != listener);
1549 		conn_ind = tcp->tcp_conn.tcp_eager_conn_ind;
1550 		ASSERT(conn_ind != NULL);
1551 		tcp->tcp_conn.tcp_eager_conn_ind = NULL;
1552 
1553 		/* Move from q0 to q */
1554 		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1555 		listener->tcp_conn_req_cnt_q0--;
1556 		listener->tcp_conn_req_cnt_q++;
1557 		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
1558 		    tcp->tcp_eager_prev_q0;
1559 		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
1560 		    tcp->tcp_eager_next_q0;
1561 		tcp->tcp_eager_prev_q0 = NULL;
1562 		tcp->tcp_eager_next_q0 = NULL;
1563 		tcp->tcp_conn_def_q0 = B_FALSE;
1564 
1565 		/* Make sure the tcp isn't in the list of droppables */
1566 		ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
1567 		    tcp->tcp_eager_prev_drop_q0 == NULL);
1568 
1569 		/*
1570 		 * Insert at end of the queue because sockfs sends
1571 		 * down T_CONN_RES in chronological order. Leaving
1572 		 * the older conn indications at front of the queue
1573 		 * helps reducing search time.
1574 		 */
1575 		tail = listener->tcp_eager_last_q;
1576 		if (tail != NULL)
1577 			tail->tcp_eager_next_q = tcp;
1578 		else
1579 			listener->tcp_eager_next_q = tcp;
1580 		listener->tcp_eager_last_q = tcp;
1581 		tcp->tcp_eager_next_q = NULL;
1582 		mutex_exit(&listener->tcp_eager_lock);
1583 		putnext(tcp->tcp_connp->conn_rq, conn_ind);
1584 	} else {
1585 		mutex_exit(&listener->tcp_eager_lock);
1586 	}
1587 
1588 	/*
1589 	 * Done with the acceptor - free it
1590 	 *
1591 	 * Note: from this point on, no access to listener should be made
1592 	 * as listener can be equal to acceptor.
1593 	 */
1594 finish:
1595 	ASSERT(acceptor->tcp_detached);
1596 	acceptor->tcp_connp->conn_rq = NULL;
1597 	ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp));
1598 	acceptor->tcp_connp->conn_wq = NULL;
1599 	(void) tcp_clean_death(acceptor, 0);
1600 	CONN_DEC_REF(acceptor->tcp_connp);
1601 
1602 	/*
1603 	 * We pass discon_mp to tcp_accept_finish to get on the right squeue.
1604 	 *
1605 	 * It will update the setting for sockfs/stream head and also take
1606 	 * care of any data that arrived before accept() was called.
1607 	 * In case we already received a FIN then tcp_accept_finish will send up
1608 	 * the ordrel. It will also send up a window update if the window
1609 	 * has opened up.
1610 	 */
1611 
1612 	/*
1613 	 * XXX: we currently have a problem if XTI application closes the
1614 	 * acceptor stream in between. This problem exists in on10-gate also
1615 	 * and is well known but nothing can be done short of major rewrite
1616 	 * to fix it. Now it is possible to take care of it by assigning TLI/XTI
1617 	 * eager same squeue as listener (we can distinguish non socket
1618 	 * listeners at the time of handling a SYN in tcp_input_listener)
1619 	 * and do most of the work that tcp_accept_finish does here itself
1620 	 * and then get behind the acceptor squeue to access the acceptor
1621 	 * queue.
1622 	 */
1623 	/*
1624 	 * We already have a ref on tcp so no need to do one before squeue_enter
1625 	 */
1626 	SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp,
1627 	    tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL,
1628 	    SQTAG_TCP_ACCEPT_FINISH);
1629 }
1630 
1631 
1632 /*
1633  * This is the STREAMS entry point for T_CONN_RES coming down on
1634  * Acceptor STREAM when  sockfs listener does accept processing.
1635  * Read the block comment on top of tcp_input_listener().
1636  */
1637 void
1638 tcp_tpi_accept(queue_t *q, mblk_t *mp)
1639 {
1640 	queue_t *rq = RD(q);
1641 	struct T_conn_res *conn_res;
1642 	tcp_t *eager;
1643 	tcp_t *listener;
1644 	struct T_ok_ack *ok;
1645 	t_scalar_t PRIM_type;
1646 	conn_t *econnp;
1647 	cred_t *cr;
1648 
1649 	ASSERT(DB_TYPE(mp) == M_PROTO);
1650 
1651 	/*
1652 	 * All Solaris components should pass a db_credp
1653 	 * for this TPI message, hence we ASSERT.
1654 	 * But in case there is some other M_PROTO that looks
1655 	 * like a TPI message sent by some other kernel
1656 	 * component, we check and return an error.
1657 	 */
1658 	cr = msg_getcred(mp, NULL);
1659 	ASSERT(cr != NULL);
1660 	if (cr == NULL) {
1661 		mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL);
1662 		if (mp != NULL)
1663 			putnext(rq, mp);
1664 		return;
1665 	}
1666 	conn_res = (struct T_conn_res *)mp->b_rptr;
1667 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
1668 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) {
1669 		mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
1670 		if (mp != NULL)
1671 			putnext(rq, mp);
1672 		return;
1673 	}
1674 	switch (conn_res->PRIM_type) {
1675 	case O_T_CONN_RES:
1676 	case T_CONN_RES:
1677 		/*
1678 		 * We pass up an err ack if allocb fails. This will
1679 		 * cause sockfs to issue a T_DISCON_REQ which will cause
1680 		 * tcp_eager_blowoff to be called. sockfs will then call
1681 		 * rq->q_qinfo->qi_qclose to cleanup the acceptor stream.
1682 		 * we need to do the allocb up here because we have to
1683 		 * make sure rq->q_qinfo->qi_qclose still points to the
1684 		 * correct function (tcp_tpi_close_accept) in case allocb
1685 		 * fails.
1686 		 */
1687 		bcopy(mp->b_rptr + conn_res->OPT_offset,
1688 		    &eager, conn_res->OPT_length);
1689 		PRIM_type = conn_res->PRIM_type;
1690 		mp->b_datap->db_type = M_PCPROTO;
1691 		mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack);
1692 		ok = (struct T_ok_ack *)mp->b_rptr;
1693 		ok->PRIM_type = T_OK_ACK;
1694 		ok->CORRECT_prim = PRIM_type;
1695 		econnp = eager->tcp_connp;
1696 		econnp->conn_dev = (dev_t)RD(q)->q_ptr;
1697 		econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr);
1698 		econnp->conn_rq = rq;
1699 		econnp->conn_wq = q;
1700 		rq->q_ptr = econnp;
1701 		rq->q_qinfo = &tcp_rinitv4;	/* No open - same as rinitv6 */
1702 		q->q_ptr = econnp;
1703 		q->q_qinfo = &tcp_winit;
1704 		listener = eager->tcp_listener;
1705 
1706 		if (tcp_accept_common(listener->tcp_connp,
1707 		    econnp, cr) < 0) {
1708 			mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
1709 			if (mp != NULL)
1710 				putnext(rq, mp);
1711 			return;
1712 		}
1713 
1714 		/*
1715 		 * Send the new local address also up to sockfs. There
1716 		 * should already be enough space in the mp that came
1717 		 * down from soaccept().
1718 		 */
1719 		if (econnp->conn_family == AF_INET) {
1720 			sin_t *sin;
1721 
1722 			ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
1723 			    (sizeof (struct T_ok_ack) + sizeof (sin_t)));
1724 			sin = (sin_t *)mp->b_wptr;
1725 			mp->b_wptr += sizeof (sin_t);
1726 			sin->sin_family = AF_INET;
1727 			sin->sin_port = econnp->conn_lport;
1728 			sin->sin_addr.s_addr = econnp->conn_laddr_v4;
1729 		} else {
1730 			sin6_t *sin6;
1731 
1732 			ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
1733 			    sizeof (struct T_ok_ack) + sizeof (sin6_t));
1734 			sin6 = (sin6_t *)mp->b_wptr;
1735 			mp->b_wptr += sizeof (sin6_t);
1736 			sin6->sin6_family = AF_INET6;
1737 			sin6->sin6_port = econnp->conn_lport;
1738 			sin6->sin6_addr = econnp->conn_laddr_v6;
1739 			if (econnp->conn_ipversion == IPV4_VERSION)
1740 				sin6->sin6_flowinfo = 0;
1741 			else
1742 				sin6->sin6_flowinfo = econnp->conn_flowinfo;
1743 			if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
1744 			    (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
1745 				sin6->sin6_scope_id =
1746 				    econnp->conn_ixa->ixa_scopeid;
1747 			} else {
1748 				sin6->sin6_scope_id = 0;
1749 			}
1750 			sin6->__sin6_src_id = 0;
1751 		}
1752 
1753 		putnext(rq, mp);
1754 		return;
1755 	default:
1756 		mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0);
1757 		if (mp != NULL)
1758 			putnext(rq, mp);
1759 		return;
1760 	}
1761 }
1762 
1763 /*
1764  * Send the newconn notification to ulp. The eager is blown off if the
1765  * notification fails.
1766  */
1767 static void
1768 tcp_ulp_newconn(conn_t *lconnp, conn_t *econnp, mblk_t *mp)
1769 {
1770 	if (IPCL_IS_NONSTR(lconnp)) {
1771 		cred_t	*cr;
1772 		pid_t	cpid = NOPID;
1773 
1774 		ASSERT(econnp->conn_tcp->tcp_listener == lconnp->conn_tcp);
1775 		ASSERT(econnp->conn_tcp->tcp_saved_listener ==
1776 		    lconnp->conn_tcp);
1777 
1778 		cr = msg_getcred(mp, &cpid);
1779 
1780 		/* Keep the message around in case of a fallback to TPI */
1781 		econnp->conn_tcp->tcp_conn.tcp_eager_conn_ind = mp;
1782 		/*
1783 		 * Notify the ULP about the newconn. It is guaranteed that no
1784 		 * tcp_accept() call will be made for the eager if the
1785 		 * notification fails, so it's safe to blow it off in that
1786 		 * case.
1787 		 *
1788 		 * The upper handle will be assigned when tcp_accept() is
1789 		 * called.
1790 		 */
1791 		if ((*lconnp->conn_upcalls->su_newconn)
1792 		    (lconnp->conn_upper_handle,
1793 		    (sock_lower_handle_t)econnp,
1794 		    &sock_tcp_downcalls, cr, cpid,
1795 		    &econnp->conn_upcalls) == NULL) {
1796 			/* Failed to allocate a socket */
1797 			TCPS_BUMP_MIB(lconnp->conn_tcp->tcp_tcps,
1798 			    tcpEstabResets);
1799 			(void) tcp_eager_blowoff(lconnp->conn_tcp,
1800 			    econnp->conn_tcp->tcp_conn_req_seqnum);
1801 		}
1802 	} else {
1803 		putnext(lconnp->conn_rq, mp);
1804 	}
1805 }
1806 
1807 /*
1808  * The function called through squeue to get behind listener's perimeter to
1809  * send a deferred conn_ind.
1810  */
1811 /* ARGSUSED */
1812 void
1813 tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1814 {
1815 	conn_t	*lconnp = (conn_t *)arg;
1816 	tcp_t *listener = lconnp->conn_tcp;
1817 	struct T_conn_ind *conn_ind;
1818 	tcp_t *tcp;
1819 
1820 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1821 	bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
1822 	    conn_ind->OPT_length);
1823 
1824 	if (listener->tcp_state != TCPS_LISTEN) {
1825 		/*
1826 		 * If listener has closed, it would have caused a
1827 		 * a cleanup/blowoff to happen for the eager, so
1828 		 * we don't need to do anything more.
1829 		 */
1830 		freemsg(mp);
1831 		return;
1832 	}
1833 
1834 	tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp);
1835 }
1836 
1837 /*
1838  * Sends the T_CONN_IND to the listener. The caller calls this
1839  * functions via squeue to get inside the listener's perimeter
1840  * once the 3 way hand shake is done a T_CONN_IND needs to be
1841  * sent. As an optimization, the caller can call this directly
1842  * if listener's perimeter is same as eager's.
1843  */
1844 /* ARGSUSED */
1845 void
1846 tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2)
1847 {
1848 	conn_t			*lconnp = (conn_t *)arg;
1849 	tcp_t			*listener = lconnp->conn_tcp;
1850 	tcp_t			*tcp;
1851 	struct T_conn_ind	*conn_ind;
1852 	ipaddr_t 		*addr_cache;
1853 	boolean_t		need_send_conn_ind = B_FALSE;
1854 	tcp_stack_t		*tcps = listener->tcp_tcps;
1855 
1856 	/* retrieve the eager */
1857 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1858 	ASSERT(conn_ind->OPT_offset != 0 &&
1859 	    conn_ind->OPT_length == sizeof (intptr_t));
1860 	bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
1861 	    conn_ind->OPT_length);
1862 
1863 	/*
1864 	 * TLI/XTI applications will get confused by
1865 	 * sending eager as an option since it violates
1866 	 * the option semantics. So remove the eager as
1867 	 * option since TLI/XTI app doesn't need it anyway.
1868 	 */
1869 	if (!TCP_IS_SOCKET(listener)) {
1870 		conn_ind->OPT_length = 0;
1871 		conn_ind->OPT_offset = 0;
1872 	}
1873 	if (listener->tcp_state != TCPS_LISTEN) {
1874 		/*
1875 		 * If listener has closed, it would have caused a
1876 		 * a cleanup/blowoff to happen for the eager. We
1877 		 * just need to return.
1878 		 */
1879 		freemsg(mp);
1880 		return;
1881 	}
1882 
1883 
1884 	/*
1885 	 * if the conn_req_q is full defer passing up the
1886 	 * T_CONN_IND until space is availabe after t_accept()
1887 	 * processing
1888 	 */
1889 	mutex_enter(&listener->tcp_eager_lock);
1890 
1891 	/*
1892 	 * Take the eager out, if it is in the list of droppable eagers
1893 	 * as we are here because the 3W handshake is over.
1894 	 */
1895 	MAKE_UNDROPPABLE(tcp);
1896 
1897 	if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) {
1898 		tcp_t *tail;
1899 
1900 		/*
1901 		 * The eager already has an extra ref put in tcp_input_data
1902 		 * so that it stays till accept comes back even though it
1903 		 * might get into TCPS_CLOSED as a result of a TH_RST etc.
1904 		 */
1905 		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1906 		listener->tcp_conn_req_cnt_q0--;
1907 		listener->tcp_conn_req_cnt_q++;
1908 
1909 		/* Move from SYN_RCVD to ESTABLISHED list  */
1910 		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
1911 		    tcp->tcp_eager_prev_q0;
1912 		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
1913 		    tcp->tcp_eager_next_q0;
1914 		tcp->tcp_eager_prev_q0 = NULL;
1915 		tcp->tcp_eager_next_q0 = NULL;
1916 
1917 		/*
1918 		 * Insert at end of the queue because sockfs
1919 		 * sends down T_CONN_RES in chronological
1920 		 * order. Leaving the older conn indications
1921 		 * at front of the queue helps reducing search
1922 		 * time.
1923 		 */
1924 		tail = listener->tcp_eager_last_q;
1925 		if (tail != NULL)
1926 			tail->tcp_eager_next_q = tcp;
1927 		else
1928 			listener->tcp_eager_next_q = tcp;
1929 		listener->tcp_eager_last_q = tcp;
1930 		tcp->tcp_eager_next_q = NULL;
1931 		/*
1932 		 * Delay sending up the T_conn_ind until we are
1933 		 * done with the eager. Once we have have sent up
1934 		 * the T_conn_ind, the accept can potentially complete
1935 		 * any time and release the refhold we have on the eager.
1936 		 */
1937 		need_send_conn_ind = B_TRUE;
1938 	} else {
1939 		/*
1940 		 * Defer connection on q0 and set deferred
1941 		 * connection bit true
1942 		 */
1943 		tcp->tcp_conn_def_q0 = B_TRUE;
1944 
1945 		/* take tcp out of q0 ... */
1946 		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
1947 		    tcp->tcp_eager_next_q0;
1948 		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
1949 		    tcp->tcp_eager_prev_q0;
1950 
1951 		/* ... and place it at the end of q0 */
1952 		tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0;
1953 		tcp->tcp_eager_next_q0 = listener;
1954 		listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp;
1955 		listener->tcp_eager_prev_q0 = tcp;
1956 		tcp->tcp_conn.tcp_eager_conn_ind = mp;
1957 	}
1958 
1959 	/* we have timed out before */
1960 	if (tcp->tcp_syn_rcvd_timeout != 0) {
1961 		tcp->tcp_syn_rcvd_timeout = 0;
1962 		listener->tcp_syn_rcvd_timeout--;
1963 		if (listener->tcp_syn_defense &&
1964 		    listener->tcp_syn_rcvd_timeout <=
1965 		    (tcps->tcps_conn_req_max_q0 >> 5) &&
1966 		    10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
1967 		    listener->tcp_last_rcv_lbolt)) {
1968 			/*
1969 			 * Turn off the defense mode if we
1970 			 * believe the SYN attack is over.
1971 			 */
1972 			listener->tcp_syn_defense = B_FALSE;
1973 			if (listener->tcp_ip_addr_cache) {
1974 				kmem_free((void *)listener->tcp_ip_addr_cache,
1975 				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1976 				listener->tcp_ip_addr_cache = NULL;
1977 			}
1978 		}
1979 	}
1980 	addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
1981 	if (addr_cache != NULL) {
1982 		/*
1983 		 * We have finished a 3-way handshake with this
1984 		 * remote host. This proves the IP addr is good.
1985 		 * Cache it!
1986 		 */
1987 		addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
1988 		    tcp->tcp_connp->conn_faddr_v4;
1989 	}
1990 	mutex_exit(&listener->tcp_eager_lock);
1991 	if (need_send_conn_ind)
1992 		tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp);
1993 }
1994