xref: /illumos-gate/usr/src/uts/common/inet/tcp/tcp_tpi.c (revision 33efde4275d24731ef87927237b0ffb0630b6b2d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /* This file contains all TCP TLI/TPI related functions */
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/strsun.h>
31 #include <sys/strsubr.h>
32 #include <sys/stropts.h>
33 #include <sys/strlog.h>
34 #define	_SUN_TPI_VERSION 2
35 #include <sys/tihdr.h>
36 #include <sys/suntpi.h>
37 #include <sys/xti_inet.h>
38 #include <sys/squeue_impl.h>
39 #include <sys/squeue.h>
40 
41 #include <inet/common.h>
42 #include <inet/ip.h>
43 #include <inet/tcp.h>
44 #include <inet/tcp_impl.h>
45 #include <inet/proto_set.h>
46 
47 static void	tcp_accept_swap(tcp_t *, tcp_t *, tcp_t *);
48 static int	tcp_conprim_opt_process(tcp_t *, mblk_t *, int *, int *, int *);
49 
50 void
tcp_use_pure_tpi(tcp_t * tcp)51 tcp_use_pure_tpi(tcp_t *tcp)
52 {
53 	conn_t		*connp = tcp->tcp_connp;
54 
55 #ifdef	_ILP32
56 	tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq;
57 #else
58 	tcp->tcp_acceptor_id = connp->conn_dev;
59 #endif
60 	/*
61 	 * Insert this socket into the acceptor hash.
62 	 * We might need it for T_CONN_RES message
63 	 */
64 	tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
65 
66 	tcp->tcp_issocket = B_FALSE;
67 	TCP_STAT(tcp->tcp_tcps, tcp_sock_fallback);
68 }
69 
70 /* Shorthand to generate and send TPI error acks to our client */
71 void
tcp_err_ack(tcp_t * tcp,mblk_t * mp,int t_error,int sys_error)72 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error)
73 {
74 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
75 		putnext(tcp->tcp_connp->conn_rq, mp);
76 }
77 
78 /* Shorthand to generate and send TPI error acks to our client */
79 void
tcp_err_ack_prim(tcp_t * tcp,mblk_t * mp,int primitive,int t_error,int sys_error)80 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
81     int t_error, int sys_error)
82 {
83 	struct T_error_ack	*teackp;
84 
85 	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
86 	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
87 		teackp = (struct T_error_ack *)mp->b_rptr;
88 		teackp->ERROR_prim = primitive;
89 		teackp->TLI_error = t_error;
90 		teackp->UNIX_error = sys_error;
91 		putnext(tcp->tcp_connp->conn_rq, mp);
92 	}
93 }
94 
95 /*
96  * TCP routine to get the values of options.
97  */
98 int
tcp_tpi_opt_get(queue_t * q,int level,int name,uchar_t * ptr)99 tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
100 {
101 	return (tcp_opt_get(Q_TO_CONN(q), level, name, ptr));
102 }
103 
104 /* ARGSUSED */
105 int
tcp_tpi_opt_set(queue_t * q,uint_t optset_context,int level,int name,uint_t inlen,uchar_t * invalp,uint_t * outlenp,uchar_t * outvalp,void * thisdg_attrs,cred_t * cr)106 tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
107     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
108     void *thisdg_attrs, cred_t *cr)
109 {
110 	conn_t	*connp =  Q_TO_CONN(q);
111 
112 	return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp,
113 	    outlenp, outvalp, thisdg_attrs, cr));
114 }
115 
116 static int
tcp_conprim_opt_process(tcp_t * tcp,mblk_t * mp,int * do_disconnectp,int * t_errorp,int * sys_errorp)117 tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp,
118     int *t_errorp, int *sys_errorp)
119 {
120 	int error;
121 	int is_absreq_failure;
122 	t_scalar_t *opt_lenp;
123 	t_scalar_t opt_offset;
124 	int prim_type;
125 	struct T_conn_req *tcreqp;
126 	struct T_conn_res *tcresp;
127 	cred_t *cr;
128 
129 	/*
130 	 * All Solaris components should pass a db_credp
131 	 * for this TPI message, hence we ASSERT.
132 	 * But in case there is some other M_PROTO that looks
133 	 * like a TPI message sent by some other kernel
134 	 * component, we check and return an error.
135 	 */
136 	cr = msg_getcred(mp, NULL);
137 	ASSERT(cr != NULL);
138 	if (cr == NULL)
139 		return (-1);
140 
141 	prim_type = ((union T_primitives *)mp->b_rptr)->type;
142 	ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES ||
143 	    prim_type == T_CONN_RES);
144 
145 	switch (prim_type) {
146 	case T_CONN_REQ:
147 		tcreqp = (struct T_conn_req *)mp->b_rptr;
148 		opt_offset = tcreqp->OPT_offset;
149 		opt_lenp = (t_scalar_t *)&tcreqp->OPT_length;
150 		break;
151 	case O_T_CONN_RES:
152 	case T_CONN_RES:
153 		tcresp = (struct T_conn_res *)mp->b_rptr;
154 		opt_offset = tcresp->OPT_offset;
155 		opt_lenp = (t_scalar_t *)&tcresp->OPT_length;
156 		break;
157 	default:
158 		opt_lenp = 0;
159 		opt_offset = 0;
160 		break;
161 	}
162 
163 	*t_errorp = 0;
164 	*sys_errorp = 0;
165 	*do_disconnectp = 0;
166 
167 	error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp,
168 	    opt_offset, cr, &tcp_opt_obj,
169 	    NULL, &is_absreq_failure);
170 
171 	switch (error) {
172 	case  0:		/* no error */
173 		ASSERT(is_absreq_failure == 0);
174 		return (0);
175 	case ENOPROTOOPT:
176 		*t_errorp = TBADOPT;
177 		break;
178 	case EACCES:
179 		*t_errorp = TACCES;
180 		break;
181 	default:
182 		*t_errorp = TSYSERR; *sys_errorp = error;
183 		break;
184 	}
185 	if (is_absreq_failure != 0) {
186 		/*
187 		 * The connection request should get the local ack
188 		 * T_OK_ACK and then a T_DISCON_IND.
189 		 */
190 		*do_disconnectp = 1;
191 	}
192 	return (-1);
193 }
194 
195 void
tcp_tpi_bind(tcp_t * tcp,mblk_t * mp)196 tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
197 {
198 	int	error;
199 	conn_t	*connp = tcp->tcp_connp;
200 	struct sockaddr	*sa;
201 	mblk_t  *mp1;
202 	struct T_bind_req *tbr;
203 	int	backlog;
204 	socklen_t	len;
205 	sin_t	*sin;
206 	sin6_t	*sin6;
207 	cred_t		*cr;
208 
209 	/*
210 	 * All Solaris components should pass a db_credp
211 	 * for this TPI message, hence we ASSERT.
212 	 * But in case there is some other M_PROTO that looks
213 	 * like a TPI message sent by some other kernel
214 	 * component, we check and return an error.
215 	 */
216 	cr = msg_getcred(mp, NULL);
217 	ASSERT(cr != NULL);
218 	if (cr == NULL) {
219 		tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
220 		return;
221 	}
222 
223 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
224 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
225 		if (connp->conn_debug) {
226 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
227 			    "tcp_tpi_bind: bad req, len %u",
228 			    (uint_t)(mp->b_wptr - mp->b_rptr));
229 		}
230 		tcp_err_ack(tcp, mp, TPROTO, 0);
231 		return;
232 	}
233 	/* Make sure the largest address fits */
234 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
235 	if (mp1 == NULL) {
236 		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
237 		return;
238 	}
239 	mp = mp1;
240 	tbr = (struct T_bind_req *)mp->b_rptr;
241 
242 	backlog = tbr->CONIND_number;
243 	len = tbr->ADDR_length;
244 
245 	switch (len) {
246 	case 0:		/* request for a generic port */
247 		tbr->ADDR_offset = sizeof (struct T_bind_req);
248 		if (connp->conn_family == AF_INET) {
249 			tbr->ADDR_length = sizeof (sin_t);
250 			sin = (sin_t *)&tbr[1];
251 			*sin = sin_null;
252 			sin->sin_family = AF_INET;
253 			sa = (struct sockaddr *)sin;
254 			len = sizeof (sin_t);
255 			mp->b_wptr = (uchar_t *)&sin[1];
256 		} else {
257 			ASSERT(connp->conn_family == AF_INET6);
258 			tbr->ADDR_length = sizeof (sin6_t);
259 			sin6 = (sin6_t *)&tbr[1];
260 			*sin6 = sin6_null;
261 			sin6->sin6_family = AF_INET6;
262 			sa = (struct sockaddr *)sin6;
263 			len = sizeof (sin6_t);
264 			mp->b_wptr = (uchar_t *)&sin6[1];
265 		}
266 		break;
267 
268 	case sizeof (sin_t):    /* Complete IPv4 address */
269 		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
270 		    sizeof (sin_t));
271 		break;
272 
273 	case sizeof (sin6_t): /* Complete IPv6 address */
274 		sa = (struct sockaddr *)mi_offset_param(mp,
275 		    tbr->ADDR_offset, sizeof (sin6_t));
276 		break;
277 
278 	default:
279 		if (connp->conn_debug) {
280 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
281 			    "tcp_tpi_bind: bad address length, %d",
282 			    tbr->ADDR_length);
283 		}
284 		tcp_err_ack(tcp, mp, TBADADDR, 0);
285 		return;
286 	}
287 
288 	if (backlog > 0) {
289 		error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp),
290 		    tbr->PRIM_type != O_T_BIND_REQ);
291 	} else {
292 		error = tcp_do_bind(connp, sa, len, DB_CRED(mp),
293 		    tbr->PRIM_type != O_T_BIND_REQ);
294 	}
295 
296 	if (error > 0) {
297 		tcp_err_ack(tcp, mp, TSYSERR, error);
298 	} else if (error < 0) {
299 		tcp_err_ack(tcp, mp, -error, 0);
300 	} else {
301 		/*
302 		 * Update port information as sockfs/tpi needs it for checking
303 		 */
304 		if (connp->conn_family == AF_INET) {
305 			sin = (sin_t *)sa;
306 			sin->sin_port = connp->conn_lport;
307 		} else {
308 			sin6 = (sin6_t *)sa;
309 			sin6->sin6_port = connp->conn_lport;
310 		}
311 		mp->b_datap->db_type = M_PCPROTO;
312 		tbr->PRIM_type = T_BIND_ACK;
313 		putnext(connp->conn_rq, mp);
314 	}
315 }
316 
317 /* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */
318 void
tcp_tpi_unbind(tcp_t * tcp,mblk_t * mp)319 tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp)
320 {
321 	conn_t *connp = tcp->tcp_connp;
322 	int error;
323 
324 	error = tcp_do_unbind(connp);
325 	if (error > 0) {
326 		tcp_err_ack(tcp, mp, TSYSERR, error);
327 	} else if (error < 0) {
328 		tcp_err_ack(tcp, mp, -error, 0);
329 	} else {
330 		/* Send M_FLUSH according to TPI */
331 		(void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW);
332 
333 		mp = mi_tpi_ok_ack_alloc(mp);
334 		if (mp != NULL)
335 			putnext(connp->conn_rq, mp);
336 	}
337 }
338 
339 /* ARGSUSED */
340 int
tcp_tpi_close(queue_t * q,int flags,cred_t * credp __unused)341 tcp_tpi_close(queue_t *q, int flags, cred_t *credp __unused)
342 {
343 	conn_t		*connp;
344 
345 	ASSERT(WR(q)->q_next == NULL);
346 
347 	if (flags & SO_FALLBACK) {
348 		/*
349 		 * stream is being closed while in fallback
350 		 * simply free the resources that were allocated
351 		 */
352 		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
353 		qprocsoff(q);
354 		goto done;
355 	}
356 
357 	connp = Q_TO_CONN(q);
358 	/*
359 	 * We are being closed as /dev/tcp or /dev/tcp6.
360 	 */
361 	tcp_close_common(connp, flags);
362 
363 	qprocsoff(q);
364 	inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
365 
366 	/*
367 	 * Drop IP's reference on the conn. This is the last reference
368 	 * on the connp if the state was less than established. If the
369 	 * connection has gone into timewait state, then we will have
370 	 * one ref for the TCP and one more ref (total of two) for the
371 	 * classifier connected hash list (a timewait connections stays
372 	 * in connected hash till closed).
373 	 *
374 	 * We can't assert the references because there might be other
375 	 * transient reference places because of some walkers or queued
376 	 * packets in squeue for the timewait state.
377 	 */
378 	CONN_DEC_REF(connp);
379 done:
380 	q->q_ptr = WR(q)->q_ptr = NULL;
381 	return (0);
382 }
383 
384 /* ARGSUSED */
385 int
tcp_tpi_close_accept(queue_t * q,int flags __unused,cred_t * credp __unused)386 tcp_tpi_close_accept(queue_t *q, int flags __unused, cred_t *credp __unused)
387 {
388 	vmem_t	*minor_arena;
389 	dev_t	conn_dev;
390 	extern struct qinit tcp_acceptor_winit;
391 
392 	ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit);
393 
394 	/*
395 	 * We had opened an acceptor STREAM for sockfs which is
396 	 * now being closed due to some error.
397 	 */
398 	qprocsoff(q);
399 
400 	minor_arena = (vmem_t *)WR(q)->q_ptr;
401 	conn_dev = (dev_t)RD(q)->q_ptr;
402 	ASSERT(minor_arena != NULL);
403 	ASSERT(conn_dev != 0);
404 	inet_minor_free(minor_arena, conn_dev);
405 	q->q_ptr = WR(q)->q_ptr = NULL;
406 	return (0);
407 }
408 
409 /*
410  * Put a connection confirmation message upstream built from the
411  * address/flowid information with the conn and iph. Report our success or
412  * failure.
413  */
414 boolean_t
tcp_conn_con(tcp_t * tcp,uchar_t * iphdr,mblk_t * idmp,mblk_t ** defermp,ip_recv_attr_t * ira)415 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp,
416     mblk_t **defermp, ip_recv_attr_t *ira)
417 {
418 	sin_t	sin;
419 	sin6_t	sin6;
420 	mblk_t	*mp;
421 	char	*optp = NULL;
422 	int	optlen = 0;
423 	conn_t	*connp = tcp->tcp_connp;
424 
425 	if (defermp != NULL)
426 		*defermp = NULL;
427 
428 	if (tcp->tcp_conn.tcp_opts_conn_req != NULL) {
429 		/*
430 		 * Return in T_CONN_CON results of option negotiation through
431 		 * the T_CONN_REQ. Note: If there is an real end-to-end option
432 		 * negotiation, then what is received from remote end needs
433 		 * to be taken into account but there is no such thing (yet?)
434 		 * in our TCP/IP.
435 		 * Note: We do not use mi_offset_param() here as
436 		 * tcp_opts_conn_req contents do not directly come from
437 		 * an application and are either generated in kernel or
438 		 * from user input that was already verified.
439 		 */
440 		mp = tcp->tcp_conn.tcp_opts_conn_req;
441 		optp = (char *)(mp->b_rptr +
442 		    ((struct T_conn_req *)mp->b_rptr)->OPT_offset);
443 		optlen = (int)
444 		    ((struct T_conn_req *)mp->b_rptr)->OPT_length;
445 	}
446 
447 	if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) {
448 
449 		/* packet is IPv4 */
450 		if (connp->conn_family == AF_INET) {
451 			sin = sin_null;
452 			sin.sin_addr.s_addr = connp->conn_faddr_v4;
453 			sin.sin_port = connp->conn_fport;
454 			sin.sin_family = AF_INET;
455 			mp = mi_tpi_conn_con(NULL, (char *)&sin,
456 			    (int)sizeof (sin_t), optp, optlen);
457 		} else {
458 			sin6 = sin6_null;
459 			sin6.sin6_addr = connp->conn_faddr_v6;
460 			sin6.sin6_port = connp->conn_fport;
461 			sin6.sin6_family = AF_INET6;
462 			mp = mi_tpi_conn_con(NULL, (char *)&sin6,
463 			    (int)sizeof (sin6_t), optp, optlen);
464 
465 		}
466 	} else {
467 		ip6_t	*ip6h = (ip6_t *)iphdr;
468 
469 		ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION);
470 		ASSERT(connp->conn_family == AF_INET6);
471 		sin6 = sin6_null;
472 		sin6.sin6_addr = connp->conn_faddr_v6;
473 		sin6.sin6_port = connp->conn_fport;
474 		sin6.sin6_family = AF_INET6;
475 		sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
476 		mp = mi_tpi_conn_con(NULL, (char *)&sin6,
477 		    (int)sizeof (sin6_t), optp, optlen);
478 	}
479 
480 	if (!mp)
481 		return (B_FALSE);
482 
483 	mblk_copycred(mp, idmp);
484 
485 	if (defermp == NULL) {
486 		conn_t *connp = tcp->tcp_connp;
487 		if (IPCL_IS_NONSTR(connp)) {
488 			(*connp->conn_upcalls->su_connected)
489 			    (connp->conn_upper_handle, tcp->tcp_connid,
490 			    ira->ira_cred, ira->ira_cpid);
491 			freemsg(mp);
492 		} else {
493 			if (ira->ira_cred != NULL) {
494 				/* So that getpeerucred works for TPI sockfs */
495 				mblk_setcred(mp, ira->ira_cred, ira->ira_cpid);
496 			}
497 			putnext(connp->conn_rq, mp);
498 		}
499 	} else {
500 		*defermp = mp;
501 	}
502 
503 	if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
504 		tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
505 	return (B_TRUE);
506 }
507 
508 /*
509  * Successful connect request processing begins when our client passes
510  * a T_CONN_REQ message into tcp_wput(), which performs function calls into
511  * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream).
512  *
513  * After various error checks are completed, tcp_tpi_connect() lays
514  * the target address and port into the composite header template.
515  * Then we ask IP for information, including a source address if we didn't
516  * already have one. Finally we prepare to send the SYN packet, and then
517  * send up the T_OK_ACK reply message.
518  */
519 void
tcp_tpi_connect(tcp_t * tcp,mblk_t * mp)520 tcp_tpi_connect(tcp_t *tcp, mblk_t *mp)
521 {
522 	sin_t		*sin;
523 	struct T_conn_req	*tcr;
524 	struct sockaddr	*sa;
525 	socklen_t	len;
526 	int		error;
527 	cred_t		*cr;
528 	pid_t		cpid;
529 	conn_t		*connp = tcp->tcp_connp;
530 	queue_t		*q = connp->conn_wq;
531 
532 	/*
533 	 * All Solaris components should pass a db_credp
534 	 * for this TPI message, hence we ASSERT.
535 	 * But in case there is some other M_PROTO that looks
536 	 * like a TPI message sent by some other kernel
537 	 * component, we check and return an error.
538 	 */
539 	cr = msg_getcred(mp, &cpid);
540 	ASSERT(cr != NULL);
541 	if (cr == NULL) {
542 		tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
543 		return;
544 	}
545 
546 	tcr = (struct T_conn_req *)mp->b_rptr;
547 
548 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
549 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
550 		tcp_err_ack(tcp, mp, TPROTO, 0);
551 		return;
552 	}
553 
554 	/*
555 	 * Pre-allocate the T_ordrel_ind mblk so that at close time, we
556 	 * will always have that to send up.  Otherwise, we need to do
557 	 * special handling in case the allocation fails at that time.
558 	 * If the end point is TPI, the tcp_t can be reused and the
559 	 * tcp_ordrel_mp may be allocated already.
560 	 */
561 	if (tcp->tcp_ordrel_mp == NULL) {
562 		if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) {
563 			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
564 			return;
565 		}
566 	}
567 
568 	/*
569 	 * Determine packet type based on type of address passed in
570 	 * the request should contain an IPv4 or IPv6 address.
571 	 * Make sure that address family matches the type of
572 	 * family of the address passed down.
573 	 */
574 	switch (tcr->DEST_length) {
575 	default:
576 		tcp_err_ack(tcp, mp, TBADADDR, 0);
577 		return;
578 
579 	case (sizeof (sin_t) - sizeof (sin->sin_zero)): {
580 		/*
581 		 * XXX: The check for valid DEST_length was not there
582 		 * in earlier releases and some buggy
583 		 * TLI apps (e.g Sybase) got away with not feeding
584 		 * in sin_zero part of address.
585 		 * We allow that bug to keep those buggy apps humming.
586 		 * Test suites require the check on DEST_length.
587 		 * We construct a new mblk with valid DEST_length
588 		 * free the original so the rest of the code does
589 		 * not have to keep track of this special shorter
590 		 * length address case.
591 		 */
592 		mblk_t *nmp;
593 		struct T_conn_req *ntcr;
594 		sin_t *nsin;
595 
596 		nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) +
597 		    tcr->OPT_length, BPRI_HI);
598 		if (nmp == NULL) {
599 			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
600 			return;
601 		}
602 		ntcr = (struct T_conn_req *)nmp->b_rptr;
603 		bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */
604 		ntcr->PRIM_type = T_CONN_REQ;
605 		ntcr->DEST_length = sizeof (sin_t);
606 		ntcr->DEST_offset = sizeof (struct T_conn_req);
607 
608 		nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset);
609 		*nsin = sin_null;
610 		/* Get pointer to shorter address to copy from original mp */
611 		sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
612 		    tcr->DEST_length); /* extract DEST_length worth of sin_t */
613 		if (sin == NULL || !OK_32PTR((char *)sin)) {
614 			freemsg(nmp);
615 			tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
616 			return;
617 		}
618 		nsin->sin_family = sin->sin_family;
619 		nsin->sin_port = sin->sin_port;
620 		nsin->sin_addr = sin->sin_addr;
621 		/* Note:nsin->sin_zero zero-fill with sin_null assign above */
622 		nmp->b_wptr = (uchar_t *)&nsin[1];
623 		if (tcr->OPT_length != 0) {
624 			ntcr->OPT_length = tcr->OPT_length;
625 			ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr;
626 			bcopy((uchar_t *)tcr + tcr->OPT_offset,
627 			    (uchar_t *)ntcr + ntcr->OPT_offset,
628 			    tcr->OPT_length);
629 			nmp->b_wptr += tcr->OPT_length;
630 		}
631 		freemsg(mp);	/* original mp freed */
632 		mp = nmp;	/* re-initialize original variables */
633 		tcr = ntcr;
634 	}
635 	/* FALLTHRU */
636 
637 	case sizeof (sin_t):
638 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
639 		    sizeof (sin_t));
640 		len = sizeof (sin_t);
641 		break;
642 
643 	case sizeof (sin6_t):
644 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
645 		    sizeof (sin6_t));
646 		len = sizeof (sin6_t);
647 		break;
648 	}
649 
650 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
651 	if (error != 0) {
652 		tcp_err_ack(tcp, mp, TSYSERR, error);
653 		return;
654 	}
655 
656 	/*
657 	 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we
658 	 * should key on their sequence number and cut them loose.
659 	 */
660 
661 	/*
662 	 * If options passed in, feed it for verification and handling
663 	 */
664 	if (tcr->OPT_length != 0) {
665 		mblk_t	*ok_mp;
666 		mblk_t	*discon_mp;
667 		mblk_t  *conn_opts_mp;
668 		int t_error, sys_error, do_disconnect;
669 
670 		conn_opts_mp = NULL;
671 
672 		if (tcp_conprim_opt_process(tcp, mp,
673 		    &do_disconnect, &t_error, &sys_error) < 0) {
674 			if (do_disconnect) {
675 				ASSERT(t_error == 0 && sys_error == 0);
676 				discon_mp = mi_tpi_discon_ind(NULL,
677 				    ECONNREFUSED, 0);
678 				if (!discon_mp) {
679 					tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
680 					    TSYSERR, ENOMEM);
681 					return;
682 				}
683 				ok_mp = mi_tpi_ok_ack_alloc(mp);
684 				if (!ok_mp) {
685 					tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
686 					    TSYSERR, ENOMEM);
687 					return;
688 				}
689 				qreply(q, ok_mp);
690 				qreply(q, discon_mp); /* no flush! */
691 			} else {
692 				ASSERT(t_error != 0);
693 				tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error,
694 				    sys_error);
695 			}
696 			return;
697 		}
698 		/*
699 		 * Success in setting options, the mp option buffer represented
700 		 * by OPT_length/offset has been potentially modified and
701 		 * contains results of option processing. We copy it in
702 		 * another mp to save it for potentially influencing returning
703 		 * it in T_CONN_CONN.
704 		 */
705 		if (tcr->OPT_length != 0) { /* there are resulting options */
706 			conn_opts_mp = copyb(mp);
707 			if (!conn_opts_mp) {
708 				tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
709 				    TSYSERR, ENOMEM);
710 				return;
711 			}
712 			ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL);
713 			tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp;
714 			/*
715 			 * Note:
716 			 * These resulting option negotiation can include any
717 			 * end-to-end negotiation options but there no such
718 			 * thing (yet?) in our TCP/IP.
719 			 */
720 		}
721 	}
722 
723 	/* call the non-TPI version */
724 	error = tcp_do_connect(tcp->tcp_connp, sa, len, cr, cpid);
725 	if (error < 0) {
726 		mp = mi_tpi_err_ack_alloc(mp, -error, 0);
727 	} else if (error > 0) {
728 		mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
729 	} else {
730 		mp = mi_tpi_ok_ack_alloc(mp);
731 	}
732 
733 	/*
734 	 * Note: Code below is the "failure" case
735 	 */
736 	/* return error ack and blow away saved option results if any */
737 	if (mp != NULL)
738 		putnext(connp->conn_rq, mp);
739 	else {
740 		tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
741 		    TSYSERR, ENOMEM);
742 	}
743 }
744 
745 /* Return the TPI/TLI equivalent of our current tcp_state */
746 static int
tcp_tpistate(tcp_t * tcp)747 tcp_tpistate(tcp_t *tcp)
748 {
749 	switch (tcp->tcp_state) {
750 	case TCPS_IDLE:
751 		return (TS_UNBND);
752 	case TCPS_LISTEN:
753 		/*
754 		 * Return whether there are outstanding T_CONN_IND waiting
755 		 * for the matching T_CONN_RES. Therefore don't count q0.
756 		 */
757 		if (tcp->tcp_conn_req_cnt_q > 0)
758 			return (TS_WRES_CIND);
759 		else
760 			return (TS_IDLE);
761 	case TCPS_BOUND:
762 		return (TS_IDLE);
763 	case TCPS_SYN_SENT:
764 		return (TS_WCON_CREQ);
765 	case TCPS_SYN_RCVD:
766 		/*
767 		 * Note: assumption: this has to the active open SYN_RCVD.
768 		 * The passive instance is detached in SYN_RCVD stage of
769 		 * incoming connection processing so we cannot get request
770 		 * for T_info_ack on it.
771 		 */
772 		return (TS_WACK_CRES);
773 	case TCPS_ESTABLISHED:
774 		return (TS_DATA_XFER);
775 	case TCPS_CLOSE_WAIT:
776 		return (TS_WREQ_ORDREL);
777 	case TCPS_FIN_WAIT_1:
778 		return (TS_WIND_ORDREL);
779 	case TCPS_FIN_WAIT_2:
780 		return (TS_WIND_ORDREL);
781 
782 	case TCPS_CLOSING:
783 	case TCPS_LAST_ACK:
784 	case TCPS_TIME_WAIT:
785 	case TCPS_CLOSED:
786 		/*
787 		 * Following TS_WACK_DREQ7 is a rendition of "not
788 		 * yet TS_IDLE" TPI state. There is no best match to any
789 		 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we
790 		 * choose a value chosen that will map to TLI/XTI level
791 		 * state of TSTATECHNG (state is process of changing) which
792 		 * captures what this dummy state represents.
793 		 */
794 		return (TS_WACK_DREQ7);
795 	default:
796 		cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s",
797 		    tcp->tcp_state, tcp_display(tcp, NULL,
798 		    DISP_PORT_ONLY));
799 		return (TS_UNBND);
800 	}
801 }
802 
803 static void
tcp_copy_info(struct T_info_ack * tia,tcp_t * tcp)804 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
805 {
806 	tcp_stack_t	*tcps = tcp->tcp_tcps;
807 	conn_t		*connp = tcp->tcp_connp;
808 	extern struct T_info_ack tcp_g_t_info_ack;
809 	extern struct T_info_ack tcp_g_t_info_ack_v6;
810 
811 	if (connp->conn_family == AF_INET6)
812 		*tia = tcp_g_t_info_ack_v6;
813 	else
814 		*tia = tcp_g_t_info_ack;
815 	tia->CURRENT_state = tcp_tpistate(tcp);
816 	tia->OPT_size = tcp_max_optsize;
817 	if (tcp->tcp_mss == 0) {
818 		/* Not yet set - tcp_open does not set mss */
819 		if (connp->conn_ipversion == IPV4_VERSION)
820 			tia->TIDU_size = tcps->tcps_mss_def_ipv4;
821 		else
822 			tia->TIDU_size = tcps->tcps_mss_def_ipv6;
823 	} else {
824 		tia->TIDU_size = tcp->tcp_mss;
825 	}
826 	/* TODO: Default ETSDU is 1.  Is that correct for tcp? */
827 }
828 
829 void
tcp_do_capability_ack(tcp_t * tcp,struct T_capability_ack * tcap,t_uscalar_t cap_bits1)830 tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap,
831     t_uscalar_t cap_bits1)
832 {
833 	tcap->CAP_bits1 = 0;
834 
835 	if (cap_bits1 & TC1_INFO) {
836 		tcp_copy_info(&tcap->INFO_ack, tcp);
837 		tcap->CAP_bits1 |= TC1_INFO;
838 	}
839 
840 	if (cap_bits1 & TC1_ACCEPTOR_ID) {
841 		tcap->ACCEPTOR_id = tcp->tcp_acceptor_id;
842 		tcap->CAP_bits1 |= TC1_ACCEPTOR_ID;
843 	}
844 
845 }
846 
847 /*
848  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
849  * tcp_wput.  Much of the T_CAPABILITY_ACK information is copied from
850  * tcp_g_t_info_ack.  The current state of the stream is copied from
851  * tcp_state.
852  */
853 void
tcp_capability_req(tcp_t * tcp,mblk_t * mp)854 tcp_capability_req(tcp_t *tcp, mblk_t *mp)
855 {
856 	t_uscalar_t		cap_bits1;
857 	struct T_capability_ack	*tcap;
858 
859 	if (MBLKL(mp) < sizeof (struct T_capability_req)) {
860 		freemsg(mp);
861 		return;
862 	}
863 
864 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
865 
866 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
867 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
868 	if (mp == NULL)
869 		return;
870 
871 	tcap = (struct T_capability_ack *)mp->b_rptr;
872 	tcp_do_capability_ack(tcp, tcap, cap_bits1);
873 
874 	putnext(tcp->tcp_connp->conn_rq, mp);
875 }
876 
877 /*
878  * This routine responds to T_INFO_REQ messages.  It is called by tcp_wput.
879  * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack.
880  * The current state of the stream is copied from tcp_state.
881  */
882 void
tcp_info_req(tcp_t * tcp,mblk_t * mp)883 tcp_info_req(tcp_t *tcp, mblk_t *mp)
884 {
885 	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
886 	    T_INFO_ACK);
887 	if (!mp) {
888 		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
889 		return;
890 	}
891 	tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp);
892 	putnext(tcp->tcp_connp->conn_rq, mp);
893 }
894 
895 /* Respond to the TPI addr request */
896 void
tcp_addr_req(tcp_t * tcp,mblk_t * mp)897 tcp_addr_req(tcp_t *tcp, mblk_t *mp)
898 {
899 	struct sockaddr *sa;
900 	mblk_t	*ackmp;
901 	struct T_addr_ack *taa;
902 	conn_t	*connp = tcp->tcp_connp;
903 	uint_t	addrlen;
904 
905 	/* Make it large enough for worst case */
906 	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
907 	    2 * sizeof (sin6_t), 1);
908 	if (ackmp == NULL) {
909 		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
910 		return;
911 	}
912 
913 	taa = (struct T_addr_ack *)ackmp->b_rptr;
914 
915 	bzero(taa, sizeof (struct T_addr_ack));
916 	ackmp->b_wptr = (uchar_t *)&taa[1];
917 
918 	taa->PRIM_type = T_ADDR_ACK;
919 	ackmp->b_datap->db_type = M_PCPROTO;
920 
921 	if (connp->conn_family == AF_INET)
922 		addrlen = sizeof (sin_t);
923 	else
924 		addrlen = sizeof (sin6_t);
925 
926 	/*
927 	 * Note: Following code assumes 32 bit alignment of basic
928 	 * data structures like sin_t and struct T_addr_ack.
929 	 */
930 	if (tcp->tcp_state >= TCPS_BOUND) {
931 		/*
932 		 * Fill in local address first
933 		 */
934 		taa->LOCADDR_offset = sizeof (*taa);
935 		taa->LOCADDR_length = addrlen;
936 		sa = (struct sockaddr *)&taa[1];
937 		(void) conn_getsockname(connp, sa, &addrlen);
938 		ackmp->b_wptr += addrlen;
939 	}
940 	if (tcp->tcp_state >= TCPS_SYN_RCVD) {
941 		/*
942 		 * Fill in Remote address
943 		 */
944 		taa->REMADDR_length = addrlen;
945 		/* assumed 32-bit alignment */
946 		taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
947 		sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
948 		(void) conn_getpeername(connp, sa, &addrlen);
949 		ackmp->b_wptr += addrlen;
950 	}
951 	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
952 	putnext(tcp->tcp_connp->conn_rq, ackmp);
953 }
954 
955 /*
956  * Swap information between the eager and acceptor for a TLI/XTI client.
957  * The sockfs accept is done on the acceptor stream and control goes
958  * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not
959  * called. In either case, both the eager and listener are in their own
960  * perimeter (squeue) and the code has to deal with potential race.
961  *
962  * See the block comment on top of tcp_accept() and tcp_tli_accept().
963  */
964 static void
tcp_accept_swap(tcp_t * listener,tcp_t * acceptor,tcp_t * eager)965 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
966 {
967 	conn_t	*econnp, *aconnp;
968 
969 	ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq);
970 	ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
971 	ASSERT(!TCP_IS_SOCKET(acceptor));
972 	ASSERT(!TCP_IS_SOCKET(eager));
973 	ASSERT(!TCP_IS_SOCKET(listener));
974 
975 	/*
976 	 * Trusted Extensions may need to use a security label that is
977 	 * different from the acceptor's label on MLP and MAC-Exempt
978 	 * sockets. If this is the case, the required security label
979 	 * already exists in econnp->conn_ixa->ixa_tsl. Since we make the
980 	 * acceptor stream refer to econnp we atomatically get that label.
981 	 */
982 
983 	acceptor->tcp_detached = B_TRUE;
984 	/*
985 	 * To permit stream re-use by TLI/XTI, the eager needs a copy of
986 	 * the acceptor id.
987 	 */
988 	eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;
989 
990 	/* remove eager from listen list... */
991 	mutex_enter(&listener->tcp_eager_lock);
992 	tcp_eager_unlink(eager);
993 	ASSERT(eager->tcp_eager_next_q == NULL &&
994 	    eager->tcp_eager_last_q == NULL);
995 	ASSERT(eager->tcp_eager_next_q0 == NULL &&
996 	    eager->tcp_eager_prev_q0 == NULL);
997 	mutex_exit(&listener->tcp_eager_lock);
998 
999 	econnp = eager->tcp_connp;
1000 	aconnp = acceptor->tcp_connp;
1001 	econnp->conn_rq = aconnp->conn_rq;
1002 	econnp->conn_wq = aconnp->conn_wq;
1003 	econnp->conn_rq->q_ptr = econnp;
1004 	econnp->conn_wq->q_ptr = econnp;
1005 
1006 	/*
1007 	 * In the TLI/XTI loopback case, we are inside the listener's squeue,
1008 	 * which might be a different squeue from our peer TCP instance.
1009 	 * For TCP Fusion, the peer expects that whenever tcp_detached is
1010 	 * clear, our TCP queues point to the acceptor's queues.  Thus, use
1011 	 * membar_producer() to ensure that the assignments of conn_rq/conn_wq
1012 	 * above reach global visibility prior to the clearing of tcp_detached.
1013 	 */
1014 	membar_producer();
1015 	eager->tcp_detached = B_FALSE;
1016 
1017 	ASSERT(eager->tcp_ack_tid == 0);
1018 
1019 	econnp->conn_dev = aconnp->conn_dev;
1020 	econnp->conn_minor_arena = aconnp->conn_minor_arena;
1021 
1022 	ASSERT(econnp->conn_minor_arena != NULL);
1023 	if (econnp->conn_cred != NULL)
1024 		crfree(econnp->conn_cred);
1025 	econnp->conn_cred = aconnp->conn_cred;
1026 	ASSERT(!(econnp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
1027 	econnp->conn_ixa->ixa_cred = econnp->conn_cred;
1028 	aconnp->conn_cred = NULL;
1029 	econnp->conn_cpid = aconnp->conn_cpid;
1030 	ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
1031 	ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);
1032 
1033 	econnp->conn_zoneid = aconnp->conn_zoneid;
1034 	econnp->conn_allzones = aconnp->conn_allzones;
1035 	econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid;
1036 
1037 	econnp->conn_mac_mode = aconnp->conn_mac_mode;
1038 	econnp->conn_zone_is_global = aconnp->conn_zone_is_global;
1039 	aconnp->conn_mac_mode = CONN_MAC_DEFAULT;
1040 
1041 	/* Do the IPC initialization */
1042 	CONN_INC_REF(econnp);
1043 
1044 	/* Done with old IPC. Drop its ref on its connp */
1045 	CONN_DEC_REF(aconnp);
1046 }
1047 
1048 /*
1049  * This runs at the tail end of accept processing on the squeue of the
1050  * new connection.
1051  */
1052 /* ARGSUSED */
1053 static void
tcp_accept_finish(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * dummy)1054 tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1055 {
1056 	conn_t			*connp = (conn_t *)arg;
1057 	tcp_t			*tcp = connp->conn_tcp;
1058 	queue_t			*q = connp->conn_rq;
1059 	tcp_stack_t		*tcps = tcp->tcp_tcps;
1060 	struct stroptions	*stropt;
1061 	struct sock_proto_props sopp;
1062 
1063 	/* Should never be called for non-STREAMS sockets */
1064 	ASSERT(!IPCL_IS_NONSTR(connp));
1065 
1066 	/* We should just receive a single mblk that fits a T_discon_ind */
1067 	ASSERT(mp->b_cont == NULL);
1068 
1069 	/*
1070 	 * Drop the eager's ref on the listener, that was placed when
1071 	 * this eager began life in tcp_input_listener.
1072 	 */
1073 	CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
1074 
1075 	tcp->tcp_detached = B_FALSE;
1076 
1077 	if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) {
1078 		/*
1079 		 * Someone blewoff the eager before we could finish
1080 		 * the accept.
1081 		 *
1082 		 * The only reason eager exists it because we put in
1083 		 * a ref on it when conn ind went up. We need to send
1084 		 * a disconnect indication up while the last reference
1085 		 * on the eager will be dropped by the squeue when we
1086 		 * return.
1087 		 */
1088 		ASSERT(tcp->tcp_listener == NULL);
1089 		if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) {
1090 			struct	T_discon_ind	*tdi;
1091 
1092 			(void) putnextctl1(q, M_FLUSH, FLUSHRW);
1093 			/*
1094 			 * Let us reuse the incoming mblk to avoid
1095 			 * memory allocation failure problems. We know
1096 			 * that the size of the incoming mblk i.e.
1097 			 * stroptions is greater than sizeof
1098 			 * T_discon_ind.
1099 			 */
1100 			ASSERT(DB_REF(mp) == 1);
1101 			ASSERT(MBLKSIZE(mp) >=
1102 			    sizeof (struct T_discon_ind));
1103 
1104 			DB_TYPE(mp) = M_PROTO;
1105 			((union T_primitives *)mp->b_rptr)->type =
1106 			    T_DISCON_IND;
1107 			tdi = (struct T_discon_ind *)mp->b_rptr;
1108 			if (tcp->tcp_issocket) {
1109 				tdi->DISCON_reason = ECONNREFUSED;
1110 				tdi->SEQ_number = 0;
1111 			} else {
1112 				tdi->DISCON_reason = ENOPROTOOPT;
1113 				tdi->SEQ_number =
1114 				    tcp->tcp_conn_req_seqnum;
1115 			}
1116 			mp->b_wptr = mp->b_rptr +
1117 			    sizeof (struct T_discon_ind);
1118 			putnext(q, mp);
1119 		}
1120 		tcp->tcp_hard_binding = B_FALSE;
1121 		return;
1122 	}
1123 
1124 	/*
1125 	 * This is the first time we run on the correct
1126 	 * queue after tcp_accept. So fix all the q parameters
1127 	 * here.
1128 	 *
1129 	 * Let us reuse the incoming mblk to avoid
1130 	 * memory allocation failure problems. We know
1131 	 * that the size of the incoming mblk is at least
1132 	 * stroptions
1133 	 */
1134 	tcp_get_proto_props(tcp, &sopp);
1135 
1136 	ASSERT(DB_REF(mp) == 1);
1137 	ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions));
1138 
1139 	DB_TYPE(mp) = M_SETOPTS;
1140 	stropt = (struct stroptions *)mp->b_rptr;
1141 	mp->b_wptr = mp->b_rptr + sizeof (struct stroptions);
1142 	stropt = (struct stroptions *)mp->b_rptr;
1143 	ASSERT(sopp.sopp_flags & (SO_HIWAT|SO_WROFF|SO_MAXBLK));
1144 	stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
1145 	stropt->so_hiwat = sopp.sopp_rxhiwat;
1146 	stropt->so_wroff = sopp.sopp_wroff;
1147 	stropt->so_maxblk = sopp.sopp_maxblk;
1148 
1149 	/* Send the options up */
1150 	putnext(q, mp);
1151 
1152 	/*
1153 	 * Pass up any data and/or a fin that has been received.
1154 	 *
1155 	 * Adjust receive window in case it had decreased
1156 	 * (because there is data <=> tcp_rcv_list != NULL)
1157 	 * while the connection was detached. Note that
1158 	 * in case the eager was flow-controlled, w/o this
1159 	 * code, the rwnd may never open up again!
1160 	 */
1161 	if (tcp->tcp_rcv_list != NULL) {
1162 		/* We drain directly in case of fused tcp loopback */
1163 
1164 		if (!tcp->tcp_fused && canputnext(q)) {
1165 			tcp->tcp_rwnd = connp->conn_rcvbuf;
1166 			if (tcp->tcp_state >= TCPS_ESTABLISHED &&
1167 			    tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
1168 				tcp_xmit_ctl(NULL,
1169 				    tcp, (tcp->tcp_swnd == 0) ?
1170 				    tcp->tcp_suna : tcp->tcp_snxt,
1171 				    tcp->tcp_rnxt, TH_ACK);
1172 			}
1173 		}
1174 
1175 		(void) tcp_rcv_drain(tcp);
1176 
1177 		/*
1178 		 * For fused tcp loopback, back-enable peer endpoint
1179 		 * if it's currently flow-controlled.
1180 		 */
1181 		if (tcp->tcp_fused) {
1182 			tcp_t *peer_tcp = tcp->tcp_loopback_peer;
1183 
1184 			ASSERT(peer_tcp != NULL);
1185 			ASSERT(peer_tcp->tcp_fused);
1186 
1187 			mutex_enter(&peer_tcp->tcp_non_sq_lock);
1188 			if (peer_tcp->tcp_flow_stopped) {
1189 				tcp_clrqfull(peer_tcp);
1190 				TCP_STAT(tcps, tcp_fusion_backenabled);
1191 			}
1192 			mutex_exit(&peer_tcp->tcp_non_sq_lock);
1193 		}
1194 	}
1195 	ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
1196 	if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
1197 		tcp->tcp_ordrel_done = B_TRUE;
1198 		mp = tcp->tcp_ordrel_mp;
1199 		tcp->tcp_ordrel_mp = NULL;
1200 		putnext(q, mp);
1201 	}
1202 	tcp->tcp_hard_binding = B_FALSE;
1203 
1204 	if (connp->conn_keepalive) {
1205 		tcp->tcp_ka_last_intrvl = 0;
1206 		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1207 		    tcp->tcp_ka_interval);
1208 	}
1209 
1210 	/*
1211 	 * At this point, eager is fully established and will
1212 	 * have the following references -
1213 	 *
1214 	 * 2 references for connection to exist (1 for TCP and 1 for IP).
1215 	 * 1 reference for the squeue which will be dropped by the squeue as
1216 	 *	soon as this function returns.
1217 	 * There will be 1 additonal reference for being in classifier
1218 	 *	hash list provided something bad hasn't happened.
1219 	 */
1220 	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
1221 	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
1222 }
1223 
1224 /*
1225  * Pull a deferred connection indication off of the listener. The caller
1226  * must verify that there is a deferred conn ind under eager_lock before
1227  * calling this function.
1228  */
1229 static mblk_t *
tcp_get_def_conn_ind(tcp_t * listener)1230 tcp_get_def_conn_ind(tcp_t *listener)
1231 {
1232 	tcp_t *tail;
1233 	tcp_t *tcp;
1234 	mblk_t *conn_ind;
1235 
1236 	ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
1237 	ASSERT(listener->tcp_eager_prev_q0->tcp_conn_def_q0);
1238 
1239 	tcp = listener->tcp_eager_prev_q0;
1240 	/*
1241 	 * listener->tcp_eager_prev_q0 points to the TAIL of the
1242 	 * deferred T_conn_ind queue. We need to get to the head
1243 	 * of the queue in order to send up T_conn_ind the same
1244 	 * order as how the 3WHS is completed.
1245 	 */
1246 	while (tcp != listener) {
1247 		if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0)
1248 			break;
1249 		else
1250 			tcp = tcp->tcp_eager_prev_q0;
1251 	}
1252 
1253 	conn_ind = tcp->tcp_conn.tcp_eager_conn_ind;
1254 	tcp->tcp_conn.tcp_eager_conn_ind = NULL;
1255 	/* Move from q0 to q */
1256 	ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1257 	listener->tcp_conn_req_cnt_q0--;
1258 	listener->tcp_conn_req_cnt_q++;
1259 	tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
1260 	    tcp->tcp_eager_prev_q0;
1261 	tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
1262 	    tcp->tcp_eager_next_q0;
1263 	tcp->tcp_eager_prev_q0 = NULL;
1264 	tcp->tcp_eager_next_q0 = NULL;
1265 	tcp->tcp_conn_def_q0 = B_FALSE;
1266 
1267 	/* Make sure the tcp isn't in the list of droppables */
1268 	ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
1269 	    tcp->tcp_eager_prev_drop_q0 == NULL);
1270 
1271 	/*
1272 	 * Insert at end of the queue because sockfs sends
1273 	 * down T_CONN_RES in chronological order. Leaving
1274 	 * the older conn indications at front of the queue
1275 	 * helps reducing search time.
1276 	 */
1277 	tail = listener->tcp_eager_last_q;
1278 	if (tail != NULL) {
1279 		tail->tcp_eager_next_q = tcp;
1280 	} else {
1281 		listener->tcp_eager_next_q = tcp;
1282 	}
1283 	listener->tcp_eager_last_q = tcp;
1284 	tcp->tcp_eager_next_q = NULL;
1285 
1286 	return (conn_ind);
1287 }
1288 
1289 
1290 /*
1291  * Reply to a clients T_CONN_RES TPI message. This function
1292  * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
1293  * on the acceptor STREAM and processed in tcp_accept_common().
1294  * Read the block comment on top of tcp_input_listener().
1295  */
1296 void
tcp_tli_accept(tcp_t * listener,mblk_t * mp)1297 tcp_tli_accept(tcp_t *listener, mblk_t *mp)
1298 {
1299 	tcp_t		*acceptor;
1300 	tcp_t		*eager;
1301 	struct T_conn_res	*tcr;
1302 	t_uscalar_t	acceptor_id;
1303 	t_scalar_t	seqnum;
1304 	mblk_t		*discon_mp = NULL;
1305 	mblk_t		*ok_mp;
1306 	mblk_t		*mp1;
1307 	tcp_stack_t	*tcps = listener->tcp_tcps;
1308 	conn_t		*econnp;
1309 
1310 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
1311 		tcp_err_ack(listener, mp, TPROTO, 0);
1312 		return;
1313 	}
1314 	tcr = (struct T_conn_res *)mp->b_rptr;
1315 
1316 	/*
1317 	 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
1318 	 * read side queue of the streams device underneath us i.e. the
1319 	 * read side queue of 'ip'. Since we can't deference QUEUE_ptr we
1320 	 * look it up in the queue_hash.  Under LP64 it sends down the
1321 	 * minor_t of the accepting endpoint.
1322 	 *
1323 	 * Once the acceptor/eager are modified (in tcp_accept_swap) the
1324 	 * fanout hash lock is held.
1325 	 * This prevents any thread from entering the acceptor queue from
1326 	 * below (since it has not been hard bound yet i.e. any inbound
1327 	 * packets will arrive on the listener conn_t and
1328 	 * go through the classifier).
1329 	 * The CONN_INC_REF will prevent the acceptor from closing.
1330 	 *
1331 	 * XXX It is still possible for a tli application to send down data
1332 	 * on the accepting stream while another thread calls t_accept.
1333 	 * This should not be a problem for well-behaved applications since
1334 	 * the T_OK_ACK is sent after the queue swapping is completed.
1335 	 *
1336 	 * If the accepting fd is the same as the listening fd, avoid
1337 	 * queue hash lookup since that will return an eager listener in a
1338 	 * already established state.
1339 	 */
1340 	acceptor_id = tcr->ACCEPTOR_id;
1341 	mutex_enter(&listener->tcp_eager_lock);
1342 	if (listener->tcp_acceptor_id == acceptor_id) {
1343 		eager = listener->tcp_eager_next_q;
1344 		/* only count how many T_CONN_INDs so don't count q0 */
1345 		if ((listener->tcp_conn_req_cnt_q != 1) ||
1346 		    (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) {
1347 			mutex_exit(&listener->tcp_eager_lock);
1348 			tcp_err_ack(listener, mp, TBADF, 0);
1349 			return;
1350 		}
1351 		if (listener->tcp_conn_req_cnt_q0 != 0) {
1352 			/* Throw away all the eagers on q0. */
1353 			tcp_eager_cleanup(listener, 1);
1354 		}
1355 		if (listener->tcp_syn_defense) {
1356 			listener->tcp_syn_defense = B_FALSE;
1357 			if (listener->tcp_ip_addr_cache != NULL) {
1358 				kmem_free(listener->tcp_ip_addr_cache,
1359 				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1360 				listener->tcp_ip_addr_cache = NULL;
1361 			}
1362 		}
1363 		/*
1364 		 * Transfer tcp_conn_req_max to the eager so that when
1365 		 * a disconnect occurs we can revert the endpoint to the
1366 		 * listen state.
1367 		 */
1368 		eager->tcp_conn_req_max = listener->tcp_conn_req_max;
1369 		ASSERT(listener->tcp_conn_req_cnt_q0 == 0);
1370 		/*
1371 		 * Get a reference on the acceptor just like the
1372 		 * tcp_acceptor_hash_lookup below.
1373 		 */
1374 		acceptor = listener;
1375 		CONN_INC_REF(acceptor->tcp_connp);
1376 	} else {
1377 		acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps);
1378 		if (acceptor == NULL) {
1379 			if (listener->tcp_connp->conn_debug) {
1380 				(void) strlog(TCP_MOD_ID, 0, 1,
1381 				    SL_ERROR|SL_TRACE,
1382 				    "tcp_accept: did not find acceptor 0x%x\n",
1383 				    acceptor_id);
1384 			}
1385 			mutex_exit(&listener->tcp_eager_lock);
1386 			tcp_err_ack(listener, mp, TPROVMISMATCH, 0);
1387 			return;
1388 		}
1389 		/*
1390 		 * Verify acceptor state. The acceptable states for an acceptor
1391 		 * include TCPS_IDLE and TCPS_BOUND.
1392 		 */
1393 		switch (acceptor->tcp_state) {
1394 		case TCPS_IDLE:
1395 			/* FALLTHRU */
1396 		case TCPS_BOUND:
1397 			break;
1398 		default:
1399 			CONN_DEC_REF(acceptor->tcp_connp);
1400 			mutex_exit(&listener->tcp_eager_lock);
1401 			tcp_err_ack(listener, mp, TOUTSTATE, 0);
1402 			return;
1403 		}
1404 	}
1405 
1406 	/* The listener must be in TCPS_LISTEN */
1407 	if (listener->tcp_state != TCPS_LISTEN) {
1408 		CONN_DEC_REF(acceptor->tcp_connp);
1409 		mutex_exit(&listener->tcp_eager_lock);
1410 		tcp_err_ack(listener, mp, TOUTSTATE, 0);
1411 		return;
1412 	}
1413 
1414 	/*
1415 	 * Rendezvous with an eager connection request packet hanging off
1416 	 * 'tcp' that has the 'seqnum' tag.  We tagged the detached open
1417 	 * tcp structure when the connection packet arrived in
1418 	 * tcp_input_listener().
1419 	 */
1420 	seqnum = tcr->SEQ_number;
1421 	eager = listener;
1422 	do {
1423 		eager = eager->tcp_eager_next_q;
1424 		if (eager == NULL) {
1425 			CONN_DEC_REF(acceptor->tcp_connp);
1426 			mutex_exit(&listener->tcp_eager_lock);
1427 			tcp_err_ack(listener, mp, TBADSEQ, 0);
1428 			return;
1429 		}
1430 	} while (eager->tcp_conn_req_seqnum != seqnum);
1431 	mutex_exit(&listener->tcp_eager_lock);
1432 
1433 	/*
1434 	 * At this point, both acceptor and listener have 2 ref
1435 	 * that they begin with. Acceptor has one additional ref
1436 	 * we placed in lookup while listener has 3 additional
1437 	 * ref for being behind the squeue (tcp_accept() is
1438 	 * done on listener's squeue); being in classifier hash;
1439 	 * and eager's ref on listener.
1440 	 */
1441 	ASSERT(listener->tcp_connp->conn_ref >= 5);
1442 	ASSERT(acceptor->tcp_connp->conn_ref >= 3);
1443 
1444 	/*
1445 	 * The eager at this point is set in its own squeue and
1446 	 * could easily have been killed (tcp_accept_finish will
1447 	 * deal with that) because of a TH_RST so we can only
1448 	 * ASSERT for a single ref.
1449 	 */
1450 	ASSERT(eager->tcp_connp->conn_ref >= 1);
1451 
1452 	/*
1453 	 * Pre allocate the discon_ind mblk also. tcp_accept_finish will
1454 	 * use it if something failed.
1455 	 */
1456 	discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
1457 	    sizeof (struct stroptions)), BPRI_HI);
1458 	if (discon_mp == NULL) {
1459 		CONN_DEC_REF(acceptor->tcp_connp);
1460 		CONN_DEC_REF(eager->tcp_connp);
1461 		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
1462 		return;
1463 	}
1464 
1465 	econnp = eager->tcp_connp;
1466 
1467 	/* Hold a copy of mp, in case reallocb fails */
1468 	if ((mp1 = copymsg(mp)) == NULL) {
1469 		CONN_DEC_REF(acceptor->tcp_connp);
1470 		CONN_DEC_REF(eager->tcp_connp);
1471 		freemsg(discon_mp);
1472 		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
1473 		return;
1474 	}
1475 
1476 	tcr = (struct T_conn_res *)mp1->b_rptr;
1477 
1478 	/*
1479 	 * This is an expanded version of mi_tpi_ok_ack_alloc()
1480 	 * which allocates a larger mblk and appends the new
1481 	 * local address to the ok_ack.  The address is copied by
1482 	 * soaccept() for getsockname().
1483 	 */
1484 	{
1485 		int extra;
1486 
1487 		extra = (econnp->conn_family == AF_INET) ?
1488 		    sizeof (sin_t) : sizeof (sin6_t);
1489 
1490 		/*
1491 		 * Try to re-use mp, if possible.  Otherwise, allocate
1492 		 * an mblk and return it as ok_mp.  In any case, mp
1493 		 * is no longer usable upon return.
1494 		 */
1495 		if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
1496 			CONN_DEC_REF(acceptor->tcp_connp);
1497 			CONN_DEC_REF(eager->tcp_connp);
1498 			freemsg(discon_mp);
1499 			/* Original mp has been freed by now, so use mp1 */
1500 			tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
1501 			return;
1502 		}
1503 
1504 		mp = NULL;	/* We should never use mp after this point */
1505 
1506 		switch (extra) {
1507 		case sizeof (sin_t): {
1508 			sin_t *sin = (sin_t *)ok_mp->b_wptr;
1509 
1510 			ok_mp->b_wptr += extra;
1511 			sin->sin_family = AF_INET;
1512 			sin->sin_port = econnp->conn_lport;
1513 			sin->sin_addr.s_addr = econnp->conn_laddr_v4;
1514 			break;
1515 		}
1516 		case sizeof (sin6_t): {
1517 			sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;
1518 
1519 			ok_mp->b_wptr += extra;
1520 			sin6->sin6_family = AF_INET6;
1521 			sin6->sin6_port = econnp->conn_lport;
1522 			sin6->sin6_addr = econnp->conn_laddr_v6;
1523 			sin6->sin6_flowinfo = econnp->conn_flowinfo;
1524 			if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
1525 			    (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
1526 				sin6->sin6_scope_id =
1527 				    econnp->conn_ixa->ixa_scopeid;
1528 			} else {
1529 				sin6->sin6_scope_id = 0;
1530 			}
1531 			sin6->__sin6_src_id = 0;
1532 			break;
1533 		}
1534 		default:
1535 			break;
1536 		}
1537 		ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim);
1538 	}
1539 
1540 	/*
1541 	 * If there are no options we know that the T_CONN_RES will
1542 	 * succeed. However, we can't send the T_OK_ACK upstream until
1543 	 * the tcp_accept_swap is done since it would be dangerous to
1544 	 * let the application start using the new fd prior to the swap.
1545 	 */
1546 	tcp_accept_swap(listener, acceptor, eager);
1547 
1548 	/*
1549 	 * tcp_accept_swap unlinks eager from listener but does not drop
1550 	 * the eager's reference on the listener.
1551 	 */
1552 	ASSERT(eager->tcp_listener == NULL);
1553 	ASSERT(listener->tcp_connp->conn_ref >= 5);
1554 
1555 	/*
1556 	 * The eager is now associated with its own queue. Insert in
1557 	 * the hash so that the connection can be reused for a future
1558 	 * T_CONN_RES.
1559 	 */
1560 	tcp_acceptor_hash_insert(acceptor_id, eager);
1561 
1562 	/*
1563 	 * We now do the processing of options with T_CONN_RES.
1564 	 * We delay till now since we wanted to have queue to pass to
1565 	 * option processing routines that points back to the right
1566 	 * instance structure which does not happen until after
1567 	 * tcp_accept_swap().
1568 	 *
1569 	 * Note:
1570 	 * The sanity of the logic here assumes that whatever options
1571 	 * are appropriate to inherit from listner=>eager are done
1572 	 * before this point, and whatever were to be overridden (or not)
1573 	 * in transfer logic from eager=>acceptor in tcp_accept_swap().
1574 	 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
1575 	 *   before its ACCEPTOR_id comes down in T_CONN_RES ]
1576 	 * This may not be true at this point in time but can be fixed
1577 	 * independently. This option processing code starts with
1578 	 * the instantiated acceptor instance and the final queue at
1579 	 * this point.
1580 	 */
1581 
1582 	if (tcr->OPT_length != 0) {
1583 		/* Options to process */
1584 		int t_error = 0;
1585 		int sys_error = 0;
1586 		int do_disconnect = 0;
1587 
1588 		if (tcp_conprim_opt_process(eager, mp1,
1589 		    &do_disconnect, &t_error, &sys_error) < 0) {
1590 			eager->tcp_accept_error = 1;
1591 			if (do_disconnect) {
1592 				/*
1593 				 * An option failed which does not allow
1594 				 * connection to be accepted.
1595 				 *
1596 				 * We allow T_CONN_RES to succeed and
1597 				 * put a T_DISCON_IND on the eager queue.
1598 				 */
1599 				ASSERT(t_error == 0 && sys_error == 0);
1600 				eager->tcp_send_discon_ind = 1;
1601 			} else {
1602 				ASSERT(t_error != 0);
1603 				freemsg(ok_mp);
1604 				/*
1605 				 * Original mp was either freed or set
1606 				 * to ok_mp above, so use mp1 instead.
1607 				 */
1608 				tcp_err_ack(listener, mp1, t_error, sys_error);
1609 				goto finish;
1610 			}
1611 		}
1612 		/*
1613 		 * Most likely success in setting options (except if
1614 		 * eager->tcp_send_discon_ind set).
1615 		 * mp1 option buffer represented by OPT_length/offset
1616 		 * potentially modified and contains results of setting
1617 		 * options at this point
1618 		 */
1619 	}
1620 
1621 	/* We no longer need mp1, since all options processing has passed */
1622 	freemsg(mp1);
1623 
1624 	putnext(listener->tcp_connp->conn_rq, ok_mp);
1625 
1626 	mutex_enter(&listener->tcp_eager_lock);
1627 	if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
1628 		mblk_t	*conn_ind;
1629 
1630 		/*
1631 		 * This path should not be executed if listener and
1632 		 * acceptor streams are the same.
1633 		 */
1634 		ASSERT(listener != acceptor);
1635 		conn_ind = tcp_get_def_conn_ind(listener);
1636 		mutex_exit(&listener->tcp_eager_lock);
1637 		putnext(listener->tcp_connp->conn_rq, conn_ind);
1638 	} else {
1639 		mutex_exit(&listener->tcp_eager_lock);
1640 	}
1641 
1642 	/*
1643 	 * Done with the acceptor - free it
1644 	 *
1645 	 * Note: from this point on, no access to listener should be made
1646 	 * as listener can be equal to acceptor.
1647 	 */
1648 finish:
1649 	ASSERT(acceptor->tcp_detached);
1650 	acceptor->tcp_connp->conn_rq = NULL;
1651 	ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp));
1652 	acceptor->tcp_connp->conn_wq = NULL;
1653 	(void) tcp_clean_death(acceptor, 0);
1654 	CONN_DEC_REF(acceptor->tcp_connp);
1655 
1656 	/*
1657 	 * We pass discon_mp to tcp_accept_finish to get on the right squeue.
1658 	 *
1659 	 * It will update the setting for sockfs/stream head and also take
1660 	 * care of any data that arrived before accept() wad called.
1661 	 * In case we already received a FIN then tcp_accept_finish will send up
1662 	 * the ordrel. It will also send up a window update if the window
1663 	 * has opened up.
1664 	 */
1665 
1666 	/*
1667 	 * XXX: we currently have a problem if XTI application closes the
1668 	 * acceptor stream in between. This problem exists in on10-gate also
1669 	 * and is well know but nothing can be done short of major rewrite
1670 	 * to fix it. Now it is possible to take care of it by assigning TLI/XTI
1671 	 * eager same squeue as listener (we can distinguish non socket
1672 	 * listeners at the time of handling a SYN in tcp_input_listener)
1673 	 * and do most of the work that tcp_accept_finish does here itself
1674 	 * and then get behind the acceptor squeue to access the acceptor
1675 	 * queue.
1676 	 */
1677 	/*
1678 	 * We already have a ref on tcp so no need to do one before squeue_enter
1679 	 */
1680 	SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp,
1681 	    tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL,
1682 	    SQTAG_TCP_ACCEPT_FINISH);
1683 }
1684 
1685 
1686 /*
1687  * This is the STREAMS entry point for T_CONN_RES coming down on
1688  * Acceptor STREAM when  sockfs listener does accept processing.
1689  * Read the block comment on top of tcp_input_listener().
1690  */
1691 int
tcp_tpi_accept(queue_t * q,mblk_t * mp)1692 tcp_tpi_accept(queue_t *q, mblk_t *mp)
1693 {
1694 	queue_t *rq = RD(q);
1695 	struct T_conn_res *conn_res;
1696 	tcp_t *eager;
1697 	tcp_t *listener;
1698 	struct T_ok_ack *ok;
1699 	t_scalar_t PRIM_type;
1700 	mblk_t *discon_mp;
1701 	conn_t *econnp;
1702 	cred_t *cr;
1703 
1704 	ASSERT(DB_TYPE(mp) == M_PROTO);
1705 
1706 	/*
1707 	 * All Solaris components should pass a db_credp
1708 	 * for this TPI message, hence we ASSERT.
1709 	 * But in case there is some other M_PROTO that looks
1710 	 * like a TPI message sent by some other kernel
1711 	 * component, we check and return an error.
1712 	 */
1713 	cr = msg_getcred(mp, NULL);
1714 	ASSERT(cr != NULL);
1715 	if (cr == NULL) {
1716 		mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL);
1717 		if (mp != NULL)
1718 			putnext(rq, mp);
1719 		return (0);
1720 	}
1721 	conn_res = (struct T_conn_res *)mp->b_rptr;
1722 	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
1723 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) {
1724 		mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
1725 		if (mp != NULL)
1726 			putnext(rq, mp);
1727 		return (0);
1728 	}
1729 	switch (conn_res->PRIM_type) {
1730 	case O_T_CONN_RES:
1731 	case T_CONN_RES:
1732 		/*
1733 		 * We pass up an err ack if allocb fails. This will
1734 		 * cause sockfs to issue a T_DISCON_REQ which will cause
1735 		 * tcp_eager_blowoff to be called. sockfs will then call
1736 		 * rq->q_qinfo->qi_qclose to cleanup the acceptor stream.
1737 		 * we need to do the allocb up here because we have to
1738 		 * make sure rq->q_qinfo->qi_qclose still points to the
1739 		 * correct function (tcp_tpi_close_accept) in case allocb
1740 		 * fails.
1741 		 */
1742 		bcopy(mp->b_rptr + conn_res->OPT_offset,
1743 		    &eager, conn_res->OPT_length);
1744 		PRIM_type = conn_res->PRIM_type;
1745 		mp->b_datap->db_type = M_PCPROTO;
1746 		mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack);
1747 		ok = (struct T_ok_ack *)mp->b_rptr;
1748 		ok->PRIM_type = T_OK_ACK;
1749 		ok->CORRECT_prim = PRIM_type;
1750 		econnp = eager->tcp_connp;
1751 		econnp->conn_dev = (dev_t)RD(q)->q_ptr;
1752 		econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr);
1753 		econnp->conn_rq = rq;
1754 		econnp->conn_wq = q;
1755 		rq->q_ptr = econnp;
1756 		rq->q_qinfo = &tcp_rinitv4;	/* No open - same as rinitv6 */
1757 		q->q_ptr = econnp;
1758 		q->q_qinfo = &tcp_winit;
1759 		listener = eager->tcp_listener;
1760 
1761 		/*
1762 		 * Pre allocate the discon_ind mblk also. tcp_accept_finish will
1763 		 * use it if something failed.
1764 		 */
1765 		discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
1766 		    sizeof (struct stroptions)), BPRI_HI);
1767 
1768 		if (discon_mp == NULL) {
1769 			mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
1770 			if (mp != NULL)
1771 				putnext(rq, mp);
1772 			return (0);
1773 		}
1774 
1775 		eager->tcp_issocket = B_TRUE;
1776 
1777 		ASSERT(econnp->conn_netstack ==
1778 		    listener->tcp_connp->conn_netstack);
1779 		ASSERT(eager->tcp_tcps == listener->tcp_tcps);
1780 
1781 		/* Put the ref for IP */
1782 		CONN_INC_REF(econnp);
1783 
1784 		/*
1785 		 * We should have minimum of 3 references on the conn
1786 		 * at this point. One each for TCP and IP and one for
1787 		 * the T_conn_ind that was sent up when the 3-way handshake
1788 		 * completed. In the normal case we would also have another
1789 		 * reference (making a total of 4) for the conn being in the
1790 		 * classifier hash list. However the eager could have received
1791 		 * an RST subsequently and tcp_closei_local could have removed
1792 		 * the eager from the classifier hash list, hence we can't
1793 		 * assert that reference.
1794 		 */
1795 		ASSERT(econnp->conn_ref >= 3);
1796 
1797 		mutex_enter(&listener->tcp_eager_lock);
1798 		if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
1799 			mblk_t *conn_ind = tcp_get_def_conn_ind(listener);
1800 
1801 			/* Need to get inside the listener perimeter */
1802 			CONN_INC_REF(listener->tcp_connp);
1803 			SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp,
1804 			    conn_ind, tcp_send_pending, listener->tcp_connp,
1805 			    NULL, SQ_FILL, SQTAG_TCP_SEND_PENDING);
1806 		}
1807 		tcp_eager_unlink(eager);
1808 		mutex_exit(&listener->tcp_eager_lock);
1809 
1810 		/*
1811 		 * At this point, the eager is detached from the listener
1812 		 * but we still have an extra refs on eager (apart from the
1813 		 * usual tcp references). The ref was placed in tcp_input_data
1814 		 * before sending the conn_ind in tcp_send_conn_ind.
1815 		 * The ref will be dropped in tcp_accept_finish().
1816 		 */
1817 		SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish,
1818 		    econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0);
1819 
1820 		/*
1821 		 * Send the new local address also up to sockfs. There
1822 		 * should already be enough space in the mp that came
1823 		 * down from soaccept().
1824 		 */
1825 		if (econnp->conn_family == AF_INET) {
1826 			sin_t *sin;
1827 
1828 			ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
1829 			    (sizeof (struct T_ok_ack) + sizeof (sin_t)));
1830 			sin = (sin_t *)mp->b_wptr;
1831 			mp->b_wptr += sizeof (sin_t);
1832 			sin->sin_family = AF_INET;
1833 			sin->sin_port = econnp->conn_lport;
1834 			sin->sin_addr.s_addr = econnp->conn_laddr_v4;
1835 		} else {
1836 			sin6_t *sin6;
1837 
1838 			ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
1839 			    sizeof (struct T_ok_ack) + sizeof (sin6_t));
1840 			sin6 = (sin6_t *)mp->b_wptr;
1841 			mp->b_wptr += sizeof (sin6_t);
1842 			sin6->sin6_family = AF_INET6;
1843 			sin6->sin6_port = econnp->conn_lport;
1844 			sin6->sin6_addr = econnp->conn_laddr_v6;
1845 			if (econnp->conn_ipversion == IPV4_VERSION)
1846 				sin6->sin6_flowinfo = 0;
1847 			else
1848 				sin6->sin6_flowinfo = econnp->conn_flowinfo;
1849 			if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
1850 			    (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
1851 				sin6->sin6_scope_id =
1852 				    econnp->conn_ixa->ixa_scopeid;
1853 			} else {
1854 				sin6->sin6_scope_id = 0;
1855 			}
1856 			sin6->__sin6_src_id = 0;
1857 		}
1858 
1859 		putnext(rq, mp);
1860 		break;
1861 	default:
1862 		mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0);
1863 		if (mp != NULL)
1864 			putnext(rq, mp);
1865 		break;
1866 	}
1867 	return (0);
1868 }
1869 
1870 /*
1871  * The function called through squeue to get behind listener's perimeter to
1872  * send a deferred conn_ind.
1873  */
1874 /* ARGSUSED */
1875 void
tcp_send_pending(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * dummy)1876 tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1877 {
1878 	conn_t	*lconnp = (conn_t *)arg;
1879 	tcp_t *listener = lconnp->conn_tcp;
1880 	struct T_conn_ind *conn_ind;
1881 	tcp_t *tcp;
1882 
1883 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1884 	bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
1885 	    conn_ind->OPT_length);
1886 
1887 	if (listener->tcp_state != TCPS_LISTEN) {
1888 		/*
1889 		 * If listener has closed, it would have caused a
1890 		 * a cleanup/blowoff to happen for the eager, so
1891 		 * we don't need to do anything more.
1892 		 */
1893 		freemsg(mp);
1894 		return;
1895 	}
1896 
1897 	putnext(lconnp->conn_rq, mp);
1898 }
1899 
1900 /*
1901  * Sends the T_CONN_IND to the listener. The caller calls this
1902  * functions via squeue to get inside the listener's perimeter
1903  * once the 3 way hand shake is done a T_CONN_IND needs to be
1904  * sent. As an optimization, the caller can call this directly
1905  * if listener's perimeter is same as eager's.
1906  */
1907 /* ARGSUSED */
1908 void
tcp_send_conn_ind(void * arg,mblk_t * mp,void * arg2)1909 tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2)
1910 {
1911 	conn_t			*lconnp = (conn_t *)arg;
1912 	tcp_t			*listener = lconnp->conn_tcp;
1913 	tcp_t			*tcp;
1914 	struct T_conn_ind	*conn_ind;
1915 	ipaddr_t		*addr_cache;
1916 	boolean_t		need_send_conn_ind = B_FALSE;
1917 	tcp_stack_t		*tcps = listener->tcp_tcps;
1918 
1919 	/* retrieve the eager */
1920 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1921 	ASSERT(conn_ind->OPT_offset != 0 &&
1922 	    conn_ind->OPT_length == sizeof (intptr_t));
1923 	bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
1924 	    conn_ind->OPT_length);
1925 
1926 	/*
1927 	 * TLI/XTI applications will get confused by
1928 	 * sending eager as an option since it violates
1929 	 * the option semantics. So remove the eager as
1930 	 * option since TLI/XTI app doesn't need it anyway.
1931 	 */
1932 	if (!TCP_IS_SOCKET(listener)) {
1933 		conn_ind->OPT_length = 0;
1934 		conn_ind->OPT_offset = 0;
1935 	}
1936 	if (listener->tcp_state != TCPS_LISTEN) {
1937 		/*
1938 		 * If listener has closed, it would have caused a
1939 		 * a cleanup/blowoff to happen for the eager. We
1940 		 * just need to return.
1941 		 */
1942 		freemsg(mp);
1943 		return;
1944 	}
1945 
1946 
1947 	/*
1948 	 * if the conn_req_q is full defer passing up the
1949 	 * T_CONN_IND until space is availabe after t_accept()
1950 	 * processing
1951 	 */
1952 	mutex_enter(&listener->tcp_eager_lock);
1953 
1954 	/*
1955 	 * Take the eager out, if it is in the list of droppable eagers
1956 	 * as we are here because the 3W handshake is over.
1957 	 */
1958 	MAKE_UNDROPPABLE(tcp);
1959 
1960 	if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) {
1961 		tcp_t *tail;
1962 
1963 		/*
1964 		 * The eager already has an extra ref put in tcp_input_data
1965 		 * so that it stays till accept comes back even though it
1966 		 * might get into TCPS_CLOSED as a result of a TH_RST etc.
1967 		 */
1968 		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1969 		listener->tcp_conn_req_cnt_q0--;
1970 		listener->tcp_conn_req_cnt_q++;
1971 
1972 		/* Move from SYN_RCVD to ESTABLISHED list  */
1973 		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
1974 		    tcp->tcp_eager_prev_q0;
1975 		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
1976 		    tcp->tcp_eager_next_q0;
1977 		tcp->tcp_eager_prev_q0 = NULL;
1978 		tcp->tcp_eager_next_q0 = NULL;
1979 
1980 		/*
1981 		 * Insert at end of the queue because sockfs
1982 		 * sends down T_CONN_RES in chronological
1983 		 * order. Leaving the older conn indications
1984 		 * at front of the queue helps reducing search
1985 		 * time.
1986 		 */
1987 		tail = listener->tcp_eager_last_q;
1988 		if (tail != NULL)
1989 			tail->tcp_eager_next_q = tcp;
1990 		else
1991 			listener->tcp_eager_next_q = tcp;
1992 		listener->tcp_eager_last_q = tcp;
1993 		tcp->tcp_eager_next_q = NULL;
1994 		/*
1995 		 * Delay sending up the T_conn_ind until we are
1996 		 * done with the eager. Once we have have sent up
1997 		 * the T_conn_ind, the accept can potentially complete
1998 		 * any time and release the refhold we have on the eager.
1999 		 */
2000 		need_send_conn_ind = B_TRUE;
2001 	} else {
2002 		/*
2003 		 * Defer connection on q0 and set deferred
2004 		 * connection bit true
2005 		 */
2006 		tcp->tcp_conn_def_q0 = B_TRUE;
2007 
2008 		/* take tcp out of q0 ... */
2009 		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
2010 		    tcp->tcp_eager_next_q0;
2011 		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
2012 		    tcp->tcp_eager_prev_q0;
2013 
2014 		/* ... and place it at the end of q0 */
2015 		tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0;
2016 		tcp->tcp_eager_next_q0 = listener;
2017 		listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp;
2018 		listener->tcp_eager_prev_q0 = tcp;
2019 		tcp->tcp_conn.tcp_eager_conn_ind = mp;
2020 	}
2021 
2022 	/* we have timed out before */
2023 	if (tcp->tcp_syn_rcvd_timeout != 0) {
2024 		tcp->tcp_syn_rcvd_timeout = 0;
2025 		listener->tcp_syn_rcvd_timeout--;
2026 		if (listener->tcp_syn_defense &&
2027 		    listener->tcp_syn_rcvd_timeout <=
2028 		    (tcps->tcps_conn_req_max_q0 >> 5) &&
2029 		    10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
2030 		    listener->tcp_last_rcv_lbolt)) {
2031 			/*
2032 			 * Turn off the defense mode if we
2033 			 * believe the SYN attack is over.
2034 			 */
2035 			listener->tcp_syn_defense = B_FALSE;
2036 			if (listener->tcp_ip_addr_cache) {
2037 				kmem_free((void *)listener->tcp_ip_addr_cache,
2038 				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
2039 				listener->tcp_ip_addr_cache = NULL;
2040 			}
2041 		}
2042 	}
2043 	addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
2044 	if (addr_cache != NULL) {
2045 		/*
2046 		 * We have finished a 3-way handshake with this
2047 		 * remote host. This proves the IP addr is good.
2048 		 * Cache it!
2049 		 */
2050 		addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
2051 		    tcp->tcp_connp->conn_faddr_v4;
2052 	}
2053 	mutex_exit(&listener->tcp_eager_lock);
2054 	if (need_send_conn_ind)
2055 		putnext(lconnp->conn_rq, mp);
2056 }
2057