xref: /titanic_41/usr/src/uts/common/fs/sockfs/sockstr.c (revision 56e2cc86321ec889bf83a888d902c60d6fb2ef8d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/inttypes.h>
29 #include <sys/t_lock.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/buf.h>
33 #include <sys/conf.h>
34 #include <sys/cred.h>
35 #include <sys/kmem.h>
36 #include <sys/sysmacros.h>
37 #include <sys/vfs.h>
38 #include <sys/vnode.h>
39 #include <sys/debug.h>
40 #include <sys/errno.h>
41 #include <sys/time.h>
42 #include <sys/file.h>
43 #include <sys/user.h>
44 #include <sys/stream.h>
45 #include <sys/strsubr.h>
46 #include <sys/esunddi.h>
47 #include <sys/flock.h>
48 #include <sys/modctl.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/cmn_err.h>
52 #include <sys/proc.h>
53 #include <sys/ddi.h>
54 
55 #include <sys/suntpi.h>
56 #include <sys/socket.h>
57 #include <sys/sockio.h>
58 #include <sys/socketvar.h>
59 #include <netinet/in.h>
60 #include <inet/common.h>
61 #include <inet/proto_set.h>
62 
63 #include <sys/tiuser.h>
64 #define	_SUN_TPI_VERSION	2
65 #include <sys/tihdr.h>
66 
67 #include <inet/kssl/ksslapi.h>
68 
69 #include <c2/audit.h>
70 
71 #include <fs/sockfs/socktpi.h>
72 #include <fs/sockfs/socktpi_impl.h>
73 
/* Default socket version; initialized to SOV_SOCKSTREAM. */
int so_default_version = SOV_SOCKSTREAM;

#ifdef DEBUG
/* Set sockdebug to print debug messages when SO_DEBUG is set */
int sockdebug = 0;

/* Set sockprinterr to print error messages when SO_DEBUG is set */
int sockprinterr = 0;

/*
 * Set so_default_options to SO_DEBUG if all sockets should be created
 * with SO_DEBUG set. This is needed to get debug printouts from the
 * socket() call itself.
 */
int so_default_options = 0;
#endif /* DEBUG */

#ifdef SOCK_TEST
/*
 * Set to number of ticks to limit cv_waits for code coverage testing.
 * Set to 1000 when SO_DEBUG is set to 2.
 */
clock_t sock_test_timelimit = 0;
#endif /* SOCK_TEST */

/*
 * For concurrency testing of e.g. opening /dev/ip which does not
 * handle T_INFO_REQ messages.
 * When non-zero, do_tinfo() and do_tcapability() skip the info request
 * and set the address size to 0 instead.
 */
int so_no_tinfo = 0;

/*
 * Timeout for getting a T_CAPABILITY_ACK - it is possible for a provider
 * to simply ignore the T_CAPABILITY_REQ.
 * The value is in seconds; do_tcapability() multiplies it by hz.
 */
clock_t	sock_capability_timeout	= 2;	/* seconds */

/* Forward declarations for file-local helpers defined later in this file. */
static int	do_tcapability(struct sonode *so, t_uscalar_t cap_bits1);
static void	so_removehooks(struct sonode *so);

/* Stream head read-side hooks installed by so_installhooks(). */
static mblk_t *strsock_proto(vnode_t *vp, mblk_t *mp,
		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
		strsigset_t *allmsgsigs, strpollset_t *pollwakeups);
static mblk_t *strsock_misc(vnode_t *vp, mblk_t *mp,
		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
		strsigset_t *allmsgsigs, strpollset_t *pollwakeups);
120 
121 /*
122  * Convert a socket to a stream. Invoked when the illusory sockmod
123  * is popped from the stream.
124  * Change the stream head back to default operation without losing
125  * any messages (T_conn_ind's are moved to the stream head queue).
126  */
127 int
128 so_sock2stream(struct sonode *so)
129 {
130 	struct vnode		*vp = SOTOV(so);
131 	queue_t			*rq;
132 	mblk_t			*mp;
133 	int			error = 0;
134 	sotpi_info_t		*sti = SOTOTPI(so);
135 
136 	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
137 
138 	mutex_enter(&so->so_lock);
139 	so_lock_single(so);
140 
141 	ASSERT(so->so_version != SOV_STREAM);
142 
143 	if (sti->sti_direct) {
144 		mblk_t **mpp;
145 		int rval;
146 
147 		/*
148 		 * Tell the transport below that sockmod is being popped
149 		 */
150 		mutex_exit(&so->so_lock);
151 		error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K, CRED(),
152 		    &rval);
153 		mutex_enter(&so->so_lock);
154 		if (error != 0) {
155 			dprintso(so, 0, ("so_sock2stream(%p): "
156 			    "_SIOCSOCKFALLBACK failed\n", (void *)so));
157 			goto exit;
158 		}
159 		sti->sti_direct = 0;
160 
161 		for (mpp = &sti->sti_conn_ind_head; (mp = *mpp) != NULL;
162 		    mpp = &mp->b_next) {
163 			struct T_conn_ind	*conn_ind;
164 
165 			/*
166 			 * strsock_proto() has already verified the length of
167 			 * this message block.
168 			 */
169 			ASSERT(MBLKL(mp) >= sizeof (struct T_conn_ind));
170 
171 			conn_ind = (struct T_conn_ind *)mp->b_rptr;
172 			if (conn_ind->OPT_length == 0 &&
173 			    conn_ind->OPT_offset == 0)
174 				continue;
175 
176 			if (DB_REF(mp) > 1) {
177 				mblk_t	*newmp;
178 				size_t	length;
179 				cred_t	*cr;
180 				pid_t	cpid;
181 				int error;	/* Dummy - error not returned */
182 
183 				/*
184 				 * Copy the message block because it is used
185 				 * elsewhere, too.
186 				 * Can't use copyb since we want to wait
187 				 * yet allow for EINTR.
188 				 */
189 				/* Round up size for reuse */
190 				length = MAX(MBLKL(mp), 64);
191 				cr = msg_getcred(mp, &cpid);
192 				if (cr != NULL) {
193 					newmp = allocb_cred_wait(length, 0,
194 					    &error, cr, cpid);
195 				} else {
196 					newmp = allocb_wait(length, 0, 0,
197 					    &error);
198 				}
199 				if (newmp == NULL) {
200 					error = EINTR;
201 					goto exit;
202 				}
203 				bcopy(mp->b_rptr, newmp->b_wptr, length);
204 				newmp->b_wptr += length;
205 				newmp->b_next = mp->b_next;
206 
207 				/*
208 				 * Link the new message block into the queue
209 				 * and free the old one.
210 				 */
211 				*mpp = newmp;
212 				mp->b_next = NULL;
213 				freemsg(mp);
214 
215 				mp = newmp;
216 				conn_ind = (struct T_conn_ind *)mp->b_rptr;
217 			}
218 
219 			/*
220 			 * Remove options added by TCP for accept fast-path.
221 			 */
222 			conn_ind->OPT_length = 0;
223 			conn_ind->OPT_offset = 0;
224 		}
225 	}
226 
227 	so->so_version = SOV_STREAM;
228 	so->so_proto_handle = NULL;
229 
230 	/*
231 	 * Remove the hooks in the stream head to avoid queuing more
232 	 * packets in sockfs.
233 	 */
234 	mutex_exit(&so->so_lock);
235 	so_removehooks(so);
236 	mutex_enter(&so->so_lock);
237 
238 	/*
239 	 * Clear any state related to urgent data. Leave any T_EXDATA_IND
240 	 * on the queue - the behavior of urgent data after a switch is
241 	 * left undefined.
242 	 */
243 	so->so_error = sti->sti_delayed_error = 0;
244 	freemsg(so->so_oobmsg);
245 	so->so_oobmsg = NULL;
246 	sti->sti_oobsigcnt = sti->sti_oobcnt = 0;
247 
248 	so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|
249 	    SS_SAVEDEOR);
250 	ASSERT(so_verify_oobstate(so));
251 
252 	freemsg(sti->sti_ack_mp);
253 	sti->sti_ack_mp = NULL;
254 
255 	/*
256 	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
257 	 */
258 	so_flush_discon_ind(so);
259 
260 	/*
261 	 * Move any queued T_CONN_IND messages to stream head queue.
262 	 */
263 	rq = RD(strvp2wq(vp));
264 	while ((mp = sti->sti_conn_ind_head) != NULL) {
265 		sti->sti_conn_ind_head = mp->b_next;
266 		mp->b_next = NULL;
267 		if (sti->sti_conn_ind_head == NULL) {
268 			ASSERT(sti->sti_conn_ind_tail == mp);
269 			sti->sti_conn_ind_tail = NULL;
270 		}
271 		dprintso(so, 0,
272 		    ("so_sock2stream(%p): moving T_CONN_IND\n", (void *)so));
273 
274 		/* Drop lock across put() */
275 		mutex_exit(&so->so_lock);
276 		put(rq, mp);
277 		mutex_enter(&so->so_lock);
278 	}
279 
280 exit:
281 	ASSERT(MUTEX_HELD(&so->so_lock));
282 	so_unlock_single(so, SOLOCKED);
283 	mutex_exit(&so->so_lock);
284 	return (error);
285 }
286 
287 /*
288  * Covert a stream back to a socket. This is invoked when the illusory
289  * sockmod is pushed on a stream (where the stream was "created" by
290  * popping the illusory sockmod).
291  * This routine can not recreate the socket state (certain aspects of
292  * it like urgent data state and the bound/connected addresses for AF_UNIX
293  * sockets can not be recreated by asking the transport for information).
294  * Thus this routine implicitly assumes that the socket is in an initial
295  * state (as if it was just created). It flushes any messages queued on the
296  * read queue to avoid dealing with e.g. TPI acks or T_exdata_ind messages.
297  */
298 void
299 so_stream2sock(struct sonode *so)
300 {
301 	struct vnode *vp = SOTOV(so);
302 	sotpi_info_t *sti = SOTOTPI(so);
303 
304 	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
305 
306 	mutex_enter(&so->so_lock);
307 	so_lock_single(so);
308 	ASSERT(so->so_version == SOV_STREAM);
309 	so->so_version = SOV_SOCKSTREAM;
310 	sti->sti_pushcnt = 0;
311 	mutex_exit(&so->so_lock);
312 
313 	/*
314 	 * Set a permenent error to force any thread in sorecvmsg to
315 	 * return (and drop SOREADLOCKED). Clear the error once
316 	 * we have SOREADLOCKED.
317 	 * This makes a read sleeping during the I_PUSH of sockmod return
318 	 * EIO.
319 	 */
320 	strsetrerror(SOTOV(so), EIO, 1, NULL);
321 
322 	/*
323 	 * Get the read lock before flushing data to avoid
324 	 * problems with the T_EXDATA_IND MSG_PEEK code in sorecvmsg.
325 	 */
326 	mutex_enter(&so->so_lock);
327 	(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
328 	mutex_exit(&so->so_lock);
329 
330 	strsetrerror(SOTOV(so), 0, 0, NULL);
331 	so_installhooks(so);
332 
333 	/*
334 	 * Flush everything on the read queue.
335 	 * This ensures that no T_CONN_IND remain and that no T_EXDATA_IND
336 	 * remain; those types of messages would confuse sockfs.
337 	 */
338 	strflushrq(vp, FLUSHALL);
339 	mutex_enter(&so->so_lock);
340 
341 	/*
342 	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
343 	 */
344 	so_flush_discon_ind(so);
345 	so_unlock_read(so);	/* Clear SOREADLOCKED */
346 
347 	so_unlock_single(so, SOLOCKED);
348 	mutex_exit(&so->so_lock);
349 }
350 
351 /*
352  * Install the hooks in the stream head.
353  */
354 void
355 so_installhooks(struct sonode *so)
356 {
357 	struct vnode *vp = SOTOV(so);
358 
359 	strsetrputhooks(vp, SH_SIGALLDATA | SH_IGN_ZEROLEN | SH_CONSOL_DATA,
360 	    strsock_proto, strsock_misc);
361 	strsetwputhooks(vp, SH_SIGPIPE | SH_RECHECK_ERR, 0);
362 }
363 
364 /*
365  * Remove the hooks in the stream head.
366  */
367 static void
368 so_removehooks(struct sonode *so)
369 {
370 	struct vnode *vp = SOTOV(so);
371 
372 	strsetrputhooks(vp, 0, NULL, NULL);
373 	strsetwputhooks(vp, 0, STRTIMOUT);
374 	/*
375 	 * Leave read behavior as it would have been for a normal
376 	 * stream i.e. a read of an M_PROTO will fail.
377 	 */
378 }
379 
380 void
381 so_basic_strinit(struct sonode *so)
382 {
383 	struct vnode *vp = SOTOV(so);
384 	struct stdata *stp;
385 	mblk_t *mp;
386 	sotpi_info_t *sti = SOTOTPI(so);
387 
388 	/* Preallocate an unbind_req message */
389 	mp = soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP, CRED());
390 	mutex_enter(&so->so_lock);
391 	sti->sti_unbind_mp = mp;
392 #ifdef DEBUG
393 	so->so_options = so_default_options;
394 #endif /* DEBUG */
395 	mutex_exit(&so->so_lock);
396 
397 	so_installhooks(so);
398 
399 	stp = vp->v_stream;
400 	/*
401 	 * Have to keep minpsz at zero in order to allow write/send of zero
402 	 * bytes.
403 	 */
404 	mutex_enter(&stp->sd_lock);
405 	if (stp->sd_qn_minpsz == 1)
406 		stp->sd_qn_minpsz = 0;
407 	mutex_exit(&stp->sd_lock);
408 }
409 
410 /*
411  * Initialize the streams side of a socket including
412  * T_info_req/ack processing. If tso is not NULL its values are used thereby
413  * avoiding the T_INFO_REQ.
414  */
415 int
416 so_strinit(struct sonode *so, struct sonode *tso)
417 {
418 	sotpi_info_t *sti = SOTOTPI(so);
419 	sotpi_info_t *tsti;
420 	int error;
421 
422 	so_basic_strinit(so);
423 
424 	/*
425 	 * The T_CAPABILITY_REQ should be the first message sent down because
426 	 * at least TCP has a fast-path for this which avoids timeouts while
427 	 * waiting for the T_CAPABILITY_ACK under high system load.
428 	 */
429 	if (tso == NULL) {
430 		error = do_tcapability(so, TC1_ACCEPTOR_ID | TC1_INFO);
431 		if (error)
432 			return (error);
433 	} else {
434 		tsti = SOTOTPI(tso);
435 
436 		mutex_enter(&so->so_lock);
437 		sti->sti_tsdu_size = tsti->sti_tsdu_size;
438 		sti->sti_etsdu_size = tsti->sti_etsdu_size;
439 		sti->sti_addr_size = tsti->sti_addr_size;
440 		sti->sti_opt_size = tsti->sti_opt_size;
441 		sti->sti_tidu_size = tsti->sti_tidu_size;
442 		sti->sti_serv_type = tsti->sti_serv_type;
443 		so->so_mode = tso->so_mode & ~SM_ACCEPTOR_ID;
444 		mutex_exit(&so->so_lock);
445 
446 		/* the following do_tcapability may update so->so_mode */
447 		if ((tsti->sti_serv_type != T_CLTS) &&
448 		    (sti->sti_direct == 0)) {
449 			error = do_tcapability(so, TC1_ACCEPTOR_ID);
450 			if (error)
451 				return (error);
452 		}
453 	}
454 	/*
455 	 * If the addr_size is 0 we treat it as already bound
456 	 * and connected. This is used by the routing socket.
457 	 * We set the addr_size to something to allocate a the address
458 	 * structures.
459 	 */
460 	if (sti->sti_addr_size == 0) {
461 		so->so_state |= SS_ISBOUND | SS_ISCONNECTED;
462 		/* Address size can vary with address families. */
463 		if (so->so_family == AF_INET6)
464 			sti->sti_addr_size =
465 			    (t_scalar_t)sizeof (struct sockaddr_in6);
466 		else
467 			sti->sti_addr_size =
468 			    (t_scalar_t)sizeof (struct sockaddr_in);
469 		ASSERT(sti->sti_unbind_mp);
470 	}
471 
472 	so_alloc_addr(so, sti->sti_addr_size);
473 
474 	return (0);
475 }
476 
477 static void
478 copy_tinfo(struct sonode *so, struct T_info_ack *tia)
479 {
480 	sotpi_info_t *sti = SOTOTPI(so);
481 
482 	sti->sti_tsdu_size = tia->TSDU_size;
483 	sti->sti_etsdu_size = tia->ETSDU_size;
484 	sti->sti_addr_size = tia->ADDR_size;
485 	sti->sti_opt_size = tia->OPT_size;
486 	sti->sti_tidu_size = tia->TIDU_size;
487 	sti->sti_serv_type = tia->SERV_type;
488 	switch (tia->CURRENT_state) {
489 	case TS_UNBND:
490 		break;
491 	case TS_IDLE:
492 		so->so_state |= SS_ISBOUND;
493 		sti->sti_laddr_len = 0;
494 		sti->sti_laddr_valid = 0;
495 		break;
496 	case TS_DATA_XFER:
497 		so->so_state |= SS_ISBOUND|SS_ISCONNECTED;
498 		sti->sti_laddr_len = 0;
499 		sti->sti_faddr_len = 0;
500 		sti->sti_laddr_valid = 0;
501 		sti->sti_faddr_valid = 0;
502 		break;
503 	}
504 
505 	/*
506 	 * Heuristics for determining the socket mode flags
507 	 * (SM_ATOMIC, SM_CONNREQUIRED, SM_ADDR, SM_FDPASSING,
508 	 * and SM_EXDATA, SM_OPTDATA, and SM_BYTESTREAM)
509 	 * from the info ack.
510 	 */
511 	if (sti->sti_serv_type == T_CLTS) {
512 		so->so_mode |= SM_ATOMIC | SM_ADDR;
513 	} else {
514 		so->so_mode |= SM_CONNREQUIRED;
515 		if (sti->sti_etsdu_size != 0 && sti->sti_etsdu_size != -2)
516 			so->so_mode |= SM_EXDATA;
517 	}
518 	if (so->so_type == SOCK_SEQPACKET || so->so_type == SOCK_RAW) {
519 		/* Semantics are to discard tail end of messages */
520 		so->so_mode |= SM_ATOMIC;
521 	}
522 	if (so->so_family == AF_UNIX) {
523 		so->so_mode |= SM_FDPASSING | SM_OPTDATA;
524 		if (sti->sti_addr_size == -1) {
525 			/* MAXPATHLEN + soun_family + nul termination */
526 			sti->sti_addr_size = (t_scalar_t)(MAXPATHLEN +
527 			    sizeof (short) + 1);
528 		}
529 		if (so->so_type == SOCK_STREAM) {
530 			/*
531 			 * Make it into a byte-stream transport.
532 			 * SOCK_SEQPACKET sockets are unchanged.
533 			 */
534 			sti->sti_tsdu_size = 0;
535 		}
536 	} else if (sti->sti_addr_size == -1) {
537 		/*
538 		 * Logic extracted from sockmod - have to pick some max address
539 		 * length in order to preallocate the addresses.
540 		 */
541 		sti->sti_addr_size = SOA_DEFSIZE;
542 	}
543 	if (sti->sti_tsdu_size == 0)
544 		so->so_mode |= SM_BYTESTREAM;
545 }
546 
547 static int
548 check_tinfo(struct sonode *so)
549 {
550 	sotpi_info_t *sti = SOTOTPI(so);
551 
552 	/* Consistency checks */
553 	if (so->so_type == SOCK_DGRAM && sti->sti_serv_type != T_CLTS) {
554 		eprintso(so, ("service type and socket type mismatch\n"));
555 		eprintsoline(so, EPROTO);
556 		return (EPROTO);
557 	}
558 	if (so->so_type == SOCK_STREAM && sti->sti_serv_type == T_CLTS) {
559 		eprintso(so, ("service type and socket type mismatch\n"));
560 		eprintsoline(so, EPROTO);
561 		return (EPROTO);
562 	}
563 	if (so->so_type == SOCK_SEQPACKET && sti->sti_serv_type == T_CLTS) {
564 		eprintso(so, ("service type and socket type mismatch\n"));
565 		eprintsoline(so, EPROTO);
566 		return (EPROTO);
567 	}
568 	if (so->so_family == AF_INET &&
569 	    sti->sti_addr_size != (t_scalar_t)sizeof (struct sockaddr_in)) {
570 		eprintso(so,
571 		    ("AF_INET must have sockaddr_in address length. Got %d\n",
572 		    sti->sti_addr_size));
573 		eprintsoline(so, EMSGSIZE);
574 		return (EMSGSIZE);
575 	}
576 	if (so->so_family == AF_INET6 &&
577 	    sti->sti_addr_size != (t_scalar_t)sizeof (struct sockaddr_in6)) {
578 		eprintso(so,
579 		    ("AF_INET6 must have sockaddr_in6 address length. Got %d\n",
580 		    sti->sti_addr_size));
581 		eprintsoline(so, EMSGSIZE);
582 		return (EMSGSIZE);
583 	}
584 
585 	dprintso(so, 1, (
586 	    "tinfo: serv %d tsdu %d, etsdu %d, addr %d, opt %d, tidu %d\n",
587 	    sti->sti_serv_type, sti->sti_tsdu_size, sti->sti_etsdu_size,
588 	    sti->sti_addr_size, sti->sti_opt_size,
589 	    sti->sti_tidu_size));
590 	dprintso(so, 1, ("tinfo: so_state %s\n",
591 	    pr_state(so->so_state, so->so_mode)));
592 	return (0);
593 }
594 
595 /*
596  * Send down T_info_req and wait for the ack.
597  * Record interesting T_info_ack values in the sonode.
598  */
599 static int
600 do_tinfo(struct sonode *so)
601 {
602 	struct T_info_req tir;
603 	mblk_t *mp;
604 	int error;
605 
606 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
607 
608 	if (so_no_tinfo) {
609 		SOTOTPI(so)->sti_addr_size = 0;
610 		return (0);
611 	}
612 
613 	dprintso(so, 1, ("do_tinfo(%p)\n", (void *)so));
614 
615 	/* Send T_INFO_REQ */
616 	tir.PRIM_type = T_INFO_REQ;
617 	mp = soallocproto1(&tir, sizeof (tir),
618 	    sizeof (struct T_info_req) + sizeof (struct T_info_ack),
619 	    _ALLOC_INTR, CRED());
620 	if (mp == NULL) {
621 		eprintsoline(so, ENOBUFS);
622 		return (ENOBUFS);
623 	}
624 	/* T_INFO_REQ has to be M_PCPROTO */
625 	DB_TYPE(mp) = M_PCPROTO;
626 
627 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
628 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
629 	if (error) {
630 		eprintsoline(so, error);
631 		return (error);
632 	}
633 	mutex_enter(&so->so_lock);
634 	/* Wait for T_INFO_ACK */
635 	if ((error = sowaitprim(so, T_INFO_REQ, T_INFO_ACK,
636 	    (t_uscalar_t)sizeof (struct T_info_ack), &mp, 0))) {
637 		mutex_exit(&so->so_lock);
638 		eprintsoline(so, error);
639 		return (error);
640 	}
641 
642 	ASSERT(mp);
643 	copy_tinfo(so, (struct T_info_ack *)mp->b_rptr);
644 	mutex_exit(&so->so_lock);
645 	freemsg(mp);
646 	return (check_tinfo(so));
647 }
648 
649 /*
650  * Send down T_capability_req and wait for the ack.
651  * Record interesting T_capability_ack values in the sonode.
652  */
653 static int
654 do_tcapability(struct sonode *so, t_uscalar_t cap_bits1)
655 {
656 	struct T_capability_req tcr;
657 	struct T_capability_ack *tca;
658 	mblk_t *mp;
659 	int error;
660 	sotpi_info_t *sti = SOTOTPI(so);
661 
662 	ASSERT(cap_bits1 != 0);
663 	ASSERT((cap_bits1 & ~(TC1_ACCEPTOR_ID | TC1_INFO)) == 0);
664 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
665 
666 	if (sti->sti_provinfo->tpi_capability == PI_NO)
667 		return (do_tinfo(so));
668 
669 	if (so_no_tinfo) {
670 		sti->sti_addr_size = 0;
671 		if ((cap_bits1 &= ~TC1_INFO) == 0)
672 			return (0);
673 	}
674 
675 	dprintso(so, 1, ("do_tcapability(%p)\n", (void *)so));
676 
677 	/* Send T_CAPABILITY_REQ */
678 	tcr.PRIM_type = T_CAPABILITY_REQ;
679 	tcr.CAP_bits1 = cap_bits1;
680 	mp = soallocproto1(&tcr, sizeof (tcr),
681 	    sizeof (struct T_capability_req) + sizeof (struct T_capability_ack),
682 	    _ALLOC_INTR, CRED());
683 	if (mp == NULL) {
684 		eprintsoline(so, ENOBUFS);
685 		return (ENOBUFS);
686 	}
687 	/* T_CAPABILITY_REQ should be M_PCPROTO here */
688 	DB_TYPE(mp) = M_PCPROTO;
689 
690 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
691 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
692 	if (error) {
693 		eprintsoline(so, error);
694 		return (error);
695 	}
696 	mutex_enter(&so->so_lock);
697 	/* Wait for T_CAPABILITY_ACK */
698 	if ((error = sowaitprim(so, T_CAPABILITY_REQ, T_CAPABILITY_ACK,
699 	    (t_uscalar_t)sizeof (*tca), &mp, sock_capability_timeout * hz))) {
700 		mutex_exit(&so->so_lock);
701 		PI_PROVLOCK(sti->sti_provinfo);
702 		if (sti->sti_provinfo->tpi_capability == PI_DONTKNOW)
703 			sti->sti_provinfo->tpi_capability = PI_NO;
704 		PI_PROVUNLOCK(sti->sti_provinfo);
705 		ASSERT((so->so_mode & SM_ACCEPTOR_ID) == 0);
706 		if (cap_bits1 & TC1_INFO) {
707 			/*
708 			 * If the T_CAPABILITY_REQ timed out and then a
709 			 * T_INFO_REQ gets a protocol error, most likely
710 			 * the capability was slow (vs. unsupported). Return
711 			 * ENOSR for this case as a best guess.
712 			 */
713 			if (error == ETIME) {
714 				return ((error = do_tinfo(so)) == EPROTO ?
715 				    ENOSR : error);
716 			}
717 			return (do_tinfo(so));
718 		}
719 		return (0);
720 	}
721 
722 	ASSERT(mp);
723 	tca = (struct T_capability_ack *)mp->b_rptr;
724 
725 	ASSERT((cap_bits1 & TC1_INFO) == (tca->CAP_bits1 & TC1_INFO));
726 	so_proc_tcapability_ack(so, tca);
727 
728 	cap_bits1 = tca->CAP_bits1;
729 
730 	mutex_exit(&so->so_lock);
731 	freemsg(mp);
732 
733 	if (cap_bits1 & TC1_INFO)
734 		return (check_tinfo(so));
735 
736 	return (0);
737 }
738 
739 /*
740  * Process a T_CAPABILITY_ACK
741  */
742 void
743 so_proc_tcapability_ack(struct sonode *so, struct T_capability_ack *tca)
744 {
745 	sotpi_info_t *sti = SOTOTPI(so);
746 
747 	if (sti->sti_provinfo->tpi_capability == PI_DONTKNOW) {
748 		PI_PROVLOCK(sti->sti_provinfo);
749 		sti->sti_provinfo->tpi_capability = PI_YES;
750 		PI_PROVUNLOCK(sti->sti_provinfo);
751 	}
752 
753 	if (tca->CAP_bits1 & TC1_ACCEPTOR_ID) {
754 		sti->sti_acceptor_id = tca->ACCEPTOR_id;
755 		so->so_mode |= SM_ACCEPTOR_ID;
756 	}
757 
758 	if (tca->CAP_bits1 & TC1_INFO)
759 		copy_tinfo(so, &tca->INFO_ack);
760 }
761 
762 /*
763  * Retrieve socket error, clear error if not peek.
764  */
765 int
766 sogeterr(struct sonode *so, boolean_t clear_err)
767 {
768 	int error;
769 
770 	ASSERT(MUTEX_HELD(&so->so_lock));
771 
772 	error = so->so_error;
773 	if (clear_err)
774 		so->so_error = 0;
775 
776 	return (error);
777 }
778 
779 /*
780  * This routine is registered with the stream head to retrieve read
781  * side errors.
782  * It does not clear the socket error for a peeking read side operation.
783  * It the error is to be cleared it sets *clearerr.
784  */
785 int
786 sogetrderr(vnode_t *vp, int ispeek, int *clearerr)
787 {
788 	struct sonode *so = VTOSO(vp);
789 	int error;
790 
791 	mutex_enter(&so->so_lock);
792 	if (ispeek) {
793 		error = so->so_error;
794 		*clearerr = 0;
795 	} else {
796 		error = so->so_error;
797 		so->so_error = 0;
798 		*clearerr = 1;
799 	}
800 	mutex_exit(&so->so_lock);
801 	return (error);
802 }
803 
804 /*
805  * This routine is registered with the stream head to retrieve write
806  * side errors.
807  * It does not clear the socket error for a peeking read side operation.
808  * It the error is to be cleared it sets *clearerr.
809  */
810 int
811 sogetwrerr(vnode_t *vp, int ispeek, int *clearerr)
812 {
813 	struct sonode *so = VTOSO(vp);
814 	int error;
815 
816 	mutex_enter(&so->so_lock);
817 	if (so->so_state & SS_CANTSENDMORE) {
818 		error = EPIPE;
819 		*clearerr = 0;
820 	} else {
821 		error = so->so_error;
822 		if (ispeek) {
823 			*clearerr = 0;
824 		} else {
825 			so->so_error = 0;
826 			*clearerr = 1;
827 		}
828 	}
829 	mutex_exit(&so->so_lock);
830 	return (error);
831 }
832 
833 /*
834  * Set a nonpersistent read and write error on the socket.
835  * Used when there is a T_uderror_ind for a connected socket.
836  * The caller also needs to call strsetrerror and strsetwerror
837  * after dropping the lock.
838  */
839 void
840 soseterror(struct sonode *so, int error)
841 {
842 	ASSERT(error != 0);
843 
844 	ASSERT(MUTEX_HELD(&so->so_lock));
845 	so->so_error = (ushort_t)error;
846 }
847 
848 void
849 soisconnecting(struct sonode *so)
850 {
851 	ASSERT(MUTEX_HELD(&so->so_lock));
852 	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
853 	so->so_state |= SS_ISCONNECTING;
854 	cv_broadcast(&so->so_state_cv);
855 }
856 
857 void
858 soisconnected(struct sonode *so)
859 {
860 	ASSERT(MUTEX_HELD(&so->so_lock));
861 	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
862 	so->so_state |= SS_ISCONNECTED;
863 	cv_broadcast(&so->so_state_cv);
864 }
865 
866 /*
867  * The caller also needs to call strsetrerror, strsetwerror and strseteof.
868  */
869 void
870 soisdisconnected(struct sonode *so, int error)
871 {
872 	ASSERT(MUTEX_HELD(&so->so_lock));
873 	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
874 	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE);
875 	so->so_error = (ushort_t)error;
876 	if (so->so_peercred != NULL) {
877 		crfree(so->so_peercred);
878 		so->so_peercred = NULL;
879 	}
880 	cv_broadcast(&so->so_state_cv);
881 }
882 
883 /*
884  * For connected AF_UNIX SOCK_DGRAM sockets when the peer closes.
885  * Does not affect write side.
886  * The caller also has to call strsetrerror.
887  */
888 static void
889 sobreakconn(struct sonode *so, int error)
890 {
891 	ASSERT(MUTEX_HELD(&so->so_lock));
892 	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
893 	so->so_error = (ushort_t)error;
894 	cv_broadcast(&so->so_state_cv);
895 }
896 
897 /*
898  * Can no longer send.
899  * Caller must also call strsetwerror.
900  *
901  * We mark the peer address as no longer valid for getpeername, but
902  * leave it around for so_unix_close to notify the peer (that
903  * transport has no addressing held at that layer).
904  */
905 void
906 socantsendmore(struct sonode *so)
907 {
908 	ASSERT(MUTEX_HELD(&so->so_lock));
909 	so->so_state |= SS_CANTSENDMORE;
910 	cv_broadcast(&so->so_state_cv);
911 }
912 
913 /*
914  * The caller must call strseteof(,1) as well as this routine
915  * to change the socket state.
916  */
917 void
918 socantrcvmore(struct sonode *so)
919 {
920 	ASSERT(MUTEX_HELD(&so->so_lock));
921 	so->so_state |= SS_CANTRCVMORE;
922 	cv_broadcast(&so->so_state_cv);
923 }
924 
925 /*
926  * The caller has sent down a "request_prim" primitive and wants to wait for
927  * an ack ("ack_prim") or an T_ERROR_ACK for it.
928  * The specified "ack_prim" can be a T_OK_ACK.
929  *
930  * Assumes that all the TPI acks are M_PCPROTO messages.
931  *
932  * Note that the socket is single-threaded (using so_lock_single)
933  * for all operations that generate TPI ack messages. Since
934  * only TPI ack messages are M_PCPROTO we should never receive
935  * anything except either the ack we are expecting or a T_ERROR_ACK
936  * for the same primitive.
937  */
938 int
939 sowaitprim(struct sonode *so, t_scalar_t request_prim, t_scalar_t ack_prim,
940 	    t_uscalar_t min_size, mblk_t **mpp, clock_t wait)
941 {
942 	mblk_t *mp;
943 	union T_primitives *tpr;
944 	int error;
945 
946 	dprintso(so, 1, ("sowaitprim(%p, %d, %d, %d, %p, %lu)\n",
947 	    (void *)so, request_prim, ack_prim, min_size, (void *)mpp, wait));
948 
949 	ASSERT(MUTEX_HELD(&so->so_lock));
950 
951 	error = sowaitack(so, &mp, wait);
952 	if (error)
953 		return (error);
954 
955 	dprintso(so, 1, ("got msg %p\n", (void *)mp));
956 	if (DB_TYPE(mp) != M_PCPROTO ||
957 	    MBLKL(mp) < sizeof (tpr->type)) {
958 		freemsg(mp);
959 		eprintsoline(so, EPROTO);
960 		return (EPROTO);
961 	}
962 	tpr = (union T_primitives *)mp->b_rptr;
963 	/*
964 	 * Did we get the primitive that we were asking for?
965 	 * For T_OK_ACK we also check that it matches the request primitive.
966 	 */
967 	if (tpr->type == ack_prim &&
968 	    (ack_prim != T_OK_ACK ||
969 	    tpr->ok_ack.CORRECT_prim == request_prim)) {
970 		if (MBLKL(mp) >= (ssize_t)min_size) {
971 			/* Found what we are looking for */
972 			*mpp = mp;
973 			return (0);
974 		}
975 		/* Too short */
976 		freemsg(mp);
977 		eprintsoline(so, EPROTO);
978 		return (EPROTO);
979 	}
980 
981 	if (tpr->type == T_ERROR_ACK &&
982 	    tpr->error_ack.ERROR_prim == request_prim) {
983 		/* Error to the primitive we were looking for */
984 		if (tpr->error_ack.TLI_error == TSYSERR) {
985 			error = tpr->error_ack.UNIX_error;
986 		} else {
987 			error = proto_tlitosyserr(tpr->error_ack.TLI_error);
988 		}
989 		dprintso(so, 0, ("error_ack for %d: %d/%d ->%d\n",
990 		    tpr->error_ack.ERROR_prim, tpr->error_ack.TLI_error,
991 		    tpr->error_ack.UNIX_error, error));
992 		freemsg(mp);
993 		return (error);
994 	}
995 	/*
996 	 * Wrong primitive or T_ERROR_ACK for the wrong primitive
997 	 */
998 #ifdef DEBUG
999 	if (tpr->type == T_ERROR_ACK) {
1000 		dprintso(so, 0, ("error_ack for %d: %d/%d\n",
1001 		    tpr->error_ack.ERROR_prim, tpr->error_ack.TLI_error,
1002 		    tpr->error_ack.UNIX_error));
1003 	} else if (tpr->type == T_OK_ACK) {
1004 		dprintso(so, 0, ("ok_ack for %d, expected %d for %d\n",
1005 		    tpr->ok_ack.CORRECT_prim, ack_prim, request_prim));
1006 	} else {
1007 		dprintso(so, 0,
1008 		    ("unexpected primitive %d, expected %d for %d\n",
1009 		    tpr->type, ack_prim, request_prim));
1010 	}
1011 #endif /* DEBUG */
1012 
1013 	freemsg(mp);
1014 	eprintsoline(so, EPROTO);
1015 	return (EPROTO);
1016 }
1017 
1018 /*
1019  * Wait for a T_OK_ACK for the specified primitive.
1020  */
1021 int
1022 sowaitokack(struct sonode *so, t_scalar_t request_prim)
1023 {
1024 	mblk_t *mp;
1025 	int error;
1026 
1027 	error = sowaitprim(so, request_prim, T_OK_ACK,
1028 	    (t_uscalar_t)sizeof (struct T_ok_ack), &mp, 0);
1029 	if (error)
1030 		return (error);
1031 	freemsg(mp);
1032 	return (0);
1033 }
1034 
1035 /*
1036  * Queue a received TPI ack message on sti_ack_mp.
1037  */
1038 void
1039 soqueueack(struct sonode *so, mblk_t *mp)
1040 {
1041 	sotpi_info_t *sti = SOTOTPI(so);
1042 
1043 	if (DB_TYPE(mp) != M_PCPROTO) {
1044 		zcmn_err(getzoneid(), CE_WARN,
1045 		    "sockfs: received unexpected M_PROTO TPI ack. Prim %d\n",
1046 		    *(t_scalar_t *)mp->b_rptr);
1047 		freemsg(mp);
1048 		return;
1049 	}
1050 
1051 	mutex_enter(&so->so_lock);
1052 	if (sti->sti_ack_mp != NULL) {
1053 		dprintso(so, 1, ("sti_ack_mp already set\n"));
1054 		freemsg(sti->sti_ack_mp);
1055 		sti->sti_ack_mp = NULL;
1056 	}
1057 	sti->sti_ack_mp = mp;
1058 	cv_broadcast(&sti->sti_ack_cv);
1059 	mutex_exit(&so->so_lock);
1060 }
1061 
1062 /*
1063  * Wait for a TPI ack ignoring signals and errors.
1064  */
1065 int
1066 sowaitack(struct sonode *so, mblk_t **mpp, clock_t wait)
1067 {
1068 	sotpi_info_t *sti = SOTOTPI(so);
1069 
1070 	ASSERT(MUTEX_HELD(&so->so_lock));
1071 
1072 	while (sti->sti_ack_mp == NULL) {
1073 #ifdef SOCK_TEST
1074 		if (wait == 0 && sock_test_timelimit != 0)
1075 			wait = sock_test_timelimit;
1076 #endif
1077 		if (wait != 0) {
1078 			/*
1079 			 * Only wait for the time limit.
1080 			 */
1081 			clock_t now;
1082 
1083 			time_to_wait(&now, wait);
1084 			if (cv_timedwait(&sti->sti_ack_cv, &so->so_lock,
1085 			    now) == -1) {
1086 				eprintsoline(so, ETIME);
1087 				return (ETIME);
1088 			}
1089 		}
1090 		else
1091 			cv_wait(&sti->sti_ack_cv, &so->so_lock);
1092 	}
1093 	*mpp = sti->sti_ack_mp;
1094 #ifdef DEBUG
1095 	{
1096 		union T_primitives *tpr;
1097 		mblk_t *mp = *mpp;
1098 
1099 		tpr = (union T_primitives *)mp->b_rptr;
1100 		ASSERT(DB_TYPE(mp) == M_PCPROTO);
1101 		ASSERT(tpr->type == T_OK_ACK ||
1102 		    tpr->type == T_ERROR_ACK ||
1103 		    tpr->type == T_BIND_ACK ||
1104 		    tpr->type == T_CAPABILITY_ACK ||
1105 		    tpr->type == T_INFO_ACK ||
1106 		    tpr->type == T_OPTMGMT_ACK);
1107 	}
1108 #endif /* DEBUG */
1109 	sti->sti_ack_mp = NULL;
1110 	return (0);
1111 }
1112 
1113 /*
1114  * Queue a received T_CONN_IND message on sti_conn_ind_head/tail.
1115  */
1116 void
1117 soqueueconnind(struct sonode *so, mblk_t *mp)
1118 {
1119 	sotpi_info_t *sti = SOTOTPI(so);
1120 
1121 	if (DB_TYPE(mp) != M_PROTO) {
1122 		zcmn_err(getzoneid(), CE_WARN,
1123 		    "sockfs: received unexpected M_PCPROTO T_CONN_IND\n");
1124 		freemsg(mp);
1125 		return;
1126 	}
1127 
1128 	mutex_enter(&so->so_lock);
1129 	ASSERT(mp->b_next == NULL);
1130 	if (sti->sti_conn_ind_head == NULL) {
1131 		sti->sti_conn_ind_head = mp;
1132 	} else {
1133 		ASSERT(sti->sti_conn_ind_tail->b_next == NULL);
1134 		sti->sti_conn_ind_tail->b_next = mp;
1135 	}
1136 	sti->sti_conn_ind_tail = mp;
1137 	/* Wakeup a single consumer of the T_CONN_IND */
1138 	cv_signal(&so->so_acceptq_cv);
1139 	mutex_exit(&so->so_lock);
1140 }
1141 
1142 /*
1143  * Wait for a T_CONN_IND.
1144  * Don't wait if nonblocking.
1145  * Accept signals and socket errors.
1146  */
1147 int
1148 sowaitconnind(struct sonode *so, int fmode, mblk_t **mpp)
1149 {
1150 	mblk_t *mp;
1151 	sotpi_info_t *sti = SOTOTPI(so);
1152 	int error = 0;
1153 
1154 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1155 	mutex_enter(&so->so_lock);
1156 check_error:
1157 	if (so->so_error) {
1158 		error = sogeterr(so, B_TRUE);
1159 		if (error) {
1160 			mutex_exit(&so->so_lock);
1161 			return (error);
1162 		}
1163 	}
1164 
1165 	if (sti->sti_conn_ind_head == NULL) {
1166 		if (fmode & (FNDELAY|FNONBLOCK)) {
1167 			error = EWOULDBLOCK;
1168 			goto done;
1169 		}
1170 
1171 		if (so->so_state & SS_CLOSING) {
1172 			error = EINTR;
1173 			goto done;
1174 		}
1175 
1176 		if (!cv_wait_sig_swap(&so->so_acceptq_cv, &so->so_lock)) {
1177 			error = EINTR;
1178 			goto done;
1179 		}
1180 		goto check_error;
1181 	}
1182 	mp = sti->sti_conn_ind_head;
1183 	sti->sti_conn_ind_head = mp->b_next;
1184 	mp->b_next = NULL;
1185 	if (sti->sti_conn_ind_head == NULL) {
1186 		ASSERT(sti->sti_conn_ind_tail == mp);
1187 		sti->sti_conn_ind_tail = NULL;
1188 	}
1189 	*mpp = mp;
1190 done:
1191 	mutex_exit(&so->so_lock);
1192 	return (error);
1193 }
1194 
1195 /*
1196  * Flush a T_CONN_IND matching the sequence number from the list.
1197  * Return zero if found; non-zero otherwise.
1198  * This is called very infrequently thus it is ok to do a linear search.
1199  */
1200 int
1201 soflushconnind(struct sonode *so, t_scalar_t seqno)
1202 {
1203 	mblk_t *prevmp, *mp;
1204 	struct T_conn_ind *tci;
1205 	sotpi_info_t *sti = SOTOTPI(so);
1206 
1207 	mutex_enter(&so->so_lock);
1208 	for (prevmp = NULL, mp = sti->sti_conn_ind_head; mp != NULL;
1209 	    prevmp = mp, mp = mp->b_next) {
1210 		tci = (struct T_conn_ind *)mp->b_rptr;
1211 		if (tci->SEQ_number == seqno) {
1212 			dprintso(so, 1,
1213 			    ("t_discon_ind: found T_CONN_IND %d\n", seqno));
1214 			/* Deleting last? */
1215 			if (sti->sti_conn_ind_tail == mp) {
1216 				sti->sti_conn_ind_tail = prevmp;
1217 			}
1218 			if (prevmp == NULL) {
1219 				/* Deleting first */
1220 				sti->sti_conn_ind_head = mp->b_next;
1221 			} else {
1222 				prevmp->b_next = mp->b_next;
1223 			}
1224 			mp->b_next = NULL;
1225 
1226 			ASSERT((sti->sti_conn_ind_head == NULL &&
1227 			    sti->sti_conn_ind_tail == NULL) ||
1228 			    (sti->sti_conn_ind_head != NULL &&
1229 			    sti->sti_conn_ind_tail != NULL));
1230 
1231 			so->so_error = ECONNABORTED;
1232 			mutex_exit(&so->so_lock);
1233 
1234 			/*
1235 			 * T_KSSL_PROXY_CONN_IND may carry a handle for
1236 			 * an SSL context, and needs to be released.
1237 			 */
1238 			if ((tci->PRIM_type == T_SSL_PROXY_CONN_IND) &&
1239 			    (mp->b_cont != NULL)) {
1240 				kssl_ctx_t kssl_ctx;
1241 
1242 				ASSERT(MBLKL(mp->b_cont) ==
1243 				    sizeof (kssl_ctx_t));
1244 				kssl_ctx = *((kssl_ctx_t *)mp->b_cont->b_rptr);
1245 				kssl_release_ctx(kssl_ctx);
1246 			}
1247 			freemsg(mp);
1248 			return (0);
1249 		}
1250 	}
1251 	mutex_exit(&so->so_lock);
1252 	dprintso(so, 1,	("t_discon_ind: NOT found T_CONN_IND %d\n", seqno));
1253 	return (-1);
1254 }
1255 
/*
 * Sleep until the socket leaves the "connecting" state or an error is
 * posted on it. The caller must hold so_lock.
 *
 * fmode carries the nonblocking flags: with FNDELAY or FNONBLOCK set
 * EINPROGRESS is returned instead of sleeping. A non-zero nosig makes the
 * sleep uninterruptible by signals; otherwise an interrupted sleep yields
 * EINTR. EINTR is also returned when the socket is closing (SS_CLOSING).
 *
 * Returns 0 once SS_ISCONNECTED is set, the pending socket error if there
 * is one, or ECONNREFUSED if the socket is neither connected nor has an
 * error left to report.
 */
int
sowaitconnected(struct sonode *so, int fmode, int nosig)
{
	int error;

	ASSERT(MUTEX_HELD(&so->so_lock));

	for (;;) {
		/* Done waiting unless we are connecting and not connected. */
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) !=
		    SS_ISCONNECTING)
			break;
		if (so->so_error != 0)
			break;

		dprintso(so, 1, ("waiting for SS_ISCONNECTED on %p\n",
		    (void *)so));
		if (fmode & (FNDELAY|FNONBLOCK))
			return (EINPROGRESS);

		if (so->so_state & SS_CLOSING)
			return (EINTR);

		if (nosig) {
			cv_wait(&so->so_state_cv, &so->so_lock);
		} else if (!cv_wait_sig_swap(&so->so_state_cv,
		    &so->so_lock)) {
			/*
			 * Interrupted by a signal. The application can
			 * use nonblocking techniques to find out when the
			 * connection has been established.
			 */
			return (EINTR);
		}
		dprintso(so, 1, ("awoken on %p\n", (void *)so));
	}

	if (so->so_error != 0) {
		error = sogeterr(so, B_TRUE);
		ASSERT(error != 0);
		dprintso(so, 1, ("sowaitconnected: error %d\n", error));
		return (error);
	}

	if (so->so_state & SS_ISCONNECTED)
		return (0);

	/*
	 * Not connected and no error to report: we could have received a
	 * T_ORDREL_IND or a T_DISCON_IND with zero errno, or another
	 * thread could have consumed so_error, e.g. by calling read.
	 */
	error = ECONNREFUSED;
	dprintso(so, 1, ("sowaitconnected: error %d\n", error));
	return (error);
}
1310 
1311 
1312 /*
1313  * Handle the signal generation aspect of urgent data.
1314  */
1315 static void
1316 so_oob_sig(struct sonode *so, int extrasig,
1317     strsigset_t *signals, strpollset_t *pollwakeups)
1318 {
1319 	sotpi_info_t *sti = SOTOTPI(so);
1320 
1321 	ASSERT(MUTEX_HELD(&so->so_lock));
1322 
1323 	ASSERT(so_verify_oobstate(so));
1324 	ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
1325 	if (sti->sti_oobsigcnt > sti->sti_oobcnt) {
1326 		/*
1327 		 * Signal has already been generated once for this
1328 		 * urgent "event". However, since TCP can receive updated
1329 		 * urgent pointers we still generate a signal.
1330 		 */
1331 		ASSERT(so->so_state & SS_OOBPEND);
1332 		if (extrasig) {
1333 			*signals |= S_RDBAND;
1334 			*pollwakeups |= POLLRDBAND;
1335 		}
1336 		return;
1337 	}
1338 
1339 	sti->sti_oobsigcnt++;
1340 	ASSERT(sti->sti_oobsigcnt > 0);	/* Wraparound */
1341 	ASSERT(sti->sti_oobsigcnt > sti->sti_oobcnt);
1342 
1343 	/*
1344 	 * Record (for select/poll) that urgent data is pending.
1345 	 */
1346 	so->so_state |= SS_OOBPEND;
1347 	/*
1348 	 * New urgent data on the way so forget about any old
1349 	 * urgent data.
1350 	 */
1351 	so->so_state &= ~(SS_HAVEOOBDATA|SS_HADOOBDATA);
1352 	if (so->so_oobmsg != NULL) {
1353 		dprintso(so, 1, ("sock: discarding old oob\n"));
1354 		freemsg(so->so_oobmsg);
1355 		so->so_oobmsg = NULL;
1356 	}
1357 	*signals |= S_RDBAND;
1358 	*pollwakeups |= POLLRDBAND;
1359 	ASSERT(so_verify_oobstate(so));
1360 }
1361 
1362 /*
1363  * Handle the processing of the T_EXDATA_IND with urgent data.
1364  * Returns the T_EXDATA_IND if it should be queued on the read queue.
1365  */
1366 /* ARGSUSED2 */
1367 static mblk_t *
1368 so_oob_exdata(struct sonode *so, mblk_t *mp,
1369 	strsigset_t *signals, strpollset_t *pollwakeups)
1370 {
1371 	sotpi_info_t *sti = SOTOTPI(so);
1372 
1373 	ASSERT(MUTEX_HELD(&so->so_lock));
1374 
1375 	ASSERT(so_verify_oobstate(so));
1376 
1377 	ASSERT(sti->sti_oobsigcnt > sti->sti_oobcnt);
1378 
1379 	sti->sti_oobcnt++;
1380 	ASSERT(sti->sti_oobcnt > 0);	/* wraparound? */
1381 	ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
1382 
1383 	/*
1384 	 * Set MSGMARK for SIOCATMARK.
1385 	 */
1386 	mp->b_flag |= MSGMARK;
1387 
1388 	ASSERT(so_verify_oobstate(so));
1389 	return (mp);
1390 }
1391 
1392 /*
1393  * Handle the processing of the actual urgent data.
1394  * Returns the data mblk if it should be queued on the read queue.
1395  */
1396 static mblk_t *
1397 so_oob_data(struct sonode *so, mblk_t *mp,
1398 	strsigset_t *signals, strpollset_t *pollwakeups)
1399 {
1400 	sotpi_info_t *sti = SOTOTPI(so);
1401 
1402 	ASSERT(MUTEX_HELD(&so->so_lock));
1403 
1404 	ASSERT(so_verify_oobstate(so));
1405 
1406 	ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
1407 	ASSERT(mp != NULL);
1408 	/*
1409 	 * For OOBINLINE we keep the data in the T_EXDATA_IND.
1410 	 * Otherwise we store it in so_oobmsg.
1411 	 */
1412 	ASSERT(so->so_oobmsg == NULL);
1413 	if (so->so_options & SO_OOBINLINE) {
1414 		*pollwakeups |= POLLIN | POLLRDNORM | POLLRDBAND;
1415 		*signals |= S_INPUT | S_RDNORM;
1416 	} else {
1417 		*pollwakeups |= POLLRDBAND;
1418 		so->so_state |= SS_HAVEOOBDATA;
1419 		so->so_oobmsg = mp;
1420 		mp = NULL;
1421 	}
1422 	ASSERT(so_verify_oobstate(so));
1423 	return (mp);
1424 }
1425 
1426 /*
1427  * Caller must hold the mutex.
1428  * For delayed processing, save the T_DISCON_IND received
1429  * from below on sti_discon_ind_mp.
1430  * When the message is processed the framework will call:
1431  *      (*func)(so, mp);
1432  */
1433 static void
1434 so_save_discon_ind(struct sonode *so,
1435 	mblk_t *mp,
1436 	void (*func)(struct sonode *so, mblk_t *))
1437 {
1438 	sotpi_info_t *sti = SOTOTPI(so);
1439 
1440 	ASSERT(MUTEX_HELD(&so->so_lock));
1441 
1442 	/*
1443 	 * Discard new T_DISCON_IND if we have already received another.
1444 	 * Currently the earlier message can either be on sti_discon_ind_mp
1445 	 * or being processed.
1446 	 */
1447 	if (sti->sti_discon_ind_mp != NULL || (so->so_flag & SOASYNC_UNBIND)) {
1448 		zcmn_err(getzoneid(), CE_WARN,
1449 		    "sockfs: received unexpected additional T_DISCON_IND\n");
1450 		freemsg(mp);
1451 		return;
1452 	}
1453 	mp->b_prev = (mblk_t *)func;
1454 	mp->b_next = NULL;
1455 	sti->sti_discon_ind_mp = mp;
1456 }
1457 
1458 /*
1459  * Caller must hold the mutex and make sure that either SOLOCKED
1460  * or SOASYNC_UNBIND is set. Called from so_unlock_single().
1461  * Perform delayed processing of T_DISCON_IND message on sti_discon_ind_mp.
1462  * Need to ensure that strsock_proto() will not end up sleeping for
1463  * SOASYNC_UNBIND, while executing this function.
1464  */
1465 void
1466 so_drain_discon_ind(struct sonode *so)
1467 {
1468 	mblk_t	*bp;
1469 	void (*func)(struct sonode *so, mblk_t *);
1470 	sotpi_info_t *sti = SOTOTPI(so);
1471 
1472 	ASSERT(MUTEX_HELD(&so->so_lock));
1473 	ASSERT(so->so_flag & (SOLOCKED|SOASYNC_UNBIND));
1474 
1475 	/* Process T_DISCON_IND on sti_discon_ind_mp */
1476 	if ((bp = sti->sti_discon_ind_mp) != NULL) {
1477 		sti->sti_discon_ind_mp = NULL;
1478 		func = (void (*)())bp->b_prev;
1479 		bp->b_prev = NULL;
1480 
1481 		/*
1482 		 * This (*func) is supposed to generate a message downstream
1483 		 * and we need to have a flag set until the corresponding
1484 		 * upstream message reaches stream head.
1485 		 * When processing T_DISCON_IND in strsock_discon_ind
1486 		 * we hold SOASYN_UNBIND when sending T_UNBIND_REQ down and
1487 		 * drop the flag after we get the ACK in strsock_proto.
1488 		 */
1489 		(void) (*func)(so, bp);
1490 	}
1491 }
1492 
1493 /*
1494  * Caller must hold the mutex.
1495  * Remove the T_DISCON_IND on sti_discon_ind_mp.
1496  */
1497 void
1498 so_flush_discon_ind(struct sonode *so)
1499 {
1500 	mblk_t	*bp;
1501 	sotpi_info_t *sti = SOTOTPI(so);
1502 
1503 	ASSERT(MUTEX_HELD(&so->so_lock));
1504 
1505 	/*
1506 	 * Remove T_DISCON_IND mblk at sti_discon_ind_mp.
1507 	 */
1508 	if ((bp = sti->sti_discon_ind_mp) != NULL) {
1509 		sti->sti_discon_ind_mp = NULL;
1510 		bp->b_prev = NULL;
1511 		freemsg(bp);
1512 	}
1513 }
1514 
/*
 * Process a T_DISCON_IND message.
 *
 * Caller must hold the mutex (so_lock). This function drops so_lock and
 * grabs it again before returning. discon_mp is consumed (freed) here.
 *
 * This function is used to process the T_DISCON_IND message. It does
 * immediate processing when called from strsock_proto and delayed
 * processing of discon_ind saved on sti_discon_ind_mp when called from
 * so_drain_discon_ind. When a T_DISCON_IND message is saved in
 * sti_discon_ind_mp for delayed processing, this function is registered
 * as the callback function to process the message.
 *
 * SOASYNC_UNBIND should be held in this function, during the non-blocking
 * unbind operation, and should be released only after we receive the ACK
 * in strsock_proto, for the T_UNBIND_REQ sent here. Since SOLOCKED is not set,
 * no TPI messages would be sent down at this time. This is to prevent M_FLUSH
 * sent from either this function or tcp_unbind(), flushing away any TPI
 * message that is being sent down and stays in a lower module's queue.
 *
 * In outline:
 *	1. Mark the socket disconnected, using DISCON_reason as the error,
 *	   and invalidate the cached local and foreign addresses.
 *	2. If the socket is bound and the preallocated sti_unbind_mp is
 *	   still available, flush the stream and send a T_UNBIND_REQ down
 *	   without blocking.
 *	3. Update the stream head error/EOF state and wake up writers
 *	   (pollwakeup, WSLEEP waiters and S_OUTPUT signal).
 */
static void
strsock_discon_ind(struct sonode *so, mblk_t *discon_mp)
{
	struct vnode *vp;
	struct stdata *stp;
	union T_primitives *tpr;
	struct T_unbind_req *ubr;
	mblk_t *mp;
	int error;
	sotpi_info_t *sti = SOTOTPI(so);

	ASSERT(MUTEX_HELD(&so->so_lock));
	ASSERT(discon_mp);
	ASSERT(discon_mp->b_rptr);

	tpr = (union T_primitives *)discon_mp->b_rptr;
	ASSERT(tpr->type == T_DISCON_IND);

	vp = SOTOV(so);
	stp = vp->v_stream;
	ASSERT(stp);

	/*
	 * Not a listener
	 */
	ASSERT((so->so_state & SS_ACCEPTCONN) == 0);

	/*
	 * This assumes that the name space for DISCON_reason
	 * is the errno name space.
	 */
	soisdisconnected(so, tpr->discon_ind.DISCON_reason);
	sti->sti_laddr_valid = 0;
	sti->sti_faddr_valid = 0;

	/*
	 * Unbind with the transport without blocking.
	 * If we've already received a T_DISCON_IND do not unbind.
	 *
	 * If there is no preallocated unbind message, we have already
	 * unbound with the transport
	 *
	 * If the socket is not bound, no need to unbind.
	 *
	 * Every branch below drops so_lock.
	 */
	mp = sti->sti_unbind_mp;
	if (mp == NULL) {
		ASSERT(!(so->so_state & SS_ISBOUND));
		mutex_exit(&so->so_lock);
	} else if (!(so->so_state & SS_ISBOUND))  {
		mutex_exit(&so->so_lock);
	} else {
		/* Take ownership of the preallocated unbind message. */
		sti->sti_unbind_mp = NULL;

		/*
		 * Is another T_DISCON_IND being processed.
		 */
		ASSERT((so->so_flag & SOASYNC_UNBIND) == 0);

		/*
		 * Make strsock_proto ignore T_OK_ACK and T_ERROR_ACK for
		 * this unbind. Set SOASYNC_UNBIND. This should be cleared
		 * only after we receive the ACK in strsock_proto.
		 */
		so->so_flag |= SOASYNC_UNBIND;
		ASSERT(!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)));
		so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
		sti->sti_laddr_valid = 0;
		mutex_exit(&so->so_lock);

		/*
		 * Send down T_UNBIND_REQ ignoring flow control.
		 * XXX Assumes that MSG_IGNFLOW implies that this thread
		 * does not run service procedures.
		 *
		 * The preallocated message is turned into the request by
		 * extending b_wptr over a T_unbind_req and filling in the
		 * primitive type.
		 */
		ASSERT(DB_TYPE(mp) == M_PROTO);
		ubr = (struct T_unbind_req *)mp->b_rptr;
		mp->b_wptr += sizeof (*ubr);
		ubr->PRIM_type = T_UNBIND_REQ;

		/*
		 * Flush the read and write side (except stream head read queue)
		 * and send down T_UNBIND_REQ.
		 */
		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
		/* A failure to send is only reported, not acted upon. */
		/* LINTED - warning: statement has no consequent: if */
		if (error) {
			eprintsoline(so, error);
		}
	}

	/*
	 * so_lock is not held from here until the end of the function.
	 * The read side error callback is installed only when a
	 * disconnect reason was supplied; the write side error callback
	 * and EOF are always set.
	 */
	if (tpr->discon_ind.DISCON_reason != 0)
		strsetrerror(SOTOV(so), 0, 0, sogetrderr);
	strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
	strseteof(SOTOV(so), 1);
	/*
	 * strseteof takes care of read side wakeups,
	 * pollwakeups, and signals.
	 */
	dprintso(so, 1, ("T_DISCON_IND: error %d\n", so->so_error));
	/* Last use of tpr was above; the indication can go now. */
	freemsg(discon_mp);


	/* The write side wakeups are done here, under sd_lock. */
	pollwakeup(&stp->sd_pollist, POLLOUT);
	mutex_enter(&stp->sd_lock);

	/*
	 * Wake sleeping write
	 */
	if (stp->sd_flag & WSLEEP) {
		stp->sd_flag &= ~WSLEEP;
		cv_broadcast(&stp->sd_wrq->q_wait);
	}

	/*
	 * strsendsig can handle multiple signals with a
	 * single call.  Send SIGPOLL for S_OUTPUT event.
	 */
	if (stp->sd_sigflags & S_OUTPUT)
		strsendsig(stp->sd_siglist, S_OUTPUT, 0, 0);

	mutex_exit(&stp->sd_lock);
	/* Return with so_lock held again, as on entry. */
	mutex_enter(&so->so_lock);
}
1659 
1660 /*
1661  * This routine is registered with the stream head to receive M_PROTO
1662  * and M_PCPROTO messages.
1663  *
1664  * Returns NULL if the message was consumed.
1665  * Returns an mblk to make that mblk be processed (and queued) by the stream
1666  * head.
1667  *
1668  * Sets the return parameters (*wakeups, *firstmsgsigs, *allmsgsigs, and
1669  * *pollwakeups) for the stream head to take action on. Note that since
1670  * sockets always deliver SIGIO for every new piece of data this routine
1671  * never sets *firstmsgsigs; any signals are returned in *allmsgsigs.
1672  *
1673  * This routine handles all data related TPI messages independent of
1674  * the type of the socket i.e. it doesn't care if T_UNITDATA_IND message
1675  * arrive on a SOCK_STREAM.
1676  */
1677 static mblk_t *
1678 strsock_proto(vnode_t *vp, mblk_t *mp,
1679 		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
1680 		strsigset_t *allmsgsigs, strpollset_t *pollwakeups)
1681 {
1682 	union T_primitives *tpr;
1683 	struct sonode *so;
1684 	sotpi_info_t *sti;
1685 
1686 	so = VTOSO(vp);
1687 	sti = SOTOTPI(so);
1688 
1689 	dprintso(so, 1, ("strsock_proto(%p, %p)\n", (void *)vp, (void *)mp));
1690 
1691 	/* Set default return values */
1692 	*firstmsgsigs = *wakeups = *allmsgsigs = *pollwakeups = 0;
1693 
1694 	ASSERT(DB_TYPE(mp) == M_PROTO ||
1695 	    DB_TYPE(mp) == M_PCPROTO);
1696 
1697 	if (MBLKL(mp) < sizeof (tpr->type)) {
1698 		/* The message is too short to even contain the primitive */
1699 		zcmn_err(getzoneid(), CE_WARN,
1700 		    "sockfs: Too short TPI message received. Len = %ld\n",
1701 		    (ptrdiff_t)(MBLKL(mp)));
1702 		freemsg(mp);
1703 		return (NULL);
1704 	}
1705 	if (!__TPI_PRIM_ISALIGNED(mp->b_rptr)) {
1706 		/* The read pointer is not aligned correctly for TPI */
1707 		zcmn_err(getzoneid(), CE_WARN,
1708 		    "sockfs: Unaligned TPI message received. rptr = %p\n",
1709 		    (void *)mp->b_rptr);
1710 		freemsg(mp);
1711 		return (NULL);
1712 	}
1713 	tpr = (union T_primitives *)mp->b_rptr;
1714 	dprintso(so, 1, ("strsock_proto: primitive %d\n", tpr->type));
1715 
1716 	switch (tpr->type) {
1717 
1718 	case T_DATA_IND:
1719 		if (MBLKL(mp) < sizeof (struct T_data_ind)) {
1720 			zcmn_err(getzoneid(), CE_WARN,
1721 			    "sockfs: Too short T_DATA_IND. Len = %ld\n",
1722 			    (ptrdiff_t)(MBLKL(mp)));
1723 			freemsg(mp);
1724 			return (NULL);
1725 		}
1726 		/*
1727 		 * Ignore zero-length T_DATA_IND messages. These might be
1728 		 * generated by some transports.
1729 		 * This is needed to prevent read (which skips the M_PROTO
1730 		 * part) to unexpectedly return 0 (or return EWOULDBLOCK
1731 		 * on a non-blocking socket after select/poll has indicated
1732 		 * that data is available).
1733 		 */
1734 		if (msgdsize(mp->b_cont) == 0) {
1735 			dprintso(so, 0,
1736 			    ("strsock_proto: zero length T_DATA_IND\n"));
1737 			freemsg(mp);
1738 			return (NULL);
1739 		}
1740 		*allmsgsigs = S_INPUT | S_RDNORM;
1741 		*pollwakeups = POLLIN | POLLRDNORM;
1742 		*wakeups = RSLEEP;
1743 		return (mp);
1744 
1745 	case T_UNITDATA_IND: {
1746 		struct T_unitdata_ind	*tudi = &tpr->unitdata_ind;
1747 		void			*addr;
1748 		t_uscalar_t		addrlen;
1749 
1750 		if (MBLKL(mp) < sizeof (struct T_unitdata_ind)) {
1751 			zcmn_err(getzoneid(), CE_WARN,
1752 			    "sockfs: Too short T_UNITDATA_IND. Len = %ld\n",
1753 			    (ptrdiff_t)(MBLKL(mp)));
1754 			freemsg(mp);
1755 			return (NULL);
1756 		}
1757 
		/* Is this not a connected datagram socket? */
1759 		if ((so->so_mode & SM_CONNREQUIRED) ||
1760 		    !(so->so_state & SS_ISCONNECTED)) {
1761 			/*
1762 			 * Not a connected datagram socket. Look for
1763 			 * the SO_UNIX_CLOSE option. If such an option is found
1764 			 * discard the message (since it has no meaning
1765 			 * unless connected).
1766 			 */
1767 			if (so->so_family == AF_UNIX && msgdsize(mp) == 0 &&
1768 			    tudi->OPT_length != 0) {
1769 				void *opt;
1770 				t_uscalar_t optlen = tudi->OPT_length;
1771 
1772 				opt = sogetoff(mp, tudi->OPT_offset,
1773 				    optlen, __TPI_ALIGN_SIZE);
1774 				if (opt == NULL) {
1775 					/* The len/off falls outside mp */
1776 					freemsg(mp);
1777 					mutex_enter(&so->so_lock);
1778 					soseterror(so, EPROTO);
1779 					mutex_exit(&so->so_lock);
1780 					zcmn_err(getzoneid(), CE_WARN,
1781 					    "sockfs: T_unidata_ind with "
1782 					    "invalid optlen/offset %u/%d\n",
1783 					    optlen, tudi->OPT_offset);
1784 					return (NULL);
1785 				}
1786 				if (so_getopt_unix_close(opt, optlen)) {
1787 					freemsg(mp);
1788 					return (NULL);
1789 				}
1790 			}
1791 			*allmsgsigs = S_INPUT | S_RDNORM;
1792 			*pollwakeups = POLLIN | POLLRDNORM;
1793 			*wakeups = RSLEEP;
1794 			if (audit_active)
1795 				audit_sock(T_UNITDATA_IND, strvp2wq(vp),
1796 				    mp, 0);
1797 			return (mp);
1798 		}
1799 
1800 		/*
1801 		 * A connect datagram socket. For AF_INET{,6} we verify that
1802 		 * the source address matches the "connected to" address.
1803 		 * The semantics of AF_UNIX sockets is to not verify
1804 		 * the source address.
1805 		 * Note that this source address verification is transport
		 * specific. Thus the real fix would be to extend TPI
		 * to allow T_CONN_REQ messages to be sent to connectionless
1808 		 * transport providers and always let the transport provider
1809 		 * do whatever filtering is needed.
1810 		 *
1811 		 * The verification/filtering semantics for transports
1812 		 * other than AF_INET and AF_UNIX are unknown. The choice
1813 		 * would be to either filter using bcmp or let all messages
1814 		 * get through. This code does not filter other address
1815 		 * families since this at least allows the application to
1816 		 * work around any missing filtering.
1817 		 *
1818 		 * XXX Should we move filtering to UDP/ICMP???
1819 		 * That would require passing e.g. a T_DISCON_REQ to UDP
1820 		 * when the socket becomes unconnected.
1821 		 */
1822 		addrlen = tudi->SRC_length;
1823 		/*
		 * The alignment restriction is really too strict but
1825 		 * we want enough alignment to inspect the fields of
1826 		 * a sockaddr_in.
1827 		 */
1828 		addr = sogetoff(mp, tudi->SRC_offset, addrlen,
1829 		    __TPI_ALIGN_SIZE);
1830 		if (addr == NULL) {
1831 			freemsg(mp);
1832 			mutex_enter(&so->so_lock);
1833 			soseterror(so, EPROTO);
1834 			mutex_exit(&so->so_lock);
1835 			zcmn_err(getzoneid(), CE_WARN,
1836 			    "sockfs: T_unidata_ind with invalid "
1837 			    "addrlen/offset %u/%d\n",
1838 			    addrlen, tudi->SRC_offset);
1839 			return (NULL);
1840 		}
1841 
1842 		if (so->so_family == AF_INET) {
1843 			/*
1844 			 * For AF_INET we allow wildcarding both sin_addr
1845 			 * and sin_port.
1846 			 */
1847 			struct sockaddr_in *faddr, *sin;
1848 
1849 			/* Prevent sti_faddr_sa from changing while accessed */
1850 			mutex_enter(&so->so_lock);
1851 			ASSERT(sti->sti_faddr_len ==
1852 			    (socklen_t)sizeof (struct sockaddr_in));
1853 			faddr = (struct sockaddr_in *)sti->sti_faddr_sa;
1854 			sin = (struct sockaddr_in *)addr;
1855 			if (addrlen !=
1856 			    (t_uscalar_t)sizeof (struct sockaddr_in) ||
1857 			    (sin->sin_addr.s_addr != faddr->sin_addr.s_addr &&
1858 			    faddr->sin_addr.s_addr != INADDR_ANY) ||
1859 			    (so->so_type != SOCK_RAW &&
1860 			    sin->sin_port != faddr->sin_port &&
1861 			    faddr->sin_port != 0)) {
1862 #ifdef DEBUG
1863 				dprintso(so, 0,
1864 				    ("sockfs: T_UNITDATA_IND mismatch: %s",
1865 				    pr_addr(so->so_family,
1866 				    (struct sockaddr *)addr, addrlen)));
1867 				dprintso(so, 0, (" - %s\n",
1868 				    pr_addr(so->so_family, sti->sti_faddr_sa,
1869 				    (t_uscalar_t)sti->sti_faddr_len)));
1870 #endif /* DEBUG */
1871 				mutex_exit(&so->so_lock);
1872 				freemsg(mp);
1873 				return (NULL);
1874 			}
1875 			mutex_exit(&so->so_lock);
1876 		} else if (so->so_family == AF_INET6) {
1877 			/*
1878 			 * For AF_INET6 we allow wildcarding both sin6_addr
1879 			 * and sin6_port.
1880 			 */
1881 			struct sockaddr_in6 *faddr6, *sin6;
1882 			static struct in6_addr zeroes; /* inits to all zeros */
1883 
1884 			/* Prevent sti_faddr_sa from changing while accessed */
1885 			mutex_enter(&so->so_lock);
1886 			ASSERT(sti->sti_faddr_len ==
1887 			    (socklen_t)sizeof (struct sockaddr_in6));
1888 			faddr6 = (struct sockaddr_in6 *)sti->sti_faddr_sa;
1889 			sin6 = (struct sockaddr_in6 *)addr;
1890 			/* XXX could we get a mapped address ::ffff:0.0.0.0 ? */
1891 			if (addrlen !=
1892 			    (t_uscalar_t)sizeof (struct sockaddr_in6) ||
1893 			    (!IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
1894 			    &faddr6->sin6_addr) &&
1895 			    !IN6_ARE_ADDR_EQUAL(&faddr6->sin6_addr, &zeroes)) ||
1896 			    (so->so_type != SOCK_RAW &&
1897 			    sin6->sin6_port != faddr6->sin6_port &&
1898 			    faddr6->sin6_port != 0)) {
1899 #ifdef DEBUG
1900 				dprintso(so, 0,
1901 				    ("sockfs: T_UNITDATA_IND mismatch: %s",
1902 				    pr_addr(so->so_family,
1903 				    (struct sockaddr *)addr, addrlen)));
1904 				dprintso(so, 0, (" - %s\n",
1905 				    pr_addr(so->so_family, sti->sti_faddr_sa,
1906 				    (t_uscalar_t)sti->sti_faddr_len)));
1907 #endif /* DEBUG */
1908 				mutex_exit(&so->so_lock);
1909 				freemsg(mp);
1910 				return (NULL);
1911 			}
1912 			mutex_exit(&so->so_lock);
1913 		} else if (so->so_family == AF_UNIX &&
1914 		    msgdsize(mp->b_cont) == 0 &&
1915 		    tudi->OPT_length != 0) {
1916 			/*
1917 			 * Attempt to extract AF_UNIX
1918 			 * SO_UNIX_CLOSE indication from options.
1919 			 */
1920 			void *opt;
1921 			t_uscalar_t optlen = tudi->OPT_length;
1922 
1923 			opt = sogetoff(mp, tudi->OPT_offset,
1924 			    optlen, __TPI_ALIGN_SIZE);
1925 			if (opt == NULL) {
1926 				/* The len/off falls outside mp */
1927 				freemsg(mp);
1928 				mutex_enter(&so->so_lock);
1929 				soseterror(so, EPROTO);
1930 				mutex_exit(&so->so_lock);
1931 				zcmn_err(getzoneid(), CE_WARN,
1932 				    "sockfs: T_unidata_ind with invalid "
1933 				    "optlen/offset %u/%d\n",
1934 				    optlen, tudi->OPT_offset);
1935 				return (NULL);
1936 			}
1937 			/*
1938 			 * If we received a unix close indication mark the
1939 			 * socket and discard this message.
1940 			 */
1941 			if (so_getopt_unix_close(opt, optlen)) {
1942 				mutex_enter(&so->so_lock);
1943 				sobreakconn(so, ECONNRESET);
1944 				mutex_exit(&so->so_lock);
1945 				strsetrerror(SOTOV(so), 0, 0, sogetrderr);
1946 				freemsg(mp);
1947 				*pollwakeups = POLLIN | POLLRDNORM;
1948 				*allmsgsigs = S_INPUT | S_RDNORM;
1949 				*wakeups = RSLEEP;
1950 				return (NULL);
1951 			}
1952 		}
1953 		*allmsgsigs = S_INPUT | S_RDNORM;
1954 		*pollwakeups = POLLIN | POLLRDNORM;
1955 		*wakeups = RSLEEP;
1956 		return (mp);
1957 	}
1958 
1959 	case T_OPTDATA_IND: {
1960 		struct T_optdata_ind	*tdi = &tpr->optdata_ind;
1961 
1962 		if (MBLKL(mp) < sizeof (struct T_optdata_ind)) {
1963 			zcmn_err(getzoneid(), CE_WARN,
1964 			    "sockfs: Too short T_OPTDATA_IND. Len = %ld\n",
1965 			    (ptrdiff_t)(MBLKL(mp)));
1966 			freemsg(mp);
1967 			return (NULL);
1968 		}
1969 		/*
1970 		 * Allow zero-length messages carrying options.
1971 		 * This is used when carrying the SO_UNIX_CLOSE option.
1972 		 */
1973 		if (so->so_family == AF_UNIX && msgdsize(mp->b_cont) == 0 &&
1974 		    tdi->OPT_length != 0) {
1975 			/*
1976 			 * Attempt to extract AF_UNIX close indication
1977 			 * from the options. Ignore any other options -
1978 			 * those are handled once the message is removed
1979 			 * from the queue.
1980 			 * The close indication message should not carry data.
1981 			 */
1982 			void *opt;
1983 			t_uscalar_t optlen = tdi->OPT_length;
1984 
1985 			opt = sogetoff(mp, tdi->OPT_offset,
1986 			    optlen, __TPI_ALIGN_SIZE);
1987 			if (opt == NULL) {
1988 				/* The len/off falls outside mp */
1989 				freemsg(mp);
1990 				mutex_enter(&so->so_lock);
1991 				soseterror(so, EPROTO);
1992 				mutex_exit(&so->so_lock);
1993 				zcmn_err(getzoneid(), CE_WARN,
1994 				    "sockfs: T_optdata_ind with invalid "
1995 				    "optlen/offset %u/%d\n",
1996 				    optlen, tdi->OPT_offset);
1997 				return (NULL);
1998 			}
1999 			/*
2000 			 * If we received a close indication mark the
2001 			 * socket and discard this message.
2002 			 */
2003 			if (so_getopt_unix_close(opt, optlen)) {
2004 				mutex_enter(&so->so_lock);
2005 				socantsendmore(so);
2006 				sti->sti_faddr_valid = 0;
2007 				mutex_exit(&so->so_lock);
2008 				strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2009 				freemsg(mp);
2010 				return (NULL);
2011 			}
2012 		}
2013 		*allmsgsigs = S_INPUT | S_RDNORM;
2014 		*pollwakeups = POLLIN | POLLRDNORM;
2015 		*wakeups = RSLEEP;
2016 		return (mp);
2017 	}
2018 
2019 	case T_EXDATA_IND: {
2020 		mblk_t		*mctl, *mdata;
2021 		mblk_t *lbp;
2022 		union T_primitives *tprp;
2023 		struct stdata   *stp;
2024 		queue_t *qp;
2025 
2026 		if (MBLKL(mp) < sizeof (struct T_exdata_ind)) {
2027 			zcmn_err(getzoneid(), CE_WARN,
2028 			    "sockfs: Too short T_EXDATA_IND. Len = %ld\n",
2029 			    (ptrdiff_t)(MBLKL(mp)));
2030 			freemsg(mp);
2031 			return (NULL);
2032 		}
2033 		/*
2034 		 * Ignore zero-length T_EXDATA_IND messages. These might be
2035 		 * generated by some transports.
2036 		 *
2037 		 * This is needed to prevent read (which skips the M_PROTO
2038 		 * part) to unexpectedly return 0 (or return EWOULDBLOCK
2039 		 * on a non-blocking socket after select/poll has indicated
2040 		 * that data is available).
2041 		 */
2042 		dprintso(so, 1,
2043 		    ("T_EXDATA_IND(%p): counts %d/%d state %s\n",
2044 		    (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt,
2045 		    pr_state(so->so_state, so->so_mode)));
2046 
2047 		if (msgdsize(mp->b_cont) == 0) {
2048 			dprintso(so, 0,
2049 			    ("strsock_proto: zero length T_EXDATA_IND\n"));
2050 			freemsg(mp);
2051 			return (NULL);
2052 		}
2053 
2054 		/*
2055 		 * Split into the T_EXDATA_IND and the M_DATA part.
2056 		 * We process these three pieces separately:
2057 		 *	signal generation
2058 		 *	handling T_EXDATA_IND
2059 		 *	handling M_DATA component
2060 		 */
2061 		mctl = mp;
2062 		mdata = mctl->b_cont;
2063 		mctl->b_cont = NULL;
2064 		mutex_enter(&so->so_lock);
2065 		so_oob_sig(so, 0, allmsgsigs, pollwakeups);
2066 		mctl = so_oob_exdata(so, mctl, allmsgsigs, pollwakeups);
2067 		mdata = so_oob_data(so, mdata, allmsgsigs, pollwakeups);
2068 
2069 		stp = vp->v_stream;
2070 		ASSERT(stp != NULL);
2071 		qp = _RD(stp->sd_wrq);
2072 
2073 		mutex_enter(QLOCK(qp));
2074 		lbp = qp->q_last;
2075 
2076 		/*
2077 		 * We want to avoid queueing up a string of T_EXDATA_IND
2078 		 * messages with no intervening data messages at the stream
2079 		 * head. These messages contribute to the total message
2080 		 * count. Eventually this can lead to STREAMS flow control
2081 		 * and also cause TCP to advertise a zero window condition
2082 		 * to the peer. This can happen in the degenerate case where
2083 		 * the sender and receiver exchange only OOB data. The sender
2084 		 * only sends messages with MSG_OOB flag and the receiver
2085 		 * receives only MSG_OOB messages and does not use SO_OOBINLINE.
2086 		 * An example of this scenario has been reported in applications
2087 		 * that use OOB data to exchange heart beats. Flow control
2088 		 * relief will never happen if the application only reads OOB
2089 		 * data which is done directly by sorecvoob() and the
2090 		 * T_EXDATA_IND messages at the streamhead won't be consumed.
2091 		 * Note that there is no correctness issue in compressing the
2092 		 * string of T_EXDATA_IND messages into a single T_EXDATA_IND
2093 		 * message. A single read that does not specify MSG_OOB will
2094 		 * read across all the marks in a loop in sotpi_recvmsg().
2095 		 * Each mark is individually distinguishable only if the
2096 		 * T_EXDATA_IND messages are separated by data messages.
2097 		 */
2098 		if ((qp->q_first != NULL) && (DB_TYPE(lbp) == M_PROTO)) {
2099 			tprp = (union T_primitives *)lbp->b_rptr;
2100 			if ((tprp->type == T_EXDATA_IND) &&
2101 			    !(so->so_options & SO_OOBINLINE)) {
2102 
2103 				/*
2104 				 * free the new M_PROTO message
2105 				 */
2106 				freemsg(mctl);
2107 
2108 				/*
2109 				 * adjust the OOB count and OOB	signal count
2110 				 * just incremented for the new OOB data.
2111 				 */
2112 				sti->sti_oobcnt--;
2113 				sti->sti_oobsigcnt--;
2114 				mutex_exit(QLOCK(qp));
2115 				mutex_exit(&so->so_lock);
2116 				return (NULL);
2117 			}
2118 		}
2119 		mutex_exit(QLOCK(qp));
2120 
2121 		/*
2122 		 * Pass the T_EXDATA_IND and the M_DATA back separately
2123 		 * by using b_next linkage. (The stream head will queue any
2124 		 * b_next linked messages separately.) This is needed
2125 		 * since MSGMARK applies to the last byte of the message
2126 		 * hence we can not have any M_DATA component attached
2127 		 * to the marked T_EXDATA_IND. Note that the stream head
2128 		 * will not consolidate M_DATA messages onto an MSGMARK'ed
2129 		 * message in order to preserve the constraint that
2130 		 * the T_EXDATA_IND always is a separate message.
2131 		 */
2132 		ASSERT(mctl != NULL);
2133 		mctl->b_next = mdata;
2134 		mp = mctl;
2135 #ifdef DEBUG
2136 		if (mdata == NULL) {
2137 			dprintso(so, 1,
2138 			    ("after outofline T_EXDATA_IND(%p): "
2139 			    "counts %d/%d  poll 0x%x sig 0x%x state %s\n",
2140 			    (void *)vp, sti->sti_oobsigcnt,
2141 			    sti->sti_oobcnt, *pollwakeups, *allmsgsigs,
2142 			    pr_state(so->so_state, so->so_mode)));
2143 		} else {
2144 			dprintso(so, 1,
2145 			    ("after inline T_EXDATA_IND(%p): "
2146 			    "counts %d/%d  poll 0x%x sig 0x%x state %s\n",
2147 			    (void *)vp, sti->sti_oobsigcnt,
2148 			    sti->sti_oobcnt, *pollwakeups, *allmsgsigs,
2149 			    pr_state(so->so_state, so->so_mode)));
2150 		}
2151 #endif /* DEBUG */
2152 		mutex_exit(&so->so_lock);
2153 		*wakeups = RSLEEP;
2154 		return (mp);
2155 	}
2156 
2157 	case T_CONN_CON: {
2158 		struct T_conn_con	*conn_con;
2159 		void			*addr;
2160 		t_uscalar_t		addrlen;
2161 
2162 		/*
2163 		 * Verify the state, update the state to ISCONNECTED,
2164 		 * record the potentially new address in the message,
2165 		 * and drop the message.
2166 		 */
2167 		if (MBLKL(mp) < sizeof (struct T_conn_con)) {
2168 			zcmn_err(getzoneid(), CE_WARN,
2169 			    "sockfs: Too short T_CONN_CON. Len = %ld\n",
2170 			    (ptrdiff_t)(MBLKL(mp)));
2171 			freemsg(mp);
2172 			return (NULL);
2173 		}
2174 
2175 		mutex_enter(&so->so_lock);
2176 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) !=
2177 		    SS_ISCONNECTING) {
2178 			mutex_exit(&so->so_lock);
2179 			dprintso(so, 1,
2180 			    ("T_CONN_CON: state %x\n", so->so_state));
2181 			freemsg(mp);
2182 			return (NULL);
2183 		}
2184 
2185 		conn_con = &tpr->conn_con;
2186 		addrlen = conn_con->RES_length;
2187 		/*
2188 		 * Allow the address to be of different size than sent down
2189 		 * in the T_CONN_REQ as long as it doesn't exceed the maxlen.
2190 		 * For AF_UNIX require the identical length.
2191 		 */
2192 		if (so->so_family == AF_UNIX ?
2193 		    addrlen != (t_uscalar_t)sizeof (sti->sti_ux_laddr) :
2194 		    addrlen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2195 			zcmn_err(getzoneid(), CE_WARN,
2196 			    "sockfs: T_conn_con with different "
2197 			    "length %u/%d\n",
2198 			    addrlen, conn_con->RES_length);
2199 			soisdisconnected(so, EPROTO);
2200 			sti->sti_laddr_valid = 0;
2201 			sti->sti_faddr_valid = 0;
2202 			mutex_exit(&so->so_lock);
2203 			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
2204 			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2205 			strseteof(SOTOV(so), 1);
2206 			freemsg(mp);
2207 			/*
2208 			 * strseteof takes care of read side wakeups,
2209 			 * pollwakeups, and signals.
2210 			 */
2211 			*wakeups = WSLEEP;
2212 			*allmsgsigs = S_OUTPUT;
2213 			*pollwakeups = POLLOUT;
2214 			return (NULL);
2215 		}
2216 		addr = sogetoff(mp, conn_con->RES_offset, addrlen, 1);
2217 		if (addr == NULL) {
2218 			zcmn_err(getzoneid(), CE_WARN,
2219 			    "sockfs: T_conn_con with invalid "
2220 			    "addrlen/offset %u/%d\n",
2221 			    addrlen, conn_con->RES_offset);
2222 			mutex_exit(&so->so_lock);
2223 			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
2224 			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2225 			strseteof(SOTOV(so), 1);
2226 			freemsg(mp);
2227 			/*
2228 			 * strseteof takes care of read side wakeups,
2229 			 * pollwakeups, and signals.
2230 			 */
2231 			*wakeups = WSLEEP;
2232 			*allmsgsigs = S_OUTPUT;
2233 			*pollwakeups = POLLOUT;
2234 			return (NULL);
2235 		}
2236 
2237 		/*
2238 		 * Save for getpeername.
2239 		 */
2240 		if (so->so_family != AF_UNIX) {
2241 			sti->sti_faddr_len = (socklen_t)addrlen;
2242 			ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2243 			bcopy(addr, sti->sti_faddr_sa, addrlen);
2244 			sti->sti_faddr_valid = 1;
2245 		}
2246 
2247 		if (so->so_peercred != NULL)
2248 			crfree(so->so_peercred);
2249 		so->so_peercred = msg_getcred(mp, &so->so_cpid);
2250 		if (so->so_peercred != NULL)
2251 			crhold(so->so_peercred);
2252 
2253 		/* Wakeup anybody sleeping in sowaitconnected */
2254 		soisconnected(so);
2255 		mutex_exit(&so->so_lock);
2256 
2257 		/*
2258 		 * The socket is now available for sending data.
2259 		 */
2260 		*wakeups = WSLEEP;
2261 		*allmsgsigs = S_OUTPUT;
2262 		*pollwakeups = POLLOUT;
2263 		freemsg(mp);
2264 		return (NULL);
2265 	}
2266 
2267 	/*
2268 	 * Extra processing in case of an SSL proxy, before queuing or
2269 	 * forwarding to the fallback endpoint
2270 	 */
2271 	case T_SSL_PROXY_CONN_IND:
2272 	case T_CONN_IND:
2273 		/*
2274 		 * Verify the min size and queue the message on
2275 		 * the sti_conn_ind_head/tail list.
2276 		 */
2277 		if (MBLKL(mp) < sizeof (struct T_conn_ind)) {
2278 			zcmn_err(getzoneid(), CE_WARN,
2279 			    "sockfs: Too short T_CONN_IND. Len = %ld\n",
2280 			    (ptrdiff_t)(MBLKL(mp)));
2281 			freemsg(mp);
2282 			return (NULL);
2283 		}
2284 
2285 		if (audit_active)
2286 			audit_sock(T_CONN_IND, strvp2wq(vp), mp, 0);
2287 		if (!(so->so_state & SS_ACCEPTCONN)) {
2288 			zcmn_err(getzoneid(), CE_WARN,
2289 			    "sockfs: T_conn_ind on non-listening socket\n");
2290 			freemsg(mp);
2291 			return (NULL);
2292 		}
2293 
2294 		if (tpr->type == T_SSL_PROXY_CONN_IND && mp->b_cont == NULL) {
2295 			/* No context: need to fall back */
2296 			struct sonode *fbso;
2297 			stdata_t *fbstp;
2298 
2299 			tpr->type = T_CONN_IND;
2300 
2301 			fbso = kssl_find_fallback(sti->sti_kssl_ent);
2302 
2303 			/*
2304 			 * No fallback: the remote will timeout and
2305 			 * disconnect.
2306 			 */
2307 			if (fbso == NULL) {
2308 				freemsg(mp);
2309 				return (NULL);
2310 			}
2311 			fbstp = SOTOV(fbso)->v_stream;
2312 			qreply(fbstp->sd_wrq->q_next, mp);
2313 			return (NULL);
2314 		}
2315 		soqueueconnind(so, mp);
2316 		*allmsgsigs = S_INPUT | S_RDNORM;
2317 		*pollwakeups = POLLIN | POLLRDNORM;
2318 		*wakeups = RSLEEP;
2319 		return (NULL);
2320 
2321 	case T_ORDREL_IND:
2322 		if (MBLKL(mp) < sizeof (struct T_ordrel_ind)) {
2323 			zcmn_err(getzoneid(), CE_WARN,
2324 			    "sockfs: Too short T_ORDREL_IND. Len = %ld\n",
2325 			    (ptrdiff_t)(MBLKL(mp)));
2326 			freemsg(mp);
2327 			return (NULL);
2328 		}
2329 
2330 		/*
2331 		 * Some providers send this when not fully connected.
2332 		 * SunLink X.25 needs to retrieve disconnect reason after
2333 		 * disconnect for compatibility. It uses T_ORDREL_IND
2334 		 * instead of T_DISCON_IND so that it may use the
2335 		 * endpoint after a connect failure to retrieve the
2336 		 * reason using an ioctl. Thus we explicitly clear
2337 		 * SS_ISCONNECTING here for SunLink X.25.
2338 		 * This is a needed TPI violation.
2339 		 */
2340 		mutex_enter(&so->so_lock);
2341 		so->so_state &= ~SS_ISCONNECTING;
2342 		socantrcvmore(so);
2343 		mutex_exit(&so->so_lock);
2344 		strseteof(SOTOV(so), 1);
2345 		/*
2346 		 * strseteof takes care of read side wakeups,
2347 		 * pollwakeups, and signals.
2348 		 */
2349 		freemsg(mp);
2350 		return (NULL);
2351 
2352 	case T_DISCON_IND:
2353 		if (MBLKL(mp) < sizeof (struct T_discon_ind)) {
2354 			zcmn_err(getzoneid(), CE_WARN,
2355 			    "sockfs: Too short T_DISCON_IND. Len = %ld\n",
2356 			    (ptrdiff_t)(MBLKL(mp)));
2357 			freemsg(mp);
2358 			return (NULL);
2359 		}
2360 		if (so->so_state & SS_ACCEPTCONN) {
2361 			/*
2362 			 * This is a listener. Look for a queued T_CONN_IND
2363 			 * with a matching sequence number and remove it
2364 			 * from the list.
2365 			 * It is normal to not find the sequence number since
2366 			 * the soaccept might have already dequeued it
2367 			 * (in which case the T_CONN_RES will fail with
2368 			 * TBADSEQ).
2369 			 */
2370 			(void) soflushconnind(so, tpr->discon_ind.SEQ_number);
2371 			freemsg(mp);
2372 			return (0);
2373 		}
2374 
2375 		/*
2376 		 * Not a listener
2377 		 *
2378 		 * If SS_CANTRCVMORE for AF_UNIX ignore the discon_reason.
2379 		 * Such a discon_ind appears when the peer has first done
2380 		 * a shutdown() followed by a close() in which case we just
2381 		 * want to record socantsendmore.
2382 		 * In this case sockfs first receives a T_ORDREL_IND followed
2383 		 * by a T_DISCON_IND.
2384 		 * Note that for other transports (e.g. TCP) we need to handle
2385 		 * the discon_ind in this case since it signals an error.
2386 		 */
2387 		mutex_enter(&so->so_lock);
2388 		if ((so->so_state & SS_CANTRCVMORE) &&
2389 		    (so->so_family == AF_UNIX)) {
2390 			socantsendmore(so);
2391 			sti->sti_faddr_valid = 0;
2392 			mutex_exit(&so->so_lock);
2393 			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2394 			dprintso(so, 1,
2395 			    ("T_DISCON_IND: error %d\n", so->so_error));
2396 			freemsg(mp);
2397 			/*
2398 			 * Set these variables for caller to process them.
2399 			 * For the else part where T_DISCON_IND is processed,
2400 			 * this will be done in the function being called
2401 			 * (strsock_discon_ind())
2402 			 */
2403 			*wakeups = WSLEEP;
2404 			*allmsgsigs = S_OUTPUT;
2405 			*pollwakeups = POLLOUT;
2406 		} else if (so->so_flag & (SOASYNC_UNBIND | SOLOCKED)) {
2407 			/*
2408 			 * Deferred processing of T_DISCON_IND
2409 			 */
2410 			so_save_discon_ind(so, mp, strsock_discon_ind);
2411 			mutex_exit(&so->so_lock);
2412 		} else {
2413 			/*
2414 			 * Process T_DISCON_IND now
2415 			 */
2416 			(void) strsock_discon_ind(so, mp);
2417 			mutex_exit(&so->so_lock);
2418 		}
2419 		return (NULL);
2420 
2421 	case T_UDERROR_IND: {
2422 		struct T_uderror_ind	*tudi = &tpr->uderror_ind;
2423 		void			*addr;
2424 		t_uscalar_t		addrlen;
2425 		int			error;
2426 
2427 		dprintso(so, 0,
2428 		    ("T_UDERROR_IND: error %d\n", tudi->ERROR_type));
2429 
2430 		if (MBLKL(mp) < sizeof (struct T_uderror_ind)) {
2431 			zcmn_err(getzoneid(), CE_WARN,
2432 			    "sockfs: Too short T_UDERROR_IND. Len = %ld\n",
2433 			    (ptrdiff_t)(MBLKL(mp)));
2434 			freemsg(mp);
2435 			return (NULL);
2436 		}
2437 		/* Ignore on connection-oriented transports */
2438 		if (so->so_mode & SM_CONNREQUIRED) {
2439 			freemsg(mp);
2440 			eprintsoline(so, 0);
2441 			zcmn_err(getzoneid(), CE_WARN,
2442 			    "sockfs: T_uderror_ind on connection-oriented "
2443 			    "transport\n");
2444 			return (NULL);
2445 		}
2446 		addrlen = tudi->DEST_length;
2447 		addr = sogetoff(mp, tudi->DEST_offset, addrlen, 1);
2448 		if (addr == NULL) {
2449 			zcmn_err(getzoneid(), CE_WARN,
2450 			    "sockfs: T_uderror_ind with invalid "
2451 			    "addrlen/offset %u/%d\n",
2452 			    addrlen, tudi->DEST_offset);
2453 			freemsg(mp);
2454 			return (NULL);
2455 		}
2456 
2457 		/* Verify source address for connected socket. */
2458 		mutex_enter(&so->so_lock);
2459 		if (so->so_state & SS_ISCONNECTED) {
2460 			void *faddr;
2461 			t_uscalar_t faddr_len;
2462 			boolean_t match = B_FALSE;
2463 
2464 			switch (so->so_family) {
2465 			case AF_INET: {
2466 				/* Compare just IP address and port */
2467 				struct sockaddr_in *sin1, *sin2;
2468 
2469 				sin1 = (struct sockaddr_in *)sti->sti_faddr_sa;
2470 				sin2 = (struct sockaddr_in *)addr;
2471 				if (addrlen == sizeof (struct sockaddr_in) &&
2472 				    sin1->sin_port == sin2->sin_port &&
2473 				    sin1->sin_addr.s_addr ==
2474 				    sin2->sin_addr.s_addr)
2475 					match = B_TRUE;
2476 				break;
2477 			}
2478 			case AF_INET6: {
2479 				/* Compare just IP address and port. Not flow */
2480 				struct sockaddr_in6 *sin1, *sin2;
2481 
2482 				sin1 = (struct sockaddr_in6 *)sti->sti_faddr_sa;
2483 				sin2 = (struct sockaddr_in6 *)addr;
2484 				if (addrlen == sizeof (struct sockaddr_in6) &&
2485 				    sin1->sin6_port == sin2->sin6_port &&
2486 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
2487 				    &sin2->sin6_addr))
2488 					match = B_TRUE;
2489 				break;
2490 			}
2491 			case AF_UNIX:
2492 				faddr = &sti->sti_ux_faddr;
2493 				faddr_len =
2494 				    (t_uscalar_t)sizeof (sti->sti_ux_faddr);
2495 				if (faddr_len == addrlen &&
2496 				    bcmp(addr, faddr, addrlen) == 0)
2497 					match = B_TRUE;
2498 				break;
2499 			default:
2500 				faddr = sti->sti_faddr_sa;
2501 				faddr_len = (t_uscalar_t)sti->sti_faddr_len;
2502 				if (faddr_len == addrlen &&
2503 				    bcmp(addr, faddr, addrlen) == 0)
2504 					match = B_TRUE;
2505 				break;
2506 			}
2507 
2508 			if (!match) {
2509 #ifdef DEBUG
2510 				dprintso(so, 0,
2511 				    ("sockfs: T_UDERR_IND mismatch: %s - ",
2512 				    pr_addr(so->so_family,
2513 				    (struct sockaddr *)addr, addrlen)));
2514 				dprintso(so, 0, ("%s\n",
2515 				    pr_addr(so->so_family, sti->sti_faddr_sa,
2516 				    sti->sti_faddr_len)));
2517 #endif /* DEBUG */
2518 				mutex_exit(&so->so_lock);
2519 				freemsg(mp);
2520 				return (NULL);
2521 			}
2522 			/*
2523 			 * Make the write error nonpersistent. If the error
2524 			 * is zero we use ECONNRESET.
2525 			 * This assumes that the name space for ERROR_type
2526 			 * is the errno name space.
2527 			 */
2528 			if (tudi->ERROR_type != 0)
2529 				error = tudi->ERROR_type;
2530 			else
2531 				error = ECONNRESET;
2532 
2533 			soseterror(so, error);
2534 			mutex_exit(&so->so_lock);
2535 			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
2536 			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2537 			*wakeups = RSLEEP | WSLEEP;
2538 			*allmsgsigs = S_INPUT | S_RDNORM | S_OUTPUT;
2539 			*pollwakeups = POLLIN | POLLRDNORM | POLLOUT;
2540 			freemsg(mp);
2541 			return (NULL);
2542 		}
2543 		/*
2544 		 * If the application asked for delayed errors
2545 		 * record the T_UDERROR_IND sti_eaddr_mp and the reason in
2546 		 * sti_delayed_error for delayed error posting. If the reason
2547 		 * is zero use ECONNRESET.
2548 		 * Note that delayed error indications do not make sense for
2549 		 * AF_UNIX sockets since sendto checks that the destination
2550 		 * address is valid at the time of the sendto.
2551 		 */
2552 		if (!(so->so_options & SO_DGRAM_ERRIND)) {
2553 			mutex_exit(&so->so_lock);
2554 			freemsg(mp);
2555 			return (NULL);
2556 		}
2557 		if (sti->sti_eaddr_mp != NULL)
2558 			freemsg(sti->sti_eaddr_mp);
2559 
2560 		sti->sti_eaddr_mp = mp;
2561 		if (tudi->ERROR_type != 0)
2562 			error = tudi->ERROR_type;
2563 		else
2564 			error = ECONNRESET;
2565 		sti->sti_delayed_error = (ushort_t)error;
2566 		mutex_exit(&so->so_lock);
2567 		return (NULL);
2568 	}
2569 
2570 	case T_ERROR_ACK:
2571 		dprintso(so, 0,
2572 		    ("strsock_proto: T_ERROR_ACK for %d, error %d/%d\n",
2573 		    tpr->error_ack.ERROR_prim,
2574 		    tpr->error_ack.TLI_error,
2575 		    tpr->error_ack.UNIX_error));
2576 
2577 		if (MBLKL(mp) < sizeof (struct T_error_ack)) {
2578 			zcmn_err(getzoneid(), CE_WARN,
2579 			    "sockfs: Too short T_ERROR_ACK. Len = %ld\n",
2580 			    (ptrdiff_t)(MBLKL(mp)));
2581 			freemsg(mp);
2582 			return (NULL);
2583 		}
2584 		/*
2585 		 * Check if we were waiting for the async message
2586 		 */
2587 		mutex_enter(&so->so_lock);
2588 		if ((so->so_flag & SOASYNC_UNBIND) &&
2589 		    tpr->error_ack.ERROR_prim == T_UNBIND_REQ) {
2590 			so_unlock_single(so, SOASYNC_UNBIND);
2591 			mutex_exit(&so->so_lock);
2592 			freemsg(mp);
2593 			return (NULL);
2594 		}
2595 		mutex_exit(&so->so_lock);
2596 		soqueueack(so, mp);
2597 		return (NULL);
2598 
2599 	case T_OK_ACK:
2600 		if (MBLKL(mp) < sizeof (struct T_ok_ack)) {
2601 			zcmn_err(getzoneid(), CE_WARN,
2602 			    "sockfs: Too short T_OK_ACK. Len = %ld\n",
2603 			    (ptrdiff_t)(MBLKL(mp)));
2604 			freemsg(mp);
2605 			return (NULL);
2606 		}
2607 		/*
2608 		 * Check if we were waiting for the async message
2609 		 */
2610 		mutex_enter(&so->so_lock);
2611 		if ((so->so_flag & SOASYNC_UNBIND) &&
2612 		    tpr->ok_ack.CORRECT_prim == T_UNBIND_REQ) {
2613 			dprintso(so, 1,
2614 			    ("strsock_proto: T_OK_ACK async unbind\n"));
2615 			so_unlock_single(so, SOASYNC_UNBIND);
2616 			mutex_exit(&so->so_lock);
2617 			freemsg(mp);
2618 			return (NULL);
2619 		}
2620 		mutex_exit(&so->so_lock);
2621 		soqueueack(so, mp);
2622 		return (NULL);
2623 
2624 	case T_INFO_ACK:
2625 		if (MBLKL(mp) < sizeof (struct T_info_ack)) {
2626 			zcmn_err(getzoneid(), CE_WARN,
2627 			    "sockfs: Too short T_INFO_ACK. Len = %ld\n",
2628 			    (ptrdiff_t)(MBLKL(mp)));
2629 			freemsg(mp);
2630 			return (NULL);
2631 		}
2632 		soqueueack(so, mp);
2633 		return (NULL);
2634 
2635 	case T_CAPABILITY_ACK:
2636 		/*
2637 		 * A T_capability_ack need only be large enough to hold
2638 		 * the PRIM_type and CAP_bits1 fields; checking for anything
2639 		 * larger might reject a correct response from an older
2640 		 * provider.
2641 		 */
2642 		if (MBLKL(mp) < 2 * sizeof (t_uscalar_t)) {
2643 			zcmn_err(getzoneid(), CE_WARN,
2644 			    "sockfs: Too short T_CAPABILITY_ACK. Len = %ld\n",
2645 			    (ptrdiff_t)(MBLKL(mp)));
2646 			freemsg(mp);
2647 			return (NULL);
2648 		}
2649 		soqueueack(so, mp);
2650 		return (NULL);
2651 
2652 	case T_BIND_ACK:
2653 		if (MBLKL(mp) < sizeof (struct T_bind_ack)) {
2654 			zcmn_err(getzoneid(), CE_WARN,
2655 			    "sockfs: Too short T_BIND_ACK. Len = %ld\n",
2656 			    (ptrdiff_t)(MBLKL(mp)));
2657 			freemsg(mp);
2658 			return (NULL);
2659 		}
2660 		soqueueack(so, mp);
2661 		return (NULL);
2662 
2663 	case T_OPTMGMT_ACK:
2664 		if (MBLKL(mp) < sizeof (struct T_optmgmt_ack)) {
2665 			zcmn_err(getzoneid(), CE_WARN,
2666 			    "sockfs: Too short T_OPTMGMT_ACK. Len = %ld\n",
2667 			    (ptrdiff_t)(MBLKL(mp)));
2668 			freemsg(mp);
2669 			return (NULL);
2670 		}
2671 		soqueueack(so, mp);
2672 		return (NULL);
2673 	default:
2674 #ifdef DEBUG
2675 		zcmn_err(getzoneid(), CE_WARN,
2676 		    "sockfs: unknown TPI primitive %d received\n",
2677 		    tpr->type);
2678 #endif /* DEBUG */
2679 		freemsg(mp);
2680 		return (NULL);
2681 	}
2682 }
2683 
2684 /*
2685  * This routine is registered with the stream head to receive other
2686  * (non-data, and non-proto) messages.
2687  *
2688  * Returns NULL if the message was consumed.
2689  * Returns an mblk to make that mblk be processed by the stream head.
2690  *
2691  * Sets the return parameters (*wakeups, *firstmsgsigs, *allmsgsigs, and
2692  * *pollwakeups) for the stream head to take action on.
2693  */
2694 static mblk_t *
2695 strsock_misc(vnode_t *vp, mblk_t *mp,
2696 		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
2697 		strsigset_t *allmsgsigs, strpollset_t *pollwakeups)
2698 {
2699 	struct sonode *so;
2700 	sotpi_info_t *sti;
2701 
2702 	so = VTOSO(vp);
2703 	sti = SOTOTPI(so);
2704 
2705 	dprintso(so, 1, ("strsock_misc(%p, %p, 0x%x)\n",
2706 	    (void *)vp, (void *)mp, DB_TYPE(mp)));
2707 
2708 	/* Set default return values */
2709 	*wakeups = *allmsgsigs = *firstmsgsigs = *pollwakeups = 0;
2710 
2711 	switch (DB_TYPE(mp)) {
2712 	case M_PCSIG:
2713 		/*
2714 		 * This assumes that an M_PCSIG for the urgent data arrives
2715 		 * before the corresponding T_EXDATA_IND.
2716 		 *
2717 		 * Note: Just like in SunOS 4.X and 4.4BSD a poll will be
2718 		 * awoken before the urgent data shows up.
2719 		 * For OOBINLINE this can result in select returning
2720 		 * only exceptions as opposed to except|read.
2721 		 */
2722 		if (*mp->b_rptr == SIGURG) {
2723 			mutex_enter(&so->so_lock);
2724 			dprintso(so, 1,
2725 			    ("SIGURG(%p): counts %d/%d state %s\n",
2726 			    (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt,
2727 			    pr_state(so->so_state, so->so_mode)));
2728 			so_oob_sig(so, 1, allmsgsigs, pollwakeups);
2729 			dprintso(so, 1,
2730 			    ("after SIGURG(%p): counts %d/%d "
2731 			    " poll 0x%x sig 0x%x state %s\n",
2732 			    (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt,
2733 			    *pollwakeups, *allmsgsigs,
2734 			    pr_state(so->so_state, so->so_mode)));
2735 			mutex_exit(&so->so_lock);
2736 		}
2737 		freemsg(mp);
2738 		return (NULL);
2739 
2740 	case M_SIG:
2741 	case M_HANGUP:
2742 	case M_UNHANGUP:
2743 	case M_ERROR:
2744 		/* M_ERRORs etc are ignored */
2745 		freemsg(mp);
2746 		return (NULL);
2747 
2748 	case M_FLUSH:
2749 		/*
2750 		 * Do not flush read queue. If the M_FLUSH
2751 		 * arrives because of an impending T_discon_ind
2752 		 * we still have to keep any queued data - this is part of
2753 		 * socket semantics.
2754 		 */
2755 		if (*mp->b_rptr & FLUSHW) {
2756 			*mp->b_rptr &= ~FLUSHR;
2757 			return (mp);
2758 		}
2759 		freemsg(mp);
2760 		return (NULL);
2761 
2762 	default:
2763 		return (mp);
2764 	}
2765 }
2766 
2767 
2768 /* Register to receive signals for certain events */
2769 int
2770 so_set_asyncsigs(vnode_t *vp, pid_t pgrp, int events, int mode, cred_t *cr)
2771 {
2772 	struct strsigset ss;
2773 	int32_t rval;
2774 
2775 	/*
2776 	 * Note that SOLOCKED will be set except for the call from soaccept().
2777 	 */
2778 	ASSERT(!mutex_owned(&VTOSO(vp)->so_lock));
2779 	ss.ss_pid = pgrp;
2780 	ss.ss_events = events;
2781 	return (strioctl(vp, I_ESETSIG, (intptr_t)&ss, mode, K_TO_K, cr,
2782 	    &rval));
2783 }
2784 
2785 
2786 /* Register for events matching the SS_ASYNC flag */
2787 int
2788 so_set_events(struct sonode *so, vnode_t *vp, cred_t *cr)
2789 {
2790 	int events = so->so_state & SS_ASYNC ?
2791 	    S_RDBAND | S_BANDURG | S_RDNORM | S_OUTPUT :
2792 	    S_RDBAND | S_BANDURG;
2793 
2794 	return (so_set_asyncsigs(vp, so->so_pgrp, events, 0, cr));
2795 }
2796 
2797 
2798 /* Change the SS_ASYNC flag, and update signal delivery if needed */
2799 int
2800 so_flip_async(struct sonode *so, vnode_t *vp, int mode, cred_t *cr)
2801 {
2802 	ASSERT(mutex_owned(&so->so_lock));
2803 	if (so->so_pgrp != 0) {
2804 		int error;
2805 		int events = so->so_state & SS_ASYNC ?		/* Old flag */
2806 		    S_RDBAND | S_BANDURG :			/* New sigs */
2807 		    S_RDBAND | S_BANDURG | S_RDNORM | S_OUTPUT;
2808 
2809 		so_lock_single(so);
2810 		mutex_exit(&so->so_lock);
2811 
2812 		error = so_set_asyncsigs(vp, so->so_pgrp, events, mode, cr);
2813 
2814 		mutex_enter(&so->so_lock);
2815 		so_unlock_single(so, SOLOCKED);
2816 		if (error)
2817 			return (error);
2818 	}
2819 	so->so_state ^= SS_ASYNC;
2820 	return (0);
2821 }
2822 
2823 /*
2824  * Set new pid/pgrp for SIGPOLL (or SIGIO for FIOASYNC mode), replacing
2825  * any existing one.  If passed zero, just clear the existing one.
2826  */
2827 int
2828 so_set_siggrp(struct sonode *so, vnode_t *vp, pid_t pgrp, int mode, cred_t *cr)
2829 {
2830 	int events = so->so_state & SS_ASYNC ?
2831 	    S_RDBAND | S_BANDURG | S_RDNORM | S_OUTPUT :
2832 	    S_RDBAND | S_BANDURG;
2833 	int error;
2834 
2835 	ASSERT(mutex_owned(&so->so_lock));
2836 
2837 	/*
2838 	 * Change socket process (group).
2839 	 *
2840 	 * strioctl (via so_set_asyncsigs) will perform permission check and
2841 	 * also keep a PID_HOLD to prevent the pid from being reused.
2842 	 */
2843 	so_lock_single(so);
2844 	mutex_exit(&so->so_lock);
2845 
2846 	if (pgrp != 0) {
2847 		dprintso(so, 1, ("setown: adding pgrp %d ev 0x%x\n",
2848 		    pgrp, events));
2849 		error = so_set_asyncsigs(vp, pgrp, events, mode, cr);
2850 		if (error != 0) {
2851 			eprintsoline(so, error);
2852 			goto bad;
2853 		}
2854 	}
2855 	/* Remove the previously registered process/group */
2856 	if (so->so_pgrp != 0) {
2857 		dprintso(so, 1, ("setown: removing pgrp %d\n", so->so_pgrp));
2858 		error = so_set_asyncsigs(vp, so->so_pgrp, 0, mode, cr);
2859 		if (error != 0) {
2860 			eprintsoline(so, error);
2861 			error = 0;
2862 		}
2863 	}
2864 	mutex_enter(&so->so_lock);
2865 	so_unlock_single(so, SOLOCKED);
2866 	so->so_pgrp = pgrp;
2867 	return (0);
2868 bad:
2869 	mutex_enter(&so->so_lock);
2870 	so_unlock_single(so, SOLOCKED);
2871 	return (error);
2872 }
2873 
2874 /*
2875  * Wrapper for getmsg. If the socket has been converted to a stream
2876  * pass the request to the stream head.
2877  */
2878 int
2879 sock_getmsg(
2880 	struct vnode *vp,
2881 	struct strbuf *mctl,
2882 	struct strbuf *mdata,
2883 	uchar_t *prip,
2884 	int *flagsp,
2885 	int fmode,
2886 	rval_t *rvp
2887 )
2888 {
2889 	struct sonode *so;
2890 
2891 	ASSERT(vp->v_type == VSOCK);
2892 	/*
2893 	 * Use the stream head to find the real socket vnode.
2894 	 * This is needed when namefs sits above sockfs.  Some
2895 	 * sockets (like SCTP) are not streams.
2896 	 */
2897 	if (!vp->v_stream) {
2898 		return (ENOSTR);
2899 	}
2900 	ASSERT(vp->v_stream->sd_vnode);
2901 	vp = vp->v_stream->sd_vnode;
2902 	ASSERT(vn_matchops(vp, socket_vnodeops));
2903 	so = VTOSO(vp);
2904 
2905 	dprintso(so, 1, ("sock_getmsg(%p) %s\n",
2906 	    (void *)so, pr_state(so->so_state, so->so_mode)));
2907 
2908 	if (so->so_version == SOV_STREAM) {
2909 		/* The imaginary "sockmod" has been popped - act as a stream */
2910 		return (strgetmsg(vp, mctl, mdata, prip, flagsp, fmode, rvp));
2911 	}
2912 	eprintsoline(so, ENOSTR);
2913 	return (ENOSTR);
2914 }
2915 
2916 /*
2917  * Wrapper for putmsg. If the socket has been converted to a stream
2918  * pass the request to the stream head.
2919  *
2920  * Note that a while a regular socket (SOV_SOCKSTREAM) does support the
2921  * streams ioctl set it does not support putmsg and getmsg.
2922  * Allowing putmsg would prevent sockfs from tracking the state of
2923  * the socket/transport and would also invalidate the locking in sockfs.
2924  */
2925 int
2926 sock_putmsg(
2927 	struct vnode *vp,
2928 	struct strbuf *mctl,
2929 	struct strbuf *mdata,
2930 	uchar_t pri,
2931 	int flag,
2932 	int fmode
2933 )
2934 {
2935 	struct sonode *so;
2936 
2937 	ASSERT(vp->v_type == VSOCK);
2938 	/*
2939 	 * Use the stream head to find the real socket vnode.
2940 	 * This is needed when namefs sits above sockfs.
2941 	 */
2942 	if (!vp->v_stream) {
2943 		return (ENOSTR);
2944 	}
2945 	ASSERT(vp->v_stream->sd_vnode);
2946 	vp = vp->v_stream->sd_vnode;
2947 	ASSERT(vn_matchops(vp, socket_vnodeops));
2948 	so = VTOSO(vp);
2949 
2950 	dprintso(so, 1, ("sock_putmsg(%p) %s\n",
2951 	    (void *)so, pr_state(so->so_state, so->so_mode)));
2952 
2953 	if (so->so_version == SOV_STREAM) {
2954 		/* The imaginary "sockmod" has been popped - act as a stream */
2955 		return (strputmsg(vp, mctl, mdata, pri, flag, fmode));
2956 	}
2957 	eprintsoline(so, ENOSTR);
2958 	return (ENOSTR);
2959 }
2960 
2961 /*
2962  * Special function called only from f_getfl().
2963  * Returns FASYNC if the SS_ASYNC flag is set on a socket, else 0.
2964  * No locks are acquired here, so it is safe to use while uf_lock is held.
2965  * This exists solely for BSD fcntl() FASYNC compatibility.
2966  */
2967 int
2968 sock_getfasync(vnode_t *vp)
2969 {
2970 	struct sonode *so;
2971 
2972 	ASSERT(vp->v_type == VSOCK);
2973 	/*
2974 	 * For stream model, v_stream is used; For non-stream, v_stream always
2975 	 * equals NULL
2976 	 */
2977 	if (vp->v_stream != NULL)
2978 		so = VTOSO(vp->v_stream->sd_vnode);
2979 	else
2980 		so = VTOSO(vp);
2981 
2982 	if (so->so_version == SOV_STREAM || !(so->so_state & SS_ASYNC))
2983 		return (0);
2984 
2985 	return (FASYNC);
2986 }
2987