xref: /titanic_50/usr/src/uts/common/fs/sockfs/sockstr.c (revision d00d0b26c4591469742c6f5e781603b0d18de013)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/inttypes.h>
29 #include <sys/t_lock.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/buf.h>
33 #include <sys/conf.h>
34 #include <sys/cred.h>
35 #include <sys/kmem.h>
36 #include <sys/sysmacros.h>
37 #include <sys/vfs.h>
38 #include <sys/vnode.h>
39 #include <sys/debug.h>
40 #include <sys/errno.h>
41 #include <sys/time.h>
42 #include <sys/file.h>
43 #include <sys/user.h>
44 #include <sys/stream.h>
45 #include <sys/strsubr.h>
46 #include <sys/esunddi.h>
47 #include <sys/flock.h>
48 #include <sys/modctl.h>
49 #include <sys/vtrace.h>
50 #include <sys/strsun.h>
51 #include <sys/cmn_err.h>
52 #include <sys/proc.h>
53 #include <sys/ddi.h>
54 
55 #include <sys/suntpi.h>
56 #include <sys/socket.h>
57 #include <sys/sockio.h>
58 #include <sys/socketvar.h>
59 #include <sys/sodirect.h>
60 #include <netinet/in.h>
61 #include <inet/common.h>
62 #include <inet/proto_set.h>
63 
64 #include <sys/tiuser.h>
65 #define	_SUN_TPI_VERSION	2
66 #include <sys/tihdr.h>
67 
68 #include <inet/kssl/ksslapi.h>
69 
70 #include <c2/audit.h>
71 
72 #include <fs/sockfs/socktpi.h>
73 #include <fs/sockfs/socktpi_impl.h>
74 #include <sys/dcopy.h>
75 
/* Default socket version; initialized to SOV_SOCKSTREAM. */
int so_default_version = SOV_SOCKSTREAM;

#ifdef DEBUG
/* Set sockdebug to print debug messages when SO_DEBUG is set */
int sockdebug = 0;

/* Set sockprinterr to print error messages when SO_DEBUG is set */
int sockprinterr = 0;

/*
 * Set so_default_options to SO_DEBUG if all sockets should be created
 * with SO_DEBUG set. This is needed to get debug printouts from the
 * socket() call itself.
 * so_basic_strinit() copies this value into so_options.
 */
int so_default_options = 0;
#endif /* DEBUG */

#ifdef SOCK_TEST
/*
 * Set to number of ticks to limit cv_waits for code coverage testing.
 * Set to 1000 when SO_DEBUG is set to 2.
 * sowaitack() substitutes this value when it is called with no timeout.
 */
clock_t sock_test_timelimit = 0;
#endif /* SOCK_TEST */

/*
 * For concurrency testing of e.g. opening /dev/ip which does not
 * handle T_INFO_REQ messages.
 * When non-zero, do_tinfo() returns without sending a T_INFO_REQ and
 * do_tcapability() drops TC1_INFO from its request; both set
 * sti_addr_size to 0.
 */
int so_no_tinfo = 0;

/*
 * Timeout for getting a T_CAPABILITY_ACK - it is possible for a provider
 * to simply ignore the T_CAPABILITY_REQ.
 * do_tcapability() multiplies this value by hz before waiting.
 */
clock_t	sock_capability_timeout	= 2;	/* seconds */
112 
/*
 * Forward declarations of file-local routines that are referenced
 * before their definitions.
 */
static int	do_tcapability(struct sonode *so, t_uscalar_t cap_bits1);
static void	so_removehooks(struct sonode *so);

/*
 * Stream head read-side hooks; so_installhooks() registers them with
 * strsetrputhooks().
 */
static mblk_t *strsock_proto(vnode_t *vp, mblk_t *mp,
		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
		strsigset_t *allmsgsigs, strpollset_t *pollwakeups);
static mblk_t *strsock_misc(vnode_t *vp, mblk_t *mp,
		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
		strsigset_t *allmsgsigs, strpollset_t *pollwakeups);
/*
 * STREAMS based sodirect put/wakeup functions.
 * so_basic_strinit() hands both to sod_sock_init().
 */
static int sodput(sodirect_t *, mblk_t *);
static void sodwakeup(sodirect_t *);
127 
/*
 * Setup entry point for the STREAMS socket code; called by sockinit()
 * when sockfs is loaded.
 * Initializes sodirect through sod_init() and always returns 0.
 */
int
sostr_init()
{
	sod_init();

	return (0);
}
137 
138 /*
139  * Convert a socket to a stream. Invoked when the illusory sockmod
140  * is popped from the stream.
141  * Change the stream head back to default operation without losing
142  * any messages (T_conn_ind's are moved to the stream head queue).
143  */
144 int
145 so_sock2stream(struct sonode *so)
146 {
147 	struct vnode		*vp = SOTOV(so);
148 	queue_t			*rq;
149 	mblk_t			*mp;
150 	int			error = 0;
151 	sotpi_info_t		*sti = SOTOTPI(so);
152 
153 	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
154 
155 	mutex_enter(&so->so_lock);
156 	so_lock_single(so);
157 
158 	ASSERT(so->so_version != SOV_STREAM);
159 
160 	if (sti->sti_direct) {
161 		mblk_t **mpp;
162 		int rval;
163 
164 		/*
165 		 * Tell the transport below that sockmod is being popped
166 		 */
167 		mutex_exit(&so->so_lock);
168 		error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K, CRED(),
169 		    &rval);
170 		mutex_enter(&so->so_lock);
171 		if (error != 0) {
172 			dprintso(so, 0, ("so_sock2stream(%p): "
173 			    "_SIOCSOCKFALLBACK failed\n", (void *)so));
174 			goto exit;
175 		}
176 		sti->sti_direct = 0;
177 
178 		for (mpp = &sti->sti_conn_ind_head; (mp = *mpp) != NULL;
179 		    mpp = &mp->b_next) {
180 			struct T_conn_ind	*conn_ind;
181 
182 			/*
183 			 * strsock_proto() has already verified the length of
184 			 * this message block.
185 			 */
186 			ASSERT(MBLKL(mp) >= sizeof (struct T_conn_ind));
187 
188 			conn_ind = (struct T_conn_ind *)mp->b_rptr;
189 			if (conn_ind->OPT_length == 0 &&
190 			    conn_ind->OPT_offset == 0)
191 				continue;
192 
193 			if (DB_REF(mp) > 1) {
194 				mblk_t	*newmp;
195 				size_t	length;
196 				cred_t	*cr;
197 				pid_t	cpid;
198 				int error;	/* Dummy - error not returned */
199 
200 				/*
201 				 * Copy the message block because it is used
202 				 * elsewhere, too.
203 				 * Can't use copyb since we want to wait
204 				 * yet allow for EINTR.
205 				 */
206 				/* Round up size for reuse */
207 				length = MAX(MBLKL(mp), 64);
208 				cr = msg_getcred(mp, &cpid);
209 				if (cr != NULL) {
210 					newmp = allocb_cred_wait(length, 0,
211 					    &error, cr, cpid);
212 				} else {
213 					newmp = allocb_wait(length, 0, 0,
214 					    &error);
215 				}
216 				if (newmp == NULL) {
217 					error = EINTR;
218 					goto exit;
219 				}
220 				bcopy(mp->b_rptr, newmp->b_wptr, length);
221 				newmp->b_wptr += length;
222 				newmp->b_next = mp->b_next;
223 
224 				/*
225 				 * Link the new message block into the queue
226 				 * and free the old one.
227 				 */
228 				*mpp = newmp;
229 				mp->b_next = NULL;
230 				freemsg(mp);
231 
232 				mp = newmp;
233 				conn_ind = (struct T_conn_ind *)mp->b_rptr;
234 			}
235 
236 			/*
237 			 * Remove options added by TCP for accept fast-path.
238 			 */
239 			conn_ind->OPT_length = 0;
240 			conn_ind->OPT_offset = 0;
241 		}
242 	}
243 
244 	so->so_version = SOV_STREAM;
245 	so->so_proto_handle = NULL;
246 
247 	/*
248 	 * Remove the hooks in the stream head to avoid queuing more
249 	 * packets in sockfs.
250 	 */
251 	mutex_exit(&so->so_lock);
252 	so_removehooks(so);
253 	mutex_enter(&so->so_lock);
254 
255 	/*
256 	 * Clear any state related to urgent data. Leave any T_EXDATA_IND
257 	 * on the queue - the behavior of urgent data after a switch is
258 	 * left undefined.
259 	 */
260 	so->so_error = sti->sti_delayed_error = 0;
261 	freemsg(so->so_oobmsg);
262 	so->so_oobmsg = NULL;
263 	sti->sti_oobsigcnt = sti->sti_oobcnt = 0;
264 
265 	so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|
266 	    SS_SAVEDEOR);
267 	ASSERT(so_verify_oobstate(so));
268 
269 	freemsg(sti->sti_ack_mp);
270 	sti->sti_ack_mp = NULL;
271 
272 	/*
273 	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
274 	 */
275 	so_flush_discon_ind(so);
276 
277 	/*
278 	 * Move any queued T_CONN_IND messages to stream head queue.
279 	 */
280 	rq = RD(strvp2wq(vp));
281 	while ((mp = sti->sti_conn_ind_head) != NULL) {
282 		sti->sti_conn_ind_head = mp->b_next;
283 		mp->b_next = NULL;
284 		if (sti->sti_conn_ind_head == NULL) {
285 			ASSERT(sti->sti_conn_ind_tail == mp);
286 			sti->sti_conn_ind_tail = NULL;
287 		}
288 		dprintso(so, 0,
289 		    ("so_sock2stream(%p): moving T_CONN_IND\n", (void *)so));
290 
291 		/* Drop lock across put() */
292 		mutex_exit(&so->so_lock);
293 		put(rq, mp);
294 		mutex_enter(&so->so_lock);
295 	}
296 
297 exit:
298 	ASSERT(MUTEX_HELD(&so->so_lock));
299 	so_unlock_single(so, SOLOCKED);
300 	mutex_exit(&so->so_lock);
301 	return (error);
302 }
303 
/*
 * Convert a stream back to a socket. This is invoked when the illusory
 * sockmod is pushed on a stream (where the stream was "created" by
 * popping the illusory sockmod).
 * This routine can not recreate the socket state (certain aspects of
 * it like urgent data state and the bound/connected addresses for AF_UNIX
 * sockets can not be recreated by asking the transport for information).
 * Thus this routine implicitly assumes that the socket is in an initial
 * state (as if it was just created). It flushes any messages queued on the
 * read queue to avoid dealing with e.g. TPI acks or T_exdata_ind messages.
 *
 * The caller must hold sti_plumb_lock. so_lock is acquired and released
 * several times here so that it is not held across the stream head calls.
 */
void
so_stream2sock(struct sonode *so)
{
	struct vnode *vp = SOTOV(so);
	sotpi_info_t *sti = SOTOTPI(so);

	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));

	/* Single-thread the socket and switch the version back. */
	mutex_enter(&so->so_lock);
	so_lock_single(so);
	ASSERT(so->so_version == SOV_STREAM);
	so->so_version = SOV_SOCKSTREAM;
	sti->sti_pushcnt = 0;
	mutex_exit(&so->so_lock);

	/*
	 * Set a permanent error to force any thread in sorecvmsg to
	 * return (and drop SOREADLOCKED). Clear the error once
	 * we have SOREADLOCKED.
	 * This makes a read sleeping during the I_PUSH of sockmod return
	 * EIO.
	 */
	strsetrerror(SOTOV(so), EIO, 1, NULL);

	/*
	 * Get the read lock before flushing data to avoid
	 * problems with the T_EXDATA_IND MSG_PEEK code in sorecvmsg.
	 */
	mutex_enter(&so->so_lock);
	(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
	mutex_exit(&so->so_lock);

	/* Readers are now excluded; clear the error and restore the hooks. */
	strsetrerror(SOTOV(so), 0, 0, NULL);
	so_installhooks(so);

	/*
	 * Flush everything on the read queue.
	 * This ensures that no T_CONN_IND remain and that no T_EXDATA_IND
	 * remain; those types of messages would confuse sockfs.
	 */
	strflushrq(vp, FLUSHALL);
	mutex_enter(&so->so_lock);

	/*
	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
	 */
	so_flush_discon_ind(so);
	so_unlock_read(so);	/* Clear SOREADLOCKED */

	so_unlock_single(so, SOLOCKED);
	mutex_exit(&so->so_lock);
}
367 
368 /*
369  * Install the hooks in the stream head.
370  */
371 void
372 so_installhooks(struct sonode *so)
373 {
374 	struct vnode *vp = SOTOV(so);
375 
376 	strsetrputhooks(vp, SH_SIGALLDATA | SH_IGN_ZEROLEN | SH_CONSOL_DATA,
377 	    strsock_proto, strsock_misc);
378 	strsetwputhooks(vp, SH_SIGPIPE | SH_RECHECK_ERR, 0);
379 }
380 
381 /*
382  * Remove the hooks in the stream head.
383  */
384 static void
385 so_removehooks(struct sonode *so)
386 {
387 	struct vnode *vp = SOTOV(so);
388 
389 	strsetrputhooks(vp, 0, NULL, NULL);
390 	strsetwputhooks(vp, 0, STRTIMOUT);
391 	/*
392 	 * Leave read behavior as it would have been for a normal
393 	 * stream i.e. a read of an M_PROTO will fail.
394 	 */
395 }
396 
397 void
398 so_basic_strinit(struct sonode *so)
399 {
400 	struct vnode *vp = SOTOV(so);
401 	struct stdata *stp;
402 	mblk_t *mp;
403 	sotpi_info_t *sti = SOTOTPI(so);
404 
405 	/* Preallocate an unbind_req message */
406 	mp = soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP, CRED());
407 	mutex_enter(&so->so_lock);
408 	sti->sti_unbind_mp = mp;
409 #ifdef DEBUG
410 	so->so_options = so_default_options;
411 #endif /* DEBUG */
412 	mutex_exit(&so->so_lock);
413 
414 	so_installhooks(so);
415 
416 	stp = vp->v_stream;
417 	/*
418 	 * Have to keep minpsz at zero in order to allow write/send of zero
419 	 * bytes.
420 	 */
421 	mutex_enter(&stp->sd_lock);
422 	if (stp->sd_qn_minpsz == 1)
423 		stp->sd_qn_minpsz = 0;
424 	mutex_exit(&stp->sd_lock);
425 
426 	/*
427 	 * If sodirect capable allocate and initialize sodirect_t.
428 	 * Note, SS_SODIRECT is set in socktpi_open().
429 	 */
430 	if ((so->so_state & SS_SODIRECT) &&
431 	    !(so->so_state & SS_FALLBACK_PENDING)) {
432 		sod_sock_init(so, stp, sodput, sodwakeup, &stp->sd_lock);
433 	}
434 }
435 
436 /*
437  * Initialize the streams side of a socket including
438  * T_info_req/ack processing. If tso is not NULL its values are used thereby
439  * avoiding the T_INFO_REQ.
440  */
441 int
442 so_strinit(struct sonode *so, struct sonode *tso)
443 {
444 	sotpi_info_t *sti = SOTOTPI(so);
445 	sotpi_info_t *tsti;
446 	int error;
447 
448 	so_basic_strinit(so);
449 
450 	/*
451 	 * The T_CAPABILITY_REQ should be the first message sent down because
452 	 * at least TCP has a fast-path for this which avoids timeouts while
453 	 * waiting for the T_CAPABILITY_ACK under high system load.
454 	 */
455 	if (tso == NULL) {
456 		error = do_tcapability(so, TC1_ACCEPTOR_ID | TC1_INFO);
457 		if (error)
458 			return (error);
459 	} else {
460 		tsti = SOTOTPI(tso);
461 
462 		mutex_enter(&so->so_lock);
463 		sti->sti_tsdu_size = tsti->sti_tsdu_size;
464 		sti->sti_etsdu_size = tsti->sti_etsdu_size;
465 		sti->sti_addr_size = tsti->sti_addr_size;
466 		sti->sti_opt_size = tsti->sti_opt_size;
467 		sti->sti_tidu_size = tsti->sti_tidu_size;
468 		sti->sti_serv_type = tsti->sti_serv_type;
469 		so->so_mode = tso->so_mode & ~SM_ACCEPTOR_ID;
470 		mutex_exit(&so->so_lock);
471 
472 		/* the following do_tcapability may update so->so_mode */
473 		if ((tsti->sti_serv_type != T_CLTS) &&
474 		    (sti->sti_direct == 0)) {
475 			error = do_tcapability(so, TC1_ACCEPTOR_ID);
476 			if (error)
477 				return (error);
478 		}
479 	}
480 	/*
481 	 * If the addr_size is 0 we treat it as already bound
482 	 * and connected. This is used by the routing socket.
483 	 * We set the addr_size to something to allocate a the address
484 	 * structures.
485 	 */
486 	if (sti->sti_addr_size == 0) {
487 		so->so_state |= SS_ISBOUND | SS_ISCONNECTED;
488 		/* Address size can vary with address families. */
489 		if (so->so_family == AF_INET6)
490 			sti->sti_addr_size =
491 			    (t_scalar_t)sizeof (struct sockaddr_in6);
492 		else
493 			sti->sti_addr_size =
494 			    (t_scalar_t)sizeof (struct sockaddr_in);
495 		ASSERT(sti->sti_unbind_mp);
496 	}
497 
498 	so_alloc_addr(so, sti->sti_addr_size);
499 
500 	return (0);
501 }
502 
503 static void
504 copy_tinfo(struct sonode *so, struct T_info_ack *tia)
505 {
506 	sotpi_info_t *sti = SOTOTPI(so);
507 
508 	sti->sti_tsdu_size = tia->TSDU_size;
509 	sti->sti_etsdu_size = tia->ETSDU_size;
510 	sti->sti_addr_size = tia->ADDR_size;
511 	sti->sti_opt_size = tia->OPT_size;
512 	sti->sti_tidu_size = tia->TIDU_size;
513 	sti->sti_serv_type = tia->SERV_type;
514 	switch (tia->CURRENT_state) {
515 	case TS_UNBND:
516 		break;
517 	case TS_IDLE:
518 		so->so_state |= SS_ISBOUND;
519 		sti->sti_laddr_len = 0;
520 		sti->sti_laddr_valid = 0;
521 		break;
522 	case TS_DATA_XFER:
523 		so->so_state |= SS_ISBOUND|SS_ISCONNECTED;
524 		sti->sti_laddr_len = 0;
525 		sti->sti_faddr_len = 0;
526 		sti->sti_laddr_valid = 0;
527 		sti->sti_faddr_valid = 0;
528 		break;
529 	}
530 
531 	/*
532 	 * Heuristics for determining the socket mode flags
533 	 * (SM_ATOMIC, SM_CONNREQUIRED, SM_ADDR, SM_FDPASSING,
534 	 * and SM_EXDATA, SM_OPTDATA, and SM_BYTESTREAM)
535 	 * from the info ack.
536 	 */
537 	if (sti->sti_serv_type == T_CLTS) {
538 		so->so_mode |= SM_ATOMIC | SM_ADDR;
539 	} else {
540 		so->so_mode |= SM_CONNREQUIRED;
541 		if (sti->sti_etsdu_size != 0 && sti->sti_etsdu_size != -2)
542 			so->so_mode |= SM_EXDATA;
543 	}
544 	if (so->so_type == SOCK_SEQPACKET || so->so_type == SOCK_RAW) {
545 		/* Semantics are to discard tail end of messages */
546 		so->so_mode |= SM_ATOMIC;
547 	}
548 	if (so->so_family == AF_UNIX) {
549 		so->so_mode |= SM_FDPASSING | SM_OPTDATA;
550 		if (sti->sti_addr_size == -1) {
551 			/* MAXPATHLEN + soun_family + nul termination */
552 			sti->sti_addr_size = (t_scalar_t)(MAXPATHLEN +
553 			    sizeof (short) + 1);
554 		}
555 		if (so->so_type == SOCK_STREAM) {
556 			/*
557 			 * Make it into a byte-stream transport.
558 			 * SOCK_SEQPACKET sockets are unchanged.
559 			 */
560 			sti->sti_tsdu_size = 0;
561 		}
562 	} else if (sti->sti_addr_size == -1) {
563 		/*
564 		 * Logic extracted from sockmod - have to pick some max address
565 		 * length in order to preallocate the addresses.
566 		 */
567 		sti->sti_addr_size = SOA_DEFSIZE;
568 	}
569 	if (sti->sti_tsdu_size == 0)
570 		so->so_mode |= SM_BYTESTREAM;
571 }
572 
573 static int
574 check_tinfo(struct sonode *so)
575 {
576 	sotpi_info_t *sti = SOTOTPI(so);
577 
578 	/* Consistency checks */
579 	if (so->so_type == SOCK_DGRAM && sti->sti_serv_type != T_CLTS) {
580 		eprintso(so, ("service type and socket type mismatch\n"));
581 		eprintsoline(so, EPROTO);
582 		return (EPROTO);
583 	}
584 	if (so->so_type == SOCK_STREAM && sti->sti_serv_type == T_CLTS) {
585 		eprintso(so, ("service type and socket type mismatch\n"));
586 		eprintsoline(so, EPROTO);
587 		return (EPROTO);
588 	}
589 	if (so->so_type == SOCK_SEQPACKET && sti->sti_serv_type == T_CLTS) {
590 		eprintso(so, ("service type and socket type mismatch\n"));
591 		eprintsoline(so, EPROTO);
592 		return (EPROTO);
593 	}
594 	if (so->so_family == AF_INET &&
595 	    sti->sti_addr_size != (t_scalar_t)sizeof (struct sockaddr_in)) {
596 		eprintso(so,
597 		    ("AF_INET must have sockaddr_in address length. Got %d\n",
598 		    sti->sti_addr_size));
599 		eprintsoline(so, EMSGSIZE);
600 		return (EMSGSIZE);
601 	}
602 	if (so->so_family == AF_INET6 &&
603 	    sti->sti_addr_size != (t_scalar_t)sizeof (struct sockaddr_in6)) {
604 		eprintso(so,
605 		    ("AF_INET6 must have sockaddr_in6 address length. Got %d\n",
606 		    sti->sti_addr_size));
607 		eprintsoline(so, EMSGSIZE);
608 		return (EMSGSIZE);
609 	}
610 
611 	dprintso(so, 1, (
612 	    "tinfo: serv %d tsdu %d, etsdu %d, addr %d, opt %d, tidu %d\n",
613 	    sti->sti_serv_type, sti->sti_tsdu_size, sti->sti_etsdu_size,
614 	    sti->sti_addr_size, sti->sti_opt_size,
615 	    sti->sti_tidu_size));
616 	dprintso(so, 1, ("tinfo: so_state %s\n",
617 	    pr_state(so->so_state, so->so_mode)));
618 	return (0);
619 }
620 
621 /*
622  * Send down T_info_req and wait for the ack.
623  * Record interesting T_info_ack values in the sonode.
624  */
625 static int
626 do_tinfo(struct sonode *so)
627 {
628 	struct T_info_req tir;
629 	mblk_t *mp;
630 	int error;
631 
632 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
633 
634 	if (so_no_tinfo) {
635 		SOTOTPI(so)->sti_addr_size = 0;
636 		return (0);
637 	}
638 
639 	dprintso(so, 1, ("do_tinfo(%p)\n", (void *)so));
640 
641 	/* Send T_INFO_REQ */
642 	tir.PRIM_type = T_INFO_REQ;
643 	mp = soallocproto1(&tir, sizeof (tir),
644 	    sizeof (struct T_info_req) + sizeof (struct T_info_ack),
645 	    _ALLOC_INTR, CRED());
646 	if (mp == NULL) {
647 		eprintsoline(so, ENOBUFS);
648 		return (ENOBUFS);
649 	}
650 	/* T_INFO_REQ has to be M_PCPROTO */
651 	DB_TYPE(mp) = M_PCPROTO;
652 
653 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
654 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
655 	if (error) {
656 		eprintsoline(so, error);
657 		return (error);
658 	}
659 	mutex_enter(&so->so_lock);
660 	/* Wait for T_INFO_ACK */
661 	if ((error = sowaitprim(so, T_INFO_REQ, T_INFO_ACK,
662 	    (t_uscalar_t)sizeof (struct T_info_ack), &mp, 0))) {
663 		mutex_exit(&so->so_lock);
664 		eprintsoline(so, error);
665 		return (error);
666 	}
667 
668 	ASSERT(mp);
669 	copy_tinfo(so, (struct T_info_ack *)mp->b_rptr);
670 	mutex_exit(&so->so_lock);
671 	freemsg(mp);
672 	return (check_tinfo(so));
673 }
674 
675 /*
676  * Send down T_capability_req and wait for the ack.
677  * Record interesting T_capability_ack values in the sonode.
678  */
679 static int
680 do_tcapability(struct sonode *so, t_uscalar_t cap_bits1)
681 {
682 	struct T_capability_req tcr;
683 	struct T_capability_ack *tca;
684 	mblk_t *mp;
685 	int error;
686 	sotpi_info_t *sti = SOTOTPI(so);
687 
688 	ASSERT(cap_bits1 != 0);
689 	ASSERT((cap_bits1 & ~(TC1_ACCEPTOR_ID | TC1_INFO)) == 0);
690 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
691 
692 	if (sti->sti_provinfo->tpi_capability == PI_NO)
693 		return (do_tinfo(so));
694 
695 	if (so_no_tinfo) {
696 		sti->sti_addr_size = 0;
697 		if ((cap_bits1 &= ~TC1_INFO) == 0)
698 			return (0);
699 	}
700 
701 	dprintso(so, 1, ("do_tcapability(%p)\n", (void *)so));
702 
703 	/* Send T_CAPABILITY_REQ */
704 	tcr.PRIM_type = T_CAPABILITY_REQ;
705 	tcr.CAP_bits1 = cap_bits1;
706 	mp = soallocproto1(&tcr, sizeof (tcr),
707 	    sizeof (struct T_capability_req) + sizeof (struct T_capability_ack),
708 	    _ALLOC_INTR, CRED());
709 	if (mp == NULL) {
710 		eprintsoline(so, ENOBUFS);
711 		return (ENOBUFS);
712 	}
713 	/* T_CAPABILITY_REQ should be M_PCPROTO here */
714 	DB_TYPE(mp) = M_PCPROTO;
715 
716 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
717 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
718 	if (error) {
719 		eprintsoline(so, error);
720 		return (error);
721 	}
722 	mutex_enter(&so->so_lock);
723 	/* Wait for T_CAPABILITY_ACK */
724 	if ((error = sowaitprim(so, T_CAPABILITY_REQ, T_CAPABILITY_ACK,
725 	    (t_uscalar_t)sizeof (*tca), &mp, sock_capability_timeout * hz))) {
726 		mutex_exit(&so->so_lock);
727 		PI_PROVLOCK(sti->sti_provinfo);
728 		if (sti->sti_provinfo->tpi_capability == PI_DONTKNOW)
729 			sti->sti_provinfo->tpi_capability = PI_NO;
730 		PI_PROVUNLOCK(sti->sti_provinfo);
731 		ASSERT((so->so_mode & SM_ACCEPTOR_ID) == 0);
732 		if (cap_bits1 & TC1_INFO) {
733 			/*
734 			 * If the T_CAPABILITY_REQ timed out and then a
735 			 * T_INFO_REQ gets a protocol error, most likely
736 			 * the capability was slow (vs. unsupported). Return
737 			 * ENOSR for this case as a best guess.
738 			 */
739 			if (error == ETIME) {
740 				return ((error = do_tinfo(so)) == EPROTO ?
741 				    ENOSR : error);
742 			}
743 			return (do_tinfo(so));
744 		}
745 		return (0);
746 	}
747 
748 	ASSERT(mp);
749 	tca = (struct T_capability_ack *)mp->b_rptr;
750 
751 	ASSERT((cap_bits1 & TC1_INFO) == (tca->CAP_bits1 & TC1_INFO));
752 	so_proc_tcapability_ack(so, tca);
753 
754 	cap_bits1 = tca->CAP_bits1;
755 
756 	mutex_exit(&so->so_lock);
757 	freemsg(mp);
758 
759 	if (cap_bits1 & TC1_INFO)
760 		return (check_tinfo(so));
761 
762 	return (0);
763 }
764 
765 /*
766  * Process a T_CAPABILITY_ACK
767  */
768 void
769 so_proc_tcapability_ack(struct sonode *so, struct T_capability_ack *tca)
770 {
771 	sotpi_info_t *sti = SOTOTPI(so);
772 
773 	if (sti->sti_provinfo->tpi_capability == PI_DONTKNOW) {
774 		PI_PROVLOCK(sti->sti_provinfo);
775 		sti->sti_provinfo->tpi_capability = PI_YES;
776 		PI_PROVUNLOCK(sti->sti_provinfo);
777 	}
778 
779 	if (tca->CAP_bits1 & TC1_ACCEPTOR_ID) {
780 		sti->sti_acceptor_id = tca->ACCEPTOR_id;
781 		so->so_mode |= SM_ACCEPTOR_ID;
782 	}
783 
784 	if (tca->CAP_bits1 & TC1_INFO)
785 		copy_tinfo(so, &tca->INFO_ack);
786 }
787 
788 /*
789  * Retrieve socket error, clear error if not peek.
790  */
791 int
792 sogeterr(struct sonode *so, boolean_t clear_err)
793 {
794 	int error;
795 
796 	ASSERT(MUTEX_HELD(&so->so_lock));
797 
798 	error = so->so_error;
799 	if (clear_err)
800 		so->so_error = 0;
801 
802 	return (error);
803 }
804 
805 /*
806  * This routine is registered with the stream head to retrieve read
807  * side errors.
808  * It does not clear the socket error for a peeking read side operation.
809  * It the error is to be cleared it sets *clearerr.
810  */
811 int
812 sogetrderr(vnode_t *vp, int ispeek, int *clearerr)
813 {
814 	struct sonode *so = VTOSO(vp);
815 	int error;
816 
817 	mutex_enter(&so->so_lock);
818 	if (ispeek) {
819 		error = so->so_error;
820 		*clearerr = 0;
821 	} else {
822 		error = so->so_error;
823 		so->so_error = 0;
824 		*clearerr = 1;
825 	}
826 	mutex_exit(&so->so_lock);
827 	return (error);
828 }
829 
830 /*
831  * This routine is registered with the stream head to retrieve write
832  * side errors.
833  * It does not clear the socket error for a peeking read side operation.
834  * It the error is to be cleared it sets *clearerr.
835  */
836 int
837 sogetwrerr(vnode_t *vp, int ispeek, int *clearerr)
838 {
839 	struct sonode *so = VTOSO(vp);
840 	int error;
841 
842 	mutex_enter(&so->so_lock);
843 	if (so->so_state & SS_CANTSENDMORE) {
844 		error = EPIPE;
845 		*clearerr = 0;
846 	} else {
847 		error = so->so_error;
848 		if (ispeek) {
849 			*clearerr = 0;
850 		} else {
851 			so->so_error = 0;
852 			*clearerr = 1;
853 		}
854 	}
855 	mutex_exit(&so->so_lock);
856 	return (error);
857 }
858 
859 /*
860  * Set a nonpersistent read and write error on the socket.
861  * Used when there is a T_uderror_ind for a connected socket.
862  * The caller also needs to call strsetrerror and strsetwerror
863  * after dropping the lock.
864  */
865 void
866 soseterror(struct sonode *so, int error)
867 {
868 	ASSERT(error != 0);
869 
870 	ASSERT(MUTEX_HELD(&so->so_lock));
871 	so->so_error = (ushort_t)error;
872 }
873 
874 void
875 soisconnecting(struct sonode *so)
876 {
877 	ASSERT(MUTEX_HELD(&so->so_lock));
878 	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
879 	so->so_state |= SS_ISCONNECTING;
880 	cv_broadcast(&so->so_state_cv);
881 }
882 
883 void
884 soisconnected(struct sonode *so)
885 {
886 	ASSERT(MUTEX_HELD(&so->so_lock));
887 	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
888 	so->so_state |= SS_ISCONNECTED;
889 	cv_broadcast(&so->so_state_cv);
890 }
891 
892 /*
893  * The caller also needs to call strsetrerror, strsetwerror and strseteof.
894  */
895 void
896 soisdisconnected(struct sonode *so, int error)
897 {
898 	ASSERT(MUTEX_HELD(&so->so_lock));
899 	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
900 	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE);
901 	so->so_error = (ushort_t)error;
902 	if (so->so_peercred != NULL) {
903 		crfree(so->so_peercred);
904 		so->so_peercred = NULL;
905 	}
906 	cv_broadcast(&so->so_state_cv);
907 }
908 
909 /*
910  * For connected AF_UNIX SOCK_DGRAM sockets when the peer closes.
911  * Does not affect write side.
912  * The caller also has to call strsetrerror.
913  */
914 static void
915 sobreakconn(struct sonode *so, int error)
916 {
917 	ASSERT(MUTEX_HELD(&so->so_lock));
918 	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
919 	so->so_error = (ushort_t)error;
920 	cv_broadcast(&so->so_state_cv);
921 }
922 
923 /*
924  * Can no longer send.
925  * Caller must also call strsetwerror.
926  *
927  * We mark the peer address as no longer valid for getpeername, but
928  * leave it around for so_unix_close to notify the peer (that
929  * transport has no addressing held at that layer).
930  */
931 void
932 socantsendmore(struct sonode *so)
933 {
934 	ASSERT(MUTEX_HELD(&so->so_lock));
935 	so->so_state |= SS_CANTSENDMORE;
936 	cv_broadcast(&so->so_state_cv);
937 }
938 
939 /*
940  * The caller must call strseteof(,1) as well as this routine
941  * to change the socket state.
942  */
943 void
944 socantrcvmore(struct sonode *so)
945 {
946 	ASSERT(MUTEX_HELD(&so->so_lock));
947 	so->so_state |= SS_CANTRCVMORE;
948 	cv_broadcast(&so->so_state_cv);
949 }
950 
951 /*
952  * The caller has sent down a "request_prim" primitive and wants to wait for
953  * an ack ("ack_prim") or an T_ERROR_ACK for it.
954  * The specified "ack_prim" can be a T_OK_ACK.
955  *
956  * Assumes that all the TPI acks are M_PCPROTO messages.
957  *
958  * Note that the socket is single-threaded (using so_lock_single)
959  * for all operations that generate TPI ack messages. Since
960  * only TPI ack messages are M_PCPROTO we should never receive
961  * anything except either the ack we are expecting or a T_ERROR_ACK
962  * for the same primitive.
963  */
964 int
965 sowaitprim(struct sonode *so, t_scalar_t request_prim, t_scalar_t ack_prim,
966 	    t_uscalar_t min_size, mblk_t **mpp, clock_t wait)
967 {
968 	mblk_t *mp;
969 	union T_primitives *tpr;
970 	int error;
971 
972 	dprintso(so, 1, ("sowaitprim(%p, %d, %d, %d, %p, %lu)\n",
973 	    (void *)so, request_prim, ack_prim, min_size, (void *)mpp, wait));
974 
975 	ASSERT(MUTEX_HELD(&so->so_lock));
976 
977 	error = sowaitack(so, &mp, wait);
978 	if (error)
979 		return (error);
980 
981 	dprintso(so, 1, ("got msg %p\n", (void *)mp));
982 	if (DB_TYPE(mp) != M_PCPROTO ||
983 	    MBLKL(mp) < sizeof (tpr->type)) {
984 		freemsg(mp);
985 		eprintsoline(so, EPROTO);
986 		return (EPROTO);
987 	}
988 	tpr = (union T_primitives *)mp->b_rptr;
989 	/*
990 	 * Did we get the primitive that we were asking for?
991 	 * For T_OK_ACK we also check that it matches the request primitive.
992 	 */
993 	if (tpr->type == ack_prim &&
994 	    (ack_prim != T_OK_ACK ||
995 	    tpr->ok_ack.CORRECT_prim == request_prim)) {
996 		if (MBLKL(mp) >= (ssize_t)min_size) {
997 			/* Found what we are looking for */
998 			*mpp = mp;
999 			return (0);
1000 		}
1001 		/* Too short */
1002 		freemsg(mp);
1003 		eprintsoline(so, EPROTO);
1004 		return (EPROTO);
1005 	}
1006 
1007 	if (tpr->type == T_ERROR_ACK &&
1008 	    tpr->error_ack.ERROR_prim == request_prim) {
1009 		/* Error to the primitive we were looking for */
1010 		if (tpr->error_ack.TLI_error == TSYSERR) {
1011 			error = tpr->error_ack.UNIX_error;
1012 		} else {
1013 			error = proto_tlitosyserr(tpr->error_ack.TLI_error);
1014 		}
1015 		dprintso(so, 0, ("error_ack for %d: %d/%d ->%d\n",
1016 		    tpr->error_ack.ERROR_prim, tpr->error_ack.TLI_error,
1017 		    tpr->error_ack.UNIX_error, error));
1018 		freemsg(mp);
1019 		return (error);
1020 	}
1021 	/*
1022 	 * Wrong primitive or T_ERROR_ACK for the wrong primitive
1023 	 */
1024 #ifdef DEBUG
1025 	if (tpr->type == T_ERROR_ACK) {
1026 		dprintso(so, 0, ("error_ack for %d: %d/%d\n",
1027 		    tpr->error_ack.ERROR_prim, tpr->error_ack.TLI_error,
1028 		    tpr->error_ack.UNIX_error));
1029 	} else if (tpr->type == T_OK_ACK) {
1030 		dprintso(so, 0, ("ok_ack for %d, expected %d for %d\n",
1031 		    tpr->ok_ack.CORRECT_prim, ack_prim, request_prim));
1032 	} else {
1033 		dprintso(so, 0,
1034 		    ("unexpected primitive %d, expected %d for %d\n",
1035 		    tpr->type, ack_prim, request_prim));
1036 	}
1037 #endif /* DEBUG */
1038 
1039 	freemsg(mp);
1040 	eprintsoline(so, EPROTO);
1041 	return (EPROTO);
1042 }
1043 
1044 /*
1045  * Wait for a T_OK_ACK for the specified primitive.
1046  */
1047 int
1048 sowaitokack(struct sonode *so, t_scalar_t request_prim)
1049 {
1050 	mblk_t *mp;
1051 	int error;
1052 
1053 	error = sowaitprim(so, request_prim, T_OK_ACK,
1054 	    (t_uscalar_t)sizeof (struct T_ok_ack), &mp, 0);
1055 	if (error)
1056 		return (error);
1057 	freemsg(mp);
1058 	return (0);
1059 }
1060 
1061 /*
1062  * Queue a received TPI ack message on sti_ack_mp.
1063  */
1064 void
1065 soqueueack(struct sonode *so, mblk_t *mp)
1066 {
1067 	sotpi_info_t *sti = SOTOTPI(so);
1068 
1069 	if (DB_TYPE(mp) != M_PCPROTO) {
1070 		zcmn_err(getzoneid(), CE_WARN,
1071 		    "sockfs: received unexpected M_PROTO TPI ack. Prim %d\n",
1072 		    *(t_scalar_t *)mp->b_rptr);
1073 		freemsg(mp);
1074 		return;
1075 	}
1076 
1077 	mutex_enter(&so->so_lock);
1078 	if (sti->sti_ack_mp != NULL) {
1079 		dprintso(so, 1, ("sti_ack_mp already set\n"));
1080 		freemsg(sti->sti_ack_mp);
1081 		sti->sti_ack_mp = NULL;
1082 	}
1083 	sti->sti_ack_mp = mp;
1084 	cv_broadcast(&sti->sti_ack_cv);
1085 	mutex_exit(&so->so_lock);
1086 }
1087 
1088 /*
1089  * Wait for a TPI ack ignoring signals and errors.
1090  */
1091 int
1092 sowaitack(struct sonode *so, mblk_t **mpp, clock_t wait)
1093 {
1094 	sotpi_info_t *sti = SOTOTPI(so);
1095 
1096 	ASSERT(MUTEX_HELD(&so->so_lock));
1097 
1098 	while (sti->sti_ack_mp == NULL) {
1099 #ifdef SOCK_TEST
1100 		if (wait == 0 && sock_test_timelimit != 0)
1101 			wait = sock_test_timelimit;
1102 #endif
1103 		if (wait != 0) {
1104 			/*
1105 			 * Only wait for the time limit.
1106 			 */
1107 			clock_t now;
1108 
1109 			time_to_wait(&now, wait);
1110 			if (cv_timedwait(&sti->sti_ack_cv, &so->so_lock,
1111 			    now) == -1) {
1112 				eprintsoline(so, ETIME);
1113 				return (ETIME);
1114 			}
1115 		}
1116 		else
1117 			cv_wait(&sti->sti_ack_cv, &so->so_lock);
1118 	}
1119 	*mpp = sti->sti_ack_mp;
1120 #ifdef DEBUG
1121 	{
1122 		union T_primitives *tpr;
1123 		mblk_t *mp = *mpp;
1124 
1125 		tpr = (union T_primitives *)mp->b_rptr;
1126 		ASSERT(DB_TYPE(mp) == M_PCPROTO);
1127 		ASSERT(tpr->type == T_OK_ACK ||
1128 		    tpr->type == T_ERROR_ACK ||
1129 		    tpr->type == T_BIND_ACK ||
1130 		    tpr->type == T_CAPABILITY_ACK ||
1131 		    tpr->type == T_INFO_ACK ||
1132 		    tpr->type == T_OPTMGMT_ACK);
1133 	}
1134 #endif /* DEBUG */
1135 	sti->sti_ack_mp = NULL;
1136 	return (0);
1137 }
1138 
1139 /*
1140  * Queue a received T_CONN_IND message on sti_conn_ind_head/tail.
1141  */
1142 void
1143 soqueueconnind(struct sonode *so, mblk_t *mp)
1144 {
1145 	sotpi_info_t *sti = SOTOTPI(so);
1146 
1147 	if (DB_TYPE(mp) != M_PROTO) {
1148 		zcmn_err(getzoneid(), CE_WARN,
1149 		    "sockfs: received unexpected M_PCPROTO T_CONN_IND\n");
1150 		freemsg(mp);
1151 		return;
1152 	}
1153 
1154 	mutex_enter(&so->so_lock);
1155 	ASSERT(mp->b_next == NULL);
1156 	if (sti->sti_conn_ind_head == NULL) {
1157 		sti->sti_conn_ind_head = mp;
1158 	} else {
1159 		ASSERT(sti->sti_conn_ind_tail->b_next == NULL);
1160 		sti->sti_conn_ind_tail->b_next = mp;
1161 	}
1162 	sti->sti_conn_ind_tail = mp;
1163 	/* Wakeup a single consumer of the T_CONN_IND */
1164 	cv_signal(&so->so_acceptq_cv);
1165 	mutex_exit(&so->so_lock);
1166 }
1167 
1168 /*
1169  * Wait for a T_CONN_IND.
1170  * Don't wait if nonblocking.
1171  * Accept signals and socket errors.
1172  */
1173 int
1174 sowaitconnind(struct sonode *so, int fmode, mblk_t **mpp)
1175 {
1176 	mblk_t *mp;
1177 	sotpi_info_t *sti = SOTOTPI(so);
1178 	int error = 0;
1179 
1180 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1181 	mutex_enter(&so->so_lock);
1182 check_error:
1183 	if (so->so_error) {
1184 		error = sogeterr(so, B_TRUE);
1185 		if (error) {
1186 			mutex_exit(&so->so_lock);
1187 			return (error);
1188 		}
1189 	}
1190 
1191 	if (sti->sti_conn_ind_head == NULL) {
1192 		if (fmode & (FNDELAY|FNONBLOCK)) {
1193 			error = EWOULDBLOCK;
1194 			goto done;
1195 		}
1196 
1197 		if (so->so_state & SS_CLOSING) {
1198 			error = EINTR;
1199 			goto done;
1200 		}
1201 
1202 		if (!cv_wait_sig_swap(&so->so_acceptq_cv, &so->so_lock)) {
1203 			error = EINTR;
1204 			goto done;
1205 		}
1206 		goto check_error;
1207 	}
1208 	mp = sti->sti_conn_ind_head;
1209 	sti->sti_conn_ind_head = mp->b_next;
1210 	mp->b_next = NULL;
1211 	if (sti->sti_conn_ind_head == NULL) {
1212 		ASSERT(sti->sti_conn_ind_tail == mp);
1213 		sti->sti_conn_ind_tail = NULL;
1214 	}
1215 	*mpp = mp;
1216 done:
1217 	mutex_exit(&so->so_lock);
1218 	return (error);
1219 }
1220 
1221 /*
1222  * Flush a T_CONN_IND matching the sequence number from the list.
1223  * Return zero if found; non-zero otherwise.
1224  * This is called very infrequently thus it is ok to do a linear search.
1225  */
1226 int
1227 soflushconnind(struct sonode *so, t_scalar_t seqno)
1228 {
1229 	mblk_t *prevmp, *mp;
1230 	struct T_conn_ind *tci;
1231 	sotpi_info_t *sti = SOTOTPI(so);
1232 
1233 	mutex_enter(&so->so_lock);
1234 	for (prevmp = NULL, mp = sti->sti_conn_ind_head; mp != NULL;
1235 	    prevmp = mp, mp = mp->b_next) {
1236 		tci = (struct T_conn_ind *)mp->b_rptr;
1237 		if (tci->SEQ_number == seqno) {
1238 			dprintso(so, 1,
1239 			    ("t_discon_ind: found T_CONN_IND %d\n", seqno));
1240 			/* Deleting last? */
1241 			if (sti->sti_conn_ind_tail == mp) {
1242 				sti->sti_conn_ind_tail = prevmp;
1243 			}
1244 			if (prevmp == NULL) {
1245 				/* Deleting first */
1246 				sti->sti_conn_ind_head = mp->b_next;
1247 			} else {
1248 				prevmp->b_next = mp->b_next;
1249 			}
1250 			mp->b_next = NULL;
1251 
1252 			ASSERT((sti->sti_conn_ind_head == NULL &&
1253 			    sti->sti_conn_ind_tail == NULL) ||
1254 			    (sti->sti_conn_ind_head != NULL &&
1255 			    sti->sti_conn_ind_tail != NULL));
1256 
1257 			so->so_error = ECONNABORTED;
1258 			mutex_exit(&so->so_lock);
1259 
1260 			/*
1261 			 * T_KSSL_PROXY_CONN_IND may carry a handle for
1262 			 * an SSL context, and needs to be released.
1263 			 */
1264 			if ((tci->PRIM_type == T_SSL_PROXY_CONN_IND) &&
1265 			    (mp->b_cont != NULL)) {
1266 				kssl_ctx_t kssl_ctx;
1267 
1268 				ASSERT(MBLKL(mp->b_cont) ==
1269 				    sizeof (kssl_ctx_t));
1270 				kssl_ctx = *((kssl_ctx_t *)mp->b_cont->b_rptr);
1271 				kssl_release_ctx(kssl_ctx);
1272 			}
1273 			freemsg(mp);
1274 			return (0);
1275 		}
1276 	}
1277 	mutex_exit(&so->so_lock);
1278 	dprintso(so, 1,	("t_discon_ind: NOT found T_CONN_IND %d\n", seqno));
1279 	return (-1);
1280 }
1281 
/*
 * Sleep until the socket leaves the "connecting" state or an error is
 * posted. The caller must hold so_lock.
 *
 * fmode carries the non-blocking flags: if FNDELAY or FNONBLOCK is set
 * while the connect is still in progress, EINPROGRESS is returned
 * instead of sleeping. A non-zero nosig requests a sleep that signals
 * cannot interrupt; otherwise an interrupted sleep returns EINTR.
 * EINTR is also returned if the socket is closing.
 *
 * Returns 0 once connected, the pending socket error if one was
 * posted, or ECONNREFUSED if the socket ended up neither connected nor
 * with an error.
 */
int
sowaitconnected(struct sonode *so, int fmode, int nosig)
{
	int error;

	ASSERT(MUTEX_HELD(&so->so_lock));

	for (;;) {
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) !=
		    SS_ISCONNECTING)
			break;
		if (so->so_error != 0)
			break;

		dprintso(so, 1, ("waiting for SS_ISCONNECTED on %p\n",
		    (void *)so));
		if (fmode & (FNDELAY|FNONBLOCK))
			return (EINPROGRESS);

		if (so->so_state & SS_CLOSING)
			return (EINTR);

		if (nosig) {
			cv_wait(&so->so_state_cv, &so->so_lock);
		} else if (cv_wait_sig_swap(&so->so_state_cv,
		    &so->so_lock) == 0) {
			/*
			 * Interrupted by a signal. The application can
			 * use nonblocking techniques to find out when
			 * the connection has been established.
			 */
			return (EINTR);
		}
		dprintso(so, 1, ("awoken on %p\n", (void *)so));
	}

	if (so->so_error != 0) {
		error = sogeterr(so, B_TRUE);
		ASSERT(error != 0);
	} else if (!(so->so_state & SS_ISCONNECTED)) {
		/*
		 * Could have received a T_ORDREL_IND or a T_DISCON_IND with
		 * zero errno. Or another thread could have consumed so_error
		 * e.g. by calling read.
		 */
		error = ECONNREFUSED;
	} else {
		return (0);
	}

	dprintso(so, 1, ("sowaitconnected: error %d\n", error));
	return (error);
}
1336 
1337 
1338 /*
1339  * Handle the signal generation aspect of urgent data.
1340  */
1341 static void
1342 so_oob_sig(struct sonode *so, int extrasig,
1343     strsigset_t *signals, strpollset_t *pollwakeups)
1344 {
1345 	sotpi_info_t *sti = SOTOTPI(so);
1346 
1347 	ASSERT(MUTEX_HELD(&so->so_lock));
1348 
1349 	ASSERT(so_verify_oobstate(so));
1350 	ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
1351 	if (sti->sti_oobsigcnt > sti->sti_oobcnt) {
1352 		/*
1353 		 * Signal has already been generated once for this
1354 		 * urgent "event". However, since TCP can receive updated
1355 		 * urgent pointers we still generate a signal.
1356 		 */
1357 		ASSERT(so->so_state & SS_OOBPEND);
1358 		if (extrasig) {
1359 			*signals |= S_RDBAND;
1360 			*pollwakeups |= POLLRDBAND;
1361 		}
1362 		return;
1363 	}
1364 
1365 	sti->sti_oobsigcnt++;
1366 	ASSERT(sti->sti_oobsigcnt > 0);	/* Wraparound */
1367 	ASSERT(sti->sti_oobsigcnt > sti->sti_oobcnt);
1368 
1369 	/*
1370 	 * Record (for select/poll) that urgent data is pending.
1371 	 */
1372 	so->so_state |= SS_OOBPEND;
1373 	/*
1374 	 * New urgent data on the way so forget about any old
1375 	 * urgent data.
1376 	 */
1377 	so->so_state &= ~(SS_HAVEOOBDATA|SS_HADOOBDATA);
1378 	if (so->so_oobmsg != NULL) {
1379 		dprintso(so, 1, ("sock: discarding old oob\n"));
1380 		freemsg(so->so_oobmsg);
1381 		so->so_oobmsg = NULL;
1382 	}
1383 	*signals |= S_RDBAND;
1384 	*pollwakeups |= POLLRDBAND;
1385 	ASSERT(so_verify_oobstate(so));
1386 }
1387 
1388 /*
1389  * Handle the processing of the T_EXDATA_IND with urgent data.
1390  * Returns the T_EXDATA_IND if it should be queued on the read queue.
1391  */
1392 /* ARGSUSED2 */
1393 static mblk_t *
1394 so_oob_exdata(struct sonode *so, mblk_t *mp,
1395 	strsigset_t *signals, strpollset_t *pollwakeups)
1396 {
1397 	sotpi_info_t *sti = SOTOTPI(so);
1398 
1399 	ASSERT(MUTEX_HELD(&so->so_lock));
1400 
1401 	ASSERT(so_verify_oobstate(so));
1402 
1403 	ASSERT(sti->sti_oobsigcnt > sti->sti_oobcnt);
1404 
1405 	sti->sti_oobcnt++;
1406 	ASSERT(sti->sti_oobcnt > 0);	/* wraparound? */
1407 	ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
1408 
1409 	/*
1410 	 * Set MSGMARK for SIOCATMARK.
1411 	 */
1412 	mp->b_flag |= MSGMARK;
1413 
1414 	ASSERT(so_verify_oobstate(so));
1415 	return (mp);
1416 }
1417 
1418 /*
1419  * Handle the processing of the actual urgent data.
1420  * Returns the data mblk if it should be queued on the read queue.
1421  */
1422 static mblk_t *
1423 so_oob_data(struct sonode *so, mblk_t *mp,
1424 	strsigset_t *signals, strpollset_t *pollwakeups)
1425 {
1426 	sotpi_info_t *sti = SOTOTPI(so);
1427 
1428 	ASSERT(MUTEX_HELD(&so->so_lock));
1429 
1430 	ASSERT(so_verify_oobstate(so));
1431 
1432 	ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
1433 	ASSERT(mp != NULL);
1434 	/*
1435 	 * For OOBINLINE we keep the data in the T_EXDATA_IND.
1436 	 * Otherwise we store it in so_oobmsg.
1437 	 */
1438 	ASSERT(so->so_oobmsg == NULL);
1439 	if (so->so_options & SO_OOBINLINE) {
1440 		*pollwakeups |= POLLIN | POLLRDNORM | POLLRDBAND;
1441 		*signals |= S_INPUT | S_RDNORM;
1442 	} else {
1443 		*pollwakeups |= POLLRDBAND;
1444 		so->so_state |= SS_HAVEOOBDATA;
1445 		so->so_oobmsg = mp;
1446 		mp = NULL;
1447 	}
1448 	ASSERT(so_verify_oobstate(so));
1449 	return (mp);
1450 }
1451 
1452 /*
1453  * Caller must hold the mutex.
1454  * For delayed processing, save the T_DISCON_IND received
1455  * from below on sti_discon_ind_mp.
1456  * When the message is processed the framework will call:
1457  *      (*func)(so, mp);
1458  */
1459 static void
1460 so_save_discon_ind(struct sonode *so,
1461 	mblk_t *mp,
1462 	void (*func)(struct sonode *so, mblk_t *))
1463 {
1464 	sotpi_info_t *sti = SOTOTPI(so);
1465 
1466 	ASSERT(MUTEX_HELD(&so->so_lock));
1467 
1468 	/*
1469 	 * Discard new T_DISCON_IND if we have already received another.
1470 	 * Currently the earlier message can either be on sti_discon_ind_mp
1471 	 * or being processed.
1472 	 */
1473 	if (sti->sti_discon_ind_mp != NULL || (so->so_flag & SOASYNC_UNBIND)) {
1474 		zcmn_err(getzoneid(), CE_WARN,
1475 		    "sockfs: received unexpected additional T_DISCON_IND\n");
1476 		freemsg(mp);
1477 		return;
1478 	}
1479 	mp->b_prev = (mblk_t *)func;
1480 	mp->b_next = NULL;
1481 	sti->sti_discon_ind_mp = mp;
1482 }
1483 
1484 /*
1485  * Caller must hold the mutex and make sure that either SOLOCKED
1486  * or SOASYNC_UNBIND is set. Called from so_unlock_single().
1487  * Perform delayed processing of T_DISCON_IND message on sti_discon_ind_mp.
1488  * Need to ensure that strsock_proto() will not end up sleeping for
1489  * SOASYNC_UNBIND, while executing this function.
1490  */
1491 void
1492 so_drain_discon_ind(struct sonode *so)
1493 {
1494 	mblk_t	*bp;
1495 	void (*func)(struct sonode *so, mblk_t *);
1496 	sotpi_info_t *sti = SOTOTPI(so);
1497 
1498 	ASSERT(MUTEX_HELD(&so->so_lock));
1499 	ASSERT(so->so_flag & (SOLOCKED|SOASYNC_UNBIND));
1500 
1501 	/* Process T_DISCON_IND on sti_discon_ind_mp */
1502 	if ((bp = sti->sti_discon_ind_mp) != NULL) {
1503 		sti->sti_discon_ind_mp = NULL;
1504 		func = (void (*)())bp->b_prev;
1505 		bp->b_prev = NULL;
1506 
1507 		/*
1508 		 * This (*func) is supposed to generate a message downstream
1509 		 * and we need to have a flag set until the corresponding
1510 		 * upstream message reaches stream head.
1511 		 * When processing T_DISCON_IND in strsock_discon_ind
1512 		 * we hold SOASYN_UNBIND when sending T_UNBIND_REQ down and
1513 		 * drop the flag after we get the ACK in strsock_proto.
1514 		 */
1515 		(void) (*func)(so, bp);
1516 	}
1517 }
1518 
1519 /*
1520  * Caller must hold the mutex.
1521  * Remove the T_DISCON_IND on sti_discon_ind_mp.
1522  */
1523 void
1524 so_flush_discon_ind(struct sonode *so)
1525 {
1526 	mblk_t	*bp;
1527 	sotpi_info_t *sti = SOTOTPI(so);
1528 
1529 	ASSERT(MUTEX_HELD(&so->so_lock));
1530 
1531 	/*
1532 	 * Remove T_DISCON_IND mblk at sti_discon_ind_mp.
1533 	 */
1534 	if ((bp = sti->sti_discon_ind_mp) != NULL) {
1535 		sti->sti_discon_ind_mp = NULL;
1536 		bp->b_prev = NULL;
1537 		freemsg(bp);
1538 	}
1539 }
1540 
1541 /*
1542  * Caller must hold the mutex.
1543  *
1544  * This function is used to process the T_DISCON_IND message. It does
1545  * immediate processing when called from strsock_proto and delayed
1546  * processing of discon_ind saved on sti_discon_ind_mp when called from
1547  * so_drain_discon_ind. When a T_DISCON_IND message is saved in
1548  * sti_discon_ind_mp for delayed processing, this function is registered
1549  * as the callback function to process the message.
1550  *
1551  * SOASYNC_UNBIND should be held in this function, during the non-blocking
1552  * unbind operation, and should be released only after we receive the ACK
1553  * in strsock_proto, for the T_UNBIND_REQ sent here. Since SOLOCKED is not set,
1554  * no TPI messages would be sent down at this time. This is to prevent M_FLUSH
1555  * sent from either this function or tcp_unbind(), flushing away any TPI
1556  * message that is being sent down and stays in a lower module's queue.
1557  *
1558  * This function drops so_lock and grabs it again.
1559  */
1560 static void
1561 strsock_discon_ind(struct sonode *so, mblk_t *discon_mp)
1562 {
1563 	struct vnode *vp;
1564 	struct stdata *stp;
1565 	union T_primitives *tpr;
1566 	struct T_unbind_req *ubr;
1567 	mblk_t *mp;
1568 	int error;
1569 	sotpi_info_t *sti = SOTOTPI(so);
1570 
1571 	ASSERT(MUTEX_HELD(&so->so_lock));
1572 	ASSERT(discon_mp);
1573 	ASSERT(discon_mp->b_rptr);
1574 
1575 	tpr = (union T_primitives *)discon_mp->b_rptr;
1576 	ASSERT(tpr->type == T_DISCON_IND);
1577 
1578 	vp = SOTOV(so);
1579 	stp = vp->v_stream;
1580 	ASSERT(stp);
1581 
1582 	/*
1583 	 * Not a listener
1584 	 */
1585 	ASSERT((so->so_state & SS_ACCEPTCONN) == 0);
1586 
1587 	/*
1588 	 * This assumes that the name space for DISCON_reason
1589 	 * is the errno name space.
1590 	 */
1591 	soisdisconnected(so, tpr->discon_ind.DISCON_reason);
1592 	sti->sti_laddr_valid = 0;
1593 	sti->sti_faddr_valid = 0;
1594 
1595 	/*
1596 	 * Unbind with the transport without blocking.
1597 	 * If we've already received a T_DISCON_IND do not unbind.
1598 	 *
1599 	 * If there is no preallocated unbind message, we have already
1600 	 * unbound with the transport
1601 	 *
1602 	 * If the socket is not bound, no need to unbind.
1603 	 */
1604 	mp = sti->sti_unbind_mp;
1605 	if (mp == NULL) {
1606 		ASSERT(!(so->so_state & SS_ISBOUND));
1607 		mutex_exit(&so->so_lock);
1608 	} else if (!(so->so_state & SS_ISBOUND))  {
1609 		mutex_exit(&so->so_lock);
1610 	} else {
1611 		sti->sti_unbind_mp = NULL;
1612 
1613 		/*
1614 		 * Is another T_DISCON_IND being processed.
1615 		 */
1616 		ASSERT((so->so_flag & SOASYNC_UNBIND) == 0);
1617 
1618 		/*
1619 		 * Make strsock_proto ignore T_OK_ACK and T_ERROR_ACK for
1620 		 * this unbind. Set SOASYNC_UNBIND. This should be cleared
1621 		 * only after we receive the ACK in strsock_proto.
1622 		 */
1623 		so->so_flag |= SOASYNC_UNBIND;
1624 		ASSERT(!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)));
1625 		so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1626 		sti->sti_laddr_valid = 0;
1627 		mutex_exit(&so->so_lock);
1628 
1629 		/*
1630 		 * Send down T_UNBIND_REQ ignoring flow control.
1631 		 * XXX Assumes that MSG_IGNFLOW implies that this thread
1632 		 * does not run service procedures.
1633 		 */
1634 		ASSERT(DB_TYPE(mp) == M_PROTO);
1635 		ubr = (struct T_unbind_req *)mp->b_rptr;
1636 		mp->b_wptr += sizeof (*ubr);
1637 		ubr->PRIM_type = T_UNBIND_REQ;
1638 
1639 		/*
1640 		 * Flush the read and write side (except stream head read queue)
1641 		 * and send down T_UNBIND_REQ.
1642 		 */
1643 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1644 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1645 		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
1646 		/* LINTED - warning: statement has no consequent: if */
1647 		if (error) {
1648 			eprintsoline(so, error);
1649 		}
1650 	}
1651 
1652 	if (tpr->discon_ind.DISCON_reason != 0)
1653 		strsetrerror(SOTOV(so), 0, 0, sogetrderr);
1654 	strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
1655 	strseteof(SOTOV(so), 1);
1656 	/*
1657 	 * strseteof takes care of read side wakeups,
1658 	 * pollwakeups, and signals.
1659 	 */
1660 	dprintso(so, 1, ("T_DISCON_IND: error %d\n", so->so_error));
1661 	freemsg(discon_mp);
1662 
1663 
1664 	pollwakeup(&stp->sd_pollist, POLLOUT);
1665 	mutex_enter(&stp->sd_lock);
1666 
1667 	/*
1668 	 * Wake sleeping write
1669 	 */
1670 	if (stp->sd_flag & WSLEEP) {
1671 		stp->sd_flag &= ~WSLEEP;
1672 		cv_broadcast(&stp->sd_wrq->q_wait);
1673 	}
1674 
1675 	/*
1676 	 * strsendsig can handle multiple signals with a
1677 	 * single call.  Send SIGPOLL for S_OUTPUT event.
1678 	 */
1679 	if (stp->sd_sigflags & S_OUTPUT)
1680 		strsendsig(stp->sd_siglist, S_OUTPUT, 0, 0);
1681 
1682 	mutex_exit(&stp->sd_lock);
1683 	mutex_enter(&so->so_lock);
1684 }
1685 
1686 /*
1687  * This routine is registered with the stream head to receive M_PROTO
1688  * and M_PCPROTO messages.
1689  *
1690  * Returns NULL if the message was consumed.
1691  * Returns an mblk to make that mblk be processed (and queued) by the stream
1692  * head.
1693  *
1694  * Sets the return parameters (*wakeups, *firstmsgsigs, *allmsgsigs, and
1695  * *pollwakeups) for the stream head to take action on. Note that since
1696  * sockets always deliver SIGIO for every new piece of data this routine
1697  * never sets *firstmsgsigs; any signals are returned in *allmsgsigs.
1698  *
1699  * This routine handles all data related TPI messages independent of
1700  * the type of the socket i.e. it doesn't care if T_UNITDATA_IND message
1701  * arrive on a SOCK_STREAM.
1702  */
1703 static mblk_t *
1704 strsock_proto(vnode_t *vp, mblk_t *mp,
1705 		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
1706 		strsigset_t *allmsgsigs, strpollset_t *pollwakeups)
1707 {
1708 	union T_primitives *tpr;
1709 	struct sonode *so;
1710 	sotpi_info_t *sti;
1711 
1712 	so = VTOSO(vp);
1713 	sti = SOTOTPI(so);
1714 
1715 	dprintso(so, 1, ("strsock_proto(%p, %p)\n", (void *)vp, (void *)mp));
1716 
1717 	/* Set default return values */
1718 	*firstmsgsigs = *wakeups = *allmsgsigs = *pollwakeups = 0;
1719 
1720 	ASSERT(DB_TYPE(mp) == M_PROTO ||
1721 	    DB_TYPE(mp) == M_PCPROTO);
1722 
1723 	if (MBLKL(mp) < sizeof (tpr->type)) {
1724 		/* The message is too short to even contain the primitive */
1725 		zcmn_err(getzoneid(), CE_WARN,
1726 		    "sockfs: Too short TPI message received. Len = %ld\n",
1727 		    (ptrdiff_t)(MBLKL(mp)));
1728 		freemsg(mp);
1729 		return (NULL);
1730 	}
1731 	if (!__TPI_PRIM_ISALIGNED(mp->b_rptr)) {
1732 		/* The read pointer is not aligned correctly for TPI */
1733 		zcmn_err(getzoneid(), CE_WARN,
1734 		    "sockfs: Unaligned TPI message received. rptr = %p\n",
1735 		    (void *)mp->b_rptr);
1736 		freemsg(mp);
1737 		return (NULL);
1738 	}
1739 	tpr = (union T_primitives *)mp->b_rptr;
1740 	dprintso(so, 1, ("strsock_proto: primitive %d\n", tpr->type));
1741 
1742 	switch (tpr->type) {
1743 
1744 	case T_DATA_IND:
1745 		if (MBLKL(mp) < sizeof (struct T_data_ind)) {
1746 			zcmn_err(getzoneid(), CE_WARN,
1747 			    "sockfs: Too short T_DATA_IND. Len = %ld\n",
1748 			    (ptrdiff_t)(MBLKL(mp)));
1749 			freemsg(mp);
1750 			return (NULL);
1751 		}
1752 		/*
1753 		 * Ignore zero-length T_DATA_IND messages. These might be
1754 		 * generated by some transports.
1755 		 * This is needed to prevent read (which skips the M_PROTO
1756 		 * part) to unexpectedly return 0 (or return EWOULDBLOCK
1757 		 * on a non-blocking socket after select/poll has indicated
1758 		 * that data is available).
1759 		 */
1760 		if (msgdsize(mp->b_cont) == 0) {
1761 			dprintso(so, 0,
1762 			    ("strsock_proto: zero length T_DATA_IND\n"));
1763 			freemsg(mp);
1764 			return (NULL);
1765 		}
1766 		*allmsgsigs = S_INPUT | S_RDNORM;
1767 		*pollwakeups = POLLIN | POLLRDNORM;
1768 		*wakeups = RSLEEP;
1769 		return (mp);
1770 
1771 	case T_UNITDATA_IND: {
1772 		struct T_unitdata_ind	*tudi = &tpr->unitdata_ind;
1773 		void			*addr;
1774 		t_uscalar_t		addrlen;
1775 
1776 		if (MBLKL(mp) < sizeof (struct T_unitdata_ind)) {
1777 			zcmn_err(getzoneid(), CE_WARN,
1778 			    "sockfs: Too short T_UNITDATA_IND. Len = %ld\n",
1779 			    (ptrdiff_t)(MBLKL(mp)));
1780 			freemsg(mp);
1781 			return (NULL);
1782 		}
1783 
1784 		/* Is this not a connected datagram socket? */
1785 		if ((so->so_mode & SM_CONNREQUIRED) ||
1786 		    !(so->so_state & SS_ISCONNECTED)) {
1787 			/*
1788 			 * Not a connected datagram socket. Look for
1789 			 * the SO_UNIX_CLOSE option. If such an option is found
1790 			 * discard the message (since it has no meaning
1791 			 * unless connected).
1792 			 */
1793 			if (so->so_family == AF_UNIX && msgdsize(mp) == 0 &&
1794 			    tudi->OPT_length != 0) {
1795 				void *opt;
1796 				t_uscalar_t optlen = tudi->OPT_length;
1797 
1798 				opt = sogetoff(mp, tudi->OPT_offset,
1799 				    optlen, __TPI_ALIGN_SIZE);
1800 				if (opt == NULL) {
1801 					/* The len/off falls outside mp */
1802 					freemsg(mp);
1803 					mutex_enter(&so->so_lock);
1804 					soseterror(so, EPROTO);
1805 					mutex_exit(&so->so_lock);
1806 					zcmn_err(getzoneid(), CE_WARN,
1807 					    "sockfs: T_unidata_ind with "
1808 					    "invalid optlen/offset %u/%d\n",
1809 					    optlen, tudi->OPT_offset);
1810 					return (NULL);
1811 				}
1812 				if (so_getopt_unix_close(opt, optlen)) {
1813 					freemsg(mp);
1814 					return (NULL);
1815 				}
1816 			}
1817 			*allmsgsigs = S_INPUT | S_RDNORM;
1818 			*pollwakeups = POLLIN | POLLRDNORM;
1819 			*wakeups = RSLEEP;
1820 			if (audit_active)
1821 				audit_sock(T_UNITDATA_IND, strvp2wq(vp),
1822 				    mp, 0);
1823 			return (mp);
1824 		}
1825 
1826 		/*
1827 		 * A connect datagram socket. For AF_INET{,6} we verify that
1828 		 * the source address matches the "connected to" address.
1829 		 * The semantics of AF_UNIX sockets is to not verify
1830 		 * the source address.
1831 		 * Note that this source address verification is transport
1832 		 * specific. Thus the real fix would be to extend TPI
1833 		 * to allow T_CONN_REQ messages to be sent to connectionless
1834 		 * transport providers and always let the transport provider
1835 		 * do whatever filtering is needed.
1836 		 *
1837 		 * The verification/filtering semantics for transports
1838 		 * other than AF_INET and AF_UNIX are unknown. The choice
1839 		 * would be to either filter using bcmp or let all messages
1840 		 * get through. This code does not filter other address
1841 		 * families since this at least allows the application to
1842 		 * work around any missing filtering.
1843 		 *
1844 		 * XXX Should we move filtering to UDP/ICMP???
1845 		 * That would require passing e.g. a T_DISCON_REQ to UDP
1846 		 * when the socket becomes unconnected.
1847 		 */
1848 		addrlen = tudi->SRC_length;
1849 		/*
1850 		 * The alignment restriction is really too strict but
1851 		 * we want enough alignment to inspect the fields of
1852 		 * a sockaddr_in.
1853 		 */
1854 		addr = sogetoff(mp, tudi->SRC_offset, addrlen,
1855 		    __TPI_ALIGN_SIZE);
1856 		if (addr == NULL) {
1857 			freemsg(mp);
1858 			mutex_enter(&so->so_lock);
1859 			soseterror(so, EPROTO);
1860 			mutex_exit(&so->so_lock);
1861 			zcmn_err(getzoneid(), CE_WARN,
1862 			    "sockfs: T_unidata_ind with invalid "
1863 			    "addrlen/offset %u/%d\n",
1864 			    addrlen, tudi->SRC_offset);
1865 			return (NULL);
1866 		}
1867 
1868 		if (so->so_family == AF_INET) {
1869 			/*
1870 			 * For AF_INET we allow wildcarding both sin_addr
1871 			 * and sin_port.
1872 			 */
1873 			struct sockaddr_in *faddr, *sin;
1874 
1875 			/* Prevent sti_faddr_sa from changing while accessed */
1876 			mutex_enter(&so->so_lock);
1877 			ASSERT(sti->sti_faddr_len ==
1878 			    (socklen_t)sizeof (struct sockaddr_in));
1879 			faddr = (struct sockaddr_in *)sti->sti_faddr_sa;
1880 			sin = (struct sockaddr_in *)addr;
1881 			if (addrlen !=
1882 			    (t_uscalar_t)sizeof (struct sockaddr_in) ||
1883 			    (sin->sin_addr.s_addr != faddr->sin_addr.s_addr &&
1884 			    faddr->sin_addr.s_addr != INADDR_ANY) ||
1885 			    (so->so_type != SOCK_RAW &&
1886 			    sin->sin_port != faddr->sin_port &&
1887 			    faddr->sin_port != 0)) {
1888 #ifdef DEBUG
1889 				dprintso(so, 0,
1890 				    ("sockfs: T_UNITDATA_IND mismatch: %s",
1891 				    pr_addr(so->so_family,
1892 				    (struct sockaddr *)addr, addrlen)));
1893 				dprintso(so, 0, (" - %s\n",
1894 				    pr_addr(so->so_family, sti->sti_faddr_sa,
1895 				    (t_uscalar_t)sti->sti_faddr_len)));
1896 #endif /* DEBUG */
1897 				mutex_exit(&so->so_lock);
1898 				freemsg(mp);
1899 				return (NULL);
1900 			}
1901 			mutex_exit(&so->so_lock);
1902 		} else if (so->so_family == AF_INET6) {
1903 			/*
1904 			 * For AF_INET6 we allow wildcarding both sin6_addr
1905 			 * and sin6_port.
1906 			 */
1907 			struct sockaddr_in6 *faddr6, *sin6;
1908 			static struct in6_addr zeroes; /* inits to all zeros */
1909 
1910 			/* Prevent sti_faddr_sa from changing while accessed */
1911 			mutex_enter(&so->so_lock);
1912 			ASSERT(sti->sti_faddr_len ==
1913 			    (socklen_t)sizeof (struct sockaddr_in6));
1914 			faddr6 = (struct sockaddr_in6 *)sti->sti_faddr_sa;
1915 			sin6 = (struct sockaddr_in6 *)addr;
1916 			/* XXX could we get a mapped address ::ffff:0.0.0.0 ? */
1917 			if (addrlen !=
1918 			    (t_uscalar_t)sizeof (struct sockaddr_in6) ||
1919 			    (!IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
1920 			    &faddr6->sin6_addr) &&
1921 			    !IN6_ARE_ADDR_EQUAL(&faddr6->sin6_addr, &zeroes)) ||
1922 			    (so->so_type != SOCK_RAW &&
1923 			    sin6->sin6_port != faddr6->sin6_port &&
1924 			    faddr6->sin6_port != 0)) {
1925 #ifdef DEBUG
1926 				dprintso(so, 0,
1927 				    ("sockfs: T_UNITDATA_IND mismatch: %s",
1928 				    pr_addr(so->so_family,
1929 				    (struct sockaddr *)addr, addrlen)));
1930 				dprintso(so, 0, (" - %s\n",
1931 				    pr_addr(so->so_family, sti->sti_faddr_sa,
1932 				    (t_uscalar_t)sti->sti_faddr_len)));
1933 #endif /* DEBUG */
1934 				mutex_exit(&so->so_lock);
1935 				freemsg(mp);
1936 				return (NULL);
1937 			}
1938 			mutex_exit(&so->so_lock);
1939 		} else if (so->so_family == AF_UNIX &&
1940 		    msgdsize(mp->b_cont) == 0 &&
1941 		    tudi->OPT_length != 0) {
1942 			/*
1943 			 * Attempt to extract AF_UNIX
1944 			 * SO_UNIX_CLOSE indication from options.
1945 			 */
1946 			void *opt;
1947 			t_uscalar_t optlen = tudi->OPT_length;
1948 
1949 			opt = sogetoff(mp, tudi->OPT_offset,
1950 			    optlen, __TPI_ALIGN_SIZE);
1951 			if (opt == NULL) {
1952 				/* The len/off falls outside mp */
1953 				freemsg(mp);
1954 				mutex_enter(&so->so_lock);
1955 				soseterror(so, EPROTO);
1956 				mutex_exit(&so->so_lock);
1957 				zcmn_err(getzoneid(), CE_WARN,
1958 				    "sockfs: T_unidata_ind with invalid "
1959 				    "optlen/offset %u/%d\n",
1960 				    optlen, tudi->OPT_offset);
1961 				return (NULL);
1962 			}
1963 			/*
1964 			 * If we received a unix close indication mark the
1965 			 * socket and discard this message.
1966 			 */
1967 			if (so_getopt_unix_close(opt, optlen)) {
1968 				mutex_enter(&so->so_lock);
1969 				sobreakconn(so, ECONNRESET);
1970 				mutex_exit(&so->so_lock);
1971 				strsetrerror(SOTOV(so), 0, 0, sogetrderr);
1972 				freemsg(mp);
1973 				*pollwakeups = POLLIN | POLLRDNORM;
1974 				*allmsgsigs = S_INPUT | S_RDNORM;
1975 				*wakeups = RSLEEP;
1976 				return (NULL);
1977 			}
1978 		}
1979 		*allmsgsigs = S_INPUT | S_RDNORM;
1980 		*pollwakeups = POLLIN | POLLRDNORM;
1981 		*wakeups = RSLEEP;
1982 		return (mp);
1983 	}
1984 
1985 	case T_OPTDATA_IND: {
1986 		struct T_optdata_ind	*tdi = &tpr->optdata_ind;
1987 
1988 		if (MBLKL(mp) < sizeof (struct T_optdata_ind)) {
1989 			zcmn_err(getzoneid(), CE_WARN,
1990 			    "sockfs: Too short T_OPTDATA_IND. Len = %ld\n",
1991 			    (ptrdiff_t)(MBLKL(mp)));
1992 			freemsg(mp);
1993 			return (NULL);
1994 		}
1995 		/*
1996 		 * Allow zero-length messages carrying options.
1997 		 * This is used when carrying the SO_UNIX_CLOSE option.
1998 		 */
1999 		if (so->so_family == AF_UNIX && msgdsize(mp->b_cont) == 0 &&
2000 		    tdi->OPT_length != 0) {
2001 			/*
2002 			 * Attempt to extract AF_UNIX close indication
2003 			 * from the options. Ignore any other options -
2004 			 * those are handled once the message is removed
2005 			 * from the queue.
2006 			 * The close indication message should not carry data.
2007 			 */
2008 			void *opt;
2009 			t_uscalar_t optlen = tdi->OPT_length;
2010 
2011 			opt = sogetoff(mp, tdi->OPT_offset,
2012 			    optlen, __TPI_ALIGN_SIZE);
2013 			if (opt == NULL) {
2014 				/* The len/off falls outside mp */
2015 				freemsg(mp);
2016 				mutex_enter(&so->so_lock);
2017 				soseterror(so, EPROTO);
2018 				mutex_exit(&so->so_lock);
2019 				zcmn_err(getzoneid(), CE_WARN,
2020 				    "sockfs: T_optdata_ind with invalid "
2021 				    "optlen/offset %u/%d\n",
2022 				    optlen, tdi->OPT_offset);
2023 				return (NULL);
2024 			}
2025 			/*
2026 			 * If we received a close indication mark the
2027 			 * socket and discard this message.
2028 			 */
2029 			if (so_getopt_unix_close(opt, optlen)) {
2030 				mutex_enter(&so->so_lock);
2031 				socantsendmore(so);
2032 				sti->sti_faddr_valid = 0;
2033 				mutex_exit(&so->so_lock);
2034 				strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2035 				freemsg(mp);
2036 				return (NULL);
2037 			}
2038 		}
2039 		*allmsgsigs = S_INPUT | S_RDNORM;
2040 		*pollwakeups = POLLIN | POLLRDNORM;
2041 		*wakeups = RSLEEP;
2042 		return (mp);
2043 	}
2044 
2045 	case T_EXDATA_IND: {
2046 		mblk_t		*mctl, *mdata;
2047 		mblk_t *lbp;
2048 		union T_primitives *tprp;
2049 		struct stdata   *stp;
2050 		queue_t *qp;
2051 
2052 		if (MBLKL(mp) < sizeof (struct T_exdata_ind)) {
2053 			zcmn_err(getzoneid(), CE_WARN,
2054 			    "sockfs: Too short T_EXDATA_IND. Len = %ld\n",
2055 			    (ptrdiff_t)(MBLKL(mp)));
2056 			freemsg(mp);
2057 			return (NULL);
2058 		}
2059 		/*
2060 		 * Ignore zero-length T_EXDATA_IND messages. These might be
2061 		 * generated by some transports.
2062 		 *
2063 		 * This is needed to prevent read (which skips the M_PROTO
2064 		 * part) to unexpectedly return 0 (or return EWOULDBLOCK
2065 		 * on a non-blocking socket after select/poll has indicated
2066 		 * that data is available).
2067 		 */
2068 		dprintso(so, 1,
2069 		    ("T_EXDATA_IND(%p): counts %d/%d state %s\n",
2070 		    (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt,
2071 		    pr_state(so->so_state, so->so_mode)));
2072 
2073 		if (msgdsize(mp->b_cont) == 0) {
2074 			dprintso(so, 0,
2075 			    ("strsock_proto: zero length T_EXDATA_IND\n"));
2076 			freemsg(mp);
2077 			return (NULL);
2078 		}
2079 
2080 		/*
2081 		 * Split into the T_EXDATA_IND and the M_DATA part.
2082 		 * We process these three pieces separately:
2083 		 *	signal generation
2084 		 *	handling T_EXDATA_IND
2085 		 *	handling M_DATA component
2086 		 */
2087 		mctl = mp;
2088 		mdata = mctl->b_cont;
2089 		mctl->b_cont = NULL;
2090 		mutex_enter(&so->so_lock);
2091 		so_oob_sig(so, 0, allmsgsigs, pollwakeups);
2092 		mctl = so_oob_exdata(so, mctl, allmsgsigs, pollwakeups);
2093 		mdata = so_oob_data(so, mdata, allmsgsigs, pollwakeups);
2094 
2095 		stp = vp->v_stream;
2096 		ASSERT(stp != NULL);
2097 		qp = _RD(stp->sd_wrq);
2098 
2099 		mutex_enter(QLOCK(qp));
2100 		lbp = qp->q_last;
2101 
2102 		/*
2103 		 * We want to avoid queueing up a string of T_EXDATA_IND
2104 		 * messages with no intervening data messages at the stream
2105 		 * head. These messages contribute to the total message
2106 		 * count. Eventually this can lead to STREAMS flow control
2107 		 * and also cause TCP to advertise a zero window condition
2108 		 * to the peer. This can happen in the degenerate case where
2109 		 * the sender and receiver exchange only OOB data. The sender
2110 		 * only sends messages with MSG_OOB flag and the receiver
2111 		 * receives only MSG_OOB messages and does not use SO_OOBINLINE.
2112 		 * An example of this scenario has been reported in applications
2113 		 * that use OOB data to exchange heart beats. Flow control
2114 		 * relief will never happen if the application only reads OOB
2115 		 * data which is done directly by sorecvoob() and the
2116 		 * T_EXDATA_IND messages at the streamhead won't be consumed.
2117 		 * Note that there is no correctness issue in compressing the
2118 		 * string of T_EXDATA_IND messages into a single T_EXDATA_IND
2119 		 * message. A single read that does not specify MSG_OOB will
2120 		 * read across all the marks in a loop in sotpi_recvmsg().
2121 		 * Each mark is individually distinguishable only if the
2122 		 * T_EXDATA_IND messages are separated by data messages.
2123 		 */
2124 		if ((qp->q_first != NULL) && (DB_TYPE(lbp) == M_PROTO)) {
2125 			tprp = (union T_primitives *)lbp->b_rptr;
2126 			if ((tprp->type == T_EXDATA_IND) &&
2127 			    !(so->so_options & SO_OOBINLINE)) {
2128 
2129 				/*
2130 				 * free the new M_PROTO message
2131 				 */
2132 				freemsg(mctl);
2133 
2134 				/*
2135 				 * adjust the OOB count and OOB	signal count
2136 				 * just incremented for the new OOB data.
2137 				 */
2138 				sti->sti_oobcnt--;
2139 				sti->sti_oobsigcnt--;
2140 				mutex_exit(QLOCK(qp));
2141 				mutex_exit(&so->so_lock);
2142 				return (NULL);
2143 			}
2144 		}
2145 		mutex_exit(QLOCK(qp));
2146 
2147 		/*
2148 		 * Pass the T_EXDATA_IND and the M_DATA back separately
2149 		 * by using b_next linkage. (The stream head will queue any
2150 		 * b_next linked messages separately.) This is needed
2151 		 * since MSGMARK applies to the last byte of the message
2152 		 * hence we can not have any M_DATA component attached
2153 		 * to the marked T_EXDATA_IND. Note that the stream head
2154 		 * will not consolidate M_DATA messages onto an MSGMARK'ed
2155 		 * message in order to preserve the constraint that
2156 		 * the T_EXDATA_IND always is a separate message.
2157 		 */
2158 		ASSERT(mctl != NULL);
2159 		mctl->b_next = mdata;
2160 		mp = mctl;
2161 #ifdef DEBUG
2162 		if (mdata == NULL) {
2163 			dprintso(so, 1,
2164 			    ("after outofline T_EXDATA_IND(%p): "
2165 			    "counts %d/%d  poll 0x%x sig 0x%x state %s\n",
2166 			    (void *)vp, sti->sti_oobsigcnt,
2167 			    sti->sti_oobcnt, *pollwakeups, *allmsgsigs,
2168 			    pr_state(so->so_state, so->so_mode)));
2169 		} else {
2170 			dprintso(so, 1,
2171 			    ("after inline T_EXDATA_IND(%p): "
2172 			    "counts %d/%d  poll 0x%x sig 0x%x state %s\n",
2173 			    (void *)vp, sti->sti_oobsigcnt,
2174 			    sti->sti_oobcnt, *pollwakeups, *allmsgsigs,
2175 			    pr_state(so->so_state, so->so_mode)));
2176 		}
2177 #endif /* DEBUG */
2178 		mutex_exit(&so->so_lock);
2179 		*wakeups = RSLEEP;
2180 		return (mp);
2181 	}
2182 
2183 	case T_CONN_CON: {
2184 		struct T_conn_con	*conn_con;
2185 		void			*addr;
2186 		t_uscalar_t		addrlen;
2187 
2188 		/*
2189 		 * Verify the state, update the state to ISCONNECTED,
2190 		 * record the potentially new address in the message,
2191 		 * and drop the message.
2192 		 */
2193 		if (MBLKL(mp) < sizeof (struct T_conn_con)) {
2194 			zcmn_err(getzoneid(), CE_WARN,
2195 			    "sockfs: Too short T_CONN_CON. Len = %ld\n",
2196 			    (ptrdiff_t)(MBLKL(mp)));
2197 			freemsg(mp);
2198 			return (NULL);
2199 		}
2200 
2201 		mutex_enter(&so->so_lock);
2202 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) !=
2203 		    SS_ISCONNECTING) {
2204 			mutex_exit(&so->so_lock);
2205 			dprintso(so, 1,
2206 			    ("T_CONN_CON: state %x\n", so->so_state));
2207 			freemsg(mp);
2208 			return (NULL);
2209 		}
2210 
2211 		conn_con = &tpr->conn_con;
2212 		addrlen = conn_con->RES_length;
2213 		/*
2214 		 * Allow the address to be of different size than sent down
2215 		 * in the T_CONN_REQ as long as it doesn't exceed the maxlen.
2216 		 * For AF_UNIX require the identical length.
2217 		 */
2218 		if (so->so_family == AF_UNIX ?
2219 		    addrlen != (t_uscalar_t)sizeof (sti->sti_ux_laddr) :
2220 		    addrlen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2221 			zcmn_err(getzoneid(), CE_WARN,
2222 			    "sockfs: T_conn_con with different "
2223 			    "length %u/%d\n",
2224 			    addrlen, conn_con->RES_length);
2225 			soisdisconnected(so, EPROTO);
2226 			sti->sti_laddr_valid = 0;
2227 			sti->sti_faddr_valid = 0;
2228 			mutex_exit(&so->so_lock);
2229 			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
2230 			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2231 			strseteof(SOTOV(so), 1);
2232 			freemsg(mp);
2233 			/*
2234 			 * strseteof takes care of read side wakeups,
2235 			 * pollwakeups, and signals.
2236 			 */
2237 			*wakeups = WSLEEP;
2238 			*allmsgsigs = S_OUTPUT;
2239 			*pollwakeups = POLLOUT;
2240 			return (NULL);
2241 		}
2242 		addr = sogetoff(mp, conn_con->RES_offset, addrlen, 1);
2243 		if (addr == NULL) {
2244 			zcmn_err(getzoneid(), CE_WARN,
2245 			    "sockfs: T_conn_con with invalid "
2246 			    "addrlen/offset %u/%d\n",
2247 			    addrlen, conn_con->RES_offset);
2248 			mutex_exit(&so->so_lock);
2249 			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
2250 			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2251 			strseteof(SOTOV(so), 1);
2252 			freemsg(mp);
2253 			/*
2254 			 * strseteof takes care of read side wakeups,
2255 			 * pollwakeups, and signals.
2256 			 */
2257 			*wakeups = WSLEEP;
2258 			*allmsgsigs = S_OUTPUT;
2259 			*pollwakeups = POLLOUT;
2260 			return (NULL);
2261 		}
2262 
2263 		/*
2264 		 * Save for getpeername.
2265 		 */
2266 		if (so->so_family != AF_UNIX) {
2267 			sti->sti_faddr_len = (socklen_t)addrlen;
2268 			ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2269 			bcopy(addr, sti->sti_faddr_sa, addrlen);
2270 			sti->sti_faddr_valid = 1;
2271 		}
2272 
2273 		if (so->so_peercred != NULL)
2274 			crfree(so->so_peercred);
2275 		so->so_peercred = msg_getcred(mp, &so->so_cpid);
2276 		if (so->so_peercred != NULL)
2277 			crhold(so->so_peercred);
2278 
2279 		/* Wakeup anybody sleeping in sowaitconnected */
2280 		soisconnected(so);
2281 		mutex_exit(&so->so_lock);
2282 
2283 		/*
2284 		 * The socket is now available for sending data.
2285 		 */
2286 		*wakeups = WSLEEP;
2287 		*allmsgsigs = S_OUTPUT;
2288 		*pollwakeups = POLLOUT;
2289 		freemsg(mp);
2290 		return (NULL);
2291 	}
2292 
2293 	/*
2294 	 * Extra processing in case of an SSL proxy, before queuing or
2295 	 * forwarding to the fallback endpoint
2296 	 */
2297 	case T_SSL_PROXY_CONN_IND:
2298 	case T_CONN_IND:
2299 		/*
2300 		 * Verify the min size and queue the message on
2301 		 * the sti_conn_ind_head/tail list.
2302 		 */
2303 		if (MBLKL(mp) < sizeof (struct T_conn_ind)) {
2304 			zcmn_err(getzoneid(), CE_WARN,
2305 			    "sockfs: Too short T_CONN_IND. Len = %ld\n",
2306 			    (ptrdiff_t)(MBLKL(mp)));
2307 			freemsg(mp);
2308 			return (NULL);
2309 		}
2310 
2311 		if (audit_active)
2312 			audit_sock(T_CONN_IND, strvp2wq(vp), mp, 0);
2313 		if (!(so->so_state & SS_ACCEPTCONN)) {
2314 			zcmn_err(getzoneid(), CE_WARN,
2315 			    "sockfs: T_conn_ind on non-listening socket\n");
2316 			freemsg(mp);
2317 			return (NULL);
2318 		}
2319 
2320 		if (tpr->type == T_SSL_PROXY_CONN_IND && mp->b_cont == NULL) {
2321 			/* No context: need to fall back */
2322 			struct sonode *fbso;
2323 			stdata_t *fbstp;
2324 
2325 			tpr->type = T_CONN_IND;
2326 
2327 			fbso = kssl_find_fallback(sti->sti_kssl_ent);
2328 
2329 			/*
2330 			 * No fallback: the remote will timeout and
2331 			 * disconnect.
2332 			 */
2333 			if (fbso == NULL) {
2334 				freemsg(mp);
2335 				return (NULL);
2336 			}
2337 			fbstp = SOTOV(fbso)->v_stream;
2338 			qreply(fbstp->sd_wrq->q_next, mp);
2339 			return (NULL);
2340 		}
2341 		soqueueconnind(so, mp);
2342 		*allmsgsigs = S_INPUT | S_RDNORM;
2343 		*pollwakeups = POLLIN | POLLRDNORM;
2344 		*wakeups = RSLEEP;
2345 		return (NULL);
2346 
2347 	case T_ORDREL_IND:
2348 		if (MBLKL(mp) < sizeof (struct T_ordrel_ind)) {
2349 			zcmn_err(getzoneid(), CE_WARN,
2350 			    "sockfs: Too short T_ORDREL_IND. Len = %ld\n",
2351 			    (ptrdiff_t)(MBLKL(mp)));
2352 			freemsg(mp);
2353 			return (NULL);
2354 		}
2355 
2356 		/*
2357 		 * Some providers send this when not fully connected.
2358 		 * SunLink X.25 needs to retrieve disconnect reason after
2359 		 * disconnect for compatibility. It uses T_ORDREL_IND
2360 		 * instead of T_DISCON_IND so that it may use the
2361 		 * endpoint after a connect failure to retrieve the
2362 		 * reason using an ioctl. Thus we explicitly clear
2363 		 * SS_ISCONNECTING here for SunLink X.25.
2364 		 * This is a needed TPI violation.
2365 		 */
2366 		mutex_enter(&so->so_lock);
2367 		so->so_state &= ~SS_ISCONNECTING;
2368 		socantrcvmore(so);
2369 		mutex_exit(&so->so_lock);
2370 		strseteof(SOTOV(so), 1);
2371 		/*
2372 		 * strseteof takes care of read side wakeups,
2373 		 * pollwakeups, and signals.
2374 		 */
2375 		freemsg(mp);
2376 		return (NULL);
2377 
2378 	case T_DISCON_IND:
2379 		if (MBLKL(mp) < sizeof (struct T_discon_ind)) {
2380 			zcmn_err(getzoneid(), CE_WARN,
2381 			    "sockfs: Too short T_DISCON_IND. Len = %ld\n",
2382 			    (ptrdiff_t)(MBLKL(mp)));
2383 			freemsg(mp);
2384 			return (NULL);
2385 		}
2386 		if (so->so_state & SS_ACCEPTCONN) {
2387 			/*
2388 			 * This is a listener. Look for a queued T_CONN_IND
2389 			 * with a matching sequence number and remove it
2390 			 * from the list.
2391 			 * It is normal to not find the sequence number since
2392 			 * the soaccept might have already dequeued it
2393 			 * (in which case the T_CONN_RES will fail with
2394 			 * TBADSEQ).
2395 			 */
2396 			(void) soflushconnind(so, tpr->discon_ind.SEQ_number);
2397 			freemsg(mp);
2398 			return (0);
2399 		}
2400 
2401 		/*
2402 		 * Not a listener
2403 		 *
2404 		 * If SS_CANTRCVMORE for AF_UNIX ignore the discon_reason.
2405 		 * Such a discon_ind appears when the peer has first done
2406 		 * a shutdown() followed by a close() in which case we just
2407 		 * want to record socantsendmore.
2408 		 * In this case sockfs first receives a T_ORDREL_IND followed
2409 		 * by a T_DISCON_IND.
2410 		 * Note that for other transports (e.g. TCP) we need to handle
2411 		 * the discon_ind in this case since it signals an error.
2412 		 */
2413 		mutex_enter(&so->so_lock);
2414 		if ((so->so_state & SS_CANTRCVMORE) &&
2415 		    (so->so_family == AF_UNIX)) {
2416 			socantsendmore(so);
2417 			sti->sti_faddr_valid = 0;
2418 			mutex_exit(&so->so_lock);
2419 			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2420 			dprintso(so, 1,
2421 			    ("T_DISCON_IND: error %d\n", so->so_error));
2422 			freemsg(mp);
2423 			/*
2424 			 * Set these variables for caller to process them.
2425 			 * For the else part where T_DISCON_IND is processed,
2426 			 * this will be done in the function being called
2427 			 * (strsock_discon_ind())
2428 			 */
2429 			*wakeups = WSLEEP;
2430 			*allmsgsigs = S_OUTPUT;
2431 			*pollwakeups = POLLOUT;
2432 		} else if (so->so_flag & (SOASYNC_UNBIND | SOLOCKED)) {
2433 			/*
2434 			 * Deferred processing of T_DISCON_IND
2435 			 */
2436 			so_save_discon_ind(so, mp, strsock_discon_ind);
2437 			mutex_exit(&so->so_lock);
2438 		} else {
2439 			/*
2440 			 * Process T_DISCON_IND now
2441 			 */
2442 			(void) strsock_discon_ind(so, mp);
2443 			mutex_exit(&so->so_lock);
2444 		}
2445 		return (NULL);
2446 
2447 	case T_UDERROR_IND: {
2448 		struct T_uderror_ind	*tudi = &tpr->uderror_ind;
2449 		void			*addr;
2450 		t_uscalar_t		addrlen;
2451 		int			error;
2452 
2453 		dprintso(so, 0,
2454 		    ("T_UDERROR_IND: error %d\n", tudi->ERROR_type));
2455 
2456 		if (MBLKL(mp) < sizeof (struct T_uderror_ind)) {
2457 			zcmn_err(getzoneid(), CE_WARN,
2458 			    "sockfs: Too short T_UDERROR_IND. Len = %ld\n",
2459 			    (ptrdiff_t)(MBLKL(mp)));
2460 			freemsg(mp);
2461 			return (NULL);
2462 		}
2463 		/* Ignore on connection-oriented transports */
2464 		if (so->so_mode & SM_CONNREQUIRED) {
2465 			freemsg(mp);
2466 			eprintsoline(so, 0);
2467 			zcmn_err(getzoneid(), CE_WARN,
2468 			    "sockfs: T_uderror_ind on connection-oriented "
2469 			    "transport\n");
2470 			return (NULL);
2471 		}
2472 		addrlen = tudi->DEST_length;
2473 		addr = sogetoff(mp, tudi->DEST_offset, addrlen, 1);
2474 		if (addr == NULL) {
2475 			zcmn_err(getzoneid(), CE_WARN,
2476 			    "sockfs: T_uderror_ind with invalid "
2477 			    "addrlen/offset %u/%d\n",
2478 			    addrlen, tudi->DEST_offset);
2479 			freemsg(mp);
2480 			return (NULL);
2481 		}
2482 
2483 		/* Verify source address for connected socket. */
2484 		mutex_enter(&so->so_lock);
2485 		if (so->so_state & SS_ISCONNECTED) {
2486 			void *faddr;
2487 			t_uscalar_t faddr_len;
2488 			boolean_t match = B_FALSE;
2489 
2490 			switch (so->so_family) {
2491 			case AF_INET: {
2492 				/* Compare just IP address and port */
2493 				struct sockaddr_in *sin1, *sin2;
2494 
2495 				sin1 = (struct sockaddr_in *)sti->sti_faddr_sa;
2496 				sin2 = (struct sockaddr_in *)addr;
2497 				if (addrlen == sizeof (struct sockaddr_in) &&
2498 				    sin1->sin_port == sin2->sin_port &&
2499 				    sin1->sin_addr.s_addr ==
2500 				    sin2->sin_addr.s_addr)
2501 					match = B_TRUE;
2502 				break;
2503 			}
2504 			case AF_INET6: {
2505 				/* Compare just IP address and port. Not flow */
2506 				struct sockaddr_in6 *sin1, *sin2;
2507 
2508 				sin1 = (struct sockaddr_in6 *)sti->sti_faddr_sa;
2509 				sin2 = (struct sockaddr_in6 *)addr;
2510 				if (addrlen == sizeof (struct sockaddr_in6) &&
2511 				    sin1->sin6_port == sin2->sin6_port &&
2512 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
2513 				    &sin2->sin6_addr))
2514 					match = B_TRUE;
2515 				break;
2516 			}
2517 			case AF_UNIX:
2518 				faddr = &sti->sti_ux_faddr;
2519 				faddr_len =
2520 				    (t_uscalar_t)sizeof (sti->sti_ux_faddr);
2521 				if (faddr_len == addrlen &&
2522 				    bcmp(addr, faddr, addrlen) == 0)
2523 					match = B_TRUE;
2524 				break;
2525 			default:
2526 				faddr = sti->sti_faddr_sa;
2527 				faddr_len = (t_uscalar_t)sti->sti_faddr_len;
2528 				if (faddr_len == addrlen &&
2529 				    bcmp(addr, faddr, addrlen) == 0)
2530 					match = B_TRUE;
2531 				break;
2532 			}
2533 
2534 			if (!match) {
2535 #ifdef DEBUG
2536 				dprintso(so, 0,
2537 				    ("sockfs: T_UDERR_IND mismatch: %s - ",
2538 				    pr_addr(so->so_family,
2539 				    (struct sockaddr *)addr, addrlen)));
2540 				dprintso(so, 0, ("%s\n",
2541 				    pr_addr(so->so_family, sti->sti_faddr_sa,
2542 				    sti->sti_faddr_len)));
2543 #endif /* DEBUG */
2544 				mutex_exit(&so->so_lock);
2545 				freemsg(mp);
2546 				return (NULL);
2547 			}
2548 			/*
2549 			 * Make the write error nonpersistent. If the error
2550 			 * is zero we use ECONNRESET.
2551 			 * This assumes that the name space for ERROR_type
2552 			 * is the errno name space.
2553 			 */
2554 			if (tudi->ERROR_type != 0)
2555 				error = tudi->ERROR_type;
2556 			else
2557 				error = ECONNRESET;
2558 
2559 			soseterror(so, error);
2560 			mutex_exit(&so->so_lock);
2561 			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
2562 			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2563 			*wakeups = RSLEEP | WSLEEP;
2564 			*allmsgsigs = S_INPUT | S_RDNORM | S_OUTPUT;
2565 			*pollwakeups = POLLIN | POLLRDNORM | POLLOUT;
2566 			freemsg(mp);
2567 			return (NULL);
2568 		}
2569 		/*
2570 		 * If the application asked for delayed errors
2571 		 * record the T_UDERROR_IND sti_eaddr_mp and the reason in
2572 		 * sti_delayed_error for delayed error posting. If the reason
2573 		 * is zero use ECONNRESET.
2574 		 * Note that delayed error indications do not make sense for
2575 		 * AF_UNIX sockets since sendto checks that the destination
2576 		 * address is valid at the time of the sendto.
2577 		 */
2578 		if (!(so->so_options & SO_DGRAM_ERRIND)) {
2579 			mutex_exit(&so->so_lock);
2580 			freemsg(mp);
2581 			return (NULL);
2582 		}
2583 		if (sti->sti_eaddr_mp != NULL)
2584 			freemsg(sti->sti_eaddr_mp);
2585 
2586 		sti->sti_eaddr_mp = mp;
2587 		if (tudi->ERROR_type != 0)
2588 			error = tudi->ERROR_type;
2589 		else
2590 			error = ECONNRESET;
2591 		sti->sti_delayed_error = (ushort_t)error;
2592 		mutex_exit(&so->so_lock);
2593 		return (NULL);
2594 	}
2595 
2596 	case T_ERROR_ACK:
2597 		dprintso(so, 0,
2598 		    ("strsock_proto: T_ERROR_ACK for %d, error %d/%d\n",
2599 		    tpr->error_ack.ERROR_prim,
2600 		    tpr->error_ack.TLI_error,
2601 		    tpr->error_ack.UNIX_error));
2602 
2603 		if (MBLKL(mp) < sizeof (struct T_error_ack)) {
2604 			zcmn_err(getzoneid(), CE_WARN,
2605 			    "sockfs: Too short T_ERROR_ACK. Len = %ld\n",
2606 			    (ptrdiff_t)(MBLKL(mp)));
2607 			freemsg(mp);
2608 			return (NULL);
2609 		}
2610 		/*
2611 		 * Check if we were waiting for the async message
2612 		 */
2613 		mutex_enter(&so->so_lock);
2614 		if ((so->so_flag & SOASYNC_UNBIND) &&
2615 		    tpr->error_ack.ERROR_prim == T_UNBIND_REQ) {
2616 			so_unlock_single(so, SOASYNC_UNBIND);
2617 			mutex_exit(&so->so_lock);
2618 			freemsg(mp);
2619 			return (NULL);
2620 		}
2621 		mutex_exit(&so->so_lock);
2622 		soqueueack(so, mp);
2623 		return (NULL);
2624 
2625 	case T_OK_ACK:
2626 		if (MBLKL(mp) < sizeof (struct T_ok_ack)) {
2627 			zcmn_err(getzoneid(), CE_WARN,
2628 			    "sockfs: Too short T_OK_ACK. Len = %ld\n",
2629 			    (ptrdiff_t)(MBLKL(mp)));
2630 			freemsg(mp);
2631 			return (NULL);
2632 		}
2633 		/*
2634 		 * Check if we were waiting for the async message
2635 		 */
2636 		mutex_enter(&so->so_lock);
2637 		if ((so->so_flag & SOASYNC_UNBIND) &&
2638 		    tpr->ok_ack.CORRECT_prim == T_UNBIND_REQ) {
2639 			dprintso(so, 1,
2640 			    ("strsock_proto: T_OK_ACK async unbind\n"));
2641 			so_unlock_single(so, SOASYNC_UNBIND);
2642 			mutex_exit(&so->so_lock);
2643 			freemsg(mp);
2644 			return (NULL);
2645 		}
2646 		mutex_exit(&so->so_lock);
2647 		soqueueack(so, mp);
2648 		return (NULL);
2649 
2650 	case T_INFO_ACK:
2651 		if (MBLKL(mp) < sizeof (struct T_info_ack)) {
2652 			zcmn_err(getzoneid(), CE_WARN,
2653 			    "sockfs: Too short T_INFO_ACK. Len = %ld\n",
2654 			    (ptrdiff_t)(MBLKL(mp)));
2655 			freemsg(mp);
2656 			return (NULL);
2657 		}
2658 		soqueueack(so, mp);
2659 		return (NULL);
2660 
2661 	case T_CAPABILITY_ACK:
2662 		/*
2663 		 * A T_capability_ack need only be large enough to hold
2664 		 * the PRIM_type and CAP_bits1 fields; checking for anything
2665 		 * larger might reject a correct response from an older
2666 		 * provider.
2667 		 */
2668 		if (MBLKL(mp) < 2 * sizeof (t_uscalar_t)) {
2669 			zcmn_err(getzoneid(), CE_WARN,
2670 			    "sockfs: Too short T_CAPABILITY_ACK. Len = %ld\n",
2671 			    (ptrdiff_t)(MBLKL(mp)));
2672 			freemsg(mp);
2673 			return (NULL);
2674 		}
2675 		soqueueack(so, mp);
2676 		return (NULL);
2677 
2678 	case T_BIND_ACK:
2679 		if (MBLKL(mp) < sizeof (struct T_bind_ack)) {
2680 			zcmn_err(getzoneid(), CE_WARN,
2681 			    "sockfs: Too short T_BIND_ACK. Len = %ld\n",
2682 			    (ptrdiff_t)(MBLKL(mp)));
2683 			freemsg(mp);
2684 			return (NULL);
2685 		}
2686 		soqueueack(so, mp);
2687 		return (NULL);
2688 
2689 	case T_OPTMGMT_ACK:
2690 		if (MBLKL(mp) < sizeof (struct T_optmgmt_ack)) {
2691 			zcmn_err(getzoneid(), CE_WARN,
2692 			    "sockfs: Too short T_OPTMGMT_ACK. Len = %ld\n",
2693 			    (ptrdiff_t)(MBLKL(mp)));
2694 			freemsg(mp);
2695 			return (NULL);
2696 		}
2697 		soqueueack(so, mp);
2698 		return (NULL);
2699 	default:
2700 #ifdef DEBUG
2701 		zcmn_err(getzoneid(), CE_WARN,
2702 		    "sockfs: unknown TPI primitive %d received\n",
2703 		    tpr->type);
2704 #endif /* DEBUG */
2705 		freemsg(mp);
2706 		return (NULL);
2707 	}
2708 }
2709 
2710 /*
2711  * This routine is registered with the stream head to receive other
2712  * (non-data, and non-proto) messages.
2713  *
2714  * Returns NULL if the message was consumed.
2715  * Returns an mblk to make that mblk be processed by the stream head.
2716  *
2717  * Sets the return parameters (*wakeups, *firstmsgsigs, *allmsgsigs, and
2718  * *pollwakeups) for the stream head to take action on.
2719  */
2720 static mblk_t *
2721 strsock_misc(vnode_t *vp, mblk_t *mp,
2722 		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
2723 		strsigset_t *allmsgsigs, strpollset_t *pollwakeups)
2724 {
2725 	struct sonode *so;
2726 	sotpi_info_t *sti;
2727 
2728 	so = VTOSO(vp);
2729 	sti = SOTOTPI(so);
2730 
2731 	dprintso(so, 1, ("strsock_misc(%p, %p, 0x%x)\n",
2732 	    (void *)vp, (void *)mp, DB_TYPE(mp)));
2733 
2734 	/* Set default return values */
2735 	*wakeups = *allmsgsigs = *firstmsgsigs = *pollwakeups = 0;
2736 
2737 	switch (DB_TYPE(mp)) {
2738 	case M_PCSIG:
2739 		/*
2740 		 * This assumes that an M_PCSIG for the urgent data arrives
2741 		 * before the corresponding T_EXDATA_IND.
2742 		 *
2743 		 * Note: Just like in SunOS 4.X and 4.4BSD a poll will be
2744 		 * awoken before the urgent data shows up.
2745 		 * For OOBINLINE this can result in select returning
2746 		 * only exceptions as opposed to except|read.
2747 		 */
2748 		if (*mp->b_rptr == SIGURG) {
2749 			mutex_enter(&so->so_lock);
2750 			dprintso(so, 1,
2751 			    ("SIGURG(%p): counts %d/%d state %s\n",
2752 			    (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt,
2753 			    pr_state(so->so_state, so->so_mode)));
2754 			so_oob_sig(so, 1, allmsgsigs, pollwakeups);
2755 			dprintso(so, 1,
2756 			    ("after SIGURG(%p): counts %d/%d "
2757 			    " poll 0x%x sig 0x%x state %s\n",
2758 			    (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt,
2759 			    *pollwakeups, *allmsgsigs,
2760 			    pr_state(so->so_state, so->so_mode)));
2761 			mutex_exit(&so->so_lock);
2762 		}
2763 		freemsg(mp);
2764 		return (NULL);
2765 
2766 	case M_SIG:
2767 	case M_HANGUP:
2768 	case M_UNHANGUP:
2769 	case M_ERROR:
2770 		/* M_ERRORs etc are ignored */
2771 		freemsg(mp);
2772 		return (NULL);
2773 
2774 	case M_FLUSH:
2775 		/*
2776 		 * Do not flush read queue. If the M_FLUSH
2777 		 * arrives because of an impending T_discon_ind
2778 		 * we still have to keep any queued data - this is part of
2779 		 * socket semantics.
2780 		 */
2781 		if (*mp->b_rptr & FLUSHW) {
2782 			*mp->b_rptr &= ~FLUSHR;
2783 			return (mp);
2784 		}
2785 		freemsg(mp);
2786 		return (NULL);
2787 
2788 	default:
2789 		return (mp);
2790 	}
2791 }
2792 
2793 
2794 /* Register to receive signals for certain events */
2795 int
2796 so_set_asyncsigs(vnode_t *vp, pid_t pgrp, int events, int mode, cred_t *cr)
2797 {
2798 	struct strsigset ss;
2799 	int32_t rval;
2800 
2801 	/*
2802 	 * Note that SOLOCKED will be set except for the call from soaccept().
2803 	 */
2804 	ASSERT(!mutex_owned(&VTOSO(vp)->so_lock));
2805 	ss.ss_pid = pgrp;
2806 	ss.ss_events = events;
2807 	return (strioctl(vp, I_ESETSIG, (intptr_t)&ss, mode, K_TO_K, cr,
2808 	    &rval));
2809 }
2810 
2811 
2812 /* Register for events matching the SS_ASYNC flag */
2813 int
2814 so_set_events(struct sonode *so, vnode_t *vp, cred_t *cr)
2815 {
2816 	int events = so->so_state & SS_ASYNC ?
2817 	    S_RDBAND | S_BANDURG | S_RDNORM | S_OUTPUT :
2818 	    S_RDBAND | S_BANDURG;
2819 
2820 	return (so_set_asyncsigs(vp, so->so_pgrp, events, 0, cr));
2821 }
2822 
2823 
2824 /* Change the SS_ASYNC flag, and update signal delivery if needed */
2825 int
2826 so_flip_async(struct sonode *so, vnode_t *vp, int mode, cred_t *cr)
2827 {
2828 	ASSERT(mutex_owned(&so->so_lock));
2829 	if (so->so_pgrp != 0) {
2830 		int error;
2831 		int events = so->so_state & SS_ASYNC ?		/* Old flag */
2832 		    S_RDBAND | S_BANDURG :			/* New sigs */
2833 		    S_RDBAND | S_BANDURG | S_RDNORM | S_OUTPUT;
2834 
2835 		so_lock_single(so);
2836 		mutex_exit(&so->so_lock);
2837 
2838 		error = so_set_asyncsigs(vp, so->so_pgrp, events, mode, cr);
2839 
2840 		mutex_enter(&so->so_lock);
2841 		so_unlock_single(so, SOLOCKED);
2842 		if (error)
2843 			return (error);
2844 	}
2845 	so->so_state ^= SS_ASYNC;
2846 	return (0);
2847 }
2848 
2849 /*
2850  * Set new pid/pgrp for SIGPOLL (or SIGIO for FIOASYNC mode), replacing
2851  * any existing one.  If passed zero, just clear the existing one.
2852  */
2853 int
2854 so_set_siggrp(struct sonode *so, vnode_t *vp, pid_t pgrp, int mode, cred_t *cr)
2855 {
2856 	int events = so->so_state & SS_ASYNC ?
2857 	    S_RDBAND | S_BANDURG | S_RDNORM | S_OUTPUT :
2858 	    S_RDBAND | S_BANDURG;
2859 	int error;
2860 
2861 	ASSERT(mutex_owned(&so->so_lock));
2862 
2863 	/*
2864 	 * Change socket process (group).
2865 	 *
2866 	 * strioctl (via so_set_asyncsigs) will perform permission check and
2867 	 * also keep a PID_HOLD to prevent the pid from being reused.
2868 	 */
2869 	so_lock_single(so);
2870 	mutex_exit(&so->so_lock);
2871 
2872 	if (pgrp != 0) {
2873 		dprintso(so, 1, ("setown: adding pgrp %d ev 0x%x\n",
2874 		    pgrp, events));
2875 		error = so_set_asyncsigs(vp, pgrp, events, mode, cr);
2876 		if (error != 0) {
2877 			eprintsoline(so, error);
2878 			goto bad;
2879 		}
2880 	}
2881 	/* Remove the previously registered process/group */
2882 	if (so->so_pgrp != 0) {
2883 		dprintso(so, 1, ("setown: removing pgrp %d\n", so->so_pgrp));
2884 		error = so_set_asyncsigs(vp, so->so_pgrp, 0, mode, cr);
2885 		if (error != 0) {
2886 			eprintsoline(so, error);
2887 			error = 0;
2888 		}
2889 	}
2890 	mutex_enter(&so->so_lock);
2891 	so_unlock_single(so, SOLOCKED);
2892 	so->so_pgrp = pgrp;
2893 	return (0);
2894 bad:
2895 	mutex_enter(&so->so_lock);
2896 	so_unlock_single(so, SOLOCKED);
2897 	return (error);
2898 }
2899 
2900 /*
2901  * Wrapper for getmsg. If the socket has been converted to a stream
2902  * pass the request to the stream head.
2903  */
2904 int
2905 sock_getmsg(
2906 	struct vnode *vp,
2907 	struct strbuf *mctl,
2908 	struct strbuf *mdata,
2909 	uchar_t *prip,
2910 	int *flagsp,
2911 	int fmode,
2912 	rval_t *rvp
2913 )
2914 {
2915 	struct sonode *so;
2916 
2917 	ASSERT(vp->v_type == VSOCK);
2918 	/*
2919 	 * Use the stream head to find the real socket vnode.
2920 	 * This is needed when namefs sits above sockfs.  Some
2921 	 * sockets (like SCTP) are not streams.
2922 	 */
2923 	if (!vp->v_stream) {
2924 		return (ENOSTR);
2925 	}
2926 	ASSERT(vp->v_stream->sd_vnode);
2927 	vp = vp->v_stream->sd_vnode;
2928 	ASSERT(vn_matchops(vp, socket_vnodeops));
2929 	so = VTOSO(vp);
2930 
2931 	dprintso(so, 1, ("sock_getmsg(%p) %s\n",
2932 	    (void *)so, pr_state(so->so_state, so->so_mode)));
2933 
2934 	if (so->so_version == SOV_STREAM) {
2935 		/* The imaginary "sockmod" has been popped - act as a stream */
2936 		return (strgetmsg(vp, mctl, mdata, prip, flagsp, fmode, rvp));
2937 	}
2938 	eprintsoline(so, ENOSTR);
2939 	return (ENOSTR);
2940 }
2941 
2942 /*
2943  * Wrapper for putmsg. If the socket has been converted to a stream
2944  * pass the request to the stream head.
2945  *
2946  * Note that a while a regular socket (SOV_SOCKSTREAM) does support the
2947  * streams ioctl set it does not support putmsg and getmsg.
2948  * Allowing putmsg would prevent sockfs from tracking the state of
2949  * the socket/transport and would also invalidate the locking in sockfs.
2950  */
2951 int
2952 sock_putmsg(
2953 	struct vnode *vp,
2954 	struct strbuf *mctl,
2955 	struct strbuf *mdata,
2956 	uchar_t pri,
2957 	int flag,
2958 	int fmode
2959 )
2960 {
2961 	struct sonode *so;
2962 
2963 	ASSERT(vp->v_type == VSOCK);
2964 	/*
2965 	 * Use the stream head to find the real socket vnode.
2966 	 * This is needed when namefs sits above sockfs.
2967 	 */
2968 	if (!vp->v_stream) {
2969 		return (ENOSTR);
2970 	}
2971 	ASSERT(vp->v_stream->sd_vnode);
2972 	vp = vp->v_stream->sd_vnode;
2973 	ASSERT(vn_matchops(vp, socket_vnodeops));
2974 	so = VTOSO(vp);
2975 
2976 	dprintso(so, 1, ("sock_putmsg(%p) %s\n",
2977 	    (void *)so, pr_state(so->so_state, so->so_mode)));
2978 
2979 	if (so->so_version == SOV_STREAM) {
2980 		/* The imaginary "sockmod" has been popped - act as a stream */
2981 		return (strputmsg(vp, mctl, mdata, pri, flag, fmode));
2982 	}
2983 	eprintsoline(so, ENOSTR);
2984 	return (ENOSTR);
2985 }
2986 
2987 /*
2988  * Special function called only from f_getfl().
2989  * Returns FASYNC if the SS_ASYNC flag is set on a socket, else 0.
2990  * No locks are acquired here, so it is safe to use while uf_lock is held.
2991  * This exists solely for BSD fcntl() FASYNC compatibility.
2992  */
2993 int
2994 sock_getfasync(vnode_t *vp)
2995 {
2996 	struct sonode *so;
2997 
2998 	ASSERT(vp->v_type == VSOCK);
2999 	/*
3000 	 * For stream model, v_stream is used; For non-stream, v_stream always
3001 	 * equals NULL
3002 	 */
3003 	if (vp->v_stream != NULL)
3004 		so = VTOSO(vp->v_stream->sd_vnode);
3005 	else
3006 		so = VTOSO(vp);
3007 
3008 	if (so->so_version == SOV_STREAM || !(so->so_state & SS_ASYNC))
3009 		return (0);
3010 
3011 	return (FASYNC);
3012 }
3013 
3014 /*
3015  * Sockfs sodirect STREAMS read put procedure. Called from sodirect enable
3016  * transport driver/module with an mblk_t chain.
3017  *
3018  * Note, we in-line putq() for the fast-path cases of q is empty, q_last and
3019  * bp are of type M_DATA. All other cases we call putq().
3020  *
3021  * On success a zero will be return, else an errno will be returned.
3022  */
3023 int
3024 sodput(sodirect_t *sodp, mblk_t *bp)
3025 {
3026 	queue_t		*q = sodp->sod_q;
3027 	struct stdata	*stp = (struct stdata *)q->q_ptr;
3028 	mblk_t		*nbp;
3029 	mblk_t		*last = q->q_last;
3030 	int		bytecnt = 0;
3031 	int		mblkcnt = 0;
3032 
3033 
3034 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
3035 
3036 	if (stp->sd_flag == STREOF) {
3037 		do {
3038 			if ((nbp = bp->b_next) != NULL)
3039 				bp->b_next = NULL;
3040 			freemsg(bp);
3041 		} while ((bp = nbp) != NULL);
3042 
3043 		return (0);
3044 	}
3045 
3046 	mutex_enter(QLOCK(q));
3047 	if (q->q_first == NULL) {
3048 		/* Q empty, really fast fast-path */
3049 		bp->b_prev = NULL;
3050 		bp->b_next = NULL;
3051 		q->q_first = bp;
3052 		q->q_last = bp;
3053 
3054 	} else if (last->b_datap->db_type == M_DATA &&
3055 	    bp->b_datap->db_type == M_DATA) {
3056 		/*
3057 		 * Last mblk_t chain and bp are both type M_DATA so
3058 		 * in-line putq() here, if the DBLK_UIOA state match
3059 		 * add bp to the end of the current last chain, else
3060 		 * start a new last chain with bp.
3061 		 */
3062 		if ((last->b_datap->db_flags & DBLK_UIOA) ==
3063 		    (bp->b_datap->db_flags & DBLK_UIOA)) {
3064 			/* Added to end */
3065 			while ((nbp = last->b_cont) != NULL)
3066 				last = nbp;
3067 			last->b_cont = bp;
3068 		} else {
3069 			/* New last */
3070 			ASSERT((bp->b_datap->db_flags & DBLK_UIOA) == 0 ||
3071 			    msgdsize(bp) == sodp->sod_uioa.uioa_mbytes);
3072 			last->b_next = bp;
3073 			bp->b_next = NULL;
3074 			bp->b_prev = last;
3075 			q->q_last = bp;
3076 		}
3077 	} else {
3078 		/*
3079 		 * Can't use q_last so just call putq().
3080 		 */
3081 		mutex_exit(QLOCK(q));
3082 
3083 		ASSERT((bp->b_datap->db_flags & DBLK_UIOA) == 0 ||
3084 		    msgdsize(bp) == sodp->sod_uioa.uioa_mbytes);
3085 		(void) putq(q, bp);
3086 		return (0);
3087 	}
3088 
3089 	/* Count bytes and mblk_t's */
3090 	do {
3091 		bytecnt += MBLKL(bp);
3092 		mblkcnt++;
3093 	} while ((bp = bp->b_cont) != NULL);
3094 	q->q_count += bytecnt;
3095 	q->q_mblkcnt += mblkcnt;
3096 
3097 	/* Check for QFULL */
3098 	if (q->q_count >= q->q_hiwat + sodp->sod_want ||
3099 	    q->q_mblkcnt >= q->q_hiwat) {
3100 		q->q_flag |= QFULL;
3101 	}
3102 
3103 	mutex_exit(QLOCK(q));
3104 	return (0);
3105 }
3106 
3107 /*
3108  * Sockfs sodirect read wakeup. Called from a sodirect enabled transport
3109  * driver/module to indicate that read-side data is available.
3110  *
3111  * On return the sodirect_t.lock mutex will be exited so this must be the
3112  * last sodirect_t call to guarantee atomic access of *sodp.
3113  */
3114 void
3115 sodwakeup(sodirect_t *sodp)
3116 {
3117 	queue_t		*q = sodp->sod_q;
3118 	struct stdata	*stp = (struct stdata *)q->q_ptr;
3119 
3120 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
3121 
3122 	if (stp->sd_flag & RSLEEP) {
3123 		stp->sd_flag &= ~RSLEEP;
3124 		cv_broadcast(&q->q_wait);
3125 	}
3126 
3127 	if (stp->sd_rput_opt & SR_POLLIN) {
3128 		stp->sd_rput_opt &= ~SR_POLLIN;
3129 		mutex_exit(sodp->sod_lockp);
3130 		pollwakeup(&stp->sd_pollist, POLLIN | POLLRDNORM);
3131 	} else
3132 		mutex_exit(sodp->sod_lockp);
3133 }
3134