xref: /titanic_52/usr/src/uts/common/fs/sockfs/sockstr.c (revision 209e49b2ff611e7d61ff58e13756ae67f51be550)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/inttypes.h>
28 #include <sys/t_lock.h>
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/buf.h>
32 #include <sys/conf.h>
33 #include <sys/cred.h>
34 #include <sys/kmem.h>
35 #include <sys/sysmacros.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/debug.h>
39 #include <sys/errno.h>
40 #include <sys/time.h>
41 #include <sys/file.h>
42 #include <sys/user.h>
43 #include <sys/stream.h>
44 #include <sys/strsubr.h>
45 #include <sys/esunddi.h>
46 #include <sys/flock.h>
47 #include <sys/modctl.h>
48 #include <sys/vtrace.h>
49 #include <sys/strsun.h>
50 #include <sys/cmn_err.h>
51 #include <sys/proc.h>
52 #include <sys/ddi.h>
53 
54 #include <sys/suntpi.h>
55 #include <sys/socket.h>
56 #include <sys/sockio.h>
57 #include <sys/socketvar.h>
58 #include <netinet/in.h>
59 #include <inet/common.h>
60 #include <inet/proto_set.h>
61 
62 #include <sys/tiuser.h>
63 #define	_SUN_TPI_VERSION	2
64 #include <sys/tihdr.h>
65 
66 #include <c2/audit.h>
67 
68 #include <fs/sockfs/socktpi.h>
69 #include <fs/sockfs/socktpi_impl.h>
70 
int so_default_version = SOV_SOCKSTREAM;

#ifdef DEBUG
/* Set sockdebug to print debug messages when SO_DEBUG is set */
int sockdebug = 0;

/* Set sockprinterr to print error messages when SO_DEBUG is set */
int sockprinterr = 0;

/*
 * Set so_default_options to SO_DEBUG if all sockets should be created
 * with SO_DEBUG set. This is needed to get debug printouts from the
 * socket() call itself.
 */
int so_default_options = 0;
#endif /* DEBUG */

#ifdef SOCK_TEST
/*
 * Set to number of ticks to limit cv_waits for code coverage testing.
 * Set to 1000 when SO_DEBUG is set to 2.
 */
clock_t sock_test_timelimit = 0;
#endif /* SOCK_TEST */

/*
 * For concurrency testing of e.g. opening /dev/ip which does not
 * handle T_INFO_REQ messages.
 * When nonzero, do_tinfo() and do_tcapability() skip the info request
 * and set sti_addr_size to 0 instead.
 */
int so_no_tinfo = 0;

/*
 * Timeout for getting a T_CAPABILITY_ACK - it is possible for a provider
 * to simply ignore the T_CAPABILITY_REQ.
 * The value is multiplied by hz before being passed to sowaitprim().
 */
clock_t	sock_capability_timeout	= 2;	/* seconds */

/* Forward declarations for routines defined later in this file. */
static int	do_tcapability(struct sonode *so, t_uscalar_t cap_bits1);
static void	so_removehooks(struct sonode *so);

/* Read-side hooks handed to strsetrputhooks() by so_installhooks(). */
static mblk_t *strsock_proto(vnode_t *vp, mblk_t *mp,
		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
		strsigset_t *allmsgsigs, strpollset_t *pollwakeups);
static mblk_t *strsock_misc(vnode_t *vp, mblk_t *mp,
		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
		strsigset_t *allmsgsigs, strpollset_t *pollwakeups);
117 
118 /*
119  * Convert a socket to a stream. Invoked when the illusory sockmod
120  * is popped from the stream.
121  * Change the stream head back to default operation without losing
122  * any messages (T_conn_ind's are moved to the stream head queue).
123  */
124 int
125 so_sock2stream(struct sonode *so)
126 {
127 	struct vnode		*vp = SOTOV(so);
128 	queue_t			*rq;
129 	mblk_t			*mp;
130 	int			error = 0;
131 	sotpi_info_t		*sti = SOTOTPI(so);
132 
133 	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
134 
135 	mutex_enter(&so->so_lock);
136 	so_lock_single(so);
137 
138 	ASSERT(so->so_version != SOV_STREAM);
139 
140 	if (sti->sti_direct) {
141 		mblk_t **mpp;
142 		int rval;
143 
144 		/*
145 		 * Tell the transport below that sockmod is being popped
146 		 */
147 		mutex_exit(&so->so_lock);
148 		error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K, CRED(),
149 		    &rval);
150 		mutex_enter(&so->so_lock);
151 		if (error != 0) {
152 			dprintso(so, 0, ("so_sock2stream(%p): "
153 			    "_SIOCSOCKFALLBACK failed\n", (void *)so));
154 			goto exit;
155 		}
156 		sti->sti_direct = 0;
157 
158 		for (mpp = &sti->sti_conn_ind_head; (mp = *mpp) != NULL;
159 		    mpp = &mp->b_next) {
160 			struct T_conn_ind	*conn_ind;
161 
162 			/*
163 			 * strsock_proto() has already verified the length of
164 			 * this message block.
165 			 */
166 			ASSERT(MBLKL(mp) >= sizeof (struct T_conn_ind));
167 
168 			conn_ind = (struct T_conn_ind *)mp->b_rptr;
169 			if (conn_ind->OPT_length == 0 &&
170 			    conn_ind->OPT_offset == 0)
171 				continue;
172 
173 			if (DB_REF(mp) > 1) {
174 				mblk_t	*newmp;
175 				size_t	length;
176 				cred_t	*cr;
177 				pid_t	cpid;
178 				int error;	/* Dummy - error not returned */
179 
180 				/*
181 				 * Copy the message block because it is used
182 				 * elsewhere, too.
183 				 * Can't use copyb since we want to wait
184 				 * yet allow for EINTR.
185 				 */
186 				/* Round up size for reuse */
187 				length = MAX(MBLKL(mp), 64);
188 				cr = msg_getcred(mp, &cpid);
189 				if (cr != NULL) {
190 					newmp = allocb_cred_wait(length, 0,
191 					    &error, cr, cpid);
192 				} else {
193 					newmp = allocb_wait(length, 0, 0,
194 					    &error);
195 				}
196 				if (newmp == NULL) {
197 					error = EINTR;
198 					goto exit;
199 				}
200 				bcopy(mp->b_rptr, newmp->b_wptr, length);
201 				newmp->b_wptr += length;
202 				newmp->b_next = mp->b_next;
203 
204 				/*
205 				 * Link the new message block into the queue
206 				 * and free the old one.
207 				 */
208 				*mpp = newmp;
209 				mp->b_next = NULL;
210 				freemsg(mp);
211 
212 				mp = newmp;
213 				conn_ind = (struct T_conn_ind *)mp->b_rptr;
214 			}
215 
216 			/*
217 			 * Remove options added by TCP for accept fast-path.
218 			 */
219 			conn_ind->OPT_length = 0;
220 			conn_ind->OPT_offset = 0;
221 		}
222 	}
223 
224 	so->so_version = SOV_STREAM;
225 	so->so_proto_handle = NULL;
226 
227 	/*
228 	 * Remove the hooks in the stream head to avoid queuing more
229 	 * packets in sockfs.
230 	 */
231 	mutex_exit(&so->so_lock);
232 	so_removehooks(so);
233 	mutex_enter(&so->so_lock);
234 
235 	/*
236 	 * Clear any state related to urgent data. Leave any T_EXDATA_IND
237 	 * on the queue - the behavior of urgent data after a switch is
238 	 * left undefined.
239 	 */
240 	so->so_error = sti->sti_delayed_error = 0;
241 	freemsg(so->so_oobmsg);
242 	so->so_oobmsg = NULL;
243 	sti->sti_oobsigcnt = sti->sti_oobcnt = 0;
244 
245 	so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|
246 	    SS_SAVEDEOR);
247 	ASSERT(so_verify_oobstate(so));
248 
249 	freemsg(sti->sti_ack_mp);
250 	sti->sti_ack_mp = NULL;
251 
252 	/*
253 	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
254 	 */
255 	so_flush_discon_ind(so);
256 
257 	/*
258 	 * Move any queued T_CONN_IND messages to stream head queue.
259 	 */
260 	rq = RD(strvp2wq(vp));
261 	while ((mp = sti->sti_conn_ind_head) != NULL) {
262 		sti->sti_conn_ind_head = mp->b_next;
263 		mp->b_next = NULL;
264 		if (sti->sti_conn_ind_head == NULL) {
265 			ASSERT(sti->sti_conn_ind_tail == mp);
266 			sti->sti_conn_ind_tail = NULL;
267 		}
268 		dprintso(so, 0,
269 		    ("so_sock2stream(%p): moving T_CONN_IND\n", (void *)so));
270 
271 		/* Drop lock across put() */
272 		mutex_exit(&so->so_lock);
273 		put(rq, mp);
274 		mutex_enter(&so->so_lock);
275 	}
276 
277 exit:
278 	ASSERT(MUTEX_HELD(&so->so_lock));
279 	so_unlock_single(so, SOLOCKED);
280 	mutex_exit(&so->so_lock);
281 	return (error);
282 }
283 
284 /*
285  * Covert a stream back to a socket. This is invoked when the illusory
286  * sockmod is pushed on a stream (where the stream was "created" by
287  * popping the illusory sockmod).
288  * This routine can not recreate the socket state (certain aspects of
289  * it like urgent data state and the bound/connected addresses for AF_UNIX
290  * sockets can not be recreated by asking the transport for information).
291  * Thus this routine implicitly assumes that the socket is in an initial
292  * state (as if it was just created). It flushes any messages queued on the
293  * read queue to avoid dealing with e.g. TPI acks or T_exdata_ind messages.
294  */
295 void
296 so_stream2sock(struct sonode *so)
297 {
298 	struct vnode *vp = SOTOV(so);
299 	sotpi_info_t *sti = SOTOTPI(so);
300 
301 	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
302 
303 	mutex_enter(&so->so_lock);
304 	so_lock_single(so);
305 	ASSERT(so->so_version == SOV_STREAM);
306 	so->so_version = SOV_SOCKSTREAM;
307 	sti->sti_pushcnt = 0;
308 	mutex_exit(&so->so_lock);
309 
310 	/*
311 	 * Set a permenent error to force any thread in sorecvmsg to
312 	 * return (and drop SOREADLOCKED). Clear the error once
313 	 * we have SOREADLOCKED.
314 	 * This makes a read sleeping during the I_PUSH of sockmod return
315 	 * EIO.
316 	 */
317 	strsetrerror(SOTOV(so), EIO, 1, NULL);
318 
319 	/*
320 	 * Get the read lock before flushing data to avoid
321 	 * problems with the T_EXDATA_IND MSG_PEEK code in sorecvmsg.
322 	 */
323 	mutex_enter(&so->so_lock);
324 	(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
325 	mutex_exit(&so->so_lock);
326 
327 	strsetrerror(SOTOV(so), 0, 0, NULL);
328 	so_installhooks(so);
329 
330 	/*
331 	 * Flush everything on the read queue.
332 	 * This ensures that no T_CONN_IND remain and that no T_EXDATA_IND
333 	 * remain; those types of messages would confuse sockfs.
334 	 */
335 	strflushrq(vp, FLUSHALL);
336 	mutex_enter(&so->so_lock);
337 
338 	/*
339 	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
340 	 */
341 	so_flush_discon_ind(so);
342 	so_unlock_read(so);	/* Clear SOREADLOCKED */
343 
344 	so_unlock_single(so, SOLOCKED);
345 	mutex_exit(&so->so_lock);
346 }
347 
348 /*
349  * Install the hooks in the stream head.
350  */
351 void
352 so_installhooks(struct sonode *so)
353 {
354 	struct vnode *vp = SOTOV(so);
355 
356 	strsetrputhooks(vp, SH_SIGALLDATA | SH_IGN_ZEROLEN | SH_CONSOL_DATA,
357 	    strsock_proto, strsock_misc);
358 	strsetwputhooks(vp, SH_SIGPIPE | SH_RECHECK_ERR, 0);
359 }
360 
361 /*
362  * Remove the hooks in the stream head.
363  */
364 static void
365 so_removehooks(struct sonode *so)
366 {
367 	struct vnode *vp = SOTOV(so);
368 
369 	strsetrputhooks(vp, 0, NULL, NULL);
370 	strsetwputhooks(vp, 0, STRTIMOUT);
371 	/*
372 	 * Leave read behavior as it would have been for a normal
373 	 * stream i.e. a read of an M_PROTO will fail.
374 	 */
375 }
376 
377 void
378 so_basic_strinit(struct sonode *so)
379 {
380 	struct vnode *vp = SOTOV(so);
381 	struct stdata *stp;
382 	mblk_t *mp;
383 	sotpi_info_t *sti = SOTOTPI(so);
384 
385 	/* Preallocate an unbind_req message */
386 	mp = soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP, CRED());
387 	mutex_enter(&so->so_lock);
388 	sti->sti_unbind_mp = mp;
389 #ifdef DEBUG
390 	so->so_options = so_default_options;
391 #endif /* DEBUG */
392 	mutex_exit(&so->so_lock);
393 
394 	so_installhooks(so);
395 
396 	stp = vp->v_stream;
397 	/*
398 	 * Have to keep minpsz at zero in order to allow write/send of zero
399 	 * bytes.
400 	 */
401 	mutex_enter(&stp->sd_lock);
402 	if (stp->sd_qn_minpsz == 1)
403 		stp->sd_qn_minpsz = 0;
404 	mutex_exit(&stp->sd_lock);
405 }
406 
407 /*
408  * Initialize the streams side of a socket including
409  * T_info_req/ack processing. If tso is not NULL its values are used thereby
410  * avoiding the T_INFO_REQ.
411  */
412 int
413 so_strinit(struct sonode *so, struct sonode *tso)
414 {
415 	sotpi_info_t *sti = SOTOTPI(so);
416 	sotpi_info_t *tsti;
417 	int error;
418 
419 	so_basic_strinit(so);
420 
421 	/*
422 	 * The T_CAPABILITY_REQ should be the first message sent down because
423 	 * at least TCP has a fast-path for this which avoids timeouts while
424 	 * waiting for the T_CAPABILITY_ACK under high system load.
425 	 */
426 	if (tso == NULL) {
427 		error = do_tcapability(so, TC1_ACCEPTOR_ID | TC1_INFO);
428 		if (error)
429 			return (error);
430 	} else {
431 		tsti = SOTOTPI(tso);
432 
433 		mutex_enter(&so->so_lock);
434 		sti->sti_tsdu_size = tsti->sti_tsdu_size;
435 		sti->sti_etsdu_size = tsti->sti_etsdu_size;
436 		sti->sti_addr_size = tsti->sti_addr_size;
437 		sti->sti_opt_size = tsti->sti_opt_size;
438 		sti->sti_tidu_size = tsti->sti_tidu_size;
439 		sti->sti_serv_type = tsti->sti_serv_type;
440 		so->so_mode = tso->so_mode & ~SM_ACCEPTOR_ID;
441 		mutex_exit(&so->so_lock);
442 
443 		/* the following do_tcapability may update so->so_mode */
444 		if ((tsti->sti_serv_type != T_CLTS) &&
445 		    (sti->sti_direct == 0)) {
446 			error = do_tcapability(so, TC1_ACCEPTOR_ID);
447 			if (error)
448 				return (error);
449 		}
450 	}
451 	/*
452 	 * If the addr_size is 0 we treat it as already bound
453 	 * and connected. This is used by the routing socket.
454 	 * We set the addr_size to something to allocate a the address
455 	 * structures.
456 	 */
457 	if (sti->sti_addr_size == 0) {
458 		so->so_state |= SS_ISBOUND | SS_ISCONNECTED;
459 		/* Address size can vary with address families. */
460 		if (so->so_family == AF_INET6)
461 			sti->sti_addr_size =
462 			    (t_scalar_t)sizeof (struct sockaddr_in6);
463 		else
464 			sti->sti_addr_size =
465 			    (t_scalar_t)sizeof (struct sockaddr_in);
466 		ASSERT(sti->sti_unbind_mp);
467 	}
468 
469 	so_alloc_addr(so, sti->sti_addr_size);
470 
471 	return (0);
472 }
473 
474 static void
475 copy_tinfo(struct sonode *so, struct T_info_ack *tia)
476 {
477 	sotpi_info_t *sti = SOTOTPI(so);
478 
479 	sti->sti_tsdu_size = tia->TSDU_size;
480 	sti->sti_etsdu_size = tia->ETSDU_size;
481 	sti->sti_addr_size = tia->ADDR_size;
482 	sti->sti_opt_size = tia->OPT_size;
483 	sti->sti_tidu_size = tia->TIDU_size;
484 	sti->sti_serv_type = tia->SERV_type;
485 	switch (tia->CURRENT_state) {
486 	case TS_UNBND:
487 		break;
488 	case TS_IDLE:
489 		so->so_state |= SS_ISBOUND;
490 		sti->sti_laddr_len = 0;
491 		sti->sti_laddr_valid = 0;
492 		break;
493 	case TS_DATA_XFER:
494 		so->so_state |= SS_ISBOUND|SS_ISCONNECTED;
495 		sti->sti_laddr_len = 0;
496 		sti->sti_faddr_len = 0;
497 		sti->sti_laddr_valid = 0;
498 		sti->sti_faddr_valid = 0;
499 		break;
500 	}
501 
502 	/*
503 	 * Heuristics for determining the socket mode flags
504 	 * (SM_ATOMIC, SM_CONNREQUIRED, SM_ADDR, SM_FDPASSING,
505 	 * and SM_EXDATA, SM_OPTDATA, and SM_BYTESTREAM)
506 	 * from the info ack.
507 	 */
508 	if (sti->sti_serv_type == T_CLTS) {
509 		so->so_mode |= SM_ATOMIC | SM_ADDR;
510 	} else {
511 		so->so_mode |= SM_CONNREQUIRED;
512 		if (sti->sti_etsdu_size != 0 && sti->sti_etsdu_size != -2)
513 			so->so_mode |= SM_EXDATA;
514 	}
515 	if (so->so_type == SOCK_SEQPACKET || so->so_type == SOCK_RAW) {
516 		/* Semantics are to discard tail end of messages */
517 		so->so_mode |= SM_ATOMIC;
518 	}
519 	if (so->so_family == AF_UNIX) {
520 		so->so_mode |= SM_FDPASSING | SM_OPTDATA;
521 		if (sti->sti_addr_size == -1) {
522 			/* MAXPATHLEN + soun_family + nul termination */
523 			sti->sti_addr_size = (t_scalar_t)(MAXPATHLEN +
524 			    sizeof (short) + 1);
525 		}
526 		if (so->so_type == SOCK_STREAM) {
527 			/*
528 			 * Make it into a byte-stream transport.
529 			 * SOCK_SEQPACKET sockets are unchanged.
530 			 */
531 			sti->sti_tsdu_size = 0;
532 		}
533 	} else if (sti->sti_addr_size == -1) {
534 		/*
535 		 * Logic extracted from sockmod - have to pick some max address
536 		 * length in order to preallocate the addresses.
537 		 */
538 		sti->sti_addr_size = SOA_DEFSIZE;
539 	}
540 	if (sti->sti_tsdu_size == 0)
541 		so->so_mode |= SM_BYTESTREAM;
542 }
543 
544 static int
545 check_tinfo(struct sonode *so)
546 {
547 	sotpi_info_t *sti = SOTOTPI(so);
548 
549 	/* Consistency checks */
550 	if (so->so_type == SOCK_DGRAM && sti->sti_serv_type != T_CLTS) {
551 		eprintso(so, ("service type and socket type mismatch\n"));
552 		eprintsoline(so, EPROTO);
553 		return (EPROTO);
554 	}
555 	if (so->so_type == SOCK_STREAM && sti->sti_serv_type == T_CLTS) {
556 		eprintso(so, ("service type and socket type mismatch\n"));
557 		eprintsoline(so, EPROTO);
558 		return (EPROTO);
559 	}
560 	if (so->so_type == SOCK_SEQPACKET && sti->sti_serv_type == T_CLTS) {
561 		eprintso(so, ("service type and socket type mismatch\n"));
562 		eprintsoline(so, EPROTO);
563 		return (EPROTO);
564 	}
565 	if (so->so_family == AF_INET &&
566 	    sti->sti_addr_size != (t_scalar_t)sizeof (struct sockaddr_in)) {
567 		eprintso(so,
568 		    ("AF_INET must have sockaddr_in address length. Got %d\n",
569 		    sti->sti_addr_size));
570 		eprintsoline(so, EMSGSIZE);
571 		return (EMSGSIZE);
572 	}
573 	if (so->so_family == AF_INET6 &&
574 	    sti->sti_addr_size != (t_scalar_t)sizeof (struct sockaddr_in6)) {
575 		eprintso(so,
576 		    ("AF_INET6 must have sockaddr_in6 address length. Got %d\n",
577 		    sti->sti_addr_size));
578 		eprintsoline(so, EMSGSIZE);
579 		return (EMSGSIZE);
580 	}
581 
582 	dprintso(so, 1, (
583 	    "tinfo: serv %d tsdu %d, etsdu %d, addr %d, opt %d, tidu %d\n",
584 	    sti->sti_serv_type, sti->sti_tsdu_size, sti->sti_etsdu_size,
585 	    sti->sti_addr_size, sti->sti_opt_size,
586 	    sti->sti_tidu_size));
587 	dprintso(so, 1, ("tinfo: so_state %s\n",
588 	    pr_state(so->so_state, so->so_mode)));
589 	return (0);
590 }
591 
592 /*
593  * Send down T_info_req and wait for the ack.
594  * Record interesting T_info_ack values in the sonode.
595  */
596 static int
597 do_tinfo(struct sonode *so)
598 {
599 	struct T_info_req tir;
600 	mblk_t *mp;
601 	int error;
602 
603 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
604 
605 	if (so_no_tinfo) {
606 		SOTOTPI(so)->sti_addr_size = 0;
607 		return (0);
608 	}
609 
610 	dprintso(so, 1, ("do_tinfo(%p)\n", (void *)so));
611 
612 	/* Send T_INFO_REQ */
613 	tir.PRIM_type = T_INFO_REQ;
614 	mp = soallocproto1(&tir, sizeof (tir),
615 	    sizeof (struct T_info_req) + sizeof (struct T_info_ack),
616 	    _ALLOC_INTR, CRED());
617 	if (mp == NULL) {
618 		eprintsoline(so, ENOBUFS);
619 		return (ENOBUFS);
620 	}
621 	/* T_INFO_REQ has to be M_PCPROTO */
622 	DB_TYPE(mp) = M_PCPROTO;
623 
624 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
625 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
626 	if (error) {
627 		eprintsoline(so, error);
628 		return (error);
629 	}
630 	mutex_enter(&so->so_lock);
631 	/* Wait for T_INFO_ACK */
632 	if ((error = sowaitprim(so, T_INFO_REQ, T_INFO_ACK,
633 	    (t_uscalar_t)sizeof (struct T_info_ack), &mp, 0))) {
634 		mutex_exit(&so->so_lock);
635 		eprintsoline(so, error);
636 		return (error);
637 	}
638 
639 	ASSERT(mp);
640 	copy_tinfo(so, (struct T_info_ack *)mp->b_rptr);
641 	mutex_exit(&so->so_lock);
642 	freemsg(mp);
643 	return (check_tinfo(so));
644 }
645 
646 /*
647  * Send down T_capability_req and wait for the ack.
648  * Record interesting T_capability_ack values in the sonode.
649  */
650 static int
651 do_tcapability(struct sonode *so, t_uscalar_t cap_bits1)
652 {
653 	struct T_capability_req tcr;
654 	struct T_capability_ack *tca;
655 	mblk_t *mp;
656 	int error;
657 	sotpi_info_t *sti = SOTOTPI(so);
658 
659 	ASSERT(cap_bits1 != 0);
660 	ASSERT((cap_bits1 & ~(TC1_ACCEPTOR_ID | TC1_INFO)) == 0);
661 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
662 
663 	if (sti->sti_provinfo->tpi_capability == PI_NO)
664 		return (do_tinfo(so));
665 
666 	if (so_no_tinfo) {
667 		sti->sti_addr_size = 0;
668 		if ((cap_bits1 &= ~TC1_INFO) == 0)
669 			return (0);
670 	}
671 
672 	dprintso(so, 1, ("do_tcapability(%p)\n", (void *)so));
673 
674 	/* Send T_CAPABILITY_REQ */
675 	tcr.PRIM_type = T_CAPABILITY_REQ;
676 	tcr.CAP_bits1 = cap_bits1;
677 	mp = soallocproto1(&tcr, sizeof (tcr),
678 	    sizeof (struct T_capability_req) + sizeof (struct T_capability_ack),
679 	    _ALLOC_INTR, CRED());
680 	if (mp == NULL) {
681 		eprintsoline(so, ENOBUFS);
682 		return (ENOBUFS);
683 	}
684 	/* T_CAPABILITY_REQ should be M_PCPROTO here */
685 	DB_TYPE(mp) = M_PCPROTO;
686 
687 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
688 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
689 	if (error) {
690 		eprintsoline(so, error);
691 		return (error);
692 	}
693 	mutex_enter(&so->so_lock);
694 	/* Wait for T_CAPABILITY_ACK */
695 	if ((error = sowaitprim(so, T_CAPABILITY_REQ, T_CAPABILITY_ACK,
696 	    (t_uscalar_t)sizeof (*tca), &mp, sock_capability_timeout * hz))) {
697 		mutex_exit(&so->so_lock);
698 		PI_PROVLOCK(sti->sti_provinfo);
699 		if (sti->sti_provinfo->tpi_capability == PI_DONTKNOW)
700 			sti->sti_provinfo->tpi_capability = PI_NO;
701 		PI_PROVUNLOCK(sti->sti_provinfo);
702 		ASSERT((so->so_mode & SM_ACCEPTOR_ID) == 0);
703 		if (cap_bits1 & TC1_INFO) {
704 			/*
705 			 * If the T_CAPABILITY_REQ timed out and then a
706 			 * T_INFO_REQ gets a protocol error, most likely
707 			 * the capability was slow (vs. unsupported). Return
708 			 * ENOSR for this case as a best guess.
709 			 */
710 			if (error == ETIME) {
711 				return ((error = do_tinfo(so)) == EPROTO ?
712 				    ENOSR : error);
713 			}
714 			return (do_tinfo(so));
715 		}
716 		return (0);
717 	}
718 
719 	ASSERT(mp);
720 	tca = (struct T_capability_ack *)mp->b_rptr;
721 
722 	ASSERT((cap_bits1 & TC1_INFO) == (tca->CAP_bits1 & TC1_INFO));
723 	so_proc_tcapability_ack(so, tca);
724 
725 	cap_bits1 = tca->CAP_bits1;
726 
727 	mutex_exit(&so->so_lock);
728 	freemsg(mp);
729 
730 	if (cap_bits1 & TC1_INFO)
731 		return (check_tinfo(so));
732 
733 	return (0);
734 }
735 
736 /*
737  * Process a T_CAPABILITY_ACK
738  */
739 void
740 so_proc_tcapability_ack(struct sonode *so, struct T_capability_ack *tca)
741 {
742 	sotpi_info_t *sti = SOTOTPI(so);
743 
744 	if (sti->sti_provinfo->tpi_capability == PI_DONTKNOW) {
745 		PI_PROVLOCK(sti->sti_provinfo);
746 		sti->sti_provinfo->tpi_capability = PI_YES;
747 		PI_PROVUNLOCK(sti->sti_provinfo);
748 	}
749 
750 	if (tca->CAP_bits1 & TC1_ACCEPTOR_ID) {
751 		sti->sti_acceptor_id = tca->ACCEPTOR_id;
752 		so->so_mode |= SM_ACCEPTOR_ID;
753 	}
754 
755 	if (tca->CAP_bits1 & TC1_INFO)
756 		copy_tinfo(so, &tca->INFO_ack);
757 }
758 
759 /*
760  * Retrieve socket error, clear error if not peek.
761  */
762 int
763 sogeterr(struct sonode *so, boolean_t clear_err)
764 {
765 	int error;
766 
767 	ASSERT(MUTEX_HELD(&so->so_lock));
768 
769 	error = so->so_error;
770 	if (clear_err)
771 		so->so_error = 0;
772 
773 	return (error);
774 }
775 
776 /*
777  * This routine is registered with the stream head to retrieve read
778  * side errors.
779  * It does not clear the socket error for a peeking read side operation.
780  * It the error is to be cleared it sets *clearerr.
781  */
782 int
783 sogetrderr(vnode_t *vp, int ispeek, int *clearerr)
784 {
785 	struct sonode *so = VTOSO(vp);
786 	int error;
787 
788 	mutex_enter(&so->so_lock);
789 	if (ispeek) {
790 		error = so->so_error;
791 		*clearerr = 0;
792 	} else {
793 		error = so->so_error;
794 		so->so_error = 0;
795 		*clearerr = 1;
796 	}
797 	mutex_exit(&so->so_lock);
798 	return (error);
799 }
800 
801 /*
802  * This routine is registered with the stream head to retrieve write
803  * side errors.
804  * It does not clear the socket error for a peeking read side operation.
805  * It the error is to be cleared it sets *clearerr.
806  */
807 int
808 sogetwrerr(vnode_t *vp, int ispeek, int *clearerr)
809 {
810 	struct sonode *so = VTOSO(vp);
811 	int error;
812 
813 	mutex_enter(&so->so_lock);
814 	if (so->so_state & SS_CANTSENDMORE) {
815 		error = EPIPE;
816 		*clearerr = 0;
817 	} else {
818 		error = so->so_error;
819 		if (ispeek) {
820 			*clearerr = 0;
821 		} else {
822 			so->so_error = 0;
823 			*clearerr = 1;
824 		}
825 	}
826 	mutex_exit(&so->so_lock);
827 	return (error);
828 }
829 
830 /*
831  * Set a nonpersistent read and write error on the socket.
832  * Used when there is a T_uderror_ind for a connected socket.
833  * The caller also needs to call strsetrerror and strsetwerror
834  * after dropping the lock.
835  */
836 void
837 soseterror(struct sonode *so, int error)
838 {
839 	ASSERT(error != 0);
840 
841 	ASSERT(MUTEX_HELD(&so->so_lock));
842 	so->so_error = (ushort_t)error;
843 }
844 
845 void
846 soisconnecting(struct sonode *so)
847 {
848 	ASSERT(MUTEX_HELD(&so->so_lock));
849 	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
850 	so->so_state |= SS_ISCONNECTING;
851 	cv_broadcast(&so->so_state_cv);
852 }
853 
854 void
855 soisconnected(struct sonode *so)
856 {
857 	ASSERT(MUTEX_HELD(&so->so_lock));
858 	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
859 	so->so_state |= SS_ISCONNECTED;
860 	cv_broadcast(&so->so_state_cv);
861 }
862 
863 /*
864  * The caller also needs to call strsetrerror, strsetwerror and strseteof.
865  */
866 void
867 soisdisconnected(struct sonode *so, int error)
868 {
869 	ASSERT(MUTEX_HELD(&so->so_lock));
870 	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
871 	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE);
872 	so->so_error = (ushort_t)error;
873 	if (so->so_peercred != NULL) {
874 		crfree(so->so_peercred);
875 		so->so_peercred = NULL;
876 	}
877 	cv_broadcast(&so->so_state_cv);
878 }
879 
880 /*
881  * For connected AF_UNIX SOCK_DGRAM sockets when the peer closes.
882  * Does not affect write side.
883  * The caller also has to call strsetrerror.
884  */
885 static void
886 sobreakconn(struct sonode *so, int error)
887 {
888 	ASSERT(MUTEX_HELD(&so->so_lock));
889 	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
890 	so->so_error = (ushort_t)error;
891 	cv_broadcast(&so->so_state_cv);
892 }
893 
894 /*
895  * Can no longer send.
896  * Caller must also call strsetwerror.
897  *
898  * We mark the peer address as no longer valid for getpeername, but
899  * leave it around for so_unix_close to notify the peer (that
900  * transport has no addressing held at that layer).
901  */
902 void
903 socantsendmore(struct sonode *so)
904 {
905 	ASSERT(MUTEX_HELD(&so->so_lock));
906 	so->so_state |= SS_CANTSENDMORE;
907 	cv_broadcast(&so->so_state_cv);
908 }
909 
910 /*
911  * The caller must call strseteof(,1) as well as this routine
912  * to change the socket state.
913  */
914 void
915 socantrcvmore(struct sonode *so)
916 {
917 	ASSERT(MUTEX_HELD(&so->so_lock));
918 	so->so_state |= SS_CANTRCVMORE;
919 	cv_broadcast(&so->so_state_cv);
920 }
921 
922 /*
923  * The caller has sent down a "request_prim" primitive and wants to wait for
924  * an ack ("ack_prim") or an T_ERROR_ACK for it.
925  * The specified "ack_prim" can be a T_OK_ACK.
926  *
927  * Assumes that all the TPI acks are M_PCPROTO messages.
928  *
929  * Note that the socket is single-threaded (using so_lock_single)
930  * for all operations that generate TPI ack messages. Since
931  * only TPI ack messages are M_PCPROTO we should never receive
932  * anything except either the ack we are expecting or a T_ERROR_ACK
933  * for the same primitive.
934  */
935 int
936 sowaitprim(struct sonode *so, t_scalar_t request_prim, t_scalar_t ack_prim,
937 	    t_uscalar_t min_size, mblk_t **mpp, clock_t wait)
938 {
939 	mblk_t *mp;
940 	union T_primitives *tpr;
941 	int error;
942 
943 	dprintso(so, 1, ("sowaitprim(%p, %d, %d, %d, %p, %lu)\n",
944 	    (void *)so, request_prim, ack_prim, min_size, (void *)mpp, wait));
945 
946 	ASSERT(MUTEX_HELD(&so->so_lock));
947 
948 	error = sowaitack(so, &mp, wait);
949 	if (error)
950 		return (error);
951 
952 	dprintso(so, 1, ("got msg %p\n", (void *)mp));
953 	if (DB_TYPE(mp) != M_PCPROTO ||
954 	    MBLKL(mp) < sizeof (tpr->type)) {
955 		freemsg(mp);
956 		eprintsoline(so, EPROTO);
957 		return (EPROTO);
958 	}
959 	tpr = (union T_primitives *)mp->b_rptr;
960 	/*
961 	 * Did we get the primitive that we were asking for?
962 	 * For T_OK_ACK we also check that it matches the request primitive.
963 	 */
964 	if (tpr->type == ack_prim &&
965 	    (ack_prim != T_OK_ACK ||
966 	    tpr->ok_ack.CORRECT_prim == request_prim)) {
967 		if (MBLKL(mp) >= (ssize_t)min_size) {
968 			/* Found what we are looking for */
969 			*mpp = mp;
970 			return (0);
971 		}
972 		/* Too short */
973 		freemsg(mp);
974 		eprintsoline(so, EPROTO);
975 		return (EPROTO);
976 	}
977 
978 	if (tpr->type == T_ERROR_ACK &&
979 	    tpr->error_ack.ERROR_prim == request_prim) {
980 		/* Error to the primitive we were looking for */
981 		if (tpr->error_ack.TLI_error == TSYSERR) {
982 			error = tpr->error_ack.UNIX_error;
983 		} else {
984 			error = proto_tlitosyserr(tpr->error_ack.TLI_error);
985 		}
986 		dprintso(so, 0, ("error_ack for %d: %d/%d ->%d\n",
987 		    tpr->error_ack.ERROR_prim, tpr->error_ack.TLI_error,
988 		    tpr->error_ack.UNIX_error, error));
989 		freemsg(mp);
990 		return (error);
991 	}
992 	/*
993 	 * Wrong primitive or T_ERROR_ACK for the wrong primitive
994 	 */
995 #ifdef DEBUG
996 	if (tpr->type == T_ERROR_ACK) {
997 		dprintso(so, 0, ("error_ack for %d: %d/%d\n",
998 		    tpr->error_ack.ERROR_prim, tpr->error_ack.TLI_error,
999 		    tpr->error_ack.UNIX_error));
1000 	} else if (tpr->type == T_OK_ACK) {
1001 		dprintso(so, 0, ("ok_ack for %d, expected %d for %d\n",
1002 		    tpr->ok_ack.CORRECT_prim, ack_prim, request_prim));
1003 	} else {
1004 		dprintso(so, 0,
1005 		    ("unexpected primitive %d, expected %d for %d\n",
1006 		    tpr->type, ack_prim, request_prim));
1007 	}
1008 #endif /* DEBUG */
1009 
1010 	freemsg(mp);
1011 	eprintsoline(so, EPROTO);
1012 	return (EPROTO);
1013 }
1014 
1015 /*
1016  * Wait for a T_OK_ACK for the specified primitive.
1017  */
1018 int
1019 sowaitokack(struct sonode *so, t_scalar_t request_prim)
1020 {
1021 	mblk_t *mp;
1022 	int error;
1023 
1024 	error = sowaitprim(so, request_prim, T_OK_ACK,
1025 	    (t_uscalar_t)sizeof (struct T_ok_ack), &mp, 0);
1026 	if (error)
1027 		return (error);
1028 	freemsg(mp);
1029 	return (0);
1030 }
1031 
1032 /*
1033  * Queue a received TPI ack message on sti_ack_mp.
1034  */
1035 void
1036 soqueueack(struct sonode *so, mblk_t *mp)
1037 {
1038 	sotpi_info_t *sti = SOTOTPI(so);
1039 
1040 	if (DB_TYPE(mp) != M_PCPROTO) {
1041 		zcmn_err(getzoneid(), CE_WARN,
1042 		    "sockfs: received unexpected M_PROTO TPI ack. Prim %d\n",
1043 		    *(t_scalar_t *)mp->b_rptr);
1044 		freemsg(mp);
1045 		return;
1046 	}
1047 
1048 	mutex_enter(&so->so_lock);
1049 	if (sti->sti_ack_mp != NULL) {
1050 		dprintso(so, 1, ("sti_ack_mp already set\n"));
1051 		freemsg(sti->sti_ack_mp);
1052 		sti->sti_ack_mp = NULL;
1053 	}
1054 	sti->sti_ack_mp = mp;
1055 	cv_broadcast(&sti->sti_ack_cv);
1056 	mutex_exit(&so->so_lock);
1057 }
1058 
1059 /*
1060  * Wait for a TPI ack ignoring signals and errors.
1061  */
1062 int
1063 sowaitack(struct sonode *so, mblk_t **mpp, clock_t wait)
1064 {
1065 	sotpi_info_t *sti = SOTOTPI(so);
1066 
1067 	ASSERT(MUTEX_HELD(&so->so_lock));
1068 
1069 	while (sti->sti_ack_mp == NULL) {
1070 #ifdef SOCK_TEST
1071 		if (wait == 0 && sock_test_timelimit != 0)
1072 			wait = sock_test_timelimit;
1073 #endif
1074 		if (wait != 0) {
1075 			/*
1076 			 * Only wait for the time limit.
1077 			 */
1078 			if (cv_reltimedwait(&sti->sti_ack_cv, &so->so_lock,
1079 			    wait, TR_CLOCK_TICK) == -1) {
1080 				eprintsoline(so, ETIME);
1081 				return (ETIME);
1082 			}
1083 		}
1084 		else
1085 			cv_wait(&sti->sti_ack_cv, &so->so_lock);
1086 	}
1087 	*mpp = sti->sti_ack_mp;
1088 #ifdef DEBUG
1089 	{
1090 		union T_primitives *tpr;
1091 		mblk_t *mp = *mpp;
1092 
1093 		tpr = (union T_primitives *)mp->b_rptr;
1094 		ASSERT(DB_TYPE(mp) == M_PCPROTO);
1095 		ASSERT(tpr->type == T_OK_ACK ||
1096 		    tpr->type == T_ERROR_ACK ||
1097 		    tpr->type == T_BIND_ACK ||
1098 		    tpr->type == T_CAPABILITY_ACK ||
1099 		    tpr->type == T_INFO_ACK ||
1100 		    tpr->type == T_OPTMGMT_ACK);
1101 	}
1102 #endif /* DEBUG */
1103 	sti->sti_ack_mp = NULL;
1104 	return (0);
1105 }
1106 
1107 /*
1108  * Queue a received T_CONN_IND message on sti_conn_ind_head/tail.
1109  */
1110 void
1111 soqueueconnind(struct sonode *so, mblk_t *mp)
1112 {
1113 	sotpi_info_t *sti = SOTOTPI(so);
1114 
1115 	if (DB_TYPE(mp) != M_PROTO) {
1116 		zcmn_err(getzoneid(), CE_WARN,
1117 		    "sockfs: received unexpected M_PCPROTO T_CONN_IND\n");
1118 		freemsg(mp);
1119 		return;
1120 	}
1121 
1122 	mutex_enter(&so->so_lock);
1123 	ASSERT(mp->b_next == NULL);
1124 	if (sti->sti_conn_ind_head == NULL) {
1125 		sti->sti_conn_ind_head = mp;
1126 	} else {
1127 		ASSERT(sti->sti_conn_ind_tail->b_next == NULL);
1128 		sti->sti_conn_ind_tail->b_next = mp;
1129 	}
1130 	sti->sti_conn_ind_tail = mp;
1131 	/* Wakeup a single consumer of the T_CONN_IND */
1132 	cv_signal(&so->so_acceptq_cv);
1133 	mutex_exit(&so->so_lock);
1134 }
1135 
1136 /*
1137  * Wait for a T_CONN_IND.
1138  * Don't wait if nonblocking.
1139  * Accept signals and socket errors.
1140  */
1141 int
1142 sowaitconnind(struct sonode *so, int fmode, mblk_t **mpp)
1143 {
1144 	mblk_t *mp;
1145 	sotpi_info_t *sti = SOTOTPI(so);
1146 	int error = 0;
1147 
1148 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1149 	mutex_enter(&so->so_lock);
1150 check_error:
1151 	if (so->so_error) {
1152 		error = sogeterr(so, B_TRUE);
1153 		if (error) {
1154 			mutex_exit(&so->so_lock);
1155 			return (error);
1156 		}
1157 	}
1158 
1159 	if (sti->sti_conn_ind_head == NULL) {
1160 		if (fmode & (FNDELAY|FNONBLOCK)) {
1161 			error = EWOULDBLOCK;
1162 			goto done;
1163 		}
1164 
1165 		if (so->so_state & SS_CLOSING) {
1166 			error = EINTR;
1167 			goto done;
1168 		}
1169 
1170 		if (!cv_wait_sig_swap(&so->so_acceptq_cv, &so->so_lock)) {
1171 			error = EINTR;
1172 			goto done;
1173 		}
1174 		goto check_error;
1175 	}
1176 	mp = sti->sti_conn_ind_head;
1177 	sti->sti_conn_ind_head = mp->b_next;
1178 	mp->b_next = NULL;
1179 	if (sti->sti_conn_ind_head == NULL) {
1180 		ASSERT(sti->sti_conn_ind_tail == mp);
1181 		sti->sti_conn_ind_tail = NULL;
1182 	}
1183 	*mpp = mp;
1184 done:
1185 	mutex_exit(&so->so_lock);
1186 	return (error);
1187 }
1188 
1189 /*
1190  * Flush a T_CONN_IND matching the sequence number from the list.
1191  * Return zero if found; non-zero otherwise.
1192  * This is called very infrequently thus it is ok to do a linear search.
1193  */
1194 int
1195 soflushconnind(struct sonode *so, t_scalar_t seqno)
1196 {
1197 	mblk_t *prevmp, *mp;
1198 	struct T_conn_ind *tci;
1199 	sotpi_info_t *sti = SOTOTPI(so);
1200 
1201 	mutex_enter(&so->so_lock);
1202 	for (prevmp = NULL, mp = sti->sti_conn_ind_head; mp != NULL;
1203 	    prevmp = mp, mp = mp->b_next) {
1204 		tci = (struct T_conn_ind *)mp->b_rptr;
1205 		if (tci->SEQ_number == seqno) {
1206 			dprintso(so, 1,
1207 			    ("t_discon_ind: found T_CONN_IND %d\n", seqno));
1208 			/* Deleting last? */
1209 			if (sti->sti_conn_ind_tail == mp) {
1210 				sti->sti_conn_ind_tail = prevmp;
1211 			}
1212 			if (prevmp == NULL) {
1213 				/* Deleting first */
1214 				sti->sti_conn_ind_head = mp->b_next;
1215 			} else {
1216 				prevmp->b_next = mp->b_next;
1217 			}
1218 			mp->b_next = NULL;
1219 
1220 			ASSERT((sti->sti_conn_ind_head == NULL &&
1221 			    sti->sti_conn_ind_tail == NULL) ||
1222 			    (sti->sti_conn_ind_head != NULL &&
1223 			    sti->sti_conn_ind_tail != NULL));
1224 
1225 			so->so_error = ECONNABORTED;
1226 			mutex_exit(&so->so_lock);
1227 
1228 			freemsg(mp);
1229 			return (0);
1230 		}
1231 	}
1232 	mutex_exit(&so->so_lock);
1233 	dprintso(so, 1,	("t_discon_ind: NOT found T_CONN_IND %d\n", seqno));
1234 	return (-1);
1235 }
1236 
/*
 * Sleep until a connect in progress either completes or fails.
 *
 * The caller holds so_lock. fmode carries the nonblocking flags: with
 * FNDELAY or FNONBLOCK set the call returns EINPROGRESS instead of
 * sleeping. A non-zero nosig makes the sleep uninterruptible; otherwise
 * a signal ends the wait with EINTR. EINTR is also returned when the
 * socket is in SS_CLOSING.
 *
 * Returns zero once SS_ISCONNECTED is set, the pending socket error if
 * there is one, or ECONNREFUSED if the socket ended up neither connected
 * nor in error.
 */
int
sowaitconnected(struct sonode *so, int fmode, int nosig)
{
	int error;

	ASSERT(MUTEX_HELD(&so->so_lock));

	for (;;) {
		/* Done waiting unless still connecting and error free. */
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) !=
		    SS_ISCONNECTING || so->so_error != 0)
			break;

		dprintso(so, 1, ("waiting for SS_ISCONNECTED on %p\n",
		    (void *)so));
		if (fmode & (FNDELAY|FNONBLOCK))
			return (EINPROGRESS);

		if (so->so_state & SS_CLOSING)
			return (EINTR);

		if (nosig) {
			cv_wait(&so->so_state_cv, &so->so_lock);
		} else if (!cv_wait_sig_swap(&so->so_state_cv,
		    &so->so_lock)) {
			/*
			 * Interrupted. The application is expected to use
			 * nonblocking techniques to find out when the
			 * connection has been established.
			 */
			return (EINTR);
		}
		dprintso(so, 1, ("awoken on %p\n", (void *)so));
	}

	if (so->so_error != 0) {
		error = sogeterr(so, B_TRUE);
		ASSERT(error != 0);
		dprintso(so, 1, ("sowaitconnected: error %d\n", error));
		return (error);
	}

	if (so->so_state & SS_ISCONNECTED)
		return (0);

	/*
	 * Not connected and no error to report: a T_ORDREL_IND or a
	 * T_DISCON_IND with zero errno may have arrived, or another thread
	 * (e.g. one calling read) may have consumed so_error.
	 */
	error = ECONNREFUSED;
	dprintso(so, 1, ("sowaitconnected: error %d\n", error));
	return (error);
}
1291 
1292 
1293 /*
1294  * Handle the signal generation aspect of urgent data.
1295  */
1296 static void
1297 so_oob_sig(struct sonode *so, int extrasig,
1298     strsigset_t *signals, strpollset_t *pollwakeups)
1299 {
1300 	sotpi_info_t *sti = SOTOTPI(so);
1301 
1302 	ASSERT(MUTEX_HELD(&so->so_lock));
1303 
1304 	ASSERT(so_verify_oobstate(so));
1305 	ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
1306 	if (sti->sti_oobsigcnt > sti->sti_oobcnt) {
1307 		/*
1308 		 * Signal has already been generated once for this
1309 		 * urgent "event". However, since TCP can receive updated
1310 		 * urgent pointers we still generate a signal.
1311 		 */
1312 		ASSERT(so->so_state & SS_OOBPEND);
1313 		if (extrasig) {
1314 			*signals |= S_RDBAND;
1315 			*pollwakeups |= POLLRDBAND;
1316 		}
1317 		return;
1318 	}
1319 
1320 	sti->sti_oobsigcnt++;
1321 	ASSERT(sti->sti_oobsigcnt > 0);	/* Wraparound */
1322 	ASSERT(sti->sti_oobsigcnt > sti->sti_oobcnt);
1323 
1324 	/*
1325 	 * Record (for select/poll) that urgent data is pending.
1326 	 */
1327 	so->so_state |= SS_OOBPEND;
1328 	/*
1329 	 * New urgent data on the way so forget about any old
1330 	 * urgent data.
1331 	 */
1332 	so->so_state &= ~(SS_HAVEOOBDATA|SS_HADOOBDATA);
1333 	if (so->so_oobmsg != NULL) {
1334 		dprintso(so, 1, ("sock: discarding old oob\n"));
1335 		freemsg(so->so_oobmsg);
1336 		so->so_oobmsg = NULL;
1337 	}
1338 	*signals |= S_RDBAND;
1339 	*pollwakeups |= POLLRDBAND;
1340 	ASSERT(so_verify_oobstate(so));
1341 }
1342 
1343 /*
1344  * Handle the processing of the T_EXDATA_IND with urgent data.
1345  * Returns the T_EXDATA_IND if it should be queued on the read queue.
1346  */
1347 /* ARGSUSED2 */
1348 static mblk_t *
1349 so_oob_exdata(struct sonode *so, mblk_t *mp,
1350 	strsigset_t *signals, strpollset_t *pollwakeups)
1351 {
1352 	sotpi_info_t *sti = SOTOTPI(so);
1353 
1354 	ASSERT(MUTEX_HELD(&so->so_lock));
1355 
1356 	ASSERT(so_verify_oobstate(so));
1357 
1358 	ASSERT(sti->sti_oobsigcnt > sti->sti_oobcnt);
1359 
1360 	sti->sti_oobcnt++;
1361 	ASSERT(sti->sti_oobcnt > 0);	/* wraparound? */
1362 	ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
1363 
1364 	/*
1365 	 * Set MSGMARK for SIOCATMARK.
1366 	 */
1367 	mp->b_flag |= MSGMARK;
1368 
1369 	ASSERT(so_verify_oobstate(so));
1370 	return (mp);
1371 }
1372 
1373 /*
1374  * Handle the processing of the actual urgent data.
1375  * Returns the data mblk if it should be queued on the read queue.
1376  */
1377 static mblk_t *
1378 so_oob_data(struct sonode *so, mblk_t *mp,
1379 	strsigset_t *signals, strpollset_t *pollwakeups)
1380 {
1381 	sotpi_info_t *sti = SOTOTPI(so);
1382 
1383 	ASSERT(MUTEX_HELD(&so->so_lock));
1384 
1385 	ASSERT(so_verify_oobstate(so));
1386 
1387 	ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
1388 	ASSERT(mp != NULL);
1389 	/*
1390 	 * For OOBINLINE we keep the data in the T_EXDATA_IND.
1391 	 * Otherwise we store it in so_oobmsg.
1392 	 */
1393 	ASSERT(so->so_oobmsg == NULL);
1394 	if (so->so_options & SO_OOBINLINE) {
1395 		*pollwakeups |= POLLIN | POLLRDNORM | POLLRDBAND;
1396 		*signals |= S_INPUT | S_RDNORM;
1397 	} else {
1398 		*pollwakeups |= POLLRDBAND;
1399 		so->so_state |= SS_HAVEOOBDATA;
1400 		so->so_oobmsg = mp;
1401 		mp = NULL;
1402 	}
1403 	ASSERT(so_verify_oobstate(so));
1404 	return (mp);
1405 }
1406 
1407 /*
1408  * Caller must hold the mutex.
1409  * For delayed processing, save the T_DISCON_IND received
1410  * from below on sti_discon_ind_mp.
1411  * When the message is processed the framework will call:
1412  *      (*func)(so, mp);
1413  */
1414 static void
1415 so_save_discon_ind(struct sonode *so,
1416 	mblk_t *mp,
1417 	void (*func)(struct sonode *so, mblk_t *))
1418 {
1419 	sotpi_info_t *sti = SOTOTPI(so);
1420 
1421 	ASSERT(MUTEX_HELD(&so->so_lock));
1422 
1423 	/*
1424 	 * Discard new T_DISCON_IND if we have already received another.
1425 	 * Currently the earlier message can either be on sti_discon_ind_mp
1426 	 * or being processed.
1427 	 */
1428 	if (sti->sti_discon_ind_mp != NULL || (so->so_flag & SOASYNC_UNBIND)) {
1429 		zcmn_err(getzoneid(), CE_WARN,
1430 		    "sockfs: received unexpected additional T_DISCON_IND\n");
1431 		freemsg(mp);
1432 		return;
1433 	}
1434 	mp->b_prev = (mblk_t *)func;
1435 	mp->b_next = NULL;
1436 	sti->sti_discon_ind_mp = mp;
1437 }
1438 
1439 /*
1440  * Caller must hold the mutex and make sure that either SOLOCKED
1441  * or SOASYNC_UNBIND is set. Called from so_unlock_single().
1442  * Perform delayed processing of T_DISCON_IND message on sti_discon_ind_mp.
1443  * Need to ensure that strsock_proto() will not end up sleeping for
1444  * SOASYNC_UNBIND, while executing this function.
1445  */
1446 void
1447 so_drain_discon_ind(struct sonode *so)
1448 {
1449 	mblk_t	*bp;
1450 	void (*func)(struct sonode *so, mblk_t *);
1451 	sotpi_info_t *sti = SOTOTPI(so);
1452 
1453 	ASSERT(MUTEX_HELD(&so->so_lock));
1454 	ASSERT(so->so_flag & (SOLOCKED|SOASYNC_UNBIND));
1455 
1456 	/* Process T_DISCON_IND on sti_discon_ind_mp */
1457 	if ((bp = sti->sti_discon_ind_mp) != NULL) {
1458 		sti->sti_discon_ind_mp = NULL;
1459 		func = (void (*)())bp->b_prev;
1460 		bp->b_prev = NULL;
1461 
1462 		/*
1463 		 * This (*func) is supposed to generate a message downstream
1464 		 * and we need to have a flag set until the corresponding
1465 		 * upstream message reaches stream head.
1466 		 * When processing T_DISCON_IND in strsock_discon_ind
1467 		 * we hold SOASYN_UNBIND when sending T_UNBIND_REQ down and
1468 		 * drop the flag after we get the ACK in strsock_proto.
1469 		 */
1470 		(void) (*func)(so, bp);
1471 	}
1472 }
1473 
1474 /*
1475  * Caller must hold the mutex.
1476  * Remove the T_DISCON_IND on sti_discon_ind_mp.
1477  */
1478 void
1479 so_flush_discon_ind(struct sonode *so)
1480 {
1481 	mblk_t	*bp;
1482 	sotpi_info_t *sti = SOTOTPI(so);
1483 
1484 	ASSERT(MUTEX_HELD(&so->so_lock));
1485 
1486 	/*
1487 	 * Remove T_DISCON_IND mblk at sti_discon_ind_mp.
1488 	 */
1489 	if ((bp = sti->sti_discon_ind_mp) != NULL) {
1490 		sti->sti_discon_ind_mp = NULL;
1491 		bp->b_prev = NULL;
1492 		freemsg(bp);
1493 	}
1494 }
1495 
1496 /*
1497  * Caller must hold the mutex.
1498  *
1499  * This function is used to process the T_DISCON_IND message. It does
1500  * immediate processing when called from strsock_proto and delayed
1501  * processing of discon_ind saved on sti_discon_ind_mp when called from
1502  * so_drain_discon_ind. When a T_DISCON_IND message is saved in
1503  * sti_discon_ind_mp for delayed processing, this function is registered
1504  * as the callback function to process the message.
1505  *
1506  * SOASYNC_UNBIND should be held in this function, during the non-blocking
1507  * unbind operation, and should be released only after we receive the ACK
1508  * in strsock_proto, for the T_UNBIND_REQ sent here. Since SOLOCKED is not set,
1509  * no TPI messages would be sent down at this time. This is to prevent M_FLUSH
1510  * sent from either this function or tcp_unbind(), flushing away any TPI
1511  * message that is being sent down and stays in a lower module's queue.
1512  *
1513  * This function drops so_lock and grabs it again.
1514  */
1515 static void
1516 strsock_discon_ind(struct sonode *so, mblk_t *discon_mp)
1517 {
1518 	struct vnode *vp;
1519 	struct stdata *stp;
1520 	union T_primitives *tpr;
1521 	struct T_unbind_req *ubr;
1522 	mblk_t *mp;
1523 	int error;
1524 	sotpi_info_t *sti = SOTOTPI(so);
1525 
1526 	ASSERT(MUTEX_HELD(&so->so_lock));
1527 	ASSERT(discon_mp);
1528 	ASSERT(discon_mp->b_rptr);
1529 
1530 	tpr = (union T_primitives *)discon_mp->b_rptr;
1531 	ASSERT(tpr->type == T_DISCON_IND);
1532 
1533 	vp = SOTOV(so);
1534 	stp = vp->v_stream;
1535 	ASSERT(stp);
1536 
1537 	/*
1538 	 * Not a listener
1539 	 */
1540 	ASSERT((so->so_state & SS_ACCEPTCONN) == 0);
1541 
1542 	/*
1543 	 * This assumes that the name space for DISCON_reason
1544 	 * is the errno name space.
1545 	 */
1546 	soisdisconnected(so, tpr->discon_ind.DISCON_reason);
1547 	sti->sti_laddr_valid = 0;
1548 	sti->sti_faddr_valid = 0;
1549 
1550 	/*
1551 	 * Unbind with the transport without blocking.
1552 	 * If we've already received a T_DISCON_IND do not unbind.
1553 	 *
1554 	 * If there is no preallocated unbind message, we have already
1555 	 * unbound with the transport
1556 	 *
1557 	 * If the socket is not bound, no need to unbind.
1558 	 */
1559 	mp = sti->sti_unbind_mp;
1560 	if (mp == NULL) {
1561 		ASSERT(!(so->so_state & SS_ISBOUND));
1562 		mutex_exit(&so->so_lock);
1563 	} else if (!(so->so_state & SS_ISBOUND))  {
1564 		mutex_exit(&so->so_lock);
1565 	} else {
1566 		sti->sti_unbind_mp = NULL;
1567 
1568 		/*
1569 		 * Is another T_DISCON_IND being processed.
1570 		 */
1571 		ASSERT((so->so_flag & SOASYNC_UNBIND) == 0);
1572 
1573 		/*
1574 		 * Make strsock_proto ignore T_OK_ACK and T_ERROR_ACK for
1575 		 * this unbind. Set SOASYNC_UNBIND. This should be cleared
1576 		 * only after we receive the ACK in strsock_proto.
1577 		 */
1578 		so->so_flag |= SOASYNC_UNBIND;
1579 		ASSERT(!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)));
1580 		so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1581 		sti->sti_laddr_valid = 0;
1582 		mutex_exit(&so->so_lock);
1583 
1584 		/*
1585 		 * Send down T_UNBIND_REQ ignoring flow control.
1586 		 * XXX Assumes that MSG_IGNFLOW implies that this thread
1587 		 * does not run service procedures.
1588 		 */
1589 		ASSERT(DB_TYPE(mp) == M_PROTO);
1590 		ubr = (struct T_unbind_req *)mp->b_rptr;
1591 		mp->b_wptr += sizeof (*ubr);
1592 		ubr->PRIM_type = T_UNBIND_REQ;
1593 
1594 		/*
1595 		 * Flush the read and write side (except stream head read queue)
1596 		 * and send down T_UNBIND_REQ.
1597 		 */
1598 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1599 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1600 		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
1601 		/* LINTED - warning: statement has no consequent: if */
1602 		if (error) {
1603 			eprintsoline(so, error);
1604 		}
1605 	}
1606 
1607 	if (tpr->discon_ind.DISCON_reason != 0)
1608 		strsetrerror(SOTOV(so), 0, 0, sogetrderr);
1609 	strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
1610 	strseteof(SOTOV(so), 1);
1611 	/*
1612 	 * strseteof takes care of read side wakeups,
1613 	 * pollwakeups, and signals.
1614 	 */
1615 	dprintso(so, 1, ("T_DISCON_IND: error %d\n", so->so_error));
1616 	freemsg(discon_mp);
1617 
1618 
1619 	pollwakeup(&stp->sd_pollist, POLLOUT);
1620 	mutex_enter(&stp->sd_lock);
1621 
1622 	/*
1623 	 * Wake sleeping write
1624 	 */
1625 	if (stp->sd_flag & WSLEEP) {
1626 		stp->sd_flag &= ~WSLEEP;
1627 		cv_broadcast(&stp->sd_wrq->q_wait);
1628 	}
1629 
1630 	/*
1631 	 * strsendsig can handle multiple signals with a
1632 	 * single call.  Send SIGPOLL for S_OUTPUT event.
1633 	 */
1634 	if (stp->sd_sigflags & S_OUTPUT)
1635 		strsendsig(stp->sd_siglist, S_OUTPUT, 0, 0);
1636 
1637 	mutex_exit(&stp->sd_lock);
1638 	mutex_enter(&so->so_lock);
1639 }
1640 
1641 /*
1642  * This routine is registered with the stream head to receive M_PROTO
1643  * and M_PCPROTO messages.
1644  *
1645  * Returns NULL if the message was consumed.
1646  * Returns an mblk to make that mblk be processed (and queued) by the stream
1647  * head.
1648  *
1649  * Sets the return parameters (*wakeups, *firstmsgsigs, *allmsgsigs, and
1650  * *pollwakeups) for the stream head to take action on. Note that since
1651  * sockets always deliver SIGIO for every new piece of data this routine
1652  * never sets *firstmsgsigs; any signals are returned in *allmsgsigs.
1653  *
1654  * This routine handles all data related TPI messages independent of
1655  * the type of the socket i.e. it doesn't care if T_UNITDATA_IND messages
1656  * arrive on a SOCK_STREAM.
1657  */
1658 static mblk_t *
1659 strsock_proto(vnode_t *vp, mblk_t *mp,
1660 		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
1661 		strsigset_t *allmsgsigs, strpollset_t *pollwakeups)
1662 {
1663 	union T_primitives *tpr;
1664 	struct sonode *so;
1665 	sotpi_info_t *sti;
1666 	uint32_t auditing = AU_AUDITING();
1667 
1668 	so = VTOSO(vp);
1669 	sti = SOTOTPI(so);
1670 
1671 	dprintso(so, 1, ("strsock_proto(%p, %p)\n", (void *)vp, (void *)mp));
1672 
1673 	/* Set default return values */
1674 	*firstmsgsigs = *wakeups = *allmsgsigs = *pollwakeups = 0;
1675 
1676 	ASSERT(DB_TYPE(mp) == M_PROTO ||
1677 	    DB_TYPE(mp) == M_PCPROTO);
1678 
1679 	if (MBLKL(mp) < sizeof (tpr->type)) {
1680 		/* The message is too short to even contain the primitive */
1681 		zcmn_err(getzoneid(), CE_WARN,
1682 		    "sockfs: Too short TPI message received. Len = %ld\n",
1683 		    (ptrdiff_t)(MBLKL(mp)));
1684 		freemsg(mp);
1685 		return (NULL);
1686 	}
1687 	if (!__TPI_PRIM_ISALIGNED(mp->b_rptr)) {
1688 		/* The read pointer is not aligned correctly for TPI */
1689 		zcmn_err(getzoneid(), CE_WARN,
1690 		    "sockfs: Unaligned TPI message received. rptr = %p\n",
1691 		    (void *)mp->b_rptr);
1692 		freemsg(mp);
1693 		return (NULL);
1694 	}
1695 	tpr = (union T_primitives *)mp->b_rptr;
1696 	dprintso(so, 1, ("strsock_proto: primitive %d\n", tpr->type));
1697 
1698 	switch (tpr->type) {
1699 
1700 	case T_DATA_IND:
1701 		if (MBLKL(mp) < sizeof (struct T_data_ind)) {
1702 			zcmn_err(getzoneid(), CE_WARN,
1703 			    "sockfs: Too short T_DATA_IND. Len = %ld\n",
1704 			    (ptrdiff_t)(MBLKL(mp)));
1705 			freemsg(mp);
1706 			return (NULL);
1707 		}
1708 		/*
1709 		 * Ignore zero-length T_DATA_IND messages. These might be
1710 		 * generated by some transports.
1711 		 * This is needed to prevent read (which skips the M_PROTO
1712 		 * part) to unexpectedly return 0 (or return EWOULDBLOCK
1713 		 * on a non-blocking socket after select/poll has indicated
1714 		 * that data is available).
1715 		 */
1716 		if (msgdsize(mp->b_cont) == 0) {
1717 			dprintso(so, 0,
1718 			    ("strsock_proto: zero length T_DATA_IND\n"));
1719 			freemsg(mp);
1720 			return (NULL);
1721 		}
1722 		*allmsgsigs = S_INPUT | S_RDNORM;
1723 		*pollwakeups = POLLIN | POLLRDNORM;
1724 		*wakeups = RSLEEP;
1725 		return (mp);
1726 
1727 	case T_UNITDATA_IND: {
1728 		struct T_unitdata_ind	*tudi = &tpr->unitdata_ind;
1729 		void			*addr;
1730 		t_uscalar_t		addrlen;
1731 
1732 		if (MBLKL(mp) < sizeof (struct T_unitdata_ind)) {
1733 			zcmn_err(getzoneid(), CE_WARN,
1734 			    "sockfs: Too short T_UNITDATA_IND. Len = %ld\n",
1735 			    (ptrdiff_t)(MBLKL(mp)));
1736 			freemsg(mp);
1737 			return (NULL);
1738 		}
1739 
1740 		/* Is this not a connected datagram socket? */
1741 		if ((so->so_mode & SM_CONNREQUIRED) ||
1742 		    !(so->so_state & SS_ISCONNECTED)) {
1743 			/*
1744 			 * Not a connected datagram socket. Look for
1745 			 * the SO_UNIX_CLOSE option. If such an option is found
1746 			 * discard the message (since it has no meaning
1747 			 * unless connected).
1748 			 */
1749 			if (so->so_family == AF_UNIX && msgdsize(mp) == 0 &&
1750 			    tudi->OPT_length != 0) {
1751 				void *opt;
1752 				t_uscalar_t optlen = tudi->OPT_length;
1753 
1754 				opt = sogetoff(mp, tudi->OPT_offset,
1755 				    optlen, __TPI_ALIGN_SIZE);
1756 				if (opt == NULL) {
1757 					/* The len/off falls outside mp */
1758 					freemsg(mp);
1759 					mutex_enter(&so->so_lock);
1760 					soseterror(so, EPROTO);
1761 					mutex_exit(&so->so_lock);
1762 					zcmn_err(getzoneid(), CE_WARN,
1763 					    "sockfs: T_unidata_ind with "
1764 					    "invalid optlen/offset %u/%d\n",
1765 					    optlen, tudi->OPT_offset);
1766 					return (NULL);
1767 				}
1768 				if (so_getopt_unix_close(opt, optlen)) {
1769 					freemsg(mp);
1770 					return (NULL);
1771 				}
1772 			}
1773 			*allmsgsigs = S_INPUT | S_RDNORM;
1774 			*pollwakeups = POLLIN | POLLRDNORM;
1775 			*wakeups = RSLEEP;
1776 			if (auditing)
1777 				audit_sock(T_UNITDATA_IND, strvp2wq(vp),
1778 				    mp, 0);
1779 			return (mp);
1780 		}
1781 
1782 		/*
1783 		 * A connect datagram socket. For AF_INET{,6} we verify that
1784 		 * the source address matches the "connected to" address.
1785 		 * The semantics of AF_UNIX sockets is to not verify
1786 		 * the source address.
1787 		 * Note that this source address verification is transport
1788 		 * specific. Thus the real fix would be to extend TPI
1789 		 * to allow T_CONN_REQ messages to be sent to connectionless
1790 		 * transport providers and always let the transport provider
1791 		 * do whatever filtering is needed.
1792 		 *
1793 		 * The verification/filtering semantics for transports
1794 		 * other than AF_INET and AF_UNIX are unknown. The choice
1795 		 * would be to either filter using bcmp or let all messages
1796 		 * get through. This code does not filter other address
1797 		 * families since this at least allows the application to
1798 		 * work around any missing filtering.
1799 		 *
1800 		 * XXX Should we move filtering to UDP/ICMP???
1801 		 * That would require passing e.g. a T_DISCON_REQ to UDP
1802 		 * when the socket becomes unconnected.
1803 		 */
1804 		addrlen = tudi->SRC_length;
1805 		/*
1806 		 * The alignment restriction is really too strict but
1807 		 * we want enough alignment to inspect the fields of
1808 		 * a sockaddr_in.
1809 		 */
1810 		addr = sogetoff(mp, tudi->SRC_offset, addrlen,
1811 		    __TPI_ALIGN_SIZE);
1812 		if (addr == NULL) {
1813 			freemsg(mp);
1814 			mutex_enter(&so->so_lock);
1815 			soseterror(so, EPROTO);
1816 			mutex_exit(&so->so_lock);
1817 			zcmn_err(getzoneid(), CE_WARN,
1818 			    "sockfs: T_unidata_ind with invalid "
1819 			    "addrlen/offset %u/%d\n",
1820 			    addrlen, tudi->SRC_offset);
1821 			return (NULL);
1822 		}
1823 
1824 		if (so->so_family == AF_INET) {
1825 			/*
1826 			 * For AF_INET we allow wildcarding both sin_addr
1827 			 * and sin_port.
1828 			 */
1829 			struct sockaddr_in *faddr, *sin;
1830 
1831 			/* Prevent sti_faddr_sa from changing while accessed */
1832 			mutex_enter(&so->so_lock);
1833 			ASSERT(sti->sti_faddr_len ==
1834 			    (socklen_t)sizeof (struct sockaddr_in));
1835 			faddr = (struct sockaddr_in *)sti->sti_faddr_sa;
1836 			sin = (struct sockaddr_in *)addr;
1837 			if (addrlen !=
1838 			    (t_uscalar_t)sizeof (struct sockaddr_in) ||
1839 			    (sin->sin_addr.s_addr != faddr->sin_addr.s_addr &&
1840 			    faddr->sin_addr.s_addr != INADDR_ANY) ||
1841 			    (so->so_type != SOCK_RAW &&
1842 			    sin->sin_port != faddr->sin_port &&
1843 			    faddr->sin_port != 0)) {
1844 #ifdef DEBUG
1845 				dprintso(so, 0,
1846 				    ("sockfs: T_UNITDATA_IND mismatch: %s",
1847 				    pr_addr(so->so_family,
1848 				    (struct sockaddr *)addr, addrlen)));
1849 				dprintso(so, 0, (" - %s\n",
1850 				    pr_addr(so->so_family, sti->sti_faddr_sa,
1851 				    (t_uscalar_t)sti->sti_faddr_len)));
1852 #endif /* DEBUG */
1853 				mutex_exit(&so->so_lock);
1854 				freemsg(mp);
1855 				return (NULL);
1856 			}
1857 			mutex_exit(&so->so_lock);
1858 		} else if (so->so_family == AF_INET6) {
1859 			/*
1860 			 * For AF_INET6 we allow wildcarding both sin6_addr
1861 			 * and sin6_port.
1862 			 */
1863 			struct sockaddr_in6 *faddr6, *sin6;
1864 			static struct in6_addr zeroes; /* inits to all zeros */
1865 
1866 			/* Prevent sti_faddr_sa from changing while accessed */
1867 			mutex_enter(&so->so_lock);
1868 			ASSERT(sti->sti_faddr_len ==
1869 			    (socklen_t)sizeof (struct sockaddr_in6));
1870 			faddr6 = (struct sockaddr_in6 *)sti->sti_faddr_sa;
1871 			sin6 = (struct sockaddr_in6 *)addr;
1872 			/* XXX could we get a mapped address ::ffff:0.0.0.0 ? */
1873 			if (addrlen !=
1874 			    (t_uscalar_t)sizeof (struct sockaddr_in6) ||
1875 			    (!IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
1876 			    &faddr6->sin6_addr) &&
1877 			    !IN6_ARE_ADDR_EQUAL(&faddr6->sin6_addr, &zeroes)) ||
1878 			    (so->so_type != SOCK_RAW &&
1879 			    sin6->sin6_port != faddr6->sin6_port &&
1880 			    faddr6->sin6_port != 0)) {
1881 #ifdef DEBUG
1882 				dprintso(so, 0,
1883 				    ("sockfs: T_UNITDATA_IND mismatch: %s",
1884 				    pr_addr(so->so_family,
1885 				    (struct sockaddr *)addr, addrlen)));
1886 				dprintso(so, 0, (" - %s\n",
1887 				    pr_addr(so->so_family, sti->sti_faddr_sa,
1888 				    (t_uscalar_t)sti->sti_faddr_len)));
1889 #endif /* DEBUG */
1890 				mutex_exit(&so->so_lock);
1891 				freemsg(mp);
1892 				return (NULL);
1893 			}
1894 			mutex_exit(&so->so_lock);
1895 		} else if (so->so_family == AF_UNIX &&
1896 		    msgdsize(mp->b_cont) == 0 &&
1897 		    tudi->OPT_length != 0) {
1898 			/*
1899 			 * Attempt to extract AF_UNIX
1900 			 * SO_UNIX_CLOSE indication from options.
1901 			 */
1902 			void *opt;
1903 			t_uscalar_t optlen = tudi->OPT_length;
1904 
1905 			opt = sogetoff(mp, tudi->OPT_offset,
1906 			    optlen, __TPI_ALIGN_SIZE);
1907 			if (opt == NULL) {
1908 				/* The len/off falls outside mp */
1909 				freemsg(mp);
1910 				mutex_enter(&so->so_lock);
1911 				soseterror(so, EPROTO);
1912 				mutex_exit(&so->so_lock);
1913 				zcmn_err(getzoneid(), CE_WARN,
1914 				    "sockfs: T_unidata_ind with invalid "
1915 				    "optlen/offset %u/%d\n",
1916 				    optlen, tudi->OPT_offset);
1917 				return (NULL);
1918 			}
1919 			/*
1920 			 * If we received a unix close indication mark the
1921 			 * socket and discard this message.
1922 			 */
1923 			if (so_getopt_unix_close(opt, optlen)) {
1924 				mutex_enter(&so->so_lock);
1925 				sobreakconn(so, ECONNRESET);
1926 				mutex_exit(&so->so_lock);
1927 				strsetrerror(SOTOV(so), 0, 0, sogetrderr);
1928 				freemsg(mp);
1929 				*pollwakeups = POLLIN | POLLRDNORM;
1930 				*allmsgsigs = S_INPUT | S_RDNORM;
1931 				*wakeups = RSLEEP;
1932 				return (NULL);
1933 			}
1934 		}
1935 		*allmsgsigs = S_INPUT | S_RDNORM;
1936 		*pollwakeups = POLLIN | POLLRDNORM;
1937 		*wakeups = RSLEEP;
1938 		return (mp);
1939 	}
1940 
1941 	case T_OPTDATA_IND: {
1942 		struct T_optdata_ind	*tdi = &tpr->optdata_ind;
1943 
1944 		if (MBLKL(mp) < sizeof (struct T_optdata_ind)) {
1945 			zcmn_err(getzoneid(), CE_WARN,
1946 			    "sockfs: Too short T_OPTDATA_IND. Len = %ld\n",
1947 			    (ptrdiff_t)(MBLKL(mp)));
1948 			freemsg(mp);
1949 			return (NULL);
1950 		}
1951 		/*
1952 		 * Allow zero-length messages carrying options.
1953 		 * This is used when carrying the SO_UNIX_CLOSE option.
1954 		 */
1955 		if (so->so_family == AF_UNIX && msgdsize(mp->b_cont) == 0 &&
1956 		    tdi->OPT_length != 0) {
1957 			/*
1958 			 * Attempt to extract AF_UNIX close indication
1959 			 * from the options. Ignore any other options -
1960 			 * those are handled once the message is removed
1961 			 * from the queue.
1962 			 * The close indication message should not carry data.
1963 			 */
1964 			void *opt;
1965 			t_uscalar_t optlen = tdi->OPT_length;
1966 
1967 			opt = sogetoff(mp, tdi->OPT_offset,
1968 			    optlen, __TPI_ALIGN_SIZE);
1969 			if (opt == NULL) {
1970 				/* The len/off falls outside mp */
1971 				freemsg(mp);
1972 				mutex_enter(&so->so_lock);
1973 				soseterror(so, EPROTO);
1974 				mutex_exit(&so->so_lock);
1975 				zcmn_err(getzoneid(), CE_WARN,
1976 				    "sockfs: T_optdata_ind with invalid "
1977 				    "optlen/offset %u/%d\n",
1978 				    optlen, tdi->OPT_offset);
1979 				return (NULL);
1980 			}
1981 			/*
1982 			 * If we received a close indication mark the
1983 			 * socket and discard this message.
1984 			 */
1985 			if (so_getopt_unix_close(opt, optlen)) {
1986 				mutex_enter(&so->so_lock);
1987 				socantsendmore(so);
1988 				sti->sti_faddr_valid = 0;
1989 				mutex_exit(&so->so_lock);
1990 				strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
1991 				freemsg(mp);
1992 				return (NULL);
1993 			}
1994 		}
1995 		*allmsgsigs = S_INPUT | S_RDNORM;
1996 		*pollwakeups = POLLIN | POLLRDNORM;
1997 		*wakeups = RSLEEP;
1998 		return (mp);
1999 	}
2000 
2001 	case T_EXDATA_IND: {
2002 		mblk_t		*mctl, *mdata;
2003 		mblk_t *lbp;
2004 		union T_primitives *tprp;
2005 		struct stdata   *stp;
2006 		queue_t *qp;
2007 
2008 		if (MBLKL(mp) < sizeof (struct T_exdata_ind)) {
2009 			zcmn_err(getzoneid(), CE_WARN,
2010 			    "sockfs: Too short T_EXDATA_IND. Len = %ld\n",
2011 			    (ptrdiff_t)(MBLKL(mp)));
2012 			freemsg(mp);
2013 			return (NULL);
2014 		}
2015 		/*
2016 		 * Ignore zero-length T_EXDATA_IND messages. These might be
2017 		 * generated by some transports.
2018 		 *
2019 		 * This is needed to prevent read (which skips the M_PROTO
2020 		 * part) to unexpectedly return 0 (or return EWOULDBLOCK
2021 		 * on a non-blocking socket after select/poll has indicated
2022 		 * that data is available).
2023 		 */
2024 		dprintso(so, 1,
2025 		    ("T_EXDATA_IND(%p): counts %d/%d state %s\n",
2026 		    (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt,
2027 		    pr_state(so->so_state, so->so_mode)));
2028 
2029 		if (msgdsize(mp->b_cont) == 0) {
2030 			dprintso(so, 0,
2031 			    ("strsock_proto: zero length T_EXDATA_IND\n"));
2032 			freemsg(mp);
2033 			return (NULL);
2034 		}
2035 
2036 		/*
2037 		 * Split into the T_EXDATA_IND and the M_DATA part.
2038 		 * We process these three pieces separately:
2039 		 *	signal generation
2040 		 *	handling T_EXDATA_IND
2041 		 *	handling M_DATA component
2042 		 */
2043 		mctl = mp;
2044 		mdata = mctl->b_cont;
2045 		mctl->b_cont = NULL;
2046 		mutex_enter(&so->so_lock);
2047 		so_oob_sig(so, 0, allmsgsigs, pollwakeups);
2048 		mctl = so_oob_exdata(so, mctl, allmsgsigs, pollwakeups);
2049 		mdata = so_oob_data(so, mdata, allmsgsigs, pollwakeups);
2050 
2051 		stp = vp->v_stream;
2052 		ASSERT(stp != NULL);
2053 		qp = _RD(stp->sd_wrq);
2054 
2055 		mutex_enter(QLOCK(qp));
2056 		lbp = qp->q_last;
2057 
2058 		/*
2059 		 * We want to avoid queueing up a string of T_EXDATA_IND
2060 		 * messages with no intervening data messages at the stream
2061 		 * head. These messages contribute to the total message
2062 		 * count. Eventually this can lead to STREAMS flow control
2063 		 * and also cause TCP to advertise a zero window condition
2064 		 * to the peer. This can happen in the degenerate case where
2065 		 * the sender and receiver exchange only OOB data. The sender
2066 		 * only sends messages with MSG_OOB flag and the receiver
2067 		 * receives only MSG_OOB messages and does not use SO_OOBINLINE.
2068 		 * An example of this scenario has been reported in applications
2069 		 * that use OOB data to exchange heart beats. Flow control
2070 		 * relief will never happen if the application only reads OOB
2071 		 * data which is done directly by sorecvoob() and the
2072 		 * T_EXDATA_IND messages at the streamhead won't be consumed.
2073 		 * Note that there is no correctness issue in compressing the
2074 		 * string of T_EXDATA_IND messages into a single T_EXDATA_IND
2075 		 * message. A single read that does not specify MSG_OOB will
2076 		 * read across all the marks in a loop in sotpi_recvmsg().
2077 		 * Each mark is individually distinguishable only if the
2078 		 * T_EXDATA_IND messages are separated by data messages.
2079 		 */
2080 		if ((qp->q_first != NULL) && (DB_TYPE(lbp) == M_PROTO)) {
2081 			tprp = (union T_primitives *)lbp->b_rptr;
2082 			if ((tprp->type == T_EXDATA_IND) &&
2083 			    !(so->so_options & SO_OOBINLINE)) {
2084 
2085 				/*
2086 				 * free the new M_PROTO message
2087 				 */
2088 				freemsg(mctl);
2089 
2090 				/*
2091 				 * adjust the OOB count and OOB	signal count
2092 				 * just incremented for the new OOB data.
2093 				 */
2094 				sti->sti_oobcnt--;
2095 				sti->sti_oobsigcnt--;
2096 				mutex_exit(QLOCK(qp));
2097 				mutex_exit(&so->so_lock);
2098 				return (NULL);
2099 			}
2100 		}
2101 		mutex_exit(QLOCK(qp));
2102 
2103 		/*
2104 		 * Pass the T_EXDATA_IND and the M_DATA back separately
2105 		 * by using b_next linkage. (The stream head will queue any
2106 		 * b_next linked messages separately.) This is needed
2107 		 * since MSGMARK applies to the last byte of the message
2108 		 * hence we can not have any M_DATA component attached
2109 		 * to the marked T_EXDATA_IND. Note that the stream head
2110 		 * will not consolidate M_DATA messages onto an MSGMARK'ed
2111 		 * message in order to preserve the constraint that
2112 		 * the T_EXDATA_IND always is a separate message.
2113 		 */
2114 		ASSERT(mctl != NULL);
2115 		mctl->b_next = mdata;
2116 		mp = mctl;
2117 #ifdef DEBUG
2118 		if (mdata == NULL) {
2119 			dprintso(so, 1,
2120 			    ("after outofline T_EXDATA_IND(%p): "
2121 			    "counts %d/%d  poll 0x%x sig 0x%x state %s\n",
2122 			    (void *)vp, sti->sti_oobsigcnt,
2123 			    sti->sti_oobcnt, *pollwakeups, *allmsgsigs,
2124 			    pr_state(so->so_state, so->so_mode)));
2125 		} else {
2126 			dprintso(so, 1,
2127 			    ("after inline T_EXDATA_IND(%p): "
2128 			    "counts %d/%d  poll 0x%x sig 0x%x state %s\n",
2129 			    (void *)vp, sti->sti_oobsigcnt,
2130 			    sti->sti_oobcnt, *pollwakeups, *allmsgsigs,
2131 			    pr_state(so->so_state, so->so_mode)));
2132 		}
2133 #endif /* DEBUG */
2134 		mutex_exit(&so->so_lock);
2135 		*wakeups = RSLEEP;
2136 		return (mp);
2137 	}
2138 
2139 	case T_CONN_CON: {
2140 		struct T_conn_con	*conn_con;
2141 		void			*addr;
2142 		t_uscalar_t		addrlen;
2143 
2144 		/*
2145 		 * Verify the state, update the state to ISCONNECTED,
2146 		 * record the potentially new address in the message,
2147 		 * and drop the message.
2148 		 */
2149 		if (MBLKL(mp) < sizeof (struct T_conn_con)) {
2150 			zcmn_err(getzoneid(), CE_WARN,
2151 			    "sockfs: Too short T_CONN_CON. Len = %ld\n",
2152 			    (ptrdiff_t)(MBLKL(mp)));
2153 			freemsg(mp);
2154 			return (NULL);
2155 		}
2156 
2157 		mutex_enter(&so->so_lock);
2158 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) !=
2159 		    SS_ISCONNECTING) {
2160 			mutex_exit(&so->so_lock);
2161 			dprintso(so, 1,
2162 			    ("T_CONN_CON: state %x\n", so->so_state));
2163 			freemsg(mp);
2164 			return (NULL);
2165 		}
2166 
2167 		conn_con = &tpr->conn_con;
2168 		addrlen = conn_con->RES_length;
2169 		/*
2170 		 * Allow the address to be of different size than sent down
2171 		 * in the T_CONN_REQ as long as it doesn't exceed the maxlen.
2172 		 * For AF_UNIX require the identical length.
2173 		 */
2174 		if (so->so_family == AF_UNIX ?
2175 		    addrlen != (t_uscalar_t)sizeof (sti->sti_ux_laddr) :
2176 		    addrlen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2177 			zcmn_err(getzoneid(), CE_WARN,
2178 			    "sockfs: T_conn_con with different "
2179 			    "length %u/%d\n",
2180 			    addrlen, conn_con->RES_length);
2181 			soisdisconnected(so, EPROTO);
2182 			sti->sti_laddr_valid = 0;
2183 			sti->sti_faddr_valid = 0;
2184 			mutex_exit(&so->so_lock);
2185 			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
2186 			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2187 			strseteof(SOTOV(so), 1);
2188 			freemsg(mp);
2189 			/*
2190 			 * strseteof takes care of read side wakeups,
2191 			 * pollwakeups, and signals.
2192 			 */
2193 			*wakeups = WSLEEP;
2194 			*allmsgsigs = S_OUTPUT;
2195 			*pollwakeups = POLLOUT;
2196 			return (NULL);
2197 		}
2198 		addr = sogetoff(mp, conn_con->RES_offset, addrlen, 1);
2199 		if (addr == NULL) {
2200 			zcmn_err(getzoneid(), CE_WARN,
2201 			    "sockfs: T_conn_con with invalid "
2202 			    "addrlen/offset %u/%d\n",
2203 			    addrlen, conn_con->RES_offset);
2204 			mutex_exit(&so->so_lock);
2205 			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
2206 			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2207 			strseteof(SOTOV(so), 1);
2208 			freemsg(mp);
2209 			/*
2210 			 * strseteof takes care of read side wakeups,
2211 			 * pollwakeups, and signals.
2212 			 */
2213 			*wakeups = WSLEEP;
2214 			*allmsgsigs = S_OUTPUT;
2215 			*pollwakeups = POLLOUT;
2216 			return (NULL);
2217 		}
2218 
2219 		/*
2220 		 * Save for getpeername.
2221 		 */
2222 		if (so->so_family != AF_UNIX) {
2223 			sti->sti_faddr_len = (socklen_t)addrlen;
2224 			ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2225 			bcopy(addr, sti->sti_faddr_sa, addrlen);
2226 			sti->sti_faddr_valid = 1;
2227 		}
2228 
2229 		if (so->so_peercred != NULL)
2230 			crfree(so->so_peercred);
2231 		so->so_peercred = msg_getcred(mp, &so->so_cpid);
2232 		if (so->so_peercred != NULL)
2233 			crhold(so->so_peercred);
2234 
2235 		/* Wakeup anybody sleeping in sowaitconnected */
2236 		soisconnected(so);
2237 		mutex_exit(&so->so_lock);
2238 
2239 		/*
2240 		 * The socket is now available for sending data.
2241 		 */
2242 		*wakeups = WSLEEP;
2243 		*allmsgsigs = S_OUTPUT;
2244 		*pollwakeups = POLLOUT;
2245 		freemsg(mp);
2246 		return (NULL);
2247 	}
2248 
2249 	case T_CONN_IND:
2250 		/*
2251 		 * Verify the min size and queue the message on
2252 		 * the sti_conn_ind_head/tail list.
2253 		 */
2254 		if (MBLKL(mp) < sizeof (struct T_conn_ind)) {
2255 			zcmn_err(getzoneid(), CE_WARN,
2256 			    "sockfs: Too short T_CONN_IND. Len = %ld\n",
2257 			    (ptrdiff_t)(MBLKL(mp)));
2258 			freemsg(mp);
2259 			return (NULL);
2260 		}
2261 
2262 		if (auditing)
2263 			audit_sock(T_CONN_IND, strvp2wq(vp), mp, 0);
2264 		if (!(so->so_state & SS_ACCEPTCONN)) {
2265 			zcmn_err(getzoneid(), CE_WARN,
2266 			    "sockfs: T_conn_ind on non-listening socket\n");
2267 			freemsg(mp);
2268 			return (NULL);
2269 		}
2270 
2271 		soqueueconnind(so, mp);
2272 		*allmsgsigs = S_INPUT | S_RDNORM;
2273 		*pollwakeups = POLLIN | POLLRDNORM;
2274 		*wakeups = RSLEEP;
2275 		return (NULL);
2276 
2277 	case T_ORDREL_IND:
2278 		if (MBLKL(mp) < sizeof (struct T_ordrel_ind)) {
2279 			zcmn_err(getzoneid(), CE_WARN,
2280 			    "sockfs: Too short T_ORDREL_IND. Len = %ld\n",
2281 			    (ptrdiff_t)(MBLKL(mp)));
2282 			freemsg(mp);
2283 			return (NULL);
2284 		}
2285 
2286 		/*
2287 		 * Some providers send this when not fully connected.
2288 		 * SunLink X.25 needs to retrieve disconnect reason after
2289 		 * disconnect for compatibility. It uses T_ORDREL_IND
2290 		 * instead of T_DISCON_IND so that it may use the
2291 		 * endpoint after a connect failure to retrieve the
2292 		 * reason using an ioctl. Thus we explicitly clear
2293 		 * SS_ISCONNECTING here for SunLink X.25.
2294 		 * This is a needed TPI violation.
2295 		 */
2296 		mutex_enter(&so->so_lock);
2297 		so->so_state &= ~SS_ISCONNECTING;
2298 		socantrcvmore(so);
2299 		mutex_exit(&so->so_lock);
2300 		strseteof(SOTOV(so), 1);
2301 		/*
2302 		 * strseteof takes care of read side wakeups,
2303 		 * pollwakeups, and signals.
2304 		 */
2305 		freemsg(mp);
2306 		return (NULL);
2307 
2308 	case T_DISCON_IND:
2309 		if (MBLKL(mp) < sizeof (struct T_discon_ind)) {
2310 			zcmn_err(getzoneid(), CE_WARN,
2311 			    "sockfs: Too short T_DISCON_IND. Len = %ld\n",
2312 			    (ptrdiff_t)(MBLKL(mp)));
2313 			freemsg(mp);
2314 			return (NULL);
2315 		}
2316 		if (so->so_state & SS_ACCEPTCONN) {
2317 			/*
2318 			 * This is a listener. Look for a queued T_CONN_IND
2319 			 * with a matching sequence number and remove it
2320 			 * from the list.
2321 			 * It is normal to not find the sequence number since
2322 			 * the soaccept might have already dequeued it
2323 			 * (in which case the T_CONN_RES will fail with
2324 			 * TBADSEQ).
2325 			 */
2326 			(void) soflushconnind(so, tpr->discon_ind.SEQ_number);
2327 			freemsg(mp);
2328 			return (0);
2329 		}
2330 
2331 		/*
2332 		 * Not a listener
2333 		 *
2334 		 * If SS_CANTRCVMORE for AF_UNIX ignore the discon_reason.
2335 		 * Such a discon_ind appears when the peer has first done
2336 		 * a shutdown() followed by a close() in which case we just
2337 		 * want to record socantsendmore.
2338 		 * In this case sockfs first receives a T_ORDREL_IND followed
2339 		 * by a T_DISCON_IND.
2340 		 * Note that for other transports (e.g. TCP) we need to handle
2341 		 * the discon_ind in this case since it signals an error.
2342 		 */
2343 		mutex_enter(&so->so_lock);
2344 		if ((so->so_state & SS_CANTRCVMORE) &&
2345 		    (so->so_family == AF_UNIX)) {
2346 			socantsendmore(so);
2347 			sti->sti_faddr_valid = 0;
2348 			mutex_exit(&so->so_lock);
2349 			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2350 			dprintso(so, 1,
2351 			    ("T_DISCON_IND: error %d\n", so->so_error));
2352 			freemsg(mp);
2353 			/*
2354 			 * Set these variables for caller to process them.
2355 			 * For the else part where T_DISCON_IND is processed,
2356 			 * this will be done in the function being called
2357 			 * (strsock_discon_ind())
2358 			 */
2359 			*wakeups = WSLEEP;
2360 			*allmsgsigs = S_OUTPUT;
2361 			*pollwakeups = POLLOUT;
2362 		} else if (so->so_flag & (SOASYNC_UNBIND | SOLOCKED)) {
2363 			/*
2364 			 * Deferred processing of T_DISCON_IND
2365 			 */
2366 			so_save_discon_ind(so, mp, strsock_discon_ind);
2367 			mutex_exit(&so->so_lock);
2368 		} else {
2369 			/*
2370 			 * Process T_DISCON_IND now
2371 			 */
2372 			(void) strsock_discon_ind(so, mp);
2373 			mutex_exit(&so->so_lock);
2374 		}
2375 		return (NULL);
2376 
2377 	case T_UDERROR_IND: {
2378 		struct T_uderror_ind	*tudi = &tpr->uderror_ind;
2379 		void			*addr;
2380 		t_uscalar_t		addrlen;
2381 		int			error;
2382 
2383 		dprintso(so, 0,
2384 		    ("T_UDERROR_IND: error %d\n", tudi->ERROR_type));
2385 
2386 		if (MBLKL(mp) < sizeof (struct T_uderror_ind)) {
2387 			zcmn_err(getzoneid(), CE_WARN,
2388 			    "sockfs: Too short T_UDERROR_IND. Len = %ld\n",
2389 			    (ptrdiff_t)(MBLKL(mp)));
2390 			freemsg(mp);
2391 			return (NULL);
2392 		}
2393 		/* Ignore on connection-oriented transports */
2394 		if (so->so_mode & SM_CONNREQUIRED) {
2395 			freemsg(mp);
2396 			eprintsoline(so, 0);
2397 			zcmn_err(getzoneid(), CE_WARN,
2398 			    "sockfs: T_uderror_ind on connection-oriented "
2399 			    "transport\n");
2400 			return (NULL);
2401 		}
2402 		addrlen = tudi->DEST_length;
2403 		addr = sogetoff(mp, tudi->DEST_offset, addrlen, 1);
2404 		if (addr == NULL) {
2405 			zcmn_err(getzoneid(), CE_WARN,
2406 			    "sockfs: T_uderror_ind with invalid "
2407 			    "addrlen/offset %u/%d\n",
2408 			    addrlen, tudi->DEST_offset);
2409 			freemsg(mp);
2410 			return (NULL);
2411 		}
2412 
2413 		/* Verify source address for connected socket. */
2414 		mutex_enter(&so->so_lock);
2415 		if (so->so_state & SS_ISCONNECTED) {
2416 			void *faddr;
2417 			t_uscalar_t faddr_len;
2418 			boolean_t match = B_FALSE;
2419 
2420 			switch (so->so_family) {
2421 			case AF_INET: {
2422 				/* Compare just IP address and port */
2423 				struct sockaddr_in *sin1, *sin2;
2424 
2425 				sin1 = (struct sockaddr_in *)sti->sti_faddr_sa;
2426 				sin2 = (struct sockaddr_in *)addr;
2427 				if (addrlen == sizeof (struct sockaddr_in) &&
2428 				    sin1->sin_port == sin2->sin_port &&
2429 				    sin1->sin_addr.s_addr ==
2430 				    sin2->sin_addr.s_addr)
2431 					match = B_TRUE;
2432 				break;
2433 			}
2434 			case AF_INET6: {
2435 				/* Compare just IP address and port. Not flow */
2436 				struct sockaddr_in6 *sin1, *sin2;
2437 
2438 				sin1 = (struct sockaddr_in6 *)sti->sti_faddr_sa;
2439 				sin2 = (struct sockaddr_in6 *)addr;
2440 				if (addrlen == sizeof (struct sockaddr_in6) &&
2441 				    sin1->sin6_port == sin2->sin6_port &&
2442 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
2443 				    &sin2->sin6_addr))
2444 					match = B_TRUE;
2445 				break;
2446 			}
2447 			case AF_UNIX:
2448 				faddr = &sti->sti_ux_faddr;
2449 				faddr_len =
2450 				    (t_uscalar_t)sizeof (sti->sti_ux_faddr);
2451 				if (faddr_len == addrlen &&
2452 				    bcmp(addr, faddr, addrlen) == 0)
2453 					match = B_TRUE;
2454 				break;
2455 			default:
2456 				faddr = sti->sti_faddr_sa;
2457 				faddr_len = (t_uscalar_t)sti->sti_faddr_len;
2458 				if (faddr_len == addrlen &&
2459 				    bcmp(addr, faddr, addrlen) == 0)
2460 					match = B_TRUE;
2461 				break;
2462 			}
2463 
2464 			if (!match) {
2465 #ifdef DEBUG
2466 				dprintso(so, 0,
2467 				    ("sockfs: T_UDERR_IND mismatch: %s - ",
2468 				    pr_addr(so->so_family,
2469 				    (struct sockaddr *)addr, addrlen)));
2470 				dprintso(so, 0, ("%s\n",
2471 				    pr_addr(so->so_family, sti->sti_faddr_sa,
2472 				    sti->sti_faddr_len)));
2473 #endif /* DEBUG */
2474 				mutex_exit(&so->so_lock);
2475 				freemsg(mp);
2476 				return (NULL);
2477 			}
2478 			/*
2479 			 * Make the write error nonpersistent. If the error
2480 			 * is zero we use ECONNRESET.
2481 			 * This assumes that the name space for ERROR_type
2482 			 * is the errno name space.
2483 			 */
2484 			if (tudi->ERROR_type != 0)
2485 				error = tudi->ERROR_type;
2486 			else
2487 				error = ECONNRESET;
2488 
2489 			soseterror(so, error);
2490 			mutex_exit(&so->so_lock);
2491 			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
2492 			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2493 			*wakeups = RSLEEP | WSLEEP;
2494 			*allmsgsigs = S_INPUT | S_RDNORM | S_OUTPUT;
2495 			*pollwakeups = POLLIN | POLLRDNORM | POLLOUT;
2496 			freemsg(mp);
2497 			return (NULL);
2498 		}
2499 		/*
2500 		 * If the application asked for delayed errors
2501 		 * record the T_UDERROR_IND sti_eaddr_mp and the reason in
2502 		 * sti_delayed_error for delayed error posting. If the reason
2503 		 * is zero use ECONNRESET.
2504 		 * Note that delayed error indications do not make sense for
2505 		 * AF_UNIX sockets since sendto checks that the destination
2506 		 * address is valid at the time of the sendto.
2507 		 */
2508 		if (!(so->so_options & SO_DGRAM_ERRIND)) {
2509 			mutex_exit(&so->so_lock);
2510 			freemsg(mp);
2511 			return (NULL);
2512 		}
2513 		if (sti->sti_eaddr_mp != NULL)
2514 			freemsg(sti->sti_eaddr_mp);
2515 
2516 		sti->sti_eaddr_mp = mp;
2517 		if (tudi->ERROR_type != 0)
2518 			error = tudi->ERROR_type;
2519 		else
2520 			error = ECONNRESET;
2521 		sti->sti_delayed_error = (ushort_t)error;
2522 		mutex_exit(&so->so_lock);
2523 		return (NULL);
2524 	}
2525 
2526 	case T_ERROR_ACK:
2527 		dprintso(so, 0,
2528 		    ("strsock_proto: T_ERROR_ACK for %d, error %d/%d\n",
2529 		    tpr->error_ack.ERROR_prim,
2530 		    tpr->error_ack.TLI_error,
2531 		    tpr->error_ack.UNIX_error));
2532 
2533 		if (MBLKL(mp) < sizeof (struct T_error_ack)) {
2534 			zcmn_err(getzoneid(), CE_WARN,
2535 			    "sockfs: Too short T_ERROR_ACK. Len = %ld\n",
2536 			    (ptrdiff_t)(MBLKL(mp)));
2537 			freemsg(mp);
2538 			return (NULL);
2539 		}
2540 		/*
2541 		 * Check if we were waiting for the async message
2542 		 */
2543 		mutex_enter(&so->so_lock);
2544 		if ((so->so_flag & SOASYNC_UNBIND) &&
2545 		    tpr->error_ack.ERROR_prim == T_UNBIND_REQ) {
2546 			so_unlock_single(so, SOASYNC_UNBIND);
2547 			mutex_exit(&so->so_lock);
2548 			freemsg(mp);
2549 			return (NULL);
2550 		}
2551 		mutex_exit(&so->so_lock);
2552 		soqueueack(so, mp);
2553 		return (NULL);
2554 
2555 	case T_OK_ACK:
2556 		if (MBLKL(mp) < sizeof (struct T_ok_ack)) {
2557 			zcmn_err(getzoneid(), CE_WARN,
2558 			    "sockfs: Too short T_OK_ACK. Len = %ld\n",
2559 			    (ptrdiff_t)(MBLKL(mp)));
2560 			freemsg(mp);
2561 			return (NULL);
2562 		}
2563 		/*
2564 		 * Check if we were waiting for the async message
2565 		 */
2566 		mutex_enter(&so->so_lock);
2567 		if ((so->so_flag & SOASYNC_UNBIND) &&
2568 		    tpr->ok_ack.CORRECT_prim == T_UNBIND_REQ) {
2569 			dprintso(so, 1,
2570 			    ("strsock_proto: T_OK_ACK async unbind\n"));
2571 			so_unlock_single(so, SOASYNC_UNBIND);
2572 			mutex_exit(&so->so_lock);
2573 			freemsg(mp);
2574 			return (NULL);
2575 		}
2576 		mutex_exit(&so->so_lock);
2577 		soqueueack(so, mp);
2578 		return (NULL);
2579 
2580 	case T_INFO_ACK:
2581 		if (MBLKL(mp) < sizeof (struct T_info_ack)) {
2582 			zcmn_err(getzoneid(), CE_WARN,
2583 			    "sockfs: Too short T_INFO_ACK. Len = %ld\n",
2584 			    (ptrdiff_t)(MBLKL(mp)));
2585 			freemsg(mp);
2586 			return (NULL);
2587 		}
2588 		soqueueack(so, mp);
2589 		return (NULL);
2590 
2591 	case T_CAPABILITY_ACK:
2592 		/*
2593 		 * A T_capability_ack need only be large enough to hold
2594 		 * the PRIM_type and CAP_bits1 fields; checking for anything
2595 		 * larger might reject a correct response from an older
2596 		 * provider.
2597 		 */
2598 		if (MBLKL(mp) < 2 * sizeof (t_uscalar_t)) {
2599 			zcmn_err(getzoneid(), CE_WARN,
2600 			    "sockfs: Too short T_CAPABILITY_ACK. Len = %ld\n",
2601 			    (ptrdiff_t)(MBLKL(mp)));
2602 			freemsg(mp);
2603 			return (NULL);
2604 		}
2605 		soqueueack(so, mp);
2606 		return (NULL);
2607 
2608 	case T_BIND_ACK:
2609 		if (MBLKL(mp) < sizeof (struct T_bind_ack)) {
2610 			zcmn_err(getzoneid(), CE_WARN,
2611 			    "sockfs: Too short T_BIND_ACK. Len = %ld\n",
2612 			    (ptrdiff_t)(MBLKL(mp)));
2613 			freemsg(mp);
2614 			return (NULL);
2615 		}
2616 		soqueueack(so, mp);
2617 		return (NULL);
2618 
2619 	case T_OPTMGMT_ACK:
2620 		if (MBLKL(mp) < sizeof (struct T_optmgmt_ack)) {
2621 			zcmn_err(getzoneid(), CE_WARN,
2622 			    "sockfs: Too short T_OPTMGMT_ACK. Len = %ld\n",
2623 			    (ptrdiff_t)(MBLKL(mp)));
2624 			freemsg(mp);
2625 			return (NULL);
2626 		}
2627 		soqueueack(so, mp);
2628 		return (NULL);
2629 	default:
2630 #ifdef DEBUG
2631 		zcmn_err(getzoneid(), CE_WARN,
2632 		    "sockfs: unknown TPI primitive %d received\n",
2633 		    tpr->type);
2634 #endif /* DEBUG */
2635 		freemsg(mp);
2636 		return (NULL);
2637 	}
2638 }
2639 
2640 /*
2641  * This routine is registered with the stream head to receive other
2642  * (non-data, and non-proto) messages.
2643  *
2644  * Returns NULL if the message was consumed.
2645  * Returns an mblk to make that mblk be processed by the stream head.
2646  *
2647  * Sets the return parameters (*wakeups, *firstmsgsigs, *allmsgsigs, and
2648  * *pollwakeups) for the stream head to take action on.
2649  */
2650 static mblk_t *
2651 strsock_misc(vnode_t *vp, mblk_t *mp,
2652 		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
2653 		strsigset_t *allmsgsigs, strpollset_t *pollwakeups)
2654 {
2655 	struct sonode *so;
2656 	sotpi_info_t *sti;
2657 
2658 	so = VTOSO(vp);
2659 	sti = SOTOTPI(so);
2660 
2661 	dprintso(so, 1, ("strsock_misc(%p, %p, 0x%x)\n",
2662 	    (void *)vp, (void *)mp, DB_TYPE(mp)));
2663 
2664 	/* Set default return values */
2665 	*wakeups = *allmsgsigs = *firstmsgsigs = *pollwakeups = 0;
2666 
2667 	switch (DB_TYPE(mp)) {
2668 	case M_PCSIG:
2669 		/*
2670 		 * This assumes that an M_PCSIG for the urgent data arrives
2671 		 * before the corresponding T_EXDATA_IND.
2672 		 *
2673 		 * Note: Just like in SunOS 4.X and 4.4BSD a poll will be
2674 		 * awoken before the urgent data shows up.
2675 		 * For OOBINLINE this can result in select returning
2676 		 * only exceptions as opposed to except|read.
2677 		 */
2678 		if (*mp->b_rptr == SIGURG) {
2679 			mutex_enter(&so->so_lock);
2680 			dprintso(so, 1,
2681 			    ("SIGURG(%p): counts %d/%d state %s\n",
2682 			    (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt,
2683 			    pr_state(so->so_state, so->so_mode)));
2684 			so_oob_sig(so, 1, allmsgsigs, pollwakeups);
2685 			dprintso(so, 1,
2686 			    ("after SIGURG(%p): counts %d/%d "
2687 			    " poll 0x%x sig 0x%x state %s\n",
2688 			    (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt,
2689 			    *pollwakeups, *allmsgsigs,
2690 			    pr_state(so->so_state, so->so_mode)));
2691 			mutex_exit(&so->so_lock);
2692 		}
2693 		freemsg(mp);
2694 		return (NULL);
2695 
2696 	case M_SIG:
2697 	case M_HANGUP:
2698 	case M_UNHANGUP:
2699 	case M_ERROR:
2700 		/* M_ERRORs etc are ignored */
2701 		freemsg(mp);
2702 		return (NULL);
2703 
2704 	case M_FLUSH:
2705 		/*
2706 		 * Do not flush read queue. If the M_FLUSH
2707 		 * arrives because of an impending T_discon_ind
2708 		 * we still have to keep any queued data - this is part of
2709 		 * socket semantics.
2710 		 */
2711 		if (*mp->b_rptr & FLUSHW) {
2712 			*mp->b_rptr &= ~FLUSHR;
2713 			return (mp);
2714 		}
2715 		freemsg(mp);
2716 		return (NULL);
2717 
2718 	default:
2719 		return (mp);
2720 	}
2721 }
2722 
2723 
2724 /* Register to receive signals for certain events */
2725 int
2726 so_set_asyncsigs(vnode_t *vp, pid_t pgrp, int events, int mode, cred_t *cr)
2727 {
2728 	struct strsigset ss;
2729 	int32_t rval;
2730 
2731 	/*
2732 	 * Note that SOLOCKED will be set except for the call from soaccept().
2733 	 */
2734 	ASSERT(!mutex_owned(&VTOSO(vp)->so_lock));
2735 	ss.ss_pid = pgrp;
2736 	ss.ss_events = events;
2737 	return (strioctl(vp, I_ESETSIG, (intptr_t)&ss, mode, K_TO_K, cr,
2738 	    &rval));
2739 }
2740 
2741 
2742 /* Register for events matching the SS_ASYNC flag */
2743 int
2744 so_set_events(struct sonode *so, vnode_t *vp, cred_t *cr)
2745 {
2746 	int events = so->so_state & SS_ASYNC ?
2747 	    S_RDBAND | S_BANDURG | S_RDNORM | S_OUTPUT :
2748 	    S_RDBAND | S_BANDURG;
2749 
2750 	return (so_set_asyncsigs(vp, so->so_pgrp, events, 0, cr));
2751 }
2752 
2753 
2754 /* Change the SS_ASYNC flag, and update signal delivery if needed */
2755 int
2756 so_flip_async(struct sonode *so, vnode_t *vp, int mode, cred_t *cr)
2757 {
2758 	ASSERT(mutex_owned(&so->so_lock));
2759 	if (so->so_pgrp != 0) {
2760 		int error;
2761 		int events = so->so_state & SS_ASYNC ?		/* Old flag */
2762 		    S_RDBAND | S_BANDURG :			/* New sigs */
2763 		    S_RDBAND | S_BANDURG | S_RDNORM | S_OUTPUT;
2764 
2765 		so_lock_single(so);
2766 		mutex_exit(&so->so_lock);
2767 
2768 		error = so_set_asyncsigs(vp, so->so_pgrp, events, mode, cr);
2769 
2770 		mutex_enter(&so->so_lock);
2771 		so_unlock_single(so, SOLOCKED);
2772 		if (error)
2773 			return (error);
2774 	}
2775 	so->so_state ^= SS_ASYNC;
2776 	return (0);
2777 }
2778 
2779 /*
2780  * Set new pid/pgrp for SIGPOLL (or SIGIO for FIOASYNC mode), replacing
2781  * any existing one.  If passed zero, just clear the existing one.
2782  */
2783 int
2784 so_set_siggrp(struct sonode *so, vnode_t *vp, pid_t pgrp, int mode, cred_t *cr)
2785 {
2786 	int events = so->so_state & SS_ASYNC ?
2787 	    S_RDBAND | S_BANDURG | S_RDNORM | S_OUTPUT :
2788 	    S_RDBAND | S_BANDURG;
2789 	int error;
2790 
2791 	ASSERT(mutex_owned(&so->so_lock));
2792 
2793 	/*
2794 	 * Change socket process (group).
2795 	 *
2796 	 * strioctl (via so_set_asyncsigs) will perform permission check and
2797 	 * also keep a PID_HOLD to prevent the pid from being reused.
2798 	 */
2799 	so_lock_single(so);
2800 	mutex_exit(&so->so_lock);
2801 
2802 	if (pgrp != 0) {
2803 		dprintso(so, 1, ("setown: adding pgrp %d ev 0x%x\n",
2804 		    pgrp, events));
2805 		error = so_set_asyncsigs(vp, pgrp, events, mode, cr);
2806 		if (error != 0) {
2807 			eprintsoline(so, error);
2808 			goto bad;
2809 		}
2810 	}
2811 	/* Remove the previously registered process/group */
2812 	if (so->so_pgrp != 0) {
2813 		dprintso(so, 1, ("setown: removing pgrp %d\n", so->so_pgrp));
2814 		error = so_set_asyncsigs(vp, so->so_pgrp, 0, mode, cr);
2815 		if (error != 0) {
2816 			eprintsoline(so, error);
2817 			error = 0;
2818 		}
2819 	}
2820 	mutex_enter(&so->so_lock);
2821 	so_unlock_single(so, SOLOCKED);
2822 	so->so_pgrp = pgrp;
2823 	return (0);
2824 bad:
2825 	mutex_enter(&so->so_lock);
2826 	so_unlock_single(so, SOLOCKED);
2827 	return (error);
2828 }
2829 
2830 /*
2831  * Wrapper for getmsg. If the socket has been converted to a stream
2832  * pass the request to the stream head.
2833  */
2834 int
2835 sock_getmsg(
2836 	struct vnode *vp,
2837 	struct strbuf *mctl,
2838 	struct strbuf *mdata,
2839 	uchar_t *prip,
2840 	int *flagsp,
2841 	int fmode,
2842 	rval_t *rvp
2843 )
2844 {
2845 	struct sonode *so;
2846 
2847 	ASSERT(vp->v_type == VSOCK);
2848 	/*
2849 	 * Use the stream head to find the real socket vnode.
2850 	 * This is needed when namefs sits above sockfs.  Some
2851 	 * sockets (like SCTP) are not streams.
2852 	 */
2853 	if (!vp->v_stream) {
2854 		return (ENOSTR);
2855 	}
2856 	ASSERT(vp->v_stream->sd_vnode);
2857 	vp = vp->v_stream->sd_vnode;
2858 	ASSERT(vn_matchops(vp, socket_vnodeops));
2859 	so = VTOSO(vp);
2860 
2861 	dprintso(so, 1, ("sock_getmsg(%p) %s\n",
2862 	    (void *)so, pr_state(so->so_state, so->so_mode)));
2863 
2864 	if (so->so_version == SOV_STREAM) {
2865 		/* The imaginary "sockmod" has been popped - act as a stream */
2866 		return (strgetmsg(vp, mctl, mdata, prip, flagsp, fmode, rvp));
2867 	}
2868 	eprintsoline(so, ENOSTR);
2869 	return (ENOSTR);
2870 }
2871 
2872 /*
2873  * Wrapper for putmsg. If the socket has been converted to a stream
2874  * pass the request to the stream head.
2875  *
2876  * Note that a while a regular socket (SOV_SOCKSTREAM) does support the
2877  * streams ioctl set it does not support putmsg and getmsg.
2878  * Allowing putmsg would prevent sockfs from tracking the state of
2879  * the socket/transport and would also invalidate the locking in sockfs.
2880  */
2881 int
2882 sock_putmsg(
2883 	struct vnode *vp,
2884 	struct strbuf *mctl,
2885 	struct strbuf *mdata,
2886 	uchar_t pri,
2887 	int flag,
2888 	int fmode
2889 )
2890 {
2891 	struct sonode *so;
2892 
2893 	ASSERT(vp->v_type == VSOCK);
2894 	/*
2895 	 * Use the stream head to find the real socket vnode.
2896 	 * This is needed when namefs sits above sockfs.
2897 	 */
2898 	if (!vp->v_stream) {
2899 		return (ENOSTR);
2900 	}
2901 	ASSERT(vp->v_stream->sd_vnode);
2902 	vp = vp->v_stream->sd_vnode;
2903 	ASSERT(vn_matchops(vp, socket_vnodeops));
2904 	so = VTOSO(vp);
2905 
2906 	dprintso(so, 1, ("sock_putmsg(%p) %s\n",
2907 	    (void *)so, pr_state(so->so_state, so->so_mode)));
2908 
2909 	if (so->so_version == SOV_STREAM) {
2910 		/* The imaginary "sockmod" has been popped - act as a stream */
2911 		return (strputmsg(vp, mctl, mdata, pri, flag, fmode));
2912 	}
2913 	eprintsoline(so, ENOSTR);
2914 	return (ENOSTR);
2915 }
2916 
2917 /*
2918  * Special function called only from f_getfl().
2919  * Returns FASYNC if the SS_ASYNC flag is set on a socket, else 0.
2920  * No locks are acquired here, so it is safe to use while uf_lock is held.
2921  * This exists solely for BSD fcntl() FASYNC compatibility.
2922  */
2923 int
2924 sock_getfasync(vnode_t *vp)
2925 {
2926 	struct sonode *so;
2927 
2928 	ASSERT(vp->v_type == VSOCK);
2929 	/*
2930 	 * For stream model, v_stream is used; For non-stream, v_stream always
2931 	 * equals NULL
2932 	 */
2933 	if (vp->v_stream != NULL)
2934 		so = VTOSO(vp->v_stream->sd_vnode);
2935 	else
2936 		so = VTOSO(vp);
2937 
2938 	if (so->so_version == SOV_STREAM || !(so->so_state & SS_ASYNC))
2939 		return (0);
2940 
2941 	return (FASYNC);
2942 }
2943