xref: /titanic_50/usr/src/uts/common/fs/sockfs/sockstr.c (revision f8c3982ab1838a24e4b671d13329f52bbbebc2a7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/inttypes.h>
31 #include <sys/t_lock.h>
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/buf.h>
35 #include <sys/conf.h>
36 #include <sys/cred.h>
37 #include <sys/kmem.h>
38 #include <sys/sysmacros.h>
39 #include <sys/vfs.h>
40 #include <sys/vnode.h>
41 #include <sys/debug.h>
42 #include <sys/errno.h>
43 #include <sys/time.h>
44 #include <sys/file.h>
45 #include <sys/user.h>
46 #include <sys/stream.h>
47 #include <sys/strsubr.h>
48 #include <sys/esunddi.h>
49 #include <sys/flock.h>
50 #include <sys/modctl.h>
51 #include <sys/vtrace.h>
52 #include <sys/strsun.h>
53 #include <sys/cmn_err.h>
54 #include <sys/proc.h>
55 #include <sys/ddi.h>
56 #include <sys/kmem_impl.h>
57 
58 #include <sys/suntpi.h>
59 #include <sys/socket.h>
60 #include <sys/sockio.h>
61 #include <sys/socketvar.h>
62 #include <netinet/in.h>
63 
64 #include <sys/tiuser.h>
65 #define	_SUN_TPI_VERSION	2
66 #include <sys/tihdr.h>
67 
68 #include <inet/kssl/ksslapi.h>
69 
70 #include <c2/audit.h>
71 
/*
 * Default socket version; initialised to SOV_SOCKSTREAM.
 * NOTE(review): presumably consulted when a sonode is created - the
 * consumer is not in this part of the file, confirm against callers.
 */
int so_default_version = SOV_SOCKSTREAM;

#ifdef DEBUG
/* Set sockdebug to print debug messages when SO_DEBUG is set */
int sockdebug = 0;

/* Set sockprinterr to print error messages when SO_DEBUG is set */
int sockprinterr = 0;

/*
 * Set so_default_options to SO_DEBUG if all sockets should be created
 * with SO_DEBUG set. This is needed to get debug printouts from the
 * socket() call itself.
 */
int so_default_options = 0;
#endif /* DEBUG */

#ifdef SOCK_TEST
/*
 * Set to number of ticks to limit cv_waits for code coverage testing.
 * Set to 1000 when SO_DEBUG is set to 2.
 * A value of zero leaves untimed waits untimed (see sowaitack()).
 */
clock_t sock_test_timelimit = 0;
#endif /* SOCK_TEST */

/*
 * For concurrency testing of e.g. opening /dev/ip which does not
 * handle T_INFO_REQ messages.
 * When nonzero, do_tinfo() and do_tcapability() skip the info request
 * and set so_addr_size to zero instead.
 */
int so_no_tinfo = 0;

/*
 * Timeout for getting a T_CAPABILITY_ACK - it is possible for a provider
 * to simply ignore the T_CAPABILITY_REQ.
 * The value is multiplied by hz in do_tcapability() before being used.
 */
clock_t	sock_capability_timeout	= 2;	/* seconds */

/* Forward declarations of file-local helpers. */
static int	do_tcapability(struct sonode *so, t_uscalar_t cap_bits1);
static void	so_removehooks(struct sonode *so);

/*
 * Read-side hook functions handed to the stream head by
 * so_installhooks().
 */
static mblk_t *strsock_proto(vnode_t *vp, mblk_t *mp,
		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
		strsigset_t *allmsgsigs, strpollset_t *pollwakeups);
static mblk_t *strsock_misc(vnode_t *vp, mblk_t *mp,
		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
		strsigset_t *allmsgsigs, strpollset_t *pollwakeups);

/* Maps a TLI error to an errno value; used by sowaitprim(). */
static int tlitosyserr(int terr);
120 
121 /*
122  * Convert a socket to a stream. Invoked when the illusory sockmod
123  * is popped from the stream.
124  * Change the stream head back to default operation without losing
125  * any messages (T_conn_ind's are moved to the stream head queue).
126  */
127 int
128 so_sock2stream(struct sonode *so)
129 {
130 	struct vnode		*vp = SOTOV(so);
131 	queue_t			*rq;
132 	mblk_t			*mp;
133 	int			error = 0;
134 
135 	ASSERT(MUTEX_HELD(&so->so_plumb_lock));
136 
137 	mutex_enter(&so->so_lock);
138 	so_lock_single(so);
139 
140 	ASSERT(so->so_version != SOV_STREAM);
141 
142 	if (so->so_state & SS_DIRECT) {
143 		mblk_t **mpp;
144 		int rval;
145 
146 		/*
147 		 * Tell the transport below that sockmod is being popped
148 		 */
149 		mutex_exit(&so->so_lock);
150 		error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K, CRED(),
151 		    &rval);
152 		mutex_enter(&so->so_lock);
153 		if (error != 0) {
154 			dprintso(so, 0, ("so_sock2stream(%p): "
155 			    "_SIOCSOCKFALLBACK failed\n", so));
156 			goto exit;
157 		}
158 		so->so_state &= ~SS_DIRECT;
159 
160 		for (mpp = &so->so_conn_ind_head; (mp = *mpp) != NULL;
161 		    mpp = &mp->b_next) {
162 			struct T_conn_ind	*conn_ind;
163 
164 			/*
165 			 * strsock_proto() has already verified the length of
166 			 * this message block.
167 			 */
168 			ASSERT(MBLKL(mp) >= sizeof (struct T_conn_ind));
169 
170 			conn_ind = (struct T_conn_ind *)mp->b_rptr;
171 			if (conn_ind->OPT_length == 0 &&
172 			    conn_ind->OPT_offset == 0)
173 				continue;
174 
175 			if (DB_REF(mp) > 1) {
176 				mblk_t	*newmp;
177 				size_t	length;
178 				cred_t	*cr;
179 
180 				/*
181 				 * Copy the message block because it is used
182 				 * elsewhere, too.
183 				 */
184 				length = MBLKL(mp);
185 				newmp = soallocproto(length, _ALLOC_INTR);
186 				if (newmp == NULL) {
187 					error = EINTR;
188 					goto exit;
189 				}
190 				bcopy(mp->b_rptr, newmp->b_wptr, length);
191 				newmp->b_wptr += length;
192 				newmp->b_next = mp->b_next;
193 				cr = DB_CRED(mp);
194 				if (cr != NULL)
195 					mblk_setcred(newmp, cr);
196 				DB_CPID(newmp) = DB_CPID(mp);
197 
198 				/*
199 				 * Link the new message block into the queue
200 				 * and free the old one.
201 				 */
202 				*mpp = newmp;
203 				mp->b_next = NULL;
204 				freemsg(mp);
205 
206 				mp = newmp;
207 				conn_ind = (struct T_conn_ind *)mp->b_rptr;
208 			}
209 
210 			/*
211 			 * Remove options added by TCP for accept fast-path.
212 			 */
213 			conn_ind->OPT_length = 0;
214 			conn_ind->OPT_offset = 0;
215 		}
216 	}
217 
218 	so->so_version = SOV_STREAM;
219 	so->so_priv = NULL;
220 
221 	/*
222 	 * Remove the hooks in the stream head to avoid queuing more
223 	 * packets in sockfs.
224 	 */
225 	mutex_exit(&so->so_lock);
226 	so_removehooks(so);
227 	mutex_enter(&so->so_lock);
228 
229 	/*
230 	 * Clear any state related to urgent data. Leave any T_EXDATA_IND
231 	 * on the queue - the behavior of urgent data after a switch is
232 	 * left undefined.
233 	 */
234 	so->so_error = so->so_delayed_error = 0;
235 	freemsg(so->so_oobmsg);
236 	so->so_oobmsg = NULL;
237 	so->so_oobsigcnt = so->so_oobcnt = 0;
238 
239 	so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|
240 	    SS_HASCONNIND|SS_SAVEDEOR);
241 	ASSERT(so_verify_oobstate(so));
242 
243 	freemsg(so->so_ack_mp);
244 	so->so_ack_mp = NULL;
245 
246 	/*
247 	 * Flush the T_DISCON_IND on so_discon_ind_mp.
248 	 */
249 	so_flush_discon_ind(so);
250 
251 	/*
252 	 * Move any queued T_CONN_IND messages to stream head queue.
253 	 */
254 	rq = RD(strvp2wq(vp));
255 	while ((mp = so->so_conn_ind_head) != NULL) {
256 		so->so_conn_ind_head = mp->b_next;
257 		mp->b_next = NULL;
258 		if (so->so_conn_ind_head == NULL) {
259 			ASSERT(so->so_conn_ind_tail == mp);
260 			so->so_conn_ind_tail = NULL;
261 		}
262 		dprintso(so, 0,
263 		    ("so_sock2stream(%p): moving T_CONN_IND\n",
264 		    so));
265 
266 		/* Drop lock across put() */
267 		mutex_exit(&so->so_lock);
268 		put(rq, mp);
269 		mutex_enter(&so->so_lock);
270 	}
271 
272 exit:
273 	ASSERT(MUTEX_HELD(&so->so_lock));
274 	so_unlock_single(so, SOLOCKED);
275 	mutex_exit(&so->so_lock);
276 	return (error);
277 }
278 
279 /*
280  * Covert a stream back to a socket. This is invoked when the illusory
281  * sockmod is pushed on a stream (where the stream was "created" by
282  * popping the illusory sockmod).
283  * This routine can not recreate the socket state (certain aspects of
284  * it like urgent data state and the bound/connected addresses for AF_UNIX
285  * sockets can not be recreated by asking the transport for information).
286  * Thus this routine implicitly assumes that the socket is in an initial
287  * state (as if it was just created). It flushes any messages queued on the
288  * read queue to avoid dealing with e.g. TPI acks or T_exdata_ind messages.
289  */
290 void
291 so_stream2sock(struct sonode *so)
292 {
293 	struct vnode *vp = SOTOV(so);
294 
295 	ASSERT(MUTEX_HELD(&so->so_plumb_lock));
296 
297 	mutex_enter(&so->so_lock);
298 	so_lock_single(so);
299 	ASSERT(so->so_version == SOV_STREAM);
300 	so->so_version = SOV_SOCKSTREAM;
301 	so->so_pushcnt = 0;
302 	mutex_exit(&so->so_lock);
303 
304 	/*
305 	 * Set a permenent error to force any thread in sorecvmsg to
306 	 * return (and drop SOREADLOCKED). Clear the error once
307 	 * we have SOREADLOCKED.
308 	 * This makes a read sleeping during the I_PUSH of sockmod return
309 	 * EIO.
310 	 */
311 	strsetrerror(SOTOV(so), EIO, 1, NULL);
312 
313 	/*
314 	 * Get the read lock before flushing data to avoid
315 	 * problems with the T_EXDATA_IND MSG_PEEK code in sorecvmsg.
316 	 */
317 	mutex_enter(&so->so_lock);
318 	(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
319 	mutex_exit(&so->so_lock);
320 
321 	strsetrerror(SOTOV(so), 0, 0, NULL);
322 	so_installhooks(so);
323 
324 	/*
325 	 * Flush everything on the read queue.
326 	 * This ensures that no T_CONN_IND remain and that no T_EXDATA_IND
327 	 * remain; those types of messages would confuse sockfs.
328 	 */
329 	strflushrq(vp, FLUSHALL);
330 	mutex_enter(&so->so_lock);
331 
332 	/*
333 	 * Flush the T_DISCON_IND on so_discon_ind_mp.
334 	 */
335 	so_flush_discon_ind(so);
336 	so_unlock_read(so);	/* Clear SOREADLOCKED */
337 
338 	so_unlock_single(so, SOLOCKED);
339 	mutex_exit(&so->so_lock);
340 }
341 
342 /*
343  * Install the hooks in the stream head.
344  */
345 void
346 so_installhooks(struct sonode *so)
347 {
348 	struct vnode *vp = SOTOV(so);
349 
350 	strsetrputhooks(vp, SH_SIGALLDATA | SH_IGN_ZEROLEN | SH_CONSOL_DATA,
351 	    strsock_proto, strsock_misc);
352 	strsetwputhooks(vp, SH_SIGPIPE | SH_RECHECK_ERR, 0);
353 }
354 
355 /*
356  * Remove the hooks in the stream head.
357  */
358 static void
359 so_removehooks(struct sonode *so)
360 {
361 	struct vnode *vp = SOTOV(so);
362 
363 	strsetrputhooks(vp, 0, NULL, NULL);
364 	strsetwputhooks(vp, 0, STRTIMOUT);
365 	/*
366 	 * Leave read behavior as it would have been for a normal
367 	 * stream i.e. a read of an M_PROTO will fail.
368 	 */
369 }
370 
371 /*
372  * Initialize the streams side of a socket including
373  * T_info_req/ack processing. If tso is not NULL its values are used thereby
374  * avoiding the T_INFO_REQ.
375  */
376 int
377 so_strinit(struct sonode *so, struct sonode *tso)
378 {
379 	struct vnode *vp = SOTOV(so);
380 	struct stdata *stp;
381 	mblk_t *mp;
382 	int error;
383 
384 	dprintso(so, 1, ("so_strinit(%p)\n", so));
385 
386 	/* Preallocate an unbind_req message */
387 	mp = soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP);
388 	mutex_enter(&so->so_lock);
389 	so->so_unbind_mp = mp;
390 #ifdef DEBUG
391 	so->so_options = so_default_options;
392 #endif /* DEBUG */
393 	mutex_exit(&so->so_lock);
394 
395 	so_installhooks(so);
396 
397 	/*
398 	 * The T_CAPABILITY_REQ should be the first message sent down because
399 	 * at least TCP has a fast-path for this which avoids timeouts while
400 	 * waiting for the T_CAPABILITY_ACK under high system load.
401 	 */
402 	if (tso == NULL) {
403 		error = do_tcapability(so, TC1_ACCEPTOR_ID | TC1_INFO);
404 		if (error)
405 			return (error);
406 	} else {
407 		mutex_enter(&so->so_lock);
408 		so->so_tsdu_size = tso->so_tsdu_size;
409 		so->so_etsdu_size = tso->so_etsdu_size;
410 		so->so_addr_size = tso->so_addr_size;
411 		so->so_opt_size = tso->so_opt_size;
412 		so->so_tidu_size = tso->so_tidu_size;
413 		so->so_serv_type = tso->so_serv_type;
414 		so->so_mode = tso->so_mode & ~SM_ACCEPTOR_ID;
415 		mutex_exit(&so->so_lock);
416 
417 		/* the following do_tcapability may update so->so_mode */
418 		if ((tso->so_serv_type != T_CLTS) &&
419 		    !(tso->so_state & SS_DIRECT)) {
420 			error = do_tcapability(so, TC1_ACCEPTOR_ID);
421 			if (error)
422 				return (error);
423 		}
424 	}
425 	/*
426 	 * If the addr_size is 0 we treat it as already bound
427 	 * and connected. This is used by the routing socket.
428 	 * We set the addr_size to something to allocate a the address
429 	 * structures.
430 	 */
431 	if (so->so_addr_size == 0) {
432 		so->so_state |= SS_ISBOUND | SS_ISCONNECTED;
433 		/* Address size can vary with address families. */
434 		if (so->so_family == AF_INET6)
435 			so->so_addr_size =
436 			    (t_scalar_t)sizeof (struct sockaddr_in6);
437 		else
438 			so->so_addr_size =
439 			    (t_scalar_t)sizeof (struct sockaddr_in);
440 		ASSERT(so->so_unbind_mp);
441 	}
442 	/*
443 	 * Allocate the addresses.
444 	 */
445 	ASSERT(so->so_laddr_sa == NULL && so->so_faddr_sa == NULL);
446 	ASSERT(so->so_laddr_len == 0 && so->so_faddr_len == 0);
447 	so->so_laddr_maxlen = so->so_faddr_maxlen =
448 	    P2ROUNDUP(so->so_addr_size, KMEM_ALIGN);
449 	so->so_laddr_sa = kmem_alloc(so->so_laddr_maxlen * 2, KM_SLEEP);
450 	so->so_faddr_sa = (struct sockaddr *)((caddr_t)so->so_laddr_sa
451 	    + so->so_laddr_maxlen);
452 
453 	if (so->so_family == AF_UNIX) {
454 		/*
455 		 * Initialize AF_UNIX related fields.
456 		 */
457 		bzero(&so->so_ux_laddr, sizeof (so->so_ux_laddr));
458 		bzero(&so->so_ux_faddr, sizeof (so->so_ux_faddr));
459 	}
460 
461 	stp = vp->v_stream;
462 	/*
463 	 * Have to keep minpsz at zero in order to allow write/send of zero
464 	 * bytes.
465 	 */
466 	mutex_enter(&stp->sd_lock);
467 	if (stp->sd_qn_minpsz == 1)
468 		stp->sd_qn_minpsz = 0;
469 	mutex_exit(&stp->sd_lock);
470 
471 	return (0);
472 }
473 
474 static void
475 copy_tinfo(struct sonode *so, struct T_info_ack *tia)
476 {
477 	so->so_tsdu_size = tia->TSDU_size;
478 	so->so_etsdu_size = tia->ETSDU_size;
479 	so->so_addr_size = tia->ADDR_size;
480 	so->so_opt_size = tia->OPT_size;
481 	so->so_tidu_size = tia->TIDU_size;
482 	so->so_serv_type = tia->SERV_type;
483 	switch (tia->CURRENT_state) {
484 	case TS_UNBND:
485 		break;
486 	case TS_IDLE:
487 		so->so_state |= SS_ISBOUND;
488 		so->so_laddr_len = 0;
489 		so->so_state &= ~SS_LADDR_VALID;
490 		break;
491 	case TS_DATA_XFER:
492 		so->so_state |= SS_ISBOUND|SS_ISCONNECTED;
493 		so->so_laddr_len = 0;
494 		so->so_faddr_len = 0;
495 		so->so_state &= ~(SS_LADDR_VALID | SS_FADDR_VALID);
496 		break;
497 	}
498 
499 	/*
500 	 * Heuristics for determining the socket mode flags
501 	 * (SM_ATOMIC, SM_CONNREQUIRED, SM_ADDR, SM_FDPASSING,
502 	 * and SM_EXDATA, SM_OPTDATA, and SM_BYTESTREAM)
503 	 * from the info ack.
504 	 */
505 	if (so->so_serv_type == T_CLTS) {
506 		so->so_mode |= SM_ATOMIC | SM_ADDR;
507 	} else {
508 		so->so_mode |= SM_CONNREQUIRED;
509 		if (so->so_etsdu_size != 0 && so->so_etsdu_size != -2)
510 			so->so_mode |= SM_EXDATA;
511 	}
512 	if (so->so_type == SOCK_SEQPACKET || so->so_type == SOCK_RAW) {
513 		/* Semantics are to discard tail end of messages */
514 		so->so_mode |= SM_ATOMIC;
515 	}
516 	if (so->so_family == AF_UNIX) {
517 		so->so_mode |= SM_FDPASSING | SM_OPTDATA;
518 		if (so->so_addr_size == -1) {
519 			/* MAXPATHLEN + soun_family + nul termination */
520 			so->so_addr_size = (t_scalar_t)(MAXPATHLEN +
521 			    sizeof (short) + 1);
522 		}
523 		if (so->so_type == SOCK_STREAM) {
524 			/*
525 			 * Make it into a byte-stream transport.
526 			 * SOCK_SEQPACKET sockets are unchanged.
527 			 */
528 			so->so_tsdu_size = 0;
529 		}
530 	} else if (so->so_addr_size == -1) {
531 		/*
532 		 * Logic extracted from sockmod - have to pick some max address
533 		 * length in order to preallocate the addresses.
534 		 */
535 		so->so_addr_size = SOA_DEFSIZE;
536 	}
537 	if (so->so_tsdu_size == 0)
538 		so->so_mode |= SM_BYTESTREAM;
539 }
540 
541 static int
542 check_tinfo(struct sonode *so)
543 {
544 	/* Consistency checks */
545 	if (so->so_type == SOCK_DGRAM && so->so_serv_type != T_CLTS) {
546 		eprintso(so, ("service type and socket type mismatch\n"));
547 		eprintsoline(so, EPROTO);
548 		return (EPROTO);
549 	}
550 	if (so->so_type == SOCK_STREAM && so->so_serv_type == T_CLTS) {
551 		eprintso(so, ("service type and socket type mismatch\n"));
552 		eprintsoline(so, EPROTO);
553 		return (EPROTO);
554 	}
555 	if (so->so_type == SOCK_SEQPACKET && so->so_serv_type == T_CLTS) {
556 		eprintso(so, ("service type and socket type mismatch\n"));
557 		eprintsoline(so, EPROTO);
558 		return (EPROTO);
559 	}
560 	if (so->so_family == AF_INET &&
561 	    so->so_addr_size != (t_scalar_t)sizeof (struct sockaddr_in)) {
562 		eprintso(so,
563 		    ("AF_INET must have sockaddr_in address length. Got %d\n",
564 		    so->so_addr_size));
565 		eprintsoline(so, EMSGSIZE);
566 		return (EMSGSIZE);
567 	}
568 	if (so->so_family == AF_INET6 &&
569 	    so->so_addr_size != (t_scalar_t)sizeof (struct sockaddr_in6)) {
570 		eprintso(so,
571 		    ("AF_INET6 must have sockaddr_in6 address length. Got %d\n",
572 		    so->so_addr_size));
573 		eprintsoline(so, EMSGSIZE);
574 		return (EMSGSIZE);
575 	}
576 
577 	dprintso(so, 1, (
578 	    "tinfo: serv %d tsdu %d, etsdu %d, addr %d, opt %d, tidu %d\n",
579 	    so->so_serv_type, so->so_tsdu_size, so->so_etsdu_size,
580 	    so->so_addr_size, so->so_opt_size,
581 	    so->so_tidu_size));
582 	dprintso(so, 1, ("tinfo: so_state %s\n",
583 	    pr_state(so->so_state, so->so_mode)));
584 	return (0);
585 }
586 
587 /*
588  * Send down T_info_req and wait for the ack.
589  * Record interesting T_info_ack values in the sonode.
590  */
591 static int
592 do_tinfo(struct sonode *so)
593 {
594 	struct T_info_req tir;
595 	mblk_t *mp;
596 	int error;
597 
598 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
599 
600 	if (so_no_tinfo) {
601 		so->so_addr_size = 0;
602 		return (0);
603 	}
604 
605 	dprintso(so, 1, ("do_tinfo(%p)\n", so));
606 
607 	/* Send T_INFO_REQ */
608 	tir.PRIM_type = T_INFO_REQ;
609 	mp = soallocproto1(&tir, sizeof (tir),
610 	    sizeof (struct T_info_req) + sizeof (struct T_info_ack),
611 	    _ALLOC_INTR);
612 	if (mp == NULL) {
613 		eprintsoline(so, ENOBUFS);
614 		return (ENOBUFS);
615 	}
616 	/* T_INFO_REQ has to be M_PCPROTO */
617 	DB_TYPE(mp) = M_PCPROTO;
618 
619 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
620 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
621 	if (error) {
622 		eprintsoline(so, error);
623 		return (error);
624 	}
625 	mutex_enter(&so->so_lock);
626 	/* Wait for T_INFO_ACK */
627 	if ((error = sowaitprim(so, T_INFO_REQ, T_INFO_ACK,
628 	    (t_uscalar_t)sizeof (struct T_info_ack), &mp, 0))) {
629 		mutex_exit(&so->so_lock);
630 		eprintsoline(so, error);
631 		return (error);
632 	}
633 
634 	ASSERT(mp);
635 	copy_tinfo(so, (struct T_info_ack *)mp->b_rptr);
636 	mutex_exit(&so->so_lock);
637 	freemsg(mp);
638 	return (check_tinfo(so));
639 }
640 
641 /*
642  * Send down T_capability_req and wait for the ack.
643  * Record interesting T_capability_ack values in the sonode.
644  */
645 static int
646 do_tcapability(struct sonode *so, t_uscalar_t cap_bits1)
647 {
648 	struct T_capability_req tcr;
649 	struct T_capability_ack *tca;
650 	mblk_t *mp;
651 	int error;
652 
653 	ASSERT(cap_bits1 != 0);
654 	ASSERT((cap_bits1 & ~(TC1_ACCEPTOR_ID | TC1_INFO)) == 0);
655 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
656 
657 	if (so->so_provinfo->tpi_capability == PI_NO)
658 		return (do_tinfo(so));
659 
660 	if (so_no_tinfo) {
661 		so->so_addr_size = 0;
662 		if ((cap_bits1 &= ~TC1_INFO) == 0)
663 			return (0);
664 	}
665 
666 	dprintso(so, 1, ("do_tcapability(%p)\n", so));
667 
668 	/* Send T_CAPABILITY_REQ */
669 	tcr.PRIM_type = T_CAPABILITY_REQ;
670 	tcr.CAP_bits1 = cap_bits1;
671 	mp = soallocproto1(&tcr, sizeof (tcr),
672 	    sizeof (struct T_capability_req) + sizeof (struct T_capability_ack),
673 	    _ALLOC_INTR);
674 	if (mp == NULL) {
675 		eprintsoline(so, ENOBUFS);
676 		return (ENOBUFS);
677 	}
678 	/* T_CAPABILITY_REQ should be M_PCPROTO here */
679 	DB_TYPE(mp) = M_PCPROTO;
680 
681 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
682 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
683 	if (error) {
684 		eprintsoline(so, error);
685 		return (error);
686 	}
687 	mutex_enter(&so->so_lock);
688 	/* Wait for T_CAPABILITY_ACK */
689 	if ((error = sowaitprim(so, T_CAPABILITY_REQ, T_CAPABILITY_ACK,
690 	    (t_uscalar_t)sizeof (*tca), &mp, sock_capability_timeout * hz))) {
691 		mutex_exit(&so->so_lock);
692 		PI_PROVLOCK(so->so_provinfo);
693 		if (so->so_provinfo->tpi_capability == PI_DONTKNOW)
694 			so->so_provinfo->tpi_capability = PI_NO;
695 		PI_PROVUNLOCK(so->so_provinfo);
696 		ASSERT((so->so_mode & SM_ACCEPTOR_ID) == 0);
697 		if (cap_bits1 & TC1_INFO) {
698 			/*
699 			 * If the T_CAPABILITY_REQ timed out and then a
700 			 * T_INFO_REQ gets a protocol error, most likely
701 			 * the capability was slow (vs. unsupported). Return
702 			 * ENOSR for this case as a best guess.
703 			 */
704 			if (error == ETIME) {
705 				return ((error = do_tinfo(so)) == EPROTO ?
706 				    ENOSR : error);
707 			}
708 			return (do_tinfo(so));
709 		}
710 		return (0);
711 	}
712 
713 	if (so->so_provinfo->tpi_capability == PI_DONTKNOW) {
714 		PI_PROVLOCK(so->so_provinfo);
715 		so->so_provinfo->tpi_capability = PI_YES;
716 		PI_PROVUNLOCK(so->so_provinfo);
717 	}
718 
719 	ASSERT(mp);
720 	tca = (struct T_capability_ack *)mp->b_rptr;
721 
722 	ASSERT((cap_bits1 & TC1_INFO) == (tca->CAP_bits1 & TC1_INFO));
723 
724 	cap_bits1 = tca->CAP_bits1;
725 
726 	if (cap_bits1 & TC1_ACCEPTOR_ID) {
727 		so->so_acceptor_id = tca->ACCEPTOR_id;
728 		so->so_mode |= SM_ACCEPTOR_ID;
729 	}
730 
731 	if (cap_bits1 & TC1_INFO)
732 		copy_tinfo(so, &tca->INFO_ack);
733 
734 	mutex_exit(&so->so_lock);
735 	freemsg(mp);
736 
737 	if (cap_bits1 & TC1_INFO)
738 		return (check_tinfo(so));
739 
740 	return (0);
741 }
742 
743 /*
744  * Retrieve and clear the socket error.
745  */
746 int
747 sogeterr(struct sonode *so)
748 {
749 	int error;
750 
751 	ASSERT(MUTEX_HELD(&so->so_lock));
752 
753 	error = so->so_error;
754 	so->so_error = 0;
755 
756 	return (error);
757 }
758 
759 /*
760  * This routine is registered with the stream head to retrieve read
761  * side errors.
762  * It does not clear the socket error for a peeking read side operation.
763  * It the error is to be cleared it sets *clearerr.
764  */
765 int
766 sogetrderr(vnode_t *vp, int ispeek, int *clearerr)
767 {
768 	struct sonode *so = VTOSO(vp);
769 	int error;
770 
771 	mutex_enter(&so->so_lock);
772 	if (ispeek) {
773 		error = so->so_error;
774 		*clearerr = 0;
775 	} else {
776 		error = so->so_error;
777 		so->so_error = 0;
778 		*clearerr = 1;
779 	}
780 	mutex_exit(&so->so_lock);
781 	return (error);
782 }
783 
784 /*
785  * This routine is registered with the stream head to retrieve write
786  * side errors.
787  * It does not clear the socket error for a peeking read side operation.
788  * It the error is to be cleared it sets *clearerr.
789  */
790 int
791 sogetwrerr(vnode_t *vp, int ispeek, int *clearerr)
792 {
793 	struct sonode *so = VTOSO(vp);
794 	int error;
795 
796 	mutex_enter(&so->so_lock);
797 	if (so->so_state & SS_CANTSENDMORE) {
798 		error = EPIPE;
799 		*clearerr = 0;
800 	} else {
801 		error = so->so_error;
802 		if (ispeek) {
803 			*clearerr = 0;
804 		} else {
805 			so->so_error = 0;
806 			*clearerr = 1;
807 		}
808 	}
809 	mutex_exit(&so->so_lock);
810 	return (error);
811 }
812 
813 /*
814  * Set a nonpersistent read and write error on the socket.
815  * Used when there is a T_uderror_ind for a connected socket.
816  * The caller also needs to call strsetrerror and strsetwerror
817  * after dropping the lock.
818  */
819 void
820 soseterror(struct sonode *so, int error)
821 {
822 	ASSERT(error != 0);
823 
824 	ASSERT(MUTEX_HELD(&so->so_lock));
825 	so->so_error = (ushort_t)error;
826 }
827 
828 void
829 soisconnecting(struct sonode *so)
830 {
831 	ASSERT(MUTEX_HELD(&so->so_lock));
832 	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
833 	so->so_state |= SS_ISCONNECTING;
834 	cv_broadcast(&so->so_state_cv);
835 }
836 
837 void
838 soisconnected(struct sonode *so)
839 {
840 	ASSERT(MUTEX_HELD(&so->so_lock));
841 	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING);
842 	so->so_state |= SS_ISCONNECTED;
843 	cv_broadcast(&so->so_state_cv);
844 }
845 
846 /*
847  * The caller also needs to call strsetrerror, strsetwerror and strseteof.
848  */
849 void
850 soisdisconnected(struct sonode *so, int error)
851 {
852 	ASSERT(MUTEX_HELD(&so->so_lock));
853 	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING|
854 	    SS_LADDR_VALID|SS_FADDR_VALID);
855 	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE);
856 	so->so_error = (ushort_t)error;
857 	if (so->so_peercred != NULL) {
858 		crfree(so->so_peercred);
859 		so->so_peercred = NULL;
860 	}
861 	cv_broadcast(&so->so_state_cv);
862 }
863 
864 /*
865  * For connected AF_UNIX SOCK_DGRAM sockets when the peer closes.
866  * Does not affect write side.
867  * The caller also has to call strsetrerror.
868  */
869 static void
870 sobreakconn(struct sonode *so, int error)
871 {
872 	ASSERT(MUTEX_HELD(&so->so_lock));
873 	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
874 	so->so_error = (ushort_t)error;
875 	cv_broadcast(&so->so_state_cv);
876 }
877 
878 /*
879  * Can no longer send.
880  * Caller must also call strsetwerror.
881  *
882  * We mark the peer address as no longer valid for getpeername, but
883  * leave it around for so_unix_close to notify the peer (that
884  * transport has no addressing held at that layer).
885  */
886 void
887 socantsendmore(struct sonode *so)
888 {
889 	ASSERT(MUTEX_HELD(&so->so_lock));
890 	so->so_state = so->so_state & ~SS_FADDR_VALID | SS_CANTSENDMORE;
891 	cv_broadcast(&so->so_state_cv);
892 }
893 
894 /*
895  * The caller must call strseteof(,1) as well as this routine
896  * to change the socket state.
897  */
898 void
899 socantrcvmore(struct sonode *so)
900 {
901 	ASSERT(MUTEX_HELD(&so->so_lock));
902 	so->so_state |= SS_CANTRCVMORE;
903 	cv_broadcast(&so->so_state_cv);
904 }
905 
906 /*
907  * The caller has sent down a "request_prim" primitive and wants to wait for
908  * an ack ("ack_prim") or an T_ERROR_ACK for it.
909  * The specified "ack_prim" can be a T_OK_ACK.
910  *
911  * Assumes that all the TPI acks are M_PCPROTO messages.
912  *
913  * Note that the socket is single-threaded (using so_lock_single)
914  * for all operations that generate TPI ack messages. Since
915  * only TPI ack messages are M_PCPROTO we should never receive
916  * anything except either the ack we are expecting or a T_ERROR_ACK
917  * for the same primitive.
918  */
919 int
920 sowaitprim(struct sonode *so, t_scalar_t request_prim, t_scalar_t ack_prim,
921 	    t_uscalar_t min_size, mblk_t **mpp, clock_t wait)
922 {
923 	mblk_t *mp;
924 	union T_primitives *tpr;
925 	int error;
926 
927 	dprintso(so, 1, ("sowaitprim(%p, %d, %d, %d, %p, %lu)\n",
928 	    so, request_prim, ack_prim, min_size, mpp, wait));
929 
930 	ASSERT(MUTEX_HELD(&so->so_lock));
931 
932 	error = sowaitack(so, &mp, wait);
933 	if (error)
934 		return (error);
935 
936 	dprintso(so, 1, ("got msg %p\n", mp));
937 	if (DB_TYPE(mp) != M_PCPROTO ||
938 	    MBLKL(mp) < sizeof (tpr->type)) {
939 		freemsg(mp);
940 		eprintsoline(so, EPROTO);
941 		return (EPROTO);
942 	}
943 	tpr = (union T_primitives *)mp->b_rptr;
944 	/*
945 	 * Did we get the primitive that we were asking for?
946 	 * For T_OK_ACK we also check that it matches the request primitive.
947 	 */
948 	if (tpr->type == ack_prim &&
949 	    (ack_prim != T_OK_ACK ||
950 	    tpr->ok_ack.CORRECT_prim == request_prim)) {
951 		if (MBLKL(mp) >= (ssize_t)min_size) {
952 			/* Found what we are looking for */
953 			*mpp = mp;
954 			return (0);
955 		}
956 		/* Too short */
957 		freemsg(mp);
958 		eprintsoline(so, EPROTO);
959 		return (EPROTO);
960 	}
961 
962 	if (tpr->type == T_ERROR_ACK &&
963 	    tpr->error_ack.ERROR_prim == request_prim) {
964 		/* Error to the primitive we were looking for */
965 		if (tpr->error_ack.TLI_error == TSYSERR) {
966 			error = tpr->error_ack.UNIX_error;
967 		} else {
968 			error = tlitosyserr(tpr->error_ack.TLI_error);
969 		}
970 		dprintso(so, 0, ("error_ack for %d: %d/%d ->%d\n",
971 		    tpr->error_ack.ERROR_prim,
972 		    tpr->error_ack.TLI_error,
973 		    tpr->error_ack.UNIX_error,
974 		    error));
975 		freemsg(mp);
976 		return (error);
977 	}
978 	/*
979 	 * Wrong primitive or T_ERROR_ACK for the wrong primitive
980 	 */
981 #ifdef DEBUG
982 	if (tpr->type == T_ERROR_ACK) {
983 		dprintso(so, 0, ("error_ack for %d: %d/%d\n",
984 		    tpr->error_ack.ERROR_prim,
985 		    tpr->error_ack.TLI_error,
986 		    tpr->error_ack.UNIX_error));
987 	} else if (tpr->type == T_OK_ACK) {
988 		dprintso(so, 0, ("ok_ack for %d, expected %d for %d\n",
989 		    tpr->ok_ack.CORRECT_prim,
990 		    ack_prim, request_prim));
991 	} else {
992 		dprintso(so, 0,
993 		    ("unexpected primitive %d, expected %d for %d\n",
994 		    tpr->type, ack_prim, request_prim));
995 	}
996 #endif /* DEBUG */
997 
998 	freemsg(mp);
999 	eprintsoline(so, EPROTO);
1000 	return (EPROTO);
1001 }
1002 
1003 /*
1004  * Wait for a T_OK_ACK for the specified primitive.
1005  */
1006 int
1007 sowaitokack(struct sonode *so, t_scalar_t request_prim)
1008 {
1009 	mblk_t *mp;
1010 	int error;
1011 
1012 	error = sowaitprim(so, request_prim, T_OK_ACK,
1013 	    (t_uscalar_t)sizeof (struct T_ok_ack), &mp, 0);
1014 	if (error)
1015 		return (error);
1016 	freemsg(mp);
1017 	return (0);
1018 }
1019 
1020 /*
1021  * Queue a received TPI ack message on so_ack_mp.
1022  */
1023 void
1024 soqueueack(struct sonode *so, mblk_t *mp)
1025 {
1026 	if (DB_TYPE(mp) != M_PCPROTO) {
1027 		zcmn_err(getzoneid(), CE_WARN,
1028 		    "sockfs: received unexpected M_PROTO TPI ack. Prim %d\n",
1029 		    *(t_scalar_t *)mp->b_rptr);
1030 		freemsg(mp);
1031 		return;
1032 	}
1033 
1034 	mutex_enter(&so->so_lock);
1035 	if (so->so_ack_mp != NULL) {
1036 		dprintso(so, 1, ("so_ack_mp already set\n"));
1037 		freemsg(so->so_ack_mp);
1038 		so->so_ack_mp = NULL;
1039 	}
1040 	so->so_ack_mp = mp;
1041 	cv_broadcast(&so->so_ack_cv);
1042 	mutex_exit(&so->so_lock);
1043 }
1044 
1045 /*
1046  * Wait for a TPI ack ignoring signals and errors.
1047  */
1048 int
1049 sowaitack(struct sonode *so, mblk_t **mpp, clock_t wait)
1050 {
1051 	ASSERT(MUTEX_HELD(&so->so_lock));
1052 
1053 	while (so->so_ack_mp == NULL) {
1054 #ifdef SOCK_TEST
1055 		if (wait == 0 && sock_test_timelimit != 0)
1056 			wait = sock_test_timelimit;
1057 #endif
1058 		if (wait != 0) {
1059 			/*
1060 			 * Only wait for the time limit.
1061 			 */
1062 			clock_t now;
1063 
1064 			time_to_wait(&now, wait);
1065 			if (cv_timedwait(&so->so_ack_cv, &so->so_lock,
1066 			    now) == -1) {
1067 				eprintsoline(so, ETIME);
1068 				return (ETIME);
1069 			}
1070 		}
1071 		else
1072 			cv_wait(&so->so_ack_cv, &so->so_lock);
1073 	}
1074 	*mpp = so->so_ack_mp;
1075 #ifdef DEBUG
1076 	{
1077 		union T_primitives *tpr;
1078 		mblk_t *mp = *mpp;
1079 
1080 		tpr = (union T_primitives *)mp->b_rptr;
1081 		ASSERT(DB_TYPE(mp) == M_PCPROTO);
1082 		ASSERT(tpr->type == T_OK_ACK ||
1083 		    tpr->type == T_ERROR_ACK ||
1084 		    tpr->type == T_BIND_ACK ||
1085 		    tpr->type == T_CAPABILITY_ACK ||
1086 		    tpr->type == T_INFO_ACK ||
1087 		    tpr->type == T_OPTMGMT_ACK);
1088 	}
1089 #endif /* DEBUG */
1090 	so->so_ack_mp = NULL;
1091 	return (0);
1092 }
1093 
1094 /*
1095  * Queue a received T_CONN_IND message on so_conn_ind_head/tail.
1096  */
1097 void
1098 soqueueconnind(struct sonode *so, mblk_t *mp)
1099 {
1100 	if (DB_TYPE(mp) != M_PROTO) {
1101 		zcmn_err(getzoneid(), CE_WARN,
1102 		    "sockfs: received unexpected M_PCPROTO T_CONN_IND\n");
1103 		freemsg(mp);
1104 		return;
1105 	}
1106 
1107 	mutex_enter(&so->so_lock);
1108 	ASSERT(mp->b_next == NULL);
1109 	if (so->so_conn_ind_head == NULL) {
1110 		so->so_conn_ind_head = mp;
1111 		so->so_state |= SS_HASCONNIND;
1112 	} else {
1113 		ASSERT(so->so_state & SS_HASCONNIND);
1114 		ASSERT(so->so_conn_ind_tail->b_next == NULL);
1115 		so->so_conn_ind_tail->b_next = mp;
1116 	}
1117 	so->so_conn_ind_tail = mp;
1118 	/* Wakeup a single consumer of the T_CONN_IND */
1119 	cv_signal(&so->so_connind_cv);
1120 	mutex_exit(&so->so_lock);
1121 }
1122 
1123 /*
1124  * Wait for a T_CONN_IND.
1125  * Don't wait if nonblocking.
1126  * Accept signals and socket errors.
1127  */
1128 int
1129 sowaitconnind(struct sonode *so, int fmode, mblk_t **mpp)
1130 {
1131 	mblk_t *mp;
1132 	int error = 0;
1133 
1134 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1135 	mutex_enter(&so->so_lock);
1136 check_error:
1137 	if (so->so_error) {
1138 		error = sogeterr(so);
1139 		if (error) {
1140 			mutex_exit(&so->so_lock);
1141 			return (error);
1142 		}
1143 	}
1144 
1145 	if (so->so_conn_ind_head == NULL) {
1146 		if (fmode & (FNDELAY|FNONBLOCK)) {
1147 			error = EWOULDBLOCK;
1148 			goto done;
1149 		}
1150 		if (!cv_wait_sig_swap(&so->so_connind_cv, &so->so_lock)) {
1151 			error = EINTR;
1152 			goto done;
1153 		}
1154 		goto check_error;
1155 	}
1156 	mp = so->so_conn_ind_head;
1157 	so->so_conn_ind_head = mp->b_next;
1158 	mp->b_next = NULL;
1159 	if (so->so_conn_ind_head == NULL) {
1160 		ASSERT(so->so_conn_ind_tail == mp);
1161 		so->so_conn_ind_tail = NULL;
1162 		so->so_state &= ~SS_HASCONNIND;
1163 	}
1164 	*mpp = mp;
1165 done:
1166 	mutex_exit(&so->so_lock);
1167 	return (error);
1168 }
1169 
1170 /*
1171  * Flush a T_CONN_IND matching the sequence number from the list.
1172  * Return zero if found; non-zero otherwise.
1173  * This is called very infrequently thus it is ok to do a linear search.
1174  */
1175 int
1176 soflushconnind(struct sonode *so, t_scalar_t seqno)
1177 {
1178 	mblk_t *prevmp, *mp;
1179 	struct T_conn_ind *tci;
1180 
1181 	mutex_enter(&so->so_lock);
1182 	for (prevmp = NULL, mp = so->so_conn_ind_head; mp != NULL;
1183 	    prevmp = mp, mp = mp->b_next) {
1184 		tci = (struct T_conn_ind *)mp->b_rptr;
1185 		if (tci->SEQ_number == seqno) {
1186 			dprintso(so, 1,
1187 			    ("t_discon_ind: found T_CONN_IND %d\n", seqno));
1188 			/* Deleting last? */
1189 			if (so->so_conn_ind_tail == mp) {
1190 				so->so_conn_ind_tail = prevmp;
1191 			}
1192 			if (prevmp == NULL) {
1193 				/* Deleting first */
1194 				so->so_conn_ind_head = mp->b_next;
1195 			} else {
1196 				prevmp->b_next = mp->b_next;
1197 			}
1198 			mp->b_next = NULL;
1199 			if (so->so_conn_ind_head == NULL) {
1200 				ASSERT(so->so_conn_ind_tail == NULL);
1201 				so->so_state &= ~SS_HASCONNIND;
1202 			} else {
1203 				ASSERT(so->so_conn_ind_tail != NULL);
1204 			}
1205 			so->so_error = ECONNABORTED;
1206 			mutex_exit(&so->so_lock);
1207 
1208 			/*
1209 			 * T_KSSL_PROXY_CONN_IND may carry a handle for
1210 			 * an SSL context, and needs to be released.
1211 			 */
1212 			if ((tci->PRIM_type == T_SSL_PROXY_CONN_IND) &&
1213 			    (mp->b_cont != NULL)) {
1214 				kssl_ctx_t kssl_ctx;
1215 
1216 				ASSERT(MBLKL(mp->b_cont) ==
1217 				    sizeof (kssl_ctx_t));
1218 				kssl_ctx = *((kssl_ctx_t *)mp->b_cont->b_rptr);
1219 				kssl_release_ctx(kssl_ctx);
1220 			}
1221 			freemsg(mp);
1222 			return (0);
1223 		}
1224 	}
1225 	mutex_exit(&so->so_lock);
1226 	dprintso(so, 1,	("t_discon_ind: NOT found T_CONN_IND %d\n", seqno));
1227 	return (-1);
1228 }
1229 
/*
 * Block until the connection attempt has completed or an error has been
 * posted on the socket. The caller must hold so_lock.
 *
 * fmode carries the nonblocking flags: with FNDELAY or FNONBLOCK set the
 * function returns EINPROGRESS instead of sleeping. A non-zero nosig makes
 * the sleep uninterruptible; otherwise a signal ends the wait with EINTR.
 *
 * Returns 0 when the socket is connected, the value from sogeterr() when
 * so_error is set, and ECONNREFUSED when the socket ended up neither
 * connected nor with a pending error.
 */
int
sowaitconnected(struct sonode *so, int fmode, int nosig)
{
	int error;

	ASSERT(MUTEX_HELD(&so->so_lock));

	for (;;) {
		/* Done waiting unless we are connecting and not connected. */
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) !=
		    SS_ISCONNECTING)
			break;
		if (so->so_error != 0)
			break;

		dprintso(so, 1, ("waiting for SS_ISCONNECTED on %p\n", so));
		if (fmode & (FNDELAY|FNONBLOCK))
			return (EINPROGRESS);

		if (nosig) {
			cv_wait(&so->so_state_cv, &so->so_lock);
		} else if (!cv_wait_sig_swap(&so->so_state_cv, &so->so_lock)) {
			/*
			 * Interrupted. Hand back EINTR; the application can
			 * fall back on nonblocking techniques to find out
			 * when the connection has been established.
			 */
			return (EINTR);
		}
		dprintso(so, 1, ("awoken on %p\n", so));
	}

	if (so->so_error != 0) {
		error = sogeterr(so);
		ASSERT(error != 0);
		dprintso(so, 1, ("sowaitconnected: error %d\n", error));
		return (error);
	}

	if (so->so_state & SS_ISCONNECTED)
		return (0);

	/*
	 * Not connected, yet no error to report: a T_ORDREL_IND or a
	 * T_DISCON_IND with a zero errno could have arrived, or another
	 * thread (e.g. one calling read) could have consumed so_error.
	 */
	error = ECONNREFUSED;
	dprintso(so, 1, ("sowaitconnected: error %d\n", error));
	return (error);
}
1280 
1281 
1282 /*
1283  * Handle the signal generation aspect of urgent data.
1284  */
1285 static void
1286 so_oob_sig(struct sonode *so, int extrasig,
1287     strsigset_t *signals, strpollset_t *pollwakeups)
1288 {
1289 	ASSERT(MUTEX_HELD(&so->so_lock));
1290 
1291 	ASSERT(so_verify_oobstate(so));
1292 	ASSERT(so->so_oobsigcnt >= so->so_oobcnt);
1293 	if (so->so_oobsigcnt > so->so_oobcnt) {
1294 		/*
1295 		 * Signal has already been generated once for this
1296 		 * urgent "event". However, since TCP can receive updated
1297 		 * urgent pointers we still generate a signal.
1298 		 */
1299 		ASSERT(so->so_state & SS_OOBPEND);
1300 		if (extrasig) {
1301 			*signals |= S_RDBAND;
1302 			*pollwakeups |= POLLRDBAND;
1303 		}
1304 		return;
1305 	}
1306 
1307 	so->so_oobsigcnt++;
1308 	ASSERT(so->so_oobsigcnt > 0);	/* Wraparound */
1309 	ASSERT(so->so_oobsigcnt > so->so_oobcnt);
1310 
1311 	/*
1312 	 * Record (for select/poll) that urgent data is pending.
1313 	 */
1314 	so->so_state |= SS_OOBPEND;
1315 	/*
1316 	 * New urgent data on the way so forget about any old
1317 	 * urgent data.
1318 	 */
1319 	so->so_state &= ~(SS_HAVEOOBDATA|SS_HADOOBDATA);
1320 	if (so->so_oobmsg != NULL) {
1321 		dprintso(so, 1, ("sock: discarding old oob\n"));
1322 		freemsg(so->so_oobmsg);
1323 		so->so_oobmsg = NULL;
1324 	}
1325 	*signals |= S_RDBAND;
1326 	*pollwakeups |= POLLRDBAND;
1327 	ASSERT(so_verify_oobstate(so));
1328 }
1329 
1330 /*
1331  * Handle the processing of the T_EXDATA_IND with urgent data.
1332  * Returns the T_EXDATA_IND if it should be queued on the read queue.
1333  */
1334 /* ARGSUSED2 */
1335 static mblk_t *
1336 so_oob_exdata(struct sonode *so, mblk_t *mp,
1337 	strsigset_t *signals, strpollset_t *pollwakeups)
1338 {
1339 	ASSERT(MUTEX_HELD(&so->so_lock));
1340 
1341 	ASSERT(so_verify_oobstate(so));
1342 
1343 	ASSERT(so->so_oobsigcnt > so->so_oobcnt);
1344 
1345 	so->so_oobcnt++;
1346 	ASSERT(so->so_oobcnt > 0);	/* wraparound? */
1347 	ASSERT(so->so_oobsigcnt >= so->so_oobcnt);
1348 
1349 	/*
1350 	 * Set MSGMARK for SIOCATMARK.
1351 	 */
1352 	mp->b_flag |= MSGMARK;
1353 
1354 	ASSERT(so_verify_oobstate(so));
1355 	return (mp);
1356 }
1357 
1358 /*
1359  * Handle the processing of the actual urgent data.
1360  * Returns the data mblk if it should be queued on the read queue.
1361  */
1362 static mblk_t *
1363 so_oob_data(struct sonode *so, mblk_t *mp,
1364 	strsigset_t *signals, strpollset_t *pollwakeups)
1365 {
1366 	ASSERT(MUTEX_HELD(&so->so_lock));
1367 
1368 	ASSERT(so_verify_oobstate(so));
1369 
1370 	ASSERT(so->so_oobsigcnt >= so->so_oobcnt);
1371 	ASSERT(mp != NULL);
1372 	/*
1373 	 * For OOBINLINE we keep the data in the T_EXDATA_IND.
1374 	 * Otherwise we store it in so_oobmsg.
1375 	 */
1376 	ASSERT(so->so_oobmsg == NULL);
1377 	if (so->so_options & SO_OOBINLINE) {
1378 		*pollwakeups |= POLLIN | POLLRDNORM | POLLRDBAND;
1379 		*signals |= S_INPUT | S_RDNORM;
1380 	} else {
1381 		*pollwakeups |= POLLRDBAND;
1382 		so->so_state |= SS_HAVEOOBDATA;
1383 		so->so_oobmsg = mp;
1384 		mp = NULL;
1385 	}
1386 	ASSERT(so_verify_oobstate(so));
1387 	return (mp);
1388 }
1389 
1390 /*
1391  * Caller must hold the mutex.
1392  * For delayed processing, save the T_DISCON_IND received
1393  * from below on so_discon_ind_mp.
1394  * When the message is processed the framework will call:
1395  *      (*func)(so, mp);
1396  */
1397 static void
1398 so_save_discon_ind(struct sonode *so,
1399 	mblk_t *mp,
1400 	void (*func)(struct sonode *so, mblk_t *))
1401 {
1402 	ASSERT(MUTEX_HELD(&so->so_lock));
1403 
1404 	/*
1405 	 * Discard new T_DISCON_IND if we have already received another.
1406 	 * Currently the earlier message can either be on so_discon_ind_mp
1407 	 * or being processed.
1408 	 */
1409 	if (so->so_discon_ind_mp != NULL || (so->so_flag & SOASYNC_UNBIND)) {
1410 		zcmn_err(getzoneid(), CE_WARN,
1411 		    "sockfs: received unexpected additional T_DISCON_IND\n");
1412 		freemsg(mp);
1413 		return;
1414 	}
1415 	mp->b_prev = (mblk_t *)func;
1416 	mp->b_next = NULL;
1417 	so->so_discon_ind_mp = mp;
1418 }
1419 
1420 /*
1421  * Caller must hold the mutex and make sure that either SOLOCKED
1422  * or SOASYNC_UNBIND is set. Called from so_unlock_single().
1423  * Perform delayed processing of T_DISCON_IND message on so_discon_ind_mp.
1424  * Need to ensure that strsock_proto() will not end up sleeping for
1425  * SOASYNC_UNBIND, while executing this function.
1426  */
1427 void
1428 so_drain_discon_ind(struct sonode *so)
1429 {
1430 	mblk_t	*bp;
1431 	void (*func)(struct sonode *so, mblk_t *);
1432 
1433 	ASSERT(MUTEX_HELD(&so->so_lock));
1434 	ASSERT(so->so_flag & (SOLOCKED|SOASYNC_UNBIND));
1435 
1436 	/* Process T_DISCON_IND on so_discon_ind_mp */
1437 	if ((bp = so->so_discon_ind_mp) != NULL) {
1438 		so->so_discon_ind_mp = NULL;
1439 		func = (void (*)())bp->b_prev;
1440 		bp->b_prev = NULL;
1441 
1442 		/*
1443 		 * This (*func) is supposed to generate a message downstream
1444 		 * and we need to have a flag set until the corresponding
1445 		 * upstream message reaches stream head.
1446 		 * When processing T_DISCON_IND in strsock_discon_ind
1447 		 * we hold SOASYN_UNBIND when sending T_UNBIND_REQ down and
1448 		 * drop the flag after we get the ACK in strsock_proto.
1449 		 */
1450 		(void) (*func)(so, bp);
1451 	}
1452 }
1453 
1454 /*
1455  * Caller must hold the mutex.
1456  * Remove the T_DISCON_IND on so_discon_ind_mp.
1457  */
1458 void
1459 so_flush_discon_ind(struct sonode *so)
1460 {
1461 	mblk_t	*bp;
1462 
1463 	ASSERT(MUTEX_HELD(&so->so_lock));
1464 
1465 	/*
1466 	 * Remove T_DISCON_IND mblk at so_discon_ind_mp.
1467 	 */
1468 	if ((bp = so->so_discon_ind_mp) != NULL) {
1469 		so->so_discon_ind_mp = NULL;
1470 		bp->b_prev = NULL;
1471 		freemsg(bp);
1472 	}
1473 }
1474 
1475 /*
1476  * Caller must hold the mutex.
1477  *
1478  * This function is used to process the T_DISCON_IND message. It does
1479  * immediate processing when called from strsock_proto and delayed
1480  * processing of discon_ind saved on so_discon_ind_mp when called from
1481  * so_drain_discon_ind. When a T_DISCON_IND message is saved in
1482  * so_discon_ind_mp for delayed processing, this function is registered
1483  * as the callback function to process the message.
1484  *
1485  * SOASYNC_UNBIND should be held in this function, during the non-blocking
1486  * unbind operation, and should be released only after we receive the ACK
1487  * in strsock_proto, for the T_UNBIND_REQ sent here. Since SOLOCKED is not set,
1488  * no TPI messages would be sent down at this time. This is to prevent M_FLUSH
1489  * sent from either this function or tcp_unbind(), flushing away any TPI
1490  * message that is being sent down and stays in a lower module's queue.
1491  *
1492  * This function drops so_lock and grabs it again.
1493  */
1494 static void
1495 strsock_discon_ind(struct sonode *so, mblk_t *discon_mp)
1496 {
1497 	struct vnode *vp;
1498 	struct stdata *stp;
1499 	union T_primitives *tpr;
1500 	struct T_unbind_req *ubr;
1501 	mblk_t *mp;
1502 	int error;
1503 
1504 	ASSERT(MUTEX_HELD(&so->so_lock));
1505 	ASSERT(discon_mp);
1506 	ASSERT(discon_mp->b_rptr);
1507 
1508 	tpr = (union T_primitives *)discon_mp->b_rptr;
1509 	ASSERT(tpr->type == T_DISCON_IND);
1510 
1511 	vp = SOTOV(so);
1512 	stp = vp->v_stream;
1513 	ASSERT(stp);
1514 
1515 	/*
1516 	 * Not a listener
1517 	 */
1518 	ASSERT((so->so_state & SS_ACCEPTCONN) == 0);
1519 
1520 	/*
1521 	 * This assumes that the name space for DISCON_reason
1522 	 * is the errno name space.
1523 	 */
1524 	soisdisconnected(so, tpr->discon_ind.DISCON_reason);
1525 
1526 	/*
1527 	 * Unbind with the transport without blocking.
1528 	 * If we've already received a T_DISCON_IND do not unbind.
1529 	 *
1530 	 * If there is no preallocated unbind message, we have already
1531 	 * unbound with the transport
1532 	 *
1533 	 * If the socket is not bound, no need to unbind.
1534 	 */
1535 	mp = so->so_unbind_mp;
1536 	if (mp == NULL) {
1537 		ASSERT(!(so->so_state & SS_ISBOUND));
1538 		mutex_exit(&so->so_lock);
1539 	} else if (!(so->so_state & SS_ISBOUND))  {
1540 		mutex_exit(&so->so_lock);
1541 	} else {
1542 		so->so_unbind_mp = NULL;
1543 
1544 		/*
1545 		 * Is another T_DISCON_IND being processed.
1546 		 */
1547 		ASSERT((so->so_flag & SOASYNC_UNBIND) == 0);
1548 
1549 		/*
1550 		 * Make strsock_proto ignore T_OK_ACK and T_ERROR_ACK for
1551 		 * this unbind. Set SOASYNC_UNBIND. This should be cleared
1552 		 * only after we receive the ACK in strsock_proto.
1553 		 */
1554 		so->so_flag |= SOASYNC_UNBIND;
1555 		ASSERT(!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)));
1556 		so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN|SS_LADDR_VALID);
1557 		mutex_exit(&so->so_lock);
1558 
1559 		/*
1560 		 * Send down T_UNBIND_REQ ignoring flow control.
1561 		 * XXX Assumes that MSG_IGNFLOW implies that this thread
1562 		 * does not run service procedures.
1563 		 */
1564 		ASSERT(DB_TYPE(mp) == M_PROTO);
1565 		ubr = (struct T_unbind_req *)mp->b_rptr;
1566 		mp->b_wptr += sizeof (*ubr);
1567 		ubr->PRIM_type = T_UNBIND_REQ;
1568 
1569 		/*
1570 		 * Flush the read and write side (except stream head read queue)
1571 		 * and send down T_UNBIND_REQ.
1572 		 */
1573 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1574 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1575 		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
1576 		/* LINTED - warning: statement has no consequent: if */
1577 		if (error) {
1578 			eprintsoline(so, error);
1579 		}
1580 	}
1581 
1582 	if (tpr->discon_ind.DISCON_reason != 0)
1583 		strsetrerror(SOTOV(so), 0, 0, sogetrderr);
1584 	strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
1585 	strseteof(SOTOV(so), 1);
1586 	/*
1587 	 * strseteof takes care of read side wakeups,
1588 	 * pollwakeups, and signals.
1589 	 */
1590 	dprintso(so, 1, ("T_DISCON_IND: error %d\n", so->so_error));
1591 	freemsg(discon_mp);
1592 
1593 
1594 	pollwakeup(&stp->sd_pollist, POLLOUT);
1595 	mutex_enter(&stp->sd_lock);
1596 
1597 	/*
1598 	 * Wake sleeping write
1599 	 */
1600 	if (stp->sd_flag & WSLEEP) {
1601 		stp->sd_flag &= ~WSLEEP;
1602 		cv_broadcast(&stp->sd_wrq->q_wait);
1603 	}
1604 
1605 	/*
1606 	 * strsendsig can handle multiple signals with a
1607 	 * single call.  Send SIGPOLL for S_OUTPUT event.
1608 	 */
1609 	if (stp->sd_sigflags & S_OUTPUT)
1610 		strsendsig(stp->sd_siglist, S_OUTPUT, 0, 0);
1611 
1612 	mutex_exit(&stp->sd_lock);
1613 	mutex_enter(&so->so_lock);
1614 }
1615 
1616 /*
1617  * This routine is registered with the stream head to receive M_PROTO
1618  * and M_PCPROTO messages.
1619  *
1620  * Returns NULL if the message was consumed.
1621  * Returns an mblk to make that mblk be processed (and queued) by the stream
1622  * head.
1623  *
1624  * Sets the return parameters (*wakeups, *firstmsgsigs, *allmsgsigs, and
1625  * *pollwakeups) for the stream head to take action on. Note that since
1626  * sockets always deliver SIGIO for every new piece of data this routine
1627  * never sets *firstmsgsigs; any signals are returned in *allmsgsigs.
1628  *
1629  * This routine handles all data related TPI messages independent of
1630  * the type of the socket i.e. it doesn't care if T_UNITDATA_IND message
1631  * arrive on a SOCK_STREAM.
1632  */
1633 static mblk_t *
1634 strsock_proto(vnode_t *vp, mblk_t *mp,
1635 		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
1636 		strsigset_t *allmsgsigs, strpollset_t *pollwakeups)
1637 {
1638 	union T_primitives *tpr;
1639 	struct sonode *so;
1640 
1641 	so = VTOSO(vp);
1642 
1643 	dprintso(so, 1, ("strsock_proto(%p, %p)\n", vp, mp));
1644 
1645 	/* Set default return values */
1646 	*firstmsgsigs = *wakeups = *allmsgsigs = *pollwakeups = 0;
1647 
1648 	ASSERT(DB_TYPE(mp) == M_PROTO ||
1649 	    DB_TYPE(mp) == M_PCPROTO);
1650 
1651 	if (MBLKL(mp) < sizeof (tpr->type)) {
1652 		/* The message is too short to even contain the primitive */
1653 		zcmn_err(getzoneid(), CE_WARN,
1654 		    "sockfs: Too short TPI message received. Len = %ld\n",
1655 		    (ptrdiff_t)(MBLKL(mp)));
1656 		freemsg(mp);
1657 		return (NULL);
1658 	}
1659 	if (!__TPI_PRIM_ISALIGNED(mp->b_rptr)) {
1660 		/* The read pointer is not aligned correctly for TPI */
1661 		zcmn_err(getzoneid(), CE_WARN,
1662 		    "sockfs: Unaligned TPI message received. rptr = %p\n",
1663 		    (void *)mp->b_rptr);
1664 		freemsg(mp);
1665 		return (NULL);
1666 	}
1667 	tpr = (union T_primitives *)mp->b_rptr;
1668 	dprintso(so, 1, ("strsock_proto: primitive %d\n", tpr->type));
1669 
1670 	switch (tpr->type) {
1671 
1672 	case T_DATA_IND:
1673 		if (MBLKL(mp) < sizeof (struct T_data_ind)) {
1674 			zcmn_err(getzoneid(), CE_WARN,
1675 			    "sockfs: Too short T_DATA_IND. Len = %ld\n",
1676 			    (ptrdiff_t)(MBLKL(mp)));
1677 			freemsg(mp);
1678 			return (NULL);
1679 		}
1680 		/*
1681 		 * Ignore zero-length T_DATA_IND messages. These might be
1682 		 * generated by some transports.
1683 		 * This is needed to prevent read (which skips the M_PROTO
1684 		 * part) to unexpectedly return 0 (or return EWOULDBLOCK
1685 		 * on a non-blocking socket after select/poll has indicated
1686 		 * that data is available).
1687 		 */
1688 		if (msgdsize(mp->b_cont) == 0) {
1689 			dprintso(so, 0,
1690 			    ("strsock_proto: zero length T_DATA_IND\n"));
1691 			freemsg(mp);
1692 			return (NULL);
1693 		}
1694 		*allmsgsigs = S_INPUT | S_RDNORM;
1695 		*pollwakeups = POLLIN | POLLRDNORM;
1696 		*wakeups = RSLEEP;
1697 		return (mp);
1698 
1699 	case T_UNITDATA_IND: {
1700 		struct T_unitdata_ind	*tudi = &tpr->unitdata_ind;
1701 		void			*addr;
1702 		t_uscalar_t		addrlen;
1703 
1704 		if (MBLKL(mp) < sizeof (struct T_unitdata_ind)) {
1705 			zcmn_err(getzoneid(), CE_WARN,
1706 			    "sockfs: Too short T_UNITDATA_IND. Len = %ld\n",
1707 			    (ptrdiff_t)(MBLKL(mp)));
1708 			freemsg(mp);
1709 			return (NULL);
1710 		}
1711 
1712 		/* Is this not a connected datagram socket? */
1713 		if ((so->so_mode & SM_CONNREQUIRED) ||
1714 		    !(so->so_state & SS_ISCONNECTED)) {
1715 			/*
1716 			 * Not a connected datagram socket. Look for
1717 			 * the SO_UNIX_CLOSE option. If such an option is found
1718 			 * discard the message (since it has no meaning
1719 			 * unless connected).
1720 			 */
1721 			if (so->so_family == AF_UNIX && msgdsize(mp) == 0 &&
1722 			    tudi->OPT_length != 0) {
1723 				void *opt;
1724 				t_uscalar_t optlen = tudi->OPT_length;
1725 
1726 				opt = sogetoff(mp, tudi->OPT_offset,
1727 				    optlen, __TPI_ALIGN_SIZE);
1728 				if (opt == NULL) {
1729 					/* The len/off falls outside mp */
1730 					freemsg(mp);
1731 					mutex_enter(&so->so_lock);
1732 					soseterror(so, EPROTO);
1733 					mutex_exit(&so->so_lock);
1734 					zcmn_err(getzoneid(), CE_WARN,
1735 					    "sockfs: T_unidata_ind with "
1736 					    "invalid optlen/offset %u/%d\n",
1737 					    optlen, tudi->OPT_offset);
1738 					return (NULL);
1739 				}
1740 				if (so_getopt_unix_close(opt, optlen)) {
1741 					freemsg(mp);
1742 					return (NULL);
1743 				}
1744 			}
1745 			*allmsgsigs = S_INPUT | S_RDNORM;
1746 			*pollwakeups = POLLIN | POLLRDNORM;
1747 			*wakeups = RSLEEP;
1748 			if (audit_active)
1749 				audit_sock(T_UNITDATA_IND, strvp2wq(vp),
1750 				    mp, 0);
1751 			return (mp);
1752 		}
1753 
1754 		/*
1755 		 * A connect datagram socket. For AF_INET{,6} we verify that
1756 		 * the source address matches the "connected to" address.
1757 		 * The semantics of AF_UNIX sockets is to not verify
1758 		 * the source address.
1759 		 * Note that this source address verification is transport
1760 		 * specific. Thus the real fix would be to extend TPI
1761 		 * to allow T_CONN_REQ messages to be sent to connectionless
1762 		 * transport providers and always let the transport provider
1763 		 * do whatever filtering is needed.
1764 		 *
1765 		 * The verification/filtering semantics for transports
1766 		 * other than AF_INET and AF_UNIX are unknown. The choice
1767 		 * would be to either filter using bcmp or let all messages
1768 		 * get through. This code does not filter other address
1769 		 * families since this at least allows the application to
1770 		 * work around any missing filtering.
1771 		 *
1772 		 * XXX Should we move filtering to UDP/ICMP???
1773 		 * That would require passing e.g. a T_DISCON_REQ to UDP
1774 		 * when the socket becomes unconnected.
1775 		 */
1776 		addrlen = tudi->SRC_length;
1777 		/*
1778 		 * The alignment restriction is really too strict but
1779 		 * we want enough alignment to inspect the fields of
1780 		 * a sockaddr_in.
1781 		 */
1782 		addr = sogetoff(mp, tudi->SRC_offset, addrlen,
1783 		    __TPI_ALIGN_SIZE);
1784 		if (addr == NULL) {
1785 			freemsg(mp);
1786 			mutex_enter(&so->so_lock);
1787 			soseterror(so, EPROTO);
1788 			mutex_exit(&so->so_lock);
1789 			zcmn_err(getzoneid(), CE_WARN,
1790 			    "sockfs: T_unidata_ind with invalid "
1791 			    "addrlen/offset %u/%d\n",
1792 			    addrlen, tudi->SRC_offset);
1793 			return (NULL);
1794 		}
1795 
1796 		if (so->so_family == AF_INET) {
1797 			/*
1798 			 * For AF_INET we allow wildcarding both sin_addr
1799 			 * and sin_port.
1800 			 */
1801 			struct sockaddr_in *faddr, *sin;
1802 
1803 			/* Prevent so_faddr_sa from changing while accessed */
1804 			mutex_enter(&so->so_lock);
1805 			ASSERT(so->so_faddr_len ==
1806 			    (socklen_t)sizeof (struct sockaddr_in));
1807 			faddr = (struct sockaddr_in *)so->so_faddr_sa;
1808 			sin = (struct sockaddr_in *)addr;
1809 			if (addrlen !=
1810 			    (t_uscalar_t)sizeof (struct sockaddr_in) ||
1811 			    (sin->sin_addr.s_addr != faddr->sin_addr.s_addr &&
1812 			    faddr->sin_addr.s_addr != INADDR_ANY) ||
1813 			    (so->so_type != SOCK_RAW &&
1814 			    sin->sin_port != faddr->sin_port &&
1815 			    faddr->sin_port != 0)) {
1816 #ifdef DEBUG
1817 				dprintso(so, 0,
1818 				    ("sockfs: T_UNITDATA_IND mismatch: %s",
1819 				    pr_addr(so->so_family,
1820 				    (struct sockaddr *)addr,
1821 				    addrlen)));
1822 				dprintso(so, 0, (" - %s\n",
1823 				    pr_addr(so->so_family, so->so_faddr_sa,
1824 				    (t_uscalar_t)so->so_faddr_len)));
1825 #endif /* DEBUG */
1826 				mutex_exit(&so->so_lock);
1827 				freemsg(mp);
1828 				return (NULL);
1829 			}
1830 			mutex_exit(&so->so_lock);
1831 		} else if (so->so_family == AF_INET6) {
1832 			/*
1833 			 * For AF_INET6 we allow wildcarding both sin6_addr
1834 			 * and sin6_port.
1835 			 */
1836 			struct sockaddr_in6 *faddr6, *sin6;
1837 			static struct in6_addr zeroes; /* inits to all zeros */
1838 
1839 			/* Prevent so_faddr_sa from changing while accessed */
1840 			mutex_enter(&so->so_lock);
1841 			ASSERT(so->so_faddr_len ==
1842 			    (socklen_t)sizeof (struct sockaddr_in6));
1843 			faddr6 = (struct sockaddr_in6 *)so->so_faddr_sa;
1844 			sin6 = (struct sockaddr_in6 *)addr;
1845 			/* XXX could we get a mapped address ::ffff:0.0.0.0 ? */
1846 			if (addrlen !=
1847 			    (t_uscalar_t)sizeof (struct sockaddr_in6) ||
1848 			    (!IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
1849 			    &faddr6->sin6_addr) &&
1850 			    !IN6_ARE_ADDR_EQUAL(&faddr6->sin6_addr, &zeroes)) ||
1851 			    (so->so_type != SOCK_RAW &&
1852 			    sin6->sin6_port != faddr6->sin6_port &&
1853 			    faddr6->sin6_port != 0)) {
1854 #ifdef DEBUG
1855 				dprintso(so, 0,
1856 				    ("sockfs: T_UNITDATA_IND mismatch: %s",
1857 				    pr_addr(so->so_family,
1858 				    (struct sockaddr *)addr,
1859 				    addrlen)));
1860 				dprintso(so, 0, (" - %s\n",
1861 				    pr_addr(so->so_family, so->so_faddr_sa,
1862 				    (t_uscalar_t)so->so_faddr_len)));
1863 #endif /* DEBUG */
1864 				mutex_exit(&so->so_lock);
1865 				freemsg(mp);
1866 				return (NULL);
1867 			}
1868 			mutex_exit(&so->so_lock);
1869 		} else if (so->so_family == AF_UNIX &&
1870 		    msgdsize(mp->b_cont) == 0 &&
1871 		    tudi->OPT_length != 0) {
1872 			/*
1873 			 * Attempt to extract AF_UNIX
1874 			 * SO_UNIX_CLOSE indication from options.
1875 			 */
1876 			void *opt;
1877 			t_uscalar_t optlen = tudi->OPT_length;
1878 
1879 			opt = sogetoff(mp, tudi->OPT_offset,
1880 			    optlen, __TPI_ALIGN_SIZE);
1881 			if (opt == NULL) {
1882 				/* The len/off falls outside mp */
1883 				freemsg(mp);
1884 				mutex_enter(&so->so_lock);
1885 				soseterror(so, EPROTO);
1886 				mutex_exit(&so->so_lock);
1887 				zcmn_err(getzoneid(), CE_WARN,
1888 				    "sockfs: T_unidata_ind with invalid "
1889 				    "optlen/offset %u/%d\n",
1890 				    optlen, tudi->OPT_offset);
1891 				return (NULL);
1892 			}
1893 			/*
1894 			 * If we received a unix close indication mark the
1895 			 * socket and discard this message.
1896 			 */
1897 			if (so_getopt_unix_close(opt, optlen)) {
1898 				mutex_enter(&so->so_lock);
1899 				sobreakconn(so, ECONNRESET);
1900 				mutex_exit(&so->so_lock);
1901 				strsetrerror(SOTOV(so), 0, 0, sogetrderr);
1902 				freemsg(mp);
1903 				*pollwakeups = POLLIN | POLLRDNORM;
1904 				*allmsgsigs = S_INPUT | S_RDNORM;
1905 				*wakeups = RSLEEP;
1906 				return (NULL);
1907 			}
1908 		}
1909 		*allmsgsigs = S_INPUT | S_RDNORM;
1910 		*pollwakeups = POLLIN | POLLRDNORM;
1911 		*wakeups = RSLEEP;
1912 		return (mp);
1913 	}
1914 
1915 	case T_OPTDATA_IND: {
1916 		struct T_optdata_ind	*tdi = &tpr->optdata_ind;
1917 
1918 		if (MBLKL(mp) < sizeof (struct T_optdata_ind)) {
1919 			zcmn_err(getzoneid(), CE_WARN,
1920 			    "sockfs: Too short T_OPTDATA_IND. Len = %ld\n",
1921 			    (ptrdiff_t)(MBLKL(mp)));
1922 			freemsg(mp);
1923 			return (NULL);
1924 		}
1925 		/*
1926 		 * Allow zero-length messages carrying options.
1927 		 * This is used when carrying the SO_UNIX_CLOSE option.
1928 		 */
1929 		if (so->so_family == AF_UNIX && msgdsize(mp->b_cont) == 0 &&
1930 		    tdi->OPT_length != 0) {
1931 			/*
1932 			 * Attempt to extract AF_UNIX close indication
1933 			 * from the options. Ignore any other options -
1934 			 * those are handled once the message is removed
1935 			 * from the queue.
1936 			 * The close indication message should not carry data.
1937 			 */
1938 			void *opt;
1939 			t_uscalar_t optlen = tdi->OPT_length;
1940 
1941 			opt = sogetoff(mp, tdi->OPT_offset,
1942 			    optlen, __TPI_ALIGN_SIZE);
1943 			if (opt == NULL) {
1944 				/* The len/off falls outside mp */
1945 				freemsg(mp);
1946 				mutex_enter(&so->so_lock);
1947 				soseterror(so, EPROTO);
1948 				mutex_exit(&so->so_lock);
1949 				zcmn_err(getzoneid(), CE_WARN,
1950 				    "sockfs: T_optdata_ind with invalid "
1951 				    "optlen/offset %u/%d\n",
1952 				    optlen, tdi->OPT_offset);
1953 				return (NULL);
1954 			}
1955 			/*
1956 			 * If we received a close indication mark the
1957 			 * socket and discard this message.
1958 			 */
1959 			if (so_getopt_unix_close(opt, optlen)) {
1960 				mutex_enter(&so->so_lock);
1961 				socantsendmore(so);
1962 				mutex_exit(&so->so_lock);
1963 				strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
1964 				freemsg(mp);
1965 				return (NULL);
1966 			}
1967 		}
1968 		*allmsgsigs = S_INPUT | S_RDNORM;
1969 		*pollwakeups = POLLIN | POLLRDNORM;
1970 		*wakeups = RSLEEP;
1971 		return (mp);
1972 	}
1973 
1974 	case T_EXDATA_IND: {
1975 		mblk_t		*mctl, *mdata;
1976 		mblk_t *lbp;
1977 		union T_primitives *tprp;
1978 		struct stdata   *stp;
1979 		queue_t *qp;
1980 
1981 		if (MBLKL(mp) < sizeof (struct T_exdata_ind)) {
1982 			zcmn_err(getzoneid(), CE_WARN,
1983 			    "sockfs: Too short T_EXDATA_IND. Len = %ld\n",
1984 			    (ptrdiff_t)(MBLKL(mp)));
1985 			freemsg(mp);
1986 			return (NULL);
1987 		}
1988 		/*
1989 		 * Ignore zero-length T_EXDATA_IND messages. These might be
1990 		 * generated by some transports.
1991 		 *
1992 		 * This is needed to prevent read (which skips the M_PROTO
1993 		 * part) to unexpectedly return 0 (or return EWOULDBLOCK
1994 		 * on a non-blocking socket after select/poll has indicated
1995 		 * that data is available).
1996 		 */
1997 		dprintso(so, 1,
1998 		    ("T_EXDATA_IND(%p): counts %d/%d state %s\n",
1999 		    vp, so->so_oobsigcnt, so->so_oobcnt,
2000 		    pr_state(so->so_state, so->so_mode)));
2001 
2002 		if (msgdsize(mp->b_cont) == 0) {
2003 			dprintso(so, 0,
2004 			    ("strsock_proto: zero length T_EXDATA_IND\n"));
2005 			freemsg(mp);
2006 			return (NULL);
2007 		}
2008 
2009 		/*
2010 		 * Split into the T_EXDATA_IND and the M_DATA part.
2011 		 * We process these three pieces separately:
2012 		 *	signal generation
2013 		 *	handling T_EXDATA_IND
2014 		 *	handling M_DATA component
2015 		 */
2016 		mctl = mp;
2017 		mdata = mctl->b_cont;
2018 		mctl->b_cont = NULL;
2019 		mutex_enter(&so->so_lock);
2020 		so_oob_sig(so, 0, allmsgsigs, pollwakeups);
2021 		mctl = so_oob_exdata(so, mctl, allmsgsigs, pollwakeups);
2022 		mdata = so_oob_data(so, mdata, allmsgsigs, pollwakeups);
2023 
2024 		stp = vp->v_stream;
2025 		ASSERT(stp != NULL);
2026 		qp = _RD(stp->sd_wrq);
2027 
2028 		mutex_enter(QLOCK(qp));
2029 		lbp = qp->q_last;
2030 
2031 		/*
2032 		 * We want to avoid queueing up a string of T_EXDATA_IND
2033 		 * messages with no intervening data messages at the stream
2034 		 * head. These messages contribute to the total message
2035 		 * count. Eventually this can lead to STREAMS flow control
2036 		 * and also cause TCP to advertise a zero window condition
2037 		 * to the peer. This can happen in the degenerate case where
2038 		 * the sender and receiver exchange only OOB data. The sender
2039 		 * only sends messages with MSG_OOB flag and the receiver
2040 		 * receives only MSG_OOB messages and does not use SO_OOBINLINE.
2041 		 * An example of this scenario has been reported in applications
2042 		 * that use OOB data to exchange heart beats. Flow control
2043 		 * relief will never happen if the application only reads OOB
2044 		 * data which is done directly by sorecvoob() and the
2045 		 * T_EXDATA_IND messages at the streamhead won't be consumed.
2046 		 * Note that there is no correctness issue in compressing the
2047 		 * string of T_EXDATA_IND messages into a single T_EXDATA_IND
2048 		 * message. A single read that does not specify MSG_OOB will
2049 		 * read across all the marks in a loop in sotpi_recvmsg().
2050 		 * Each mark is individually distinguishable only if the
2051 		 * T_EXDATA_IND messages are separated by data messages.
2052 		 */
2053 		if ((qp->q_first != NULL) && (DB_TYPE(lbp) == M_PROTO)) {
2054 			tprp = (union T_primitives *)lbp->b_rptr;
2055 			if ((tprp->type == T_EXDATA_IND) &&
2056 			    !(so->so_options & SO_OOBINLINE)) {
2057 
2058 				/*
2059 				 * free the new M_PROTO message
2060 				 */
2061 				freemsg(mctl);
2062 
2063 				/*
2064 				 * adjust the OOB count and OOB	signal count
2065 				 * just incremented for the new OOB data.
2066 				 */
2067 				so->so_oobcnt--;
2068 				so->so_oobsigcnt--;
2069 				mutex_exit(QLOCK(qp));
2070 				mutex_exit(&so->so_lock);
2071 				return (NULL);
2072 			}
2073 		}
2074 		mutex_exit(QLOCK(qp));
2075 
2076 		/*
2077 		 * Pass the T_EXDATA_IND and the M_DATA back separately
2078 		 * by using b_next linkage. (The stream head will queue any
2079 		 * b_next linked messages separately.) This is needed
2080 		 * since MSGMARK applies to the last byte of the message
2081 		 * hence we can not have any M_DATA component attached
2082 		 * to the marked T_EXDATA_IND. Note that the stream head
2083 		 * will not consolidate M_DATA messages onto an MSGMARK'ed
2084 		 * message in order to preserve the constraint that
2085 		 * the T_EXDATA_IND always is a separate message.
2086 		 */
2087 		ASSERT(mctl != NULL);
2088 		mctl->b_next = mdata;
2089 		mp = mctl;
2090 #ifdef DEBUG
2091 		if (mdata == NULL) {
2092 			dprintso(so, 1,
2093 			    ("after outofline T_EXDATA_IND(%p): "
2094 			    "counts %d/%d  poll 0x%x sig 0x%x state %s\n",
2095 			    vp, so->so_oobsigcnt,
2096 			    so->so_oobcnt, *pollwakeups, *allmsgsigs,
2097 			    pr_state(so->so_state, so->so_mode)));
2098 		} else {
2099 			dprintso(so, 1,
2100 			    ("after inline T_EXDATA_IND(%p): "
2101 			    "counts %d/%d  poll 0x%x sig 0x%x state %s\n",
2102 			    vp, so->so_oobsigcnt,
2103 			    so->so_oobcnt, *pollwakeups, *allmsgsigs,
2104 			    pr_state(so->so_state, so->so_mode)));
2105 		}
2106 #endif /* DEBUG */
2107 		mutex_exit(&so->so_lock);
2108 		*wakeups = RSLEEP;
2109 		return (mp);
2110 	}
2111 
2112 	case T_CONN_CON: {
2113 		struct T_conn_con	*conn_con;
2114 		void			*addr;
2115 		t_uscalar_t		addrlen;
2116 
2117 		/*
2118 		 * Verify the state, update the state to ISCONNECTED,
2119 		 * record the potentially new address in the message,
2120 		 * and drop the message.
2121 		 */
2122 		if (MBLKL(mp) < sizeof (struct T_conn_con)) {
2123 			zcmn_err(getzoneid(), CE_WARN,
2124 			    "sockfs: Too short T_CONN_CON. Len = %ld\n",
2125 			    (ptrdiff_t)(MBLKL(mp)));
2126 			freemsg(mp);
2127 			return (NULL);
2128 		}
2129 
2130 		mutex_enter(&so->so_lock);
2131 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) !=
2132 		    SS_ISCONNECTING) {
2133 			mutex_exit(&so->so_lock);
2134 			dprintso(so, 1,
2135 			    ("T_CONN_CON: state %x\n", so->so_state));
2136 			freemsg(mp);
2137 			return (NULL);
2138 		}
2139 
2140 		conn_con = &tpr->conn_con;
2141 		addrlen = conn_con->RES_length;
2142 		/*
2143 		 * Allow the address to be of different size than sent down
2144 		 * in the T_CONN_REQ as long as it doesn't exceed the maxlen.
2145 		 * For AF_UNIX require the identical length.
2146 		 */
2147 		if (so->so_family == AF_UNIX ?
2148 		    addrlen != (t_uscalar_t)sizeof (so->so_ux_laddr) :
2149 		    addrlen > (t_uscalar_t)so->so_faddr_maxlen) {
2150 			zcmn_err(getzoneid(), CE_WARN,
2151 			    "sockfs: T_conn_con with different "
2152 			    "length %u/%d\n",
2153 			    addrlen, conn_con->RES_length);
2154 			soisdisconnected(so, EPROTO);
2155 			mutex_exit(&so->so_lock);
2156 			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
2157 			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2158 			strseteof(SOTOV(so), 1);
2159 			freemsg(mp);
2160 			/*
2161 			 * strseteof takes care of read side wakeups,
2162 			 * pollwakeups, and signals.
2163 			 */
2164 			*wakeups = WSLEEP;
2165 			*allmsgsigs = S_OUTPUT;
2166 			*pollwakeups = POLLOUT;
2167 			return (NULL);
2168 		}
2169 		addr = sogetoff(mp, conn_con->RES_offset, addrlen, 1);
2170 		if (addr == NULL) {
2171 			zcmn_err(getzoneid(), CE_WARN,
2172 			    "sockfs: T_conn_con with invalid "
2173 			    "addrlen/offset %u/%d\n",
2174 			    addrlen, conn_con->RES_offset);
2175 			mutex_exit(&so->so_lock);
2176 			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
2177 			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2178 			strseteof(SOTOV(so), 1);
2179 			freemsg(mp);
2180 			/*
2181 			 * strseteof takes care of read side wakeups,
2182 			 * pollwakeups, and signals.
2183 			 */
2184 			*wakeups = WSLEEP;
2185 			*allmsgsigs = S_OUTPUT;
2186 			*pollwakeups = POLLOUT;
2187 			return (NULL);
2188 		}
2189 
2190 		/*
2191 		 * Save for getpeername.
2192 		 */
2193 		if (so->so_family != AF_UNIX) {
2194 			so->so_faddr_len = (socklen_t)addrlen;
2195 			ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
2196 			bcopy(addr, so->so_faddr_sa, addrlen);
2197 			so->so_state |= SS_FADDR_VALID;
2198 		}
2199 
2200 		if (so->so_peercred != NULL)
2201 			crfree(so->so_peercred);
2202 		so->so_peercred = DB_CRED(mp);
2203 		so->so_cpid = DB_CPID(mp);
2204 		if (so->so_peercred != NULL)
2205 			crhold(so->so_peercred);
2206 
2207 		/* Wakeup anybody sleeping in sowaitconnected */
2208 		soisconnected(so);
2209 		mutex_exit(&so->so_lock);
2210 
2211 		/*
2212 		 * The socket is now available for sending data.
2213 		 */
2214 		*wakeups = WSLEEP;
2215 		*allmsgsigs = S_OUTPUT;
2216 		*pollwakeups = POLLOUT;
2217 		freemsg(mp);
2218 		return (NULL);
2219 	}
2220 
2221 	/*
2222 	 * Extra processing in case of an SSL proxy, before queuing or
2223 	 * forwarding to the fallback endpoint
2224 	 */
2225 	case T_SSL_PROXY_CONN_IND:
2226 	case T_CONN_IND:
2227 		/*
2228 		 * Verify the min size and queue the message on
2229 		 * the so_conn_ind_head/tail list.
2230 		 */
2231 		if (MBLKL(mp) < sizeof (struct T_conn_ind)) {
2232 			zcmn_err(getzoneid(), CE_WARN,
2233 			    "sockfs: Too short T_CONN_IND. Len = %ld\n",
2234 			    (ptrdiff_t)(MBLKL(mp)));
2235 			freemsg(mp);
2236 			return (NULL);
2237 		}
2238 
2239 		if (audit_active)
2240 			audit_sock(T_CONN_IND, strvp2wq(vp), mp, 0);
2241 		if (!(so->so_state & SS_ACCEPTCONN)) {
2242 			zcmn_err(getzoneid(), CE_WARN,
2243 			    "sockfs: T_conn_ind on non-listening socket\n");
2244 			freemsg(mp);
2245 			return (NULL);
2246 		}
2247 
2248 		if (tpr->type == T_SSL_PROXY_CONN_IND && mp->b_cont == NULL) {
2249 			/* No context: need to fall back */
2250 			struct sonode *fbso;
2251 			stdata_t *fbstp;
2252 
2253 			tpr->type = T_CONN_IND;
2254 
2255 			fbso = kssl_find_fallback(so->so_kssl_ent);
2256 
2257 			/*
2258 			 * No fallback: the remote will timeout and
2259 			 * disconnect.
2260 			 */
2261 			if (fbso == NULL) {
2262 				freemsg(mp);
2263 				return (NULL);
2264 			}
2265 			fbstp = SOTOV(fbso)->v_stream;
2266 			qreply(fbstp->sd_wrq->q_next, mp);
2267 			return (NULL);
2268 		}
2269 		soqueueconnind(so, mp);
2270 		*allmsgsigs = S_INPUT | S_RDNORM;
2271 		*pollwakeups = POLLIN | POLLRDNORM;
2272 		*wakeups = RSLEEP;
2273 		return (NULL);
2274 
2275 	case T_ORDREL_IND:
2276 		if (MBLKL(mp) < sizeof (struct T_ordrel_ind)) {
2277 			zcmn_err(getzoneid(), CE_WARN,
2278 			    "sockfs: Too short T_ORDREL_IND. Len = %ld\n",
2279 			    (ptrdiff_t)(MBLKL(mp)));
2280 			freemsg(mp);
2281 			return (NULL);
2282 		}
2283 
2284 		/*
2285 		 * Some providers send this when not fully connected.
2286 		 * SunLink X.25 needs to retrieve disconnect reason after
2287 		 * disconnect for compatibility. It uses T_ORDREL_IND
2288 		 * instead of T_DISCON_IND so that it may use the
2289 		 * endpoint after a connect failure to retrieve the
2290 		 * reason using an ioctl. Thus we explicitly clear
2291 		 * SS_ISCONNECTING here for SunLink X.25.
2292 		 * This is a needed TPI violation.
2293 		 */
2294 		mutex_enter(&so->so_lock);
2295 		so->so_state &= ~SS_ISCONNECTING;
2296 		socantrcvmore(so);
2297 		mutex_exit(&so->so_lock);
2298 		strseteof(SOTOV(so), 1);
2299 		/*
2300 		 * strseteof takes care of read side wakeups,
2301 		 * pollwakeups, and signals.
2302 		 */
2303 		freemsg(mp);
2304 		return (NULL);
2305 
2306 	case T_DISCON_IND:
2307 		if (MBLKL(mp) < sizeof (struct T_discon_ind)) {
2308 			zcmn_err(getzoneid(), CE_WARN,
2309 			    "sockfs: Too short T_DISCON_IND. Len = %ld\n",
2310 			    (ptrdiff_t)(MBLKL(mp)));
2311 			freemsg(mp);
2312 			return (NULL);
2313 		}
2314 		if (so->so_state & SS_ACCEPTCONN) {
2315 			/*
2316 			 * This is a listener. Look for a queued T_CONN_IND
2317 			 * with a matching sequence number and remove it
2318 			 * from the list.
2319 			 * It is normal to not find the sequence number since
2320 			 * the soaccept might have already dequeued it
2321 			 * (in which case the T_CONN_RES will fail with
2322 			 * TBADSEQ).
2323 			 */
2324 			(void) soflushconnind(so, tpr->discon_ind.SEQ_number);
2325 			freemsg(mp);
2326 			return (0);
2327 		}
2328 
2329 		/*
2330 		 * Not a listener
2331 		 *
2332 		 * If SS_CANTRCVMORE for AF_UNIX ignore the discon_reason.
2333 		 * Such a discon_ind appears when the peer has first done
2334 		 * a shutdown() followed by a close() in which case we just
2335 		 * want to record socantsendmore.
2336 		 * In this case sockfs first receives a T_ORDREL_IND followed
2337 		 * by a T_DISCON_IND.
2338 		 * Note that for other transports (e.g. TCP) we need to handle
2339 		 * the discon_ind in this case since it signals an error.
2340 		 */
2341 		mutex_enter(&so->so_lock);
2342 		if ((so->so_state & SS_CANTRCVMORE) &&
2343 		    (so->so_family == AF_UNIX)) {
2344 			socantsendmore(so);
2345 			mutex_exit(&so->so_lock);
2346 			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2347 			dprintso(so, 1,
2348 			    ("T_DISCON_IND: error %d\n", so->so_error));
2349 			freemsg(mp);
2350 			/*
2351 			 * Set these variables for caller to process them.
2352 			 * For the else part where T_DISCON_IND is processed,
2353 			 * this will be done in the function being called
2354 			 * (strsock_discon_ind())
2355 			 */
2356 			*wakeups = WSLEEP;
2357 			*allmsgsigs = S_OUTPUT;
2358 			*pollwakeups = POLLOUT;
2359 		} else if (so->so_flag & (SOASYNC_UNBIND | SOLOCKED)) {
2360 			/*
2361 			 * Deferred processing of T_DISCON_IND
2362 			 */
2363 			so_save_discon_ind(so, mp, strsock_discon_ind);
2364 			mutex_exit(&so->so_lock);
2365 		} else {
2366 			/*
2367 			 * Process T_DISCON_IND now
2368 			 */
2369 			(void) strsock_discon_ind(so, mp);
2370 			mutex_exit(&so->so_lock);
2371 		}
2372 		return (NULL);
2373 
2374 	case T_UDERROR_IND: {
2375 		struct T_uderror_ind	*tudi = &tpr->uderror_ind;
2376 		void			*addr;
2377 		t_uscalar_t		addrlen;
2378 		int			error;
2379 
2380 		dprintso(so, 0,
2381 		    ("T_UDERROR_IND: error %d\n", tudi->ERROR_type));
2382 
2383 		if (MBLKL(mp) < sizeof (struct T_uderror_ind)) {
2384 			zcmn_err(getzoneid(), CE_WARN,
2385 			    "sockfs: Too short T_UDERROR_IND. Len = %ld\n",
2386 			    (ptrdiff_t)(MBLKL(mp)));
2387 			freemsg(mp);
2388 			return (NULL);
2389 		}
2390 		/* Ignore on connection-oriented transports */
2391 		if (so->so_mode & SM_CONNREQUIRED) {
2392 			freemsg(mp);
2393 			eprintsoline(so, 0);
2394 			zcmn_err(getzoneid(), CE_WARN,
2395 			    "sockfs: T_uderror_ind on connection-oriented "
2396 			    "transport\n");
2397 			return (NULL);
2398 		}
2399 		addrlen = tudi->DEST_length;
2400 		addr = sogetoff(mp, tudi->DEST_offset, addrlen, 1);
2401 		if (addr == NULL) {
2402 			zcmn_err(getzoneid(), CE_WARN,
2403 			    "sockfs: T_uderror_ind with invalid "
2404 			    "addrlen/offset %u/%d\n",
2405 			    addrlen, tudi->DEST_offset);
2406 			freemsg(mp);
2407 			return (NULL);
2408 		}
2409 
2410 		/* Verify source address for connected socket. */
2411 		mutex_enter(&so->so_lock);
2412 		if (so->so_state & SS_ISCONNECTED) {
2413 			void *faddr;
2414 			t_uscalar_t faddr_len;
2415 			boolean_t match = B_FALSE;
2416 
2417 			switch (so->so_family) {
2418 			case AF_INET: {
2419 				/* Compare just IP address and port */
2420 				struct sockaddr_in *sin1, *sin2;
2421 
2422 				sin1 = (struct sockaddr_in *)so->so_faddr_sa;
2423 				sin2 = (struct sockaddr_in *)addr;
2424 				if (addrlen == sizeof (struct sockaddr_in) &&
2425 				    sin1->sin_port == sin2->sin_port &&
2426 				    sin1->sin_addr.s_addr ==
2427 				    sin2->sin_addr.s_addr)
2428 					match = B_TRUE;
2429 				break;
2430 			}
2431 			case AF_INET6: {
2432 				/* Compare just IP address and port. Not flow */
2433 				struct sockaddr_in6 *sin1, *sin2;
2434 
2435 				sin1 = (struct sockaddr_in6 *)so->so_faddr_sa;
2436 				sin2 = (struct sockaddr_in6 *)addr;
2437 				if (addrlen == sizeof (struct sockaddr_in6) &&
2438 				    sin1->sin6_port == sin2->sin6_port &&
2439 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
2440 				    &sin2->sin6_addr))
2441 					match = B_TRUE;
2442 				break;
2443 			}
2444 			case AF_UNIX:
2445 				faddr = &so->so_ux_faddr;
2446 				faddr_len =
2447 				    (t_uscalar_t)sizeof (so->so_ux_faddr);
2448 				if (faddr_len == addrlen &&
2449 				    bcmp(addr, faddr, addrlen) == 0)
2450 					match = B_TRUE;
2451 				break;
2452 			default:
2453 				faddr = so->so_faddr_sa;
2454 				faddr_len = (t_uscalar_t)so->so_faddr_len;
2455 				if (faddr_len == addrlen &&
2456 				    bcmp(addr, faddr, addrlen) == 0)
2457 					match = B_TRUE;
2458 				break;
2459 			}
2460 
2461 			if (!match) {
2462 #ifdef DEBUG
2463 				dprintso(so, 0,
2464 				    ("sockfs: T_UDERR_IND mismatch: %s - ",
2465 				    pr_addr(so->so_family,
2466 				    (struct sockaddr *)addr,
2467 				    addrlen)));
2468 				dprintso(so, 0, ("%s\n",
2469 				    pr_addr(so->so_family, so->so_faddr_sa,
2470 				    so->so_faddr_len)));
2471 #endif /* DEBUG */
2472 				mutex_exit(&so->so_lock);
2473 				freemsg(mp);
2474 				return (NULL);
2475 			}
2476 			/*
2477 			 * Make the write error nonpersistent. If the error
2478 			 * is zero we use ECONNRESET.
2479 			 * This assumes that the name space for ERROR_type
2480 			 * is the errno name space.
2481 			 */
2482 			if (tudi->ERROR_type != 0)
2483 				error = tudi->ERROR_type;
2484 			else
2485 				error = ECONNRESET;
2486 
2487 			soseterror(so, error);
2488 			mutex_exit(&so->so_lock);
2489 			strsetrerror(SOTOV(so), 0, 0, sogetrderr);
2490 			strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2491 			*wakeups = RSLEEP | WSLEEP;
2492 			*allmsgsigs = S_INPUT | S_RDNORM | S_OUTPUT;
2493 			*pollwakeups = POLLIN | POLLRDNORM | POLLOUT;
2494 			freemsg(mp);
2495 			return (NULL);
2496 		}
2497 		/*
2498 		 * If the application asked for delayed errors
2499 		 * record the T_UDERROR_IND so_eaddr_mp and the reason in
2500 		 * so_delayed_error for delayed error posting. If the reason
2501 		 * is zero use ECONNRESET.
2502 		 * Note that delayed error indications do not make sense for
2503 		 * AF_UNIX sockets since sendto checks that the destination
2504 		 * address is valid at the time of the sendto.
2505 		 */
2506 		if (!(so->so_options & SO_DGRAM_ERRIND)) {
2507 			mutex_exit(&so->so_lock);
2508 			freemsg(mp);
2509 			return (NULL);
2510 		}
2511 		if (so->so_eaddr_mp != NULL)
2512 			freemsg(so->so_eaddr_mp);
2513 
2514 		so->so_eaddr_mp = mp;
2515 		if (tudi->ERROR_type != 0)
2516 			error = tudi->ERROR_type;
2517 		else
2518 			error = ECONNRESET;
2519 		so->so_delayed_error = (ushort_t)error;
2520 		mutex_exit(&so->so_lock);
2521 		return (NULL);
2522 	}
2523 
2524 	case T_ERROR_ACK:
2525 		dprintso(so, 0,
2526 		    ("strsock_proto: T_ERROR_ACK for %d, error %d/%d\n",
2527 		    tpr->error_ack.ERROR_prim,
2528 		    tpr->error_ack.TLI_error,
2529 		    tpr->error_ack.UNIX_error));
2530 
2531 		if (MBLKL(mp) < sizeof (struct T_error_ack)) {
2532 			zcmn_err(getzoneid(), CE_WARN,
2533 			    "sockfs: Too short T_ERROR_ACK. Len = %ld\n",
2534 			    (ptrdiff_t)(MBLKL(mp)));
2535 			freemsg(mp);
2536 			return (NULL);
2537 		}
2538 		/*
2539 		 * Check if we were waiting for the async message
2540 		 */
2541 		mutex_enter(&so->so_lock);
2542 		if ((so->so_flag & SOASYNC_UNBIND) &&
2543 		    tpr->error_ack.ERROR_prim == T_UNBIND_REQ) {
2544 			so_unlock_single(so, SOASYNC_UNBIND);
2545 			mutex_exit(&so->so_lock);
2546 			freemsg(mp);
2547 			return (NULL);
2548 		}
2549 		mutex_exit(&so->so_lock);
2550 		soqueueack(so, mp);
2551 		return (NULL);
2552 
2553 	case T_OK_ACK:
2554 		if (MBLKL(mp) < sizeof (struct T_ok_ack)) {
2555 			zcmn_err(getzoneid(), CE_WARN,
2556 			    "sockfs: Too short T_OK_ACK. Len = %ld\n",
2557 			    (ptrdiff_t)(MBLKL(mp)));
2558 			freemsg(mp);
2559 			return (NULL);
2560 		}
2561 		/*
2562 		 * Check if we were waiting for the async message
2563 		 */
2564 		mutex_enter(&so->so_lock);
2565 		if ((so->so_flag & SOASYNC_UNBIND) &&
2566 		    tpr->ok_ack.CORRECT_prim == T_UNBIND_REQ) {
2567 			dprintso(so, 1,
2568 			    ("strsock_proto: T_OK_ACK async unbind\n"));
2569 			so_unlock_single(so, SOASYNC_UNBIND);
2570 			mutex_exit(&so->so_lock);
2571 			freemsg(mp);
2572 			return (NULL);
2573 		}
2574 		mutex_exit(&so->so_lock);
2575 		soqueueack(so, mp);
2576 		return (NULL);
2577 
2578 	case T_INFO_ACK:
2579 		if (MBLKL(mp) < sizeof (struct T_info_ack)) {
2580 			zcmn_err(getzoneid(), CE_WARN,
2581 			    "sockfs: Too short T_INFO_ACK. Len = %ld\n",
2582 			    (ptrdiff_t)(MBLKL(mp)));
2583 			freemsg(mp);
2584 			return (NULL);
2585 		}
2586 		soqueueack(so, mp);
2587 		return (NULL);
2588 
2589 	case T_CAPABILITY_ACK:
2590 		/*
2591 		 * A T_capability_ack need only be large enough to hold
2592 		 * the PRIM_type and CAP_bits1 fields; checking for anything
2593 		 * larger might reject a correct response from an older
2594 		 * provider.
2595 		 */
2596 		if (MBLKL(mp) < 2 * sizeof (t_uscalar_t)) {
2597 			zcmn_err(getzoneid(), CE_WARN,
2598 			    "sockfs: Too short T_CAPABILITY_ACK. Len = %ld\n",
2599 			    (ptrdiff_t)(MBLKL(mp)));
2600 			freemsg(mp);
2601 			return (NULL);
2602 		}
2603 		soqueueack(so, mp);
2604 		return (NULL);
2605 
2606 	case T_BIND_ACK:
2607 		if (MBLKL(mp) < sizeof (struct T_bind_ack)) {
2608 			zcmn_err(getzoneid(), CE_WARN,
2609 			    "sockfs: Too short T_BIND_ACK. Len = %ld\n",
2610 			    (ptrdiff_t)(MBLKL(mp)));
2611 			freemsg(mp);
2612 			return (NULL);
2613 		}
2614 		soqueueack(so, mp);
2615 		return (NULL);
2616 
2617 	case T_OPTMGMT_ACK:
2618 		if (MBLKL(mp) < sizeof (struct T_optmgmt_ack)) {
2619 			zcmn_err(getzoneid(), CE_WARN,
2620 			    "sockfs: Too short T_OPTMGMT_ACK. Len = %ld\n",
2621 			    (ptrdiff_t)(MBLKL(mp)));
2622 			freemsg(mp);
2623 			return (NULL);
2624 		}
2625 		soqueueack(so, mp);
2626 		return (NULL);
2627 	default:
2628 #ifdef DEBUG
2629 		zcmn_err(getzoneid(), CE_WARN,
2630 		    "sockfs: unknown TPI primitive %d received\n",
2631 		    tpr->type);
2632 #endif /* DEBUG */
2633 		freemsg(mp);
2634 		return (NULL);
2635 	}
2636 }
2637 
2638 /*
2639  * This routine is registered with the stream head to receive other
2640  * (non-data, and non-proto) messages.
2641  *
2642  * Returns NULL if the message was consumed.
2643  * Returns an mblk to make that mblk be processed by the stream head.
2644  *
2645  * Sets the return parameters (*wakeups, *firstmsgsigs, *allmsgsigs, and
2646  * *pollwakeups) for the stream head to take action on.
2647  */
2648 static mblk_t *
2649 strsock_misc(vnode_t *vp, mblk_t *mp,
2650 		strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
2651 		strsigset_t *allmsgsigs, strpollset_t *pollwakeups)
2652 {
2653 	struct sonode *so;
2654 
2655 	so = VTOSO(vp);
2656 
2657 	dprintso(so, 1, ("strsock_misc(%p, %p, 0x%x)\n",
2658 	    vp, mp, DB_TYPE(mp)));
2659 
2660 	/* Set default return values */
2661 	*wakeups = *allmsgsigs = *firstmsgsigs = *pollwakeups = 0;
2662 
2663 	switch (DB_TYPE(mp)) {
2664 	case M_PCSIG:
2665 		/*
2666 		 * This assumes that an M_PCSIG for the urgent data arrives
2667 		 * before the corresponding T_EXDATA_IND.
2668 		 *
2669 		 * Note: Just like in SunOS 4.X and 4.4BSD a poll will be
2670 		 * awoken before the urgent data shows up.
2671 		 * For OOBINLINE this can result in select returning
2672 		 * only exceptions as opposed to except|read.
2673 		 */
2674 		if (*mp->b_rptr == SIGURG) {
2675 			mutex_enter(&so->so_lock);
2676 			dprintso(so, 1,
2677 			    ("SIGURG(%p): counts %d/%d state %s\n",
2678 			    vp, so->so_oobsigcnt,
2679 			    so->so_oobcnt,
2680 			    pr_state(so->so_state, so->so_mode)));
2681 			so_oob_sig(so, 1, allmsgsigs, pollwakeups);
2682 			dprintso(so, 1,
2683 			    ("after SIGURG(%p): counts %d/%d "
2684 			    " poll 0x%x sig 0x%x state %s\n",
2685 			    vp, so->so_oobsigcnt,
2686 			    so->so_oobcnt, *pollwakeups, *allmsgsigs,
2687 			    pr_state(so->so_state, so->so_mode)));
2688 			mutex_exit(&so->so_lock);
2689 		}
2690 		freemsg(mp);
2691 		return (NULL);
2692 
2693 	case M_SIG:
2694 	case M_HANGUP:
2695 	case M_UNHANGUP:
2696 	case M_ERROR:
2697 		/* M_ERRORs etc are ignored */
2698 		freemsg(mp);
2699 		return (NULL);
2700 
2701 	case M_FLUSH:
2702 		/*
2703 		 * Do not flush read queue. If the M_FLUSH
2704 		 * arrives because of an impending T_discon_ind
2705 		 * we still have to keep any queued data - this is part of
2706 		 * socket semantics.
2707 		 */
2708 		if (*mp->b_rptr & FLUSHW) {
2709 			*mp->b_rptr &= ~FLUSHR;
2710 			return (mp);
2711 		}
2712 		freemsg(mp);
2713 		return (NULL);
2714 
2715 	default:
2716 		return (mp);
2717 	}
2718 }
2719 
2720 
2721 /* Register to receive signals for certain events */
2722 int
2723 so_set_asyncsigs(vnode_t *vp, pid_t pgrp, int events, int mode, cred_t *cr)
2724 {
2725 	struct strsigset ss;
2726 	int32_t rval;
2727 
2728 	/*
2729 	 * Note that SOLOCKED will be set except for the call from soaccept().
2730 	 */
2731 	ASSERT(!mutex_owned(&VTOSO(vp)->so_lock));
2732 	ss.ss_pid = pgrp;
2733 	ss.ss_events = events;
2734 	return (strioctl(vp, I_ESETSIG, (intptr_t)&ss, mode, K_TO_K, cr,
2735 	    &rval));
2736 }
2737 
2738 
2739 /* Register for events matching the SS_ASYNC flag */
2740 int
2741 so_set_events(struct sonode *so, vnode_t *vp, cred_t *cr)
2742 {
2743 	int events = so->so_state & SS_ASYNC ?
2744 	    S_RDBAND | S_BANDURG | S_RDNORM | S_OUTPUT :
2745 	    S_RDBAND | S_BANDURG;
2746 
2747 	return (so_set_asyncsigs(vp, so->so_pgrp, events, 0, cr));
2748 }
2749 
2750 
2751 /* Change the SS_ASYNC flag, and update signal delivery if needed */
2752 int
2753 so_flip_async(struct sonode *so, vnode_t *vp, int mode, cred_t *cr)
2754 {
2755 	ASSERT(mutex_owned(&so->so_lock));
2756 	if (so->so_pgrp != 0) {
2757 		int error;
2758 		int events = so->so_state & SS_ASYNC ?		/* Old flag */
2759 		    S_RDBAND | S_BANDURG :			/* New sigs */
2760 		    S_RDBAND | S_BANDURG | S_RDNORM | S_OUTPUT;
2761 
2762 		so_lock_single(so);
2763 		mutex_exit(&so->so_lock);
2764 
2765 		error = so_set_asyncsigs(vp, so->so_pgrp, events, mode, cr);
2766 
2767 		mutex_enter(&so->so_lock);
2768 		so_unlock_single(so, SOLOCKED);
2769 		if (error)
2770 			return (error);
2771 	}
2772 	so->so_state ^= SS_ASYNC;
2773 	return (0);
2774 }
2775 
2776 /*
2777  * Set new pid/pgrp for SIGPOLL (or SIGIO for FIOASYNC mode), replacing
2778  * any existing one.  If passed zero, just clear the existing one.
2779  */
2780 int
2781 so_set_siggrp(struct sonode *so, vnode_t *vp, pid_t pgrp, int mode, cred_t *cr)
2782 {
2783 	int events = so->so_state & SS_ASYNC ?
2784 	    S_RDBAND | S_BANDURG | S_RDNORM | S_OUTPUT :
2785 	    S_RDBAND | S_BANDURG;
2786 	int error;
2787 
2788 	ASSERT(mutex_owned(&so->so_lock));
2789 
2790 	/*
2791 	 * Change socket process (group).
2792 	 *
2793 	 * strioctl (via so_set_asyncsigs) will perform permission check and
2794 	 * also keep a PID_HOLD to prevent the pid from being reused.
2795 	 */
2796 	so_lock_single(so);
2797 	mutex_exit(&so->so_lock);
2798 
2799 	if (pgrp != 0) {
2800 		dprintso(so, 1, ("setown: adding pgrp %d ev 0x%x\n",
2801 		    pgrp, events));
2802 		error = so_set_asyncsigs(vp, pgrp, events, mode, cr);
2803 		if (error != 0) {
2804 			eprintsoline(so, error);
2805 			goto bad;
2806 		}
2807 	}
2808 	/* Remove the previously registered process/group */
2809 	if (so->so_pgrp != 0) {
2810 		dprintso(so, 1, ("setown: removing pgrp %d\n", so->so_pgrp));
2811 		error = so_set_asyncsigs(vp, so->so_pgrp, 0, mode, cr);
2812 		if (error != 0) {
2813 			eprintsoline(so, error);
2814 			error = 0;
2815 		}
2816 	}
2817 	mutex_enter(&so->so_lock);
2818 	so_unlock_single(so, SOLOCKED);
2819 	so->so_pgrp = pgrp;
2820 	return (0);
2821 bad:
2822 	mutex_enter(&so->so_lock);
2823 	so_unlock_single(so, SOLOCKED);
2824 	return (error);
2825 }
2826 
2827 
2828 
/*
 * Translate a TLI(/XTI) error into a system error as best we can.
 *
 * The table is indexed directly by the TLI error number; each entry is
 * labelled with its index and the TLI error it stands for.
 */
static const int tli_errs[] = {
	/*  0: no error */		0,
	/*  1: TBADADDR */		EADDRNOTAVAIL,
	/*  2: TBADOPT */		ENOPROTOOPT,
	/*  3: TACCES */		EACCES,
	/*  4: TBADF */			EBADF,
	/*  5: TNOADDR */		EADDRNOTAVAIL,
	/*  6: TOUTSTATE */		EPROTO,
	/*  7: TBADSEQ */		ECONNABORTED,
	/*  8: TSYSERR - will never get */
					0,
	/*  9: TLOOK - should never be sent by transport */
					EPROTO,
	/* 10: TBADDATA */		EMSGSIZE,
	/* 11: TBUFOVFLW */		EMSGSIZE,
	/* 12: TFLOW */			EPROTO,
	/* 13: TNODATA */		EWOULDBLOCK,
	/* 14: TNODIS */		EPROTO,
	/* 15: TNOUDERR */		EPROTO,
	/* 16: TBADFLAG */		EINVAL,
	/* 17: TNOREL */		EPROTO,
	/* 18: TNOTSUPPORT */		EOPNOTSUPP,
	/* 19: TSTATECHNG */		EPROTO,

	/* The following represent error namespace expansion with XTI. */
	/* 20: TNOSTRUCTYPE - never sent by transport */
					EPROTO,
	/* 21: TBADNAME - never sent by transport */
					EPROTO,
	/* 22: TBADQLEN - never sent by transport */
					EPROTO,
	/* 23: TADDRBUSY */		EADDRINUSE,
	/* 24: TINDOUT */		EBADF,
	/* 25: TPROVMISMATCH */		EBADF,
	/* 26: TRESQLEN */		EBADF,
	/* 27: TRESADDR */		EBADF,
	/* 28: TQFULL - never sent by transport */
					EPROTO,
	/* 29: TPROTO */		EPROTO,
};
2865 
2866 static int
2867 tlitosyserr(int terr)
2868 {
2869 	ASSERT(terr != TSYSERR);
2870 	if (terr >= (sizeof (tli_errs) / sizeof (tli_errs[0])))
2871 		return (EPROTO);
2872 	else
2873 		return (tli_errs[terr]);
2874 }
2875