xref: /titanic_52/usr/src/uts/common/fs/sockfs/socktpi.c (revision 3589c4f01c20349ca65899d209cdc0c17a641433)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/t_lock.h>
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/buf.h>
32 #include <sys/conf.h>
33 #include <sys/cred.h>
34 #include <sys/kmem.h>
35 #include <sys/kmem_impl.h>
36 #include <sys/sysmacros.h>
37 #include <sys/vfs.h>
38 #include <sys/vnode.h>
39 #include <sys/debug.h>
40 #include <sys/errno.h>
41 #include <sys/time.h>
42 #include <sys/file.h>
43 #include <sys/open.h>
44 #include <sys/user.h>
45 #include <sys/termios.h>
46 #include <sys/stream.h>
47 #include <sys/strsubr.h>
48 #include <sys/strsun.h>
49 #include <sys/suntpi.h>
50 #include <sys/ddi.h>
51 #include <sys/esunddi.h>
52 #include <sys/flock.h>
53 #include <sys/modctl.h>
54 #include <sys/vtrace.h>
55 #include <sys/cmn_err.h>
56 #include <sys/pathname.h>
57 
58 #include <sys/socket.h>
59 #include <sys/socketvar.h>
60 #include <sys/sockio.h>
61 #include <sys/sodirect.h>
62 #include <netinet/in.h>
63 #include <sys/un.h>
64 #include <sys/strsun.h>
65 
66 #include <sys/tiuser.h>
67 #define	_SUN_TPI_VERSION	2
68 #include <sys/tihdr.h>
69 #include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
70 
71 #include <c2/audit.h>
72 
73 #include <inet/common.h>
74 #include <inet/ip.h>
75 #include <inet/ip6.h>
76 #include <inet/tcp.h>
77 #include <inet/udp_impl.h>
78 
79 #include <sys/zone.h>
80 
81 #include <fs/sockfs/nl7c.h>
82 #include <fs/sockfs/nl7curi.h>
83 
84 #include <inet/kssl/ksslapi.h>
85 
86 #include <fs/sockfs/sockcommon.h>
87 #include <fs/sockfs/socktpi.h>
88 #include <fs/sockfs/socktpi_impl.h>
89 
90 /*
91  * Possible failures when memory can't be allocated. The documented behavior:
92  *
93  * 		5.5:			4.X:		XNET:
94  * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
95  *							EINTR
96  *	(4.X does not document EINTR but returns it)
97  * bind:	ENOSR			-		ENOBUFS/ENOSR
98  * connect: 	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
99  * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
100  * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
101  *	(4.X getpeername and getsockname do not fail in practice)
102  * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
103  * listen:	-			-		ENOBUFS
104  * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
105  *							EINTR
106  * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
107  *							EINTR
108  * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
109  * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
110  * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
111  * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
112  *
113  * Resolution. When allocation fails:
114  *	recv: return EINTR
115  *	send: return EINTR
116  *	connect, accept: EINTR
117  *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
118  *	socket, socketpair: ENOBUFS
119  *	getpeername, getsockname: sleep
120  *	getsockopt, setsockopt: sleep
121  */
122 
#ifdef SOCK_TEST
/*
 * Variables that make sockfs do something other than the standard TPI
 * for the AF_INET transports.
 *
 * solisten_tpi_tcp:
 *	TCP can handle a O_T_BIND_REQ with an increased backlog even though
 *	the transport is already bound. This is needed to avoid losing the
 *	port number should listen() do a T_UNBIND_REQ followed by a
 *	O_T_BIND_REQ.
 *
 * soconnect_tpi_udp:
 *	UDP and ICMP can handle a T_CONN_REQ.
 *	This is needed to make the sequence of connect(), getsockname()
 *	return the local IP address used to send packets to the connected to
 *	destination.
 *
 * soconnect_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
 *	Set this to non-zero to send TPI conformant messages to TCP in this
 *	respect. This is a performance optimization.
 *
 * soaccept_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without the acceptor being bound.
 *	This is a performance optimization that has been picked up in XTI.
 *
 * soaccept_tpi_multioptions:
 *	When inheriting SOL_SOCKET options from the listener to the accepting
 *	socket send them as a single message for AF_INET{,6}.
 */
int solisten_tpi_tcp = 0;
int soconnect_tpi_udp = 0;
int soconnect_tpi_tcp = 0;
int soaccept_tpi_tcp = 0;
int soaccept_tpi_multioptions = 1;
#else /* SOCK_TEST */
/*
 * Without SOCK_TEST the knobs above are compile-time constants that carry
 * the same values as the SOCK_TEST defaults.
 */
#define	soconnect_tpi_tcp	0
#define	soconnect_tpi_udp	0
#define	solisten_tpi_tcp	0
#define	soaccept_tpi_tcp	0
#define	soaccept_tpi_multioptions	1
#endif /* SOCK_TEST */
165 
166 #ifdef SOCK_TEST
167 extern int do_useracc;
168 extern clock_t sock_test_timelimit;
169 #endif /* SOCK_TEST */
170 
/*
 * Some X/Open added checks might have to be backed out to keep SunOS 4.X
 * applications working. Turn on this flag to disable these checks.
 *
 * xnet_skip_checks:
 *	When non-zero, the X/Open state check in sotpi_bindlisten() (bind on
 *	a socket with SS_CANTSENDMORE set) is skipped.
 *
 * xnet_check_print:
 *	When non-zero, a message is printed whenever that X/Open check causes
 *	EINVAL to be returned.
 *
 * xnet_truncate_print:
 *	NOTE(review): presumably a matching diagnostic switch for address
 *	truncation; its use is not visible in this part of the file - confirm.
 */
int xnet_skip_checks = 0;
int xnet_check_print = 0;
int xnet_truncate_print = 0;
178 
179 static void sotpi_destroy(struct sonode *);
180 static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
181     int, int *, cred_t *cr);
182 
183 static boolean_t	sotpi_info_create(struct sonode *, int);
184 static void		sotpi_info_init(struct sonode *);
185 static void 		sotpi_info_fini(struct sonode *);
186 static void 		sotpi_info_destroy(struct sonode *);
187 
188 /*
189  * Do direct function call to the transport layer below; this would
190  * also allow the transport to utilize read-side synchronous stream
191  * interface if necessary.  This is a /etc/system tunable that must
192  * not be modified on a running system.  By default this is enabled
193  * for performance reasons and may be disabled for debugging purposes.
194  */
195 boolean_t socktpi_direct = B_TRUE;
196 
197 static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
198 
199 extern	void sigintr(k_sigset_t *, int);
200 extern	void sigunintr(k_sigset_t *);
201 
202 /* Sockets acting as an in-kernel SSL proxy */
203 extern mblk_t	*strsock_kssl_input(vnode_t *, mblk_t *, strwakeup_t *,
204 		    strsigset_t *, strsigset_t *, strpollset_t *);
205 extern mblk_t	*strsock_kssl_output(vnode_t *, mblk_t *, strwakeup_t *,
206 		    strsigset_t *, strsigset_t *, strpollset_t *);
207 
208 static int	sotpi_unbind(struct sonode *, int);
209 
210 extern int	sodput(sodirect_t *, mblk_t *);
211 extern void	sodwakeup(sodirect_t *);
212 
213 /* TPI sockfs sonode operations */
214 int 		sotpi_init(struct sonode *, struct sonode *, struct cred *,
215 		    int);
216 static int	sotpi_accept(struct sonode *, int, struct cred *,
217 		    struct sonode **);
218 static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
219 		    int, struct cred *);
220 static int	sotpi_listen(struct sonode *, int, struct cred *);
221 static int	sotpi_connect(struct sonode *, const struct sockaddr *,
222 		    socklen_t, int, int, struct cred *);
223 extern int	sotpi_recvmsg(struct sonode *, struct nmsghdr *,
224 		    struct uio *, struct cred *);
225 static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
226 		    struct uio *, struct cred *);
227 static int	sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
228 		    struct cred *, mblk_t **);
229 static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
230 		    struct uio *, void *, t_uscalar_t, int);
231 static int	sodgram_direct(struct sonode *, struct sockaddr *,
232 		    socklen_t, struct uio *, int);
233 extern int	sotpi_getpeername(struct sonode *, struct sockaddr *,
234 		    socklen_t *, boolean_t, struct cred *);
235 static int	sotpi_getsockname(struct sonode *, struct sockaddr *,
236 		    socklen_t *, struct cred *);
237 static int	sotpi_shutdown(struct sonode *, int, struct cred *);
238 extern int	sotpi_getsockopt(struct sonode *, int, int, void *,
239 		    socklen_t *, int, struct cred *);
240 extern int	sotpi_setsockopt(struct sonode *, int, int, const void *,
241 		    socklen_t, struct cred *);
242 static int 	sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
243 		    int32_t *);
244 static int 	socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
245 		    struct cred *, int32_t *);
246 static int 	sotpi_poll(struct sonode *, short, int, short *,
247 		    struct pollhead **);
248 static int 	sotpi_close(struct sonode *, int, struct cred *);
249 
250 static int	i_sotpi_info_constructor(sotpi_info_t *);
251 static void 	i_sotpi_info_destructor(sotpi_info_t *);
252 
/*
 * Operations vector for TPI sockets. sotpi_create() hands this table to
 * sonode_init() for every sonode it allocates, and sotpi_destroy() asserts
 * that the sonode being freed still points at it. The entries are listed
 * positionally; the trailing comment on each line names the sonodeops_t
 * slot the entry is intended to fill.
 */
sonodeops_t sotpi_sonodeops = {
	sotpi_init,		/* sop_init		*/
	sotpi_accept,		/* sop_accept		*/
	sotpi_bind,		/* sop_bind		*/
	sotpi_listen,		/* sop_listen		*/
	sotpi_connect,		/* sop_connect		*/
	sotpi_recvmsg,		/* sop_recvmsg		*/
	sotpi_sendmsg,		/* sop_sendmsg		*/
	sotpi_sendmblk,		/* sop_sendmblk		*/
	sotpi_getpeername,	/* sop_getpeername	*/
	sotpi_getsockname,	/* sop_getsockname	*/
	sotpi_shutdown,		/* sop_shutdown		*/
	sotpi_getsockopt,	/* sop_getsockopt	*/
	sotpi_setsockopt,	/* sop_setsockopt	*/
	sotpi_ioctl,		/* sop_ioctl		*/
	sotpi_poll,		/* sop_poll		*/
	sotpi_close,		/* sop_close		*/
};
271 
272 /*
273  * Return a TPI socket vnode.
274  *
275  * Note that sockets assume that the driver will clone (either itself
276  * or by using the clone driver) i.e. a socket() call will always
277  * result in a new vnode being created.
278  */
279 
280 /*
281  * Common create code for socket and accept. If tso is set the values
282  * from that node is used instead of issuing a T_INFO_REQ.
283  */
284 
285 /* ARGSUSED */
286 static struct sonode *
287 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
288     int version, int sflags, int *errorp, cred_t *cr)
289 {
290 	struct sonode	*so;
291 	kmem_cache_t 	*cp;
292 	int		sfamily = family;
293 
294 	ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
295 
296 	if (family == AF_NCA) {
297 		/*
298 		 * The request is for an NCA socket so for NL7C use the
299 		 * INET domain instead and mark NL7C_AF_NCA below.
300 		 */
301 		family = AF_INET;
302 		/*
303 		 * NL7C is not supported in the non-global zone,
304 		 * we enforce this restriction here.
305 		 */
306 		if (getzoneid() != GLOBAL_ZONEID) {
307 			*errorp = ENOTSUP;
308 			return (NULL);
309 		}
310 	}
311 
312 	/*
313 	 * to be compatible with old tpi socket implementation ignore
314 	 * sleep flag (sflags) passed in
315 	 */
316 	cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
317 	so = kmem_cache_alloc(cp, KM_SLEEP);
318 	if (so == NULL) {
319 		*errorp = ENOMEM;
320 		return (NULL);
321 	}
322 
323 	sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
324 	sotpi_info_init(so);
325 
326 	if (sfamily == AF_NCA) {
327 		SOTOTPI(so)->sti_nl7c_flags = NL7C_AF_NCA;
328 	}
329 
330 	if (version == SOV_DEFAULT)
331 		version = so_default_version;
332 
333 	so->so_version = (short)version;
334 	*errorp = 0;
335 
336 	return (so);
337 }
338 
339 static void
340 sotpi_destroy(struct sonode *so)
341 {
342 	kmem_cache_t *cp;
343 	struct sockparams *origsp;
344 
345 	/*
346 	 * If there is a new dealloc function (ie. smod_destroy_func),
347 	 * then it should check the correctness of the ops.
348 	 */
349 
350 	ASSERT(so->so_ops == &sotpi_sonodeops);
351 
352 	origsp = SOTOTPI(so)->sti_orig_sp;
353 
354 	sotpi_info_fini(so);
355 
356 	if (so->so_state & SS_FALLBACK_COMP) {
357 		/*
358 		 * A fallback happend, which means that a sotpi_info_t struct
359 		 * was allocated (as opposed to being allocated from the TPI
360 		 * sonode cache. Therefore we explicitly free the struct
361 		 * here.
362 		 */
363 		sotpi_info_destroy(so);
364 		ASSERT(origsp != NULL);
365 
366 		origsp->sp_smod_info->smod_sock_destroy_func(so);
367 		SOCKPARAMS_DEC_REF(origsp);
368 	} else {
369 		sonode_fini(so);
370 		cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
371 		    socktpi_cache;
372 		kmem_cache_free(cp, so);
373 	}
374 }
375 
/*
 * sop_init entry point for TPI sockets: open the stream on the socket's
 * vnode with stropen() and finish the socket-side setup of that stream.
 *
 *	so	socket being initialized
 *	tso	when non-NULL, the socket whose settings are inherited
 *		(the listener in the SOCK_STREAM accept case); it is also
 *		passed on to so_strinit()
 *	cr	credentials passed to stropen(), strioctl(),
 *		sotpi_setsockopt() and sotpi_close()
 *	flags	open flags; FREAD|FWRITE are always added, and SO_SOCKSTR /
 *		SO_ACCEPTOR are added here as appropriate. SO_FALLBACK from
 *		the caller makes the function return right after the stream
 *		has been opened and probed.
 *
 * Returns 0 on success or an errno value. Whenever a step fails after the
 * stream has been opened (or the open failed part way through with
 * STREOPENFAIL set), the socket is closed with sotpi_close() before the
 * error is returned.
 */
/* ARGSUSED1 */
int
sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
{
	major_t maj;
	dev_t newdev;
	struct vnode *vp;
	int error = 0;
	struct stdata *stp;

	sotpi_info_t *sti = SOTOTPI(so);

	dprint(1, ("sotpi_init()\n"));

	/*
	 * The flags word is reused as the stream open flags. This
	 * overwrites the sleep flag passed in, which is acceptable because
	 * a TPI socket does not honor the sleep flag.
	 */
	flags |= FREAD|FWRITE;

	/*
	 * Record in so_flag that it is a clone.
	 */
	if (getmajor(sti->sti_dev) == clone_major)
		so->so_flag |= SOCLONE;

	if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
	    (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
	    so->so_protocol == IPPROTO_IP)) {
		/* Tell tcp or udp that it's talking to sockets */
		flags |= SO_SOCKSTR;

		/*
		 * Here we indicate to socktpi_open() our attempt to
		 * make direct calls between sockfs and transport.
		 * The final decision is left to socktpi_open().
		 */
		sti->sti_direct = 1;

		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
		if (so->so_type == SOCK_STREAM && tso != NULL) {
			if (SOTOTPI(tso)->sti_direct) {
				/*
				 * Inherit sti_direct from listener and pass
				 * SO_ACCEPTOR open flag to tcp, indicating
				 * that this is an accept fast-path instance.
				 */
				flags |= SO_ACCEPTOR;
			} else {
				/*
				 * sti_direct is not set on listener, meaning
				 * that the listener has been converted from
				 * a socket to a stream.  Ensure that the
				 * acceptor inherits these settings.
				 */
				sti->sti_direct = 0;
				flags &= ~SO_SOCKSTR;
			}
		}
	}

	/*
	 * Tell local transport that it is talking to sockets.
	 */
	if (so->so_family == AF_UNIX) {
		flags |= SO_SOCKSTR;
	}

	vp = SOTOV(so);
	newdev = vp->v_rdev;
	maj = getmajor(newdev);
	ASSERT(STREAMSTAB(maj));

	/* newdev is in/out: stropen() may replace it with a new device. */
	error = stropen(vp, &newdev, flags, cr);

	/* Sampled after the open; checked for NULL only on the error path. */
	stp = vp->v_stream;
	if (error == 0) {
		if (so->so_flag & SOCLONE)
			ASSERT(newdev != vp->v_rdev);
		/* Publish the device returned by the open under so_lock. */
		mutex_enter(&so->so_lock);
		sti->sti_dev = newdev;
		vp->v_rdev = newdev;
		mutex_exit(&so->so_lock);

		if (stp->sd_flag & STRISTTY) {
			/*
			 * this is a post SVR4 tty driver - a socket can not
			 * be a controlling terminal. Fail the open.
			 */
			(void) sotpi_close(so, flags, cr);
			return (ENOTTY);	/* XXX */
		}

		ASSERT(stp->sd_wrq != NULL);
		sti->sti_provinfo = tpi_findprov(stp->sd_wrq);

		/*
		 * If caller is interested in doing direct function call
		 * interface to/from transport module, probe the module
		 * directly beneath the streamhead to see if it qualifies.
		 *
		 * We turn off the direct interface when qualifications fail.
		 * In the acceptor case, we simply turn off the sti_direct
		 * flag on the socket. We do the fallback after the accept
		 * has completed, before the new socket is returned to the
		 * application.
		 */
		if (sti->sti_direct) {
			/* Write queue of the module just below the stream head */
			queue_t *tq = stp->sd_wrq->q_next;

			/*
			 * sti_direct is currently supported and tested
			 * only for tcp/udp; this is the main reason to
			 * have the following assertions.
			 */
			ASSERT(so->so_family == AF_INET ||
			    so->so_family == AF_INET6);
			ASSERT(so->so_protocol == IPPROTO_UDP ||
			    so->so_protocol == IPPROTO_TCP ||
			    so->so_protocol == IPPROTO_IP);
			ASSERT(so->so_type == SOCK_DGRAM ||
			    so->so_type == SOCK_STREAM);

			/*
			 * Abort direct call interface if the module directly
			 * underneath the stream head is not defined with the
			 * _D_DIRECT flag.  This could happen in the tcp or
			 * udp case, when some other module is autopushed
			 * above it, or for some reasons the expected module
			 * isn't purely D_MP (which is the main requirement).
			 *
			 * Else, SS_DIRECT is valid. If the read-side Q has
			 * _QSODIRECT set and uioasync is enabled then
			 * set SS_SODIRECT to enable sodirect.
			 */
			if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
			    !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
				int rval;

				/* Continue on without direct calls */
				sti->sti_direct = 0;

				/*
				 * Cannot issue ioctl on fallback socket since
				 * there is no conn associated with the queue.
				 * The fallback downcall will notify the proto
				 * of the change.
				 */
				if (!(flags & SO_ACCEPTOR) &&
				    !(flags & SO_FALLBACK)) {
					if ((error = strioctl(vp,
					    _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
					    cr, &rval)) != 0) {
						(void) sotpi_close(so, flags,
						    cr);
						return (error);
					}
				}
			} else if ((_OTHERQ(tq)->q_flag & _QSODIRECT) &&
			    uioasync.enabled) {
				/* Enable sodirect */
				so->so_state |= SS_SODIRECT;
			}
		}

		if (flags & SO_FALLBACK) {
			/*
			 * The stream created does not have a conn.
			 * do stream set up after conn has been assigned
			 */
			return (error);
		}
		/* Intentional assignment: non-zero means so_strinit() failed */
		if (error = so_strinit(so, tso)) {
			(void) sotpi_close(so, flags, cr);
			return (error);
		}

		/*
		 * Wildcard: the requested protocol differs from the one
		 * recorded in the socket's sockparams entry.
		 */
		if (so->so_protocol != so->so_sockparams->sp_protocol) {
			int protocol = so->so_protocol;
			/*
			 * Issue SO_PROTOTYPE setsockopt.
			 */
			error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
			if (error != 0) {
				(void) sotpi_close(so, flags, cr);
				/*
				 * Setsockopt often fails with ENOPROTOOPT but
				 * socket() should fail with
				 * EPROTONOSUPPORT/EPROTOTYPE.
				 */
				return (EPROTONOSUPPORT);
			}
		}

	} else {
		/*
		 * While the same socket can not be reopened (unlike specfs)
		 * the stream head sets STREOPENFAIL when the autopush fails.
		 */
		if ((stp != NULL) &&
		    (stp->sd_flag & STREOPENFAIL)) {
			/*
			 * Open failed part way through.
			 */
			mutex_enter(&stp->sd_lock);
			stp->sd_flag &= ~STREOPENFAIL;
			mutex_exit(&stp->sd_lock);
			(void) sotpi_close(so, flags, cr);
			return (error);
			/*NOTREACHED*/
		}
		ASSERT(stp == NULL);
	}
	/*
	 * Only reached on full success or when the open failed without
	 * STREOPENFAIL set; the early returns above bypass this trace point.
	 */
	TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
	    "sockfs open:maj %d vp %p so %p error %d",
	    maj, vp, so, error);
	return (error);
}
597 
598 /*
599  * Bind the socket to an unspecified address in sockfs only.
600  * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
601  * required in all cases.
602  */
603 static void
604 so_automatic_bind(struct sonode *so)
605 {
606 	sotpi_info_t *sti = SOTOTPI(so);
607 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
608 
609 	ASSERT(MUTEX_HELD(&so->so_lock));
610 	ASSERT(!(so->so_state & SS_ISBOUND));
611 	ASSERT(sti->sti_unbind_mp);
612 
613 	ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
614 	bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
615 	sti->sti_laddr_sa->sa_family = so->so_family;
616 	so->so_state |= SS_ISBOUND;
617 }
618 
619 
620 /*
621  * bind the socket.
622  *
623  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
624  * are passed in we allow rebinding. Note that for backwards compatibility
625  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
626  * Thus the rebinding code is currently not executed.
627  *
628  * The constraints for rebinding are:
629  * - it is a SOCK_DGRAM, or
630  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
631  *   and no listen() has been done.
632  * This rebinding code was added based on some language in the XNET book
 * about not returning EINVAL if the protocol allows rebinding. However,
634  * this language is not present in the Posix socket draft. Thus maybe the
635  * rebinding logic should be deleted from the source.
636  *
637  * A null "name" can be used to unbind the socket if:
638  * - it is a SOCK_DGRAM, or
639  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
640  *   and no listen() has been done.
641  */
642 /* ARGSUSED */
643 static int
644 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
645     socklen_t namelen, int backlog, int flags, struct cred *cr)
646 {
647 	struct T_bind_req	bind_req;
648 	struct T_bind_ack	*bind_ack;
649 	int			error = 0;
650 	mblk_t			*mp;
651 	void			*addr;
652 	t_uscalar_t		addrlen;
653 	int			unbind_on_err = 1;
654 	boolean_t		clear_acceptconn_on_err = B_FALSE;
655 	boolean_t		restore_backlog_on_err = B_FALSE;
656 	int			save_so_backlog;
657 	t_scalar_t		PRIM_type = O_T_BIND_REQ;
658 	boolean_t		tcp_udp_xport;
659 	void			*nl7c = NULL;
660 	sotpi_info_t		*sti = SOTOTPI(so);
661 
662 	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
663 	    (void *)so, (void *)name, namelen, backlog, flags,
664 	    pr_state(so->so_state, so->so_mode)));
665 
666 	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
667 
668 	if (!(flags & _SOBIND_LOCK_HELD)) {
669 		mutex_enter(&so->so_lock);
670 		so_lock_single(so);	/* Set SOLOCKED */
671 	} else {
672 		ASSERT(MUTEX_HELD(&so->so_lock));
673 		ASSERT(so->so_flag & SOLOCKED);
674 	}
675 
676 	/*
677 	 * Make sure that there is a preallocated unbind_req message
678 	 * before binding. This message allocated when the socket is
679 	 * created  but it might be have been consumed.
680 	 */
681 	if (sti->sti_unbind_mp == NULL) {
682 		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
683 		/* NOTE: holding so_lock while sleeping */
684 		sti->sti_unbind_mp =
685 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
686 		    cr);
687 	}
688 
689 	if (flags & _SOBIND_REBIND) {
690 		/*
691 		 * Called from solisten after doing an sotpi_unbind() or
692 		 * potentially without the unbind (latter for AF_INET{,6}).
693 		 */
694 		ASSERT(name == NULL && namelen == 0);
695 
696 		if (so->so_family == AF_UNIX) {
697 			ASSERT(sti->sti_ux_bound_vp);
698 			addr = &sti->sti_ux_laddr;
699 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
700 			dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
701 			    "addr 0x%p, vp %p\n",
702 			    addrlen,
703 			    (void *)((struct so_ux_addr *)addr)->soua_vp,
704 			    (void *)sti->sti_ux_bound_vp));
705 		} else {
706 			addr = sti->sti_laddr_sa;
707 			addrlen = (t_uscalar_t)sti->sti_laddr_len;
708 		}
709 	} else if (flags & _SOBIND_UNSPEC) {
710 		ASSERT(name == NULL && namelen == 0);
711 
712 		/*
713 		 * The caller checked SS_ISBOUND but not necessarily
714 		 * under so_lock
715 		 */
716 		if (so->so_state & SS_ISBOUND) {
717 			/* No error */
718 			goto done;
719 		}
720 
721 		/* Set an initial local address */
722 		switch (so->so_family) {
723 		case AF_UNIX:
724 			/*
725 			 * Use an address with same size as struct sockaddr
726 			 * just like BSD.
727 			 */
728 			sti->sti_laddr_len =
729 			    (socklen_t)sizeof (struct sockaddr);
730 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
731 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
732 			sti->sti_laddr_sa->sa_family = so->so_family;
733 
734 			/*
735 			 * Pass down an address with the implicit bind
736 			 * magic number and the rest all zeros.
737 			 * The transport will return a unique address.
738 			 */
739 			sti->sti_ux_laddr.soua_vp = NULL;
740 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
741 			addr = &sti->sti_ux_laddr;
742 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
743 			break;
744 
745 		case AF_INET:
746 		case AF_INET6:
747 			/*
748 			 * An unspecified bind in TPI has a NULL address.
749 			 * Set the address in sockfs to have the sa_family.
750 			 */
751 			sti->sti_laddr_len = (so->so_family == AF_INET) ?
752 			    (socklen_t)sizeof (sin_t) :
753 			    (socklen_t)sizeof (sin6_t);
754 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
755 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
756 			sti->sti_laddr_sa->sa_family = so->so_family;
757 			addr = NULL;
758 			addrlen = 0;
759 			break;
760 
761 		default:
762 			/*
763 			 * An unspecified bind in TPI has a NULL address.
764 			 * Set the address in sockfs to be zero length.
765 			 *
766 			 * Can not assume there is a sa_family for all
767 			 * protocol families. For example, AF_X25 does not
768 			 * have a family field.
769 			 */
770 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
771 			sti->sti_laddr_len = 0;	/* XXX correct? */
772 			addr = NULL;
773 			addrlen = 0;
774 			break;
775 		}
776 
777 	} else {
778 		if (so->so_state & SS_ISBOUND) {
779 			/*
780 			 * If it is ok to rebind the socket, first unbind
781 			 * with the transport. A rebind to the NULL address
782 			 * is interpreted as an unbind.
783 			 * Note that a bind to NULL in BSD does unbind the
784 			 * socket but it fails with EINVAL.
785 			 * Note that regular sockets set SOV_SOCKBSD i.e.
786 			 * _SOBIND_SOCKBSD gets set here hence no type of
787 			 * socket does currently allow rebinding.
788 			 *
789 			 * If the name is NULL just do an unbind.
790 			 */
791 			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
792 			    name != NULL) {
793 				error = EINVAL;
794 				unbind_on_err = 0;
795 				eprintsoline(so, error);
796 				goto done;
797 			}
798 			if ((so->so_mode & SM_CONNREQUIRED) &&
799 			    (so->so_state & SS_CANTREBIND)) {
800 				error = EINVAL;
801 				unbind_on_err = 0;
802 				eprintsoline(so, error);
803 				goto done;
804 			}
805 			error = sotpi_unbind(so, 0);
806 			if (error) {
807 				eprintsoline(so, error);
808 				goto done;
809 			}
810 			ASSERT(!(so->so_state & SS_ISBOUND));
811 			if (name == NULL) {
812 				so->so_state &=
813 				    ~(SS_ISCONNECTED|SS_ISCONNECTING);
814 				goto done;
815 			}
816 		}
817 
818 		/* X/Open requires this check */
819 		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
820 			if (xnet_check_print) {
821 				printf("sockfs: X/Open bind state check "
822 				    "caused EINVAL\n");
823 			}
824 			error = EINVAL;
825 			goto done;
826 		}
827 
828 		switch (so->so_family) {
829 		case AF_UNIX:
830 			/*
831 			 * All AF_UNIX addresses are nul terminated
832 			 * when copied (copyin_name) in so the minimum
833 			 * length is 3 bytes.
834 			 */
835 			if (name == NULL ||
836 			    (ssize_t)namelen <= sizeof (short) + 1) {
837 				error = EISDIR;
838 				eprintsoline(so, error);
839 				goto done;
840 			}
841 			/*
842 			 * Verify so_family matches the bound family.
843 			 * BSD does not check this for AF_UNIX resulting
844 			 * in funny mknods.
845 			 */
846 			if (name->sa_family != so->so_family) {
847 				error = EAFNOSUPPORT;
848 				goto done;
849 			}
850 			break;
851 		case AF_INET:
852 			if (name == NULL) {
853 				error = EINVAL;
854 				eprintsoline(so, error);
855 				goto done;
856 			}
857 			if ((size_t)namelen != sizeof (sin_t)) {
858 				error = name->sa_family != so->so_family ?
859 				    EAFNOSUPPORT : EINVAL;
860 				eprintsoline(so, error);
861 				goto done;
862 			}
863 			if ((flags & _SOBIND_XPG4_2) &&
864 			    (name->sa_family != so->so_family)) {
865 				/*
866 				 * This check has to be made for X/Open
867 				 * sockets however application failures have
868 				 * been observed when it is applied to
869 				 * all sockets.
870 				 */
871 				error = EAFNOSUPPORT;
872 				eprintsoline(so, error);
873 				goto done;
874 			}
875 			/*
876 			 * Force a zero sa_family to match so_family.
877 			 *
878 			 * Some programs like inetd(1M) don't set the
879 			 * family field. Other programs leave
880 			 * sin_family set to garbage - SunOS 4.X does
881 			 * not check the family field on a bind.
882 			 * We use the family field that
883 			 * was passed in to the socket() call.
884 			 */
885 			name->sa_family = so->so_family;
886 			break;
887 
888 		case AF_INET6: {
889 #ifdef DEBUG
890 			sin6_t *sin6 = (sin6_t *)name;
891 #endif /* DEBUG */
892 
893 			if (name == NULL) {
894 				error = EINVAL;
895 				eprintsoline(so, error);
896 				goto done;
897 			}
898 			if ((size_t)namelen != sizeof (sin6_t)) {
899 				error = name->sa_family != so->so_family ?
900 				    EAFNOSUPPORT : EINVAL;
901 				eprintsoline(so, error);
902 				goto done;
903 			}
904 			if (name->sa_family != so->so_family) {
905 				/*
906 				 * With IPv6 we require the family to match
907 				 * unlike in IPv4.
908 				 */
909 				error = EAFNOSUPPORT;
910 				eprintsoline(so, error);
911 				goto done;
912 			}
913 #ifdef DEBUG
914 			/*
915 			 * Verify that apps don't forget to clear
916 			 * sin6_scope_id etc
917 			 */
918 			if (sin6->sin6_scope_id != 0 &&
919 			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
920 				zcmn_err(getzoneid(), CE_WARN,
921 				    "bind with uninitialized sin6_scope_id "
922 				    "(%d) on socket. Pid = %d\n",
923 				    (int)sin6->sin6_scope_id,
924 				    (int)curproc->p_pid);
925 			}
926 			if (sin6->__sin6_src_id != 0) {
927 				zcmn_err(getzoneid(), CE_WARN,
928 				    "bind with uninitialized __sin6_src_id "
929 				    "(%d) on socket. Pid = %d\n",
930 				    (int)sin6->__sin6_src_id,
931 				    (int)curproc->p_pid);
932 			}
933 #endif /* DEBUG */
934 			break;
935 		}
936 		default:
937 			/*
938 			 * Don't do any length or sa_family check to allow
939 			 * non-sockaddr style addresses.
940 			 */
941 			if (name == NULL) {
942 				error = EINVAL;
943 				eprintsoline(so, error);
944 				goto done;
945 			}
946 			break;
947 		}
948 
949 		if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
950 			error = ENAMETOOLONG;
951 			eprintsoline(so, error);
952 			goto done;
953 		}
954 		/*
955 		 * Save local address.
956 		 */
957 		sti->sti_laddr_len = (socklen_t)namelen;
958 		ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
959 		bcopy(name, sti->sti_laddr_sa, namelen);
960 
961 		addr = sti->sti_laddr_sa;
962 		addrlen = (t_uscalar_t)sti->sti_laddr_len;
963 		switch (so->so_family) {
964 		case AF_INET6:
965 		case AF_INET:
966 			break;
967 		case AF_UNIX: {
968 			struct sockaddr_un *soun =
969 			    (struct sockaddr_un *)sti->sti_laddr_sa;
970 			struct vnode *vp, *rvp;
971 			struct vattr vattr;
972 
973 			ASSERT(sti->sti_ux_bound_vp == NULL);
974 			/*
975 			 * Create vnode for the specified path name.
976 			 * Keep vnode held with a reference in sti_ux_bound_vp.
977 			 * Use the vnode pointer as the address used in the
978 			 * bind with the transport.
979 			 *
980 			 * Use the same mode as in BSD. In particular this does
981 			 * not observe the umask.
982 			 */
983 			/* MAXPATHLEN + soun_family + nul termination */
984 			if (sti->sti_laddr_len >
985 			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
986 				error = ENAMETOOLONG;
987 				eprintsoline(so, error);
988 				goto done;
989 			}
990 			vattr.va_type = VSOCK;
991 			vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
992 			vattr.va_mask = AT_TYPE|AT_MODE;
993 			/* NOTE: holding so_lock */
994 			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
995 			    EXCL, 0, &vp, CRMKNOD, 0, 0);
996 			if (error) {
997 				if (error == EEXIST)
998 					error = EADDRINUSE;
999 				eprintsoline(so, error);
1000 				goto done;
1001 			}
1002 			/*
1003 			 * Establish pointer from the underlying filesystem
1004 			 * vnode to the socket node.
1005 			 * sti_ux_bound_vp and v_stream->sd_vnode form the
1006 			 * cross-linkage between the underlying filesystem
1007 			 * node and the socket node.
1008 			 */
1009 
1010 			if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
1011 				VN_HOLD(rvp);
1012 				VN_RELE(vp);
1013 				vp = rvp;
1014 			}
1015 
1016 			ASSERT(SOTOV(so)->v_stream);
1017 			mutex_enter(&vp->v_lock);
1018 			vp->v_stream = SOTOV(so)->v_stream;
1019 			sti->sti_ux_bound_vp = vp;
1020 			mutex_exit(&vp->v_lock);
1021 
1022 			/*
1023 			 * Use the vnode pointer value as a unique address
1024 			 * (together with the magic number to avoid conflicts
1025 			 * with implicit binds) in the transport provider.
1026 			 */
1027 			sti->sti_ux_laddr.soua_vp =
1028 			    (void *)sti->sti_ux_bound_vp;
1029 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
1030 			addr = &sti->sti_ux_laddr;
1031 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1032 			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1033 			    addrlen,
1034 			    (void *)((struct so_ux_addr *)addr)->soua_vp));
1035 			break;
1036 		}
1037 		} /* end switch (so->so_family) */
1038 	}
1039 
1040 	/*
1041 	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1042 	 * the transport can start passing up T_CONN_IND messages
1043 	 * as soon as it receives the bind req and strsock_proto()
1044 	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1045 	 */
1046 	if (flags & _SOBIND_LISTEN) {
1047 		if ((so->so_state & SS_ACCEPTCONN) == 0)
1048 			clear_acceptconn_on_err = B_TRUE;
1049 		save_so_backlog = so->so_backlog;
1050 		restore_backlog_on_err = B_TRUE;
1051 		so->so_state |= SS_ACCEPTCONN;
1052 		so->so_backlog = backlog;
1053 	}
1054 
1055 	/*
1056 	 * If NL7C addr(s) have been configured check for addr/port match,
1057 	 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
1058 	 *
1059 	 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
1060 	 * family sockets only. If match mark as such.
1061 	 */
1062 	if (nl7c_enabled && ((addr != NULL &&
1063 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
1064 	    (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
1065 	    sti->sti_nl7c_flags == NL7C_AF_NCA)) {
1066 		/*
1067 		 * NL7C is not supported in non-global zones,
1068 		 * we enforce this restriction here.
1069 		 */
1070 		if (so->so_zoneid == GLOBAL_ZONEID) {
1071 			/* An NL7C socket, mark it */
1072 			sti->sti_nl7c_flags |= NL7C_ENABLED;
1073 			if (nl7c == NULL) {
1074 				/*
1075 				 * Was an AF_NCA bind() so add it to the
1076 				 * addr list for reporting purposes.
1077 				 */
1078 				nl7c = nl7c_add_addr(addr, addrlen);
1079 			}
1080 		} else
1081 			nl7c = NULL;
1082 	}
1083 
1084 	/*
1085 	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1086 	 * for other transports we will send in a O_T_BIND_REQ.
1087 	 */
1088 	if (tcp_udp_xport &&
1089 	    (so->so_family == AF_INET || so->so_family == AF_INET6))
1090 		PRIM_type = T_BIND_REQ;
1091 
1092 	bind_req.PRIM_type = PRIM_type;
1093 	bind_req.ADDR_length = addrlen;
1094 	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1095 	bind_req.CONIND_number = backlog;
1096 	/* NOTE: holding so_lock while sleeping */
1097 	mp = soallocproto2(&bind_req, sizeof (bind_req),
1098 	    addr, addrlen, 0, _ALLOC_SLEEP, cr);
1099 	sti->sti_laddr_valid = 0;
1100 
1101 	/* Done using sti_laddr_sa - can drop the lock */
1102 	mutex_exit(&so->so_lock);
1103 
1104 	/*
1105 	 * Intercept the bind_req message here to check if this <address/port>
1106 	 * was configured as an SSL proxy server, or if another endpoint was
1107 	 * already configured to act as a proxy for us.
1108 	 *
1109 	 * Note, only if NL7C not enabled for this socket.
1110 	 */
1111 	if (nl7c == NULL &&
1112 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
1113 	    so->so_type == SOCK_STREAM) {
1114 
1115 		if (sti->sti_kssl_ent != NULL) {
1116 			kssl_release_ent(sti->sti_kssl_ent, so,
1117 			    sti->sti_kssl_type);
1118 			sti->sti_kssl_ent = NULL;
1119 		}
1120 
1121 		sti->sti_kssl_type = kssl_check_proxy(mp, so,
1122 		    &sti->sti_kssl_ent);
1123 		switch (sti->sti_kssl_type) {
1124 		case KSSL_NO_PROXY:
1125 			break;
1126 
1127 		case KSSL_HAS_PROXY:
1128 			mutex_enter(&so->so_lock);
1129 			goto skip_transport;
1130 
1131 		case KSSL_IS_PROXY:
1132 			break;
1133 		}
1134 	}
1135 
1136 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1137 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1138 	if (error) {
1139 		eprintsoline(so, error);
1140 		mutex_enter(&so->so_lock);
1141 		goto done;
1142 	}
1143 
1144 	mutex_enter(&so->so_lock);
1145 	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1146 	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1147 	if (error) {
1148 		eprintsoline(so, error);
1149 		goto done;
1150 	}
1151 skip_transport:
1152 	ASSERT(mp);
1153 	/*
1154 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1155 	 * strsock_proto while the lock was dropped above, the bind
1156 	 * is allowed to complete.
1157 	 */
1158 
1159 	/* Mark as bound. This will be undone if we detect errors below. */
1160 	if (flags & _SOBIND_NOXLATE) {
1161 		ASSERT(so->so_family == AF_UNIX);
1162 		sti->sti_faddr_noxlate = 1;
1163 	}
1164 	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1165 	so->so_state |= SS_ISBOUND;
1166 	ASSERT(sti->sti_unbind_mp);
1167 
1168 	/* note that we've already set SS_ACCEPTCONN above */
1169 
1170 	/*
1171 	 * Recompute addrlen - an unspecified bind sent down an
1172 	 * address of length zero but we expect the appropriate length
1173 	 * in return.
1174 	 */
1175 	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1176 	    sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1177 
1178 	bind_ack = (struct T_bind_ack *)mp->b_rptr;
1179 	/*
1180 	 * The alignment restriction is really too strict but
1181 	 * we want enough alignment to inspect the fields of
1182 	 * a sockaddr_in.
1183 	 */
1184 	addr = sogetoff(mp, bind_ack->ADDR_offset,
1185 	    bind_ack->ADDR_length,
1186 	    __TPI_ALIGN_SIZE);
1187 	if (addr == NULL) {
1188 		freemsg(mp);
1189 		error = EPROTO;
1190 		eprintsoline(so, error);
1191 		goto done;
1192 	}
1193 	if (!(flags & _SOBIND_UNSPEC)) {
1194 		/*
1195 		 * Verify that the transport didn't return something we
1196 		 * did not want e.g. an address other than what we asked for.
1197 		 *
1198 		 * NOTE: These checks would go away if/when we switch to
1199 		 * using the new TPI (in which the transport would fail
1200 		 * the request instead of assigning a different address).
1201 		 *
1202 		 * NOTE2: For protocols that we don't know (i.e. any
1203 		 * other than AF_INET6, AF_INET and AF_UNIX), we
1204 		 * cannot know if the transport should be expected to
1205 		 * return the same address as that requested.
1206 		 *
1207 		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1208 		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1209 		 *
1210 		 * For example, in the case of netatalk it may be
1211 		 * inappropriate for the transport to return the
1212 		 * requested address (as it may have allocated a local
1213 		 * port number in behaviour similar to that of an
1214 		 * AF_INET bind request with a port number of zero).
1215 		 *
1216 		 * Given the definition of O_T_BIND_REQ, where the
1217 		 * transport may bind to an address other than the
1218 		 * requested address, it's not possible to determine
1219 		 * whether a returned address that differs from the
1220 		 * requested address is a reason to fail (because the
1221 		 * requested address was not available) or succeed
1222 		 * (because the transport allocated an appropriate
1223 		 * address and/or port).
1224 		 *
1225 		 * sockfs currently requires that the transport return
1226 		 * the requested address in the T_BIND_ACK, unless
1227 		 * there is code here to allow for any discrepancy.
1228 		 * Such code exists for AF_INET and AF_INET6.
1229 		 *
1230 		 * Netatalk chooses to return the requested address
1231 		 * rather than the (correct) allocated address.  This
1232 		 * means that netatalk violates the TPI specification
1233 		 * (and would not function correctly if used from a
1234 		 * TLI application), but it does mean that it works
1235 		 * with sockfs.
1236 		 *
1237 		 * As noted above, using the newer XTI bind primitive
1238 		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1239 		 * allow sockfs to be more sure about whether or not
1240 		 * the bind request had succeeded (as transports are
1241 		 * not permitted to bind to a different address than
1242 		 * that requested - they must return failure).
1243 		 * Unfortunately, support for T_BIND_REQ may not be
1244 		 * present in all transport implementations (netatalk,
1245 		 * for example, doesn't have it), making the
1246 		 * transition difficult.
1247 		 */
1248 		if (bind_ack->ADDR_length != addrlen) {
1249 			/* Assumes that the requested address was in use */
1250 			freemsg(mp);
1251 			error = EADDRINUSE;
1252 			eprintsoline(so, error);
1253 			goto done;
1254 		}
1255 
1256 		switch (so->so_family) {
1257 		case AF_INET6:
1258 		case AF_INET: {
1259 			sin_t *rname, *aname;
1260 
1261 			rname = (sin_t *)addr;
1262 			aname = (sin_t *)sti->sti_laddr_sa;
1263 
1264 			/*
1265 			 * Take advantage of the alignment
1266 			 * of sin_port and sin6_port which fall
1267 			 * in the same place in their data structures.
1268 			 * Just use sin_port for either address family.
1269 			 *
1270 			 * This may become a problem if (heaven forbid)
1271 			 * there's a separate ipv6port_reserved... :-P
1272 			 *
1273 			 * Binding to port 0 has the semantics of letting
1274 			 * the transport bind to any port.
1275 			 *
1276 			 * If the transport is TCP or UDP since we had sent
1277 			 * a T_BIND_REQ we would not get a port other than
1278 			 * what we asked for.
1279 			 */
1280 			if (tcp_udp_xport) {
1281 				/*
1282 				 * Pick up the new port number if we bound to
1283 				 * port 0.
1284 				 */
1285 				if (aname->sin_port == 0)
1286 					aname->sin_port = rname->sin_port;
1287 				sti->sti_laddr_valid = 1;
1288 				break;
1289 			}
1290 			if (aname->sin_port != 0 &&
1291 			    aname->sin_port != rname->sin_port) {
1292 				freemsg(mp);
1293 				error = EADDRINUSE;
1294 				eprintsoline(so, error);
1295 				goto done;
1296 			}
1297 			/*
1298 			 * Pick up the new port number if we bound to port 0.
1299 			 */
1300 			aname->sin_port = rname->sin_port;
1301 
1302 			/*
1303 			 * Unfortunately, addresses aren't _quite_ the same.
1304 			 */
1305 			if (so->so_family == AF_INET) {
1306 				if (aname->sin_addr.s_addr !=
1307 				    rname->sin_addr.s_addr) {
1308 					freemsg(mp);
1309 					error = EADDRNOTAVAIL;
1310 					eprintsoline(so, error);
1311 					goto done;
1312 				}
1313 			} else {
1314 				sin6_t *rname6 = (sin6_t *)rname;
1315 				sin6_t *aname6 = (sin6_t *)aname;
1316 
1317 				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1318 				    &rname6->sin6_addr)) {
1319 					freemsg(mp);
1320 					error = EADDRNOTAVAIL;
1321 					eprintsoline(so, error);
1322 					goto done;
1323 				}
1324 			}
1325 			break;
1326 		}
1327 		case AF_UNIX:
1328 			if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1329 				freemsg(mp);
1330 				error = EADDRINUSE;
1331 				eprintsoline(so, error);
1332 				eprintso(so,
1333 				    ("addrlen %d, addr 0x%x, vp %p\n",
1334 				    addrlen, *((int *)addr),
1335 				    (void *)sti->sti_ux_bound_vp));
1336 				goto done;
1337 			}
1338 			sti->sti_laddr_valid = 1;
1339 			break;
1340 		default:
1341 			/*
1342 			 * NOTE: This assumes that addresses can be
1343 			 * byte-compared for equivalence.
1344 			 */
1345 			if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1346 				freemsg(mp);
1347 				error = EADDRINUSE;
1348 				eprintsoline(so, error);
1349 				goto done;
1350 			}
1351 			/*
1352 			 * Don't mark sti_laddr_valid, as we cannot be
1353 			 * sure that the returned address is the real
1354 			 * bound address when talking to an unknown
1355 			 * transport.
1356 			 */
1357 			break;
1358 		}
1359 	} else {
1360 		/*
1361 		 * Save the returned address for getsockname.
1362 		 * Needed for unspecific bind unless transport supports
1363 		 * the TI_GETMYNAME ioctl.
1364 		 * Do this for AF_INET{,6} even though they do, as
1365 		 * caching info here is much better performance than
1366 		 * a TPI/STREAMS trip to the transport for getsockname.
1367 		 * Any which can't for some reason _must_ _not_ set
1368 		 * sti_laddr_valid here for the caching version of
1369 		 * getsockname to not break;
1370 		 */
1371 		switch (so->so_family) {
1372 		case AF_UNIX:
1373 			/*
1374 			 * Record the address bound with the transport
1375 			 * for use by socketpair.
1376 			 */
1377 			bcopy(addr, &sti->sti_ux_laddr, addrlen);
1378 			sti->sti_laddr_valid = 1;
1379 			break;
1380 		case AF_INET:
1381 		case AF_INET6:
1382 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1383 			bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1384 			sti->sti_laddr_valid = 1;
1385 			break;
1386 		default:
1387 			/*
1388 			 * Don't mark sti_laddr_valid, as we cannot be
1389 			 * sure that the returned address is the real
1390 			 * bound address when talking to an unknown
1391 			 * transport.
1392 			 */
1393 			break;
1394 		}
1395 	}
1396 
1397 	if (nl7c != NULL) {
1398 		/* Register listen()er sonode pointer with NL7C */
1399 		nl7c_listener_addr(nl7c, so);
1400 	}
1401 
1402 	freemsg(mp);
1403 
1404 done:
1405 	if (error) {
1406 		/* reset state & backlog to values held on entry */
1407 		if (clear_acceptconn_on_err == B_TRUE)
1408 			so->so_state &= ~SS_ACCEPTCONN;
1409 		if (restore_backlog_on_err == B_TRUE)
1410 			so->so_backlog = save_so_backlog;
1411 
1412 		if (unbind_on_err && so->so_state & SS_ISBOUND) {
1413 			int err;
1414 
1415 			err = sotpi_unbind(so, 0);
1416 			/* LINTED - statement has no consequent: if */
1417 			if (err) {
1418 				eprintsoline(so, error);
1419 			} else {
1420 				ASSERT(!(so->so_state & SS_ISBOUND));
1421 			}
1422 		}
1423 	}
1424 	if (!(flags & _SOBIND_LOCK_HELD)) {
1425 		so_unlock_single(so, SOLOCKED);
1426 		mutex_exit(&so->so_lock);
1427 	} else {
1428 		ASSERT(MUTEX_HELD(&so->so_lock));
1429 		ASSERT(so->so_flag & SOLOCKED);
1430 	}
1431 	return (error);
1432 }
1433 
1434 /* bind the socket */
1435 static int
1436 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1437     int flags, struct cred *cr)
1438 {
1439 	if ((flags & _SOBIND_SOCKETPAIR) == 0)
1440 		return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1441 
1442 	flags &= ~_SOBIND_SOCKETPAIR;
1443 	return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1444 }
1445 
1446 /*
1447  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1448  * address, or when listen needs to unbind and bind.
1449  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1450  * so that a sobind can pick them up.
1451  */
1452 static int
1453 sotpi_unbind(struct sonode *so, int flags)
1454 {
1455 	struct T_unbind_req	unbind_req;
1456 	int			error = 0;
1457 	mblk_t			*mp;
1458 	sotpi_info_t		*sti = SOTOTPI(so);
1459 
1460 	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1461 	    (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1462 
1463 	ASSERT(MUTEX_HELD(&so->so_lock));
1464 	ASSERT(so->so_flag & SOLOCKED);
1465 
1466 	if (!(so->so_state & SS_ISBOUND)) {
1467 		error = EINVAL;
1468 		eprintsoline(so, error);
1469 		goto done;
1470 	}
1471 
1472 	mutex_exit(&so->so_lock);
1473 
1474 	/*
1475 	 * Flush the read and write side (except stream head read queue)
1476 	 * and send down T_UNBIND_REQ.
1477 	 */
1478 	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1479 
1480 	unbind_req.PRIM_type = T_UNBIND_REQ;
1481 	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1482 	    0, _ALLOC_SLEEP, CRED());
1483 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1484 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1485 	mutex_enter(&so->so_lock);
1486 	if (error) {
1487 		eprintsoline(so, error);
1488 		goto done;
1489 	}
1490 
1491 	error = sowaitokack(so, T_UNBIND_REQ);
1492 	if (error) {
1493 		eprintsoline(so, error);
1494 		goto done;
1495 	}
1496 
1497 	/*
1498 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1499 	 * strsock_proto while the lock was dropped above, the unbind
1500 	 * is allowed to complete.
1501 	 */
1502 	if (!(flags & _SOUNBIND_REBIND)) {
1503 		/*
1504 		 * Clear out bound address.
1505 		 */
1506 		vnode_t *vp;
1507 
1508 		if ((vp = sti->sti_ux_bound_vp) != NULL) {
1509 
1510 			/* Undo any SSL proxy setup */
1511 			if ((so->so_family == AF_INET ||
1512 			    so->so_family == AF_INET6) &&
1513 			    (so->so_type == SOCK_STREAM) &&
1514 			    (sti->sti_kssl_ent != NULL)) {
1515 				kssl_release_ent(sti->sti_kssl_ent, so,
1516 				    sti->sti_kssl_type);
1517 				sti->sti_kssl_ent = NULL;
1518 				sti->sti_kssl_type = KSSL_NO_PROXY;
1519 			}
1520 			sti->sti_ux_bound_vp = NULL;
1521 			vn_rele_stream(vp);
1522 		}
1523 		/* Clear out address */
1524 		sti->sti_laddr_len = 0;
1525 	}
1526 	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1527 	sti->sti_laddr_valid = 0;
1528 
1529 done:
1530 
1531 	/* If the caller held the lock don't release it here */
1532 	ASSERT(MUTEX_HELD(&so->so_lock));
1533 	ASSERT(so->so_flag & SOLOCKED);
1534 
1535 	return (error);
1536 }
1537 
1538 /*
1539  * listen on the socket.
1540  * For TPI conforming transports this has to first unbind with the transport
1541  * and then bind again using the new backlog.
1542  */
1543 /* ARGSUSED */
1544 int
1545 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1546 {
1547 	int		error = 0;
1548 	sotpi_info_t	*sti = SOTOTPI(so);
1549 
1550 	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1551 	    (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1552 
1553 	if (sti->sti_serv_type == T_CLTS)
1554 		return (EOPNOTSUPP);
1555 
1556 	/*
1557 	 * If the socket is ready to accept connections already, then
1558 	 * return without doing anything.  This avoids a problem where
1559 	 * a second listen() call fails if a connection is pending and
1560 	 * leaves the socket unbound. Only when we are not unbinding
1561 	 * with the transport can we safely increase the backlog.
1562 	 */
1563 	if (so->so_state & SS_ACCEPTCONN &&
1564 	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1565 	    /*CONSTCOND*/
1566 	    !solisten_tpi_tcp))
1567 		return (0);
1568 
1569 	if (so->so_state & SS_ISCONNECTED)
1570 		return (EINVAL);
1571 
1572 	mutex_enter(&so->so_lock);
1573 	so_lock_single(so);	/* Set SOLOCKED */
1574 
1575 	/*
1576 	 * If the listen doesn't change the backlog we do nothing.
1577 	 * This avoids an EPROTO error from the transport.
1578 	 */
1579 	if ((so->so_state & SS_ACCEPTCONN) &&
1580 	    so->so_backlog == backlog)
1581 		goto done;
1582 
1583 	if (!(so->so_state & SS_ISBOUND)) {
1584 		/*
1585 		 * Must have been explicitly bound in the UNIX domain.
1586 		 */
1587 		if (so->so_family == AF_UNIX) {
1588 			error = EINVAL;
1589 			goto done;
1590 		}
1591 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1592 		    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1593 	} else if (backlog > 0) {
1594 		/*
1595 		 * AF_INET{,6} hack to avoid losing the port.
1596 		 * Assumes that all AF_INET{,6} transports can handle a
1597 		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1598 		 * has already bound thus it is possible to avoid the unbind.
1599 		 */
1600 		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1601 		    /*CONSTCOND*/
1602 		    !solisten_tpi_tcp)) {
1603 			error = sotpi_unbind(so, _SOUNBIND_REBIND);
1604 			if (error)
1605 				goto done;
1606 		}
1607 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1608 		    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1609 	} else {
1610 		so->so_state |= SS_ACCEPTCONN;
1611 		so->so_backlog = backlog;
1612 	}
1613 	if (error)
1614 		goto done;
1615 	ASSERT(so->so_state & SS_ACCEPTCONN);
1616 done:
1617 	so_unlock_single(so, SOLOCKED);
1618 	mutex_exit(&so->so_lock);
1619 	return (error);
1620 }
1621 
1622 /*
1623  * Disconnect either a specified seqno or all (-1).
1624  * The former is used on listening sockets only.
1625  *
1626  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1627  * the current use of sodisconnect(seqno == -1) is only for shutdown
1628  * so there is no point (and potentially incorrect) to unbind.
1629  */
1630 static int
1631 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1632 {
1633 	struct T_discon_req	discon_req;
1634 	int			error = 0;
1635 	mblk_t			*mp;
1636 
1637 	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1638 	    (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1639 
1640 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1641 		mutex_enter(&so->so_lock);
1642 		so_lock_single(so);	/* Set SOLOCKED */
1643 	} else {
1644 		ASSERT(MUTEX_HELD(&so->so_lock));
1645 		ASSERT(so->so_flag & SOLOCKED);
1646 	}
1647 
1648 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1649 		error = EINVAL;
1650 		eprintsoline(so, error);
1651 		goto done;
1652 	}
1653 
1654 	mutex_exit(&so->so_lock);
1655 	/*
1656 	 * Flush the write side (unless this is a listener)
1657 	 * and then send down a T_DISCON_REQ.
1658 	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1659 	 * and other messages.)
1660 	 */
1661 	if (!(so->so_state & SS_ACCEPTCONN))
1662 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1663 
1664 	discon_req.PRIM_type = T_DISCON_REQ;
1665 	discon_req.SEQ_number = seqno;
1666 	mp = soallocproto1(&discon_req, sizeof (discon_req),
1667 	    0, _ALLOC_SLEEP, CRED());
1668 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1669 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1670 	mutex_enter(&so->so_lock);
1671 	if (error) {
1672 		eprintsoline(so, error);
1673 		goto done;
1674 	}
1675 
1676 	error = sowaitokack(so, T_DISCON_REQ);
1677 	if (error) {
1678 		eprintsoline(so, error);
1679 		goto done;
1680 	}
1681 	/*
1682 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1683 	 * strsock_proto while the lock was dropped above, the disconnect
1684 	 * is allowed to complete. However, it is not possible to
1685 	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1686 	 */
1687 	so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
1688 	SOTOTPI(so)->sti_laddr_valid = 0;
1689 	SOTOTPI(so)->sti_faddr_valid = 0;
1690 done:
1691 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1692 		so_unlock_single(so, SOLOCKED);
1693 		mutex_exit(&so->so_lock);
1694 	} else {
1695 		/* If the caller held the lock don't release it here */
1696 		ASSERT(MUTEX_HELD(&so->so_lock));
1697 		ASSERT(so->so_flag & SOLOCKED);
1698 	}
1699 	return (error);
1700 }
1701 
1702 /* ARGSUSED */
1703 int
1704 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1705     struct sonode **nsop)
1706 {
1707 	struct T_conn_ind	*conn_ind;
1708 	struct T_conn_res	*conn_res;
1709 	int			error = 0;
1710 	mblk_t			*mp, *ctxmp, *ack_mp;
1711 	struct sonode		*nso;
1712 	vnode_t			*nvp;
1713 	void			*src;
1714 	t_uscalar_t		srclen;
1715 	void			*opt;
1716 	t_uscalar_t		optlen;
1717 	t_scalar_t		PRIM_type;
1718 	t_scalar_t		SEQ_number;
1719 	size_t			sinlen;
1720 	sotpi_info_t		*sti = SOTOTPI(so);
1721 	sotpi_info_t		*nsti;
1722 
1723 	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1724 	    (void *)so, fflag, (void *)nsop,
1725 	    pr_state(so->so_state, so->so_mode)));
1726 
1727 	/*
1728 	 * Defer single-threading the accepting socket until
1729 	 * the T_CONN_IND has been received and parsed and the
1730 	 * new sonode has been opened.
1731 	 */
1732 
1733 	/* Check that we are not already connected */
1734 	if ((so->so_state & SS_ACCEPTCONN) == 0)
1735 		goto conn_bad;
1736 again:
1737 	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1738 		goto e_bad;
1739 
1740 	ASSERT(mp != NULL);
1741 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1742 	ctxmp = mp->b_cont;
1743 
1744 	/*
1745 	 * Save SEQ_number for error paths.
1746 	 */
1747 	SEQ_number = conn_ind->SEQ_number;
1748 
1749 	srclen = conn_ind->SRC_length;
1750 	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1751 	if (src == NULL) {
1752 		error = EPROTO;
1753 		freemsg(mp);
1754 		eprintsoline(so, error);
1755 		goto disconnect_unlocked;
1756 	}
1757 	optlen = conn_ind->OPT_length;
1758 	switch (so->so_family) {
1759 	case AF_INET:
1760 	case AF_INET6:
1761 		if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1762 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
1763 			    &opt, conn_ind->OPT_length);
1764 		} else {
1765 			/*
1766 			 * The transport (in this case TCP) hasn't sent up
1767 			 * a pointer to an instance for the accept fast-path.
1768 			 * Disable fast-path completely because the call to
1769 			 * sotpi_create() below would otherwise create an
1770 			 * incomplete TCP instance, which would lead to
1771 			 * problems when sockfs sends a normal T_CONN_RES
1772 			 * message down the new stream.
1773 			 */
1774 			if (sti->sti_direct) {
1775 				int rval;
1776 				/*
1777 				 * For consistency we inform tcp to disable
1778 				 * direct interface on the listener, though
1779 				 * we can certainly live without doing this
1780 				 * because no data will ever travel upstream
1781 				 * on the listening socket.
1782 				 */
1783 				sti->sti_direct = 0;
1784 				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1785 				    0, 0, K_TO_K, cr, &rval);
1786 			}
1787 			opt = NULL;
1788 			optlen = 0;
1789 		}
1790 		break;
1791 	case AF_UNIX:
1792 	default:
1793 		if (optlen != 0) {
1794 			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1795 			    __TPI_ALIGN_SIZE);
1796 			if (opt == NULL) {
1797 				error = EPROTO;
1798 				freemsg(mp);
1799 				eprintsoline(so, error);
1800 				goto disconnect_unlocked;
1801 			}
1802 		}
1803 		if (so->so_family == AF_UNIX) {
1804 			if (!sti->sti_faddr_noxlate) {
1805 				src = NULL;
1806 				srclen = 0;
1807 			}
1808 			/* Extract src address from options */
1809 			if (optlen != 0)
1810 				so_getopt_srcaddr(opt, optlen, &src, &srclen);
1811 		}
1812 		break;
1813 	}
1814 
1815 	/*
1816 	 * Create the new socket.
1817 	 */
1818 	nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1819 	if (nso == NULL) {
1820 		ASSERT(error != 0);
1821 		/*
1822 		 * Accept can not fail with ENOBUFS. sotpi_create
1823 		 * sleeps waiting for memory until a signal is caught
1824 		 * so return EINTR.
1825 		 */
1826 		freemsg(mp);
1827 		if (error == ENOBUFS)
1828 			error = EINTR;
1829 		goto e_disc_unl;
1830 	}
1831 	nvp = SOTOV(nso);
1832 	nsti = SOTOTPI(nso);
1833 
1834 	/*
1835 	 * If the transport sent up an SSL connection context, then attach
1836 	 * it the new socket, and set the (sd_wputdatafunc)() and
1837 	 * (sd_rputdatafunc)() stream head hooks to intercept and process
1838 	 * SSL records.
1839 	 */
1840 	if (ctxmp != NULL) {
1841 		/*
1842 		 * This kssl_ctx_t is already held for us by the transport.
1843 		 * So, we don't need to do a kssl_hold_ctx() here.
1844 		 */
1845 		nsti->sti_kssl_ctx = *((kssl_ctx_t *)ctxmp->b_rptr);
1846 		freemsg(ctxmp);
1847 		mp->b_cont = NULL;
1848 		strsetrwputdatahooks(nvp, strsock_kssl_input,
1849 		    strsock_kssl_output);
1850 
1851 		/* Disable sodirect if any */
1852 		if (nso->so_direct != NULL) {
1853 			mutex_enter(nso->so_direct->sod_lockp);
1854 			SOD_DISABLE(nso->so_direct);
1855 			mutex_exit(nso->so_direct->sod_lockp);
1856 		}
1857 	}
1858 #ifdef DEBUG
1859 	/*
1860 	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1861 	 * it's inherited early to allow debugging of the accept code itself.
1862 	 */
1863 	nso->so_options |= so->so_options & SO_DEBUG;
1864 #endif /* DEBUG */
1865 
1866 	/*
1867 	 * Save the SRC address from the T_CONN_IND
1868 	 * for getpeername to work on AF_UNIX and on transports that do not
1869 	 * support TI_GETPEERNAME.
1870 	 *
1871 	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1872 	 * copyin_name().
1873 	 */
1874 	if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1875 		error = EINVAL;
1876 		freemsg(mp);
1877 		eprintsoline(so, error);
1878 		goto disconnect_vp_unlocked;
1879 	}
1880 	nsti->sti_faddr_len = (socklen_t)srclen;
1881 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1882 	bcopy(src, nsti->sti_faddr_sa, srclen);
1883 	nsti->sti_faddr_valid = 1;
1884 
1885 	/*
1886 	 * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1887 	 */
1888 	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1889 	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1890 		cred_t	*cr;
1891 		pid_t	cpid;
1892 
1893 		cr = msg_getcred(mp, &cpid);
1894 		if (cr != NULL) {
1895 			crhold(cr);
1896 			nso->so_peercred = cr;
1897 			nso->so_cpid = cpid;
1898 		}
1899 		freemsg(mp);
1900 
1901 		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1902 		    sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1903 		if (mp == NULL) {
1904 			/*
1905 			 * Accept can not fail with ENOBUFS.
1906 			 * A signal was caught so return EINTR.
1907 			 */
1908 			error = EINTR;
1909 			eprintsoline(so, error);
1910 			goto disconnect_vp_unlocked;
1911 		}
1912 		conn_res = (struct T_conn_res *)mp->b_rptr;
1913 	} else {
1914 		/*
1915 		 * For efficiency reasons we use msg_extractcred; no crhold
1916 		 * needed since db_credp is cleared (i.e., we move the cred
1917 		 * from the message to so_peercred).
1918 		 */
1919 		nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1920 
1921 		mp->b_rptr = DB_BASE(mp);
1922 		conn_res = (struct T_conn_res *)mp->b_rptr;
1923 		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1924 
1925 		mblk_setcred(mp, cr, curproc->p_pid);
1926 	}
1927 
1928 	/*
1929 	 * New socket must be bound at least in sockfs and, except for AF_INET,
1930 	 * (or AF_INET6) it also has to be bound in the transport provider.
1931 	 * We set the local address in the sonode from the T_OK_ACK of the
1932 	 * T_CONN_RES. For this reason the address we bind to here isn't
1933 	 * important.
1934 	 */
1935 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1936 	    /*CONSTCOND*/
1937 	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1938 		/*
1939 		 * Optimization for AF_INET{,6} transports
1940 		 * that can handle a T_CONN_RES without being bound.
1941 		 */
1942 		mutex_enter(&nso->so_lock);
1943 		so_automatic_bind(nso);
1944 		mutex_exit(&nso->so_lock);
1945 	} else {
1946 		/* Perform NULL bind with the transport provider. */
1947 		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1948 		    cr)) != 0) {
1949 			ASSERT(error != ENOBUFS);
1950 			freemsg(mp);
1951 			eprintsoline(nso, error);
1952 			goto disconnect_vp_unlocked;
1953 		}
1954 	}
1955 
1956 	/*
1957 	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1958 	 * so that any data arriving on the new socket will cause the
1959 	 * appropriate signals to be delivered for the new socket.
1960 	 *
1961 	 * No other thread (except strsock_proto and strsock_misc)
1962 	 * can access the new socket thus we relax the locking.
1963 	 */
1964 	nso->so_pgrp = so->so_pgrp;
1965 	nso->so_state |= so->so_state & SS_ASYNC;
1966 	nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1967 
1968 	if (nso->so_pgrp != 0) {
1969 		if ((error = so_set_events(nso, nvp, cr)) != 0) {
1970 			eprintsoline(nso, error);
1971 			error = 0;
1972 			nso->so_pgrp = 0;
1973 		}
1974 	}
1975 
1976 	/*
1977 	 * Make note of the socket level options. TCP and IP level options
1978 	 * are already inherited. We could do all this after accept is
1979 	 * successful but doing it here simplifies code and no harm done
1980 	 * for error case.
1981 	 */
1982 	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1983 	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1984 	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1985 	nso->so_sndbuf = so->so_sndbuf;
1986 	nso->so_rcvbuf = so->so_rcvbuf;
1987 	if (nso->so_options & SO_LINGER)
1988 		nso->so_linger = so->so_linger;
1989 
1990 	/*
1991 	 * Note that the following sti_direct code path should be
1992 	 * removed once we are confident that the direct sockets
1993 	 * do not result in any degradation.
1994 	 */
1995 	if (sti->sti_direct) {
1996 
1997 		ASSERT(opt != NULL);
1998 
1999 		conn_res->OPT_length = optlen;
2000 		conn_res->OPT_offset = MBLKL(mp);
2001 		bcopy(&opt, mp->b_wptr, optlen);
2002 		mp->b_wptr += optlen;
2003 		conn_res->PRIM_type = T_CONN_RES;
2004 		conn_res->ACCEPTOR_id = 0;
2005 		PRIM_type = T_CONN_RES;
2006 
2007 		/* Send down the T_CONN_RES on acceptor STREAM */
2008 		error = kstrputmsg(SOTOV(nso), mp, NULL,
2009 		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2010 		if (error) {
2011 			mutex_enter(&so->so_lock);
2012 			so_lock_single(so);
2013 			eprintsoline(so, error);
2014 			goto disconnect_vp;
2015 		}
2016 		mutex_enter(&nso->so_lock);
2017 		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
2018 		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2019 		if (error) {
2020 			mutex_exit(&nso->so_lock);
2021 			mutex_enter(&so->so_lock);
2022 			so_lock_single(so);
2023 			eprintsoline(so, error);
2024 			goto disconnect_vp;
2025 		}
2026 		if (nso->so_family == AF_INET) {
2027 			sin_t *sin;
2028 
2029 			sin = (sin_t *)(ack_mp->b_rptr +
2030 			    sizeof (struct T_ok_ack));
2031 			bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
2032 			nsti->sti_laddr_len = sizeof (sin_t);
2033 		} else {
2034 			sin6_t *sin6;
2035 
2036 			sin6 = (sin6_t *)(ack_mp->b_rptr +
2037 			    sizeof (struct T_ok_ack));
2038 			bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
2039 			nsti->sti_laddr_len = sizeof (sin6_t);
2040 		}
2041 		freemsg(ack_mp);
2042 
2043 		nso->so_state |= SS_ISCONNECTED;
2044 		nso->so_proto_handle = (sock_lower_handle_t)opt;
2045 		nsti->sti_laddr_valid = 1;
2046 
2047 		if (sti->sti_nl7c_flags & NL7C_ENABLED) {
2048 			/*
2049 			 * A NL7C marked listen()er so the new socket
2050 			 * inherits the listen()er's NL7C state, except
2051 			 * for NL7C_POLLIN.
2052 			 *
2053 			 * Only call NL7C to process the new socket if
2054 			 * the listen socket allows blocking i/o.
2055 			 */
2056 			nsti->sti_nl7c_flags =
2057 			    sti->sti_nl7c_flags & (~NL7C_POLLIN);
2058 			if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
2059 				/*
2060 				 * Nonblocking accept() just make it
2061 				 * persist to defer processing to the
2062 				 * read-side syscall (e.g. read).
2063 				 */
2064 				nsti->sti_nl7c_flags |= NL7C_SOPERSIST;
2065 			} else if (nl7c_process(nso, B_FALSE)) {
2066 				/*
2067 				 * NL7C has completed processing on the
2068 				 * socket, close the socket and back to
2069 				 * the top to await the next T_CONN_IND.
2070 				 */
2071 				mutex_exit(&nso->so_lock);
2072 				(void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
2073 				    cr, NULL);
2074 				VN_RELE(nvp);
2075 				goto again;
2076 			}
2077 			/* Pass the new socket out */
2078 		}
2079 
2080 		mutex_exit(&nso->so_lock);
2081 
2082 		/*
2083 		 * It's possible, through the use of autopush for example,
2084 		 * that the acceptor stream may not support sti_direct
2085 		 * semantics. If the new socket does not support sti_direct
2086 		 * we issue a _SIOCSOCKFALLBACK to inform the transport
2087 		 * as we would in the I_PUSH case.
2088 		 */
2089 		if (nsti->sti_direct == 0) {
2090 			int	rval;
2091 
2092 			if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
2093 			    0, 0, K_TO_K, cr, &rval)) != 0) {
2094 				mutex_enter(&so->so_lock);
2095 				so_lock_single(so);
2096 				eprintsoline(so, error);
2097 				goto disconnect_vp;
2098 			}
2099 		}
2100 
2101 		/*
2102 		 * Pass out new socket.
2103 		 */
2104 		if (nsop != NULL)
2105 			*nsop = nso;
2106 
2107 		return (0);
2108 	}
2109 
2110 	/*
2111 	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
2112 	 * which don't support the FireEngine accept fast-path. It is also
2113 	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
2114 	 * again. Neither sockfs nor TCP attempt to find out if some other
2115 	 * random module has been inserted in between (in which case we
2116 	 * should follow TLI accept behaviour). We blindly assume the worst
2117 	 * case and revert back to old behaviour i.e. TCP will not send us
2118 	 * any option (eager) and the accept should happen on the listener
2119 	 * queue. Any queued T_conn_ind have already got their options removed
2120 	 * by so_sock2_stream() when "sockmod" was I_POP'd.
2121 	 */
2122 	/*
2123 	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
2124 	 */
2125 	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
2126 #ifdef	_ILP32
2127 		queue_t	*q;
2128 
2129 		/*
2130 		 * Find read queue in driver
2131 		 * Can safely do this since we "own" nso/nvp.
2132 		 */
2133 		q = strvp2wq(nvp)->q_next;
2134 		while (SAMESTR(q))
2135 			q = q->q_next;
2136 		q = RD(q);
2137 		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
2138 #else
2139 		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
2140 #endif	/* _ILP32 */
2141 		conn_res->PRIM_type = O_T_CONN_RES;
2142 		PRIM_type = O_T_CONN_RES;
2143 	} else {
2144 		conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
2145 		conn_res->PRIM_type = T_CONN_RES;
2146 		PRIM_type = T_CONN_RES;
2147 	}
2148 	conn_res->SEQ_number = SEQ_number;
2149 	conn_res->OPT_length = 0;
2150 	conn_res->OPT_offset = 0;
2151 
2152 	mutex_enter(&so->so_lock);
2153 	so_lock_single(so);	/* Set SOLOCKED */
2154 	mutex_exit(&so->so_lock);
2155 
2156 	error = kstrputmsg(SOTOV(so), mp, NULL,
2157 	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2158 	mutex_enter(&so->so_lock);
2159 	if (error) {
2160 		eprintsoline(so, error);
2161 		goto disconnect_vp;
2162 	}
2163 	error = sowaitprim(so, PRIM_type, T_OK_ACK,
2164 	    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2165 	if (error) {
2166 		eprintsoline(so, error);
2167 		goto disconnect_vp;
2168 	}
2169 	/*
2170 	 * If there is a sin/sin6 appended onto the T_OK_ACK use
2171 	 * that to set the local address. If this is not present
2172 	 * then we zero out the address and don't set the
2173 	 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
2174 	 * the pathname from the listening socket.
2175 	 */
2176 	sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
2177 	if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
2178 	    MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2179 		ack_mp->b_rptr += sizeof (struct T_ok_ack);
2180 		bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2181 		nsti->sti_laddr_len = sinlen;
2182 		nsti->sti_laddr_valid = 1;
2183 	} else if (nso->so_family == AF_UNIX) {
2184 		ASSERT(so->so_family == AF_UNIX);
2185 		nsti->sti_laddr_len = sti->sti_laddr_len;
2186 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2187 		bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2188 		    nsti->sti_laddr_len);
2189 		nsti->sti_laddr_valid = 1;
2190 	} else {
2191 		nsti->sti_laddr_len = sti->sti_laddr_len;
2192 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2193 		bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2194 		nsti->sti_laddr_sa->sa_family = nso->so_family;
2195 	}
2196 	freemsg(ack_mp);
2197 
2198 	so_unlock_single(so, SOLOCKED);
2199 	mutex_exit(&so->so_lock);
2200 
2201 	nso->so_state |= SS_ISCONNECTED;
2202 
2203 	/*
2204 	 * Pass out new socket.
2205 	 */
2206 	if (nsop != NULL)
2207 		*nsop = nso;
2208 
2209 	return (0);
2210 
2211 
2212 eproto_disc_unl:
2213 	error = EPROTO;
2214 e_disc_unl:
2215 	eprintsoline(so, error);
2216 	goto disconnect_unlocked;
2217 
2218 pr_disc_vp_unl:
2219 	eprintsoline(so, error);
2220 disconnect_vp_unlocked:
2221 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2222 	VN_RELE(nvp);
2223 disconnect_unlocked:
2224 	(void) sodisconnect(so, SEQ_number, 0);
2225 	return (error);
2226 
2227 pr_disc_vp:
2228 	eprintsoline(so, error);
2229 disconnect_vp:
2230 	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2231 	so_unlock_single(so, SOLOCKED);
2232 	mutex_exit(&so->so_lock);
2233 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2234 	VN_RELE(nvp);
2235 	return (error);
2236 
2237 conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2238 	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2239 	    ? EOPNOTSUPP : EINVAL;
2240 e_bad:
2241 	eprintsoline(so, error);
2242 	return (error);
2243 }
2244 
2245 /*
2246  * connect a socket.
2247  *
2248  * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
2249  * unconnect (by specifying a null address).
 *
 * Arguments:
 *	so	- socket to connect.
 *	name	- destination address. NULL, or an address whose sa_family is
 *		  AF_UNSPEC, is treated as a request to unconnect.
 *	namelen	- length of name in bytes.
 *	fflag	- passed to sowaitconnected(); forced to 0 on the datagram
 *		  path that sends a T_CONN_REQ.
 *	flags	- _SOCONNECT_NOXLATE: name is a struct sockaddr_ux whose
 *		  sou_addr is used as the destination as-is (AF_UNIX only).
 *		  _SOCONNECT_XPG4_2: handed to so_ux_addr_xlate().
 *		  _SOCONNECT_DID_BIND is set locally when this call had to
 *		  bind the socket.
 *	cr	- credentials used for message allocation and for the
 *		  bind/setsockopt calls made on behalf of the caller.
 *
 * Returns 0 on success, otherwise an errno value:
 *	EINTR		- allocating the T_CONN_REQ or T_unbind_req failed.
 *	EOPNOTSUPP	- the socket is in SS_ACCEPTCONN state.
 *	EISCONN,
 *	EALREADY	- a connection-oriented socket is already connected
 *			  or connecting.
 *	EINVAL		- no address when one is required, wrong length for
 *			  a _SOCONNECT_NOXLATE address, or namelen larger
 *			  than sti_faddr_maxlen.
 *	Any error returned by sotpi_bind(), so_addr_verify(),
 *	so_ux_addr_xlate(), sodisconnect(), kstrputmsg(), sowaitokack(),
 *	sowaitconnected(), or a pending so_error fetched via sogeterr().
 *
 * Locking: so_lock is taken and SOLOCKED is set for most of the call. Both
 * are dropped around the calls to sotpi_setsockopt(); only so_lock is
 * dropped around kstrputmsg(). SOLOCKED is released before waiting in
 * sowaitconnected() and retaken if that wait fails. The function returns
 * with neither held.
 *
 * On a fatal error SS_ISCONNECTING is cleared, the cached local address is
 * invalidated, and a bind performed by this call is undone.
2250  */
2251 int
2252 sotpi_connect(struct sonode *so,
2253 	const struct sockaddr *name,
2254 	socklen_t namelen,
2255 	int fflag,
2256 	int flags,
2257 	struct cred *cr)
2258 {
2259 	struct T_conn_req	conn_req;
2260 	int			error = 0;
2261 	mblk_t			*mp;
2262 	void			*src;
2263 	socklen_t		srclen;
2264 	void			*addr;
2265 	socklen_t		addrlen;
2266 	boolean_t		need_unlock;
2267 	sotpi_info_t		*sti = SOTOTPI(so);
2268 
2269 	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2270 	    (void *)so, (void *)name, namelen, fflag, flags,
2271 	    pr_state(so->so_state, so->so_mode)));
2272 
2273 	/*
2274 	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2275 	 * avoid sleeping for memory with SOLOCKED held.
2276 	 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2277 	 * + sizeof (struct T_opthdr).
2278 	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
2279 	 * exceed sti_faddr_maxlen).
2280 	 */
2281 	mp = soallocproto(sizeof (struct T_conn_req) +
2282 	    2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
2283 	    cr);
2284 	if (mp == NULL) {
2285 		/*
2286 		 * Connect can not fail with ENOBUFS. A signal was
2287 		 * caught so return EINTR.
2288 		 */
2289 		error = EINTR;
2290 		eprintsoline(so, error);
2291 		return (error);
2292 	}
2293 
2294 	mutex_enter(&so->so_lock);
2295 	/*
2296 	 * Make sure there is a preallocated T_unbind_req message
2297 	 * before any binding. This message is allocated when the
2298 	 * socket is created. Since another thread can consume
2299 	 * so_unbind_mp by the time we return from so_lock_single(),
2300 	 * we should check the availability of so_unbind_mp after
2301 	 * we return from so_lock_single().
2302 	 */
2303 
2304 	so_lock_single(so);	/* Set SOLOCKED */
	/* need_unlock records whether SOLOCKED is held on arrival at "done" */
2305 	need_unlock = B_TRUE;
2306 
2307 	if (sti->sti_unbind_mp == NULL) {
2308 		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2309 		/* NOTE: holding so_lock while sleeping */
2310 		sti->sti_unbind_mp =
2311 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
2312 		if (sti->sti_unbind_mp == NULL) {
2313 			error = EINTR;
2314 			goto done;
2315 		}
2316 	}
2317 
2318 	/*
2319 	 * Can't have done a listen before connecting.
2320 	 */
2321 	if (so->so_state & SS_ACCEPTCONN) {
2322 		error = EOPNOTSUPP;
2323 		goto done;
2324 	}
2325 
2326 	/*
2327 	 * Must be bound with the transport
2328 	 */
2329 	if (!(so->so_state & SS_ISBOUND)) {
2330 		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2331 		    /*CONSTCOND*/
2332 		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2333 			/*
2334 			 * Optimization for AF_INET{,6} transports
2335 			 * that can handle a T_CONN_REQ without being bound.
2336 			 */
2337 			so_automatic_bind(so);
2338 		} else {
2339 			error = sotpi_bind(so, NULL, 0,
2340 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2341 			if (error)
2342 				goto done;
2343 		}
2344 		ASSERT(so->so_state & SS_ISBOUND);
		/* Lets the "done" path undo this bind on a fatal error */
2345 		flags |= _SOCONNECT_DID_BIND;
2346 	}
2347 
2348 	/*
2349 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
2350 	 * connect to a null address. This is the portable method to
2351 	 * unconnect a socket.
2352 	 */
2353 	if ((namelen >= sizeof (sa_family_t)) &&
2354 	    (name->sa_family == AF_UNSPEC)) {
2355 		name = NULL;
2356 		namelen = 0;
2357 	}
2358 
2359 	/*
2360 	 * Check that we are not already connected.
2361 	 * A connection-oriented socket cannot be reconnected.
2362 	 * A connected connection-less socket can be
2363 	 * - connected to a different address by a subsequent connect
2364 	 * - "unconnected" by a connect to the NULL address
2365 	 */
2366 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2367 		ASSERT(!(flags & _SOCONNECT_DID_BIND));
2368 		if (so->so_mode & SM_CONNREQUIRED) {
2369 			/* Connection-oriented socket */
2370 			error = so->so_state & SS_ISCONNECTED ?
2371 			    EISCONN : EALREADY;
2372 			goto done;
2373 		}
2374 		/* Connection-less socket */
2375 		if (name == NULL) {
2376 			/*
2377 			 * Remove the connected state and clear SO_DGRAM_ERRIND
2378 			 * since it was set when the socket was connected.
2379 			 * If this is UDP also send down a T_DISCON_REQ.
2380 			 */
2381 			int val;
2382 
2383 			if ((so->so_family == AF_INET ||
2384 			    so->so_family == AF_INET6) &&
2385 			    (so->so_type == SOCK_DGRAM ||
2386 			    so->so_type == SOCK_RAW) &&
2387 			    /*CONSTCOND*/
2388 			    !soconnect_tpi_udp) {
2389 				/* XXX What about implicitly unbinding here? */
2390 				error = sodisconnect(so, -1,
2391 				    _SODISCONNECT_LOCK_HELD);
2392 			} else {
2393 				so->so_state &=
2394 				    ~(SS_ISCONNECTED | SS_ISCONNECTING);
2395 				sti->sti_faddr_valid = 0;
2396 				sti->sti_faddr_len = 0;
2397 			}
2398 
2399 			/* Remove SOLOCKED since setsockopt will grab it */
2400 			so_unlock_single(so, SOLOCKED);
2401 			mutex_exit(&so->so_lock);
2402 
			/* Result deliberately ignored; error keeps the disconnect status */
2403 			val = 0;
2404 			(void) sotpi_setsockopt(so, SOL_SOCKET,
2405 			    SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2406 			    cr);
2407 
2408 			mutex_enter(&so->so_lock);
2409 			so_lock_single(so);	/* Set SOLOCKED */
2410 			goto done;
2411 		}
2412 	}
2413 	ASSERT(so->so_state & SS_ISBOUND);
2414 
	/* Past this point a destination address is mandatory */
2415 	if (name == NULL || namelen == 0) {
2416 		error = EINVAL;
2417 		goto done;
2418 	}
2419 	/*
2420 	 * Mark the socket if sti_faddr_sa represents the transport level
2421 	 * address.
2422 	 */
2423 	if (flags & _SOCONNECT_NOXLATE) {
2424 		struct sockaddr_ux	*soaddr_ux;
2425 
2426 		ASSERT(so->so_family == AF_UNIX);
2427 		if (namelen != sizeof (struct sockaddr_ux)) {
2428 			error = EINVAL;
2429 			goto done;
2430 		}
		/* From here on name/namelen refer to the embedded sou_addr */
2431 		soaddr_ux = (struct sockaddr_ux *)name;
2432 		name = (struct sockaddr *)&soaddr_ux->sou_addr;
2433 		namelen = sizeof (soaddr_ux->sou_addr);
2434 		sti->sti_faddr_noxlate = 1;
2435 	}
2436 
2437 	/*
2438 	 * Length and family checks.
2439 	 */
2440 	error = so_addr_verify(so, name, namelen);
2441 	if (error)
2442 		goto bad;
2443 
2444 	/*
2445 	 * Save foreign address. Needed for AF_UNIX as well as
2446 	 * transport providers that do not support TI_GETPEERNAME.
2447 	 * Also used for cached foreign address for TCP and UDP.
2448 	 */
2449 	if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2450 		error = EINVAL;
2451 		goto done;
2452 	}
2453 	sti->sti_faddr_len = (socklen_t)namelen;
2454 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2455 	bcopy(name, sti->sti_faddr_sa, namelen);
2456 	sti->sti_faddr_valid = 1;
2457 
	/*
	 * Choose the destination (addr/addrlen) placed in the T_CONN_REQ
	 * and the optional source address (src/srclen) sent as an option.
	 */
2458 	if (so->so_family == AF_UNIX) {
2459 		if (sti->sti_faddr_noxlate) {
2460 			/*
2461 			 * Already have a transport internal address. Do not
2462 			 * pass any (transport internal) source address.
2463 			 */
2464 			addr = sti->sti_faddr_sa;
2465 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2466 			src = NULL;
2467 			srclen = 0;
2468 		} else {
2469 			/*
2470 			 * Pass the sockaddr_un source address as an option
2471 			 * and translate the remote address.
2472 			 * Holding so_lock thus sti_laddr_sa can not change.
2473 			 */
2474 			src = sti->sti_laddr_sa;
2475 			srclen = (t_uscalar_t)sti->sti_laddr_len;
2476 			dprintso(so, 1,
2477 			    ("sotpi_connect UNIX: srclen %d, src %p\n",
2478 			    srclen, src));
2479 			error = so_ux_addr_xlate(so,
2480 			    sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2481 			    (flags & _SOCONNECT_XPG4_2),
2482 			    &addr, &addrlen);
2483 			if (error)
2484 				goto bad;
2485 		}
2486 	} else {
2487 		addr = sti->sti_faddr_sa;
2488 		addrlen = (t_uscalar_t)sti->sti_faddr_len;
2489 		src = NULL;
2490 		srclen = 0;
2491 	}
2492 	/*
2493 	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2494 	 * option which asks the transport provider to send T_UDERR_IND
2495 	 * messages. These T_UDERR_IND messages are used to return connected
2496 	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2497 	 *
2498 	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2499 	 * we send down a T_CONN_REQ. This is needed to let the
2500 	 * transport assign a local address that is consistent with
2501 	 * the remote address. Applications depend on a getsockname()
2502 	 * after a connect() to retrieve the "source" IP address for
2503 	 * the connected socket.  Invalidate the cached local address
2504 	 * to force getsockname() to enquire of the transport.
2505 	 */
2506 	if (!(so->so_mode & SM_CONNREQUIRED)) {
2507 		/*
2508 		 * Datagram socket.
2509 		 */
2510 		int32_t val;
2511 
		/* Drop so_lock and SOLOCKED around the setsockopt call */
2512 		so_unlock_single(so, SOLOCKED);
2513 		mutex_exit(&so->so_lock);
2514 
2515 		val = 1;
2516 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2517 		    &val, (t_uscalar_t)sizeof (val), cr);
2518 
2519 		mutex_enter(&so->so_lock);
2520 		so_lock_single(so);	/* Set SOLOCKED */
		/*
		 * Anything other than an AF_INET{,6} DGRAM/RAW socket (or
		 * when soconnect_tpi_udp is set) is simply marked connected
		 * here; no T_CONN_REQ is sent.
		 */
2521 		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2522 		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2523 		    soconnect_tpi_udp) {
2524 			soisconnected(so);
2525 			goto done;
2526 		}
2527 		/*
2528 		 * Send down T_CONN_REQ etc.
2529 		 * Clear fflag to avoid returning EWOULDBLOCK.
2530 		 */
2531 		fflag = 0;
2532 		ASSERT(so->so_family != AF_UNIX);
2533 		sti->sti_laddr_valid = 0;
2534 	} else if (sti->sti_laddr_len != 0) {
2535 		/*
2536 		 * If the local address or port was "any" then it may be
2537 		 * changed by the transport as a result of the
2538 		 * connect.  Invalidate the cached version if we have one.
2539 		 */
2540 		switch (so->so_family) {
2541 		case AF_INET:
2542 			ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2543 			if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2544 			    INADDR_ANY ||
2545 			    ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2546 				sti->sti_laddr_valid = 0;
2547 			break;
2548 
2549 		case AF_INET6:
2550 			ASSERT(sti->sti_laddr_len ==
2551 			    (socklen_t)sizeof (sin6_t));
2552 			if (IN6_IS_ADDR_UNSPECIFIED(
2553 			    &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2554 			    IN6_IS_ADDR_V4MAPPED_ANY(
2555 			    &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2556 			    ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2557 				sti->sti_laddr_valid = 0;
2558 			break;
2559 
2560 		default:
2561 			break;
2562 		}
2563 	}
2564 
2565 	/*
2566 	 * Check for failure of an earlier call
2567 	 */
2568 	if (so->so_error != 0)
2569 		goto so_bad;
2570 
2571 	/*
2572 	 * Send down T_CONN_REQ. Message was allocated above.
2573 	 */
2574 	conn_req.PRIM_type = T_CONN_REQ;
2575 	conn_req.DEST_length = addrlen;
2576 	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2577 	if (srclen == 0) {
2578 		conn_req.OPT_length = 0;
2579 		conn_req.OPT_offset = 0;
2580 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2581 		soappendmsg(mp, addr, addrlen);
2582 	} else {
2583 		/*
2584 		 * There is a AF_UNIX sockaddr_un to include as a source
2585 		 * address option.
2586 		 */
2587 		struct T_opthdr toh;
2588 
2589 		toh.level = SOL_SOCKET;
2590 		toh.name = SO_SRCADDR;
2591 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2592 		toh.status = 0;
2593 		conn_req.OPT_length =
2594 		    (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2595 		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2596 		    _TPI_ALIGN_TOPT(addrlen));
2597 
		/*
		 * Layout: conn_req, address, pad, option header, source
		 * address, pad. The b_wptr bumps skip the alignment padding
		 * implied by _TPI_ALIGN_TOPT().
		 */
2598 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2599 		soappendmsg(mp, addr, addrlen);
2600 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2601 		soappendmsg(mp, &toh, sizeof (toh));
2602 		soappendmsg(mp, src, srclen);
2603 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2604 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2605 	}
2606 	/*
2607 	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2608 	 * in order to have the right state when the T_CONN_CON shows up.
2609 	 */
2610 	soisconnecting(so);
	/* so_lock is dropped for the send; SOLOCKED remains set */
2611 	mutex_exit(&so->so_lock);
2612 
2613 	if (audit_active)
2614 		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2615 
2616 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2617 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
	/* Clear mp so the freemsg() at "done" leaves the sent message alone */
2618 	mp = NULL;
2619 	mutex_enter(&so->so_lock);
2620 	if (error != 0)
2621 		goto bad;
2622 
2623 	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2624 		goto bad;
2625 
2626 	/* Allow other threads to access the socket */
2627 	so_unlock_single(so, SOLOCKED);
2628 	need_unlock = B_FALSE;
2629 
2630 	/*
2631 	 * Wait until we get a T_CONN_CON or an error
2632 	 */
2633 	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
		/* Retake SOLOCKED for the error handling at "done" */
2634 		so_lock_single(so);	/* Set SOLOCKED */
2635 		need_unlock = B_TRUE;
2636 	}
2637 
	/*
	 * Common exit: entered with so_lock held, and with SOLOCKED held
	 * whenever need_unlock is B_TRUE.
	 */
2638 done:
2639 	freemsg(mp);
2640 	switch (error) {
2641 	case EINPROGRESS:
2642 	case EALREADY:
2643 	case EISCONN:
2644 	case EINTR:
2645 		/* Non-fatal errors */
2646 		sti->sti_laddr_valid = 0;
2647 		/* FALLTHRU */
2648 	case 0:
2649 		break;
2650 	default:
2651 		ASSERT(need_unlock);
2652 		/*
2653 		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2654 		 * and invalidate local-address cache
2655 		 */
2656 		so->so_state &= ~SS_ISCONNECTING;
2657 		sti->sti_laddr_valid = 0;
2658 		/* A discon_ind might have already unbound us */
2659 		if ((flags & _SOCONNECT_DID_BIND) &&
2660 		    (so->so_state & SS_ISBOUND)) {
2661 			int err;
2662 
2663 			err = sotpi_unbind(so, 0);
2664 			/* LINTED - statement has no conseq */
2665 			if (err) {
2666 				eprintsoline(so, err);
2667 			}
2668 		}
2669 		break;
2670 	}
2671 	if (need_unlock)
2672 		so_unlock_single(so, SOLOCKED);
2673 	mutex_exit(&so->so_lock);
2674 	return (error);
2675 
	/*
	 * so_bad: fetch the pending socket error via sogeterr().
	 * bad: log the error, then take the common exit.
	 */
2676 so_bad:	error = sogeterr(so, B_TRUE);
2677 bad:	eprintsoline(so, error);
2678 	goto done;
2679 }
2680 
/*
 * Shut down the receive side, the send side, or both sides of a socket.
 *
 * Arguments:
 *	so	- socket to shut down.
 *	how	- 0: no more receives, 1: no more sends, 2: both.
 *		  Any other value fails with EINVAL.
 *	cr	- credentials used when allocating the T_ORDREL_REQ.
 *
 * Returns 0 on success, otherwise an errno value:
 *	ENOTCONN - the socket is not connected and xnet_skip_checks is not
 *		   set (when it is set, an unconnected socket returns 0
 *		   without doing anything).
 *	EINVAL	 - how is not 0, 1 or 2.
 *	Any error returned by sodisconnect() or kstrputmsg().
 *
 * Only the SS_CANT* flags newly set by this call trigger stream head
 * updates and the T_ORDREL_REQ, so repeated calls do not repeat them.
 *
 * Locking: so_lock is taken and SOLOCKED is set on entry; both are released
 * before returning. so_lock is dropped temporarily around the stream head
 * calls and around sending the T_ORDREL_REQ.
 */
2681 /* ARGSUSED */
2682 int
2683 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2684 {
2685 	struct T_ordrel_req	ordrel_req;
2686 	mblk_t			*mp;
2687 	uint_t			old_state, state_change;
2688 	int			error = 0;
2689 	sotpi_info_t		*sti = SOTOTPI(so);
2690 
2691 	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2692 	    (void *)so, how, pr_state(so->so_state, so->so_mode)));
2693 
2694 	mutex_enter(&so->so_lock);
2695 	so_lock_single(so);	/* Set SOLOCKED */
2696 
2697 	/*
2698 	 * SunOS 4.X has no check for datagram sockets.
2699 	 * 5.X checks that it is connected (ENOTCONN)
2700 	 * X/Open requires that we check the connected state.
2701 	 */
2702 	if (!(so->so_state & SS_ISCONNECTED)) {
2703 		if (!xnet_skip_checks) {
2704 			error = ENOTCONN;
2705 			if (xnet_check_print) {
2706 				printf("sockfs: X/Open shutdown check "
2707 				    "caused ENOTCONN\n");
2708 			}
2709 		}
		/* With xnet_skip_checks set, error is still 0 here */
2710 		goto done;
2711 	}
2712 	/*
2713 	 * Record the current state and then perform any state changes.
2714 	 * Then use the difference between the old and new states to
2715 	 * determine which messages need to be sent.
2716 	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2717 	 * duplicate calls to shutdown().
2718 	 */
2719 	old_state = so->so_state;
2720 
2721 	switch (how) {
2722 	case 0:
2723 		socantrcvmore(so);
2724 		break;
2725 	case 1:
2726 		socantsendmore(so);
2727 		break;
2728 	case 2:
2729 		socantsendmore(so);
2730 		socantrcvmore(so);
2731 		break;
2732 	default:
2733 		error = EINVAL;
2734 		goto done;
2735 	}
2736 
2737 	/*
2738 	 * Assumes that the SS_CANT* flags are never cleared in the above code.
2739 	 */
2740 	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2741 	    (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2742 	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2743 
	/* Act only on the flag(s) newly set by this call */
2744 	switch (state_change) {
2745 	case 0:
2746 		dprintso(so, 1,
2747 		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
2748 		    so->so_state));
2749 		goto done;
2750 
2751 	case SS_CANTRCVMORE:
2752 		mutex_exit(&so->so_lock);
2753 		strseteof(SOTOV(so), 1);
2754 		/*
2755 		 * strseteof takes care of read side wakeups,
2756 		 * pollwakeups, and signals.
2757 		 */
2758 		/*
2759 		 * Get the read lock before flushing data to avoid problems
2760 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2761 		 */
2762 		mutex_enter(&so->so_lock);
2763 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2764 		mutex_exit(&so->so_lock);
2765 
2766 		/* Flush read side queue */
2767 		strflushrq(SOTOV(so), FLUSHALL);
2768 
2769 		mutex_enter(&so->so_lock);
2770 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2771 		break;
2772 
2773 	case SS_CANTSENDMORE:
2774 		mutex_exit(&so->so_lock);
2775 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2776 		mutex_enter(&so->so_lock);
2777 		break;
2778 
2779 	case SS_CANTSENDMORE|SS_CANTRCVMORE:
		/* Both sides: the two single-flag cases above, combined */
2780 		mutex_exit(&so->so_lock);
2781 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2782 		strseteof(SOTOV(so), 1);
2783 		/*
2784 		 * strseteof takes care of read side wakeups,
2785 		 * pollwakeups, and signals.
2786 		 */
2787 		/*
2788 		 * Get the read lock before flushing data to avoid problems
2789 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2790 		 */
2791 		mutex_enter(&so->so_lock);
2792 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2793 		mutex_exit(&so->so_lock);
2794 
2795 		/* Flush read side queue */
2796 		strflushrq(SOTOV(so), FLUSHALL);
2797 
2798 		mutex_enter(&so->so_lock);
2799 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2800 		break;
2801 	}
2802 
2803 	ASSERT(MUTEX_HELD(&so->so_lock));
2804 
2805 	/*
2806 	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2807 	 * was set due to this call and the new state has both of them set:
2808 	 *	Send the AF_UNIX close indication
2809 	 *	For T_COTS send a discon_ind
2810 	 *
2811 	 * If cantsend was set due to this call:
2812 	 *	For T_COTSORD send an ordrel_ind
2813 	 *
2814 	 * Note that for T_CLTS there is no message sent here.
2815 	 */
2816 	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2817 	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2818 		/*
2819 		 * For SunOS 4.X compatibility we tell the other end
2820 		 * that we are unable to receive at this point.
2821 		 */
2822 		if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2823 			so_unix_close(so);
2824 
2825 		if (sti->sti_serv_type == T_COTS)
2826 			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2827 	}
2828 	if ((state_change & SS_CANTSENDMORE) &&
2829 	    (sti->sti_serv_type == T_COTS_ORD)) {
2830 		/* Send an orderly release */
2831 		ordrel_req.PRIM_type = T_ORDREL_REQ;
2832 
		/* so_lock is dropped for the allocation and send; SOLOCKED stays set */
2833 		mutex_exit(&so->so_lock);
2834 		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2835 		    0, _ALLOC_SLEEP, cr);
2836 		/*
2837 		 * Send down the T_ORDREL_REQ even if there is flow control.
2838 		 * This prevents shutdown from blocking.
2839 		 * Note that there is no T_OK_ACK for ordrel_req.
2840 		 */
2841 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2842 		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2843 		mutex_enter(&so->so_lock);
2844 		if (error) {
2845 			eprintsoline(so, error);
2846 			goto done;
2847 		}
2848 	}
2849 
	/* Common exit: so_lock and SOLOCKED are held on every path here */
2850 done:
2851 	so_unlock_single(so, SOLOCKED);
2852 	mutex_exit(&so->so_lock);
2853 	return (error);
2854 }
2855 
2856 /*
2857  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2858  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2859  * that we have closed.
2860  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2861  * T_UNITDATA_REQ containing the same option.
2862  *
2863  * For SOCK_DGRAM half-connections (somebody connected to this end
2864  * but this end is not connected) we don't know where to send any
2865  * SO_UNIX_CLOSE.
2866  *
2867  * We have to ignore stream head errors just in case there has been
2868  * a shutdown(output).
2869  * Ignore any flow control to try to get the message more quickly to the peer.
2870  * While locally ignoring flow control solves the problem when there
2871  * is only the loopback transport on the stream it would not provide
2872  * the correct AF_UNIX socket semantics when one or more modules have
2873  * been pushed.
2874  */
2875 void
2876 so_unix_close(struct sonode *so)
2877 {
2878 	int		error;
2879 	struct T_opthdr	toh;
2880 	mblk_t		*mp;
2881 	sotpi_info_t	*sti = SOTOTPI(so);
2882 
2883 	ASSERT(MUTEX_HELD(&so->so_lock));
2884 
2885 	ASSERT(so->so_family == AF_UNIX);
2886 
2887 	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2888 	    (SS_ISCONNECTED|SS_ISBOUND))
2889 		return;
2890 
2891 	dprintso(so, 1, ("so_unix_close(%p) %s\n",
2892 	    (void *)so, pr_state(so->so_state, so->so_mode)));
2893 
2894 	toh.level = SOL_SOCKET;
2895 	toh.name = SO_UNIX_CLOSE;
2896 
2897 	/* zero length + header */
2898 	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2899 	toh.status = 0;
2900 
2901 	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2902 		struct T_optdata_req tdr;
2903 
2904 		tdr.PRIM_type = T_OPTDATA_REQ;
2905 		tdr.DATA_flag = 0;
2906 
2907 		tdr.OPT_length = (t_scalar_t)sizeof (toh);
2908 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2909 
2910 		/* NOTE: holding so_lock while sleeping */
2911 		mp = soallocproto2(&tdr, sizeof (tdr),
2912 		    &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2913 	} else {
2914 		struct T_unitdata_req	tudr;
2915 		void			*addr;
2916 		socklen_t		addrlen;
2917 		void			*src;
2918 		socklen_t		srclen;
2919 		struct T_opthdr		toh2;
2920 		t_scalar_t		size;
2921 
2922 		/* Connecteded DGRAM socket */
2923 
2924 		/*
2925 		 * For AF_UNIX the destination address is translated to
2926 		 * an internal name and the source address is passed as
2927 		 * an option.
2928 		 */
2929 		/*
2930 		 * Length and family checks.
2931 		 */
2932 		error = so_addr_verify(so, sti->sti_faddr_sa,
2933 		    (t_uscalar_t)sti->sti_faddr_len);
2934 		if (error) {
2935 			eprintsoline(so, error);
2936 			return;
2937 		}
2938 		if (sti->sti_faddr_noxlate) {
2939 			/*
2940 			 * Already have a transport internal address. Do not
2941 			 * pass any (transport internal) source address.
2942 			 */
2943 			addr = sti->sti_faddr_sa;
2944 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2945 			src = NULL;
2946 			srclen = 0;
2947 		} else {
2948 			/*
2949 			 * Pass the sockaddr_un source address as an option
2950 			 * and translate the remote address.
2951 			 * Holding so_lock thus sti_laddr_sa can not change.
2952 			 */
2953 			src = sti->sti_laddr_sa;
2954 			srclen = (socklen_t)sti->sti_laddr_len;
2955 			dprintso(so, 1,
2956 			    ("so_ux_close: srclen %d, src %p\n",
2957 			    srclen, src));
2958 			error = so_ux_addr_xlate(so,
2959 			    sti->sti_faddr_sa,
2960 			    (socklen_t)sti->sti_faddr_len, 0,
2961 			    &addr, &addrlen);
2962 			if (error) {
2963 				eprintsoline(so, error);
2964 				return;
2965 			}
2966 		}
2967 		tudr.PRIM_type = T_UNITDATA_REQ;
2968 		tudr.DEST_length = addrlen;
2969 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2970 		if (srclen == 0) {
2971 			tudr.OPT_length = (t_scalar_t)sizeof (toh);
2972 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2973 			    _TPI_ALIGN_TOPT(addrlen));
2974 
2975 			size = tudr.OPT_offset + tudr.OPT_length;
2976 			/* NOTE: holding so_lock while sleeping */
2977 			mp = soallocproto2(&tudr, sizeof (tudr),
2978 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
2979 			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2980 			soappendmsg(mp, &toh, sizeof (toh));
2981 		} else {
2982 			/*
2983 			 * There is a AF_UNIX sockaddr_un to include as a
2984 			 * source address option.
2985 			 */
2986 			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2987 			    _TPI_ALIGN_TOPT(srclen));
2988 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2989 			    _TPI_ALIGN_TOPT(addrlen));
2990 
2991 			toh2.level = SOL_SOCKET;
2992 			toh2.name = SO_SRCADDR;
2993 			toh2.len = (t_uscalar_t)(srclen +
2994 			    sizeof (struct T_opthdr));
2995 			toh2.status = 0;
2996 
2997 			size = tudr.OPT_offset + tudr.OPT_length;
2998 
2999 			/* NOTE: holding so_lock while sleeping */
3000 			mp = soallocproto2(&tudr, sizeof (tudr),
3001 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
3002 			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3003 			soappendmsg(mp, &toh, sizeof (toh));
3004 			soappendmsg(mp, &toh2, sizeof (toh2));
3005 			soappendmsg(mp, src, srclen);
3006 			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3007 		}
3008 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3009 	}
3010 	mutex_exit(&so->so_lock);
3011 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
3012 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
3013 	mutex_enter(&so->so_lock);
3014 }
3015 
3016 /*
3017  * Called by sotpi_recvmsg when reading a non-zero amount of data.
3018  * In addition, the caller typically verifies that there is some
3019  * potential state to clear by checking
3020  *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
3021  * before calling this routine.
3022  * Note that such a check can be made without holding so_lock since
3023  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
3024  * decrements sti_oobsigcnt.
3025  *
3026  * When data is read *after* the point that all pending
3027  * oob data has been consumed the oob indication is cleared.
3028  *
3029  * This logic keeps select/poll returning POLLRDBAND and
3030  * SIOCATMARK returning true until we have read past
3031  * the mark.
3032  */
3033 static void
3034 sorecv_update_oobstate(struct sonode *so)
3035 {
3036 	sotpi_info_t *sti = SOTOTPI(so);
3037 
3038 	mutex_enter(&so->so_lock);
3039 	ASSERT(so_verify_oobstate(so));
3040 	dprintso(so, 1,
3041 	    ("sorecv_update_oobstate: counts %d/%d state %s\n",
3042 	    sti->sti_oobsigcnt,
3043 	    sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
3044 	if (sti->sti_oobsigcnt == 0) {
3045 		/* No more pending oob indications */
3046 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
3047 		freemsg(so->so_oobmsg);
3048 		so->so_oobmsg = NULL;
3049 	}
3050 	ASSERT(so_verify_oobstate(so));
3051 	mutex_exit(&so->so_lock);
3052 }
3053 
3054 /*
3055  * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
3056  */
3057 static int
3058 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
3059 {
3060 	sotpi_info_t *sti = SOTOTPI(so);
3061 	int	error = 0;
3062 	mblk_t *tmp = NULL;
3063 	mblk_t *pmp = NULL;
3064 	mblk_t *nmp = sti->sti_nl7c_rcv_mp;
3065 
3066 	ASSERT(nmp != NULL);
3067 
3068 	while (nmp != NULL && uiop->uio_resid > 0) {
3069 		ssize_t n;
3070 
3071 		if (DB_TYPE(nmp) == M_DATA) {
3072 			/*
3073 			 * We have some data, uiomove up to resid bytes.
3074 			 */
3075 			n = MIN(MBLKL(nmp), uiop->uio_resid);
3076 			if (n > 0)
3077 				error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
3078 			nmp->b_rptr += n;
3079 			if (nmp->b_rptr == nmp->b_wptr) {
3080 				pmp = nmp;
3081 				nmp = nmp->b_cont;
3082 			}
3083 			if (error)
3084 				break;
3085 		} else {
3086 			/*
3087 			 * We only handle data, save for caller to handle.
3088 			 */
3089 			if (pmp != NULL) {
3090 				pmp->b_cont = nmp->b_cont;
3091 			}
3092 			nmp->b_cont = NULL;
3093 			if (*rmp == NULL) {
3094 				*rmp = nmp;
3095 			} else {
3096 				tmp->b_cont = nmp;
3097 			}
3098 			nmp = nmp->b_cont;
3099 			tmp = nmp;
3100 		}
3101 	}
3102 	if (pmp != NULL) {
3103 		/* Free any mblk_t(s) which we have consumed */
3104 		pmp->b_cont = NULL;
3105 		freemsg(sti->sti_nl7c_rcv_mp);
3106 	}
3107 	if ((sti->sti_nl7c_rcv_mp = nmp) == NULL) {
3108 		/* Last mblk_t so return the saved kstrgetmsg() rval/error */
3109 		if (error == 0) {
3110 			rval_t	*p = (rval_t *)&sti->sti_nl7c_rcv_rval;
3111 
3112 			error = p->r_v.r_v2;
3113 			p->r_v.r_v2 = 0;
3114 		}
3115 		rp->r_vals = sti->sti_nl7c_rcv_rval;
3116 		sti->sti_nl7c_rcv_rval = 0;
3117 	} else {
3118 		/* More mblk_t(s) to process so no rval to return */
3119 		rp->r_vals = 0;
3120 	}
3121 	return (error);
3122 }
3123 /*
3124  * Receive the next message on the queue.
3125  * If msg_controllen is non-zero when called the caller is interested in
3126  * any received control info (options).
3127  * If msg_namelen is non-zero when called the caller is interested in
3128  * any received source address.
3129  * The routine returns with msg_control and msg_name pointing to
3130  * kmem_alloc'ed memory which the caller has to free.
3131  */
3132 /* ARGSUSED */
3133 int
3134 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
3135     struct cred *cr)
3136 {
3137 	union T_primitives	*tpr;
3138 	mblk_t			*mp;
3139 	uchar_t			pri;
3140 	int			pflag, opflag;
3141 	void			*control;
3142 	t_uscalar_t		controllen;
3143 	t_uscalar_t		namelen;
3144 	int			so_state = so->so_state; /* Snapshot */
3145 	ssize_t			saved_resid;
3146 	rval_t			rval;
3147 	int			flags;
3148 	clock_t			timout;
3149 	int			error = 0;
3150 	int			reterr = 0;
3151 	struct uio		*suiop = NULL;
3152 	sotpi_info_t		*sti = SOTOTPI(so);
3153 
3154 	flags = msg->msg_flags;
3155 	msg->msg_flags = 0;
3156 
3157 	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
3158 	    (void *)so, (void *)msg, flags,
3159 	    pr_state(so->so_state, so->so_mode), so->so_error));
3160 
3161 	if (so->so_version == SOV_STREAM) {
3162 		so_update_attrs(so, SOACC);
3163 		/* The imaginary "sockmod" has been popped - act as a stream */
3164 		return (strread(SOTOV(so), uiop, cr));
3165 	}
3166 
3167 	/*
3168 	 * If we are not connected because we have never been connected
3169 	 * we return ENOTCONN. If we have been connected (but are no longer
3170 	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
3171 	 * the EOF.
3172 	 *
3173 	 * An alternative would be to post an ENOTCONN error in stream head
3174 	 * (read+write) and clear it when we're connected. However, that error
3175 	 * would cause incorrect poll/select behavior!
3176 	 */
3177 	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
3178 	    (so->so_mode & SM_CONNREQUIRED)) {
3179 		return (ENOTCONN);
3180 	}
3181 
3182 	/*
3183 	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
3184 	 * after checking that the read queue is empty) and returns zero.
3185 	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
3186 	 * is zero.
3187 	 */
3188 
3189 	if (flags & MSG_OOB) {
3190 		/* Check that the transport supports OOB */
3191 		if (!(so->so_mode & SM_EXDATA))
3192 			return (EOPNOTSUPP);
3193 		so_update_attrs(so, SOACC);
3194 		return (sorecvoob(so, msg, uiop, flags,
3195 		    (so->so_options & SO_OOBINLINE)));
3196 	}
3197 
3198 	so_update_attrs(so, SOACC);
3199 
3200 	/*
3201 	 * Set msg_controllen and msg_namelen to zero here to make it
3202 	 * simpler in the cases that no control or name is returned.
3203 	 */
3204 	controllen = msg->msg_controllen;
3205 	namelen = msg->msg_namelen;
3206 	msg->msg_controllen = 0;
3207 	msg->msg_namelen = 0;
3208 
3209 	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
3210 	    namelen, controllen));
3211 
3212 	mutex_enter(&so->so_lock);
3213 	/*
3214 	 * If an NL7C enabled socket and not waiting for write data.
3215 	 */
3216 	if ((sti->sti_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
3217 	    NL7C_ENABLED) {
3218 		if (sti->sti_nl7c_uri) {
3219 			/* Close uri processing for a previous request */
3220 			nl7c_close(so);
3221 		}
3222 		if ((so_state & SS_CANTRCVMORE) &&
3223 		    sti->sti_nl7c_rcv_mp == NULL) {
3224 			/* Nothing to process, EOF */
3225 			mutex_exit(&so->so_lock);
3226 			return (0);
3227 		} else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
3228 			/* Persistent NL7C socket, try to process request */
3229 			boolean_t ret;
3230 
3231 			ret = nl7c_process(so,
3232 			    (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
3233 			rval.r_vals = sti->sti_nl7c_rcv_rval;
3234 			error = rval.r_v.r_v2;
3235 			if (error) {
3236 				/* Error of some sort, return it */
3237 				mutex_exit(&so->so_lock);
3238 				return (error);
3239 			}
3240 			if (sti->sti_nl7c_flags &&
3241 			    ! (sti->sti_nl7c_flags & NL7C_WAITWRITE)) {
3242 				/*
3243 				 * Still an NL7C socket and no data
3244 				 * to pass up to the caller.
3245 				 */
3246 				mutex_exit(&so->so_lock);
3247 				if (ret) {
3248 					/* EOF */
3249 					return (0);
3250 				} else {
3251 					/* Need more data */
3252 					return (EAGAIN);
3253 				}
3254 			}
3255 		} else {
3256 			/*
3257 			 * Not persistent so no further NL7C processing.
3258 			 */
3259 			sti->sti_nl7c_flags = 0;
3260 		}
3261 	}
3262 	/*
3263 	 * Only one reader is allowed at any given time. This is needed
3264 	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
3265 	 *
3266 	 * This is slightly different that BSD behavior in that it fails with
3267 	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3268 	 * is single-threaded using sblock(), which is dropped while waiting
3269 	 * for data to appear. The difference shows up e.g. if one
3270 	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3271 	 * does use nonblocking io and different threads are reading each
3272 	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3273 	 * in this case as long as the read queue doesn't get empty.
3274 	 * In this implementation the thread using nonblocking io can
3275 	 * get an EWOULDBLOCK error due to the blocking thread executing
3276 	 * e.g. in the uiomove in kstrgetmsg.
3277 	 * This difference is not believed to be significant.
3278 	 */
3279 	/* Set SOREADLOCKED */
3280 	error = so_lock_read_intr(so,
3281 	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3282 	mutex_exit(&so->so_lock);
3283 	if (error)
3284 		return (error);
3285 
3286 	/*
3287 	 * Tell kstrgetmsg to not inspect the stream head errors until all
3288 	 * queued data has been consumed.
3289 	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3290 	 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3291 	 *
3292 	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3293 	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3294 	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3295 	 */
3296 	pflag = MSG_ANY | MSG_DELAYERROR;
3297 	if (flags & MSG_PEEK) {
3298 		pflag |= MSG_IPEEK;
3299 		flags &= ~MSG_WAITALL;
3300 	}
3301 	if (so->so_mode & SM_ATOMIC)
3302 		pflag |= MSG_DISCARDTAIL;
3303 
3304 	if (flags & MSG_DONTWAIT)
3305 		timout = 0;
3306 	else
3307 		timout = -1;
3308 	opflag = pflag;
3309 
3310 	suiop = sod_rcv_init(so, flags, &uiop);
3311 retry:
3312 	saved_resid = uiop->uio_resid;
3313 	pri = 0;
3314 	mp = NULL;
3315 	if (sti->sti_nl7c_rcv_mp != NULL) {
3316 		/* Already kstrgetmsg()ed saved mblk(s) from NL7C */
3317 		error = nl7c_sorecv(so, &mp, uiop, &rval);
3318 	} else {
3319 		error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3320 		    timout, &rval);
3321 	}
3322 	if (error != 0) {
3323 		/* kstrgetmsg returns ETIME when timeout expires */
3324 		if (error == ETIME)
3325 			error = EWOULDBLOCK;
3326 		goto out;
3327 	}
3328 	/*
3329 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3330 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
3331 	 */
3332 	ASSERT(!(rval.r_val1 & MORECTL));
3333 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3334 		msg->msg_flags |= MSG_TRUNC;
3335 
3336 	if (mp == NULL) {
3337 		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3338 		/*
3339 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3340 		 * The draft Posix socket spec states that the mark should
3341 		 * not be cleared when peeking. We follow the latter.
3342 		 */
3343 		if ((so->so_state &
3344 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3345 		    (uiop->uio_resid != saved_resid) &&
3346 		    !(flags & MSG_PEEK)) {
3347 			sorecv_update_oobstate(so);
3348 		}
3349 
3350 		mutex_enter(&so->so_lock);
3351 		/* Set MSG_EOR based on MOREDATA */
3352 		if (!(rval.r_val1 & MOREDATA)) {
3353 			if (so->so_state & SS_SAVEDEOR) {
3354 				msg->msg_flags |= MSG_EOR;
3355 				so->so_state &= ~SS_SAVEDEOR;
3356 			}
3357 		}
3358 		/*
3359 		 * If some data was received (i.e. not EOF) and the
3360 		 * read/recv* has not been satisfied wait for some more.
3361 		 */
3362 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3363 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3364 			mutex_exit(&so->so_lock);
3365 			pflag = opflag | MSG_NOMARK;
3366 			goto retry;
3367 		}
3368 		goto out_locked;
3369 	}
3370 
3371 	/* strsock_proto has already verified length and alignment */
3372 	tpr = (union T_primitives *)mp->b_rptr;
3373 	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3374 
3375 	switch (tpr->type) {
3376 	case T_DATA_IND: {
3377 		if ((so->so_state &
3378 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3379 		    (uiop->uio_resid != saved_resid) &&
3380 		    !(flags & MSG_PEEK)) {
3381 			sorecv_update_oobstate(so);
3382 		}
3383 
3384 		/*
3385 		 * Set msg_flags to MSG_EOR based on
3386 		 * MORE_flag and MOREDATA.
3387 		 */
3388 		mutex_enter(&so->so_lock);
3389 		so->so_state &= ~SS_SAVEDEOR;
3390 		if (!(tpr->data_ind.MORE_flag & 1)) {
3391 			if (!(rval.r_val1 & MOREDATA))
3392 				msg->msg_flags |= MSG_EOR;
3393 			else
3394 				so->so_state |= SS_SAVEDEOR;
3395 		}
3396 		freemsg(mp);
3397 		/*
3398 		 * If some data was received (i.e. not EOF) and the
3399 		 * read/recv* has not been satisfied wait for some more.
3400 		 */
3401 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3402 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3403 			mutex_exit(&so->so_lock);
3404 			pflag = opflag | MSG_NOMARK;
3405 			goto retry;
3406 		}
3407 		goto out_locked;
3408 	}
3409 	case T_UNITDATA_IND: {
3410 		void *addr;
3411 		t_uscalar_t addrlen;
3412 		void *abuf;
3413 		t_uscalar_t optlen;
3414 		void *opt;
3415 
3416 		if ((so->so_state &
3417 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3418 		    (uiop->uio_resid != saved_resid) &&
3419 		    !(flags & MSG_PEEK)) {
3420 			sorecv_update_oobstate(so);
3421 		}
3422 
3423 		if (namelen != 0) {
3424 			/* Caller wants source address */
3425 			addrlen = tpr->unitdata_ind.SRC_length;
3426 			addr = sogetoff(mp,
3427 			    tpr->unitdata_ind.SRC_offset,
3428 			    addrlen, 1);
3429 			if (addr == NULL) {
3430 				freemsg(mp);
3431 				error = EPROTO;
3432 				eprintsoline(so, error);
3433 				goto out;
3434 			}
3435 			if (so->so_family == AF_UNIX) {
3436 				/*
3437 				 * Can not use the transport level address.
3438 				 * If there is a SO_SRCADDR option carrying
3439 				 * the socket level address it will be
3440 				 * extracted below.
3441 				 */
3442 				addr = NULL;
3443 				addrlen = 0;
3444 			}
3445 		}
3446 		optlen = tpr->unitdata_ind.OPT_length;
3447 		if (optlen != 0) {
3448 			t_uscalar_t ncontrollen;
3449 
3450 			/*
3451 			 * Extract any source address option.
3452 			 * Determine how large cmsg buffer is needed.
3453 			 */
3454 			opt = sogetoff(mp,
3455 			    tpr->unitdata_ind.OPT_offset,
3456 			    optlen, __TPI_ALIGN_SIZE);
3457 
3458 			if (opt == NULL) {
3459 				freemsg(mp);
3460 				error = EPROTO;
3461 				eprintsoline(so, error);
3462 				goto out;
3463 			}
3464 			if (so->so_family == AF_UNIX)
3465 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3466 			ncontrollen = so_cmsglen(mp, opt, optlen,
3467 			    !(flags & MSG_XPG4_2));
3468 			if (controllen != 0)
3469 				controllen = ncontrollen;
3470 			else if (ncontrollen != 0)
3471 				msg->msg_flags |= MSG_CTRUNC;
3472 		} else {
3473 			controllen = 0;
3474 		}
3475 
3476 		if (namelen != 0) {
3477 			/*
3478 			 * Return address to caller.
3479 			 * Caller handles truncation if length
3480 			 * exceeds msg_namelen.
3481 			 * NOTE: AF_UNIX NUL termination is ensured by
3482 			 * the sender's copyin_name().
3483 			 */
3484 			abuf = kmem_alloc(addrlen, KM_SLEEP);
3485 
3486 			bcopy(addr, abuf, addrlen);
3487 			msg->msg_name = abuf;
3488 			msg->msg_namelen = addrlen;
3489 		}
3490 
3491 		if (controllen != 0) {
3492 			/*
3493 			 * Return control msg to caller.
3494 			 * Caller handles truncation if length
3495 			 * exceeds msg_controllen.
3496 			 */
3497 			control = kmem_zalloc(controllen, KM_SLEEP);
3498 
3499 			error = so_opt2cmsg(mp, opt, optlen,
3500 			    !(flags & MSG_XPG4_2),
3501 			    control, controllen);
3502 			if (error) {
3503 				freemsg(mp);
3504 				if (msg->msg_namelen != 0)
3505 					kmem_free(msg->msg_name,
3506 					    msg->msg_namelen);
3507 				kmem_free(control, controllen);
3508 				eprintsoline(so, error);
3509 				goto out;
3510 			}
3511 			msg->msg_control = control;
3512 			msg->msg_controllen = controllen;
3513 		}
3514 
3515 		freemsg(mp);
3516 		goto out;
3517 	}
3518 	case T_OPTDATA_IND: {
3519 		struct T_optdata_req *tdr;
3520 		void *opt;
3521 		t_uscalar_t optlen;
3522 
3523 		if ((so->so_state &
3524 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3525 		    (uiop->uio_resid != saved_resid) &&
3526 		    !(flags & MSG_PEEK)) {
3527 			sorecv_update_oobstate(so);
3528 		}
3529 
3530 		tdr = (struct T_optdata_req *)mp->b_rptr;
3531 		optlen = tdr->OPT_length;
3532 		if (optlen != 0) {
3533 			t_uscalar_t ncontrollen;
3534 			/*
3535 			 * Determine how large cmsg buffer is needed.
3536 			 */
3537 			opt = sogetoff(mp,
3538 			    tpr->optdata_ind.OPT_offset,
3539 			    optlen, __TPI_ALIGN_SIZE);
3540 
3541 			if (opt == NULL) {
3542 				freemsg(mp);
3543 				error = EPROTO;
3544 				eprintsoline(so, error);
3545 				goto out;
3546 			}
3547 
3548 			ncontrollen = so_cmsglen(mp, opt, optlen,
3549 			    !(flags & MSG_XPG4_2));
3550 			if (controllen != 0)
3551 				controllen = ncontrollen;
3552 			else if (ncontrollen != 0)
3553 				msg->msg_flags |= MSG_CTRUNC;
3554 		} else {
3555 			controllen = 0;
3556 		}
3557 
3558 		if (controllen != 0) {
3559 			/*
3560 			 * Return control msg to caller.
3561 			 * Caller handles truncation if length
3562 			 * exceeds msg_controllen.
3563 			 */
3564 			control = kmem_zalloc(controllen, KM_SLEEP);
3565 
3566 			error = so_opt2cmsg(mp, opt, optlen,
3567 			    !(flags & MSG_XPG4_2),
3568 			    control, controllen);
3569 			if (error) {
3570 				freemsg(mp);
3571 				kmem_free(control, controllen);
3572 				eprintsoline(so, error);
3573 				goto out;
3574 			}
3575 			msg->msg_control = control;
3576 			msg->msg_controllen = controllen;
3577 		}
3578 
3579 		/*
3580 		 * Set msg_flags to MSG_EOR based on
3581 		 * DATA_flag and MOREDATA.
3582 		 */
3583 		mutex_enter(&so->so_lock);
3584 		so->so_state &= ~SS_SAVEDEOR;
3585 		if (!(tpr->data_ind.MORE_flag & 1)) {
3586 			if (!(rval.r_val1 & MOREDATA))
3587 				msg->msg_flags |= MSG_EOR;
3588 			else
3589 				so->so_state |= SS_SAVEDEOR;
3590 		}
3591 		freemsg(mp);
3592 		/*
3593 		 * If some data was received (i.e. not EOF) and the
3594 		 * read/recv* has not been satisfied wait for some more.
3595 		 * Not possible to wait if control info was received.
3596 		 */
3597 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3598 		    controllen == 0 &&
3599 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3600 			mutex_exit(&so->so_lock);
3601 			pflag = opflag | MSG_NOMARK;
3602 			goto retry;
3603 		}
3604 		goto out_locked;
3605 	}
3606 	case T_EXDATA_IND: {
3607 		dprintso(so, 1,
3608 		    ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3609 		    "state %s\n",
3610 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3611 		    saved_resid - uiop->uio_resid,
3612 		    pr_state(so->so_state, so->so_mode)));
3613 		/*
3614 		 * kstrgetmsg handles MSGMARK so there is nothing to
3615 		 * inspect in the T_EXDATA_IND.
3616 		 * strsock_proto makes the stream head queue the T_EXDATA_IND
3617 		 * as a separate message with no M_DATA component. Furthermore,
3618 		 * the stream head does not consolidate M_DATA messages onto
3619 		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3620 		 * remains a message by itself. This is needed since MSGMARK
3621 		 * marks both the whole message as well as the last byte
3622 		 * of the message.
3623 		 */
3624 		freemsg(mp);
3625 		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
3626 		if (flags & MSG_PEEK) {
3627 			/*
3628 			 * Even though we are peeking we consume the
3629 			 * T_EXDATA_IND thereby moving the mark information
3630 			 * to SS_RCVATMARK. Then the oob code below will
3631 			 * retry the peeking kstrgetmsg.
3632 			 * Note that the stream head read queue is
3633 			 * never flushed without holding SOREADLOCKED
3634 			 * thus the T_EXDATA_IND can not disappear
3635 			 * underneath us.
3636 			 */
3637 			dprintso(so, 1,
3638 			    ("sotpi_recvmsg: consume EXDATA_IND "
3639 			    "counts %d/%d state %s\n",
3640 			    sti->sti_oobsigcnt,
3641 			    sti->sti_oobcnt,
3642 			    pr_state(so->so_state, so->so_mode)));
3643 
3644 			pflag = MSG_ANY | MSG_DELAYERROR;
3645 			if (so->so_mode & SM_ATOMIC)
3646 				pflag |= MSG_DISCARDTAIL;
3647 
3648 			pri = 0;
3649 			mp = NULL;
3650 
3651 			error = kstrgetmsg(SOTOV(so), &mp, uiop,
3652 			    &pri, &pflag, (clock_t)-1, &rval);
3653 			ASSERT(uiop->uio_resid == saved_resid);
3654 
3655 			if (error) {
3656 #ifdef SOCK_DEBUG
3657 				if (error != EWOULDBLOCK && error != EINTR) {
3658 					eprintsoline(so, error);
3659 				}
3660 #endif /* SOCK_DEBUG */
3661 				goto out;
3662 			}
3663 			ASSERT(mp);
3664 			tpr = (union T_primitives *)mp->b_rptr;
3665 			ASSERT(tpr->type == T_EXDATA_IND);
3666 			freemsg(mp);
3667 		} /* end "if (flags & MSG_PEEK)" */
3668 
3669 		/*
3670 		 * Decrement the number of queued and pending oob.
3671 		 *
3672 		 * SS_RCVATMARK is cleared when we read past a mark.
3673 		 * SS_HAVEOOBDATA is cleared when we've read past the
3674 		 * last mark.
3675 		 * SS_OOBPEND is cleared if we've read past the last
3676 		 * mark and no (new) SIGURG has been posted.
3677 		 */
3678 		mutex_enter(&so->so_lock);
3679 		ASSERT(so_verify_oobstate(so));
3680 		ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3681 		ASSERT(sti->sti_oobsigcnt > 0);
3682 		sti->sti_oobsigcnt--;
3683 		ASSERT(sti->sti_oobcnt > 0);
3684 		sti->sti_oobcnt--;
3685 		/*
3686 		 * Since the T_EXDATA_IND has been removed from the stream
3687 		 * head, but we have not read data past the mark,
3688 		 * sockfs needs to track that the socket is still at the mark.
3689 		 *
3690 		 * Since no data was received call kstrgetmsg again to wait
3691 		 * for data.
3692 		 */
3693 		so->so_state |= SS_RCVATMARK;
3694 		mutex_exit(&so->so_lock);
3695 		dprintso(so, 1,
3696 		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3697 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3698 		    pr_state(so->so_state, so->so_mode)));
3699 		pflag = opflag;
3700 		goto retry;
3701 	}
3702 	default:
3703 		cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3704 		    (void *)so, tpr->type, (void *)mp);
3705 		ASSERT(0);
3706 		freemsg(mp);
3707 		error = EPROTO;
3708 		eprintsoline(so, error);
3709 		goto out;
3710 	}
3711 	/* NOTREACHED */
3712 out:
3713 	mutex_enter(&so->so_lock);
3714 out_locked:
3715 	if (so->so_direct != NULL) {
3716 		mutex_enter(so->so_direct->sod_lockp);
3717 		reterr = sod_rcv_done(so, suiop, uiop);
3718 		mutex_exit(so->so_direct->sod_lockp);
3719 	}
3720 	if (reterr != 0 && error == 0)
3721 		error = reterr;
3722 	so_unlock_read(so);	/* Clear SOREADLOCKED */
3723 	mutex_exit(&so->so_lock);
3724 	return (error);
3725 }
3726 
3727 /*
3728  * Sending data with options on a datagram socket.
3729  * Assumes caller has verified that SS_ISBOUND etc. are set.
3730  */
3731 static int
3732 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3733     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3734 {
3735 	struct T_unitdata_req	tudr;
3736 	mblk_t			*mp;
3737 	int			error;
3738 	void			*addr;
3739 	socklen_t		addrlen;
3740 	void			*src;
3741 	socklen_t		srclen;
3742 	ssize_t			len;
3743 	int			size;
3744 	struct T_opthdr		toh;
3745 	struct fdbuf		*fdbuf;
3746 	t_uscalar_t		optlen;
3747 	void			*fds;
3748 	int			fdlen;
3749 	sotpi_info_t		*sti = SOTOTPI(so);
3750 
3751 	ASSERT(name && namelen);
3752 	ASSERT(control && controllen);
3753 
3754 	len = uiop->uio_resid;
3755 	if (len > (ssize_t)sti->sti_tidu_size) {
3756 		return (EMSGSIZE);
3757 	}
3758 
3759 	/*
3760 	 * For AF_UNIX the destination address is translated to an internal
3761 	 * name and the source address is passed as an option.
3762 	 * Also, file descriptors are passed as file pointers in an
3763 	 * option.
3764 	 */
3765 
3766 	/*
3767 	 * Length and family checks.
3768 	 */
3769 	error = so_addr_verify(so, name, namelen);
3770 	if (error) {
3771 		eprintsoline(so, error);
3772 		return (error);
3773 	}
3774 	if (so->so_family == AF_UNIX) {
3775 		if (sti->sti_faddr_noxlate) {
3776 			/*
3777 			 * Already have a transport internal address. Do not
3778 			 * pass any (transport internal) source address.
3779 			 */
3780 			addr = name;
3781 			addrlen = namelen;
3782 			src = NULL;
3783 			srclen = 0;
3784 		} else {
3785 			/*
3786 			 * Pass the sockaddr_un source address as an option
3787 			 * and translate the remote address.
3788 			 *
3789 			 * Note that this code does not prevent sti_laddr_sa
3790 			 * from changing while it is being used. Thus
3791 			 * if an unbind+bind occurs concurrently with this
3792 			 * send the peer might see a partially new and a
3793 			 * partially old "from" address.
3794 			 */
3795 			src = sti->sti_laddr_sa;
3796 			srclen = (t_uscalar_t)sti->sti_laddr_len;
3797 			dprintso(so, 1,
3798 			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3799 			    srclen, src));
3800 			error = so_ux_addr_xlate(so, name, namelen,
3801 			    (flags & MSG_XPG4_2),
3802 			    &addr, &addrlen);
3803 			if (error) {
3804 				eprintsoline(so, error);
3805 				return (error);
3806 			}
3807 		}
3808 	} else {
3809 		addr = name;
3810 		addrlen = namelen;
3811 		src = NULL;
3812 		srclen = 0;
3813 	}
3814 	optlen = so_optlen(control, controllen,
3815 	    !(flags & MSG_XPG4_2));
3816 	tudr.PRIM_type = T_UNITDATA_REQ;
3817 	tudr.DEST_length = addrlen;
3818 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3819 	if (srclen != 0)
3820 		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3821 		    _TPI_ALIGN_TOPT(srclen));
3822 	else
3823 		tudr.OPT_length = optlen;
3824 	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3825 	    _TPI_ALIGN_TOPT(addrlen));
3826 
3827 	size = tudr.OPT_offset + tudr.OPT_length;
3828 
3829 	/*
3830 	 * File descriptors only when SM_FDPASSING set.
3831 	 */
3832 	error = so_getfdopt(control, controllen,
3833 	    !(flags & MSG_XPG4_2), &fds, &fdlen);
3834 	if (error)
3835 		return (error);
3836 	if (fdlen != -1) {
3837 		if (!(so->so_mode & SM_FDPASSING))
3838 			return (EOPNOTSUPP);
3839 
3840 		error = fdbuf_create(fds, fdlen, &fdbuf);
3841 		if (error)
3842 			return (error);
3843 		mp = fdbuf_allocmsg(size, fdbuf);
3844 	} else {
3845 		mp = soallocproto(size, _ALLOC_INTR, CRED());
3846 		if (mp == NULL) {
3847 			/*
3848 			 * Caught a signal waiting for memory.
3849 			 * Let send* return EINTR.
3850 			 */
3851 			return (EINTR);
3852 		}
3853 	}
3854 	soappendmsg(mp, &tudr, sizeof (tudr));
3855 	soappendmsg(mp, addr, addrlen);
3856 	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3857 
3858 	if (fdlen != -1) {
3859 		ASSERT(fdbuf != NULL);
3860 		toh.level = SOL_SOCKET;
3861 		toh.name = SO_FILEP;
3862 		toh.len = fdbuf->fd_size +
3863 		    (t_uscalar_t)sizeof (struct T_opthdr);
3864 		toh.status = 0;
3865 		soappendmsg(mp, &toh, sizeof (toh));
3866 		soappendmsg(mp, fdbuf, fdbuf->fd_size);
3867 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3868 	}
3869 	if (srclen != 0) {
3870 		/*
3871 		 * There is a AF_UNIX sockaddr_un to include as a source
3872 		 * address option.
3873 		 */
3874 		toh.level = SOL_SOCKET;
3875 		toh.name = SO_SRCADDR;
3876 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3877 		toh.status = 0;
3878 		soappendmsg(mp, &toh, sizeof (toh));
3879 		soappendmsg(mp, src, srclen);
3880 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3881 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3882 	}
3883 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3884 	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3885 	/* At most 3 bytes left in the message */
3886 	ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3887 	ASSERT(MBLKL(mp) <= (ssize_t)size);
3888 
3889 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3890 	if (audit_active)
3891 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3892 
3893 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3894 #ifdef SOCK_DEBUG
3895 	if (error) {
3896 		eprintsoline(so, error);
3897 	}
3898 #endif /* SOCK_DEBUG */
3899 	return (error);
3900 }
3901 
3902 /*
3903  * Sending data with options on a connected stream socket.
3904  * Assumes caller has verified that SS_ISCONNECTED is set.
3905  */
3906 static int
3907 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3908     t_uscalar_t controllen, int flags)
3909 {
3910 	struct T_optdata_req	tdr;
3911 	mblk_t			*mp;
3912 	int			error;
3913 	ssize_t			iosize;
3914 	int			size;
3915 	struct fdbuf		*fdbuf;
3916 	t_uscalar_t		optlen;
3917 	void			*fds;
3918 	int			fdlen;
3919 	struct T_opthdr		toh;
3920 	sotpi_info_t		*sti = SOTOTPI(so);
3921 
3922 	dprintso(so, 1,
3923 	    ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3924 
3925 	/*
3926 	 * Has to be bound and connected. However, since no locks are
3927 	 * held the state could have changed after sotpi_sendmsg checked it
3928 	 * thus it is not possible to ASSERT on the state.
3929 	 */
3930 
3931 	/* Options on connection-oriented only when SM_OPTDATA set. */
3932 	if (!(so->so_mode & SM_OPTDATA))
3933 		return (EOPNOTSUPP);
3934 
3935 	do {
3936 		/*
3937 		 * Set the MORE flag if uio_resid does not fit in this
3938 		 * message or if the caller passed in "more".
3939 		 * Error for transports with zero tidu_size.
3940 		 */
3941 		tdr.PRIM_type = T_OPTDATA_REQ;
3942 		iosize = sti->sti_tidu_size;
3943 		if (iosize <= 0)
3944 			return (EMSGSIZE);
3945 		if (uiop->uio_resid > iosize) {
3946 			tdr.DATA_flag = 1;
3947 		} else {
3948 			if (more)
3949 				tdr.DATA_flag = 1;
3950 			else
3951 				tdr.DATA_flag = 0;
3952 			iosize = uiop->uio_resid;
3953 		}
3954 		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3955 		    tdr.DATA_flag, iosize));
3956 
3957 		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3958 		tdr.OPT_length = optlen;
3959 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3960 
3961 		size = (int)sizeof (tdr) + optlen;
3962 		/*
3963 		 * File descriptors only when SM_FDPASSING set.
3964 		 */
3965 		error = so_getfdopt(control, controllen,
3966 		    !(flags & MSG_XPG4_2), &fds, &fdlen);
3967 		if (error)
3968 			return (error);
3969 		if (fdlen != -1) {
3970 			if (!(so->so_mode & SM_FDPASSING))
3971 				return (EOPNOTSUPP);
3972 
3973 			error = fdbuf_create(fds, fdlen, &fdbuf);
3974 			if (error)
3975 				return (error);
3976 			mp = fdbuf_allocmsg(size, fdbuf);
3977 		} else {
3978 			mp = soallocproto(size, _ALLOC_INTR, CRED());
3979 			if (mp == NULL) {
3980 				/*
3981 				 * Caught a signal waiting for memory.
3982 				 * Let send* return EINTR.
3983 				 */
3984 				return (EINTR);
3985 			}
3986 		}
3987 		soappendmsg(mp, &tdr, sizeof (tdr));
3988 
3989 		if (fdlen != -1) {
3990 			ASSERT(fdbuf != NULL);
3991 			toh.level = SOL_SOCKET;
3992 			toh.name = SO_FILEP;
3993 			toh.len = fdbuf->fd_size +
3994 			    (t_uscalar_t)sizeof (struct T_opthdr);
3995 			toh.status = 0;
3996 			soappendmsg(mp, &toh, sizeof (toh));
3997 			soappendmsg(mp, fdbuf, fdbuf->fd_size);
3998 			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3999 		}
4000 		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
4001 		/* At most 3 bytes left in the message */
4002 		ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
4003 		ASSERT(MBLKL(mp) <= (ssize_t)size);
4004 
4005 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
4006 
4007 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4008 		    0, MSG_BAND, 0);
4009 		if (error) {
4010 			eprintsoline(so, error);
4011 			return (error);
4012 		}
4013 		control = NULL;
4014 		if (uiop->uio_resid > 0) {
4015 			/*
4016 			 * Recheck for fatal errors. Fail write even though
4017 			 * some data have been written. This is consistent
4018 			 * with strwrite semantics and BSD sockets semantics.
4019 			 */
4020 			if (so->so_state & SS_CANTSENDMORE) {
4021 				eprintsoline(so, error);
4022 				return (EPIPE);
4023 			}
4024 			if (so->so_error != 0) {
4025 				mutex_enter(&so->so_lock);
4026 				error = sogeterr(so, B_TRUE);
4027 				mutex_exit(&so->so_lock);
4028 				if (error != 0) {
4029 					eprintsoline(so, error);
4030 					return (error);
4031 				}
4032 			}
4033 		}
4034 	} while (uiop->uio_resid > 0);
4035 	return (0);
4036 }
4037 
4038 /*
4039  * Sending data on a datagram socket.
4040  * Assumes caller has verified that SS_ISBOUND etc. are set.
4041  *
4042  * For AF_UNIX the destination address is translated to an internal
4043  * name and the source address is passed as an option.
4044  */
4045 int
4046 sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
4047     struct uio *uiop, int flags)
4048 {
4049 	struct T_unitdata_req	tudr;
4050 	mblk_t			*mp;
4051 	int			error;
4052 	void			*addr;
4053 	socklen_t		addrlen;
4054 	void			*src;
4055 	socklen_t		srclen;
4056 	ssize_t			len;
4057 	sotpi_info_t		*sti = SOTOTPI(so);
4058 
4059 	ASSERT(name != NULL && namelen != 0);
4060 
4061 	len = uiop->uio_resid;
4062 	if (len > sti->sti_tidu_size) {
4063 		error = EMSGSIZE;
4064 		goto done;
4065 	}
4066 
4067 	/* Length and family checks */
4068 	error = so_addr_verify(so, name, namelen);
4069 	if (error != 0)
4070 		goto done;
4071 
4072 	if (sti->sti_direct)
4073 		return (sodgram_direct(so, name, namelen, uiop, flags));
4074 
4075 	if (so->so_family == AF_UNIX) {
4076 		if (sti->sti_faddr_noxlate) {
4077 			/*
4078 			 * Already have a transport internal address. Do not
4079 			 * pass any (transport internal) source address.
4080 			 */
4081 			addr = name;
4082 			addrlen = namelen;
4083 			src = NULL;
4084 			srclen = 0;
4085 		} else {
4086 			/*
4087 			 * Pass the sockaddr_un source address as an option
4088 			 * and translate the remote address.
4089 			 *
4090 			 * Note that this code does not prevent sti_laddr_sa
4091 			 * from changing while it is being used. Thus
4092 			 * if an unbind+bind occurs concurrently with this
4093 			 * send the peer might see a partially new and a
4094 			 * partially old "from" address.
4095 			 */
4096 			src = sti->sti_laddr_sa;
4097 			srclen = (socklen_t)sti->sti_laddr_len;
4098 			dprintso(so, 1,
4099 			    ("sosend_dgram UNIX: srclen %d, src %p\n",
4100 			    srclen, src));
4101 			error = so_ux_addr_xlate(so, name, namelen,
4102 			    (flags & MSG_XPG4_2),
4103 			    &addr, &addrlen);
4104 			if (error) {
4105 				eprintsoline(so, error);
4106 				goto done;
4107 			}
4108 		}
4109 	} else {
4110 		addr = name;
4111 		addrlen = namelen;
4112 		src = NULL;
4113 		srclen = 0;
4114 	}
4115 	tudr.PRIM_type = T_UNITDATA_REQ;
4116 	tudr.DEST_length = addrlen;
4117 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4118 	if (srclen == 0) {
4119 		tudr.OPT_length = 0;
4120 		tudr.OPT_offset = 0;
4121 
4122 		mp = soallocproto2(&tudr, sizeof (tudr),
4123 		    addr, addrlen, 0, _ALLOC_INTR, CRED());
4124 		if (mp == NULL) {
4125 			/*
4126 			 * Caught a signal waiting for memory.
4127 			 * Let send* return EINTR.
4128 			 */
4129 			error = EINTR;
4130 			goto done;
4131 		}
4132 	} else {
4133 		/*
4134 		 * There is a AF_UNIX sockaddr_un to include as a source
4135 		 * address option.
4136 		 */
4137 		struct T_opthdr toh;
4138 		ssize_t size;
4139 
4140 		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
4141 		    _TPI_ALIGN_TOPT(srclen));
4142 		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
4143 		    _TPI_ALIGN_TOPT(addrlen));
4144 
4145 		toh.level = SOL_SOCKET;
4146 		toh.name = SO_SRCADDR;
4147 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
4148 		toh.status = 0;
4149 
4150 		size = tudr.OPT_offset + tudr.OPT_length;
4151 		mp = soallocproto2(&tudr, sizeof (tudr),
4152 		    addr, addrlen, size, _ALLOC_INTR, CRED());
4153 		if (mp == NULL) {
4154 			/*
4155 			 * Caught a signal waiting for memory.
4156 			 * Let send* return EINTR.
4157 			 */
4158 			error = EINTR;
4159 			goto done;
4160 		}
4161 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
4162 		soappendmsg(mp, &toh, sizeof (toh));
4163 		soappendmsg(mp, src, srclen);
4164 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
4165 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
4166 	}
4167 
4168 	if (audit_active)
4169 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4170 
4171 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4172 done:
4173 #ifdef SOCK_DEBUG
4174 	if (error) {
4175 		eprintsoline(so, error);
4176 	}
4177 #endif /* SOCK_DEBUG */
4178 	return (error);
4179 }
4180 
4181 /*
4182  * Sending data on a connected stream socket.
4183  * Assumes caller has verified that SS_ISCONNECTED is set.
4184  */
4185 int
4186 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
4187     int sflag)
4188 {
4189 	struct T_data_req	tdr;
4190 	mblk_t			*mp;
4191 	int			error;
4192 	ssize_t			iosize;
4193 	sotpi_info_t		*sti = SOTOTPI(so);
4194 
4195 	dprintso(so, 1,
4196 	    ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
4197 	    (void *)so, uiop->uio_resid, prim, sflag));
4198 
4199 	/*
4200 	 * Has to be bound and connected. However, since no locks are
4201 	 * held the state could have changed after sotpi_sendmsg checked it
4202 	 * thus it is not possible to ASSERT on the state.
4203 	 */
4204 
4205 	do {
4206 		/*
4207 		 * Set the MORE flag if uio_resid does not fit in this
4208 		 * message or if the caller passed in "more".
4209 		 * Error for transports with zero tidu_size.
4210 		 */
4211 		tdr.PRIM_type = prim;
4212 		iosize = sti->sti_tidu_size;
4213 		if (iosize <= 0)
4214 			return (EMSGSIZE);
4215 		if (uiop->uio_resid > iosize) {
4216 			tdr.MORE_flag = 1;
4217 		} else {
4218 			if (more)
4219 				tdr.MORE_flag = 1;
4220 			else
4221 				tdr.MORE_flag = 0;
4222 			iosize = uiop->uio_resid;
4223 		}
4224 		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4225 		    prim, tdr.MORE_flag, iosize));
4226 		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
4227 		if (mp == NULL) {
4228 			/*
4229 			 * Caught a signal waiting for memory.
4230 			 * Let send* return EINTR.
4231 			 */
4232 			return (EINTR);
4233 		}
4234 
4235 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4236 		    0, sflag | MSG_BAND, 0);
4237 		if (error) {
4238 			eprintsoline(so, error);
4239 			return (error);
4240 		}
4241 		if (uiop->uio_resid > 0) {
4242 			/*
4243 			 * Recheck for fatal errors. Fail write even though
4244 			 * some data have been written. This is consistent
4245 			 * with strwrite semantics and BSD sockets semantics.
4246 			 */
4247 			if (so->so_state & SS_CANTSENDMORE) {
4248 				eprintsoline(so, error);
4249 				return (EPIPE);
4250 			}
4251 			if (so->so_error != 0) {
4252 				mutex_enter(&so->so_lock);
4253 				error = sogeterr(so, B_TRUE);
4254 				mutex_exit(&so->so_lock);
4255 				if (error != 0) {
4256 					eprintsoline(so, error);
4257 					return (error);
4258 				}
4259 			}
4260 		}
4261 	} while (uiop->uio_resid > 0);
4262 	return (0);
4263 }
4264 
4265 /*
4266  * Check the state for errors and call the appropriate send function.
4267  *
4268  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4269  * this function issues a setsockopt to toggle SO_DONTROUTE before and
4270  * after sending the message.
4271  */
4272 static int
4273 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
4274     struct cred *cr)
4275 {
4276 	int		so_state;
4277 	int		so_mode;
4278 	int		error;
4279 	struct sockaddr *name;
4280 	t_uscalar_t	namelen;
4281 	int		dontroute;
4282 	int		flags;
4283 	sotpi_info_t	*sti = SOTOTPI(so);
4284 
4285 	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4286 	    (void *)so, (void *)msg, msg->msg_flags,
4287 	    pr_state(so->so_state, so->so_mode), so->so_error));
4288 
4289 	if (so->so_version == SOV_STREAM) {
4290 		/* The imaginary "sockmod" has been popped - act as a stream */
4291 		so_update_attrs(so, SOMOD);
4292 		return (strwrite(SOTOV(so), uiop, cr));
4293 	}
4294 
4295 	mutex_enter(&so->so_lock);
4296 	so_state = so->so_state;
4297 
4298 	if (so_state & SS_CANTSENDMORE) {
4299 		mutex_exit(&so->so_lock);
4300 		return (EPIPE);
4301 	}
4302 
4303 	if (so->so_error != 0) {
4304 		error = sogeterr(so, B_TRUE);
4305 		if (error != 0) {
4306 			mutex_exit(&so->so_lock);
4307 			return (error);
4308 		}
4309 	}
4310 
4311 	name = (struct sockaddr *)msg->msg_name;
4312 	namelen = msg->msg_namelen;
4313 
4314 	so_mode = so->so_mode;
4315 
4316 	if (name == NULL) {
4317 		if (!(so_state & SS_ISCONNECTED)) {
4318 			mutex_exit(&so->so_lock);
4319 			if (so_mode & SM_CONNREQUIRED)
4320 				return (ENOTCONN);
4321 			else
4322 				return (EDESTADDRREQ);
4323 		}
4324 		if (so_mode & SM_CONNREQUIRED) {
4325 			name = NULL;
4326 			namelen = 0;
4327 		} else {
4328 			/*
4329 			 * Note that this code does not prevent sti_faddr_sa
4330 			 * from changing while it is being used. Thus
4331 			 * if an "unconnect"+connect occurs concurrently with
4332 			 * this send the datagram might be delivered to a
4333 			 * garbaled address.
4334 			 */
4335 			ASSERT(sti->sti_faddr_sa);
4336 			name = sti->sti_faddr_sa;
4337 			namelen = (t_uscalar_t)sti->sti_faddr_len;
4338 		}
4339 	} else {
4340 		if (!(so_state & SS_ISCONNECTED) &&
4341 		    (so_mode & SM_CONNREQUIRED)) {
4342 			/* Required but not connected */
4343 			mutex_exit(&so->so_lock);
4344 			return (ENOTCONN);
4345 		}
4346 		/*
4347 		 * Ignore the address on connection-oriented sockets.
4348 		 * Just like BSD this code does not generate an error for
4349 		 * TCP (a CONNREQUIRED socket) when sending to an address
4350 		 * passed in with sendto/sendmsg. Instead the data is
4351 		 * delivered on the connection as if no address had been
4352 		 * supplied.
4353 		 */
4354 		if ((so_state & SS_ISCONNECTED) &&
4355 		    !(so_mode & SM_CONNREQUIRED)) {
4356 			mutex_exit(&so->so_lock);
4357 			return (EISCONN);
4358 		}
4359 		if (!(so_state & SS_ISBOUND)) {
4360 			so_lock_single(so);	/* Set SOLOCKED */
4361 			error = sotpi_bind(so, NULL, 0,
4362 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4363 			so_unlock_single(so, SOLOCKED);
4364 			if (error) {
4365 				mutex_exit(&so->so_lock);
4366 				eprintsoline(so, error);
4367 				return (error);
4368 			}
4369 		}
4370 		/*
4371 		 * Handle delayed datagram errors. These are only queued
4372 		 * when the application sets SO_DGRAM_ERRIND.
4373 		 * Return the error if we are sending to the address
4374 		 * that was returned in the last T_UDERROR_IND.
4375 		 * If sending to some other address discard the delayed
4376 		 * error indication.
4377 		 */
4378 		if (sti->sti_delayed_error) {
4379 			struct T_uderror_ind	*tudi;
4380 			void			*addr;
4381 			t_uscalar_t		addrlen;
4382 			boolean_t		match = B_FALSE;
4383 
4384 			ASSERT(sti->sti_eaddr_mp);
4385 			error = sti->sti_delayed_error;
4386 			sti->sti_delayed_error = 0;
4387 			tudi =
4388 			    (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4389 			addrlen = tudi->DEST_length;
4390 			addr = sogetoff(sti->sti_eaddr_mp,
4391 			    tudi->DEST_offset, addrlen, 1);
4392 			ASSERT(addr);	/* Checked by strsock_proto */
4393 			switch (so->so_family) {
4394 			case AF_INET: {
4395 				/* Compare just IP address and port */
4396 				sin_t *sin1 = (sin_t *)name;
4397 				sin_t *sin2 = (sin_t *)addr;
4398 
4399 				if (addrlen == sizeof (sin_t) &&
4400 				    namelen == addrlen &&
4401 				    sin1->sin_port == sin2->sin_port &&
4402 				    sin1->sin_addr.s_addr ==
4403 				    sin2->sin_addr.s_addr)
4404 					match = B_TRUE;
4405 				break;
4406 			}
4407 			case AF_INET6: {
4408 				/* Compare just IP address and port. Not flow */
4409 				sin6_t *sin1 = (sin6_t *)name;
4410 				sin6_t *sin2 = (sin6_t *)addr;
4411 
4412 				if (addrlen == sizeof (sin6_t) &&
4413 				    namelen == addrlen &&
4414 				    sin1->sin6_port == sin2->sin6_port &&
4415 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4416 				    &sin2->sin6_addr))
4417 					match = B_TRUE;
4418 				break;
4419 			}
4420 			case AF_UNIX:
4421 			default:
4422 				if (namelen == addrlen &&
4423 				    bcmp(name, addr, namelen) == 0)
4424 					match = B_TRUE;
4425 			}
4426 			if (match) {
4427 				freemsg(sti->sti_eaddr_mp);
4428 				sti->sti_eaddr_mp = NULL;
4429 				mutex_exit(&so->so_lock);
4430 #ifdef DEBUG
4431 				dprintso(so, 0,
4432 				    ("sockfs delayed error %d for %s\n",
4433 				    error,
4434 				    pr_addr(so->so_family, name, namelen)));
4435 #endif /* DEBUG */
4436 				return (error);
4437 			}
4438 			freemsg(sti->sti_eaddr_mp);
4439 			sti->sti_eaddr_mp = NULL;
4440 		}
4441 	}
4442 	mutex_exit(&so->so_lock);
4443 
4444 	flags = msg->msg_flags;
4445 	dontroute = 0;
4446 	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4447 		uint32_t	val;
4448 
4449 		val = 1;
4450 		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4451 		    &val, (t_uscalar_t)sizeof (val), cr);
4452 		if (error)
4453 			return (error);
4454 		dontroute = 1;
4455 	}
4456 
4457 	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4458 		error = EOPNOTSUPP;
4459 		goto done;
4460 	}
4461 	if (msg->msg_controllen != 0) {
4462 		if (!(so_mode & SM_CONNREQUIRED)) {
4463 			so_update_attrs(so, SOMOD);
4464 			error = sosend_dgramcmsg(so, name, namelen, uiop,
4465 			    msg->msg_control, msg->msg_controllen, flags);
4466 		} else {
4467 			if (flags & MSG_OOB) {
4468 				/* Can't generate T_EXDATA_REQ with options */
4469 				error = EOPNOTSUPP;
4470 				goto done;
4471 			}
4472 			so_update_attrs(so, SOMOD);
4473 			error = sosend_svccmsg(so, uiop,
4474 			    !(flags & MSG_EOR),
4475 			    msg->msg_control, msg->msg_controllen,
4476 			    flags);
4477 		}
4478 		goto done;
4479 	}
4480 
4481 	so_update_attrs(so, SOMOD);
4482 	if (!(so_mode & SM_CONNREQUIRED)) {
4483 		/*
4484 		 * If there is no SO_DONTROUTE to turn off return immediately
4485 		 * from send_dgram. This can allow tail-call optimizations.
4486 		 */
4487 		if (!dontroute) {
4488 			return (sosend_dgram(so, name, namelen, uiop, flags));
4489 		}
4490 		error = sosend_dgram(so, name, namelen, uiop, flags);
4491 	} else {
4492 		t_scalar_t prim;
4493 		int sflag;
4494 
4495 		/* Ignore msg_name in the connected state */
4496 		if (flags & MSG_OOB) {
4497 			prim = T_EXDATA_REQ;
4498 			/*
4499 			 * Send down T_EXDATA_REQ even if there is flow
4500 			 * control for data.
4501 			 */
4502 			sflag = MSG_IGNFLOW;
4503 		} else {
4504 			if (so_mode & SM_BYTESTREAM) {
4505 				/* Byte stream transport - use write */
4506 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4507 
4508 				/* Send M_DATA messages */
4509 				if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
4510 				    (error = nl7c_data(so, uiop)) >= 0) {
4511 					/* NL7C consumed the data */
4512 					return (error);
4513 				}
4514 				/*
4515 				 * If there is no SO_DONTROUTE to turn off,
4516 				 * sti_direct is on, and there is no flow
4517 				 * control, we can take the fast path.
4518 				 */
4519 				if (!dontroute && sti->sti_direct != 0 &&
4520 				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4521 					return (sostream_direct(so, uiop,
4522 					    NULL, cr));
4523 				}
4524 				error = strwrite(SOTOV(so), uiop, cr);
4525 				goto done;
4526 			}
4527 			prim = T_DATA_REQ;
4528 			sflag = 0;
4529 		}
4530 		/*
4531 		 * If there is no SO_DONTROUTE to turn off return immediately
4532 		 * from sosend_svc. This can allow tail-call optimizations.
4533 		 */
4534 		if (!dontroute)
4535 			return (sosend_svc(so, uiop, prim,
4536 			    !(flags & MSG_EOR), sflag));
4537 		error = sosend_svc(so, uiop, prim,
4538 		    !(flags & MSG_EOR), sflag);
4539 	}
4540 	ASSERT(dontroute);
4541 done:
4542 	if (dontroute) {
4543 		uint32_t	val;
4544 
4545 		val = 0;
4546 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4547 		    &val, (t_uscalar_t)sizeof (val), cr);
4548 	}
4549 	return (error);
4550 }
4551 
4552 /*
4553  * kstrwritemp() has very similar semantics as that of strwrite().
4554  * The main difference is it obtains mblks from the caller and also
4555  * does not do any copy as done in strwrite() from user buffers to
4556  * kernel buffers.
4557  *
4558  * Currently, this routine is used by sendfile to send data allocated
4559  * within the kernel without any copying. This interface does not use the
4560  * synchronous stream interface as synch. stream interface implies
4561  * copying.
4562  */
4563 int
4564 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4565 {
4566 	struct stdata *stp;
4567 	struct queue *wqp;
4568 	mblk_t *newmp;
4569 	char waitflag;
4570 	int tempmode;
4571 	int error = 0;
4572 	int done = 0;
4573 	struct sonode *so;
4574 	boolean_t direct;
4575 
4576 	ASSERT(vp->v_stream);
4577 	stp = vp->v_stream;
4578 
4579 	so = VTOSO(vp);
4580 	direct = _SOTOTPI(so)->sti_direct;
4581 
4582 	/*
4583 	 * This is the sockfs direct fast path. canputnext() need
4584 	 * not be accurate so we don't grab the sd_lock here. If
4585 	 * we get flow-controlled, we grab sd_lock just before the
4586 	 * do..while loop below to emulate what strwrite() does.
4587 	 */
4588 	wqp = stp->sd_wrq;
4589 	if (canputnext(wqp) && direct &&
4590 	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4591 		return (sostream_direct(so, NULL, mp, CRED()));
4592 	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4593 		/* Fast check of flags before acquiring the lock */
4594 		mutex_enter(&stp->sd_lock);
4595 		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4596 		mutex_exit(&stp->sd_lock);
4597 		if (error != 0) {
4598 			if (!(stp->sd_flag & STPLEX) &&
4599 			    (stp->sd_wput_opt & SW_SIGPIPE)) {
4600 				error = EPIPE;
4601 			}
4602 			return (error);
4603 		}
4604 	}
4605 
4606 	waitflag = WRITEWAIT;
4607 	if (stp->sd_flag & OLDNDELAY)
4608 		tempmode = fmode & ~FNDELAY;
4609 	else
4610 		tempmode = fmode;
4611 
4612 	mutex_enter(&stp->sd_lock);
4613 	do {
4614 		if (canputnext(wqp)) {
4615 			mutex_exit(&stp->sd_lock);
4616 			if (stp->sd_wputdatafunc != NULL) {
4617 				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4618 				    NULL, NULL, NULL);
4619 				if (newmp == NULL) {
4620 					/* The caller will free mp */
4621 					return (ECOMM);
4622 				}
4623 				mp = newmp;
4624 			}
4625 			putnext(wqp, mp);
4626 			return (0);
4627 		}
4628 		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4629 		    &done);
4630 	} while (error == 0 && !done);
4631 
4632 	mutex_exit(&stp->sd_lock);
4633 	/*
4634 	 * EAGAIN tells the application to try again. ENOMEM
4635 	 * is returned only if the memory allocation size
4636 	 * exceeds the physical limits of the system. ENOMEM
4637 	 * can't be true here.
4638 	 */
4639 	if (error == ENOMEM)
4640 		error = EAGAIN;
4641 	return (error);
4642 }
4643 
4644 /* ARGSUSED */
4645 static int
4646 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4647     struct cred *cr, mblk_t **mpp)
4648 {
4649 	int error;
4650 
4651 	if (so->so_family != AF_INET && so->so_family != AF_INET6)
4652 		return (EAFNOSUPPORT);
4653 
4654 	if (so->so_state & SS_CANTSENDMORE)
4655 		return (EPIPE);
4656 
4657 	if (so->so_type != SOCK_STREAM)
4658 		return (EOPNOTSUPP);
4659 
4660 	if ((so->so_state & SS_ISCONNECTED) == 0)
4661 		return (ENOTCONN);
4662 
4663 	error = kstrwritemp(so->so_vnode, *mpp, fflag);
4664 	if (error == 0)
4665 		*mpp = NULL;
4666 	return (error);
4667 }
4668 
4669 /*
4670  * Sending data on a datagram socket.
4671  * Assumes caller has verified that SS_ISBOUND etc. are set.
4672  */
4673 /* ARGSUSED */
4674 static int
4675 sodgram_direct(struct sonode *so, struct sockaddr *name,
4676     socklen_t namelen, struct uio *uiop, int flags)
4677 {
4678 	struct T_unitdata_req	tudr;
4679 	mblk_t			*mp = NULL;
4680 	int			error = 0;
4681 	void			*addr;
4682 	socklen_t		addrlen;
4683 	ssize_t			len;
4684 	struct stdata		*stp = SOTOV(so)->v_stream;
4685 	int			so_state;
4686 	queue_t			*udp_wq;
4687 	boolean_t		connected;
4688 	mblk_t			*mpdata = NULL;
4689 	sotpi_info_t		*sti = SOTOTPI(so);
4690 
4691 	ASSERT(name != NULL && namelen != 0);
4692 	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4693 	ASSERT(!(so->so_mode & SM_EXDATA));
4694 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4695 	ASSERT(SOTOV(so)->v_type == VSOCK);
4696 
4697 	/* Caller checked for proper length */
4698 	len = uiop->uio_resid;
4699 	ASSERT(len <= sti->sti_tidu_size);
4700 
4701 	/* Length and family checks have been done by caller */
4702 	ASSERT(name->sa_family == so->so_family);
4703 	ASSERT(so->so_family == AF_INET ||
4704 	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4705 	ASSERT(so->so_family == AF_INET6 ||
4706 	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4707 
4708 	addr = name;
4709 	addrlen = namelen;
4710 
4711 	if (stp->sd_sidp != NULL &&
4712 	    (error = straccess(stp, JCWRITE)) != 0)
4713 		goto done;
4714 
4715 	so_state = so->so_state;
4716 
4717 	connected = so_state & SS_ISCONNECTED;
4718 	if (!connected) {
4719 		tudr.PRIM_type = T_UNITDATA_REQ;
4720 		tudr.DEST_length = addrlen;
4721 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4722 		tudr.OPT_length = 0;
4723 		tudr.OPT_offset = 0;
4724 
4725 		mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4726 		    _ALLOC_INTR, CRED());
4727 		if (mp == NULL) {
4728 			/*
4729 			 * Caught a signal waiting for memory.
4730 			 * Let send* return EINTR.
4731 			 */
4732 			error = EINTR;
4733 			goto done;
4734 		}
4735 	}
4736 
4737 	/*
4738 	 * For UDP we don't break up the copyin into smaller pieces
4739 	 * as in the TCP case.  That means if ENOMEM is returned by
4740 	 * mcopyinuio() then the uio vector has not been modified at
4741 	 * all and we fallback to either strwrite() or kstrputmsg()
4742 	 * below.  Note also that we never generate priority messages
4743 	 * from here.
4744 	 */
4745 	udp_wq = stp->sd_wrq->q_next;
4746 	if (canput(udp_wq) &&
4747 	    (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4748 		ASSERT(DB_TYPE(mpdata) == M_DATA);
4749 		ASSERT(uiop->uio_resid == 0);
4750 		if (!connected)
4751 			linkb(mp, mpdata);
4752 		else
4753 			mp = mpdata;
4754 		if (audit_active)
4755 			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4756 
4757 		udp_wput(udp_wq, mp);
4758 		return (0);
4759 	}
4760 
4761 	ASSERT(mpdata == NULL);
4762 	if (error != 0 && error != ENOMEM) {
4763 		freemsg(mp);
4764 		return (error);
4765 	}
4766 
4767 	/*
4768 	 * For connected, let strwrite() handle the blocking case.
4769 	 * Otherwise we fall thru and use kstrputmsg().
4770 	 */
4771 	if (connected)
4772 		return (strwrite(SOTOV(so), uiop, CRED()));
4773 
4774 	if (audit_active)
4775 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4776 
4777 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4778 done:
4779 #ifdef SOCK_DEBUG
4780 	if (error != 0) {
4781 		eprintsoline(so, error);
4782 	}
4783 #endif /* SOCK_DEBUG */
4784 	return (error);
4785 }
4786 
4787 int
4788 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4789 {
4790 	struct stdata *stp = SOTOV(so)->v_stream;
4791 	ssize_t iosize, rmax, maxblk;
4792 	queue_t *tcp_wq = stp->sd_wrq->q_next;
4793 	mblk_t *newmp;
4794 	int error = 0, wflag = 0;
4795 
4796 	ASSERT(so->so_mode & SM_BYTESTREAM);
4797 	ASSERT(SOTOV(so)->v_type == VSOCK);
4798 
4799 	if (stp->sd_sidp != NULL &&
4800 	    (error = straccess(stp, JCWRITE)) != 0)
4801 		return (error);
4802 
4803 	if (uiop == NULL) {
4804 		/*
4805 		 * kstrwritemp() should have checked sd_flag and
4806 		 * flow-control before coming here.  If we end up
4807 		 * here it means that we can simply pass down the
4808 		 * data to tcp.
4809 		 */
4810 		ASSERT(mp != NULL);
4811 		if (stp->sd_wputdatafunc != NULL) {
4812 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4813 			    NULL, NULL, NULL);
4814 			if (newmp == NULL) {
4815 				/* The caller will free mp */
4816 				return (ECOMM);
4817 			}
4818 			mp = newmp;
4819 		}
4820 		tcp_wput(tcp_wq, mp);
4821 		return (0);
4822 	}
4823 
4824 	/* Fallback to strwrite() to do proper error handling */
4825 	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4826 		return (strwrite(SOTOV(so), uiop, cr));
4827 
4828 	rmax = stp->sd_qn_maxpsz;
4829 	ASSERT(rmax >= 0 || rmax == INFPSZ);
4830 	if (rmax == 0 || uiop->uio_resid <= 0)
4831 		return (0);
4832 
4833 	if (rmax == INFPSZ)
4834 		rmax = uiop->uio_resid;
4835 
4836 	maxblk = stp->sd_maxblk;
4837 
4838 	for (;;) {
4839 		iosize = MIN(uiop->uio_resid, rmax);
4840 
4841 		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4842 		if (mp == NULL) {
4843 			/*
4844 			 * Fallback to strwrite() for ENOMEM; if this
4845 			 * is our first time in this routine and the uio
4846 			 * vector has not been modified, we will end up
4847 			 * calling strwrite() without any flag set.
4848 			 */
4849 			if (error == ENOMEM)
4850 				goto slow_send;
4851 			else
4852 				return (error);
4853 		}
4854 		ASSERT(uiop->uio_resid >= 0);
4855 		/*
4856 		 * If mp is non-NULL and ENOMEM is set, it means that
4857 		 * mcopyinuio() was able to break down some of the user
4858 		 * data into one or more mblks.  Send the partial data
4859 		 * to tcp and let the rest be handled in strwrite().
4860 		 */
4861 		ASSERT(error == 0 || error == ENOMEM);
4862 		if (stp->sd_wputdatafunc != NULL) {
4863 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4864 			    NULL, NULL, NULL);
4865 			if (newmp == NULL) {
4866 				/* The caller will free mp */
4867 				return (ECOMM);
4868 			}
4869 			mp = newmp;
4870 		}
4871 		tcp_wput(tcp_wq, mp);
4872 
4873 		wflag |= NOINTR;
4874 
4875 		if (uiop->uio_resid == 0) {	/* No more data; we're done */
4876 			ASSERT(error == 0);
4877 			break;
4878 		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4879 		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4880 slow_send:
4881 			/*
4882 			 * We were able to send down partial data using
4883 			 * the direct call interface, but are now relying
4884 			 * on strwrite() to handle the non-fastpath cases.
4885 			 * If the socket is blocking we will sleep in
4886 			 * strwaitq() until write is permitted, otherwise,
4887 			 * we will need to return the amount of bytes
4888 			 * written so far back to the app.  This is the
4889 			 * reason why we pass NOINTR flag to strwrite()
4890 			 * for non-blocking socket, because we don't want
4891 			 * to return EAGAIN when portion of the user data
4892 			 * has actually been sent down.
4893 			 */
4894 			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4895 		}
4896 	}
4897 	return (0);
4898 }
4899 
4900 /*
4901  * Update sti_faddr by asking the transport (unless AF_UNIX).
4902  */
4903 /* ARGSUSED */
4904 int
4905 sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4906     boolean_t accept, struct cred *cr)
4907 {
4908 	struct strbuf	strbuf;
4909 	int		error = 0, res;
4910 	void		*addr;
4911 	t_uscalar_t	addrlen;
4912 	k_sigset_t	smask;
4913 	sotpi_info_t	*sti = SOTOTPI(so);
4914 
4915 	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4916 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4917 
4918 	ASSERT(*namelen > 0);
4919 	mutex_enter(&so->so_lock);
4920 	so_lock_single(so);	/* Set SOLOCKED */
4921 
4922 	if (accept) {
4923 		bcopy(sti->sti_faddr_sa, name,
4924 		    MIN(*namelen, sti->sti_faddr_len));
4925 		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4926 		goto done;
4927 	}
4928 
4929 	if (!(so->so_state & SS_ISCONNECTED)) {
4930 		error = ENOTCONN;
4931 		goto done;
4932 	}
4933 	/* Added this check for X/Open */
4934 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4935 		error = EINVAL;
4936 		if (xnet_check_print) {
4937 			printf("sockfs: X/Open getpeername check => EINVAL\n");
4938 		}
4939 		goto done;
4940 	}
4941 
4942 	if (sti->sti_faddr_valid) {
4943 		bcopy(sti->sti_faddr_sa, name,
4944 		    MIN(*namelen, sti->sti_faddr_len));
4945 		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4946 		goto done;
4947 	}
4948 
4949 #ifdef DEBUG
4950 	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4951 	    pr_addr(so->so_family, sti->sti_faddr_sa,
4952 	    (t_uscalar_t)sti->sti_faddr_len)));
4953 #endif /* DEBUG */
4954 
4955 	if (so->so_family == AF_UNIX) {
4956 		/* Transport has different name space - return local info */
4957 		if (sti->sti_faddr_noxlate)
4958 			*namelen = 0;
4959 		error = 0;
4960 		goto done;
4961 	}
4962 
4963 	ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
4964 
4965 	ASSERT(sti->sti_faddr_sa);
4966 	/* Allocate local buffer to use with ioctl */
4967 	addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
4968 	mutex_exit(&so->so_lock);
4969 	addr = kmem_alloc(addrlen, KM_SLEEP);
4970 
4971 	/*
4972 	 * Issue TI_GETPEERNAME with signals masked.
4973 	 * Put the result in sti_faddr_sa so that getpeername works after
4974 	 * a shutdown(output).
4975 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4976 	 * back to the socket.
4977 	 */
4978 	strbuf.buf = addr;
4979 	strbuf.maxlen = addrlen;
4980 	strbuf.len = 0;
4981 
4982 	sigintr(&smask, 0);
4983 	res = 0;
4984 	ASSERT(cr);
4985 	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4986 	    0, K_TO_K, cr, &res);
4987 	sigunintr(&smask);
4988 
4989 	mutex_enter(&so->so_lock);
4990 	/*
4991 	 * If there is an error record the error in so_error put don't fail
4992 	 * the getpeername. Instead fallback on the recorded
4993 	 * sti->sti_faddr_sa.
4994 	 */
4995 	if (error) {
4996 		/*
4997 		 * Various stream head errors can be returned to the ioctl.
4998 		 * However, it is impossible to determine which ones of
4999 		 * these are really socket level errors that were incorrectly
5000 		 * consumed by the ioctl. Thus this code silently ignores the
5001 		 * error - to code explicitly does not reinstate the error
5002 		 * using soseterror().
5003 		 * Experiments have shows that at least this set of
5004 		 * errors are reported and should not be reinstated on the
5005 		 * socket:
5006 		 *	EINVAL	E.g. if an I_LINK was in effect when
5007 		 *		getpeername was called.
5008 		 *	EPIPE	The ioctl error semantics prefer the write
5009 		 *		side error over the read side error.
5010 		 *	ENOTCONN The transport just got disconnected but
5011 		 *		sockfs had not yet seen the T_DISCON_IND
5012 		 *		when issuing the ioctl.
5013 		 */
5014 		error = 0;
5015 	} else if (res == 0 && strbuf.len > 0 &&
5016 	    (so->so_state & SS_ISCONNECTED)) {
5017 		ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
5018 		sti->sti_faddr_len = (socklen_t)strbuf.len;
5019 		bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
5020 		sti->sti_faddr_valid = 1;
5021 
5022 		bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
5023 		*namelen = sti->sti_faddr_len;
5024 	}
5025 	kmem_free(addr, addrlen);
5026 #ifdef DEBUG
5027 	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
5028 	    pr_addr(so->so_family, sti->sti_faddr_sa,
5029 	    (t_uscalar_t)sti->sti_faddr_len)));
5030 #endif /* DEBUG */
5031 done:
5032 	so_unlock_single(so, SOLOCKED);
5033 	mutex_exit(&so->so_lock);
5034 	return (error);
5035 }
5036 
5037 /*
5038  * Update sti_laddr by asking the transport (unless AF_UNIX).
5039  */
5040 int
5041 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
5042     struct cred *cr)
5043 {
5044 	struct strbuf	strbuf;
5045 	int		error = 0, res;
5046 	void		*addr;
5047 	t_uscalar_t	addrlen;
5048 	k_sigset_t	smask;
5049 	sotpi_info_t	*sti = SOTOTPI(so);
5050 
5051 	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
5052 	    (void *)so, pr_state(so->so_state, so->so_mode)));
5053 
5054 	ASSERT(*namelen > 0);
5055 	mutex_enter(&so->so_lock);
5056 	so_lock_single(so);	/* Set SOLOCKED */
5057 
5058 #ifdef DEBUG
5059 
5060 	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
5061 	    pr_addr(so->so_family, sti->sti_laddr_sa,
5062 	    (t_uscalar_t)sti->sti_laddr_len)));
5063 #endif /* DEBUG */
5064 	if (sti->sti_laddr_valid) {
5065 		bcopy(sti->sti_laddr_sa, name,
5066 		    MIN(*namelen, sti->sti_laddr_len));
5067 		*namelen = sti->sti_laddr_len;
5068 		goto done;
5069 	}
5070 
5071 	if (so->so_family == AF_UNIX) {
5072 		/* Transport has different name space - return local info */
5073 		error = 0;
5074 		*namelen = 0;
5075 		goto done;
5076 	}
5077 	if (!(so->so_state & SS_ISBOUND)) {
5078 		/* If not bound, then nothing to return. */
5079 		error = 0;
5080 		goto done;
5081 	}
5082 
5083 	/* Allocate local buffer to use with ioctl */
5084 	addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
5085 	mutex_exit(&so->so_lock);
5086 	addr = kmem_alloc(addrlen, KM_SLEEP);
5087 
5088 	/*
5089 	 * Issue TI_GETMYNAME with signals masked.
5090 	 * Put the result in sti_laddr_sa so that getsockname works after
5091 	 * a shutdown(output).
5092 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
5093 	 * back to the socket.
5094 	 */
5095 	strbuf.buf = addr;
5096 	strbuf.maxlen = addrlen;
5097 	strbuf.len = 0;
5098 
5099 	sigintr(&smask, 0);
5100 	res = 0;
5101 	ASSERT(cr);
5102 	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
5103 	    0, K_TO_K, cr, &res);
5104 	sigunintr(&smask);
5105 
5106 	mutex_enter(&so->so_lock);
5107 	/*
5108 	 * If there is an error record the error in so_error put don't fail
5109 	 * the getsockname. Instead fallback on the recorded
5110 	 * sti->sti_laddr_sa.
5111 	 */
5112 	if (error) {
5113 		/*
5114 		 * Various stream head errors can be returned to the ioctl.
5115 		 * However, it is impossible to determine which ones of
5116 		 * these are really socket level errors that were incorrectly
5117 		 * consumed by the ioctl. Thus this code silently ignores the
5118 		 * error - to code explicitly does not reinstate the error
5119 		 * using soseterror().
5120 		 * Experiments have shows that at least this set of
5121 		 * errors are reported and should not be reinstated on the
5122 		 * socket:
5123 		 *	EINVAL	E.g. if an I_LINK was in effect when
5124 		 *		getsockname was called.
5125 		 *	EPIPE	The ioctl error semantics prefer the write
5126 		 *		side error over the read side error.
5127 		 */
5128 		error = 0;
5129 	} else if (res == 0 && strbuf.len > 0 &&
5130 	    (so->so_state & SS_ISBOUND)) {
5131 		ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
5132 		sti->sti_laddr_len = (socklen_t)strbuf.len;
5133 		bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
5134 		sti->sti_laddr_valid = 1;
5135 
5136 		bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
5137 		*namelen = sti->sti_laddr_len;
5138 	}
5139 	kmem_free(addr, addrlen);
5140 #ifdef DEBUG
5141 	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
5142 	    pr_addr(so->so_family, sti->sti_laddr_sa,
5143 	    (t_uscalar_t)sti->sti_laddr_len)));
5144 #endif /* DEBUG */
5145 done:
5146 	so_unlock_single(so, SOLOCKED);
5147 	mutex_exit(&so->so_lock);
5148 	return (error);
5149 }
5150 
5151 /*
5152  * Get socket options. For SOL_SOCKET options some options are handled
5153  * by the sockfs while others use the value recorded in the sonode as a
5154  * fallback should the T_SVR4_OPTMGMT_REQ fail.
5155  *
5156  * On the return most *optlenp bytes are copied to optval.
5157  */
5158 /* ARGSUSED */
5159 int
5160 sotpi_getsockopt(struct sonode *so, int level, int option_name,
5161 		void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5162 {
5163 	struct T_optmgmt_req	optmgmt_req;
5164 	struct T_optmgmt_ack	*optmgmt_ack;
5165 	struct opthdr		oh;
5166 	struct opthdr		*opt_res;
5167 	mblk_t			*mp = NULL;
5168 	int			error = 0;
5169 	void			*option = NULL;	/* Set if fallback value */
5170 	t_uscalar_t		maxlen = *optlenp;
5171 	t_uscalar_t		len;
5172 	uint32_t		value;
5173 	struct timeval		tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5174 	struct timeval32	tmo_val32;
5175 	struct so_snd_bufinfo	snd_bufinfo;	/* used for zero copy */
5176 
5177 	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5178 	    (void *)so, level, option_name, optval, (void *)optlenp,
5179 	    pr_state(so->so_state, so->so_mode)));
5180 
5181 	mutex_enter(&so->so_lock);
5182 	so_lock_single(so);	/* Set SOLOCKED */
5183 
5184 	/*
5185 	 * Check for SOL_SOCKET options.
5186 	 * Certain SOL_SOCKET options are returned directly whereas
5187 	 * others only provide a default (fallback) value should
5188 	 * the T_SVR4_OPTMGMT_REQ fail.
5189 	 */
5190 	if (level == SOL_SOCKET) {
5191 		/* Check parameters */
5192 		switch (option_name) {
5193 		case SO_TYPE:
5194 		case SO_ERROR:
5195 		case SO_DEBUG:
5196 		case SO_ACCEPTCONN:
5197 		case SO_REUSEADDR:
5198 		case SO_KEEPALIVE:
5199 		case SO_DONTROUTE:
5200 		case SO_BROADCAST:
5201 		case SO_USELOOPBACK:
5202 		case SO_OOBINLINE:
5203 		case SO_SNDBUF:
5204 		case SO_RCVBUF:
5205 #ifdef notyet
5206 		case SO_SNDLOWAT:
5207 		case SO_RCVLOWAT:
5208 #endif /* notyet */
5209 		case SO_DOMAIN:
5210 		case SO_DGRAM_ERRIND:
5211 			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5212 				error = EINVAL;
5213 				eprintsoline(so, error);
5214 				goto done2;
5215 			}
5216 			break;
5217 		case SO_RCVTIMEO:
5218 		case SO_SNDTIMEO:
5219 			if (get_udatamodel() == DATAMODEL_NONE ||
5220 			    get_udatamodel() == DATAMODEL_NATIVE) {
5221 				if (maxlen < sizeof (struct timeval)) {
5222 					error = EINVAL;
5223 					eprintsoline(so, error);
5224 					goto done2;
5225 				}
5226 			} else {
5227 				if (maxlen < sizeof (struct timeval32)) {
5228 					error = EINVAL;
5229 					eprintsoline(so, error);
5230 					goto done2;
5231 				}
5232 
5233 			}
5234 			break;
5235 		case SO_LINGER:
5236 			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5237 				error = EINVAL;
5238 				eprintsoline(so, error);
5239 				goto done2;
5240 			}
5241 			break;
5242 		case SO_SND_BUFINFO:
5243 			if (maxlen < (t_uscalar_t)
5244 			    sizeof (struct so_snd_bufinfo)) {
5245 				error = EINVAL;
5246 				eprintsoline(so, error);
5247 				goto done2;
5248 			}
5249 			break;
5250 		}
5251 
5252 		len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
5253 
5254 		switch (option_name) {
5255 		case SO_TYPE:
5256 			value = so->so_type;
5257 			option = &value;
5258 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5259 
5260 		case SO_ERROR:
5261 			value = sogeterr(so, B_TRUE);
5262 			option = &value;
5263 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5264 
5265 		case SO_ACCEPTCONN:
5266 			if (so->so_state & SS_ACCEPTCONN)
5267 				value = SO_ACCEPTCONN;
5268 			else
5269 				value = 0;
5270 #ifdef DEBUG
5271 			if (value) {
5272 				dprintso(so, 1,
5273 				    ("sotpi_getsockopt: 0x%x is set\n",
5274 				    option_name));
5275 			} else {
5276 				dprintso(so, 1,
5277 				    ("sotpi_getsockopt: 0x%x not set\n",
5278 				    option_name));
5279 			}
5280 #endif /* DEBUG */
5281 			option = &value;
5282 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5283 
5284 		case SO_DEBUG:
5285 		case SO_REUSEADDR:
5286 		case SO_KEEPALIVE:
5287 		case SO_DONTROUTE:
5288 		case SO_BROADCAST:
5289 		case SO_USELOOPBACK:
5290 		case SO_OOBINLINE:
5291 		case SO_DGRAM_ERRIND:
5292 			value = (so->so_options & option_name);
5293 #ifdef DEBUG
5294 			if (value) {
5295 				dprintso(so, 1,
5296 				    ("sotpi_getsockopt: 0x%x is set\n",
5297 				    option_name));
5298 			} else {
5299 				dprintso(so, 1,
5300 				    ("sotpi_getsockopt: 0x%x not set\n",
5301 				    option_name));
5302 			}
5303 #endif /* DEBUG */
5304 			option = &value;
5305 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5306 
5307 		/*
5308 		 * The following options are only returned by sockfs when the
5309 		 * T_SVR4_OPTMGMT_REQ fails.
5310 		 */
5311 		case SO_LINGER:
5312 			option = &so->so_linger;
5313 			len = (t_uscalar_t)sizeof (struct linger);
5314 			break;
5315 		case SO_SNDBUF: {
5316 			ssize_t lvalue;
5317 
5318 			/*
5319 			 * If the option has not been set then get a default
5320 			 * value from the read queue. This value is
5321 			 * returned if the transport fails
5322 			 * the T_SVR4_OPTMGMT_REQ.
5323 			 */
5324 			lvalue = so->so_sndbuf;
5325 			if (lvalue == 0) {
5326 				mutex_exit(&so->so_lock);
5327 				(void) strqget(strvp2wq(SOTOV(so))->q_next,
5328 				    QHIWAT, 0, &lvalue);
5329 				mutex_enter(&so->so_lock);
5330 				dprintso(so, 1,
5331 				    ("got SO_SNDBUF %ld from q\n", lvalue));
5332 			}
5333 			value = (int)lvalue;
5334 			option = &value;
5335 			len = (t_uscalar_t)sizeof (so->so_sndbuf);
5336 			break;
5337 		}
5338 		case SO_RCVBUF: {
5339 			ssize_t lvalue;
5340 
5341 			/*
5342 			 * If the option has not been set then get a default
5343 			 * value from the read queue. This value is
5344 			 * returned if the transport fails
5345 			 * the T_SVR4_OPTMGMT_REQ.
5346 			 *
5347 			 * XXX If SO_RCVBUF has been set and this is an
5348 			 * XPG 4.2 application then do not ask the transport
5349 			 * since the transport might adjust the value and not
5350 			 * return exactly what was set by the application.
5351 			 * For non-XPG 4.2 application we return the value
5352 			 * that the transport is actually using.
5353 			 */
5354 			lvalue = so->so_rcvbuf;
5355 			if (lvalue == 0) {
5356 				mutex_exit(&so->so_lock);
5357 				(void) strqget(RD(strvp2wq(SOTOV(so))),
5358 				    QHIWAT, 0, &lvalue);
5359 				mutex_enter(&so->so_lock);
5360 				dprintso(so, 1,
5361 				    ("got SO_RCVBUF %ld from q\n", lvalue));
5362 			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
5363 				value = (int)lvalue;
5364 				option = &value;
5365 				goto copyout;	/* skip asking transport */
5366 			}
5367 			value = (int)lvalue;
5368 			option = &value;
5369 			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5370 			break;
5371 		}
5372 		case SO_DOMAIN:
5373 			value = so->so_family;
5374 			option = &value;
5375 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5376 
5377 #ifdef notyet
5378 		/*
5379 		 * We do not implement the semantics of these options
5380 		 * thus we shouldn't implement the options either.
5381 		 */
5382 		case SO_SNDLOWAT:
5383 			value = so->so_sndlowat;
5384 			option = &value;
5385 			break;
5386 		case SO_RCVLOWAT:
5387 			value = so->so_rcvlowat;
5388 			option = &value;
5389 			break;
5390 #endif /* notyet */
5391 		case SO_SNDTIMEO:
5392 		case SO_RCVTIMEO: {
5393 			clock_t val;
5394 
5395 			if (option_name == SO_RCVTIMEO)
5396 				val = drv_hztousec(so->so_rcvtimeo);
5397 			else
5398 				val = drv_hztousec(so->so_sndtimeo);
5399 			tmo_val.tv_sec = val / (1000 * 1000);
5400 			tmo_val.tv_usec = val % (1000 * 1000);
5401 			if (get_udatamodel() == DATAMODEL_NONE ||
5402 			    get_udatamodel() == DATAMODEL_NATIVE) {
5403 				option = &tmo_val;
5404 				len = sizeof (struct timeval);
5405 			} else {
5406 				TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5407 				option = &tmo_val32;
5408 				len = sizeof (struct timeval32);
5409 			}
5410 			break;
5411 		}
5412 		case SO_SND_BUFINFO: {
5413 			snd_bufinfo.sbi_wroff =
5414 			    (so->so_proto_props).sopp_wroff;
5415 			snd_bufinfo.sbi_maxblk =
5416 			    (so->so_proto_props).sopp_maxblk;
5417 			snd_bufinfo.sbi_maxpsz =
5418 			    (so->so_proto_props).sopp_maxpsz;
5419 			snd_bufinfo.sbi_tail =
5420 			    (so->so_proto_props).sopp_tail;
5421 			option = &snd_bufinfo;
5422 			len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5423 			break;
5424 		}
5425 		}
5426 	}
5427 
5428 	mutex_exit(&so->so_lock);
5429 
5430 	/* Send request */
5431 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5432 	optmgmt_req.MGMT_flags = T_CHECK;
5433 	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5434 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5435 
5436 	oh.level = level;
5437 	oh.name = option_name;
5438 	oh.len = maxlen;
5439 
5440 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5441 	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5442 	/* Let option management work in the presence of data flow control */
5443 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5444 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5445 	mp = NULL;
5446 	mutex_enter(&so->so_lock);
5447 	if (error) {
5448 		eprintsoline(so, error);
5449 		goto done2;
5450 	}
5451 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5452 	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5453 	if (error) {
5454 		if (option != NULL) {
5455 			/* We have a fallback value */
5456 			error = 0;
5457 			goto copyout;
5458 		}
5459 		eprintsoline(so, error);
5460 		goto done2;
5461 	}
5462 	ASSERT(mp);
5463 	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5464 	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5465 	    optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5466 	if (opt_res == NULL) {
5467 		if (option != NULL) {
5468 			/* We have a fallback value */
5469 			error = 0;
5470 			goto copyout;
5471 		}
5472 		error = EPROTO;
5473 		eprintsoline(so, error);
5474 		goto done;
5475 	}
5476 	option = &opt_res[1];
5477 
5478 	/* check to ensure that the option is within bounds */
5479 	if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5480 	    (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5481 		if (option != NULL) {
5482 			/* We have a fallback value */
5483 			error = 0;
5484 			goto copyout;
5485 		}
5486 		error = EPROTO;
5487 		eprintsoline(so, error);
5488 		goto done;
5489 	}
5490 
5491 	len = opt_res->len;
5492 
5493 copyout: {
5494 		t_uscalar_t size = MIN(len, maxlen);
5495 		bcopy(option, optval, size);
5496 		bcopy(&size, optlenp, sizeof (size));
5497 	}
5498 done:
5499 	freemsg(mp);
5500 done2:
5501 	so_unlock_single(so, SOLOCKED);
5502 	mutex_exit(&so->so_lock);
5503 
5504 	return (error);
5505 }
5506 
5507 /*
5508  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5509  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5510  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5511  * setsockopt has to work even if the transport does not support the option.
5512  */
5513 /* ARGSUSED */
5514 int
5515 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5516 	const void *optval, t_uscalar_t optlen, struct cred *cr)
5517 {
5518 	struct T_optmgmt_req	optmgmt_req;
5519 	struct opthdr		oh;
5520 	mblk_t			*mp;
5521 	int			error = 0;
5522 	boolean_t		handled = B_FALSE;
5523 
5524 	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5525 	    (void *)so, level, option_name, optval, optlen,
5526 	    pr_state(so->so_state, so->so_mode)));
5527 
5528 	/* X/Open requires this check */
5529 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5530 		if (xnet_check_print)
5531 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
5532 		return (EINVAL);
5533 	}
5534 
5535 	mutex_enter(&so->so_lock);
5536 	so_lock_single(so);	/* Set SOLOCKED */
5537 	mutex_exit(&so->so_lock);
5538 
5539 	/*
5540 	 * For SOCKET or TCP level options, try to set it here itself
5541 	 * provided socket has not been popped and we know the tcp
5542 	 * structure (stored in so_priv).
5543 	 */
5544 	if ((level == SOL_SOCKET || level == IPPROTO_TCP) &&
5545 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
5546 	    (so->so_version == SOV_SOCKSTREAM) &&
5547 	    (so->so_proto_handle != NULL)) {
5548 		tcp_t		*tcp = (tcp_t *)so->so_proto_handle;
5549 		boolean_t	onoff;
5550 
5551 #define	intvalue	(*(int32_t *)optval)
5552 
5553 		switch (level) {
5554 		case SOL_SOCKET:
5555 			switch (option_name) {		/* Check length param */
5556 			case SO_DEBUG:
5557 			case SO_REUSEADDR:
5558 			case SO_DONTROUTE:
5559 			case SO_BROADCAST:
5560 			case SO_USELOOPBACK:
5561 			case SO_OOBINLINE:
5562 			case SO_DGRAM_ERRIND:
5563 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5564 					error = EINVAL;
5565 					eprintsoline(so, error);
5566 					mutex_enter(&so->so_lock);
5567 					goto done2;
5568 				}
5569 				ASSERT(optval);
5570 				onoff = intvalue != 0;
5571 				handled = B_TRUE;
5572 				break;
5573 			case SO_SNDTIMEO:
5574 			case SO_RCVTIMEO:
5575 				if (get_udatamodel() == DATAMODEL_NONE ||
5576 				    get_udatamodel() == DATAMODEL_NATIVE) {
5577 					if (optlen !=
5578 					    sizeof (struct timeval)) {
5579 						error = EINVAL;
5580 						eprintsoline(so, error);
5581 						mutex_enter(&so->so_lock);
5582 						goto done2;
5583 					}
5584 				} else {
5585 					if (optlen !=
5586 					    sizeof (struct timeval32)) {
5587 						error = EINVAL;
5588 						eprintsoline(so, error);
5589 						mutex_enter(&so->so_lock);
5590 						goto done2;
5591 					}
5592 				}
5593 				ASSERT(optval);
5594 				handled = B_TRUE;
5595 				break;
5596 			case SO_LINGER:
5597 				if (optlen !=
5598 				    (t_uscalar_t)sizeof (struct linger)) {
5599 					error = EINVAL;
5600 					eprintsoline(so, error);
5601 					mutex_enter(&so->so_lock);
5602 					goto done2;
5603 				}
5604 				ASSERT(optval);
5605 				handled = B_TRUE;
5606 				break;
5607 			}
5608 
5609 			switch (option_name) {			/* Do actions */
5610 			case SO_LINGER: {
5611 				struct linger *lgr = (struct linger *)optval;
5612 
5613 				if (lgr->l_onoff) {
5614 					tcp->tcp_linger = 1;
5615 					tcp->tcp_lingertime = lgr->l_linger;
5616 					so->so_linger.l_onoff = SO_LINGER;
5617 					so->so_options |= SO_LINGER;
5618 				} else {
5619 					tcp->tcp_linger = 0;
5620 					tcp->tcp_lingertime = 0;
5621 					so->so_linger.l_onoff = 0;
5622 					so->so_options &= ~SO_LINGER;
5623 				}
5624 				so->so_linger.l_linger = lgr->l_linger;
5625 				handled = B_TRUE;
5626 				break;
5627 			}
5628 			case SO_SNDTIMEO:
5629 			case SO_RCVTIMEO: {
5630 				struct timeval tl;
5631 				clock_t val;
5632 
5633 				if (get_udatamodel() == DATAMODEL_NONE ||
5634 				    get_udatamodel() == DATAMODEL_NATIVE)
5635 					bcopy(&tl, (struct timeval *)optval,
5636 					    sizeof (struct timeval));
5637 				else
5638 					TIMEVAL32_TO_TIMEVAL(&tl,
5639 					    (struct timeval32 *)optval);
5640 				val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5641 				if (option_name == SO_RCVTIMEO)
5642 					so->so_rcvtimeo = drv_usectohz(val);
5643 				else
5644 					so->so_sndtimeo = drv_usectohz(val);
5645 				break;
5646 			}
5647 
5648 			case SO_DEBUG:
5649 				tcp->tcp_debug = onoff;
5650 #ifdef SOCK_TEST
5651 				if (intvalue & 2)
5652 					sock_test_timelimit = 10 * hz;
5653 				else
5654 					sock_test_timelimit = 0;
5655 
5656 				if (intvalue & 4)
5657 					do_useracc = 0;
5658 				else
5659 					do_useracc = 1;
5660 #endif /* SOCK_TEST */
5661 				break;
5662 			case SO_DONTROUTE:
5663 				/*
5664 				 * SO_DONTROUTE, SO_USELOOPBACK and
5665 				 * SO_BROADCAST are only of interest to IP.
5666 				 * We track them here only so
5667 				 * that we can report their current value.
5668 				 */
5669 				tcp->tcp_dontroute = onoff;
5670 				if (onoff)
5671 					so->so_options |= option_name;
5672 				else
5673 					so->so_options &= ~option_name;
5674 				break;
5675 			case SO_USELOOPBACK:
5676 				tcp->tcp_useloopback = onoff;
5677 				if (onoff)
5678 					so->so_options |= option_name;
5679 				else
5680 					so->so_options &= ~option_name;
5681 				break;
5682 			case SO_BROADCAST:
5683 				tcp->tcp_broadcast = onoff;
5684 				if (onoff)
5685 					so->so_options |= option_name;
5686 				else
5687 					so->so_options &= ~option_name;
5688 				break;
5689 			case SO_REUSEADDR:
5690 				tcp->tcp_reuseaddr = onoff;
5691 				if (onoff)
5692 					so->so_options |= option_name;
5693 				else
5694 					so->so_options &= ~option_name;
5695 				break;
5696 			case SO_OOBINLINE:
5697 				tcp->tcp_oobinline = onoff;
5698 				if (onoff)
5699 					so->so_options |= option_name;
5700 				else
5701 					so->so_options &= ~option_name;
5702 				break;
5703 			case SO_DGRAM_ERRIND:
5704 				tcp->tcp_dgram_errind = onoff;
5705 				if (onoff)
5706 					so->so_options |= option_name;
5707 				else
5708 					so->so_options &= ~option_name;
5709 				break;
5710 			}
5711 			break;
5712 		case IPPROTO_TCP:
5713 			switch (option_name) {
5714 			case TCP_NODELAY:
5715 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5716 					error = EINVAL;
5717 					eprintsoline(so, error);
5718 					mutex_enter(&so->so_lock);
5719 					goto done2;
5720 				}
5721 				ASSERT(optval);
5722 				tcp->tcp_naglim = intvalue ? 1 : tcp->tcp_mss;
5723 				handled = B_TRUE;
5724 				break;
5725 			}
5726 			break;
5727 		default:
5728 			handled = B_FALSE;
5729 			break;
5730 		}
5731 	}
5732 
5733 	if (handled) {
5734 		mutex_enter(&so->so_lock);
5735 		goto done2;
5736 	}
5737 
5738 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5739 	optmgmt_req.MGMT_flags = T_NEGOTIATE;
5740 	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5741 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5742 
5743 	oh.level = level;
5744 	oh.name = option_name;
5745 	oh.len = optlen;
5746 
5747 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5748 	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5749 	/* Let option management work in the presence of data flow control */
5750 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5751 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5752 	mp = NULL;
5753 	mutex_enter(&so->so_lock);
5754 	if (error) {
5755 		eprintsoline(so, error);
5756 		goto done2;
5757 	}
5758 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5759 	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5760 	if (error) {
5761 		eprintsoline(so, error);
5762 		goto done;
5763 	}
5764 	ASSERT(mp);
5765 	/* No need to verify T_optmgmt_ack */
5766 	freemsg(mp);
5767 done:
5768 	/*
5769 	 * Check for SOL_SOCKET options and record their values.
5770 	 * If we know about a SOL_SOCKET parameter and the transport
5771 	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5772 	 * EPROTO) we let the setsockopt succeed.
5773 	 */
5774 	if (level == SOL_SOCKET) {
5775 		/* Check parameters */
5776 		switch (option_name) {
5777 		case SO_DEBUG:
5778 		case SO_REUSEADDR:
5779 		case SO_KEEPALIVE:
5780 		case SO_DONTROUTE:
5781 		case SO_BROADCAST:
5782 		case SO_USELOOPBACK:
5783 		case SO_OOBINLINE:
5784 		case SO_SNDBUF:
5785 		case SO_RCVBUF:
5786 #ifdef notyet
5787 		case SO_SNDLOWAT:
5788 		case SO_RCVLOWAT:
5789 #endif /* notyet */
5790 		case SO_DGRAM_ERRIND:
5791 			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5792 				error = EINVAL;
5793 				eprintsoline(so, error);
5794 				goto done2;
5795 			}
5796 			ASSERT(optval);
5797 			handled = B_TRUE;
5798 			break;
5799 		case SO_SNDTIMEO:
5800 		case SO_RCVTIMEO:
5801 			if (get_udatamodel() == DATAMODEL_NONE ||
5802 			    get_udatamodel() == DATAMODEL_NATIVE) {
5803 				if (optlen != sizeof (struct timeval)) {
5804 					error = EINVAL;
5805 					eprintsoline(so, error);
5806 					goto done2;
5807 				}
5808 			} else {
5809 				if (optlen != sizeof (struct timeval32)) {
5810 					error = EINVAL;
5811 					eprintsoline(so, error);
5812 					goto done2;
5813 				}
5814 			}
5815 			ASSERT(optval);
5816 			handled = B_TRUE;
5817 			break;
5818 		case SO_LINGER:
5819 			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5820 				error = EINVAL;
5821 				eprintsoline(so, error);
5822 				goto done2;
5823 			}
5824 			ASSERT(optval);
5825 			handled = B_TRUE;
5826 			break;
5827 		}
5828 
5829 #define	intvalue	(*(int32_t *)optval)
5830 
5831 		switch (option_name) {
5832 		case SO_TYPE:
5833 		case SO_ERROR:
5834 		case SO_ACCEPTCONN:
5835 			/* Can't be set */
5836 			error = ENOPROTOOPT;
5837 			goto done2;
5838 		case SO_LINGER: {
5839 			struct linger *l = (struct linger *)optval;
5840 
5841 			so->so_linger.l_linger = l->l_linger;
5842 			if (l->l_onoff) {
5843 				so->so_linger.l_onoff = SO_LINGER;
5844 				so->so_options |= SO_LINGER;
5845 			} else {
5846 				so->so_linger.l_onoff = 0;
5847 				so->so_options &= ~SO_LINGER;
5848 			}
5849 			break;
5850 		}
5851 
5852 		case SO_DEBUG:
5853 #ifdef SOCK_TEST
5854 			if (intvalue & 2)
5855 				sock_test_timelimit = 10 * hz;
5856 			else
5857 				sock_test_timelimit = 0;
5858 
5859 			if (intvalue & 4)
5860 				do_useracc = 0;
5861 			else
5862 				do_useracc = 1;
5863 #endif /* SOCK_TEST */
5864 			/* FALLTHRU */
5865 		case SO_REUSEADDR:
5866 		case SO_KEEPALIVE:
5867 		case SO_DONTROUTE:
5868 		case SO_BROADCAST:
5869 		case SO_USELOOPBACK:
5870 		case SO_OOBINLINE:
5871 		case SO_DGRAM_ERRIND:
5872 			if (intvalue != 0) {
5873 				dprintso(so, 1,
5874 				    ("socket_setsockopt: setting 0x%x\n",
5875 				    option_name));
5876 				so->so_options |= option_name;
5877 			} else {
5878 				dprintso(so, 1,
5879 				    ("socket_setsockopt: clearing 0x%x\n",
5880 				    option_name));
5881 				so->so_options &= ~option_name;
5882 			}
5883 			break;
5884 		/*
5885 		 * The following options are only returned by us when the
5886 		 * transport layer fails.
5887 		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5888 		 * since the transport might adjust the value and not
5889 		 * return exactly what was set by the application.
5890 		 */
5891 		case SO_SNDBUF:
5892 			so->so_sndbuf = intvalue;
5893 			break;
5894 		case SO_RCVBUF:
5895 			so->so_rcvbuf = intvalue;
5896 			break;
5897 		case SO_RCVPSH:
5898 			so->so_rcv_timer_interval = intvalue;
5899 			break;
5900 #ifdef notyet
5901 		/*
5902 		 * We do not implement the semantics of these options
5903 		 * thus we shouldn't implement the options either.
5904 		 */
5905 		case SO_SNDLOWAT:
5906 			so->so_sndlowat = intvalue;
5907 			break;
5908 		case SO_RCVLOWAT:
5909 			so->so_rcvlowat = intvalue;
5910 			break;
5911 #endif /* notyet */
5912 		case SO_SNDTIMEO:
5913 		case SO_RCVTIMEO: {
5914 			struct timeval tl;
5915 			clock_t val;
5916 
5917 			if (get_udatamodel() == DATAMODEL_NONE ||
5918 			    get_udatamodel() == DATAMODEL_NATIVE)
5919 				bcopy(&tl, (struct timeval *)optval,
5920 				    sizeof (struct timeval));
5921 			else
5922 				TIMEVAL32_TO_TIMEVAL(&tl,
5923 				    (struct timeval32 *)optval);
5924 			val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5925 			if (option_name == SO_RCVTIMEO)
5926 				so->so_rcvtimeo = drv_usectohz(val);
5927 			else
5928 				so->so_sndtimeo = drv_usectohz(val);
5929 			break;
5930 		}
5931 		}
5932 #undef	intvalue
5933 
5934 		if (error) {
5935 			if ((error == ENOPROTOOPT || error == EPROTO ||
5936 			    error == EINVAL) && handled) {
5937 				dprintso(so, 1,
5938 				    ("setsockopt: ignoring error %d for 0x%x\n",
5939 				    error, option_name));
5940 				error = 0;
5941 			}
5942 		}
5943 	}
5944 done2:
5945 	so_unlock_single(so, SOLOCKED);
5946 	mutex_exit(&so->so_lock);
5947 	return (error);
5948 }
5949 
5950 /*
5951  * sotpi_close() is called when the last open reference goes away.
5952  */
5953 /* ARGSUSED */
5954 int
5955 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5956 {
5957 	struct vnode *vp = SOTOV(so);
5958 	dev_t dev;
5959 	int error = 0;
5960 	sotpi_info_t *sti = SOTOTPI(so);
5961 
5962 	dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5963 	    (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5964 
5965 	dev = sti->sti_dev;
5966 
5967 	ASSERT(STREAMSTAB(getmajor(dev)));
5968 
5969 	mutex_enter(&so->so_lock);
5970 	so_lock_single(so);	/* Set SOLOCKED */
5971 
5972 	ASSERT(so_verify_oobstate(so));
5973 
5974 	if (sti->sti_nl7c_flags & NL7C_ENABLED) {
5975 		sti->sti_nl7c_flags = 0;
5976 		nl7c_close(so);
5977 	}
5978 
5979 	if (vp->v_stream != NULL) {
5980 		vnode_t *ux_vp;
5981 
5982 		if (so->so_family == AF_UNIX) {
5983 			/* Could avoid this when CANTSENDMORE for !dgram */
5984 			so_unix_close(so);
5985 		}
5986 
5987 		mutex_exit(&so->so_lock);
5988 		/*
5989 		 * Disassemble the linkage from the AF_UNIX underlying file
5990 		 * system vnode to this socket (by atomically clearing
5991 		 * v_stream in vn_rele_stream) before strclose clears sd_vnode
5992 		 * and frees the stream head.
5993 		 */
5994 		if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5995 			ASSERT(ux_vp->v_stream);
5996 			sti->sti_ux_bound_vp = NULL;
5997 			vn_rele_stream(ux_vp);
5998 		}
5999 		if (so->so_family == AF_INET || so->so_family == AF_INET6) {
6000 			strsetrwputdatahooks(SOTOV(so), NULL, NULL);
6001 			if (sti->sti_kssl_ent != NULL) {
6002 				kssl_release_ent(sti->sti_kssl_ent, so,
6003 				    sti->sti_kssl_type);
6004 				sti->sti_kssl_ent = NULL;
6005 			}
6006 			if (sti->sti_kssl_ctx != NULL) {
6007 				kssl_release_ctx(sti->sti_kssl_ctx);
6008 				sti->sti_kssl_ctx = NULL;
6009 			}
6010 			sti->sti_kssl_type = KSSL_NO_PROXY;
6011 		}
6012 		error = strclose(vp, flag, cr);
6013 		vp->v_stream = NULL;
6014 		mutex_enter(&so->so_lock);
6015 	}
6016 
6017 	/*
6018 	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
6019 	 */
6020 	so_flush_discon_ind(so);
6021 
6022 	so_unlock_single(so, SOLOCKED);
6023 	mutex_exit(&so->so_lock);
6024 
6025 	/*
6026 	 * Needed for STREAMs.
6027 	 * Decrement the device driver's reference count for streams
6028 	 * opened via the clone dip. The driver was held in clone_open().
6029 	 * The absence of clone_close() forces this asymmetry.
6030 	 */
6031 	if (so->so_flag & SOCLONE)
6032 		ddi_rele_driver(getmajor(dev));
6033 
6034 	return (error);
6035 }
6036 
6037 static int
6038 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
6039     struct cred *cr, int32_t *rvalp)
6040 {
6041 	struct vnode *vp = SOTOV(so);
6042 	sotpi_info_t *sti = SOTOTPI(so);
6043 	int error = 0;
6044 
6045 	dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
6046 	    cmd, arg, pr_state(so->so_state, so->so_mode)));
6047 
6048 	switch (cmd) {
6049 	case SIOCSQPTR:
6050 		/*
6051 		 * SIOCSQPTR is valid only when helper stream is created
6052 		 * by the protocol.
6053 		 */
6054 	case _I_INSERT:
6055 	case _I_REMOVE:
6056 		/*
6057 		 * Since there's no compelling reason to support these ioctls
6058 		 * on sockets, and doing so would increase the complexity
6059 		 * markedly, prevent it.
6060 		 */
6061 		return (EOPNOTSUPP);
6062 
6063 	case I_FIND:
6064 	case I_LIST:
6065 	case I_LOOK:
6066 	case I_POP:
6067 	case I_PUSH:
6068 		/*
6069 		 * To prevent races and inconsistencies between the actual
6070 		 * state of the stream and the state according to the sonode,
6071 		 * we serialize all operations which modify or operate on the
6072 		 * list of modules on the socket's stream.
6073 		 */
6074 		mutex_enter(&sti->sti_plumb_lock);
6075 		error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
6076 		mutex_exit(&sti->sti_plumb_lock);
6077 		return (error);
6078 
6079 	default:
6080 		if (so->so_version != SOV_STREAM)
6081 			break;
6082 
6083 		/*
6084 		 * The imaginary "sockmod" has been popped; act as a stream.
6085 		 */
6086 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6087 	}
6088 
6089 	ASSERT(so->so_version != SOV_STREAM);
6090 
6091 	/*
6092 	 * Process socket-specific ioctls.
6093 	 */
6094 	switch (cmd) {
6095 	case FIONBIO: {
6096 		int32_t value;
6097 
6098 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
6099 		    (mode & (int)FKIOCTL)))
6100 			return (EFAULT);
6101 
6102 		mutex_enter(&so->so_lock);
6103 		if (value) {
6104 			so->so_state |= SS_NDELAY;
6105 		} else {
6106 			so->so_state &= ~SS_NDELAY;
6107 		}
6108 		mutex_exit(&so->so_lock);
6109 		return (0);
6110 	}
6111 
6112 	case FIOASYNC: {
6113 		int32_t value;
6114 
6115 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
6116 		    (mode & (int)FKIOCTL)))
6117 			return (EFAULT);
6118 
6119 		mutex_enter(&so->so_lock);
6120 		/*
6121 		 * SS_ASYNC flag not already set correctly?
6122 		 * (!value != !(so->so_state & SS_ASYNC))
6123 		 * but some engineers find that too hard to read.
6124 		 */
6125 		if (value == 0 && (so->so_state & SS_ASYNC) != 0 ||
6126 		    value != 0 && (so->so_state & SS_ASYNC) == 0)
6127 			error = so_flip_async(so, vp, mode, cr);
6128 		mutex_exit(&so->so_lock);
6129 		return (error);
6130 	}
6131 
6132 	case SIOCSPGRP:
6133 	case FIOSETOWN: {
6134 		pid_t pgrp;
6135 
6136 		if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
6137 		    (mode & (int)FKIOCTL)))
6138 			return (EFAULT);
6139 
6140 		mutex_enter(&so->so_lock);
6141 		dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
6142 		/* Any change? */
6143 		if (pgrp != so->so_pgrp)
6144 			error = so_set_siggrp(so, vp, pgrp, mode, cr);
6145 		mutex_exit(&so->so_lock);
6146 		return (error);
6147 	}
6148 	case SIOCGPGRP:
6149 	case FIOGETOWN:
6150 		if (so_copyout(&so->so_pgrp, (void *)arg,
6151 		    sizeof (pid_t), (mode & (int)FKIOCTL)))
6152 			return (EFAULT);
6153 		return (0);
6154 
6155 	case SIOCATMARK: {
6156 		int retval;
6157 		uint_t so_state;
6158 
6159 		/*
6160 		 * strwaitmark has a finite timeout after which it
6161 		 * returns -1 if the mark state is undetermined.
6162 		 * In order to avoid any race between the mark state
6163 		 * in sockfs and the mark state in the stream head this
6164 		 * routine loops until the mark state can be determined
6165 		 * (or the urgent data indication has been removed by some
6166 		 * other thread).
6167 		 */
6168 		do {
6169 			mutex_enter(&so->so_lock);
6170 			so_state = so->so_state;
6171 			mutex_exit(&so->so_lock);
6172 			if (so_state & SS_RCVATMARK) {
6173 				retval = 1;
6174 			} else if (!(so_state & SS_OOBPEND)) {
6175 				/*
6176 				 * No SIGURG has been generated -- there is no
6177 				 * pending or present urgent data. Thus can't
6178 				 * possibly be at the mark.
6179 				 */
6180 				retval = 0;
6181 			} else {
6182 				/*
6183 				 * Have the stream head wait until there is
6184 				 * either some messages on the read queue, or
6185 				 * STRATMARK or STRNOTATMARK gets set. The
6186 				 * STRNOTATMARK flag is used so that the
6187 				 * transport can send up a MSGNOTMARKNEXT
6188 				 * M_DATA to indicate that it is not
6189 				 * at the mark and additional data is not about
6190 				 * to be send upstream.
6191 				 *
6192 				 * If the mark state is undetermined this will
6193 				 * return -1 and we will loop rechecking the
6194 				 * socket state.
6195 				 */
6196 				retval = strwaitmark(vp);
6197 			}
6198 		} while (retval == -1);
6199 
6200 		if (so_copyout(&retval, (void *)arg, sizeof (int),
6201 		    (mode & (int)FKIOCTL)))
6202 			return (EFAULT);
6203 		return (0);
6204 	}
6205 
6206 	case I_FDINSERT:
6207 	case I_SENDFD:
6208 	case I_RECVFD:
6209 	case I_ATMARK:
6210 	case _SIOCSOCKFALLBACK:
6211 		/*
6212 		 * These ioctls do not apply to sockets. I_FDINSERT can be
6213 		 * used to send M_PROTO messages without modifying the socket
6214 		 * state. I_SENDFD/RECVFD should not be used for socket file
6215 		 * descriptor passing since they assume a twisted stream.
6216 		 * SIOCATMARK must be used instead of I_ATMARK.
6217 		 *
6218 		 * _SIOCSOCKFALLBACK from an application should never be
6219 		 * processed.  It is only generated by socktpi_open() or
6220 		 * in response to I_POP or I_PUSH.
6221 		 */
6222 #ifdef DEBUG
6223 		zcmn_err(getzoneid(), CE_WARN,
6224 		    "Unsupported STREAMS ioctl 0x%x on socket. "
6225 		    "Pid = %d\n", cmd, curproc->p_pid);
6226 #endif /* DEBUG */
6227 		return (EOPNOTSUPP);
6228 
6229 	case _I_GETPEERCRED:
6230 		if ((mode & FKIOCTL) == 0)
6231 			return (EINVAL);
6232 
6233 		mutex_enter(&so->so_lock);
6234 		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
6235 			error = ENOTSUP;
6236 		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
6237 			error = ENOTCONN;
6238 		} else if (so->so_peercred != NULL) {
6239 			k_peercred_t *kp = (k_peercred_t *)arg;
6240 			kp->pc_cr = so->so_peercred;
6241 			kp->pc_cpid = so->so_cpid;
6242 			crhold(so->so_peercred);
6243 		} else {
6244 			error = EINVAL;
6245 		}
6246 		mutex_exit(&so->so_lock);
6247 		return (error);
6248 
6249 	default:
6250 		/*
6251 		 * Do the higher-order bits of the ioctl cmd indicate
6252 		 * that it is an I_* streams ioctl?
6253 		 */
6254 		if ((cmd & 0xffffff00U) == STR &&
6255 		    so->so_version == SOV_SOCKBSD) {
6256 #ifdef DEBUG
6257 			zcmn_err(getzoneid(), CE_WARN,
6258 			    "Unsupported STREAMS ioctl 0x%x on socket. "
6259 			    "Pid = %d\n", cmd, 	curproc->p_pid);
6260 #endif /* DEBUG */
6261 			return (EOPNOTSUPP);
6262 		}
6263 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6264 	}
6265 }
6266 
6267 /*
6268  * Handle plumbing-related ioctls.
6269  */
6270 static int
6271 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
6272     struct cred *cr, int32_t *rvalp)
6273 {
6274 	static const char sockmod_name[] = "sockmod";
6275 	struct sonode	*so = VTOSO(vp);
6276 	char		mname[FMNAMESZ + 1];
6277 	int		error;
6278 	sotpi_info_t	*sti = SOTOTPI(so);
6279 
6280 	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
6281 
6282 	if (so->so_version == SOV_SOCKBSD)
6283 		return (EOPNOTSUPP);
6284 
6285 	if (so->so_version == SOV_STREAM) {
6286 		/*
6287 		 * The imaginary "sockmod" has been popped - act as a stream.
6288 		 * If this is a push of sockmod then change back to a socket.
6289 		 */
6290 		if (cmd == I_PUSH) {
6291 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6292 			    (void *)arg, mname, sizeof (mname), NULL);
6293 
6294 			if (error == 0 && strcmp(mname, sockmod_name) == 0) {
6295 				dprintso(so, 0, ("socktpi_ioctl: going to "
6296 				    "socket version\n"));
6297 				so_stream2sock(so);
6298 				return (0);
6299 			}
6300 		}
6301 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6302 	}
6303 
6304 	switch (cmd) {
6305 	case I_PUSH:
6306 		if (sti->sti_direct) {
6307 			mutex_enter(&so->so_lock);
6308 			so_lock_single(so);
6309 			mutex_exit(&so->so_lock);
6310 
6311 			error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
6312 			    cr, rvalp);
6313 
6314 			mutex_enter(&so->so_lock);
6315 			if (error == 0)
6316 				sti->sti_direct = 0;
6317 			so_unlock_single(so, SOLOCKED);
6318 			mutex_exit(&so->so_lock);
6319 
6320 			if (error != 0)
6321 				return (error);
6322 		}
6323 
6324 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6325 		if (error == 0)
6326 			sti->sti_pushcnt++;
6327 		return (error);
6328 
6329 	case I_POP:
6330 		if (sti->sti_pushcnt == 0) {
6331 			/* Emulate sockmod being popped */
6332 			dprintso(so, 0,
6333 			    ("socktpi_ioctl: going to STREAMS version\n"));
6334 			return (so_sock2stream(so));
6335 		}
6336 
6337 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6338 		if (error == 0)
6339 			sti->sti_pushcnt--;
6340 		return (error);
6341 
6342 	case I_LIST: {
6343 		struct str_mlist *kmlistp, *umlistp;
6344 		struct str_list	kstrlist;
6345 		ssize_t		kstrlistsize;
6346 		int		i, nmods;
6347 
6348 		STRUCT_DECL(str_list, ustrlist);
6349 		STRUCT_INIT(ustrlist, mode);
6350 
6351 		if (arg == NULL) {
6352 			error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6353 			if (error == 0)
6354 				(*rvalp)++;	/* Add one for sockmod */
6355 			return (error);
6356 		}
6357 
6358 		error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
6359 		    STRUCT_SIZE(ustrlist), mode & FKIOCTL);
6360 		if (error != 0)
6361 			return (error);
6362 
6363 		nmods = STRUCT_FGET(ustrlist, sl_nmods);
6364 		if (nmods <= 0)
6365 			return (EINVAL);
6366 		/*
6367 		 * Ceiling nmods at nstrpush to prevent someone from
6368 		 * maliciously consuming lots of kernel memory.
6369 		 */
6370 		nmods = MIN(nmods, nstrpush);
6371 
6372 		kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6373 		kstrlist.sl_nmods = nmods;
6374 		kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6375 
6376 		error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6377 		    cr, rvalp);
6378 		if (error != 0)
6379 			goto done;
6380 
6381 		/*
6382 		 * Considering the module list as a 0-based array of sl_nmods
6383 		 * modules, sockmod should conceptually exist at slot
6384 		 * sti_pushcnt.  Insert sockmod at this location by sliding all
6385 		 * of the module names after so_pushcnt over by one.  We know
6386 		 * that there will be room to do this since we allocated
6387 		 * sl_modlist with an additional slot.
6388 		 */
6389 		for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6390 			kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6391 
6392 		(void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6393 		kstrlist.sl_nmods++;
6394 
6395 		/*
6396 		 * Copy all of the entries out to ustrlist.
6397 		 */
6398 		kmlistp = kstrlist.sl_modlist;
6399 		umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6400 		for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6401 			error = so_copyout(kmlistp++, umlistp++,
6402 			    sizeof (struct str_mlist), mode & FKIOCTL);
6403 			if (error != 0)
6404 				goto done;
6405 		}
6406 
6407 		error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6408 		    mode & FKIOCTL);
6409 		if (error == 0)
6410 			*rvalp = 0;
6411 	done:
6412 		kmem_free(kstrlist.sl_modlist, kstrlistsize);
6413 		return (error);
6414 	}
6415 	case I_LOOK:
6416 		if (sti->sti_pushcnt == 0) {
6417 			return (so_copyout(sockmod_name, (void *)arg,
6418 			    sizeof (sockmod_name), mode & FKIOCTL));
6419 		}
6420 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6421 
6422 	case I_FIND:
6423 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6424 		if (error && error != EINVAL)
6425 			return (error);
6426 
6427 		/* if not found and string was sockmod return 1 */
6428 		if (*rvalp == 0 || error == EINVAL) {
6429 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6430 			    (void *)arg, mname, sizeof (mname), NULL);
6431 			if (error == ENAMETOOLONG)
6432 				error = EINVAL;
6433 
6434 			if (error == 0 && strcmp(mname, sockmod_name) == 0)
6435 				*rvalp = 1;
6436 		}
6437 		return (error);
6438 
6439 	default:
6440 		panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6441 		break;
6442 	}
6443 
6444 	return (0);
6445 }
6446 
6447 /*
6448  * Wrapper around the streams poll routine that implements socket poll
6449  * semantics.
6450  * The sockfs never calls pollwakeup itself - the stream head take care
6451  * of all pollwakeups. Since sockfs never holds so_lock when calling the
6452  * stream head there can never be a deadlock due to holding so_lock across
6453  * pollwakeup and acquiring so_lock in this routine.
6454  *
6455  * However, since the performance of VOP_POLL is critical we avoid
6456  * acquiring so_lock here. This is based on two assumptions:
6457  *  - The poll implementation holds locks to serialize the VOP_POLL call
6458  *    and a pollwakeup for the same pollhead. This ensures that should
6459  *    e.g. so_state change during a socktpi_poll call the pollwakeup
6460  *    (which strsock_* and strrput conspire to issue) is issued after
6461  *    the state change. Thus the pollwakeup will block until VOP_POLL has
6462  *    returned and then wake up poll and have it call VOP_POLL again.
6463  *  - The reading of so_state without holding so_lock does not result in
6464  *    stale data that is older than the latest state change that has dropped
6465  *    so_lock. This is ensured by the mutex_exit issuing the appropriate
6466  *    memory barrier to force the data into the coherency domain.
6467  */
6468 static int
6469 sotpi_poll(
6470 	struct sonode	*so,
6471 	short		events,
6472 	int		anyyet,
6473 	short		*reventsp,
6474 	struct pollhead **phpp)
6475 {
6476 	short origevents = events;
6477 	struct vnode *vp = SOTOV(so);
6478 	int error;
6479 	int so_state = so->so_state;	/* snapshot */
6480 	sotpi_info_t *sti = SOTOTPI(so);
6481 
6482 	dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6483 	    (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6484 
6485 	ASSERT(vp->v_type == VSOCK);
6486 	ASSERT(vp->v_stream != NULL);
6487 
6488 	if (so->so_version == SOV_STREAM) {
6489 		/* The imaginary "sockmod" has been popped - act as a stream */
6490 		return (strpoll(vp->v_stream, events, anyyet,
6491 		    reventsp, phpp));
6492 	}
6493 
6494 	if (!(so_state & SS_ISCONNECTED) &&
6495 	    (so->so_mode & SM_CONNREQUIRED)) {
6496 		/* Not connected yet - turn off write side events */
6497 		events &= ~(POLLOUT|POLLWRBAND);
6498 	}
6499 	/*
6500 	 * Check for errors without calling strpoll if the caller wants them.
6501 	 * In sockets the errors are represented as input/output events
6502 	 * and there is no need to ask the stream head for this information.
6503 	 */
6504 	if (so->so_error != 0 &&
6505 	    ((POLLIN|POLLRDNORM|POLLOUT) & origevents)  != 0) {
6506 		*reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6507 		return (0);
6508 	}
6509 	/*
6510 	 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6511 	 * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
6512 	 * will not trigger a POLLIN event with POLLRDDATA set.
6513 	 * The handling of urgent data (causing POLLRDBAND) is done by
6514 	 * inspecting SS_OOBPEND below.
6515 	 */
6516 	events |= POLLRDDATA;
6517 
6518 	/*
6519 	 * After shutdown(output) a stream head write error is set.
6520 	 * However, we should not return output events.
6521 	 */
6522 	events |= POLLNOERR;
6523 	error = strpoll(vp->v_stream, events, anyyet,
6524 	    reventsp, phpp);
6525 	if (error)
6526 		return (error);
6527 
6528 	ASSERT(!(*reventsp & POLLERR));
6529 
6530 	/*
6531 	 * Notes on T_CONN_IND handling for sockets.
6532 	 *
6533 	 * If strpoll() returned without events, SR_POLLIN is guaranteed
6534 	 * to be set, ensuring any subsequent strrput() runs pollwakeup().
6535 	 *
6536 	 * Since the so_lock is not held, soqueueconnind() may have run
6537 	 * and a T_CONN_IND may be waiting. We now check for any queued
6538 	 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6539 	 * to ensure poll returns.
6540 	 *
6541 	 * However:
6542 	 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6543 	 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6544 	 * the following actions will occur; taken together they ensure the
6545 	 * syscall will return.
6546 	 *
6547 	 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6548 	 *    the accept() was run on a non-blocking socket sowaitconnind()
6549 	 *    may have already returned EWOULDBLOCK, so not be waiting to
6550 	 *    process the message. Additionally socktpi_poll() has probably
6551 	 *    proceeded past the sti_conn_ind_head check below.
6552 	 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6553 	 *    this thread,  however that could occur before poll_common()
6554 	 *    has entered cv_wait.
6555 	 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6556 	 *
6557 	 * Before proceeding to cv_wait() in poll_common() for an event,
6558 	 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6559 	 * and if set, re-calls strpoll() to ensure the late arriving
6560 	 * T_CONN_IND is recognized, and pollsys() returns.
6561 	 */
6562 
6563 	if (sti->sti_conn_ind_head != NULL)
6564 		*reventsp |= (POLLIN|POLLRDNORM) & events;
6565 
6566 	if (so->so_state & SS_OOBPEND)
6567 		*reventsp |= POLLRDBAND & events;
6568 
6569 	if (sti->sti_nl7c_rcv_mp != NULL) {
6570 		*reventsp |= (POLLIN|POLLRDNORM) & events;
6571 	}
6572 	if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
6573 	    ((POLLIN|POLLRDNORM) & *reventsp)) {
6574 		sti->sti_nl7c_flags |= NL7C_POLLIN;
6575 	}
6576 
6577 	return (0);
6578 }
6579 
6580 /*ARGSUSED*/
6581 static int
6582 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6583 {
6584 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6585 	int error = 0;
6586 
6587 	error = sonode_constructor(buf, cdrarg, kmflags);
6588 	if (error != 0)
6589 		return (error);
6590 
6591 	error = i_sotpi_info_constructor(&st->st_info);
6592 	if (error != 0)
6593 		sonode_destructor(buf, cdrarg);
6594 
6595 	st->st_sonode.so_priv = &st->st_info;
6596 
6597 	return (error);
6598 }
6599 
6600 /*ARGSUSED1*/
6601 static void
6602 socktpi_destructor(void *buf, void *cdrarg)
6603 {
6604 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6605 
6606 	ASSERT(st->st_sonode.so_priv == &st->st_info);
6607 	st->st_sonode.so_priv = NULL;
6608 
6609 	i_sotpi_info_destructor(&st->st_info);
6610 	sonode_destructor(buf, cdrarg);
6611 }
6612 
6613 static int
6614 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6615 {
6616 	int retval;
6617 
6618 	if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6619 		struct sonode *so = (struct sonode *)buf;
6620 		sotpi_info_t *sti = SOTOTPI(so);
6621 
6622 		mutex_enter(&socklist.sl_lock);
6623 
6624 		sti->sti_next_so = socklist.sl_list;
6625 		sti->sti_prev_so = NULL;
6626 		if (sti->sti_next_so != NULL)
6627 			SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6628 		socklist.sl_list = so;
6629 
6630 		mutex_exit(&socklist.sl_lock);
6631 
6632 	}
6633 	return (retval);
6634 }
6635 
6636 static void
6637 socktpi_unix_destructor(void *buf, void *cdrarg)
6638 {
6639 	struct sonode	*so = (struct sonode *)buf;
6640 	sotpi_info_t	*sti = SOTOTPI(so);
6641 
6642 	mutex_enter(&socklist.sl_lock);
6643 
6644 	if (sti->sti_next_so != NULL)
6645 		SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6646 	if (sti->sti_prev_so != NULL)
6647 		SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6648 	else
6649 		socklist.sl_list = sti->sti_next_so;
6650 
6651 	mutex_exit(&socklist.sl_lock);
6652 
6653 	socktpi_destructor(buf, cdrarg);
6654 }
6655 
6656 int
6657 socktpi_init(void)
6658 {
6659 	/*
6660 	 * Create sonode caches.  We create a special one for AF_UNIX so
6661 	 * that we can track them for netstat(1m).
6662 	 */
6663 	socktpi_cache = kmem_cache_create("socktpi_cache",
6664 	    sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6665 	    socktpi_destructor, NULL, NULL, NULL, 0);
6666 
6667 	socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6668 	    sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6669 	    socktpi_unix_destructor, NULL, NULL, NULL, 0);
6670 
6671 	return (0);
6672 }
6673 
6674 /*
6675  * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6676  *
6677  * Caller must still update state and mode using sotpi_update_state().
6678  */
6679 int
6680 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6681     boolean_t *direct, queue_t **qp, struct cred *cr)
6682 {
6683 	sotpi_info_t *sti;
6684 	struct sockparams *origsp = so->so_sockparams;
6685 	sock_lower_handle_t handle = so->so_proto_handle;
6686 	struct stdata *stp;
6687 	struct vnode *vp;
6688 	queue_t *q;
6689 	int error = 0;
6690 
6691 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6692 	    SS_FALLBACK_PENDING);
6693 	ASSERT(SOCK_IS_NONSTR(so));
6694 
6695 	*qp = NULL;
6696 	*direct = B_FALSE;
6697 	so->so_sockparams = newsp;
6698 	/*
6699 	 * Allocate and initalize fields required by TPI.
6700 	 */
6701 	(void) sotpi_info_create(so, KM_SLEEP);
6702 	sotpi_info_init(so);
6703 
6704 	if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6705 		sotpi_info_fini(so);
6706 		sotpi_info_destroy(so);
6707 		return (error);
6708 	}
6709 	ASSERT(handle == so->so_proto_handle);
6710 	sti = SOTOTPI(so);
6711 	if (sti->sti_direct != 0)
6712 		*direct = B_TRUE;
6713 
6714 	/*
6715 	 * When it comes to urgent data we have two cases to deal with;
6716 	 * (1) The oob byte has already arrived, or (2) the protocol has
6717 	 * notified that oob data is pending, but it has not yet arrived.
6718 	 *
6719 	 * For (1) all we need to do is send a T_EXDATA_IND to indicate were
6720 	 * in the byte stream the oob byte is. For (2) we have to send a
6721 	 * SIGURG (M_PCSIG), followed by a zero-length mblk indicating whether
6722 	 * the oob byte will be the next byte from the protocol.
6723 	 *
6724 	 * So in the worst case we need two mblks, one for the signal, another
6725 	 * for mark indication. In that case we use the exdata_mp for the sig.
6726 	 */
6727 	sti->sti_exdata_mp = allocb_wait(sizeof (struct T_exdata_ind), BPRI_MED,
6728 	    STR_NOSIG, NULL);
6729 	sti->sti_urgmark_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);
6730 
6731 	/*
6732 	 * Keep the original sp around so we can properly dispose of the
6733 	 * sonode when the socket is being closed.
6734 	 */
6735 	sti->sti_orig_sp = origsp;
6736 
6737 	so_basic_strinit(so);	/* skips the T_CAPABILITY_REQ */
6738 	so_alloc_addr(so, so->so_max_addr_len);
6739 
6740 	/*
6741 	 * If the application has done a SIOCSPGRP, make sure the
6742 	 * STREAM head is aware. This needs to take place before
6743 	 * the protocol start sending up messages. Otherwise we
6744 	 * might miss to generate SIGPOLL.
6745 	 *
6746 	 * It is possible that the application will receive duplicate
6747 	 * signals if some were already generated for either data or
6748 	 * connection indications.
6749 	 */
6750 	if (so->so_pgrp != 0) {
6751 		if (so_set_events(so, so->so_vnode, cr) != 0)
6752 			so->so_pgrp = 0;
6753 	}
6754 
6755 	/*
6756 	 * Determine which queue to use.
6757 	 */
6758 	vp = SOTOV(so);
6759 	stp = vp->v_stream;
6760 	ASSERT(stp != NULL);
6761 	q = stp->sd_wrq->q_next;
6762 
6763 	/*
6764 	 * Skip any modules that may have been auto pushed when the device
6765 	 * was opened
6766 	 */
6767 	while (q->q_next != NULL)
6768 		q = q->q_next;
6769 	*qp = _RD(q);
6770 
6771 	/* This is now a STREAMS sockets */
6772 	so->so_not_str = B_FALSE;
6773 
6774 	return (error);
6775 }
6776 
6777 /*
6778  * Revert a TPI sonode. It is only allowed to revert the sonode during
6779  * the fallback process.
6780  */
6781 void
6782 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6783 {
6784 	vnode_t *vp = SOTOV(so);
6785 
6786 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6787 	    SS_FALLBACK_PENDING);
6788 	ASSERT(!SOCK_IS_NONSTR(so));
6789 	ASSERT(vp->v_stream != NULL);
6790 
6791 	if (SOTOTPI(so)->sti_exdata_mp != NULL) {
6792 		freeb(SOTOTPI(so)->sti_exdata_mp);
6793 		SOTOTPI(so)->sti_exdata_mp = NULL;
6794 	}
6795 
6796 	if (SOTOTPI(so)->sti_urgmark_mp != NULL) {
6797 		freeb(SOTOTPI(so)->sti_urgmark_mp);
6798 		SOTOTPI(so)->sti_urgmark_mp = NULL;
6799 	}
6800 
6801 	strclean(vp);
6802 	(void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6803 
6804 	/*
6805 	 * Restore the original sockparams. The caller is responsible for
6806 	 * dropping the ref to the new sp.
6807 	 */
6808 	so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6809 
6810 	sotpi_info_fini(so);
6811 	sotpi_info_destroy(so);
6812 
6813 	/* This is no longer a STREAMS sockets */
6814 	so->so_not_str = B_TRUE;
6815 }
6816 
6817 void
6818 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6819     struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6820     socklen_t faddrlen, short opts)
6821 {
6822 	sotpi_info_t *sti = SOTOTPI(so);
6823 
6824 	so_proc_tcapability_ack(so, tcap);
6825 
6826 	so->so_options |= opts;
6827 
6828 	/*
6829 	 * Determine whether the foreign and local address are valid
6830 	 */
6831 	if (laddrlen != 0) {
6832 		ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6833 		sti->sti_laddr_len = laddrlen;
6834 		bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6835 		sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6836 	}
6837 
6838 	if (faddrlen != 0) {
6839 		ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6840 		sti->sti_faddr_len = faddrlen;
6841 		bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6842 		sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6843 	}
6844 
6845 }
6846 
6847 /*
6848  * Allocate enough space to cache the local and foreign addresses.
6849  */
6850 void
6851 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6852 {
6853 	sotpi_info_t *sti = SOTOTPI(so);
6854 
6855 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6856 	ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6857 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6858 	    P2ROUNDUP(maxlen, KMEM_ALIGN);
6859 	so->so_max_addr_len = sti->sti_laddr_maxlen;
6860 	sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6861 	sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6862 	    + sti->sti_laddr_maxlen);
6863 
6864 	if (so->so_family == AF_UNIX) {
6865 		/*
6866 		 * Initialize AF_UNIX related fields.
6867 		 */
6868 		bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6869 		bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6870 	}
6871 }
6872 
6873 
6874 sotpi_info_t *
6875 sotpi_sototpi(struct sonode *so)
6876 {
6877 	sotpi_info_t *sti;
6878 
6879 	ASSERT(so != NULL);
6880 
6881 	sti = (sotpi_info_t *)so->so_priv;
6882 
6883 	ASSERT(sti != NULL);
6884 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6885 
6886 	return (sti);
6887 }
6888 
6889 static int
6890 i_sotpi_info_constructor(sotpi_info_t *sti)
6891 {
6892 	sti->sti_magic		= SOTPI_INFO_MAGIC;
6893 	sti->sti_ack_mp		= NULL;
6894 	sti->sti_discon_ind_mp	= NULL;
6895 	sti->sti_ux_bound_vp	= NULL;
6896 	sti->sti_unbind_mp	= NULL;
6897 
6898 	sti->sti_conn_ind_head	= NULL;
6899 	sti->sti_conn_ind_tail	= NULL;
6900 
6901 	sti->sti_laddr_sa	= NULL;
6902 	sti->sti_faddr_sa	= NULL;
6903 
6904 	sti->sti_nl7c_flags	= 0;
6905 	sti->sti_nl7c_uri	= NULL;
6906 	sti->sti_nl7c_rcv_mp	= NULL;
6907 
6908 	sti->sti_exdata_mp	= NULL;
6909 	sti->sti_urgmark_mp	= NULL;
6910 
6911 	mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6912 	cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6913 
6914 	return (0);
6915 }
6916 
6917 static void
6918 i_sotpi_info_destructor(sotpi_info_t *sti)
6919 {
6920 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6921 	ASSERT(sti->sti_ack_mp == NULL);
6922 	ASSERT(sti->sti_discon_ind_mp == NULL);
6923 	ASSERT(sti->sti_ux_bound_vp == NULL);
6924 	ASSERT(sti->sti_unbind_mp == NULL);
6925 
6926 	ASSERT(sti->sti_conn_ind_head == NULL);
6927 	ASSERT(sti->sti_conn_ind_tail == NULL);
6928 
6929 	ASSERT(sti->sti_laddr_sa == NULL);
6930 	ASSERT(sti->sti_faddr_sa == NULL);
6931 
6932 	ASSERT(sti->sti_nl7c_flags == 0);
6933 	ASSERT(sti->sti_nl7c_uri == NULL);
6934 	ASSERT(sti->sti_nl7c_rcv_mp == NULL);
6935 
6936 	ASSERT(sti->sti_exdata_mp == NULL);
6937 	ASSERT(sti->sti_urgmark_mp == NULL);
6938 
6939 	mutex_destroy(&sti->sti_plumb_lock);
6940 	cv_destroy(&sti->sti_ack_cv);
6941 }
6942 
6943 /*
6944  * Creates and attaches TPI information to the given sonode
6945  */
6946 static boolean_t
6947 sotpi_info_create(struct sonode *so, int kmflags)
6948 {
6949 	sotpi_info_t *sti;
6950 
6951 	ASSERT(so->so_priv == NULL);
6952 
6953 	if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6954 		return (B_FALSE);
6955 
6956 	if (i_sotpi_info_constructor(sti) != 0) {
6957 		kmem_free(sti, sizeof (*sti));
6958 		return (B_FALSE);
6959 	}
6960 
6961 	so->so_priv = (void *)sti;
6962 	return (B_TRUE);
6963 }
6964 
6965 /*
6966  * Initializes the TPI information.
6967  */
6968 static void
6969 sotpi_info_init(struct sonode *so)
6970 {
6971 	struct vnode *vp = SOTOV(so);
6972 	sotpi_info_t *sti = SOTOTPI(so);
6973 	time_t now;
6974 
6975 	sti->sti_dev 	= so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6976 	vp->v_rdev	= sti->sti_dev;
6977 
6978 	sti->sti_orig_sp = NULL;
6979 
6980 	sti->sti_pushcnt = 0;
6981 
6982 	now = gethrestime_sec();
6983 	sti->sti_atime	= now;
6984 	sti->sti_mtime	= now;
6985 	sti->sti_ctime	= now;
6986 
6987 	sti->sti_eaddr_mp = NULL;
6988 	sti->sti_delayed_error = 0;
6989 
6990 	sti->sti_provinfo = NULL;
6991 
6992 	sti->sti_oobcnt = 0;
6993 	sti->sti_oobsigcnt = 0;
6994 
6995 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6996 
6997 	sti->sti_laddr_sa	= 0;
6998 	sti->sti_faddr_sa	= 0;
6999 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
7000 	sti->sti_laddr_len = sti->sti_faddr_len = 0;
7001 
7002 	sti->sti_laddr_valid = 0;
7003 	sti->sti_faddr_valid = 0;
7004 	sti->sti_faddr_noxlate = 0;
7005 
7006 	sti->sti_direct = 0;
7007 
7008 	ASSERT(sti->sti_ack_mp == NULL);
7009 	ASSERT(sti->sti_ux_bound_vp == NULL);
7010 	ASSERT(sti->sti_unbind_mp == NULL);
7011 
7012 	ASSERT(sti->sti_conn_ind_head == NULL);
7013 	ASSERT(sti->sti_conn_ind_tail == NULL);
7014 
7015 	/* Initialize the kernel SSL proxy fields */
7016 	sti->sti_kssl_type = KSSL_NO_PROXY;
7017 	sti->sti_kssl_ent = NULL;
7018 	sti->sti_kssl_ctx = NULL;
7019 }
7020 
7021 /*
7022  * Given a sonode, grab the TPI info and free any data.
7023  */
7024 static void
7025 sotpi_info_fini(struct sonode *so)
7026 {
7027 	sotpi_info_t *sti = SOTOTPI(so);
7028 	mblk_t *mp;
7029 
7030 	ASSERT(sti->sti_discon_ind_mp == NULL);
7031 
7032 	if ((mp = sti->sti_conn_ind_head) != NULL) {
7033 		mblk_t *mp1;
7034 
7035 		while (mp) {
7036 			mp1 = mp->b_next;
7037 			mp->b_next = NULL;
7038 			freemsg(mp);
7039 			mp = mp1;
7040 		}
7041 		sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
7042 	}
7043 
7044 	/*
7045 	 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
7046 	 * indirect them.  It also uses so_count as a validity test.
7047 	 */
7048 	mutex_enter(&so->so_lock);
7049 
7050 	if (sti->sti_laddr_sa) {
7051 		ASSERT((caddr_t)sti->sti_faddr_sa ==
7052 		    (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
7053 		ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
7054 		sti->sti_laddr_valid = 0;
7055 		sti->sti_faddr_valid = 0;
7056 		kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
7057 		sti->sti_laddr_sa = NULL;
7058 		sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
7059 		sti->sti_faddr_sa = NULL;
7060 		sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
7061 	}
7062 
7063 	mutex_exit(&so->so_lock);
7064 
7065 	if ((mp = sti->sti_eaddr_mp) != NULL) {
7066 		freemsg(mp);
7067 		sti->sti_eaddr_mp = NULL;
7068 		sti->sti_delayed_error = 0;
7069 	}
7070 
7071 	if ((mp = sti->sti_ack_mp) != NULL) {
7072 		freemsg(mp);
7073 		sti->sti_ack_mp = NULL;
7074 	}
7075 
7076 	if ((mp = sti->sti_nl7c_rcv_mp) != NULL) {
7077 		sti->sti_nl7c_rcv_mp = NULL;
7078 		freemsg(mp);
7079 	}
7080 	sti->sti_nl7c_rcv_rval = 0;
7081 	if (sti->sti_nl7c_uri != NULL) {
7082 		nl7c_urifree(so);
7083 		/* urifree() cleared nl7c_uri */
7084 	}
7085 	if (sti->sti_nl7c_flags) {
7086 		sti->sti_nl7c_flags = 0;
7087 	}
7088 
7089 	ASSERT(sti->sti_ux_bound_vp == NULL);
7090 	if ((mp = sti->sti_unbind_mp) != NULL) {
7091 		freemsg(mp);
7092 		sti->sti_unbind_mp = NULL;
7093 	}
7094 }
7095 
7096 /*
7097  * Destroys the TPI information attached to a sonode.
7098  */
7099 static void
7100 sotpi_info_destroy(struct sonode *so)
7101 {
7102 	sotpi_info_t *sti = SOTOTPI(so);
7103 
7104 	i_sotpi_info_destructor(sti);
7105 	kmem_free(sti, sizeof (*sti));
7106 
7107 	so->so_priv = NULL;
7108 }
7109 
7110 /*
7111  * Create the global sotpi socket module entry. It will never be freed.
7112  */
7113 smod_info_t *
7114 sotpi_smod_create(void)
7115 {
7116 	smod_info_t *smodp;
7117 
7118 	smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
7119 	smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
7120 	(void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
7121 	/*
7122 	 * Initialize the smod_refcnt to 1 so it will never be freed.
7123 	 */
7124 	smodp->smod_refcnt = 1;
7125 	smodp->smod_uc_version = SOCK_UC_VERSION;
7126 	smodp->smod_dc_version = SOCK_DC_VERSION;
7127 	smodp->smod_sock_create_func = &sotpi_create;
7128 	smodp->smod_sock_destroy_func = &sotpi_destroy;
7129 	return (smodp);
7130 }
7131