xref: /titanic_52/usr/src/uts/common/fs/sockfs/socktpi.c (revision 9b4e3ac25d882519cad3fc11f0c53b07f4e60536)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/t_lock.h>
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/buf.h>
32 #include <sys/conf.h>
33 #include <sys/cred.h>
34 #include <sys/kmem.h>
35 #include <sys/kmem_impl.h>
36 #include <sys/sysmacros.h>
37 #include <sys/vfs.h>
38 #include <sys/vnode.h>
39 #include <sys/debug.h>
40 #include <sys/errno.h>
41 #include <sys/time.h>
42 #include <sys/file.h>
43 #include <sys/open.h>
44 #include <sys/user.h>
45 #include <sys/termios.h>
46 #include <sys/stream.h>
47 #include <sys/strsubr.h>
48 #include <sys/strsun.h>
49 #include <sys/suntpi.h>
50 #include <sys/ddi.h>
51 #include <sys/esunddi.h>
52 #include <sys/flock.h>
53 #include <sys/modctl.h>
54 #include <sys/vtrace.h>
55 #include <sys/cmn_err.h>
56 #include <sys/pathname.h>
57 
58 #include <sys/socket.h>
59 #include <sys/socketvar.h>
60 #include <sys/sockio.h>
61 #include <sys/sodirect.h>
62 #include <netinet/in.h>
63 #include <sys/un.h>
64 #include <sys/strsun.h>
65 
66 #include <sys/tiuser.h>
67 #define	_SUN_TPI_VERSION	2
68 #include <sys/tihdr.h>
69 #include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
70 
71 #include <c2/audit.h>
72 
73 #include <inet/common.h>
74 #include <inet/ip.h>
75 #include <inet/ip6.h>
76 #include <inet/tcp.h>
77 #include <inet/udp_impl.h>
78 
79 #include <sys/zone.h>
80 
81 #include <fs/sockfs/nl7c.h>
82 #include <fs/sockfs/nl7curi.h>
83 
84 #include <inet/kssl/ksslapi.h>
85 
86 #include <fs/sockfs/sockcommon.h>
87 #include <fs/sockfs/socktpi.h>
88 #include <fs/sockfs/socktpi_impl.h>
89 
90 /*
91  * Possible failures when memory can't be allocated. The documented behavior:
92  *
93  * 		5.5:			4.X:		XNET:
94  * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
95  *							EINTR
96  *	(4.X does not document EINTR but returns it)
97  * bind:	ENOSR			-		ENOBUFS/ENOSR
98  * connect: 	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
99  * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
100  * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
101  *	(4.X getpeername and getsockname do not fail in practice)
102  * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
103  * listen:	-			-		ENOBUFS
104  * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
105  *							EINTR
106  * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
107  *							EINTR
108  * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
109  * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
110  * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
111  * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
112  *
113  * Resolution. When allocation fails:
114  *	recv: return EINTR
115  *	send: return EINTR
116  *	connect, accept: EINTR
117  *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
118  *	socket, socketpair: ENOBUFS
119  *	getpeername, getsockname: sleep
120  *	getsockopt, setsockopt: sleep
121  */
122 
123 #ifdef SOCK_TEST
124 /*
125  * Variables that make sockfs do something other than the standard TPI
126  * for the AF_INET transports.
127  *
128  * solisten_tpi_tcp:
129  *	TCP can handle a O_T_BIND_REQ with an increased backlog even though
130  *	the transport is already bound. This is needed to avoid losing the
131  *	port number should listen() do a T_UNBIND_REQ followed by a
132  *	O_T_BIND_REQ.
133  *
134  * soconnect_tpi_udp:
135  *	UDP and ICMP can handle a T_CONN_REQ.
136  *	This is needed to make the sequence of connect(), getsockname()
137  *	return the local IP address used to send packets to the connected to
138  *	destination.
139  *
140  * soconnect_tpi_tcp:
141  *	TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
142  *	Set this to non-zero to send TPI conformant messages to TCP in this
143  *	respect. This is a performance optimization.
144  *
145  * soaccept_tpi_tcp:
146  *	TCP can handle a T_CONN_REQ without the acceptor being bound.
147  *	This is a performance optimization that has been picked up in XTI.
148  *
149  * soaccept_tpi_multioptions:
150  *	When inheriting SOL_SOCKET options from the listener to the accepting
151  *	socket send them as a single message for AF_INET{,6}.
152  */
153 int solisten_tpi_tcp = 0;
154 int soconnect_tpi_udp = 0;
155 int soconnect_tpi_tcp = 0;
156 int soaccept_tpi_tcp = 0;
157 int soaccept_tpi_multioptions = 1;
158 #else /* SOCK_TEST */
159 #define	soconnect_tpi_tcp	0
160 #define	soconnect_tpi_udp	0
161 #define	solisten_tpi_tcp	0
162 #define	soaccept_tpi_tcp	0
163 #define	soaccept_tpi_multioptions	1
164 #endif /* SOCK_TEST */
165 
166 #ifdef SOCK_TEST
167 extern int do_useracc;
168 extern clock_t sock_test_timelimit;
169 #endif /* SOCK_TEST */
170 
171 /*
172  * Some X/Open added checks might have to be backed out to keep SunOS 4.X
173  * applications working. Turn on this flag to disable these checks.
174  */
175 int xnet_skip_checks = 0;
176 int xnet_check_print = 0;
177 int xnet_truncate_print = 0;
178 
179 static void sotpi_destroy(struct sonode *);
180 static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
181     int, int *, cred_t *cr);
182 
183 static boolean_t	sotpi_info_create(struct sonode *, int);
184 static void		sotpi_info_init(struct sonode *);
185 static void 		sotpi_info_fini(struct sonode *);
186 static void 		sotpi_info_destroy(struct sonode *);
187 
188 /*
189  * Do direct function call to the transport layer below; this would
190  * also allow the transport to utilize read-side synchronous stream
191  * interface if necessary.  This is a /etc/system tunable that must
192  * not be modified on a running system.  By default this is enabled
193  * for performance reasons and may be disabled for debugging purposes.
194  */
195 boolean_t socktpi_direct = B_TRUE;
196 
197 static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
198 
199 extern	void sigintr(k_sigset_t *, int);
200 extern	void sigunintr(k_sigset_t *);
201 
202 /* Sockets acting as an in-kernel SSL proxy */
203 extern mblk_t	*strsock_kssl_input(vnode_t *, mblk_t *, strwakeup_t *,
204 		    strsigset_t *, strsigset_t *, strpollset_t *);
205 extern mblk_t	*strsock_kssl_output(vnode_t *, mblk_t *, strwakeup_t *,
206 		    strsigset_t *, strsigset_t *, strpollset_t *);
207 
208 static int	sotpi_unbind(struct sonode *, int);
209 
210 extern int	sodput(sodirect_t *, mblk_t *);
211 extern void	sodwakeup(sodirect_t *);
212 
213 /* TPI sockfs sonode operations */
214 int 		sotpi_init(struct sonode *, struct sonode *, struct cred *,
215 		    int);
216 static int	sotpi_accept(struct sonode *, int, struct cred *,
217 		    struct sonode **);
218 static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
219 		    int, struct cred *);
220 static int	sotpi_listen(struct sonode *, int, struct cred *);
221 static int	sotpi_connect(struct sonode *, const struct sockaddr *,
222 		    socklen_t, int, int, struct cred *);
223 extern int	sotpi_recvmsg(struct sonode *, struct nmsghdr *,
224 		    struct uio *, struct cred *);
225 static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
226 		    struct uio *, struct cred *);
227 static int	sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
228 		    struct cred *, mblk_t **);
229 static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
230 		    struct uio *, void *, t_uscalar_t, int);
231 static int	sodgram_direct(struct sonode *, struct sockaddr *,
232 		    socklen_t, struct uio *, int);
233 extern int	sotpi_getpeername(struct sonode *, struct sockaddr *,
234 		    socklen_t *, boolean_t, struct cred *);
235 static int	sotpi_getsockname(struct sonode *, struct sockaddr *,
236 		    socklen_t *, struct cred *);
237 static int	sotpi_shutdown(struct sonode *, int, struct cred *);
238 extern int	sotpi_getsockopt(struct sonode *, int, int, void *,
239 		    socklen_t *, int, struct cred *);
240 extern int	sotpi_setsockopt(struct sonode *, int, int, const void *,
241 		    socklen_t, struct cred *);
242 static int 	sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
243 		    int32_t *);
244 static int 	socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
245 		    struct cred *, int32_t *);
246 static int 	sotpi_poll(struct sonode *, short, int, short *,
247 		    struct pollhead **);
248 static int 	sotpi_close(struct sonode *, int, struct cred *);
249 
250 static int	i_sotpi_info_constructor(sotpi_info_t *);
251 static void 	i_sotpi_info_destructor(sotpi_info_t *);
252 
/*
 * Operations vector for TPI sockets.  sotpi_create() passes its address to
 * sonode_init(), and sotpi_destroy() asserts that so_ops still points here.
 * The initializers are positional; the comment beside each entry names the
 * slot it fills.
 */
253 sonodeops_t sotpi_sonodeops = {
254 	sotpi_init,		/* sop_init		*/
255 	sotpi_accept,		/* sop_accept		*/
256 	sotpi_bind,		/* sop_bind		*/
257 	sotpi_listen,		/* sop_listen		*/
258 	sotpi_connect,		/* sop_connect		*/
259 	sotpi_recvmsg,		/* sop_recvmsg		*/
260 	sotpi_sendmsg,		/* sop_sendmsg		*/
261 	sotpi_sendmblk,		/* sop_sendmblk		*/
262 	sotpi_getpeername,	/* sop_getpeername	*/
263 	sotpi_getsockname,	/* sop_getsockname	*/
264 	sotpi_shutdown,		/* sop_shutdown		*/
265 	sotpi_getsockopt,	/* sop_getsockopt	*/
266 	sotpi_setsockopt,	/* sop_setsockopt	*/
267 	sotpi_ioctl,		/* sop_ioctl		*/
268 	sotpi_poll,		/* sop_poll		*/
269 	sotpi_close,		/* sop_close		*/
270 };
271 
272 /*
273  * Return a TPI socket vnode.
274  *
275  * Note that sockets assume that the driver will clone (either itself
276  * or by using the clone driver) i.e. a socket() call will always
277  * result in a new vnode being created.
278  */
279 
280 /*
281  * Common create code for socket and accept. If tso is set the values
282  * from that node is used instead of issuing a T_INFO_REQ.
283  */
284 
285 /* ARGSUSED */
286 static struct sonode *
287 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
288     int version, int sflags, int *errorp, cred_t *cr)
289 {
290 	struct sonode	*so;
291 	kmem_cache_t 	*cp;
292 	int		sfamily = family;
293 
294 	ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
295 
296 	if (family == AF_NCA) {
297 		/*
298 		 * The request is for an NCA socket so for NL7C use the
299 		 * INET domain instead and mark NL7C_AF_NCA below.
300 		 */
301 		family = AF_INET;
302 		/*
303 		 * NL7C is not supported in the non-global zone,
304 		 * we enforce this restriction here.
305 		 */
306 		if (getzoneid() != GLOBAL_ZONEID) {
307 			*errorp = ENOTSUP;
308 			return (NULL);
309 		}
310 	}
311 
312 	/*
313 	 * to be compatible with old tpi socket implementation ignore
314 	 * sleep flag (sflags) passed in
315 	 */
316 	cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
317 	so = kmem_cache_alloc(cp, KM_SLEEP);
318 	if (so == NULL) {
319 		*errorp = ENOMEM;
320 		return (NULL);
321 	}
322 
323 	sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
324 	sotpi_info_init(so);
325 
326 	if (sfamily == AF_NCA) {
327 		SOTOTPI(so)->sti_nl7c_flags = NL7C_AF_NCA;
328 	}
329 
330 	if (version == SOV_DEFAULT)
331 		version = so_default_version;
332 
333 	so->so_version = (short)version;
334 	*errorp = 0;
335 
336 	return (so);
337 }
338 
339 static void
340 sotpi_destroy(struct sonode *so)
341 {
342 	kmem_cache_t *cp;
343 	struct sockparams *origsp;
344 
345 	/*
346 	 * If there is a new dealloc function (ie. smod_destroy_func),
347 	 * then it should check the correctness of the ops.
348 	 */
349 
350 	ASSERT(so->so_ops == &sotpi_sonodeops);
351 
352 	origsp = SOTOTPI(so)->sti_orig_sp;
353 
354 	sotpi_info_fini(so);
355 
356 	if (so->so_state & SS_FALLBACK_COMP) {
357 		/*
358 		 * A fallback happend, which means that a sotpi_info_t struct
359 		 * was allocated (as opposed to being allocated from the TPI
360 		 * sonode cache. Therefore we explicitly free the struct
361 		 * here.
362 		 */
363 		sotpi_info_destroy(so);
364 		ASSERT(origsp != NULL);
365 
366 		origsp->sp_smod_info->smod_sock_destroy_func(so);
367 		SOCKPARAMS_DEC_REF(origsp);
368 	} else {
369 		sonode_fini(so);
370 		cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
371 		    socktpi_cache;
372 		kmem_cache_free(cp, so);
373 	}
374 }
375 
376 /* ARGSUSED1 */
377 int
378 sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
379 {
380 	major_t maj;
381 	dev_t newdev;
382 	struct vnode *vp;
383 	int error = 0;
384 	struct stdata *stp;
385 
386 	sotpi_info_t *sti = SOTOTPI(so);
387 
388 	dprint(1, ("sotpi_init()\n"));
389 
390 	/*
391 	 * over write the sleep flag passed in but that is ok
392 	 * as tpi socket does not honor sleep flag.
393 	 */
394 	flags |= FREAD|FWRITE;
395 
396 	/*
397 	 * Record in so_flag that it is a clone.
398 	 */
399 	if (getmajor(sti->sti_dev) == clone_major)
400 		so->so_flag |= SOCLONE;
401 
402 	if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
403 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
404 	    (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
405 	    so->so_protocol == IPPROTO_IP)) {
406 		/* Tell tcp or udp that it's talking to sockets */
407 		flags |= SO_SOCKSTR;
408 
409 		/*
410 		 * Here we indicate to socktpi_open() our attempt to
411 		 * make direct calls between sockfs and transport.
412 		 * The final decision is left to socktpi_open().
413 		 */
414 		sti->sti_direct = 1;
415 
416 		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
417 		if (so->so_type == SOCK_STREAM && tso != NULL) {
418 			if (SOTOTPI(tso)->sti_direct) {
419 				/*
420 				 * Inherit sti_direct from listener and pass
421 				 * SO_ACCEPTOR open flag to tcp, indicating
422 				 * that this is an accept fast-path instance.
423 				 */
424 				flags |= SO_ACCEPTOR;
425 			} else {
426 				/*
427 				 * sti_direct is not set on listener, meaning
428 				 * that the listener has been converted from
429 				 * a socket to a stream.  Ensure that the
430 				 * acceptor inherits these settings.
431 				 */
432 				sti->sti_direct = 0;
433 				flags &= ~SO_SOCKSTR;
434 			}
435 		}
436 	}
437 
438 	/*
439 	 * Tell local transport that it is talking to sockets.
440 	 */
441 	if (so->so_family == AF_UNIX) {
442 		flags |= SO_SOCKSTR;
443 	}
444 
445 	vp = SOTOV(so);
446 	newdev = vp->v_rdev;
447 	maj = getmajor(newdev);
448 	ASSERT(STREAMSTAB(maj));
449 
450 	error = stropen(vp, &newdev, flags, cr);
451 
452 	stp = vp->v_stream;
453 	if (error == 0) {
454 		if (so->so_flag & SOCLONE)
455 			ASSERT(newdev != vp->v_rdev);
456 		mutex_enter(&so->so_lock);
457 		sti->sti_dev = newdev;
458 		vp->v_rdev = newdev;
459 		mutex_exit(&so->so_lock);
460 
461 		if (stp->sd_flag & STRISTTY) {
462 			/*
463 			 * this is a post SVR4 tty driver - a socket can not
464 			 * be a controlling terminal. Fail the open.
465 			 */
466 			(void) sotpi_close(so, flags, cr);
467 			return (ENOTTY);	/* XXX */
468 		}
469 
470 		ASSERT(stp->sd_wrq != NULL);
471 		sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
472 
473 		/*
474 		 * If caller is interested in doing direct function call
475 		 * interface to/from transport module, probe the module
476 		 * directly beneath the streamhead to see if it qualifies.
477 		 *
478 		 * We turn off the direct interface when qualifications fail.
479 		 * In the acceptor case, we simply turn off the sti_direct
480 		 * flag on the socket. We do the fallback after the accept
481 		 * has completed, before the new socket is returned to the
482 		 * application.
483 		 */
484 		if (sti->sti_direct) {
485 			queue_t *tq = stp->sd_wrq->q_next;
486 
487 			/*
488 			 * sti_direct is currently supported and tested
489 			 * only for tcp/udp; this is the main reason to
490 			 * have the following assertions.
491 			 */
492 			ASSERT(so->so_family == AF_INET ||
493 			    so->so_family == AF_INET6);
494 			ASSERT(so->so_protocol == IPPROTO_UDP ||
495 			    so->so_protocol == IPPROTO_TCP ||
496 			    so->so_protocol == IPPROTO_IP);
497 			ASSERT(so->so_type == SOCK_DGRAM ||
498 			    so->so_type == SOCK_STREAM);
499 
500 			/*
501 			 * Abort direct call interface if the module directly
502 			 * underneath the stream head is not defined with the
503 			 * _D_DIRECT flag.  This could happen in the tcp or
504 			 * udp case, when some other module is autopushed
505 			 * above it, or for some reasons the expected module
506 			 * isn't purely D_MP (which is the main requirement).
507 			 *
508 			 * Else, SS_DIRECT is valid. If the read-side Q has
509 			 * _QSODIRECT set then and uioasync is enabled then
510 			 * set SS_SODIRECT to enable sodirect.
511 			 */
512 			if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
513 			    !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
514 				int rval;
515 
516 				/* Continue on without direct calls */
517 				sti->sti_direct = 0;
518 
519 				/*
520 				 * Cannot issue ioctl on fallback socket since
521 				 * there is no conn associated with the queue.
522 				 * The fallback downcall will notify the proto
523 				 * of the change.
524 				 */
525 				if (!(flags & SO_ACCEPTOR) &&
526 				    !(flags & SO_FALLBACK)) {
527 					if ((error = strioctl(vp,
528 					    _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
529 					    cr, &rval)) != 0) {
530 						(void) sotpi_close(so, flags,
531 						    cr);
532 						return (error);
533 					}
534 				}
535 			} else if ((_OTHERQ(tq)->q_flag & _QSODIRECT) &&
536 			    uioasync.enabled) {
537 				/* Enable sodirect */
538 				so->so_state |= SS_SODIRECT;
539 			}
540 		}
541 
542 		if (flags & SO_FALLBACK) {
543 			/*
544 			 * The stream created does not have a conn.
545 			 * do stream set up after conn has been assigned
546 			 */
547 			return (error);
548 		}
549 		if (error = so_strinit(so, tso)) {
550 			(void) sotpi_close(so, flags, cr);
551 			return (error);
552 		}
553 
554 		/* Wildcard */
555 		if (so->so_protocol != so->so_sockparams->sp_protocol) {
556 			int protocol = so->so_protocol;
557 			/*
558 			 * Issue SO_PROTOTYPE setsockopt.
559 			 */
560 			error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
561 			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
562 			if (error != 0) {
563 				(void) sotpi_close(so, flags, cr);
564 				/*
565 				 * Setsockopt often fails with ENOPROTOOPT but
566 				 * socket() should fail with
567 				 * EPROTONOSUPPORT/EPROTOTYPE.
568 				 */
569 				return (EPROTONOSUPPORT);
570 			}
571 		}
572 
573 	} else {
574 		/*
575 		 * While the same socket can not be reopened (unlike specfs)
576 		 * the stream head sets STREOPENFAIL when the autopush fails.
577 		 */
578 		if ((stp != NULL) &&
579 		    (stp->sd_flag & STREOPENFAIL)) {
580 			/*
581 			 * Open failed part way through.
582 			 */
583 			mutex_enter(&stp->sd_lock);
584 			stp->sd_flag &= ~STREOPENFAIL;
585 			mutex_exit(&stp->sd_lock);
586 			(void) sotpi_close(so, flags, cr);
587 			return (error);
588 			/*NOTREACHED*/
589 		}
590 		ASSERT(stp == NULL);
591 	}
592 	TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
593 	    "sockfs open:maj %d vp %p so %p error %d",
594 	    maj, vp, so, error);
595 	return (error);
596 }
597 
598 /*
599  * Bind the socket to an unspecified address in sockfs only.
600  * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
601  * required in all cases.
602  */
603 static void
604 so_automatic_bind(struct sonode *so)
605 {
606 	sotpi_info_t *sti = SOTOTPI(so);
607 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
608 
609 	ASSERT(MUTEX_HELD(&so->so_lock));
610 	ASSERT(!(so->so_state & SS_ISBOUND));
611 	ASSERT(sti->sti_unbind_mp);
612 
613 	ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
614 	bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
615 	sti->sti_laddr_sa->sa_family = so->so_family;
616 	so->so_state |= SS_ISBOUND;
617 }
618 
619 
620 /*
621  * bind the socket.
622  *
623  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
624  * are passed in we allow rebinding. Note that for backwards compatibility
625  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
626  * Thus the rebinding code is currently not executed.
627  *
628  * The constraints for rebinding are:
629  * - it is a SOCK_DGRAM, or
630  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
631  *   and no listen() has been done.
632  * This rebinding code was added based on some language in the XNET book
633  * about not returning EINVAL if the protocol allows rebinding. However,
634  * this language is not present in the Posix socket draft. Thus maybe the
635  * rebinding logic should be deleted from the source.
636  *
637  * A null "name" can be used to unbind the socket if:
638  * - it is a SOCK_DGRAM, or
639  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
640  *   and no listen() has been done.
641  */
642 /* ARGSUSED */
643 static int
644 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
645     socklen_t namelen, int backlog, int flags, struct cred *cr)
646 {
647 	struct T_bind_req	bind_req;
648 	struct T_bind_ack	*bind_ack;
649 	int			error = 0;
650 	mblk_t			*mp;
651 	void			*addr;
652 	t_uscalar_t		addrlen;
653 	int			unbind_on_err = 1;
654 	boolean_t		clear_acceptconn_on_err = B_FALSE;
655 	boolean_t		restore_backlog_on_err = B_FALSE;
656 	int			save_so_backlog;
657 	t_scalar_t		PRIM_type = O_T_BIND_REQ;
658 	boolean_t		tcp_udp_xport;
659 	void			*nl7c = NULL;
660 	sotpi_info_t		*sti = SOTOTPI(so);
661 
662 	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
663 	    (void *)so, (void *)name, namelen, backlog, flags,
664 	    pr_state(so->so_state, so->so_mode)));
665 
666 	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
667 
668 	if (!(flags & _SOBIND_LOCK_HELD)) {
669 		mutex_enter(&so->so_lock);
670 		so_lock_single(so);	/* Set SOLOCKED */
671 	} else {
672 		ASSERT(MUTEX_HELD(&so->so_lock));
673 		ASSERT(so->so_flag & SOLOCKED);
674 	}
675 
676 	/*
677 	 * Make sure that there is a preallocated unbind_req message
678 	 * before binding. This message is allocated when the socket is
679 	 * created but it might have been consumed.
680 	 */
681 	if (sti->sti_unbind_mp == NULL) {
682 		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
683 		/* NOTE: holding so_lock while sleeping */
684 		sti->sti_unbind_mp =
685 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP);
686 	}
687 
688 	if (flags & _SOBIND_REBIND) {
689 		/*
690 		 * Called from solisten after doing an sotpi_unbind() or
691 		 * potentially without the unbind (latter for AF_INET{,6}).
692 		 */
693 		ASSERT(name == NULL && namelen == 0);
694 
695 		if (so->so_family == AF_UNIX) {
696 			ASSERT(sti->sti_ux_bound_vp);
697 			addr = &sti->sti_ux_laddr;
698 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
699 			dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
700 			    "addr 0x%p, vp %p\n",
701 			    addrlen,
702 			    (void *)((struct so_ux_addr *)addr)->soua_vp,
703 			    (void *)sti->sti_ux_bound_vp));
704 		} else {
705 			addr = sti->sti_laddr_sa;
706 			addrlen = (t_uscalar_t)sti->sti_laddr_len;
707 		}
708 	} else if (flags & _SOBIND_UNSPEC) {
709 		ASSERT(name == NULL && namelen == 0);
710 
711 		/*
712 		 * The caller checked SS_ISBOUND but not necessarily
713 		 * under so_lock
714 		 */
715 		if (so->so_state & SS_ISBOUND) {
716 			/* No error */
717 			goto done;
718 		}
719 
720 		/* Set an initial local address */
721 		switch (so->so_family) {
722 		case AF_UNIX:
723 			/*
724 			 * Use an address with same size as struct sockaddr
725 			 * just like BSD.
726 			 */
727 			sti->sti_laddr_len =
728 			    (socklen_t)sizeof (struct sockaddr);
729 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
730 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
731 			sti->sti_laddr_sa->sa_family = so->so_family;
732 
733 			/*
734 			 * Pass down an address with the implicit bind
735 			 * magic number and the rest all zeros.
736 			 * The transport will return a unique address.
737 			 */
738 			sti->sti_ux_laddr.soua_vp = NULL;
739 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
740 			addr = &sti->sti_ux_laddr;
741 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
742 			break;
743 
744 		case AF_INET:
745 		case AF_INET6:
746 			/*
747 			 * An unspecified bind in TPI has a NULL address.
748 			 * Set the address in sockfs to have the sa_family.
749 			 */
750 			sti->sti_laddr_len = (so->so_family == AF_INET) ?
751 			    (socklen_t)sizeof (sin_t) :
752 			    (socklen_t)sizeof (sin6_t);
753 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
754 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
755 			sti->sti_laddr_sa->sa_family = so->so_family;
756 			addr = NULL;
757 			addrlen = 0;
758 			break;
759 
760 		default:
761 			/*
762 			 * An unspecified bind in TPI has a NULL address.
763 			 * Set the address in sockfs to be zero length.
764 			 *
765 			 * Can not assume there is a sa_family for all
766 			 * protocol families. For example, AF_X25 does not
767 			 * have a family field.
768 			 */
769 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
770 			sti->sti_laddr_len = 0;	/* XXX correct? */
771 			addr = NULL;
772 			addrlen = 0;
773 			break;
774 		}
775 
776 	} else {
777 		if (so->so_state & SS_ISBOUND) {
778 			/*
779 			 * If it is ok to rebind the socket, first unbind
780 			 * with the transport. A rebind to the NULL address
781 			 * is interpreted as an unbind.
782 			 * Note that a bind to NULL in BSD does unbind the
783 			 * socket but it fails with EINVAL.
784 			 * Note that regular sockets set SOV_SOCKBSD i.e.
785 			 * _SOBIND_SOCKBSD gets set here hence no type of
786 			 * socket does currently allow rebinding.
787 			 *
788 			 * If the name is NULL just do an unbind.
789 			 */
790 			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
791 			    name != NULL) {
792 				error = EINVAL;
793 				unbind_on_err = 0;
794 				eprintsoline(so, error);
795 				goto done;
796 			}
797 			if ((so->so_mode & SM_CONNREQUIRED) &&
798 			    (so->so_state & SS_CANTREBIND)) {
799 				error = EINVAL;
800 				unbind_on_err = 0;
801 				eprintsoline(so, error);
802 				goto done;
803 			}
804 			error = sotpi_unbind(so, 0);
805 			if (error) {
806 				eprintsoline(so, error);
807 				goto done;
808 			}
809 			ASSERT(!(so->so_state & SS_ISBOUND));
810 			if (name == NULL) {
811 				so->so_state &=
812 				    ~(SS_ISCONNECTED|SS_ISCONNECTING);
813 				goto done;
814 			}
815 		}
816 
817 		/* X/Open requires this check */
818 		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
819 			if (xnet_check_print) {
820 				printf("sockfs: X/Open bind state check "
821 				    "caused EINVAL\n");
822 			}
823 			error = EINVAL;
824 			goto done;
825 		}
826 
827 		switch (so->so_family) {
828 		case AF_UNIX:
829 			/*
830 			 * All AF_UNIX addresses are nul terminated
831 			 * when copied (copyin_name) in so the minimum
832 			 * length is 3 bytes.
833 			 */
834 			if (name == NULL ||
835 			    (ssize_t)namelen <= sizeof (short) + 1) {
836 				error = EISDIR;
837 				eprintsoline(so, error);
838 				goto done;
839 			}
840 			/*
841 			 * Verify so_family matches the bound family.
842 			 * BSD does not check this for AF_UNIX resulting
843 			 * in funny mknods.
844 			 */
845 			if (name->sa_family != so->so_family) {
846 				error = EAFNOSUPPORT;
847 				goto done;
848 			}
849 			break;
850 		case AF_INET:
851 			if (name == NULL) {
852 				error = EINVAL;
853 				eprintsoline(so, error);
854 				goto done;
855 			}
856 			if ((size_t)namelen != sizeof (sin_t)) {
857 				error = name->sa_family != so->so_family ?
858 				    EAFNOSUPPORT : EINVAL;
859 				eprintsoline(so, error);
860 				goto done;
861 			}
862 			if ((flags & _SOBIND_XPG4_2) &&
863 			    (name->sa_family != so->so_family)) {
864 				/*
865 				 * This check has to be made for X/Open
866 				 * sockets however application failures have
867 				 * been observed when it is applied to
868 				 * all sockets.
869 				 */
870 				error = EAFNOSUPPORT;
871 				eprintsoline(so, error);
872 				goto done;
873 			}
874 			/*
875 			 * Force a zero sa_family to match so_family.
876 			 *
877 			 * Some programs like inetd(1M) don't set the
878 			 * family field. Other programs leave
879 			 * sin_family set to garbage - SunOS 4.X does
880 			 * not check the family field on a bind.
881 			 * We use the family field that
882 			 * was passed in to the socket() call.
883 			 */
884 			name->sa_family = so->so_family;
885 			break;
886 
887 		case AF_INET6: {
888 #ifdef DEBUG
889 			sin6_t *sin6 = (sin6_t *)name;
890 #endif /* DEBUG */
891 
892 			if (name == NULL) {
893 				error = EINVAL;
894 				eprintsoline(so, error);
895 				goto done;
896 			}
897 			if ((size_t)namelen != sizeof (sin6_t)) {
898 				error = name->sa_family != so->so_family ?
899 				    EAFNOSUPPORT : EINVAL;
900 				eprintsoline(so, error);
901 				goto done;
902 			}
903 			if (name->sa_family != so->so_family) {
904 				/*
905 				 * With IPv6 we require the family to match
906 				 * unlike in IPv4.
907 				 */
908 				error = EAFNOSUPPORT;
909 				eprintsoline(so, error);
910 				goto done;
911 			}
912 #ifdef DEBUG
913 			/*
914 			 * Verify that apps don't forget to clear
915 			 * sin6_scope_id etc
916 			 */
917 			if (sin6->sin6_scope_id != 0 &&
918 			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
919 				zcmn_err(getzoneid(), CE_WARN,
920 				    "bind with uninitialized sin6_scope_id "
921 				    "(%d) on socket. Pid = %d\n",
922 				    (int)sin6->sin6_scope_id,
923 				    (int)curproc->p_pid);
924 			}
925 			if (sin6->__sin6_src_id != 0) {
926 				zcmn_err(getzoneid(), CE_WARN,
927 				    "bind with uninitialized __sin6_src_id "
928 				    "(%d) on socket. Pid = %d\n",
929 				    (int)sin6->__sin6_src_id,
930 				    (int)curproc->p_pid);
931 			}
932 #endif /* DEBUG */
933 			break;
934 		}
935 		default:
936 			/*
937 			 * Don't do any length or sa_family check to allow
938 			 * non-sockaddr style addresses.
939 			 */
940 			if (name == NULL) {
941 				error = EINVAL;
942 				eprintsoline(so, error);
943 				goto done;
944 			}
945 			break;
946 		}
947 
948 		if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
949 			error = ENAMETOOLONG;
950 			eprintsoline(so, error);
951 			goto done;
952 		}
953 		/*
954 		 * Save local address.
955 		 */
956 		sti->sti_laddr_len = (socklen_t)namelen;
957 		ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
958 		bcopy(name, sti->sti_laddr_sa, namelen);
959 
960 		addr = sti->sti_laddr_sa;
961 		addrlen = (t_uscalar_t)sti->sti_laddr_len;
962 		switch (so->so_family) {
963 		case AF_INET6:
964 		case AF_INET:
965 			break;
966 		case AF_UNIX: {
967 			struct sockaddr_un *soun =
968 			    (struct sockaddr_un *)sti->sti_laddr_sa;
969 			struct vnode *vp, *rvp;
970 			struct vattr vattr;
971 
972 			ASSERT(sti->sti_ux_bound_vp == NULL);
973 			/*
974 			 * Create vnode for the specified path name.
975 			 * Keep vnode held with a reference in sti_ux_bound_vp.
976 			 * Use the vnode pointer as the address used in the
977 			 * bind with the transport.
978 			 *
979 			 * Use the same mode as in BSD. Note that the mode set below
980 			 * is 0777 with the process umask (u_cmask) masked off.
981 			 */
982 			/* MAXPATHLEN + soun_family + nul termination */
983 			if (sti->sti_laddr_len >
984 			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
985 				error = ENAMETOOLONG;
986 				eprintsoline(so, error);
987 				goto done;
988 			}
989 			vattr.va_type = VSOCK;
990 			vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
991 			vattr.va_mask = AT_TYPE|AT_MODE;
992 			/* NOTE: holding so_lock */
993 			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
994 			    EXCL, 0, &vp, CRMKNOD, 0, 0);
995 			if (error) {
996 				if (error == EEXIST)
997 					error = EADDRINUSE;
998 				eprintsoline(so, error);
999 				goto done;
1000 			}
1001 			/*
1002 			 * Establish pointer from the underlying filesystem
1003 			 * vnode to the socket node.
1004 			 * sti_ux_bound_vp and v_stream->sd_vnode form the
1005 			 * cross-linkage between the underlying filesystem
1006 			 * node and the socket node.
1007 			 */
1008 
1009 			if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
1010 				VN_HOLD(rvp);
1011 				VN_RELE(vp);
1012 				vp = rvp;
1013 			}
1014 
1015 			ASSERT(SOTOV(so)->v_stream);
1016 			mutex_enter(&vp->v_lock);
1017 			vp->v_stream = SOTOV(so)->v_stream;
1018 			sti->sti_ux_bound_vp = vp;
1019 			mutex_exit(&vp->v_lock);
1020 
1021 			/*
1022 			 * Use the vnode pointer value as a unique address
1023 			 * (together with the magic number to avoid conflicts
1024 			 * with implicit binds) in the transport provider.
1025 			 */
1026 			sti->sti_ux_laddr.soua_vp =
1027 			    (void *)sti->sti_ux_bound_vp;
1028 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
1029 			addr = &sti->sti_ux_laddr;
1030 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1031 			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1032 			    addrlen,
1033 			    (void *)((struct so_ux_addr *)addr)->soua_vp));
1034 			break;
1035 		}
1036 		} /* end switch (so->so_family) */
1037 	}
1038 
1039 	/*
1040 	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1041 	 * the transport can start passing up T_CONN_IND messages
1042 	 * as soon as it receives the bind req and strsock_proto()
1043 	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1044 	 */
1045 	if (flags & _SOBIND_LISTEN) {
1046 		if ((so->so_state & SS_ACCEPTCONN) == 0)
1047 			clear_acceptconn_on_err = B_TRUE;
1048 		save_so_backlog = so->so_backlog;
1049 		restore_backlog_on_err = B_TRUE;
1050 		so->so_state |= SS_ACCEPTCONN;
1051 		so->so_backlog = backlog;
1052 	}
1053 
1054 	/*
1055 	 * If NL7C addr(s) have been configured check for addr/port match,
1056 	 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
1057 	 *
1058 	 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
1059 	 * family sockets only. If match mark as such.
1060 	 */
1061 	if (nl7c_enabled && ((addr != NULL &&
1062 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
1063 	    (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
1064 	    sti->sti_nl7c_flags == NL7C_AF_NCA)) {
1065 		/*
1066 		 * NL7C is not supported in non-global zones,
1067 		 * we enforce this restriction here.
1068 		 */
1069 		if (so->so_zoneid == GLOBAL_ZONEID) {
1070 			/* An NL7C socket, mark it */
1071 			sti->sti_nl7c_flags |= NL7C_ENABLED;
1072 			if (nl7c == NULL) {
1073 				/*
1074 				 * Was an AF_NCA bind() so add it to the
1075 				 * addr list for reporting purposes.
1076 				 */
1077 				nl7c = nl7c_add_addr(addr, addrlen);
1078 			}
1079 		} else
1080 			nl7c = NULL;
1081 	}
1082 
1083 	/*
1084 	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1085 	 * for other transports we will send in a O_T_BIND_REQ.
1086 	 */
1087 	if (tcp_udp_xport &&
1088 	    (so->so_family == AF_INET || so->so_family == AF_INET6))
1089 		PRIM_type = T_BIND_REQ;
1090 
1091 	bind_req.PRIM_type = PRIM_type;
1092 	bind_req.ADDR_length = addrlen;
1093 	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1094 	bind_req.CONIND_number = backlog;
1095 	/* NOTE: holding so_lock while sleeping */
1096 	mp = soallocproto2(&bind_req, sizeof (bind_req),
1097 	    addr, addrlen, 0, _ALLOC_SLEEP);
1098 	sti->sti_laddr_valid = 0;
1099 
1100 	/* Done using sti_laddr_sa - can drop the lock */
1101 	mutex_exit(&so->so_lock);
1102 
1103 	/*
1104 	 * Intercept the bind_req message here to check if this <address/port>
1105 	 * was configured as an SSL proxy server, or if another endpoint was
1106 	 * already configured to act as a proxy for us.
1107 	 *
1108 	 * Note, only if NL7C not enabled for this socket.
1109 	 */
1110 	if (nl7c == NULL &&
1111 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
1112 	    so->so_type == SOCK_STREAM) {
1113 
1114 		if (sti->sti_kssl_ent != NULL) {
1115 			kssl_release_ent(sti->sti_kssl_ent, so,
1116 			    sti->sti_kssl_type);
1117 			sti->sti_kssl_ent = NULL;
1118 		}
1119 
1120 		sti->sti_kssl_type = kssl_check_proxy(mp, so,
1121 		    &sti->sti_kssl_ent);
1122 		switch (sti->sti_kssl_type) {
1123 		case KSSL_NO_PROXY:
1124 			break;
1125 
1126 		case KSSL_HAS_PROXY:
1127 			mutex_enter(&so->so_lock);
1128 			goto skip_transport;
1129 
1130 		case KSSL_IS_PROXY:
1131 			break;
1132 		}
1133 	}
1134 
1135 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1136 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1137 	if (error) {
1138 		eprintsoline(so, error);
1139 		mutex_enter(&so->so_lock);
1140 		goto done;
1141 	}
1142 
1143 	mutex_enter(&so->so_lock);
1144 	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1145 	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1146 	if (error) {
1147 		eprintsoline(so, error);
1148 		goto done;
1149 	}
1150 skip_transport:
1151 	ASSERT(mp);
1152 	/*
1153 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1154 	 * strsock_proto while the lock was dropped above, the bind
1155 	 * is allowed to complete.
1156 	 */
1157 
1158 	/* Mark as bound. This will be undone if we detect errors below. */
1159 	if (flags & _SOBIND_NOXLATE) {
1160 		ASSERT(so->so_family == AF_UNIX);
1161 		sti->sti_faddr_noxlate = 1;
1162 	}
1163 	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1164 	so->so_state |= SS_ISBOUND;
1165 	ASSERT(sti->sti_unbind_mp);
1166 
1167 	/* note that we've already set SS_ACCEPTCONN above */
1168 
1169 	/*
1170 	 * Recompute addrlen - an unspecified bind sent down an
1171 	 * address of length zero but we expect the appropriate length
1172 	 * in return.
1173 	 */
1174 	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1175 	    sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1176 
1177 	bind_ack = (struct T_bind_ack *)mp->b_rptr;
1178 	/*
1179 	 * The alignment restriction is really too strict but
1180 	 * we want enough alignment to inspect the fields of
1181 	 * a sockaddr_in.
1182 	 */
1183 	addr = sogetoff(mp, bind_ack->ADDR_offset,
1184 	    bind_ack->ADDR_length,
1185 	    __TPI_ALIGN_SIZE);
1186 	if (addr == NULL) {
1187 		freemsg(mp);
1188 		error = EPROTO;
1189 		eprintsoline(so, error);
1190 		goto done;
1191 	}
1192 	if (!(flags & _SOBIND_UNSPEC)) {
1193 		/*
1194 		 * Verify that the transport didn't return something we
1195 		 * did not want e.g. an address other than what we asked for.
1196 		 *
1197 		 * NOTE: These checks would go away if/when we switch to
1198 		 * using the new TPI (in which the transport would fail
1199 		 * the request instead of assigning a different address).
1200 		 *
1201 		 * NOTE2: For protocols that we don't know (i.e. any
1202 		 * other than AF_INET6, AF_INET and AF_UNIX), we
1203 		 * cannot know if the transport should be expected to
1204 		 * return the same address as that requested.
1205 		 *
1206 		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1207 		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1208 		 *
1209 		 * For example, in the case of netatalk it may be
1210 		 * inappropriate for the transport to return the
1211 		 * requested address (as it may have allocated a local
1212 		 * port number in behaviour similar to that of an
1213 		 * AF_INET bind request with a port number of zero).
1214 		 *
1215 		 * Given the definition of O_T_BIND_REQ, where the
1216 		 * transport may bind to an address other than the
1217 		 * requested address, it's not possible to determine
1218 		 * whether a returned address that differs from the
1219 		 * requested address is a reason to fail (because the
1220 		 * requested address was not available) or succeed
1221 		 * (because the transport allocated an appropriate
1222 		 * address and/or port).
1223 		 *
1224 		 * sockfs currently requires that the transport return
1225 		 * the requested address in the T_BIND_ACK, unless
1226 		 * there is code here to allow for any discrepancy.
1227 		 * Such code exists for AF_INET and AF_INET6.
1228 		 *
1229 		 * Netatalk chooses to return the requested address
1230 		 * rather than the (correct) allocated address.  This
1231 		 * means that netatalk violates the TPI specification
1232 		 * (and would not function correctly if used from a
1233 		 * TLI application), but it does mean that it works
1234 		 * with sockfs.
1235 		 *
1236 		 * As noted above, using the newer XTI bind primitive
1237 		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1238 		 * allow sockfs to be more sure about whether or not
1239 		 * the bind request had succeeded (as transports are
1240 		 * not permitted to bind to a different address than
1241 		 * that requested - they must return failure).
1242 		 * Unfortunately, support for T_BIND_REQ may not be
1243 		 * present in all transport implementations (netatalk,
1244 		 * for example, doesn't have it), making the
1245 		 * transition difficult.
1246 		 */
1247 		if (bind_ack->ADDR_length != addrlen) {
1248 			/* Assumes that the requested address was in use */
1249 			freemsg(mp);
1250 			error = EADDRINUSE;
1251 			eprintsoline(so, error);
1252 			goto done;
1253 		}
1254 
1255 		switch (so->so_family) {
1256 		case AF_INET6:
1257 		case AF_INET: {
1258 			sin_t *rname, *aname;
1259 
1260 			rname = (sin_t *)addr;
1261 			aname = (sin_t *)sti->sti_laddr_sa;
1262 
1263 			/*
1264 			 * Take advantage of the alignment
1265 			 * of sin_port and sin6_port which fall
1266 			 * in the same place in their data structures.
1267 			 * Just use sin_port for either address family.
1268 			 *
1269 			 * This may become a problem if (heaven forbid)
1270 			 * there's a separate ipv6port_reserved... :-P
1271 			 *
1272 			 * Binding to port 0 has the semantics of letting
1273 			 * the transport bind to any port.
1274 			 *
1275 			 * If the transport is TCP or UDP since we had sent
1276 			 * a T_BIND_REQ we would not get a port other than
1277 			 * what we asked for.
1278 			 */
1279 			if (tcp_udp_xport) {
1280 				/*
1281 				 * Pick up the new port number if we bound to
1282 				 * port 0.
1283 				 */
1284 				if (aname->sin_port == 0)
1285 					aname->sin_port = rname->sin_port;
1286 				sti->sti_laddr_valid = 1;
1287 				break;
1288 			}
1289 			if (aname->sin_port != 0 &&
1290 			    aname->sin_port != rname->sin_port) {
1291 				freemsg(mp);
1292 				error = EADDRINUSE;
1293 				eprintsoline(so, error);
1294 				goto done;
1295 			}
1296 			/*
1297 			 * Pick up the new port number if we bound to port 0.
1298 			 */
1299 			aname->sin_port = rname->sin_port;
1300 
1301 			/*
1302 			 * Unfortunately, addresses aren't _quite_ the same.
1303 			 */
1304 			if (so->so_family == AF_INET) {
1305 				if (aname->sin_addr.s_addr !=
1306 				    rname->sin_addr.s_addr) {
1307 					freemsg(mp);
1308 					error = EADDRNOTAVAIL;
1309 					eprintsoline(so, error);
1310 					goto done;
1311 				}
1312 			} else {
1313 				sin6_t *rname6 = (sin6_t *)rname;
1314 				sin6_t *aname6 = (sin6_t *)aname;
1315 
1316 				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1317 				    &rname6->sin6_addr)) {
1318 					freemsg(mp);
1319 					error = EADDRNOTAVAIL;
1320 					eprintsoline(so, error);
1321 					goto done;
1322 				}
1323 			}
1324 			break;
1325 		}
1326 		case AF_UNIX:
1327 			if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1328 				freemsg(mp);
1329 				error = EADDRINUSE;
1330 				eprintsoline(so, error);
1331 				eprintso(so,
1332 				    ("addrlen %d, addr 0x%x, vp %p\n",
1333 				    addrlen, *((int *)addr),
1334 				    (void *)sti->sti_ux_bound_vp));
1335 				goto done;
1336 			}
1337 			sti->sti_laddr_valid = 1;
1338 			break;
1339 		default:
1340 			/*
1341 			 * NOTE: This assumes that addresses can be
1342 			 * byte-compared for equivalence.
1343 			 */
1344 			if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1345 				freemsg(mp);
1346 				error = EADDRINUSE;
1347 				eprintsoline(so, error);
1348 				goto done;
1349 			}
1350 			/*
1351 			 * Don't mark sti_laddr_valid, as we cannot be
1352 			 * sure that the returned address is the real
1353 			 * bound address when talking to an unknown
1354 			 * transport.
1355 			 */
1356 			break;
1357 		}
1358 	} else {
1359 		/*
1360 		 * Save the returned address for getsockname.
1361 		 * Needed for unspecific bind unless transport supports
1362 		 * the TI_GETMYNAME ioctl.
1363 		 * Do this for AF_INET{,6} even though they do, as
1364 		 * caching info here is much better performance than
1365 		 * a TPI/STREAMS trip to the transport for getsockname.
1366 		 * Any which can't for some reason _must_ _not_ set
1367 		 * sti_laddr_valid here for the caching version of
1368 		 * getsockname to not break;
1369 		 */
1370 		switch (so->so_family) {
1371 		case AF_UNIX:
1372 			/*
1373 			 * Record the address bound with the transport
1374 			 * for use by socketpair.
1375 			 */
1376 			bcopy(addr, &sti->sti_ux_laddr, addrlen);
1377 			sti->sti_laddr_valid = 1;
1378 			break;
1379 		case AF_INET:
1380 		case AF_INET6:
1381 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1382 			bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1383 			sti->sti_laddr_valid = 1;
1384 			break;
1385 		default:
1386 			/*
1387 			 * Don't mark sti_laddr_valid, as we cannot be
1388 			 * sure that the returned address is the real
1389 			 * bound address when talking to an unknown
1390 			 * transport.
1391 			 */
1392 			break;
1393 		}
1394 	}
1395 
1396 	if (nl7c != NULL) {
1397 		/* Register listen()er sonode pointer with NL7C */
1398 		nl7c_listener_addr(nl7c, so);
1399 	}
1400 
1401 	freemsg(mp);
1402 
1403 done:
1404 	if (error) {
1405 		/* reset state & backlog to values held on entry */
1406 		if (clear_acceptconn_on_err == B_TRUE)
1407 			so->so_state &= ~SS_ACCEPTCONN;
1408 		if (restore_backlog_on_err == B_TRUE)
1409 			so->so_backlog = save_so_backlog;
1410 
1411 		if (unbind_on_err && so->so_state & SS_ISBOUND) {
1412 			int err;
1413 
1414 			err = sotpi_unbind(so, 0);
1415 			/* LINTED - statement has no consequent: if */
1416 			if (err) {
1417 				eprintsoline(so, error);
1418 			} else {
1419 				ASSERT(!(so->so_state & SS_ISBOUND));
1420 			}
1421 		}
1422 	}
1423 	if (!(flags & _SOBIND_LOCK_HELD)) {
1424 		so_unlock_single(so, SOLOCKED);
1425 		mutex_exit(&so->so_lock);
1426 	} else {
1427 		ASSERT(MUTEX_HELD(&so->so_lock));
1428 		ASSERT(so->so_flag & SOLOCKED);
1429 	}
1430 	return (error);
1431 }
1432 
1433 /* bind the socket */
1434 static int
1435 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1436     int flags, struct cred *cr)
1437 {
1438 	if ((flags & _SOBIND_SOCKETPAIR) == 0)
1439 		return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1440 
1441 	flags &= ~_SOBIND_SOCKETPAIR;
1442 	return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1443 }
1444 
1445 /*
1446  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1447  * address, or when listen needs to unbind and bind.
1448  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1449  * so that a sobind can pick them up.
1450  */
1451 static int
1452 sotpi_unbind(struct sonode *so, int flags)
1453 {
1454 	struct T_unbind_req	unbind_req;
1455 	int			error = 0;
1456 	mblk_t			*mp;
1457 	sotpi_info_t		*sti = SOTOTPI(so);
1458 
1459 	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1460 	    (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1461 
1462 	ASSERT(MUTEX_HELD(&so->so_lock));
1463 	ASSERT(so->so_flag & SOLOCKED);
1464 
1465 	if (!(so->so_state & SS_ISBOUND)) {
1466 		error = EINVAL;
1467 		eprintsoline(so, error);
1468 		goto done;
1469 	}
1470 
1471 	mutex_exit(&so->so_lock);
1472 
1473 	/*
1474 	 * Flush the read and write side (except stream head read queue)
1475 	 * and send down T_UNBIND_REQ.
1476 	 */
1477 	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1478 
1479 	unbind_req.PRIM_type = T_UNBIND_REQ;
1480 	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1481 	    0, _ALLOC_SLEEP);
1482 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1483 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1484 	mutex_enter(&so->so_lock);
1485 	if (error) {
1486 		eprintsoline(so, error);
1487 		goto done;
1488 	}
1489 
1490 	error = sowaitokack(so, T_UNBIND_REQ);
1491 	if (error) {
1492 		eprintsoline(so, error);
1493 		goto done;
1494 	}
1495 
1496 	/*
1497 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1498 	 * strsock_proto while the lock was dropped above, the unbind
1499 	 * is allowed to complete.
1500 	 */
1501 	if (!(flags & _SOUNBIND_REBIND)) {
1502 		/*
1503 		 * Clear out bound address.
1504 		 */
1505 		vnode_t *vp;
1506 
1507 		if ((vp = sti->sti_ux_bound_vp) != NULL) {
1508 
1509 			/* Undo any SSL proxy setup */
1510 			if ((so->so_family == AF_INET ||
1511 			    so->so_family == AF_INET6) &&
1512 			    (so->so_type == SOCK_STREAM) &&
1513 			    (sti->sti_kssl_ent != NULL)) {
1514 				kssl_release_ent(sti->sti_kssl_ent, so,
1515 				    sti->sti_kssl_type);
1516 				sti->sti_kssl_ent = NULL;
1517 				sti->sti_kssl_type = KSSL_NO_PROXY;
1518 			}
1519 			sti->sti_ux_bound_vp = NULL;
1520 			vn_rele_stream(vp);
1521 		}
1522 		/* Clear out address */
1523 		sti->sti_laddr_len = 0;
1524 	}
1525 	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1526 	sti->sti_laddr_valid = 0;
1527 
1528 done:
1529 
1530 	/* If the caller held the lock don't release it here */
1531 	ASSERT(MUTEX_HELD(&so->so_lock));
1532 	ASSERT(so->so_flag & SOLOCKED);
1533 
1534 	return (error);
1535 }
1536 
1537 /*
1538  * listen on the socket.
1539  * For TPI conforming transports this has to first unbind with the transport
1540  * and then bind again using the new backlog.
1541  */
1542 /* ARGSUSED */
1543 int
1544 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1545 {
1546 	int		error = 0;
1547 	sotpi_info_t	*sti = SOTOTPI(so);
1548 
1549 	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1550 	    (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1551 
1552 	if (sti->sti_serv_type == T_CLTS)
1553 		return (EOPNOTSUPP);
1554 
1555 	/*
1556 	 * If the socket is ready to accept connections already, then
1557 	 * return without doing anything.  This avoids a problem where
1558 	 * a second listen() call fails if a connection is pending and
1559 	 * leaves the socket unbound. Only when we are not unbinding
1560 	 * with the transport can we safely increase the backlog.
1561 	 */
1562 	if (so->so_state & SS_ACCEPTCONN &&
1563 	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1564 	    /*CONSTCOND*/
1565 	    !solisten_tpi_tcp))
1566 		return (0);
1567 
1568 	if (so->so_state & SS_ISCONNECTED)
1569 		return (EINVAL);
1570 
1571 	mutex_enter(&so->so_lock);
1572 	so_lock_single(so);	/* Set SOLOCKED */
1573 
1574 	/*
1575 	 * If the listen doesn't change the backlog we do nothing.
1576 	 * This avoids an EPROTO error from the transport.
1577 	 */
1578 	if ((so->so_state & SS_ACCEPTCONN) &&
1579 	    so->so_backlog == backlog)
1580 		goto done;
1581 
1582 	if (!(so->so_state & SS_ISBOUND)) {
1583 		/*
1584 		 * Must have been explicitly bound in the UNIX domain.
1585 		 */
1586 		if (so->so_family == AF_UNIX) {
1587 			error = EINVAL;
1588 			goto done;
1589 		}
1590 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1591 		    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1592 	} else if (backlog > 0) {
1593 		/*
1594 		 * AF_INET{,6} hack to avoid losing the port.
1595 		 * Assumes that all AF_INET{,6} transports can handle a
1596 		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1597 		 * has already bound thus it is possible to avoid the unbind.
1598 		 */
1599 		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1600 		    /*CONSTCOND*/
1601 		    !solisten_tpi_tcp)) {
1602 			error = sotpi_unbind(so, _SOUNBIND_REBIND);
1603 			if (error)
1604 				goto done;
1605 		}
1606 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1607 		    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1608 	} else {
1609 		so->so_state |= SS_ACCEPTCONN;
1610 		so->so_backlog = backlog;
1611 	}
1612 	if (error)
1613 		goto done;
1614 	ASSERT(so->so_state & SS_ACCEPTCONN);
1615 done:
1616 	so_unlock_single(so, SOLOCKED);
1617 	mutex_exit(&so->so_lock);
1618 	return (error);
1619 }
1620 
1621 /*
1622  * Disconnect either a specified seqno or all (-1).
1623  * The former is used on listening sockets only.
1624  *
1625  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1626  * the current use of sodisconnect(seqno == -1) is only for shutdown
1627  * so there is no point (and potentially incorrect) to unbind.
1628  */
1629 static int
1630 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1631 {
1632 	struct T_discon_req	discon_req;
1633 	int			error = 0;
1634 	mblk_t			*mp;
1635 
1636 	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1637 	    (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1638 
1639 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1640 		mutex_enter(&so->so_lock);
1641 		so_lock_single(so);	/* Set SOLOCKED */
1642 	} else {
1643 		ASSERT(MUTEX_HELD(&so->so_lock));
1644 		ASSERT(so->so_flag & SOLOCKED);
1645 	}
1646 
1647 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1648 		error = EINVAL;
1649 		eprintsoline(so, error);
1650 		goto done;
1651 	}
1652 
1653 	mutex_exit(&so->so_lock);
1654 	/*
1655 	 * Flush the write side (unless this is a listener)
1656 	 * and then send down a T_DISCON_REQ.
1657 	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1658 	 * and other messages.)
1659 	 */
1660 	if (!(so->so_state & SS_ACCEPTCONN))
1661 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1662 
1663 	discon_req.PRIM_type = T_DISCON_REQ;
1664 	discon_req.SEQ_number = seqno;
1665 	mp = soallocproto1(&discon_req, sizeof (discon_req),
1666 	    0, _ALLOC_SLEEP);
1667 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1668 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1669 	mutex_enter(&so->so_lock);
1670 	if (error) {
1671 		eprintsoline(so, error);
1672 		goto done;
1673 	}
1674 
1675 	error = sowaitokack(so, T_DISCON_REQ);
1676 	if (error) {
1677 		eprintsoline(so, error);
1678 		goto done;
1679 	}
1680 	/*
1681 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1682 	 * strsock_proto while the lock was dropped above, the disconnect
1683 	 * is allowed to complete. However, it is not possible to
1684 	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1685 	 */
1686 	so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
1687 	SOTOTPI(so)->sti_laddr_valid = 0;
1688 	SOTOTPI(so)->sti_faddr_valid = 0;
1689 done:
1690 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1691 		so_unlock_single(so, SOLOCKED);
1692 		mutex_exit(&so->so_lock);
1693 	} else {
1694 		/* If the caller held the lock don't release it here */
1695 		ASSERT(MUTEX_HELD(&so->so_lock));
1696 		ASSERT(so->so_flag & SOLOCKED);
1697 	}
1698 	return (error);
1699 }
1700 
1701 /* ARGSUSED */
1702 int
1703 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1704     struct sonode **nsop)
1705 {
1706 	struct T_conn_ind	*conn_ind;
1707 	struct T_conn_res	*conn_res;
1708 	int			error = 0;
1709 	mblk_t			*mp, *ctxmp, *ack_mp;
1710 	struct sonode		*nso;
1711 	vnode_t			*nvp;
1712 	void			*src;
1713 	t_uscalar_t		srclen;
1714 	void			*opt;
1715 	t_uscalar_t		optlen;
1716 	t_scalar_t		PRIM_type;
1717 	t_scalar_t		SEQ_number;
1718 	size_t			sinlen;
1719 	sotpi_info_t		*sti = SOTOTPI(so);
1720 	sotpi_info_t		*nsti;
1721 
1722 	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1723 	    (void *)so, fflag, (void *)nsop,
1724 	    pr_state(so->so_state, so->so_mode)));
1725 
1726 	/*
1727 	 * Defer single-threading the accepting socket until
1728 	 * the T_CONN_IND has been received and parsed and the
1729 	 * new sonode has been opened.
1730 	 */
1731 
1732 	/* Check that we are not already connected */
1733 	if ((so->so_state & SS_ACCEPTCONN) == 0)
1734 		goto conn_bad;
1735 again:
1736 	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1737 		goto e_bad;
1738 
1739 	ASSERT(mp != NULL);
1740 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1741 	ctxmp = mp->b_cont;
1742 
1743 	/*
1744 	 * Save SEQ_number for error paths.
1745 	 */
1746 	SEQ_number = conn_ind->SEQ_number;
1747 
1748 	srclen = conn_ind->SRC_length;
1749 	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1750 	if (src == NULL) {
1751 		error = EPROTO;
1752 		freemsg(mp);
1753 		eprintsoline(so, error);
1754 		goto disconnect_unlocked;
1755 	}
1756 	optlen = conn_ind->OPT_length;
1757 	switch (so->so_family) {
1758 	case AF_INET:
1759 	case AF_INET6:
1760 		if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1761 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
1762 			    &opt, conn_ind->OPT_length);
1763 		} else {
1764 			/*
1765 			 * The transport (in this case TCP) hasn't sent up
1766 			 * a pointer to an instance for the accept fast-path.
1767 			 * Disable fast-path completely because the call to
1768 			 * sotpi_create() below would otherwise create an
1769 			 * incomplete TCP instance, which would lead to
1770 			 * problems when sockfs sends a normal T_CONN_RES
1771 			 * message down the new stream.
1772 			 */
1773 			if (sti->sti_direct) {
1774 				int rval;
1775 				/*
1776 				 * For consistency we inform tcp to disable
1777 				 * direct interface on the listener, though
1778 				 * we can certainly live without doing this
1779 				 * because no data will ever travel upstream
1780 				 * on the listening socket.
1781 				 */
1782 				sti->sti_direct = 0;
1783 				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1784 				    0, 0, K_TO_K, CRED(), &rval);
1785 			}
1786 			opt = NULL;
1787 			optlen = 0;
1788 		}
1789 		break;
1790 	case AF_UNIX:
1791 	default:
1792 		if (optlen != 0) {
1793 			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1794 			    __TPI_ALIGN_SIZE);
1795 			if (opt == NULL) {
1796 				error = EPROTO;
1797 				freemsg(mp);
1798 				eprintsoline(so, error);
1799 				goto disconnect_unlocked;
1800 			}
1801 		}
1802 		if (so->so_family == AF_UNIX) {
1803 			if (!sti->sti_faddr_noxlate) {
1804 				src = NULL;
1805 				srclen = 0;
1806 			}
1807 			/* Extract src address from options */
1808 			if (optlen != 0)
1809 				so_getopt_srcaddr(opt, optlen, &src, &srclen);
1810 		}
1811 		break;
1812 	}
1813 
1814 	/*
1815 	 * Create the new socket.
1816 	 */
1817 	nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1818 	if (nso == NULL) {
1819 		ASSERT(error != 0);
1820 		/*
1821 		 * Accept can not fail with ENOBUFS. sotpi_create
1822 		 * sleeps waiting for memory until a signal is caught
1823 		 * so return EINTR.
1824 		 */
1825 		freemsg(mp);
1826 		if (error == ENOBUFS)
1827 			error = EINTR;
1828 		goto e_disc_unl;
1829 	}
1830 	nvp = SOTOV(nso);
1831 	nsti = SOTOTPI(nso);
1832 
1833 	/*
1834 	 * If the transport sent up an SSL connection context, then attach
1835 	 * it to the new socket, and set the (sd_wputdatafunc)() and
1836 	 * (sd_rputdatafunc)() stream head hooks to intercept and process
1837 	 * SSL records.
1838 	 */
1839 	if (ctxmp != NULL) {
1840 		/*
1841 		 * This kssl_ctx_t is already held for us by the transport.
1842 		 * So, we don't need to do a kssl_hold_ctx() here.
1843 		 */
1844 		nsti->sti_kssl_ctx = *((kssl_ctx_t *)ctxmp->b_rptr);
1845 		freemsg(ctxmp);
1846 		mp->b_cont = NULL;
1847 		strsetrwputdatahooks(nvp, strsock_kssl_input,
1848 		    strsock_kssl_output);
1849 
1850 		/* Disable sodirect if any */
1851 		if (nso->so_direct != NULL) {
1852 			mutex_enter(nso->so_direct->sod_lockp);
1853 			SOD_DISABLE(nso->so_direct);
1854 			mutex_exit(nso->so_direct->sod_lockp);
1855 		}
1856 	}
1857 #ifdef DEBUG
1858 	/*
1859 	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1860 	 * it's inherited early to allow debugging of the accept code itself.
1861 	 */
1862 	nso->so_options |= so->so_options & SO_DEBUG;
1863 #endif /* DEBUG */
1864 
1865 	/*
1866 	 * Save the SRC address from the T_CONN_IND
1867 	 * for getpeername to work on AF_UNIX and on transports that do not
1868 	 * support TI_GETPEERNAME.
1869 	 *
1870 	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1871 	 * copyin_name().
1872 	 */
1873 	if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1874 		error = EINVAL;
1875 		freemsg(mp);
1876 		eprintsoline(so, error);
1877 		goto disconnect_vp_unlocked;
1878 	}
1879 	nsti->sti_faddr_len = (socklen_t)srclen;
1880 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1881 	bcopy(src, nsti->sti_faddr_sa, srclen);
1882 	nsti->sti_faddr_valid = 1;
1883 
1884 	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1885 	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1886 		cred_t *cr;
1887 
1888 		if ((cr = DB_CRED(mp)) != NULL) {
1889 			crhold(cr);
1890 			nso->so_peercred = cr;
1891 			nso->so_cpid = DB_CPID(mp);
1892 		}
1893 		freemsg(mp);
1894 
1895 		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1896 		    sizeof (intptr_t), 0, _ALLOC_INTR);
1897 		if (mp == NULL) {
1898 			/*
1899 			 * Accept can not fail with ENOBUFS.
1900 			 * A signal was caught so return EINTR.
1901 			 */
1902 			error = EINTR;
1903 			eprintsoline(so, error);
1904 			goto disconnect_vp_unlocked;
1905 		}
1906 		conn_res = (struct T_conn_res *)mp->b_rptr;
1907 	} else {
1908 		nso->so_peercred = DB_CRED(mp);
1909 		nso->so_cpid = DB_CPID(mp);
1910 		DB_CRED(mp) = NULL;
1911 
1912 		mp->b_rptr = DB_BASE(mp);
1913 		conn_res = (struct T_conn_res *)mp->b_rptr;
1914 		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1915 	}
1916 
1917 	/*
1918 	 * New socket must be bound at least in sockfs and, except for AF_INET,
1919 	 * (or AF_INET6) it also has to be bound in the transport provider.
1920 	 * We set the local address in the sonode from the T_OK_ACK of the
1921 	 * T_CONN_RES. For this reason the address we bind to here isn't
1922 	 * important.
1923 	 */
1924 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1925 	    /*CONSTCOND*/
1926 	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1927 		/*
1928 		 * Optimization for AF_INET{,6} transports
1929 		 * that can handle a T_CONN_RES without being bound.
1930 		 */
1931 		mutex_enter(&nso->so_lock);
1932 		so_automatic_bind(nso);
1933 		mutex_exit(&nso->so_lock);
1934 	} else {
1935 		/* Perform NULL bind with the transport provider. */
1936 		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1937 		    cr)) != 0) {
1938 			ASSERT(error != ENOBUFS);
1939 			freemsg(mp);
1940 			eprintsoline(nso, error);
1941 			goto disconnect_vp_unlocked;
1942 		}
1943 	}
1944 
1945 	/*
1946 	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1947 	 * so that any data arriving on the new socket will cause the
1948 	 * appropriate signals to be delivered for the new socket.
1949 	 *
1950 	 * No other thread (except strsock_proto and strsock_misc)
1951 	 * can access the new socket thus we relax the locking.
1952 	 */
1953 	nso->so_pgrp = so->so_pgrp;
1954 	nso->so_state |= so->so_state & SS_ASYNC;
1955 	nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1956 
1957 	if (nso->so_pgrp != 0) {
1958 		if ((error = so_set_events(nso, nvp, CRED())) != 0) {
1959 			eprintsoline(nso, error);
1960 			error = 0;
1961 			nso->so_pgrp = 0;
1962 		}
1963 	}
1964 
1965 	/*
1966 	 * Make note of the socket level options. TCP and IP level options
1967 	 * are already inherited. We could do all this after accept is
1968 	 * successful but doing it here simplifies code and no harm done
1969 	 * for error case.
1970 	 */
1971 	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1972 	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1973 	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1974 	nso->so_sndbuf = so->so_sndbuf;
1975 	nso->so_rcvbuf = so->so_rcvbuf;
1976 	if (nso->so_options & SO_LINGER)
1977 		nso->so_linger = so->so_linger;
1978 
1979 	/*
1980 	 * Note that the following sti_direct code path should be
1981 	 * removed once we are confident that the direct sockets
1982 	 * do not result in any degradation.
1983 	 */
1984 	if (sti->sti_direct) {
1985 
1986 		ASSERT(opt != NULL);
1987 
1988 		conn_res->OPT_length = optlen;
1989 		conn_res->OPT_offset = MBLKL(mp);
1990 		bcopy(&opt, mp->b_wptr, optlen);
1991 		mp->b_wptr += optlen;
1992 		conn_res->PRIM_type = T_CONN_RES;
1993 		conn_res->ACCEPTOR_id = 0;
1994 		PRIM_type = T_CONN_RES;
1995 
1996 		/* Send down the T_CONN_RES on acceptor STREAM */
1997 		error = kstrputmsg(SOTOV(nso), mp, NULL,
1998 		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1999 		if (error) {
2000 			mutex_enter(&so->so_lock);
2001 			so_lock_single(so);
2002 			eprintsoline(so, error);
2003 			goto disconnect_vp;
2004 		}
2005 		mutex_enter(&nso->so_lock);
2006 		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
2007 		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2008 		if (error) {
2009 			mutex_exit(&nso->so_lock);
2010 			mutex_enter(&so->so_lock);
2011 			so_lock_single(so);
2012 			eprintsoline(so, error);
2013 			goto disconnect_vp;
2014 		}
2015 		if (nso->so_family == AF_INET) {
2016 			sin_t *sin;
2017 
2018 			sin = (sin_t *)(ack_mp->b_rptr +
2019 			    sizeof (struct T_ok_ack));
2020 			bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
2021 			nsti->sti_laddr_len = sizeof (sin_t);
2022 		} else {
2023 			sin6_t *sin6;
2024 
2025 			sin6 = (sin6_t *)(ack_mp->b_rptr +
2026 			    sizeof (struct T_ok_ack));
2027 			bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
2028 			nsti->sti_laddr_len = sizeof (sin6_t);
2029 		}
2030 		freemsg(ack_mp);
2031 
2032 		nso->so_state |= SS_ISCONNECTED;
2033 		nso->so_proto_handle = (sock_lower_handle_t)opt;
2034 		nsti->sti_laddr_valid = 1;
2035 
2036 		if (sti->sti_nl7c_flags & NL7C_ENABLED) {
2037 			/*
2038 			 * A NL7C marked listen()er so the new socket
2039 			 * inherits the listen()er's NL7C state, except
2040 			 * for NL7C_POLLIN.
2041 			 *
2042 			 * Only call NL7C to process the new socket if
2043 			 * the listen socket allows blocking i/o.
2044 			 */
2045 			nsti->sti_nl7c_flags =
2046 			    sti->sti_nl7c_flags & (~NL7C_POLLIN);
2047 			if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
2048 				/*
2049 				 * Nonblocking accept() just make it
2050 				 * persist to defer processing to the
2051 				 * read-side syscall (e.g. read).
2052 				 */
2053 				nsti->sti_nl7c_flags |= NL7C_SOPERSIST;
2054 			} else if (nl7c_process(nso, B_FALSE)) {
2055 				/*
2056 				 * NL7C has completed processing on the
2057 				 * socket, close the socket and back to
2058 				 * the top to await the next T_CONN_IND.
2059 				 */
2060 				mutex_exit(&nso->so_lock);
2061 				(void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
2062 				    CRED(), NULL);
2063 				VN_RELE(nvp);
2064 				goto again;
2065 			}
2066 			/* Pass the new socket out */
2067 		}
2068 
2069 		mutex_exit(&nso->so_lock);
2070 
2071 		/*
2072 		 * It's possible, through the use of autopush for example,
2073 		 * that the acceptor stream may not support sti_direct
2074 		 * semantics. If the new socket does not support sti_direct
2075 		 * we issue a _SIOCSOCKFALLBACK to inform the transport
2076 		 * as we would in the I_PUSH case.
2077 		 */
2078 		if (nsti->sti_direct == 0) {
2079 			int	rval;
2080 
2081 			if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
2082 			    0, 0, K_TO_K, CRED(), &rval)) != 0) {
2083 				mutex_enter(&so->so_lock);
2084 				so_lock_single(so);
2085 				eprintsoline(so, error);
2086 				goto disconnect_vp;
2087 			}
2088 		}
2089 
2090 		/*
2091 		 * Pass out new socket.
2092 		 */
2093 		if (nsop != NULL)
2094 			*nsop = nso;
2095 
2096 		return (0);
2097 	}
2098 
2099 	/*
2100 	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
2101 	 * which don't support the FireEngine accept fast-path. It is also
2102 	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
2103 	 * again. Neither sockfs nor TCP attempt to find out if some other
2104 	 * random module has been inserted in between (in which case we
2105 	 * should follow TLI accept behaviour). We blindly assume the worst
2106 	 * case and revert back to old behaviour i.e. TCP will not send us
2107 	 * any option (eager) and the accept should happen on the listener
2108 	 * queue. Any queued T_conn_ind have already got their options removed
2109 	 * by so_sock2_stream() when "sockmod" was I_POP'd.
2110 	 */
2111 	/*
2112 	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
2113 	 */
2114 	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
2115 #ifdef	_ILP32
2116 		queue_t	*q;
2117 
2118 		/*
2119 		 * Find read queue in driver
2120 		 * Can safely do this since we "own" nso/nvp.
2121 		 */
2122 		q = strvp2wq(nvp)->q_next;
2123 		while (SAMESTR(q))
2124 			q = q->q_next;
2125 		q = RD(q);
2126 		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
2127 #else
2128 		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
2129 #endif	/* _ILP32 */
2130 		conn_res->PRIM_type = O_T_CONN_RES;
2131 		PRIM_type = O_T_CONN_RES;
2132 	} else {
2133 		conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
2134 		conn_res->PRIM_type = T_CONN_RES;
2135 		PRIM_type = T_CONN_RES;
2136 	}
2137 	conn_res->SEQ_number = SEQ_number;
2138 	conn_res->OPT_length = 0;
2139 	conn_res->OPT_offset = 0;
2140 
2141 	mutex_enter(&so->so_lock);
2142 	so_lock_single(so);	/* Set SOLOCKED */
2143 	mutex_exit(&so->so_lock);
2144 
2145 	error = kstrputmsg(SOTOV(so), mp, NULL,
2146 	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2147 	mutex_enter(&so->so_lock);
2148 	if (error) {
2149 		eprintsoline(so, error);
2150 		goto disconnect_vp;
2151 	}
2152 	error = sowaitprim(so, PRIM_type, T_OK_ACK,
2153 	    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2154 	if (error) {
2155 		eprintsoline(so, error);
2156 		goto disconnect_vp;
2157 	}
2158 	/*
2159 	 * If there is a sin/sin6 appended onto the T_OK_ACK use
2160 	 * that to set the local address. If this is not present
2161 	 * then we zero out the address and don't set the
2162 	 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
2163 	 * the pathname from the listening socket.
2164 	 */
2165 	sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
2166 	if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
2167 	    MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2168 		ack_mp->b_rptr += sizeof (struct T_ok_ack);
2169 		bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2170 		nsti->sti_laddr_len = sinlen;
2171 		nsti->sti_laddr_valid = 1;
2172 	} else if (nso->so_family == AF_UNIX) {
2173 		ASSERT(so->so_family == AF_UNIX);
2174 		nsti->sti_laddr_len = sti->sti_laddr_len;
2175 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2176 		bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2177 		    nsti->sti_laddr_len);
2178 		nsti->sti_laddr_valid = 1;
2179 	} else {
2180 		nsti->sti_laddr_len = sti->sti_laddr_len;
2181 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2182 		bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2183 		nsti->sti_laddr_sa->sa_family = nso->so_family;
2184 	}
2185 	freemsg(ack_mp);
2186 
2187 	so_unlock_single(so, SOLOCKED);
2188 	mutex_exit(&so->so_lock);
2189 
2190 	nso->so_state |= SS_ISCONNECTED;
2191 
2192 	/*
2193 	 * Pass out new socket.
2194 	 */
2195 	if (nsop != NULL)
2196 		*nsop = nso;
2197 
2198 	return (0);
2199 
2200 
2201 eproto_disc_unl:
2202 	error = EPROTO;
2203 e_disc_unl:
2204 	eprintsoline(so, error);
2205 	goto disconnect_unlocked;
2206 
2207 pr_disc_vp_unl:
2208 	eprintsoline(so, error);
2209 disconnect_vp_unlocked:
2210 	(void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL);
2211 	VN_RELE(nvp);
2212 disconnect_unlocked:
2213 	(void) sodisconnect(so, SEQ_number, 0);
2214 	return (error);
2215 
2216 pr_disc_vp:
2217 	eprintsoline(so, error);
2218 disconnect_vp:
2219 	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2220 	so_unlock_single(so, SOLOCKED);
2221 	mutex_exit(&so->so_lock);
2222 	(void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL);
2223 	VN_RELE(nvp);
2224 	return (error);
2225 
2226 conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2227 	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2228 	    ? EOPNOTSUPP : EINVAL;
2229 e_bad:
2230 	eprintsoline(so, error);
2231 	return (error);
2232 }
2233 
2234 /*
2235  * connect a socket.
2236  *
2237  * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
2238  * unconnect (by specifying a null address).
2239  */
2240 int
2241 sotpi_connect(struct sonode *so,
2242 	const struct sockaddr *name,
2243 	socklen_t namelen,
2244 	int fflag,
2245 	int flags,
2246 	struct cred *cr)
2247 {
2248 	struct T_conn_req	conn_req;
2249 	int			error = 0;
2250 	mblk_t			*mp;
2251 	void			*src;
2252 	socklen_t		srclen;
2253 	void			*addr;
2254 	socklen_t		addrlen;
2255 	boolean_t		need_unlock;
2256 	sotpi_info_t		*sti = SOTOTPI(so);
2257 
2258 	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2259 	    (void *)so, (void *)name, namelen, fflag, flags,
2260 	    pr_state(so->so_state, so->so_mode)));
2261 
2262 	/*
2263 	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2264 	 * avoid sleeping for memory with SOLOCKED held.
2265 	 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2266 	 * + sizeof (struct T_opthdr).
2267 	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
2268 	 * exceed sti_faddr_maxlen).
2269 	 */
2270 	mp = soallocproto(sizeof (struct T_conn_req) +
2271 	    2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR);
2272 	if (mp == NULL) {
2273 		/*
2274 		 * Connect can not fail with ENOBUFS. A signal was
2275 		 * caught so return EINTR.
2276 		 */
2277 		error = EINTR;
2278 		eprintsoline(so, error);
2279 		return (error);
2280 	}
2281 
2282 	mutex_enter(&so->so_lock);
2283 	/*
2284 	 * Make sure there is a preallocated T_unbind_req message
2285 	 * before any binding. This message is allocated when the
2286 	 * socket is created. Since another thread can consume
2287 	 * so_unbind_mp by the time we return from so_lock_single(),
2288 	 * we should check the availability of so_unbind_mp after
2289 	 * we return from so_lock_single().
2290 	 */
2291 
2292 	so_lock_single(so);	/* Set SOLOCKED */
2293 	need_unlock = B_TRUE;
2294 
2295 	if (sti->sti_unbind_mp == NULL) {
2296 		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2297 		/* NOTE: holding so_lock while sleeping */
2298 		sti->sti_unbind_mp =
2299 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR);
2300 		if (sti->sti_unbind_mp == NULL) {
2301 			error = EINTR;
2302 			goto done;
2303 		}
2304 	}
2305 
2306 	/*
2307 	 * Can't have done a listen before connecting.
2308 	 */
2309 	if (so->so_state & SS_ACCEPTCONN) {
2310 		error = EOPNOTSUPP;
2311 		goto done;
2312 	}
2313 
2314 	/*
2315 	 * Must be bound with the transport
2316 	 */
2317 	if (!(so->so_state & SS_ISBOUND)) {
2318 		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2319 		    /*CONSTCOND*/
2320 		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2321 			/*
2322 			 * Optimization for AF_INET{,6} transports
2323 			 * that can handle a T_CONN_REQ without being bound.
2324 			 */
2325 			so_automatic_bind(so);
2326 		} else {
2327 			error = sotpi_bind(so, NULL, 0,
2328 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2329 			if (error)
2330 				goto done;
2331 		}
2332 		ASSERT(so->so_state & SS_ISBOUND);
2333 		flags |= _SOCONNECT_DID_BIND;
2334 	}
2335 
2336 	/*
2337 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
2338 	 * connect to a null address. This is the portable method to
2339 	 * unconnect a socket.
2340 	 */
2341 	if ((namelen >= sizeof (sa_family_t)) &&
2342 	    (name->sa_family == AF_UNSPEC)) {
2343 		name = NULL;
2344 		namelen = 0;
2345 	}
2346 
2347 	/*
2348 	 * Check that we are not already connected.
2349 	 * A connection-oriented socket cannot be reconnected.
2350 	 * A connected connection-less socket can be
2351 	 * - connected to a different address by a subsequent connect
2352 	 * - "unconnected" by a connect to the NULL address
2353 	 */
2354 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2355 		ASSERT(!(flags & _SOCONNECT_DID_BIND));
2356 		if (so->so_mode & SM_CONNREQUIRED) {
2357 			/* Connection-oriented socket */
2358 			error = so->so_state & SS_ISCONNECTED ?
2359 			    EISCONN : EALREADY;
2360 			goto done;
2361 		}
2362 		/* Connection-less socket */
2363 		if (name == NULL) {
2364 			/*
2365 			 * Remove the connected state and clear SO_DGRAM_ERRIND
2366 			 * since it was set when the socket was connected.
2367 			 * If this is UDP also send down a T_DISCON_REQ.
2368 			 */
2369 			int val;
2370 
2371 			if ((so->so_family == AF_INET ||
2372 			    so->so_family == AF_INET6) &&
2373 			    (so->so_type == SOCK_DGRAM ||
2374 			    so->so_type == SOCK_RAW) &&
2375 			    /*CONSTCOND*/
2376 			    !soconnect_tpi_udp) {
2377 				/* XXX What about implicitly unbinding here? */
2378 				error = sodisconnect(so, -1,
2379 				    _SODISCONNECT_LOCK_HELD);
2380 			} else {
2381 				so->so_state &=
2382 				    ~(SS_ISCONNECTED | SS_ISCONNECTING);
2383 				sti->sti_faddr_valid = 0;
2384 				sti->sti_faddr_len = 0;
2385 			}
2386 
2387 			/* Remove SOLOCKED since setsockopt will grab it */
2388 			so_unlock_single(so, SOLOCKED);
2389 			mutex_exit(&so->so_lock);
2390 
2391 			val = 0;
2392 			(void) sotpi_setsockopt(so, SOL_SOCKET,
2393 			    SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2394 			    cr);
2395 
2396 			mutex_enter(&so->so_lock);
2397 			so_lock_single(so);	/* Set SOLOCKED */
2398 			goto done;
2399 		}
2400 	}
2401 	ASSERT(so->so_state & SS_ISBOUND);
2402 
2403 	if (name == NULL || namelen == 0) {
2404 		error = EINVAL;
2405 		goto done;
2406 	}
2407 	/*
2408 	 * Mark the socket if sti_faddr_sa represents the transport level
2409 	 * address.
2410 	 */
2411 	if (flags & _SOCONNECT_NOXLATE) {
2412 		struct sockaddr_ux	*soaddr_ux;
2413 
2414 		ASSERT(so->so_family == AF_UNIX);
2415 		if (namelen != sizeof (struct sockaddr_ux)) {
2416 			error = EINVAL;
2417 			goto done;
2418 		}
2419 		soaddr_ux = (struct sockaddr_ux *)name;
2420 		name = (struct sockaddr *)&soaddr_ux->sou_addr;
2421 		namelen = sizeof (soaddr_ux->sou_addr);
2422 		sti->sti_faddr_noxlate = 1;
2423 	}
2424 
2425 	/*
2426 	 * Length and family checks.
2427 	 */
2428 	error = so_addr_verify(so, name, namelen);
2429 	if (error)
2430 		goto bad;
2431 
2432 	/*
2433 	 * Save foreign address. Needed for AF_UNIX as well as
2434 	 * transport providers that do not support TI_GETPEERNAME.
2435 	 * Also used for cached foreign address for TCP and UDP.
2436 	 */
2437 	if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2438 		error = EINVAL;
2439 		goto done;
2440 	}
2441 	sti->sti_faddr_len = (socklen_t)namelen;
2442 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2443 	bcopy(name, sti->sti_faddr_sa, namelen);
2444 	sti->sti_faddr_valid = 1;
2445 
2446 	if (so->so_family == AF_UNIX) {
2447 		if (sti->sti_faddr_noxlate) {
2448 			/*
2449 			 * Already have a transport internal address. Do not
2450 			 * pass any (transport internal) source address.
2451 			 */
2452 			addr = sti->sti_faddr_sa;
2453 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2454 			src = NULL;
2455 			srclen = 0;
2456 		} else {
2457 			/*
2458 			 * Pass the sockaddr_un source address as an option
2459 			 * and translate the remote address.
2460 			 * Holding so_lock thus sti_laddr_sa can not change.
2461 			 */
2462 			src = sti->sti_laddr_sa;
2463 			srclen = (t_uscalar_t)sti->sti_laddr_len;
2464 			dprintso(so, 1,
2465 			    ("sotpi_connect UNIX: srclen %d, src %p\n",
2466 			    srclen, src));
2467 			error = so_ux_addr_xlate(so,
2468 			    sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2469 			    (flags & _SOCONNECT_XPG4_2),
2470 			    &addr, &addrlen);
2471 			if (error)
2472 				goto bad;
2473 		}
2474 	} else {
2475 		addr = sti->sti_faddr_sa;
2476 		addrlen = (t_uscalar_t)sti->sti_faddr_len;
2477 		src = NULL;
2478 		srclen = 0;
2479 	}
2480 	/*
2481 	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2482 	 * option which asks the transport provider to send T_UDERR_IND
2483 	 * messages. These T_UDERR_IND messages are used to return connected
2484 	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2485 	 *
2486 	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2487 	 * we send down a T_CONN_REQ. This is needed to let the
2488 	 * transport assign a local address that is consistent with
2489 	 * the remote address. Applications depend on a getsockname()
2490 	 * after a connect() to retrieve the "source" IP address for
2491 	 * the connected socket.  Invalidate the cached local address
2492 	 * to force getsockname() to enquire of the transport.
2493 	 */
2494 	if (!(so->so_mode & SM_CONNREQUIRED)) {
2495 		/*
2496 		 * Datagram socket.
2497 		 */
2498 		int32_t val;
2499 
2500 		so_unlock_single(so, SOLOCKED);
2501 		mutex_exit(&so->so_lock);
2502 
2503 		val = 1;
2504 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2505 		    &val, (t_uscalar_t)sizeof (val), cr);
2506 
2507 		mutex_enter(&so->so_lock);
2508 		so_lock_single(so);	/* Set SOLOCKED */
2509 		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2510 		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2511 		    soconnect_tpi_udp) {
2512 			soisconnected(so);
2513 			goto done;
2514 		}
2515 		/*
2516 		 * Send down T_CONN_REQ etc.
2517 		 * Clear fflag to avoid returning EWOULDBLOCK.
2518 		 */
2519 		fflag = 0;
2520 		ASSERT(so->so_family != AF_UNIX);
2521 		sti->sti_laddr_valid = 0;
2522 	} else if (sti->sti_laddr_len != 0) {
2523 		/*
2524 		 * If the local address or port was "any" then it may be
2525 		 * changed by the transport as a result of the
2526 		 * connect.  Invalidate the cached version if we have one.
2527 		 */
2528 		switch (so->so_family) {
2529 		case AF_INET:
2530 			ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2531 			if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2532 			    INADDR_ANY ||
2533 			    ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2534 				sti->sti_laddr_valid = 0;
2535 			break;
2536 
2537 		case AF_INET6:
2538 			ASSERT(sti->sti_laddr_len ==
2539 			    (socklen_t)sizeof (sin6_t));
2540 			if (IN6_IS_ADDR_UNSPECIFIED(
2541 			    &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2542 			    IN6_IS_ADDR_V4MAPPED_ANY(
2543 			    &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2544 			    ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2545 				sti->sti_laddr_valid = 0;
2546 			break;
2547 
2548 		default:
2549 			break;
2550 		}
2551 	}
2552 
2553 	/*
2554 	 * Check for failure of an earlier call
2555 	 */
2556 	if (so->so_error != 0)
2557 		goto so_bad;
2558 
2559 	/*
2560 	 * Send down T_CONN_REQ. Message was allocated above.
2561 	 */
2562 	conn_req.PRIM_type = T_CONN_REQ;
2563 	conn_req.DEST_length = addrlen;
2564 	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2565 	if (srclen == 0) {
2566 		conn_req.OPT_length = 0;
2567 		conn_req.OPT_offset = 0;
2568 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2569 		soappendmsg(mp, addr, addrlen);
2570 	} else {
2571 		/*
2572 		 * There is a AF_UNIX sockaddr_un to include as a source
2573 		 * address option.
2574 		 */
2575 		struct T_opthdr toh;
2576 
2577 		toh.level = SOL_SOCKET;
2578 		toh.name = SO_SRCADDR;
2579 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2580 		toh.status = 0;
2581 		conn_req.OPT_length =
2582 		    (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2583 		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2584 		    _TPI_ALIGN_TOPT(addrlen));
2585 
2586 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2587 		soappendmsg(mp, addr, addrlen);
2588 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2589 		soappendmsg(mp, &toh, sizeof (toh));
2590 		soappendmsg(mp, src, srclen);
2591 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2592 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2593 	}
2594 	/*
2595 	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2596 	 * in order to have the right state when the T_CONN_CON shows up.
2597 	 */
2598 	soisconnecting(so);
2599 	mutex_exit(&so->so_lock);
2600 
2601 	if (audit_active)
2602 		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2603 
2604 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2605 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2606 	mp = NULL;
2607 	mutex_enter(&so->so_lock);
2608 	if (error != 0)
2609 		goto bad;
2610 
2611 	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2612 		goto bad;
2613 
2614 	/* Allow other threads to access the socket */
2615 	so_unlock_single(so, SOLOCKED);
2616 	need_unlock = B_FALSE;
2617 
2618 	/*
2619 	 * Wait until we get a T_CONN_CON or an error
2620 	 */
2621 	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2622 		so_lock_single(so);	/* Set SOLOCKED */
2623 		need_unlock = B_TRUE;
2624 	}
2625 
2626 done:
2627 	freemsg(mp);
2628 	switch (error) {
2629 	case EINPROGRESS:
2630 	case EALREADY:
2631 	case EISCONN:
2632 	case EINTR:
2633 		/* Non-fatal errors */
2634 		sti->sti_laddr_valid = 0;
2635 		/* FALLTHRU */
2636 	case 0:
2637 		break;
2638 	default:
2639 		ASSERT(need_unlock);
2640 		/*
2641 		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2642 		 * and invalidate local-address cache
2643 		 */
2644 		so->so_state &= ~SS_ISCONNECTING;
2645 		sti->sti_laddr_valid = 0;
2646 		/* A discon_ind might have already unbound us */
2647 		if ((flags & _SOCONNECT_DID_BIND) &&
2648 		    (so->so_state & SS_ISBOUND)) {
2649 			int err;
2650 
2651 			err = sotpi_unbind(so, 0);
2652 			/* LINTED - statement has no conseq */
2653 			if (err) {
2654 				eprintsoline(so, err);
2655 			}
2656 		}
2657 		break;
2658 	}
2659 	if (need_unlock)
2660 		so_unlock_single(so, SOLOCKED);
2661 	mutex_exit(&so->so_lock);
2662 	return (error);
2663 
2664 so_bad:	error = sogeterr(so, B_TRUE);
2665 bad:	eprintsoline(so, error);
2666 	goto done;
2667 }
2668 
2669 /* ARGSUSED */
2670 int
2671 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2672 {
2673 	struct T_ordrel_req	ordrel_req;
2674 	mblk_t			*mp;
2675 	uint_t			old_state, state_change;
2676 	int			error = 0;
2677 	sotpi_info_t		*sti = SOTOTPI(so);
2678 
2679 	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2680 	    (void *)so, how, pr_state(so->so_state, so->so_mode)));
2681 
2682 	mutex_enter(&so->so_lock);
2683 	so_lock_single(so);	/* Set SOLOCKED */
2684 
2685 	/*
2686 	 * SunOS 4.X has no check for datagram sockets.
2687 	 * 5.X checks that it is connected (ENOTCONN)
2688 	 * X/Open requires that we check the connected state.
2689 	 */
2690 	if (!(so->so_state & SS_ISCONNECTED)) {
2691 		if (!xnet_skip_checks) {
2692 			error = ENOTCONN;
2693 			if (xnet_check_print) {
2694 				printf("sockfs: X/Open shutdown check "
2695 				    "caused ENOTCONN\n");
2696 			}
2697 		}
2698 		goto done;
2699 	}
2700 	/*
2701 	 * Record the current state and then perform any state changes.
2702 	 * Then use the difference between the old and new states to
2703 	 * determine which messages need to be sent.
2704 	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2705 	 * duplicate calls to shutdown().
2706 	 */
2707 	old_state = so->so_state;
2708 
2709 	switch (how) {
2710 	case 0:
2711 		socantrcvmore(so);
2712 		break;
2713 	case 1:
2714 		socantsendmore(so);
2715 		break;
2716 	case 2:
2717 		socantsendmore(so);
2718 		socantrcvmore(so);
2719 		break;
2720 	default:
2721 		error = EINVAL;
2722 		goto done;
2723 	}
2724 
2725 	/*
2726 	 * Assumes that the SS_CANT* flags are never cleared in the above code.
2727 	 */
2728 	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2729 	    (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2730 	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2731 
2732 	switch (state_change) {
2733 	case 0:
2734 		dprintso(so, 1,
2735 		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
2736 		    so->so_state));
2737 		goto done;
2738 
2739 	case SS_CANTRCVMORE:
2740 		mutex_exit(&so->so_lock);
2741 		strseteof(SOTOV(so), 1);
2742 		/*
2743 		 * strseteof takes care of read side wakeups,
2744 		 * pollwakeups, and signals.
2745 		 */
2746 		/*
2747 		 * Get the read lock before flushing data to avoid problems
2748 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2749 		 */
2750 		mutex_enter(&so->so_lock);
2751 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2752 		mutex_exit(&so->so_lock);
2753 
2754 		/* Flush read side queue */
2755 		strflushrq(SOTOV(so), FLUSHALL);
2756 
2757 		mutex_enter(&so->so_lock);
2758 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2759 		break;
2760 
2761 	case SS_CANTSENDMORE:
2762 		mutex_exit(&so->so_lock);
2763 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2764 		mutex_enter(&so->so_lock);
2765 		break;
2766 
2767 	case SS_CANTSENDMORE|SS_CANTRCVMORE:
2768 		mutex_exit(&so->so_lock);
2769 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2770 		strseteof(SOTOV(so), 1);
2771 		/*
2772 		 * strseteof takes care of read side wakeups,
2773 		 * pollwakeups, and signals.
2774 		 */
2775 		/*
2776 		 * Get the read lock before flushing data to avoid problems
2777 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2778 		 */
2779 		mutex_enter(&so->so_lock);
2780 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2781 		mutex_exit(&so->so_lock);
2782 
2783 		/* Flush read side queue */
2784 		strflushrq(SOTOV(so), FLUSHALL);
2785 
2786 		mutex_enter(&so->so_lock);
2787 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2788 		break;
2789 	}
2790 
2791 	ASSERT(MUTEX_HELD(&so->so_lock));
2792 
2793 	/*
2794 	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2795 	 * was set due to this call and the new state has both of them set:
2796 	 *	Send the AF_UNIX close indication
2797 	 *	For T_COTS send a discon_ind
2798 	 *
2799 	 * If cantsend was set due to this call:
2800 	 *	For T_COTSORD send an ordrel_ind
2801 	 *
2802 	 * Note that for T_CLTS there is no message sent here.
2803 	 */
2804 	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2805 	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2806 		/*
2807 		 * For SunOS 4.X compatibility we tell the other end
2808 		 * that we are unable to receive at this point.
2809 		 */
2810 		if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2811 			so_unix_close(so);
2812 
2813 		if (sti->sti_serv_type == T_COTS)
2814 			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2815 	}
2816 	if ((state_change & SS_CANTSENDMORE) &&
2817 	    (sti->sti_serv_type == T_COTS_ORD)) {
2818 		/* Send an orderly release */
2819 		ordrel_req.PRIM_type = T_ORDREL_REQ;
2820 
2821 		mutex_exit(&so->so_lock);
2822 		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2823 		    0, _ALLOC_SLEEP);
2824 		/*
2825 		 * Send down the T_ORDREL_REQ even if there is flow control.
2826 		 * This prevents shutdown from blocking.
2827 		 * Note that there is no T_OK_ACK for ordrel_req.
2828 		 */
2829 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2830 		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2831 		mutex_enter(&so->so_lock);
2832 		if (error) {
2833 			eprintsoline(so, error);
2834 			goto done;
2835 		}
2836 	}
2837 
2838 done:
2839 	so_unlock_single(so, SOLOCKED);
2840 	mutex_exit(&so->so_lock);
2841 	return (error);
2842 }
2843 
2844 /*
2845  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2846  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2847  * that we have closed.
2848  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2849  * T_UNITDATA_REQ containing the same option.
2850  *
2851  * For SOCK_DGRAM half-connections (somebody connected to this end
2852  * but this end is not connect) we don't know where to send any
2853  * SO_UNIX_CLOSE.
2854  *
2855  * We have to ignore stream head errors just in case there has been
2856  * a shutdown(output).
2857  * Ignore any flow control to try to get the message more quickly to the peer.
2858  * While locally ignoring flow control solves the problem when there
2859  * is only the loopback transport on the stream it would not provide
2860  * the correct AF_UNIX socket semantics when one or more modules have
2861  * been pushed.
2862  */
2863 void
2864 so_unix_close(struct sonode *so)
2865 {
2866 	int		error;
2867 	struct T_opthdr	toh;
2868 	mblk_t		*mp;
2869 	sotpi_info_t	*sti = SOTOTPI(so);
2870 
2871 	ASSERT(MUTEX_HELD(&so->so_lock));
2872 
2873 	ASSERT(so->so_family == AF_UNIX);
2874 
2875 	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2876 	    (SS_ISCONNECTED|SS_ISBOUND))
2877 		return;
2878 
2879 	dprintso(so, 1, ("so_unix_close(%p) %s\n",
2880 	    (void *)so, pr_state(so->so_state, so->so_mode)));
2881 
2882 	toh.level = SOL_SOCKET;
2883 	toh.name = SO_UNIX_CLOSE;
2884 
2885 	/* zero length + header */
2886 	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2887 	toh.status = 0;
2888 
2889 	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2890 		struct T_optdata_req tdr;
2891 
2892 		tdr.PRIM_type = T_OPTDATA_REQ;
2893 		tdr.DATA_flag = 0;
2894 
2895 		tdr.OPT_length = (t_scalar_t)sizeof (toh);
2896 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2897 
2898 		/* NOTE: holding so_lock while sleeping */
2899 		mp = soallocproto2(&tdr, sizeof (tdr),
2900 		    &toh, sizeof (toh), 0, _ALLOC_SLEEP);
2901 	} else {
2902 		struct T_unitdata_req	tudr;
2903 		void			*addr;
2904 		socklen_t		addrlen;
2905 		void			*src;
2906 		socklen_t		srclen;
2907 		struct T_opthdr		toh2;
2908 		t_scalar_t		size;
2909 
2910 		/* Connecteded DGRAM socket */
2911 
2912 		/*
2913 		 * For AF_UNIX the destination address is translated to
2914 		 * an internal name and the source address is passed as
2915 		 * an option.
2916 		 */
2917 		/*
2918 		 * Length and family checks.
2919 		 */
2920 		error = so_addr_verify(so, sti->sti_faddr_sa,
2921 		    (t_uscalar_t)sti->sti_faddr_len);
2922 		if (error) {
2923 			eprintsoline(so, error);
2924 			return;
2925 		}
2926 		if (sti->sti_faddr_noxlate) {
2927 			/*
2928 			 * Already have a transport internal address. Do not
2929 			 * pass any (transport internal) source address.
2930 			 */
2931 			addr = sti->sti_faddr_sa;
2932 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2933 			src = NULL;
2934 			srclen = 0;
2935 		} else {
2936 			/*
2937 			 * Pass the sockaddr_un source address as an option
2938 			 * and translate the remote address.
2939 			 * Holding so_lock thus sti_laddr_sa can not change.
2940 			 */
2941 			src = sti->sti_laddr_sa;
2942 			srclen = (socklen_t)sti->sti_laddr_len;
2943 			dprintso(so, 1,
2944 			    ("so_ux_close: srclen %d, src %p\n",
2945 			    srclen, src));
2946 			error = so_ux_addr_xlate(so,
2947 			    sti->sti_faddr_sa,
2948 			    (socklen_t)sti->sti_faddr_len, 0,
2949 			    &addr, &addrlen);
2950 			if (error) {
2951 				eprintsoline(so, error);
2952 				return;
2953 			}
2954 		}
2955 		tudr.PRIM_type = T_UNITDATA_REQ;
2956 		tudr.DEST_length = addrlen;
2957 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2958 		if (srclen == 0) {
2959 			tudr.OPT_length = (t_scalar_t)sizeof (toh);
2960 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2961 			    _TPI_ALIGN_TOPT(addrlen));
2962 
2963 			size = tudr.OPT_offset + tudr.OPT_length;
2964 			/* NOTE: holding so_lock while sleeping */
2965 			mp = soallocproto2(&tudr, sizeof (tudr),
2966 			    addr, addrlen, size, _ALLOC_SLEEP);
2967 			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2968 			soappendmsg(mp, &toh, sizeof (toh));
2969 		} else {
2970 			/*
2971 			 * There is an AF_UNIX sockaddr_un to include as a
2972 			 * source address option.
2973 			 */
2974 			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2975 			    _TPI_ALIGN_TOPT(srclen));
2976 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2977 			    _TPI_ALIGN_TOPT(addrlen));
2978 
2979 			toh2.level = SOL_SOCKET;
2980 			toh2.name = SO_SRCADDR;
2981 			toh2.len = (t_uscalar_t)(srclen +
2982 			    sizeof (struct T_opthdr));
2983 			toh2.status = 0;
2984 
2985 			size = tudr.OPT_offset + tudr.OPT_length;
2986 
2987 			/* NOTE: holding so_lock while sleeping */
2988 			mp = soallocproto2(&tudr, sizeof (tudr),
2989 			    addr, addrlen, size, _ALLOC_SLEEP);
2990 			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2991 			soappendmsg(mp, &toh, sizeof (toh));
2992 			soappendmsg(mp, &toh2, sizeof (toh2));
2993 			soappendmsg(mp, src, srclen);
2994 			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2995 		}
2996 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2997 	}
2998 	mutex_exit(&so->so_lock);
2999 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
3000 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
3001 	mutex_enter(&so->so_lock);
3002 }
3003 
3004 /*
3005  * Called by sotpi_recvmsg when reading a non-zero amount of data.
3006  * In addition, the caller typically verifies that there is some
3007  * potential state to clear by checking
3008  *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
3009  * before calling this routine.
3010  * Note that such a check can be made without holding so_lock since
3011  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
3012  * decrements sti_oobsigcnt.
3013  *
3014  * When data is read *after* the point that all pending
3015  * oob data has been consumed the oob indication is cleared.
3016  *
3017  * This logic keeps select/poll returning POLLRDBAND and
3018  * SIOCATMARK returning true until we have read past
3019  * the mark.
3020  */
3021 static void
3022 sorecv_update_oobstate(struct sonode *so)
3023 {
3024 	sotpi_info_t *sti = SOTOTPI(so);
3025 
3026 	mutex_enter(&so->so_lock);
3027 	ASSERT(so_verify_oobstate(so));
3028 	dprintso(so, 1,
3029 	    ("sorecv_update_oobstate: counts %d/%d state %s\n",
3030 	    sti->sti_oobsigcnt,
3031 	    sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
3032 	if (sti->sti_oobsigcnt == 0) {
3033 		/* No more pending oob indications */
3034 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
3035 		freemsg(so->so_oobmsg);
3036 		so->so_oobmsg = NULL;
3037 	}
3038 	ASSERT(so_verify_oobstate(so));
3039 	mutex_exit(&so->so_lock);
3040 }
3041 
3042 /*
3043  * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
3044  */
3045 static int
3046 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
3047 {
3048 	sotpi_info_t *sti = SOTOTPI(so);
3049 	int	error = 0;
3050 	mblk_t *tmp = NULL;
3051 	mblk_t *pmp = NULL;
3052 	mblk_t *nmp = sti->sti_nl7c_rcv_mp;
3053 
3054 	ASSERT(nmp != NULL);
3055 
3056 	while (nmp != NULL && uiop->uio_resid > 0) {
3057 		ssize_t n;
3058 
3059 		if (DB_TYPE(nmp) == M_DATA) {
3060 			/*
3061 			 * We have some data, uiomove up to resid bytes.
3062 			 */
3063 			n = MIN(MBLKL(nmp), uiop->uio_resid);
3064 			if (n > 0)
3065 				error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
3066 			nmp->b_rptr += n;
3067 			if (nmp->b_rptr == nmp->b_wptr) {
3068 				pmp = nmp;
3069 				nmp = nmp->b_cont;
3070 			}
3071 			if (error)
3072 				break;
3073 		} else {
3074 			/*
3075 			 * We only handle data, save for caller to handle.
3076 			 */
3077 			if (pmp != NULL) {
3078 				pmp->b_cont = nmp->b_cont;
3079 			}
3080 			nmp->b_cont = NULL;
3081 			if (*rmp == NULL) {
3082 				*rmp = nmp;
3083 			} else {
3084 				tmp->b_cont = nmp;
3085 			}
3086 			nmp = nmp->b_cont;
3087 			tmp = nmp;
3088 		}
3089 	}
3090 	if (pmp != NULL) {
3091 		/* Free any mblk_t(s) which we have consumed */
3092 		pmp->b_cont = NULL;
3093 		freemsg(sti->sti_nl7c_rcv_mp);
3094 	}
3095 	if ((sti->sti_nl7c_rcv_mp = nmp) == NULL) {
3096 		/* Last mblk_t so return the saved kstrgetmsg() rval/error */
3097 		if (error == 0) {
3098 			rval_t	*p = (rval_t *)&sti->sti_nl7c_rcv_rval;
3099 
3100 			error = p->r_v.r_v2;
3101 			p->r_v.r_v2 = 0;
3102 		}
3103 		rp->r_vals = sti->sti_nl7c_rcv_rval;
3104 		sti->sti_nl7c_rcv_rval = 0;
3105 	} else {
3106 		/* More mblk_t(s) to process so no rval to return */
3107 		rp->r_vals = 0;
3108 	}
3109 	return (error);
3110 }
3111 /*
3112  * Receive the next message on the queue.
3113  * If msg_controllen is non-zero when called the caller is interested in
3114  * any received control info (options).
3115  * If msg_namelen is non-zero when called the caller is interested in
3116  * any received source address.
3117  * The routine returns with msg_control and msg_name pointing to
3118  * kmem_alloc'ed memory which the caller has to free.
3119  */
3120 /* ARGSUSED */
3121 int
3122 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
3123     struct cred *cr)
3124 {
3125 	union T_primitives	*tpr;
3126 	mblk_t			*mp;
3127 	uchar_t			pri;
3128 	int			pflag, opflag;
3129 	void			*control;
3130 	t_uscalar_t		controllen;
3131 	t_uscalar_t		namelen;
3132 	int			so_state = so->so_state; /* Snapshot */
3133 	ssize_t			saved_resid;
3134 	rval_t			rval;
3135 	int			flags;
3136 	clock_t			timout;
3137 	int			error = 0;
3138 	int			reterr = 0;
3139 	struct uio		*suiop = NULL;
3140 	sotpi_info_t		*sti = SOTOTPI(so);
3141 
3142 	flags = msg->msg_flags;
3143 	msg->msg_flags = 0;
3144 
3145 	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
3146 	    (void *)so, (void *)msg, flags,
3147 	    pr_state(so->so_state, so->so_mode), so->so_error));
3148 
3149 	if (so->so_version == SOV_STREAM) {
3150 		so_update_attrs(so, SOACC);
3151 		/* The imaginary "sockmod" has been popped - act as a stream */
3152 		return (strread(SOTOV(so), uiop, cr));
3153 	}
3154 
3155 	/*
3156 	 * If we are not connected because we have never been connected
3157 	 * we return ENOTCONN. If we have been connected (but are no longer
3158 	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
3159 	 * the EOF.
3160 	 *
3161 	 * An alternative would be to post an ENOTCONN error in stream head
3162 	 * (read+write) and clear it when we're connected. However, that error
3163 	 * would cause incorrect poll/select behavior!
3164 	 */
3165 	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
3166 	    (so->so_mode & SM_CONNREQUIRED)) {
3167 		return (ENOTCONN);
3168 	}
3169 
3170 	/*
3171 	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
3172 	 * after checking that the read queue is empty) and returns zero.
3173 	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
3174 	 * is zero.
3175 	 */
3176 
3177 	if (flags & MSG_OOB) {
3178 		/* Check that the transport supports OOB */
3179 		if (!(so->so_mode & SM_EXDATA))
3180 			return (EOPNOTSUPP);
3181 		so_update_attrs(so, SOACC);
3182 		return (sorecvoob(so, msg, uiop, flags,
3183 		    (so->so_options & SO_OOBINLINE)));
3184 	}
3185 
3186 	so_update_attrs(so, SOACC);
3187 
3188 	/*
3189 	 * Set msg_controllen and msg_namelen to zero here to make it
3190 	 * simpler in the cases that no control or name is returned.
3191 	 */
3192 	controllen = msg->msg_controllen;
3193 	namelen = msg->msg_namelen;
3194 	msg->msg_controllen = 0;
3195 	msg->msg_namelen = 0;
3196 
3197 	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
3198 	    namelen, controllen));
3199 
3200 	mutex_enter(&so->so_lock);
3201 	/*
3202 	 * If an NL7C enabled socket and not waiting for write data.
3203 	 */
3204 	if ((sti->sti_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
3205 	    NL7C_ENABLED) {
3206 		if (sti->sti_nl7c_uri) {
3207 			/* Close uri processing for a previous request */
3208 			nl7c_close(so);
3209 		}
3210 		if ((so_state & SS_CANTRCVMORE) &&
3211 		    sti->sti_nl7c_rcv_mp == NULL) {
3212 			/* Nothing to process, EOF */
3213 			mutex_exit(&so->so_lock);
3214 			return (0);
3215 		} else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
3216 			/* Persistent NL7C socket, try to process request */
3217 			boolean_t ret;
3218 
3219 			ret = nl7c_process(so,
3220 			    (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
3221 			rval.r_vals = sti->sti_nl7c_rcv_rval;
3222 			error = rval.r_v.r_v2;
3223 			if (error) {
3224 				/* Error of some sort, return it */
3225 				mutex_exit(&so->so_lock);
3226 				return (error);
3227 			}
3228 			if (sti->sti_nl7c_flags &&
3229 			    ! (sti->sti_nl7c_flags & NL7C_WAITWRITE)) {
3230 				/*
3231 				 * Still an NL7C socket and no data
3232 				 * to pass up to the caller.
3233 				 */
3234 				mutex_exit(&so->so_lock);
3235 				if (ret) {
3236 					/* EOF */
3237 					return (0);
3238 				} else {
3239 					/* Need more data */
3240 					return (EAGAIN);
3241 				}
3242 			}
3243 		} else {
3244 			/*
3245 			 * Not persistent so no further NL7C processing.
3246 			 */
3247 			sti->sti_nl7c_flags = 0;
3248 		}
3249 	}
3250 	/*
3251 	 * Only one reader is allowed at any given time. This is needed
3252 	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
3253 	 *
3254 	 * This is slightly different that BSD behavior in that it fails with
3255 	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3256 	 * is single-threaded using sblock(), which is dropped while waiting
3257 	 * for data to appear. The difference shows up e.g. if one
3258 	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3259 	 * does use nonblocking io and different threads are reading each
3260 	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3261 	 * in this case as long as the read queue doesn't get empty.
3262 	 * In this implementation the thread using nonblocking io can
3263 	 * get an EWOULDBLOCK error due to the blocking thread executing
3264 	 * e.g. in the uiomove in kstrgetmsg.
3265 	 * This difference is not believed to be significant.
3266 	 */
3267 	/* Set SOREADLOCKED */
3268 	error = so_lock_read_intr(so,
3269 	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3270 	mutex_exit(&so->so_lock);
3271 	if (error)
3272 		return (error);
3273 
3274 	/*
3275 	 * Tell kstrgetmsg to not inspect the stream head errors until all
3276 	 * queued data has been consumed.
3277 	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3278 	 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3279 	 *
3280 	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3281 	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3282 	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3283 	 */
3284 	pflag = MSG_ANY | MSG_DELAYERROR;
3285 	if (flags & MSG_PEEK) {
3286 		pflag |= MSG_IPEEK;
3287 		flags &= ~MSG_WAITALL;
3288 	}
3289 	if (so->so_mode & SM_ATOMIC)
3290 		pflag |= MSG_DISCARDTAIL;
3291 
3292 	if (flags & MSG_DONTWAIT)
3293 		timout = 0;
3294 	else
3295 		timout = -1;
3296 	opflag = pflag;
3297 
3298 	suiop = sod_rcv_init(so, flags, &uiop);
3299 retry:
3300 	saved_resid = uiop->uio_resid;
3301 	pri = 0;
3302 	mp = NULL;
3303 	if (sti->sti_nl7c_rcv_mp != NULL) {
3304 		/* Already kstrgetmsg()ed saved mblk(s) from NL7C */
3305 		error = nl7c_sorecv(so, &mp, uiop, &rval);
3306 	} else {
3307 		error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3308 		    timout, &rval);
3309 	}
3310 	if (error != 0) {
3311 		/* kstrgetmsg returns ETIME when timeout expires */
3312 		if (error == ETIME)
3313 			error = EWOULDBLOCK;
3314 		goto out;
3315 	}
3316 	/*
3317 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3318 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
3319 	 */
3320 	ASSERT(!(rval.r_val1 & MORECTL));
3321 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3322 		msg->msg_flags |= MSG_TRUNC;
3323 
3324 	if (mp == NULL) {
3325 		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3326 		/*
3327 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3328 		 * The draft Posix socket spec states that the mark should
3329 		 * not be cleared when peeking. We follow the latter.
3330 		 */
3331 		if ((so->so_state &
3332 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3333 		    (uiop->uio_resid != saved_resid) &&
3334 		    !(flags & MSG_PEEK)) {
3335 			sorecv_update_oobstate(so);
3336 		}
3337 
3338 		mutex_enter(&so->so_lock);
3339 		/* Set MSG_EOR based on MOREDATA */
3340 		if (!(rval.r_val1 & MOREDATA)) {
3341 			if (so->so_state & SS_SAVEDEOR) {
3342 				msg->msg_flags |= MSG_EOR;
3343 				so->so_state &= ~SS_SAVEDEOR;
3344 			}
3345 		}
3346 		/*
3347 		 * If some data was received (i.e. not EOF) and the
3348 		 * read/recv* has not been satisfied wait for some more.
3349 		 */
3350 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3351 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3352 			mutex_exit(&so->so_lock);
3353 			pflag = opflag | MSG_NOMARK;
3354 			goto retry;
3355 		}
3356 		goto out_locked;
3357 	}
3358 
3359 	/* strsock_proto has already verified length and alignment */
3360 	tpr = (union T_primitives *)mp->b_rptr;
3361 	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3362 
3363 	switch (tpr->type) {
3364 	case T_DATA_IND: {
3365 		if ((so->so_state &
3366 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3367 		    (uiop->uio_resid != saved_resid) &&
3368 		    !(flags & MSG_PEEK)) {
3369 			sorecv_update_oobstate(so);
3370 		}
3371 
3372 		/*
3373 		 * Set msg_flags to MSG_EOR based on
3374 		 * MORE_flag and MOREDATA.
3375 		 */
3376 		mutex_enter(&so->so_lock);
3377 		so->so_state &= ~SS_SAVEDEOR;
3378 		if (!(tpr->data_ind.MORE_flag & 1)) {
3379 			if (!(rval.r_val1 & MOREDATA))
3380 				msg->msg_flags |= MSG_EOR;
3381 			else
3382 				so->so_state |= SS_SAVEDEOR;
3383 		}
3384 		freemsg(mp);
3385 		/*
3386 		 * If some data was received (i.e. not EOF) and the
3387 		 * read/recv* has not been satisfied wait for some more.
3388 		 */
3389 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3390 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3391 			mutex_exit(&so->so_lock);
3392 			pflag = opflag | MSG_NOMARK;
3393 			goto retry;
3394 		}
3395 		goto out_locked;
3396 	}
3397 	case T_UNITDATA_IND: {
3398 		void *addr;
3399 		t_uscalar_t addrlen;
3400 		void *abuf;
3401 		t_uscalar_t optlen;
3402 		void *opt;
3403 
3404 		if ((so->so_state &
3405 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3406 		    (uiop->uio_resid != saved_resid) &&
3407 		    !(flags & MSG_PEEK)) {
3408 			sorecv_update_oobstate(so);
3409 		}
3410 
3411 		if (namelen != 0) {
3412 			/* Caller wants source address */
3413 			addrlen = tpr->unitdata_ind.SRC_length;
3414 			addr = sogetoff(mp,
3415 			    tpr->unitdata_ind.SRC_offset,
3416 			    addrlen, 1);
3417 			if (addr == NULL) {
3418 				freemsg(mp);
3419 				error = EPROTO;
3420 				eprintsoline(so, error);
3421 				goto out;
3422 			}
3423 			if (so->so_family == AF_UNIX) {
3424 				/*
3425 				 * Can not use the transport level address.
3426 				 * If there is a SO_SRCADDR option carrying
3427 				 * the socket level address it will be
3428 				 * extracted below.
3429 				 */
3430 				addr = NULL;
3431 				addrlen = 0;
3432 			}
3433 		}
3434 		optlen = tpr->unitdata_ind.OPT_length;
3435 		if (optlen != 0) {
3436 			t_uscalar_t ncontrollen;
3437 
3438 			/*
3439 			 * Extract any source address option.
3440 			 * Determine how large cmsg buffer is needed.
3441 			 */
3442 			opt = sogetoff(mp,
3443 			    tpr->unitdata_ind.OPT_offset,
3444 			    optlen, __TPI_ALIGN_SIZE);
3445 
3446 			if (opt == NULL) {
3447 				freemsg(mp);
3448 				error = EPROTO;
3449 				eprintsoline(so, error);
3450 				goto out;
3451 			}
3452 			if (so->so_family == AF_UNIX)
3453 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3454 			ncontrollen = so_cmsglen(mp, opt, optlen,
3455 			    !(flags & MSG_XPG4_2));
3456 			if (controllen != 0)
3457 				controllen = ncontrollen;
3458 			else if (ncontrollen != 0)
3459 				msg->msg_flags |= MSG_CTRUNC;
3460 		} else {
3461 			controllen = 0;
3462 		}
3463 
3464 		if (namelen != 0) {
3465 			/*
3466 			 * Return address to caller.
3467 			 * Caller handles truncation if length
3468 			 * exceeds msg_namelen.
3469 			 * NOTE: AF_UNIX NUL termination is ensured by
3470 			 * the sender's copyin_name().
3471 			 */
3472 			abuf = kmem_alloc(addrlen, KM_SLEEP);
3473 
3474 			bcopy(addr, abuf, addrlen);
3475 			msg->msg_name = abuf;
3476 			msg->msg_namelen = addrlen;
3477 		}
3478 
3479 		if (controllen != 0) {
3480 			/*
3481 			 * Return control msg to caller.
3482 			 * Caller handles truncation if length
3483 			 * exceeds msg_controllen.
3484 			 */
3485 			control = kmem_zalloc(controllen, KM_SLEEP);
3486 
3487 			error = so_opt2cmsg(mp, opt, optlen,
3488 			    !(flags & MSG_XPG4_2),
3489 			    control, controllen);
3490 			if (error) {
3491 				freemsg(mp);
3492 				if (msg->msg_namelen != 0)
3493 					kmem_free(msg->msg_name,
3494 					    msg->msg_namelen);
3495 				kmem_free(control, controllen);
3496 				eprintsoline(so, error);
3497 				goto out;
3498 			}
3499 			msg->msg_control = control;
3500 			msg->msg_controllen = controllen;
3501 		}
3502 
3503 		freemsg(mp);
3504 		goto out;
3505 	}
3506 	case T_OPTDATA_IND: {
3507 		struct T_optdata_req *tdr;
3508 		void *opt;
3509 		t_uscalar_t optlen;
3510 
3511 		if ((so->so_state &
3512 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3513 		    (uiop->uio_resid != saved_resid) &&
3514 		    !(flags & MSG_PEEK)) {
3515 			sorecv_update_oobstate(so);
3516 		}
3517 
3518 		tdr = (struct T_optdata_req *)mp->b_rptr;
3519 		optlen = tdr->OPT_length;
3520 		if (optlen != 0) {
3521 			t_uscalar_t ncontrollen;
3522 			/*
3523 			 * Determine how large cmsg buffer is needed.
3524 			 */
3525 			opt = sogetoff(mp,
3526 			    tpr->optdata_ind.OPT_offset,
3527 			    optlen, __TPI_ALIGN_SIZE);
3528 
3529 			if (opt == NULL) {
3530 				freemsg(mp);
3531 				error = EPROTO;
3532 				eprintsoline(so, error);
3533 				goto out;
3534 			}
3535 
3536 			ncontrollen = so_cmsglen(mp, opt, optlen,
3537 			    !(flags & MSG_XPG4_2));
3538 			if (controllen != 0)
3539 				controllen = ncontrollen;
3540 			else if (ncontrollen != 0)
3541 				msg->msg_flags |= MSG_CTRUNC;
3542 		} else {
3543 			controllen = 0;
3544 		}
3545 
3546 		if (controllen != 0) {
3547 			/*
3548 			 * Return control msg to caller.
3549 			 * Caller handles truncation if length
3550 			 * exceeds msg_controllen.
3551 			 */
3552 			control = kmem_zalloc(controllen, KM_SLEEP);
3553 
3554 			error = so_opt2cmsg(mp, opt, optlen,
3555 			    !(flags & MSG_XPG4_2),
3556 			    control, controllen);
3557 			if (error) {
3558 				freemsg(mp);
3559 				kmem_free(control, controllen);
3560 				eprintsoline(so, error);
3561 				goto out;
3562 			}
3563 			msg->msg_control = control;
3564 			msg->msg_controllen = controllen;
3565 		}
3566 
3567 		/*
3568 		 * Set msg_flags to MSG_EOR based on
3569 		 * DATA_flag and MOREDATA.
3570 		 */
3571 		mutex_enter(&so->so_lock);
3572 		so->so_state &= ~SS_SAVEDEOR;
3573 		if (!(tpr->data_ind.MORE_flag & 1)) {
3574 			if (!(rval.r_val1 & MOREDATA))
3575 				msg->msg_flags |= MSG_EOR;
3576 			else
3577 				so->so_state |= SS_SAVEDEOR;
3578 		}
3579 		freemsg(mp);
3580 		/*
3581 		 * If some data was received (i.e. not EOF) and the
3582 		 * read/recv* has not been satisfied wait for some more.
3583 		 * Not possible to wait if control info was received.
3584 		 */
3585 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3586 		    controllen == 0 &&
3587 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3588 			mutex_exit(&so->so_lock);
3589 			pflag = opflag | MSG_NOMARK;
3590 			goto retry;
3591 		}
3592 		goto out_locked;
3593 	}
3594 	case T_EXDATA_IND: {
3595 		dprintso(so, 1,
3596 		    ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3597 		    "state %s\n",
3598 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3599 		    saved_resid - uiop->uio_resid,
3600 		    pr_state(so->so_state, so->so_mode)));
3601 		/*
3602 		 * kstrgetmsg handles MSGMARK so there is nothing to
3603 		 * inspect in the T_EXDATA_IND.
3604 		 * strsock_proto makes the stream head queue the T_EXDATA_IND
3605 		 * as a separate message with no M_DATA component. Furthermore,
3606 		 * the stream head does not consolidate M_DATA messages onto
3607 		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3608 		 * remains a message by itself. This is needed since MSGMARK
3609 		 * marks both the whole message as well as the last byte
3610 		 * of the message.
3611 		 */
3612 		freemsg(mp);
3613 		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
3614 		if (flags & MSG_PEEK) {
3615 			/*
3616 			 * Even though we are peeking we consume the
3617 			 * T_EXDATA_IND thereby moving the mark information
3618 			 * to SS_RCVATMARK. Then the oob code below will
3619 			 * retry the peeking kstrgetmsg.
3620 			 * Note that the stream head read queue is
3621 			 * never flushed without holding SOREADLOCKED
3622 			 * thus the T_EXDATA_IND can not disappear
3623 			 * underneath us.
3624 			 */
3625 			dprintso(so, 1,
3626 			    ("sotpi_recvmsg: consume EXDATA_IND "
3627 			    "counts %d/%d state %s\n",
3628 			    sti->sti_oobsigcnt,
3629 			    sti->sti_oobcnt,
3630 			    pr_state(so->so_state, so->so_mode)));
3631 
3632 			pflag = MSG_ANY | MSG_DELAYERROR;
3633 			if (so->so_mode & SM_ATOMIC)
3634 				pflag |= MSG_DISCARDTAIL;
3635 
3636 			pri = 0;
3637 			mp = NULL;
3638 
3639 			error = kstrgetmsg(SOTOV(so), &mp, uiop,
3640 			    &pri, &pflag, (clock_t)-1, &rval);
3641 			ASSERT(uiop->uio_resid == saved_resid);
3642 
3643 			if (error) {
3644 #ifdef SOCK_DEBUG
3645 				if (error != EWOULDBLOCK && error != EINTR) {
3646 					eprintsoline(so, error);
3647 				}
3648 #endif /* SOCK_DEBUG */
3649 				goto out;
3650 			}
3651 			ASSERT(mp);
3652 			tpr = (union T_primitives *)mp->b_rptr;
3653 			ASSERT(tpr->type == T_EXDATA_IND);
3654 			freemsg(mp);
3655 		} /* end "if (flags & MSG_PEEK)" */
3656 
3657 		/*
3658 		 * Decrement the number of queued and pending oob.
3659 		 *
3660 		 * SS_RCVATMARK is cleared when we read past a mark.
3661 		 * SS_HAVEOOBDATA is cleared when we've read past the
3662 		 * last mark.
3663 		 * SS_OOBPEND is cleared if we've read past the last
3664 		 * mark and no (new) SIGURG has been posted.
3665 		 */
3666 		mutex_enter(&so->so_lock);
3667 		ASSERT(so_verify_oobstate(so));
3668 		ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3669 		ASSERT(sti->sti_oobsigcnt > 0);
3670 		sti->sti_oobsigcnt--;
3671 		ASSERT(sti->sti_oobcnt > 0);
3672 		sti->sti_oobcnt--;
3673 		/*
3674 		 * Since the T_EXDATA_IND has been removed from the stream
3675 		 * head, but we have not read data past the mark,
3676 		 * sockfs needs to track that the socket is still at the mark.
3677 		 *
3678 		 * Since no data was received call kstrgetmsg again to wait
3679 		 * for data.
3680 		 */
3681 		so->so_state |= SS_RCVATMARK;
3682 		mutex_exit(&so->so_lock);
3683 		dprintso(so, 1,
3684 		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3685 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3686 		    pr_state(so->so_state, so->so_mode)));
3687 		pflag = opflag;
3688 		goto retry;
3689 	}
3690 	default:
3691 		cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3692 		    (void *)so, tpr->type, (void *)mp);
3693 		ASSERT(0);
3694 		freemsg(mp);
3695 		error = EPROTO;
3696 		eprintsoline(so, error);
3697 		goto out;
3698 	}
3699 	/* NOTREACHED */
3700 out:
3701 	mutex_enter(&so->so_lock);
3702 out_locked:
3703 	if (so->so_direct != NULL) {
3704 		mutex_enter(so->so_direct->sod_lockp);
3705 		reterr = sod_rcv_done(so, suiop, uiop);
3706 		mutex_exit(so->so_direct->sod_lockp);
3707 	}
3708 	if (reterr != 0 && error == 0)
3709 		error = reterr;
3710 	so_unlock_read(so);	/* Clear SOREADLOCKED */
3711 	mutex_exit(&so->so_lock);
3712 	return (error);
3713 }
3714 
3715 /*
3716  * Sending data with options on a datagram socket.
3717  * Assumes caller has verified that SS_ISBOUND etc. are set.
3718  */
3719 static int
3720 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3721     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3722 {
3723 	struct T_unitdata_req	tudr;
3724 	mblk_t			*mp;
3725 	int			error;
3726 	void			*addr;
3727 	socklen_t		addrlen;
3728 	void			*src;
3729 	socklen_t		srclen;
3730 	ssize_t			len;
3731 	int			size;
3732 	struct T_opthdr		toh;
3733 	struct fdbuf		*fdbuf;
3734 	t_uscalar_t		optlen;
3735 	void			*fds;
3736 	int			fdlen;
3737 	sotpi_info_t		*sti = SOTOTPI(so);
3738 
3739 	ASSERT(name && namelen);
3740 	ASSERT(control && controllen);
3741 
3742 	len = uiop->uio_resid;
3743 	if (len > (ssize_t)sti->sti_tidu_size) {
3744 		return (EMSGSIZE);
3745 	}
3746 
3747 	/*
3748 	 * For AF_UNIX the destination address is translated to an internal
3749 	 * name and the source address is passed as an option.
3750 	 * Also, file descriptors are passed as file pointers in an
3751 	 * option.
3752 	 */
3753 
3754 	/*
3755 	 * Length and family checks.
3756 	 */
3757 	error = so_addr_verify(so, name, namelen);
3758 	if (error) {
3759 		eprintsoline(so, error);
3760 		return (error);
3761 	}
3762 	if (so->so_family == AF_UNIX) {
3763 		if (sti->sti_faddr_noxlate) {
3764 			/*
3765 			 * Already have a transport internal address. Do not
3766 			 * pass any (transport internal) source address.
3767 			 */
3768 			addr = name;
3769 			addrlen = namelen;
3770 			src = NULL;
3771 			srclen = 0;
3772 		} else {
3773 			/*
3774 			 * Pass the sockaddr_un source address as an option
3775 			 * and translate the remote address.
3776 			 *
3777 			 * Note that this code does not prevent sti_laddr_sa
3778 			 * from changing while it is being used. Thus
3779 			 * if an unbind+bind occurs concurrently with this
3780 			 * send the peer might see a partially new and a
3781 			 * partially old "from" address.
3782 			 */
3783 			src = sti->sti_laddr_sa;
3784 			srclen = (t_uscalar_t)sti->sti_laddr_len;
3785 			dprintso(so, 1,
3786 			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3787 			    srclen, src));
3788 			error = so_ux_addr_xlate(so, name, namelen,
3789 			    (flags & MSG_XPG4_2),
3790 			    &addr, &addrlen);
3791 			if (error) {
3792 				eprintsoline(so, error);
3793 				return (error);
3794 			}
3795 		}
3796 	} else {
3797 		addr = name;
3798 		addrlen = namelen;
3799 		src = NULL;
3800 		srclen = 0;
3801 	}
3802 	optlen = so_optlen(control, controllen,
3803 	    !(flags & MSG_XPG4_2));
3804 	tudr.PRIM_type = T_UNITDATA_REQ;
3805 	tudr.DEST_length = addrlen;
3806 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3807 	if (srclen != 0)
3808 		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3809 		    _TPI_ALIGN_TOPT(srclen));
3810 	else
3811 		tudr.OPT_length = optlen;
3812 	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3813 	    _TPI_ALIGN_TOPT(addrlen));
3814 
3815 	size = tudr.OPT_offset + tudr.OPT_length;
3816 
3817 	/*
3818 	 * File descriptors only when SM_FDPASSING set.
3819 	 */
3820 	error = so_getfdopt(control, controllen,
3821 	    !(flags & MSG_XPG4_2), &fds, &fdlen);
3822 	if (error)
3823 		return (error);
3824 	if (fdlen != -1) {
3825 		if (!(so->so_mode & SM_FDPASSING))
3826 			return (EOPNOTSUPP);
3827 
3828 		error = fdbuf_create(fds, fdlen, &fdbuf);
3829 		if (error)
3830 			return (error);
3831 		mp = fdbuf_allocmsg(size, fdbuf);
3832 	} else {
3833 		mp = soallocproto(size, _ALLOC_INTR);
3834 		if (mp == NULL) {
3835 			/*
3836 			 * Caught a signal waiting for memory.
3837 			 * Let send* return EINTR.
3838 			 */
3839 			return (EINTR);
3840 		}
3841 	}
3842 	soappendmsg(mp, &tudr, sizeof (tudr));
3843 	soappendmsg(mp, addr, addrlen);
3844 	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3845 
3846 	if (fdlen != -1) {
3847 		ASSERT(fdbuf != NULL);
3848 		toh.level = SOL_SOCKET;
3849 		toh.name = SO_FILEP;
3850 		toh.len = fdbuf->fd_size +
3851 		    (t_uscalar_t)sizeof (struct T_opthdr);
3852 		toh.status = 0;
3853 		soappendmsg(mp, &toh, sizeof (toh));
3854 		soappendmsg(mp, fdbuf, fdbuf->fd_size);
3855 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3856 	}
3857 	if (srclen != 0) {
3858 		/*
3859 		 * There is a AF_UNIX sockaddr_un to include as a source
3860 		 * address option.
3861 		 */
3862 		toh.level = SOL_SOCKET;
3863 		toh.name = SO_SRCADDR;
3864 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3865 		toh.status = 0;
3866 		soappendmsg(mp, &toh, sizeof (toh));
3867 		soappendmsg(mp, src, srclen);
3868 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3869 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3870 	}
3871 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3872 	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3873 	/* At most 3 bytes left in the message */
3874 	ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3875 	ASSERT(MBLKL(mp) <= (ssize_t)size);
3876 
3877 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3878 	if (audit_active)
3879 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3880 
3881 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3882 #ifdef SOCK_DEBUG
3883 	if (error) {
3884 		eprintsoline(so, error);
3885 	}
3886 #endif /* SOCK_DEBUG */
3887 	return (error);
3888 }
3889 
3890 /*
3891  * Sending data with options on a connected stream socket.
3892  * Assumes caller has verified that SS_ISCONNECTED is set.
3893  */
3894 static int
3895 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3896     t_uscalar_t controllen, int flags)
3897 {
3898 	struct T_optdata_req	tdr;
3899 	mblk_t			*mp;
3900 	int			error;
3901 	ssize_t			iosize;
3902 	int			size;
3903 	struct fdbuf		*fdbuf;
3904 	t_uscalar_t		optlen;
3905 	void			*fds;
3906 	int			fdlen;
3907 	struct T_opthdr		toh;
3908 	sotpi_info_t		*sti = SOTOTPI(so);
3909 
3910 	dprintso(so, 1,
3911 	    ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3912 
3913 	/*
3914 	 * Has to be bound and connected. However, since no locks are
3915 	 * held the state could have changed after sotpi_sendmsg checked it
3916 	 * thus it is not possible to ASSERT on the state.
3917 	 */
3918 
3919 	/* Options on connection-oriented only when SM_OPTDATA set. */
3920 	if (!(so->so_mode & SM_OPTDATA))
3921 		return (EOPNOTSUPP);
3922 
3923 	do {
3924 		/*
3925 		 * Set the MORE flag if uio_resid does not fit in this
3926 		 * message or if the caller passed in "more".
3927 		 * Error for transports with zero tidu_size.
3928 		 */
3929 		tdr.PRIM_type = T_OPTDATA_REQ;
3930 		iosize = sti->sti_tidu_size;
3931 		if (iosize <= 0)
3932 			return (EMSGSIZE);
3933 		if (uiop->uio_resid > iosize) {
3934 			tdr.DATA_flag = 1;
3935 		} else {
3936 			if (more)
3937 				tdr.DATA_flag = 1;
3938 			else
3939 				tdr.DATA_flag = 0;
3940 			iosize = uiop->uio_resid;
3941 		}
3942 		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3943 		    tdr.DATA_flag, iosize));
3944 
3945 		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3946 		tdr.OPT_length = optlen;
3947 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3948 
3949 		size = (int)sizeof (tdr) + optlen;
3950 		/*
3951 		 * File descriptors only when SM_FDPASSING set.
3952 		 */
3953 		error = so_getfdopt(control, controllen,
3954 		    !(flags & MSG_XPG4_2), &fds, &fdlen);
3955 		if (error)
3956 			return (error);
3957 		if (fdlen != -1) {
3958 			if (!(so->so_mode & SM_FDPASSING))
3959 				return (EOPNOTSUPP);
3960 
3961 			error = fdbuf_create(fds, fdlen, &fdbuf);
3962 			if (error)
3963 				return (error);
3964 			mp = fdbuf_allocmsg(size, fdbuf);
3965 		} else {
3966 			mp = soallocproto(size, _ALLOC_INTR);
3967 			if (mp == NULL) {
3968 				/*
3969 				 * Caught a signal waiting for memory.
3970 				 * Let send* return EINTR.
3971 				 */
3972 				return (EINTR);
3973 			}
3974 		}
3975 		soappendmsg(mp, &tdr, sizeof (tdr));
3976 
3977 		if (fdlen != -1) {
3978 			ASSERT(fdbuf != NULL);
3979 			toh.level = SOL_SOCKET;
3980 			toh.name = SO_FILEP;
3981 			toh.len = fdbuf->fd_size +
3982 			    (t_uscalar_t)sizeof (struct T_opthdr);
3983 			toh.status = 0;
3984 			soappendmsg(mp, &toh, sizeof (toh));
3985 			soappendmsg(mp, fdbuf, fdbuf->fd_size);
3986 			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3987 		}
3988 		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3989 		/* At most 3 bytes left in the message */
3990 		ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3991 		ASSERT(MBLKL(mp) <= (ssize_t)size);
3992 
3993 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3994 
3995 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3996 		    0, MSG_BAND, 0);
3997 		if (error) {
3998 			eprintsoline(so, error);
3999 			return (error);
4000 		}
4001 		control = NULL;
4002 		if (uiop->uio_resid > 0) {
4003 			/*
4004 			 * Recheck for fatal errors. Fail write even though
4005 			 * some data have been written. This is consistent
4006 			 * with strwrite semantics and BSD sockets semantics.
4007 			 */
4008 			if (so->so_state & SS_CANTSENDMORE) {
4009 				eprintsoline(so, error);
4010 				return (EPIPE);
4011 			}
4012 			if (so->so_error != 0) {
4013 				mutex_enter(&so->so_lock);
4014 				error = sogeterr(so, B_TRUE);
4015 				mutex_exit(&so->so_lock);
4016 				if (error != 0) {
4017 					eprintsoline(so, error);
4018 					return (error);
4019 				}
4020 			}
4021 		}
4022 	} while (uiop->uio_resid > 0);
4023 	return (0);
4024 }
4025 
4026 /*
4027  * Sending data on a datagram socket.
4028  * Assumes caller has verified that SS_ISBOUND etc. are set.
4029  *
4030  * For AF_UNIX the destination address is translated to an internal
4031  * name and the source address is passed as an option.
4032  */
4033 int
4034 sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
4035     struct uio *uiop, int flags)
4036 {
4037 	struct T_unitdata_req	tudr;
4038 	mblk_t			*mp;
4039 	int			error;
4040 	void			*addr;
4041 	socklen_t		addrlen;
4042 	void			*src;
4043 	socklen_t		srclen;
4044 	ssize_t			len;
4045 	sotpi_info_t		*sti = SOTOTPI(so);
4046 
4047 	ASSERT(name != NULL && namelen != 0);
4048 
4049 	len = uiop->uio_resid;
4050 	if (len > sti->sti_tidu_size) {
4051 		error = EMSGSIZE;
4052 		goto done;
4053 	}
4054 
4055 	/* Length and family checks */
4056 	error = so_addr_verify(so, name, namelen);
4057 	if (error != 0)
4058 		goto done;
4059 
4060 	if (sti->sti_direct)
4061 		return (sodgram_direct(so, name, namelen, uiop, flags));
4062 
4063 	if (so->so_family == AF_UNIX) {
4064 		if (sti->sti_faddr_noxlate) {
4065 			/*
4066 			 * Already have a transport internal address. Do not
4067 			 * pass any (transport internal) source address.
4068 			 */
4069 			addr = name;
4070 			addrlen = namelen;
4071 			src = NULL;
4072 			srclen = 0;
4073 		} else {
4074 			/*
4075 			 * Pass the sockaddr_un source address as an option
4076 			 * and translate the remote address.
4077 			 *
4078 			 * Note that this code does not prevent sti_laddr_sa
4079 			 * from changing while it is being used. Thus
4080 			 * if an unbind+bind occurs concurrently with this
4081 			 * send the peer might see a partially new and a
4082 			 * partially old "from" address.
4083 			 */
4084 			src = sti->sti_laddr_sa;
4085 			srclen = (socklen_t)sti->sti_laddr_len;
4086 			dprintso(so, 1,
4087 			    ("sosend_dgram UNIX: srclen %d, src %p\n",
4088 			    srclen, src));
4089 			error = so_ux_addr_xlate(so, name, namelen,
4090 			    (flags & MSG_XPG4_2),
4091 			    &addr, &addrlen);
4092 			if (error) {
4093 				eprintsoline(so, error);
4094 				goto done;
4095 			}
4096 		}
4097 	} else {
4098 		addr = name;
4099 		addrlen = namelen;
4100 		src = NULL;
4101 		srclen = 0;
4102 	}
4103 	tudr.PRIM_type = T_UNITDATA_REQ;
4104 	tudr.DEST_length = addrlen;
4105 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4106 	if (srclen == 0) {
4107 		tudr.OPT_length = 0;
4108 		tudr.OPT_offset = 0;
4109 
4110 		mp = soallocproto2(&tudr, sizeof (tudr),
4111 		    addr, addrlen, 0, _ALLOC_INTR);
4112 		if (mp == NULL) {
4113 			/*
4114 			 * Caught a signal waiting for memory.
4115 			 * Let send* return EINTR.
4116 			 */
4117 			error = EINTR;
4118 			goto done;
4119 		}
4120 	} else {
4121 		/*
4122 		 * There is a AF_UNIX sockaddr_un to include as a source
4123 		 * address option.
4124 		 */
4125 		struct T_opthdr toh;
4126 		ssize_t size;
4127 
4128 		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
4129 		    _TPI_ALIGN_TOPT(srclen));
4130 		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
4131 		    _TPI_ALIGN_TOPT(addrlen));
4132 
4133 		toh.level = SOL_SOCKET;
4134 		toh.name = SO_SRCADDR;
4135 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
4136 		toh.status = 0;
4137 
4138 		size = tudr.OPT_offset + tudr.OPT_length;
4139 		mp = soallocproto2(&tudr, sizeof (tudr),
4140 		    addr, addrlen, size, _ALLOC_INTR);
4141 		if (mp == NULL) {
4142 			/*
4143 			 * Caught a signal waiting for memory.
4144 			 * Let send* return EINTR.
4145 			 */
4146 			error = EINTR;
4147 			goto done;
4148 		}
4149 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
4150 		soappendmsg(mp, &toh, sizeof (toh));
4151 		soappendmsg(mp, src, srclen);
4152 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
4153 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
4154 	}
4155 
4156 	if (audit_active)
4157 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4158 
4159 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4160 done:
4161 #ifdef SOCK_DEBUG
4162 	if (error) {
4163 		eprintsoline(so, error);
4164 	}
4165 #endif /* SOCK_DEBUG */
4166 	return (error);
4167 }
4168 
4169 /*
4170  * Sending data on a connected stream socket.
4171  * Assumes caller has verified that SS_ISCONNECTED is set.
4172  */
4173 int
4174 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
4175     int sflag)
4176 {
4177 	struct T_data_req	tdr;
4178 	mblk_t			*mp;
4179 	int			error;
4180 	ssize_t			iosize;
4181 	sotpi_info_t		*sti = SOTOTPI(so);
4182 
4183 	dprintso(so, 1,
4184 	    ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
4185 	    (void *)so, uiop->uio_resid, prim, sflag));
4186 
4187 	/*
4188 	 * Has to be bound and connected. However, since no locks are
4189 	 * held the state could have changed after sotpi_sendmsg checked it
4190 	 * thus it is not possible to ASSERT on the state.
4191 	 */
4192 
4193 	do {
4194 		/*
4195 		 * Set the MORE flag if uio_resid does not fit in this
4196 		 * message or if the caller passed in "more".
4197 		 * Error for transports with zero tidu_size.
4198 		 */
4199 		tdr.PRIM_type = prim;
4200 		iosize = sti->sti_tidu_size;
4201 		if (iosize <= 0)
4202 			return (EMSGSIZE);
4203 		if (uiop->uio_resid > iosize) {
4204 			tdr.MORE_flag = 1;
4205 		} else {
4206 			if (more)
4207 				tdr.MORE_flag = 1;
4208 			else
4209 				tdr.MORE_flag = 0;
4210 			iosize = uiop->uio_resid;
4211 		}
4212 		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4213 		    prim, tdr.MORE_flag, iosize));
4214 		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR);
4215 		if (mp == NULL) {
4216 			/*
4217 			 * Caught a signal waiting for memory.
4218 			 * Let send* return EINTR.
4219 			 */
4220 			return (EINTR);
4221 		}
4222 
4223 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4224 		    0, sflag | MSG_BAND, 0);
4225 		if (error) {
4226 			eprintsoline(so, error);
4227 			return (error);
4228 		}
4229 		if (uiop->uio_resid > 0) {
4230 			/*
4231 			 * Recheck for fatal errors. Fail write even though
4232 			 * some data have been written. This is consistent
4233 			 * with strwrite semantics and BSD sockets semantics.
4234 			 */
4235 			if (so->so_state & SS_CANTSENDMORE) {
4236 				eprintsoline(so, error);
4237 				return (EPIPE);
4238 			}
4239 			if (so->so_error != 0) {
4240 				mutex_enter(&so->so_lock);
4241 				error = sogeterr(so, B_TRUE);
4242 				mutex_exit(&so->so_lock);
4243 				if (error != 0) {
4244 					eprintsoline(so, error);
4245 					return (error);
4246 				}
4247 			}
4248 		}
4249 	} while (uiop->uio_resid > 0);
4250 	return (0);
4251 }
4252 
4253 /*
4254  * Check the state for errors and call the appropriate send function.
4255  *
4256  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4257  * this function issues a setsockopt to toggle SO_DONTROUTE before and
4258  * after sending the message.
4259  */
4260 static int
4261 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
4262     struct cred *cr)
4263 {
4264 	int		so_state;
4265 	int		so_mode;
4266 	int		error;
4267 	struct sockaddr *name;
4268 	t_uscalar_t	namelen;
4269 	int		dontroute;
4270 	int		flags;
4271 	sotpi_info_t	*sti = SOTOTPI(so);
4272 
4273 	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4274 	    (void *)so, (void *)msg, msg->msg_flags,
4275 	    pr_state(so->so_state, so->so_mode), so->so_error));
4276 
4277 	if (so->so_version == SOV_STREAM) {
4278 		/* The imaginary "sockmod" has been popped - act as a stream */
4279 		so_update_attrs(so, SOMOD);
4280 		return (strwrite(SOTOV(so), uiop, cr));
4281 	}
4282 
4283 	mutex_enter(&so->so_lock);
4284 	so_state = so->so_state;
4285 
4286 	if (so_state & SS_CANTSENDMORE) {
4287 		mutex_exit(&so->so_lock);
4288 		return (EPIPE);
4289 	}
4290 
4291 	if (so->so_error != 0) {
4292 		error = sogeterr(so, B_TRUE);
4293 		if (error != 0) {
4294 			mutex_exit(&so->so_lock);
4295 			return (error);
4296 		}
4297 	}
4298 
4299 	name = (struct sockaddr *)msg->msg_name;
4300 	namelen = msg->msg_namelen;
4301 
4302 	so_mode = so->so_mode;
4303 
4304 	if (name == NULL) {
4305 		if (!(so_state & SS_ISCONNECTED)) {
4306 			mutex_exit(&so->so_lock);
4307 			if (so_mode & SM_CONNREQUIRED)
4308 				return (ENOTCONN);
4309 			else
4310 				return (EDESTADDRREQ);
4311 		}
4312 		if (so_mode & SM_CONNREQUIRED) {
4313 			name = NULL;
4314 			namelen = 0;
4315 		} else {
4316 			/*
4317 			 * Note that this code does not prevent sti_faddr_sa
4318 			 * from changing while it is being used. Thus
4319 			 * if an "unconnect"+connect occurs concurrently with
4320 			 * this send the datagram might be delivered to a
4321 			 * garbaled address.
4322 			 */
4323 			ASSERT(sti->sti_faddr_sa);
4324 			name = sti->sti_faddr_sa;
4325 			namelen = (t_uscalar_t)sti->sti_faddr_len;
4326 		}
4327 	} else {
4328 		if (!(so_state & SS_ISCONNECTED) &&
4329 		    (so_mode & SM_CONNREQUIRED)) {
4330 			/* Required but not connected */
4331 			mutex_exit(&so->so_lock);
4332 			return (ENOTCONN);
4333 		}
4334 		/*
4335 		 * Ignore the address on connection-oriented sockets.
4336 		 * Just like BSD this code does not generate an error for
4337 		 * TCP (a CONNREQUIRED socket) when sending to an address
4338 		 * passed in with sendto/sendmsg. Instead the data is
4339 		 * delivered on the connection as if no address had been
4340 		 * supplied.
4341 		 */
4342 		if ((so_state & SS_ISCONNECTED) &&
4343 		    !(so_mode & SM_CONNREQUIRED)) {
4344 			mutex_exit(&so->so_lock);
4345 			return (EISCONN);
4346 		}
4347 		if (!(so_state & SS_ISBOUND)) {
4348 			so_lock_single(so);	/* Set SOLOCKED */
4349 			error = sotpi_bind(so, NULL, 0,
4350 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4351 			so_unlock_single(so, SOLOCKED);
4352 			if (error) {
4353 				mutex_exit(&so->so_lock);
4354 				eprintsoline(so, error);
4355 				return (error);
4356 			}
4357 		}
4358 		/*
4359 		 * Handle delayed datagram errors. These are only queued
4360 		 * when the application sets SO_DGRAM_ERRIND.
4361 		 * Return the error if we are sending to the address
4362 		 * that was returned in the last T_UDERROR_IND.
4363 		 * If sending to some other address discard the delayed
4364 		 * error indication.
4365 		 */
4366 		if (sti->sti_delayed_error) {
4367 			struct T_uderror_ind	*tudi;
4368 			void			*addr;
4369 			t_uscalar_t		addrlen;
4370 			boolean_t		match = B_FALSE;
4371 
4372 			ASSERT(sti->sti_eaddr_mp);
4373 			error = sti->sti_delayed_error;
4374 			sti->sti_delayed_error = 0;
4375 			tudi =
4376 			    (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4377 			addrlen = tudi->DEST_length;
4378 			addr = sogetoff(sti->sti_eaddr_mp,
4379 			    tudi->DEST_offset, addrlen, 1);
4380 			ASSERT(addr);	/* Checked by strsock_proto */
4381 			switch (so->so_family) {
4382 			case AF_INET: {
4383 				/* Compare just IP address and port */
4384 				sin_t *sin1 = (sin_t *)name;
4385 				sin_t *sin2 = (sin_t *)addr;
4386 
4387 				if (addrlen == sizeof (sin_t) &&
4388 				    namelen == addrlen &&
4389 				    sin1->sin_port == sin2->sin_port &&
4390 				    sin1->sin_addr.s_addr ==
4391 				    sin2->sin_addr.s_addr)
4392 					match = B_TRUE;
4393 				break;
4394 			}
4395 			case AF_INET6: {
4396 				/* Compare just IP address and port. Not flow */
4397 				sin6_t *sin1 = (sin6_t *)name;
4398 				sin6_t *sin2 = (sin6_t *)addr;
4399 
4400 				if (addrlen == sizeof (sin6_t) &&
4401 				    namelen == addrlen &&
4402 				    sin1->sin6_port == sin2->sin6_port &&
4403 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4404 				    &sin2->sin6_addr))
4405 					match = B_TRUE;
4406 				break;
4407 			}
4408 			case AF_UNIX:
4409 			default:
4410 				if (namelen == addrlen &&
4411 				    bcmp(name, addr, namelen) == 0)
4412 					match = B_TRUE;
4413 			}
4414 			if (match) {
4415 				freemsg(sti->sti_eaddr_mp);
4416 				sti->sti_eaddr_mp = NULL;
4417 				mutex_exit(&so->so_lock);
4418 #ifdef DEBUG
4419 				dprintso(so, 0,
4420 				    ("sockfs delayed error %d for %s\n",
4421 				    error,
4422 				    pr_addr(so->so_family, name, namelen)));
4423 #endif /* DEBUG */
4424 				return (error);
4425 			}
4426 			freemsg(sti->sti_eaddr_mp);
4427 			sti->sti_eaddr_mp = NULL;
4428 		}
4429 	}
4430 	mutex_exit(&so->so_lock);
4431 
4432 	flags = msg->msg_flags;
4433 	dontroute = 0;
4434 	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4435 		uint32_t	val;
4436 
4437 		val = 1;
4438 		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4439 		    &val, (t_uscalar_t)sizeof (val), cr);
4440 		if (error)
4441 			return (error);
4442 		dontroute = 1;
4443 	}
4444 
4445 	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4446 		error = EOPNOTSUPP;
4447 		goto done;
4448 	}
4449 	if (msg->msg_controllen != 0) {
4450 		if (!(so_mode & SM_CONNREQUIRED)) {
4451 			so_update_attrs(so, SOMOD);
4452 			error = sosend_dgramcmsg(so, name, namelen, uiop,
4453 			    msg->msg_control, msg->msg_controllen, flags);
4454 		} else {
4455 			if (flags & MSG_OOB) {
4456 				/* Can't generate T_EXDATA_REQ with options */
4457 				error = EOPNOTSUPP;
4458 				goto done;
4459 			}
4460 			so_update_attrs(so, SOMOD);
4461 			error = sosend_svccmsg(so, uiop,
4462 			    !(flags & MSG_EOR),
4463 			    msg->msg_control, msg->msg_controllen,
4464 			    flags);
4465 		}
4466 		goto done;
4467 	}
4468 
4469 	so_update_attrs(so, SOMOD);
4470 	if (!(so_mode & SM_CONNREQUIRED)) {
4471 		/*
4472 		 * If there is no SO_DONTROUTE to turn off return immediately
4473 		 * from send_dgram. This can allow tail-call optimizations.
4474 		 */
4475 		if (!dontroute) {
4476 			return (sosend_dgram(so, name, namelen, uiop, flags));
4477 		}
4478 		error = sosend_dgram(so, name, namelen, uiop, flags);
4479 	} else {
4480 		t_scalar_t prim;
4481 		int sflag;
4482 
4483 		/* Ignore msg_name in the connected state */
4484 		if (flags & MSG_OOB) {
4485 			prim = T_EXDATA_REQ;
4486 			/*
4487 			 * Send down T_EXDATA_REQ even if there is flow
4488 			 * control for data.
4489 			 */
4490 			sflag = MSG_IGNFLOW;
4491 		} else {
4492 			if (so_mode & SM_BYTESTREAM) {
4493 				/* Byte stream transport - use write */
4494 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4495 
4496 				/* Send M_DATA messages */
4497 				if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
4498 				    (error = nl7c_data(so, uiop)) >= 0) {
4499 					/* NL7C consumed the data */
4500 					return (error);
4501 				}
4502 				/*
4503 				 * If there is no SO_DONTROUTE to turn off,
4504 				 * sti_direct is on, and there is no flow
4505 				 * control, we can take the fast path.
4506 				 */
4507 				if (!dontroute && sti->sti_direct != 0 &&
4508 				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4509 					return (sostream_direct(so, uiop,
4510 					    NULL, cr));
4511 				}
4512 				error = strwrite(SOTOV(so), uiop, cr);
4513 				goto done;
4514 			}
4515 			prim = T_DATA_REQ;
4516 			sflag = 0;
4517 		}
4518 		/*
4519 		 * If there is no SO_DONTROUTE to turn off return immediately
4520 		 * from sosend_svc. This can allow tail-call optimizations.
4521 		 */
4522 		if (!dontroute)
4523 			return (sosend_svc(so, uiop, prim,
4524 			    !(flags & MSG_EOR), sflag));
4525 		error = sosend_svc(so, uiop, prim,
4526 		    !(flags & MSG_EOR), sflag);
4527 	}
4528 	ASSERT(dontroute);
4529 done:
4530 	if (dontroute) {
4531 		uint32_t	val;
4532 
4533 		val = 0;
4534 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4535 		    &val, (t_uscalar_t)sizeof (val), cr);
4536 	}
4537 	return (error);
4538 }
4539 
4540 /*
4541  * kstrwritemp() has very similar semantics as that of strwrite().
4542  * The main difference is it obtains mblks from the caller and also
4543  * does not do any copy as done in strwrite() from user buffers to
4544  * kernel buffers.
4545  *
4546  * Currently, this routine is used by sendfile to send data allocated
4547  * within the kernel without any copying. This interface does not use the
4548  * synchronous stream interface as synch. stream interface implies
4549  * copying.
4550  */
4551 int
4552 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4553 {
4554 	struct stdata *stp;
4555 	struct queue *wqp;
4556 	mblk_t *newmp;
4557 	char waitflag;
4558 	int tempmode;
4559 	int error = 0;
4560 	int done = 0;
4561 	struct sonode *so;
4562 	boolean_t direct;
4563 
4564 	ASSERT(vp->v_stream);
4565 	stp = vp->v_stream;
4566 
4567 	so = VTOSO(vp);
4568 	direct = _SOTOTPI(so)->sti_direct;
4569 
4570 	/*
4571 	 * This is the sockfs direct fast path. canputnext() need
4572 	 * not be accurate so we don't grab the sd_lock here. If
4573 	 * we get flow-controlled, we grab sd_lock just before the
4574 	 * do..while loop below to emulate what strwrite() does.
4575 	 */
4576 	wqp = stp->sd_wrq;
4577 	if (canputnext(wqp) && direct &&
4578 	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4579 		return (sostream_direct(so, NULL, mp, CRED()));
4580 	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4581 		/* Fast check of flags before acquiring the lock */
4582 		mutex_enter(&stp->sd_lock);
4583 		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4584 		mutex_exit(&stp->sd_lock);
4585 		if (error != 0) {
4586 			if (!(stp->sd_flag & STPLEX) &&
4587 			    (stp->sd_wput_opt & SW_SIGPIPE)) {
4588 				error = EPIPE;
4589 			}
4590 			return (error);
4591 		}
4592 	}
4593 
4594 	waitflag = WRITEWAIT;
4595 	if (stp->sd_flag & OLDNDELAY)
4596 		tempmode = fmode & ~FNDELAY;
4597 	else
4598 		tempmode = fmode;
4599 
4600 	mutex_enter(&stp->sd_lock);
4601 	do {
4602 		if (canputnext(wqp)) {
4603 			mutex_exit(&stp->sd_lock);
4604 			if (stp->sd_wputdatafunc != NULL) {
4605 				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4606 				    NULL, NULL, NULL);
4607 				if (newmp == NULL) {
4608 					/* The caller will free mp */
4609 					return (ECOMM);
4610 				}
4611 				mp = newmp;
4612 			}
4613 			putnext(wqp, mp);
4614 			return (0);
4615 		}
4616 		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4617 		    &done);
4618 	} while (error == 0 && !done);
4619 
4620 	mutex_exit(&stp->sd_lock);
4621 	/*
4622 	 * EAGAIN tells the application to try again. ENOMEM
4623 	 * is returned only if the memory allocation size
4624 	 * exceeds the physical limits of the system. ENOMEM
4625 	 * can't be true here.
4626 	 */
4627 	if (error == ENOMEM)
4628 		error = EAGAIN;
4629 	return (error);
4630 }
4631 
4632 /* ARGSUSED */
4633 static int
4634 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4635     struct cred *cr, mblk_t **mpp)
4636 {
4637 	int error;
4638 
4639 	if (so->so_family != AF_INET && so->so_family != AF_INET6)
4640 		return (EAFNOSUPPORT);
4641 
4642 	if (so->so_state & SS_CANTSENDMORE)
4643 		return (EPIPE);
4644 
4645 	if (so->so_type != SOCK_STREAM)
4646 		return (EOPNOTSUPP);
4647 
4648 	if ((so->so_state & SS_ISCONNECTED) == 0)
4649 		return (ENOTCONN);
4650 
4651 	error = kstrwritemp(so->so_vnode, *mpp, fflag);
4652 	if (error == 0)
4653 		*mpp = NULL;
4654 	return (error);
4655 }
4656 
4657 /*
4658  * Sending data on a datagram socket.
4659  * Assumes caller has verified that SS_ISBOUND etc. are set.
4660  */
4661 /* ARGSUSED */
4662 static int
4663 sodgram_direct(struct sonode *so, struct sockaddr *name,
4664     socklen_t namelen, struct uio *uiop, int flags)
4665 {
4666 	struct T_unitdata_req	tudr;
4667 	mblk_t			*mp = NULL;
4668 	int			error = 0;
4669 	void			*addr;
4670 	socklen_t		addrlen;
4671 	ssize_t			len;
4672 	struct stdata		*stp = SOTOV(so)->v_stream;
4673 	int			so_state;
4674 	queue_t			*udp_wq;
4675 	boolean_t		connected;
4676 	mblk_t			*mpdata = NULL;
4677 	sotpi_info_t		*sti = SOTOTPI(so);
4678 
4679 	ASSERT(name != NULL && namelen != 0);
4680 	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4681 	ASSERT(!(so->so_mode & SM_EXDATA));
4682 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4683 	ASSERT(SOTOV(so)->v_type == VSOCK);
4684 
4685 	/* Caller checked for proper length */
4686 	len = uiop->uio_resid;
4687 	ASSERT(len <= sti->sti_tidu_size);
4688 
4689 	/* Length and family checks have been done by caller */
4690 	ASSERT(name->sa_family == so->so_family);
4691 	ASSERT(so->so_family == AF_INET ||
4692 	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4693 	ASSERT(so->so_family == AF_INET6 ||
4694 	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4695 
4696 	addr = name;
4697 	addrlen = namelen;
4698 
4699 	if (stp->sd_sidp != NULL &&
4700 	    (error = straccess(stp, JCWRITE)) != 0)
4701 		goto done;
4702 
4703 	so_state = so->so_state;
4704 
4705 	connected = so_state & SS_ISCONNECTED;
4706 	if (!connected) {
4707 		tudr.PRIM_type = T_UNITDATA_REQ;
4708 		tudr.DEST_length = addrlen;
4709 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4710 		tudr.OPT_length = 0;
4711 		tudr.OPT_offset = 0;
4712 
4713 		mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4714 		    _ALLOC_INTR);
4715 		if (mp == NULL) {
4716 			/*
4717 			 * Caught a signal waiting for memory.
4718 			 * Let send* return EINTR.
4719 			 */
4720 			error = EINTR;
4721 			goto done;
4722 		}
4723 	}
4724 
4725 	/*
4726 	 * For UDP we don't break up the copyin into smaller pieces
4727 	 * as in the TCP case.  That means if ENOMEM is returned by
4728 	 * mcopyinuio() then the uio vector has not been modified at
4729 	 * all and we fallback to either strwrite() or kstrputmsg()
4730 	 * below.  Note also that we never generate priority messages
4731 	 * from here.
4732 	 */
4733 	udp_wq = stp->sd_wrq->q_next;
4734 	if (canput(udp_wq) &&
4735 	    (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4736 		ASSERT(DB_TYPE(mpdata) == M_DATA);
4737 		ASSERT(uiop->uio_resid == 0);
4738 		if (!connected)
4739 			linkb(mp, mpdata);
4740 		else
4741 			mp = mpdata;
4742 		if (audit_active)
4743 			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4744 
4745 		udp_wput(udp_wq, mp);
4746 		return (0);
4747 	}
4748 
4749 	ASSERT(mpdata == NULL);
4750 	if (error != 0 && error != ENOMEM) {
4751 		freemsg(mp);
4752 		return (error);
4753 	}
4754 
4755 	/*
4756 	 * For connected, let strwrite() handle the blocking case.
4757 	 * Otherwise we fall thru and use kstrputmsg().
4758 	 */
4759 	if (connected)
4760 		return (strwrite(SOTOV(so), uiop, CRED()));
4761 
4762 	if (audit_active)
4763 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4764 
4765 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4766 done:
4767 #ifdef SOCK_DEBUG
4768 	if (error != 0) {
4769 		eprintsoline(so, error);
4770 	}
4771 #endif /* SOCK_DEBUG */
4772 	return (error);
4773 }
4774 
4775 int
4776 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4777 {
4778 	struct stdata *stp = SOTOV(so)->v_stream;
4779 	ssize_t iosize, rmax, maxblk;
4780 	queue_t *tcp_wq = stp->sd_wrq->q_next;
4781 	mblk_t *newmp;
4782 	int error = 0, wflag = 0;
4783 
4784 	ASSERT(so->so_mode & SM_BYTESTREAM);
4785 	ASSERT(SOTOV(so)->v_type == VSOCK);
4786 
4787 	if (stp->sd_sidp != NULL &&
4788 	    (error = straccess(stp, JCWRITE)) != 0)
4789 		return (error);
4790 
4791 	if (uiop == NULL) {
4792 		/*
4793 		 * kstrwritemp() should have checked sd_flag and
4794 		 * flow-control before coming here.  If we end up
4795 		 * here it means that we can simply pass down the
4796 		 * data to tcp.
4797 		 */
4798 		ASSERT(mp != NULL);
4799 		if (stp->sd_wputdatafunc != NULL) {
4800 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4801 			    NULL, NULL, NULL);
4802 			if (newmp == NULL) {
4803 				/* The caller will free mp */
4804 				return (ECOMM);
4805 			}
4806 			mp = newmp;
4807 		}
4808 		tcp_wput(tcp_wq, mp);
4809 		return (0);
4810 	}
4811 
4812 	/* Fallback to strwrite() to do proper error handling */
4813 	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4814 		return (strwrite(SOTOV(so), uiop, cr));
4815 
4816 	rmax = stp->sd_qn_maxpsz;
4817 	ASSERT(rmax >= 0 || rmax == INFPSZ);
4818 	if (rmax == 0 || uiop->uio_resid <= 0)
4819 		return (0);
4820 
4821 	if (rmax == INFPSZ)
4822 		rmax = uiop->uio_resid;
4823 
4824 	maxblk = stp->sd_maxblk;
4825 
4826 	for (;;) {
4827 		iosize = MIN(uiop->uio_resid, rmax);
4828 
4829 		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4830 		if (mp == NULL) {
4831 			/*
4832 			 * Fallback to strwrite() for ENOMEM; if this
4833 			 * is our first time in this routine and the uio
4834 			 * vector has not been modified, we will end up
4835 			 * calling strwrite() without any flag set.
4836 			 */
4837 			if (error == ENOMEM)
4838 				goto slow_send;
4839 			else
4840 				return (error);
4841 		}
4842 		ASSERT(uiop->uio_resid >= 0);
4843 		/*
4844 		 * If mp is non-NULL and ENOMEM is set, it means that
4845 		 * mcopyinuio() was able to break down some of the user
4846 		 * data into one or more mblks.  Send the partial data
4847 		 * to tcp and let the rest be handled in strwrite().
4848 		 */
4849 		ASSERT(error == 0 || error == ENOMEM);
4850 		if (stp->sd_wputdatafunc != NULL) {
4851 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4852 			    NULL, NULL, NULL);
4853 			if (newmp == NULL) {
4854 				/* The caller will free mp */
4855 				return (ECOMM);
4856 			}
4857 			mp = newmp;
4858 		}
4859 		tcp_wput(tcp_wq, mp);
4860 
4861 		wflag |= NOINTR;
4862 
4863 		if (uiop->uio_resid == 0) {	/* No more data; we're done */
4864 			ASSERT(error == 0);
4865 			break;
4866 		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4867 		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4868 slow_send:
4869 			/*
4870 			 * We were able to send down partial data using
4871 			 * the direct call interface, but are now relying
4872 			 * on strwrite() to handle the non-fastpath cases.
4873 			 * If the socket is blocking we will sleep in
4874 			 * strwaitq() until write is permitted, otherwise,
4875 			 * we will need to return the amount of bytes
4876 			 * written so far back to the app.  This is the
4877 			 * reason why we pass NOINTR flag to strwrite()
4878 			 * for non-blocking socket, because we don't want
4879 			 * to return EAGAIN when portion of the user data
4880 			 * has actually been sent down.
4881 			 */
4882 			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4883 		}
4884 	}
4885 	return (0);
4886 }
4887 
4888 /*
4889  * Update sti_faddr by asking the transport (unless AF_UNIX).
4890  */
4891 /* ARGSUSED */
4892 int
4893 sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4894     boolean_t accept, struct cred *cr)
4895 {
4896 	struct strbuf	strbuf;
4897 	int		error = 0, res;
4898 	void		*addr;
4899 	t_uscalar_t	addrlen;
4900 	k_sigset_t	smask;
4901 	sotpi_info_t	*sti = SOTOTPI(so);
4902 
4903 	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4904 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4905 
4906 	ASSERT(*namelen > 0);
4907 	mutex_enter(&so->so_lock);
4908 	so_lock_single(so);	/* Set SOLOCKED */
4909 
4910 	if (accept) {
4911 		bcopy(sti->sti_faddr_sa, name,
4912 		    MIN(*namelen, sti->sti_faddr_len));
4913 		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4914 		goto done;
4915 	}
4916 
4917 	if (!(so->so_state & SS_ISCONNECTED)) {
4918 		error = ENOTCONN;
4919 		goto done;
4920 	}
4921 	/* Added this check for X/Open */
4922 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4923 		error = EINVAL;
4924 		if (xnet_check_print) {
4925 			printf("sockfs: X/Open getpeername check => EINVAL\n");
4926 		}
4927 		goto done;
4928 	}
4929 
4930 	if (sti->sti_faddr_valid) {
4931 		bcopy(sti->sti_faddr_sa, name,
4932 		    MIN(*namelen, sti->sti_faddr_len));
4933 		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4934 		goto done;
4935 	}
4936 
4937 #ifdef DEBUG
4938 	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4939 	    pr_addr(so->so_family, sti->sti_faddr_sa,
4940 	    (t_uscalar_t)sti->sti_faddr_len)));
4941 #endif /* DEBUG */
4942 
4943 	if (so->so_family == AF_UNIX) {
4944 		/* Transport has different name space - return local info */
4945 		if (sti->sti_faddr_noxlate)
4946 			*namelen = 0;
4947 		error = 0;
4948 		goto done;
4949 	}
4950 
4951 	ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
4952 
4953 	ASSERT(sti->sti_faddr_sa);
4954 	/* Allocate local buffer to use with ioctl */
4955 	addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
4956 	mutex_exit(&so->so_lock);
4957 	addr = kmem_alloc(addrlen, KM_SLEEP);
4958 
4959 	/*
4960 	 * Issue TI_GETPEERNAME with signals masked.
4961 	 * Put the result in sti_faddr_sa so that getpeername works after
4962 	 * a shutdown(output).
4963 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4964 	 * back to the socket.
4965 	 */
4966 	strbuf.buf = addr;
4967 	strbuf.maxlen = addrlen;
4968 	strbuf.len = 0;
4969 
4970 	sigintr(&smask, 0);
4971 	res = 0;
4972 	ASSERT(cr);
4973 	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4974 	    0, K_TO_K, cr, &res);
4975 	sigunintr(&smask);
4976 
4977 	mutex_enter(&so->so_lock);
4978 	/*
4979 	 * If there is an error record the error in so_error put don't fail
4980 	 * the getpeername. Instead fallback on the recorded
4981 	 * sti->sti_faddr_sa.
4982 	 */
4983 	if (error) {
4984 		/*
4985 		 * Various stream head errors can be returned to the ioctl.
4986 		 * However, it is impossible to determine which ones of
4987 		 * these are really socket level errors that were incorrectly
4988 		 * consumed by the ioctl. Thus this code silently ignores the
4989 		 * error - to code explicitly does not reinstate the error
4990 		 * using soseterror().
4991 		 * Experiments have shows that at least this set of
4992 		 * errors are reported and should not be reinstated on the
4993 		 * socket:
4994 		 *	EINVAL	E.g. if an I_LINK was in effect when
4995 		 *		getpeername was called.
4996 		 *	EPIPE	The ioctl error semantics prefer the write
4997 		 *		side error over the read side error.
4998 		 *	ENOTCONN The transport just got disconnected but
4999 		 *		sockfs had not yet seen the T_DISCON_IND
5000 		 *		when issuing the ioctl.
5001 		 */
5002 		error = 0;
5003 	} else if (res == 0 && strbuf.len > 0 &&
5004 	    (so->so_state & SS_ISCONNECTED)) {
5005 		ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
5006 		sti->sti_faddr_len = (socklen_t)strbuf.len;
5007 		bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
5008 		sti->sti_faddr_valid = 1;
5009 
5010 		bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
5011 		*namelen = sti->sti_faddr_len;
5012 	}
5013 	kmem_free(addr, addrlen);
5014 #ifdef DEBUG
5015 	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
5016 	    pr_addr(so->so_family, sti->sti_faddr_sa,
5017 	    (t_uscalar_t)sti->sti_faddr_len)));
5018 #endif /* DEBUG */
5019 done:
5020 	so_unlock_single(so, SOLOCKED);
5021 	mutex_exit(&so->so_lock);
5022 	return (error);
5023 }
5024 
5025 /*
5026  * Update sti_laddr by asking the transport (unless AF_UNIX).
5027  */
5028 int
5029 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
5030     struct cred *cr)
5031 {
5032 	struct strbuf	strbuf;
5033 	int		error = 0, res;
5034 	void		*addr;
5035 	t_uscalar_t	addrlen;
5036 	k_sigset_t	smask;
5037 	sotpi_info_t	*sti = SOTOTPI(so);
5038 
5039 	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
5040 	    (void *)so, pr_state(so->so_state, so->so_mode)));
5041 
5042 	ASSERT(*namelen > 0);
5043 	mutex_enter(&so->so_lock);
5044 	so_lock_single(so);	/* Set SOLOCKED */
5045 
5046 #ifdef DEBUG
5047 
5048 	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
5049 	    pr_addr(so->so_family, sti->sti_laddr_sa,
5050 	    (t_uscalar_t)sti->sti_laddr_len)));
5051 #endif /* DEBUG */
5052 	if (sti->sti_laddr_valid) {
5053 		bcopy(sti->sti_laddr_sa, name,
5054 		    MIN(*namelen, sti->sti_laddr_len));
5055 		*namelen = sti->sti_laddr_len;
5056 		goto done;
5057 	}
5058 
5059 	if (so->so_family == AF_UNIX) {
5060 		/* Transport has different name space - return local info */
5061 		error = 0;
5062 		goto done;
5063 	}
5064 	if (!(so->so_state & SS_ISBOUND)) {
5065 		/* If not bound, then nothing to return. */
5066 		error = 0;
5067 		goto done;
5068 	}
5069 
5070 	/* Allocate local buffer to use with ioctl */
5071 	addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
5072 	mutex_exit(&so->so_lock);
5073 	addr = kmem_alloc(addrlen, KM_SLEEP);
5074 
5075 	/*
5076 	 * Issue TI_GETMYNAME with signals masked.
5077 	 * Put the result in sti_laddr_sa so that getsockname works after
5078 	 * a shutdown(output).
5079 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
5080 	 * back to the socket.
5081 	 */
5082 	strbuf.buf = addr;
5083 	strbuf.maxlen = addrlen;
5084 	strbuf.len = 0;
5085 
5086 	sigintr(&smask, 0);
5087 	res = 0;
5088 	ASSERT(cr);
5089 	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
5090 	    0, K_TO_K, cr, &res);
5091 	sigunintr(&smask);
5092 
5093 	mutex_enter(&so->so_lock);
5094 	/*
5095 	 * If there is an error record the error in so_error put don't fail
5096 	 * the getsockname. Instead fallback on the recorded
5097 	 * sti->sti_laddr_sa.
5098 	 */
5099 	if (error) {
5100 		/*
5101 		 * Various stream head errors can be returned to the ioctl.
5102 		 * However, it is impossible to determine which ones of
5103 		 * these are really socket level errors that were incorrectly
5104 		 * consumed by the ioctl. Thus this code silently ignores the
5105 		 * error - to code explicitly does not reinstate the error
5106 		 * using soseterror().
5107 		 * Experiments have shows that at least this set of
5108 		 * errors are reported and should not be reinstated on the
5109 		 * socket:
5110 		 *	EINVAL	E.g. if an I_LINK was in effect when
5111 		 *		getsockname was called.
5112 		 *	EPIPE	The ioctl error semantics prefer the write
5113 		 *		side error over the read side error.
5114 		 */
5115 		error = 0;
5116 	} else if (res == 0 && strbuf.len > 0 &&
5117 	    (so->so_state & SS_ISBOUND)) {
5118 		ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
5119 		sti->sti_laddr_len = (socklen_t)strbuf.len;
5120 		bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
5121 		sti->sti_laddr_valid = 1;
5122 
5123 		bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
5124 		*namelen = sti->sti_laddr_len;
5125 	}
5126 	kmem_free(addr, addrlen);
5127 #ifdef DEBUG
5128 	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
5129 	    pr_addr(so->so_family, sti->sti_laddr_sa,
5130 	    (t_uscalar_t)sti->sti_laddr_len)));
5131 #endif /* DEBUG */
5132 done:
5133 	so_unlock_single(so, SOLOCKED);
5134 	mutex_exit(&so->so_lock);
5135 	return (error);
5136 }
5137 
5138 /*
5139  * Get socket options. For SOL_SOCKET options some options are handled
5140  * by the sockfs while others use the value recorded in the sonode as a
5141  * fallback should the T_SVR4_OPTMGMT_REQ fail.
5142  *
5143  * On the return most *optlenp bytes are copied to optval.
5144  */
5145 /* ARGSUSED */
5146 int
5147 sotpi_getsockopt(struct sonode *so, int level, int option_name,
5148 		void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5149 {
5150 	struct T_optmgmt_req	optmgmt_req;
5151 	struct T_optmgmt_ack	*optmgmt_ack;
5152 	struct opthdr		oh;
5153 	struct opthdr		*opt_res;
5154 	mblk_t			*mp = NULL;
5155 	int			error = 0;
5156 	void			*option = NULL;	/* Set if fallback value */
5157 	t_uscalar_t		maxlen = *optlenp;
5158 	t_uscalar_t		len;
5159 	uint32_t		value;
5160 	struct timeval		tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5161 	struct so_snd_bufinfo	snd_bufinfo;	/* used for zero copy */
5162 
5163 	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5164 	    (void *)so, level, option_name, optval, (void *)optlenp,
5165 	    pr_state(so->so_state, so->so_mode)));
5166 
5167 	mutex_enter(&so->so_lock);
5168 	so_lock_single(so);	/* Set SOLOCKED */
5169 
5170 	/*
5171 	 * Check for SOL_SOCKET options.
5172 	 * Certain SOL_SOCKET options are returned directly whereas
5173 	 * others only provide a default (fallback) value should
5174 	 * the T_SVR4_OPTMGMT_REQ fail.
5175 	 */
5176 	if (level == SOL_SOCKET) {
5177 		/* Check parameters */
5178 		switch (option_name) {
5179 		case SO_TYPE:
5180 		case SO_ERROR:
5181 		case SO_DEBUG:
5182 		case SO_ACCEPTCONN:
5183 		case SO_REUSEADDR:
5184 		case SO_KEEPALIVE:
5185 		case SO_DONTROUTE:
5186 		case SO_BROADCAST:
5187 		case SO_USELOOPBACK:
5188 		case SO_OOBINLINE:
5189 		case SO_SNDBUF:
5190 		case SO_RCVBUF:
5191 #ifdef notyet
5192 		case SO_SNDLOWAT:
5193 		case SO_RCVLOWAT:
5194 #endif /* notyet */
5195 		case SO_DOMAIN:
5196 		case SO_DGRAM_ERRIND:
5197 			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5198 				error = EINVAL;
5199 				eprintsoline(so, error);
5200 				goto done2;
5201 			}
5202 			break;
5203 		case SO_RCVTIMEO:
5204 		case SO_SNDTIMEO:
5205 			if (maxlen < (t_uscalar_t)sizeof (struct timeval)) {
5206 				error = EINVAL;
5207 				eprintsoline(so, error);
5208 				goto done2;
5209 			}
5210 			break;
5211 		case SO_LINGER:
5212 			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5213 				error = EINVAL;
5214 				eprintsoline(so, error);
5215 				goto done2;
5216 			}
5217 			break;
5218 		case SO_SND_BUFINFO:
5219 			if (maxlen < (t_uscalar_t)
5220 			    sizeof (struct so_snd_bufinfo)) {
5221 				error = EINVAL;
5222 				eprintsoline(so, error);
5223 				goto done2;
5224 			}
5225 			break;
5226 		}
5227 
5228 		len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
5229 
5230 		switch (option_name) {
5231 		case SO_TYPE:
5232 			value = so->so_type;
5233 			option = &value;
5234 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5235 
5236 		case SO_ERROR:
5237 			value = sogeterr(so, B_TRUE);
5238 			option = &value;
5239 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5240 
5241 		case SO_ACCEPTCONN:
5242 			if (so->so_state & SS_ACCEPTCONN)
5243 				value = SO_ACCEPTCONN;
5244 			else
5245 				value = 0;
5246 #ifdef DEBUG
5247 			if (value) {
5248 				dprintso(so, 1,
5249 				    ("sotpi_getsockopt: 0x%x is set\n",
5250 				    option_name));
5251 			} else {
5252 				dprintso(so, 1,
5253 				    ("sotpi_getsockopt: 0x%x not set\n",
5254 				    option_name));
5255 			}
5256 #endif /* DEBUG */
5257 			option = &value;
5258 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5259 
5260 		case SO_DEBUG:
5261 		case SO_REUSEADDR:
5262 		case SO_KEEPALIVE:
5263 		case SO_DONTROUTE:
5264 		case SO_BROADCAST:
5265 		case SO_USELOOPBACK:
5266 		case SO_OOBINLINE:
5267 		case SO_DGRAM_ERRIND:
5268 			value = (so->so_options & option_name);
5269 #ifdef DEBUG
5270 			if (value) {
5271 				dprintso(so, 1,
5272 				    ("sotpi_getsockopt: 0x%x is set\n",
5273 				    option_name));
5274 			} else {
5275 				dprintso(so, 1,
5276 				    ("sotpi_getsockopt: 0x%x not set\n",
5277 				    option_name));
5278 			}
5279 #endif /* DEBUG */
5280 			option = &value;
5281 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5282 
5283 		/*
5284 		 * The following options are only returned by sockfs when the
5285 		 * T_SVR4_OPTMGMT_REQ fails.
5286 		 */
5287 		case SO_LINGER:
5288 			option = &so->so_linger;
5289 			len = (t_uscalar_t)sizeof (struct linger);
5290 			break;
5291 		case SO_SNDBUF: {
5292 			ssize_t lvalue;
5293 
5294 			/*
5295 			 * If the option has not been set then get a default
5296 			 * value from the read queue. This value is
5297 			 * returned if the transport fails
5298 			 * the T_SVR4_OPTMGMT_REQ.
5299 			 */
5300 			lvalue = so->so_sndbuf;
5301 			if (lvalue == 0) {
5302 				mutex_exit(&so->so_lock);
5303 				(void) strqget(strvp2wq(SOTOV(so))->q_next,
5304 				    QHIWAT, 0, &lvalue);
5305 				mutex_enter(&so->so_lock);
5306 				dprintso(so, 1,
5307 				    ("got SO_SNDBUF %ld from q\n", lvalue));
5308 			}
5309 			value = (int)lvalue;
5310 			option = &value;
5311 			len = (t_uscalar_t)sizeof (so->so_sndbuf);
5312 			break;
5313 		}
5314 		case SO_RCVBUF: {
5315 			ssize_t lvalue;
5316 
5317 			/*
5318 			 * If the option has not been set then get a default
5319 			 * value from the read queue. This value is
5320 			 * returned if the transport fails
5321 			 * the T_SVR4_OPTMGMT_REQ.
5322 			 *
5323 			 * XXX If SO_RCVBUF has been set and this is an
5324 			 * XPG 4.2 application then do not ask the transport
5325 			 * since the transport might adjust the value and not
5326 			 * return exactly what was set by the application.
5327 			 * For non-XPG 4.2 application we return the value
5328 			 * that the transport is actually using.
5329 			 */
5330 			lvalue = so->so_rcvbuf;
5331 			if (lvalue == 0) {
5332 				mutex_exit(&so->so_lock);
5333 				(void) strqget(RD(strvp2wq(SOTOV(so))),
5334 				    QHIWAT, 0, &lvalue);
5335 				mutex_enter(&so->so_lock);
5336 				dprintso(so, 1,
5337 				    ("got SO_RCVBUF %ld from q\n", lvalue));
5338 			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
5339 				value = (int)lvalue;
5340 				option = &value;
5341 				goto copyout;	/* skip asking transport */
5342 			}
5343 			value = (int)lvalue;
5344 			option = &value;
5345 			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5346 			break;
5347 		}
5348 		case SO_DOMAIN:
5349 			value = so->so_family;
5350 			option = &value;
5351 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5352 
5353 #ifdef notyet
5354 		/*
5355 		 * We do not implement the semantics of these options
5356 		 * thus we shouldn't implement the options either.
5357 		 */
5358 		case SO_SNDLOWAT:
5359 			value = so->so_sndlowat;
5360 			option = &value;
5361 			break;
5362 		case SO_RCVLOWAT:
5363 			value = so->so_rcvlowat;
5364 			option = &value;
5365 			break;
5366 #endif /* notyet */
5367 		case SO_SNDTIMEO:
5368 		case SO_RCVTIMEO: {
5369 			clock_t val;
5370 			if (option_name == SO_RCVTIMEO)
5371 				val = drv_hztousec(so->so_rcvtimeo);
5372 			else
5373 				val = drv_hztousec(so->so_sndtimeo);
5374 			tmo_val.tv_sec = val / (1000 * 1000);
5375 			tmo_val.tv_usec = val % (1000 * 1000);
5376 			option = &tmo_val;
5377 			len = (t_uscalar_t)sizeof (struct timeval);
5378 			break;
5379 		}
5380 		case SO_SND_BUFINFO: {
5381 			snd_bufinfo.sbi_wroff =
5382 			    (so->so_proto_props).sopp_wroff;
5383 			snd_bufinfo.sbi_maxblk =
5384 			    (so->so_proto_props).sopp_maxblk;
5385 			snd_bufinfo.sbi_maxpsz =
5386 			    (so->so_proto_props).sopp_maxpsz;
5387 			snd_bufinfo.sbi_tail =
5388 			    (so->so_proto_props).sopp_tail;
5389 			option = &snd_bufinfo;
5390 			len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5391 			break;
5392 		}
5393 		}
5394 	}
5395 
5396 	mutex_exit(&so->so_lock);
5397 
5398 	/* Send request */
5399 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5400 	optmgmt_req.MGMT_flags = T_CHECK;
5401 	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5402 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5403 
5404 	oh.level = level;
5405 	oh.name = option_name;
5406 	oh.len = maxlen;
5407 
5408 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5409 	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP);
5410 	/* Let option management work in the presence of data flow control */
5411 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5412 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5413 	mp = NULL;
5414 	mutex_enter(&so->so_lock);
5415 	if (error) {
5416 		eprintsoline(so, error);
5417 		goto done2;
5418 	}
5419 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5420 	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5421 	if (error) {
5422 		if (option != NULL) {
5423 			/* We have a fallback value */
5424 			error = 0;
5425 			goto copyout;
5426 		}
5427 		eprintsoline(so, error);
5428 		goto done2;
5429 	}
5430 	ASSERT(mp);
5431 	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5432 	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5433 	    optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5434 	if (opt_res == NULL) {
5435 		if (option != NULL) {
5436 			/* We have a fallback value */
5437 			error = 0;
5438 			goto copyout;
5439 		}
5440 		error = EPROTO;
5441 		eprintsoline(so, error);
5442 		goto done;
5443 	}
5444 	option = &opt_res[1];
5445 
5446 	/* check to ensure that the option is within bounds */
5447 	if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5448 	    (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5449 		if (option != NULL) {
5450 			/* We have a fallback value */
5451 			error = 0;
5452 			goto copyout;
5453 		}
5454 		error = EPROTO;
5455 		eprintsoline(so, error);
5456 		goto done;
5457 	}
5458 
5459 	len = opt_res->len;
5460 
5461 copyout: {
5462 		t_uscalar_t size = MIN(len, maxlen);
5463 		bcopy(option, optval, size);
5464 		bcopy(&size, optlenp, sizeof (size));
5465 	}
5466 done:
5467 	freemsg(mp);
5468 done2:
5469 	so_unlock_single(so, SOLOCKED);
5470 	mutex_exit(&so->so_lock);
5471 
5472 	return (error);
5473 }
5474 
5475 /*
5476  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5477  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5478  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5479  * setsockopt has to work even if the transport does not support the option.
5480  */
5481 /* ARGSUSED */
5482 int
5483 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5484 	const void *optval, t_uscalar_t optlen, struct cred *cr)
5485 {
5486 	struct T_optmgmt_req	optmgmt_req;
5487 	struct opthdr		oh;
5488 	mblk_t			*mp;
5489 	int			error = 0;
5490 	boolean_t		handled = B_FALSE;
5491 
5492 	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5493 	    (void *)so, level, option_name, optval, optlen,
5494 	    pr_state(so->so_state, so->so_mode)));
5495 
5496 	/* X/Open requires this check */
5497 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5498 		if (xnet_check_print)
5499 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
5500 		return (EINVAL);
5501 	}
5502 
5503 	mutex_enter(&so->so_lock);
5504 	so_lock_single(so);	/* Set SOLOCKED */
5505 	mutex_exit(&so->so_lock);
5506 
5507 	/*
5508 	 * For SOCKET or TCP level options, try to set it here itself
5509 	 * provided socket has not been popped and we know the tcp
5510 	 * structure (stored in so_priv).
5511 	 */
5512 	if ((level == SOL_SOCKET || level == IPPROTO_TCP) &&
5513 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
5514 	    (so->so_version == SOV_SOCKSTREAM) &&
5515 	    (so->so_proto_handle != NULL)) {
5516 		tcp_t		*tcp = (tcp_t *)so->so_proto_handle;
5517 		boolean_t	onoff;
5518 
5519 #define	intvalue	(*(int32_t *)optval)
5520 
5521 		switch (level) {
5522 		case SOL_SOCKET:
5523 			switch (option_name) {		/* Check length param */
5524 			case SO_DEBUG:
5525 			case SO_REUSEADDR:
5526 			case SO_DONTROUTE:
5527 			case SO_BROADCAST:
5528 			case SO_USELOOPBACK:
5529 			case SO_OOBINLINE:
5530 			case SO_DGRAM_ERRIND:
5531 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5532 					error = EINVAL;
5533 					eprintsoline(so, error);
5534 					mutex_enter(&so->so_lock);
5535 					goto done2;
5536 				}
5537 				ASSERT(optval);
5538 				onoff = intvalue != 0;
5539 				handled = B_TRUE;
5540 				break;
5541 			case SO_SNDTIMEO:
5542 			case SO_RCVTIMEO:
5543 				if (optlen !=
5544 				    (t_uscalar_t)sizeof (struct timeval)) {
5545 					error = EINVAL;
5546 					eprintsoline(so, error);
5547 					mutex_enter(&so->so_lock);
5548 					goto done2;
5549 				}
5550 				ASSERT(optval);
5551 				handled = B_TRUE;
5552 				break;
5553 			case SO_LINGER:
5554 				if (optlen !=
5555 				    (t_uscalar_t)sizeof (struct linger)) {
5556 					error = EINVAL;
5557 					eprintsoline(so, error);
5558 					mutex_enter(&so->so_lock);
5559 					goto done2;
5560 				}
5561 				ASSERT(optval);
5562 				handled = B_TRUE;
5563 				break;
5564 			}
5565 
5566 			switch (option_name) {			/* Do actions */
5567 			case SO_LINGER: {
5568 				struct linger *lgr = (struct linger *)optval;
5569 
5570 				if (lgr->l_onoff) {
5571 					tcp->tcp_linger = 1;
5572 					tcp->tcp_lingertime = lgr->l_linger;
5573 					so->so_linger.l_onoff = SO_LINGER;
5574 					so->so_options |= SO_LINGER;
5575 				} else {
5576 					tcp->tcp_linger = 0;
5577 					tcp->tcp_lingertime = 0;
5578 					so->so_linger.l_onoff = 0;
5579 					so->so_options &= ~SO_LINGER;
5580 				}
5581 				so->so_linger.l_linger = lgr->l_linger;
5582 				handled = B_TRUE;
5583 				break;
5584 			}
5585 			case SO_DEBUG:
5586 				tcp->tcp_debug = onoff;
5587 #ifdef SOCK_TEST
5588 				if (intvalue & 2)
5589 					sock_test_timelimit = 10 * hz;
5590 				else
5591 					sock_test_timelimit = 0;
5592 
5593 				if (intvalue & 4)
5594 					do_useracc = 0;
5595 				else
5596 					do_useracc = 1;
5597 #endif /* SOCK_TEST */
5598 				break;
5599 			case SO_DONTROUTE:
5600 				/*
5601 				 * SO_DONTROUTE, SO_USELOOPBACK and
5602 				 * SO_BROADCAST are only of interest to IP.
5603 				 * We track them here only so
5604 				 * that we can report their current value.
5605 				 */
5606 				tcp->tcp_dontroute = onoff;
5607 				if (onoff)
5608 					so->so_options |= option_name;
5609 				else
5610 					so->so_options &= ~option_name;
5611 				break;
5612 			case SO_USELOOPBACK:
5613 				tcp->tcp_useloopback = onoff;
5614 				if (onoff)
5615 					so->so_options |= option_name;
5616 				else
5617 					so->so_options &= ~option_name;
5618 				break;
5619 			case SO_BROADCAST:
5620 				tcp->tcp_broadcast = onoff;
5621 				if (onoff)
5622 					so->so_options |= option_name;
5623 				else
5624 					so->so_options &= ~option_name;
5625 				break;
5626 			case SO_REUSEADDR:
5627 				tcp->tcp_reuseaddr = onoff;
5628 				if (onoff)
5629 					so->so_options |= option_name;
5630 				else
5631 					so->so_options &= ~option_name;
5632 				break;
5633 			case SO_OOBINLINE:
5634 				tcp->tcp_oobinline = onoff;
5635 				if (onoff)
5636 					so->so_options |= option_name;
5637 				else
5638 					so->so_options &= ~option_name;
5639 				break;
5640 			case SO_DGRAM_ERRIND:
5641 				tcp->tcp_dgram_errind = onoff;
5642 				if (onoff)
5643 					so->so_options |= option_name;
5644 				else
5645 					so->so_options &= ~option_name;
5646 				break;
5647 			}
5648 			break;
5649 		case IPPROTO_TCP:
5650 			switch (option_name) {
5651 			case TCP_NODELAY:
5652 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5653 					error = EINVAL;
5654 					eprintsoline(so, error);
5655 					mutex_enter(&so->so_lock);
5656 					goto done2;
5657 				}
5658 				ASSERT(optval);
5659 				tcp->tcp_naglim = intvalue ? 1 : tcp->tcp_mss;
5660 				handled = B_TRUE;
5661 				break;
5662 			}
5663 			break;
5664 		default:
5665 			handled = B_FALSE;
5666 			break;
5667 		}
5668 	}
5669 
5670 	if (handled) {
5671 		mutex_enter(&so->so_lock);
5672 		goto done2;
5673 	}
5674 
5675 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5676 	optmgmt_req.MGMT_flags = T_NEGOTIATE;
5677 	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5678 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5679 
5680 	oh.level = level;
5681 	oh.name = option_name;
5682 	oh.len = optlen;
5683 
5684 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5685 	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP);
5686 	/* Let option management work in the presence of data flow control */
5687 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5688 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5689 	mp = NULL;
5690 	mutex_enter(&so->so_lock);
5691 	if (error) {
5692 		eprintsoline(so, error);
5693 		goto done2;
5694 	}
5695 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5696 	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5697 	if (error) {
5698 		eprintsoline(so, error);
5699 		goto done;
5700 	}
5701 	ASSERT(mp);
5702 	/* No need to verify T_optmgmt_ack */
5703 	freemsg(mp);
5704 done:
5705 	/*
5706 	 * Check for SOL_SOCKET options and record their values.
5707 	 * If we know about a SOL_SOCKET parameter and the transport
5708 	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5709 	 * EPROTO) we let the setsockopt succeed.
5710 	 */
5711 	if (level == SOL_SOCKET) {
5712 		/* Check parameters */
5713 		switch (option_name) {
5714 		case SO_DEBUG:
5715 		case SO_REUSEADDR:
5716 		case SO_KEEPALIVE:
5717 		case SO_DONTROUTE:
5718 		case SO_BROADCAST:
5719 		case SO_USELOOPBACK:
5720 		case SO_OOBINLINE:
5721 		case SO_SNDBUF:
5722 		case SO_RCVBUF:
5723 #ifdef notyet
5724 		case SO_SNDLOWAT:
5725 		case SO_RCVLOWAT:
5726 #endif /* notyet */
5727 		case SO_DGRAM_ERRIND:
5728 			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5729 				error = EINVAL;
5730 				eprintsoline(so, error);
5731 				goto done2;
5732 			}
5733 			ASSERT(optval);
5734 			handled = B_TRUE;
5735 			break;
5736 		case SO_SNDTIMEO:
5737 		case SO_RCVTIMEO:
5738 			if (optlen != (t_uscalar_t)sizeof (struct timeval)) {
5739 				error = EINVAL;
5740 				eprintsoline(so, error);
5741 				goto done2;
5742 			}
5743 			ASSERT(optval);
5744 			handled = B_TRUE;
5745 			break;
5746 		case SO_LINGER:
5747 			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5748 				error = EINVAL;
5749 				eprintsoline(so, error);
5750 				goto done2;
5751 			}
5752 			ASSERT(optval);
5753 			handled = B_TRUE;
5754 			break;
5755 		}
5756 
5757 #define	intvalue	(*(int32_t *)optval)
5758 
5759 		switch (option_name) {
5760 		case SO_TYPE:
5761 		case SO_ERROR:
5762 		case SO_ACCEPTCONN:
5763 			/* Can't be set */
5764 			error = ENOPROTOOPT;
5765 			goto done2;
5766 		case SO_LINGER: {
5767 			struct linger *l = (struct linger *)optval;
5768 
5769 			so->so_linger.l_linger = l->l_linger;
5770 			if (l->l_onoff) {
5771 				so->so_linger.l_onoff = SO_LINGER;
5772 				so->so_options |= SO_LINGER;
5773 			} else {
5774 				so->so_linger.l_onoff = 0;
5775 				so->so_options &= ~SO_LINGER;
5776 			}
5777 			break;
5778 		}
5779 
5780 		case SO_DEBUG:
5781 #ifdef SOCK_TEST
5782 			if (intvalue & 2)
5783 				sock_test_timelimit = 10 * hz;
5784 			else
5785 				sock_test_timelimit = 0;
5786 
5787 			if (intvalue & 4)
5788 				do_useracc = 0;
5789 			else
5790 				do_useracc = 1;
5791 #endif /* SOCK_TEST */
5792 			/* FALLTHRU */
5793 		case SO_REUSEADDR:
5794 		case SO_KEEPALIVE:
5795 		case SO_DONTROUTE:
5796 		case SO_BROADCAST:
5797 		case SO_USELOOPBACK:
5798 		case SO_OOBINLINE:
5799 		case SO_DGRAM_ERRIND:
5800 			if (intvalue != 0) {
5801 				dprintso(so, 1,
5802 				    ("socket_setsockopt: setting 0x%x\n",
5803 				    option_name));
5804 				so->so_options |= option_name;
5805 			} else {
5806 				dprintso(so, 1,
5807 				    ("socket_setsockopt: clearing 0x%x\n",
5808 				    option_name));
5809 				so->so_options &= ~option_name;
5810 			}
5811 			break;
5812 		/*
5813 		 * The following options are only returned by us when the
5814 		 * transport layer fails.
5815 		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5816 		 * since the transport might adjust the value and not
5817 		 * return exactly what was set by the application.
5818 		 */
5819 		case SO_SNDBUF:
5820 			so->so_sndbuf = intvalue;
5821 			break;
5822 		case SO_RCVBUF:
5823 			so->so_rcvbuf = intvalue;
5824 			break;
5825 		case SO_RCVPSH:
5826 			so->so_rcv_timer_interval = intvalue;
5827 			break;
5828 #ifdef notyet
5829 		/*
5830 		 * We do not implement the semantics of these options
5831 		 * thus we shouldn't implement the options either.
5832 		 */
5833 		case SO_SNDLOWAT:
5834 			so->so_sndlowat = intvalue;
5835 			break;
5836 		case SO_RCVLOWAT:
5837 			so->so_rcvlowat = intvalue;
5838 			break;
5839 #endif /* notyet */
5840 		case SO_SNDTIMEO:
5841 		case SO_RCVTIMEO: {
5842 			struct timeval *tl = (struct timeval *)optval;
5843 			clock_t val = tl->tv_sec * 1000 * 1000 + tl->tv_usec;
5844 			if (option_name == SO_RCVTIMEO)
5845 				so->so_rcvtimeo = drv_usectohz(val);
5846 			else
5847 				so->so_sndtimeo = drv_usectohz(val);
5848 			break;
5849 		}
5850 		}
5851 #undef	intvalue
5852 
5853 		if (error) {
5854 			if ((error == ENOPROTOOPT || error == EPROTO ||
5855 			    error == EINVAL) && handled) {
5856 				dprintso(so, 1,
5857 				    ("setsockopt: ignoring error %d for 0x%x\n",
5858 				    error, option_name));
5859 				error = 0;
5860 			}
5861 		}
5862 	}
5863 done2:
5864 	so_unlock_single(so, SOLOCKED);
5865 	mutex_exit(&so->so_lock);
5866 	return (error);
5867 }
5868 
5869 /* ARGSUSED */
5870 int
5871 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5872 {
5873 	struct vnode *vp = SOTOV(so);
5874 	dev_t dev;
5875 	int error = 0;
5876 	sotpi_info_t *sti = SOTOTPI(so);
5877 
5878 	dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5879 	    (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5880 
5881 	dev = sti->sti_dev;
5882 
5883 	ASSERT(STREAMSTAB(getmajor(dev)));
5884 
5885 	mutex_enter(&so->so_lock);
5886 	so_lock_single(so);	/* Set SOLOCKED */
5887 
5888 	/*
5889 	 * Only call NL7C's close on last open reference.
5890 	 */
5891 	if (sti->sti_nl7c_flags & NL7C_ENABLED) {
5892 		sti->sti_nl7c_flags = 0;
5893 		nl7c_close(so);
5894 	}
5895 
5896 	/*
5897 	 * Only call the close routine when the last open reference through
5898 	 * any [s, v]node goes away.
5899 	 */
5900 	if (vp->v_stream != NULL) {
5901 		vnode_t *ux_vp;
5902 
5903 		if (so->so_family == AF_UNIX) {
5904 			/* Could avoid this when CANTSENDMORE for !dgram */
5905 			so_unix_close(so);
5906 		}
5907 
5908 		mutex_exit(&so->so_lock);
5909 		/*
5910 		 * Disassemble the linkage from the AF_UNIX underlying file
5911 		 * system vnode to this socket (by atomically clearing
5912 		 * v_stream in vn_rele_stream) before strclose clears sd_vnode
5913 		 * and frees the stream head.
5914 		 */
5915 		if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5916 			ASSERT(ux_vp->v_stream);
5917 			sti->sti_ux_bound_vp = NULL;
5918 			vn_rele_stream(ux_vp);
5919 		}
5920 		if (so->so_family == AF_INET || so->so_family == AF_INET6) {
5921 			strsetrwputdatahooks(SOTOV(so), NULL, NULL);
5922 			if (sti->sti_kssl_ent != NULL) {
5923 				kssl_release_ent(sti->sti_kssl_ent, so,
5924 				    sti->sti_kssl_type);
5925 				sti->sti_kssl_ent = NULL;
5926 			}
5927 			if (sti->sti_kssl_ctx != NULL) {
5928 				kssl_release_ctx(sti->sti_kssl_ctx);
5929 				sti->sti_kssl_ctx = NULL;
5930 			}
5931 			sti->sti_kssl_type = KSSL_NO_PROXY;
5932 		}
5933 		error = strclose(vp, flag, cr);
5934 		vp->v_stream = NULL;
5935 		mutex_enter(&so->so_lock);
5936 	}
5937 
5938 	/*
5939 	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
5940 	 */
5941 	so_flush_discon_ind(so);
5942 
5943 	so_unlock_single(so, SOLOCKED);
5944 	mutex_exit(&so->so_lock);
5945 
5946 	/*
5947 	 * Needed for STREAMs.
5948 	 * Decrement the device driver's reference count for streams
5949 	 * opened via the clone dip. The driver was held in clone_open().
5950 	 * The absence of clone_close() forces this asymmetry.
5951 	 */
5952 	if (so->so_flag & SOCLONE)
5953 		ddi_rele_driver(getmajor(dev));
5954 
5955 	return (error);
5956 }
5957 
5958 static int
5959 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
5960     struct cred *cr, int32_t *rvalp)
5961 {
5962 	struct vnode *vp = SOTOV(so);
5963 	sotpi_info_t *sti = SOTOTPI(so);
5964 	int error = 0;
5965 
5966 	dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
5967 	    cmd, arg, pr_state(so->so_state, so->so_mode)));
5968 
5969 	switch (cmd) {
5970 	case _I_INSERT:
5971 	case _I_REMOVE:
5972 		/*
5973 		 * Since there's no compelling reason to support these ioctls
5974 		 * on sockets, and doing so would increase the complexity
5975 		 * markedly, prevent it.
5976 		 */
5977 		return (EOPNOTSUPP);
5978 
5979 	case I_FIND:
5980 	case I_LIST:
5981 	case I_LOOK:
5982 	case I_POP:
5983 	case I_PUSH:
5984 		/*
5985 		 * To prevent races and inconsistencies between the actual
5986 		 * state of the stream and the state according to the sonode,
5987 		 * we serialize all operations which modify or operate on the
5988 		 * list of modules on the socket's stream.
5989 		 */
5990 		mutex_enter(&sti->sti_plumb_lock);
5991 		error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
5992 		mutex_exit(&sti->sti_plumb_lock);
5993 		return (error);
5994 
5995 	default:
5996 		if (so->so_version != SOV_STREAM)
5997 			break;
5998 
5999 		/*
6000 		 * The imaginary "sockmod" has been popped; act as a stream.
6001 		 */
6002 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6003 	}
6004 
6005 	ASSERT(so->so_version != SOV_STREAM);
6006 
6007 	/*
6008 	 * Process socket-specific ioctls.
6009 	 */
6010 	switch (cmd) {
6011 	case FIONBIO: {
6012 		int32_t value;
6013 
6014 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
6015 		    (mode & (int)FKIOCTL)))
6016 			return (EFAULT);
6017 
6018 		mutex_enter(&so->so_lock);
6019 		if (value) {
6020 			so->so_state |= SS_NDELAY;
6021 		} else {
6022 			so->so_state &= ~SS_NDELAY;
6023 		}
6024 		mutex_exit(&so->so_lock);
6025 		return (0);
6026 	}
6027 
6028 	case FIOASYNC: {
6029 		int32_t value;
6030 
6031 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
6032 		    (mode & (int)FKIOCTL)))
6033 			return (EFAULT);
6034 
6035 		mutex_enter(&so->so_lock);
6036 		/*
6037 		 * SS_ASYNC flag not already set correctly?
6038 		 * (!value != !(so->so_state & SS_ASYNC))
6039 		 * but some engineers find that too hard to read.
6040 		 */
6041 		if (value == 0 && (so->so_state & SS_ASYNC) != 0 ||
6042 		    value != 0 && (so->so_state & SS_ASYNC) == 0)
6043 			error = so_flip_async(so, vp, mode, cr);
6044 		mutex_exit(&so->so_lock);
6045 		return (error);
6046 	}
6047 
6048 	case SIOCSPGRP:
6049 	case FIOSETOWN: {
6050 		pid_t pgrp;
6051 
6052 		if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
6053 		    (mode & (int)FKIOCTL)))
6054 			return (EFAULT);
6055 
6056 		mutex_enter(&so->so_lock);
6057 		dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
6058 		/* Any change? */
6059 		if (pgrp != so->so_pgrp)
6060 			error = so_set_siggrp(so, vp, pgrp, mode, cr);
6061 		mutex_exit(&so->so_lock);
6062 		return (error);
6063 	}
6064 	case SIOCGPGRP:
6065 	case FIOGETOWN:
6066 		if (so_copyout(&so->so_pgrp, (void *)arg,
6067 		    sizeof (pid_t), (mode & (int)FKIOCTL)))
6068 			return (EFAULT);
6069 		return (0);
6070 
6071 	case SIOCATMARK: {
6072 		int retval;
6073 		uint_t so_state;
6074 
6075 		/*
6076 		 * strwaitmark has a finite timeout after which it
6077 		 * returns -1 if the mark state is undetermined.
6078 		 * In order to avoid any race between the mark state
6079 		 * in sockfs and the mark state in the stream head this
6080 		 * routine loops until the mark state can be determined
6081 		 * (or the urgent data indication has been removed by some
6082 		 * other thread).
6083 		 */
6084 		do {
6085 			mutex_enter(&so->so_lock);
6086 			so_state = so->so_state;
6087 			mutex_exit(&so->so_lock);
6088 			if (so_state & SS_RCVATMARK) {
6089 				retval = 1;
6090 			} else if (!(so_state & SS_OOBPEND)) {
6091 				/*
6092 				 * No SIGURG has been generated -- there is no
6093 				 * pending or present urgent data. Thus can't
6094 				 * possibly be at the mark.
6095 				 */
6096 				retval = 0;
6097 			} else {
6098 				/*
6099 				 * Have the stream head wait until there is
6100 				 * either some messages on the read queue, or
6101 				 * STRATMARK or STRNOTATMARK gets set. The
6102 				 * STRNOTATMARK flag is used so that the
6103 				 * transport can send up a MSGNOTMARKNEXT
6104 				 * M_DATA to indicate that it is not
6105 				 * at the mark and additional data is not about
6106 				 * to be send upstream.
6107 				 *
6108 				 * If the mark state is undetermined this will
6109 				 * return -1 and we will loop rechecking the
6110 				 * socket state.
6111 				 */
6112 				retval = strwaitmark(vp);
6113 			}
6114 		} while (retval == -1);
6115 
6116 		if (so_copyout(&retval, (void *)arg, sizeof (int),
6117 		    (mode & (int)FKIOCTL)))
6118 			return (EFAULT);
6119 		return (0);
6120 	}
6121 
6122 	case I_FDINSERT:
6123 	case I_SENDFD:
6124 	case I_RECVFD:
6125 	case I_ATMARK:
6126 	case _SIOCSOCKFALLBACK:
6127 		/*
6128 		 * These ioctls do not apply to sockets. I_FDINSERT can be
6129 		 * used to send M_PROTO messages without modifying the socket
6130 		 * state. I_SENDFD/RECVFD should not be used for socket file
6131 		 * descriptor passing since they assume a twisted stream.
6132 		 * SIOCATMARK must be used instead of I_ATMARK.
6133 		 *
6134 		 * _SIOCSOCKFALLBACK from an application should never be
6135 		 * processed.  It is only generated by socktpi_open() or
6136 		 * in response to I_POP or I_PUSH.
6137 		 */
6138 #ifdef DEBUG
6139 		zcmn_err(getzoneid(), CE_WARN,
6140 		    "Unsupported STREAMS ioctl 0x%x on socket. "
6141 		    "Pid = %d\n", cmd, curproc->p_pid);
6142 #endif /* DEBUG */
6143 		return (EOPNOTSUPP);
6144 
6145 	case _I_GETPEERCRED:
6146 		if ((mode & FKIOCTL) == 0)
6147 			return (EINVAL);
6148 
6149 		mutex_enter(&so->so_lock);
6150 		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
6151 			error = ENOTSUP;
6152 		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
6153 			error = ENOTCONN;
6154 		} else if (so->so_peercred != NULL) {
6155 			k_peercred_t *kp = (k_peercred_t *)arg;
6156 			kp->pc_cr = so->so_peercred;
6157 			kp->pc_cpid = so->so_cpid;
6158 			crhold(so->so_peercred);
6159 		} else {
6160 			error = EINVAL;
6161 		}
6162 		mutex_exit(&so->so_lock);
6163 		return (error);
6164 
6165 	default:
6166 		/*
6167 		 * Do the higher-order bits of the ioctl cmd indicate
6168 		 * that it is an I_* streams ioctl?
6169 		 */
6170 		if ((cmd & 0xffffff00U) == STR &&
6171 		    so->so_version == SOV_SOCKBSD) {
6172 #ifdef DEBUG
6173 			zcmn_err(getzoneid(), CE_WARN,
6174 			    "Unsupported STREAMS ioctl 0x%x on socket. "
6175 			    "Pid = %d\n", cmd, 	curproc->p_pid);
6176 #endif /* DEBUG */
6177 			return (EOPNOTSUPP);
6178 		}
6179 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6180 	}
6181 }
6182 
6183 /*
6184  * Handle plumbing-related ioctls.
6185  */
6186 static int
6187 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
6188     struct cred *cr, int32_t *rvalp)
6189 {
6190 	static const char sockmod_name[] = "sockmod";
6191 	struct sonode	*so = VTOSO(vp);
6192 	char		mname[FMNAMESZ + 1];
6193 	int		error;
6194 	sotpi_info_t	*sti = SOTOTPI(so);
6195 
6196 	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
6197 
6198 	if (so->so_version == SOV_SOCKBSD)
6199 		return (EOPNOTSUPP);
6200 
6201 	if (so->so_version == SOV_STREAM) {
6202 		/*
6203 		 * The imaginary "sockmod" has been popped - act as a stream.
6204 		 * If this is a push of sockmod then change back to a socket.
6205 		 */
6206 		if (cmd == I_PUSH) {
6207 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6208 			    (void *)arg, mname, sizeof (mname), NULL);
6209 
6210 			if (error == 0 && strcmp(mname, sockmod_name) == 0) {
6211 				dprintso(so, 0, ("socktpi_ioctl: going to "
6212 				    "socket version\n"));
6213 				so_stream2sock(so);
6214 				return (0);
6215 			}
6216 		}
6217 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6218 	}
6219 
6220 	switch (cmd) {
6221 	case I_PUSH:
6222 		if (sti->sti_direct) {
6223 			mutex_enter(&so->so_lock);
6224 			so_lock_single(so);
6225 			mutex_exit(&so->so_lock);
6226 
6227 			error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
6228 			    CRED(), rvalp);
6229 
6230 			mutex_enter(&so->so_lock);
6231 			if (error == 0)
6232 				sti->sti_direct = 0;
6233 			so_unlock_single(so, SOLOCKED);
6234 			mutex_exit(&so->so_lock);
6235 
6236 			if (error != 0)
6237 				return (error);
6238 		}
6239 
6240 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6241 		if (error == 0)
6242 			sti->sti_pushcnt++;
6243 		return (error);
6244 
6245 	case I_POP:
6246 		if (sti->sti_pushcnt == 0) {
6247 			/* Emulate sockmod being popped */
6248 			dprintso(so, 0,
6249 			    ("socktpi_ioctl: going to STREAMS version\n"));
6250 			return (so_sock2stream(so));
6251 		}
6252 
6253 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6254 		if (error == 0)
6255 			sti->sti_pushcnt--;
6256 		return (error);
6257 
6258 	case I_LIST: {
6259 		struct str_mlist *kmlistp, *umlistp;
6260 		struct str_list	kstrlist;
6261 		ssize_t		kstrlistsize;
6262 		int		i, nmods;
6263 
6264 		STRUCT_DECL(str_list, ustrlist);
6265 		STRUCT_INIT(ustrlist, mode);
6266 
6267 		if (arg == NULL) {
6268 			error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6269 			if (error == 0)
6270 				(*rvalp)++;	/* Add one for sockmod */
6271 			return (error);
6272 		}
6273 
6274 		error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
6275 		    STRUCT_SIZE(ustrlist), mode & FKIOCTL);
6276 		if (error != 0)
6277 			return (error);
6278 
6279 		nmods = STRUCT_FGET(ustrlist, sl_nmods);
6280 		if (nmods <= 0)
6281 			return (EINVAL);
6282 		/*
6283 		 * Ceiling nmods at nstrpush to prevent someone from
6284 		 * maliciously consuming lots of kernel memory.
6285 		 */
6286 		nmods = MIN(nmods, nstrpush);
6287 
6288 		kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6289 		kstrlist.sl_nmods = nmods;
6290 		kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6291 
6292 		error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6293 		    cr, rvalp);
6294 		if (error != 0)
6295 			goto done;
6296 
6297 		/*
6298 		 * Considering the module list as a 0-based array of sl_nmods
6299 		 * modules, sockmod should conceptually exist at slot
6300 		 * sti_pushcnt.  Insert sockmod at this location by sliding all
6301 		 * of the module names after so_pushcnt over by one.  We know
6302 		 * that there will be room to do this since we allocated
6303 		 * sl_modlist with an additional slot.
6304 		 */
6305 		for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6306 			kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6307 
6308 		(void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6309 		kstrlist.sl_nmods++;
6310 
6311 		/*
6312 		 * Copy all of the entries out to ustrlist.
6313 		 */
6314 		kmlistp = kstrlist.sl_modlist;
6315 		umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6316 		for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6317 			error = so_copyout(kmlistp++, umlistp++,
6318 			    sizeof (struct str_mlist), mode & FKIOCTL);
6319 			if (error != 0)
6320 				goto done;
6321 		}
6322 
6323 		error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6324 		    mode & FKIOCTL);
6325 		if (error == 0)
6326 			*rvalp = 0;
6327 	done:
6328 		kmem_free(kstrlist.sl_modlist, kstrlistsize);
6329 		return (error);
6330 	}
6331 	case I_LOOK:
6332 		if (sti->sti_pushcnt == 0) {
6333 			return (so_copyout(sockmod_name, (void *)arg,
6334 			    sizeof (sockmod_name), mode & FKIOCTL));
6335 		}
6336 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6337 
6338 	case I_FIND:
6339 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6340 		if (error && error != EINVAL)
6341 			return (error);
6342 
6343 		/* if not found and string was sockmod return 1 */
6344 		if (*rvalp == 0 || error == EINVAL) {
6345 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6346 			    (void *)arg, mname, sizeof (mname), NULL);
6347 			if (error == ENAMETOOLONG)
6348 				error = EINVAL;
6349 
6350 			if (error == 0 && strcmp(mname, sockmod_name) == 0)
6351 				*rvalp = 1;
6352 		}
6353 		return (error);
6354 
6355 	default:
6356 		panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6357 		break;
6358 	}
6359 
6360 	return (0);
6361 }
6362 
6363 /*
6364  * Wrapper around the streams poll routine that implements socket poll
6365  * semantics.
6366  * The sockfs never calls pollwakeup itself - the stream head take care
6367  * of all pollwakeups. Since sockfs never holds so_lock when calling the
6368  * stream head there can never be a deadlock due to holding so_lock across
6369  * pollwakeup and acquiring so_lock in this routine.
6370  *
6371  * However, since the performance of VOP_POLL is critical we avoid
6372  * acquiring so_lock here. This is based on two assumptions:
6373  *  - The poll implementation holds locks to serialize the VOP_POLL call
6374  *    and a pollwakeup for the same pollhead. This ensures that should
6375  *    e.g. so_state change during a socktpi_poll call the pollwakeup
6376  *    (which strsock_* and strrput conspire to issue) is issued after
6377  *    the state change. Thus the pollwakeup will block until VOP_POLL has
6378  *    returned and then wake up poll and have it call VOP_POLL again.
6379  *  - The reading of so_state without holding so_lock does not result in
6380  *    stale data that is older than the latest state change that has dropped
6381  *    so_lock. This is ensured by the mutex_exit issuing the appropriate
6382  *    memory barrier to force the data into the coherency domain.
6383  */
6384 static int
6385 sotpi_poll(
6386 	struct sonode	*so,
6387 	short		events,
6388 	int		anyyet,
6389 	short		*reventsp,
6390 	struct pollhead **phpp)
6391 {
6392 	short origevents = events;
6393 	struct vnode *vp = SOTOV(so);
6394 	int error;
6395 	int so_state = so->so_state;	/* snapshot */
6396 	sotpi_info_t *sti = SOTOTPI(so);
6397 
6398 	dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6399 	    (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6400 
6401 	ASSERT(vp->v_type == VSOCK);
6402 	ASSERT(vp->v_stream != NULL);
6403 
6404 	if (so->so_version == SOV_STREAM) {
6405 		/* The imaginary "sockmod" has been popped - act as a stream */
6406 		return (strpoll(vp->v_stream, events, anyyet,
6407 		    reventsp, phpp));
6408 	}
6409 
6410 	if (!(so_state & SS_ISCONNECTED) &&
6411 	    (so->so_mode & SM_CONNREQUIRED)) {
6412 		/* Not connected yet - turn off write side events */
6413 		events &= ~(POLLOUT|POLLWRBAND);
6414 	}
6415 	/*
6416 	 * Check for errors without calling strpoll if the caller wants them.
6417 	 * In sockets the errors are represented as input/output events
6418 	 * and there is no need to ask the stream head for this information.
6419 	 */
6420 	if (so->so_error != 0 &&
6421 	    ((POLLIN|POLLRDNORM|POLLOUT) & origevents)  != 0) {
6422 		*reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6423 		return (0);
6424 	}
6425 	/*
6426 	 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6427 	 * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
6428 	 * will not trigger a POLLIN event with POLLRDDATA set.
6429 	 * The handling of urgent data (causing POLLRDBAND) is done by
6430 	 * inspecting SS_OOBPEND below.
6431 	 */
6432 	events |= POLLRDDATA;
6433 
6434 	/*
6435 	 * After shutdown(output) a stream head write error is set.
6436 	 * However, we should not return output events.
6437 	 */
6438 	events |= POLLNOERR;
6439 	error = strpoll(vp->v_stream, events, anyyet,
6440 	    reventsp, phpp);
6441 	if (error)
6442 		return (error);
6443 
6444 	ASSERT(!(*reventsp & POLLERR));
6445 
6446 	/*
6447 	 * Notes on T_CONN_IND handling for sockets.
6448 	 *
6449 	 * If strpoll() returned without events, SR_POLLIN is guaranteed
6450 	 * to be set, ensuring any subsequent strrput() runs pollwakeup().
6451 	 *
6452 	 * Since the so_lock is not held, soqueueconnind() may have run
6453 	 * and a T_CONN_IND may be waiting. We now check for any queued
6454 	 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6455 	 * to ensure poll returns.
6456 	 *
6457 	 * However:
6458 	 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6459 	 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6460 	 * the following actions will occur; taken together they ensure the
6461 	 * syscall will return.
6462 	 *
6463 	 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6464 	 *    the accept() was run on a non-blocking socket sowaitconnind()
6465 	 *    may have already returned EWOULDBLOCK, so not be waiting to
6466 	 *    process the message. Additionally socktpi_poll() has probably
6467 	 *    proceeded past the sti_conn_ind_head check below.
6468 	 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6469 	 *    this thread,  however that could occur before poll_common()
6470 	 *    has entered cv_wait.
6471 	 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6472 	 *
6473 	 * Before proceeding to cv_wait() in poll_common() for an event,
6474 	 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6475 	 * and if set, re-calls strpoll() to ensure the late arriving
6476 	 * T_CONN_IND is recognized, and pollsys() returns.
6477 	 */
6478 
6479 	if (sti->sti_conn_ind_head != NULL)
6480 		*reventsp |= (POLLIN|POLLRDNORM) & events;
6481 
6482 	if (so->so_state & SS_OOBPEND)
6483 		*reventsp |= POLLRDBAND & events;
6484 
6485 	if (sti->sti_nl7c_rcv_mp != NULL) {
6486 		*reventsp |= (POLLIN|POLLRDNORM) & events;
6487 	}
6488 	if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
6489 	    ((POLLIN|POLLRDNORM) & *reventsp)) {
6490 		sti->sti_nl7c_flags |= NL7C_POLLIN;
6491 	}
6492 
6493 	return (0);
6494 }
6495 
6496 /*ARGSUSED*/
6497 static int
6498 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6499 {
6500 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6501 	int error = 0;
6502 
6503 	error = sonode_constructor(buf, cdrarg, kmflags);
6504 	if (error != 0)
6505 		return (error);
6506 
6507 	error = i_sotpi_info_constructor(&st->st_info);
6508 	if (error != 0)
6509 		sonode_destructor(buf, cdrarg);
6510 
6511 	st->st_sonode.so_priv = &st->st_info;
6512 
6513 	return (error);
6514 }
6515 
6516 /*ARGSUSED1*/
6517 static void
6518 socktpi_destructor(void *buf, void *cdrarg)
6519 {
6520 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6521 
6522 	ASSERT(st->st_sonode.so_priv == &st->st_info);
6523 	st->st_sonode.so_priv = NULL;
6524 
6525 	i_sotpi_info_destructor(&st->st_info);
6526 	sonode_destructor(buf, cdrarg);
6527 }
6528 
6529 static int
6530 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6531 {
6532 	int retval;
6533 
6534 	if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6535 		struct sonode *so = (struct sonode *)buf;
6536 		sotpi_info_t *sti = SOTOTPI(so);
6537 
6538 		mutex_enter(&socklist.sl_lock);
6539 
6540 		sti->sti_next_so = socklist.sl_list;
6541 		sti->sti_prev_so = NULL;
6542 		if (sti->sti_next_so != NULL)
6543 			SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6544 		socklist.sl_list = so;
6545 
6546 		mutex_exit(&socklist.sl_lock);
6547 
6548 	}
6549 	return (retval);
6550 }
6551 
6552 static void
6553 socktpi_unix_destructor(void *buf, void *cdrarg)
6554 {
6555 	struct sonode	*so = (struct sonode *)buf;
6556 	sotpi_info_t	*sti = SOTOTPI(so);
6557 
6558 	mutex_enter(&socklist.sl_lock);
6559 
6560 	if (sti->sti_next_so != NULL)
6561 		SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6562 	if (sti->sti_prev_so != NULL)
6563 		SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6564 	else
6565 		socklist.sl_list = sti->sti_next_so;
6566 
6567 	mutex_exit(&socklist.sl_lock);
6568 
6569 	socktpi_destructor(buf, cdrarg);
6570 }
6571 
6572 int
6573 socktpi_init(void)
6574 {
6575 	/*
6576 	 * Create sonode caches.  We create a special one for AF_UNIX so
6577 	 * that we can track them for netstat(1m).
6578 	 */
6579 	socktpi_cache = kmem_cache_create("socktpi_cache",
6580 	    sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6581 	    socktpi_destructor, NULL, NULL, NULL, 0);
6582 
6583 	socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6584 	    sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6585 	    socktpi_unix_destructor, NULL, NULL, NULL, 0);
6586 
6587 	return (0);
6588 }
6589 
6590 /*
6591  * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6592  *
6593  * Caller must still update state and mode using sotpi_update_state().
6594  *
6595  * Returns the STREAM queue that the protocol should use.
6596  */
6597 queue_t *
6598 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6599     boolean_t *direct, struct cred *cr)
6600 {
6601 	sotpi_info_t *sti;
6602 	struct sockparams *origsp = so->so_sockparams;
6603 	sock_lower_handle_t handle = so->so_proto_handle;
6604 	uint_t old_state = so->so_state;
6605 	struct stdata *stp;
6606 	struct vnode *vp;
6607 	queue_t *q;
6608 
6609 	*direct = B_FALSE;
6610 	so->so_sockparams = newsp;
6611 	/*
6612 	 * Allocate and initalize fields required by TPI.
6613 	 */
6614 	(void) sotpi_info_create(so, KM_SLEEP);
6615 	sotpi_info_init(so);
6616 
6617 	if (sotpi_init(so, NULL, cr, SO_FALLBACK) != 0) {
6618 		sotpi_info_fini(so);
6619 		sotpi_info_destroy(so);
6620 		so->so_state = old_state;
6621 		return (NULL);
6622 	}
6623 	ASSERT(handle == so->so_proto_handle);
6624 	sti = SOTOTPI(so);
6625 	if (sti->sti_direct != 0)
6626 		*direct = B_TRUE;
6627 
6628 	/*
6629 	 * Keep the original sp around so we can properly dispose of the
6630 	 * sonode when the socket is being closed.
6631 	 */
6632 	sti->sti_orig_sp = origsp;
6633 
6634 	so_basic_strinit(so);	/* skips the T_CAPABILITY_REQ */
6635 	so_alloc_addr(so, so->so_max_addr_len);
6636 
6637 	/*
6638 	 * If the application has done a SIOCSPGRP, make sure the
6639 	 * STREAM head is aware. This needs to take place before
6640 	 * the protocol start sending up messages. Otherwise we
6641 	 * might miss to generate SIGPOLL.
6642 	 *
6643 	 * It is possible that the application will receive duplicate
6644 	 * signals if some were already generated for either data or
6645 	 * connection indications.
6646 	 */
6647 	if (so->so_pgrp != 0) {
6648 		mutex_enter(&so->so_lock);
6649 		if (so_set_events(so, so->so_vnode, cr) != 0)
6650 			so->so_pgrp = 0;
6651 		mutex_exit(&so->so_lock);
6652 	}
6653 
6654 	/*
6655 	 * Determine which queue to use.
6656 	 */
6657 	vp = SOTOV(so);
6658 	stp = vp->v_stream;
6659 	ASSERT(stp != NULL);
6660 	q = stp->sd_wrq->q_next;
6661 
6662 	/*
6663 	 * Skip any modules that may have been auto pushed when the device
6664 	 * was opened
6665 	 */
6666 	while (q->q_next != NULL)
6667 		q = q->q_next;
6668 	q = _RD(q);
6669 
6670 	return (q);
6671 }
6672 
6673 void
6674 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6675     struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6676     socklen_t faddrlen, short opts)
6677 {
6678 	sotpi_info_t *sti = SOTOTPI(so);
6679 
6680 	so_proc_tcapability_ack(so, tcap);
6681 
6682 	so->so_options |= opts;
6683 
6684 	/*
6685 	 * Determine whether the foreign and local address are valid
6686 	 */
6687 	if (laddrlen != 0) {
6688 		ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6689 		sti->sti_laddr_len = laddrlen;
6690 		bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6691 		sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6692 	}
6693 
6694 	if (faddrlen != 0) {
6695 		ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6696 		sti->sti_faddr_len = faddrlen;
6697 		bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6698 		sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6699 	}
6700 
6701 }
6702 
6703 /*
6704  * Allocate enough space to cache the local and foreign addresses.
6705  */
6706 void
6707 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6708 {
6709 	sotpi_info_t *sti = SOTOTPI(so);
6710 
6711 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6712 	ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6713 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6714 	    P2ROUNDUP(maxlen, KMEM_ALIGN);
6715 	so->so_max_addr_len = sti->sti_laddr_maxlen;
6716 	sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6717 	sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6718 	    + sti->sti_laddr_maxlen);
6719 
6720 	if (so->so_family == AF_UNIX) {
6721 		/*
6722 		 * Initialize AF_UNIX related fields.
6723 		 */
6724 		bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6725 		bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6726 	}
6727 }
6728 
6729 
6730 sotpi_info_t *
6731 sotpi_sototpi(struct sonode *so)
6732 {
6733 	sotpi_info_t *sti;
6734 
6735 	if (so == NULL)
6736 		return (NULL);
6737 
6738 	sti = (sotpi_info_t *)so->so_priv;
6739 
6740 	ASSERT(sti != NULL);
6741 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6742 
6743 	return (sti);
6744 }
6745 
6746 static int
6747 i_sotpi_info_constructor(sotpi_info_t *sti)
6748 {
6749 	sti->sti_magic		= SOTPI_INFO_MAGIC;
6750 	sti->sti_ack_mp		= NULL;
6751 	sti->sti_discon_ind_mp	= NULL;
6752 	sti->sti_ux_bound_vp	= NULL;
6753 	sti->sti_unbind_mp	= NULL;
6754 
6755 	sti->sti_conn_ind_head	= NULL;
6756 	sti->sti_conn_ind_tail	= NULL;
6757 
6758 	sti->sti_laddr_sa	= NULL;
6759 	sti->sti_faddr_sa	= NULL;
6760 
6761 	sti->sti_nl7c_flags	= 0;
6762 	sti->sti_nl7c_uri	= NULL;
6763 	sti->sti_nl7c_rcv_mp	= NULL;
6764 
6765 	mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6766 	cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6767 
6768 	return (0);
6769 }
6770 
6771 static void
6772 i_sotpi_info_destructor(sotpi_info_t *sti)
6773 {
6774 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6775 	ASSERT(sti->sti_ack_mp == NULL);
6776 	ASSERT(sti->sti_discon_ind_mp == NULL);
6777 	ASSERT(sti->sti_ux_bound_vp == NULL);
6778 	ASSERT(sti->sti_unbind_mp == NULL);
6779 
6780 	ASSERT(sti->sti_conn_ind_head == NULL);
6781 	ASSERT(sti->sti_conn_ind_tail == NULL);
6782 
6783 	ASSERT(sti->sti_laddr_sa == NULL);
6784 	ASSERT(sti->sti_faddr_sa == NULL);
6785 
6786 	ASSERT(sti->sti_nl7c_flags == 0);
6787 	ASSERT(sti->sti_nl7c_uri == NULL);
6788 	ASSERT(sti->sti_nl7c_rcv_mp == NULL);
6789 
6790 	mutex_destroy(&sti->sti_plumb_lock);
6791 	cv_destroy(&sti->sti_ack_cv);
6792 }
6793 
6794 /*
6795  * Creates and attaches TPI information to the given sonode
6796  */
6797 static boolean_t
6798 sotpi_info_create(struct sonode *so, int kmflags)
6799 {
6800 	sotpi_info_t *sti;
6801 
6802 	ASSERT(so->so_priv == NULL);
6803 
6804 	if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6805 		return (B_FALSE);
6806 
6807 	if (i_sotpi_info_constructor(sti) != 0) {
6808 		kmem_free(sti, sizeof (*sti));
6809 		return (B_FALSE);
6810 	}
6811 
6812 	so->so_priv = (void *)sti;
6813 	return (B_TRUE);
6814 }
6815 
6816 /*
6817  * Initializes the TPI information.
6818  */
6819 static void
6820 sotpi_info_init(struct sonode *so)
6821 {
6822 	struct vnode *vp = SOTOV(so);
6823 	sotpi_info_t *sti = SOTOTPI(so);
6824 	time_t now;
6825 
6826 	sti->sti_dev 	= so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6827 	vp->v_rdev	= sti->sti_dev;
6828 
6829 	sti->sti_orig_sp = NULL;
6830 
6831 	sti->sti_pushcnt = 0;
6832 
6833 	now = gethrestime_sec();
6834 	sti->sti_atime	= now;
6835 	sti->sti_mtime	= now;
6836 	sti->sti_ctime	= now;
6837 
6838 	sti->sti_eaddr_mp = NULL;
6839 	sti->sti_delayed_error = 0;
6840 
6841 	sti->sti_provinfo = NULL;
6842 
6843 	sti->sti_oobcnt = 0;
6844 	sti->sti_oobsigcnt = 0;
6845 
6846 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6847 
6848 	sti->sti_laddr_sa	= 0;
6849 	sti->sti_faddr_sa	= 0;
6850 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6851 	sti->sti_laddr_len = sti->sti_faddr_len = 0;
6852 
6853 	sti->sti_laddr_valid = 0;
6854 	sti->sti_faddr_valid = 0;
6855 	sti->sti_faddr_noxlate = 0;
6856 
6857 	sti->sti_direct = 0;
6858 
6859 	ASSERT(sti->sti_ack_mp == NULL);
6860 	ASSERT(sti->sti_ux_bound_vp == NULL);
6861 	ASSERT(sti->sti_unbind_mp == NULL);
6862 
6863 	ASSERT(sti->sti_conn_ind_head == NULL);
6864 	ASSERT(sti->sti_conn_ind_tail == NULL);
6865 
6866 	/* Initialize the kernel SSL proxy fields */
6867 	sti->sti_kssl_type = KSSL_NO_PROXY;
6868 	sti->sti_kssl_ent = NULL;
6869 	sti->sti_kssl_ctx = NULL;
6870 }
6871 
6872 /*
6873  * Given a sonode, grab the TPI info and free any data.
6874  */
6875 static void
6876 sotpi_info_fini(struct sonode *so)
6877 {
6878 	sotpi_info_t *sti = SOTOTPI(so);
6879 	mblk_t *mp;
6880 
6881 	ASSERT(sti->sti_discon_ind_mp == NULL);
6882 
6883 	if ((mp = sti->sti_conn_ind_head) != NULL) {
6884 		mblk_t *mp1;
6885 
6886 		while (mp) {
6887 			mp1 = mp->b_next;
6888 			mp->b_next = NULL;
6889 			freemsg(mp);
6890 			mp = mp1;
6891 		}
6892 		sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
6893 	}
6894 
6895 	/*
6896 	 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
6897 	 * indirect them.  It also uses so_count as a validity test.
6898 	 */
6899 	mutex_enter(&so->so_lock);
6900 
6901 	if (sti->sti_laddr_sa) {
6902 		ASSERT((caddr_t)sti->sti_faddr_sa ==
6903 		    (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
6904 		ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
6905 		sti->sti_laddr_valid = 0;
6906 		sti->sti_faddr_valid = 0;
6907 		kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
6908 		sti->sti_laddr_sa = NULL;
6909 		sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
6910 		sti->sti_faddr_sa = NULL;
6911 		sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
6912 	}
6913 
6914 	mutex_exit(&so->so_lock);
6915 
6916 	if ((mp = sti->sti_eaddr_mp) != NULL) {
6917 		freemsg(mp);
6918 		sti->sti_eaddr_mp = NULL;
6919 		sti->sti_delayed_error = 0;
6920 	}
6921 
6922 	if ((mp = sti->sti_ack_mp) != NULL) {
6923 		freemsg(mp);
6924 		sti->sti_ack_mp = NULL;
6925 	}
6926 
6927 	if ((mp = sti->sti_nl7c_rcv_mp) != NULL) {
6928 		sti->sti_nl7c_rcv_mp = NULL;
6929 		freemsg(mp);
6930 	}
6931 	sti->sti_nl7c_rcv_rval = 0;
6932 	if (sti->sti_nl7c_uri != NULL) {
6933 		nl7c_urifree(so);
6934 		/* urifree() cleared nl7c_uri */
6935 	}
6936 	if (sti->sti_nl7c_flags) {
6937 		sti->sti_nl7c_flags = 0;
6938 	}
6939 
6940 	ASSERT(sti->sti_ux_bound_vp == NULL);
6941 	if ((mp = sti->sti_unbind_mp) != NULL) {
6942 		freemsg(mp);
6943 		sti->sti_unbind_mp = NULL;
6944 	}
6945 }
6946 
6947 /*
6948  * Destroys the TPI information attached to a sonode.
6949  */
6950 static void
6951 sotpi_info_destroy(struct sonode *so)
6952 {
6953 	sotpi_info_t *sti = SOTOTPI(so);
6954 
6955 	i_sotpi_info_destructor(sti);
6956 	kmem_free(sti, sizeof (*sti));
6957 
6958 	so->so_priv = NULL;
6959 }
6960 
6961 /*
6962  * Create the global sotpi socket module entry. It will never be free.
6963  */
6964 smod_info_t *
6965 sotpi_smod_create(void)
6966 {
6967 	smod_info_t *smodp;
6968 
6969 	smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
6970 	smodp->smod_name = kmem_zalloc(strlen(SOTPI_SMOD_NAME), + 1);
6971 	(void *)strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
6972 	/*
6973 	 * Initilization the refcnt to 1 so it will never be free.
6974 	 */
6975 	smodp->smod_refcnt = 1;
6976 	smodp->smod_uc_version = SOCK_UC_VERSION;
6977 	smodp->smod_dc_version = SOCK_DC_VERSION;
6978 	smodp->smod_sock_create_func = &sotpi_create;
6979 	smodp->smod_sock_destroy_func = &sotpi_destroy;
6980 	return (smodp);
6981 }
6982