xref: /illumos-gate/usr/src/uts/common/fs/sockfs/socktpi.c (revision 068cf9dc70b00292d5e5ff1ab17f1b046215d6f6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2015, Joyent, Inc.
25  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
26  * Copyright 2022 Garrett D'Amore
27  */
28 
29 #include <sys/types.h>
30 #include <sys/t_lock.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/buf.h>
34 #include <sys/conf.h>
35 #include <sys/cred.h>
36 #include <sys/kmem.h>
37 #include <sys/kmem_impl.h>
38 #include <sys/sysmacros.h>
39 #include <sys/vfs.h>
40 #include <sys/vnode.h>
41 #include <sys/debug.h>
42 #include <sys/errno.h>
43 #include <sys/time.h>
44 #include <sys/file.h>
45 #include <sys/open.h>
46 #include <sys/user.h>
47 #include <sys/termios.h>
48 #include <sys/stream.h>
49 #include <sys/strsubr.h>
50 #include <sys/strsun.h>
51 #include <sys/suntpi.h>
52 #include <sys/ddi.h>
53 #include <sys/esunddi.h>
54 #include <sys/flock.h>
55 #include <sys/modctl.h>
56 #include <sys/vtrace.h>
57 #include <sys/cmn_err.h>
58 #include <sys/pathname.h>
59 
60 #include <sys/socket.h>
61 #include <sys/socketvar.h>
62 #include <sys/sockio.h>
63 #include <netinet/in.h>
64 #include <sys/un.h>
65 #include <sys/strsun.h>
66 
67 #include <sys/tiuser.h>
68 #define	_SUN_TPI_VERSION	2
69 #include <sys/tihdr.h>
70 #include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
71 
72 #include <c2/audit.h>
73 
74 #include <inet/common.h>
75 #include <inet/ip.h>
76 #include <inet/ip6.h>
77 #include <inet/tcp.h>
78 #include <inet/udp_impl.h>
79 
80 #include <sys/zone.h>
81 
82 #include <fs/sockfs/sockcommon.h>
83 #include <fs/sockfs/socktpi.h>
84 #include <fs/sockfs/socktpi_impl.h>
85 
86 /*
87  * Possible failures when memory can't be allocated. The documented behavior:
88  *
89  *		5.5:			4.X:		XNET:
90  * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
91  *							EINTR
92  *	(4.X does not document EINTR but returns it)
93  * bind:	ENOSR			-		ENOBUFS/ENOSR
94  * connect:	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
95  * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
96  * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
97  *	(4.X getpeername and getsockname do not fail in practice)
98  * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
99  * listen:	-			-		ENOBUFS
100  * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
101  *							EINTR
102  * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
103  *							EINTR
104  * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
105  * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
106  * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
107  * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
108  *
109  * Resolution. When allocation fails:
110  *	recv: return EINTR
111  *	send: return EINTR
112  *	connect, accept: EINTR
113  *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
114  *	socket, socketpair: ENOBUFS
115  *	getpeername, getsockname: sleep
116  *	getsockopt, setsockopt: sleep
117  */
118 
119 #ifdef SOCK_TEST
120 /*
121  * Variables that make sockfs do something other than the standard TPI
122  * for the AF_INET transports.
123  *
124  * solisten_tpi_tcp:
125  *	TCP can handle a O_T_BIND_REQ with an increased backlog even though
126  *	the transport is already bound. This is needed to avoid losing the
127  *	port number should listen() do a T_UNBIND_REQ followed by a
128  *	O_T_BIND_REQ.
129  *
130  * soconnect_tpi_udp:
131  *	UDP and ICMP can handle a T_CONN_REQ.
132  *	This is needed to make the sequence of connect(), getsockname()
133  *	return the local IP address used to send packets to the connected to
134  *	destination.
135  *
136  * soconnect_tpi_tcp:
137  *	TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
138  *	Set this to non-zero to send TPI conformant messages to TCP in this
139  *	respect. This is a performance optimization.
140  *
141  * soaccept_tpi_tcp:
142  *	TCP can handle a T_CONN_REQ without the acceptor being bound.
143  *	This is a performance optimization that has been picked up in XTI.
144  *
145  * soaccept_tpi_multioptions:
146  *	When inheriting SOL_SOCKET options from the listener to the accepting
147  *	socket send them as a single message for AF_INET{,6}.
 *
 * These are modifiable variables only when SOCK_TEST is defined. In the
 * #else branch the same names are preprocessor constants with the same
 * values as the initializers used here.
148  */
149 int solisten_tpi_tcp = 0;
150 int soconnect_tpi_udp = 0;
151 int soconnect_tpi_tcp = 0;
152 int soaccept_tpi_tcp = 0;
153 int soaccept_tpi_multioptions = 1;
154 #else /* SOCK_TEST */
155 #define	soconnect_tpi_tcp	0
156 #define	soconnect_tpi_udp	0
157 #define	solisten_tpi_tcp	0
158 #define	soaccept_tpi_tcp	0
159 #define	soaccept_tpi_multioptions	1
160 #endif /* SOCK_TEST */
161 
/* Test hooks defined elsewhere; only declared when SOCK_TEST is defined. */
162 #ifdef SOCK_TEST
163 extern int do_useracc;
164 extern clock_t sock_test_timelimit;
165 #endif /* SOCK_TEST */
166 
167 extern uint32_t ucredsize;
168 
169 /*
170  * Some X/Open added checks might have to be backed out to keep SunOS 4.X
171  * applications working. Set xnet_skip_checks to non-zero to disable these
 * checks.
 *
 * When xnet_check_print is non-zero a message is printed when the X/Open
 * bind state check in sotpi_bindlisten() fails a request with EINVAL.
 *
 * xnet_truncate_print is not referenced in this part of the file; it
 * presumably enables a similar message for truncation -- TODO confirm
 * against its users.
172  */
173 int xnet_skip_checks = 0;
174 int xnet_check_print = 0;
175 int xnet_truncate_print = 0;
176 
/* Creation and destruction of TPI sonodes. */
177 static void sotpi_destroy(struct sonode *);
178 static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
179     int, int *, cred_t *cr);
180 
/* Life cycle of the sotpi_info_t attached to a TPI sonode. */
181 static boolean_t	sotpi_info_create(struct sonode *, int);
182 static void		sotpi_info_init(struct sonode *);
183 static void		sotpi_info_fini(struct sonode *);
184 static void		sotpi_info_destroy(struct sonode *);
185 
186 /*
187  * Do direct function call to the transport layer below; this would
188  * also allow the transport to utilize read-side synchronous stream
189  * interface if necessary.  This is a /etc/system tunable that must
190  * not be modified on a running system.  By default this is enabled
191  * for performance reasons and may be disabled for debugging purposes.
192  */
193 boolean_t socktpi_direct = B_TRUE;
194 
/*
 * sonode caches. sotpi_create() and sotpi_destroy() use socktpi_unix_cache
 * for AF_UNIX sonodes and socktpi_cache for every other address family.
 */
195 static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
196 
197 extern	void sigintr(k_sigset_t *, int);
198 extern	void sigunintr(k_sigset_t *);
199 
200 static int	sotpi_unbind(struct sonode *, int);
201 
202 /* TPI sockfs sonode operations */
203 int		sotpi_init(struct sonode *, struct sonode *, struct cred *,
204 		    int);
205 static int	sotpi_accept(struct sonode *, int, struct cred *,
206 		    struct sonode **);
207 static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
208 		    int, struct cred *);
209 static int	sotpi_listen(struct sonode *, int, struct cred *);
210 static int	sotpi_connect(struct sonode *, struct sockaddr *,
211 		    socklen_t, int, int, struct cred *);
212 extern int	sotpi_recvmsg(struct sonode *, struct nmsghdr *,
213 		    struct uio *, struct cred *);
214 static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
215 		    struct uio *, struct cred *);
216 static int	sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
217 		    struct cred *, mblk_t **);
218 static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
219 		    struct uio *, void *, t_uscalar_t, int);
220 static int	sodgram_direct(struct sonode *, struct sockaddr *,
221 		    socklen_t, struct uio *, int);
222 extern int	sotpi_getpeername(struct sonode *, struct sockaddr *,
223 		    socklen_t *, boolean_t, struct cred *);
224 static int	sotpi_getsockname(struct sonode *, struct sockaddr *,
225 		    socklen_t *, struct cred *);
226 static int	sotpi_shutdown(struct sonode *, int, struct cred *);
227 extern int	sotpi_getsockopt(struct sonode *, int, int, void *,
228 		    socklen_t *, int, struct cred *);
229 extern int	sotpi_setsockopt(struct sonode *, int, int, const void *,
230 		    socklen_t, struct cred *);
231 static int	sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
232 		    int32_t *);
233 static int	socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
234 		    struct cred *, int32_t *);
235 static int	sotpi_poll(struct sonode *, short, int, short *,
236 		    struct pollhead **);
237 static int	sotpi_close(struct sonode *, int, struct cred *);
238 
239 static int	i_sotpi_info_constructor(sotpi_info_t *);
240 static void	i_sotpi_info_destructor(sotpi_info_t *);
241 
/*
 * Operations vector for TPI sockets. sotpi_create() hands it to
 * sonode_init() for every sonode it allocates, and sotpi_destroy() asserts
 * that so_ops still points here. The initializers are positional, so their
 * order must match the member order noted in the trailing comments.
 */
242 sonodeops_t sotpi_sonodeops = {
243 	sotpi_init,		/* sop_init		*/
244 	sotpi_accept,		/* sop_accept		*/
245 	sotpi_bind,		/* sop_bind		*/
246 	sotpi_listen,		/* sop_listen		*/
247 	sotpi_connect,		/* sop_connect		*/
248 	sotpi_recvmsg,		/* sop_recvmsg		*/
249 	sotpi_sendmsg,		/* sop_sendmsg		*/
250 	sotpi_sendmblk,		/* sop_sendmblk		*/
251 	sotpi_getpeername,	/* sop_getpeername	*/
252 	sotpi_getsockname,	/* sop_getsockname	*/
253 	sotpi_shutdown,		/* sop_shutdown		*/
254 	sotpi_getsockopt,	/* sop_getsockopt	*/
255 	sotpi_setsockopt,	/* sop_setsockopt	*/
256 	sotpi_ioctl,		/* sop_ioctl		*/
257 	sotpi_poll,		/* sop_poll		*/
258 	sotpi_close,		/* sop_close		*/
259 };
260 
261 /*
262  * Return a TPI socket vnode.
263  *
264  * Note that sockets assume that the driver will clone (either itself
265  * or by using the clone driver) i.e. a socket() call will always
266  * result in a new vnode being created.
267  */
268 
269 /*
270  * Common create code for socket and accept. If tso is set the values
271  * from that node are used instead of issuing a T_INFO_REQ.
272  */
273 
274 /* ARGSUSED */
275 static struct sonode *
276 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
277     int version, int sflags, int *errorp, cred_t *cr)
278 {
279 	struct sonode	*so;
280 	kmem_cache_t	*cp;
281 
282 	ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
283 
284 	if (family == AF_NCA) {
285 		/*
286 		 * The request is for an NCA socket so for NL7C use the
287 		 * INET domain instead and mark NL7C_AF_NCA below.
288 		 */
289 		family = AF_INET;
290 		/*
291 		 * NL7C is not supported in the non-global zone,
292 		 * we enforce this restriction here.
293 		 */
294 		if (getzoneid() != GLOBAL_ZONEID) {
295 			*errorp = ENOTSUP;
296 			return (NULL);
297 		}
298 	}
299 
300 	/*
301 	 * to be compatible with old tpi socket implementation ignore
302 	 * sleep flag (sflags) passed in
303 	 */
304 	cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
305 	so = kmem_cache_alloc(cp, KM_SLEEP);
306 	if (so == NULL) {
307 		*errorp = ENOMEM;
308 		return (NULL);
309 	}
310 
311 	sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
312 	sotpi_info_init(so);
313 
314 	if (version == SOV_DEFAULT)
315 		version = so_default_version;
316 
317 	so->so_version = (short)version;
318 	*errorp = 0;
319 
320 	return (so);
321 }
322 
323 static void
324 sotpi_destroy(struct sonode *so)
325 {
326 	kmem_cache_t *cp;
327 	struct sockparams *origsp;
328 
329 	/*
330 	 * If there is a new dealloc function (ie. smod_destroy_func),
331 	 * then it should check the correctness of the ops.
332 	 */
333 
334 	ASSERT(so->so_ops == &sotpi_sonodeops);
335 
336 	origsp = SOTOTPI(so)->sti_orig_sp;
337 
338 	sotpi_info_fini(so);
339 
340 	if (so->so_state & SS_FALLBACK_COMP) {
341 		/*
342 		 * A fallback happend, which means that a sotpi_info_t struct
343 		 * was allocated (as opposed to being allocated from the TPI
344 		 * sonode cache. Therefore we explicitly free the struct
345 		 * here.
346 		 */
347 		sotpi_info_destroy(so);
348 		ASSERT(origsp != NULL);
349 
350 		origsp->sp_smod_info->smod_sock_destroy_func(so);
351 		SOCKPARAMS_DEC_REF(origsp);
352 	} else {
353 		sonode_fini(so);
354 		cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
355 		    socktpi_cache;
356 		kmem_cache_free(cp, so);
357 	}
358 }
359 
360 /* ARGSUSED1 */
361 int
362 sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
363 {
364 	major_t maj;
365 	dev_t newdev;
366 	struct vnode *vp;
367 	int error = 0;
368 	struct stdata *stp;
369 
370 	sotpi_info_t *sti = SOTOTPI(so);
371 
372 	dprint(1, ("sotpi_init()\n"));
373 
374 	/*
375 	 * over write the sleep flag passed in but that is ok
376 	 * as tpi socket does not honor sleep flag.
377 	 */
378 	flags |= FREAD|FWRITE;
379 
380 	/*
381 	 * Record in so_flag that it is a clone.
382 	 */
383 	if (getmajor(sti->sti_dev) == clone_major)
384 		so->so_flag |= SOCLONE;
385 
386 	if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
387 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
388 	    (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
389 	    so->so_protocol == IPPROTO_IP)) {
390 		/* Tell tcp or udp that it's talking to sockets */
391 		flags |= SO_SOCKSTR;
392 
393 		/*
394 		 * Here we indicate to socktpi_open() our attempt to
395 		 * make direct calls between sockfs and transport.
396 		 * The final decision is left to socktpi_open().
397 		 */
398 		sti->sti_direct = 1;
399 
400 		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
401 		if (so->so_type == SOCK_STREAM && tso != NULL) {
402 			if (SOTOTPI(tso)->sti_direct) {
403 				/*
404 				 * Inherit sti_direct from listener and pass
405 				 * SO_ACCEPTOR open flag to tcp, indicating
406 				 * that this is an accept fast-path instance.
407 				 */
408 				flags |= SO_ACCEPTOR;
409 			} else {
410 				/*
411 				 * sti_direct is not set on listener, meaning
412 				 * that the listener has been converted from
413 				 * a socket to a stream.  Ensure that the
414 				 * acceptor inherits these settings.
415 				 */
416 				sti->sti_direct = 0;
417 				flags &= ~SO_SOCKSTR;
418 			}
419 		}
420 	}
421 
422 	/*
423 	 * Tell local transport that it is talking to sockets.
424 	 */
425 	if (so->so_family == AF_UNIX) {
426 		flags |= SO_SOCKSTR;
427 	}
428 
429 	vp = SOTOV(so);
430 	newdev = vp->v_rdev;
431 	maj = getmajor(newdev);
432 	ASSERT(STREAMSTAB(maj));
433 
434 	error = stropen(vp, &newdev, flags, cr);
435 
436 	stp = vp->v_stream;
437 	if (error == 0) {
438 		if (so->so_flag & SOCLONE)
439 			ASSERT(newdev != vp->v_rdev);
440 		mutex_enter(&so->so_lock);
441 		sti->sti_dev = newdev;
442 		vp->v_rdev = newdev;
443 		mutex_exit(&so->so_lock);
444 
445 		if (stp->sd_flag & STRISTTY) {
446 			/*
447 			 * this is a post SVR4 tty driver - a socket can not
448 			 * be a controlling terminal. Fail the open.
449 			 */
450 			(void) sotpi_close(so, flags, cr);
451 			return (ENOTTY);	/* XXX */
452 		}
453 
454 		ASSERT(stp->sd_wrq != NULL);
455 		sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
456 
457 		/*
458 		 * If caller is interested in doing direct function call
459 		 * interface to/from transport module, probe the module
460 		 * directly beneath the streamhead to see if it qualifies.
461 		 *
462 		 * We turn off the direct interface when qualifications fail.
463 		 * In the acceptor case, we simply turn off the sti_direct
464 		 * flag on the socket. We do the fallback after the accept
465 		 * has completed, before the new socket is returned to the
466 		 * application.
467 		 */
468 		if (sti->sti_direct) {
469 			queue_t *tq = stp->sd_wrq->q_next;
470 
471 			/*
472 			 * sti_direct is currently supported and tested
473 			 * only for tcp/udp; this is the main reason to
474 			 * have the following assertions.
475 			 */
476 			ASSERT(so->so_family == AF_INET ||
477 			    so->so_family == AF_INET6);
478 			ASSERT(so->so_protocol == IPPROTO_UDP ||
479 			    so->so_protocol == IPPROTO_TCP ||
480 			    so->so_protocol == IPPROTO_IP);
481 			ASSERT(so->so_type == SOCK_DGRAM ||
482 			    so->so_type == SOCK_STREAM);
483 
484 			/*
485 			 * Abort direct call interface if the module directly
486 			 * underneath the stream head is not defined with the
487 			 * _D_DIRECT flag.  This could happen in the tcp or
488 			 * udp case, when some other module is autopushed
489 			 * above it, or for some reasons the expected module
490 			 * isn't purely D_MP (which is the main requirement).
491 			 */
492 			if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
493 			    !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
494 				int rval;
495 
496 				/* Continue on without direct calls */
497 				sti->sti_direct = 0;
498 
499 				/*
500 				 * Cannot issue ioctl on fallback socket since
501 				 * there is no conn associated with the queue.
502 				 * The fallback downcall will notify the proto
503 				 * of the change.
504 				 */
505 				if (!(flags & SO_ACCEPTOR) &&
506 				    !(flags & SO_FALLBACK)) {
507 					if ((error = strioctl(vp,
508 					    _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
509 					    cr, &rval)) != 0) {
510 						(void) sotpi_close(so, flags,
511 						    cr);
512 						return (error);
513 					}
514 				}
515 			}
516 		}
517 
518 		if (flags & SO_FALLBACK) {
519 			/*
520 			 * The stream created does not have a conn.
521 			 * do stream set up after conn has been assigned
522 			 */
523 			return (error);
524 		}
525 		error = so_strinit(so, tso);
526 		if (error != 0) {
527 			(void) sotpi_close(so, flags, cr);
528 			return (error);
529 		}
530 
531 		/* Enable sendfile() on AF_UNIX streams */
532 		if (so->so_family == AF_UNIX && so->so_type == SOCK_STREAM) {
533 			mutex_enter(&so->so_lock);
534 			so->so_mode |= SM_SENDFILESUPP;
535 			mutex_exit(&so->so_lock);
536 		}
537 
538 		/* Wildcard */
539 		if (so->so_protocol != so->so_sockparams->sp_protocol) {
540 			int protocol = so->so_protocol;
541 			/*
542 			 * Issue SO_PROTOTYPE setsockopt.
543 			 */
544 			error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
545 			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
546 			if (error != 0) {
547 				(void) sotpi_close(so, flags, cr);
548 				/*
549 				 * Setsockopt often fails with ENOPROTOOPT but
550 				 * socket() should fail with
551 				 * EPROTONOSUPPORT/EPROTOTYPE.
552 				 */
553 				return (EPROTONOSUPPORT);
554 			}
555 		}
556 
557 	} else {
558 		/*
559 		 * While the same socket can not be reopened (unlike specfs)
560 		 * the stream head sets STREOPENFAIL when the autopush fails.
561 		 */
562 		if ((stp != NULL) &&
563 		    (stp->sd_flag & STREOPENFAIL)) {
564 			/*
565 			 * Open failed part way through.
566 			 */
567 			mutex_enter(&stp->sd_lock);
568 			stp->sd_flag &= ~STREOPENFAIL;
569 			mutex_exit(&stp->sd_lock);
570 			(void) sotpi_close(so, flags, cr);
571 			return (error);
572 			/*NOTREACHED*/
573 		}
574 		ASSERT(stp == NULL);
575 	}
576 	TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
577 	    "sockfs open:maj %d vp %p so %p error %d",
578 	    maj, vp, so, error);
579 	return (error);
580 }
581 
582 /*
583  * Bind the socket to an unspecified address in sockfs only.
584  * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
585  * required in all cases.
586  */
587 static void
588 so_automatic_bind(struct sonode *so)
589 {
590 	sotpi_info_t *sti = SOTOTPI(so);
591 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
592 
593 	ASSERT(MUTEX_HELD(&so->so_lock));
594 	ASSERT(!(so->so_state & SS_ISBOUND));
595 	ASSERT(sti->sti_unbind_mp);
596 
597 	ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
598 	bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
599 	sti->sti_laddr_sa->sa_family = so->so_family;
600 	so->so_state |= SS_ISBOUND;
601 }
602 
603 
604 /*
605  * bind the socket.
606  *
607  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
608  * are passed in we allow rebinding. Note that for backwards compatibility
609  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
610  * Thus the rebinding code is currently not executed.
611  *
612  * The constraints for rebinding are:
613  * - it is a SOCK_DGRAM, or
614  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
615  *   and no listen() has been done.
616  * This rebinding code was added based on some language in the XNET book
617  * about not returning EINVAL if the protocol allows rebinding. However,
618  * this language is not present in the Posix socket draft. Thus maybe the
619  * rebinding logic should be deleted from the source.
620  *
621  * A null "name" can be used to unbind the socket if:
622  * - it is a SOCK_DGRAM, or
623  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
624  *   and no listen() has been done.
625  */
626 /* ARGSUSED */
627 static int
628 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
629     socklen_t namelen, int backlog, int flags, struct cred *cr)
630 {
631 	struct T_bind_req	bind_req;
632 	struct T_bind_ack	*bind_ack;
633 	int			error = 0;
634 	mblk_t			*mp;
635 	void			*addr;
636 	t_uscalar_t		addrlen;
637 	int			unbind_on_err = 1;
638 	boolean_t		clear_acceptconn_on_err = B_FALSE;
639 	boolean_t		restore_backlog_on_err = B_FALSE;
640 	int			save_so_backlog = 0;
641 	t_scalar_t		PRIM_type = O_T_BIND_REQ;
642 	boolean_t		tcp_udp_xport;
643 	sotpi_info_t		*sti = SOTOTPI(so);
644 
645 	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
646 	    (void *)so, (void *)name, namelen, backlog, flags,
647 	    pr_state(so->so_state, so->so_mode)));
648 
649 	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
650 
651 	if (!(flags & _SOBIND_LOCK_HELD)) {
652 		mutex_enter(&so->so_lock);
653 		so_lock_single(so);	/* Set SOLOCKED */
654 	} else {
655 		ASSERT(MUTEX_HELD(&so->so_lock));
656 		ASSERT(so->so_flag & SOLOCKED);
657 	}
658 
659 	/*
660 	 * Make sure that there is a preallocated unbind_req message
661 	 * before binding. This message allocated when the socket is
662 	 * created  but it might be have been consumed.
663 	 */
664 	if (sti->sti_unbind_mp == NULL) {
665 		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
666 		/* NOTE: holding so_lock while sleeping */
667 		sti->sti_unbind_mp =
668 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
669 		    cr);
670 	}
671 
672 	if (flags & _SOBIND_REBIND) {
673 		/*
674 		 * Called from solisten after doing an sotpi_unbind() or
675 		 * potentially without the unbind (latter for AF_INET{,6}).
676 		 */
677 		ASSERT(name == NULL && namelen == 0);
678 
679 		if (so->so_family == AF_UNIX) {
680 			ASSERT(sti->sti_ux_bound_vp);
681 			addr = &sti->sti_ux_laddr;
682 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
683 			dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
684 			    "addr 0x%p, vp %p\n",
685 			    addrlen,
686 			    (void *)((struct so_ux_addr *)addr)->soua_vp,
687 			    (void *)sti->sti_ux_bound_vp));
688 		} else {
689 			addr = sti->sti_laddr_sa;
690 			addrlen = (t_uscalar_t)sti->sti_laddr_len;
691 		}
692 	} else if (flags & _SOBIND_UNSPEC) {
693 		ASSERT(name == NULL && namelen == 0);
694 
695 		/*
696 		 * The caller checked SS_ISBOUND but not necessarily
697 		 * under so_lock
698 		 */
699 		if (so->so_state & SS_ISBOUND) {
700 			/* No error */
701 			goto done;
702 		}
703 
704 		/* Set an initial local address */
705 		switch (so->so_family) {
706 		case AF_UNIX:
707 			/*
708 			 * Use an address with same size as struct sockaddr
709 			 * just like BSD.
710 			 */
711 			sti->sti_laddr_len =
712 			    (socklen_t)sizeof (struct sockaddr);
713 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
714 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
715 			sti->sti_laddr_sa->sa_family = so->so_family;
716 
717 			/*
718 			 * Pass down an address with the implicit bind
719 			 * magic number and the rest all zeros.
720 			 * The transport will return a unique address.
721 			 */
722 			sti->sti_ux_laddr.soua_vp = NULL;
723 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
724 			addr = &sti->sti_ux_laddr;
725 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
726 			break;
727 
728 		case AF_INET:
729 		case AF_INET6:
730 			/*
731 			 * An unspecified bind in TPI has a NULL address.
732 			 * Set the address in sockfs to have the sa_family.
733 			 */
734 			sti->sti_laddr_len = (so->so_family == AF_INET) ?
735 			    (socklen_t)sizeof (sin_t) :
736 			    (socklen_t)sizeof (sin6_t);
737 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
738 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
739 			sti->sti_laddr_sa->sa_family = so->so_family;
740 			addr = NULL;
741 			addrlen = 0;
742 			break;
743 
744 		default:
745 			/*
746 			 * An unspecified bind in TPI has a NULL address.
747 			 * Set the address in sockfs to be zero length.
748 			 *
749 			 * Can not assume there is a sa_family for all
750 			 * protocol families. For example, AF_X25 does not
751 			 * have a family field.
752 			 */
753 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
754 			sti->sti_laddr_len = 0;	/* XXX correct? */
755 			addr = NULL;
756 			addrlen = 0;
757 			break;
758 		}
759 
760 	} else {
761 		if (so->so_state & SS_ISBOUND) {
762 			/*
763 			 * If it is ok to rebind the socket, first unbind
764 			 * with the transport. A rebind to the NULL address
765 			 * is interpreted as an unbind.
766 			 * Note that a bind to NULL in BSD does unbind the
767 			 * socket but it fails with EINVAL.
768 			 * Note that regular sockets set SOV_SOCKBSD i.e.
769 			 * _SOBIND_SOCKBSD gets set here hence no type of
770 			 * socket does currently allow rebinding.
771 			 *
772 			 * If the name is NULL just do an unbind.
773 			 */
774 			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
775 			    name != NULL) {
776 				error = EINVAL;
777 				unbind_on_err = 0;
778 				eprintsoline(so, error);
779 				goto done;
780 			}
781 			if ((so->so_mode & SM_CONNREQUIRED) &&
782 			    (so->so_state & SS_CANTREBIND)) {
783 				error = EINVAL;
784 				unbind_on_err = 0;
785 				eprintsoline(so, error);
786 				goto done;
787 			}
788 			error = sotpi_unbind(so, 0);
789 			if (error) {
790 				eprintsoline(so, error);
791 				goto done;
792 			}
793 			ASSERT(!(so->so_state & SS_ISBOUND));
794 			if (name == NULL) {
795 				so->so_state &=
796 				    ~(SS_ISCONNECTED|SS_ISCONNECTING);
797 				goto done;
798 			}
799 		}
800 
801 		/* X/Open requires this check */
802 		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
803 			if (xnet_check_print) {
804 				printf("sockfs: X/Open bind state check "
805 				    "caused EINVAL\n");
806 			}
807 			error = EINVAL;
808 			goto done;
809 		}
810 
811 		switch (so->so_family) {
812 		case AF_UNIX:
813 			/*
814 			 * All AF_UNIX addresses are nul terminated
815 			 * when copied (copyin_name) in so the minimum
816 			 * length is 3 bytes.
817 			 */
818 			if (name == NULL ||
819 			    (ssize_t)namelen <= sizeof (short) + 1) {
820 				error = EISDIR;
821 				eprintsoline(so, error);
822 				goto done;
823 			}
824 			/*
825 			 * Verify so_family matches the bound family.
826 			 * BSD does not check this for AF_UNIX resulting
827 			 * in funny mknods.
828 			 */
829 			if (name->sa_family != so->so_family) {
830 				error = EAFNOSUPPORT;
831 				goto done;
832 			}
833 			break;
834 		case AF_INET:
835 			if (name == NULL) {
836 				error = EINVAL;
837 				eprintsoline(so, error);
838 				goto done;
839 			}
840 			if ((size_t)namelen != sizeof (sin_t)) {
841 				error = name->sa_family != so->so_family ?
842 				    EAFNOSUPPORT : EINVAL;
843 				eprintsoline(so, error);
844 				goto done;
845 			}
846 			if ((flags & _SOBIND_XPG4_2) &&
847 			    (name->sa_family != so->so_family)) {
848 				/*
849 				 * This check has to be made for X/Open
850 				 * sockets however application failures have
851 				 * been observed when it is applied to
852 				 * all sockets.
853 				 */
854 				error = EAFNOSUPPORT;
855 				eprintsoline(so, error);
856 				goto done;
857 			}
858 			/*
859 			 * Force a zero sa_family to match so_family.
860 			 *
861 			 * Some programs like inetd(8) don't set the
862 			 * family field. Other programs leave
863 			 * sin_family set to garbage - SunOS 4.X does
864 			 * not check the family field on a bind.
865 			 * We use the family field that
866 			 * was passed in to the socket() call.
867 			 */
868 			name->sa_family = so->so_family;
869 			break;
870 
871 		case AF_INET6: {
872 #ifdef DEBUG
873 			sin6_t *sin6 = (sin6_t *)name;
874 #endif /* DEBUG */
875 
876 			if (name == NULL) {
877 				error = EINVAL;
878 				eprintsoline(so, error);
879 				goto done;
880 			}
881 			if ((size_t)namelen != sizeof (sin6_t)) {
882 				error = name->sa_family != so->so_family ?
883 				    EAFNOSUPPORT : EINVAL;
884 				eprintsoline(so, error);
885 				goto done;
886 			}
887 			if (name->sa_family != so->so_family) {
888 				/*
889 				 * With IPv6 we require the family to match
890 				 * unlike in IPv4.
891 				 */
892 				error = EAFNOSUPPORT;
893 				eprintsoline(so, error);
894 				goto done;
895 			}
896 #ifdef DEBUG
897 			/*
898 			 * Verify that apps don't forget to clear
899 			 * sin6_scope_id etc
900 			 */
901 			if (sin6->sin6_scope_id != 0 &&
902 			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
903 				zcmn_err(getzoneid(), CE_WARN,
904 				    "bind with uninitialized sin6_scope_id "
905 				    "(%d) on socket. Pid = %d\n",
906 				    (int)sin6->sin6_scope_id,
907 				    (int)curproc->p_pid);
908 			}
909 			if (sin6->__sin6_src_id != 0) {
910 				zcmn_err(getzoneid(), CE_WARN,
911 				    "bind with uninitialized __sin6_src_id "
912 				    "(%d) on socket. Pid = %d\n",
913 				    (int)sin6->__sin6_src_id,
914 				    (int)curproc->p_pid);
915 			}
916 #endif /* DEBUG */
917 			break;
918 		}
919 		default:
920 			/*
921 			 * Don't do any length or sa_family check to allow
922 			 * non-sockaddr style addresses.
923 			 */
924 			if (name == NULL) {
925 				error = EINVAL;
926 				eprintsoline(so, error);
927 				goto done;
928 			}
929 			break;
930 		}
931 
932 		if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
933 			error = ENAMETOOLONG;
934 			eprintsoline(so, error);
935 			goto done;
936 		}
937 		/*
938 		 * Save local address.
939 		 */
940 		sti->sti_laddr_len = (socklen_t)namelen;
941 		ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
942 		bcopy(name, sti->sti_laddr_sa, namelen);
943 
944 		addr = sti->sti_laddr_sa;
945 		addrlen = (t_uscalar_t)sti->sti_laddr_len;
946 		switch (so->so_family) {
947 		case AF_INET6:
948 		case AF_INET:
949 			break;
950 		case AF_UNIX: {
951 			struct sockaddr_un *soun =
952 			    (struct sockaddr_un *)sti->sti_laddr_sa;
953 			struct vnode *vp, *rvp;
954 			struct vattr vattr;
955 
956 			ASSERT(sti->sti_ux_bound_vp == NULL);
957 			/*
958 			 * Create vnode for the specified path name.
959 			 * Keep vnode held with a reference in sti_ux_bound_vp.
960 			 * Use the vnode pointer as the address used in the
961 			 * bind with the transport.
962 			 *
963 			 * Use the same mode as in BSD. In particular this does
964 			 * not observe the umask.
965 			 */
966 			/* MAXPATHLEN + soun_family + nul termination */
967 			if (sti->sti_laddr_len >
968 			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
969 				error = ENAMETOOLONG;
970 				eprintsoline(so, error);
971 				goto done;
972 			}
973 			vattr.va_type = VSOCK;
974 			vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
975 			vattr.va_mask = AT_TYPE|AT_MODE;
976 			/* NOTE: holding so_lock */
977 			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
978 			    EXCL, 0, &vp, CRMKNOD, 0, 0);
979 			if (error) {
980 				if (error == EEXIST)
981 					error = EADDRINUSE;
982 				eprintsoline(so, error);
983 				goto done;
984 			}
985 			/*
986 			 * Establish pointer from the underlying filesystem
987 			 * vnode to the socket node.
988 			 * sti_ux_bound_vp and v_stream->sd_vnode form the
989 			 * cross-linkage between the underlying filesystem
990 			 * node and the socket node.
991 			 */
992 
993 			if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
994 				VN_HOLD(rvp);
995 				VN_RELE(vp);
996 				vp = rvp;
997 			}
998 
999 			ASSERT(SOTOV(so)->v_stream);
1000 			mutex_enter(&vp->v_lock);
1001 			vp->v_stream = SOTOV(so)->v_stream;
1002 			sti->sti_ux_bound_vp = vp;
1003 			mutex_exit(&vp->v_lock);
1004 
1005 			/*
1006 			 * Use the vnode pointer value as a unique address
1007 			 * (together with the magic number to avoid conflicts
1008 			 * with implicit binds) in the transport provider.
1009 			 */
1010 			sti->sti_ux_laddr.soua_vp =
1011 			    (void *)sti->sti_ux_bound_vp;
1012 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
1013 			addr = &sti->sti_ux_laddr;
1014 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1015 			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1016 			    addrlen,
1017 			    (void *)((struct so_ux_addr *)addr)->soua_vp));
1018 			break;
1019 		}
1020 		} /* end switch (so->so_family) */
1021 	}
1022 
1023 	/*
1024 	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1025 	 * the transport can start passing up T_CONN_IND messages
1026 	 * as soon as it receives the bind req and strsock_proto()
1027 	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1028 	 */
1029 	if (flags & _SOBIND_LISTEN) {
1030 		if ((so->so_state & SS_ACCEPTCONN) == 0)
1031 			clear_acceptconn_on_err = B_TRUE;
1032 		save_so_backlog = so->so_backlog;
1033 		restore_backlog_on_err = B_TRUE;
1034 		so->so_state |= SS_ACCEPTCONN;
1035 		so->so_backlog = backlog;
1036 	}
1037 
1038 	/*
1039 	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1040 	 * for other transports we will send in a O_T_BIND_REQ.
1041 	 */
1042 	if (tcp_udp_xport &&
1043 	    (so->so_family == AF_INET || so->so_family == AF_INET6))
1044 		PRIM_type = T_BIND_REQ;
1045 
1046 	bind_req.PRIM_type = PRIM_type;
1047 	bind_req.ADDR_length = addrlen;
1048 	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1049 	bind_req.CONIND_number = backlog;
1050 	/* NOTE: holding so_lock while sleeping */
1051 	mp = soallocproto2(&bind_req, sizeof (bind_req),
1052 	    addr, addrlen, 0, _ALLOC_SLEEP, cr);
1053 	sti->sti_laddr_valid = 0;
1054 
1055 	/* Done using sti_laddr_sa - can drop the lock */
1056 	mutex_exit(&so->so_lock);
1057 
1058 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1059 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1060 	if (error) {
1061 		eprintsoline(so, error);
1062 		mutex_enter(&so->so_lock);
1063 		goto done;
1064 	}
1065 
1066 	mutex_enter(&so->so_lock);
1067 	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1068 	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1069 	if (error) {
1070 		eprintsoline(so, error);
1071 		goto done;
1072 	}
1073 	ASSERT(mp);
1074 	/*
1075 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1076 	 * strsock_proto while the lock was dropped above, the bind
1077 	 * is allowed to complete.
1078 	 */
1079 
1080 	/* Mark as bound. This will be undone if we detect errors below. */
1081 	if (flags & _SOBIND_NOXLATE) {
1082 		ASSERT(so->so_family == AF_UNIX);
1083 		sti->sti_faddr_noxlate = 1;
1084 	}
1085 	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1086 	so->so_state |= SS_ISBOUND;
1087 	ASSERT(sti->sti_unbind_mp);
1088 
1089 	/* note that we've already set SS_ACCEPTCONN above */
1090 
1091 	/*
1092 	 * Recompute addrlen - an unspecified bind sent down an
1093 	 * address of length zero but we expect the appropriate length
1094 	 * in return.
1095 	 */
1096 	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1097 	    sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1098 
1099 	bind_ack = (struct T_bind_ack *)mp->b_rptr;
1100 	/*
1101 	 * The alignment restriction is really too strict but
1102 	 * we want enough alignment to inspect the fields of
1103 	 * a sockaddr_in.
1104 	 */
1105 	addr = sogetoff(mp, bind_ack->ADDR_offset,
1106 	    bind_ack->ADDR_length,
1107 	    __TPI_ALIGN_SIZE);
1108 	if (addr == NULL) {
1109 		freemsg(mp);
1110 		error = EPROTO;
1111 		eprintsoline(so, error);
1112 		goto done;
1113 	}
1114 	if (!(flags & _SOBIND_UNSPEC)) {
1115 		/*
1116 		 * Verify that the transport didn't return something we
1117 		 * did not want e.g. an address other than what we asked for.
1118 		 *
1119 		 * NOTE: These checks would go away if/when we switch to
1120 		 * using the new TPI (in which the transport would fail
1121 		 * the request instead of assigning a different address).
1122 		 *
1123 		 * NOTE2: For protocols that we don't know (i.e. any
1124 		 * other than AF_INET6, AF_INET and AF_UNIX), we
1125 		 * cannot know if the transport should be expected to
1126 		 * return the same address as that requested.
1127 		 *
1128 		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1129 		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1130 		 *
1131 		 * For example, in the case of netatalk it may be
1132 		 * inappropriate for the transport to return the
1133 		 * requested address (as it may have allocated a local
1134 		 * port number in behaviour similar to that of an
1135 		 * AF_INET bind request with a port number of zero).
1136 		 *
1137 		 * Given the definition of O_T_BIND_REQ, where the
1138 		 * transport may bind to an address other than the
1139 		 * requested address, it's not possible to determine
1140 		 * whether a returned address that differs from the
1141 		 * requested address is a reason to fail (because the
1142 		 * requested address was not available) or succeed
1143 		 * (because the transport allocated an appropriate
1144 		 * address and/or port).
1145 		 *
1146 		 * sockfs currently requires that the transport return
1147 		 * the requested address in the T_BIND_ACK, unless
1148 		 * there is code here to allow for any discrepancy.
1149 		 * Such code exists for AF_INET and AF_INET6.
1150 		 *
1151 		 * Netatalk chooses to return the requested address
1152 		 * rather than the (correct) allocated address.  This
1153 		 * means that netatalk violates the TPI specification
1154 		 * (and would not function correctly if used from a
1155 		 * TLI application), but it does mean that it works
1156 		 * with sockfs.
1157 		 *
1158 		 * As noted above, using the newer XTI bind primitive
1159 		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1160 		 * allow sockfs to be more sure about whether or not
1161 		 * the bind request had succeeded (as transports are
1162 		 * not permitted to bind to a different address than
1163 		 * that requested - they must return failure).
1164 		 * Unfortunately, support for T_BIND_REQ may not be
1165 		 * present in all transport implementations (netatalk,
1166 		 * for example, doesn't have it), making the
1167 		 * transition difficult.
1168 		 */
1169 		if (bind_ack->ADDR_length != addrlen) {
1170 			/* Assumes that the requested address was in use */
1171 			freemsg(mp);
1172 			error = EADDRINUSE;
1173 			eprintsoline(so, error);
1174 			goto done;
1175 		}
1176 
1177 		switch (so->so_family) {
1178 		case AF_INET6:
1179 		case AF_INET: {
1180 			sin_t *rname, *aname;
1181 
1182 			rname = (sin_t *)addr;
1183 			aname = (sin_t *)sti->sti_laddr_sa;
1184 
1185 			/*
1186 			 * Take advantage of the alignment
1187 			 * of sin_port and sin6_port which fall
1188 			 * in the same place in their data structures.
1189 			 * Just use sin_port for either address family.
1190 			 *
1191 			 * This may become a problem if (heaven forbid)
1192 			 * there's a separate ipv6port_reserved... :-P
1193 			 *
1194 			 * Binding to port 0 has the semantics of letting
1195 			 * the transport bind to any port.
1196 			 *
1197 			 * If the transport is TCP or UDP since we had sent
1198 			 * a T_BIND_REQ we would not get a port other than
1199 			 * what we asked for.
1200 			 */
1201 			if (tcp_udp_xport) {
1202 				/*
1203 				 * Pick up the new port number if we bound to
1204 				 * port 0.
1205 				 */
1206 				if (aname->sin_port == 0)
1207 					aname->sin_port = rname->sin_port;
1208 				sti->sti_laddr_valid = 1;
1209 				break;
1210 			}
1211 			if (aname->sin_port != 0 &&
1212 			    aname->sin_port != rname->sin_port) {
1213 				freemsg(mp);
1214 				error = EADDRINUSE;
1215 				eprintsoline(so, error);
1216 				goto done;
1217 			}
1218 			/*
1219 			 * Pick up the new port number if we bound to port 0.
1220 			 */
1221 			aname->sin_port = rname->sin_port;
1222 
1223 			/*
1224 			 * Unfortunately, addresses aren't _quite_ the same.
1225 			 */
1226 			if (so->so_family == AF_INET) {
1227 				if (aname->sin_addr.s_addr !=
1228 				    rname->sin_addr.s_addr) {
1229 					freemsg(mp);
1230 					error = EADDRNOTAVAIL;
1231 					eprintsoline(so, error);
1232 					goto done;
1233 				}
1234 			} else {
1235 				sin6_t *rname6 = (sin6_t *)rname;
1236 				sin6_t *aname6 = (sin6_t *)aname;
1237 
1238 				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1239 				    &rname6->sin6_addr)) {
1240 					freemsg(mp);
1241 					error = EADDRNOTAVAIL;
1242 					eprintsoline(so, error);
1243 					goto done;
1244 				}
1245 			}
1246 			break;
1247 		}
1248 		case AF_UNIX:
1249 			if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1250 				freemsg(mp);
1251 				error = EADDRINUSE;
1252 				eprintsoline(so, error);
1253 				eprintso(so,
1254 				    ("addrlen %d, addr 0x%x, vp %p\n",
1255 				    addrlen, *((int *)addr),
1256 				    (void *)sti->sti_ux_bound_vp));
1257 				goto done;
1258 			}
1259 			sti->sti_laddr_valid = 1;
1260 			break;
1261 		default:
1262 			/*
1263 			 * NOTE: This assumes that addresses can be
1264 			 * byte-compared for equivalence.
1265 			 */
1266 			if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1267 				freemsg(mp);
1268 				error = EADDRINUSE;
1269 				eprintsoline(so, error);
1270 				goto done;
1271 			}
1272 			/*
1273 			 * Don't mark sti_laddr_valid, as we cannot be
1274 			 * sure that the returned address is the real
1275 			 * bound address when talking to an unknown
1276 			 * transport.
1277 			 */
1278 			break;
1279 		}
1280 	} else {
1281 		/*
1282 		 * Save the returned address for getsockname.
1283 		 * Needed for unspecific bind unless transport supports
1284 		 * the TI_GETMYNAME ioctl.
1285 		 * Do this for AF_INET{,6} even though they do, as
1286 		 * caching info here is much better performance than
1287 		 * a TPI/STREAMS trip to the transport for getsockname.
1288 		 * Any which can't for some reason _must_ _not_ set
1289 		 * sti_laddr_valid here for the caching version of
1290 		 * getsockname to not break;
1291 		 */
1292 		switch (so->so_family) {
1293 		case AF_UNIX:
1294 			/*
1295 			 * Record the address bound with the transport
1296 			 * for use by socketpair.
1297 			 */
1298 			bcopy(addr, &sti->sti_ux_laddr, addrlen);
1299 			sti->sti_laddr_valid = 1;
1300 			break;
1301 		case AF_INET:
1302 		case AF_INET6:
1303 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1304 			bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1305 			sti->sti_laddr_valid = 1;
1306 			break;
1307 		default:
1308 			/*
1309 			 * Don't mark sti_laddr_valid, as we cannot be
1310 			 * sure that the returned address is the real
1311 			 * bound address when talking to an unknown
1312 			 * transport.
1313 			 */
1314 			break;
1315 		}
1316 	}
1317 
1318 	freemsg(mp);
1319 
1320 done:
1321 	if (error) {
1322 		/* reset state & backlog to values held on entry */
1323 		if (clear_acceptconn_on_err == B_TRUE)
1324 			so->so_state &= ~SS_ACCEPTCONN;
1325 		if (restore_backlog_on_err == B_TRUE)
1326 			so->so_backlog = save_so_backlog;
1327 
1328 		if (unbind_on_err && so->so_state & SS_ISBOUND) {
1329 			int err;
1330 
1331 			err = sotpi_unbind(so, 0);
1332 			/* LINTED - statement has no consequent: if */
1333 			if (err) {
1334 				eprintsoline(so, error);
1335 			} else {
1336 				ASSERT(!(so->so_state & SS_ISBOUND));
1337 			}
1338 		}
1339 	}
1340 	if (!(flags & _SOBIND_LOCK_HELD)) {
1341 		so_unlock_single(so, SOLOCKED);
1342 		mutex_exit(&so->so_lock);
1343 	} else {
1344 		ASSERT(MUTEX_HELD(&so->so_lock));
1345 		ASSERT(so->so_flag & SOLOCKED);
1346 	}
1347 	return (error);
1348 }
1349 
1350 /* bind the socket */
1351 static int
1352 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1353     int flags, struct cred *cr)
1354 {
1355 	if ((flags & _SOBIND_SOCKETPAIR) == 0)
1356 		return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1357 
1358 	flags &= ~_SOBIND_SOCKETPAIR;
1359 	return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1360 }
1361 
1362 /*
1363  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1364  * address, or when listen needs to unbind and bind.
1365  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1366  * so that a sobind can pick them up.
1367  */
1368 static int
1369 sotpi_unbind(struct sonode *so, int flags)
1370 {
1371 	struct T_unbind_req	unbind_req;
1372 	int			error = 0;
1373 	mblk_t			*mp;
1374 	sotpi_info_t		*sti = SOTOTPI(so);
1375 
1376 	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1377 	    (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1378 
1379 	ASSERT(MUTEX_HELD(&so->so_lock));
1380 	ASSERT(so->so_flag & SOLOCKED);
1381 
1382 	if (!(so->so_state & SS_ISBOUND)) {
1383 		error = EINVAL;
1384 		eprintsoline(so, error);
1385 		goto done;
1386 	}
1387 
1388 	mutex_exit(&so->so_lock);
1389 
1390 	/*
1391 	 * Flush the read and write side (except stream head read queue)
1392 	 * and send down T_UNBIND_REQ.
1393 	 */
1394 	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1395 
1396 	unbind_req.PRIM_type = T_UNBIND_REQ;
1397 	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1398 	    0, _ALLOC_SLEEP, CRED());
1399 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1400 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1401 	mutex_enter(&so->so_lock);
1402 	if (error) {
1403 		eprintsoline(so, error);
1404 		goto done;
1405 	}
1406 
1407 	error = sowaitokack(so, T_UNBIND_REQ);
1408 	if (error) {
1409 		eprintsoline(so, error);
1410 		goto done;
1411 	}
1412 
1413 	/*
1414 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1415 	 * strsock_proto while the lock was dropped above, the unbind
1416 	 * is allowed to complete.
1417 	 */
1418 	if (!(flags & _SOUNBIND_REBIND)) {
1419 		/*
1420 		 * Clear out bound address.
1421 		 */
1422 		vnode_t *vp;
1423 
1424 		if ((vp = sti->sti_ux_bound_vp) != NULL) {
1425 			sti->sti_ux_bound_vp = NULL;
1426 			vn_rele_stream(vp);
1427 		}
1428 		/* Clear out address */
1429 		sti->sti_laddr_len = 0;
1430 	}
1431 	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1432 	sti->sti_laddr_valid = 0;
1433 
1434 done:
1435 
1436 	/* If the caller held the lock don't release it here */
1437 	ASSERT(MUTEX_HELD(&so->so_lock));
1438 	ASSERT(so->so_flag & SOLOCKED);
1439 
1440 	return (error);
1441 }
1442 
1443 /*
1444  * listen on the socket.
1445  * For TPI conforming transports this has to first unbind with the transport
1446  * and then bind again using the new backlog.
1447  */
1448 /* ARGSUSED */
1449 int
1450 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1451 {
1452 	int		error = 0;
1453 	sotpi_info_t	*sti = SOTOTPI(so);
1454 
1455 	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1456 	    (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1457 
1458 	if (sti->sti_serv_type == T_CLTS)
1459 		return (EOPNOTSUPP);
1460 
1461 	/*
1462 	 * If the socket is ready to accept connections already, then
1463 	 * return without doing anything.  This avoids a problem where
1464 	 * a second listen() call fails if a connection is pending and
1465 	 * leaves the socket unbound. Only when we are not unbinding
1466 	 * with the transport can we safely increase the backlog.
1467 	 */
1468 	if (so->so_state & SS_ACCEPTCONN &&
1469 	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1470 	    /*CONSTCOND*/
1471 	    !solisten_tpi_tcp))
1472 		return (0);
1473 
1474 	if (so->so_state & SS_ISCONNECTED)
1475 		return (EINVAL);
1476 
1477 	mutex_enter(&so->so_lock);
1478 	so_lock_single(so);	/* Set SOLOCKED */
1479 
1480 	/*
1481 	 * If the listen doesn't change the backlog we do nothing.
1482 	 * This avoids an EPROTO error from the transport.
1483 	 */
1484 	if ((so->so_state & SS_ACCEPTCONN) &&
1485 	    so->so_backlog == backlog)
1486 		goto done;
1487 
1488 	if (!(so->so_state & SS_ISBOUND)) {
1489 		/*
1490 		 * Must have been explicitly bound in the UNIX domain.
1491 		 */
1492 		if (so->so_family == AF_UNIX) {
1493 			error = EINVAL;
1494 			goto done;
1495 		}
1496 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1497 		    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1498 	} else if (backlog > 0) {
1499 		/*
1500 		 * AF_INET{,6} hack to avoid losing the port.
1501 		 * Assumes that all AF_INET{,6} transports can handle a
1502 		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1503 		 * has already bound thus it is possible to avoid the unbind.
1504 		 */
1505 		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1506 		    /*CONSTCOND*/
1507 		    !solisten_tpi_tcp)) {
1508 			error = sotpi_unbind(so, _SOUNBIND_REBIND);
1509 			if (error)
1510 				goto done;
1511 		}
1512 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1513 		    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1514 	} else {
1515 		so->so_state |= SS_ACCEPTCONN;
1516 		so->so_backlog = backlog;
1517 	}
1518 	if (error)
1519 		goto done;
1520 	ASSERT(so->so_state & SS_ACCEPTCONN);
1521 done:
1522 	so_unlock_single(so, SOLOCKED);
1523 	mutex_exit(&so->so_lock);
1524 	return (error);
1525 }
1526 
1527 /*
1528  * Disconnect either a specified seqno or all (-1).
1529  * The former is used on listening sockets only.
1530  *
1531  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1532  * the current use of sodisconnect(seqno == -1) is only for shutdown
1533  * so there is no point (and potentially incorrect) to unbind.
1534  */
1535 static int
1536 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1537 {
1538 	struct T_discon_req	discon_req;
1539 	int			error = 0;
1540 	mblk_t			*mp;
1541 
1542 	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1543 	    (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1544 
1545 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1546 		mutex_enter(&so->so_lock);
1547 		so_lock_single(so);	/* Set SOLOCKED */
1548 	} else {
1549 		ASSERT(MUTEX_HELD(&so->so_lock));
1550 		ASSERT(so->so_flag & SOLOCKED);
1551 	}
1552 
1553 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1554 		error = EINVAL;
1555 		eprintsoline(so, error);
1556 		goto done;
1557 	}
1558 
1559 	mutex_exit(&so->so_lock);
1560 	/*
1561 	 * Flush the write side (unless this is a listener)
1562 	 * and then send down a T_DISCON_REQ.
1563 	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1564 	 * and other messages.)
1565 	 */
1566 	if (!(so->so_state & SS_ACCEPTCONN))
1567 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1568 
1569 	discon_req.PRIM_type = T_DISCON_REQ;
1570 	discon_req.SEQ_number = seqno;
1571 	mp = soallocproto1(&discon_req, sizeof (discon_req),
1572 	    0, _ALLOC_SLEEP, CRED());
1573 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1574 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1575 	mutex_enter(&so->so_lock);
1576 	if (error) {
1577 		eprintsoline(so, error);
1578 		goto done;
1579 	}
1580 
1581 	error = sowaitokack(so, T_DISCON_REQ);
1582 	if (error) {
1583 		eprintsoline(so, error);
1584 		goto done;
1585 	}
1586 	/*
1587 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1588 	 * strsock_proto while the lock was dropped above, the disconnect
1589 	 * is allowed to complete. However, it is not possible to
1590 	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1591 	 */
1592 	so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
1593 	SOTOTPI(so)->sti_laddr_valid = 0;
1594 	SOTOTPI(so)->sti_faddr_valid = 0;
1595 done:
1596 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1597 		so_unlock_single(so, SOLOCKED);
1598 		mutex_exit(&so->so_lock);
1599 	} else {
1600 		/* If the caller held the lock don't release it here */
1601 		ASSERT(MUTEX_HELD(&so->so_lock));
1602 		ASSERT(so->so_flag & SOLOCKED);
1603 	}
1604 	return (error);
1605 }
1606 
1607 /* ARGSUSED */
1608 int
1609 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1610     struct sonode **nsop)
1611 {
1612 	struct T_conn_ind	*conn_ind;
1613 	struct T_conn_res	*conn_res;
1614 	int			error = 0;
1615 	mblk_t			*mp, *ack_mp;
1616 	struct sonode		*nso;
1617 	vnode_t			*nvp;
1618 	void			*src;
1619 	t_uscalar_t		srclen;
1620 	void			*opt;
1621 	t_uscalar_t		optlen;
1622 	t_scalar_t		PRIM_type;
1623 	t_scalar_t		SEQ_number;
1624 	size_t			sinlen;
1625 	sotpi_info_t		*sti = SOTOTPI(so);
1626 	sotpi_info_t		*nsti;
1627 
1628 	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1629 	    (void *)so, fflag, (void *)nsop,
1630 	    pr_state(so->so_state, so->so_mode)));
1631 
1632 	/*
1633 	 * Defer single-threading the accepting socket until
1634 	 * the T_CONN_IND has been received and parsed and the
1635 	 * new sonode has been opened.
1636 	 */
1637 
1638 	/* Check that we are not already connected */
1639 	if ((so->so_state & SS_ACCEPTCONN) == 0)
1640 		goto conn_bad;
1641 
1642 	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1643 		goto e_bad;
1644 
1645 	ASSERT(mp != NULL);
1646 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1647 
1648 	/*
1649 	 * Save SEQ_number for error paths.
1650 	 */
1651 	SEQ_number = conn_ind->SEQ_number;
1652 
1653 	srclen = conn_ind->SRC_length;
1654 	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1655 	if (src == NULL) {
1656 		error = EPROTO;
1657 		freemsg(mp);
1658 		eprintsoline(so, error);
1659 		goto disconnect_unlocked;
1660 	}
1661 	optlen = conn_ind->OPT_length;
1662 	switch (so->so_family) {
1663 	case AF_INET:
1664 	case AF_INET6:
1665 		if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1666 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
1667 			    &opt, conn_ind->OPT_length);
1668 		} else {
1669 			/*
1670 			 * The transport (in this case TCP) hasn't sent up
1671 			 * a pointer to an instance for the accept fast-path.
1672 			 * Disable fast-path completely because the call to
1673 			 * sotpi_create() below would otherwise create an
1674 			 * incomplete TCP instance, which would lead to
1675 			 * problems when sockfs sends a normal T_CONN_RES
1676 			 * message down the new stream.
1677 			 */
1678 			if (sti->sti_direct) {
1679 				int rval;
1680 				/*
1681 				 * For consistency we inform tcp to disable
1682 				 * direct interface on the listener, though
1683 				 * we can certainly live without doing this
1684 				 * because no data will ever travel upstream
1685 				 * on the listening socket.
1686 				 */
1687 				sti->sti_direct = 0;
1688 				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1689 				    0, 0, K_TO_K, cr, &rval);
1690 			}
1691 			opt = NULL;
1692 			optlen = 0;
1693 		}
1694 		break;
1695 	case AF_UNIX:
1696 	default:
1697 		if (optlen != 0) {
1698 			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1699 			    __TPI_ALIGN_SIZE);
1700 			if (opt == NULL) {
1701 				error = EPROTO;
1702 				freemsg(mp);
1703 				eprintsoline(so, error);
1704 				goto disconnect_unlocked;
1705 			}
1706 		}
1707 		if (so->so_family == AF_UNIX) {
1708 			if (!sti->sti_faddr_noxlate) {
1709 				src = NULL;
1710 				srclen = 0;
1711 			}
1712 			/* Extract src address from options */
1713 			if (optlen != 0)
1714 				so_getopt_srcaddr(opt, optlen, &src, &srclen);
1715 		}
1716 		break;
1717 	}
1718 
1719 	/*
1720 	 * Create the new socket.
1721 	 */
1722 	nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1723 	if (nso == NULL) {
1724 		ASSERT(error != 0);
1725 		/*
1726 		 * Accept can not fail with ENOBUFS. sotpi_create
1727 		 * sleeps waiting for memory until a signal is caught
1728 		 * so return EINTR.
1729 		 */
1730 		freemsg(mp);
1731 		if (error == ENOBUFS)
1732 			error = EINTR;
1733 		goto e_disc_unl;
1734 	}
1735 	nvp = SOTOV(nso);
1736 	nsti = SOTOTPI(nso);
1737 
1738 #ifdef DEBUG
1739 	/*
1740 	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1741 	 * it's inherited early to allow debugging of the accept code itself.
1742 	 */
1743 	nso->so_options |= so->so_options & SO_DEBUG;
1744 #endif /* DEBUG */
1745 
1746 	/*
1747 	 * Save the SRC address from the T_CONN_IND
1748 	 * for getpeername to work on AF_UNIX and on transports that do not
1749 	 * support TI_GETPEERNAME.
1750 	 *
1751 	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1752 	 * copyin_name().
1753 	 */
1754 	if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1755 		error = EINVAL;
1756 		freemsg(mp);
1757 		eprintsoline(so, error);
1758 		goto disconnect_vp_unlocked;
1759 	}
1760 	nsti->sti_faddr_len = (socklen_t)srclen;
1761 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1762 	bcopy(src, nsti->sti_faddr_sa, srclen);
1763 	nsti->sti_faddr_valid = 1;
1764 
1765 	/*
1766 	 * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1767 	 */
1768 	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1769 	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1770 		cred_t	*cr;
1771 		pid_t	cpid;
1772 
1773 		cr = msg_getcred(mp, &cpid);
1774 		if (cr != NULL) {
1775 			crhold(cr);
1776 			nso->so_peercred = cr;
1777 			nso->so_cpid = cpid;
1778 		}
1779 		freemsg(mp);
1780 
1781 		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1782 		    sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1783 		if (mp == NULL) {
1784 			/*
1785 			 * Accept can not fail with ENOBUFS.
1786 			 * A signal was caught so return EINTR.
1787 			 */
1788 			error = EINTR;
1789 			eprintsoline(so, error);
1790 			goto disconnect_vp_unlocked;
1791 		}
1792 		conn_res = (struct T_conn_res *)mp->b_rptr;
1793 	} else {
1794 		/*
1795 		 * For efficiency reasons we use msg_extractcred; no crhold
1796 		 * needed since db_credp is cleared (i.e., we move the cred
1797 		 * from the message to so_peercred).
1798 		 */
1799 		nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1800 
1801 		mp->b_rptr = DB_BASE(mp);
1802 		conn_res = (struct T_conn_res *)mp->b_rptr;
1803 		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1804 
1805 		mblk_setcred(mp, cr, curproc->p_pid);
1806 	}
1807 
1808 	/*
1809 	 * New socket must be bound at least in sockfs and, except for AF_INET,
1810 	 * (or AF_INET6) it also has to be bound in the transport provider.
1811 	 * We set the local address in the sonode from the T_OK_ACK of the
1812 	 * T_CONN_RES. For this reason the address we bind to here isn't
1813 	 * important.
1814 	 */
1815 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1816 	    /*CONSTCOND*/
1817 	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1818 		/*
1819 		 * Optimization for AF_INET{,6} transports
1820 		 * that can handle a T_CONN_RES without being bound.
1821 		 */
1822 		mutex_enter(&nso->so_lock);
1823 		so_automatic_bind(nso);
1824 		mutex_exit(&nso->so_lock);
1825 	} else {
1826 		/* Perform NULL bind with the transport provider. */
1827 		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1828 		    cr)) != 0) {
1829 			ASSERT(error != ENOBUFS);
1830 			freemsg(mp);
1831 			eprintsoline(nso, error);
1832 			goto disconnect_vp_unlocked;
1833 		}
1834 	}
1835 
1836 	/*
1837 	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1838 	 * so that any data arriving on the new socket will cause the
1839 	 * appropriate signals to be delivered for the new socket.
1840 	 *
1841 	 * No other thread (except strsock_proto and strsock_misc)
1842 	 * can access the new socket thus we relax the locking.
1843 	 */
1844 	nso->so_pgrp = so->so_pgrp;
1845 	nso->so_state |= so->so_state & SS_ASYNC;
1846 	nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1847 
1848 	if (nso->so_pgrp != 0) {
1849 		if ((error = so_set_events(nso, nvp, cr)) != 0) {
1850 			eprintsoline(nso, error);
1851 			error = 0;
1852 			nso->so_pgrp = 0;
1853 		}
1854 	}
1855 
1856 	/*
1857 	 * Make note of the socket level options. TCP and IP level options
1858 	 * are already inherited. We could do all this after accept is
1859 	 * successful but doing it here simplifies code and no harm done
1860 	 * for error case.
1861 	 */
1862 	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1863 	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1864 	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1865 	nso->so_sndbuf = so->so_sndbuf;
1866 	nso->so_rcvbuf = so->so_rcvbuf;
1867 	if (nso->so_options & SO_LINGER)
1868 		nso->so_linger = so->so_linger;
1869 
1870 	/*
1871 	 * Note that the following sti_direct code path should be
1872 	 * removed once we are confident that the direct sockets
1873 	 * do not result in any degradation.
1874 	 */
1875 	if (sti->sti_direct) {
1876 
1877 		ASSERT(opt != NULL);
1878 
1879 		conn_res->OPT_length = optlen;
1880 		conn_res->OPT_offset = MBLKL(mp);
1881 		bcopy(&opt, mp->b_wptr, optlen);
1882 		mp->b_wptr += optlen;
1883 		conn_res->PRIM_type = T_CONN_RES;
1884 		conn_res->ACCEPTOR_id = 0;
1885 		PRIM_type = T_CONN_RES;
1886 
1887 		/* Send down the T_CONN_RES on acceptor STREAM */
1888 		error = kstrputmsg(SOTOV(nso), mp, NULL,
1889 		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1890 		if (error) {
1891 			mutex_enter(&so->so_lock);
1892 			so_lock_single(so);
1893 			eprintsoline(so, error);
1894 			goto disconnect_vp;
1895 		}
1896 		mutex_enter(&nso->so_lock);
1897 		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1898 		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1899 		if (error) {
1900 			mutex_exit(&nso->so_lock);
1901 			mutex_enter(&so->so_lock);
1902 			so_lock_single(so);
1903 			eprintsoline(so, error);
1904 			goto disconnect_vp;
1905 		}
1906 		if (nso->so_family == AF_INET) {
1907 			sin_t *sin;
1908 
1909 			sin = (sin_t *)(ack_mp->b_rptr +
1910 			    sizeof (struct T_ok_ack));
1911 			bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
1912 			nsti->sti_laddr_len = sizeof (sin_t);
1913 		} else {
1914 			sin6_t *sin6;
1915 
1916 			sin6 = (sin6_t *)(ack_mp->b_rptr +
1917 			    sizeof (struct T_ok_ack));
1918 			bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
1919 			nsti->sti_laddr_len = sizeof (sin6_t);
1920 		}
1921 		freemsg(ack_mp);
1922 
1923 		nso->so_state |= SS_ISCONNECTED;
1924 		nso->so_proto_handle = (sock_lower_handle_t)opt;
1925 		nsti->sti_laddr_valid = 1;
1926 
1927 		mutex_exit(&nso->so_lock);
1928 
1929 		/*
1930 		 * It's possible, through the use of autopush for example,
1931 		 * that the acceptor stream may not support sti_direct
1932 		 * semantics. If the new socket does not support sti_direct
1933 		 * we issue a _SIOCSOCKFALLBACK to inform the transport
1934 		 * as we would in the I_PUSH case.
1935 		 */
1936 		if (nsti->sti_direct == 0) {
1937 			int	rval;
1938 
1939 			if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
1940 			    0, 0, K_TO_K, cr, &rval)) != 0) {
1941 				mutex_enter(&so->so_lock);
1942 				so_lock_single(so);
1943 				eprintsoline(so, error);
1944 				goto disconnect_vp;
1945 			}
1946 		}
1947 
1948 		/*
1949 		 * Pass out new socket.
1950 		 */
1951 		if (nsop != NULL)
1952 			*nsop = nso;
1953 
1954 		return (0);
1955 	}
1956 
1957 	/*
1958 	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
1959 	 * which don't support the FireEngine accept fast-path. It is also
1960 	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
1961 	 * again. Neither sockfs nor TCP attempt to find out if some other
1962 	 * random module has been inserted in between (in which case we
1963 	 * should follow TLI accept behaviour). We blindly assume the worst
1964 	 * case and revert back to old behaviour i.e. TCP will not send us
1965 	 * any option (eager) and the accept should happen on the listener
1966 	 * queue. Any queued T_conn_ind have already got their options removed
1967 	 * by so_sock2_stream() when "sockmod" was I_POP'd.
1968 	 */
1969 	/*
1970 	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
1971 	 */
1972 	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
1973 #ifdef	_ILP32
1974 		queue_t	*q;
1975 
1976 		/*
1977 		 * Find read queue in driver
1978 		 * Can safely do this since we "own" nso/nvp.
1979 		 */
1980 		q = strvp2wq(nvp)->q_next;
1981 		while (SAMESTR(q))
1982 			q = q->q_next;
1983 		q = RD(q);
1984 		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
1985 #else
1986 		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
1987 #endif	/* _ILP32 */
1988 		conn_res->PRIM_type = O_T_CONN_RES;
1989 		PRIM_type = O_T_CONN_RES;
1990 	} else {
1991 		conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
1992 		conn_res->PRIM_type = T_CONN_RES;
1993 		PRIM_type = T_CONN_RES;
1994 	}
1995 	conn_res->SEQ_number = SEQ_number;
1996 	conn_res->OPT_length = 0;
1997 	conn_res->OPT_offset = 0;
1998 
1999 	mutex_enter(&so->so_lock);
2000 	so_lock_single(so);	/* Set SOLOCKED */
2001 	mutex_exit(&so->so_lock);
2002 
2003 	error = kstrputmsg(SOTOV(so), mp, NULL,
2004 	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2005 	mutex_enter(&so->so_lock);
2006 	if (error) {
2007 		eprintsoline(so, error);
2008 		goto disconnect_vp;
2009 	}
2010 	error = sowaitprim(so, PRIM_type, T_OK_ACK,
2011 	    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2012 	if (error) {
2013 		eprintsoline(so, error);
2014 		goto disconnect_vp;
2015 	}
2016 	mutex_exit(&so->so_lock);
2017 	/*
2018 	 * If there is a sin/sin6 appended onto the T_OK_ACK use
2019 	 * that to set the local address. If this is not present
2020 	 * then we zero out the address and don't set the
2021 	 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
2022 	 * the pathname from the listening socket.
2023 	 * In the case where this is TCP or an AF_UNIX socket the
2024 	 * client side may have queued data or a T_ORDREL in the
2025 	 * transport. Having now sent the T_CONN_RES we may receive
2026 	 * those queued messages at any time. Hold the acceptor
2027 	 * so_lock until its state and laddr are finalized.
2028 	 */
2029 	mutex_enter(&nso->so_lock);
2030 	sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
2031 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
2032 	    MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2033 		ack_mp->b_rptr += sizeof (struct T_ok_ack);
2034 		bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2035 		nsti->sti_laddr_len = sinlen;
2036 		nsti->sti_laddr_valid = 1;
2037 	} else if (nso->so_family == AF_UNIX) {
2038 		ASSERT(so->so_family == AF_UNIX);
2039 		nsti->sti_laddr_len = sti->sti_laddr_len;
2040 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2041 		bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2042 		    nsti->sti_laddr_len);
2043 		nsti->sti_laddr_valid = 1;
2044 	} else {
2045 		nsti->sti_laddr_len = sti->sti_laddr_len;
2046 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2047 		bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2048 		nsti->sti_laddr_sa->sa_family = nso->so_family;
2049 	}
2050 	nso->so_state |= SS_ISCONNECTED;
2051 	mutex_exit(&nso->so_lock);
2052 
2053 	freemsg(ack_mp);
2054 
2055 	mutex_enter(&so->so_lock);
2056 	so_unlock_single(so, SOLOCKED);
2057 	mutex_exit(&so->so_lock);
2058 
2059 	/*
2060 	 * Pass out new socket.
2061 	 */
2062 	if (nsop != NULL)
2063 		*nsop = nso;
2064 
2065 	return (0);
2066 
2067 e_disc_unl:
2068 	eprintsoline(so, error);
2069 	goto disconnect_unlocked;
2070 
2071 disconnect_vp_unlocked:
2072 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2073 	VN_RELE(nvp);
2074 disconnect_unlocked:
2075 	(void) sodisconnect(so, SEQ_number, 0);
2076 	return (error);
2077 
2078 disconnect_vp:
2079 	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2080 	so_unlock_single(so, SOLOCKED);
2081 	mutex_exit(&so->so_lock);
2082 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2083 	VN_RELE(nvp);
2084 	return (error);
2085 
2086 conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2087 	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2088 	    ? EOPNOTSUPP : EINVAL;
2089 e_bad:
2090 	eprintsoline(so, error);
2091 	return (error);
2092 }
2093 
2094 /*
2095  * connect a socket.
2096  *
2097  * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
2098  * unconnect (by specifying a null address).
2099  */
2100 int
2101 sotpi_connect(struct sonode *so,
2102     struct sockaddr *name,
2103     socklen_t namelen,
2104     int fflag,
2105     int flags,
2106     struct cred *cr)
2107 {
2108 	struct T_conn_req	conn_req;
2109 	int			error = 0;
2110 	mblk_t			*mp;
2111 	void			*src;
2112 	socklen_t		srclen;
2113 	void			*addr;
2114 	socklen_t		addrlen;
2115 	boolean_t		need_unlock;
2116 	sotpi_info_t		*sti = SOTOTPI(so);
2117 
2118 	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2119 	    (void *)so, (void *)name, namelen, fflag, flags,
2120 	    pr_state(so->so_state, so->so_mode)));
2121 
2122 	/*
2123 	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2124 	 * avoid sleeping for memory with SOLOCKED held.
2125 	 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2126 	 * + sizeof (struct T_opthdr).
2127 	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
2128 	 * exceed sti_faddr_maxlen).
2129 	 */
2130 	mp = soallocproto(sizeof (struct T_conn_req) +
2131 	    2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
2132 	    cr);
2133 	if (mp == NULL) {
2134 		/*
2135 		 * Connect can not fail with ENOBUFS. A signal was
2136 		 * caught so return EINTR.
2137 		 */
2138 		error = EINTR;
2139 		eprintsoline(so, error);
2140 		return (error);
2141 	}
2142 
2143 	mutex_enter(&so->so_lock);
2144 	/*
2145 	 * Make sure there is a preallocated T_unbind_req message
2146 	 * before any binding. This message is allocated when the
2147 	 * socket is created. Since another thread can consume
2148 	 * so_unbind_mp by the time we return from so_lock_single(),
2149 	 * we should check the availability of so_unbind_mp after
2150 	 * we return from so_lock_single().
2151 	 */
2152 
2153 	so_lock_single(so);	/* Set SOLOCKED */
2154 	need_unlock = B_TRUE;
2155 
2156 	if (sti->sti_unbind_mp == NULL) {
2157 		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2158 		/* NOTE: holding so_lock while sleeping */
2159 		sti->sti_unbind_mp =
2160 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
2161 		if (sti->sti_unbind_mp == NULL) {
2162 			error = EINTR;
2163 			goto done;
2164 		}
2165 	}
2166 
2167 	/*
2168 	 * Can't have done a listen before connecting.
2169 	 */
2170 	if (so->so_state & SS_ACCEPTCONN) {
2171 		error = EOPNOTSUPP;
2172 		goto done;
2173 	}
2174 
2175 	/*
2176 	 * Must be bound with the transport
2177 	 */
2178 	if (!(so->so_state & SS_ISBOUND)) {
2179 		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2180 		    /*CONSTCOND*/
2181 		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2182 			/*
2183 			 * Optimization for AF_INET{,6} transports
2184 			 * that can handle a T_CONN_REQ without being bound.
2185 			 */
2186 			so_automatic_bind(so);
2187 		} else {
2188 			error = sotpi_bind(so, NULL, 0,
2189 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2190 			if (error)
2191 				goto done;
2192 		}
2193 		ASSERT(so->so_state & SS_ISBOUND);
2194 		flags |= _SOCONNECT_DID_BIND;
2195 	}
2196 
2197 	/*
2198 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
2199 	 * connect to a null address. This is the portable method to
2200 	 * unconnect a socket.
2201 	 */
2202 	if ((namelen >= sizeof (sa_family_t)) &&
2203 	    (name->sa_family == AF_UNSPEC)) {
2204 		name = NULL;
2205 		namelen = 0;
2206 	}
2207 
2208 	/*
2209 	 * Check that we are not already connected.
2210 	 * A connection-oriented socket cannot be reconnected.
2211 	 * A connected connection-less socket can be
2212 	 * - connected to a different address by a subsequent connect
2213 	 * - "unconnected" by a connect to the NULL address
2214 	 */
2215 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2216 		ASSERT(!(flags & _SOCONNECT_DID_BIND));
2217 		if (so->so_mode & SM_CONNREQUIRED) {
2218 			/* Connection-oriented socket */
2219 			error = so->so_state & SS_ISCONNECTED ?
2220 			    EISCONN : EALREADY;
2221 			goto done;
2222 		}
2223 		/* Connection-less socket */
2224 		if (name == NULL) {
2225 			/*
2226 			 * Remove the connected state and clear SO_DGRAM_ERRIND
2227 			 * since it was set when the socket was connected.
2228 			 * If this is UDP also send down a T_DISCON_REQ.
2229 			 */
2230 			int val;
2231 
2232 			if ((so->so_family == AF_INET ||
2233 			    so->so_family == AF_INET6) &&
2234 			    (so->so_type == SOCK_DGRAM ||
2235 			    so->so_type == SOCK_RAW) &&
2236 			    /*CONSTCOND*/
2237 			    !soconnect_tpi_udp) {
2238 				/* XXX What about implicitly unbinding here? */
2239 				error = sodisconnect(so, -1,
2240 				    _SODISCONNECT_LOCK_HELD);
2241 			} else {
2242 				so->so_state &=
2243 				    ~(SS_ISCONNECTED | SS_ISCONNECTING);
2244 				sti->sti_faddr_valid = 0;
2245 				sti->sti_faddr_len = 0;
2246 			}
2247 
2248 			/* Remove SOLOCKED since setsockopt will grab it */
2249 			so_unlock_single(so, SOLOCKED);
2250 			mutex_exit(&so->so_lock);
2251 
2252 			val = 0;
2253 			(void) sotpi_setsockopt(so, SOL_SOCKET,
2254 			    SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2255 			    cr);
2256 
2257 			mutex_enter(&so->so_lock);
2258 			so_lock_single(so);	/* Set SOLOCKED */
2259 			goto done;
2260 		}
2261 	}
2262 	ASSERT(so->so_state & SS_ISBOUND);
2263 
2264 	if (name == NULL || namelen == 0) {
2265 		error = EINVAL;
2266 		goto done;
2267 	}
2268 	/*
2269 	 * Mark the socket if sti_faddr_sa represents the transport level
2270 	 * address.
2271 	 */
2272 	if (flags & _SOCONNECT_NOXLATE) {
2273 		struct sockaddr_ux	*soaddr_ux;
2274 
2275 		ASSERT(so->so_family == AF_UNIX);
2276 		if (namelen != sizeof (struct sockaddr_ux)) {
2277 			error = EINVAL;
2278 			goto done;
2279 		}
2280 		soaddr_ux = (struct sockaddr_ux *)name;
2281 		name = (struct sockaddr *)&soaddr_ux->sou_addr;
2282 		namelen = sizeof (soaddr_ux->sou_addr);
2283 		sti->sti_faddr_noxlate = 1;
2284 	}
2285 
2286 	/*
2287 	 * Length and family checks.
2288 	 */
2289 	error = so_addr_verify(so, name, namelen);
2290 	if (error)
2291 		goto bad;
2292 
2293 	/*
2294 	 * Save foreign address. Needed for AF_UNIX as well as
2295 	 * transport providers that do not support TI_GETPEERNAME.
2296 	 * Also used for cached foreign address for TCP and UDP.
2297 	 */
2298 	if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2299 		error = EINVAL;
2300 		goto done;
2301 	}
2302 	sti->sti_faddr_len = (socklen_t)namelen;
2303 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2304 	bcopy(name, sti->sti_faddr_sa, namelen);
2305 	sti->sti_faddr_valid = 1;
2306 
2307 	if (so->so_family == AF_UNIX) {
2308 		if (sti->sti_faddr_noxlate) {
2309 			/*
2310 			 * sti_faddr is a transport-level address, so
2311 			 * don't pass it as an option.  Do save it in
2312 			 * sti_ux_faddr, used for connected DG send.
2313 			 */
2314 			src = NULL;
2315 			srclen = 0;
2316 			addr = sti->sti_faddr_sa;
2317 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2318 			bcopy(addr, &sti->sti_ux_faddr,
2319 			    sizeof (sti->sti_ux_faddr));
2320 		} else {
2321 			/*
2322 			 * Pass the sockaddr_un source address as an option
2323 			 * and translate the remote address.
2324 			 * Holding so_lock thus sti_laddr_sa can not change.
2325 			 */
2326 			src = sti->sti_laddr_sa;
2327 			srclen = (t_uscalar_t)sti->sti_laddr_len;
2328 			dprintso(so, 1,
2329 			    ("sotpi_connect UNIX: srclen %d, src %p\n",
2330 			    srclen, src));
2331 			/*
2332 			 * Translate the destination address into our
2333 			 * internal form, and save it in sti_ux_faddr.
2334 			 * After this call, addr==&sti->sti_ux_taddr,
2335 			 * and we copy that to sti->sti_ux_faddr so
2336 			 * we save the connected peer address.
2337 			 */
2338 			error = so_ux_addr_xlate(so,
2339 			    sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2340 			    (flags & _SOCONNECT_XPG4_2),
2341 			    &addr, &addrlen);
2342 			if (error)
2343 				goto bad;
2344 			bcopy(&sti->sti_ux_taddr, &sti->sti_ux_faddr,
2345 			    sizeof (sti->sti_ux_faddr));
2346 		}
2347 	} else {
2348 		addr = sti->sti_faddr_sa;
2349 		addrlen = (t_uscalar_t)sti->sti_faddr_len;
2350 		src = NULL;
2351 		srclen = 0;
2352 	}
2353 	/*
2354 	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2355 	 * option which asks the transport provider to send T_UDERR_IND
2356 	 * messages. These T_UDERR_IND messages are used to return connected
2357 	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2358 	 *
2359 	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2360 	 * we send down a T_CONN_REQ. This is needed to let the
2361 	 * transport assign a local address that is consistent with
2362 	 * the remote address. Applications depend on a getsockname()
2363 	 * after a connect() to retrieve the "source" IP address for
2364 	 * the connected socket.  Invalidate the cached local address
2365 	 * to force getsockname() to enquire of the transport.
2366 	 */
2367 	if (!(so->so_mode & SM_CONNREQUIRED)) {
2368 		/*
2369 		 * Datagram socket.
2370 		 */
2371 		int32_t val;
2372 
2373 		so_unlock_single(so, SOLOCKED);
2374 		mutex_exit(&so->so_lock);
2375 
2376 		val = 1;
2377 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2378 		    &val, (t_uscalar_t)sizeof (val), cr);
2379 
2380 		mutex_enter(&so->so_lock);
2381 		so_lock_single(so);	/* Set SOLOCKED */
2382 		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2383 		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2384 		    soconnect_tpi_udp) {
2385 			soisconnected(so);
2386 			goto done;
2387 		}
2388 		/*
2389 		 * Send down T_CONN_REQ etc.
2390 		 * Clear fflag to avoid returning EWOULDBLOCK.
2391 		 */
2392 		fflag = 0;
2393 		ASSERT(so->so_family != AF_UNIX);
2394 		sti->sti_laddr_valid = 0;
2395 	} else if (sti->sti_laddr_len != 0) {
2396 		/*
2397 		 * If the local address or port was "any" then it may be
2398 		 * changed by the transport as a result of the
2399 		 * connect.  Invalidate the cached version if we have one.
2400 		 */
2401 		switch (so->so_family) {
2402 		case AF_INET:
2403 			ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2404 			if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2405 			    INADDR_ANY ||
2406 			    ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2407 				sti->sti_laddr_valid = 0;
2408 			break;
2409 
2410 		case AF_INET6:
2411 			ASSERT(sti->sti_laddr_len ==
2412 			    (socklen_t)sizeof (sin6_t));
2413 			if (IN6_IS_ADDR_UNSPECIFIED(
2414 			    &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2415 			    IN6_IS_ADDR_V4MAPPED_ANY(
2416 			    &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2417 			    ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2418 				sti->sti_laddr_valid = 0;
2419 			break;
2420 
2421 		default:
2422 			break;
2423 		}
2424 	}
2425 
2426 	/*
2427 	 * Check for failure of an earlier call
2428 	 */
2429 	if (so->so_error != 0)
2430 		goto so_bad;
2431 
2432 	/*
2433 	 * Send down T_CONN_REQ. Message was allocated above.
2434 	 */
2435 	conn_req.PRIM_type = T_CONN_REQ;
2436 	conn_req.DEST_length = addrlen;
2437 	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2438 	if (srclen == 0) {
2439 		conn_req.OPT_length = 0;
2440 		conn_req.OPT_offset = 0;
2441 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2442 		soappendmsg(mp, addr, addrlen);
2443 	} else {
2444 		/*
2445 		 * There is a AF_UNIX sockaddr_un to include as a source
2446 		 * address option.
2447 		 */
2448 		struct T_opthdr toh;
2449 
2450 		toh.level = SOL_SOCKET;
2451 		toh.name = SO_SRCADDR;
2452 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2453 		toh.status = 0;
2454 		conn_req.OPT_length =
2455 		    (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2456 		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2457 		    _TPI_ALIGN_TOPT(addrlen));
2458 
2459 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2460 		soappendmsg(mp, addr, addrlen);
2461 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2462 		soappendmsg(mp, &toh, sizeof (toh));
2463 		soappendmsg(mp, src, srclen);
2464 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2465 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2466 	}
2467 	/*
2468 	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2469 	 * in order to have the right state when the T_CONN_CON shows up.
2470 	 */
2471 	soisconnecting(so);
2472 	mutex_exit(&so->so_lock);
2473 
2474 	if (AU_AUDITING())
2475 		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2476 
2477 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2478 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2479 	mp = NULL;
2480 	mutex_enter(&so->so_lock);
2481 	if (error != 0)
2482 		goto bad;
2483 
2484 	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2485 		goto bad;
2486 
2487 	/* Allow other threads to access the socket */
2488 	so_unlock_single(so, SOLOCKED);
2489 	need_unlock = B_FALSE;
2490 
2491 	/*
2492 	 * Wait until we get a T_CONN_CON or an error
2493 	 */
2494 	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2495 		so_lock_single(so);	/* Set SOLOCKED */
2496 		need_unlock = B_TRUE;
2497 	}
2498 
2499 done:
2500 	freemsg(mp);
2501 	switch (error) {
2502 	case EINPROGRESS:
2503 	case EALREADY:
2504 	case EISCONN:
2505 	case EINTR:
2506 		/* Non-fatal errors */
2507 		sti->sti_laddr_valid = 0;
2508 		/* FALLTHRU */
2509 	case 0:
2510 		break;
2511 	default:
2512 		ASSERT(need_unlock);
2513 		/*
2514 		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2515 		 * and invalidate local-address cache
2516 		 */
2517 		so->so_state &= ~SS_ISCONNECTING;
2518 		sti->sti_laddr_valid = 0;
2519 		/* A discon_ind might have already unbound us */
2520 		if ((flags & _SOCONNECT_DID_BIND) &&
2521 		    (so->so_state & SS_ISBOUND)) {
2522 			int err;
2523 
2524 			err = sotpi_unbind(so, 0);
2525 			/* LINTED - statement has no conseq */
2526 			if (err) {
2527 				eprintsoline(so, err);
2528 			}
2529 		}
2530 		break;
2531 	}
2532 	if (need_unlock)
2533 		so_unlock_single(so, SOLOCKED);
2534 	mutex_exit(&so->so_lock);
2535 	return (error);
2536 
2537 so_bad:	error = sogeterr(so, B_TRUE);
2538 bad:	eprintsoline(so, error);
2539 	goto done;
2540 }
2541 
2542 /* ARGSUSED */
2543 int
2544 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2545 {
2546 	struct T_ordrel_req	ordrel_req;
2547 	mblk_t			*mp;
2548 	uint_t			old_state, state_change;
2549 	int			error = 0;
2550 	sotpi_info_t		*sti = SOTOTPI(so);
2551 
2552 	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2553 	    (void *)so, how, pr_state(so->so_state, so->so_mode)));
2554 
2555 	mutex_enter(&so->so_lock);
2556 	so_lock_single(so);	/* Set SOLOCKED */
2557 
2558 	/*
2559 	 * SunOS 4.X has no check for datagram sockets.
2560 	 * 5.X checks that it is connected (ENOTCONN)
2561 	 * X/Open requires that we check the connected state.
2562 	 */
2563 	if (!(so->so_state & SS_ISCONNECTED)) {
2564 		if (!xnet_skip_checks) {
2565 			error = ENOTCONN;
2566 			if (xnet_check_print) {
2567 				printf("sockfs: X/Open shutdown check "
2568 				    "caused ENOTCONN\n");
2569 			}
2570 		}
2571 		goto done;
2572 	}
2573 	/*
2574 	 * Record the current state and then perform any state changes.
2575 	 * Then use the difference between the old and new states to
2576 	 * determine which messages need to be sent.
2577 	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2578 	 * duplicate calls to shutdown().
2579 	 */
2580 	old_state = so->so_state;
2581 
2582 	switch (how) {
2583 	case 0:
2584 		socantrcvmore(so);
2585 		break;
2586 	case 1:
2587 		socantsendmore(so);
2588 		break;
2589 	case 2:
2590 		socantsendmore(so);
2591 		socantrcvmore(so);
2592 		break;
2593 	default:
2594 		error = EINVAL;
2595 		goto done;
2596 	}
2597 
2598 	/*
2599 	 * Assumes that the SS_CANT* flags are never cleared in the above code.
2600 	 */
2601 	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2602 	    (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2603 	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2604 
2605 	switch (state_change) {
2606 	case 0:
2607 		dprintso(so, 1,
2608 		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
2609 		    so->so_state));
2610 		goto done;
2611 
2612 	case SS_CANTRCVMORE:
2613 		mutex_exit(&so->so_lock);
2614 		strseteof(SOTOV(so), 1);
2615 		/*
2616 		 * strseteof takes care of read side wakeups,
2617 		 * pollwakeups, and signals.
2618 		 */
2619 		/*
2620 		 * Get the read lock before flushing data to avoid problems
2621 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2622 		 */
2623 		mutex_enter(&so->so_lock);
2624 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2625 		mutex_exit(&so->so_lock);
2626 
2627 		/* Flush read side queue */
2628 		strflushrq(SOTOV(so), FLUSHALL);
2629 
2630 		mutex_enter(&so->so_lock);
2631 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2632 		break;
2633 
2634 	case SS_CANTSENDMORE:
2635 		mutex_exit(&so->so_lock);
2636 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2637 		mutex_enter(&so->so_lock);
2638 		break;
2639 
2640 	case SS_CANTSENDMORE|SS_CANTRCVMORE:
2641 		mutex_exit(&so->so_lock);
2642 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2643 		strseteof(SOTOV(so), 1);
2644 		/*
2645 		 * strseteof takes care of read side wakeups,
2646 		 * pollwakeups, and signals.
2647 		 */
2648 		/*
2649 		 * Get the read lock before flushing data to avoid problems
2650 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2651 		 */
2652 		mutex_enter(&so->so_lock);
2653 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2654 		mutex_exit(&so->so_lock);
2655 
2656 		/* Flush read side queue */
2657 		strflushrq(SOTOV(so), FLUSHALL);
2658 
2659 		mutex_enter(&so->so_lock);
2660 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2661 		break;
2662 	}
2663 
2664 	ASSERT(MUTEX_HELD(&so->so_lock));
2665 
2666 	/*
2667 	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2668 	 * was set due to this call and the new state has both of them set:
2669 	 *	Send the AF_UNIX close indication
2670 	 *	For T_COTS send a discon_ind
2671 	 *
2672 	 * If cantsend was set due to this call:
2673 	 *	For T_COTSORD send an ordrel_ind
2674 	 *
2675 	 * Note that for T_CLTS there is no message sent here.
2676 	 */
2677 	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2678 	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2679 		/*
2680 		 * For SunOS 4.X compatibility we tell the other end
2681 		 * that we are unable to receive at this point.
2682 		 */
2683 		if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2684 			so_unix_close(so);
2685 
2686 		if (sti->sti_serv_type == T_COTS)
2687 			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2688 	}
2689 	if ((state_change & SS_CANTSENDMORE) &&
2690 	    (sti->sti_serv_type == T_COTS_ORD)) {
2691 		/* Send an orderly release */
2692 		ordrel_req.PRIM_type = T_ORDREL_REQ;
2693 
2694 		mutex_exit(&so->so_lock);
2695 		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2696 		    0, _ALLOC_SLEEP, cr);
2697 		/*
2698 		 * Send down the T_ORDREL_REQ even if there is flow control.
2699 		 * This prevents shutdown from blocking.
2700 		 * Note that there is no T_OK_ACK for ordrel_req.
2701 		 */
2702 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2703 		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2704 		mutex_enter(&so->so_lock);
2705 		if (error) {
2706 			eprintsoline(so, error);
2707 			goto done;
2708 		}
2709 	}
2710 
2711 done:
2712 	so_unlock_single(so, SOLOCKED);
2713 	mutex_exit(&so->so_lock);
2714 	return (error);
2715 }
2716 
2717 /*
2718  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2719  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2720  * that we have closed.
2721  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2722  * T_UNITDATA_REQ containing the same option.
2723  *
2724  * For SOCK_DGRAM half-connections (somebody connected to this end
2725  * but this end is not connect) we don't know where to send any
2726  * SO_UNIX_CLOSE.
2727  *
2728  * We have to ignore stream head errors just in case there has been
2729  * a shutdown(output).
2730  * Ignore any flow control to try to get the message more quickly to the peer.
2731  * While locally ignoring flow control solves the problem when there
2732  * is only the loopback transport on the stream it would not provide
2733  * the correct AF_UNIX socket semantics when one or more modules have
2734  * been pushed.
2735  */
2736 void
2737 so_unix_close(struct sonode *so)
2738 {
2739 	struct T_opthdr	toh;
2740 	mblk_t		*mp;
2741 	sotpi_info_t	*sti = SOTOTPI(so);
2742 
2743 	ASSERT(MUTEX_HELD(&so->so_lock));
2744 
2745 	ASSERT(so->so_family == AF_UNIX);
2746 
2747 	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2748 	    (SS_ISCONNECTED|SS_ISBOUND))
2749 		return;
2750 
2751 	dprintso(so, 1, ("so_unix_close(%p) %s\n",
2752 	    (void *)so, pr_state(so->so_state, so->so_mode)));
2753 
2754 	toh.level = SOL_SOCKET;
2755 	toh.name = SO_UNIX_CLOSE;
2756 
2757 	/* zero length + header */
2758 	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2759 	toh.status = 0;
2760 
2761 	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2762 		struct T_optdata_req tdr;
2763 
2764 		tdr.PRIM_type = T_OPTDATA_REQ;
2765 		tdr.DATA_flag = 0;
2766 
2767 		tdr.OPT_length = (t_scalar_t)sizeof (toh);
2768 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2769 
2770 		/* NOTE: holding so_lock while sleeping */
2771 		mp = soallocproto2(&tdr, sizeof (tdr),
2772 		    &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2773 	} else {
2774 		struct T_unitdata_req	tudr;
2775 		void			*addr;
2776 		socklen_t		addrlen;
2777 		void			*src;
2778 		socklen_t		srclen;
2779 		struct T_opthdr		toh2;
2780 		t_scalar_t		size;
2781 
2782 		/*
2783 		 * We know this is an AF_UNIX connected DGRAM socket.
2784 		 * We therefore already have the destination address
2785 		 * in the internal form needed for this send.  This is
2786 		 * similar to the sosend_dgram call later in this file
2787 		 * when there's no user-specified destination address.
2788 		 */
2789 		if (sti->sti_faddr_noxlate) {
2790 			/*
2791 			 * Already have a transport internal address. Do not
2792 			 * pass any (transport internal) source address.
2793 			 */
2794 			addr = sti->sti_faddr_sa;
2795 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2796 			src = NULL;
2797 			srclen = 0;
2798 		} else {
2799 			/*
2800 			 * Pass the sockaddr_un source address as an option
2801 			 * and translate the remote address.
2802 			 * Holding so_lock thus sti_laddr_sa can not change.
2803 			 */
2804 			src = sti->sti_laddr_sa;
2805 			srclen = (socklen_t)sti->sti_laddr_len;
2806 			dprintso(so, 1,
2807 			    ("so_ux_close: srclen %d, src %p\n",
2808 			    srclen, src));
2809 			/*
2810 			 * Use the destination address saved in connect.
2811 			 */
2812 			addr = &sti->sti_ux_faddr;
2813 			addrlen = sizeof (sti->sti_ux_faddr);
2814 		}
2815 		tudr.PRIM_type = T_UNITDATA_REQ;
2816 		tudr.DEST_length = addrlen;
2817 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2818 		if (srclen == 0) {
2819 			tudr.OPT_length = (t_scalar_t)sizeof (toh);
2820 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2821 			    _TPI_ALIGN_TOPT(addrlen));
2822 
2823 			size = tudr.OPT_offset + tudr.OPT_length;
2824 			/* NOTE: holding so_lock while sleeping */
2825 			mp = soallocproto2(&tudr, sizeof (tudr),
2826 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
2827 			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2828 			soappendmsg(mp, &toh, sizeof (toh));
2829 		} else {
2830 			/*
2831 			 * There is a AF_UNIX sockaddr_un to include as a
2832 			 * source address option.
2833 			 */
2834 			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2835 			    _TPI_ALIGN_TOPT(srclen));
2836 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2837 			    _TPI_ALIGN_TOPT(addrlen));
2838 
2839 			toh2.level = SOL_SOCKET;
2840 			toh2.name = SO_SRCADDR;
2841 			toh2.len = (t_uscalar_t)(srclen +
2842 			    sizeof (struct T_opthdr));
2843 			toh2.status = 0;
2844 
2845 			size = tudr.OPT_offset + tudr.OPT_length;
2846 
2847 			/* NOTE: holding so_lock while sleeping */
2848 			mp = soallocproto2(&tudr, sizeof (tudr),
2849 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
2850 			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2851 			soappendmsg(mp, &toh, sizeof (toh));
2852 			soappendmsg(mp, &toh2, sizeof (toh2));
2853 			soappendmsg(mp, src, srclen);
2854 			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2855 		}
2856 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2857 	}
2858 	mutex_exit(&so->so_lock);
2859 	(void) kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2860 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2861 	mutex_enter(&so->so_lock);
2862 }
2863 
2864 /*
2865  * Called by sotpi_recvmsg when reading a non-zero amount of data.
2866  * In addition, the caller typically verifies that there is some
2867  * potential state to clear by checking
2868  *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2869  * before calling this routine.
2870  * Note that such a check can be made without holding so_lock since
2871  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2872  * decrements sti_oobsigcnt.
2873  *
2874  * When data is read *after* the point that all pending
2875  * oob data has been consumed the oob indication is cleared.
2876  *
2877  * This logic keeps select/poll returning POLLRDBAND and
2878  * SIOCATMARK returning true until we have read past
2879  * the mark.
2880  */
2881 static void
2882 sorecv_update_oobstate(struct sonode *so)
2883 {
2884 	sotpi_info_t *sti = SOTOTPI(so);
2885 
2886 	mutex_enter(&so->so_lock);
2887 	ASSERT(so_verify_oobstate(so));
2888 	dprintso(so, 1,
2889 	    ("sorecv_update_oobstate: counts %d/%d state %s\n",
2890 	    sti->sti_oobsigcnt,
2891 	    sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
2892 	if (sti->sti_oobsigcnt == 0) {
2893 		/* No more pending oob indications */
2894 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2895 		freemsg(so->so_oobmsg);
2896 		so->so_oobmsg = NULL;
2897 	}
2898 	ASSERT(so_verify_oobstate(so));
2899 	mutex_exit(&so->so_lock);
2900 }
2901 
2902 /*
2903  * Receive the next message on the queue.
2904  * If msg_controllen is non-zero when called the caller is interested in
2905  * any received control info (options).
2906  * If msg_namelen is non-zero when called the caller is interested in
2907  * any received source address.
2908  * The routine returns with msg_control and msg_name pointing to
2909  * kmem_alloc'ed memory which the caller has to free.
2910  */
2911 /* ARGSUSED */
2912 int
2913 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
2914     struct cred *cr)
2915 {
2916 	union T_primitives	*tpr;
2917 	mblk_t			*mp;
2918 	uchar_t			pri;
2919 	int			pflag, opflag;
2920 	void			*control;
2921 	t_uscalar_t		controllen;
2922 	t_uscalar_t		namelen;
2923 	int			so_state = so->so_state; /* Snapshot */
2924 	ssize_t			saved_resid;
2925 	rval_t			rval;
2926 	int			flags;
2927 	clock_t			timout;
2928 	int			error = 0;
2929 	sotpi_info_t		*sti = SOTOTPI(so);
2930 
2931 	flags = msg->msg_flags;
2932 	msg->msg_flags = 0;
2933 
2934 	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
2935 	    (void *)so, (void *)msg, flags,
2936 	    pr_state(so->so_state, so->so_mode), so->so_error));
2937 
2938 	if (so->so_version == SOV_STREAM) {
2939 		so_update_attrs(so, SOACC);
2940 		/* The imaginary "sockmod" has been popped - act as a stream */
2941 		return (strread(SOTOV(so), uiop, cr));
2942 	}
2943 
2944 	/*
2945 	 * If we are not connected because we have never been connected
2946 	 * we return ENOTCONN. If we have been connected (but are no longer
2947 	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
2948 	 * the EOF.
2949 	 *
2950 	 * An alternative would be to post an ENOTCONN error in stream head
2951 	 * (read+write) and clear it when we're connected. However, that error
2952 	 * would cause incorrect poll/select behavior!
2953 	 */
2954 	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
2955 	    (so->so_mode & SM_CONNREQUIRED)) {
2956 		return (ENOTCONN);
2957 	}
2958 
2959 	/*
2960 	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
2961 	 * after checking that the read queue is empty) and returns zero.
2962 	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
2963 	 * is zero.
2964 	 */
2965 
2966 	if (flags & MSG_OOB) {
2967 		/* Check that the transport supports OOB */
2968 		if (!(so->so_mode & SM_EXDATA))
2969 			return (EOPNOTSUPP);
2970 		so_update_attrs(so, SOACC);
2971 		return (sorecvoob(so, msg, uiop, flags,
2972 		    (so->so_options & SO_OOBINLINE)));
2973 	}
2974 
2975 	so_update_attrs(so, SOACC);
2976 
2977 	/*
2978 	 * Set msg_controllen and msg_namelen to zero here to make it
2979 	 * simpler in the cases that no control or name is returned.
2980 	 */
2981 	controllen = msg->msg_controllen;
2982 	namelen = msg->msg_namelen;
2983 	msg->msg_controllen = 0;
2984 	msg->msg_namelen = 0;
2985 
2986 	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
2987 	    namelen, controllen));
2988 
2989 	mutex_enter(&so->so_lock);
2990 	/*
2991 	 * Only one reader is allowed at any given time. This is needed
2992 	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
2993 	 *
2994 	 * This is slightly different that BSD behavior in that it fails with
2995 	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
2996 	 * is single-threaded using sblock(), which is dropped while waiting
2997 	 * for data to appear. The difference shows up e.g. if one
2998 	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
2999 	 * does use nonblocking io and different threads are reading each
3000 	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3001 	 * in this case as long as the read queue doesn't get empty.
3002 	 * In this implementation the thread using nonblocking io can
3003 	 * get an EWOULDBLOCK error due to the blocking thread executing
3004 	 * e.g. in the uiomove in kstrgetmsg.
3005 	 * This difference is not believed to be significant.
3006 	 */
3007 	/* Set SOREADLOCKED */
3008 	error = so_lock_read_intr(so,
3009 	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3010 	mutex_exit(&so->so_lock);
3011 	if (error)
3012 		return (error);
3013 
3014 	/*
3015 	 * Tell kstrgetmsg to not inspect the stream head errors until all
3016 	 * queued data has been consumed.
3017 	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3018 	 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3019 	 *
3020 	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3021 	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3022 	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3023 	 */
3024 	pflag = MSG_ANY | MSG_DELAYERROR;
3025 	if (flags & MSG_PEEK) {
3026 		pflag |= MSG_IPEEK;
3027 		flags &= ~MSG_WAITALL;
3028 	}
3029 	if (so->so_mode & SM_ATOMIC)
3030 		pflag |= MSG_DISCARDTAIL;
3031 
3032 	if (flags & MSG_DONTWAIT)
3033 		timout = 0;
3034 	else if (so->so_rcvtimeo != 0)
3035 		timout = TICK_TO_MSEC(so->so_rcvtimeo);
3036 	else
3037 		timout = -1;
3038 	opflag = pflag;
3039 retry:
3040 	saved_resid = uiop->uio_resid;
3041 	pri = 0;
3042 	mp = NULL;
3043 	error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3044 	    timout, &rval);
3045 	if (error != 0) {
3046 		/* kstrgetmsg returns ETIME when timeout expires */
3047 		if (error == ETIME)
3048 			error = EWOULDBLOCK;
3049 		goto out;
3050 	}
3051 	/*
3052 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3053 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
3054 	 */
3055 	ASSERT(!(rval.r_val1 & MORECTL));
3056 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3057 		msg->msg_flags |= MSG_TRUNC;
3058 
3059 	if (mp == NULL) {
3060 		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3061 		/*
3062 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3063 		 * The draft Posix socket spec states that the mark should
3064 		 * not be cleared when peeking. We follow the latter.
3065 		 */
3066 		if ((so->so_state &
3067 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3068 		    (uiop->uio_resid != saved_resid) &&
3069 		    !(flags & MSG_PEEK)) {
3070 			sorecv_update_oobstate(so);
3071 		}
3072 
3073 		mutex_enter(&so->so_lock);
3074 		/* Set MSG_EOR based on MOREDATA */
3075 		if (!(rval.r_val1 & MOREDATA)) {
3076 			if (so->so_state & SS_SAVEDEOR) {
3077 				msg->msg_flags |= MSG_EOR;
3078 				so->so_state &= ~SS_SAVEDEOR;
3079 			}
3080 		}
3081 		/*
3082 		 * If some data was received (i.e. not EOF) and the
3083 		 * read/recv* has not been satisfied wait for some more.
3084 		 */
3085 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3086 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3087 			mutex_exit(&so->so_lock);
3088 			pflag = opflag | MSG_NOMARK;
3089 			goto retry;
3090 		}
3091 		goto out_locked;
3092 	}
3093 
3094 	/* strsock_proto has already verified length and alignment */
3095 	tpr = (union T_primitives *)mp->b_rptr;
3096 	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3097 
3098 	switch (tpr->type) {
3099 	case T_DATA_IND: {
3100 		if ((so->so_state &
3101 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3102 		    (uiop->uio_resid != saved_resid) &&
3103 		    !(flags & MSG_PEEK)) {
3104 			sorecv_update_oobstate(so);
3105 		}
3106 
3107 		/*
3108 		 * Set msg_flags to MSG_EOR based on
3109 		 * MORE_flag and MOREDATA.
3110 		 */
3111 		mutex_enter(&so->so_lock);
3112 		so->so_state &= ~SS_SAVEDEOR;
3113 		if (!(tpr->data_ind.MORE_flag & 1)) {
3114 			if (!(rval.r_val1 & MOREDATA))
3115 				msg->msg_flags |= MSG_EOR;
3116 			else
3117 				so->so_state |= SS_SAVEDEOR;
3118 		}
3119 		freemsg(mp);
3120 		/*
3121 		 * If some data was received (i.e. not EOF) and the
3122 		 * read/recv* has not been satisfied wait for some more.
3123 		 */
3124 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3125 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3126 			mutex_exit(&so->so_lock);
3127 			pflag = opflag | MSG_NOMARK;
3128 			goto retry;
3129 		}
3130 		goto out_locked;
3131 	}
3132 	case T_UNITDATA_IND: {
3133 		void *addr;
3134 		t_uscalar_t addrlen;
3135 		void *abuf;
3136 		t_uscalar_t optlen;
3137 		void *opt;
3138 
3139 		if ((so->so_state &
3140 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3141 		    (uiop->uio_resid != saved_resid) &&
3142 		    !(flags & MSG_PEEK)) {
3143 			sorecv_update_oobstate(so);
3144 		}
3145 
3146 		if (namelen != 0) {
3147 			/* Caller wants source address */
3148 			addrlen = tpr->unitdata_ind.SRC_length;
3149 			addr = sogetoff(mp,
3150 			    tpr->unitdata_ind.SRC_offset,
3151 			    addrlen, 1);
3152 			if (addr == NULL) {
3153 				freemsg(mp);
3154 				error = EPROTO;
3155 				eprintsoline(so, error);
3156 				goto out;
3157 			}
3158 			if (so->so_family == AF_UNIX) {
3159 				/*
3160 				 * Can not use the transport level address.
3161 				 * If there is a SO_SRCADDR option carrying
3162 				 * the socket level address it will be
3163 				 * extracted below.
3164 				 */
3165 				addr = NULL;
3166 				addrlen = 0;
3167 			}
3168 		}
3169 		optlen = tpr->unitdata_ind.OPT_length;
3170 		if (optlen != 0) {
3171 			t_uscalar_t ncontrollen;
3172 
3173 			/*
3174 			 * Extract any source address option.
3175 			 * Determine how large cmsg buffer is needed.
3176 			 */
3177 			opt = sogetoff(mp,
3178 			    tpr->unitdata_ind.OPT_offset,
3179 			    optlen, __TPI_ALIGN_SIZE);
3180 
3181 			if (opt == NULL) {
3182 				freemsg(mp);
3183 				error = EPROTO;
3184 				eprintsoline(so, error);
3185 				goto out;
3186 			}
3187 			if (so->so_family == AF_UNIX)
3188 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3189 			ncontrollen = so_cmsglen(mp, opt, optlen,
3190 			    !(flags & MSG_XPG4_2));
3191 			if (controllen != 0)
3192 				controllen = ncontrollen;
3193 			else if (ncontrollen != 0)
3194 				msg->msg_flags |= MSG_CTRUNC;
3195 		} else {
3196 			controllen = 0;
3197 		}
3198 
3199 		if (namelen != 0) {
3200 			/*
3201 			 * Return address to caller.
3202 			 * Caller handles truncation if length
3203 			 * exceeds msg_namelen.
3204 			 * NOTE: AF_UNIX NUL termination is ensured by
3205 			 * the sender's copyin_name().
3206 			 */
3207 			abuf = kmem_alloc(addrlen, KM_SLEEP);
3208 
3209 			bcopy(addr, abuf, addrlen);
3210 			msg->msg_name = abuf;
3211 			msg->msg_namelen = addrlen;
3212 		}
3213 
3214 		if (controllen != 0) {
3215 			/*
3216 			 * Return control msg to caller.
3217 			 * Caller handles truncation if length
3218 			 * exceeds msg_controllen.
3219 			 */
3220 			control = kmem_zalloc(controllen, KM_SLEEP);
3221 
3222 			error = so_opt2cmsg(mp, opt, optlen, flags, control,
3223 			    controllen);
3224 			if (error) {
3225 				freemsg(mp);
3226 				if (msg->msg_namelen != 0)
3227 					kmem_free(msg->msg_name,
3228 					    msg->msg_namelen);
3229 				kmem_free(control, controllen);
3230 				eprintsoline(so, error);
3231 				goto out;
3232 			}
3233 			msg->msg_control = control;
3234 			msg->msg_controllen = controllen;
3235 		}
3236 
3237 		freemsg(mp);
3238 		goto out;
3239 	}
3240 	case T_OPTDATA_IND: {
3241 		struct T_optdata_req *tdr;
3242 		void *opt;
3243 		t_uscalar_t optlen;
3244 
3245 		if ((so->so_state &
3246 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3247 		    (uiop->uio_resid != saved_resid) &&
3248 		    !(flags & MSG_PEEK)) {
3249 			sorecv_update_oobstate(so);
3250 		}
3251 
3252 		tdr = (struct T_optdata_req *)mp->b_rptr;
3253 		optlen = tdr->OPT_length;
3254 		if (optlen != 0) {
3255 			t_uscalar_t ncontrollen;
3256 			/*
3257 			 * Determine how large cmsg buffer is needed.
3258 			 */
3259 			opt = sogetoff(mp,
3260 			    tpr->optdata_ind.OPT_offset,
3261 			    optlen, __TPI_ALIGN_SIZE);
3262 
3263 			if (opt == NULL) {
3264 				freemsg(mp);
3265 				error = EPROTO;
3266 				eprintsoline(so, error);
3267 				goto out;
3268 			}
3269 
3270 			ncontrollen = so_cmsglen(mp, opt, optlen,
3271 			    !(flags & MSG_XPG4_2));
3272 			if (controllen != 0)
3273 				controllen = ncontrollen;
3274 			else if (ncontrollen != 0)
3275 				msg->msg_flags |= MSG_CTRUNC;
3276 		} else {
3277 			controllen = 0;
3278 		}
3279 
3280 		if (controllen != 0) {
3281 			/*
3282 			 * Return control msg to caller.
3283 			 * Caller handles truncation if length
3284 			 * exceeds msg_controllen.
3285 			 */
3286 			control = kmem_zalloc(controllen, KM_SLEEP);
3287 
3288 			error = so_opt2cmsg(mp, opt, optlen, flags, control,
3289 			    controllen);
3290 			if (error) {
3291 				freemsg(mp);
3292 				kmem_free(control, controllen);
3293 				eprintsoline(so, error);
3294 				goto out;
3295 			}
3296 			msg->msg_control = control;
3297 			msg->msg_controllen = controllen;
3298 		}
3299 
3300 		/*
3301 		 * Set msg_flags to MSG_EOR based on
3302 		 * DATA_flag and MOREDATA.
3303 		 */
3304 		mutex_enter(&so->so_lock);
3305 		so->so_state &= ~SS_SAVEDEOR;
3306 		if (!(tpr->data_ind.MORE_flag & 1)) {
3307 			if (!(rval.r_val1 & MOREDATA))
3308 				msg->msg_flags |= MSG_EOR;
3309 			else
3310 				so->so_state |= SS_SAVEDEOR;
3311 		}
3312 		freemsg(mp);
3313 		/*
3314 		 * If some data was received (i.e. not EOF) and the
3315 		 * read/recv* has not been satisfied wait for some more.
3316 		 * Not possible to wait if control info was received.
3317 		 */
3318 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3319 		    controllen == 0 &&
3320 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3321 			mutex_exit(&so->so_lock);
3322 			pflag = opflag | MSG_NOMARK;
3323 			goto retry;
3324 		}
3325 		goto out_locked;
3326 	}
3327 	case T_EXDATA_IND: {
3328 		dprintso(so, 1,
3329 		    ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3330 		    "state %s\n",
3331 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3332 		    saved_resid - uiop->uio_resid,
3333 		    pr_state(so->so_state, so->so_mode)));
3334 		/*
3335 		 * kstrgetmsg handles MSGMARK so there is nothing to
3336 		 * inspect in the T_EXDATA_IND.
3337 		 * strsock_proto makes the stream head queue the T_EXDATA_IND
3338 		 * as a separate message with no M_DATA component. Furthermore,
3339 		 * the stream head does not consolidate M_DATA messages onto
3340 		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3341 		 * remains a message by itself. This is needed since MSGMARK
3342 		 * marks both the whole message as well as the last byte
3343 		 * of the message.
3344 		 */
3345 		freemsg(mp);
3346 		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
3347 		if (flags & MSG_PEEK) {
3348 			/*
3349 			 * Even though we are peeking we consume the
3350 			 * T_EXDATA_IND thereby moving the mark information
3351 			 * to SS_RCVATMARK. Then the oob code below will
3352 			 * retry the peeking kstrgetmsg.
3353 			 * Note that the stream head read queue is
3354 			 * never flushed without holding SOREADLOCKED
3355 			 * thus the T_EXDATA_IND can not disappear
3356 			 * underneath us.
3357 			 */
3358 			dprintso(so, 1,
3359 			    ("sotpi_recvmsg: consume EXDATA_IND "
3360 			    "counts %d/%d state %s\n",
3361 			    sti->sti_oobsigcnt,
3362 			    sti->sti_oobcnt,
3363 			    pr_state(so->so_state, so->so_mode)));
3364 
3365 			pflag = MSG_ANY | MSG_DELAYERROR;
3366 			if (so->so_mode & SM_ATOMIC)
3367 				pflag |= MSG_DISCARDTAIL;
3368 
3369 			pri = 0;
3370 			mp = NULL;
3371 
3372 			error = kstrgetmsg(SOTOV(so), &mp, uiop,
3373 			    &pri, &pflag, (clock_t)-1, &rval);
3374 			ASSERT(uiop->uio_resid == saved_resid);
3375 
3376 			if (error) {
3377 #ifdef SOCK_DEBUG
3378 				if (error != EWOULDBLOCK && error != EINTR) {
3379 					eprintsoline(so, error);
3380 				}
3381 #endif /* SOCK_DEBUG */
3382 				goto out;
3383 			}
3384 			ASSERT(mp);
3385 			tpr = (union T_primitives *)mp->b_rptr;
3386 			ASSERT(tpr->type == T_EXDATA_IND);
3387 			freemsg(mp);
3388 		} /* end "if (flags & MSG_PEEK)" */
3389 
3390 		/*
3391 		 * Decrement the number of queued and pending oob.
3392 		 *
3393 		 * SS_RCVATMARK is cleared when we read past a mark.
3394 		 * SS_HAVEOOBDATA is cleared when we've read past the
3395 		 * last mark.
3396 		 * SS_OOBPEND is cleared if we've read past the last
3397 		 * mark and no (new) SIGURG has been posted.
3398 		 */
3399 		mutex_enter(&so->so_lock);
3400 		ASSERT(so_verify_oobstate(so));
3401 		ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3402 		ASSERT(sti->sti_oobsigcnt > 0);
3403 		sti->sti_oobsigcnt--;
3404 		ASSERT(sti->sti_oobcnt > 0);
3405 		sti->sti_oobcnt--;
3406 		/*
3407 		 * Since the T_EXDATA_IND has been removed from the stream
3408 		 * head, but we have not read data past the mark,
3409 		 * sockfs needs to track that the socket is still at the mark.
3410 		 *
3411 		 * Since no data was received call kstrgetmsg again to wait
3412 		 * for data.
3413 		 */
3414 		so->so_state |= SS_RCVATMARK;
3415 		mutex_exit(&so->so_lock);
3416 		dprintso(so, 1,
3417 		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3418 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3419 		    pr_state(so->so_state, so->so_mode)));
3420 		pflag = opflag;
3421 		goto retry;
3422 	}
3423 	default:
3424 		cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3425 		    (void *)so, tpr->type, (void *)mp);
3426 		ASSERT(0);
3427 		freemsg(mp);
3428 		error = EPROTO;
3429 		eprintsoline(so, error);
3430 		goto out;
3431 	}
3432 	/* NOTREACHED */
3433 out:
3434 	mutex_enter(&so->so_lock);
3435 out_locked:
3436 	so_unlock_read(so);	/* Clear SOREADLOCKED */
3437 	mutex_exit(&so->so_lock);
3438 	return (error);
3439 }
3440 
3441 /*
3442  * Sending data with options on a datagram socket.
3443  * Assumes caller has verified that SS_ISBOUND etc. are set.
3444  *
3445  * For AF_UNIX the destination address may be already in
3446  * internal form, as indicated by sti->sti_faddr_noxlate
3447  * or the MSG_SENDTO_NOXLATE flag.  Otherwise we need to
3448  * translate the destination address to internal form.
3449  *
3450  * The source address is passed as an option.  If passing
3451  * file descriptors, those are passed as file pointers in
3452  * another option.
3453  */
3454 static int
3455 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3456     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3457 {
3458 	struct T_unitdata_req	tudr;
3459 	mblk_t			*mp;
3460 	int			error;
3461 	void			*addr;
3462 	socklen_t		addrlen;
3463 	void			*src;
3464 	socklen_t		srclen;
3465 	ssize_t			len;
3466 	int			size;
3467 	struct T_opthdr		toh;
3468 	struct fdbuf		*fdbuf;
3469 	t_uscalar_t		optlen;
3470 	void			*fds;
3471 	int			fdlen;
3472 	sotpi_info_t		*sti = SOTOTPI(so);
3473 
3474 	ASSERT(name && namelen);
3475 	ASSERT(control && controllen);
3476 
3477 	len = uiop->uio_resid;
3478 	if (len > (ssize_t)sti->sti_tidu_size) {
3479 		return (EMSGSIZE);
3480 	}
3481 
3482 	if (sti->sti_faddr_noxlate == 0 &&
3483 	    (flags & MSG_SENDTO_NOXLATE) == 0) {
3484 		/*
3485 		 * Length and family checks.
3486 		 * Don't verify internal form.
3487 		 */
3488 		error = so_addr_verify(so, name, namelen);
3489 		if (error) {
3490 			eprintsoline(so, error);
3491 			return (error);
3492 		}
3493 	}
3494 
3495 	if (so->so_family == AF_UNIX) {
3496 		if (sti->sti_faddr_noxlate) {
3497 			/*
3498 			 * Already have a transport internal address. Do not
3499 			 * pass any (transport internal) source address.
3500 			 */
3501 			addr = name;
3502 			addrlen = namelen;
3503 			src = NULL;
3504 			srclen = 0;
3505 		} else if (flags & MSG_SENDTO_NOXLATE) {
3506 			/*
3507 			 * Have an internal form dest. address.
3508 			 * Pass the source address as usual.
3509 			 */
3510 			addr = name;
3511 			addrlen = namelen;
3512 			src = sti->sti_laddr_sa;
3513 			srclen = (socklen_t)sti->sti_laddr_len;
3514 		} else {
3515 			/*
3516 			 * Pass the sockaddr_un source address as an option
3517 			 * and translate the remote address.
3518 			 *
3519 			 * Note that this code does not prevent sti_laddr_sa
3520 			 * from changing while it is being used. Thus
3521 			 * if an unbind+bind occurs concurrently with this
3522 			 * send the peer might see a partially new and a
3523 			 * partially old "from" address.
3524 			 */
3525 			src = sti->sti_laddr_sa;
3526 			srclen = (socklen_t)sti->sti_laddr_len;
3527 			dprintso(so, 1,
3528 			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3529 			    srclen, src));
3530 			/*
3531 			 * The sendmsg caller specified a destination
3532 			 * address, which we must translate into our
3533 			 * internal form.  addr = &sti->sti_ux_taddr
3534 			 */
3535 			error = so_ux_addr_xlate(so, name, namelen,
3536 			    (flags & MSG_XPG4_2),
3537 			    &addr, &addrlen);
3538 			if (error) {
3539 				eprintsoline(so, error);
3540 				return (error);
3541 			}
3542 		}
3543 	} else {
3544 		addr = name;
3545 		addrlen = namelen;
3546 		src = NULL;
3547 		srclen = 0;
3548 	}
3549 	optlen = so_optlen(control, controllen,
3550 	    !(flags & MSG_XPG4_2));
3551 	tudr.PRIM_type = T_UNITDATA_REQ;
3552 	tudr.DEST_length = addrlen;
3553 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3554 	if (srclen != 0)
3555 		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3556 		    _TPI_ALIGN_TOPT(srclen));
3557 	else
3558 		tudr.OPT_length = optlen;
3559 	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3560 	    _TPI_ALIGN_TOPT(addrlen));
3561 
3562 	size = tudr.OPT_offset + tudr.OPT_length;
3563 
3564 	/*
3565 	 * File descriptors only when SM_FDPASSING set.
3566 	 */
3567 	error = so_getfdopt(control, controllen,
3568 	    !(flags & MSG_XPG4_2), &fds, &fdlen);
3569 	if (error)
3570 		return (error);
3571 	if (fdlen != -1) {
3572 		if (!(so->so_mode & SM_FDPASSING))
3573 			return (EOPNOTSUPP);
3574 
3575 		error = fdbuf_create(fds, fdlen, &fdbuf);
3576 		if (error)
3577 			return (error);
3578 
3579 		/*
3580 		 * Pre-allocate enough additional space for lower level modules
3581 		 * to append an option (e.g. see tl_unitdata). The following
3582 		 * is enough extra space for the largest option we might append.
3583 		 */
3584 		size += sizeof (struct T_opthdr) + ucredsize;
3585 		mp = fdbuf_allocmsg(size, fdbuf);
3586 	} else {
3587 		mp = soallocproto(size, _ALLOC_INTR, CRED());
3588 		if (mp == NULL) {
3589 			/*
3590 			 * Caught a signal waiting for memory.
3591 			 * Let send* return EINTR.
3592 			 */
3593 			return (EINTR);
3594 		}
3595 	}
3596 	soappendmsg(mp, &tudr, sizeof (tudr));
3597 	soappendmsg(mp, addr, addrlen);
3598 	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3599 
3600 	if (fdlen != -1) {
3601 		ASSERT(fdbuf != NULL);
3602 		toh.level = SOL_SOCKET;
3603 		toh.name = SO_FILEP;
3604 		toh.len = fdbuf->fd_size +
3605 		    (t_uscalar_t)sizeof (struct T_opthdr);
3606 		toh.status = 0;
3607 		soappendmsg(mp, &toh, sizeof (toh));
3608 		soappendmsg(mp, fdbuf, fdbuf->fd_size);
3609 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3610 	}
3611 	if (srclen != 0) {
3612 		/*
3613 		 * There is a AF_UNIX sockaddr_un to include as a source
3614 		 * address option.
3615 		 */
3616 		toh.level = SOL_SOCKET;
3617 		toh.name = SO_SRCADDR;
3618 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3619 		toh.status = 0;
3620 		soappendmsg(mp, &toh, sizeof (toh));
3621 		soappendmsg(mp, src, srclen);
3622 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3623 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3624 	}
3625 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3626 	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3627 	/*
3628 	 * Normally at most 3 bytes left in the message, but we might have
3629 	 * allowed for extra space if we're passing fd's through.
3630 	 */
3631 	ASSERT(MBLKL(mp) <= (ssize_t)size);
3632 
3633 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3634 	if (AU_AUDITING())
3635 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3636 
3637 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3638 #ifdef SOCK_DEBUG
3639 	if (error) {
3640 		eprintsoline(so, error);
3641 	}
3642 #endif /* SOCK_DEBUG */
3643 	return (error);
3644 }
3645 
3646 /*
3647  * Sending data with options on a connected stream socket.
3648  * Assumes caller has verified that SS_ISCONNECTED is set.
3649  */
3650 static int
3651 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3652     t_uscalar_t controllen, int flags)
3653 {
3654 	struct T_optdata_req	tdr;
3655 	mblk_t			*mp;
3656 	int			error;
3657 	ssize_t			iosize;
3658 	int			size;
3659 	struct fdbuf		*fdbuf;
3660 	t_uscalar_t		optlen;
3661 	void			*fds;
3662 	int			fdlen;
3663 	struct T_opthdr		toh;
3664 	sotpi_info_t		*sti = SOTOTPI(so);
3665 
3666 	dprintso(so, 1,
3667 	    ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3668 
3669 	/*
3670 	 * Has to be bound and connected. However, since no locks are
3671 	 * held the state could have changed after sotpi_sendmsg checked it
3672 	 * thus it is not possible to ASSERT on the state.
3673 	 */
3674 
3675 	/* Options on connection-oriented only when SM_OPTDATA set. */
3676 	if (!(so->so_mode & SM_OPTDATA))
3677 		return (EOPNOTSUPP);
3678 
3679 	do {
3680 		/*
3681 		 * Set the MORE flag if uio_resid does not fit in this
3682 		 * message or if the caller passed in "more".
3683 		 * Error for transports with zero tidu_size.
3684 		 */
3685 		tdr.PRIM_type = T_OPTDATA_REQ;
3686 		iosize = sti->sti_tidu_size;
3687 		if (iosize <= 0)
3688 			return (EMSGSIZE);
3689 		if (uiop->uio_resid > iosize) {
3690 			tdr.DATA_flag = 1;
3691 		} else {
3692 			if (more)
3693 				tdr.DATA_flag = 1;
3694 			else
3695 				tdr.DATA_flag = 0;
3696 			iosize = uiop->uio_resid;
3697 		}
3698 		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3699 		    tdr.DATA_flag, iosize));
3700 
3701 		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3702 		tdr.OPT_length = optlen;
3703 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3704 
3705 		size = (int)sizeof (tdr) + optlen;
3706 		/*
3707 		 * File descriptors only when SM_FDPASSING set.
3708 		 */
3709 		error = so_getfdopt(control, controllen,
3710 		    !(flags & MSG_XPG4_2), &fds, &fdlen);
3711 		if (error)
3712 			return (error);
3713 		if (fdlen != -1) {
3714 			if (!(so->so_mode & SM_FDPASSING))
3715 				return (EOPNOTSUPP);
3716 
3717 			error = fdbuf_create(fds, fdlen, &fdbuf);
3718 			if (error)
3719 				return (error);
3720 
3721 			/*
3722 			 * Pre-allocate enough additional space for lower level
3723 			 * modules to append an option (e.g. see tl_unitdata).
3724 			 * The following is enough extra space for the largest
3725 			 * option we might append.
3726 			 */
3727 			size += sizeof (struct T_opthdr) + ucredsize;
3728 			mp = fdbuf_allocmsg(size, fdbuf);
3729 		} else {
3730 			mp = soallocproto(size, _ALLOC_INTR, CRED());
3731 			if (mp == NULL) {
3732 				/*
3733 				 * Caught a signal waiting for memory.
3734 				 * Let send* return EINTR.
3735 				 */
3736 				return (EINTR);
3737 			}
3738 		}
3739 		soappendmsg(mp, &tdr, sizeof (tdr));
3740 
3741 		if (fdlen != -1) {
3742 			ASSERT(fdbuf != NULL);
3743 			toh.level = SOL_SOCKET;
3744 			toh.name = SO_FILEP;
3745 			toh.len = fdbuf->fd_size +
3746 			    (t_uscalar_t)sizeof (struct T_opthdr);
3747 			toh.status = 0;
3748 			soappendmsg(mp, &toh, sizeof (toh));
3749 			soappendmsg(mp, fdbuf, fdbuf->fd_size);
3750 			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3751 		}
3752 		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3753 		/*
3754 		 * Normally at most 3 bytes left in the message, but we might
3755 		 * have allowed for extra space if we're passing fd's through.
3756 		 */
3757 		ASSERT(MBLKL(mp) <= (ssize_t)size);
3758 
3759 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3760 
3761 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3762 		    0, MSG_BAND, 0);
3763 		if (error) {
3764 			eprintsoline(so, error);
3765 			return (error);
3766 		}
3767 		control = NULL;
3768 		if (uiop->uio_resid > 0) {
3769 			/*
3770 			 * Recheck for fatal errors. Fail write even though
3771 			 * some data have been written. This is consistent
3772 			 * with strwrite semantics and BSD sockets semantics.
3773 			 */
3774 			if (so->so_state & SS_CANTSENDMORE) {
3775 				eprintsoline(so, error);
3776 				return (EPIPE);
3777 			}
3778 			if (so->so_error != 0) {
3779 				mutex_enter(&so->so_lock);
3780 				error = sogeterr(so, B_TRUE);
3781 				mutex_exit(&so->so_lock);
3782 				if (error != 0) {
3783 					eprintsoline(so, error);
3784 					return (error);
3785 				}
3786 			}
3787 		}
3788 	} while (uiop->uio_resid > 0);
3789 	return (0);
3790 }
3791 
3792 /*
3793  * Sending data on a datagram socket.
3794  * Assumes caller has verified that SS_ISBOUND etc. are set.
3795  *
3796  * For AF_UNIX the destination address may be already in
3797  * internal form, as indicated by sti->sti_faddr_noxlate
3798  * or the MSG_SENDTO_NOXLATE flag.  Otherwise we need to
3799  * translate the destination address to internal form.
3800  *
3801  * The source address is passed as an option.
3802  */
3803 int
3804 sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
3805     struct uio *uiop, int flags)
3806 {
3807 	struct T_unitdata_req	tudr;
3808 	mblk_t			*mp;
3809 	int			error;
3810 	void			*addr;
3811 	socklen_t		addrlen;
3812 	void			*src;
3813 	socklen_t		srclen;
3814 	ssize_t			len;
3815 	sotpi_info_t		*sti = SOTOTPI(so);
3816 
3817 	ASSERT(name != NULL && namelen != 0);
3818 
3819 	len = uiop->uio_resid;
3820 	if (len > sti->sti_tidu_size) {
3821 		error = EMSGSIZE;
3822 		goto done;
3823 	}
3824 
3825 	if (sti->sti_faddr_noxlate == 0 &&
3826 	    (flags & MSG_SENDTO_NOXLATE) == 0) {
3827 		/*
3828 		 * Length and family checks.
3829 		 * Don't verify internal form.
3830 		 */
3831 		error = so_addr_verify(so, name, namelen);
3832 		if (error != 0)
3833 			goto done;
3834 	}
3835 
3836 	if (sti->sti_direct)	/* Never on AF_UNIX */
3837 		return (sodgram_direct(so, name, namelen, uiop, flags));
3838 
3839 	if (so->so_family == AF_UNIX) {
3840 		if (sti->sti_faddr_noxlate) {
3841 			/*
3842 			 * Already have a transport internal address. Do not
3843 			 * pass any (transport internal) source address.
3844 			 */
3845 			addr = name;
3846 			addrlen = namelen;
3847 			src = NULL;
3848 			srclen = 0;
3849 		} else if (flags & MSG_SENDTO_NOXLATE) {
3850 			/*
3851 			 * Have an internal form dest. address.
3852 			 * Pass the source address as usual.
3853 			 */
3854 			addr = name;
3855 			addrlen = namelen;
3856 			src = sti->sti_laddr_sa;
3857 			srclen = (socklen_t)sti->sti_laddr_len;
3858 		} else {
3859 			/*
3860 			 * Pass the sockaddr_un source address as an option
3861 			 * and translate the remote address.
3862 			 *
3863 			 * Note that this code does not prevent sti_laddr_sa
3864 			 * from changing while it is being used. Thus
3865 			 * if an unbind+bind occurs concurrently with this
3866 			 * send the peer might see a partially new and a
3867 			 * partially old "from" address.
3868 			 */
3869 			src = sti->sti_laddr_sa;
3870 			srclen = (socklen_t)sti->sti_laddr_len;
3871 			dprintso(so, 1,
3872 			    ("sosend_dgram UNIX: srclen %d, src %p\n",
3873 			    srclen, src));
3874 			/*
3875 			 * The sendmsg caller specified a destination
3876 			 * address, which we must translate into our
3877 			 * internal form.  addr = &sti->sti_ux_taddr
3878 			 */
3879 			error = so_ux_addr_xlate(so, name, namelen,
3880 			    (flags & MSG_XPG4_2),
3881 			    &addr, &addrlen);
3882 			if (error) {
3883 				eprintsoline(so, error);
3884 				goto done;
3885 			}
3886 		}
3887 	} else {
3888 		addr = name;
3889 		addrlen = namelen;
3890 		src = NULL;
3891 		srclen = 0;
3892 	}
3893 	tudr.PRIM_type = T_UNITDATA_REQ;
3894 	tudr.DEST_length = addrlen;
3895 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3896 	if (srclen == 0) {
3897 		tudr.OPT_length = 0;
3898 		tudr.OPT_offset = 0;
3899 
3900 		mp = soallocproto2(&tudr, sizeof (tudr),
3901 		    addr, addrlen, 0, _ALLOC_INTR, CRED());
3902 		if (mp == NULL) {
3903 			/*
3904 			 * Caught a signal waiting for memory.
3905 			 * Let send* return EINTR.
3906 			 */
3907 			error = EINTR;
3908 			goto done;
3909 		}
3910 	} else {
3911 		/*
3912 		 * There is a AF_UNIX sockaddr_un to include as a source
3913 		 * address option.
3914 		 */
3915 		struct T_opthdr toh;
3916 		ssize_t size;
3917 
3918 		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
3919 		    _TPI_ALIGN_TOPT(srclen));
3920 		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3921 		    _TPI_ALIGN_TOPT(addrlen));
3922 
3923 		toh.level = SOL_SOCKET;
3924 		toh.name = SO_SRCADDR;
3925 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3926 		toh.status = 0;
3927 
3928 		size = tudr.OPT_offset + tudr.OPT_length;
3929 		mp = soallocproto2(&tudr, sizeof (tudr),
3930 		    addr, addrlen, size, _ALLOC_INTR, CRED());
3931 		if (mp == NULL) {
3932 			/*
3933 			 * Caught a signal waiting for memory.
3934 			 * Let send* return EINTR.
3935 			 */
3936 			error = EINTR;
3937 			goto done;
3938 		}
3939 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3940 		soappendmsg(mp, &toh, sizeof (toh));
3941 		soappendmsg(mp, src, srclen);
3942 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3943 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3944 	}
3945 
3946 	if (AU_AUDITING())
3947 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3948 
3949 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3950 done:
3951 #ifdef SOCK_DEBUG
3952 	if (error) {
3953 		eprintsoline(so, error);
3954 	}
3955 #endif /* SOCK_DEBUG */
3956 	return (error);
3957 }
3958 
3959 /*
3960  * Sending data on a connected stream socket.
3961  * Assumes caller has verified that SS_ISCONNECTED is set.
3962  */
3963 int
3964 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
3965     int sflag)
3966 {
3967 	struct T_data_req	tdr;
3968 	mblk_t			*mp;
3969 	int			error;
3970 	ssize_t			iosize;
3971 	sotpi_info_t		*sti = SOTOTPI(so);
3972 
3973 	dprintso(so, 1,
3974 	    ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
3975 	    (void *)so, uiop->uio_resid, prim, sflag));
3976 
3977 	/*
3978 	 * Has to be bound and connected. However, since no locks are
3979 	 * held the state could have changed after sotpi_sendmsg checked it
3980 	 * thus it is not possible to ASSERT on the state.
3981 	 */
3982 
3983 	do {
3984 		/*
3985 		 * Set the MORE flag if uio_resid does not fit in this
3986 		 * message or if the caller passed in "more".
3987 		 * Error for transports with zero tidu_size.
3988 		 */
3989 		tdr.PRIM_type = prim;
3990 		iosize = sti->sti_tidu_size;
3991 		if (iosize <= 0)
3992 			return (EMSGSIZE);
3993 		if (uiop->uio_resid > iosize) {
3994 			tdr.MORE_flag = 1;
3995 		} else {
3996 			if (more)
3997 				tdr.MORE_flag = 1;
3998 			else
3999 				tdr.MORE_flag = 0;
4000 			iosize = uiop->uio_resid;
4001 		}
4002 		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4003 		    prim, tdr.MORE_flag, iosize));
4004 		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
4005 		if (mp == NULL) {
4006 			/*
4007 			 * Caught a signal waiting for memory.
4008 			 * Let send* return EINTR.
4009 			 */
4010 			return (EINTR);
4011 		}
4012 
4013 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4014 		    0, sflag | MSG_BAND, 0);
4015 		if (error) {
4016 			eprintsoline(so, error);
4017 			return (error);
4018 		}
4019 		if (uiop->uio_resid > 0) {
4020 			/*
4021 			 * Recheck for fatal errors. Fail write even though
4022 			 * some data have been written. This is consistent
4023 			 * with strwrite semantics and BSD sockets semantics.
4024 			 */
4025 			if (so->so_state & SS_CANTSENDMORE) {
4026 				eprintsoline(so, error);
4027 				return (EPIPE);
4028 			}
4029 			if (so->so_error != 0) {
4030 				mutex_enter(&so->so_lock);
4031 				error = sogeterr(so, B_TRUE);
4032 				mutex_exit(&so->so_lock);
4033 				if (error != 0) {
4034 					eprintsoline(so, error);
4035 					return (error);
4036 				}
4037 			}
4038 		}
4039 	} while (uiop->uio_resid > 0);
4040 	return (0);
4041 }
4042 
4043 /*
4044  * Check the state for errors and call the appropriate send function.
4045  *
4046  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4047  * this function issues a setsockopt to toggle SO_DONTROUTE before and
4048  * after sending the message.
4049  *
4050  * The caller may optionally specify a destination address, for either
4051  * stream or datagram sockets.  This table summarizes the cases:
4052  *
4053  *    Socket type    Dest. given    Connected    Result
4054  *    -----------    -----------    ---------    --------------
4055  *    Stream         *              Yes	         send to conn. addr.
4056  *    Stream         *              No           error ENOTCONN
4057  *    Dgram          yes            *            send to given addr.
4058  *    Dgram          no             yes          send to conn. addr.
4059  *    Dgram          no             no	         error EDESTADDRREQ
4060  *
4061  * There are subtleties around the destination address when using
4062  * AF_UNIX datagram sockets.  When the sendmsg call specifies the
4063  * destination address, it's in (struct sockaddr_un) form and we
4064  * need to translate it to our internal form (struct so_ux_addr).
4065  *
4066  * When the sendmsg call does not specify a destination address
4067  * we're using the peer address saved during sotpi_connect, and
4068  * that address is already in internal form.  In this case, the
4069  * (internal only) flag MSG_SENDTO_NOXLATE is set in the flags
4070  * passed to sosend_dgram or sosend_dgramcmsg to indicate that
4071  * those functions should skip translation to internal form.
4072  * Avoiding that translation is not only more efficient, but it's
4073  * also necessary when a process does a connect on an AF_UNIX
4074  * datagram socket and then drops privileges.  After the process
4075  * has dropped privileges, it may no longer be able to lookup the
4076  * the external name in the filesystem, but it should still be
4077  * able to send messages on the connected socket by leaving the
4078  * destination name unspecified.
4079  *
4080  * Yet more subtleties arise with sockets connected by socketpair(),
4081  * which puts internal form addresses in the fields where normally
4082  * the external form is found, and sets sti_faddr_noxlate=1, which
4083  * (like flag MSG_SENDTO_NOXLATE) causes the sosend_dgram functions
4084  * to skip translation of destination addresses to internal form.
4085  * However, beware that the flag sti_faddr_noxlate=1 also triggers
4086  * different behaviour almost everywhere AF_UNIX addresses appear.
4087  */
4088 static int
4089 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
4090     struct cred *cr)
4091 {
4092 	int		so_state;
4093 	int		so_mode;
4094 	int		error;
4095 	struct sockaddr *name;
4096 	t_uscalar_t	namelen;
4097 	int		dontroute;
4098 	int		flags;
4099 	sotpi_info_t	*sti = SOTOTPI(so);
4100 
4101 	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4102 	    (void *)so, (void *)msg, msg->msg_flags,
4103 	    pr_state(so->so_state, so->so_mode), so->so_error));
4104 
4105 	if (so->so_version == SOV_STREAM) {
4106 		/* The imaginary "sockmod" has been popped - act as a stream */
4107 		so_update_attrs(so, SOMOD);
4108 		return (strwrite(SOTOV(so), uiop, cr));
4109 	}
4110 
4111 	mutex_enter(&so->so_lock);
4112 	so_state = so->so_state;
4113 
4114 	if (so_state & SS_CANTSENDMORE) {
4115 		mutex_exit(&so->so_lock);
4116 		return (EPIPE);
4117 	}
4118 
4119 	if (so->so_error != 0) {
4120 		error = sogeterr(so, B_TRUE);
4121 		if (error != 0) {
4122 			mutex_exit(&so->so_lock);
4123 			return (error);
4124 		}
4125 	}
4126 
4127 	name = (struct sockaddr *)msg->msg_name;
4128 	namelen = msg->msg_namelen;
4129 	flags = msg->msg_flags;
4130 
4131 	/*
4132 	 * Historically, this function does not validate the flags
4133 	 * passed in, and any errant bits are ignored.  However,
4134 	 * we would not want any such errant flag bits accidently
4135 	 * being treated as one of the internal-only flags, so
4136 	 * clear the internal-only flag bits.
4137 	 */
4138 	flags &= ~MSG_SENDTO_NOXLATE;
4139 
4140 	so_mode = so->so_mode;
4141 
4142 	if (name == NULL) {
4143 		if (!(so_state & SS_ISCONNECTED)) {
4144 			mutex_exit(&so->so_lock);
4145 			if (so_mode & SM_CONNREQUIRED)
4146 				return (ENOTCONN);
4147 			else
4148 				return (EDESTADDRREQ);
4149 		}
4150 		/*
4151 		 * This is a connected socket.
4152 		 */
4153 		if (so_mode & SM_CONNREQUIRED) {
4154 			/*
4155 			 * This is a connected STREAM socket,
4156 			 * destination not specified.
4157 			 */
4158 			name = NULL;
4159 			namelen = 0;
4160 		} else {
4161 			/*
4162 			 * Datagram send on connected socket with
4163 			 * the destination name not specified.
4164 			 * Use the peer address from connect.
4165 			 */
4166 			if (so->so_family == AF_UNIX) {
4167 				/*
4168 				 * Use the (internal form) address saved
4169 				 * in sotpi_connect.  See above.
4170 				 */
4171 				name = (void *)&sti->sti_ux_faddr;
4172 				namelen = sizeof (sti->sti_ux_faddr);
4173 				flags |= MSG_SENDTO_NOXLATE;
4174 			} else {
4175 				ASSERT(sti->sti_faddr_sa);
4176 				name = sti->sti_faddr_sa;
4177 				namelen = (t_uscalar_t)sti->sti_faddr_len;
4178 			}
4179 		}
4180 	} else {
4181 		/*
4182 		 * Sendmsg specifies a destination name
4183 		 */
4184 		if (!(so_state & SS_ISCONNECTED) &&
4185 		    (so_mode & SM_CONNREQUIRED)) {
4186 			/* i.e. TCP not connected */
4187 			mutex_exit(&so->so_lock);
4188 			return (ENOTCONN);
4189 		}
4190 		/*
4191 		 * Ignore the address on connection-oriented sockets.
4192 		 * Just like BSD this code does not generate an error for
4193 		 * TCP (a CONNREQUIRED socket) when sending to an address
4194 		 * passed in with sendto/sendmsg. Instead the data is
4195 		 * delivered on the connection as if no address had been
4196 		 * supplied.
4197 		 */
4198 		if ((so_state & SS_ISCONNECTED) &&
4199 		    !(so_mode & SM_CONNREQUIRED)) {
4200 			mutex_exit(&so->so_lock);
4201 			return (EISCONN);
4202 		}
4203 		if (!(so_state & SS_ISBOUND)) {
4204 			so_lock_single(so);	/* Set SOLOCKED */
4205 			error = sotpi_bind(so, NULL, 0,
4206 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4207 			so_unlock_single(so, SOLOCKED);
4208 			if (error) {
4209 				mutex_exit(&so->so_lock);
4210 				eprintsoline(so, error);
4211 				return (error);
4212 			}
4213 		}
4214 		/*
4215 		 * Handle delayed datagram errors. These are only queued
4216 		 * when the application sets SO_DGRAM_ERRIND.
4217 		 * Return the error if we are sending to the address
4218 		 * that was returned in the last T_UDERROR_IND.
4219 		 * If sending to some other address discard the delayed
4220 		 * error indication.
4221 		 */
4222 		if (sti->sti_delayed_error) {
4223 			struct T_uderror_ind	*tudi;
4224 			void			*addr;
4225 			t_uscalar_t		addrlen;
4226 			boolean_t		match = B_FALSE;
4227 
4228 			ASSERT(sti->sti_eaddr_mp);
4229 			error = sti->sti_delayed_error;
4230 			sti->sti_delayed_error = 0;
4231 			tudi =
4232 			    (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4233 			addrlen = tudi->DEST_length;
4234 			addr = sogetoff(sti->sti_eaddr_mp,
4235 			    tudi->DEST_offset, addrlen, 1);
4236 			ASSERT(addr);	/* Checked by strsock_proto */
4237 			switch (so->so_family) {
4238 			case AF_INET: {
4239 				/* Compare just IP address and port */
4240 				sin_t *sin1 = (sin_t *)name;
4241 				sin_t *sin2 = (sin_t *)addr;
4242 
4243 				if (addrlen == sizeof (sin_t) &&
4244 				    namelen == addrlen &&
4245 				    sin1->sin_port == sin2->sin_port &&
4246 				    sin1->sin_addr.s_addr ==
4247 				    sin2->sin_addr.s_addr)
4248 					match = B_TRUE;
4249 				break;
4250 			}
4251 			case AF_INET6: {
4252 				/* Compare just IP address and port. Not flow */
4253 				sin6_t *sin1 = (sin6_t *)name;
4254 				sin6_t *sin2 = (sin6_t *)addr;
4255 
4256 				if (addrlen == sizeof (sin6_t) &&
4257 				    namelen == addrlen &&
4258 				    sin1->sin6_port == sin2->sin6_port &&
4259 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4260 				    &sin2->sin6_addr))
4261 					match = B_TRUE;
4262 				break;
4263 			}
4264 			case AF_UNIX:
4265 			default:
4266 				if (namelen == addrlen &&
4267 				    bcmp(name, addr, namelen) == 0)
4268 					match = B_TRUE;
4269 			}
4270 			if (match) {
4271 				freemsg(sti->sti_eaddr_mp);
4272 				sti->sti_eaddr_mp = NULL;
4273 				mutex_exit(&so->so_lock);
4274 #ifdef DEBUG
4275 				dprintso(so, 0,
4276 				    ("sockfs delayed error %d for %s\n",
4277 				    error,
4278 				    pr_addr(so->so_family, name, namelen)));
4279 #endif /* DEBUG */
4280 				return (error);
4281 			}
4282 			freemsg(sti->sti_eaddr_mp);
4283 			sti->sti_eaddr_mp = NULL;
4284 		}
4285 	}
4286 	mutex_exit(&so->so_lock);
4287 
4288 	dontroute = 0;
4289 	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4290 		uint32_t	val;
4291 
4292 		val = 1;
4293 		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4294 		    &val, (t_uscalar_t)sizeof (val), cr);
4295 		if (error)
4296 			return (error);
4297 		dontroute = 1;
4298 	}
4299 
4300 	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4301 		error = EOPNOTSUPP;
4302 		goto done;
4303 	}
4304 	if (msg->msg_controllen != 0) {
4305 		if (!(so_mode & SM_CONNREQUIRED)) {
4306 			so_update_attrs(so, SOMOD);
4307 			error = sosend_dgramcmsg(so, name, namelen, uiop,
4308 			    msg->msg_control, msg->msg_controllen, flags);
4309 		} else {
4310 			if (flags & MSG_OOB) {
4311 				/* Can't generate T_EXDATA_REQ with options */
4312 				error = EOPNOTSUPP;
4313 				goto done;
4314 			}
4315 			so_update_attrs(so, SOMOD);
4316 			error = sosend_svccmsg(so, uiop,
4317 			    !(flags & MSG_EOR),
4318 			    msg->msg_control, msg->msg_controllen,
4319 			    flags);
4320 		}
4321 		goto done;
4322 	}
4323 
4324 	so_update_attrs(so, SOMOD);
4325 	if (!(so_mode & SM_CONNREQUIRED)) {
4326 		/*
4327 		 * If there is no SO_DONTROUTE to turn off return immediately
4328 		 * from send_dgram. This can allow tail-call optimizations.
4329 		 */
4330 		if (!dontroute) {
4331 			return (sosend_dgram(so, name, namelen, uiop, flags));
4332 		}
4333 		error = sosend_dgram(so, name, namelen, uiop, flags);
4334 	} else {
4335 		t_scalar_t prim;
4336 		int sflag;
4337 
4338 		/* Ignore msg_name in the connected state */
4339 		if (flags & MSG_OOB) {
4340 			prim = T_EXDATA_REQ;
4341 			/*
4342 			 * Send down T_EXDATA_REQ even if there is flow
4343 			 * control for data.
4344 			 */
4345 			sflag = MSG_IGNFLOW;
4346 		} else {
4347 			if (so_mode & SM_BYTESTREAM) {
4348 				/* Byte stream transport - use write */
4349 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4350 
4351 				/* Send M_DATA messages */
4352 				/*
4353 				 * If there is no SO_DONTROUTE to turn off,
4354 				 * sti_direct is on, and there is no flow
4355 				 * control, we can take the fast path.
4356 				 */
4357 				if (!dontroute && sti->sti_direct != 0 &&
4358 				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4359 					return (sostream_direct(so, uiop,
4360 					    NULL, cr));
4361 				}
4362 				error = strwrite(SOTOV(so), uiop, cr);
4363 				goto done;
4364 			}
4365 			prim = T_DATA_REQ;
4366 			sflag = 0;
4367 		}
4368 		/*
4369 		 * If there is no SO_DONTROUTE to turn off return immediately
4370 		 * from sosend_svc. This can allow tail-call optimizations.
4371 		 */
4372 		if (!dontroute)
4373 			return (sosend_svc(so, uiop, prim,
4374 			    !(flags & MSG_EOR), sflag));
4375 		error = sosend_svc(so, uiop, prim,
4376 		    !(flags & MSG_EOR), sflag);
4377 	}
4378 	ASSERT(dontroute);
4379 done:
4380 	if (dontroute) {
4381 		uint32_t	val;
4382 
4383 		val = 0;
4384 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4385 		    &val, (t_uscalar_t)sizeof (val), cr);
4386 	}
4387 	return (error);
4388 }
4389 
4390 /*
4391  * kstrwritemp() has very similar semantics as that of strwrite().
4392  * The main difference is it obtains mblks from the caller and also
4393  * does not do any copy as done in strwrite() from user buffers to
4394  * kernel buffers.
4395  *
4396  * Currently, this routine is used by sendfile to send data allocated
4397  * within the kernel without any copying. This interface does not use the
4398  * synchronous stream interface as synch. stream interface implies
4399  * copying.
4400  */
4401 int
4402 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4403 {
4404 	struct stdata *stp;
4405 	struct queue *wqp;
4406 	mblk_t *newmp;
4407 	char waitflag;
4408 	int tempmode;
4409 	int error = 0;
4410 	int done = 0;
4411 	struct sonode *so;
4412 	boolean_t direct;
4413 
4414 	ASSERT(vp->v_stream);
4415 	stp = vp->v_stream;
4416 
4417 	so = VTOSO(vp);
4418 	direct = _SOTOTPI(so)->sti_direct;
4419 
4420 	/*
4421 	 * This is the sockfs direct fast path. canputnext() need
4422 	 * not be accurate so we don't grab the sd_lock here. If
4423 	 * we get flow-controlled, we grab sd_lock just before the
4424 	 * do..while loop below to emulate what strwrite() does.
4425 	 */
4426 	wqp = stp->sd_wrq;
4427 	if (canputnext(wqp) && direct &&
4428 	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4429 		return (sostream_direct(so, NULL, mp, CRED()));
4430 	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4431 		/* Fast check of flags before acquiring the lock */
4432 		mutex_enter(&stp->sd_lock);
4433 		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4434 		mutex_exit(&stp->sd_lock);
4435 		if (error != 0) {
4436 			if (!(stp->sd_flag & STPLEX) &&
4437 			    (stp->sd_wput_opt & SW_SIGPIPE)) {
4438 				error = EPIPE;
4439 			}
4440 			return (error);
4441 		}
4442 	}
4443 
4444 	waitflag = WRITEWAIT;
4445 	if (stp->sd_flag & OLDNDELAY)
4446 		tempmode = fmode & ~FNDELAY;
4447 	else
4448 		tempmode = fmode;
4449 
4450 	mutex_enter(&stp->sd_lock);
4451 	do {
4452 		if (canputnext(wqp)) {
4453 			mutex_exit(&stp->sd_lock);
4454 			if (stp->sd_wputdatafunc != NULL) {
4455 				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4456 				    NULL, NULL, NULL);
4457 				if (newmp == NULL) {
4458 					/* The caller will free mp */
4459 					return (ECOMM);
4460 				}
4461 				mp = newmp;
4462 			}
4463 			putnext(wqp, mp);
4464 			return (0);
4465 		}
4466 		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4467 		    &done);
4468 	} while (error == 0 && !done);
4469 
4470 	mutex_exit(&stp->sd_lock);
4471 	/*
4472 	 * EAGAIN tells the application to try again. ENOMEM
4473 	 * is returned only if the memory allocation size
4474 	 * exceeds the physical limits of the system. ENOMEM
4475 	 * can't be true here.
4476 	 */
4477 	if (error == ENOMEM)
4478 		error = EAGAIN;
4479 	return (error);
4480 }
4481 
4482 /* ARGSUSED */
4483 static int
4484 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4485     struct cred *cr, mblk_t **mpp)
4486 {
4487 	int error;
4488 
4489 	switch (so->so_family) {
4490 	case AF_INET:
4491 	case AF_INET6:
4492 	case AF_UNIX:
4493 		break;
4494 	default:
4495 		return (EAFNOSUPPORT);
4496 
4497 	}
4498 
4499 	if (so->so_state & SS_CANTSENDMORE)
4500 		return (EPIPE);
4501 
4502 	if (so->so_type != SOCK_STREAM)
4503 		return (EOPNOTSUPP);
4504 
4505 	if ((so->so_state & SS_ISCONNECTED) == 0)
4506 		return (ENOTCONN);
4507 
4508 	error = kstrwritemp(so->so_vnode, *mpp, fflag);
4509 	if (error == 0)
4510 		*mpp = NULL;
4511 	return (error);
4512 }
4513 
4514 /*
4515  * Sending data on a datagram socket.
4516  * Assumes caller has verified that SS_ISBOUND etc. are set.
4517  */
4518 /* ARGSUSED */
4519 static int
4520 sodgram_direct(struct sonode *so, struct sockaddr *name,
4521     socklen_t namelen, struct uio *uiop, int flags)
4522 {
4523 	struct T_unitdata_req	tudr;
4524 	mblk_t			*mp = NULL;
4525 	int			error = 0;
4526 	void			*addr;
4527 	socklen_t		addrlen;
4528 	ssize_t			len;
4529 	struct stdata		*stp = SOTOV(so)->v_stream;
4530 	int			so_state;
4531 	queue_t			*udp_wq;
4532 	boolean_t		connected;
4533 	mblk_t			*mpdata = NULL;
4534 	sotpi_info_t		*sti = SOTOTPI(so);
4535 	uint32_t		auditing = AU_AUDITING();
4536 
4537 	ASSERT(name != NULL && namelen != 0);
4538 	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4539 	ASSERT(!(so->so_mode & SM_EXDATA));
4540 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4541 	ASSERT(SOTOV(so)->v_type == VSOCK);
4542 
4543 	/* Caller checked for proper length */
4544 	len = uiop->uio_resid;
4545 	ASSERT(len <= sti->sti_tidu_size);
4546 
4547 	/* Length and family checks have been done by caller */
4548 	ASSERT(name->sa_family == so->so_family);
4549 	ASSERT(so->so_family == AF_INET ||
4550 	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4551 	ASSERT(so->so_family == AF_INET6 ||
4552 	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4553 
4554 	addr = name;
4555 	addrlen = namelen;
4556 
4557 	if (stp->sd_sidp != NULL &&
4558 	    (error = straccess(stp, JCWRITE)) != 0)
4559 		goto done;
4560 
4561 	so_state = so->so_state;
4562 
4563 	connected = so_state & SS_ISCONNECTED;
4564 	if (!connected) {
4565 		tudr.PRIM_type = T_UNITDATA_REQ;
4566 		tudr.DEST_length = addrlen;
4567 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4568 		tudr.OPT_length = 0;
4569 		tudr.OPT_offset = 0;
4570 
4571 		mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4572 		    _ALLOC_INTR, CRED());
4573 		if (mp == NULL) {
4574 			/*
4575 			 * Caught a signal waiting for memory.
4576 			 * Let send* return EINTR.
4577 			 */
4578 			error = EINTR;
4579 			goto done;
4580 		}
4581 	}
4582 
4583 	/*
4584 	 * For UDP we don't break up the copyin into smaller pieces
4585 	 * as in the TCP case.  That means if ENOMEM is returned by
4586 	 * mcopyinuio() then the uio vector has not been modified at
4587 	 * all and we fallback to either strwrite() or kstrputmsg()
4588 	 * below.  Note also that we never generate priority messages
4589 	 * from here.
4590 	 */
4591 	udp_wq = stp->sd_wrq->q_next;
4592 	if (canput(udp_wq) &&
4593 	    (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4594 		ASSERT(DB_TYPE(mpdata) == M_DATA);
4595 		ASSERT(uiop->uio_resid == 0);
4596 		if (!connected)
4597 			linkb(mp, mpdata);
4598 		else
4599 			mp = mpdata;
4600 		if (auditing)
4601 			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4602 
4603 		/* Always returns 0... */
4604 		return (udp_wput(udp_wq, mp));
4605 	}
4606 
4607 	ASSERT(mpdata == NULL);
4608 	if (error != 0 && error != ENOMEM) {
4609 		freemsg(mp);
4610 		return (error);
4611 	}
4612 
4613 	/*
4614 	 * For connected, let strwrite() handle the blocking case.
4615 	 * Otherwise we fall thru and use kstrputmsg().
4616 	 */
4617 	if (connected)
4618 		return (strwrite(SOTOV(so), uiop, CRED()));
4619 
4620 	if (auditing)
4621 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4622 
4623 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4624 done:
4625 #ifdef SOCK_DEBUG
4626 	if (error != 0) {
4627 		eprintsoline(so, error);
4628 	}
4629 #endif /* SOCK_DEBUG */
4630 	return (error);
4631 }
4632 
4633 int
4634 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4635 {
4636 	struct stdata *stp = SOTOV(so)->v_stream;
4637 	ssize_t iosize, rmax, maxblk;
4638 	queue_t *tcp_wq = stp->sd_wrq->q_next;
4639 	mblk_t *newmp;
4640 	int error = 0, wflag = 0;
4641 
4642 	ASSERT(so->so_mode & SM_BYTESTREAM);
4643 	ASSERT(SOTOV(so)->v_type == VSOCK);
4644 
4645 	if (stp->sd_sidp != NULL &&
4646 	    (error = straccess(stp, JCWRITE)) != 0)
4647 		return (error);
4648 
4649 	if (uiop == NULL) {
4650 		/*
4651 		 * kstrwritemp() should have checked sd_flag and
4652 		 * flow-control before coming here.  If we end up
4653 		 * here it means that we can simply pass down the
4654 		 * data to tcp.
4655 		 */
4656 		ASSERT(mp != NULL);
4657 		if (stp->sd_wputdatafunc != NULL) {
4658 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4659 			    NULL, NULL, NULL);
4660 			if (newmp == NULL) {
4661 				/* The caller will free mp */
4662 				return (ECOMM);
4663 			}
4664 			mp = newmp;
4665 		}
4666 		/* Always returns 0... */
4667 		return (tcp_wput(tcp_wq, mp));
4668 	}
4669 
4670 	/* Fallback to strwrite() to do proper error handling */
4671 	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4672 		return (strwrite(SOTOV(so), uiop, cr));
4673 
4674 	rmax = stp->sd_qn_maxpsz;
4675 	ASSERT(rmax >= 0 || rmax == INFPSZ);
4676 	if (rmax == 0 || uiop->uio_resid <= 0)
4677 		return (0);
4678 
4679 	if (rmax == INFPSZ)
4680 		rmax = uiop->uio_resid;
4681 
4682 	maxblk = stp->sd_maxblk;
4683 
4684 	for (;;) {
4685 		iosize = MIN(uiop->uio_resid, rmax);
4686 
4687 		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4688 		if (mp == NULL) {
4689 			/*
4690 			 * Fallback to strwrite() for ENOMEM; if this
4691 			 * is our first time in this routine and the uio
4692 			 * vector has not been modified, we will end up
4693 			 * calling strwrite() without any flag set.
4694 			 */
4695 			if (error == ENOMEM)
4696 				goto slow_send;
4697 			else
4698 				return (error);
4699 		}
4700 		ASSERT(uiop->uio_resid >= 0);
4701 		/*
4702 		 * If mp is non-NULL and ENOMEM is set, it means that
4703 		 * mcopyinuio() was able to break down some of the user
4704 		 * data into one or more mblks.  Send the partial data
4705 		 * to tcp and let the rest be handled in strwrite().
4706 		 */
4707 		ASSERT(error == 0 || error == ENOMEM);
4708 		if (stp->sd_wputdatafunc != NULL) {
4709 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4710 			    NULL, NULL, NULL);
4711 			if (newmp == NULL) {
4712 				/* The caller will free mp */
4713 				return (ECOMM);
4714 			}
4715 			mp = newmp;
4716 		}
4717 		(void) tcp_wput(tcp_wq, mp);	/* Always returns 0 anyway. */
4718 
4719 		wflag |= NOINTR;
4720 
4721 		if (uiop->uio_resid == 0) {	/* No more data; we're done */
4722 			ASSERT(error == 0);
4723 			break;
4724 		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4725 		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4726 slow_send:
4727 			/*
4728 			 * We were able to send down partial data using
4729 			 * the direct call interface, but are now relying
4730 			 * on strwrite() to handle the non-fastpath cases.
4731 			 * If the socket is blocking we will sleep in
4732 			 * strwaitq() until write is permitted, otherwise,
4733 			 * we will need to return the amount of bytes
4734 			 * written so far back to the app.  This is the
4735 			 * reason why we pass NOINTR flag to strwrite()
4736 			 * for non-blocking socket, because we don't want
4737 			 * to return EAGAIN when portion of the user data
4738 			 * has actually been sent down.
4739 			 */
4740 			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4741 		}
4742 	}
4743 	return (0);
4744 }
4745 
4746 /*
4747  * Update sti_faddr by asking the transport (unless AF_UNIX).
4748  */
4749 /* ARGSUSED */
4750 int
4751 sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4752     boolean_t accept, struct cred *cr)
4753 {
4754 	struct strbuf	strbuf;
4755 	int		error = 0, res;
4756 	void		*addr;
4757 	t_uscalar_t	addrlen;
4758 	k_sigset_t	smask;
4759 	sotpi_info_t	*sti = SOTOTPI(so);
4760 
4761 	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4762 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4763 
4764 	ASSERT(*namelen > 0);
4765 	mutex_enter(&so->so_lock);
4766 	so_lock_single(so);	/* Set SOLOCKED */
4767 
4768 	if (accept) {
4769 		bcopy(sti->sti_faddr_sa, name,
4770 		    MIN(*namelen, sti->sti_faddr_len));
4771 		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4772 		goto done;
4773 	}
4774 
4775 	if (!(so->so_state & SS_ISCONNECTED)) {
4776 		error = ENOTCONN;
4777 		goto done;
4778 	}
4779 	/* Added this check for X/Open */
4780 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4781 		error = EINVAL;
4782 		if (xnet_check_print) {
4783 			printf("sockfs: X/Open getpeername check => EINVAL\n");
4784 		}
4785 		goto done;
4786 	}
4787 
4788 	if (sti->sti_faddr_valid) {
4789 		bcopy(sti->sti_faddr_sa, name,
4790 		    MIN(*namelen, sti->sti_faddr_len));
4791 		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4792 		goto done;
4793 	}
4794 
4795 #ifdef DEBUG
4796 	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4797 	    pr_addr(so->so_family, sti->sti_faddr_sa,
4798 	    (t_uscalar_t)sti->sti_faddr_len)));
4799 #endif /* DEBUG */
4800 
4801 	if (so->so_family == AF_UNIX) {
4802 		/* Transport has different name space - return local info */
4803 		if (sti->sti_faddr_noxlate)
4804 			*namelen = 0;
4805 		error = 0;
4806 		goto done;
4807 	}
4808 
4809 	ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
4810 
4811 	ASSERT(sti->sti_faddr_sa);
4812 	/* Allocate local buffer to use with ioctl */
4813 	addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
4814 	mutex_exit(&so->so_lock);
4815 	addr = kmem_alloc(addrlen, KM_SLEEP);
4816 
4817 	/*
4818 	 * Issue TI_GETPEERNAME with signals masked.
4819 	 * Put the result in sti_faddr_sa so that getpeername works after
4820 	 * a shutdown(output).
4821 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4822 	 * back to the socket.
4823 	 */
4824 	strbuf.buf = addr;
4825 	strbuf.maxlen = addrlen;
4826 	strbuf.len = 0;
4827 
4828 	sigintr(&smask, 0);
4829 	res = 0;
4830 	ASSERT(cr);
4831 	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4832 	    0, K_TO_K, cr, &res);
4833 	sigunintr(&smask);
4834 
4835 	mutex_enter(&so->so_lock);
4836 	/*
4837 	 * If there is an error record the error in so_error put don't fail
4838 	 * the getpeername. Instead fallback on the recorded
4839 	 * sti->sti_faddr_sa.
4840 	 */
4841 	if (error) {
4842 		/*
4843 		 * Various stream head errors can be returned to the ioctl.
4844 		 * However, it is impossible to determine which ones of
4845 		 * these are really socket level errors that were incorrectly
4846 		 * consumed by the ioctl. Thus this code silently ignores the
4847 		 * error - to code explicitly does not reinstate the error
4848 		 * using soseterror().
4849 		 * Experiments have shows that at least this set of
4850 		 * errors are reported and should not be reinstated on the
4851 		 * socket:
4852 		 *	EINVAL	E.g. if an I_LINK was in effect when
4853 		 *		getpeername was called.
4854 		 *	EPIPE	The ioctl error semantics prefer the write
4855 		 *		side error over the read side error.
4856 		 *	ENOTCONN The transport just got disconnected but
4857 		 *		sockfs had not yet seen the T_DISCON_IND
4858 		 *		when issuing the ioctl.
4859 		 */
4860 		error = 0;
4861 	} else if (res == 0 && strbuf.len > 0 &&
4862 	    (so->so_state & SS_ISCONNECTED)) {
4863 		ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
4864 		sti->sti_faddr_len = (socklen_t)strbuf.len;
4865 		bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
4866 		sti->sti_faddr_valid = 1;
4867 
4868 		bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
4869 		*namelen = sti->sti_faddr_len;
4870 	}
4871 	kmem_free(addr, addrlen);
4872 #ifdef DEBUG
4873 	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
4874 	    pr_addr(so->so_family, sti->sti_faddr_sa,
4875 	    (t_uscalar_t)sti->sti_faddr_len)));
4876 #endif /* DEBUG */
4877 done:
4878 	so_unlock_single(so, SOLOCKED);
4879 	mutex_exit(&so->so_lock);
4880 	return (error);
4881 }
4882 
4883 /*
4884  * Update sti_laddr by asking the transport (unless AF_UNIX).
4885  */
4886 int
4887 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4888     struct cred *cr)
4889 {
4890 	struct strbuf	strbuf;
4891 	int		error = 0, res;
4892 	void		*addr;
4893 	t_uscalar_t	addrlen;
4894 	k_sigset_t	smask;
4895 	sotpi_info_t	*sti = SOTOTPI(so);
4896 
4897 	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
4898 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4899 
4900 	ASSERT(*namelen > 0);
4901 	mutex_enter(&so->so_lock);
4902 	so_lock_single(so);	/* Set SOLOCKED */
4903 
4904 #ifdef DEBUG
4905 
4906 	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
4907 	    pr_addr(so->so_family, sti->sti_laddr_sa,
4908 	    (t_uscalar_t)sti->sti_laddr_len)));
4909 #endif /* DEBUG */
4910 	if (sti->sti_laddr_valid) {
4911 		bcopy(sti->sti_laddr_sa, name,
4912 		    MIN(*namelen, sti->sti_laddr_len));
4913 		*namelen = sti->sti_laddr_len;
4914 		goto done;
4915 	}
4916 
4917 	if (so->so_family == AF_UNIX) {
4918 		/*
4919 		 * Transport has different name space - return local info. If we
4920 		 * have enough space, let consumers know the family.
4921 		 */
4922 		if (*namelen >= sizeof (sa_family_t)) {
4923 			name->sa_family = AF_UNIX;
4924 			*namelen = sizeof (sa_family_t);
4925 		} else {
4926 			*namelen = 0;
4927 		}
4928 		error = 0;
4929 		goto done;
4930 	}
4931 	if (!(so->so_state & SS_ISBOUND)) {
4932 		/* If not bound, then nothing to return. */
4933 		error = 0;
4934 		goto done;
4935 	}
4936 
4937 	/* Allocate local buffer to use with ioctl */
4938 	addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
4939 	mutex_exit(&so->so_lock);
4940 	addr = kmem_alloc(addrlen, KM_SLEEP);
4941 
4942 	/*
4943 	 * Issue TI_GETMYNAME with signals masked.
4944 	 * Put the result in sti_laddr_sa so that getsockname works after
4945 	 * a shutdown(output).
4946 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4947 	 * back to the socket.
4948 	 */
4949 	strbuf.buf = addr;
4950 	strbuf.maxlen = addrlen;
4951 	strbuf.len = 0;
4952 
4953 	sigintr(&smask, 0);
4954 	res = 0;
4955 	ASSERT(cr);
4956 	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
4957 	    0, K_TO_K, cr, &res);
4958 	sigunintr(&smask);
4959 
4960 	mutex_enter(&so->so_lock);
4961 	/*
4962 	 * If there is an error record the error in so_error put don't fail
4963 	 * the getsockname. Instead fallback on the recorded
4964 	 * sti->sti_laddr_sa.
4965 	 */
4966 	if (error) {
4967 		/*
4968 		 * Various stream head errors can be returned to the ioctl.
4969 		 * However, it is impossible to determine which ones of
4970 		 * these are really socket level errors that were incorrectly
4971 		 * consumed by the ioctl. Thus this code silently ignores the
4972 		 * error - to code explicitly does not reinstate the error
4973 		 * using soseterror().
4974 		 * Experiments have shows that at least this set of
4975 		 * errors are reported and should not be reinstated on the
4976 		 * socket:
4977 		 *	EINVAL	E.g. if an I_LINK was in effect when
4978 		 *		getsockname was called.
4979 		 *	EPIPE	The ioctl error semantics prefer the write
4980 		 *		side error over the read side error.
4981 		 */
4982 		error = 0;
4983 	} else if (res == 0 && strbuf.len > 0 &&
4984 	    (so->so_state & SS_ISBOUND)) {
4985 		ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
4986 		sti->sti_laddr_len = (socklen_t)strbuf.len;
4987 		bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
4988 		sti->sti_laddr_valid = 1;
4989 
4990 		bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
4991 		*namelen = sti->sti_laddr_len;
4992 	}
4993 	kmem_free(addr, addrlen);
4994 #ifdef DEBUG
4995 	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
4996 	    pr_addr(so->so_family, sti->sti_laddr_sa,
4997 	    (t_uscalar_t)sti->sti_laddr_len)));
4998 #endif /* DEBUG */
4999 done:
5000 	so_unlock_single(so, SOLOCKED);
5001 	mutex_exit(&so->so_lock);
5002 	return (error);
5003 }
5004 
5005 /*
5006  * Get socket options. For SOL_SOCKET options some options are handled
5007  * by the sockfs while others use the value recorded in the sonode as a
5008  * fallback should the T_SVR4_OPTMGMT_REQ fail.
5009  *
5010  * On the return most *optlenp bytes are copied to optval.
5011  */
5012 /* ARGSUSED */
5013 int
5014 sotpi_getsockopt(struct sonode *so, int level, int option_name,
5015     void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5016 {
5017 	struct T_optmgmt_req	optmgmt_req;
5018 	struct T_optmgmt_ack	*optmgmt_ack;
5019 	struct opthdr		oh;
5020 	struct opthdr		*opt_res;
5021 	mblk_t			*mp = NULL;
5022 	int			error = 0;
5023 	void			*option = NULL;	/* Set if fallback value */
5024 	t_uscalar_t		maxlen = *optlenp;
5025 	t_uscalar_t		len;
5026 	uint32_t		value;
5027 	struct timeval		tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5028 	struct timeval32	tmo_val32;
5029 	struct so_snd_bufinfo	snd_bufinfo;	/* used for zero copy */
5030 
5031 	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5032 	    (void *)so, level, option_name, optval, (void *)optlenp,
5033 	    pr_state(so->so_state, so->so_mode)));
5034 
5035 	mutex_enter(&so->so_lock);
5036 	so_lock_single(so);	/* Set SOLOCKED */
5037 
5038 	len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
5039 
5040 	/*
5041 	 * Check for SOL_SOCKET options.
5042 	 * Certain SOL_SOCKET options are returned directly whereas
5043 	 * others only provide a default (fallback) value should
5044 	 * the T_SVR4_OPTMGMT_REQ fail.
5045 	 */
5046 	if (level == SOL_SOCKET) {
5047 		/* Check parameters */
5048 		switch (option_name) {
5049 		case SO_TYPE:
5050 		case SO_ERROR:
5051 		case SO_DEBUG:
5052 		case SO_ACCEPTCONN:
5053 		case SO_REUSEADDR:
5054 		case SO_KEEPALIVE:
5055 		case SO_DONTROUTE:
5056 		case SO_BROADCAST:
5057 		case SO_USELOOPBACK:
5058 		case SO_OOBINLINE:
5059 		case SO_SNDBUF:
5060 		case SO_RCVBUF:
5061 #ifdef notyet
5062 		case SO_SNDLOWAT:
5063 		case SO_RCVLOWAT:
5064 #endif /* notyet */
5065 		case SO_DOMAIN:
5066 		case SO_DGRAM_ERRIND:
5067 		case SO_PROTOCOL:
5068 			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5069 				error = EINVAL;
5070 				eprintsoline(so, error);
5071 				goto done2;
5072 			}
5073 			break;
5074 		case SO_RCVTIMEO:
5075 		case SO_SNDTIMEO:
5076 			if (get_udatamodel() == DATAMODEL_NONE ||
5077 			    get_udatamodel() == DATAMODEL_NATIVE) {
5078 				if (maxlen < sizeof (struct timeval)) {
5079 					error = EINVAL;
5080 					eprintsoline(so, error);
5081 					goto done2;
5082 				}
5083 			} else {
5084 				if (maxlen < sizeof (struct timeval32)) {
5085 					error = EINVAL;
5086 					eprintsoline(so, error);
5087 					goto done2;
5088 				}
5089 
5090 			}
5091 			break;
5092 		case SO_LINGER:
5093 			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5094 				error = EINVAL;
5095 				eprintsoline(so, error);
5096 				goto done2;
5097 			}
5098 			break;
5099 		case SO_SND_BUFINFO:
5100 			if (maxlen < (t_uscalar_t)
5101 			    sizeof (struct so_snd_bufinfo)) {
5102 				error = EINVAL;
5103 				eprintsoline(so, error);
5104 				goto done2;
5105 			}
5106 			break;
5107 		}
5108 
5109 		switch (option_name) {
5110 		case SO_TYPE:
5111 			value = so->so_type;
5112 			option = &value;
5113 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5114 
5115 		case SO_ERROR:
5116 			value = sogeterr(so, B_TRUE);
5117 			option = &value;
5118 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5119 
5120 		case SO_ACCEPTCONN:
5121 			if (so->so_state & SS_ACCEPTCONN)
5122 				value = SO_ACCEPTCONN;
5123 			else
5124 				value = 0;
5125 #ifdef DEBUG
5126 			if (value) {
5127 				dprintso(so, 1,
5128 				    ("sotpi_getsockopt: 0x%x is set\n",
5129 				    option_name));
5130 			} else {
5131 				dprintso(so, 1,
5132 				    ("sotpi_getsockopt: 0x%x not set\n",
5133 				    option_name));
5134 			}
5135 #endif /* DEBUG */
5136 			option = &value;
5137 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5138 
5139 		case SO_DEBUG:
5140 		case SO_REUSEADDR:
5141 		case SO_KEEPALIVE:
5142 		case SO_DONTROUTE:
5143 		case SO_BROADCAST:
5144 		case SO_USELOOPBACK:
5145 		case SO_OOBINLINE:
5146 		case SO_DGRAM_ERRIND:
5147 			value = (so->so_options & option_name);
5148 #ifdef DEBUG
5149 			if (value) {
5150 				dprintso(so, 1,
5151 				    ("sotpi_getsockopt: 0x%x is set\n",
5152 				    option_name));
5153 			} else {
5154 				dprintso(so, 1,
5155 				    ("sotpi_getsockopt: 0x%x not set\n",
5156 				    option_name));
5157 			}
5158 #endif /* DEBUG */
5159 			option = &value;
5160 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5161 
5162 		/*
5163 		 * The following options are only returned by sockfs when the
5164 		 * T_SVR4_OPTMGMT_REQ fails.
5165 		 */
5166 		case SO_LINGER:
5167 			option = &so->so_linger;
5168 			len = (t_uscalar_t)sizeof (struct linger);
5169 			break;
5170 		case SO_SNDBUF: {
5171 			ssize_t lvalue;
5172 
5173 			/*
5174 			 * If the option has not been set then get a default
5175 			 * value from the read queue. This value is
5176 			 * returned if the transport fails
5177 			 * the T_SVR4_OPTMGMT_REQ.
5178 			 */
5179 			lvalue = so->so_sndbuf;
5180 			if (lvalue == 0) {
5181 				mutex_exit(&so->so_lock);
5182 				(void) strqget(strvp2wq(SOTOV(so))->q_next,
5183 				    QHIWAT, 0, &lvalue);
5184 				mutex_enter(&so->so_lock);
5185 				dprintso(so, 1,
5186 				    ("got SO_SNDBUF %ld from q\n", lvalue));
5187 			}
5188 			value = (int)lvalue;
5189 			option = &value;
5190 			len = (t_uscalar_t)sizeof (so->so_sndbuf);
5191 			break;
5192 		}
5193 		case SO_RCVBUF: {
5194 			ssize_t lvalue;
5195 
5196 			/*
5197 			 * If the option has not been set then get a default
5198 			 * value from the read queue. This value is
5199 			 * returned if the transport fails
5200 			 * the T_SVR4_OPTMGMT_REQ.
5201 			 *
5202 			 * XXX If SO_RCVBUF has been set and this is an
5203 			 * XPG 4.2 application then do not ask the transport
5204 			 * since the transport might adjust the value and not
5205 			 * return exactly what was set by the application.
5206 			 * For non-XPG 4.2 application we return the value
5207 			 * that the transport is actually using.
5208 			 */
5209 			lvalue = so->so_rcvbuf;
5210 			if (lvalue == 0) {
5211 				mutex_exit(&so->so_lock);
5212 				(void) strqget(RD(strvp2wq(SOTOV(so))),
5213 				    QHIWAT, 0, &lvalue);
5214 				mutex_enter(&so->so_lock);
5215 				dprintso(so, 1,
5216 				    ("got SO_RCVBUF %ld from q\n", lvalue));
5217 			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
5218 				value = (int)lvalue;
5219 				option = &value;
5220 				goto copyout;	/* skip asking transport */
5221 			}
5222 			value = (int)lvalue;
5223 			option = &value;
5224 			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5225 			break;
5226 		}
5227 		case SO_DOMAIN:
5228 			value = so->so_family;
5229 			option = &value;
5230 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5231 
5232 		case SO_PROTOCOL:
5233 			value = so->so_protocol;
5234 			option = &value;
5235 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5236 
5237 #ifdef notyet
5238 		/*
5239 		 * We do not implement the semantics of these options
5240 		 * thus we shouldn't implement the options either.
5241 		 */
5242 		case SO_SNDLOWAT:
5243 			value = so->so_sndlowat;
5244 			option = &value;
5245 			break;
5246 		case SO_RCVLOWAT:
5247 			value = so->so_rcvlowat;
5248 			option = &value;
5249 			break;
5250 #endif /* notyet */
5251 		case SO_SNDTIMEO:
5252 		case SO_RCVTIMEO: {
5253 			clock_t val;
5254 
5255 			if (option_name == SO_RCVTIMEO)
5256 				val = drv_hztousec(so->so_rcvtimeo);
5257 			else
5258 				val = drv_hztousec(so->so_sndtimeo);
5259 			tmo_val.tv_sec = val / (1000 * 1000);
5260 			tmo_val.tv_usec = val % (1000 * 1000);
5261 			if (get_udatamodel() == DATAMODEL_NONE ||
5262 			    get_udatamodel() == DATAMODEL_NATIVE) {
5263 				option = &tmo_val;
5264 				len = sizeof (struct timeval);
5265 			} else {
5266 				TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5267 				option = &tmo_val32;
5268 				len = sizeof (struct timeval32);
5269 			}
5270 			break;
5271 		}
5272 		case SO_SND_BUFINFO: {
5273 			snd_bufinfo.sbi_wroff =
5274 			    (so->so_proto_props).sopp_wroff;
5275 			snd_bufinfo.sbi_maxblk =
5276 			    (so->so_proto_props).sopp_maxblk;
5277 			snd_bufinfo.sbi_maxpsz =
5278 			    (so->so_proto_props).sopp_maxpsz;
5279 			snd_bufinfo.sbi_tail =
5280 			    (so->so_proto_props).sopp_tail;
5281 			option = &snd_bufinfo;
5282 			len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5283 			break;
5284 		}
5285 		}
5286 	}
5287 
5288 	mutex_exit(&so->so_lock);
5289 
5290 	/* Send request */
5291 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5292 	optmgmt_req.MGMT_flags = T_CHECK;
5293 	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5294 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5295 
5296 	oh.level = level;
5297 	oh.name = option_name;
5298 	oh.len = maxlen;
5299 
5300 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5301 	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5302 	/* Let option management work in the presence of data flow control */
5303 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5304 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5305 	mp = NULL;
5306 	mutex_enter(&so->so_lock);
5307 	if (error) {
5308 		eprintsoline(so, error);
5309 		goto done2;
5310 	}
5311 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5312 	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5313 	if (error) {
5314 		if (option != NULL) {
5315 			/* We have a fallback value */
5316 			error = 0;
5317 			goto copyout;
5318 		}
5319 		eprintsoline(so, error);
5320 		goto done2;
5321 	}
5322 	ASSERT(mp);
5323 	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5324 	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5325 	    optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5326 	if (opt_res == NULL) {
5327 		if (option != NULL) {
5328 			/* We have a fallback value */
5329 			error = 0;
5330 			goto copyout;
5331 		}
5332 		error = EPROTO;
5333 		eprintsoline(so, error);
5334 		goto done;
5335 	}
5336 	option = &opt_res[1];
5337 
5338 	/* check to ensure that the option is within bounds */
5339 	if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5340 	    (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5341 		if (option != NULL) {
5342 			/* We have a fallback value */
5343 			error = 0;
5344 			goto copyout;
5345 		}
5346 		error = EPROTO;
5347 		eprintsoline(so, error);
5348 		goto done;
5349 	}
5350 
5351 	len = opt_res->len;
5352 
5353 copyout: {
5354 		t_uscalar_t size = MIN(len, maxlen);
5355 		bcopy(option, optval, size);
5356 		bcopy(&size, optlenp, sizeof (size));
5357 	}
5358 done:
5359 	freemsg(mp);
5360 done2:
5361 	so_unlock_single(so, SOLOCKED);
5362 	mutex_exit(&so->so_lock);
5363 
5364 	return (error);
5365 }
5366 
5367 /*
5368  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5369  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5370  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5371  * setsockopt has to work even if the transport does not support the option.
5372  */
5373 /* ARGSUSED */
5374 int
5375 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5376     const void *optval, t_uscalar_t optlen, struct cred *cr)
5377 {
5378 	struct T_optmgmt_req	optmgmt_req;
5379 	struct opthdr		oh;
5380 	mblk_t			*mp;
5381 	int			error = 0;
5382 	boolean_t		handled = B_FALSE;
5383 
5384 	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5385 	    (void *)so, level, option_name, optval, optlen,
5386 	    pr_state(so->so_state, so->so_mode)));
5387 
5388 	/* X/Open requires this check */
5389 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5390 		if (xnet_check_print)
5391 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
5392 		return (EINVAL);
5393 	}
5394 
5395 	mutex_enter(&so->so_lock);
5396 	so_lock_single(so);	/* Set SOLOCKED */
5397 	mutex_exit(&so->so_lock);
5398 
5399 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5400 	optmgmt_req.MGMT_flags = T_NEGOTIATE;
5401 	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5402 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5403 
5404 	oh.level = level;
5405 	oh.name = option_name;
5406 	oh.len = optlen;
5407 
5408 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5409 	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5410 	/* Let option management work in the presence of data flow control */
5411 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5412 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5413 	mp = NULL;
5414 	mutex_enter(&so->so_lock);
5415 	if (error) {
5416 		eprintsoline(so, error);
5417 		goto done2;
5418 	}
5419 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5420 	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5421 	if (error) {
5422 		eprintsoline(so, error);
5423 		goto done;
5424 	}
5425 	ASSERT(mp);
5426 	/* No need to verify T_optmgmt_ack */
5427 	freemsg(mp);
5428 done:
5429 	/*
5430 	 * Check for SOL_SOCKET options and record their values.
5431 	 * If we know about a SOL_SOCKET parameter and the transport
5432 	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5433 	 * EPROTO) we let the setsockopt succeed.
5434 	 */
5435 	if (level == SOL_SOCKET) {
5436 		/* Check parameters */
5437 		switch (option_name) {
5438 		case SO_DEBUG:
5439 		case SO_REUSEADDR:
5440 		case SO_KEEPALIVE:
5441 		case SO_DONTROUTE:
5442 		case SO_BROADCAST:
5443 		case SO_USELOOPBACK:
5444 		case SO_OOBINLINE:
5445 		case SO_SNDBUF:
5446 		case SO_RCVBUF:
5447 #ifdef notyet
5448 		case SO_SNDLOWAT:
5449 		case SO_RCVLOWAT:
5450 #endif /* notyet */
5451 		case SO_DGRAM_ERRIND:
5452 			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5453 				error = EINVAL;
5454 				eprintsoline(so, error);
5455 				goto done2;
5456 			}
5457 			ASSERT(optval);
5458 			handled = B_TRUE;
5459 			break;
5460 		case SO_SNDTIMEO:
5461 		case SO_RCVTIMEO:
5462 			if (get_udatamodel() == DATAMODEL_NONE ||
5463 			    get_udatamodel() == DATAMODEL_NATIVE) {
5464 				if (optlen != sizeof (struct timeval)) {
5465 					error = EINVAL;
5466 					eprintsoline(so, error);
5467 					goto done2;
5468 				}
5469 			} else {
5470 				if (optlen != sizeof (struct timeval32)) {
5471 					error = EINVAL;
5472 					eprintsoline(so, error);
5473 					goto done2;
5474 				}
5475 			}
5476 			ASSERT(optval);
5477 			handled = B_TRUE;
5478 			break;
5479 		case SO_LINGER:
5480 			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5481 				error = EINVAL;
5482 				eprintsoline(so, error);
5483 				goto done2;
5484 			}
5485 			ASSERT(optval);
5486 			handled = B_TRUE;
5487 			break;
5488 		}
5489 
5490 #define	intvalue	(*(int32_t *)optval)
5491 
5492 		switch (option_name) {
5493 		case SO_TYPE:
5494 		case SO_ERROR:
5495 		case SO_ACCEPTCONN:
5496 			/* Can't be set */
5497 			error = ENOPROTOOPT;
5498 			goto done2;
5499 		case SO_LINGER: {
5500 			struct linger *l = (struct linger *)optval;
5501 
5502 			so->so_linger.l_linger = l->l_linger;
5503 			if (l->l_onoff) {
5504 				so->so_linger.l_onoff = SO_LINGER;
5505 				so->so_options |= SO_LINGER;
5506 			} else {
5507 				so->so_linger.l_onoff = 0;
5508 				so->so_options &= ~SO_LINGER;
5509 			}
5510 			break;
5511 		}
5512 
5513 		case SO_DEBUG:
5514 #ifdef SOCK_TEST
5515 			if (intvalue & 2)
5516 				sock_test_timelimit = 10 * hz;
5517 			else
5518 				sock_test_timelimit = 0;
5519 
5520 			if (intvalue & 4)
5521 				do_useracc = 0;
5522 			else
5523 				do_useracc = 1;
5524 #endif /* SOCK_TEST */
5525 			/* FALLTHRU */
5526 		case SO_REUSEADDR:
5527 		case SO_KEEPALIVE:
5528 		case SO_DONTROUTE:
5529 		case SO_BROADCAST:
5530 		case SO_USELOOPBACK:
5531 		case SO_OOBINLINE:
5532 		case SO_DGRAM_ERRIND:
5533 			if (intvalue != 0) {
5534 				dprintso(so, 1,
5535 				    ("socket_setsockopt: setting 0x%x\n",
5536 				    option_name));
5537 				so->so_options |= option_name;
5538 			} else {
5539 				dprintso(so, 1,
5540 				    ("socket_setsockopt: clearing 0x%x\n",
5541 				    option_name));
5542 				so->so_options &= ~option_name;
5543 			}
5544 			break;
5545 		/*
5546 		 * The following options are only returned by us when the
5547 		 * transport layer fails.
5548 		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5549 		 * since the transport might adjust the value and not
5550 		 * return exactly what was set by the application.
5551 		 */
5552 		case SO_SNDBUF:
5553 			so->so_sndbuf = intvalue;
5554 			break;
5555 		case SO_RCVBUF:
5556 			so->so_rcvbuf = intvalue;
5557 			break;
5558 		case SO_RCVPSH:
5559 			so->so_rcv_timer_interval = intvalue;
5560 			break;
5561 #ifdef notyet
5562 		/*
5563 		 * We do not implement the semantics of these options
5564 		 * thus we shouldn't implement the options either.
5565 		 */
5566 		case SO_SNDLOWAT:
5567 			so->so_sndlowat = intvalue;
5568 			break;
5569 		case SO_RCVLOWAT:
5570 			so->so_rcvlowat = intvalue;
5571 			break;
5572 #endif /* notyet */
5573 		case SO_SNDTIMEO:
5574 		case SO_RCVTIMEO: {
5575 			struct timeval tl;
5576 			clock_t val;
5577 
5578 			if (get_udatamodel() == DATAMODEL_NONE ||
5579 			    get_udatamodel() == DATAMODEL_NATIVE)
5580 				bcopy(&tl, (struct timeval *)optval,
5581 				    sizeof (struct timeval));
5582 			else
5583 				TIMEVAL32_TO_TIMEVAL(&tl,
5584 				    (struct timeval32 *)optval);
5585 			val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5586 			if (option_name == SO_RCVTIMEO)
5587 				so->so_rcvtimeo = drv_usectohz(val);
5588 			else
5589 				so->so_sndtimeo = drv_usectohz(val);
5590 			break;
5591 		}
5592 		}
5593 #undef	intvalue
5594 
5595 		if (error) {
5596 			if ((error == ENOPROTOOPT || error == EPROTO ||
5597 			    error == EINVAL) && handled) {
5598 				dprintso(so, 1,
5599 				    ("setsockopt: ignoring error %d for 0x%x\n",
5600 				    error, option_name));
5601 				error = 0;
5602 			}
5603 		}
5604 	}
5605 done2:
5606 	so_unlock_single(so, SOLOCKED);
5607 	mutex_exit(&so->so_lock);
5608 	return (error);
5609 }
5610 
5611 /*
5612  * sotpi_close() is called when the last open reference goes away.
5613  */
5614 /* ARGSUSED */
5615 int
5616 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5617 {
5618 	struct vnode *vp = SOTOV(so);
5619 	dev_t dev;
5620 	int error = 0;
5621 	sotpi_info_t *sti = SOTOTPI(so);
5622 
5623 	dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5624 	    (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5625 
5626 	dev = sti->sti_dev;
5627 
5628 	ASSERT(STREAMSTAB(getmajor(dev)));
5629 
5630 	mutex_enter(&so->so_lock);
5631 	so_lock_single(so);	/* Set SOLOCKED */
5632 
5633 	ASSERT(so_verify_oobstate(so));
5634 
5635 	if (vp->v_stream != NULL) {
5636 		vnode_t *ux_vp;
5637 
5638 		if (so->so_family == AF_UNIX) {
5639 			/* Could avoid this when CANTSENDMORE for !dgram */
5640 			so_unix_close(so);
5641 		}
5642 
5643 		mutex_exit(&so->so_lock);
5644 		/*
5645 		 * Disassemble the linkage from the AF_UNIX underlying file
5646 		 * system vnode to this socket (by atomically clearing
5647 		 * v_stream in vn_rele_stream) before strclose clears sd_vnode
5648 		 * and frees the stream head.
5649 		 */
5650 		if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5651 			ASSERT(ux_vp->v_stream);
5652 			sti->sti_ux_bound_vp = NULL;
5653 			vn_rele_stream(ux_vp);
5654 		}
5655 		error = strclose(vp, flag, cr);
5656 		vp->v_stream = NULL;
5657 		mutex_enter(&so->so_lock);
5658 	}
5659 
5660 	/*
5661 	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
5662 	 */
5663 	so_flush_discon_ind(so);
5664 
5665 	so_unlock_single(so, SOLOCKED);
5666 	mutex_exit(&so->so_lock);
5667 
5668 	/*
5669 	 * Needed for STREAMs.
5670 	 * Decrement the device driver's reference count for streams
5671 	 * opened via the clone dip. The driver was held in clone_open().
5672 	 * The absence of clone_close() forces this asymmetry.
5673 	 */
5674 	if (so->so_flag & SOCLONE)
5675 		ddi_rele_driver(getmajor(dev));
5676 
5677 	return (error);
5678 }
5679 
5680 static int
5681 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
5682     struct cred *cr, int32_t *rvalp)
5683 {
5684 	struct vnode *vp = SOTOV(so);
5685 	sotpi_info_t *sti = SOTOTPI(so);
5686 	int error = 0;
5687 
5688 	dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
5689 	    cmd, arg, pr_state(so->so_state, so->so_mode)));
5690 
5691 	switch (cmd) {
5692 	case SIOCSQPTR:
5693 		/*
5694 		 * SIOCSQPTR is valid only when helper stream is created
5695 		 * by the protocol.
5696 		 */
5697 	case _I_INSERT:
5698 	case _I_REMOVE:
5699 		/*
5700 		 * Since there's no compelling reason to support these ioctls
5701 		 * on sockets, and doing so would increase the complexity
5702 		 * markedly, prevent it.
5703 		 */
5704 		return (EOPNOTSUPP);
5705 
5706 	case I_FIND:
5707 	case I_LIST:
5708 	case I_LOOK:
5709 	case I_POP:
5710 	case I_PUSH:
5711 		/*
5712 		 * To prevent races and inconsistencies between the actual
5713 		 * state of the stream and the state according to the sonode,
5714 		 * we serialize all operations which modify or operate on the
5715 		 * list of modules on the socket's stream.
5716 		 */
5717 		mutex_enter(&sti->sti_plumb_lock);
5718 		error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
5719 		mutex_exit(&sti->sti_plumb_lock);
5720 		return (error);
5721 
5722 	default:
5723 		if (so->so_version != SOV_STREAM)
5724 			break;
5725 
5726 		/*
5727 		 * The imaginary "sockmod" has been popped; act as a stream.
5728 		 */
5729 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5730 	}
5731 
5732 	ASSERT(so->so_version != SOV_STREAM);
5733 
5734 	/*
5735 	 * Process socket-specific ioctls.
5736 	 */
5737 	switch (cmd) {
5738 	case FIONBIO: {
5739 		int32_t value;
5740 
5741 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
5742 		    (mode & (int)FKIOCTL)))
5743 			return (EFAULT);
5744 
5745 		mutex_enter(&so->so_lock);
5746 		if (value) {
5747 			so->so_state |= SS_NDELAY;
5748 		} else {
5749 			so->so_state &= ~SS_NDELAY;
5750 		}
5751 		mutex_exit(&so->so_lock);
5752 		return (0);
5753 	}
5754 
5755 	case FIOASYNC: {
5756 		int32_t value;
5757 
5758 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
5759 		    (mode & (int)FKIOCTL)))
5760 			return (EFAULT);
5761 
5762 		mutex_enter(&so->so_lock);
5763 		/*
5764 		 * SS_ASYNC flag not already set correctly?
5765 		 * (!value != !(so->so_state & SS_ASYNC))
5766 		 * but some engineers find that too hard to read.
5767 		 */
5768 		if ((value == 0 && (so->so_state & SS_ASYNC) != 0) ||
5769 		    (value != 0 && (so->so_state & SS_ASYNC) == 0))
5770 			error = so_flip_async(so, vp, mode, cr);
5771 		mutex_exit(&so->so_lock);
5772 		return (error);
5773 	}
5774 
5775 	case SIOCSPGRP:
5776 	case FIOSETOWN: {
5777 		pid_t pgrp;
5778 
5779 		if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
5780 		    (mode & (int)FKIOCTL)))
5781 			return (EFAULT);
5782 
5783 		mutex_enter(&so->so_lock);
5784 		dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
5785 		/* Any change? */
5786 		if (pgrp != so->so_pgrp)
5787 			error = so_set_siggrp(so, vp, pgrp, mode, cr);
5788 		mutex_exit(&so->so_lock);
5789 		return (error);
5790 	}
5791 	case SIOCGPGRP:
5792 	case FIOGETOWN:
5793 		if (so_copyout(&so->so_pgrp, (void *)arg,
5794 		    sizeof (pid_t), (mode & (int)FKIOCTL)))
5795 			return (EFAULT);
5796 		return (0);
5797 
5798 	case SIOCATMARK: {
5799 		int retval;
5800 		uint_t so_state;
5801 
5802 		/*
5803 		 * strwaitmark has a finite timeout after which it
5804 		 * returns -1 if the mark state is undetermined.
5805 		 * In order to avoid any race between the mark state
5806 		 * in sockfs and the mark state in the stream head this
5807 		 * routine loops until the mark state can be determined
5808 		 * (or the urgent data indication has been removed by some
5809 		 * other thread).
5810 		 */
5811 		do {
5812 			mutex_enter(&so->so_lock);
5813 			so_state = so->so_state;
5814 			mutex_exit(&so->so_lock);
5815 			if (so_state & SS_RCVATMARK) {
5816 				retval = 1;
5817 			} else if (!(so_state & SS_OOBPEND)) {
5818 				/*
5819 				 * No SIGURG has been generated -- there is no
5820 				 * pending or present urgent data. Thus can't
5821 				 * possibly be at the mark.
5822 				 */
5823 				retval = 0;
5824 			} else {
5825 				/*
5826 				 * Have the stream head wait until there is
5827 				 * either some messages on the read queue, or
5828 				 * STRATMARK or STRNOTATMARK gets set. The
5829 				 * STRNOTATMARK flag is used so that the
5830 				 * transport can send up a MSGNOTMARKNEXT
5831 				 * M_DATA to indicate that it is not
5832 				 * at the mark and additional data is not about
5833 				 * to be send upstream.
5834 				 *
5835 				 * If the mark state is undetermined this will
5836 				 * return -1 and we will loop rechecking the
5837 				 * socket state.
5838 				 */
5839 				retval = strwaitmark(vp);
5840 			}
5841 		} while (retval == -1);
5842 
5843 		if (so_copyout(&retval, (void *)arg, sizeof (int),
5844 		    (mode & (int)FKIOCTL)))
5845 			return (EFAULT);
5846 		return (0);
5847 	}
5848 
5849 	case I_FDINSERT:
5850 	case I_SENDFD:
5851 	case I_RECVFD:
5852 	case I_ATMARK:
5853 	case _SIOCSOCKFALLBACK:
5854 		/*
5855 		 * These ioctls do not apply to sockets. I_FDINSERT can be
5856 		 * used to send M_PROTO messages without modifying the socket
5857 		 * state. I_SENDFD/RECVFD should not be used for socket file
5858 		 * descriptor passing since they assume a twisted stream.
5859 		 * SIOCATMARK must be used instead of I_ATMARK.
5860 		 *
5861 		 * _SIOCSOCKFALLBACK from an application should never be
5862 		 * processed.  It is only generated by socktpi_open() or
5863 		 * in response to I_POP or I_PUSH.
5864 		 */
5865 #ifdef DEBUG
5866 		zcmn_err(getzoneid(), CE_WARN,
5867 		    "Unsupported STREAMS ioctl 0x%x on socket. "
5868 		    "Pid = %d\n", cmd, curproc->p_pid);
5869 #endif /* DEBUG */
5870 		return (EOPNOTSUPP);
5871 
5872 	case _I_GETPEERCRED:
5873 		if ((mode & FKIOCTL) == 0)
5874 			return (EINVAL);
5875 
5876 		mutex_enter(&so->so_lock);
5877 		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
5878 			error = ENOTSUP;
5879 		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
5880 			error = ENOTCONN;
5881 		} else if (so->so_peercred != NULL) {
5882 			k_peercred_t *kp = (k_peercred_t *)arg;
5883 			kp->pc_cr = so->so_peercred;
5884 			kp->pc_cpid = so->so_cpid;
5885 			crhold(so->so_peercred);
5886 		} else {
5887 			error = EINVAL;
5888 		}
5889 		mutex_exit(&so->so_lock);
5890 		return (error);
5891 
5892 	default:
5893 		/*
5894 		 * Do the higher-order bits of the ioctl cmd indicate
5895 		 * that it is an I_* streams ioctl?
5896 		 */
5897 		if ((cmd & 0xffffff00U) == STR &&
5898 		    so->so_version == SOV_SOCKBSD) {
5899 #ifdef DEBUG
5900 			zcmn_err(getzoneid(), CE_WARN,
5901 			    "Unsupported STREAMS ioctl 0x%x on socket. "
5902 			    "Pid = %d\n", cmd, curproc->p_pid);
5903 #endif /* DEBUG */
5904 			return (EOPNOTSUPP);
5905 		}
5906 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5907 	}
5908 }
5909 
5910 /*
5911  * Handle plumbing-related ioctls.
5912  */
5913 static int
5914 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
5915     struct cred *cr, int32_t *rvalp)
5916 {
5917 	static const char sockmod_name[] = "sockmod";
5918 	struct sonode	*so = VTOSO(vp);
5919 	char		mname[FMNAMESZ + 1];
5920 	int		error;
5921 	sotpi_info_t	*sti = SOTOTPI(so);
5922 
5923 	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
5924 
5925 	if (so->so_version == SOV_SOCKBSD)
5926 		return (EOPNOTSUPP);
5927 
5928 	if (so->so_version == SOV_STREAM) {
5929 		/*
5930 		 * The imaginary "sockmod" has been popped - act as a stream.
5931 		 * If this is a push of sockmod then change back to a socket.
5932 		 */
5933 		if (cmd == I_PUSH) {
5934 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
5935 			    (void *)arg, mname, sizeof (mname), NULL);
5936 
5937 			if (error == 0 && strcmp(mname, sockmod_name) == 0) {
5938 				dprintso(so, 0, ("socktpi_ioctl: going to "
5939 				    "socket version\n"));
5940 				so_stream2sock(so);
5941 				return (0);
5942 			}
5943 		}
5944 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5945 	}
5946 
5947 	switch (cmd) {
5948 	case I_PUSH:
5949 		if (sti->sti_direct) {
5950 			mutex_enter(&so->so_lock);
5951 			so_lock_single(so);
5952 			mutex_exit(&so->so_lock);
5953 
5954 			error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
5955 			    cr, rvalp);
5956 
5957 			mutex_enter(&so->so_lock);
5958 			if (error == 0)
5959 				sti->sti_direct = 0;
5960 			so_unlock_single(so, SOLOCKED);
5961 			mutex_exit(&so->so_lock);
5962 
5963 			if (error != 0)
5964 				return (error);
5965 		}
5966 
5967 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5968 		if (error == 0)
5969 			sti->sti_pushcnt++;
5970 		return (error);
5971 
5972 	case I_POP:
5973 		if (sti->sti_pushcnt == 0) {
5974 			/* Emulate sockmod being popped */
5975 			dprintso(so, 0,
5976 			    ("socktpi_ioctl: going to STREAMS version\n"));
5977 			return (so_sock2stream(so));
5978 		}
5979 
5980 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5981 		if (error == 0)
5982 			sti->sti_pushcnt--;
5983 		return (error);
5984 
5985 	case I_LIST: {
5986 		struct str_mlist *kmlistp, *umlistp;
5987 		struct str_list	kstrlist;
5988 		ssize_t		kstrlistsize;
5989 		int		i, nmods;
5990 
5991 		STRUCT_DECL(str_list, ustrlist);
5992 		STRUCT_INIT(ustrlist, mode);
5993 
5994 		if (arg == 0) {
5995 			error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5996 			if (error == 0)
5997 				(*rvalp)++;	/* Add one for sockmod */
5998 			return (error);
5999 		}
6000 
6001 		error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
6002 		    STRUCT_SIZE(ustrlist), mode & FKIOCTL);
6003 		if (error != 0)
6004 			return (error);
6005 
6006 		nmods = STRUCT_FGET(ustrlist, sl_nmods);
6007 		if (nmods <= 0)
6008 			return (EINVAL);
6009 		/*
6010 		 * Ceiling nmods at nstrpush to prevent someone from
6011 		 * maliciously consuming lots of kernel memory.
6012 		 */
6013 		nmods = MIN(nmods, nstrpush);
6014 
6015 		kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6016 		kstrlist.sl_nmods = nmods;
6017 		kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6018 
6019 		error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6020 		    cr, rvalp);
6021 		if (error != 0)
6022 			goto done;
6023 
6024 		/*
6025 		 * Considering the module list as a 0-based array of sl_nmods
6026 		 * modules, sockmod should conceptually exist at slot
6027 		 * sti_pushcnt.  Insert sockmod at this location by sliding all
6028 		 * of the module names after so_pushcnt over by one.  We know
6029 		 * that there will be room to do this since we allocated
6030 		 * sl_modlist with an additional slot.
6031 		 */
6032 		for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6033 			kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6034 
6035 		(void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6036 		kstrlist.sl_nmods++;
6037 
6038 		/*
6039 		 * Copy all of the entries out to ustrlist.
6040 		 */
6041 		kmlistp = kstrlist.sl_modlist;
6042 		umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6043 		for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6044 			error = so_copyout(kmlistp++, umlistp++,
6045 			    sizeof (struct str_mlist), mode & FKIOCTL);
6046 			if (error != 0)
6047 				goto done;
6048 		}
6049 
6050 		error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6051 		    mode & FKIOCTL);
6052 		if (error == 0)
6053 			*rvalp = 0;
6054 	done:
6055 		kmem_free(kstrlist.sl_modlist, kstrlistsize);
6056 		return (error);
6057 	}
6058 	case I_LOOK:
6059 		if (sti->sti_pushcnt == 0) {
6060 			return (so_copyout(sockmod_name, (void *)arg,
6061 			    sizeof (sockmod_name), mode & FKIOCTL));
6062 		}
6063 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6064 
6065 	case I_FIND:
6066 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6067 		if (error && error != EINVAL)
6068 			return (error);
6069 
6070 		/* if not found and string was sockmod return 1 */
6071 		if (*rvalp == 0 || error == EINVAL) {
6072 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6073 			    (void *)arg, mname, sizeof (mname), NULL);
6074 			if (error == ENAMETOOLONG)
6075 				error = EINVAL;
6076 
6077 			if (error == 0 && strcmp(mname, sockmod_name) == 0)
6078 				*rvalp = 1;
6079 		}
6080 		return (error);
6081 
6082 	default:
6083 		panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6084 		break;
6085 	}
6086 
6087 	return (0);
6088 }
6089 
6090 /*
6091  * Wrapper around the streams poll routine that implements socket poll
6092  * semantics.
6093  * The sockfs never calls pollwakeup itself - the stream head take care
6094  * of all pollwakeups. Since sockfs never holds so_lock when calling the
6095  * stream head there can never be a deadlock due to holding so_lock across
6096  * pollwakeup and acquiring so_lock in this routine.
6097  *
6098  * However, since the performance of VOP_POLL is critical we avoid
6099  * acquiring so_lock here. This is based on two assumptions:
6100  *  - The poll implementation holds locks to serialize the VOP_POLL call
6101  *    and a pollwakeup for the same pollhead. This ensures that should
6102  *    e.g. so_state change during a socktpi_poll call the pollwakeup
6103  *    (which strsock_* and strrput conspire to issue) is issued after
6104  *    the state change. Thus the pollwakeup will block until VOP_POLL has
6105  *    returned and then wake up poll and have it call VOP_POLL again.
6106  *  - The reading of so_state without holding so_lock does not result in
6107  *    stale data that is older than the latest state change that has dropped
6108  *    so_lock. This is ensured by the mutex_exit issuing the appropriate
6109  *    memory barrier to force the data into the coherency domain.
6110  */
6111 static int
6112 sotpi_poll(
6113 	struct sonode	*so,
6114 	short		events,
6115 	int		anyyet,
6116 	short		*reventsp,
6117 	struct pollhead **phpp)
6118 {
6119 	short origevents = events;
6120 	struct vnode *vp = SOTOV(so);
6121 	int error;
6122 	int so_state = so->so_state;	/* snapshot */
6123 	sotpi_info_t *sti = SOTOTPI(so);
6124 
6125 	dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6126 	    (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6127 
6128 	ASSERT(vp->v_type == VSOCK);
6129 	ASSERT(vp->v_stream != NULL);
6130 
6131 	if (so->so_version == SOV_STREAM) {
6132 		/* The imaginary "sockmod" has been popped - act as a stream */
6133 		return (strpoll(vp->v_stream, events, anyyet,
6134 		    reventsp, phpp));
6135 	}
6136 
6137 	if (!(so_state & SS_ISCONNECTED) &&
6138 	    (so->so_mode & SM_CONNREQUIRED)) {
6139 		/* Not connected yet - turn off write side events */
6140 		events &= ~(POLLOUT|POLLWRBAND);
6141 	}
6142 	/*
6143 	 * Check for errors without calling strpoll if the caller wants them.
6144 	 * In sockets the errors are represented as input/output events
6145 	 * and there is no need to ask the stream head for this information.
6146 	 */
6147 	if (so->so_error != 0 &&
6148 	    ((POLLIN|POLLRDNORM|POLLOUT) & origevents)  != 0) {
6149 		*reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6150 		return (0);
6151 	}
6152 	/*
6153 	 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6154 	 * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
6155 	 * will not trigger a POLLIN event with POLLRDDATA set.
6156 	 * The handling of urgent data (causing POLLRDBAND) is done by
6157 	 * inspecting SS_OOBPEND below.
6158 	 */
6159 	events |= POLLRDDATA;
6160 
6161 	/*
6162 	 * After shutdown(output) a stream head write error is set.
6163 	 * However, we should not return output events.
6164 	 */
6165 	events |= POLLNOERR;
6166 	error = strpoll(vp->v_stream, events, anyyet,
6167 	    reventsp, phpp);
6168 	if (error)
6169 		return (error);
6170 
6171 	ASSERT(!(*reventsp & POLLERR));
6172 
6173 	/*
6174 	 * Notes on T_CONN_IND handling for sockets.
6175 	 *
6176 	 * If strpoll() returned without events, SR_POLLIN is guaranteed
6177 	 * to be set, ensuring any subsequent strrput() runs pollwakeup().
6178 	 *
6179 	 * Since the so_lock is not held, soqueueconnind() may have run
6180 	 * and a T_CONN_IND may be waiting. We now check for any queued
6181 	 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6182 	 * to ensure poll returns.
6183 	 *
6184 	 * However:
6185 	 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6186 	 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6187 	 * the following actions will occur; taken together they ensure the
6188 	 * syscall will return.
6189 	 *
6190 	 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6191 	 *    the accept() was run on a non-blocking socket sowaitconnind()
6192 	 *    may have already returned EWOULDBLOCK, so not be waiting to
6193 	 *    process the message. Additionally socktpi_poll() has probably
6194 	 *    proceeded past the sti_conn_ind_head check below.
6195 	 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6196 	 *    this thread,  however that could occur before poll_common()
6197 	 *    has entered cv_wait.
6198 	 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6199 	 *
6200 	 * Before proceeding to cv_wait() in poll_common() for an event,
6201 	 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6202 	 * and if set, re-calls strpoll() to ensure the late arriving
6203 	 * T_CONN_IND is recognized, and pollsys() returns.
6204 	 */
6205 
6206 	if (sti->sti_conn_ind_head != NULL)
6207 		*reventsp |= (POLLIN|POLLRDNORM) & events;
6208 
6209 	if (so->so_state & SS_CANTRCVMORE) {
6210 		*reventsp |= POLLRDHUP & events;
6211 
6212 		if (so->so_state & SS_CANTSENDMORE)
6213 			*reventsp |= POLLHUP;
6214 	}
6215 
6216 	if (so->so_state & SS_OOBPEND)
6217 		*reventsp |= POLLRDBAND & events;
6218 
6219 	return (0);
6220 }
6221 
6222 /*ARGSUSED*/
6223 static int
6224 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6225 {
6226 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6227 	int error = 0;
6228 
6229 	error = sonode_constructor(buf, cdrarg, kmflags);
6230 	if (error != 0)
6231 		return (error);
6232 
6233 	error = i_sotpi_info_constructor(&st->st_info);
6234 	if (error != 0)
6235 		sonode_destructor(buf, cdrarg);
6236 
6237 	st->st_sonode.so_priv = &st->st_info;
6238 
6239 	return (error);
6240 }
6241 
6242 /*ARGSUSED1*/
6243 static void
6244 socktpi_destructor(void *buf, void *cdrarg)
6245 {
6246 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6247 
6248 	ASSERT(st->st_sonode.so_priv == &st->st_info);
6249 	st->st_sonode.so_priv = NULL;
6250 
6251 	i_sotpi_info_destructor(&st->st_info);
6252 	sonode_destructor(buf, cdrarg);
6253 }
6254 
6255 static int
6256 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6257 {
6258 	int retval;
6259 
6260 	if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6261 		struct sonode *so = (struct sonode *)buf;
6262 		sotpi_info_t *sti = SOTOTPI(so);
6263 
6264 		mutex_enter(&socklist.sl_lock);
6265 
6266 		sti->sti_next_so = socklist.sl_list;
6267 		sti->sti_prev_so = NULL;
6268 		if (sti->sti_next_so != NULL)
6269 			SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6270 		socklist.sl_list = so;
6271 
6272 		mutex_exit(&socklist.sl_lock);
6273 
6274 	}
6275 	return (retval);
6276 }
6277 
6278 static void
6279 socktpi_unix_destructor(void *buf, void *cdrarg)
6280 {
6281 	struct sonode	*so = (struct sonode *)buf;
6282 	sotpi_info_t	*sti = SOTOTPI(so);
6283 
6284 	mutex_enter(&socklist.sl_lock);
6285 
6286 	if (sti->sti_next_so != NULL)
6287 		SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6288 	if (sti->sti_prev_so != NULL)
6289 		SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6290 	else
6291 		socklist.sl_list = sti->sti_next_so;
6292 
6293 	mutex_exit(&socklist.sl_lock);
6294 
6295 	socktpi_destructor(buf, cdrarg);
6296 }
6297 
6298 int
6299 socktpi_init(void)
6300 {
6301 	/*
6302 	 * Create sonode caches.  We create a special one for AF_UNIX so
6303 	 * that we can track them for netstat(8).
6304 	 */
6305 	socktpi_cache = kmem_cache_create("socktpi_cache",
6306 	    sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6307 	    socktpi_destructor, NULL, NULL, NULL, 0);
6308 
6309 	socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6310 	    sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6311 	    socktpi_unix_destructor, NULL, NULL, NULL, 0);
6312 
6313 	return (0);
6314 }
6315 
6316 /*
6317  * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6318  *
6319  * Caller must still update state and mode using sotpi_update_state().
6320  */
6321 int
6322 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6323     boolean_t *direct, queue_t **qp, struct cred *cr)
6324 {
6325 	sotpi_info_t *sti;
6326 	struct sockparams *origsp = so->so_sockparams;
6327 	sock_lower_handle_t handle = so->so_proto_handle;
6328 	struct stdata *stp;
6329 	struct vnode *vp;
6330 	queue_t *q;
6331 	int error = 0;
6332 
6333 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6334 	    SS_FALLBACK_PENDING);
6335 	ASSERT(SOCK_IS_NONSTR(so));
6336 
6337 	*qp = NULL;
6338 	*direct = B_FALSE;
6339 	so->so_sockparams = newsp;
6340 	/*
6341 	 * Allocate and initalize fields required by TPI.
6342 	 */
6343 	(void) sotpi_info_create(so, KM_SLEEP);
6344 	sotpi_info_init(so);
6345 
6346 	if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6347 		sotpi_info_fini(so);
6348 		sotpi_info_destroy(so);
6349 		return (error);
6350 	}
6351 	ASSERT(handle == so->so_proto_handle);
6352 	sti = SOTOTPI(so);
6353 	if (sti->sti_direct != 0)
6354 		*direct = B_TRUE;
6355 
6356 	/*
6357 	 * Keep the original sp around so we can properly dispose of the
6358 	 * sonode when the socket is being closed.
6359 	 */
6360 	sti->sti_orig_sp = origsp;
6361 
6362 	so_basic_strinit(so);	/* skips the T_CAPABILITY_REQ */
6363 	so_alloc_addr(so, so->so_max_addr_len);
6364 
6365 	/*
6366 	 * If the application has done a SIOCSPGRP, make sure the
6367 	 * STREAM head is aware. This needs to take place before
6368 	 * the protocol start sending up messages. Otherwise we
6369 	 * might miss to generate SIGPOLL.
6370 	 *
6371 	 * It is possible that the application will receive duplicate
6372 	 * signals if some were already generated for either data or
6373 	 * connection indications.
6374 	 */
6375 	if (so->so_pgrp != 0) {
6376 		if (so_set_events(so, so->so_vnode, cr) != 0)
6377 			so->so_pgrp = 0;
6378 	}
6379 
6380 	/*
6381 	 * Determine which queue to use.
6382 	 */
6383 	vp = SOTOV(so);
6384 	stp = vp->v_stream;
6385 	ASSERT(stp != NULL);
6386 	q = stp->sd_wrq->q_next;
6387 
6388 	/*
6389 	 * Skip any modules that may have been auto pushed when the device
6390 	 * was opened
6391 	 */
6392 	while (q->q_next != NULL)
6393 		q = q->q_next;
6394 	*qp = _RD(q);
6395 
6396 	/* This is now a STREAMS sockets */
6397 	so->so_not_str = B_FALSE;
6398 
6399 	return (error);
6400 }
6401 
6402 /*
6403  * Revert a TPI sonode. It is only allowed to revert the sonode during
6404  * the fallback process.
6405  */
6406 void
6407 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6408 {
6409 	vnode_t *vp = SOTOV(so);
6410 
6411 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6412 	    SS_FALLBACK_PENDING);
6413 	ASSERT(!SOCK_IS_NONSTR(so));
6414 	ASSERT(vp->v_stream != NULL);
6415 
6416 	strclean(vp);
6417 	(void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6418 
6419 	/*
6420 	 * Restore the original sockparams. The caller is responsible for
6421 	 * dropping the ref to the new sp.
6422 	 */
6423 	so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6424 
6425 	sotpi_info_fini(so);
6426 	sotpi_info_destroy(so);
6427 
6428 	/* This is no longer a STREAMS sockets */
6429 	so->so_not_str = B_TRUE;
6430 }
6431 
6432 void
6433 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6434     struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6435     socklen_t faddrlen, short opts)
6436 {
6437 	sotpi_info_t *sti = SOTOTPI(so);
6438 
6439 	so_proc_tcapability_ack(so, tcap);
6440 
6441 	so->so_options |= opts;
6442 
6443 	/*
6444 	 * Determine whether the foreign and local address are valid
6445 	 */
6446 	if (laddrlen != 0) {
6447 		ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6448 		sti->sti_laddr_len = laddrlen;
6449 		bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6450 		sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6451 	}
6452 
6453 	if (faddrlen != 0) {
6454 		ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6455 		sti->sti_faddr_len = faddrlen;
6456 		bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6457 		sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6458 	}
6459 
6460 }
6461 
6462 /*
6463  * Allocate enough space to cache the local and foreign addresses.
6464  */
6465 void
6466 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6467 {
6468 	sotpi_info_t *sti = SOTOTPI(so);
6469 
6470 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6471 	ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6472 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6473 	    P2ROUNDUP(maxlen, KMEM_ALIGN);
6474 	so->so_max_addr_len = sti->sti_laddr_maxlen;
6475 	sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6476 	sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6477 	    + sti->sti_laddr_maxlen);
6478 
6479 	if (so->so_family == AF_UNIX) {
6480 		/*
6481 		 * Initialize AF_UNIX related fields.
6482 		 */
6483 		bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6484 		bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6485 	}
6486 }
6487 
6488 
6489 sotpi_info_t *
6490 sotpi_sototpi(struct sonode *so)
6491 {
6492 	sotpi_info_t *sti;
6493 
6494 	ASSERT(so != NULL);
6495 
6496 	sti = (sotpi_info_t *)so->so_priv;
6497 
6498 	ASSERT(sti != NULL);
6499 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6500 
6501 	return (sti);
6502 }
6503 
6504 static int
6505 i_sotpi_info_constructor(sotpi_info_t *sti)
6506 {
6507 	sti->sti_magic		= SOTPI_INFO_MAGIC;
6508 	sti->sti_ack_mp		= NULL;
6509 	sti->sti_discon_ind_mp	= NULL;
6510 	sti->sti_ux_bound_vp	= NULL;
6511 	sti->sti_unbind_mp	= NULL;
6512 
6513 	sti->sti_conn_ind_head	= NULL;
6514 	sti->sti_conn_ind_tail	= NULL;
6515 
6516 	sti->sti_laddr_sa	= NULL;
6517 	sti->sti_faddr_sa	= NULL;
6518 
6519 	mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6520 	cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6521 
6522 	return (0);
6523 }
6524 
6525 static void
6526 i_sotpi_info_destructor(sotpi_info_t *sti)
6527 {
6528 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6529 	ASSERT(sti->sti_ack_mp == NULL);
6530 	ASSERT(sti->sti_discon_ind_mp == NULL);
6531 	ASSERT(sti->sti_ux_bound_vp == NULL);
6532 	ASSERT(sti->sti_unbind_mp == NULL);
6533 
6534 	ASSERT(sti->sti_conn_ind_head == NULL);
6535 	ASSERT(sti->sti_conn_ind_tail == NULL);
6536 
6537 	ASSERT(sti->sti_laddr_sa == NULL);
6538 	ASSERT(sti->sti_faddr_sa == NULL);
6539 
6540 	mutex_destroy(&sti->sti_plumb_lock);
6541 	cv_destroy(&sti->sti_ack_cv);
6542 }
6543 
6544 /*
6545  * Creates and attaches TPI information to the given sonode
6546  */
6547 static boolean_t
6548 sotpi_info_create(struct sonode *so, int kmflags)
6549 {
6550 	sotpi_info_t *sti;
6551 
6552 	ASSERT(so->so_priv == NULL);
6553 
6554 	if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6555 		return (B_FALSE);
6556 
6557 	if (i_sotpi_info_constructor(sti) != 0) {
6558 		kmem_free(sti, sizeof (*sti));
6559 		return (B_FALSE);
6560 	}
6561 
6562 	so->so_priv = (void *)sti;
6563 	return (B_TRUE);
6564 }
6565 
6566 /*
6567  * Initializes the TPI information.
6568  */
6569 static void
6570 sotpi_info_init(struct sonode *so)
6571 {
6572 	struct vnode *vp = SOTOV(so);
6573 	sotpi_info_t *sti = SOTOTPI(so);
6574 	time_t now;
6575 
6576 	sti->sti_dev	= so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6577 	vp->v_rdev	= sti->sti_dev;
6578 
6579 	sti->sti_orig_sp = NULL;
6580 
6581 	sti->sti_pushcnt = 0;
6582 
6583 	now = gethrestime_sec();
6584 	sti->sti_atime	= now;
6585 	sti->sti_mtime	= now;
6586 	sti->sti_ctime	= now;
6587 
6588 	sti->sti_eaddr_mp = NULL;
6589 	sti->sti_delayed_error = 0;
6590 
6591 	sti->sti_provinfo = NULL;
6592 
6593 	sti->sti_oobcnt = 0;
6594 	sti->sti_oobsigcnt = 0;
6595 
6596 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6597 
6598 	sti->sti_laddr_sa	= 0;
6599 	sti->sti_faddr_sa	= 0;
6600 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6601 	sti->sti_laddr_len = sti->sti_faddr_len = 0;
6602 
6603 	sti->sti_laddr_valid = 0;
6604 	sti->sti_faddr_valid = 0;
6605 	sti->sti_faddr_noxlate = 0;
6606 
6607 	sti->sti_direct = 0;
6608 
6609 	ASSERT(sti->sti_ack_mp == NULL);
6610 	ASSERT(sti->sti_ux_bound_vp == NULL);
6611 	ASSERT(sti->sti_unbind_mp == NULL);
6612 
6613 	ASSERT(sti->sti_conn_ind_head == NULL);
6614 	ASSERT(sti->sti_conn_ind_tail == NULL);
6615 }
6616 
6617 /*
6618  * Given a sonode, grab the TPI info and free any data.
6619  */
6620 static void
6621 sotpi_info_fini(struct sonode *so)
6622 {
6623 	sotpi_info_t *sti = SOTOTPI(so);
6624 	mblk_t *mp;
6625 
6626 	ASSERT(sti->sti_discon_ind_mp == NULL);
6627 
6628 	if ((mp = sti->sti_conn_ind_head) != NULL) {
6629 		mblk_t *mp1;
6630 
6631 		while (mp) {
6632 			mp1 = mp->b_next;
6633 			mp->b_next = NULL;
6634 			freemsg(mp);
6635 			mp = mp1;
6636 		}
6637 		sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
6638 	}
6639 
6640 	/*
6641 	 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
6642 	 * indirect them.  It also uses so_count as a validity test.
6643 	 */
6644 	mutex_enter(&so->so_lock);
6645 
6646 	if (sti->sti_laddr_sa) {
6647 		ASSERT((caddr_t)sti->sti_faddr_sa ==
6648 		    (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
6649 		ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
6650 		sti->sti_laddr_valid = 0;
6651 		sti->sti_faddr_valid = 0;
6652 		kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
6653 		sti->sti_laddr_sa = NULL;
6654 		sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
6655 		sti->sti_faddr_sa = NULL;
6656 		sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
6657 	}
6658 
6659 	mutex_exit(&so->so_lock);
6660 
6661 	if ((mp = sti->sti_eaddr_mp) != NULL) {
6662 		freemsg(mp);
6663 		sti->sti_eaddr_mp = NULL;
6664 		sti->sti_delayed_error = 0;
6665 	}
6666 
6667 	if ((mp = sti->sti_ack_mp) != NULL) {
6668 		freemsg(mp);
6669 		sti->sti_ack_mp = NULL;
6670 	}
6671 
6672 	ASSERT(sti->sti_ux_bound_vp == NULL);
6673 	if ((mp = sti->sti_unbind_mp) != NULL) {
6674 		freemsg(mp);
6675 		sti->sti_unbind_mp = NULL;
6676 	}
6677 }
6678 
6679 /*
6680  * Destroys the TPI information attached to a sonode.
6681  */
6682 static void
6683 sotpi_info_destroy(struct sonode *so)
6684 {
6685 	sotpi_info_t *sti = SOTOTPI(so);
6686 
6687 	i_sotpi_info_destructor(sti);
6688 	kmem_free(sti, sizeof (*sti));
6689 
6690 	so->so_priv = NULL;
6691 }
6692 
6693 /*
6694  * Create the global sotpi socket module entry. It will never be freed.
6695  */
6696 smod_info_t *
6697 sotpi_smod_create(void)
6698 {
6699 	smod_info_t *smodp;
6700 
6701 	smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
6702 	smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
6703 	(void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
6704 	/*
6705 	 * Initialize the smod_refcnt to 1 so it will never be freed.
6706 	 */
6707 	smodp->smod_refcnt = 1;
6708 	smodp->smod_uc_version = SOCK_UC_VERSION;
6709 	smodp->smod_dc_version = SOCK_DC_VERSION;
6710 	smodp->smod_sock_create_func = &sotpi_create;
6711 	smodp->smod_sock_destroy_func = &sotpi_destroy;
6712 	return (smodp);
6713 }
6714