xref: /illumos-gate/usr/src/uts/common/fs/sockfs/socktpi.c (revision 43449cdcd0600512dd862537f2cf014140dd0844)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2015, Joyent, Inc.
25  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/t_lock.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/buf.h>
33 #include <sys/conf.h>
34 #include <sys/cred.h>
35 #include <sys/kmem.h>
36 #include <sys/kmem_impl.h>
37 #include <sys/sysmacros.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/debug.h>
41 #include <sys/errno.h>
42 #include <sys/time.h>
43 #include <sys/file.h>
44 #include <sys/open.h>
45 #include <sys/user.h>
46 #include <sys/termios.h>
47 #include <sys/stream.h>
48 #include <sys/strsubr.h>
49 #include <sys/strsun.h>
50 #include <sys/suntpi.h>
51 #include <sys/ddi.h>
52 #include <sys/esunddi.h>
53 #include <sys/flock.h>
54 #include <sys/modctl.h>
55 #include <sys/vtrace.h>
56 #include <sys/cmn_err.h>
57 #include <sys/pathname.h>
58 
59 #include <sys/socket.h>
60 #include <sys/socketvar.h>
61 #include <sys/sockio.h>
62 #include <netinet/in.h>
63 #include <sys/un.h>
64 #include <sys/strsun.h>
65 
66 #include <sys/tiuser.h>
67 #define	_SUN_TPI_VERSION	2
68 #include <sys/tihdr.h>
69 #include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
70 
71 #include <c2/audit.h>
72 
73 #include <inet/common.h>
74 #include <inet/ip.h>
75 #include <inet/ip6.h>
76 #include <inet/tcp.h>
77 #include <inet/udp_impl.h>
78 
79 #include <sys/zone.h>
80 
81 #include <fs/sockfs/nl7c.h>
82 #include <fs/sockfs/nl7curi.h>
83 
84 #include <fs/sockfs/sockcommon.h>
85 #include <fs/sockfs/socktpi.h>
86 #include <fs/sockfs/socktpi_impl.h>
87 
88 /*
89  * Possible failures when memory can't be allocated. The documented behavior:
90  *
91  *		5.5:			4.X:		XNET:
92  * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
93  *							EINTR
94  *	(4.X does not document EINTR but returns it)
95  * bind:	ENOSR			-		ENOBUFS/ENOSR
96  * connect:	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
97  * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
98  * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
99  *	(4.X getpeername and getsockname do not fail in practice)
100  * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
101  * listen:	-			-		ENOBUFS
102  * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
103  *							EINTR
104  * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
105  *							EINTR
106  * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
107  * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
108  * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
109  * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
110  *
111  * Resolution. When allocation fails:
112  *	recv: return EINTR
113  *	send: return EINTR
114  *	connect, accept: EINTR
115  *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
116  *	socket, socketpair: ENOBUFS
117  *	getpeername, getsockname: sleep
118  *	getsockopt, setsockopt: sleep
119  */
120 
121 #ifdef SOCK_TEST
122 /*
123  * Variables that make sockfs do something other than the standard TPI
124  * for the AF_INET transports.
125  *
126  * solisten_tpi_tcp:
127  *	TCP can handle a O_T_BIND_REQ with an increased backlog even though
128  *	the transport is already bound. This is needed to avoid losing the
129  *	port number should listen() do a T_UNBIND_REQ followed by a
130  *	O_T_BIND_REQ.
131  *
132  * soconnect_tpi_udp:
133  *	UDP and ICMP can handle a T_CONN_REQ.
134  *	This is needed to make the sequence of connect(), getsockname()
135  *	return the local IP address used to send packets to the connected to
136  *	destination.
137  *
138  * soconnect_tpi_tcp:
139  *	TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
140  *	Set this to non-zero to send TPI conformant messages to TCP in this
141  *	respect. This is a performance optimization.
142  *
143  * soaccept_tpi_tcp:
144  *	TCP can handle a T_CONN_REQ without the acceptor being bound.
145  *	This is a performance optimization that has been picked up in XTI.
146  *
147  * soaccept_tpi_multioptions:
148  *	When inheriting SOL_SOCKET options from the listener to the accepting
149  *	socket send them as a single message for AF_INET{,6}.
150  */
151 int solisten_tpi_tcp = 0;
152 int soconnect_tpi_udp = 0;
153 int soconnect_tpi_tcp = 0;
154 int soaccept_tpi_tcp = 0;
155 int soaccept_tpi_multioptions = 1;
156 #else /* SOCK_TEST */
/*
 * Without SOCK_TEST the same names are compile-time constants carrying the
 * same default values as the variables above.
 */
157 #define	soconnect_tpi_tcp	0
158 #define	soconnect_tpi_udp	0
159 #define	solisten_tpi_tcp	0
160 #define	soaccept_tpi_tcp	0
161 #define	soaccept_tpi_multioptions	1
162 #endif /* SOCK_TEST */
163 
164 #ifdef SOCK_TEST
165 extern int do_useracc;
166 extern clock_t sock_test_timelimit;
167 #endif /* SOCK_TEST */
168 
169 extern uint32_t ucredsize;
170 
171 /*
172  * Some X/Open added checks might have to be backed out to keep SunOS 4.X
173  * applications working. Turn on this flag to disable these checks.
174  */
/*
 * xnet_skip_checks: when non-zero, the X/Open state check in the bind path
 * (bind on a socket with SS_CANTSENDMORE set) is skipped.
 */
175 int xnet_skip_checks = 0;
/*
 * xnet_check_print: when non-zero, the bind path printf()s a notice each
 * time that X/Open check causes EINVAL.
 */
176 int xnet_check_print = 0;
/*
 * xnet_truncate_print: not referenced in the part of the file reviewed here;
 * presumably enables a similar diagnostic for truncation - TODO confirm.
 */
177 int xnet_truncate_print = 0;
178 
179 static void sotpi_destroy(struct sonode *);
180 static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
181     int, int *, cred_t *cr);
182 
183 static boolean_t	sotpi_info_create(struct sonode *, int);
184 static void		sotpi_info_init(struct sonode *);
185 static void		sotpi_info_fini(struct sonode *);
186 static void		sotpi_info_destroy(struct sonode *);
187 
188 /*
189  * Do direct function call to the transport layer below; this would
190  * also allow the transport to utilize read-side synchronous stream
191  * interface if necessary.  This is a /etc/system tunable that must
192  * not be modified on a running system.  By default this is enabled
193  * for performance reasons and may be disabled for debugging purposes.
194  */
195 boolean_t socktpi_direct = B_TRUE;
196 
197 static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
198 
199 extern	void sigintr(k_sigset_t *, int);
200 extern	void sigunintr(k_sigset_t *);
201 
202 static int	sotpi_unbind(struct sonode *, int);
203 
204 /* TPI sockfs sonode operations */
205 int		sotpi_init(struct sonode *, struct sonode *, struct cred *,
206 		    int);
207 static int	sotpi_accept(struct sonode *, int, struct cred *,
208 		    struct sonode **);
209 static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
210 		    int, struct cred *);
211 static int	sotpi_listen(struct sonode *, int, struct cred *);
212 static int	sotpi_connect(struct sonode *, struct sockaddr *,
213 		    socklen_t, int, int, struct cred *);
214 extern int	sotpi_recvmsg(struct sonode *, struct nmsghdr *,
215 		    struct uio *, struct cred *);
216 static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
217 		    struct uio *, struct cred *);
218 static int	sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
219 		    struct cred *, mblk_t **);
220 static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
221 		    struct uio *, void *, t_uscalar_t, int);
222 static int	sodgram_direct(struct sonode *, struct sockaddr *,
223 		    socklen_t, struct uio *, int);
224 extern int	sotpi_getpeername(struct sonode *, struct sockaddr *,
225 		    socklen_t *, boolean_t, struct cred *);
226 static int	sotpi_getsockname(struct sonode *, struct sockaddr *,
227 		    socklen_t *, struct cred *);
228 static int	sotpi_shutdown(struct sonode *, int, struct cred *);
229 extern int	sotpi_getsockopt(struct sonode *, int, int, void *,
230 		    socklen_t *, int, struct cred *);
231 extern int	sotpi_setsockopt(struct sonode *, int, int, const void *,
232 		    socklen_t, struct cred *);
233 static int	sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
234 		    int32_t *);
235 static int	socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
236 		    struct cred *, int32_t *);
237 static int	sotpi_poll(struct sonode *, short, int, short *,
238 		    struct pollhead **);
239 static int	sotpi_close(struct sonode *, int, struct cred *);
240 
241 static int	i_sotpi_info_constructor(sotpi_info_t *);
242 static void	i_sotpi_info_destructor(sotpi_info_t *);
243 
/*
 * Operations vector for TPI sockets. sotpi_create() hands this table to
 * sonode_init(), and sotpi_destroy() asserts that so_ops still points at it.
 *
 * The initializer is positional: every entry must stay in the slot named by
 * its trailing comment.
 */
244 sonodeops_t sotpi_sonodeops = {
245 	sotpi_init,		/* sop_init		*/
246 	sotpi_accept,		/* sop_accept		*/
247 	sotpi_bind,		/* sop_bind		*/
248 	sotpi_listen,		/* sop_listen		*/
249 	sotpi_connect,		/* sop_connect		*/
250 	sotpi_recvmsg,		/* sop_recvmsg		*/
251 	sotpi_sendmsg,		/* sop_sendmsg		*/
252 	sotpi_sendmblk,		/* sop_sendmblk		*/
253 	sotpi_getpeername,	/* sop_getpeername	*/
254 	sotpi_getsockname,	/* sop_getsockname	*/
255 	sotpi_shutdown,		/* sop_shutdown		*/
256 	sotpi_getsockopt,	/* sop_getsockopt	*/
257 	sotpi_setsockopt,	/* sop_setsockopt	*/
258 	sotpi_ioctl,		/* sop_ioctl		*/
259 	sotpi_poll,		/* sop_poll		*/
260 	sotpi_close,		/* sop_close		*/
261 };
262 
263 /*
264  * Return a TPI socket vnode.
265  *
266  * Note that sockets assume that the driver will clone (either itself
267  * or by using the clone driver) i.e. a socket() call will always
268  * result in a new vnode being created.
269  */
270 
271 /*
272  * Common create code for socket and accept. If tso is set the values
273  * from that node are used instead of issuing a T_INFO_REQ.
274  */
275 
276 /* ARGSUSED */
277 static struct sonode *
278 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
279     int version, int sflags, int *errorp, cred_t *cr)
280 {
281 	struct sonode	*so;
282 	kmem_cache_t	*cp;
283 	int		sfamily = family;
284 
285 	ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
286 
287 	if (family == AF_NCA) {
288 		/*
289 		 * The request is for an NCA socket so for NL7C use the
290 		 * INET domain instead and mark NL7C_AF_NCA below.
291 		 */
292 		family = AF_INET;
293 		/*
294 		 * NL7C is not supported in the non-global zone,
295 		 * we enforce this restriction here.
296 		 */
297 		if (getzoneid() != GLOBAL_ZONEID) {
298 			*errorp = ENOTSUP;
299 			return (NULL);
300 		}
301 	}
302 
303 	/*
304 	 * to be compatible with old tpi socket implementation ignore
305 	 * sleep flag (sflags) passed in
306 	 */
307 	cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
308 	so = kmem_cache_alloc(cp, KM_SLEEP);
309 	if (so == NULL) {
310 		*errorp = ENOMEM;
311 		return (NULL);
312 	}
313 
314 	sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
315 	sotpi_info_init(so);
316 
317 	if (sfamily == AF_NCA) {
318 		SOTOTPI(so)->sti_nl7c_flags = NL7C_AF_NCA;
319 	}
320 
321 	if (version == SOV_DEFAULT)
322 		version = so_default_version;
323 
324 	so->so_version = (short)version;
325 	*errorp = 0;
326 
327 	return (so);
328 }
329 
330 static void
331 sotpi_destroy(struct sonode *so)
332 {
333 	kmem_cache_t *cp;
334 	struct sockparams *origsp;
335 
336 	/*
337 	 * If there is a new dealloc function (ie. smod_destroy_func),
338 	 * then it should check the correctness of the ops.
339 	 */
340 
341 	ASSERT(so->so_ops == &sotpi_sonodeops);
342 
343 	origsp = SOTOTPI(so)->sti_orig_sp;
344 
345 	sotpi_info_fini(so);
346 
347 	if (so->so_state & SS_FALLBACK_COMP) {
348 		/*
349 		 * A fallback happend, which means that a sotpi_info_t struct
350 		 * was allocated (as opposed to being allocated from the TPI
351 		 * sonode cache. Therefore we explicitly free the struct
352 		 * here.
353 		 */
354 		sotpi_info_destroy(so);
355 		ASSERT(origsp != NULL);
356 
357 		origsp->sp_smod_info->smod_sock_destroy_func(so);
358 		SOCKPARAMS_DEC_REF(origsp);
359 	} else {
360 		sonode_fini(so);
361 		cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
362 		    socktpi_cache;
363 		kmem_cache_free(cp, so);
364 	}
365 }
366 
367 /* ARGSUSED1 */
368 int
369 sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
370 {
371 	major_t maj;
372 	dev_t newdev;
373 	struct vnode *vp;
374 	int error = 0;
375 	struct stdata *stp;
376 
377 	sotpi_info_t *sti = SOTOTPI(so);
378 
379 	dprint(1, ("sotpi_init()\n"));
380 
381 	/*
382 	 * over write the sleep flag passed in but that is ok
383 	 * as tpi socket does not honor sleep flag.
384 	 */
385 	flags |= FREAD|FWRITE;
386 
387 	/*
388 	 * Record in so_flag that it is a clone.
389 	 */
390 	if (getmajor(sti->sti_dev) == clone_major)
391 		so->so_flag |= SOCLONE;
392 
393 	if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
394 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
395 	    (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
396 	    so->so_protocol == IPPROTO_IP)) {
397 		/* Tell tcp or udp that it's talking to sockets */
398 		flags |= SO_SOCKSTR;
399 
400 		/*
401 		 * Here we indicate to socktpi_open() our attempt to
402 		 * make direct calls between sockfs and transport.
403 		 * The final decision is left to socktpi_open().
404 		 */
405 		sti->sti_direct = 1;
406 
407 		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
408 		if (so->so_type == SOCK_STREAM && tso != NULL) {
409 			if (SOTOTPI(tso)->sti_direct) {
410 				/*
411 				 * Inherit sti_direct from listener and pass
412 				 * SO_ACCEPTOR open flag to tcp, indicating
413 				 * that this is an accept fast-path instance.
414 				 */
415 				flags |= SO_ACCEPTOR;
416 			} else {
417 				/*
418 				 * sti_direct is not set on listener, meaning
419 				 * that the listener has been converted from
420 				 * a socket to a stream.  Ensure that the
421 				 * acceptor inherits these settings.
422 				 */
423 				sti->sti_direct = 0;
424 				flags &= ~SO_SOCKSTR;
425 			}
426 		}
427 	}
428 
429 	/*
430 	 * Tell local transport that it is talking to sockets.
431 	 */
432 	if (so->so_family == AF_UNIX) {
433 		flags |= SO_SOCKSTR;
434 	}
435 
436 	vp = SOTOV(so);
437 	newdev = vp->v_rdev;
438 	maj = getmajor(newdev);
439 	ASSERT(STREAMSTAB(maj));
440 
441 	error = stropen(vp, &newdev, flags, cr);
442 
443 	stp = vp->v_stream;
444 	if (error == 0) {
445 		if (so->so_flag & SOCLONE)
446 			ASSERT(newdev != vp->v_rdev);
447 		mutex_enter(&so->so_lock);
448 		sti->sti_dev = newdev;
449 		vp->v_rdev = newdev;
450 		mutex_exit(&so->so_lock);
451 
452 		if (stp->sd_flag & STRISTTY) {
453 			/*
454 			 * this is a post SVR4 tty driver - a socket can not
455 			 * be a controlling terminal. Fail the open.
456 			 */
457 			(void) sotpi_close(so, flags, cr);
458 			return (ENOTTY);	/* XXX */
459 		}
460 
461 		ASSERT(stp->sd_wrq != NULL);
462 		sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
463 
464 		/*
465 		 * If caller is interested in doing direct function call
466 		 * interface to/from transport module, probe the module
467 		 * directly beneath the streamhead to see if it qualifies.
468 		 *
469 		 * We turn off the direct interface when qualifications fail.
470 		 * In the acceptor case, we simply turn off the sti_direct
471 		 * flag on the socket. We do the fallback after the accept
472 		 * has completed, before the new socket is returned to the
473 		 * application.
474 		 */
475 		if (sti->sti_direct) {
476 			queue_t *tq = stp->sd_wrq->q_next;
477 
478 			/*
479 			 * sti_direct is currently supported and tested
480 			 * only for tcp/udp; this is the main reason to
481 			 * have the following assertions.
482 			 */
483 			ASSERT(so->so_family == AF_INET ||
484 			    so->so_family == AF_INET6);
485 			ASSERT(so->so_protocol == IPPROTO_UDP ||
486 			    so->so_protocol == IPPROTO_TCP ||
487 			    so->so_protocol == IPPROTO_IP);
488 			ASSERT(so->so_type == SOCK_DGRAM ||
489 			    so->so_type == SOCK_STREAM);
490 
491 			/*
492 			 * Abort direct call interface if the module directly
493 			 * underneath the stream head is not defined with the
494 			 * _D_DIRECT flag.  This could happen in the tcp or
495 			 * udp case, when some other module is autopushed
496 			 * above it, or for some reasons the expected module
497 			 * isn't purely D_MP (which is the main requirement).
498 			 */
499 			if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
500 			    !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
501 				int rval;
502 
503 				/* Continue on without direct calls */
504 				sti->sti_direct = 0;
505 
506 				/*
507 				 * Cannot issue ioctl on fallback socket since
508 				 * there is no conn associated with the queue.
509 				 * The fallback downcall will notify the proto
510 				 * of the change.
511 				 */
512 				if (!(flags & SO_ACCEPTOR) &&
513 				    !(flags & SO_FALLBACK)) {
514 					if ((error = strioctl(vp,
515 					    _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
516 					    cr, &rval)) != 0) {
517 						(void) sotpi_close(so, flags,
518 						    cr);
519 						return (error);
520 					}
521 				}
522 			}
523 		}
524 
525 		if (flags & SO_FALLBACK) {
526 			/*
527 			 * The stream created does not have a conn.
528 			 * do stream set up after conn has been assigned
529 			 */
530 			return (error);
531 		}
532 		if (error = so_strinit(so, tso)) {
533 			(void) sotpi_close(so, flags, cr);
534 			return (error);
535 		}
536 
537 		/* Enable sendfile() on AF_UNIX streams */
538 		if (so->so_family == AF_UNIX && so->so_type == SOCK_STREAM) {
539 			mutex_enter(&so->so_lock);
540 			so->so_mode |= SM_SENDFILESUPP;
541 			mutex_exit(&so->so_lock);
542 		}
543 
544 		/* Wildcard */
545 		if (so->so_protocol != so->so_sockparams->sp_protocol) {
546 			int protocol = so->so_protocol;
547 			/*
548 			 * Issue SO_PROTOTYPE setsockopt.
549 			 */
550 			error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
551 			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
552 			if (error != 0) {
553 				(void) sotpi_close(so, flags, cr);
554 				/*
555 				 * Setsockopt often fails with ENOPROTOOPT but
556 				 * socket() should fail with
557 				 * EPROTONOSUPPORT/EPROTOTYPE.
558 				 */
559 				return (EPROTONOSUPPORT);
560 			}
561 		}
562 
563 	} else {
564 		/*
565 		 * While the same socket can not be reopened (unlike specfs)
566 		 * the stream head sets STREOPENFAIL when the autopush fails.
567 		 */
568 		if ((stp != NULL) &&
569 		    (stp->sd_flag & STREOPENFAIL)) {
570 			/*
571 			 * Open failed part way through.
572 			 */
573 			mutex_enter(&stp->sd_lock);
574 			stp->sd_flag &= ~STREOPENFAIL;
575 			mutex_exit(&stp->sd_lock);
576 			(void) sotpi_close(so, flags, cr);
577 			return (error);
578 			/*NOTREACHED*/
579 		}
580 		ASSERT(stp == NULL);
581 	}
582 	TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
583 	    "sockfs open:maj %d vp %p so %p error %d",
584 	    maj, vp, so, error);
585 	return (error);
586 }
587 
588 /*
589  * Bind the socket to an unspecified address in sockfs only.
590  * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
591  * required in all cases.
592  */
593 static void
594 so_automatic_bind(struct sonode *so)
595 {
596 	sotpi_info_t *sti = SOTOTPI(so);
597 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
598 
599 	ASSERT(MUTEX_HELD(&so->so_lock));
600 	ASSERT(!(so->so_state & SS_ISBOUND));
601 	ASSERT(sti->sti_unbind_mp);
602 
603 	ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
604 	bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
605 	sti->sti_laddr_sa->sa_family = so->so_family;
606 	so->so_state |= SS_ISBOUND;
607 }
608 
609 
610 /*
611  * bind the socket.
612  *
613  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
614  * are passed in we allow rebinding. Note that for backwards compatibility
615  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
616  * Thus the rebinding code is currently not executed.
617  *
618  * The constraints for rebinding are:
619  * - it is a SOCK_DGRAM, or
620  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
621  *   and no listen() has been done.
622  * This rebinding code was added based on some language in the XNET book
623  * about not returning EINVAL if the protocol allows rebinding. However,
624  * this language is not present in the Posix socket draft. Thus maybe the
625  * rebinding logic should be deleted from the source.
626  *
627  * A null "name" can be used to unbind the socket if:
628  * - it is a SOCK_DGRAM, or
629  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
630  *   and no listen() has been done.
631  */
632 /* ARGSUSED */
633 static int
634 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
635     socklen_t namelen, int backlog, int flags, struct cred *cr)
636 {
637 	struct T_bind_req	bind_req;
638 	struct T_bind_ack	*bind_ack;
639 	int			error = 0;
640 	mblk_t			*mp;
641 	void			*addr;
642 	t_uscalar_t		addrlen;
643 	int			unbind_on_err = 1;
644 	boolean_t		clear_acceptconn_on_err = B_FALSE;
645 	boolean_t		restore_backlog_on_err = B_FALSE;
646 	int			save_so_backlog;
647 	t_scalar_t		PRIM_type = O_T_BIND_REQ;
648 	boolean_t		tcp_udp_xport;
649 	void			*nl7c = NULL;
650 	sotpi_info_t		*sti = SOTOTPI(so);
651 
652 	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
653 	    (void *)so, (void *)name, namelen, backlog, flags,
654 	    pr_state(so->so_state, so->so_mode)));
655 
656 	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
657 
658 	if (!(flags & _SOBIND_LOCK_HELD)) {
659 		mutex_enter(&so->so_lock);
660 		so_lock_single(so);	/* Set SOLOCKED */
661 	} else {
662 		ASSERT(MUTEX_HELD(&so->so_lock));
663 		ASSERT(so->so_flag & SOLOCKED);
664 	}
665 
666 	/*
667 	 * Make sure that there is a preallocated unbind_req message
668 	 * before binding. This message is allocated when the socket is
669 	 * created but it might have been consumed.
670 	 */
671 	if (sti->sti_unbind_mp == NULL) {
672 		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
673 		/* NOTE: holding so_lock while sleeping */
674 		sti->sti_unbind_mp =
675 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
676 		    cr);
677 	}
678 
679 	if (flags & _SOBIND_REBIND) {
680 		/*
681 		 * Called from solisten after doing an sotpi_unbind() or
682 		 * potentially without the unbind (latter for AF_INET{,6}).
683 		 */
684 		ASSERT(name == NULL && namelen == 0);
685 
686 		if (so->so_family == AF_UNIX) {
687 			ASSERT(sti->sti_ux_bound_vp);
688 			addr = &sti->sti_ux_laddr;
689 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
690 			dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
691 			    "addr 0x%p, vp %p\n",
692 			    addrlen,
693 			    (void *)((struct so_ux_addr *)addr)->soua_vp,
694 			    (void *)sti->sti_ux_bound_vp));
695 		} else {
696 			addr = sti->sti_laddr_sa;
697 			addrlen = (t_uscalar_t)sti->sti_laddr_len;
698 		}
699 	} else if (flags & _SOBIND_UNSPEC) {
700 		ASSERT(name == NULL && namelen == 0);
701 
702 		/*
703 		 * The caller checked SS_ISBOUND but not necessarily
704 		 * under so_lock
705 		 */
706 		if (so->so_state & SS_ISBOUND) {
707 			/* No error */
708 			goto done;
709 		}
710 
711 		/* Set an initial local address */
712 		switch (so->so_family) {
713 		case AF_UNIX:
714 			/*
715 			 * Use an address with same size as struct sockaddr
716 			 * just like BSD.
717 			 */
718 			sti->sti_laddr_len =
719 			    (socklen_t)sizeof (struct sockaddr);
720 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
721 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
722 			sti->sti_laddr_sa->sa_family = so->so_family;
723 
724 			/*
725 			 * Pass down an address with the implicit bind
726 			 * magic number and the rest all zeros.
727 			 * The transport will return a unique address.
728 			 */
729 			sti->sti_ux_laddr.soua_vp = NULL;
730 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
731 			addr = &sti->sti_ux_laddr;
732 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
733 			break;
734 
735 		case AF_INET:
736 		case AF_INET6:
737 			/*
738 			 * An unspecified bind in TPI has a NULL address.
739 			 * Set the address in sockfs to have the sa_family.
740 			 */
741 			sti->sti_laddr_len = (so->so_family == AF_INET) ?
742 			    (socklen_t)sizeof (sin_t) :
743 			    (socklen_t)sizeof (sin6_t);
744 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
745 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
746 			sti->sti_laddr_sa->sa_family = so->so_family;
747 			addr = NULL;
748 			addrlen = 0;
749 			break;
750 
751 		default:
752 			/*
753 			 * An unspecified bind in TPI has a NULL address.
754 			 * Set the address in sockfs to be zero length.
755 			 *
756 			 * Can not assume there is a sa_family for all
757 			 * protocol families. For example, AF_X25 does not
758 			 * have a family field.
759 			 */
760 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
761 			sti->sti_laddr_len = 0;	/* XXX correct? */
762 			addr = NULL;
763 			addrlen = 0;
764 			break;
765 		}
766 
767 	} else {
768 		if (so->so_state & SS_ISBOUND) {
769 			/*
770 			 * If it is ok to rebind the socket, first unbind
771 			 * with the transport. A rebind to the NULL address
772 			 * is interpreted as an unbind.
773 			 * Note that a bind to NULL in BSD does unbind the
774 			 * socket but it fails with EINVAL.
775 			 * Note that regular sockets set SOV_SOCKBSD i.e.
776 			 * _SOBIND_SOCKBSD gets set here hence no type of
777 			 * socket does currently allow rebinding.
778 			 *
779 			 * If the name is NULL just do an unbind.
780 			 */
781 			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
782 			    name != NULL) {
783 				error = EINVAL;
784 				unbind_on_err = 0;
785 				eprintsoline(so, error);
786 				goto done;
787 			}
788 			if ((so->so_mode & SM_CONNREQUIRED) &&
789 			    (so->so_state & SS_CANTREBIND)) {
790 				error = EINVAL;
791 				unbind_on_err = 0;
792 				eprintsoline(so, error);
793 				goto done;
794 			}
795 			error = sotpi_unbind(so, 0);
796 			if (error) {
797 				eprintsoline(so, error);
798 				goto done;
799 			}
800 			ASSERT(!(so->so_state & SS_ISBOUND));
801 			if (name == NULL) {
802 				so->so_state &=
803 				    ~(SS_ISCONNECTED|SS_ISCONNECTING);
804 				goto done;
805 			}
806 		}
807 
808 		/* X/Open requires this check */
809 		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
810 			if (xnet_check_print) {
811 				printf("sockfs: X/Open bind state check "
812 				    "caused EINVAL\n");
813 			}
814 			error = EINVAL;
815 			goto done;
816 		}
817 
818 		switch (so->so_family) {
819 		case AF_UNIX:
820 			/*
821 			 * All AF_UNIX addresses are nul terminated
822 			 * when copied (copyin_name) in so the minimum
823 			 * length is 3 bytes.
824 			 */
825 			if (name == NULL ||
826 			    (ssize_t)namelen <= sizeof (short) + 1) {
827 				error = EISDIR;
828 				eprintsoline(so, error);
829 				goto done;
830 			}
831 			/*
832 			 * Verify so_family matches the bound family.
833 			 * BSD does not check this for AF_UNIX resulting
834 			 * in funny mknods.
835 			 */
836 			if (name->sa_family != so->so_family) {
837 				error = EAFNOSUPPORT;
838 				goto done;
839 			}
840 			break;
841 		case AF_INET:
842 			if (name == NULL) {
843 				error = EINVAL;
844 				eprintsoline(so, error);
845 				goto done;
846 			}
847 			if ((size_t)namelen != sizeof (sin_t)) {
848 				error = name->sa_family != so->so_family ?
849 				    EAFNOSUPPORT : EINVAL;
850 				eprintsoline(so, error);
851 				goto done;
852 			}
853 			if ((flags & _SOBIND_XPG4_2) &&
854 			    (name->sa_family != so->so_family)) {
855 				/*
856 				 * This check has to be made for X/Open
857 				 * sockets however application failures have
858 				 * been observed when it is applied to
859 				 * all sockets.
860 				 */
861 				error = EAFNOSUPPORT;
862 				eprintsoline(so, error);
863 				goto done;
864 			}
865 			/*
866 			 * Force a zero sa_family to match so_family.
867 			 *
868 			 * Some programs like inetd(1M) don't set the
869 			 * family field. Other programs leave
870 			 * sin_family set to garbage - SunOS 4.X does
871 			 * not check the family field on a bind.
872 			 * We use the family field that
873 			 * was passed in to the socket() call.
874 			 */
875 			name->sa_family = so->so_family;
876 			break;
877 
878 		case AF_INET6: {
879 #ifdef DEBUG
880 			sin6_t *sin6 = (sin6_t *)name;
881 #endif /* DEBUG */
882 
883 			if (name == NULL) {
884 				error = EINVAL;
885 				eprintsoline(so, error);
886 				goto done;
887 			}
888 			if ((size_t)namelen != sizeof (sin6_t)) {
889 				error = name->sa_family != so->so_family ?
890 				    EAFNOSUPPORT : EINVAL;
891 				eprintsoline(so, error);
892 				goto done;
893 			}
894 			if (name->sa_family != so->so_family) {
895 				/*
896 				 * With IPv6 we require the family to match
897 				 * unlike in IPv4.
898 				 */
899 				error = EAFNOSUPPORT;
900 				eprintsoline(so, error);
901 				goto done;
902 			}
903 #ifdef DEBUG
904 			/*
905 			 * Verify that apps don't forget to clear
906 			 * sin6_scope_id etc
907 			 */
908 			if (sin6->sin6_scope_id != 0 &&
909 			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
910 				zcmn_err(getzoneid(), CE_WARN,
911 				    "bind with uninitialized sin6_scope_id "
912 				    "(%d) on socket. Pid = %d\n",
913 				    (int)sin6->sin6_scope_id,
914 				    (int)curproc->p_pid);
915 			}
916 			if (sin6->__sin6_src_id != 0) {
917 				zcmn_err(getzoneid(), CE_WARN,
918 				    "bind with uninitialized __sin6_src_id "
919 				    "(%d) on socket. Pid = %d\n",
920 				    (int)sin6->__sin6_src_id,
921 				    (int)curproc->p_pid);
922 			}
923 #endif /* DEBUG */
924 			break;
925 		}
926 		default:
927 			/*
928 			 * Don't do any length or sa_family check to allow
929 			 * non-sockaddr style addresses.
930 			 */
931 			if (name == NULL) {
932 				error = EINVAL;
933 				eprintsoline(so, error);
934 				goto done;
935 			}
936 			break;
937 		}
938 
939 		if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
940 			error = ENAMETOOLONG;
941 			eprintsoline(so, error);
942 			goto done;
943 		}
944 		/*
945 		 * Save local address.
946 		 */
947 		sti->sti_laddr_len = (socklen_t)namelen;
948 		ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
949 		bcopy(name, sti->sti_laddr_sa, namelen);
950 
951 		addr = sti->sti_laddr_sa;
952 		addrlen = (t_uscalar_t)sti->sti_laddr_len;
953 		switch (so->so_family) {
954 		case AF_INET6:
955 		case AF_INET:
956 			break;
957 		case AF_UNIX: {
958 			struct sockaddr_un *soun =
959 			    (struct sockaddr_un *)sti->sti_laddr_sa;
960 			struct vnode *vp, *rvp;
961 			struct vattr vattr;
962 
963 			ASSERT(sti->sti_ux_bound_vp == NULL);
964 			/*
965 			 * Create vnode for the specified path name.
966 			 * Keep vnode held with a reference in sti_ux_bound_vp.
967 			 * Use the vnode pointer as the address used in the
968 			 * bind with the transport.
969 			 *
970 			 * Use the same mode as in BSD. In particular this does
971 			 * not observe the umask.
972 			 */
973 			/* MAXPATHLEN + soun_family + nul termination */
974 			if (sti->sti_laddr_len >
975 			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
976 				error = ENAMETOOLONG;
977 				eprintsoline(so, error);
978 				goto done;
979 			}
980 			vattr.va_type = VSOCK;
981 			vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
982 			vattr.va_mask = AT_TYPE|AT_MODE;
983 			/* NOTE: holding so_lock */
984 			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
985 			    EXCL, 0, &vp, CRMKNOD, 0, 0);
986 			if (error) {
987 				if (error == EEXIST)
988 					error = EADDRINUSE;
989 				eprintsoline(so, error);
990 				goto done;
991 			}
992 			/*
993 			 * Establish pointer from the underlying filesystem
994 			 * vnode to the socket node.
995 			 * sti_ux_bound_vp and v_stream->sd_vnode form the
996 			 * cross-linkage between the underlying filesystem
997 			 * node and the socket node.
998 			 */
999 
1000 			if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
1001 				VN_HOLD(rvp);
1002 				VN_RELE(vp);
1003 				vp = rvp;
1004 			}
1005 
1006 			ASSERT(SOTOV(so)->v_stream);
1007 			mutex_enter(&vp->v_lock);
1008 			vp->v_stream = SOTOV(so)->v_stream;
1009 			sti->sti_ux_bound_vp = vp;
1010 			mutex_exit(&vp->v_lock);
1011 
1012 			/*
1013 			 * Use the vnode pointer value as a unique address
1014 			 * (together with the magic number to avoid conflicts
1015 			 * with implicit binds) in the transport provider.
1016 			 */
1017 			sti->sti_ux_laddr.soua_vp =
1018 			    (void *)sti->sti_ux_bound_vp;
1019 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
1020 			addr = &sti->sti_ux_laddr;
1021 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1022 			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1023 			    addrlen,
1024 			    (void *)((struct so_ux_addr *)addr)->soua_vp));
1025 			break;
1026 		}
1027 		} /* end switch (so->so_family) */
1028 	}
1029 
1030 	/*
1031 	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1032 	 * the transport can start passing up T_CONN_IND messages
1033 	 * as soon as it receives the bind req and strsock_proto()
1034 	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1035 	 */
1036 	if (flags & _SOBIND_LISTEN) {
1037 		if ((so->so_state & SS_ACCEPTCONN) == 0)
1038 			clear_acceptconn_on_err = B_TRUE;
1039 		save_so_backlog = so->so_backlog;
1040 		restore_backlog_on_err = B_TRUE;
1041 		so->so_state |= SS_ACCEPTCONN;
1042 		so->so_backlog = backlog;
1043 	}
1044 
1045 	/*
1046 	 * If NL7C addr(s) have been configured check for addr/port match,
1047 	 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
1048 	 *
1049 	 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
1050 	 * family sockets only. If match mark as such.
1051 	 */
1052 	if (nl7c_enabled && ((addr != NULL &&
1053 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
1054 	    (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
1055 	    sti->sti_nl7c_flags == NL7C_AF_NCA)) {
1056 		/*
1057 		 * NL7C is not supported in non-global zones,
1058 		 * we enforce this restriction here.
1059 		 */
1060 		if (so->so_zoneid == GLOBAL_ZONEID) {
1061 			/* An NL7C socket, mark it */
1062 			sti->sti_nl7c_flags |= NL7C_ENABLED;
1063 			if (nl7c == NULL) {
1064 				/*
1065 				 * Was an AF_NCA bind() so add it to the
1066 				 * addr list for reporting purposes.
1067 				 */
1068 				nl7c = nl7c_add_addr(addr, addrlen);
1069 			}
1070 		} else
1071 			nl7c = NULL;
1072 	}
1073 
1074 	/*
1075 	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1076 	 * for other transports we will send in a O_T_BIND_REQ.
1077 	 */
1078 	if (tcp_udp_xport &&
1079 	    (so->so_family == AF_INET || so->so_family == AF_INET6))
1080 		PRIM_type = T_BIND_REQ;
1081 
1082 	bind_req.PRIM_type = PRIM_type;
1083 	bind_req.ADDR_length = addrlen;
1084 	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1085 	bind_req.CONIND_number = backlog;
1086 	/* NOTE: holding so_lock while sleeping */
1087 	mp = soallocproto2(&bind_req, sizeof (bind_req),
1088 	    addr, addrlen, 0, _ALLOC_SLEEP, cr);
1089 	sti->sti_laddr_valid = 0;
1090 
1091 	/* Done using sti_laddr_sa - can drop the lock */
1092 	mutex_exit(&so->so_lock);
1093 
1094 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1095 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1096 	if (error) {
1097 		eprintsoline(so, error);
1098 		mutex_enter(&so->so_lock);
1099 		goto done;
1100 	}
1101 
1102 	mutex_enter(&so->so_lock);
1103 	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1104 	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1105 	if (error) {
1106 		eprintsoline(so, error);
1107 		goto done;
1108 	}
1109 	ASSERT(mp);
1110 	/*
1111 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1112 	 * strsock_proto while the lock was dropped above, the bind
1113 	 * is allowed to complete.
1114 	 */
1115 
1116 	/* Mark as bound. This will be undone if we detect errors below. */
1117 	if (flags & _SOBIND_NOXLATE) {
1118 		ASSERT(so->so_family == AF_UNIX);
1119 		sti->sti_faddr_noxlate = 1;
1120 	}
1121 	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1122 	so->so_state |= SS_ISBOUND;
1123 	ASSERT(sti->sti_unbind_mp);
1124 
1125 	/* note that we've already set SS_ACCEPTCONN above */
1126 
1127 	/*
1128 	 * Recompute addrlen - an unspecified bind sent down an
1129 	 * address of length zero but we expect the appropriate length
1130 	 * in return.
1131 	 */
1132 	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1133 	    sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1134 
1135 	bind_ack = (struct T_bind_ack *)mp->b_rptr;
1136 	/*
1137 	 * The alignment restriction is really too strict but
1138 	 * we want enough alignment to inspect the fields of
1139 	 * a sockaddr_in.
1140 	 */
1141 	addr = sogetoff(mp, bind_ack->ADDR_offset,
1142 	    bind_ack->ADDR_length,
1143 	    __TPI_ALIGN_SIZE);
1144 	if (addr == NULL) {
1145 		freemsg(mp);
1146 		error = EPROTO;
1147 		eprintsoline(so, error);
1148 		goto done;
1149 	}
1150 	if (!(flags & _SOBIND_UNSPEC)) {
1151 		/*
1152 		 * Verify that the transport didn't return something we
1153 		 * did not want e.g. an address other than what we asked for.
1154 		 *
1155 		 * NOTE: These checks would go away if/when we switch to
1156 		 * using the new TPI (in which the transport would fail
1157 		 * the request instead of assigning a different address).
1158 		 *
1159 		 * NOTE2: For protocols that we don't know (i.e. any
1160 		 * other than AF_INET6, AF_INET and AF_UNIX), we
1161 		 * cannot know if the transport should be expected to
1162 		 * return the same address as that requested.
1163 		 *
1164 		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1165 		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1166 		 *
1167 		 * For example, in the case of netatalk it may be
1168 		 * inappropriate for the transport to return the
1169 		 * requested address (as it may have allocated a local
1170 		 * port number in behaviour similar to that of an
1171 		 * AF_INET bind request with a port number of zero).
1172 		 *
1173 		 * Given the definition of O_T_BIND_REQ, where the
1174 		 * transport may bind to an address other than the
1175 		 * requested address, it's not possible to determine
1176 		 * whether a returned address that differs from the
1177 		 * requested address is a reason to fail (because the
1178 		 * requested address was not available) or succeed
1179 		 * (because the transport allocated an appropriate
1180 		 * address and/or port).
1181 		 *
1182 		 * sockfs currently requires that the transport return
1183 		 * the requested address in the T_BIND_ACK, unless
1184 		 * there is code here to allow for any discrepancy.
1185 		 * Such code exists for AF_INET and AF_INET6.
1186 		 *
1187 		 * Netatalk chooses to return the requested address
1188 		 * rather than the (correct) allocated address.  This
1189 		 * means that netatalk violates the TPI specification
1190 		 * (and would not function correctly if used from a
1191 		 * TLI application), but it does mean that it works
1192 		 * with sockfs.
1193 		 *
1194 		 * As noted above, using the newer XTI bind primitive
1195 		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1196 		 * allow sockfs to be more sure about whether or not
1197 		 * the bind request had succeeded (as transports are
1198 		 * not permitted to bind to a different address than
1199 		 * that requested - they must return failure).
1200 		 * Unfortunately, support for T_BIND_REQ may not be
1201 		 * present in all transport implementations (netatalk,
1202 		 * for example, doesn't have it), making the
1203 		 * transition difficult.
1204 		 */
1205 		if (bind_ack->ADDR_length != addrlen) {
1206 			/* Assumes that the requested address was in use */
1207 			freemsg(mp);
1208 			error = EADDRINUSE;
1209 			eprintsoline(so, error);
1210 			goto done;
1211 		}
1212 
1213 		switch (so->so_family) {
1214 		case AF_INET6:
1215 		case AF_INET: {
1216 			sin_t *rname, *aname;
1217 
1218 			rname = (sin_t *)addr;
1219 			aname = (sin_t *)sti->sti_laddr_sa;
1220 
1221 			/*
1222 			 * Take advantage of the alignment
1223 			 * of sin_port and sin6_port which fall
1224 			 * in the same place in their data structures.
1225 			 * Just use sin_port for either address family.
1226 			 *
1227 			 * This may become a problem if (heaven forbid)
1228 			 * there's a separate ipv6port_reserved... :-P
1229 			 *
1230 			 * Binding to port 0 has the semantics of letting
1231 			 * the transport bind to any port.
1232 			 *
1233 			 * If the transport is TCP or UDP since we had sent
1234 			 * a T_BIND_REQ we would not get a port other than
1235 			 * what we asked for.
1236 			 */
1237 			if (tcp_udp_xport) {
1238 				/*
1239 				 * Pick up the new port number if we bound to
1240 				 * port 0.
1241 				 */
1242 				if (aname->sin_port == 0)
1243 					aname->sin_port = rname->sin_port;
1244 				sti->sti_laddr_valid = 1;
1245 				break;
1246 			}
1247 			if (aname->sin_port != 0 &&
1248 			    aname->sin_port != rname->sin_port) {
1249 				freemsg(mp);
1250 				error = EADDRINUSE;
1251 				eprintsoline(so, error);
1252 				goto done;
1253 			}
1254 			/*
1255 			 * Pick up the new port number if we bound to port 0.
1256 			 */
1257 			aname->sin_port = rname->sin_port;
1258 
1259 			/*
1260 			 * Unfortunately, addresses aren't _quite_ the same.
1261 			 */
1262 			if (so->so_family == AF_INET) {
1263 				if (aname->sin_addr.s_addr !=
1264 				    rname->sin_addr.s_addr) {
1265 					freemsg(mp);
1266 					error = EADDRNOTAVAIL;
1267 					eprintsoline(so, error);
1268 					goto done;
1269 				}
1270 			} else {
1271 				sin6_t *rname6 = (sin6_t *)rname;
1272 				sin6_t *aname6 = (sin6_t *)aname;
1273 
1274 				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1275 				    &rname6->sin6_addr)) {
1276 					freemsg(mp);
1277 					error = EADDRNOTAVAIL;
1278 					eprintsoline(so, error);
1279 					goto done;
1280 				}
1281 			}
1282 			break;
1283 		}
1284 		case AF_UNIX:
1285 			if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1286 				freemsg(mp);
1287 				error = EADDRINUSE;
1288 				eprintsoline(so, error);
1289 				eprintso(so,
1290 				    ("addrlen %d, addr 0x%x, vp %p\n",
1291 				    addrlen, *((int *)addr),
1292 				    (void *)sti->sti_ux_bound_vp));
1293 				goto done;
1294 			}
1295 			sti->sti_laddr_valid = 1;
1296 			break;
1297 		default:
1298 			/*
1299 			 * NOTE: This assumes that addresses can be
1300 			 * byte-compared for equivalence.
1301 			 */
1302 			if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1303 				freemsg(mp);
1304 				error = EADDRINUSE;
1305 				eprintsoline(so, error);
1306 				goto done;
1307 			}
1308 			/*
1309 			 * Don't mark sti_laddr_valid, as we cannot be
1310 			 * sure that the returned address is the real
1311 			 * bound address when talking to an unknown
1312 			 * transport.
1313 			 */
1314 			break;
1315 		}
1316 	} else {
1317 		/*
1318 		 * Save the returned address for getsockname.
1319 		 * Needed for unspecific bind unless transport supports
1320 		 * the TI_GETMYNAME ioctl.
1321 		 * Do this for AF_INET{,6} even though they do, as
1322 		 * caching info here is much better performance than
1323 		 * a TPI/STREAMS trip to the transport for getsockname.
1324 		 * Any which can't for some reason _must_ _not_ set
1325 		 * sti_laddr_valid here for the caching version of
1326 		 * getsockname to not break;
1327 		 */
1328 		switch (so->so_family) {
1329 		case AF_UNIX:
1330 			/*
1331 			 * Record the address bound with the transport
1332 			 * for use by socketpair.
1333 			 */
1334 			bcopy(addr, &sti->sti_ux_laddr, addrlen);
1335 			sti->sti_laddr_valid = 1;
1336 			break;
1337 		case AF_INET:
1338 		case AF_INET6:
1339 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1340 			bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1341 			sti->sti_laddr_valid = 1;
1342 			break;
1343 		default:
1344 			/*
1345 			 * Don't mark sti_laddr_valid, as we cannot be
1346 			 * sure that the returned address is the real
1347 			 * bound address when talking to an unknown
1348 			 * transport.
1349 			 */
1350 			break;
1351 		}
1352 	}
1353 
1354 	if (nl7c != NULL) {
1355 		/* Register listen()er sonode pointer with NL7C */
1356 		nl7c_listener_addr(nl7c, so);
1357 	}
1358 
1359 	freemsg(mp);
1360 
1361 done:
1362 	if (error) {
1363 		/* reset state & backlog to values held on entry */
1364 		if (clear_acceptconn_on_err == B_TRUE)
1365 			so->so_state &= ~SS_ACCEPTCONN;
1366 		if (restore_backlog_on_err == B_TRUE)
1367 			so->so_backlog = save_so_backlog;
1368 
1369 		if (unbind_on_err && so->so_state & SS_ISBOUND) {
1370 			int err;
1371 
1372 			err = sotpi_unbind(so, 0);
1373 			/* LINTED - statement has no consequent: if */
1374 			if (err) {
1375 				eprintsoline(so, error);
1376 			} else {
1377 				ASSERT(!(so->so_state & SS_ISBOUND));
1378 			}
1379 		}
1380 	}
1381 	if (!(flags & _SOBIND_LOCK_HELD)) {
1382 		so_unlock_single(so, SOLOCKED);
1383 		mutex_exit(&so->so_lock);
1384 	} else {
1385 		ASSERT(MUTEX_HELD(&so->so_lock));
1386 		ASSERT(so->so_flag & SOLOCKED);
1387 	}
1388 	return (error);
1389 }
1390 
1391 /* bind the socket */
1392 static int
1393 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1394     int flags, struct cred *cr)
1395 {
1396 	if ((flags & _SOBIND_SOCKETPAIR) == 0)
1397 		return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1398 
1399 	flags &= ~_SOBIND_SOCKETPAIR;
1400 	return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1401 }
1402 
1403 /*
1404  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1405  * address, or when listen needs to unbind and bind.
1406  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1407  * so that a sobind can pick them up.
1408  */
1409 static int
1410 sotpi_unbind(struct sonode *so, int flags)
1411 {
1412 	struct T_unbind_req	unbind_req;
1413 	int			error = 0;
1414 	mblk_t			*mp;
1415 	sotpi_info_t		*sti = SOTOTPI(so);
1416 
1417 	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1418 	    (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1419 
1420 	ASSERT(MUTEX_HELD(&so->so_lock));
1421 	ASSERT(so->so_flag & SOLOCKED);
1422 
1423 	if (!(so->so_state & SS_ISBOUND)) {
1424 		error = EINVAL;
1425 		eprintsoline(so, error);
1426 		goto done;
1427 	}
1428 
1429 	mutex_exit(&so->so_lock);
1430 
1431 	/*
1432 	 * Flush the read and write side (except stream head read queue)
1433 	 * and send down T_UNBIND_REQ.
1434 	 */
1435 	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1436 
1437 	unbind_req.PRIM_type = T_UNBIND_REQ;
1438 	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1439 	    0, _ALLOC_SLEEP, CRED());
1440 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1441 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1442 	mutex_enter(&so->so_lock);
1443 	if (error) {
1444 		eprintsoline(so, error);
1445 		goto done;
1446 	}
1447 
1448 	error = sowaitokack(so, T_UNBIND_REQ);
1449 	if (error) {
1450 		eprintsoline(so, error);
1451 		goto done;
1452 	}
1453 
1454 	/*
1455 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1456 	 * strsock_proto while the lock was dropped above, the unbind
1457 	 * is allowed to complete.
1458 	 */
1459 	if (!(flags & _SOUNBIND_REBIND)) {
1460 		/*
1461 		 * Clear out bound address.
1462 		 */
1463 		vnode_t *vp;
1464 
1465 		if ((vp = sti->sti_ux_bound_vp) != NULL) {
1466 			sti->sti_ux_bound_vp = NULL;
1467 			vn_rele_stream(vp);
1468 		}
1469 		/* Clear out address */
1470 		sti->sti_laddr_len = 0;
1471 	}
1472 	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1473 	sti->sti_laddr_valid = 0;
1474 
1475 done:
1476 
1477 	/* If the caller held the lock don't release it here */
1478 	ASSERT(MUTEX_HELD(&so->so_lock));
1479 	ASSERT(so->so_flag & SOLOCKED);
1480 
1481 	return (error);
1482 }
1483 
1484 /*
1485  * listen on the socket.
1486  * For TPI conforming transports this has to first unbind with the transport
1487  * and then bind again using the new backlog.
1488  */
1489 /* ARGSUSED */
1490 int
1491 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1492 {
1493 	int		error = 0;
1494 	sotpi_info_t	*sti = SOTOTPI(so);
1495 
1496 	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1497 	    (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1498 
1499 	if (sti->sti_serv_type == T_CLTS)
1500 		return (EOPNOTSUPP);
1501 
1502 	/*
1503 	 * If the socket is ready to accept connections already, then
1504 	 * return without doing anything.  This avoids a problem where
1505 	 * a second listen() call fails if a connection is pending and
1506 	 * leaves the socket unbound. Only when we are not unbinding
1507 	 * with the transport can we safely increase the backlog.
1508 	 */
1509 	if (so->so_state & SS_ACCEPTCONN &&
1510 	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1511 	    /*CONSTCOND*/
1512 	    !solisten_tpi_tcp))
1513 		return (0);
1514 
1515 	if (so->so_state & SS_ISCONNECTED)
1516 		return (EINVAL);
1517 
1518 	mutex_enter(&so->so_lock);
1519 	so_lock_single(so);	/* Set SOLOCKED */
1520 
1521 	/*
1522 	 * If the listen doesn't change the backlog we do nothing.
1523 	 * This avoids an EPROTO error from the transport.
1524 	 */
1525 	if ((so->so_state & SS_ACCEPTCONN) &&
1526 	    so->so_backlog == backlog)
1527 		goto done;
1528 
1529 	if (!(so->so_state & SS_ISBOUND)) {
1530 		/*
1531 		 * Must have been explicitly bound in the UNIX domain.
1532 		 */
1533 		if (so->so_family == AF_UNIX) {
1534 			error = EINVAL;
1535 			goto done;
1536 		}
1537 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1538 		    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1539 	} else if (backlog > 0) {
1540 		/*
1541 		 * AF_INET{,6} hack to avoid losing the port.
1542 		 * Assumes that all AF_INET{,6} transports can handle a
1543 		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1544 		 * has already bound thus it is possible to avoid the unbind.
1545 		 */
1546 		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1547 		    /*CONSTCOND*/
1548 		    !solisten_tpi_tcp)) {
1549 			error = sotpi_unbind(so, _SOUNBIND_REBIND);
1550 			if (error)
1551 				goto done;
1552 		}
1553 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1554 		    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1555 	} else {
1556 		so->so_state |= SS_ACCEPTCONN;
1557 		so->so_backlog = backlog;
1558 	}
1559 	if (error)
1560 		goto done;
1561 	ASSERT(so->so_state & SS_ACCEPTCONN);
1562 done:
1563 	so_unlock_single(so, SOLOCKED);
1564 	mutex_exit(&so->so_lock);
1565 	return (error);
1566 }
1567 
1568 /*
1569  * Disconnect either a specified seqno or all (-1).
1570  * The former is used on listening sockets only.
1571  *
1572  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1573  * the current use of sodisconnect(seqno == -1) is only for shutdown
1574  * so there is no point (and potentially incorrect) to unbind.
1575  */
1576 static int
1577 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1578 {
1579 	struct T_discon_req	discon_req;
1580 	int			error = 0;
1581 	mblk_t			*mp;
1582 
1583 	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1584 	    (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1585 
1586 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1587 		mutex_enter(&so->so_lock);
1588 		so_lock_single(so);	/* Set SOLOCKED */
1589 	} else {
1590 		ASSERT(MUTEX_HELD(&so->so_lock));
1591 		ASSERT(so->so_flag & SOLOCKED);
1592 	}
1593 
1594 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1595 		error = EINVAL;
1596 		eprintsoline(so, error);
1597 		goto done;
1598 	}
1599 
1600 	mutex_exit(&so->so_lock);
1601 	/*
1602 	 * Flush the write side (unless this is a listener)
1603 	 * and then send down a T_DISCON_REQ.
1604 	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1605 	 * and other messages.)
1606 	 */
1607 	if (!(so->so_state & SS_ACCEPTCONN))
1608 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1609 
1610 	discon_req.PRIM_type = T_DISCON_REQ;
1611 	discon_req.SEQ_number = seqno;
1612 	mp = soallocproto1(&discon_req, sizeof (discon_req),
1613 	    0, _ALLOC_SLEEP, CRED());
1614 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1615 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1616 	mutex_enter(&so->so_lock);
1617 	if (error) {
1618 		eprintsoline(so, error);
1619 		goto done;
1620 	}
1621 
1622 	error = sowaitokack(so, T_DISCON_REQ);
1623 	if (error) {
1624 		eprintsoline(so, error);
1625 		goto done;
1626 	}
1627 	/*
1628 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1629 	 * strsock_proto while the lock was dropped above, the disconnect
1630 	 * is allowed to complete. However, it is not possible to
1631 	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1632 	 */
1633 	so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
1634 	SOTOTPI(so)->sti_laddr_valid = 0;
1635 	SOTOTPI(so)->sti_faddr_valid = 0;
1636 done:
1637 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1638 		so_unlock_single(so, SOLOCKED);
1639 		mutex_exit(&so->so_lock);
1640 	} else {
1641 		/* If the caller held the lock don't release it here */
1642 		ASSERT(MUTEX_HELD(&so->so_lock));
1643 		ASSERT(so->so_flag & SOLOCKED);
1644 	}
1645 	return (error);
1646 }
1647 
1648 /* ARGSUSED */
1649 int
1650 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1651     struct sonode **nsop)
1652 {
1653 	struct T_conn_ind	*conn_ind;
1654 	struct T_conn_res	*conn_res;
1655 	int			error = 0;
1656 	mblk_t			*mp, *ack_mp;
1657 	struct sonode		*nso;
1658 	vnode_t			*nvp;
1659 	void			*src;
1660 	t_uscalar_t		srclen;
1661 	void			*opt;
1662 	t_uscalar_t		optlen;
1663 	t_scalar_t		PRIM_type;
1664 	t_scalar_t		SEQ_number;
1665 	size_t			sinlen;
1666 	sotpi_info_t		*sti = SOTOTPI(so);
1667 	sotpi_info_t		*nsti;
1668 
1669 	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1670 	    (void *)so, fflag, (void *)nsop,
1671 	    pr_state(so->so_state, so->so_mode)));
1672 
1673 	/*
1674 	 * Defer single-threading the accepting socket until
1675 	 * the T_CONN_IND has been received and parsed and the
1676 	 * new sonode has been opened.
1677 	 */
1678 
1679 	/* Check that we are not already connected */
1680 	if ((so->so_state & SS_ACCEPTCONN) == 0)
1681 		goto conn_bad;
1682 again:
1683 	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1684 		goto e_bad;
1685 
1686 	ASSERT(mp != NULL);
1687 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1688 
1689 	/*
1690 	 * Save SEQ_number for error paths.
1691 	 */
1692 	SEQ_number = conn_ind->SEQ_number;
1693 
1694 	srclen = conn_ind->SRC_length;
1695 	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1696 	if (src == NULL) {
1697 		error = EPROTO;
1698 		freemsg(mp);
1699 		eprintsoline(so, error);
1700 		goto disconnect_unlocked;
1701 	}
1702 	optlen = conn_ind->OPT_length;
1703 	switch (so->so_family) {
1704 	case AF_INET:
1705 	case AF_INET6:
1706 		if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1707 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
1708 			    &opt, conn_ind->OPT_length);
1709 		} else {
1710 			/*
1711 			 * The transport (in this case TCP) hasn't sent up
1712 			 * a pointer to an instance for the accept fast-path.
1713 			 * Disable fast-path completely because the call to
1714 			 * sotpi_create() below would otherwise create an
1715 			 * incomplete TCP instance, which would lead to
1716 			 * problems when sockfs sends a normal T_CONN_RES
1717 			 * message down the new stream.
1718 			 */
1719 			if (sti->sti_direct) {
1720 				int rval;
1721 				/*
1722 				 * For consistency we inform tcp to disable
1723 				 * direct interface on the listener, though
1724 				 * we can certainly live without doing this
1725 				 * because no data will ever travel upstream
1726 				 * on the listening socket.
1727 				 */
1728 				sti->sti_direct = 0;
1729 				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1730 				    0, 0, K_TO_K, cr, &rval);
1731 			}
1732 			opt = NULL;
1733 			optlen = 0;
1734 		}
1735 		break;
1736 	case AF_UNIX:
1737 	default:
1738 		if (optlen != 0) {
1739 			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1740 			    __TPI_ALIGN_SIZE);
1741 			if (opt == NULL) {
1742 				error = EPROTO;
1743 				freemsg(mp);
1744 				eprintsoline(so, error);
1745 				goto disconnect_unlocked;
1746 			}
1747 		}
1748 		if (so->so_family == AF_UNIX) {
1749 			if (!sti->sti_faddr_noxlate) {
1750 				src = NULL;
1751 				srclen = 0;
1752 			}
1753 			/* Extract src address from options */
1754 			if (optlen != 0)
1755 				so_getopt_srcaddr(opt, optlen, &src, &srclen);
1756 		}
1757 		break;
1758 	}
1759 
1760 	/*
1761 	 * Create the new socket.
1762 	 */
1763 	nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1764 	if (nso == NULL) {
1765 		ASSERT(error != 0);
1766 		/*
1767 		 * Accept can not fail with ENOBUFS. sotpi_create
1768 		 * sleeps waiting for memory until a signal is caught
1769 		 * so return EINTR.
1770 		 */
1771 		freemsg(mp);
1772 		if (error == ENOBUFS)
1773 			error = EINTR;
1774 		goto e_disc_unl;
1775 	}
1776 	nvp = SOTOV(nso);
1777 	nsti = SOTOTPI(nso);
1778 
1779 #ifdef DEBUG
1780 	/*
1781 	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1782 	 * it's inherited early to allow debugging of the accept code itself.
1783 	 */
1784 	nso->so_options |= so->so_options & SO_DEBUG;
1785 #endif /* DEBUG */
1786 
1787 	/*
1788 	 * Save the SRC address from the T_CONN_IND
1789 	 * for getpeername to work on AF_UNIX and on transports that do not
1790 	 * support TI_GETPEERNAME.
1791 	 *
1792 	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1793 	 * copyin_name().
1794 	 */
1795 	if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1796 		error = EINVAL;
1797 		freemsg(mp);
1798 		eprintsoline(so, error);
1799 		goto disconnect_vp_unlocked;
1800 	}
1801 	nsti->sti_faddr_len = (socklen_t)srclen;
1802 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1803 	bcopy(src, nsti->sti_faddr_sa, srclen);
1804 	nsti->sti_faddr_valid = 1;
1805 
1806 	/*
1807 	 * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1808 	 */
1809 	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1810 	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1811 		cred_t	*cr;
1812 		pid_t	cpid;
1813 
1814 		cr = msg_getcred(mp, &cpid);
1815 		if (cr != NULL) {
1816 			crhold(cr);
1817 			nso->so_peercred = cr;
1818 			nso->so_cpid = cpid;
1819 		}
1820 		freemsg(mp);
1821 
1822 		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1823 		    sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1824 		if (mp == NULL) {
1825 			/*
1826 			 * Accept can not fail with ENOBUFS.
1827 			 * A signal was caught so return EINTR.
1828 			 */
1829 			error = EINTR;
1830 			eprintsoline(so, error);
1831 			goto disconnect_vp_unlocked;
1832 		}
1833 		conn_res = (struct T_conn_res *)mp->b_rptr;
1834 	} else {
1835 		/*
1836 		 * For efficiency reasons we use msg_extractcred; no crhold
1837 		 * needed since db_credp is cleared (i.e., we move the cred
1838 		 * from the message to so_peercred).
1839 		 */
1840 		nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1841 
1842 		mp->b_rptr = DB_BASE(mp);
1843 		conn_res = (struct T_conn_res *)mp->b_rptr;
1844 		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1845 
1846 		mblk_setcred(mp, cr, curproc->p_pid);
1847 	}
1848 
1849 	/*
1850 	 * New socket must be bound at least in sockfs and, except for AF_INET,
1851 	 * (or AF_INET6) it also has to be bound in the transport provider.
1852 	 * We set the local address in the sonode from the T_OK_ACK of the
1853 	 * T_CONN_RES. For this reason the address we bind to here isn't
1854 	 * important.
1855 	 */
1856 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1857 	    /*CONSTCOND*/
1858 	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1859 		/*
1860 		 * Optimization for AF_INET{,6} transports
1861 		 * that can handle a T_CONN_RES without being bound.
1862 		 */
1863 		mutex_enter(&nso->so_lock);
1864 		so_automatic_bind(nso);
1865 		mutex_exit(&nso->so_lock);
1866 	} else {
1867 		/* Perform NULL bind with the transport provider. */
1868 		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1869 		    cr)) != 0) {
1870 			ASSERT(error != ENOBUFS);
1871 			freemsg(mp);
1872 			eprintsoline(nso, error);
1873 			goto disconnect_vp_unlocked;
1874 		}
1875 	}
1876 
1877 	/*
1878 	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1879 	 * so that any data arriving on the new socket will cause the
1880 	 * appropriate signals to be delivered for the new socket.
1881 	 *
1882 	 * No other thread (except strsock_proto and strsock_misc)
1883 	 * can access the new socket thus we relax the locking.
1884 	 */
1885 	nso->so_pgrp = so->so_pgrp;
1886 	nso->so_state |= so->so_state & SS_ASYNC;
1887 	nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1888 
1889 	if (nso->so_pgrp != 0) {
1890 		if ((error = so_set_events(nso, nvp, cr)) != 0) {
1891 			eprintsoline(nso, error);
1892 			error = 0;
1893 			nso->so_pgrp = 0;
1894 		}
1895 	}
1896 
1897 	/*
1898 	 * Make note of the socket level options. TCP and IP level options
1899 	 * are already inherited. We could do all this after accept is
1900 	 * successful but doing it here simplifies code and no harm done
1901 	 * for error case.
1902 	 */
1903 	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1904 	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1905 	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1906 	nso->so_sndbuf = so->so_sndbuf;
1907 	nso->so_rcvbuf = so->so_rcvbuf;
1908 	if (nso->so_options & SO_LINGER)
1909 		nso->so_linger = so->so_linger;
1910 
1911 	/*
1912 	 * Note that the following sti_direct code path should be
1913 	 * removed once we are confident that the direct sockets
1914 	 * do not result in any degradation.
1915 	 */
1916 	if (sti->sti_direct) {
1917 
1918 		ASSERT(opt != NULL);
1919 
1920 		conn_res->OPT_length = optlen;
1921 		conn_res->OPT_offset = MBLKL(mp);
1922 		bcopy(&opt, mp->b_wptr, optlen);
1923 		mp->b_wptr += optlen;
1924 		conn_res->PRIM_type = T_CONN_RES;
1925 		conn_res->ACCEPTOR_id = 0;
1926 		PRIM_type = T_CONN_RES;
1927 
1928 		/* Send down the T_CONN_RES on acceptor STREAM */
1929 		error = kstrputmsg(SOTOV(nso), mp, NULL,
1930 		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1931 		if (error) {
1932 			mutex_enter(&so->so_lock);
1933 			so_lock_single(so);
1934 			eprintsoline(so, error);
1935 			goto disconnect_vp;
1936 		}
1937 		mutex_enter(&nso->so_lock);
1938 		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1939 		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1940 		if (error) {
1941 			mutex_exit(&nso->so_lock);
1942 			mutex_enter(&so->so_lock);
1943 			so_lock_single(so);
1944 			eprintsoline(so, error);
1945 			goto disconnect_vp;
1946 		}
1947 		if (nso->so_family == AF_INET) {
1948 			sin_t *sin;
1949 
1950 			sin = (sin_t *)(ack_mp->b_rptr +
1951 			    sizeof (struct T_ok_ack));
1952 			bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
1953 			nsti->sti_laddr_len = sizeof (sin_t);
1954 		} else {
1955 			sin6_t *sin6;
1956 
1957 			sin6 = (sin6_t *)(ack_mp->b_rptr +
1958 			    sizeof (struct T_ok_ack));
1959 			bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
1960 			nsti->sti_laddr_len = sizeof (sin6_t);
1961 		}
1962 		freemsg(ack_mp);
1963 
1964 		nso->so_state |= SS_ISCONNECTED;
1965 		nso->so_proto_handle = (sock_lower_handle_t)opt;
1966 		nsti->sti_laddr_valid = 1;
1967 
1968 		if (sti->sti_nl7c_flags & NL7C_ENABLED) {
1969 			/*
1970 			 * A NL7C marked listen()er so the new socket
1971 			 * inherits the listen()er's NL7C state, except
1972 			 * for NL7C_POLLIN.
1973 			 *
1974 			 * Only call NL7C to process the new socket if
1975 			 * the listen socket allows blocking i/o.
1976 			 */
1977 			nsti->sti_nl7c_flags =
1978 			    sti->sti_nl7c_flags & (~NL7C_POLLIN);
1979 			if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
1980 				/*
1981 				 * Nonblocking accept() just make it
1982 				 * persist to defer processing to the
1983 				 * read-side syscall (e.g. read).
1984 				 */
1985 				nsti->sti_nl7c_flags |= NL7C_SOPERSIST;
1986 			} else if (nl7c_process(nso, B_FALSE)) {
1987 				/*
1988 				 * NL7C has completed processing on the
1989 				 * socket, close the socket and back to
1990 				 * the top to await the next T_CONN_IND.
1991 				 */
1992 				mutex_exit(&nso->so_lock);
1993 				(void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
1994 				    cr, NULL);
1995 				VN_RELE(nvp);
1996 				goto again;
1997 			}
1998 			/* Pass the new socket out */
1999 		}
2000 
2001 		mutex_exit(&nso->so_lock);
2002 
2003 		/*
2004 		 * It's possible, through the use of autopush for example,
2005 		 * that the acceptor stream may not support sti_direct
2006 		 * semantics. If the new socket does not support sti_direct
2007 		 * we issue a _SIOCSOCKFALLBACK to inform the transport
2008 		 * as we would in the I_PUSH case.
2009 		 */
2010 		if (nsti->sti_direct == 0) {
2011 			int	rval;
2012 
2013 			if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
2014 			    0, 0, K_TO_K, cr, &rval)) != 0) {
2015 				mutex_enter(&so->so_lock);
2016 				so_lock_single(so);
2017 				eprintsoline(so, error);
2018 				goto disconnect_vp;
2019 			}
2020 		}
2021 
2022 		/*
2023 		 * Pass out new socket.
2024 		 */
2025 		if (nsop != NULL)
2026 			*nsop = nso;
2027 
2028 		return (0);
2029 	}
2030 
2031 	/*
2032 	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
2033 	 * which don't support the FireEngine accept fast-path. It is also
2034 	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
2035 	 * again. Neither sockfs nor TCP attempt to find out if some other
2036 	 * random module has been inserted in between (in which case we
2037 	 * should follow TLI accept behaviour). We blindly assume the worst
2038 	 * case and revert back to old behaviour i.e. TCP will not send us
2039 	 * any option (eager) and the accept should happen on the listener
2040 	 * queue. Any queued T_conn_ind have already got their options removed
2041 	 * by so_sock2_stream() when "sockmod" was I_POP'd.
2042 	 */
2043 	/*
2044 	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
2045 	 */
2046 	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
2047 #ifdef	_ILP32
2048 		queue_t	*q;
2049 
2050 		/*
2051 		 * Find read queue in driver
2052 		 * Can safely do this since we "own" nso/nvp.
2053 		 */
2054 		q = strvp2wq(nvp)->q_next;
2055 		while (SAMESTR(q))
2056 			q = q->q_next;
2057 		q = RD(q);
2058 		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
2059 #else
2060 		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
2061 #endif	/* _ILP32 */
2062 		conn_res->PRIM_type = O_T_CONN_RES;
2063 		PRIM_type = O_T_CONN_RES;
2064 	} else {
2065 		conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
2066 		conn_res->PRIM_type = T_CONN_RES;
2067 		PRIM_type = T_CONN_RES;
2068 	}
2069 	conn_res->SEQ_number = SEQ_number;
2070 	conn_res->OPT_length = 0;
2071 	conn_res->OPT_offset = 0;
2072 
2073 	mutex_enter(&so->so_lock);
2074 	so_lock_single(so);	/* Set SOLOCKED */
2075 	mutex_exit(&so->so_lock);
2076 
2077 	error = kstrputmsg(SOTOV(so), mp, NULL,
2078 	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2079 	mutex_enter(&so->so_lock);
2080 	if (error) {
2081 		eprintsoline(so, error);
2082 		goto disconnect_vp;
2083 	}
2084 	error = sowaitprim(so, PRIM_type, T_OK_ACK,
2085 	    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2086 	if (error) {
2087 		eprintsoline(so, error);
2088 		goto disconnect_vp;
2089 	}
2090 	mutex_exit(&so->so_lock);
2091 	/*
2092 	 * If there is a sin/sin6 appended onto the T_OK_ACK use
2093 	 * that to set the local address. If this is not present
2094 	 * then we zero out the address and don't set the
2095 	 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
2096 	 * the pathname from the listening socket.
2097 	 * In the case where this is TCP or an AF_UNIX socket the
2098 	 * client side may have queued data or a T_ORDREL in the
2099 	 * transport. Having now sent the T_CONN_RES we may receive
2100 	 * those queued messages at any time. Hold the acceptor
2101 	 * so_lock until its state and laddr are finalized.
2102 	 */
2103 	mutex_enter(&nso->so_lock);
2104 	sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
2105 	if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
2106 	    MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2107 		ack_mp->b_rptr += sizeof (struct T_ok_ack);
2108 		bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2109 		nsti->sti_laddr_len = sinlen;
2110 		nsti->sti_laddr_valid = 1;
2111 	} else if (nso->so_family == AF_UNIX) {
2112 		ASSERT(so->so_family == AF_UNIX);
2113 		nsti->sti_laddr_len = sti->sti_laddr_len;
2114 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2115 		bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2116 		    nsti->sti_laddr_len);
2117 		nsti->sti_laddr_valid = 1;
2118 	} else {
2119 		nsti->sti_laddr_len = sti->sti_laddr_len;
2120 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2121 		bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2122 		nsti->sti_laddr_sa->sa_family = nso->so_family;
2123 	}
2124 	nso->so_state |= SS_ISCONNECTED;
2125 	mutex_exit(&nso->so_lock);
2126 
2127 	freemsg(ack_mp);
2128 
2129 	mutex_enter(&so->so_lock);
2130 	so_unlock_single(so, SOLOCKED);
2131 	mutex_exit(&so->so_lock);
2132 
2133 	/*
2134 	 * Pass out new socket.
2135 	 */
2136 	if (nsop != NULL)
2137 		*nsop = nso;
2138 
2139 	return (0);
2140 
2141 
2142 eproto_disc_unl:
2143 	error = EPROTO;
2144 e_disc_unl:
2145 	eprintsoline(so, error);
2146 	goto disconnect_unlocked;
2147 
2148 pr_disc_vp_unl:
2149 	eprintsoline(so, error);
2150 disconnect_vp_unlocked:
2151 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2152 	VN_RELE(nvp);
2153 disconnect_unlocked:
2154 	(void) sodisconnect(so, SEQ_number, 0);
2155 	return (error);
2156 
2157 pr_disc_vp:
2158 	eprintsoline(so, error);
2159 disconnect_vp:
2160 	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2161 	so_unlock_single(so, SOLOCKED);
2162 	mutex_exit(&so->so_lock);
2163 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2164 	VN_RELE(nvp);
2165 	return (error);
2166 
2167 conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2168 	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2169 	    ? EOPNOTSUPP : EINVAL;
2170 e_bad:
2171 	eprintsoline(so, error);
2172 	return (error);
2173 }
2174 
2175 /*
2176  * connect a socket.
2177  *
2178  * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
2179  * unconnect (by specifying a null address).
 *
 * Arguments:
 *	so	- socket being connected.
 *	name	- destination address; NULL, or an address whose family is
 *		  AF_UNSPEC, requests an "unconnect".
 *	namelen	- length of name in bytes.
 *	fflag	- passed to sowaitconnected(); forced to 0 on the datagram
 *		  path that sends down a T_CONN_REQ.
 *	flags	- _SOCONNECT_* flags. _SOCONNECT_NOXLATE and
 *		  _SOCONNECT_XPG4_2 are examined here; _SOCONNECT_DID_BIND
 *		  is or'ed in locally when this call had to bind the socket.
 *	cr	- credentials handed to the allocation, bind and setsockopt
 *		  calls made below.
 *
 * Returns 0 or an errno value. so_lock is acquired and released here.
 * SOLOCKED is held throughout except around the calls to
 * sotpi_setsockopt() and while waiting in sowaitconnected().
 *
 * NOTE(review): name is dereferenced whenever namelen is at least
 * sizeof (sa_family_t); this presumes callers never pass a NULL name
 * together with a non-zero namelen - confirm against the callers.
2180  */
2181 int
2182 sotpi_connect(struct sonode *so,
2183     struct sockaddr *name,
2184     socklen_t namelen,
2185     int fflag,
2186     int flags,
2187     struct cred *cr)
2188 {
2189 	struct T_conn_req	conn_req;
2190 	int			error = 0;
2191 	mblk_t			*mp;
2192 	void			*src;
2193 	socklen_t		srclen;
2194 	void			*addr;
2195 	socklen_t		addrlen;
2196 	boolean_t		need_unlock;
2197 	sotpi_info_t		*sti = SOTOTPI(so);
2198 
2199 	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2200 	    (void *)so, (void *)name, namelen, fflag, flags,
2201 	    pr_state(so->so_state, so->so_mode)));
2202 
2203 	/*
2204 	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2205 	 * avoid sleeping for memory with SOLOCKED held.
2206 	 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2207 	 * + sizeof (struct T_opthdr).
2208 	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
2209 	 * exceed sti_faddr_maxlen).
2210 	 */
2211 	mp = soallocproto(sizeof (struct T_conn_req) +
2212 	    2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
2213 	    cr);
2214 	if (mp == NULL) {
2215 		/*
2216 		 * Connect can not fail with ENOBUFS. A signal was
2217 		 * caught so return EINTR.
2218 		 */
2219 		error = EINTR;
2220 		eprintsoline(so, error);
2221 		return (error);
2222 	}
2223 
2224 	mutex_enter(&so->so_lock);
2225 	/*
2226 	 * Make sure there is a preallocated T_unbind_req message
2227 	 * before any binding. This message is allocated when the
2228 	 * socket is created. Since another thread can consume
2229 	 * so_unbind_mp by the time we return from so_lock_single(),
2230 	 * we should check the availability of so_unbind_mp after
2231 	 * we return from so_lock_single().
2232 	 */
2233 
2234 	so_lock_single(so);	/* Set SOLOCKED */
	/* need_unlock records whether SOLOCKED is still held at "done" */
2235 	need_unlock = B_TRUE;
2236 
2237 	if (sti->sti_unbind_mp == NULL) {
2238 		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2239 		/* NOTE: holding so_lock while sleeping */
2240 		sti->sti_unbind_mp =
2241 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
2242 		if (sti->sti_unbind_mp == NULL) {
2243 			error = EINTR;
2244 			goto done;
2245 		}
2246 	}
2247 
2248 	/*
2249 	 * Can't have done a listen before connecting.
2250 	 */
2251 	if (so->so_state & SS_ACCEPTCONN) {
2252 		error = EOPNOTSUPP;
2253 		goto done;
2254 	}
2255 
2256 	/*
2257 	 * Must be bound with the transport
2258 	 */
2259 	if (!(so->so_state & SS_ISBOUND)) {
2260 		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2261 		    /*CONSTCOND*/
2262 		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2263 			/*
2264 			 * Optimization for AF_INET{,6} transports
2265 			 * that can handle a T_CONN_REQ without being bound.
2266 			 */
2267 			so_automatic_bind(so);
2268 		} else {
2269 			error = sotpi_bind(so, NULL, 0,
2270 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2271 			if (error)
2272 				goto done;
2273 		}
2274 		ASSERT(so->so_state & SS_ISBOUND);
		/* Lets the fatal-error path at "done" undo this bind */
2275 		flags |= _SOCONNECT_DID_BIND;
2276 	}
2277 
2278 	/*
2279 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
2280 	 * connect to a null address. This is the portable method to
2281 	 * unconnect a socket.
2282 	 */
2283 	if ((namelen >= sizeof (sa_family_t)) &&
2284 	    (name->sa_family == AF_UNSPEC)) {
2285 		name = NULL;
2286 		namelen = 0;
2287 	}
2288 
2289 	/*
2290 	 * Check that we are not already connected.
2291 	 * A connection-oriented socket cannot be reconnected.
2292 	 * A connected connection-less socket can be
2293 	 * - connected to a different address by a subsequent connect
2294 	 * - "unconnected" by a connect to the NULL address
2295 	 */
2296 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2297 		ASSERT(!(flags & _SOCONNECT_DID_BIND));
2298 		if (so->so_mode & SM_CONNREQUIRED) {
2299 			/* Connection-oriented socket */
2300 			error = so->so_state & SS_ISCONNECTED ?
2301 			    EISCONN : EALREADY;
2302 			goto done;
2303 		}
2304 		/* Connection-less socket */
2305 		if (name == NULL) {
2306 			/*
2307 			 * Remove the connected state and clear SO_DGRAM_ERRIND
2308 			 * since it was set when the socket was connected.
2309 			 * If this is UDP also send down a T_DISCON_REQ.
2310 			 */
2311 			int val;
2312 
2313 			if ((so->so_family == AF_INET ||
2314 			    so->so_family == AF_INET6) &&
2315 			    (so->so_type == SOCK_DGRAM ||
2316 			    so->so_type == SOCK_RAW) &&
2317 			    /*CONSTCOND*/
2318 			    !soconnect_tpi_udp) {
2319 				/* XXX What about implicitly unbinding here? */
2320 				error = sodisconnect(so, -1,
2321 				    _SODISCONNECT_LOCK_HELD);
2322 			} else {
2323 				so->so_state &=
2324 				    ~(SS_ISCONNECTED | SS_ISCONNECTING);
2325 				sti->sti_faddr_valid = 0;
2326 				sti->sti_faddr_len = 0;
2327 			}
2328 
2329 			/* Remove SOLOCKED since setsockopt will grab it */
2330 			so_unlock_single(so, SOLOCKED);
2331 			mutex_exit(&so->so_lock);
2332 
2333 			val = 0;
2334 			(void) sotpi_setsockopt(so, SOL_SOCKET,
2335 			    SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2336 			    cr);
2337 
2338 			mutex_enter(&so->so_lock);
2339 			so_lock_single(so);	/* Set SOLOCKED */
2340 			goto done;
2341 		}
2342 	}
2343 	ASSERT(so->so_state & SS_ISBOUND);
2344 
2345 	if (name == NULL || namelen == 0) {
2346 		error = EINVAL;
2347 		goto done;
2348 	}
2349 	/*
2350 	 * Mark the socket if sti_faddr_sa represents the transport level
2351 	 * address.
2352 	 */
2353 	if (flags & _SOCONNECT_NOXLATE) {
2354 		struct sockaddr_ux	*soaddr_ux;
2355 
2356 		ASSERT(so->so_family == AF_UNIX);
2357 		if (namelen != sizeof (struct sockaddr_ux)) {
2358 			error = EINVAL;
2359 			goto done;
2360 		}
2361 		soaddr_ux = (struct sockaddr_ux *)name;
2362 		name = (struct sockaddr *)&soaddr_ux->sou_addr;
2363 		namelen = sizeof (soaddr_ux->sou_addr);
2364 		sti->sti_faddr_noxlate = 1;
2365 	}
2366 
2367 	/*
2368 	 * Length and family checks.
2369 	 */
2370 	error = so_addr_verify(so, name, namelen);
2371 	if (error)
2372 		goto bad;
2373 
2374 	/*
2375 	 * Save foreign address. Needed for AF_UNIX as well as
2376 	 * transport providers that do not support TI_GETPEERNAME.
2377 	 * Also used for cached foreign address for TCP and UDP.
2378 	 */
2379 	if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2380 		error = EINVAL;
2381 		goto done;
2382 	}
2383 	sti->sti_faddr_len = (socklen_t)namelen;
2384 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2385 	bcopy(name, sti->sti_faddr_sa, namelen);
2386 	sti->sti_faddr_valid = 1;
2387 
2388 	if (so->so_family == AF_UNIX) {
2389 		if (sti->sti_faddr_noxlate) {
2390 			/*
2391 			 * sti_faddr is a transport-level address, so
2392 			 * don't pass it as an option.  Do save it in
2393 			 * sti_ux_faddr, used for connected DG send.
2394 			 */
2395 			src = NULL;
2396 			srclen = 0;
2397 			addr = sti->sti_faddr_sa;
2398 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2399 			bcopy(addr, &sti->sti_ux_faddr,
2400 			    sizeof (sti->sti_ux_faddr));
2401 		} else {
2402 			/*
2403 			 * Pass the sockaddr_un source address as an option
2404 			 * and translate the remote address.
2405 			 * Holding so_lock thus sti_laddr_sa can not change.
2406 			 */
2407 			src = sti->sti_laddr_sa;
2408 			srclen = (t_uscalar_t)sti->sti_laddr_len;
2409 			dprintso(so, 1,
2410 			    ("sotpi_connect UNIX: srclen %d, src %p\n",
2411 			    srclen, src));
2412 			/*
2413 			 * Translate the destination address into our
2414 			 * internal form, and save it in sti_ux_faddr.
2415 			 * After this call, addr==&sti->sti_ux_taddr,
2416 			 * and we copy that to sti->sti_ux_faddr so
2417 			 * we save the connected peer address.
2418 			 */
2419 			error = so_ux_addr_xlate(so,
2420 			    sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2421 			    (flags & _SOCONNECT_XPG4_2),
2422 			    &addr, &addrlen);
2423 			if (error)
2424 				goto bad;
2425 			bcopy(&sti->sti_ux_taddr, &sti->sti_ux_faddr,
2426 			    sizeof (sti->sti_ux_faddr));
2427 		}
2428 	} else {
2429 		addr = sti->sti_faddr_sa;
2430 		addrlen = (t_uscalar_t)sti->sti_faddr_len;
2431 		src = NULL;
2432 		srclen = 0;
2433 	}
2434 	/*
2435 	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2436 	 * option which asks the transport provider to send T_UDERR_IND
2437 	 * messages. These T_UDERR_IND messages are used to return connected
2438 	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2439 	 *
2440 	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2441 	 * we send down a T_CONN_REQ. This is needed to let the
2442 	 * transport assign a local address that is consistent with
2443 	 * the remote address. Applications depend on a getsockname()
2444 	 * after a connect() to retrieve the "source" IP address for
2445 	 * the connected socket.  Invalidate the cached local address
2446 	 * to force getsockname() to enquire of the transport.
2447 	 */
2448 	if (!(so->so_mode & SM_CONNREQUIRED)) {
2449 		/*
2450 		 * Datagram socket.
2451 		 */
2452 		int32_t val;
2453 
		/* Drop SOLOCKED and so_lock around sotpi_setsockopt() */
2454 		so_unlock_single(so, SOLOCKED);
2455 		mutex_exit(&so->so_lock);
2456 
2457 		val = 1;
2458 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2459 		    &val, (t_uscalar_t)sizeof (val), cr);
2460 
2461 		mutex_enter(&so->so_lock);
2462 		so_lock_single(so);	/* Set SOLOCKED */
2463 		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2464 		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2465 		    soconnect_tpi_udp) {
2466 			soisconnected(so);
2467 			goto done;
2468 		}
2469 		/*
2470 		 * Send down T_CONN_REQ etc.
2471 		 * Clear fflag to avoid returning EWOULDBLOCK.
2472 		 */
2473 		fflag = 0;
2474 		ASSERT(so->so_family != AF_UNIX);
2475 		sti->sti_laddr_valid = 0;
2476 	} else if (sti->sti_laddr_len != 0) {
2477 		/*
2478 		 * If the local address or port was "any" then it may be
2479 		 * changed by the transport as a result of the
2480 		 * connect.  Invalidate the cached version if we have one.
2481 		 */
2482 		switch (so->so_family) {
2483 		case AF_INET:
2484 			ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2485 			if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2486 			    INADDR_ANY ||
2487 			    ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2488 				sti->sti_laddr_valid = 0;
2489 			break;
2490 
2491 		case AF_INET6:
2492 			ASSERT(sti->sti_laddr_len ==
2493 			    (socklen_t)sizeof (sin6_t));
2494 			if (IN6_IS_ADDR_UNSPECIFIED(
2495 			    &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2496 			    IN6_IS_ADDR_V4MAPPED_ANY(
2497 			    &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2498 			    ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2499 				sti->sti_laddr_valid = 0;
2500 			break;
2501 
2502 		default:
2503 			break;
2504 		}
2505 	}
2506 
2507 	/*
2508 	 * Check for failure of an earlier call
2509 	 */
2510 	if (so->so_error != 0)
2511 		goto so_bad;
2512 
2513 	/*
2514 	 * Send down T_CONN_REQ. Message was allocated above.
2515 	 */
2516 	conn_req.PRIM_type = T_CONN_REQ;
2517 	conn_req.DEST_length = addrlen;
2518 	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2519 	if (srclen == 0) {
2520 		conn_req.OPT_length = 0;
2521 		conn_req.OPT_offset = 0;
2522 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2523 		soappendmsg(mp, addr, addrlen);
2524 	} else {
2525 		/*
2526 		 * There is an AF_UNIX sockaddr_un to include as a source
2527 		 * address option.
2528 		 */
2529 		struct T_opthdr toh;
2530 
2531 		toh.level = SOL_SOCKET;
2532 		toh.name = SO_SRCADDR;
2533 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2534 		toh.status = 0;
2535 		conn_req.OPT_length =
2536 		    (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2537 		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2538 		    _TPI_ALIGN_TOPT(addrlen));
2539 
		/*
		 * Layout: T_conn_req, destination address padded to
		 * _TPI_ALIGN_TOPT, T_opthdr, source address padded likewise.
		 */
2540 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2541 		soappendmsg(mp, addr, addrlen);
2542 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2543 		soappendmsg(mp, &toh, sizeof (toh));
2544 		soappendmsg(mp, src, srclen);
2545 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2546 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2547 	}
2548 	/*
2549 	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2550 	 * in order to have the right state when the T_CONN_CON shows up.
2551 	 */
2552 	soisconnecting(so);
2553 	mutex_exit(&so->so_lock);
2554 
2555 	if (AU_AUDITING())
2556 		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2557 
2558 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2559 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
	/* mp was handed to kstrputmsg(); clear it so "done" does not free it */
2560 	mp = NULL;
2561 	mutex_enter(&so->so_lock);
2562 	if (error != 0)
2563 		goto bad;
2564 
2565 	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2566 		goto bad;
2567 
2568 	/* Allow other threads to access the socket */
2569 	so_unlock_single(so, SOLOCKED);
2570 	need_unlock = B_FALSE;
2571 
2572 	/*
2573 	 * Wait until we get a T_CONN_CON or an error
2574 	 */
2575 	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
		/* Retake SOLOCKED for the error handling at "done" */
2576 		so_lock_single(so);	/* Set SOLOCKED */
2577 		need_unlock = B_TRUE;
2578 	}
2579 
	/*
	 * Common exit. so_lock is held on every path that reaches here;
	 * SOLOCKED is held only if need_unlock is set.
	 */
2580 done:
2581 	freemsg(mp);
2582 	switch (error) {
2583 	case EINPROGRESS:
2584 	case EALREADY:
2585 	case EISCONN:
2586 	case EINTR:
2587 		/* Non-fatal errors */
2588 		sti->sti_laddr_valid = 0;
2589 		/* FALLTHRU */
2590 	case 0:
2591 		break;
2592 	default:
2593 		ASSERT(need_unlock);
2594 		/*
2595 		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2596 		 * and invalidate local-address cache
2597 		 */
2598 		so->so_state &= ~SS_ISCONNECTING;
2599 		sti->sti_laddr_valid = 0;
2600 		/* A discon_ind might have already unbound us */
2601 		if ((flags & _SOCONNECT_DID_BIND) &&
2602 		    (so->so_state & SS_ISBOUND)) {
2603 			int err;
2604 
2605 			err = sotpi_unbind(so, 0);
2606 			/* LINTED - statement has no conseq */
2607 			if (err) {
2608 				eprintsoline(so, err);
2609 			}
2610 		}
2611 		break;
2612 	}
2613 	if (need_unlock)
2614 		so_unlock_single(so, SOLOCKED);
2615 	mutex_exit(&so->so_lock);
2616 	return (error);
2617 
	/* so_bad: fetch the pending socket error; bad: log, then clean up */
2618 so_bad:	error = sogeterr(so, B_TRUE);
2619 bad:	eprintsoline(so, error);
2620 	goto done;
2621 }
2622 
/*
 * Shut down one or both directions of a connected socket.
 *
 * how selects the direction: 0 marks the socket as unable to receive,
 * 1 as unable to send, and 2 as both (presumably SHUT_RD, SHUT_WR and
 * SHUT_RDWR - confirm against the callers); any other value yields EINVAL.
 *
 * If the socket is not connected the call does nothing; it returns
 * ENOTCONN unless xnet_skip_checks is set, in which case it returns 0.
 *
 * Only state bits newly set by this call cause stream head updates and
 * messages to be sent, so a repeated shutdown() does not resend them.
 *
 * Returns 0 or an errno value. so_lock and SOLOCKED are acquired and
 * released here; so_lock is dropped around the stream head calls.
 * cr is used only for the T_ORDREL_REQ allocation.
 */
2623 /* ARGSUSED */
2624 int
2625 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2626 {
2627 	struct T_ordrel_req	ordrel_req;
2628 	mblk_t			*mp;
2629 	uint_t			old_state, state_change;
2630 	int			error = 0;
2631 	sotpi_info_t		*sti = SOTOTPI(so);
2632 
2633 	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2634 	    (void *)so, how, pr_state(so->so_state, so->so_mode)));
2635 
2636 	mutex_enter(&so->so_lock);
2637 	so_lock_single(so);	/* Set SOLOCKED */
2638 
2639 	/*
2640 	 * SunOS 4.X has no check for datagram sockets.
2641 	 * 5.X checks that it is connected (ENOTCONN)
2642 	 * X/Open requires that we check the connected state.
2643 	 */
2644 	if (!(so->so_state & SS_ISCONNECTED)) {
2645 		if (!xnet_skip_checks) {
2646 			error = ENOTCONN;
2647 			if (xnet_check_print) {
2648 				printf("sockfs: X/Open shutdown check "
2649 				    "caused ENOTCONN\n");
2650 			}
2651 		}
2652 		goto done;
2653 	}
2654 	/*
2655 	 * Record the current state and then perform any state changes.
2656 	 * Then use the difference between the old and new states to
2657 	 * determine which messages need to be sent.
2658 	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2659 	 * duplicate calls to shutdown().
2660 	 */
2661 	old_state = so->so_state;
2662 
2663 	switch (how) {
2664 	case 0:
2665 		socantrcvmore(so);
2666 		break;
2667 	case 1:
2668 		socantsendmore(so);
2669 		break;
2670 	case 2:
2671 		socantsendmore(so);
2672 		socantrcvmore(so);
2673 		break;
2674 	default:
2675 		error = EINVAL;
2676 		goto done;
2677 	}
2678 
2679 	/*
2680 	 * Assumes that the SS_CANT* flags are never cleared in the above code.
2681 	 */
	/* With no bits cleared, the subtraction leaves just the new bits */
2682 	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2683 	    (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2684 	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2685 
2686 	switch (state_change) {
2687 	case 0:
2688 		dprintso(so, 1,
2689 		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
2690 		    so->so_state));
2691 		goto done;
2692 
2693 	case SS_CANTRCVMORE:
2694 		mutex_exit(&so->so_lock);
2695 		strseteof(SOTOV(so), 1);
2696 		/*
2697 		 * strseteof takes care of read side wakeups,
2698 		 * pollwakeups, and signals.
2699 		 */
2700 		/*
2701 		 * Get the read lock before flushing data to avoid problems
2702 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2703 		 */
2704 		mutex_enter(&so->so_lock);
2705 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2706 		mutex_exit(&so->so_lock);
2707 
2708 		/* Flush read side queue */
2709 		strflushrq(SOTOV(so), FLUSHALL);
2710 
2711 		mutex_enter(&so->so_lock);
2712 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2713 		break;
2714 
2715 	case SS_CANTSENDMORE:
2716 		mutex_exit(&so->so_lock);
2717 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2718 		mutex_enter(&so->so_lock);
2719 		break;
2720 
2721 	case SS_CANTSENDMORE|SS_CANTRCVMORE:
2722 		mutex_exit(&so->so_lock);
2723 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2724 		strseteof(SOTOV(so), 1);
2725 		/*
2726 		 * strseteof takes care of read side wakeups,
2727 		 * pollwakeups, and signals.
2728 		 */
2729 		/*
2730 		 * Get the read lock before flushing data to avoid problems
2731 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2732 		 */
2733 		mutex_enter(&so->so_lock);
2734 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2735 		mutex_exit(&so->so_lock);
2736 
2737 		/* Flush read side queue */
2738 		strflushrq(SOTOV(so), FLUSHALL);
2739 
2740 		mutex_enter(&so->so_lock);
2741 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2742 		break;
2743 	}
2744 
2745 	ASSERT(MUTEX_HELD(&so->so_lock));
2746 
2747 	/*
2748 	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2749 	 * was set due to this call and the new state has both of them set:
2750 	 *	Send the AF_UNIX close indication
2751 	 *	For T_COTS call sodisconnect()
2752 	 *
2753 	 * If cantsend was set due to this call:
2754 	 *	For T_COTS_ORD send a T_ORDREL_REQ
2755 	 *
2756 	 * Note that for T_CLTS there is no message sent here.
2757 	 */
2758 	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2759 	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2760 		/*
2761 		 * For SunOS 4.X compatibility we tell the other end
2762 		 * that we are unable to receive at this point.
2763 		 */
2764 		if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2765 			so_unix_close(so);
2766 
2767 		if (sti->sti_serv_type == T_COTS)
2768 			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2769 	}
2770 	if ((state_change & SS_CANTSENDMORE) &&
2771 	    (sti->sti_serv_type == T_COTS_ORD)) {
2772 		/* Send an orderly release */
2773 		ordrel_req.PRIM_type = T_ORDREL_REQ;
2774 
2775 		mutex_exit(&so->so_lock);
2776 		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2777 		    0, _ALLOC_SLEEP, cr);
2778 		/*
2779 		 * Send down the T_ORDREL_REQ even if there is flow control.
2780 		 * This prevents shutdown from blocking.
2781 		 * Note that there is no T_OK_ACK for ordrel_req.
2782 		 */
2783 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2784 		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2785 		mutex_enter(&so->so_lock);
2786 		if (error) {
2787 			eprintsoline(so, error);
2788 			goto done;
2789 		}
2790 	}
2791 
2792 done:
2793 	so_unlock_single(so, SOLOCKED);
2794 	mutex_exit(&so->so_lock);
2795 	return (error);
2796 }
2797 
2798 /*
2799  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2800  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2801  * that we have closed.
2802  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2803  * T_UNITDATA_REQ containing the same option.
2804  *
2805  * For SOCK_DGRAM half-connections (somebody connected to this end
2806  * but this end is not connected) we don't know where to send any
2807  * SO_UNIX_CLOSE.
2808  *
2809  * We have to ignore stream head errors just in case there has been
2810  * a shutdown(output).
2811  * Ignore any flow control to try to get the message more quickly to the peer.
2812  * While locally ignoring flow control solves the problem when there
2813  * is only the loopback transport on the stream it would not provide
2814  * the correct AF_UNIX socket semantics when one or more modules have
2815  * been pushed.
 *
 * Must be called with so_lock held. The lock is dropped while the message
 * is sent down and reacquired before returning. Nothing is sent unless
 * the socket is both connected and bound. The result of kstrputmsg() is
 * deliberately discarded.
2816  */
2817 void
2818 so_unix_close(struct sonode *so)
2819 {
2820 	struct T_opthdr	toh;
2821 	mblk_t		*mp;
2822 	sotpi_info_t	*sti = SOTOTPI(so);
2823 
2824 	ASSERT(MUTEX_HELD(&so->so_lock));
2825 
2826 	ASSERT(so->so_family == AF_UNIX);
2827 
2828 	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2829 	    (SS_ISCONNECTED|SS_ISBOUND))
2830 		return;
2831 
2832 	dprintso(so, 1, ("so_unix_close(%p) %s\n",
2833 	    (void *)so, pr_state(so->so_state, so->so_mode)));
2834 
2835 	toh.level = SOL_SOCKET;
2836 	toh.name = SO_UNIX_CLOSE;
2837 
2838 	/* zero length + header */
2839 	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2840 	toh.status = 0;
2841 
2842 	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2843 		struct T_optdata_req tdr;
2844 
2845 		tdr.PRIM_type = T_OPTDATA_REQ;
2846 		tdr.DATA_flag = 0;
2847 
2848 		tdr.OPT_length = (t_scalar_t)sizeof (toh);
2849 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2850 
2851 		/* NOTE: holding so_lock while sleeping */
2852 		mp = soallocproto2(&tdr, sizeof (tdr),
2853 		    &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2854 	} else {
2855 		struct T_unitdata_req	tudr;
2856 		void			*addr;
2857 		socklen_t		addrlen;
2858 		void			*src;
2859 		socklen_t		srclen;
2860 		struct T_opthdr		toh2;
2861 		t_scalar_t		size;
2862 
2863 		/*
2864 		 * We know this is an AF_UNIX connected DGRAM socket.
2865 		 * We therefore already have the destination address
2866 		 * in the internal form needed for this send.  This is
2867 		 * similar to the sosend_dgram call later in this file
2868 		 * when there's no user-specified destination address.
2869 		 */
2870 		if (sti->sti_faddr_noxlate) {
2871 			/*
2872 			 * Already have a transport internal address. Do not
2873 			 * pass any (transport internal) source address.
2874 			 */
2875 			addr = sti->sti_faddr_sa;
2876 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2877 			src = NULL;
2878 			srclen = 0;
2879 		} else {
2880 			/*
2881 			 * Pass the sockaddr_un source address as an option
2882 			 * and translate the remote address.
2883 			 * Holding so_lock thus sti_laddr_sa can not change.
2884 			 */
2885 			src = sti->sti_laddr_sa;
2886 			srclen = (socklen_t)sti->sti_laddr_len;
2887 			dprintso(so, 1,
2888 			    ("so_ux_close: srclen %d, src %p\n",
2889 			    srclen, src));
2890 			/*
2891 			 * Use the destination address saved in connect.
2892 			 */
2893 			addr = &sti->sti_ux_faddr;
2894 			addrlen = sizeof (sti->sti_ux_faddr);
2895 		}
2896 		tudr.PRIM_type = T_UNITDATA_REQ;
2897 		tudr.DEST_length = addrlen;
2898 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2899 		if (srclen == 0) {
			/* Options: just the SO_UNIX_CLOSE header */
2900 			tudr.OPT_length = (t_scalar_t)sizeof (toh);
2901 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2902 			    _TPI_ALIGN_TOPT(addrlen));
2903 
2904 			size = tudr.OPT_offset + tudr.OPT_length;
2905 			/* NOTE: holding so_lock while sleeping */
2906 			mp = soallocproto2(&tudr, sizeof (tudr),
2907 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
			/* Skip the alignment padding after the address */
2908 			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2909 			soappendmsg(mp, &toh, sizeof (toh));
2910 		} else {
2911 			/*
2912 			 * There is an AF_UNIX sockaddr_un to include as a
2913 			 * source address option.
2914 			 */
2915 			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2916 			    _TPI_ALIGN_TOPT(srclen));
2917 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2918 			    _TPI_ALIGN_TOPT(addrlen));
2919 
2920 			toh2.level = SOL_SOCKET;
2921 			toh2.name = SO_SRCADDR;
2922 			toh2.len = (t_uscalar_t)(srclen +
2923 			    sizeof (struct T_opthdr));
2924 			toh2.status = 0;
2925 
2926 			size = tudr.OPT_offset + tudr.OPT_length;
2927 
2928 			/* NOTE: holding so_lock while sleeping */
2929 			mp = soallocproto2(&tudr, sizeof (tudr),
2930 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
			/*
			 * Options follow the padded address: SO_UNIX_CLOSE
			 * header, then SO_SRCADDR header and padded source.
			 */
2931 			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2932 			soappendmsg(mp, &toh, sizeof (toh));
2933 			soappendmsg(mp, &toh2, sizeof (toh2));
2934 			soappendmsg(mp, src, srclen);
2935 			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2936 		}
2937 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2938 	}
	/* Drop so_lock across the send; the caller expects it held on return */
2939 	mutex_exit(&so->so_lock);
2940 	(void) kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2941 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2942 	mutex_enter(&so->so_lock);
2943 }
2944 
2945 /*
2946  * Called by sotpi_recvmsg when reading a non-zero amount of data.
2947  * In addition, the caller typically verifies that there is some
2948  * potential state to clear by checking
2949  *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2950  * before calling this routine.
2951  * Note that such a check can be made without holding so_lock since
2952  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2953  * decrements sti_oobsigcnt.
2954  *
2955  * When data is read *after* the point that all pending
2956  * oob data has been consumed the oob indication is cleared.
2957  *
2958  * This logic keeps select/poll returning POLLRDBAND and
2959  * SIOCATMARK returning true until we have read past
2960  * the mark.
2961  */
2962 static void
2963 sorecv_update_oobstate(struct sonode *so)
2964 {
2965 	sotpi_info_t *sti = SOTOTPI(so);
2966 
2967 	mutex_enter(&so->so_lock);
2968 	ASSERT(so_verify_oobstate(so));
2969 	dprintso(so, 1,
2970 	    ("sorecv_update_oobstate: counts %d/%d state %s\n",
2971 	    sti->sti_oobsigcnt,
2972 	    sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
2973 	if (sti->sti_oobsigcnt == 0) {
2974 		/* No more pending oob indications */
2975 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2976 		freemsg(so->so_oobmsg);
2977 		so->so_oobmsg = NULL;
2978 	}
2979 	ASSERT(so_verify_oobstate(so));
2980 	mutex_exit(&so->so_lock);
2981 }
2982 
2983 /*
2984  * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
2985  */
2986 static int
2987 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
2988 {
2989 	sotpi_info_t *sti = SOTOTPI(so);
2990 	int	error = 0;
2991 	mblk_t *tmp = NULL;
2992 	mblk_t *pmp = NULL;
2993 	mblk_t *nmp = sti->sti_nl7c_rcv_mp;
2994 
2995 	ASSERT(nmp != NULL);
2996 
2997 	while (nmp != NULL && uiop->uio_resid > 0) {
2998 		ssize_t n;
2999 
3000 		if (DB_TYPE(nmp) == M_DATA) {
3001 			/*
3002 			 * We have some data, uiomove up to resid bytes.
3003 			 */
3004 			n = MIN(MBLKL(nmp), uiop->uio_resid);
3005 			if (n > 0)
3006 				error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
3007 			nmp->b_rptr += n;
3008 			if (nmp->b_rptr == nmp->b_wptr) {
3009 				pmp = nmp;
3010 				nmp = nmp->b_cont;
3011 			}
3012 			if (error)
3013 				break;
3014 		} else {
3015 			/*
3016 			 * We only handle data, save for caller to handle.
3017 			 */
3018 			if (pmp != NULL) {
3019 				pmp->b_cont = nmp->b_cont;
3020 			}
3021 			nmp->b_cont = NULL;
3022 			if (*rmp == NULL) {
3023 				*rmp = nmp;
3024 			} else {
3025 				tmp->b_cont = nmp;
3026 			}
3027 			nmp = nmp->b_cont;
3028 			tmp = nmp;
3029 		}
3030 	}
3031 	if (pmp != NULL) {
3032 		/* Free any mblk_t(s) which we have consumed */
3033 		pmp->b_cont = NULL;
3034 		freemsg(sti->sti_nl7c_rcv_mp);
3035 	}
3036 	if ((sti->sti_nl7c_rcv_mp = nmp) == NULL) {
3037 		/* Last mblk_t so return the saved kstrgetmsg() rval/error */
3038 		if (error == 0) {
3039 			rval_t	*p = (rval_t *)&sti->sti_nl7c_rcv_rval;
3040 
3041 			error = p->r_v.r_v2;
3042 			p->r_v.r_v2 = 0;
3043 		}
3044 		rp->r_vals = sti->sti_nl7c_rcv_rval;
3045 		sti->sti_nl7c_rcv_rval = 0;
3046 	} else {
3047 		/* More mblk_t(s) to process so no rval to return */
3048 		rp->r_vals = 0;
3049 	}
3050 	return (error);
3051 }
3052 /*
3053  * Receive the next message on the queue.
3054  * If msg_controllen is non-zero when called the caller is interested in
3055  * any received control info (options).
3056  * If msg_namelen is non-zero when called the caller is interested in
3057  * any received source address.
3058  * The routine returns with msg_control and msg_name pointing to
3059  * kmem_alloc'ed memory which the caller has to free.
3060  */
/*
 * Arguments:
 *	so	socket to receive from.
 *	msg	on entry msg_flags holds the MSG_* flags for this call and
 *		msg_namelen/msg_controllen act as "wanted" indicators (see
 *		above). All three are zeroed early on; on return msg_flags
 *		holds result flags (MSG_TRUNC, MSG_CTRUNC, MSG_EOR) and
 *		msg_name/msg_namelen, msg_control/msg_controllen describe
 *		any buffers that were allocated here.
 *	uiop	destination for the received data.
 *	cr	only passed on to strread() for SOV_STREAM sockets.
 *
 * Returns 0 or an errno value. Errors produced directly in this function
 * are ENOTCONN, EOPNOTSUPP and EPROTO; an ETIME from the receive step is
 * reported as EWOULDBLOCK. Other values come from the routines called.
 *
 * Apart from the early returns, the function runs with SOREADLOCKED set
 * (taken via so_lock_read_intr) and releases it at out/out_locked.
 */
3061 /* ARGSUSED */
3062 int
3063 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
3064     struct cred *cr)
3065 {
3066 	union T_primitives	*tpr;
3067 	mblk_t			*mp;
3068 	uchar_t			pri;
3069 	int			pflag, opflag;
3070 	void			*control;
3071 	t_uscalar_t		controllen;
3072 	t_uscalar_t		namelen;
3073 	int			so_state = so->so_state; /* Snapshot */
3074 	ssize_t			saved_resid;
3075 	rval_t			rval;
3076 	int			flags;
3077 	clock_t			timout;
3078 	int			error = 0;
3079 	sotpi_info_t		*sti = SOTOTPI(so);
3080 
3081 	flags = msg->msg_flags;
3082 	msg->msg_flags = 0;
3083 
3084 	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
3085 	    (void *)so, (void *)msg, flags,
3086 	    pr_state(so->so_state, so->so_mode), so->so_error));
3087 
3088 	if (so->so_version == SOV_STREAM) {
3089 		so_update_attrs(so, SOACC);
3090 		/* The imaginary "sockmod" has been popped - act as a stream */
3091 		return (strread(SOTOV(so), uiop, cr));
3092 	}
3093 
3094 	/*
3095 	 * If we are not connected because we have never been connected
3096 	 * we return ENOTCONN. If we have been connected (but are no longer
3097 	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
3098 	 * the EOF.
3099 	 *
3100 	 * An alternative would be to post an ENOTCONN error in stream head
3101 	 * (read+write) and clear it when we're connected. However, that error
3102 	 * would cause incorrect poll/select behavior!
3103 	 */
3104 	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
3105 	    (so->so_mode & SM_CONNREQUIRED)) {
3106 		return (ENOTCONN);
3107 	}
3108 
3109 	/*
3110 	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
3111 	 * after checking that the read queue is empty) and returns zero.
3112 	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
3113 	 * is zero.
3114 	 */
3115 
3116 	if (flags & MSG_OOB) {
3117 		/* Check that the transport supports OOB */
3118 		if (!(so->so_mode & SM_EXDATA))
3119 			return (EOPNOTSUPP);
3120 		so_update_attrs(so, SOACC);
3121 		return (sorecvoob(so, msg, uiop, flags,
3122 		    (so->so_options & SO_OOBINLINE)));
3123 	}
3124 
3125 	so_update_attrs(so, SOACC);
3126 
3127 	/*
3128 	 * Set msg_controllen and msg_namelen to zero here to make it
3129 	 * simpler in the cases that no control or name is returned.
3130 	 */
3131 	controllen = msg->msg_controllen;
3132 	namelen = msg->msg_namelen;
3133 	msg->msg_controllen = 0;
3134 	msg->msg_namelen = 0;
3135 
3136 	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
3137 	    namelen, controllen));
3138 
3139 	mutex_enter(&so->so_lock);
3140 	/*
3141 	 * If an NL7C enabled socket and not waiting for write data.
3142 	 */
3143 	if ((sti->sti_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
3144 	    NL7C_ENABLED) {
3145 		if (sti->sti_nl7c_uri) {
3146 			/* Close uri processing for a previous request */
3147 			nl7c_close(so);
3148 		}
3149 		if ((so_state & SS_CANTRCVMORE) &&
3150 		    sti->sti_nl7c_rcv_mp == NULL) {
3151 			/* Nothing to process, EOF */
3152 			mutex_exit(&so->so_lock);
3153 			return (0);
3154 		} else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
3155 			/* Persistent NL7C socket, try to process request */
3156 			boolean_t ret;
3157 
3158 			ret = nl7c_process(so,
3159 			    (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
3160 			rval.r_vals = sti->sti_nl7c_rcv_rval;
3161 			error = rval.r_v.r_v2;
3162 			if (error) {
3163 				/* Error of some sort, return it */
3164 				mutex_exit(&so->so_lock);
3165 				return (error);
3166 			}
3167 			if (sti->sti_nl7c_flags &&
3168 			    ! (sti->sti_nl7c_flags & NL7C_WAITWRITE)) {
3169 				/*
3170 				 * Still an NL7C socket and no data
3171 				 * to pass up to the caller.
3172 				 */
3173 				mutex_exit(&so->so_lock);
3174 				if (ret) {
3175 					/* EOF */
3176 					return (0);
3177 				} else {
3178 					/* Need more data */
3179 					return (EAGAIN);
3180 				}
3181 			}
3182 		} else {
3183 			/*
3184 			 * Not persistent so no further NL7C processing.
3185 			 */
3186 			sti->sti_nl7c_flags = 0;
3187 		}
3188 	}
3189 	/*
3190 	 * Only one reader is allowed at any given time. This is needed
3191 	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
3192 	 *
3193 	 * This is slightly different than BSD behavior in that it fails with
3194 	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3195 	 * is single-threaded using sblock(), which is dropped while waiting
3196 	 * for data to appear. The difference shows up e.g. if one
3197 	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3198 	 * does use nonblocking io and different threads are reading each
3199 	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3200 	 * in this case as long as the read queue doesn't get empty.
3201 	 * In this implementation the thread using nonblocking io can
3202 	 * get an EWOULDBLOCK error due to the blocking thread executing
3203 	 * e.g. in the uiomove in kstrgetmsg.
3204 	 * This difference is not believed to be significant.
3205 	 */
3206 	/* Set SOREADLOCKED */
3207 	error = so_lock_read_intr(so,
3208 	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3209 	mutex_exit(&so->so_lock);
3210 	if (error)
3211 		return (error);
3212 
3213 	/*
3214 	 * Tell kstrgetmsg to not inspect the stream head errors until all
3215 	 * queued data has been consumed.
3216 	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3217 	 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3218 	 *
3219 	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3220 	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3221 	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3222 	 */
3223 	pflag = MSG_ANY | MSG_DELAYERROR;
3224 	if (flags & MSG_PEEK) {
3225 		pflag |= MSG_IPEEK;
3226 		flags &= ~MSG_WAITALL;
3227 	}
3228 	if (so->so_mode & SM_ATOMIC)
3229 		pflag |= MSG_DISCARDTAIL;
3230 
3231 	if (flags & MSG_DONTWAIT)
3232 		timout = 0;
3233 	else if (so->so_rcvtimeo != 0)
3234 		timout = TICK_TO_MSEC(so->so_rcvtimeo);
3235 	else
3236 		timout = -1;
	/*
	 * pflag is handed to kstrgetmsg by reference, so keep a pristine
	 * copy in opflag from which pflag is rebuilt before each retry.
	 */
3237 	opflag = pflag;
3238 retry:
	/*
	 * Each pass re-snapshots uio_resid so that "was any data copied by
	 * this pass" can be tested as uio_resid != saved_resid below.
	 */
3239 	saved_resid = uiop->uio_resid;
3240 	pri = 0;
3241 	mp = NULL;
3242 	if (sti->sti_nl7c_rcv_mp != NULL) {
3243 		/* Already kstrgetmsg()ed saved mblk(s) from NL7C */
3244 		error = nl7c_sorecv(so, &mp, uiop, &rval);
3245 	} else {
3246 		error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3247 		    timout, &rval);
3248 	}
3249 	if (error != 0) {
3250 		/* kstrgetmsg returns ETIME when timeout expires */
3251 		if (error == ETIME)
3252 			error = EWOULDBLOCK;
3253 		goto out;
3254 	}
3255 	/*
3256 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3257 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
3258 	 */
3259 	ASSERT(!(rval.r_val1 & MORECTL));
3260 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3261 		msg->msg_flags |= MSG_TRUNC;
3262 
	/* No control part was returned: plain data (or nothing at all). */
3263 	if (mp == NULL) {
3264 		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3265 		/*
3266 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3267 		 * The draft Posix socket spec states that the mark should
3268 		 * not be cleared when peeking. We follow the latter.
3269 		 */
3270 		if ((so->so_state &
3271 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3272 		    (uiop->uio_resid != saved_resid) &&
3273 		    !(flags & MSG_PEEK)) {
3274 			sorecv_update_oobstate(so);
3275 		}
3276 
3277 		mutex_enter(&so->so_lock);
3278 		/* Set MSG_EOR based on MOREDATA */
3279 		if (!(rval.r_val1 & MOREDATA)) {
3280 			if (so->so_state & SS_SAVEDEOR) {
3281 				msg->msg_flags |= MSG_EOR;
3282 				so->so_state &= ~SS_SAVEDEOR;
3283 			}
3284 		}
3285 		/*
3286 		 * If some data was received (i.e. not EOF) and the
3287 		 * read/recv* has not been satisfied wait for some more.
3288 		 */
3289 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3290 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3291 			mutex_exit(&so->so_lock);
3292 			pflag = opflag | MSG_NOMARK;
3293 			goto retry;
3294 		}
3295 		goto out_locked;
3296 	}
3297 
3298 	/* strsock_proto has already verified length and alignment */
3299 	tpr = (union T_primitives *)mp->b_rptr;
3300 	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3301 
3302 	switch (tpr->type) {
3303 	case T_DATA_IND: {
3304 		if ((so->so_state &
3305 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3306 		    (uiop->uio_resid != saved_resid) &&
3307 		    !(flags & MSG_PEEK)) {
3308 			sorecv_update_oobstate(so);
3309 		}
3310 
3311 		/*
3312 		 * Set msg_flags to MSG_EOR based on
3313 		 * MORE_flag and MOREDATA.
3314 		 */
3315 		mutex_enter(&so->so_lock);
3316 		so->so_state &= ~SS_SAVEDEOR;
3317 		if (!(tpr->data_ind.MORE_flag & 1)) {
3318 			if (!(rval.r_val1 & MOREDATA))
3319 				msg->msg_flags |= MSG_EOR;
3320 			else
3321 				so->so_state |= SS_SAVEDEOR;
3322 		}
3323 		freemsg(mp);
3324 		/*
3325 		 * If some data was received (i.e. not EOF) and the
3326 		 * read/recv* has not been satisfied wait for some more.
3327 		 */
3328 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3329 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3330 			mutex_exit(&so->so_lock);
3331 			pflag = opflag | MSG_NOMARK;
3332 			goto retry;
3333 		}
3334 		goto out_locked;
3335 	}
3336 	case T_UNITDATA_IND: {
3337 		void *addr;
3338 		t_uscalar_t addrlen;
3339 		void *abuf;
3340 		t_uscalar_t optlen;
3341 		void *opt;
3342 
3343 		if ((so->so_state &
3344 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3345 		    (uiop->uio_resid != saved_resid) &&
3346 		    !(flags & MSG_PEEK)) {
3347 			sorecv_update_oobstate(so);
3348 		}
3349 
3350 		if (namelen != 0) {
3351 			/* Caller wants source address */
3352 			addrlen = tpr->unitdata_ind.SRC_length;
3353 			addr = sogetoff(mp,
3354 			    tpr->unitdata_ind.SRC_offset,
3355 			    addrlen, 1);
3356 			if (addr == NULL) {
3357 				freemsg(mp);
3358 				error = EPROTO;
3359 				eprintsoline(so, error);
3360 				goto out;
3361 			}
3362 			if (so->so_family == AF_UNIX) {
3363 				/*
3364 				 * Can not use the transport level address.
3365 				 * If there is a SO_SRCADDR option carrying
3366 				 * the socket level address it will be
3367 				 * extracted below.
3368 				 */
3369 				addr = NULL;
3370 				addrlen = 0;
3371 			}
3372 		}
3373 		optlen = tpr->unitdata_ind.OPT_length;
3374 		if (optlen != 0) {
3375 			t_uscalar_t ncontrollen;
3376 
3377 			/*
3378 			 * Extract any source address option.
3379 			 * Determine how large cmsg buffer is needed.
3380 			 */
3381 			opt = sogetoff(mp,
3382 			    tpr->unitdata_ind.OPT_offset,
3383 			    optlen, __TPI_ALIGN_SIZE);
3384 
3385 			if (opt == NULL) {
3386 				freemsg(mp);
3387 				error = EPROTO;
3388 				eprintsoline(so, error);
3389 				goto out;
3390 			}
3391 			if (so->so_family == AF_UNIX)
3392 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3393 			ncontrollen = so_cmsglen(mp, opt, optlen,
3394 			    !(flags & MSG_XPG4_2));
			/*
			 * From here on controllen is the size of the cmsg
			 * buffer to build (when the caller asked for control
			 * data) rather than the caller's original length.
			 */
3395 			if (controllen != 0)
3396 				controllen = ncontrollen;
3397 			else if (ncontrollen != 0)
3398 				msg->msg_flags |= MSG_CTRUNC;
3399 		} else {
3400 			controllen = 0;
3401 		}
3402 
3403 		if (namelen != 0) {
3404 			/*
3405 			 * Return address to caller.
3406 			 * Caller handles truncation if length
3407 			 * exceeds msg_namelen.
3408 			 * NOTE: AF_UNIX NUL termination is ensured by
3409 			 * the sender's copyin_name().
3410 			 */
3411 			abuf = kmem_alloc(addrlen, KM_SLEEP);
3412 
3413 			bcopy(addr, abuf, addrlen);
3414 			msg->msg_name = abuf;
3415 			msg->msg_namelen = addrlen;
3416 		}
3417 
		/*
		 * opt is only assigned when optlen != 0; that is safe since
		 * controllen was forced to zero above in the other case.
		 */
3418 		if (controllen != 0) {
3419 			/*
3420 			 * Return control msg to caller.
3421 			 * Caller handles truncation if length
3422 			 * exceeds msg_controllen.
3423 			 */
3424 			control = kmem_zalloc(controllen, KM_SLEEP);
3425 
3426 			error = so_opt2cmsg(mp, opt, optlen,
3427 			    !(flags & MSG_XPG4_2),
3428 			    control, controllen);
3429 			if (error) {
3430 				freemsg(mp);
				/*
				 * NOTE(review): msg_name/msg_namelen are left
				 * pointing at the freed buffer; presumably
				 * callers ignore them on error - confirm.
				 */
3431 				if (msg->msg_namelen != 0)
3432 					kmem_free(msg->msg_name,
3433 					    msg->msg_namelen);
3434 				kmem_free(control, controllen);
3435 				eprintsoline(so, error);
3436 				goto out;
3437 			}
3438 			msg->msg_control = control;
3439 			msg->msg_controllen = controllen;
3440 		}
3441 
3442 		freemsg(mp);
3443 		goto out;
3444 	}
3445 	case T_OPTDATA_IND: {
3446 		struct T_optdata_req *tdr;
3447 		void *opt;
3448 		t_uscalar_t optlen;
3449 
3450 		if ((so->so_state &
3451 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3452 		    (uiop->uio_resid != saved_resid) &&
3453 		    !(flags & MSG_PEEK)) {
3454 			sorecv_update_oobstate(so);
3455 		}
3456 
		/*
		 * NOTE(review): the indication is read through
		 * struct T_optdata_req here and through data_ind.MORE_flag
		 * further down; presumably these share the layout of
		 * T_optdata_ind - confirm against the TPI headers.
		 */
3457 		tdr = (struct T_optdata_req *)mp->b_rptr;
3458 		optlen = tdr->OPT_length;
3459 		if (optlen != 0) {
3460 			t_uscalar_t ncontrollen;
3461 			/*
3462 			 * Determine how large cmsg buffer is needed.
3463 			 */
3464 			opt = sogetoff(mp,
3465 			    tpr->optdata_ind.OPT_offset,
3466 			    optlen, __TPI_ALIGN_SIZE);
3467 
3468 			if (opt == NULL) {
3469 				freemsg(mp);
3470 				error = EPROTO;
3471 				eprintsoline(so, error);
3472 				goto out;
3473 			}
3474 
3475 			ncontrollen = so_cmsglen(mp, opt, optlen,
3476 			    !(flags & MSG_XPG4_2));
3477 			if (controllen != 0)
3478 				controllen = ncontrollen;
3479 			else if (ncontrollen != 0)
3480 				msg->msg_flags |= MSG_CTRUNC;
3481 		} else {
3482 			controllen = 0;
3483 		}
3484 
3485 		if (controllen != 0) {
3486 			/*
3487 			 * Return control msg to caller.
3488 			 * Caller handles truncation if length
3489 			 * exceeds msg_controllen.
3490 			 */
3491 			control = kmem_zalloc(controllen, KM_SLEEP);
3492 
3493 			error = so_opt2cmsg(mp, opt, optlen,
3494 			    !(flags & MSG_XPG4_2),
3495 			    control, controllen);
3496 			if (error) {
3497 				freemsg(mp);
3498 				kmem_free(control, controllen);
3499 				eprintsoline(so, error);
3500 				goto out;
3501 			}
3502 			msg->msg_control = control;
3503 			msg->msg_controllen = controllen;
3504 		}
3505 
3506 		/*
3507 		 * Set msg_flags to MSG_EOR based on
3508 		 * DATA_flag and MOREDATA.
3509 		 */
3510 		mutex_enter(&so->so_lock);
3511 		so->so_state &= ~SS_SAVEDEOR;
3512 		if (!(tpr->data_ind.MORE_flag & 1)) {
3513 			if (!(rval.r_val1 & MOREDATA))
3514 				msg->msg_flags |= MSG_EOR;
3515 			else
3516 				so->so_state |= SS_SAVEDEOR;
3517 		}
3518 		freemsg(mp);
3519 		/*
3520 		 * If some data was received (i.e. not EOF) and the
3521 		 * read/recv* has not been satisfied wait for some more.
3522 		 * Not possible to wait if control info was received.
3523 		 */
3524 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3525 		    controllen == 0 &&
3526 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3527 			mutex_exit(&so->so_lock);
3528 			pflag = opflag | MSG_NOMARK;
3529 			goto retry;
3530 		}
3531 		goto out_locked;
3532 	}
3533 	case T_EXDATA_IND: {
3534 		dprintso(so, 1,
3535 		    ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3536 		    "state %s\n",
3537 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3538 		    saved_resid - uiop->uio_resid,
3539 		    pr_state(so->so_state, so->so_mode)));
3540 		/*
3541 		 * kstrgetmsg handles MSGMARK so there is nothing to
3542 		 * inspect in the T_EXDATA_IND.
3543 		 * strsock_proto makes the stream head queue the T_EXDATA_IND
3544 		 * as a separate message with no M_DATA component. Furthermore,
3545 		 * the stream head does not consolidate M_DATA messages onto
3546 		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3547 		 * remains a message by itself. This is needed since MSGMARK
3548 		 * marks both the whole message as well as the last byte
3549 		 * of the message.
3550 		 */
3551 		freemsg(mp);
3552 		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
3553 		if (flags & MSG_PEEK) {
3554 			/*
3555 			 * Even though we are peeking we consume the
3556 			 * T_EXDATA_IND thereby moving the mark information
3557 			 * to SS_RCVATMARK. Then the oob code below will
3558 			 * retry the peeking kstrgetmsg.
3559 			 * Note that the stream head read queue is
3560 			 * never flushed without holding SOREADLOCKED
3561 			 * thus the T_EXDATA_IND can not disappear
3562 			 * underneath us.
3563 			 */
3564 			dprintso(so, 1,
3565 			    ("sotpi_recvmsg: consume EXDATA_IND "
3566 			    "counts %d/%d state %s\n",
3567 			    sti->sti_oobsigcnt,
3568 			    sti->sti_oobcnt,
3569 			    pr_state(so->so_state, so->so_mode)));
3570 
			/*
			 * Rebuild pflag without MSG_IPEEK and use an
			 * infinite timeout for this one consuming read.
			 */
3571 			pflag = MSG_ANY | MSG_DELAYERROR;
3572 			if (so->so_mode & SM_ATOMIC)
3573 				pflag |= MSG_DISCARDTAIL;
3574 
3575 			pri = 0;
3576 			mp = NULL;
3577 
3578 			error = kstrgetmsg(SOTOV(so), &mp, uiop,
3579 			    &pri, &pflag, (clock_t)-1, &rval);
3580 			ASSERT(uiop->uio_resid == saved_resid);
3581 
3582 			if (error) {
3583 #ifdef SOCK_DEBUG
3584 				if (error != EWOULDBLOCK && error != EINTR) {
3585 					eprintsoline(so, error);
3586 				}
3587 #endif /* SOCK_DEBUG */
3588 				goto out;
3589 			}
3590 			ASSERT(mp);
3591 			tpr = (union T_primitives *)mp->b_rptr;
3592 			ASSERT(tpr->type == T_EXDATA_IND);
3593 			freemsg(mp);
3594 		} /* end "if (flags & MSG_PEEK)" */
3595 
3596 		/*
3597 		 * Decrement the number of queued and pending oob.
3598 		 *
3599 		 * SS_RCVATMARK is cleared when we read past a mark.
3600 		 * SS_HAVEOOBDATA is cleared when we've read past the
3601 		 * last mark.
3602 		 * SS_OOBPEND is cleared if we've read past the last
3603 		 * mark and no (new) SIGURG has been posted.
3604 		 */
3605 		mutex_enter(&so->so_lock);
3606 		ASSERT(so_verify_oobstate(so));
3607 		ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3608 		ASSERT(sti->sti_oobsigcnt > 0);
3609 		sti->sti_oobsigcnt--;
3610 		ASSERT(sti->sti_oobcnt > 0);
3611 		sti->sti_oobcnt--;
3612 		/*
3613 		 * Since the T_EXDATA_IND has been removed from the stream
3614 		 * head, but we have not read data past the mark,
3615 		 * sockfs needs to track that the socket is still at the mark.
3616 		 *
3617 		 * Since no data was received call kstrgetmsg again to wait
3618 		 * for data.
3619 		 */
3620 		so->so_state |= SS_RCVATMARK;
3621 		mutex_exit(&so->so_lock);
3622 		dprintso(so, 1,
3623 		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3624 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3625 		    pr_state(so->so_state, so->so_mode)));
3626 		pflag = opflag;
3627 		goto retry;
3628 	}
3629 	default:
3630 		cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3631 		    (void *)so, tpr->type, (void *)mp);
3632 		ASSERT(0);
3633 		freemsg(mp);
3634 		error = EPROTO;
3635 		eprintsoline(so, error);
3636 		goto out;
3637 	}
3638 	/* NOTREACHED */
	/*
	 * Common exit. "out" is for paths that do not hold so_lock,
	 * "out_locked" for paths that already hold it.
	 */
3639 out:
3640 	mutex_enter(&so->so_lock);
3641 out_locked:
3642 	so_unlock_read(so);	/* Clear SOREADLOCKED */
3643 	mutex_exit(&so->so_lock);
3644 	return (error);
3645 }
3646 
3647 /*
3648  * Sending data with options on a datagram socket.
3649  * Assumes caller has verified that SS_ISBOUND etc. are set.
3650  *
3651  * For AF_UNIX the destination address may be already in
3652  * internal form, as indicated by sti->sti_faddr_noxlate
3653  * or the MSG_SENDTO_NOXLATE flag.  Otherwise we need to
3654  * translate the destination address to internal form.
3655  *
3656  * The source address is passed as an option.  If passing
3657  * file descriptors, those are passed as file pointers in
3658  * another option.
3659  */
3660 static int
3661 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3662     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3663 {
3664 	struct T_unitdata_req	tudr;
3665 	mblk_t			*mp;
3666 	int			error;
3667 	void			*addr;
3668 	socklen_t		addrlen;
3669 	void			*src;
3670 	socklen_t		srclen;
3671 	ssize_t			len;
3672 	int			size;
3673 	struct T_opthdr		toh;
3674 	struct fdbuf		*fdbuf;
3675 	t_uscalar_t		optlen;
3676 	void			*fds;
3677 	int			fdlen;
3678 	sotpi_info_t		*sti = SOTOTPI(so);
3679 
3680 	ASSERT(name && namelen);
3681 	ASSERT(control && controllen);
3682 
3683 	len = uiop->uio_resid;
3684 	if (len > (ssize_t)sti->sti_tidu_size) {
3685 		return (EMSGSIZE);
3686 	}
3687 
3688 	if (sti->sti_faddr_noxlate == 0 &&
3689 	    (flags & MSG_SENDTO_NOXLATE) == 0) {
3690 		/*
3691 		 * Length and family checks.
3692 		 * Don't verify internal form.
3693 		 */
3694 		error = so_addr_verify(so, name, namelen);
3695 		if (error) {
3696 			eprintsoline(so, error);
3697 			return (error);
3698 		}
3699 	}
3700 
3701 	if (so->so_family == AF_UNIX) {
3702 		if (sti->sti_faddr_noxlate) {
3703 			/*
3704 			 * Already have a transport internal address. Do not
3705 			 * pass any (transport internal) source address.
3706 			 */
3707 			addr = name;
3708 			addrlen = namelen;
3709 			src = NULL;
3710 			srclen = 0;
3711 		} else if (flags & MSG_SENDTO_NOXLATE) {
3712 			/*
3713 			 * Have an internal form dest. address.
3714 			 * Pass the source address as usual.
3715 			 */
3716 			addr = name;
3717 			addrlen = namelen;
3718 			src = sti->sti_laddr_sa;
3719 			srclen = (socklen_t)sti->sti_laddr_len;
3720 		} else {
3721 			/*
3722 			 * Pass the sockaddr_un source address as an option
3723 			 * and translate the remote address.
3724 			 *
3725 			 * Note that this code does not prevent sti_laddr_sa
3726 			 * from changing while it is being used. Thus
3727 			 * if an unbind+bind occurs concurrently with this
3728 			 * send the peer might see a partially new and a
3729 			 * partially old "from" address.
3730 			 */
3731 			src = sti->sti_laddr_sa;
3732 			srclen = (socklen_t)sti->sti_laddr_len;
3733 			dprintso(so, 1,
3734 			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3735 			    srclen, src));
3736 			/*
3737 			 * The sendmsg caller specified a destination
3738 			 * address, which we must translate into our
3739 			 * internal form.  addr = &sti->sti_ux_taddr
3740 			 */
3741 			error = so_ux_addr_xlate(so, name, namelen,
3742 			    (flags & MSG_XPG4_2),
3743 			    &addr, &addrlen);
3744 			if (error) {
3745 				eprintsoline(so, error);
3746 				return (error);
3747 			}
3748 		}
3749 	} else {
3750 		addr = name;
3751 		addrlen = namelen;
3752 		src = NULL;
3753 		srclen = 0;
3754 	}
3755 	optlen = so_optlen(control, controllen,
3756 	    !(flags & MSG_XPG4_2));
3757 	tudr.PRIM_type = T_UNITDATA_REQ;
3758 	tudr.DEST_length = addrlen;
3759 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3760 	if (srclen != 0)
3761 		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3762 		    _TPI_ALIGN_TOPT(srclen));
3763 	else
3764 		tudr.OPT_length = optlen;
3765 	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3766 	    _TPI_ALIGN_TOPT(addrlen));
3767 
3768 	size = tudr.OPT_offset + tudr.OPT_length;
3769 
3770 	/*
3771 	 * File descriptors only when SM_FDPASSING set.
3772 	 */
3773 	error = so_getfdopt(control, controllen,
3774 	    !(flags & MSG_XPG4_2), &fds, &fdlen);
3775 	if (error)
3776 		return (error);
3777 	if (fdlen != -1) {
3778 		if (!(so->so_mode & SM_FDPASSING))
3779 			return (EOPNOTSUPP);
3780 
3781 		error = fdbuf_create(fds, fdlen, &fdbuf);
3782 		if (error)
3783 			return (error);
3784 
3785 		/*
3786 		 * Pre-allocate enough additional space for lower level modules
3787 		 * to append an option (e.g. see tl_unitdata). The following
3788 		 * is enough extra space for the largest option we might append.
3789 		 */
3790 		size += sizeof (struct T_opthdr) + ucredsize;
3791 		mp = fdbuf_allocmsg(size, fdbuf);
3792 	} else {
3793 		mp = soallocproto(size, _ALLOC_INTR, CRED());
3794 		if (mp == NULL) {
3795 			/*
3796 			 * Caught a signal waiting for memory.
3797 			 * Let send* return EINTR.
3798 			 */
3799 			return (EINTR);
3800 		}
3801 	}
3802 	soappendmsg(mp, &tudr, sizeof (tudr));
3803 	soappendmsg(mp, addr, addrlen);
3804 	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3805 
3806 	if (fdlen != -1) {
3807 		ASSERT(fdbuf != NULL);
3808 		toh.level = SOL_SOCKET;
3809 		toh.name = SO_FILEP;
3810 		toh.len = fdbuf->fd_size +
3811 		    (t_uscalar_t)sizeof (struct T_opthdr);
3812 		toh.status = 0;
3813 		soappendmsg(mp, &toh, sizeof (toh));
3814 		soappendmsg(mp, fdbuf, fdbuf->fd_size);
3815 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3816 	}
3817 	if (srclen != 0) {
3818 		/*
3819 		 * There is a AF_UNIX sockaddr_un to include as a source
3820 		 * address option.
3821 		 */
3822 		toh.level = SOL_SOCKET;
3823 		toh.name = SO_SRCADDR;
3824 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3825 		toh.status = 0;
3826 		soappendmsg(mp, &toh, sizeof (toh));
3827 		soappendmsg(mp, src, srclen);
3828 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3829 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3830 	}
3831 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3832 	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3833 	/*
3834 	 * Normally at most 3 bytes left in the message, but we might have
3835 	 * allowed for extra space if we're passing fd's through.
3836 	 */
3837 	ASSERT(MBLKL(mp) <= (ssize_t)size);
3838 
3839 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3840 	if (AU_AUDITING())
3841 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3842 
3843 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3844 #ifdef SOCK_DEBUG
3845 	if (error) {
3846 		eprintsoline(so, error);
3847 	}
3848 #endif /* SOCK_DEBUG */
3849 	return (error);
3850 }
3851 
3852 /*
3853  * Sending data with options on a connected stream socket.
3854  * Assumes caller has verified that SS_ISCONNECTED is set.
3855  */
3856 static int
3857 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3858     t_uscalar_t controllen, int flags)
3859 {
3860 	struct T_optdata_req	tdr;
3861 	mblk_t			*mp;
3862 	int			error;
3863 	ssize_t			iosize;
3864 	int			size;
3865 	struct fdbuf		*fdbuf;
3866 	t_uscalar_t		optlen;
3867 	void			*fds;
3868 	int			fdlen;
3869 	struct T_opthdr		toh;
3870 	sotpi_info_t		*sti = SOTOTPI(so);
3871 
3872 	dprintso(so, 1,
3873 	    ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3874 
3875 	/*
3876 	 * Has to be bound and connected. However, since no locks are
3877 	 * held the state could have changed after sotpi_sendmsg checked it
3878 	 * thus it is not possible to ASSERT on the state.
3879 	 */
3880 
3881 	/* Options on connection-oriented only when SM_OPTDATA set. */
3882 	if (!(so->so_mode & SM_OPTDATA))
3883 		return (EOPNOTSUPP);
3884 
3885 	do {
3886 		/*
3887 		 * Set the MORE flag if uio_resid does not fit in this
3888 		 * message or if the caller passed in "more".
3889 		 * Error for transports with zero tidu_size.
3890 		 */
3891 		tdr.PRIM_type = T_OPTDATA_REQ;
3892 		iosize = sti->sti_tidu_size;
3893 		if (iosize <= 0)
3894 			return (EMSGSIZE);
3895 		if (uiop->uio_resid > iosize) {
3896 			tdr.DATA_flag = 1;
3897 		} else {
3898 			if (more)
3899 				tdr.DATA_flag = 1;
3900 			else
3901 				tdr.DATA_flag = 0;
3902 			iosize = uiop->uio_resid;
3903 		}
3904 		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3905 		    tdr.DATA_flag, iosize));
3906 
3907 		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3908 		tdr.OPT_length = optlen;
3909 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3910 
3911 		size = (int)sizeof (tdr) + optlen;
3912 		/*
3913 		 * File descriptors only when SM_FDPASSING set.
3914 		 */
3915 		error = so_getfdopt(control, controllen,
3916 		    !(flags & MSG_XPG4_2), &fds, &fdlen);
3917 		if (error)
3918 			return (error);
3919 		if (fdlen != -1) {
3920 			if (!(so->so_mode & SM_FDPASSING))
3921 				return (EOPNOTSUPP);
3922 
3923 			error = fdbuf_create(fds, fdlen, &fdbuf);
3924 			if (error)
3925 				return (error);
3926 
3927 			/*
3928 			 * Pre-allocate enough additional space for lower level
3929 			 * modules to append an option (e.g. see tl_unitdata).
3930 			 * The following is enough extra space for the largest
3931 			 * option we might append.
3932 			 */
3933 			size += sizeof (struct T_opthdr) + ucredsize;
3934 			mp = fdbuf_allocmsg(size, fdbuf);
3935 		} else {
3936 			mp = soallocproto(size, _ALLOC_INTR, CRED());
3937 			if (mp == NULL) {
3938 				/*
3939 				 * Caught a signal waiting for memory.
3940 				 * Let send* return EINTR.
3941 				 */
3942 				return (EINTR);
3943 			}
3944 		}
3945 		soappendmsg(mp, &tdr, sizeof (tdr));
3946 
3947 		if (fdlen != -1) {
3948 			ASSERT(fdbuf != NULL);
3949 			toh.level = SOL_SOCKET;
3950 			toh.name = SO_FILEP;
3951 			toh.len = fdbuf->fd_size +
3952 			    (t_uscalar_t)sizeof (struct T_opthdr);
3953 			toh.status = 0;
3954 			soappendmsg(mp, &toh, sizeof (toh));
3955 			soappendmsg(mp, fdbuf, fdbuf->fd_size);
3956 			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3957 		}
3958 		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3959 		/*
3960 		 * Normally at most 3 bytes left in the message, but we might
3961 		 * have allowed for extra space if we're passing fd's through.
3962 		 */
3963 		ASSERT(MBLKL(mp) <= (ssize_t)size);
3964 
3965 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3966 
3967 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3968 		    0, MSG_BAND, 0);
3969 		if (error) {
3970 			eprintsoline(so, error);
3971 			return (error);
3972 		}
3973 		control = NULL;
3974 		if (uiop->uio_resid > 0) {
3975 			/*
3976 			 * Recheck for fatal errors. Fail write even though
3977 			 * some data have been written. This is consistent
3978 			 * with strwrite semantics and BSD sockets semantics.
3979 			 */
3980 			if (so->so_state & SS_CANTSENDMORE) {
3981 				eprintsoline(so, error);
3982 				return (EPIPE);
3983 			}
3984 			if (so->so_error != 0) {
3985 				mutex_enter(&so->so_lock);
3986 				error = sogeterr(so, B_TRUE);
3987 				mutex_exit(&so->so_lock);
3988 				if (error != 0) {
3989 					eprintsoline(so, error);
3990 					return (error);
3991 				}
3992 			}
3993 		}
3994 	} while (uiop->uio_resid > 0);
3995 	return (0);
3996 }
3997 
3998 /*
3999  * Sending data on a datagram socket.
4000  * Assumes caller has verified that SS_ISBOUND etc. are set.
4001  *
4002  * For AF_UNIX the destination address may be already in
4003  * internal form, as indicated by sti->sti_faddr_noxlate
4004  * or the MSG_SENDTO_NOXLATE flag.  Otherwise we need to
4005  * translate the destination address to internal form.
4006  *
4007  * The source address is passed as an option.
4008  */
4009 int
4010 sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
4011     struct uio *uiop, int flags)
4012 {
4013 	struct T_unitdata_req	tudr;
4014 	mblk_t			*mp;
4015 	int			error;
4016 	void			*addr;
4017 	socklen_t		addrlen;
4018 	void			*src;
4019 	socklen_t		srclen;
4020 	ssize_t			len;
4021 	sotpi_info_t		*sti = SOTOTPI(so);
4022 
4023 	ASSERT(name != NULL && namelen != 0);
4024 
4025 	len = uiop->uio_resid;
4026 	if (len > sti->sti_tidu_size) {
4027 		error = EMSGSIZE;
4028 		goto done;
4029 	}
4030 
4031 	if (sti->sti_faddr_noxlate == 0 &&
4032 	    (flags & MSG_SENDTO_NOXLATE) == 0) {
4033 		/*
4034 		 * Length and family checks.
4035 		 * Don't verify internal form.
4036 		 */
4037 		error = so_addr_verify(so, name, namelen);
4038 		if (error != 0)
4039 			goto done;
4040 	}
4041 
4042 	if (sti->sti_direct)	/* Never on AF_UNIX */
4043 		return (sodgram_direct(so, name, namelen, uiop, flags));
4044 
4045 	if (so->so_family == AF_UNIX) {
4046 		if (sti->sti_faddr_noxlate) {
4047 			/*
4048 			 * Already have a transport internal address. Do not
4049 			 * pass any (transport internal) source address.
4050 			 */
4051 			addr = name;
4052 			addrlen = namelen;
4053 			src = NULL;
4054 			srclen = 0;
4055 		} else if (flags & MSG_SENDTO_NOXLATE) {
4056 			/*
4057 			 * Have an internal form dest. address.
4058 			 * Pass the source address as usual.
4059 			 */
4060 			addr = name;
4061 			addrlen = namelen;
4062 			src = sti->sti_laddr_sa;
4063 			srclen = (socklen_t)sti->sti_laddr_len;
4064 		} else {
4065 			/*
4066 			 * Pass the sockaddr_un source address as an option
4067 			 * and translate the remote address.
4068 			 *
4069 			 * Note that this code does not prevent sti_laddr_sa
4070 			 * from changing while it is being used. Thus
4071 			 * if an unbind+bind occurs concurrently with this
4072 			 * send the peer might see a partially new and a
4073 			 * partially old "from" address.
4074 			 */
4075 			src = sti->sti_laddr_sa;
4076 			srclen = (socklen_t)sti->sti_laddr_len;
4077 			dprintso(so, 1,
4078 			    ("sosend_dgram UNIX: srclen %d, src %p\n",
4079 			    srclen, src));
4080 			/*
4081 			 * The sendmsg caller specified a destination
4082 			 * address, which we must translate into our
4083 			 * internal form.  addr = &sti->sti_ux_taddr
4084 			 */
4085 			error = so_ux_addr_xlate(so, name, namelen,
4086 			    (flags & MSG_XPG4_2),
4087 			    &addr, &addrlen);
4088 			if (error) {
4089 				eprintsoline(so, error);
4090 				goto done;
4091 			}
4092 		}
4093 	} else {
4094 		addr = name;
4095 		addrlen = namelen;
4096 		src = NULL;
4097 		srclen = 0;
4098 	}
4099 	tudr.PRIM_type = T_UNITDATA_REQ;
4100 	tudr.DEST_length = addrlen;
4101 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4102 	if (srclen == 0) {
4103 		tudr.OPT_length = 0;
4104 		tudr.OPT_offset = 0;
4105 
4106 		mp = soallocproto2(&tudr, sizeof (tudr),
4107 		    addr, addrlen, 0, _ALLOC_INTR, CRED());
4108 		if (mp == NULL) {
4109 			/*
4110 			 * Caught a signal waiting for memory.
4111 			 * Let send* return EINTR.
4112 			 */
4113 			error = EINTR;
4114 			goto done;
4115 		}
4116 	} else {
4117 		/*
4118 		 * There is a AF_UNIX sockaddr_un to include as a source
4119 		 * address option.
4120 		 */
4121 		struct T_opthdr toh;
4122 		ssize_t size;
4123 
4124 		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
4125 		    _TPI_ALIGN_TOPT(srclen));
4126 		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
4127 		    _TPI_ALIGN_TOPT(addrlen));
4128 
4129 		toh.level = SOL_SOCKET;
4130 		toh.name = SO_SRCADDR;
4131 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
4132 		toh.status = 0;
4133 
4134 		size = tudr.OPT_offset + tudr.OPT_length;
4135 		mp = soallocproto2(&tudr, sizeof (tudr),
4136 		    addr, addrlen, size, _ALLOC_INTR, CRED());
4137 		if (mp == NULL) {
4138 			/*
4139 			 * Caught a signal waiting for memory.
4140 			 * Let send* return EINTR.
4141 			 */
4142 			error = EINTR;
4143 			goto done;
4144 		}
4145 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
4146 		soappendmsg(mp, &toh, sizeof (toh));
4147 		soappendmsg(mp, src, srclen);
4148 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
4149 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
4150 	}
4151 
4152 	if (AU_AUDITING())
4153 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4154 
4155 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4156 done:
4157 #ifdef SOCK_DEBUG
4158 	if (error) {
4159 		eprintsoline(so, error);
4160 	}
4161 #endif /* SOCK_DEBUG */
4162 	return (error);
4163 }
4164 
4165 /*
4166  * Sending data on a connected stream socket.
4167  * Assumes caller has verified that SS_ISCONNECTED is set.
4168  */
4169 int
4170 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
4171     int sflag)
4172 {
4173 	struct T_data_req	tdr;
4174 	mblk_t			*mp;
4175 	int			error;
4176 	ssize_t			iosize;
4177 	sotpi_info_t		*sti = SOTOTPI(so);
4178 
4179 	dprintso(so, 1,
4180 	    ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
4181 	    (void *)so, uiop->uio_resid, prim, sflag));
4182 
4183 	/*
4184 	 * Has to be bound and connected. However, since no locks are
4185 	 * held the state could have changed after sotpi_sendmsg checked it
4186 	 * thus it is not possible to ASSERT on the state.
4187 	 */
4188 
4189 	do {
4190 		/*
4191 		 * Set the MORE flag if uio_resid does not fit in this
4192 		 * message or if the caller passed in "more".
4193 		 * Error for transports with zero tidu_size.
4194 		 */
4195 		tdr.PRIM_type = prim;
4196 		iosize = sti->sti_tidu_size;
4197 		if (iosize <= 0)
4198 			return (EMSGSIZE);
4199 		if (uiop->uio_resid > iosize) {
4200 			tdr.MORE_flag = 1;
4201 		} else {
4202 			if (more)
4203 				tdr.MORE_flag = 1;
4204 			else
4205 				tdr.MORE_flag = 0;
4206 			iosize = uiop->uio_resid;
4207 		}
4208 		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4209 		    prim, tdr.MORE_flag, iosize));
4210 		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
4211 		if (mp == NULL) {
4212 			/*
4213 			 * Caught a signal waiting for memory.
4214 			 * Let send* return EINTR.
4215 			 */
4216 			return (EINTR);
4217 		}
4218 
4219 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4220 		    0, sflag | MSG_BAND, 0);
4221 		if (error) {
4222 			eprintsoline(so, error);
4223 			return (error);
4224 		}
4225 		if (uiop->uio_resid > 0) {
4226 			/*
4227 			 * Recheck for fatal errors. Fail write even though
4228 			 * some data have been written. This is consistent
4229 			 * with strwrite semantics and BSD sockets semantics.
4230 			 */
4231 			if (so->so_state & SS_CANTSENDMORE) {
4232 				eprintsoline(so, error);
4233 				return (EPIPE);
4234 			}
4235 			if (so->so_error != 0) {
4236 				mutex_enter(&so->so_lock);
4237 				error = sogeterr(so, B_TRUE);
4238 				mutex_exit(&so->so_lock);
4239 				if (error != 0) {
4240 					eprintsoline(so, error);
4241 					return (error);
4242 				}
4243 			}
4244 		}
4245 	} while (uiop->uio_resid > 0);
4246 	return (0);
4247 }
4248 
4249 /*
4250  * Check the state for errors and call the appropriate send function.
4251  *
4252  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4253  * this function issues a setsockopt to toggle SO_DONTROUTE before and
4254  * after sending the message.
4255  *
4256  * The caller may optionally specify a destination address, for either
4257  * stream or datagram sockets.  This table summarizes the cases:
4258  *
4259  *    Socket type    Dest. given    Connected    Result
4260  *    -----------    -----------    ---------    --------------
4261  *    Stream         *              Yes	         send to conn. addr.
4262  *    Stream         *              No           error ENOTCONN
4263  *    Dgram          yes            *            send to given addr.
4264  *    Dgram          no             yes          send to conn. addr.
4265  *    Dgram          no             no	         error EDESTADDRREQ
4266  *
4267  * There are subtleties around the destination address when using
4268  * AF_UNIX datagram sockets.  When the sendmsg call specifies the
4269  * destination address, it's in (struct sockaddr_un) form and we
4270  * need to translate it to our internal form (struct so_ux_addr).
4271  *
4272  * When the sendmsg call does not specify a destination address
4273  * we're using the peer address saved during sotpi_connect, and
4274  * that address is already in internal form.  In this case, the
4275  * (internal only) flag MSG_SENDTO_NOXLATE is set in the flags
4276  * passed to sosend_dgram or sosend_dgramcmsg to indicate that
4277  * those functions should skip translation to internal form.
4278  * Avoiding that translation is not only more efficient, but it's
4279  * also necessary when a process does a connect on an AF_UNIX
4280  * datagram socket and then drops privileges.  After the process
4281  * has dropped privileges, it may no longer be able to lookup the
4282  * the external name in the filesystem, but it should still be
4283  * able to send messages on the connected socket by leaving the
4284  * destination name unspecified.
4285  *
4286  * Yet more subtleties arise with sockets connected by socketpair(),
4287  * which puts internal form addresses in the fields where normally
4288  * the external form is found, and sets sti_faddr_noxlate=1, which
4289  * (like flag MSG_SENDTO_NOXLATE) causes the sosend_dgram functions
4290  * to skip translation of destination addresses to internal form.
4291  * However, beware that the flag sti_faddr_noxlate=1 also triggers
4292  * different behaviour almost everywhere AF_UNIX addresses appear.
4293  */
4294 static int
4295 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
4296     struct cred *cr)
4297 {
4298 	int		so_state;
4299 	int		so_mode;
4300 	int		error;
4301 	struct sockaddr *name;
4302 	t_uscalar_t	namelen;
4303 	int		dontroute;
4304 	int		flags;
4305 	sotpi_info_t	*sti = SOTOTPI(so);
4306 
4307 	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4308 	    (void *)so, (void *)msg, msg->msg_flags,
4309 	    pr_state(so->so_state, so->so_mode), so->so_error));
4310 
4311 	if (so->so_version == SOV_STREAM) {
4312 		/* The imaginary "sockmod" has been popped - act as a stream */
4313 		so_update_attrs(so, SOMOD);
4314 		return (strwrite(SOTOV(so), uiop, cr));
4315 	}
4316 
4317 	mutex_enter(&so->so_lock);
4318 	so_state = so->so_state;
4319 
4320 	if (so_state & SS_CANTSENDMORE) {
4321 		mutex_exit(&so->so_lock);
4322 		return (EPIPE);
4323 	}
4324 
4325 	if (so->so_error != 0) {
4326 		error = sogeterr(so, B_TRUE);
4327 		if (error != 0) {
4328 			mutex_exit(&so->so_lock);
4329 			return (error);
4330 		}
4331 	}
4332 
4333 	name = (struct sockaddr *)msg->msg_name;
4334 	namelen = msg->msg_namelen;
4335 	flags = msg->msg_flags;
4336 
4337 	/*
4338 	 * Historically, this function does not validate the flags
4339 	 * passed in, and any errant bits are ignored.  However,
4340 	 * we would not want any such errant flag bits accidently
4341 	 * being treated as one of the internal-only flags, so
4342 	 * clear the internal-only flag bits.
4343 	 */
4344 	flags &= ~MSG_SENDTO_NOXLATE;
4345 
4346 	so_mode = so->so_mode;
4347 
4348 	if (name == NULL) {
4349 		if (!(so_state & SS_ISCONNECTED)) {
4350 			mutex_exit(&so->so_lock);
4351 			if (so_mode & SM_CONNREQUIRED)
4352 				return (ENOTCONN);
4353 			else
4354 				return (EDESTADDRREQ);
4355 		}
4356 		/*
4357 		 * This is a connected socket.
4358 		 */
4359 		if (so_mode & SM_CONNREQUIRED) {
4360 			/*
4361 			 * This is a connected STREAM socket,
4362 			 * destination not specified.
4363 			 */
4364 			name = NULL;
4365 			namelen = 0;
4366 		} else {
4367 			/*
4368 			 * Datagram send on connected socket with
4369 			 * the destination name not specified.
4370 			 * Use the peer address from connect.
4371 			 */
4372 			if (so->so_family == AF_UNIX) {
4373 				/*
4374 				 * Use the (internal form) address saved
4375 				 * in sotpi_connect.  See above.
4376 				 */
4377 				name = (void *)&sti->sti_ux_faddr;
4378 				namelen = sizeof (sti->sti_ux_faddr);
4379 				flags |= MSG_SENDTO_NOXLATE;
4380 			} else {
4381 				ASSERT(sti->sti_faddr_sa);
4382 				name = sti->sti_faddr_sa;
4383 				namelen = (t_uscalar_t)sti->sti_faddr_len;
4384 			}
4385 		}
4386 	} else {
4387 		/*
4388 		 * Sendmsg specifies a destination name
4389 		 */
4390 		if (!(so_state & SS_ISCONNECTED) &&
4391 		    (so_mode & SM_CONNREQUIRED)) {
4392 			/* i.e. TCP not connected */
4393 			mutex_exit(&so->so_lock);
4394 			return (ENOTCONN);
4395 		}
4396 		/*
4397 		 * Ignore the address on connection-oriented sockets.
4398 		 * Just like BSD this code does not generate an error for
4399 		 * TCP (a CONNREQUIRED socket) when sending to an address
4400 		 * passed in with sendto/sendmsg. Instead the data is
4401 		 * delivered on the connection as if no address had been
4402 		 * supplied.
4403 		 */
4404 		if ((so_state & SS_ISCONNECTED) &&
4405 		    !(so_mode & SM_CONNREQUIRED)) {
4406 			mutex_exit(&so->so_lock);
4407 			return (EISCONN);
4408 		}
4409 		if (!(so_state & SS_ISBOUND)) {
4410 			so_lock_single(so);	/* Set SOLOCKED */
4411 			error = sotpi_bind(so, NULL, 0,
4412 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4413 			so_unlock_single(so, SOLOCKED);
4414 			if (error) {
4415 				mutex_exit(&so->so_lock);
4416 				eprintsoline(so, error);
4417 				return (error);
4418 			}
4419 		}
4420 		/*
4421 		 * Handle delayed datagram errors. These are only queued
4422 		 * when the application sets SO_DGRAM_ERRIND.
4423 		 * Return the error if we are sending to the address
4424 		 * that was returned in the last T_UDERROR_IND.
4425 		 * If sending to some other address discard the delayed
4426 		 * error indication.
4427 		 */
4428 		if (sti->sti_delayed_error) {
4429 			struct T_uderror_ind	*tudi;
4430 			void			*addr;
4431 			t_uscalar_t		addrlen;
4432 			boolean_t		match = B_FALSE;
4433 
4434 			ASSERT(sti->sti_eaddr_mp);
4435 			error = sti->sti_delayed_error;
4436 			sti->sti_delayed_error = 0;
4437 			tudi =
4438 			    (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4439 			addrlen = tudi->DEST_length;
4440 			addr = sogetoff(sti->sti_eaddr_mp,
4441 			    tudi->DEST_offset, addrlen, 1);
4442 			ASSERT(addr);	/* Checked by strsock_proto */
4443 			switch (so->so_family) {
4444 			case AF_INET: {
4445 				/* Compare just IP address and port */
4446 				sin_t *sin1 = (sin_t *)name;
4447 				sin_t *sin2 = (sin_t *)addr;
4448 
4449 				if (addrlen == sizeof (sin_t) &&
4450 				    namelen == addrlen &&
4451 				    sin1->sin_port == sin2->sin_port &&
4452 				    sin1->sin_addr.s_addr ==
4453 				    sin2->sin_addr.s_addr)
4454 					match = B_TRUE;
4455 				break;
4456 			}
4457 			case AF_INET6: {
4458 				/* Compare just IP address and port. Not flow */
4459 				sin6_t *sin1 = (sin6_t *)name;
4460 				sin6_t *sin2 = (sin6_t *)addr;
4461 
4462 				if (addrlen == sizeof (sin6_t) &&
4463 				    namelen == addrlen &&
4464 				    sin1->sin6_port == sin2->sin6_port &&
4465 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4466 				    &sin2->sin6_addr))
4467 					match = B_TRUE;
4468 				break;
4469 			}
4470 			case AF_UNIX:
4471 			default:
4472 				if (namelen == addrlen &&
4473 				    bcmp(name, addr, namelen) == 0)
4474 					match = B_TRUE;
4475 			}
4476 			if (match) {
4477 				freemsg(sti->sti_eaddr_mp);
4478 				sti->sti_eaddr_mp = NULL;
4479 				mutex_exit(&so->so_lock);
4480 #ifdef DEBUG
4481 				dprintso(so, 0,
4482 				    ("sockfs delayed error %d for %s\n",
4483 				    error,
4484 				    pr_addr(so->so_family, name, namelen)));
4485 #endif /* DEBUG */
4486 				return (error);
4487 			}
4488 			freemsg(sti->sti_eaddr_mp);
4489 			sti->sti_eaddr_mp = NULL;
4490 		}
4491 	}
4492 	mutex_exit(&so->so_lock);
4493 
4494 	dontroute = 0;
4495 	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4496 		uint32_t	val;
4497 
4498 		val = 1;
4499 		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4500 		    &val, (t_uscalar_t)sizeof (val), cr);
4501 		if (error)
4502 			return (error);
4503 		dontroute = 1;
4504 	}
4505 
4506 	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4507 		error = EOPNOTSUPP;
4508 		goto done;
4509 	}
4510 	if (msg->msg_controllen != 0) {
4511 		if (!(so_mode & SM_CONNREQUIRED)) {
4512 			so_update_attrs(so, SOMOD);
4513 			error = sosend_dgramcmsg(so, name, namelen, uiop,
4514 			    msg->msg_control, msg->msg_controllen, flags);
4515 		} else {
4516 			if (flags & MSG_OOB) {
4517 				/* Can't generate T_EXDATA_REQ with options */
4518 				error = EOPNOTSUPP;
4519 				goto done;
4520 			}
4521 			so_update_attrs(so, SOMOD);
4522 			error = sosend_svccmsg(so, uiop,
4523 			    !(flags & MSG_EOR),
4524 			    msg->msg_control, msg->msg_controllen,
4525 			    flags);
4526 		}
4527 		goto done;
4528 	}
4529 
4530 	so_update_attrs(so, SOMOD);
4531 	if (!(so_mode & SM_CONNREQUIRED)) {
4532 		/*
4533 		 * If there is no SO_DONTROUTE to turn off return immediately
4534 		 * from send_dgram. This can allow tail-call optimizations.
4535 		 */
4536 		if (!dontroute) {
4537 			return (sosend_dgram(so, name, namelen, uiop, flags));
4538 		}
4539 		error = sosend_dgram(so, name, namelen, uiop, flags);
4540 	} else {
4541 		t_scalar_t prim;
4542 		int sflag;
4543 
4544 		/* Ignore msg_name in the connected state */
4545 		if (flags & MSG_OOB) {
4546 			prim = T_EXDATA_REQ;
4547 			/*
4548 			 * Send down T_EXDATA_REQ even if there is flow
4549 			 * control for data.
4550 			 */
4551 			sflag = MSG_IGNFLOW;
4552 		} else {
4553 			if (so_mode & SM_BYTESTREAM) {
4554 				/* Byte stream transport - use write */
4555 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4556 
4557 				/* Send M_DATA messages */
4558 				if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
4559 				    (error = nl7c_data(so, uiop)) >= 0) {
4560 					/* NL7C consumed the data */
4561 					return (error);
4562 				}
4563 				/*
4564 				 * If there is no SO_DONTROUTE to turn off,
4565 				 * sti_direct is on, and there is no flow
4566 				 * control, we can take the fast path.
4567 				 */
4568 				if (!dontroute && sti->sti_direct != 0 &&
4569 				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4570 					return (sostream_direct(so, uiop,
4571 					    NULL, cr));
4572 				}
4573 				error = strwrite(SOTOV(so), uiop, cr);
4574 				goto done;
4575 			}
4576 			prim = T_DATA_REQ;
4577 			sflag = 0;
4578 		}
4579 		/*
4580 		 * If there is no SO_DONTROUTE to turn off return immediately
4581 		 * from sosend_svc. This can allow tail-call optimizations.
4582 		 */
4583 		if (!dontroute)
4584 			return (sosend_svc(so, uiop, prim,
4585 			    !(flags & MSG_EOR), sflag));
4586 		error = sosend_svc(so, uiop, prim,
4587 		    !(flags & MSG_EOR), sflag);
4588 	}
4589 	ASSERT(dontroute);
4590 done:
4591 	if (dontroute) {
4592 		uint32_t	val;
4593 
4594 		val = 0;
4595 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4596 		    &val, (t_uscalar_t)sizeof (val), cr);
4597 	}
4598 	return (error);
4599 }
4600 
4601 /*
4602  * kstrwritemp() has very similar semantics as that of strwrite().
4603  * The main difference is it obtains mblks from the caller and also
4604  * does not do any copy as done in strwrite() from user buffers to
4605  * kernel buffers.
4606  *
4607  * Currently, this routine is used by sendfile to send data allocated
4608  * within the kernel without any copying. This interface does not use the
4609  * synchronous stream interface as synch. stream interface implies
4610  * copying.
4611  */
4612 int
4613 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4614 {
4615 	struct stdata *stp;
4616 	struct queue *wqp;
4617 	mblk_t *newmp;
4618 	char waitflag;
4619 	int tempmode;
4620 	int error = 0;
4621 	int done = 0;
4622 	struct sonode *so;
4623 	boolean_t direct;
4624 
4625 	ASSERT(vp->v_stream);
4626 	stp = vp->v_stream;
4627 
4628 	so = VTOSO(vp);
4629 	direct = _SOTOTPI(so)->sti_direct;
4630 
4631 	/*
4632 	 * This is the sockfs direct fast path. canputnext() need
4633 	 * not be accurate so we don't grab the sd_lock here. If
4634 	 * we get flow-controlled, we grab sd_lock just before the
4635 	 * do..while loop below to emulate what strwrite() does.
4636 	 */
4637 	wqp = stp->sd_wrq;
4638 	if (canputnext(wqp) && direct &&
4639 	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4640 		return (sostream_direct(so, NULL, mp, CRED()));
4641 	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4642 		/* Fast check of flags before acquiring the lock */
4643 		mutex_enter(&stp->sd_lock);
4644 		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4645 		mutex_exit(&stp->sd_lock);
4646 		if (error != 0) {
4647 			if (!(stp->sd_flag & STPLEX) &&
4648 			    (stp->sd_wput_opt & SW_SIGPIPE)) {
4649 				error = EPIPE;
4650 			}
4651 			return (error);
4652 		}
4653 	}
4654 
4655 	waitflag = WRITEWAIT;
4656 	if (stp->sd_flag & OLDNDELAY)
4657 		tempmode = fmode & ~FNDELAY;
4658 	else
4659 		tempmode = fmode;
4660 
4661 	mutex_enter(&stp->sd_lock);
4662 	do {
4663 		if (canputnext(wqp)) {
4664 			mutex_exit(&stp->sd_lock);
4665 			if (stp->sd_wputdatafunc != NULL) {
4666 				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4667 				    NULL, NULL, NULL);
4668 				if (newmp == NULL) {
4669 					/* The caller will free mp */
4670 					return (ECOMM);
4671 				}
4672 				mp = newmp;
4673 			}
4674 			putnext(wqp, mp);
4675 			return (0);
4676 		}
4677 		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4678 		    &done);
4679 	} while (error == 0 && !done);
4680 
4681 	mutex_exit(&stp->sd_lock);
4682 	/*
4683 	 * EAGAIN tells the application to try again. ENOMEM
4684 	 * is returned only if the memory allocation size
4685 	 * exceeds the physical limits of the system. ENOMEM
4686 	 * can't be true here.
4687 	 */
4688 	if (error == ENOMEM)
4689 		error = EAGAIN;
4690 	return (error);
4691 }
4692 
4693 /* ARGSUSED */
4694 static int
4695 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4696     struct cred *cr, mblk_t **mpp)
4697 {
4698 	int error;
4699 
4700 	switch (so->so_family) {
4701 	case AF_INET:
4702 	case AF_INET6:
4703 	case AF_UNIX:
4704 		break;
4705 	default:
4706 		return (EAFNOSUPPORT);
4707 
4708 	}
4709 
4710 	if (so->so_state & SS_CANTSENDMORE)
4711 		return (EPIPE);
4712 
4713 	if (so->so_type != SOCK_STREAM)
4714 		return (EOPNOTSUPP);
4715 
4716 	if ((so->so_state & SS_ISCONNECTED) == 0)
4717 		return (ENOTCONN);
4718 
4719 	error = kstrwritemp(so->so_vnode, *mpp, fflag);
4720 	if (error == 0)
4721 		*mpp = NULL;
4722 	return (error);
4723 }
4724 
4725 /*
4726  * Sending data on a datagram socket.
4727  * Assumes caller has verified that SS_ISBOUND etc. are set.
4728  */
4729 /* ARGSUSED */
4730 static int
4731 sodgram_direct(struct sonode *so, struct sockaddr *name,
4732     socklen_t namelen, struct uio *uiop, int flags)
4733 {
4734 	struct T_unitdata_req	tudr;
4735 	mblk_t			*mp = NULL;
4736 	int			error = 0;
4737 	void			*addr;
4738 	socklen_t		addrlen;
4739 	ssize_t			len;
4740 	struct stdata		*stp = SOTOV(so)->v_stream;
4741 	int			so_state;
4742 	queue_t			*udp_wq;
4743 	boolean_t		connected;
4744 	mblk_t			*mpdata = NULL;
4745 	sotpi_info_t		*sti = SOTOTPI(so);
4746 	uint32_t		auditing = AU_AUDITING();
4747 
4748 	ASSERT(name != NULL && namelen != 0);
4749 	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4750 	ASSERT(!(so->so_mode & SM_EXDATA));
4751 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4752 	ASSERT(SOTOV(so)->v_type == VSOCK);
4753 
4754 	/* Caller checked for proper length */
4755 	len = uiop->uio_resid;
4756 	ASSERT(len <= sti->sti_tidu_size);
4757 
4758 	/* Length and family checks have been done by caller */
4759 	ASSERT(name->sa_family == so->so_family);
4760 	ASSERT(so->so_family == AF_INET ||
4761 	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4762 	ASSERT(so->so_family == AF_INET6 ||
4763 	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4764 
4765 	addr = name;
4766 	addrlen = namelen;
4767 
4768 	if (stp->sd_sidp != NULL &&
4769 	    (error = straccess(stp, JCWRITE)) != 0)
4770 		goto done;
4771 
4772 	so_state = so->so_state;
4773 
4774 	connected = so_state & SS_ISCONNECTED;
4775 	if (!connected) {
4776 		tudr.PRIM_type = T_UNITDATA_REQ;
4777 		tudr.DEST_length = addrlen;
4778 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4779 		tudr.OPT_length = 0;
4780 		tudr.OPT_offset = 0;
4781 
4782 		mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4783 		    _ALLOC_INTR, CRED());
4784 		if (mp == NULL) {
4785 			/*
4786 			 * Caught a signal waiting for memory.
4787 			 * Let send* return EINTR.
4788 			 */
4789 			error = EINTR;
4790 			goto done;
4791 		}
4792 	}
4793 
4794 	/*
4795 	 * For UDP we don't break up the copyin into smaller pieces
4796 	 * as in the TCP case.  That means if ENOMEM is returned by
4797 	 * mcopyinuio() then the uio vector has not been modified at
4798 	 * all and we fallback to either strwrite() or kstrputmsg()
4799 	 * below.  Note also that we never generate priority messages
4800 	 * from here.
4801 	 */
4802 	udp_wq = stp->sd_wrq->q_next;
4803 	if (canput(udp_wq) &&
4804 	    (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4805 		ASSERT(DB_TYPE(mpdata) == M_DATA);
4806 		ASSERT(uiop->uio_resid == 0);
4807 		if (!connected)
4808 			linkb(mp, mpdata);
4809 		else
4810 			mp = mpdata;
4811 		if (auditing)
4812 			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4813 
4814 		/* Always returns 0... */
4815 		return (udp_wput(udp_wq, mp));
4816 	}
4817 
4818 	ASSERT(mpdata == NULL);
4819 	if (error != 0 && error != ENOMEM) {
4820 		freemsg(mp);
4821 		return (error);
4822 	}
4823 
4824 	/*
4825 	 * For connected, let strwrite() handle the blocking case.
4826 	 * Otherwise we fall thru and use kstrputmsg().
4827 	 */
4828 	if (connected)
4829 		return (strwrite(SOTOV(so), uiop, CRED()));
4830 
4831 	if (auditing)
4832 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4833 
4834 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4835 done:
4836 #ifdef SOCK_DEBUG
4837 	if (error != 0) {
4838 		eprintsoline(so, error);
4839 	}
4840 #endif /* SOCK_DEBUG */
4841 	return (error);
4842 }
4843 
4844 int
4845 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4846 {
4847 	struct stdata *stp = SOTOV(so)->v_stream;
4848 	ssize_t iosize, rmax, maxblk;
4849 	queue_t *tcp_wq = stp->sd_wrq->q_next;
4850 	mblk_t *newmp;
4851 	int error = 0, wflag = 0;
4852 
4853 	ASSERT(so->so_mode & SM_BYTESTREAM);
4854 	ASSERT(SOTOV(so)->v_type == VSOCK);
4855 
4856 	if (stp->sd_sidp != NULL &&
4857 	    (error = straccess(stp, JCWRITE)) != 0)
4858 		return (error);
4859 
4860 	if (uiop == NULL) {
4861 		/*
4862 		 * kstrwritemp() should have checked sd_flag and
4863 		 * flow-control before coming here.  If we end up
4864 		 * here it means that we can simply pass down the
4865 		 * data to tcp.
4866 		 */
4867 		ASSERT(mp != NULL);
4868 		if (stp->sd_wputdatafunc != NULL) {
4869 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4870 			    NULL, NULL, NULL);
4871 			if (newmp == NULL) {
4872 				/* The caller will free mp */
4873 				return (ECOMM);
4874 			}
4875 			mp = newmp;
4876 		}
4877 		/* Always returns 0... */
4878 		return (tcp_wput(tcp_wq, mp));
4879 	}
4880 
4881 	/* Fallback to strwrite() to do proper error handling */
4882 	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4883 		return (strwrite(SOTOV(so), uiop, cr));
4884 
4885 	rmax = stp->sd_qn_maxpsz;
4886 	ASSERT(rmax >= 0 || rmax == INFPSZ);
4887 	if (rmax == 0 || uiop->uio_resid <= 0)
4888 		return (0);
4889 
4890 	if (rmax == INFPSZ)
4891 		rmax = uiop->uio_resid;
4892 
4893 	maxblk = stp->sd_maxblk;
4894 
4895 	for (;;) {
4896 		iosize = MIN(uiop->uio_resid, rmax);
4897 
4898 		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4899 		if (mp == NULL) {
4900 			/*
4901 			 * Fallback to strwrite() for ENOMEM; if this
4902 			 * is our first time in this routine and the uio
4903 			 * vector has not been modified, we will end up
4904 			 * calling strwrite() without any flag set.
4905 			 */
4906 			if (error == ENOMEM)
4907 				goto slow_send;
4908 			else
4909 				return (error);
4910 		}
4911 		ASSERT(uiop->uio_resid >= 0);
4912 		/*
4913 		 * If mp is non-NULL and ENOMEM is set, it means that
4914 		 * mcopyinuio() was able to break down some of the user
4915 		 * data into one or more mblks.  Send the partial data
4916 		 * to tcp and let the rest be handled in strwrite().
4917 		 */
4918 		ASSERT(error == 0 || error == ENOMEM);
4919 		if (stp->sd_wputdatafunc != NULL) {
4920 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4921 			    NULL, NULL, NULL);
4922 			if (newmp == NULL) {
4923 				/* The caller will free mp */
4924 				return (ECOMM);
4925 			}
4926 			mp = newmp;
4927 		}
4928 		(void) tcp_wput(tcp_wq, mp);	/* Always returns 0 anyway. */
4929 
4930 		wflag |= NOINTR;
4931 
4932 		if (uiop->uio_resid == 0) {	/* No more data; we're done */
4933 			ASSERT(error == 0);
4934 			break;
4935 		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4936 		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4937 slow_send:
4938 			/*
4939 			 * We were able to send down partial data using
4940 			 * the direct call interface, but are now relying
4941 			 * on strwrite() to handle the non-fastpath cases.
4942 			 * If the socket is blocking we will sleep in
4943 			 * strwaitq() until write is permitted, otherwise,
4944 			 * we will need to return the amount of bytes
4945 			 * written so far back to the app.  This is the
4946 			 * reason why we pass NOINTR flag to strwrite()
4947 			 * for non-blocking socket, because we don't want
4948 			 * to return EAGAIN when portion of the user data
4949 			 * has actually been sent down.
4950 			 */
4951 			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4952 		}
4953 	}
4954 	return (0);
4955 }
4956 
4957 /*
4958  * Update sti_faddr by asking the transport (unless AF_UNIX).
4959  */
4960 /* ARGSUSED */
4961 int
4962 sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4963     boolean_t accept, struct cred *cr)
4964 {
4965 	struct strbuf	strbuf;
4966 	int		error = 0, res;
4967 	void		*addr;
4968 	t_uscalar_t	addrlen;
4969 	k_sigset_t	smask;
4970 	sotpi_info_t	*sti = SOTOTPI(so);
4971 
4972 	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4973 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4974 
4975 	ASSERT(*namelen > 0);
4976 	mutex_enter(&so->so_lock);
4977 	so_lock_single(so);	/* Set SOLOCKED */
4978 
4979 	if (accept) {
4980 		bcopy(sti->sti_faddr_sa, name,
4981 		    MIN(*namelen, sti->sti_faddr_len));
4982 		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4983 		goto done;
4984 	}
4985 
4986 	if (!(so->so_state & SS_ISCONNECTED)) {
4987 		error = ENOTCONN;
4988 		goto done;
4989 	}
4990 	/* Added this check for X/Open */
4991 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4992 		error = EINVAL;
4993 		if (xnet_check_print) {
4994 			printf("sockfs: X/Open getpeername check => EINVAL\n");
4995 		}
4996 		goto done;
4997 	}
4998 
4999 	if (sti->sti_faddr_valid) {
5000 		bcopy(sti->sti_faddr_sa, name,
5001 		    MIN(*namelen, sti->sti_faddr_len));
5002 		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
5003 		goto done;
5004 	}
5005 
5006 #ifdef DEBUG
5007 	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
5008 	    pr_addr(so->so_family, sti->sti_faddr_sa,
5009 	    (t_uscalar_t)sti->sti_faddr_len)));
5010 #endif /* DEBUG */
5011 
5012 	if (so->so_family == AF_UNIX) {
5013 		/* Transport has different name space - return local info */
5014 		if (sti->sti_faddr_noxlate)
5015 			*namelen = 0;
5016 		error = 0;
5017 		goto done;
5018 	}
5019 
5020 	ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
5021 
5022 	ASSERT(sti->sti_faddr_sa);
5023 	/* Allocate local buffer to use with ioctl */
5024 	addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
5025 	mutex_exit(&so->so_lock);
5026 	addr = kmem_alloc(addrlen, KM_SLEEP);
5027 
5028 	/*
5029 	 * Issue TI_GETPEERNAME with signals masked.
5030 	 * Put the result in sti_faddr_sa so that getpeername works after
5031 	 * a shutdown(output).
5032 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
5033 	 * back to the socket.
5034 	 */
5035 	strbuf.buf = addr;
5036 	strbuf.maxlen = addrlen;
5037 	strbuf.len = 0;
5038 
5039 	sigintr(&smask, 0);
5040 	res = 0;
5041 	ASSERT(cr);
5042 	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
5043 	    0, K_TO_K, cr, &res);
5044 	sigunintr(&smask);
5045 
5046 	mutex_enter(&so->so_lock);
5047 	/*
5048 	 * If there is an error record the error in so_error put don't fail
5049 	 * the getpeername. Instead fallback on the recorded
5050 	 * sti->sti_faddr_sa.
5051 	 */
5052 	if (error) {
5053 		/*
5054 		 * Various stream head errors can be returned to the ioctl.
5055 		 * However, it is impossible to determine which ones of
5056 		 * these are really socket level errors that were incorrectly
5057 		 * consumed by the ioctl. Thus this code silently ignores the
5058 		 * error - to code explicitly does not reinstate the error
5059 		 * using soseterror().
5060 		 * Experiments have shows that at least this set of
5061 		 * errors are reported and should not be reinstated on the
5062 		 * socket:
5063 		 *	EINVAL	E.g. if an I_LINK was in effect when
5064 		 *		getpeername was called.
5065 		 *	EPIPE	The ioctl error semantics prefer the write
5066 		 *		side error over the read side error.
5067 		 *	ENOTCONN The transport just got disconnected but
5068 		 *		sockfs had not yet seen the T_DISCON_IND
5069 		 *		when issuing the ioctl.
5070 		 */
5071 		error = 0;
5072 	} else if (res == 0 && strbuf.len > 0 &&
5073 	    (so->so_state & SS_ISCONNECTED)) {
5074 		ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
5075 		sti->sti_faddr_len = (socklen_t)strbuf.len;
5076 		bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
5077 		sti->sti_faddr_valid = 1;
5078 
5079 		bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
5080 		*namelen = sti->sti_faddr_len;
5081 	}
5082 	kmem_free(addr, addrlen);
5083 #ifdef DEBUG
5084 	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
5085 	    pr_addr(so->so_family, sti->sti_faddr_sa,
5086 	    (t_uscalar_t)sti->sti_faddr_len)));
5087 #endif /* DEBUG */
5088 done:
5089 	so_unlock_single(so, SOLOCKED);
5090 	mutex_exit(&so->so_lock);
5091 	return (error);
5092 }
5093 
5094 /*
5095  * Update sti_laddr by asking the transport (unless AF_UNIX).
5096  */
5097 int
5098 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
5099     struct cred *cr)
5100 {
5101 	struct strbuf	strbuf;
5102 	int		error = 0, res;
5103 	void		*addr;
5104 	t_uscalar_t	addrlen;
5105 	k_sigset_t	smask;
5106 	sotpi_info_t	*sti = SOTOTPI(so);
5107 
5108 	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
5109 	    (void *)so, pr_state(so->so_state, so->so_mode)));
5110 
5111 	ASSERT(*namelen > 0);
5112 	mutex_enter(&so->so_lock);
5113 	so_lock_single(so);	/* Set SOLOCKED */
5114 
5115 #ifdef DEBUG
5116 
5117 	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
5118 	    pr_addr(so->so_family, sti->sti_laddr_sa,
5119 	    (t_uscalar_t)sti->sti_laddr_len)));
5120 #endif /* DEBUG */
5121 	if (sti->sti_laddr_valid) {
5122 		bcopy(sti->sti_laddr_sa, name,
5123 		    MIN(*namelen, sti->sti_laddr_len));
5124 		*namelen = sti->sti_laddr_len;
5125 		goto done;
5126 	}
5127 
5128 	if (so->so_family == AF_UNIX) {
5129 		/*
5130 		 * Transport has different name space - return local info. If we
5131 		 * have enough space, let consumers know the family.
5132 		 */
5133 		if (*namelen >= sizeof (sa_family_t)) {
5134 			name->sa_family = AF_UNIX;
5135 			*namelen = sizeof (sa_family_t);
5136 		} else {
5137 			*namelen = 0;
5138 		}
5139 		error = 0;
5140 		goto done;
5141 	}
5142 	if (!(so->so_state & SS_ISBOUND)) {
5143 		/* If not bound, then nothing to return. */
5144 		error = 0;
5145 		goto done;
5146 	}
5147 
5148 	/* Allocate local buffer to use with ioctl */
5149 	addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
5150 	mutex_exit(&so->so_lock);
5151 	addr = kmem_alloc(addrlen, KM_SLEEP);
5152 
5153 	/*
5154 	 * Issue TI_GETMYNAME with signals masked.
5155 	 * Put the result in sti_laddr_sa so that getsockname works after
5156 	 * a shutdown(output).
5157 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
5158 	 * back to the socket.
5159 	 */
5160 	strbuf.buf = addr;
5161 	strbuf.maxlen = addrlen;
5162 	strbuf.len = 0;
5163 
5164 	sigintr(&smask, 0);
5165 	res = 0;
5166 	ASSERT(cr);
5167 	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
5168 	    0, K_TO_K, cr, &res);
5169 	sigunintr(&smask);
5170 
5171 	mutex_enter(&so->so_lock);
5172 	/*
5173 	 * If there is an error record the error in so_error put don't fail
5174 	 * the getsockname. Instead fallback on the recorded
5175 	 * sti->sti_laddr_sa.
5176 	 */
5177 	if (error) {
5178 		/*
5179 		 * Various stream head errors can be returned to the ioctl.
5180 		 * However, it is impossible to determine which ones of
5181 		 * these are really socket level errors that were incorrectly
5182 		 * consumed by the ioctl. Thus this code silently ignores the
5183 		 * error - to code explicitly does not reinstate the error
5184 		 * using soseterror().
5185 		 * Experiments have shows that at least this set of
5186 		 * errors are reported and should not be reinstated on the
5187 		 * socket:
5188 		 *	EINVAL	E.g. if an I_LINK was in effect when
5189 		 *		getsockname was called.
5190 		 *	EPIPE	The ioctl error semantics prefer the write
5191 		 *		side error over the read side error.
5192 		 */
5193 		error = 0;
5194 	} else if (res == 0 && strbuf.len > 0 &&
5195 	    (so->so_state & SS_ISBOUND)) {
5196 		ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
5197 		sti->sti_laddr_len = (socklen_t)strbuf.len;
5198 		bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
5199 		sti->sti_laddr_valid = 1;
5200 
5201 		bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
5202 		*namelen = sti->sti_laddr_len;
5203 	}
5204 	kmem_free(addr, addrlen);
5205 #ifdef DEBUG
5206 	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
5207 	    pr_addr(so->so_family, sti->sti_laddr_sa,
5208 	    (t_uscalar_t)sti->sti_laddr_len)));
5209 #endif /* DEBUG */
5210 done:
5211 	so_unlock_single(so, SOLOCKED);
5212 	mutex_exit(&so->so_lock);
5213 	return (error);
5214 }
5215 
5216 /*
5217  * Get socket options. For SOL_SOCKET options some options are handled
5218  * by the sockfs while others use the value recorded in the sonode as a
5219  * fallback should the T_SVR4_OPTMGMT_REQ fail.
5220  *
5221  * On the return most *optlenp bytes are copied to optval.
5222  */
5223 /* ARGSUSED */
5224 int
5225 sotpi_getsockopt(struct sonode *so, int level, int option_name,
5226     void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5227 {
5228 	struct T_optmgmt_req	optmgmt_req;
5229 	struct T_optmgmt_ack	*optmgmt_ack;
5230 	struct opthdr		oh;
5231 	struct opthdr		*opt_res;
5232 	mblk_t			*mp = NULL;
5233 	int			error = 0;
5234 	void			*option = NULL;	/* Set if fallback value */
5235 	t_uscalar_t		maxlen = *optlenp;
5236 	t_uscalar_t		len;
5237 	uint32_t		value;
5238 	struct timeval		tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5239 	struct timeval32	tmo_val32;
5240 	struct so_snd_bufinfo	snd_bufinfo;	/* used for zero copy */
5241 
5242 	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5243 	    (void *)so, level, option_name, optval, (void *)optlenp,
5244 	    pr_state(so->so_state, so->so_mode)));
5245 
5246 	mutex_enter(&so->so_lock);
5247 	so_lock_single(so);	/* Set SOLOCKED */
5248 
5249 	/*
5250 	 * Check for SOL_SOCKET options.
5251 	 * Certain SOL_SOCKET options are returned directly whereas
5252 	 * others only provide a default (fallback) value should
5253 	 * the T_SVR4_OPTMGMT_REQ fail.
5254 	 */
5255 	if (level == SOL_SOCKET) {
5256 		/* Check parameters */
5257 		switch (option_name) {
5258 		case SO_TYPE:
5259 		case SO_ERROR:
5260 		case SO_DEBUG:
5261 		case SO_ACCEPTCONN:
5262 		case SO_REUSEADDR:
5263 		case SO_KEEPALIVE:
5264 		case SO_DONTROUTE:
5265 		case SO_BROADCAST:
5266 		case SO_USELOOPBACK:
5267 		case SO_OOBINLINE:
5268 		case SO_SNDBUF:
5269 		case SO_RCVBUF:
5270 #ifdef notyet
5271 		case SO_SNDLOWAT:
5272 		case SO_RCVLOWAT:
5273 #endif /* notyet */
5274 		case SO_DOMAIN:
5275 		case SO_DGRAM_ERRIND:
5276 			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5277 				error = EINVAL;
5278 				eprintsoline(so, error);
5279 				goto done2;
5280 			}
5281 			break;
5282 		case SO_RCVTIMEO:
5283 		case SO_SNDTIMEO:
5284 			if (get_udatamodel() == DATAMODEL_NONE ||
5285 			    get_udatamodel() == DATAMODEL_NATIVE) {
5286 				if (maxlen < sizeof (struct timeval)) {
5287 					error = EINVAL;
5288 					eprintsoline(so, error);
5289 					goto done2;
5290 				}
5291 			} else {
5292 				if (maxlen < sizeof (struct timeval32)) {
5293 					error = EINVAL;
5294 					eprintsoline(so, error);
5295 					goto done2;
5296 				}
5297 
5298 			}
5299 			break;
5300 		case SO_LINGER:
5301 			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5302 				error = EINVAL;
5303 				eprintsoline(so, error);
5304 				goto done2;
5305 			}
5306 			break;
5307 		case SO_SND_BUFINFO:
5308 			if (maxlen < (t_uscalar_t)
5309 			    sizeof (struct so_snd_bufinfo)) {
5310 				error = EINVAL;
5311 				eprintsoline(so, error);
5312 				goto done2;
5313 			}
5314 			break;
5315 		}
5316 
5317 		len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
5318 
5319 		switch (option_name) {
5320 		case SO_TYPE:
5321 			value = so->so_type;
5322 			option = &value;
5323 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5324 
5325 		case SO_ERROR:
5326 			value = sogeterr(so, B_TRUE);
5327 			option = &value;
5328 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5329 
5330 		case SO_ACCEPTCONN:
5331 			if (so->so_state & SS_ACCEPTCONN)
5332 				value = SO_ACCEPTCONN;
5333 			else
5334 				value = 0;
5335 #ifdef DEBUG
5336 			if (value) {
5337 				dprintso(so, 1,
5338 				    ("sotpi_getsockopt: 0x%x is set\n",
5339 				    option_name));
5340 			} else {
5341 				dprintso(so, 1,
5342 				    ("sotpi_getsockopt: 0x%x not set\n",
5343 				    option_name));
5344 			}
5345 #endif /* DEBUG */
5346 			option = &value;
5347 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5348 
5349 		case SO_DEBUG:
5350 		case SO_REUSEADDR:
5351 		case SO_KEEPALIVE:
5352 		case SO_DONTROUTE:
5353 		case SO_BROADCAST:
5354 		case SO_USELOOPBACK:
5355 		case SO_OOBINLINE:
5356 		case SO_DGRAM_ERRIND:
5357 			value = (so->so_options & option_name);
5358 #ifdef DEBUG
5359 			if (value) {
5360 				dprintso(so, 1,
5361 				    ("sotpi_getsockopt: 0x%x is set\n",
5362 				    option_name));
5363 			} else {
5364 				dprintso(so, 1,
5365 				    ("sotpi_getsockopt: 0x%x not set\n",
5366 				    option_name));
5367 			}
5368 #endif /* DEBUG */
5369 			option = &value;
5370 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5371 
5372 		/*
5373 		 * The following options are only returned by sockfs when the
5374 		 * T_SVR4_OPTMGMT_REQ fails.
5375 		 */
5376 		case SO_LINGER:
5377 			option = &so->so_linger;
5378 			len = (t_uscalar_t)sizeof (struct linger);
5379 			break;
5380 		case SO_SNDBUF: {
5381 			ssize_t lvalue;
5382 
5383 			/*
5384 			 * If the option has not been set then get a default
5385 			 * value from the read queue. This value is
5386 			 * returned if the transport fails
5387 			 * the T_SVR4_OPTMGMT_REQ.
5388 			 */
5389 			lvalue = so->so_sndbuf;
5390 			if (lvalue == 0) {
5391 				mutex_exit(&so->so_lock);
5392 				(void) strqget(strvp2wq(SOTOV(so))->q_next,
5393 				    QHIWAT, 0, &lvalue);
5394 				mutex_enter(&so->so_lock);
5395 				dprintso(so, 1,
5396 				    ("got SO_SNDBUF %ld from q\n", lvalue));
5397 			}
5398 			value = (int)lvalue;
5399 			option = &value;
5400 			len = (t_uscalar_t)sizeof (so->so_sndbuf);
5401 			break;
5402 		}
5403 		case SO_RCVBUF: {
5404 			ssize_t lvalue;
5405 
5406 			/*
5407 			 * If the option has not been set then get a default
5408 			 * value from the read queue. This value is
5409 			 * returned if the transport fails
5410 			 * the T_SVR4_OPTMGMT_REQ.
5411 			 *
5412 			 * XXX If SO_RCVBUF has been set and this is an
5413 			 * XPG 4.2 application then do not ask the transport
5414 			 * since the transport might adjust the value and not
5415 			 * return exactly what was set by the application.
5416 			 * For non-XPG 4.2 application we return the value
5417 			 * that the transport is actually using.
5418 			 */
5419 			lvalue = so->so_rcvbuf;
5420 			if (lvalue == 0) {
5421 				mutex_exit(&so->so_lock);
5422 				(void) strqget(RD(strvp2wq(SOTOV(so))),
5423 				    QHIWAT, 0, &lvalue);
5424 				mutex_enter(&so->so_lock);
5425 				dprintso(so, 1,
5426 				    ("got SO_RCVBUF %ld from q\n", lvalue));
5427 			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
5428 				value = (int)lvalue;
5429 				option = &value;
5430 				goto copyout;	/* skip asking transport */
5431 			}
5432 			value = (int)lvalue;
5433 			option = &value;
5434 			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5435 			break;
5436 		}
5437 		case SO_DOMAIN:
5438 			value = so->so_family;
5439 			option = &value;
5440 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5441 
5442 #ifdef notyet
5443 		/*
5444 		 * We do not implement the semantics of these options
5445 		 * thus we shouldn't implement the options either.
5446 		 */
5447 		case SO_SNDLOWAT:
5448 			value = so->so_sndlowat;
5449 			option = &value;
5450 			break;
5451 		case SO_RCVLOWAT:
5452 			value = so->so_rcvlowat;
5453 			option = &value;
5454 			break;
5455 #endif /* notyet */
5456 		case SO_SNDTIMEO:
5457 		case SO_RCVTIMEO: {
5458 			clock_t val;
5459 
5460 			if (option_name == SO_RCVTIMEO)
5461 				val = drv_hztousec(so->so_rcvtimeo);
5462 			else
5463 				val = drv_hztousec(so->so_sndtimeo);
5464 			tmo_val.tv_sec = val / (1000 * 1000);
5465 			tmo_val.tv_usec = val % (1000 * 1000);
5466 			if (get_udatamodel() == DATAMODEL_NONE ||
5467 			    get_udatamodel() == DATAMODEL_NATIVE) {
5468 				option = &tmo_val;
5469 				len = sizeof (struct timeval);
5470 			} else {
5471 				TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5472 				option = &tmo_val32;
5473 				len = sizeof (struct timeval32);
5474 			}
5475 			break;
5476 		}
5477 		case SO_SND_BUFINFO: {
5478 			snd_bufinfo.sbi_wroff =
5479 			    (so->so_proto_props).sopp_wroff;
5480 			snd_bufinfo.sbi_maxblk =
5481 			    (so->so_proto_props).sopp_maxblk;
5482 			snd_bufinfo.sbi_maxpsz =
5483 			    (so->so_proto_props).sopp_maxpsz;
5484 			snd_bufinfo.sbi_tail =
5485 			    (so->so_proto_props).sopp_tail;
5486 			option = &snd_bufinfo;
5487 			len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5488 			break;
5489 		}
5490 		}
5491 	}
5492 
5493 	mutex_exit(&so->so_lock);
5494 
5495 	/* Send request */
5496 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5497 	optmgmt_req.MGMT_flags = T_CHECK;
5498 	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5499 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5500 
5501 	oh.level = level;
5502 	oh.name = option_name;
5503 	oh.len = maxlen;
5504 
5505 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5506 	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5507 	/* Let option management work in the presence of data flow control */
5508 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5509 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5510 	mp = NULL;
5511 	mutex_enter(&so->so_lock);
5512 	if (error) {
5513 		eprintsoline(so, error);
5514 		goto done2;
5515 	}
5516 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5517 	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5518 	if (error) {
5519 		if (option != NULL) {
5520 			/* We have a fallback value */
5521 			error = 0;
5522 			goto copyout;
5523 		}
5524 		eprintsoline(so, error);
5525 		goto done2;
5526 	}
5527 	ASSERT(mp);
5528 	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5529 	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5530 	    optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5531 	if (opt_res == NULL) {
5532 		if (option != NULL) {
5533 			/* We have a fallback value */
5534 			error = 0;
5535 			goto copyout;
5536 		}
5537 		error = EPROTO;
5538 		eprintsoline(so, error);
5539 		goto done;
5540 	}
5541 	option = &opt_res[1];
5542 
5543 	/* check to ensure that the option is within bounds */
5544 	if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5545 	    (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5546 		if (option != NULL) {
5547 			/* We have a fallback value */
5548 			error = 0;
5549 			goto copyout;
5550 		}
5551 		error = EPROTO;
5552 		eprintsoline(so, error);
5553 		goto done;
5554 	}
5555 
5556 	len = opt_res->len;
5557 
5558 copyout: {
5559 		t_uscalar_t size = MIN(len, maxlen);
5560 		bcopy(option, optval, size);
5561 		bcopy(&size, optlenp, sizeof (size));
5562 	}
5563 done:
5564 	freemsg(mp);
5565 done2:
5566 	so_unlock_single(so, SOLOCKED);
5567 	mutex_exit(&so->so_lock);
5568 
5569 	return (error);
5570 }
5571 
5572 /*
5573  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5574  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5575  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5576  * setsockopt has to work even if the transport does not support the option.
5577  */
5578 /* ARGSUSED */
5579 int
5580 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5581     const void *optval, t_uscalar_t optlen, struct cred *cr)
5582 {
5583 	struct T_optmgmt_req	optmgmt_req;
5584 	struct opthdr		oh;
5585 	mblk_t			*mp;
5586 	int			error = 0;
5587 	boolean_t		handled = B_FALSE;
5588 
5589 	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5590 	    (void *)so, level, option_name, optval, optlen,
5591 	    pr_state(so->so_state, so->so_mode)));
5592 
5593 	/* X/Open requires this check */
5594 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5595 		if (xnet_check_print)
5596 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
5597 		return (EINVAL);
5598 	}
5599 
5600 	mutex_enter(&so->so_lock);
5601 	so_lock_single(so);	/* Set SOLOCKED */
5602 	mutex_exit(&so->so_lock);
5603 
5604 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5605 	optmgmt_req.MGMT_flags = T_NEGOTIATE;
5606 	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5607 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5608 
5609 	oh.level = level;
5610 	oh.name = option_name;
5611 	oh.len = optlen;
5612 
5613 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5614 	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5615 	/* Let option management work in the presence of data flow control */
5616 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5617 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5618 	mp = NULL;
5619 	mutex_enter(&so->so_lock);
5620 	if (error) {
5621 		eprintsoline(so, error);
5622 		goto done2;
5623 	}
5624 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5625 	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5626 	if (error) {
5627 		eprintsoline(so, error);
5628 		goto done;
5629 	}
5630 	ASSERT(mp);
5631 	/* No need to verify T_optmgmt_ack */
5632 	freemsg(mp);
5633 done:
5634 	/*
5635 	 * Check for SOL_SOCKET options and record their values.
5636 	 * If we know about a SOL_SOCKET parameter and the transport
5637 	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5638 	 * EPROTO) we let the setsockopt succeed.
5639 	 */
5640 	if (level == SOL_SOCKET) {
5641 		/* Check parameters */
5642 		switch (option_name) {
5643 		case SO_DEBUG:
5644 		case SO_REUSEADDR:
5645 		case SO_KEEPALIVE:
5646 		case SO_DONTROUTE:
5647 		case SO_BROADCAST:
5648 		case SO_USELOOPBACK:
5649 		case SO_OOBINLINE:
5650 		case SO_SNDBUF:
5651 		case SO_RCVBUF:
5652 #ifdef notyet
5653 		case SO_SNDLOWAT:
5654 		case SO_RCVLOWAT:
5655 #endif /* notyet */
5656 		case SO_DGRAM_ERRIND:
5657 			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5658 				error = EINVAL;
5659 				eprintsoline(so, error);
5660 				goto done2;
5661 			}
5662 			ASSERT(optval);
5663 			handled = B_TRUE;
5664 			break;
5665 		case SO_SNDTIMEO:
5666 		case SO_RCVTIMEO:
5667 			if (get_udatamodel() == DATAMODEL_NONE ||
5668 			    get_udatamodel() == DATAMODEL_NATIVE) {
5669 				if (optlen != sizeof (struct timeval)) {
5670 					error = EINVAL;
5671 					eprintsoline(so, error);
5672 					goto done2;
5673 				}
5674 			} else {
5675 				if (optlen != sizeof (struct timeval32)) {
5676 					error = EINVAL;
5677 					eprintsoline(so, error);
5678 					goto done2;
5679 				}
5680 			}
5681 			ASSERT(optval);
5682 			handled = B_TRUE;
5683 			break;
5684 		case SO_LINGER:
5685 			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5686 				error = EINVAL;
5687 				eprintsoline(so, error);
5688 				goto done2;
5689 			}
5690 			ASSERT(optval);
5691 			handled = B_TRUE;
5692 			break;
5693 		}
5694 
5695 #define	intvalue	(*(int32_t *)optval)
5696 
5697 		switch (option_name) {
5698 		case SO_TYPE:
5699 		case SO_ERROR:
5700 		case SO_ACCEPTCONN:
5701 			/* Can't be set */
5702 			error = ENOPROTOOPT;
5703 			goto done2;
5704 		case SO_LINGER: {
5705 			struct linger *l = (struct linger *)optval;
5706 
5707 			so->so_linger.l_linger = l->l_linger;
5708 			if (l->l_onoff) {
5709 				so->so_linger.l_onoff = SO_LINGER;
5710 				so->so_options |= SO_LINGER;
5711 			} else {
5712 				so->so_linger.l_onoff = 0;
5713 				so->so_options &= ~SO_LINGER;
5714 			}
5715 			break;
5716 		}
5717 
5718 		case SO_DEBUG:
5719 #ifdef SOCK_TEST
5720 			if (intvalue & 2)
5721 				sock_test_timelimit = 10 * hz;
5722 			else
5723 				sock_test_timelimit = 0;
5724 
5725 			if (intvalue & 4)
5726 				do_useracc = 0;
5727 			else
5728 				do_useracc = 1;
5729 #endif /* SOCK_TEST */
5730 			/* FALLTHRU */
5731 		case SO_REUSEADDR:
5732 		case SO_KEEPALIVE:
5733 		case SO_DONTROUTE:
5734 		case SO_BROADCAST:
5735 		case SO_USELOOPBACK:
5736 		case SO_OOBINLINE:
5737 		case SO_DGRAM_ERRIND:
5738 			if (intvalue != 0) {
5739 				dprintso(so, 1,
5740 				    ("socket_setsockopt: setting 0x%x\n",
5741 				    option_name));
5742 				so->so_options |= option_name;
5743 			} else {
5744 				dprintso(so, 1,
5745 				    ("socket_setsockopt: clearing 0x%x\n",
5746 				    option_name));
5747 				so->so_options &= ~option_name;
5748 			}
5749 			break;
5750 		/*
5751 		 * The following options are only returned by us when the
5752 		 * transport layer fails.
5753 		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5754 		 * since the transport might adjust the value and not
5755 		 * return exactly what was set by the application.
5756 		 */
5757 		case SO_SNDBUF:
5758 			so->so_sndbuf = intvalue;
5759 			break;
5760 		case SO_RCVBUF:
5761 			so->so_rcvbuf = intvalue;
5762 			break;
5763 		case SO_RCVPSH:
5764 			so->so_rcv_timer_interval = intvalue;
5765 			break;
5766 #ifdef notyet
5767 		/*
5768 		 * We do not implement the semantics of these options
5769 		 * thus we shouldn't implement the options either.
5770 		 */
5771 		case SO_SNDLOWAT:
5772 			so->so_sndlowat = intvalue;
5773 			break;
5774 		case SO_RCVLOWAT:
5775 			so->so_rcvlowat = intvalue;
5776 			break;
5777 #endif /* notyet */
5778 		case SO_SNDTIMEO:
5779 		case SO_RCVTIMEO: {
5780 			struct timeval tl;
5781 			clock_t val;
5782 
5783 			if (get_udatamodel() == DATAMODEL_NONE ||
5784 			    get_udatamodel() == DATAMODEL_NATIVE)
5785 				bcopy(&tl, (struct timeval *)optval,
5786 				    sizeof (struct timeval));
5787 			else
5788 				TIMEVAL32_TO_TIMEVAL(&tl,
5789 				    (struct timeval32 *)optval);
5790 			val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5791 			if (option_name == SO_RCVTIMEO)
5792 				so->so_rcvtimeo = drv_usectohz(val);
5793 			else
5794 				so->so_sndtimeo = drv_usectohz(val);
5795 			break;
5796 		}
5797 		}
5798 #undef	intvalue
5799 
5800 		if (error) {
5801 			if ((error == ENOPROTOOPT || error == EPROTO ||
5802 			    error == EINVAL) && handled) {
5803 				dprintso(so, 1,
5804 				    ("setsockopt: ignoring error %d for 0x%x\n",
5805 				    error, option_name));
5806 				error = 0;
5807 			}
5808 		}
5809 	}
5810 done2:
5811 	so_unlock_single(so, SOLOCKED);
5812 	mutex_exit(&so->so_lock);
5813 	return (error);
5814 }
5815 
5816 /*
5817  * sotpi_close() is called when the last open reference goes away.
5818  */
5819 /* ARGSUSED */
5820 int
5821 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5822 {
5823 	struct vnode *vp = SOTOV(so);
5824 	dev_t dev;
5825 	int error = 0;
5826 	sotpi_info_t *sti = SOTOTPI(so);
5827 
5828 	dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5829 	    (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5830 
5831 	dev = sti->sti_dev;
5832 
5833 	ASSERT(STREAMSTAB(getmajor(dev)));
5834 
5835 	mutex_enter(&so->so_lock);
5836 	so_lock_single(so);	/* Set SOLOCKED */
5837 
5838 	ASSERT(so_verify_oobstate(so));
5839 
5840 	if (sti->sti_nl7c_flags & NL7C_ENABLED) {
5841 		sti->sti_nl7c_flags = 0;
5842 		nl7c_close(so);
5843 	}
5844 
5845 	if (vp->v_stream != NULL) {
5846 		vnode_t *ux_vp;
5847 
5848 		if (so->so_family == AF_UNIX) {
5849 			/* Could avoid this when CANTSENDMORE for !dgram */
5850 			so_unix_close(so);
5851 		}
5852 
5853 		mutex_exit(&so->so_lock);
5854 		/*
5855 		 * Disassemble the linkage from the AF_UNIX underlying file
5856 		 * system vnode to this socket (by atomically clearing
5857 		 * v_stream in vn_rele_stream) before strclose clears sd_vnode
5858 		 * and frees the stream head.
5859 		 */
5860 		if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5861 			ASSERT(ux_vp->v_stream);
5862 			sti->sti_ux_bound_vp = NULL;
5863 			vn_rele_stream(ux_vp);
5864 		}
5865 		error = strclose(vp, flag, cr);
5866 		vp->v_stream = NULL;
5867 		mutex_enter(&so->so_lock);
5868 	}
5869 
5870 	/*
5871 	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
5872 	 */
5873 	so_flush_discon_ind(so);
5874 
5875 	so_unlock_single(so, SOLOCKED);
5876 	mutex_exit(&so->so_lock);
5877 
5878 	/*
5879 	 * Needed for STREAMs.
5880 	 * Decrement the device driver's reference count for streams
5881 	 * opened via the clone dip. The driver was held in clone_open().
5882 	 * The absence of clone_close() forces this asymmetry.
5883 	 */
5884 	if (so->so_flag & SOCLONE)
5885 		ddi_rele_driver(getmajor(dev));
5886 
5887 	return (error);
5888 }
5889 
5890 static int
5891 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
5892     struct cred *cr, int32_t *rvalp)
5893 {
5894 	struct vnode *vp = SOTOV(so);
5895 	sotpi_info_t *sti = SOTOTPI(so);
5896 	int error = 0;
5897 
5898 	dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
5899 	    cmd, arg, pr_state(so->so_state, so->so_mode)));
5900 
5901 	switch (cmd) {
5902 	case SIOCSQPTR:
5903 		/*
5904 		 * SIOCSQPTR is valid only when helper stream is created
5905 		 * by the protocol.
5906 		 */
5907 	case _I_INSERT:
5908 	case _I_REMOVE:
5909 		/*
5910 		 * Since there's no compelling reason to support these ioctls
5911 		 * on sockets, and doing so would increase the complexity
5912 		 * markedly, prevent it.
5913 		 */
5914 		return (EOPNOTSUPP);
5915 
5916 	case I_FIND:
5917 	case I_LIST:
5918 	case I_LOOK:
5919 	case I_POP:
5920 	case I_PUSH:
5921 		/*
5922 		 * To prevent races and inconsistencies between the actual
5923 		 * state of the stream and the state according to the sonode,
5924 		 * we serialize all operations which modify or operate on the
5925 		 * list of modules on the socket's stream.
5926 		 */
5927 		mutex_enter(&sti->sti_plumb_lock);
5928 		error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
5929 		mutex_exit(&sti->sti_plumb_lock);
5930 		return (error);
5931 
5932 	default:
5933 		if (so->so_version != SOV_STREAM)
5934 			break;
5935 
5936 		/*
5937 		 * The imaginary "sockmod" has been popped; act as a stream.
5938 		 */
5939 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5940 	}
5941 
5942 	ASSERT(so->so_version != SOV_STREAM);
5943 
5944 	/*
5945 	 * Process socket-specific ioctls.
5946 	 */
5947 	switch (cmd) {
5948 	case FIONBIO: {
5949 		int32_t value;
5950 
5951 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
5952 		    (mode & (int)FKIOCTL)))
5953 			return (EFAULT);
5954 
5955 		mutex_enter(&so->so_lock);
5956 		if (value) {
5957 			so->so_state |= SS_NDELAY;
5958 		} else {
5959 			so->so_state &= ~SS_NDELAY;
5960 		}
5961 		mutex_exit(&so->so_lock);
5962 		return (0);
5963 	}
5964 
5965 	case FIOASYNC: {
5966 		int32_t value;
5967 
5968 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
5969 		    (mode & (int)FKIOCTL)))
5970 			return (EFAULT);
5971 
5972 		mutex_enter(&so->so_lock);
5973 		/*
5974 		 * SS_ASYNC flag not already set correctly?
5975 		 * (!value != !(so->so_state & SS_ASYNC))
5976 		 * but some engineers find that too hard to read.
5977 		 */
5978 		if (value == 0 && (so->so_state & SS_ASYNC) != 0 ||
5979 		    value != 0 && (so->so_state & SS_ASYNC) == 0)
5980 			error = so_flip_async(so, vp, mode, cr);
5981 		mutex_exit(&so->so_lock);
5982 		return (error);
5983 	}
5984 
5985 	case SIOCSPGRP:
5986 	case FIOSETOWN: {
5987 		pid_t pgrp;
5988 
5989 		if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
5990 		    (mode & (int)FKIOCTL)))
5991 			return (EFAULT);
5992 
5993 		mutex_enter(&so->so_lock);
5994 		dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
5995 		/* Any change? */
5996 		if (pgrp != so->so_pgrp)
5997 			error = so_set_siggrp(so, vp, pgrp, mode, cr);
5998 		mutex_exit(&so->so_lock);
5999 		return (error);
6000 	}
6001 	case SIOCGPGRP:
6002 	case FIOGETOWN:
6003 		if (so_copyout(&so->so_pgrp, (void *)arg,
6004 		    sizeof (pid_t), (mode & (int)FKIOCTL)))
6005 			return (EFAULT);
6006 		return (0);
6007 
6008 	case SIOCATMARK: {
6009 		int retval;
6010 		uint_t so_state;
6011 
6012 		/*
6013 		 * strwaitmark has a finite timeout after which it
6014 		 * returns -1 if the mark state is undetermined.
6015 		 * In order to avoid any race between the mark state
6016 		 * in sockfs and the mark state in the stream head this
6017 		 * routine loops until the mark state can be determined
6018 		 * (or the urgent data indication has been removed by some
6019 		 * other thread).
6020 		 */
6021 		do {
6022 			mutex_enter(&so->so_lock);
6023 			so_state = so->so_state;
6024 			mutex_exit(&so->so_lock);
6025 			if (so_state & SS_RCVATMARK) {
6026 				retval = 1;
6027 			} else if (!(so_state & SS_OOBPEND)) {
6028 				/*
6029 				 * No SIGURG has been generated -- there is no
6030 				 * pending or present urgent data. Thus can't
6031 				 * possibly be at the mark.
6032 				 */
6033 				retval = 0;
6034 			} else {
6035 				/*
6036 				 * Have the stream head wait until there is
6037 				 * either some messages on the read queue, or
6038 				 * STRATMARK or STRNOTATMARK gets set. The
6039 				 * STRNOTATMARK flag is used so that the
6040 				 * transport can send up a MSGNOTMARKNEXT
6041 				 * M_DATA to indicate that it is not
6042 				 * at the mark and additional data is not about
6043 				 * to be send upstream.
6044 				 *
6045 				 * If the mark state is undetermined this will
6046 				 * return -1 and we will loop rechecking the
6047 				 * socket state.
6048 				 */
6049 				retval = strwaitmark(vp);
6050 			}
6051 		} while (retval == -1);
6052 
6053 		if (so_copyout(&retval, (void *)arg, sizeof (int),
6054 		    (mode & (int)FKIOCTL)))
6055 			return (EFAULT);
6056 		return (0);
6057 	}
6058 
6059 	case I_FDINSERT:
6060 	case I_SENDFD:
6061 	case I_RECVFD:
6062 	case I_ATMARK:
6063 	case _SIOCSOCKFALLBACK:
6064 		/*
6065 		 * These ioctls do not apply to sockets. I_FDINSERT can be
6066 		 * used to send M_PROTO messages without modifying the socket
6067 		 * state. I_SENDFD/RECVFD should not be used for socket file
6068 		 * descriptor passing since they assume a twisted stream.
6069 		 * SIOCATMARK must be used instead of I_ATMARK.
6070 		 *
6071 		 * _SIOCSOCKFALLBACK from an application should never be
6072 		 * processed.  It is only generated by socktpi_open() or
6073 		 * in response to I_POP or I_PUSH.
6074 		 */
6075 #ifdef DEBUG
6076 		zcmn_err(getzoneid(), CE_WARN,
6077 		    "Unsupported STREAMS ioctl 0x%x on socket. "
6078 		    "Pid = %d\n", cmd, curproc->p_pid);
6079 #endif /* DEBUG */
6080 		return (EOPNOTSUPP);
6081 
6082 	case _I_GETPEERCRED:
6083 		if ((mode & FKIOCTL) == 0)
6084 			return (EINVAL);
6085 
6086 		mutex_enter(&so->so_lock);
6087 		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
6088 			error = ENOTSUP;
6089 		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
6090 			error = ENOTCONN;
6091 		} else if (so->so_peercred != NULL) {
6092 			k_peercred_t *kp = (k_peercred_t *)arg;
6093 			kp->pc_cr = so->so_peercred;
6094 			kp->pc_cpid = so->so_cpid;
6095 			crhold(so->so_peercred);
6096 		} else {
6097 			error = EINVAL;
6098 		}
6099 		mutex_exit(&so->so_lock);
6100 		return (error);
6101 
6102 	default:
6103 		/*
6104 		 * Do the higher-order bits of the ioctl cmd indicate
6105 		 * that it is an I_* streams ioctl?
6106 		 */
6107 		if ((cmd & 0xffffff00U) == STR &&
6108 		    so->so_version == SOV_SOCKBSD) {
6109 #ifdef DEBUG
6110 			zcmn_err(getzoneid(), CE_WARN,
6111 			    "Unsupported STREAMS ioctl 0x%x on socket. "
6112 			    "Pid = %d\n", cmd, curproc->p_pid);
6113 #endif /* DEBUG */
6114 			return (EOPNOTSUPP);
6115 		}
6116 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6117 	}
6118 }
6119 
6120 /*
6121  * Handle plumbing-related ioctls.
6122  */
6123 static int
6124 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
6125     struct cred *cr, int32_t *rvalp)
6126 {
6127 	static const char sockmod_name[] = "sockmod";
6128 	struct sonode	*so = VTOSO(vp);
6129 	char		mname[FMNAMESZ + 1];
6130 	int		error;
6131 	sotpi_info_t	*sti = SOTOTPI(so);
6132 
6133 	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
6134 
6135 	if (so->so_version == SOV_SOCKBSD)
6136 		return (EOPNOTSUPP);
6137 
6138 	if (so->so_version == SOV_STREAM) {
6139 		/*
6140 		 * The imaginary "sockmod" has been popped - act as a stream.
6141 		 * If this is a push of sockmod then change back to a socket.
6142 		 */
6143 		if (cmd == I_PUSH) {
6144 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6145 			    (void *)arg, mname, sizeof (mname), NULL);
6146 
6147 			if (error == 0 && strcmp(mname, sockmod_name) == 0) {
6148 				dprintso(so, 0, ("socktpi_ioctl: going to "
6149 				    "socket version\n"));
6150 				so_stream2sock(so);
6151 				return (0);
6152 			}
6153 		}
6154 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6155 	}
6156 
6157 	switch (cmd) {
6158 	case I_PUSH:
6159 		if (sti->sti_direct) {
6160 			mutex_enter(&so->so_lock);
6161 			so_lock_single(so);
6162 			mutex_exit(&so->so_lock);
6163 
6164 			error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
6165 			    cr, rvalp);
6166 
6167 			mutex_enter(&so->so_lock);
6168 			if (error == 0)
6169 				sti->sti_direct = 0;
6170 			so_unlock_single(so, SOLOCKED);
6171 			mutex_exit(&so->so_lock);
6172 
6173 			if (error != 0)
6174 				return (error);
6175 		}
6176 
6177 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6178 		if (error == 0)
6179 			sti->sti_pushcnt++;
6180 		return (error);
6181 
6182 	case I_POP:
6183 		if (sti->sti_pushcnt == 0) {
6184 			/* Emulate sockmod being popped */
6185 			dprintso(so, 0,
6186 			    ("socktpi_ioctl: going to STREAMS version\n"));
6187 			return (so_sock2stream(so));
6188 		}
6189 
6190 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6191 		if (error == 0)
6192 			sti->sti_pushcnt--;
6193 		return (error);
6194 
6195 	case I_LIST: {
6196 		struct str_mlist *kmlistp, *umlistp;
6197 		struct str_list	kstrlist;
6198 		ssize_t		kstrlistsize;
6199 		int		i, nmods;
6200 
6201 		STRUCT_DECL(str_list, ustrlist);
6202 		STRUCT_INIT(ustrlist, mode);
6203 
6204 		if (arg == 0) {
6205 			error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6206 			if (error == 0)
6207 				(*rvalp)++;	/* Add one for sockmod */
6208 			return (error);
6209 		}
6210 
6211 		error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
6212 		    STRUCT_SIZE(ustrlist), mode & FKIOCTL);
6213 		if (error != 0)
6214 			return (error);
6215 
6216 		nmods = STRUCT_FGET(ustrlist, sl_nmods);
6217 		if (nmods <= 0)
6218 			return (EINVAL);
6219 		/*
6220 		 * Ceiling nmods at nstrpush to prevent someone from
6221 		 * maliciously consuming lots of kernel memory.
6222 		 */
6223 		nmods = MIN(nmods, nstrpush);
6224 
6225 		kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6226 		kstrlist.sl_nmods = nmods;
6227 		kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6228 
6229 		error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6230 		    cr, rvalp);
6231 		if (error != 0)
6232 			goto done;
6233 
6234 		/*
6235 		 * Considering the module list as a 0-based array of sl_nmods
6236 		 * modules, sockmod should conceptually exist at slot
6237 		 * sti_pushcnt.  Insert sockmod at this location by sliding all
6238 		 * of the module names after so_pushcnt over by one.  We know
6239 		 * that there will be room to do this since we allocated
6240 		 * sl_modlist with an additional slot.
6241 		 */
6242 		for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6243 			kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6244 
6245 		(void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6246 		kstrlist.sl_nmods++;
6247 
6248 		/*
6249 		 * Copy all of the entries out to ustrlist.
6250 		 */
6251 		kmlistp = kstrlist.sl_modlist;
6252 		umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6253 		for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6254 			error = so_copyout(kmlistp++, umlistp++,
6255 			    sizeof (struct str_mlist), mode & FKIOCTL);
6256 			if (error != 0)
6257 				goto done;
6258 		}
6259 
6260 		error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6261 		    mode & FKIOCTL);
6262 		if (error == 0)
6263 			*rvalp = 0;
6264 	done:
6265 		kmem_free(kstrlist.sl_modlist, kstrlistsize);
6266 		return (error);
6267 	}
6268 	case I_LOOK:
6269 		if (sti->sti_pushcnt == 0) {
6270 			return (so_copyout(sockmod_name, (void *)arg,
6271 			    sizeof (sockmod_name), mode & FKIOCTL));
6272 		}
6273 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6274 
6275 	case I_FIND:
6276 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6277 		if (error && error != EINVAL)
6278 			return (error);
6279 
6280 		/* if not found and string was sockmod return 1 */
6281 		if (*rvalp == 0 || error == EINVAL) {
6282 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6283 			    (void *)arg, mname, sizeof (mname), NULL);
6284 			if (error == ENAMETOOLONG)
6285 				error = EINVAL;
6286 
6287 			if (error == 0 && strcmp(mname, sockmod_name) == 0)
6288 				*rvalp = 1;
6289 		}
6290 		return (error);
6291 
6292 	default:
6293 		panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6294 		break;
6295 	}
6296 
6297 	return (0);
6298 }
6299 
6300 /*
6301  * Wrapper around the streams poll routine that implements socket poll
6302  * semantics.
6303  * The sockfs never calls pollwakeup itself - the stream head take care
6304  * of all pollwakeups. Since sockfs never holds so_lock when calling the
6305  * stream head there can never be a deadlock due to holding so_lock across
6306  * pollwakeup and acquiring so_lock in this routine.
6307  *
6308  * However, since the performance of VOP_POLL is critical we avoid
6309  * acquiring so_lock here. This is based on two assumptions:
6310  *  - The poll implementation holds locks to serialize the VOP_POLL call
6311  *    and a pollwakeup for the same pollhead. This ensures that should
6312  *    e.g. so_state change during a socktpi_poll call the pollwakeup
6313  *    (which strsock_* and strrput conspire to issue) is issued after
6314  *    the state change. Thus the pollwakeup will block until VOP_POLL has
6315  *    returned and then wake up poll and have it call VOP_POLL again.
6316  *  - The reading of so_state without holding so_lock does not result in
6317  *    stale data that is older than the latest state change that has dropped
6318  *    so_lock. This is ensured by the mutex_exit issuing the appropriate
6319  *    memory barrier to force the data into the coherency domain.
6320  */
6321 static int
6322 sotpi_poll(
6323 	struct sonode	*so,
6324 	short		events,
6325 	int		anyyet,
6326 	short		*reventsp,
6327 	struct pollhead **phpp)
6328 {
6329 	short origevents = events;
6330 	struct vnode *vp = SOTOV(so);
6331 	int error;
6332 	int so_state = so->so_state;	/* snapshot */
6333 	sotpi_info_t *sti = SOTOTPI(so);
6334 
6335 	dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6336 	    (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6337 
6338 	ASSERT(vp->v_type == VSOCK);
6339 	ASSERT(vp->v_stream != NULL);
6340 
6341 	if (so->so_version == SOV_STREAM) {
6342 		/* The imaginary "sockmod" has been popped - act as a stream */
6343 		return (strpoll(vp->v_stream, events, anyyet,
6344 		    reventsp, phpp));
6345 	}
6346 
6347 	if (!(so_state & SS_ISCONNECTED) &&
6348 	    (so->so_mode & SM_CONNREQUIRED)) {
6349 		/* Not connected yet - turn off write side events */
6350 		events &= ~(POLLOUT|POLLWRBAND);
6351 	}
6352 	/*
6353 	 * Check for errors without calling strpoll if the caller wants them.
6354 	 * In sockets the errors are represented as input/output events
6355 	 * and there is no need to ask the stream head for this information.
6356 	 */
6357 	if (so->so_error != 0 &&
6358 	    ((POLLIN|POLLRDNORM|POLLOUT) & origevents)  != 0) {
6359 		*reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6360 		return (0);
6361 	}
6362 	/*
6363 	 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6364 	 * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
6365 	 * will not trigger a POLLIN event with POLLRDDATA set.
6366 	 * The handling of urgent data (causing POLLRDBAND) is done by
6367 	 * inspecting SS_OOBPEND below.
6368 	 */
6369 	events |= POLLRDDATA;
6370 
6371 	/*
6372 	 * After shutdown(output) a stream head write error is set.
6373 	 * However, we should not return output events.
6374 	 */
6375 	events |= POLLNOERR;
6376 	error = strpoll(vp->v_stream, events, anyyet,
6377 	    reventsp, phpp);
6378 	if (error)
6379 		return (error);
6380 
6381 	ASSERT(!(*reventsp & POLLERR));
6382 
6383 	/*
6384 	 * Notes on T_CONN_IND handling for sockets.
6385 	 *
6386 	 * If strpoll() returned without events, SR_POLLIN is guaranteed
6387 	 * to be set, ensuring any subsequent strrput() runs pollwakeup().
6388 	 *
6389 	 * Since the so_lock is not held, soqueueconnind() may have run
6390 	 * and a T_CONN_IND may be waiting. We now check for any queued
6391 	 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6392 	 * to ensure poll returns.
6393 	 *
6394 	 * However:
6395 	 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6396 	 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6397 	 * the following actions will occur; taken together they ensure the
6398 	 * syscall will return.
6399 	 *
6400 	 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6401 	 *    the accept() was run on a non-blocking socket sowaitconnind()
6402 	 *    may have already returned EWOULDBLOCK, so not be waiting to
6403 	 *    process the message. Additionally socktpi_poll() has probably
6404 	 *    proceeded past the sti_conn_ind_head check below.
6405 	 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6406 	 *    this thread,  however that could occur before poll_common()
6407 	 *    has entered cv_wait.
6408 	 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6409 	 *
6410 	 * Before proceeding to cv_wait() in poll_common() for an event,
6411 	 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6412 	 * and if set, re-calls strpoll() to ensure the late arriving
6413 	 * T_CONN_IND is recognized, and pollsys() returns.
6414 	 */
6415 
6416 	if (sti->sti_conn_ind_head != NULL)
6417 		*reventsp |= (POLLIN|POLLRDNORM) & events;
6418 
6419 	if (so->so_state & SS_CANTRCVMORE) {
6420 		*reventsp |= POLLRDHUP & events;
6421 
6422 		if (so->so_state & SS_CANTSENDMORE)
6423 			*reventsp |= POLLHUP;
6424 	}
6425 
6426 	if (so->so_state & SS_OOBPEND)
6427 		*reventsp |= POLLRDBAND & events;
6428 
6429 	if (sti->sti_nl7c_rcv_mp != NULL) {
6430 		*reventsp |= (POLLIN|POLLRDNORM) & events;
6431 	}
6432 	if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
6433 	    ((POLLIN|POLLRDNORM) & *reventsp)) {
6434 		sti->sti_nl7c_flags |= NL7C_POLLIN;
6435 	}
6436 
6437 	return (0);
6438 }
6439 
6440 /*ARGSUSED*/
6441 static int
6442 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6443 {
6444 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6445 	int error = 0;
6446 
6447 	error = sonode_constructor(buf, cdrarg, kmflags);
6448 	if (error != 0)
6449 		return (error);
6450 
6451 	error = i_sotpi_info_constructor(&st->st_info);
6452 	if (error != 0)
6453 		sonode_destructor(buf, cdrarg);
6454 
6455 	st->st_sonode.so_priv = &st->st_info;
6456 
6457 	return (error);
6458 }
6459 
6460 /*ARGSUSED1*/
6461 static void
6462 socktpi_destructor(void *buf, void *cdrarg)
6463 {
6464 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6465 
6466 	ASSERT(st->st_sonode.so_priv == &st->st_info);
6467 	st->st_sonode.so_priv = NULL;
6468 
6469 	i_sotpi_info_destructor(&st->st_info);
6470 	sonode_destructor(buf, cdrarg);
6471 }
6472 
6473 static int
6474 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6475 {
6476 	int retval;
6477 
6478 	if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6479 		struct sonode *so = (struct sonode *)buf;
6480 		sotpi_info_t *sti = SOTOTPI(so);
6481 
6482 		mutex_enter(&socklist.sl_lock);
6483 
6484 		sti->sti_next_so = socklist.sl_list;
6485 		sti->sti_prev_so = NULL;
6486 		if (sti->sti_next_so != NULL)
6487 			SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6488 		socklist.sl_list = so;
6489 
6490 		mutex_exit(&socklist.sl_lock);
6491 
6492 	}
6493 	return (retval);
6494 }
6495 
6496 static void
6497 socktpi_unix_destructor(void *buf, void *cdrarg)
6498 {
6499 	struct sonode	*so = (struct sonode *)buf;
6500 	sotpi_info_t	*sti = SOTOTPI(so);
6501 
6502 	mutex_enter(&socklist.sl_lock);
6503 
6504 	if (sti->sti_next_so != NULL)
6505 		SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6506 	if (sti->sti_prev_so != NULL)
6507 		SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6508 	else
6509 		socklist.sl_list = sti->sti_next_so;
6510 
6511 	mutex_exit(&socklist.sl_lock);
6512 
6513 	socktpi_destructor(buf, cdrarg);
6514 }
6515 
6516 int
6517 socktpi_init(void)
6518 {
6519 	/*
6520 	 * Create sonode caches.  We create a special one for AF_UNIX so
6521 	 * that we can track them for netstat(1m).
6522 	 */
6523 	socktpi_cache = kmem_cache_create("socktpi_cache",
6524 	    sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6525 	    socktpi_destructor, NULL, NULL, NULL, 0);
6526 
6527 	socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6528 	    sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6529 	    socktpi_unix_destructor, NULL, NULL, NULL, 0);
6530 
6531 	return (0);
6532 }
6533 
6534 /*
6535  * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6536  *
6537  * Caller must still update state and mode using sotpi_update_state().
6538  */
6539 int
6540 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6541     boolean_t *direct, queue_t **qp, struct cred *cr)
6542 {
6543 	sotpi_info_t *sti;
6544 	struct sockparams *origsp = so->so_sockparams;
6545 	sock_lower_handle_t handle = so->so_proto_handle;
6546 	struct stdata *stp;
6547 	struct vnode *vp;
6548 	queue_t *q;
6549 	int error = 0;
6550 
6551 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6552 	    SS_FALLBACK_PENDING);
6553 	ASSERT(SOCK_IS_NONSTR(so));
6554 
6555 	*qp = NULL;
6556 	*direct = B_FALSE;
6557 	so->so_sockparams = newsp;
6558 	/*
6559 	 * Allocate and initalize fields required by TPI.
6560 	 */
6561 	(void) sotpi_info_create(so, KM_SLEEP);
6562 	sotpi_info_init(so);
6563 
6564 	if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6565 		sotpi_info_fini(so);
6566 		sotpi_info_destroy(so);
6567 		return (error);
6568 	}
6569 	ASSERT(handle == so->so_proto_handle);
6570 	sti = SOTOTPI(so);
6571 	if (sti->sti_direct != 0)
6572 		*direct = B_TRUE;
6573 
6574 	/*
6575 	 * Keep the original sp around so we can properly dispose of the
6576 	 * sonode when the socket is being closed.
6577 	 */
6578 	sti->sti_orig_sp = origsp;
6579 
6580 	so_basic_strinit(so);	/* skips the T_CAPABILITY_REQ */
6581 	so_alloc_addr(so, so->so_max_addr_len);
6582 
6583 	/*
6584 	 * If the application has done a SIOCSPGRP, make sure the
6585 	 * STREAM head is aware. This needs to take place before
6586 	 * the protocol start sending up messages. Otherwise we
6587 	 * might miss to generate SIGPOLL.
6588 	 *
6589 	 * It is possible that the application will receive duplicate
6590 	 * signals if some were already generated for either data or
6591 	 * connection indications.
6592 	 */
6593 	if (so->so_pgrp != 0) {
6594 		if (so_set_events(so, so->so_vnode, cr) != 0)
6595 			so->so_pgrp = 0;
6596 	}
6597 
6598 	/*
6599 	 * Determine which queue to use.
6600 	 */
6601 	vp = SOTOV(so);
6602 	stp = vp->v_stream;
6603 	ASSERT(stp != NULL);
6604 	q = stp->sd_wrq->q_next;
6605 
6606 	/*
6607 	 * Skip any modules that may have been auto pushed when the device
6608 	 * was opened
6609 	 */
6610 	while (q->q_next != NULL)
6611 		q = q->q_next;
6612 	*qp = _RD(q);
6613 
6614 	/* This is now a STREAMS sockets */
6615 	so->so_not_str = B_FALSE;
6616 
6617 	return (error);
6618 }
6619 
6620 /*
6621  * Revert a TPI sonode. It is only allowed to revert the sonode during
6622  * the fallback process.
6623  */
6624 void
6625 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6626 {
6627 	vnode_t *vp = SOTOV(so);
6628 
6629 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6630 	    SS_FALLBACK_PENDING);
6631 	ASSERT(!SOCK_IS_NONSTR(so));
6632 	ASSERT(vp->v_stream != NULL);
6633 
6634 	strclean(vp);
6635 	(void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6636 
6637 	/*
6638 	 * Restore the original sockparams. The caller is responsible for
6639 	 * dropping the ref to the new sp.
6640 	 */
6641 	so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6642 
6643 	sotpi_info_fini(so);
6644 	sotpi_info_destroy(so);
6645 
6646 	/* This is no longer a STREAMS sockets */
6647 	so->so_not_str = B_TRUE;
6648 }
6649 
6650 void
6651 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6652     struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6653     socklen_t faddrlen, short opts)
6654 {
6655 	sotpi_info_t *sti = SOTOTPI(so);
6656 
6657 	so_proc_tcapability_ack(so, tcap);
6658 
6659 	so->so_options |= opts;
6660 
6661 	/*
6662 	 * Determine whether the foreign and local address are valid
6663 	 */
6664 	if (laddrlen != 0) {
6665 		ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6666 		sti->sti_laddr_len = laddrlen;
6667 		bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6668 		sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6669 	}
6670 
6671 	if (faddrlen != 0) {
6672 		ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6673 		sti->sti_faddr_len = faddrlen;
6674 		bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6675 		sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6676 	}
6677 
6678 }
6679 
6680 /*
6681  * Allocate enough space to cache the local and foreign addresses.
6682  */
6683 void
6684 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6685 {
6686 	sotpi_info_t *sti = SOTOTPI(so);
6687 
6688 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6689 	ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6690 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6691 	    P2ROUNDUP(maxlen, KMEM_ALIGN);
6692 	so->so_max_addr_len = sti->sti_laddr_maxlen;
6693 	sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6694 	sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6695 	    + sti->sti_laddr_maxlen);
6696 
6697 	if (so->so_family == AF_UNIX) {
6698 		/*
6699 		 * Initialize AF_UNIX related fields.
6700 		 */
6701 		bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6702 		bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6703 	}
6704 }
6705 
6706 
6707 sotpi_info_t *
6708 sotpi_sototpi(struct sonode *so)
6709 {
6710 	sotpi_info_t *sti;
6711 
6712 	ASSERT(so != NULL);
6713 
6714 	sti = (sotpi_info_t *)so->so_priv;
6715 
6716 	ASSERT(sti != NULL);
6717 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6718 
6719 	return (sti);
6720 }
6721 
6722 static int
6723 i_sotpi_info_constructor(sotpi_info_t *sti)
6724 {
6725 	sti->sti_magic		= SOTPI_INFO_MAGIC;
6726 	sti->sti_ack_mp		= NULL;
6727 	sti->sti_discon_ind_mp	= NULL;
6728 	sti->sti_ux_bound_vp	= NULL;
6729 	sti->sti_unbind_mp	= NULL;
6730 
6731 	sti->sti_conn_ind_head	= NULL;
6732 	sti->sti_conn_ind_tail	= NULL;
6733 
6734 	sti->sti_laddr_sa	= NULL;
6735 	sti->sti_faddr_sa	= NULL;
6736 
6737 	sti->sti_nl7c_flags	= 0;
6738 	sti->sti_nl7c_uri	= NULL;
6739 	sti->sti_nl7c_rcv_mp	= NULL;
6740 
6741 	mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6742 	cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6743 
6744 	return (0);
6745 }
6746 
6747 static void
6748 i_sotpi_info_destructor(sotpi_info_t *sti)
6749 {
6750 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6751 	ASSERT(sti->sti_ack_mp == NULL);
6752 	ASSERT(sti->sti_discon_ind_mp == NULL);
6753 	ASSERT(sti->sti_ux_bound_vp == NULL);
6754 	ASSERT(sti->sti_unbind_mp == NULL);
6755 
6756 	ASSERT(sti->sti_conn_ind_head == NULL);
6757 	ASSERT(sti->sti_conn_ind_tail == NULL);
6758 
6759 	ASSERT(sti->sti_laddr_sa == NULL);
6760 	ASSERT(sti->sti_faddr_sa == NULL);
6761 
6762 	ASSERT(sti->sti_nl7c_flags == 0);
6763 	ASSERT(sti->sti_nl7c_uri == NULL);
6764 	ASSERT(sti->sti_nl7c_rcv_mp == NULL);
6765 
6766 	mutex_destroy(&sti->sti_plumb_lock);
6767 	cv_destroy(&sti->sti_ack_cv);
6768 }
6769 
6770 /*
6771  * Creates and attaches TPI information to the given sonode
6772  */
6773 static boolean_t
6774 sotpi_info_create(struct sonode *so, int kmflags)
6775 {
6776 	sotpi_info_t *sti;
6777 
6778 	ASSERT(so->so_priv == NULL);
6779 
6780 	if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6781 		return (B_FALSE);
6782 
6783 	if (i_sotpi_info_constructor(sti) != 0) {
6784 		kmem_free(sti, sizeof (*sti));
6785 		return (B_FALSE);
6786 	}
6787 
6788 	so->so_priv = (void *)sti;
6789 	return (B_TRUE);
6790 }
6791 
6792 /*
6793  * Initializes the TPI information.
6794  */
6795 static void
6796 sotpi_info_init(struct sonode *so)
6797 {
6798 	struct vnode *vp = SOTOV(so);
6799 	sotpi_info_t *sti = SOTOTPI(so);
6800 	time_t now;
6801 
6802 	sti->sti_dev	= so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6803 	vp->v_rdev	= sti->sti_dev;
6804 
6805 	sti->sti_orig_sp = NULL;
6806 
6807 	sti->sti_pushcnt = 0;
6808 
6809 	now = gethrestime_sec();
6810 	sti->sti_atime	= now;
6811 	sti->sti_mtime	= now;
6812 	sti->sti_ctime	= now;
6813 
6814 	sti->sti_eaddr_mp = NULL;
6815 	sti->sti_delayed_error = 0;
6816 
6817 	sti->sti_provinfo = NULL;
6818 
6819 	sti->sti_oobcnt = 0;
6820 	sti->sti_oobsigcnt = 0;
6821 
6822 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6823 
6824 	sti->sti_laddr_sa	= 0;
6825 	sti->sti_faddr_sa	= 0;
6826 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6827 	sti->sti_laddr_len = sti->sti_faddr_len = 0;
6828 
6829 	sti->sti_laddr_valid = 0;
6830 	sti->sti_faddr_valid = 0;
6831 	sti->sti_faddr_noxlate = 0;
6832 
6833 	sti->sti_direct = 0;
6834 
6835 	ASSERT(sti->sti_ack_mp == NULL);
6836 	ASSERT(sti->sti_ux_bound_vp == NULL);
6837 	ASSERT(sti->sti_unbind_mp == NULL);
6838 
6839 	ASSERT(sti->sti_conn_ind_head == NULL);
6840 	ASSERT(sti->sti_conn_ind_tail == NULL);
6841 }
6842 
6843 /*
6844  * Given a sonode, grab the TPI info and free any data.
6845  */
6846 static void
6847 sotpi_info_fini(struct sonode *so)
6848 {
6849 	sotpi_info_t *sti = SOTOTPI(so);
6850 	mblk_t *mp;
6851 
6852 	ASSERT(sti->sti_discon_ind_mp == NULL);
6853 
6854 	if ((mp = sti->sti_conn_ind_head) != NULL) {
6855 		mblk_t *mp1;
6856 
6857 		while (mp) {
6858 			mp1 = mp->b_next;
6859 			mp->b_next = NULL;
6860 			freemsg(mp);
6861 			mp = mp1;
6862 		}
6863 		sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
6864 	}
6865 
6866 	/*
6867 	 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
6868 	 * indirect them.  It also uses so_count as a validity test.
6869 	 */
6870 	mutex_enter(&so->so_lock);
6871 
6872 	if (sti->sti_laddr_sa) {
6873 		ASSERT((caddr_t)sti->sti_faddr_sa ==
6874 		    (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
6875 		ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
6876 		sti->sti_laddr_valid = 0;
6877 		sti->sti_faddr_valid = 0;
6878 		kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
6879 		sti->sti_laddr_sa = NULL;
6880 		sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
6881 		sti->sti_faddr_sa = NULL;
6882 		sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
6883 	}
6884 
6885 	mutex_exit(&so->so_lock);
6886 
6887 	if ((mp = sti->sti_eaddr_mp) != NULL) {
6888 		freemsg(mp);
6889 		sti->sti_eaddr_mp = NULL;
6890 		sti->sti_delayed_error = 0;
6891 	}
6892 
6893 	if ((mp = sti->sti_ack_mp) != NULL) {
6894 		freemsg(mp);
6895 		sti->sti_ack_mp = NULL;
6896 	}
6897 
6898 	if ((mp = sti->sti_nl7c_rcv_mp) != NULL) {
6899 		sti->sti_nl7c_rcv_mp = NULL;
6900 		freemsg(mp);
6901 	}
6902 	sti->sti_nl7c_rcv_rval = 0;
6903 	if (sti->sti_nl7c_uri != NULL) {
6904 		nl7c_urifree(so);
6905 		/* urifree() cleared nl7c_uri */
6906 	}
6907 	if (sti->sti_nl7c_flags) {
6908 		sti->sti_nl7c_flags = 0;
6909 	}
6910 
6911 	ASSERT(sti->sti_ux_bound_vp == NULL);
6912 	if ((mp = sti->sti_unbind_mp) != NULL) {
6913 		freemsg(mp);
6914 		sti->sti_unbind_mp = NULL;
6915 	}
6916 }
6917 
6918 /*
6919  * Destroys the TPI information attached to a sonode.
6920  */
6921 static void
6922 sotpi_info_destroy(struct sonode *so)
6923 {
6924 	sotpi_info_t *sti = SOTOTPI(so);
6925 
6926 	i_sotpi_info_destructor(sti);
6927 	kmem_free(sti, sizeof (*sti));
6928 
6929 	so->so_priv = NULL;
6930 }
6931 
6932 /*
6933  * Create the global sotpi socket module entry. It will never be freed.
6934  */
6935 smod_info_t *
6936 sotpi_smod_create(void)
6937 {
6938 	smod_info_t *smodp;
6939 
6940 	smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
6941 	smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
6942 	(void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
6943 	/*
6944 	 * Initialize the smod_refcnt to 1 so it will never be freed.
6945 	 */
6946 	smodp->smod_refcnt = 1;
6947 	smodp->smod_uc_version = SOCK_UC_VERSION;
6948 	smodp->smod_dc_version = SOCK_DC_VERSION;
6949 	smodp->smod_sock_create_func = &sotpi_create;
6950 	smodp->smod_sock_destroy_func = &sotpi_destroy;
6951 	return (smodp);
6952 }
6953