xref: /illumos-gate/usr/src/uts/common/fs/sockfs/socktpi.c (revision e8d80663e4f91871f843bb8ad9108dc0b76dfcf3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/t_lock.h>
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/buf.h>
31 #include <sys/conf.h>
32 #include <sys/cred.h>
33 #include <sys/kmem.h>
34 #include <sys/kmem_impl.h>
35 #include <sys/sysmacros.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/debug.h>
39 #include <sys/errno.h>
40 #include <sys/time.h>
41 #include <sys/file.h>
42 #include <sys/open.h>
43 #include <sys/user.h>
44 #include <sys/termios.h>
45 #include <sys/stream.h>
46 #include <sys/strsubr.h>
47 #include <sys/strsun.h>
48 #include <sys/suntpi.h>
49 #include <sys/ddi.h>
50 #include <sys/esunddi.h>
51 #include <sys/flock.h>
52 #include <sys/modctl.h>
53 #include <sys/vtrace.h>
54 #include <sys/cmn_err.h>
55 #include <sys/pathname.h>
56 
57 #include <sys/socket.h>
58 #include <sys/socketvar.h>
59 #include <sys/sockio.h>
60 #include <netinet/in.h>
61 #include <sys/un.h>
62 #include <sys/strsun.h>
63 
64 #include <sys/tiuser.h>
65 #define	_SUN_TPI_VERSION	2
66 #include <sys/tihdr.h>
67 #include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
68 
69 #include <c2/audit.h>
70 
71 #include <inet/common.h>
72 #include <inet/ip.h>
73 #include <inet/ip6.h>
74 #include <inet/tcp.h>
75 #include <inet/udp_impl.h>
76 
77 #include <sys/zone.h>
78 
79 #include <fs/sockfs/nl7c.h>
80 #include <fs/sockfs/nl7curi.h>
81 
82 #include <fs/sockfs/sockcommon.h>
83 #include <fs/sockfs/socktpi.h>
84 #include <fs/sockfs/socktpi_impl.h>
85 
86 /*
87  * Possible failures when memory can't be allocated. The documented behavior:
88  *
89  * 		5.5:			4.X:		XNET:
90  * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
91  *							EINTR
92  *	(4.X does not document EINTR but returns it)
93  * bind:	ENOSR			-		ENOBUFS/ENOSR
94  * connect: 	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
95  * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
96  * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
97  *	(4.X getpeername and getsockname do not fail in practice)
98  * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
99  * listen:	-			-		ENOBUFS
100  * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
101  *							EINTR
102  * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
103  *							EINTR
104  * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
105  * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
106  * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
107  * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
108  *
109  * Resolution. When allocation fails:
110  *	recv: return EINTR
111  *	send: return EINTR
112  *	connect, accept: EINTR
113  *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
114  *	socket, socketpair: ENOBUFS
115  *	getpeername, getsockname: sleep
116  *	getsockopt, setsockopt: sleep
117  */
118 
119 #ifdef SOCK_TEST
120 /*
121  * Variables that make sockfs do something other than the standard TPI
122  * for the AF_INET transports.
123  *
124  * solisten_tpi_tcp:
125  *	TCP can handle a O_T_BIND_REQ with an increased backlog even though
126  *	the transport is already bound. This is needed to avoid losing the
127  *	port number should listen() do a T_UNBIND_REQ followed by a
128  *	O_T_BIND_REQ.
129  *
130  * soconnect_tpi_udp:
131  *	UDP and ICMP can handle a T_CONN_REQ.
132  *	This is needed to make the sequence of connect(), getsockname()
133  *	return the local IP address used to send packets to the connected to
134  *	destination.
135  *
136  * soconnect_tpi_tcp:
137  *	TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
138  *	Set this to non-zero to send TPI conformant messages to TCP in this
139  *	respect. This is a performance optimization.
140  *
141  * soaccept_tpi_tcp:
142  *	TCP can handle a T_CONN_REQ without the acceptor being bound.
143  *	This is a performance optimization that has been picked up in XTI.
144  *
145  * soaccept_tpi_multioptions:
146  *	When inheriting SOL_SOCKET options from the listener to the accepting
147  *	socket send them as a single message for AF_INET{,6}.
148  */
149 int solisten_tpi_tcp = 0;
150 int soconnect_tpi_udp = 0;
151 int soconnect_tpi_tcp = 0;
152 int soaccept_tpi_tcp = 0;
153 int soaccept_tpi_multioptions = 1;
154 #else /* SOCK_TEST */
155 #define	soconnect_tpi_tcp	0
156 #define	soconnect_tpi_udp	0
157 #define	solisten_tpi_tcp	0
158 #define	soaccept_tpi_tcp	0
159 #define	soaccept_tpi_multioptions	1
160 #endif /* SOCK_TEST */
161 
162 #ifdef SOCK_TEST
163 extern int do_useracc;
164 extern clock_t sock_test_timelimit;
165 #endif /* SOCK_TEST */
166 
167 /*
168  * Some X/Open added checks might have to be backed out to keep SunOS 4.X
169  * applications working. Turn on this flag to disable these checks.
170  */
171 int xnet_skip_checks = 0;	/* nonzero: bypass the X/Open-only checks */
172 int xnet_check_print = 0;	/* nonzero: printf when such a check fails */
173 int xnet_truncate_print = 0;	/* presumably a debug print toggle; TODO confirm */
174 
175 static void sotpi_destroy(struct sonode *);
176 static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
177     int, int *, cred_t *cr);
178 
179 static boolean_t	sotpi_info_create(struct sonode *, int);
180 static void		sotpi_info_init(struct sonode *);
181 static void 		sotpi_info_fini(struct sonode *);
182 static void 		sotpi_info_destroy(struct sonode *);
183 
184 /*
185  * Do direct function call to the transport layer below; this would
186  * also allow the transport to utilize read-side synchronous stream
187  * interface if necessary.  This is a /etc/system tunable that must
188  * not be modified on a running system.  By default this is enabled
189  * for performance reasons and may be disabled for debugging purposes.
190  */
191 boolean_t socktpi_direct = B_TRUE;
192 
193 static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
194 
195 extern	void sigintr(k_sigset_t *, int);
196 extern	void sigunintr(k_sigset_t *);
197 
198 static int	sotpi_unbind(struct sonode *, int);
199 
200 /* TPI sockfs sonode operations */
201 int 		sotpi_init(struct sonode *, struct sonode *, struct cred *,
202 		    int);
203 static int	sotpi_accept(struct sonode *, int, struct cred *,
204 		    struct sonode **);
205 static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
206 		    int, struct cred *);
207 static int	sotpi_listen(struct sonode *, int, struct cred *);
208 static int	sotpi_connect(struct sonode *, struct sockaddr *,
209 		    socklen_t, int, int, struct cred *);
210 extern int	sotpi_recvmsg(struct sonode *, struct nmsghdr *,
211 		    struct uio *, struct cred *);
212 static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
213 		    struct uio *, struct cred *);
214 static int	sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
215 		    struct cred *, mblk_t **);
216 static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
217 		    struct uio *, void *, t_uscalar_t, int);
218 static int	sodgram_direct(struct sonode *, struct sockaddr *,
219 		    socklen_t, struct uio *, int);
220 extern int	sotpi_getpeername(struct sonode *, struct sockaddr *,
221 		    socklen_t *, boolean_t, struct cred *);
222 static int	sotpi_getsockname(struct sonode *, struct sockaddr *,
223 		    socklen_t *, struct cred *);
224 static int	sotpi_shutdown(struct sonode *, int, struct cred *);
225 extern int	sotpi_getsockopt(struct sonode *, int, int, void *,
226 		    socklen_t *, int, struct cred *);
227 extern int	sotpi_setsockopt(struct sonode *, int, int, const void *,
228 		    socklen_t, struct cred *);
229 static int 	sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
230 		    int32_t *);
231 static int 	socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
232 		    struct cred *, int32_t *);
233 static int 	sotpi_poll(struct sonode *, short, int, short *,
234 		    struct pollhead **);
235 static int 	sotpi_close(struct sonode *, int, struct cred *);
236 
237 static int	i_sotpi_info_constructor(sotpi_info_t *);
238 static void 	i_sotpi_info_destructor(sotpi_info_t *);
239 
/*
 * Operation vector for TPI-backed sockets.  sotpi_create() hands this
 * table to sonode_init(), and sotpi_destroy() asserts that so_ops still
 * points at it.  The initializer is positional, so the entries must stay
 * in the member order of sonodeops_t (declared outside this file; the
 * trailing comments name the slot each entry is expected to fill).
 */
240 sonodeops_t sotpi_sonodeops = {
241 	sotpi_init,		/* sop_init		*/
242 	sotpi_accept,		/* sop_accept		*/
243 	sotpi_bind,		/* sop_bind		*/
244 	sotpi_listen,		/* sop_listen		*/
245 	sotpi_connect,		/* sop_connect		*/
246 	sotpi_recvmsg,		/* sop_recvmsg		*/
247 	sotpi_sendmsg,		/* sop_sendmsg		*/
248 	sotpi_sendmblk,		/* sop_sendmblk		*/
249 	sotpi_getpeername,	/* sop_getpeername	*/
250 	sotpi_getsockname,	/* sop_getsockname	*/
251 	sotpi_shutdown,		/* sop_shutdown		*/
252 	sotpi_getsockopt,	/* sop_getsockopt	*/
253 	sotpi_setsockopt,	/* sop_setsockopt	*/
254 	sotpi_ioctl,		/* sop_ioctl		*/
255 	sotpi_poll,		/* sop_poll		*/
256 	sotpi_close,		/* sop_close		*/
257 };
258 
259 /*
260  * Return a TPI socket vnode.
261  *
262  * Note that sockets assume that the driver will clone (either itself
263  * or by using the clone driver) i.e. a socket() call will always
264  * result in a new vnode being created.
265  */
266 
267 /*
268  * Common create code for socket and accept. If tso is set, the values
269  * from that node are used instead of issuing a T_INFO_REQ.
270  */
271 
272 /* ARGSUSED */
273 static struct sonode *
274 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
275     int version, int sflags, int *errorp, cred_t *cr)
276 {
277 	struct sonode	*so;
278 	kmem_cache_t 	*cp;
279 	int		sfamily = family;
280 
281 	ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
282 
283 	if (family == AF_NCA) {
284 		/*
285 		 * The request is for an NCA socket so for NL7C use the
286 		 * INET domain instead and mark NL7C_AF_NCA below.
287 		 */
288 		family = AF_INET;
289 		/*
290 		 * NL7C is not supported in the non-global zone,
291 		 * we enforce this restriction here.
292 		 */
293 		if (getzoneid() != GLOBAL_ZONEID) {
294 			*errorp = ENOTSUP;
295 			return (NULL);
296 		}
297 	}
298 
299 	/*
300 	 * to be compatible with old tpi socket implementation ignore
301 	 * sleep flag (sflags) passed in
302 	 */
303 	cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
304 	so = kmem_cache_alloc(cp, KM_SLEEP);
305 	if (so == NULL) {
306 		*errorp = ENOMEM;
307 		return (NULL);
308 	}
309 
310 	sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
311 	sotpi_info_init(so);
312 
313 	if (sfamily == AF_NCA) {
314 		SOTOTPI(so)->sti_nl7c_flags = NL7C_AF_NCA;
315 	}
316 
317 	if (version == SOV_DEFAULT)
318 		version = so_default_version;
319 
320 	so->so_version = (short)version;
321 	*errorp = 0;
322 
323 	return (so);
324 }
325 
326 static void
327 sotpi_destroy(struct sonode *so)
328 {
329 	kmem_cache_t *cp;
330 	struct sockparams *origsp;
331 
332 	/*
333 	 * If there is a new dealloc function (ie. smod_destroy_func),
334 	 * then it should check the correctness of the ops.
335 	 */
336 
337 	ASSERT(so->so_ops == &sotpi_sonodeops);
338 
339 	origsp = SOTOTPI(so)->sti_orig_sp;
340 
341 	sotpi_info_fini(so);
342 
343 	if (so->so_state & SS_FALLBACK_COMP) {
344 		/*
345 		 * A fallback happend, which means that a sotpi_info_t struct
346 		 * was allocated (as opposed to being allocated from the TPI
347 		 * sonode cache. Therefore we explicitly free the struct
348 		 * here.
349 		 */
350 		sotpi_info_destroy(so);
351 		ASSERT(origsp != NULL);
352 
353 		origsp->sp_smod_info->smod_sock_destroy_func(so);
354 		SOCKPARAMS_DEC_REF(origsp);
355 	} else {
356 		sonode_fini(so);
357 		cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
358 		    socktpi_cache;
359 		kmem_cache_free(cp, so);
360 	}
361 }
362 
363 /* ARGSUSED1 */
364 int
365 sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
366 {
367 	major_t maj;
368 	dev_t newdev;
369 	struct vnode *vp;
370 	int error = 0;
371 	struct stdata *stp;
372 
373 	sotpi_info_t *sti = SOTOTPI(so);
374 
375 	dprint(1, ("sotpi_init()\n"));
376 
377 	/*
378 	 * over write the sleep flag passed in but that is ok
379 	 * as tpi socket does not honor sleep flag.
380 	 */
381 	flags |= FREAD|FWRITE;
382 
383 	/*
384 	 * Record in so_flag that it is a clone.
385 	 */
386 	if (getmajor(sti->sti_dev) == clone_major)
387 		so->so_flag |= SOCLONE;
388 
389 	if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
390 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
391 	    (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
392 	    so->so_protocol == IPPROTO_IP)) {
393 		/* Tell tcp or udp that it's talking to sockets */
394 		flags |= SO_SOCKSTR;
395 
396 		/*
397 		 * Here we indicate to socktpi_open() our attempt to
398 		 * make direct calls between sockfs and transport.
399 		 * The final decision is left to socktpi_open().
400 		 */
401 		sti->sti_direct = 1;
402 
403 		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
404 		if (so->so_type == SOCK_STREAM && tso != NULL) {
405 			if (SOTOTPI(tso)->sti_direct) {
406 				/*
407 				 * Inherit sti_direct from listener and pass
408 				 * SO_ACCEPTOR open flag to tcp, indicating
409 				 * that this is an accept fast-path instance.
410 				 */
411 				flags |= SO_ACCEPTOR;
412 			} else {
413 				/*
414 				 * sti_direct is not set on listener, meaning
415 				 * that the listener has been converted from
416 				 * a socket to a stream.  Ensure that the
417 				 * acceptor inherits these settings.
418 				 */
419 				sti->sti_direct = 0;
420 				flags &= ~SO_SOCKSTR;
421 			}
422 		}
423 	}
424 
425 	/*
426 	 * Tell local transport that it is talking to sockets.
427 	 */
428 	if (so->so_family == AF_UNIX) {
429 		flags |= SO_SOCKSTR;
430 	}
431 
432 	vp = SOTOV(so);
433 	newdev = vp->v_rdev;
434 	maj = getmajor(newdev);
435 	ASSERT(STREAMSTAB(maj));
436 
437 	error = stropen(vp, &newdev, flags, cr);
438 
439 	stp = vp->v_stream;
440 	if (error == 0) {
441 		if (so->so_flag & SOCLONE)
442 			ASSERT(newdev != vp->v_rdev);
443 		mutex_enter(&so->so_lock);
444 		sti->sti_dev = newdev;
445 		vp->v_rdev = newdev;
446 		mutex_exit(&so->so_lock);
447 
448 		if (stp->sd_flag & STRISTTY) {
449 			/*
450 			 * this is a post SVR4 tty driver - a socket can not
451 			 * be a controlling terminal. Fail the open.
452 			 */
453 			(void) sotpi_close(so, flags, cr);
454 			return (ENOTTY);	/* XXX */
455 		}
456 
457 		ASSERT(stp->sd_wrq != NULL);
458 		sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
459 
460 		/*
461 		 * If caller is interested in doing direct function call
462 		 * interface to/from transport module, probe the module
463 		 * directly beneath the streamhead to see if it qualifies.
464 		 *
465 		 * We turn off the direct interface when qualifications fail.
466 		 * In the acceptor case, we simply turn off the sti_direct
467 		 * flag on the socket. We do the fallback after the accept
468 		 * has completed, before the new socket is returned to the
469 		 * application.
470 		 */
471 		if (sti->sti_direct) {
472 			queue_t *tq = stp->sd_wrq->q_next;
473 
474 			/*
475 			 * sti_direct is currently supported and tested
476 			 * only for tcp/udp; this is the main reason to
477 			 * have the following assertions.
478 			 */
479 			ASSERT(so->so_family == AF_INET ||
480 			    so->so_family == AF_INET6);
481 			ASSERT(so->so_protocol == IPPROTO_UDP ||
482 			    so->so_protocol == IPPROTO_TCP ||
483 			    so->so_protocol == IPPROTO_IP);
484 			ASSERT(so->so_type == SOCK_DGRAM ||
485 			    so->so_type == SOCK_STREAM);
486 
487 			/*
488 			 * Abort direct call interface if the module directly
489 			 * underneath the stream head is not defined with the
490 			 * _D_DIRECT flag.  This could happen in the tcp or
491 			 * udp case, when some other module is autopushed
492 			 * above it, or for some reasons the expected module
493 			 * isn't purely D_MP (which is the main requirement).
494 			 */
495 			if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
496 			    !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
497 				int rval;
498 
499 				/* Continue on without direct calls */
500 				sti->sti_direct = 0;
501 
502 				/*
503 				 * Cannot issue ioctl on fallback socket since
504 				 * there is no conn associated with the queue.
505 				 * The fallback downcall will notify the proto
506 				 * of the change.
507 				 */
508 				if (!(flags & SO_ACCEPTOR) &&
509 				    !(flags & SO_FALLBACK)) {
510 					if ((error = strioctl(vp,
511 					    _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
512 					    cr, &rval)) != 0) {
513 						(void) sotpi_close(so, flags,
514 						    cr);
515 						return (error);
516 					}
517 				}
518 			}
519 		}
520 
521 		if (flags & SO_FALLBACK) {
522 			/*
523 			 * The stream created does not have a conn.
524 			 * do stream set up after conn has been assigned
525 			 */
526 			return (error);
527 		}
528 		if (error = so_strinit(so, tso)) {
529 			(void) sotpi_close(so, flags, cr);
530 			return (error);
531 		}
532 
533 		/* Wildcard */
534 		if (so->so_protocol != so->so_sockparams->sp_protocol) {
535 			int protocol = so->so_protocol;
536 			/*
537 			 * Issue SO_PROTOTYPE setsockopt.
538 			 */
539 			error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
540 			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
541 			if (error != 0) {
542 				(void) sotpi_close(so, flags, cr);
543 				/*
544 				 * Setsockopt often fails with ENOPROTOOPT but
545 				 * socket() should fail with
546 				 * EPROTONOSUPPORT/EPROTOTYPE.
547 				 */
548 				return (EPROTONOSUPPORT);
549 			}
550 		}
551 
552 	} else {
553 		/*
554 		 * While the same socket can not be reopened (unlike specfs)
555 		 * the stream head sets STREOPENFAIL when the autopush fails.
556 		 */
557 		if ((stp != NULL) &&
558 		    (stp->sd_flag & STREOPENFAIL)) {
559 			/*
560 			 * Open failed part way through.
561 			 */
562 			mutex_enter(&stp->sd_lock);
563 			stp->sd_flag &= ~STREOPENFAIL;
564 			mutex_exit(&stp->sd_lock);
565 			(void) sotpi_close(so, flags, cr);
566 			return (error);
567 			/*NOTREACHED*/
568 		}
569 		ASSERT(stp == NULL);
570 	}
571 	TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
572 	    "sockfs open:maj %d vp %p so %p error %d",
573 	    maj, vp, so, error);
574 	return (error);
575 }
576 
577 /*
578  * Bind the socket to an unspecified address in sockfs only.
579  * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
580  * required in all cases.
581  */
582 static void
583 so_automatic_bind(struct sonode *so)
584 {
585 	sotpi_info_t *sti = SOTOTPI(so);
586 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
587 
588 	ASSERT(MUTEX_HELD(&so->so_lock));
589 	ASSERT(!(so->so_state & SS_ISBOUND));
590 	ASSERT(sti->sti_unbind_mp);
591 
592 	ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
593 	bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
594 	sti->sti_laddr_sa->sa_family = so->so_family;
595 	so->so_state |= SS_ISBOUND;
596 }
597 
598 
599 /*
600  * bind the socket.
601  *
602  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
603  * are passed in we allow rebinding. Note that for backwards compatibility
604  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
605  * Thus the rebinding code is currently not executed.
606  *
607  * The constraints for rebinding are:
608  * - it is a SOCK_DGRAM, or
609  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
610  *   and no listen() has been done.
611  * This rebinding code was added based on some language in the XNET book
612  * about not returning EINVAL if the protocol allows rebinding. However,
613  * this language is not present in the Posix socket draft. Thus maybe the
614  * rebinding logic should be deleted from the source.
615  *
616  * A null "name" can be used to unbind the socket if:
617  * - it is a SOCK_DGRAM, or
618  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
619  *   and no listen() has been done.
620  */
621 /* ARGSUSED */
622 static int
623 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
624     socklen_t namelen, int backlog, int flags, struct cred *cr)
625 {
626 	struct T_bind_req	bind_req;
627 	struct T_bind_ack	*bind_ack;
628 	int			error = 0;
629 	mblk_t			*mp;
630 	void			*addr;
631 	t_uscalar_t		addrlen;
632 	int			unbind_on_err = 1;
633 	boolean_t		clear_acceptconn_on_err = B_FALSE;
634 	boolean_t		restore_backlog_on_err = B_FALSE;
635 	int			save_so_backlog;
636 	t_scalar_t		PRIM_type = O_T_BIND_REQ;
637 	boolean_t		tcp_udp_xport;
638 	void			*nl7c = NULL;
639 	sotpi_info_t		*sti = SOTOTPI(so);
640 
641 	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
642 	    (void *)so, (void *)name, namelen, backlog, flags,
643 	    pr_state(so->so_state, so->so_mode)));
644 
645 	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
646 
647 	if (!(flags & _SOBIND_LOCK_HELD)) {
648 		mutex_enter(&so->so_lock);
649 		so_lock_single(so);	/* Set SOLOCKED */
650 	} else {
651 		ASSERT(MUTEX_HELD(&so->so_lock));
652 		ASSERT(so->so_flag & SOLOCKED);
653 	}
654 
655 	/*
656 	 * Make sure that there is a preallocated unbind_req message
657 	 * before binding. This message is allocated when the socket is
658 	 * created but it might have been consumed.
659 	 */
660 	if (sti->sti_unbind_mp == NULL) {
661 		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
662 		/* NOTE: holding so_lock while sleeping */
663 		sti->sti_unbind_mp =
664 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
665 		    cr);
666 	}
667 
668 	if (flags & _SOBIND_REBIND) {
669 		/*
670 		 * Called from solisten after doing an sotpi_unbind() or
671 		 * potentially without the unbind (latter for AF_INET{,6}).
672 		 */
673 		ASSERT(name == NULL && namelen == 0);
674 
675 		if (so->so_family == AF_UNIX) {
676 			ASSERT(sti->sti_ux_bound_vp);
677 			addr = &sti->sti_ux_laddr;
678 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
679 			dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
680 			    "addr 0x%p, vp %p\n",
681 			    addrlen,
682 			    (void *)((struct so_ux_addr *)addr)->soua_vp,
683 			    (void *)sti->sti_ux_bound_vp));
684 		} else {
685 			addr = sti->sti_laddr_sa;
686 			addrlen = (t_uscalar_t)sti->sti_laddr_len;
687 		}
688 	} else if (flags & _SOBIND_UNSPEC) {
689 		ASSERT(name == NULL && namelen == 0);
690 
691 		/*
692 		 * The caller checked SS_ISBOUND but not necessarily
693 		 * under so_lock
694 		 */
695 		if (so->so_state & SS_ISBOUND) {
696 			/* No error */
697 			goto done;
698 		}
699 
700 		/* Set an initial local address */
701 		switch (so->so_family) {
702 		case AF_UNIX:
703 			/*
704 			 * Use an address with same size as struct sockaddr
705 			 * just like BSD.
706 			 */
707 			sti->sti_laddr_len =
708 			    (socklen_t)sizeof (struct sockaddr);
709 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
710 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
711 			sti->sti_laddr_sa->sa_family = so->so_family;
712 
713 			/*
714 			 * Pass down an address with the implicit bind
715 			 * magic number and the rest all zeros.
716 			 * The transport will return a unique address.
717 			 */
718 			sti->sti_ux_laddr.soua_vp = NULL;
719 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
720 			addr = &sti->sti_ux_laddr;
721 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
722 			break;
723 
724 		case AF_INET:
725 		case AF_INET6:
726 			/*
727 			 * An unspecified bind in TPI has a NULL address.
728 			 * Set the address in sockfs to have the sa_family.
729 			 */
730 			sti->sti_laddr_len = (so->so_family == AF_INET) ?
731 			    (socklen_t)sizeof (sin_t) :
732 			    (socklen_t)sizeof (sin6_t);
733 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
734 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
735 			sti->sti_laddr_sa->sa_family = so->so_family;
736 			addr = NULL;
737 			addrlen = 0;
738 			break;
739 
740 		default:
741 			/*
742 			 * An unspecified bind in TPI has a NULL address.
743 			 * Set the address in sockfs to be zero length.
744 			 *
745 			 * Can not assume there is a sa_family for all
746 			 * protocol families. For example, AF_X25 does not
747 			 * have a family field.
748 			 */
749 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
750 			sti->sti_laddr_len = 0;	/* XXX correct? */
751 			addr = NULL;
752 			addrlen = 0;
753 			break;
754 		}
755 
756 	} else {
757 		if (so->so_state & SS_ISBOUND) {
758 			/*
759 			 * If it is ok to rebind the socket, first unbind
760 			 * with the transport. A rebind to the NULL address
761 			 * is interpreted as an unbind.
762 			 * Note that a bind to NULL in BSD does unbind the
763 			 * socket but it fails with EINVAL.
764 			 * Note that regular sockets set SOV_SOCKBSD i.e.
765 			 * _SOBIND_SOCKBSD gets set here hence no type of
766 			 * socket does currently allow rebinding.
767 			 *
768 			 * If the name is NULL just do an unbind.
769 			 */
770 			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
771 			    name != NULL) {
772 				error = EINVAL;
773 				unbind_on_err = 0;
774 				eprintsoline(so, error);
775 				goto done;
776 			}
777 			if ((so->so_mode & SM_CONNREQUIRED) &&
778 			    (so->so_state & SS_CANTREBIND)) {
779 				error = EINVAL;
780 				unbind_on_err = 0;
781 				eprintsoline(so, error);
782 				goto done;
783 			}
784 			error = sotpi_unbind(so, 0);
785 			if (error) {
786 				eprintsoline(so, error);
787 				goto done;
788 			}
789 			ASSERT(!(so->so_state & SS_ISBOUND));
790 			if (name == NULL) {
791 				so->so_state &=
792 				    ~(SS_ISCONNECTED|SS_ISCONNECTING);
793 				goto done;
794 			}
795 		}
796 
797 		/* X/Open requires this check */
798 		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
799 			if (xnet_check_print) {
800 				printf("sockfs: X/Open bind state check "
801 				    "caused EINVAL\n");
802 			}
803 			error = EINVAL;
804 			goto done;
805 		}
806 
807 		switch (so->so_family) {
808 		case AF_UNIX:
809 			/*
810 			 * All AF_UNIX addresses are nul terminated
811 			 * when copied (copyin_name) in so the minimum
812 			 * length is 3 bytes.
813 			 */
814 			if (name == NULL ||
815 			    (ssize_t)namelen <= sizeof (short) + 1) {
816 				error = EISDIR;
817 				eprintsoline(so, error);
818 				goto done;
819 			}
820 			/*
821 			 * Verify so_family matches the bound family.
822 			 * BSD does not check this for AF_UNIX resulting
823 			 * in funny mknods.
824 			 */
825 			if (name->sa_family != so->so_family) {
826 				error = EAFNOSUPPORT;
827 				goto done;
828 			}
829 			break;
830 		case AF_INET:
831 			if (name == NULL) {
832 				error = EINVAL;
833 				eprintsoline(so, error);
834 				goto done;
835 			}
836 			if ((size_t)namelen != sizeof (sin_t)) {
837 				error = name->sa_family != so->so_family ?
838 				    EAFNOSUPPORT : EINVAL;
839 				eprintsoline(so, error);
840 				goto done;
841 			}
842 			if ((flags & _SOBIND_XPG4_2) &&
843 			    (name->sa_family != so->so_family)) {
844 				/*
845 				 * This check has to be made for X/Open
846 				 * sockets however application failures have
847 				 * been observed when it is applied to
848 				 * all sockets.
849 				 */
850 				error = EAFNOSUPPORT;
851 				eprintsoline(so, error);
852 				goto done;
853 			}
854 			/*
855 			 * Force a zero sa_family to match so_family.
856 			 *
857 			 * Some programs like inetd(1M) don't set the
858 			 * family field. Other programs leave
859 			 * sin_family set to garbage - SunOS 4.X does
860 			 * not check the family field on a bind.
861 			 * We use the family field that
862 			 * was passed in to the socket() call.
863 			 */
864 			name->sa_family = so->so_family;
865 			break;
866 
867 		case AF_INET6: {
868 #ifdef DEBUG
869 			sin6_t *sin6 = (sin6_t *)name;
870 #endif /* DEBUG */
871 
872 			if (name == NULL) {
873 				error = EINVAL;
874 				eprintsoline(so, error);
875 				goto done;
876 			}
877 			if ((size_t)namelen != sizeof (sin6_t)) {
878 				error = name->sa_family != so->so_family ?
879 				    EAFNOSUPPORT : EINVAL;
880 				eprintsoline(so, error);
881 				goto done;
882 			}
883 			if (name->sa_family != so->so_family) {
884 				/*
885 				 * With IPv6 we require the family to match
886 				 * unlike in IPv4.
887 				 */
888 				error = EAFNOSUPPORT;
889 				eprintsoline(so, error);
890 				goto done;
891 			}
892 #ifdef DEBUG
893 			/*
894 			 * Verify that apps don't forget to clear
895 			 * sin6_scope_id etc
896 			 */
897 			if (sin6->sin6_scope_id != 0 &&
898 			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
899 				zcmn_err(getzoneid(), CE_WARN,
900 				    "bind with uninitialized sin6_scope_id "
901 				    "(%d) on socket. Pid = %d\n",
902 				    (int)sin6->sin6_scope_id,
903 				    (int)curproc->p_pid);
904 			}
905 			if (sin6->__sin6_src_id != 0) {
906 				zcmn_err(getzoneid(), CE_WARN,
907 				    "bind with uninitialized __sin6_src_id "
908 				    "(%d) on socket. Pid = %d\n",
909 				    (int)sin6->__sin6_src_id,
910 				    (int)curproc->p_pid);
911 			}
912 #endif /* DEBUG */
913 			break;
914 		}
915 		default:
916 			/*
917 			 * Don't do any length or sa_family check to allow
918 			 * non-sockaddr style addresses.
919 			 */
920 			if (name == NULL) {
921 				error = EINVAL;
922 				eprintsoline(so, error);
923 				goto done;
924 			}
925 			break;
926 		}
927 
928 		if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
929 			error = ENAMETOOLONG;
930 			eprintsoline(so, error);
931 			goto done;
932 		}
933 		/*
934 		 * Save local address.
935 		 */
936 		sti->sti_laddr_len = (socklen_t)namelen;
937 		ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
938 		bcopy(name, sti->sti_laddr_sa, namelen);
939 
940 		addr = sti->sti_laddr_sa;
941 		addrlen = (t_uscalar_t)sti->sti_laddr_len;
942 		switch (so->so_family) {
943 		case AF_INET6:
944 		case AF_INET:
945 			break;
946 		case AF_UNIX: {
947 			struct sockaddr_un *soun =
948 			    (struct sockaddr_un *)sti->sti_laddr_sa;
949 			struct vnode *vp, *rvp;
950 			struct vattr vattr;
951 
952 			ASSERT(sti->sti_ux_bound_vp == NULL);
953 			/*
954 			 * Create vnode for the specified path name.
955 			 * Keep vnode held with a reference in sti_ux_bound_vp.
956 			 * Use the vnode pointer as the address used in the
957 			 * bind with the transport.
958 			 *
959 			 * Use the same mode as in BSD. In particular this does
960 			 * not observe the umask.
961 			 */
962 			/* MAXPATHLEN + soun_family + nul termination */
963 			if (sti->sti_laddr_len >
964 			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
965 				error = ENAMETOOLONG;
966 				eprintsoline(so, error);
967 				goto done;
968 			}
969 			vattr.va_type = VSOCK;
970 			vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
971 			vattr.va_mask = AT_TYPE|AT_MODE;
972 			/* NOTE: holding so_lock */
973 			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
974 			    EXCL, 0, &vp, CRMKNOD, 0, 0);
975 			if (error) {
976 				if (error == EEXIST)
977 					error = EADDRINUSE;
978 				eprintsoline(so, error);
979 				goto done;
980 			}
981 			/*
982 			 * Establish pointer from the underlying filesystem
983 			 * vnode to the socket node.
984 			 * sti_ux_bound_vp and v_stream->sd_vnode form the
985 			 * cross-linkage between the underlying filesystem
986 			 * node and the socket node.
987 			 */
988 
989 			if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
990 				VN_HOLD(rvp);
991 				VN_RELE(vp);
992 				vp = rvp;
993 			}
994 
995 			ASSERT(SOTOV(so)->v_stream);
996 			mutex_enter(&vp->v_lock);
997 			vp->v_stream = SOTOV(so)->v_stream;
998 			sti->sti_ux_bound_vp = vp;
999 			mutex_exit(&vp->v_lock);
1000 
1001 			/*
1002 			 * Use the vnode pointer value as a unique address
1003 			 * (together with the magic number to avoid conflicts
1004 			 * with implicit binds) in the transport provider.
1005 			 */
1006 			sti->sti_ux_laddr.soua_vp =
1007 			    (void *)sti->sti_ux_bound_vp;
1008 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
1009 			addr = &sti->sti_ux_laddr;
1010 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1011 			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1012 			    addrlen,
1013 			    (void *)((struct so_ux_addr *)addr)->soua_vp));
1014 			break;
1015 		}
1016 		} /* end switch (so->so_family) */
1017 	}
1018 
1019 	/*
1020 	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1021 	 * the transport can start passing up T_CONN_IND messages
1022 	 * as soon as it receives the bind req and strsock_proto()
1023 	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1024 	 */
1025 	if (flags & _SOBIND_LISTEN) {
1026 		if ((so->so_state & SS_ACCEPTCONN) == 0)
1027 			clear_acceptconn_on_err = B_TRUE;
1028 		save_so_backlog = so->so_backlog;
1029 		restore_backlog_on_err = B_TRUE;
1030 		so->so_state |= SS_ACCEPTCONN;
1031 		so->so_backlog = backlog;
1032 	}
1033 
1034 	/*
1035 	 * If NL7C addr(s) have been configured check for addr/port match,
1036 	 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
1037 	 *
1038 	 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
1039 	 * family sockets only. If match mark as such.
1040 	 */
1041 	if (nl7c_enabled && ((addr != NULL &&
1042 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
1043 	    (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
1044 	    sti->sti_nl7c_flags == NL7C_AF_NCA)) {
1045 		/*
1046 		 * NL7C is not supported in non-global zones,
1047 		 * we enforce this restriction here.
1048 		 */
1049 		if (so->so_zoneid == GLOBAL_ZONEID) {
1050 			/* An NL7C socket, mark it */
1051 			sti->sti_nl7c_flags |= NL7C_ENABLED;
1052 			if (nl7c == NULL) {
1053 				/*
1054 				 * Was an AF_NCA bind() so add it to the
1055 				 * addr list for reporting purposes.
1056 				 */
1057 				nl7c = nl7c_add_addr(addr, addrlen);
1058 			}
1059 		} else
1060 			nl7c = NULL;
1061 	}
1062 
1063 	/*
1064 	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1065 	 * for other transports we will send in a O_T_BIND_REQ.
1066 	 */
1067 	if (tcp_udp_xport &&
1068 	    (so->so_family == AF_INET || so->so_family == AF_INET6))
1069 		PRIM_type = T_BIND_REQ;
1070 
1071 	bind_req.PRIM_type = PRIM_type;
1072 	bind_req.ADDR_length = addrlen;
1073 	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1074 	bind_req.CONIND_number = backlog;
1075 	/* NOTE: holding so_lock while sleeping */
1076 	mp = soallocproto2(&bind_req, sizeof (bind_req),
1077 	    addr, addrlen, 0, _ALLOC_SLEEP, cr);
1078 	sti->sti_laddr_valid = 0;
1079 
1080 	/* Done using sti_laddr_sa - can drop the lock */
1081 	mutex_exit(&so->so_lock);
1082 
1083 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1084 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1085 	if (error) {
1086 		eprintsoline(so, error);
1087 		mutex_enter(&so->so_lock);
1088 		goto done;
1089 	}
1090 
1091 	mutex_enter(&so->so_lock);
1092 	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1093 	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1094 	if (error) {
1095 		eprintsoline(so, error);
1096 		goto done;
1097 	}
1098 	ASSERT(mp);
1099 	/*
1100 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1101 	 * strsock_proto while the lock was dropped above, the bind
1102 	 * is allowed to complete.
1103 	 */
1104 
1105 	/* Mark as bound. This will be undone if we detect errors below. */
1106 	if (flags & _SOBIND_NOXLATE) {
1107 		ASSERT(so->so_family == AF_UNIX);
1108 		sti->sti_faddr_noxlate = 1;
1109 	}
1110 	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1111 	so->so_state |= SS_ISBOUND;
1112 	ASSERT(sti->sti_unbind_mp);
1113 
1114 	/* note that we've already set SS_ACCEPTCONN above */
1115 
1116 	/*
1117 	 * Recompute addrlen - an unspecied bind sent down an
1118 	 * address of length zero but we expect the appropriate length
1119 	 * in return.
1120 	 */
1121 	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1122 	    sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1123 
1124 	bind_ack = (struct T_bind_ack *)mp->b_rptr;
1125 	/*
1126 	 * The alignment restriction is really too strict but
1127 	 * we want enough alignment to inspect the fields of
1128 	 * a sockaddr_in.
1129 	 */
1130 	addr = sogetoff(mp, bind_ack->ADDR_offset,
1131 	    bind_ack->ADDR_length,
1132 	    __TPI_ALIGN_SIZE);
1133 	if (addr == NULL) {
1134 		freemsg(mp);
1135 		error = EPROTO;
1136 		eprintsoline(so, error);
1137 		goto done;
1138 	}
1139 	if (!(flags & _SOBIND_UNSPEC)) {
1140 		/*
1141 		 * Verify that the transport didn't return something we
1142 		 * did not want e.g. an address other than what we asked for.
1143 		 *
1144 		 * NOTE: These checks would go away if/when we switch to
1145 		 * using the new TPI (in which the transport would fail
1146 		 * the request instead of assigning a different address).
1147 		 *
1148 		 * NOTE2: For protocols that we don't know (i.e. any
1149 		 * other than AF_INET6, AF_INET and AF_UNIX), we
1150 		 * cannot know if the transport should be expected to
1151 		 * return the same address as that requested.
1152 		 *
1153 		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1154 		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1155 		 *
1156 		 * For example, in the case of netatalk it may be
1157 		 * inappropriate for the transport to return the
1158 		 * requested address (as it may have allocated a local
1159 		 * port number in behaviour similar to that of an
1160 		 * AF_INET bind request with a port number of zero).
1161 		 *
1162 		 * Given the definition of O_T_BIND_REQ, where the
1163 		 * transport may bind to an address other than the
1164 		 * requested address, it's not possible to determine
1165 		 * whether a returned address that differs from the
1166 		 * requested address is a reason to fail (because the
1167 		 * requested address was not available) or succeed
1168 		 * (because the transport allocated an appropriate
1169 		 * address and/or port).
1170 		 *
1171 		 * sockfs currently requires that the transport return
1172 		 * the requested address in the T_BIND_ACK, unless
1173 		 * there is code here to allow for any discrepancy.
1174 		 * Such code exists for AF_INET and AF_INET6.
1175 		 *
1176 		 * Netatalk chooses to return the requested address
1177 		 * rather than the (correct) allocated address.  This
1178 		 * means that netatalk violates the TPI specification
1179 		 * (and would not function correctly if used from a
1180 		 * TLI application), but it does mean that it works
1181 		 * with sockfs.
1182 		 *
1183 		 * As noted above, using the newer XTI bind primitive
1184 		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1185 		 * allow sockfs to be more sure about whether or not
1186 		 * the bind request had succeeded (as transports are
1187 		 * not permitted to bind to a different address than
1188 		 * that requested - they must return failure).
1189 		 * Unfortunately, support for T_BIND_REQ may not be
1190 		 * present in all transport implementations (netatalk,
1191 		 * for example, doesn't have it), making the
1192 		 * transition difficult.
1193 		 */
1194 		if (bind_ack->ADDR_length != addrlen) {
1195 			/* Assumes that the requested address was in use */
1196 			freemsg(mp);
1197 			error = EADDRINUSE;
1198 			eprintsoline(so, error);
1199 			goto done;
1200 		}
1201 
1202 		switch (so->so_family) {
1203 		case AF_INET6:
1204 		case AF_INET: {
1205 			sin_t *rname, *aname;
1206 
1207 			rname = (sin_t *)addr;
1208 			aname = (sin_t *)sti->sti_laddr_sa;
1209 
1210 			/*
1211 			 * Take advantage of the alignment
1212 			 * of sin_port and sin6_port which fall
1213 			 * in the same place in their data structures.
1214 			 * Just use sin_port for either address family.
1215 			 *
1216 			 * This may become a problem if (heaven forbid)
1217 			 * there's a separate ipv6port_reserved... :-P
1218 			 *
1219 			 * Binding to port 0 has the semantics of letting
1220 			 * the transport bind to any port.
1221 			 *
1222 			 * If the transport is TCP or UDP since we had sent
1223 			 * a T_BIND_REQ we would not get a port other than
1224 			 * what we asked for.
1225 			 */
1226 			if (tcp_udp_xport) {
1227 				/*
1228 				 * Pick up the new port number if we bound to
1229 				 * port 0.
1230 				 */
1231 				if (aname->sin_port == 0)
1232 					aname->sin_port = rname->sin_port;
1233 				sti->sti_laddr_valid = 1;
1234 				break;
1235 			}
1236 			if (aname->sin_port != 0 &&
1237 			    aname->sin_port != rname->sin_port) {
1238 				freemsg(mp);
1239 				error = EADDRINUSE;
1240 				eprintsoline(so, error);
1241 				goto done;
1242 			}
1243 			/*
1244 			 * Pick up the new port number if we bound to port 0.
1245 			 */
1246 			aname->sin_port = rname->sin_port;
1247 
1248 			/*
1249 			 * Unfortunately, addresses aren't _quite_ the same.
1250 			 */
1251 			if (so->so_family == AF_INET) {
1252 				if (aname->sin_addr.s_addr !=
1253 				    rname->sin_addr.s_addr) {
1254 					freemsg(mp);
1255 					error = EADDRNOTAVAIL;
1256 					eprintsoline(so, error);
1257 					goto done;
1258 				}
1259 			} else {
1260 				sin6_t *rname6 = (sin6_t *)rname;
1261 				sin6_t *aname6 = (sin6_t *)aname;
1262 
1263 				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1264 				    &rname6->sin6_addr)) {
1265 					freemsg(mp);
1266 					error = EADDRNOTAVAIL;
1267 					eprintsoline(so, error);
1268 					goto done;
1269 				}
1270 			}
1271 			break;
1272 		}
1273 		case AF_UNIX:
1274 			if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1275 				freemsg(mp);
1276 				error = EADDRINUSE;
1277 				eprintsoline(so, error);
1278 				eprintso(so,
1279 				    ("addrlen %d, addr 0x%x, vp %p\n",
1280 				    addrlen, *((int *)addr),
1281 				    (void *)sti->sti_ux_bound_vp));
1282 				goto done;
1283 			}
1284 			sti->sti_laddr_valid = 1;
1285 			break;
1286 		default:
1287 			/*
1288 			 * NOTE: This assumes that addresses can be
1289 			 * byte-compared for equivalence.
1290 			 */
1291 			if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1292 				freemsg(mp);
1293 				error = EADDRINUSE;
1294 				eprintsoline(so, error);
1295 				goto done;
1296 			}
1297 			/*
1298 			 * Don't mark sti_laddr_valid, as we cannot be
1299 			 * sure that the returned address is the real
1300 			 * bound address when talking to an unknown
1301 			 * transport.
1302 			 */
1303 			break;
1304 		}
1305 	} else {
1306 		/*
1307 		 * Save for returned address for getsockname.
1308 		 * Needed for unspecific bind unless transport supports
1309 		 * the TI_GETMYNAME ioctl.
1310 		 * Do this for AF_INET{,6} even though they do, as
1311 		 * caching info here is much better performance than
1312 		 * a TPI/STREAMS trip to the transport for getsockname.
1313 		 * Any which can't for some reason _must_ _not_ set
1314 		 * sti_laddr_valid here for the caching version of
1315 		 * getsockname to not break;
1316 		 */
1317 		switch (so->so_family) {
1318 		case AF_UNIX:
1319 			/*
1320 			 * Record the address bound with the transport
1321 			 * for use by socketpair.
1322 			 */
1323 			bcopy(addr, &sti->sti_ux_laddr, addrlen);
1324 			sti->sti_laddr_valid = 1;
1325 			break;
1326 		case AF_INET:
1327 		case AF_INET6:
1328 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1329 			bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1330 			sti->sti_laddr_valid = 1;
1331 			break;
1332 		default:
1333 			/*
1334 			 * Don't mark sti_laddr_valid, as we cannot be
1335 			 * sure that the returned address is the real
1336 			 * bound address when talking to an unknown
1337 			 * transport.
1338 			 */
1339 			break;
1340 		}
1341 	}
1342 
1343 	if (nl7c != NULL) {
1344 		/* Register listen()er sonode pointer with NL7C */
1345 		nl7c_listener_addr(nl7c, so);
1346 	}
1347 
1348 	freemsg(mp);
1349 
1350 done:
1351 	if (error) {
1352 		/* reset state & backlog to values held on entry */
1353 		if (clear_acceptconn_on_err == B_TRUE)
1354 			so->so_state &= ~SS_ACCEPTCONN;
1355 		if (restore_backlog_on_err == B_TRUE)
1356 			so->so_backlog = save_so_backlog;
1357 
1358 		if (unbind_on_err && so->so_state & SS_ISBOUND) {
1359 			int err;
1360 
1361 			err = sotpi_unbind(so, 0);
1362 			/* LINTED - statement has no consequent: if */
1363 			if (err) {
1364 				eprintsoline(so, error);
1365 			} else {
1366 				ASSERT(!(so->so_state & SS_ISBOUND));
1367 			}
1368 		}
1369 	}
1370 	if (!(flags & _SOBIND_LOCK_HELD)) {
1371 		so_unlock_single(so, SOLOCKED);
1372 		mutex_exit(&so->so_lock);
1373 	} else {
1374 		ASSERT(MUTEX_HELD(&so->so_lock));
1375 		ASSERT(so->so_flag & SOLOCKED);
1376 	}
1377 	return (error);
1378 }
1379 
1380 /* bind the socket */
1381 static int
1382 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1383     int flags, struct cred *cr)
1384 {
1385 	if ((flags & _SOBIND_SOCKETPAIR) == 0)
1386 		return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1387 
1388 	flags &= ~_SOBIND_SOCKETPAIR;
1389 	return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1390 }
1391 
1392 /*
1393  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1394  * address, or when listen needs to unbind and bind.
1395  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1396  * so that a sobind can pick them up.
1397  */
1398 static int
1399 sotpi_unbind(struct sonode *so, int flags)
1400 {
1401 	struct T_unbind_req	unbind_req;
1402 	int			error = 0;
1403 	mblk_t			*mp;
1404 	sotpi_info_t		*sti = SOTOTPI(so);
1405 
1406 	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1407 	    (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1408 
1409 	ASSERT(MUTEX_HELD(&so->so_lock));
1410 	ASSERT(so->so_flag & SOLOCKED);
1411 
1412 	if (!(so->so_state & SS_ISBOUND)) {
1413 		error = EINVAL;
1414 		eprintsoline(so, error);
1415 		goto done;
1416 	}
1417 
1418 	mutex_exit(&so->so_lock);
1419 
1420 	/*
1421 	 * Flush the read and write side (except stream head read queue)
1422 	 * and send down T_UNBIND_REQ.
1423 	 */
1424 	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1425 
1426 	unbind_req.PRIM_type = T_UNBIND_REQ;
1427 	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1428 	    0, _ALLOC_SLEEP, CRED());
1429 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1430 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1431 	mutex_enter(&so->so_lock);
1432 	if (error) {
1433 		eprintsoline(so, error);
1434 		goto done;
1435 	}
1436 
1437 	error = sowaitokack(so, T_UNBIND_REQ);
1438 	if (error) {
1439 		eprintsoline(so, error);
1440 		goto done;
1441 	}
1442 
1443 	/*
1444 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1445 	 * strsock_proto while the lock was dropped above, the unbind
1446 	 * is allowed to complete.
1447 	 */
1448 	if (!(flags & _SOUNBIND_REBIND)) {
1449 		/*
1450 		 * Clear out bound address.
1451 		 */
1452 		vnode_t *vp;
1453 
1454 		if ((vp = sti->sti_ux_bound_vp) != NULL) {
1455 			sti->sti_ux_bound_vp = NULL;
1456 			vn_rele_stream(vp);
1457 		}
1458 		/* Clear out address */
1459 		sti->sti_laddr_len = 0;
1460 	}
1461 	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1462 	sti->sti_laddr_valid = 0;
1463 
1464 done:
1465 
1466 	/* If the caller held the lock don't release it here */
1467 	ASSERT(MUTEX_HELD(&so->so_lock));
1468 	ASSERT(so->so_flag & SOLOCKED);
1469 
1470 	return (error);
1471 }
1472 
1473 /*
1474  * listen on the socket.
1475  * For TPI conforming transports this has to first unbind with the transport
1476  * and then bind again using the new backlog.
1477  */
1478 /* ARGSUSED */
1479 int
1480 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1481 {
1482 	int		error = 0;
1483 	sotpi_info_t	*sti = SOTOTPI(so);
1484 
1485 	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1486 	    (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1487 
1488 	if (sti->sti_serv_type == T_CLTS)
1489 		return (EOPNOTSUPP);
1490 
1491 	/*
1492 	 * If the socket is ready to accept connections already, then
1493 	 * return without doing anything.  This avoids a problem where
1494 	 * a second listen() call fails if a connection is pending and
1495 	 * leaves the socket unbound. Only when we are not unbinding
1496 	 * with the transport can we safely increase the backlog.
1497 	 */
1498 	if (so->so_state & SS_ACCEPTCONN &&
1499 	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1500 	    /*CONSTCOND*/
1501 	    !solisten_tpi_tcp))
1502 		return (0);
1503 
1504 	if (so->so_state & SS_ISCONNECTED)
1505 		return (EINVAL);
1506 
1507 	mutex_enter(&so->so_lock);
1508 	so_lock_single(so);	/* Set SOLOCKED */
1509 
1510 	/*
1511 	 * If the listen doesn't change the backlog we do nothing.
1512 	 * This avoids an EPROTO error from the transport.
1513 	 */
1514 	if ((so->so_state & SS_ACCEPTCONN) &&
1515 	    so->so_backlog == backlog)
1516 		goto done;
1517 
1518 	if (!(so->so_state & SS_ISBOUND)) {
1519 		/*
1520 		 * Must have been explicitly bound in the UNIX domain.
1521 		 */
1522 		if (so->so_family == AF_UNIX) {
1523 			error = EINVAL;
1524 			goto done;
1525 		}
1526 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1527 		    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1528 	} else if (backlog > 0) {
1529 		/*
1530 		 * AF_INET{,6} hack to avoid losing the port.
1531 		 * Assumes that all AF_INET{,6} transports can handle a
1532 		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1533 		 * has already bound thus it is possible to avoid the unbind.
1534 		 */
1535 		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1536 		    /*CONSTCOND*/
1537 		    !solisten_tpi_tcp)) {
1538 			error = sotpi_unbind(so, _SOUNBIND_REBIND);
1539 			if (error)
1540 				goto done;
1541 		}
1542 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1543 		    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1544 	} else {
1545 		so->so_state |= SS_ACCEPTCONN;
1546 		so->so_backlog = backlog;
1547 	}
1548 	if (error)
1549 		goto done;
1550 	ASSERT(so->so_state & SS_ACCEPTCONN);
1551 done:
1552 	so_unlock_single(so, SOLOCKED);
1553 	mutex_exit(&so->so_lock);
1554 	return (error);
1555 }
1556 
1557 /*
1558  * Disconnect either a specified seqno or all (-1).
1559  * The former is used on listening sockets only.
1560  *
1561  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1562  * the current use of sodisconnect(seqno == -1) is only for shutdown
1563  * so there is no point (and potentially incorrect) to unbind.
1564  */
1565 static int
1566 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1567 {
1568 	struct T_discon_req	discon_req;
1569 	int			error = 0;
1570 	mblk_t			*mp;
1571 
1572 	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1573 	    (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1574 
1575 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1576 		mutex_enter(&so->so_lock);
1577 		so_lock_single(so);	/* Set SOLOCKED */
1578 	} else {
1579 		ASSERT(MUTEX_HELD(&so->so_lock));
1580 		ASSERT(so->so_flag & SOLOCKED);
1581 	}
1582 
1583 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1584 		error = EINVAL;
1585 		eprintsoline(so, error);
1586 		goto done;
1587 	}
1588 
1589 	mutex_exit(&so->so_lock);
1590 	/*
1591 	 * Flush the write side (unless this is a listener)
1592 	 * and then send down a T_DISCON_REQ.
1593 	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1594 	 * and other messages.)
1595 	 */
1596 	if (!(so->so_state & SS_ACCEPTCONN))
1597 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1598 
1599 	discon_req.PRIM_type = T_DISCON_REQ;
1600 	discon_req.SEQ_number = seqno;
1601 	mp = soallocproto1(&discon_req, sizeof (discon_req),
1602 	    0, _ALLOC_SLEEP, CRED());
1603 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1604 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1605 	mutex_enter(&so->so_lock);
1606 	if (error) {
1607 		eprintsoline(so, error);
1608 		goto done;
1609 	}
1610 
1611 	error = sowaitokack(so, T_DISCON_REQ);
1612 	if (error) {
1613 		eprintsoline(so, error);
1614 		goto done;
1615 	}
1616 	/*
1617 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1618 	 * strsock_proto while the lock was dropped above, the disconnect
1619 	 * is allowed to complete. However, it is not possible to
1620 	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1621 	 */
1622 	so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
1623 	SOTOTPI(so)->sti_laddr_valid = 0;
1624 	SOTOTPI(so)->sti_faddr_valid = 0;
1625 done:
1626 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1627 		so_unlock_single(so, SOLOCKED);
1628 		mutex_exit(&so->so_lock);
1629 	} else {
1630 		/* If the caller held the lock don't release it here */
1631 		ASSERT(MUTEX_HELD(&so->so_lock));
1632 		ASSERT(so->so_flag & SOLOCKED);
1633 	}
1634 	return (error);
1635 }
1636 
1637 /* ARGSUSED */
1638 int
1639 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1640     struct sonode **nsop)
1641 {
1642 	struct T_conn_ind	*conn_ind;
1643 	struct T_conn_res	*conn_res;
1644 	int			error = 0;
1645 	mblk_t			*mp, *ack_mp;
1646 	struct sonode		*nso;
1647 	vnode_t			*nvp;
1648 	void			*src;
1649 	t_uscalar_t		srclen;
1650 	void			*opt;
1651 	t_uscalar_t		optlen;
1652 	t_scalar_t		PRIM_type;
1653 	t_scalar_t		SEQ_number;
1654 	size_t			sinlen;
1655 	sotpi_info_t		*sti = SOTOTPI(so);
1656 	sotpi_info_t		*nsti;
1657 
1658 	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1659 	    (void *)so, fflag, (void *)nsop,
1660 	    pr_state(so->so_state, so->so_mode)));
1661 
1662 	/*
1663 	 * Defer single-threading the accepting socket until
1664 	 * the T_CONN_IND has been received and parsed and the
1665 	 * new sonode has been opened.
1666 	 */
1667 
1668 	/* Check that we are not already connected */
1669 	if ((so->so_state & SS_ACCEPTCONN) == 0)
1670 		goto conn_bad;
1671 again:
1672 	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1673 		goto e_bad;
1674 
1675 	ASSERT(mp != NULL);
1676 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1677 
1678 	/*
1679 	 * Save SEQ_number for error paths.
1680 	 */
1681 	SEQ_number = conn_ind->SEQ_number;
1682 
1683 	srclen = conn_ind->SRC_length;
1684 	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1685 	if (src == NULL) {
1686 		error = EPROTO;
1687 		freemsg(mp);
1688 		eprintsoline(so, error);
1689 		goto disconnect_unlocked;
1690 	}
1691 	optlen = conn_ind->OPT_length;
1692 	switch (so->so_family) {
1693 	case AF_INET:
1694 	case AF_INET6:
1695 		if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1696 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
1697 			    &opt, conn_ind->OPT_length);
1698 		} else {
1699 			/*
1700 			 * The transport (in this case TCP) hasn't sent up
1701 			 * a pointer to an instance for the accept fast-path.
1702 			 * Disable fast-path completely because the call to
1703 			 * sotpi_create() below would otherwise create an
1704 			 * incomplete TCP instance, which would lead to
1705 			 * problems when sockfs sends a normal T_CONN_RES
1706 			 * message down the new stream.
1707 			 */
1708 			if (sti->sti_direct) {
1709 				int rval;
1710 				/*
1711 				 * For consistency we inform tcp to disable
1712 				 * direct interface on the listener, though
1713 				 * we can certainly live without doing this
1714 				 * because no data will ever travel upstream
1715 				 * on the listening socket.
1716 				 */
1717 				sti->sti_direct = 0;
1718 				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1719 				    0, 0, K_TO_K, cr, &rval);
1720 			}
1721 			opt = NULL;
1722 			optlen = 0;
1723 		}
1724 		break;
1725 	case AF_UNIX:
1726 	default:
1727 		if (optlen != 0) {
1728 			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1729 			    __TPI_ALIGN_SIZE);
1730 			if (opt == NULL) {
1731 				error = EPROTO;
1732 				freemsg(mp);
1733 				eprintsoline(so, error);
1734 				goto disconnect_unlocked;
1735 			}
1736 		}
1737 		if (so->so_family == AF_UNIX) {
1738 			if (!sti->sti_faddr_noxlate) {
1739 				src = NULL;
1740 				srclen = 0;
1741 			}
1742 			/* Extract src address from options */
1743 			if (optlen != 0)
1744 				so_getopt_srcaddr(opt, optlen, &src, &srclen);
1745 		}
1746 		break;
1747 	}
1748 
1749 	/*
1750 	 * Create the new socket.
1751 	 */
1752 	nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1753 	if (nso == NULL) {
1754 		ASSERT(error != 0);
1755 		/*
1756 		 * Accept can not fail with ENOBUFS. sotpi_create
1757 		 * sleeps waiting for memory until a signal is caught
1758 		 * so return EINTR.
1759 		 */
1760 		freemsg(mp);
1761 		if (error == ENOBUFS)
1762 			error = EINTR;
1763 		goto e_disc_unl;
1764 	}
1765 	nvp = SOTOV(nso);
1766 	nsti = SOTOTPI(nso);
1767 
1768 #ifdef DEBUG
1769 	/*
1770 	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1771 	 * it's inherited early to allow debugging of the accept code itself.
1772 	 */
1773 	nso->so_options |= so->so_options & SO_DEBUG;
1774 #endif /* DEBUG */
1775 
1776 	/*
1777 	 * Save the SRC address from the T_CONN_IND
1778 	 * for getpeername to work on AF_UNIX and on transports that do not
1779 	 * support TI_GETPEERNAME.
1780 	 *
1781 	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1782 	 * copyin_name().
1783 	 */
1784 	if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1785 		error = EINVAL;
1786 		freemsg(mp);
1787 		eprintsoline(so, error);
1788 		goto disconnect_vp_unlocked;
1789 	}
1790 	nsti->sti_faddr_len = (socklen_t)srclen;
1791 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1792 	bcopy(src, nsti->sti_faddr_sa, srclen);
1793 	nsti->sti_faddr_valid = 1;
1794 
1795 	/*
1796 	 * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1797 	 */
1798 	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1799 	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1800 		cred_t	*cr;
1801 		pid_t	cpid;
1802 
1803 		cr = msg_getcred(mp, &cpid);
1804 		if (cr != NULL) {
1805 			crhold(cr);
1806 			nso->so_peercred = cr;
1807 			nso->so_cpid = cpid;
1808 		}
1809 		freemsg(mp);
1810 
1811 		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1812 		    sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1813 		if (mp == NULL) {
1814 			/*
1815 			 * Accept can not fail with ENOBUFS.
1816 			 * A signal was caught so return EINTR.
1817 			 */
1818 			error = EINTR;
1819 			eprintsoline(so, error);
1820 			goto disconnect_vp_unlocked;
1821 		}
1822 		conn_res = (struct T_conn_res *)mp->b_rptr;
1823 	} else {
1824 		/*
1825 		 * For efficency reasons we use msg_extractcred; no crhold
1826 		 * needed since db_credp is cleared (i.e., we move the cred
1827 		 * from the message to so_peercred.
1828 		 */
1829 		nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1830 
1831 		mp->b_rptr = DB_BASE(mp);
1832 		conn_res = (struct T_conn_res *)mp->b_rptr;
1833 		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1834 
1835 		mblk_setcred(mp, cr, curproc->p_pid);
1836 	}
1837 
1838 	/*
1839 	 * New socket must be bound at least in sockfs and, except for AF_INET,
1840 	 * (or AF_INET6) it also has to be bound in the transport provider.
1841 	 * We set the local address in the sonode from the T_OK_ACK of the
1842 	 * T_CONN_RES. For this reason the address we bind to here isn't
1843 	 * important.
1844 	 */
1845 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1846 	    /*CONSTCOND*/
1847 	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1848 		/*
1849 		 * Optimization for AF_INET{,6} transports
1850 		 * that can handle a T_CONN_RES without being bound.
1851 		 */
1852 		mutex_enter(&nso->so_lock);
1853 		so_automatic_bind(nso);
1854 		mutex_exit(&nso->so_lock);
1855 	} else {
1856 		/* Perform NULL bind with the transport provider. */
1857 		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1858 		    cr)) != 0) {
1859 			ASSERT(error != ENOBUFS);
1860 			freemsg(mp);
1861 			eprintsoline(nso, error);
1862 			goto disconnect_vp_unlocked;
1863 		}
1864 	}
1865 
1866 	/*
1867 	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1868 	 * so that any data arriving on the new socket will cause the
1869 	 * appropriate signals to be delivered for the new socket.
1870 	 *
1871 	 * No other thread (except strsock_proto and strsock_misc)
1872 	 * can access the new socket thus we relax the locking.
1873 	 */
1874 	nso->so_pgrp = so->so_pgrp;
1875 	nso->so_state |= so->so_state & SS_ASYNC;
1876 	nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1877 
1878 	if (nso->so_pgrp != 0) {
1879 		if ((error = so_set_events(nso, nvp, cr)) != 0) {
1880 			eprintsoline(nso, error);
1881 			error = 0;
1882 			nso->so_pgrp = 0;
1883 		}
1884 	}
1885 
1886 	/*
1887 	 * Make note of the socket level options. TCP and IP level options
1888 	 * are already inherited. We could do all this after accept is
1889 	 * successful but doing it here simplifies code and no harm done
1890 	 * for error case.
1891 	 */
1892 	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1893 	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1894 	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1895 	nso->so_sndbuf = so->so_sndbuf;
1896 	nso->so_rcvbuf = so->so_rcvbuf;
1897 	if (nso->so_options & SO_LINGER)
1898 		nso->so_linger = so->so_linger;
1899 
1900 	/*
1901 	 * Note that the following sti_direct code path should be
1902 	 * removed once we are confident that the direct sockets
1903 	 * do not result in any degradation.
1904 	 */
1905 	if (sti->sti_direct) {
1906 
1907 		ASSERT(opt != NULL);
1908 
1909 		conn_res->OPT_length = optlen;
1910 		conn_res->OPT_offset = MBLKL(mp);
1911 		bcopy(&opt, mp->b_wptr, optlen);
1912 		mp->b_wptr += optlen;
1913 		conn_res->PRIM_type = T_CONN_RES;
1914 		conn_res->ACCEPTOR_id = 0;
1915 		PRIM_type = T_CONN_RES;
1916 
1917 		/* Send down the T_CONN_RES on acceptor STREAM */
1918 		error = kstrputmsg(SOTOV(nso), mp, NULL,
1919 		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1920 		if (error) {
1921 			mutex_enter(&so->so_lock);
1922 			so_lock_single(so);
1923 			eprintsoline(so, error);
1924 			goto disconnect_vp;
1925 		}
1926 		mutex_enter(&nso->so_lock);
1927 		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1928 		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1929 		if (error) {
1930 			mutex_exit(&nso->so_lock);
1931 			mutex_enter(&so->so_lock);
1932 			so_lock_single(so);
1933 			eprintsoline(so, error);
1934 			goto disconnect_vp;
1935 		}
1936 		if (nso->so_family == AF_INET) {
1937 			sin_t *sin;
1938 
1939 			sin = (sin_t *)(ack_mp->b_rptr +
1940 			    sizeof (struct T_ok_ack));
1941 			bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
1942 			nsti->sti_laddr_len = sizeof (sin_t);
1943 		} else {
1944 			sin6_t *sin6;
1945 
1946 			sin6 = (sin6_t *)(ack_mp->b_rptr +
1947 			    sizeof (struct T_ok_ack));
1948 			bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
1949 			nsti->sti_laddr_len = sizeof (sin6_t);
1950 		}
1951 		freemsg(ack_mp);
1952 
1953 		nso->so_state |= SS_ISCONNECTED;
1954 		nso->so_proto_handle = (sock_lower_handle_t)opt;
1955 		nsti->sti_laddr_valid = 1;
1956 
1957 		if (sti->sti_nl7c_flags & NL7C_ENABLED) {
1958 			/*
1959 			 * A NL7C marked listen()er so the new socket
1960 			 * inherits the listen()er's NL7C state, except
1961 			 * for NL7C_POLLIN.
1962 			 *
1963 			 * Only call NL7C to process the new socket if
1964 			 * the listen socket allows blocking i/o.
1965 			 */
1966 			nsti->sti_nl7c_flags =
1967 			    sti->sti_nl7c_flags & (~NL7C_POLLIN);
1968 			if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
1969 				/*
1970 				 * Nonblocking accept() just make it
1971 				 * persist to defer processing to the
1972 				 * read-side syscall (e.g. read).
1973 				 */
1974 				nsti->sti_nl7c_flags |= NL7C_SOPERSIST;
1975 			} else if (nl7c_process(nso, B_FALSE)) {
1976 				/*
1977 				 * NL7C has completed processing on the
1978 				 * socket, close the socket and back to
1979 				 * the top to await the next T_CONN_IND.
1980 				 */
1981 				mutex_exit(&nso->so_lock);
1982 				(void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
1983 				    cr, NULL);
1984 				VN_RELE(nvp);
1985 				goto again;
1986 			}
1987 			/* Pass the new socket out */
1988 		}
1989 
1990 		mutex_exit(&nso->so_lock);
1991 
1992 		/*
1993 		 * It's possible, through the use of autopush for example,
1994 		 * that the acceptor stream may not support sti_direct
1995 		 * semantics. If the new socket does not support sti_direct
1996 		 * we issue a _SIOCSOCKFALLBACK to inform the transport
1997 		 * as we would in the I_PUSH case.
1998 		 */
1999 		if (nsti->sti_direct == 0) {
2000 			int	rval;
2001 
2002 			if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
2003 			    0, 0, K_TO_K, cr, &rval)) != 0) {
2004 				mutex_enter(&so->so_lock);
2005 				so_lock_single(so);
2006 				eprintsoline(so, error);
2007 				goto disconnect_vp;
2008 			}
2009 		}
2010 
2011 		/*
2012 		 * Pass out new socket.
2013 		 */
2014 		if (nsop != NULL)
2015 			*nsop = nso;
2016 
2017 		return (0);
2018 	}
2019 
2020 	/*
2021 	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
2022 	 * which don't support the FireEngine accept fast-path. It is also
2023 	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
2024 	 * again. Neither sockfs nor TCP attempt to find out if some other
2025 	 * random module has been inserted in between (in which case we
2026 	 * should follow TLI accept behaviour). We blindly assume the worst
2027 	 * case and revert back to old behaviour i.e. TCP will not send us
2028 	 * any option (eager) and the accept should happen on the listener
2029 	 * queue. Any queued T_conn_ind have already got their options removed
2030 	 * by so_sock2_stream() when "sockmod" was I_POP'd.
2031 	 */
2032 	/*
2033 	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
2034 	 */
2035 	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
2036 #ifdef	_ILP32
2037 		queue_t	*q;
2038 
2039 		/*
2040 		 * Find read queue in driver
2041 		 * Can safely do this since we "own" nso/nvp.
2042 		 */
2043 		q = strvp2wq(nvp)->q_next;
2044 		while (SAMESTR(q))
2045 			q = q->q_next;
2046 		q = RD(q);
2047 		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
2048 #else
2049 		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
2050 #endif	/* _ILP32 */
2051 		conn_res->PRIM_type = O_T_CONN_RES;
2052 		PRIM_type = O_T_CONN_RES;
2053 	} else {
2054 		conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
2055 		conn_res->PRIM_type = T_CONN_RES;
2056 		PRIM_type = T_CONN_RES;
2057 	}
2058 	conn_res->SEQ_number = SEQ_number;
2059 	conn_res->OPT_length = 0;
2060 	conn_res->OPT_offset = 0;
2061 
2062 	mutex_enter(&so->so_lock);
2063 	so_lock_single(so);	/* Set SOLOCKED */
2064 	mutex_exit(&so->so_lock);
2065 
2066 	error = kstrputmsg(SOTOV(so), mp, NULL,
2067 	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2068 	mutex_enter(&so->so_lock);
2069 	if (error) {
2070 		eprintsoline(so, error);
2071 		goto disconnect_vp;
2072 	}
2073 	error = sowaitprim(so, PRIM_type, T_OK_ACK,
2074 	    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2075 	if (error) {
2076 		eprintsoline(so, error);
2077 		goto disconnect_vp;
2078 	}
2079 	mutex_exit(&so->so_lock);
2080 	/*
2081 	 * If there is a sin/sin6 appended onto the T_OK_ACK use
2082 	 * that to set the local address. If this is not present
2083 	 * then we zero out the address and don't set the
2084 	 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
2085 	 * the pathname from the listening socket.
2086 	 * In the case where this is TCP or an AF_UNIX socket the
2087 	 * client side may have queued data or a T_ORDREL in the
2088 	 * transport. Having now sent the T_CONN_RES we may receive
2089 	 * those queued messages at any time. Hold the acceptor
2090 	 * so_lock until its state and laddr are finalized.
2091 	 */
2092 	mutex_enter(&nso->so_lock);
2093 	sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
2094 	if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
2095 	    MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2096 		ack_mp->b_rptr += sizeof (struct T_ok_ack);
2097 		bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2098 		nsti->sti_laddr_len = sinlen;
2099 		nsti->sti_laddr_valid = 1;
2100 	} else if (nso->so_family == AF_UNIX) {
2101 		ASSERT(so->so_family == AF_UNIX);
2102 		nsti->sti_laddr_len = sti->sti_laddr_len;
2103 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2104 		bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2105 		    nsti->sti_laddr_len);
2106 		nsti->sti_laddr_valid = 1;
2107 	} else {
2108 		nsti->sti_laddr_len = sti->sti_laddr_len;
2109 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2110 		bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2111 		nsti->sti_laddr_sa->sa_family = nso->so_family;
2112 	}
2113 	nso->so_state |= SS_ISCONNECTED;
2114 	mutex_exit(&nso->so_lock);
2115 
2116 	freemsg(ack_mp);
2117 
2118 	mutex_enter(&so->so_lock);
2119 	so_unlock_single(so, SOLOCKED);
2120 	mutex_exit(&so->so_lock);
2121 
2122 	/*
2123 	 * Pass out new socket.
2124 	 */
2125 	if (nsop != NULL)
2126 		*nsop = nso;
2127 
2128 	return (0);
2129 
2130 
2131 eproto_disc_unl:
2132 	error = EPROTO;
2133 e_disc_unl:
2134 	eprintsoline(so, error);
2135 	goto disconnect_unlocked;
2136 
2137 pr_disc_vp_unl:
2138 	eprintsoline(so, error);
2139 disconnect_vp_unlocked:
2140 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2141 	VN_RELE(nvp);
2142 disconnect_unlocked:
2143 	(void) sodisconnect(so, SEQ_number, 0);
2144 	return (error);
2145 
2146 pr_disc_vp:
2147 	eprintsoline(so, error);
2148 disconnect_vp:
2149 	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2150 	so_unlock_single(so, SOLOCKED);
2151 	mutex_exit(&so->so_lock);
2152 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2153 	VN_RELE(nvp);
2154 	return (error);
2155 
2156 conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2157 	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2158 	    ? EOPNOTSUPP : EINVAL;
2159 e_bad:
2160 	eprintsoline(so, error);
2161 	return (error);
2162 }
2163 
2164 /*
2165  * connect a socket.
2166  *
2167  * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
2168  * unconnect (by specifying a null address).
2169  */
2170 int
2171 sotpi_connect(struct sonode *so,
2172 	struct sockaddr *name,
2173 	socklen_t namelen,
2174 	int fflag,
2175 	int flags,
2176 	struct cred *cr)
2177 {
2178 	struct T_conn_req	conn_req;
2179 	int			error = 0;
2180 	mblk_t			*mp;
2181 	void			*src;
2182 	socklen_t		srclen;
2183 	void			*addr;
2184 	socklen_t		addrlen;
2185 	boolean_t		need_unlock;
2186 	sotpi_info_t		*sti = SOTOTPI(so);
2187 
2188 	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2189 	    (void *)so, (void *)name, namelen, fflag, flags,
2190 	    pr_state(so->so_state, so->so_mode)));
2191 
2192 	/*
2193 	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2194 	 * avoid sleeping for memory with SOLOCKED held.
2195 	 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2196 	 * + sizeof (struct T_opthdr).
2197 	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
2198 	 * exceed sti_faddr_maxlen).
2199 	 */
2200 	mp = soallocproto(sizeof (struct T_conn_req) +
2201 	    2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
2202 	    cr);
2203 	if (mp == NULL) {
2204 		/*
2205 		 * Connect can not fail with ENOBUFS. A signal was
2206 		 * caught so return EINTR.
2207 		 */
2208 		error = EINTR;
2209 		eprintsoline(so, error);
2210 		return (error);
2211 	}
2212 
2213 	mutex_enter(&so->so_lock);
2214 	/*
2215 	 * Make sure there is a preallocated T_unbind_req message
2216 	 * before any binding. This message is allocated when the
2217 	 * socket is created. Since another thread can consume
2218 	 * so_unbind_mp by the time we return from so_lock_single(),
2219 	 * we should check the availability of so_unbind_mp after
2220 	 * we return from so_lock_single().
2221 	 */
2222 
2223 	so_lock_single(so);	/* Set SOLOCKED */
2224 	need_unlock = B_TRUE;
2225 
2226 	if (sti->sti_unbind_mp == NULL) {
2227 		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2228 		/* NOTE: holding so_lock while sleeping */
2229 		sti->sti_unbind_mp =
2230 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
2231 		if (sti->sti_unbind_mp == NULL) {
2232 			error = EINTR;
2233 			goto done;
2234 		}
2235 	}
2236 
2237 	/*
2238 	 * Can't have done a listen before connecting.
2239 	 */
2240 	if (so->so_state & SS_ACCEPTCONN) {
2241 		error = EOPNOTSUPP;
2242 		goto done;
2243 	}
2244 
2245 	/*
2246 	 * Must be bound with the transport
2247 	 */
2248 	if (!(so->so_state & SS_ISBOUND)) {
2249 		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2250 		    /*CONSTCOND*/
2251 		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2252 			/*
2253 			 * Optimization for AF_INET{,6} transports
2254 			 * that can handle a T_CONN_REQ without being bound.
2255 			 */
2256 			so_automatic_bind(so);
2257 		} else {
2258 			error = sotpi_bind(so, NULL, 0,
2259 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2260 			if (error)
2261 				goto done;
2262 		}
2263 		ASSERT(so->so_state & SS_ISBOUND);
2264 		flags |= _SOCONNECT_DID_BIND;
2265 	}
2266 
2267 	/*
2268 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
2269 	 * connect to a null address. This is the portable method to
2270 	 * unconnect a socket.
2271 	 */
2272 	if ((namelen >= sizeof (sa_family_t)) &&
2273 	    (name->sa_family == AF_UNSPEC)) {
2274 		name = NULL;
2275 		namelen = 0;
2276 	}
2277 
2278 	/*
2279 	 * Check that we are not already connected.
2280 	 * A connection-oriented socket cannot be reconnected.
2281 	 * A connected connection-less socket can be
2282 	 * - connected to a different address by a subsequent connect
2283 	 * - "unconnected" by a connect to the NULL address
2284 	 */
2285 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2286 		ASSERT(!(flags & _SOCONNECT_DID_BIND));
2287 		if (so->so_mode & SM_CONNREQUIRED) {
2288 			/* Connection-oriented socket */
2289 			error = so->so_state & SS_ISCONNECTED ?
2290 			    EISCONN : EALREADY;
2291 			goto done;
2292 		}
2293 		/* Connection-less socket */
2294 		if (name == NULL) {
2295 			/*
2296 			 * Remove the connected state and clear SO_DGRAM_ERRIND
2297 			 * since it was set when the socket was connected.
2298 			 * If this is UDP also send down a T_DISCON_REQ.
2299 			 */
2300 			int val;
2301 
2302 			if ((so->so_family == AF_INET ||
2303 			    so->so_family == AF_INET6) &&
2304 			    (so->so_type == SOCK_DGRAM ||
2305 			    so->so_type == SOCK_RAW) &&
2306 			    /*CONSTCOND*/
2307 			    !soconnect_tpi_udp) {
2308 				/* XXX What about implicitly unbinding here? */
2309 				error = sodisconnect(so, -1,
2310 				    _SODISCONNECT_LOCK_HELD);
2311 			} else {
2312 				so->so_state &=
2313 				    ~(SS_ISCONNECTED | SS_ISCONNECTING);
2314 				sti->sti_faddr_valid = 0;
2315 				sti->sti_faddr_len = 0;
2316 			}
2317 
2318 			/* Remove SOLOCKED since setsockopt will grab it */
2319 			so_unlock_single(so, SOLOCKED);
2320 			mutex_exit(&so->so_lock);
2321 
2322 			val = 0;
2323 			(void) sotpi_setsockopt(so, SOL_SOCKET,
2324 			    SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2325 			    cr);
2326 
2327 			mutex_enter(&so->so_lock);
2328 			so_lock_single(so);	/* Set SOLOCKED */
2329 			goto done;
2330 		}
2331 	}
2332 	ASSERT(so->so_state & SS_ISBOUND);
2333 
2334 	if (name == NULL || namelen == 0) {
2335 		error = EINVAL;
2336 		goto done;
2337 	}
2338 	/*
2339 	 * Mark the socket if sti_faddr_sa represents the transport level
2340 	 * address.
2341 	 */
2342 	if (flags & _SOCONNECT_NOXLATE) {
2343 		struct sockaddr_ux	*soaddr_ux;
2344 
2345 		ASSERT(so->so_family == AF_UNIX);
2346 		if (namelen != sizeof (struct sockaddr_ux)) {
2347 			error = EINVAL;
2348 			goto done;
2349 		}
2350 		soaddr_ux = (struct sockaddr_ux *)name;
2351 		name = (struct sockaddr *)&soaddr_ux->sou_addr;
2352 		namelen = sizeof (soaddr_ux->sou_addr);
2353 		sti->sti_faddr_noxlate = 1;
2354 	}
2355 
2356 	/*
2357 	 * Length and family checks.
2358 	 */
2359 	error = so_addr_verify(so, name, namelen);
2360 	if (error)
2361 		goto bad;
2362 
2363 	/*
2364 	 * Save foreign address. Needed for AF_UNIX as well as
2365 	 * transport providers that do not support TI_GETPEERNAME.
2366 	 * Also used for cached foreign address for TCP and UDP.
2367 	 */
2368 	if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2369 		error = EINVAL;
2370 		goto done;
2371 	}
2372 	sti->sti_faddr_len = (socklen_t)namelen;
2373 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2374 	bcopy(name, sti->sti_faddr_sa, namelen);
2375 	sti->sti_faddr_valid = 1;
2376 
2377 	if (so->so_family == AF_UNIX) {
2378 		if (sti->sti_faddr_noxlate) {
2379 			/*
2380 			 * Already have a transport internal address. Do not
2381 			 * pass any (transport internal) source address.
2382 			 */
2383 			addr = sti->sti_faddr_sa;
2384 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2385 			src = NULL;
2386 			srclen = 0;
2387 		} else {
2388 			/*
2389 			 * Pass the sockaddr_un source address as an option
2390 			 * and translate the remote address.
2391 			 * Holding so_lock thus sti_laddr_sa can not change.
2392 			 */
2393 			src = sti->sti_laddr_sa;
2394 			srclen = (t_uscalar_t)sti->sti_laddr_len;
2395 			dprintso(so, 1,
2396 			    ("sotpi_connect UNIX: srclen %d, src %p\n",
2397 			    srclen, src));
2398 			error = so_ux_addr_xlate(so,
2399 			    sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2400 			    (flags & _SOCONNECT_XPG4_2),
2401 			    &addr, &addrlen);
2402 			if (error)
2403 				goto bad;
2404 		}
2405 	} else {
2406 		addr = sti->sti_faddr_sa;
2407 		addrlen = (t_uscalar_t)sti->sti_faddr_len;
2408 		src = NULL;
2409 		srclen = 0;
2410 	}
2411 	/*
2412 	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2413 	 * option which asks the transport provider to send T_UDERR_IND
2414 	 * messages. These T_UDERR_IND messages are used to return connected
2415 	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2416 	 *
2417 	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2418 	 * we send down a T_CONN_REQ. This is needed to let the
2419 	 * transport assign a local address that is consistent with
2420 	 * the remote address. Applications depend on a getsockname()
2421 	 * after a connect() to retrieve the "source" IP address for
2422 	 * the connected socket.  Invalidate the cached local address
2423 	 * to force getsockname() to enquire of the transport.
2424 	 */
2425 	if (!(so->so_mode & SM_CONNREQUIRED)) {
2426 		/*
2427 		 * Datagram socket.
2428 		 */
2429 		int32_t val;
2430 
2431 		so_unlock_single(so, SOLOCKED);
2432 		mutex_exit(&so->so_lock);
2433 
2434 		val = 1;
2435 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2436 		    &val, (t_uscalar_t)sizeof (val), cr);
2437 
2438 		mutex_enter(&so->so_lock);
2439 		so_lock_single(so);	/* Set SOLOCKED */
2440 		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2441 		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2442 		    soconnect_tpi_udp) {
2443 			soisconnected(so);
2444 			goto done;
2445 		}
2446 		/*
2447 		 * Send down T_CONN_REQ etc.
2448 		 * Clear fflag to avoid returning EWOULDBLOCK.
2449 		 */
2450 		fflag = 0;
2451 		ASSERT(so->so_family != AF_UNIX);
2452 		sti->sti_laddr_valid = 0;
2453 	} else if (sti->sti_laddr_len != 0) {
2454 		/*
2455 		 * If the local address or port was "any" then it may be
2456 		 * changed by the transport as a result of the
2457 		 * connect.  Invalidate the cached version if we have one.
2458 		 */
2459 		switch (so->so_family) {
2460 		case AF_INET:
2461 			ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2462 			if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2463 			    INADDR_ANY ||
2464 			    ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2465 				sti->sti_laddr_valid = 0;
2466 			break;
2467 
2468 		case AF_INET6:
2469 			ASSERT(sti->sti_laddr_len ==
2470 			    (socklen_t)sizeof (sin6_t));
2471 			if (IN6_IS_ADDR_UNSPECIFIED(
2472 			    &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2473 			    IN6_IS_ADDR_V4MAPPED_ANY(
2474 			    &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2475 			    ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2476 				sti->sti_laddr_valid = 0;
2477 			break;
2478 
2479 		default:
2480 			break;
2481 		}
2482 	}
2483 
2484 	/*
2485 	 * Check for failure of an earlier call
2486 	 */
2487 	if (so->so_error != 0)
2488 		goto so_bad;
2489 
2490 	/*
2491 	 * Send down T_CONN_REQ. Message was allocated above.
2492 	 */
2493 	conn_req.PRIM_type = T_CONN_REQ;
2494 	conn_req.DEST_length = addrlen;
2495 	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2496 	if (srclen == 0) {
2497 		conn_req.OPT_length = 0;
2498 		conn_req.OPT_offset = 0;
2499 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2500 		soappendmsg(mp, addr, addrlen);
2501 	} else {
2502 		/*
2503 		 * There is a AF_UNIX sockaddr_un to include as a source
2504 		 * address option.
2505 		 */
2506 		struct T_opthdr toh;
2507 
2508 		toh.level = SOL_SOCKET;
2509 		toh.name = SO_SRCADDR;
2510 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2511 		toh.status = 0;
2512 		conn_req.OPT_length =
2513 		    (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2514 		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2515 		    _TPI_ALIGN_TOPT(addrlen));
2516 
2517 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2518 		soappendmsg(mp, addr, addrlen);
2519 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2520 		soappendmsg(mp, &toh, sizeof (toh));
2521 		soappendmsg(mp, src, srclen);
2522 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2523 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2524 	}
2525 	/*
2526 	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2527 	 * in order to have the right state when the T_CONN_CON shows up.
2528 	 */
2529 	soisconnecting(so);
2530 	mutex_exit(&so->so_lock);
2531 
2532 	if (AU_AUDITING())
2533 		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2534 
2535 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2536 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2537 	mp = NULL;
2538 	mutex_enter(&so->so_lock);
2539 	if (error != 0)
2540 		goto bad;
2541 
2542 	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2543 		goto bad;
2544 
2545 	/* Allow other threads to access the socket */
2546 	so_unlock_single(so, SOLOCKED);
2547 	need_unlock = B_FALSE;
2548 
2549 	/*
2550 	 * Wait until we get a T_CONN_CON or an error
2551 	 */
2552 	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2553 		so_lock_single(so);	/* Set SOLOCKED */
2554 		need_unlock = B_TRUE;
2555 	}
2556 
2557 done:
2558 	freemsg(mp);
2559 	switch (error) {
2560 	case EINPROGRESS:
2561 	case EALREADY:
2562 	case EISCONN:
2563 	case EINTR:
2564 		/* Non-fatal errors */
2565 		sti->sti_laddr_valid = 0;
2566 		/* FALLTHRU */
2567 	case 0:
2568 		break;
2569 	default:
2570 		ASSERT(need_unlock);
2571 		/*
2572 		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2573 		 * and invalidate local-address cache
2574 		 */
2575 		so->so_state &= ~SS_ISCONNECTING;
2576 		sti->sti_laddr_valid = 0;
2577 		/* A discon_ind might have already unbound us */
2578 		if ((flags & _SOCONNECT_DID_BIND) &&
2579 		    (so->so_state & SS_ISBOUND)) {
2580 			int err;
2581 
2582 			err = sotpi_unbind(so, 0);
2583 			/* LINTED - statement has no conseq */
2584 			if (err) {
2585 				eprintsoline(so, err);
2586 			}
2587 		}
2588 		break;
2589 	}
2590 	if (need_unlock)
2591 		so_unlock_single(so, SOLOCKED);
2592 	mutex_exit(&so->so_lock);
2593 	return (error);
2594 
2595 so_bad:	error = sogeterr(so, B_TRUE);
2596 bad:	eprintsoline(so, error);
2597 	goto done;
2598 }
2599 
2600 /* ARGSUSED */
2601 int
2602 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2603 {
2604 	struct T_ordrel_req	ordrel_req;
2605 	mblk_t			*mp;
2606 	uint_t			old_state, state_change;
2607 	int			error = 0;
2608 	sotpi_info_t		*sti = SOTOTPI(so);
2609 
2610 	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2611 	    (void *)so, how, pr_state(so->so_state, so->so_mode)));
2612 
2613 	mutex_enter(&so->so_lock);
2614 	so_lock_single(so);	/* Set SOLOCKED */
2615 
2616 	/*
2617 	 * SunOS 4.X has no check for datagram sockets.
2618 	 * 5.X checks that it is connected (ENOTCONN)
2619 	 * X/Open requires that we check the connected state.
2620 	 */
2621 	if (!(so->so_state & SS_ISCONNECTED)) {
2622 		if (!xnet_skip_checks) {
2623 			error = ENOTCONN;
2624 			if (xnet_check_print) {
2625 				printf("sockfs: X/Open shutdown check "
2626 				    "caused ENOTCONN\n");
2627 			}
2628 		}
2629 		goto done;
2630 	}
2631 	/*
2632 	 * Record the current state and then perform any state changes.
2633 	 * Then use the difference between the old and new states to
2634 	 * determine which messages need to be sent.
2635 	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2636 	 * duplicate calls to shutdown().
2637 	 */
2638 	old_state = so->so_state;
2639 
2640 	switch (how) {
2641 	case 0:
2642 		socantrcvmore(so);
2643 		break;
2644 	case 1:
2645 		socantsendmore(so);
2646 		break;
2647 	case 2:
2648 		socantsendmore(so);
2649 		socantrcvmore(so);
2650 		break;
2651 	default:
2652 		error = EINVAL;
2653 		goto done;
2654 	}
2655 
2656 	/*
2657 	 * Assumes that the SS_CANT* flags are never cleared in the above code.
2658 	 */
2659 	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2660 	    (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2661 	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2662 
2663 	switch (state_change) {
2664 	case 0:
2665 		dprintso(so, 1,
2666 		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
2667 		    so->so_state));
2668 		goto done;
2669 
2670 	case SS_CANTRCVMORE:
2671 		mutex_exit(&so->so_lock);
2672 		strseteof(SOTOV(so), 1);
2673 		/*
2674 		 * strseteof takes care of read side wakeups,
2675 		 * pollwakeups, and signals.
2676 		 */
2677 		/*
2678 		 * Get the read lock before flushing data to avoid problems
2679 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2680 		 */
2681 		mutex_enter(&so->so_lock);
2682 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2683 		mutex_exit(&so->so_lock);
2684 
2685 		/* Flush read side queue */
2686 		strflushrq(SOTOV(so), FLUSHALL);
2687 
2688 		mutex_enter(&so->so_lock);
2689 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2690 		break;
2691 
2692 	case SS_CANTSENDMORE:
2693 		mutex_exit(&so->so_lock);
2694 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2695 		mutex_enter(&so->so_lock);
2696 		break;
2697 
2698 	case SS_CANTSENDMORE|SS_CANTRCVMORE:
2699 		mutex_exit(&so->so_lock);
2700 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2701 		strseteof(SOTOV(so), 1);
2702 		/*
2703 		 * strseteof takes care of read side wakeups,
2704 		 * pollwakeups, and signals.
2705 		 */
2706 		/*
2707 		 * Get the read lock before flushing data to avoid problems
2708 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2709 		 */
2710 		mutex_enter(&so->so_lock);
2711 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2712 		mutex_exit(&so->so_lock);
2713 
2714 		/* Flush read side queue */
2715 		strflushrq(SOTOV(so), FLUSHALL);
2716 
2717 		mutex_enter(&so->so_lock);
2718 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2719 		break;
2720 	}
2721 
2722 	ASSERT(MUTEX_HELD(&so->so_lock));
2723 
2724 	/*
2725 	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2726 	 * was set due to this call and the new state has both of them set:
2727 	 *	Send the AF_UNIX close indication
2728 	 *	For T_COTS send a discon_ind
2729 	 *
2730 	 * If cantsend was set due to this call:
2731 	 *	For T_COTSORD send an ordrel_ind
2732 	 *
2733 	 * Note that for T_CLTS there is no message sent here.
2734 	 */
2735 	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2736 	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2737 		/*
2738 		 * For SunOS 4.X compatibility we tell the other end
2739 		 * that we are unable to receive at this point.
2740 		 */
2741 		if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2742 			so_unix_close(so);
2743 
2744 		if (sti->sti_serv_type == T_COTS)
2745 			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2746 	}
2747 	if ((state_change & SS_CANTSENDMORE) &&
2748 	    (sti->sti_serv_type == T_COTS_ORD)) {
2749 		/* Send an orderly release */
2750 		ordrel_req.PRIM_type = T_ORDREL_REQ;
2751 
2752 		mutex_exit(&so->so_lock);
2753 		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2754 		    0, _ALLOC_SLEEP, cr);
2755 		/*
2756 		 * Send down the T_ORDREL_REQ even if there is flow control.
2757 		 * This prevents shutdown from blocking.
2758 		 * Note that there is no T_OK_ACK for ordrel_req.
2759 		 */
2760 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2761 		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2762 		mutex_enter(&so->so_lock);
2763 		if (error) {
2764 			eprintsoline(so, error);
2765 			goto done;
2766 		}
2767 	}
2768 
2769 done:
2770 	so_unlock_single(so, SOLOCKED);
2771 	mutex_exit(&so->so_lock);
2772 	return (error);
2773 }
2774 
2775 /*
2776  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2777  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2778  * that we have closed.
2779  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2780  * T_UNITDATA_REQ containing the same option.
2781  *
2782  * For SOCK_DGRAM half-connections (somebody connected to this end
2783  * but this end is not connect) we don't know where to send any
2784  * SO_UNIX_CLOSE.
2785  *
2786  * We have to ignore stream head errors just in case there has been
2787  * a shutdown(output).
2788  * Ignore any flow control to try to get the message more quickly to the peer.
2789  * While locally ignoring flow control solves the problem when there
2790  * is only the loopback transport on the stream it would not provide
2791  * the correct AF_UNIX socket semantics when one or more modules have
2792  * been pushed.
2793  */
2794 void
2795 so_unix_close(struct sonode *so)
2796 {
2797 	int		error;
2798 	struct T_opthdr	toh;
2799 	mblk_t		*mp;
2800 	sotpi_info_t	*sti = SOTOTPI(so);
2801 
2802 	ASSERT(MUTEX_HELD(&so->so_lock));
2803 
2804 	ASSERT(so->so_family == AF_UNIX);
2805 
2806 	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2807 	    (SS_ISCONNECTED|SS_ISBOUND))
2808 		return;
2809 
2810 	dprintso(so, 1, ("so_unix_close(%p) %s\n",
2811 	    (void *)so, pr_state(so->so_state, so->so_mode)));
2812 
2813 	toh.level = SOL_SOCKET;
2814 	toh.name = SO_UNIX_CLOSE;
2815 
2816 	/* zero length + header */
2817 	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2818 	toh.status = 0;
2819 
2820 	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2821 		struct T_optdata_req tdr;
2822 
2823 		tdr.PRIM_type = T_OPTDATA_REQ;
2824 		tdr.DATA_flag = 0;
2825 
2826 		tdr.OPT_length = (t_scalar_t)sizeof (toh);
2827 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2828 
2829 		/* NOTE: holding so_lock while sleeping */
2830 		mp = soallocproto2(&tdr, sizeof (tdr),
2831 		    &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2832 	} else {
2833 		struct T_unitdata_req	tudr;
2834 		void			*addr;
2835 		socklen_t		addrlen;
2836 		void			*src;
2837 		socklen_t		srclen;
2838 		struct T_opthdr		toh2;
2839 		t_scalar_t		size;
2840 
2841 		/* Connecteded DGRAM socket */
2842 
2843 		/*
2844 		 * For AF_UNIX the destination address is translated to
2845 		 * an internal name and the source address is passed as
2846 		 * an option.
2847 		 */
2848 		/*
2849 		 * Length and family checks.
2850 		 */
2851 		error = so_addr_verify(so, sti->sti_faddr_sa,
2852 		    (t_uscalar_t)sti->sti_faddr_len);
2853 		if (error) {
2854 			eprintsoline(so, error);
2855 			return;
2856 		}
2857 		if (sti->sti_faddr_noxlate) {
2858 			/*
2859 			 * Already have a transport internal address. Do not
2860 			 * pass any (transport internal) source address.
2861 			 */
2862 			addr = sti->sti_faddr_sa;
2863 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2864 			src = NULL;
2865 			srclen = 0;
2866 		} else {
2867 			/*
2868 			 * Pass the sockaddr_un source address as an option
2869 			 * and translate the remote address.
2870 			 * Holding so_lock thus sti_laddr_sa can not change.
2871 			 */
2872 			src = sti->sti_laddr_sa;
2873 			srclen = (socklen_t)sti->sti_laddr_len;
2874 			dprintso(so, 1,
2875 			    ("so_ux_close: srclen %d, src %p\n",
2876 			    srclen, src));
2877 			error = so_ux_addr_xlate(so,
2878 			    sti->sti_faddr_sa,
2879 			    (socklen_t)sti->sti_faddr_len, 0,
2880 			    &addr, &addrlen);
2881 			if (error) {
2882 				eprintsoline(so, error);
2883 				return;
2884 			}
2885 		}
2886 		tudr.PRIM_type = T_UNITDATA_REQ;
2887 		tudr.DEST_length = addrlen;
2888 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2889 		if (srclen == 0) {
2890 			tudr.OPT_length = (t_scalar_t)sizeof (toh);
2891 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2892 			    _TPI_ALIGN_TOPT(addrlen));
2893 
2894 			size = tudr.OPT_offset + tudr.OPT_length;
2895 			/* NOTE: holding so_lock while sleeping */
2896 			mp = soallocproto2(&tudr, sizeof (tudr),
2897 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
2898 			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2899 			soappendmsg(mp, &toh, sizeof (toh));
2900 		} else {
2901 			/*
2902 			 * There is a AF_UNIX sockaddr_un to include as a
2903 			 * source address option.
2904 			 */
2905 			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2906 			    _TPI_ALIGN_TOPT(srclen));
2907 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2908 			    _TPI_ALIGN_TOPT(addrlen));
2909 
2910 			toh2.level = SOL_SOCKET;
2911 			toh2.name = SO_SRCADDR;
2912 			toh2.len = (t_uscalar_t)(srclen +
2913 			    sizeof (struct T_opthdr));
2914 			toh2.status = 0;
2915 
2916 			size = tudr.OPT_offset + tudr.OPT_length;
2917 
2918 			/* NOTE: holding so_lock while sleeping */
2919 			mp = soallocproto2(&tudr, sizeof (tudr),
2920 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
2921 			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2922 			soappendmsg(mp, &toh, sizeof (toh));
2923 			soappendmsg(mp, &toh2, sizeof (toh2));
2924 			soappendmsg(mp, src, srclen);
2925 			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2926 		}
2927 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2928 	}
2929 	mutex_exit(&so->so_lock);
2930 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2931 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2932 	mutex_enter(&so->so_lock);
2933 }
2934 
2935 /*
2936  * Called by sotpi_recvmsg when reading a non-zero amount of data.
2937  * In addition, the caller typically verifies that there is some
2938  * potential state to clear by checking
2939  *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2940  * before calling this routine.
2941  * Note that such a check can be made without holding so_lock since
2942  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2943  * decrements sti_oobsigcnt.
2944  *
2945  * When data is read *after* the point that all pending
2946  * oob data has been consumed the oob indication is cleared.
2947  *
2948  * This logic keeps select/poll returning POLLRDBAND and
2949  * SIOCATMARK returning true until we have read past
2950  * the mark.
2951  */
2952 static void
2953 sorecv_update_oobstate(struct sonode *so)
2954 {
2955 	sotpi_info_t *sti = SOTOTPI(so);
2956 
2957 	mutex_enter(&so->so_lock);
2958 	ASSERT(so_verify_oobstate(so));
2959 	dprintso(so, 1,
2960 	    ("sorecv_update_oobstate: counts %d/%d state %s\n",
2961 	    sti->sti_oobsigcnt,
2962 	    sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
2963 	if (sti->sti_oobsigcnt == 0) {
2964 		/* No more pending oob indications */
2965 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2966 		freemsg(so->so_oobmsg);
2967 		so->so_oobmsg = NULL;
2968 	}
2969 	ASSERT(so_verify_oobstate(so));
2970 	mutex_exit(&so->so_lock);
2971 }
2972 
2973 /*
2974  * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
2975  */
2976 static int
2977 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
2978 {
2979 	sotpi_info_t *sti = SOTOTPI(so);
2980 	int	error = 0;
2981 	mblk_t *tmp = NULL;
2982 	mblk_t *pmp = NULL;
2983 	mblk_t *nmp = sti->sti_nl7c_rcv_mp;
2984 
2985 	ASSERT(nmp != NULL);
2986 
2987 	while (nmp != NULL && uiop->uio_resid > 0) {
2988 		ssize_t n;
2989 
2990 		if (DB_TYPE(nmp) == M_DATA) {
2991 			/*
2992 			 * We have some data, uiomove up to resid bytes.
2993 			 */
2994 			n = MIN(MBLKL(nmp), uiop->uio_resid);
2995 			if (n > 0)
2996 				error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
2997 			nmp->b_rptr += n;
2998 			if (nmp->b_rptr == nmp->b_wptr) {
2999 				pmp = nmp;
3000 				nmp = nmp->b_cont;
3001 			}
3002 			if (error)
3003 				break;
3004 		} else {
3005 			/*
3006 			 * We only handle data, save for caller to handle.
3007 			 */
3008 			if (pmp != NULL) {
3009 				pmp->b_cont = nmp->b_cont;
3010 			}
3011 			nmp->b_cont = NULL;
3012 			if (*rmp == NULL) {
3013 				*rmp = nmp;
3014 			} else {
3015 				tmp->b_cont = nmp;
3016 			}
3017 			nmp = nmp->b_cont;
3018 			tmp = nmp;
3019 		}
3020 	}
3021 	if (pmp != NULL) {
3022 		/* Free any mblk_t(s) which we have consumed */
3023 		pmp->b_cont = NULL;
3024 		freemsg(sti->sti_nl7c_rcv_mp);
3025 	}
3026 	if ((sti->sti_nl7c_rcv_mp = nmp) == NULL) {
3027 		/* Last mblk_t so return the saved kstrgetmsg() rval/error */
3028 		if (error == 0) {
3029 			rval_t	*p = (rval_t *)&sti->sti_nl7c_rcv_rval;
3030 
3031 			error = p->r_v.r_v2;
3032 			p->r_v.r_v2 = 0;
3033 		}
3034 		rp->r_vals = sti->sti_nl7c_rcv_rval;
3035 		sti->sti_nl7c_rcv_rval = 0;
3036 	} else {
3037 		/* More mblk_t(s) to process so no rval to return */
3038 		rp->r_vals = 0;
3039 	}
3040 	return (error);
3041 }
3042 /*
3043  * Receive the next message on the queue.
3044  * If msg_controllen is non-zero when called the caller is interested in
3045  * any received control info (options).
3046  * If msg_namelen is non-zero when called the caller is interested in
3047  * any received source address.
3048  * The routine returns with msg_control and msg_name pointing to
3049  * kmem_alloc'ed memory which the caller has to free.
 *
 * msg_flags carries the caller's MSG_* request flags on entry; it is
 * cleared here and then used to return result flags (MSG_TRUNC, MSG_EOR,
 * MSG_CTRUNC).
 *
 * Returns 0 or an errno value.  ETIME from kstrgetmsg is reported to the
 * caller as EWOULDBLOCK.
3050  */
3051 /* ARGSUSED */
3052 int
3053 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
3054     struct cred *cr)
3055 {
3056 	union T_primitives	*tpr;
3057 	mblk_t			*mp;
3058 	uchar_t			pri;
3059 	int			pflag, opflag;
3060 	void			*control;
3061 	t_uscalar_t		controllen;
3062 	t_uscalar_t		namelen;
3063 	int			so_state = so->so_state; /* Snapshot */
3064 	ssize_t			saved_resid;
3065 	rval_t			rval;
3066 	int			flags;
3067 	clock_t			timout;
3068 	int			error = 0;
3069 	sotpi_info_t		*sti = SOTOTPI(so);
3070 
	/* Capture the request flags; msg_flags is reused for result flags */
3071 	flags = msg->msg_flags;
3072 	msg->msg_flags = 0;
3073 
3074 	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
3075 	    (void *)so, (void *)msg, flags,
3076 	    pr_state(so->so_state, so->so_mode), so->so_error));
3077 
3078 	if (so->so_version == SOV_STREAM) {
3079 		so_update_attrs(so, SOACC);
3080 		/* The imaginary "sockmod" has been popped - act as a stream */
3081 		return (strread(SOTOV(so), uiop, cr));
3082 	}
3083 
3084 	/*
3085 	 * If we are not connected because we have never been connected
3086 	 * we return ENOTCONN. If we have been connected (but are no longer
3087 	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
3088 	 * the EOF.
3089 	 *
3090 	 * An alternative would be to post an ENOTCONN error in stream head
3091 	 * (read+write) and clear it when we're connected. However, that error
3092 	 * would cause incorrect poll/select behavior!
3093 	 */
3094 	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
3095 	    (so->so_mode & SM_CONNREQUIRED)) {
3096 		return (ENOTCONN);
3097 	}
3098 
3099 	/*
3100 	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
3101 	 * after checking that the read queue is empty) and returns zero.
3102 	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
3103 	 * is zero.
3104 	 */
3105 
3106 	if (flags & MSG_OOB) {
3107 		/* Check that the transport supports OOB */
3108 		if (!(so->so_mode & SM_EXDATA))
3109 			return (EOPNOTSUPP);
3110 		so_update_attrs(so, SOACC);
3111 		return (sorecvoob(so, msg, uiop, flags,
3112 		    (so->so_options & SO_OOBINLINE)));
3113 	}
3114 
3115 	so_update_attrs(so, SOACC);
3116 
3117 	/*
3118 	 * Set msg_controllen and msg_namelen to zero here to make it
3119 	 * simpler in the cases that no control or name is returned.
3120 	 */
3121 	controllen = msg->msg_controllen;
3122 	namelen = msg->msg_namelen;
3123 	msg->msg_controllen = 0;
3124 	msg->msg_namelen = 0;
3125 
3126 	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
3127 	    namelen, controllen));
3128 
3129 	mutex_enter(&so->so_lock);
3130 	/*
3131 	 * If an NL7C enabled socket and not waiting for write data.
3132 	 */
3133 	if ((sti->sti_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
3134 	    NL7C_ENABLED) {
3135 		if (sti->sti_nl7c_uri) {
3136 			/* Close uri processing for a previous request */
3137 			nl7c_close(so);
3138 		}
3139 		if ((so_state & SS_CANTRCVMORE) &&
3140 		    sti->sti_nl7c_rcv_mp == NULL) {
3141 			/* Nothing to process, EOF */
3142 			mutex_exit(&so->so_lock);
3143 			return (0);
3144 		} else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
3145 			/* Persistent NL7C socket, try to process request */
3146 			boolean_t ret;
3147 
3148 			ret = nl7c_process(so,
3149 			    (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
			/* The saved rval carries any error in its r_v2 half */
3150 			rval.r_vals = sti->sti_nl7c_rcv_rval;
3151 			error = rval.r_v.r_v2;
3152 			if (error) {
3153 				/* Error of some sort, return it */
3154 				mutex_exit(&so->so_lock);
3155 				return (error);
3156 			}
3157 			if (sti->sti_nl7c_flags &&
3158 			    ! (sti->sti_nl7c_flags & NL7C_WAITWRITE)) {
3159 				/*
3160 				 * Still an NL7C socket and no data
3161 				 * to pass up to the caller.
3162 				 */
3163 				mutex_exit(&so->so_lock);
3164 				if (ret) {
3165 					/* EOF */
3166 					return (0);
3167 				} else {
3168 					/* Need more data */
3169 					return (EAGAIN);
3170 				}
3171 			}
3172 		} else {
3173 			/*
3174 			 * Not persistent so no further NL7C processing.
3175 			 */
3176 			sti->sti_nl7c_flags = 0;
3177 		}
3178 	}
3179 	/*
3180 	 * Only one reader is allowed at any given time. This is needed
3181 	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
3182 	 *
3183 	 * This is slightly different than BSD behavior in that it fails with
3184 	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3185 	 * is single-threaded using sblock(), which is dropped while waiting
3186 	 * for data to appear. The difference shows up e.g. if one
3187 	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3188 	 * does use nonblocking io and different threads are reading each
3189 	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3190 	 * in this case as long as the read queue doesn't get empty.
3191 	 * In this implementation the thread using nonblocking io can
3192 	 * get an EWOULDBLOCK error due to the blocking thread executing
3193 	 * e.g. in the uiomove in kstrgetmsg.
3194 	 * This difference is not believed to be significant.
3195 	 */
3196 	/* Set SOREADLOCKED */
3197 	error = so_lock_read_intr(so,
3198 	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3199 	mutex_exit(&so->so_lock);
3200 	if (error)
3201 		return (error);
3202 
3203 	/*
3204 	 * Tell kstrgetmsg to not inspect the stream head errors until all
3205 	 * queued data has been consumed.
3206 	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3207 	 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3208 	 *
3209 	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3210 	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3211 	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3212 	 */
3213 	pflag = MSG_ANY | MSG_DELAYERROR;
3214 	if (flags & MSG_PEEK) {
3215 		pflag |= MSG_IPEEK;
3216 		flags &= ~MSG_WAITALL;
3217 	}
3218 	if (so->so_mode & SM_ATOMIC)
3219 		pflag |= MSG_DISCARDTAIL;
3220 
3221 	if (flags & MSG_DONTWAIT)
3222 		timout = 0;
3223 	else
3224 		timout = -1;
	/* Keep the initial flags; the retry paths rebuild pflag from opflag */
3225 	opflag = pflag;
3226 retry:
3227 	saved_resid = uiop->uio_resid;
3228 	pri = 0;
3229 	mp = NULL;
3230 	if (sti->sti_nl7c_rcv_mp != NULL) {
3231 		/* Already kstrgetmsg()ed saved mblk(s) from NL7C */
3232 		error = nl7c_sorecv(so, &mp, uiop, &rval);
3233 	} else {
3234 		error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3235 		    timout, &rval);
3236 	}
3237 	if (error != 0) {
3238 		/* kstrgetmsg returns ETIME when timeout expires */
3239 		if (error == ETIME)
3240 			error = EWOULDBLOCK;
3241 		goto out;
3242 	}
3243 	/*
3244 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3245 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
3246 	 */
3247 	ASSERT(!(rval.r_val1 & MORECTL));
3248 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3249 		msg->msg_flags |= MSG_TRUNC;
3250 
	/* No control part was returned: the message was plain data */
3251 	if (mp == NULL) {
3252 		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3253 		/*
3254 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3255 		 * The draft Posix socket spec states that the mark should
3256 		 * not be cleared when peeking. We follow the latter.
3257 		 */
3258 		if ((so->so_state &
3259 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3260 		    (uiop->uio_resid != saved_resid) &&
3261 		    !(flags & MSG_PEEK)) {
3262 			sorecv_update_oobstate(so);
3263 		}
3264 
3265 		mutex_enter(&so->so_lock);
3266 		/* Set MSG_EOR based on MOREDATA */
3267 		if (!(rval.r_val1 & MOREDATA)) {
3268 			if (so->so_state & SS_SAVEDEOR) {
3269 				msg->msg_flags |= MSG_EOR;
3270 				so->so_state &= ~SS_SAVEDEOR;
3271 			}
3272 		}
3273 		/*
3274 		 * If some data was received (i.e. not EOF) and the
3275 		 * read/recv* has not been satisfied wait for some more.
3276 		 */
3277 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3278 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3279 			mutex_exit(&so->so_lock);
3280 			pflag = opflag | MSG_NOMARK;
3281 			goto retry;
3282 		}
3283 		goto out_locked;
3284 	}
3285 
3286 	/* strsock_proto has already verified length and alignment */
3287 	tpr = (union T_primitives *)mp->b_rptr;
3288 	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3289 
3290 	switch (tpr->type) {
3291 	case T_DATA_IND: {
3292 		if ((so->so_state &
3293 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3294 		    (uiop->uio_resid != saved_resid) &&
3295 		    !(flags & MSG_PEEK)) {
3296 			sorecv_update_oobstate(so);
3297 		}
3298 
3299 		/*
3300 		 * Set msg_flags to MSG_EOR based on
3301 		 * MORE_flag and MOREDATA.
3302 		 */
3303 		mutex_enter(&so->so_lock);
3304 		so->so_state &= ~SS_SAVEDEOR;
3305 		if (!(tpr->data_ind.MORE_flag & 1)) {
			/*
			 * End of record: report it now if the whole message
			 * was read, otherwise remember it in SS_SAVEDEOR.
			 */
3306 			if (!(rval.r_val1 & MOREDATA))
3307 				msg->msg_flags |= MSG_EOR;
3308 			else
3309 				so->so_state |= SS_SAVEDEOR;
3310 		}
3311 		freemsg(mp);
3312 		/*
3313 		 * If some data was received (i.e. not EOF) and the
3314 		 * read/recv* has not been satisfied wait for some more.
3315 		 */
3316 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3317 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3318 			mutex_exit(&so->so_lock);
3319 			pflag = opflag | MSG_NOMARK;
3320 			goto retry;
3321 		}
3322 		goto out_locked;
3323 	}
3324 	case T_UNITDATA_IND: {
3325 		void *addr;
3326 		t_uscalar_t addrlen;
3327 		void *abuf;
3328 		t_uscalar_t optlen;
3329 		void *opt;
3330 
3331 		if ((so->so_state &
3332 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3333 		    (uiop->uio_resid != saved_resid) &&
3334 		    !(flags & MSG_PEEK)) {
3335 			sorecv_update_oobstate(so);
3336 		}
3337 
3338 		if (namelen != 0) {
3339 			/* Caller wants source address */
3340 			addrlen = tpr->unitdata_ind.SRC_length;
3341 			addr = sogetoff(mp,
3342 			    tpr->unitdata_ind.SRC_offset,
3343 			    addrlen, 1);
3344 			if (addr == NULL) {
3345 				freemsg(mp);
3346 				error = EPROTO;
3347 				eprintsoline(so, error);
3348 				goto out;
3349 			}
3350 			if (so->so_family == AF_UNIX) {
3351 				/*
3352 				 * Can not use the transport level address.
3353 				 * If there is a SO_SRCADDR option carrying
3354 				 * the socket level address it will be
3355 				 * extracted below.
3356 				 */
3357 				addr = NULL;
3358 				addrlen = 0;
3359 			}
3360 		}
3361 		optlen = tpr->unitdata_ind.OPT_length;
3362 		if (optlen != 0) {
3363 			t_uscalar_t ncontrollen;
3364 
3365 			/*
3366 			 * Extract any source address option.
3367 			 * Determine how large cmsg buffer is needed.
3368 			 */
3369 			opt = sogetoff(mp,
3370 			    tpr->unitdata_ind.OPT_offset,
3371 			    optlen, __TPI_ALIGN_SIZE);
3372 
3373 			if (opt == NULL) {
3374 				freemsg(mp);
3375 				error = EPROTO;
3376 				eprintsoline(so, error);
3377 				goto out;
3378 			}
3379 			if (so->so_family == AF_UNIX)
3380 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3381 			ncontrollen = so_cmsglen(mp, opt, optlen,
3382 			    !(flags & MSG_XPG4_2));
			/*
			 * If the caller asked for control info, size the
			 * buffer to what this message carries; otherwise
			 * flag that control info is being dropped.
			 */
3383 			if (controllen != 0)
3384 				controllen = ncontrollen;
3385 			else if (ncontrollen != 0)
3386 				msg->msg_flags |= MSG_CTRUNC;
3387 		} else {
3388 			controllen = 0;
3389 		}
3390 
3391 		if (namelen != 0) {
3392 			/*
3393 			 * Return address to caller.
3394 			 * Caller handles truncation if length
3395 			 * exceeds msg_namelen.
3396 			 * NOTE: AF_UNIX NUL termination is ensured by
3397 			 * the sender's copyin_name().
3398 			 */
3399 			abuf = kmem_alloc(addrlen, KM_SLEEP);
3400 
3401 			bcopy(addr, abuf, addrlen);
3402 			msg->msg_name = abuf;
3403 			msg->msg_namelen = addrlen;
3404 		}
3405 
3406 		if (controllen != 0) {
3407 			/*
3408 			 * Return control msg to caller.
3409 			 * Caller handles truncation if length
3410 			 * exceeds msg_controllen.
3411 			 */
3412 			control = kmem_zalloc(controllen, KM_SLEEP);
3413 
3414 			error = so_opt2cmsg(mp, opt, optlen,
3415 			    !(flags & MSG_XPG4_2),
3416 			    control, controllen);
3417 			if (error) {
				/*
				 * Undo the allocations made above.
				 * NOTE(review): msg_name and msg_namelen are
				 * not reset after the free - confirm that
				 * callers ignore them when an error is
				 * returned.
				 */
3418 				freemsg(mp);
3419 				if (msg->msg_namelen != 0)
3420 					kmem_free(msg->msg_name,
3421 					    msg->msg_namelen);
3422 				kmem_free(control, controllen);
3423 				eprintsoline(so, error);
3424 				goto out;
3425 			}
3426 			msg->msg_control = control;
3427 			msg->msg_controllen = controllen;
3428 		}
3429 
3430 		freemsg(mp);
3431 		goto out;
3432 	}
3433 	case T_OPTDATA_IND: {
3434 		struct T_optdata_req *tdr;
3435 		void *opt;
3436 		t_uscalar_t optlen;
3437 
3438 		if ((so->so_state &
3439 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3440 		    (uiop->uio_resid != saved_resid) &&
3441 		    !(flags & MSG_PEEK)) {
3442 			sorecv_update_oobstate(so);
3443 		}
3444 
3445 		tdr = (struct T_optdata_req *)mp->b_rptr;
3446 		optlen = tdr->OPT_length;
3447 		if (optlen != 0) {
3448 			t_uscalar_t ncontrollen;
3449 			/*
3450 			 * Determine how large cmsg buffer is needed.
3451 			 */
3452 			opt = sogetoff(mp,
3453 			    tpr->optdata_ind.OPT_offset,
3454 			    optlen, __TPI_ALIGN_SIZE);
3455 
3456 			if (opt == NULL) {
3457 				freemsg(mp);
3458 				error = EPROTO;
3459 				eprintsoline(so, error);
3460 				goto out;
3461 			}
3462 
3463 			ncontrollen = so_cmsglen(mp, opt, optlen,
3464 			    !(flags & MSG_XPG4_2));
			/* Same sizing/truncation rule as for T_UNITDATA_IND */
3465 			if (controllen != 0)
3466 				controllen = ncontrollen;
3467 			else if (ncontrollen != 0)
3468 				msg->msg_flags |= MSG_CTRUNC;
3469 		} else {
3470 			controllen = 0;
3471 		}
3472 
3473 		if (controllen != 0) {
3474 			/*
3475 			 * Return control msg to caller.
3476 			 * Caller handles truncation if length
3477 			 * exceeds msg_controllen.
3478 			 */
3479 			control = kmem_zalloc(controllen, KM_SLEEP);
3480 
3481 			error = so_opt2cmsg(mp, opt, optlen,
3482 			    !(flags & MSG_XPG4_2),
3483 			    control, controllen);
3484 			if (error) {
3485 				freemsg(mp);
3486 				kmem_free(control, controllen);
3487 				eprintsoline(so, error);
3488 				goto out;
3489 			}
3490 			msg->msg_control = control;
3491 			msg->msg_controllen = controllen;
3492 		}
3493 
3494 		/*
3495 		 * Set msg_flags to MSG_EOR based on
3496 		 * DATA_flag and MOREDATA.
		 * NOTE(review): the flag is read through the data_ind
		 * member (MORE_flag) although this is a T_OPTDATA_IND;
		 * presumably DATA_flag shares its offset - confirm.
3497 		 */
3498 		mutex_enter(&so->so_lock);
3499 		so->so_state &= ~SS_SAVEDEOR;
3500 		if (!(tpr->data_ind.MORE_flag & 1)) {
3501 			if (!(rval.r_val1 & MOREDATA))
3502 				msg->msg_flags |= MSG_EOR;
3503 			else
3504 				so->so_state |= SS_SAVEDEOR;
3505 		}
3506 		freemsg(mp);
3507 		/*
3508 		 * If some data was received (i.e. not EOF) and the
3509 		 * read/recv* has not been satisfied wait for some more.
3510 		 * Not possible to wait if control info was received.
3511 		 */
3512 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3513 		    controllen == 0 &&
3514 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3515 			mutex_exit(&so->so_lock);
3516 			pflag = opflag | MSG_NOMARK;
3517 			goto retry;
3518 		}
3519 		goto out_locked;
3520 	}
3521 	case T_EXDATA_IND: {
3522 		dprintso(so, 1,
3523 		    ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3524 		    "state %s\n",
3525 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3526 		    saved_resid - uiop->uio_resid,
3527 		    pr_state(so->so_state, so->so_mode)));
3528 		/*
3529 		 * kstrgetmsg handles MSGMARK so there is nothing to
3530 		 * inspect in the T_EXDATA_IND.
3531 		 * strsock_proto makes the stream head queue the T_EXDATA_IND
3532 		 * as a separate message with no M_DATA component. Furthermore,
3533 		 * the stream head does not consolidate M_DATA messages onto
3534 		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3535 		 * remains a message by itself. This is needed since MSGMARK
3536 		 * marks both the whole message as well as the last byte
3537 		 * of the message.
3538 		 */
3539 		freemsg(mp);
3540 		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
3541 		if (flags & MSG_PEEK) {
3542 			/*
3543 			 * Even though we are peeking we consume the
3544 			 * T_EXDATA_IND thereby moving the mark information
3545 			 * to SS_RCVATMARK. Then the oob code below will
3546 			 * retry the peeking kstrgetmsg.
3547 			 * Note that the stream head read queue is
3548 			 * never flushed without holding SOREADLOCKED
3549 			 * thus the T_EXDATA_IND can not disappear
3550 			 * underneath us.
3551 			 */
3552 			dprintso(so, 1,
3553 			    ("sotpi_recvmsg: consume EXDATA_IND "
3554 			    "counts %d/%d state %s\n",
3555 			    sti->sti_oobsigcnt,
3556 			    sti->sti_oobcnt,
3557 			    pr_state(so->so_state, so->so_mode)));
3558 
			/* Same flags as the first read but without MSG_IPEEK */
3559 			pflag = MSG_ANY | MSG_DELAYERROR;
3560 			if (so->so_mode & SM_ATOMIC)
3561 				pflag |= MSG_DISCARDTAIL;
3562 
3563 			pri = 0;
3564 			mp = NULL;
3565 
3566 			error = kstrgetmsg(SOTOV(so), &mp, uiop,
3567 			    &pri, &pflag, (clock_t)-1, &rval);
3568 			ASSERT(uiop->uio_resid == saved_resid);
3569 
3570 			if (error) {
3571 #ifdef SOCK_DEBUG
3572 				if (error != EWOULDBLOCK && error != EINTR) {
3573 					eprintsoline(so, error);
3574 				}
3575 #endif /* SOCK_DEBUG */
3576 				goto out;
3577 			}
3578 			ASSERT(mp);
3579 			tpr = (union T_primitives *)mp->b_rptr;
3580 			ASSERT(tpr->type == T_EXDATA_IND);
3581 			freemsg(mp);
3582 		} /* end "if (flags & MSG_PEEK)" */
3583 
3584 		/*
3585 		 * Decrement the number of queued and pending oob.
3586 		 *
3587 		 * SS_RCVATMARK is cleared when we read past a mark.
3588 		 * SS_HAVEOOBDATA is cleared when we've read past the
3589 		 * last mark.
3590 		 * SS_OOBPEND is cleared if we've read past the last
3591 		 * mark and no (new) SIGURG has been posted.
3592 		 */
3593 		mutex_enter(&so->so_lock);
3594 		ASSERT(so_verify_oobstate(so));
3595 		ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3596 		ASSERT(sti->sti_oobsigcnt > 0);
3597 		sti->sti_oobsigcnt--;
3598 		ASSERT(sti->sti_oobcnt > 0);
3599 		sti->sti_oobcnt--;
3600 		/*
3601 		 * Since the T_EXDATA_IND has been removed from the stream
3602 		 * head, but we have not read data past the mark,
3603 		 * sockfs needs to track that the socket is still at the mark.
3604 		 *
3605 		 * Since no data was received call kstrgetmsg again to wait
3606 		 * for data.
3607 		 */
3608 		so->so_state |= SS_RCVATMARK;
3609 		mutex_exit(&so->so_lock);
3610 		dprintso(so, 1,
3611 		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3612 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3613 		    pr_state(so->so_state, so->so_mode)));
3614 		pflag = opflag;
3615 		goto retry;
3616 	}
3617 	default:
3618 		cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3619 		    (void *)so, tpr->type, (void *)mp);
3620 		ASSERT(0);
3621 		freemsg(mp);
3622 		error = EPROTO;
3623 		eprintsoline(so, error);
3624 		goto out;
3625 	}
3626 	/* NOTREACHED */
	/*
	 * Common exit. "out" is entered without so_lock held and
	 * "out_locked" with so_lock held; both drop the read lock.
	 */
3627 out:
3628 	mutex_enter(&so->so_lock);
3629 out_locked:
3630 	so_unlock_read(so);	/* Clear SOREADLOCKED */
3631 	mutex_exit(&so->so_lock);
3632 	return (error);
3633 }
3634 
3635 /*
3636  * Sending data with options on a datagram socket.
3637  * Assumes caller has verified that SS_ISBOUND etc. are set.
3638  */
3639 static int
3640 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3641     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3642 {
3643 	struct T_unitdata_req	tudr;
3644 	mblk_t			*mp;
3645 	int			error;
3646 	void			*addr;
3647 	socklen_t		addrlen;
3648 	void			*src;
3649 	socklen_t		srclen;
3650 	ssize_t			len;
3651 	int			size;
3652 	struct T_opthdr		toh;
3653 	struct fdbuf		*fdbuf;
3654 	t_uscalar_t		optlen;
3655 	void			*fds;
3656 	int			fdlen;
3657 	sotpi_info_t		*sti = SOTOTPI(so);
3658 
3659 	ASSERT(name && namelen);
3660 	ASSERT(control && controllen);
3661 
3662 	len = uiop->uio_resid;
3663 	if (len > (ssize_t)sti->sti_tidu_size) {
3664 		return (EMSGSIZE);
3665 	}
3666 
3667 	/*
3668 	 * For AF_UNIX the destination address is translated to an internal
3669 	 * name and the source address is passed as an option.
3670 	 * Also, file descriptors are passed as file pointers in an
3671 	 * option.
3672 	 */
3673 
3674 	/*
3675 	 * Length and family checks.
3676 	 */
3677 	error = so_addr_verify(so, name, namelen);
3678 	if (error) {
3679 		eprintsoline(so, error);
3680 		return (error);
3681 	}
3682 	if (so->so_family == AF_UNIX) {
3683 		if (sti->sti_faddr_noxlate) {
3684 			/*
3685 			 * Already have a transport internal address. Do not
3686 			 * pass any (transport internal) source address.
3687 			 */
3688 			addr = name;
3689 			addrlen = namelen;
3690 			src = NULL;
3691 			srclen = 0;
3692 		} else {
3693 			/*
3694 			 * Pass the sockaddr_un source address as an option
3695 			 * and translate the remote address.
3696 			 *
3697 			 * Note that this code does not prevent sti_laddr_sa
3698 			 * from changing while it is being used. Thus
3699 			 * if an unbind+bind occurs concurrently with this
3700 			 * send the peer might see a partially new and a
3701 			 * partially old "from" address.
3702 			 */
3703 			src = sti->sti_laddr_sa;
3704 			srclen = (t_uscalar_t)sti->sti_laddr_len;
3705 			dprintso(so, 1,
3706 			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3707 			    srclen, src));
3708 			error = so_ux_addr_xlate(so, name, namelen,
3709 			    (flags & MSG_XPG4_2),
3710 			    &addr, &addrlen);
3711 			if (error) {
3712 				eprintsoline(so, error);
3713 				return (error);
3714 			}
3715 		}
3716 	} else {
3717 		addr = name;
3718 		addrlen = namelen;
3719 		src = NULL;
3720 		srclen = 0;
3721 	}
3722 	optlen = so_optlen(control, controllen,
3723 	    !(flags & MSG_XPG4_2));
3724 	tudr.PRIM_type = T_UNITDATA_REQ;
3725 	tudr.DEST_length = addrlen;
3726 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3727 	if (srclen != 0)
3728 		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3729 		    _TPI_ALIGN_TOPT(srclen));
3730 	else
3731 		tudr.OPT_length = optlen;
3732 	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3733 	    _TPI_ALIGN_TOPT(addrlen));
3734 
3735 	size = tudr.OPT_offset + tudr.OPT_length;
3736 
3737 	/*
3738 	 * File descriptors only when SM_FDPASSING set.
3739 	 */
3740 	error = so_getfdopt(control, controllen,
3741 	    !(flags & MSG_XPG4_2), &fds, &fdlen);
3742 	if (error)
3743 		return (error);
3744 	if (fdlen != -1) {
3745 		if (!(so->so_mode & SM_FDPASSING))
3746 			return (EOPNOTSUPP);
3747 
3748 		error = fdbuf_create(fds, fdlen, &fdbuf);
3749 		if (error)
3750 			return (error);
3751 		mp = fdbuf_allocmsg(size, fdbuf);
3752 	} else {
3753 		mp = soallocproto(size, _ALLOC_INTR, CRED());
3754 		if (mp == NULL) {
3755 			/*
3756 			 * Caught a signal waiting for memory.
3757 			 * Let send* return EINTR.
3758 			 */
3759 			return (EINTR);
3760 		}
3761 	}
3762 	soappendmsg(mp, &tudr, sizeof (tudr));
3763 	soappendmsg(mp, addr, addrlen);
3764 	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3765 
3766 	if (fdlen != -1) {
3767 		ASSERT(fdbuf != NULL);
3768 		toh.level = SOL_SOCKET;
3769 		toh.name = SO_FILEP;
3770 		toh.len = fdbuf->fd_size +
3771 		    (t_uscalar_t)sizeof (struct T_opthdr);
3772 		toh.status = 0;
3773 		soappendmsg(mp, &toh, sizeof (toh));
3774 		soappendmsg(mp, fdbuf, fdbuf->fd_size);
3775 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3776 	}
3777 	if (srclen != 0) {
3778 		/*
3779 		 * There is a AF_UNIX sockaddr_un to include as a source
3780 		 * address option.
3781 		 */
3782 		toh.level = SOL_SOCKET;
3783 		toh.name = SO_SRCADDR;
3784 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3785 		toh.status = 0;
3786 		soappendmsg(mp, &toh, sizeof (toh));
3787 		soappendmsg(mp, src, srclen);
3788 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3789 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3790 	}
3791 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3792 	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3793 	/* At most 3 bytes left in the message */
3794 	ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3795 	ASSERT(MBLKL(mp) <= (ssize_t)size);
3796 
3797 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3798 	if (AU_AUDITING())
3799 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3800 
3801 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3802 #ifdef SOCK_DEBUG
3803 	if (error) {
3804 		eprintsoline(so, error);
3805 	}
3806 #endif /* SOCK_DEBUG */
3807 	return (error);
3808 }
3809 
3810 /*
3811  * Sending data with options on a connected stream socket.
3812  * Assumes caller has verified that SS_ISCONNECTED is set.
3813  */
3814 static int
3815 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3816     t_uscalar_t controllen, int flags)
3817 {
3818 	struct T_optdata_req	tdr;
3819 	mblk_t			*mp;
3820 	int			error;
3821 	ssize_t			iosize;
3822 	int			size;
3823 	struct fdbuf		*fdbuf;
3824 	t_uscalar_t		optlen;
3825 	void			*fds;
3826 	int			fdlen;
3827 	struct T_opthdr		toh;
3828 	sotpi_info_t		*sti = SOTOTPI(so);
3829 
3830 	dprintso(so, 1,
3831 	    ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3832 
3833 	/*
3834 	 * Has to be bound and connected. However, since no locks are
3835 	 * held the state could have changed after sotpi_sendmsg checked it
3836 	 * thus it is not possible to ASSERT on the state.
3837 	 */
3838 
3839 	/* Options on connection-oriented only when SM_OPTDATA set. */
3840 	if (!(so->so_mode & SM_OPTDATA))
3841 		return (EOPNOTSUPP);
3842 
3843 	do {
3844 		/*
3845 		 * Set the MORE flag if uio_resid does not fit in this
3846 		 * message or if the caller passed in "more".
3847 		 * Error for transports with zero tidu_size.
3848 		 */
3849 		tdr.PRIM_type = T_OPTDATA_REQ;
3850 		iosize = sti->sti_tidu_size;
3851 		if (iosize <= 0)
3852 			return (EMSGSIZE);
3853 		if (uiop->uio_resid > iosize) {
3854 			tdr.DATA_flag = 1;
3855 		} else {
3856 			if (more)
3857 				tdr.DATA_flag = 1;
3858 			else
3859 				tdr.DATA_flag = 0;
3860 			iosize = uiop->uio_resid;
3861 		}
3862 		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3863 		    tdr.DATA_flag, iosize));
3864 
3865 		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3866 		tdr.OPT_length = optlen;
3867 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3868 
3869 		size = (int)sizeof (tdr) + optlen;
3870 		/*
3871 		 * File descriptors only when SM_FDPASSING set.
3872 		 */
3873 		error = so_getfdopt(control, controllen,
3874 		    !(flags & MSG_XPG4_2), &fds, &fdlen);
3875 		if (error)
3876 			return (error);
3877 		if (fdlen != -1) {
3878 			if (!(so->so_mode & SM_FDPASSING))
3879 				return (EOPNOTSUPP);
3880 
3881 			error = fdbuf_create(fds, fdlen, &fdbuf);
3882 			if (error)
3883 				return (error);
3884 			mp = fdbuf_allocmsg(size, fdbuf);
3885 		} else {
3886 			mp = soallocproto(size, _ALLOC_INTR, CRED());
3887 			if (mp == NULL) {
3888 				/*
3889 				 * Caught a signal waiting for memory.
3890 				 * Let send* return EINTR.
3891 				 */
3892 				return (EINTR);
3893 			}
3894 		}
3895 		soappendmsg(mp, &tdr, sizeof (tdr));
3896 
3897 		if (fdlen != -1) {
3898 			ASSERT(fdbuf != NULL);
3899 			toh.level = SOL_SOCKET;
3900 			toh.name = SO_FILEP;
3901 			toh.len = fdbuf->fd_size +
3902 			    (t_uscalar_t)sizeof (struct T_opthdr);
3903 			toh.status = 0;
3904 			soappendmsg(mp, &toh, sizeof (toh));
3905 			soappendmsg(mp, fdbuf, fdbuf->fd_size);
3906 			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3907 		}
3908 		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3909 		/* At most 3 bytes left in the message */
3910 		ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3911 		ASSERT(MBLKL(mp) <= (ssize_t)size);
3912 
3913 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3914 
3915 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3916 		    0, MSG_BAND, 0);
3917 		if (error) {
3918 			eprintsoline(so, error);
3919 			return (error);
3920 		}
3921 		control = NULL;
3922 		if (uiop->uio_resid > 0) {
3923 			/*
3924 			 * Recheck for fatal errors. Fail write even though
3925 			 * some data have been written. This is consistent
3926 			 * with strwrite semantics and BSD sockets semantics.
3927 			 */
3928 			if (so->so_state & SS_CANTSENDMORE) {
3929 				eprintsoline(so, error);
3930 				return (EPIPE);
3931 			}
3932 			if (so->so_error != 0) {
3933 				mutex_enter(&so->so_lock);
3934 				error = sogeterr(so, B_TRUE);
3935 				mutex_exit(&so->so_lock);
3936 				if (error != 0) {
3937 					eprintsoline(so, error);
3938 					return (error);
3939 				}
3940 			}
3941 		}
3942 	} while (uiop->uio_resid > 0);
3943 	return (0);
3944 }
3945 
3946 /*
3947  * Sending data on a datagram socket.
3948  * Assumes caller has verified that SS_ISBOUND etc. are set.
3949  *
3950  * For AF_UNIX the destination address is translated to an internal
3951  * name and the source address is passed as an option.
3952  */
3953 int
3954 sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
3955     struct uio *uiop, int flags)
3956 {
3957 	struct T_unitdata_req	tudr;
3958 	mblk_t			*mp;
3959 	int			error;
3960 	void			*addr;
3961 	socklen_t		addrlen;
3962 	void			*src;
3963 	socklen_t		srclen;
3964 	ssize_t			len;
3965 	sotpi_info_t		*sti = SOTOTPI(so);
3966 
3967 	ASSERT(name != NULL && namelen != 0);
3968 
3969 	len = uiop->uio_resid;
3970 	if (len > sti->sti_tidu_size) {
3971 		error = EMSGSIZE;
3972 		goto done;
3973 	}
3974 
3975 	/* Length and family checks */
3976 	error = so_addr_verify(so, name, namelen);
3977 	if (error != 0)
3978 		goto done;
3979 
3980 	if (sti->sti_direct)
3981 		return (sodgram_direct(so, name, namelen, uiop, flags));
3982 
3983 	if (so->so_family == AF_UNIX) {
3984 		if (sti->sti_faddr_noxlate) {
3985 			/*
3986 			 * Already have a transport internal address. Do not
3987 			 * pass any (transport internal) source address.
3988 			 */
3989 			addr = name;
3990 			addrlen = namelen;
3991 			src = NULL;
3992 			srclen = 0;
3993 		} else {
3994 			/*
3995 			 * Pass the sockaddr_un source address as an option
3996 			 * and translate the remote address.
3997 			 *
3998 			 * Note that this code does not prevent sti_laddr_sa
3999 			 * from changing while it is being used. Thus
4000 			 * if an unbind+bind occurs concurrently with this
4001 			 * send the peer might see a partially new and a
4002 			 * partially old "from" address.
4003 			 */
4004 			src = sti->sti_laddr_sa;
4005 			srclen = (socklen_t)sti->sti_laddr_len;
4006 			dprintso(so, 1,
4007 			    ("sosend_dgram UNIX: srclen %d, src %p\n",
4008 			    srclen, src));
4009 			error = so_ux_addr_xlate(so, name, namelen,
4010 			    (flags & MSG_XPG4_2),
4011 			    &addr, &addrlen);
4012 			if (error) {
4013 				eprintsoline(so, error);
4014 				goto done;
4015 			}
4016 		}
4017 	} else {
4018 		addr = name;
4019 		addrlen = namelen;
4020 		src = NULL;
4021 		srclen = 0;
4022 	}
4023 	tudr.PRIM_type = T_UNITDATA_REQ;
4024 	tudr.DEST_length = addrlen;
4025 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4026 	if (srclen == 0) {
4027 		tudr.OPT_length = 0;
4028 		tudr.OPT_offset = 0;
4029 
4030 		mp = soallocproto2(&tudr, sizeof (tudr),
4031 		    addr, addrlen, 0, _ALLOC_INTR, CRED());
4032 		if (mp == NULL) {
4033 			/*
4034 			 * Caught a signal waiting for memory.
4035 			 * Let send* return EINTR.
4036 			 */
4037 			error = EINTR;
4038 			goto done;
4039 		}
4040 	} else {
4041 		/*
4042 		 * There is a AF_UNIX sockaddr_un to include as a source
4043 		 * address option.
4044 		 */
4045 		struct T_opthdr toh;
4046 		ssize_t size;
4047 
4048 		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
4049 		    _TPI_ALIGN_TOPT(srclen));
4050 		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
4051 		    _TPI_ALIGN_TOPT(addrlen));
4052 
4053 		toh.level = SOL_SOCKET;
4054 		toh.name = SO_SRCADDR;
4055 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
4056 		toh.status = 0;
4057 
4058 		size = tudr.OPT_offset + tudr.OPT_length;
4059 		mp = soallocproto2(&tudr, sizeof (tudr),
4060 		    addr, addrlen, size, _ALLOC_INTR, CRED());
4061 		if (mp == NULL) {
4062 			/*
4063 			 * Caught a signal waiting for memory.
4064 			 * Let send* return EINTR.
4065 			 */
4066 			error = EINTR;
4067 			goto done;
4068 		}
4069 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
4070 		soappendmsg(mp, &toh, sizeof (toh));
4071 		soappendmsg(mp, src, srclen);
4072 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
4073 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
4074 	}
4075 
4076 	if (AU_AUDITING())
4077 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4078 
4079 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4080 done:
4081 #ifdef SOCK_DEBUG
4082 	if (error) {
4083 		eprintsoline(so, error);
4084 	}
4085 #endif /* SOCK_DEBUG */
4086 	return (error);
4087 }
4088 
4089 /*
4090  * Sending data on a connected stream socket.
4091  * Assumes caller has verified that SS_ISCONNECTED is set.
4092  */
4093 int
4094 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
4095     int sflag)
4096 {
4097 	struct T_data_req	tdr;
4098 	mblk_t			*mp;
4099 	int			error;
4100 	ssize_t			iosize;
4101 	sotpi_info_t		*sti = SOTOTPI(so);
4102 
4103 	dprintso(so, 1,
4104 	    ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
4105 	    (void *)so, uiop->uio_resid, prim, sflag));
4106 
4107 	/*
4108 	 * Has to be bound and connected. However, since no locks are
4109 	 * held the state could have changed after sotpi_sendmsg checked it
4110 	 * thus it is not possible to ASSERT on the state.
4111 	 */
4112 
4113 	do {
4114 		/*
4115 		 * Set the MORE flag if uio_resid does not fit in this
4116 		 * message or if the caller passed in "more".
4117 		 * Error for transports with zero tidu_size.
4118 		 */
4119 		tdr.PRIM_type = prim;
4120 		iosize = sti->sti_tidu_size;
4121 		if (iosize <= 0)
4122 			return (EMSGSIZE);
4123 		if (uiop->uio_resid > iosize) {
4124 			tdr.MORE_flag = 1;
4125 		} else {
4126 			if (more)
4127 				tdr.MORE_flag = 1;
4128 			else
4129 				tdr.MORE_flag = 0;
4130 			iosize = uiop->uio_resid;
4131 		}
4132 		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4133 		    prim, tdr.MORE_flag, iosize));
4134 		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
4135 		if (mp == NULL) {
4136 			/*
4137 			 * Caught a signal waiting for memory.
4138 			 * Let send* return EINTR.
4139 			 */
4140 			return (EINTR);
4141 		}
4142 
4143 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4144 		    0, sflag | MSG_BAND, 0);
4145 		if (error) {
4146 			eprintsoline(so, error);
4147 			return (error);
4148 		}
4149 		if (uiop->uio_resid > 0) {
4150 			/*
4151 			 * Recheck for fatal errors. Fail write even though
4152 			 * some data have been written. This is consistent
4153 			 * with strwrite semantics and BSD sockets semantics.
4154 			 */
4155 			if (so->so_state & SS_CANTSENDMORE) {
4156 				eprintsoline(so, error);
4157 				return (EPIPE);
4158 			}
4159 			if (so->so_error != 0) {
4160 				mutex_enter(&so->so_lock);
4161 				error = sogeterr(so, B_TRUE);
4162 				mutex_exit(&so->so_lock);
4163 				if (error != 0) {
4164 					eprintsoline(so, error);
4165 					return (error);
4166 				}
4167 			}
4168 		}
4169 	} while (uiop->uio_resid > 0);
4170 	return (0);
4171 }
4172 
4173 /*
4174  * Check the state for errors and call the appropriate send function.
4175  *
4176  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4177  * this function issues a setsockopt to toggle SO_DONTROUTE before and
4178  * after sending the message.
4179  */
4180 static int
4181 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
4182     struct cred *cr)
4183 {
4184 	int		so_state;
4185 	int		so_mode;
4186 	int		error;
4187 	struct sockaddr *name;
4188 	t_uscalar_t	namelen;
4189 	int		dontroute;
4190 	int		flags;
4191 	sotpi_info_t	*sti = SOTOTPI(so);
4192 
4193 	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4194 	    (void *)so, (void *)msg, msg->msg_flags,
4195 	    pr_state(so->so_state, so->so_mode), so->so_error));
4196 
4197 	if (so->so_version == SOV_STREAM) {
4198 		/* The imaginary "sockmod" has been popped - act as a stream */
4199 		so_update_attrs(so, SOMOD);
4200 		return (strwrite(SOTOV(so), uiop, cr));
4201 	}
4202 
4203 	mutex_enter(&so->so_lock);
4204 	so_state = so->so_state;
4205 
4206 	if (so_state & SS_CANTSENDMORE) {
4207 		mutex_exit(&so->so_lock);
4208 		return (EPIPE);
4209 	}
4210 
4211 	if (so->so_error != 0) {
4212 		error = sogeterr(so, B_TRUE);
4213 		if (error != 0) {
4214 			mutex_exit(&so->so_lock);
4215 			return (error);
4216 		}
4217 	}
4218 
4219 	name = (struct sockaddr *)msg->msg_name;
4220 	namelen = msg->msg_namelen;
4221 
4222 	so_mode = so->so_mode;
4223 
4224 	if (name == NULL) {
4225 		if (!(so_state & SS_ISCONNECTED)) {
4226 			mutex_exit(&so->so_lock);
4227 			if (so_mode & SM_CONNREQUIRED)
4228 				return (ENOTCONN);
4229 			else
4230 				return (EDESTADDRREQ);
4231 		}
4232 		if (so_mode & SM_CONNREQUIRED) {
4233 			name = NULL;
4234 			namelen = 0;
4235 		} else {
4236 			/*
4237 			 * Note that this code does not prevent sti_faddr_sa
4238 			 * from changing while it is being used. Thus
4239 			 * if an "unconnect"+connect occurs concurrently with
4240 			 * this send the datagram might be delivered to a
4241 			 * garbaled address.
4242 			 */
4243 			ASSERT(sti->sti_faddr_sa);
4244 			name = sti->sti_faddr_sa;
4245 			namelen = (t_uscalar_t)sti->sti_faddr_len;
4246 		}
4247 	} else {
4248 		if (!(so_state & SS_ISCONNECTED) &&
4249 		    (so_mode & SM_CONNREQUIRED)) {
4250 			/* Required but not connected */
4251 			mutex_exit(&so->so_lock);
4252 			return (ENOTCONN);
4253 		}
4254 		/*
4255 		 * Ignore the address on connection-oriented sockets.
4256 		 * Just like BSD this code does not generate an error for
4257 		 * TCP (a CONNREQUIRED socket) when sending to an address
4258 		 * passed in with sendto/sendmsg. Instead the data is
4259 		 * delivered on the connection as if no address had been
4260 		 * supplied.
4261 		 */
4262 		if ((so_state & SS_ISCONNECTED) &&
4263 		    !(so_mode & SM_CONNREQUIRED)) {
4264 			mutex_exit(&so->so_lock);
4265 			return (EISCONN);
4266 		}
4267 		if (!(so_state & SS_ISBOUND)) {
4268 			so_lock_single(so);	/* Set SOLOCKED */
4269 			error = sotpi_bind(so, NULL, 0,
4270 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4271 			so_unlock_single(so, SOLOCKED);
4272 			if (error) {
4273 				mutex_exit(&so->so_lock);
4274 				eprintsoline(so, error);
4275 				return (error);
4276 			}
4277 		}
4278 		/*
4279 		 * Handle delayed datagram errors. These are only queued
4280 		 * when the application sets SO_DGRAM_ERRIND.
4281 		 * Return the error if we are sending to the address
4282 		 * that was returned in the last T_UDERROR_IND.
4283 		 * If sending to some other address discard the delayed
4284 		 * error indication.
4285 		 */
4286 		if (sti->sti_delayed_error) {
4287 			struct T_uderror_ind	*tudi;
4288 			void			*addr;
4289 			t_uscalar_t		addrlen;
4290 			boolean_t		match = B_FALSE;
4291 
4292 			ASSERT(sti->sti_eaddr_mp);
4293 			error = sti->sti_delayed_error;
4294 			sti->sti_delayed_error = 0;
4295 			tudi =
4296 			    (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4297 			addrlen = tudi->DEST_length;
4298 			addr = sogetoff(sti->sti_eaddr_mp,
4299 			    tudi->DEST_offset, addrlen, 1);
4300 			ASSERT(addr);	/* Checked by strsock_proto */
4301 			switch (so->so_family) {
4302 			case AF_INET: {
4303 				/* Compare just IP address and port */
4304 				sin_t *sin1 = (sin_t *)name;
4305 				sin_t *sin2 = (sin_t *)addr;
4306 
4307 				if (addrlen == sizeof (sin_t) &&
4308 				    namelen == addrlen &&
4309 				    sin1->sin_port == sin2->sin_port &&
4310 				    sin1->sin_addr.s_addr ==
4311 				    sin2->sin_addr.s_addr)
4312 					match = B_TRUE;
4313 				break;
4314 			}
4315 			case AF_INET6: {
4316 				/* Compare just IP address and port. Not flow */
4317 				sin6_t *sin1 = (sin6_t *)name;
4318 				sin6_t *sin2 = (sin6_t *)addr;
4319 
4320 				if (addrlen == sizeof (sin6_t) &&
4321 				    namelen == addrlen &&
4322 				    sin1->sin6_port == sin2->sin6_port &&
4323 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4324 				    &sin2->sin6_addr))
4325 					match = B_TRUE;
4326 				break;
4327 			}
4328 			case AF_UNIX:
4329 			default:
4330 				if (namelen == addrlen &&
4331 				    bcmp(name, addr, namelen) == 0)
4332 					match = B_TRUE;
4333 			}
4334 			if (match) {
4335 				freemsg(sti->sti_eaddr_mp);
4336 				sti->sti_eaddr_mp = NULL;
4337 				mutex_exit(&so->so_lock);
4338 #ifdef DEBUG
4339 				dprintso(so, 0,
4340 				    ("sockfs delayed error %d for %s\n",
4341 				    error,
4342 				    pr_addr(so->so_family, name, namelen)));
4343 #endif /* DEBUG */
4344 				return (error);
4345 			}
4346 			freemsg(sti->sti_eaddr_mp);
4347 			sti->sti_eaddr_mp = NULL;
4348 		}
4349 	}
4350 	mutex_exit(&so->so_lock);
4351 
4352 	flags = msg->msg_flags;
4353 	dontroute = 0;
4354 	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4355 		uint32_t	val;
4356 
4357 		val = 1;
4358 		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4359 		    &val, (t_uscalar_t)sizeof (val), cr);
4360 		if (error)
4361 			return (error);
4362 		dontroute = 1;
4363 	}
4364 
4365 	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4366 		error = EOPNOTSUPP;
4367 		goto done;
4368 	}
4369 	if (msg->msg_controllen != 0) {
4370 		if (!(so_mode & SM_CONNREQUIRED)) {
4371 			so_update_attrs(so, SOMOD);
4372 			error = sosend_dgramcmsg(so, name, namelen, uiop,
4373 			    msg->msg_control, msg->msg_controllen, flags);
4374 		} else {
4375 			if (flags & MSG_OOB) {
4376 				/* Can't generate T_EXDATA_REQ with options */
4377 				error = EOPNOTSUPP;
4378 				goto done;
4379 			}
4380 			so_update_attrs(so, SOMOD);
4381 			error = sosend_svccmsg(so, uiop,
4382 			    !(flags & MSG_EOR),
4383 			    msg->msg_control, msg->msg_controllen,
4384 			    flags);
4385 		}
4386 		goto done;
4387 	}
4388 
4389 	so_update_attrs(so, SOMOD);
4390 	if (!(so_mode & SM_CONNREQUIRED)) {
4391 		/*
4392 		 * If there is no SO_DONTROUTE to turn off return immediately
4393 		 * from send_dgram. This can allow tail-call optimizations.
4394 		 */
4395 		if (!dontroute) {
4396 			return (sosend_dgram(so, name, namelen, uiop, flags));
4397 		}
4398 		error = sosend_dgram(so, name, namelen, uiop, flags);
4399 	} else {
4400 		t_scalar_t prim;
4401 		int sflag;
4402 
4403 		/* Ignore msg_name in the connected state */
4404 		if (flags & MSG_OOB) {
4405 			prim = T_EXDATA_REQ;
4406 			/*
4407 			 * Send down T_EXDATA_REQ even if there is flow
4408 			 * control for data.
4409 			 */
4410 			sflag = MSG_IGNFLOW;
4411 		} else {
4412 			if (so_mode & SM_BYTESTREAM) {
4413 				/* Byte stream transport - use write */
4414 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4415 
4416 				/* Send M_DATA messages */
4417 				if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
4418 				    (error = nl7c_data(so, uiop)) >= 0) {
4419 					/* NL7C consumed the data */
4420 					return (error);
4421 				}
4422 				/*
4423 				 * If there is no SO_DONTROUTE to turn off,
4424 				 * sti_direct is on, and there is no flow
4425 				 * control, we can take the fast path.
4426 				 */
4427 				if (!dontroute && sti->sti_direct != 0 &&
4428 				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4429 					return (sostream_direct(so, uiop,
4430 					    NULL, cr));
4431 				}
4432 				error = strwrite(SOTOV(so), uiop, cr);
4433 				goto done;
4434 			}
4435 			prim = T_DATA_REQ;
4436 			sflag = 0;
4437 		}
4438 		/*
4439 		 * If there is no SO_DONTROUTE to turn off return immediately
4440 		 * from sosend_svc. This can allow tail-call optimizations.
4441 		 */
4442 		if (!dontroute)
4443 			return (sosend_svc(so, uiop, prim,
4444 			    !(flags & MSG_EOR), sflag));
4445 		error = sosend_svc(so, uiop, prim,
4446 		    !(flags & MSG_EOR), sflag);
4447 	}
4448 	ASSERT(dontroute);
4449 done:
4450 	if (dontroute) {
4451 		uint32_t	val;
4452 
4453 		val = 0;
4454 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4455 		    &val, (t_uscalar_t)sizeof (val), cr);
4456 	}
4457 	return (error);
4458 }
4459 
4460 /*
4461  * kstrwritemp() has very similar semantics as that of strwrite().
4462  * The main difference is it obtains mblks from the caller and also
4463  * does not do any copy as done in strwrite() from user buffers to
4464  * kernel buffers.
4465  *
4466  * Currently, this routine is used by sendfile to send data allocated
4467  * within the kernel without any copying. This interface does not use the
4468  * synchronous stream interface as synch. stream interface implies
4469  * copying.
4470  */
4471 int
4472 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4473 {
4474 	struct stdata *stp;
4475 	struct queue *wqp;
4476 	mblk_t *newmp;
4477 	char waitflag;
4478 	int tempmode;
4479 	int error = 0;
4480 	int done = 0;
4481 	struct sonode *so;
4482 	boolean_t direct;
4483 
4484 	ASSERT(vp->v_stream);
4485 	stp = vp->v_stream;
4486 
4487 	so = VTOSO(vp);
4488 	direct = _SOTOTPI(so)->sti_direct;
4489 
4490 	/*
4491 	 * This is the sockfs direct fast path. canputnext() need
4492 	 * not be accurate so we don't grab the sd_lock here. If
4493 	 * we get flow-controlled, we grab sd_lock just before the
4494 	 * do..while loop below to emulate what strwrite() does.
4495 	 */
4496 	wqp = stp->sd_wrq;
4497 	if (canputnext(wqp) && direct &&
4498 	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4499 		return (sostream_direct(so, NULL, mp, CRED()));
4500 	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4501 		/* Fast check of flags before acquiring the lock */
4502 		mutex_enter(&stp->sd_lock);
4503 		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4504 		mutex_exit(&stp->sd_lock);
4505 		if (error != 0) {
4506 			if (!(stp->sd_flag & STPLEX) &&
4507 			    (stp->sd_wput_opt & SW_SIGPIPE)) {
4508 				error = EPIPE;
4509 			}
4510 			return (error);
4511 		}
4512 	}
4513 
4514 	waitflag = WRITEWAIT;
4515 	if (stp->sd_flag & OLDNDELAY)
4516 		tempmode = fmode & ~FNDELAY;
4517 	else
4518 		tempmode = fmode;
4519 
4520 	mutex_enter(&stp->sd_lock);
4521 	do {
4522 		if (canputnext(wqp)) {
4523 			mutex_exit(&stp->sd_lock);
4524 			if (stp->sd_wputdatafunc != NULL) {
4525 				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4526 				    NULL, NULL, NULL);
4527 				if (newmp == NULL) {
4528 					/* The caller will free mp */
4529 					return (ECOMM);
4530 				}
4531 				mp = newmp;
4532 			}
4533 			putnext(wqp, mp);
4534 			return (0);
4535 		}
4536 		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4537 		    &done);
4538 	} while (error == 0 && !done);
4539 
4540 	mutex_exit(&stp->sd_lock);
4541 	/*
4542 	 * EAGAIN tells the application to try again. ENOMEM
4543 	 * is returned only if the memory allocation size
4544 	 * exceeds the physical limits of the system. ENOMEM
4545 	 * can't be true here.
4546 	 */
4547 	if (error == ENOMEM)
4548 		error = EAGAIN;
4549 	return (error);
4550 }
4551 
4552 /* ARGSUSED */
4553 static int
4554 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4555     struct cred *cr, mblk_t **mpp)
4556 {
4557 	int error;
4558 
4559 	if (so->so_family != AF_INET && so->so_family != AF_INET6)
4560 		return (EAFNOSUPPORT);
4561 
4562 	if (so->so_state & SS_CANTSENDMORE)
4563 		return (EPIPE);
4564 
4565 	if (so->so_type != SOCK_STREAM)
4566 		return (EOPNOTSUPP);
4567 
4568 	if ((so->so_state & SS_ISCONNECTED) == 0)
4569 		return (ENOTCONN);
4570 
4571 	error = kstrwritemp(so->so_vnode, *mpp, fflag);
4572 	if (error == 0)
4573 		*mpp = NULL;
4574 	return (error);
4575 }
4576 
/*
 * Sending data on a datagram socket.
 * Assumes caller has verified that SS_ISBOUND etc. are set.
 *
 * Direct-call variant used by sosend_dgram() when sti_direct is set.
 * It only handles AF_INET/AF_INET6 sockets (see the ASSERTs below).
 *
 * Fast path: when the queue below the stream head accepts data and the
 * user data can be copied in with a single mcopyinuio() call, the message
 * is handed straight to udp_wput(). For an unconnected socket the data is
 * linked behind a T_UNITDATA_REQ header; for a connected socket the data
 * is sent on its own.
 *
 * Slow path: otherwise a connected socket falls back to strwrite() and an
 * unconnected one to kstrputmsg().
 *
 * Returns 0 or an errno value (EINTR if the header allocation was
 * interrupted).
 */
/* ARGSUSED */
static int
sodgram_direct(struct sonode *so, struct sockaddr *name,
    socklen_t namelen, struct uio *uiop, int flags)
{
	struct T_unitdata_req	tudr;
	mblk_t			*mp = NULL;
	int			error = 0;
	void			*addr;
	socklen_t		addrlen;
	ssize_t			len;
	struct stdata		*stp = SOTOV(so)->v_stream;
	int			so_state;
	queue_t			*udp_wq;
	boolean_t		connected;
	mblk_t			*mpdata = NULL;
	sotpi_info_t		*sti = SOTOTPI(so);
	uint32_t		auditing = AU_AUDITING();

	ASSERT(name != NULL && namelen != 0);
	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
	ASSERT(!(so->so_mode & SM_EXDATA));
	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
	ASSERT(SOTOV(so)->v_type == VSOCK);

	/* Caller checked for proper length */
	len = uiop->uio_resid;
	ASSERT(len <= sti->sti_tidu_size);

	/* Length and family checks have been done by caller */
	ASSERT(name->sa_family == so->so_family);
	ASSERT(so->so_family == AF_INET ||
	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
	ASSERT(so->so_family == AF_INET6 ||
	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));

	addr = name;
	addrlen = namelen;

	/* straccess() is consulted only when sd_sidp is set. */
	if (stp->sd_sidp != NULL &&
	    (error = straccess(stp, JCWRITE)) != 0)
		goto done;

	/* Sample the state once; no lock is taken here. */
	so_state = so->so_state;

	connected = so_state & SS_ISCONNECTED;
	if (!connected) {
		/* Unconnected: build a T_UNITDATA_REQ carrying the address. */
		tudr.PRIM_type = T_UNITDATA_REQ;
		tudr.DEST_length = addrlen;
		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
		tudr.OPT_length = 0;
		tudr.OPT_offset = 0;

		mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
		    _ALLOC_INTR, CRED());
		if (mp == NULL) {
			/*
			 * Caught a signal waiting for memory.
			 * Let send* return EINTR.
			 */
			error = EINTR;
			goto done;
		}
	}

	/*
	 * For UDP we don't break up the copyin into smaller pieces
	 * as in the TCP case.  That means if ENOMEM is returned by
	 * mcopyinuio() then the uio vector has not been modified at
	 * all and we fallback to either strwrite() or kstrputmsg()
	 * below.  Note also that we never generate priority messages
	 * from here.
	 */
	udp_wq = stp->sd_wrq->q_next;
	if (canput(udp_wq) &&
	    (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
		ASSERT(DB_TYPE(mpdata) == M_DATA);
		ASSERT(uiop->uio_resid == 0);
		if (!connected)
			linkb(mp, mpdata);	/* header followed by data */
		else
			mp = mpdata;		/* data only */
		if (auditing)
			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);

		udp_wput(udp_wq, mp);
		return (0);
	}

	/*
	 * Fast path not taken: either canput() failed (mcopyinuio() was
	 * not called and error is still 0) or mcopyinuio() returned NULL.
	 * Any error other than ENOMEM is final; mp is NULL here when the
	 * socket is connected.
	 */
	ASSERT(mpdata == NULL);
	if (error != 0 && error != ENOMEM) {
		freemsg(mp);
		return (error);
	}

	/*
	 * For connected, let strwrite() handle the blocking case.
	 * Otherwise we fall thru and use kstrputmsg().
	 */
	if (connected)
		return (strwrite(SOTOV(so), uiop, CRED()));

	if (auditing)
		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);

	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
done:
#ifdef SOCK_DEBUG
	if (error != 0) {
		eprintsoline(so, error);
	}
#endif /* SOCK_DEBUG */
	return (error);
}
4695 
/*
 * Direct-call send path for byte stream sockets: data is handed to the
 * queue below the stream head with tcp_wput() rather than being queued
 * through the stream head.
 *
 * Two calling modes:
 *	uiop == NULL	mp is a ready-made message (see kstrwritemp()); it
 *			is passed through the optional sd_wputdatafunc hook
 *			and sent down as is.
 *	uiop != NULL	user data is copied in with mcopyinuio() in pieces
 *			of at most sd_qn_maxpsz bytes and each piece is sent
 *			down. The "mp" argument is not used as input in this
 *			mode; the variable is reused for the copied-in data.
 *
 * Whenever the fast path cannot be used (error/hangup/delimiter flags on
 * the stream head, ENOMEM from mcopyinuio(), or flow control) the
 * remaining data is handed to strwrite()/strwrite_common().
 *
 * Returns 0 or an errno value. ECOMM is returned when sd_wputdatafunc
 * returns NULL.
 */
int
sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
{
	struct stdata *stp = SOTOV(so)->v_stream;
	ssize_t iosize, rmax, maxblk;
	queue_t *tcp_wq = stp->sd_wrq->q_next;
	mblk_t *newmp;
	int error = 0, wflag = 0;

	ASSERT(so->so_mode & SM_BYTESTREAM);
	ASSERT(SOTOV(so)->v_type == VSOCK);

	/* straccess() is consulted only when sd_sidp is set. */
	if (stp->sd_sidp != NULL &&
	    (error = straccess(stp, JCWRITE)) != 0)
		return (error);

	if (uiop == NULL) {
		/*
		 * kstrwritemp() should have checked sd_flag and
		 * flow-control before coming here.  If we end up
		 * here it means that we can simply pass down the
		 * data to tcp.
		 */
		ASSERT(mp != NULL);
		if (stp->sd_wputdatafunc != NULL) {
			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
			    NULL, NULL, NULL);
			if (newmp == NULL) {
				/* The caller will free mp */
				return (ECOMM);
			}
			mp = newmp;
		}
		tcp_wput(tcp_wq, mp);
		return (0);
	}

	/* Fallback to strwrite() to do proper error handling */
	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
		return (strwrite(SOTOV(so), uiop, cr));

	/* rmax bounds the amount of data copied in per iteration. */
	rmax = stp->sd_qn_maxpsz;
	ASSERT(rmax >= 0 || rmax == INFPSZ);
	if (rmax == 0 || uiop->uio_resid <= 0)
		return (0);

	/* No limit: try to copy in everything at once. */
	if (rmax == INFPSZ)
		rmax = uiop->uio_resid;

	maxblk = stp->sd_maxblk;

	for (;;) {
		iosize = MIN(uiop->uio_resid, rmax);

		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
		if (mp == NULL) {
			/*
			 * Fallback to strwrite() for ENOMEM; if this
			 * is our first time in this routine and the uio
			 * vector has not been modified, we will end up
			 * calling strwrite() without any flag set.
			 */
			if (error == ENOMEM)
				goto slow_send;
			else
				return (error);
		}
		ASSERT(uiop->uio_resid >= 0);
		/*
		 * If mp is non-NULL and ENOMEM is set, it means that
		 * mcopyinuio() was able to break down some of the user
		 * data into one or more mblks.  Send the partial data
		 * to tcp and let the rest be handled in strwrite().
		 */
		ASSERT(error == 0 || error == ENOMEM);
		if (stp->sd_wputdatafunc != NULL) {
			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
			    NULL, NULL, NULL);
			if (newmp == NULL) {
				/* The caller will free mp */
				return (ECOMM);
			}
			mp = newmp;
		}
		tcp_wput(tcp_wq, mp);

		/*
		 * Some data has been sent from here on; NOINTR is handed
		 * to strwrite_common() if the slow path is taken later
		 * (see the comment at slow_send).
		 */
		wflag |= NOINTR;

		if (uiop->uio_resid == 0) {	/* No more data; we're done */
			ASSERT(error == 0);
			break;
		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
			/*
			 * slow_send is also entered by the goto above, in
			 * which case wflag holds whatever earlier iterations
			 * left in it (0 on the first pass).
			 */
slow_send:
			/*
			 * We were able to send down partial data using
			 * the direct call interface, but are now relying
			 * on strwrite() to handle the non-fastpath cases.
			 * If the socket is blocking we will sleep in
			 * strwaitq() until write is permitted, otherwise,
			 * we will need to return the amount of bytes
			 * written so far back to the app.  This is the
			 * reason why we pass NOINTR flag to strwrite()
			 * for non-blocking socket, because we don't want
			 * to return EAGAIN when portion of the user data
			 * has actually been sent down.
			 */
			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
		}
	}
	return (0);
}
4808 
/*
 * Update sti_faddr by asking the transport (unless AF_UNIX).
 *
 * Copies the peer address into "name" (at most *namelen bytes) and stores
 * the address length in *namelen.
 *
 *	accept		when set, the cached sti_faddr_sa is returned
 *			without checking the connection state.
 *
 * Otherwise the socket must be connected (ENOTCONN) and, unless
 * xnet_skip_checks is set, must not have SS_CANTSENDMORE set (EINVAL).
 * A cached address marked sti_faddr_valid is returned directly; for other
 * non-AF_UNIX sockets the transport is queried with TI_GETPEERNAME and the
 * cache is refreshed from the reply.
 *
 * *namelen is set to 0 when sti_faddr_noxlate is set.
 *
 * Returns 0 or an errno value. Errors from the ioctl itself are not
 * returned to the caller (see below).
 */
/* ARGSUSED */
int
sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
    boolean_t accept, struct cred *cr)
{
	struct strbuf	strbuf;
	int		error = 0, res;
	void		*addr;
	t_uscalar_t	addrlen;
	k_sigset_t	smask;
	sotpi_info_t	*sti = SOTOTPI(so);

	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
	    (void *)so, pr_state(so->so_state, so->so_mode)));

	ASSERT(*namelen > 0);
	mutex_enter(&so->so_lock);
	so_lock_single(so);	/* Set SOLOCKED */

	if (accept) {
		/* Hand back the cached address; no state checks. */
		bcopy(sti->sti_faddr_sa, name,
		    MIN(*namelen, sti->sti_faddr_len));
		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
		goto done;
	}

	if (!(so->so_state & SS_ISCONNECTED)) {
		error = ENOTCONN;
		goto done;
	}
	/* Added this check for X/Open */
	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
		error = EINVAL;
		if (xnet_check_print) {
			printf("sockfs: X/Open getpeername check => EINVAL\n");
		}
		goto done;
	}

	if (sti->sti_faddr_valid) {
		/* The cached address is current; no need to ask below. */
		bcopy(sti->sti_faddr_sa, name,
		    MIN(*namelen, sti->sti_faddr_len));
		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
		goto done;
	}

#ifdef DEBUG
	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
	    pr_addr(so->so_family, sti->sti_faddr_sa,
	    (t_uscalar_t)sti->sti_faddr_len)));
#endif /* DEBUG */

	if (so->so_family == AF_UNIX) {
		/*
		 * Transport has different name space - return local info
		 *
		 * NOTE(review): nothing is copied into "name" on this path
		 * and *namelen is only changed for noxlate sockets - confirm
		 * that AF_UNIX callers never get here with
		 * sti_faddr_valid clear and an address expected.
		 */
		if (sti->sti_faddr_noxlate)
			*namelen = 0;
		error = 0;
		goto done;
	}

	ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);

	ASSERT(sti->sti_faddr_sa);
	/*
	 * Allocate local buffer to use with ioctl. so_lock is dropped
	 * around the allocation and the ioctl; SOLOCKED (set above)
	 * remains set until "done".
	 */
	addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
	mutex_exit(&so->so_lock);
	addr = kmem_alloc(addrlen, KM_SLEEP);

	/*
	 * Issue TI_GETPEERNAME with signals masked.
	 * Put the result in sti_faddr_sa so that getpeername works after
	 * a shutdown(output).
	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is
	 * dropped rather than reposted on the socket; see the comment
	 * in the error branch below.
	 */
	strbuf.buf = addr;
	strbuf.maxlen = addrlen;
	strbuf.len = 0;

	sigintr(&smask, 0);
	res = 0;
	ASSERT(cr);
	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
	    0, K_TO_K, cr, &res);
	sigunintr(&smask);

	mutex_enter(&so->so_lock);
	/*
	 * If there is an error, don't fail the getpeername. The recorded
	 * sti->sti_faddr_sa is left unchanged in that case.
	 *
	 * NOTE(review): when the ioctl fails (or returns no address, or
	 * the socket is no longer connected) neither "name" nor *namelen
	 * is updated here - confirm callers tolerate that.
	 */
	if (error) {
		/*
		 * Various stream head errors can be returned to the ioctl.
		 * However, it is impossible to determine which ones of
		 * these are really socket level errors that were incorrectly
		 * consumed by the ioctl. Thus this code silently ignores the
		 * error - the code explicitly does not reinstate the error
		 * using soseterror().
		 * Experiments have shown that at least this set of
		 * errors are reported and should not be reinstated on the
		 * socket:
		 *	EINVAL	E.g. if an I_LINK was in effect when
		 *		getpeername was called.
		 *	EPIPE	The ioctl error semantics prefer the write
		 *		side error over the read side error.
		 *	ENOTCONN The transport just got disconnected but
		 *		sockfs had not yet seen the T_DISCON_IND
		 *		when issuing the ioctl.
		 */
		error = 0;
	} else if (res == 0 && strbuf.len > 0 &&
	    (so->so_state & SS_ISCONNECTED)) {
		/* Refresh the cache, then copy out to the caller. */
		ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
		sti->sti_faddr_len = (socklen_t)strbuf.len;
		bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
		sti->sti_faddr_valid = 1;

		bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
		*namelen = sti->sti_faddr_len;
	}
	kmem_free(addr, addrlen);
#ifdef DEBUG
	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
	    pr_addr(so->so_family, sti->sti_faddr_sa,
	    (t_uscalar_t)sti->sti_faddr_len)));
#endif /* DEBUG */
done:
	so_unlock_single(so, SOLOCKED);
	mutex_exit(&so->so_lock);
	return (error);
}
4945 
4946 /*
4947  * Update sti_laddr by asking the transport (unless AF_UNIX).
4948  */
4949 int
4950 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4951     struct cred *cr)
4952 {
4953 	struct strbuf	strbuf;
4954 	int		error = 0, res;
4955 	void		*addr;
4956 	t_uscalar_t	addrlen;
4957 	k_sigset_t	smask;
4958 	sotpi_info_t	*sti = SOTOTPI(so);
4959 
4960 	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
4961 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4962 
4963 	ASSERT(*namelen > 0);
4964 	mutex_enter(&so->so_lock);
4965 	so_lock_single(so);	/* Set SOLOCKED */
4966 
4967 #ifdef DEBUG
4968 
4969 	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
4970 	    pr_addr(so->so_family, sti->sti_laddr_sa,
4971 	    (t_uscalar_t)sti->sti_laddr_len)));
4972 #endif /* DEBUG */
4973 	if (sti->sti_laddr_valid) {
4974 		bcopy(sti->sti_laddr_sa, name,
4975 		    MIN(*namelen, sti->sti_laddr_len));
4976 		*namelen = sti->sti_laddr_len;
4977 		goto done;
4978 	}
4979 
4980 	if (so->so_family == AF_UNIX) {
4981 		/* Transport has different name space - return local info */
4982 		error = 0;
4983 		*namelen = 0;
4984 		goto done;
4985 	}
4986 	if (!(so->so_state & SS_ISBOUND)) {
4987 		/* If not bound, then nothing to return. */
4988 		error = 0;
4989 		goto done;
4990 	}
4991 
4992 	/* Allocate local buffer to use with ioctl */
4993 	addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
4994 	mutex_exit(&so->so_lock);
4995 	addr = kmem_alloc(addrlen, KM_SLEEP);
4996 
4997 	/*
4998 	 * Issue TI_GETMYNAME with signals masked.
4999 	 * Put the result in sti_laddr_sa so that getsockname works after
5000 	 * a shutdown(output).
5001 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
5002 	 * back to the socket.
5003 	 */
5004 	strbuf.buf = addr;
5005 	strbuf.maxlen = addrlen;
5006 	strbuf.len = 0;
5007 
5008 	sigintr(&smask, 0);
5009 	res = 0;
5010 	ASSERT(cr);
5011 	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
5012 	    0, K_TO_K, cr, &res);
5013 	sigunintr(&smask);
5014 
5015 	mutex_enter(&so->so_lock);
5016 	/*
5017 	 * If there is an error record the error in so_error put don't fail
5018 	 * the getsockname. Instead fallback on the recorded
5019 	 * sti->sti_laddr_sa.
5020 	 */
5021 	if (error) {
5022 		/*
5023 		 * Various stream head errors can be returned to the ioctl.
5024 		 * However, it is impossible to determine which ones of
5025 		 * these are really socket level errors that were incorrectly
5026 		 * consumed by the ioctl. Thus this code silently ignores the
5027 		 * error - to code explicitly does not reinstate the error
5028 		 * using soseterror().
5029 		 * Experiments have shows that at least this set of
5030 		 * errors are reported and should not be reinstated on the
5031 		 * socket:
5032 		 *	EINVAL	E.g. if an I_LINK was in effect when
5033 		 *		getsockname was called.
5034 		 *	EPIPE	The ioctl error semantics prefer the write
5035 		 *		side error over the read side error.
5036 		 */
5037 		error = 0;
5038 	} else if (res == 0 && strbuf.len > 0 &&
5039 	    (so->so_state & SS_ISBOUND)) {
5040 		ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
5041 		sti->sti_laddr_len = (socklen_t)strbuf.len;
5042 		bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
5043 		sti->sti_laddr_valid = 1;
5044 
5045 		bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
5046 		*namelen = sti->sti_laddr_len;
5047 	}
5048 	kmem_free(addr, addrlen);
5049 #ifdef DEBUG
5050 	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
5051 	    pr_addr(so->so_family, sti->sti_laddr_sa,
5052 	    (t_uscalar_t)sti->sti_laddr_len)));
5053 #endif /* DEBUG */
5054 done:
5055 	so_unlock_single(so, SOLOCKED);
5056 	mutex_exit(&so->so_lock);
5057 	return (error);
5058 }
5059 
5060 /*
5061  * Get socket options. For SOL_SOCKET options some options are handled
5062  * by the sockfs while others use the value recorded in the sonode as a
5063  * fallback should the T_SVR4_OPTMGMT_REQ fail.
5064  *
5065  * On the return most *optlenp bytes are copied to optval.
5066  */
5067 /* ARGSUSED */
5068 int
5069 sotpi_getsockopt(struct sonode *so, int level, int option_name,
5070 		void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5071 {
5072 	struct T_optmgmt_req	optmgmt_req;
5073 	struct T_optmgmt_ack	*optmgmt_ack;
5074 	struct opthdr		oh;
5075 	struct opthdr		*opt_res;
5076 	mblk_t			*mp = NULL;
5077 	int			error = 0;
5078 	void			*option = NULL;	/* Set if fallback value */
5079 	t_uscalar_t		maxlen = *optlenp;
5080 	t_uscalar_t		len;
5081 	uint32_t		value;
5082 	struct timeval		tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5083 	struct timeval32	tmo_val32;
5084 	struct so_snd_bufinfo	snd_bufinfo;	/* used for zero copy */
5085 
5086 	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5087 	    (void *)so, level, option_name, optval, (void *)optlenp,
5088 	    pr_state(so->so_state, so->so_mode)));
5089 
5090 	mutex_enter(&so->so_lock);
5091 	so_lock_single(so);	/* Set SOLOCKED */
5092 
5093 	/*
5094 	 * Check for SOL_SOCKET options.
5095 	 * Certain SOL_SOCKET options are returned directly whereas
5096 	 * others only provide a default (fallback) value should
5097 	 * the T_SVR4_OPTMGMT_REQ fail.
5098 	 */
5099 	if (level == SOL_SOCKET) {
5100 		/* Check parameters */
5101 		switch (option_name) {
5102 		case SO_TYPE:
5103 		case SO_ERROR:
5104 		case SO_DEBUG:
5105 		case SO_ACCEPTCONN:
5106 		case SO_REUSEADDR:
5107 		case SO_KEEPALIVE:
5108 		case SO_DONTROUTE:
5109 		case SO_BROADCAST:
5110 		case SO_USELOOPBACK:
5111 		case SO_OOBINLINE:
5112 		case SO_SNDBUF:
5113 		case SO_RCVBUF:
5114 #ifdef notyet
5115 		case SO_SNDLOWAT:
5116 		case SO_RCVLOWAT:
5117 #endif /* notyet */
5118 		case SO_DOMAIN:
5119 		case SO_DGRAM_ERRIND:
5120 			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5121 				error = EINVAL;
5122 				eprintsoline(so, error);
5123 				goto done2;
5124 			}
5125 			break;
5126 		case SO_RCVTIMEO:
5127 		case SO_SNDTIMEO:
5128 			if (get_udatamodel() == DATAMODEL_NONE ||
5129 			    get_udatamodel() == DATAMODEL_NATIVE) {
5130 				if (maxlen < sizeof (struct timeval)) {
5131 					error = EINVAL;
5132 					eprintsoline(so, error);
5133 					goto done2;
5134 				}
5135 			} else {
5136 				if (maxlen < sizeof (struct timeval32)) {
5137 					error = EINVAL;
5138 					eprintsoline(so, error);
5139 					goto done2;
5140 				}
5141 
5142 			}
5143 			break;
5144 		case SO_LINGER:
5145 			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5146 				error = EINVAL;
5147 				eprintsoline(so, error);
5148 				goto done2;
5149 			}
5150 			break;
5151 		case SO_SND_BUFINFO:
5152 			if (maxlen < (t_uscalar_t)
5153 			    sizeof (struct so_snd_bufinfo)) {
5154 				error = EINVAL;
5155 				eprintsoline(so, error);
5156 				goto done2;
5157 			}
5158 			break;
5159 		}
5160 
5161 		len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
5162 
5163 		switch (option_name) {
5164 		case SO_TYPE:
5165 			value = so->so_type;
5166 			option = &value;
5167 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5168 
5169 		case SO_ERROR:
5170 			value = sogeterr(so, B_TRUE);
5171 			option = &value;
5172 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5173 
5174 		case SO_ACCEPTCONN:
5175 			if (so->so_state & SS_ACCEPTCONN)
5176 				value = SO_ACCEPTCONN;
5177 			else
5178 				value = 0;
5179 #ifdef DEBUG
5180 			if (value) {
5181 				dprintso(so, 1,
5182 				    ("sotpi_getsockopt: 0x%x is set\n",
5183 				    option_name));
5184 			} else {
5185 				dprintso(so, 1,
5186 				    ("sotpi_getsockopt: 0x%x not set\n",
5187 				    option_name));
5188 			}
5189 #endif /* DEBUG */
5190 			option = &value;
5191 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5192 
5193 		case SO_DEBUG:
5194 		case SO_REUSEADDR:
5195 		case SO_KEEPALIVE:
5196 		case SO_DONTROUTE:
5197 		case SO_BROADCAST:
5198 		case SO_USELOOPBACK:
5199 		case SO_OOBINLINE:
5200 		case SO_DGRAM_ERRIND:
5201 			value = (so->so_options & option_name);
5202 #ifdef DEBUG
5203 			if (value) {
5204 				dprintso(so, 1,
5205 				    ("sotpi_getsockopt: 0x%x is set\n",
5206 				    option_name));
5207 			} else {
5208 				dprintso(so, 1,
5209 				    ("sotpi_getsockopt: 0x%x not set\n",
5210 				    option_name));
5211 			}
5212 #endif /* DEBUG */
5213 			option = &value;
5214 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5215 
5216 		/*
5217 		 * The following options are only returned by sockfs when the
5218 		 * T_SVR4_OPTMGMT_REQ fails.
5219 		 */
5220 		case SO_LINGER:
5221 			option = &so->so_linger;
5222 			len = (t_uscalar_t)sizeof (struct linger);
5223 			break;
5224 		case SO_SNDBUF: {
5225 			ssize_t lvalue;
5226 
5227 			/*
5228 			 * If the option has not been set then get a default
5229 			 * value from the read queue. This value is
5230 			 * returned if the transport fails
5231 			 * the T_SVR4_OPTMGMT_REQ.
5232 			 */
5233 			lvalue = so->so_sndbuf;
5234 			if (lvalue == 0) {
5235 				mutex_exit(&so->so_lock);
5236 				(void) strqget(strvp2wq(SOTOV(so))->q_next,
5237 				    QHIWAT, 0, &lvalue);
5238 				mutex_enter(&so->so_lock);
5239 				dprintso(so, 1,
5240 				    ("got SO_SNDBUF %ld from q\n", lvalue));
5241 			}
5242 			value = (int)lvalue;
5243 			option = &value;
5244 			len = (t_uscalar_t)sizeof (so->so_sndbuf);
5245 			break;
5246 		}
5247 		case SO_RCVBUF: {
5248 			ssize_t lvalue;
5249 
5250 			/*
5251 			 * If the option has not been set then get a default
5252 			 * value from the read queue. This value is
5253 			 * returned if the transport fails
5254 			 * the T_SVR4_OPTMGMT_REQ.
5255 			 *
5256 			 * XXX If SO_RCVBUF has been set and this is an
5257 			 * XPG 4.2 application then do not ask the transport
5258 			 * since the transport might adjust the value and not
5259 			 * return exactly what was set by the application.
5260 			 * For non-XPG 4.2 application we return the value
5261 			 * that the transport is actually using.
5262 			 */
5263 			lvalue = so->so_rcvbuf;
5264 			if (lvalue == 0) {
5265 				mutex_exit(&so->so_lock);
5266 				(void) strqget(RD(strvp2wq(SOTOV(so))),
5267 				    QHIWAT, 0, &lvalue);
5268 				mutex_enter(&so->so_lock);
5269 				dprintso(so, 1,
5270 				    ("got SO_RCVBUF %ld from q\n", lvalue));
5271 			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
5272 				value = (int)lvalue;
5273 				option = &value;
5274 				goto copyout;	/* skip asking transport */
5275 			}
5276 			value = (int)lvalue;
5277 			option = &value;
5278 			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5279 			break;
5280 		}
5281 		case SO_DOMAIN:
5282 			value = so->so_family;
5283 			option = &value;
5284 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5285 
5286 #ifdef notyet
5287 		/*
5288 		 * We do not implement the semantics of these options
5289 		 * thus we shouldn't implement the options either.
5290 		 */
5291 		case SO_SNDLOWAT:
5292 			value = so->so_sndlowat;
5293 			option = &value;
5294 			break;
5295 		case SO_RCVLOWAT:
5296 			value = so->so_rcvlowat;
5297 			option = &value;
5298 			break;
5299 #endif /* notyet */
5300 		case SO_SNDTIMEO:
5301 		case SO_RCVTIMEO: {
5302 			clock_t val;
5303 
5304 			if (option_name == SO_RCVTIMEO)
5305 				val = drv_hztousec(so->so_rcvtimeo);
5306 			else
5307 				val = drv_hztousec(so->so_sndtimeo);
5308 			tmo_val.tv_sec = val / (1000 * 1000);
5309 			tmo_val.tv_usec = val % (1000 * 1000);
5310 			if (get_udatamodel() == DATAMODEL_NONE ||
5311 			    get_udatamodel() == DATAMODEL_NATIVE) {
5312 				option = &tmo_val;
5313 				len = sizeof (struct timeval);
5314 			} else {
5315 				TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5316 				option = &tmo_val32;
5317 				len = sizeof (struct timeval32);
5318 			}
5319 			break;
5320 		}
5321 		case SO_SND_BUFINFO: {
5322 			snd_bufinfo.sbi_wroff =
5323 			    (so->so_proto_props).sopp_wroff;
5324 			snd_bufinfo.sbi_maxblk =
5325 			    (so->so_proto_props).sopp_maxblk;
5326 			snd_bufinfo.sbi_maxpsz =
5327 			    (so->so_proto_props).sopp_maxpsz;
5328 			snd_bufinfo.sbi_tail =
5329 			    (so->so_proto_props).sopp_tail;
5330 			option = &snd_bufinfo;
5331 			len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5332 			break;
5333 		}
5334 		}
5335 	}
5336 
5337 	mutex_exit(&so->so_lock);
5338 
5339 	/* Send request */
5340 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5341 	optmgmt_req.MGMT_flags = T_CHECK;
5342 	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5343 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5344 
5345 	oh.level = level;
5346 	oh.name = option_name;
5347 	oh.len = maxlen;
5348 
5349 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5350 	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5351 	/* Let option management work in the presence of data flow control */
5352 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5353 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5354 	mp = NULL;
5355 	mutex_enter(&so->so_lock);
5356 	if (error) {
5357 		eprintsoline(so, error);
5358 		goto done2;
5359 	}
5360 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5361 	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5362 	if (error) {
5363 		if (option != NULL) {
5364 			/* We have a fallback value */
5365 			error = 0;
5366 			goto copyout;
5367 		}
5368 		eprintsoline(so, error);
5369 		goto done2;
5370 	}
5371 	ASSERT(mp);
5372 	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5373 	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5374 	    optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5375 	if (opt_res == NULL) {
5376 		if (option != NULL) {
5377 			/* We have a fallback value */
5378 			error = 0;
5379 			goto copyout;
5380 		}
5381 		error = EPROTO;
5382 		eprintsoline(so, error);
5383 		goto done;
5384 	}
5385 	option = &opt_res[1];
5386 
5387 	/* check to ensure that the option is within bounds */
5388 	if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5389 	    (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5390 		if (option != NULL) {
5391 			/* We have a fallback value */
5392 			error = 0;
5393 			goto copyout;
5394 		}
5395 		error = EPROTO;
5396 		eprintsoline(so, error);
5397 		goto done;
5398 	}
5399 
5400 	len = opt_res->len;
5401 
5402 copyout: {
5403 		t_uscalar_t size = MIN(len, maxlen);
5404 		bcopy(option, optval, size);
5405 		bcopy(&size, optlenp, sizeof (size));
5406 	}
5407 done:
5408 	freemsg(mp);
5409 done2:
5410 	so_unlock_single(so, SOLOCKED);
5411 	mutex_exit(&so->so_lock);
5412 
5413 	return (error);
5414 }
5415 
5416 /*
5417  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5418  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5419  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5420  * setsockopt has to work even if the transport does not support the option.
5421  */
5422 /* ARGSUSED */
5423 int
5424 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5425 	const void *optval, t_uscalar_t optlen, struct cred *cr)
5426 {
5427 	struct T_optmgmt_req	optmgmt_req;
5428 	struct opthdr		oh;
5429 	mblk_t			*mp;
5430 	int			error = 0;
5431 	boolean_t		handled = B_FALSE;
5432 
5433 	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5434 	    (void *)so, level, option_name, optval, optlen,
5435 	    pr_state(so->so_state, so->so_mode)));
5436 
5437 	/* X/Open requires this check */
5438 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5439 		if (xnet_check_print)
5440 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
5441 		return (EINVAL);
5442 	}
5443 
5444 	mutex_enter(&so->so_lock);
5445 	so_lock_single(so);	/* Set SOLOCKED */
5446 	mutex_exit(&so->so_lock);
5447 
5448 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5449 	optmgmt_req.MGMT_flags = T_NEGOTIATE;
5450 	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5451 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5452 
5453 	oh.level = level;
5454 	oh.name = option_name;
5455 	oh.len = optlen;
5456 
5457 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5458 	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5459 	/* Let option management work in the presence of data flow control */
5460 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5461 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5462 	mp = NULL;
5463 	mutex_enter(&so->so_lock);
5464 	if (error) {
5465 		eprintsoline(so, error);
5466 		goto done2;
5467 	}
5468 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5469 	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5470 	if (error) {
5471 		eprintsoline(so, error);
5472 		goto done;
5473 	}
5474 	ASSERT(mp);
5475 	/* No need to verify T_optmgmt_ack */
5476 	freemsg(mp);
5477 done:
5478 	/*
5479 	 * Check for SOL_SOCKET options and record their values.
5480 	 * If we know about a SOL_SOCKET parameter and the transport
5481 	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5482 	 * EPROTO) we let the setsockopt succeed.
5483 	 */
5484 	if (level == SOL_SOCKET) {
5485 		/* Check parameters */
5486 		switch (option_name) {
5487 		case SO_DEBUG:
5488 		case SO_REUSEADDR:
5489 		case SO_KEEPALIVE:
5490 		case SO_DONTROUTE:
5491 		case SO_BROADCAST:
5492 		case SO_USELOOPBACK:
5493 		case SO_OOBINLINE:
5494 		case SO_SNDBUF:
5495 		case SO_RCVBUF:
5496 #ifdef notyet
5497 		case SO_SNDLOWAT:
5498 		case SO_RCVLOWAT:
5499 #endif /* notyet */
5500 		case SO_DGRAM_ERRIND:
5501 			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5502 				error = EINVAL;
5503 				eprintsoline(so, error);
5504 				goto done2;
5505 			}
5506 			ASSERT(optval);
5507 			handled = B_TRUE;
5508 			break;
5509 		case SO_SNDTIMEO:
5510 		case SO_RCVTIMEO:
5511 			if (get_udatamodel() == DATAMODEL_NONE ||
5512 			    get_udatamodel() == DATAMODEL_NATIVE) {
5513 				if (optlen != sizeof (struct timeval)) {
5514 					error = EINVAL;
5515 					eprintsoline(so, error);
5516 					goto done2;
5517 				}
5518 			} else {
5519 				if (optlen != sizeof (struct timeval32)) {
5520 					error = EINVAL;
5521 					eprintsoline(so, error);
5522 					goto done2;
5523 				}
5524 			}
5525 			ASSERT(optval);
5526 			handled = B_TRUE;
5527 			break;
5528 		case SO_LINGER:
5529 			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5530 				error = EINVAL;
5531 				eprintsoline(so, error);
5532 				goto done2;
5533 			}
5534 			ASSERT(optval);
5535 			handled = B_TRUE;
5536 			break;
5537 		}
5538 
5539 #define	intvalue	(*(int32_t *)optval)
5540 
5541 		switch (option_name) {
5542 		case SO_TYPE:
5543 		case SO_ERROR:
5544 		case SO_ACCEPTCONN:
5545 			/* Can't be set */
5546 			error = ENOPROTOOPT;
5547 			goto done2;
5548 		case SO_LINGER: {
5549 			struct linger *l = (struct linger *)optval;
5550 
5551 			so->so_linger.l_linger = l->l_linger;
5552 			if (l->l_onoff) {
5553 				so->so_linger.l_onoff = SO_LINGER;
5554 				so->so_options |= SO_LINGER;
5555 			} else {
5556 				so->so_linger.l_onoff = 0;
5557 				so->so_options &= ~SO_LINGER;
5558 			}
5559 			break;
5560 		}
5561 
5562 		case SO_DEBUG:
5563 #ifdef SOCK_TEST
5564 			if (intvalue & 2)
5565 				sock_test_timelimit = 10 * hz;
5566 			else
5567 				sock_test_timelimit = 0;
5568 
5569 			if (intvalue & 4)
5570 				do_useracc = 0;
5571 			else
5572 				do_useracc = 1;
5573 #endif /* SOCK_TEST */
5574 			/* FALLTHRU */
5575 		case SO_REUSEADDR:
5576 		case SO_KEEPALIVE:
5577 		case SO_DONTROUTE:
5578 		case SO_BROADCAST:
5579 		case SO_USELOOPBACK:
5580 		case SO_OOBINLINE:
5581 		case SO_DGRAM_ERRIND:
5582 			if (intvalue != 0) {
5583 				dprintso(so, 1,
5584 				    ("socket_setsockopt: setting 0x%x\n",
5585 				    option_name));
5586 				so->so_options |= option_name;
5587 			} else {
5588 				dprintso(so, 1,
5589 				    ("socket_setsockopt: clearing 0x%x\n",
5590 				    option_name));
5591 				so->so_options &= ~option_name;
5592 			}
5593 			break;
5594 		/*
5595 		 * The following options are only returned by us when the
5596 		 * transport layer fails.
5597 		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5598 		 * since the transport might adjust the value and not
5599 		 * return exactly what was set by the application.
5600 		 */
5601 		case SO_SNDBUF:
5602 			so->so_sndbuf = intvalue;
5603 			break;
5604 		case SO_RCVBUF:
5605 			so->so_rcvbuf = intvalue;
5606 			break;
5607 		case SO_RCVPSH:
5608 			so->so_rcv_timer_interval = intvalue;
5609 			break;
5610 #ifdef notyet
5611 		/*
5612 		 * We do not implement the semantics of these options
5613 		 * thus we shouldn't implement the options either.
5614 		 */
5615 		case SO_SNDLOWAT:
5616 			so->so_sndlowat = intvalue;
5617 			break;
5618 		case SO_RCVLOWAT:
5619 			so->so_rcvlowat = intvalue;
5620 			break;
5621 #endif /* notyet */
5622 		case SO_SNDTIMEO:
5623 		case SO_RCVTIMEO: {
5624 			struct timeval tl;
5625 			clock_t val;
5626 
5627 			if (get_udatamodel() == DATAMODEL_NONE ||
5628 			    get_udatamodel() == DATAMODEL_NATIVE)
5629 				bcopy(&tl, (struct timeval *)optval,
5630 				    sizeof (struct timeval));
5631 			else
5632 				TIMEVAL32_TO_TIMEVAL(&tl,
5633 				    (struct timeval32 *)optval);
5634 			val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5635 			if (option_name == SO_RCVTIMEO)
5636 				so->so_rcvtimeo = drv_usectohz(val);
5637 			else
5638 				so->so_sndtimeo = drv_usectohz(val);
5639 			break;
5640 		}
5641 		}
5642 #undef	intvalue
5643 
5644 		if (error) {
5645 			if ((error == ENOPROTOOPT || error == EPROTO ||
5646 			    error == EINVAL) && handled) {
5647 				dprintso(so, 1,
5648 				    ("setsockopt: ignoring error %d for 0x%x\n",
5649 				    error, option_name));
5650 				error = 0;
5651 			}
5652 		}
5653 	}
5654 done2:
5655 	so_unlock_single(so, SOLOCKED);
5656 	mutex_exit(&so->so_lock);
5657 	return (error);
5658 }
5659 
5660 /*
5661  * sotpi_close() is called when the last open reference goes away.
5662  */
5663 /* ARGSUSED */
5664 int
5665 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5666 {
5667 	struct vnode *vp = SOTOV(so);
5668 	dev_t dev;
5669 	int error = 0;
5670 	sotpi_info_t *sti = SOTOTPI(so);
5671 
5672 	dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5673 	    (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5674 
5675 	dev = sti->sti_dev;
5676 
5677 	ASSERT(STREAMSTAB(getmajor(dev)));
5678 
5679 	mutex_enter(&so->so_lock);
5680 	so_lock_single(so);	/* Set SOLOCKED */
5681 
5682 	ASSERT(so_verify_oobstate(so));
5683 
5684 	if (sti->sti_nl7c_flags & NL7C_ENABLED) {
5685 		sti->sti_nl7c_flags = 0;
5686 		nl7c_close(so);
5687 	}
5688 
5689 	if (vp->v_stream != NULL) {
5690 		vnode_t *ux_vp;
5691 
5692 		if (so->so_family == AF_UNIX) {
5693 			/* Could avoid this when CANTSENDMORE for !dgram */
5694 			so_unix_close(so);
5695 		}
5696 
5697 		mutex_exit(&so->so_lock);
5698 		/*
5699 		 * Disassemble the linkage from the AF_UNIX underlying file
5700 		 * system vnode to this socket (by atomically clearing
5701 		 * v_stream in vn_rele_stream) before strclose clears sd_vnode
5702 		 * and frees the stream head.
5703 		 */
5704 		if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5705 			ASSERT(ux_vp->v_stream);
5706 			sti->sti_ux_bound_vp = NULL;
5707 			vn_rele_stream(ux_vp);
5708 		}
5709 		error = strclose(vp, flag, cr);
5710 		vp->v_stream = NULL;
5711 		mutex_enter(&so->so_lock);
5712 	}
5713 
5714 	/*
5715 	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
5716 	 */
5717 	so_flush_discon_ind(so);
5718 
5719 	so_unlock_single(so, SOLOCKED);
5720 	mutex_exit(&so->so_lock);
5721 
5722 	/*
5723 	 * Needed for STREAMs.
5724 	 * Decrement the device driver's reference count for streams
5725 	 * opened via the clone dip. The driver was held in clone_open().
5726 	 * The absence of clone_close() forces this asymmetry.
5727 	 */
5728 	if (so->so_flag & SOCLONE)
5729 		ddi_rele_driver(getmajor(dev));
5730 
5731 	return (error);
5732 }
5733 
5734 static int
5735 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
5736     struct cred *cr, int32_t *rvalp)
5737 {
5738 	struct vnode *vp = SOTOV(so);
5739 	sotpi_info_t *sti = SOTOTPI(so);
5740 	int error = 0;
5741 
5742 	dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
5743 	    cmd, arg, pr_state(so->so_state, so->so_mode)));
5744 
5745 	switch (cmd) {
5746 	case SIOCSQPTR:
5747 		/*
5748 		 * SIOCSQPTR is valid only when helper stream is created
5749 		 * by the protocol.
5750 		 */
5751 	case _I_INSERT:
5752 	case _I_REMOVE:
5753 		/*
5754 		 * Since there's no compelling reason to support these ioctls
5755 		 * on sockets, and doing so would increase the complexity
5756 		 * markedly, prevent it.
5757 		 */
5758 		return (EOPNOTSUPP);
5759 
5760 	case I_FIND:
5761 	case I_LIST:
5762 	case I_LOOK:
5763 	case I_POP:
5764 	case I_PUSH:
5765 		/*
5766 		 * To prevent races and inconsistencies between the actual
5767 		 * state of the stream and the state according to the sonode,
5768 		 * we serialize all operations which modify or operate on the
5769 		 * list of modules on the socket's stream.
5770 		 */
5771 		mutex_enter(&sti->sti_plumb_lock);
5772 		error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
5773 		mutex_exit(&sti->sti_plumb_lock);
5774 		return (error);
5775 
5776 	default:
5777 		if (so->so_version != SOV_STREAM)
5778 			break;
5779 
5780 		/*
5781 		 * The imaginary "sockmod" has been popped; act as a stream.
5782 		 */
5783 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5784 	}
5785 
5786 	ASSERT(so->so_version != SOV_STREAM);
5787 
5788 	/*
5789 	 * Process socket-specific ioctls.
5790 	 */
5791 	switch (cmd) {
5792 	case FIONBIO: {
5793 		int32_t value;
5794 
5795 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
5796 		    (mode & (int)FKIOCTL)))
5797 			return (EFAULT);
5798 
5799 		mutex_enter(&so->so_lock);
5800 		if (value) {
5801 			so->so_state |= SS_NDELAY;
5802 		} else {
5803 			so->so_state &= ~SS_NDELAY;
5804 		}
5805 		mutex_exit(&so->so_lock);
5806 		return (0);
5807 	}
5808 
5809 	case FIOASYNC: {
5810 		int32_t value;
5811 
5812 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
5813 		    (mode & (int)FKIOCTL)))
5814 			return (EFAULT);
5815 
5816 		mutex_enter(&so->so_lock);
5817 		/*
5818 		 * SS_ASYNC flag not already set correctly?
5819 		 * (!value != !(so->so_state & SS_ASYNC))
5820 		 * but some engineers find that too hard to read.
5821 		 */
5822 		if (value == 0 && (so->so_state & SS_ASYNC) != 0 ||
5823 		    value != 0 && (so->so_state & SS_ASYNC) == 0)
5824 			error = so_flip_async(so, vp, mode, cr);
5825 		mutex_exit(&so->so_lock);
5826 		return (error);
5827 	}
5828 
5829 	case SIOCSPGRP:
5830 	case FIOSETOWN: {
5831 		pid_t pgrp;
5832 
5833 		if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
5834 		    (mode & (int)FKIOCTL)))
5835 			return (EFAULT);
5836 
5837 		mutex_enter(&so->so_lock);
5838 		dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
5839 		/* Any change? */
5840 		if (pgrp != so->so_pgrp)
5841 			error = so_set_siggrp(so, vp, pgrp, mode, cr);
5842 		mutex_exit(&so->so_lock);
5843 		return (error);
5844 	}
5845 	case SIOCGPGRP:
5846 	case FIOGETOWN:
5847 		if (so_copyout(&so->so_pgrp, (void *)arg,
5848 		    sizeof (pid_t), (mode & (int)FKIOCTL)))
5849 			return (EFAULT);
5850 		return (0);
5851 
5852 	case SIOCATMARK: {
5853 		int retval;
5854 		uint_t so_state;
5855 
5856 		/*
5857 		 * strwaitmark has a finite timeout after which it
5858 		 * returns -1 if the mark state is undetermined.
5859 		 * In order to avoid any race between the mark state
5860 		 * in sockfs and the mark state in the stream head this
5861 		 * routine loops until the mark state can be determined
5862 		 * (or the urgent data indication has been removed by some
5863 		 * other thread).
5864 		 */
5865 		do {
5866 			mutex_enter(&so->so_lock);
5867 			so_state = so->so_state;
5868 			mutex_exit(&so->so_lock);
5869 			if (so_state & SS_RCVATMARK) {
5870 				retval = 1;
5871 			} else if (!(so_state & SS_OOBPEND)) {
5872 				/*
5873 				 * No SIGURG has been generated -- there is no
5874 				 * pending or present urgent data. Thus can't
5875 				 * possibly be at the mark.
5876 				 */
5877 				retval = 0;
5878 			} else {
5879 				/*
5880 				 * Have the stream head wait until there is
5881 				 * either some messages on the read queue, or
5882 				 * STRATMARK or STRNOTATMARK gets set. The
5883 				 * STRNOTATMARK flag is used so that the
5884 				 * transport can send up a MSGNOTMARKNEXT
5885 				 * M_DATA to indicate that it is not
5886 				 * at the mark and additional data is not about
5887 				 * to be send upstream.
5888 				 *
5889 				 * If the mark state is undetermined this will
5890 				 * return -1 and we will loop rechecking the
5891 				 * socket state.
5892 				 */
5893 				retval = strwaitmark(vp);
5894 			}
5895 		} while (retval == -1);
5896 
5897 		if (so_copyout(&retval, (void *)arg, sizeof (int),
5898 		    (mode & (int)FKIOCTL)))
5899 			return (EFAULT);
5900 		return (0);
5901 	}
5902 
5903 	case I_FDINSERT:
5904 	case I_SENDFD:
5905 	case I_RECVFD:
5906 	case I_ATMARK:
5907 	case _SIOCSOCKFALLBACK:
5908 		/*
5909 		 * These ioctls do not apply to sockets. I_FDINSERT can be
5910 		 * used to send M_PROTO messages without modifying the socket
5911 		 * state. I_SENDFD/RECVFD should not be used for socket file
5912 		 * descriptor passing since they assume a twisted stream.
5913 		 * SIOCATMARK must be used instead of I_ATMARK.
5914 		 *
5915 		 * _SIOCSOCKFALLBACK from an application should never be
5916 		 * processed.  It is only generated by socktpi_open() or
5917 		 * in response to I_POP or I_PUSH.
5918 		 */
5919 #ifdef DEBUG
5920 		zcmn_err(getzoneid(), CE_WARN,
5921 		    "Unsupported STREAMS ioctl 0x%x on socket. "
5922 		    "Pid = %d\n", cmd, curproc->p_pid);
5923 #endif /* DEBUG */
5924 		return (EOPNOTSUPP);
5925 
5926 	case _I_GETPEERCRED:
5927 		if ((mode & FKIOCTL) == 0)
5928 			return (EINVAL);
5929 
5930 		mutex_enter(&so->so_lock);
5931 		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
5932 			error = ENOTSUP;
5933 		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
5934 			error = ENOTCONN;
5935 		} else if (so->so_peercred != NULL) {
5936 			k_peercred_t *kp = (k_peercred_t *)arg;
5937 			kp->pc_cr = so->so_peercred;
5938 			kp->pc_cpid = so->so_cpid;
5939 			crhold(so->so_peercred);
5940 		} else {
5941 			error = EINVAL;
5942 		}
5943 		mutex_exit(&so->so_lock);
5944 		return (error);
5945 
5946 	default:
5947 		/*
5948 		 * Do the higher-order bits of the ioctl cmd indicate
5949 		 * that it is an I_* streams ioctl?
5950 		 */
5951 		if ((cmd & 0xffffff00U) == STR &&
5952 		    so->so_version == SOV_SOCKBSD) {
5953 #ifdef DEBUG
5954 			zcmn_err(getzoneid(), CE_WARN,
5955 			    "Unsupported STREAMS ioctl 0x%x on socket. "
5956 			    "Pid = %d\n", cmd, 	curproc->p_pid);
5957 #endif /* DEBUG */
5958 			return (EOPNOTSUPP);
5959 		}
5960 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5961 	}
5962 }
5963 
5964 /*
5965  * Handle plumbing-related ioctls.
5966  */
5967 static int
5968 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
5969     struct cred *cr, int32_t *rvalp)
5970 {
5971 	static const char sockmod_name[] = "sockmod";
5972 	struct sonode	*so = VTOSO(vp);
5973 	char		mname[FMNAMESZ + 1];
5974 	int		error;
5975 	sotpi_info_t	*sti = SOTOTPI(so);
5976 
5977 	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
5978 
5979 	if (so->so_version == SOV_SOCKBSD)
5980 		return (EOPNOTSUPP);
5981 
5982 	if (so->so_version == SOV_STREAM) {
5983 		/*
5984 		 * The imaginary "sockmod" has been popped - act as a stream.
5985 		 * If this is a push of sockmod then change back to a socket.
5986 		 */
5987 		if (cmd == I_PUSH) {
5988 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
5989 			    (void *)arg, mname, sizeof (mname), NULL);
5990 
5991 			if (error == 0 && strcmp(mname, sockmod_name) == 0) {
5992 				dprintso(so, 0, ("socktpi_ioctl: going to "
5993 				    "socket version\n"));
5994 				so_stream2sock(so);
5995 				return (0);
5996 			}
5997 		}
5998 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5999 	}
6000 
6001 	switch (cmd) {
6002 	case I_PUSH:
6003 		if (sti->sti_direct) {
6004 			mutex_enter(&so->so_lock);
6005 			so_lock_single(so);
6006 			mutex_exit(&so->so_lock);
6007 
6008 			error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
6009 			    cr, rvalp);
6010 
6011 			mutex_enter(&so->so_lock);
6012 			if (error == 0)
6013 				sti->sti_direct = 0;
6014 			so_unlock_single(so, SOLOCKED);
6015 			mutex_exit(&so->so_lock);
6016 
6017 			if (error != 0)
6018 				return (error);
6019 		}
6020 
6021 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6022 		if (error == 0)
6023 			sti->sti_pushcnt++;
6024 		return (error);
6025 
6026 	case I_POP:
6027 		if (sti->sti_pushcnt == 0) {
6028 			/* Emulate sockmod being popped */
6029 			dprintso(so, 0,
6030 			    ("socktpi_ioctl: going to STREAMS version\n"));
6031 			return (so_sock2stream(so));
6032 		}
6033 
6034 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6035 		if (error == 0)
6036 			sti->sti_pushcnt--;
6037 		return (error);
6038 
6039 	case I_LIST: {
6040 		struct str_mlist *kmlistp, *umlistp;
6041 		struct str_list	kstrlist;
6042 		ssize_t		kstrlistsize;
6043 		int		i, nmods;
6044 
6045 		STRUCT_DECL(str_list, ustrlist);
6046 		STRUCT_INIT(ustrlist, mode);
6047 
6048 		if (arg == NULL) {
6049 			error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6050 			if (error == 0)
6051 				(*rvalp)++;	/* Add one for sockmod */
6052 			return (error);
6053 		}
6054 
6055 		error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
6056 		    STRUCT_SIZE(ustrlist), mode & FKIOCTL);
6057 		if (error != 0)
6058 			return (error);
6059 
6060 		nmods = STRUCT_FGET(ustrlist, sl_nmods);
6061 		if (nmods <= 0)
6062 			return (EINVAL);
6063 		/*
6064 		 * Ceiling nmods at nstrpush to prevent someone from
6065 		 * maliciously consuming lots of kernel memory.
6066 		 */
6067 		nmods = MIN(nmods, nstrpush);
6068 
6069 		kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6070 		kstrlist.sl_nmods = nmods;
6071 		kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6072 
6073 		error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6074 		    cr, rvalp);
6075 		if (error != 0)
6076 			goto done;
6077 
6078 		/*
6079 		 * Considering the module list as a 0-based array of sl_nmods
6080 		 * modules, sockmod should conceptually exist at slot
6081 		 * sti_pushcnt.  Insert sockmod at this location by sliding all
6082 		 * of the module names after so_pushcnt over by one.  We know
6083 		 * that there will be room to do this since we allocated
6084 		 * sl_modlist with an additional slot.
6085 		 */
6086 		for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6087 			kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6088 
6089 		(void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6090 		kstrlist.sl_nmods++;
6091 
6092 		/*
6093 		 * Copy all of the entries out to ustrlist.
6094 		 */
6095 		kmlistp = kstrlist.sl_modlist;
6096 		umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6097 		for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6098 			error = so_copyout(kmlistp++, umlistp++,
6099 			    sizeof (struct str_mlist), mode & FKIOCTL);
6100 			if (error != 0)
6101 				goto done;
6102 		}
6103 
6104 		error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6105 		    mode & FKIOCTL);
6106 		if (error == 0)
6107 			*rvalp = 0;
6108 	done:
6109 		kmem_free(kstrlist.sl_modlist, kstrlistsize);
6110 		return (error);
6111 	}
6112 	case I_LOOK:
6113 		if (sti->sti_pushcnt == 0) {
6114 			return (so_copyout(sockmod_name, (void *)arg,
6115 			    sizeof (sockmod_name), mode & FKIOCTL));
6116 		}
6117 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6118 
6119 	case I_FIND:
6120 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6121 		if (error && error != EINVAL)
6122 			return (error);
6123 
6124 		/* if not found and string was sockmod return 1 */
6125 		if (*rvalp == 0 || error == EINVAL) {
6126 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6127 			    (void *)arg, mname, sizeof (mname), NULL);
6128 			if (error == ENAMETOOLONG)
6129 				error = EINVAL;
6130 
6131 			if (error == 0 && strcmp(mname, sockmod_name) == 0)
6132 				*rvalp = 1;
6133 		}
6134 		return (error);
6135 
6136 	default:
6137 		panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6138 		break;
6139 	}
6140 
6141 	return (0);
6142 }
6143 
6144 /*
6145  * Wrapper around the streams poll routine that implements socket poll
6146  * semantics.
6147  * The sockfs never calls pollwakeup itself - the stream head take care
6148  * of all pollwakeups. Since sockfs never holds so_lock when calling the
6149  * stream head there can never be a deadlock due to holding so_lock across
6150  * pollwakeup and acquiring so_lock in this routine.
6151  *
6152  * However, since the performance of VOP_POLL is critical we avoid
6153  * acquiring so_lock here. This is based on two assumptions:
6154  *  - The poll implementation holds locks to serialize the VOP_POLL call
6155  *    and a pollwakeup for the same pollhead. This ensures that should
6156  *    e.g. so_state change during a socktpi_poll call the pollwakeup
6157  *    (which strsock_* and strrput conspire to issue) is issued after
6158  *    the state change. Thus the pollwakeup will block until VOP_POLL has
6159  *    returned and then wake up poll and have it call VOP_POLL again.
6160  *  - The reading of so_state without holding so_lock does not result in
6161  *    stale data that is older than the latest state change that has dropped
6162  *    so_lock. This is ensured by the mutex_exit issuing the appropriate
6163  *    memory barrier to force the data into the coherency domain.
6164  */
6165 static int
6166 sotpi_poll(
6167 	struct sonode	*so,
6168 	short		events,
6169 	int		anyyet,
6170 	short		*reventsp,
6171 	struct pollhead **phpp)
6172 {
6173 	short origevents = events;
6174 	struct vnode *vp = SOTOV(so);
6175 	int error;
6176 	int so_state = so->so_state;	/* snapshot */
6177 	sotpi_info_t *sti = SOTOTPI(so);
6178 
6179 	dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6180 	    (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6181 
6182 	ASSERT(vp->v_type == VSOCK);
6183 	ASSERT(vp->v_stream != NULL);
6184 
6185 	if (so->so_version == SOV_STREAM) {
6186 		/* The imaginary "sockmod" has been popped - act as a stream */
6187 		return (strpoll(vp->v_stream, events, anyyet,
6188 		    reventsp, phpp));
6189 	}
6190 
6191 	if (!(so_state & SS_ISCONNECTED) &&
6192 	    (so->so_mode & SM_CONNREQUIRED)) {
6193 		/* Not connected yet - turn off write side events */
6194 		events &= ~(POLLOUT|POLLWRBAND);
6195 	}
6196 	/*
6197 	 * Check for errors without calling strpoll if the caller wants them.
6198 	 * In sockets the errors are represented as input/output events
6199 	 * and there is no need to ask the stream head for this information.
6200 	 */
6201 	if (so->so_error != 0 &&
6202 	    ((POLLIN|POLLRDNORM|POLLOUT) & origevents)  != 0) {
6203 		*reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6204 		return (0);
6205 	}
6206 	/*
6207 	 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6208 	 * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
6209 	 * will not trigger a POLLIN event with POLLRDDATA set.
6210 	 * The handling of urgent data (causing POLLRDBAND) is done by
6211 	 * inspecting SS_OOBPEND below.
6212 	 */
6213 	events |= POLLRDDATA;
6214 
6215 	/*
6216 	 * After shutdown(output) a stream head write error is set.
6217 	 * However, we should not return output events.
6218 	 */
6219 	events |= POLLNOERR;
6220 	error = strpoll(vp->v_stream, events, anyyet,
6221 	    reventsp, phpp);
6222 	if (error)
6223 		return (error);
6224 
6225 	ASSERT(!(*reventsp & POLLERR));
6226 
6227 	/*
6228 	 * Notes on T_CONN_IND handling for sockets.
6229 	 *
6230 	 * If strpoll() returned without events, SR_POLLIN is guaranteed
6231 	 * to be set, ensuring any subsequent strrput() runs pollwakeup().
6232 	 *
6233 	 * Since the so_lock is not held, soqueueconnind() may have run
6234 	 * and a T_CONN_IND may be waiting. We now check for any queued
6235 	 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6236 	 * to ensure poll returns.
6237 	 *
6238 	 * However:
6239 	 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6240 	 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6241 	 * the following actions will occur; taken together they ensure the
6242 	 * syscall will return.
6243 	 *
6244 	 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6245 	 *    the accept() was run on a non-blocking socket sowaitconnind()
6246 	 *    may have already returned EWOULDBLOCK, so not be waiting to
6247 	 *    process the message. Additionally socktpi_poll() has probably
6248 	 *    proceeded past the sti_conn_ind_head check below.
6249 	 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6250 	 *    this thread,  however that could occur before poll_common()
6251 	 *    has entered cv_wait.
6252 	 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6253 	 *
6254 	 * Before proceeding to cv_wait() in poll_common() for an event,
6255 	 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6256 	 * and if set, re-calls strpoll() to ensure the late arriving
6257 	 * T_CONN_IND is recognized, and pollsys() returns.
6258 	 */
6259 
6260 	if (sti->sti_conn_ind_head != NULL)
6261 		*reventsp |= (POLLIN|POLLRDNORM) & events;
6262 
6263 	if (so->so_state & SS_OOBPEND)
6264 		*reventsp |= POLLRDBAND & events;
6265 
6266 	if (sti->sti_nl7c_rcv_mp != NULL) {
6267 		*reventsp |= (POLLIN|POLLRDNORM) & events;
6268 	}
6269 	if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
6270 	    ((POLLIN|POLLRDNORM) & *reventsp)) {
6271 		sti->sti_nl7c_flags |= NL7C_POLLIN;
6272 	}
6273 
6274 	return (0);
6275 }
6276 
6277 /*ARGSUSED*/
6278 static int
6279 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6280 {
6281 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6282 	int error = 0;
6283 
6284 	error = sonode_constructor(buf, cdrarg, kmflags);
6285 	if (error != 0)
6286 		return (error);
6287 
6288 	error = i_sotpi_info_constructor(&st->st_info);
6289 	if (error != 0)
6290 		sonode_destructor(buf, cdrarg);
6291 
6292 	st->st_sonode.so_priv = &st->st_info;
6293 
6294 	return (error);
6295 }
6296 
6297 /*ARGSUSED1*/
6298 static void
6299 socktpi_destructor(void *buf, void *cdrarg)
6300 {
6301 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6302 
6303 	ASSERT(st->st_sonode.so_priv == &st->st_info);
6304 	st->st_sonode.so_priv = NULL;
6305 
6306 	i_sotpi_info_destructor(&st->st_info);
6307 	sonode_destructor(buf, cdrarg);
6308 }
6309 
6310 static int
6311 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6312 {
6313 	int retval;
6314 
6315 	if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6316 		struct sonode *so = (struct sonode *)buf;
6317 		sotpi_info_t *sti = SOTOTPI(so);
6318 
6319 		mutex_enter(&socklist.sl_lock);
6320 
6321 		sti->sti_next_so = socklist.sl_list;
6322 		sti->sti_prev_so = NULL;
6323 		if (sti->sti_next_so != NULL)
6324 			SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6325 		socklist.sl_list = so;
6326 
6327 		mutex_exit(&socklist.sl_lock);
6328 
6329 	}
6330 	return (retval);
6331 }
6332 
6333 static void
6334 socktpi_unix_destructor(void *buf, void *cdrarg)
6335 {
6336 	struct sonode	*so = (struct sonode *)buf;
6337 	sotpi_info_t	*sti = SOTOTPI(so);
6338 
6339 	mutex_enter(&socklist.sl_lock);
6340 
6341 	if (sti->sti_next_so != NULL)
6342 		SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6343 	if (sti->sti_prev_so != NULL)
6344 		SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6345 	else
6346 		socklist.sl_list = sti->sti_next_so;
6347 
6348 	mutex_exit(&socklist.sl_lock);
6349 
6350 	socktpi_destructor(buf, cdrarg);
6351 }
6352 
6353 int
6354 socktpi_init(void)
6355 {
6356 	/*
6357 	 * Create sonode caches.  We create a special one for AF_UNIX so
6358 	 * that we can track them for netstat(1m).
6359 	 */
6360 	socktpi_cache = kmem_cache_create("socktpi_cache",
6361 	    sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6362 	    socktpi_destructor, NULL, NULL, NULL, 0);
6363 
6364 	socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6365 	    sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6366 	    socktpi_unix_destructor, NULL, NULL, NULL, 0);
6367 
6368 	return (0);
6369 }
6370 
6371 /*
6372  * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6373  *
6374  * Caller must still update state and mode using sotpi_update_state().
6375  */
6376 int
6377 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6378     boolean_t *direct, queue_t **qp, struct cred *cr)
6379 {
6380 	sotpi_info_t *sti;
6381 	struct sockparams *origsp = so->so_sockparams;
6382 	sock_lower_handle_t handle = so->so_proto_handle;
6383 	struct stdata *stp;
6384 	struct vnode *vp;
6385 	queue_t *q;
6386 	int error = 0;
6387 
6388 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6389 	    SS_FALLBACK_PENDING);
6390 	ASSERT(SOCK_IS_NONSTR(so));
6391 
6392 	*qp = NULL;
6393 	*direct = B_FALSE;
6394 	so->so_sockparams = newsp;
6395 	/*
6396 	 * Allocate and initalize fields required by TPI.
6397 	 */
6398 	(void) sotpi_info_create(so, KM_SLEEP);
6399 	sotpi_info_init(so);
6400 
6401 	if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6402 		sotpi_info_fini(so);
6403 		sotpi_info_destroy(so);
6404 		return (error);
6405 	}
6406 	ASSERT(handle == so->so_proto_handle);
6407 	sti = SOTOTPI(so);
6408 	if (sti->sti_direct != 0)
6409 		*direct = B_TRUE;
6410 
6411 	/*
6412 	 * Keep the original sp around so we can properly dispose of the
6413 	 * sonode when the socket is being closed.
6414 	 */
6415 	sti->sti_orig_sp = origsp;
6416 
6417 	so_basic_strinit(so);	/* skips the T_CAPABILITY_REQ */
6418 	so_alloc_addr(so, so->so_max_addr_len);
6419 
6420 	/*
6421 	 * If the application has done a SIOCSPGRP, make sure the
6422 	 * STREAM head is aware. This needs to take place before
6423 	 * the protocol start sending up messages. Otherwise we
6424 	 * might miss to generate SIGPOLL.
6425 	 *
6426 	 * It is possible that the application will receive duplicate
6427 	 * signals if some were already generated for either data or
6428 	 * connection indications.
6429 	 */
6430 	if (so->so_pgrp != 0) {
6431 		if (so_set_events(so, so->so_vnode, cr) != 0)
6432 			so->so_pgrp = 0;
6433 	}
6434 
6435 	/*
6436 	 * Determine which queue to use.
6437 	 */
6438 	vp = SOTOV(so);
6439 	stp = vp->v_stream;
6440 	ASSERT(stp != NULL);
6441 	q = stp->sd_wrq->q_next;
6442 
6443 	/*
6444 	 * Skip any modules that may have been auto pushed when the device
6445 	 * was opened
6446 	 */
6447 	while (q->q_next != NULL)
6448 		q = q->q_next;
6449 	*qp = _RD(q);
6450 
6451 	/* This is now a STREAMS sockets */
6452 	so->so_not_str = B_FALSE;
6453 
6454 	return (error);
6455 }
6456 
6457 /*
6458  * Revert a TPI sonode. It is only allowed to revert the sonode during
6459  * the fallback process.
6460  */
6461 void
6462 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6463 {
6464 	vnode_t *vp = SOTOV(so);
6465 
6466 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6467 	    SS_FALLBACK_PENDING);
6468 	ASSERT(!SOCK_IS_NONSTR(so));
6469 	ASSERT(vp->v_stream != NULL);
6470 
6471 	strclean(vp);
6472 	(void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6473 
6474 	/*
6475 	 * Restore the original sockparams. The caller is responsible for
6476 	 * dropping the ref to the new sp.
6477 	 */
6478 	so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6479 
6480 	sotpi_info_fini(so);
6481 	sotpi_info_destroy(so);
6482 
6483 	/* This is no longer a STREAMS sockets */
6484 	so->so_not_str = B_TRUE;
6485 }
6486 
6487 void
6488 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6489     struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6490     socklen_t faddrlen, short opts)
6491 {
6492 	sotpi_info_t *sti = SOTOTPI(so);
6493 
6494 	so_proc_tcapability_ack(so, tcap);
6495 
6496 	so->so_options |= opts;
6497 
6498 	/*
6499 	 * Determine whether the foreign and local address are valid
6500 	 */
6501 	if (laddrlen != 0) {
6502 		ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6503 		sti->sti_laddr_len = laddrlen;
6504 		bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6505 		sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6506 	}
6507 
6508 	if (faddrlen != 0) {
6509 		ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6510 		sti->sti_faddr_len = faddrlen;
6511 		bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6512 		sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6513 	}
6514 
6515 }
6516 
6517 /*
6518  * Allocate enough space to cache the local and foreign addresses.
6519  */
6520 void
6521 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6522 {
6523 	sotpi_info_t *sti = SOTOTPI(so);
6524 
6525 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6526 	ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6527 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6528 	    P2ROUNDUP(maxlen, KMEM_ALIGN);
6529 	so->so_max_addr_len = sti->sti_laddr_maxlen;
6530 	sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6531 	sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6532 	    + sti->sti_laddr_maxlen);
6533 
6534 	if (so->so_family == AF_UNIX) {
6535 		/*
6536 		 * Initialize AF_UNIX related fields.
6537 		 */
6538 		bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6539 		bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6540 	}
6541 }
6542 
6543 
6544 sotpi_info_t *
6545 sotpi_sototpi(struct sonode *so)
6546 {
6547 	sotpi_info_t *sti;
6548 
6549 	ASSERT(so != NULL);
6550 
6551 	sti = (sotpi_info_t *)so->so_priv;
6552 
6553 	ASSERT(sti != NULL);
6554 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6555 
6556 	return (sti);
6557 }
6558 
6559 static int
6560 i_sotpi_info_constructor(sotpi_info_t *sti)
6561 {
6562 	sti->sti_magic		= SOTPI_INFO_MAGIC;
6563 	sti->sti_ack_mp		= NULL;
6564 	sti->sti_discon_ind_mp	= NULL;
6565 	sti->sti_ux_bound_vp	= NULL;
6566 	sti->sti_unbind_mp	= NULL;
6567 
6568 	sti->sti_conn_ind_head	= NULL;
6569 	sti->sti_conn_ind_tail	= NULL;
6570 
6571 	sti->sti_laddr_sa	= NULL;
6572 	sti->sti_faddr_sa	= NULL;
6573 
6574 	sti->sti_nl7c_flags	= 0;
6575 	sti->sti_nl7c_uri	= NULL;
6576 	sti->sti_nl7c_rcv_mp	= NULL;
6577 
6578 	mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6579 	cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6580 
6581 	return (0);
6582 }
6583 
6584 static void
6585 i_sotpi_info_destructor(sotpi_info_t *sti)
6586 {
6587 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6588 	ASSERT(sti->sti_ack_mp == NULL);
6589 	ASSERT(sti->sti_discon_ind_mp == NULL);
6590 	ASSERT(sti->sti_ux_bound_vp == NULL);
6591 	ASSERT(sti->sti_unbind_mp == NULL);
6592 
6593 	ASSERT(sti->sti_conn_ind_head == NULL);
6594 	ASSERT(sti->sti_conn_ind_tail == NULL);
6595 
6596 	ASSERT(sti->sti_laddr_sa == NULL);
6597 	ASSERT(sti->sti_faddr_sa == NULL);
6598 
6599 	ASSERT(sti->sti_nl7c_flags == 0);
6600 	ASSERT(sti->sti_nl7c_uri == NULL);
6601 	ASSERT(sti->sti_nl7c_rcv_mp == NULL);
6602 
6603 	mutex_destroy(&sti->sti_plumb_lock);
6604 	cv_destroy(&sti->sti_ack_cv);
6605 }
6606 
6607 /*
6608  * Creates and attaches TPI information to the given sonode
6609  */
6610 static boolean_t
6611 sotpi_info_create(struct sonode *so, int kmflags)
6612 {
6613 	sotpi_info_t *sti;
6614 
6615 	ASSERT(so->so_priv == NULL);
6616 
6617 	if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6618 		return (B_FALSE);
6619 
6620 	if (i_sotpi_info_constructor(sti) != 0) {
6621 		kmem_free(sti, sizeof (*sti));
6622 		return (B_FALSE);
6623 	}
6624 
6625 	so->so_priv = (void *)sti;
6626 	return (B_TRUE);
6627 }
6628 
6629 /*
6630  * Initializes the TPI information.
6631  */
6632 static void
6633 sotpi_info_init(struct sonode *so)
6634 {
6635 	struct vnode *vp = SOTOV(so);
6636 	sotpi_info_t *sti = SOTOTPI(so);
6637 	time_t now;
6638 
6639 	sti->sti_dev 	= so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6640 	vp->v_rdev	= sti->sti_dev;
6641 
6642 	sti->sti_orig_sp = NULL;
6643 
6644 	sti->sti_pushcnt = 0;
6645 
6646 	now = gethrestime_sec();
6647 	sti->sti_atime	= now;
6648 	sti->sti_mtime	= now;
6649 	sti->sti_ctime	= now;
6650 
6651 	sti->sti_eaddr_mp = NULL;
6652 	sti->sti_delayed_error = 0;
6653 
6654 	sti->sti_provinfo = NULL;
6655 
6656 	sti->sti_oobcnt = 0;
6657 	sti->sti_oobsigcnt = 0;
6658 
6659 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6660 
6661 	sti->sti_laddr_sa	= 0;
6662 	sti->sti_faddr_sa	= 0;
6663 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6664 	sti->sti_laddr_len = sti->sti_faddr_len = 0;
6665 
6666 	sti->sti_laddr_valid = 0;
6667 	sti->sti_faddr_valid = 0;
6668 	sti->sti_faddr_noxlate = 0;
6669 
6670 	sti->sti_direct = 0;
6671 
6672 	ASSERT(sti->sti_ack_mp == NULL);
6673 	ASSERT(sti->sti_ux_bound_vp == NULL);
6674 	ASSERT(sti->sti_unbind_mp == NULL);
6675 
6676 	ASSERT(sti->sti_conn_ind_head == NULL);
6677 	ASSERT(sti->sti_conn_ind_tail == NULL);
6678 }
6679 
6680 /*
6681  * Given a sonode, grab the TPI info and free any data.
6682  */
6683 static void
6684 sotpi_info_fini(struct sonode *so)
6685 {
6686 	sotpi_info_t *sti = SOTOTPI(so);
6687 	mblk_t *mp;
6688 
6689 	ASSERT(sti->sti_discon_ind_mp == NULL);
6690 
6691 	if ((mp = sti->sti_conn_ind_head) != NULL) {
6692 		mblk_t *mp1;
6693 
6694 		while (mp) {
6695 			mp1 = mp->b_next;
6696 			mp->b_next = NULL;
6697 			freemsg(mp);
6698 			mp = mp1;
6699 		}
6700 		sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
6701 	}
6702 
6703 	/*
6704 	 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
6705 	 * indirect them.  It also uses so_count as a validity test.
6706 	 */
6707 	mutex_enter(&so->so_lock);
6708 
6709 	if (sti->sti_laddr_sa) {
6710 		ASSERT((caddr_t)sti->sti_faddr_sa ==
6711 		    (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
6712 		ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
6713 		sti->sti_laddr_valid = 0;
6714 		sti->sti_faddr_valid = 0;
6715 		kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
6716 		sti->sti_laddr_sa = NULL;
6717 		sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
6718 		sti->sti_faddr_sa = NULL;
6719 		sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
6720 	}
6721 
6722 	mutex_exit(&so->so_lock);
6723 
6724 	if ((mp = sti->sti_eaddr_mp) != NULL) {
6725 		freemsg(mp);
6726 		sti->sti_eaddr_mp = NULL;
6727 		sti->sti_delayed_error = 0;
6728 	}
6729 
6730 	if ((mp = sti->sti_ack_mp) != NULL) {
6731 		freemsg(mp);
6732 		sti->sti_ack_mp = NULL;
6733 	}
6734 
6735 	if ((mp = sti->sti_nl7c_rcv_mp) != NULL) {
6736 		sti->sti_nl7c_rcv_mp = NULL;
6737 		freemsg(mp);
6738 	}
6739 	sti->sti_nl7c_rcv_rval = 0;
6740 	if (sti->sti_nl7c_uri != NULL) {
6741 		nl7c_urifree(so);
6742 		/* urifree() cleared nl7c_uri */
6743 	}
6744 	if (sti->sti_nl7c_flags) {
6745 		sti->sti_nl7c_flags = 0;
6746 	}
6747 
6748 	ASSERT(sti->sti_ux_bound_vp == NULL);
6749 	if ((mp = sti->sti_unbind_mp) != NULL) {
6750 		freemsg(mp);
6751 		sti->sti_unbind_mp = NULL;
6752 	}
6753 }
6754 
6755 /*
6756  * Destroys the TPI information attached to a sonode.
6757  */
6758 static void
6759 sotpi_info_destroy(struct sonode *so)
6760 {
6761 	sotpi_info_t *sti = SOTOTPI(so);
6762 
6763 	i_sotpi_info_destructor(sti);
6764 	kmem_free(sti, sizeof (*sti));
6765 
6766 	so->so_priv = NULL;
6767 }
6768 
6769 /*
6770  * Create the global sotpi socket module entry. It will never be freed.
6771  */
6772 smod_info_t *
6773 sotpi_smod_create(void)
6774 {
6775 	smod_info_t *smodp;
6776 
6777 	smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
6778 	smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
6779 	(void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
6780 	/*
6781 	 * Initialize the smod_refcnt to 1 so it will never be freed.
6782 	 */
6783 	smodp->smod_refcnt = 1;
6784 	smodp->smod_uc_version = SOCK_UC_VERSION;
6785 	smodp->smod_dc_version = SOCK_DC_VERSION;
6786 	smodp->smod_sock_create_func = &sotpi_create;
6787 	smodp->smod_sock_destroy_func = &sotpi_destroy;
6788 	return (smodp);
6789 }
6790