xref: /illumos-gate/usr/src/uts/common/fs/sockfs/socktpi.c (revision fd75ca8de430ee0ba5ce650efee0ac0b85ed43e9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 /*
26  * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
27  */
28 
29 #include <sys/types.h>
30 #include <sys/t_lock.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/buf.h>
34 #include <sys/conf.h>
35 #include <sys/cred.h>
36 #include <sys/kmem.h>
37 #include <sys/kmem_impl.h>
38 #include <sys/sysmacros.h>
39 #include <sys/vfs.h>
40 #include <sys/vnode.h>
41 #include <sys/debug.h>
42 #include <sys/errno.h>
43 #include <sys/time.h>
44 #include <sys/file.h>
45 #include <sys/open.h>
46 #include <sys/user.h>
47 #include <sys/termios.h>
48 #include <sys/stream.h>
49 #include <sys/strsubr.h>
50 #include <sys/strsun.h>
51 #include <sys/suntpi.h>
52 #include <sys/ddi.h>
53 #include <sys/esunddi.h>
54 #include <sys/flock.h>
55 #include <sys/modctl.h>
56 #include <sys/vtrace.h>
57 #include <sys/cmn_err.h>
58 #include <sys/pathname.h>
59 
60 #include <sys/socket.h>
61 #include <sys/socketvar.h>
62 #include <sys/sockio.h>
63 #include <netinet/in.h>
64 #include <sys/un.h>
65 #include <sys/strsun.h>
66 
67 #include <sys/tiuser.h>
68 #define	_SUN_TPI_VERSION	2
69 #include <sys/tihdr.h>
70 #include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
71 
72 #include <c2/audit.h>
73 
74 #include <inet/common.h>
75 #include <inet/ip.h>
76 #include <inet/ip6.h>
77 #include <inet/tcp.h>
78 #include <inet/udp_impl.h>
79 
80 #include <sys/zone.h>
81 
82 #include <fs/sockfs/nl7c.h>
83 #include <fs/sockfs/nl7curi.h>
84 
85 #include <fs/sockfs/sockcommon.h>
86 #include <fs/sockfs/socktpi.h>
87 #include <fs/sockfs/socktpi_impl.h>
88 
89 /*
90  * Possible failures when memory can't be allocated. The documented behavior:
91  *
92  * 		5.5:			4.X:		XNET:
93  * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
94  *							EINTR
95  *	(4.X does not document EINTR but returns it)
96  * bind:	ENOSR			-		ENOBUFS/ENOSR
97  * connect: 	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
98  * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
99  * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
100  *	(4.X getpeername and getsockname do not fail in practice)
101  * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
102  * listen:	-			-		ENOBUFS
103  * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
104  *							EINTR
105  * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
106  *							EINTR
107  * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
108  * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
109  * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
110  * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
111  *
112  * Resolution. When allocation fails:
113  *	recv: return EINTR
114  *	send: return EINTR
115  *	connect, accept: EINTR
116  *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
117  *	socket, socketpair: ENOBUFS
118  *	getpeername, getsockname: sleep
119  *	getsockopt, setsockopt: sleep
120  */
121 
/*
 * Knobs that make sockfs do something other than the standard TPI
 * for the AF_INET transports.  They are real (patchable) variables only
 * when SOCK_TEST is defined; otherwise each one collapses to the
 * compile-time constant given in the #else branch.
 *
 * solisten_tpi_tcp:
 *	TCP can handle an O_T_BIND_REQ with an increased backlog even though
 *	the transport is already bound. This is needed to avoid losing the
 *	port number should listen() do a T_UNBIND_REQ followed by an
 *	O_T_BIND_REQ.
 *
 * soconnect_tpi_udp:
 *	UDP and ICMP can handle a T_CONN_REQ.
 *	This is needed to make the sequence of connect(), getsockname()
 *	return the local IP address used to send packets to the connected-to
 *	destination.
 *
 * soconnect_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without seeing an O_T_BIND_REQ.
 *	Set this to non-zero to send TPI conformant messages to TCP in this
 *	respect. This is a performance optimization.
 *
 * soaccept_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without the acceptor being bound.
 *	This is a performance optimization that has been picked up in XTI.
 *
 * soaccept_tpi_multioptions:
 *	When inheriting SOL_SOCKET options from the listener to the accepting
 *	socket send them as a single message for AF_INET{,6}.
 */
#ifdef SOCK_TEST
int solisten_tpi_tcp = 0;
int soconnect_tpi_udp = 0;
int soconnect_tpi_tcp = 0;
int soaccept_tpi_tcp = 0;
int soaccept_tpi_multioptions = 1;

/* Additional test-only variables, declared here but not defined here. */
extern int do_useracc;
extern clock_t sock_test_timelimit;
#else /* SOCK_TEST */
#define	solisten_tpi_tcp		0
#define	soconnect_tpi_udp		0
#define	soconnect_tpi_tcp		0
#define	soaccept_tpi_tcp		0
#define	soaccept_tpi_multioptions	1
#endif /* SOCK_TEST */
169 
170 /*
171  * Some X/Open added checks might have to be backed out to keep SunOS 4.X
172  * applications working. Turn on this flag to disable these checks.
173  */
174 int xnet_skip_checks = 0;
175 int xnet_check_print = 0;
176 int xnet_truncate_print = 0;
177 
178 static void sotpi_destroy(struct sonode *);
179 static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
180     int, int *, cred_t *cr);
181 
182 static boolean_t	sotpi_info_create(struct sonode *, int);
183 static void		sotpi_info_init(struct sonode *);
184 static void 		sotpi_info_fini(struct sonode *);
185 static void 		sotpi_info_destroy(struct sonode *);
186 
187 /*
188  * Do direct function call to the transport layer below; this would
189  * also allow the transport to utilize read-side synchronous stream
190  * interface if necessary.  This is a /etc/system tunable that must
191  * not be modified on a running system.  By default this is enabled
192  * for performance reasons and may be disabled for debugging purposes.
193  */
194 boolean_t socktpi_direct = B_TRUE;
195 
196 static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
197 
198 extern	void sigintr(k_sigset_t *, int);
199 extern	void sigunintr(k_sigset_t *);
200 
201 static int	sotpi_unbind(struct sonode *, int);
202 
203 /* TPI sockfs sonode operations */
204 int 		sotpi_init(struct sonode *, struct sonode *, struct cred *,
205 		    int);
206 static int	sotpi_accept(struct sonode *, int, struct cred *,
207 		    struct sonode **);
208 static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
209 		    int, struct cred *);
210 static int	sotpi_listen(struct sonode *, int, struct cred *);
211 static int	sotpi_connect(struct sonode *, struct sockaddr *,
212 		    socklen_t, int, int, struct cred *);
213 extern int	sotpi_recvmsg(struct sonode *, struct nmsghdr *,
214 		    struct uio *, struct cred *);
215 static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
216 		    struct uio *, struct cred *);
217 static int	sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
218 		    struct cred *, mblk_t **);
219 static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
220 		    struct uio *, void *, t_uscalar_t, int);
221 static int	sodgram_direct(struct sonode *, struct sockaddr *,
222 		    socklen_t, struct uio *, int);
223 extern int	sotpi_getpeername(struct sonode *, struct sockaddr *,
224 		    socklen_t *, boolean_t, struct cred *);
225 static int	sotpi_getsockname(struct sonode *, struct sockaddr *,
226 		    socklen_t *, struct cred *);
227 static int	sotpi_shutdown(struct sonode *, int, struct cred *);
228 extern int	sotpi_getsockopt(struct sonode *, int, int, void *,
229 		    socklen_t *, int, struct cred *);
230 extern int	sotpi_setsockopt(struct sonode *, int, int, const void *,
231 		    socklen_t, struct cred *);
232 static int 	sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
233 		    int32_t *);
234 static int 	socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
235 		    struct cred *, int32_t *);
236 static int 	sotpi_poll(struct sonode *, short, int, short *,
237 		    struct pollhead **);
238 static int 	sotpi_close(struct sonode *, int, struct cred *);
239 
240 static int	i_sotpi_info_constructor(sotpi_info_t *);
241 static void 	i_sotpi_info_destructor(sotpi_info_t *);
242 
243 sonodeops_t sotpi_sonodeops = {
244 	sotpi_init,		/* sop_init		*/
245 	sotpi_accept,		/* sop_accept		*/
246 	sotpi_bind,		/* sop_bind		*/
247 	sotpi_listen,		/* sop_listen		*/
248 	sotpi_connect,		/* sop_connect		*/
249 	sotpi_recvmsg,		/* sop_recvmsg		*/
250 	sotpi_sendmsg,		/* sop_sendmsg		*/
251 	sotpi_sendmblk,		/* sop_sendmblk		*/
252 	sotpi_getpeername,	/* sop_getpeername	*/
253 	sotpi_getsockname,	/* sop_getsockname	*/
254 	sotpi_shutdown,		/* sop_shutdown		*/
255 	sotpi_getsockopt,	/* sop_getsockopt	*/
256 	sotpi_setsockopt,	/* sop_setsockopt	*/
257 	sotpi_ioctl,		/* sop_ioctl		*/
258 	sotpi_poll,		/* sop_poll		*/
259 	sotpi_close,		/* sop_close		*/
260 };
261 
262 /*
263  * Return a TPI socket vnode.
264  *
265  * Note that sockets assume that the driver will clone (either itself
266  * or by using the clone driver) i.e. a socket() call will always
267  * result in a new vnode being created.
268  */
269 
270 /*
271  * Common create code for socket and accept. If tso is set the values
272  * from that node is used instead of issuing a T_INFO_REQ.
273  */
274 
275 /* ARGSUSED */
276 static struct sonode *
277 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
278     int version, int sflags, int *errorp, cred_t *cr)
279 {
280 	struct sonode	*so;
281 	kmem_cache_t 	*cp;
282 	int		sfamily = family;
283 
284 	ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
285 
286 	if (family == AF_NCA) {
287 		/*
288 		 * The request is for an NCA socket so for NL7C use the
289 		 * INET domain instead and mark NL7C_AF_NCA below.
290 		 */
291 		family = AF_INET;
292 		/*
293 		 * NL7C is not supported in the non-global zone,
294 		 * we enforce this restriction here.
295 		 */
296 		if (getzoneid() != GLOBAL_ZONEID) {
297 			*errorp = ENOTSUP;
298 			return (NULL);
299 		}
300 	}
301 
302 	/*
303 	 * to be compatible with old tpi socket implementation ignore
304 	 * sleep flag (sflags) passed in
305 	 */
306 	cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
307 	so = kmem_cache_alloc(cp, KM_SLEEP);
308 	if (so == NULL) {
309 		*errorp = ENOMEM;
310 		return (NULL);
311 	}
312 
313 	sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
314 	sotpi_info_init(so);
315 
316 	if (sfamily == AF_NCA) {
317 		SOTOTPI(so)->sti_nl7c_flags = NL7C_AF_NCA;
318 	}
319 
320 	if (version == SOV_DEFAULT)
321 		version = so_default_version;
322 
323 	so->so_version = (short)version;
324 	*errorp = 0;
325 
326 	return (so);
327 }
328 
329 static void
330 sotpi_destroy(struct sonode *so)
331 {
332 	kmem_cache_t *cp;
333 	struct sockparams *origsp;
334 
335 	/*
336 	 * If there is a new dealloc function (ie. smod_destroy_func),
337 	 * then it should check the correctness of the ops.
338 	 */
339 
340 	ASSERT(so->so_ops == &sotpi_sonodeops);
341 
342 	origsp = SOTOTPI(so)->sti_orig_sp;
343 
344 	sotpi_info_fini(so);
345 
346 	if (so->so_state & SS_FALLBACK_COMP) {
347 		/*
348 		 * A fallback happend, which means that a sotpi_info_t struct
349 		 * was allocated (as opposed to being allocated from the TPI
350 		 * sonode cache. Therefore we explicitly free the struct
351 		 * here.
352 		 */
353 		sotpi_info_destroy(so);
354 		ASSERT(origsp != NULL);
355 
356 		origsp->sp_smod_info->smod_sock_destroy_func(so);
357 		SOCKPARAMS_DEC_REF(origsp);
358 	} else {
359 		sonode_fini(so);
360 		cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
361 		    socktpi_cache;
362 		kmem_cache_free(cp, so);
363 	}
364 }
365 
366 /* ARGSUSED1 */
367 int
368 sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
369 {
370 	major_t maj;
371 	dev_t newdev;
372 	struct vnode *vp;
373 	int error = 0;
374 	struct stdata *stp;
375 
376 	sotpi_info_t *sti = SOTOTPI(so);
377 
378 	dprint(1, ("sotpi_init()\n"));
379 
380 	/*
381 	 * over write the sleep flag passed in but that is ok
382 	 * as tpi socket does not honor sleep flag.
383 	 */
384 	flags |= FREAD|FWRITE;
385 
386 	/*
387 	 * Record in so_flag that it is a clone.
388 	 */
389 	if (getmajor(sti->sti_dev) == clone_major)
390 		so->so_flag |= SOCLONE;
391 
392 	if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
393 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
394 	    (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
395 	    so->so_protocol == IPPROTO_IP)) {
396 		/* Tell tcp or udp that it's talking to sockets */
397 		flags |= SO_SOCKSTR;
398 
399 		/*
400 		 * Here we indicate to socktpi_open() our attempt to
401 		 * make direct calls between sockfs and transport.
402 		 * The final decision is left to socktpi_open().
403 		 */
404 		sti->sti_direct = 1;
405 
406 		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
407 		if (so->so_type == SOCK_STREAM && tso != NULL) {
408 			if (SOTOTPI(tso)->sti_direct) {
409 				/*
410 				 * Inherit sti_direct from listener and pass
411 				 * SO_ACCEPTOR open flag to tcp, indicating
412 				 * that this is an accept fast-path instance.
413 				 */
414 				flags |= SO_ACCEPTOR;
415 			} else {
416 				/*
417 				 * sti_direct is not set on listener, meaning
418 				 * that the listener has been converted from
419 				 * a socket to a stream.  Ensure that the
420 				 * acceptor inherits these settings.
421 				 */
422 				sti->sti_direct = 0;
423 				flags &= ~SO_SOCKSTR;
424 			}
425 		}
426 	}
427 
428 	/*
429 	 * Tell local transport that it is talking to sockets.
430 	 */
431 	if (so->so_family == AF_UNIX) {
432 		flags |= SO_SOCKSTR;
433 	}
434 
435 	vp = SOTOV(so);
436 	newdev = vp->v_rdev;
437 	maj = getmajor(newdev);
438 	ASSERT(STREAMSTAB(maj));
439 
440 	error = stropen(vp, &newdev, flags, cr);
441 
442 	stp = vp->v_stream;
443 	if (error == 0) {
444 		if (so->so_flag & SOCLONE)
445 			ASSERT(newdev != vp->v_rdev);
446 		mutex_enter(&so->so_lock);
447 		sti->sti_dev = newdev;
448 		vp->v_rdev = newdev;
449 		mutex_exit(&so->so_lock);
450 
451 		if (stp->sd_flag & STRISTTY) {
452 			/*
453 			 * this is a post SVR4 tty driver - a socket can not
454 			 * be a controlling terminal. Fail the open.
455 			 */
456 			(void) sotpi_close(so, flags, cr);
457 			return (ENOTTY);	/* XXX */
458 		}
459 
460 		ASSERT(stp->sd_wrq != NULL);
461 		sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
462 
463 		/*
464 		 * If caller is interested in doing direct function call
465 		 * interface to/from transport module, probe the module
466 		 * directly beneath the streamhead to see if it qualifies.
467 		 *
468 		 * We turn off the direct interface when qualifications fail.
469 		 * In the acceptor case, we simply turn off the sti_direct
470 		 * flag on the socket. We do the fallback after the accept
471 		 * has completed, before the new socket is returned to the
472 		 * application.
473 		 */
474 		if (sti->sti_direct) {
475 			queue_t *tq = stp->sd_wrq->q_next;
476 
477 			/*
478 			 * sti_direct is currently supported and tested
479 			 * only for tcp/udp; this is the main reason to
480 			 * have the following assertions.
481 			 */
482 			ASSERT(so->so_family == AF_INET ||
483 			    so->so_family == AF_INET6);
484 			ASSERT(so->so_protocol == IPPROTO_UDP ||
485 			    so->so_protocol == IPPROTO_TCP ||
486 			    so->so_protocol == IPPROTO_IP);
487 			ASSERT(so->so_type == SOCK_DGRAM ||
488 			    so->so_type == SOCK_STREAM);
489 
490 			/*
491 			 * Abort direct call interface if the module directly
492 			 * underneath the stream head is not defined with the
493 			 * _D_DIRECT flag.  This could happen in the tcp or
494 			 * udp case, when some other module is autopushed
495 			 * above it, or for some reasons the expected module
496 			 * isn't purely D_MP (which is the main requirement).
497 			 */
498 			if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
499 			    !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
500 				int rval;
501 
502 				/* Continue on without direct calls */
503 				sti->sti_direct = 0;
504 
505 				/*
506 				 * Cannot issue ioctl on fallback socket since
507 				 * there is no conn associated with the queue.
508 				 * The fallback downcall will notify the proto
509 				 * of the change.
510 				 */
511 				if (!(flags & SO_ACCEPTOR) &&
512 				    !(flags & SO_FALLBACK)) {
513 					if ((error = strioctl(vp,
514 					    _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
515 					    cr, &rval)) != 0) {
516 						(void) sotpi_close(so, flags,
517 						    cr);
518 						return (error);
519 					}
520 				}
521 			}
522 		}
523 
524 		if (flags & SO_FALLBACK) {
525 			/*
526 			 * The stream created does not have a conn.
527 			 * do stream set up after conn has been assigned
528 			 */
529 			return (error);
530 		}
531 		if (error = so_strinit(so, tso)) {
532 			(void) sotpi_close(so, flags, cr);
533 			return (error);
534 		}
535 
536 		/* Wildcard */
537 		if (so->so_protocol != so->so_sockparams->sp_protocol) {
538 			int protocol = so->so_protocol;
539 			/*
540 			 * Issue SO_PROTOTYPE setsockopt.
541 			 */
542 			error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
543 			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
544 			if (error != 0) {
545 				(void) sotpi_close(so, flags, cr);
546 				/*
547 				 * Setsockopt often fails with ENOPROTOOPT but
548 				 * socket() should fail with
549 				 * EPROTONOSUPPORT/EPROTOTYPE.
550 				 */
551 				return (EPROTONOSUPPORT);
552 			}
553 		}
554 
555 	} else {
556 		/*
557 		 * While the same socket can not be reopened (unlike specfs)
558 		 * the stream head sets STREOPENFAIL when the autopush fails.
559 		 */
560 		if ((stp != NULL) &&
561 		    (stp->sd_flag & STREOPENFAIL)) {
562 			/*
563 			 * Open failed part way through.
564 			 */
565 			mutex_enter(&stp->sd_lock);
566 			stp->sd_flag &= ~STREOPENFAIL;
567 			mutex_exit(&stp->sd_lock);
568 			(void) sotpi_close(so, flags, cr);
569 			return (error);
570 			/*NOTREACHED*/
571 		}
572 		ASSERT(stp == NULL);
573 	}
574 	TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
575 	    "sockfs open:maj %d vp %p so %p error %d",
576 	    maj, vp, so, error);
577 	return (error);
578 }
579 
580 /*
581  * Bind the socket to an unspecified address in sockfs only.
582  * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
583  * required in all cases.
584  */
585 static void
586 so_automatic_bind(struct sonode *so)
587 {
588 	sotpi_info_t *sti = SOTOTPI(so);
589 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
590 
591 	ASSERT(MUTEX_HELD(&so->so_lock));
592 	ASSERT(!(so->so_state & SS_ISBOUND));
593 	ASSERT(sti->sti_unbind_mp);
594 
595 	ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
596 	bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
597 	sti->sti_laddr_sa->sa_family = so->so_family;
598 	so->so_state |= SS_ISBOUND;
599 }
600 
601 
602 /*
603  * bind the socket.
604  *
605  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
606  * are passed in we allow rebinding. Note that for backwards compatibility
607  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
608  * Thus the rebinding code is currently not executed.
609  *
610  * The constraints for rebinding are:
611  * - it is a SOCK_DGRAM, or
612  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
613  *   and no listen() has been done.
614  * This rebinding code was added based on some language in the XNET book
615  * about not returning EINVAL if the protocol allows rebinding. However,
616  * this language is not present in the Posix socket draft. Thus maybe the
617  * rebinding logic should be deleted from the source.
618  *
619  * A null "name" can be used to unbind the socket if:
620  * - it is a SOCK_DGRAM, or
621  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
622  *   and no listen() has been done.
623  */
624 /* ARGSUSED */
625 static int
626 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
627     socklen_t namelen, int backlog, int flags, struct cred *cr)
628 {
629 	struct T_bind_req	bind_req;
630 	struct T_bind_ack	*bind_ack;
631 	int			error = 0;
632 	mblk_t			*mp;
633 	void			*addr;
634 	t_uscalar_t		addrlen;
635 	int			unbind_on_err = 1;
636 	boolean_t		clear_acceptconn_on_err = B_FALSE;
637 	boolean_t		restore_backlog_on_err = B_FALSE;
638 	int			save_so_backlog;
639 	t_scalar_t		PRIM_type = O_T_BIND_REQ;
640 	boolean_t		tcp_udp_xport;
641 	void			*nl7c = NULL;
642 	sotpi_info_t		*sti = SOTOTPI(so);
643 
644 	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
645 	    (void *)so, (void *)name, namelen, backlog, flags,
646 	    pr_state(so->so_state, so->so_mode)));
647 
648 	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
649 
650 	if (!(flags & _SOBIND_LOCK_HELD)) {
651 		mutex_enter(&so->so_lock);
652 		so_lock_single(so);	/* Set SOLOCKED */
653 	} else {
654 		ASSERT(MUTEX_HELD(&so->so_lock));
655 		ASSERT(so->so_flag & SOLOCKED);
656 	}
657 
658 	/*
659 	 * Make sure that there is a preallocated unbind_req message
660 	 * before binding. This message is allocated when the socket is
661 	 * created but it might have been consumed.
662 	 */
663 	if (sti->sti_unbind_mp == NULL) {
664 		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
665 		/* NOTE: holding so_lock while sleeping */
666 		sti->sti_unbind_mp =
667 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
668 		    cr);
669 	}
670 
671 	if (flags & _SOBIND_REBIND) {
672 		/*
673 		 * Called from solisten after doing an sotpi_unbind() or
674 		 * potentially without the unbind (latter for AF_INET{,6}).
675 		 */
676 		ASSERT(name == NULL && namelen == 0);
677 
678 		if (so->so_family == AF_UNIX) {
679 			ASSERT(sti->sti_ux_bound_vp);
680 			addr = &sti->sti_ux_laddr;
681 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
682 			dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
683 			    "addr 0x%p, vp %p\n",
684 			    addrlen,
685 			    (void *)((struct so_ux_addr *)addr)->soua_vp,
686 			    (void *)sti->sti_ux_bound_vp));
687 		} else {
688 			addr = sti->sti_laddr_sa;
689 			addrlen = (t_uscalar_t)sti->sti_laddr_len;
690 		}
691 	} else if (flags & _SOBIND_UNSPEC) {
692 		ASSERT(name == NULL && namelen == 0);
693 
694 		/*
695 		 * The caller checked SS_ISBOUND but not necessarily
696 		 * under so_lock
697 		 */
698 		if (so->so_state & SS_ISBOUND) {
699 			/* No error */
700 			goto done;
701 		}
702 
703 		/* Set an initial local address */
704 		switch (so->so_family) {
705 		case AF_UNIX:
706 			/*
707 			 * Use an address with same size as struct sockaddr
708 			 * just like BSD.
709 			 */
710 			sti->sti_laddr_len =
711 			    (socklen_t)sizeof (struct sockaddr);
712 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
713 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
714 			sti->sti_laddr_sa->sa_family = so->so_family;
715 
716 			/*
717 			 * Pass down an address with the implicit bind
718 			 * magic number and the rest all zeros.
719 			 * The transport will return a unique address.
720 			 */
721 			sti->sti_ux_laddr.soua_vp = NULL;
722 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
723 			addr = &sti->sti_ux_laddr;
724 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
725 			break;
726 
727 		case AF_INET:
728 		case AF_INET6:
729 			/*
730 			 * An unspecified bind in TPI has a NULL address.
731 			 * Set the address in sockfs to have the sa_family.
732 			 */
733 			sti->sti_laddr_len = (so->so_family == AF_INET) ?
734 			    (socklen_t)sizeof (sin_t) :
735 			    (socklen_t)sizeof (sin6_t);
736 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
737 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
738 			sti->sti_laddr_sa->sa_family = so->so_family;
739 			addr = NULL;
740 			addrlen = 0;
741 			break;
742 
743 		default:
744 			/*
745 			 * An unspecified bind in TPI has a NULL address.
746 			 * Set the address in sockfs to be zero length.
747 			 *
748 			 * Can not assume there is a sa_family for all
749 			 * protocol families. For example, AF_X25 does not
750 			 * have a family field.
751 			 */
752 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
753 			sti->sti_laddr_len = 0;	/* XXX correct? */
754 			addr = NULL;
755 			addrlen = 0;
756 			break;
757 		}
758 
759 	} else {
760 		if (so->so_state & SS_ISBOUND) {
761 			/*
762 			 * If it is ok to rebind the socket, first unbind
763 			 * with the transport. A rebind to the NULL address
764 			 * is interpreted as an unbind.
765 			 * Note that a bind to NULL in BSD does unbind the
766 			 * socket but it fails with EINVAL.
767 			 * Note that regular sockets set SOV_SOCKBSD i.e.
768 			 * _SOBIND_SOCKBSD gets set here hence no type of
769 			 * socket does currently allow rebinding.
770 			 *
771 			 * If the name is NULL just do an unbind.
772 			 */
773 			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
774 			    name != NULL) {
775 				error = EINVAL;
776 				unbind_on_err = 0;
777 				eprintsoline(so, error);
778 				goto done;
779 			}
780 			if ((so->so_mode & SM_CONNREQUIRED) &&
781 			    (so->so_state & SS_CANTREBIND)) {
782 				error = EINVAL;
783 				unbind_on_err = 0;
784 				eprintsoline(so, error);
785 				goto done;
786 			}
787 			error = sotpi_unbind(so, 0);
788 			if (error) {
789 				eprintsoline(so, error);
790 				goto done;
791 			}
792 			ASSERT(!(so->so_state & SS_ISBOUND));
793 			if (name == NULL) {
794 				so->so_state &=
795 				    ~(SS_ISCONNECTED|SS_ISCONNECTING);
796 				goto done;
797 			}
798 		}
799 
800 		/* X/Open requires this check */
801 		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
802 			if (xnet_check_print) {
803 				printf("sockfs: X/Open bind state check "
804 				    "caused EINVAL\n");
805 			}
806 			error = EINVAL;
807 			goto done;
808 		}
809 
810 		switch (so->so_family) {
811 		case AF_UNIX:
812 			/*
813 			 * All AF_UNIX addresses are nul terminated
814 			 * when copied (copyin_name) in so the minimum
815 			 * length is 3 bytes.
816 			 */
817 			if (name == NULL ||
818 			    (ssize_t)namelen <= sizeof (short) + 1) {
819 				error = EISDIR;
820 				eprintsoline(so, error);
821 				goto done;
822 			}
823 			/*
824 			 * Verify so_family matches the bound family.
825 			 * BSD does not check this for AF_UNIX resulting
826 			 * in funny mknods.
827 			 */
828 			if (name->sa_family != so->so_family) {
829 				error = EAFNOSUPPORT;
830 				goto done;
831 			}
832 			break;
833 		case AF_INET:
834 			if (name == NULL) {
835 				error = EINVAL;
836 				eprintsoline(so, error);
837 				goto done;
838 			}
839 			if ((size_t)namelen != sizeof (sin_t)) {
840 				error = name->sa_family != so->so_family ?
841 				    EAFNOSUPPORT : EINVAL;
842 				eprintsoline(so, error);
843 				goto done;
844 			}
845 			if ((flags & _SOBIND_XPG4_2) &&
846 			    (name->sa_family != so->so_family)) {
847 				/*
848 				 * This check has to be made for X/Open
849 				 * sockets however application failures have
850 				 * been observed when it is applied to
851 				 * all sockets.
852 				 */
853 				error = EAFNOSUPPORT;
854 				eprintsoline(so, error);
855 				goto done;
856 			}
857 			/*
858 			 * Force a zero sa_family to match so_family.
859 			 *
860 			 * Some programs like inetd(1M) don't set the
861 			 * family field. Other programs leave
862 			 * sin_family set to garbage - SunOS 4.X does
863 			 * not check the family field on a bind.
864 			 * We use the family field that
865 			 * was passed in to the socket() call.
866 			 */
867 			name->sa_family = so->so_family;
868 			break;
869 
870 		case AF_INET6: {
871 #ifdef DEBUG
872 			sin6_t *sin6 = (sin6_t *)name;
873 #endif /* DEBUG */
874 
875 			if (name == NULL) {
876 				error = EINVAL;
877 				eprintsoline(so, error);
878 				goto done;
879 			}
880 			if ((size_t)namelen != sizeof (sin6_t)) {
881 				error = name->sa_family != so->so_family ?
882 				    EAFNOSUPPORT : EINVAL;
883 				eprintsoline(so, error);
884 				goto done;
885 			}
886 			if (name->sa_family != so->so_family) {
887 				/*
888 				 * With IPv6 we require the family to match
889 				 * unlike in IPv4.
890 				 */
891 				error = EAFNOSUPPORT;
892 				eprintsoline(so, error);
893 				goto done;
894 			}
895 #ifdef DEBUG
896 			/*
897 			 * Verify that apps don't forget to clear
898 			 * sin6_scope_id etc
899 			 */
900 			if (sin6->sin6_scope_id != 0 &&
901 			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
902 				zcmn_err(getzoneid(), CE_WARN,
903 				    "bind with uninitialized sin6_scope_id "
904 				    "(%d) on socket. Pid = %d\n",
905 				    (int)sin6->sin6_scope_id,
906 				    (int)curproc->p_pid);
907 			}
908 			if (sin6->__sin6_src_id != 0) {
909 				zcmn_err(getzoneid(), CE_WARN,
910 				    "bind with uninitialized __sin6_src_id "
911 				    "(%d) on socket. Pid = %d\n",
912 				    (int)sin6->__sin6_src_id,
913 				    (int)curproc->p_pid);
914 			}
915 #endif /* DEBUG */
916 			break;
917 		}
918 		default:
919 			/*
920 			 * Don't do any length or sa_family check to allow
921 			 * non-sockaddr style addresses.
922 			 */
923 			if (name == NULL) {
924 				error = EINVAL;
925 				eprintsoline(so, error);
926 				goto done;
927 			}
928 			break;
929 		}
930 
931 		if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
932 			error = ENAMETOOLONG;
933 			eprintsoline(so, error);
934 			goto done;
935 		}
936 		/*
937 		 * Save local address.
938 		 */
939 		sti->sti_laddr_len = (socklen_t)namelen;
940 		ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
941 		bcopy(name, sti->sti_laddr_sa, namelen);
942 
943 		addr = sti->sti_laddr_sa;
944 		addrlen = (t_uscalar_t)sti->sti_laddr_len;
945 		switch (so->so_family) {
946 		case AF_INET6:
947 		case AF_INET:
948 			break;
949 		case AF_UNIX: {
950 			struct sockaddr_un *soun =
951 			    (struct sockaddr_un *)sti->sti_laddr_sa;
952 			struct vnode *vp, *rvp;
953 			struct vattr vattr;
954 
955 			ASSERT(sti->sti_ux_bound_vp == NULL);
956 			/*
957 			 * Create vnode for the specified path name.
958 			 * Keep vnode held with a reference in sti_ux_bound_vp.
959 			 * Use the vnode pointer as the address used in the
960 			 * bind with the transport.
961 			 *
962 			 * Use the same mode as in BSD. In particular this does
963 			 * not observe the umask.
964 			 */
965 			/* MAXPATHLEN + soun_family + nul termination */
966 			if (sti->sti_laddr_len >
967 			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
968 				error = ENAMETOOLONG;
969 				eprintsoline(so, error);
970 				goto done;
971 			}
972 			vattr.va_type = VSOCK;
973 			vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
974 			vattr.va_mask = AT_TYPE|AT_MODE;
975 			/* NOTE: holding so_lock */
976 			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
977 			    EXCL, 0, &vp, CRMKNOD, 0, 0);
978 			if (error) {
979 				if (error == EEXIST)
980 					error = EADDRINUSE;
981 				eprintsoline(so, error);
982 				goto done;
983 			}
984 			/*
985 			 * Establish pointer from the underlying filesystem
986 			 * vnode to the socket node.
987 			 * sti_ux_bound_vp and v_stream->sd_vnode form the
988 			 * cross-linkage between the underlying filesystem
989 			 * node and the socket node.
990 			 */
991 
992 			if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
993 				VN_HOLD(rvp);
994 				VN_RELE(vp);
995 				vp = rvp;
996 			}
997 
998 			ASSERT(SOTOV(so)->v_stream);
999 			mutex_enter(&vp->v_lock);
1000 			vp->v_stream = SOTOV(so)->v_stream;
1001 			sti->sti_ux_bound_vp = vp;
1002 			mutex_exit(&vp->v_lock);
1003 
1004 			/*
1005 			 * Use the vnode pointer value as a unique address
1006 			 * (together with the magic number to avoid conflicts
1007 			 * with implicit binds) in the transport provider.
1008 			 */
1009 			sti->sti_ux_laddr.soua_vp =
1010 			    (void *)sti->sti_ux_bound_vp;
1011 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
1012 			addr = &sti->sti_ux_laddr;
1013 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1014 			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1015 			    addrlen,
1016 			    (void *)((struct so_ux_addr *)addr)->soua_vp));
1017 			break;
1018 		}
1019 		} /* end switch (so->so_family) */
1020 	}
1021 
1022 	/*
1023 	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1024 	 * the transport can start passing up T_CONN_IND messages
1025 	 * as soon as it receives the bind req and strsock_proto()
1026 	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1027 	 */
1028 	if (flags & _SOBIND_LISTEN) {
1029 		if ((so->so_state & SS_ACCEPTCONN) == 0)
1030 			clear_acceptconn_on_err = B_TRUE;
1031 		save_so_backlog = so->so_backlog;
1032 		restore_backlog_on_err = B_TRUE;
1033 		so->so_state |= SS_ACCEPTCONN;
1034 		so->so_backlog = backlog;
1035 	}
1036 
1037 	/*
1038 	 * If NL7C addr(s) have been configured check for addr/port match,
1039 	 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
1040 	 *
1041 	 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
1042 	 * family sockets only. If match mark as such.
1043 	 */
1044 	if (nl7c_enabled && ((addr != NULL &&
1045 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
1046 	    (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
1047 	    sti->sti_nl7c_flags == NL7C_AF_NCA)) {
1048 		/*
1049 		 * NL7C is not supported in non-global zones,
1050 		 * we enforce this restriction here.
1051 		 */
1052 		if (so->so_zoneid == GLOBAL_ZONEID) {
1053 			/* An NL7C socket, mark it */
1054 			sti->sti_nl7c_flags |= NL7C_ENABLED;
1055 			if (nl7c == NULL) {
1056 				/*
1057 				 * Was an AF_NCA bind() so add it to the
1058 				 * addr list for reporting purposes.
1059 				 */
1060 				nl7c = nl7c_add_addr(addr, addrlen);
1061 			}
1062 		} else
1063 			nl7c = NULL;
1064 	}
1065 
1066 	/*
1067 	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1068 	 * for other transports we will send in a O_T_BIND_REQ.
1069 	 */
1070 	if (tcp_udp_xport &&
1071 	    (so->so_family == AF_INET || so->so_family == AF_INET6))
1072 		PRIM_type = T_BIND_REQ;
1073 
1074 	bind_req.PRIM_type = PRIM_type;
1075 	bind_req.ADDR_length = addrlen;
1076 	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1077 	bind_req.CONIND_number = backlog;
1078 	/* NOTE: holding so_lock while sleeping */
1079 	mp = soallocproto2(&bind_req, sizeof (bind_req),
1080 	    addr, addrlen, 0, _ALLOC_SLEEP, cr);
1081 	sti->sti_laddr_valid = 0;
1082 
1083 	/* Done using sti_laddr_sa - can drop the lock */
1084 	mutex_exit(&so->so_lock);
1085 
1086 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1087 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1088 	if (error) {
1089 		eprintsoline(so, error);
1090 		mutex_enter(&so->so_lock);
1091 		goto done;
1092 	}
1093 
1094 	mutex_enter(&so->so_lock);
1095 	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1096 	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1097 	if (error) {
1098 		eprintsoline(so, error);
1099 		goto done;
1100 	}
1101 	ASSERT(mp);
1102 	/*
1103 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1104 	 * strsock_proto while the lock was dropped above, the bind
1105 	 * is allowed to complete.
1106 	 */
1107 
1108 	/* Mark as bound. This will be undone if we detect errors below. */
1109 	if (flags & _SOBIND_NOXLATE) {
1110 		ASSERT(so->so_family == AF_UNIX);
1111 		sti->sti_faddr_noxlate = 1;
1112 	}
1113 	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1114 	so->so_state |= SS_ISBOUND;
1115 	ASSERT(sti->sti_unbind_mp);
1116 
1117 	/* note that we've already set SS_ACCEPTCONN above */
1118 
1119 	/*
1120 	 * Recompute addrlen - an unspecified bind sent down an
1121 	 * address of length zero but we expect the appropriate length
1122 	 * in return.
1123 	 */
1124 	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1125 	    sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1126 
1127 	bind_ack = (struct T_bind_ack *)mp->b_rptr;
1128 	/*
1129 	 * The alignment restriction is really too strict but
1130 	 * we want enough alignment to inspect the fields of
1131 	 * a sockaddr_in.
1132 	 */
1133 	addr = sogetoff(mp, bind_ack->ADDR_offset,
1134 	    bind_ack->ADDR_length,
1135 	    __TPI_ALIGN_SIZE);
1136 	if (addr == NULL) {
1137 		freemsg(mp);
1138 		error = EPROTO;
1139 		eprintsoline(so, error);
1140 		goto done;
1141 	}
1142 	if (!(flags & _SOBIND_UNSPEC)) {
1143 		/*
1144 		 * Verify that the transport didn't return something we
1145 		 * did not want e.g. an address other than what we asked for.
1146 		 *
1147 		 * NOTE: These checks would go away if/when we switch to
1148 		 * using the new TPI (in which the transport would fail
1149 		 * the request instead of assigning a different address).
1150 		 *
1151 		 * NOTE2: For protocols that we don't know (i.e. any
1152 		 * other than AF_INET6, AF_INET and AF_UNIX), we
1153 		 * cannot know if the transport should be expected to
1154 		 * return the same address as that requested.
1155 		 *
1156 		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1157 		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1158 		 *
1159 		 * For example, in the case of netatalk it may be
1160 		 * inappropriate for the transport to return the
1161 		 * requested address (as it may have allocated a local
1162 		 * port number in behaviour similar to that of an
1163 		 * AF_INET bind request with a port number of zero).
1164 		 *
1165 		 * Given the definition of O_T_BIND_REQ, where the
1166 		 * transport may bind to an address other than the
1167 		 * requested address, it's not possible to determine
1168 		 * whether a returned address that differs from the
1169 		 * requested address is a reason to fail (because the
1170 		 * requested address was not available) or succeed
1171 		 * (because the transport allocated an appropriate
1172 		 * address and/or port).
1173 		 *
1174 		 * sockfs currently requires that the transport return
1175 		 * the requested address in the T_BIND_ACK, unless
1176 		 * there is code here to allow for any discrepancy.
1177 		 * Such code exists for AF_INET and AF_INET6.
1178 		 *
1179 		 * Netatalk chooses to return the requested address
1180 		 * rather than the (correct) allocated address.  This
1181 		 * means that netatalk violates the TPI specification
1182 		 * (and would not function correctly if used from a
1183 		 * TLI application), but it does mean that it works
1184 		 * with sockfs.
1185 		 *
1186 		 * As noted above, using the newer XTI bind primitive
1187 		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1188 		 * allow sockfs to be more sure about whether or not
1189 		 * the bind request had succeeded (as transports are
1190 		 * not permitted to bind to a different address than
1191 		 * that requested - they must return failure).
1192 		 * Unfortunately, support for T_BIND_REQ may not be
1193 		 * present in all transport implementations (netatalk,
1194 		 * for example, doesn't have it), making the
1195 		 * transition difficult.
1196 		 */
1197 		if (bind_ack->ADDR_length != addrlen) {
1198 			/* Assumes that the requested address was in use */
1199 			freemsg(mp);
1200 			error = EADDRINUSE;
1201 			eprintsoline(so, error);
1202 			goto done;
1203 		}
1204 
1205 		switch (so->so_family) {
1206 		case AF_INET6:
1207 		case AF_INET: {
1208 			sin_t *rname, *aname;
1209 
1210 			rname = (sin_t *)addr;
1211 			aname = (sin_t *)sti->sti_laddr_sa;
1212 
1213 			/*
1214 			 * Take advantage of the alignment
1215 			 * of sin_port and sin6_port which fall
1216 			 * in the same place in their data structures.
1217 			 * Just use sin_port for either address family.
1218 			 *
1219 			 * This may become a problem if (heaven forbid)
1220 			 * there's a separate ipv6port_reserved... :-P
1221 			 *
1222 			 * Binding to port 0 has the semantics of letting
1223 			 * the transport bind to any port.
1224 			 *
1225 			 * If the transport is TCP or UDP since we had sent
1226 			 * a T_BIND_REQ we would not get a port other than
1227 			 * what we asked for.
1228 			 */
1229 			if (tcp_udp_xport) {
1230 				/*
1231 				 * Pick up the new port number if we bound to
1232 				 * port 0.
1233 				 */
1234 				if (aname->sin_port == 0)
1235 					aname->sin_port = rname->sin_port;
1236 				sti->sti_laddr_valid = 1;
1237 				break;
1238 			}
1239 			if (aname->sin_port != 0 &&
1240 			    aname->sin_port != rname->sin_port) {
1241 				freemsg(mp);
1242 				error = EADDRINUSE;
1243 				eprintsoline(so, error);
1244 				goto done;
1245 			}
1246 			/*
1247 			 * Pick up the new port number if we bound to port 0.
1248 			 */
1249 			aname->sin_port = rname->sin_port;
1250 
1251 			/*
1252 			 * Unfortunately, addresses aren't _quite_ the same.
1253 			 */
1254 			if (so->so_family == AF_INET) {
1255 				if (aname->sin_addr.s_addr !=
1256 				    rname->sin_addr.s_addr) {
1257 					freemsg(mp);
1258 					error = EADDRNOTAVAIL;
1259 					eprintsoline(so, error);
1260 					goto done;
1261 				}
1262 			} else {
1263 				sin6_t *rname6 = (sin6_t *)rname;
1264 				sin6_t *aname6 = (sin6_t *)aname;
1265 
1266 				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1267 				    &rname6->sin6_addr)) {
1268 					freemsg(mp);
1269 					error = EADDRNOTAVAIL;
1270 					eprintsoline(so, error);
1271 					goto done;
1272 				}
1273 			}
1274 			break;
1275 		}
1276 		case AF_UNIX:
1277 			if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1278 				freemsg(mp);
1279 				error = EADDRINUSE;
1280 				eprintsoline(so, error);
1281 				eprintso(so,
1282 				    ("addrlen %d, addr 0x%x, vp %p\n",
1283 				    addrlen, *((int *)addr),
1284 				    (void *)sti->sti_ux_bound_vp));
1285 				goto done;
1286 			}
1287 			sti->sti_laddr_valid = 1;
1288 			break;
1289 		default:
1290 			/*
1291 			 * NOTE: This assumes that addresses can be
1292 			 * byte-compared for equivalence.
1293 			 */
1294 			if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1295 				freemsg(mp);
1296 				error = EADDRINUSE;
1297 				eprintsoline(so, error);
1298 				goto done;
1299 			}
1300 			/*
1301 			 * Don't mark sti_laddr_valid, as we cannot be
1302 			 * sure that the returned address is the real
1303 			 * bound address when talking to an unknown
1304 			 * transport.
1305 			 */
1306 			break;
1307 		}
1308 	} else {
1309 		/*
1310 		 * Save for returned address for getsockname.
1311 		 * Needed for unspecific bind unless transport supports
1312 		 * the TI_GETMYNAME ioctl.
1313 		 * Do this for AF_INET{,6} even though they do, as
1314 		 * caching info here is much better performance than
1315 		 * a TPI/STREAMS trip to the transport for getsockname.
1316 		 * Any which can't for some reason _must_ _not_ set
1317 		 * sti_laddr_valid here for the caching version of
1318 		 * getsockname to not break;
1319 		 */
1320 		switch (so->so_family) {
1321 		case AF_UNIX:
1322 			/*
1323 			 * Record the address bound with the transport
1324 			 * for use by socketpair.
1325 			 */
1326 			bcopy(addr, &sti->sti_ux_laddr, addrlen);
1327 			sti->sti_laddr_valid = 1;
1328 			break;
1329 		case AF_INET:
1330 		case AF_INET6:
1331 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1332 			bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1333 			sti->sti_laddr_valid = 1;
1334 			break;
1335 		default:
1336 			/*
1337 			 * Don't mark sti_laddr_valid, as we cannot be
1338 			 * sure that the returned address is the real
1339 			 * bound address when talking to an unknown
1340 			 * transport.
1341 			 */
1342 			break;
1343 		}
1344 	}
1345 
1346 	if (nl7c != NULL) {
1347 		/* Register listen()er sonode pointer with NL7C */
1348 		nl7c_listener_addr(nl7c, so);
1349 	}
1350 
1351 	freemsg(mp);
1352 
1353 done:
1354 	if (error) {
1355 		/* reset state & backlog to values held on entry */
1356 		if (clear_acceptconn_on_err == B_TRUE)
1357 			so->so_state &= ~SS_ACCEPTCONN;
1358 		if (restore_backlog_on_err == B_TRUE)
1359 			so->so_backlog = save_so_backlog;
1360 
1361 		if (unbind_on_err && so->so_state & SS_ISBOUND) {
1362 			int err;
1363 
1364 			err = sotpi_unbind(so, 0);
1365 			/* LINTED - statement has no consequent: if */
1366 			if (err) {
1367 				eprintsoline(so, error);
1368 			} else {
1369 				ASSERT(!(so->so_state & SS_ISBOUND));
1370 			}
1371 		}
1372 	}
1373 	if (!(flags & _SOBIND_LOCK_HELD)) {
1374 		so_unlock_single(so, SOLOCKED);
1375 		mutex_exit(&so->so_lock);
1376 	} else {
1377 		ASSERT(MUTEX_HELD(&so->so_lock));
1378 		ASSERT(so->so_flag & SOLOCKED);
1379 	}
1380 	return (error);
1381 }
1382 
1383 /* bind the socket */
1384 static int
1385 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1386     int flags, struct cred *cr)
1387 {
1388 	if ((flags & _SOBIND_SOCKETPAIR) == 0)
1389 		return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1390 
1391 	flags &= ~_SOBIND_SOCKETPAIR;
1392 	return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1393 }
1394 
1395 /*
1396  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1397  * address, or when listen needs to unbind and bind.
1398  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1399  * so that a sobind can pick them up.
1400  */
1401 static int
1402 sotpi_unbind(struct sonode *so, int flags)
1403 {
1404 	struct T_unbind_req	unbind_req;
1405 	int			error = 0;
1406 	mblk_t			*mp;
1407 	sotpi_info_t		*sti = SOTOTPI(so);
1408 
1409 	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1410 	    (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1411 
1412 	ASSERT(MUTEX_HELD(&so->so_lock));
1413 	ASSERT(so->so_flag & SOLOCKED);
1414 
1415 	if (!(so->so_state & SS_ISBOUND)) {
1416 		error = EINVAL;
1417 		eprintsoline(so, error);
1418 		goto done;
1419 	}
1420 
1421 	mutex_exit(&so->so_lock);
1422 
1423 	/*
1424 	 * Flush the read and write side (except stream head read queue)
1425 	 * and send down T_UNBIND_REQ.
1426 	 */
1427 	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1428 
1429 	unbind_req.PRIM_type = T_UNBIND_REQ;
1430 	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1431 	    0, _ALLOC_SLEEP, CRED());
1432 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1433 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1434 	mutex_enter(&so->so_lock);
1435 	if (error) {
1436 		eprintsoline(so, error);
1437 		goto done;
1438 	}
1439 
1440 	error = sowaitokack(so, T_UNBIND_REQ);
1441 	if (error) {
1442 		eprintsoline(so, error);
1443 		goto done;
1444 	}
1445 
1446 	/*
1447 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1448 	 * strsock_proto while the lock was dropped above, the unbind
1449 	 * is allowed to complete.
1450 	 */
1451 	if (!(flags & _SOUNBIND_REBIND)) {
1452 		/*
1453 		 * Clear out bound address.
1454 		 */
1455 		vnode_t *vp;
1456 
1457 		if ((vp = sti->sti_ux_bound_vp) != NULL) {
1458 			sti->sti_ux_bound_vp = NULL;
1459 			vn_rele_stream(vp);
1460 		}
1461 		/* Clear out address */
1462 		sti->sti_laddr_len = 0;
1463 	}
1464 	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1465 	sti->sti_laddr_valid = 0;
1466 
1467 done:
1468 
1469 	/* If the caller held the lock don't release it here */
1470 	ASSERT(MUTEX_HELD(&so->so_lock));
1471 	ASSERT(so->so_flag & SOLOCKED);
1472 
1473 	return (error);
1474 }
1475 
1476 /*
1477  * listen on the socket.
1478  * For TPI conforming transports this has to first unbind with the transport
1479  * and then bind again using the new backlog.
1480  */
1481 /* ARGSUSED */
1482 int
1483 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1484 {
1485 	int		error = 0;
1486 	sotpi_info_t	*sti = SOTOTPI(so);
1487 
1488 	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1489 	    (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1490 
1491 	if (sti->sti_serv_type == T_CLTS)
1492 		return (EOPNOTSUPP);
1493 
1494 	/*
1495 	 * If the socket is ready to accept connections already, then
1496 	 * return without doing anything.  This avoids a problem where
1497 	 * a second listen() call fails if a connection is pending and
1498 	 * leaves the socket unbound. Only when we are not unbinding
1499 	 * with the transport can we safely increase the backlog.
1500 	 */
1501 	if (so->so_state & SS_ACCEPTCONN &&
1502 	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1503 	    /*CONSTCOND*/
1504 	    !solisten_tpi_tcp))
1505 		return (0);
1506 
1507 	if (so->so_state & SS_ISCONNECTED)
1508 		return (EINVAL);
1509 
1510 	mutex_enter(&so->so_lock);
1511 	so_lock_single(so);	/* Set SOLOCKED */
1512 
1513 	/*
1514 	 * If the listen doesn't change the backlog we do nothing.
1515 	 * This avoids an EPROTO error from the transport.
1516 	 */
1517 	if ((so->so_state & SS_ACCEPTCONN) &&
1518 	    so->so_backlog == backlog)
1519 		goto done;
1520 
1521 	if (!(so->so_state & SS_ISBOUND)) {
1522 		/*
1523 		 * Must have been explicitly bound in the UNIX domain.
1524 		 */
1525 		if (so->so_family == AF_UNIX) {
1526 			error = EINVAL;
1527 			goto done;
1528 		}
1529 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1530 		    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1531 	} else if (backlog > 0) {
1532 		/*
1533 		 * AF_INET{,6} hack to avoid losing the port.
1534 		 * Assumes that all AF_INET{,6} transports can handle a
1535 		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1536 		 * has already bound thus it is possible to avoid the unbind.
1537 		 */
1538 		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1539 		    /*CONSTCOND*/
1540 		    !solisten_tpi_tcp)) {
1541 			error = sotpi_unbind(so, _SOUNBIND_REBIND);
1542 			if (error)
1543 				goto done;
1544 		}
1545 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1546 		    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1547 	} else {
1548 		so->so_state |= SS_ACCEPTCONN;
1549 		so->so_backlog = backlog;
1550 	}
1551 	if (error)
1552 		goto done;
1553 	ASSERT(so->so_state & SS_ACCEPTCONN);
1554 done:
1555 	so_unlock_single(so, SOLOCKED);
1556 	mutex_exit(&so->so_lock);
1557 	return (error);
1558 }
1559 
1560 /*
1561  * Disconnect either a specified seqno or all (-1).
1562  * The former is used on listening sockets only.
1563  *
1564  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1565  * the current use of sodisconnect(seqno == -1) is only for shutdown
1566  * so there is no point (and potentially incorrect) to unbind.
1567  */
1568 static int
1569 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1570 {
1571 	struct T_discon_req	discon_req;
1572 	int			error = 0;
1573 	mblk_t			*mp;
1574 
1575 	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1576 	    (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1577 
1578 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1579 		mutex_enter(&so->so_lock);
1580 		so_lock_single(so);	/* Set SOLOCKED */
1581 	} else {
1582 		ASSERT(MUTEX_HELD(&so->so_lock));
1583 		ASSERT(so->so_flag & SOLOCKED);
1584 	}
1585 
1586 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1587 		error = EINVAL;
1588 		eprintsoline(so, error);
1589 		goto done;
1590 	}
1591 
1592 	mutex_exit(&so->so_lock);
1593 	/*
1594 	 * Flush the write side (unless this is a listener)
1595 	 * and then send down a T_DISCON_REQ.
1596 	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1597 	 * and other messages.)
1598 	 */
1599 	if (!(so->so_state & SS_ACCEPTCONN))
1600 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1601 
1602 	discon_req.PRIM_type = T_DISCON_REQ;
1603 	discon_req.SEQ_number = seqno;
1604 	mp = soallocproto1(&discon_req, sizeof (discon_req),
1605 	    0, _ALLOC_SLEEP, CRED());
1606 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1607 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1608 	mutex_enter(&so->so_lock);
1609 	if (error) {
1610 		eprintsoline(so, error);
1611 		goto done;
1612 	}
1613 
1614 	error = sowaitokack(so, T_DISCON_REQ);
1615 	if (error) {
1616 		eprintsoline(so, error);
1617 		goto done;
1618 	}
1619 	/*
1620 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1621 	 * strsock_proto while the lock was dropped above, the disconnect
1622 	 * is allowed to complete. However, it is not possible to
1623 	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1624 	 */
1625 	so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
1626 	SOTOTPI(so)->sti_laddr_valid = 0;
1627 	SOTOTPI(so)->sti_faddr_valid = 0;
1628 done:
1629 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1630 		so_unlock_single(so, SOLOCKED);
1631 		mutex_exit(&so->so_lock);
1632 	} else {
1633 		/* If the caller held the lock don't release it here */
1634 		ASSERT(MUTEX_HELD(&so->so_lock));
1635 		ASSERT(so->so_flag & SOLOCKED);
1636 	}
1637 	return (error);
1638 }
1639 
1640 /* ARGSUSED */
1641 int
1642 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1643     struct sonode **nsop)
1644 {
1645 	struct T_conn_ind	*conn_ind;
1646 	struct T_conn_res	*conn_res;
1647 	int			error = 0;
1648 	mblk_t			*mp, *ack_mp;
1649 	struct sonode		*nso;
1650 	vnode_t			*nvp;
1651 	void			*src;
1652 	t_uscalar_t		srclen;
1653 	void			*opt;
1654 	t_uscalar_t		optlen;
1655 	t_scalar_t		PRIM_type;
1656 	t_scalar_t		SEQ_number;
1657 	size_t			sinlen;
1658 	sotpi_info_t		*sti = SOTOTPI(so);
1659 	sotpi_info_t		*nsti;
1660 
1661 	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1662 	    (void *)so, fflag, (void *)nsop,
1663 	    pr_state(so->so_state, so->so_mode)));
1664 
1665 	/*
1666 	 * Defer single-threading the accepting socket until
1667 	 * the T_CONN_IND has been received and parsed and the
1668 	 * new sonode has been opened.
1669 	 */
1670 
1671 	/* Check that we are not already connected */
1672 	if ((so->so_state & SS_ACCEPTCONN) == 0)
1673 		goto conn_bad;
1674 again:
1675 	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1676 		goto e_bad;
1677 
1678 	ASSERT(mp != NULL);
1679 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1680 
1681 	/*
1682 	 * Save SEQ_number for error paths.
1683 	 */
1684 	SEQ_number = conn_ind->SEQ_number;
1685 
1686 	srclen = conn_ind->SRC_length;
1687 	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1688 	if (src == NULL) {
1689 		error = EPROTO;
1690 		freemsg(mp);
1691 		eprintsoline(so, error);
1692 		goto disconnect_unlocked;
1693 	}
1694 	optlen = conn_ind->OPT_length;
1695 	switch (so->so_family) {
1696 	case AF_INET:
1697 	case AF_INET6:
1698 		if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1699 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
1700 			    &opt, conn_ind->OPT_length);
1701 		} else {
1702 			/*
1703 			 * The transport (in this case TCP) hasn't sent up
1704 			 * a pointer to an instance for the accept fast-path.
1705 			 * Disable fast-path completely because the call to
1706 			 * sotpi_create() below would otherwise create an
1707 			 * incomplete TCP instance, which would lead to
1708 			 * problems when sockfs sends a normal T_CONN_RES
1709 			 * message down the new stream.
1710 			 */
1711 			if (sti->sti_direct) {
1712 				int rval;
1713 				/*
1714 				 * For consistency we inform tcp to disable
1715 				 * direct interface on the listener, though
1716 				 * we can certainly live without doing this
1717 				 * because no data will ever travel upstream
1718 				 * on the listening socket.
1719 				 */
1720 				sti->sti_direct = 0;
1721 				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1722 				    0, 0, K_TO_K, cr, &rval);
1723 			}
1724 			opt = NULL;
1725 			optlen = 0;
1726 		}
1727 		break;
1728 	case AF_UNIX:
1729 	default:
1730 		if (optlen != 0) {
1731 			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1732 			    __TPI_ALIGN_SIZE);
1733 			if (opt == NULL) {
1734 				error = EPROTO;
1735 				freemsg(mp);
1736 				eprintsoline(so, error);
1737 				goto disconnect_unlocked;
1738 			}
1739 		}
1740 		if (so->so_family == AF_UNIX) {
1741 			if (!sti->sti_faddr_noxlate) {
1742 				src = NULL;
1743 				srclen = 0;
1744 			}
1745 			/* Extract src address from options */
1746 			if (optlen != 0)
1747 				so_getopt_srcaddr(opt, optlen, &src, &srclen);
1748 		}
1749 		break;
1750 	}
1751 
1752 	/*
1753 	 * Create the new socket.
1754 	 */
1755 	nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1756 	if (nso == NULL) {
1757 		ASSERT(error != 0);
1758 		/*
1759 		 * Accept can not fail with ENOBUFS. sotpi_create
1760 		 * sleeps waiting for memory until a signal is caught
1761 		 * so return EINTR.
1762 		 */
1763 		freemsg(mp);
1764 		if (error == ENOBUFS)
1765 			error = EINTR;
1766 		goto e_disc_unl;
1767 	}
1768 	nvp = SOTOV(nso);
1769 	nsti = SOTOTPI(nso);
1770 
1771 #ifdef DEBUG
1772 	/*
1773 	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1774 	 * it's inherited early to allow debugging of the accept code itself.
1775 	 */
1776 	nso->so_options |= so->so_options & SO_DEBUG;
1777 #endif /* DEBUG */
1778 
1779 	/*
1780 	 * Save the SRC address from the T_CONN_IND
1781 	 * for getpeername to work on AF_UNIX and on transports that do not
1782 	 * support TI_GETPEERNAME.
1783 	 *
1784 	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1785 	 * copyin_name().
1786 	 */
1787 	if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1788 		error = EINVAL;
1789 		freemsg(mp);
1790 		eprintsoline(so, error);
1791 		goto disconnect_vp_unlocked;
1792 	}
1793 	nsti->sti_faddr_len = (socklen_t)srclen;
1794 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1795 	bcopy(src, nsti->sti_faddr_sa, srclen);
1796 	nsti->sti_faddr_valid = 1;
1797 
1798 	/*
1799 	 * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1800 	 */
1801 	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1802 	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1803 		cred_t	*cr;
1804 		pid_t	cpid;
1805 
1806 		cr = msg_getcred(mp, &cpid);
1807 		if (cr != NULL) {
1808 			crhold(cr);
1809 			nso->so_peercred = cr;
1810 			nso->so_cpid = cpid;
1811 		}
1812 		freemsg(mp);
1813 
1814 		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1815 		    sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1816 		if (mp == NULL) {
1817 			/*
1818 			 * Accept can not fail with ENOBUFS.
1819 			 * A signal was caught so return EINTR.
1820 			 */
1821 			error = EINTR;
1822 			eprintsoline(so, error);
1823 			goto disconnect_vp_unlocked;
1824 		}
1825 		conn_res = (struct T_conn_res *)mp->b_rptr;
1826 	} else {
1827 		/*
1828 		 * For efficiency reasons we use msg_extractcred; no crhold
1829 		 * needed since db_credp is cleared (i.e., we move the cred
1830 		 * from the message to so_peercred).
1831 		 */
1832 		nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1833 
1834 		mp->b_rptr = DB_BASE(mp);
1835 		conn_res = (struct T_conn_res *)mp->b_rptr;
1836 		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1837 
1838 		mblk_setcred(mp, cr, curproc->p_pid);
1839 	}
1840 
1841 	/*
1842 	 * New socket must be bound at least in sockfs and, except for AF_INET,
1843 	 * (or AF_INET6) it also has to be bound in the transport provider.
1844 	 * We set the local address in the sonode from the T_OK_ACK of the
1845 	 * T_CONN_RES. For this reason the address we bind to here isn't
1846 	 * important.
1847 	 */
1848 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1849 	    /*CONSTCOND*/
1850 	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1851 		/*
1852 		 * Optimization for AF_INET{,6} transports
1853 		 * that can handle a T_CONN_RES without being bound.
1854 		 */
1855 		mutex_enter(&nso->so_lock);
1856 		so_automatic_bind(nso);
1857 		mutex_exit(&nso->so_lock);
1858 	} else {
1859 		/* Perform NULL bind with the transport provider. */
1860 		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1861 		    cr)) != 0) {
1862 			ASSERT(error != ENOBUFS);
1863 			freemsg(mp);
1864 			eprintsoline(nso, error);
1865 			goto disconnect_vp_unlocked;
1866 		}
1867 	}
1868 
1869 	/*
1870 	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1871 	 * so that any data arriving on the new socket will cause the
1872 	 * appropriate signals to be delivered for the new socket.
1873 	 *
1874 	 * No other thread (except strsock_proto and strsock_misc)
1875 	 * can access the new socket thus we relax the locking.
1876 	 */
1877 	nso->so_pgrp = so->so_pgrp;
1878 	nso->so_state |= so->so_state & SS_ASYNC;
1879 	nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1880 
1881 	if (nso->so_pgrp != 0) {
1882 		if ((error = so_set_events(nso, nvp, cr)) != 0) {
1883 			eprintsoline(nso, error);
1884 			error = 0;
1885 			nso->so_pgrp = 0;
1886 		}
1887 	}
1888 
1889 	/*
1890 	 * Make note of the socket level options. TCP and IP level options
1891 	 * are already inherited. We could do all this after accept is
1892 	 * successful but doing it here simplifies code and no harm done
1893 	 * for error case.
1894 	 */
1895 	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1896 	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1897 	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1898 	nso->so_sndbuf = so->so_sndbuf;
1899 	nso->so_rcvbuf = so->so_rcvbuf;
1900 	if (nso->so_options & SO_LINGER)
1901 		nso->so_linger = so->so_linger;
1902 
1903 	/*
1904 	 * Note that the following sti_direct code path should be
1905 	 * removed once we are confident that the direct sockets
1906 	 * do not result in any degradation.
1907 	 */
1908 	if (sti->sti_direct) {
1909 
1910 		ASSERT(opt != NULL);
1911 
1912 		conn_res->OPT_length = optlen;
1913 		conn_res->OPT_offset = MBLKL(mp);
1914 		bcopy(&opt, mp->b_wptr, optlen);
1915 		mp->b_wptr += optlen;
1916 		conn_res->PRIM_type = T_CONN_RES;
1917 		conn_res->ACCEPTOR_id = 0;
1918 		PRIM_type = T_CONN_RES;
1919 
1920 		/* Send down the T_CONN_RES on acceptor STREAM */
1921 		error = kstrputmsg(SOTOV(nso), mp, NULL,
1922 		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1923 		if (error) {
1924 			mutex_enter(&so->so_lock);
1925 			so_lock_single(so);
1926 			eprintsoline(so, error);
1927 			goto disconnect_vp;
1928 		}
1929 		mutex_enter(&nso->so_lock);
1930 		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1931 		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1932 		if (error) {
1933 			mutex_exit(&nso->so_lock);
1934 			mutex_enter(&so->so_lock);
1935 			so_lock_single(so);
1936 			eprintsoline(so, error);
1937 			goto disconnect_vp;
1938 		}
1939 		if (nso->so_family == AF_INET) {
1940 			sin_t *sin;
1941 
1942 			sin = (sin_t *)(ack_mp->b_rptr +
1943 			    sizeof (struct T_ok_ack));
1944 			bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
1945 			nsti->sti_laddr_len = sizeof (sin_t);
1946 		} else {
1947 			sin6_t *sin6;
1948 
1949 			sin6 = (sin6_t *)(ack_mp->b_rptr +
1950 			    sizeof (struct T_ok_ack));
1951 			bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
1952 			nsti->sti_laddr_len = sizeof (sin6_t);
1953 		}
1954 		freemsg(ack_mp);
1955 
1956 		nso->so_state |= SS_ISCONNECTED;
1957 		nso->so_proto_handle = (sock_lower_handle_t)opt;
1958 		nsti->sti_laddr_valid = 1;
1959 
1960 		if (sti->sti_nl7c_flags & NL7C_ENABLED) {
1961 			/*
1962 			 * A NL7C marked listen()er so the new socket
1963 			 * inherits the listen()er's NL7C state, except
1964 			 * for NL7C_POLLIN.
1965 			 *
1966 			 * Only call NL7C to process the new socket if
1967 			 * the listen socket allows blocking i/o.
1968 			 */
1969 			nsti->sti_nl7c_flags =
1970 			    sti->sti_nl7c_flags & (~NL7C_POLLIN);
1971 			if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
1972 				/*
1973 				 * Nonblocking accept() just make it
1974 				 * persist to defer processing to the
1975 				 * read-side syscall (e.g. read).
1976 				 */
1977 				nsti->sti_nl7c_flags |= NL7C_SOPERSIST;
1978 			} else if (nl7c_process(nso, B_FALSE)) {
1979 				/*
1980 				 * NL7C has completed processing on the
1981 				 * socket, close the socket and back to
1982 				 * the top to await the next T_CONN_IND.
1983 				 */
1984 				mutex_exit(&nso->so_lock);
1985 				(void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
1986 				    cr, NULL);
1987 				VN_RELE(nvp);
1988 				goto again;
1989 			}
1990 			/* Pass the new socket out */
1991 		}
1992 
1993 		mutex_exit(&nso->so_lock);
1994 
1995 		/*
1996 		 * It's possible, through the use of autopush for example,
1997 		 * that the acceptor stream may not support sti_direct
1998 		 * semantics. If the new socket does not support sti_direct
1999 		 * we issue a _SIOCSOCKFALLBACK to inform the transport
2000 		 * as we would in the I_PUSH case.
2001 		 */
2002 		if (nsti->sti_direct == 0) {
2003 			int	rval;
2004 
2005 			if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
2006 			    0, 0, K_TO_K, cr, &rval)) != 0) {
2007 				mutex_enter(&so->so_lock);
2008 				so_lock_single(so);
2009 				eprintsoline(so, error);
2010 				goto disconnect_vp;
2011 			}
2012 		}
2013 
2014 		/*
2015 		 * Pass out new socket.
2016 		 */
2017 		if (nsop != NULL)
2018 			*nsop = nso;
2019 
2020 		return (0);
2021 	}
2022 
2023 	/*
2024 	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
2025 	 * which don't support the FireEngine accept fast-path. It is also
2026 	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
2027 	 * again. Neither sockfs nor TCP attempt to find out if some other
2028 	 * random module has been inserted in between (in which case we
2029 	 * should follow TLI accept behaviour). We blindly assume the worst
2030 	 * case and revert back to old behaviour i.e. TCP will not send us
2031 	 * any option (eager) and the accept should happen on the listener
2032 	 * queue. Any queued T_conn_ind have already got their options removed
2033 	 * by so_sock2_stream() when "sockmod" was I_POP'd.
2034 	 */
2035 	/*
2036 	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
2037 	 */
2038 	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
2039 #ifdef	_ILP32
2040 		queue_t	*q;
2041 
2042 		/*
2043 		 * Find read queue in driver
2044 		 * Can safely do this since we "own" nso/nvp.
2045 		 */
2046 		q = strvp2wq(nvp)->q_next;
2047 		while (SAMESTR(q))
2048 			q = q->q_next;
2049 		q = RD(q);
2050 		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
2051 #else
2052 		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
2053 #endif	/* _ILP32 */
2054 		conn_res->PRIM_type = O_T_CONN_RES;
2055 		PRIM_type = O_T_CONN_RES;
2056 	} else {
2057 		conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
2058 		conn_res->PRIM_type = T_CONN_RES;
2059 		PRIM_type = T_CONN_RES;
2060 	}
2061 	conn_res->SEQ_number = SEQ_number;
2062 	conn_res->OPT_length = 0;
2063 	conn_res->OPT_offset = 0;
2064 
2065 	mutex_enter(&so->so_lock);
2066 	so_lock_single(so);	/* Set SOLOCKED */
2067 	mutex_exit(&so->so_lock);
2068 
2069 	error = kstrputmsg(SOTOV(so), mp, NULL,
2070 	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2071 	mutex_enter(&so->so_lock);
2072 	if (error) {
2073 		eprintsoline(so, error);
2074 		goto disconnect_vp;
2075 	}
2076 	error = sowaitprim(so, PRIM_type, T_OK_ACK,
2077 	    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2078 	if (error) {
2079 		eprintsoline(so, error);
2080 		goto disconnect_vp;
2081 	}
2082 	mutex_exit(&so->so_lock);
2083 	/*
2084 	 * If there is a sin/sin6 appended onto the T_OK_ACK use
2085 	 * that to set the local address. If this is not present
2086 	 * then we zero out the address and don't set the
2087 	 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
2088 	 * the pathname from the listening socket.
2089 	 * In the case where this is TCP or an AF_UNIX socket the
2090 	 * client side may have queued data or a T_ORDREL in the
2091 	 * transport. Having now sent the T_CONN_RES we may receive
2092 	 * those queued messages at any time. Hold the acceptor
2093 	 * so_lock until its state and laddr are finalized.
2094 	 */
2095 	mutex_enter(&nso->so_lock);
2096 	sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
2097 	if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
2098 	    MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2099 		ack_mp->b_rptr += sizeof (struct T_ok_ack);
2100 		bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2101 		nsti->sti_laddr_len = sinlen;
2102 		nsti->sti_laddr_valid = 1;
2103 	} else if (nso->so_family == AF_UNIX) {
2104 		ASSERT(so->so_family == AF_UNIX);
2105 		nsti->sti_laddr_len = sti->sti_laddr_len;
2106 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2107 		bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2108 		    nsti->sti_laddr_len);
2109 		nsti->sti_laddr_valid = 1;
2110 	} else {
2111 		nsti->sti_laddr_len = sti->sti_laddr_len;
2112 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2113 		bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2114 		nsti->sti_laddr_sa->sa_family = nso->so_family;
2115 	}
2116 	nso->so_state |= SS_ISCONNECTED;
2117 	mutex_exit(&nso->so_lock);
2118 
2119 	freemsg(ack_mp);
2120 
2121 	mutex_enter(&so->so_lock);
2122 	so_unlock_single(so, SOLOCKED);
2123 	mutex_exit(&so->so_lock);
2124 
2125 	/*
2126 	 * Pass out new socket.
2127 	 */
2128 	if (nsop != NULL)
2129 		*nsop = nso;
2130 
2131 	return (0);
2132 
2133 
2134 eproto_disc_unl:
2135 	error = EPROTO;
2136 e_disc_unl:
2137 	eprintsoline(so, error);
2138 	goto disconnect_unlocked;
2139 
2140 pr_disc_vp_unl:
2141 	eprintsoline(so, error);
2142 disconnect_vp_unlocked:
2143 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2144 	VN_RELE(nvp);
2145 disconnect_unlocked:
2146 	(void) sodisconnect(so, SEQ_number, 0);
2147 	return (error);
2148 
2149 pr_disc_vp:
2150 	eprintsoline(so, error);
2151 disconnect_vp:
2152 	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2153 	so_unlock_single(so, SOLOCKED);
2154 	mutex_exit(&so->so_lock);
2155 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2156 	VN_RELE(nvp);
2157 	return (error);
2158 
2159 conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2160 	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2161 	    ? EOPNOTSUPP : EINVAL;
2162 e_bad:
2163 	eprintsoline(so, error);
2164 	return (error);
2165 }
2166 
2167 /*
2168  * connect a socket.
2169  *
2170  * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
2171  * unconnect (by specifying a null address).
2172  */
2173 int
2174 sotpi_connect(struct sonode *so,
2175 	struct sockaddr *name,
2176 	socklen_t namelen,
2177 	int fflag,
2178 	int flags,
2179 	struct cred *cr)
2180 {
2181 	struct T_conn_req	conn_req;
2182 	int			error = 0;
2183 	mblk_t			*mp;
2184 	void			*src;
2185 	socklen_t		srclen;
2186 	void			*addr;
2187 	socklen_t		addrlen;
2188 	boolean_t		need_unlock;
2189 	sotpi_info_t		*sti = SOTOTPI(so);
2190 
2191 	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2192 	    (void *)so, (void *)name, namelen, fflag, flags,
2193 	    pr_state(so->so_state, so->so_mode)));
2194 
2195 	/*
2196 	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2197 	 * avoid sleeping for memory with SOLOCKED held.
2198 	 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2199 	 * + sizeof (struct T_opthdr).
2200 	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
2201 	 * exceed sti_faddr_maxlen).
2202 	 */
2203 	mp = soallocproto(sizeof (struct T_conn_req) +
2204 	    2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
2205 	    cr);
2206 	if (mp == NULL) {
2207 		/*
2208 		 * Connect can not fail with ENOBUFS. A signal was
2209 		 * caught so return EINTR.
2210 		 */
2211 		error = EINTR;
2212 		eprintsoline(so, error);
2213 		return (error);
2214 	}
2215 
2216 	mutex_enter(&so->so_lock);
2217 	/*
2218 	 * Make sure there is a preallocated T_unbind_req message
2219 	 * before any binding. This message is allocated when the
2220 	 * socket is created. Since another thread can consume
2221 	 * so_unbind_mp by the time we return from so_lock_single(),
2222 	 * we should check the availability of so_unbind_mp after
2223 	 * we return from so_lock_single().
2224 	 */
2225 
2226 	so_lock_single(so);	/* Set SOLOCKED */
2227 	need_unlock = B_TRUE;
2228 
2229 	if (sti->sti_unbind_mp == NULL) {
2230 		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2231 		/* NOTE: holding so_lock while sleeping */
2232 		sti->sti_unbind_mp =
2233 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
2234 		if (sti->sti_unbind_mp == NULL) {
2235 			error = EINTR;
2236 			goto done;
2237 		}
2238 	}
2239 
2240 	/*
2241 	 * Can't have done a listen before connecting.
2242 	 */
2243 	if (so->so_state & SS_ACCEPTCONN) {
2244 		error = EOPNOTSUPP;
2245 		goto done;
2246 	}
2247 
2248 	/*
2249 	 * Must be bound with the transport
2250 	 */
2251 	if (!(so->so_state & SS_ISBOUND)) {
2252 		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2253 		    /*CONSTCOND*/
2254 		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2255 			/*
2256 			 * Optimization for AF_INET{,6} transports
2257 			 * that can handle a T_CONN_REQ without being bound.
2258 			 */
2259 			so_automatic_bind(so);
2260 		} else {
2261 			error = sotpi_bind(so, NULL, 0,
2262 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2263 			if (error)
2264 				goto done;
2265 		}
2266 		ASSERT(so->so_state & SS_ISBOUND);
2267 		flags |= _SOCONNECT_DID_BIND;
2268 	}
2269 
2270 	/*
2271 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
2272 	 * connect to a null address. This is the portable method to
2273 	 * unconnect a socket.
2274 	 */
2275 	if ((namelen >= sizeof (sa_family_t)) &&
2276 	    (name->sa_family == AF_UNSPEC)) {
2277 		name = NULL;
2278 		namelen = 0;
2279 	}
2280 
2281 	/*
2282 	 * Check that we are not already connected.
2283 	 * A connection-oriented socket cannot be reconnected.
2284 	 * A connected connection-less socket can be
2285 	 * - connected to a different address by a subsequent connect
2286 	 * - "unconnected" by a connect to the NULL address
2287 	 */
2288 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2289 		ASSERT(!(flags & _SOCONNECT_DID_BIND));
2290 		if (so->so_mode & SM_CONNREQUIRED) {
2291 			/* Connection-oriented socket */
2292 			error = so->so_state & SS_ISCONNECTED ?
2293 			    EISCONN : EALREADY;
2294 			goto done;
2295 		}
2296 		/* Connection-less socket */
2297 		if (name == NULL) {
2298 			/*
2299 			 * Remove the connected state and clear SO_DGRAM_ERRIND
2300 			 * since it was set when the socket was connected.
2301 			 * If this is UDP also send down a T_DISCON_REQ.
2302 			 */
2303 			int val;
2304 
2305 			if ((so->so_family == AF_INET ||
2306 			    so->so_family == AF_INET6) &&
2307 			    (so->so_type == SOCK_DGRAM ||
2308 			    so->so_type == SOCK_RAW) &&
2309 			    /*CONSTCOND*/
2310 			    !soconnect_tpi_udp) {
2311 				/* XXX What about implicitly unbinding here? */
2312 				error = sodisconnect(so, -1,
2313 				    _SODISCONNECT_LOCK_HELD);
2314 			} else {
2315 				so->so_state &=
2316 				    ~(SS_ISCONNECTED | SS_ISCONNECTING);
2317 				sti->sti_faddr_valid = 0;
2318 				sti->sti_faddr_len = 0;
2319 			}
2320 
2321 			/* Remove SOLOCKED since setsockopt will grab it */
2322 			so_unlock_single(so, SOLOCKED);
2323 			mutex_exit(&so->so_lock);
2324 
2325 			val = 0;
2326 			(void) sotpi_setsockopt(so, SOL_SOCKET,
2327 			    SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2328 			    cr);
2329 
2330 			mutex_enter(&so->so_lock);
2331 			so_lock_single(so);	/* Set SOLOCKED */
2332 			goto done;
2333 		}
2334 	}
2335 	ASSERT(so->so_state & SS_ISBOUND);
2336 
2337 	if (name == NULL || namelen == 0) {
2338 		error = EINVAL;
2339 		goto done;
2340 	}
2341 	/*
2342 	 * Mark the socket if sti_faddr_sa represents the transport level
2343 	 * address.
2344 	 */
2345 	if (flags & _SOCONNECT_NOXLATE) {
2346 		struct sockaddr_ux	*soaddr_ux;
2347 
2348 		ASSERT(so->so_family == AF_UNIX);
2349 		if (namelen != sizeof (struct sockaddr_ux)) {
2350 			error = EINVAL;
2351 			goto done;
2352 		}
2353 		soaddr_ux = (struct sockaddr_ux *)name;
2354 		name = (struct sockaddr *)&soaddr_ux->sou_addr;
2355 		namelen = sizeof (soaddr_ux->sou_addr);
2356 		sti->sti_faddr_noxlate = 1;
2357 	}
2358 
2359 	/*
2360 	 * Length and family checks.
2361 	 */
2362 	error = so_addr_verify(so, name, namelen);
2363 	if (error)
2364 		goto bad;
2365 
2366 	/*
2367 	 * Save foreign address. Needed for AF_UNIX as well as
2368 	 * transport providers that do not support TI_GETPEERNAME.
2369 	 * Also used for cached foreign address for TCP and UDP.
2370 	 */
2371 	if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2372 		error = EINVAL;
2373 		goto done;
2374 	}
2375 	sti->sti_faddr_len = (socklen_t)namelen;
2376 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2377 	bcopy(name, sti->sti_faddr_sa, namelen);
2378 	sti->sti_faddr_valid = 1;
2379 
2380 	if (so->so_family == AF_UNIX) {
2381 		if (sti->sti_faddr_noxlate) {
2382 			/*
2383 			 * Already have a transport internal address. Do not
2384 			 * pass any (transport internal) source address.
2385 			 */
2386 			addr = sti->sti_faddr_sa;
2387 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2388 			src = NULL;
2389 			srclen = 0;
2390 		} else {
2391 			/*
2392 			 * Pass the sockaddr_un source address as an option
2393 			 * and translate the remote address.
2394 			 * Holding so_lock thus sti_laddr_sa can not change.
2395 			 */
2396 			src = sti->sti_laddr_sa;
2397 			srclen = (t_uscalar_t)sti->sti_laddr_len;
2398 			dprintso(so, 1,
2399 			    ("sotpi_connect UNIX: srclen %d, src %p\n",
2400 			    srclen, src));
2401 			error = so_ux_addr_xlate(so,
2402 			    sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2403 			    (flags & _SOCONNECT_XPG4_2),
2404 			    &addr, &addrlen);
2405 			if (error)
2406 				goto bad;
2407 		}
2408 	} else {
2409 		addr = sti->sti_faddr_sa;
2410 		addrlen = (t_uscalar_t)sti->sti_faddr_len;
2411 		src = NULL;
2412 		srclen = 0;
2413 	}
2414 	/*
2415 	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2416 	 * option which asks the transport provider to send T_UDERR_IND
2417 	 * messages. These T_UDERR_IND messages are used to return connected
2418 	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2419 	 *
2420 	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2421 	 * we send down a T_CONN_REQ. This is needed to let the
2422 	 * transport assign a local address that is consistent with
2423 	 * the remote address. Applications depend on a getsockname()
2424 	 * after a connect() to retrieve the "source" IP address for
2425 	 * the connected socket.  Invalidate the cached local address
2426 	 * to force getsockname() to enquire of the transport.
2427 	 */
2428 	if (!(so->so_mode & SM_CONNREQUIRED)) {
2429 		/*
2430 		 * Datagram socket.
2431 		 */
2432 		int32_t val;
2433 
2434 		so_unlock_single(so, SOLOCKED);
2435 		mutex_exit(&so->so_lock);
2436 
2437 		val = 1;
2438 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2439 		    &val, (t_uscalar_t)sizeof (val), cr);
2440 
2441 		mutex_enter(&so->so_lock);
2442 		so_lock_single(so);	/* Set SOLOCKED */
2443 		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2444 		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2445 		    soconnect_tpi_udp) {
2446 			soisconnected(so);
2447 			goto done;
2448 		}
2449 		/*
2450 		 * Send down T_CONN_REQ etc.
2451 		 * Clear fflag to avoid returning EWOULDBLOCK.
2452 		 */
2453 		fflag = 0;
2454 		ASSERT(so->so_family != AF_UNIX);
2455 		sti->sti_laddr_valid = 0;
2456 	} else if (sti->sti_laddr_len != 0) {
2457 		/*
2458 		 * If the local address or port was "any" then it may be
2459 		 * changed by the transport as a result of the
2460 		 * connect.  Invalidate the cached version if we have one.
2461 		 */
2462 		switch (so->so_family) {
2463 		case AF_INET:
2464 			ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2465 			if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2466 			    INADDR_ANY ||
2467 			    ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2468 				sti->sti_laddr_valid = 0;
2469 			break;
2470 
2471 		case AF_INET6:
2472 			ASSERT(sti->sti_laddr_len ==
2473 			    (socklen_t)sizeof (sin6_t));
2474 			if (IN6_IS_ADDR_UNSPECIFIED(
2475 			    &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2476 			    IN6_IS_ADDR_V4MAPPED_ANY(
2477 			    &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2478 			    ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2479 				sti->sti_laddr_valid = 0;
2480 			break;
2481 
2482 		default:
2483 			break;
2484 		}
2485 	}
2486 
2487 	/*
2488 	 * Check for failure of an earlier call
2489 	 */
2490 	if (so->so_error != 0)
2491 		goto so_bad;
2492 
2493 	/*
2494 	 * Send down T_CONN_REQ. Message was allocated above.
2495 	 */
2496 	conn_req.PRIM_type = T_CONN_REQ;
2497 	conn_req.DEST_length = addrlen;
2498 	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2499 	if (srclen == 0) {
2500 		conn_req.OPT_length = 0;
2501 		conn_req.OPT_offset = 0;
2502 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2503 		soappendmsg(mp, addr, addrlen);
2504 	} else {
2505 		/*
2506 		 * There is a AF_UNIX sockaddr_un to include as a source
2507 		 * address option.
2508 		 */
2509 		struct T_opthdr toh;
2510 
2511 		toh.level = SOL_SOCKET;
2512 		toh.name = SO_SRCADDR;
2513 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2514 		toh.status = 0;
2515 		conn_req.OPT_length =
2516 		    (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2517 		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2518 		    _TPI_ALIGN_TOPT(addrlen));
2519 
2520 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2521 		soappendmsg(mp, addr, addrlen);
2522 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2523 		soappendmsg(mp, &toh, sizeof (toh));
2524 		soappendmsg(mp, src, srclen);
2525 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2526 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2527 	}
2528 	/*
2529 	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2530 	 * in order to have the right state when the T_CONN_CON shows up.
2531 	 */
2532 	soisconnecting(so);
2533 	mutex_exit(&so->so_lock);
2534 
2535 	if (AU_AUDITING())
2536 		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2537 
2538 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2539 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2540 	mp = NULL;
2541 	mutex_enter(&so->so_lock);
2542 	if (error != 0)
2543 		goto bad;
2544 
2545 	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2546 		goto bad;
2547 
2548 	/* Allow other threads to access the socket */
2549 	so_unlock_single(so, SOLOCKED);
2550 	need_unlock = B_FALSE;
2551 
2552 	/*
2553 	 * Wait until we get a T_CONN_CON or an error
2554 	 */
2555 	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2556 		so_lock_single(so);	/* Set SOLOCKED */
2557 		need_unlock = B_TRUE;
2558 	}
2559 
2560 done:
2561 	freemsg(mp);
2562 	switch (error) {
2563 	case EINPROGRESS:
2564 	case EALREADY:
2565 	case EISCONN:
2566 	case EINTR:
2567 		/* Non-fatal errors */
2568 		sti->sti_laddr_valid = 0;
2569 		/* FALLTHRU */
2570 	case 0:
2571 		break;
2572 	default:
2573 		ASSERT(need_unlock);
2574 		/*
2575 		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2576 		 * and invalidate local-address cache
2577 		 */
2578 		so->so_state &= ~SS_ISCONNECTING;
2579 		sti->sti_laddr_valid = 0;
2580 		/* A discon_ind might have already unbound us */
2581 		if ((flags & _SOCONNECT_DID_BIND) &&
2582 		    (so->so_state & SS_ISBOUND)) {
2583 			int err;
2584 
2585 			err = sotpi_unbind(so, 0);
2586 			/* LINTED - statement has no conseq */
2587 			if (err) {
2588 				eprintsoline(so, err);
2589 			}
2590 		}
2591 		break;
2592 	}
2593 	if (need_unlock)
2594 		so_unlock_single(so, SOLOCKED);
2595 	mutex_exit(&so->so_lock);
2596 	return (error);
2597 
2598 so_bad:	error = sogeterr(so, B_TRUE);
2599 bad:	eprintsoline(so, error);
2600 	goto done;
2601 }
2602 
2603 /* ARGSUSED */
2604 int
2605 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2606 {
2607 	struct T_ordrel_req	ordrel_req;
2608 	mblk_t			*mp;
2609 	uint_t			old_state, state_change;
2610 	int			error = 0;
2611 	sotpi_info_t		*sti = SOTOTPI(so);
2612 
2613 	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2614 	    (void *)so, how, pr_state(so->so_state, so->so_mode)));
2615 
2616 	mutex_enter(&so->so_lock);
2617 	so_lock_single(so);	/* Set SOLOCKED */
2618 
2619 	/*
2620 	 * SunOS 4.X has no check for datagram sockets.
2621 	 * 5.X checks that it is connected (ENOTCONN)
2622 	 * X/Open requires that we check the connected state.
2623 	 */
2624 	if (!(so->so_state & SS_ISCONNECTED)) {
2625 		if (!xnet_skip_checks) {
2626 			error = ENOTCONN;
2627 			if (xnet_check_print) {
2628 				printf("sockfs: X/Open shutdown check "
2629 				    "caused ENOTCONN\n");
2630 			}
2631 		}
2632 		goto done;
2633 	}
2634 	/*
2635 	 * Record the current state and then perform any state changes.
2636 	 * Then use the difference between the old and new states to
2637 	 * determine which messages need to be sent.
2638 	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2639 	 * duplicate calls to shutdown().
2640 	 */
2641 	old_state = so->so_state;
2642 
2643 	switch (how) {
2644 	case 0:
2645 		socantrcvmore(so);
2646 		break;
2647 	case 1:
2648 		socantsendmore(so);
2649 		break;
2650 	case 2:
2651 		socantsendmore(so);
2652 		socantrcvmore(so);
2653 		break;
2654 	default:
2655 		error = EINVAL;
2656 		goto done;
2657 	}
2658 
2659 	/*
2660 	 * Assumes that the SS_CANT* flags are never cleared in the above code.
2661 	 */
2662 	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2663 	    (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2664 	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2665 
2666 	switch (state_change) {
2667 	case 0:
2668 		dprintso(so, 1,
2669 		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
2670 		    so->so_state));
2671 		goto done;
2672 
2673 	case SS_CANTRCVMORE:
2674 		mutex_exit(&so->so_lock);
2675 		strseteof(SOTOV(so), 1);
2676 		/*
2677 		 * strseteof takes care of read side wakeups,
2678 		 * pollwakeups, and signals.
2679 		 */
2680 		/*
2681 		 * Get the read lock before flushing data to avoid problems
2682 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2683 		 */
2684 		mutex_enter(&so->so_lock);
2685 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2686 		mutex_exit(&so->so_lock);
2687 
2688 		/* Flush read side queue */
2689 		strflushrq(SOTOV(so), FLUSHALL);
2690 
2691 		mutex_enter(&so->so_lock);
2692 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2693 		break;
2694 
2695 	case SS_CANTSENDMORE:
2696 		mutex_exit(&so->so_lock);
2697 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2698 		mutex_enter(&so->so_lock);
2699 		break;
2700 
2701 	case SS_CANTSENDMORE|SS_CANTRCVMORE:
2702 		mutex_exit(&so->so_lock);
2703 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2704 		strseteof(SOTOV(so), 1);
2705 		/*
2706 		 * strseteof takes care of read side wakeups,
2707 		 * pollwakeups, and signals.
2708 		 */
2709 		/*
2710 		 * Get the read lock before flushing data to avoid problems
2711 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2712 		 */
2713 		mutex_enter(&so->so_lock);
2714 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2715 		mutex_exit(&so->so_lock);
2716 
2717 		/* Flush read side queue */
2718 		strflushrq(SOTOV(so), FLUSHALL);
2719 
2720 		mutex_enter(&so->so_lock);
2721 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2722 		break;
2723 	}
2724 
2725 	ASSERT(MUTEX_HELD(&so->so_lock));
2726 
2727 	/*
2728 	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2729 	 * was set due to this call and the new state has both of them set:
2730 	 *	Send the AF_UNIX close indication
2731 	 *	For T_COTS send a discon_ind
2732 	 *
2733 	 * If cantsend was set due to this call:
2734 	 *	For T_COTSORD send an ordrel_ind
2735 	 *
2736 	 * Note that for T_CLTS there is no message sent here.
2737 	 */
2738 	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2739 	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2740 		/*
2741 		 * For SunOS 4.X compatibility we tell the other end
2742 		 * that we are unable to receive at this point.
2743 		 */
2744 		if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2745 			so_unix_close(so);
2746 
2747 		if (sti->sti_serv_type == T_COTS)
2748 			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2749 	}
2750 	if ((state_change & SS_CANTSENDMORE) &&
2751 	    (sti->sti_serv_type == T_COTS_ORD)) {
2752 		/* Send an orderly release */
2753 		ordrel_req.PRIM_type = T_ORDREL_REQ;
2754 
2755 		mutex_exit(&so->so_lock);
2756 		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2757 		    0, _ALLOC_SLEEP, cr);
2758 		/*
2759 		 * Send down the T_ORDREL_REQ even if there is flow control.
2760 		 * This prevents shutdown from blocking.
2761 		 * Note that there is no T_OK_ACK for ordrel_req.
2762 		 */
2763 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2764 		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2765 		mutex_enter(&so->so_lock);
2766 		if (error) {
2767 			eprintsoline(so, error);
2768 			goto done;
2769 		}
2770 	}
2771 
2772 done:
2773 	so_unlock_single(so, SOLOCKED);
2774 	mutex_exit(&so->so_lock);
2775 	return (error);
2776 }
2777 
2778 /*
2779  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2780  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2781  * that we have closed.
2782  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2783  * T_UNITDATA_REQ containing the same option.
2784  *
2785  * For SOCK_DGRAM half-connections (somebody connected to this end
2786  * but this end is not connect) we don't know where to send any
2787  * SO_UNIX_CLOSE.
2788  *
2789  * We have to ignore stream head errors just in case there has been
2790  * a shutdown(output).
2791  * Ignore any flow control to try to get the message more quickly to the peer.
2792  * While locally ignoring flow control solves the problem when there
2793  * is only the loopback transport on the stream it would not provide
2794  * the correct AF_UNIX socket semantics when one or more modules have
2795  * been pushed.
2796  */
2797 void
2798 so_unix_close(struct sonode *so)
2799 {
2800 	int		error;
2801 	struct T_opthdr	toh;
2802 	mblk_t		*mp;
2803 	sotpi_info_t	*sti = SOTOTPI(so);
2804 
2805 	ASSERT(MUTEX_HELD(&so->so_lock));
2806 
2807 	ASSERT(so->so_family == AF_UNIX);
2808 
2809 	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2810 	    (SS_ISCONNECTED|SS_ISBOUND))
2811 		return;
2812 
2813 	dprintso(so, 1, ("so_unix_close(%p) %s\n",
2814 	    (void *)so, pr_state(so->so_state, so->so_mode)));
2815 
2816 	toh.level = SOL_SOCKET;
2817 	toh.name = SO_UNIX_CLOSE;
2818 
2819 	/* zero length + header */
2820 	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2821 	toh.status = 0;
2822 
2823 	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2824 		struct T_optdata_req tdr;
2825 
2826 		tdr.PRIM_type = T_OPTDATA_REQ;
2827 		tdr.DATA_flag = 0;
2828 
2829 		tdr.OPT_length = (t_scalar_t)sizeof (toh);
2830 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2831 
2832 		/* NOTE: holding so_lock while sleeping */
2833 		mp = soallocproto2(&tdr, sizeof (tdr),
2834 		    &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2835 	} else {
2836 		struct T_unitdata_req	tudr;
2837 		void			*addr;
2838 		socklen_t		addrlen;
2839 		void			*src;
2840 		socklen_t		srclen;
2841 		struct T_opthdr		toh2;
2842 		t_scalar_t		size;
2843 
2844 		/* Connecteded DGRAM socket */
2845 
2846 		/*
2847 		 * For AF_UNIX the destination address is translated to
2848 		 * an internal name and the source address is passed as
2849 		 * an option.
2850 		 */
2851 		/*
2852 		 * Length and family checks.
2853 		 */
2854 		error = so_addr_verify(so, sti->sti_faddr_sa,
2855 		    (t_uscalar_t)sti->sti_faddr_len);
2856 		if (error) {
2857 			eprintsoline(so, error);
2858 			return;
2859 		}
2860 		if (sti->sti_faddr_noxlate) {
2861 			/*
2862 			 * Already have a transport internal address. Do not
2863 			 * pass any (transport internal) source address.
2864 			 */
2865 			addr = sti->sti_faddr_sa;
2866 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2867 			src = NULL;
2868 			srclen = 0;
2869 		} else {
2870 			/*
2871 			 * Pass the sockaddr_un source address as an option
2872 			 * and translate the remote address.
2873 			 * Holding so_lock thus sti_laddr_sa can not change.
2874 			 */
2875 			src = sti->sti_laddr_sa;
2876 			srclen = (socklen_t)sti->sti_laddr_len;
2877 			dprintso(so, 1,
2878 			    ("so_ux_close: srclen %d, src %p\n",
2879 			    srclen, src));
2880 			error = so_ux_addr_xlate(so,
2881 			    sti->sti_faddr_sa,
2882 			    (socklen_t)sti->sti_faddr_len, 0,
2883 			    &addr, &addrlen);
2884 			if (error) {
2885 				eprintsoline(so, error);
2886 				return;
2887 			}
2888 		}
2889 		tudr.PRIM_type = T_UNITDATA_REQ;
2890 		tudr.DEST_length = addrlen;
2891 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2892 		if (srclen == 0) {
2893 			tudr.OPT_length = (t_scalar_t)sizeof (toh);
2894 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2895 			    _TPI_ALIGN_TOPT(addrlen));
2896 
2897 			size = tudr.OPT_offset + tudr.OPT_length;
2898 			/* NOTE: holding so_lock while sleeping */
2899 			mp = soallocproto2(&tudr, sizeof (tudr),
2900 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
2901 			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2902 			soappendmsg(mp, &toh, sizeof (toh));
2903 		} else {
2904 			/*
2905 			 * There is a AF_UNIX sockaddr_un to include as a
2906 			 * source address option.
2907 			 */
2908 			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2909 			    _TPI_ALIGN_TOPT(srclen));
2910 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2911 			    _TPI_ALIGN_TOPT(addrlen));
2912 
2913 			toh2.level = SOL_SOCKET;
2914 			toh2.name = SO_SRCADDR;
2915 			toh2.len = (t_uscalar_t)(srclen +
2916 			    sizeof (struct T_opthdr));
2917 			toh2.status = 0;
2918 
2919 			size = tudr.OPT_offset + tudr.OPT_length;
2920 
2921 			/* NOTE: holding so_lock while sleeping */
2922 			mp = soallocproto2(&tudr, sizeof (tudr),
2923 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
2924 			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2925 			soappendmsg(mp, &toh, sizeof (toh));
2926 			soappendmsg(mp, &toh2, sizeof (toh2));
2927 			soappendmsg(mp, src, srclen);
2928 			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2929 		}
2930 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2931 	}
2932 	mutex_exit(&so->so_lock);
2933 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2934 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2935 	mutex_enter(&so->so_lock);
2936 }
2937 
2938 /*
2939  * Called by sotpi_recvmsg when reading a non-zero amount of data.
2940  * In addition, the caller typically verifies that there is some
2941  * potential state to clear by checking
2942  *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2943  * before calling this routine.
2944  * Note that such a check can be made without holding so_lock since
2945  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2946  * decrements sti_oobsigcnt.
2947  *
2948  * When data is read *after* the point that all pending
2949  * oob data has been consumed the oob indication is cleared.
2950  *
2951  * This logic keeps select/poll returning POLLRDBAND and
2952  * SIOCATMARK returning true until we have read past
2953  * the mark.
2954  */
2955 static void
2956 sorecv_update_oobstate(struct sonode *so)
2957 {
2958 	sotpi_info_t *sti = SOTOTPI(so);
2959 
2960 	mutex_enter(&so->so_lock);
2961 	ASSERT(so_verify_oobstate(so));
2962 	dprintso(so, 1,
2963 	    ("sorecv_update_oobstate: counts %d/%d state %s\n",
2964 	    sti->sti_oobsigcnt,
2965 	    sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
2966 	if (sti->sti_oobsigcnt == 0) {
2967 		/* No more pending oob indications */
2968 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2969 		freemsg(so->so_oobmsg);
2970 		so->so_oobmsg = NULL;
2971 	}
2972 	ASSERT(so_verify_oobstate(so));
2973 	mutex_exit(&so->so_lock);
2974 }
2975 
2976 /*
2977  * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
2978  */
2979 static int
2980 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
2981 {
2982 	sotpi_info_t *sti = SOTOTPI(so);
2983 	int	error = 0;
2984 	mblk_t *tmp = NULL;
2985 	mblk_t *pmp = NULL;
2986 	mblk_t *nmp = sti->sti_nl7c_rcv_mp;
2987 
2988 	ASSERT(nmp != NULL);
2989 
2990 	while (nmp != NULL && uiop->uio_resid > 0) {
2991 		ssize_t n;
2992 
2993 		if (DB_TYPE(nmp) == M_DATA) {
2994 			/*
2995 			 * We have some data, uiomove up to resid bytes.
2996 			 */
2997 			n = MIN(MBLKL(nmp), uiop->uio_resid);
2998 			if (n > 0)
2999 				error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
3000 			nmp->b_rptr += n;
3001 			if (nmp->b_rptr == nmp->b_wptr) {
3002 				pmp = nmp;
3003 				nmp = nmp->b_cont;
3004 			}
3005 			if (error)
3006 				break;
3007 		} else {
3008 			/*
3009 			 * We only handle data, save for caller to handle.
3010 			 */
3011 			if (pmp != NULL) {
3012 				pmp->b_cont = nmp->b_cont;
3013 			}
3014 			nmp->b_cont = NULL;
3015 			if (*rmp == NULL) {
3016 				*rmp = nmp;
3017 			} else {
3018 				tmp->b_cont = nmp;
3019 			}
3020 			nmp = nmp->b_cont;
3021 			tmp = nmp;
3022 		}
3023 	}
3024 	if (pmp != NULL) {
3025 		/* Free any mblk_t(s) which we have consumed */
3026 		pmp->b_cont = NULL;
3027 		freemsg(sti->sti_nl7c_rcv_mp);
3028 	}
3029 	if ((sti->sti_nl7c_rcv_mp = nmp) == NULL) {
3030 		/* Last mblk_t so return the saved kstrgetmsg() rval/error */
3031 		if (error == 0) {
3032 			rval_t	*p = (rval_t *)&sti->sti_nl7c_rcv_rval;
3033 
3034 			error = p->r_v.r_v2;
3035 			p->r_v.r_v2 = 0;
3036 		}
3037 		rp->r_vals = sti->sti_nl7c_rcv_rval;
3038 		sti->sti_nl7c_rcv_rval = 0;
3039 	} else {
3040 		/* More mblk_t(s) to process so no rval to return */
3041 		rp->r_vals = 0;
3042 	}
3043 	return (error);
3044 }
3045 /*
3046  * Receive the next message on the queue.
3047  * If msg_controllen is non-zero when called the caller is interested in
3048  * any received control info (options).
3049  * If msg_namelen is non-zero when called the caller is interested in
3050  * any received source address.
3051  * The routine returns with msg_control and msg_name pointing to
3052  * kmem_alloc'ed memory which the caller has to free.
3053  */
3054 /* ARGSUSED */
3055 int
3056 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
3057     struct cred *cr)
3058 {
3059 	union T_primitives	*tpr;
3060 	mblk_t			*mp;
3061 	uchar_t			pri;
3062 	int			pflag, opflag;
3063 	void			*control;
3064 	t_uscalar_t		controllen;
3065 	t_uscalar_t		namelen;
3066 	int			so_state = so->so_state; /* Snapshot */
3067 	ssize_t			saved_resid;
3068 	rval_t			rval;
3069 	int			flags;
3070 	clock_t			timout;
3071 	int			error = 0;
3072 	sotpi_info_t		*sti = SOTOTPI(so);
3073 
3074 	flags = msg->msg_flags;
3075 	msg->msg_flags = 0;
3076 
3077 	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
3078 	    (void *)so, (void *)msg, flags,
3079 	    pr_state(so->so_state, so->so_mode), so->so_error));
3080 
3081 	if (so->so_version == SOV_STREAM) {
3082 		so_update_attrs(so, SOACC);
3083 		/* The imaginary "sockmod" has been popped - act as a stream */
3084 		return (strread(SOTOV(so), uiop, cr));
3085 	}
3086 
3087 	/*
3088 	 * If we are not connected because we have never been connected
3089 	 * we return ENOTCONN. If we have been connected (but are no longer
3090 	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
3091 	 * the EOF.
3092 	 *
3093 	 * An alternative would be to post an ENOTCONN error in stream head
3094 	 * (read+write) and clear it when we're connected. However, that error
3095 	 * would cause incorrect poll/select behavior!
3096 	 */
3097 	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
3098 	    (so->so_mode & SM_CONNREQUIRED)) {
3099 		return (ENOTCONN);
3100 	}
3101 
3102 	/*
3103 	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
3104 	 * after checking that the read queue is empty) and returns zero.
3105 	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
3106 	 * is zero.
3107 	 */
3108 
3109 	if (flags & MSG_OOB) {
3110 		/* Check that the transport supports OOB */
3111 		if (!(so->so_mode & SM_EXDATA))
3112 			return (EOPNOTSUPP);
3113 		so_update_attrs(so, SOACC);
3114 		return (sorecvoob(so, msg, uiop, flags,
3115 		    (so->so_options & SO_OOBINLINE)));
3116 	}
3117 
3118 	so_update_attrs(so, SOACC);
3119 
3120 	/*
3121 	 * Set msg_controllen and msg_namelen to zero here to make it
3122 	 * simpler in the cases that no control or name is returned.
3123 	 */
3124 	controllen = msg->msg_controllen;
3125 	namelen = msg->msg_namelen;
3126 	msg->msg_controllen = 0;
3127 	msg->msg_namelen = 0;
3128 
3129 	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
3130 	    namelen, controllen));
3131 
3132 	mutex_enter(&so->so_lock);
3133 	/*
3134 	 * If an NL7C enabled socket and not waiting for write data.
3135 	 */
3136 	if ((sti->sti_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
3137 	    NL7C_ENABLED) {
3138 		if (sti->sti_nl7c_uri) {
3139 			/* Close uri processing for a previous request */
3140 			nl7c_close(so);
3141 		}
3142 		if ((so_state & SS_CANTRCVMORE) &&
3143 		    sti->sti_nl7c_rcv_mp == NULL) {
3144 			/* Nothing to process, EOF */
3145 			mutex_exit(&so->so_lock);
3146 			return (0);
3147 		} else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
3148 			/* Persistent NL7C socket, try to process request */
3149 			boolean_t ret;
3150 
3151 			ret = nl7c_process(so,
3152 			    (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
3153 			rval.r_vals = sti->sti_nl7c_rcv_rval;
3154 			error = rval.r_v.r_v2;
3155 			if (error) {
3156 				/* Error of some sort, return it */
3157 				mutex_exit(&so->so_lock);
3158 				return (error);
3159 			}
3160 			if (sti->sti_nl7c_flags &&
3161 			    ! (sti->sti_nl7c_flags & NL7C_WAITWRITE)) {
3162 				/*
3163 				 * Still an NL7C socket and no data
3164 				 * to pass up to the caller.
3165 				 */
3166 				mutex_exit(&so->so_lock);
3167 				if (ret) {
3168 					/* EOF */
3169 					return (0);
3170 				} else {
3171 					/* Need more data */
3172 					return (EAGAIN);
3173 				}
3174 			}
3175 		} else {
3176 			/*
3177 			 * Not persistent so no further NL7C processing.
3178 			 */
3179 			sti->sti_nl7c_flags = 0;
3180 		}
3181 	}
3182 	/*
3183 	 * Only one reader is allowed at any given time. This is needed
3184 	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
3185 	 *
3186 	 * This is slightly different that BSD behavior in that it fails with
3187 	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3188 	 * is single-threaded using sblock(), which is dropped while waiting
3189 	 * for data to appear. The difference shows up e.g. if one
3190 	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3191 	 * does use nonblocking io and different threads are reading each
3192 	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3193 	 * in this case as long as the read queue doesn't get empty.
3194 	 * In this implementation the thread using nonblocking io can
3195 	 * get an EWOULDBLOCK error due to the blocking thread executing
3196 	 * e.g. in the uiomove in kstrgetmsg.
3197 	 * This difference is not believed to be significant.
3198 	 */
3199 	/* Set SOREADLOCKED */
3200 	error = so_lock_read_intr(so,
3201 	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3202 	mutex_exit(&so->so_lock);
3203 	if (error)
3204 		return (error);
3205 
3206 	/*
3207 	 * Tell kstrgetmsg to not inspect the stream head errors until all
3208 	 * queued data has been consumed.
3209 	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3210 	 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3211 	 *
3212 	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3213 	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3214 	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3215 	 */
3216 	pflag = MSG_ANY | MSG_DELAYERROR;
3217 	if (flags & MSG_PEEK) {
3218 		pflag |= MSG_IPEEK;
3219 		flags &= ~MSG_WAITALL;
3220 	}
3221 	if (so->so_mode & SM_ATOMIC)
3222 		pflag |= MSG_DISCARDTAIL;
3223 
3224 	if (flags & MSG_DONTWAIT)
3225 		timout = 0;
3226 	else
3227 		timout = -1;
3228 	opflag = pflag;
3229 retry:
3230 	saved_resid = uiop->uio_resid;
3231 	pri = 0;
3232 	mp = NULL;
3233 	if (sti->sti_nl7c_rcv_mp != NULL) {
3234 		/* Already kstrgetmsg()ed saved mblk(s) from NL7C */
3235 		error = nl7c_sorecv(so, &mp, uiop, &rval);
3236 	} else {
3237 		error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3238 		    timout, &rval);
3239 	}
3240 	if (error != 0) {
3241 		/* kstrgetmsg returns ETIME when timeout expires */
3242 		if (error == ETIME)
3243 			error = EWOULDBLOCK;
3244 		goto out;
3245 	}
3246 	/*
3247 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3248 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
3249 	 */
3250 	ASSERT(!(rval.r_val1 & MORECTL));
3251 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3252 		msg->msg_flags |= MSG_TRUNC;
3253 
3254 	if (mp == NULL) {
3255 		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3256 		/*
3257 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3258 		 * The draft Posix socket spec states that the mark should
3259 		 * not be cleared when peeking. We follow the latter.
3260 		 */
3261 		if ((so->so_state &
3262 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3263 		    (uiop->uio_resid != saved_resid) &&
3264 		    !(flags & MSG_PEEK)) {
3265 			sorecv_update_oobstate(so);
3266 		}
3267 
3268 		mutex_enter(&so->so_lock);
3269 		/* Set MSG_EOR based on MOREDATA */
3270 		if (!(rval.r_val1 & MOREDATA)) {
3271 			if (so->so_state & SS_SAVEDEOR) {
3272 				msg->msg_flags |= MSG_EOR;
3273 				so->so_state &= ~SS_SAVEDEOR;
3274 			}
3275 		}
3276 		/*
3277 		 * If some data was received (i.e. not EOF) and the
3278 		 * read/recv* has not been satisfied wait for some more.
3279 		 */
3280 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3281 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3282 			mutex_exit(&so->so_lock);
3283 			pflag = opflag | MSG_NOMARK;
3284 			goto retry;
3285 		}
3286 		goto out_locked;
3287 	}
3288 
3289 	/* strsock_proto has already verified length and alignment */
3290 	tpr = (union T_primitives *)mp->b_rptr;
3291 	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3292 
3293 	switch (tpr->type) {
3294 	case T_DATA_IND: {
3295 		if ((so->so_state &
3296 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3297 		    (uiop->uio_resid != saved_resid) &&
3298 		    !(flags & MSG_PEEK)) {
3299 			sorecv_update_oobstate(so);
3300 		}
3301 
3302 		/*
3303 		 * Set msg_flags to MSG_EOR based on
3304 		 * MORE_flag and MOREDATA.
3305 		 */
3306 		mutex_enter(&so->so_lock);
3307 		so->so_state &= ~SS_SAVEDEOR;
3308 		if (!(tpr->data_ind.MORE_flag & 1)) {
3309 			if (!(rval.r_val1 & MOREDATA))
3310 				msg->msg_flags |= MSG_EOR;
3311 			else
3312 				so->so_state |= SS_SAVEDEOR;
3313 		}
3314 		freemsg(mp);
3315 		/*
3316 		 * If some data was received (i.e. not EOF) and the
3317 		 * read/recv* has not been satisfied wait for some more.
3318 		 */
3319 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3320 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3321 			mutex_exit(&so->so_lock);
3322 			pflag = opflag | MSG_NOMARK;
3323 			goto retry;
3324 		}
3325 		goto out_locked;
3326 	}
3327 	case T_UNITDATA_IND: {
3328 		void *addr;
3329 		t_uscalar_t addrlen;
3330 		void *abuf;
3331 		t_uscalar_t optlen;
3332 		void *opt;
3333 
3334 		if ((so->so_state &
3335 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3336 		    (uiop->uio_resid != saved_resid) &&
3337 		    !(flags & MSG_PEEK)) {
3338 			sorecv_update_oobstate(so);
3339 		}
3340 
3341 		if (namelen != 0) {
3342 			/* Caller wants source address */
3343 			addrlen = tpr->unitdata_ind.SRC_length;
3344 			addr = sogetoff(mp,
3345 			    tpr->unitdata_ind.SRC_offset,
3346 			    addrlen, 1);
3347 			if (addr == NULL) {
3348 				freemsg(mp);
3349 				error = EPROTO;
3350 				eprintsoline(so, error);
3351 				goto out;
3352 			}
3353 			if (so->so_family == AF_UNIX) {
3354 				/*
3355 				 * Can not use the transport level address.
3356 				 * If there is a SO_SRCADDR option carrying
3357 				 * the socket level address it will be
3358 				 * extracted below.
3359 				 */
3360 				addr = NULL;
3361 				addrlen = 0;
3362 			}
3363 		}
3364 		optlen = tpr->unitdata_ind.OPT_length;
3365 		if (optlen != 0) {
3366 			t_uscalar_t ncontrollen;
3367 
3368 			/*
3369 			 * Extract any source address option.
3370 			 * Determine how large cmsg buffer is needed.
3371 			 */
3372 			opt = sogetoff(mp,
3373 			    tpr->unitdata_ind.OPT_offset,
3374 			    optlen, __TPI_ALIGN_SIZE);
3375 
3376 			if (opt == NULL) {
3377 				freemsg(mp);
3378 				error = EPROTO;
3379 				eprintsoline(so, error);
3380 				goto out;
3381 			}
3382 			if (so->so_family == AF_UNIX)
3383 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3384 			ncontrollen = so_cmsglen(mp, opt, optlen,
3385 			    !(flags & MSG_XPG4_2));
3386 			if (controllen != 0)
3387 				controllen = ncontrollen;
3388 			else if (ncontrollen != 0)
3389 				msg->msg_flags |= MSG_CTRUNC;
3390 		} else {
3391 			controllen = 0;
3392 		}
3393 
3394 		if (namelen != 0) {
3395 			/*
3396 			 * Return address to caller.
3397 			 * Caller handles truncation if length
3398 			 * exceeds msg_namelen.
3399 			 * NOTE: AF_UNIX NUL termination is ensured by
3400 			 * the sender's copyin_name().
3401 			 */
3402 			abuf = kmem_alloc(addrlen, KM_SLEEP);
3403 
3404 			bcopy(addr, abuf, addrlen);
3405 			msg->msg_name = abuf;
3406 			msg->msg_namelen = addrlen;
3407 		}
3408 
3409 		if (controllen != 0) {
3410 			/*
3411 			 * Return control msg to caller.
3412 			 * Caller handles truncation if length
3413 			 * exceeds msg_controllen.
3414 			 */
3415 			control = kmem_zalloc(controllen, KM_SLEEP);
3416 
3417 			error = so_opt2cmsg(mp, opt, optlen,
3418 			    !(flags & MSG_XPG4_2),
3419 			    control, controllen);
3420 			if (error) {
3421 				freemsg(mp);
3422 				if (msg->msg_namelen != 0)
3423 					kmem_free(msg->msg_name,
3424 					    msg->msg_namelen);
3425 				kmem_free(control, controllen);
3426 				eprintsoline(so, error);
3427 				goto out;
3428 			}
3429 			msg->msg_control = control;
3430 			msg->msg_controllen = controllen;
3431 		}
3432 
3433 		freemsg(mp);
3434 		goto out;
3435 	}
3436 	case T_OPTDATA_IND: {
3437 		struct T_optdata_req *tdr;
3438 		void *opt;
3439 		t_uscalar_t optlen;
3440 
3441 		if ((so->so_state &
3442 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3443 		    (uiop->uio_resid != saved_resid) &&
3444 		    !(flags & MSG_PEEK)) {
3445 			sorecv_update_oobstate(so);
3446 		}
3447 
3448 		tdr = (struct T_optdata_req *)mp->b_rptr;
3449 		optlen = tdr->OPT_length;
3450 		if (optlen != 0) {
3451 			t_uscalar_t ncontrollen;
3452 			/*
3453 			 * Determine how large cmsg buffer is needed.
3454 			 */
3455 			opt = sogetoff(mp,
3456 			    tpr->optdata_ind.OPT_offset,
3457 			    optlen, __TPI_ALIGN_SIZE);
3458 
3459 			if (opt == NULL) {
3460 				freemsg(mp);
3461 				error = EPROTO;
3462 				eprintsoline(so, error);
3463 				goto out;
3464 			}
3465 
3466 			ncontrollen = so_cmsglen(mp, opt, optlen,
3467 			    !(flags & MSG_XPG4_2));
3468 			if (controllen != 0)
3469 				controllen = ncontrollen;
3470 			else if (ncontrollen != 0)
3471 				msg->msg_flags |= MSG_CTRUNC;
3472 		} else {
3473 			controllen = 0;
3474 		}
3475 
3476 		if (controllen != 0) {
3477 			/*
3478 			 * Return control msg to caller.
3479 			 * Caller handles truncation if length
3480 			 * exceeds msg_controllen.
3481 			 */
3482 			control = kmem_zalloc(controllen, KM_SLEEP);
3483 
3484 			error = so_opt2cmsg(mp, opt, optlen,
3485 			    !(flags & MSG_XPG4_2),
3486 			    control, controllen);
3487 			if (error) {
3488 				freemsg(mp);
3489 				kmem_free(control, controllen);
3490 				eprintsoline(so, error);
3491 				goto out;
3492 			}
3493 			msg->msg_control = control;
3494 			msg->msg_controllen = controllen;
3495 		}
3496 
3497 		/*
3498 		 * Set msg_flags to MSG_EOR based on
3499 		 * DATA_flag and MOREDATA.
3500 		 */
3501 		mutex_enter(&so->so_lock);
3502 		so->so_state &= ~SS_SAVEDEOR;
3503 		if (!(tpr->data_ind.MORE_flag & 1)) {
3504 			if (!(rval.r_val1 & MOREDATA))
3505 				msg->msg_flags |= MSG_EOR;
3506 			else
3507 				so->so_state |= SS_SAVEDEOR;
3508 		}
3509 		freemsg(mp);
3510 		/*
3511 		 * If some data was received (i.e. not EOF) and the
3512 		 * read/recv* has not been satisfied wait for some more.
3513 		 * Not possible to wait if control info was received.
3514 		 */
3515 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3516 		    controllen == 0 &&
3517 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3518 			mutex_exit(&so->so_lock);
3519 			pflag = opflag | MSG_NOMARK;
3520 			goto retry;
3521 		}
3522 		goto out_locked;
3523 	}
3524 	case T_EXDATA_IND: {
3525 		dprintso(so, 1,
3526 		    ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3527 		    "state %s\n",
3528 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3529 		    saved_resid - uiop->uio_resid,
3530 		    pr_state(so->so_state, so->so_mode)));
3531 		/*
3532 		 * kstrgetmsg handles MSGMARK so there is nothing to
3533 		 * inspect in the T_EXDATA_IND.
3534 		 * strsock_proto makes the stream head queue the T_EXDATA_IND
3535 		 * as a separate message with no M_DATA component. Furthermore,
3536 		 * the stream head does not consolidate M_DATA messages onto
3537 		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3538 		 * remains a message by itself. This is needed since MSGMARK
3539 		 * marks both the whole message as well as the last byte
3540 		 * of the message.
3541 		 */
3542 		freemsg(mp);
3543 		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
3544 		if (flags & MSG_PEEK) {
3545 			/*
3546 			 * Even though we are peeking we consume the
3547 			 * T_EXDATA_IND thereby moving the mark information
3548 			 * to SS_RCVATMARK. Then the oob code below will
3549 			 * retry the peeking kstrgetmsg.
3550 			 * Note that the stream head read queue is
3551 			 * never flushed without holding SOREADLOCKED
3552 			 * thus the T_EXDATA_IND can not disappear
3553 			 * underneath us.
3554 			 */
3555 			dprintso(so, 1,
3556 			    ("sotpi_recvmsg: consume EXDATA_IND "
3557 			    "counts %d/%d state %s\n",
3558 			    sti->sti_oobsigcnt,
3559 			    sti->sti_oobcnt,
3560 			    pr_state(so->so_state, so->so_mode)));
3561 
3562 			pflag = MSG_ANY | MSG_DELAYERROR;
3563 			if (so->so_mode & SM_ATOMIC)
3564 				pflag |= MSG_DISCARDTAIL;
3565 
3566 			pri = 0;
3567 			mp = NULL;
3568 
3569 			error = kstrgetmsg(SOTOV(so), &mp, uiop,
3570 			    &pri, &pflag, (clock_t)-1, &rval);
3571 			ASSERT(uiop->uio_resid == saved_resid);
3572 
3573 			if (error) {
3574 #ifdef SOCK_DEBUG
3575 				if (error != EWOULDBLOCK && error != EINTR) {
3576 					eprintsoline(so, error);
3577 				}
3578 #endif /* SOCK_DEBUG */
3579 				goto out;
3580 			}
3581 			ASSERT(mp);
3582 			tpr = (union T_primitives *)mp->b_rptr;
3583 			ASSERT(tpr->type == T_EXDATA_IND);
3584 			freemsg(mp);
3585 		} /* end "if (flags & MSG_PEEK)" */
3586 
3587 		/*
3588 		 * Decrement the number of queued and pending oob.
3589 		 *
3590 		 * SS_RCVATMARK is cleared when we read past a mark.
3591 		 * SS_HAVEOOBDATA is cleared when we've read past the
3592 		 * last mark.
3593 		 * SS_OOBPEND is cleared if we've read past the last
3594 		 * mark and no (new) SIGURG has been posted.
3595 		 */
3596 		mutex_enter(&so->so_lock);
3597 		ASSERT(so_verify_oobstate(so));
3598 		ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3599 		ASSERT(sti->sti_oobsigcnt > 0);
3600 		sti->sti_oobsigcnt--;
3601 		ASSERT(sti->sti_oobcnt > 0);
3602 		sti->sti_oobcnt--;
3603 		/*
3604 		 * Since the T_EXDATA_IND has been removed from the stream
3605 		 * head, but we have not read data past the mark,
3606 		 * sockfs needs to track that the socket is still at the mark.
3607 		 *
3608 		 * Since no data was received call kstrgetmsg again to wait
3609 		 * for data.
3610 		 */
3611 		so->so_state |= SS_RCVATMARK;
3612 		mutex_exit(&so->so_lock);
3613 		dprintso(so, 1,
3614 		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3615 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3616 		    pr_state(so->so_state, so->so_mode)));
3617 		pflag = opflag;
3618 		goto retry;
3619 	}
3620 	default:
3621 		cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3622 		    (void *)so, tpr->type, (void *)mp);
3623 		ASSERT(0);
3624 		freemsg(mp);
3625 		error = EPROTO;
3626 		eprintsoline(so, error);
3627 		goto out;
3628 	}
3629 	/* NOTREACHED */
3630 out:
3631 	mutex_enter(&so->so_lock);
3632 out_locked:
3633 	so_unlock_read(so);	/* Clear SOREADLOCKED */
3634 	mutex_exit(&so->so_lock);
3635 	return (error);
3636 }
3637 
3638 /*
3639  * Sending data with options on a datagram socket.
3640  * Assumes caller has verified that SS_ISBOUND etc. are set.
3641  */
3642 static int
3643 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3644     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3645 {
3646 	struct T_unitdata_req	tudr;
3647 	mblk_t			*mp;
3648 	int			error;
3649 	void			*addr;
3650 	socklen_t		addrlen;
3651 	void			*src;
3652 	socklen_t		srclen;
3653 	ssize_t			len;
3654 	int			size;
3655 	struct T_opthdr		toh;
3656 	struct fdbuf		*fdbuf;
3657 	t_uscalar_t		optlen;
3658 	void			*fds;
3659 	int			fdlen;
3660 	sotpi_info_t		*sti = SOTOTPI(so);
3661 
3662 	ASSERT(name && namelen);
3663 	ASSERT(control && controllen);
3664 
3665 	len = uiop->uio_resid;
3666 	if (len > (ssize_t)sti->sti_tidu_size) {
3667 		return (EMSGSIZE);
3668 	}
3669 
3670 	/*
3671 	 * For AF_UNIX the destination address is translated to an internal
3672 	 * name and the source address is passed as an option.
3673 	 * Also, file descriptors are passed as file pointers in an
3674 	 * option.
3675 	 */
3676 
3677 	/*
3678 	 * Length and family checks.
3679 	 */
3680 	error = so_addr_verify(so, name, namelen);
3681 	if (error) {
3682 		eprintsoline(so, error);
3683 		return (error);
3684 	}
3685 	if (so->so_family == AF_UNIX) {
3686 		if (sti->sti_faddr_noxlate) {
3687 			/*
3688 			 * Already have a transport internal address. Do not
3689 			 * pass any (transport internal) source address.
3690 			 */
3691 			addr = name;
3692 			addrlen = namelen;
3693 			src = NULL;
3694 			srclen = 0;
3695 		} else {
3696 			/*
3697 			 * Pass the sockaddr_un source address as an option
3698 			 * and translate the remote address.
3699 			 *
3700 			 * Note that this code does not prevent sti_laddr_sa
3701 			 * from changing while it is being used. Thus
3702 			 * if an unbind+bind occurs concurrently with this
3703 			 * send the peer might see a partially new and a
3704 			 * partially old "from" address.
3705 			 */
3706 			src = sti->sti_laddr_sa;
3707 			srclen = (t_uscalar_t)sti->sti_laddr_len;
3708 			dprintso(so, 1,
3709 			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3710 			    srclen, src));
3711 			error = so_ux_addr_xlate(so, name, namelen,
3712 			    (flags & MSG_XPG4_2),
3713 			    &addr, &addrlen);
3714 			if (error) {
3715 				eprintsoline(so, error);
3716 				return (error);
3717 			}
3718 		}
3719 	} else {
3720 		addr = name;
3721 		addrlen = namelen;
3722 		src = NULL;
3723 		srclen = 0;
3724 	}
3725 	optlen = so_optlen(control, controllen,
3726 	    !(flags & MSG_XPG4_2));
3727 	tudr.PRIM_type = T_UNITDATA_REQ;
3728 	tudr.DEST_length = addrlen;
3729 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3730 	if (srclen != 0)
3731 		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3732 		    _TPI_ALIGN_TOPT(srclen));
3733 	else
3734 		tudr.OPT_length = optlen;
3735 	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3736 	    _TPI_ALIGN_TOPT(addrlen));
3737 
3738 	size = tudr.OPT_offset + tudr.OPT_length;
3739 
3740 	/*
3741 	 * File descriptors only when SM_FDPASSING set.
3742 	 */
3743 	error = so_getfdopt(control, controllen,
3744 	    !(flags & MSG_XPG4_2), &fds, &fdlen);
3745 	if (error)
3746 		return (error);
3747 	if (fdlen != -1) {
3748 		if (!(so->so_mode & SM_FDPASSING))
3749 			return (EOPNOTSUPP);
3750 
3751 		error = fdbuf_create(fds, fdlen, &fdbuf);
3752 		if (error)
3753 			return (error);
3754 		mp = fdbuf_allocmsg(size, fdbuf);
3755 	} else {
3756 		mp = soallocproto(size, _ALLOC_INTR, CRED());
3757 		if (mp == NULL) {
3758 			/*
3759 			 * Caught a signal waiting for memory.
3760 			 * Let send* return EINTR.
3761 			 */
3762 			return (EINTR);
3763 		}
3764 	}
3765 	soappendmsg(mp, &tudr, sizeof (tudr));
3766 	soappendmsg(mp, addr, addrlen);
3767 	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3768 
3769 	if (fdlen != -1) {
3770 		ASSERT(fdbuf != NULL);
3771 		toh.level = SOL_SOCKET;
3772 		toh.name = SO_FILEP;
3773 		toh.len = fdbuf->fd_size +
3774 		    (t_uscalar_t)sizeof (struct T_opthdr);
3775 		toh.status = 0;
3776 		soappendmsg(mp, &toh, sizeof (toh));
3777 		soappendmsg(mp, fdbuf, fdbuf->fd_size);
3778 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3779 	}
3780 	if (srclen != 0) {
3781 		/*
3782 		 * There is a AF_UNIX sockaddr_un to include as a source
3783 		 * address option.
3784 		 */
3785 		toh.level = SOL_SOCKET;
3786 		toh.name = SO_SRCADDR;
3787 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3788 		toh.status = 0;
3789 		soappendmsg(mp, &toh, sizeof (toh));
3790 		soappendmsg(mp, src, srclen);
3791 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3792 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3793 	}
3794 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3795 	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3796 	/* At most 3 bytes left in the message */
3797 	ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3798 	ASSERT(MBLKL(mp) <= (ssize_t)size);
3799 
3800 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3801 	if (AU_AUDITING())
3802 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3803 
3804 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3805 #ifdef SOCK_DEBUG
3806 	if (error) {
3807 		eprintsoline(so, error);
3808 	}
3809 #endif /* SOCK_DEBUG */
3810 	return (error);
3811 }
3812 
3813 /*
3814  * Sending data with options on a connected stream socket.
3815  * Assumes caller has verified that SS_ISCONNECTED is set.
3816  */
3817 static int
3818 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3819     t_uscalar_t controllen, int flags)
3820 {
3821 	struct T_optdata_req	tdr;
3822 	mblk_t			*mp;
3823 	int			error;
3824 	ssize_t			iosize;
3825 	int			size;
3826 	struct fdbuf		*fdbuf;
3827 	t_uscalar_t		optlen;
3828 	void			*fds;
3829 	int			fdlen;
3830 	struct T_opthdr		toh;
3831 	sotpi_info_t		*sti = SOTOTPI(so);
3832 
3833 	dprintso(so, 1,
3834 	    ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3835 
3836 	/*
3837 	 * Has to be bound and connected. However, since no locks are
3838 	 * held the state could have changed after sotpi_sendmsg checked it
3839 	 * thus it is not possible to ASSERT on the state.
3840 	 */
3841 
3842 	/* Options on connection-oriented only when SM_OPTDATA set. */
3843 	if (!(so->so_mode & SM_OPTDATA))
3844 		return (EOPNOTSUPP);
3845 
3846 	do {
3847 		/*
3848 		 * Set the MORE flag if uio_resid does not fit in this
3849 		 * message or if the caller passed in "more".
3850 		 * Error for transports with zero tidu_size.
3851 		 */
3852 		tdr.PRIM_type = T_OPTDATA_REQ;
3853 		iosize = sti->sti_tidu_size;
3854 		if (iosize <= 0)
3855 			return (EMSGSIZE);
3856 		if (uiop->uio_resid > iosize) {
3857 			tdr.DATA_flag = 1;
3858 		} else {
3859 			if (more)
3860 				tdr.DATA_flag = 1;
3861 			else
3862 				tdr.DATA_flag = 0;
3863 			iosize = uiop->uio_resid;
3864 		}
3865 		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3866 		    tdr.DATA_flag, iosize));
3867 
3868 		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3869 		tdr.OPT_length = optlen;
3870 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3871 
3872 		size = (int)sizeof (tdr) + optlen;
3873 		/*
3874 		 * File descriptors only when SM_FDPASSING set.
3875 		 */
3876 		error = so_getfdopt(control, controllen,
3877 		    !(flags & MSG_XPG4_2), &fds, &fdlen);
3878 		if (error)
3879 			return (error);
3880 		if (fdlen != -1) {
3881 			if (!(so->so_mode & SM_FDPASSING))
3882 				return (EOPNOTSUPP);
3883 
3884 			error = fdbuf_create(fds, fdlen, &fdbuf);
3885 			if (error)
3886 				return (error);
3887 			mp = fdbuf_allocmsg(size, fdbuf);
3888 		} else {
3889 			mp = soallocproto(size, _ALLOC_INTR, CRED());
3890 			if (mp == NULL) {
3891 				/*
3892 				 * Caught a signal waiting for memory.
3893 				 * Let send* return EINTR.
3894 				 */
3895 				return (EINTR);
3896 			}
3897 		}
3898 		soappendmsg(mp, &tdr, sizeof (tdr));
3899 
3900 		if (fdlen != -1) {
3901 			ASSERT(fdbuf != NULL);
3902 			toh.level = SOL_SOCKET;
3903 			toh.name = SO_FILEP;
3904 			toh.len = fdbuf->fd_size +
3905 			    (t_uscalar_t)sizeof (struct T_opthdr);
3906 			toh.status = 0;
3907 			soappendmsg(mp, &toh, sizeof (toh));
3908 			soappendmsg(mp, fdbuf, fdbuf->fd_size);
3909 			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3910 		}
3911 		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3912 		/* At most 3 bytes left in the message */
3913 		ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3914 		ASSERT(MBLKL(mp) <= (ssize_t)size);
3915 
3916 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3917 
3918 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3919 		    0, MSG_BAND, 0);
3920 		if (error) {
3921 			eprintsoline(so, error);
3922 			return (error);
3923 		}
3924 		control = NULL;
3925 		if (uiop->uio_resid > 0) {
3926 			/*
3927 			 * Recheck for fatal errors. Fail write even though
3928 			 * some data have been written. This is consistent
3929 			 * with strwrite semantics and BSD sockets semantics.
3930 			 */
3931 			if (so->so_state & SS_CANTSENDMORE) {
3932 				eprintsoline(so, error);
3933 				return (EPIPE);
3934 			}
3935 			if (so->so_error != 0) {
3936 				mutex_enter(&so->so_lock);
3937 				error = sogeterr(so, B_TRUE);
3938 				mutex_exit(&so->so_lock);
3939 				if (error != 0) {
3940 					eprintsoline(so, error);
3941 					return (error);
3942 				}
3943 			}
3944 		}
3945 	} while (uiop->uio_resid > 0);
3946 	return (0);
3947 }
3948 
3949 /*
3950  * Sending data on a datagram socket.
3951  * Assumes caller has verified that SS_ISBOUND etc. are set.
3952  *
3953  * For AF_UNIX the destination address is translated to an internal
3954  * name and the source address is passed as an option.
3955  */
3956 int
3957 sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
3958     struct uio *uiop, int flags)
3959 {
3960 	struct T_unitdata_req	tudr;
3961 	mblk_t			*mp;
3962 	int			error;
3963 	void			*addr;
3964 	socklen_t		addrlen;
3965 	void			*src;
3966 	socklen_t		srclen;
3967 	ssize_t			len;
3968 	sotpi_info_t		*sti = SOTOTPI(so);
3969 
3970 	ASSERT(name != NULL && namelen != 0);
3971 
3972 	len = uiop->uio_resid;
3973 	if (len > sti->sti_tidu_size) {
3974 		error = EMSGSIZE;
3975 		goto done;
3976 	}
3977 
3978 	/* Length and family checks */
3979 	error = so_addr_verify(so, name, namelen);
3980 	if (error != 0)
3981 		goto done;
3982 
3983 	if (sti->sti_direct)
3984 		return (sodgram_direct(so, name, namelen, uiop, flags));
3985 
3986 	if (so->so_family == AF_UNIX) {
3987 		if (sti->sti_faddr_noxlate) {
3988 			/*
3989 			 * Already have a transport internal address. Do not
3990 			 * pass any (transport internal) source address.
3991 			 */
3992 			addr = name;
3993 			addrlen = namelen;
3994 			src = NULL;
3995 			srclen = 0;
3996 		} else {
3997 			/*
3998 			 * Pass the sockaddr_un source address as an option
3999 			 * and translate the remote address.
4000 			 *
4001 			 * Note that this code does not prevent sti_laddr_sa
4002 			 * from changing while it is being used. Thus
4003 			 * if an unbind+bind occurs concurrently with this
4004 			 * send the peer might see a partially new and a
4005 			 * partially old "from" address.
4006 			 */
4007 			src = sti->sti_laddr_sa;
4008 			srclen = (socklen_t)sti->sti_laddr_len;
4009 			dprintso(so, 1,
4010 			    ("sosend_dgram UNIX: srclen %d, src %p\n",
4011 			    srclen, src));
4012 			error = so_ux_addr_xlate(so, name, namelen,
4013 			    (flags & MSG_XPG4_2),
4014 			    &addr, &addrlen);
4015 			if (error) {
4016 				eprintsoline(so, error);
4017 				goto done;
4018 			}
4019 		}
4020 	} else {
4021 		addr = name;
4022 		addrlen = namelen;
4023 		src = NULL;
4024 		srclen = 0;
4025 	}
4026 	tudr.PRIM_type = T_UNITDATA_REQ;
4027 	tudr.DEST_length = addrlen;
4028 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4029 	if (srclen == 0) {
4030 		tudr.OPT_length = 0;
4031 		tudr.OPT_offset = 0;
4032 
4033 		mp = soallocproto2(&tudr, sizeof (tudr),
4034 		    addr, addrlen, 0, _ALLOC_INTR, CRED());
4035 		if (mp == NULL) {
4036 			/*
4037 			 * Caught a signal waiting for memory.
4038 			 * Let send* return EINTR.
4039 			 */
4040 			error = EINTR;
4041 			goto done;
4042 		}
4043 	} else {
4044 		/*
4045 		 * There is a AF_UNIX sockaddr_un to include as a source
4046 		 * address option.
4047 		 */
4048 		struct T_opthdr toh;
4049 		ssize_t size;
4050 
4051 		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
4052 		    _TPI_ALIGN_TOPT(srclen));
4053 		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
4054 		    _TPI_ALIGN_TOPT(addrlen));
4055 
4056 		toh.level = SOL_SOCKET;
4057 		toh.name = SO_SRCADDR;
4058 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
4059 		toh.status = 0;
4060 
4061 		size = tudr.OPT_offset + tudr.OPT_length;
4062 		mp = soallocproto2(&tudr, sizeof (tudr),
4063 		    addr, addrlen, size, _ALLOC_INTR, CRED());
4064 		if (mp == NULL) {
4065 			/*
4066 			 * Caught a signal waiting for memory.
4067 			 * Let send* return EINTR.
4068 			 */
4069 			error = EINTR;
4070 			goto done;
4071 		}
4072 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
4073 		soappendmsg(mp, &toh, sizeof (toh));
4074 		soappendmsg(mp, src, srclen);
4075 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
4076 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
4077 	}
4078 
4079 	if (AU_AUDITING())
4080 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4081 
4082 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4083 done:
4084 #ifdef SOCK_DEBUG
4085 	if (error) {
4086 		eprintsoline(so, error);
4087 	}
4088 #endif /* SOCK_DEBUG */
4089 	return (error);
4090 }
4091 
4092 /*
4093  * Sending data on a connected stream socket.
4094  * Assumes caller has verified that SS_ISCONNECTED is set.
4095  */
4096 int
4097 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
4098     int sflag)
4099 {
4100 	struct T_data_req	tdr;
4101 	mblk_t			*mp;
4102 	int			error;
4103 	ssize_t			iosize;
4104 	sotpi_info_t		*sti = SOTOTPI(so);
4105 
4106 	dprintso(so, 1,
4107 	    ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
4108 	    (void *)so, uiop->uio_resid, prim, sflag));
4109 
4110 	/*
4111 	 * Has to be bound and connected. However, since no locks are
4112 	 * held the state could have changed after sotpi_sendmsg checked it
4113 	 * thus it is not possible to ASSERT on the state.
4114 	 */
4115 
4116 	do {
4117 		/*
4118 		 * Set the MORE flag if uio_resid does not fit in this
4119 		 * message or if the caller passed in "more".
4120 		 * Error for transports with zero tidu_size.
4121 		 */
4122 		tdr.PRIM_type = prim;
4123 		iosize = sti->sti_tidu_size;
4124 		if (iosize <= 0)
4125 			return (EMSGSIZE);
4126 		if (uiop->uio_resid > iosize) {
4127 			tdr.MORE_flag = 1;
4128 		} else {
4129 			if (more)
4130 				tdr.MORE_flag = 1;
4131 			else
4132 				tdr.MORE_flag = 0;
4133 			iosize = uiop->uio_resid;
4134 		}
4135 		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4136 		    prim, tdr.MORE_flag, iosize));
4137 		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
4138 		if (mp == NULL) {
4139 			/*
4140 			 * Caught a signal waiting for memory.
4141 			 * Let send* return EINTR.
4142 			 */
4143 			return (EINTR);
4144 		}
4145 
4146 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4147 		    0, sflag | MSG_BAND, 0);
4148 		if (error) {
4149 			eprintsoline(so, error);
4150 			return (error);
4151 		}
4152 		if (uiop->uio_resid > 0) {
4153 			/*
4154 			 * Recheck for fatal errors. Fail write even though
4155 			 * some data have been written. This is consistent
4156 			 * with strwrite semantics and BSD sockets semantics.
4157 			 */
4158 			if (so->so_state & SS_CANTSENDMORE) {
4159 				eprintsoline(so, error);
4160 				return (EPIPE);
4161 			}
4162 			if (so->so_error != 0) {
4163 				mutex_enter(&so->so_lock);
4164 				error = sogeterr(so, B_TRUE);
4165 				mutex_exit(&so->so_lock);
4166 				if (error != 0) {
4167 					eprintsoline(so, error);
4168 					return (error);
4169 				}
4170 			}
4171 		}
4172 	} while (uiop->uio_resid > 0);
4173 	return (0);
4174 }
4175 
4176 /*
4177  * Check the state for errors and call the appropriate send function.
4178  *
4179  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4180  * this function issues a setsockopt to toggle SO_DONTROUTE before and
4181  * after sending the message.
4182  */
4183 static int
4184 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
4185     struct cred *cr)
4186 {
4187 	int		so_state;
4188 	int		so_mode;
4189 	int		error;
4190 	struct sockaddr *name;
4191 	t_uscalar_t	namelen;
4192 	int		dontroute;
4193 	int		flags;
4194 	sotpi_info_t	*sti = SOTOTPI(so);
4195 
4196 	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4197 	    (void *)so, (void *)msg, msg->msg_flags,
4198 	    pr_state(so->so_state, so->so_mode), so->so_error));
4199 
4200 	if (so->so_version == SOV_STREAM) {
4201 		/* The imaginary "sockmod" has been popped - act as a stream */
4202 		so_update_attrs(so, SOMOD);
4203 		return (strwrite(SOTOV(so), uiop, cr));
4204 	}
4205 
4206 	mutex_enter(&so->so_lock);
4207 	so_state = so->so_state;
4208 
4209 	if (so_state & SS_CANTSENDMORE) {
4210 		mutex_exit(&so->so_lock);
4211 		return (EPIPE);
4212 	}
4213 
4214 	if (so->so_error != 0) {
4215 		error = sogeterr(so, B_TRUE);
4216 		if (error != 0) {
4217 			mutex_exit(&so->so_lock);
4218 			return (error);
4219 		}
4220 	}
4221 
4222 	name = (struct sockaddr *)msg->msg_name;
4223 	namelen = msg->msg_namelen;
4224 
4225 	so_mode = so->so_mode;
4226 
4227 	if (name == NULL) {
4228 		if (!(so_state & SS_ISCONNECTED)) {
4229 			mutex_exit(&so->so_lock);
4230 			if (so_mode & SM_CONNREQUIRED)
4231 				return (ENOTCONN);
4232 			else
4233 				return (EDESTADDRREQ);
4234 		}
4235 		if (so_mode & SM_CONNREQUIRED) {
4236 			name = NULL;
4237 			namelen = 0;
4238 		} else {
4239 			/*
4240 			 * Note that this code does not prevent sti_faddr_sa
4241 			 * from changing while it is being used. Thus
4242 			 * if an "unconnect"+connect occurs concurrently with
4243 			 * this send the datagram might be delivered to a
4244 			 * garbaled address.
4245 			 */
4246 			ASSERT(sti->sti_faddr_sa);
4247 			name = sti->sti_faddr_sa;
4248 			namelen = (t_uscalar_t)sti->sti_faddr_len;
4249 		}
4250 	} else {
4251 		if (!(so_state & SS_ISCONNECTED) &&
4252 		    (so_mode & SM_CONNREQUIRED)) {
4253 			/* Required but not connected */
4254 			mutex_exit(&so->so_lock);
4255 			return (ENOTCONN);
4256 		}
4257 		/*
4258 		 * Ignore the address on connection-oriented sockets.
4259 		 * Just like BSD this code does not generate an error for
4260 		 * TCP (a CONNREQUIRED socket) when sending to an address
4261 		 * passed in with sendto/sendmsg. Instead the data is
4262 		 * delivered on the connection as if no address had been
4263 		 * supplied.
4264 		 */
4265 		if ((so_state & SS_ISCONNECTED) &&
4266 		    !(so_mode & SM_CONNREQUIRED)) {
4267 			mutex_exit(&so->so_lock);
4268 			return (EISCONN);
4269 		}
4270 		if (!(so_state & SS_ISBOUND)) {
4271 			so_lock_single(so);	/* Set SOLOCKED */
4272 			error = sotpi_bind(so, NULL, 0,
4273 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4274 			so_unlock_single(so, SOLOCKED);
4275 			if (error) {
4276 				mutex_exit(&so->so_lock);
4277 				eprintsoline(so, error);
4278 				return (error);
4279 			}
4280 		}
4281 		/*
4282 		 * Handle delayed datagram errors. These are only queued
4283 		 * when the application sets SO_DGRAM_ERRIND.
4284 		 * Return the error if we are sending to the address
4285 		 * that was returned in the last T_UDERROR_IND.
4286 		 * If sending to some other address discard the delayed
4287 		 * error indication.
4288 		 */
4289 		if (sti->sti_delayed_error) {
4290 			struct T_uderror_ind	*tudi;
4291 			void			*addr;
4292 			t_uscalar_t		addrlen;
4293 			boolean_t		match = B_FALSE;
4294 
4295 			ASSERT(sti->sti_eaddr_mp);
4296 			error = sti->sti_delayed_error;
4297 			sti->sti_delayed_error = 0;
4298 			tudi =
4299 			    (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4300 			addrlen = tudi->DEST_length;
4301 			addr = sogetoff(sti->sti_eaddr_mp,
4302 			    tudi->DEST_offset, addrlen, 1);
4303 			ASSERT(addr);	/* Checked by strsock_proto */
4304 			switch (so->so_family) {
4305 			case AF_INET: {
4306 				/* Compare just IP address and port */
4307 				sin_t *sin1 = (sin_t *)name;
4308 				sin_t *sin2 = (sin_t *)addr;
4309 
4310 				if (addrlen == sizeof (sin_t) &&
4311 				    namelen == addrlen &&
4312 				    sin1->sin_port == sin2->sin_port &&
4313 				    sin1->sin_addr.s_addr ==
4314 				    sin2->sin_addr.s_addr)
4315 					match = B_TRUE;
4316 				break;
4317 			}
4318 			case AF_INET6: {
4319 				/* Compare just IP address and port. Not flow */
4320 				sin6_t *sin1 = (sin6_t *)name;
4321 				sin6_t *sin2 = (sin6_t *)addr;
4322 
4323 				if (addrlen == sizeof (sin6_t) &&
4324 				    namelen == addrlen &&
4325 				    sin1->sin6_port == sin2->sin6_port &&
4326 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4327 				    &sin2->sin6_addr))
4328 					match = B_TRUE;
4329 				break;
4330 			}
4331 			case AF_UNIX:
4332 			default:
4333 				if (namelen == addrlen &&
4334 				    bcmp(name, addr, namelen) == 0)
4335 					match = B_TRUE;
4336 			}
4337 			if (match) {
4338 				freemsg(sti->sti_eaddr_mp);
4339 				sti->sti_eaddr_mp = NULL;
4340 				mutex_exit(&so->so_lock);
4341 #ifdef DEBUG
4342 				dprintso(so, 0,
4343 				    ("sockfs delayed error %d for %s\n",
4344 				    error,
4345 				    pr_addr(so->so_family, name, namelen)));
4346 #endif /* DEBUG */
4347 				return (error);
4348 			}
4349 			freemsg(sti->sti_eaddr_mp);
4350 			sti->sti_eaddr_mp = NULL;
4351 		}
4352 	}
4353 	mutex_exit(&so->so_lock);
4354 
4355 	flags = msg->msg_flags;
4356 	dontroute = 0;
4357 	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4358 		uint32_t	val;
4359 
4360 		val = 1;
4361 		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4362 		    &val, (t_uscalar_t)sizeof (val), cr);
4363 		if (error)
4364 			return (error);
4365 		dontroute = 1;
4366 	}
4367 
4368 	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4369 		error = EOPNOTSUPP;
4370 		goto done;
4371 	}
4372 	if (msg->msg_controllen != 0) {
4373 		if (!(so_mode & SM_CONNREQUIRED)) {
4374 			so_update_attrs(so, SOMOD);
4375 			error = sosend_dgramcmsg(so, name, namelen, uiop,
4376 			    msg->msg_control, msg->msg_controllen, flags);
4377 		} else {
4378 			if (flags & MSG_OOB) {
4379 				/* Can't generate T_EXDATA_REQ with options */
4380 				error = EOPNOTSUPP;
4381 				goto done;
4382 			}
4383 			so_update_attrs(so, SOMOD);
4384 			error = sosend_svccmsg(so, uiop,
4385 			    !(flags & MSG_EOR),
4386 			    msg->msg_control, msg->msg_controllen,
4387 			    flags);
4388 		}
4389 		goto done;
4390 	}
4391 
4392 	so_update_attrs(so, SOMOD);
4393 	if (!(so_mode & SM_CONNREQUIRED)) {
4394 		/*
4395 		 * If there is no SO_DONTROUTE to turn off return immediately
4396 		 * from send_dgram. This can allow tail-call optimizations.
4397 		 */
4398 		if (!dontroute) {
4399 			return (sosend_dgram(so, name, namelen, uiop, flags));
4400 		}
4401 		error = sosend_dgram(so, name, namelen, uiop, flags);
4402 	} else {
4403 		t_scalar_t prim;
4404 		int sflag;
4405 
4406 		/* Ignore msg_name in the connected state */
4407 		if (flags & MSG_OOB) {
4408 			prim = T_EXDATA_REQ;
4409 			/*
4410 			 * Send down T_EXDATA_REQ even if there is flow
4411 			 * control for data.
4412 			 */
4413 			sflag = MSG_IGNFLOW;
4414 		} else {
4415 			if (so_mode & SM_BYTESTREAM) {
4416 				/* Byte stream transport - use write */
4417 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4418 
4419 				/* Send M_DATA messages */
4420 				if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
4421 				    (error = nl7c_data(so, uiop)) >= 0) {
4422 					/* NL7C consumed the data */
4423 					return (error);
4424 				}
4425 				/*
4426 				 * If there is no SO_DONTROUTE to turn off,
4427 				 * sti_direct is on, and there is no flow
4428 				 * control, we can take the fast path.
4429 				 */
4430 				if (!dontroute && sti->sti_direct != 0 &&
4431 				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4432 					return (sostream_direct(so, uiop,
4433 					    NULL, cr));
4434 				}
4435 				error = strwrite(SOTOV(so), uiop, cr);
4436 				goto done;
4437 			}
4438 			prim = T_DATA_REQ;
4439 			sflag = 0;
4440 		}
4441 		/*
4442 		 * If there is no SO_DONTROUTE to turn off return immediately
4443 		 * from sosend_svc. This can allow tail-call optimizations.
4444 		 */
4445 		if (!dontroute)
4446 			return (sosend_svc(so, uiop, prim,
4447 			    !(flags & MSG_EOR), sflag));
4448 		error = sosend_svc(so, uiop, prim,
4449 		    !(flags & MSG_EOR), sflag);
4450 	}
4451 	ASSERT(dontroute);
4452 done:
4453 	if (dontroute) {
4454 		uint32_t	val;
4455 
4456 		val = 0;
4457 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4458 		    &val, (t_uscalar_t)sizeof (val), cr);
4459 	}
4460 	return (error);
4461 }
4462 
4463 /*
4464  * kstrwritemp() has very similar semantics as that of strwrite().
4465  * The main difference is it obtains mblks from the caller and also
4466  * does not do any copy as done in strwrite() from user buffers to
4467  * kernel buffers.
4468  *
4469  * Currently, this routine is used by sendfile to send data allocated
4470  * within the kernel without any copying. This interface does not use the
4471  * synchronous stream interface as synch. stream interface implies
4472  * copying.
4473  */
4474 int
4475 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4476 {
4477 	struct stdata *stp;
4478 	struct queue *wqp;
4479 	mblk_t *newmp;
4480 	char waitflag;
4481 	int tempmode;
4482 	int error = 0;
4483 	int done = 0;
4484 	struct sonode *so;
4485 	boolean_t direct;
4486 
4487 	ASSERT(vp->v_stream);
4488 	stp = vp->v_stream;
4489 
4490 	so = VTOSO(vp);
4491 	direct = _SOTOTPI(so)->sti_direct;
4492 
4493 	/*
4494 	 * This is the sockfs direct fast path. canputnext() need
4495 	 * not be accurate so we don't grab the sd_lock here. If
4496 	 * we get flow-controlled, we grab sd_lock just before the
4497 	 * do..while loop below to emulate what strwrite() does.
4498 	 */
4499 	wqp = stp->sd_wrq;
4500 	if (canputnext(wqp) && direct &&
4501 	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4502 		return (sostream_direct(so, NULL, mp, CRED()));
4503 	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4504 		/* Fast check of flags before acquiring the lock */
4505 		mutex_enter(&stp->sd_lock);
4506 		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4507 		mutex_exit(&stp->sd_lock);
4508 		if (error != 0) {
4509 			if (!(stp->sd_flag & STPLEX) &&
4510 			    (stp->sd_wput_opt & SW_SIGPIPE)) {
4511 				error = EPIPE;
4512 			}
4513 			return (error);
4514 		}
4515 	}
4516 
4517 	waitflag = WRITEWAIT;
4518 	if (stp->sd_flag & OLDNDELAY)
4519 		tempmode = fmode & ~FNDELAY;
4520 	else
4521 		tempmode = fmode;
4522 
4523 	mutex_enter(&stp->sd_lock);
4524 	do {
4525 		if (canputnext(wqp)) {
4526 			mutex_exit(&stp->sd_lock);
4527 			if (stp->sd_wputdatafunc != NULL) {
4528 				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4529 				    NULL, NULL, NULL);
4530 				if (newmp == NULL) {
4531 					/* The caller will free mp */
4532 					return (ECOMM);
4533 				}
4534 				mp = newmp;
4535 			}
4536 			putnext(wqp, mp);
4537 			return (0);
4538 		}
4539 		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4540 		    &done);
4541 	} while (error == 0 && !done);
4542 
4543 	mutex_exit(&stp->sd_lock);
4544 	/*
4545 	 * EAGAIN tells the application to try again. ENOMEM
4546 	 * is returned only if the memory allocation size
4547 	 * exceeds the physical limits of the system. ENOMEM
4548 	 * can't be true here.
4549 	 */
4550 	if (error == ENOMEM)
4551 		error = EAGAIN;
4552 	return (error);
4553 }
4554 
4555 /* ARGSUSED */
4556 static int
4557 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4558     struct cred *cr, mblk_t **mpp)
4559 {
4560 	int error;
4561 
4562 	if (so->so_family != AF_INET && so->so_family != AF_INET6)
4563 		return (EAFNOSUPPORT);
4564 
4565 	if (so->so_state & SS_CANTSENDMORE)
4566 		return (EPIPE);
4567 
4568 	if (so->so_type != SOCK_STREAM)
4569 		return (EOPNOTSUPP);
4570 
4571 	if ((so->so_state & SS_ISCONNECTED) == 0)
4572 		return (ENOTCONN);
4573 
4574 	error = kstrwritemp(so->so_vnode, *mpp, fflag);
4575 	if (error == 0)
4576 		*mpp = NULL;
4577 	return (error);
4578 }
4579 
4580 /*
4581  * Sending data on a datagram socket.
4582  * Assumes caller has verified that SS_ISBOUND etc. are set.
4583  */
4584 /* ARGSUSED */
4585 static int
4586 sodgram_direct(struct sonode *so, struct sockaddr *name,
4587     socklen_t namelen, struct uio *uiop, int flags)
4588 {
4589 	struct T_unitdata_req	tudr;
4590 	mblk_t			*mp = NULL;
4591 	int			error = 0;
4592 	void			*addr;
4593 	socklen_t		addrlen;
4594 	ssize_t			len;
4595 	struct stdata		*stp = SOTOV(so)->v_stream;
4596 	int			so_state;
4597 	queue_t			*udp_wq;
4598 	boolean_t		connected;
4599 	mblk_t			*mpdata = NULL;
4600 	sotpi_info_t		*sti = SOTOTPI(so);
4601 	uint32_t		auditing = AU_AUDITING();
4602 
4603 	ASSERT(name != NULL && namelen != 0);
4604 	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4605 	ASSERT(!(so->so_mode & SM_EXDATA));
4606 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4607 	ASSERT(SOTOV(so)->v_type == VSOCK);
4608 
4609 	/* Caller checked for proper length */
4610 	len = uiop->uio_resid;
4611 	ASSERT(len <= sti->sti_tidu_size);
4612 
4613 	/* Length and family checks have been done by caller */
4614 	ASSERT(name->sa_family == so->so_family);
4615 	ASSERT(so->so_family == AF_INET ||
4616 	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4617 	ASSERT(so->so_family == AF_INET6 ||
4618 	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4619 
4620 	addr = name;
4621 	addrlen = namelen;
4622 
4623 	if (stp->sd_sidp != NULL &&
4624 	    (error = straccess(stp, JCWRITE)) != 0)
4625 		goto done;
4626 
4627 	so_state = so->so_state;
4628 
4629 	connected = so_state & SS_ISCONNECTED;
4630 	if (!connected) {
4631 		tudr.PRIM_type = T_UNITDATA_REQ;
4632 		tudr.DEST_length = addrlen;
4633 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4634 		tudr.OPT_length = 0;
4635 		tudr.OPT_offset = 0;
4636 
4637 		mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4638 		    _ALLOC_INTR, CRED());
4639 		if (mp == NULL) {
4640 			/*
4641 			 * Caught a signal waiting for memory.
4642 			 * Let send* return EINTR.
4643 			 */
4644 			error = EINTR;
4645 			goto done;
4646 		}
4647 	}
4648 
4649 	/*
4650 	 * For UDP we don't break up the copyin into smaller pieces
4651 	 * as in the TCP case.  That means if ENOMEM is returned by
4652 	 * mcopyinuio() then the uio vector has not been modified at
4653 	 * all and we fallback to either strwrite() or kstrputmsg()
4654 	 * below.  Note also that we never generate priority messages
4655 	 * from here.
4656 	 */
4657 	udp_wq = stp->sd_wrq->q_next;
4658 	if (canput(udp_wq) &&
4659 	    (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4660 		ASSERT(DB_TYPE(mpdata) == M_DATA);
4661 		ASSERT(uiop->uio_resid == 0);
4662 		if (!connected)
4663 			linkb(mp, mpdata);
4664 		else
4665 			mp = mpdata;
4666 		if (auditing)
4667 			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4668 
4669 		udp_wput(udp_wq, mp);
4670 		return (0);
4671 	}
4672 
4673 	ASSERT(mpdata == NULL);
4674 	if (error != 0 && error != ENOMEM) {
4675 		freemsg(mp);
4676 		return (error);
4677 	}
4678 
4679 	/*
4680 	 * For connected, let strwrite() handle the blocking case.
4681 	 * Otherwise we fall thru and use kstrputmsg().
4682 	 */
4683 	if (connected)
4684 		return (strwrite(SOTOV(so), uiop, CRED()));
4685 
4686 	if (auditing)
4687 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4688 
4689 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4690 done:
4691 #ifdef SOCK_DEBUG
4692 	if (error != 0) {
4693 		eprintsoline(so, error);
4694 	}
4695 #endif /* SOCK_DEBUG */
4696 	return (error);
4697 }
4698 
4699 int
4700 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4701 {
4702 	struct stdata *stp = SOTOV(so)->v_stream;
4703 	ssize_t iosize, rmax, maxblk;
4704 	queue_t *tcp_wq = stp->sd_wrq->q_next;
4705 	mblk_t *newmp;
4706 	int error = 0, wflag = 0;
4707 
4708 	ASSERT(so->so_mode & SM_BYTESTREAM);
4709 	ASSERT(SOTOV(so)->v_type == VSOCK);
4710 
4711 	if (stp->sd_sidp != NULL &&
4712 	    (error = straccess(stp, JCWRITE)) != 0)
4713 		return (error);
4714 
4715 	if (uiop == NULL) {
4716 		/*
4717 		 * kstrwritemp() should have checked sd_flag and
4718 		 * flow-control before coming here.  If we end up
4719 		 * here it means that we can simply pass down the
4720 		 * data to tcp.
4721 		 */
4722 		ASSERT(mp != NULL);
4723 		if (stp->sd_wputdatafunc != NULL) {
4724 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4725 			    NULL, NULL, NULL);
4726 			if (newmp == NULL) {
4727 				/* The caller will free mp */
4728 				return (ECOMM);
4729 			}
4730 			mp = newmp;
4731 		}
4732 		tcp_wput(tcp_wq, mp);
4733 		return (0);
4734 	}
4735 
4736 	/* Fallback to strwrite() to do proper error handling */
4737 	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4738 		return (strwrite(SOTOV(so), uiop, cr));
4739 
4740 	rmax = stp->sd_qn_maxpsz;
4741 	ASSERT(rmax >= 0 || rmax == INFPSZ);
4742 	if (rmax == 0 || uiop->uio_resid <= 0)
4743 		return (0);
4744 
4745 	if (rmax == INFPSZ)
4746 		rmax = uiop->uio_resid;
4747 
4748 	maxblk = stp->sd_maxblk;
4749 
4750 	for (;;) {
4751 		iosize = MIN(uiop->uio_resid, rmax);
4752 
4753 		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4754 		if (mp == NULL) {
4755 			/*
4756 			 * Fallback to strwrite() for ENOMEM; if this
4757 			 * is our first time in this routine and the uio
4758 			 * vector has not been modified, we will end up
4759 			 * calling strwrite() without any flag set.
4760 			 */
4761 			if (error == ENOMEM)
4762 				goto slow_send;
4763 			else
4764 				return (error);
4765 		}
4766 		ASSERT(uiop->uio_resid >= 0);
4767 		/*
4768 		 * If mp is non-NULL and ENOMEM is set, it means that
4769 		 * mcopyinuio() was able to break down some of the user
4770 		 * data into one or more mblks.  Send the partial data
4771 		 * to tcp and let the rest be handled in strwrite().
4772 		 */
4773 		ASSERT(error == 0 || error == ENOMEM);
4774 		if (stp->sd_wputdatafunc != NULL) {
4775 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4776 			    NULL, NULL, NULL);
4777 			if (newmp == NULL) {
4778 				/* The caller will free mp */
4779 				return (ECOMM);
4780 			}
4781 			mp = newmp;
4782 		}
4783 		tcp_wput(tcp_wq, mp);
4784 
4785 		wflag |= NOINTR;
4786 
4787 		if (uiop->uio_resid == 0) {	/* No more data; we're done */
4788 			ASSERT(error == 0);
4789 			break;
4790 		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4791 		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4792 slow_send:
4793 			/*
4794 			 * We were able to send down partial data using
4795 			 * the direct call interface, but are now relying
4796 			 * on strwrite() to handle the non-fastpath cases.
4797 			 * If the socket is blocking we will sleep in
4798 			 * strwaitq() until write is permitted, otherwise,
4799 			 * we will need to return the amount of bytes
4800 			 * written so far back to the app.  This is the
4801 			 * reason why we pass NOINTR flag to strwrite()
4802 			 * for non-blocking socket, because we don't want
4803 			 * to return EAGAIN when portion of the user data
4804 			 * has actually been sent down.
4805 			 */
4806 			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4807 		}
4808 	}
4809 	return (0);
4810 }
4811 
4812 /*
4813  * Update sti_faddr by asking the transport (unless AF_UNIX).
4814  */
4815 /* ARGSUSED */
4816 int
4817 sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4818     boolean_t accept, struct cred *cr)
4819 {
4820 	struct strbuf	strbuf;
4821 	int		error = 0, res;
4822 	void		*addr;
4823 	t_uscalar_t	addrlen;
4824 	k_sigset_t	smask;
4825 	sotpi_info_t	*sti = SOTOTPI(so);
4826 
4827 	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4828 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4829 
4830 	ASSERT(*namelen > 0);
4831 	mutex_enter(&so->so_lock);
4832 	so_lock_single(so);	/* Set SOLOCKED */
4833 
4834 	if (accept) {
4835 		bcopy(sti->sti_faddr_sa, name,
4836 		    MIN(*namelen, sti->sti_faddr_len));
4837 		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4838 		goto done;
4839 	}
4840 
4841 	if (!(so->so_state & SS_ISCONNECTED)) {
4842 		error = ENOTCONN;
4843 		goto done;
4844 	}
4845 	/* Added this check for X/Open */
4846 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4847 		error = EINVAL;
4848 		if (xnet_check_print) {
4849 			printf("sockfs: X/Open getpeername check => EINVAL\n");
4850 		}
4851 		goto done;
4852 	}
4853 
4854 	if (sti->sti_faddr_valid) {
4855 		bcopy(sti->sti_faddr_sa, name,
4856 		    MIN(*namelen, sti->sti_faddr_len));
4857 		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4858 		goto done;
4859 	}
4860 
4861 #ifdef DEBUG
4862 	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4863 	    pr_addr(so->so_family, sti->sti_faddr_sa,
4864 	    (t_uscalar_t)sti->sti_faddr_len)));
4865 #endif /* DEBUG */
4866 
4867 	if (so->so_family == AF_UNIX) {
4868 		/* Transport has different name space - return local info */
4869 		if (sti->sti_faddr_noxlate)
4870 			*namelen = 0;
4871 		error = 0;
4872 		goto done;
4873 	}
4874 
4875 	ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
4876 
4877 	ASSERT(sti->sti_faddr_sa);
4878 	/* Allocate local buffer to use with ioctl */
4879 	addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
4880 	mutex_exit(&so->so_lock);
4881 	addr = kmem_alloc(addrlen, KM_SLEEP);
4882 
4883 	/*
4884 	 * Issue TI_GETPEERNAME with signals masked.
4885 	 * Put the result in sti_faddr_sa so that getpeername works after
4886 	 * a shutdown(output).
4887 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4888 	 * back to the socket.
4889 	 */
4890 	strbuf.buf = addr;
4891 	strbuf.maxlen = addrlen;
4892 	strbuf.len = 0;
4893 
4894 	sigintr(&smask, 0);
4895 	res = 0;
4896 	ASSERT(cr);
4897 	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4898 	    0, K_TO_K, cr, &res);
4899 	sigunintr(&smask);
4900 
4901 	mutex_enter(&so->so_lock);
4902 	/*
4903 	 * If there is an error record the error in so_error put don't fail
4904 	 * the getpeername. Instead fallback on the recorded
4905 	 * sti->sti_faddr_sa.
4906 	 */
4907 	if (error) {
4908 		/*
4909 		 * Various stream head errors can be returned to the ioctl.
4910 		 * However, it is impossible to determine which ones of
4911 		 * these are really socket level errors that were incorrectly
4912 		 * consumed by the ioctl. Thus this code silently ignores the
4913 		 * error - to code explicitly does not reinstate the error
4914 		 * using soseterror().
4915 		 * Experiments have shows that at least this set of
4916 		 * errors are reported and should not be reinstated on the
4917 		 * socket:
4918 		 *	EINVAL	E.g. if an I_LINK was in effect when
4919 		 *		getpeername was called.
4920 		 *	EPIPE	The ioctl error semantics prefer the write
4921 		 *		side error over the read side error.
4922 		 *	ENOTCONN The transport just got disconnected but
4923 		 *		sockfs had not yet seen the T_DISCON_IND
4924 		 *		when issuing the ioctl.
4925 		 */
4926 		error = 0;
4927 	} else if (res == 0 && strbuf.len > 0 &&
4928 	    (so->so_state & SS_ISCONNECTED)) {
4929 		ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
4930 		sti->sti_faddr_len = (socklen_t)strbuf.len;
4931 		bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
4932 		sti->sti_faddr_valid = 1;
4933 
4934 		bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
4935 		*namelen = sti->sti_faddr_len;
4936 	}
4937 	kmem_free(addr, addrlen);
4938 #ifdef DEBUG
4939 	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
4940 	    pr_addr(so->so_family, sti->sti_faddr_sa,
4941 	    (t_uscalar_t)sti->sti_faddr_len)));
4942 #endif /* DEBUG */
4943 done:
4944 	so_unlock_single(so, SOLOCKED);
4945 	mutex_exit(&so->so_lock);
4946 	return (error);
4947 }
4948 
4949 /*
4950  * Update sti_laddr by asking the transport (unless AF_UNIX).
4951  */
4952 int
4953 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4954     struct cred *cr)
4955 {
4956 	struct strbuf	strbuf;
4957 	int		error = 0, res;
4958 	void		*addr;
4959 	t_uscalar_t	addrlen;
4960 	k_sigset_t	smask;
4961 	sotpi_info_t	*sti = SOTOTPI(so);
4962 
4963 	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
4964 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4965 
4966 	ASSERT(*namelen > 0);
4967 	mutex_enter(&so->so_lock);
4968 	so_lock_single(so);	/* Set SOLOCKED */
4969 
4970 #ifdef DEBUG
4971 
4972 	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
4973 	    pr_addr(so->so_family, sti->sti_laddr_sa,
4974 	    (t_uscalar_t)sti->sti_laddr_len)));
4975 #endif /* DEBUG */
4976 	if (sti->sti_laddr_valid) {
4977 		bcopy(sti->sti_laddr_sa, name,
4978 		    MIN(*namelen, sti->sti_laddr_len));
4979 		*namelen = sti->sti_laddr_len;
4980 		goto done;
4981 	}
4982 
4983 	if (so->so_family == AF_UNIX) {
4984 		/*
4985 		 * Transport has different name space - return local info. If we
4986 		 * have enough space, let consumers know the family.
4987 		 */
4988 		if (*namelen >= sizeof (sa_family_t)) {
4989 			name->sa_family = AF_UNIX;
4990 			*namelen = sizeof (sa_family_t);
4991 		} else {
4992 			*namelen = 0;
4993 		}
4994 		error = 0;
4995 		goto done;
4996 	}
4997 	if (!(so->so_state & SS_ISBOUND)) {
4998 		/* If not bound, then nothing to return. */
4999 		error = 0;
5000 		goto done;
5001 	}
5002 
5003 	/* Allocate local buffer to use with ioctl */
5004 	addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
5005 	mutex_exit(&so->so_lock);
5006 	addr = kmem_alloc(addrlen, KM_SLEEP);
5007 
5008 	/*
5009 	 * Issue TI_GETMYNAME with signals masked.
5010 	 * Put the result in sti_laddr_sa so that getsockname works after
5011 	 * a shutdown(output).
5012 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
5013 	 * back to the socket.
5014 	 */
5015 	strbuf.buf = addr;
5016 	strbuf.maxlen = addrlen;
5017 	strbuf.len = 0;
5018 
5019 	sigintr(&smask, 0);
5020 	res = 0;
5021 	ASSERT(cr);
5022 	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
5023 	    0, K_TO_K, cr, &res);
5024 	sigunintr(&smask);
5025 
5026 	mutex_enter(&so->so_lock);
5027 	/*
5028 	 * If there is an error record the error in so_error put don't fail
5029 	 * the getsockname. Instead fallback on the recorded
5030 	 * sti->sti_laddr_sa.
5031 	 */
5032 	if (error) {
5033 		/*
5034 		 * Various stream head errors can be returned to the ioctl.
5035 		 * However, it is impossible to determine which ones of
5036 		 * these are really socket level errors that were incorrectly
5037 		 * consumed by the ioctl. Thus this code silently ignores the
5038 		 * error - to code explicitly does not reinstate the error
5039 		 * using soseterror().
5040 		 * Experiments have shows that at least this set of
5041 		 * errors are reported and should not be reinstated on the
5042 		 * socket:
5043 		 *	EINVAL	E.g. if an I_LINK was in effect when
5044 		 *		getsockname was called.
5045 		 *	EPIPE	The ioctl error semantics prefer the write
5046 		 *		side error over the read side error.
5047 		 */
5048 		error = 0;
5049 	} else if (res == 0 && strbuf.len > 0 &&
5050 	    (so->so_state & SS_ISBOUND)) {
5051 		ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
5052 		sti->sti_laddr_len = (socklen_t)strbuf.len;
5053 		bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
5054 		sti->sti_laddr_valid = 1;
5055 
5056 		bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
5057 		*namelen = sti->sti_laddr_len;
5058 	}
5059 	kmem_free(addr, addrlen);
5060 #ifdef DEBUG
5061 	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
5062 	    pr_addr(so->so_family, sti->sti_laddr_sa,
5063 	    (t_uscalar_t)sti->sti_laddr_len)));
5064 #endif /* DEBUG */
5065 done:
5066 	so_unlock_single(so, SOLOCKED);
5067 	mutex_exit(&so->so_lock);
5068 	return (error);
5069 }
5070 
5071 /*
5072  * Get socket options. For SOL_SOCKET options some options are handled
5073  * by the sockfs while others use the value recorded in the sonode as a
5074  * fallback should the T_SVR4_OPTMGMT_REQ fail.
5075  *
5076  * On the return most *optlenp bytes are copied to optval.
5077  */
5078 /* ARGSUSED */
5079 int
5080 sotpi_getsockopt(struct sonode *so, int level, int option_name,
5081 		void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5082 {
5083 	struct T_optmgmt_req	optmgmt_req;
5084 	struct T_optmgmt_ack	*optmgmt_ack;
5085 	struct opthdr		oh;
5086 	struct opthdr		*opt_res;
5087 	mblk_t			*mp = NULL;
5088 	int			error = 0;
5089 	void			*option = NULL;	/* Set if fallback value */
5090 	t_uscalar_t		maxlen = *optlenp;
5091 	t_uscalar_t		len;
5092 	uint32_t		value;
5093 	struct timeval		tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5094 	struct timeval32	tmo_val32;
5095 	struct so_snd_bufinfo	snd_bufinfo;	/* used for zero copy */
5096 
5097 	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5098 	    (void *)so, level, option_name, optval, (void *)optlenp,
5099 	    pr_state(so->so_state, so->so_mode)));
5100 
5101 	mutex_enter(&so->so_lock);
5102 	so_lock_single(so);	/* Set SOLOCKED */
5103 
5104 	/*
5105 	 * Check for SOL_SOCKET options.
5106 	 * Certain SOL_SOCKET options are returned directly whereas
5107 	 * others only provide a default (fallback) value should
5108 	 * the T_SVR4_OPTMGMT_REQ fail.
5109 	 */
5110 	if (level == SOL_SOCKET) {
5111 		/* Check parameters */
5112 		switch (option_name) {
5113 		case SO_TYPE:
5114 		case SO_ERROR:
5115 		case SO_DEBUG:
5116 		case SO_ACCEPTCONN:
5117 		case SO_REUSEADDR:
5118 		case SO_KEEPALIVE:
5119 		case SO_DONTROUTE:
5120 		case SO_BROADCAST:
5121 		case SO_USELOOPBACK:
5122 		case SO_OOBINLINE:
5123 		case SO_SNDBUF:
5124 		case SO_RCVBUF:
5125 #ifdef notyet
5126 		case SO_SNDLOWAT:
5127 		case SO_RCVLOWAT:
5128 #endif /* notyet */
5129 		case SO_DOMAIN:
5130 		case SO_DGRAM_ERRIND:
5131 			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5132 				error = EINVAL;
5133 				eprintsoline(so, error);
5134 				goto done2;
5135 			}
5136 			break;
5137 		case SO_RCVTIMEO:
5138 		case SO_SNDTIMEO:
5139 			if (get_udatamodel() == DATAMODEL_NONE ||
5140 			    get_udatamodel() == DATAMODEL_NATIVE) {
5141 				if (maxlen < sizeof (struct timeval)) {
5142 					error = EINVAL;
5143 					eprintsoline(so, error);
5144 					goto done2;
5145 				}
5146 			} else {
5147 				if (maxlen < sizeof (struct timeval32)) {
5148 					error = EINVAL;
5149 					eprintsoline(so, error);
5150 					goto done2;
5151 				}
5152 
5153 			}
5154 			break;
5155 		case SO_LINGER:
5156 			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5157 				error = EINVAL;
5158 				eprintsoline(so, error);
5159 				goto done2;
5160 			}
5161 			break;
5162 		case SO_SND_BUFINFO:
5163 			if (maxlen < (t_uscalar_t)
5164 			    sizeof (struct so_snd_bufinfo)) {
5165 				error = EINVAL;
5166 				eprintsoline(so, error);
5167 				goto done2;
5168 			}
5169 			break;
5170 		}
5171 
5172 		len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
5173 
5174 		switch (option_name) {
5175 		case SO_TYPE:
5176 			value = so->so_type;
5177 			option = &value;
5178 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5179 
5180 		case SO_ERROR:
5181 			value = sogeterr(so, B_TRUE);
5182 			option = &value;
5183 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5184 
5185 		case SO_ACCEPTCONN:
5186 			if (so->so_state & SS_ACCEPTCONN)
5187 				value = SO_ACCEPTCONN;
5188 			else
5189 				value = 0;
5190 #ifdef DEBUG
5191 			if (value) {
5192 				dprintso(so, 1,
5193 				    ("sotpi_getsockopt: 0x%x is set\n",
5194 				    option_name));
5195 			} else {
5196 				dprintso(so, 1,
5197 				    ("sotpi_getsockopt: 0x%x not set\n",
5198 				    option_name));
5199 			}
5200 #endif /* DEBUG */
5201 			option = &value;
5202 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5203 
5204 		case SO_DEBUG:
5205 		case SO_REUSEADDR:
5206 		case SO_KEEPALIVE:
5207 		case SO_DONTROUTE:
5208 		case SO_BROADCAST:
5209 		case SO_USELOOPBACK:
5210 		case SO_OOBINLINE:
5211 		case SO_DGRAM_ERRIND:
5212 			value = (so->so_options & option_name);
5213 #ifdef DEBUG
5214 			if (value) {
5215 				dprintso(so, 1,
5216 				    ("sotpi_getsockopt: 0x%x is set\n",
5217 				    option_name));
5218 			} else {
5219 				dprintso(so, 1,
5220 				    ("sotpi_getsockopt: 0x%x not set\n",
5221 				    option_name));
5222 			}
5223 #endif /* DEBUG */
5224 			option = &value;
5225 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5226 
5227 		/*
5228 		 * The following options are only returned by sockfs when the
5229 		 * T_SVR4_OPTMGMT_REQ fails.
5230 		 */
5231 		case SO_LINGER:
5232 			option = &so->so_linger;
5233 			len = (t_uscalar_t)sizeof (struct linger);
5234 			break;
5235 		case SO_SNDBUF: {
5236 			ssize_t lvalue;
5237 
5238 			/*
5239 			 * If the option has not been set then get a default
5240 			 * value from the read queue. This value is
5241 			 * returned if the transport fails
5242 			 * the T_SVR4_OPTMGMT_REQ.
5243 			 */
5244 			lvalue = so->so_sndbuf;
5245 			if (lvalue == 0) {
5246 				mutex_exit(&so->so_lock);
5247 				(void) strqget(strvp2wq(SOTOV(so))->q_next,
5248 				    QHIWAT, 0, &lvalue);
5249 				mutex_enter(&so->so_lock);
5250 				dprintso(so, 1,
5251 				    ("got SO_SNDBUF %ld from q\n", lvalue));
5252 			}
5253 			value = (int)lvalue;
5254 			option = &value;
5255 			len = (t_uscalar_t)sizeof (so->so_sndbuf);
5256 			break;
5257 		}
5258 		case SO_RCVBUF: {
5259 			ssize_t lvalue;
5260 
5261 			/*
5262 			 * If the option has not been set then get a default
5263 			 * value from the read queue. This value is
5264 			 * returned if the transport fails
5265 			 * the T_SVR4_OPTMGMT_REQ.
5266 			 *
5267 			 * XXX If SO_RCVBUF has been set and this is an
5268 			 * XPG 4.2 application then do not ask the transport
5269 			 * since the transport might adjust the value and not
5270 			 * return exactly what was set by the application.
5271 			 * For non-XPG 4.2 application we return the value
5272 			 * that the transport is actually using.
5273 			 */
5274 			lvalue = so->so_rcvbuf;
5275 			if (lvalue == 0) {
5276 				mutex_exit(&so->so_lock);
5277 				(void) strqget(RD(strvp2wq(SOTOV(so))),
5278 				    QHIWAT, 0, &lvalue);
5279 				mutex_enter(&so->so_lock);
5280 				dprintso(so, 1,
5281 				    ("got SO_RCVBUF %ld from q\n", lvalue));
5282 			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
5283 				value = (int)lvalue;
5284 				option = &value;
5285 				goto copyout;	/* skip asking transport */
5286 			}
5287 			value = (int)lvalue;
5288 			option = &value;
5289 			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5290 			break;
5291 		}
5292 		case SO_DOMAIN:
5293 			value = so->so_family;
5294 			option = &value;
5295 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5296 
5297 #ifdef notyet
5298 		/*
5299 		 * We do not implement the semantics of these options
5300 		 * thus we shouldn't implement the options either.
5301 		 */
5302 		case SO_SNDLOWAT:
5303 			value = so->so_sndlowat;
5304 			option = &value;
5305 			break;
5306 		case SO_RCVLOWAT:
5307 			value = so->so_rcvlowat;
5308 			option = &value;
5309 			break;
5310 #endif /* notyet */
5311 		case SO_SNDTIMEO:
5312 		case SO_RCVTIMEO: {
5313 			clock_t val;
5314 
5315 			if (option_name == SO_RCVTIMEO)
5316 				val = drv_hztousec(so->so_rcvtimeo);
5317 			else
5318 				val = drv_hztousec(so->so_sndtimeo);
5319 			tmo_val.tv_sec = val / (1000 * 1000);
5320 			tmo_val.tv_usec = val % (1000 * 1000);
5321 			if (get_udatamodel() == DATAMODEL_NONE ||
5322 			    get_udatamodel() == DATAMODEL_NATIVE) {
5323 				option = &tmo_val;
5324 				len = sizeof (struct timeval);
5325 			} else {
5326 				TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5327 				option = &tmo_val32;
5328 				len = sizeof (struct timeval32);
5329 			}
5330 			break;
5331 		}
5332 		case SO_SND_BUFINFO: {
5333 			snd_bufinfo.sbi_wroff =
5334 			    (so->so_proto_props).sopp_wroff;
5335 			snd_bufinfo.sbi_maxblk =
5336 			    (so->so_proto_props).sopp_maxblk;
5337 			snd_bufinfo.sbi_maxpsz =
5338 			    (so->so_proto_props).sopp_maxpsz;
5339 			snd_bufinfo.sbi_tail =
5340 			    (so->so_proto_props).sopp_tail;
5341 			option = &snd_bufinfo;
5342 			len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5343 			break;
5344 		}
5345 		}
5346 	}
5347 
5348 	mutex_exit(&so->so_lock);
5349 
5350 	/* Send request */
5351 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5352 	optmgmt_req.MGMT_flags = T_CHECK;
5353 	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5354 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5355 
5356 	oh.level = level;
5357 	oh.name = option_name;
5358 	oh.len = maxlen;
5359 
5360 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5361 	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5362 	/* Let option management work in the presence of data flow control */
5363 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5364 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5365 	mp = NULL;
5366 	mutex_enter(&so->so_lock);
5367 	if (error) {
5368 		eprintsoline(so, error);
5369 		goto done2;
5370 	}
5371 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5372 	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5373 	if (error) {
5374 		if (option != NULL) {
5375 			/* We have a fallback value */
5376 			error = 0;
5377 			goto copyout;
5378 		}
5379 		eprintsoline(so, error);
5380 		goto done2;
5381 	}
5382 	ASSERT(mp);
5383 	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5384 	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5385 	    optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5386 	if (opt_res == NULL) {
5387 		if (option != NULL) {
5388 			/* We have a fallback value */
5389 			error = 0;
5390 			goto copyout;
5391 		}
5392 		error = EPROTO;
5393 		eprintsoline(so, error);
5394 		goto done;
5395 	}
5396 	option = &opt_res[1];
5397 
5398 	/* check to ensure that the option is within bounds */
5399 	if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5400 	    (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5401 		if (option != NULL) {
5402 			/* We have a fallback value */
5403 			error = 0;
5404 			goto copyout;
5405 		}
5406 		error = EPROTO;
5407 		eprintsoline(so, error);
5408 		goto done;
5409 	}
5410 
5411 	len = opt_res->len;
5412 
5413 copyout: {
5414 		t_uscalar_t size = MIN(len, maxlen);
5415 		bcopy(option, optval, size);
5416 		bcopy(&size, optlenp, sizeof (size));
5417 	}
5418 done:
5419 	freemsg(mp);
5420 done2:
5421 	so_unlock_single(so, SOLOCKED);
5422 	mutex_exit(&so->so_lock);
5423 
5424 	return (error);
5425 }
5426 
5427 /*
5428  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5429  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5430  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5431  * setsockopt has to work even if the transport does not support the option.
5432  */
5433 /* ARGSUSED */
5434 int
5435 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5436 	const void *optval, t_uscalar_t optlen, struct cred *cr)
5437 {
5438 	struct T_optmgmt_req	optmgmt_req;
5439 	struct opthdr		oh;
5440 	mblk_t			*mp;
5441 	int			error = 0;
5442 	boolean_t		handled = B_FALSE;
5443 
5444 	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5445 	    (void *)so, level, option_name, optval, optlen,
5446 	    pr_state(so->so_state, so->so_mode)));
5447 
5448 	/* X/Open requires this check */
5449 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5450 		if (xnet_check_print)
5451 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
5452 		return (EINVAL);
5453 	}
5454 
5455 	mutex_enter(&so->so_lock);
5456 	so_lock_single(so);	/* Set SOLOCKED */
5457 	mutex_exit(&so->so_lock);
5458 
5459 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5460 	optmgmt_req.MGMT_flags = T_NEGOTIATE;
5461 	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5462 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5463 
5464 	oh.level = level;
5465 	oh.name = option_name;
5466 	oh.len = optlen;
5467 
5468 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5469 	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5470 	/* Let option management work in the presence of data flow control */
5471 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5472 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5473 	mp = NULL;
5474 	mutex_enter(&so->so_lock);
5475 	if (error) {
5476 		eprintsoline(so, error);
5477 		goto done2;
5478 	}
5479 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5480 	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5481 	if (error) {
5482 		eprintsoline(so, error);
5483 		goto done;
5484 	}
5485 	ASSERT(mp);
5486 	/* No need to verify T_optmgmt_ack */
5487 	freemsg(mp);
5488 done:
5489 	/*
5490 	 * Check for SOL_SOCKET options and record their values.
5491 	 * If we know about a SOL_SOCKET parameter and the transport
5492 	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5493 	 * EPROTO) we let the setsockopt succeed.
5494 	 */
5495 	if (level == SOL_SOCKET) {
5496 		/* Check parameters */
5497 		switch (option_name) {
5498 		case SO_DEBUG:
5499 		case SO_REUSEADDR:
5500 		case SO_KEEPALIVE:
5501 		case SO_DONTROUTE:
5502 		case SO_BROADCAST:
5503 		case SO_USELOOPBACK:
5504 		case SO_OOBINLINE:
5505 		case SO_SNDBUF:
5506 		case SO_RCVBUF:
5507 #ifdef notyet
5508 		case SO_SNDLOWAT:
5509 		case SO_RCVLOWAT:
5510 #endif /* notyet */
5511 		case SO_DGRAM_ERRIND:
5512 			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5513 				error = EINVAL;
5514 				eprintsoline(so, error);
5515 				goto done2;
5516 			}
5517 			ASSERT(optval);
5518 			handled = B_TRUE;
5519 			break;
5520 		case SO_SNDTIMEO:
5521 		case SO_RCVTIMEO:
5522 			if (get_udatamodel() == DATAMODEL_NONE ||
5523 			    get_udatamodel() == DATAMODEL_NATIVE) {
5524 				if (optlen != sizeof (struct timeval)) {
5525 					error = EINVAL;
5526 					eprintsoline(so, error);
5527 					goto done2;
5528 				}
5529 			} else {
5530 				if (optlen != sizeof (struct timeval32)) {
5531 					error = EINVAL;
5532 					eprintsoline(so, error);
5533 					goto done2;
5534 				}
5535 			}
5536 			ASSERT(optval);
5537 			handled = B_TRUE;
5538 			break;
5539 		case SO_LINGER:
5540 			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5541 				error = EINVAL;
5542 				eprintsoline(so, error);
5543 				goto done2;
5544 			}
5545 			ASSERT(optval);
5546 			handled = B_TRUE;
5547 			break;
5548 		}
5549 
5550 #define	intvalue	(*(int32_t *)optval)
5551 
5552 		switch (option_name) {
5553 		case SO_TYPE:
5554 		case SO_ERROR:
5555 		case SO_ACCEPTCONN:
5556 			/* Can't be set */
5557 			error = ENOPROTOOPT;
5558 			goto done2;
5559 		case SO_LINGER: {
5560 			struct linger *l = (struct linger *)optval;
5561 
5562 			so->so_linger.l_linger = l->l_linger;
5563 			if (l->l_onoff) {
5564 				so->so_linger.l_onoff = SO_LINGER;
5565 				so->so_options |= SO_LINGER;
5566 			} else {
5567 				so->so_linger.l_onoff = 0;
5568 				so->so_options &= ~SO_LINGER;
5569 			}
5570 			break;
5571 		}
5572 
5573 		case SO_DEBUG:
5574 #ifdef SOCK_TEST
5575 			if (intvalue & 2)
5576 				sock_test_timelimit = 10 * hz;
5577 			else
5578 				sock_test_timelimit = 0;
5579 
5580 			if (intvalue & 4)
5581 				do_useracc = 0;
5582 			else
5583 				do_useracc = 1;
5584 #endif /* SOCK_TEST */
5585 			/* FALLTHRU */
5586 		case SO_REUSEADDR:
5587 		case SO_KEEPALIVE:
5588 		case SO_DONTROUTE:
5589 		case SO_BROADCAST:
5590 		case SO_USELOOPBACK:
5591 		case SO_OOBINLINE:
5592 		case SO_DGRAM_ERRIND:
5593 			if (intvalue != 0) {
5594 				dprintso(so, 1,
5595 				    ("socket_setsockopt: setting 0x%x\n",
5596 				    option_name));
5597 				so->so_options |= option_name;
5598 			} else {
5599 				dprintso(so, 1,
5600 				    ("socket_setsockopt: clearing 0x%x\n",
5601 				    option_name));
5602 				so->so_options &= ~option_name;
5603 			}
5604 			break;
5605 		/*
5606 		 * The following options are only returned by us when the
5607 		 * transport layer fails.
5608 		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5609 		 * since the transport might adjust the value and not
5610 		 * return exactly what was set by the application.
5611 		 */
5612 		case SO_SNDBUF:
5613 			so->so_sndbuf = intvalue;
5614 			break;
5615 		case SO_RCVBUF:
5616 			so->so_rcvbuf = intvalue;
5617 			break;
5618 		case SO_RCVPSH:
5619 			so->so_rcv_timer_interval = intvalue;
5620 			break;
5621 #ifdef notyet
5622 		/*
5623 		 * We do not implement the semantics of these options
5624 		 * thus we shouldn't implement the options either.
5625 		 */
5626 		case SO_SNDLOWAT:
5627 			so->so_sndlowat = intvalue;
5628 			break;
5629 		case SO_RCVLOWAT:
5630 			so->so_rcvlowat = intvalue;
5631 			break;
5632 #endif /* notyet */
5633 		case SO_SNDTIMEO:
5634 		case SO_RCVTIMEO: {
5635 			struct timeval tl;
5636 			clock_t val;
5637 
5638 			if (get_udatamodel() == DATAMODEL_NONE ||
5639 			    get_udatamodel() == DATAMODEL_NATIVE)
5640 				bcopy(&tl, (struct timeval *)optval,
5641 				    sizeof (struct timeval));
5642 			else
5643 				TIMEVAL32_TO_TIMEVAL(&tl,
5644 				    (struct timeval32 *)optval);
5645 			val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5646 			if (option_name == SO_RCVTIMEO)
5647 				so->so_rcvtimeo = drv_usectohz(val);
5648 			else
5649 				so->so_sndtimeo = drv_usectohz(val);
5650 			break;
5651 		}
5652 		}
5653 #undef	intvalue
5654 
5655 		if (error) {
5656 			if ((error == ENOPROTOOPT || error == EPROTO ||
5657 			    error == EINVAL) && handled) {
5658 				dprintso(so, 1,
5659 				    ("setsockopt: ignoring error %d for 0x%x\n",
5660 				    error, option_name));
5661 				error = 0;
5662 			}
5663 		}
5664 	}
5665 done2:
5666 	so_unlock_single(so, SOLOCKED);
5667 	mutex_exit(&so->so_lock);
5668 	return (error);
5669 }
5670 
5671 /*
5672  * sotpi_close() is called when the last open reference goes away.
5673  */
5674 /* ARGSUSED */
5675 int
5676 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5677 {
5678 	struct vnode *vp = SOTOV(so);
5679 	dev_t dev;
5680 	int error = 0;
5681 	sotpi_info_t *sti = SOTOTPI(so);
5682 
5683 	dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5684 	    (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5685 
5686 	dev = sti->sti_dev;
5687 
5688 	ASSERT(STREAMSTAB(getmajor(dev)));
5689 
5690 	mutex_enter(&so->so_lock);
5691 	so_lock_single(so);	/* Set SOLOCKED */
5692 
5693 	ASSERT(so_verify_oobstate(so));
5694 
5695 	if (sti->sti_nl7c_flags & NL7C_ENABLED) {
5696 		sti->sti_nl7c_flags = 0;
5697 		nl7c_close(so);
5698 	}
5699 
5700 	if (vp->v_stream != NULL) {
5701 		vnode_t *ux_vp;
5702 
5703 		if (so->so_family == AF_UNIX) {
5704 			/* Could avoid this when CANTSENDMORE for !dgram */
5705 			so_unix_close(so);
5706 		}
5707 
5708 		mutex_exit(&so->so_lock);
5709 		/*
5710 		 * Disassemble the linkage from the AF_UNIX underlying file
5711 		 * system vnode to this socket (by atomically clearing
5712 		 * v_stream in vn_rele_stream) before strclose clears sd_vnode
5713 		 * and frees the stream head.
5714 		 */
5715 		if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5716 			ASSERT(ux_vp->v_stream);
5717 			sti->sti_ux_bound_vp = NULL;
5718 			vn_rele_stream(ux_vp);
5719 		}
5720 		error = strclose(vp, flag, cr);
5721 		vp->v_stream = NULL;
5722 		mutex_enter(&so->so_lock);
5723 	}
5724 
5725 	/*
5726 	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
5727 	 */
5728 	so_flush_discon_ind(so);
5729 
5730 	so_unlock_single(so, SOLOCKED);
5731 	mutex_exit(&so->so_lock);
5732 
5733 	/*
5734 	 * Needed for STREAMs.
5735 	 * Decrement the device driver's reference count for streams
5736 	 * opened via the clone dip. The driver was held in clone_open().
5737 	 * The absence of clone_close() forces this asymmetry.
5738 	 */
5739 	if (so->so_flag & SOCLONE)
5740 		ddi_rele_driver(getmajor(dev));
5741 
5742 	return (error);
5743 }
5744 
5745 static int
5746 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
5747     struct cred *cr, int32_t *rvalp)
5748 {
5749 	struct vnode *vp = SOTOV(so);
5750 	sotpi_info_t *sti = SOTOTPI(so);
5751 	int error = 0;
5752 
5753 	dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
5754 	    cmd, arg, pr_state(so->so_state, so->so_mode)));
5755 
5756 	switch (cmd) {
5757 	case SIOCSQPTR:
5758 		/*
5759 		 * SIOCSQPTR is valid only when helper stream is created
5760 		 * by the protocol.
5761 		 */
5762 	case _I_INSERT:
5763 	case _I_REMOVE:
5764 		/*
5765 		 * Since there's no compelling reason to support these ioctls
5766 		 * on sockets, and doing so would increase the complexity
5767 		 * markedly, prevent it.
5768 		 */
5769 		return (EOPNOTSUPP);
5770 
5771 	case I_FIND:
5772 	case I_LIST:
5773 	case I_LOOK:
5774 	case I_POP:
5775 	case I_PUSH:
5776 		/*
5777 		 * To prevent races and inconsistencies between the actual
5778 		 * state of the stream and the state according to the sonode,
5779 		 * we serialize all operations which modify or operate on the
5780 		 * list of modules on the socket's stream.
5781 		 */
5782 		mutex_enter(&sti->sti_plumb_lock);
5783 		error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
5784 		mutex_exit(&sti->sti_plumb_lock);
5785 		return (error);
5786 
5787 	default:
5788 		if (so->so_version != SOV_STREAM)
5789 			break;
5790 
5791 		/*
5792 		 * The imaginary "sockmod" has been popped; act as a stream.
5793 		 */
5794 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5795 	}
5796 
5797 	ASSERT(so->so_version != SOV_STREAM);
5798 
5799 	/*
5800 	 * Process socket-specific ioctls.
5801 	 */
5802 	switch (cmd) {
5803 	case FIONBIO: {
5804 		int32_t value;
5805 
5806 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
5807 		    (mode & (int)FKIOCTL)))
5808 			return (EFAULT);
5809 
5810 		mutex_enter(&so->so_lock);
5811 		if (value) {
5812 			so->so_state |= SS_NDELAY;
5813 		} else {
5814 			so->so_state &= ~SS_NDELAY;
5815 		}
5816 		mutex_exit(&so->so_lock);
5817 		return (0);
5818 	}
5819 
5820 	case FIOASYNC: {
5821 		int32_t value;
5822 
5823 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
5824 		    (mode & (int)FKIOCTL)))
5825 			return (EFAULT);
5826 
5827 		mutex_enter(&so->so_lock);
5828 		/*
5829 		 * SS_ASYNC flag not already set correctly?
5830 		 * (!value != !(so->so_state & SS_ASYNC))
5831 		 * but some engineers find that too hard to read.
5832 		 */
5833 		if (value == 0 && (so->so_state & SS_ASYNC) != 0 ||
5834 		    value != 0 && (so->so_state & SS_ASYNC) == 0)
5835 			error = so_flip_async(so, vp, mode, cr);
5836 		mutex_exit(&so->so_lock);
5837 		return (error);
5838 	}
5839 
5840 	case SIOCSPGRP:
5841 	case FIOSETOWN: {
5842 		pid_t pgrp;
5843 
5844 		if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
5845 		    (mode & (int)FKIOCTL)))
5846 			return (EFAULT);
5847 
5848 		mutex_enter(&so->so_lock);
5849 		dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
5850 		/* Any change? */
5851 		if (pgrp != so->so_pgrp)
5852 			error = so_set_siggrp(so, vp, pgrp, mode, cr);
5853 		mutex_exit(&so->so_lock);
5854 		return (error);
5855 	}
5856 	case SIOCGPGRP:
5857 	case FIOGETOWN:
5858 		if (so_copyout(&so->so_pgrp, (void *)arg,
5859 		    sizeof (pid_t), (mode & (int)FKIOCTL)))
5860 			return (EFAULT);
5861 		return (0);
5862 
5863 	case SIOCATMARK: {
5864 		int retval;
5865 		uint_t so_state;
5866 
5867 		/*
5868 		 * strwaitmark has a finite timeout after which it
5869 		 * returns -1 if the mark state is undetermined.
5870 		 * In order to avoid any race between the mark state
5871 		 * in sockfs and the mark state in the stream head this
5872 		 * routine loops until the mark state can be determined
5873 		 * (or the urgent data indication has been removed by some
5874 		 * other thread).
5875 		 */
5876 		do {
5877 			mutex_enter(&so->so_lock);
5878 			so_state = so->so_state;
5879 			mutex_exit(&so->so_lock);
5880 			if (so_state & SS_RCVATMARK) {
5881 				retval = 1;
5882 			} else if (!(so_state & SS_OOBPEND)) {
5883 				/*
5884 				 * No SIGURG has been generated -- there is no
5885 				 * pending or present urgent data. Thus can't
5886 				 * possibly be at the mark.
5887 				 */
5888 				retval = 0;
5889 			} else {
5890 				/*
5891 				 * Have the stream head wait until there is
5892 				 * either some messages on the read queue, or
5893 				 * STRATMARK or STRNOTATMARK gets set. The
5894 				 * STRNOTATMARK flag is used so that the
5895 				 * transport can send up a MSGNOTMARKNEXT
5896 				 * M_DATA to indicate that it is not
5897 				 * at the mark and additional data is not about
5898 				 * to be send upstream.
5899 				 *
5900 				 * If the mark state is undetermined this will
5901 				 * return -1 and we will loop rechecking the
5902 				 * socket state.
5903 				 */
5904 				retval = strwaitmark(vp);
5905 			}
5906 		} while (retval == -1);
5907 
5908 		if (so_copyout(&retval, (void *)arg, sizeof (int),
5909 		    (mode & (int)FKIOCTL)))
5910 			return (EFAULT);
5911 		return (0);
5912 	}
5913 
5914 	case I_FDINSERT:
5915 	case I_SENDFD:
5916 	case I_RECVFD:
5917 	case I_ATMARK:
5918 	case _SIOCSOCKFALLBACK:
5919 		/*
5920 		 * These ioctls do not apply to sockets. I_FDINSERT can be
5921 		 * used to send M_PROTO messages without modifying the socket
5922 		 * state. I_SENDFD/RECVFD should not be used for socket file
5923 		 * descriptor passing since they assume a twisted stream.
5924 		 * SIOCATMARK must be used instead of I_ATMARK.
5925 		 *
5926 		 * _SIOCSOCKFALLBACK from an application should never be
5927 		 * processed.  It is only generated by socktpi_open() or
5928 		 * in response to I_POP or I_PUSH.
5929 		 */
5930 #ifdef DEBUG
5931 		zcmn_err(getzoneid(), CE_WARN,
5932 		    "Unsupported STREAMS ioctl 0x%x on socket. "
5933 		    "Pid = %d\n", cmd, curproc->p_pid);
5934 #endif /* DEBUG */
5935 		return (EOPNOTSUPP);
5936 
5937 	case _I_GETPEERCRED:
5938 		if ((mode & FKIOCTL) == 0)
5939 			return (EINVAL);
5940 
5941 		mutex_enter(&so->so_lock);
5942 		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
5943 			error = ENOTSUP;
5944 		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
5945 			error = ENOTCONN;
5946 		} else if (so->so_peercred != NULL) {
5947 			k_peercred_t *kp = (k_peercred_t *)arg;
5948 			kp->pc_cr = so->so_peercred;
5949 			kp->pc_cpid = so->so_cpid;
5950 			crhold(so->so_peercred);
5951 		} else {
5952 			error = EINVAL;
5953 		}
5954 		mutex_exit(&so->so_lock);
5955 		return (error);
5956 
5957 	default:
5958 		/*
5959 		 * Do the higher-order bits of the ioctl cmd indicate
5960 		 * that it is an I_* streams ioctl?
5961 		 */
5962 		if ((cmd & 0xffffff00U) == STR &&
5963 		    so->so_version == SOV_SOCKBSD) {
5964 #ifdef DEBUG
5965 			zcmn_err(getzoneid(), CE_WARN,
5966 			    "Unsupported STREAMS ioctl 0x%x on socket. "
5967 			    "Pid = %d\n", cmd, 	curproc->p_pid);
5968 #endif /* DEBUG */
5969 			return (EOPNOTSUPP);
5970 		}
5971 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5972 	}
5973 }
5974 
5975 /*
5976  * Handle plumbing-related ioctls.
5977  */
5978 static int
5979 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
5980     struct cred *cr, int32_t *rvalp)
5981 {
5982 	static const char sockmod_name[] = "sockmod";
5983 	struct sonode	*so = VTOSO(vp);
5984 	char		mname[FMNAMESZ + 1];
5985 	int		error;
5986 	sotpi_info_t	*sti = SOTOTPI(so);
5987 
5988 	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
5989 
5990 	if (so->so_version == SOV_SOCKBSD)
5991 		return (EOPNOTSUPP);
5992 
5993 	if (so->so_version == SOV_STREAM) {
5994 		/*
5995 		 * The imaginary "sockmod" has been popped - act as a stream.
5996 		 * If this is a push of sockmod then change back to a socket.
5997 		 */
5998 		if (cmd == I_PUSH) {
5999 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6000 			    (void *)arg, mname, sizeof (mname), NULL);
6001 
6002 			if (error == 0 && strcmp(mname, sockmod_name) == 0) {
6003 				dprintso(so, 0, ("socktpi_ioctl: going to "
6004 				    "socket version\n"));
6005 				so_stream2sock(so);
6006 				return (0);
6007 			}
6008 		}
6009 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6010 	}
6011 
6012 	switch (cmd) {
6013 	case I_PUSH:
6014 		if (sti->sti_direct) {
6015 			mutex_enter(&so->so_lock);
6016 			so_lock_single(so);
6017 			mutex_exit(&so->so_lock);
6018 
6019 			error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
6020 			    cr, rvalp);
6021 
6022 			mutex_enter(&so->so_lock);
6023 			if (error == 0)
6024 				sti->sti_direct = 0;
6025 			so_unlock_single(so, SOLOCKED);
6026 			mutex_exit(&so->so_lock);
6027 
6028 			if (error != 0)
6029 				return (error);
6030 		}
6031 
6032 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6033 		if (error == 0)
6034 			sti->sti_pushcnt++;
6035 		return (error);
6036 
6037 	case I_POP:
6038 		if (sti->sti_pushcnt == 0) {
6039 			/* Emulate sockmod being popped */
6040 			dprintso(so, 0,
6041 			    ("socktpi_ioctl: going to STREAMS version\n"));
6042 			return (so_sock2stream(so));
6043 		}
6044 
6045 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6046 		if (error == 0)
6047 			sti->sti_pushcnt--;
6048 		return (error);
6049 
6050 	case I_LIST: {
6051 		struct str_mlist *kmlistp, *umlistp;
6052 		struct str_list	kstrlist;
6053 		ssize_t		kstrlistsize;
6054 		int		i, nmods;
6055 
6056 		STRUCT_DECL(str_list, ustrlist);
6057 		STRUCT_INIT(ustrlist, mode);
6058 
6059 		if (arg == NULL) {
6060 			error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6061 			if (error == 0)
6062 				(*rvalp)++;	/* Add one for sockmod */
6063 			return (error);
6064 		}
6065 
6066 		error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
6067 		    STRUCT_SIZE(ustrlist), mode & FKIOCTL);
6068 		if (error != 0)
6069 			return (error);
6070 
6071 		nmods = STRUCT_FGET(ustrlist, sl_nmods);
6072 		if (nmods <= 0)
6073 			return (EINVAL);
6074 		/*
6075 		 * Ceiling nmods at nstrpush to prevent someone from
6076 		 * maliciously consuming lots of kernel memory.
6077 		 */
6078 		nmods = MIN(nmods, nstrpush);
6079 
6080 		kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6081 		kstrlist.sl_nmods = nmods;
6082 		kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6083 
6084 		error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6085 		    cr, rvalp);
6086 		if (error != 0)
6087 			goto done;
6088 
6089 		/*
6090 		 * Considering the module list as a 0-based array of sl_nmods
6091 		 * modules, sockmod should conceptually exist at slot
6092 		 * sti_pushcnt.  Insert sockmod at this location by sliding all
6093 		 * of the module names after so_pushcnt over by one.  We know
6094 		 * that there will be room to do this since we allocated
6095 		 * sl_modlist with an additional slot.
6096 		 */
6097 		for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6098 			kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6099 
6100 		(void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6101 		kstrlist.sl_nmods++;
6102 
6103 		/*
6104 		 * Copy all of the entries out to ustrlist.
6105 		 */
6106 		kmlistp = kstrlist.sl_modlist;
6107 		umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6108 		for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6109 			error = so_copyout(kmlistp++, umlistp++,
6110 			    sizeof (struct str_mlist), mode & FKIOCTL);
6111 			if (error != 0)
6112 				goto done;
6113 		}
6114 
6115 		error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6116 		    mode & FKIOCTL);
6117 		if (error == 0)
6118 			*rvalp = 0;
6119 	done:
6120 		kmem_free(kstrlist.sl_modlist, kstrlistsize);
6121 		return (error);
6122 	}
6123 	case I_LOOK:
6124 		if (sti->sti_pushcnt == 0) {
6125 			return (so_copyout(sockmod_name, (void *)arg,
6126 			    sizeof (sockmod_name), mode & FKIOCTL));
6127 		}
6128 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6129 
6130 	case I_FIND:
6131 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6132 		if (error && error != EINVAL)
6133 			return (error);
6134 
6135 		/* if not found and string was sockmod return 1 */
6136 		if (*rvalp == 0 || error == EINVAL) {
6137 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6138 			    (void *)arg, mname, sizeof (mname), NULL);
6139 			if (error == ENAMETOOLONG)
6140 				error = EINVAL;
6141 
6142 			if (error == 0 && strcmp(mname, sockmod_name) == 0)
6143 				*rvalp = 1;
6144 		}
6145 		return (error);
6146 
6147 	default:
6148 		panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6149 		break;
6150 	}
6151 
6152 	return (0);
6153 }
6154 
6155 /*
6156  * Wrapper around the streams poll routine that implements socket poll
6157  * semantics.
6158  * The sockfs never calls pollwakeup itself - the stream head take care
6159  * of all pollwakeups. Since sockfs never holds so_lock when calling the
6160  * stream head there can never be a deadlock due to holding so_lock across
6161  * pollwakeup and acquiring so_lock in this routine.
6162  *
6163  * However, since the performance of VOP_POLL is critical we avoid
6164  * acquiring so_lock here. This is based on two assumptions:
6165  *  - The poll implementation holds locks to serialize the VOP_POLL call
6166  *    and a pollwakeup for the same pollhead. This ensures that should
6167  *    e.g. so_state change during a socktpi_poll call the pollwakeup
6168  *    (which strsock_* and strrput conspire to issue) is issued after
6169  *    the state change. Thus the pollwakeup will block until VOP_POLL has
6170  *    returned and then wake up poll and have it call VOP_POLL again.
6171  *  - The reading of so_state without holding so_lock does not result in
6172  *    stale data that is older than the latest state change that has dropped
6173  *    so_lock. This is ensured by the mutex_exit issuing the appropriate
6174  *    memory barrier to force the data into the coherency domain.
6175  */
6176 static int
6177 sotpi_poll(
6178 	struct sonode	*so,
6179 	short		events,
6180 	int		anyyet,
6181 	short		*reventsp,
6182 	struct pollhead **phpp)
6183 {
6184 	short origevents = events;
6185 	struct vnode *vp = SOTOV(so);
6186 	int error;
6187 	int so_state = so->so_state;	/* snapshot */
6188 	sotpi_info_t *sti = SOTOTPI(so);
6189 
6190 	dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6191 	    (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6192 
6193 	ASSERT(vp->v_type == VSOCK);
6194 	ASSERT(vp->v_stream != NULL);
6195 
6196 	if (so->so_version == SOV_STREAM) {
6197 		/* The imaginary "sockmod" has been popped - act as a stream */
6198 		return (strpoll(vp->v_stream, events, anyyet,
6199 		    reventsp, phpp));
6200 	}
6201 
6202 	if (!(so_state & SS_ISCONNECTED) &&
6203 	    (so->so_mode & SM_CONNREQUIRED)) {
6204 		/* Not connected yet - turn off write side events */
6205 		events &= ~(POLLOUT|POLLWRBAND);
6206 	}
6207 	/*
6208 	 * Check for errors without calling strpoll if the caller wants them.
6209 	 * In sockets the errors are represented as input/output events
6210 	 * and there is no need to ask the stream head for this information.
6211 	 */
6212 	if (so->so_error != 0 &&
6213 	    ((POLLIN|POLLRDNORM|POLLOUT) & origevents)  != 0) {
6214 		*reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6215 		return (0);
6216 	}
6217 	/*
6218 	 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6219 	 * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
6220 	 * will not trigger a POLLIN event with POLLRDDATA set.
6221 	 * The handling of urgent data (causing POLLRDBAND) is done by
6222 	 * inspecting SS_OOBPEND below.
6223 	 */
6224 	events |= POLLRDDATA;
6225 
6226 	/*
6227 	 * After shutdown(output) a stream head write error is set.
6228 	 * However, we should not return output events.
6229 	 */
6230 	events |= POLLNOERR;
6231 	error = strpoll(vp->v_stream, events, anyyet,
6232 	    reventsp, phpp);
6233 	if (error)
6234 		return (error);
6235 
6236 	ASSERT(!(*reventsp & POLLERR));
6237 
6238 	/*
6239 	 * Notes on T_CONN_IND handling for sockets.
6240 	 *
6241 	 * If strpoll() returned without events, SR_POLLIN is guaranteed
6242 	 * to be set, ensuring any subsequent strrput() runs pollwakeup().
6243 	 *
6244 	 * Since the so_lock is not held, soqueueconnind() may have run
6245 	 * and a T_CONN_IND may be waiting. We now check for any queued
6246 	 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6247 	 * to ensure poll returns.
6248 	 *
6249 	 * However:
6250 	 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6251 	 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6252 	 * the following actions will occur; taken together they ensure the
6253 	 * syscall will return.
6254 	 *
6255 	 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6256 	 *    the accept() was run on a non-blocking socket sowaitconnind()
6257 	 *    may have already returned EWOULDBLOCK, so not be waiting to
6258 	 *    process the message. Additionally socktpi_poll() has probably
6259 	 *    proceeded past the sti_conn_ind_head check below.
6260 	 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6261 	 *    this thread,  however that could occur before poll_common()
6262 	 *    has entered cv_wait.
6263 	 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6264 	 *
6265 	 * Before proceeding to cv_wait() in poll_common() for an event,
6266 	 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6267 	 * and if set, re-calls strpoll() to ensure the late arriving
6268 	 * T_CONN_IND is recognized, and pollsys() returns.
6269 	 */
6270 
6271 	if (sti->sti_conn_ind_head != NULL)
6272 		*reventsp |= (POLLIN|POLLRDNORM) & events;
6273 
6274 	if (so->so_state & SS_OOBPEND)
6275 		*reventsp |= POLLRDBAND & events;
6276 
6277 	if (sti->sti_nl7c_rcv_mp != NULL) {
6278 		*reventsp |= (POLLIN|POLLRDNORM) & events;
6279 	}
6280 	if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
6281 	    ((POLLIN|POLLRDNORM) & *reventsp)) {
6282 		sti->sti_nl7c_flags |= NL7C_POLLIN;
6283 	}
6284 
6285 	return (0);
6286 }
6287 
6288 /*ARGSUSED*/
6289 static int
6290 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6291 {
6292 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6293 	int error = 0;
6294 
6295 	error = sonode_constructor(buf, cdrarg, kmflags);
6296 	if (error != 0)
6297 		return (error);
6298 
6299 	error = i_sotpi_info_constructor(&st->st_info);
6300 	if (error != 0)
6301 		sonode_destructor(buf, cdrarg);
6302 
6303 	st->st_sonode.so_priv = &st->st_info;
6304 
6305 	return (error);
6306 }
6307 
6308 /*ARGSUSED1*/
6309 static void
6310 socktpi_destructor(void *buf, void *cdrarg)
6311 {
6312 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6313 
6314 	ASSERT(st->st_sonode.so_priv == &st->st_info);
6315 	st->st_sonode.so_priv = NULL;
6316 
6317 	i_sotpi_info_destructor(&st->st_info);
6318 	sonode_destructor(buf, cdrarg);
6319 }
6320 
6321 static int
6322 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6323 {
6324 	int retval;
6325 
6326 	if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6327 		struct sonode *so = (struct sonode *)buf;
6328 		sotpi_info_t *sti = SOTOTPI(so);
6329 
6330 		mutex_enter(&socklist.sl_lock);
6331 
6332 		sti->sti_next_so = socklist.sl_list;
6333 		sti->sti_prev_so = NULL;
6334 		if (sti->sti_next_so != NULL)
6335 			SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6336 		socklist.sl_list = so;
6337 
6338 		mutex_exit(&socklist.sl_lock);
6339 
6340 	}
6341 	return (retval);
6342 }
6343 
6344 static void
6345 socktpi_unix_destructor(void *buf, void *cdrarg)
6346 {
6347 	struct sonode	*so = (struct sonode *)buf;
6348 	sotpi_info_t	*sti = SOTOTPI(so);
6349 
6350 	mutex_enter(&socklist.sl_lock);
6351 
6352 	if (sti->sti_next_so != NULL)
6353 		SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6354 	if (sti->sti_prev_so != NULL)
6355 		SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6356 	else
6357 		socklist.sl_list = sti->sti_next_so;
6358 
6359 	mutex_exit(&socklist.sl_lock);
6360 
6361 	socktpi_destructor(buf, cdrarg);
6362 }
6363 
6364 int
6365 socktpi_init(void)
6366 {
6367 	/*
6368 	 * Create sonode caches.  We create a special one for AF_UNIX so
6369 	 * that we can track them for netstat(1m).
6370 	 */
6371 	socktpi_cache = kmem_cache_create("socktpi_cache",
6372 	    sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6373 	    socktpi_destructor, NULL, NULL, NULL, 0);
6374 
6375 	socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6376 	    sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6377 	    socktpi_unix_destructor, NULL, NULL, NULL, 0);
6378 
6379 	return (0);
6380 }
6381 
6382 /*
6383  * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6384  *
6385  * Caller must still update state and mode using sotpi_update_state().
6386  */
6387 int
6388 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6389     boolean_t *direct, queue_t **qp, struct cred *cr)
6390 {
6391 	sotpi_info_t *sti;
6392 	struct sockparams *origsp = so->so_sockparams;
6393 	sock_lower_handle_t handle = so->so_proto_handle;
6394 	struct stdata *stp;
6395 	struct vnode *vp;
6396 	queue_t *q;
6397 	int error = 0;
6398 
6399 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6400 	    SS_FALLBACK_PENDING);
6401 	ASSERT(SOCK_IS_NONSTR(so));
6402 
6403 	*qp = NULL;
6404 	*direct = B_FALSE;
6405 	so->so_sockparams = newsp;
6406 	/*
6407 	 * Allocate and initalize fields required by TPI.
6408 	 */
6409 	(void) sotpi_info_create(so, KM_SLEEP);
6410 	sotpi_info_init(so);
6411 
6412 	if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6413 		sotpi_info_fini(so);
6414 		sotpi_info_destroy(so);
6415 		return (error);
6416 	}
6417 	ASSERT(handle == so->so_proto_handle);
6418 	sti = SOTOTPI(so);
6419 	if (sti->sti_direct != 0)
6420 		*direct = B_TRUE;
6421 
6422 	/*
6423 	 * Keep the original sp around so we can properly dispose of the
6424 	 * sonode when the socket is being closed.
6425 	 */
6426 	sti->sti_orig_sp = origsp;
6427 
6428 	so_basic_strinit(so);	/* skips the T_CAPABILITY_REQ */
6429 	so_alloc_addr(so, so->so_max_addr_len);
6430 
6431 	/*
6432 	 * If the application has done a SIOCSPGRP, make sure the
6433 	 * STREAM head is aware. This needs to take place before
6434 	 * the protocol start sending up messages. Otherwise we
6435 	 * might miss to generate SIGPOLL.
6436 	 *
6437 	 * It is possible that the application will receive duplicate
6438 	 * signals if some were already generated for either data or
6439 	 * connection indications.
6440 	 */
6441 	if (so->so_pgrp != 0) {
6442 		if (so_set_events(so, so->so_vnode, cr) != 0)
6443 			so->so_pgrp = 0;
6444 	}
6445 
6446 	/*
6447 	 * Determine which queue to use.
6448 	 */
6449 	vp = SOTOV(so);
6450 	stp = vp->v_stream;
6451 	ASSERT(stp != NULL);
6452 	q = stp->sd_wrq->q_next;
6453 
6454 	/*
6455 	 * Skip any modules that may have been auto pushed when the device
6456 	 * was opened
6457 	 */
6458 	while (q->q_next != NULL)
6459 		q = q->q_next;
6460 	*qp = _RD(q);
6461 
6462 	/* This is now a STREAMS sockets */
6463 	so->so_not_str = B_FALSE;
6464 
6465 	return (error);
6466 }
6467 
6468 /*
6469  * Revert a TPI sonode. It is only allowed to revert the sonode during
6470  * the fallback process.
6471  */
6472 void
6473 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6474 {
6475 	vnode_t *vp = SOTOV(so);
6476 
6477 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6478 	    SS_FALLBACK_PENDING);
6479 	ASSERT(!SOCK_IS_NONSTR(so));
6480 	ASSERT(vp->v_stream != NULL);
6481 
6482 	strclean(vp);
6483 	(void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6484 
6485 	/*
6486 	 * Restore the original sockparams. The caller is responsible for
6487 	 * dropping the ref to the new sp.
6488 	 */
6489 	so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6490 
6491 	sotpi_info_fini(so);
6492 	sotpi_info_destroy(so);
6493 
6494 	/* This is no longer a STREAMS sockets */
6495 	so->so_not_str = B_TRUE;
6496 }
6497 
6498 void
6499 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6500     struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6501     socklen_t faddrlen, short opts)
6502 {
6503 	sotpi_info_t *sti = SOTOTPI(so);
6504 
6505 	so_proc_tcapability_ack(so, tcap);
6506 
6507 	so->so_options |= opts;
6508 
6509 	/*
6510 	 * Determine whether the foreign and local address are valid
6511 	 */
6512 	if (laddrlen != 0) {
6513 		ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6514 		sti->sti_laddr_len = laddrlen;
6515 		bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6516 		sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6517 	}
6518 
6519 	if (faddrlen != 0) {
6520 		ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6521 		sti->sti_faddr_len = faddrlen;
6522 		bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6523 		sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6524 	}
6525 
6526 }
6527 
6528 /*
6529  * Allocate enough space to cache the local and foreign addresses.
6530  */
6531 void
6532 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6533 {
6534 	sotpi_info_t *sti = SOTOTPI(so);
6535 
6536 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6537 	ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6538 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6539 	    P2ROUNDUP(maxlen, KMEM_ALIGN);
6540 	so->so_max_addr_len = sti->sti_laddr_maxlen;
6541 	sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6542 	sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6543 	    + sti->sti_laddr_maxlen);
6544 
6545 	if (so->so_family == AF_UNIX) {
6546 		/*
6547 		 * Initialize AF_UNIX related fields.
6548 		 */
6549 		bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6550 		bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6551 	}
6552 }
6553 
6554 
6555 sotpi_info_t *
6556 sotpi_sototpi(struct sonode *so)
6557 {
6558 	sotpi_info_t *sti;
6559 
6560 	ASSERT(so != NULL);
6561 
6562 	sti = (sotpi_info_t *)so->so_priv;
6563 
6564 	ASSERT(sti != NULL);
6565 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6566 
6567 	return (sti);
6568 }
6569 
6570 static int
6571 i_sotpi_info_constructor(sotpi_info_t *sti)
6572 {
6573 	sti->sti_magic		= SOTPI_INFO_MAGIC;
6574 	sti->sti_ack_mp		= NULL;
6575 	sti->sti_discon_ind_mp	= NULL;
6576 	sti->sti_ux_bound_vp	= NULL;
6577 	sti->sti_unbind_mp	= NULL;
6578 
6579 	sti->sti_conn_ind_head	= NULL;
6580 	sti->sti_conn_ind_tail	= NULL;
6581 
6582 	sti->sti_laddr_sa	= NULL;
6583 	sti->sti_faddr_sa	= NULL;
6584 
6585 	sti->sti_nl7c_flags	= 0;
6586 	sti->sti_nl7c_uri	= NULL;
6587 	sti->sti_nl7c_rcv_mp	= NULL;
6588 
6589 	mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6590 	cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6591 
6592 	return (0);
6593 }
6594 
6595 static void
6596 i_sotpi_info_destructor(sotpi_info_t *sti)
6597 {
6598 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6599 	ASSERT(sti->sti_ack_mp == NULL);
6600 	ASSERT(sti->sti_discon_ind_mp == NULL);
6601 	ASSERT(sti->sti_ux_bound_vp == NULL);
6602 	ASSERT(sti->sti_unbind_mp == NULL);
6603 
6604 	ASSERT(sti->sti_conn_ind_head == NULL);
6605 	ASSERT(sti->sti_conn_ind_tail == NULL);
6606 
6607 	ASSERT(sti->sti_laddr_sa == NULL);
6608 	ASSERT(sti->sti_faddr_sa == NULL);
6609 
6610 	ASSERT(sti->sti_nl7c_flags == 0);
6611 	ASSERT(sti->sti_nl7c_uri == NULL);
6612 	ASSERT(sti->sti_nl7c_rcv_mp == NULL);
6613 
6614 	mutex_destroy(&sti->sti_plumb_lock);
6615 	cv_destroy(&sti->sti_ack_cv);
6616 }
6617 
6618 /*
6619  * Creates and attaches TPI information to the given sonode
6620  */
6621 static boolean_t
6622 sotpi_info_create(struct sonode *so, int kmflags)
6623 {
6624 	sotpi_info_t *sti;
6625 
6626 	ASSERT(so->so_priv == NULL);
6627 
6628 	if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6629 		return (B_FALSE);
6630 
6631 	if (i_sotpi_info_constructor(sti) != 0) {
6632 		kmem_free(sti, sizeof (*sti));
6633 		return (B_FALSE);
6634 	}
6635 
6636 	so->so_priv = (void *)sti;
6637 	return (B_TRUE);
6638 }
6639 
6640 /*
6641  * Initializes the TPI information.
6642  */
6643 static void
6644 sotpi_info_init(struct sonode *so)
6645 {
6646 	struct vnode *vp = SOTOV(so);
6647 	sotpi_info_t *sti = SOTOTPI(so);
6648 	time_t now;
6649 
6650 	sti->sti_dev 	= so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6651 	vp->v_rdev	= sti->sti_dev;
6652 
6653 	sti->sti_orig_sp = NULL;
6654 
6655 	sti->sti_pushcnt = 0;
6656 
6657 	now = gethrestime_sec();
6658 	sti->sti_atime	= now;
6659 	sti->sti_mtime	= now;
6660 	sti->sti_ctime	= now;
6661 
6662 	sti->sti_eaddr_mp = NULL;
6663 	sti->sti_delayed_error = 0;
6664 
6665 	sti->sti_provinfo = NULL;
6666 
6667 	sti->sti_oobcnt = 0;
6668 	sti->sti_oobsigcnt = 0;
6669 
6670 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6671 
6672 	sti->sti_laddr_sa	= 0;
6673 	sti->sti_faddr_sa	= 0;
6674 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6675 	sti->sti_laddr_len = sti->sti_faddr_len = 0;
6676 
6677 	sti->sti_laddr_valid = 0;
6678 	sti->sti_faddr_valid = 0;
6679 	sti->sti_faddr_noxlate = 0;
6680 
6681 	sti->sti_direct = 0;
6682 
6683 	ASSERT(sti->sti_ack_mp == NULL);
6684 	ASSERT(sti->sti_ux_bound_vp == NULL);
6685 	ASSERT(sti->sti_unbind_mp == NULL);
6686 
6687 	ASSERT(sti->sti_conn_ind_head == NULL);
6688 	ASSERT(sti->sti_conn_ind_tail == NULL);
6689 }
6690 
6691 /*
6692  * Given a sonode, grab the TPI info and free any data.
6693  */
6694 static void
6695 sotpi_info_fini(struct sonode *so)
6696 {
6697 	sotpi_info_t *sti = SOTOTPI(so);
6698 	mblk_t *mp;
6699 
6700 	ASSERT(sti->sti_discon_ind_mp == NULL);
6701 
6702 	if ((mp = sti->sti_conn_ind_head) != NULL) {
6703 		mblk_t *mp1;
6704 
6705 		while (mp) {
6706 			mp1 = mp->b_next;
6707 			mp->b_next = NULL;
6708 			freemsg(mp);
6709 			mp = mp1;
6710 		}
6711 		sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
6712 	}
6713 
6714 	/*
6715 	 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
6716 	 * indirect them.  It also uses so_count as a validity test.
6717 	 */
6718 	mutex_enter(&so->so_lock);
6719 
6720 	if (sti->sti_laddr_sa) {
6721 		ASSERT((caddr_t)sti->sti_faddr_sa ==
6722 		    (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
6723 		ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
6724 		sti->sti_laddr_valid = 0;
6725 		sti->sti_faddr_valid = 0;
6726 		kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
6727 		sti->sti_laddr_sa = NULL;
6728 		sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
6729 		sti->sti_faddr_sa = NULL;
6730 		sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
6731 	}
6732 
6733 	mutex_exit(&so->so_lock);
6734 
6735 	if ((mp = sti->sti_eaddr_mp) != NULL) {
6736 		freemsg(mp);
6737 		sti->sti_eaddr_mp = NULL;
6738 		sti->sti_delayed_error = 0;
6739 	}
6740 
6741 	if ((mp = sti->sti_ack_mp) != NULL) {
6742 		freemsg(mp);
6743 		sti->sti_ack_mp = NULL;
6744 	}
6745 
6746 	if ((mp = sti->sti_nl7c_rcv_mp) != NULL) {
6747 		sti->sti_nl7c_rcv_mp = NULL;
6748 		freemsg(mp);
6749 	}
6750 	sti->sti_nl7c_rcv_rval = 0;
6751 	if (sti->sti_nl7c_uri != NULL) {
6752 		nl7c_urifree(so);
6753 		/* urifree() cleared nl7c_uri */
6754 	}
6755 	if (sti->sti_nl7c_flags) {
6756 		sti->sti_nl7c_flags = 0;
6757 	}
6758 
6759 	ASSERT(sti->sti_ux_bound_vp == NULL);
6760 	if ((mp = sti->sti_unbind_mp) != NULL) {
6761 		freemsg(mp);
6762 		sti->sti_unbind_mp = NULL;
6763 	}
6764 }
6765 
6766 /*
6767  * Destroys the TPI information attached to a sonode.
6768  */
6769 static void
6770 sotpi_info_destroy(struct sonode *so)
6771 {
6772 	sotpi_info_t *sti = SOTOTPI(so);
6773 
6774 	i_sotpi_info_destructor(sti);
6775 	kmem_free(sti, sizeof (*sti));
6776 
6777 	so->so_priv = NULL;
6778 }
6779 
6780 /*
6781  * Create the global sotpi socket module entry. It will never be freed.
6782  */
6783 smod_info_t *
6784 sotpi_smod_create(void)
6785 {
6786 	smod_info_t *smodp;
6787 
6788 	smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
6789 	smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
6790 	(void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
6791 	/*
6792 	 * Initialize the smod_refcnt to 1 so it will never be freed.
6793 	 */
6794 	smodp->smod_refcnt = 1;
6795 	smodp->smod_uc_version = SOCK_UC_VERSION;
6796 	smodp->smod_dc_version = SOCK_DC_VERSION;
6797 	smodp->smod_sock_create_func = &sotpi_create;
6798 	smodp->smod_sock_destroy_func = &sotpi_destroy;
6799 	return (smodp);
6800 }
6801