xref: /illumos-gate/usr/src/uts/common/fs/sockfs/socktpi.c (revision 46b592853d0f4f11781b6b0a7533f267c6aee132)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/t_lock.h>
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/buf.h>
32 #include <sys/conf.h>
33 #include <sys/cred.h>
34 #include <sys/kmem.h>
35 #include <sys/kmem_impl.h>
36 #include <sys/sysmacros.h>
37 #include <sys/vfs.h>
38 #include <sys/vnode.h>
39 #include <sys/debug.h>
40 #include <sys/errno.h>
41 #include <sys/time.h>
42 #include <sys/file.h>
43 #include <sys/open.h>
44 #include <sys/user.h>
45 #include <sys/termios.h>
46 #include <sys/stream.h>
47 #include <sys/strsubr.h>
48 #include <sys/strsun.h>
49 #include <sys/suntpi.h>
50 #include <sys/ddi.h>
51 #include <sys/esunddi.h>
52 #include <sys/flock.h>
53 #include <sys/modctl.h>
54 #include <sys/vtrace.h>
55 #include <sys/cmn_err.h>
56 #include <sys/pathname.h>
57 
58 #include <sys/socket.h>
59 #include <sys/socketvar.h>
60 #include <sys/sockio.h>
61 #include <netinet/in.h>
62 #include <sys/un.h>
63 #include <sys/strsun.h>
64 
65 #include <sys/tiuser.h>
66 #define	_SUN_TPI_VERSION	2
67 #include <sys/tihdr.h>
68 #include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
69 
70 #include <c2/audit.h>
71 
72 #include <inet/common.h>
73 #include <inet/ip.h>
74 #include <inet/ip6.h>
75 #include <inet/tcp.h>
76 #include <inet/udp_impl.h>
77 
78 #include <sys/zone.h>
79 
80 #include <fs/sockfs/nl7c.h>
81 #include <fs/sockfs/nl7curi.h>
82 
83 #include <inet/kssl/ksslapi.h>
84 
85 #include <fs/sockfs/sockcommon.h>
86 #include <fs/sockfs/socktpi.h>
87 #include <fs/sockfs/socktpi_impl.h>
88 
89 /*
90  * Possible failures when memory can't be allocated. The documented behavior:
91  *
92  * 		5.5:			4.X:		XNET:
93  * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
94  *							EINTR
95  *	(4.X does not document EINTR but returns it)
96  * bind:	ENOSR			-		ENOBUFS/ENOSR
97  * connect: 	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
98  * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
99  * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
100  *	(4.X getpeername and getsockname do not fail in practice)
101  * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
102  * listen:	-			-		ENOBUFS
103  * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
104  *							EINTR
105  * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
106  *							EINTR
107  * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
108  * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
109  * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
110  * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
111  *
112  * Resolution. When allocation fails:
113  *	recv: return EINTR
114  *	send: return EINTR
115  *	connect, accept: EINTR
116  *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
117  *	socket, socketpair: ENOBUFS
118  *	getpeername, getsockname: sleep
119  *	getsockopt, setsockopt: sleep
120  */
121 
122 #ifdef SOCK_TEST
123 /*
124  * Variables that make sockfs do something other than the standard TPI
125  * for the AF_INET transports.
126  *
127  * solisten_tpi_tcp:
128  *	TCP can handle a O_T_BIND_REQ with an increased backlog even though
129  *	the transport is already bound. This is needed to avoid loosing the
130  *	port number should listen() do a T_UNBIND_REQ followed by a
131  *	O_T_BIND_REQ.
132  *
133  * soconnect_tpi_udp:
134  *	UDP and ICMP can handle a T_CONN_REQ.
135  *	This is needed to make the sequence of connect(), getsockname()
136  *	return the local IP address used to send packets to the connected to
137  *	destination.
138  *
139  * soconnect_tpi_tcp:
140  *	TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
141  *	Set this to non-zero to send TPI conformant messages to TCP in this
142  *	respect. This is a performance optimization.
143  *
144  * soaccept_tpi_tcp:
145  *	TCP can handle a T_CONN_REQ without the acceptor being bound.
146  *	This is a performance optimization that has been picked up in XTI.
147  *
148  * soaccept_tpi_multioptions:
149  *	When inheriting SOL_SOCKET options from the listener to the accepting
150  *	socket send them as a single message for AF_INET{,6}.
151  */
152 int solisten_tpi_tcp = 0;
153 int soconnect_tpi_udp = 0;
154 int soconnect_tpi_tcp = 0;
155 int soaccept_tpi_tcp = 0;
156 int soaccept_tpi_multioptions = 1;
157 #else /* SOCK_TEST */
158 #define	soconnect_tpi_tcp	0
159 #define	soconnect_tpi_udp	0
160 #define	solisten_tpi_tcp	0
161 #define	soaccept_tpi_tcp	0
162 #define	soaccept_tpi_multioptions	1
163 #endif /* SOCK_TEST */
164 
165 #ifdef SOCK_TEST
166 extern int do_useracc;
167 extern clock_t sock_test_timelimit;
168 #endif /* SOCK_TEST */
169 
170 /*
171  * Some X/Open added checks might have to be backed out to keep SunOS 4.X
172  * applications working. Turn on this flag to disable these checks.
173  */
174 int xnet_skip_checks = 0;
175 int xnet_check_print = 0;
176 int xnet_truncate_print = 0;
177 
178 static void sotpi_destroy(struct sonode *);
179 static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
180     int, int *, cred_t *cr);
181 
182 static boolean_t	sotpi_info_create(struct sonode *, int);
183 static void		sotpi_info_init(struct sonode *);
184 static void 		sotpi_info_fini(struct sonode *);
185 static void 		sotpi_info_destroy(struct sonode *);
186 
187 /*
188  * Do direct function call to the transport layer below; this would
189  * also allow the transport to utilize read-side synchronous stream
190  * interface if necessary.  This is a /etc/system tunable that must
191  * not be modified on a running system.  By default this is enabled
192  * for performance reasons and may be disabled for debugging purposes.
193  */
194 boolean_t socktpi_direct = B_TRUE;
195 
196 static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
197 
198 extern	void sigintr(k_sigset_t *, int);
199 extern	void sigunintr(k_sigset_t *);
200 
201 /* Sockets acting as an in-kernel SSL proxy */
202 extern mblk_t	*strsock_kssl_input(vnode_t *, mblk_t *, strwakeup_t *,
203 		    strsigset_t *, strsigset_t *, strpollset_t *);
204 extern mblk_t	*strsock_kssl_output(vnode_t *, mblk_t *, strwakeup_t *,
205 		    strsigset_t *, strsigset_t *, strpollset_t *);
206 
207 static int	sotpi_unbind(struct sonode *, int);
208 
209 /* TPI sockfs sonode operations */
210 int 		sotpi_init(struct sonode *, struct sonode *, struct cred *,
211 		    int);
212 static int	sotpi_accept(struct sonode *, int, struct cred *,
213 		    struct sonode **);
214 static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
215 		    int, struct cred *);
216 static int	sotpi_listen(struct sonode *, int, struct cred *);
217 static int	sotpi_connect(struct sonode *, const struct sockaddr *,
218 		    socklen_t, int, int, struct cred *);
219 extern int	sotpi_recvmsg(struct sonode *, struct nmsghdr *,
220 		    struct uio *, struct cred *);
221 static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
222 		    struct uio *, struct cred *);
223 static int	sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
224 		    struct cred *, mblk_t **);
225 static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
226 		    struct uio *, void *, t_uscalar_t, int);
227 static int	sodgram_direct(struct sonode *, struct sockaddr *,
228 		    socklen_t, struct uio *, int);
229 extern int	sotpi_getpeername(struct sonode *, struct sockaddr *,
230 		    socklen_t *, boolean_t, struct cred *);
231 static int	sotpi_getsockname(struct sonode *, struct sockaddr *,
232 		    socklen_t *, struct cred *);
233 static int	sotpi_shutdown(struct sonode *, int, struct cred *);
234 extern int	sotpi_getsockopt(struct sonode *, int, int, void *,
235 		    socklen_t *, int, struct cred *);
236 extern int	sotpi_setsockopt(struct sonode *, int, int, const void *,
237 		    socklen_t, struct cred *);
238 static int 	sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
239 		    int32_t *);
240 static int 	socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
241 		    struct cred *, int32_t *);
242 static int 	sotpi_poll(struct sonode *, short, int, short *,
243 		    struct pollhead **);
244 static int 	sotpi_close(struct sonode *, int, struct cred *);
245 
246 static int	i_sotpi_info_constructor(sotpi_info_t *);
247 static void 	i_sotpi_info_destructor(sotpi_info_t *);
248 
249 sonodeops_t sotpi_sonodeops = {
250 	sotpi_init,		/* sop_init		*/
251 	sotpi_accept,		/* sop_accept		*/
252 	sotpi_bind,		/* sop_bind		*/
253 	sotpi_listen,		/* sop_listen		*/
254 	sotpi_connect,		/* sop_connect		*/
255 	sotpi_recvmsg,		/* sop_recvmsg		*/
256 	sotpi_sendmsg,		/* sop_sendmsg		*/
257 	sotpi_sendmblk,		/* sop_sendmblk		*/
258 	sotpi_getpeername,	/* sop_getpeername	*/
259 	sotpi_getsockname,	/* sop_getsockname	*/
260 	sotpi_shutdown,		/* sop_shutdown		*/
261 	sotpi_getsockopt,	/* sop_getsockopt	*/
262 	sotpi_setsockopt,	/* sop_setsockopt	*/
263 	sotpi_ioctl,		/* sop_ioctl		*/
264 	sotpi_poll,		/* sop_poll		*/
265 	sotpi_close,		/* sop_close		*/
266 };
267 
268 /*
269  * Return a TPI socket vnode.
270  *
271  * Note that sockets assume that the driver will clone (either itself
272  * or by using the clone driver) i.e. a socket() call will always
273  * result in a new vnode being created.
274  */
275 
276 /*
277  * Common create code for socket and accept. If tso is set the values
278  * from that node is used instead of issuing a T_INFO_REQ.
279  */
280 
281 /* ARGSUSED */
282 static struct sonode *
283 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
284     int version, int sflags, int *errorp, cred_t *cr)
285 {
286 	struct sonode	*so;
287 	kmem_cache_t 	*cp;
288 	int		sfamily = family;
289 
290 	ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
291 
292 	if (family == AF_NCA) {
293 		/*
294 		 * The request is for an NCA socket so for NL7C use the
295 		 * INET domain instead and mark NL7C_AF_NCA below.
296 		 */
297 		family = AF_INET;
298 		/*
299 		 * NL7C is not supported in the non-global zone,
300 		 * we enforce this restriction here.
301 		 */
302 		if (getzoneid() != GLOBAL_ZONEID) {
303 			*errorp = ENOTSUP;
304 			return (NULL);
305 		}
306 	}
307 
308 	/*
309 	 * to be compatible with old tpi socket implementation ignore
310 	 * sleep flag (sflags) passed in
311 	 */
312 	cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
313 	so = kmem_cache_alloc(cp, KM_SLEEP);
314 	if (so == NULL) {
315 		*errorp = ENOMEM;
316 		return (NULL);
317 	}
318 
319 	sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
320 	sotpi_info_init(so);
321 
322 	if (sfamily == AF_NCA) {
323 		SOTOTPI(so)->sti_nl7c_flags = NL7C_AF_NCA;
324 	}
325 
326 	if (version == SOV_DEFAULT)
327 		version = so_default_version;
328 
329 	so->so_version = (short)version;
330 	*errorp = 0;
331 
332 	return (so);
333 }
334 
335 static void
336 sotpi_destroy(struct sonode *so)
337 {
338 	kmem_cache_t *cp;
339 	struct sockparams *origsp;
340 
341 	/*
342 	 * If there is a new dealloc function (ie. smod_destroy_func),
343 	 * then it should check the correctness of the ops.
344 	 */
345 
346 	ASSERT(so->so_ops == &sotpi_sonodeops);
347 
348 	origsp = SOTOTPI(so)->sti_orig_sp;
349 
350 	sotpi_info_fini(so);
351 
352 	if (so->so_state & SS_FALLBACK_COMP) {
353 		/*
354 		 * A fallback happend, which means that a sotpi_info_t struct
355 		 * was allocated (as opposed to being allocated from the TPI
356 		 * sonode cache. Therefore we explicitly free the struct
357 		 * here.
358 		 */
359 		sotpi_info_destroy(so);
360 		ASSERT(origsp != NULL);
361 
362 		origsp->sp_smod_info->smod_sock_destroy_func(so);
363 		SOCKPARAMS_DEC_REF(origsp);
364 	} else {
365 		sonode_fini(so);
366 		cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
367 		    socktpi_cache;
368 		kmem_cache_free(cp, so);
369 	}
370 }
371 
372 /* ARGSUSED1 */
373 int
374 sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
375 {
376 	major_t maj;
377 	dev_t newdev;
378 	struct vnode *vp;
379 	int error = 0;
380 	struct stdata *stp;
381 
382 	sotpi_info_t *sti = SOTOTPI(so);
383 
384 	dprint(1, ("sotpi_init()\n"));
385 
386 	/*
387 	 * over write the sleep flag passed in but that is ok
388 	 * as tpi socket does not honor sleep flag.
389 	 */
390 	flags |= FREAD|FWRITE;
391 
392 	/*
393 	 * Record in so_flag that it is a clone.
394 	 */
395 	if (getmajor(sti->sti_dev) == clone_major)
396 		so->so_flag |= SOCLONE;
397 
398 	if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
399 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
400 	    (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
401 	    so->so_protocol == IPPROTO_IP)) {
402 		/* Tell tcp or udp that it's talking to sockets */
403 		flags |= SO_SOCKSTR;
404 
405 		/*
406 		 * Here we indicate to socktpi_open() our attempt to
407 		 * make direct calls between sockfs and transport.
408 		 * The final decision is left to socktpi_open().
409 		 */
410 		sti->sti_direct = 1;
411 
412 		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
413 		if (so->so_type == SOCK_STREAM && tso != NULL) {
414 			if (SOTOTPI(tso)->sti_direct) {
415 				/*
416 				 * Inherit sti_direct from listener and pass
417 				 * SO_ACCEPTOR open flag to tcp, indicating
418 				 * that this is an accept fast-path instance.
419 				 */
420 				flags |= SO_ACCEPTOR;
421 			} else {
422 				/*
423 				 * sti_direct is not set on listener, meaning
424 				 * that the listener has been converted from
425 				 * a socket to a stream.  Ensure that the
426 				 * acceptor inherits these settings.
427 				 */
428 				sti->sti_direct = 0;
429 				flags &= ~SO_SOCKSTR;
430 			}
431 		}
432 	}
433 
434 	/*
435 	 * Tell local transport that it is talking to sockets.
436 	 */
437 	if (so->so_family == AF_UNIX) {
438 		flags |= SO_SOCKSTR;
439 	}
440 
441 	vp = SOTOV(so);
442 	newdev = vp->v_rdev;
443 	maj = getmajor(newdev);
444 	ASSERT(STREAMSTAB(maj));
445 
446 	error = stropen(vp, &newdev, flags, cr);
447 
448 	stp = vp->v_stream;
449 	if (error == 0) {
450 		if (so->so_flag & SOCLONE)
451 			ASSERT(newdev != vp->v_rdev);
452 		mutex_enter(&so->so_lock);
453 		sti->sti_dev = newdev;
454 		vp->v_rdev = newdev;
455 		mutex_exit(&so->so_lock);
456 
457 		if (stp->sd_flag & STRISTTY) {
458 			/*
459 			 * this is a post SVR4 tty driver - a socket can not
460 			 * be a controlling terminal. Fail the open.
461 			 */
462 			(void) sotpi_close(so, flags, cr);
463 			return (ENOTTY);	/* XXX */
464 		}
465 
466 		ASSERT(stp->sd_wrq != NULL);
467 		sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
468 
469 		/*
470 		 * If caller is interested in doing direct function call
471 		 * interface to/from transport module, probe the module
472 		 * directly beneath the streamhead to see if it qualifies.
473 		 *
474 		 * We turn off the direct interface when qualifications fail.
475 		 * In the acceptor case, we simply turn off the sti_direct
476 		 * flag on the socket. We do the fallback after the accept
477 		 * has completed, before the new socket is returned to the
478 		 * application.
479 		 */
480 		if (sti->sti_direct) {
481 			queue_t *tq = stp->sd_wrq->q_next;
482 
483 			/*
484 			 * sti_direct is currently supported and tested
485 			 * only for tcp/udp; this is the main reason to
486 			 * have the following assertions.
487 			 */
488 			ASSERT(so->so_family == AF_INET ||
489 			    so->so_family == AF_INET6);
490 			ASSERT(so->so_protocol == IPPROTO_UDP ||
491 			    so->so_protocol == IPPROTO_TCP ||
492 			    so->so_protocol == IPPROTO_IP);
493 			ASSERT(so->so_type == SOCK_DGRAM ||
494 			    so->so_type == SOCK_STREAM);
495 
496 			/*
497 			 * Abort direct call interface if the module directly
498 			 * underneath the stream head is not defined with the
499 			 * _D_DIRECT flag.  This could happen in the tcp or
500 			 * udp case, when some other module is autopushed
501 			 * above it, or for some reasons the expected module
502 			 * isn't purely D_MP (which is the main requirement).
503 			 */
504 			if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
505 			    !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
506 				int rval;
507 
508 				/* Continue on without direct calls */
509 				sti->sti_direct = 0;
510 
511 				/*
512 				 * Cannot issue ioctl on fallback socket since
513 				 * there is no conn associated with the queue.
514 				 * The fallback downcall will notify the proto
515 				 * of the change.
516 				 */
517 				if (!(flags & SO_ACCEPTOR) &&
518 				    !(flags & SO_FALLBACK)) {
519 					if ((error = strioctl(vp,
520 					    _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
521 					    cr, &rval)) != 0) {
522 						(void) sotpi_close(so, flags,
523 						    cr);
524 						return (error);
525 					}
526 				}
527 			}
528 		}
529 
530 		if (flags & SO_FALLBACK) {
531 			/*
532 			 * The stream created does not have a conn.
533 			 * do stream set up after conn has been assigned
534 			 */
535 			return (error);
536 		}
537 		if (error = so_strinit(so, tso)) {
538 			(void) sotpi_close(so, flags, cr);
539 			return (error);
540 		}
541 
542 		/* Wildcard */
543 		if (so->so_protocol != so->so_sockparams->sp_protocol) {
544 			int protocol = so->so_protocol;
545 			/*
546 			 * Issue SO_PROTOTYPE setsockopt.
547 			 */
548 			error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
549 			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
550 			if (error != 0) {
551 				(void) sotpi_close(so, flags, cr);
552 				/*
553 				 * Setsockopt often fails with ENOPROTOOPT but
554 				 * socket() should fail with
555 				 * EPROTONOSUPPORT/EPROTOTYPE.
556 				 */
557 				return (EPROTONOSUPPORT);
558 			}
559 		}
560 
561 	} else {
562 		/*
563 		 * While the same socket can not be reopened (unlike specfs)
564 		 * the stream head sets STREOPENFAIL when the autopush fails.
565 		 */
566 		if ((stp != NULL) &&
567 		    (stp->sd_flag & STREOPENFAIL)) {
568 			/*
569 			 * Open failed part way through.
570 			 */
571 			mutex_enter(&stp->sd_lock);
572 			stp->sd_flag &= ~STREOPENFAIL;
573 			mutex_exit(&stp->sd_lock);
574 			(void) sotpi_close(so, flags, cr);
575 			return (error);
576 			/*NOTREACHED*/
577 		}
578 		ASSERT(stp == NULL);
579 	}
580 	TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
581 	    "sockfs open:maj %d vp %p so %p error %d",
582 	    maj, vp, so, error);
583 	return (error);
584 }
585 
586 /*
587  * Bind the socket to an unspecified address in sockfs only.
588  * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
589  * required in all cases.
590  */
591 static void
592 so_automatic_bind(struct sonode *so)
593 {
594 	sotpi_info_t *sti = SOTOTPI(so);
595 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
596 
597 	ASSERT(MUTEX_HELD(&so->so_lock));
598 	ASSERT(!(so->so_state & SS_ISBOUND));
599 	ASSERT(sti->sti_unbind_mp);
600 
601 	ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
602 	bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
603 	sti->sti_laddr_sa->sa_family = so->so_family;
604 	so->so_state |= SS_ISBOUND;
605 }
606 
607 
608 /*
609  * bind the socket.
610  *
611  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
612  * are passed in we allow rebinding. Note that for backwards compatibility
613  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
614  * Thus the rebinding code is currently not executed.
615  *
616  * The constraints for rebinding are:
617  * - it is a SOCK_DGRAM, or
618  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
619  *   and no listen() has been done.
620  * This rebinding code was added based on some language in the XNET book
621  * about not returning EINVAL it the protocol allows rebinding. However,
622  * this language is not present in the Posix socket draft. Thus maybe the
623  * rebinding logic should be deleted from the source.
624  *
625  * A null "name" can be used to unbind the socket if:
626  * - it is a SOCK_DGRAM, or
627  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
628  *   and no listen() has been done.
629  */
630 /* ARGSUSED */
631 static int
632 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
633     socklen_t namelen, int backlog, int flags, struct cred *cr)
634 {
635 	struct T_bind_req	bind_req;
636 	struct T_bind_ack	*bind_ack;
637 	int			error = 0;
638 	mblk_t			*mp;
639 	void			*addr;
640 	t_uscalar_t		addrlen;
641 	int			unbind_on_err = 1;
642 	boolean_t		clear_acceptconn_on_err = B_FALSE;
643 	boolean_t		restore_backlog_on_err = B_FALSE;
644 	int			save_so_backlog;
645 	t_scalar_t		PRIM_type = O_T_BIND_REQ;
646 	boolean_t		tcp_udp_xport;
647 	void			*nl7c = NULL;
648 	sotpi_info_t		*sti = SOTOTPI(so);
649 
650 	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
651 	    (void *)so, (void *)name, namelen, backlog, flags,
652 	    pr_state(so->so_state, so->so_mode)));
653 
654 	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
655 
656 	if (!(flags & _SOBIND_LOCK_HELD)) {
657 		mutex_enter(&so->so_lock);
658 		so_lock_single(so);	/* Set SOLOCKED */
659 	} else {
660 		ASSERT(MUTEX_HELD(&so->so_lock));
661 		ASSERT(so->so_flag & SOLOCKED);
662 	}
663 
664 	/*
665 	 * Make sure that there is a preallocated unbind_req message
666 	 * before binding. This message allocated when the socket is
667 	 * created  but it might be have been consumed.
668 	 */
669 	if (sti->sti_unbind_mp == NULL) {
670 		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
671 		/* NOTE: holding so_lock while sleeping */
672 		sti->sti_unbind_mp =
673 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
674 		    cr);
675 	}
676 
677 	if (flags & _SOBIND_REBIND) {
678 		/*
679 		 * Called from solisten after doing an sotpi_unbind() or
680 		 * potentially without the unbind (latter for AF_INET{,6}).
681 		 */
682 		ASSERT(name == NULL && namelen == 0);
683 
684 		if (so->so_family == AF_UNIX) {
685 			ASSERT(sti->sti_ux_bound_vp);
686 			addr = &sti->sti_ux_laddr;
687 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
688 			dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
689 			    "addr 0x%p, vp %p\n",
690 			    addrlen,
691 			    (void *)((struct so_ux_addr *)addr)->soua_vp,
692 			    (void *)sti->sti_ux_bound_vp));
693 		} else {
694 			addr = sti->sti_laddr_sa;
695 			addrlen = (t_uscalar_t)sti->sti_laddr_len;
696 		}
697 	} else if (flags & _SOBIND_UNSPEC) {
698 		ASSERT(name == NULL && namelen == 0);
699 
700 		/*
701 		 * The caller checked SS_ISBOUND but not necessarily
702 		 * under so_lock
703 		 */
704 		if (so->so_state & SS_ISBOUND) {
705 			/* No error */
706 			goto done;
707 		}
708 
709 		/* Set an initial local address */
710 		switch (so->so_family) {
711 		case AF_UNIX:
712 			/*
713 			 * Use an address with same size as struct sockaddr
714 			 * just like BSD.
715 			 */
716 			sti->sti_laddr_len =
717 			    (socklen_t)sizeof (struct sockaddr);
718 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
719 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
720 			sti->sti_laddr_sa->sa_family = so->so_family;
721 
722 			/*
723 			 * Pass down an address with the implicit bind
724 			 * magic number and the rest all zeros.
725 			 * The transport will return a unique address.
726 			 */
727 			sti->sti_ux_laddr.soua_vp = NULL;
728 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
729 			addr = &sti->sti_ux_laddr;
730 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
731 			break;
732 
733 		case AF_INET:
734 		case AF_INET6:
735 			/*
736 			 * An unspecified bind in TPI has a NULL address.
737 			 * Set the address in sockfs to have the sa_family.
738 			 */
739 			sti->sti_laddr_len = (so->so_family == AF_INET) ?
740 			    (socklen_t)sizeof (sin_t) :
741 			    (socklen_t)sizeof (sin6_t);
742 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
743 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
744 			sti->sti_laddr_sa->sa_family = so->so_family;
745 			addr = NULL;
746 			addrlen = 0;
747 			break;
748 
749 		default:
750 			/*
751 			 * An unspecified bind in TPI has a NULL address.
752 			 * Set the address in sockfs to be zero length.
753 			 *
754 			 * Can not assume there is a sa_family for all
755 			 * protocol families. For example, AF_X25 does not
756 			 * have a family field.
757 			 */
758 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
759 			sti->sti_laddr_len = 0;	/* XXX correct? */
760 			addr = NULL;
761 			addrlen = 0;
762 			break;
763 		}
764 
765 	} else {
766 		if (so->so_state & SS_ISBOUND) {
767 			/*
768 			 * If it is ok to rebind the socket, first unbind
769 			 * with the transport. A rebind to the NULL address
770 			 * is interpreted as an unbind.
771 			 * Note that a bind to NULL in BSD does unbind the
772 			 * socket but it fails with EINVAL.
773 			 * Note that regular sockets set SOV_SOCKBSD i.e.
774 			 * _SOBIND_SOCKBSD gets set here hence no type of
775 			 * socket does currently allow rebinding.
776 			 *
777 			 * If the name is NULL just do an unbind.
778 			 */
779 			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
780 			    name != NULL) {
781 				error = EINVAL;
782 				unbind_on_err = 0;
783 				eprintsoline(so, error);
784 				goto done;
785 			}
786 			if ((so->so_mode & SM_CONNREQUIRED) &&
787 			    (so->so_state & SS_CANTREBIND)) {
788 				error = EINVAL;
789 				unbind_on_err = 0;
790 				eprintsoline(so, error);
791 				goto done;
792 			}
793 			error = sotpi_unbind(so, 0);
794 			if (error) {
795 				eprintsoline(so, error);
796 				goto done;
797 			}
798 			ASSERT(!(so->so_state & SS_ISBOUND));
799 			if (name == NULL) {
800 				so->so_state &=
801 				    ~(SS_ISCONNECTED|SS_ISCONNECTING);
802 				goto done;
803 			}
804 		}
805 
806 		/* X/Open requires this check */
807 		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
808 			if (xnet_check_print) {
809 				printf("sockfs: X/Open bind state check "
810 				    "caused EINVAL\n");
811 			}
812 			error = EINVAL;
813 			goto done;
814 		}
815 
816 		switch (so->so_family) {
817 		case AF_UNIX:
818 			/*
819 			 * All AF_UNIX addresses are nul terminated
820 			 * when copied (copyin_name) in so the minimum
821 			 * length is 3 bytes.
822 			 */
823 			if (name == NULL ||
824 			    (ssize_t)namelen <= sizeof (short) + 1) {
825 				error = EISDIR;
826 				eprintsoline(so, error);
827 				goto done;
828 			}
829 			/*
830 			 * Verify so_family matches the bound family.
831 			 * BSD does not check this for AF_UNIX resulting
832 			 * in funny mknods.
833 			 */
834 			if (name->sa_family != so->so_family) {
835 				error = EAFNOSUPPORT;
836 				goto done;
837 			}
838 			break;
839 		case AF_INET:
840 			if (name == NULL) {
841 				error = EINVAL;
842 				eprintsoline(so, error);
843 				goto done;
844 			}
845 			if ((size_t)namelen != sizeof (sin_t)) {
846 				error = name->sa_family != so->so_family ?
847 				    EAFNOSUPPORT : EINVAL;
848 				eprintsoline(so, error);
849 				goto done;
850 			}
851 			if ((flags & _SOBIND_XPG4_2) &&
852 			    (name->sa_family != so->so_family)) {
853 				/*
854 				 * This check has to be made for X/Open
855 				 * sockets however application failures have
856 				 * been observed when it is applied to
857 				 * all sockets.
858 				 */
859 				error = EAFNOSUPPORT;
860 				eprintsoline(so, error);
861 				goto done;
862 			}
863 			/*
864 			 * Force a zero sa_family to match so_family.
865 			 *
866 			 * Some programs like inetd(1M) don't set the
867 			 * family field. Other programs leave
868 			 * sin_family set to garbage - SunOS 4.X does
869 			 * not check the family field on a bind.
870 			 * We use the family field that
871 			 * was passed in to the socket() call.
872 			 */
873 			name->sa_family = so->so_family;
874 			break;
875 
876 		case AF_INET6: {
877 #ifdef DEBUG
878 			sin6_t *sin6 = (sin6_t *)name;
879 #endif /* DEBUG */
880 
881 			if (name == NULL) {
882 				error = EINVAL;
883 				eprintsoline(so, error);
884 				goto done;
885 			}
886 			if ((size_t)namelen != sizeof (sin6_t)) {
887 				error = name->sa_family != so->so_family ?
888 				    EAFNOSUPPORT : EINVAL;
889 				eprintsoline(so, error);
890 				goto done;
891 			}
892 			if (name->sa_family != so->so_family) {
893 				/*
894 				 * With IPv6 we require the family to match
895 				 * unlike in IPv4.
896 				 */
897 				error = EAFNOSUPPORT;
898 				eprintsoline(so, error);
899 				goto done;
900 			}
901 #ifdef DEBUG
902 			/*
903 			 * Verify that apps don't forget to clear
904 			 * sin6_scope_id etc
905 			 */
906 			if (sin6->sin6_scope_id != 0 &&
907 			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
908 				zcmn_err(getzoneid(), CE_WARN,
909 				    "bind with uninitialized sin6_scope_id "
910 				    "(%d) on socket. Pid = %d\n",
911 				    (int)sin6->sin6_scope_id,
912 				    (int)curproc->p_pid);
913 			}
914 			if (sin6->__sin6_src_id != 0) {
915 				zcmn_err(getzoneid(), CE_WARN,
916 				    "bind with uninitialized __sin6_src_id "
917 				    "(%d) on socket. Pid = %d\n",
918 				    (int)sin6->__sin6_src_id,
919 				    (int)curproc->p_pid);
920 			}
921 #endif /* DEBUG */
922 			break;
923 		}
924 		default:
925 			/*
926 			 * Don't do any length or sa_family check to allow
927 			 * non-sockaddr style addresses.
928 			 */
929 			if (name == NULL) {
930 				error = EINVAL;
931 				eprintsoline(so, error);
932 				goto done;
933 			}
934 			break;
935 		}
936 
937 		if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
938 			error = ENAMETOOLONG;
939 			eprintsoline(so, error);
940 			goto done;
941 		}
942 		/*
943 		 * Save local address.
944 		 */
945 		sti->sti_laddr_len = (socklen_t)namelen;
946 		ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
947 		bcopy(name, sti->sti_laddr_sa, namelen);
948 
949 		addr = sti->sti_laddr_sa;
950 		addrlen = (t_uscalar_t)sti->sti_laddr_len;
951 		switch (so->so_family) {
952 		case AF_INET6:
953 		case AF_INET:
954 			break;
955 		case AF_UNIX: {
956 			struct sockaddr_un *soun =
957 			    (struct sockaddr_un *)sti->sti_laddr_sa;
958 			struct vnode *vp, *rvp;
959 			struct vattr vattr;
960 
961 			ASSERT(sti->sti_ux_bound_vp == NULL);
962 			/*
963 			 * Create vnode for the specified path name.
964 			 * Keep vnode held with a reference in sti_ux_bound_vp.
965 			 * Use the vnode pointer as the address used in the
966 			 * bind with the transport.
967 			 *
968 			 * Use the same mode as in BSD. In particular this does
969 			 * not observe the umask.
970 			 */
971 			/* MAXPATHLEN + soun_family + nul termination */
972 			if (sti->sti_laddr_len >
973 			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
974 				error = ENAMETOOLONG;
975 				eprintsoline(so, error);
976 				goto done;
977 			}
978 			vattr.va_type = VSOCK;
979 			vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
980 			vattr.va_mask = AT_TYPE|AT_MODE;
981 			/* NOTE: holding so_lock */
982 			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
983 			    EXCL, 0, &vp, CRMKNOD, 0, 0);
984 			if (error) {
985 				if (error == EEXIST)
986 					error = EADDRINUSE;
987 				eprintsoline(so, error);
988 				goto done;
989 			}
990 			/*
991 			 * Establish pointer from the underlying filesystem
992 			 * vnode to the socket node.
993 			 * sti_ux_bound_vp and v_stream->sd_vnode form the
994 			 * cross-linkage between the underlying filesystem
995 			 * node and the socket node.
996 			 */
997 
998 			if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
999 				VN_HOLD(rvp);
1000 				VN_RELE(vp);
1001 				vp = rvp;
1002 			}
1003 
1004 			ASSERT(SOTOV(so)->v_stream);
1005 			mutex_enter(&vp->v_lock);
1006 			vp->v_stream = SOTOV(so)->v_stream;
1007 			sti->sti_ux_bound_vp = vp;
1008 			mutex_exit(&vp->v_lock);
1009 
1010 			/*
1011 			 * Use the vnode pointer value as a unique address
1012 			 * (together with the magic number to avoid conflicts
1013 			 * with implicit binds) in the transport provider.
1014 			 */
1015 			sti->sti_ux_laddr.soua_vp =
1016 			    (void *)sti->sti_ux_bound_vp;
1017 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
1018 			addr = &sti->sti_ux_laddr;
1019 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1020 			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1021 			    addrlen,
1022 			    (void *)((struct so_ux_addr *)addr)->soua_vp));
1023 			break;
1024 		}
1025 		} /* end switch (so->so_family) */
1026 	}
1027 
1028 	/*
1029 	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1030 	 * the transport can start passing up T_CONN_IND messages
1031 	 * as soon as it receives the bind req and strsock_proto()
1032 	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1033 	 */
1034 	if (flags & _SOBIND_LISTEN) {
1035 		if ((so->so_state & SS_ACCEPTCONN) == 0)
1036 			clear_acceptconn_on_err = B_TRUE;
1037 		save_so_backlog = so->so_backlog;
1038 		restore_backlog_on_err = B_TRUE;
1039 		so->so_state |= SS_ACCEPTCONN;
1040 		so->so_backlog = backlog;
1041 	}
1042 
1043 	/*
1044 	 * If NL7C addr(s) have been configured check for addr/port match,
1045 	 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
1046 	 *
1047 	 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
1048 	 * family sockets only. If match mark as such.
1049 	 */
1050 	if (nl7c_enabled && ((addr != NULL &&
1051 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
1052 	    (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
1053 	    sti->sti_nl7c_flags == NL7C_AF_NCA)) {
1054 		/*
1055 		 * NL7C is not supported in non-global zones,
1056 		 * we enforce this restriction here.
1057 		 */
1058 		if (so->so_zoneid == GLOBAL_ZONEID) {
1059 			/* An NL7C socket, mark it */
1060 			sti->sti_nl7c_flags |= NL7C_ENABLED;
1061 			if (nl7c == NULL) {
1062 				/*
1063 				 * Was an AF_NCA bind() so add it to the
1064 				 * addr list for reporting purposes.
1065 				 */
1066 				nl7c = nl7c_add_addr(addr, addrlen);
1067 			}
1068 		} else
1069 			nl7c = NULL;
1070 	}
1071 
1072 	/*
1073 	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1074 	 * for other transports we will send in a O_T_BIND_REQ.
1075 	 */
1076 	if (tcp_udp_xport &&
1077 	    (so->so_family == AF_INET || so->so_family == AF_INET6))
1078 		PRIM_type = T_BIND_REQ;
1079 
1080 	bind_req.PRIM_type = PRIM_type;
1081 	bind_req.ADDR_length = addrlen;
1082 	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1083 	bind_req.CONIND_number = backlog;
1084 	/* NOTE: holding so_lock while sleeping */
1085 	mp = soallocproto2(&bind_req, sizeof (bind_req),
1086 	    addr, addrlen, 0, _ALLOC_SLEEP, cr);
1087 	sti->sti_laddr_valid = 0;
1088 
1089 	/* Done using sti_laddr_sa - can drop the lock */
1090 	mutex_exit(&so->so_lock);
1091 
1092 	/*
1093 	 * Intercept the bind_req message here to check if this <address/port>
1094 	 * was configured as an SSL proxy server, or if another endpoint was
1095 	 * already configured to act as a proxy for us.
1096 	 *
1097 	 * Note, only if NL7C not enabled for this socket.
1098 	 */
1099 	if (nl7c == NULL &&
1100 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
1101 	    so->so_type == SOCK_STREAM) {
1102 
1103 		if (sti->sti_kssl_ent != NULL) {
1104 			kssl_release_ent(sti->sti_kssl_ent, so,
1105 			    sti->sti_kssl_type);
1106 			sti->sti_kssl_ent = NULL;
1107 		}
1108 
1109 		sti->sti_kssl_type = kssl_check_proxy(mp, so,
1110 		    &sti->sti_kssl_ent);
1111 		switch (sti->sti_kssl_type) {
1112 		case KSSL_NO_PROXY:
1113 			break;
1114 
1115 		case KSSL_HAS_PROXY:
1116 			mutex_enter(&so->so_lock);
1117 			goto skip_transport;
1118 
1119 		case KSSL_IS_PROXY:
1120 			break;
1121 		}
1122 	}
1123 
1124 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1125 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1126 	if (error) {
1127 		eprintsoline(so, error);
1128 		mutex_enter(&so->so_lock);
1129 		goto done;
1130 	}
1131 
1132 	mutex_enter(&so->so_lock);
1133 	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1134 	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1135 	if (error) {
1136 		eprintsoline(so, error);
1137 		goto done;
1138 	}
1139 skip_transport:
1140 	ASSERT(mp);
1141 	/*
1142 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1143 	 * strsock_proto while the lock was dropped above, the bind
1144 	 * is allowed to complete.
1145 	 */
1146 
1147 	/* Mark as bound. This will be undone if we detect errors below. */
1148 	if (flags & _SOBIND_NOXLATE) {
1149 		ASSERT(so->so_family == AF_UNIX);
1150 		sti->sti_faddr_noxlate = 1;
1151 	}
1152 	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1153 	so->so_state |= SS_ISBOUND;
1154 	ASSERT(sti->sti_unbind_mp);
1155 
1156 	/* note that we've already set SS_ACCEPTCONN above */
1157 
1158 	/*
1159 	 * Recompute addrlen - an unspecied bind sent down an
1160 	 * address of length zero but we expect the appropriate length
1161 	 * in return.
1162 	 */
1163 	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1164 	    sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1165 
1166 	bind_ack = (struct T_bind_ack *)mp->b_rptr;
1167 	/*
1168 	 * The alignment restriction is really too strict but
1169 	 * we want enough alignment to inspect the fields of
1170 	 * a sockaddr_in.
1171 	 */
1172 	addr = sogetoff(mp, bind_ack->ADDR_offset,
1173 	    bind_ack->ADDR_length,
1174 	    __TPI_ALIGN_SIZE);
1175 	if (addr == NULL) {
1176 		freemsg(mp);
1177 		error = EPROTO;
1178 		eprintsoline(so, error);
1179 		goto done;
1180 	}
1181 	if (!(flags & _SOBIND_UNSPEC)) {
1182 		/*
1183 		 * Verify that the transport didn't return something we
1184 		 * did not want e.g. an address other than what we asked for.
1185 		 *
1186 		 * NOTE: These checks would go away if/when we switch to
1187 		 * using the new TPI (in which the transport would fail
1188 		 * the request instead of assigning a different address).
1189 		 *
1190 		 * NOTE2: For protocols that we don't know (i.e. any
1191 		 * other than AF_INET6, AF_INET and AF_UNIX), we
1192 		 * cannot know if the transport should be expected to
1193 		 * return the same address as that requested.
1194 		 *
1195 		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1196 		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1197 		 *
1198 		 * For example, in the case of netatalk it may be
1199 		 * inappropriate for the transport to return the
1200 		 * requested address (as it may have allocated a local
1201 		 * port number in behaviour similar to that of an
1202 		 * AF_INET bind request with a port number of zero).
1203 		 *
1204 		 * Given the definition of O_T_BIND_REQ, where the
1205 		 * transport may bind to an address other than the
1206 		 * requested address, it's not possible to determine
1207 		 * whether a returned address that differs from the
1208 		 * requested address is a reason to fail (because the
1209 		 * requested address was not available) or succeed
1210 		 * (because the transport allocated an appropriate
1211 		 * address and/or port).
1212 		 *
1213 		 * sockfs currently requires that the transport return
1214 		 * the requested address in the T_BIND_ACK, unless
1215 		 * there is code here to allow for any discrepancy.
1216 		 * Such code exists for AF_INET and AF_INET6.
1217 		 *
1218 		 * Netatalk chooses to return the requested address
1219 		 * rather than the (correct) allocated address.  This
1220 		 * means that netatalk violates the TPI specification
1221 		 * (and would not function correctly if used from a
1222 		 * TLI application), but it does mean that it works
1223 		 * with sockfs.
1224 		 *
1225 		 * As noted above, using the newer XTI bind primitive
1226 		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1227 		 * allow sockfs to be more sure about whether or not
1228 		 * the bind request had succeeded (as transports are
1229 		 * not permitted to bind to a different address than
1230 		 * that requested - they must return failure).
1231 		 * Unfortunately, support for T_BIND_REQ may not be
1232 		 * present in all transport implementations (netatalk,
1233 		 * for example, doesn't have it), making the
1234 		 * transition difficult.
1235 		 */
1236 		if (bind_ack->ADDR_length != addrlen) {
1237 			/* Assumes that the requested address was in use */
1238 			freemsg(mp);
1239 			error = EADDRINUSE;
1240 			eprintsoline(so, error);
1241 			goto done;
1242 		}
1243 
1244 		switch (so->so_family) {
1245 		case AF_INET6:
1246 		case AF_INET: {
1247 			sin_t *rname, *aname;
1248 
1249 			rname = (sin_t *)addr;
1250 			aname = (sin_t *)sti->sti_laddr_sa;
1251 
1252 			/*
1253 			 * Take advantage of the alignment
1254 			 * of sin_port and sin6_port which fall
1255 			 * in the same place in their data structures.
1256 			 * Just use sin_port for either address family.
1257 			 *
1258 			 * This may become a problem if (heaven forbid)
1259 			 * there's a separate ipv6port_reserved... :-P
1260 			 *
1261 			 * Binding to port 0 has the semantics of letting
1262 			 * the transport bind to any port.
1263 			 *
1264 			 * If the transport is TCP or UDP since we had sent
1265 			 * a T_BIND_REQ we would not get a port other than
1266 			 * what we asked for.
1267 			 */
1268 			if (tcp_udp_xport) {
1269 				/*
1270 				 * Pick up the new port number if we bound to
1271 				 * port 0.
1272 				 */
1273 				if (aname->sin_port == 0)
1274 					aname->sin_port = rname->sin_port;
1275 				sti->sti_laddr_valid = 1;
1276 				break;
1277 			}
1278 			if (aname->sin_port != 0 &&
1279 			    aname->sin_port != rname->sin_port) {
1280 				freemsg(mp);
1281 				error = EADDRINUSE;
1282 				eprintsoline(so, error);
1283 				goto done;
1284 			}
1285 			/*
1286 			 * Pick up the new port number if we bound to port 0.
1287 			 */
1288 			aname->sin_port = rname->sin_port;
1289 
1290 			/*
1291 			 * Unfortunately, addresses aren't _quite_ the same.
1292 			 */
1293 			if (so->so_family == AF_INET) {
1294 				if (aname->sin_addr.s_addr !=
1295 				    rname->sin_addr.s_addr) {
1296 					freemsg(mp);
1297 					error = EADDRNOTAVAIL;
1298 					eprintsoline(so, error);
1299 					goto done;
1300 				}
1301 			} else {
1302 				sin6_t *rname6 = (sin6_t *)rname;
1303 				sin6_t *aname6 = (sin6_t *)aname;
1304 
1305 				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1306 				    &rname6->sin6_addr)) {
1307 					freemsg(mp);
1308 					error = EADDRNOTAVAIL;
1309 					eprintsoline(so, error);
1310 					goto done;
1311 				}
1312 			}
1313 			break;
1314 		}
1315 		case AF_UNIX:
1316 			if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1317 				freemsg(mp);
1318 				error = EADDRINUSE;
1319 				eprintsoline(so, error);
1320 				eprintso(so,
1321 				    ("addrlen %d, addr 0x%x, vp %p\n",
1322 				    addrlen, *((int *)addr),
1323 				    (void *)sti->sti_ux_bound_vp));
1324 				goto done;
1325 			}
1326 			sti->sti_laddr_valid = 1;
1327 			break;
1328 		default:
1329 			/*
1330 			 * NOTE: This assumes that addresses can be
1331 			 * byte-compared for equivalence.
1332 			 */
1333 			if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1334 				freemsg(mp);
1335 				error = EADDRINUSE;
1336 				eprintsoline(so, error);
1337 				goto done;
1338 			}
1339 			/*
1340 			 * Don't mark sti_laddr_valid, as we cannot be
1341 			 * sure that the returned address is the real
1342 			 * bound address when talking to an unknown
1343 			 * transport.
1344 			 */
1345 			break;
1346 		}
1347 	} else {
1348 		/*
1349 		 * Save for returned address for getsockname.
1350 		 * Needed for unspecific bind unless transport supports
1351 		 * the TI_GETMYNAME ioctl.
1352 		 * Do this for AF_INET{,6} even though they do, as
1353 		 * caching info here is much better performance than
1354 		 * a TPI/STREAMS trip to the transport for getsockname.
1355 		 * Any which can't for some reason _must_ _not_ set
1356 		 * sti_laddr_valid here for the caching version of
1357 		 * getsockname to not break;
1358 		 */
1359 		switch (so->so_family) {
1360 		case AF_UNIX:
1361 			/*
1362 			 * Record the address bound with the transport
1363 			 * for use by socketpair.
1364 			 */
1365 			bcopy(addr, &sti->sti_ux_laddr, addrlen);
1366 			sti->sti_laddr_valid = 1;
1367 			break;
1368 		case AF_INET:
1369 		case AF_INET6:
1370 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1371 			bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1372 			sti->sti_laddr_valid = 1;
1373 			break;
1374 		default:
1375 			/*
1376 			 * Don't mark sti_laddr_valid, as we cannot be
1377 			 * sure that the returned address is the real
1378 			 * bound address when talking to an unknown
1379 			 * transport.
1380 			 */
1381 			break;
1382 		}
1383 	}
1384 
1385 	if (nl7c != NULL) {
1386 		/* Register listen()er sonode pointer with NL7C */
1387 		nl7c_listener_addr(nl7c, so);
1388 	}
1389 
1390 	freemsg(mp);
1391 
1392 done:
1393 	if (error) {
1394 		/* reset state & backlog to values held on entry */
1395 		if (clear_acceptconn_on_err == B_TRUE)
1396 			so->so_state &= ~SS_ACCEPTCONN;
1397 		if (restore_backlog_on_err == B_TRUE)
1398 			so->so_backlog = save_so_backlog;
1399 
1400 		if (unbind_on_err && so->so_state & SS_ISBOUND) {
1401 			int err;
1402 
1403 			err = sotpi_unbind(so, 0);
1404 			/* LINTED - statement has no consequent: if */
1405 			if (err) {
1406 				eprintsoline(so, error);
1407 			} else {
1408 				ASSERT(!(so->so_state & SS_ISBOUND));
1409 			}
1410 		}
1411 	}
1412 	if (!(flags & _SOBIND_LOCK_HELD)) {
1413 		so_unlock_single(so, SOLOCKED);
1414 		mutex_exit(&so->so_lock);
1415 	} else {
1416 		ASSERT(MUTEX_HELD(&so->so_lock));
1417 		ASSERT(so->so_flag & SOLOCKED);
1418 	}
1419 	return (error);
1420 }
1421 
1422 /* bind the socket */
1423 static int
1424 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1425     int flags, struct cred *cr)
1426 {
1427 	if ((flags & _SOBIND_SOCKETPAIR) == 0)
1428 		return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1429 
1430 	flags &= ~_SOBIND_SOCKETPAIR;
1431 	return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1432 }
1433 
1434 /*
1435  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1436  * address, or when listen needs to unbind and bind.
1437  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1438  * so that a sobind can pick them up.
1439  */
1440 static int
1441 sotpi_unbind(struct sonode *so, int flags)
1442 {
1443 	struct T_unbind_req	unbind_req;
1444 	int			error = 0;
1445 	mblk_t			*mp;
1446 	sotpi_info_t		*sti = SOTOTPI(so);
1447 
1448 	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1449 	    (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1450 
1451 	ASSERT(MUTEX_HELD(&so->so_lock));
1452 	ASSERT(so->so_flag & SOLOCKED);
1453 
1454 	if (!(so->so_state & SS_ISBOUND)) {
1455 		error = EINVAL;
1456 		eprintsoline(so, error);
1457 		goto done;
1458 	}
1459 
1460 	mutex_exit(&so->so_lock);
1461 
1462 	/*
1463 	 * Flush the read and write side (except stream head read queue)
1464 	 * and send down T_UNBIND_REQ.
1465 	 */
1466 	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1467 
1468 	unbind_req.PRIM_type = T_UNBIND_REQ;
1469 	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1470 	    0, _ALLOC_SLEEP, CRED());
1471 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1472 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1473 	mutex_enter(&so->so_lock);
1474 	if (error) {
1475 		eprintsoline(so, error);
1476 		goto done;
1477 	}
1478 
1479 	error = sowaitokack(so, T_UNBIND_REQ);
1480 	if (error) {
1481 		eprintsoline(so, error);
1482 		goto done;
1483 	}
1484 
1485 	/*
1486 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1487 	 * strsock_proto while the lock was dropped above, the unbind
1488 	 * is allowed to complete.
1489 	 */
1490 	if (!(flags & _SOUNBIND_REBIND)) {
1491 		/*
1492 		 * Clear out bound address.
1493 		 */
1494 		vnode_t *vp;
1495 
1496 		if ((vp = sti->sti_ux_bound_vp) != NULL) {
1497 
1498 			/* Undo any SSL proxy setup */
1499 			if ((so->so_family == AF_INET ||
1500 			    so->so_family == AF_INET6) &&
1501 			    (so->so_type == SOCK_STREAM) &&
1502 			    (sti->sti_kssl_ent != NULL)) {
1503 				kssl_release_ent(sti->sti_kssl_ent, so,
1504 				    sti->sti_kssl_type);
1505 				sti->sti_kssl_ent = NULL;
1506 				sti->sti_kssl_type = KSSL_NO_PROXY;
1507 			}
1508 			sti->sti_ux_bound_vp = NULL;
1509 			vn_rele_stream(vp);
1510 		}
1511 		/* Clear out address */
1512 		sti->sti_laddr_len = 0;
1513 	}
1514 	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1515 	sti->sti_laddr_valid = 0;
1516 
1517 done:
1518 
1519 	/* If the caller held the lock don't release it here */
1520 	ASSERT(MUTEX_HELD(&so->so_lock));
1521 	ASSERT(so->so_flag & SOLOCKED);
1522 
1523 	return (error);
1524 }
1525 
1526 /*
1527  * listen on the socket.
1528  * For TPI conforming transports this has to first unbind with the transport
1529  * and then bind again using the new backlog.
1530  */
1531 /* ARGSUSED */
1532 int
1533 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1534 {
1535 	int		error = 0;
1536 	sotpi_info_t	*sti = SOTOTPI(so);
1537 
1538 	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1539 	    (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1540 
1541 	if (sti->sti_serv_type == T_CLTS)
1542 		return (EOPNOTSUPP);
1543 
1544 	/*
1545 	 * If the socket is ready to accept connections already, then
1546 	 * return without doing anything.  This avoids a problem where
1547 	 * a second listen() call fails if a connection is pending and
1548 	 * leaves the socket unbound. Only when we are not unbinding
1549 	 * with the transport can we safely increase the backlog.
1550 	 */
1551 	if (so->so_state & SS_ACCEPTCONN &&
1552 	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1553 	    /*CONSTCOND*/
1554 	    !solisten_tpi_tcp))
1555 		return (0);
1556 
1557 	if (so->so_state & SS_ISCONNECTED)
1558 		return (EINVAL);
1559 
1560 	mutex_enter(&so->so_lock);
1561 	so_lock_single(so);	/* Set SOLOCKED */
1562 
1563 	/*
1564 	 * If the listen doesn't change the backlog we do nothing.
1565 	 * This avoids an EPROTO error from the transport.
1566 	 */
1567 	if ((so->so_state & SS_ACCEPTCONN) &&
1568 	    so->so_backlog == backlog)
1569 		goto done;
1570 
1571 	if (!(so->so_state & SS_ISBOUND)) {
1572 		/*
1573 		 * Must have been explicitly bound in the UNIX domain.
1574 		 */
1575 		if (so->so_family == AF_UNIX) {
1576 			error = EINVAL;
1577 			goto done;
1578 		}
1579 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1580 		    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1581 	} else if (backlog > 0) {
1582 		/*
1583 		 * AF_INET{,6} hack to avoid losing the port.
1584 		 * Assumes that all AF_INET{,6} transports can handle a
1585 		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1586 		 * has already bound thus it is possible to avoid the unbind.
1587 		 */
1588 		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1589 		    /*CONSTCOND*/
1590 		    !solisten_tpi_tcp)) {
1591 			error = sotpi_unbind(so, _SOUNBIND_REBIND);
1592 			if (error)
1593 				goto done;
1594 		}
1595 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1596 		    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1597 	} else {
1598 		so->so_state |= SS_ACCEPTCONN;
1599 		so->so_backlog = backlog;
1600 	}
1601 	if (error)
1602 		goto done;
1603 	ASSERT(so->so_state & SS_ACCEPTCONN);
1604 done:
1605 	so_unlock_single(so, SOLOCKED);
1606 	mutex_exit(&so->so_lock);
1607 	return (error);
1608 }
1609 
1610 /*
1611  * Disconnect either a specified seqno or all (-1).
1612  * The former is used on listening sockets only.
1613  *
1614  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1615  * the current use of sodisconnect(seqno == -1) is only for shutdown
1616  * so there is no point (and potentially incorrect) to unbind.
1617  */
1618 static int
1619 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1620 {
1621 	struct T_discon_req	discon_req;
1622 	int			error = 0;
1623 	mblk_t			*mp;
1624 
1625 	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1626 	    (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1627 
1628 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1629 		mutex_enter(&so->so_lock);
1630 		so_lock_single(so);	/* Set SOLOCKED */
1631 	} else {
1632 		ASSERT(MUTEX_HELD(&so->so_lock));
1633 		ASSERT(so->so_flag & SOLOCKED);
1634 	}
1635 
1636 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1637 		error = EINVAL;
1638 		eprintsoline(so, error);
1639 		goto done;
1640 	}
1641 
1642 	mutex_exit(&so->so_lock);
1643 	/*
1644 	 * Flush the write side (unless this is a listener)
1645 	 * and then send down a T_DISCON_REQ.
1646 	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1647 	 * and other messages.)
1648 	 */
1649 	if (!(so->so_state & SS_ACCEPTCONN))
1650 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1651 
1652 	discon_req.PRIM_type = T_DISCON_REQ;
1653 	discon_req.SEQ_number = seqno;
1654 	mp = soallocproto1(&discon_req, sizeof (discon_req),
1655 	    0, _ALLOC_SLEEP, CRED());
1656 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1657 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1658 	mutex_enter(&so->so_lock);
1659 	if (error) {
1660 		eprintsoline(so, error);
1661 		goto done;
1662 	}
1663 
1664 	error = sowaitokack(so, T_DISCON_REQ);
1665 	if (error) {
1666 		eprintsoline(so, error);
1667 		goto done;
1668 	}
1669 	/*
1670 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1671 	 * strsock_proto while the lock was dropped above, the disconnect
1672 	 * is allowed to complete. However, it is not possible to
1673 	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1674 	 */
1675 	so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
1676 	SOTOTPI(so)->sti_laddr_valid = 0;
1677 	SOTOTPI(so)->sti_faddr_valid = 0;
1678 done:
1679 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1680 		so_unlock_single(so, SOLOCKED);
1681 		mutex_exit(&so->so_lock);
1682 	} else {
1683 		/* If the caller held the lock don't release it here */
1684 		ASSERT(MUTEX_HELD(&so->so_lock));
1685 		ASSERT(so->so_flag & SOLOCKED);
1686 	}
1687 	return (error);
1688 }
1689 
1690 /* ARGSUSED */
1691 int
1692 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1693     struct sonode **nsop)
1694 {
1695 	struct T_conn_ind	*conn_ind;
1696 	struct T_conn_res	*conn_res;
1697 	int			error = 0;
1698 	mblk_t			*mp, *ctxmp, *ack_mp;
1699 	struct sonode		*nso;
1700 	vnode_t			*nvp;
1701 	void			*src;
1702 	t_uscalar_t		srclen;
1703 	void			*opt;
1704 	t_uscalar_t		optlen;
1705 	t_scalar_t		PRIM_type;
1706 	t_scalar_t		SEQ_number;
1707 	size_t			sinlen;
1708 	sotpi_info_t		*sti = SOTOTPI(so);
1709 	sotpi_info_t		*nsti;
1710 
1711 	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1712 	    (void *)so, fflag, (void *)nsop,
1713 	    pr_state(so->so_state, so->so_mode)));
1714 
1715 	/*
1716 	 * Defer single-threading the accepting socket until
1717 	 * the T_CONN_IND has been received and parsed and the
1718 	 * new sonode has been opened.
1719 	 */
1720 
1721 	/* Check that we are not already connected */
1722 	if ((so->so_state & SS_ACCEPTCONN) == 0)
1723 		goto conn_bad;
1724 again:
1725 	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1726 		goto e_bad;
1727 
1728 	ASSERT(mp != NULL);
1729 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1730 	ctxmp = mp->b_cont;
1731 
1732 	/*
1733 	 * Save SEQ_number for error paths.
1734 	 */
1735 	SEQ_number = conn_ind->SEQ_number;
1736 
1737 	srclen = conn_ind->SRC_length;
1738 	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1739 	if (src == NULL) {
1740 		error = EPROTO;
1741 		freemsg(mp);
1742 		eprintsoline(so, error);
1743 		goto disconnect_unlocked;
1744 	}
1745 	optlen = conn_ind->OPT_length;
1746 	switch (so->so_family) {
1747 	case AF_INET:
1748 	case AF_INET6:
1749 		if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1750 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
1751 			    &opt, conn_ind->OPT_length);
1752 		} else {
1753 			/*
1754 			 * The transport (in this case TCP) hasn't sent up
1755 			 * a pointer to an instance for the accept fast-path.
1756 			 * Disable fast-path completely because the call to
1757 			 * sotpi_create() below would otherwise create an
1758 			 * incomplete TCP instance, which would lead to
1759 			 * problems when sockfs sends a normal T_CONN_RES
1760 			 * message down the new stream.
1761 			 */
1762 			if (sti->sti_direct) {
1763 				int rval;
1764 				/*
1765 				 * For consistency we inform tcp to disable
1766 				 * direct interface on the listener, though
1767 				 * we can certainly live without doing this
1768 				 * because no data will ever travel upstream
1769 				 * on the listening socket.
1770 				 */
1771 				sti->sti_direct = 0;
1772 				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1773 				    0, 0, K_TO_K, cr, &rval);
1774 			}
1775 			opt = NULL;
1776 			optlen = 0;
1777 		}
1778 		break;
1779 	case AF_UNIX:
1780 	default:
1781 		if (optlen != 0) {
1782 			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1783 			    __TPI_ALIGN_SIZE);
1784 			if (opt == NULL) {
1785 				error = EPROTO;
1786 				freemsg(mp);
1787 				eprintsoline(so, error);
1788 				goto disconnect_unlocked;
1789 			}
1790 		}
1791 		if (so->so_family == AF_UNIX) {
1792 			if (!sti->sti_faddr_noxlate) {
1793 				src = NULL;
1794 				srclen = 0;
1795 			}
1796 			/* Extract src address from options */
1797 			if (optlen != 0)
1798 				so_getopt_srcaddr(opt, optlen, &src, &srclen);
1799 		}
1800 		break;
1801 	}
1802 
1803 	/*
1804 	 * Create the new socket.
1805 	 */
1806 	nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1807 	if (nso == NULL) {
1808 		ASSERT(error != 0);
1809 		/*
1810 		 * Accept can not fail with ENOBUFS. sotpi_create
1811 		 * sleeps waiting for memory until a signal is caught
1812 		 * so return EINTR.
1813 		 */
1814 		freemsg(mp);
1815 		if (error == ENOBUFS)
1816 			error = EINTR;
1817 		goto e_disc_unl;
1818 	}
1819 	nvp = SOTOV(nso);
1820 	nsti = SOTOTPI(nso);
1821 
1822 	/*
1823 	 * If the transport sent up an SSL connection context, then attach
1824 	 * it the new socket, and set the (sd_wputdatafunc)() and
1825 	 * (sd_rputdatafunc)() stream head hooks to intercept and process
1826 	 * SSL records.
1827 	 */
1828 	if (ctxmp != NULL) {
1829 		/*
1830 		 * This kssl_ctx_t is already held for us by the transport.
1831 		 * So, we don't need to do a kssl_hold_ctx() here.
1832 		 */
1833 		nsti->sti_kssl_ctx = *((kssl_ctx_t *)ctxmp->b_rptr);
1834 		freemsg(ctxmp);
1835 		mp->b_cont = NULL;
1836 		strsetrwputdatahooks(nvp, strsock_kssl_input,
1837 		    strsock_kssl_output);
1838 	}
1839 #ifdef DEBUG
1840 	/*
1841 	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1842 	 * it's inherited early to allow debugging of the accept code itself.
1843 	 */
1844 	nso->so_options |= so->so_options & SO_DEBUG;
1845 #endif /* DEBUG */
1846 
1847 	/*
1848 	 * Save the SRC address from the T_CONN_IND
1849 	 * for getpeername to work on AF_UNIX and on transports that do not
1850 	 * support TI_GETPEERNAME.
1851 	 *
1852 	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1853 	 * copyin_name().
1854 	 */
1855 	if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1856 		error = EINVAL;
1857 		freemsg(mp);
1858 		eprintsoline(so, error);
1859 		goto disconnect_vp_unlocked;
1860 	}
1861 	nsti->sti_faddr_len = (socklen_t)srclen;
1862 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1863 	bcopy(src, nsti->sti_faddr_sa, srclen);
1864 	nsti->sti_faddr_valid = 1;
1865 
1866 	/*
1867 	 * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1868 	 */
1869 	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1870 	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1871 		cred_t	*cr;
1872 		pid_t	cpid;
1873 
1874 		cr = msg_getcred(mp, &cpid);
1875 		if (cr != NULL) {
1876 			crhold(cr);
1877 			nso->so_peercred = cr;
1878 			nso->so_cpid = cpid;
1879 		}
1880 		freemsg(mp);
1881 
1882 		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1883 		    sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1884 		if (mp == NULL) {
1885 			/*
1886 			 * Accept can not fail with ENOBUFS.
1887 			 * A signal was caught so return EINTR.
1888 			 */
1889 			error = EINTR;
1890 			eprintsoline(so, error);
1891 			goto disconnect_vp_unlocked;
1892 		}
1893 		conn_res = (struct T_conn_res *)mp->b_rptr;
1894 	} else {
1895 		/*
1896 		 * For efficency reasons we use msg_extractcred; no crhold
1897 		 * needed since db_credp is cleared (i.e., we move the cred
1898 		 * from the message to so_peercred.
1899 		 */
1900 		nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1901 
1902 		mp->b_rptr = DB_BASE(mp);
1903 		conn_res = (struct T_conn_res *)mp->b_rptr;
1904 		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1905 
1906 		mblk_setcred(mp, cr, curproc->p_pid);
1907 	}
1908 
1909 	/*
1910 	 * New socket must be bound at least in sockfs and, except for AF_INET,
1911 	 * (or AF_INET6) it also has to be bound in the transport provider.
1912 	 * We set the local address in the sonode from the T_OK_ACK of the
1913 	 * T_CONN_RES. For this reason the address we bind to here isn't
1914 	 * important.
1915 	 */
1916 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1917 	    /*CONSTCOND*/
1918 	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1919 		/*
1920 		 * Optimization for AF_INET{,6} transports
1921 		 * that can handle a T_CONN_RES without being bound.
1922 		 */
1923 		mutex_enter(&nso->so_lock);
1924 		so_automatic_bind(nso);
1925 		mutex_exit(&nso->so_lock);
1926 	} else {
1927 		/* Perform NULL bind with the transport provider. */
1928 		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1929 		    cr)) != 0) {
1930 			ASSERT(error != ENOBUFS);
1931 			freemsg(mp);
1932 			eprintsoline(nso, error);
1933 			goto disconnect_vp_unlocked;
1934 		}
1935 	}
1936 
1937 	/*
1938 	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1939 	 * so that any data arriving on the new socket will cause the
1940 	 * appropriate signals to be delivered for the new socket.
1941 	 *
1942 	 * No other thread (except strsock_proto and strsock_misc)
1943 	 * can access the new socket thus we relax the locking.
1944 	 */
1945 	nso->so_pgrp = so->so_pgrp;
1946 	nso->so_state |= so->so_state & SS_ASYNC;
1947 	nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1948 
1949 	if (nso->so_pgrp != 0) {
1950 		if ((error = so_set_events(nso, nvp, cr)) != 0) {
1951 			eprintsoline(nso, error);
1952 			error = 0;
1953 			nso->so_pgrp = 0;
1954 		}
1955 	}
1956 
1957 	/*
1958 	 * Make note of the socket level options. TCP and IP level options
1959 	 * are already inherited. We could do all this after accept is
1960 	 * successful but doing it here simplifies code and no harm done
1961 	 * for error case.
1962 	 */
1963 	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1964 	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1965 	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1966 	nso->so_sndbuf = so->so_sndbuf;
1967 	nso->so_rcvbuf = so->so_rcvbuf;
1968 	if (nso->so_options & SO_LINGER)
1969 		nso->so_linger = so->so_linger;
1970 
1971 	/*
1972 	 * Note that the following sti_direct code path should be
1973 	 * removed once we are confident that the direct sockets
1974 	 * do not result in any degradation.
1975 	 */
1976 	if (sti->sti_direct) {
1977 
1978 		ASSERT(opt != NULL);
1979 
1980 		conn_res->OPT_length = optlen;
1981 		conn_res->OPT_offset = MBLKL(mp);
1982 		bcopy(&opt, mp->b_wptr, optlen);
1983 		mp->b_wptr += optlen;
1984 		conn_res->PRIM_type = T_CONN_RES;
1985 		conn_res->ACCEPTOR_id = 0;
1986 		PRIM_type = T_CONN_RES;
1987 
1988 		/* Send down the T_CONN_RES on acceptor STREAM */
1989 		error = kstrputmsg(SOTOV(nso), mp, NULL,
1990 		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1991 		if (error) {
1992 			mutex_enter(&so->so_lock);
1993 			so_lock_single(so);
1994 			eprintsoline(so, error);
1995 			goto disconnect_vp;
1996 		}
1997 		mutex_enter(&nso->so_lock);
1998 		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1999 		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2000 		if (error) {
2001 			mutex_exit(&nso->so_lock);
2002 			mutex_enter(&so->so_lock);
2003 			so_lock_single(so);
2004 			eprintsoline(so, error);
2005 			goto disconnect_vp;
2006 		}
2007 		if (nso->so_family == AF_INET) {
2008 			sin_t *sin;
2009 
2010 			sin = (sin_t *)(ack_mp->b_rptr +
2011 			    sizeof (struct T_ok_ack));
2012 			bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
2013 			nsti->sti_laddr_len = sizeof (sin_t);
2014 		} else {
2015 			sin6_t *sin6;
2016 
2017 			sin6 = (sin6_t *)(ack_mp->b_rptr +
2018 			    sizeof (struct T_ok_ack));
2019 			bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
2020 			nsti->sti_laddr_len = sizeof (sin6_t);
2021 		}
2022 		freemsg(ack_mp);
2023 
2024 		nso->so_state |= SS_ISCONNECTED;
2025 		nso->so_proto_handle = (sock_lower_handle_t)opt;
2026 		nsti->sti_laddr_valid = 1;
2027 
2028 		if (sti->sti_nl7c_flags & NL7C_ENABLED) {
2029 			/*
2030 			 * A NL7C marked listen()er so the new socket
2031 			 * inherits the listen()er's NL7C state, except
2032 			 * for NL7C_POLLIN.
2033 			 *
2034 			 * Only call NL7C to process the new socket if
2035 			 * the listen socket allows blocking i/o.
2036 			 */
2037 			nsti->sti_nl7c_flags =
2038 			    sti->sti_nl7c_flags & (~NL7C_POLLIN);
2039 			if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
2040 				/*
2041 				 * Nonblocking accept() just make it
2042 				 * persist to defer processing to the
2043 				 * read-side syscall (e.g. read).
2044 				 */
2045 				nsti->sti_nl7c_flags |= NL7C_SOPERSIST;
2046 			} else if (nl7c_process(nso, B_FALSE)) {
2047 				/*
2048 				 * NL7C has completed processing on the
2049 				 * socket, close the socket and back to
2050 				 * the top to await the next T_CONN_IND.
2051 				 */
2052 				mutex_exit(&nso->so_lock);
2053 				(void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
2054 				    cr, NULL);
2055 				VN_RELE(nvp);
2056 				goto again;
2057 			}
2058 			/* Pass the new socket out */
2059 		}
2060 
2061 		mutex_exit(&nso->so_lock);
2062 
2063 		/*
2064 		 * It's possible, through the use of autopush for example,
2065 		 * that the acceptor stream may not support sti_direct
2066 		 * semantics. If the new socket does not support sti_direct
2067 		 * we issue a _SIOCSOCKFALLBACK to inform the transport
2068 		 * as we would in the I_PUSH case.
2069 		 */
2070 		if (nsti->sti_direct == 0) {
2071 			int	rval;
2072 
2073 			if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
2074 			    0, 0, K_TO_K, cr, &rval)) != 0) {
2075 				mutex_enter(&so->so_lock);
2076 				so_lock_single(so);
2077 				eprintsoline(so, error);
2078 				goto disconnect_vp;
2079 			}
2080 		}
2081 
2082 		/*
2083 		 * Pass out new socket.
2084 		 */
2085 		if (nsop != NULL)
2086 			*nsop = nso;
2087 
2088 		return (0);
2089 	}
2090 
2091 	/*
2092 	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
2093 	 * which don't support the FireEngine accept fast-path. It is also
2094 	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
2095 	 * again. Neither sockfs nor TCP attempt to find out if some other
2096 	 * random module has been inserted in between (in which case we
2097 	 * should follow TLI accept behaviour). We blindly assume the worst
2098 	 * case and revert back to old behaviour i.e. TCP will not send us
2099 	 * any option (eager) and the accept should happen on the listener
2100 	 * queue. Any queued T_conn_ind have already got their options removed
2101 	 * by so_sock2_stream() when "sockmod" was I_POP'd.
2102 	 */
2103 	/*
2104 	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
2105 	 */
2106 	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
2107 #ifdef	_ILP32
2108 		queue_t	*q;
2109 
2110 		/*
2111 		 * Find read queue in driver
2112 		 * Can safely do this since we "own" nso/nvp.
2113 		 */
2114 		q = strvp2wq(nvp)->q_next;
2115 		while (SAMESTR(q))
2116 			q = q->q_next;
2117 		q = RD(q);
2118 		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
2119 #else
2120 		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
2121 #endif	/* _ILP32 */
2122 		conn_res->PRIM_type = O_T_CONN_RES;
2123 		PRIM_type = O_T_CONN_RES;
2124 	} else {
2125 		conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
2126 		conn_res->PRIM_type = T_CONN_RES;
2127 		PRIM_type = T_CONN_RES;
2128 	}
2129 	conn_res->SEQ_number = SEQ_number;
2130 	conn_res->OPT_length = 0;
2131 	conn_res->OPT_offset = 0;
2132 
2133 	mutex_enter(&so->so_lock);
2134 	so_lock_single(so);	/* Set SOLOCKED */
2135 	mutex_exit(&so->so_lock);
2136 
2137 	error = kstrputmsg(SOTOV(so), mp, NULL,
2138 	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2139 	mutex_enter(&so->so_lock);
2140 	if (error) {
2141 		eprintsoline(so, error);
2142 		goto disconnect_vp;
2143 	}
2144 	error = sowaitprim(so, PRIM_type, T_OK_ACK,
2145 	    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2146 	if (error) {
2147 		eprintsoline(so, error);
2148 		goto disconnect_vp;
2149 	}
2150 	/*
2151 	 * If there is a sin/sin6 appended onto the T_OK_ACK use
2152 	 * that to set the local address. If this is not present
2153 	 * then we zero out the address and don't set the
2154 	 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
2155 	 * the pathname from the listening socket.
2156 	 */
2157 	sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
2158 	if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
2159 	    MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2160 		ack_mp->b_rptr += sizeof (struct T_ok_ack);
2161 		bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2162 		nsti->sti_laddr_len = sinlen;
2163 		nsti->sti_laddr_valid = 1;
2164 	} else if (nso->so_family == AF_UNIX) {
2165 		ASSERT(so->so_family == AF_UNIX);
2166 		nsti->sti_laddr_len = sti->sti_laddr_len;
2167 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2168 		bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2169 		    nsti->sti_laddr_len);
2170 		nsti->sti_laddr_valid = 1;
2171 	} else {
2172 		nsti->sti_laddr_len = sti->sti_laddr_len;
2173 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2174 		bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2175 		nsti->sti_laddr_sa->sa_family = nso->so_family;
2176 	}
2177 	freemsg(ack_mp);
2178 
2179 	so_unlock_single(so, SOLOCKED);
2180 	mutex_exit(&so->so_lock);
2181 
2182 	nso->so_state |= SS_ISCONNECTED;
2183 
2184 	/*
2185 	 * Pass out new socket.
2186 	 */
2187 	if (nsop != NULL)
2188 		*nsop = nso;
2189 
2190 	return (0);
2191 
2192 
2193 eproto_disc_unl:
2194 	error = EPROTO;
2195 e_disc_unl:
2196 	eprintsoline(so, error);
2197 	goto disconnect_unlocked;
2198 
2199 pr_disc_vp_unl:
2200 	eprintsoline(so, error);
2201 disconnect_vp_unlocked:
2202 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2203 	VN_RELE(nvp);
2204 disconnect_unlocked:
2205 	(void) sodisconnect(so, SEQ_number, 0);
2206 	return (error);
2207 
2208 pr_disc_vp:
2209 	eprintsoline(so, error);
2210 disconnect_vp:
2211 	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2212 	so_unlock_single(so, SOLOCKED);
2213 	mutex_exit(&so->so_lock);
2214 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2215 	VN_RELE(nvp);
2216 	return (error);
2217 
2218 conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2219 	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2220 	    ? EOPNOTSUPP : EINVAL;
2221 e_bad:
2222 	eprintsoline(so, error);
2223 	return (error);
2224 }
2225 
2226 /*
2227  * connect a socket.
2228  *
2229  * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
2230  * unconnect (by specifying a null address).
2231  */
2232 int
2233 sotpi_connect(struct sonode *so,
2234 	const struct sockaddr *name,
2235 	socklen_t namelen,
2236 	int fflag,
2237 	int flags,
2238 	struct cred *cr)
2239 {
2240 	struct T_conn_req	conn_req;
2241 	int			error = 0;
2242 	mblk_t			*mp;
2243 	void			*src;
2244 	socklen_t		srclen;
2245 	void			*addr;
2246 	socklen_t		addrlen;
2247 	boolean_t		need_unlock;
2248 	sotpi_info_t		*sti = SOTOTPI(so);
2249 
2250 	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2251 	    (void *)so, (void *)name, namelen, fflag, flags,
2252 	    pr_state(so->so_state, so->so_mode)));
2253 
2254 	/*
2255 	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2256 	 * avoid sleeping for memory with SOLOCKED held.
2257 	 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2258 	 * + sizeof (struct T_opthdr).
2259 	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
2260 	 * exceed sti_faddr_maxlen).
2261 	 */
2262 	mp = soallocproto(sizeof (struct T_conn_req) +
2263 	    2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
2264 	    cr);
2265 	if (mp == NULL) {
2266 		/*
2267 		 * Connect can not fail with ENOBUFS. A signal was
2268 		 * caught so return EINTR.
2269 		 */
2270 		error = EINTR;
2271 		eprintsoline(so, error);
2272 		return (error);
2273 	}
2274 
2275 	mutex_enter(&so->so_lock);
2276 	/*
2277 	 * Make sure there is a preallocated T_unbind_req message
2278 	 * before any binding. This message is allocated when the
2279 	 * socket is created. Since another thread can consume
2280 	 * so_unbind_mp by the time we return from so_lock_single(),
2281 	 * we should check the availability of so_unbind_mp after
2282 	 * we return from so_lock_single().
2283 	 */
2284 
2285 	so_lock_single(so);	/* Set SOLOCKED */
2286 	need_unlock = B_TRUE;
2287 
2288 	if (sti->sti_unbind_mp == NULL) {
2289 		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2290 		/* NOTE: holding so_lock while sleeping */
2291 		sti->sti_unbind_mp =
2292 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
2293 		if (sti->sti_unbind_mp == NULL) {
2294 			error = EINTR;
2295 			goto done;
2296 		}
2297 	}
2298 
2299 	/*
2300 	 * Can't have done a listen before connecting.
2301 	 */
2302 	if (so->so_state & SS_ACCEPTCONN) {
2303 		error = EOPNOTSUPP;
2304 		goto done;
2305 	}
2306 
2307 	/*
2308 	 * Must be bound with the transport
2309 	 */
2310 	if (!(so->so_state & SS_ISBOUND)) {
2311 		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2312 		    /*CONSTCOND*/
2313 		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2314 			/*
2315 			 * Optimization for AF_INET{,6} transports
2316 			 * that can handle a T_CONN_REQ without being bound.
2317 			 */
2318 			so_automatic_bind(so);
2319 		} else {
2320 			error = sotpi_bind(so, NULL, 0,
2321 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2322 			if (error)
2323 				goto done;
2324 		}
2325 		ASSERT(so->so_state & SS_ISBOUND);
2326 		flags |= _SOCONNECT_DID_BIND;
2327 	}
2328 
2329 	/*
2330 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
2331 	 * connect to a null address. This is the portable method to
2332 	 * unconnect a socket.
2333 	 */
2334 	if ((namelen >= sizeof (sa_family_t)) &&
2335 	    (name->sa_family == AF_UNSPEC)) {
2336 		name = NULL;
2337 		namelen = 0;
2338 	}
2339 
2340 	/*
2341 	 * Check that we are not already connected.
2342 	 * A connection-oriented socket cannot be reconnected.
2343 	 * A connected connection-less socket can be
2344 	 * - connected to a different address by a subsequent connect
2345 	 * - "unconnected" by a connect to the NULL address
2346 	 */
2347 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2348 		ASSERT(!(flags & _SOCONNECT_DID_BIND));
2349 		if (so->so_mode & SM_CONNREQUIRED) {
2350 			/* Connection-oriented socket */
2351 			error = so->so_state & SS_ISCONNECTED ?
2352 			    EISCONN : EALREADY;
2353 			goto done;
2354 		}
2355 		/* Connection-less socket */
2356 		if (name == NULL) {
2357 			/*
2358 			 * Remove the connected state and clear SO_DGRAM_ERRIND
2359 			 * since it was set when the socket was connected.
2360 			 * If this is UDP also send down a T_DISCON_REQ.
2361 			 */
2362 			int val;
2363 
2364 			if ((so->so_family == AF_INET ||
2365 			    so->so_family == AF_INET6) &&
2366 			    (so->so_type == SOCK_DGRAM ||
2367 			    so->so_type == SOCK_RAW) &&
2368 			    /*CONSTCOND*/
2369 			    !soconnect_tpi_udp) {
2370 				/* XXX What about implicitly unbinding here? */
2371 				error = sodisconnect(so, -1,
2372 				    _SODISCONNECT_LOCK_HELD);
2373 			} else {
2374 				so->so_state &=
2375 				    ~(SS_ISCONNECTED | SS_ISCONNECTING);
2376 				sti->sti_faddr_valid = 0;
2377 				sti->sti_faddr_len = 0;
2378 			}
2379 
2380 			/* Remove SOLOCKED since setsockopt will grab it */
2381 			so_unlock_single(so, SOLOCKED);
2382 			mutex_exit(&so->so_lock);
2383 
2384 			val = 0;
2385 			(void) sotpi_setsockopt(so, SOL_SOCKET,
2386 			    SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2387 			    cr);
2388 
2389 			mutex_enter(&so->so_lock);
2390 			so_lock_single(so);	/* Set SOLOCKED */
2391 			goto done;
2392 		}
2393 	}
2394 	ASSERT(so->so_state & SS_ISBOUND);
2395 
2396 	if (name == NULL || namelen == 0) {
2397 		error = EINVAL;
2398 		goto done;
2399 	}
2400 	/*
2401 	 * Mark the socket if sti_faddr_sa represents the transport level
2402 	 * address.
2403 	 */
2404 	if (flags & _SOCONNECT_NOXLATE) {
2405 		struct sockaddr_ux	*soaddr_ux;
2406 
2407 		ASSERT(so->so_family == AF_UNIX);
2408 		if (namelen != sizeof (struct sockaddr_ux)) {
2409 			error = EINVAL;
2410 			goto done;
2411 		}
2412 		soaddr_ux = (struct sockaddr_ux *)name;
2413 		name = (struct sockaddr *)&soaddr_ux->sou_addr;
2414 		namelen = sizeof (soaddr_ux->sou_addr);
2415 		sti->sti_faddr_noxlate = 1;
2416 	}
2417 
2418 	/*
2419 	 * Length and family checks.
2420 	 */
2421 	error = so_addr_verify(so, name, namelen);
2422 	if (error)
2423 		goto bad;
2424 
2425 	/*
2426 	 * Save foreign address. Needed for AF_UNIX as well as
2427 	 * transport providers that do not support TI_GETPEERNAME.
2428 	 * Also used for cached foreign address for TCP and UDP.
2429 	 */
2430 	if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2431 		error = EINVAL;
2432 		goto done;
2433 	}
2434 	sti->sti_faddr_len = (socklen_t)namelen;
2435 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2436 	bcopy(name, sti->sti_faddr_sa, namelen);
2437 	sti->sti_faddr_valid = 1;
2438 
2439 	if (so->so_family == AF_UNIX) {
2440 		if (sti->sti_faddr_noxlate) {
2441 			/*
2442 			 * Already have a transport internal address. Do not
2443 			 * pass any (transport internal) source address.
2444 			 */
2445 			addr = sti->sti_faddr_sa;
2446 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2447 			src = NULL;
2448 			srclen = 0;
2449 		} else {
2450 			/*
2451 			 * Pass the sockaddr_un source address as an option
2452 			 * and translate the remote address.
2453 			 * Holding so_lock thus sti_laddr_sa can not change.
2454 			 */
2455 			src = sti->sti_laddr_sa;
2456 			srclen = (t_uscalar_t)sti->sti_laddr_len;
2457 			dprintso(so, 1,
2458 			    ("sotpi_connect UNIX: srclen %d, src %p\n",
2459 			    srclen, src));
2460 			error = so_ux_addr_xlate(so,
2461 			    sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2462 			    (flags & _SOCONNECT_XPG4_2),
2463 			    &addr, &addrlen);
2464 			if (error)
2465 				goto bad;
2466 		}
2467 	} else {
2468 		addr = sti->sti_faddr_sa;
2469 		addrlen = (t_uscalar_t)sti->sti_faddr_len;
2470 		src = NULL;
2471 		srclen = 0;
2472 	}
2473 	/*
2474 	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2475 	 * option which asks the transport provider to send T_UDERR_IND
2476 	 * messages. These T_UDERR_IND messages are used to return connected
2477 	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2478 	 *
2479 	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2480 	 * we send down a T_CONN_REQ. This is needed to let the
2481 	 * transport assign a local address that is consistent with
2482 	 * the remote address. Applications depend on a getsockname()
2483 	 * after a connect() to retrieve the "source" IP address for
2484 	 * the connected socket.  Invalidate the cached local address
2485 	 * to force getsockname() to enquire of the transport.
2486 	 */
2487 	if (!(so->so_mode & SM_CONNREQUIRED)) {
2488 		/*
2489 		 * Datagram socket.
2490 		 */
2491 		int32_t val;
2492 
2493 		so_unlock_single(so, SOLOCKED);
2494 		mutex_exit(&so->so_lock);
2495 
2496 		val = 1;
2497 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2498 		    &val, (t_uscalar_t)sizeof (val), cr);
2499 
2500 		mutex_enter(&so->so_lock);
2501 		so_lock_single(so);	/* Set SOLOCKED */
2502 		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2503 		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2504 		    soconnect_tpi_udp) {
2505 			soisconnected(so);
2506 			goto done;
2507 		}
2508 		/*
2509 		 * Send down T_CONN_REQ etc.
2510 		 * Clear fflag to avoid returning EWOULDBLOCK.
2511 		 */
2512 		fflag = 0;
2513 		ASSERT(so->so_family != AF_UNIX);
2514 		sti->sti_laddr_valid = 0;
2515 	} else if (sti->sti_laddr_len != 0) {
2516 		/*
2517 		 * If the local address or port was "any" then it may be
2518 		 * changed by the transport as a result of the
2519 		 * connect.  Invalidate the cached version if we have one.
2520 		 */
2521 		switch (so->so_family) {
2522 		case AF_INET:
2523 			ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2524 			if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2525 			    INADDR_ANY ||
2526 			    ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2527 				sti->sti_laddr_valid = 0;
2528 			break;
2529 
2530 		case AF_INET6:
2531 			ASSERT(sti->sti_laddr_len ==
2532 			    (socklen_t)sizeof (sin6_t));
2533 			if (IN6_IS_ADDR_UNSPECIFIED(
2534 			    &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2535 			    IN6_IS_ADDR_V4MAPPED_ANY(
2536 			    &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2537 			    ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2538 				sti->sti_laddr_valid = 0;
2539 			break;
2540 
2541 		default:
2542 			break;
2543 		}
2544 	}
2545 
2546 	/*
2547 	 * Check for failure of an earlier call
2548 	 */
2549 	if (so->so_error != 0)
2550 		goto so_bad;
2551 
2552 	/*
2553 	 * Send down T_CONN_REQ. Message was allocated above.
2554 	 */
2555 	conn_req.PRIM_type = T_CONN_REQ;
2556 	conn_req.DEST_length = addrlen;
2557 	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2558 	if (srclen == 0) {
2559 		conn_req.OPT_length = 0;
2560 		conn_req.OPT_offset = 0;
2561 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2562 		soappendmsg(mp, addr, addrlen);
2563 	} else {
2564 		/*
2565 		 * There is a AF_UNIX sockaddr_un to include as a source
2566 		 * address option.
2567 		 */
2568 		struct T_opthdr toh;
2569 
2570 		toh.level = SOL_SOCKET;
2571 		toh.name = SO_SRCADDR;
2572 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2573 		toh.status = 0;
2574 		conn_req.OPT_length =
2575 		    (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2576 		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2577 		    _TPI_ALIGN_TOPT(addrlen));
2578 
2579 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2580 		soappendmsg(mp, addr, addrlen);
2581 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2582 		soappendmsg(mp, &toh, sizeof (toh));
2583 		soappendmsg(mp, src, srclen);
2584 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2585 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2586 	}
2587 	/*
2588 	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2589 	 * in order to have the right state when the T_CONN_CON shows up.
2590 	 */
2591 	soisconnecting(so);
2592 	mutex_exit(&so->so_lock);
2593 
2594 	if (audit_active)
2595 		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2596 
2597 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2598 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2599 	mp = NULL;
2600 	mutex_enter(&so->so_lock);
2601 	if (error != 0)
2602 		goto bad;
2603 
2604 	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2605 		goto bad;
2606 
2607 	/* Allow other threads to access the socket */
2608 	so_unlock_single(so, SOLOCKED);
2609 	need_unlock = B_FALSE;
2610 
2611 	/*
2612 	 * Wait until we get a T_CONN_CON or an error
2613 	 */
2614 	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2615 		so_lock_single(so);	/* Set SOLOCKED */
2616 		need_unlock = B_TRUE;
2617 	}
2618 
2619 done:
2620 	freemsg(mp);
2621 	switch (error) {
2622 	case EINPROGRESS:
2623 	case EALREADY:
2624 	case EISCONN:
2625 	case EINTR:
2626 		/* Non-fatal errors */
2627 		sti->sti_laddr_valid = 0;
2628 		/* FALLTHRU */
2629 	case 0:
2630 		break;
2631 	default:
2632 		ASSERT(need_unlock);
2633 		/*
2634 		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2635 		 * and invalidate local-address cache
2636 		 */
2637 		so->so_state &= ~SS_ISCONNECTING;
2638 		sti->sti_laddr_valid = 0;
2639 		/* A discon_ind might have already unbound us */
2640 		if ((flags & _SOCONNECT_DID_BIND) &&
2641 		    (so->so_state & SS_ISBOUND)) {
2642 			int err;
2643 
2644 			err = sotpi_unbind(so, 0);
2645 			/* LINTED - statement has no conseq */
2646 			if (err) {
2647 				eprintsoline(so, err);
2648 			}
2649 		}
2650 		break;
2651 	}
2652 	if (need_unlock)
2653 		so_unlock_single(so, SOLOCKED);
2654 	mutex_exit(&so->so_lock);
2655 	return (error);
2656 
2657 so_bad:	error = sogeterr(so, B_TRUE);
2658 bad:	eprintsoline(so, error);
2659 	goto done;
2660 }
2661 
2662 /* ARGSUSED */
2663 int
2664 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2665 {
2666 	struct T_ordrel_req	ordrel_req;
2667 	mblk_t			*mp;
2668 	uint_t			old_state, state_change;
2669 	int			error = 0;
2670 	sotpi_info_t		*sti = SOTOTPI(so);
2671 
2672 	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2673 	    (void *)so, how, pr_state(so->so_state, so->so_mode)));
2674 
2675 	mutex_enter(&so->so_lock);
2676 	so_lock_single(so);	/* Set SOLOCKED */
2677 
2678 	/*
2679 	 * SunOS 4.X has no check for datagram sockets.
2680 	 * 5.X checks that it is connected (ENOTCONN)
2681 	 * X/Open requires that we check the connected state.
2682 	 */
2683 	if (!(so->so_state & SS_ISCONNECTED)) {
2684 		if (!xnet_skip_checks) {
2685 			error = ENOTCONN;
2686 			if (xnet_check_print) {
2687 				printf("sockfs: X/Open shutdown check "
2688 				    "caused ENOTCONN\n");
2689 			}
2690 		}
2691 		goto done;
2692 	}
2693 	/*
2694 	 * Record the current state and then perform any state changes.
2695 	 * Then use the difference between the old and new states to
2696 	 * determine which messages need to be sent.
2697 	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2698 	 * duplicate calls to shutdown().
2699 	 */
2700 	old_state = so->so_state;
2701 
2702 	switch (how) {
2703 	case 0:
2704 		socantrcvmore(so);
2705 		break;
2706 	case 1:
2707 		socantsendmore(so);
2708 		break;
2709 	case 2:
2710 		socantsendmore(so);
2711 		socantrcvmore(so);
2712 		break;
2713 	default:
2714 		error = EINVAL;
2715 		goto done;
2716 	}
2717 
2718 	/*
2719 	 * Assumes that the SS_CANT* flags are never cleared in the above code.
2720 	 */
2721 	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2722 	    (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2723 	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2724 
2725 	switch (state_change) {
2726 	case 0:
2727 		dprintso(so, 1,
2728 		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
2729 		    so->so_state));
2730 		goto done;
2731 
2732 	case SS_CANTRCVMORE:
2733 		mutex_exit(&so->so_lock);
2734 		strseteof(SOTOV(so), 1);
2735 		/*
2736 		 * strseteof takes care of read side wakeups,
2737 		 * pollwakeups, and signals.
2738 		 */
2739 		/*
2740 		 * Get the read lock before flushing data to avoid problems
2741 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2742 		 */
2743 		mutex_enter(&so->so_lock);
2744 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2745 		mutex_exit(&so->so_lock);
2746 
2747 		/* Flush read side queue */
2748 		strflushrq(SOTOV(so), FLUSHALL);
2749 
2750 		mutex_enter(&so->so_lock);
2751 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2752 		break;
2753 
2754 	case SS_CANTSENDMORE:
2755 		mutex_exit(&so->so_lock);
2756 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2757 		mutex_enter(&so->so_lock);
2758 		break;
2759 
2760 	case SS_CANTSENDMORE|SS_CANTRCVMORE:
2761 		mutex_exit(&so->so_lock);
2762 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2763 		strseteof(SOTOV(so), 1);
2764 		/*
2765 		 * strseteof takes care of read side wakeups,
2766 		 * pollwakeups, and signals.
2767 		 */
2768 		/*
2769 		 * Get the read lock before flushing data to avoid problems
2770 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2771 		 */
2772 		mutex_enter(&so->so_lock);
2773 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2774 		mutex_exit(&so->so_lock);
2775 
2776 		/* Flush read side queue */
2777 		strflushrq(SOTOV(so), FLUSHALL);
2778 
2779 		mutex_enter(&so->so_lock);
2780 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2781 		break;
2782 	}
2783 
2784 	ASSERT(MUTEX_HELD(&so->so_lock));
2785 
2786 	/*
2787 	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2788 	 * was set due to this call and the new state has both of them set:
2789 	 *	Send the AF_UNIX close indication
2790 	 *	For T_COTS send a discon_ind
2791 	 *
2792 	 * If cantsend was set due to this call:
2793 	 *	For T_COTSORD send an ordrel_ind
2794 	 *
2795 	 * Note that for T_CLTS there is no message sent here.
2796 	 */
2797 	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2798 	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2799 		/*
2800 		 * For SunOS 4.X compatibility we tell the other end
2801 		 * that we are unable to receive at this point.
2802 		 */
2803 		if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2804 			so_unix_close(so);
2805 
2806 		if (sti->sti_serv_type == T_COTS)
2807 			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2808 	}
2809 	if ((state_change & SS_CANTSENDMORE) &&
2810 	    (sti->sti_serv_type == T_COTS_ORD)) {
2811 		/* Send an orderly release */
2812 		ordrel_req.PRIM_type = T_ORDREL_REQ;
2813 
2814 		mutex_exit(&so->so_lock);
2815 		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2816 		    0, _ALLOC_SLEEP, cr);
2817 		/*
2818 		 * Send down the T_ORDREL_REQ even if there is flow control.
2819 		 * This prevents shutdown from blocking.
2820 		 * Note that there is no T_OK_ACK for ordrel_req.
2821 		 */
2822 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2823 		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2824 		mutex_enter(&so->so_lock);
2825 		if (error) {
2826 			eprintsoline(so, error);
2827 			goto done;
2828 		}
2829 	}
2830 
2831 done:
2832 	so_unlock_single(so, SOLOCKED);
2833 	mutex_exit(&so->so_lock);
2834 	return (error);
2835 }
2836 
2837 /*
2838  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2839  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2840  * that we have closed.
2841  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2842  * T_UNITDATA_REQ containing the same option.
2843  *
2844  * For SOCK_DGRAM half-connections (somebody connected to this end
2845  * but this end is not connect) we don't know where to send any
2846  * SO_UNIX_CLOSE.
2847  *
2848  * We have to ignore stream head errors just in case there has been
2849  * a shutdown(output).
2850  * Ignore any flow control to try to get the message more quickly to the peer.
2851  * While locally ignoring flow control solves the problem when there
2852  * is only the loopback transport on the stream it would not provide
2853  * the correct AF_UNIX socket semantics when one or more modules have
2854  * been pushed.
2855  */
2856 void
2857 so_unix_close(struct sonode *so)
2858 {
2859 	int		error;
2860 	struct T_opthdr	toh;
2861 	mblk_t		*mp;
2862 	sotpi_info_t	*sti = SOTOTPI(so);
2863 
2864 	ASSERT(MUTEX_HELD(&so->so_lock));
2865 
2866 	ASSERT(so->so_family == AF_UNIX);
2867 
2868 	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2869 	    (SS_ISCONNECTED|SS_ISBOUND))
2870 		return;
2871 
2872 	dprintso(so, 1, ("so_unix_close(%p) %s\n",
2873 	    (void *)so, pr_state(so->so_state, so->so_mode)));
2874 
2875 	toh.level = SOL_SOCKET;
2876 	toh.name = SO_UNIX_CLOSE;
2877 
2878 	/* zero length + header */
2879 	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2880 	toh.status = 0;
2881 
2882 	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2883 		struct T_optdata_req tdr;
2884 
2885 		tdr.PRIM_type = T_OPTDATA_REQ;
2886 		tdr.DATA_flag = 0;
2887 
2888 		tdr.OPT_length = (t_scalar_t)sizeof (toh);
2889 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2890 
2891 		/* NOTE: holding so_lock while sleeping */
2892 		mp = soallocproto2(&tdr, sizeof (tdr),
2893 		    &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2894 	} else {
2895 		struct T_unitdata_req	tudr;
2896 		void			*addr;
2897 		socklen_t		addrlen;
2898 		void			*src;
2899 		socklen_t		srclen;
2900 		struct T_opthdr		toh2;
2901 		t_scalar_t		size;
2902 
2903 		/* Connecteded DGRAM socket */
2904 
2905 		/*
2906 		 * For AF_UNIX the destination address is translated to
2907 		 * an internal name and the source address is passed as
2908 		 * an option.
2909 		 */
2910 		/*
2911 		 * Length and family checks.
2912 		 */
2913 		error = so_addr_verify(so, sti->sti_faddr_sa,
2914 		    (t_uscalar_t)sti->sti_faddr_len);
2915 		if (error) {
2916 			eprintsoline(so, error);
2917 			return;
2918 		}
2919 		if (sti->sti_faddr_noxlate) {
2920 			/*
2921 			 * Already have a transport internal address. Do not
2922 			 * pass any (transport internal) source address.
2923 			 */
2924 			addr = sti->sti_faddr_sa;
2925 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2926 			src = NULL;
2927 			srclen = 0;
2928 		} else {
2929 			/*
2930 			 * Pass the sockaddr_un source address as an option
2931 			 * and translate the remote address.
2932 			 * Holding so_lock thus sti_laddr_sa can not change.
2933 			 */
2934 			src = sti->sti_laddr_sa;
2935 			srclen = (socklen_t)sti->sti_laddr_len;
2936 			dprintso(so, 1,
2937 			    ("so_ux_close: srclen %d, src %p\n",
2938 			    srclen, src));
2939 			error = so_ux_addr_xlate(so,
2940 			    sti->sti_faddr_sa,
2941 			    (socklen_t)sti->sti_faddr_len, 0,
2942 			    &addr, &addrlen);
2943 			if (error) {
2944 				eprintsoline(so, error);
2945 				return;
2946 			}
2947 		}
2948 		tudr.PRIM_type = T_UNITDATA_REQ;
2949 		tudr.DEST_length = addrlen;
2950 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2951 		if (srclen == 0) {
2952 			tudr.OPT_length = (t_scalar_t)sizeof (toh);
2953 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2954 			    _TPI_ALIGN_TOPT(addrlen));
2955 
2956 			size = tudr.OPT_offset + tudr.OPT_length;
2957 			/* NOTE: holding so_lock while sleeping */
2958 			mp = soallocproto2(&tudr, sizeof (tudr),
2959 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
2960 			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2961 			soappendmsg(mp, &toh, sizeof (toh));
2962 		} else {
2963 			/*
2964 			 * There is a AF_UNIX sockaddr_un to include as a
2965 			 * source address option.
2966 			 */
2967 			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2968 			    _TPI_ALIGN_TOPT(srclen));
2969 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2970 			    _TPI_ALIGN_TOPT(addrlen));
2971 
2972 			toh2.level = SOL_SOCKET;
2973 			toh2.name = SO_SRCADDR;
2974 			toh2.len = (t_uscalar_t)(srclen +
2975 			    sizeof (struct T_opthdr));
2976 			toh2.status = 0;
2977 
2978 			size = tudr.OPT_offset + tudr.OPT_length;
2979 
2980 			/* NOTE: holding so_lock while sleeping */
2981 			mp = soallocproto2(&tudr, sizeof (tudr),
2982 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
2983 			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2984 			soappendmsg(mp, &toh, sizeof (toh));
2985 			soappendmsg(mp, &toh2, sizeof (toh2));
2986 			soappendmsg(mp, src, srclen);
2987 			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2988 		}
2989 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2990 	}
2991 	mutex_exit(&so->so_lock);
2992 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2993 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2994 	mutex_enter(&so->so_lock);
2995 }
2996 
2997 /*
2998  * Called by sotpi_recvmsg when reading a non-zero amount of data.
2999  * In addition, the caller typically verifies that there is some
3000  * potential state to clear by checking
3001  *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
3002  * before calling this routine.
3003  * Note that such a check can be made without holding so_lock since
3004  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
3005  * decrements sti_oobsigcnt.
3006  *
3007  * When data is read *after* the point that all pending
3008  * oob data has been consumed the oob indication is cleared.
3009  *
3010  * This logic keeps select/poll returning POLLRDBAND and
3011  * SIOCATMARK returning true until we have read past
3012  * the mark.
3013  */
3014 static void
3015 sorecv_update_oobstate(struct sonode *so)
3016 {
3017 	sotpi_info_t *sti = SOTOTPI(so);
3018 
3019 	mutex_enter(&so->so_lock);
3020 	ASSERT(so_verify_oobstate(so));
3021 	dprintso(so, 1,
3022 	    ("sorecv_update_oobstate: counts %d/%d state %s\n",
3023 	    sti->sti_oobsigcnt,
3024 	    sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
3025 	if (sti->sti_oobsigcnt == 0) {
3026 		/* No more pending oob indications */
3027 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
3028 		freemsg(so->so_oobmsg);
3029 		so->so_oobmsg = NULL;
3030 	}
3031 	ASSERT(so_verify_oobstate(so));
3032 	mutex_exit(&so->so_lock);
3033 }
3034 
3035 /*
3036  * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
3037  */
3038 static int
3039 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
3040 {
3041 	sotpi_info_t *sti = SOTOTPI(so);
3042 	int	error = 0;
3043 	mblk_t *tmp = NULL;
3044 	mblk_t *pmp = NULL;
3045 	mblk_t *nmp = sti->sti_nl7c_rcv_mp;
3046 
3047 	ASSERT(nmp != NULL);
3048 
3049 	while (nmp != NULL && uiop->uio_resid > 0) {
3050 		ssize_t n;
3051 
3052 		if (DB_TYPE(nmp) == M_DATA) {
3053 			/*
3054 			 * We have some data, uiomove up to resid bytes.
3055 			 */
3056 			n = MIN(MBLKL(nmp), uiop->uio_resid);
3057 			if (n > 0)
3058 				error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
3059 			nmp->b_rptr += n;
3060 			if (nmp->b_rptr == nmp->b_wptr) {
3061 				pmp = nmp;
3062 				nmp = nmp->b_cont;
3063 			}
3064 			if (error)
3065 				break;
3066 		} else {
3067 			/*
3068 			 * We only handle data, save for caller to handle.
3069 			 */
3070 			if (pmp != NULL) {
3071 				pmp->b_cont = nmp->b_cont;
3072 			}
3073 			nmp->b_cont = NULL;
3074 			if (*rmp == NULL) {
3075 				*rmp = nmp;
3076 			} else {
3077 				tmp->b_cont = nmp;
3078 			}
3079 			nmp = nmp->b_cont;
3080 			tmp = nmp;
3081 		}
3082 	}
3083 	if (pmp != NULL) {
3084 		/* Free any mblk_t(s) which we have consumed */
3085 		pmp->b_cont = NULL;
3086 		freemsg(sti->sti_nl7c_rcv_mp);
3087 	}
3088 	if ((sti->sti_nl7c_rcv_mp = nmp) == NULL) {
3089 		/* Last mblk_t so return the saved kstrgetmsg() rval/error */
3090 		if (error == 0) {
3091 			rval_t	*p = (rval_t *)&sti->sti_nl7c_rcv_rval;
3092 
3093 			error = p->r_v.r_v2;
3094 			p->r_v.r_v2 = 0;
3095 		}
3096 		rp->r_vals = sti->sti_nl7c_rcv_rval;
3097 		sti->sti_nl7c_rcv_rval = 0;
3098 	} else {
3099 		/* More mblk_t(s) to process so no rval to return */
3100 		rp->r_vals = 0;
3101 	}
3102 	return (error);
3103 }
3104 /*
3105  * Receive the next message on the queue.
3106  * If msg_controllen is non-zero when called the caller is interested in
3107  * any received control info (options).
3108  * If msg_namelen is non-zero when called the caller is interested in
3109  * any received source address.
3110  * The routine returns with msg_control and msg_name pointing to
3111  * kmem_alloc'ed memory which the caller has to free.
3112  */
3113 /* ARGSUSED */
3114 int
3115 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
3116     struct cred *cr)
3117 {
3118 	union T_primitives	*tpr;
3119 	mblk_t			*mp;
3120 	uchar_t			pri;
3121 	int			pflag, opflag;
3122 	void			*control;
3123 	t_uscalar_t		controllen;
3124 	t_uscalar_t		namelen;
3125 	int			so_state = so->so_state; /* Snapshot */
3126 	ssize_t			saved_resid;
3127 	rval_t			rval;
3128 	int			flags;
3129 	clock_t			timout;
3130 	int			error = 0;
3131 	sotpi_info_t		*sti = SOTOTPI(so);
3132 
3133 	flags = msg->msg_flags;
3134 	msg->msg_flags = 0;
3135 
3136 	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
3137 	    (void *)so, (void *)msg, flags,
3138 	    pr_state(so->so_state, so->so_mode), so->so_error));
3139 
3140 	if (so->so_version == SOV_STREAM) {
3141 		so_update_attrs(so, SOACC);
3142 		/* The imaginary "sockmod" has been popped - act as a stream */
3143 		return (strread(SOTOV(so), uiop, cr));
3144 	}
3145 
3146 	/*
3147 	 * If we are not connected because we have never been connected
3148 	 * we return ENOTCONN. If we have been connected (but are no longer
3149 	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
3150 	 * the EOF.
3151 	 *
3152 	 * An alternative would be to post an ENOTCONN error in stream head
3153 	 * (read+write) and clear it when we're connected. However, that error
3154 	 * would cause incorrect poll/select behavior!
3155 	 */
3156 	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
3157 	    (so->so_mode & SM_CONNREQUIRED)) {
3158 		return (ENOTCONN);
3159 	}
3160 
3161 	/*
3162 	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
3163 	 * after checking that the read queue is empty) and returns zero.
3164 	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
3165 	 * is zero.
3166 	 */
3167 
3168 	if (flags & MSG_OOB) {
3169 		/* Check that the transport supports OOB */
3170 		if (!(so->so_mode & SM_EXDATA))
3171 			return (EOPNOTSUPP);
3172 		so_update_attrs(so, SOACC);
3173 		return (sorecvoob(so, msg, uiop, flags,
3174 		    (so->so_options & SO_OOBINLINE)));
3175 	}
3176 
3177 	so_update_attrs(so, SOACC);
3178 
3179 	/*
3180 	 * Set msg_controllen and msg_namelen to zero here to make it
3181 	 * simpler in the cases that no control or name is returned.
3182 	 */
3183 	controllen = msg->msg_controllen;
3184 	namelen = msg->msg_namelen;
3185 	msg->msg_controllen = 0;
3186 	msg->msg_namelen = 0;
3187 
3188 	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
3189 	    namelen, controllen));
3190 
3191 	mutex_enter(&so->so_lock);
3192 	/*
3193 	 * If an NL7C enabled socket and not waiting for write data.
3194 	 */
3195 	if ((sti->sti_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
3196 	    NL7C_ENABLED) {
3197 		if (sti->sti_nl7c_uri) {
3198 			/* Close uri processing for a previous request */
3199 			nl7c_close(so);
3200 		}
3201 		if ((so_state & SS_CANTRCVMORE) &&
3202 		    sti->sti_nl7c_rcv_mp == NULL) {
3203 			/* Nothing to process, EOF */
3204 			mutex_exit(&so->so_lock);
3205 			return (0);
3206 		} else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
3207 			/* Persistent NL7C socket, try to process request */
3208 			boolean_t ret;
3209 
3210 			ret = nl7c_process(so,
3211 			    (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
3212 			rval.r_vals = sti->sti_nl7c_rcv_rval;
3213 			error = rval.r_v.r_v2;
3214 			if (error) {
3215 				/* Error of some sort, return it */
3216 				mutex_exit(&so->so_lock);
3217 				return (error);
3218 			}
3219 			if (sti->sti_nl7c_flags &&
3220 			    ! (sti->sti_nl7c_flags & NL7C_WAITWRITE)) {
3221 				/*
3222 				 * Still an NL7C socket and no data
3223 				 * to pass up to the caller.
3224 				 */
3225 				mutex_exit(&so->so_lock);
3226 				if (ret) {
3227 					/* EOF */
3228 					return (0);
3229 				} else {
3230 					/* Need more data */
3231 					return (EAGAIN);
3232 				}
3233 			}
3234 		} else {
3235 			/*
3236 			 * Not persistent so no further NL7C processing.
3237 			 */
3238 			sti->sti_nl7c_flags = 0;
3239 		}
3240 	}
3241 	/*
3242 	 * Only one reader is allowed at any given time. This is needed
3243 	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
3244 	 *
3245 	 * This is slightly different that BSD behavior in that it fails with
3246 	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3247 	 * is single-threaded using sblock(), which is dropped while waiting
3248 	 * for data to appear. The difference shows up e.g. if one
3249 	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3250 	 * does use nonblocking io and different threads are reading each
3251 	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3252 	 * in this case as long as the read queue doesn't get empty.
3253 	 * In this implementation the thread using nonblocking io can
3254 	 * get an EWOULDBLOCK error due to the blocking thread executing
3255 	 * e.g. in the uiomove in kstrgetmsg.
3256 	 * This difference is not believed to be significant.
3257 	 */
3258 	/* Set SOREADLOCKED */
3259 	error = so_lock_read_intr(so,
3260 	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3261 	mutex_exit(&so->so_lock);
3262 	if (error)
3263 		return (error);
3264 
3265 	/*
3266 	 * Tell kstrgetmsg to not inspect the stream head errors until all
3267 	 * queued data has been consumed.
3268 	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3269 	 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3270 	 *
3271 	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3272 	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3273 	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3274 	 */
3275 	pflag = MSG_ANY | MSG_DELAYERROR;
3276 	if (flags & MSG_PEEK) {
3277 		pflag |= MSG_IPEEK;
3278 		flags &= ~MSG_WAITALL;
3279 	}
3280 	if (so->so_mode & SM_ATOMIC)
3281 		pflag |= MSG_DISCARDTAIL;
3282 
3283 	if (flags & MSG_DONTWAIT)
3284 		timout = 0;
3285 	else
3286 		timout = -1;
3287 	opflag = pflag;
3288 retry:
3289 	saved_resid = uiop->uio_resid;
3290 	pri = 0;
3291 	mp = NULL;
3292 	if (sti->sti_nl7c_rcv_mp != NULL) {
3293 		/* Already kstrgetmsg()ed saved mblk(s) from NL7C */
3294 		error = nl7c_sorecv(so, &mp, uiop, &rval);
3295 	} else {
3296 		error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3297 		    timout, &rval);
3298 	}
3299 	if (error != 0) {
3300 		/* kstrgetmsg returns ETIME when timeout expires */
3301 		if (error == ETIME)
3302 			error = EWOULDBLOCK;
3303 		goto out;
3304 	}
3305 	/*
3306 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3307 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
3308 	 */
3309 	ASSERT(!(rval.r_val1 & MORECTL));
3310 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3311 		msg->msg_flags |= MSG_TRUNC;
3312 
3313 	if (mp == NULL) {
3314 		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3315 		/*
3316 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3317 		 * The draft Posix socket spec states that the mark should
3318 		 * not be cleared when peeking. We follow the latter.
3319 		 */
3320 		if ((so->so_state &
3321 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3322 		    (uiop->uio_resid != saved_resid) &&
3323 		    !(flags & MSG_PEEK)) {
3324 			sorecv_update_oobstate(so);
3325 		}
3326 
3327 		mutex_enter(&so->so_lock);
3328 		/* Set MSG_EOR based on MOREDATA */
3329 		if (!(rval.r_val1 & MOREDATA)) {
3330 			if (so->so_state & SS_SAVEDEOR) {
3331 				msg->msg_flags |= MSG_EOR;
3332 				so->so_state &= ~SS_SAVEDEOR;
3333 			}
3334 		}
3335 		/*
3336 		 * If some data was received (i.e. not EOF) and the
3337 		 * read/recv* has not been satisfied wait for some more.
3338 		 */
3339 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3340 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3341 			mutex_exit(&so->so_lock);
3342 			pflag = opflag | MSG_NOMARK;
3343 			goto retry;
3344 		}
3345 		goto out_locked;
3346 	}
3347 
3348 	/* strsock_proto has already verified length and alignment */
3349 	tpr = (union T_primitives *)mp->b_rptr;
3350 	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3351 
3352 	switch (tpr->type) {
3353 	case T_DATA_IND: {
3354 		if ((so->so_state &
3355 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3356 		    (uiop->uio_resid != saved_resid) &&
3357 		    !(flags & MSG_PEEK)) {
3358 			sorecv_update_oobstate(so);
3359 		}
3360 
3361 		/*
3362 		 * Set msg_flags to MSG_EOR based on
3363 		 * MORE_flag and MOREDATA.
3364 		 */
3365 		mutex_enter(&so->so_lock);
3366 		so->so_state &= ~SS_SAVEDEOR;
3367 		if (!(tpr->data_ind.MORE_flag & 1)) {
3368 			if (!(rval.r_val1 & MOREDATA))
3369 				msg->msg_flags |= MSG_EOR;
3370 			else
3371 				so->so_state |= SS_SAVEDEOR;
3372 		}
3373 		freemsg(mp);
3374 		/*
3375 		 * If some data was received (i.e. not EOF) and the
3376 		 * read/recv* has not been satisfied wait for some more.
3377 		 */
3378 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3379 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3380 			mutex_exit(&so->so_lock);
3381 			pflag = opflag | MSG_NOMARK;
3382 			goto retry;
3383 		}
3384 		goto out_locked;
3385 	}
3386 	case T_UNITDATA_IND: {
3387 		void *addr;
3388 		t_uscalar_t addrlen;
3389 		void *abuf;
3390 		t_uscalar_t optlen;
3391 		void *opt;
3392 
3393 		if ((so->so_state &
3394 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3395 		    (uiop->uio_resid != saved_resid) &&
3396 		    !(flags & MSG_PEEK)) {
3397 			sorecv_update_oobstate(so);
3398 		}
3399 
3400 		if (namelen != 0) {
3401 			/* Caller wants source address */
3402 			addrlen = tpr->unitdata_ind.SRC_length;
3403 			addr = sogetoff(mp,
3404 			    tpr->unitdata_ind.SRC_offset,
3405 			    addrlen, 1);
3406 			if (addr == NULL) {
3407 				freemsg(mp);
3408 				error = EPROTO;
3409 				eprintsoline(so, error);
3410 				goto out;
3411 			}
3412 			if (so->so_family == AF_UNIX) {
3413 				/*
3414 				 * Can not use the transport level address.
3415 				 * If there is a SO_SRCADDR option carrying
3416 				 * the socket level address it will be
3417 				 * extracted below.
3418 				 */
3419 				addr = NULL;
3420 				addrlen = 0;
3421 			}
3422 		}
3423 		optlen = tpr->unitdata_ind.OPT_length;
3424 		if (optlen != 0) {
3425 			t_uscalar_t ncontrollen;
3426 
3427 			/*
3428 			 * Extract any source address option.
3429 			 * Determine how large cmsg buffer is needed.
3430 			 */
3431 			opt = sogetoff(mp,
3432 			    tpr->unitdata_ind.OPT_offset,
3433 			    optlen, __TPI_ALIGN_SIZE);
3434 
3435 			if (opt == NULL) {
3436 				freemsg(mp);
3437 				error = EPROTO;
3438 				eprintsoline(so, error);
3439 				goto out;
3440 			}
3441 			if (so->so_family == AF_UNIX)
3442 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3443 			ncontrollen = so_cmsglen(mp, opt, optlen,
3444 			    !(flags & MSG_XPG4_2));
3445 			if (controllen != 0)
3446 				controllen = ncontrollen;
3447 			else if (ncontrollen != 0)
3448 				msg->msg_flags |= MSG_CTRUNC;
3449 		} else {
3450 			controllen = 0;
3451 		}
3452 
3453 		if (namelen != 0) {
3454 			/*
3455 			 * Return address to caller.
3456 			 * Caller handles truncation if length
3457 			 * exceeds msg_namelen.
3458 			 * NOTE: AF_UNIX NUL termination is ensured by
3459 			 * the sender's copyin_name().
3460 			 */
3461 			abuf = kmem_alloc(addrlen, KM_SLEEP);
3462 
3463 			bcopy(addr, abuf, addrlen);
3464 			msg->msg_name = abuf;
3465 			msg->msg_namelen = addrlen;
3466 		}
3467 
3468 		if (controllen != 0) {
3469 			/*
3470 			 * Return control msg to caller.
3471 			 * Caller handles truncation if length
3472 			 * exceeds msg_controllen.
3473 			 */
3474 			control = kmem_zalloc(controllen, KM_SLEEP);
3475 
3476 			error = so_opt2cmsg(mp, opt, optlen,
3477 			    !(flags & MSG_XPG4_2),
3478 			    control, controllen);
3479 			if (error) {
3480 				freemsg(mp);
3481 				if (msg->msg_namelen != 0)
3482 					kmem_free(msg->msg_name,
3483 					    msg->msg_namelen);
3484 				kmem_free(control, controllen);
3485 				eprintsoline(so, error);
3486 				goto out;
3487 			}
3488 			msg->msg_control = control;
3489 			msg->msg_controllen = controllen;
3490 		}
3491 
3492 		freemsg(mp);
3493 		goto out;
3494 	}
3495 	case T_OPTDATA_IND: {
3496 		struct T_optdata_req *tdr;
3497 		void *opt;
3498 		t_uscalar_t optlen;
3499 
3500 		if ((so->so_state &
3501 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3502 		    (uiop->uio_resid != saved_resid) &&
3503 		    !(flags & MSG_PEEK)) {
3504 			sorecv_update_oobstate(so);
3505 		}
3506 
3507 		tdr = (struct T_optdata_req *)mp->b_rptr;
3508 		optlen = tdr->OPT_length;
3509 		if (optlen != 0) {
3510 			t_uscalar_t ncontrollen;
3511 			/*
3512 			 * Determine how large cmsg buffer is needed.
3513 			 */
3514 			opt = sogetoff(mp,
3515 			    tpr->optdata_ind.OPT_offset,
3516 			    optlen, __TPI_ALIGN_SIZE);
3517 
3518 			if (opt == NULL) {
3519 				freemsg(mp);
3520 				error = EPROTO;
3521 				eprintsoline(so, error);
3522 				goto out;
3523 			}
3524 
3525 			ncontrollen = so_cmsglen(mp, opt, optlen,
3526 			    !(flags & MSG_XPG4_2));
3527 			if (controllen != 0)
3528 				controllen = ncontrollen;
3529 			else if (ncontrollen != 0)
3530 				msg->msg_flags |= MSG_CTRUNC;
3531 		} else {
3532 			controllen = 0;
3533 		}
3534 
3535 		if (controllen != 0) {
3536 			/*
3537 			 * Return control msg to caller.
3538 			 * Caller handles truncation if length
3539 			 * exceeds msg_controllen.
3540 			 */
3541 			control = kmem_zalloc(controllen, KM_SLEEP);
3542 
3543 			error = so_opt2cmsg(mp, opt, optlen,
3544 			    !(flags & MSG_XPG4_2),
3545 			    control, controllen);
3546 			if (error) {
3547 				freemsg(mp);
3548 				kmem_free(control, controllen);
3549 				eprintsoline(so, error);
3550 				goto out;
3551 			}
3552 			msg->msg_control = control;
3553 			msg->msg_controllen = controllen;
3554 		}
3555 
3556 		/*
3557 		 * Set msg_flags to MSG_EOR based on
3558 		 * DATA_flag and MOREDATA.
3559 		 */
3560 		mutex_enter(&so->so_lock);
3561 		so->so_state &= ~SS_SAVEDEOR;
3562 		if (!(tpr->data_ind.MORE_flag & 1)) {
3563 			if (!(rval.r_val1 & MOREDATA))
3564 				msg->msg_flags |= MSG_EOR;
3565 			else
3566 				so->so_state |= SS_SAVEDEOR;
3567 		}
3568 		freemsg(mp);
3569 		/*
3570 		 * If some data was received (i.e. not EOF) and the
3571 		 * read/recv* has not been satisfied wait for some more.
3572 		 * Not possible to wait if control info was received.
3573 		 */
3574 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3575 		    controllen == 0 &&
3576 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3577 			mutex_exit(&so->so_lock);
3578 			pflag = opflag | MSG_NOMARK;
3579 			goto retry;
3580 		}
3581 		goto out_locked;
3582 	}
3583 	case T_EXDATA_IND: {
3584 		dprintso(so, 1,
3585 		    ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3586 		    "state %s\n",
3587 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3588 		    saved_resid - uiop->uio_resid,
3589 		    pr_state(so->so_state, so->so_mode)));
3590 		/*
3591 		 * kstrgetmsg handles MSGMARK so there is nothing to
3592 		 * inspect in the T_EXDATA_IND.
3593 		 * strsock_proto makes the stream head queue the T_EXDATA_IND
3594 		 * as a separate message with no M_DATA component. Furthermore,
3595 		 * the stream head does not consolidate M_DATA messages onto
3596 		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3597 		 * remains a message by itself. This is needed since MSGMARK
3598 		 * marks both the whole message as well as the last byte
3599 		 * of the message.
3600 		 */
3601 		freemsg(mp);
3602 		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
3603 		if (flags & MSG_PEEK) {
3604 			/*
3605 			 * Even though we are peeking we consume the
3606 			 * T_EXDATA_IND thereby moving the mark information
3607 			 * to SS_RCVATMARK. Then the oob code below will
3608 			 * retry the peeking kstrgetmsg.
3609 			 * Note that the stream head read queue is
3610 			 * never flushed without holding SOREADLOCKED
3611 			 * thus the T_EXDATA_IND can not disappear
3612 			 * underneath us.
3613 			 */
3614 			dprintso(so, 1,
3615 			    ("sotpi_recvmsg: consume EXDATA_IND "
3616 			    "counts %d/%d state %s\n",
3617 			    sti->sti_oobsigcnt,
3618 			    sti->sti_oobcnt,
3619 			    pr_state(so->so_state, so->so_mode)));
3620 
3621 			pflag = MSG_ANY | MSG_DELAYERROR;
3622 			if (so->so_mode & SM_ATOMIC)
3623 				pflag |= MSG_DISCARDTAIL;
3624 
3625 			pri = 0;
3626 			mp = NULL;
3627 
3628 			error = kstrgetmsg(SOTOV(so), &mp, uiop,
3629 			    &pri, &pflag, (clock_t)-1, &rval);
3630 			ASSERT(uiop->uio_resid == saved_resid);
3631 
3632 			if (error) {
3633 #ifdef SOCK_DEBUG
3634 				if (error != EWOULDBLOCK && error != EINTR) {
3635 					eprintsoline(so, error);
3636 				}
3637 #endif /* SOCK_DEBUG */
3638 				goto out;
3639 			}
3640 			ASSERT(mp);
3641 			tpr = (union T_primitives *)mp->b_rptr;
3642 			ASSERT(tpr->type == T_EXDATA_IND);
3643 			freemsg(mp);
3644 		} /* end "if (flags & MSG_PEEK)" */
3645 
3646 		/*
3647 		 * Decrement the number of queued and pending oob.
3648 		 *
3649 		 * SS_RCVATMARK is cleared when we read past a mark.
3650 		 * SS_HAVEOOBDATA is cleared when we've read past the
3651 		 * last mark.
3652 		 * SS_OOBPEND is cleared if we've read past the last
3653 		 * mark and no (new) SIGURG has been posted.
3654 		 */
3655 		mutex_enter(&so->so_lock);
3656 		ASSERT(so_verify_oobstate(so));
3657 		ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3658 		ASSERT(sti->sti_oobsigcnt > 0);
3659 		sti->sti_oobsigcnt--;
3660 		ASSERT(sti->sti_oobcnt > 0);
3661 		sti->sti_oobcnt--;
3662 		/*
3663 		 * Since the T_EXDATA_IND has been removed from the stream
3664 		 * head, but we have not read data past the mark,
3665 		 * sockfs needs to track that the socket is still at the mark.
3666 		 *
3667 		 * Since no data was received call kstrgetmsg again to wait
3668 		 * for data.
3669 		 */
3670 		so->so_state |= SS_RCVATMARK;
3671 		mutex_exit(&so->so_lock);
3672 		dprintso(so, 1,
3673 		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3674 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3675 		    pr_state(so->so_state, so->so_mode)));
3676 		pflag = opflag;
3677 		goto retry;
3678 	}
3679 	default:
3680 		cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3681 		    (void *)so, tpr->type, (void *)mp);
3682 		ASSERT(0);
3683 		freemsg(mp);
3684 		error = EPROTO;
3685 		eprintsoline(so, error);
3686 		goto out;
3687 	}
3688 	/* NOTREACHED */
3689 out:
3690 	mutex_enter(&so->so_lock);
3691 out_locked:
3692 	so_unlock_read(so);	/* Clear SOREADLOCKED */
3693 	mutex_exit(&so->so_lock);
3694 	return (error);
3695 }
3696 
3697 /*
3698  * Sending data with options on a datagram socket.
3699  * Assumes caller has verified that SS_ISBOUND etc. are set.
3700  */
3701 static int
3702 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3703     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3704 {
3705 	struct T_unitdata_req	tudr;
3706 	mblk_t			*mp;
3707 	int			error;
3708 	void			*addr;
3709 	socklen_t		addrlen;
3710 	void			*src;
3711 	socklen_t		srclen;
3712 	ssize_t			len;
3713 	int			size;
3714 	struct T_opthdr		toh;
3715 	struct fdbuf		*fdbuf;
3716 	t_uscalar_t		optlen;
3717 	void			*fds;
3718 	int			fdlen;
3719 	sotpi_info_t		*sti = SOTOTPI(so);
3720 
3721 	ASSERT(name && namelen);
3722 	ASSERT(control && controllen);
3723 
3724 	len = uiop->uio_resid;
3725 	if (len > (ssize_t)sti->sti_tidu_size) {
3726 		return (EMSGSIZE);
3727 	}
3728 
3729 	/*
3730 	 * For AF_UNIX the destination address is translated to an internal
3731 	 * name and the source address is passed as an option.
3732 	 * Also, file descriptors are passed as file pointers in an
3733 	 * option.
3734 	 */
3735 
3736 	/*
3737 	 * Length and family checks.
3738 	 */
3739 	error = so_addr_verify(so, name, namelen);
3740 	if (error) {
3741 		eprintsoline(so, error);
3742 		return (error);
3743 	}
3744 	if (so->so_family == AF_UNIX) {
3745 		if (sti->sti_faddr_noxlate) {
3746 			/*
3747 			 * Already have a transport internal address. Do not
3748 			 * pass any (transport internal) source address.
3749 			 */
3750 			addr = name;
3751 			addrlen = namelen;
3752 			src = NULL;
3753 			srclen = 0;
3754 		} else {
3755 			/*
3756 			 * Pass the sockaddr_un source address as an option
3757 			 * and translate the remote address.
3758 			 *
3759 			 * Note that this code does not prevent sti_laddr_sa
3760 			 * from changing while it is being used. Thus
3761 			 * if an unbind+bind occurs concurrently with this
3762 			 * send the peer might see a partially new and a
3763 			 * partially old "from" address.
3764 			 */
3765 			src = sti->sti_laddr_sa;
3766 			srclen = (t_uscalar_t)sti->sti_laddr_len;
3767 			dprintso(so, 1,
3768 			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3769 			    srclen, src));
3770 			error = so_ux_addr_xlate(so, name, namelen,
3771 			    (flags & MSG_XPG4_2),
3772 			    &addr, &addrlen);
3773 			if (error) {
3774 				eprintsoline(so, error);
3775 				return (error);
3776 			}
3777 		}
3778 	} else {
3779 		addr = name;
3780 		addrlen = namelen;
3781 		src = NULL;
3782 		srclen = 0;
3783 	}
3784 	optlen = so_optlen(control, controllen,
3785 	    !(flags & MSG_XPG4_2));
3786 	tudr.PRIM_type = T_UNITDATA_REQ;
3787 	tudr.DEST_length = addrlen;
3788 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3789 	if (srclen != 0)
3790 		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3791 		    _TPI_ALIGN_TOPT(srclen));
3792 	else
3793 		tudr.OPT_length = optlen;
3794 	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3795 	    _TPI_ALIGN_TOPT(addrlen));
3796 
3797 	size = tudr.OPT_offset + tudr.OPT_length;
3798 
3799 	/*
3800 	 * File descriptors only when SM_FDPASSING set.
3801 	 */
3802 	error = so_getfdopt(control, controllen,
3803 	    !(flags & MSG_XPG4_2), &fds, &fdlen);
3804 	if (error)
3805 		return (error);
3806 	if (fdlen != -1) {
3807 		if (!(so->so_mode & SM_FDPASSING))
3808 			return (EOPNOTSUPP);
3809 
3810 		error = fdbuf_create(fds, fdlen, &fdbuf);
3811 		if (error)
3812 			return (error);
3813 		mp = fdbuf_allocmsg(size, fdbuf);
3814 	} else {
3815 		mp = soallocproto(size, _ALLOC_INTR, CRED());
3816 		if (mp == NULL) {
3817 			/*
3818 			 * Caught a signal waiting for memory.
3819 			 * Let send* return EINTR.
3820 			 */
3821 			return (EINTR);
3822 		}
3823 	}
3824 	soappendmsg(mp, &tudr, sizeof (tudr));
3825 	soappendmsg(mp, addr, addrlen);
3826 	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3827 
3828 	if (fdlen != -1) {
3829 		ASSERT(fdbuf != NULL);
3830 		toh.level = SOL_SOCKET;
3831 		toh.name = SO_FILEP;
3832 		toh.len = fdbuf->fd_size +
3833 		    (t_uscalar_t)sizeof (struct T_opthdr);
3834 		toh.status = 0;
3835 		soappendmsg(mp, &toh, sizeof (toh));
3836 		soappendmsg(mp, fdbuf, fdbuf->fd_size);
3837 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3838 	}
3839 	if (srclen != 0) {
3840 		/*
3841 		 * There is a AF_UNIX sockaddr_un to include as a source
3842 		 * address option.
3843 		 */
3844 		toh.level = SOL_SOCKET;
3845 		toh.name = SO_SRCADDR;
3846 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3847 		toh.status = 0;
3848 		soappendmsg(mp, &toh, sizeof (toh));
3849 		soappendmsg(mp, src, srclen);
3850 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3851 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3852 	}
3853 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3854 	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3855 	/* At most 3 bytes left in the message */
3856 	ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3857 	ASSERT(MBLKL(mp) <= (ssize_t)size);
3858 
3859 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3860 	if (audit_active)
3861 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3862 
3863 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3864 #ifdef SOCK_DEBUG
3865 	if (error) {
3866 		eprintsoline(so, error);
3867 	}
3868 #endif /* SOCK_DEBUG */
3869 	return (error);
3870 }
3871 
3872 /*
3873  * Sending data with options on a connected stream socket.
3874  * Assumes caller has verified that SS_ISCONNECTED is set.
3875  */
3876 static int
3877 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3878     t_uscalar_t controllen, int flags)
3879 {
3880 	struct T_optdata_req	tdr;
3881 	mblk_t			*mp;
3882 	int			error;
3883 	ssize_t			iosize;
3884 	int			size;
3885 	struct fdbuf		*fdbuf;
3886 	t_uscalar_t		optlen;
3887 	void			*fds;
3888 	int			fdlen;
3889 	struct T_opthdr		toh;
3890 	sotpi_info_t		*sti = SOTOTPI(so);
3891 
3892 	dprintso(so, 1,
3893 	    ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3894 
3895 	/*
3896 	 * Has to be bound and connected. However, since no locks are
3897 	 * held the state could have changed after sotpi_sendmsg checked it
3898 	 * thus it is not possible to ASSERT on the state.
3899 	 */
3900 
3901 	/* Options on connection-oriented only when SM_OPTDATA set. */
3902 	if (!(so->so_mode & SM_OPTDATA))
3903 		return (EOPNOTSUPP);
3904 
3905 	do {
3906 		/*
3907 		 * Set the MORE flag if uio_resid does not fit in this
3908 		 * message or if the caller passed in "more".
3909 		 * Error for transports with zero tidu_size.
3910 		 */
3911 		tdr.PRIM_type = T_OPTDATA_REQ;
3912 		iosize = sti->sti_tidu_size;
3913 		if (iosize <= 0)
3914 			return (EMSGSIZE);
3915 		if (uiop->uio_resid > iosize) {
3916 			tdr.DATA_flag = 1;
3917 		} else {
3918 			if (more)
3919 				tdr.DATA_flag = 1;
3920 			else
3921 				tdr.DATA_flag = 0;
3922 			iosize = uiop->uio_resid;
3923 		}
3924 		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3925 		    tdr.DATA_flag, iosize));
3926 
3927 		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3928 		tdr.OPT_length = optlen;
3929 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3930 
3931 		size = (int)sizeof (tdr) + optlen;
3932 		/*
3933 		 * File descriptors only when SM_FDPASSING set.
3934 		 */
3935 		error = so_getfdopt(control, controllen,
3936 		    !(flags & MSG_XPG4_2), &fds, &fdlen);
3937 		if (error)
3938 			return (error);
3939 		if (fdlen != -1) {
3940 			if (!(so->so_mode & SM_FDPASSING))
3941 				return (EOPNOTSUPP);
3942 
3943 			error = fdbuf_create(fds, fdlen, &fdbuf);
3944 			if (error)
3945 				return (error);
3946 			mp = fdbuf_allocmsg(size, fdbuf);
3947 		} else {
3948 			mp = soallocproto(size, _ALLOC_INTR, CRED());
3949 			if (mp == NULL) {
3950 				/*
3951 				 * Caught a signal waiting for memory.
3952 				 * Let send* return EINTR.
3953 				 */
3954 				return (EINTR);
3955 			}
3956 		}
3957 		soappendmsg(mp, &tdr, sizeof (tdr));
3958 
3959 		if (fdlen != -1) {
3960 			ASSERT(fdbuf != NULL);
3961 			toh.level = SOL_SOCKET;
3962 			toh.name = SO_FILEP;
3963 			toh.len = fdbuf->fd_size +
3964 			    (t_uscalar_t)sizeof (struct T_opthdr);
3965 			toh.status = 0;
3966 			soappendmsg(mp, &toh, sizeof (toh));
3967 			soappendmsg(mp, fdbuf, fdbuf->fd_size);
3968 			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3969 		}
3970 		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3971 		/* At most 3 bytes left in the message */
3972 		ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3973 		ASSERT(MBLKL(mp) <= (ssize_t)size);
3974 
3975 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3976 
3977 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3978 		    0, MSG_BAND, 0);
3979 		if (error) {
3980 			eprintsoline(so, error);
3981 			return (error);
3982 		}
3983 		control = NULL;
3984 		if (uiop->uio_resid > 0) {
3985 			/*
3986 			 * Recheck for fatal errors. Fail write even though
3987 			 * some data have been written. This is consistent
3988 			 * with strwrite semantics and BSD sockets semantics.
3989 			 */
3990 			if (so->so_state & SS_CANTSENDMORE) {
3991 				eprintsoline(so, error);
3992 				return (EPIPE);
3993 			}
3994 			if (so->so_error != 0) {
3995 				mutex_enter(&so->so_lock);
3996 				error = sogeterr(so, B_TRUE);
3997 				mutex_exit(&so->so_lock);
3998 				if (error != 0) {
3999 					eprintsoline(so, error);
4000 					return (error);
4001 				}
4002 			}
4003 		}
4004 	} while (uiop->uio_resid > 0);
4005 	return (0);
4006 }
4007 
4008 /*
4009  * Sending data on a datagram socket.
4010  * Assumes caller has verified that SS_ISBOUND etc. are set.
4011  *
4012  * For AF_UNIX the destination address is translated to an internal
4013  * name and the source address is passed as an option.
4014  */
4015 int
4016 sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
4017     struct uio *uiop, int flags)
4018 {
4019 	struct T_unitdata_req	tudr;
4020 	mblk_t			*mp;
4021 	int			error;
4022 	void			*addr;
4023 	socklen_t		addrlen;
4024 	void			*src;
4025 	socklen_t		srclen;
4026 	ssize_t			len;
4027 	sotpi_info_t		*sti = SOTOTPI(so);
4028 
4029 	ASSERT(name != NULL && namelen != 0);
4030 
4031 	len = uiop->uio_resid;
4032 	if (len > sti->sti_tidu_size) {
4033 		error = EMSGSIZE;
4034 		goto done;
4035 	}
4036 
4037 	/* Length and family checks */
4038 	error = so_addr_verify(so, name, namelen);
4039 	if (error != 0)
4040 		goto done;
4041 
4042 	if (sti->sti_direct)
4043 		return (sodgram_direct(so, name, namelen, uiop, flags));
4044 
4045 	if (so->so_family == AF_UNIX) {
4046 		if (sti->sti_faddr_noxlate) {
4047 			/*
4048 			 * Already have a transport internal address. Do not
4049 			 * pass any (transport internal) source address.
4050 			 */
4051 			addr = name;
4052 			addrlen = namelen;
4053 			src = NULL;
4054 			srclen = 0;
4055 		} else {
4056 			/*
4057 			 * Pass the sockaddr_un source address as an option
4058 			 * and translate the remote address.
4059 			 *
4060 			 * Note that this code does not prevent sti_laddr_sa
4061 			 * from changing while it is being used. Thus
4062 			 * if an unbind+bind occurs concurrently with this
4063 			 * send the peer might see a partially new and a
4064 			 * partially old "from" address.
4065 			 */
4066 			src = sti->sti_laddr_sa;
4067 			srclen = (socklen_t)sti->sti_laddr_len;
4068 			dprintso(so, 1,
4069 			    ("sosend_dgram UNIX: srclen %d, src %p\n",
4070 			    srclen, src));
4071 			error = so_ux_addr_xlate(so, name, namelen,
4072 			    (flags & MSG_XPG4_2),
4073 			    &addr, &addrlen);
4074 			if (error) {
4075 				eprintsoline(so, error);
4076 				goto done;
4077 			}
4078 		}
4079 	} else {
4080 		addr = name;
4081 		addrlen = namelen;
4082 		src = NULL;
4083 		srclen = 0;
4084 	}
4085 	tudr.PRIM_type = T_UNITDATA_REQ;
4086 	tudr.DEST_length = addrlen;
4087 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4088 	if (srclen == 0) {
4089 		tudr.OPT_length = 0;
4090 		tudr.OPT_offset = 0;
4091 
4092 		mp = soallocproto2(&tudr, sizeof (tudr),
4093 		    addr, addrlen, 0, _ALLOC_INTR, CRED());
4094 		if (mp == NULL) {
4095 			/*
4096 			 * Caught a signal waiting for memory.
4097 			 * Let send* return EINTR.
4098 			 */
4099 			error = EINTR;
4100 			goto done;
4101 		}
4102 	} else {
4103 		/*
4104 		 * There is a AF_UNIX sockaddr_un to include as a source
4105 		 * address option.
4106 		 */
4107 		struct T_opthdr toh;
4108 		ssize_t size;
4109 
4110 		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
4111 		    _TPI_ALIGN_TOPT(srclen));
4112 		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
4113 		    _TPI_ALIGN_TOPT(addrlen));
4114 
4115 		toh.level = SOL_SOCKET;
4116 		toh.name = SO_SRCADDR;
4117 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
4118 		toh.status = 0;
4119 
4120 		size = tudr.OPT_offset + tudr.OPT_length;
4121 		mp = soallocproto2(&tudr, sizeof (tudr),
4122 		    addr, addrlen, size, _ALLOC_INTR, CRED());
4123 		if (mp == NULL) {
4124 			/*
4125 			 * Caught a signal waiting for memory.
4126 			 * Let send* return EINTR.
4127 			 */
4128 			error = EINTR;
4129 			goto done;
4130 		}
4131 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
4132 		soappendmsg(mp, &toh, sizeof (toh));
4133 		soappendmsg(mp, src, srclen);
4134 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
4135 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
4136 	}
4137 
4138 	if (audit_active)
4139 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4140 
4141 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4142 done:
4143 #ifdef SOCK_DEBUG
4144 	if (error) {
4145 		eprintsoline(so, error);
4146 	}
4147 #endif /* SOCK_DEBUG */
4148 	return (error);
4149 }
4150 
4151 /*
4152  * Sending data on a connected stream socket.
4153  * Assumes caller has verified that SS_ISCONNECTED is set.
4154  */
4155 int
4156 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
4157     int sflag)
4158 {
4159 	struct T_data_req	tdr;
4160 	mblk_t			*mp;
4161 	int			error;
4162 	ssize_t			iosize;
4163 	sotpi_info_t		*sti = SOTOTPI(so);
4164 
4165 	dprintso(so, 1,
4166 	    ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
4167 	    (void *)so, uiop->uio_resid, prim, sflag));
4168 
4169 	/*
4170 	 * Has to be bound and connected. However, since no locks are
4171 	 * held the state could have changed after sotpi_sendmsg checked it
4172 	 * thus it is not possible to ASSERT on the state.
4173 	 */
4174 
4175 	do {
4176 		/*
4177 		 * Set the MORE flag if uio_resid does not fit in this
4178 		 * message or if the caller passed in "more".
4179 		 * Error for transports with zero tidu_size.
4180 		 */
4181 		tdr.PRIM_type = prim;
4182 		iosize = sti->sti_tidu_size;
4183 		if (iosize <= 0)
4184 			return (EMSGSIZE);
4185 		if (uiop->uio_resid > iosize) {
4186 			tdr.MORE_flag = 1;
4187 		} else {
4188 			if (more)
4189 				tdr.MORE_flag = 1;
4190 			else
4191 				tdr.MORE_flag = 0;
4192 			iosize = uiop->uio_resid;
4193 		}
4194 		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4195 		    prim, tdr.MORE_flag, iosize));
4196 		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
4197 		if (mp == NULL) {
4198 			/*
4199 			 * Caught a signal waiting for memory.
4200 			 * Let send* return EINTR.
4201 			 */
4202 			return (EINTR);
4203 		}
4204 
4205 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4206 		    0, sflag | MSG_BAND, 0);
4207 		if (error) {
4208 			eprintsoline(so, error);
4209 			return (error);
4210 		}
4211 		if (uiop->uio_resid > 0) {
4212 			/*
4213 			 * Recheck for fatal errors. Fail write even though
4214 			 * some data have been written. This is consistent
4215 			 * with strwrite semantics and BSD sockets semantics.
4216 			 */
4217 			if (so->so_state & SS_CANTSENDMORE) {
4218 				eprintsoline(so, error);
4219 				return (EPIPE);
4220 			}
4221 			if (so->so_error != 0) {
4222 				mutex_enter(&so->so_lock);
4223 				error = sogeterr(so, B_TRUE);
4224 				mutex_exit(&so->so_lock);
4225 				if (error != 0) {
4226 					eprintsoline(so, error);
4227 					return (error);
4228 				}
4229 			}
4230 		}
4231 	} while (uiop->uio_resid > 0);
4232 	return (0);
4233 }
4234 
4235 /*
4236  * Check the state for errors and call the appropriate send function.
4237  *
4238  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4239  * this function issues a setsockopt to toggle SO_DONTROUTE before and
4240  * after sending the message.
4241  */
4242 static int
4243 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
4244     struct cred *cr)
4245 {
4246 	int		so_state;
4247 	int		so_mode;
4248 	int		error;
4249 	struct sockaddr *name;
4250 	t_uscalar_t	namelen;
4251 	int		dontroute;
4252 	int		flags;
4253 	sotpi_info_t	*sti = SOTOTPI(so);
4254 
4255 	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4256 	    (void *)so, (void *)msg, msg->msg_flags,
4257 	    pr_state(so->so_state, so->so_mode), so->so_error));
4258 
4259 	if (so->so_version == SOV_STREAM) {
4260 		/* The imaginary "sockmod" has been popped - act as a stream */
4261 		so_update_attrs(so, SOMOD);
4262 		return (strwrite(SOTOV(so), uiop, cr));
4263 	}
4264 
4265 	mutex_enter(&so->so_lock);
4266 	so_state = so->so_state;
4267 
4268 	if (so_state & SS_CANTSENDMORE) {
4269 		mutex_exit(&so->so_lock);
4270 		return (EPIPE);
4271 	}
4272 
4273 	if (so->so_error != 0) {
4274 		error = sogeterr(so, B_TRUE);
4275 		if (error != 0) {
4276 			mutex_exit(&so->so_lock);
4277 			return (error);
4278 		}
4279 	}
4280 
4281 	name = (struct sockaddr *)msg->msg_name;
4282 	namelen = msg->msg_namelen;
4283 
4284 	so_mode = so->so_mode;
4285 
4286 	if (name == NULL) {
4287 		if (!(so_state & SS_ISCONNECTED)) {
4288 			mutex_exit(&so->so_lock);
4289 			if (so_mode & SM_CONNREQUIRED)
4290 				return (ENOTCONN);
4291 			else
4292 				return (EDESTADDRREQ);
4293 		}
4294 		if (so_mode & SM_CONNREQUIRED) {
4295 			name = NULL;
4296 			namelen = 0;
4297 		} else {
4298 			/*
4299 			 * Note that this code does not prevent sti_faddr_sa
4300 			 * from changing while it is being used. Thus
4301 			 * if an "unconnect"+connect occurs concurrently with
4302 			 * this send the datagram might be delivered to a
4303 			 * garbaled address.
4304 			 */
4305 			ASSERT(sti->sti_faddr_sa);
4306 			name = sti->sti_faddr_sa;
4307 			namelen = (t_uscalar_t)sti->sti_faddr_len;
4308 		}
4309 	} else {
4310 		if (!(so_state & SS_ISCONNECTED) &&
4311 		    (so_mode & SM_CONNREQUIRED)) {
4312 			/* Required but not connected */
4313 			mutex_exit(&so->so_lock);
4314 			return (ENOTCONN);
4315 		}
4316 		/*
4317 		 * Ignore the address on connection-oriented sockets.
4318 		 * Just like BSD this code does not generate an error for
4319 		 * TCP (a CONNREQUIRED socket) when sending to an address
4320 		 * passed in with sendto/sendmsg. Instead the data is
4321 		 * delivered on the connection as if no address had been
4322 		 * supplied.
4323 		 */
4324 		if ((so_state & SS_ISCONNECTED) &&
4325 		    !(so_mode & SM_CONNREQUIRED)) {
4326 			mutex_exit(&so->so_lock);
4327 			return (EISCONN);
4328 		}
4329 		if (!(so_state & SS_ISBOUND)) {
4330 			so_lock_single(so);	/* Set SOLOCKED */
4331 			error = sotpi_bind(so, NULL, 0,
4332 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4333 			so_unlock_single(so, SOLOCKED);
4334 			if (error) {
4335 				mutex_exit(&so->so_lock);
4336 				eprintsoline(so, error);
4337 				return (error);
4338 			}
4339 		}
4340 		/*
4341 		 * Handle delayed datagram errors. These are only queued
4342 		 * when the application sets SO_DGRAM_ERRIND.
4343 		 * Return the error if we are sending to the address
4344 		 * that was returned in the last T_UDERROR_IND.
4345 		 * If sending to some other address discard the delayed
4346 		 * error indication.
4347 		 */
4348 		if (sti->sti_delayed_error) {
4349 			struct T_uderror_ind	*tudi;
4350 			void			*addr;
4351 			t_uscalar_t		addrlen;
4352 			boolean_t		match = B_FALSE;
4353 
4354 			ASSERT(sti->sti_eaddr_mp);
4355 			error = sti->sti_delayed_error;
4356 			sti->sti_delayed_error = 0;
4357 			tudi =
4358 			    (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4359 			addrlen = tudi->DEST_length;
4360 			addr = sogetoff(sti->sti_eaddr_mp,
4361 			    tudi->DEST_offset, addrlen, 1);
4362 			ASSERT(addr);	/* Checked by strsock_proto */
4363 			switch (so->so_family) {
4364 			case AF_INET: {
4365 				/* Compare just IP address and port */
4366 				sin_t *sin1 = (sin_t *)name;
4367 				sin_t *sin2 = (sin_t *)addr;
4368 
4369 				if (addrlen == sizeof (sin_t) &&
4370 				    namelen == addrlen &&
4371 				    sin1->sin_port == sin2->sin_port &&
4372 				    sin1->sin_addr.s_addr ==
4373 				    sin2->sin_addr.s_addr)
4374 					match = B_TRUE;
4375 				break;
4376 			}
4377 			case AF_INET6: {
4378 				/* Compare just IP address and port. Not flow */
4379 				sin6_t *sin1 = (sin6_t *)name;
4380 				sin6_t *sin2 = (sin6_t *)addr;
4381 
4382 				if (addrlen == sizeof (sin6_t) &&
4383 				    namelen == addrlen &&
4384 				    sin1->sin6_port == sin2->sin6_port &&
4385 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4386 				    &sin2->sin6_addr))
4387 					match = B_TRUE;
4388 				break;
4389 			}
4390 			case AF_UNIX:
4391 			default:
4392 				if (namelen == addrlen &&
4393 				    bcmp(name, addr, namelen) == 0)
4394 					match = B_TRUE;
4395 			}
4396 			if (match) {
4397 				freemsg(sti->sti_eaddr_mp);
4398 				sti->sti_eaddr_mp = NULL;
4399 				mutex_exit(&so->so_lock);
4400 #ifdef DEBUG
4401 				dprintso(so, 0,
4402 				    ("sockfs delayed error %d for %s\n",
4403 				    error,
4404 				    pr_addr(so->so_family, name, namelen)));
4405 #endif /* DEBUG */
4406 				return (error);
4407 			}
4408 			freemsg(sti->sti_eaddr_mp);
4409 			sti->sti_eaddr_mp = NULL;
4410 		}
4411 	}
4412 	mutex_exit(&so->so_lock);
4413 
4414 	flags = msg->msg_flags;
4415 	dontroute = 0;
4416 	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4417 		uint32_t	val;
4418 
4419 		val = 1;
4420 		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4421 		    &val, (t_uscalar_t)sizeof (val), cr);
4422 		if (error)
4423 			return (error);
4424 		dontroute = 1;
4425 	}
4426 
4427 	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4428 		error = EOPNOTSUPP;
4429 		goto done;
4430 	}
4431 	if (msg->msg_controllen != 0) {
4432 		if (!(so_mode & SM_CONNREQUIRED)) {
4433 			so_update_attrs(so, SOMOD);
4434 			error = sosend_dgramcmsg(so, name, namelen, uiop,
4435 			    msg->msg_control, msg->msg_controllen, flags);
4436 		} else {
4437 			if (flags & MSG_OOB) {
4438 				/* Can't generate T_EXDATA_REQ with options */
4439 				error = EOPNOTSUPP;
4440 				goto done;
4441 			}
4442 			so_update_attrs(so, SOMOD);
4443 			error = sosend_svccmsg(so, uiop,
4444 			    !(flags & MSG_EOR),
4445 			    msg->msg_control, msg->msg_controllen,
4446 			    flags);
4447 		}
4448 		goto done;
4449 	}
4450 
4451 	so_update_attrs(so, SOMOD);
4452 	if (!(so_mode & SM_CONNREQUIRED)) {
4453 		/*
4454 		 * If there is no SO_DONTROUTE to turn off return immediately
4455 		 * from send_dgram. This can allow tail-call optimizations.
4456 		 */
4457 		if (!dontroute) {
4458 			return (sosend_dgram(so, name, namelen, uiop, flags));
4459 		}
4460 		error = sosend_dgram(so, name, namelen, uiop, flags);
4461 	} else {
4462 		t_scalar_t prim;
4463 		int sflag;
4464 
4465 		/* Ignore msg_name in the connected state */
4466 		if (flags & MSG_OOB) {
4467 			prim = T_EXDATA_REQ;
4468 			/*
4469 			 * Send down T_EXDATA_REQ even if there is flow
4470 			 * control for data.
4471 			 */
4472 			sflag = MSG_IGNFLOW;
4473 		} else {
4474 			if (so_mode & SM_BYTESTREAM) {
4475 				/* Byte stream transport - use write */
4476 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4477 
4478 				/* Send M_DATA messages */
4479 				if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
4480 				    (error = nl7c_data(so, uiop)) >= 0) {
4481 					/* NL7C consumed the data */
4482 					return (error);
4483 				}
4484 				/*
4485 				 * If there is no SO_DONTROUTE to turn off,
4486 				 * sti_direct is on, and there is no flow
4487 				 * control, we can take the fast path.
4488 				 */
4489 				if (!dontroute && sti->sti_direct != 0 &&
4490 				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4491 					return (sostream_direct(so, uiop,
4492 					    NULL, cr));
4493 				}
4494 				error = strwrite(SOTOV(so), uiop, cr);
4495 				goto done;
4496 			}
4497 			prim = T_DATA_REQ;
4498 			sflag = 0;
4499 		}
4500 		/*
4501 		 * If there is no SO_DONTROUTE to turn off return immediately
4502 		 * from sosend_svc. This can allow tail-call optimizations.
4503 		 */
4504 		if (!dontroute)
4505 			return (sosend_svc(so, uiop, prim,
4506 			    !(flags & MSG_EOR), sflag));
4507 		error = sosend_svc(so, uiop, prim,
4508 		    !(flags & MSG_EOR), sflag);
4509 	}
4510 	ASSERT(dontroute);
4511 done:
4512 	if (dontroute) {
4513 		uint32_t	val;
4514 
4515 		val = 0;
4516 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4517 		    &val, (t_uscalar_t)sizeof (val), cr);
4518 	}
4519 	return (error);
4520 }
4521 
4522 /*
4523  * kstrwritemp() has very similar semantics as that of strwrite().
4524  * The main difference is it obtains mblks from the caller and also
4525  * does not do any copy as done in strwrite() from user buffers to
4526  * kernel buffers.
4527  *
4528  * Currently, this routine is used by sendfile to send data allocated
4529  * within the kernel without any copying. This interface does not use the
4530  * synchronous stream interface as synch. stream interface implies
4531  * copying.
4532  */
4533 int
4534 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4535 {
4536 	struct stdata *stp;
4537 	struct queue *wqp;
4538 	mblk_t *newmp;
4539 	char waitflag;
4540 	int tempmode;
4541 	int error = 0;
4542 	int done = 0;
4543 	struct sonode *so;
4544 	boolean_t direct;
4545 
4546 	ASSERT(vp->v_stream);
4547 	stp = vp->v_stream;
4548 
4549 	so = VTOSO(vp);
4550 	direct = _SOTOTPI(so)->sti_direct;
4551 
4552 	/*
4553 	 * This is the sockfs direct fast path. canputnext() need
4554 	 * not be accurate so we don't grab the sd_lock here. If
4555 	 * we get flow-controlled, we grab sd_lock just before the
4556 	 * do..while loop below to emulate what strwrite() does.
4557 	 */
4558 	wqp = stp->sd_wrq;
4559 	if (canputnext(wqp) && direct &&
4560 	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4561 		return (sostream_direct(so, NULL, mp, CRED()));
4562 	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4563 		/* Fast check of flags before acquiring the lock */
4564 		mutex_enter(&stp->sd_lock);
4565 		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4566 		mutex_exit(&stp->sd_lock);
4567 		if (error != 0) {
4568 			if (!(stp->sd_flag & STPLEX) &&
4569 			    (stp->sd_wput_opt & SW_SIGPIPE)) {
4570 				error = EPIPE;
4571 			}
4572 			return (error);
4573 		}
4574 	}
4575 
4576 	waitflag = WRITEWAIT;
4577 	if (stp->sd_flag & OLDNDELAY)
4578 		tempmode = fmode & ~FNDELAY;
4579 	else
4580 		tempmode = fmode;
4581 
4582 	mutex_enter(&stp->sd_lock);
4583 	do {
4584 		if (canputnext(wqp)) {
4585 			mutex_exit(&stp->sd_lock);
4586 			if (stp->sd_wputdatafunc != NULL) {
4587 				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4588 				    NULL, NULL, NULL);
4589 				if (newmp == NULL) {
4590 					/* The caller will free mp */
4591 					return (ECOMM);
4592 				}
4593 				mp = newmp;
4594 			}
4595 			putnext(wqp, mp);
4596 			return (0);
4597 		}
4598 		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4599 		    &done);
4600 	} while (error == 0 && !done);
4601 
4602 	mutex_exit(&stp->sd_lock);
4603 	/*
4604 	 * EAGAIN tells the application to try again. ENOMEM
4605 	 * is returned only if the memory allocation size
4606 	 * exceeds the physical limits of the system. ENOMEM
4607 	 * can't be true here.
4608 	 */
4609 	if (error == ENOMEM)
4610 		error = EAGAIN;
4611 	return (error);
4612 }
4613 
4614 /* ARGSUSED */
4615 static int
4616 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4617     struct cred *cr, mblk_t **mpp)
4618 {
4619 	int error;
4620 
4621 	if (so->so_family != AF_INET && so->so_family != AF_INET6)
4622 		return (EAFNOSUPPORT);
4623 
4624 	if (so->so_state & SS_CANTSENDMORE)
4625 		return (EPIPE);
4626 
4627 	if (so->so_type != SOCK_STREAM)
4628 		return (EOPNOTSUPP);
4629 
4630 	if ((so->so_state & SS_ISCONNECTED) == 0)
4631 		return (ENOTCONN);
4632 
4633 	error = kstrwritemp(so->so_vnode, *mpp, fflag);
4634 	if (error == 0)
4635 		*mpp = NULL;
4636 	return (error);
4637 }
4638 
4639 /*
4640  * Sending data on a datagram socket.
4641  * Assumes caller has verified that SS_ISBOUND etc. are set.
4642  */
4643 /* ARGSUSED */
4644 static int
4645 sodgram_direct(struct sonode *so, struct sockaddr *name,
4646     socklen_t namelen, struct uio *uiop, int flags)
4647 {
4648 	struct T_unitdata_req	tudr;
4649 	mblk_t			*mp = NULL;
4650 	int			error = 0;
4651 	void			*addr;
4652 	socklen_t		addrlen;
4653 	ssize_t			len;
4654 	struct stdata		*stp = SOTOV(so)->v_stream;
4655 	int			so_state;
4656 	queue_t			*udp_wq;
4657 	boolean_t		connected;
4658 	mblk_t			*mpdata = NULL;
4659 	sotpi_info_t		*sti = SOTOTPI(so);
4660 
4661 	ASSERT(name != NULL && namelen != 0);
4662 	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4663 	ASSERT(!(so->so_mode & SM_EXDATA));
4664 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4665 	ASSERT(SOTOV(so)->v_type == VSOCK);
4666 
4667 	/* Caller checked for proper length */
4668 	len = uiop->uio_resid;
4669 	ASSERT(len <= sti->sti_tidu_size);
4670 
4671 	/* Length and family checks have been done by caller */
4672 	ASSERT(name->sa_family == so->so_family);
4673 	ASSERT(so->so_family == AF_INET ||
4674 	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4675 	ASSERT(so->so_family == AF_INET6 ||
4676 	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4677 
4678 	addr = name;
4679 	addrlen = namelen;
4680 
4681 	if (stp->sd_sidp != NULL &&
4682 	    (error = straccess(stp, JCWRITE)) != 0)
4683 		goto done;
4684 
4685 	so_state = so->so_state;
4686 
4687 	connected = so_state & SS_ISCONNECTED;
4688 	if (!connected) {
4689 		tudr.PRIM_type = T_UNITDATA_REQ;
4690 		tudr.DEST_length = addrlen;
4691 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4692 		tudr.OPT_length = 0;
4693 		tudr.OPT_offset = 0;
4694 
4695 		mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4696 		    _ALLOC_INTR, CRED());
4697 		if (mp == NULL) {
4698 			/*
4699 			 * Caught a signal waiting for memory.
4700 			 * Let send* return EINTR.
4701 			 */
4702 			error = EINTR;
4703 			goto done;
4704 		}
4705 	}
4706 
4707 	/*
4708 	 * For UDP we don't break up the copyin into smaller pieces
4709 	 * as in the TCP case.  That means if ENOMEM is returned by
4710 	 * mcopyinuio() then the uio vector has not been modified at
4711 	 * all and we fallback to either strwrite() or kstrputmsg()
4712 	 * below.  Note also that we never generate priority messages
4713 	 * from here.
4714 	 */
4715 	udp_wq = stp->sd_wrq->q_next;
4716 	if (canput(udp_wq) &&
4717 	    (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4718 		ASSERT(DB_TYPE(mpdata) == M_DATA);
4719 		ASSERT(uiop->uio_resid == 0);
4720 		if (!connected)
4721 			linkb(mp, mpdata);
4722 		else
4723 			mp = mpdata;
4724 		if (audit_active)
4725 			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4726 
4727 		udp_wput(udp_wq, mp);
4728 		return (0);
4729 	}
4730 
4731 	ASSERT(mpdata == NULL);
4732 	if (error != 0 && error != ENOMEM) {
4733 		freemsg(mp);
4734 		return (error);
4735 	}
4736 
4737 	/*
4738 	 * For connected, let strwrite() handle the blocking case.
4739 	 * Otherwise we fall thru and use kstrputmsg().
4740 	 */
4741 	if (connected)
4742 		return (strwrite(SOTOV(so), uiop, CRED()));
4743 
4744 	if (audit_active)
4745 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4746 
4747 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4748 done:
4749 #ifdef SOCK_DEBUG
4750 	if (error != 0) {
4751 		eprintsoline(so, error);
4752 	}
4753 #endif /* SOCK_DEBUG */
4754 	return (error);
4755 }
4756 
4757 int
4758 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4759 {
4760 	struct stdata *stp = SOTOV(so)->v_stream;
4761 	ssize_t iosize, rmax, maxblk;
4762 	queue_t *tcp_wq = stp->sd_wrq->q_next;
4763 	mblk_t *newmp;
4764 	int error = 0, wflag = 0;
4765 
4766 	ASSERT(so->so_mode & SM_BYTESTREAM);
4767 	ASSERT(SOTOV(so)->v_type == VSOCK);
4768 
4769 	if (stp->sd_sidp != NULL &&
4770 	    (error = straccess(stp, JCWRITE)) != 0)
4771 		return (error);
4772 
4773 	if (uiop == NULL) {
4774 		/*
4775 		 * kstrwritemp() should have checked sd_flag and
4776 		 * flow-control before coming here.  If we end up
4777 		 * here it means that we can simply pass down the
4778 		 * data to tcp.
4779 		 */
4780 		ASSERT(mp != NULL);
4781 		if (stp->sd_wputdatafunc != NULL) {
4782 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4783 			    NULL, NULL, NULL);
4784 			if (newmp == NULL) {
4785 				/* The caller will free mp */
4786 				return (ECOMM);
4787 			}
4788 			mp = newmp;
4789 		}
4790 		tcp_wput(tcp_wq, mp);
4791 		return (0);
4792 	}
4793 
4794 	/* Fallback to strwrite() to do proper error handling */
4795 	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4796 		return (strwrite(SOTOV(so), uiop, cr));
4797 
4798 	rmax = stp->sd_qn_maxpsz;
4799 	ASSERT(rmax >= 0 || rmax == INFPSZ);
4800 	if (rmax == 0 || uiop->uio_resid <= 0)
4801 		return (0);
4802 
4803 	if (rmax == INFPSZ)
4804 		rmax = uiop->uio_resid;
4805 
4806 	maxblk = stp->sd_maxblk;
4807 
4808 	for (;;) {
4809 		iosize = MIN(uiop->uio_resid, rmax);
4810 
4811 		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4812 		if (mp == NULL) {
4813 			/*
4814 			 * Fallback to strwrite() for ENOMEM; if this
4815 			 * is our first time in this routine and the uio
4816 			 * vector has not been modified, we will end up
4817 			 * calling strwrite() without any flag set.
4818 			 */
4819 			if (error == ENOMEM)
4820 				goto slow_send;
4821 			else
4822 				return (error);
4823 		}
4824 		ASSERT(uiop->uio_resid >= 0);
4825 		/*
4826 		 * If mp is non-NULL and ENOMEM is set, it means that
4827 		 * mcopyinuio() was able to break down some of the user
4828 		 * data into one or more mblks.  Send the partial data
4829 		 * to tcp and let the rest be handled in strwrite().
4830 		 */
4831 		ASSERT(error == 0 || error == ENOMEM);
4832 		if (stp->sd_wputdatafunc != NULL) {
4833 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4834 			    NULL, NULL, NULL);
4835 			if (newmp == NULL) {
4836 				/* The caller will free mp */
4837 				return (ECOMM);
4838 			}
4839 			mp = newmp;
4840 		}
4841 		tcp_wput(tcp_wq, mp);
4842 
4843 		wflag |= NOINTR;
4844 
4845 		if (uiop->uio_resid == 0) {	/* No more data; we're done */
4846 			ASSERT(error == 0);
4847 			break;
4848 		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4849 		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4850 slow_send:
4851 			/*
4852 			 * We were able to send down partial data using
4853 			 * the direct call interface, but are now relying
4854 			 * on strwrite() to handle the non-fastpath cases.
4855 			 * If the socket is blocking we will sleep in
4856 			 * strwaitq() until write is permitted, otherwise,
4857 			 * we will need to return the amount of bytes
4858 			 * written so far back to the app.  This is the
4859 			 * reason why we pass NOINTR flag to strwrite()
4860 			 * for non-blocking socket, because we don't want
4861 			 * to return EAGAIN when portion of the user data
4862 			 * has actually been sent down.
4863 			 */
4864 			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4865 		}
4866 	}
4867 	return (0);
4868 }
4869 
4870 /*
4871  * Update sti_faddr by asking the transport (unless AF_UNIX).
4872  */
4873 /* ARGSUSED */
4874 int
4875 sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4876     boolean_t accept, struct cred *cr)
4877 {
4878 	struct strbuf	strbuf;
4879 	int		error = 0, res;
4880 	void		*addr;
4881 	t_uscalar_t	addrlen;
4882 	k_sigset_t	smask;
4883 	sotpi_info_t	*sti = SOTOTPI(so);
4884 
4885 	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4886 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4887 
4888 	ASSERT(*namelen > 0);
4889 	mutex_enter(&so->so_lock);
4890 	so_lock_single(so);	/* Set SOLOCKED */
4891 
4892 	if (accept) {
4893 		bcopy(sti->sti_faddr_sa, name,
4894 		    MIN(*namelen, sti->sti_faddr_len));
4895 		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4896 		goto done;
4897 	}
4898 
4899 	if (!(so->so_state & SS_ISCONNECTED)) {
4900 		error = ENOTCONN;
4901 		goto done;
4902 	}
4903 	/* Added this check for X/Open */
4904 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4905 		error = EINVAL;
4906 		if (xnet_check_print) {
4907 			printf("sockfs: X/Open getpeername check => EINVAL\n");
4908 		}
4909 		goto done;
4910 	}
4911 
4912 	if (sti->sti_faddr_valid) {
4913 		bcopy(sti->sti_faddr_sa, name,
4914 		    MIN(*namelen, sti->sti_faddr_len));
4915 		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4916 		goto done;
4917 	}
4918 
4919 #ifdef DEBUG
4920 	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4921 	    pr_addr(so->so_family, sti->sti_faddr_sa,
4922 	    (t_uscalar_t)sti->sti_faddr_len)));
4923 #endif /* DEBUG */
4924 
4925 	if (so->so_family == AF_UNIX) {
4926 		/* Transport has different name space - return local info */
4927 		if (sti->sti_faddr_noxlate)
4928 			*namelen = 0;
4929 		error = 0;
4930 		goto done;
4931 	}
4932 
4933 	ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
4934 
4935 	ASSERT(sti->sti_faddr_sa);
4936 	/* Allocate local buffer to use with ioctl */
4937 	addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
4938 	mutex_exit(&so->so_lock);
4939 	addr = kmem_alloc(addrlen, KM_SLEEP);
4940 
4941 	/*
4942 	 * Issue TI_GETPEERNAME with signals masked.
4943 	 * Put the result in sti_faddr_sa so that getpeername works after
4944 	 * a shutdown(output).
4945 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4946 	 * back to the socket.
4947 	 */
4948 	strbuf.buf = addr;
4949 	strbuf.maxlen = addrlen;
4950 	strbuf.len = 0;
4951 
4952 	sigintr(&smask, 0);
4953 	res = 0;
4954 	ASSERT(cr);
4955 	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4956 	    0, K_TO_K, cr, &res);
4957 	sigunintr(&smask);
4958 
4959 	mutex_enter(&so->so_lock);
4960 	/*
4961 	 * If there is an error record the error in so_error put don't fail
4962 	 * the getpeername. Instead fallback on the recorded
4963 	 * sti->sti_faddr_sa.
4964 	 */
4965 	if (error) {
4966 		/*
4967 		 * Various stream head errors can be returned to the ioctl.
4968 		 * However, it is impossible to determine which ones of
4969 		 * these are really socket level errors that were incorrectly
4970 		 * consumed by the ioctl. Thus this code silently ignores the
4971 		 * error - to code explicitly does not reinstate the error
4972 		 * using soseterror().
4973 		 * Experiments have shows that at least this set of
4974 		 * errors are reported and should not be reinstated on the
4975 		 * socket:
4976 		 *	EINVAL	E.g. if an I_LINK was in effect when
4977 		 *		getpeername was called.
4978 		 *	EPIPE	The ioctl error semantics prefer the write
4979 		 *		side error over the read side error.
4980 		 *	ENOTCONN The transport just got disconnected but
4981 		 *		sockfs had not yet seen the T_DISCON_IND
4982 		 *		when issuing the ioctl.
4983 		 */
4984 		error = 0;
4985 	} else if (res == 0 && strbuf.len > 0 &&
4986 	    (so->so_state & SS_ISCONNECTED)) {
4987 		ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
4988 		sti->sti_faddr_len = (socklen_t)strbuf.len;
4989 		bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
4990 		sti->sti_faddr_valid = 1;
4991 
4992 		bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
4993 		*namelen = sti->sti_faddr_len;
4994 	}
4995 	kmem_free(addr, addrlen);
4996 #ifdef DEBUG
4997 	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
4998 	    pr_addr(so->so_family, sti->sti_faddr_sa,
4999 	    (t_uscalar_t)sti->sti_faddr_len)));
5000 #endif /* DEBUG */
5001 done:
5002 	so_unlock_single(so, SOLOCKED);
5003 	mutex_exit(&so->so_lock);
5004 	return (error);
5005 }
5006 
5007 /*
5008  * Update sti_laddr by asking the transport (unless AF_UNIX).
5009  */
5010 int
5011 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
5012     struct cred *cr)
5013 {
5014 	struct strbuf	strbuf;
5015 	int		error = 0, res;
5016 	void		*addr;
5017 	t_uscalar_t	addrlen;
5018 	k_sigset_t	smask;
5019 	sotpi_info_t	*sti = SOTOTPI(so);
5020 
5021 	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
5022 	    (void *)so, pr_state(so->so_state, so->so_mode)));
5023 
5024 	ASSERT(*namelen > 0);
5025 	mutex_enter(&so->so_lock);
5026 	so_lock_single(so);	/* Set SOLOCKED */
5027 
5028 #ifdef DEBUG
5029 
5030 	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
5031 	    pr_addr(so->so_family, sti->sti_laddr_sa,
5032 	    (t_uscalar_t)sti->sti_laddr_len)));
5033 #endif /* DEBUG */
5034 	if (sti->sti_laddr_valid) {
5035 		bcopy(sti->sti_laddr_sa, name,
5036 		    MIN(*namelen, sti->sti_laddr_len));
5037 		*namelen = sti->sti_laddr_len;
5038 		goto done;
5039 	}
5040 
5041 	if (so->so_family == AF_UNIX) {
5042 		/* Transport has different name space - return local info */
5043 		error = 0;
5044 		*namelen = 0;
5045 		goto done;
5046 	}
5047 	if (!(so->so_state & SS_ISBOUND)) {
5048 		/* If not bound, then nothing to return. */
5049 		error = 0;
5050 		goto done;
5051 	}
5052 
5053 	/* Allocate local buffer to use with ioctl */
5054 	addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
5055 	mutex_exit(&so->so_lock);
5056 	addr = kmem_alloc(addrlen, KM_SLEEP);
5057 
5058 	/*
5059 	 * Issue TI_GETMYNAME with signals masked.
5060 	 * Put the result in sti_laddr_sa so that getsockname works after
5061 	 * a shutdown(output).
5062 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
5063 	 * back to the socket.
5064 	 */
5065 	strbuf.buf = addr;
5066 	strbuf.maxlen = addrlen;
5067 	strbuf.len = 0;
5068 
5069 	sigintr(&smask, 0);
5070 	res = 0;
5071 	ASSERT(cr);
5072 	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
5073 	    0, K_TO_K, cr, &res);
5074 	sigunintr(&smask);
5075 
5076 	mutex_enter(&so->so_lock);
5077 	/*
5078 	 * If there is an error record the error in so_error put don't fail
5079 	 * the getsockname. Instead fallback on the recorded
5080 	 * sti->sti_laddr_sa.
5081 	 */
5082 	if (error) {
5083 		/*
5084 		 * Various stream head errors can be returned to the ioctl.
5085 		 * However, it is impossible to determine which ones of
5086 		 * these are really socket level errors that were incorrectly
5087 		 * consumed by the ioctl. Thus this code silently ignores the
5088 		 * error - to code explicitly does not reinstate the error
5089 		 * using soseterror().
5090 		 * Experiments have shows that at least this set of
5091 		 * errors are reported and should not be reinstated on the
5092 		 * socket:
5093 		 *	EINVAL	E.g. if an I_LINK was in effect when
5094 		 *		getsockname was called.
5095 		 *	EPIPE	The ioctl error semantics prefer the write
5096 		 *		side error over the read side error.
5097 		 */
5098 		error = 0;
5099 	} else if (res == 0 && strbuf.len > 0 &&
5100 	    (so->so_state & SS_ISBOUND)) {
5101 		ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
5102 		sti->sti_laddr_len = (socklen_t)strbuf.len;
5103 		bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
5104 		sti->sti_laddr_valid = 1;
5105 
5106 		bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
5107 		*namelen = sti->sti_laddr_len;
5108 	}
5109 	kmem_free(addr, addrlen);
5110 #ifdef DEBUG
5111 	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
5112 	    pr_addr(so->so_family, sti->sti_laddr_sa,
5113 	    (t_uscalar_t)sti->sti_laddr_len)));
5114 #endif /* DEBUG */
5115 done:
5116 	so_unlock_single(so, SOLOCKED);
5117 	mutex_exit(&so->so_lock);
5118 	return (error);
5119 }
5120 
5121 /*
5122  * Get socket options. For SOL_SOCKET options some options are handled
5123  * by the sockfs while others use the value recorded in the sonode as a
5124  * fallback should the T_SVR4_OPTMGMT_REQ fail.
5125  *
5126  * On the return most *optlenp bytes are copied to optval.
5127  */
5128 /* ARGSUSED */
5129 int
5130 sotpi_getsockopt(struct sonode *so, int level, int option_name,
5131 		void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5132 {
5133 	struct T_optmgmt_req	optmgmt_req;
5134 	struct T_optmgmt_ack	*optmgmt_ack;
5135 	struct opthdr		oh;
5136 	struct opthdr		*opt_res;
5137 	mblk_t			*mp = NULL;
5138 	int			error = 0;
5139 	void			*option = NULL;	/* Set if fallback value */
5140 	t_uscalar_t		maxlen = *optlenp;
5141 	t_uscalar_t		len;
5142 	uint32_t		value;
5143 	struct timeval		tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5144 	struct timeval32	tmo_val32;
5145 	struct so_snd_bufinfo	snd_bufinfo;	/* used for zero copy */
5146 
5147 	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5148 	    (void *)so, level, option_name, optval, (void *)optlenp,
5149 	    pr_state(so->so_state, so->so_mode)));
5150 
5151 	mutex_enter(&so->so_lock);
5152 	so_lock_single(so);	/* Set SOLOCKED */
5153 
5154 	/*
5155 	 * Check for SOL_SOCKET options.
5156 	 * Certain SOL_SOCKET options are returned directly whereas
5157 	 * others only provide a default (fallback) value should
5158 	 * the T_SVR4_OPTMGMT_REQ fail.
5159 	 */
5160 	if (level == SOL_SOCKET) {
5161 		/* Check parameters */
5162 		switch (option_name) {
5163 		case SO_TYPE:
5164 		case SO_ERROR:
5165 		case SO_DEBUG:
5166 		case SO_ACCEPTCONN:
5167 		case SO_REUSEADDR:
5168 		case SO_KEEPALIVE:
5169 		case SO_DONTROUTE:
5170 		case SO_BROADCAST:
5171 		case SO_USELOOPBACK:
5172 		case SO_OOBINLINE:
5173 		case SO_SNDBUF:
5174 		case SO_RCVBUF:
5175 #ifdef notyet
5176 		case SO_SNDLOWAT:
5177 		case SO_RCVLOWAT:
5178 #endif /* notyet */
5179 		case SO_DOMAIN:
5180 		case SO_DGRAM_ERRIND:
5181 			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5182 				error = EINVAL;
5183 				eprintsoline(so, error);
5184 				goto done2;
5185 			}
5186 			break;
5187 		case SO_RCVTIMEO:
5188 		case SO_SNDTIMEO:
5189 			if (get_udatamodel() == DATAMODEL_NONE ||
5190 			    get_udatamodel() == DATAMODEL_NATIVE) {
5191 				if (maxlen < sizeof (struct timeval)) {
5192 					error = EINVAL;
5193 					eprintsoline(so, error);
5194 					goto done2;
5195 				}
5196 			} else {
5197 				if (maxlen < sizeof (struct timeval32)) {
5198 					error = EINVAL;
5199 					eprintsoline(so, error);
5200 					goto done2;
5201 				}
5202 
5203 			}
5204 			break;
5205 		case SO_LINGER:
5206 			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5207 				error = EINVAL;
5208 				eprintsoline(so, error);
5209 				goto done2;
5210 			}
5211 			break;
5212 		case SO_SND_BUFINFO:
5213 			if (maxlen < (t_uscalar_t)
5214 			    sizeof (struct so_snd_bufinfo)) {
5215 				error = EINVAL;
5216 				eprintsoline(so, error);
5217 				goto done2;
5218 			}
5219 			break;
5220 		}
5221 
5222 		len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
5223 
5224 		switch (option_name) {
5225 		case SO_TYPE:
5226 			value = so->so_type;
5227 			option = &value;
5228 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5229 
5230 		case SO_ERROR:
5231 			value = sogeterr(so, B_TRUE);
5232 			option = &value;
5233 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5234 
5235 		case SO_ACCEPTCONN:
5236 			if (so->so_state & SS_ACCEPTCONN)
5237 				value = SO_ACCEPTCONN;
5238 			else
5239 				value = 0;
5240 #ifdef DEBUG
5241 			if (value) {
5242 				dprintso(so, 1,
5243 				    ("sotpi_getsockopt: 0x%x is set\n",
5244 				    option_name));
5245 			} else {
5246 				dprintso(so, 1,
5247 				    ("sotpi_getsockopt: 0x%x not set\n",
5248 				    option_name));
5249 			}
5250 #endif /* DEBUG */
5251 			option = &value;
5252 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5253 
5254 		case SO_DEBUG:
5255 		case SO_REUSEADDR:
5256 		case SO_KEEPALIVE:
5257 		case SO_DONTROUTE:
5258 		case SO_BROADCAST:
5259 		case SO_USELOOPBACK:
5260 		case SO_OOBINLINE:
5261 		case SO_DGRAM_ERRIND:
5262 			value = (so->so_options & option_name);
5263 #ifdef DEBUG
5264 			if (value) {
5265 				dprintso(so, 1,
5266 				    ("sotpi_getsockopt: 0x%x is set\n",
5267 				    option_name));
5268 			} else {
5269 				dprintso(so, 1,
5270 				    ("sotpi_getsockopt: 0x%x not set\n",
5271 				    option_name));
5272 			}
5273 #endif /* DEBUG */
5274 			option = &value;
5275 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5276 
5277 		/*
5278 		 * The following options are only returned by sockfs when the
5279 		 * T_SVR4_OPTMGMT_REQ fails.
5280 		 */
5281 		case SO_LINGER:
5282 			option = &so->so_linger;
5283 			len = (t_uscalar_t)sizeof (struct linger);
5284 			break;
5285 		case SO_SNDBUF: {
5286 			ssize_t lvalue;
5287 
5288 			/*
5289 			 * If the option has not been set then get a default
5290 			 * value from the read queue. This value is
5291 			 * returned if the transport fails
5292 			 * the T_SVR4_OPTMGMT_REQ.
5293 			 */
5294 			lvalue = so->so_sndbuf;
5295 			if (lvalue == 0) {
5296 				mutex_exit(&so->so_lock);
5297 				(void) strqget(strvp2wq(SOTOV(so))->q_next,
5298 				    QHIWAT, 0, &lvalue);
5299 				mutex_enter(&so->so_lock);
5300 				dprintso(so, 1,
5301 				    ("got SO_SNDBUF %ld from q\n", lvalue));
5302 			}
5303 			value = (int)lvalue;
5304 			option = &value;
5305 			len = (t_uscalar_t)sizeof (so->so_sndbuf);
5306 			break;
5307 		}
5308 		case SO_RCVBUF: {
5309 			ssize_t lvalue;
5310 
5311 			/*
5312 			 * If the option has not been set then get a default
5313 			 * value from the read queue. This value is
5314 			 * returned if the transport fails
5315 			 * the T_SVR4_OPTMGMT_REQ.
5316 			 *
5317 			 * XXX If SO_RCVBUF has been set and this is an
5318 			 * XPG 4.2 application then do not ask the transport
5319 			 * since the transport might adjust the value and not
5320 			 * return exactly what was set by the application.
5321 			 * For non-XPG 4.2 application we return the value
5322 			 * that the transport is actually using.
5323 			 */
5324 			lvalue = so->so_rcvbuf;
5325 			if (lvalue == 0) {
5326 				mutex_exit(&so->so_lock);
5327 				(void) strqget(RD(strvp2wq(SOTOV(so))),
5328 				    QHIWAT, 0, &lvalue);
5329 				mutex_enter(&so->so_lock);
5330 				dprintso(so, 1,
5331 				    ("got SO_RCVBUF %ld from q\n", lvalue));
5332 			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
5333 				value = (int)lvalue;
5334 				option = &value;
5335 				goto copyout;	/* skip asking transport */
5336 			}
5337 			value = (int)lvalue;
5338 			option = &value;
5339 			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5340 			break;
5341 		}
5342 		case SO_DOMAIN:
5343 			value = so->so_family;
5344 			option = &value;
5345 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5346 
5347 #ifdef notyet
5348 		/*
5349 		 * We do not implement the semantics of these options
5350 		 * thus we shouldn't implement the options either.
5351 		 */
5352 		case SO_SNDLOWAT:
5353 			value = so->so_sndlowat;
5354 			option = &value;
5355 			break;
5356 		case SO_RCVLOWAT:
5357 			value = so->so_rcvlowat;
5358 			option = &value;
5359 			break;
5360 #endif /* notyet */
5361 		case SO_SNDTIMEO:
5362 		case SO_RCVTIMEO: {
5363 			clock_t val;
5364 
5365 			if (option_name == SO_RCVTIMEO)
5366 				val = drv_hztousec(so->so_rcvtimeo);
5367 			else
5368 				val = drv_hztousec(so->so_sndtimeo);
5369 			tmo_val.tv_sec = val / (1000 * 1000);
5370 			tmo_val.tv_usec = val % (1000 * 1000);
5371 			if (get_udatamodel() == DATAMODEL_NONE ||
5372 			    get_udatamodel() == DATAMODEL_NATIVE) {
5373 				option = &tmo_val;
5374 				len = sizeof (struct timeval);
5375 			} else {
5376 				TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5377 				option = &tmo_val32;
5378 				len = sizeof (struct timeval32);
5379 			}
5380 			break;
5381 		}
5382 		case SO_SND_BUFINFO: {
5383 			snd_bufinfo.sbi_wroff =
5384 			    (so->so_proto_props).sopp_wroff;
5385 			snd_bufinfo.sbi_maxblk =
5386 			    (so->so_proto_props).sopp_maxblk;
5387 			snd_bufinfo.sbi_maxpsz =
5388 			    (so->so_proto_props).sopp_maxpsz;
5389 			snd_bufinfo.sbi_tail =
5390 			    (so->so_proto_props).sopp_tail;
5391 			option = &snd_bufinfo;
5392 			len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5393 			break;
5394 		}
5395 		}
5396 	}
5397 
5398 	mutex_exit(&so->so_lock);
5399 
5400 	/* Send request */
5401 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5402 	optmgmt_req.MGMT_flags = T_CHECK;
5403 	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5404 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5405 
5406 	oh.level = level;
5407 	oh.name = option_name;
5408 	oh.len = maxlen;
5409 
5410 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5411 	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5412 	/* Let option management work in the presence of data flow control */
5413 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5414 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5415 	mp = NULL;
5416 	mutex_enter(&so->so_lock);
5417 	if (error) {
5418 		eprintsoline(so, error);
5419 		goto done2;
5420 	}
5421 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5422 	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5423 	if (error) {
5424 		if (option != NULL) {
5425 			/* We have a fallback value */
5426 			error = 0;
5427 			goto copyout;
5428 		}
5429 		eprintsoline(so, error);
5430 		goto done2;
5431 	}
5432 	ASSERT(mp);
5433 	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5434 	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5435 	    optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5436 	if (opt_res == NULL) {
5437 		if (option != NULL) {
5438 			/* We have a fallback value */
5439 			error = 0;
5440 			goto copyout;
5441 		}
5442 		error = EPROTO;
5443 		eprintsoline(so, error);
5444 		goto done;
5445 	}
5446 	option = &opt_res[1];
5447 
5448 	/* check to ensure that the option is within bounds */
5449 	if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5450 	    (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5451 		if (option != NULL) {
5452 			/* We have a fallback value */
5453 			error = 0;
5454 			goto copyout;
5455 		}
5456 		error = EPROTO;
5457 		eprintsoline(so, error);
5458 		goto done;
5459 	}
5460 
5461 	len = opt_res->len;
5462 
5463 copyout: {
5464 		t_uscalar_t size = MIN(len, maxlen);
5465 		bcopy(option, optval, size);
5466 		bcopy(&size, optlenp, sizeof (size));
5467 	}
5468 done:
5469 	freemsg(mp);
5470 done2:
5471 	so_unlock_single(so, SOLOCKED);
5472 	mutex_exit(&so->so_lock);
5473 
5474 	return (error);
5475 }
5476 
5477 /*
5478  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5479  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5480  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5481  * setsockopt has to work even if the transport does not support the option.
5482  */
5483 /* ARGSUSED */
5484 int
5485 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5486 	const void *optval, t_uscalar_t optlen, struct cred *cr)
5487 {
5488 	struct T_optmgmt_req	optmgmt_req;
5489 	struct opthdr		oh;
5490 	mblk_t			*mp;
5491 	int			error = 0;
5492 	boolean_t		handled = B_FALSE;
5493 
5494 	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5495 	    (void *)so, level, option_name, optval, optlen,
5496 	    pr_state(so->so_state, so->so_mode)));
5497 
5498 	/* X/Open requires this check */
5499 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5500 		if (xnet_check_print)
5501 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
5502 		return (EINVAL);
5503 	}
5504 
5505 	mutex_enter(&so->so_lock);
5506 	so_lock_single(so);	/* Set SOLOCKED */
5507 	mutex_exit(&so->so_lock);
5508 
5509 	/*
5510 	 * For SOCKET or TCP level options, try to set it here itself
5511 	 * provided socket has not been popped and we know the tcp
5512 	 * structure (stored in so_priv).
5513 	 */
5514 	if ((level == SOL_SOCKET || level == IPPROTO_TCP) &&
5515 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
5516 	    (so->so_version == SOV_SOCKSTREAM) &&
5517 	    (so->so_proto_handle != NULL)) {
5518 		tcp_t		*tcp = (tcp_t *)so->so_proto_handle;
5519 		boolean_t	onoff;
5520 
5521 #define	intvalue	(*(int32_t *)optval)
5522 
5523 		switch (level) {
5524 		case SOL_SOCKET:
5525 			switch (option_name) {		/* Check length param */
5526 			case SO_DEBUG:
5527 			case SO_REUSEADDR:
5528 			case SO_DONTROUTE:
5529 			case SO_BROADCAST:
5530 			case SO_USELOOPBACK:
5531 			case SO_OOBINLINE:
5532 			case SO_DGRAM_ERRIND:
5533 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5534 					error = EINVAL;
5535 					eprintsoline(so, error);
5536 					mutex_enter(&so->so_lock);
5537 					goto done2;
5538 				}
5539 				ASSERT(optval);
5540 				onoff = intvalue != 0;
5541 				handled = B_TRUE;
5542 				break;
5543 			case SO_SNDTIMEO:
5544 			case SO_RCVTIMEO:
5545 				if (get_udatamodel() == DATAMODEL_NONE ||
5546 				    get_udatamodel() == DATAMODEL_NATIVE) {
5547 					if (optlen !=
5548 					    sizeof (struct timeval)) {
5549 						error = EINVAL;
5550 						eprintsoline(so, error);
5551 						mutex_enter(&so->so_lock);
5552 						goto done2;
5553 					}
5554 				} else {
5555 					if (optlen !=
5556 					    sizeof (struct timeval32)) {
5557 						error = EINVAL;
5558 						eprintsoline(so, error);
5559 						mutex_enter(&so->so_lock);
5560 						goto done2;
5561 					}
5562 				}
5563 				ASSERT(optval);
5564 				handled = B_TRUE;
5565 				break;
5566 			case SO_LINGER:
5567 				if (optlen !=
5568 				    (t_uscalar_t)sizeof (struct linger)) {
5569 					error = EINVAL;
5570 					eprintsoline(so, error);
5571 					mutex_enter(&so->so_lock);
5572 					goto done2;
5573 				}
5574 				ASSERT(optval);
5575 				handled = B_TRUE;
5576 				break;
5577 			}
5578 
5579 			switch (option_name) {			/* Do actions */
5580 			case SO_LINGER: {
5581 				struct linger *lgr = (struct linger *)optval;
5582 
5583 				if (lgr->l_onoff) {
5584 					tcp->tcp_linger = 1;
5585 					tcp->tcp_lingertime = lgr->l_linger;
5586 					so->so_linger.l_onoff = SO_LINGER;
5587 					so->so_options |= SO_LINGER;
5588 				} else {
5589 					tcp->tcp_linger = 0;
5590 					tcp->tcp_lingertime = 0;
5591 					so->so_linger.l_onoff = 0;
5592 					so->so_options &= ~SO_LINGER;
5593 				}
5594 				so->so_linger.l_linger = lgr->l_linger;
5595 				handled = B_TRUE;
5596 				break;
5597 			}
5598 			case SO_SNDTIMEO:
5599 			case SO_RCVTIMEO: {
5600 				struct timeval tl;
5601 				clock_t val;
5602 
5603 				if (get_udatamodel() == DATAMODEL_NONE ||
5604 				    get_udatamodel() == DATAMODEL_NATIVE)
5605 					bcopy(&tl, (struct timeval *)optval,
5606 					    sizeof (struct timeval));
5607 				else
5608 					TIMEVAL32_TO_TIMEVAL(&tl,
5609 					    (struct timeval32 *)optval);
5610 				val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5611 				if (option_name == SO_RCVTIMEO)
5612 					so->so_rcvtimeo = drv_usectohz(val);
5613 				else
5614 					so->so_sndtimeo = drv_usectohz(val);
5615 				break;
5616 			}
5617 
5618 			case SO_DEBUG:
5619 				tcp->tcp_debug = onoff;
5620 #ifdef SOCK_TEST
5621 				if (intvalue & 2)
5622 					sock_test_timelimit = 10 * hz;
5623 				else
5624 					sock_test_timelimit = 0;
5625 
5626 				if (intvalue & 4)
5627 					do_useracc = 0;
5628 				else
5629 					do_useracc = 1;
5630 #endif /* SOCK_TEST */
5631 				break;
5632 			case SO_DONTROUTE:
5633 				/*
5634 				 * SO_DONTROUTE, SO_USELOOPBACK and
5635 				 * SO_BROADCAST are only of interest to IP.
5636 				 * We track them here only so
5637 				 * that we can report their current value.
5638 				 */
5639 				tcp->tcp_dontroute = onoff;
5640 				if (onoff)
5641 					so->so_options |= option_name;
5642 				else
5643 					so->so_options &= ~option_name;
5644 				break;
5645 			case SO_USELOOPBACK:
5646 				tcp->tcp_useloopback = onoff;
5647 				if (onoff)
5648 					so->so_options |= option_name;
5649 				else
5650 					so->so_options &= ~option_name;
5651 				break;
5652 			case SO_BROADCAST:
5653 				tcp->tcp_broadcast = onoff;
5654 				if (onoff)
5655 					so->so_options |= option_name;
5656 				else
5657 					so->so_options &= ~option_name;
5658 				break;
5659 			case SO_REUSEADDR:
5660 				tcp->tcp_reuseaddr = onoff;
5661 				if (onoff)
5662 					so->so_options |= option_name;
5663 				else
5664 					so->so_options &= ~option_name;
5665 				break;
5666 			case SO_OOBINLINE:
5667 				tcp->tcp_oobinline = onoff;
5668 				if (onoff)
5669 					so->so_options |= option_name;
5670 				else
5671 					so->so_options &= ~option_name;
5672 				break;
5673 			case SO_DGRAM_ERRIND:
5674 				tcp->tcp_dgram_errind = onoff;
5675 				if (onoff)
5676 					so->so_options |= option_name;
5677 				else
5678 					so->so_options &= ~option_name;
5679 				break;
5680 			}
5681 			break;
5682 		case IPPROTO_TCP:
5683 			switch (option_name) {
5684 			case TCP_NODELAY:
5685 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5686 					error = EINVAL;
5687 					eprintsoline(so, error);
5688 					mutex_enter(&so->so_lock);
5689 					goto done2;
5690 				}
5691 				ASSERT(optval);
5692 				tcp->tcp_naglim = intvalue ? 1 : tcp->tcp_mss;
5693 				handled = B_TRUE;
5694 				break;
5695 			}
5696 			break;
5697 		default:
5698 			handled = B_FALSE;
5699 			break;
5700 		}
5701 	}
5702 
5703 	if (handled) {
5704 		mutex_enter(&so->so_lock);
5705 		goto done2;
5706 	}
5707 
5708 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5709 	optmgmt_req.MGMT_flags = T_NEGOTIATE;
5710 	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5711 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5712 
5713 	oh.level = level;
5714 	oh.name = option_name;
5715 	oh.len = optlen;
5716 
5717 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5718 	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5719 	/* Let option management work in the presence of data flow control */
5720 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5721 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5722 	mp = NULL;
5723 	mutex_enter(&so->so_lock);
5724 	if (error) {
5725 		eprintsoline(so, error);
5726 		goto done2;
5727 	}
5728 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5729 	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5730 	if (error) {
5731 		eprintsoline(so, error);
5732 		goto done;
5733 	}
5734 	ASSERT(mp);
5735 	/* No need to verify T_optmgmt_ack */
5736 	freemsg(mp);
5737 done:
5738 	/*
5739 	 * Check for SOL_SOCKET options and record their values.
5740 	 * If we know about a SOL_SOCKET parameter and the transport
5741 	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5742 	 * EPROTO) we let the setsockopt succeed.
5743 	 */
5744 	if (level == SOL_SOCKET) {
5745 		/* Check parameters */
5746 		switch (option_name) {
5747 		case SO_DEBUG:
5748 		case SO_REUSEADDR:
5749 		case SO_KEEPALIVE:
5750 		case SO_DONTROUTE:
5751 		case SO_BROADCAST:
5752 		case SO_USELOOPBACK:
5753 		case SO_OOBINLINE:
5754 		case SO_SNDBUF:
5755 		case SO_RCVBUF:
5756 #ifdef notyet
5757 		case SO_SNDLOWAT:
5758 		case SO_RCVLOWAT:
5759 #endif /* notyet */
5760 		case SO_DGRAM_ERRIND:
5761 			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5762 				error = EINVAL;
5763 				eprintsoline(so, error);
5764 				goto done2;
5765 			}
5766 			ASSERT(optval);
5767 			handled = B_TRUE;
5768 			break;
5769 		case SO_SNDTIMEO:
5770 		case SO_RCVTIMEO:
5771 			if (get_udatamodel() == DATAMODEL_NONE ||
5772 			    get_udatamodel() == DATAMODEL_NATIVE) {
5773 				if (optlen != sizeof (struct timeval)) {
5774 					error = EINVAL;
5775 					eprintsoline(so, error);
5776 					goto done2;
5777 				}
5778 			} else {
5779 				if (optlen != sizeof (struct timeval32)) {
5780 					error = EINVAL;
5781 					eprintsoline(so, error);
5782 					goto done2;
5783 				}
5784 			}
5785 			ASSERT(optval);
5786 			handled = B_TRUE;
5787 			break;
5788 		case SO_LINGER:
5789 			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5790 				error = EINVAL;
5791 				eprintsoline(so, error);
5792 				goto done2;
5793 			}
5794 			ASSERT(optval);
5795 			handled = B_TRUE;
5796 			break;
5797 		}
5798 
5799 #define	intvalue	(*(int32_t *)optval)
5800 
5801 		switch (option_name) {
5802 		case SO_TYPE:
5803 		case SO_ERROR:
5804 		case SO_ACCEPTCONN:
5805 			/* Can't be set */
5806 			error = ENOPROTOOPT;
5807 			goto done2;
5808 		case SO_LINGER: {
5809 			struct linger *l = (struct linger *)optval;
5810 
5811 			so->so_linger.l_linger = l->l_linger;
5812 			if (l->l_onoff) {
5813 				so->so_linger.l_onoff = SO_LINGER;
5814 				so->so_options |= SO_LINGER;
5815 			} else {
5816 				so->so_linger.l_onoff = 0;
5817 				so->so_options &= ~SO_LINGER;
5818 			}
5819 			break;
5820 		}
5821 
5822 		case SO_DEBUG:
5823 #ifdef SOCK_TEST
5824 			if (intvalue & 2)
5825 				sock_test_timelimit = 10 * hz;
5826 			else
5827 				sock_test_timelimit = 0;
5828 
5829 			if (intvalue & 4)
5830 				do_useracc = 0;
5831 			else
5832 				do_useracc = 1;
5833 #endif /* SOCK_TEST */
5834 			/* FALLTHRU */
5835 		case SO_REUSEADDR:
5836 		case SO_KEEPALIVE:
5837 		case SO_DONTROUTE:
5838 		case SO_BROADCAST:
5839 		case SO_USELOOPBACK:
5840 		case SO_OOBINLINE:
5841 		case SO_DGRAM_ERRIND:
5842 			if (intvalue != 0) {
5843 				dprintso(so, 1,
5844 				    ("socket_setsockopt: setting 0x%x\n",
5845 				    option_name));
5846 				so->so_options |= option_name;
5847 			} else {
5848 				dprintso(so, 1,
5849 				    ("socket_setsockopt: clearing 0x%x\n",
5850 				    option_name));
5851 				so->so_options &= ~option_name;
5852 			}
5853 			break;
5854 		/*
5855 		 * The following options are only returned by us when the
5856 		 * transport layer fails.
5857 		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5858 		 * since the transport might adjust the value and not
5859 		 * return exactly what was set by the application.
5860 		 */
5861 		case SO_SNDBUF:
5862 			so->so_sndbuf = intvalue;
5863 			break;
5864 		case SO_RCVBUF:
5865 			so->so_rcvbuf = intvalue;
5866 			break;
5867 		case SO_RCVPSH:
5868 			so->so_rcv_timer_interval = intvalue;
5869 			break;
5870 #ifdef notyet
5871 		/*
5872 		 * We do not implement the semantics of these options
5873 		 * thus we shouldn't implement the options either.
5874 		 */
5875 		case SO_SNDLOWAT:
5876 			so->so_sndlowat = intvalue;
5877 			break;
5878 		case SO_RCVLOWAT:
5879 			so->so_rcvlowat = intvalue;
5880 			break;
5881 #endif /* notyet */
5882 		case SO_SNDTIMEO:
5883 		case SO_RCVTIMEO: {
5884 			struct timeval tl;
5885 			clock_t val;
5886 
5887 			if (get_udatamodel() == DATAMODEL_NONE ||
5888 			    get_udatamodel() == DATAMODEL_NATIVE)
5889 				bcopy(&tl, (struct timeval *)optval,
5890 				    sizeof (struct timeval));
5891 			else
5892 				TIMEVAL32_TO_TIMEVAL(&tl,
5893 				    (struct timeval32 *)optval);
5894 			val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5895 			if (option_name == SO_RCVTIMEO)
5896 				so->so_rcvtimeo = drv_usectohz(val);
5897 			else
5898 				so->so_sndtimeo = drv_usectohz(val);
5899 			break;
5900 		}
5901 		}
5902 #undef	intvalue
5903 
5904 		if (error) {
5905 			if ((error == ENOPROTOOPT || error == EPROTO ||
5906 			    error == EINVAL) && handled) {
5907 				dprintso(so, 1,
5908 				    ("setsockopt: ignoring error %d for 0x%x\n",
5909 				    error, option_name));
5910 				error = 0;
5911 			}
5912 		}
5913 	}
5914 done2:
5915 	so_unlock_single(so, SOLOCKED);
5916 	mutex_exit(&so->so_lock);
5917 	return (error);
5918 }
5919 
5920 /*
5921  * sotpi_close() is called when the last open reference goes away.
5922  */
5923 /* ARGSUSED */
5924 int
5925 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5926 {
5927 	struct vnode *vp = SOTOV(so);
5928 	dev_t dev;
5929 	int error = 0;
5930 	sotpi_info_t *sti = SOTOTPI(so);
5931 
5932 	dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5933 	    (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5934 
5935 	dev = sti->sti_dev;
5936 
5937 	ASSERT(STREAMSTAB(getmajor(dev)));
5938 
5939 	mutex_enter(&so->so_lock);
5940 	so_lock_single(so);	/* Set SOLOCKED */
5941 
5942 	ASSERT(so_verify_oobstate(so));
5943 
5944 	if (sti->sti_nl7c_flags & NL7C_ENABLED) {
5945 		sti->sti_nl7c_flags = 0;
5946 		nl7c_close(so);
5947 	}
5948 
5949 	if (vp->v_stream != NULL) {
5950 		vnode_t *ux_vp;
5951 
5952 		if (so->so_family == AF_UNIX) {
5953 			/* Could avoid this when CANTSENDMORE for !dgram */
5954 			so_unix_close(so);
5955 		}
5956 
5957 		mutex_exit(&so->so_lock);
5958 		/*
5959 		 * Disassemble the linkage from the AF_UNIX underlying file
5960 		 * system vnode to this socket (by atomically clearing
5961 		 * v_stream in vn_rele_stream) before strclose clears sd_vnode
5962 		 * and frees the stream head.
5963 		 */
5964 		if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5965 			ASSERT(ux_vp->v_stream);
5966 			sti->sti_ux_bound_vp = NULL;
5967 			vn_rele_stream(ux_vp);
5968 		}
5969 		if (so->so_family == AF_INET || so->so_family == AF_INET6) {
5970 			strsetrwputdatahooks(SOTOV(so), NULL, NULL);
5971 			if (sti->sti_kssl_ent != NULL) {
5972 				kssl_release_ent(sti->sti_kssl_ent, so,
5973 				    sti->sti_kssl_type);
5974 				sti->sti_kssl_ent = NULL;
5975 			}
5976 			if (sti->sti_kssl_ctx != NULL) {
5977 				kssl_release_ctx(sti->sti_kssl_ctx);
5978 				sti->sti_kssl_ctx = NULL;
5979 			}
5980 			sti->sti_kssl_type = KSSL_NO_PROXY;
5981 		}
5982 		error = strclose(vp, flag, cr);
5983 		vp->v_stream = NULL;
5984 		mutex_enter(&so->so_lock);
5985 	}
5986 
5987 	/*
5988 	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
5989 	 */
5990 	so_flush_discon_ind(so);
5991 
5992 	so_unlock_single(so, SOLOCKED);
5993 	mutex_exit(&so->so_lock);
5994 
5995 	/*
5996 	 * Needed for STREAMs.
5997 	 * Decrement the device driver's reference count for streams
5998 	 * opened via the clone dip. The driver was held in clone_open().
5999 	 * The absence of clone_close() forces this asymmetry.
6000 	 */
6001 	if (so->so_flag & SOCLONE)
6002 		ddi_rele_driver(getmajor(dev));
6003 
6004 	return (error);
6005 }
6006 
6007 static int
6008 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
6009     struct cred *cr, int32_t *rvalp)
6010 {
6011 	struct vnode *vp = SOTOV(so);
6012 	sotpi_info_t *sti = SOTOTPI(so);
6013 	int error = 0;
6014 
6015 	dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
6016 	    cmd, arg, pr_state(so->so_state, so->so_mode)));
6017 
6018 	switch (cmd) {
6019 	case SIOCSQPTR:
6020 		/*
6021 		 * SIOCSQPTR is valid only when helper stream is created
6022 		 * by the protocol.
6023 		 */
6024 	case _I_INSERT:
6025 	case _I_REMOVE:
6026 		/*
6027 		 * Since there's no compelling reason to support these ioctls
6028 		 * on sockets, and doing so would increase the complexity
6029 		 * markedly, prevent it.
6030 		 */
6031 		return (EOPNOTSUPP);
6032 
6033 	case I_FIND:
6034 	case I_LIST:
6035 	case I_LOOK:
6036 	case I_POP:
6037 	case I_PUSH:
6038 		/*
6039 		 * To prevent races and inconsistencies between the actual
6040 		 * state of the stream and the state according to the sonode,
6041 		 * we serialize all operations which modify or operate on the
6042 		 * list of modules on the socket's stream.
6043 		 */
6044 		mutex_enter(&sti->sti_plumb_lock);
6045 		error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
6046 		mutex_exit(&sti->sti_plumb_lock);
6047 		return (error);
6048 
6049 	default:
6050 		if (so->so_version != SOV_STREAM)
6051 			break;
6052 
6053 		/*
6054 		 * The imaginary "sockmod" has been popped; act as a stream.
6055 		 */
6056 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6057 	}
6058 
6059 	ASSERT(so->so_version != SOV_STREAM);
6060 
6061 	/*
6062 	 * Process socket-specific ioctls.
6063 	 */
6064 	switch (cmd) {
6065 	case FIONBIO: {
6066 		int32_t value;
6067 
6068 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
6069 		    (mode & (int)FKIOCTL)))
6070 			return (EFAULT);
6071 
6072 		mutex_enter(&so->so_lock);
6073 		if (value) {
6074 			so->so_state |= SS_NDELAY;
6075 		} else {
6076 			so->so_state &= ~SS_NDELAY;
6077 		}
6078 		mutex_exit(&so->so_lock);
6079 		return (0);
6080 	}
6081 
6082 	case FIOASYNC: {
6083 		int32_t value;
6084 
6085 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
6086 		    (mode & (int)FKIOCTL)))
6087 			return (EFAULT);
6088 
6089 		mutex_enter(&so->so_lock);
6090 		/*
6091 		 * SS_ASYNC flag not already set correctly?
6092 		 * (!value != !(so->so_state & SS_ASYNC))
6093 		 * but some engineers find that too hard to read.
6094 		 */
6095 		if (value == 0 && (so->so_state & SS_ASYNC) != 0 ||
6096 		    value != 0 && (so->so_state & SS_ASYNC) == 0)
6097 			error = so_flip_async(so, vp, mode, cr);
6098 		mutex_exit(&so->so_lock);
6099 		return (error);
6100 	}
6101 
6102 	case SIOCSPGRP:
6103 	case FIOSETOWN: {
6104 		pid_t pgrp;
6105 
6106 		if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
6107 		    (mode & (int)FKIOCTL)))
6108 			return (EFAULT);
6109 
6110 		mutex_enter(&so->so_lock);
6111 		dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
6112 		/* Any change? */
6113 		if (pgrp != so->so_pgrp)
6114 			error = so_set_siggrp(so, vp, pgrp, mode, cr);
6115 		mutex_exit(&so->so_lock);
6116 		return (error);
6117 	}
6118 	case SIOCGPGRP:
6119 	case FIOGETOWN:
6120 		if (so_copyout(&so->so_pgrp, (void *)arg,
6121 		    sizeof (pid_t), (mode & (int)FKIOCTL)))
6122 			return (EFAULT);
6123 		return (0);
6124 
6125 	case SIOCATMARK: {
6126 		int retval;
6127 		uint_t so_state;
6128 
6129 		/*
6130 		 * strwaitmark has a finite timeout after which it
6131 		 * returns -1 if the mark state is undetermined.
6132 		 * In order to avoid any race between the mark state
6133 		 * in sockfs and the mark state in the stream head this
6134 		 * routine loops until the mark state can be determined
6135 		 * (or the urgent data indication has been removed by some
6136 		 * other thread).
6137 		 */
6138 		do {
6139 			mutex_enter(&so->so_lock);
6140 			so_state = so->so_state;
6141 			mutex_exit(&so->so_lock);
6142 			if (so_state & SS_RCVATMARK) {
6143 				retval = 1;
6144 			} else if (!(so_state & SS_OOBPEND)) {
6145 				/*
6146 				 * No SIGURG has been generated -- there is no
6147 				 * pending or present urgent data. Thus can't
6148 				 * possibly be at the mark.
6149 				 */
6150 				retval = 0;
6151 			} else {
6152 				/*
6153 				 * Have the stream head wait until there is
6154 				 * either some messages on the read queue, or
6155 				 * STRATMARK or STRNOTATMARK gets set. The
6156 				 * STRNOTATMARK flag is used so that the
6157 				 * transport can send up a MSGNOTMARKNEXT
6158 				 * M_DATA to indicate that it is not
6159 				 * at the mark and additional data is not about
6160 				 * to be send upstream.
6161 				 *
6162 				 * If the mark state is undetermined this will
6163 				 * return -1 and we will loop rechecking the
6164 				 * socket state.
6165 				 */
6166 				retval = strwaitmark(vp);
6167 			}
6168 		} while (retval == -1);
6169 
6170 		if (so_copyout(&retval, (void *)arg, sizeof (int),
6171 		    (mode & (int)FKIOCTL)))
6172 			return (EFAULT);
6173 		return (0);
6174 	}
6175 
6176 	case I_FDINSERT:
6177 	case I_SENDFD:
6178 	case I_RECVFD:
6179 	case I_ATMARK:
6180 	case _SIOCSOCKFALLBACK:
6181 		/*
6182 		 * These ioctls do not apply to sockets. I_FDINSERT can be
6183 		 * used to send M_PROTO messages without modifying the socket
6184 		 * state. I_SENDFD/RECVFD should not be used for socket file
6185 		 * descriptor passing since they assume a twisted stream.
6186 		 * SIOCATMARK must be used instead of I_ATMARK.
6187 		 *
6188 		 * _SIOCSOCKFALLBACK from an application should never be
6189 		 * processed.  It is only generated by socktpi_open() or
6190 		 * in response to I_POP or I_PUSH.
6191 		 */
6192 #ifdef DEBUG
6193 		zcmn_err(getzoneid(), CE_WARN,
6194 		    "Unsupported STREAMS ioctl 0x%x on socket. "
6195 		    "Pid = %d\n", cmd, curproc->p_pid);
6196 #endif /* DEBUG */
6197 		return (EOPNOTSUPP);
6198 
6199 	case _I_GETPEERCRED:
6200 		if ((mode & FKIOCTL) == 0)
6201 			return (EINVAL);
6202 
6203 		mutex_enter(&so->so_lock);
6204 		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
6205 			error = ENOTSUP;
6206 		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
6207 			error = ENOTCONN;
6208 		} else if (so->so_peercred != NULL) {
6209 			k_peercred_t *kp = (k_peercred_t *)arg;
6210 			kp->pc_cr = so->so_peercred;
6211 			kp->pc_cpid = so->so_cpid;
6212 			crhold(so->so_peercred);
6213 		} else {
6214 			error = EINVAL;
6215 		}
6216 		mutex_exit(&so->so_lock);
6217 		return (error);
6218 
6219 	default:
6220 		/*
6221 		 * Do the higher-order bits of the ioctl cmd indicate
6222 		 * that it is an I_* streams ioctl?
6223 		 */
6224 		if ((cmd & 0xffffff00U) == STR &&
6225 		    so->so_version == SOV_SOCKBSD) {
6226 #ifdef DEBUG
6227 			zcmn_err(getzoneid(), CE_WARN,
6228 			    "Unsupported STREAMS ioctl 0x%x on socket. "
6229 			    "Pid = %d\n", cmd, 	curproc->p_pid);
6230 #endif /* DEBUG */
6231 			return (EOPNOTSUPP);
6232 		}
6233 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6234 	}
6235 }
6236 
6237 /*
6238  * Handle plumbing-related ioctls.
6239  */
6240 static int
6241 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
6242     struct cred *cr, int32_t *rvalp)
6243 {
6244 	static const char sockmod_name[] = "sockmod";
6245 	struct sonode	*so = VTOSO(vp);
6246 	char		mname[FMNAMESZ + 1];
6247 	int		error;
6248 	sotpi_info_t	*sti = SOTOTPI(so);
6249 
6250 	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
6251 
6252 	if (so->so_version == SOV_SOCKBSD)
6253 		return (EOPNOTSUPP);
6254 
6255 	if (so->so_version == SOV_STREAM) {
6256 		/*
6257 		 * The imaginary "sockmod" has been popped - act as a stream.
6258 		 * If this is a push of sockmod then change back to a socket.
6259 		 */
6260 		if (cmd == I_PUSH) {
6261 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6262 			    (void *)arg, mname, sizeof (mname), NULL);
6263 
6264 			if (error == 0 && strcmp(mname, sockmod_name) == 0) {
6265 				dprintso(so, 0, ("socktpi_ioctl: going to "
6266 				    "socket version\n"));
6267 				so_stream2sock(so);
6268 				return (0);
6269 			}
6270 		}
6271 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6272 	}
6273 
6274 	switch (cmd) {
6275 	case I_PUSH:
6276 		if (sti->sti_direct) {
6277 			mutex_enter(&so->so_lock);
6278 			so_lock_single(so);
6279 			mutex_exit(&so->so_lock);
6280 
6281 			error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
6282 			    cr, rvalp);
6283 
6284 			mutex_enter(&so->so_lock);
6285 			if (error == 0)
6286 				sti->sti_direct = 0;
6287 			so_unlock_single(so, SOLOCKED);
6288 			mutex_exit(&so->so_lock);
6289 
6290 			if (error != 0)
6291 				return (error);
6292 		}
6293 
6294 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6295 		if (error == 0)
6296 			sti->sti_pushcnt++;
6297 		return (error);
6298 
6299 	case I_POP:
6300 		if (sti->sti_pushcnt == 0) {
6301 			/* Emulate sockmod being popped */
6302 			dprintso(so, 0,
6303 			    ("socktpi_ioctl: going to STREAMS version\n"));
6304 			return (so_sock2stream(so));
6305 		}
6306 
6307 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6308 		if (error == 0)
6309 			sti->sti_pushcnt--;
6310 		return (error);
6311 
6312 	case I_LIST: {
6313 		struct str_mlist *kmlistp, *umlistp;
6314 		struct str_list	kstrlist;
6315 		ssize_t		kstrlistsize;
6316 		int		i, nmods;
6317 
6318 		STRUCT_DECL(str_list, ustrlist);
6319 		STRUCT_INIT(ustrlist, mode);
6320 
6321 		if (arg == NULL) {
6322 			error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6323 			if (error == 0)
6324 				(*rvalp)++;	/* Add one for sockmod */
6325 			return (error);
6326 		}
6327 
6328 		error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
6329 		    STRUCT_SIZE(ustrlist), mode & FKIOCTL);
6330 		if (error != 0)
6331 			return (error);
6332 
6333 		nmods = STRUCT_FGET(ustrlist, sl_nmods);
6334 		if (nmods <= 0)
6335 			return (EINVAL);
6336 		/*
6337 		 * Ceiling nmods at nstrpush to prevent someone from
6338 		 * maliciously consuming lots of kernel memory.
6339 		 */
6340 		nmods = MIN(nmods, nstrpush);
6341 
6342 		kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6343 		kstrlist.sl_nmods = nmods;
6344 		kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6345 
6346 		error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6347 		    cr, rvalp);
6348 		if (error != 0)
6349 			goto done;
6350 
6351 		/*
6352 		 * Considering the module list as a 0-based array of sl_nmods
6353 		 * modules, sockmod should conceptually exist at slot
6354 		 * sti_pushcnt.  Insert sockmod at this location by sliding all
6355 		 * of the module names after so_pushcnt over by one.  We know
6356 		 * that there will be room to do this since we allocated
6357 		 * sl_modlist with an additional slot.
6358 		 */
6359 		for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6360 			kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6361 
6362 		(void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6363 		kstrlist.sl_nmods++;
6364 
6365 		/*
6366 		 * Copy all of the entries out to ustrlist.
6367 		 */
6368 		kmlistp = kstrlist.sl_modlist;
6369 		umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6370 		for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6371 			error = so_copyout(kmlistp++, umlistp++,
6372 			    sizeof (struct str_mlist), mode & FKIOCTL);
6373 			if (error != 0)
6374 				goto done;
6375 		}
6376 
6377 		error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6378 		    mode & FKIOCTL);
6379 		if (error == 0)
6380 			*rvalp = 0;
6381 	done:
6382 		kmem_free(kstrlist.sl_modlist, kstrlistsize);
6383 		return (error);
6384 	}
6385 	case I_LOOK:
6386 		if (sti->sti_pushcnt == 0) {
6387 			return (so_copyout(sockmod_name, (void *)arg,
6388 			    sizeof (sockmod_name), mode & FKIOCTL));
6389 		}
6390 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6391 
6392 	case I_FIND:
6393 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6394 		if (error && error != EINVAL)
6395 			return (error);
6396 
6397 		/* if not found and string was sockmod return 1 */
6398 		if (*rvalp == 0 || error == EINVAL) {
6399 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6400 			    (void *)arg, mname, sizeof (mname), NULL);
6401 			if (error == ENAMETOOLONG)
6402 				error = EINVAL;
6403 
6404 			if (error == 0 && strcmp(mname, sockmod_name) == 0)
6405 				*rvalp = 1;
6406 		}
6407 		return (error);
6408 
6409 	default:
6410 		panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6411 		break;
6412 	}
6413 
6414 	return (0);
6415 }
6416 
6417 /*
6418  * Wrapper around the streams poll routine that implements socket poll
6419  * semantics.
6420  * The sockfs never calls pollwakeup itself - the stream head take care
6421  * of all pollwakeups. Since sockfs never holds so_lock when calling the
6422  * stream head there can never be a deadlock due to holding so_lock across
6423  * pollwakeup and acquiring so_lock in this routine.
6424  *
6425  * However, since the performance of VOP_POLL is critical we avoid
6426  * acquiring so_lock here. This is based on two assumptions:
6427  *  - The poll implementation holds locks to serialize the VOP_POLL call
6428  *    and a pollwakeup for the same pollhead. This ensures that should
6429  *    e.g. so_state change during a socktpi_poll call the pollwakeup
6430  *    (which strsock_* and strrput conspire to issue) is issued after
6431  *    the state change. Thus the pollwakeup will block until VOP_POLL has
6432  *    returned and then wake up poll and have it call VOP_POLL again.
6433  *  - The reading of so_state without holding so_lock does not result in
6434  *    stale data that is older than the latest state change that has dropped
6435  *    so_lock. This is ensured by the mutex_exit issuing the appropriate
6436  *    memory barrier to force the data into the coherency domain.
6437  */
6438 static int
6439 sotpi_poll(
6440 	struct sonode	*so,
6441 	short		events,
6442 	int		anyyet,
6443 	short		*reventsp,
6444 	struct pollhead **phpp)
6445 {
6446 	short origevents = events;
6447 	struct vnode *vp = SOTOV(so);
6448 	int error;
6449 	int so_state = so->so_state;	/* snapshot */
6450 	sotpi_info_t *sti = SOTOTPI(so);
6451 
6452 	dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6453 	    (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6454 
6455 	ASSERT(vp->v_type == VSOCK);
6456 	ASSERT(vp->v_stream != NULL);
6457 
6458 	if (so->so_version == SOV_STREAM) {
6459 		/* The imaginary "sockmod" has been popped - act as a stream */
6460 		return (strpoll(vp->v_stream, events, anyyet,
6461 		    reventsp, phpp));
6462 	}
6463 
6464 	if (!(so_state & SS_ISCONNECTED) &&
6465 	    (so->so_mode & SM_CONNREQUIRED)) {
6466 		/* Not connected yet - turn off write side events */
6467 		events &= ~(POLLOUT|POLLWRBAND);
6468 	}
6469 	/*
6470 	 * Check for errors without calling strpoll if the caller wants them.
6471 	 * In sockets the errors are represented as input/output events
6472 	 * and there is no need to ask the stream head for this information.
6473 	 */
6474 	if (so->so_error != 0 &&
6475 	    ((POLLIN|POLLRDNORM|POLLOUT) & origevents)  != 0) {
6476 		*reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6477 		return (0);
6478 	}
6479 	/*
6480 	 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6481 	 * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
6482 	 * will not trigger a POLLIN event with POLLRDDATA set.
6483 	 * The handling of urgent data (causing POLLRDBAND) is done by
6484 	 * inspecting SS_OOBPEND below.
6485 	 */
6486 	events |= POLLRDDATA;
6487 
6488 	/*
6489 	 * After shutdown(output) a stream head write error is set.
6490 	 * However, we should not return output events.
6491 	 */
6492 	events |= POLLNOERR;
6493 	error = strpoll(vp->v_stream, events, anyyet,
6494 	    reventsp, phpp);
6495 	if (error)
6496 		return (error);
6497 
6498 	ASSERT(!(*reventsp & POLLERR));
6499 
6500 	/*
6501 	 * Notes on T_CONN_IND handling for sockets.
6502 	 *
6503 	 * If strpoll() returned without events, SR_POLLIN is guaranteed
6504 	 * to be set, ensuring any subsequent strrput() runs pollwakeup().
6505 	 *
6506 	 * Since the so_lock is not held, soqueueconnind() may have run
6507 	 * and a T_CONN_IND may be waiting. We now check for any queued
6508 	 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6509 	 * to ensure poll returns.
6510 	 *
6511 	 * However:
6512 	 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6513 	 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6514 	 * the following actions will occur; taken together they ensure the
6515 	 * syscall will return.
6516 	 *
6517 	 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6518 	 *    the accept() was run on a non-blocking socket sowaitconnind()
6519 	 *    may have already returned EWOULDBLOCK, so not be waiting to
6520 	 *    process the message. Additionally socktpi_poll() has probably
6521 	 *    proceeded past the sti_conn_ind_head check below.
6522 	 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6523 	 *    this thread,  however that could occur before poll_common()
6524 	 *    has entered cv_wait.
6525 	 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6526 	 *
6527 	 * Before proceeding to cv_wait() in poll_common() for an event,
6528 	 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6529 	 * and if set, re-calls strpoll() to ensure the late arriving
6530 	 * T_CONN_IND is recognized, and pollsys() returns.
6531 	 */
6532 
6533 	if (sti->sti_conn_ind_head != NULL)
6534 		*reventsp |= (POLLIN|POLLRDNORM) & events;
6535 
6536 	if (so->so_state & SS_OOBPEND)
6537 		*reventsp |= POLLRDBAND & events;
6538 
6539 	if (sti->sti_nl7c_rcv_mp != NULL) {
6540 		*reventsp |= (POLLIN|POLLRDNORM) & events;
6541 	}
6542 	if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
6543 	    ((POLLIN|POLLRDNORM) & *reventsp)) {
6544 		sti->sti_nl7c_flags |= NL7C_POLLIN;
6545 	}
6546 
6547 	return (0);
6548 }
6549 
6550 /*ARGSUSED*/
6551 static int
6552 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6553 {
6554 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6555 	int error = 0;
6556 
6557 	error = sonode_constructor(buf, cdrarg, kmflags);
6558 	if (error != 0)
6559 		return (error);
6560 
6561 	error = i_sotpi_info_constructor(&st->st_info);
6562 	if (error != 0)
6563 		sonode_destructor(buf, cdrarg);
6564 
6565 	st->st_sonode.so_priv = &st->st_info;
6566 
6567 	return (error);
6568 }
6569 
6570 /*ARGSUSED1*/
6571 static void
6572 socktpi_destructor(void *buf, void *cdrarg)
6573 {
6574 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6575 
6576 	ASSERT(st->st_sonode.so_priv == &st->st_info);
6577 	st->st_sonode.so_priv = NULL;
6578 
6579 	i_sotpi_info_destructor(&st->st_info);
6580 	sonode_destructor(buf, cdrarg);
6581 }
6582 
6583 static int
6584 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6585 {
6586 	int retval;
6587 
6588 	if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6589 		struct sonode *so = (struct sonode *)buf;
6590 		sotpi_info_t *sti = SOTOTPI(so);
6591 
6592 		mutex_enter(&socklist.sl_lock);
6593 
6594 		sti->sti_next_so = socklist.sl_list;
6595 		sti->sti_prev_so = NULL;
6596 		if (sti->sti_next_so != NULL)
6597 			SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6598 		socklist.sl_list = so;
6599 
6600 		mutex_exit(&socklist.sl_lock);
6601 
6602 	}
6603 	return (retval);
6604 }
6605 
6606 static void
6607 socktpi_unix_destructor(void *buf, void *cdrarg)
6608 {
6609 	struct sonode	*so = (struct sonode *)buf;
6610 	sotpi_info_t	*sti = SOTOTPI(so);
6611 
6612 	mutex_enter(&socklist.sl_lock);
6613 
6614 	if (sti->sti_next_so != NULL)
6615 		SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6616 	if (sti->sti_prev_so != NULL)
6617 		SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6618 	else
6619 		socklist.sl_list = sti->sti_next_so;
6620 
6621 	mutex_exit(&socklist.sl_lock);
6622 
6623 	socktpi_destructor(buf, cdrarg);
6624 }
6625 
6626 int
6627 socktpi_init(void)
6628 {
6629 	/*
6630 	 * Create sonode caches.  We create a special one for AF_UNIX so
6631 	 * that we can track them for netstat(1m).
6632 	 */
6633 	socktpi_cache = kmem_cache_create("socktpi_cache",
6634 	    sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6635 	    socktpi_destructor, NULL, NULL, NULL, 0);
6636 
6637 	socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6638 	    sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6639 	    socktpi_unix_destructor, NULL, NULL, NULL, 0);
6640 
6641 	return (0);
6642 }
6643 
6644 /*
6645  * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6646  *
6647  * Caller must still update state and mode using sotpi_update_state().
6648  */
6649 int
6650 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6651     boolean_t *direct, queue_t **qp, struct cred *cr)
6652 {
6653 	sotpi_info_t *sti;
6654 	struct sockparams *origsp = so->so_sockparams;
6655 	sock_lower_handle_t handle = so->so_proto_handle;
6656 	struct stdata *stp;
6657 	struct vnode *vp;
6658 	queue_t *q;
6659 	int error = 0;
6660 
6661 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6662 	    SS_FALLBACK_PENDING);
6663 	ASSERT(SOCK_IS_NONSTR(so));
6664 
6665 	*qp = NULL;
6666 	*direct = B_FALSE;
6667 	so->so_sockparams = newsp;
6668 	/*
6669 	 * Allocate and initalize fields required by TPI.
6670 	 */
6671 	(void) sotpi_info_create(so, KM_SLEEP);
6672 	sotpi_info_init(so);
6673 
6674 	if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6675 		sotpi_info_fini(so);
6676 		sotpi_info_destroy(so);
6677 		return (error);
6678 	}
6679 	ASSERT(handle == so->so_proto_handle);
6680 	sti = SOTOTPI(so);
6681 	if (sti->sti_direct != 0)
6682 		*direct = B_TRUE;
6683 
6684 	/*
6685 	 * When it comes to urgent data we have two cases to deal with;
6686 	 * (1) The oob byte has already arrived, or (2) the protocol has
6687 	 * notified that oob data is pending, but it has not yet arrived.
6688 	 *
6689 	 * For (1) all we need to do is send a T_EXDATA_IND to indicate were
6690 	 * in the byte stream the oob byte is. For (2) we have to send a
6691 	 * SIGURG (M_PCSIG), followed by a zero-length mblk indicating whether
6692 	 * the oob byte will be the next byte from the protocol.
6693 	 *
6694 	 * So in the worst case we need two mblks, one for the signal, another
6695 	 * for mark indication. In that case we use the exdata_mp for the sig.
6696 	 */
6697 	sti->sti_exdata_mp = allocb_wait(sizeof (struct T_exdata_ind), BPRI_MED,
6698 	    STR_NOSIG, NULL);
6699 	sti->sti_urgmark_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);
6700 
6701 	/*
6702 	 * Keep the original sp around so we can properly dispose of the
6703 	 * sonode when the socket is being closed.
6704 	 */
6705 	sti->sti_orig_sp = origsp;
6706 
6707 	so_basic_strinit(so);	/* skips the T_CAPABILITY_REQ */
6708 	so_alloc_addr(so, so->so_max_addr_len);
6709 
6710 	/*
6711 	 * If the application has done a SIOCSPGRP, make sure the
6712 	 * STREAM head is aware. This needs to take place before
6713 	 * the protocol start sending up messages. Otherwise we
6714 	 * might miss to generate SIGPOLL.
6715 	 *
6716 	 * It is possible that the application will receive duplicate
6717 	 * signals if some were already generated for either data or
6718 	 * connection indications.
6719 	 */
6720 	if (so->so_pgrp != 0) {
6721 		if (so_set_events(so, so->so_vnode, cr) != 0)
6722 			so->so_pgrp = 0;
6723 	}
6724 
6725 	/*
6726 	 * Determine which queue to use.
6727 	 */
6728 	vp = SOTOV(so);
6729 	stp = vp->v_stream;
6730 	ASSERT(stp != NULL);
6731 	q = stp->sd_wrq->q_next;
6732 
6733 	/*
6734 	 * Skip any modules that may have been auto pushed when the device
6735 	 * was opened
6736 	 */
6737 	while (q->q_next != NULL)
6738 		q = q->q_next;
6739 	*qp = _RD(q);
6740 
6741 	/* This is now a STREAMS sockets */
6742 	so->so_not_str = B_FALSE;
6743 
6744 	return (error);
6745 }
6746 
6747 /*
6748  * Revert a TPI sonode. It is only allowed to revert the sonode during
6749  * the fallback process.
6750  */
6751 void
6752 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6753 {
6754 	vnode_t *vp = SOTOV(so);
6755 
6756 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6757 	    SS_FALLBACK_PENDING);
6758 	ASSERT(!SOCK_IS_NONSTR(so));
6759 	ASSERT(vp->v_stream != NULL);
6760 
6761 	if (SOTOTPI(so)->sti_exdata_mp != NULL) {
6762 		freeb(SOTOTPI(so)->sti_exdata_mp);
6763 		SOTOTPI(so)->sti_exdata_mp = NULL;
6764 	}
6765 
6766 	if (SOTOTPI(so)->sti_urgmark_mp != NULL) {
6767 		freeb(SOTOTPI(so)->sti_urgmark_mp);
6768 		SOTOTPI(so)->sti_urgmark_mp = NULL;
6769 	}
6770 
6771 	strclean(vp);
6772 	(void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6773 
6774 	/*
6775 	 * Restore the original sockparams. The caller is responsible for
6776 	 * dropping the ref to the new sp.
6777 	 */
6778 	so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6779 
6780 	sotpi_info_fini(so);
6781 	sotpi_info_destroy(so);
6782 
6783 	/* This is no longer a STREAMS sockets */
6784 	so->so_not_str = B_TRUE;
6785 }
6786 
6787 void
6788 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6789     struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6790     socklen_t faddrlen, short opts)
6791 {
6792 	sotpi_info_t *sti = SOTOTPI(so);
6793 
6794 	so_proc_tcapability_ack(so, tcap);
6795 
6796 	so->so_options |= opts;
6797 
6798 	/*
6799 	 * Determine whether the foreign and local address are valid
6800 	 */
6801 	if (laddrlen != 0) {
6802 		ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6803 		sti->sti_laddr_len = laddrlen;
6804 		bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6805 		sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6806 	}
6807 
6808 	if (faddrlen != 0) {
6809 		ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6810 		sti->sti_faddr_len = faddrlen;
6811 		bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6812 		sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6813 	}
6814 
6815 }
6816 
6817 /*
6818  * Allocate enough space to cache the local and foreign addresses.
6819  */
6820 void
6821 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6822 {
6823 	sotpi_info_t *sti = SOTOTPI(so);
6824 
6825 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6826 	ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6827 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6828 	    P2ROUNDUP(maxlen, KMEM_ALIGN);
6829 	so->so_max_addr_len = sti->sti_laddr_maxlen;
6830 	sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6831 	sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6832 	    + sti->sti_laddr_maxlen);
6833 
6834 	if (so->so_family == AF_UNIX) {
6835 		/*
6836 		 * Initialize AF_UNIX related fields.
6837 		 */
6838 		bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6839 		bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6840 	}
6841 }
6842 
6843 
6844 sotpi_info_t *
6845 sotpi_sototpi(struct sonode *so)
6846 {
6847 	sotpi_info_t *sti;
6848 
6849 	ASSERT(so != NULL);
6850 
6851 	sti = (sotpi_info_t *)so->so_priv;
6852 
6853 	ASSERT(sti != NULL);
6854 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6855 
6856 	return (sti);
6857 }
6858 
6859 static int
6860 i_sotpi_info_constructor(sotpi_info_t *sti)
6861 {
6862 	sti->sti_magic		= SOTPI_INFO_MAGIC;
6863 	sti->sti_ack_mp		= NULL;
6864 	sti->sti_discon_ind_mp	= NULL;
6865 	sti->sti_ux_bound_vp	= NULL;
6866 	sti->sti_unbind_mp	= NULL;
6867 
6868 	sti->sti_conn_ind_head	= NULL;
6869 	sti->sti_conn_ind_tail	= NULL;
6870 
6871 	sti->sti_laddr_sa	= NULL;
6872 	sti->sti_faddr_sa	= NULL;
6873 
6874 	sti->sti_nl7c_flags	= 0;
6875 	sti->sti_nl7c_uri	= NULL;
6876 	sti->sti_nl7c_rcv_mp	= NULL;
6877 
6878 	sti->sti_exdata_mp	= NULL;
6879 	sti->sti_urgmark_mp	= NULL;
6880 
6881 	mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6882 	cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6883 
6884 	return (0);
6885 }
6886 
6887 static void
6888 i_sotpi_info_destructor(sotpi_info_t *sti)
6889 {
6890 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6891 	ASSERT(sti->sti_ack_mp == NULL);
6892 	ASSERT(sti->sti_discon_ind_mp == NULL);
6893 	ASSERT(sti->sti_ux_bound_vp == NULL);
6894 	ASSERT(sti->sti_unbind_mp == NULL);
6895 
6896 	ASSERT(sti->sti_conn_ind_head == NULL);
6897 	ASSERT(sti->sti_conn_ind_tail == NULL);
6898 
6899 	ASSERT(sti->sti_laddr_sa == NULL);
6900 	ASSERT(sti->sti_faddr_sa == NULL);
6901 
6902 	ASSERT(sti->sti_nl7c_flags == 0);
6903 	ASSERT(sti->sti_nl7c_uri == NULL);
6904 	ASSERT(sti->sti_nl7c_rcv_mp == NULL);
6905 
6906 	ASSERT(sti->sti_exdata_mp == NULL);
6907 	ASSERT(sti->sti_urgmark_mp == NULL);
6908 
6909 	mutex_destroy(&sti->sti_plumb_lock);
6910 	cv_destroy(&sti->sti_ack_cv);
6911 }
6912 
6913 /*
6914  * Creates and attaches TPI information to the given sonode
6915  */
6916 static boolean_t
6917 sotpi_info_create(struct sonode *so, int kmflags)
6918 {
6919 	sotpi_info_t *sti;
6920 
6921 	ASSERT(so->so_priv == NULL);
6922 
6923 	if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6924 		return (B_FALSE);
6925 
6926 	if (i_sotpi_info_constructor(sti) != 0) {
6927 		kmem_free(sti, sizeof (*sti));
6928 		return (B_FALSE);
6929 	}
6930 
6931 	so->so_priv = (void *)sti;
6932 	return (B_TRUE);
6933 }
6934 
6935 /*
6936  * Initializes the TPI information.
6937  */
6938 static void
6939 sotpi_info_init(struct sonode *so)
6940 {
6941 	struct vnode *vp = SOTOV(so);
6942 	sotpi_info_t *sti = SOTOTPI(so);
6943 	time_t now;
6944 
6945 	sti->sti_dev 	= so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6946 	vp->v_rdev	= sti->sti_dev;
6947 
6948 	sti->sti_orig_sp = NULL;
6949 
6950 	sti->sti_pushcnt = 0;
6951 
6952 	now = gethrestime_sec();
6953 	sti->sti_atime	= now;
6954 	sti->sti_mtime	= now;
6955 	sti->sti_ctime	= now;
6956 
6957 	sti->sti_eaddr_mp = NULL;
6958 	sti->sti_delayed_error = 0;
6959 
6960 	sti->sti_provinfo = NULL;
6961 
6962 	sti->sti_oobcnt = 0;
6963 	sti->sti_oobsigcnt = 0;
6964 
6965 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6966 
6967 	sti->sti_laddr_sa	= 0;
6968 	sti->sti_faddr_sa	= 0;
6969 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6970 	sti->sti_laddr_len = sti->sti_faddr_len = 0;
6971 
6972 	sti->sti_laddr_valid = 0;
6973 	sti->sti_faddr_valid = 0;
6974 	sti->sti_faddr_noxlate = 0;
6975 
6976 	sti->sti_direct = 0;
6977 
6978 	ASSERT(sti->sti_ack_mp == NULL);
6979 	ASSERT(sti->sti_ux_bound_vp == NULL);
6980 	ASSERT(sti->sti_unbind_mp == NULL);
6981 
6982 	ASSERT(sti->sti_conn_ind_head == NULL);
6983 	ASSERT(sti->sti_conn_ind_tail == NULL);
6984 
6985 	/* Initialize the kernel SSL proxy fields */
6986 	sti->sti_kssl_type = KSSL_NO_PROXY;
6987 	sti->sti_kssl_ent = NULL;
6988 	sti->sti_kssl_ctx = NULL;
6989 }
6990 
6991 /*
6992  * Given a sonode, grab the TPI info and free any data.
6993  */
6994 static void
6995 sotpi_info_fini(struct sonode *so)
6996 {
6997 	sotpi_info_t *sti = SOTOTPI(so);
6998 	mblk_t *mp;
6999 
7000 	ASSERT(sti->sti_discon_ind_mp == NULL);
7001 
7002 	if ((mp = sti->sti_conn_ind_head) != NULL) {
7003 		mblk_t *mp1;
7004 
7005 		while (mp) {
7006 			mp1 = mp->b_next;
7007 			mp->b_next = NULL;
7008 			freemsg(mp);
7009 			mp = mp1;
7010 		}
7011 		sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
7012 	}
7013 
7014 	/*
7015 	 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
7016 	 * indirect them.  It also uses so_count as a validity test.
7017 	 */
7018 	mutex_enter(&so->so_lock);
7019 
7020 	if (sti->sti_laddr_sa) {
7021 		ASSERT((caddr_t)sti->sti_faddr_sa ==
7022 		    (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
7023 		ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
7024 		sti->sti_laddr_valid = 0;
7025 		sti->sti_faddr_valid = 0;
7026 		kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
7027 		sti->sti_laddr_sa = NULL;
7028 		sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
7029 		sti->sti_faddr_sa = NULL;
7030 		sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
7031 	}
7032 
7033 	mutex_exit(&so->so_lock);
7034 
7035 	if ((mp = sti->sti_eaddr_mp) != NULL) {
7036 		freemsg(mp);
7037 		sti->sti_eaddr_mp = NULL;
7038 		sti->sti_delayed_error = 0;
7039 	}
7040 
7041 	if ((mp = sti->sti_ack_mp) != NULL) {
7042 		freemsg(mp);
7043 		sti->sti_ack_mp = NULL;
7044 	}
7045 
7046 	if ((mp = sti->sti_nl7c_rcv_mp) != NULL) {
7047 		sti->sti_nl7c_rcv_mp = NULL;
7048 		freemsg(mp);
7049 	}
7050 	sti->sti_nl7c_rcv_rval = 0;
7051 	if (sti->sti_nl7c_uri != NULL) {
7052 		nl7c_urifree(so);
7053 		/* urifree() cleared nl7c_uri */
7054 	}
7055 	if (sti->sti_nl7c_flags) {
7056 		sti->sti_nl7c_flags = 0;
7057 	}
7058 
7059 	ASSERT(sti->sti_ux_bound_vp == NULL);
7060 	if ((mp = sti->sti_unbind_mp) != NULL) {
7061 		freemsg(mp);
7062 		sti->sti_unbind_mp = NULL;
7063 	}
7064 }
7065 
7066 /*
7067  * Destroys the TPI information attached to a sonode.
7068  */
7069 static void
7070 sotpi_info_destroy(struct sonode *so)
7071 {
7072 	sotpi_info_t *sti = SOTOTPI(so);
7073 
7074 	i_sotpi_info_destructor(sti);
7075 	kmem_free(sti, sizeof (*sti));
7076 
7077 	so->so_priv = NULL;
7078 }
7079 
7080 /*
7081  * Create the global sotpi socket module entry. It will never be freed.
7082  */
7083 smod_info_t *
7084 sotpi_smod_create(void)
7085 {
7086 	smod_info_t *smodp;
7087 
7088 	smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
7089 	smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
7090 	(void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
7091 	/*
7092 	 * Initialize the smod_refcnt to 1 so it will never be freed.
7093 	 */
7094 	smodp->smod_refcnt = 1;
7095 	smodp->smod_uc_version = SOCK_UC_VERSION;
7096 	smodp->smod_dc_version = SOCK_DC_VERSION;
7097 	smodp->smod_sock_create_func = &sotpi_create;
7098 	smodp->smod_sock_destroy_func = &sotpi_destroy;
7099 	return (smodp);
7100 }
7101