xref: /illumos-gate/usr/src/uts/common/fs/sockfs/socktpi.c (revision c2785286330b58810071fc24f6ca69f401682dff)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/t_lock.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/buf.h>
34 #include <sys/conf.h>
35 #include <sys/cred.h>
36 #include <sys/kmem.h>
37 #include <sys/sysmacros.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/debug.h>
41 #include <sys/errno.h>
42 #include <sys/time.h>
43 #include <sys/file.h>
44 #include <sys/open.h>
45 #include <sys/user.h>
46 #include <sys/termios.h>
47 #include <sys/stream.h>
48 #include <sys/strsubr.h>
49 #include <sys/strsun.h>
50 #include <sys/ddi.h>
51 #include <sys/esunddi.h>
52 #include <sys/flock.h>
53 #include <sys/modctl.h>
54 #include <sys/vtrace.h>
55 #include <sys/cmn_err.h>
56 #include <sys/pathname.h>
57 
58 #include <sys/socket.h>
59 #include <sys/socketvar.h>
60 #include <sys/sockio.h>
61 #include <sys/sodirect.h>
62 #include <netinet/in.h>
63 #include <sys/un.h>
64 #include <sys/strsun.h>
65 
66 #include <sys/tiuser.h>
67 #define	_SUN_TPI_VERSION	2
68 #include <sys/tihdr.h>
69 #include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
70 
71 #include <c2/audit.h>
72 
73 #include <inet/common.h>
74 #include <inet/ip.h>
75 #include <inet/ip6.h>
76 #include <inet/tcp.h>
77 #include <inet/udp_impl.h>
78 
79 #include <sys/zone.h>
80 
81 #include <fs/sockfs/nl7c.h>
82 #include <fs/sockfs/nl7curi.h>
83 
84 #include <inet/kssl/ksslapi.h>
85 
86 /*
87  * Possible failures when memory can't be allocated. The documented behavior:
88  *
89  * 		5.5:			4.X:		XNET:
90  * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
91  *							EINTR
92  *	(4.X does not document EINTR but returns it)
93  * bind:	ENOSR			-		ENOBUFS/ENOSR
94  * connect: 	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
95  * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
96  * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
97  *	(4.X getpeername and getsockname do not fail in practice)
98  * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
99  * listen:	-			-		ENOBUFS
100  * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
101  *							EINTR
102  * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
103  *							EINTR
104  * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
105  * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
106  * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
107  * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
108  *
109  * Resolution. When allocation fails:
110  *	recv: return EINTR
111  *	send: return EINTR
112  *	connect, accept: EINTR
113  *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
114  *	socket, socketpair: ENOBUFS
115  *	getpeername, getsockname: sleep
116  *	getsockopt, setsockopt: sleep
117  */
118 
/*
 * Switches that make sockfs deviate from (or conform to) standard TPI when
 * talking to the AF_INET transports.  In a SOCK_TEST build they are plain
 * int variables; in every other build they are compile-time constants with
 * the same values.
 *
 * solisten_tpi_tcp (0):
 *	TCP accepts an O_T_BIND_REQ carrying a larger backlog even when the
 *	transport is already bound.  That avoids losing the port number,
 *	which could happen if listen() issued a T_UNBIND_REQ followed by an
 *	O_T_BIND_REQ.
 *
 * soconnect_tpi_udp (0):
 *	UDP and ICMP accept a T_CONN_REQ.  This is what lets a connect()
 *	followed by getsockname() report the local IP address used to send
 *	packets to the connected-to destination.
 *
 * soconnect_tpi_tcp (0):
 *	TCP accepts a T_CONN_REQ without first seeing an O_T_BIND_REQ, which
 *	is a performance optimization.  Set to non-zero to send TPI
 *	conformant messages to TCP in this respect.
 *
 * soaccept_tpi_tcp (0):
 *	TCP can handle a T_CONN_REQ without the acceptor being bound.  This
 *	is a performance optimization that has been picked up in XTI.
 *
 * soaccept_tpi_multioptions (1):
 *	When the accepting socket inherits SOL_SOCKET options from the
 *	listener, send them as a single message for AF_INET{,6}.
 */
#ifdef SOCK_TEST

int solisten_tpi_tcp = 0;
int soconnect_tpi_udp = 0;
int soconnect_tpi_tcp = 0;
int soaccept_tpi_tcp = 0;
int soaccept_tpi_multioptions = 1;

/* Further SOCK_TEST-only variables that are defined outside this file. */
extern int do_useracc;
extern clock_t sock_test_timelimit;

#else /* !SOCK_TEST */

#define	solisten_tpi_tcp		0
#define	soconnect_tpi_udp		0
#define	soconnect_tpi_tcp		0
#define	soaccept_tpi_tcp		0
#define	soaccept_tpi_multioptions	1

#endif /* SOCK_TEST */
166 
/*
 * X/Open compatibility switches.
 *
 * xnet_skip_checks:
 *	Some of the checks added for X/Open might have to be backed out to
 *	keep SunOS 4.X applications working.  Setting this flag to non-zero
 *	disables those checks.
 */
int xnet_skip_checks = 0;

/*
 * xnet_check_print:
 *	When non-zero, a message is printed whenever one of the X/Open
 *	checks causes a request to fail.
 */
int xnet_check_print = 0;

/*
 * xnet_truncate_print:
 *	NOTE(review): not referenced in the visible part of this file;
 *	presumably the equivalent print switch for truncation - confirm
 *	against its users.
 */
int xnet_truncate_print = 0;
174 
175 extern	void sigintr(k_sigset_t *, int);
176 extern	void sigunintr(k_sigset_t *);
177 
178 extern	void *nl7c_lookup_addr(void *, t_uscalar_t);
179 extern	void *nl7c_add_addr(void *, t_uscalar_t);
180 extern	void nl7c_listener_addr(void *, struct sonode *);
181 
182 /* Sockets acting as an in-kernel SSL proxy */
183 extern mblk_t	*strsock_kssl_input(vnode_t *, mblk_t *, strwakeup_t *,
184 		    strsigset_t *, strsigset_t *, strpollset_t *);
185 extern mblk_t	*strsock_kssl_output(vnode_t *, mblk_t *, strwakeup_t *,
186 		    strsigset_t *, strsigset_t *, strpollset_t *);
187 
188 static int	sotpi_unbind(struct sonode *, int);
189 
190 extern int	sodput(sodirect_t *, mblk_t *);
191 extern void	sodwakeup(sodirect_t *);
192 
193 /* TPI sockfs sonode operations */
194 static int	sotpi_accept(struct sonode *, int, struct sonode **);
195 static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
196 		    int);
197 static int	sotpi_connect(struct sonode *, const struct sockaddr *,
198 		    socklen_t, int, int);
199 static int	sotpi_listen(struct sonode *, int);
200 static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
201 		    struct uio *);
202 static int	sotpi_shutdown(struct sonode *, int);
203 static int	sotpi_getsockname(struct sonode *);
204 static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
205 		    struct uio *, void *, t_uscalar_t, int);
206 static int	sodgram_direct(struct sonode *, struct sockaddr *,
207 		    socklen_t, struct uio *, int);
208 
209 sonodeops_t sotpi_sonodeops = {
210 	sotpi_accept,		/* sop_accept		*/
211 	sotpi_bind,		/* sop_bind		*/
212 	sotpi_listen,		/* sop_listen		*/
213 	sotpi_connect,		/* sop_connect		*/
214 	sotpi_recvmsg,		/* sop_recvmsg		*/
215 	sotpi_sendmsg,		/* sop_sendmsg		*/
216 	sotpi_getpeername,	/* sop_getpeername	*/
217 	sotpi_getsockname,	/* sop_getsockname	*/
218 	sotpi_shutdown,		/* sop_shutdown		*/
219 	sotpi_getsockopt,	/* sop_getsockopt	*/
220 	sotpi_setsockopt	/* sop_setsockopt	*/
221 };
222 
223 /*
224  * Common create code for socket and accept. If tso is set the values
225  * from that node is used instead of issuing a T_INFO_REQ.
226  *
227  * Assumes that the caller has a VN_HOLD on accessvp.
228  * The VN_RELE will occur either when sotpi_create() fails or when
229  * the returned sonode is freed.
230  */
231 struct sonode *
232 sotpi_create(vnode_t *accessvp, int domain, int type, int protocol, int version,
233     struct sonode *tso, int *errorp)
234 {
235 	struct sonode	*so;
236 	vnode_t		*vp;
237 	int		flags, error;
238 
239 	ASSERT(accessvp != NULL);
240 	vp = makesockvp(accessvp, domain, type, protocol);
241 	ASSERT(vp != NULL);
242 	so = VTOSO(vp);
243 
244 	flags = FREAD|FWRITE;
245 
246 	if ((type == SOCK_STREAM || type == SOCK_DGRAM) &&
247 	    (domain == AF_INET || domain == AF_INET6) &&
248 	    (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP ||
249 	    protocol == IPPROTO_IP)) {
250 		/* Tell tcp or udp that it's talking to sockets */
251 		flags |= SO_SOCKSTR;
252 
253 		/*
254 		 * Here we indicate to socktpi_open() our attempt to
255 		 * make direct calls between sockfs and transport.
256 		 * The final decision is left to socktpi_open().
257 		 */
258 		so->so_state |= SS_DIRECT;
259 
260 		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
261 		if (so->so_type == SOCK_STREAM && tso != NULL) {
262 			if (tso->so_state & SS_DIRECT) {
263 				/*
264 				 * Inherit SS_DIRECT from listener and pass
265 				 * SO_ACCEPTOR open flag to tcp, indicating
266 				 * that this is an accept fast-path instance.
267 				 */
268 				flags |= SO_ACCEPTOR;
269 			} else {
270 				/*
271 				 * SS_DIRECT is not set on listener, meaning
272 				 * that the listener has been converted from
273 				 * a socket to a stream.  Ensure that the
274 				 * acceptor inherits these settings.
275 				 */
276 				so->so_state &= ~SS_DIRECT;
277 				flags &= ~SO_SOCKSTR;
278 			}
279 		}
280 	}
281 
282 	/*
283 	 * Tell local transport that it is talking to sockets.
284 	 */
285 	if (so->so_family == AF_UNIX) {
286 		flags |= SO_SOCKSTR;
287 	}
288 
289 	/* Initialize the kernel SSL proxy fields */
290 	so->so_kssl_type = KSSL_NO_PROXY;
291 	so->so_kssl_ent = NULL;
292 	so->so_kssl_ctx = NULL;
293 
294 	if (error = socktpi_open(&vp, flags, CRED(), NULL)) {
295 		VN_RELE(vp);
296 		*errorp = error;
297 		return (NULL);
298 	}
299 
300 	if (error = so_strinit(so, tso)) {
301 		(void) VOP_CLOSE(vp, 0, 1, 0, CRED(), NULL);
302 		VN_RELE(vp);
303 		*errorp = error;
304 		return (NULL);
305 	}
306 
307 	if (version == SOV_DEFAULT)
308 		version = so_default_version;
309 
310 	so->so_version = (short)version;
311 
312 	return (so);
313 }
314 
315 /*
316  * Bind the socket to an unspecified address in sockfs only.
317  * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
318  * required in all cases.
319  */
320 static void
321 so_automatic_bind(struct sonode *so)
322 {
323 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
324 
325 	ASSERT(MUTEX_HELD(&so->so_lock));
326 	ASSERT(!(so->so_state & SS_ISBOUND));
327 	ASSERT(so->so_unbind_mp);
328 
329 	ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
330 	bzero(so->so_laddr_sa, so->so_laddr_len);
331 	so->so_laddr_sa->sa_family = so->so_family;
332 	so->so_state |= SS_ISBOUND;
333 }
334 
335 
336 /*
337  * bind the socket.
338  *
339  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
340  * are passed in we allow rebinding. Note that for backwards compatibility
341  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
342  * Thus the rebinding code is currently not executed.
343  *
344  * The constraints for rebinding are:
345  * - it is a SOCK_DGRAM, or
346  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
347  *   and no listen() has been done.
348  * This rebinding code was added based on some language in the XNET book
349  * about not returning EINVAL if the protocol allows rebinding. However,
350  * this language is not present in the Posix socket draft. Thus maybe the
351  * rebinding logic should be deleted from the source.
352  *
353  * A null "name" can be used to unbind the socket if:
354  * - it is a SOCK_DGRAM, or
355  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
356  *   and no listen() has been done.
357  */
358 static int
359 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
360     socklen_t namelen, int backlog, int flags)
361 {
362 	struct T_bind_req	bind_req;
363 	struct T_bind_ack	*bind_ack;
364 	int			error = 0;
365 	mblk_t			*mp;
366 	void			*addr;
367 	t_uscalar_t		addrlen;
368 	int			unbind_on_err = 1;
369 	boolean_t		clear_acceptconn_on_err = B_FALSE;
370 	boolean_t		restore_backlog_on_err = B_FALSE;
371 	int			save_so_backlog;
372 	t_scalar_t		PRIM_type = O_T_BIND_REQ;
373 	boolean_t		tcp_udp_xport;
374 	void			*nl7c = NULL;
375 
376 	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
377 	    so, name, namelen, backlog, flags,
378 	    pr_state(so->so_state, so->so_mode)));
379 
380 	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
381 
382 	if (!(flags & _SOBIND_LOCK_HELD)) {
383 		mutex_enter(&so->so_lock);
384 		so_lock_single(so);	/* Set SOLOCKED */
385 	} else {
386 		ASSERT(MUTEX_HELD(&so->so_lock));
387 		ASSERT(so->so_flag & SOLOCKED);
388 	}
389 
390 	/*
391 	 * Make sure that there is a preallocated unbind_req message
392 	 * before binding. This message allocated when the socket is
393 	 * created  but it might be have been consumed.
394 	 */
395 	if (so->so_unbind_mp == NULL) {
396 		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
397 		/* NOTE: holding so_lock while sleeping */
398 		so->so_unbind_mp =
399 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP);
400 	}
401 
402 	if (flags & _SOBIND_REBIND) {
403 		/*
404 		 * Called from solisten after doing an sotpi_unbind() or
405 		 * potentially without the unbind (latter for AF_INET{,6}).
406 		 */
407 		ASSERT(name == NULL && namelen == 0);
408 
409 		if (so->so_family == AF_UNIX) {
410 			ASSERT(so->so_ux_bound_vp);
411 			addr = &so->so_ux_laddr;
412 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
413 			dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
414 			    "addr 0x%p, vp %p\n",
415 			    addrlen,
416 			    ((struct so_ux_addr *)addr)->soua_vp,
417 			    so->so_ux_bound_vp));
418 		} else {
419 			addr = so->so_laddr_sa;
420 			addrlen = (t_uscalar_t)so->so_laddr_len;
421 		}
422 	} else if (flags & _SOBIND_UNSPEC) {
423 		ASSERT(name == NULL && namelen == 0);
424 
425 		/*
426 		 * The caller checked SS_ISBOUND but not necessarily
427 		 * under so_lock
428 		 */
429 		if (so->so_state & SS_ISBOUND) {
430 			/* No error */
431 			goto done;
432 		}
433 
434 		/* Set an initial local address */
435 		switch (so->so_family) {
436 		case AF_UNIX:
437 			/*
438 			 * Use an address with same size as struct sockaddr
439 			 * just like BSD.
440 			 */
441 			so->so_laddr_len =
442 			    (socklen_t)sizeof (struct sockaddr);
443 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
444 			bzero(so->so_laddr_sa, so->so_laddr_len);
445 			so->so_laddr_sa->sa_family = so->so_family;
446 
447 			/*
448 			 * Pass down an address with the implicit bind
449 			 * magic number and the rest all zeros.
450 			 * The transport will return a unique address.
451 			 */
452 			so->so_ux_laddr.soua_vp = NULL;
453 			so->so_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
454 			addr = &so->so_ux_laddr;
455 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
456 			break;
457 
458 		case AF_INET:
459 		case AF_INET6:
460 			/*
461 			 * An unspecified bind in TPI has a NULL address.
462 			 * Set the address in sockfs to have the sa_family.
463 			 */
464 			so->so_laddr_len = (so->so_family == AF_INET) ?
465 			    (socklen_t)sizeof (sin_t) :
466 			    (socklen_t)sizeof (sin6_t);
467 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
468 			bzero(so->so_laddr_sa, so->so_laddr_len);
469 			so->so_laddr_sa->sa_family = so->so_family;
470 			addr = NULL;
471 			addrlen = 0;
472 			break;
473 
474 		default:
475 			/*
476 			 * An unspecified bind in TPI has a NULL address.
477 			 * Set the address in sockfs to be zero length.
478 			 *
479 			 * Can not assume there is a sa_family for all
480 			 * protocol families. For example, AF_X25 does not
481 			 * have a family field.
482 			 */
483 			bzero(so->so_laddr_sa, so->so_laddr_len);
484 			so->so_laddr_len = 0;	/* XXX correct? */
485 			addr = NULL;
486 			addrlen = 0;
487 			break;
488 		}
489 
490 	} else {
491 		if (so->so_state & SS_ISBOUND) {
492 			/*
493 			 * If it is ok to rebind the socket, first unbind
494 			 * with the transport. A rebind to the NULL address
495 			 * is interpreted as an unbind.
496 			 * Note that a bind to NULL in BSD does unbind the
497 			 * socket but it fails with EINVAL.
498 			 * Note that regular sockets set SOV_SOCKBSD i.e.
499 			 * _SOBIND_SOCKBSD gets set here hence no type of
500 			 * socket does currently allow rebinding.
501 			 *
502 			 * If the name is NULL just do an unbind.
503 			 */
504 			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
505 			    name != NULL) {
506 				error = EINVAL;
507 				unbind_on_err = 0;
508 				eprintsoline(so, error);
509 				goto done;
510 			}
511 			if ((so->so_mode & SM_CONNREQUIRED) &&
512 			    (so->so_state & SS_CANTREBIND)) {
513 				error = EINVAL;
514 				unbind_on_err = 0;
515 				eprintsoline(so, error);
516 				goto done;
517 			}
518 			error = sotpi_unbind(so, 0);
519 			if (error) {
520 				eprintsoline(so, error);
521 				goto done;
522 			}
523 			ASSERT(!(so->so_state & SS_ISBOUND));
524 			if (name == NULL) {
525 				so->so_state &=
526 				    ~(SS_ISCONNECTED|SS_ISCONNECTING);
527 				goto done;
528 			}
529 		}
530 		/* X/Open requires this check */
531 		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
532 			if (xnet_check_print) {
533 				printf("sockfs: X/Open bind state check "
534 				    "caused EINVAL\n");
535 			}
536 			error = EINVAL;
537 			goto done;
538 		}
539 
540 		switch (so->so_family) {
541 		case AF_UNIX:
542 			/*
543 			 * All AF_UNIX addresses are nul terminated
544 			 * when copied (copyin_name) in so the minimum
545 			 * length is 3 bytes.
546 			 */
547 			if (name == NULL ||
548 			    (ssize_t)namelen <= sizeof (short) + 1) {
549 				error = EISDIR;
550 				eprintsoline(so, error);
551 				goto done;
552 			}
553 			/*
554 			 * Verify so_family matches the bound family.
555 			 * BSD does not check this for AF_UNIX resulting
556 			 * in funny mknods.
557 			 */
558 			if (name->sa_family != so->so_family) {
559 				error = EAFNOSUPPORT;
560 				goto done;
561 			}
562 			break;
563 		case AF_INET:
564 			if (name == NULL) {
565 				error = EINVAL;
566 				eprintsoline(so, error);
567 				goto done;
568 			}
569 			if ((size_t)namelen != sizeof (sin_t)) {
570 				error = name->sa_family != so->so_family ?
571 				    EAFNOSUPPORT : EINVAL;
572 				eprintsoline(so, error);
573 				goto done;
574 			}
575 			if ((flags & _SOBIND_XPG4_2) &&
576 			    (name->sa_family != so->so_family)) {
577 				/*
578 				 * This check has to be made for X/Open
579 				 * sockets however application failures have
580 				 * been observed when it is applied to
581 				 * all sockets.
582 				 */
583 				error = EAFNOSUPPORT;
584 				eprintsoline(so, error);
585 				goto done;
586 			}
587 			/*
588 			 * Force a zero sa_family to match so_family.
589 			 *
590 			 * Some programs like inetd(1M) don't set the
591 			 * family field. Other programs leave
592 			 * sin_family set to garbage - SunOS 4.X does
593 			 * not check the family field on a bind.
594 			 * We use the family field that
595 			 * was passed in to the socket() call.
596 			 */
597 			name->sa_family = so->so_family;
598 			break;
599 
600 		case AF_INET6: {
601 #ifdef DEBUG
602 			sin6_t *sin6 = (sin6_t *)name;
603 #endif /* DEBUG */
604 
605 			if (name == NULL) {
606 				error = EINVAL;
607 				eprintsoline(so, error);
608 				goto done;
609 			}
610 			if ((size_t)namelen != sizeof (sin6_t)) {
611 				error = name->sa_family != so->so_family ?
612 				    EAFNOSUPPORT : EINVAL;
613 				eprintsoline(so, error);
614 				goto done;
615 			}
616 			if (name->sa_family != so->so_family) {
617 				/*
618 				 * With IPv6 we require the family to match
619 				 * unlike in IPv4.
620 				 */
621 				error = EAFNOSUPPORT;
622 				eprintsoline(so, error);
623 				goto done;
624 			}
625 #ifdef DEBUG
626 			/*
627 			 * Verify that apps don't forget to clear
628 			 * sin6_scope_id etc
629 			 */
630 			if (sin6->sin6_scope_id != 0 &&
631 			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
632 				zcmn_err(getzoneid(), CE_WARN,
633 				    "bind with uninitialized sin6_scope_id "
634 				    "(%d) on socket. Pid = %d\n",
635 				    (int)sin6->sin6_scope_id,
636 				    (int)curproc->p_pid);
637 			}
638 			if (sin6->__sin6_src_id != 0) {
639 				zcmn_err(getzoneid(), CE_WARN,
640 				    "bind with uninitialized __sin6_src_id "
641 				    "(%d) on socket. Pid = %d\n",
642 				    (int)sin6->__sin6_src_id,
643 				    (int)curproc->p_pid);
644 			}
645 #endif /* DEBUG */
646 			break;
647 		}
648 		default:
649 			/*
650 			 * Don't do any length or sa_family check to allow
651 			 * non-sockaddr style addresses.
652 			 */
653 			if (name == NULL) {
654 				error = EINVAL;
655 				eprintsoline(so, error);
656 				goto done;
657 			}
658 			break;
659 		}
660 
661 		if (namelen > (t_uscalar_t)so->so_laddr_maxlen) {
662 			error = ENAMETOOLONG;
663 			eprintsoline(so, error);
664 			goto done;
665 		}
666 		/*
667 		 * Save local address.
668 		 */
669 		so->so_laddr_len = (socklen_t)namelen;
670 		ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
671 		bcopy(name, so->so_laddr_sa, namelen);
672 
673 		addr = so->so_laddr_sa;
674 		addrlen = (t_uscalar_t)so->so_laddr_len;
675 		switch (so->so_family) {
676 		case AF_INET6:
677 		case AF_INET:
678 			break;
679 		case AF_UNIX: {
680 			struct sockaddr_un *soun =
681 			    (struct sockaddr_un *)so->so_laddr_sa;
682 			struct vnode *vp;
683 			struct vattr vattr;
684 
685 			ASSERT(so->so_ux_bound_vp == NULL);
686 			/*
687 			 * Create vnode for the specified path name.
688 			 * Keep vnode held with a reference in so_ux_bound_vp.
689 			 * Use the vnode pointer as the address used in the
690 			 * bind with the transport.
691 			 *
692 			 * Use the same mode as in BSD. In particular this does
693 			 * not observe the umask.
694 			 */
695 			/* MAXPATHLEN + soun_family + nul termination */
696 			if (so->so_laddr_len >
697 			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
698 				error = ENAMETOOLONG;
699 				eprintsoline(so, error);
700 				goto done;
701 			}
702 			vattr.va_type = VSOCK;
703 			vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
704 			vattr.va_mask = AT_TYPE|AT_MODE;
705 			/* NOTE: holding so_lock */
706 			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
707 			    EXCL, 0, &vp, CRMKNOD, 0, 0);
708 			if (error) {
709 				if (error == EEXIST)
710 					error = EADDRINUSE;
711 				eprintsoline(so, error);
712 				goto done;
713 			}
714 			/*
715 			 * Establish pointer from the underlying filesystem
716 			 * vnode to the socket node.
717 			 * so_ux_bound_vp and v_stream->sd_vnode form the
718 			 * cross-linkage between the underlying filesystem
719 			 * node and the socket node.
720 			 */
721 			ASSERT(SOTOV(so)->v_stream);
722 			mutex_enter(&vp->v_lock);
723 			vp->v_stream = SOTOV(so)->v_stream;
724 			so->so_ux_bound_vp = vp;
725 			mutex_exit(&vp->v_lock);
726 
727 			/*
728 			 * Use the vnode pointer value as a unique address
729 			 * (together with the magic number to avoid conflicts
730 			 * with implicit binds) in the transport provider.
731 			 */
732 			so->so_ux_laddr.soua_vp = (void *)so->so_ux_bound_vp;
733 			so->so_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
734 			addr = &so->so_ux_laddr;
735 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
736 			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
737 			    addrlen,
738 			    ((struct so_ux_addr *)addr)->soua_vp));
739 			break;
740 		}
741 		} /* end switch (so->so_family) */
742 	}
743 
744 	/*
745 	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
746 	 * the transport can start passing up T_CONN_IND messages
747 	 * as soon as it receives the bind req and strsock_proto()
748 	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
749 	 */
750 	if (flags & _SOBIND_LISTEN) {
751 		if ((so->so_state & SS_ACCEPTCONN) == 0)
752 			clear_acceptconn_on_err = B_TRUE;
753 		save_so_backlog = so->so_backlog;
754 		restore_backlog_on_err = B_TRUE;
755 		so->so_state |= SS_ACCEPTCONN;
756 		so->so_backlog = backlog;
757 	}
758 
759 	/*
760 	 * If NL7C addr(s) have been configured check for addr/port match,
761 	 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
762 	 *
763 	 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
764 	 * family sockets only. If match mark as such.
765 	 */
766 	if (nl7c_enabled && ((addr != NULL &&
767 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
768 	    (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
769 	    so->so_nl7c_flags == NL7C_AF_NCA)) {
770 		/*
771 		 * NL7C is not supported in non-global zones,
772 		 * we enforce this restriction here.
773 		 */
774 		if (so->so_zoneid == GLOBAL_ZONEID) {
775 			/* An NL7C socket, mark it */
776 			so->so_nl7c_flags |= NL7C_ENABLED;
777 			if (nl7c == NULL) {
778 				/*
779 				 * Was an AF_NCA bind() so add it to the
780 				 * addr list for reporting purposes.
781 				 */
782 				nl7c = nl7c_add_addr(addr, addrlen);
783 			}
784 		} else
785 			nl7c = NULL;
786 	}
787 	/*
788 	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
789 	 * for other transports we will send in a O_T_BIND_REQ.
790 	 */
791 	if (tcp_udp_xport &&
792 	    (so->so_family == AF_INET || so->so_family == AF_INET6))
793 		PRIM_type = T_BIND_REQ;
794 
795 	bind_req.PRIM_type = PRIM_type;
796 	bind_req.ADDR_length = addrlen;
797 	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
798 	bind_req.CONIND_number = backlog;
799 	/* NOTE: holding so_lock while sleeping */
800 	mp = soallocproto2(&bind_req, sizeof (bind_req),
801 	    addr, addrlen, 0, _ALLOC_SLEEP);
802 	so->so_state &= ~SS_LADDR_VALID;
803 
804 	/* Done using so_laddr_sa - can drop the lock */
805 	mutex_exit(&so->so_lock);
806 
807 	/*
808 	 * Intercept the bind_req message here to check if this <address/port>
809 	 * was configured as an SSL proxy server, or if another endpoint was
810 	 * already configured to act as a proxy for us.
811 	 *
812 	 * Note, only if NL7C not enabled for this socket.
813 	 */
814 	if (nl7c == NULL &&
815 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
816 	    so->so_type == SOCK_STREAM) {
817 
818 		if (so->so_kssl_ent != NULL) {
819 			kssl_release_ent(so->so_kssl_ent, so, so->so_kssl_type);
820 			so->so_kssl_ent = NULL;
821 		}
822 
823 		so->so_kssl_type = kssl_check_proxy(mp, so, &so->so_kssl_ent);
824 		switch (so->so_kssl_type) {
825 		case KSSL_NO_PROXY:
826 			break;
827 
828 		case KSSL_HAS_PROXY:
829 			mutex_enter(&so->so_lock);
830 			goto skip_transport;
831 
832 		case KSSL_IS_PROXY:
833 			break;
834 		}
835 	}
836 
837 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
838 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
839 	if (error) {
840 		eprintsoline(so, error);
841 		mutex_enter(&so->so_lock);
842 		goto done;
843 	}
844 
845 	mutex_enter(&so->so_lock);
846 	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
847 	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
848 	if (error) {
849 		eprintsoline(so, error);
850 		goto done;
851 	}
852 skip_transport:
853 	ASSERT(mp);
854 	/*
855 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
856 	 * strsock_proto while the lock was dropped above, the bind
857 	 * is allowed to complete.
858 	 */
859 
860 	/* Mark as bound. This will be undone if we detect errors below. */
861 	if (flags & _SOBIND_NOXLATE) {
862 		ASSERT(so->so_family == AF_UNIX);
863 		so->so_state |= SS_FADDR_NOXLATE;
864 	}
865 	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
866 	so->so_state |= SS_ISBOUND;
867 	ASSERT(so->so_unbind_mp);
868 
869 	/* note that we've already set SS_ACCEPTCONN above */
870 
871 	/*
872 	 * Recompute addrlen - an unspecified bind sent down an
873 	 * address of length zero but we expect the appropriate length
874 	 * in return.
875 	 */
876 	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
877 	    sizeof (so->so_ux_laddr) : so->so_laddr_len);
878 
879 	bind_ack = (struct T_bind_ack *)mp->b_rptr;
880 	/*
881 	 * The alignment restriction is really too strict but
882 	 * we want enough alignment to inspect the fields of
883 	 * a sockaddr_in.
884 	 */
885 	addr = sogetoff(mp, bind_ack->ADDR_offset,
886 	    bind_ack->ADDR_length,
887 	    __TPI_ALIGN_SIZE);
888 	if (addr == NULL) {
889 		freemsg(mp);
890 		error = EPROTO;
891 		eprintsoline(so, error);
892 		goto done;
893 	}
894 	if (!(flags & _SOBIND_UNSPEC)) {
895 		/*
896 		 * Verify that the transport didn't return something we
897 		 * did not want e.g. an address other than what we asked for.
898 		 *
899 		 * NOTE: These checks would go away if/when we switch to
900 		 * using the new TPI (in which the transport would fail
901 		 * the request instead of assigning a different address).
902 		 *
903 		 * NOTE2: For protocols that we don't know (i.e. any
904 		 * other than AF_INET6, AF_INET and AF_UNIX), we
905 		 * cannot know if the transport should be expected to
906 		 * return the same address as that requested.
907 		 *
908 		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
909 		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
910 		 *
911 		 * For example, in the case of netatalk it may be
912 		 * inappropriate for the transport to return the
913 		 * requested address (as it may have allocated a local
914 		 * port number in behaviour similar to that of an
915 		 * AF_INET bind request with a port number of zero).
916 		 *
917 		 * Given the definition of O_T_BIND_REQ, where the
918 		 * transport may bind to an address other than the
919 		 * requested address, it's not possible to determine
920 		 * whether a returned address that differs from the
921 		 * requested address is a reason to fail (because the
922 		 * requested address was not available) or succeed
923 		 * (because the transport allocated an appropriate
924 		 * address and/or port).
925 		 *
926 		 * sockfs currently requires that the transport return
927 		 * the requested address in the T_BIND_ACK, unless
928 		 * there is code here to allow for any discrepancy.
929 		 * Such code exists for AF_INET and AF_INET6.
930 		 *
931 		 * Netatalk chooses to return the requested address
932 		 * rather than the (correct) allocated address.  This
933 		 * means that netatalk violates the TPI specification
934 		 * (and would not function correctly if used from a
935 		 * TLI application), but it does mean that it works
936 		 * with sockfs.
937 		 *
938 		 * As noted above, using the newer XTI bind primitive
939 		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
940 		 * allow sockfs to be more sure about whether or not
941 		 * the bind request had succeeded (as transports are
942 		 * not permitted to bind to a different address than
943 		 * that requested - they must return failure).
944 		 * Unfortunately, support for T_BIND_REQ may not be
945 		 * present in all transport implementations (netatalk,
946 		 * for example, doesn't have it), making the
947 		 * transition difficult.
948 		 */
949 		if (bind_ack->ADDR_length != addrlen) {
950 			/* Assumes that the requested address was in use */
951 			freemsg(mp);
952 			error = EADDRINUSE;
953 			eprintsoline(so, error);
954 			goto done;
955 		}
956 
957 		switch (so->so_family) {
958 		case AF_INET6:
959 		case AF_INET: {
960 			sin_t *rname, *aname;
961 
962 			rname = (sin_t *)addr;
963 			aname = (sin_t *)so->so_laddr_sa;
964 
965 			/*
966 			 * Take advantage of the alignment
967 			 * of sin_port and sin6_port which fall
968 			 * in the same place in their data structures.
969 			 * Just use sin_port for either address family.
970 			 *
971 			 * This may become a problem if (heaven forbid)
972 			 * there's a separate ipv6port_reserved... :-P
973 			 *
974 			 * Binding to port 0 has the semantics of letting
975 			 * the transport bind to any port.
976 			 *
977 			 * If the transport is TCP or UDP since we had sent
978 			 * a T_BIND_REQ we would not get a port other than
979 			 * what we asked for.
980 			 */
981 			if (tcp_udp_xport) {
982 				/*
983 				 * Pick up the new port number if we bound to
984 				 * port 0.
985 				 */
986 				if (aname->sin_port == 0)
987 					aname->sin_port = rname->sin_port;
988 				so->so_state |= SS_LADDR_VALID;
989 				break;
990 			}
991 			if (aname->sin_port != 0 &&
992 			    aname->sin_port != rname->sin_port) {
993 				freemsg(mp);
994 				error = EADDRINUSE;
995 				eprintsoline(so, error);
996 				goto done;
997 			}
998 			/*
999 			 * Pick up the new port number if we bound to port 0.
1000 			 */
1001 			aname->sin_port = rname->sin_port;
1002 
1003 			/*
1004 			 * Unfortunately, addresses aren't _quite_ the same.
1005 			 */
1006 			if (so->so_family == AF_INET) {
1007 				if (aname->sin_addr.s_addr !=
1008 				    rname->sin_addr.s_addr) {
1009 					freemsg(mp);
1010 					error = EADDRNOTAVAIL;
1011 					eprintsoline(so, error);
1012 					goto done;
1013 				}
1014 			} else {
1015 				sin6_t *rname6 = (sin6_t *)rname;
1016 				sin6_t *aname6 = (sin6_t *)aname;
1017 
1018 				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1019 				    &rname6->sin6_addr)) {
1020 					freemsg(mp);
1021 					error = EADDRNOTAVAIL;
1022 					eprintsoline(so, error);
1023 					goto done;
1024 				}
1025 			}
1026 			break;
1027 		}
1028 		case AF_UNIX:
1029 			if (bcmp(addr, &so->so_ux_laddr, addrlen) != 0) {
1030 				freemsg(mp);
1031 				error = EADDRINUSE;
1032 				eprintsoline(so, error);
1033 				eprintso(so,
1034 				    ("addrlen %d, addr 0x%x, vp %p\n",
1035 				    addrlen, *((int *)addr),
1036 				    so->so_ux_bound_vp));
1037 				goto done;
1038 			}
1039 			so->so_state |= SS_LADDR_VALID;
1040 			break;
1041 		default:
1042 			/*
1043 			 * NOTE: This assumes that addresses can be
1044 			 * byte-compared for equivalence.
1045 			 */
1046 			if (bcmp(addr, so->so_laddr_sa, addrlen) != 0) {
1047 				freemsg(mp);
1048 				error = EADDRINUSE;
1049 				eprintsoline(so, error);
1050 				goto done;
1051 			}
1052 			/*
1053 			 * Don't mark SS_LADDR_VALID, as we cannot be
1054 			 * sure that the returned address is the real
1055 			 * bound address when talking to an unknown
1056 			 * transport.
1057 			 */
1058 			break;
1059 		}
1060 	} else {
1061 		/*
1062 		 * Save for returned address for getsockname.
1063 		 * Needed for unspecific bind unless transport supports
1064 		 * the TI_GETMYNAME ioctl.
1065 		 * Do this for AF_INET{,6} even though they do, as
1066 		 * caching info here is much better performance than
1067 		 * a TPI/STREAMS trip to the transport for getsockname.
1068 		 * Any which can't for some reason _must_ _not_ set
1069 		 * LADDR_VALID here for the caching version of getsockname
1070 		 * to not break;
1071 		 */
1072 		switch (so->so_family) {
1073 		case AF_UNIX:
1074 			/*
1075 			 * Record the address bound with the transport
1076 			 * for use by socketpair.
1077 			 */
1078 			bcopy(addr, &so->so_ux_laddr, addrlen);
1079 			so->so_state |= SS_LADDR_VALID;
1080 			break;
1081 		case AF_INET:
1082 		case AF_INET6:
1083 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
1084 			bcopy(addr, so->so_laddr_sa, so->so_laddr_len);
1085 			so->so_state |= SS_LADDR_VALID;
1086 			break;
1087 		default:
1088 			/*
1089 			 * Don't mark SS_LADDR_VALID, as we cannot be
1090 			 * sure that the returned address is the real
1091 			 * bound address when talking to an unknown
1092 			 * transport.
1093 			 */
1094 			break;
1095 		}
1096 	}
1097 
1098 	if (nl7c != NULL) {
1099 		/* Register listen()er sonode pointer with NL7C */
1100 		nl7c_listener_addr(nl7c, so);
1101 	}
1102 
1103 	freemsg(mp);
1104 
1105 done:
1106 	if (error) {
1107 		/* reset state & backlog to values held on entry */
1108 		if (clear_acceptconn_on_err == B_TRUE)
1109 			so->so_state &= ~SS_ACCEPTCONN;
1110 		if (restore_backlog_on_err == B_TRUE)
1111 			so->so_backlog = save_so_backlog;
1112 
1113 		if (unbind_on_err && so->so_state & SS_ISBOUND) {
1114 			int err;
1115 
1116 			err = sotpi_unbind(so, 0);
1117 			/* LINTED - statement has no consequent: if */
1118 			if (err) {
1119 				eprintsoline(so, error);
1120 			} else {
1121 				ASSERT(!(so->so_state & SS_ISBOUND));
1122 			}
1123 		}
1124 	}
1125 	if (!(flags & _SOBIND_LOCK_HELD)) {
1126 		so_unlock_single(so, SOLOCKED);
1127 		mutex_exit(&so->so_lock);
1128 	} else {
1129 		/* If the caller held the lock don't release it here */
1130 		ASSERT(MUTEX_HELD(&so->so_lock));
1131 		ASSERT(so->so_flag & SOLOCKED);
1132 	}
1133 	return (error);
1134 }
1135 
1136 /* bind the socket */
1137 static int
1138 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1139     int flags)
1140 {
1141 	if ((flags & _SOBIND_SOCKETPAIR) == 0)
1142 		return (sotpi_bindlisten(so, name, namelen, 0, flags));
1143 
1144 	flags &= ~_SOBIND_SOCKETPAIR;
1145 	return (sotpi_bindlisten(so, name, namelen, 1, flags));
1146 }
1147 
1148 /*
1149  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1150  * address, or when listen needs to unbind and bind.
1151  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1152  * so that a sobind can pick them up.
1153  */
1154 static int
1155 sotpi_unbind(struct sonode *so, int flags)
1156 {
1157 	struct T_unbind_req	unbind_req;
1158 	int			error = 0;
1159 	mblk_t			*mp;
1160 
1161 	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1162 	    so, flags, pr_state(so->so_state, so->so_mode)));
1163 
1164 	ASSERT(MUTEX_HELD(&so->so_lock));
1165 	ASSERT(so->so_flag & SOLOCKED);
1166 
1167 	if (!(so->so_state & SS_ISBOUND)) {
1168 		error = EINVAL;
1169 		eprintsoline(so, error);
1170 		goto done;
1171 	}
1172 
1173 	mutex_exit(&so->so_lock);
1174 
1175 	/*
1176 	 * Flush the read and write side (except stream head read queue)
1177 	 * and send down T_UNBIND_REQ.
1178 	 */
1179 	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1180 
1181 	unbind_req.PRIM_type = T_UNBIND_REQ;
1182 	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1183 	    0, _ALLOC_SLEEP);
1184 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1185 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1186 	mutex_enter(&so->so_lock);
1187 	if (error) {
1188 		eprintsoline(so, error);
1189 		goto done;
1190 	}
1191 
1192 	error = sowaitokack(so, T_UNBIND_REQ);
1193 	if (error) {
1194 		eprintsoline(so, error);
1195 		goto done;
1196 	}
1197 
1198 	/*
1199 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1200 	 * strsock_proto while the lock was dropped above, the unbind
1201 	 * is allowed to complete.
1202 	 */
1203 	if (!(flags & _SOUNBIND_REBIND)) {
1204 		/*
1205 		 * Clear out bound address.
1206 		 */
1207 		vnode_t *vp;
1208 
1209 		if ((vp = so->so_ux_bound_vp) != NULL) {
1210 
1211 			/* Undo any SSL proxy setup */
1212 			if ((so->so_family == AF_INET ||
1213 			    so->so_family == AF_INET6) &&
1214 			    (so->so_type == SOCK_STREAM) &&
1215 			    (so->so_kssl_ent != NULL)) {
1216 				kssl_release_ent(so->so_kssl_ent, so,
1217 				    so->so_kssl_type);
1218 				so->so_kssl_ent = NULL;
1219 				so->so_kssl_type = KSSL_NO_PROXY;
1220 			}
1221 
1222 			so->so_ux_bound_vp = NULL;
1223 			vn_rele_stream(vp);
1224 		}
1225 		/* Clear out address */
1226 		so->so_laddr_len = 0;
1227 	}
1228 	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN|SS_LADDR_VALID);
1229 
1230 done:
1231 
1232 	/* If the caller held the lock don't release it here */
1233 	ASSERT(MUTEX_HELD(&so->so_lock));
1234 	ASSERT(so->so_flag & SOLOCKED);
1235 
1236 	return (error);
1237 }
1238 
1239 /*
1240  * listen on the socket.
1241  * For TPI conforming transports this has to first unbind with the transport
1242  * and then bind again using the new backlog.
1243  */
1244 int
1245 sotpi_listen(struct sonode *so, int backlog)
1246 {
1247 	int		error = 0;
1248 
1249 	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1250 	    so, backlog, pr_state(so->so_state, so->so_mode)));
1251 
1252 	if (so->so_serv_type == T_CLTS)
1253 		return (EOPNOTSUPP);
1254 
1255 	/*
1256 	 * If the socket is ready to accept connections already, then
1257 	 * return without doing anything.  This avoids a problem where
1258 	 * a second listen() call fails if a connection is pending and
1259 	 * leaves the socket unbound. Only when we are not unbinding
1260 	 * with the transport can we safely increase the backlog.
1261 	 */
1262 	if (so->so_state & SS_ACCEPTCONN &&
1263 	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1264 	    /*CONSTCOND*/
1265 	    !solisten_tpi_tcp))
1266 		return (0);
1267 
1268 	if (so->so_state & SS_ISCONNECTED)
1269 		return (EINVAL);
1270 
1271 	mutex_enter(&so->so_lock);
1272 	so_lock_single(so);	/* Set SOLOCKED */
1273 
1274 	if (backlog < 0)
1275 		backlog = 0;
1276 	/*
1277 	 * Use the same qlimit as in BSD. BSD checks the qlimit
1278 	 * before queuing the next connection implying that a
1279 	 * listen(sock, 0) allows one connection to be queued.
1280 	 * BSD also uses 1.5 times the requested backlog.
1281 	 *
1282 	 * XNS Issue 4 required a strict interpretation of the backlog.
1283 	 * This has been waived subsequently for Issue 4 and the change
1284 	 * incorporated in XNS Issue 5. So we aren't required to do
1285 	 * anything special for XPG apps.
1286 	 */
1287 	if (backlog >= (INT_MAX - 1) / 3)
1288 		backlog = INT_MAX;
1289 	else
1290 		backlog = backlog * 3 / 2 + 1;
1291 
1292 	/*
1293 	 * If the listen doesn't change the backlog we do nothing.
1294 	 * This avoids an EPROTO error from the transport.
1295 	 */
1296 	if ((so->so_state & SS_ACCEPTCONN) &&
1297 	    so->so_backlog == backlog)
1298 		goto done;
1299 
1300 	if (!(so->so_state & SS_ISBOUND)) {
1301 		/*
1302 		 * Must have been explicitly bound in the UNIX domain.
1303 		 */
1304 		if (so->so_family == AF_UNIX) {
1305 			error = EINVAL;
1306 			goto done;
1307 		}
1308 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1309 		    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN);
1310 	} else if (backlog > 0) {
1311 		/*
1312 		 * AF_INET{,6} hack to avoid losing the port.
1313 		 * Assumes that all AF_INET{,6} transports can handle a
1314 		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1315 		 * has already bound thus it is possible to avoid the unbind.
1316 		 */
1317 		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1318 		    /*CONSTCOND*/
1319 		    !solisten_tpi_tcp)) {
1320 			error = sotpi_unbind(so, _SOUNBIND_REBIND);
1321 			if (error)
1322 				goto done;
1323 		}
1324 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1325 		    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN);
1326 	} else {
1327 		so->so_state |= SS_ACCEPTCONN;
1328 		so->so_backlog = backlog;
1329 	}
1330 	if (error)
1331 		goto done;
1332 	ASSERT(so->so_state & SS_ACCEPTCONN);
1333 done:
1334 	so_unlock_single(so, SOLOCKED);
1335 	mutex_exit(&so->so_lock);
1336 	return (error);
1337 }
1338 
1339 /*
1340  * Disconnect either a specified seqno or all (-1).
1341  * The former is used on listening sockets only.
1342  *
1343  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1344  * the current use of sodisconnect(seqno == -1) is only for shutdown
1345  * so there is no point (and potentially incorrect) to unbind.
1346  */
1347 int
1348 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1349 {
1350 	struct T_discon_req	discon_req;
1351 	int			error = 0;
1352 	mblk_t			*mp;
1353 
1354 	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1355 	    so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1356 
1357 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1358 		mutex_enter(&so->so_lock);
1359 		so_lock_single(so);	/* Set SOLOCKED */
1360 	} else {
1361 		ASSERT(MUTEX_HELD(&so->so_lock));
1362 		ASSERT(so->so_flag & SOLOCKED);
1363 	}
1364 
1365 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1366 		error = EINVAL;
1367 		eprintsoline(so, error);
1368 		goto done;
1369 	}
1370 
1371 	mutex_exit(&so->so_lock);
1372 	/*
1373 	 * Flush the write side (unless this is a listener)
1374 	 * and then send down a T_DISCON_REQ.
1375 	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1376 	 * and other messages.)
1377 	 */
1378 	if (!(so->so_state & SS_ACCEPTCONN))
1379 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1380 
1381 	discon_req.PRIM_type = T_DISCON_REQ;
1382 	discon_req.SEQ_number = seqno;
1383 	mp = soallocproto1(&discon_req, sizeof (discon_req),
1384 	    0, _ALLOC_SLEEP);
1385 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1386 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1387 	mutex_enter(&so->so_lock);
1388 	if (error) {
1389 		eprintsoline(so, error);
1390 		goto done;
1391 	}
1392 
1393 	error = sowaitokack(so, T_DISCON_REQ);
1394 	if (error) {
1395 		eprintsoline(so, error);
1396 		goto done;
1397 	}
1398 	/*
1399 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1400 	 * strsock_proto while the lock was dropped above, the disconnect
1401 	 * is allowed to complete. However, it is not possible to
1402 	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1403 	 */
1404 	so->so_state &=
1405 	    ~(SS_ISCONNECTED|SS_ISCONNECTING|SS_LADDR_VALID|SS_FADDR_VALID);
1406 done:
1407 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1408 		so_unlock_single(so, SOLOCKED);
1409 		mutex_exit(&so->so_lock);
1410 	} else {
1411 		/* If the caller held the lock don't release it here */
1412 		ASSERT(MUTEX_HELD(&so->so_lock));
1413 		ASSERT(so->so_flag & SOLOCKED);
1414 	}
1415 	return (error);
1416 }
1417 
1418 int
1419 sotpi_accept(struct sonode *so, int fflag, struct sonode **nsop)
1420 {
1421 	struct T_conn_ind	*conn_ind;
1422 	struct T_conn_res	*conn_res;
1423 	int			error = 0;
1424 	mblk_t			*mp, *ctxmp, *ack_mp;
1425 	struct sonode		*nso;
1426 	vnode_t			*nvp;
1427 	void			*src;
1428 	t_uscalar_t		srclen;
1429 	void			*opt;
1430 	t_uscalar_t		optlen;
1431 	t_scalar_t		PRIM_type;
1432 	t_scalar_t		SEQ_number;
1433 	size_t			sinlen;
1434 
1435 	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1436 	    so, fflag, nsop, pr_state(so->so_state, so->so_mode)));
1437 
1438 	/*
1439 	 * Defer single-threading the accepting socket until
1440 	 * the T_CONN_IND has been received and parsed and the
1441 	 * new sonode has been opened.
1442 	 */
1443 
1444 	/* Check that we are not already connected */
1445 	if ((so->so_state & SS_ACCEPTCONN) == 0)
1446 		goto conn_bad;
1447 again:
1448 	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1449 		goto e_bad;
1450 
1451 	ASSERT(mp);
1452 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1453 	ctxmp = mp->b_cont;
1454 
1455 	/*
1456 	 * Save SEQ_number for error paths.
1457 	 */
1458 	SEQ_number = conn_ind->SEQ_number;
1459 
1460 	srclen = conn_ind->SRC_length;
1461 	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1462 	if (src == NULL) {
1463 		error = EPROTO;
1464 		freemsg(mp);
1465 		eprintsoline(so, error);
1466 		goto disconnect_unlocked;
1467 	}
1468 	optlen = conn_ind->OPT_length;
1469 	switch (so->so_family) {
1470 	case AF_INET:
1471 	case AF_INET6:
1472 		if ((optlen == sizeof (intptr_t)) &&
1473 		    ((so->so_state & SS_DIRECT) != 0)) {
1474 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
1475 			    &opt, conn_ind->OPT_length);
1476 		} else {
1477 			/*
1478 			 * The transport (in this case TCP) hasn't sent up
1479 			 * a pointer to an instance for the accept fast-path.
1480 			 * Disable fast-path completely because the call to
1481 			 * sotpi_create() below would otherwise create an
1482 			 * incomplete TCP instance, which would lead to
1483 			 * problems when sockfs sends a normal T_CONN_RES
1484 			 * message down the new stream.
1485 			 */
1486 			if (so->so_state & SS_DIRECT) {
1487 				int rval;
1488 				/*
1489 				 * For consistency we inform tcp to disable
1490 				 * direct interface on the listener, though
1491 				 * we can certainly live without doing this
1492 				 * because no data will ever travel upstream
1493 				 * on the listening socket.
1494 				 */
1495 				so->so_state &= ~SS_DIRECT;
1496 				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1497 				    0, 0, K_TO_K, CRED(), &rval);
1498 			}
1499 			opt = NULL;
1500 			optlen = 0;
1501 		}
1502 		break;
1503 	case AF_UNIX:
1504 	default:
1505 		if (optlen != 0) {
1506 			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1507 			    __TPI_ALIGN_SIZE);
1508 			if (opt == NULL) {
1509 				error = EPROTO;
1510 				freemsg(mp);
1511 				eprintsoline(so, error);
1512 				goto disconnect_unlocked;
1513 			}
1514 		}
1515 		if (so->so_family == AF_UNIX) {
1516 			if (!(so->so_state & SS_FADDR_NOXLATE)) {
1517 				src = NULL;
1518 				srclen = 0;
1519 			}
1520 			/* Extract src address from options */
1521 			if (optlen != 0)
1522 				so_getopt_srcaddr(opt, optlen, &src, &srclen);
1523 		}
1524 		break;
1525 	}
1526 
1527 	/*
1528 	 * Create the new socket.
1529 	 */
1530 	VN_HOLD(so->so_accessvp);
1531 	nso = sotpi_create(so->so_accessvp, so->so_family, so->so_type,
1532 	    so->so_protocol, so->so_version, so, &error);
1533 	if (nso == NULL) {
1534 		ASSERT(error != 0);
1535 		/*
1536 		 * Accept can not fail with ENOBUFS. sotpi_create
1537 		 * sleeps waiting for memory until a signal is caught
1538 		 * so return EINTR.
1539 		 */
1540 		freemsg(mp);
1541 		if (error == ENOBUFS)
1542 			error = EINTR;
1543 		goto e_disc_unl;
1544 	}
1545 	nvp = SOTOV(nso);
1546 
1547 	/*
1548 	 * If the transport sent up an SSL connection context, then attach
1549 	 * it the new socket, and set the (sd_wputdatafunc)() and
1550 	 * (sd_rputdatafunc)() stream head hooks to intercept and process
1551 	 * SSL records.
1552 	 */
1553 	if (ctxmp != NULL) {
1554 		/*
1555 		 * This kssl_ctx_t is already held for us by the transport.
1556 		 * So, we don't need to do a kssl_hold_ctx() here.
1557 		 */
1558 		nso->so_kssl_ctx = *((kssl_ctx_t *)ctxmp->b_rptr);
1559 		freemsg(ctxmp);
1560 		mp->b_cont = NULL;
1561 		strsetrwputdatahooks(nvp, strsock_kssl_input,
1562 		    strsock_kssl_output);
1563 	}
1564 #ifdef DEBUG
1565 	/*
1566 	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1567 	 * it's inherited early to allow debugging of the accept code itself.
1568 	 */
1569 	nso->so_options |= so->so_options & SO_DEBUG;
1570 #endif /* DEBUG */
1571 
1572 	/*
1573 	 * Save the SRC address from the T_CONN_IND
1574 	 * for getpeername to work on AF_UNIX and on transports that do not
1575 	 * support TI_GETPEERNAME.
1576 	 *
1577 	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1578 	 * copyin_name().
1579 	 */
1580 	if (srclen > (t_uscalar_t)nso->so_faddr_maxlen) {
1581 		error = EINVAL;
1582 		freemsg(mp);
1583 		eprintsoline(so, error);
1584 		goto disconnect_vp_unlocked;
1585 	}
1586 	nso->so_faddr_len = (socklen_t)srclen;
1587 	ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
1588 	bcopy(src, nso->so_faddr_sa, srclen);
1589 	nso->so_state |= SS_FADDR_VALID;
1590 
1591 	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1592 	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1593 		cred_t *cr;
1594 
1595 		if ((cr = DB_CRED(mp)) != NULL) {
1596 			crhold(cr);
1597 			nso->so_peercred = cr;
1598 			nso->so_cpid = DB_CPID(mp);
1599 		}
1600 		freemsg(mp);
1601 
1602 		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1603 		    sizeof (intptr_t), 0, _ALLOC_INTR);
1604 		if (mp == NULL) {
1605 			/*
1606 			 * Accept can not fail with ENOBUFS.
1607 			 * A signal was caught so return EINTR.
1608 			 */
1609 			error = EINTR;
1610 			eprintsoline(so, error);
1611 			goto disconnect_vp_unlocked;
1612 		}
1613 		conn_res = (struct T_conn_res *)mp->b_rptr;
1614 	} else {
1615 		nso->so_peercred = DB_CRED(mp);
1616 		nso->so_cpid = DB_CPID(mp);
1617 		DB_CRED(mp) = NULL;
1618 
1619 		mp->b_rptr = DB_BASE(mp);
1620 		conn_res = (struct T_conn_res *)mp->b_rptr;
1621 		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1622 	}
1623 
1624 	/*
1625 	 * New socket must be bound at least in sockfs and, except for AF_INET,
1626 	 * (or AF_INET6) it also has to be bound in the transport provider.
1627 	 * We set the local address in the sonode from the T_OK_ACK of the
1628 	 * T_CONN_RES. For this reason the address we bind to here isn't
1629 	 * important.
1630 	 */
1631 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1632 	    /*CONSTCOND*/
1633 	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1634 		/*
1635 		 * Optimization for AF_INET{,6} transports
1636 		 * that can handle a T_CONN_RES without being bound.
1637 		 */
1638 		mutex_enter(&nso->so_lock);
1639 		so_automatic_bind(nso);
1640 		mutex_exit(&nso->so_lock);
1641 	} else {
1642 		/* Perform NULL bind with the transport provider. */
1643 		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC)) != 0) {
1644 			ASSERT(error != ENOBUFS);
1645 			freemsg(mp);
1646 			eprintsoline(nso, error);
1647 			goto disconnect_vp_unlocked;
1648 		}
1649 	}
1650 
1651 	/*
1652 	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1653 	 * so that any data arriving on the new socket will cause the
1654 	 * appropriate signals to be delivered for the new socket.
1655 	 *
1656 	 * No other thread (except strsock_proto and strsock_misc)
1657 	 * can access the new socket thus we relax the locking.
1658 	 */
1659 	nso->so_pgrp = so->so_pgrp;
1660 	nso->so_state |= so->so_state & (SS_ASYNC|SS_FADDR_NOXLATE);
1661 
1662 	if (nso->so_pgrp != 0) {
1663 		if ((error = so_set_events(nso, nvp, CRED())) != 0) {
1664 			eprintsoline(nso, error);
1665 			error = 0;
1666 			nso->so_pgrp = 0;
1667 		}
1668 	}
1669 
1670 	/*
1671 	 * Make note of the socket level options. TCP and IP level options
1672 	 * are already inherited. We could do all this after accept is
1673 	 * successful but doing it here simplifies code and no harm done
1674 	 * for error case.
1675 	 */
1676 	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1677 	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1678 	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1679 	nso->so_sndbuf = so->so_sndbuf;
1680 	nso->so_rcvbuf = so->so_rcvbuf;
1681 	if (nso->so_options & SO_LINGER)
1682 		nso->so_linger = so->so_linger;
1683 
1684 	if ((so->so_state & SS_DIRECT) != 0) {
1685 
1686 		ASSERT(opt != NULL);
1687 
1688 		conn_res->OPT_length = optlen;
1689 		conn_res->OPT_offset = MBLKL(mp);
1690 		bcopy(&opt, mp->b_wptr, optlen);
1691 		mp->b_wptr += optlen;
1692 		conn_res->PRIM_type = T_CONN_RES;
1693 		conn_res->ACCEPTOR_id = 0;
1694 		PRIM_type = T_CONN_RES;
1695 
1696 		/* Send down the T_CONN_RES on acceptor STREAM */
1697 		error = kstrputmsg(SOTOV(nso), mp, NULL,
1698 		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1699 		if (error) {
1700 			mutex_enter(&so->so_lock);
1701 			so_lock_single(so);
1702 			eprintsoline(so, error);
1703 			goto disconnect_vp;
1704 		}
1705 		mutex_enter(&nso->so_lock);
1706 		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1707 		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1708 		if (error) {
1709 			mutex_exit(&nso->so_lock);
1710 			mutex_enter(&so->so_lock);
1711 			so_lock_single(so);
1712 			eprintsoline(so, error);
1713 			goto disconnect_vp;
1714 		}
1715 		if (nso->so_family == AF_INET) {
1716 			sin_t *sin;
1717 
1718 			sin = (sin_t *)(ack_mp->b_rptr +
1719 			    sizeof (struct T_ok_ack));
1720 			bcopy(sin, nso->so_laddr_sa, sizeof (sin_t));
1721 			nso->so_laddr_len = sizeof (sin_t);
1722 		} else {
1723 			sin6_t *sin6;
1724 
1725 			sin6 = (sin6_t *)(ack_mp->b_rptr +
1726 			    sizeof (struct T_ok_ack));
1727 			bcopy(sin6, nso->so_laddr_sa, sizeof (sin6_t));
1728 			nso->so_laddr_len = sizeof (sin6_t);
1729 		}
1730 		freemsg(ack_mp);
1731 
1732 		nso->so_state |= SS_ISCONNECTED | SS_LADDR_VALID;
1733 		nso->so_priv = opt;
1734 
1735 		if (so->so_nl7c_flags & NL7C_ENABLED) {
1736 			/*
1737 			 * A NL7C marked listen()er so the new socket
1738 			 * inherits the listen()er's NL7C state, except
1739 			 * for NL7C_POLLIN.
1740 			 *
1741 			 * Only call NL7C to process the new socket if
1742 			 * the listen socket allows blocking i/o.
1743 			 */
1744 			nso->so_nl7c_flags = so->so_nl7c_flags & (~NL7C_POLLIN);
1745 			if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
1746 				/*
1747 				 * Nonblocking accept() just make it
1748 				 * persist to defer processing to the
1749 				 * read-side syscall (e.g. read).
1750 				 */
1751 				nso->so_nl7c_flags |= NL7C_SOPERSIST;
1752 			} else if (nl7c_process(nso, B_FALSE)) {
1753 				/*
1754 				 * NL7C has completed processing on the
1755 				 * socket, close the socket and back to
1756 				 * the top to await the next T_CONN_IND.
1757 				 */
1758 				mutex_exit(&nso->so_lock);
1759 				(void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
1760 				    CRED(), NULL);
1761 				VN_RELE(nvp);
1762 				goto again;
1763 			}
1764 			/* Pass the new socket out */
1765 		}
1766 
1767 		mutex_exit(&nso->so_lock);
1768 
1769 		/*
1770 		 * It's possible, through the use of autopush for example,
1771 		 * that the acceptor stream may not support SS_DIRECT
1772 		 * semantics. If the new socket does not support SS_DIRECT
1773 		 * we issue a _SIOCSOCKFALLBACK to inform the transport
1774 		 * as we would in the I_PUSH case.
1775 		 */
1776 		if (!(nso->so_state & SS_DIRECT)) {
1777 			int	rval;
1778 
1779 			if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
1780 			    0, 0, K_TO_K, CRED(), &rval)) != 0) {
1781 				mutex_enter(&so->so_lock);
1782 				so_lock_single(so);
1783 				eprintsoline(so, error);
1784 				goto disconnect_vp;
1785 			}
1786 		}
1787 
1788 		/*
1789 		 * Pass out new socket.
1790 		 */
1791 		if (nsop != NULL)
1792 			*nsop = nso;
1793 
1794 		return (0);
1795 	}
1796 
1797 	/*
1798 	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
1799 	 * which don't support the FireEngine accept fast-path. It is also
1800 	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
1801 	 * again. Neither sockfs nor TCP attempt to find out if some other
1802 	 * random module has been inserted in between (in which case we
1803 	 * should follow TLI accept behaviour). We blindly assume the worst
1804 	 * case and revert back to old behaviour i.e. TCP will not send us
1805 	 * any option (eager) and the accept should happen on the listener
1806 	 * queue. Any queued T_conn_ind have already got their options removed
1807 	 * by so_sock2_stream() when "sockmod" was I_POP'd.
1808 	 */
1809 	/*
1810 	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
1811 	 */
1812 	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
1813 #ifdef	_ILP32
1814 		queue_t	*q;
1815 
1816 		/*
1817 		 * Find read queue in driver
1818 		 * Can safely do this since we "own" nso/nvp.
1819 		 */
1820 		q = strvp2wq(nvp)->q_next;
1821 		while (SAMESTR(q))
1822 			q = q->q_next;
1823 		q = RD(q);
1824 		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
1825 #else
1826 		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
1827 #endif	/* _ILP32 */
1828 		conn_res->PRIM_type = O_T_CONN_RES;
1829 		PRIM_type = O_T_CONN_RES;
1830 	} else {
1831 		conn_res->ACCEPTOR_id = nso->so_acceptor_id;
1832 		conn_res->PRIM_type = T_CONN_RES;
1833 		PRIM_type = T_CONN_RES;
1834 	}
1835 	conn_res->SEQ_number = SEQ_number;
1836 	conn_res->OPT_length = 0;
1837 	conn_res->OPT_offset = 0;
1838 
1839 	mutex_enter(&so->so_lock);
1840 	so_lock_single(so);	/* Set SOLOCKED */
1841 	mutex_exit(&so->so_lock);
1842 
1843 	error = kstrputmsg(SOTOV(so), mp, NULL,
1844 	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1845 	mutex_enter(&so->so_lock);
1846 	if (error) {
1847 		eprintsoline(so, error);
1848 		goto disconnect_vp;
1849 	}
1850 	error = sowaitprim(so, PRIM_type, T_OK_ACK,
1851 	    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1852 	if (error) {
1853 		eprintsoline(so, error);
1854 		goto disconnect_vp;
1855 	}
1856 	/*
1857 	 * If there is a sin/sin6 appended onto the T_OK_ACK use
1858 	 * that to set the local address. If this is not present
1859 	 * then we zero out the address and don't set the
1860 	 * SS_LADDR_VALID bit. For AF_UNIX endpoints we copy over
1861 	 * the pathname from the listening socket.
1862 	 */
1863 	sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
1864 	if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
1865 	    MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
1866 		ack_mp->b_rptr += sizeof (struct T_ok_ack);
1867 		bcopy(ack_mp->b_rptr, nso->so_laddr_sa, sinlen);
1868 		nso->so_laddr_len = sinlen;
1869 		nso->so_state |= SS_LADDR_VALID;
1870 	} else if (nso->so_family == AF_UNIX) {
1871 		ASSERT(so->so_family == AF_UNIX);
1872 		nso->so_laddr_len = so->so_laddr_len;
1873 		ASSERT(nso->so_laddr_len <= nso->so_laddr_maxlen);
1874 		bcopy(so->so_laddr_sa, nso->so_laddr_sa, nso->so_laddr_len);
1875 		nso->so_state |= SS_LADDR_VALID;
1876 	} else {
1877 		nso->so_laddr_len = so->so_laddr_len;
1878 		ASSERT(nso->so_laddr_len <= nso->so_laddr_maxlen);
1879 		bzero(nso->so_laddr_sa, nso->so_addr_size);
1880 		nso->so_laddr_sa->sa_family = nso->so_family;
1881 	}
1882 	freemsg(ack_mp);
1883 
1884 	so_unlock_single(so, SOLOCKED);
1885 	mutex_exit(&so->so_lock);
1886 
1887 	nso->so_state |= SS_ISCONNECTED;
1888 
1889 	/*
1890 	 * Pass out new socket.
1891 	 */
1892 	if (nsop != NULL)
1893 		*nsop = nso;
1894 
1895 	return (0);
1896 
1897 
1898 eproto_disc_unl:
1899 	error = EPROTO;
1900 e_disc_unl:
1901 	eprintsoline(so, error);
1902 	goto disconnect_unlocked;
1903 
1904 pr_disc_vp_unl:
1905 	eprintsoline(so, error);
1906 disconnect_vp_unlocked:
1907 	(void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL);
1908 	VN_RELE(nvp);
1909 disconnect_unlocked:
1910 	(void) sodisconnect(so, SEQ_number, 0);
1911 	return (error);
1912 
1913 pr_disc_vp:
1914 	eprintsoline(so, error);
1915 disconnect_vp:
1916 	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
1917 	so_unlock_single(so, SOLOCKED);
1918 	mutex_exit(&so->so_lock);
1919 	(void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL);
1920 	VN_RELE(nvp);
1921 	return (error);
1922 
1923 conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
1924 	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
1925 	    ? EOPNOTSUPP : EINVAL;
1926 e_bad:
1927 	eprintsoline(so, error);
1928 	return (error);
1929 }
1930 
1931 /*
1932  * connect a socket.
1933  *
1934  * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
1935  * unconnect (by specifying a null address).
1936  */
1937 int
1938 sotpi_connect(struct sonode *so,
1939 	const struct sockaddr *name,
1940 	socklen_t namelen,
1941 	int fflag,
1942 	int flags)
1943 {
1944 	struct T_conn_req	conn_req;
1945 	int			error = 0;
1946 	mblk_t			*mp;
1947 	void			*src;
1948 	socklen_t		srclen;
1949 	void			*addr;
1950 	socklen_t		addrlen;
1951 	boolean_t		need_unlock;
1952 
1953 	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
1954 	    so, name, namelen, fflag, flags,
1955 	    pr_state(so->so_state, so->so_mode)));
1956 
1957 	/*
1958 	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
1959 	 * avoid sleeping for memory with SOLOCKED held.
1960 	 * We know that the T_CONN_REQ can't be larger than 2 * so_faddr_maxlen
1961 	 * + sizeof (struct T_opthdr).
1962 	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
1963 	 * exceed so_faddr_maxlen).
1964 	 */
1965 	mp = soallocproto(sizeof (struct T_conn_req) +
1966 	    2 * so->so_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR);
1967 	if (mp == NULL) {
1968 		/*
1969 		 * Connect can not fail with ENOBUFS. A signal was
1970 		 * caught so return EINTR.
1971 		 */
1972 		error = EINTR;
1973 		eprintsoline(so, error);
1974 		return (error);
1975 	}
1976 
1977 	mutex_enter(&so->so_lock);
1978 	/*
1979 	 * Make sure there is a preallocated T_unbind_req message
1980 	 * before any binding. This message is allocated when the
1981 	 * socket is created. Since another thread can consume
1982 	 * so_unbind_mp by the time we return from so_lock_single(),
1983 	 * we should check the availability of so_unbind_mp after
1984 	 * we return from so_lock_single().
1985 	 */
1986 
1987 	so_lock_single(so);	/* Set SOLOCKED */
1988 	need_unlock = B_TRUE;
1989 
1990 	if (so->so_unbind_mp == NULL) {
1991 		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
1992 		/* NOTE: holding so_lock while sleeping */
1993 		so->so_unbind_mp =
1994 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR);
1995 		if (so->so_unbind_mp == NULL) {
1996 			error = EINTR;
1997 			goto done;
1998 		}
1999 	}
2000 
2001 	/*
2002 	 * Can't have done a listen before connecting.
2003 	 */
2004 	if (so->so_state & SS_ACCEPTCONN) {
2005 		error = EOPNOTSUPP;
2006 		goto done;
2007 	}
2008 
2009 	/*
2010 	 * Must be bound with the transport
2011 	 */
2012 	if (!(so->so_state & SS_ISBOUND)) {
2013 		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2014 		    /*CONSTCOND*/
2015 		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2016 			/*
2017 			 * Optimization for AF_INET{,6} transports
2018 			 * that can handle a T_CONN_REQ without being bound.
2019 			 */
2020 			so_automatic_bind(so);
2021 		} else {
2022 			error = sotpi_bind(so, NULL, 0,
2023 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD);
2024 			if (error)
2025 				goto done;
2026 		}
2027 		ASSERT(so->so_state & SS_ISBOUND);
2028 		flags |= _SOCONNECT_DID_BIND;
2029 	}
2030 
2031 	/*
2032 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
2033 	 * connect to a null address. This is the portable method to
2034 	 * unconnect a socket.
2035 	 */
2036 	if ((namelen >= sizeof (sa_family_t)) &&
2037 	    (name->sa_family == AF_UNSPEC)) {
2038 		name = NULL;
2039 		namelen = 0;
2040 	}
2041 
2042 	/*
2043 	 * Check that we are not already connected.
2044 	 * A connection-oriented socket cannot be reconnected.
2045 	 * A connected connection-less socket can be
2046 	 * - connected to a different address by a subsequent connect
2047 	 * - "unconnected" by a connect to the NULL address
2048 	 */
2049 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2050 		ASSERT(!(flags & _SOCONNECT_DID_BIND));
2051 		if (so->so_mode & SM_CONNREQUIRED) {
2052 			/* Connection-oriented socket */
2053 			error = so->so_state & SS_ISCONNECTED ?
2054 			    EISCONN : EALREADY;
2055 			goto done;
2056 		}
2057 		/* Connection-less socket */
2058 		if (name == NULL) {
2059 			/*
2060 			 * Remove the connected state and clear SO_DGRAM_ERRIND
2061 			 * since it was set when the socket was connected.
2062 			 * If this is UDP also send down a T_DISCON_REQ.
2063 			 */
2064 			int val;
2065 
2066 			if ((so->so_family == AF_INET ||
2067 			    so->so_family == AF_INET6) &&
2068 			    (so->so_type == SOCK_DGRAM ||
2069 			    so->so_type == SOCK_RAW) &&
2070 			    /*CONSTCOND*/
2071 			    !soconnect_tpi_udp) {
2072 				/* XXX What about implicitly unbinding here? */
2073 				error = sodisconnect(so, -1,
2074 				    _SODISCONNECT_LOCK_HELD);
2075 			} else {
2076 				so->so_state &=
2077 				    ~(SS_ISCONNECTED | SS_ISCONNECTING |
2078 				    SS_FADDR_VALID);
2079 				so->so_faddr_len = 0;
2080 			}
2081 
2082 			so_unlock_single(so, SOLOCKED);
2083 			mutex_exit(&so->so_lock);
2084 
2085 			val = 0;
2086 			(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2087 			    &val, (t_uscalar_t)sizeof (val));
2088 
2089 			mutex_enter(&so->so_lock);
2090 			so_lock_single(so);	/* Set SOLOCKED */
2091 			goto done;
2092 		}
2093 	}
2094 	ASSERT(so->so_state & SS_ISBOUND);
2095 
2096 	if (name == NULL || namelen == 0) {
2097 		error = EINVAL;
2098 		goto done;
2099 	}
2100 	/*
2101 	 * Mark the socket if so_faddr_sa represents the transport level
2102 	 * address.
2103 	 */
2104 	if (flags & _SOCONNECT_NOXLATE) {
2105 		struct sockaddr_ux	*soaddr_ux;
2106 
2107 		ASSERT(so->so_family == AF_UNIX);
2108 		if (namelen != sizeof (struct sockaddr_ux)) {
2109 			error = EINVAL;
2110 			goto done;
2111 		}
2112 		soaddr_ux = (struct sockaddr_ux *)name;
2113 		name = (struct sockaddr *)&soaddr_ux->sou_addr;
2114 		namelen = sizeof (soaddr_ux->sou_addr);
2115 		so->so_state |= SS_FADDR_NOXLATE;
2116 	}
2117 
2118 	/*
2119 	 * Length and family checks.
2120 	 */
2121 	error = so_addr_verify(so, name, namelen);
2122 	if (error)
2123 		goto bad;
2124 
2125 	/*
2126 	 * Save foreign address. Needed for AF_UNIX as well as
2127 	 * transport providers that do not support TI_GETPEERNAME.
2128 	 * Also used for cached foreign address for TCP and UDP.
2129 	 */
2130 	if (namelen > (t_uscalar_t)so->so_faddr_maxlen) {
2131 		error = EINVAL;
2132 		goto done;
2133 	}
2134 	so->so_faddr_len = (socklen_t)namelen;
2135 	ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
2136 	bcopy(name, so->so_faddr_sa, namelen);
2137 	so->so_state |= SS_FADDR_VALID;
2138 
2139 	if (so->so_family == AF_UNIX) {
2140 		if (so->so_state & SS_FADDR_NOXLATE) {
2141 			/*
2142 			 * Already have a transport internal address. Do not
2143 			 * pass any (transport internal) source address.
2144 			 */
2145 			addr = so->so_faddr_sa;
2146 			addrlen = (t_uscalar_t)so->so_faddr_len;
2147 			src = NULL;
2148 			srclen = 0;
2149 		} else {
2150 			/*
2151 			 * Pass the sockaddr_un source address as an option
2152 			 * and translate the remote address.
2153 			 * Holding so_lock thus so_laddr_sa can not change.
2154 			 */
2155 			src = so->so_laddr_sa;
2156 			srclen = (t_uscalar_t)so->so_laddr_len;
2157 			dprintso(so, 1,
2158 			    ("sotpi_connect UNIX: srclen %d, src %p\n",
2159 			    srclen, src));
2160 			error = so_ux_addr_xlate(so,
2161 			    so->so_faddr_sa, (socklen_t)so->so_faddr_len,
2162 			    (flags & _SOCONNECT_XPG4_2),
2163 			    &addr, &addrlen);
2164 			if (error)
2165 				goto bad;
2166 		}
2167 	} else {
2168 		addr = so->so_faddr_sa;
2169 		addrlen = (t_uscalar_t)so->so_faddr_len;
2170 		src = NULL;
2171 		srclen = 0;
2172 	}
2173 	/*
2174 	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2175 	 * option which asks the transport provider to send T_UDERR_IND
2176 	 * messages. These T_UDERR_IND messages are used to return connected
2177 	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2178 	 *
2179 	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2180 	 * we send down a T_CONN_REQ. This is needed to let the
2181 	 * transport assign a local address that is consistent with
2182 	 * the remote address. Applications depend on a getsockname()
2183 	 * after a connect() to retrieve the "source" IP address for
2184 	 * the connected socket.  Invalidate the cached local address
2185 	 * to force getsockname() to enquire of the transport.
2186 	 */
2187 	if (!(so->so_mode & SM_CONNREQUIRED)) {
2188 		/*
2189 		 * Datagram socket.
2190 		 */
2191 		int32_t val;
2192 
2193 		so_unlock_single(so, SOLOCKED);
2194 		mutex_exit(&so->so_lock);
2195 
2196 		val = 1;
2197 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2198 		    &val, (t_uscalar_t)sizeof (val));
2199 
2200 		mutex_enter(&so->so_lock);
2201 		so_lock_single(so);	/* Set SOLOCKED */
2202 		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2203 		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2204 		    soconnect_tpi_udp) {
2205 			soisconnected(so);
2206 			goto done;
2207 		}
2208 		/*
2209 		 * Send down T_CONN_REQ etc.
2210 		 * Clear fflag to avoid returning EWOULDBLOCK.
2211 		 */
2212 		fflag = 0;
2213 		ASSERT(so->so_family != AF_UNIX);
2214 		so->so_state &= ~SS_LADDR_VALID;
2215 	} else if (so->so_laddr_len != 0) {
2216 		/*
2217 		 * If the local address or port was "any" then it may be
2218 		 * changed by the transport as a result of the
2219 		 * connect.  Invalidate the cached version if we have one.
2220 		 */
2221 		switch (so->so_family) {
2222 		case AF_INET:
2223 			ASSERT(so->so_laddr_len == (socklen_t)sizeof (sin_t));
2224 			if (((sin_t *)so->so_laddr_sa)->sin_addr.s_addr ==
2225 			    INADDR_ANY ||
2226 			    ((sin_t *)so->so_laddr_sa)->sin_port == 0)
2227 				so->so_state &= ~SS_LADDR_VALID;
2228 			break;
2229 
2230 		case AF_INET6:
2231 			ASSERT(so->so_laddr_len == (socklen_t)sizeof (sin6_t));
2232 			if (IN6_IS_ADDR_UNSPECIFIED(
2233 			    &((sin6_t *)so->so_laddr_sa) ->sin6_addr) ||
2234 			    IN6_IS_ADDR_V4MAPPED_ANY(
2235 			    &((sin6_t *)so->so_laddr_sa)->sin6_addr) ||
2236 			    ((sin6_t *)so->so_laddr_sa)->sin6_port == 0)
2237 				so->so_state &= ~SS_LADDR_VALID;
2238 			break;
2239 
2240 		default:
2241 			break;
2242 		}
2243 	}
2244 
2245 	/*
2246 	 * Check for failure of an earlier call
2247 	 */
2248 	if (so->so_error != 0)
2249 		goto so_bad;
2250 
2251 	/*
2252 	 * Send down T_CONN_REQ. Message was allocated above.
2253 	 */
2254 	conn_req.PRIM_type = T_CONN_REQ;
2255 	conn_req.DEST_length = addrlen;
2256 	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2257 	if (srclen == 0) {
2258 		conn_req.OPT_length = 0;
2259 		conn_req.OPT_offset = 0;
2260 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2261 		soappendmsg(mp, addr, addrlen);
2262 	} else {
2263 		/*
2264 		 * There is a AF_UNIX sockaddr_un to include as a source
2265 		 * address option.
2266 		 */
2267 		struct T_opthdr toh;
2268 
2269 		toh.level = SOL_SOCKET;
2270 		toh.name = SO_SRCADDR;
2271 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2272 		toh.status = 0;
2273 		conn_req.OPT_length =
2274 		    (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2275 		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2276 		    _TPI_ALIGN_TOPT(addrlen));
2277 
2278 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2279 		soappendmsg(mp, addr, addrlen);
2280 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2281 		soappendmsg(mp, &toh, sizeof (toh));
2282 		soappendmsg(mp, src, srclen);
2283 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2284 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2285 	}
2286 	/*
2287 	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2288 	 * in order to have the right state when the T_CONN_CON shows up.
2289 	 */
2290 	soisconnecting(so);
2291 	mutex_exit(&so->so_lock);
2292 
2293 	if (audit_active)
2294 		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2295 
2296 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2297 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2298 	mp = NULL;
2299 	mutex_enter(&so->so_lock);
2300 	if (error != 0)
2301 		goto bad;
2302 
2303 	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2304 		goto bad;
2305 
2306 	/* Allow other threads to access the socket */
2307 	so_unlock_single(so, SOLOCKED);
2308 	need_unlock = B_FALSE;
2309 
2310 	/*
2311 	 * Wait until we get a T_CONN_CON or an error
2312 	 */
2313 	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2314 		so_lock_single(so);	/* Set SOLOCKED */
2315 		need_unlock = B_TRUE;
2316 	}
2317 
2318 done:
2319 	freemsg(mp);
2320 	switch (error) {
2321 	case EINPROGRESS:
2322 	case EALREADY:
2323 	case EISCONN:
2324 	case EINTR:
2325 		/* Non-fatal errors */
2326 		so->so_state &= ~SS_LADDR_VALID;
2327 		/* FALLTHRU */
2328 	case 0:
2329 		break;
2330 
2331 	case EHOSTUNREACH:
2332 		if (flags & _SOCONNECT_XPG4_2) {
2333 			/*
2334 			 * X/Open specification contains a requirement that
2335 			 * ENETUNREACH be returned but does not require
2336 			 * EHOSTUNREACH. In order to keep the test suite
2337 			 * happy we mess with the errno here.
2338 			 */
2339 			error = ENETUNREACH;
2340 		}
2341 		/* FALLTHRU */
2342 
2343 	default:
2344 		ASSERT(need_unlock);
2345 		/*
2346 		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2347 		 * and invalidate local-address cache
2348 		 */
2349 		so->so_state &= ~(SS_ISCONNECTING | SS_LADDR_VALID);
2350 		/* A discon_ind might have already unbound us */
2351 		if ((flags & _SOCONNECT_DID_BIND) &&
2352 		    (so->so_state & SS_ISBOUND)) {
2353 			int err;
2354 
2355 			err = sotpi_unbind(so, 0);
2356 			/* LINTED - statement has no conseq */
2357 			if (err) {
2358 				eprintsoline(so, err);
2359 			}
2360 		}
2361 		break;
2362 	}
2363 	if (need_unlock)
2364 		so_unlock_single(so, SOLOCKED);
2365 	mutex_exit(&so->so_lock);
2366 	return (error);
2367 
2368 so_bad:	error = sogeterr(so);
2369 bad:	eprintsoline(so, error);
2370 	goto done;
2371 }
2372 
2373 int
2374 sotpi_shutdown(struct sonode *so, int how)
2375 {
2376 	struct T_ordrel_req	ordrel_req;
2377 	mblk_t			*mp;
2378 	uint_t			old_state, state_change;
2379 	int			error = 0;
2380 
2381 	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2382 	    so, how, pr_state(so->so_state, so->so_mode)));
2383 
2384 	mutex_enter(&so->so_lock);
2385 	so_lock_single(so);	/* Set SOLOCKED */
2386 
2387 	/*
2388 	 * SunOS 4.X has no check for datagram sockets.
2389 	 * 5.X checks that it is connected (ENOTCONN)
2390 	 * X/Open requires that we check the connected state.
2391 	 */
2392 	if (!(so->so_state & SS_ISCONNECTED)) {
2393 		if (!xnet_skip_checks) {
2394 			error = ENOTCONN;
2395 			if (xnet_check_print) {
2396 				printf("sockfs: X/Open shutdown check "
2397 				    "caused ENOTCONN\n");
2398 			}
2399 		}
2400 		goto done;
2401 	}
2402 	/*
2403 	 * Record the current state and then perform any state changes.
2404 	 * Then use the difference between the old and new states to
2405 	 * determine which messages need to be sent.
2406 	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2407 	 * duplicate calls to shutdown().
2408 	 */
2409 	old_state = so->so_state;
2410 
2411 	switch (how) {
2412 	case 0:
2413 		socantrcvmore(so);
2414 		break;
2415 	case 1:
2416 		socantsendmore(so);
2417 		break;
2418 	case 2:
2419 		socantsendmore(so);
2420 		socantrcvmore(so);
2421 		break;
2422 	default:
2423 		error = EINVAL;
2424 		goto done;
2425 	}
2426 
2427 	/*
2428 	 * Assumes that the SS_CANT* flags are never cleared in the above code.
2429 	 */
2430 	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2431 	    (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2432 	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2433 
2434 	switch (state_change) {
2435 	case 0:
2436 		dprintso(so, 1,
2437 		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
2438 		    so->so_state));
2439 		goto done;
2440 
2441 	case SS_CANTRCVMORE:
2442 		mutex_exit(&so->so_lock);
2443 		strseteof(SOTOV(so), 1);
2444 		/*
2445 		 * strseteof takes care of read side wakeups,
2446 		 * pollwakeups, and signals.
2447 		 */
2448 		/*
2449 		 * Get the read lock before flushing data to avoid problems
2450 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2451 		 */
2452 		mutex_enter(&so->so_lock);
2453 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2454 		mutex_exit(&so->so_lock);
2455 
2456 		/* Flush read side queue */
2457 		strflushrq(SOTOV(so), FLUSHALL);
2458 
2459 		mutex_enter(&so->so_lock);
2460 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2461 		break;
2462 
2463 	case SS_CANTSENDMORE:
2464 		mutex_exit(&so->so_lock);
2465 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2466 		mutex_enter(&so->so_lock);
2467 		break;
2468 
2469 	case SS_CANTSENDMORE|SS_CANTRCVMORE:
2470 		mutex_exit(&so->so_lock);
2471 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2472 		strseteof(SOTOV(so), 1);
2473 		/*
2474 		 * strseteof takes care of read side wakeups,
2475 		 * pollwakeups, and signals.
2476 		 */
2477 		/*
2478 		 * Get the read lock before flushing data to avoid problems
2479 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2480 		 */
2481 		mutex_enter(&so->so_lock);
2482 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2483 		mutex_exit(&so->so_lock);
2484 
2485 		/* Flush read side queue */
2486 		strflushrq(SOTOV(so), FLUSHALL);
2487 
2488 		mutex_enter(&so->so_lock);
2489 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2490 		break;
2491 	}
2492 
2493 	ASSERT(MUTEX_HELD(&so->so_lock));
2494 
2495 	/*
2496 	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2497 	 * was set due to this call and the new state has both of them set:
2498 	 *	Send the AF_UNIX close indication
2499 	 *	For T_COTS send a discon_ind
2500 	 *
2501 	 * If cantsend was set due to this call:
2502 	 *	For T_COTSORD send an ordrel_ind
2503 	 *
2504 	 * Note that for T_CLTS there is no message sent here.
2505 	 */
2506 	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2507 	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2508 		/*
2509 		 * For SunOS 4.X compatibility we tell the other end
2510 		 * that we are unable to receive at this point.
2511 		 */
2512 		if (so->so_family == AF_UNIX && so->so_serv_type != T_CLTS)
2513 			so_unix_close(so);
2514 
2515 		if (so->so_serv_type == T_COTS)
2516 			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2517 	}
2518 	if ((state_change & SS_CANTSENDMORE) &&
2519 	    (so->so_serv_type == T_COTS_ORD)) {
2520 		/* Send an orderly release */
2521 		ordrel_req.PRIM_type = T_ORDREL_REQ;
2522 
2523 		mutex_exit(&so->so_lock);
2524 		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2525 		    0, _ALLOC_SLEEP);
2526 		/*
2527 		 * Send down the T_ORDREL_REQ even if there is flow control.
2528 		 * This prevents shutdown from blocking.
2529 		 * Note that there is no T_OK_ACK for ordrel_req.
2530 		 */
2531 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2532 		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2533 		mutex_enter(&so->so_lock);
2534 		if (error) {
2535 			eprintsoline(so, error);
2536 			goto done;
2537 		}
2538 	}
2539 
2540 done:
2541 	so_unlock_single(so, SOLOCKED);
2542 	mutex_exit(&so->so_lock);
2543 	return (error);
2544 }
2545 
2546 /*
2547  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2548  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2549  * that we have closed.
2550  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2551  * T_UNITDATA_REQ containing the same option.
2552  *
2553  * For SOCK_DGRAM half-connections (somebody connected to this end
2554  * but this end is not connect) we don't know where to send any
2555  * SO_UNIX_CLOSE.
2556  *
2557  * We have to ignore stream head errors just in case there has been
2558  * a shutdown(output).
2559  * Ignore any flow control to try to get the message more quickly to the peer.
2560  * While locally ignoring flow control solves the problem when there
2561  * is only the loopback transport on the stream it would not provide
2562  * the correct AF_UNIX socket semantics when one or more modules have
2563  * been pushed.
2564  */
2565 void
2566 so_unix_close(struct sonode *so)
2567 {
2568 	int		error;
2569 	struct T_opthdr	toh;
2570 	mblk_t		*mp;
2571 
2572 	ASSERT(MUTEX_HELD(&so->so_lock));
2573 
2574 	ASSERT(so->so_family == AF_UNIX);
2575 
2576 	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2577 	    (SS_ISCONNECTED|SS_ISBOUND))
2578 		return;
2579 
2580 	dprintso(so, 1, ("so_unix_close(%p) %s\n",
2581 	    so, pr_state(so->so_state, so->so_mode)));
2582 
2583 	toh.level = SOL_SOCKET;
2584 	toh.name = SO_UNIX_CLOSE;
2585 
2586 	/* zero length + header */
2587 	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2588 	toh.status = 0;
2589 
2590 	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2591 		struct T_optdata_req tdr;
2592 
2593 		tdr.PRIM_type = T_OPTDATA_REQ;
2594 		tdr.DATA_flag = 0;
2595 
2596 		tdr.OPT_length = (t_scalar_t)sizeof (toh);
2597 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2598 
2599 		/* NOTE: holding so_lock while sleeping */
2600 		mp = soallocproto2(&tdr, sizeof (tdr),
2601 		    &toh, sizeof (toh), 0, _ALLOC_SLEEP);
2602 	} else {
2603 		struct T_unitdata_req	tudr;
2604 		void			*addr;
2605 		socklen_t		addrlen;
2606 		void			*src;
2607 		socklen_t		srclen;
2608 		struct T_opthdr		toh2;
2609 		t_scalar_t		size;
2610 
2611 		/* Connecteded DGRAM socket */
2612 
2613 		/*
2614 		 * For AF_UNIX the destination address is translated to
2615 		 * an internal name and the source address is passed as
2616 		 * an option.
2617 		 */
2618 		/*
2619 		 * Length and family checks.
2620 		 */
2621 		error = so_addr_verify(so, so->so_faddr_sa,
2622 		    (t_uscalar_t)so->so_faddr_len);
2623 		if (error) {
2624 			eprintsoline(so, error);
2625 			return;
2626 		}
2627 		if (so->so_state & SS_FADDR_NOXLATE) {
2628 			/*
2629 			 * Already have a transport internal address. Do not
2630 			 * pass any (transport internal) source address.
2631 			 */
2632 			addr = so->so_faddr_sa;
2633 			addrlen = (t_uscalar_t)so->so_faddr_len;
2634 			src = NULL;
2635 			srclen = 0;
2636 		} else {
2637 			/*
2638 			 * Pass the sockaddr_un source address as an option
2639 			 * and translate the remote address.
2640 			 * Holding so_lock thus so_laddr_sa can not change.
2641 			 */
2642 			src = so->so_laddr_sa;
2643 			srclen = (socklen_t)so->so_laddr_len;
2644 			dprintso(so, 1,
2645 			    ("so_ux_close: srclen %d, src %p\n",
2646 			    srclen, src));
2647 			error = so_ux_addr_xlate(so,
2648 			    so->so_faddr_sa,
2649 			    (socklen_t)so->so_faddr_len, 0,
2650 			    &addr, &addrlen);
2651 			if (error) {
2652 				eprintsoline(so, error);
2653 				return;
2654 			}
2655 		}
2656 		tudr.PRIM_type = T_UNITDATA_REQ;
2657 		tudr.DEST_length = addrlen;
2658 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2659 		if (srclen == 0) {
2660 			tudr.OPT_length = (t_scalar_t)sizeof (toh);
2661 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2662 			    _TPI_ALIGN_TOPT(addrlen));
2663 
2664 			size = tudr.OPT_offset + tudr.OPT_length;
2665 			/* NOTE: holding so_lock while sleeping */
2666 			mp = soallocproto2(&tudr, sizeof (tudr),
2667 			    addr, addrlen, size, _ALLOC_SLEEP);
2668 			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2669 			soappendmsg(mp, &toh, sizeof (toh));
2670 		} else {
2671 			/*
2672 			 * There is a AF_UNIX sockaddr_un to include as a
2673 			 * source address option.
2674 			 */
2675 			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2676 			    _TPI_ALIGN_TOPT(srclen));
2677 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2678 			    _TPI_ALIGN_TOPT(addrlen));
2679 
2680 			toh2.level = SOL_SOCKET;
2681 			toh2.name = SO_SRCADDR;
2682 			toh2.len = (t_uscalar_t)(srclen +
2683 			    sizeof (struct T_opthdr));
2684 			toh2.status = 0;
2685 
2686 			size = tudr.OPT_offset + tudr.OPT_length;
2687 
2688 			/* NOTE: holding so_lock while sleeping */
2689 			mp = soallocproto2(&tudr, sizeof (tudr),
2690 			    addr, addrlen, size, _ALLOC_SLEEP);
2691 			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2692 			soappendmsg(mp, &toh, sizeof (toh));
2693 			soappendmsg(mp, &toh2, sizeof (toh2));
2694 			soappendmsg(mp, src, srclen);
2695 			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2696 		}
2697 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2698 	}
2699 	mutex_exit(&so->so_lock);
2700 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2701 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2702 	mutex_enter(&so->so_lock);
2703 }
2704 
2705 /*
2706  * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
2707  */
2708 int
2709 sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags)
2710 {
2711 	mblk_t		*mp, *nmp;
2712 	int		error;
2713 
2714 	dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", so, msg, flags));
2715 
2716 	/*
2717 	 * There is never any oob data with addresses or control since
2718 	 * the T_EXDATA_IND does not carry any options.
2719 	 */
2720 	msg->msg_controllen = 0;
2721 	msg->msg_namelen = 0;
2722 
2723 	mutex_enter(&so->so_lock);
2724 	ASSERT(so_verify_oobstate(so));
2725 	if ((so->so_options & SO_OOBINLINE) ||
2726 	    (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
2727 		dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
2728 		mutex_exit(&so->so_lock);
2729 		return (EINVAL);
2730 	}
2731 	if (!(so->so_state & SS_HAVEOOBDATA)) {
2732 		dprintso(so, 1, ("sorecvoob: no data yet\n"));
2733 		mutex_exit(&so->so_lock);
2734 		return (EWOULDBLOCK);
2735 	}
2736 	ASSERT(so->so_oobmsg != NULL);
2737 	mp = so->so_oobmsg;
2738 	if (flags & MSG_PEEK) {
2739 		/*
2740 		 * Since recv* can not return ENOBUFS we can not use dupmsg.
2741 		 * Instead we revert to the consolidation private
2742 		 * allocb_wait plus bcopy.
2743 		 */
2744 		mblk_t *mp1;
2745 
2746 		mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
2747 		ASSERT(mp1);
2748 
2749 		while (mp != NULL) {
2750 			ssize_t size;
2751 
2752 			size = MBLKL(mp);
2753 			bcopy(mp->b_rptr, mp1->b_wptr, size);
2754 			mp1->b_wptr += size;
2755 			ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
2756 			mp = mp->b_cont;
2757 		}
2758 		mp = mp1;
2759 	} else {
2760 		/*
2761 		 * Update the state indicating that the data has been consumed.
2762 		 * Keep SS_OOBPEND set until data is consumed past the mark.
2763 		 */
2764 		so->so_oobmsg = NULL;
2765 		so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
2766 	}
2767 	dprintso(so, 1,
2768 	    ("after recvoob(%p): counts %d/%d state %s\n",
2769 	    so, so->so_oobsigcnt,
2770 	    so->so_oobcnt, pr_state(so->so_state, so->so_mode)));
2771 	ASSERT(so_verify_oobstate(so));
2772 	mutex_exit(&so->so_lock);
2773 
2774 	error = 0;
2775 	nmp = mp;
2776 	while (nmp != NULL && uiop->uio_resid > 0) {
2777 		ssize_t n = MBLKL(nmp);
2778 
2779 		n = MIN(n, uiop->uio_resid);
2780 		if (n > 0)
2781 			error = uiomove(nmp->b_rptr, n,
2782 			    UIO_READ, uiop);
2783 		if (error)
2784 			break;
2785 		nmp = nmp->b_cont;
2786 	}
2787 	freemsg(mp);
2788 	return (error);
2789 }
2790 
2791 /*
2792  * Called by sotpi_recvmsg when reading a non-zero amount of data.
2793  * In addition, the caller typically verifies that there is some
2794  * potential state to clear by checking
2795  *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2796  * before calling this routine.
2797  * Note that such a check can be made without holding so_lock since
2798  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2799  * decrements so_oobsigcnt.
2800  *
2801  * When data is read *after* the point that all pending
2802  * oob data has been consumed the oob indication is cleared.
2803  *
2804  * This logic keeps select/poll returning POLLRDBAND and
2805  * SIOCATMARK returning true until we have read past
2806  * the mark.
2807  */
2808 static void
2809 sorecv_update_oobstate(struct sonode *so)
2810 {
2811 	mutex_enter(&so->so_lock);
2812 	ASSERT(so_verify_oobstate(so));
2813 	dprintso(so, 1,
2814 	    ("sorecv_update_oobstate: counts %d/%d state %s\n",
2815 	    so->so_oobsigcnt,
2816 	    so->so_oobcnt, pr_state(so->so_state, so->so_mode)));
2817 	if (so->so_oobsigcnt == 0) {
2818 		/* No more pending oob indications */
2819 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2820 		freemsg(so->so_oobmsg);
2821 		so->so_oobmsg = NULL;
2822 	}
2823 	ASSERT(so_verify_oobstate(so));
2824 	mutex_exit(&so->so_lock);
2825 }
2826 
2827 /*
2828  * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
2829  */
2830 static int
2831 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
2832 {
2833 	int	error = 0;
2834 	mblk_t *tmp = NULL;
2835 	mblk_t *pmp = NULL;
2836 	mblk_t *nmp = so->so_nl7c_rcv_mp;
2837 
2838 	ASSERT(nmp != NULL);
2839 
2840 	while (nmp != NULL && uiop->uio_resid > 0) {
2841 		ssize_t n;
2842 
2843 		if (DB_TYPE(nmp) == M_DATA) {
2844 			/*
2845 			 * We have some data, uiomove up to resid bytes.
2846 			 */
2847 			n = MIN(MBLKL(nmp), uiop->uio_resid);
2848 			if (n > 0)
2849 				error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
2850 			nmp->b_rptr += n;
2851 			if (nmp->b_rptr == nmp->b_wptr) {
2852 				pmp = nmp;
2853 				nmp = nmp->b_cont;
2854 			}
2855 			if (error)
2856 				break;
2857 		} else {
2858 			/*
2859 			 * We only handle data, save for caller to handle.
2860 			 */
2861 			if (pmp != NULL) {
2862 				pmp->b_cont = nmp->b_cont;
2863 			}
2864 			nmp->b_cont = NULL;
2865 			if (*rmp == NULL) {
2866 				*rmp = nmp;
2867 			} else {
2868 				tmp->b_cont = nmp;
2869 			}
2870 			nmp = nmp->b_cont;
2871 			tmp = nmp;
2872 		}
2873 	}
2874 	if (pmp != NULL) {
2875 		/* Free any mblk_t(s) which we have consumed */
2876 		pmp->b_cont = NULL;
2877 		freemsg(so->so_nl7c_rcv_mp);
2878 	}
2879 	if ((so->so_nl7c_rcv_mp = nmp) == NULL) {
2880 		/* Last mblk_t so return the saved kstrgetmsg() rval/error */
2881 		if (error == 0) {
2882 			rval_t	*p = (rval_t *)&so->so_nl7c_rcv_rval;
2883 
2884 			error = p->r_v.r_v2;
2885 			p->r_v.r_v2 = 0;
2886 		}
2887 		rp->r_vals = so->so_nl7c_rcv_rval;
2888 		so->so_nl7c_rcv_rval = 0;
2889 	} else {
2890 		/* More mblk_t(s) to process so no rval to return */
2891 		rp->r_vals = 0;
2892 	}
2893 	return (error);
2894 }
2895 
2896 /*
2897  * Receive the next message on the queue.
2898  * If msg_controllen is non-zero when called the caller is interested in
2899  * any received control info (options).
2900  * If msg_namelen is non-zero when called the caller is interested in
2901  * any received source address.
2902  * The routine returns with msg_control and msg_name pointing to
2903  * kmem_alloc'ed memory which the caller has to free.
2904  */
2905 int
2906 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
2907 {
2908 	union T_primitives	*tpr;
2909 	mblk_t			*mp;
2910 	uchar_t			pri;
2911 	int			pflag, opflag;
2912 	void			*control;
2913 	t_uscalar_t		controllen;
2914 	t_uscalar_t		namelen;
2915 	int			so_state = so->so_state; /* Snapshot */
2916 	ssize_t			saved_resid;
2917 	rval_t			rval;
2918 	int			flags;
2919 	clock_t			timout;
2920 	int			first;
2921 	int			error = 0;
2922 	struct uio		*suiop = NULL;
2923 	sodirect_t		*sodp = so->so_direct;
2924 
2925 	flags = msg->msg_flags;
2926 	msg->msg_flags = 0;
2927 
2928 	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
2929 	    so, msg, flags,
2930 	    pr_state(so->so_state, so->so_mode), so->so_error));
2931 
2932 	/*
2933 	 * If we are not connected because we have never been connected
2934 	 * we return ENOTCONN. If we have been connected (but are no longer
2935 	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
2936 	 * the EOF.
2937 	 *
2938 	 * An alternative would be to post an ENOTCONN error in stream head
2939 	 * (read+write) and clear it when we're connected. However, that error
2940 	 * would cause incorrect poll/select behavior!
2941 	 */
2942 	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
2943 	    (so->so_mode & SM_CONNREQUIRED)) {
2944 		return (ENOTCONN);
2945 	}
2946 
2947 	/*
2948 	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
2949 	 * after checking that the read queue is empty) and returns zero.
2950 	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
2951 	 * is zero.
2952 	 */
2953 
2954 	if (flags & MSG_OOB) {
2955 		/* Check that the transport supports OOB */
2956 		if (!(so->so_mode & SM_EXDATA))
2957 			return (EOPNOTSUPP);
2958 		return (sorecvoob(so, msg, uiop, flags));
2959 	}
2960 
2961 	/*
2962 	 * Set msg_controllen and msg_namelen to zero here to make it
2963 	 * simpler in the cases that no control or name is returned.
2964 	 */
2965 	controllen = msg->msg_controllen;
2966 	namelen = msg->msg_namelen;
2967 	msg->msg_controllen = 0;
2968 	msg->msg_namelen = 0;
2969 
2970 	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
2971 	    namelen, controllen));
2972 
2973 	mutex_enter(&so->so_lock);
2974 	/*
2975 	 * If an NL7C enabled socket and not waiting for write data.
2976 	 */
2977 	if ((so->so_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
2978 	    NL7C_ENABLED) {
2979 		if (so->so_nl7c_uri) {
2980 			/* Close uri processing for a previous request */
2981 			nl7c_close(so);
2982 		}
2983 		if ((so_state & SS_CANTRCVMORE) && so->so_nl7c_rcv_mp == NULL) {
2984 			/* Nothing to process, EOF */
2985 			mutex_exit(&so->so_lock);
2986 			return (0);
2987 		} else if (so->so_nl7c_flags & NL7C_SOPERSIST) {
2988 			/* Persistent NL7C socket, try to process request */
2989 			boolean_t ret;
2990 
2991 			ret = nl7c_process(so,
2992 			    (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
2993 			rval.r_vals = so->so_nl7c_rcv_rval;
2994 			error = rval.r_v.r_v2;
2995 			if (error) {
2996 				/* Error of some sort, return it */
2997 				mutex_exit(&so->so_lock);
2998 				return (error);
2999 			}
3000 			if (so->so_nl7c_flags &&
3001 			    ! (so->so_nl7c_flags & NL7C_WAITWRITE)) {
3002 				/*
3003 				 * Still an NL7C socket and no data
3004 				 * to pass up to the caller.
3005 				 */
3006 				mutex_exit(&so->so_lock);
3007 				if (ret) {
3008 					/* EOF */
3009 					return (0);
3010 				} else {
3011 					/* Need more data */
3012 					return (EAGAIN);
3013 				}
3014 			}
3015 		} else {
3016 			/*
3017 			 * Not persistent so no further NL7C processing.
3018 			 */
3019 			so->so_nl7c_flags = 0;
3020 		}
3021 	}
3022 	/*
3023 	 * Only one reader is allowed at any given time. This is needed
3024 	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
3025 	 *
	 * This is slightly different than BSD behavior in that it fails with
3027 	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3028 	 * is single-threaded using sblock(), which is dropped while waiting
3029 	 * for data to appear. The difference shows up e.g. if one
3030 	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3031 	 * does use nonblocking io and different threads are reading each
3032 	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3033 	 * in this case as long as the read queue doesn't get empty.
3034 	 * In this implementation the thread using nonblocking io can
3035 	 * get an EWOULDBLOCK error due to the blocking thread executing
3036 	 * e.g. in the uiomove in kstrgetmsg.
3037 	 * This difference is not believed to be significant.
3038 	 */
3039 	/* Set SOREADLOCKED */
3040 	error = so_lock_read_intr(so,
3041 	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3042 	mutex_exit(&so->so_lock);
3043 	if (error)
3044 		return (error);
3045 
3046 	/*
3047 	 * Tell kstrgetmsg to not inspect the stream head errors until all
3048 	 * queued data has been consumed.
3049 	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
	 * Also, if uio_fmode indicates nonblocking, kstrgetmsg will not block.
3051 	 *
3052 	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3053 	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3054 	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3055 	 */
3056 	pflag = MSG_ANY | MSG_DELAYERROR;
3057 	if (flags & MSG_PEEK) {
3058 		pflag |= MSG_IPEEK;
3059 		flags &= ~MSG_WAITALL;
3060 	}
3061 	if (so->so_mode & SM_ATOMIC)
3062 		pflag |= MSG_DISCARDTAIL;
3063 
3064 	if (flags & MSG_DONTWAIT)
3065 		timout = 0;
3066 	else
3067 		timout = -1;
3068 	opflag = pflag;
3069 	first = 1;
3070 
3071 	if (uiop->uio_resid >= uioasync.mincnt &&
3072 	    sodp != NULL && (sodp->sod_state & SOD_ENABLED) &&
3073 	    uioasync.enabled && !(flags & MSG_PEEK) &&
3074 	    !(so_state & SS_CANTRCVMORE)) {
3075 		/*
3076 		 * Big enough I/O for uioa min setup and an sodirect socket
3077 		 * and sodirect enabled and uioa enabled and I/O will be done
3078 		 * and not EOF so initialize the sodirect_t uioa_t with "uiop".
3079 		 */
3080 		mutex_enter(sodp->sod_lock);
3081 		if (!uioainit(uiop, &sodp->sod_uioa)) {
3082 			/*
3083 			 * Successful uioainit() so the uio_t part of the
3084 			 * uioa_t will be used for all uio_t work to follow,
3085 			 * we save the original "uiop" in "suiop".
3086 			 */
3087 			suiop = uiop;
3088 			uiop = (uio_t *)&sodp->sod_uioa;
3089 			/*
3090 			 * Before returning to the caller the passed in uio_t
3091 			 * "uiop" will be updated via a call to uioafini()
3092 			 * below.
3093 			 *
3094 			 * Note, the uioa.uioa_state isn't set to UIOA_ENABLED
3095 			 * here as first we have to uioamove() any currently
3096 			 * queued M_DATA mblk_t(s) so it will be done in
3097 			 * kstrgetmsg().
3098 			 */
3099 		}
3100 		/*
3101 		 * In either uioainit() success or not case note the number
3102 		 * of uio bytes the caller wants for sod framework and/or
3103 		 * transport (e.g. TCP) strategy.
3104 		 */
3105 		sodp->sod_want = uiop->uio_resid;
3106 		mutex_exit(sodp->sod_lock);
3107 	} else if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) {
3108 		/*
3109 		 * No uioa but still using sodirect so note the number of
3110 		 * uio bytes the caller wants for sodirect framework and/or
3111 		 * transport (e.g. TCP) strategy.
3112 		 *
3113 		 * Note, sod_lock not held, only writer is in this function
3114 		 * and only one thread at a time so not needed just to init.
3115 		 */
3116 		sodp->sod_want = uiop->uio_resid;
3117 	}
3118 retry:
3119 	saved_resid = uiop->uio_resid;
3120 	pri = 0;
3121 	mp = NULL;
3122 	if (so->so_nl7c_rcv_mp != NULL) {
3123 		/* Already kstrgetmsg()ed saved mblk(s) from NL7C */
3124 		error = nl7c_sorecv(so, &mp, uiop, &rval);
3125 	} else {
3126 		error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3127 		    timout, &rval);
3128 	}
3129 	if (error) {
3130 		switch (error) {
3131 		case EINTR:
3132 		case EWOULDBLOCK:
3133 			if (!first)
3134 				error = 0;
3135 			break;
3136 		case ETIME:
3137 			/* Returned from kstrgetmsg when timeout expires */
3138 			if (!first)
3139 				error = 0;
3140 			else
3141 				error = EWOULDBLOCK;
3142 			break;
3143 		default:
3144 			eprintsoline(so, error);
3145 			break;
3146 		}
3147 		goto out;
3148 	}
3149 	/*
3150 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3151 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
3152 	 */
3153 	ASSERT(!(rval.r_val1 & MORECTL));
3154 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3155 		msg->msg_flags |= MSG_TRUNC;
3156 
3157 	if (mp == NULL) {
3158 		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3159 		/*
3160 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3161 		 * The draft Posix socket spec states that the mark should
3162 		 * not be cleared when peeking. We follow the latter.
3163 		 */
3164 		if ((so->so_state &
3165 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3166 		    (uiop->uio_resid != saved_resid) &&
3167 		    !(flags & MSG_PEEK)) {
3168 			sorecv_update_oobstate(so);
3169 		}
3170 
3171 		mutex_enter(&so->so_lock);
3172 		/* Set MSG_EOR based on MOREDATA */
3173 		if (!(rval.r_val1 & MOREDATA)) {
3174 			if (so->so_state & SS_SAVEDEOR) {
3175 				msg->msg_flags |= MSG_EOR;
3176 				so->so_state &= ~SS_SAVEDEOR;
3177 			}
3178 		}
3179 		/*
3180 		 * If some data was received (i.e. not EOF) and the
3181 		 * read/recv* has not been satisfied wait for some more.
3182 		 */
3183 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3184 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3185 			mutex_exit(&so->so_lock);
3186 			first = 0;
3187 			pflag = opflag | MSG_NOMARK;
3188 			goto retry;
3189 		}
3190 		goto out_locked;
3191 	}
3192 
3193 	/* strsock_proto has already verified length and alignment */
3194 	tpr = (union T_primitives *)mp->b_rptr;
3195 	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3196 
3197 	switch (tpr->type) {
3198 	case T_DATA_IND: {
3199 		if ((so->so_state &
3200 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3201 		    (uiop->uio_resid != saved_resid) &&
3202 		    !(flags & MSG_PEEK)) {
3203 			sorecv_update_oobstate(so);
3204 		}
3205 
3206 		/*
3207 		 * Set msg_flags to MSG_EOR based on
3208 		 * MORE_flag and MOREDATA.
3209 		 */
3210 		mutex_enter(&so->so_lock);
3211 		so->so_state &= ~SS_SAVEDEOR;
3212 		if (!(tpr->data_ind.MORE_flag & 1)) {
3213 			if (!(rval.r_val1 & MOREDATA))
3214 				msg->msg_flags |= MSG_EOR;
3215 			else
3216 				so->so_state |= SS_SAVEDEOR;
3217 		}
3218 		freemsg(mp);
3219 		/*
3220 		 * If some data was received (i.e. not EOF) and the
3221 		 * read/recv* has not been satisfied wait for some more.
3222 		 */
3223 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3224 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3225 			mutex_exit(&so->so_lock);
3226 			first = 0;
3227 			pflag = opflag | MSG_NOMARK;
3228 			goto retry;
3229 		}
3230 		goto out_locked;
3231 	}
3232 	case T_UNITDATA_IND: {
3233 		void *addr;
3234 		t_uscalar_t addrlen;
3235 		void *abuf;
3236 		t_uscalar_t optlen;
3237 		void *opt;
3238 
3239 		if ((so->so_state &
3240 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3241 		    (uiop->uio_resid != saved_resid) &&
3242 		    !(flags & MSG_PEEK)) {
3243 			sorecv_update_oobstate(so);
3244 		}
3245 
3246 		if (namelen != 0) {
3247 			/* Caller wants source address */
3248 			addrlen = tpr->unitdata_ind.SRC_length;
3249 			addr = sogetoff(mp,
3250 			    tpr->unitdata_ind.SRC_offset,
3251 			    addrlen, 1);
3252 			if (addr == NULL) {
3253 				freemsg(mp);
3254 				error = EPROTO;
3255 				eprintsoline(so, error);
3256 				goto out;
3257 			}
3258 			if (so->so_family == AF_UNIX) {
3259 				/*
3260 				 * Can not use the transport level address.
3261 				 * If there is a SO_SRCADDR option carrying
3262 				 * the socket level address it will be
3263 				 * extracted below.
3264 				 */
3265 				addr = NULL;
3266 				addrlen = 0;
3267 			}
3268 		}
3269 		optlen = tpr->unitdata_ind.OPT_length;
3270 		if (optlen != 0) {
3271 			t_uscalar_t ncontrollen;
3272 
3273 			/*
3274 			 * Extract any source address option.
3275 			 * Determine how large cmsg buffer is needed.
3276 			 */
3277 			opt = sogetoff(mp,
3278 			    tpr->unitdata_ind.OPT_offset,
3279 			    optlen, __TPI_ALIGN_SIZE);
3280 
3281 			if (opt == NULL) {
3282 				freemsg(mp);
3283 				error = EPROTO;
3284 				eprintsoline(so, error);
3285 				goto out;
3286 			}
3287 			if (so->so_family == AF_UNIX)
3288 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3289 			ncontrollen = so_cmsglen(mp, opt, optlen,
3290 			    !(flags & MSG_XPG4_2));
3291 			if (controllen != 0)
3292 				controllen = ncontrollen;
3293 			else if (ncontrollen != 0)
3294 				msg->msg_flags |= MSG_CTRUNC;
3295 		} else {
3296 			controllen = 0;
3297 		}
3298 
3299 		if (namelen != 0) {
3300 			/*
3301 			 * Return address to caller.
3302 			 * Caller handles truncation if length
3303 			 * exceeds msg_namelen.
3304 			 * NOTE: AF_UNIX NUL termination is ensured by
3305 			 * the sender's copyin_name().
3306 			 */
3307 			abuf = kmem_alloc(addrlen, KM_SLEEP);
3308 
3309 			bcopy(addr, abuf, addrlen);
3310 			msg->msg_name = abuf;
3311 			msg->msg_namelen = addrlen;
3312 		}
3313 
3314 		if (controllen != 0) {
3315 			/*
3316 			 * Return control msg to caller.
3317 			 * Caller handles truncation if length
3318 			 * exceeds msg_controllen.
3319 			 */
3320 			control = kmem_zalloc(controllen, KM_SLEEP);
3321 
3322 			error = so_opt2cmsg(mp, opt, optlen,
3323 			    !(flags & MSG_XPG4_2),
3324 			    control, controllen);
3325 			if (error) {
3326 				freemsg(mp);
3327 				if (msg->msg_namelen != 0)
3328 					kmem_free(msg->msg_name,
3329 					    msg->msg_namelen);
3330 				kmem_free(control, controllen);
3331 				eprintsoline(so, error);
3332 				goto out;
3333 			}
3334 			msg->msg_control = control;
3335 			msg->msg_controllen = controllen;
3336 		}
3337 
3338 		freemsg(mp);
3339 		goto out;
3340 	}
3341 	case T_OPTDATA_IND: {
3342 		struct T_optdata_req *tdr;
3343 		void *opt;
3344 		t_uscalar_t optlen;
3345 
3346 		if ((so->so_state &
3347 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3348 		    (uiop->uio_resid != saved_resid) &&
3349 		    !(flags & MSG_PEEK)) {
3350 			sorecv_update_oobstate(so);
3351 		}
3352 
3353 		tdr = (struct T_optdata_req *)mp->b_rptr;
3354 		optlen = tdr->OPT_length;
3355 		if (optlen != 0) {
3356 			t_uscalar_t ncontrollen;
3357 			/*
3358 			 * Determine how large cmsg buffer is needed.
3359 			 */
3360 			opt = sogetoff(mp,
3361 			    tpr->optdata_ind.OPT_offset,
3362 			    optlen, __TPI_ALIGN_SIZE);
3363 
3364 			if (opt == NULL) {
3365 				freemsg(mp);
3366 				error = EPROTO;
3367 				eprintsoline(so, error);
3368 				goto out;
3369 			}
3370 
3371 			ncontrollen = so_cmsglen(mp, opt, optlen,
3372 			    !(flags & MSG_XPG4_2));
3373 			if (controllen != 0)
3374 				controllen = ncontrollen;
3375 			else if (ncontrollen != 0)
3376 				msg->msg_flags |= MSG_CTRUNC;
3377 		} else {
3378 			controllen = 0;
3379 		}
3380 
3381 		if (controllen != 0) {
3382 			/*
3383 			 * Return control msg to caller.
3384 			 * Caller handles truncation if length
3385 			 * exceeds msg_controllen.
3386 			 */
3387 			control = kmem_zalloc(controllen, KM_SLEEP);
3388 
3389 			error = so_opt2cmsg(mp, opt, optlen,
3390 			    !(flags & MSG_XPG4_2),
3391 			    control, controllen);
3392 			if (error) {
3393 				freemsg(mp);
3394 				kmem_free(control, controllen);
3395 				eprintsoline(so, error);
3396 				goto out;
3397 			}
3398 			msg->msg_control = control;
3399 			msg->msg_controllen = controllen;
3400 		}
3401 
3402 		/*
3403 		 * Set msg_flags to MSG_EOR based on
3404 		 * DATA_flag and MOREDATA.
3405 		 */
3406 		mutex_enter(&so->so_lock);
3407 		so->so_state &= ~SS_SAVEDEOR;
3408 		if (!(tpr->data_ind.MORE_flag & 1)) {
3409 			if (!(rval.r_val1 & MOREDATA))
3410 				msg->msg_flags |= MSG_EOR;
3411 			else
3412 				so->so_state |= SS_SAVEDEOR;
3413 		}
3414 		freemsg(mp);
3415 		/*
3416 		 * If some data was received (i.e. not EOF) and the
3417 		 * read/recv* has not been satisfied wait for some more.
3418 		 * Not possible to wait if control info was received.
3419 		 */
3420 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3421 		    controllen == 0 &&
3422 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3423 			mutex_exit(&so->so_lock);
3424 			first = 0;
3425 			pflag = opflag | MSG_NOMARK;
3426 			goto retry;
3427 		}
3428 		goto out_locked;
3429 	}
3430 	case T_EXDATA_IND: {
3431 		dprintso(so, 1,
3432 		    ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3433 		    "state %s\n",
3434 		    so->so_oobsigcnt, so->so_oobcnt,
3435 		    saved_resid - uiop->uio_resid,
3436 		    pr_state(so->so_state, so->so_mode)));
3437 		/*
3438 		 * kstrgetmsg handles MSGMARK so there is nothing to
3439 		 * inspect in the T_EXDATA_IND.
3440 		 * strsock_proto makes the stream head queue the T_EXDATA_IND
3441 		 * as a separate message with no M_DATA component. Furthermore,
3442 		 * the stream head does not consolidate M_DATA messages onto
3443 		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3444 		 * remains a message by itself. This is needed since MSGMARK
3445 		 * marks both the whole message as well as the last byte
3446 		 * of the message.
3447 		 */
3448 		freemsg(mp);
3449 		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
3450 		if (flags & MSG_PEEK) {
3451 			/*
3452 			 * Even though we are peeking we consume the
3453 			 * T_EXDATA_IND thereby moving the mark information
3454 			 * to SS_RCVATMARK. Then the oob code below will
3455 			 * retry the peeking kstrgetmsg.
3456 			 * Note that the stream head read queue is
3457 			 * never flushed without holding SOREADLOCKED
3458 			 * thus the T_EXDATA_IND can not disappear
3459 			 * underneath us.
3460 			 */
3461 			dprintso(so, 1,
3462 			    ("sotpi_recvmsg: consume EXDATA_IND "
3463 			    "counts %d/%d state %s\n",
3464 			    so->so_oobsigcnt,
3465 			    so->so_oobcnt,
3466 			    pr_state(so->so_state, so->so_mode)));
3467 
3468 			pflag = MSG_ANY | MSG_DELAYERROR;
3469 			if (so->so_mode & SM_ATOMIC)
3470 				pflag |= MSG_DISCARDTAIL;
3471 
3472 			pri = 0;
3473 			mp = NULL;
3474 
3475 			error = kstrgetmsg(SOTOV(so), &mp, uiop,
3476 			    &pri, &pflag, (clock_t)-1, &rval);
3477 			ASSERT(uiop->uio_resid == saved_resid);
3478 
3479 			if (error) {
3480 #ifdef SOCK_DEBUG
3481 				if (error != EWOULDBLOCK && error != EINTR) {
3482 					eprintsoline(so, error);
3483 				}
3484 #endif /* SOCK_DEBUG */
3485 				goto out;
3486 			}
3487 			ASSERT(mp);
3488 			tpr = (union T_primitives *)mp->b_rptr;
3489 			ASSERT(tpr->type == T_EXDATA_IND);
3490 			freemsg(mp);
3491 		} /* end "if (flags & MSG_PEEK)" */
3492 
3493 		/*
3494 		 * Decrement the number of queued and pending oob.
3495 		 *
3496 		 * SS_RCVATMARK is cleared when we read past a mark.
3497 		 * SS_HAVEOOBDATA is cleared when we've read past the
3498 		 * last mark.
3499 		 * SS_OOBPEND is cleared if we've read past the last
3500 		 * mark and no (new) SIGURG has been posted.
3501 		 */
3502 		mutex_enter(&so->so_lock);
3503 		ASSERT(so_verify_oobstate(so));
3504 		ASSERT(so->so_oobsigcnt >= so->so_oobcnt);
3505 		ASSERT(so->so_oobsigcnt > 0);
3506 		so->so_oobsigcnt--;
3507 		ASSERT(so->so_oobcnt > 0);
3508 		so->so_oobcnt--;
3509 		/*
3510 		 * Since the T_EXDATA_IND has been removed from the stream
3511 		 * head, but we have not read data past the mark,
3512 		 * sockfs needs to track that the socket is still at the mark.
3513 		 *
3514 		 * Since no data was received call kstrgetmsg again to wait
3515 		 * for data.
3516 		 */
3517 		so->so_state |= SS_RCVATMARK;
3518 		mutex_exit(&so->so_lock);
3519 		dprintso(so, 1,
3520 		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3521 		    so->so_oobsigcnt, so->so_oobcnt,
3522 		    pr_state(so->so_state, so->so_mode)));
3523 		pflag = opflag;
3524 		goto retry;
3525 	}
3526 	default:
3527 		ASSERT(0);
3528 		freemsg(mp);
3529 		error = EPROTO;
3530 		eprintsoline(so, error);
3531 		goto out;
3532 	}
3533 	/* NOTREACHED */
3534 out:
3535 	mutex_enter(&so->so_lock);
3536 out_locked:
3537 	if (sodp != NULL) {
3538 		/* Finish any sodirect and uioa processing */
3539 		mutex_enter(sodp->sod_lock);
3540 		if (suiop != NULL) {
3541 			/* Finish any uioa_t processing */
3542 			int ret;
3543 
3544 			ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
3545 			ret = uioafini(suiop, (uioa_t *)uiop);
3546 			if (error == 0 && ret != 0) {
3547 				/* If no error yet, set it */
3548 				error = ret;
3549 			}
3550 			if ((mp = sodp->sod_uioafh) != NULL) {
3551 				sodp->sod_uioafh = NULL;
3552 				sodp->sod_uioaft = NULL;
3553 				freemsg(mp);
3554 			}
3555 		}
3556 		if (!(sodp->sod_state & SOD_WAKE_NOT)) {
3557 			/* Awoke */
3558 			sodp->sod_state &= SOD_WAKE_CLR;
3559 			sodp->sod_state |= SOD_WAKE_NOT;
3560 		}
3561 		/* Last, clear sod_want value */
3562 		sodp->sod_want = 0;
3563 		mutex_exit(sodp->sod_lock);
3564 	}
3565 	so_unlock_read(so);	/* Clear SOREADLOCKED */
3566 	mutex_exit(&so->so_lock);
3567 	return (error);
3568 }
3569 
3570 /*
3571  * Sending data with options on a datagram socket.
3572  * Assumes caller has verified that SS_ISBOUND etc. are set.
3573  */
3574 static int
3575 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3576     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3577 {
3578 	struct T_unitdata_req	tudr;
3579 	mblk_t			*mp;
3580 	int			error;
3581 	void			*addr;
3582 	socklen_t		addrlen;
3583 	void			*src;
3584 	socklen_t		srclen;
3585 	ssize_t			len;
3586 	int			size;
3587 	struct T_opthdr		toh;
3588 	struct fdbuf		*fdbuf;
3589 	t_uscalar_t		optlen;
3590 	void			*fds;
3591 	int			fdlen;
3592 
3593 	ASSERT(name && namelen);
3594 	ASSERT(control && controllen);
3595 
3596 	len = uiop->uio_resid;
3597 	if (len > (ssize_t)so->so_tidu_size) {
3598 		return (EMSGSIZE);
3599 	}
3600 
3601 	/*
3602 	 * For AF_UNIX the destination address is translated to an internal
3603 	 * name and the source address is passed as an option.
3604 	 * Also, file descriptors are passed as file pointers in an
3605 	 * option.
3606 	 */
3607 
3608 	/*
3609 	 * Length and family checks.
3610 	 */
3611 	error = so_addr_verify(so, name, namelen);
3612 	if (error) {
3613 		eprintsoline(so, error);
3614 		return (error);
3615 	}
3616 	if (so->so_family == AF_UNIX) {
3617 		if (so->so_state & SS_FADDR_NOXLATE) {
3618 			/*
3619 			 * Already have a transport internal address. Do not
3620 			 * pass any (transport internal) source address.
3621 			 */
3622 			addr = name;
3623 			addrlen = namelen;
3624 			src = NULL;
3625 			srclen = 0;
3626 		} else {
3627 			/*
3628 			 * Pass the sockaddr_un source address as an option
3629 			 * and translate the remote address.
3630 			 *
3631 			 * Note that this code does not prevent so_laddr_sa
3632 			 * from changing while it is being used. Thus
3633 			 * if an unbind+bind occurs concurrently with this
3634 			 * send the peer might see a partially new and a
3635 			 * partially old "from" address.
3636 			 */
3637 			src = so->so_laddr_sa;
3638 			srclen = (t_uscalar_t)so->so_laddr_len;
3639 			dprintso(so, 1,
3640 			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3641 			    srclen, src));
3642 			error = so_ux_addr_xlate(so, name, namelen,
3643 			    (flags & MSG_XPG4_2),
3644 			    &addr, &addrlen);
3645 			if (error) {
3646 				eprintsoline(so, error);
3647 				return (error);
3648 			}
3649 		}
3650 	} else {
3651 		addr = name;
3652 		addrlen = namelen;
3653 		src = NULL;
3654 		srclen = 0;
3655 	}
3656 	optlen = so_optlen(control, controllen,
3657 	    !(flags & MSG_XPG4_2));
3658 	tudr.PRIM_type = T_UNITDATA_REQ;
3659 	tudr.DEST_length = addrlen;
3660 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3661 	if (srclen != 0)
3662 		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3663 		    _TPI_ALIGN_TOPT(srclen));
3664 	else
3665 		tudr.OPT_length = optlen;
3666 	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3667 	    _TPI_ALIGN_TOPT(addrlen));
3668 
3669 	size = tudr.OPT_offset + tudr.OPT_length;
3670 
3671 	/*
3672 	 * File descriptors only when SM_FDPASSING set.
3673 	 */
3674 	error = so_getfdopt(control, controllen,
3675 	    !(flags & MSG_XPG4_2), &fds, &fdlen);
3676 	if (error)
3677 		return (error);
3678 	if (fdlen != -1) {
3679 		if (!(so->so_mode & SM_FDPASSING))
3680 			return (EOPNOTSUPP);
3681 
3682 		error = fdbuf_create(fds, fdlen, &fdbuf);
3683 		if (error)
3684 			return (error);
3685 		mp = fdbuf_allocmsg(size, fdbuf);
3686 	} else {
3687 		mp = soallocproto(size, _ALLOC_INTR);
3688 		if (mp == NULL) {
3689 			/*
3690 			 * Caught a signal waiting for memory.
3691 			 * Let send* return EINTR.
3692 			 */
3693 			return (EINTR);
3694 		}
3695 	}
3696 	soappendmsg(mp, &tudr, sizeof (tudr));
3697 	soappendmsg(mp, addr, addrlen);
3698 	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3699 
3700 	if (fdlen != -1) {
3701 		ASSERT(fdbuf != NULL);
3702 		toh.level = SOL_SOCKET;
3703 		toh.name = SO_FILEP;
3704 		toh.len = fdbuf->fd_size +
3705 		    (t_uscalar_t)sizeof (struct T_opthdr);
3706 		toh.status = 0;
3707 		soappendmsg(mp, &toh, sizeof (toh));
3708 		soappendmsg(mp, fdbuf, fdbuf->fd_size);
3709 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3710 	}
3711 	if (srclen != 0) {
3712 		/*
3713 		 * There is a AF_UNIX sockaddr_un to include as a source
3714 		 * address option.
3715 		 */
3716 		toh.level = SOL_SOCKET;
3717 		toh.name = SO_SRCADDR;
3718 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3719 		toh.status = 0;
3720 		soappendmsg(mp, &toh, sizeof (toh));
3721 		soappendmsg(mp, src, srclen);
3722 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3723 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3724 	}
3725 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3726 	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3727 	/* At most 3 bytes left in the message */
3728 	ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3729 	ASSERT(MBLKL(mp) <= (ssize_t)size);
3730 
3731 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3732 	if (audit_active)
3733 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3734 
3735 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3736 #ifdef SOCK_DEBUG
3737 	if (error) {
3738 		eprintsoline(so, error);
3739 	}
3740 #endif /* SOCK_DEBUG */
3741 	return (error);
3742 }
3743 
3744 /*
3745  * Sending data with options on a connected stream socket.
3746  * Assumes caller has verified that SS_ISCONNECTED is set.
3747  */
3748 static int
3749 sosend_svccmsg(struct sonode *so,
3750 		struct uio *uiop,
3751 		int more,
3752 		void *control,
3753 		t_uscalar_t controllen,
3754 		int flags)
3755 {
3756 	struct T_optdata_req	tdr;
3757 	mblk_t			*mp;
3758 	int			error;
3759 	ssize_t			iosize;
3760 	int			first = 1;
3761 	int			size;
3762 	struct fdbuf		*fdbuf;
3763 	t_uscalar_t		optlen;
3764 	void			*fds;
3765 	int			fdlen;
3766 	struct T_opthdr		toh;
3767 
3768 	dprintso(so, 1,
3769 	    ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3770 
3771 	/*
3772 	 * Has to be bound and connected. However, since no locks are
3773 	 * held the state could have changed after sotpi_sendmsg checked it
3774 	 * thus it is not possible to ASSERT on the state.
3775 	 */
3776 
3777 	/* Options on connection-oriented only when SM_OPTDATA set. */
3778 	if (!(so->so_mode & SM_OPTDATA))
3779 		return (EOPNOTSUPP);
3780 
3781 	do {
3782 		/*
3783 		 * Set the MORE flag if uio_resid does not fit in this
3784 		 * message or if the caller passed in "more".
3785 		 * Error for transports with zero tidu_size.
3786 		 */
3787 		tdr.PRIM_type = T_OPTDATA_REQ;
3788 		iosize = so->so_tidu_size;
3789 		if (iosize <= 0)
3790 			return (EMSGSIZE);
3791 		if (uiop->uio_resid > iosize) {
3792 			tdr.DATA_flag = 1;
3793 		} else {
3794 			if (more)
3795 				tdr.DATA_flag = 1;
3796 			else
3797 				tdr.DATA_flag = 0;
3798 			iosize = uiop->uio_resid;
3799 		}
3800 		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3801 		    tdr.DATA_flag, iosize));
3802 
3803 		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3804 		tdr.OPT_length = optlen;
3805 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3806 
3807 		size = (int)sizeof (tdr) + optlen;
3808 		/*
3809 		 * File descriptors only when SM_FDPASSING set.
3810 		 */
3811 		error = so_getfdopt(control, controllen,
3812 		    !(flags & MSG_XPG4_2), &fds, &fdlen);
3813 		if (error)
3814 			return (error);
3815 		if (fdlen != -1) {
3816 			if (!(so->so_mode & SM_FDPASSING))
3817 				return (EOPNOTSUPP);
3818 
3819 			error = fdbuf_create(fds, fdlen, &fdbuf);
3820 			if (error)
3821 				return (error);
3822 			mp = fdbuf_allocmsg(size, fdbuf);
3823 		} else {
3824 			mp = soallocproto(size, _ALLOC_INTR);
3825 			if (mp == NULL) {
3826 				/*
3827 				 * Caught a signal waiting for memory.
3828 				 * Let send* return EINTR.
3829 				 */
3830 				return (first ? EINTR : 0);
3831 			}
3832 		}
3833 		soappendmsg(mp, &tdr, sizeof (tdr));
3834 
3835 		if (fdlen != -1) {
3836 			ASSERT(fdbuf != NULL);
3837 			toh.level = SOL_SOCKET;
3838 			toh.name = SO_FILEP;
3839 			toh.len = fdbuf->fd_size +
3840 			    (t_uscalar_t)sizeof (struct T_opthdr);
3841 			toh.status = 0;
3842 			soappendmsg(mp, &toh, sizeof (toh));
3843 			soappendmsg(mp, fdbuf, fdbuf->fd_size);
3844 			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3845 		}
3846 		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3847 		/* At most 3 bytes left in the message */
3848 		ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3849 		ASSERT(MBLKL(mp) <= (ssize_t)size);
3850 
3851 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3852 
3853 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3854 		    0, MSG_BAND, 0);
3855 		if (error) {
3856 			if (!first && error == EWOULDBLOCK)
3857 				return (0);
3858 			eprintsoline(so, error);
3859 			return (error);
3860 		}
3861 		control = NULL;
3862 		first = 0;
3863 		if (uiop->uio_resid > 0) {
3864 			/*
3865 			 * Recheck for fatal errors. Fail write even though
3866 			 * some data have been written. This is consistent
3867 			 * with strwrite semantics and BSD sockets semantics.
3868 			 */
3869 			if (so->so_state & SS_CANTSENDMORE) {
3870 				tsignal(curthread, SIGPIPE);
3871 				eprintsoline(so, error);
3872 				return (EPIPE);
3873 			}
3874 			if (so->so_error != 0) {
3875 				mutex_enter(&so->so_lock);
3876 				error = sogeterr(so);
3877 				mutex_exit(&so->so_lock);
3878 				if (error != 0) {
3879 					eprintsoline(so, error);
3880 					return (error);
3881 				}
3882 			}
3883 		}
3884 	} while (uiop->uio_resid > 0);
3885 	return (0);
3886 }
3887 
3888 /*
3889  * Sending data on a datagram socket.
3890  * Assumes caller has verified that SS_ISBOUND etc. are set.
3891  *
3892  * For AF_UNIX the destination address is translated to an internal
3893  * name and the source address is passed as an option.
3894  */
3895 int
3896 sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
3897     struct uio *uiop, int flags)
3898 {
3899 	struct T_unitdata_req	tudr;
3900 	mblk_t			*mp;
3901 	int			error;
3902 	void			*addr;
3903 	socklen_t		addrlen;
3904 	void			*src;
3905 	socklen_t		srclen;
3906 	ssize_t			len;
3907 
3908 	ASSERT(name != NULL && namelen != 0);
3909 
3910 	len = uiop->uio_resid;
3911 	if (len > so->so_tidu_size) {
3912 		error = EMSGSIZE;
3913 		goto done;
3914 	}
3915 
3916 	/* Length and family checks */
3917 	error = so_addr_verify(so, name, namelen);
3918 	if (error != 0)
3919 		goto done;
3920 
3921 	if (so->so_state & SS_DIRECT)
3922 		return (sodgram_direct(so, name, namelen, uiop, flags));
3923 
3924 	if (so->so_family == AF_UNIX) {
3925 		if (so->so_state & SS_FADDR_NOXLATE) {
3926 			/*
3927 			 * Already have a transport internal address. Do not
3928 			 * pass any (transport internal) source address.
3929 			 */
3930 			addr = name;
3931 			addrlen = namelen;
3932 			src = NULL;
3933 			srclen = 0;
3934 		} else {
3935 			/*
3936 			 * Pass the sockaddr_un source address as an option
3937 			 * and translate the remote address.
3938 			 *
3939 			 * Note that this code does not prevent so_laddr_sa
3940 			 * from changing while it is being used. Thus
3941 			 * if an unbind+bind occurs concurrently with this
3942 			 * send the peer might see a partially new and a
3943 			 * partially old "from" address.
3944 			 */
3945 			src = so->so_laddr_sa;
3946 			srclen = (socklen_t)so->so_laddr_len;
3947 			dprintso(so, 1,
3948 			    ("sosend_dgram UNIX: srclen %d, src %p\n",
3949 			    srclen, src));
3950 			error = so_ux_addr_xlate(so, name, namelen,
3951 			    (flags & MSG_XPG4_2),
3952 			    &addr, &addrlen);
3953 			if (error) {
3954 				eprintsoline(so, error);
3955 				goto done;
3956 			}
3957 		}
3958 	} else {
3959 		addr = name;
3960 		addrlen = namelen;
3961 		src = NULL;
3962 		srclen = 0;
3963 	}
3964 	tudr.PRIM_type = T_UNITDATA_REQ;
3965 	tudr.DEST_length = addrlen;
3966 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3967 	if (srclen == 0) {
3968 		tudr.OPT_length = 0;
3969 		tudr.OPT_offset = 0;
3970 
3971 		mp = soallocproto2(&tudr, sizeof (tudr),
3972 		    addr, addrlen, 0, _ALLOC_INTR);
3973 		if (mp == NULL) {
3974 			/*
3975 			 * Caught a signal waiting for memory.
3976 			 * Let send* return EINTR.
3977 			 */
3978 			error = EINTR;
3979 			goto done;
3980 		}
3981 	} else {
3982 		/*
3983 		 * There is a AF_UNIX sockaddr_un to include as a source
3984 		 * address option.
3985 		 */
3986 		struct T_opthdr toh;
3987 		ssize_t size;
3988 
3989 		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
3990 		    _TPI_ALIGN_TOPT(srclen));
3991 		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3992 		    _TPI_ALIGN_TOPT(addrlen));
3993 
3994 		toh.level = SOL_SOCKET;
3995 		toh.name = SO_SRCADDR;
3996 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3997 		toh.status = 0;
3998 
3999 		size = tudr.OPT_offset + tudr.OPT_length;
4000 		mp = soallocproto2(&tudr, sizeof (tudr),
4001 		    addr, addrlen, size, _ALLOC_INTR);
4002 		if (mp == NULL) {
4003 			/*
4004 			 * Caught a signal waiting for memory.
4005 			 * Let send* return EINTR.
4006 			 */
4007 			error = EINTR;
4008 			goto done;
4009 		}
4010 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
4011 		soappendmsg(mp, &toh, sizeof (toh));
4012 		soappendmsg(mp, src, srclen);
4013 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
4014 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
4015 	}
4016 
4017 	if (audit_active)
4018 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4019 
4020 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4021 done:
4022 #ifdef SOCK_DEBUG
4023 	if (error) {
4024 		eprintsoline(so, error);
4025 	}
4026 #endif /* SOCK_DEBUG */
4027 	return (error);
4028 }
4029 
4030 /*
4031  * Sending data on a connected stream socket.
4032  * Assumes caller has verified that SS_ISCONNECTED is set.
4033  */
4034 int
4035 sosend_svc(struct sonode *so,
4036 	struct uio *uiop,
4037 	t_scalar_t prim,
4038 	int more,
4039 	int sflag)
4040 {
4041 	struct T_data_req	tdr;
4042 	mblk_t			*mp;
4043 	int			error;
4044 	ssize_t			iosize;
4045 	int			first = 1;
4046 
4047 	dprintso(so, 1,
4048 	    ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
4049 	    so, uiop->uio_resid, prim, sflag));
4050 
4051 	/*
4052 	 * Has to be bound and connected. However, since no locks are
4053 	 * held the state could have changed after sotpi_sendmsg checked it
4054 	 * thus it is not possible to ASSERT on the state.
4055 	 */
4056 
4057 	do {
4058 		/*
4059 		 * Set the MORE flag if uio_resid does not fit in this
4060 		 * message or if the caller passed in "more".
4061 		 * Error for transports with zero tidu_size.
4062 		 */
4063 		tdr.PRIM_type = prim;
4064 		iosize = so->so_tidu_size;
4065 		if (iosize <= 0)
4066 			return (EMSGSIZE);
4067 		if (uiop->uio_resid > iosize) {
4068 			tdr.MORE_flag = 1;
4069 		} else {
4070 			if (more)
4071 				tdr.MORE_flag = 1;
4072 			else
4073 				tdr.MORE_flag = 0;
4074 			iosize = uiop->uio_resid;
4075 		}
4076 		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4077 		    prim, tdr.MORE_flag, iosize));
4078 		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR);
4079 		if (mp == NULL) {
4080 			/*
4081 			 * Caught a signal waiting for memory.
4082 			 * Let send* return EINTR.
4083 			 */
4084 			if (first)
4085 				return (EINTR);
4086 			else
4087 				return (0);
4088 		}
4089 
4090 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4091 		    0, sflag | MSG_BAND, 0);
4092 		if (error) {
4093 			if (!first && error == EWOULDBLOCK)
4094 				return (0);
4095 			eprintsoline(so, error);
4096 			return (error);
4097 		}
4098 		first = 0;
4099 		if (uiop->uio_resid > 0) {
4100 			/*
4101 			 * Recheck for fatal errors. Fail write even though
4102 			 * some data have been written. This is consistent
4103 			 * with strwrite semantics and BSD sockets semantics.
4104 			 */
4105 			if (so->so_state & SS_CANTSENDMORE) {
4106 				tsignal(curthread, SIGPIPE);
4107 				eprintsoline(so, error);
4108 				return (EPIPE);
4109 			}
4110 			if (so->so_error != 0) {
4111 				mutex_enter(&so->so_lock);
4112 				error = sogeterr(so);
4113 				mutex_exit(&so->so_lock);
4114 				if (error != 0) {
4115 					eprintsoline(so, error);
4116 					return (error);
4117 				}
4118 			}
4119 		}
4120 	} while (uiop->uio_resid > 0);
4121 	return (0);
4122 }
4123 
4124 /*
4125  * Check the state for errors and call the appropriate send function.
4126  *
4127  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4128  * this function issues a setsockopt to toggle SO_DONTROUTE before and
4129  * after sending the message.
      *
      * Arguments:
      *	so	the socket to send on.
      *	msg	supplies the optional destination (msg_name, msg_namelen),
      *		the optional control data (msg_control, msg_controllen)
      *		and the send flags (msg_flags).
      *	uiop	the data to send.
      *
      * The following checks are made with so_lock held, before sending:
      *	- SS_CANTSENDMORE set: post SIGPIPE and return EPIPE.
      *	- A pending socket error (sogeterr()) is returned.
      *	- No destination and not connected: ENOTCONN for SM_CONNREQUIRED
      *	  sockets, otherwise EDESTADDRREQ.
      *	- Destination given on an unconnected SM_CONNREQUIRED socket:
      *	  ENOTCONN.
      *	- Destination given on a connected socket that is not
      *	  SM_CONNREQUIRED: EISCONN.
      *	- Destination given on an unbound socket: sotpi_bind() is called
      *	  first and its error, if any, is returned.
      *	- A delayed datagram error for the same destination is returned.
      *
      * After the lock is dropped, EOPNOTSUPP is returned for MSG_OOB on a
      * socket without SM_EXDATA, and for MSG_OOB combined with control data
      * on an SM_CONNREQUIRED socket.
      *
      * Returns 0 or an errno value from the checks above or from the send
      * routine that was called.
4130  */
4131 static int
4132 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
4133 {
4134 	int		so_state;
4135 	int		so_mode;
4136 	int		error;
4137 	struct sockaddr *name;
4138 	t_uscalar_t	namelen;
4139 	int		dontroute;
4140 	int		flags;
4141 
4142 	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4143 	    so, msg, msg->msg_flags,
4144 	    pr_state(so->so_state, so->so_mode), so->so_error));
4145 
     	/*
     	 * Take a snapshot of so_state (and, below, so_mode) under so_lock;
     	 * the checks and the dispatch that follow use these local copies.
     	 */
4146 	mutex_enter(&so->so_lock);
4147 	so_state = so->so_state;
4148 
4149 	if (so_state & SS_CANTSENDMORE) {
4150 		mutex_exit(&so->so_lock);
4151 		tsignal(curthread, SIGPIPE);
4152 		return (EPIPE);
4153 	}
4154 
4155 	if (so->so_error != 0) {
4156 		error = sogeterr(so);
4157 		if (error != 0) {
4158 			mutex_exit(&so->so_lock);
4159 			return (error);
4160 		}
4161 	}
4162 
4163 	name = (struct sockaddr *)msg->msg_name;
4164 	namelen = msg->msg_namelen;
4165 
4166 	so_mode = so->so_mode;
4167 
4168 	if (name == NULL) {
     		/* No destination supplied: the socket must be connected. */
4169 		if (!(so_state & SS_ISCONNECTED)) {
4170 			mutex_exit(&so->so_lock);
4171 			if (so_mode & SM_CONNREQUIRED)
4172 				return (ENOTCONN);
4173 			else
4174 				return (EDESTADDRREQ);
4175 		}
4176 		if (so_mode & SM_CONNREQUIRED) {
4177 			name = NULL;
4178 			namelen = 0;
4179 		} else {
4180 			/*
4181 			 * Note that this code does not prevent so_faddr_sa
4182 			 * from changing while it is being used. Thus
4183 			 * if an "unconnect"+connect occurs concurrently with
4184 			 * this send the datagram might be delivered to a
4185 			 * garbled address.
4186 			 */
4187 			ASSERT(so->so_faddr_sa);
4188 			name = so->so_faddr_sa;
4189 			namelen = (t_uscalar_t)so->so_faddr_len;
4190 		}
4191 	} else {
4192 		if (!(so_state & SS_ISCONNECTED) &&
4193 		    (so_mode & SM_CONNREQUIRED)) {
4194 			/* Required but not connected */
4195 			mutex_exit(&so->so_lock);
4196 			return (ENOTCONN);
4197 		}
4198 		/*
4199 		 * Ignore the address on connection-oriented sockets.
4200 		 * Just like BSD this code does not generate an error for
4201 		 * TCP (a CONNREQUIRED socket) when sending to an address
4202 		 * passed in with sendto/sendmsg. Instead the data is
4203 		 * delivered on the connection as if no address had been
4204 		 * supplied.
4205 		 */
     		/*
     		 * A connected socket that does not require a connection
     		 * rejects an explicit destination.
     		 */
4206 		if ((so_state & SS_ISCONNECTED) &&
4207 		    !(so_mode & SM_CONNREQUIRED)) {
4208 			mutex_exit(&so->so_lock);
4209 			return (EISCONN);
4210 		}
     		/* Not bound yet: bind before sending. */
4211 		if (!(so_state & SS_ISBOUND)) {
4212 			so_lock_single(so);	/* Set SOLOCKED */
4213 			error = sotpi_bind(so, NULL, 0,
4214 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD);
4215 			so_unlock_single(so, SOLOCKED);
4216 			if (error) {
4217 				mutex_exit(&so->so_lock);
4218 				eprintsoline(so, error);
4219 				return (error);
4220 			}
4221 		}
4222 		/*
4223 		 * Handle delayed datagram errors. These are only queued
4224 		 * when the application sets SO_DGRAM_ERRIND.
4225 		 * Return the error if we are sending to the address
4226 		 * that was returned in the last T_UDERROR_IND.
4227 		 * If sending to some other address discard the delayed
4228 		 * error indication.
4229 		 */
4230 		if (so->so_delayed_error) {
4231 			struct T_uderror_ind	*tudi;
4232 			void			*addr;
4233 			t_uscalar_t		addrlen;
4234 			boolean_t		match = B_FALSE;
4235 
4236 			ASSERT(so->so_eaddr_mp);
     			/* Consume the delayed error; it is reported at most once. */
4237 			error = so->so_delayed_error;
4238 			so->so_delayed_error = 0;
4239 			tudi = (struct T_uderror_ind *)so->so_eaddr_mp->b_rptr;
4240 			addrlen = tudi->DEST_length;
4241 			addr = sogetoff(so->so_eaddr_mp,
4242 			    tudi->DEST_offset,
4243 			    addrlen, 1);
4244 			ASSERT(addr);	/* Checked by strsock_proto */
4245 			switch (so->so_family) {
4246 			case AF_INET: {
4247 				/* Compare just IP address and port */
4248 				sin_t *sin1 = (sin_t *)name;
4249 				sin_t *sin2 = (sin_t *)addr;
4250 
4251 				if (addrlen == sizeof (sin_t) &&
4252 				    namelen == addrlen &&
4253 				    sin1->sin_port == sin2->sin_port &&
4254 				    sin1->sin_addr.s_addr ==
4255 				    sin2->sin_addr.s_addr)
4256 					match = B_TRUE;
4257 				break;
4258 			}
4259 			case AF_INET6: {
4260 				/* Compare just IP address and port. Not flow */
4261 				sin6_t *sin1 = (sin6_t *)name;
4262 				sin6_t *sin2 = (sin6_t *)addr;
4263 
4264 				if (addrlen == sizeof (sin6_t) &&
4265 				    namelen == addrlen &&
4266 				    sin1->sin6_port == sin2->sin6_port &&
4267 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4268 				    &sin2->sin6_addr))
4269 					match = B_TRUE;
4270 				break;
4271 			}
4272 			case AF_UNIX:
4273 			default:
     				/* Other families: compare the whole address. */
4274 				if (namelen == addrlen &&
4275 				    bcmp(name, addr, namelen) == 0)
4276 					match = B_TRUE;
4277 			}
4278 			if (match) {
4279 				freemsg(so->so_eaddr_mp);
4280 				so->so_eaddr_mp = NULL;
4281 				mutex_exit(&so->so_lock);
4282 #ifdef DEBUG
4283 				dprintso(so, 0,
4284 				    ("sockfs delayed error %d for %s\n",
4285 				    error,
4286 				    pr_addr(so->so_family, name, namelen)));
4287 #endif /* DEBUG */
4288 				return (error);
4289 			}
     			/* Different destination: drop the saved indication. */
4290 			freemsg(so->so_eaddr_mp);
4291 			so->so_eaddr_mp = NULL;
4292 		}
4293 	}
     	/* State checks are done; the send itself runs without so_lock. */
4294 	mutex_exit(&so->so_lock);
4295 
4296 	flags = msg->msg_flags;
4297 	dontroute = 0;
4298 	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4299 		uint32_t	val;
4300 
     		/* Turn SO_DONTROUTE on for this send; undone at "done". */
4301 		val = 1;
4302 		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4303 		    &val, (t_uscalar_t)sizeof (val));
4304 		if (error)
4305 			return (error);
4306 		dontroute = 1;
4307 	}
4308 
4309 	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4310 		error = EOPNOTSUPP;
4311 		goto done;
4312 	}
     	/* Control data present: use the send routines that take cmsgs. */
4313 	if (msg->msg_controllen != 0) {
4314 		if (!(so_mode & SM_CONNREQUIRED)) {
4315 			error = sosend_dgramcmsg(so, name, namelen, uiop,
4316 			    msg->msg_control, msg->msg_controllen, flags);
4317 		} else {
4318 			if (flags & MSG_OOB) {
4319 				/* Can't generate T_EXDATA_REQ with options */
4320 				error = EOPNOTSUPP;
4321 				goto done;
4322 			}
4323 			error = sosend_svccmsg(so, uiop,
4324 			    !(flags & MSG_EOR),
4325 			    msg->msg_control, msg->msg_controllen,
4326 			    flags);
4327 		}
4328 		goto done;
4329 	}
4330 
4331 	if (!(so_mode & SM_CONNREQUIRED)) {
4332 		/*
4333 		 * If there is no SO_DONTROUTE to turn off return immediately
4334 		 * from send_dgram. This can allow tail-call optimizations.
4335 		 */
4336 		if (!dontroute) {
4337 			return (sosend_dgram(so, name, namelen, uiop, flags));
4338 		}
4339 		error = sosend_dgram(so, name, namelen, uiop, flags);
4340 	} else {
4341 		t_scalar_t prim;
4342 		int sflag;
4343 
4344 		/* Ignore msg_name in the connected state */
4345 		if (flags & MSG_OOB) {
4346 			prim = T_EXDATA_REQ;
4347 			/*
4348 			 * Send down T_EXDATA_REQ even if there is flow
4349 			 * control for data.
4350 			 */
4351 			sflag = MSG_IGNFLOW;
4352 		} else {
4353 			if (so_mode & SM_BYTESTREAM) {
4354 				/* Byte stream transport - use write */
4355 
4356 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4357 				/*
4358 				 * If there is no SO_DONTROUTE to turn off,
4359 				 * SS_DIRECT is on, and there is no flow
4360 				 * control, we can take the fast path.
4361 				 */
4362 				if (!dontroute &&
4363 				    (so_state & SS_DIRECT) &&
4364 				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4365 					return (sostream_direct(so, uiop,
4366 					    NULL, CRED()));
4367 				}
4368 				error = strwrite(SOTOV(so), uiop, CRED());
4369 				goto done;
4370 			}
4371 			prim = T_DATA_REQ;
4372 			sflag = 0;
4373 		}
4374 		/*
4375 		 * If there is no SO_DONTROUTE to turn off return immediately
4376 		 * from sosend_svc. This can allow tail-call optimizations.
4377 		 */
4378 		if (!dontroute)
4379 			return (sosend_svc(so, uiop, prim,
4380 			    !(flags & MSG_EOR), sflag));
4381 		error = sosend_svc(so, uiop, prim,
4382 		    !(flags & MSG_EOR), sflag);
4383 	}
     	/*
     	 * Falling through to here (rather than jumping to "done") only
     	 * happens on the paths where SO_DONTROUTE was turned on above; the
     	 * !dontroute cases returned directly.
     	 */
4384 	ASSERT(dontroute);
4385 done:
4386 	if (dontroute) {
4387 		uint32_t	val;
4388 
     		/*
     		 * Turn SO_DONTROUTE back off.  The result of this setsockopt
     		 * is deliberately discarded; the send's own error is returned.
     		 */
4389 		val = 0;
4390 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4391 		    &val, (t_uscalar_t)sizeof (val));
4392 	}
4393 	return (error);
4394 }
4395 
4396 /*
4397  * Sending data on a datagram socket.
4398  * Assumes caller has verified that SS_ISBOUND etc. are set.
4399  */
4400 /* ARGSUSED */
4401 static int
4402 sodgram_direct(struct sonode *so, struct sockaddr *name,
4403     socklen_t namelen, struct uio *uiop, int flags)
4404 {
4405 	struct T_unitdata_req	tudr;
4406 	mblk_t			*mp = NULL;
4407 	int			error = 0;
4408 	void			*addr;
4409 	socklen_t		addrlen;
4410 	ssize_t			len;
4411 	struct stdata		*stp = SOTOV(so)->v_stream;
4412 	int			so_state;
4413 	queue_t			*udp_wq;
4414 	boolean_t		connected;
4415 	mblk_t			*mpdata = NULL;
4416 
4417 	ASSERT(name != NULL && namelen != 0);
4418 	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4419 	ASSERT(!(so->so_mode & SM_EXDATA));
4420 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4421 	ASSERT(SOTOV(so)->v_type == VSOCK);
4422 
4423 	/* Caller checked for proper length */
4424 	len = uiop->uio_resid;
4425 	ASSERT(len <= so->so_tidu_size);
4426 
4427 	/* Length and family checks have been done by caller */
4428 	ASSERT(name->sa_family == so->so_family);
4429 	ASSERT(so->so_family == AF_INET ||
4430 	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4431 	ASSERT(so->so_family == AF_INET6 ||
4432 	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4433 
4434 	addr = name;
4435 	addrlen = namelen;
4436 
4437 	if (stp->sd_sidp != NULL &&
4438 	    (error = straccess(stp, JCWRITE)) != 0)
4439 		goto done;
4440 
4441 	so_state = so->so_state;
4442 
4443 	connected = so_state & SS_ISCONNECTED;
4444 	if (!connected) {
4445 		tudr.PRIM_type = T_UNITDATA_REQ;
4446 		tudr.DEST_length = addrlen;
4447 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4448 		tudr.OPT_length = 0;
4449 		tudr.OPT_offset = 0;
4450 
4451 		mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4452 		    _ALLOC_INTR);
4453 		if (mp == NULL) {
4454 			/*
4455 			 * Caught a signal waiting for memory.
4456 			 * Let send* return EINTR.
4457 			 */
4458 			error = EINTR;
4459 			goto done;
4460 		}
4461 	}
4462 
4463 	/*
4464 	 * For UDP we don't break up the copyin into smaller pieces
4465 	 * as in the TCP case.  That means if ENOMEM is returned by
4466 	 * mcopyinuio() then the uio vector has not been modified at
4467 	 * all and we fallback to either strwrite() or kstrputmsg()
4468 	 * below.  Note also that we never generate priority messages
4469 	 * from here.
4470 	 */
4471 	udp_wq = stp->sd_wrq->q_next;
4472 	if (canput(udp_wq) &&
4473 	    (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4474 		ASSERT(DB_TYPE(mpdata) == M_DATA);
4475 		ASSERT(uiop->uio_resid == 0);
4476 		if (!connected)
4477 			linkb(mp, mpdata);
4478 		else
4479 			mp = mpdata;
4480 		if (audit_active)
4481 			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4482 
4483 		udp_wput(udp_wq, mp);
4484 		return (0);
4485 	}
4486 
4487 	ASSERT(mpdata == NULL);
4488 	if (error != 0 && error != ENOMEM) {
4489 		freemsg(mp);
4490 		return (error);
4491 	}
4492 
4493 	/*
4494 	 * For connected, let strwrite() handle the blocking case.
4495 	 * Otherwise we fall thru and use kstrputmsg().
4496 	 */
4497 	if (connected)
4498 		return (strwrite(SOTOV(so), uiop, CRED()));
4499 
4500 	if (audit_active)
4501 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4502 
4503 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4504 done:
4505 #ifdef SOCK_DEBUG
4506 	if (error != 0) {
4507 		eprintsoline(so, error);
4508 	}
4509 #endif /* SOCK_DEBUG */
4510 	return (error);
4511 }
4512 
4513 int
4514 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4515 {
4516 	struct stdata *stp = SOTOV(so)->v_stream;
4517 	ssize_t iosize, rmax, maxblk;
4518 	queue_t *tcp_wq = stp->sd_wrq->q_next;
4519 	mblk_t *newmp;
4520 	int error = 0, wflag = 0;
4521 
4522 	ASSERT(so->so_mode & SM_BYTESTREAM);
4523 	ASSERT(SOTOV(so)->v_type == VSOCK);
4524 
4525 	if (stp->sd_sidp != NULL &&
4526 	    (error = straccess(stp, JCWRITE)) != 0)
4527 		return (error);
4528 
4529 	if (uiop == NULL) {
4530 		/*
4531 		 * kstrwritemp() should have checked sd_flag and
4532 		 * flow-control before coming here.  If we end up
4533 		 * here it means that we can simply pass down the
4534 		 * data to tcp.
4535 		 */
4536 		ASSERT(mp != NULL);
4537 		if (stp->sd_wputdatafunc != NULL) {
4538 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4539 			    NULL, NULL, NULL);
4540 			if (newmp == NULL) {
4541 				/* The caller will free mp */
4542 				return (ECOMM);
4543 			}
4544 			mp = newmp;
4545 		}
4546 		tcp_wput(tcp_wq, mp);
4547 		return (0);
4548 	}
4549 
4550 	/* Fallback to strwrite() to do proper error handling */
4551 	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4552 		return (strwrite(SOTOV(so), uiop, cr));
4553 
4554 	rmax = stp->sd_qn_maxpsz;
4555 	ASSERT(rmax >= 0 || rmax == INFPSZ);
4556 	if (rmax == 0 || uiop->uio_resid <= 0)
4557 		return (0);
4558 
4559 	if (rmax == INFPSZ)
4560 		rmax = uiop->uio_resid;
4561 
4562 	maxblk = stp->sd_maxblk;
4563 
4564 	for (;;) {
4565 		iosize = MIN(uiop->uio_resid, rmax);
4566 
4567 		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4568 		if (mp == NULL) {
4569 			/*
4570 			 * Fallback to strwrite() for ENOMEM; if this
4571 			 * is our first time in this routine and the uio
4572 			 * vector has not been modified, we will end up
4573 			 * calling strwrite() without any flag set.
4574 			 */
4575 			if (error == ENOMEM)
4576 				goto slow_send;
4577 			else
4578 				return (error);
4579 		}
4580 		ASSERT(uiop->uio_resid >= 0);
4581 		/*
4582 		 * If mp is non-NULL and ENOMEM is set, it means that
4583 		 * mcopyinuio() was able to break down some of the user
4584 		 * data into one or more mblks.  Send the partial data
4585 		 * to tcp and let the rest be handled in strwrite().
4586 		 */
4587 		ASSERT(error == 0 || error == ENOMEM);
4588 		if (stp->sd_wputdatafunc != NULL) {
4589 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4590 			    NULL, NULL, NULL);
4591 			if (newmp == NULL) {
4592 				/* The caller will free mp */
4593 				return (ECOMM);
4594 			}
4595 			mp = newmp;
4596 		}
4597 		tcp_wput(tcp_wq, mp);
4598 
4599 		wflag |= NOINTR;
4600 
4601 		if (uiop->uio_resid == 0) {	/* No more data; we're done */
4602 			ASSERT(error == 0);
4603 			break;
4604 		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4605 		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4606 slow_send:
4607 			/*
4608 			 * We were able to send down partial data using
4609 			 * the direct call interface, but are now relying
4610 			 * on strwrite() to handle the non-fastpath cases.
4611 			 * If the socket is blocking we will sleep in
4612 			 * strwaitq() until write is permitted, otherwise,
4613 			 * we will need to return the amount of bytes
4614 			 * written so far back to the app.  This is the
4615 			 * reason why we pass NOINTR flag to strwrite()
4616 			 * for non-blocking socket, because we don't want
4617 			 * to return EAGAIN when portion of the user data
4618 			 * has actually been sent down.
4619 			 */
4620 			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4621 		}
4622 	}
4623 	return (0);
4624 }
4625 
4626 /*
4627  * Update so_faddr by asking the transport (unless AF_UNIX).
4628  */
4629 int
4630 sotpi_getpeername(struct sonode *so)
4631 {
4632 	struct strbuf	strbuf;
4633 	int		error = 0, res;
4634 	void		*addr;
4635 	t_uscalar_t	addrlen;
4636 	k_sigset_t	smask;
4637 
4638 	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4639 	    so, pr_state(so->so_state, so->so_mode)));
4640 
4641 	mutex_enter(&so->so_lock);
4642 	so_lock_single(so);	/* Set SOLOCKED */
4643 	if (!(so->so_state & SS_ISCONNECTED)) {
4644 		error = ENOTCONN;
4645 		goto done;
4646 	}
4647 	/* Added this check for X/Open */
4648 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4649 		error = EINVAL;
4650 		if (xnet_check_print) {
4651 			printf("sockfs: X/Open getpeername check => EINVAL\n");
4652 		}
4653 		goto done;
4654 	}
4655 #ifdef DEBUG
4656 	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4657 	    pr_addr(so->so_family, so->so_faddr_sa,
4658 	    (t_uscalar_t)so->so_faddr_len)));
4659 #endif /* DEBUG */
4660 
4661 	if (so->so_family == AF_UNIX) {
4662 		/* Transport has different name space - return local info */
4663 		error = 0;
4664 		goto done;
4665 	}
4666 
4667 	ASSERT(so->so_faddr_sa);
4668 	/* Allocate local buffer to use with ioctl */
4669 	addrlen = (t_uscalar_t)so->so_faddr_maxlen;
4670 	mutex_exit(&so->so_lock);
4671 	addr = kmem_alloc(addrlen, KM_SLEEP);
4672 
4673 	/*
4674 	 * Issue TI_GETPEERNAME with signals masked.
4675 	 * Put the result in so_faddr_sa so that getpeername works after
4676 	 * a shutdown(output).
4677 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4678 	 * back to the socket.
4679 	 */
4680 	strbuf.buf = addr;
4681 	strbuf.maxlen = addrlen;
4682 	strbuf.len = 0;
4683 
4684 	sigintr(&smask, 0);
4685 	res = 0;
4686 	ASSERT(CRED());
4687 	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4688 	    0, K_TO_K, CRED(), &res);
4689 	sigunintr(&smask);
4690 
4691 	mutex_enter(&so->so_lock);
4692 	/*
4693 	 * If there is an error record the error in so_error put don't fail
4694 	 * the getpeername. Instead fallback on the recorded
4695 	 * so->so_faddr_sa.
4696 	 */
4697 	if (error) {
4698 		/*
4699 		 * Various stream head errors can be returned to the ioctl.
4700 		 * However, it is impossible to determine which ones of
4701 		 * these are really socket level errors that were incorrectly
4702 		 * consumed by the ioctl. Thus this code silently ignores the
4703 		 * error - to code explicitly does not reinstate the error
4704 		 * using soseterror().
4705 		 * Experiments have shows that at least this set of
4706 		 * errors are reported and should not be reinstated on the
4707 		 * socket:
4708 		 *	EINVAL	E.g. if an I_LINK was in effect when
4709 		 *		getpeername was called.
4710 		 *	EPIPE	The ioctl error semantics prefer the write
4711 		 *		side error over the read side error.
4712 		 *	ENOTCONN The transport just got disconnected but
4713 		 *		sockfs had not yet seen the T_DISCON_IND
4714 		 *		when issuing the ioctl.
4715 		 */
4716 		error = 0;
4717 	} else if (res == 0 && strbuf.len > 0 &&
4718 	    (so->so_state & SS_ISCONNECTED)) {
4719 		ASSERT(strbuf.len <= (int)so->so_faddr_maxlen);
4720 		so->so_faddr_len = (socklen_t)strbuf.len;
4721 		bcopy(addr, so->so_faddr_sa, so->so_faddr_len);
4722 		so->so_state |= SS_FADDR_VALID;
4723 	}
4724 	kmem_free(addr, addrlen);
4725 #ifdef DEBUG
4726 	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
4727 	    pr_addr(so->so_family, so->so_faddr_sa,
4728 	    (t_uscalar_t)so->so_faddr_len)));
4729 #endif /* DEBUG */
4730 done:
4731 	so_unlock_single(so, SOLOCKED);
4732 	mutex_exit(&so->so_lock);
4733 	return (error);
4734 }
4735 
4736 /*
4737  * Update so_laddr by asking the transport (unless AF_UNIX).
4738  */
4739 int
4740 sotpi_getsockname(struct sonode *so)
4741 {
4742 	struct strbuf	strbuf;
4743 	int		error = 0, res;
4744 	void		*addr;
4745 	t_uscalar_t	addrlen;
4746 	k_sigset_t	smask;
4747 
4748 	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
4749 	    so, pr_state(so->so_state, so->so_mode)));
4750 
4751 	mutex_enter(&so->so_lock);
4752 	so_lock_single(so);	/* Set SOLOCKED */
4753 	if (!(so->so_state & SS_ISBOUND) && so->so_family != AF_UNIX) {
4754 		/* Return an all zero address except for the family */
4755 		if (so->so_family == AF_INET)
4756 			so->so_laddr_len = (socklen_t)sizeof (sin_t);
4757 		else if (so->so_family == AF_INET6)
4758 			so->so_laddr_len = (socklen_t)sizeof (sin6_t);
4759 		ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
4760 		bzero(so->so_laddr_sa, so->so_laddr_len);
4761 		/*
4762 		 * Can not assume there is a sa_family for all
4763 		 * protocol families.
4764 		 */
4765 		if (so->so_family == AF_INET || so->so_family == AF_INET6)
4766 			so->so_laddr_sa->sa_family = so->so_family;
4767 	}
4768 #ifdef DEBUG
4769 	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
4770 	    pr_addr(so->so_family, so->so_laddr_sa,
4771 	    (t_uscalar_t)so->so_laddr_len)));
4772 #endif /* DEBUG */
4773 	if (so->so_family == AF_UNIX) {
4774 		/* Transport has different name space - return local info */
4775 		error = 0;
4776 		goto done;
4777 	}
4778 	if (!(so->so_state & SS_ISBOUND)) {
4779 		/* If not bound, then nothing to return. */
4780 		error = 0;
4781 		goto done;
4782 	}
4783 	/* Allocate local buffer to use with ioctl */
4784 	addrlen = (t_uscalar_t)so->so_laddr_maxlen;
4785 	mutex_exit(&so->so_lock);
4786 	addr = kmem_alloc(addrlen, KM_SLEEP);
4787 
4788 	/*
4789 	 * Issue TI_GETMYNAME with signals masked.
4790 	 * Put the result in so_laddr_sa so that getsockname works after
4791 	 * a shutdown(output).
4792 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4793 	 * back to the socket.
4794 	 */
4795 	strbuf.buf = addr;
4796 	strbuf.maxlen = addrlen;
4797 	strbuf.len = 0;
4798 
4799 	sigintr(&smask, 0);
4800 	res = 0;
4801 	ASSERT(CRED());
4802 	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
4803 	    0, K_TO_K, CRED(), &res);
4804 	sigunintr(&smask);
4805 
4806 	mutex_enter(&so->so_lock);
4807 	/*
4808 	 * If there is an error record the error in so_error put don't fail
4809 	 * the getsockname. Instead fallback on the recorded
4810 	 * so->so_laddr_sa.
4811 	 */
4812 	if (error) {
4813 		/*
4814 		 * Various stream head errors can be returned to the ioctl.
4815 		 * However, it is impossible to determine which ones of
4816 		 * these are really socket level errors that were incorrectly
4817 		 * consumed by the ioctl. Thus this code silently ignores the
4818 		 * error - to code explicitly does not reinstate the error
4819 		 * using soseterror().
4820 		 * Experiments have shows that at least this set of
4821 		 * errors are reported and should not be reinstated on the
4822 		 * socket:
4823 		 *	EINVAL	E.g. if an I_LINK was in effect when
4824 		 *		getsockname was called.
4825 		 *	EPIPE	The ioctl error semantics prefer the write
4826 		 *		side error over the read side error.
4827 		 */
4828 		error = 0;
4829 	} else if (res == 0 && strbuf.len > 0 &&
4830 	    (so->so_state & SS_ISBOUND)) {
4831 		ASSERT(strbuf.len <= (int)so->so_laddr_maxlen);
4832 		so->so_laddr_len = (socklen_t)strbuf.len;
4833 		bcopy(addr, so->so_laddr_sa, so->so_laddr_len);
4834 		so->so_state |= SS_LADDR_VALID;
4835 	}
4836 	kmem_free(addr, addrlen);
4837 #ifdef DEBUG
4838 	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
4839 	    pr_addr(so->so_family, so->so_laddr_sa,
4840 	    (t_uscalar_t)so->so_laddr_len)));
4841 #endif /* DEBUG */
4842 done:
4843 	so_unlock_single(so, SOLOCKED);
4844 	mutex_exit(&so->so_lock);
4845 	return (error);
4846 }
4847 
4848 /*
4849  * Get socket options. For SOL_SOCKET options some options are handled
4850  * by the sockfs while others use the value recorded in the sonode as a
4851  * fallback should the T_SVR4_OPTMGMT_REQ fail.
4852  *
4853  * On return at most *optlenp bytes are copied to optval.
4854  */
4855 int
4856 sotpi_getsockopt(struct sonode *so, int level, int option_name,
4857 		void *optval, socklen_t *optlenp, int flags)
4858 {
4859 	struct T_optmgmt_req	optmgmt_req;
4860 	struct T_optmgmt_ack	*optmgmt_ack;
4861 	struct opthdr		oh;
4862 	struct opthdr		*opt_res;
4863 	mblk_t			*mp = NULL;
4864 	int			error = 0;
4865 	void			*option = NULL;	/* Set if fallback value */
4866 	t_uscalar_t		maxlen = *optlenp;
4867 	t_uscalar_t		len;
4868 	uint32_t		value;
4869 
4870 	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
4871 	    so, level, option_name, optval, optlenp,
4872 	    pr_state(so->so_state, so->so_mode)));
4873 
4874 	mutex_enter(&so->so_lock);
4875 	so_lock_single(so);	/* Set SOLOCKED */
4876 
4877 	/*
4878 	 * Check for SOL_SOCKET options.
4879 	 * Certain SOL_SOCKET options are returned directly whereas
4880 	 * others only provide a default (fallback) value should
4881 	 * the T_SVR4_OPTMGMT_REQ fail.
4882 	 */
4883 	if (level == SOL_SOCKET) {
4884 		/* Check parameters */
4885 		switch (option_name) {
4886 		case SO_TYPE:
4887 		case SO_ERROR:
4888 		case SO_DEBUG:
4889 		case SO_ACCEPTCONN:
4890 		case SO_REUSEADDR:
4891 		case SO_KEEPALIVE:
4892 		case SO_DONTROUTE:
4893 		case SO_BROADCAST:
4894 		case SO_USELOOPBACK:
4895 		case SO_OOBINLINE:
4896 		case SO_SNDBUF:
4897 		case SO_RCVBUF:
4898 #ifdef notyet
4899 		case SO_SNDLOWAT:
4900 		case SO_RCVLOWAT:
4901 		case SO_SNDTIMEO:
4902 		case SO_RCVTIMEO:
4903 #endif /* notyet */
4904 		case SO_DOMAIN:
4905 		case SO_DGRAM_ERRIND:
4906 			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
4907 				error = EINVAL;
4908 				eprintsoline(so, error);
4909 				goto done2;
4910 			}
4911 			break;
4912 		case SO_LINGER:
4913 			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
4914 				error = EINVAL;
4915 				eprintsoline(so, error);
4916 				goto done2;
4917 			}
4918 			break;
4919 		}
4920 
4921 		len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
4922 
4923 		switch (option_name) {
4924 		case SO_TYPE:
4925 			value = so->so_type;
4926 			option = &value;
4927 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4928 
4929 		case SO_ERROR:
4930 			value = sogeterr(so);
4931 			option = &value;
4932 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4933 
4934 		case SO_ACCEPTCONN:
4935 			if (so->so_state & SS_ACCEPTCONN)
4936 				value = SO_ACCEPTCONN;
4937 			else
4938 				value = 0;
4939 #ifdef DEBUG
4940 			if (value) {
4941 				dprintso(so, 1,
4942 				    ("sotpi_getsockopt: 0x%x is set\n",
4943 				    option_name));
4944 			} else {
4945 				dprintso(so, 1,
4946 				    ("sotpi_getsockopt: 0x%x not set\n",
4947 				    option_name));
4948 			}
4949 #endif /* DEBUG */
4950 			option = &value;
4951 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4952 
4953 		case SO_DEBUG:
4954 		case SO_REUSEADDR:
4955 		case SO_KEEPALIVE:
4956 		case SO_DONTROUTE:
4957 		case SO_BROADCAST:
4958 		case SO_USELOOPBACK:
4959 		case SO_OOBINLINE:
4960 		case SO_DGRAM_ERRIND:
4961 			value = (so->so_options & option_name);
4962 #ifdef DEBUG
4963 			if (value) {
4964 				dprintso(so, 1,
4965 				    ("sotpi_getsockopt: 0x%x is set\n",
4966 				    option_name));
4967 			} else {
4968 				dprintso(so, 1,
4969 				    ("sotpi_getsockopt: 0x%x not set\n",
4970 				    option_name));
4971 			}
4972 #endif /* DEBUG */
4973 			option = &value;
4974 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4975 
4976 		/*
4977 		 * The following options are only returned by sockfs when the
4978 		 * T_SVR4_OPTMGMT_REQ fails.
4979 		 */
4980 		case SO_LINGER:
4981 			option = &so->so_linger;
4982 			len = (t_uscalar_t)sizeof (struct linger);
4983 			break;
4984 		case SO_SNDBUF: {
4985 			ssize_t lvalue;
4986 
4987 			/*
4988 			 * If the option has not been set then get a default
4989 			 * value from the read queue. This value is
4990 			 * returned if the transport fails
4991 			 * the T_SVR4_OPTMGMT_REQ.
4992 			 */
4993 			lvalue = so->so_sndbuf;
4994 			if (lvalue == 0) {
4995 				mutex_exit(&so->so_lock);
4996 				(void) strqget(strvp2wq(SOTOV(so))->q_next,
4997 				    QHIWAT, 0, &lvalue);
4998 				mutex_enter(&so->so_lock);
4999 				dprintso(so, 1,
5000 				    ("got SO_SNDBUF %ld from q\n", lvalue));
5001 			}
5002 			value = (int)lvalue;
5003 			option = &value;
5004 			len = (t_uscalar_t)sizeof (so->so_sndbuf);
5005 			break;
5006 		}
5007 		case SO_RCVBUF: {
5008 			ssize_t lvalue;
5009 
5010 			/*
5011 			 * If the option has not been set then get a default
5012 			 * value from the read queue. This value is
5013 			 * returned if the transport fails
5014 			 * the T_SVR4_OPTMGMT_REQ.
5015 			 *
5016 			 * XXX If SO_RCVBUF has been set and this is an
5017 			 * XPG 4.2 application then do not ask the transport
5018 			 * since the transport might adjust the value and not
5019 			 * return exactly what was set by the application.
5020 			 * For non-XPG 4.2 application we return the value
5021 			 * that the transport is actually using.
5022 			 */
5023 			lvalue = so->so_rcvbuf;
5024 			if (lvalue == 0) {
5025 				mutex_exit(&so->so_lock);
5026 				(void) strqget(RD(strvp2wq(SOTOV(so))),
5027 				    QHIWAT, 0, &lvalue);
5028 				mutex_enter(&so->so_lock);
5029 				dprintso(so, 1,
5030 				    ("got SO_RCVBUF %ld from q\n", lvalue));
5031 			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
5032 				value = (int)lvalue;
5033 				option = &value;
5034 				goto copyout;	/* skip asking transport */
5035 			}
5036 			value = (int)lvalue;
5037 			option = &value;
5038 			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5039 			break;
5040 		}
5041 		case SO_DOMAIN:
5042 			value = so->so_family;
5043 			option = &value;
5044 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5045 
5046 #ifdef notyet
5047 		/*
5048 		 * We do not implement the semantics of these options
5049 		 * thus we shouldn't implement the options either.
5050 		 */
5051 		case SO_SNDLOWAT:
5052 			value = so->so_sndlowat;
5053 			option = &value;
5054 			break;
5055 		case SO_RCVLOWAT:
5056 			value = so->so_rcvlowat;
5057 			option = &value;
5058 			break;
5059 		case SO_SNDTIMEO:
5060 			value = so->so_sndtimeo;
5061 			option = &value;
5062 			break;
5063 		case SO_RCVTIMEO:
5064 			value = so->so_rcvtimeo;
5065 			option = &value;
5066 			break;
5067 #endif /* notyet */
5068 		}
5069 	}
5070 
5071 	mutex_exit(&so->so_lock);
5072 
5073 	/* Send request */
5074 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5075 	optmgmt_req.MGMT_flags = T_CHECK;
5076 	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5077 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5078 
5079 	oh.level = level;
5080 	oh.name = option_name;
5081 	oh.len = maxlen;
5082 
5083 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5084 	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP);
5085 	/* Let option management work in the presence of data flow control */
5086 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5087 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5088 	mp = NULL;
5089 	mutex_enter(&so->so_lock);
5090 	if (error) {
5091 		eprintsoline(so, error);
5092 		goto done2;
5093 	}
5094 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5095 	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5096 	if (error) {
5097 		if (option != NULL) {
5098 			/* We have a fallback value */
5099 			error = 0;
5100 			goto copyout;
5101 		}
5102 		eprintsoline(so, error);
5103 		goto done2;
5104 	}
5105 	ASSERT(mp);
5106 	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5107 	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5108 	    optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5109 	if (opt_res == NULL) {
5110 		if (option != NULL) {
5111 			/* We have a fallback value */
5112 			error = 0;
5113 			goto copyout;
5114 		}
5115 		error = EPROTO;
5116 		eprintsoline(so, error);
5117 		goto done;
5118 	}
5119 	option = &opt_res[1];
5120 
5121 	/* check to ensure that the option is within bounds */
5122 	if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5123 	    (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5124 		if (option != NULL) {
5125 			/* We have a fallback value */
5126 			error = 0;
5127 			goto copyout;
5128 		}
5129 		error = EPROTO;
5130 		eprintsoline(so, error);
5131 		goto done;
5132 	}
5133 
5134 	len = opt_res->len;
5135 
5136 copyout: {
5137 		t_uscalar_t size = MIN(len, maxlen);
5138 		bcopy(option, optval, size);
5139 		bcopy(&size, optlenp, sizeof (size));
5140 	}
5141 done:
5142 	freemsg(mp);
5143 done2:
5144 	so_unlock_single(so, SOLOCKED);
5145 	mutex_exit(&so->so_lock);
5146 	return (error);
5147 }
5148 
5149 /*
5150  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5151  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5152  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5153  * setsockopt has to work even if the transport does not support the option.
5154  */
5155 int
5156 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5157 	const void *optval, t_uscalar_t optlen)
5158 {
5159 	struct T_optmgmt_req	optmgmt_req;
5160 	struct opthdr		oh;
5161 	mblk_t			*mp;
5162 	int			error = 0;
5163 	boolean_t		handled = B_FALSE;
5164 
5165 	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5166 	    so, level, option_name, optval, optlen,
5167 	    pr_state(so->so_state, so->so_mode)));
5168 
5169 
5170 	/* X/Open requires this check */
5171 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5172 		if (xnet_check_print)
5173 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
5174 		return (EINVAL);
5175 	}
5176 
5177 	/* Caller allocates aligned optval, or passes null */
5178 	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
5179 	/* If optval is null optlen is 0, and vice-versa */
5180 	ASSERT(optval != NULL || optlen == 0);
5181 	ASSERT(optlen != 0 || optval == NULL);
5182 
5183 	mutex_enter(&so->so_lock);
5184 	so_lock_single(so);	/* Set SOLOCKED */
5185 	mutex_exit(&so->so_lock);
5186 
5187 	/*
5188 	 * For SOCKET or TCP level options, try to set it here itself
5189 	 * provided socket has not been popped and we know the tcp
5190 	 * structure (stored in so_priv).
5191 	 */
5192 	if ((level == SOL_SOCKET || level == IPPROTO_TCP) &&
5193 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
5194 	    (so->so_version == SOV_SOCKSTREAM) && (so->so_priv != NULL)) {
5195 		tcp_t		*tcp = so->so_priv;
5196 		boolean_t	onoff;
5197 
5198 #define	intvalue	(*(int32_t *)optval)
5199 
5200 		switch (level) {
5201 		case SOL_SOCKET:
5202 			switch (option_name) {		/* Check length param */
5203 			case SO_DEBUG:
5204 			case SO_REUSEADDR:
5205 			case SO_DONTROUTE:
5206 			case SO_BROADCAST:
5207 			case SO_USELOOPBACK:
5208 			case SO_OOBINLINE:
5209 			case SO_DGRAM_ERRIND:
5210 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5211 					error = EINVAL;
5212 					eprintsoline(so, error);
5213 					mutex_enter(&so->so_lock);
5214 					goto done2;
5215 				}
5216 				ASSERT(optval);
5217 				onoff = intvalue != 0;
5218 				handled = B_TRUE;
5219 				break;
5220 			case SO_LINGER:
5221 				if (optlen !=
5222 				    (t_uscalar_t)sizeof (struct linger)) {
5223 					error = EINVAL;
5224 					eprintsoline(so, error);
5225 					mutex_enter(&so->so_lock);
5226 					goto done2;
5227 				}
5228 				ASSERT(optval);
5229 				handled = B_TRUE;
5230 				break;
5231 			}
5232 
5233 			switch (option_name) {			/* Do actions */
5234 			case SO_LINGER: {
5235 				struct linger *lgr = (struct linger *)optval;
5236 
5237 				if (lgr->l_onoff) {
5238 					tcp->tcp_linger = 1;
5239 					tcp->tcp_lingertime = lgr->l_linger;
5240 					so->so_linger.l_onoff = SO_LINGER;
5241 					so->so_options |= SO_LINGER;
5242 				} else {
5243 					tcp->tcp_linger = 0;
5244 					tcp->tcp_lingertime = 0;
5245 					so->so_linger.l_onoff = 0;
5246 					so->so_options &= ~SO_LINGER;
5247 				}
5248 				so->so_linger.l_linger = lgr->l_linger;
5249 				handled = B_TRUE;
5250 				break;
5251 			}
5252 			case SO_DEBUG:
5253 				tcp->tcp_debug = onoff;
5254 #ifdef SOCK_TEST
5255 				if (intvalue & 2)
5256 					sock_test_timelimit = 10 * hz;
5257 				else
5258 					sock_test_timelimit = 0;
5259 
5260 				if (intvalue & 4)
5261 					do_useracc = 0;
5262 				else
5263 					do_useracc = 1;
5264 #endif /* SOCK_TEST */
5265 				break;
5266 			case SO_DONTROUTE:
5267 				/*
5268 				 * SO_DONTROUTE, SO_USELOOPBACK and
5269 				 * SO_BROADCAST are only of interest to IP.
5270 				 * We track them here only so
5271 				 * that we can report their current value.
5272 				 */
5273 				tcp->tcp_dontroute = onoff;
5274 				if (onoff)
5275 					so->so_options |= option_name;
5276 				else
5277 					so->so_options &= ~option_name;
5278 				break;
5279 			case SO_USELOOPBACK:
5280 				tcp->tcp_useloopback = onoff;
5281 				if (onoff)
5282 					so->so_options |= option_name;
5283 				else
5284 					so->so_options &= ~option_name;
5285 				break;
5286 			case SO_BROADCAST:
5287 				tcp->tcp_broadcast = onoff;
5288 				if (onoff)
5289 					so->so_options |= option_name;
5290 				else
5291 					so->so_options &= ~option_name;
5292 				break;
5293 			case SO_REUSEADDR:
5294 				tcp->tcp_reuseaddr = onoff;
5295 				if (onoff)
5296 					so->so_options |= option_name;
5297 				else
5298 					so->so_options &= ~option_name;
5299 				break;
5300 			case SO_OOBINLINE:
5301 				tcp->tcp_oobinline = onoff;
5302 				if (onoff)
5303 					so->so_options |= option_name;
5304 				else
5305 					so->so_options &= ~option_name;
5306 				break;
5307 			case SO_DGRAM_ERRIND:
5308 				tcp->tcp_dgram_errind = onoff;
5309 				if (onoff)
5310 					so->so_options |= option_name;
5311 				else
5312 					so->so_options &= ~option_name;
5313 				break;
5314 			}
5315 			break;
5316 		case IPPROTO_TCP:
5317 			switch (option_name) {
5318 			case TCP_NODELAY:
5319 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5320 					error = EINVAL;
5321 					eprintsoline(so, error);
5322 					mutex_enter(&so->so_lock);
5323 					goto done2;
5324 				}
5325 				ASSERT(optval);
5326 				tcp->tcp_naglim = intvalue ? 1 : tcp->tcp_mss;
5327 				handled = B_TRUE;
5328 				break;
5329 			}
5330 			break;
5331 		default:
5332 			handled = B_FALSE;
5333 			break;
5334 		}
5335 	}
5336 
5337 	if (handled) {
5338 		mutex_enter(&so->so_lock);
5339 		goto done2;
5340 	}
5341 
5342 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5343 	optmgmt_req.MGMT_flags = T_NEGOTIATE;
5344 	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5345 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5346 
5347 	oh.level = level;
5348 	oh.name = option_name;
5349 	oh.len = optlen;
5350 
5351 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5352 	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP);
5353 	/* Let option management work in the presence of data flow control */
5354 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5355 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5356 	mp = NULL;
5357 	mutex_enter(&so->so_lock);
5358 	if (error) {
5359 		eprintsoline(so, error);
5360 		goto done;
5361 	}
5362 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5363 	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5364 	if (error) {
5365 		eprintsoline(so, error);
5366 		goto done;
5367 	}
5368 	ASSERT(mp);
5369 	/* No need to verify T_optmgmt_ack */
5370 	freemsg(mp);
5371 done:
5372 	/*
5373 	 * Check for SOL_SOCKET options and record their values.
5374 	 * If we know about a SOL_SOCKET parameter and the transport
5375 	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5376 	 * EPROTO) we let the setsockopt succeed.
5377 	 */
5378 	if (level == SOL_SOCKET) {
5379 		/* Check parameters */
5380 		switch (option_name) {
5381 		case SO_DEBUG:
5382 		case SO_REUSEADDR:
5383 		case SO_KEEPALIVE:
5384 		case SO_DONTROUTE:
5385 		case SO_BROADCAST:
5386 		case SO_USELOOPBACK:
5387 		case SO_OOBINLINE:
5388 		case SO_SNDBUF:
5389 		case SO_RCVBUF:
5390 #ifdef notyet
5391 		case SO_SNDLOWAT:
5392 		case SO_RCVLOWAT:
5393 		case SO_SNDTIMEO:
5394 		case SO_RCVTIMEO:
5395 #endif /* notyet */
5396 		case SO_DGRAM_ERRIND:
5397 			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5398 				error = EINVAL;
5399 				eprintsoline(so, error);
5400 				goto done2;
5401 			}
5402 			ASSERT(optval);
5403 			handled = B_TRUE;
5404 			break;
5405 		case SO_LINGER:
5406 			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5407 				error = EINVAL;
5408 				eprintsoline(so, error);
5409 				goto done2;
5410 			}
5411 			ASSERT(optval);
5412 			handled = B_TRUE;
5413 			break;
5414 		}
5415 
5416 #define	intvalue	(*(int32_t *)optval)
5417 
5418 		switch (option_name) {
5419 		case SO_TYPE:
5420 		case SO_ERROR:
5421 		case SO_ACCEPTCONN:
5422 			/* Can't be set */
5423 			error = ENOPROTOOPT;
5424 			goto done2;
5425 		case SO_LINGER: {
5426 			struct linger *l = (struct linger *)optval;
5427 
5428 			so->so_linger.l_linger = l->l_linger;
5429 			if (l->l_onoff) {
5430 				so->so_linger.l_onoff = SO_LINGER;
5431 				so->so_options |= SO_LINGER;
5432 			} else {
5433 				so->so_linger.l_onoff = 0;
5434 				so->so_options &= ~SO_LINGER;
5435 			}
5436 			break;
5437 		}
5438 
5439 		case SO_DEBUG:
5440 #ifdef SOCK_TEST
5441 			if (intvalue & 2)
5442 				sock_test_timelimit = 10 * hz;
5443 			else
5444 				sock_test_timelimit = 0;
5445 
5446 			if (intvalue & 4)
5447 				do_useracc = 0;
5448 			else
5449 				do_useracc = 1;
5450 #endif /* SOCK_TEST */
5451 			/* FALLTHRU */
5452 		case SO_REUSEADDR:
5453 		case SO_KEEPALIVE:
5454 		case SO_DONTROUTE:
5455 		case SO_BROADCAST:
5456 		case SO_USELOOPBACK:
5457 		case SO_OOBINLINE:
5458 		case SO_DGRAM_ERRIND:
5459 			if (intvalue != 0) {
5460 				dprintso(so, 1,
5461 				    ("sotpi_setsockopt: setting 0x%x\n",
5462 				    option_name));
5463 				so->so_options |= option_name;
5464 			} else {
5465 				dprintso(so, 1,
5466 				    ("sotpi_setsockopt: clearing 0x%x\n",
5467 				    option_name));
5468 				so->so_options &= ~option_name;
5469 			}
5470 			break;
5471 		/*
5472 		 * The following options are only returned by us when the
5473 		 * T_SVR4_OPTMGMT_REQ fails.
5474 		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5475 		 * since the transport might adjust the value and not
5476 		 * return exactly what was set by the application.
5477 		 */
5478 		case SO_SNDBUF:
5479 			so->so_sndbuf = intvalue;
5480 			break;
5481 		case SO_RCVBUF:
5482 			so->so_rcvbuf = intvalue;
5483 			break;
5484 #ifdef notyet
5485 		/*
5486 		 * We do not implement the semantics of these options
5487 		 * thus we shouldn't implement the options either.
5488 		 */
5489 		case SO_SNDLOWAT:
5490 			so->so_sndlowat = intvalue;
5491 			break;
5492 		case SO_RCVLOWAT:
5493 			so->so_rcvlowat = intvalue;
5494 			break;
5495 		case SO_SNDTIMEO:
5496 			so->so_sndtimeo = intvalue;
5497 			break;
5498 		case SO_RCVTIMEO:
5499 			so->so_rcvtimeo = intvalue;
5500 			break;
5501 #endif /* notyet */
5502 		}
5503 #undef	intvalue
5504 
5505 		if (error) {
5506 			if ((error == ENOPROTOOPT || error == EPROTO ||
5507 			    error == EINVAL) && handled) {
5508 				dprintso(so, 1,
5509 				    ("setsockopt: ignoring error %d for 0x%x\n",
5510 				    error, option_name));
5511 				error = 0;
5512 			}
5513 		}
5514 	}
5515 done2:
5516 ret:
5517 	so_unlock_single(so, SOLOCKED);
5518 	mutex_exit(&so->so_lock);
5519 	return (error);
5520 }
5521