xref: /titanic_44/usr/src/uts/common/fs/sockfs/socktpi.c (revision 72612f86fafbe2510a166b48e158c9031e0dd63b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/t_lock.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/buf.h>
34 #include <sys/conf.h>
35 #include <sys/cred.h>
36 #include <sys/kmem.h>
37 #include <sys/sysmacros.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/debug.h>
41 #include <sys/errno.h>
42 #include <sys/time.h>
43 #include <sys/file.h>
44 #include <sys/open.h>
45 #include <sys/user.h>
46 #include <sys/termios.h>
47 #include <sys/stream.h>
48 #include <sys/strsubr.h>
49 #include <sys/strsun.h>
50 #include <sys/ddi.h>
51 #include <sys/esunddi.h>
52 #include <sys/flock.h>
53 #include <sys/modctl.h>
54 #include <sys/vtrace.h>
55 #include <sys/cmn_err.h>
56 #include <sys/pathname.h>
57 
58 #include <sys/socket.h>
59 #include <sys/socketvar.h>
60 #include <sys/sockio.h>
61 #include <sys/sodirect.h>
62 #include <netinet/in.h>
63 #include <sys/un.h>
64 #include <sys/strsun.h>
65 
66 #include <sys/tiuser.h>
67 #define	_SUN_TPI_VERSION	2
68 #include <sys/tihdr.h>
69 #include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
70 
71 #include <c2/audit.h>
72 
73 #include <inet/common.h>
74 #include <inet/ip.h>
75 #include <inet/ip6.h>
76 #include <inet/tcp.h>
77 #include <inet/udp_impl.h>
78 
79 #include <sys/zone.h>
80 
81 #include <fs/sockfs/nl7c.h>
82 #include <fs/sockfs/nl7curi.h>
83 
84 #include <inet/kssl/ksslapi.h>
85 
86 /*
87  * Possible failures when memory can't be allocated. The documented behavior:
88  *
89  * 		5.5:			4.X:		XNET:
90  * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
91  *							EINTR
92  *	(4.X does not document EINTR but returns it)
93  * bind:	ENOSR			-		ENOBUFS/ENOSR
94  * connect: 	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
95  * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
96  * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
97  *	(4.X getpeername and getsockname do not fail in practice)
98  * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
99  * listen:	-			-		ENOBUFS
100  * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
101  *							EINTR
102  * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
103  *							EINTR
104  * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
105  * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
106  * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
107  * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
108  *
109  * Resolution. When allocation fails:
110  *	recv: return EINTR
111  *	send: return EINTR
112  *	connect, accept: EINTR
113  *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
114  *	socket, socketpair: ENOBUFS
115  *	getpeername, getsockname: sleep
116  *	getsockopt, setsockopt: sleep
117  */
118 
119 #ifdef SOCK_TEST
120 /*
121  * Variables that make sockfs do something other than the standard TPI
122  * for the AF_INET transports.
123  *
124  * solisten_tpi_tcp:
125  *	TCP can handle a O_T_BIND_REQ with an increased backlog even though
126  *	the transport is already bound. This is needed to avoid losing the
127  *	port number should listen() do a T_UNBIND_REQ followed by a
128  *	O_T_BIND_REQ.
129  *
130  * soconnect_tpi_udp:
131  *	UDP and ICMP can handle a T_CONN_REQ.
132  *	This is needed to make the sequence of connect(), getsockname()
133  *	return the local IP address used to send packets to the connected
134  *	destination.
135  *
136  * soconnect_tpi_tcp:
137  *	TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
138  *	Set this to non-zero to send TPI conformant messages to TCP in this
139  *	respect. This is a performance optimization.
140  *
141  * soaccept_tpi_tcp:
142  *	TCP can handle a T_CONN_REQ without the acceptor being bound.
143  *	This is a performance optimization that has been picked up in XTI.
144  *
145  * soaccept_tpi_multioptions:
146  *	When inheriting SOL_SOCKET options from the listener to the accepting
147  *	socket send them as a single message for AF_INET{,6}.
148  */
149 int solisten_tpi_tcp = 0;
150 int soconnect_tpi_udp = 0;
151 int soconnect_tpi_tcp = 0;
152 int soaccept_tpi_tcp = 0;
153 int soaccept_tpi_multioptions = 1;
154 #else /* !SOCK_TEST: same names, fixed at compile time */
155 #define	soconnect_tpi_tcp	0
156 #define	soconnect_tpi_udp	0
157 #define	solisten_tpi_tcp	0
158 #define	soaccept_tpi_tcp	0
159 #define	soaccept_tpi_multioptions	1
160 #endif /* SOCK_TEST */
161 
162 #ifdef SOCK_TEST
163 extern int do_useracc;
164 extern clock_t sock_test_timelimit;
165 #endif /* SOCK_TEST */
166 
167 /*
168  * Some X/Open added checks might have to be backed out to keep SunOS 4.X
169  * applications working. Set xnet_skip_checks non-zero to disable the checks.
170  */
171 int xnet_skip_checks = 0;
172 int xnet_check_print = 0;	/* non-zero: printf when a check fails */
173 int xnet_truncate_print = 0;
174 
175 extern	void sigintr(k_sigset_t *, int);
176 extern	void sigunintr(k_sigset_t *);
177 
178 extern	void *nl7c_lookup_addr(void *, t_uscalar_t);
179 extern	void *nl7c_add_addr(void *, t_uscalar_t);
180 extern	void nl7c_listener_addr(void *, struct sonode *);
181 
182 /* Sockets acting as an in-kernel SSL proxy */
183 extern mblk_t	*strsock_kssl_input(vnode_t *, mblk_t *, strwakeup_t *,
184 		    strsigset_t *, strsigset_t *, strpollset_t *);
185 extern mblk_t	*strsock_kssl_output(vnode_t *, mblk_t *, strwakeup_t *,
186 		    strsigset_t *, strsigset_t *, strpollset_t *);
187 
188 static int	sotpi_unbind(struct sonode *, int);
189 
190 extern int	sodput(sodirect_t *, mblk_t *);
191 extern void	sodwakeup(sodirect_t *);
192 
193 /* TPI sockfs sonode operations */
194 static int	sotpi_accept(struct sonode *, int, struct sonode **);
195 static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
196 		    int);
197 static int	sotpi_connect(struct sonode *, const struct sockaddr *,
198 		    socklen_t, int, int);
199 static int	sotpi_listen(struct sonode *, int);
200 static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
201 		    struct uio *);
202 static int	sotpi_shutdown(struct sonode *, int);
203 static int	sotpi_getsockname(struct sonode *);
204 static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
205 		    struct uio *, void *, t_uscalar_t, int);
206 static int	sodgram_direct(struct sonode *, struct sockaddr *,
207 		    socklen_t, struct uio *, int);
208 
209 sonodeops_t sotpi_sonodeops = {	/* TPI socket ops, in sop_* slot order */
210 	sotpi_accept,		/* sop_accept		*/
211 	sotpi_bind,		/* sop_bind		*/
212 	sotpi_listen,		/* sop_listen		*/
213 	sotpi_connect,		/* sop_connect		*/
214 	sotpi_recvmsg,		/* sop_recvmsg		*/
215 	sotpi_sendmsg,		/* sop_sendmsg		*/
216 	sotpi_getpeername,	/* sop_getpeername	*/
217 	sotpi_getsockname,	/* sop_getsockname	*/
218 	sotpi_shutdown,		/* sop_shutdown		*/
219 	sotpi_getsockopt,	/* sop_getsockopt	*/
220 	sotpi_setsockopt	/* sop_setsockopt	*/
221 };
222 
223 /*
224  * Common create code for socket and accept. Opens the vnode obtained from
225  * makesockvp() and runs so_strinit(); if tso is set the values from that
226  * node are used instead of issuing a T_INFO_REQ. Returns the new sonode, or
227  * NULL with the error stored in *errorp. Assumes that the caller has a
228  * VN_HOLD on accessvp. The VN_RELE will occur either when sotpi_create()
229  * fails or when the returned sonode is freed.
230  */
231 struct sonode *
232 sotpi_create(vnode_t *accessvp, int domain, int type, int protocol, int version,
233     struct sonode *tso, int *errorp)
234 {
235 	struct sonode	*so;
236 	vnode_t		*vp;
237 	int		flags, error;
238 
239 	ASSERT(accessvp != NULL);
240 	vp = makesockvp(accessvp, domain, type, protocol);
241 	ASSERT(vp != NULL);
242 	so = VTOSO(vp);
243 
244 	flags = FREAD|FWRITE;
245 
246 	if ((type == SOCK_STREAM || type == SOCK_DGRAM) &&
247 	    (domain == AF_INET || domain == AF_INET6) &&
248 	    (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP ||
249 	    protocol == IPPROTO_IP)) {
250 		/* Tell tcp or udp that it's talking to sockets */
251 		flags |= SO_SOCKSTR;
252 
253 		/*
254 		 * Here we indicate to socktpi_open() our attempt to
255 		 * make direct calls between sockfs and transport.
256 		 * The final decision is left to socktpi_open().
257 		 */
258 		so->so_state |= SS_DIRECT;
259 
260 		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
261 		if (so->so_type == SOCK_STREAM && tso != NULL) {
262 			if (tso->so_state & SS_DIRECT) {
263 				/*
264 				 * Inherit SS_DIRECT from listener and pass
265 				 * SO_ACCEPTOR open flag to tcp, indicating
266 				 * that this is an accept fast-path instance.
267 				 */
268 				flags |= SO_ACCEPTOR;
269 			} else {
270 				/*
271 				 * SS_DIRECT is not set on listener, meaning
272 				 * that the listener has been converted from
273 				 * a socket to a stream.  Ensure that the
274 				 * acceptor inherits these settings.
275 				 */
276 				so->so_state &= ~SS_DIRECT;
277 				flags &= ~SO_SOCKSTR;
278 			}
279 		}
280 	}
281 
282 	/*
283 	 * Tell local transport that it is talking to sockets.
284 	 */
285 	if (so->so_family == AF_UNIX) {
286 		flags |= SO_SOCKSTR;
287 	}
288 
289 	/* Initialize the kernel SSL proxy fields */
290 	so->so_kssl_type = KSSL_NO_PROXY;
291 	so->so_kssl_ent = NULL;
292 	so->so_kssl_ctx = NULL;
293 
294 	if ((error = socktpi_open(&vp, flags, CRED(), NULL)) != 0) {
295 		VN_RELE(vp);
296 		*errorp = error;
297 		return (NULL);
298 	}
299 
300 	if ((error = so_strinit(so, tso)) != 0) {
301 		(void) VOP_CLOSE(vp, 0, 1, 0, CRED(), NULL);
302 		VN_RELE(vp);
303 		*errorp = error;
304 		return (NULL);
305 	}
306 
307 	if (version == SOV_DEFAULT)
308 		version = so_default_version;
309 
310 	so->so_version = (short)version;
311 
312 	return (so);
313 }
314 
315 /*
316  * Record an unspecified local address in sockfs without a transport bind:
317  * zero so_laddr_sa, stamp it with so_family and set SS_ISBOUND. Meant for
318  * TCP/UDP, where the O_T_BIND_REQ is known not to be required in all cases.
319  */
320 static void
321 so_automatic_bind(struct sonode *so)
322 {
323 	struct sockaddr	*laddr = so->so_laddr_sa;
324 
325 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
326 	ASSERT(MUTEX_HELD(&so->so_lock));
327 	ASSERT(!(so->so_state & SS_ISBOUND));
328 	ASSERT(so->so_unbind_mp);
329 	ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
330 	bzero(laddr, so->so_laddr_len);
331 	laddr->sa_family = so->so_family;
332 	so->so_state |= SS_ISBOUND;
333 }
334 
335 
336 /*
337  * bind the socket.
338  *
339  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
340  * are passed in we allow rebinding. Note that for backwards compatibility
341  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
342  * Thus the rebinding code is currently not executed.
343  *
344  * The constraints for rebinding are:
345  * - it is a SOCK_DGRAM, or
346  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
347  *   and no listen() has been done.
348  * This rebinding code was added based on some language in the XNET book
349  * about not returning EINVAL if the protocol allows rebinding. However,
350  * this language is not present in the Posix socket draft. Thus maybe the
351  * rebinding logic should be deleted from the source.
352  *
353  * A null "name" can be used to unbind the socket if:
354  * - it is a SOCK_DGRAM, or
355  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
356  *   and no listen() has been done.
357  */
358 static int
359 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
360     socklen_t namelen, int backlog, int flags)
361 {
362 	struct T_bind_req	bind_req;
363 	struct T_bind_ack	*bind_ack;
364 	int			error = 0;
365 	mblk_t			*mp;
366 	void			*addr;
367 	t_uscalar_t		addrlen;
368 	int			unbind_on_err = 1;
369 	boolean_t		clear_acceptconn_on_err = B_FALSE;
370 	boolean_t		restore_backlog_on_err = B_FALSE;
371 	int			save_so_backlog;
372 	t_scalar_t		PRIM_type = O_T_BIND_REQ;
373 	boolean_t		tcp_udp_xport;
374 	void			*nl7c = NULL;
375 
376 	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
377 	    (void *)so, (void *)name, namelen, backlog, flags,
378 	    pr_state(so->so_state, so->so_mode)));
379 
380 	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
381 
382 	if (!(flags & _SOBIND_LOCK_HELD)) {
383 		mutex_enter(&so->so_lock);
384 		so_lock_single(so);	/* Set SOLOCKED */
385 	} else {
386 		ASSERT(MUTEX_HELD(&so->so_lock));
387 		ASSERT(so->so_flag & SOLOCKED);
388 	}
389 
390 	/*
391 	 * Make sure that there is a preallocated unbind_req message
392 	 * before binding. This message is allocated when the socket is
393 	 * created but it might have been consumed.
394 	 */
395 	if (so->so_unbind_mp == NULL) {
396 		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
397 		/* NOTE: holding so_lock while sleeping */
398 		so->so_unbind_mp =
399 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP);
400 	}
401 
402 	if (flags & _SOBIND_REBIND) {
403 		/*
404 		 * Called from solisten after doing an sotpi_unbind() or
405 		 * potentially without the unbind (latter for AF_INET{,6}).
406 		 */
407 		ASSERT(name == NULL && namelen == 0);
408 
409 		if (so->so_family == AF_UNIX) {
410 			ASSERT(so->so_ux_bound_vp);
411 			addr = &so->so_ux_laddr;
412 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
413 			dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
414 			    "addr 0x%p, vp %p\n",
415 			    addrlen,
416 			    (void *)((struct so_ux_addr *)addr)->soua_vp,
417 			    (void *)so->so_ux_bound_vp));
418 		} else {
419 			addr = so->so_laddr_sa;
420 			addrlen = (t_uscalar_t)so->so_laddr_len;
421 		}
422 	} else if (flags & _SOBIND_UNSPEC) {
423 		ASSERT(name == NULL && namelen == 0);
424 
425 		/*
426 		 * The caller checked SS_ISBOUND but not necessarily
427 		 * under so_lock
428 		 */
429 		if (so->so_state & SS_ISBOUND) {
430 			/* No error */
431 			goto done;
432 		}
433 
434 		/* Set an initial local address */
435 		switch (so->so_family) {
436 		case AF_UNIX:
437 			/*
438 			 * Use an address with same size as struct sockaddr
439 			 * just like BSD.
440 			 */
441 			so->so_laddr_len =
442 			    (socklen_t)sizeof (struct sockaddr);
443 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
444 			bzero(so->so_laddr_sa, so->so_laddr_len);
445 			so->so_laddr_sa->sa_family = so->so_family;
446 
447 			/*
448 			 * Pass down an address with the implicit bind
449 			 * magic number and the rest all zeros.
450 			 * The transport will return a unique address.
451 			 */
452 			so->so_ux_laddr.soua_vp = NULL;
453 			so->so_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
454 			addr = &so->so_ux_laddr;
455 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
456 			break;
457 
458 		case AF_INET:
459 		case AF_INET6:
460 			/*
461 			 * An unspecified bind in TPI has a NULL address.
462 			 * Set the address in sockfs to have the sa_family.
463 			 */
464 			so->so_laddr_len = (so->so_family == AF_INET) ?
465 			    (socklen_t)sizeof (sin_t) :
466 			    (socklen_t)sizeof (sin6_t);
467 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
468 			bzero(so->so_laddr_sa, so->so_laddr_len);
469 			so->so_laddr_sa->sa_family = so->so_family;
470 			addr = NULL;
471 			addrlen = 0;
472 			break;
473 
474 		default:
475 			/*
476 			 * An unspecified bind in TPI has a NULL address.
477 			 * Set the address in sockfs to be zero length.
478 			 *
479 			 * Can not assume there is a sa_family for all
480 			 * protocol families. For example, AF_X25 does not
481 			 * have a family field.
482 			 */
483 			bzero(so->so_laddr_sa, so->so_laddr_len);
484 			so->so_laddr_len = 0;	/* XXX correct? */
485 			addr = NULL;
486 			addrlen = 0;
487 			break;
488 		}
489 
490 	} else {
491 		if (so->so_state & SS_ISBOUND) {
492 			/*
493 			 * If it is ok to rebind the socket, first unbind
494 			 * with the transport. A rebind to the NULL address
495 			 * is interpreted as an unbind.
496 			 * Note that a bind to NULL in BSD does unbind the
497 			 * socket but it fails with EINVAL.
498 			 * Note that regular sockets set SOV_SOCKBSD i.e.
499 			 * _SOBIND_SOCKBSD gets set here hence no type of
500 			 * socket does currently allow rebinding.
501 			 *
502 			 * If the name is NULL just do an unbind.
503 			 */
504 			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
505 			    name != NULL) {
506 				error = EINVAL;
507 				unbind_on_err = 0;
508 				eprintsoline(so, error);
509 				goto done;
510 			}
511 			if ((so->so_mode & SM_CONNREQUIRED) &&
512 			    (so->so_state & SS_CANTREBIND)) {
513 				error = EINVAL;
514 				unbind_on_err = 0;
515 				eprintsoline(so, error);
516 				goto done;
517 			}
518 			error = sotpi_unbind(so, 0);
519 			if (error) {
520 				eprintsoline(so, error);
521 				goto done;
522 			}
523 			ASSERT(!(so->so_state & SS_ISBOUND));
524 			if (name == NULL) {
525 				so->so_state &=
526 				    ~(SS_ISCONNECTED|SS_ISCONNECTING);
527 				goto done;
528 			}
529 		}
530 		/* X/Open requires this check */
531 		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
532 			if (xnet_check_print) {
533 				printf("sockfs: X/Open bind state check "
534 				    "caused EINVAL\n");
535 			}
536 			error = EINVAL;
537 			goto done;
538 		}
539 
540 		switch (so->so_family) {
541 		case AF_UNIX:
542 			/*
543 			 * All AF_UNIX addresses are nul terminated
544 			 * when copied (copyin_name) in so the minimum
545 			 * length is 3 bytes.
546 			 */
547 			if (name == NULL ||
548 			    (ssize_t)namelen <= sizeof (short) + 1) {
549 				error = EISDIR;
550 				eprintsoline(so, error);
551 				goto done;
552 			}
553 			/*
554 			 * Verify so_family matches the bound family.
555 			 * BSD does not check this for AF_UNIX resulting
556 			 * in funny mknods.
557 			 */
558 			if (name->sa_family != so->so_family) {
559 				error = EAFNOSUPPORT;
560 				goto done;
561 			}
562 			break;
563 		case AF_INET:
564 			if (name == NULL) {
565 				error = EINVAL;
566 				eprintsoline(so, error);
567 				goto done;
568 			}
569 			if ((size_t)namelen != sizeof (sin_t)) {
570 				error = name->sa_family != so->so_family ?
571 				    EAFNOSUPPORT : EINVAL;
572 				eprintsoline(so, error);
573 				goto done;
574 			}
575 			if ((flags & _SOBIND_XPG4_2) &&
576 			    (name->sa_family != so->so_family)) {
577 				/*
578 				 * This check has to be made for X/Open
579 				 * sockets however application failures have
580 				 * been observed when it is applied to
581 				 * all sockets.
582 				 */
583 				error = EAFNOSUPPORT;
584 				eprintsoline(so, error);
585 				goto done;
586 			}
587 			/*
588 			 * Force a zero sa_family to match so_family.
589 			 *
590 			 * Some programs like inetd(1M) don't set the
591 			 * family field. Other programs leave
592 			 * sin_family set to garbage - SunOS 4.X does
593 			 * not check the family field on a bind.
594 			 * We use the family field that
595 			 * was passed in to the socket() call.
596 			 */
597 			name->sa_family = so->so_family;
598 			break;
599 
600 		case AF_INET6: {
601 #ifdef DEBUG
602 			sin6_t *sin6 = (sin6_t *)name;
603 #endif /* DEBUG */
604 
605 			if (name == NULL) {
606 				error = EINVAL;
607 				eprintsoline(so, error);
608 				goto done;
609 			}
610 			if ((size_t)namelen != sizeof (sin6_t)) {
611 				error = name->sa_family != so->so_family ?
612 				    EAFNOSUPPORT : EINVAL;
613 				eprintsoline(so, error);
614 				goto done;
615 			}
616 			if (name->sa_family != so->so_family) {
617 				/*
618 				 * With IPv6 we require the family to match
619 				 * unlike in IPv4.
620 				 */
621 				error = EAFNOSUPPORT;
622 				eprintsoline(so, error);
623 				goto done;
624 			}
625 #ifdef DEBUG
626 			/*
627 			 * Verify that apps don't forget to clear
628 			 * sin6_scope_id etc
629 			 */
630 			if (sin6->sin6_scope_id != 0 &&
631 			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
632 				zcmn_err(getzoneid(), CE_WARN,
633 				    "bind with uninitialized sin6_scope_id "
634 				    "(%d) on socket. Pid = %d\n",
635 				    (int)sin6->sin6_scope_id,
636 				    (int)curproc->p_pid);
637 			}
638 			if (sin6->__sin6_src_id != 0) {
639 				zcmn_err(getzoneid(), CE_WARN,
640 				    "bind with uninitialized __sin6_src_id "
641 				    "(%d) on socket. Pid = %d\n",
642 				    (int)sin6->__sin6_src_id,
643 				    (int)curproc->p_pid);
644 			}
645 #endif /* DEBUG */
646 			break;
647 		}
648 		default:
649 			/*
650 			 * Don't do any length or sa_family check to allow
651 			 * non-sockaddr style addresses.
652 			 */
653 			if (name == NULL) {
654 				error = EINVAL;
655 				eprintsoline(so, error);
656 				goto done;
657 			}
658 			break;
659 		}
660 
661 		if (namelen > (t_uscalar_t)so->so_laddr_maxlen) {
662 			error = ENAMETOOLONG;
663 			eprintsoline(so, error);
664 			goto done;
665 		}
666 		/*
667 		 * Save local address.
668 		 */
669 		so->so_laddr_len = (socklen_t)namelen;
670 		ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
671 		bcopy(name, so->so_laddr_sa, namelen);
672 
673 		addr = so->so_laddr_sa;
674 		addrlen = (t_uscalar_t)so->so_laddr_len;
675 		switch (so->so_family) {
676 		case AF_INET6:
677 		case AF_INET:
678 			break;
679 		case AF_UNIX: {
680 			struct sockaddr_un *soun =
681 			    (struct sockaddr_un *)so->so_laddr_sa;
682 			struct vnode *vp;
683 			struct vattr vattr;
684 
685 			ASSERT(so->so_ux_bound_vp == NULL);
686 			/*
687 			 * Create vnode for the specified path name.
688 			 * Keep vnode held with a reference in so_ux_bound_vp.
689 			 * Use the vnode pointer as the address used in the
690 			 * bind with the transport.
691 			 *
692 			 * Use the same mode as in BSD: 0777 with the process
693 			 * umask (u_cmask) cleared from it.
694 			 */
695 			/* MAXPATHLEN + soun_family + nul termination */
696 			if (so->so_laddr_len >
697 			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
698 				error = ENAMETOOLONG;
699 				eprintsoline(so, error);
700 				goto done;
701 			}
702 			vattr.va_type = VSOCK;
703 			vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
704 			vattr.va_mask = AT_TYPE|AT_MODE;
705 			/* NOTE: holding so_lock */
706 			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
707 			    EXCL, 0, &vp, CRMKNOD, 0, 0);
708 			if (error) {
709 				if (error == EEXIST)
710 					error = EADDRINUSE;
711 				eprintsoline(so, error);
712 				goto done;
713 			}
714 			/*
715 			 * Establish pointer from the underlying filesystem
716 			 * vnode to the socket node.
717 			 * so_ux_bound_vp and v_stream->sd_vnode form the
718 			 * cross-linkage between the underlying filesystem
719 			 * node and the socket node.
720 			 */
721 			ASSERT(SOTOV(so)->v_stream);
722 			mutex_enter(&vp->v_lock);
723 			vp->v_stream = SOTOV(so)->v_stream;
724 			so->so_ux_bound_vp = vp;
725 			mutex_exit(&vp->v_lock);
726 
727 			/*
728 			 * Use the vnode pointer value as a unique address
729 			 * (together with the magic number to avoid conflicts
730 			 * with implicit binds) in the transport provider.
731 			 */
732 			so->so_ux_laddr.soua_vp = (void *)so->so_ux_bound_vp;
733 			so->so_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
734 			addr = &so->so_ux_laddr;
735 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
736 			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
737 			    addrlen,
738 			    ((struct so_ux_addr *)addr)->soua_vp));
739 			break;
740 		}
741 		} /* end switch (so->so_family) */
742 	}
743 
744 	/*
745 	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
746 	 * the transport can start passing up T_CONN_IND messages
747 	 * as soon as it receives the bind req and strsock_proto()
748 	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
749 	 */
750 	if (flags & _SOBIND_LISTEN) {
751 		if ((so->so_state & SS_ACCEPTCONN) == 0)
752 			clear_acceptconn_on_err = B_TRUE;
753 		save_so_backlog = so->so_backlog;
754 		restore_backlog_on_err = B_TRUE;
755 		so->so_state |= SS_ACCEPTCONN;
756 		so->so_backlog = backlog;
757 	}
758 
759 	/*
760 	 * If NL7C addr(s) have been configured check for addr/port match,
761 	 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
762 	 *
763 	 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
764 	 * family sockets only. If match mark as such.
765 	 */
766 	if (nl7c_enabled && ((addr != NULL &&
767 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
768 	    (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
769 	    so->so_nl7c_flags == NL7C_AF_NCA)) {
770 		/*
771 		 * NL7C is not supported in non-global zones,
772 		 * we enforce this restriction here.
773 		 */
774 		if (so->so_zoneid == GLOBAL_ZONEID) {
775 			/* An NL7C socket, mark it */
776 			so->so_nl7c_flags |= NL7C_ENABLED;
777 			if (nl7c == NULL) {
778 				/*
779 				 * Was an AF_NCA bind() so add it to the
780 				 * addr list for reporting purposes.
781 				 */
782 				nl7c = nl7c_add_addr(addr, addrlen);
783 			}
784 		} else
785 			nl7c = NULL;
786 	}
787 	/*
788 	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
789 	 * for other transports we will send in a O_T_BIND_REQ.
790 	 */
791 	if (tcp_udp_xport &&
792 	    (so->so_family == AF_INET || so->so_family == AF_INET6))
793 		PRIM_type = T_BIND_REQ;
794 
795 	bind_req.PRIM_type = PRIM_type;
796 	bind_req.ADDR_length = addrlen;
797 	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
798 	bind_req.CONIND_number = backlog;
799 	/* NOTE: holding so_lock while sleeping */
800 	mp = soallocproto2(&bind_req, sizeof (bind_req),
801 	    addr, addrlen, 0, _ALLOC_SLEEP);
802 	so->so_state &= ~SS_LADDR_VALID;
803 
804 	/* Done using so_laddr_sa - can drop the lock */
805 	mutex_exit(&so->so_lock);
806 
807 	/*
808 	 * Intercept the bind_req message here to check if this <address/port>
809 	 * was configured as an SSL proxy server, or if another endpoint was
810 	 * already configured to act as a proxy for us.
811 	 *
812 	 * Note, only if NL7C not enabled for this socket.
813 	 */
814 	if (nl7c == NULL &&
815 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
816 	    so->so_type == SOCK_STREAM) {
817 
818 		if (so->so_kssl_ent != NULL) {
819 			kssl_release_ent(so->so_kssl_ent, so, so->so_kssl_type);
820 			so->so_kssl_ent = NULL;
821 		}
822 
823 		so->so_kssl_type = kssl_check_proxy(mp, so, &so->so_kssl_ent);
824 		switch (so->so_kssl_type) {
825 		case KSSL_NO_PROXY:
826 			break;
827 
828 		case KSSL_HAS_PROXY:
829 			mutex_enter(&so->so_lock);
830 			goto skip_transport;
831 
832 		case KSSL_IS_PROXY:
833 			break;
834 		}
835 	}
836 
837 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
838 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
839 	if (error) {
840 		eprintsoline(so, error);
841 		mutex_enter(&so->so_lock);
842 		goto done;
843 	}
844 
845 	mutex_enter(&so->so_lock);
846 	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
847 	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
848 	if (error) {
849 		eprintsoline(so, error);
850 		goto done;
851 	}
852 skip_transport:
853 	ASSERT(mp);
854 	/*
855 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
856 	 * strsock_proto while the lock was dropped above, the bind
857 	 * is allowed to complete.
858 	 */
859 
860 	/* Mark as bound. This will be undone if we detect errors below. */
861 	if (flags & _SOBIND_NOXLATE) {
862 		ASSERT(so->so_family == AF_UNIX);
863 		so->so_state |= SS_FADDR_NOXLATE;
864 	}
865 	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
866 	so->so_state |= SS_ISBOUND;
867 	ASSERT(so->so_unbind_mp);
868 
869 	/* note that we've already set SS_ACCEPTCONN above */
870 
871 	/*
872 	 * Recompute addrlen - an unspecified bind sent down an
873 	 * address of length zero but we expect the appropriate length
874 	 * in return.
875 	 */
876 	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
877 	    sizeof (so->so_ux_laddr) : so->so_laddr_len);
878 
879 	bind_ack = (struct T_bind_ack *)mp->b_rptr;
880 	/*
881 	 * The alignment restriction is really too strict but
882 	 * we want enough alignment to inspect the fields of
883 	 * a sockaddr_in.
884 	 */
885 	addr = sogetoff(mp, bind_ack->ADDR_offset,
886 	    bind_ack->ADDR_length,
887 	    __TPI_ALIGN_SIZE);
888 	if (addr == NULL) {
889 		freemsg(mp);
890 		error = EPROTO;
891 		eprintsoline(so, error);
892 		goto done;
893 	}
894 	if (!(flags & _SOBIND_UNSPEC)) {
895 		/*
896 		 * Verify that the transport didn't return something we
897 		 * did not want e.g. an address other than what we asked for.
898 		 *
899 		 * NOTE: These checks would go away if/when we switch to
900 		 * using the new TPI (in which the transport would fail
901 		 * the request instead of assigning a different address).
902 		 *
903 		 * NOTE2: For protocols that we don't know (i.e. any
904 		 * other than AF_INET6, AF_INET and AF_UNIX), we
905 		 * cannot know if the transport should be expected to
906 		 * return the same address as that requested.
907 		 *
908 		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
909 		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
910 		 *
911 		 * For example, in the case of netatalk it may be
912 		 * inappropriate for the transport to return the
913 		 * requested address (as it may have allocated a local
914 		 * port number in behaviour similar to that of an
915 		 * AF_INET bind request with a port number of zero).
916 		 *
917 		 * Given the definition of O_T_BIND_REQ, where the
918 		 * transport may bind to an address other than the
919 		 * requested address, it's not possible to determine
920 		 * whether a returned address that differs from the
921 		 * requested address is a reason to fail (because the
922 		 * requested address was not available) or succeed
923 		 * (because the transport allocated an appropriate
924 		 * address and/or port).
925 		 *
926 		 * sockfs currently requires that the transport return
927 		 * the requested address in the T_BIND_ACK, unless
928 		 * there is code here to allow for any discrepancy.
929 		 * Such code exists for AF_INET and AF_INET6.
930 		 *
931 		 * Netatalk chooses to return the requested address
932 		 * rather than the (correct) allocated address.  This
933 		 * means that netatalk violates the TPI specification
934 		 * (and would not function correctly if used from a
935 		 * TLI application), but it does mean that it works
936 		 * with sockfs.
937 		 *
938 		 * As noted above, using the newer XTI bind primitive
939 		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
940 		 * allow sockfs to be more sure about whether or not
941 		 * the bind request had succeeded (as transports are
942 		 * not permitted to bind to a different address than
943 		 * that requested - they must return failure).
944 		 * Unfortunately, support for T_BIND_REQ may not be
945 		 * present in all transport implementations (netatalk,
946 		 * for example, doesn't have it), making the
947 		 * transition difficult.
948 		 */
949 		if (bind_ack->ADDR_length != addrlen) {
950 			/* Assumes that the requested address was in use */
951 			freemsg(mp);
952 			error = EADDRINUSE;
953 			eprintsoline(so, error);
954 			goto done;
955 		}
956 
957 		switch (so->so_family) {
958 		case AF_INET6:
959 		case AF_INET: {
960 			sin_t *rname, *aname;
961 
962 			rname = (sin_t *)addr;
963 			aname = (sin_t *)so->so_laddr_sa;
964 
965 			/*
966 			 * Take advantage of the alignment
967 			 * of sin_port and sin6_port which fall
968 			 * in the same place in their data structures.
969 			 * Just use sin_port for either address family.
970 			 *
971 			 * This may become a problem if (heaven forbid)
972 			 * there's a separate ipv6port_reserved... :-P
973 			 *
974 			 * Binding to port 0 has the semantics of letting
975 			 * the transport bind to any port.
976 			 *
977 			 * If the transport is TCP or UDP since we had sent
978 			 * a T_BIND_REQ we would not get a port other than
979 			 * what we asked for.
980 			 */
981 			if (tcp_udp_xport) {
982 				/*
983 				 * Pick up the new port number if we bound to
984 				 * port 0.
985 				 */
986 				if (aname->sin_port == 0)
987 					aname->sin_port = rname->sin_port;
988 				so->so_state |= SS_LADDR_VALID;
989 				break;
990 			}
991 			if (aname->sin_port != 0 &&
992 			    aname->sin_port != rname->sin_port) {
993 				freemsg(mp);
994 				error = EADDRINUSE;
995 				eprintsoline(so, error);
996 				goto done;
997 			}
998 			/*
999 			 * Pick up the new port number if we bound to port 0.
1000 			 */
1001 			aname->sin_port = rname->sin_port;
1002 
1003 			/*
1004 			 * Unfortunately, addresses aren't _quite_ the same.
1005 			 */
1006 			if (so->so_family == AF_INET) {
1007 				if (aname->sin_addr.s_addr !=
1008 				    rname->sin_addr.s_addr) {
1009 					freemsg(mp);
1010 					error = EADDRNOTAVAIL;
1011 					eprintsoline(so, error);
1012 					goto done;
1013 				}
1014 			} else {
1015 				sin6_t *rname6 = (sin6_t *)rname;
1016 				sin6_t *aname6 = (sin6_t *)aname;
1017 
1018 				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1019 				    &rname6->sin6_addr)) {
1020 					freemsg(mp);
1021 					error = EADDRNOTAVAIL;
1022 					eprintsoline(so, error);
1023 					goto done;
1024 				}
1025 			}
1026 			break;
1027 		}
1028 		case AF_UNIX:
1029 			if (bcmp(addr, &so->so_ux_laddr, addrlen) != 0) {
1030 				freemsg(mp);
1031 				error = EADDRINUSE;
1032 				eprintsoline(so, error);
1033 				eprintso(so,
1034 				    ("addrlen %d, addr 0x%x, vp %p\n",
1035 				    addrlen, *((int *)addr),
1036 				    (void *)so->so_ux_bound_vp));
1037 				goto done;
1038 			}
1039 			so->so_state |= SS_LADDR_VALID;
1040 			break;
1041 		default:
1042 			/*
1043 			 * NOTE: This assumes that addresses can be
1044 			 * byte-compared for equivalence.
1045 			 */
1046 			if (bcmp(addr, so->so_laddr_sa, addrlen) != 0) {
1047 				freemsg(mp);
1048 				error = EADDRINUSE;
1049 				eprintsoline(so, error);
1050 				goto done;
1051 			}
1052 			/*
1053 			 * Don't mark SS_LADDR_VALID, as we cannot be
1054 			 * sure that the returned address is the real
1055 			 * bound address when talking to an unknown
1056 			 * transport.
1057 			 */
1058 			break;
1059 		}
1060 	} else {
1061 		/*
1062 		 * Save for returned address for getsockname.
1063 		 * Needed for unspecific bind unless transport supports
1064 		 * the TI_GETMYNAME ioctl.
1065 		 * Do this for AF_INET{,6} even though they do, as
1066 		 * caching info here is much better performance than
1067 		 * a TPI/STREAMS trip to the transport for getsockname.
1068 		 * Any which can't for some reason _must_ _not_ set
1069 		 * LADDR_VALID here for the caching version of getsockname
1070 		 * to not break;
1071 		 */
1072 		switch (so->so_family) {
1073 		case AF_UNIX:
1074 			/*
1075 			 * Record the address bound with the transport
1076 			 * for use by socketpair.
1077 			 */
1078 			bcopy(addr, &so->so_ux_laddr, addrlen);
1079 			so->so_state |= SS_LADDR_VALID;
1080 			break;
1081 		case AF_INET:
1082 		case AF_INET6:
1083 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
1084 			bcopy(addr, so->so_laddr_sa, so->so_laddr_len);
1085 			so->so_state |= SS_LADDR_VALID;
1086 			break;
1087 		default:
1088 			/*
1089 			 * Don't mark SS_LADDR_VALID, as we cannot be
1090 			 * sure that the returned address is the real
1091 			 * bound address when talking to an unknown
1092 			 * transport.
1093 			 */
1094 			break;
1095 		}
1096 	}
1097 
1098 	if (nl7c != NULL) {
1099 		/* Register listen()er sonode pointer with NL7C */
1100 		nl7c_listener_addr(nl7c, so);
1101 	}
1102 
1103 	freemsg(mp);
1104 
1105 done:
1106 	if (error) {
1107 		/* reset state & backlog to values held on entry */
1108 		if (clear_acceptconn_on_err == B_TRUE)
1109 			so->so_state &= ~SS_ACCEPTCONN;
1110 		if (restore_backlog_on_err == B_TRUE)
1111 			so->so_backlog = save_so_backlog;
1112 
1113 		if (unbind_on_err && so->so_state & SS_ISBOUND) {
1114 			int err;
1115 
1116 			err = sotpi_unbind(so, 0);
1117 			/* LINTED - statement has no consequent: if */
1118 			if (err) {
1119 				eprintsoline(so, error);
1120 			} else {
1121 				ASSERT(!(so->so_state & SS_ISBOUND));
1122 			}
1123 		}
1124 	}
1125 	if (!(flags & _SOBIND_LOCK_HELD)) {
1126 		so_unlock_single(so, SOLOCKED);
1127 		mutex_exit(&so->so_lock);
1128 	} else {
1129 		/* If the caller held the lock don't release it here */
1130 		ASSERT(MUTEX_HELD(&so->so_lock));
1131 		ASSERT(so->so_flag & SOLOCKED);
1132 	}
1133 	return (error);
1134 }
1135 
1136 /* bind the socket */
1137 static int
1138 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1139     int flags)
1140 {
1141 	if ((flags & _SOBIND_SOCKETPAIR) == 0)
1142 		return (sotpi_bindlisten(so, name, namelen, 0, flags));
1143 
1144 	flags &= ~_SOBIND_SOCKETPAIR;
1145 	return (sotpi_bindlisten(so, name, namelen, 1, flags));
1146 }
1147 
1148 /*
1149  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1150  * address, or when listen needs to unbind and bind.
1151  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1152  * so that a sobind can pick them up.
1153  */
1154 static int
1155 sotpi_unbind(struct sonode *so, int flags)
1156 {
1157 	struct T_unbind_req	unbind_req;
1158 	int			error = 0;
1159 	mblk_t			*mp;
1160 
1161 	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1162 	    (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1163 
1164 	ASSERT(MUTEX_HELD(&so->so_lock));
1165 	ASSERT(so->so_flag & SOLOCKED);
1166 
1167 	if (!(so->so_state & SS_ISBOUND)) {
1168 		error = EINVAL;
1169 		eprintsoline(so, error);
1170 		goto done;
1171 	}
1172 
1173 	mutex_exit(&so->so_lock);
1174 
1175 	/*
1176 	 * Flush the read and write side (except stream head read queue)
1177 	 * and send down T_UNBIND_REQ.
1178 	 */
1179 	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1180 
1181 	unbind_req.PRIM_type = T_UNBIND_REQ;
1182 	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1183 	    0, _ALLOC_SLEEP);
1184 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1185 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1186 	mutex_enter(&so->so_lock);
1187 	if (error) {
1188 		eprintsoline(so, error);
1189 		goto done;
1190 	}
1191 
1192 	error = sowaitokack(so, T_UNBIND_REQ);
1193 	if (error) {
1194 		eprintsoline(so, error);
1195 		goto done;
1196 	}
1197 
1198 	/*
1199 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1200 	 * strsock_proto while the lock was dropped above, the unbind
1201 	 * is allowed to complete.
1202 	 */
1203 	if (!(flags & _SOUNBIND_REBIND)) {
1204 		/*
1205 		 * Clear out bound address.
1206 		 */
1207 		vnode_t *vp;
1208 
1209 		if ((vp = so->so_ux_bound_vp) != NULL) {
1210 
1211 			/* Undo any SSL proxy setup */
1212 			if ((so->so_family == AF_INET ||
1213 			    so->so_family == AF_INET6) &&
1214 			    (so->so_type == SOCK_STREAM) &&
1215 			    (so->so_kssl_ent != NULL)) {
1216 				kssl_release_ent(so->so_kssl_ent, so,
1217 				    so->so_kssl_type);
1218 				so->so_kssl_ent = NULL;
1219 				so->so_kssl_type = KSSL_NO_PROXY;
1220 			}
1221 
1222 			so->so_ux_bound_vp = NULL;
1223 			vn_rele_stream(vp);
1224 		}
1225 		/* Clear out address */
1226 		so->so_laddr_len = 0;
1227 	}
1228 	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN|SS_LADDR_VALID);
1229 
1230 done:
1231 
1232 	/* If the caller held the lock don't release it here */
1233 	ASSERT(MUTEX_HELD(&so->so_lock));
1234 	ASSERT(so->so_flag & SOLOCKED);
1235 
1236 	return (error);
1237 }
1238 
1239 /*
1240  * listen on the socket.
1241  * For TPI conforming transports this has to first unbind with the transport
1242  * and then bind again using the new backlog.
1243  */
1244 int
1245 sotpi_listen(struct sonode *so, int backlog)
1246 {
1247 	int		error = 0;
1248 
1249 	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1250 	    (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1251 
1252 	if (so->so_serv_type == T_CLTS)
1253 		return (EOPNOTSUPP);
1254 
1255 	/*
1256 	 * If the socket is ready to accept connections already, then
1257 	 * return without doing anything.  This avoids a problem where
1258 	 * a second listen() call fails if a connection is pending and
1259 	 * leaves the socket unbound. Only when we are not unbinding
1260 	 * with the transport can we safely increase the backlog.
1261 	 */
1262 	if (so->so_state & SS_ACCEPTCONN &&
1263 	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1264 	    /*CONSTCOND*/
1265 	    !solisten_tpi_tcp))
1266 		return (0);
1267 
1268 	if (so->so_state & SS_ISCONNECTED)
1269 		return (EINVAL);
1270 
1271 	mutex_enter(&so->so_lock);
1272 	so_lock_single(so);	/* Set SOLOCKED */
1273 
1274 	if (backlog < 0)
1275 		backlog = 0;
1276 	/*
1277 	 * Use the same qlimit as in BSD. BSD checks the qlimit
1278 	 * before queuing the next connection implying that a
1279 	 * listen(sock, 0) allows one connection to be queued.
1280 	 * BSD also uses 1.5 times the requested backlog.
1281 	 *
1282 	 * XNS Issue 4 required a strict interpretation of the backlog.
1283 	 * This has been waived subsequently for Issue 4 and the change
1284 	 * incorporated in XNS Issue 5. So we aren't required to do
1285 	 * anything special for XPG apps.
1286 	 */
1287 	if (backlog >= (INT_MAX - 1) / 3)
1288 		backlog = INT_MAX;
1289 	else
1290 		backlog = backlog * 3 / 2 + 1;
1291 
1292 	/*
1293 	 * If the listen doesn't change the backlog we do nothing.
1294 	 * This avoids an EPROTO error from the transport.
1295 	 */
1296 	if ((so->so_state & SS_ACCEPTCONN) &&
1297 	    so->so_backlog == backlog)
1298 		goto done;
1299 
1300 	if (!(so->so_state & SS_ISBOUND)) {
1301 		/*
1302 		 * Must have been explicitly bound in the UNIX domain.
1303 		 */
1304 		if (so->so_family == AF_UNIX) {
1305 			error = EINVAL;
1306 			goto done;
1307 		}
1308 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1309 		    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN);
1310 	} else if (backlog > 0) {
1311 		/*
1312 		 * AF_INET{,6} hack to avoid losing the port.
1313 		 * Assumes that all AF_INET{,6} transports can handle a
1314 		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1315 		 * has already bound thus it is possible to avoid the unbind.
1316 		 */
1317 		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1318 		    /*CONSTCOND*/
1319 		    !solisten_tpi_tcp)) {
1320 			error = sotpi_unbind(so, _SOUNBIND_REBIND);
1321 			if (error)
1322 				goto done;
1323 		}
1324 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1325 		    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN);
1326 	} else {
1327 		so->so_state |= SS_ACCEPTCONN;
1328 		so->so_backlog = backlog;
1329 	}
1330 	if (error)
1331 		goto done;
1332 	ASSERT(so->so_state & SS_ACCEPTCONN);
1333 done:
1334 	so_unlock_single(so, SOLOCKED);
1335 	mutex_exit(&so->so_lock);
1336 	return (error);
1337 }
1338 
1339 /*
1340  * Disconnect either a specified seqno or all (-1).
1341  * The former is used on listening sockets only.
1342  *
1343  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1344  * the current use of sodisconnect(seqno == -1) is only for shutdown
1345  * so there is no point (and potentially incorrect) to unbind.
1346  */
1347 int
1348 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1349 {
1350 	struct T_discon_req	discon_req;
1351 	int			error = 0;
1352 	mblk_t			*mp;
1353 
1354 	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1355 	    (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1356 
1357 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1358 		mutex_enter(&so->so_lock);
1359 		so_lock_single(so);	/* Set SOLOCKED */
1360 	} else {
1361 		ASSERT(MUTEX_HELD(&so->so_lock));
1362 		ASSERT(so->so_flag & SOLOCKED);
1363 	}
1364 
1365 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1366 		error = EINVAL;
1367 		eprintsoline(so, error);
1368 		goto done;
1369 	}
1370 
1371 	mutex_exit(&so->so_lock);
1372 	/*
1373 	 * Flush the write side (unless this is a listener)
1374 	 * and then send down a T_DISCON_REQ.
1375 	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1376 	 * and other messages.)
1377 	 */
1378 	if (!(so->so_state & SS_ACCEPTCONN))
1379 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1380 
1381 	discon_req.PRIM_type = T_DISCON_REQ;
1382 	discon_req.SEQ_number = seqno;
1383 	mp = soallocproto1(&discon_req, sizeof (discon_req),
1384 	    0, _ALLOC_SLEEP);
1385 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1386 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1387 	mutex_enter(&so->so_lock);
1388 	if (error) {
1389 		eprintsoline(so, error);
1390 		goto done;
1391 	}
1392 
1393 	error = sowaitokack(so, T_DISCON_REQ);
1394 	if (error) {
1395 		eprintsoline(so, error);
1396 		goto done;
1397 	}
1398 	/*
1399 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1400 	 * strsock_proto while the lock was dropped above, the disconnect
1401 	 * is allowed to complete. However, it is not possible to
1402 	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1403 	 */
1404 	so->so_state &=
1405 	    ~(SS_ISCONNECTED|SS_ISCONNECTING|SS_LADDR_VALID|SS_FADDR_VALID);
1406 done:
1407 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1408 		so_unlock_single(so, SOLOCKED);
1409 		mutex_exit(&so->so_lock);
1410 	} else {
1411 		/* If the caller held the lock don't release it here */
1412 		ASSERT(MUTEX_HELD(&so->so_lock));
1413 		ASSERT(so->so_flag & SOLOCKED);
1414 	}
1415 	return (error);
1416 }
1417 
1418 int
1419 sotpi_accept(struct sonode *so, int fflag, struct sonode **nsop)
1420 {
1421 	struct T_conn_ind	*conn_ind;
1422 	struct T_conn_res	*conn_res;
1423 	int			error = 0;
1424 	mblk_t			*mp, *ctxmp, *ack_mp;
1425 	struct sonode		*nso;
1426 	vnode_t			*nvp;
1427 	void			*src;
1428 	t_uscalar_t		srclen;
1429 	void			*opt;
1430 	t_uscalar_t		optlen;
1431 	t_scalar_t		PRIM_type;
1432 	t_scalar_t		SEQ_number;
1433 	size_t			sinlen;
1434 
1435 	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1436 	    (void *)so, fflag, (void *)nsop,
1437 	    pr_state(so->so_state, so->so_mode)));
1438 
1439 	/*
1440 	 * Defer single-threading the accepting socket until
1441 	 * the T_CONN_IND has been received and parsed and the
1442 	 * new sonode has been opened.
1443 	 */
1444 
1445 	/* Check that we are not already connected */
1446 	if ((so->so_state & SS_ACCEPTCONN) == 0)
1447 		goto conn_bad;
1448 again:
1449 	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1450 		goto e_bad;
1451 
1452 	ASSERT(mp);
1453 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1454 	ctxmp = mp->b_cont;
1455 
1456 	/*
1457 	 * Save SEQ_number for error paths.
1458 	 */
1459 	SEQ_number = conn_ind->SEQ_number;
1460 
1461 	srclen = conn_ind->SRC_length;
1462 	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1463 	if (src == NULL) {
1464 		error = EPROTO;
1465 		freemsg(mp);
1466 		eprintsoline(so, error);
1467 		goto disconnect_unlocked;
1468 	}
1469 	optlen = conn_ind->OPT_length;
1470 	switch (so->so_family) {
1471 	case AF_INET:
1472 	case AF_INET6:
1473 		if ((optlen == sizeof (intptr_t)) &&
1474 		    ((so->so_state & SS_DIRECT) != 0)) {
1475 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
1476 			    &opt, conn_ind->OPT_length);
1477 		} else {
1478 			/*
1479 			 * The transport (in this case TCP) hasn't sent up
1480 			 * a pointer to an instance for the accept fast-path.
1481 			 * Disable fast-path completely because the call to
1482 			 * sotpi_create() below would otherwise create an
1483 			 * incomplete TCP instance, which would lead to
1484 			 * problems when sockfs sends a normal T_CONN_RES
1485 			 * message down the new stream.
1486 			 */
1487 			if (so->so_state & SS_DIRECT) {
1488 				int rval;
1489 				/*
1490 				 * For consistency we inform tcp to disable
1491 				 * direct interface on the listener, though
1492 				 * we can certainly live without doing this
1493 				 * because no data will ever travel upstream
1494 				 * on the listening socket.
1495 				 */
1496 				so->so_state &= ~SS_DIRECT;
1497 				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1498 				    0, 0, K_TO_K, CRED(), &rval);
1499 			}
1500 			opt = NULL;
1501 			optlen = 0;
1502 		}
1503 		break;
1504 	case AF_UNIX:
1505 	default:
1506 		if (optlen != 0) {
1507 			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1508 			    __TPI_ALIGN_SIZE);
1509 			if (opt == NULL) {
1510 				error = EPROTO;
1511 				freemsg(mp);
1512 				eprintsoline(so, error);
1513 				goto disconnect_unlocked;
1514 			}
1515 		}
1516 		if (so->so_family == AF_UNIX) {
1517 			if (!(so->so_state & SS_FADDR_NOXLATE)) {
1518 				src = NULL;
1519 				srclen = 0;
1520 			}
1521 			/* Extract src address from options */
1522 			if (optlen != 0)
1523 				so_getopt_srcaddr(opt, optlen, &src, &srclen);
1524 		}
1525 		break;
1526 	}
1527 
1528 	/*
1529 	 * Create the new socket.
1530 	 */
1531 	VN_HOLD(so->so_accessvp);
1532 	nso = sotpi_create(so->so_accessvp, so->so_family, so->so_type,
1533 	    so->so_protocol, so->so_version, so, &error);
1534 	if (nso == NULL) {
1535 		ASSERT(error != 0);
1536 		/*
1537 		 * Accept can not fail with ENOBUFS. sotpi_create
1538 		 * sleeps waiting for memory until a signal is caught
1539 		 * so return EINTR.
1540 		 */
1541 		freemsg(mp);
1542 		if (error == ENOBUFS)
1543 			error = EINTR;
1544 		goto e_disc_unl;
1545 	}
1546 	nvp = SOTOV(nso);
1547 
1548 	/*
1549 	 * If the transport sent up an SSL connection context, then attach
1550 	 * it the new socket, and set the (sd_wputdatafunc)() and
1551 	 * (sd_rputdatafunc)() stream head hooks to intercept and process
1552 	 * SSL records.
1553 	 */
1554 	if (ctxmp != NULL) {
1555 		/*
1556 		 * This kssl_ctx_t is already held for us by the transport.
1557 		 * So, we don't need to do a kssl_hold_ctx() here.
1558 		 */
1559 		nso->so_kssl_ctx = *((kssl_ctx_t *)ctxmp->b_rptr);
1560 		freemsg(ctxmp);
1561 		mp->b_cont = NULL;
1562 		strsetrwputdatahooks(nvp, strsock_kssl_input,
1563 		    strsock_kssl_output);
1564 	}
1565 #ifdef DEBUG
1566 	/*
1567 	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1568 	 * it's inherited early to allow debugging of the accept code itself.
1569 	 */
1570 	nso->so_options |= so->so_options & SO_DEBUG;
1571 #endif /* DEBUG */
1572 
1573 	/*
1574 	 * Save the SRC address from the T_CONN_IND
1575 	 * for getpeername to work on AF_UNIX and on transports that do not
1576 	 * support TI_GETPEERNAME.
1577 	 *
1578 	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1579 	 * copyin_name().
1580 	 */
1581 	if (srclen > (t_uscalar_t)nso->so_faddr_maxlen) {
1582 		error = EINVAL;
1583 		freemsg(mp);
1584 		eprintsoline(so, error);
1585 		goto disconnect_vp_unlocked;
1586 	}
1587 	nso->so_faddr_len = (socklen_t)srclen;
1588 	ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
1589 	bcopy(src, nso->so_faddr_sa, srclen);
1590 	nso->so_state |= SS_FADDR_VALID;
1591 
1592 	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1593 	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1594 		cred_t *cr;
1595 
1596 		if ((cr = DB_CRED(mp)) != NULL) {
1597 			crhold(cr);
1598 			nso->so_peercred = cr;
1599 			nso->so_cpid = DB_CPID(mp);
1600 		}
1601 		freemsg(mp);
1602 
1603 		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1604 		    sizeof (intptr_t), 0, _ALLOC_INTR);
1605 		if (mp == NULL) {
1606 			/*
1607 			 * Accept can not fail with ENOBUFS.
1608 			 * A signal was caught so return EINTR.
1609 			 */
1610 			error = EINTR;
1611 			eprintsoline(so, error);
1612 			goto disconnect_vp_unlocked;
1613 		}
1614 		conn_res = (struct T_conn_res *)mp->b_rptr;
1615 	} else {
1616 		nso->so_peercred = DB_CRED(mp);
1617 		nso->so_cpid = DB_CPID(mp);
1618 		DB_CRED(mp) = NULL;
1619 
1620 		mp->b_rptr = DB_BASE(mp);
1621 		conn_res = (struct T_conn_res *)mp->b_rptr;
1622 		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1623 	}
1624 
1625 	/*
1626 	 * New socket must be bound at least in sockfs and, except for AF_INET,
1627 	 * (or AF_INET6) it also has to be bound in the transport provider.
1628 	 * We set the local address in the sonode from the T_OK_ACK of the
1629 	 * T_CONN_RES. For this reason the address we bind to here isn't
1630 	 * important.
1631 	 */
1632 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1633 	    /*CONSTCOND*/
1634 	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1635 		/*
1636 		 * Optimization for AF_INET{,6} transports
1637 		 * that can handle a T_CONN_RES without being bound.
1638 		 */
1639 		mutex_enter(&nso->so_lock);
1640 		so_automatic_bind(nso);
1641 		mutex_exit(&nso->so_lock);
1642 	} else {
1643 		/* Perform NULL bind with the transport provider. */
1644 		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC)) != 0) {
1645 			ASSERT(error != ENOBUFS);
1646 			freemsg(mp);
1647 			eprintsoline(nso, error);
1648 			goto disconnect_vp_unlocked;
1649 		}
1650 	}
1651 
1652 	/*
1653 	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1654 	 * so that any data arriving on the new socket will cause the
1655 	 * appropriate signals to be delivered for the new socket.
1656 	 *
1657 	 * No other thread (except strsock_proto and strsock_misc)
1658 	 * can access the new socket thus we relax the locking.
1659 	 */
1660 	nso->so_pgrp = so->so_pgrp;
1661 	nso->so_state |= so->so_state & (SS_ASYNC|SS_FADDR_NOXLATE);
1662 
1663 	if (nso->so_pgrp != 0) {
1664 		if ((error = so_set_events(nso, nvp, CRED())) != 0) {
1665 			eprintsoline(nso, error);
1666 			error = 0;
1667 			nso->so_pgrp = 0;
1668 		}
1669 	}
1670 
1671 	/*
1672 	 * Make note of the socket level options. TCP and IP level options
1673 	 * are already inherited. We could do all this after accept is
1674 	 * successful but doing it here simplifies code and no harm done
1675 	 * for error case.
1676 	 */
1677 	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1678 	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1679 	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1680 	nso->so_sndbuf = so->so_sndbuf;
1681 	nso->so_rcvbuf = so->so_rcvbuf;
1682 	if (nso->so_options & SO_LINGER)
1683 		nso->so_linger = so->so_linger;
1684 
1685 	if ((so->so_state & SS_DIRECT) != 0) {
1686 
1687 		ASSERT(opt != NULL);
1688 
1689 		conn_res->OPT_length = optlen;
1690 		conn_res->OPT_offset = MBLKL(mp);
1691 		bcopy(&opt, mp->b_wptr, optlen);
1692 		mp->b_wptr += optlen;
1693 		conn_res->PRIM_type = T_CONN_RES;
1694 		conn_res->ACCEPTOR_id = 0;
1695 		PRIM_type = T_CONN_RES;
1696 
1697 		/* Send down the T_CONN_RES on acceptor STREAM */
1698 		error = kstrputmsg(SOTOV(nso), mp, NULL,
1699 		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1700 		if (error) {
1701 			mutex_enter(&so->so_lock);
1702 			so_lock_single(so);
1703 			eprintsoline(so, error);
1704 			goto disconnect_vp;
1705 		}
1706 		mutex_enter(&nso->so_lock);
1707 		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1708 		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1709 		if (error) {
1710 			mutex_exit(&nso->so_lock);
1711 			mutex_enter(&so->so_lock);
1712 			so_lock_single(so);
1713 			eprintsoline(so, error);
1714 			goto disconnect_vp;
1715 		}
1716 		if (nso->so_family == AF_INET) {
1717 			sin_t *sin;
1718 
1719 			sin = (sin_t *)(ack_mp->b_rptr +
1720 			    sizeof (struct T_ok_ack));
1721 			bcopy(sin, nso->so_laddr_sa, sizeof (sin_t));
1722 			nso->so_laddr_len = sizeof (sin_t);
1723 		} else {
1724 			sin6_t *sin6;
1725 
1726 			sin6 = (sin6_t *)(ack_mp->b_rptr +
1727 			    sizeof (struct T_ok_ack));
1728 			bcopy(sin6, nso->so_laddr_sa, sizeof (sin6_t));
1729 			nso->so_laddr_len = sizeof (sin6_t);
1730 		}
1731 		freemsg(ack_mp);
1732 
1733 		nso->so_state |= SS_ISCONNECTED | SS_LADDR_VALID;
1734 		nso->so_priv = opt;
1735 
1736 		if (so->so_nl7c_flags & NL7C_ENABLED) {
1737 			/*
1738 			 * A NL7C marked listen()er so the new socket
1739 			 * inherits the listen()er's NL7C state, except
1740 			 * for NL7C_POLLIN.
1741 			 *
1742 			 * Only call NL7C to process the new socket if
1743 			 * the listen socket allows blocking i/o.
1744 			 */
1745 			nso->so_nl7c_flags = so->so_nl7c_flags & (~NL7C_POLLIN);
1746 			if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
1747 				/*
1748 				 * Nonblocking accept() just make it
1749 				 * persist to defer processing to the
1750 				 * read-side syscall (e.g. read).
1751 				 */
1752 				nso->so_nl7c_flags |= NL7C_SOPERSIST;
1753 			} else if (nl7c_process(nso, B_FALSE)) {
1754 				/*
1755 				 * NL7C has completed processing on the
1756 				 * socket, close the socket and back to
1757 				 * the top to await the next T_CONN_IND.
1758 				 */
1759 				mutex_exit(&nso->so_lock);
1760 				(void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
1761 				    CRED(), NULL);
1762 				VN_RELE(nvp);
1763 				goto again;
1764 			}
1765 			/* Pass the new socket out */
1766 		}
1767 
1768 		mutex_exit(&nso->so_lock);
1769 
1770 		/*
1771 		 * It's possible, through the use of autopush for example,
1772 		 * that the acceptor stream may not support SS_DIRECT
1773 		 * semantics. If the new socket does not support SS_DIRECT
1774 		 * we issue a _SIOCSOCKFALLBACK to inform the transport
1775 		 * as we would in the I_PUSH case.
1776 		 */
1777 		if (!(nso->so_state & SS_DIRECT)) {
1778 			int	rval;
1779 
1780 			if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
1781 			    0, 0, K_TO_K, CRED(), &rval)) != 0) {
1782 				mutex_enter(&so->so_lock);
1783 				so_lock_single(so);
1784 				eprintsoline(so, error);
1785 				goto disconnect_vp;
1786 			}
1787 		}
1788 
1789 		/*
1790 		 * Pass out new socket.
1791 		 */
1792 		if (nsop != NULL)
1793 			*nsop = nso;
1794 
1795 		return (0);
1796 	}
1797 
1798 	/*
1799 	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
1800 	 * which don't support the FireEngine accept fast-path. It is also
1801 	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
1802 	 * again. Neither sockfs nor TCP attempt to find out if some other
1803 	 * random module has been inserted in between (in which case we
1804 	 * should follow TLI accept behaviour). We blindly assume the worst
1805 	 * case and revert back to old behaviour i.e. TCP will not send us
1806 	 * any option (eager) and the accept should happen on the listener
1807 	 * queue. Any queued T_conn_ind have already got their options removed
1808 	 * by so_sock2_stream() when "sockmod" was I_POP'd.
1809 	 */
1810 	/*
1811 	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
1812 	 */
1813 	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
1814 #ifdef	_ILP32
1815 		queue_t	*q;
1816 
1817 		/*
1818 		 * Find read queue in driver
1819 		 * Can safely do this since we "own" nso/nvp.
1820 		 */
1821 		q = strvp2wq(nvp)->q_next;
1822 		while (SAMESTR(q))
1823 			q = q->q_next;
1824 		q = RD(q);
1825 		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
1826 #else
1827 		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
1828 #endif	/* _ILP32 */
1829 		conn_res->PRIM_type = O_T_CONN_RES;
1830 		PRIM_type = O_T_CONN_RES;
1831 	} else {
1832 		conn_res->ACCEPTOR_id = nso->so_acceptor_id;
1833 		conn_res->PRIM_type = T_CONN_RES;
1834 		PRIM_type = T_CONN_RES;
1835 	}
1836 	conn_res->SEQ_number = SEQ_number;
1837 	conn_res->OPT_length = 0;
1838 	conn_res->OPT_offset = 0;
1839 
1840 	mutex_enter(&so->so_lock);
1841 	so_lock_single(so);	/* Set SOLOCKED */
1842 	mutex_exit(&so->so_lock);
1843 
1844 	error = kstrputmsg(SOTOV(so), mp, NULL,
1845 	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1846 	mutex_enter(&so->so_lock);
1847 	if (error) {
1848 		eprintsoline(so, error);
1849 		goto disconnect_vp;
1850 	}
1851 	error = sowaitprim(so, PRIM_type, T_OK_ACK,
1852 	    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1853 	if (error) {
1854 		eprintsoline(so, error);
1855 		goto disconnect_vp;
1856 	}
1857 	/*
1858 	 * If there is a sin/sin6 appended onto the T_OK_ACK use
1859 	 * that to set the local address. If this is not present
1860 	 * then we zero out the address and don't set the
1861 	 * SS_LADDR_VALID bit. For AF_UNIX endpoints we copy over
1862 	 * the pathname from the listening socket.
1863 	 */
1864 	sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
1865 	if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
1866 	    MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
1867 		ack_mp->b_rptr += sizeof (struct T_ok_ack);
1868 		bcopy(ack_mp->b_rptr, nso->so_laddr_sa, sinlen);
1869 		nso->so_laddr_len = sinlen;
1870 		nso->so_state |= SS_LADDR_VALID;
1871 	} else if (nso->so_family == AF_UNIX) {
1872 		ASSERT(so->so_family == AF_UNIX);
1873 		nso->so_laddr_len = so->so_laddr_len;
1874 		ASSERT(nso->so_laddr_len <= nso->so_laddr_maxlen);
1875 		bcopy(so->so_laddr_sa, nso->so_laddr_sa, nso->so_laddr_len);
1876 		nso->so_state |= SS_LADDR_VALID;
1877 	} else {
1878 		nso->so_laddr_len = so->so_laddr_len;
1879 		ASSERT(nso->so_laddr_len <= nso->so_laddr_maxlen);
1880 		bzero(nso->so_laddr_sa, nso->so_addr_size);
1881 		nso->so_laddr_sa->sa_family = nso->so_family;
1882 	}
1883 	freemsg(ack_mp);
1884 
1885 	so_unlock_single(so, SOLOCKED);
1886 	mutex_exit(&so->so_lock);
1887 
1888 	nso->so_state |= SS_ISCONNECTED;
1889 
1890 	/*
1891 	 * Pass out new socket.
1892 	 */
1893 	if (nsop != NULL)
1894 		*nsop = nso;
1895 
1896 	return (0);
1897 
1898 
1899 eproto_disc_unl:
1900 	error = EPROTO;
1901 e_disc_unl:
1902 	eprintsoline(so, error);
1903 	goto disconnect_unlocked;
1904 
1905 pr_disc_vp_unl:
1906 	eprintsoline(so, error);
1907 disconnect_vp_unlocked:
1908 	(void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL);
1909 	VN_RELE(nvp);
1910 disconnect_unlocked:
1911 	(void) sodisconnect(so, SEQ_number, 0);
1912 	return (error);
1913 
1914 pr_disc_vp:
1915 	eprintsoline(so, error);
1916 disconnect_vp:
1917 	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
1918 	so_unlock_single(so, SOLOCKED);
1919 	mutex_exit(&so->so_lock);
1920 	(void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL);
1921 	VN_RELE(nvp);
1922 	return (error);
1923 
1924 conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
1925 	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
1926 	    ? EOPNOTSUPP : EINVAL;
1927 e_bad:
1928 	eprintsoline(so, error);
1929 	return (error);
1930 }
1931 
1932 /*
1933  * connect a socket.
1934  *
1935  * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
1936  * unconnect (by specifying a null address).
1937  */
1938 int
1939 sotpi_connect(struct sonode *so,
1940 	const struct sockaddr *name,
1941 	socklen_t namelen,
1942 	int fflag,
1943 	int flags)
1944 {
1945 	struct T_conn_req	conn_req;
1946 	int			error = 0;
1947 	mblk_t			*mp;
1948 	void			*src;
1949 	socklen_t		srclen;
1950 	void			*addr;
1951 	socklen_t		addrlen;
1952 	boolean_t		need_unlock;
1953 
1954 	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
1955 	    (void *)so, (void *)name, namelen, fflag, flags,
1956 	    pr_state(so->so_state, so->so_mode)));
1957 
1958 	/*
1959 	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
1960 	 * avoid sleeping for memory with SOLOCKED held.
1961 	 * We know that the T_CONN_REQ can't be larger than 2 * so_faddr_maxlen
1962 	 * + sizeof (struct T_opthdr).
1963 	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
1964 	 * exceed so_faddr_maxlen).
1965 	 */
1966 	mp = soallocproto(sizeof (struct T_conn_req) +
1967 	    2 * so->so_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR);
1968 	if (mp == NULL) {
1969 		/*
1970 		 * Connect can not fail with ENOBUFS. A signal was
1971 		 * caught so return EINTR.
1972 		 */
1973 		error = EINTR;
1974 		eprintsoline(so, error);
1975 		return (error);
1976 	}
1977 
1978 	mutex_enter(&so->so_lock);
1979 	/*
1980 	 * Make sure there is a preallocated T_unbind_req message
1981 	 * before any binding. This message is allocated when the
1982 	 * socket is created. Since another thread can consume
1983 	 * so_unbind_mp by the time we return from so_lock_single(),
1984 	 * we should check the availability of so_unbind_mp after
1985 	 * we return from so_lock_single().
1986 	 */
1987 
1988 	so_lock_single(so);	/* Set SOLOCKED */
1989 	need_unlock = B_TRUE;
1990 
1991 	if (so->so_unbind_mp == NULL) {
1992 		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
1993 		/* NOTE: holding so_lock while sleeping */
1994 		so->so_unbind_mp =
1995 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR);
1996 		if (so->so_unbind_mp == NULL) {
1997 			error = EINTR;
1998 			goto done;
1999 		}
2000 	}
2001 
2002 	/*
2003 	 * Can't have done a listen before connecting.
2004 	 */
2005 	if (so->so_state & SS_ACCEPTCONN) {
2006 		error = EOPNOTSUPP;
2007 		goto done;
2008 	}
2009 
2010 	/*
2011 	 * Must be bound with the transport
2012 	 */
2013 	if (!(so->so_state & SS_ISBOUND)) {
2014 		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2015 		    /*CONSTCOND*/
2016 		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2017 			/*
2018 			 * Optimization for AF_INET{,6} transports
2019 			 * that can handle a T_CONN_REQ without being bound.
2020 			 */
2021 			so_automatic_bind(so);
2022 		} else {
2023 			error = sotpi_bind(so, NULL, 0,
2024 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD);
2025 			if (error)
2026 				goto done;
2027 		}
2028 		ASSERT(so->so_state & SS_ISBOUND);
2029 		flags |= _SOCONNECT_DID_BIND;
2030 	}
2031 
2032 	/*
2033 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
2034 	 * connect to a null address. This is the portable method to
2035 	 * unconnect a socket.
2036 	 */
2037 	if ((namelen >= sizeof (sa_family_t)) &&
2038 	    (name->sa_family == AF_UNSPEC)) {
2039 		name = NULL;
2040 		namelen = 0;
2041 	}
2042 
2043 	/*
2044 	 * Check that we are not already connected.
2045 	 * A connection-oriented socket cannot be reconnected.
2046 	 * A connected connection-less socket can be
2047 	 * - connected to a different address by a subsequent connect
2048 	 * - "unconnected" by a connect to the NULL address
2049 	 */
2050 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2051 		ASSERT(!(flags & _SOCONNECT_DID_BIND));
2052 		if (so->so_mode & SM_CONNREQUIRED) {
2053 			/* Connection-oriented socket */
2054 			error = so->so_state & SS_ISCONNECTED ?
2055 			    EISCONN : EALREADY;
2056 			goto done;
2057 		}
2058 		/* Connection-less socket */
2059 		if (name == NULL) {
2060 			/*
2061 			 * Remove the connected state and clear SO_DGRAM_ERRIND
2062 			 * since it was set when the socket was connected.
2063 			 * If this is UDP also send down a T_DISCON_REQ.
2064 			 */
2065 			int val;
2066 
2067 			if ((so->so_family == AF_INET ||
2068 			    so->so_family == AF_INET6) &&
2069 			    (so->so_type == SOCK_DGRAM ||
2070 			    so->so_type == SOCK_RAW) &&
2071 			    /*CONSTCOND*/
2072 			    !soconnect_tpi_udp) {
2073 				/* XXX What about implicitly unbinding here? */
2074 				error = sodisconnect(so, -1,
2075 				    _SODISCONNECT_LOCK_HELD);
2076 			} else {
2077 				so->so_state &=
2078 				    ~(SS_ISCONNECTED | SS_ISCONNECTING |
2079 				    SS_FADDR_VALID);
2080 				so->so_faddr_len = 0;
2081 			}
2082 
2083 			so_unlock_single(so, SOLOCKED);
2084 			mutex_exit(&so->so_lock);
2085 
2086 			val = 0;
2087 			(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2088 			    &val, (t_uscalar_t)sizeof (val));
2089 
2090 			mutex_enter(&so->so_lock);
2091 			so_lock_single(so);	/* Set SOLOCKED */
2092 			goto done;
2093 		}
2094 	}
2095 	ASSERT(so->so_state & SS_ISBOUND);
2096 
2097 	if (name == NULL || namelen == 0) {
2098 		error = EINVAL;
2099 		goto done;
2100 	}
2101 	/*
2102 	 * Mark the socket if so_faddr_sa represents the transport level
2103 	 * address.
2104 	 */
2105 	if (flags & _SOCONNECT_NOXLATE) {
2106 		struct sockaddr_ux	*soaddr_ux;
2107 
2108 		ASSERT(so->so_family == AF_UNIX);
2109 		if (namelen != sizeof (struct sockaddr_ux)) {
2110 			error = EINVAL;
2111 			goto done;
2112 		}
2113 		soaddr_ux = (struct sockaddr_ux *)name;
2114 		name = (struct sockaddr *)&soaddr_ux->sou_addr;
2115 		namelen = sizeof (soaddr_ux->sou_addr);
2116 		so->so_state |= SS_FADDR_NOXLATE;
2117 	}
2118 
2119 	/*
2120 	 * Length and family checks.
2121 	 */
2122 	error = so_addr_verify(so, name, namelen);
2123 	if (error)
2124 		goto bad;
2125 
2126 	/*
2127 	 * Save foreign address. Needed for AF_UNIX as well as
2128 	 * transport providers that do not support TI_GETPEERNAME.
2129 	 * Also used for cached foreign address for TCP and UDP.
2130 	 */
2131 	if (namelen > (t_uscalar_t)so->so_faddr_maxlen) {
2132 		error = EINVAL;
2133 		goto done;
2134 	}
2135 	so->so_faddr_len = (socklen_t)namelen;
2136 	ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
2137 	bcopy(name, so->so_faddr_sa, namelen);
2138 	so->so_state |= SS_FADDR_VALID;
2139 
2140 	if (so->so_family == AF_UNIX) {
2141 		if (so->so_state & SS_FADDR_NOXLATE) {
2142 			/*
2143 			 * Already have a transport internal address. Do not
2144 			 * pass any (transport internal) source address.
2145 			 */
2146 			addr = so->so_faddr_sa;
2147 			addrlen = (t_uscalar_t)so->so_faddr_len;
2148 			src = NULL;
2149 			srclen = 0;
2150 		} else {
2151 			/*
2152 			 * Pass the sockaddr_un source address as an option
2153 			 * and translate the remote address.
2154 			 * Holding so_lock thus so_laddr_sa can not change.
2155 			 */
2156 			src = so->so_laddr_sa;
2157 			srclen = (t_uscalar_t)so->so_laddr_len;
2158 			dprintso(so, 1,
2159 			    ("sotpi_connect UNIX: srclen %d, src %p\n",
2160 			    srclen, src));
2161 			error = so_ux_addr_xlate(so,
2162 			    so->so_faddr_sa, (socklen_t)so->so_faddr_len,
2163 			    (flags & _SOCONNECT_XPG4_2),
2164 			    &addr, &addrlen);
2165 			if (error)
2166 				goto bad;
2167 		}
2168 	} else {
2169 		addr = so->so_faddr_sa;
2170 		addrlen = (t_uscalar_t)so->so_faddr_len;
2171 		src = NULL;
2172 		srclen = 0;
2173 	}
2174 	/*
2175 	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2176 	 * option which asks the transport provider to send T_UDERR_IND
2177 	 * messages. These T_UDERR_IND messages are used to return connected
2178 	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2179 	 *
2180 	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2181 	 * we send down a T_CONN_REQ. This is needed to let the
2182 	 * transport assign a local address that is consistent with
2183 	 * the remote address. Applications depend on a getsockname()
2184 	 * after a connect() to retrieve the "source" IP address for
2185 	 * the connected socket.  Invalidate the cached local address
2186 	 * to force getsockname() to enquire of the transport.
2187 	 */
2188 	if (!(so->so_mode & SM_CONNREQUIRED)) {
2189 		/*
2190 		 * Datagram socket.
2191 		 */
2192 		int32_t val;
2193 
2194 		so_unlock_single(so, SOLOCKED);
2195 		mutex_exit(&so->so_lock);
2196 
2197 		val = 1;
2198 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2199 		    &val, (t_uscalar_t)sizeof (val));
2200 
2201 		mutex_enter(&so->so_lock);
2202 		so_lock_single(so);	/* Set SOLOCKED */
2203 		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2204 		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2205 		    soconnect_tpi_udp) {
2206 			soisconnected(so);
2207 			goto done;
2208 		}
2209 		/*
2210 		 * Send down T_CONN_REQ etc.
2211 		 * Clear fflag to avoid returning EWOULDBLOCK.
2212 		 */
2213 		fflag = 0;
2214 		ASSERT(so->so_family != AF_UNIX);
2215 		so->so_state &= ~SS_LADDR_VALID;
2216 	} else if (so->so_laddr_len != 0) {
2217 		/*
2218 		 * If the local address or port was "any" then it may be
2219 		 * changed by the transport as a result of the
2220 		 * connect.  Invalidate the cached version if we have one.
2221 		 */
2222 		switch (so->so_family) {
2223 		case AF_INET:
2224 			ASSERT(so->so_laddr_len == (socklen_t)sizeof (sin_t));
2225 			if (((sin_t *)so->so_laddr_sa)->sin_addr.s_addr ==
2226 			    INADDR_ANY ||
2227 			    ((sin_t *)so->so_laddr_sa)->sin_port == 0)
2228 				so->so_state &= ~SS_LADDR_VALID;
2229 			break;
2230 
2231 		case AF_INET6:
2232 			ASSERT(so->so_laddr_len == (socklen_t)sizeof (sin6_t));
2233 			if (IN6_IS_ADDR_UNSPECIFIED(
2234 			    &((sin6_t *)so->so_laddr_sa) ->sin6_addr) ||
2235 			    IN6_IS_ADDR_V4MAPPED_ANY(
2236 			    &((sin6_t *)so->so_laddr_sa)->sin6_addr) ||
2237 			    ((sin6_t *)so->so_laddr_sa)->sin6_port == 0)
2238 				so->so_state &= ~SS_LADDR_VALID;
2239 			break;
2240 
2241 		default:
2242 			break;
2243 		}
2244 	}
2245 
2246 	/*
2247 	 * Check for failure of an earlier call
2248 	 */
2249 	if (so->so_error != 0)
2250 		goto so_bad;
2251 
2252 	/*
2253 	 * Send down T_CONN_REQ. Message was allocated above.
2254 	 */
2255 	conn_req.PRIM_type = T_CONN_REQ;
2256 	conn_req.DEST_length = addrlen;
2257 	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2258 	if (srclen == 0) {
2259 		conn_req.OPT_length = 0;
2260 		conn_req.OPT_offset = 0;
2261 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2262 		soappendmsg(mp, addr, addrlen);
2263 	} else {
2264 		/*
2265 		 * There is a AF_UNIX sockaddr_un to include as a source
2266 		 * address option.
2267 		 */
2268 		struct T_opthdr toh;
2269 
2270 		toh.level = SOL_SOCKET;
2271 		toh.name = SO_SRCADDR;
2272 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2273 		toh.status = 0;
2274 		conn_req.OPT_length =
2275 		    (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2276 		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2277 		    _TPI_ALIGN_TOPT(addrlen));
2278 
2279 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2280 		soappendmsg(mp, addr, addrlen);
2281 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2282 		soappendmsg(mp, &toh, sizeof (toh));
2283 		soappendmsg(mp, src, srclen);
2284 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2285 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2286 	}
2287 	/*
2288 	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2289 	 * in order to have the right state when the T_CONN_CON shows up.
2290 	 */
2291 	soisconnecting(so);
2292 	mutex_exit(&so->so_lock);
2293 
2294 	if (audit_active)
2295 		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2296 
2297 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2298 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2299 	mp = NULL;
2300 	mutex_enter(&so->so_lock);
2301 	if (error != 0)
2302 		goto bad;
2303 
2304 	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2305 		goto bad;
2306 
2307 	/* Allow other threads to access the socket */
2308 	so_unlock_single(so, SOLOCKED);
2309 	need_unlock = B_FALSE;
2310 
2311 	/*
2312 	 * Wait until we get a T_CONN_CON or an error
2313 	 */
2314 	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2315 		so_lock_single(so);	/* Set SOLOCKED */
2316 		need_unlock = B_TRUE;
2317 	}
2318 
2319 done:
2320 	freemsg(mp);
2321 	switch (error) {
2322 	case EINPROGRESS:
2323 	case EALREADY:
2324 	case EISCONN:
2325 	case EINTR:
2326 		/* Non-fatal errors */
2327 		so->so_state &= ~SS_LADDR_VALID;
2328 		/* FALLTHRU */
2329 	case 0:
2330 		break;
2331 
2332 	case EHOSTUNREACH:
2333 		if (flags & _SOCONNECT_XPG4_2) {
2334 			/*
2335 			 * X/Open specification contains a requirement that
2336 			 * ENETUNREACH be returned but does not require
2337 			 * EHOSTUNREACH. In order to keep the test suite
2338 			 * happy we mess with the errno here.
2339 			 */
2340 			error = ENETUNREACH;
2341 		}
2342 		/* FALLTHRU */
2343 
2344 	default:
2345 		ASSERT(need_unlock);
2346 		/*
2347 		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2348 		 * and invalidate local-address cache
2349 		 */
2350 		so->so_state &= ~(SS_ISCONNECTING | SS_LADDR_VALID);
2351 		/* A discon_ind might have already unbound us */
2352 		if ((flags & _SOCONNECT_DID_BIND) &&
2353 		    (so->so_state & SS_ISBOUND)) {
2354 			int err;
2355 
2356 			err = sotpi_unbind(so, 0);
2357 			/* LINTED - statement has no conseq */
2358 			if (err) {
2359 				eprintsoline(so, err);
2360 			}
2361 		}
2362 		break;
2363 	}
2364 	if (need_unlock)
2365 		so_unlock_single(so, SOLOCKED);
2366 	mutex_exit(&so->so_lock);
2367 	return (error);
2368 
2369 so_bad:	error = sogeterr(so);
2370 bad:	eprintsoline(so, error);
2371 	goto done;
2372 }
2373 
2374 int
2375 sotpi_shutdown(struct sonode *so, int how)
2376 {
2377 	struct T_ordrel_req	ordrel_req;
2378 	mblk_t			*mp;
2379 	uint_t			old_state, state_change;
2380 	int			error = 0;
2381 
2382 	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2383 	    (void *)so, how, pr_state(so->so_state, so->so_mode)));
2384 
2385 	mutex_enter(&so->so_lock);
2386 	so_lock_single(so);	/* Set SOLOCKED */
2387 
2388 	/*
2389 	 * SunOS 4.X has no check for datagram sockets.
2390 	 * 5.X checks that it is connected (ENOTCONN)
2391 	 * X/Open requires that we check the connected state.
2392 	 */
2393 	if (!(so->so_state & SS_ISCONNECTED)) {
2394 		if (!xnet_skip_checks) {
2395 			error = ENOTCONN;
2396 			if (xnet_check_print) {
2397 				printf("sockfs: X/Open shutdown check "
2398 				    "caused ENOTCONN\n");
2399 			}
2400 		}
2401 		goto done;
2402 	}
2403 	/*
2404 	 * Record the current state and then perform any state changes.
2405 	 * Then use the difference between the old and new states to
2406 	 * determine which messages need to be sent.
2407 	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2408 	 * duplicate calls to shutdown().
2409 	 */
2410 	old_state = so->so_state;
2411 
2412 	switch (how) {
2413 	case 0:
2414 		socantrcvmore(so);
2415 		break;
2416 	case 1:
2417 		socantsendmore(so);
2418 		break;
2419 	case 2:
2420 		socantsendmore(so);
2421 		socantrcvmore(so);
2422 		break;
2423 	default:
2424 		error = EINVAL;
2425 		goto done;
2426 	}
2427 
2428 	/*
2429 	 * Assumes that the SS_CANT* flags are never cleared in the above code.
2430 	 */
2431 	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2432 	    (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2433 	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2434 
2435 	switch (state_change) {
2436 	case 0:
2437 		dprintso(so, 1,
2438 		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
2439 		    so->so_state));
2440 		goto done;
2441 
2442 	case SS_CANTRCVMORE:
2443 		mutex_exit(&so->so_lock);
2444 		strseteof(SOTOV(so), 1);
2445 		/*
2446 		 * strseteof takes care of read side wakeups,
2447 		 * pollwakeups, and signals.
2448 		 */
2449 		/*
2450 		 * Get the read lock before flushing data to avoid problems
2451 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2452 		 */
2453 		mutex_enter(&so->so_lock);
2454 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2455 		mutex_exit(&so->so_lock);
2456 
2457 		/* Flush read side queue */
2458 		strflushrq(SOTOV(so), FLUSHALL);
2459 
2460 		mutex_enter(&so->so_lock);
2461 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2462 		break;
2463 
2464 	case SS_CANTSENDMORE:
2465 		mutex_exit(&so->so_lock);
2466 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2467 		mutex_enter(&so->so_lock);
2468 		break;
2469 
2470 	case SS_CANTSENDMORE|SS_CANTRCVMORE:
2471 		mutex_exit(&so->so_lock);
2472 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2473 		strseteof(SOTOV(so), 1);
2474 		/*
2475 		 * strseteof takes care of read side wakeups,
2476 		 * pollwakeups, and signals.
2477 		 */
2478 		/*
2479 		 * Get the read lock before flushing data to avoid problems
2480 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2481 		 */
2482 		mutex_enter(&so->so_lock);
2483 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2484 		mutex_exit(&so->so_lock);
2485 
2486 		/* Flush read side queue */
2487 		strflushrq(SOTOV(so), FLUSHALL);
2488 
2489 		mutex_enter(&so->so_lock);
2490 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2491 		break;
2492 	}
2493 
2494 	ASSERT(MUTEX_HELD(&so->so_lock));
2495 
2496 	/*
2497 	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2498 	 * was set due to this call and the new state has both of them set:
2499 	 *	Send the AF_UNIX close indication
2500 	 *	For T_COTS send a discon_ind
2501 	 *
2502 	 * If cantsend was set due to this call:
2503 	 *	For T_COTSORD send an ordrel_ind
2504 	 *
2505 	 * Note that for T_CLTS there is no message sent here.
2506 	 */
2507 	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2508 	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2509 		/*
2510 		 * For SunOS 4.X compatibility we tell the other end
2511 		 * that we are unable to receive at this point.
2512 		 */
2513 		if (so->so_family == AF_UNIX && so->so_serv_type != T_CLTS)
2514 			so_unix_close(so);
2515 
2516 		if (so->so_serv_type == T_COTS)
2517 			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2518 	}
2519 	if ((state_change & SS_CANTSENDMORE) &&
2520 	    (so->so_serv_type == T_COTS_ORD)) {
2521 		/* Send an orderly release */
2522 		ordrel_req.PRIM_type = T_ORDREL_REQ;
2523 
2524 		mutex_exit(&so->so_lock);
2525 		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2526 		    0, _ALLOC_SLEEP);
2527 		/*
2528 		 * Send down the T_ORDREL_REQ even if there is flow control.
2529 		 * This prevents shutdown from blocking.
2530 		 * Note that there is no T_OK_ACK for ordrel_req.
2531 		 */
2532 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2533 		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2534 		mutex_enter(&so->so_lock);
2535 		if (error) {
2536 			eprintsoline(so, error);
2537 			goto done;
2538 		}
2539 	}
2540 
2541 done:
2542 	so_unlock_single(so, SOLOCKED);
2543 	mutex_exit(&so->so_lock);
2544 	return (error);
2545 }
2546 
2547 /*
2548  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2549  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2550  * that we have closed.
2551  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2552  * T_UNITDATA_REQ containing the same option.
2553  *
2554  * For SOCK_DGRAM half-connections (somebody connected to this end
2555  * but this end is not connect) we don't know where to send any
2556  * SO_UNIX_CLOSE.
2557  *
2558  * We have to ignore stream head errors just in case there has been
2559  * a shutdown(output).
2560  * Ignore any flow control to try to get the message more quickly to the peer.
2561  * While locally ignoring flow control solves the problem when there
2562  * is only the loopback transport on the stream it would not provide
2563  * the correct AF_UNIX socket semantics when one or more modules have
2564  * been pushed.
2565  */
2566 void
2567 so_unix_close(struct sonode *so)
2568 {
2569 	int		error;
2570 	struct T_opthdr	toh;
2571 	mblk_t		*mp;
2572 
2573 	ASSERT(MUTEX_HELD(&so->so_lock));
2574 
2575 	ASSERT(so->so_family == AF_UNIX);
2576 
2577 	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2578 	    (SS_ISCONNECTED|SS_ISBOUND))
2579 		return;
2580 
2581 	dprintso(so, 1, ("so_unix_close(%p) %s\n",
2582 	    (void *)so, pr_state(so->so_state, so->so_mode)));
2583 
2584 	toh.level = SOL_SOCKET;
2585 	toh.name = SO_UNIX_CLOSE;
2586 
2587 	/* zero length + header */
2588 	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2589 	toh.status = 0;
2590 
2591 	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2592 		struct T_optdata_req tdr;
2593 
2594 		tdr.PRIM_type = T_OPTDATA_REQ;
2595 		tdr.DATA_flag = 0;
2596 
2597 		tdr.OPT_length = (t_scalar_t)sizeof (toh);
2598 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2599 
2600 		/* NOTE: holding so_lock while sleeping */
2601 		mp = soallocproto2(&tdr, sizeof (tdr),
2602 		    &toh, sizeof (toh), 0, _ALLOC_SLEEP);
2603 	} else {
2604 		struct T_unitdata_req	tudr;
2605 		void			*addr;
2606 		socklen_t		addrlen;
2607 		void			*src;
2608 		socklen_t		srclen;
2609 		struct T_opthdr		toh2;
2610 		t_scalar_t		size;
2611 
2612 		/* Connecteded DGRAM socket */
2613 
2614 		/*
2615 		 * For AF_UNIX the destination address is translated to
2616 		 * an internal name and the source address is passed as
2617 		 * an option.
2618 		 */
2619 		/*
2620 		 * Length and family checks.
2621 		 */
2622 		error = so_addr_verify(so, so->so_faddr_sa,
2623 		    (t_uscalar_t)so->so_faddr_len);
2624 		if (error) {
2625 			eprintsoline(so, error);
2626 			return;
2627 		}
2628 		if (so->so_state & SS_FADDR_NOXLATE) {
2629 			/*
2630 			 * Already have a transport internal address. Do not
2631 			 * pass any (transport internal) source address.
2632 			 */
2633 			addr = so->so_faddr_sa;
2634 			addrlen = (t_uscalar_t)so->so_faddr_len;
2635 			src = NULL;
2636 			srclen = 0;
2637 		} else {
2638 			/*
2639 			 * Pass the sockaddr_un source address as an option
2640 			 * and translate the remote address.
2641 			 * Holding so_lock thus so_laddr_sa can not change.
2642 			 */
2643 			src = so->so_laddr_sa;
2644 			srclen = (socklen_t)so->so_laddr_len;
2645 			dprintso(so, 1,
2646 			    ("so_ux_close: srclen %d, src %p\n",
2647 			    srclen, src));
2648 			error = so_ux_addr_xlate(so,
2649 			    so->so_faddr_sa,
2650 			    (socklen_t)so->so_faddr_len, 0,
2651 			    &addr, &addrlen);
2652 			if (error) {
2653 				eprintsoline(so, error);
2654 				return;
2655 			}
2656 		}
2657 		tudr.PRIM_type = T_UNITDATA_REQ;
2658 		tudr.DEST_length = addrlen;
2659 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2660 		if (srclen == 0) {
2661 			tudr.OPT_length = (t_scalar_t)sizeof (toh);
2662 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2663 			    _TPI_ALIGN_TOPT(addrlen));
2664 
2665 			size = tudr.OPT_offset + tudr.OPT_length;
2666 			/* NOTE: holding so_lock while sleeping */
2667 			mp = soallocproto2(&tudr, sizeof (tudr),
2668 			    addr, addrlen, size, _ALLOC_SLEEP);
2669 			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2670 			soappendmsg(mp, &toh, sizeof (toh));
2671 		} else {
2672 			/*
2673 			 * There is a AF_UNIX sockaddr_un to include as a
2674 			 * source address option.
2675 			 */
2676 			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2677 			    _TPI_ALIGN_TOPT(srclen));
2678 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2679 			    _TPI_ALIGN_TOPT(addrlen));
2680 
2681 			toh2.level = SOL_SOCKET;
2682 			toh2.name = SO_SRCADDR;
2683 			toh2.len = (t_uscalar_t)(srclen +
2684 			    sizeof (struct T_opthdr));
2685 			toh2.status = 0;
2686 
2687 			size = tudr.OPT_offset + tudr.OPT_length;
2688 
2689 			/* NOTE: holding so_lock while sleeping */
2690 			mp = soallocproto2(&tudr, sizeof (tudr),
2691 			    addr, addrlen, size, _ALLOC_SLEEP);
2692 			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2693 			soappendmsg(mp, &toh, sizeof (toh));
2694 			soappendmsg(mp, &toh2, sizeof (toh2));
2695 			soappendmsg(mp, src, srclen);
2696 			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2697 		}
2698 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2699 	}
2700 	mutex_exit(&so->so_lock);
2701 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2702 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2703 	mutex_enter(&so->so_lock);
2704 }
2705 
2706 /*
2707  * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
2708  */
2709 int
2710 sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags)
2711 {
2712 	mblk_t		*mp, *nmp;
2713 	int		error;
2714 
2715 	dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n",
2716 	    (void *)so, (void *)msg, flags));
2717 
2718 	/*
2719 	 * There is never any oob data with addresses or control since
2720 	 * the T_EXDATA_IND does not carry any options.
2721 	 */
2722 	msg->msg_controllen = 0;
2723 	msg->msg_namelen = 0;
2724 
2725 	mutex_enter(&so->so_lock);
2726 	ASSERT(so_verify_oobstate(so));
2727 	if ((so->so_options & SO_OOBINLINE) ||
2728 	    (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
2729 		dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
2730 		mutex_exit(&so->so_lock);
2731 		return (EINVAL);
2732 	}
2733 	if (!(so->so_state & SS_HAVEOOBDATA)) {
2734 		dprintso(so, 1, ("sorecvoob: no data yet\n"));
2735 		mutex_exit(&so->so_lock);
2736 		return (EWOULDBLOCK);
2737 	}
2738 	ASSERT(so->so_oobmsg != NULL);
2739 	mp = so->so_oobmsg;
2740 	if (flags & MSG_PEEK) {
2741 		/*
2742 		 * Since recv* can not return ENOBUFS we can not use dupmsg.
2743 		 * Instead we revert to the consolidation private
2744 		 * allocb_wait plus bcopy.
2745 		 */
2746 		mblk_t *mp1;
2747 
2748 		mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
2749 		ASSERT(mp1);
2750 
2751 		while (mp != NULL) {
2752 			ssize_t size;
2753 
2754 			size = MBLKL(mp);
2755 			bcopy(mp->b_rptr, mp1->b_wptr, size);
2756 			mp1->b_wptr += size;
2757 			ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
2758 			mp = mp->b_cont;
2759 		}
2760 		mp = mp1;
2761 	} else {
2762 		/*
2763 		 * Update the state indicating that the data has been consumed.
2764 		 * Keep SS_OOBPEND set until data is consumed past the mark.
2765 		 */
2766 		so->so_oobmsg = NULL;
2767 		so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
2768 	}
2769 	dprintso(so, 1,
2770 	    ("after recvoob(%p): counts %d/%d state %s\n",
2771 	    (void *)so, so->so_oobsigcnt,
2772 	    so->so_oobcnt, pr_state(so->so_state, so->so_mode)));
2773 	ASSERT(so_verify_oobstate(so));
2774 	mutex_exit(&so->so_lock);
2775 
2776 	error = 0;
2777 	nmp = mp;
2778 	while (nmp != NULL && uiop->uio_resid > 0) {
2779 		ssize_t n = MBLKL(nmp);
2780 
2781 		n = MIN(n, uiop->uio_resid);
2782 		if (n > 0)
2783 			error = uiomove(nmp->b_rptr, n,
2784 			    UIO_READ, uiop);
2785 		if (error)
2786 			break;
2787 		nmp = nmp->b_cont;
2788 	}
2789 	freemsg(mp);
2790 	return (error);
2791 }
2792 
2793 /*
2794  * Called by sotpi_recvmsg when reading a non-zero amount of data.
2795  * In addition, the caller typically verifies that there is some
2796  * potential state to clear by checking
2797  *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2798  * before calling this routine.
2799  * Note that such a check can be made without holding so_lock since
2800  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2801  * decrements so_oobsigcnt.
2802  *
2803  * When data is read *after* the point that all pending
2804  * oob data has been consumed the oob indication is cleared.
2805  *
2806  * This logic keeps select/poll returning POLLRDBAND and
2807  * SIOCATMARK returning true until we have read past
2808  * the mark.
2809  */
2810 static void
2811 sorecv_update_oobstate(struct sonode *so)
2812 {
2813 	mutex_enter(&so->so_lock);
2814 	ASSERT(so_verify_oobstate(so));
2815 	dprintso(so, 1,
2816 	    ("sorecv_update_oobstate: counts %d/%d state %s\n",
2817 	    so->so_oobsigcnt,
2818 	    so->so_oobcnt, pr_state(so->so_state, so->so_mode)));
2819 	if (so->so_oobsigcnt == 0) {
2820 		/* No more pending oob indications */
2821 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2822 		freemsg(so->so_oobmsg);
2823 		so->so_oobmsg = NULL;
2824 	}
2825 	ASSERT(so_verify_oobstate(so));
2826 	mutex_exit(&so->so_lock);
2827 }
2828 
2829 /*
2830  * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
2831  */
2832 static int
2833 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
2834 {
2835 	int	error = 0;
2836 	mblk_t *tmp = NULL;
2837 	mblk_t *pmp = NULL;
2838 	mblk_t *nmp = so->so_nl7c_rcv_mp;
2839 
2840 	ASSERT(nmp != NULL);
2841 
2842 	while (nmp != NULL && uiop->uio_resid > 0) {
2843 		ssize_t n;
2844 
2845 		if (DB_TYPE(nmp) == M_DATA) {
2846 			/*
2847 			 * We have some data, uiomove up to resid bytes.
2848 			 */
2849 			n = MIN(MBLKL(nmp), uiop->uio_resid);
2850 			if (n > 0)
2851 				error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
2852 			nmp->b_rptr += n;
2853 			if (nmp->b_rptr == nmp->b_wptr) {
2854 				pmp = nmp;
2855 				nmp = nmp->b_cont;
2856 			}
2857 			if (error)
2858 				break;
2859 		} else {
2860 			/*
2861 			 * We only handle data, save for caller to handle.
2862 			 */
2863 			if (pmp != NULL) {
2864 				pmp->b_cont = nmp->b_cont;
2865 			}
2866 			nmp->b_cont = NULL;
2867 			if (*rmp == NULL) {
2868 				*rmp = nmp;
2869 			} else {
2870 				tmp->b_cont = nmp;
2871 			}
2872 			nmp = nmp->b_cont;
2873 			tmp = nmp;
2874 		}
2875 	}
2876 	if (pmp != NULL) {
2877 		/* Free any mblk_t(s) which we have consumed */
2878 		pmp->b_cont = NULL;
2879 		freemsg(so->so_nl7c_rcv_mp);
2880 	}
2881 	if ((so->so_nl7c_rcv_mp = nmp) == NULL) {
2882 		/* Last mblk_t so return the saved kstrgetmsg() rval/error */
2883 		if (error == 0) {
2884 			rval_t	*p = (rval_t *)&so->so_nl7c_rcv_rval;
2885 
2886 			error = p->r_v.r_v2;
2887 			p->r_v.r_v2 = 0;
2888 		}
2889 		rp->r_vals = so->so_nl7c_rcv_rval;
2890 		so->so_nl7c_rcv_rval = 0;
2891 	} else {
2892 		/* More mblk_t(s) to process so no rval to return */
2893 		rp->r_vals = 0;
2894 	}
2895 	return (error);
2896 }
2897 
2898 /*
2899  * Receive the next message on the queue.
2900  * If msg_controllen is non-zero when called the caller is interested in
2901  * any received control info (options).
2902  * If msg_namelen is non-zero when called the caller is interested in
2903  * any received source address.
2904  * The routine returns with msg_control and msg_name pointing to
2905  * kmem_alloc'ed memory which the caller has to free.
2906  */
2907 int
2908 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
2909 {
2910 	union T_primitives	*tpr;
2911 	mblk_t			*mp;
2912 	uchar_t			pri;
2913 	int			pflag, opflag;
2914 	void			*control;
2915 	t_uscalar_t		controllen;
2916 	t_uscalar_t		namelen;
2917 	int			so_state = so->so_state; /* Snapshot */
2918 	ssize_t			saved_resid;
2919 	rval_t			rval;
2920 	int			flags;
2921 	clock_t			timout;
2922 	int			first;
2923 	int			error = 0;
2924 	struct uio		*suiop = NULL;
2925 	sodirect_t		*sodp = so->so_direct;
2926 
2927 	flags = msg->msg_flags;
2928 	msg->msg_flags = 0;
2929 
2930 	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
2931 	    (void *)so, (void *)msg, flags,
2932 	    pr_state(so->so_state, so->so_mode), so->so_error));
2933 
2934 	/*
2935 	 * If we are not connected because we have never been connected
2936 	 * we return ENOTCONN. If we have been connected (but are no longer
2937 	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
2938 	 * the EOF.
2939 	 *
2940 	 * An alternative would be to post an ENOTCONN error in stream head
2941 	 * (read+write) and clear it when we're connected. However, that error
2942 	 * would cause incorrect poll/select behavior!
2943 	 */
2944 	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
2945 	    (so->so_mode & SM_CONNREQUIRED)) {
2946 		return (ENOTCONN);
2947 	}
2948 
2949 	/*
2950 	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
2951 	 * after checking that the read queue is empty) and returns zero.
2952 	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
2953 	 * is zero.
2954 	 */
2955 
2956 	if (flags & MSG_OOB) {
2957 		/* Check that the transport supports OOB */
2958 		if (!(so->so_mode & SM_EXDATA))
2959 			return (EOPNOTSUPP);
2960 		return (sorecvoob(so, msg, uiop, flags));
2961 	}
2962 
2963 	/*
2964 	 * Set msg_controllen and msg_namelen to zero here to make it
2965 	 * simpler in the cases that no control or name is returned.
2966 	 */
2967 	controllen = msg->msg_controllen;
2968 	namelen = msg->msg_namelen;
2969 	msg->msg_controllen = 0;
2970 	msg->msg_namelen = 0;
2971 
2972 	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
2973 	    namelen, controllen));
2974 
2975 	mutex_enter(&so->so_lock);
2976 	/*
2977 	 * If an NL7C enabled socket and not waiting for write data.
2978 	 */
2979 	if ((so->so_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
2980 	    NL7C_ENABLED) {
2981 		if (so->so_nl7c_uri) {
2982 			/* Close uri processing for a previous request */
2983 			nl7c_close(so);
2984 		}
2985 		if ((so_state & SS_CANTRCVMORE) && so->so_nl7c_rcv_mp == NULL) {
2986 			/* Nothing to process, EOF */
2987 			mutex_exit(&so->so_lock);
2988 			return (0);
2989 		} else if (so->so_nl7c_flags & NL7C_SOPERSIST) {
2990 			/* Persistent NL7C socket, try to process request */
2991 			boolean_t ret;
2992 
2993 			ret = nl7c_process(so,
2994 			    (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
2995 			rval.r_vals = so->so_nl7c_rcv_rval;
2996 			error = rval.r_v.r_v2;
2997 			if (error) {
2998 				/* Error of some sort, return it */
2999 				mutex_exit(&so->so_lock);
3000 				return (error);
3001 			}
3002 			if (so->so_nl7c_flags &&
3003 			    ! (so->so_nl7c_flags & NL7C_WAITWRITE)) {
3004 				/*
3005 				 * Still an NL7C socket and no data
3006 				 * to pass up to the caller.
3007 				 */
3008 				mutex_exit(&so->so_lock);
3009 				if (ret) {
3010 					/* EOF */
3011 					return (0);
3012 				} else {
3013 					/* Need more data */
3014 					return (EAGAIN);
3015 				}
3016 			}
3017 		} else {
3018 			/*
3019 			 * Not persistent so no further NL7C processing.
3020 			 */
3021 			so->so_nl7c_flags = 0;
3022 		}
3023 	}
3024 	/*
3025 	 * Only one reader is allowed at any given time. This is needed
3026 	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
3027 	 *
3028 	 * This is slightly different that BSD behavior in that it fails with
3029 	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3030 	 * is single-threaded using sblock(), which is dropped while waiting
3031 	 * for data to appear. The difference shows up e.g. if one
3032 	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3033 	 * does use nonblocking io and different threads are reading each
3034 	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3035 	 * in this case as long as the read queue doesn't get empty.
3036 	 * In this implementation the thread using nonblocking io can
3037 	 * get an EWOULDBLOCK error due to the blocking thread executing
3038 	 * e.g. in the uiomove in kstrgetmsg.
3039 	 * This difference is not believed to be significant.
3040 	 */
3041 	/* Set SOREADLOCKED */
3042 	error = so_lock_read_intr(so,
3043 	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3044 	mutex_exit(&so->so_lock);
3045 	if (error)
3046 		return (error);
3047 
3048 	/*
3049 	 * Tell kstrgetmsg to not inspect the stream head errors until all
3050 	 * queued data has been consumed.
3051 	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3052 	 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3053 	 *
3054 	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3055 	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3056 	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3057 	 */
3058 	pflag = MSG_ANY | MSG_DELAYERROR;
3059 	if (flags & MSG_PEEK) {
3060 		pflag |= MSG_IPEEK;
3061 		flags &= ~MSG_WAITALL;
3062 	}
3063 	if (so->so_mode & SM_ATOMIC)
3064 		pflag |= MSG_DISCARDTAIL;
3065 
3066 	if (flags & MSG_DONTWAIT)
3067 		timout = 0;
3068 	else
3069 		timout = -1;
3070 	opflag = pflag;
3071 	first = 1;
3072 
3073 	if (uiop->uio_resid >= uioasync.mincnt &&
3074 	    sodp != NULL && (sodp->sod_state & SOD_ENABLED) &&
3075 	    uioasync.enabled && !(flags & MSG_PEEK) &&
3076 	    !(so_state & SS_CANTRCVMORE)) {
3077 		/*
3078 		 * Big enough I/O for uioa min setup and an sodirect socket
3079 		 * and sodirect enabled and uioa enabled and I/O will be done
3080 		 * and not EOF so initialize the sodirect_t uioa_t with "uiop".
3081 		 */
3082 		mutex_enter(sodp->sod_lock);
3083 		if (!uioainit(uiop, &sodp->sod_uioa)) {
3084 			/*
3085 			 * Successful uioainit() so the uio_t part of the
3086 			 * uioa_t will be used for all uio_t work to follow,
3087 			 * we save the original "uiop" in "suiop".
3088 			 */
3089 			suiop = uiop;
3090 			uiop = (uio_t *)&sodp->sod_uioa;
3091 			/*
3092 			 * Before returning to the caller the passed in uio_t
3093 			 * "uiop" will be updated via a call to uioafini()
3094 			 * below.
3095 			 *
3096 			 * Note, the uioa.uioa_state isn't set to UIOA_ENABLED
3097 			 * here as first we have to uioamove() any currently
3098 			 * queued M_DATA mblk_t(s) so it will be done in
3099 			 * kstrgetmsg().
3100 			 */
3101 		}
3102 		/*
3103 		 * In either uioainit() success or not case note the number
3104 		 * of uio bytes the caller wants for sod framework and/or
3105 		 * transport (e.g. TCP) strategy.
3106 		 */
3107 		sodp->sod_want = uiop->uio_resid;
3108 		mutex_exit(sodp->sod_lock);
3109 	} else if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) {
3110 		/*
3111 		 * No uioa but still using sodirect so note the number of
3112 		 * uio bytes the caller wants for sodirect framework and/or
3113 		 * transport (e.g. TCP) strategy.
3114 		 *
3115 		 * Note, sod_lock not held, only writer is in this function
3116 		 * and only one thread at a time so not needed just to init.
3117 		 */
3118 		sodp->sod_want = uiop->uio_resid;
3119 	}
3120 retry:
3121 	saved_resid = uiop->uio_resid;
3122 	pri = 0;
3123 	mp = NULL;
3124 	if (so->so_nl7c_rcv_mp != NULL) {
3125 		/* Already kstrgetmsg()ed saved mblk(s) from NL7C */
3126 		error = nl7c_sorecv(so, &mp, uiop, &rval);
3127 	} else {
3128 		error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3129 		    timout, &rval);
3130 	}
3131 	if (error) {
3132 		switch (error) {
3133 		case EINTR:
3134 		case EWOULDBLOCK:
3135 			if (!first)
3136 				error = 0;
3137 			break;
3138 		case ETIME:
3139 			/* Returned from kstrgetmsg when timeout expires */
3140 			if (!first)
3141 				error = 0;
3142 			else
3143 				error = EWOULDBLOCK;
3144 			break;
3145 		default:
3146 			eprintsoline(so, error);
3147 			break;
3148 		}
3149 		goto out;
3150 	}
3151 	/*
3152 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3153 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
3154 	 */
3155 	ASSERT(!(rval.r_val1 & MORECTL));
3156 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3157 		msg->msg_flags |= MSG_TRUNC;
3158 
3159 	if (mp == NULL) {
3160 		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3161 		/*
3162 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3163 		 * The draft Posix socket spec states that the mark should
3164 		 * not be cleared when peeking. We follow the latter.
3165 		 */
3166 		if ((so->so_state &
3167 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3168 		    (uiop->uio_resid != saved_resid) &&
3169 		    !(flags & MSG_PEEK)) {
3170 			sorecv_update_oobstate(so);
3171 		}
3172 
3173 		mutex_enter(&so->so_lock);
3174 		/* Set MSG_EOR based on MOREDATA */
3175 		if (!(rval.r_val1 & MOREDATA)) {
3176 			if (so->so_state & SS_SAVEDEOR) {
3177 				msg->msg_flags |= MSG_EOR;
3178 				so->so_state &= ~SS_SAVEDEOR;
3179 			}
3180 		}
3181 		/*
3182 		 * If some data was received (i.e. not EOF) and the
3183 		 * read/recv* has not been satisfied wait for some more.
3184 		 */
3185 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3186 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3187 			mutex_exit(&so->so_lock);
3188 			first = 0;
3189 			pflag = opflag | MSG_NOMARK;
3190 			goto retry;
3191 		}
3192 		goto out_locked;
3193 	}
3194 
3195 	/* strsock_proto has already verified length and alignment */
3196 	tpr = (union T_primitives *)mp->b_rptr;
3197 	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3198 
3199 	switch (tpr->type) {
3200 	case T_DATA_IND: {
3201 		if ((so->so_state &
3202 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3203 		    (uiop->uio_resid != saved_resid) &&
3204 		    !(flags & MSG_PEEK)) {
3205 			sorecv_update_oobstate(so);
3206 		}
3207 
3208 		/*
3209 		 * Set msg_flags to MSG_EOR based on
3210 		 * MORE_flag and MOREDATA.
3211 		 */
3212 		mutex_enter(&so->so_lock);
3213 		so->so_state &= ~SS_SAVEDEOR;
3214 		if (!(tpr->data_ind.MORE_flag & 1)) {
3215 			if (!(rval.r_val1 & MOREDATA))
3216 				msg->msg_flags |= MSG_EOR;
3217 			else
3218 				so->so_state |= SS_SAVEDEOR;
3219 		}
3220 		freemsg(mp);
3221 		/*
3222 		 * If some data was received (i.e. not EOF) and the
3223 		 * read/recv* has not been satisfied wait for some more.
3224 		 */
3225 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3226 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3227 			mutex_exit(&so->so_lock);
3228 			first = 0;
3229 			pflag = opflag | MSG_NOMARK;
3230 			goto retry;
3231 		}
3232 		goto out_locked;
3233 	}
3234 	case T_UNITDATA_IND: {
3235 		void *addr;
3236 		t_uscalar_t addrlen;
3237 		void *abuf;
3238 		t_uscalar_t optlen;
3239 		void *opt;
3240 
3241 		if ((so->so_state &
3242 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3243 		    (uiop->uio_resid != saved_resid) &&
3244 		    !(flags & MSG_PEEK)) {
3245 			sorecv_update_oobstate(so);
3246 		}
3247 
3248 		if (namelen != 0) {
3249 			/* Caller wants source address */
3250 			addrlen = tpr->unitdata_ind.SRC_length;
3251 			addr = sogetoff(mp,
3252 			    tpr->unitdata_ind.SRC_offset,
3253 			    addrlen, 1);
3254 			if (addr == NULL) {
3255 				freemsg(mp);
3256 				error = EPROTO;
3257 				eprintsoline(so, error);
3258 				goto out;
3259 			}
3260 			if (so->so_family == AF_UNIX) {
3261 				/*
3262 				 * Can not use the transport level address.
3263 				 * If there is a SO_SRCADDR option carrying
3264 				 * the socket level address it will be
3265 				 * extracted below.
3266 				 */
3267 				addr = NULL;
3268 				addrlen = 0;
3269 			}
3270 		}
3271 		optlen = tpr->unitdata_ind.OPT_length;
3272 		if (optlen != 0) {
3273 			t_uscalar_t ncontrollen;
3274 
3275 			/*
3276 			 * Extract any source address option.
3277 			 * Determine how large cmsg buffer is needed.
3278 			 */
3279 			opt = sogetoff(mp,
3280 			    tpr->unitdata_ind.OPT_offset,
3281 			    optlen, __TPI_ALIGN_SIZE);
3282 
3283 			if (opt == NULL) {
3284 				freemsg(mp);
3285 				error = EPROTO;
3286 				eprintsoline(so, error);
3287 				goto out;
3288 			}
3289 			if (so->so_family == AF_UNIX)
3290 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3291 			ncontrollen = so_cmsglen(mp, opt, optlen,
3292 			    !(flags & MSG_XPG4_2));
3293 			if (controllen != 0)
3294 				controllen = ncontrollen;
3295 			else if (ncontrollen != 0)
3296 				msg->msg_flags |= MSG_CTRUNC;
3297 		} else {
3298 			controllen = 0;
3299 		}
3300 
3301 		if (namelen != 0) {
3302 			/*
3303 			 * Return address to caller.
3304 			 * Caller handles truncation if length
3305 			 * exceeds msg_namelen.
3306 			 * NOTE: AF_UNIX NUL termination is ensured by
3307 			 * the sender's copyin_name().
3308 			 */
3309 			abuf = kmem_alloc(addrlen, KM_SLEEP);
3310 
3311 			bcopy(addr, abuf, addrlen);
3312 			msg->msg_name = abuf;
3313 			msg->msg_namelen = addrlen;
3314 		}
3315 
3316 		if (controllen != 0) {
3317 			/*
3318 			 * Return control msg to caller.
3319 			 * Caller handles truncation if length
3320 			 * exceeds msg_controllen.
3321 			 */
3322 			control = kmem_zalloc(controllen, KM_SLEEP);
3323 
3324 			error = so_opt2cmsg(mp, opt, optlen,
3325 			    !(flags & MSG_XPG4_2),
3326 			    control, controllen);
3327 			if (error) {
3328 				freemsg(mp);
3329 				if (msg->msg_namelen != 0)
3330 					kmem_free(msg->msg_name,
3331 					    msg->msg_namelen);
3332 				kmem_free(control, controllen);
3333 				eprintsoline(so, error);
3334 				goto out;
3335 			}
3336 			msg->msg_control = control;
3337 			msg->msg_controllen = controllen;
3338 		}
3339 
3340 		freemsg(mp);
3341 		goto out;
3342 	}
3343 	case T_OPTDATA_IND: {
3344 		struct T_optdata_req *tdr;
3345 		void *opt;
3346 		t_uscalar_t optlen;
3347 
3348 		if ((so->so_state &
3349 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3350 		    (uiop->uio_resid != saved_resid) &&
3351 		    !(flags & MSG_PEEK)) {
3352 			sorecv_update_oobstate(so);
3353 		}
3354 
3355 		tdr = (struct T_optdata_req *)mp->b_rptr;
3356 		optlen = tdr->OPT_length;
3357 		if (optlen != 0) {
3358 			t_uscalar_t ncontrollen;
3359 			/*
3360 			 * Determine how large cmsg buffer is needed.
3361 			 */
3362 			opt = sogetoff(mp,
3363 			    tpr->optdata_ind.OPT_offset,
3364 			    optlen, __TPI_ALIGN_SIZE);
3365 
3366 			if (opt == NULL) {
3367 				freemsg(mp);
3368 				error = EPROTO;
3369 				eprintsoline(so, error);
3370 				goto out;
3371 			}
3372 
3373 			ncontrollen = so_cmsglen(mp, opt, optlen,
3374 			    !(flags & MSG_XPG4_2));
3375 			if (controllen != 0)
3376 				controllen = ncontrollen;
3377 			else if (ncontrollen != 0)
3378 				msg->msg_flags |= MSG_CTRUNC;
3379 		} else {
3380 			controllen = 0;
3381 		}
3382 
3383 		if (controllen != 0) {
3384 			/*
3385 			 * Return control msg to caller.
3386 			 * Caller handles truncation if length
3387 			 * exceeds msg_controllen.
3388 			 */
3389 			control = kmem_zalloc(controllen, KM_SLEEP);
3390 
3391 			error = so_opt2cmsg(mp, opt, optlen,
3392 			    !(flags & MSG_XPG4_2),
3393 			    control, controllen);
3394 			if (error) {
3395 				freemsg(mp);
3396 				kmem_free(control, controllen);
3397 				eprintsoline(so, error);
3398 				goto out;
3399 			}
3400 			msg->msg_control = control;
3401 			msg->msg_controllen = controllen;
3402 		}
3403 
3404 		/*
3405 		 * Set msg_flags to MSG_EOR based on
3406 		 * DATA_flag and MOREDATA.
3407 		 */
3408 		mutex_enter(&so->so_lock);
3409 		so->so_state &= ~SS_SAVEDEOR;
3410 		if (!(tpr->data_ind.MORE_flag & 1)) {
3411 			if (!(rval.r_val1 & MOREDATA))
3412 				msg->msg_flags |= MSG_EOR;
3413 			else
3414 				so->so_state |= SS_SAVEDEOR;
3415 		}
3416 		freemsg(mp);
3417 		/*
3418 		 * If some data was received (i.e. not EOF) and the
3419 		 * read/recv* has not been satisfied wait for some more.
3420 		 * Not possible to wait if control info was received.
3421 		 */
3422 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3423 		    controllen == 0 &&
3424 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3425 			mutex_exit(&so->so_lock);
3426 			first = 0;
3427 			pflag = opflag | MSG_NOMARK;
3428 			goto retry;
3429 		}
3430 		goto out_locked;
3431 	}
3432 	case T_EXDATA_IND: {
3433 		dprintso(so, 1,
3434 		    ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3435 		    "state %s\n",
3436 		    so->so_oobsigcnt, so->so_oobcnt,
3437 		    saved_resid - uiop->uio_resid,
3438 		    pr_state(so->so_state, so->so_mode)));
3439 		/*
3440 		 * kstrgetmsg handles MSGMARK so there is nothing to
3441 		 * inspect in the T_EXDATA_IND.
3442 		 * strsock_proto makes the stream head queue the T_EXDATA_IND
3443 		 * as a separate message with no M_DATA component. Furthermore,
3444 		 * the stream head does not consolidate M_DATA messages onto
3445 		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3446 		 * remains a message by itself. This is needed since MSGMARK
3447 		 * marks both the whole message as well as the last byte
3448 		 * of the message.
3449 		 */
3450 		freemsg(mp);
3451 		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
3452 		if (flags & MSG_PEEK) {
3453 			/*
3454 			 * Even though we are peeking we consume the
3455 			 * T_EXDATA_IND thereby moving the mark information
3456 			 * to SS_RCVATMARK. Then the oob code below will
3457 			 * retry the peeking kstrgetmsg.
3458 			 * Note that the stream head read queue is
3459 			 * never flushed without holding SOREADLOCKED
3460 			 * thus the T_EXDATA_IND can not disappear
3461 			 * underneath us.
3462 			 */
3463 			dprintso(so, 1,
3464 			    ("sotpi_recvmsg: consume EXDATA_IND "
3465 			    "counts %d/%d state %s\n",
3466 			    so->so_oobsigcnt,
3467 			    so->so_oobcnt,
3468 			    pr_state(so->so_state, so->so_mode)));
3469 
3470 			pflag = MSG_ANY | MSG_DELAYERROR;
3471 			if (so->so_mode & SM_ATOMIC)
3472 				pflag |= MSG_DISCARDTAIL;
3473 
3474 			pri = 0;
3475 			mp = NULL;
3476 
3477 			error = kstrgetmsg(SOTOV(so), &mp, uiop,
3478 			    &pri, &pflag, (clock_t)-1, &rval);
3479 			ASSERT(uiop->uio_resid == saved_resid);
3480 
3481 			if (error) {
3482 #ifdef SOCK_DEBUG
3483 				if (error != EWOULDBLOCK && error != EINTR) {
3484 					eprintsoline(so, error);
3485 				}
3486 #endif /* SOCK_DEBUG */
3487 				goto out;
3488 			}
3489 			ASSERT(mp);
3490 			tpr = (union T_primitives *)mp->b_rptr;
3491 			ASSERT(tpr->type == T_EXDATA_IND);
3492 			freemsg(mp);
3493 		} /* end "if (flags & MSG_PEEK)" */
3494 
3495 		/*
3496 		 * Decrement the number of queued and pending oob.
3497 		 *
3498 		 * SS_RCVATMARK is cleared when we read past a mark.
3499 		 * SS_HAVEOOBDATA is cleared when we've read past the
3500 		 * last mark.
3501 		 * SS_OOBPEND is cleared if we've read past the last
3502 		 * mark and no (new) SIGURG has been posted.
3503 		 */
3504 		mutex_enter(&so->so_lock);
3505 		ASSERT(so_verify_oobstate(so));
3506 		ASSERT(so->so_oobsigcnt >= so->so_oobcnt);
3507 		ASSERT(so->so_oobsigcnt > 0);
3508 		so->so_oobsigcnt--;
3509 		ASSERT(so->so_oobcnt > 0);
3510 		so->so_oobcnt--;
3511 		/*
3512 		 * Since the T_EXDATA_IND has been removed from the stream
3513 		 * head, but we have not read data past the mark,
3514 		 * sockfs needs to track that the socket is still at the mark.
3515 		 *
3516 		 * Since no data was received call kstrgetmsg again to wait
3517 		 * for data.
3518 		 */
3519 		so->so_state |= SS_RCVATMARK;
3520 		mutex_exit(&so->so_lock);
3521 		dprintso(so, 1,
3522 		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3523 		    so->so_oobsigcnt, so->so_oobcnt,
3524 		    pr_state(so->so_state, so->so_mode)));
3525 		pflag = opflag;
3526 		goto retry;
3527 	}
3528 	default:
3529 		ASSERT(0);
3530 		freemsg(mp);
3531 		error = EPROTO;
3532 		eprintsoline(so, error);
3533 		goto out;
3534 	}
3535 	/* NOTREACHED */
3536 out:
3537 	mutex_enter(&so->so_lock);
3538 out_locked:
3539 	if (sodp != NULL) {
3540 		/* Finish any sodirect and uioa processing */
3541 		mutex_enter(sodp->sod_lock);
3542 		if (suiop != NULL) {
3543 			/* Finish any uioa_t processing */
3544 			int ret;
3545 
3546 			ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
3547 			ret = uioafini(suiop, (uioa_t *)uiop);
3548 			if (error == 0 && ret != 0) {
3549 				/* If no error yet, set it */
3550 				error = ret;
3551 			}
3552 			if ((mp = sodp->sod_uioafh) != NULL) {
3553 				sodp->sod_uioafh = NULL;
3554 				sodp->sod_uioaft = NULL;
3555 				freemsg(mp);
3556 			}
3557 		}
3558 		if (!(sodp->sod_state & SOD_WAKE_NOT)) {
3559 			/* Awoke */
3560 			sodp->sod_state &= SOD_WAKE_CLR;
3561 			sodp->sod_state |= SOD_WAKE_NOT;
3562 		}
3563 		/* Last, clear sod_want value */
3564 		sodp->sod_want = 0;
3565 		mutex_exit(sodp->sod_lock);
3566 	}
3567 	so_unlock_read(so);	/* Clear SOREADLOCKED */
3568 	mutex_exit(&so->so_lock);
3569 	return (error);
3570 }
3571 
3572 /*
3573  * Sending data with options on a datagram socket.
3574  * Assumes caller has verified that SS_ISBOUND etc. are set.
3575  */
3576 static int
3577 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3578     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3579 {
3580 	struct T_unitdata_req	tudr;
3581 	mblk_t			*mp;
3582 	int			error;
3583 	void			*addr;
3584 	socklen_t		addrlen;
3585 	void			*src;
3586 	socklen_t		srclen;
3587 	ssize_t			len;
3588 	int			size;
3589 	struct T_opthdr		toh;
3590 	struct fdbuf		*fdbuf;
3591 	t_uscalar_t		optlen;
3592 	void			*fds;
3593 	int			fdlen;
3594 
3595 	ASSERT(name && namelen);
3596 	ASSERT(control && controllen);
3597 
3598 	len = uiop->uio_resid;
3599 	if (len > (ssize_t)so->so_tidu_size) {
3600 		return (EMSGSIZE);
3601 	}
3602 
3603 	/*
3604 	 * For AF_UNIX the destination address is translated to an internal
3605 	 * name and the source address is passed as an option.
3606 	 * Also, file descriptors are passed as file pointers in an
3607 	 * option.
3608 	 */
3609 
3610 	/*
3611 	 * Length and family checks.
3612 	 */
3613 	error = so_addr_verify(so, name, namelen);
3614 	if (error) {
3615 		eprintsoline(so, error);
3616 		return (error);
3617 	}
3618 	if (so->so_family == AF_UNIX) {
3619 		if (so->so_state & SS_FADDR_NOXLATE) {
3620 			/*
3621 			 * Already have a transport internal address. Do not
3622 			 * pass any (transport internal) source address.
3623 			 */
3624 			addr = name;
3625 			addrlen = namelen;
3626 			src = NULL;
3627 			srclen = 0;
3628 		} else {
3629 			/*
3630 			 * Pass the sockaddr_un source address as an option
3631 			 * and translate the remote address.
3632 			 *
3633 			 * Note that this code does not prevent so_laddr_sa
3634 			 * from changing while it is being used. Thus
3635 			 * if an unbind+bind occurs concurrently with this
3636 			 * send the peer might see a partially new and a
3637 			 * partially old "from" address.
3638 			 */
3639 			src = so->so_laddr_sa;
3640 			srclen = (t_uscalar_t)so->so_laddr_len;
3641 			dprintso(so, 1,
3642 			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3643 			    srclen, src));
3644 			error = so_ux_addr_xlate(so, name, namelen,
3645 			    (flags & MSG_XPG4_2),
3646 			    &addr, &addrlen);
3647 			if (error) {
3648 				eprintsoline(so, error);
3649 				return (error);
3650 			}
3651 		}
3652 	} else {
3653 		addr = name;
3654 		addrlen = namelen;
3655 		src = NULL;
3656 		srclen = 0;
3657 	}
3658 	optlen = so_optlen(control, controllen,
3659 	    !(flags & MSG_XPG4_2));
3660 	tudr.PRIM_type = T_UNITDATA_REQ;
3661 	tudr.DEST_length = addrlen;
3662 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3663 	if (srclen != 0)
3664 		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3665 		    _TPI_ALIGN_TOPT(srclen));
3666 	else
3667 		tudr.OPT_length = optlen;
3668 	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3669 	    _TPI_ALIGN_TOPT(addrlen));
3670 
3671 	size = tudr.OPT_offset + tudr.OPT_length;
3672 
3673 	/*
3674 	 * File descriptors only when SM_FDPASSING set.
3675 	 */
3676 	error = so_getfdopt(control, controllen,
3677 	    !(flags & MSG_XPG4_2), &fds, &fdlen);
3678 	if (error)
3679 		return (error);
3680 	if (fdlen != -1) {
3681 		if (!(so->so_mode & SM_FDPASSING))
3682 			return (EOPNOTSUPP);
3683 
3684 		error = fdbuf_create(fds, fdlen, &fdbuf);
3685 		if (error)
3686 			return (error);
3687 		mp = fdbuf_allocmsg(size, fdbuf);
3688 	} else {
3689 		mp = soallocproto(size, _ALLOC_INTR);
3690 		if (mp == NULL) {
3691 			/*
3692 			 * Caught a signal waiting for memory.
3693 			 * Let send* return EINTR.
3694 			 */
3695 			return (EINTR);
3696 		}
3697 	}
3698 	soappendmsg(mp, &tudr, sizeof (tudr));
3699 	soappendmsg(mp, addr, addrlen);
3700 	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3701 
3702 	if (fdlen != -1) {
3703 		ASSERT(fdbuf != NULL);
3704 		toh.level = SOL_SOCKET;
3705 		toh.name = SO_FILEP;
3706 		toh.len = fdbuf->fd_size +
3707 		    (t_uscalar_t)sizeof (struct T_opthdr);
3708 		toh.status = 0;
3709 		soappendmsg(mp, &toh, sizeof (toh));
3710 		soappendmsg(mp, fdbuf, fdbuf->fd_size);
3711 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3712 	}
3713 	if (srclen != 0) {
3714 		/*
3715 		 * There is a AF_UNIX sockaddr_un to include as a source
3716 		 * address option.
3717 		 */
3718 		toh.level = SOL_SOCKET;
3719 		toh.name = SO_SRCADDR;
3720 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3721 		toh.status = 0;
3722 		soappendmsg(mp, &toh, sizeof (toh));
3723 		soappendmsg(mp, src, srclen);
3724 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3725 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3726 	}
3727 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3728 	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3729 	/* At most 3 bytes left in the message */
3730 	ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3731 	ASSERT(MBLKL(mp) <= (ssize_t)size);
3732 
3733 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3734 	if (audit_active)
3735 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3736 
3737 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3738 #ifdef SOCK_DEBUG
3739 	if (error) {
3740 		eprintsoline(so, error);
3741 	}
3742 #endif /* SOCK_DEBUG */
3743 	return (error);
3744 }
3745 
3746 /*
3747  * Sending data with options on a connected stream socket.
3748  * Assumes caller has verified that SS_ISCONNECTED is set.
3749  */
3750 static int
3751 sosend_svccmsg(struct sonode *so,
3752 		struct uio *uiop,
3753 		int more,
3754 		void *control,
3755 		t_uscalar_t controllen,
3756 		int flags)
3757 {
3758 	struct T_optdata_req	tdr;
3759 	mblk_t			*mp;
3760 	int			error;
3761 	ssize_t			iosize;
3762 	int			first = 1;
3763 	int			size;
3764 	struct fdbuf		*fdbuf;
3765 	t_uscalar_t		optlen;
3766 	void			*fds;
3767 	int			fdlen;
3768 	struct T_opthdr		toh;
3769 
3770 	dprintso(so, 1,
3771 	    ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3772 
3773 	/*
3774 	 * Has to be bound and connected. However, since no locks are
3775 	 * held the state could have changed after sotpi_sendmsg checked it
3776 	 * thus it is not possible to ASSERT on the state.
3777 	 */
3778 
3779 	/* Options on connection-oriented only when SM_OPTDATA set. */
3780 	if (!(so->so_mode & SM_OPTDATA))
3781 		return (EOPNOTSUPP);
3782 
3783 	do {
3784 		/*
3785 		 * Set the MORE flag if uio_resid does not fit in this
3786 		 * message or if the caller passed in "more".
3787 		 * Error for transports with zero tidu_size.
3788 		 */
3789 		tdr.PRIM_type = T_OPTDATA_REQ;
3790 		iosize = so->so_tidu_size;
3791 		if (iosize <= 0)
3792 			return (EMSGSIZE);
3793 		if (uiop->uio_resid > iosize) {
3794 			tdr.DATA_flag = 1;
3795 		} else {
3796 			if (more)
3797 				tdr.DATA_flag = 1;
3798 			else
3799 				tdr.DATA_flag = 0;
3800 			iosize = uiop->uio_resid;
3801 		}
3802 		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3803 		    tdr.DATA_flag, iosize));
3804 
3805 		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3806 		tdr.OPT_length = optlen;
3807 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3808 
3809 		size = (int)sizeof (tdr) + optlen;
3810 		/*
3811 		 * File descriptors only when SM_FDPASSING set.
3812 		 */
3813 		error = so_getfdopt(control, controllen,
3814 		    !(flags & MSG_XPG4_2), &fds, &fdlen);
3815 		if (error)
3816 			return (error);
3817 		if (fdlen != -1) {
3818 			if (!(so->so_mode & SM_FDPASSING))
3819 				return (EOPNOTSUPP);
3820 
3821 			error = fdbuf_create(fds, fdlen, &fdbuf);
3822 			if (error)
3823 				return (error);
3824 			mp = fdbuf_allocmsg(size, fdbuf);
3825 		} else {
3826 			mp = soallocproto(size, _ALLOC_INTR);
3827 			if (mp == NULL) {
3828 				/*
3829 				 * Caught a signal waiting for memory.
3830 				 * Let send* return EINTR.
3831 				 */
3832 				return (first ? EINTR : 0);
3833 			}
3834 		}
3835 		soappendmsg(mp, &tdr, sizeof (tdr));
3836 
3837 		if (fdlen != -1) {
3838 			ASSERT(fdbuf != NULL);
3839 			toh.level = SOL_SOCKET;
3840 			toh.name = SO_FILEP;
3841 			toh.len = fdbuf->fd_size +
3842 			    (t_uscalar_t)sizeof (struct T_opthdr);
3843 			toh.status = 0;
3844 			soappendmsg(mp, &toh, sizeof (toh));
3845 			soappendmsg(mp, fdbuf, fdbuf->fd_size);
3846 			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3847 		}
3848 		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3849 		/* At most 3 bytes left in the message */
3850 		ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3851 		ASSERT(MBLKL(mp) <= (ssize_t)size);
3852 
3853 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3854 
3855 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3856 		    0, MSG_BAND, 0);
3857 		if (error) {
3858 			if (!first && error == EWOULDBLOCK)
3859 				return (0);
3860 			eprintsoline(so, error);
3861 			return (error);
3862 		}
3863 		control = NULL;
3864 		first = 0;
3865 		if (uiop->uio_resid > 0) {
3866 			/*
3867 			 * Recheck for fatal errors. Fail write even though
3868 			 * some data have been written. This is consistent
3869 			 * with strwrite semantics and BSD sockets semantics.
3870 			 */
3871 			if (so->so_state & SS_CANTSENDMORE) {
3872 				tsignal(curthread, SIGPIPE);
3873 				eprintsoline(so, error);
3874 				return (EPIPE);
3875 			}
3876 			if (so->so_error != 0) {
3877 				mutex_enter(&so->so_lock);
3878 				error = sogeterr(so);
3879 				mutex_exit(&so->so_lock);
3880 				if (error != 0) {
3881 					eprintsoline(so, error);
3882 					return (error);
3883 				}
3884 			}
3885 		}
3886 	} while (uiop->uio_resid > 0);
3887 	return (0);
3888 }
3889 
3890 /*
3891  * Sending data on a datagram socket.
3892  * Assumes caller has verified that SS_ISBOUND etc. are set.
3893  *
3894  * For AF_UNIX the destination address is translated to an internal
3895  * name and the source address is passed as an option.
3896  */
3897 int
3898 sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
3899     struct uio *uiop, int flags)
3900 {
3901 	struct T_unitdata_req	tudr;
3902 	mblk_t			*mp;
3903 	int			error;
3904 	void			*addr;
3905 	socklen_t		addrlen;
3906 	void			*src;
3907 	socklen_t		srclen;
3908 	ssize_t			len;
3909 
3910 	ASSERT(name != NULL && namelen != 0);
3911 
3912 	len = uiop->uio_resid;
3913 	if (len > so->so_tidu_size) {
3914 		error = EMSGSIZE;
3915 		goto done;
3916 	}
3917 
3918 	/* Length and family checks */
3919 	error = so_addr_verify(so, name, namelen);
3920 	if (error != 0)
3921 		goto done;
3922 
3923 	if (so->so_state & SS_DIRECT)
3924 		return (sodgram_direct(so, name, namelen, uiop, flags));
3925 
3926 	if (so->so_family == AF_UNIX) {
3927 		if (so->so_state & SS_FADDR_NOXLATE) {
3928 			/*
3929 			 * Already have a transport internal address. Do not
3930 			 * pass any (transport internal) source address.
3931 			 */
3932 			addr = name;
3933 			addrlen = namelen;
3934 			src = NULL;
3935 			srclen = 0;
3936 		} else {
3937 			/*
3938 			 * Pass the sockaddr_un source address as an option
3939 			 * and translate the remote address.
3940 			 *
3941 			 * Note that this code does not prevent so_laddr_sa
3942 			 * from changing while it is being used. Thus
3943 			 * if an unbind+bind occurs concurrently with this
3944 			 * send the peer might see a partially new and a
3945 			 * partially old "from" address.
3946 			 */
3947 			src = so->so_laddr_sa;
3948 			srclen = (socklen_t)so->so_laddr_len;
3949 			dprintso(so, 1,
3950 			    ("sosend_dgram UNIX: srclen %d, src %p\n",
3951 			    srclen, src));
3952 			error = so_ux_addr_xlate(so, name, namelen,
3953 			    (flags & MSG_XPG4_2),
3954 			    &addr, &addrlen);
3955 			if (error) {
3956 				eprintsoline(so, error);
3957 				goto done;
3958 			}
3959 		}
3960 	} else {
3961 		addr = name;
3962 		addrlen = namelen;
3963 		src = NULL;
3964 		srclen = 0;
3965 	}
3966 	tudr.PRIM_type = T_UNITDATA_REQ;
3967 	tudr.DEST_length = addrlen;
3968 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3969 	if (srclen == 0) {
3970 		tudr.OPT_length = 0;
3971 		tudr.OPT_offset = 0;
3972 
3973 		mp = soallocproto2(&tudr, sizeof (tudr),
3974 		    addr, addrlen, 0, _ALLOC_INTR);
3975 		if (mp == NULL) {
3976 			/*
3977 			 * Caught a signal waiting for memory.
3978 			 * Let send* return EINTR.
3979 			 */
3980 			error = EINTR;
3981 			goto done;
3982 		}
3983 	} else {
3984 		/*
3985 		 * There is a AF_UNIX sockaddr_un to include as a source
3986 		 * address option.
3987 		 */
3988 		struct T_opthdr toh;
3989 		ssize_t size;
3990 
3991 		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
3992 		    _TPI_ALIGN_TOPT(srclen));
3993 		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3994 		    _TPI_ALIGN_TOPT(addrlen));
3995 
3996 		toh.level = SOL_SOCKET;
3997 		toh.name = SO_SRCADDR;
3998 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3999 		toh.status = 0;
4000 
4001 		size = tudr.OPT_offset + tudr.OPT_length;
4002 		mp = soallocproto2(&tudr, sizeof (tudr),
4003 		    addr, addrlen, size, _ALLOC_INTR);
4004 		if (mp == NULL) {
4005 			/*
4006 			 * Caught a signal waiting for memory.
4007 			 * Let send* return EINTR.
4008 			 */
4009 			error = EINTR;
4010 			goto done;
4011 		}
4012 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
4013 		soappendmsg(mp, &toh, sizeof (toh));
4014 		soappendmsg(mp, src, srclen);
4015 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
4016 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
4017 	}
4018 
4019 	if (audit_active)
4020 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4021 
4022 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4023 done:
4024 #ifdef SOCK_DEBUG
4025 	if (error) {
4026 		eprintsoline(so, error);
4027 	}
4028 #endif /* SOCK_DEBUG */
4029 	return (error);
4030 }
4031 
4032 /*
4033  * Sending data on a connected stream socket.
4034  * Assumes caller has verified that SS_ISCONNECTED is set.
4035  */
4036 int
4037 sosend_svc(struct sonode *so,
4038 	struct uio *uiop,
4039 	t_scalar_t prim,
4040 	int more,
4041 	int sflag)
4042 {
4043 	struct T_data_req	tdr;
4044 	mblk_t			*mp;
4045 	int			error;
4046 	ssize_t			iosize;
4047 	int			first = 1;
4048 
4049 	dprintso(so, 1,
4050 	    ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
4051 	    (void *)so, uiop->uio_resid, prim, sflag));
4052 
4053 	/*
4054 	 * Has to be bound and connected. However, since no locks are
4055 	 * held the state could have changed after sotpi_sendmsg checked it
4056 	 * thus it is not possible to ASSERT on the state.
4057 	 */
4058 
4059 	do {
4060 		/*
4061 		 * Set the MORE flag if uio_resid does not fit in this
4062 		 * message or if the caller passed in "more".
4063 		 * Error for transports with zero tidu_size.
4064 		 */
4065 		tdr.PRIM_type = prim;
4066 		iosize = so->so_tidu_size;
4067 		if (iosize <= 0)
4068 			return (EMSGSIZE);
4069 		if (uiop->uio_resid > iosize) {
4070 			tdr.MORE_flag = 1;
4071 		} else {
4072 			if (more)
4073 				tdr.MORE_flag = 1;
4074 			else
4075 				tdr.MORE_flag = 0;
4076 			iosize = uiop->uio_resid;
4077 		}
4078 		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4079 		    prim, tdr.MORE_flag, iosize));
4080 		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR);
4081 		if (mp == NULL) {
4082 			/*
4083 			 * Caught a signal waiting for memory.
4084 			 * Let send* return EINTR.
4085 			 */
4086 			if (first)
4087 				return (EINTR);
4088 			else
4089 				return (0);
4090 		}
4091 
4092 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4093 		    0, sflag | MSG_BAND, 0);
4094 		if (error) {
4095 			if (!first && error == EWOULDBLOCK)
4096 				return (0);
4097 			eprintsoline(so, error);
4098 			return (error);
4099 		}
4100 		first = 0;
4101 		if (uiop->uio_resid > 0) {
4102 			/*
4103 			 * Recheck for fatal errors. Fail write even though
4104 			 * some data have been written. This is consistent
4105 			 * with strwrite semantics and BSD sockets semantics.
4106 			 */
4107 			if (so->so_state & SS_CANTSENDMORE) {
4108 				tsignal(curthread, SIGPIPE);
4109 				eprintsoline(so, error);
4110 				return (EPIPE);
4111 			}
4112 			if (so->so_error != 0) {
4113 				mutex_enter(&so->so_lock);
4114 				error = sogeterr(so);
4115 				mutex_exit(&so->so_lock);
4116 				if (error != 0) {
4117 					eprintsoline(so, error);
4118 					return (error);
4119 				}
4120 			}
4121 		}
4122 	} while (uiop->uio_resid > 0);
4123 	return (0);
4124 }
4125 
4126 /*
4127  * Check the state for errors and call the appropriate send function.
4128  *
4129  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4130  * this function issues a setsockopt to toggle SO_DONTROUTE before and
4131  * after sending the message.
4132  */
4133 static int
4134 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
4135 {
4136 	int		so_state;
4137 	int		so_mode;
4138 	int		error;
4139 	struct sockaddr *name;
4140 	t_uscalar_t	namelen;
4141 	int		dontroute;
4142 	int		flags;
4143 
4144 	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4145 	    (void *)so, (void *)msg, msg->msg_flags,
4146 	    pr_state(so->so_state, so->so_mode), so->so_error));
4147 
4148 	mutex_enter(&so->so_lock);
4149 	so_state = so->so_state;
4150 
4151 	if (so_state & SS_CANTSENDMORE) {
4152 		mutex_exit(&so->so_lock);
4153 		tsignal(curthread, SIGPIPE);
4154 		return (EPIPE);
4155 	}
4156 
4157 	if (so->so_error != 0) {
4158 		error = sogeterr(so);
4159 		if (error != 0) {
4160 			mutex_exit(&so->so_lock);
4161 			return (error);
4162 		}
4163 	}
4164 
4165 	name = (struct sockaddr *)msg->msg_name;
4166 	namelen = msg->msg_namelen;
4167 
4168 	so_mode = so->so_mode;
4169 
4170 	if (name == NULL) {
4171 		if (!(so_state & SS_ISCONNECTED)) {
4172 			mutex_exit(&so->so_lock);
4173 			if (so_mode & SM_CONNREQUIRED)
4174 				return (ENOTCONN);
4175 			else
4176 				return (EDESTADDRREQ);
4177 		}
4178 		if (so_mode & SM_CONNREQUIRED) {
4179 			name = NULL;
4180 			namelen = 0;
4181 		} else {
4182 			/*
4183 			 * Note that this code does not prevent so_faddr_sa
4184 			 * from changing while it is being used. Thus
4185 			 * if an "unconnect"+connect occurs concurrently with
4186 			 * this send the datagram might be delivered to a
4187 			 * garbaled address.
4188 			 */
4189 			ASSERT(so->so_faddr_sa);
4190 			name = so->so_faddr_sa;
4191 			namelen = (t_uscalar_t)so->so_faddr_len;
4192 		}
4193 	} else {
4194 		if (!(so_state & SS_ISCONNECTED) &&
4195 		    (so_mode & SM_CONNREQUIRED)) {
4196 			/* Required but not connected */
4197 			mutex_exit(&so->so_lock);
4198 			return (ENOTCONN);
4199 		}
4200 		/*
4201 		 * Ignore the address on connection-oriented sockets.
4202 		 * Just like BSD this code does not generate an error for
4203 		 * TCP (a CONNREQUIRED socket) when sending to an address
4204 		 * passed in with sendto/sendmsg. Instead the data is
4205 		 * delivered on the connection as if no address had been
4206 		 * supplied.
4207 		 */
4208 		if ((so_state & SS_ISCONNECTED) &&
4209 		    !(so_mode & SM_CONNREQUIRED)) {
4210 			mutex_exit(&so->so_lock);
4211 			return (EISCONN);
4212 		}
4213 		if (!(so_state & SS_ISBOUND)) {
4214 			so_lock_single(so);	/* Set SOLOCKED */
4215 			error = sotpi_bind(so, NULL, 0,
4216 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD);
4217 			so_unlock_single(so, SOLOCKED);
4218 			if (error) {
4219 				mutex_exit(&so->so_lock);
4220 				eprintsoline(so, error);
4221 				return (error);
4222 			}
4223 		}
4224 		/*
4225 		 * Handle delayed datagram errors. These are only queued
4226 		 * when the application sets SO_DGRAM_ERRIND.
4227 		 * Return the error if we are sending to the address
4228 		 * that was returned in the last T_UDERROR_IND.
4229 		 * If sending to some other address discard the delayed
4230 		 * error indication.
4231 		 */
4232 		if (so->so_delayed_error) {
4233 			struct T_uderror_ind	*tudi;
4234 			void			*addr;
4235 			t_uscalar_t		addrlen;
4236 			boolean_t		match = B_FALSE;
4237 
4238 			ASSERT(so->so_eaddr_mp);
4239 			error = so->so_delayed_error;
4240 			so->so_delayed_error = 0;
4241 			tudi = (struct T_uderror_ind *)so->so_eaddr_mp->b_rptr;
4242 			addrlen = tudi->DEST_length;
4243 			addr = sogetoff(so->so_eaddr_mp,
4244 			    tudi->DEST_offset,
4245 			    addrlen, 1);
4246 			ASSERT(addr);	/* Checked by strsock_proto */
4247 			switch (so->so_family) {
4248 			case AF_INET: {
4249 				/* Compare just IP address and port */
4250 				sin_t *sin1 = (sin_t *)name;
4251 				sin_t *sin2 = (sin_t *)addr;
4252 
4253 				if (addrlen == sizeof (sin_t) &&
4254 				    namelen == addrlen &&
4255 				    sin1->sin_port == sin2->sin_port &&
4256 				    sin1->sin_addr.s_addr ==
4257 				    sin2->sin_addr.s_addr)
4258 					match = B_TRUE;
4259 				break;
4260 			}
4261 			case AF_INET6: {
4262 				/* Compare just IP address and port. Not flow */
4263 				sin6_t *sin1 = (sin6_t *)name;
4264 				sin6_t *sin2 = (sin6_t *)addr;
4265 
4266 				if (addrlen == sizeof (sin6_t) &&
4267 				    namelen == addrlen &&
4268 				    sin1->sin6_port == sin2->sin6_port &&
4269 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4270 				    &sin2->sin6_addr))
4271 					match = B_TRUE;
4272 				break;
4273 			}
4274 			case AF_UNIX:
4275 			default:
4276 				if (namelen == addrlen &&
4277 				    bcmp(name, addr, namelen) == 0)
4278 					match = B_TRUE;
4279 			}
4280 			if (match) {
4281 				freemsg(so->so_eaddr_mp);
4282 				so->so_eaddr_mp = NULL;
4283 				mutex_exit(&so->so_lock);
4284 #ifdef DEBUG
4285 				dprintso(so, 0,
4286 				    ("sockfs delayed error %d for %s\n",
4287 				    error,
4288 				    pr_addr(so->so_family, name, namelen)));
4289 #endif /* DEBUG */
4290 				return (error);
4291 			}
4292 			freemsg(so->so_eaddr_mp);
4293 			so->so_eaddr_mp = NULL;
4294 		}
4295 	}
4296 	mutex_exit(&so->so_lock);
4297 
4298 	flags = msg->msg_flags;
4299 	dontroute = 0;
4300 	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4301 		uint32_t	val;
4302 
4303 		val = 1;
4304 		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4305 		    &val, (t_uscalar_t)sizeof (val));
4306 		if (error)
4307 			return (error);
4308 		dontroute = 1;
4309 	}
4310 
4311 	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4312 		error = EOPNOTSUPP;
4313 		goto done;
4314 	}
4315 	if (msg->msg_controllen != 0) {
4316 		if (!(so_mode & SM_CONNREQUIRED)) {
4317 			error = sosend_dgramcmsg(so, name, namelen, uiop,
4318 			    msg->msg_control, msg->msg_controllen, flags);
4319 		} else {
4320 			if (flags & MSG_OOB) {
4321 				/* Can't generate T_EXDATA_REQ with options */
4322 				error = EOPNOTSUPP;
4323 				goto done;
4324 			}
4325 			error = sosend_svccmsg(so, uiop,
4326 			    !(flags & MSG_EOR),
4327 			    msg->msg_control, msg->msg_controllen,
4328 			    flags);
4329 		}
4330 		goto done;
4331 	}
4332 
4333 	if (!(so_mode & SM_CONNREQUIRED)) {
4334 		/*
4335 		 * If there is no SO_DONTROUTE to turn off return immediately
4336 		 * from send_dgram. This can allow tail-call optimizations.
4337 		 */
4338 		if (!dontroute) {
4339 			return (sosend_dgram(so, name, namelen, uiop, flags));
4340 		}
4341 		error = sosend_dgram(so, name, namelen, uiop, flags);
4342 	} else {
4343 		t_scalar_t prim;
4344 		int sflag;
4345 
4346 		/* Ignore msg_name in the connected state */
4347 		if (flags & MSG_OOB) {
4348 			prim = T_EXDATA_REQ;
4349 			/*
4350 			 * Send down T_EXDATA_REQ even if there is flow
4351 			 * control for data.
4352 			 */
4353 			sflag = MSG_IGNFLOW;
4354 		} else {
4355 			if (so_mode & SM_BYTESTREAM) {
4356 				/* Byte stream transport - use write */
4357 
4358 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4359 				/*
4360 				 * If there is no SO_DONTROUTE to turn off,
4361 				 * SS_DIRECT is on, and there is no flow
4362 				 * control, we can take the fast path.
4363 				 */
4364 				if (!dontroute &&
4365 				    (so_state & SS_DIRECT) &&
4366 				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4367 					return (sostream_direct(so, uiop,
4368 					    NULL, CRED()));
4369 				}
4370 				error = strwrite(SOTOV(so), uiop, CRED());
4371 				goto done;
4372 			}
4373 			prim = T_DATA_REQ;
4374 			sflag = 0;
4375 		}
4376 		/*
4377 		 * If there is no SO_DONTROUTE to turn off return immediately
4378 		 * from sosend_svc. This can allow tail-call optimizations.
4379 		 */
4380 		if (!dontroute)
4381 			return (sosend_svc(so, uiop, prim,
4382 			    !(flags & MSG_EOR), sflag));
4383 		error = sosend_svc(so, uiop, prim,
4384 		    !(flags & MSG_EOR), sflag);
4385 	}
4386 	ASSERT(dontroute);
4387 done:
4388 	if (dontroute) {
4389 		uint32_t	val;
4390 
4391 		val = 0;
4392 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4393 		    &val, (t_uscalar_t)sizeof (val));
4394 	}
4395 	return (error);
4396 }
4397 
4398 /*
4399  * Sending data on a datagram socket.
4400  * Assumes caller has verified that SS_ISBOUND etc. are set.
4401  */
4402 /* ARGSUSED */
4403 static int
4404 sodgram_direct(struct sonode *so, struct sockaddr *name,
4405     socklen_t namelen, struct uio *uiop, int flags)
4406 {
4407 	struct T_unitdata_req	tudr;
4408 	mblk_t			*mp = NULL;
4409 	int			error = 0;
4410 	void			*addr;
4411 	socklen_t		addrlen;
4412 	ssize_t			len;
4413 	struct stdata		*stp = SOTOV(so)->v_stream;
4414 	int			so_state;
4415 	queue_t			*udp_wq;
4416 	boolean_t		connected;
4417 	mblk_t			*mpdata = NULL;
4418 
4419 	ASSERT(name != NULL && namelen != 0);
4420 	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4421 	ASSERT(!(so->so_mode & SM_EXDATA));
4422 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4423 	ASSERT(SOTOV(so)->v_type == VSOCK);
4424 
4425 	/* Caller checked for proper length */
4426 	len = uiop->uio_resid;
4427 	ASSERT(len <= so->so_tidu_size);
4428 
4429 	/* Length and family checks have been done by caller */
4430 	ASSERT(name->sa_family == so->so_family);
4431 	ASSERT(so->so_family == AF_INET ||
4432 	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4433 	ASSERT(so->so_family == AF_INET6 ||
4434 	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4435 
4436 	addr = name;
4437 	addrlen = namelen;
4438 
4439 	if (stp->sd_sidp != NULL &&
4440 	    (error = straccess(stp, JCWRITE)) != 0)
4441 		goto done;
4442 
4443 	so_state = so->so_state;
4444 
4445 	connected = so_state & SS_ISCONNECTED;
4446 	if (!connected) {
4447 		tudr.PRIM_type = T_UNITDATA_REQ;
4448 		tudr.DEST_length = addrlen;
4449 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4450 		tudr.OPT_length = 0;
4451 		tudr.OPT_offset = 0;
4452 
4453 		mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4454 		    _ALLOC_INTR);
4455 		if (mp == NULL) {
4456 			/*
4457 			 * Caught a signal waiting for memory.
4458 			 * Let send* return EINTR.
4459 			 */
4460 			error = EINTR;
4461 			goto done;
4462 		}
4463 	}
4464 
4465 	/*
4466 	 * For UDP we don't break up the copyin into smaller pieces
4467 	 * as in the TCP case.  That means if ENOMEM is returned by
4468 	 * mcopyinuio() then the uio vector has not been modified at
4469 	 * all and we fallback to either strwrite() or kstrputmsg()
4470 	 * below.  Note also that we never generate priority messages
4471 	 * from here.
4472 	 */
4473 	udp_wq = stp->sd_wrq->q_next;
4474 	if (canput(udp_wq) &&
4475 	    (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4476 		ASSERT(DB_TYPE(mpdata) == M_DATA);
4477 		ASSERT(uiop->uio_resid == 0);
4478 		if (!connected)
4479 			linkb(mp, mpdata);
4480 		else
4481 			mp = mpdata;
4482 		if (audit_active)
4483 			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4484 
4485 		udp_wput(udp_wq, mp);
4486 		return (0);
4487 	}
4488 
4489 	ASSERT(mpdata == NULL);
4490 	if (error != 0 && error != ENOMEM) {
4491 		freemsg(mp);
4492 		return (error);
4493 	}
4494 
4495 	/*
4496 	 * For connected, let strwrite() handle the blocking case.
4497 	 * Otherwise we fall thru and use kstrputmsg().
4498 	 */
4499 	if (connected)
4500 		return (strwrite(SOTOV(so), uiop, CRED()));
4501 
4502 	if (audit_active)
4503 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4504 
4505 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4506 done:
4507 #ifdef SOCK_DEBUG
4508 	if (error != 0) {
4509 		eprintsoline(so, error);
4510 	}
4511 #endif /* SOCK_DEBUG */
4512 	return (error);
4513 }
4514 
4515 int
4516 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4517 {
4518 	struct stdata *stp = SOTOV(so)->v_stream;
4519 	ssize_t iosize, rmax, maxblk;
4520 	queue_t *tcp_wq = stp->sd_wrq->q_next;
4521 	mblk_t *newmp;
4522 	int error = 0, wflag = 0;
4523 
4524 	ASSERT(so->so_mode & SM_BYTESTREAM);
4525 	ASSERT(SOTOV(so)->v_type == VSOCK);
4526 
4527 	if (stp->sd_sidp != NULL &&
4528 	    (error = straccess(stp, JCWRITE)) != 0)
4529 		return (error);
4530 
4531 	if (uiop == NULL) {
4532 		/*
4533 		 * kstrwritemp() should have checked sd_flag and
4534 		 * flow-control before coming here.  If we end up
4535 		 * here it means that we can simply pass down the
4536 		 * data to tcp.
4537 		 */
4538 		ASSERT(mp != NULL);
4539 		if (stp->sd_wputdatafunc != NULL) {
4540 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4541 			    NULL, NULL, NULL);
4542 			if (newmp == NULL) {
4543 				/* The caller will free mp */
4544 				return (ECOMM);
4545 			}
4546 			mp = newmp;
4547 		}
4548 		tcp_wput(tcp_wq, mp);
4549 		return (0);
4550 	}
4551 
4552 	/* Fallback to strwrite() to do proper error handling */
4553 	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4554 		return (strwrite(SOTOV(so), uiop, cr));
4555 
4556 	rmax = stp->sd_qn_maxpsz;
4557 	ASSERT(rmax >= 0 || rmax == INFPSZ);
4558 	if (rmax == 0 || uiop->uio_resid <= 0)
4559 		return (0);
4560 
4561 	if (rmax == INFPSZ)
4562 		rmax = uiop->uio_resid;
4563 
4564 	maxblk = stp->sd_maxblk;
4565 
4566 	for (;;) {
4567 		iosize = MIN(uiop->uio_resid, rmax);
4568 
4569 		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4570 		if (mp == NULL) {
4571 			/*
4572 			 * Fallback to strwrite() for ENOMEM; if this
4573 			 * is our first time in this routine and the uio
4574 			 * vector has not been modified, we will end up
4575 			 * calling strwrite() without any flag set.
4576 			 */
4577 			if (error == ENOMEM)
4578 				goto slow_send;
4579 			else
4580 				return (error);
4581 		}
4582 		ASSERT(uiop->uio_resid >= 0);
4583 		/*
4584 		 * If mp is non-NULL and ENOMEM is set, it means that
4585 		 * mcopyinuio() was able to break down some of the user
4586 		 * data into one or more mblks.  Send the partial data
4587 		 * to tcp and let the rest be handled in strwrite().
4588 		 */
4589 		ASSERT(error == 0 || error == ENOMEM);
4590 		if (stp->sd_wputdatafunc != NULL) {
4591 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4592 			    NULL, NULL, NULL);
4593 			if (newmp == NULL) {
4594 				/* The caller will free mp */
4595 				return (ECOMM);
4596 			}
4597 			mp = newmp;
4598 		}
4599 		tcp_wput(tcp_wq, mp);
4600 
4601 		wflag |= NOINTR;
4602 
4603 		if (uiop->uio_resid == 0) {	/* No more data; we're done */
4604 			ASSERT(error == 0);
4605 			break;
4606 		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4607 		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4608 slow_send:
4609 			/*
4610 			 * We were able to send down partial data using
4611 			 * the direct call interface, but are now relying
4612 			 * on strwrite() to handle the non-fastpath cases.
4613 			 * If the socket is blocking we will sleep in
4614 			 * strwaitq() until write is permitted, otherwise,
4615 			 * we will need to return the amount of bytes
4616 			 * written so far back to the app.  This is the
4617 			 * reason why we pass NOINTR flag to strwrite()
4618 			 * for non-blocking socket, because we don't want
4619 			 * to return EAGAIN when portion of the user data
4620 			 * has actually been sent down.
4621 			 */
4622 			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4623 		}
4624 	}
4625 	return (0);
4626 }
4627 
4628 /*
4629  * Update so_faddr by asking the transport (unless AF_UNIX).
4630  */
4631 int
4632 sotpi_getpeername(struct sonode *so)
4633 {
4634 	struct strbuf	strbuf;
4635 	int		error = 0, res;
4636 	void		*addr;
4637 	t_uscalar_t	addrlen;
4638 	k_sigset_t	smask;
4639 
4640 	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4641 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4642 
4643 	mutex_enter(&so->so_lock);
4644 	so_lock_single(so);	/* Set SOLOCKED */
4645 	if (!(so->so_state & SS_ISCONNECTED)) {
4646 		error = ENOTCONN;
4647 		goto done;
4648 	}
4649 	/* Added this check for X/Open */
4650 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4651 		error = EINVAL;
4652 		if (xnet_check_print) {
4653 			printf("sockfs: X/Open getpeername check => EINVAL\n");
4654 		}
4655 		goto done;
4656 	}
4657 #ifdef DEBUG
4658 	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4659 	    pr_addr(so->so_family, so->so_faddr_sa,
4660 	    (t_uscalar_t)so->so_faddr_len)));
4661 #endif /* DEBUG */
4662 
4663 	if (so->so_family == AF_UNIX) {
4664 		/* Transport has different name space - return local info */
4665 		error = 0;
4666 		goto done;
4667 	}
4668 
4669 	ASSERT(so->so_faddr_sa);
4670 	/* Allocate local buffer to use with ioctl */
4671 	addrlen = (t_uscalar_t)so->so_faddr_maxlen;
4672 	mutex_exit(&so->so_lock);
4673 	addr = kmem_alloc(addrlen, KM_SLEEP);
4674 
4675 	/*
4676 	 * Issue TI_GETPEERNAME with signals masked.
4677 	 * Put the result in so_faddr_sa so that getpeername works after
4678 	 * a shutdown(output).
4679 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4680 	 * back to the socket.
4681 	 */
4682 	strbuf.buf = addr;
4683 	strbuf.maxlen = addrlen;
4684 	strbuf.len = 0;
4685 
4686 	sigintr(&smask, 0);
4687 	res = 0;
4688 	ASSERT(CRED());
4689 	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4690 	    0, K_TO_K, CRED(), &res);
4691 	sigunintr(&smask);
4692 
4693 	mutex_enter(&so->so_lock);
4694 	/*
4695 	 * If there is an error record the error in so_error put don't fail
4696 	 * the getpeername. Instead fallback on the recorded
4697 	 * so->so_faddr_sa.
4698 	 */
4699 	if (error) {
4700 		/*
4701 		 * Various stream head errors can be returned to the ioctl.
4702 		 * However, it is impossible to determine which ones of
4703 		 * these are really socket level errors that were incorrectly
4704 		 * consumed by the ioctl. Thus this code silently ignores the
4705 		 * error - to code explicitly does not reinstate the error
4706 		 * using soseterror().
4707 		 * Experiments have shows that at least this set of
4708 		 * errors are reported and should not be reinstated on the
4709 		 * socket:
4710 		 *	EINVAL	E.g. if an I_LINK was in effect when
4711 		 *		getpeername was called.
4712 		 *	EPIPE	The ioctl error semantics prefer the write
4713 		 *		side error over the read side error.
4714 		 *	ENOTCONN The transport just got disconnected but
4715 		 *		sockfs had not yet seen the T_DISCON_IND
4716 		 *		when issuing the ioctl.
4717 		 */
4718 		error = 0;
4719 	} else if (res == 0 && strbuf.len > 0 &&
4720 	    (so->so_state & SS_ISCONNECTED)) {
4721 		ASSERT(strbuf.len <= (int)so->so_faddr_maxlen);
4722 		so->so_faddr_len = (socklen_t)strbuf.len;
4723 		bcopy(addr, so->so_faddr_sa, so->so_faddr_len);
4724 		so->so_state |= SS_FADDR_VALID;
4725 	}
4726 	kmem_free(addr, addrlen);
4727 #ifdef DEBUG
4728 	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
4729 	    pr_addr(so->so_family, so->so_faddr_sa,
4730 	    (t_uscalar_t)so->so_faddr_len)));
4731 #endif /* DEBUG */
4732 done:
4733 	so_unlock_single(so, SOLOCKED);
4734 	mutex_exit(&so->so_lock);
4735 	return (error);
4736 }
4737 
4738 /*
4739  * Update so_laddr by asking the transport (unless AF_UNIX).
4740  */
4741 int
4742 sotpi_getsockname(struct sonode *so)
4743 {
4744 	struct strbuf	strbuf;
4745 	int		error = 0, res;
4746 	void		*addr;
4747 	t_uscalar_t	addrlen;
4748 	k_sigset_t	smask;
4749 
4750 	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
4751 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4752 
4753 	mutex_enter(&so->so_lock);
4754 	so_lock_single(so);	/* Set SOLOCKED */
4755 	if (!(so->so_state & SS_ISBOUND) && so->so_family != AF_UNIX) {
4756 		/* Return an all zero address except for the family */
4757 		if (so->so_family == AF_INET)
4758 			so->so_laddr_len = (socklen_t)sizeof (sin_t);
4759 		else if (so->so_family == AF_INET6)
4760 			so->so_laddr_len = (socklen_t)sizeof (sin6_t);
4761 		ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
4762 		bzero(so->so_laddr_sa, so->so_laddr_len);
4763 		/*
4764 		 * Can not assume there is a sa_family for all
4765 		 * protocol families.
4766 		 */
4767 		if (so->so_family == AF_INET || so->so_family == AF_INET6)
4768 			so->so_laddr_sa->sa_family = so->so_family;
4769 	}
4770 #ifdef DEBUG
4771 	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
4772 	    pr_addr(so->so_family, so->so_laddr_sa,
4773 	    (t_uscalar_t)so->so_laddr_len)));
4774 #endif /* DEBUG */
4775 	if (so->so_family == AF_UNIX) {
4776 		/* Transport has different name space - return local info */
4777 		error = 0;
4778 		goto done;
4779 	}
4780 	if (!(so->so_state & SS_ISBOUND)) {
4781 		/* If not bound, then nothing to return. */
4782 		error = 0;
4783 		goto done;
4784 	}
4785 	/* Allocate local buffer to use with ioctl */
4786 	addrlen = (t_uscalar_t)so->so_laddr_maxlen;
4787 	mutex_exit(&so->so_lock);
4788 	addr = kmem_alloc(addrlen, KM_SLEEP);
4789 
4790 	/*
4791 	 * Issue TI_GETMYNAME with signals masked.
4792 	 * Put the result in so_laddr_sa so that getsockname works after
4793 	 * a shutdown(output).
4794 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4795 	 * back to the socket.
4796 	 */
4797 	strbuf.buf = addr;
4798 	strbuf.maxlen = addrlen;
4799 	strbuf.len = 0;
4800 
4801 	sigintr(&smask, 0);
4802 	res = 0;
4803 	ASSERT(CRED());
4804 	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
4805 	    0, K_TO_K, CRED(), &res);
4806 	sigunintr(&smask);
4807 
4808 	mutex_enter(&so->so_lock);
4809 	/*
4810 	 * If there is an error record the error in so_error put don't fail
4811 	 * the getsockname. Instead fallback on the recorded
4812 	 * so->so_laddr_sa.
4813 	 */
4814 	if (error) {
4815 		/*
4816 		 * Various stream head errors can be returned to the ioctl.
4817 		 * However, it is impossible to determine which ones of
4818 		 * these are really socket level errors that were incorrectly
4819 		 * consumed by the ioctl. Thus this code silently ignores the
4820 		 * error - to code explicitly does not reinstate the error
4821 		 * using soseterror().
4822 		 * Experiments have shows that at least this set of
4823 		 * errors are reported and should not be reinstated on the
4824 		 * socket:
4825 		 *	EINVAL	E.g. if an I_LINK was in effect when
4826 		 *		getsockname was called.
4827 		 *	EPIPE	The ioctl error semantics prefer the write
4828 		 *		side error over the read side error.
4829 		 */
4830 		error = 0;
4831 	} else if (res == 0 && strbuf.len > 0 &&
4832 	    (so->so_state & SS_ISBOUND)) {
4833 		ASSERT(strbuf.len <= (int)so->so_laddr_maxlen);
4834 		so->so_laddr_len = (socklen_t)strbuf.len;
4835 		bcopy(addr, so->so_laddr_sa, so->so_laddr_len);
4836 		so->so_state |= SS_LADDR_VALID;
4837 	}
4838 	kmem_free(addr, addrlen);
4839 #ifdef DEBUG
4840 	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
4841 	    pr_addr(so->so_family, so->so_laddr_sa,
4842 	    (t_uscalar_t)so->so_laddr_len)));
4843 #endif /* DEBUG */
4844 done:
4845 	so_unlock_single(so, SOLOCKED);
4846 	mutex_exit(&so->so_lock);
4847 	return (error);
4848 }
4849 
4850 /*
4851  * Get socket options. For SOL_SOCKET options some options are handled
4852  * by the sockfs while others use the value recorded in the sonode as a
4853  * fallback should the T_SVR4_OPTMGMT_REQ fail.
4854  *
4855  * On the return most *optlenp bytes are copied to optval.
4856  */
4857 int
4858 sotpi_getsockopt(struct sonode *so, int level, int option_name,
4859 		void *optval, socklen_t *optlenp, int flags)
4860 {
4861 	struct T_optmgmt_req	optmgmt_req;
4862 	struct T_optmgmt_ack	*optmgmt_ack;
4863 	struct opthdr		oh;
4864 	struct opthdr		*opt_res;
4865 	mblk_t			*mp = NULL;
4866 	int			error = 0;
4867 	void			*option = NULL;	/* Set if fallback value */
4868 	t_uscalar_t		maxlen = *optlenp;
4869 	t_uscalar_t		len;
4870 	uint32_t		value;
4871 
4872 	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
4873 	    (void *)so, level, option_name, optval, (void *)optlenp,
4874 	    pr_state(so->so_state, so->so_mode)));
4875 
4876 	mutex_enter(&so->so_lock);
4877 	so_lock_single(so);	/* Set SOLOCKED */
4878 
4879 	/*
4880 	 * Check for SOL_SOCKET options.
4881 	 * Certain SOL_SOCKET options are returned directly whereas
4882 	 * others only provide a default (fallback) value should
4883 	 * the T_SVR4_OPTMGMT_REQ fail.
4884 	 */
4885 	if (level == SOL_SOCKET) {
4886 		/* Check parameters */
4887 		switch (option_name) {
4888 		case SO_TYPE:
4889 		case SO_ERROR:
4890 		case SO_DEBUG:
4891 		case SO_ACCEPTCONN:
4892 		case SO_REUSEADDR:
4893 		case SO_KEEPALIVE:
4894 		case SO_DONTROUTE:
4895 		case SO_BROADCAST:
4896 		case SO_USELOOPBACK:
4897 		case SO_OOBINLINE:
4898 		case SO_SNDBUF:
4899 		case SO_RCVBUF:
4900 #ifdef notyet
4901 		case SO_SNDLOWAT:
4902 		case SO_RCVLOWAT:
4903 		case SO_SNDTIMEO:
4904 		case SO_RCVTIMEO:
4905 #endif /* notyet */
4906 		case SO_DOMAIN:
4907 		case SO_DGRAM_ERRIND:
4908 			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
4909 				error = EINVAL;
4910 				eprintsoline(so, error);
4911 				goto done2;
4912 			}
4913 			break;
4914 		case SO_LINGER:
4915 			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
4916 				error = EINVAL;
4917 				eprintsoline(so, error);
4918 				goto done2;
4919 			}
4920 			break;
4921 		}
4922 
4923 		len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
4924 
4925 		switch (option_name) {
4926 		case SO_TYPE:
4927 			value = so->so_type;
4928 			option = &value;
4929 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4930 
4931 		case SO_ERROR:
4932 			value = sogeterr(so);
4933 			option = &value;
4934 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4935 
4936 		case SO_ACCEPTCONN:
4937 			if (so->so_state & SS_ACCEPTCONN)
4938 				value = SO_ACCEPTCONN;
4939 			else
4940 				value = 0;
4941 #ifdef DEBUG
4942 			if (value) {
4943 				dprintso(so, 1,
4944 				    ("sotpi_getsockopt: 0x%x is set\n",
4945 				    option_name));
4946 			} else {
4947 				dprintso(so, 1,
4948 				    ("sotpi_getsockopt: 0x%x not set\n",
4949 				    option_name));
4950 			}
4951 #endif /* DEBUG */
4952 			option = &value;
4953 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4954 
4955 		case SO_DEBUG:
4956 		case SO_REUSEADDR:
4957 		case SO_KEEPALIVE:
4958 		case SO_DONTROUTE:
4959 		case SO_BROADCAST:
4960 		case SO_USELOOPBACK:
4961 		case SO_OOBINLINE:
4962 		case SO_DGRAM_ERRIND:
4963 			value = (so->so_options & option_name);
4964 #ifdef DEBUG
4965 			if (value) {
4966 				dprintso(so, 1,
4967 				    ("sotpi_getsockopt: 0x%x is set\n",
4968 				    option_name));
4969 			} else {
4970 				dprintso(so, 1,
4971 				    ("sotpi_getsockopt: 0x%x not set\n",
4972 				    option_name));
4973 			}
4974 #endif /* DEBUG */
4975 			option = &value;
4976 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4977 
4978 		/*
4979 		 * The following options are only returned by sockfs when the
4980 		 * T_SVR4_OPTMGMT_REQ fails.
4981 		 */
4982 		case SO_LINGER:
4983 			option = &so->so_linger;
4984 			len = (t_uscalar_t)sizeof (struct linger);
4985 			break;
4986 		case SO_SNDBUF: {
4987 			ssize_t lvalue;
4988 
4989 			/*
4990 			 * If the option has not been set then get a default
4991 			 * value from the read queue. This value is
4992 			 * returned if the transport fails
4993 			 * the T_SVR4_OPTMGMT_REQ.
4994 			 */
4995 			lvalue = so->so_sndbuf;
4996 			if (lvalue == 0) {
4997 				mutex_exit(&so->so_lock);
4998 				(void) strqget(strvp2wq(SOTOV(so))->q_next,
4999 				    QHIWAT, 0, &lvalue);
5000 				mutex_enter(&so->so_lock);
5001 				dprintso(so, 1,
5002 				    ("got SO_SNDBUF %ld from q\n", lvalue));
5003 			}
5004 			value = (int)lvalue;
5005 			option = &value;
5006 			len = (t_uscalar_t)sizeof (so->so_sndbuf);
5007 			break;
5008 		}
5009 		case SO_RCVBUF: {
5010 			ssize_t lvalue;
5011 
5012 			/*
5013 			 * If the option has not been set then get a default
5014 			 * value from the read queue. This value is
5015 			 * returned if the transport fails
5016 			 * the T_SVR4_OPTMGMT_REQ.
5017 			 *
5018 			 * XXX If SO_RCVBUF has been set and this is an
5019 			 * XPG 4.2 application then do not ask the transport
5020 			 * since the transport might adjust the value and not
5021 			 * return exactly what was set by the application.
5022 			 * For non-XPG 4.2 application we return the value
5023 			 * that the transport is actually using.
5024 			 */
5025 			lvalue = so->so_rcvbuf;
5026 			if (lvalue == 0) {
5027 				mutex_exit(&so->so_lock);
5028 				(void) strqget(RD(strvp2wq(SOTOV(so))),
5029 				    QHIWAT, 0, &lvalue);
5030 				mutex_enter(&so->so_lock);
5031 				dprintso(so, 1,
5032 				    ("got SO_RCVBUF %ld from q\n", lvalue));
5033 			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
5034 				value = (int)lvalue;
5035 				option = &value;
5036 				goto copyout;	/* skip asking transport */
5037 			}
5038 			value = (int)lvalue;
5039 			option = &value;
5040 			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5041 			break;
5042 		}
5043 		case SO_DOMAIN:
5044 			value = so->so_family;
5045 			option = &value;
5046 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5047 
5048 #ifdef notyet
5049 		/*
5050 		 * We do not implement the semantics of these options
5051 		 * thus we shouldn't implement the options either.
5052 		 */
5053 		case SO_SNDLOWAT:
5054 			value = so->so_sndlowat;
5055 			option = &value;
5056 			break;
5057 		case SO_RCVLOWAT:
5058 			value = so->so_rcvlowat;
5059 			option = &value;
5060 			break;
5061 		case SO_SNDTIMEO:
5062 			value = so->so_sndtimeo;
5063 			option = &value;
5064 			break;
5065 		case SO_RCVTIMEO:
5066 			value = so->so_rcvtimeo;
5067 			option = &value;
5068 			break;
5069 #endif /* notyet */
5070 		}
5071 	}
5072 
5073 	mutex_exit(&so->so_lock);
5074 
5075 	/* Send request */
5076 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5077 	optmgmt_req.MGMT_flags = T_CHECK;
5078 	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5079 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5080 
5081 	oh.level = level;
5082 	oh.name = option_name;
5083 	oh.len = maxlen;
5084 
5085 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5086 	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP);
5087 	/* Let option management work in the presence of data flow control */
5088 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5089 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5090 	mp = NULL;
5091 	mutex_enter(&so->so_lock);
5092 	if (error) {
5093 		eprintsoline(so, error);
5094 		goto done2;
5095 	}
5096 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5097 	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5098 	if (error) {
5099 		if (option != NULL) {
5100 			/* We have a fallback value */
5101 			error = 0;
5102 			goto copyout;
5103 		}
5104 		eprintsoline(so, error);
5105 		goto done2;
5106 	}
5107 	ASSERT(mp);
5108 	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5109 	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5110 	    optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5111 	if (opt_res == NULL) {
5112 		if (option != NULL) {
5113 			/* We have a fallback value */
5114 			error = 0;
5115 			goto copyout;
5116 		}
5117 		error = EPROTO;
5118 		eprintsoline(so, error);
5119 		goto done;
5120 	}
5121 	option = &opt_res[1];
5122 
5123 	/* check to ensure that the option is within bounds */
5124 	if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5125 	    (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5126 		if (option != NULL) {
5127 			/* We have a fallback value */
5128 			error = 0;
5129 			goto copyout;
5130 		}
5131 		error = EPROTO;
5132 		eprintsoline(so, error);
5133 		goto done;
5134 	}
5135 
5136 	len = opt_res->len;
5137 
5138 copyout: {
5139 		t_uscalar_t size = MIN(len, maxlen);
5140 		bcopy(option, optval, size);
5141 		bcopy(&size, optlenp, sizeof (size));
5142 	}
5143 done:
5144 	freemsg(mp);
5145 done2:
5146 	so_unlock_single(so, SOLOCKED);
5147 	mutex_exit(&so->so_lock);
5148 	return (error);
5149 }
5150 
5151 /*
5152  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5153  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5154  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5155  * setsockopt has to work even if the transport does not support the option.
5156  */
5157 int
5158 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5159 	const void *optval, t_uscalar_t optlen)
5160 {
5161 	struct T_optmgmt_req	optmgmt_req;
5162 	struct opthdr		oh;
5163 	mblk_t			*mp;
5164 	int			error = 0;
5165 	boolean_t		handled = B_FALSE;
5166 
5167 	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5168 	    (void *)so, level, option_name, optval, optlen,
5169 	    pr_state(so->so_state, so->so_mode)));
5170 
5171 
5172 	/* X/Open requires this check */
5173 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5174 		if (xnet_check_print)
5175 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
5176 		return (EINVAL);
5177 	}
5178 
5179 	/* Caller allocates aligned optval, or passes null */
5180 	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
5181 	/* If optval is null optlen is 0, and vice-versa */
5182 	ASSERT(optval != NULL || optlen == 0);
5183 	ASSERT(optlen != 0 || optval == NULL);
5184 
5185 	mutex_enter(&so->so_lock);
5186 	so_lock_single(so);	/* Set SOLOCKED */
5187 	mutex_exit(&so->so_lock);
5188 
5189 	/*
5190 	 * For SOCKET or TCP level options, try to set it here itself
5191 	 * provided socket has not been popped and we know the tcp
5192 	 * structure (stored in so_priv).
5193 	 */
5194 	if ((level == SOL_SOCKET || level == IPPROTO_TCP) &&
5195 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
5196 	    (so->so_version == SOV_SOCKSTREAM) && (so->so_priv != NULL)) {
5197 		tcp_t		*tcp = so->so_priv;
5198 		boolean_t	onoff;
5199 
5200 #define	intvalue	(*(int32_t *)optval)
5201 
5202 		switch (level) {
5203 		case SOL_SOCKET:
5204 			switch (option_name) {		/* Check length param */
5205 			case SO_DEBUG:
5206 			case SO_REUSEADDR:
5207 			case SO_DONTROUTE:
5208 			case SO_BROADCAST:
5209 			case SO_USELOOPBACK:
5210 			case SO_OOBINLINE:
5211 			case SO_DGRAM_ERRIND:
5212 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5213 					error = EINVAL;
5214 					eprintsoline(so, error);
5215 					mutex_enter(&so->so_lock);
5216 					goto done2;
5217 				}
5218 				ASSERT(optval);
5219 				onoff = intvalue != 0;
5220 				handled = B_TRUE;
5221 				break;
5222 			case SO_LINGER:
5223 				if (optlen !=
5224 				    (t_uscalar_t)sizeof (struct linger)) {
5225 					error = EINVAL;
5226 					eprintsoline(so, error);
5227 					mutex_enter(&so->so_lock);
5228 					goto done2;
5229 				}
5230 				ASSERT(optval);
5231 				handled = B_TRUE;
5232 				break;
5233 			}
5234 
5235 			switch (option_name) {			/* Do actions */
5236 			case SO_LINGER: {
5237 				struct linger *lgr = (struct linger *)optval;
5238 
5239 				if (lgr->l_onoff) {
5240 					tcp->tcp_linger = 1;
5241 					tcp->tcp_lingertime = lgr->l_linger;
5242 					so->so_linger.l_onoff = SO_LINGER;
5243 					so->so_options |= SO_LINGER;
5244 				} else {
5245 					tcp->tcp_linger = 0;
5246 					tcp->tcp_lingertime = 0;
5247 					so->so_linger.l_onoff = 0;
5248 					so->so_options &= ~SO_LINGER;
5249 				}
5250 				so->so_linger.l_linger = lgr->l_linger;
5251 				handled = B_TRUE;
5252 				break;
5253 			}
5254 			case SO_DEBUG:
5255 				tcp->tcp_debug = onoff;
5256 #ifdef SOCK_TEST
5257 				if (intvalue & 2)
5258 					sock_test_timelimit = 10 * hz;
5259 				else
5260 					sock_test_timelimit = 0;
5261 
5262 				if (intvalue & 4)
5263 					do_useracc = 0;
5264 				else
5265 					do_useracc = 1;
5266 #endif /* SOCK_TEST */
5267 				break;
5268 			case SO_DONTROUTE:
5269 				/*
5270 				 * SO_DONTROUTE, SO_USELOOPBACK and
5271 				 * SO_BROADCAST are only of interest to IP.
5272 				 * We track them here only so
5273 				 * that we can report their current value.
5274 				 */
5275 				tcp->tcp_dontroute = onoff;
5276 				if (onoff)
5277 					so->so_options |= option_name;
5278 				else
5279 					so->so_options &= ~option_name;
5280 				break;
5281 			case SO_USELOOPBACK:
5282 				tcp->tcp_useloopback = onoff;
5283 				if (onoff)
5284 					so->so_options |= option_name;
5285 				else
5286 					so->so_options &= ~option_name;
5287 				break;
5288 			case SO_BROADCAST:
5289 				tcp->tcp_broadcast = onoff;
5290 				if (onoff)
5291 					so->so_options |= option_name;
5292 				else
5293 					so->so_options &= ~option_name;
5294 				break;
5295 			case SO_REUSEADDR:
5296 				tcp->tcp_reuseaddr = onoff;
5297 				if (onoff)
5298 					so->so_options |= option_name;
5299 				else
5300 					so->so_options &= ~option_name;
5301 				break;
5302 			case SO_OOBINLINE:
5303 				tcp->tcp_oobinline = onoff;
5304 				if (onoff)
5305 					so->so_options |= option_name;
5306 				else
5307 					so->so_options &= ~option_name;
5308 				break;
5309 			case SO_DGRAM_ERRIND:
5310 				tcp->tcp_dgram_errind = onoff;
5311 				if (onoff)
5312 					so->so_options |= option_name;
5313 				else
5314 					so->so_options &= ~option_name;
5315 				break;
5316 			}
5317 			break;
5318 		case IPPROTO_TCP:
5319 			switch (option_name) {
5320 			case TCP_NODELAY:
5321 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5322 					error = EINVAL;
5323 					eprintsoline(so, error);
5324 					mutex_enter(&so->so_lock);
5325 					goto done2;
5326 				}
5327 				ASSERT(optval);
5328 				tcp->tcp_naglim = intvalue ? 1 : tcp->tcp_mss;
5329 				handled = B_TRUE;
5330 				break;
5331 			}
5332 			break;
5333 		default:
5334 			handled = B_FALSE;
5335 			break;
5336 		}
5337 	}
5338 
5339 	if (handled) {
5340 		mutex_enter(&so->so_lock);
5341 		goto done2;
5342 	}
5343 
5344 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5345 	optmgmt_req.MGMT_flags = T_NEGOTIATE;
5346 	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5347 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5348 
5349 	oh.level = level;
5350 	oh.name = option_name;
5351 	oh.len = optlen;
5352 
5353 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5354 	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP);
5355 	/* Let option management work in the presence of data flow control */
5356 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5357 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5358 	mp = NULL;
5359 	mutex_enter(&so->so_lock);
5360 	if (error) {
5361 		eprintsoline(so, error);
5362 		goto done;
5363 	}
5364 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5365 	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5366 	if (error) {
5367 		eprintsoline(so, error);
5368 		goto done;
5369 	}
5370 	ASSERT(mp);
5371 	/* No need to verify T_optmgmt_ack */
5372 	freemsg(mp);
5373 done:
5374 	/*
5375 	 * Check for SOL_SOCKET options and record their values.
5376 	 * If we know about a SOL_SOCKET parameter and the transport
5377 	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5378 	 * EPROTO) we let the setsockopt succeed.
5379 	 */
5380 	if (level == SOL_SOCKET) {
5381 		/* Check parameters */
5382 		switch (option_name) {
5383 		case SO_DEBUG:
5384 		case SO_REUSEADDR:
5385 		case SO_KEEPALIVE:
5386 		case SO_DONTROUTE:
5387 		case SO_BROADCAST:
5388 		case SO_USELOOPBACK:
5389 		case SO_OOBINLINE:
5390 		case SO_SNDBUF:
5391 		case SO_RCVBUF:
5392 #ifdef notyet
5393 		case SO_SNDLOWAT:
5394 		case SO_RCVLOWAT:
5395 		case SO_SNDTIMEO:
5396 		case SO_RCVTIMEO:
5397 #endif /* notyet */
5398 		case SO_DGRAM_ERRIND:
5399 			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5400 				error = EINVAL;
5401 				eprintsoline(so, error);
5402 				goto done2;
5403 			}
5404 			ASSERT(optval);
5405 			handled = B_TRUE;
5406 			break;
5407 		case SO_LINGER:
5408 			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5409 				error = EINVAL;
5410 				eprintsoline(so, error);
5411 				goto done2;
5412 			}
5413 			ASSERT(optval);
5414 			handled = B_TRUE;
5415 			break;
5416 		}
5417 
5418 #define	intvalue	(*(int32_t *)optval)
5419 
5420 		switch (option_name) {
5421 		case SO_TYPE:
5422 		case SO_ERROR:
5423 		case SO_ACCEPTCONN:
5424 			/* Can't be set */
5425 			error = ENOPROTOOPT;
5426 			goto done2;
5427 		case SO_LINGER: {
5428 			struct linger *l = (struct linger *)optval;
5429 
5430 			so->so_linger.l_linger = l->l_linger;
5431 			if (l->l_onoff) {
5432 				so->so_linger.l_onoff = SO_LINGER;
5433 				so->so_options |= SO_LINGER;
5434 			} else {
5435 				so->so_linger.l_onoff = 0;
5436 				so->so_options &= ~SO_LINGER;
5437 			}
5438 			break;
5439 		}
5440 
5441 		case SO_DEBUG:
5442 #ifdef SOCK_TEST
5443 			if (intvalue & 2)
5444 				sock_test_timelimit = 10 * hz;
5445 			else
5446 				sock_test_timelimit = 0;
5447 
5448 			if (intvalue & 4)
5449 				do_useracc = 0;
5450 			else
5451 				do_useracc = 1;
5452 #endif /* SOCK_TEST */
5453 			/* FALLTHRU */
5454 		case SO_REUSEADDR:
5455 		case SO_KEEPALIVE:
5456 		case SO_DONTROUTE:
5457 		case SO_BROADCAST:
5458 		case SO_USELOOPBACK:
5459 		case SO_OOBINLINE:
5460 		case SO_DGRAM_ERRIND:
5461 			if (intvalue != 0) {
5462 				dprintso(so, 1,
5463 				    ("sotpi_setsockopt: setting 0x%x\n",
5464 				    option_name));
5465 				so->so_options |= option_name;
5466 			} else {
5467 				dprintso(so, 1,
5468 				    ("sotpi_setsockopt: clearing 0x%x\n",
5469 				    option_name));
5470 				so->so_options &= ~option_name;
5471 			}
5472 			break;
5473 		/*
5474 		 * The following options are only returned by us when the
5475 		 * T_SVR4_OPTMGMT_REQ fails.
5476 		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5477 		 * since the transport might adjust the value and not
5478 		 * return exactly what was set by the application.
5479 		 */
5480 		case SO_SNDBUF:
5481 			so->so_sndbuf = intvalue;
5482 			break;
5483 		case SO_RCVBUF:
5484 			so->so_rcvbuf = intvalue;
5485 			break;
5486 #ifdef notyet
5487 		/*
5488 		 * We do not implement the semantics of these options
5489 		 * thus we shouldn't implement the options either.
5490 		 */
5491 		case SO_SNDLOWAT:
5492 			so->so_sndlowat = intvalue;
5493 			break;
5494 		case SO_RCVLOWAT:
5495 			so->so_rcvlowat = intvalue;
5496 			break;
5497 		case SO_SNDTIMEO:
5498 			so->so_sndtimeo = intvalue;
5499 			break;
5500 		case SO_RCVTIMEO:
5501 			so->so_rcvtimeo = intvalue;
5502 			break;
5503 #endif /* notyet */
5504 		}
5505 #undef	intvalue
5506 
5507 		if (error) {
5508 			if ((error == ENOPROTOOPT || error == EPROTO ||
5509 			    error == EINVAL) && handled) {
5510 				dprintso(so, 1,
5511 				    ("setsockopt: ignoring error %d for 0x%x\n",
5512 				    error, option_name));
5513 				error = 0;
5514 			}
5515 		}
5516 	}
5517 done2:
5518 ret:
5519 	so_unlock_single(so, SOLOCKED);
5520 	mutex_exit(&so->so_lock);
5521 	return (error);
5522 }
5523