xref: /titanic_44/usr/src/uts/common/fs/sockfs/socktpi.c (revision 551bc2a66868b5cb5be6b70ab9f55515e77a39a9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/t_lock.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/buf.h>
34 #include <sys/conf.h>
35 #include <sys/cred.h>
36 #include <sys/kmem.h>
37 #include <sys/sysmacros.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/debug.h>
41 #include <sys/errno.h>
42 #include <sys/time.h>
43 #include <sys/file.h>
44 #include <sys/open.h>
45 #include <sys/user.h>
46 #include <sys/termios.h>
47 #include <sys/stream.h>
48 #include <sys/strsubr.h>
49 #include <sys/strsun.h>
50 #include <sys/ddi.h>
51 #include <sys/esunddi.h>
52 #include <sys/flock.h>
53 #include <sys/modctl.h>
54 #include <sys/vtrace.h>
55 #include <sys/cmn_err.h>
56 #include <sys/pathname.h>
57 
58 #include <sys/socket.h>
59 #include <sys/socketvar.h>
60 #include <sys/sockio.h>
61 #include <netinet/in.h>
62 #include <sys/un.h>
63 #include <sys/strsun.h>
64 
65 #include <sys/tiuser.h>
66 #define	_SUN_TPI_VERSION	2
67 #include <sys/tihdr.h>
68 #include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
69 
70 #include <c2/audit.h>
71 
72 #include <inet/common.h>
73 #include <inet/ip.h>
74 #include <inet/ip6.h>
75 #include <inet/tcp.h>
76 #include <inet/udp_impl.h>
77 
78 #include <sys/zone.h>
79 
80 #include <fs/sockfs/nl7c.h>
81 #include <fs/sockfs/nl7curi.h>
82 
83 #include <inet/kssl/ksslapi.h>
84 
85 /*
86  * Possible failures when memory can't be allocated. The documented behavior:
87  *
88  * 		5.5:			4.X:		XNET:
89  * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
90  *							EINTR
91  *	(4.X does not document EINTR but returns it)
92  * bind:	ENOSR			-		ENOBUFS/ENOSR
93  * connect: 	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
94  * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
95  * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
96  *	(4.X getpeername and getsockname do not fail in practice)
97  * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
98  * listen:	-			-		ENOBUFS
99  * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
100  *							EINTR
101  * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
102  *							EINTR
103  * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
104  * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
105  * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
106  * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
107  *
108  * Resolution. When allocation fails:
109  *	recv: return EINTR
110  *	send: return EINTR
111  *	connect, accept: EINTR
112  *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
113  *	socket, socketpair: ENOBUFS
114  *	getpeername, getsockname: sleep
115  *	getsockopt, setsockopt: sleep
116  */
117 
118 #ifdef SOCK_TEST
119 /*
120  * Knobs that make sockfs deviate from standard TPI for the AF_INET
121  * transports (variables under SOCK_TEST, fixed #defines otherwise).
122  *
123  * solisten_tpi_tcp:
124  *	TCP can handle a O_T_BIND_REQ with an increased backlog even though
125  *	the transport is already bound. This is needed to avoid losing the
126  *	port number should listen() do a T_UNBIND_REQ followed by a
127  *	O_T_BIND_REQ.
128  *
129  * soconnect_tpi_udp:
130  *	UDP and ICMP can handle a T_CONN_REQ.
131  *	This is needed to make the sequence of connect(), getsockname()
132  *	return the local IP address used to send packets to the
133  *	connected-to destination.
134  *
135  * soconnect_tpi_tcp:
136  *	TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
137  *	Set this to non-zero to send TPI conformant messages to TCP in this
138  *	respect. This is a performance optimization.
139  *
140  * soaccept_tpi_tcp:
141  *	TCP can handle a T_CONN_REQ without the acceptor being bound.
142  *	This is a performance optimization that has been picked up in XTI.
143  *
144  * soaccept_tpi_multioptions:
145  *	When inheriting SOL_SOCKET options from the listener to the accepting
146  *	socket send them as a single message for AF_INET{,6}.
147  */
148 int solisten_tpi_tcp = 0;
149 int soconnect_tpi_udp = 0;
150 int soconnect_tpi_tcp = 0;
151 int soaccept_tpi_tcp = 0;
152 int soaccept_tpi_multioptions = 1;
153 #else /* !SOCK_TEST: same values, fixed at compile time */
154 #define	soconnect_tpi_tcp	0
155 #define	soconnect_tpi_udp	0
156 #define	solisten_tpi_tcp	0
157 #define	soaccept_tpi_tcp	0
158 #define	soaccept_tpi_multioptions	1
159 #endif /* SOCK_TEST */
160 
161 #ifdef SOCK_TEST
162 extern int do_useracc;			/* test-only; defined elsewhere */
163 extern clock_t sock_test_timelimit;	/* test-only; defined elsewhere */
164 #endif /* SOCK_TEST */
165 
166 /*
167  * Some X/Open added checks might have to be backed out to keep SunOS 4.X
168  * applications working. Set xnet_skip_checks non-zero to disable them.
169  */
170 int xnet_skip_checks = 0;
171 int xnet_check_print = 0;	/* non-zero: printf when a check fails a call */
172 int xnet_truncate_print = 0;	/* use not visible in this part of the file */
173 
174 extern	void sigintr(k_sigset_t *, int);
175 extern	void sigunintr(k_sigset_t *);
176 /* NL7C address hooks; the bind path calls lookup/add with (addr, addrlen) */
177 extern	void *nl7c_lookup_addr(void *, t_uscalar_t);
178 extern	void *nl7c_add_addr(void *, t_uscalar_t);
179 extern	void nl7c_listener_addr(void *, struct sonode *);
180 
181 /* For sockets acting as an in-kernel SSL proxy; defined outside this file */
182 extern mblk_t	*strsock_kssl_input(vnode_t *, mblk_t *, strwakeup_t *,
183 		    strsigset_t *, strsigset_t *, strpollset_t *);
184 extern mblk_t	*strsock_kssl_output(vnode_t *, mblk_t *, strwakeup_t *,
185 		    strsigset_t *, strsigset_t *, strpollset_t *);
186 
187 static int	sotpi_unbind(struct sonode *, int);
188 
189 /* TPI sockfs sonode operations, then two helpers that are not table slots */
190 static int	sotpi_accept(struct sonode *, int, struct sonode **);
191 static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
192 		    int);
193 static int	sotpi_connect(struct sonode *, const struct sockaddr *,
194 		    socklen_t, int, int);
195 static int	sotpi_listen(struct sonode *, int);
196 static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
197 		    struct uio *);
198 static int	sotpi_shutdown(struct sonode *, int);
199 static int	sotpi_getsockname(struct sonode *);
200 static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
201 		    struct uio *, void *, t_uscalar_t, int);
202 static int	sodgram_direct(struct sonode *, struct sockaddr *,
203 		    socklen_t, struct uio *, int);
204 /* TPI socket operations vector; each entry is tagged with its sop_* slot */
205 sonodeops_t sotpi_sonodeops = {
206 	sotpi_accept,		/* sop_accept		*/
207 	sotpi_bind,		/* sop_bind		*/
208 	sotpi_listen,		/* sop_listen		*/
209 	sotpi_connect,		/* sop_connect		*/
210 	sotpi_recvmsg,		/* sop_recvmsg		*/
211 	sotpi_sendmsg,		/* sop_sendmsg		*/
212 	sotpi_getpeername,	/* sop_getpeername	*/
213 	sotpi_getsockname,	/* sop_getsockname	*/
214 	sotpi_shutdown,		/* sop_shutdown		*/
215 	sotpi_getsockopt,	/* sop_getsockopt	*/
216 	sotpi_setsockopt	/* sop_setsockopt	*/
217 };
218 
219 /*
220  * Shared constructor for the socket and accept paths.  When tso is
221  * non-NULL, values from that node are used (tso is handed to
222  * so_strinit()) instead of issuing a T_INFO_REQ.
223  *
224  * The caller must already have a VN_HOLD on accessvp.  The matching
225  * VN_RELE happens either when sotpi_create() fails or when the returned
226  * sonode is freed.
227  *
228  * Returns the new sonode, or NULL with the errno value left in *errorp.
229  */
230 struct sonode *
231 sotpi_create(vnode_t *accessvp, int domain, int type, int protocol, int version,
232     struct sonode *tso, int *errorp)
233 {
234 	struct sonode	*so;
235 	vnode_t		*vp;
236 	int		oflags = FREAD|FWRITE;
237 	int		is_inet_sock;
238 	int		err;
239 
240 	ASSERT(accessvp != NULL);
241 	vp = makesockvp(accessvp, domain, type, protocol);
242 	ASSERT(vp != NULL);
243 	so = VTOSO(vp);
244 
245 	is_inet_sock = (type == SOCK_STREAM || type == SOCK_DGRAM) &&
246 	    (domain == AF_INET || domain == AF_INET6) &&
247 	    (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP ||
248 	    protocol == IPPROTO_IP);
249 
250 	if (is_inet_sock) {
251 		/* Let tcp or udp know that it is talking to sockets */
252 		oflags |= SO_SOCKSTR;
253 
254 		/*
255 		 * Ask for direct calls between sockfs and the transport;
256 		 * socktpi_open() makes the final decision.
257 		 */
258 		so->so_state |= SS_DIRECT;
259 
260 		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
261 		if (so->so_type == SOCK_STREAM && tso != NULL) {
262 			if (!(tso->so_state & SS_DIRECT)) {
263 				/*
264 				 * SS_DIRECT is clear on the listener: it
265 				 * was converted from a socket to a stream,
266 				 * and the acceptor inherits those settings.
267 				 */
268 				so->so_state &= ~SS_DIRECT;
269 				oflags &= ~SO_SOCKSTR;
270 			} else {
271 				/*
272 				 * SS_DIRECT is inherited from the listener;
273 				 * SO_ACCEPTOR tells tcp that this open is an
274 				 * accept fast-path instance.
275 				 */
276 				oflags |= SO_ACCEPTOR;
277 			}
278 		}
279 	}
280 
281 	/* The local (AF_UNIX) transport is also told it talks to sockets */
282 	if (so->so_family == AF_UNIX)
283 		oflags |= SO_SOCKSTR;
284 
285 	/* Start out with the kernel SSL proxy fields cleared */
286 	so->so_kssl_ctx = NULL;
287 	so->so_kssl_ent = NULL;
288 	so->so_kssl_type = KSSL_NO_PROXY;
289 
290 	err = socktpi_open(&vp, oflags, CRED(), NULL);
291 	if (err != 0)
292 		goto fail;
293 
294 	err = so_strinit(so, tso);
295 	if (err != 0) {
296 		/* Close what socktpi_open() opened before releasing vp */
297 		(void) VOP_CLOSE(vp, 0, 1, 0, CRED(), NULL);
298 		goto fail;
299 	}
300 
301 	so->so_version = (short)((version == SOV_DEFAULT) ?
302 	    so_default_version : version);
303 	return (so);
304 
305 fail:
306 	VN_RELE(vp);
307 	*errorp = err;
308 	return (NULL);
309 }
310 
311 /*
312  * Mark the socket as bound to an unspecified address within sockfs only.
313  * Used for TCP/UDP transports, where the O_T_BIND_REQ is known not to be
314  * needed in every case.  Caller holds so_lock; socket is not yet bound.
315  */
316 static void
317 so_automatic_bind(struct sonode *so)
318 {
319 	ASSERT(so->so_family == AF_INET6 || so->so_family == AF_INET);
320 
321 	ASSERT(MUTEX_HELD(&so->so_lock));
322 	ASSERT((so->so_state & SS_ISBOUND) == 0);
323 	ASSERT(so->so_unbind_mp != NULL);
324 	ASSERT(so->so_laddr_maxlen >= so->so_laddr_len);
325 
326 	/* Zeroed local address that carries only the address family */
327 	bzero(so->so_laddr_sa, so->so_laddr_len);
328 	so->so_laddr_sa->sa_family = so->so_family;
329 	so->so_state |= SS_ISBOUND;
330 }
330 
331 
332 /*
333  * bind the socket.
334  *
335  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
336  * are passed in we allow rebinding. Note that for backwards compatibility
337  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
338  * Thus the rebinding code is currently not executed.
339  *
340  * The constraints for rebinding are:
341  * - it is a SOCK_DGRAM, or
342  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
343  *   and no listen() has been done.
344  * This rebinding code was added based on some language in the XNET book
345  * about not returning EINVAL if the protocol allows rebinding. However,
346  * this language is not present in the Posix socket draft. Thus maybe the
347  * rebinding logic should be deleted from the source.
348  *
349  * A null "name" can be used to unbind the socket if:
350  * - it is a SOCK_DGRAM, or
351  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
352  *   and no listen() has been done.
353  */
354 static int
355 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
356     socklen_t namelen, int backlog, int flags)
357 {
358 	struct T_bind_req	bind_req;
359 	struct T_bind_ack	*bind_ack;
360 	int			error = 0;
361 	mblk_t			*mp;
362 	void			*addr;
363 	t_uscalar_t		addrlen;
364 	int			unbind_on_err = 1;
365 	boolean_t		clear_acceptconn_on_err = B_FALSE;
366 	boolean_t		restore_backlog_on_err = B_FALSE;
367 	int			save_so_backlog;
368 	t_scalar_t		PRIM_type = O_T_BIND_REQ;
369 	boolean_t		tcp_udp_xport;
370 	void			*nl7c = NULL;
371 
372 	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
373 	    so, name, namelen, backlog, flags,
374 	    pr_state(so->so_state, so->so_mode)));
375 
376 	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
377 
378 	if (!(flags & _SOBIND_LOCK_HELD)) {
379 		mutex_enter(&so->so_lock);
380 		so_lock_single(so);	/* Set SOLOCKED */
381 	} else {
382 		ASSERT(MUTEX_HELD(&so->so_lock));
383 		ASSERT(so->so_flag & SOLOCKED);
384 	}
385 
386 	/*
387 	 * Make sure that there is a preallocated unbind_req message
388 	 * before binding. This message is allocated when the socket is
389 	 * created but it might have been consumed.
390 	 */
391 	if (so->so_unbind_mp == NULL) {
392 		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
393 		/* NOTE: holding so_lock while sleeping */
394 		so->so_unbind_mp =
395 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP);
396 	}
397 
398 	if (flags & _SOBIND_REBIND) {
399 		/*
400 		 * Called from solisten after doing an sotpi_unbind() or
401 		 * potentially without the unbind (latter for AF_INET{,6}).
402 		 */
403 		ASSERT(name == NULL && namelen == 0);
404 
405 		if (so->so_family == AF_UNIX) {
406 			ASSERT(so->so_ux_bound_vp);
407 			addr = &so->so_ux_laddr;
408 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
409 			dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
410 			    "addr 0x%p, vp %p\n",
411 			    addrlen,
412 			    ((struct so_ux_addr *)addr)->soua_vp,
413 			    so->so_ux_bound_vp));
414 		} else {
415 			addr = so->so_laddr_sa;
416 			addrlen = (t_uscalar_t)so->so_laddr_len;
417 		}
418 	} else if (flags & _SOBIND_UNSPEC) {
419 		ASSERT(name == NULL && namelen == 0);
420 
421 		/*
422 		 * The caller checked SS_ISBOUND but not necessarily
423 		 * under so_lock
424 		 */
425 		if (so->so_state & SS_ISBOUND) {
426 			/* No error */
427 			goto done;
428 		}
429 
430 		/* Set an initial local address */
431 		switch (so->so_family) {
432 		case AF_UNIX:
433 			/*
434 			 * Use an address with same size as struct sockaddr
435 			 * just like BSD.
436 			 */
437 			so->so_laddr_len =
438 			    (socklen_t)sizeof (struct sockaddr);
439 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
440 			bzero(so->so_laddr_sa, so->so_laddr_len);
441 			so->so_laddr_sa->sa_family = so->so_family;
442 
443 			/*
444 			 * Pass down an address with the implicit bind
445 			 * magic number and the rest all zeros.
446 			 * The transport will return a unique address.
447 			 */
448 			so->so_ux_laddr.soua_vp = NULL;
449 			so->so_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
450 			addr = &so->so_ux_laddr;
451 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
452 			break;
453 
454 		case AF_INET:
455 		case AF_INET6:
456 			/*
457 			 * An unspecified bind in TPI has a NULL address.
458 			 * Set the address in sockfs to have the sa_family.
459 			 */
460 			so->so_laddr_len = (so->so_family == AF_INET) ?
461 			    (socklen_t)sizeof (sin_t) :
462 			    (socklen_t)sizeof (sin6_t);
463 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
464 			bzero(so->so_laddr_sa, so->so_laddr_len);
465 			so->so_laddr_sa->sa_family = so->so_family;
466 			addr = NULL;
467 			addrlen = 0;
468 			break;
469 
470 		default:
471 			/*
472 			 * An unspecified bind in TPI has a NULL address.
473 			 * Set the address in sockfs to be zero length.
474 			 *
475 			 * Can not assume there is a sa_family for all
476 			 * protocol families. For example, AF_X25 does not
477 			 * have a family field.
478 			 */
479 			bzero(so->so_laddr_sa, so->so_laddr_len);
480 			so->so_laddr_len = 0;	/* XXX correct? */
481 			addr = NULL;
482 			addrlen = 0;
483 			break;
484 		}
485 
486 	} else {
487 		if (so->so_state & SS_ISBOUND) {
488 			/*
489 			 * If it is ok to rebind the socket, first unbind
490 			 * with the transport. A rebind to the NULL address
491 			 * is interpreted as an unbind.
492 			 * Note that a bind to NULL in BSD does unbind the
493 			 * socket but it fails with EINVAL.
494 			 * Note that regular sockets set SOV_SOCKBSD i.e.
495 			 * _SOBIND_SOCKBSD gets set here hence no type of
496 			 * socket does currently allow rebinding.
497 			 *
498 			 * If the name is NULL just do an unbind.
499 			 */
500 			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
501 			    name != NULL) {
502 				error = EINVAL;
503 				unbind_on_err = 0;
504 				eprintsoline(so, error);
505 				goto done;
506 			}
507 			if ((so->so_mode & SM_CONNREQUIRED) &&
508 			    (so->so_state & SS_CANTREBIND)) {
509 				error = EINVAL;
510 				unbind_on_err = 0;
511 				eprintsoline(so, error);
512 				goto done;
513 			}
514 			error = sotpi_unbind(so, 0);
515 			if (error) {
516 				eprintsoline(so, error);
517 				goto done;
518 			}
519 			ASSERT(!(so->so_state & SS_ISBOUND));
520 			if (name == NULL) {
521 				so->so_state &=
522 				    ~(SS_ISCONNECTED|SS_ISCONNECTING);
523 				goto done;
524 			}
525 		}
526 		/* X/Open requires this check */
527 		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
528 			if (xnet_check_print) {
529 				printf("sockfs: X/Open bind state check "
530 				    "caused EINVAL\n");
531 			}
532 			error = EINVAL;
533 			goto done;
534 		}
535 
536 		switch (so->so_family) {
537 		case AF_UNIX:
538 			/*
539 			 * All AF_UNIX addresses are nul terminated
540 			 * when copied (copyin_name) in so the minimum
541 			 * length is 3 bytes.
542 			 */
543 			if (name == NULL ||
544 			    (ssize_t)namelen <= sizeof (short) + 1) {
545 				error = EISDIR;
546 				eprintsoline(so, error);
547 				goto done;
548 			}
549 			/*
550 			 * Verify so_family matches the bound family.
551 			 * BSD does not check this for AF_UNIX resulting
552 			 * in funny mknods.
553 			 */
554 			if (name->sa_family != so->so_family) {
555 				error = EAFNOSUPPORT;
556 				goto done;
557 			}
558 			break;
559 		case AF_INET:
560 			if (name == NULL) {
561 				error = EINVAL;
562 				eprintsoline(so, error);
563 				goto done;
564 			}
565 			if ((size_t)namelen != sizeof (sin_t)) {
566 				error = name->sa_family != so->so_family ?
567 				    EAFNOSUPPORT : EINVAL;
568 				eprintsoline(so, error);
569 				goto done;
570 			}
571 			if ((flags & _SOBIND_XPG4_2) &&
572 			    (name->sa_family != so->so_family)) {
573 				/*
574 				 * This check has to be made for X/Open
575 				 * sockets however application failures have
576 				 * been observed when it is applied to
577 				 * all sockets.
578 				 */
579 				error = EAFNOSUPPORT;
580 				eprintsoline(so, error);
581 				goto done;
582 			}
583 			/*
584 			 * Force a zero sa_family to match so_family.
585 			 *
586 			 * Some programs like inetd(1M) don't set the
587 			 * family field. Other programs leave
588 			 * sin_family set to garbage - SunOS 4.X does
589 			 * not check the family field on a bind.
590 			 * We use the family field that
591 			 * was passed in to the socket() call.
592 			 */
593 			name->sa_family = so->so_family;
594 			break;
595 
596 		case AF_INET6: {
597 #ifdef DEBUG
598 			sin6_t *sin6 = (sin6_t *)name;
599 #endif /* DEBUG */
600 
601 			if (name == NULL) {
602 				error = EINVAL;
603 				eprintsoline(so, error);
604 				goto done;
605 			}
606 			if ((size_t)namelen != sizeof (sin6_t)) {
607 				error = name->sa_family != so->so_family ?
608 				    EAFNOSUPPORT : EINVAL;
609 				eprintsoline(so, error);
610 				goto done;
611 			}
612 			if (name->sa_family != so->so_family) {
613 				/*
614 				 * With IPv6 we require the family to match
615 				 * unlike in IPv4.
616 				 */
617 				error = EAFNOSUPPORT;
618 				eprintsoline(so, error);
619 				goto done;
620 			}
621 #ifdef DEBUG
622 			/*
623 			 * Verify that apps don't forget to clear
624 			 * sin6_scope_id etc
625 			 */
626 			if (sin6->sin6_scope_id != 0 &&
627 			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
628 				zcmn_err(getzoneid(), CE_WARN,
629 				    "bind with uninitialized sin6_scope_id "
630 				    "(%d) on socket. Pid = %d\n",
631 				    (int)sin6->sin6_scope_id,
632 				    (int)curproc->p_pid);
633 			}
634 			if (sin6->__sin6_src_id != 0) {
635 				zcmn_err(getzoneid(), CE_WARN,
636 				    "bind with uninitialized __sin6_src_id "
637 				    "(%d) on socket. Pid = %d\n",
638 				    (int)sin6->__sin6_src_id,
639 				    (int)curproc->p_pid);
640 			}
641 #endif /* DEBUG */
642 			break;
643 		}
644 		default:
645 			/*
646 			 * Don't do any length or sa_family check to allow
647 			 * non-sockaddr style addresses.
648 			 */
649 			if (name == NULL) {
650 				error = EINVAL;
651 				eprintsoline(so, error);
652 				goto done;
653 			}
654 			break;
655 		}
656 
657 		if (namelen > (t_uscalar_t)so->so_laddr_maxlen) {
658 			error = ENAMETOOLONG;
659 			eprintsoline(so, error);
660 			goto done;
661 		}
662 		/*
663 		 * Save local address.
664 		 */
665 		so->so_laddr_len = (socklen_t)namelen;
666 		ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
667 		bcopy(name, so->so_laddr_sa, namelen);
668 
669 		addr = so->so_laddr_sa;
670 		addrlen = (t_uscalar_t)so->so_laddr_len;
671 		switch (so->so_family) {
672 		case AF_INET6:
673 		case AF_INET:
674 			break;
675 		case AF_UNIX: {
676 			struct sockaddr_un *soun =
677 			    (struct sockaddr_un *)so->so_laddr_sa;
678 			struct vnode *vp;
679 			struct vattr vattr;
680 
681 			ASSERT(so->so_ux_bound_vp == NULL);
682 			/*
683 			 * Create vnode for the specified path name.
684 			 * Keep vnode held with a reference in so_ux_bound_vp.
685 			 * Use the vnode pointer as the address used in the
686 			 * bind with the transport.
687 			 *
688 			 * Use the same mode as in BSD. In particular this does
689 			 * not observe the umask.
690 			 */
691 			/* MAXPATHLEN + soun_family + nul termination */
692 			if (so->so_laddr_len >
693 			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
694 				error = ENAMETOOLONG;
695 				eprintsoline(so, error);
696 				goto done;
697 			}
698 			vattr.va_type = VSOCK;
699 			vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
700 			vattr.va_mask = AT_TYPE|AT_MODE;
701 			/* NOTE: holding so_lock */
702 			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
703 			    EXCL, 0, &vp, CRMKNOD, 0, 0);
704 			if (error) {
705 				if (error == EEXIST)
706 					error = EADDRINUSE;
707 				eprintsoline(so, error);
708 				goto done;
709 			}
710 			/*
711 			 * Establish pointer from the underlying filesystem
712 			 * vnode to the socket node.
713 			 * so_ux_bound_vp and v_stream->sd_vnode form the
714 			 * cross-linkage between the underlying filesystem
715 			 * node and the socket node.
716 			 */
717 			ASSERT(SOTOV(so)->v_stream);
718 			mutex_enter(&vp->v_lock);
719 			vp->v_stream = SOTOV(so)->v_stream;
720 			so->so_ux_bound_vp = vp;
721 			mutex_exit(&vp->v_lock);
722 
723 			/*
724 			 * Use the vnode pointer value as a unique address
725 			 * (together with the magic number to avoid conflicts
726 			 * with implicit binds) in the transport provider.
727 			 */
728 			so->so_ux_laddr.soua_vp = (void *)so->so_ux_bound_vp;
729 			so->so_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
730 			addr = &so->so_ux_laddr;
731 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
732 			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
733 			    addrlen,
734 			    ((struct so_ux_addr *)addr)->soua_vp));
735 			break;
736 		}
737 		} /* end switch (so->so_family) */
738 	}
739 
740 	/*
741 	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
742 	 * the transport can start passing up T_CONN_IND messages
743 	 * as soon as it receives the bind req and strsock_proto()
744 	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
745 	 */
746 	if (flags & _SOBIND_LISTEN) {
747 		if ((so->so_state & SS_ACCEPTCONN) == 0)
748 			clear_acceptconn_on_err = B_TRUE;
749 		save_so_backlog = so->so_backlog;
750 		restore_backlog_on_err = B_TRUE;
751 		so->so_state |= SS_ACCEPTCONN;
752 		so->so_backlog = backlog;
753 	}
754 
755 	/*
756 	 * If NL7C addr(s) have been configured check for addr/port match,
757 	 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
758 	 *
759 	 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
760 	 * family sockets only. If match mark as such.
761 	 */
762 	if (nl7c_enabled && ((addr != NULL &&
763 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
764 	    (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
765 	    so->so_nl7c_flags == NL7C_AF_NCA)) {
766 		/*
767 		 * NL7C is not supported in non-global zones,
768 		 * we enforce this restriction here.
769 		 */
770 		if (so->so_zoneid == GLOBAL_ZONEID) {
771 			/* An NL7C socket, mark it */
772 			so->so_nl7c_flags |= NL7C_ENABLED;
773 			if (nl7c == NULL) {
774 				/*
775 				 * Was an AF_NCA bind() so add it to the
776 				 * addr list for reporting purposes.
777 				 */
778 				nl7c = nl7c_add_addr(addr, addrlen);
779 			}
780 		} else
781 			nl7c = NULL;
782 	}
783 	/*
784 	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
785 	 * for other transports we will send in a O_T_BIND_REQ.
786 	 */
787 	if (tcp_udp_xport &&
788 	    (so->so_family == AF_INET || so->so_family == AF_INET6))
789 		PRIM_type = T_BIND_REQ;
790 
791 	bind_req.PRIM_type = PRIM_type;
792 	bind_req.ADDR_length = addrlen;
793 	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
794 	bind_req.CONIND_number = backlog;
795 	/* NOTE: holding so_lock while sleeping */
796 	mp = soallocproto2(&bind_req, sizeof (bind_req),
797 	    addr, addrlen, 0, _ALLOC_SLEEP);
798 	so->so_state &= ~SS_LADDR_VALID;
799 
800 	/* Done using so_laddr_sa - can drop the lock */
801 	mutex_exit(&so->so_lock);
802 
803 	/*
804 	 * Intercept the bind_req message here to check if this <address/port>
805 	 * was configured as an SSL proxy server, or if another endpoint was
806 	 * already configured to act as a proxy for us.
807 	 *
808 	 * Note, only if NL7C not enabled for this socket.
809 	 */
810 	if (nl7c == NULL &&
811 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
812 	    so->so_type == SOCK_STREAM) {
813 
814 		if (so->so_kssl_ent != NULL) {
815 			kssl_release_ent(so->so_kssl_ent, so, so->so_kssl_type);
816 			so->so_kssl_ent = NULL;
817 		}
818 
819 		so->so_kssl_type = kssl_check_proxy(mp, so, &so->so_kssl_ent);
820 		switch (so->so_kssl_type) {
821 		case KSSL_NO_PROXY:
822 			break;
823 
824 		case KSSL_HAS_PROXY:
825 			mutex_enter(&so->so_lock);
826 			goto skip_transport;
827 
828 		case KSSL_IS_PROXY:
829 			break;
830 		}
831 	}
832 
833 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
834 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
835 	if (error) {
836 		eprintsoline(so, error);
837 		mutex_enter(&so->so_lock);
838 		goto done;
839 	}
840 
841 	mutex_enter(&so->so_lock);
842 	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
843 	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
844 	if (error) {
845 		eprintsoline(so, error);
846 		goto done;
847 	}
848 skip_transport:
849 	ASSERT(mp);
850 	/*
851 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
852 	 * strsock_proto while the lock was dropped above, the bind
853 	 * is allowed to complete.
854 	 */
855 
856 	/* Mark as bound. This will be undone if we detect errors below. */
857 	if (flags & _SOBIND_NOXLATE) {
858 		ASSERT(so->so_family == AF_UNIX);
859 		so->so_state |= SS_FADDR_NOXLATE;
860 	}
861 	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
862 	so->so_state |= SS_ISBOUND;
863 	ASSERT(so->so_unbind_mp);
864 
865 	/* note that we've already set SS_ACCEPTCONN above */
866 
867 	/*
868 	 * Recompute addrlen - an unspecified bind sent down an
869 	 * address of length zero but we expect the appropriate length
870 	 * in return.
871 	 */
872 	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
873 	    sizeof (so->so_ux_laddr) : so->so_laddr_len);
874 
875 	bind_ack = (struct T_bind_ack *)mp->b_rptr;
876 	/*
877 	 * The alignment restriction is really too strict but
878 	 * we want enough alignment to inspect the fields of
879 	 * a sockaddr_in.
880 	 */
881 	addr = sogetoff(mp, bind_ack->ADDR_offset,
882 	    bind_ack->ADDR_length,
883 	    __TPI_ALIGN_SIZE);
884 	if (addr == NULL) {
885 		freemsg(mp);
886 		error = EPROTO;
887 		eprintsoline(so, error);
888 		goto done;
889 	}
890 	if (!(flags & _SOBIND_UNSPEC)) {
891 		/*
892 		 * Verify that the transport didn't return something we
893 		 * did not want e.g. an address other than what we asked for.
894 		 *
895 		 * NOTE: These checks would go away if/when we switch to
896 		 * using the new TPI (in which the transport would fail
897 		 * the request instead of assigning a different address).
898 		 *
899 		 * NOTE2: For protocols that we don't know (i.e. any
900 		 * other than AF_INET6, AF_INET and AF_UNIX), we
901 		 * cannot know if the transport should be expected to
902 		 * return the same address as that requested.
903 		 *
904 		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
905 		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
906 		 *
907 		 * For example, in the case of netatalk it may be
908 		 * inappropriate for the transport to return the
909 		 * requested address (as it may have allocated a local
910 		 * port number in behaviour similar to that of an
911 		 * AF_INET bind request with a port number of zero).
912 		 *
913 		 * Given the definition of O_T_BIND_REQ, where the
914 		 * transport may bind to an address other than the
915 		 * requested address, it's not possible to determine
916 		 * whether a returned address that differs from the
917 		 * requested address is a reason to fail (because the
918 		 * requested address was not available) or succeed
919 		 * (because the transport allocated an appropriate
920 		 * address and/or port).
921 		 *
922 		 * sockfs currently requires that the transport return
923 		 * the requested address in the T_BIND_ACK, unless
924 		 * there is code here to allow for any discrepancy.
925 		 * Such code exists for AF_INET and AF_INET6.
926 		 *
927 		 * Netatalk chooses to return the requested address
928 		 * rather than the (correct) allocated address.  This
929 		 * means that netatalk violates the TPI specification
930 		 * (and would not function correctly if used from a
931 		 * TLI application), but it does mean that it works
932 		 * with sockfs.
933 		 *
934 		 * As noted above, using the newer XTI bind primitive
935 		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
936 		 * allow sockfs to be more sure about whether or not
937 		 * the bind request had succeeded (as transports are
938 		 * not permitted to bind to a different address than
939 		 * that requested - they must return failure).
940 		 * Unfortunately, support for T_BIND_REQ may not be
941 		 * present in all transport implementations (netatalk,
942 		 * for example, doesn't have it), making the
943 		 * transition difficult.
944 		 */
945 		if (bind_ack->ADDR_length != addrlen) {
946 			/* Assumes that the requested address was in use */
947 			freemsg(mp);
948 			error = EADDRINUSE;
949 			eprintsoline(so, error);
950 			goto done;
951 		}
952 
953 		switch (so->so_family) {
954 		case AF_INET6:
955 		case AF_INET: {
956 			sin_t *rname, *aname;
957 
958 			rname = (sin_t *)addr;
959 			aname = (sin_t *)so->so_laddr_sa;
960 
961 			/*
962 			 * Take advantage of the alignment
963 			 * of sin_port and sin6_port which fall
964 			 * in the same place in their data structures.
965 			 * Just use sin_port for either address family.
966 			 *
967 			 * This may become a problem if (heaven forbid)
968 			 * there's a separate ipv6port_reserved... :-P
969 			 *
970 			 * Binding to port 0 has the semantics of letting
971 			 * the transport bind to any port.
972 			 *
973 			 * If the transport is TCP or UDP since we had sent
974 			 * a T_BIND_REQ we would not get a port other than
975 			 * what we asked for.
976 			 */
977 			if (tcp_udp_xport) {
978 				/*
979 				 * Pick up the new port number if we bound to
980 				 * port 0.
981 				 */
982 				if (aname->sin_port == 0)
983 					aname->sin_port = rname->sin_port;
984 				so->so_state |= SS_LADDR_VALID;
985 				break;
986 			}
987 			if (aname->sin_port != 0 &&
988 			    aname->sin_port != rname->sin_port) {
989 				freemsg(mp);
990 				error = EADDRINUSE;
991 				eprintsoline(so, error);
992 				goto done;
993 			}
994 			/*
995 			 * Pick up the new port number if we bound to port 0.
996 			 */
997 			aname->sin_port = rname->sin_port;
998 
999 			/*
1000 			 * Unfortunately, addresses aren't _quite_ the same.
1001 			 */
1002 			if (so->so_family == AF_INET) {
1003 				if (aname->sin_addr.s_addr !=
1004 				    rname->sin_addr.s_addr) {
1005 					freemsg(mp);
1006 					error = EADDRNOTAVAIL;
1007 					eprintsoline(so, error);
1008 					goto done;
1009 				}
1010 			} else {
1011 				sin6_t *rname6 = (sin6_t *)rname;
1012 				sin6_t *aname6 = (sin6_t *)aname;
1013 
1014 				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1015 				    &rname6->sin6_addr)) {
1016 					freemsg(mp);
1017 					error = EADDRNOTAVAIL;
1018 					eprintsoline(so, error);
1019 					goto done;
1020 				}
1021 			}
1022 			break;
1023 		}
1024 		case AF_UNIX:
1025 			if (bcmp(addr, &so->so_ux_laddr, addrlen) != 0) {
1026 				freemsg(mp);
1027 				error = EADDRINUSE;
1028 				eprintsoline(so, error);
1029 				eprintso(so,
1030 				    ("addrlen %d, addr 0x%x, vp %p\n",
1031 				    addrlen, *((int *)addr),
1032 				    so->so_ux_bound_vp));
1033 				goto done;
1034 			}
1035 			so->so_state |= SS_LADDR_VALID;
1036 			break;
1037 		default:
1038 			/*
1039 			 * NOTE: This assumes that addresses can be
1040 			 * byte-compared for equivalence.
1041 			 */
1042 			if (bcmp(addr, so->so_laddr_sa, addrlen) != 0) {
1043 				freemsg(mp);
1044 				error = EADDRINUSE;
1045 				eprintsoline(so, error);
1046 				goto done;
1047 			}
1048 			/*
1049 			 * Don't mark SS_LADDR_VALID, as we cannot be
1050 			 * sure that the returned address is the real
1051 			 * bound address when talking to an unknown
1052 			 * transport.
1053 			 */
1054 			break;
1055 		}
1056 	} else {
1057 		/*
1058 		 * Save for returned address for getsockname.
1059 		 * Needed for unspecific bind unless transport supports
1060 		 * the TI_GETMYNAME ioctl.
1061 		 * Do this for AF_INET{,6} even though they do, as
1062 		 * caching info here is much better performance than
1063 		 * a TPI/STREAMS trip to the transport for getsockname.
1064 		 * Any which can't for some reason _must_ _not_ set
1065 		 * LADDR_VALID here for the caching version of getsockname
1066 		 * to not break;
1067 		 */
1068 		switch (so->so_family) {
1069 		case AF_UNIX:
1070 			/*
1071 			 * Record the address bound with the transport
1072 			 * for use by socketpair.
1073 			 */
1074 			bcopy(addr, &so->so_ux_laddr, addrlen);
1075 			so->so_state |= SS_LADDR_VALID;
1076 			break;
1077 		case AF_INET:
1078 		case AF_INET6:
1079 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
1080 			bcopy(addr, so->so_laddr_sa, so->so_laddr_len);
1081 			so->so_state |= SS_LADDR_VALID;
1082 			break;
1083 		default:
1084 			/*
1085 			 * Don't mark SS_LADDR_VALID, as we cannot be
1086 			 * sure that the returned address is the real
1087 			 * bound address when talking to an unknown
1088 			 * transport.
1089 			 */
1090 			break;
1091 		}
1092 	}
1093 
1094 	if (nl7c != NULL) {
1095 		/* Register listen()er sonode pointer with NL7C */
1096 		nl7c_listener_addr(nl7c, so);
1097 	}
1098 
1099 	freemsg(mp);
1100 
1101 done:
1102 	if (error) {
1103 		/* reset state & backlog to values held on entry */
1104 		if (clear_acceptconn_on_err == B_TRUE)
1105 			so->so_state &= ~SS_ACCEPTCONN;
1106 		if (restore_backlog_on_err == B_TRUE)
1107 			so->so_backlog = save_so_backlog;
1108 
1109 		if (unbind_on_err && so->so_state & SS_ISBOUND) {
1110 			int err;
1111 
1112 			err = sotpi_unbind(so, 0);
1113 			/* LINTED - statement has no consequent: if */
1114 			if (err) {
1115 				eprintsoline(so, error);
1116 			} else {
1117 				ASSERT(!(so->so_state & SS_ISBOUND));
1118 			}
1119 		}
1120 	}
1121 	if (!(flags & _SOBIND_LOCK_HELD)) {
1122 		so_unlock_single(so, SOLOCKED);
1123 		mutex_exit(&so->so_lock);
1124 	} else {
1125 		/* If the caller held the lock don't release it here */
1126 		ASSERT(MUTEX_HELD(&so->so_lock));
1127 		ASSERT(so->so_flag & SOLOCKED);
1128 	}
1129 	return (error);
1130 }
1131 
1132 /* bind the socket */
1133 static int
1134 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1135     int flags)
1136 {
1137 	if ((flags & _SOBIND_SOCKETPAIR) == 0)
1138 		return (sotpi_bindlisten(so, name, namelen, 0, flags));
1139 
1140 	flags &= ~_SOBIND_SOCKETPAIR;
1141 	return (sotpi_bindlisten(so, name, namelen, 1, flags));
1142 }
1143 
1144 /*
1145  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1146  * address, or when listen needs to unbind and bind.
1147  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1148  * so that a sobind can pick them up.
1149  */
1150 static int
1151 sotpi_unbind(struct sonode *so, int flags)
1152 {
1153 	struct T_unbind_req	unbind_req;
1154 	int			error = 0;
1155 	mblk_t			*mp;
1156 
1157 	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1158 	    so, flags, pr_state(so->so_state, so->so_mode)));
1159 
1160 	ASSERT(MUTEX_HELD(&so->so_lock));
1161 	ASSERT(so->so_flag & SOLOCKED);
1162 
1163 	if (!(so->so_state & SS_ISBOUND)) {
1164 		error = EINVAL;
1165 		eprintsoline(so, error);
1166 		goto done;
1167 	}
1168 
1169 	mutex_exit(&so->so_lock);
1170 
1171 	/*
1172 	 * Flush the read and write side (except stream head read queue)
1173 	 * and send down T_UNBIND_REQ.
1174 	 */
1175 	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1176 
1177 	unbind_req.PRIM_type = T_UNBIND_REQ;
1178 	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1179 	    0, _ALLOC_SLEEP);
1180 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1181 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1182 	mutex_enter(&so->so_lock);
1183 	if (error) {
1184 		eprintsoline(so, error);
1185 		goto done;
1186 	}
1187 
1188 	error = sowaitokack(so, T_UNBIND_REQ);
1189 	if (error) {
1190 		eprintsoline(so, error);
1191 		goto done;
1192 	}
1193 
1194 	/*
1195 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1196 	 * strsock_proto while the lock was dropped above, the unbind
1197 	 * is allowed to complete.
1198 	 */
1199 	if (!(flags & _SOUNBIND_REBIND)) {
1200 		/*
1201 		 * Clear out bound address.
1202 		 */
1203 		vnode_t *vp;
1204 
1205 		if ((vp = so->so_ux_bound_vp) != NULL) {
1206 
1207 			/* Undo any SSL proxy setup */
1208 			if ((so->so_family == AF_INET ||
1209 			    so->so_family == AF_INET6) &&
1210 			    (so->so_type == SOCK_STREAM) &&
1211 			    (so->so_kssl_ent != NULL)) {
1212 				kssl_release_ent(so->so_kssl_ent, so,
1213 				    so->so_kssl_type);
1214 				so->so_kssl_ent = NULL;
1215 				so->so_kssl_type = KSSL_NO_PROXY;
1216 			}
1217 
1218 			so->so_ux_bound_vp = NULL;
1219 			vn_rele_stream(vp);
1220 		}
1221 		/* Clear out address */
1222 		so->so_laddr_len = 0;
1223 	}
1224 	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN|SS_LADDR_VALID);
1225 
1226 done:
1227 
1228 	/* If the caller held the lock don't release it here */
1229 	ASSERT(MUTEX_HELD(&so->so_lock));
1230 	ASSERT(so->so_flag & SOLOCKED);
1231 
1232 	return (error);
1233 }
1234 
1235 /*
1236  * listen on the socket.
1237  * For TPI conforming transports this has to first unbind with the transport
1238  * and then bind again using the new backlog.
1239  */
1240 int
1241 sotpi_listen(struct sonode *so, int backlog)
1242 {
1243 	int		error = 0;
1244 
1245 	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1246 	    so, backlog, pr_state(so->so_state, so->so_mode)));
1247 
1248 	if (so->so_serv_type == T_CLTS)
1249 		return (EOPNOTSUPP);
1250 
1251 	/*
1252 	 * If the socket is ready to accept connections already, then
1253 	 * return without doing anything.  This avoids a problem where
1254 	 * a second listen() call fails if a connection is pending and
1255 	 * leaves the socket unbound. Only when we are not unbinding
1256 	 * with the transport can we safely increase the backlog.
1257 	 */
1258 	if (so->so_state & SS_ACCEPTCONN &&
1259 	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1260 	    /*CONSTCOND*/
1261 	    !solisten_tpi_tcp))
1262 		return (0);
1263 
1264 	if (so->so_state & SS_ISCONNECTED)
1265 		return (EINVAL);
1266 
1267 	mutex_enter(&so->so_lock);
1268 	so_lock_single(so);	/* Set SOLOCKED */
1269 
1270 	if (backlog < 0)
1271 		backlog = 0;
1272 	/*
1273 	 * Use the same qlimit as in BSD. BSD checks the qlimit
1274 	 * before queuing the next connection implying that a
1275 	 * listen(sock, 0) allows one connection to be queued.
1276 	 * BSD also uses 1.5 times the requested backlog.
1277 	 *
1278 	 * XNS Issue 4 required a strict interpretation of the backlog.
1279 	 * This has been waived subsequently for Issue 4 and the change
1280 	 * incorporated in XNS Issue 5. So we aren't required to do
1281 	 * anything special for XPG apps.
1282 	 */
1283 	if (backlog >= (INT_MAX - 1) / 3)
1284 		backlog = INT_MAX;
1285 	else
1286 		backlog = backlog * 3 / 2 + 1;
1287 
1288 	/*
1289 	 * If the listen doesn't change the backlog we do nothing.
1290 	 * This avoids an EPROTO error from the transport.
1291 	 */
1292 	if ((so->so_state & SS_ACCEPTCONN) &&
1293 	    so->so_backlog == backlog)
1294 		goto done;
1295 
1296 	if (!(so->so_state & SS_ISBOUND)) {
1297 		/*
1298 		 * Must have been explicitly bound in the UNIX domain.
1299 		 */
1300 		if (so->so_family == AF_UNIX) {
1301 			error = EINVAL;
1302 			goto done;
1303 		}
1304 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1305 		    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN);
1306 	} else if (backlog > 0) {
1307 		/*
1308 		 * AF_INET{,6} hack to avoid losing the port.
1309 		 * Assumes that all AF_INET{,6} transports can handle a
1310 		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1311 		 * has already bound thus it is possible to avoid the unbind.
1312 		 */
1313 		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1314 		    /*CONSTCOND*/
1315 		    !solisten_tpi_tcp)) {
1316 			error = sotpi_unbind(so, _SOUNBIND_REBIND);
1317 			if (error)
1318 				goto done;
1319 		}
1320 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1321 		    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN);
1322 	} else {
1323 		so->so_state |= SS_ACCEPTCONN;
1324 		so->so_backlog = backlog;
1325 	}
1326 	if (error)
1327 		goto done;
1328 	ASSERT(so->so_state & SS_ACCEPTCONN);
1329 done:
1330 	so_unlock_single(so, SOLOCKED);
1331 	mutex_exit(&so->so_lock);
1332 	return (error);
1333 }
1334 
1335 /*
1336  * Disconnect either a specified seqno or all (-1).
1337  * The former is used on listening sockets only.
1338  *
1339  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1340  * the current use of sodisconnect(seqno == -1) is only for shutdown
1341  * so there is no point (and potentially incorrect) to unbind.
1342  */
1343 int
1344 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1345 {
1346 	struct T_discon_req	discon_req;
1347 	int			error = 0;
1348 	mblk_t			*mp;
1349 
1350 	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1351 	    so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1352 
1353 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1354 		mutex_enter(&so->so_lock);
1355 		so_lock_single(so);	/* Set SOLOCKED */
1356 	} else {
1357 		ASSERT(MUTEX_HELD(&so->so_lock));
1358 		ASSERT(so->so_flag & SOLOCKED);
1359 	}
1360 
1361 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1362 		error = EINVAL;
1363 		eprintsoline(so, error);
1364 		goto done;
1365 	}
1366 
1367 	mutex_exit(&so->so_lock);
1368 	/*
1369 	 * Flush the write side (unless this is a listener)
1370 	 * and then send down a T_DISCON_REQ.
1371 	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1372 	 * and other messages.)
1373 	 */
1374 	if (!(so->so_state & SS_ACCEPTCONN))
1375 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1376 
1377 	discon_req.PRIM_type = T_DISCON_REQ;
1378 	discon_req.SEQ_number = seqno;
1379 	mp = soallocproto1(&discon_req, sizeof (discon_req),
1380 	    0, _ALLOC_SLEEP);
1381 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1382 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1383 	mutex_enter(&so->so_lock);
1384 	if (error) {
1385 		eprintsoline(so, error);
1386 		goto done;
1387 	}
1388 
1389 	error = sowaitokack(so, T_DISCON_REQ);
1390 	if (error) {
1391 		eprintsoline(so, error);
1392 		goto done;
1393 	}
1394 	/*
1395 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1396 	 * strsock_proto while the lock was dropped above, the disconnect
1397 	 * is allowed to complete. However, it is not possible to
1398 	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1399 	 */
1400 	so->so_state &=
1401 	    ~(SS_ISCONNECTED|SS_ISCONNECTING|SS_LADDR_VALID|SS_FADDR_VALID);
1402 done:
1403 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1404 		so_unlock_single(so, SOLOCKED);
1405 		mutex_exit(&so->so_lock);
1406 	} else {
1407 		/* If the caller held the lock don't release it here */
1408 		ASSERT(MUTEX_HELD(&so->so_lock));
1409 		ASSERT(so->so_flag & SOLOCKED);
1410 	}
1411 	return (error);
1412 }
1413 
1414 int
1415 sotpi_accept(struct sonode *so, int fflag, struct sonode **nsop)
1416 {
1417 	struct T_conn_ind	*conn_ind;
1418 	struct T_conn_res	*conn_res;
1419 	int			error = 0;
1420 	mblk_t			*mp, *ctxmp, *ack_mp;
1421 	struct sonode		*nso;
1422 	vnode_t			*nvp;
1423 	void			*src;
1424 	t_uscalar_t		srclen;
1425 	void			*opt;
1426 	t_uscalar_t		optlen;
1427 	t_scalar_t		PRIM_type;
1428 	t_scalar_t		SEQ_number;
1429 	size_t			sinlen;
1430 
1431 	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1432 	    so, fflag, nsop, pr_state(so->so_state, so->so_mode)));
1433 
1434 	/*
1435 	 * Defer single-threading the accepting socket until
1436 	 * the T_CONN_IND has been received and parsed and the
1437 	 * new sonode has been opened.
1438 	 */
1439 
1440 	/* Check that we are not already connected */
1441 	if ((so->so_state & SS_ACCEPTCONN) == 0)
1442 		goto conn_bad;
1443 again:
1444 	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1445 		goto e_bad;
1446 
1447 	ASSERT(mp);
1448 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1449 	ctxmp = mp->b_cont;
1450 
1451 	/*
1452 	 * Save SEQ_number for error paths.
1453 	 */
1454 	SEQ_number = conn_ind->SEQ_number;
1455 
1456 	srclen = conn_ind->SRC_length;
1457 	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1458 	if (src == NULL) {
1459 		error = EPROTO;
1460 		freemsg(mp);
1461 		eprintsoline(so, error);
1462 		goto disconnect_unlocked;
1463 	}
1464 	optlen = conn_ind->OPT_length;
1465 	switch (so->so_family) {
1466 	case AF_INET:
1467 	case AF_INET6:
1468 		if ((optlen == sizeof (intptr_t)) &&
1469 		    ((so->so_state & SS_DIRECT) != 0)) {
1470 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
1471 			    &opt, conn_ind->OPT_length);
1472 		} else {
1473 			/*
1474 			 * The transport (in this case TCP) hasn't sent up
1475 			 * a pointer to an instance for the accept fast-path.
1476 			 * Disable fast-path completely because the call to
1477 			 * sotpi_create() below would otherwise create an
1478 			 * incomplete TCP instance, which would lead to
1479 			 * problems when sockfs sends a normal T_CONN_RES
1480 			 * message down the new stream.
1481 			 */
1482 			if (so->so_state & SS_DIRECT) {
1483 				int rval;
1484 				/*
1485 				 * For consistency we inform tcp to disable
1486 				 * direct interface on the listener, though
1487 				 * we can certainly live without doing this
1488 				 * because no data will ever travel upstream
1489 				 * on the listening socket.
1490 				 */
1491 				so->so_state &= ~SS_DIRECT;
1492 				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1493 				    0, 0, K_TO_K, CRED(), &rval);
1494 			}
1495 			opt = NULL;
1496 			optlen = 0;
1497 		}
1498 		break;
1499 	case AF_UNIX:
1500 	default:
1501 		if (optlen != 0) {
1502 			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1503 			    __TPI_ALIGN_SIZE);
1504 			if (opt == NULL) {
1505 				error = EPROTO;
1506 				freemsg(mp);
1507 				eprintsoline(so, error);
1508 				goto disconnect_unlocked;
1509 			}
1510 		}
1511 		if (so->so_family == AF_UNIX) {
1512 			if (!(so->so_state & SS_FADDR_NOXLATE)) {
1513 				src = NULL;
1514 				srclen = 0;
1515 			}
1516 			/* Extract src address from options */
1517 			if (optlen != 0)
1518 				so_getopt_srcaddr(opt, optlen, &src, &srclen);
1519 		}
1520 		break;
1521 	}
1522 
1523 	/*
1524 	 * Create the new socket.
1525 	 */
1526 	VN_HOLD(so->so_accessvp);
1527 	nso = sotpi_create(so->so_accessvp, so->so_family, so->so_type,
1528 	    so->so_protocol, so->so_version, so, &error);
1529 	if (nso == NULL) {
1530 		ASSERT(error != 0);
1531 		/*
1532 		 * Accept can not fail with ENOBUFS. sotpi_create
1533 		 * sleeps waiting for memory until a signal is caught
1534 		 * so return EINTR.
1535 		 */
1536 		freemsg(mp);
1537 		if (error == ENOBUFS)
1538 			error = EINTR;
1539 		goto e_disc_unl;
1540 	}
1541 	nvp = SOTOV(nso);
1542 
1543 	/*
1544 	 * If the transport sent up an SSL connection context, then attach
1545 	 * it the new socket, and set the (sd_wputdatafunc)() and
1546 	 * (sd_rputdatafunc)() stream head hooks to intercept and process
1547 	 * SSL records.
1548 	 */
1549 	if (ctxmp != NULL) {
1550 		/*
1551 		 * This kssl_ctx_t is already held for us by the transport.
1552 		 * So, we don't need to do a kssl_hold_ctx() here.
1553 		 */
1554 		nso->so_kssl_ctx = *((kssl_ctx_t *)ctxmp->b_rptr);
1555 		freemsg(ctxmp);
1556 		mp->b_cont = NULL;
1557 		strsetrwputdatahooks(nvp, strsock_kssl_input,
1558 		    strsock_kssl_output);
1559 	}
1560 #ifdef DEBUG
1561 	/*
1562 	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1563 	 * it's inherited early to allow debugging of the accept code itself.
1564 	 */
1565 	nso->so_options |= so->so_options & SO_DEBUG;
1566 #endif /* DEBUG */
1567 
1568 	/*
1569 	 * Save the SRC address from the T_CONN_IND
1570 	 * for getpeername to work on AF_UNIX and on transports that do not
1571 	 * support TI_GETPEERNAME.
1572 	 *
1573 	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1574 	 * copyin_name().
1575 	 */
1576 	if (srclen > (t_uscalar_t)nso->so_faddr_maxlen) {
1577 		error = EINVAL;
1578 		freemsg(mp);
1579 		eprintsoline(so, error);
1580 		goto disconnect_vp_unlocked;
1581 	}
1582 	nso->so_faddr_len = (socklen_t)srclen;
1583 	ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
1584 	bcopy(src, nso->so_faddr_sa, srclen);
1585 	nso->so_state |= SS_FADDR_VALID;
1586 
1587 	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1588 	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1589 		cred_t *cr;
1590 
1591 		if ((cr = DB_CRED(mp)) != NULL) {
1592 			crhold(cr);
1593 			nso->so_peercred = cr;
1594 			nso->so_cpid = DB_CPID(mp);
1595 		}
1596 		freemsg(mp);
1597 
1598 		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1599 		    sizeof (intptr_t), 0, _ALLOC_INTR);
1600 		if (mp == NULL) {
1601 			/*
1602 			 * Accept can not fail with ENOBUFS.
1603 			 * A signal was caught so return EINTR.
1604 			 */
1605 			error = EINTR;
1606 			eprintsoline(so, error);
1607 			goto disconnect_vp_unlocked;
1608 		}
1609 		conn_res = (struct T_conn_res *)mp->b_rptr;
1610 	} else {
1611 		nso->so_peercred = DB_CRED(mp);
1612 		nso->so_cpid = DB_CPID(mp);
1613 		DB_CRED(mp) = NULL;
1614 
1615 		mp->b_rptr = DB_BASE(mp);
1616 		conn_res = (struct T_conn_res *)mp->b_rptr;
1617 		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1618 	}
1619 
1620 	/*
1621 	 * New socket must be bound at least in sockfs and, except for AF_INET,
1622 	 * (or AF_INET6) it also has to be bound in the transport provider.
1623 	 * We set the local address in the sonode from the T_OK_ACK of the
1624 	 * T_CONN_RES. For this reason the address we bind to here isn't
1625 	 * important.
1626 	 */
1627 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1628 	    /*CONSTCOND*/
1629 	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1630 		/*
1631 		 * Optimization for AF_INET{,6} transports
1632 		 * that can handle a T_CONN_RES without being bound.
1633 		 */
1634 		mutex_enter(&nso->so_lock);
1635 		so_automatic_bind(nso);
1636 		mutex_exit(&nso->so_lock);
1637 	} else {
1638 		/* Perform NULL bind with the transport provider. */
1639 		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC)) != 0) {
1640 			ASSERT(error != ENOBUFS);
1641 			freemsg(mp);
1642 			eprintsoline(nso, error);
1643 			goto disconnect_vp_unlocked;
1644 		}
1645 	}
1646 
1647 	/*
1648 	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1649 	 * so that any data arriving on the new socket will cause the
1650 	 * appropriate signals to be delivered for the new socket.
1651 	 *
1652 	 * No other thread (except strsock_proto and strsock_misc)
1653 	 * can access the new socket thus we relax the locking.
1654 	 */
1655 	nso->so_pgrp = so->so_pgrp;
1656 	nso->so_state |= so->so_state & (SS_ASYNC|SS_FADDR_NOXLATE);
1657 
1658 	if (nso->so_pgrp != 0) {
1659 		if ((error = so_set_events(nso, nvp, CRED())) != 0) {
1660 			eprintsoline(nso, error);
1661 			error = 0;
1662 			nso->so_pgrp = 0;
1663 		}
1664 	}
1665 
1666 	/*
1667 	 * Make note of the socket level options. TCP and IP level options
1668 	 * are already inherited. We could do all this after accept is
1669 	 * successful but doing it here simplifies code and no harm done
1670 	 * for error case.
1671 	 */
1672 	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1673 	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1674 	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1675 	nso->so_sndbuf = so->so_sndbuf;
1676 	nso->so_rcvbuf = so->so_rcvbuf;
1677 	if (nso->so_options & SO_LINGER)
1678 		nso->so_linger = so->so_linger;
1679 
1680 	if ((so->so_state & SS_DIRECT) != 0) {
1681 
1682 		ASSERT(opt != NULL);
1683 
1684 		conn_res->OPT_length = optlen;
1685 		conn_res->OPT_offset = MBLKL(mp);
1686 		bcopy(&opt, mp->b_wptr, optlen);
1687 		mp->b_wptr += optlen;
1688 		conn_res->PRIM_type = T_CONN_RES;
1689 		conn_res->ACCEPTOR_id = 0;
1690 		PRIM_type = T_CONN_RES;
1691 
1692 		/* Send down the T_CONN_RES on acceptor STREAM */
1693 		error = kstrputmsg(SOTOV(nso), mp, NULL,
1694 		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1695 		if (error) {
1696 			mutex_enter(&so->so_lock);
1697 			so_lock_single(so);
1698 			eprintsoline(so, error);
1699 			goto disconnect_vp;
1700 		}
1701 		mutex_enter(&nso->so_lock);
1702 		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1703 		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1704 		if (error) {
1705 			mutex_exit(&nso->so_lock);
1706 			mutex_enter(&so->so_lock);
1707 			so_lock_single(so);
1708 			eprintsoline(so, error);
1709 			goto disconnect_vp;
1710 		}
1711 		if (nso->so_family == AF_INET) {
1712 			sin_t *sin;
1713 
1714 			sin = (sin_t *)(ack_mp->b_rptr +
1715 			    sizeof (struct T_ok_ack));
1716 			bcopy(sin, nso->so_laddr_sa, sizeof (sin_t));
1717 			nso->so_laddr_len = sizeof (sin_t);
1718 		} else {
1719 			sin6_t *sin6;
1720 
1721 			sin6 = (sin6_t *)(ack_mp->b_rptr +
1722 			    sizeof (struct T_ok_ack));
1723 			bcopy(sin6, nso->so_laddr_sa, sizeof (sin6_t));
1724 			nso->so_laddr_len = sizeof (sin6_t);
1725 		}
1726 		freemsg(ack_mp);
1727 
1728 		nso->so_state |= SS_ISCONNECTED | SS_LADDR_VALID;
1729 		nso->so_priv = opt;
1730 
1731 		if (so->so_nl7c_flags & NL7C_ENABLED) {
1732 			/*
1733 			 * A NL7C marked listen()er so the new socket
1734 			 * inherits the listen()er's NL7C state, except
1735 			 * for NL7C_POLLIN.
1736 			 *
1737 			 * Only call NL7C to process the new socket if
1738 			 * the listen socket allows blocking i/o.
1739 			 */
1740 			nso->so_nl7c_flags = so->so_nl7c_flags & (~NL7C_POLLIN);
1741 			if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
1742 				/*
1743 				 * Nonblocking accept() just make it
1744 				 * persist to defer processing to the
1745 				 * read-side syscall (e.g. read).
1746 				 */
1747 				nso->so_nl7c_flags |= NL7C_SOPERSIST;
1748 			} else if (nl7c_process(nso, B_FALSE)) {
1749 				/*
1750 				 * NL7C has completed processing on the
1751 				 * socket, close the socket and back to
1752 				 * the top to await the next T_CONN_IND.
1753 				 */
1754 				mutex_exit(&nso->so_lock);
1755 				(void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
1756 				    CRED(), NULL);
1757 				VN_RELE(nvp);
1758 				goto again;
1759 			}
1760 			/* Pass the new socket out */
1761 		}
1762 
1763 		mutex_exit(&nso->so_lock);
1764 
1765 		/*
1766 		 * It's possible, through the use of autopush for example,
1767 		 * that the acceptor stream may not support SS_DIRECT
1768 		 * semantics. If the new socket does not support SS_DIRECT
1769 		 * we issue a _SIOCSOCKFALLBACK to inform the transport
1770 		 * as we would in the I_PUSH case.
1771 		 */
1772 		if (!(nso->so_state & SS_DIRECT)) {
1773 			int	rval;
1774 
1775 			if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
1776 			    0, 0, K_TO_K, CRED(), &rval)) != 0) {
1777 				mutex_enter(&so->so_lock);
1778 				so_lock_single(so);
1779 				eprintsoline(so, error);
1780 				goto disconnect_vp;
1781 			}
1782 		}
1783 
1784 		/*
1785 		 * Pass out new socket.
1786 		 */
1787 		if (nsop != NULL)
1788 			*nsop = nso;
1789 
1790 		return (0);
1791 	}
1792 
1793 	/*
1794 	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
1795 	 * which don't support the FireEngine accept fast-path. It is also
1796 	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
1797 	 * again. Neither sockfs nor TCP attempt to find out if some other
1798 	 * random module has been inserted in between (in which case we
1799 	 * should follow TLI accept behaviour). We blindly assume the worst
1800 	 * case and revert back to old behaviour i.e. TCP will not send us
1801 	 * any option (eager) and the accept should happen on the listener
1802 	 * queue. Any queued T_conn_ind have already got their options removed
1803 	 * by so_sock2_stream() when "sockmod" was I_POP'd.
1804 	 */
1805 	/*
1806 	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
1807 	 */
1808 	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
1809 #ifdef	_ILP32
1810 		queue_t	*q;
1811 
1812 		/*
1813 		 * Find read queue in driver
1814 		 * Can safely do this since we "own" nso/nvp.
1815 		 */
1816 		q = strvp2wq(nvp)->q_next;
1817 		while (SAMESTR(q))
1818 			q = q->q_next;
1819 		q = RD(q);
1820 		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
1821 #else
1822 		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
1823 #endif	/* _ILP32 */
1824 		conn_res->PRIM_type = O_T_CONN_RES;
1825 		PRIM_type = O_T_CONN_RES;
1826 	} else {
1827 		conn_res->ACCEPTOR_id = nso->so_acceptor_id;
1828 		conn_res->PRIM_type = T_CONN_RES;
1829 		PRIM_type = T_CONN_RES;
1830 	}
1831 	conn_res->SEQ_number = SEQ_number;
1832 	conn_res->OPT_length = 0;
1833 	conn_res->OPT_offset = 0;
1834 
1835 	mutex_enter(&so->so_lock);
1836 	so_lock_single(so);	/* Set SOLOCKED */
1837 	mutex_exit(&so->so_lock);
1838 
1839 	error = kstrputmsg(SOTOV(so), mp, NULL,
1840 	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1841 	mutex_enter(&so->so_lock);
1842 	if (error) {
1843 		eprintsoline(so, error);
1844 		goto disconnect_vp;
1845 	}
1846 	error = sowaitprim(so, PRIM_type, T_OK_ACK,
1847 	    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1848 	if (error) {
1849 		eprintsoline(so, error);
1850 		goto disconnect_vp;
1851 	}
1852 	/*
1853 	 * If there is a sin/sin6 appended onto the T_OK_ACK use
1854 	 * that to set the local address. If this is not present
1855 	 * then we zero out the address and don't set the
1856 	 * SS_LADDR_VALID bit. For AF_UNIX endpoints we copy over
1857 	 * the pathname from the listening socket.
1858 	 */
1859 	sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
1860 	if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
1861 	    MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
1862 		ack_mp->b_rptr += sizeof (struct T_ok_ack);
1863 		bcopy(ack_mp->b_rptr, nso->so_laddr_sa, sinlen);
1864 		nso->so_laddr_len = sinlen;
1865 		nso->so_state |= SS_LADDR_VALID;
1866 	} else if (nso->so_family == AF_UNIX) {
1867 		ASSERT(so->so_family == AF_UNIX);
1868 		nso->so_laddr_len = so->so_laddr_len;
1869 		ASSERT(nso->so_laddr_len <= nso->so_laddr_maxlen);
1870 		bcopy(so->so_laddr_sa, nso->so_laddr_sa, nso->so_laddr_len);
1871 		nso->so_state |= SS_LADDR_VALID;
1872 	} else {
1873 		nso->so_laddr_len = so->so_laddr_len;
1874 		ASSERT(nso->so_laddr_len <= nso->so_laddr_maxlen);
1875 		bzero(nso->so_laddr_sa, nso->so_addr_size);
1876 		nso->so_laddr_sa->sa_family = nso->so_family;
1877 	}
1878 	freemsg(ack_mp);
1879 
1880 	so_unlock_single(so, SOLOCKED);
1881 	mutex_exit(&so->so_lock);
1882 
1883 	nso->so_state |= SS_ISCONNECTED;
1884 
1885 	/*
1886 	 * Pass out new socket.
1887 	 */
1888 	if (nsop != NULL)
1889 		*nsop = nso;
1890 
1891 	return (0);
1892 
1893 
1894 eproto_disc_unl:
1895 	error = EPROTO;
1896 e_disc_unl:
1897 	eprintsoline(so, error);
1898 	goto disconnect_unlocked;
1899 
1900 pr_disc_vp_unl:
1901 	eprintsoline(so, error);
1902 disconnect_vp_unlocked:
1903 	(void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL);
1904 	VN_RELE(nvp);
1905 disconnect_unlocked:
1906 	(void) sodisconnect(so, SEQ_number, 0);
1907 	return (error);
1908 
1909 pr_disc_vp:
1910 	eprintsoline(so, error);
1911 disconnect_vp:
1912 	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
1913 	so_unlock_single(so, SOLOCKED);
1914 	mutex_exit(&so->so_lock);
1915 	(void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL);
1916 	VN_RELE(nvp);
1917 	return (error);
1918 
1919 conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
1920 	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
1921 	    ? EOPNOTSUPP : EINVAL;
1922 e_bad:
1923 	eprintsoline(so, error);
1924 	return (error);
1925 }
1926 
1927 /*
1928  * connect a socket.
1929  *
1930  * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
1931  * unconnect (by specifying a null address).
1932  */
1933 int
1934 sotpi_connect(struct sonode *so,
1935 	const struct sockaddr *name,
1936 	socklen_t namelen,
1937 	int fflag,
1938 	int flags)
1939 {
1940 	struct T_conn_req	conn_req;
1941 	int			error = 0;
1942 	mblk_t			*mp;
1943 	void			*src;
1944 	socklen_t		srclen;
1945 	void			*addr;
1946 	socklen_t		addrlen;
1947 	boolean_t		need_unlock;
1948 
1949 	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
1950 	    so, name, namelen, fflag, flags,
1951 	    pr_state(so->so_state, so->so_mode)));
1952 
1953 	/*
1954 	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
1955 	 * avoid sleeping for memory with SOLOCKED held.
1956 	 * We know that the T_CONN_REQ can't be larger than 2 * so_faddr_maxlen
1957 	 * + sizeof (struct T_opthdr).
1958 	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
1959 	 * exceed so_faddr_maxlen).
1960 	 */
1961 	mp = soallocproto(sizeof (struct T_conn_req) +
1962 	    2 * so->so_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR);
1963 	if (mp == NULL) {
1964 		/*
1965 		 * Connect can not fail with ENOBUFS. A signal was
1966 		 * caught so return EINTR.
1967 		 */
1968 		error = EINTR;
1969 		eprintsoline(so, error);
1970 		return (error);
1971 	}
1972 
1973 	mutex_enter(&so->so_lock);
1974 	/*
1975 	 * Make sure there is a preallocated T_unbind_req message
1976 	 * before any binding. This message is allocated when the
1977 	 * socket is created. Since another thread can consume
1978 	 * so_unbind_mp by the time we return from so_lock_single(),
1979 	 * we should check the availability of so_unbind_mp after
1980 	 * we return from so_lock_single().
1981 	 */
1982 
1983 	so_lock_single(so);	/* Set SOLOCKED */
1984 	need_unlock = B_TRUE;
1985 
1986 	if (so->so_unbind_mp == NULL) {
1987 		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
1988 		/* NOTE: holding so_lock while sleeping */
1989 		so->so_unbind_mp =
1990 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR);
1991 		if (so->so_unbind_mp == NULL) {
1992 			error = EINTR;
1993 			goto done;
1994 		}
1995 	}
1996 
1997 	/*
1998 	 * Can't have done a listen before connecting.
1999 	 */
2000 	if (so->so_state & SS_ACCEPTCONN) {
2001 		error = EOPNOTSUPP;
2002 		goto done;
2003 	}
2004 
2005 	/*
2006 	 * Must be bound with the transport
2007 	 */
2008 	if (!(so->so_state & SS_ISBOUND)) {
2009 		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2010 		    /*CONSTCOND*/
2011 		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2012 			/*
2013 			 * Optimization for AF_INET{,6} transports
2014 			 * that can handle a T_CONN_REQ without being bound.
2015 			 */
2016 			so_automatic_bind(so);
2017 		} else {
2018 			error = sotpi_bind(so, NULL, 0,
2019 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD);
2020 			if (error)
2021 				goto done;
2022 		}
2023 		ASSERT(so->so_state & SS_ISBOUND);
2024 		flags |= _SOCONNECT_DID_BIND;
2025 	}
2026 
2027 	/*
2028 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
2029 	 * connect to a null address. This is the portable method to
2030 	 * unconnect a socket.
2031 	 */
2032 	if ((namelen >= sizeof (sa_family_t)) &&
2033 	    (name->sa_family == AF_UNSPEC)) {
2034 		name = NULL;
2035 		namelen = 0;
2036 	}
2037 
2038 	/*
2039 	 * Check that we are not already connected.
2040 	 * A connection-oriented socket cannot be reconnected.
2041 	 * A connected connection-less socket can be
2042 	 * - connected to a different address by a subsequent connect
2043 	 * - "unconnected" by a connect to the NULL address
2044 	 */
2045 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2046 		ASSERT(!(flags & _SOCONNECT_DID_BIND));
2047 		if (so->so_mode & SM_CONNREQUIRED) {
2048 			/* Connection-oriented socket */
2049 			error = so->so_state & SS_ISCONNECTED ?
2050 			    EISCONN : EALREADY;
2051 			goto done;
2052 		}
2053 		/* Connection-less socket */
2054 		if (name == NULL) {
2055 			/*
2056 			 * Remove the connected state and clear SO_DGRAM_ERRIND
2057 			 * since it was set when the socket was connected.
2058 			 * If this is UDP also send down a T_DISCON_REQ.
2059 			 */
2060 			int val;
2061 
2062 			if ((so->so_family == AF_INET ||
2063 			    so->so_family == AF_INET6) &&
2064 			    (so->so_type == SOCK_DGRAM ||
2065 			    so->so_type == SOCK_RAW) &&
2066 			    /*CONSTCOND*/
2067 			    !soconnect_tpi_udp) {
2068 				/* XXX What about implicitly unbinding here? */
2069 				error = sodisconnect(so, -1,
2070 				    _SODISCONNECT_LOCK_HELD);
2071 			} else {
2072 				so->so_state &=
2073 				    ~(SS_ISCONNECTED | SS_ISCONNECTING |
2074 				    SS_FADDR_VALID);
2075 				so->so_faddr_len = 0;
2076 			}
2077 
2078 			so_unlock_single(so, SOLOCKED);
2079 			mutex_exit(&so->so_lock);
2080 
2081 			val = 0;
2082 			(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2083 			    &val, (t_uscalar_t)sizeof (val));
2084 
2085 			mutex_enter(&so->so_lock);
2086 			so_lock_single(so);	/* Set SOLOCKED */
2087 			goto done;
2088 		}
2089 	}
2090 	ASSERT(so->so_state & SS_ISBOUND);
2091 
2092 	if (name == NULL || namelen == 0) {
2093 		error = EINVAL;
2094 		goto done;
2095 	}
2096 	/*
2097 	 * Mark the socket if so_faddr_sa represents the transport level
2098 	 * address.
2099 	 */
2100 	if (flags & _SOCONNECT_NOXLATE) {
2101 		struct sockaddr_ux	*soaddr_ux;
2102 
2103 		ASSERT(so->so_family == AF_UNIX);
2104 		if (namelen != sizeof (struct sockaddr_ux)) {
2105 			error = EINVAL;
2106 			goto done;
2107 		}
2108 		soaddr_ux = (struct sockaddr_ux *)name;
2109 		name = (struct sockaddr *)&soaddr_ux->sou_addr;
2110 		namelen = sizeof (soaddr_ux->sou_addr);
2111 		so->so_state |= SS_FADDR_NOXLATE;
2112 	}
2113 
2114 	/*
2115 	 * Length and family checks.
2116 	 */
2117 	error = so_addr_verify(so, name, namelen);
2118 	if (error)
2119 		goto bad;
2120 
2121 	/*
2122 	 * Save foreign address. Needed for AF_UNIX as well as
2123 	 * transport providers that do not support TI_GETPEERNAME.
2124 	 * Also used for cached foreign address for TCP and UDP.
2125 	 */
2126 	if (namelen > (t_uscalar_t)so->so_faddr_maxlen) {
2127 		error = EINVAL;
2128 		goto done;
2129 	}
2130 	so->so_faddr_len = (socklen_t)namelen;
2131 	ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
2132 	bcopy(name, so->so_faddr_sa, namelen);
2133 	so->so_state |= SS_FADDR_VALID;
2134 
2135 	if (so->so_family == AF_UNIX) {
2136 		if (so->so_state & SS_FADDR_NOXLATE) {
2137 			/*
2138 			 * Already have a transport internal address. Do not
2139 			 * pass any (transport internal) source address.
2140 			 */
2141 			addr = so->so_faddr_sa;
2142 			addrlen = (t_uscalar_t)so->so_faddr_len;
2143 			src = NULL;
2144 			srclen = 0;
2145 		} else {
2146 			/*
2147 			 * Pass the sockaddr_un source address as an option
2148 			 * and translate the remote address.
2149 			 * Holding so_lock thus so_laddr_sa can not change.
2150 			 */
2151 			src = so->so_laddr_sa;
2152 			srclen = (t_uscalar_t)so->so_laddr_len;
2153 			dprintso(so, 1,
2154 			    ("sotpi_connect UNIX: srclen %d, src %p\n",
2155 			    srclen, src));
2156 			error = so_ux_addr_xlate(so,
2157 			    so->so_faddr_sa, (socklen_t)so->so_faddr_len,
2158 			    (flags & _SOCONNECT_XPG4_2),
2159 			    &addr, &addrlen);
2160 			if (error)
2161 				goto bad;
2162 		}
2163 	} else {
2164 		addr = so->so_faddr_sa;
2165 		addrlen = (t_uscalar_t)so->so_faddr_len;
2166 		src = NULL;
2167 		srclen = 0;
2168 	}
2169 	/*
2170 	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2171 	 * option which asks the transport provider to send T_UDERR_IND
2172 	 * messages. These T_UDERR_IND messages are used to return connected
2173 	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2174 	 *
2175 	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2176 	 * we send down a T_CONN_REQ. This is needed to let the
2177 	 * transport assign a local address that is consistent with
2178 	 * the remote address. Applications depend on a getsockname()
2179 	 * after a connect() to retrieve the "source" IP address for
2180 	 * the connected socket.  Invalidate the cached local address
2181 	 * to force getsockname() to enquire of the transport.
2182 	 */
2183 	if (!(so->so_mode & SM_CONNREQUIRED)) {
2184 		/*
2185 		 * Datagram socket.
2186 		 */
2187 		int32_t val;
2188 
2189 		so_unlock_single(so, SOLOCKED);
2190 		mutex_exit(&so->so_lock);
2191 
2192 		val = 1;
2193 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2194 		    &val, (t_uscalar_t)sizeof (val));
2195 
2196 		mutex_enter(&so->so_lock);
2197 		so_lock_single(so);	/* Set SOLOCKED */
2198 		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2199 		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2200 		    soconnect_tpi_udp) {
2201 			soisconnected(so);
2202 			goto done;
2203 		}
2204 		/*
2205 		 * Send down T_CONN_REQ etc.
2206 		 * Clear fflag to avoid returning EWOULDBLOCK.
2207 		 */
2208 		fflag = 0;
2209 		ASSERT(so->so_family != AF_UNIX);
2210 		so->so_state &= ~SS_LADDR_VALID;
2211 	} else if (so->so_laddr_len != 0) {
2212 		/*
2213 		 * If the local address or port was "any" then it may be
2214 		 * changed by the transport as a result of the
2215 		 * connect.  Invalidate the cached version if we have one.
2216 		 */
2217 		switch (so->so_family) {
2218 		case AF_INET:
2219 			ASSERT(so->so_laddr_len == (socklen_t)sizeof (sin_t));
2220 			if (((sin_t *)so->so_laddr_sa)->sin_addr.s_addr ==
2221 			    INADDR_ANY ||
2222 			    ((sin_t *)so->so_laddr_sa)->sin_port == 0)
2223 				so->so_state &= ~SS_LADDR_VALID;
2224 			break;
2225 
2226 		case AF_INET6:
2227 			ASSERT(so->so_laddr_len == (socklen_t)sizeof (sin6_t));
2228 			if (IN6_IS_ADDR_UNSPECIFIED(
2229 			    &((sin6_t *)so->so_laddr_sa) ->sin6_addr) ||
2230 			    IN6_IS_ADDR_V4MAPPED_ANY(
2231 			    &((sin6_t *)so->so_laddr_sa)->sin6_addr) ||
2232 			    ((sin6_t *)so->so_laddr_sa)->sin6_port == 0)
2233 				so->so_state &= ~SS_LADDR_VALID;
2234 			break;
2235 
2236 		default:
2237 			break;
2238 		}
2239 	}
2240 
2241 	/*
2242 	 * Check for failure of an earlier call
2243 	 */
2244 	if (so->so_error != 0)
2245 		goto so_bad;
2246 
2247 	/*
2248 	 * Send down T_CONN_REQ. Message was allocated above.
2249 	 */
2250 	conn_req.PRIM_type = T_CONN_REQ;
2251 	conn_req.DEST_length = addrlen;
2252 	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2253 	if (srclen == 0) {
2254 		conn_req.OPT_length = 0;
2255 		conn_req.OPT_offset = 0;
2256 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2257 		soappendmsg(mp, addr, addrlen);
2258 	} else {
2259 		/*
2260 		 * There is a AF_UNIX sockaddr_un to include as a source
2261 		 * address option.
2262 		 */
2263 		struct T_opthdr toh;
2264 
2265 		toh.level = SOL_SOCKET;
2266 		toh.name = SO_SRCADDR;
2267 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2268 		toh.status = 0;
2269 		conn_req.OPT_length =
2270 		    (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2271 		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2272 		    _TPI_ALIGN_TOPT(addrlen));
2273 
2274 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2275 		soappendmsg(mp, addr, addrlen);
2276 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2277 		soappendmsg(mp, &toh, sizeof (toh));
2278 		soappendmsg(mp, src, srclen);
2279 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2280 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2281 	}
2282 	/*
2283 	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2284 	 * in order to have the right state when the T_CONN_CON shows up.
2285 	 */
2286 	soisconnecting(so);
2287 	mutex_exit(&so->so_lock);
2288 
2289 #ifdef C2_AUDIT
2290 	if (audit_active)
2291 		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2292 #endif /* C2_AUDIT */
2293 
2294 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2295 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2296 	mp = NULL;
2297 	mutex_enter(&so->so_lock);
2298 	if (error != 0)
2299 		goto bad;
2300 
2301 	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2302 		goto bad;
2303 
2304 	/* Allow other threads to access the socket */
2305 	so_unlock_single(so, SOLOCKED);
2306 	need_unlock = B_FALSE;
2307 
2308 	/*
2309 	 * Wait until we get a T_CONN_CON or an error
2310 	 */
2311 	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2312 		so_lock_single(so);	/* Set SOLOCKED */
2313 		need_unlock = B_TRUE;
2314 	}
2315 
2316 done:
2317 	freemsg(mp);
2318 	switch (error) {
2319 	case EINPROGRESS:
2320 	case EALREADY:
2321 	case EISCONN:
2322 	case EINTR:
2323 		/* Non-fatal errors */
2324 		so->so_state &= ~SS_LADDR_VALID;
2325 		/* FALLTHRU */
2326 	case 0:
2327 		break;
2328 
2329 	case EHOSTUNREACH:
2330 		if (flags & _SOCONNECT_XPG4_2) {
2331 			/*
2332 			 * X/Open specification contains a requirement that
2333 			 * ENETUNREACH be returned but does not require
2334 			 * EHOSTUNREACH. In order to keep the test suite
2335 			 * happy we mess with the errno here.
2336 			 */
2337 			error = ENETUNREACH;
2338 		}
2339 		/* FALLTHRU */
2340 
2341 	default:
2342 		ASSERT(need_unlock);
2343 		/*
2344 		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2345 		 * and invalidate local-address cache
2346 		 */
2347 		so->so_state &= ~(SS_ISCONNECTING | SS_LADDR_VALID);
2348 		/* A discon_ind might have already unbound us */
2349 		if ((flags & _SOCONNECT_DID_BIND) &&
2350 		    (so->so_state & SS_ISBOUND)) {
2351 			int err;
2352 
2353 			err = sotpi_unbind(so, 0);
2354 			/* LINTED - statement has no conseq */
2355 			if (err) {
2356 				eprintsoline(so, err);
2357 			}
2358 		}
2359 		break;
2360 	}
2361 	if (need_unlock)
2362 		so_unlock_single(so, SOLOCKED);
2363 	mutex_exit(&so->so_lock);
2364 	return (error);
2365 
2366 so_bad:	error = sogeterr(so);
2367 bad:	eprintsoline(so, error);
2368 	goto done;
2369 }
2370 
2371 int
2372 sotpi_shutdown(struct sonode *so, int how)
2373 {
2374 	struct T_ordrel_req	ordrel_req;
2375 	mblk_t			*mp;
2376 	uint_t			old_state, state_change;
2377 	int			error = 0;
2378 
2379 	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2380 	    so, how, pr_state(so->so_state, so->so_mode)));
2381 
2382 	mutex_enter(&so->so_lock);
2383 	so_lock_single(so);	/* Set SOLOCKED */
2384 
2385 	/*
2386 	 * SunOS 4.X has no check for datagram sockets.
2387 	 * 5.X checks that it is connected (ENOTCONN)
2388 	 * X/Open requires that we check the connected state.
2389 	 */
2390 	if (!(so->so_state & SS_ISCONNECTED)) {
2391 		if (!xnet_skip_checks) {
2392 			error = ENOTCONN;
2393 			if (xnet_check_print) {
2394 				printf("sockfs: X/Open shutdown check "
2395 				    "caused ENOTCONN\n");
2396 			}
2397 		}
2398 		goto done;
2399 	}
2400 	/*
2401 	 * Record the current state and then perform any state changes.
2402 	 * Then use the difference between the old and new states to
2403 	 * determine which messages need to be sent.
2404 	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2405 	 * duplicate calls to shutdown().
2406 	 */
2407 	old_state = so->so_state;
2408 
2409 	switch (how) {
2410 	case 0:
2411 		socantrcvmore(so);
2412 		break;
2413 	case 1:
2414 		socantsendmore(so);
2415 		break;
2416 	case 2:
2417 		socantsendmore(so);
2418 		socantrcvmore(so);
2419 		break;
2420 	default:
2421 		error = EINVAL;
2422 		goto done;
2423 	}
2424 
2425 	/*
2426 	 * Assumes that the SS_CANT* flags are never cleared in the above code.
2427 	 */
2428 	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2429 	    (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2430 	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2431 
2432 	switch (state_change) {
2433 	case 0:
2434 		dprintso(so, 1,
2435 		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
2436 		    so->so_state));
2437 		goto done;
2438 
2439 	case SS_CANTRCVMORE:
2440 		mutex_exit(&so->so_lock);
2441 		strseteof(SOTOV(so), 1);
2442 		/*
2443 		 * strseteof takes care of read side wakeups,
2444 		 * pollwakeups, and signals.
2445 		 */
2446 		/*
2447 		 * Get the read lock before flushing data to avoid problems
2448 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2449 		 */
2450 		mutex_enter(&so->so_lock);
2451 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2452 		mutex_exit(&so->so_lock);
2453 
2454 		/* Flush read side queue */
2455 		strflushrq(SOTOV(so), FLUSHALL);
2456 
2457 		mutex_enter(&so->so_lock);
2458 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2459 		break;
2460 
2461 	case SS_CANTSENDMORE:
2462 		mutex_exit(&so->so_lock);
2463 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2464 		mutex_enter(&so->so_lock);
2465 		break;
2466 
2467 	case SS_CANTSENDMORE|SS_CANTRCVMORE:
2468 		mutex_exit(&so->so_lock);
2469 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2470 		strseteof(SOTOV(so), 1);
2471 		/*
2472 		 * strseteof takes care of read side wakeups,
2473 		 * pollwakeups, and signals.
2474 		 */
2475 		/*
2476 		 * Get the read lock before flushing data to avoid problems
2477 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2478 		 */
2479 		mutex_enter(&so->so_lock);
2480 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2481 		mutex_exit(&so->so_lock);
2482 
2483 		/* Flush read side queue */
2484 		strflushrq(SOTOV(so), FLUSHALL);
2485 
2486 		mutex_enter(&so->so_lock);
2487 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2488 		break;
2489 	}
2490 
2491 	ASSERT(MUTEX_HELD(&so->so_lock));
2492 
2493 	/*
2494 	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2495 	 * was set due to this call and the new state has both of them set:
2496 	 *	Send the AF_UNIX close indication
2497 	 *	For T_COTS send a discon_ind
2498 	 *
2499 	 * If cantsend was set due to this call:
2500 	 *	For T_COTSORD send an ordrel_ind
2501 	 *
2502 	 * Note that for T_CLTS there is no message sent here.
2503 	 */
2504 	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2505 	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2506 		/*
2507 		 * For SunOS 4.X compatibility we tell the other end
2508 		 * that we are unable to receive at this point.
2509 		 */
2510 		if (so->so_family == AF_UNIX && so->so_serv_type != T_CLTS)
2511 			so_unix_close(so);
2512 
2513 		if (so->so_serv_type == T_COTS)
2514 			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2515 	}
2516 	if ((state_change & SS_CANTSENDMORE) &&
2517 	    (so->so_serv_type == T_COTS_ORD)) {
2518 		/* Send an orderly release */
2519 		ordrel_req.PRIM_type = T_ORDREL_REQ;
2520 
2521 		mutex_exit(&so->so_lock);
2522 		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2523 		    0, _ALLOC_SLEEP);
2524 		/*
2525 		 * Send down the T_ORDREL_REQ even if there is flow control.
2526 		 * This prevents shutdown from blocking.
2527 		 * Note that there is no T_OK_ACK for ordrel_req.
2528 		 */
2529 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2530 		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2531 		mutex_enter(&so->so_lock);
2532 		if (error) {
2533 			eprintsoline(so, error);
2534 			goto done;
2535 		}
2536 	}
2537 
2538 done:
2539 	so_unlock_single(so, SOLOCKED);
2540 	mutex_exit(&so->so_lock);
2541 	return (error);
2542 }
2543 
2544 /*
2545  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2546  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2547  * that we have closed.
2548  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2549  * T_UNITDATA_REQ containing the same option.
2550  *
2551  * For SOCK_DGRAM half-connections (somebody connected to this end
2552  * but this end is not connect) we don't know where to send any
2553  * SO_UNIX_CLOSE.
2554  *
2555  * We have to ignore stream head errors just in case there has been
2556  * a shutdown(output).
2557  * Ignore any flow control to try to get the message more quickly to the peer.
2558  * While locally ignoring flow control solves the problem when there
2559  * is only the loopback transport on the stream it would not provide
2560  * the correct AF_UNIX socket semantics when one or more modules have
2561  * been pushed.
2562  */
2563 void
2564 so_unix_close(struct sonode *so)
2565 {
2566 	int		error;
2567 	struct T_opthdr	toh;
2568 	mblk_t		*mp;
2569 
2570 	ASSERT(MUTEX_HELD(&so->so_lock));
2571 
2572 	ASSERT(so->so_family == AF_UNIX);
2573 
2574 	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2575 	    (SS_ISCONNECTED|SS_ISBOUND))
2576 		return;
2577 
2578 	dprintso(so, 1, ("so_unix_close(%p) %s\n",
2579 	    so, pr_state(so->so_state, so->so_mode)));
2580 
2581 	toh.level = SOL_SOCKET;
2582 	toh.name = SO_UNIX_CLOSE;
2583 
2584 	/* zero length + header */
2585 	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2586 	toh.status = 0;
2587 
2588 	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2589 		struct T_optdata_req tdr;
2590 
2591 		tdr.PRIM_type = T_OPTDATA_REQ;
2592 		tdr.DATA_flag = 0;
2593 
2594 		tdr.OPT_length = (t_scalar_t)sizeof (toh);
2595 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2596 
2597 		/* NOTE: holding so_lock while sleeping */
2598 		mp = soallocproto2(&tdr, sizeof (tdr),
2599 		    &toh, sizeof (toh), 0, _ALLOC_SLEEP);
2600 	} else {
2601 		struct T_unitdata_req	tudr;
2602 		void			*addr;
2603 		socklen_t		addrlen;
2604 		void			*src;
2605 		socklen_t		srclen;
2606 		struct T_opthdr		toh2;
2607 		t_scalar_t		size;
2608 
2609 		/* Connecteded DGRAM socket */
2610 
2611 		/*
2612 		 * For AF_UNIX the destination address is translated to
2613 		 * an internal name and the source address is passed as
2614 		 * an option.
2615 		 */
2616 		/*
2617 		 * Length and family checks.
2618 		 */
2619 		error = so_addr_verify(so, so->so_faddr_sa,
2620 		    (t_uscalar_t)so->so_faddr_len);
2621 		if (error) {
2622 			eprintsoline(so, error);
2623 			return;
2624 		}
2625 		if (so->so_state & SS_FADDR_NOXLATE) {
2626 			/*
2627 			 * Already have a transport internal address. Do not
2628 			 * pass any (transport internal) source address.
2629 			 */
2630 			addr = so->so_faddr_sa;
2631 			addrlen = (t_uscalar_t)so->so_faddr_len;
2632 			src = NULL;
2633 			srclen = 0;
2634 		} else {
2635 			/*
2636 			 * Pass the sockaddr_un source address as an option
2637 			 * and translate the remote address.
2638 			 * Holding so_lock thus so_laddr_sa can not change.
2639 			 */
2640 			src = so->so_laddr_sa;
2641 			srclen = (socklen_t)so->so_laddr_len;
2642 			dprintso(so, 1,
2643 			    ("so_ux_close: srclen %d, src %p\n",
2644 			    srclen, src));
2645 			error = so_ux_addr_xlate(so,
2646 			    so->so_faddr_sa,
2647 			    (socklen_t)so->so_faddr_len, 0,
2648 			    &addr, &addrlen);
2649 			if (error) {
2650 				eprintsoline(so, error);
2651 				return;
2652 			}
2653 		}
2654 		tudr.PRIM_type = T_UNITDATA_REQ;
2655 		tudr.DEST_length = addrlen;
2656 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2657 		if (srclen == 0) {
2658 			tudr.OPT_length = (t_scalar_t)sizeof (toh);
2659 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2660 			    _TPI_ALIGN_TOPT(addrlen));
2661 
2662 			size = tudr.OPT_offset + tudr.OPT_length;
2663 			/* NOTE: holding so_lock while sleeping */
2664 			mp = soallocproto2(&tudr, sizeof (tudr),
2665 			    addr, addrlen, size, _ALLOC_SLEEP);
2666 			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2667 			soappendmsg(mp, &toh, sizeof (toh));
2668 		} else {
2669 			/*
2670 			 * There is a AF_UNIX sockaddr_un to include as a
2671 			 * source address option.
2672 			 */
2673 			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2674 			    _TPI_ALIGN_TOPT(srclen));
2675 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2676 			    _TPI_ALIGN_TOPT(addrlen));
2677 
2678 			toh2.level = SOL_SOCKET;
2679 			toh2.name = SO_SRCADDR;
2680 			toh2.len = (t_uscalar_t)(srclen +
2681 			    sizeof (struct T_opthdr));
2682 			toh2.status = 0;
2683 
2684 			size = tudr.OPT_offset + tudr.OPT_length;
2685 
2686 			/* NOTE: holding so_lock while sleeping */
2687 			mp = soallocproto2(&tudr, sizeof (tudr),
2688 			    addr, addrlen, size, _ALLOC_SLEEP);
2689 			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2690 			soappendmsg(mp, &toh, sizeof (toh));
2691 			soappendmsg(mp, &toh2, sizeof (toh2));
2692 			soappendmsg(mp, src, srclen);
2693 			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2694 		}
2695 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2696 	}
2697 	mutex_exit(&so->so_lock);
2698 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2699 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2700 	mutex_enter(&so->so_lock);
2701 }
2702 
2703 /*
2704  * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
2705  */
2706 int
2707 sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags)
2708 {
2709 	mblk_t		*mp, *nmp;
2710 	int		error;
2711 
2712 	dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", so, msg, flags));
2713 
2714 	/*
2715 	 * There is never any oob data with addresses or control since
2716 	 * the T_EXDATA_IND does not carry any options.
2717 	 */
2718 	msg->msg_controllen = 0;
2719 	msg->msg_namelen = 0;
2720 
2721 	mutex_enter(&so->so_lock);
2722 	ASSERT(so_verify_oobstate(so));
2723 	if ((so->so_options & SO_OOBINLINE) ||
2724 	    (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
2725 		dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
2726 		mutex_exit(&so->so_lock);
2727 		return (EINVAL);
2728 	}
2729 	if (!(so->so_state & SS_HAVEOOBDATA)) {
2730 		dprintso(so, 1, ("sorecvoob: no data yet\n"));
2731 		mutex_exit(&so->so_lock);
2732 		return (EWOULDBLOCK);
2733 	}
2734 	ASSERT(so->so_oobmsg != NULL);
2735 	mp = so->so_oobmsg;
2736 	if (flags & MSG_PEEK) {
2737 		/*
2738 		 * Since recv* can not return ENOBUFS we can not use dupmsg.
2739 		 * Instead we revert to the consolidation private
2740 		 * allocb_wait plus bcopy.
2741 		 */
2742 		mblk_t *mp1;
2743 
2744 		mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
2745 		ASSERT(mp1);
2746 
2747 		while (mp != NULL) {
2748 			ssize_t size;
2749 
2750 			size = MBLKL(mp);
2751 			bcopy(mp->b_rptr, mp1->b_wptr, size);
2752 			mp1->b_wptr += size;
2753 			ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
2754 			mp = mp->b_cont;
2755 		}
2756 		mp = mp1;
2757 	} else {
2758 		/*
2759 		 * Update the state indicating that the data has been consumed.
2760 		 * Keep SS_OOBPEND set until data is consumed past the mark.
2761 		 */
2762 		so->so_oobmsg = NULL;
2763 		so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
2764 	}
2765 	dprintso(so, 1,
2766 	    ("after recvoob(%p): counts %d/%d state %s\n",
2767 	    so, so->so_oobsigcnt,
2768 	    so->so_oobcnt, pr_state(so->so_state, so->so_mode)));
2769 	ASSERT(so_verify_oobstate(so));
2770 	mutex_exit(&so->so_lock);
2771 
2772 	error = 0;
2773 	nmp = mp;
2774 	while (nmp != NULL && uiop->uio_resid > 0) {
2775 		ssize_t n = MBLKL(nmp);
2776 
2777 		n = MIN(n, uiop->uio_resid);
2778 		if (n > 0)
2779 			error = uiomove(nmp->b_rptr, n,
2780 			    UIO_READ, uiop);
2781 		if (error)
2782 			break;
2783 		nmp = nmp->b_cont;
2784 	}
2785 	freemsg(mp);
2786 	return (error);
2787 }
2788 
2789 /*
2790  * Called by sotpi_recvmsg when reading a non-zero amount of data.
2791  * In addition, the caller typically verifies that there is some
2792  * potential state to clear by checking
2793  *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2794  * before calling this routine.
2795  * Note that such a check can be made without holding so_lock since
2796  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2797  * decrements so_oobsigcnt.
2798  *
2799  * When data is read *after* the point that all pending
2800  * oob data has been consumed the oob indication is cleared.
2801  *
2802  * This logic keeps select/poll returning POLLRDBAND and
2803  * SIOCATMARK returning true until we have read past
2804  * the mark.
2805  */
2806 static void
2807 sorecv_update_oobstate(struct sonode *so)
2808 {
2809 	mutex_enter(&so->so_lock);
2810 	ASSERT(so_verify_oobstate(so));
2811 	dprintso(so, 1,
2812 	    ("sorecv_update_oobstate: counts %d/%d state %s\n",
2813 	    so->so_oobsigcnt,
2814 	    so->so_oobcnt, pr_state(so->so_state, so->so_mode)));
2815 	if (so->so_oobsigcnt == 0) {
2816 		/* No more pending oob indications */
2817 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2818 		freemsg(so->so_oobmsg);
2819 		so->so_oobmsg = NULL;
2820 	}
2821 	ASSERT(so_verify_oobstate(so));
2822 	mutex_exit(&so->so_lock);
2823 }
2824 
2825 /*
2826  * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
2827  */
2828 static int
2829 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
2830 {
2831 	int	error = 0;
2832 	mblk_t *tmp = NULL;
2833 	mblk_t *pmp = NULL;
2834 	mblk_t *nmp = so->so_nl7c_rcv_mp;
2835 
2836 	ASSERT(nmp != NULL);
2837 
2838 	while (nmp != NULL && uiop->uio_resid > 0) {
2839 		ssize_t n;
2840 
2841 		if (DB_TYPE(nmp) == M_DATA) {
2842 			/*
2843 			 * We have some data, uiomove up to resid bytes.
2844 			 */
2845 			n = MIN(MBLKL(nmp), uiop->uio_resid);
2846 			if (n > 0)
2847 				error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
2848 			nmp->b_rptr += n;
2849 			if (nmp->b_rptr == nmp->b_wptr) {
2850 				pmp = nmp;
2851 				nmp = nmp->b_cont;
2852 			}
2853 			if (error)
2854 				break;
2855 		} else {
2856 			/*
2857 			 * We only handle data, save for caller to handle.
2858 			 */
2859 			if (pmp != NULL) {
2860 				pmp->b_cont = nmp->b_cont;
2861 			}
2862 			nmp->b_cont = NULL;
2863 			if (*rmp == NULL) {
2864 				*rmp = nmp;
2865 			} else {
2866 				tmp->b_cont = nmp;
2867 			}
2868 			nmp = nmp->b_cont;
2869 			tmp = nmp;
2870 		}
2871 	}
2872 	if (pmp != NULL) {
2873 		/* Free any mblk_t(s) which we have consumed */
2874 		pmp->b_cont = NULL;
2875 		freemsg(so->so_nl7c_rcv_mp);
2876 	}
2877 	if ((so->so_nl7c_rcv_mp = nmp) == NULL) {
2878 		/* Last mblk_t so return the saved kstrgetmsg() rval/error */
2879 		if (error == 0) {
2880 			rval_t	*p = (rval_t *)&so->so_nl7c_rcv_rval;
2881 
2882 			error = p->r_v.r_v2;
2883 			p->r_v.r_v2 = 0;
2884 		}
2885 		rp->r_vals = so->so_nl7c_rcv_rval;
2886 		so->so_nl7c_rcv_rval = 0;
2887 	} else {
2888 		/* More mblk_t(s) to process so no rval to return */
2889 		rp->r_vals = 0;
2890 	}
2891 	return (error);
2892 }
2893 
2894 /*
2895  * Receive the next message on the queue.
2896  * If msg_controllen is non-zero when called the caller is interested in
2897  * any received control info (options).
2898  * If msg_namelen is non-zero when called the caller is interested in
2899  * any received source address.
2900  * The routine returns with msg_control and msg_name pointing to
2901  * kmem_alloc'ed memory which the caller has to free.
2902  */
2903 int
2904 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
2905 {
2906 	union T_primitives	*tpr;
2907 	mblk_t			*mp;
2908 	uchar_t			pri;
2909 	int			pflag, opflag;
2910 	void			*control;
2911 	t_uscalar_t		controllen;
2912 	t_uscalar_t		namelen;
2913 	int			so_state = so->so_state; /* Snapshot */
2914 	ssize_t			saved_resid;
2915 	int			error;
2916 	rval_t			rval;
2917 	int			flags;
2918 	clock_t			timout;
2919 	int			first;
2920 
2921 	flags = msg->msg_flags;
2922 	msg->msg_flags = 0;
2923 
2924 	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
2925 	    so, msg, flags,
2926 	    pr_state(so->so_state, so->so_mode), so->so_error));
2927 
2928 	/*
2929 	 * If we are not connected because we have never been connected
2930 	 * we return ENOTCONN. If we have been connected (but are no longer
2931 	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
2932 	 * the EOF.
2933 	 *
2934 	 * An alternative would be to post an ENOTCONN error in stream head
2935 	 * (read+write) and clear it when we're connected. However, that error
2936 	 * would cause incorrect poll/select behavior!
2937 	 */
2938 	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
2939 	    (so->so_mode & SM_CONNREQUIRED)) {
2940 		return (ENOTCONN);
2941 	}
2942 
2943 	/*
2944 	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
2945 	 * after checking that the read queue is empty) and returns zero.
2946 	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
2947 	 * is zero.
2948 	 */
2949 
2950 	if (flags & MSG_OOB) {
2951 		/* Check that the transport supports OOB */
2952 		if (!(so->so_mode & SM_EXDATA))
2953 			return (EOPNOTSUPP);
2954 		return (sorecvoob(so, msg, uiop, flags));
2955 	}
2956 
2957 	/*
2958 	 * Set msg_controllen and msg_namelen to zero here to make it
2959 	 * simpler in the cases that no control or name is returned.
2960 	 */
2961 	controllen = msg->msg_controllen;
2962 	namelen = msg->msg_namelen;
2963 	msg->msg_controllen = 0;
2964 	msg->msg_namelen = 0;
2965 
2966 	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
2967 	    namelen, controllen));
2968 
2969 	mutex_enter(&so->so_lock);
2970 	/*
2971 	 * If an NL7C enabled socket and not waiting for write data.
2972 	 */
2973 	if ((so->so_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
2974 	    NL7C_ENABLED) {
2975 		if (so->so_nl7c_uri) {
2976 			/* Close uri processing for a previous request */
2977 			nl7c_close(so);
2978 		}
2979 		if ((so_state & SS_CANTRCVMORE) && so->so_nl7c_rcv_mp == NULL) {
2980 			/* Nothing to process, EOF */
2981 			mutex_exit(&so->so_lock);
2982 			return (0);
2983 		} else if (so->so_nl7c_flags & NL7C_SOPERSIST) {
2984 			/* Persistent NL7C socket, try to process request */
2985 			boolean_t ret;
2986 
2987 			ret = nl7c_process(so,
2988 			    (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
2989 			rval.r_vals = so->so_nl7c_rcv_rval;
2990 			error = rval.r_v.r_v2;
2991 			if (error) {
2992 				/* Error of some sort, return it */
2993 				mutex_exit(&so->so_lock);
2994 				return (error);
2995 			}
2996 			if (so->so_nl7c_flags &&
2997 			    ! (so->so_nl7c_flags & NL7C_WAITWRITE)) {
2998 				/*
2999 				 * Still an NL7C socket and no data
3000 				 * to pass up to the caller.
3001 				 */
3002 				mutex_exit(&so->so_lock);
3003 				if (ret) {
3004 					/* EOF */
3005 					return (0);
3006 				} else {
3007 					/* Need more data */
3008 					return (EAGAIN);
3009 				}
3010 			}
3011 		} else {
3012 			/*
3013 			 * Not persistent so no further NL7C processing.
3014 			 */
3015 			so->so_nl7c_flags = 0;
3016 		}
3017 	}
3018 	/*
3019 	 * Only one reader is allowed at any given time. This is needed
3020 	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
3021 	 *
3022 	 * This is slightly different than BSD behavior in that it fails with
3023 	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3024 	 * is single-threaded using sblock(), which is dropped while waiting
3025 	 * for data to appear. The difference shows up e.g. if one
3026 	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3027 	 * does use nonblocking io and different threads are reading each
3028 	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3029 	 * in this case as long as the read queue doesn't get empty.
3030 	 * In this implementation the thread using nonblocking io can
3031 	 * get an EWOULDBLOCK error due to the blocking thread executing
3032 	 * e.g. in the uiomove in kstrgetmsg.
3033 	 * This difference is not believed to be significant.
3034 	 */
3035 	/* Set SOREADLOCKED */
3036 	error = so_lock_read_intr(so,
3037 	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3038 	mutex_exit(&so->so_lock);
3039 	if (error)
3040 		return (error);
3041 
3042 	/*
3043 	 * Tell kstrgetmsg to not inspect the stream head errors until all
3044 	 * queued data has been consumed.
3045 	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3046 	 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3047 	 *
3048 	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3049 	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3050 	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3051 	 */
3052 	pflag = MSG_ANY | MSG_DELAYERROR;
3053 	if (flags & MSG_PEEK) {
3054 		pflag |= MSG_IPEEK;
3055 		flags &= ~MSG_WAITALL;
3056 	}
3057 	if (so->so_mode & SM_ATOMIC)
3058 		pflag |= MSG_DISCARDTAIL;
3059 
3060 	if (flags & MSG_DONTWAIT)
3061 		timout = 0;
3062 	else
3063 		timout = -1;
3064 	opflag = pflag;
3065 	first = 1;
3066 
3067 retry:
3068 	saved_resid = uiop->uio_resid;
3069 	pri = 0;
3070 	mp = NULL;
3071 	if (so->so_nl7c_rcv_mp != NULL) {
3072 		/* Already kstrgetmsg()ed saved mblk(s) from NL7C */
3073 		error = nl7c_sorecv(so, &mp, uiop, &rval);
3074 	} else {
3075 		error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3076 		    timout, &rval);
3077 	}
3078 	if (error) {
3079 		switch (error) {
3080 		case EINTR:
3081 		case EWOULDBLOCK:
3082 			if (!first)
3083 				error = 0;
3084 			break;
3085 		case ETIME:
3086 			/* Returned from kstrgetmsg when timeout expires */
3087 			if (!first)
3088 				error = 0;
3089 			else
3090 				error = EWOULDBLOCK;
3091 			break;
3092 		default:
3093 			eprintsoline(so, error);
3094 			break;
3095 		}
3096 		mutex_enter(&so->so_lock);
3097 		so_unlock_read(so);	/* Clear SOREADLOCKED */
3098 		mutex_exit(&so->so_lock);
3099 		return (error);
3100 	}
3101 	/*
3102 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3103 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
3104 	 */
3105 	ASSERT(!(rval.r_val1 & MORECTL));
3106 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3107 		msg->msg_flags |= MSG_TRUNC;
3108 
3109 	if (mp == NULL) {
3110 		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3111 		/*
3112 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3113 		 * The draft Posix socket spec states that the mark should
3114 		 * not be cleared when peeking. We follow the latter.
3115 		 */
3116 		if ((so->so_state &
3117 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3118 		    (uiop->uio_resid != saved_resid) &&
3119 		    !(flags & MSG_PEEK)) {
3120 			sorecv_update_oobstate(so);
3121 		}
3122 
3123 		mutex_enter(&so->so_lock);
3124 		/* Set MSG_EOR based on MOREDATA */
3125 		if (!(rval.r_val1 & MOREDATA)) {
3126 			if (so->so_state & SS_SAVEDEOR) {
3127 				msg->msg_flags |= MSG_EOR;
3128 				so->so_state &= ~SS_SAVEDEOR;
3129 			}
3130 		}
3131 		/*
3132 		 * If some data was received (i.e. not EOF) and the
3133 		 * read/recv* has not been satisfied wait for some more.
3134 		 */
3135 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3136 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3137 			mutex_exit(&so->so_lock);
3138 			first = 0;
3139 			pflag = opflag | MSG_NOMARK;
3140 			goto retry;
3141 		}
3142 		so_unlock_read(so);	/* Clear SOREADLOCKED */
3143 		mutex_exit(&so->so_lock);
3144 		return (0);
3145 	}
3146 
3147 	/* strsock_proto has already verified length and alignment */
3148 	tpr = (union T_primitives *)mp->b_rptr;
3149 	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3150 
3151 	switch (tpr->type) {
3152 	case T_DATA_IND: {
3153 		if ((so->so_state &
3154 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3155 		    (uiop->uio_resid != saved_resid) &&
3156 		    !(flags & MSG_PEEK)) {
3157 			sorecv_update_oobstate(so);
3158 		}
3159 
3160 		/*
3161 		 * Set msg_flags to MSG_EOR based on
3162 		 * MORE_flag and MOREDATA.
3163 		 */
3164 		mutex_enter(&so->so_lock);
3165 		so->so_state &= ~SS_SAVEDEOR;
3166 		if (!(tpr->data_ind.MORE_flag & 1)) {
3167 			if (!(rval.r_val1 & MOREDATA))
3168 				msg->msg_flags |= MSG_EOR;
3169 			else
3170 				so->so_state |= SS_SAVEDEOR;
3171 		}
3172 		freemsg(mp);
3173 		/*
3174 		 * If some data was received (i.e. not EOF) and the
3175 		 * read/recv* has not been satisfied wait for some more.
3176 		 */
3177 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3178 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3179 			mutex_exit(&so->so_lock);
3180 			first = 0;
3181 			pflag = opflag | MSG_NOMARK;
3182 			goto retry;
3183 		}
3184 		so_unlock_read(so);	/* Clear SOREADLOCKED */
3185 		mutex_exit(&so->so_lock);
3186 		return (0);
3187 	}
3188 	case T_UNITDATA_IND: {
3189 		void *addr;
3190 		t_uscalar_t addrlen;
3191 		void *abuf;
3192 		t_uscalar_t optlen;
3193 		void *opt;
3194 
3195 		if ((so->so_state &
3196 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3197 		    (uiop->uio_resid != saved_resid) &&
3198 		    !(flags & MSG_PEEK)) {
3199 			sorecv_update_oobstate(so);
3200 		}
3201 
3202 		if (namelen != 0) {
3203 			/* Caller wants source address */
3204 			addrlen = tpr->unitdata_ind.SRC_length;
3205 			addr = sogetoff(mp,
3206 			    tpr->unitdata_ind.SRC_offset,
3207 			    addrlen, 1);
3208 			if (addr == NULL) {
3209 				freemsg(mp);
3210 				error = EPROTO;
3211 				eprintsoline(so, error);
3212 				goto err;
3213 			}
3214 			if (so->so_family == AF_UNIX) {
3215 				/*
3216 				 * Can not use the transport level address.
3217 				 * If there is a SO_SRCADDR option carrying
3218 				 * the socket level address it will be
3219 				 * extracted below.
3220 				 */
3221 				addr = NULL;
3222 				addrlen = 0;
3223 			}
3224 		}
3225 		optlen = tpr->unitdata_ind.OPT_length;
3226 		if (optlen != 0) {
3227 			t_uscalar_t ncontrollen;
3228 
3229 			/*
3230 			 * Extract any source address option.
3231 			 * Determine how large cmsg buffer is needed.
3232 			 */
3233 			opt = sogetoff(mp,
3234 			    tpr->unitdata_ind.OPT_offset,
3235 			    optlen, __TPI_ALIGN_SIZE);
3236 
3237 			if (opt == NULL) {
3238 				freemsg(mp);
3239 				error = EPROTO;
3240 				eprintsoline(so, error);
3241 				goto err;
3242 			}
3243 			if (so->so_family == AF_UNIX)
3244 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3245 			ncontrollen = so_cmsglen(mp, opt, optlen,
3246 			    !(flags & MSG_XPG4_2));
3247 			if (controllen != 0)
3248 				controllen = ncontrollen;
3249 			else if (ncontrollen != 0)
3250 				msg->msg_flags |= MSG_CTRUNC;
3251 		} else {
3252 			controllen = 0;
3253 		}
3254 
3255 		if (namelen != 0) {
3256 			/*
3257 			 * Return address to caller.
3258 			 * Caller handles truncation if length
3259 			 * exceeds msg_namelen.
3260 			 * NOTE: AF_UNIX NUL termination is ensured by
3261 			 * the sender's copyin_name().
3262 			 */
3263 			abuf = kmem_alloc(addrlen, KM_SLEEP);
3264 
3265 			bcopy(addr, abuf, addrlen);
3266 			msg->msg_name = abuf;
3267 			msg->msg_namelen = addrlen;
3268 		}
3269 
3270 		if (controllen != 0) {
3271 			/*
3272 			 * Return control msg to caller.
3273 			 * Caller handles truncation if length
3274 			 * exceeds msg_controllen.
3275 			 */
3276 			control = kmem_zalloc(controllen, KM_SLEEP);
3277 
3278 			error = so_opt2cmsg(mp, opt, optlen,
3279 			    !(flags & MSG_XPG4_2),
3280 			    control, controllen);
3281 			if (error) {
3282 				freemsg(mp);
3283 				if (msg->msg_namelen != 0)
3284 					kmem_free(msg->msg_name,
3285 					    msg->msg_namelen);
3286 				kmem_free(control, controllen);
3287 				eprintsoline(so, error);
3288 				goto err;
3289 			}
3290 			msg->msg_control = control;
3291 			msg->msg_controllen = controllen;
3292 		}
3293 
3294 		freemsg(mp);
3295 		mutex_enter(&so->so_lock);
3296 		so_unlock_read(so);	/* Clear SOREADLOCKED */
3297 		mutex_exit(&so->so_lock);
3298 		return (0);
3299 	}
3300 	case T_OPTDATA_IND: {
3301 		struct T_optdata_req *tdr;
3302 		void *opt;
3303 		t_uscalar_t optlen;
3304 
3305 		if ((so->so_state &
3306 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3307 		    (uiop->uio_resid != saved_resid) &&
3308 		    !(flags & MSG_PEEK)) {
3309 			sorecv_update_oobstate(so);
3310 		}
3311 
3312 		tdr = (struct T_optdata_req *)mp->b_rptr;
3313 		optlen = tdr->OPT_length;
3314 		if (optlen != 0) {
3315 			t_uscalar_t ncontrollen;
3316 			/*
3317 			 * Determine how large cmsg buffer is needed.
3318 			 */
3319 			opt = sogetoff(mp,
3320 			    tpr->optdata_ind.OPT_offset,
3321 			    optlen, __TPI_ALIGN_SIZE);
3322 
3323 			if (opt == NULL) {
3324 				freemsg(mp);
3325 				error = EPROTO;
3326 				eprintsoline(so, error);
3327 				goto err;
3328 			}
3329 
3330 			ncontrollen = so_cmsglen(mp, opt, optlen,
3331 			    !(flags & MSG_XPG4_2));
3332 			if (controllen != 0)
3333 				controllen = ncontrollen;
3334 			else if (ncontrollen != 0)
3335 				msg->msg_flags |= MSG_CTRUNC;
3336 		} else {
3337 			controllen = 0;
3338 		}
3339 
3340 		if (controllen != 0) {
3341 			/*
3342 			 * Return control msg to caller.
3343 			 * Caller handles truncation if length
3344 			 * exceeds msg_controllen.
3345 			 */
3346 			control = kmem_zalloc(controllen, KM_SLEEP);
3347 
3348 			error = so_opt2cmsg(mp, opt, optlen,
3349 			    !(flags & MSG_XPG4_2),
3350 			    control, controllen);
3351 			if (error) {
3352 				freemsg(mp);
3353 				kmem_free(control, controllen);
3354 				eprintsoline(so, error);
3355 				goto err;
3356 			}
3357 			msg->msg_control = control;
3358 			msg->msg_controllen = controllen;
3359 		}
3360 
3361 		/*
3362 		 * Set msg_flags to MSG_EOR based on
3363 		 * DATA_flag and MOREDATA.
3364 		 */
3365 		mutex_enter(&so->so_lock);
3366 		so->so_state &= ~SS_SAVEDEOR;
3367 		if (!(tpr->data_ind.MORE_flag & 1)) {
3368 			if (!(rval.r_val1 & MOREDATA))
3369 				msg->msg_flags |= MSG_EOR;
3370 			else
3371 				so->so_state |= SS_SAVEDEOR;
3372 		}
3373 		freemsg(mp);
3374 		/*
3375 		 * If some data was received (i.e. not EOF) and the
3376 		 * read/recv* has not been satisfied wait for some more.
3377 		 * Not possible to wait if control info was received.
3378 		 */
3379 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3380 		    controllen == 0 &&
3381 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3382 			mutex_exit(&so->so_lock);
3383 			first = 0;
3384 			pflag = opflag | MSG_NOMARK;
3385 			goto retry;
3386 		}
3387 		so_unlock_read(so);	/* Clear SOREADLOCKED */
3388 		mutex_exit(&so->so_lock);
3389 		return (0);
3390 	}
3391 	case T_EXDATA_IND: {
3392 		dprintso(so, 1,
3393 		    ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3394 		    "state %s\n",
3395 		    so->so_oobsigcnt, so->so_oobcnt,
3396 		    saved_resid - uiop->uio_resid,
3397 		    pr_state(so->so_state, so->so_mode)));
3398 		/*
3399 		 * kstrgetmsg handles MSGMARK so there is nothing to
3400 		 * inspect in the T_EXDATA_IND.
3401 		 * strsock_proto makes the stream head queue the T_EXDATA_IND
3402 		 * as a separate message with no M_DATA component. Furthermore,
3403 		 * the stream head does not consolidate M_DATA messages onto
3404 		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3405 		 * remains a message by itself. This is needed since MSGMARK
3406 		 * marks both the whole message as well as the last byte
3407 		 * of the message.
3408 		 */
3409 		freemsg(mp);
3410 		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
3411 		if (flags & MSG_PEEK) {
3412 			/*
3413 			 * Even though we are peeking we consume the
3414 			 * T_EXDATA_IND thereby moving the mark information
3415 			 * to SS_RCVATMARK. Then the oob code below will
3416 			 * retry the peeking kstrgetmsg.
3417 			 * Note that the stream head read queue is
3418 			 * never flushed without holding SOREADLOCKED
3419 			 * thus the T_EXDATA_IND can not disappear
3420 			 * underneath us.
3421 			 */
3422 			dprintso(so, 1,
3423 			    ("sotpi_recvmsg: consume EXDATA_IND "
3424 			    "counts %d/%d state %s\n",
3425 			    so->so_oobsigcnt,
3426 			    so->so_oobcnt,
3427 			    pr_state(so->so_state, so->so_mode)));
3428 
3429 			pflag = MSG_ANY | MSG_DELAYERROR;
3430 			if (so->so_mode & SM_ATOMIC)
3431 				pflag |= MSG_DISCARDTAIL;
3432 
3433 			pri = 0;
3434 			mp = NULL;
3435 
3436 			error = kstrgetmsg(SOTOV(so), &mp, uiop,
3437 			    &pri, &pflag, (clock_t)-1, &rval);
3438 			ASSERT(uiop->uio_resid == saved_resid);
3439 
3440 			if (error) {
3441 #ifdef SOCK_DEBUG
3442 				if (error != EWOULDBLOCK && error != EINTR) {
3443 					eprintsoline(so, error);
3444 				}
3445 #endif /* SOCK_DEBUG */
3446 				mutex_enter(&so->so_lock);
3447 				so_unlock_read(so);	/* Clear SOREADLOCKED */
3448 				mutex_exit(&so->so_lock);
3449 				return (error);
3450 			}
3451 			ASSERT(mp);
3452 			tpr = (union T_primitives *)mp->b_rptr;
3453 			ASSERT(tpr->type == T_EXDATA_IND);
3454 			freemsg(mp);
3455 		} /* end "if (flags & MSG_PEEK)" */
3456 
3457 		/*
3458 		 * Decrement the number of queued and pending oob.
3459 		 *
3460 		 * SS_RCVATMARK is cleared when we read past a mark.
3461 		 * SS_HAVEOOBDATA is cleared when we've read past the
3462 		 * last mark.
3463 		 * SS_OOBPEND is cleared if we've read past the last
3464 		 * mark and no (new) SIGURG has been posted.
3465 		 */
3466 		mutex_enter(&so->so_lock);
3467 		ASSERT(so_verify_oobstate(so));
3468 		ASSERT(so->so_oobsigcnt >= so->so_oobcnt);
3469 		ASSERT(so->so_oobsigcnt > 0);
3470 		so->so_oobsigcnt--;
3471 		ASSERT(so->so_oobcnt > 0);
3472 		so->so_oobcnt--;
3473 		/*
3474 		 * Since the T_EXDATA_IND has been removed from the stream
3475 		 * head, but we have not read data past the mark,
3476 		 * sockfs needs to track that the socket is still at the mark.
3477 		 *
3478 		 * Since no data was received call kstrgetmsg again to wait
3479 		 * for data.
3480 		 */
3481 		so->so_state |= SS_RCVATMARK;
3482 		mutex_exit(&so->so_lock);
3483 		dprintso(so, 1,
3484 		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3485 		    so->so_oobsigcnt, so->so_oobcnt,
3486 		    pr_state(so->so_state, so->so_mode)));
3487 		pflag = opflag;
3488 		goto retry;
3489 	}
3490 	default:
3491 		ASSERT(0);
3492 		freemsg(mp);
3493 		error = EPROTO;
3494 		eprintsoline(so, error);
3495 		goto err;
3496 	}
3497 	/* NOTREACHED */
3498 err:
3499 	mutex_enter(&so->so_lock);
3500 	so_unlock_read(so);	/* Clear SOREADLOCKED */
3501 	mutex_exit(&so->so_lock);
3502 	return (error);
3503 }
3504 
3505 /*
3506  * Sending data with options on a datagram socket.
3507  * Assumes caller has verified that SS_ISBOUND etc. are set.
3508  */
3509 static int
3510 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3511     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3512 {
3513 	struct T_unitdata_req	tudr;
3514 	mblk_t			*mp;
3515 	int			error;
3516 	void			*addr;
3517 	socklen_t		addrlen;
3518 	void			*src;
3519 	socklen_t		srclen;
3520 	ssize_t			len;
3521 	int			size;
3522 	struct T_opthdr		toh;
3523 	struct fdbuf		*fdbuf;
3524 	t_uscalar_t		optlen;
3525 	void			*fds;
3526 	int			fdlen;
3527 
3528 	ASSERT(name && namelen);
3529 	ASSERT(control && controllen);
3530 
3531 	len = uiop->uio_resid;
3532 	if (len > (ssize_t)so->so_tidu_size) {
3533 		return (EMSGSIZE);
3534 	}
3535 
3536 	/*
3537 	 * For AF_UNIX the destination address is translated to an internal
3538 	 * name and the source address is passed as an option.
3539 	 * Also, file descriptors are passed as file pointers in an
3540 	 * option.
3541 	 */
3542 
3543 	/*
3544 	 * Length and family checks.
3545 	 */
3546 	error = so_addr_verify(so, name, namelen);
3547 	if (error) {
3548 		eprintsoline(so, error);
3549 		return (error);
3550 	}
3551 	if (so->so_family == AF_UNIX) {
3552 		if (so->so_state & SS_FADDR_NOXLATE) {
3553 			/*
3554 			 * Already have a transport internal address. Do not
3555 			 * pass any (transport internal) source address.
3556 			 */
3557 			addr = name;
3558 			addrlen = namelen;
3559 			src = NULL;
3560 			srclen = 0;
3561 		} else {
3562 			/*
3563 			 * Pass the sockaddr_un source address as an option
3564 			 * and translate the remote address.
3565 			 *
3566 			 * Note that this code does not prevent so_laddr_sa
3567 			 * from changing while it is being used. Thus
3568 			 * if an unbind+bind occurs concurrently with this
3569 			 * send the peer might see a partially new and a
3570 			 * partially old "from" address.
3571 			 */
3572 			src = so->so_laddr_sa;
3573 			srclen = (t_uscalar_t)so->so_laddr_len;
3574 			dprintso(so, 1,
3575 			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3576 			    srclen, src));
3577 			error = so_ux_addr_xlate(so, name, namelen,
3578 			    (flags & MSG_XPG4_2),
3579 			    &addr, &addrlen);
3580 			if (error) {
3581 				eprintsoline(so, error);
3582 				return (error);
3583 			}
3584 		}
3585 	} else {
3586 		addr = name;
3587 		addrlen = namelen;
3588 		src = NULL;
3589 		srclen = 0;
3590 	}
3591 	optlen = so_optlen(control, controllen,
3592 	    !(flags & MSG_XPG4_2));
3593 	tudr.PRIM_type = T_UNITDATA_REQ;
3594 	tudr.DEST_length = addrlen;
3595 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3596 	if (srclen != 0)
3597 		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3598 		    _TPI_ALIGN_TOPT(srclen));
3599 	else
3600 		tudr.OPT_length = optlen;
3601 	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3602 	    _TPI_ALIGN_TOPT(addrlen));
3603 
3604 	size = tudr.OPT_offset + tudr.OPT_length;
3605 
3606 	/*
3607 	 * File descriptors only when SM_FDPASSING set.
3608 	 */
3609 	error = so_getfdopt(control, controllen,
3610 	    !(flags & MSG_XPG4_2), &fds, &fdlen);
3611 	if (error)
3612 		return (error);
3613 	if (fdlen != -1) {
3614 		if (!(so->so_mode & SM_FDPASSING))
3615 			return (EOPNOTSUPP);
3616 
3617 		error = fdbuf_create(fds, fdlen, &fdbuf);
3618 		if (error)
3619 			return (error);
3620 		mp = fdbuf_allocmsg(size, fdbuf);
3621 	} else {
3622 		mp = soallocproto(size, _ALLOC_INTR);
3623 		if (mp == NULL) {
3624 			/*
3625 			 * Caught a signal waiting for memory.
3626 			 * Let send* return EINTR.
3627 			 */
3628 			return (EINTR);
3629 		}
3630 	}
3631 	soappendmsg(mp, &tudr, sizeof (tudr));
3632 	soappendmsg(mp, addr, addrlen);
3633 	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3634 
3635 	if (fdlen != -1) {
3636 		ASSERT(fdbuf != NULL);
3637 		toh.level = SOL_SOCKET;
3638 		toh.name = SO_FILEP;
3639 		toh.len = fdbuf->fd_size +
3640 		    (t_uscalar_t)sizeof (struct T_opthdr);
3641 		toh.status = 0;
3642 		soappendmsg(mp, &toh, sizeof (toh));
3643 		soappendmsg(mp, fdbuf, fdbuf->fd_size);
3644 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3645 	}
3646 	if (srclen != 0) {
3647 		/*
3648 		 * There is a AF_UNIX sockaddr_un to include as a source
3649 		 * address option.
3650 		 */
3651 		toh.level = SOL_SOCKET;
3652 		toh.name = SO_SRCADDR;
3653 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3654 		toh.status = 0;
3655 		soappendmsg(mp, &toh, sizeof (toh));
3656 		soappendmsg(mp, src, srclen);
3657 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3658 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3659 	}
3660 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3661 	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3662 	/* At most 3 bytes left in the message */
3663 	ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3664 	ASSERT(MBLKL(mp) <= (ssize_t)size);
3665 
3666 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3667 #ifdef C2_AUDIT
3668 	if (audit_active)
3669 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3670 #endif /* C2_AUDIT */
3671 
3672 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3673 #ifdef SOCK_DEBUG
3674 	if (error) {
3675 		eprintsoline(so, error);
3676 	}
3677 #endif /* SOCK_DEBUG */
3678 	return (error);
3679 }
3680 
3681 /*
3682  * Sending data with options on a connected stream socket.
3683  * Assumes caller has verified that SS_ISCONNECTED is set.
3684  */
3685 static int
3686 sosend_svccmsg(struct sonode *so,
3687 		struct uio *uiop,
3688 		int more,
3689 		void *control,
3690 		t_uscalar_t controllen,
3691 		int flags)
3692 {
3693 	struct T_optdata_req	tdr;
3694 	mblk_t			*mp;
3695 	int			error;
3696 	ssize_t			iosize;
3697 	int			first = 1;
3698 	int			size;
3699 	struct fdbuf		*fdbuf;
3700 	t_uscalar_t		optlen;
3701 	void			*fds;
3702 	int			fdlen;
3703 	struct T_opthdr		toh;
3704 
3705 	dprintso(so, 1,
3706 	    ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3707 
3708 	/*
3709 	 * Has to be bound and connected. However, since no locks are
3710 	 * held the state could have changed after sotpi_sendmsg checked it
3711 	 * thus it is not possible to ASSERT on the state.
3712 	 */
3713 
3714 	/* Options on connection-oriented only when SM_OPTDATA set. */
3715 	if (!(so->so_mode & SM_OPTDATA))
3716 		return (EOPNOTSUPP);
3717 
3718 	do {
3719 		/*
3720 		 * Set the MORE flag if uio_resid does not fit in this
3721 		 * message or if the caller passed in "more".
3722 		 * Error for transports with zero tidu_size.
3723 		 */
3724 		tdr.PRIM_type = T_OPTDATA_REQ;
3725 		iosize = so->so_tidu_size;
3726 		if (iosize <= 0)
3727 			return (EMSGSIZE);
3728 		if (uiop->uio_resid > iosize) {
3729 			tdr.DATA_flag = 1;
3730 		} else {
3731 			if (more)
3732 				tdr.DATA_flag = 1;
3733 			else
3734 				tdr.DATA_flag = 0;
3735 			iosize = uiop->uio_resid;
3736 		}
3737 		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3738 		    tdr.DATA_flag, iosize));
3739 
3740 		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3741 		tdr.OPT_length = optlen;
3742 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3743 
3744 		size = (int)sizeof (tdr) + optlen;
3745 		/*
3746 		 * File descriptors only when SM_FDPASSING set.
3747 		 */
3748 		error = so_getfdopt(control, controllen,
3749 		    !(flags & MSG_XPG4_2), &fds, &fdlen);
3750 		if (error)
3751 			return (error);
3752 		if (fdlen != -1) {
3753 			if (!(so->so_mode & SM_FDPASSING))
3754 				return (EOPNOTSUPP);
3755 
3756 			error = fdbuf_create(fds, fdlen, &fdbuf);
3757 			if (error)
3758 				return (error);
3759 			mp = fdbuf_allocmsg(size, fdbuf);
3760 		} else {
3761 			mp = soallocproto(size, _ALLOC_INTR);
3762 			if (mp == NULL) {
3763 				/*
3764 				 * Caught a signal waiting for memory.
3765 				 * Let send* return EINTR.
3766 				 */
3767 				return (first ? EINTR : 0);
3768 			}
3769 		}
3770 		soappendmsg(mp, &tdr, sizeof (tdr));
3771 
3772 		if (fdlen != -1) {
3773 			ASSERT(fdbuf != NULL);
3774 			toh.level = SOL_SOCKET;
3775 			toh.name = SO_FILEP;
3776 			toh.len = fdbuf->fd_size +
3777 			    (t_uscalar_t)sizeof (struct T_opthdr);
3778 			toh.status = 0;
3779 			soappendmsg(mp, &toh, sizeof (toh));
3780 			soappendmsg(mp, fdbuf, fdbuf->fd_size);
3781 			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3782 		}
3783 		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3784 		/* At most 3 bytes left in the message */
3785 		ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3786 		ASSERT(MBLKL(mp) <= (ssize_t)size);
3787 
3788 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3789 
3790 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3791 		    0, MSG_BAND, 0);
3792 		if (error) {
3793 			if (!first && error == EWOULDBLOCK)
3794 				return (0);
3795 			eprintsoline(so, error);
3796 			return (error);
3797 		}
3798 		control = NULL;
3799 		first = 0;
3800 		if (uiop->uio_resid > 0) {
3801 			/*
3802 			 * Recheck for fatal errors. Fail write even though
3803 			 * some data have been written. This is consistent
3804 			 * with strwrite semantics and BSD sockets semantics.
3805 			 */
3806 			if (so->so_state & SS_CANTSENDMORE) {
3807 				tsignal(curthread, SIGPIPE);
3808 				eprintsoline(so, error);
3809 				return (EPIPE);
3810 			}
3811 			if (so->so_error != 0) {
3812 				mutex_enter(&so->so_lock);
3813 				error = sogeterr(so);
3814 				mutex_exit(&so->so_lock);
3815 				if (error != 0) {
3816 					eprintsoline(so, error);
3817 					return (error);
3818 				}
3819 			}
3820 		}
3821 	} while (uiop->uio_resid > 0);
3822 	return (0);
3823 }
3824 
3825 /*
3826  * Sending data on a datagram socket.
3827  * Assumes caller has verified that SS_ISBOUND etc. are set.
3828  *
3829  * For AF_UNIX the destination address is translated to an internal
3830  * name and the source address is passed as an option.
3831  */
3832 int
3833 sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
3834     struct uio *uiop, int flags)
3835 {
3836 	struct T_unitdata_req	tudr;
3837 	mblk_t			*mp;
3838 	int			error;
3839 	void			*addr;
3840 	socklen_t		addrlen;
3841 	void			*src;
3842 	socklen_t		srclen;
3843 	ssize_t			len;
3844 
3845 	ASSERT(name != NULL && namelen != 0);
3846 
3847 	len = uiop->uio_resid;
3848 	if (len > so->so_tidu_size) {
3849 		error = EMSGSIZE;
3850 		goto done;
3851 	}
3852 
3853 	/* Length and family checks */
3854 	error = so_addr_verify(so, name, namelen);
3855 	if (error != 0)
3856 		goto done;
3857 
3858 	if (so->so_state & SS_DIRECT)
3859 		return (sodgram_direct(so, name, namelen, uiop, flags));
3860 
3861 	if (so->so_family == AF_UNIX) {
3862 		if (so->so_state & SS_FADDR_NOXLATE) {
3863 			/*
3864 			 * Already have a transport internal address. Do not
3865 			 * pass any (transport internal) source address.
3866 			 */
3867 			addr = name;
3868 			addrlen = namelen;
3869 			src = NULL;
3870 			srclen = 0;
3871 		} else {
3872 			/*
3873 			 * Pass the sockaddr_un source address as an option
3874 			 * and translate the remote address.
3875 			 *
3876 			 * Note that this code does not prevent so_laddr_sa
3877 			 * from changing while it is being used. Thus
3878 			 * if an unbind+bind occurs concurrently with this
3879 			 * send the peer might see a partially new and a
3880 			 * partially old "from" address.
3881 			 */
3882 			src = so->so_laddr_sa;
3883 			srclen = (socklen_t)so->so_laddr_len;
3884 			dprintso(so, 1,
3885 			    ("sosend_dgram UNIX: srclen %d, src %p\n",
3886 			    srclen, src));
3887 			error = so_ux_addr_xlate(so, name, namelen,
3888 			    (flags & MSG_XPG4_2),
3889 			    &addr, &addrlen);
3890 			if (error) {
3891 				eprintsoline(so, error);
3892 				goto done;
3893 			}
3894 		}
3895 	} else {
3896 		addr = name;
3897 		addrlen = namelen;
3898 		src = NULL;
3899 		srclen = 0;
3900 	}
3901 	tudr.PRIM_type = T_UNITDATA_REQ;
3902 	tudr.DEST_length = addrlen;
3903 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3904 	if (srclen == 0) {
3905 		tudr.OPT_length = 0;
3906 		tudr.OPT_offset = 0;
3907 
3908 		mp = soallocproto2(&tudr, sizeof (tudr),
3909 		    addr, addrlen, 0, _ALLOC_INTR);
3910 		if (mp == NULL) {
3911 			/*
3912 			 * Caught a signal waiting for memory.
3913 			 * Let send* return EINTR.
3914 			 */
3915 			error = EINTR;
3916 			goto done;
3917 		}
3918 	} else {
3919 		/*
3920 		 * There is a AF_UNIX sockaddr_un to include as a source
3921 		 * address option.
3922 		 */
3923 		struct T_opthdr toh;
3924 		ssize_t size;
3925 
3926 		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
3927 		    _TPI_ALIGN_TOPT(srclen));
3928 		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3929 		    _TPI_ALIGN_TOPT(addrlen));
3930 
3931 		toh.level = SOL_SOCKET;
3932 		toh.name = SO_SRCADDR;
3933 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3934 		toh.status = 0;
3935 
3936 		size = tudr.OPT_offset + tudr.OPT_length;
3937 		mp = soallocproto2(&tudr, sizeof (tudr),
3938 		    addr, addrlen, size, _ALLOC_INTR);
3939 		if (mp == NULL) {
3940 			/*
3941 			 * Caught a signal waiting for memory.
3942 			 * Let send* return EINTR.
3943 			 */
3944 			error = EINTR;
3945 			goto done;
3946 		}
3947 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3948 		soappendmsg(mp, &toh, sizeof (toh));
3949 		soappendmsg(mp, src, srclen);
3950 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3951 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3952 	}
3953 
3954 #ifdef C2_AUDIT
3955 	if (audit_active)
3956 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3957 #endif /* C2_AUDIT */
3958 
3959 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3960 done:
3961 #ifdef SOCK_DEBUG
3962 	if (error) {
3963 		eprintsoline(so, error);
3964 	}
3965 #endif /* SOCK_DEBUG */
3966 	return (error);
3967 }
3968 
3969 /*
3970  * Sending data on a connected stream socket.
3971  * Assumes caller has verified that SS_ISCONNECTED is set.
3972  */
3973 int
3974 sosend_svc(struct sonode *so,
3975 	struct uio *uiop,
3976 	t_scalar_t prim,
3977 	int more,
3978 	int sflag)
3979 {
3980 	struct T_data_req	tdr;
3981 	mblk_t			*mp;
3982 	int			error;
3983 	ssize_t			iosize;
3984 	int			first = 1;
3985 
3986 	dprintso(so, 1,
3987 	    ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
3988 	    so, uiop->uio_resid, prim, sflag));
3989 
3990 	/*
3991 	 * Has to be bound and connected. However, since no locks are
3992 	 * held the state could have changed after sotpi_sendmsg checked it
3993 	 * thus it is not possible to ASSERT on the state.
3994 	 */
3995 
3996 	do {
3997 		/*
3998 		 * Set the MORE flag if uio_resid does not fit in this
3999 		 * message or if the caller passed in "more".
4000 		 * Error for transports with zero tidu_size.
4001 		 */
4002 		tdr.PRIM_type = prim;
4003 		iosize = so->so_tidu_size;
4004 		if (iosize <= 0)
4005 			return (EMSGSIZE);
4006 		if (uiop->uio_resid > iosize) {
4007 			tdr.MORE_flag = 1;
4008 		} else {
4009 			if (more)
4010 				tdr.MORE_flag = 1;
4011 			else
4012 				tdr.MORE_flag = 0;
4013 			iosize = uiop->uio_resid;
4014 		}
4015 		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4016 		    prim, tdr.MORE_flag, iosize));
4017 		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR);
4018 		if (mp == NULL) {
4019 			/*
4020 			 * Caught a signal waiting for memory.
4021 			 * Let send* return EINTR.
4022 			 */
4023 			if (first)
4024 				return (EINTR);
4025 			else
4026 				return (0);
4027 		}
4028 
4029 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4030 		    0, sflag | MSG_BAND, 0);
4031 		if (error) {
4032 			if (!first && error == EWOULDBLOCK)
4033 				return (0);
4034 			eprintsoline(so, error);
4035 			return (error);
4036 		}
4037 		first = 0;
4038 		if (uiop->uio_resid > 0) {
4039 			/*
4040 			 * Recheck for fatal errors. Fail write even though
4041 			 * some data have been written. This is consistent
4042 			 * with strwrite semantics and BSD sockets semantics.
4043 			 */
4044 			if (so->so_state & SS_CANTSENDMORE) {
4045 				tsignal(curthread, SIGPIPE);
4046 				eprintsoline(so, error);
4047 				return (EPIPE);
4048 			}
4049 			if (so->so_error != 0) {
4050 				mutex_enter(&so->so_lock);
4051 				error = sogeterr(so);
4052 				mutex_exit(&so->so_lock);
4053 				if (error != 0) {
4054 					eprintsoline(so, error);
4055 					return (error);
4056 				}
4057 			}
4058 		}
4059 	} while (uiop->uio_resid > 0);
4060 	return (0);
4061 }
4062 
4063 /*
4064  * Check the state for errors and call the appropriate send function.
4065  *
4066  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4067  * this function issues a setsockopt to toggle SO_DONTROUTE before and
4068  * after sending the message.
4069  */
4070 static int
4071 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
4072 {
4073 	int		so_state;
4074 	int		so_mode;
4075 	int		error;
4076 	struct sockaddr *name;
4077 	t_uscalar_t	namelen;
4078 	int		dontroute;
4079 	int		flags;
4080 
4081 	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4082 	    so, msg, msg->msg_flags,
4083 	    pr_state(so->so_state, so->so_mode), so->so_error));
4084 
4085 	mutex_enter(&so->so_lock);
4086 	so_state = so->so_state;
4087 
4088 	if (so_state & SS_CANTSENDMORE) {
4089 		mutex_exit(&so->so_lock);
4090 		tsignal(curthread, SIGPIPE);
4091 		return (EPIPE);
4092 	}
4093 
4094 	if (so->so_error != 0) {
4095 		error = sogeterr(so);
4096 		if (error != 0) {
4097 			mutex_exit(&so->so_lock);
4098 			return (error);
4099 		}
4100 	}
4101 
4102 	name = (struct sockaddr *)msg->msg_name;
4103 	namelen = msg->msg_namelen;
4104 
4105 	so_mode = so->so_mode;
4106 
4107 	if (name == NULL) {
4108 		if (!(so_state & SS_ISCONNECTED)) {
4109 			mutex_exit(&so->so_lock);
4110 			if (so_mode & SM_CONNREQUIRED)
4111 				return (ENOTCONN);
4112 			else
4113 				return (EDESTADDRREQ);
4114 		}
4115 		if (so_mode & SM_CONNREQUIRED) {
4116 			name = NULL;
4117 			namelen = 0;
4118 		} else {
4119 			/*
4120 			 * Note that this code does not prevent so_faddr_sa
4121 			 * from changing while it is being used. Thus
4122 			 * if an "unconnect"+connect occurs concurrently with
4123 			 * this send the datagram might be delivered to a
4124 			 * garbaled address.
4125 			 */
4126 			ASSERT(so->so_faddr_sa);
4127 			name = so->so_faddr_sa;
4128 			namelen = (t_uscalar_t)so->so_faddr_len;
4129 		}
4130 	} else {
4131 		if (!(so_state & SS_ISCONNECTED) &&
4132 		    (so_mode & SM_CONNREQUIRED)) {
4133 			/* Required but not connected */
4134 			mutex_exit(&so->so_lock);
4135 			return (ENOTCONN);
4136 		}
4137 		/*
4138 		 * Ignore the address on connection-oriented sockets.
4139 		 * Just like BSD this code does not generate an error for
4140 		 * TCP (a CONNREQUIRED socket) when sending to an address
4141 		 * passed in with sendto/sendmsg. Instead the data is
4142 		 * delivered on the connection as if no address had been
4143 		 * supplied.
4144 		 */
4145 		if ((so_state & SS_ISCONNECTED) &&
4146 		    !(so_mode & SM_CONNREQUIRED)) {
4147 			mutex_exit(&so->so_lock);
4148 			return (EISCONN);
4149 		}
4150 		if (!(so_state & SS_ISBOUND)) {
4151 			so_lock_single(so);	/* Set SOLOCKED */
4152 			error = sotpi_bind(so, NULL, 0,
4153 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD);
4154 			so_unlock_single(so, SOLOCKED);
4155 			if (error) {
4156 				mutex_exit(&so->so_lock);
4157 				eprintsoline(so, error);
4158 				return (error);
4159 			}
4160 		}
4161 		/*
4162 		 * Handle delayed datagram errors. These are only queued
4163 		 * when the application sets SO_DGRAM_ERRIND.
4164 		 * Return the error if we are sending to the address
4165 		 * that was returned in the last T_UDERROR_IND.
4166 		 * If sending to some other address discard the delayed
4167 		 * error indication.
4168 		 */
4169 		if (so->so_delayed_error) {
4170 			struct T_uderror_ind	*tudi;
4171 			void			*addr;
4172 			t_uscalar_t		addrlen;
4173 			boolean_t		match = B_FALSE;
4174 
4175 			ASSERT(so->so_eaddr_mp);
4176 			error = so->so_delayed_error;
4177 			so->so_delayed_error = 0;
4178 			tudi = (struct T_uderror_ind *)so->so_eaddr_mp->b_rptr;
4179 			addrlen = tudi->DEST_length;
4180 			addr = sogetoff(so->so_eaddr_mp,
4181 			    tudi->DEST_offset,
4182 			    addrlen, 1);
4183 			ASSERT(addr);	/* Checked by strsock_proto */
4184 			switch (so->so_family) {
4185 			case AF_INET: {
4186 				/* Compare just IP address and port */
4187 				sin_t *sin1 = (sin_t *)name;
4188 				sin_t *sin2 = (sin_t *)addr;
4189 
4190 				if (addrlen == sizeof (sin_t) &&
4191 				    namelen == addrlen &&
4192 				    sin1->sin_port == sin2->sin_port &&
4193 				    sin1->sin_addr.s_addr ==
4194 				    sin2->sin_addr.s_addr)
4195 					match = B_TRUE;
4196 				break;
4197 			}
4198 			case AF_INET6: {
4199 				/* Compare just IP address and port. Not flow */
4200 				sin6_t *sin1 = (sin6_t *)name;
4201 				sin6_t *sin2 = (sin6_t *)addr;
4202 
4203 				if (addrlen == sizeof (sin6_t) &&
4204 				    namelen == addrlen &&
4205 				    sin1->sin6_port == sin2->sin6_port &&
4206 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4207 				    &sin2->sin6_addr))
4208 					match = B_TRUE;
4209 				break;
4210 			}
4211 			case AF_UNIX:
4212 			default:
4213 				if (namelen == addrlen &&
4214 				    bcmp(name, addr, namelen) == 0)
4215 					match = B_TRUE;
4216 			}
4217 			if (match) {
4218 				freemsg(so->so_eaddr_mp);
4219 				so->so_eaddr_mp = NULL;
4220 				mutex_exit(&so->so_lock);
4221 #ifdef DEBUG
4222 				dprintso(so, 0,
4223 				    ("sockfs delayed error %d for %s\n",
4224 				    error,
4225 				    pr_addr(so->so_family, name, namelen)));
4226 #endif /* DEBUG */
4227 				return (error);
4228 			}
4229 			freemsg(so->so_eaddr_mp);
4230 			so->so_eaddr_mp = NULL;
4231 		}
4232 	}
4233 	mutex_exit(&so->so_lock);
4234 
4235 	flags = msg->msg_flags;
4236 	dontroute = 0;
4237 	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4238 		uint32_t	val;
4239 
4240 		val = 1;
4241 		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4242 		    &val, (t_uscalar_t)sizeof (val));
4243 		if (error)
4244 			return (error);
4245 		dontroute = 1;
4246 	}
4247 
4248 	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4249 		error = EOPNOTSUPP;
4250 		goto done;
4251 	}
4252 	if (msg->msg_controllen != 0) {
4253 		if (!(so_mode & SM_CONNREQUIRED)) {
4254 			error = sosend_dgramcmsg(so, name, namelen, uiop,
4255 			    msg->msg_control, msg->msg_controllen, flags);
4256 		} else {
4257 			if (flags & MSG_OOB) {
4258 				/* Can't generate T_EXDATA_REQ with options */
4259 				error = EOPNOTSUPP;
4260 				goto done;
4261 			}
4262 			error = sosend_svccmsg(so, uiop,
4263 			    !(flags & MSG_EOR),
4264 			    msg->msg_control, msg->msg_controllen,
4265 			    flags);
4266 		}
4267 		goto done;
4268 	}
4269 
4270 	if (!(so_mode & SM_CONNREQUIRED)) {
4271 		/*
4272 		 * If there is no SO_DONTROUTE to turn off return immediately
4273 		 * from send_dgram. This can allow tail-call optimizations.
4274 		 */
4275 		if (!dontroute) {
4276 			return (sosend_dgram(so, name, namelen, uiop, flags));
4277 		}
4278 		error = sosend_dgram(so, name, namelen, uiop, flags);
4279 	} else {
4280 		t_scalar_t prim;
4281 		int sflag;
4282 
4283 		/* Ignore msg_name in the connected state */
4284 		if (flags & MSG_OOB) {
4285 			prim = T_EXDATA_REQ;
4286 			/*
4287 			 * Send down T_EXDATA_REQ even if there is flow
4288 			 * control for data.
4289 			 */
4290 			sflag = MSG_IGNFLOW;
4291 		} else {
4292 			if (so_mode & SM_BYTESTREAM) {
4293 				/* Byte stream transport - use write */
4294 
4295 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4296 				/*
4297 				 * If there is no SO_DONTROUTE to turn off,
4298 				 * SS_DIRECT is on, and there is no flow
4299 				 * control, we can take the fast path.
4300 				 */
4301 				if (!dontroute &&
4302 				    (so_state & SS_DIRECT) &&
4303 				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4304 					return (sostream_direct(so, uiop,
4305 					    NULL, CRED()));
4306 				}
4307 				error = strwrite(SOTOV(so), uiop, CRED());
4308 				goto done;
4309 			}
4310 			prim = T_DATA_REQ;
4311 			sflag = 0;
4312 		}
4313 		/*
4314 		 * If there is no SO_DONTROUTE to turn off return immediately
4315 		 * from sosend_svc. This can allow tail-call optimizations.
4316 		 */
4317 		if (!dontroute)
4318 			return (sosend_svc(so, uiop, prim,
4319 			    !(flags & MSG_EOR), sflag));
4320 		error = sosend_svc(so, uiop, prim,
4321 		    !(flags & MSG_EOR), sflag);
4322 	}
4323 	ASSERT(dontroute);
4324 done:
4325 	if (dontroute) {
4326 		uint32_t	val;
4327 
4328 		val = 0;
4329 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4330 		    &val, (t_uscalar_t)sizeof (val));
4331 	}
4332 	return (error);
4333 }
4334 
4335 /*
4336  * Sending data on a datagram socket.
4337  * Assumes caller has verified that SS_ISBOUND etc. are set.
4338  */
4339 /* ARGSUSED */
4340 static int
4341 sodgram_direct(struct sonode *so, struct sockaddr *name,
4342     socklen_t namelen, struct uio *uiop, int flags)
4343 {
4344 	struct T_unitdata_req	tudr;
4345 	mblk_t			*mp = NULL;
4346 	int			error = 0;
4347 	void			*addr;
4348 	socklen_t		addrlen;
4349 	ssize_t			len;
4350 	struct stdata		*stp = SOTOV(so)->v_stream;
4351 	int			so_state;
4352 	queue_t			*udp_wq;
4353 	boolean_t		connected;
4354 	mblk_t			*mpdata = NULL;
4355 
4356 	ASSERT(name != NULL && namelen != 0);
4357 	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4358 	ASSERT(!(so->so_mode & SM_EXDATA));
4359 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4360 	ASSERT(SOTOV(so)->v_type == VSOCK);
4361 
4362 	/* Caller checked for proper length */
4363 	len = uiop->uio_resid;
4364 	ASSERT(len <= so->so_tidu_size);
4365 
4366 	/* Length and family checks have been done by caller */
4367 	ASSERT(name->sa_family == so->so_family);
4368 	ASSERT(so->so_family == AF_INET ||
4369 	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4370 	ASSERT(so->so_family == AF_INET6 ||
4371 	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4372 
4373 	addr = name;
4374 	addrlen = namelen;
4375 
4376 	if (stp->sd_sidp != NULL &&
4377 	    (error = straccess(stp, JCWRITE)) != 0)
4378 		goto done;
4379 
4380 	so_state = so->so_state;
4381 
4382 	connected = so_state & SS_ISCONNECTED;
4383 	if (!connected) {
4384 		tudr.PRIM_type = T_UNITDATA_REQ;
4385 		tudr.DEST_length = addrlen;
4386 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4387 		tudr.OPT_length = 0;
4388 		tudr.OPT_offset = 0;
4389 
4390 		mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4391 		    _ALLOC_INTR);
4392 		if (mp == NULL) {
4393 			/*
4394 			 * Caught a signal waiting for memory.
4395 			 * Let send* return EINTR.
4396 			 */
4397 			error = EINTR;
4398 			goto done;
4399 		}
4400 	}
4401 
4402 	/*
4403 	 * For UDP we don't break up the copyin into smaller pieces
4404 	 * as in the TCP case.  That means if ENOMEM is returned by
4405 	 * mcopyinuio() then the uio vector has not been modified at
4406 	 * all and we fallback to either strwrite() or kstrputmsg()
4407 	 * below.  Note also that we never generate priority messages
4408 	 * from here.
4409 	 */
4410 	udp_wq = stp->sd_wrq->q_next;
4411 	if (canput(udp_wq) &&
4412 	    (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4413 		ASSERT(DB_TYPE(mpdata) == M_DATA);
4414 		ASSERT(uiop->uio_resid == 0);
4415 		if (!connected)
4416 			linkb(mp, mpdata);
4417 		else
4418 			mp = mpdata;
4419 #ifdef C2_AUDIT
4420 		if (audit_active)
4421 			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4422 #endif /* C2_AUDIT */
4423 
4424 		udp_wput(udp_wq, mp);
4425 		return (0);
4426 	}
4427 
4428 	ASSERT(mpdata == NULL);
4429 	if (error != 0 && error != ENOMEM) {
4430 		freemsg(mp);
4431 		return (error);
4432 	}
4433 
4434 	/*
4435 	 * For connected, let strwrite() handle the blocking case.
4436 	 * Otherwise we fall thru and use kstrputmsg().
4437 	 */
4438 	if (connected)
4439 		return (strwrite(SOTOV(so), uiop, CRED()));
4440 
4441 #ifdef C2_AUDIT
4442 	if (audit_active)
4443 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4444 #endif /* C2_AUDIT */
4445 
4446 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4447 done:
4448 #ifdef SOCK_DEBUG
4449 	if (error != 0) {
4450 		eprintsoline(so, error);
4451 	}
4452 #endif /* SOCK_DEBUG */
4453 	return (error);
4454 }
4455 
4456 int
4457 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4458 {
4459 	struct stdata *stp = SOTOV(so)->v_stream;
4460 	ssize_t iosize, rmax, maxblk;
4461 	queue_t *tcp_wq = stp->sd_wrq->q_next;
4462 	mblk_t *newmp;
4463 	int error = 0, wflag = 0;
4464 
4465 	ASSERT(so->so_mode & SM_BYTESTREAM);
4466 	ASSERT(SOTOV(so)->v_type == VSOCK);
4467 
4468 	if (stp->sd_sidp != NULL &&
4469 	    (error = straccess(stp, JCWRITE)) != 0)
4470 		return (error);
4471 
4472 	if (uiop == NULL) {
4473 		/*
4474 		 * kstrwritemp() should have checked sd_flag and
4475 		 * flow-control before coming here.  If we end up
4476 		 * here it means that we can simply pass down the
4477 		 * data to tcp.
4478 		 */
4479 		ASSERT(mp != NULL);
4480 		if (stp->sd_wputdatafunc != NULL) {
4481 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4482 			    NULL, NULL, NULL);
4483 			if (newmp == NULL) {
4484 				/* The caller will free mp */
4485 				return (ECOMM);
4486 			}
4487 			mp = newmp;
4488 		}
4489 		tcp_wput(tcp_wq, mp);
4490 		return (0);
4491 	}
4492 
4493 	/* Fallback to strwrite() to do proper error handling */
4494 	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4495 		return (strwrite(SOTOV(so), uiop, cr));
4496 
4497 	rmax = stp->sd_qn_maxpsz;
4498 	ASSERT(rmax >= 0 || rmax == INFPSZ);
4499 	if (rmax == 0 || uiop->uio_resid <= 0)
4500 		return (0);
4501 
4502 	if (rmax == INFPSZ)
4503 		rmax = uiop->uio_resid;
4504 
4505 	maxblk = stp->sd_maxblk;
4506 
4507 	for (;;) {
4508 		iosize = MIN(uiop->uio_resid, rmax);
4509 
4510 		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4511 		if (mp == NULL) {
4512 			/*
4513 			 * Fallback to strwrite() for ENOMEM; if this
4514 			 * is our first time in this routine and the uio
4515 			 * vector has not been modified, we will end up
4516 			 * calling strwrite() without any flag set.
4517 			 */
4518 			if (error == ENOMEM)
4519 				goto slow_send;
4520 			else
4521 				return (error);
4522 		}
4523 		ASSERT(uiop->uio_resid >= 0);
4524 		/*
4525 		 * If mp is non-NULL and ENOMEM is set, it means that
4526 		 * mcopyinuio() was able to break down some of the user
4527 		 * data into one or more mblks.  Send the partial data
4528 		 * to tcp and let the rest be handled in strwrite().
4529 		 */
4530 		ASSERT(error == 0 || error == ENOMEM);
4531 		if (stp->sd_wputdatafunc != NULL) {
4532 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4533 			    NULL, NULL, NULL);
4534 			if (newmp == NULL) {
4535 				/* The caller will free mp */
4536 				return (ECOMM);
4537 			}
4538 			mp = newmp;
4539 		}
4540 		tcp_wput(tcp_wq, mp);
4541 
4542 		wflag |= NOINTR;
4543 
4544 		if (uiop->uio_resid == 0) {	/* No more data; we're done */
4545 			ASSERT(error == 0);
4546 			break;
4547 		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4548 		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4549 slow_send:
4550 			/*
4551 			 * We were able to send down partial data using
4552 			 * the direct call interface, but are now relying
4553 			 * on strwrite() to handle the non-fastpath cases.
4554 			 * If the socket is blocking we will sleep in
4555 			 * strwaitq() until write is permitted, otherwise,
4556 			 * we will need to return the amount of bytes
4557 			 * written so far back to the app.  This is the
4558 			 * reason why we pass NOINTR flag to strwrite()
4559 			 * for non-blocking socket, because we don't want
4560 			 * to return EAGAIN when portion of the user data
4561 			 * has actually been sent down.
4562 			 */
4563 			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4564 		}
4565 	}
4566 	return (0);
4567 }
4568 
4569 /*
4570  * Update so_faddr by asking the transport (unless AF_UNIX).
4571  */
4572 int
4573 sotpi_getpeername(struct sonode *so)
4574 {
4575 	struct strbuf	strbuf;
4576 	int		error = 0, res;
4577 	void		*addr;
4578 	t_uscalar_t	addrlen;
4579 	k_sigset_t	smask;
4580 
4581 	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4582 	    so, pr_state(so->so_state, so->so_mode)));
4583 
4584 	mutex_enter(&so->so_lock);
4585 	so_lock_single(so);	/* Set SOLOCKED */
4586 	if (!(so->so_state & SS_ISCONNECTED)) {
4587 		error = ENOTCONN;
4588 		goto done;
4589 	}
4590 	/* Added this check for X/Open */
4591 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4592 		error = EINVAL;
4593 		if (xnet_check_print) {
4594 			printf("sockfs: X/Open getpeername check => EINVAL\n");
4595 		}
4596 		goto done;
4597 	}
4598 #ifdef DEBUG
4599 	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4600 	    pr_addr(so->so_family, so->so_faddr_sa,
4601 	    (t_uscalar_t)so->so_faddr_len)));
4602 #endif /* DEBUG */
4603 
4604 	if (so->so_family == AF_UNIX) {
4605 		/* Transport has different name space - return local info */
4606 		error = 0;
4607 		goto done;
4608 	}
4609 
4610 	ASSERT(so->so_faddr_sa);
4611 	/* Allocate local buffer to use with ioctl */
4612 	addrlen = (t_uscalar_t)so->so_faddr_maxlen;
4613 	mutex_exit(&so->so_lock);
4614 	addr = kmem_alloc(addrlen, KM_SLEEP);
4615 
4616 	/*
4617 	 * Issue TI_GETPEERNAME with signals masked.
4618 	 * Put the result in so_faddr_sa so that getpeername works after
4619 	 * a shutdown(output).
4620 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4621 	 * back to the socket.
4622 	 */
4623 	strbuf.buf = addr;
4624 	strbuf.maxlen = addrlen;
4625 	strbuf.len = 0;
4626 
4627 	sigintr(&smask, 0);
4628 	res = 0;
4629 	ASSERT(CRED());
4630 	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4631 	    0, K_TO_K, CRED(), &res);
4632 	sigunintr(&smask);
4633 
4634 	mutex_enter(&so->so_lock);
4635 	/*
4636 	 * If there is an error record the error in so_error put don't fail
4637 	 * the getpeername. Instead fallback on the recorded
4638 	 * so->so_faddr_sa.
4639 	 */
4640 	if (error) {
4641 		/*
4642 		 * Various stream head errors can be returned to the ioctl.
4643 		 * However, it is impossible to determine which ones of
4644 		 * these are really socket level errors that were incorrectly
4645 		 * consumed by the ioctl. Thus this code silently ignores the
4646 		 * error - to code explicitly does not reinstate the error
4647 		 * using soseterror().
4648 		 * Experiments have shows that at least this set of
4649 		 * errors are reported and should not be reinstated on the
4650 		 * socket:
4651 		 *	EINVAL	E.g. if an I_LINK was in effect when
4652 		 *		getpeername was called.
4653 		 *	EPIPE	The ioctl error semantics prefer the write
4654 		 *		side error over the read side error.
4655 		 *	ENOTCONN The transport just got disconnected but
4656 		 *		sockfs had not yet seen the T_DISCON_IND
4657 		 *		when issuing the ioctl.
4658 		 */
4659 		error = 0;
4660 	} else if (res == 0 && strbuf.len > 0 &&
4661 	    (so->so_state & SS_ISCONNECTED)) {
4662 		ASSERT(strbuf.len <= (int)so->so_faddr_maxlen);
4663 		so->so_faddr_len = (socklen_t)strbuf.len;
4664 		bcopy(addr, so->so_faddr_sa, so->so_faddr_len);
4665 		so->so_state |= SS_FADDR_VALID;
4666 	}
4667 	kmem_free(addr, addrlen);
4668 #ifdef DEBUG
4669 	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
4670 	    pr_addr(so->so_family, so->so_faddr_sa,
4671 	    (t_uscalar_t)so->so_faddr_len)));
4672 #endif /* DEBUG */
4673 done:
4674 	so_unlock_single(so, SOLOCKED);
4675 	mutex_exit(&so->so_lock);
4676 	return (error);
4677 }
4678 
4679 /*
4680  * Update so_laddr by asking the transport (unless AF_UNIX).
4681  */
4682 int
4683 sotpi_getsockname(struct sonode *so)
4684 {
4685 	struct strbuf	strbuf;
4686 	int		error = 0, res;
4687 	void		*addr;
4688 	t_uscalar_t	addrlen;
4689 	k_sigset_t	smask;
4690 
4691 	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
4692 	    so, pr_state(so->so_state, so->so_mode)));
4693 
4694 	mutex_enter(&so->so_lock);
4695 	so_lock_single(so);	/* Set SOLOCKED */
4696 	if (!(so->so_state & SS_ISBOUND) && so->so_family != AF_UNIX) {
4697 		/* Return an all zero address except for the family */
4698 		if (so->so_family == AF_INET)
4699 			so->so_laddr_len = (socklen_t)sizeof (sin_t);
4700 		else if (so->so_family == AF_INET6)
4701 			so->so_laddr_len = (socklen_t)sizeof (sin6_t);
4702 		ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
4703 		bzero(so->so_laddr_sa, so->so_laddr_len);
4704 		/*
4705 		 * Can not assume there is a sa_family for all
4706 		 * protocol families.
4707 		 */
4708 		if (so->so_family == AF_INET || so->so_family == AF_INET6)
4709 			so->so_laddr_sa->sa_family = so->so_family;
4710 	}
4711 #ifdef DEBUG
4712 	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
4713 	    pr_addr(so->so_family, so->so_laddr_sa,
4714 	    (t_uscalar_t)so->so_laddr_len)));
4715 #endif /* DEBUG */
4716 	if (so->so_family == AF_UNIX) {
4717 		/* Transport has different name space - return local info */
4718 		error = 0;
4719 		goto done;
4720 	}
4721 	if (!(so->so_state & SS_ISBOUND)) {
4722 		/* If not bound, then nothing to return. */
4723 		error = 0;
4724 		goto done;
4725 	}
4726 	/* Allocate local buffer to use with ioctl */
4727 	addrlen = (t_uscalar_t)so->so_laddr_maxlen;
4728 	mutex_exit(&so->so_lock);
4729 	addr = kmem_alloc(addrlen, KM_SLEEP);
4730 
4731 	/*
4732 	 * Issue TI_GETMYNAME with signals masked.
4733 	 * Put the result in so_laddr_sa so that getsockname works after
4734 	 * a shutdown(output).
4735 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4736 	 * back to the socket.
4737 	 */
4738 	strbuf.buf = addr;
4739 	strbuf.maxlen = addrlen;
4740 	strbuf.len = 0;
4741 
4742 	sigintr(&smask, 0);
4743 	res = 0;
4744 	ASSERT(CRED());
4745 	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
4746 	    0, K_TO_K, CRED(), &res);
4747 	sigunintr(&smask);
4748 
4749 	mutex_enter(&so->so_lock);
4750 	/*
4751 	 * If there is an error record the error in so_error put don't fail
4752 	 * the getsockname. Instead fallback on the recorded
4753 	 * so->so_laddr_sa.
4754 	 */
4755 	if (error) {
4756 		/*
4757 		 * Various stream head errors can be returned to the ioctl.
4758 		 * However, it is impossible to determine which ones of
4759 		 * these are really socket level errors that were incorrectly
4760 		 * consumed by the ioctl. Thus this code silently ignores the
4761 		 * error - to code explicitly does not reinstate the error
4762 		 * using soseterror().
4763 		 * Experiments have shows that at least this set of
4764 		 * errors are reported and should not be reinstated on the
4765 		 * socket:
4766 		 *	EINVAL	E.g. if an I_LINK was in effect when
4767 		 *		getsockname was called.
4768 		 *	EPIPE	The ioctl error semantics prefer the write
4769 		 *		side error over the read side error.
4770 		 */
4771 		error = 0;
4772 	} else if (res == 0 && strbuf.len > 0 &&
4773 	    (so->so_state & SS_ISBOUND)) {
4774 		ASSERT(strbuf.len <= (int)so->so_laddr_maxlen);
4775 		so->so_laddr_len = (socklen_t)strbuf.len;
4776 		bcopy(addr, so->so_laddr_sa, so->so_laddr_len);
4777 		so->so_state |= SS_LADDR_VALID;
4778 	}
4779 	kmem_free(addr, addrlen);
4780 #ifdef DEBUG
4781 	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
4782 	    pr_addr(so->so_family, so->so_laddr_sa,
4783 	    (t_uscalar_t)so->so_laddr_len)));
4784 #endif /* DEBUG */
4785 done:
4786 	so_unlock_single(so, SOLOCKED);
4787 	mutex_exit(&so->so_lock);
4788 	return (error);
4789 }
4790 
4791 /*
4792  * Get socket options. For SOL_SOCKET options some options are handled
4793  * by the sockfs while others use the value recorded in the sonode as a
4794  * fallback should the T_SVR4_OPTMGMT_REQ fail.
4795  *
4796  * On the return most *optlenp bytes are copied to optval.
4797  */
4798 int
4799 sotpi_getsockopt(struct sonode *so, int level, int option_name,
4800 		void *optval, socklen_t *optlenp, int flags)
4801 {
4802 	struct T_optmgmt_req	optmgmt_req;
4803 	struct T_optmgmt_ack	*optmgmt_ack;
4804 	struct opthdr		oh;
4805 	struct opthdr		*opt_res;
4806 	mblk_t			*mp = NULL;
4807 	int			error = 0;
4808 	void			*option = NULL;	/* Set if fallback value */
4809 	t_uscalar_t		maxlen = *optlenp;
4810 	t_uscalar_t		len;
4811 	uint32_t		value;
4812 
4813 	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
4814 	    so, level, option_name, optval, optlenp,
4815 	    pr_state(so->so_state, so->so_mode)));
4816 
4817 	mutex_enter(&so->so_lock);
4818 	so_lock_single(so);	/* Set SOLOCKED */
4819 
4820 	/*
4821 	 * Check for SOL_SOCKET options.
4822 	 * Certain SOL_SOCKET options are returned directly whereas
4823 	 * others only provide a default (fallback) value should
4824 	 * the T_SVR4_OPTMGMT_REQ fail.
4825 	 */
4826 	if (level == SOL_SOCKET) {
4827 		/* Check parameters */
4828 		switch (option_name) {
4829 		case SO_TYPE:
4830 		case SO_ERROR:
4831 		case SO_DEBUG:
4832 		case SO_ACCEPTCONN:
4833 		case SO_REUSEADDR:
4834 		case SO_KEEPALIVE:
4835 		case SO_DONTROUTE:
4836 		case SO_BROADCAST:
4837 		case SO_USELOOPBACK:
4838 		case SO_OOBINLINE:
4839 		case SO_SNDBUF:
4840 		case SO_RCVBUF:
4841 #ifdef notyet
4842 		case SO_SNDLOWAT:
4843 		case SO_RCVLOWAT:
4844 		case SO_SNDTIMEO:
4845 		case SO_RCVTIMEO:
4846 #endif /* notyet */
4847 		case SO_DOMAIN:
4848 		case SO_DGRAM_ERRIND:
4849 			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
4850 				error = EINVAL;
4851 				eprintsoline(so, error);
4852 				goto done2;
4853 			}
4854 			break;
4855 		case SO_LINGER:
4856 			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
4857 				error = EINVAL;
4858 				eprintsoline(so, error);
4859 				goto done2;
4860 			}
4861 			break;
4862 		}
4863 
4864 		len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
4865 
4866 		switch (option_name) {
4867 		case SO_TYPE:
4868 			value = so->so_type;
4869 			option = &value;
4870 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4871 
4872 		case SO_ERROR:
4873 			value = sogeterr(so);
4874 			option = &value;
4875 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4876 
4877 		case SO_ACCEPTCONN:
4878 			if (so->so_state & SS_ACCEPTCONN)
4879 				value = SO_ACCEPTCONN;
4880 			else
4881 				value = 0;
4882 #ifdef DEBUG
4883 			if (value) {
4884 				dprintso(so, 1,
4885 				    ("sotpi_getsockopt: 0x%x is set\n",
4886 				    option_name));
4887 			} else {
4888 				dprintso(so, 1,
4889 				    ("sotpi_getsockopt: 0x%x not set\n",
4890 				    option_name));
4891 			}
4892 #endif /* DEBUG */
4893 			option = &value;
4894 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4895 
4896 		case SO_DEBUG:
4897 		case SO_REUSEADDR:
4898 		case SO_KEEPALIVE:
4899 		case SO_DONTROUTE:
4900 		case SO_BROADCAST:
4901 		case SO_USELOOPBACK:
4902 		case SO_OOBINLINE:
4903 		case SO_DGRAM_ERRIND:
4904 			value = (so->so_options & option_name);
4905 #ifdef DEBUG
4906 			if (value) {
4907 				dprintso(so, 1,
4908 				    ("sotpi_getsockopt: 0x%x is set\n",
4909 				    option_name));
4910 			} else {
4911 				dprintso(so, 1,
4912 				    ("sotpi_getsockopt: 0x%x not set\n",
4913 				    option_name));
4914 			}
4915 #endif /* DEBUG */
4916 			option = &value;
4917 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4918 
4919 		/*
4920 		 * The following options are only returned by sockfs when the
4921 		 * T_SVR4_OPTMGMT_REQ fails.
4922 		 */
4923 		case SO_LINGER:
4924 			option = &so->so_linger;
4925 			len = (t_uscalar_t)sizeof (struct linger);
4926 			break;
4927 		case SO_SNDBUF: {
4928 			ssize_t lvalue;
4929 
4930 			/*
4931 			 * If the option has not been set then get a default
4932 			 * value from the read queue. This value is
4933 			 * returned if the transport fails
4934 			 * the T_SVR4_OPTMGMT_REQ.
4935 			 */
4936 			lvalue = so->so_sndbuf;
4937 			if (lvalue == 0) {
4938 				mutex_exit(&so->so_lock);
4939 				(void) strqget(strvp2wq(SOTOV(so))->q_next,
4940 				    QHIWAT, 0, &lvalue);
4941 				mutex_enter(&so->so_lock);
4942 				dprintso(so, 1,
4943 				    ("got SO_SNDBUF %ld from q\n", lvalue));
4944 			}
4945 			value = (int)lvalue;
4946 			option = &value;
4947 			len = (t_uscalar_t)sizeof (so->so_sndbuf);
4948 			break;
4949 		}
4950 		case SO_RCVBUF: {
4951 			ssize_t lvalue;
4952 
4953 			/*
4954 			 * If the option has not been set then get a default
4955 			 * value from the read queue. This value is
4956 			 * returned if the transport fails
4957 			 * the T_SVR4_OPTMGMT_REQ.
4958 			 *
4959 			 * XXX If SO_RCVBUF has been set and this is an
4960 			 * XPG 4.2 application then do not ask the transport
4961 			 * since the transport might adjust the value and not
4962 			 * return exactly what was set by the application.
4963 			 * For non-XPG 4.2 application we return the value
4964 			 * that the transport is actually using.
4965 			 */
4966 			lvalue = so->so_rcvbuf;
4967 			if (lvalue == 0) {
4968 				mutex_exit(&so->so_lock);
4969 				(void) strqget(RD(strvp2wq(SOTOV(so))),
4970 				    QHIWAT, 0, &lvalue);
4971 				mutex_enter(&so->so_lock);
4972 				dprintso(so, 1,
4973 				    ("got SO_RCVBUF %ld from q\n", lvalue));
4974 			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
4975 				value = (int)lvalue;
4976 				option = &value;
4977 				goto copyout;	/* skip asking transport */
4978 			}
4979 			value = (int)lvalue;
4980 			option = &value;
4981 			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
4982 			break;
4983 		}
4984 		case SO_DOMAIN:
4985 			value = so->so_family;
4986 			option = &value;
4987 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4988 
4989 #ifdef notyet
4990 		/*
4991 		 * We do not implement the semantics of these options
4992 		 * thus we shouldn't implement the options either.
4993 		 */
4994 		case SO_SNDLOWAT:
4995 			value = so->so_sndlowat;
4996 			option = &value;
4997 			break;
4998 		case SO_RCVLOWAT:
4999 			value = so->so_rcvlowat;
5000 			option = &value;
5001 			break;
5002 		case SO_SNDTIMEO:
5003 			value = so->so_sndtimeo;
5004 			option = &value;
5005 			break;
5006 		case SO_RCVTIMEO:
5007 			value = so->so_rcvtimeo;
5008 			option = &value;
5009 			break;
5010 #endif /* notyet */
5011 		}
5012 	}
5013 
5014 	mutex_exit(&so->so_lock);
5015 
5016 	/* Send request */
5017 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5018 	optmgmt_req.MGMT_flags = T_CHECK;
5019 	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5020 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5021 
5022 	oh.level = level;
5023 	oh.name = option_name;
5024 	oh.len = maxlen;
5025 
5026 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5027 	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP);
5028 	/* Let option management work in the presence of data flow control */
5029 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5030 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5031 	mp = NULL;
5032 	mutex_enter(&so->so_lock);
5033 	if (error) {
5034 		eprintsoline(so, error);
5035 		goto done2;
5036 	}
5037 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5038 	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5039 	if (error) {
5040 		if (option != NULL) {
5041 			/* We have a fallback value */
5042 			error = 0;
5043 			goto copyout;
5044 		}
5045 		eprintsoline(so, error);
5046 		goto done2;
5047 	}
5048 	ASSERT(mp);
5049 	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5050 	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5051 	    optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5052 	if (opt_res == NULL) {
5053 		if (option != NULL) {
5054 			/* We have a fallback value */
5055 			error = 0;
5056 			goto copyout;
5057 		}
5058 		error = EPROTO;
5059 		eprintsoline(so, error);
5060 		goto done;
5061 	}
5062 	option = &opt_res[1];
5063 
5064 	/* check to ensure that the option is within bounds */
5065 	if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5066 	    (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5067 		if (option != NULL) {
5068 			/* We have a fallback value */
5069 			error = 0;
5070 			goto copyout;
5071 		}
5072 		error = EPROTO;
5073 		eprintsoline(so, error);
5074 		goto done;
5075 	}
5076 
5077 	len = opt_res->len;
5078 
5079 copyout: {
5080 		t_uscalar_t size = MIN(len, maxlen);
5081 		bcopy(option, optval, size);
5082 		bcopy(&size, optlenp, sizeof (size));
5083 	}
5084 done:
5085 	freemsg(mp);
5086 done2:
5087 	so_unlock_single(so, SOLOCKED);
5088 	mutex_exit(&so->so_lock);
5089 	return (error);
5090 }
5091 
5092 /*
5093  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5094  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5095  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5096  * setsockopt has to work even if the transport does not support the option.
5097  */
5098 int
5099 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5100 	const void *optval, t_uscalar_t optlen)
5101 {
5102 	struct T_optmgmt_req	optmgmt_req;
5103 	struct opthdr		oh;
5104 	mblk_t			*mp;
5105 	int			error = 0;
5106 	boolean_t		handled = B_FALSE;
5107 
5108 	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5109 	    so, level, option_name, optval, optlen,
5110 	    pr_state(so->so_state, so->so_mode)));
5111 
5112 
5113 	/* X/Open requires this check */
5114 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5115 		if (xnet_check_print)
5116 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
5117 		return (EINVAL);
5118 	}
5119 
5120 	/* Caller allocates aligned optval, or passes null */
5121 	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
5122 	/* If optval is null optlen is 0, and vice-versa */
5123 	ASSERT(optval != NULL || optlen == 0);
5124 	ASSERT(optlen != 0 || optval == NULL);
5125 
5126 	mutex_enter(&so->so_lock);
5127 	so_lock_single(so);	/* Set SOLOCKED */
5128 	mutex_exit(&so->so_lock);
5129 
5130 	/*
5131 	 * For SOCKET or TCP level options, try to set it here itself
5132 	 * provided socket has not been popped and we know the tcp
5133 	 * structure (stored in so_priv).
5134 	 */
5135 	if ((level == SOL_SOCKET || level == IPPROTO_TCP) &&
5136 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
5137 	    (so->so_version == SOV_SOCKSTREAM) && (so->so_priv != NULL)) {
5138 		tcp_t		*tcp = so->so_priv;
5139 		boolean_t	onoff;
5140 
5141 #define	intvalue	(*(int32_t *)optval)
5142 
5143 		switch (level) {
5144 		case SOL_SOCKET:
5145 			switch (option_name) {		/* Check length param */
5146 			case SO_DEBUG:
5147 			case SO_REUSEADDR:
5148 			case SO_DONTROUTE:
5149 			case SO_BROADCAST:
5150 			case SO_USELOOPBACK:
5151 			case SO_OOBINLINE:
5152 			case SO_DGRAM_ERRIND:
5153 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5154 					error = EINVAL;
5155 					eprintsoline(so, error);
5156 					mutex_enter(&so->so_lock);
5157 					goto done2;
5158 				}
5159 				ASSERT(optval);
5160 				onoff = intvalue != 0;
5161 				handled = B_TRUE;
5162 				break;
5163 			case SO_LINGER:
5164 				if (optlen !=
5165 				    (t_uscalar_t)sizeof (struct linger)) {
5166 					error = EINVAL;
5167 					eprintsoline(so, error);
5168 					mutex_enter(&so->so_lock);
5169 					goto done2;
5170 				}
5171 				ASSERT(optval);
5172 				handled = B_TRUE;
5173 				break;
5174 			}
5175 
5176 			switch (option_name) {			/* Do actions */
5177 			case SO_LINGER: {
5178 				struct linger *lgr = (struct linger *)optval;
5179 
5180 				if (lgr->l_onoff) {
5181 					tcp->tcp_linger = 1;
5182 					tcp->tcp_lingertime = lgr->l_linger;
5183 					so->so_linger.l_onoff = SO_LINGER;
5184 					so->so_options |= SO_LINGER;
5185 				} else {
5186 					tcp->tcp_linger = 0;
5187 					tcp->tcp_lingertime = 0;
5188 					so->so_linger.l_onoff = 0;
5189 					so->so_options &= ~SO_LINGER;
5190 				}
5191 				so->so_linger.l_linger = lgr->l_linger;
5192 				handled = B_TRUE;
5193 				break;
5194 			}
5195 			case SO_DEBUG:
5196 				tcp->tcp_debug = onoff;
5197 #ifdef SOCK_TEST
5198 				if (intvalue & 2)
5199 					sock_test_timelimit = 10 * hz;
5200 				else
5201 					sock_test_timelimit = 0;
5202 
5203 				if (intvalue & 4)
5204 					do_useracc = 0;
5205 				else
5206 					do_useracc = 1;
5207 #endif /* SOCK_TEST */
5208 				break;
5209 			case SO_DONTROUTE:
5210 				/*
5211 				 * SO_DONTROUTE, SO_USELOOPBACK and
5212 				 * SO_BROADCAST are only of interest to IP.
5213 				 * We track them here only so
5214 				 * that we can report their current value.
5215 				 */
5216 				tcp->tcp_dontroute = onoff;
5217 				if (onoff)
5218 					so->so_options |= option_name;
5219 				else
5220 					so->so_options &= ~option_name;
5221 				break;
5222 			case SO_USELOOPBACK:
5223 				tcp->tcp_useloopback = onoff;
5224 				if (onoff)
5225 					so->so_options |= option_name;
5226 				else
5227 					so->so_options &= ~option_name;
5228 				break;
5229 			case SO_BROADCAST:
5230 				tcp->tcp_broadcast = onoff;
5231 				if (onoff)
5232 					so->so_options |= option_name;
5233 				else
5234 					so->so_options &= ~option_name;
5235 				break;
5236 			case SO_REUSEADDR:
5237 				tcp->tcp_reuseaddr = onoff;
5238 				if (onoff)
5239 					so->so_options |= option_name;
5240 				else
5241 					so->so_options &= ~option_name;
5242 				break;
5243 			case SO_OOBINLINE:
5244 				tcp->tcp_oobinline = onoff;
5245 				if (onoff)
5246 					so->so_options |= option_name;
5247 				else
5248 					so->so_options &= ~option_name;
5249 				break;
5250 			case SO_DGRAM_ERRIND:
5251 				tcp->tcp_dgram_errind = onoff;
5252 				if (onoff)
5253 					so->so_options |= option_name;
5254 				else
5255 					so->so_options &= ~option_name;
5256 				break;
5257 			}
5258 			break;
5259 		case IPPROTO_TCP:
5260 			switch (option_name) {
5261 			case TCP_NODELAY:
5262 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5263 					error = EINVAL;
5264 					eprintsoline(so, error);
5265 					mutex_enter(&so->so_lock);
5266 					goto done2;
5267 				}
5268 				ASSERT(optval);
5269 				tcp->tcp_naglim = intvalue ? 1 : tcp->tcp_mss;
5270 				handled = B_TRUE;
5271 				break;
5272 			}
5273 			break;
5274 		default:
5275 			handled = B_FALSE;
5276 			break;
5277 		}
5278 	}
5279 
5280 	if (handled) {
5281 		mutex_enter(&so->so_lock);
5282 		goto done2;
5283 	}
5284 
5285 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5286 	optmgmt_req.MGMT_flags = T_NEGOTIATE;
5287 	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5288 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5289 
5290 	oh.level = level;
5291 	oh.name = option_name;
5292 	oh.len = optlen;
5293 
5294 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5295 	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP);
5296 	/* Let option management work in the presence of data flow control */
5297 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5298 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5299 	mp = NULL;
5300 	mutex_enter(&so->so_lock);
5301 	if (error) {
5302 		eprintsoline(so, error);
5303 		goto done;
5304 	}
5305 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5306 	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5307 	if (error) {
5308 		eprintsoline(so, error);
5309 		goto done;
5310 	}
5311 	ASSERT(mp);
5312 	/* No need to verify T_optmgmt_ack */
5313 	freemsg(mp);
5314 done:
5315 	/*
5316 	 * Check for SOL_SOCKET options and record their values.
5317 	 * If we know about a SOL_SOCKET parameter and the transport
5318 	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5319 	 * EPROTO) we let the setsockopt succeed.
5320 	 */
5321 	if (level == SOL_SOCKET) {
5322 		/* Check parameters */
5323 		switch (option_name) {
5324 		case SO_DEBUG:
5325 		case SO_REUSEADDR:
5326 		case SO_KEEPALIVE:
5327 		case SO_DONTROUTE:
5328 		case SO_BROADCAST:
5329 		case SO_USELOOPBACK:
5330 		case SO_OOBINLINE:
5331 		case SO_SNDBUF:
5332 		case SO_RCVBUF:
5333 #ifdef notyet
5334 		case SO_SNDLOWAT:
5335 		case SO_RCVLOWAT:
5336 		case SO_SNDTIMEO:
5337 		case SO_RCVTIMEO:
5338 #endif /* notyet */
5339 		case SO_DGRAM_ERRIND:
5340 			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5341 				error = EINVAL;
5342 				eprintsoline(so, error);
5343 				goto done2;
5344 			}
5345 			ASSERT(optval);
5346 			handled = B_TRUE;
5347 			break;
5348 		case SO_LINGER:
5349 			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5350 				error = EINVAL;
5351 				eprintsoline(so, error);
5352 				goto done2;
5353 			}
5354 			ASSERT(optval);
5355 			handled = B_TRUE;
5356 			break;
5357 		}
5358 
5359 #define	intvalue	(*(int32_t *)optval)
5360 
5361 		switch (option_name) {
5362 		case SO_TYPE:
5363 		case SO_ERROR:
5364 		case SO_ACCEPTCONN:
5365 			/* Can't be set */
5366 			error = ENOPROTOOPT;
5367 			goto done2;
5368 		case SO_LINGER: {
5369 			struct linger *l = (struct linger *)optval;
5370 
5371 			so->so_linger.l_linger = l->l_linger;
5372 			if (l->l_onoff) {
5373 				so->so_linger.l_onoff = SO_LINGER;
5374 				so->so_options |= SO_LINGER;
5375 			} else {
5376 				so->so_linger.l_onoff = 0;
5377 				so->so_options &= ~SO_LINGER;
5378 			}
5379 			break;
5380 		}
5381 
5382 		case SO_DEBUG:
5383 #ifdef SOCK_TEST
5384 			if (intvalue & 2)
5385 				sock_test_timelimit = 10 * hz;
5386 			else
5387 				sock_test_timelimit = 0;
5388 
5389 			if (intvalue & 4)
5390 				do_useracc = 0;
5391 			else
5392 				do_useracc = 1;
5393 #endif /* SOCK_TEST */
5394 			/* FALLTHRU */
5395 		case SO_REUSEADDR:
5396 		case SO_KEEPALIVE:
5397 		case SO_DONTROUTE:
5398 		case SO_BROADCAST:
5399 		case SO_USELOOPBACK:
5400 		case SO_OOBINLINE:
5401 		case SO_DGRAM_ERRIND:
5402 			if (intvalue != 0) {
5403 				dprintso(so, 1,
5404 				    ("sotpi_setsockopt: setting 0x%x\n",
5405 				    option_name));
5406 				so->so_options |= option_name;
5407 			} else {
5408 				dprintso(so, 1,
5409 				    ("sotpi_setsockopt: clearing 0x%x\n",
5410 				    option_name));
5411 				so->so_options &= ~option_name;
5412 			}
5413 			break;
5414 		/*
5415 		 * The following options are only returned by us when the
5416 		 * T_SVR4_OPTMGMT_REQ fails.
5417 		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5418 		 * since the transport might adjust the value and not
5419 		 * return exactly what was set by the application.
5420 		 */
5421 		case SO_SNDBUF:
5422 			so->so_sndbuf = intvalue;
5423 			break;
5424 		case SO_RCVBUF:
5425 			so->so_rcvbuf = intvalue;
5426 			break;
5427 #ifdef notyet
5428 		/*
5429 		 * We do not implement the semantics of these options
5430 		 * thus we shouldn't implement the options either.
5431 		 */
5432 		case SO_SNDLOWAT:
5433 			so->so_sndlowat = intvalue;
5434 			break;
5435 		case SO_RCVLOWAT:
5436 			so->so_rcvlowat = intvalue;
5437 			break;
5438 		case SO_SNDTIMEO:
5439 			so->so_sndtimeo = intvalue;
5440 			break;
5441 		case SO_RCVTIMEO:
5442 			so->so_rcvtimeo = intvalue;
5443 			break;
5444 #endif /* notyet */
5445 		}
5446 #undef	intvalue
5447 
5448 		if (error) {
5449 			if ((error == ENOPROTOOPT || error == EPROTO ||
5450 			    error == EINVAL) && handled) {
5451 				dprintso(so, 1,
5452 				    ("setsockopt: ignoring error %d for 0x%x\n",
5453 				    error, option_name));
5454 				error = 0;
5455 			}
5456 		}
5457 	}
5458 done2:
5459 ret:
5460 	so_unlock_single(so, SOLOCKED);
5461 	mutex_exit(&so->so_lock);
5462 	return (error);
5463 }
5464