xref: /titanic_44/usr/src/uts/common/fs/sockfs/socktpi.c (revision de6a15ee5bac749223cdd3f3d02367ab582243ff)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/t_lock.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/buf.h>
34 #include <sys/conf.h>
35 #include <sys/cred.h>
36 #include <sys/kmem.h>
37 #include <sys/sysmacros.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/debug.h>
41 #include <sys/errno.h>
42 #include <sys/time.h>
43 #include <sys/file.h>
44 #include <sys/open.h>
45 #include <sys/user.h>
46 #include <sys/termios.h>
47 #include <sys/stream.h>
48 #include <sys/strsubr.h>
49 #include <sys/strsun.h>
50 #include <sys/ddi.h>
51 #include <sys/esunddi.h>
52 #include <sys/flock.h>
53 #include <sys/modctl.h>
54 #include <sys/vtrace.h>
55 #include <sys/cmn_err.h>
56 #include <sys/pathname.h>
57 
58 #include <sys/socket.h>
59 #include <sys/socketvar.h>
60 #include <sys/sockio.h>
61 #include <netinet/in.h>
62 #include <sys/un.h>
63 #include <sys/strsun.h>
64 
65 #include <sys/tiuser.h>
66 #define	_SUN_TPI_VERSION	2
67 #include <sys/tihdr.h>
68 #include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
69 
70 #include <c2/audit.h>
71 
72 #include <inet/common.h>
73 #include <inet/ip.h>
74 #include <inet/ip6.h>
75 #include <inet/tcp.h>
76 #include <inet/udp_impl.h>
77 
78 #include <fs/sockfs/nl7c.h>
79 #include <sys/zone.h>
80 
81 /*
82  * Possible failures when memory can't be allocated. The documented behavior:
83  *
84  * 		5.5:			4.X:		XNET:
85  * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
86  *							EINTR
87  *	(4.X does not document EINTR but returns it)
88  * bind:	ENOSR			-		ENOBUFS/ENOSR
89  * connect: 	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
90  * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
91  * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
92  *	(4.X getpeername and getsockname do not fail in practice)
93  * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
94  * listen:	-			-		ENOBUFS
95  * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
96  *							EINTR
97  * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
98  *							EINTR
99  * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
100  * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
101  * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
102  * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
103  *
104  * Resolution. When allocation fails:
105  *	recv: return EINTR
106  *	send: return EINTR
107  *	connect, accept: EINTR
108  *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
109  *	socket, socketpair: ENOBUFS
110  *	getpeername, getsockname: sleep
111  *	getsockopt, setsockopt: sleep
112  */
113 
114 #ifdef SOCK_TEST
115 /*
116  * Variables that make sockfs do something other than the standard TPI
117  * for the AF_INET transports.
118  *
119  * solisten_tpi_tcp:
120  *	TCP can handle a O_T_BIND_REQ with an increased backlog even though
121  *	the transport is already bound. This is needed to avoid losing the
122  *	port number should listen() do a T_UNBIND_REQ followed by a
123  *	O_T_BIND_REQ.
124  *
125  * soconnect_tpi_udp:
126  *	UDP and ICMP can handle a T_CONN_REQ.
127  *	This is needed to make the sequence of connect(), getsockname()
128  *	return the local IP address used to send packets to the connected to
129  *	destination.
130  *
131  * soconnect_tpi_tcp:
132  *	TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
133  *	Set this to non-zero to send TPI conformant messages to TCP in this
134  *	respect. This is a performance optimization.
135  *
136  * soaccept_tpi_tcp:
137  *	TCP can handle a T_CONN_REQ without the acceptor being bound.
138  *	This is a performance optimization that has been picked up in XTI.
139  *
140  * soaccept_tpi_multioptions:
141  *	When inheriting SOL_SOCKET options from the listener to the accepting
142  *	socket send them as a single message for AF_INET{,6}.
 *
 * When SOCK_TEST is defined these are ordinary int variables with the
 * initial values given below.  Otherwise the same five names are
 * compile-time constants carrying the same values (see the #else branch).
143  */
144 int solisten_tpi_tcp = 0;
145 int soconnect_tpi_udp = 0;
146 int soconnect_tpi_tcp = 0;
147 int soaccept_tpi_tcp = 0;
148 int soaccept_tpi_multioptions = 1;
149 #else /* SOCK_TEST */
/* Non-test build: the knobs are fixed at compile time. */
150 #define	soconnect_tpi_tcp	0
151 #define	soconnect_tpi_udp	0
152 #define	solisten_tpi_tcp	0
153 #define	soaccept_tpi_tcp	0
154 #define	soaccept_tpi_multioptions	1
155 #endif /* SOCK_TEST */
156 
157 #ifdef SOCK_TEST
/*
 * Test-only variables, declared extern here and visible only when
 * SOCK_TEST is defined.  Their definitions and meaning are not shown in
 * this part of the file.
 */
158 extern int do_useracc;
159 extern clock_t sock_test_timelimit;
160 #endif /* SOCK_TEST */
161 
162 /*
163  * Some X/Open added checks might have to be backed out to keep SunOS 4.X
164  * applications working. Turn on this flag to disable these checks.
 *
 * xnet_skip_checks:
 *	Non-zero disables the checks.  For example the bind path fails with
 *	EINVAL on a socket marked SS_CANTSENDMORE only while this is zero.
 *
 * xnet_check_print:
 *	Non-zero makes the code printf() a diagnostic when one of the
 *	X/Open checks causes a call to fail.
 *
 * xnet_truncate_print:
 *	Not referenced in the part of the file visible here; presumably the
 *	analogous diagnostic switch for truncation - TODO confirm.
165  */
166 int xnet_skip_checks = 0;
167 int xnet_check_print = 0;
168 int xnet_truncate_print = 0;
169 
/* Signal helpers declared extern here; their definitions are not shown. */
170 extern	void sigintr(k_sigset_t *, int);
171 extern	void sigunintr(k_sigset_t *);
172 
/*
 * NL7C hooks, declared extern here.  nl7c_lookup_addr() is handed the
 * bind address and its length by sotpi_bindlisten().
 */
173 extern	void *nl7c_lookup_addr(void *, t_uscalar_t);
174 extern	void *nl7c_add_addr(void *, t_uscalar_t);
175 extern	void nl7c_listener_addr(void *, queue_t *);
176 
/*
 * Forward declaration: sotpi_bindlisten() calls sotpi_unbind() before
 * its (file-local) definition appears.
 */
177 static int	sotpi_unbind(struct sonode *, int);
178 
179 /* TPI sockfs sonode operations */
/*
 * Forward declarations of the file-local operations that are placed in
 * the sotpi_sonodeops table below.  The table's remaining entries
 * (sotpi_recvmsg, sotpi_getpeername, sotpi_getsockopt, sotpi_setsockopt)
 * are not declared here, so their prototypes presumably come from an
 * included header - TODO confirm.
 */
180 static int	sotpi_accept(struct sonode *, int, struct sonode **);
181 static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
182 		    int);
183 static int	sotpi_connect(struct sonode *, const struct sockaddr *,
184 		    socklen_t, int, int);
185 static int	sotpi_listen(struct sonode *, int);
186 static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
187 		    struct uio *);
188 static int	sotpi_shutdown(struct sonode *, int);
189 static int	sotpi_getsockname(struct sonode *);
/* File-local helpers; these two are not entries in sotpi_sonodeops. */
190 static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
191 		    struct uio *, void *, t_uscalar_t, int);
192 static int	sodgram_direct(struct sonode *, struct sockaddr *,
193 		    socklen_t, struct uio *, int);
194 
/*
 * Operations vector for TPI-based sockets.
 *
 * This is a positional initializer, so the entries must stay in the
 * member order of sonodeops_t (declared elsewhere); the trailing comment
 * on each line names the member the entry is intended to fill.
 */
195 sonodeops_t sotpi_sonodeops = {
196 	sotpi_accept,		/* sop_accept		*/
197 	sotpi_bind,		/* sop_bind		*/
198 	sotpi_listen,		/* sop_listen		*/
199 	sotpi_connect,		/* sop_connect		*/
200 	sotpi_recvmsg,		/* sop_recvmsg		*/
201 	sotpi_sendmsg,		/* sop_sendmsg		*/
202 	sotpi_getpeername,	/* sop_getpeername	*/
203 	sotpi_getsockname,	/* sop_getsockname	*/
204 	sotpi_shutdown,		/* sop_shutdown		*/
205 	sotpi_getsockopt,	/* sop_getsockopt	*/
206 	sotpi_setsockopt	/* sop_setsockopt	*/
207 };
208 
209 /*
210  * Common create code for socket and accept. If tso is set the values
211  * from that node are used instead of issuing a T_INFO_REQ.
212  *
213  * Assumes that the caller has a VN_HOLD on accessvp.
214  * The VN_RELE will occur either when sotpi_create() fails or when
215  * the returned sonode is freed.
 *
 * Arguments:
 *	accessvp	held vnode handed to makesockvp(); must not be NULL.
 *	domain, type, protocol
 *			handed to makesockvp(); they also decide whether
 *			the SO_SOCKSTR open flag and SS_DIRECT are requested.
 *	version		stored in so_version (cast to short); SOV_DEFAULT is
 *			replaced by so_default_version first.
 *	tso		optional template sonode (the listener, according to
 *			the comments in the body); handed to so_strinit().
 *	errorp		receives the error from socktpi_open() or
 *			so_strinit() on failure; it is not written on success.
 *
 * Returns the new sonode, or NULL with *errorp set.
216  */
217 struct sonode *
218 sotpi_create(vnode_t *accessvp, int domain, int type, int protocol, int version,
219     struct sonode *tso, int *errorp)
220 {
221 	struct sonode	*so;
222 	vnode_t		*vp;
223 	int		flags, error;
224 
225 	ASSERT(accessvp != NULL);
226 	vp = makesockvp(accessvp, domain, type, protocol);
227 	ASSERT(vp != NULL);
228 	so = VTOSO(vp);
229 
	/* Base open flags; socket-specific flags are OR'ed in below. */
230 	flags = FREAD|FWRITE;
231 
	/*
	 * Stream or datagram sockets over IPv4/IPv6 using TCP or UDP.
	 * IPPROTO_IP is accepted as well, presumably as the "default
	 * protocol" value - TODO confirm.
	 */
232 	if ((type == SOCK_STREAM || type == SOCK_DGRAM) &&
233 	    (domain == AF_INET || domain == AF_INET6) &&
234 	    (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP ||
235 	    protocol == IPPROTO_IP)) {
236 		/* Tell tcp or udp that it's talking to sockets */
237 		flags |= SO_SOCKSTR;
238 
239 		/*
240 		 * Here we indicate to socktpi_open() our attempt to
241 		 * make direct calls between sockfs and transport.
242 		 * The final decision is left to socktpi_open().
243 		 */
244 		so->so_state |= SS_DIRECT;
245 
		/*
		 * A template sonode is never expected for a datagram
		 * socket.  Note that so_type is read from the sonode here
		 * rather than from the "type" argument; presumably
		 * makesockvp() initialized it - TODO confirm.
		 */
246 		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
247 		if (so->so_type == SOCK_STREAM && tso != NULL) {
248 			if (tso->so_state & SS_DIRECT) {
249 				/*
250 				 * Inherit SS_DIRECT from listener and pass
251 				 * SO_ACCEPTOR open flag to tcp, indicating
252 				 * that this is an accept fast-path instance.
253 				 */
254 				flags |= SO_ACCEPTOR;
255 			} else {
256 				/*
257 				 * SS_DIRECT is not set on listener, meaning
258 				 * that the listener has been converted from
259 				 * a socket to a stream.  Ensure that the
260 				 * acceptor inherits these settings.
261 				 */
262 				so->so_state &= ~SS_DIRECT;
263 				flags &= ~SO_SOCKSTR;
264 			}
265 		}
266 	}
267 
268 	/*
269 	 * Tell local transport that it is talking to sockets.
270 	 */
271 	if (so->so_family == AF_UNIX) {
272 		flags |= SO_SOCKSTR;
273 	}
274 
	/* Open failed: release vp and report the error through errorp. */
275 	if (error = socktpi_open(&vp, flags, CRED())) {
276 		VN_RELE(vp);
277 		*errorp = error;
278 		return (NULL);
279 	}
280 
	/*
	 * The open succeeded, so a failure from here on must close the
	 * vnode again before releasing it.
	 */
281 	if (error = so_strinit(so, tso)) {
282 		(void) VOP_CLOSE(vp, 0, 1, 0, CRED());
283 		VN_RELE(vp);
284 		*errorp = error;
285 		return (NULL);
286 	}
287 
288 	if (version == SOV_DEFAULT)
289 		version = so_default_version;
290 
291 	so->so_version = (short)version;
292 	return (so);
293 }
294 
295 /*
296  * Bind the socket to an unspecified address in sockfs only.
297  * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
298  * required in all cases.
 *
 * Nothing is sent to the transport: the function only zeroes the cached
 * local address, stamps it with the socket's address family and sets
 * SS_ISBOUND.
 *
 * Preconditions (all asserted): the socket is AF_INET or AF_INET6, the
 * caller holds so_lock, the socket is not yet bound, and an unbind
 * message is already preallocated in so_unbind_mp.
299  */
300 static void
301 so_automatic_bind(struct sonode *so)
302 {
303 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
304 
305 	ASSERT(MUTEX_HELD(&so->so_lock));
306 	ASSERT(!(so->so_state & SS_ISBOUND));
307 	ASSERT(so->so_unbind_mp);
308 
	/*
	 * so_laddr_len is used as found; this function does not set it.
	 * Only the first so_laddr_len bytes of the address are cleared.
	 */
309 	ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
310 	bzero(so->so_laddr_sa, so->so_laddr_len);
311 	so->so_laddr_sa->sa_family = so->so_family;
312 	so->so_state |= SS_ISBOUND;
313 }
314 
315 
316 /*
317  * bind the socket.
318  *
319  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
320  * are passed in we allow rebinding. Note that for backwards compatibility
321  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
322  * Thus the rebinding code is currently not executed.
323  *
324  * The constraints for rebinding are:
325  * - it is a SOCK_DGRAM, or
326  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
327  *   and no listen() has been done.
328  * This rebinding code was added based on some language in the XNET book
329  * about not returning EINVAL if the protocol allows rebinding. However,
330  * this language is not present in the Posix socket draft. Thus maybe the
331  * rebinding logic should be deleted from the source.
332  *
333  * A null "name" can be used to unbind the socket if:
334  * - it is a SOCK_DGRAM, or
335  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
336  *   and no listen() has been done.
337  */
338 static int
339 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
340     socklen_t namelen, int backlog, int flags)
341 {
342 	struct T_bind_req	bind_req;
343 	struct T_bind_ack	*bind_ack;
344 	int			error = 0;
345 	mblk_t			*mp;
346 	void			*addr;
347 	t_uscalar_t		addrlen;
348 	int			unbind_on_err = 1;
349 	boolean_t		clear_acceptconn_on_err = B_FALSE;
350 	boolean_t		restore_backlog_on_err = B_FALSE;
351 	int			save_so_backlog;
352 	t_scalar_t		PRIM_type = O_T_BIND_REQ;
353 	boolean_t		tcp_udp_xport;
354 	void			*nl7c = NULL;
355 
356 	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
357 		so, name, namelen, backlog, flags,
358 		pr_state(so->so_state, so->so_mode)));
359 
360 	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
361 
362 	if (!(flags & _SOBIND_LOCK_HELD)) {
363 		mutex_enter(&so->so_lock);
364 		so_lock_single(so);	/* Set SOLOCKED */
365 	} else {
366 		ASSERT(MUTEX_HELD(&so->so_lock));
367 		ASSERT(so->so_flag & SOLOCKED);
368 	}
369 
370 	/*
371 	 * Make sure that there is a preallocated unbind_req message
372 	 * before binding. This message is allocated when the socket is
373 	 * created but it might have been consumed.
374 	 */
375 	if (so->so_unbind_mp == NULL) {
376 		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
377 		/* NOTE: holding so_lock while sleeping */
378 		so->so_unbind_mp =
379 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP);
380 	}
381 
382 	if (flags & _SOBIND_REBIND) {
383 		/*
384 		 * Called from solisten after doing an sotpi_unbind() or
385 		 * potentially without the unbind (latter for AF_INET{,6}).
386 		 */
387 		ASSERT(name == NULL && namelen == 0);
388 
389 		if (so->so_family == AF_UNIX) {
390 			ASSERT(so->so_ux_bound_vp);
391 			addr = &so->so_ux_laddr;
392 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
393 			dprintso(so, 1,
394 			("sobind rebind UNIX: addrlen %d, addr 0x%p, vp %p\n",
395 			    addrlen,
396 			    ((struct so_ux_addr *)addr)->soua_vp,
397 			    so->so_ux_bound_vp));
398 		} else {
399 			addr = so->so_laddr_sa;
400 			addrlen = (t_uscalar_t)so->so_laddr_len;
401 		}
402 	} else if (flags & _SOBIND_UNSPEC) {
403 		ASSERT(name == NULL && namelen == 0);
404 
405 		/*
406 		 * The caller checked SS_ISBOUND but not necessarily
407 		 * under so_lock
408 		 */
409 		if (so->so_state & SS_ISBOUND) {
410 			/* No error */
411 			goto done;
412 		}
413 
414 		/* Set an initial local address */
415 		switch (so->so_family) {
416 		case AF_UNIX:
417 			/*
418 			 * Use an address with same size as struct sockaddr
419 			 * just like BSD.
420 			 */
421 			so->so_laddr_len =
422 				(socklen_t)sizeof (struct sockaddr);
423 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
424 			bzero(so->so_laddr_sa, so->so_laddr_len);
425 			so->so_laddr_sa->sa_family = so->so_family;
426 
427 			/*
428 			 * Pass down an address with the implicit bind
429 			 * magic number and the rest all zeros.
430 			 * The transport will return a unique address.
431 			 */
432 			so->so_ux_laddr.soua_vp = NULL;
433 			so->so_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
434 			addr = &so->so_ux_laddr;
435 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
436 			break;
437 
438 		case AF_INET:
439 		case AF_INET6:
440 			/*
441 			 * An unspecified bind in TPI has a NULL address.
442 			 * Set the address in sockfs to have the sa_family.
443 			 */
444 			so->so_laddr_len = (so->so_family == AF_INET) ?
445 			    (socklen_t)sizeof (sin_t) :
446 			    (socklen_t)sizeof (sin6_t);
447 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
448 			bzero(so->so_laddr_sa, so->so_laddr_len);
449 			so->so_laddr_sa->sa_family = so->so_family;
450 			addr = NULL;
451 			addrlen = 0;
452 			break;
453 
454 		default:
455 			/*
456 			 * An unspecified bind in TPI has a NULL address.
457 			 * Set the address in sockfs to be zero length.
458 			 *
459 			 * Can not assume there is a sa_family for all
460 			 * protocol families. For example, AF_X25 does not
461 			 * have a family field.
462 			 */
463 			so->so_laddr_len = 0;	/* XXX correct? */
464 			bzero(so->so_laddr_sa, so->so_laddr_len);
465 			addr = NULL;
466 			addrlen = 0;
467 			break;
468 		}
469 
470 	} else {
471 		if (so->so_state & SS_ISBOUND) {
472 			/*
473 			 * If it is ok to rebind the socket, first unbind
474 			 * with the transport. A rebind to the NULL address
475 			 * is interpreted as an unbind.
476 			 * Note that a bind to NULL in BSD does unbind the
477 			 * socket but it fails with EINVAL.
478 			 * Note that regular sockets set SOV_SOCKBSD i.e.
479 			 * _SOBIND_SOCKBSD gets set here hence no type of
480 			 * socket does currently allow rebinding.
481 			 *
482 			 * If the name is NULL just do an unbind.
483 			 */
484 			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
485 			    name != NULL) {
486 				error = EINVAL;
487 				unbind_on_err = 0;
488 				eprintsoline(so, error);
489 				goto done;
490 			}
491 			if ((so->so_mode & SM_CONNREQUIRED) &&
492 			    (so->so_state & SS_CANTREBIND)) {
493 				error = EINVAL;
494 				unbind_on_err = 0;
495 				eprintsoline(so, error);
496 				goto done;
497 			}
498 			error = sotpi_unbind(so, 0);
499 			if (error) {
500 				eprintsoline(so, error);
501 				goto done;
502 			}
503 			ASSERT(!(so->so_state & SS_ISBOUND));
504 			if (name == NULL) {
505 				so->so_state &=
506 					~(SS_ISCONNECTED|SS_ISCONNECTING);
507 				goto done;
508 			}
509 		}
510 		/* X/Open requires this check */
511 		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
512 			if (xnet_check_print) {
513 				printf("sockfs: X/Open bind state check "
514 				    "caused EINVAL\n");
515 			}
516 			error = EINVAL;
517 			goto done;
518 		}
519 
520 		switch (so->so_family) {
521 		case AF_UNIX:
522 			/*
523 			 * All AF_UNIX addresses are nul terminated
524 			 * when copied (copyin_name) in so the minimum
525 			 * length is 3 bytes.
526 			 */
527 			if (name == NULL ||
528 			    (ssize_t)namelen <= sizeof (short) + 1) {
529 				error = EISDIR;
530 				eprintsoline(so, error);
531 				goto done;
532 			}
533 			/*
534 			 * Verify so_family matches the bound family.
535 			 * BSD does not check this for AF_UNIX resulting
536 			 * in funny mknods.
537 			 */
538 			if (name->sa_family != so->so_family) {
539 				error = EAFNOSUPPORT;
540 				goto done;
541 			}
542 			break;
543 		case AF_INET:
544 			if (name == NULL) {
545 				error = EINVAL;
546 				eprintsoline(so, error);
547 				goto done;
548 			}
549 			if ((size_t)namelen != sizeof (sin_t)) {
550 				error = name->sa_family != so->so_family ?
551 				    EAFNOSUPPORT : EINVAL;
552 				eprintsoline(so, error);
553 				goto done;
554 			}
555 			if ((flags & _SOBIND_XPG4_2) &&
556 			    (name->sa_family != so->so_family)) {
557 				/*
558 				 * This check has to be made for X/Open
559 				 * sockets however application failures have
560 				 * been observed when it is applied to
561 				 * all sockets.
562 				 */
563 				error = EAFNOSUPPORT;
564 				eprintsoline(so, error);
565 				goto done;
566 			}
567 			/*
568 			 * Force a zero sa_family to match so_family.
569 			 *
570 			 * Some programs like inetd(1M) don't set the
571 			 * family field. Other programs leave
572 			 * sin_family set to garbage - SunOS 4.X does
573 			 * not check the family field on a bind.
574 			 * We use the family field that
575 			 * was passed in to the socket() call.
576 			 */
577 			name->sa_family = so->so_family;
578 			break;
579 
580 		case AF_INET6: {
581 #ifdef DEBUG
582 			sin6_t *sin6 = (sin6_t *)name;
583 #endif /* DEBUG */
584 
585 			if (name == NULL) {
586 				error = EINVAL;
587 				eprintsoline(so, error);
588 				goto done;
589 			}
590 			if ((size_t)namelen != sizeof (sin6_t)) {
591 				error = name->sa_family != so->so_family ?
592 				    EAFNOSUPPORT : EINVAL;
593 				eprintsoline(so, error);
594 				goto done;
595 			}
596 			if (name->sa_family != so->so_family) {
597 				/*
598 				 * With IPv6 we require the family to match
599 				 * unlike in IPv4.
600 				 */
601 				error = EAFNOSUPPORT;
602 				eprintsoline(so, error);
603 				goto done;
604 			}
605 #ifdef DEBUG
606 			/*
607 			 * Verify that apps don't forget to clear
608 			 * sin6_scope_id etc
609 			 */
610 			if (sin6->sin6_scope_id != 0 &&
611 			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
612 				cmn_err(CE_WARN,
613 				    "bind with uninitialized sin6_scope_id "
614 				    "(%d) on socket. Pid = %d\n",
615 				    (int)sin6->sin6_scope_id,
616 				    (int)curproc->p_pid);
617 			}
618 			if (sin6->__sin6_src_id != 0) {
619 				cmn_err(CE_WARN,
620 				    "bind with uninitialized __sin6_src_id "
621 				    "(%d) on socket. Pid = %d\n",
622 				    (int)sin6->__sin6_src_id,
623 				    (int)curproc->p_pid);
624 			}
625 #endif /* DEBUG */
626 			break;
627 		}
628 		default:
629 			/*
630 			 * Don't do any length or sa_family check to allow
631 			 * non-sockaddr style addresses.
632 			 */
633 			if (name == NULL) {
634 				error = EINVAL;
635 				eprintsoline(so, error);
636 				goto done;
637 			}
638 			break;
639 		}
640 
641 		if (namelen > (t_uscalar_t)so->so_laddr_maxlen) {
642 			error = ENAMETOOLONG;
643 			eprintsoline(so, error);
644 			goto done;
645 		}
646 		/*
647 		 * Save local address.
648 		 */
649 		so->so_laddr_len = (socklen_t)namelen;
650 		ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
651 		bcopy(name, so->so_laddr_sa, namelen);
652 
653 		addr = so->so_laddr_sa;
654 		addrlen = (t_uscalar_t)so->so_laddr_len;
655 		switch (so->so_family) {
656 		case AF_INET6:
657 		case AF_INET:
658 			break;
659 		case AF_UNIX: {
660 			struct sockaddr_un *soun =
661 				(struct sockaddr_un *)so->so_laddr_sa;
662 			struct vnode *vp;
663 			struct vattr vattr;
664 
665 			ASSERT(so->so_ux_bound_vp == NULL);
666 			/*
667 			 * Create vnode for the specified path name.
668 			 * Keep vnode held with a reference in so_ux_bound_vp.
669 			 * Use the vnode pointer as the address used in the
670 			 * bind with the transport.
671 			 *
672 			 * Start from mode 0777 as in BSD; the bits set in the
673 			 * process umask (u.u_cmask) are cleared from it below.
674 			 */
675 			/* MAXPATHLEN + soun_family + nul termination */
676 			if (so->so_laddr_len >
677 			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
678 				error = ENAMETOOLONG;
679 				eprintsoline(so, error);
680 				goto done;
681 			}
682 			vattr.va_type = VSOCK;
683 			vattr.va_mode = 0777 & ~u.u_cmask;
684 			vattr.va_mask = AT_TYPE|AT_MODE;
685 			/* NOTE: holding so_lock */
686 			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
687 						EXCL, 0, &vp, CRMKNOD, 0, 0);
688 			if (error) {
689 				if (error == EEXIST)
690 					error = EADDRINUSE;
691 				eprintsoline(so, error);
692 				goto done;
693 			}
694 			/*
695 			 * Establish pointer from the underlying filesystem
696 			 * vnode to the socket node.
697 			 * so_ux_bound_vp and v_stream->sd_vnode form the
698 			 * cross-linkage between the underlying filesystem
699 			 * node and the socket node.
700 			 */
701 			ASSERT(SOTOV(so)->v_stream);
702 			mutex_enter(&vp->v_lock);
703 			vp->v_stream = SOTOV(so)->v_stream;
704 			so->so_ux_bound_vp = vp;
705 			mutex_exit(&vp->v_lock);
706 
707 			/*
708 			 * Use the vnode pointer value as a unique address
709 			 * (together with the magic number to avoid conflicts
710 			 * with implicit binds) in the transport provider.
711 			 */
712 			so->so_ux_laddr.soua_vp = (void *)so->so_ux_bound_vp;
713 			so->so_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
714 			addr = &so->so_ux_laddr;
715 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
716 			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
717 			    addrlen,
718 			    ((struct so_ux_addr *)addr)->soua_vp));
719 			break;
720 		}
721 		} /* end switch (so->so_family) */
722 	}
723 
724 	/*
725 	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
726 	 * the transport can start passing up T_CONN_IND messages
727 	 * as soon as it receives the bind req and strsock_proto()
728 	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
729 	 */
730 	if (flags & _SOBIND_LISTEN) {
731 		if ((so->so_state & SS_ACCEPTCONN) == 0)
732 			clear_acceptconn_on_err = B_TRUE;
733 		save_so_backlog = so->so_backlog;
734 		restore_backlog_on_err = B_TRUE;
735 		so->so_state |= SS_ACCEPTCONN;
736 		so->so_backlog = backlog;
737 	}
738 
739 	/*
740 	 * If NL7C addr(s) have been configured check for addr/port match,
741 	 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
742 	 *
743 	 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
744 	 * family sockets only. If match mark as such.
745 	 */
746 	if ((nl7c_enabled && addr != NULL &&
747 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
748 	    (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
749 	    so->so_nl7c_flags == NL7C_AF_NCA) {
750 		/*
751 		 * NL7C is not supported in non-global zones,
752 		 * we enforce this restriction here.
753 		 */
754 		if (so->so_zoneid == GLOBAL_ZONEID) {
755 			/* An NL7C socket, mark it */
756 			so->so_nl7c_flags |= NL7C_ENABLED;
757 		} else
758 			nl7c = NULL;
759 	}
760 	/*
761 	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
762 	 * for other transports we will send in a O_T_BIND_REQ.
763 	 */
764 	if (tcp_udp_xport &&
765 	    (so->so_family == AF_INET || so->so_family == AF_INET6))
766 		PRIM_type = T_BIND_REQ;
767 
768 	bind_req.PRIM_type = PRIM_type;
769 	bind_req.ADDR_length = addrlen;
770 	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
771 	bind_req.CONIND_number = backlog;
772 	/* NOTE: holding so_lock while sleeping */
773 	mp = soallocproto2(&bind_req, sizeof (bind_req),
774 				addr, addrlen, 0, _ALLOC_SLEEP);
775 	so->so_state &= ~SS_LADDR_VALID;
776 	/* Done using so_laddr_sa - can drop the lock */
777 	mutex_exit(&so->so_lock);
778 
779 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
780 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
781 	if (error) {
782 		eprintsoline(so, error);
783 		mutex_enter(&so->so_lock);
784 		goto done;
785 	}
786 
787 	mutex_enter(&so->so_lock);
788 	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
789 	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
790 	if (error) {
791 		eprintsoline(so, error);
792 		goto done;
793 	}
794 	ASSERT(mp);
795 	/*
796 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
797 	 * strsock_proto while the lock was dropped above, the bind
798 	 * is allowed to complete.
799 	 */
800 
801 	/* Mark as bound. This will be undone if we detect errors below. */
802 	if (flags & _SOBIND_NOXLATE) {
803 		ASSERT(so->so_family == AF_UNIX);
804 		so->so_state |= SS_FADDR_NOXLATE;
805 	}
806 	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
807 	so->so_state |= SS_ISBOUND;
808 	ASSERT(so->so_unbind_mp);
809 
810 	/* note that we've already set SS_ACCEPTCONN above */
811 
812 	/*
813 	 * Recompute addrlen - an unspecified bind sent down an
814 	 * address of length zero but we expect the appropriate length
815 	 * in return.
816 	 */
817 	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
818 	    sizeof (so->so_ux_laddr) : so->so_laddr_len);
819 
820 	bind_ack = (struct T_bind_ack *)mp->b_rptr;
821 	/*
822 	 * The alignment restriction is really too strict but
823 	 * we want enough alignment to inspect the fields of
824 	 * a sockaddr_in.
825 	 */
826 	addr = sogetoff(mp, bind_ack->ADDR_offset,
827 			bind_ack->ADDR_length,
828 			__TPI_ALIGN_SIZE);
829 	if (addr == NULL) {
830 		freemsg(mp);
831 		error = EPROTO;
832 		eprintsoline(so, error);
833 		goto done;
834 	}
835 	if (!(flags & _SOBIND_UNSPEC)) {
836 		/*
837 		 * Verify that the transport didn't return something we
838 		 * did not want e.g. an address other than what we asked for.
839 		 *
840 		 * NOTE: These checks would go away if/when we switch to
841 		 * using the new TPI (in which the transport would fail
842 		 * the request instead of assigning a different address).
843 		 *
844 		 * NOTE2: For protocols that we don't know (i.e. any
845 		 * other than AF_INET6, AF_INET and AF_UNIX), we
846 		 * cannot know if the transport should be expected to
847 		 * return the same address as that requested.
848 		 *
849 		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
850 		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
851 		 *
852 		 * For example, in the case of netatalk it may be
853 		 * inappropriate for the transport to return the
854 		 * requested address (as it may have allocated a local
855 		 * port number in behaviour similar to that of an
856 		 * AF_INET bind request with a port number of zero).
857 		 *
858 		 * Given the definition of O_T_BIND_REQ, where the
859 		 * transport may bind to an address other than the
860 		 * requested address, it's not possible to determine
861 		 * whether a returned address that differs from the
862 		 * requested address is a reason to fail (because the
863 		 * requested address was not available) or succeed
864 		 * (because the transport allocated an appropriate
865 		 * address and/or port).
866 		 *
867 		 * sockfs currently requires that the transport return
868 		 * the requested address in the T_BIND_ACK, unless
869 		 * there is code here to allow for any discrepancy.
870 		 * Such code exists for AF_INET and AF_INET6.
871 		 *
872 		 * Netatalk chooses to return the requested address
873 		 * rather than the (correct) allocated address.  This
874 		 * means that netatalk violates the TPI specification
875 		 * (and would not function correctly if used from a
876 		 * TLI application), but it does mean that it works
877 		 * with sockfs.
878 		 *
879 		 * As noted above, using the newer XTI bind primitive
880 		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
881 		 * allow sockfs to be more sure about whether or not
882 		 * the bind request had succeeded (as transports are
883 		 * not permitted to bind to a different address than
884 		 * that requested - they must return failure).
885 		 * Unfortunately, support for T_BIND_REQ may not be
886 		 * present in all transport implementations (netatalk,
887 		 * for example, doesn't have it), making the
888 		 * transition difficult.
889 		 */
890 		if (bind_ack->ADDR_length != addrlen) {
891 			/* Assumes that the requested address was in use */
892 			freemsg(mp);
893 			error = EADDRINUSE;
894 			eprintsoline(so, error);
895 			goto done;
896 		}
897 
898 		switch (so->so_family) {
899 		case AF_INET6:
900 		case AF_INET: {
901 			sin_t *rname, *aname;
902 
903 			rname = (sin_t *)addr;
904 			aname = (sin_t *)so->so_laddr_sa;
905 
906 			/*
907 			 * Take advantage of the alignment
908 			 * of sin_port and sin6_port which fall
909 			 * in the same place in their data structures.
910 			 * Just use sin_port for either address family.
911 			 *
912 			 * This may become a problem if (heaven forbid)
913 			 * there's a separate ipv6port_reserved... :-P
914 			 *
915 			 * Binding to port 0 has the semantics of letting
916 			 * the transport bind to any port.
917 			 *
918 			 * If the transport is TCP or UDP since we had sent
919 			 * a T_BIND_REQ we would not get a port other than
920 			 * what we asked for.
921 			 */
922 			if (tcp_udp_xport) {
923 				/*
924 				 * Pick up the new port number if we bound to
925 				 * port 0.
926 				 */
927 				if (aname->sin_port == 0)
928 					aname->sin_port = rname->sin_port;
929 				so->so_state |= SS_LADDR_VALID;
930 				break;
931 			}
932 			if (aname->sin_port != 0 &&
933 			    aname->sin_port != rname->sin_port) {
934 				freemsg(mp);
935 				error = EADDRINUSE;
936 				eprintsoline(so, error);
937 				goto done;
938 			}
939 			/*
940 			 * Pick up the new port number if we bound to port 0.
941 			 */
942 			aname->sin_port = rname->sin_port;
943 
944 			/*
945 			 * Unfortunately, addresses aren't _quite_ the same.
946 			 */
947 			if (so->so_family == AF_INET) {
948 				if (aname->sin_addr.s_addr !=
949 				    rname->sin_addr.s_addr) {
950 					freemsg(mp);
951 					error = EADDRNOTAVAIL;
952 					eprintsoline(so, error);
953 					goto done;
954 				}
955 			} else {
956 				sin6_t *rname6 = (sin6_t *)rname;
957 				sin6_t *aname6 = (sin6_t *)aname;
958 
959 				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
960 				    &rname6->sin6_addr)) {
961 					freemsg(mp);
962 					error = EADDRNOTAVAIL;
963 					eprintsoline(so, error);
964 					goto done;
965 				}
966 			}
967 			break;
968 		}
969 		case AF_UNIX:
970 			if (bcmp(addr, &so->so_ux_laddr, addrlen) != 0) {
971 				freemsg(mp);
972 				error = EADDRINUSE;
973 				eprintsoline(so, error);
974 				eprintso(so,
975 					("addrlen %d, addr 0x%x, vp %p\n",
976 					addrlen, *((int *)addr),
977 					so->so_ux_bound_vp));
978 				goto done;
979 			}
980 			so->so_state |= SS_LADDR_VALID;
981 			break;
982 		default:
983 			/*
984 			 * NOTE: This assumes that addresses can be
985 			 * byte-compared for equivalence.
986 			 */
987 			if (bcmp(addr, so->so_laddr_sa, addrlen) != 0) {
988 				freemsg(mp);
989 				error = EADDRINUSE;
990 				eprintsoline(so, error);
991 				goto done;
992 			}
993 			/*
994 			 * Don't mark SS_LADDR_VALID, as we cannot be
995 			 * sure that the returned address is the real
996 			 * bound address when talking to an unknown
997 			 * transport.
998 			 */
999 			break;
1000 		}
1001 	} else {
1002 		/*
1003 		 * Save the returned address for getsockname.
1004 		 * Needed for unspecific bind unless transport supports
1005 		 * the TI_GETMYNAME ioctl.
1006 		 * Do this for AF_INET{,6} even though they do, as
1007 		 * caching info here is much better performance than
1008 		 * a TPI/STREAMS trip to the transport for getsockname.
1009 		 * Any which can't for some reason _must_ _not_ set
1010 		 * LADDR_VALID here for the caching version of getsockname
1011 		 * to not break;
1012 		 */
1013 		switch (so->so_family) {
1014 		case AF_UNIX:
1015 			/*
1016 			 * Record the address bound with the transport
1017 			 * for use by socketpair.
1018 			 */
1019 			bcopy(addr, &so->so_ux_laddr, addrlen);
1020 			so->so_state |= SS_LADDR_VALID;
1021 			break;
1022 		case AF_INET:
1023 		case AF_INET6:
1024 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
1025 			bcopy(addr, so->so_laddr_sa, so->so_laddr_len);
1026 			so->so_state |= SS_LADDR_VALID;
1027 			break;
1028 		default:
1029 			/*
1030 			 * Don't mark SS_LADDR_VALID, as we cannot be
1031 			 * sure that the returned address is the real
1032 			 * bound address when talking to an unknown
1033 			 * transport.
1034 			 */
1035 			break;
1036 		}
1037 	}
1038 
1039 	if (nl7c == NULL && (so->so_nl7c_flags & NL7C_AF_NCA) &&
1040 	    (so->so_nl7c_flags & NL7C_ENABLED)) {
1041 		/*
1042 		 * Was an AF_NCA bind() so add it to the addr list for
1043 		 * reporting purposes.
1044 		 */
1045 		nl7c = nl7c_add_addr(addr, addrlen);
1046 	}
1047 	if (nl7c != NULL) {
1048 		nl7c_listener_addr(nl7c, strvp2wq(SOTOV(so)));
1049 	}
1050 
1051 	freemsg(mp);
1052 
1053 done:
1054 	if (error) {
1055 		/* reset state & backlog to values held on entry */
1056 		if (clear_acceptconn_on_err == B_TRUE)
1057 			so->so_state &= ~SS_ACCEPTCONN;
1058 		if (restore_backlog_on_err == B_TRUE)
1059 			so->so_backlog = save_so_backlog;
1060 
1061 		if (unbind_on_err && so->so_state & SS_ISBOUND) {
1062 			int err;
1063 
1064 			err = sotpi_unbind(so, 0);
1065 			/* LINTED - statement has no consequent: if */
1066 			if (err) {
1067 				eprintsoline(so, error);
1068 			} else {
1069 				ASSERT(!(so->so_state & SS_ISBOUND));
1070 			}
1071 		}
1072 	}
1073 	if (!(flags & _SOBIND_LOCK_HELD)) {
1074 		so_unlock_single(so, SOLOCKED);
1075 		mutex_exit(&so->so_lock);
1076 	} else {
1077 		/* If the caller held the lock don't release it here */
1078 		ASSERT(MUTEX_HELD(&so->so_lock));
1079 		ASSERT(so->so_flag & SOLOCKED);
1080 	}
1081 	return (error);
1082 }
1083 
1084 /* bind the socket */
1085 static int
1086 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1087     int flags)
1088 {
1089 	if ((flags & _SOBIND_SOCKETPAIR) == 0)
1090 		return (sotpi_bindlisten(so, name, namelen, 0, flags));
1091 
1092 	flags &= ~_SOBIND_SOCKETPAIR;
1093 	return (sotpi_bindlisten(so, name, namelen, 1, flags));
1094 }
1095 
1096 /*
1097  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1098  * address, or when listen needs to unbind and bind.
1099  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1100  * so that a sobind can pick them up.
1101  */
1102 static int
1103 sotpi_unbind(struct sonode *so, int flags)
1104 {
1105 	struct T_unbind_req	unbind_req;
1106 	int			error = 0;
1107 	mblk_t			*mp;
1108 
1109 	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1110 			so, flags, pr_state(so->so_state, so->so_mode)));
1111 
1112 	ASSERT(MUTEX_HELD(&so->so_lock));
1113 	ASSERT(so->so_flag & SOLOCKED);
1114 
1115 	if (!(so->so_state & SS_ISBOUND)) {
1116 		error = EINVAL;
1117 		eprintsoline(so, error);
1118 		goto done;
1119 	}
1120 
1121 	mutex_exit(&so->so_lock);
1122 
1123 	/*
1124 	 * Flush the read and write side (except stream head read queue)
1125 	 * and send down T_UNBIND_REQ.
1126 	 */
1127 	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1128 
1129 	unbind_req.PRIM_type = T_UNBIND_REQ;
1130 	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1131 	    0, _ALLOC_SLEEP);
1132 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1133 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1134 	mutex_enter(&so->so_lock);
1135 	if (error) {
1136 		eprintsoline(so, error);
1137 		goto done;
1138 	}
1139 
1140 	error = sowaitokack(so, T_UNBIND_REQ);
1141 	if (error) {
1142 		eprintsoline(so, error);
1143 		goto done;
1144 	}
1145 
1146 	/*
1147 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1148 	 * strsock_proto while the lock was dropped above, the unbind
1149 	 * is allowed to complete.
1150 	 */
1151 	if (!(flags & _SOUNBIND_REBIND)) {
1152 		/*
1153 		 * Clear out bound address.
1154 		 */
1155 		vnode_t *vp;
1156 
1157 		if ((vp = so->so_ux_bound_vp) != NULL) {
1158 			ASSERT(vp->v_stream);
1159 			so->so_ux_bound_vp = NULL;
1160 			vn_rele_stream(vp);
1161 		}
1162 		/* Clear out address */
1163 		so->so_laddr_len = 0;
1164 	}
1165 	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN|SS_LADDR_VALID);
1166 done:
1167 	/* If the caller held the lock don't release it here */
1168 	ASSERT(MUTEX_HELD(&so->so_lock));
1169 	ASSERT(so->so_flag & SOLOCKED);
1170 
1171 	return (error);
1172 }
1173 
1174 /*
1175  * listen on the socket.
1176  * For TPI conforming transports this has to first unbind with the transport
1177  * and then bind again using the new backlog.
1178  */
1179 int
1180 sotpi_listen(struct sonode *so, int backlog)
1181 {
1182 	int		error = 0;
1183 
1184 	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1185 		so, backlog, pr_state(so->so_state, so->so_mode)));
1186 
1187 	if (so->so_serv_type == T_CLTS)
1188 		return (EOPNOTSUPP);
1189 
1190 	/*
1191 	 * If the socket is ready to accept connections already, then
1192 	 * return without doing anything.  This avoids a problem where
1193 	 * a second listen() call fails if a connection is pending and
1194 	 * leaves the socket unbound. Only when we are not unbinding
1195 	 * with the transport can we safely increase the backlog.
1196 	 */
1197 	if (so->so_state & SS_ACCEPTCONN &&
1198 	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1199 		/*CONSTCOND*/
1200 		!solisten_tpi_tcp))
1201 		return (0);
1202 
1203 	if (so->so_state & SS_ISCONNECTED)
1204 		return (EINVAL);
1205 
1206 	mutex_enter(&so->so_lock);
1207 	so_lock_single(so);	/* Set SOLOCKED */
1208 
1209 	if (backlog < 0)
1210 		backlog = 0;
1211 	/*
1212 	 * Use the same qlimit as in BSD. BSD checks the qlimit
1213 	 * before queuing the next connection implying that a
1214 	 * listen(sock, 0) allows one connection to be queued.
1215 	 * BSD also uses 1.5 times the requested backlog.
1216 	 *
1217 	 * XNS Issue 4 required a strict interpretation of the backlog.
1218 	 * This has been waived subsequently for Issue 4 and the change
1219 	 * incorporated in XNS Issue 5. So we aren't required to do
1220 	 * anything special for XPG apps.
1221 	 */
1222 	if (backlog >= (INT_MAX - 1) / 3)
1223 		backlog = INT_MAX;
1224 	else
1225 		backlog = backlog * 3 / 2 + 1;
1226 
1227 	/*
1228 	 * If the listen doesn't change the backlog we do nothing.
1229 	 * This avoids an EPROTO error from the transport.
1230 	 */
1231 	if ((so->so_state & SS_ACCEPTCONN) &&
1232 	    so->so_backlog == backlog)
1233 		goto done;
1234 
1235 	if (!(so->so_state & SS_ISBOUND)) {
1236 		/*
1237 		 * Must have been explicitly bound in the UNIX domain.
1238 		 */
1239 		if (so->so_family == AF_UNIX) {
1240 			error = EINVAL;
1241 			goto done;
1242 		}
1243 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1244 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN);
1245 	} else if (backlog > 0) {
1246 		/*
1247 		 * AF_INET{,6} hack to avoid losing the port.
1248 		 * Assumes that all AF_INET{,6} transports can handle a
1249 		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1250 		 * has already bound thus it is possible to avoid the unbind.
1251 		 */
1252 		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1253 		    /*CONSTCOND*/
1254 		    !solisten_tpi_tcp)) {
1255 			error = sotpi_unbind(so, _SOUNBIND_REBIND);
1256 			if (error)
1257 				goto done;
1258 		}
1259 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1260 			    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN);
1261 	} else {
1262 		so->so_state |= SS_ACCEPTCONN;
1263 		so->so_backlog = backlog;
1264 	}
1265 	if (error)
1266 		goto done;
1267 	ASSERT(so->so_state & SS_ACCEPTCONN);
1268 done:
1269 	so_unlock_single(so, SOLOCKED);
1270 	mutex_exit(&so->so_lock);
1271 	return (error);
1272 }
1273 
1274 /*
1275  * Disconnect either a specified seqno or all (-1).
1276  * The former is used on listening sockets only.
1277  *
1278  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1279  * the current use of sodisconnect(seqno == -1) is only for shutdown
1280  * so there is no point (and potentially incorrect) to unbind.
1281  */
1282 int
1283 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1284 {
1285 	struct T_discon_req	discon_req;
1286 	int			error = 0;
1287 	mblk_t			*mp;
1288 
1289 	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1290 			so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1291 
1292 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1293 		mutex_enter(&so->so_lock);
1294 		so_lock_single(so);	/* Set SOLOCKED */
1295 	} else {
1296 		ASSERT(MUTEX_HELD(&so->so_lock));
1297 		ASSERT(so->so_flag & SOLOCKED);
1298 	}
1299 
1300 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1301 		error = EINVAL;
1302 		eprintsoline(so, error);
1303 		goto done;
1304 	}
1305 
1306 	mutex_exit(&so->so_lock);
1307 	/*
1308 	 * Flush the write side (unless this is a listener)
1309 	 * and then send down a T_DISCON_REQ.
1310 	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1311 	 * and other messages.)
1312 	 */
1313 	if (!(so->so_state & SS_ACCEPTCONN))
1314 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1315 
1316 	discon_req.PRIM_type = T_DISCON_REQ;
1317 	discon_req.SEQ_number = seqno;
1318 	mp = soallocproto1(&discon_req, sizeof (discon_req),
1319 	    0, _ALLOC_SLEEP);
1320 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1321 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1322 	mutex_enter(&so->so_lock);
1323 	if (error) {
1324 		eprintsoline(so, error);
1325 		goto done;
1326 	}
1327 
1328 	error = sowaitokack(so, T_DISCON_REQ);
1329 	if (error) {
1330 		eprintsoline(so, error);
1331 		goto done;
1332 	}
1333 	/*
1334 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1335 	 * strsock_proto while the lock was dropped above, the disconnect
1336 	 * is allowed to complete. However, it is not possible to
1337 	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1338 	 */
1339 	so->so_state &=
1340 	    ~(SS_ISCONNECTED|SS_ISCONNECTING|SS_LADDR_VALID|SS_FADDR_VALID);
1341 done:
1342 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1343 		so_unlock_single(so, SOLOCKED);
1344 		mutex_exit(&so->so_lock);
1345 	} else {
1346 		/* If the caller held the lock don't release it here */
1347 		ASSERT(MUTEX_HELD(&so->so_lock));
1348 		ASSERT(so->so_flag & SOLOCKED);
1349 	}
1350 	return (error);
1351 }
1352 
1353 int
1354 sotpi_accept(struct sonode *so, int fflag, struct sonode **nsop)
1355 {
1356 	struct T_conn_ind	*conn_ind;
1357 	struct T_conn_res	*conn_res;
1358 	int			error = 0;
1359 	mblk_t			*mp;
1360 	struct sonode		*nso;
1361 	vnode_t			*nvp;
1362 	void			*src;
1363 	t_uscalar_t		srclen;
1364 	void			*opt;
1365 	t_uscalar_t		optlen;
1366 	t_scalar_t		PRIM_type;
1367 	t_scalar_t		SEQ_number;
1368 
1369 	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1370 		so, fflag, nsop, pr_state(so->so_state, so->so_mode)));
1371 
1372 	/*
1373 	 * Defer single-threading the accepting socket until
1374 	 * the T_CONN_IND has been received and parsed and the
1375 	 * new sonode has been opened.
1376 	 */
1377 
1378 	/* Check that we are not already connected */
1379 	if ((so->so_state & SS_ACCEPTCONN) == 0)
1380 		goto conn_bad;
1381 again:
1382 	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1383 		goto e_bad;
1384 
1385 	ASSERT(mp);
1386 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1387 	/*
1388 	 * Save SEQ_number for error paths.
1389 	 */
1390 	SEQ_number = conn_ind->SEQ_number;
1391 
1392 	srclen = conn_ind->SRC_length;
1393 	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1394 	if (src == NULL) {
1395 		error = EPROTO;
1396 		freemsg(mp);
1397 		eprintsoline(so, error);
1398 		goto disconnect_unlocked;
1399 	}
1400 	optlen = conn_ind->OPT_length;
1401 	switch (so->so_family) {
1402 	case AF_INET:
1403 	case AF_INET6:
1404 		if ((optlen == sizeof (intptr_t)) &&
1405 		    ((so->so_state & SS_DIRECT) != 0)) {
1406 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
1407 			    &opt, conn_ind->OPT_length);
1408 		} else {
1409 			/*
1410 			 * The transport (in this case TCP) hasn't sent up
1411 			 * a pointer to an instance for the accept fast-path.
1412 			 * Disable fast-path completely because the call to
1413 			 * sotpi_create() below would otherwise create an
1414 			 * incomplete TCP instance, which would lead to
1415 			 * problems when sockfs sends a normal T_CONN_RES
1416 			 * message down the new stream.
1417 			 */
1418 			if (so->so_state & SS_DIRECT) {
1419 				int rval;
1420 				/*
1421 				 * For consistency we inform tcp to disable
1422 				 * direct interface on the listener, though
1423 				 * we can certainly live without doing this
1424 				 * because no data will ever travel upstream
1425 				 * on the listening socket.
1426 				 */
1427 				so->so_state &= ~SS_DIRECT;
1428 				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1429 				    0, 0, K_TO_K, CRED(), &rval);
1430 			}
1431 			opt = NULL;
1432 			optlen = 0;
1433 		}
1434 		break;
1435 	case AF_UNIX:
1436 	default:
1437 		if (optlen != 0) {
1438 			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1439 			    __TPI_ALIGN_SIZE);
1440 			if (opt == NULL) {
1441 				error = EPROTO;
1442 				freemsg(mp);
1443 				eprintsoline(so, error);
1444 				goto disconnect_unlocked;
1445 			}
1446 		}
1447 		if (so->so_family == AF_UNIX) {
1448 			if (!(so->so_state & SS_FADDR_NOXLATE)) {
1449 				src = NULL;
1450 				srclen = 0;
1451 			}
1452 			/* Extract src address from options */
1453 			if (optlen != 0)
1454 				so_getopt_srcaddr(opt, optlen, &src, &srclen);
1455 		}
1456 		break;
1457 	}
1458 
1459 	/*
1460 	 * Create the new socket.
1461 	 */
1462 	VN_HOLD(so->so_accessvp);
1463 	nso = sotpi_create(so->so_accessvp, so->so_family, so->so_type,
1464 			so->so_protocol, so->so_version, so, &error);
1465 	if (nso == NULL) {
1466 		ASSERT(error != 0);
1467 		/*
1468 		 * Accept can not fail with ENOBUFS. sotpi_create
1469 		 * sleeps waiting for memory until a signal is caught
1470 		 * so return EINTR.
1471 		 */
1472 		freemsg(mp);
1473 		if (error == ENOBUFS)
1474 			error = EINTR;
1475 		goto e_disc_unl;
1476 	}
1477 	nvp = SOTOV(nso);
1478 
1479 #ifdef DEBUG
1480 	/*
1481 	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1482 	 * it's inherited early to allow debugging of the accept code itself.
1483 	 */
1484 	nso->so_options |= so->so_options & SO_DEBUG;
1485 #endif /* DEBUG */
1486 
1487 	/*
1488 	 * Save the SRC address from the T_CONN_IND
1489 	 * for getpeername to work on AF_UNIX and on transports that do not
1490 	 * support TI_GETPEERNAME.
1491 	 *
1492 	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1493 	 * copyin_name().
1494 	 */
1495 	if (srclen > (t_uscalar_t)nso->so_faddr_maxlen) {
1496 		error = EINVAL;
1497 		freemsg(mp);
1498 		eprintsoline(so, error);
1499 		goto disconnect_vp_unlocked;
1500 	}
1501 	nso->so_faddr_len = (socklen_t)srclen;
1502 	ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
1503 	bcopy(src, nso->so_faddr_sa, srclen);
1504 	nso->so_state |= SS_FADDR_VALID;
1505 
1506 	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1507 	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1508 		cred_t *cr;
1509 
1510 		if ((cr = DB_CRED(mp)) != NULL) {
1511 			crhold(cr);
1512 			nso->so_peercred = cr;
1513 			nso->so_cpid = DB_CPID(mp);
1514 		}
1515 		freemsg(mp);
1516 
1517 		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1518 		    sizeof (intptr_t), 0, _ALLOC_INTR);
1519 		if (mp == NULL) {
1520 			/*
1521 			 * Accept can not fail with ENOBUFS.
1522 			 * A signal was caught so return EINTR.
1523 			 */
1524 			error = EINTR;
1525 			eprintsoline(so, error);
1526 			goto disconnect_vp_unlocked;
1527 		}
1528 		conn_res = (struct T_conn_res *)mp->b_rptr;
1529 	} else {
1530 		nso->so_peercred = DB_CRED(mp);
1531 		nso->so_cpid = DB_CPID(mp);
1532 		DB_CRED(mp) = NULL;
1533 
1534 		mp->b_rptr = DB_BASE(mp);
1535 		conn_res = (struct T_conn_res *)mp->b_rptr;
1536 		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1537 	}
1538 
1539 	/*
1540 	 * New socket must be bound at least in sockfs and, except for AF_INET,
1541 	 * (or AF_INET6) it also has to be bound in the transport provider.
1542 	 * After accepting the connection on nso so_laddr_sa will be set to
1543 	 * contain the same address as the listener's local address
1544 	 * so the address we bind to isn't important.
1545 	 */
1546 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1547 	    /*CONSTCOND*/
1548 	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1549 		/*
1550 		 * Optimization for AF_INET{,6} transports
1551 		 * that can handle a T_CONN_RES without being bound.
1552 		 */
1553 		mutex_enter(&nso->so_lock);
1554 		so_automatic_bind(nso);
1555 		mutex_exit(&nso->so_lock);
1556 	} else {
1557 		/* Perform NULL bind with the transport provider. */
1558 		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC)) != 0) {
1559 			ASSERT(error != ENOBUFS);
1560 			freemsg(mp);
1561 			eprintsoline(nso, error);
1562 			goto disconnect_vp_unlocked;
1563 		}
1564 	}
1565 
1566 	/*
1567 	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1568 	 * so that any data arriving on the new socket will cause the
1569 	 * appropriate signals to be delivered for the new socket.
1570 	 *
1571 	 * No other thread (except strsock_proto and strsock_misc)
1572 	 * can access the new socket thus we relax the locking.
1573 	 */
1574 	nso->so_pgrp = so->so_pgrp;
1575 	nso->so_state |= so->so_state & (SS_ASYNC|SS_FADDR_NOXLATE);
1576 
1577 	if (nso->so_pgrp != 0) {
1578 		if ((error = so_set_events(nso, nvp, CRED())) != 0) {
1579 			eprintsoline(nso, error);
1580 			error = 0;
1581 			nso->so_pgrp = 0;
1582 		}
1583 	}
1584 
1585 	/*
1586 	 * Make note of the socket level options. TCP and IP level options
1587 	 * are already inherited. We could do all this after accept is
1588 	 * successful but doing it here simplifies code and no harm done
1589 	 * for error case.
1590 	 */
1591 	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1592 	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1593 	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1594 	nso->so_sndbuf = so->so_sndbuf;
1595 	nso->so_rcvbuf = so->so_rcvbuf;
1596 	if (nso->so_options & SO_LINGER)
1597 		nso->so_linger = so->so_linger;
1598 
1599 	if ((so->so_state & SS_DIRECT) != 0) {
1600 		mblk_t *ack_mp;
1601 
1602 		ASSERT(nso->so_state & SS_DIRECT);
1603 		ASSERT(opt != NULL);
1604 
1605 		conn_res->OPT_length = optlen;
1606 		conn_res->OPT_offset = MBLKL(mp);
1607 		bcopy(&opt, mp->b_wptr, optlen);
1608 		mp->b_wptr += optlen;
1609 		conn_res->PRIM_type = T_CONN_RES;
1610 		conn_res->ACCEPTOR_id = 0;
1611 		PRIM_type = T_CONN_RES;
1612 
1613 		/* Send down the T_CONN_RES on acceptor STREAM */
1614 		error = kstrputmsg(SOTOV(nso), mp, NULL,
1615 		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1616 		if (error) {
1617 			mutex_enter(&so->so_lock);
1618 			so_lock_single(so);
1619 			eprintsoline(so, error);
1620 			goto disconnect_vp;
1621 		}
1622 		mutex_enter(&nso->so_lock);
1623 		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1624 		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1625 		if (error) {
1626 			mutex_exit(&nso->so_lock);
1627 			mutex_enter(&so->so_lock);
1628 			so_lock_single(so);
1629 			eprintsoline(so, error);
1630 			goto disconnect_vp;
1631 		}
1632 		if (nso->so_family == AF_INET) {
1633 			sin_t *sin;
1634 
1635 			sin = (sin_t *)(ack_mp->b_rptr +
1636 			    sizeof (struct T_ok_ack));
1637 			bcopy(sin, nso->so_laddr_sa, sizeof (sin_t));
1638 			nso->so_laddr_len = sizeof (sin_t);
1639 		} else {
1640 			sin6_t *sin6;
1641 
1642 			sin6 = (sin6_t *)(ack_mp->b_rptr +
1643 			    sizeof (struct T_ok_ack));
1644 			bcopy(sin6, nso->so_laddr_sa, sizeof (sin6_t));
1645 			nso->so_laddr_len = sizeof (sin6_t);
1646 		}
1647 		freemsg(ack_mp);
1648 
1649 		nso->so_state |= SS_ISCONNECTED | SS_LADDR_VALID;
1650 		nso->so_priv = opt;
1651 
1652 		if (so->so_nl7c_flags & NL7C_ENABLED) {
1653 			/*
1654 			 * An NL7C marked listen()er so the new socket
1655 			 * inherits the listen()er's NL7C state.
1656 			 *
1657 			 * When calling NL7C to process the new socket
1658 			 * pass the nonblocking i/o state of the listen
1659 			 * socket as this is the context we are in.
1660 			 */
1661 			nso->so_nl7c_flags = so->so_nl7c_flags;
1662 			if (nl7c_process(nso,
1663 			    (nso->so_state & (SS_NONBLOCK|SS_NDELAY)),
1664 			    (int)((tcp_t *)nso->so_priv)->tcp_mss)) {
1665 				/*
1666 				 * NL7C has completed processing on the
1667 				 * socket, close the socket and back to
1668 				 * the top to await the next T_CONN_IND.
1669 				 */
1670 				mutex_exit(&nso->so_lock);
1671 				(void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
1672 						CRED());
1673 				VN_RELE(nvp);
1674 				goto again;
1675 			}
1676 			/* Pass the new socket out */
1677 		}
1678 
1679 		mutex_exit(&nso->so_lock);
1680 
1681 		/*
1682 		 * Pass out new socket.
1683 		 */
1684 		if (nsop != NULL)
1685 			*nsop = nso;
1686 
1687 		return (0);
1688 	}
1689 
1690 	/*
1691 	 * Copy local address from listener.
1692 	 */
1693 	nso->so_laddr_len = so->so_laddr_len;
1694 	ASSERT(nso->so_laddr_len <= nso->so_laddr_maxlen);
1695 	bcopy(so->so_laddr_sa, nso->so_laddr_sa, nso->so_laddr_len);
1696 	nso->so_state |= SS_LADDR_VALID;
1697 
1698 	/*
1699 	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
1700 	 * which don't support the FireEngine accept fast-path. It is also
1701 	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
1702 	 * again. Neither sockfs nor TCP attempt to find out if some other
1703 	 * random module has been inserted in between (in which case we
1704 	 * should follow TLI accept behaviour). We blindly assume the worst
1705 	 * case and revert back to old behaviour i.e. TCP will not send us
1706 	 * any option (eager) and the accept should happen on the listener
1707 	 * queue. Any queued T_conn_ind have already got their options removed
1708 	 * by so_sock2_stream() when "sockmod" was I_POP'd.
1709 	 */
1710 	/*
1711 	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
1712 	 */
1713 	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
1714 #ifdef	_ILP32
1715 		queue_t	*q;
1716 
1717 		/*
1718 		 * Find read queue in driver
1719 		 * Can safely do this since we "own" nso/nvp.
1720 		 */
1721 		q = strvp2wq(nvp)->q_next;
1722 		while (SAMESTR(q))
1723 			q = q->q_next;
1724 		q = RD(q);
1725 		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
1726 #else
1727 		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
1728 #endif	/* _ILP32 */
1729 		conn_res->PRIM_type = O_T_CONN_RES;
1730 		PRIM_type = O_T_CONN_RES;
1731 	} else {
1732 		conn_res->ACCEPTOR_id = nso->so_acceptor_id;
1733 		conn_res->PRIM_type = T_CONN_RES;
1734 		PRIM_type = T_CONN_RES;
1735 	}
1736 	conn_res->SEQ_number = SEQ_number;
1737 	conn_res->OPT_length = 0;
1738 	conn_res->OPT_offset = 0;
1739 
1740 	mutex_enter(&so->so_lock);
1741 	so_lock_single(so);	/* Set SOLOCKED */
1742 	mutex_exit(&so->so_lock);
1743 
1744 	error = kstrputmsg(SOTOV(so), mp, NULL,
1745 	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1746 	mutex_enter(&so->so_lock);
1747 	if (error) {
1748 		eprintsoline(so, error);
1749 		goto disconnect_vp;
1750 	}
1751 	error = sowaitokack(so, PRIM_type);
1752 	if (error) {
1753 		eprintsoline(so, error);
1754 		goto disconnect_vp;
1755 	}
1756 	so_unlock_single(so, SOLOCKED);
1757 	mutex_exit(&so->so_lock);
1758 
1759 	nso->so_state |= SS_ISCONNECTED;
1760 
1761 	/*
1762 	 * Pass out new socket.
1763 	 */
1764 	if (nsop != NULL)
1765 		*nsop = nso;
1766 
1767 	return (0);
1768 
1769 
1770 eproto_disc_unl:
1771 	error = EPROTO;
1772 e_disc_unl:
1773 	eprintsoline(so, error);
1774 	goto disconnect_unlocked;
1775 
1776 pr_disc_vp_unl:
1777 	eprintsoline(so, error);
1778 disconnect_vp_unlocked:
1779 	(void) VOP_CLOSE(nvp, 0, 1, 0, CRED());
1780 	VN_RELE(nvp);
1781 disconnect_unlocked:
1782 	(void) sodisconnect(so, SEQ_number, 0);
1783 	return (error);
1784 
1785 pr_disc_vp:
1786 	eprintsoline(so, error);
1787 disconnect_vp:
1788 	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
1789 	so_unlock_single(so, SOLOCKED);
1790 	mutex_exit(&so->so_lock);
1791 	(void) VOP_CLOSE(nvp, 0, 1, 0, CRED());
1792 	VN_RELE(nvp);
1793 	return (error);
1794 
1795 conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
1796 	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
1797 	    ? EOPNOTSUPP : EINVAL;
1798 e_bad:
1799 	eprintsoline(so, error);
1800 	return (error);
1801 }
1802 
1803 /*
1804  * connect a socket.
1805  *
1806  * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
1807  * unconnect (by specifying a null address).
1808  */
1809 int
1810 sotpi_connect(struct sonode *so,
1811 	const struct sockaddr *name,
1812 	socklen_t namelen,
1813 	int fflag,
1814 	int flags)
1815 {
1816 	struct T_conn_req	conn_req;
1817 	int			error = 0;
1818 	mblk_t			*mp;
1819 	void			*src;
1820 	socklen_t		srclen;
1821 	void			*addr;
1822 	socklen_t		addrlen;
1823 	boolean_t		need_unlock;
1824 
1825 	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
1826 		so, name, namelen, fflag, flags,
1827 		pr_state(so->so_state, so->so_mode)));
1828 
1829 	/*
1830 	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
1831 	 * avoid sleeping for memory with SOLOCKED held.
1832 	 * We know that the T_CONN_REQ can't be larger than 2 * so_faddr_maxlen
1833 	 * + sizeof (struct T_opthdr).
1834 	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
1835 	 * exceed so_faddr_maxlen).
1836 	 */
1837 	mp = soallocproto(sizeof (struct T_conn_req) +
1838 	    2 * so->so_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR);
1839 	if (mp == NULL) {
1840 		/*
1841 		 * Connect can not fail with ENOBUFS. A signal was
1842 		 * caught so return EINTR.
1843 		 */
1844 		error = EINTR;
1845 		eprintsoline(so, error);
1846 		return (error);
1847 	}
1848 
1849 	mutex_enter(&so->so_lock);
1850 	/*
1851 	 * Make sure that there is a preallocated unbind_req
1852 	 * message before any binding. This message is allocated when
1853 	 * the socket is created but it might have been
1854 	 * consumed.
1855 	 */
1856 	if (so->so_unbind_mp == NULL) {
1857 		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
1858 		/* NOTE: holding so_lock while sleeping */
1859 		so->so_unbind_mp =
1860 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR);
1861 		if (so->so_unbind_mp == NULL) {
1862 			error = EINTR;
1863 			need_unlock = B_FALSE;
1864 			goto done;
1865 		}
1866 	}
1867 
1868 	so_lock_single(so);	/* Set SOLOCKED */
1869 	need_unlock = B_TRUE;
1870 
1871 	/*
1872 	 * Can't have done a listen before connecting.
1873 	 */
1874 	if (so->so_state & SS_ACCEPTCONN) {
1875 		error = EOPNOTSUPP;
1876 		goto done;
1877 	}
1878 
1879 	/*
1880 	 * Must be bound with the transport
1881 	 */
1882 	if (!(so->so_state & SS_ISBOUND)) {
1883 		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1884 		    /*CONSTCOND*/
1885 		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
1886 			/*
1887 			 * Optimization for AF_INET{,6} transports
1888 			 * that can handle a T_CONN_REQ without being bound.
1889 			 */
1890 			so_automatic_bind(so);
1891 		} else {
1892 			error = sotpi_bind(so, NULL, 0,
1893 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD);
1894 			if (error)
1895 				goto done;
1896 		}
1897 		ASSERT(so->so_state & SS_ISBOUND);
1898 		flags |= _SOCONNECT_DID_BIND;
1899 	}
1900 
1901 	/*
1902 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
1903 	 * connect to a null address. This is the portable method to
1904 	 * unconnect a socket.
1905 	 */
1906 	if ((namelen >= sizeof (sa_family_t)) &&
1907 	    (name->sa_family == AF_UNSPEC)) {
1908 		name = NULL;
1909 		namelen = 0;
1910 	}
1911 
1912 	/*
1913 	 * Check that we are not already connected.
1914 	 * A connection-oriented socket cannot be reconnected.
1915 	 * A connected connection-less socket can be
1916 	 * - connected to a different address by a subsequent connect
1917 	 * - "unconnected" by a connect to the NULL address
1918 	 */
1919 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
1920 		ASSERT(!(flags & _SOCONNECT_DID_BIND));
1921 		if (so->so_mode & SM_CONNREQUIRED) {
1922 			/* Connection-oriented socket */
1923 			error = so->so_state & SS_ISCONNECTED ?
1924 			    EISCONN : EALREADY;
1925 			goto done;
1926 		}
1927 		/* Connection-less socket */
1928 		if (name == NULL) {
1929 			/*
1930 			 * Remove the connected state and clear SO_DGRAM_ERRIND
1931 			 * since it was set when the socket was connected.
1932 			 * If this is UDP also send down a T_DISCON_REQ.
1933 			 */
1934 			int val;
1935 
1936 			if ((so->so_family == AF_INET ||
1937 				so->so_family == AF_INET6) &&
1938 			    (so->so_type == SOCK_DGRAM ||
1939 				so->so_type == SOCK_RAW) &&
1940 			    /*CONSTCOND*/
1941 			    !soconnect_tpi_udp) {
1942 				/* XXX What about implicitly unbinding here? */
1943 				error = sodisconnect(so, -1,
1944 						_SODISCONNECT_LOCK_HELD);
1945 			} else {
1946 				so->so_state &=
1947 				    ~(SS_ISCONNECTED | SS_ISCONNECTING |
1948 				    SS_FADDR_VALID);
1949 				so->so_faddr_len = 0;
1950 			}
1951 
1952 			so_unlock_single(so, SOLOCKED);
1953 			mutex_exit(&so->so_lock);
1954 
1955 			val = 0;
1956 			(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
1957 					&val, (t_uscalar_t)sizeof (val));
1958 
1959 			mutex_enter(&so->so_lock);
1960 			so_lock_single(so);	/* Set SOLOCKED */
1961 			goto done;
1962 		}
1963 	}
1964 	ASSERT(so->so_state & SS_ISBOUND);
1965 
1966 	if (name == NULL || namelen == 0) {
1967 		error = EINVAL;
1968 		goto done;
1969 	}
1970 	/*
1971 	 * Mark the socket if so_faddr_sa represents the transport level
1972 	 * address.
1973 	 */
1974 	if (flags & _SOCONNECT_NOXLATE) {
1975 		struct sockaddr_ux	*soaddr_ux;
1976 
1977 		ASSERT(so->so_family == AF_UNIX);
1978 		if (namelen != sizeof (struct sockaddr_ux)) {
1979 			error = EINVAL;
1980 			goto done;
1981 		}
1982 		soaddr_ux = (struct sockaddr_ux *)name;
1983 		name = (struct sockaddr *)&soaddr_ux->sou_addr;
1984 		namelen = sizeof (soaddr_ux->sou_addr);
1985 		so->so_state |= SS_FADDR_NOXLATE;
1986 	}
1987 
1988 	/*
1989 	 * Length and family checks.
1990 	 */
1991 	error = so_addr_verify(so, name, namelen);
1992 	if (error)
1993 		goto bad;
1994 
1995 	/*
1996 	 * Save foreign address. Needed for AF_UNIX as well as
1997 	 * transport providers that do not support TI_GETPEERNAME.
1998 	 * Also used for cached foreign address for TCP and UDP.
1999 	 */
2000 	if (namelen > (t_uscalar_t)so->so_faddr_maxlen) {
2001 		error = EINVAL;
2002 		goto done;
2003 	}
2004 	so->so_faddr_len = (socklen_t)namelen;
2005 	ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
2006 	bcopy(name, so->so_faddr_sa, namelen);
2007 	so->so_state |= SS_FADDR_VALID;
2008 
2009 	if (so->so_family == AF_UNIX) {
2010 		if (so->so_state & SS_FADDR_NOXLATE) {
2011 			/*
2012 			 * Already have a transport internal address. Do not
2013 			 * pass any (transport internal) source address.
2014 			 */
2015 			addr = so->so_faddr_sa;
2016 			addrlen = (t_uscalar_t)so->so_faddr_len;
2017 			src = NULL;
2018 			srclen = 0;
2019 		} else {
2020 			/*
2021 			 * Pass the sockaddr_un source address as an option
2022 			 * and translate the remote address.
2023 			 * Holding so_lock thus so_laddr_sa can not change.
2024 			 */
2025 			src = so->so_laddr_sa;
2026 			srclen = (t_uscalar_t)so->so_laddr_len;
2027 			dprintso(so, 1,
2028 				("sotpi_connect UNIX: srclen %d, src %p\n",
2029 				srclen, src));
2030 			error = so_ux_addr_xlate(so,
2031 				so->so_faddr_sa, (socklen_t)so->so_faddr_len,
2032 				(flags & _SOCONNECT_XPG4_2),
2033 				&addr, &addrlen);
2034 			if (error)
2035 				goto bad;
2036 		}
2037 	} else {
2038 		addr = so->so_faddr_sa;
2039 		addrlen = (t_uscalar_t)so->so_faddr_len;
2040 		src = NULL;
2041 		srclen = 0;
2042 	}
2043 	/*
2044 	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2045 	 * option which asks the transport provider to send T_UDERR_IND
2046 	 * messages. These T_UDERR_IND messages are used to return connected
2047 	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2048 	 *
2049 	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2050 	 * we send down a T_CONN_REQ. This is needed to let the
2051 	 * transport assign a local address that is consistent with
2052 	 * the remote address. Applications depend on a getsockname()
2053 	 * after a connect() to retrieve the "source" IP address for
2054 	 * the connected socket.  Invalidate the cached local address
2055 	 * to force getsockname() to enquire of the transport.
2056 	 */
2057 	if (!(so->so_mode & SM_CONNREQUIRED)) {
2058 		/*
2059 		 * Datagram socket.
2060 		 */
2061 		int32_t val;
2062 
2063 		so_unlock_single(so, SOLOCKED);
2064 		mutex_exit(&so->so_lock);
2065 
2066 		val = 1;
2067 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2068 					&val, (t_uscalar_t)sizeof (val));
2069 
2070 		mutex_enter(&so->so_lock);
2071 		so_lock_single(so);	/* Set SOLOCKED */
2072 		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2073 		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2074 		    soconnect_tpi_udp) {
2075 			soisconnected(so);
2076 			goto done;
2077 		}
2078 		/*
2079 		 * Send down T_CONN_REQ etc.
2080 		 * Clear fflag to avoid returning EWOULDBLOCK.
2081 		 */
2082 		fflag = 0;
2083 		ASSERT(so->so_family != AF_UNIX);
2084 		so->so_state &= ~SS_LADDR_VALID;
2085 	} else if (so->so_laddr_len != 0) {
2086 		/*
2087 		 * If the local address or port was "any" then it may be
2088 		 * changed by the transport as a result of the
2089 		 * connect.  Invalidate the cached version if we have one.
2090 		 */
2091 		switch (so->so_family) {
2092 		case AF_INET:
2093 			ASSERT(so->so_laddr_len == (socklen_t)sizeof (sin_t));
2094 			if (((sin_t *)so->so_laddr_sa)->sin_addr.s_addr ==
2095 			    INADDR_ANY ||
2096 			    ((sin_t *)so->so_laddr_sa)->sin_port == 0)
2097 				so->so_state &= ~SS_LADDR_VALID;
2098 			break;
2099 
2100 		case AF_INET6:
2101 			ASSERT(so->so_laddr_len == (socklen_t)sizeof (sin6_t));
2102 			if (IN6_IS_ADDR_UNSPECIFIED(
2103 			    &((sin6_t *)so->so_laddr_sa) ->sin6_addr) ||
2104 			    IN6_IS_ADDR_V4MAPPED_ANY(
2105 			    &((sin6_t *)so->so_laddr_sa)->sin6_addr) ||
2106 			    ((sin6_t *)so->so_laddr_sa)->sin6_port == 0)
2107 				    so->so_state &= ~SS_LADDR_VALID;
2108 			break;
2109 
2110 		default:
2111 			break;
2112 		}
2113 	}
2114 
2115 	/*
2116 	 * Check for failure of an earlier call
2117 	 */
2118 	if (so->so_error != 0)
2119 		goto so_bad;
2120 
2121 	/*
2122 	 * Send down T_CONN_REQ. Message was allocated above.
2123 	 */
2124 	conn_req.PRIM_type = T_CONN_REQ;
2125 	conn_req.DEST_length = addrlen;
2126 	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2127 	if (srclen == 0) {
2128 		conn_req.OPT_length = 0;
2129 		conn_req.OPT_offset = 0;
2130 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2131 		soappendmsg(mp, addr, addrlen);
2132 	} else {
2133 		/*
2134 		 * There is an AF_UNIX sockaddr_un to include as a source
2135 		 * address option.
2136 		 */
2137 		struct T_opthdr toh;
2138 
2139 		toh.level = SOL_SOCKET;
2140 		toh.name = SO_SRCADDR;
2141 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2142 		toh.status = 0;
2143 		conn_req.OPT_length =
2144 			(t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2145 		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2146 			_TPI_ALIGN_TOPT(addrlen));
2147 
2148 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2149 		soappendmsg(mp, addr, addrlen);
2150 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2151 		soappendmsg(mp, &toh, sizeof (toh));
2152 		soappendmsg(mp, src, srclen);
2153 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2154 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2155 	}
2156 	/*
2157 	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2158 	 * in order to have the right state when the T_CONN_CON shows up.
2159 	 */
2160 	soisconnecting(so);
2161 	mutex_exit(&so->so_lock);
2162 
2163 #ifdef C2_AUDIT
2164 	if (audit_active)
2165 		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2166 #endif /* C2_AUDIT */
2167 
2168 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2169 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2170 	mp = NULL;
2171 	mutex_enter(&so->so_lock);
2172 	if (error != 0)
2173 		goto bad;
2174 
2175 	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2176 		goto bad;
2177 
2178 	/* Allow other threads to access the socket */
2179 	so_unlock_single(so, SOLOCKED);
2180 	need_unlock = B_FALSE;
2181 
2182 	/*
2183 	 * Wait until we get a T_CONN_CON or an error
2184 	 */
2185 	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2186 		so_lock_single(so);	/* Set SOLOCKED */
2187 		need_unlock = B_TRUE;
2188 	}
2189 
2190 done:
2191 	freemsg(mp);
2192 	switch (error) {
2193 	case EINPROGRESS:
2194 	case EALREADY:
2195 	case EISCONN:
2196 	case EINTR:
2197 		/* Non-fatal errors */
2198 		so->so_state &= ~SS_LADDR_VALID;
2199 		/* FALLTHRU */
2200 	case 0:
2201 		break;
2202 
2203 	case EHOSTUNREACH:
2204 		if (flags & _SOCONNECT_XPG4_2) {
2205 			/*
2206 			 * X/Open specification contains a requirement that
2207 			 * ENETUNREACH be returned but does not require
2208 			 * EHOSTUNREACH. In order to keep the test suite
2209 			 * happy we mess with the errno here.
2210 			 */
2211 			error = ENETUNREACH;
2212 		}
2213 		/* FALLTHRU */
2214 
2215 	default:
2216 		ASSERT(need_unlock);
2217 		/*
2218 		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2219 		 * and invalidate local-address cache
2220 		 */
2221 		so->so_state &= ~(SS_ISCONNECTING | SS_LADDR_VALID);
2222 		/* A discon_ind might have already unbound us */
2223 		if ((flags & _SOCONNECT_DID_BIND) &&
2224 		    (so->so_state & SS_ISBOUND)) {
2225 			int err;
2226 
2227 			err = sotpi_unbind(so, 0);
2228 			/* LINTED - statement has no conseq */
2229 			if (err) {
2230 				eprintsoline(so, err);
2231 			}
2232 		}
2233 		break;
2234 	}
2235 	if (need_unlock)
2236 		so_unlock_single(so, SOLOCKED);
2237 	mutex_exit(&so->so_lock);
2238 	return (error);
2239 
2240 so_bad:	error = sogeterr(so);
2241 bad:	eprintsoline(so, error);
2242 	goto done;
2243 }
2244 
2245 int
2246 sotpi_shutdown(struct sonode *so, int how)
2247 {
2248 	struct T_ordrel_req	ordrel_req;
2249 	mblk_t			*mp;
2250 	uint_t			old_state, state_change;
2251 	int			error = 0;
2252 
2253 	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2254 		so, how, pr_state(so->so_state, so->so_mode)));
2255 
2256 	mutex_enter(&so->so_lock);
2257 	so_lock_single(so);	/* Set SOLOCKED */
2258 
2259 	/*
2260 	 * SunOS 4.X has no check for datagram sockets.
2261 	 * 5.X checks that it is connected (ENOTCONN)
2262 	 * X/Open requires that we check the connected state.
2263 	 */
2264 	if (!(so->so_state & SS_ISCONNECTED)) {
2265 		if (!xnet_skip_checks) {
2266 			error = ENOTCONN;
2267 			if (xnet_check_print) {
2268 				printf("sockfs: X/Open shutdown check "
2269 					"caused ENOTCONN\n");
2270 			}
2271 		}
2272 		goto done;
2273 	}
2274 	/*
2275 	 * Record the current state and then perform any state changes.
2276 	 * Then use the difference between the old and new states to
2277 	 * determine which messages need to be sent.
2278 	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2279 	 * duplicate calls to shutdown().
2280 	 */
2281 	old_state = so->so_state;
2282 
2283 	switch (how) {
2284 	case 0:
2285 		socantrcvmore(so);
2286 		break;
2287 	case 1:
2288 		socantsendmore(so);
2289 		break;
2290 	case 2:
2291 		socantsendmore(so);
2292 		socantrcvmore(so);
2293 		break;
2294 	default:
2295 		error = EINVAL;
2296 		goto done;
2297 	}
2298 
2299 	/*
2300 	 * Assumes that the SS_CANT* flags are never cleared in the above code.
2301 	 */
2302 	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2303 		(old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2304 	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2305 
2306 	switch (state_change) {
2307 	case 0:
2308 		dprintso(so, 1,
2309 		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
2310 		    so->so_state));
2311 		goto done;
2312 
2313 	case SS_CANTRCVMORE:
2314 		mutex_exit(&so->so_lock);
2315 		strseteof(SOTOV(so), 1);
2316 		/*
2317 		 * strseteof takes care of read side wakeups,
2318 		 * pollwakeups, and signals.
2319 		 */
2320 		/*
2321 		 * Get the read lock before flushing data to avoid problems
2322 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2323 		 */
2324 		mutex_enter(&so->so_lock);
2325 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2326 		mutex_exit(&so->so_lock);
2327 
2328 		/* Flush read side queue */
2329 		strflushrq(SOTOV(so), FLUSHALL);
2330 
2331 		mutex_enter(&so->so_lock);
2332 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2333 		break;
2334 
2335 	case SS_CANTSENDMORE:
2336 		mutex_exit(&so->so_lock);
2337 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2338 		mutex_enter(&so->so_lock);
2339 		break;
2340 
2341 	case SS_CANTSENDMORE|SS_CANTRCVMORE:
2342 		mutex_exit(&so->so_lock);
2343 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2344 		strseteof(SOTOV(so), 1);
2345 		/*
2346 		 * strseteof takes care of read side wakeups,
2347 		 * pollwakeups, and signals.
2348 		 */
2349 		/*
2350 		 * Get the read lock before flushing data to avoid problems
2351 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2352 		 */
2353 		mutex_enter(&so->so_lock);
2354 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2355 		mutex_exit(&so->so_lock);
2356 
2357 		/* Flush read side queue */
2358 		strflushrq(SOTOV(so), FLUSHALL);
2359 
2360 		mutex_enter(&so->so_lock);
2361 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2362 		break;
2363 	}
2364 
2365 	ASSERT(MUTEX_HELD(&so->so_lock));
2366 
2367 	/*
2368 	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2369 	 * was set due to this call and the new state has both of them set:
2370 	 *	Send the AF_UNIX close indication
2371 	 *	For T_COTS send a discon_ind
2372 	 *
2373 	 * If cantsend was set due to this call:
2374 	 *	For T_COTSORD send an ordrel_ind
2375 	 *
2376 	 * Note that for T_CLTS there is no message sent here.
2377 	 */
2378 	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2379 	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2380 		/*
2381 		 * For SunOS 4.X compatibility we tell the other end
2382 		 * that we are unable to receive at this point.
2383 		 */
2384 		if (so->so_family == AF_UNIX && so->so_serv_type != T_CLTS)
2385 			so_unix_close(so);
2386 
2387 		if (so->so_serv_type == T_COTS)
2388 			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2389 	}
2390 	if ((state_change & SS_CANTSENDMORE) &&
2391 	    (so->so_serv_type == T_COTS_ORD)) {
2392 		/* Send an orderly release */
2393 		ordrel_req.PRIM_type = T_ORDREL_REQ;
2394 
2395 		mutex_exit(&so->so_lock);
2396 		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2397 		    0, _ALLOC_SLEEP);
2398 		/*
2399 		 * Send down the T_ORDREL_REQ even if there is flow control.
2400 		 * This prevents shutdown from blocking.
2401 		 * Note that there is no T_OK_ACK for ordrel_req.
2402 		 */
2403 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2404 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2405 		mutex_enter(&so->so_lock);
2406 		if (error) {
2407 			eprintsoline(so, error);
2408 			goto done;
2409 		}
2410 	}
2411 
2412 done:
2413 	so_unlock_single(so, SOLOCKED);
2414 	mutex_exit(&so->so_lock);
2415 	return (error);
2416 }
2417 
2418 /*
2419  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2420  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2421  * that we have closed.
2422  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2423  * T_UNITDATA_REQ containing the same option.
2424  *
2425  * For SOCK_DGRAM half-connections (somebody connected to this end
2426  * but this end is not connect) we don't know where to send any
2427  * SO_UNIX_CLOSE.
2428  *
2429  * We have to ignore stream head errors just in case there has been
2430  * a shutdown(output).
2431  * Ignore any flow control to try to get the message more quickly to the peer.
2432  * While locally ignoring flow control solves the problem when there
2433  * is only the loopback transport on the stream it would not provide
2434  * the correct AF_UNIX socket semantics when one or more modules have
2435  * been pushed.
2436  */
2437 void
2438 so_unix_close(struct sonode *so)
2439 {
2440 	int		error;
2441 	struct T_opthdr	toh;
2442 	mblk_t		*mp;
2443 
2444 	ASSERT(MUTEX_HELD(&so->so_lock));
2445 
2446 	ASSERT(so->so_family == AF_UNIX);
2447 
2448 	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2449 	    (SS_ISCONNECTED|SS_ISBOUND))
2450 		return;
2451 
2452 	dprintso(so, 1, ("so_unix_close(%p) %s\n",
2453 		so, pr_state(so->so_state, so->so_mode)));
2454 
2455 	toh.level = SOL_SOCKET;
2456 	toh.name = SO_UNIX_CLOSE;
2457 
2458 	/* zero length + header */
2459 	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2460 	toh.status = 0;
2461 
2462 	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2463 		struct T_optdata_req tdr;
2464 
2465 		tdr.PRIM_type = T_OPTDATA_REQ;
2466 		tdr.DATA_flag = 0;
2467 
2468 		tdr.OPT_length = (t_scalar_t)sizeof (toh);
2469 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2470 
2471 		/* NOTE: holding so_lock while sleeping */
2472 		mp = soallocproto2(&tdr, sizeof (tdr),
2473 		    &toh, sizeof (toh), 0, _ALLOC_SLEEP);
2474 	} else {
2475 		struct T_unitdata_req	tudr;
2476 		void			*addr;
2477 		socklen_t		addrlen;
2478 		void			*src;
2479 		socklen_t		srclen;
2480 		struct T_opthdr		toh2;
2481 		t_scalar_t		size;
2482 
2483 		/* Connecteded DGRAM socket */
2484 
2485 		/*
2486 		 * For AF_UNIX the destination address is translated to
2487 		 * an internal name and the source address is passed as
2488 		 * an option.
2489 		 */
2490 		/*
2491 		 * Length and family checks.
2492 		 */
2493 		error = so_addr_verify(so, so->so_faddr_sa,
2494 					(t_uscalar_t)so->so_faddr_len);
2495 		if (error) {
2496 			eprintsoline(so, error);
2497 			return;
2498 		}
2499 		if (so->so_state & SS_FADDR_NOXLATE) {
2500 			/*
2501 			 * Already have a transport internal address. Do not
2502 			 * pass any (transport internal) source address.
2503 			 */
2504 			addr = so->so_faddr_sa;
2505 			addrlen = (t_uscalar_t)so->so_faddr_len;
2506 			src = NULL;
2507 			srclen = 0;
2508 		} else {
2509 			/*
2510 			 * Pass the sockaddr_un source address as an option
2511 			 * and translate the remote address.
2512 			 * Holding so_lock thus so_laddr_sa can not change.
2513 			 */
2514 			src = so->so_laddr_sa;
2515 			srclen = (socklen_t)so->so_laddr_len;
2516 			dprintso(so, 1,
2517 				("so_ux_close: srclen %d, src %p\n",
2518 				srclen, src));
2519 			error = so_ux_addr_xlate(so,
2520 				so->so_faddr_sa,
2521 				(socklen_t)so->so_faddr_len, 0,
2522 				&addr, &addrlen);
2523 			if (error) {
2524 				eprintsoline(so, error);
2525 				return;
2526 			}
2527 		}
2528 		tudr.PRIM_type = T_UNITDATA_REQ;
2529 		tudr.DEST_length = addrlen;
2530 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2531 		if (srclen == 0) {
2532 			tudr.OPT_length = (t_scalar_t)sizeof (toh);
2533 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2534 				_TPI_ALIGN_TOPT(addrlen));
2535 
2536 			size = tudr.OPT_offset + tudr.OPT_length;
2537 			/* NOTE: holding so_lock while sleeping */
2538 			mp = soallocproto2(&tudr, sizeof (tudr),
2539 			    addr, addrlen, size, _ALLOC_SLEEP);
2540 			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2541 			soappendmsg(mp, &toh, sizeof (toh));
2542 		} else {
2543 			/*
2544 			 * There is a AF_UNIX sockaddr_un to include as a
2545 			 * source address option.
2546 			 */
2547 			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2548 			    _TPI_ALIGN_TOPT(srclen));
2549 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2550 			    _TPI_ALIGN_TOPT(addrlen));
2551 
2552 			toh2.level = SOL_SOCKET;
2553 			toh2.name = SO_SRCADDR;
2554 			toh2.len = (t_uscalar_t)(srclen +
2555 					sizeof (struct T_opthdr));
2556 			toh2.status = 0;
2557 
2558 			size = tudr.OPT_offset + tudr.OPT_length;
2559 
2560 			/* NOTE: holding so_lock while sleeping */
2561 			mp = soallocproto2(&tudr, sizeof (tudr),
2562 			    addr, addrlen, size, _ALLOC_SLEEP);
2563 			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2564 			soappendmsg(mp, &toh, sizeof (toh));
2565 			soappendmsg(mp, &toh2, sizeof (toh2));
2566 			soappendmsg(mp, src, srclen);
2567 			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2568 		}
2569 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2570 	}
2571 	mutex_exit(&so->so_lock);
2572 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2573 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2574 	mutex_enter(&so->so_lock);
2575 }
2576 
2577 /*
2578  * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
2579  */
2580 int
2581 sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags)
2582 {
2583 	mblk_t		*mp, *nmp;
2584 	int		error;
2585 
2586 	dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", so, msg, flags));
2587 
2588 	/*
2589 	 * There is never any oob data with addresses or control since
2590 	 * the T_EXDATA_IND does not carry any options.
2591 	 */
2592 	msg->msg_controllen = 0;
2593 	msg->msg_namelen = 0;
2594 
2595 	mutex_enter(&so->so_lock);
2596 	ASSERT(so_verify_oobstate(so));
2597 	if ((so->so_options & SO_OOBINLINE) ||
2598 	    (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
2599 		dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
2600 		mutex_exit(&so->so_lock);
2601 		return (EINVAL);
2602 	}
2603 	if (!(so->so_state & SS_HAVEOOBDATA)) {
2604 		dprintso(so, 1, ("sorecvoob: no data yet\n"));
2605 		mutex_exit(&so->so_lock);
2606 		return (EWOULDBLOCK);
2607 	}
2608 	ASSERT(so->so_oobmsg != NULL);
2609 	mp = so->so_oobmsg;
2610 	if (flags & MSG_PEEK) {
2611 		/*
2612 		 * Since recv* can not return ENOBUFS we can not use dupmsg.
2613 		 * Instead we revert to the consolidation private
2614 		 * allocb_wait plus bcopy.
2615 		 */
2616 		mblk_t *mp1;
2617 
2618 		mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
2619 		ASSERT(mp1);
2620 
2621 		while (mp != NULL) {
2622 			ssize_t size;
2623 
2624 			size = MBLKL(mp);
2625 			bcopy(mp->b_rptr, mp1->b_wptr, size);
2626 			mp1->b_wptr += size;
2627 			ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
2628 			mp = mp->b_cont;
2629 		}
2630 		mp = mp1;
2631 	} else {
2632 		/*
2633 		 * Update the state indicating that the data has been consumed.
2634 		 * Keep SS_OOBPEND set until data is consumed past the mark.
2635 		 */
2636 		so->so_oobmsg = NULL;
2637 		so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
2638 	}
2639 	dprintso(so, 1,
2640 		("after recvoob(%p): counts %d/%d state %s\n",
2641 		so, so->so_oobsigcnt,
2642 		so->so_oobcnt, pr_state(so->so_state, so->so_mode)));
2643 	ASSERT(so_verify_oobstate(so));
2644 	mutex_exit(&so->so_lock);
2645 
2646 	error = 0;
2647 	nmp = mp;
2648 	while (nmp != NULL && uiop->uio_resid > 0) {
2649 		ssize_t n = MBLKL(nmp);
2650 
2651 		n = MIN(n, uiop->uio_resid);
2652 		if (n > 0)
2653 			error = uiomove(nmp->b_rptr, n,
2654 					UIO_READ, uiop);
2655 		if (error)
2656 			break;
2657 		nmp = nmp->b_cont;
2658 	}
2659 	freemsg(mp);
2660 	return (error);
2661 }
2662 
2663 /*
2664  * Called by sotpi_recvmsg when reading a non-zero amount of data.
2665  * In addition, the caller typically verifies that there is some
2666  * potential state to clear by checking
2667  *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2668  * before calling this routine.
2669  * Note that such a check can be made without holding so_lock since
2670  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2671  * decrements so_oobsigcnt.
2672  *
2673  * When data is read *after* the point that all pending
2674  * oob data has been consumed the oob indication is cleared.
2675  *
2676  * This logic keeps select/poll returning POLLRDBAND and
2677  * SIOCATMARK returning true until we have read past
2678  * the mark.
2679  */
2680 static void
2681 sorecv_update_oobstate(struct sonode *so)
2682 {
2683 	mutex_enter(&so->so_lock);
2684 	ASSERT(so_verify_oobstate(so));
2685 	dprintso(so, 1,
2686 		("sorecv_update_oobstate: counts %d/%d state %s\n",
2687 		so->so_oobsigcnt,
2688 		so->so_oobcnt, pr_state(so->so_state, so->so_mode)));
2689 	if (so->so_oobsigcnt == 0) {
2690 		/* No more pending oob indications */
2691 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2692 		freemsg(so->so_oobmsg);
2693 		so->so_oobmsg = NULL;
2694 	}
2695 	ASSERT(so_verify_oobstate(so));
2696 	mutex_exit(&so->so_lock);
2697 }
2698 
2699 /*
2700  * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
2701  */
2702 static int
2703 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
2704 {
2705 	int	error = 0;
2706 	mblk_t *tmp = NULL;
2707 	mblk_t *pmp = NULL;
2708 	mblk_t *nmp = so->so_nl7c_rcv_mp;
2709 
2710 	ASSERT(nmp != NULL);
2711 
2712 	while (nmp != NULL && uiop->uio_resid > 0) {
2713 		ssize_t n;
2714 
2715 		if (DB_TYPE(nmp) == M_DATA) {
2716 			/*
2717 			 * We have some data, uiomove up to resid bytes.
2718 			 */
2719 			n = MIN(MBLKL(nmp), uiop->uio_resid);
2720 			if (n > 0)
2721 				error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
2722 			if (error)
2723 				break;
2724 			nmp->b_rptr += n;
2725 			if (nmp->b_rptr == nmp->b_wptr) {
2726 				pmp = nmp;
2727 				nmp = nmp->b_cont;
2728 			}
2729 		} else {
2730 			/*
2731 			 * We only handle data, save for caller to handle.
2732 			 */
2733 			if (pmp != NULL) {
2734 				pmp->b_cont = nmp->b_cont;
2735 			}
2736 			nmp->b_cont = NULL;
2737 			if (*rmp == NULL) {
2738 				*rmp = nmp;
2739 			} else {
2740 				tmp->b_next = nmp;
2741 			}
2742 			nmp = nmp->b_cont;
2743 			tmp = nmp;
2744 		}
2745 	}
2746 	if (pmp != NULL) {
2747 		/* Free any mblk_t(s) which we have consumed */
2748 		pmp->b_cont = NULL;
2749 		freemsg(so->so_nl7c_rcv_mp);
2750 	}
2751 	if ((so->so_nl7c_rcv_mp = nmp) == NULL) {
2752 		/* Last mblk_t so return the saved rval from kstrgetmsg() */
2753 		rp->r_vals = so->so_nl7c_rcv_rval;
2754 		so->so_nl7c_rcv_rval = 0;
2755 	} else {
2756 		/* More mblk_t(s) to process so no rval to return */
2757 		rp->r_vals = 0;
2758 	}
2759 	return (error);
2760 }
2761 
2762 /*
2763  * Receive the next message on the queue.
2764  * If msg_controllen is non-zero when called the caller is interested in
2765  * any received control info (options).
2766  * If msg_namelen is non-zero when called the caller is interested in
2767  * any received source address.
2768  * The routine returns with msg_control and msg_name pointing to
2769  * kmem_alloc'ed memory which the caller has to free.
2770  */
2771 int
2772 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
2773 {
2774 	union T_primitives	*tpr;
2775 	mblk_t			*mp;
2776 	uchar_t			pri;
2777 	int			pflag, opflag;
2778 	void			*control;
2779 	t_uscalar_t		controllen;
2780 	t_uscalar_t		namelen;
2781 	int			so_state = so->so_state; /* Snapshot */
2782 	ssize_t			saved_resid;
2783 	int			error;
2784 	rval_t			rval;
2785 	int			flags;
2786 	clock_t			timout;
2787 	int			first;
2788 
2789 	flags = msg->msg_flags;
2790 	msg->msg_flags = 0;
2791 
2792 	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
2793 		so, msg, flags,
2794 		pr_state(so->so_state, so->so_mode), so->so_error));
2795 
2796 	/*
2797 	 * If we are not connected because we have never been connected
2798 	 * we return ENOTCONN. If we have been connected (but are no longer
2799 	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
2800 	 * the EOF.
2801 	 *
2802 	 * An alternative would be to post an ENOTCONN error in stream head
2803 	 * (read+write) and clear it when we're connected. However, that error
2804 	 * would cause incorrect poll/select behavior!
2805 	 */
2806 	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
2807 	    (so->so_mode & SM_CONNREQUIRED)) {
2808 		return (ENOTCONN);
2809 	}
2810 
2811 	/*
2812 	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
2813 	 * after checking that the read queue is empty) and returns zero.
2814 	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
2815 	 * is zero.
2816 	 */
2817 
2818 	if (flags & MSG_OOB) {
2819 		/* Check that the transport supports OOB */
2820 		if (!(so->so_mode & SM_EXDATA))
2821 			return (EOPNOTSUPP);
2822 		return (sorecvoob(so, msg, uiop, flags));
2823 	}
2824 
2825 	/*
2826 	 * Set msg_controllen and msg_namelen to zero here to make it
2827 	 * simpler in the cases that no control or name is returned.
2828 	 */
2829 	controllen = msg->msg_controllen;
2830 	namelen = msg->msg_namelen;
2831 	msg->msg_controllen = 0;
2832 	msg->msg_namelen = 0;
2833 
2834 	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
2835 		namelen, controllen));
2836 
2837 	/*
2838 	 * If an NL7C enabled socket and not waiting for write data.
2839 	 */
2840 	mutex_enter(&so->so_lock);
2841 	if ((so->so_nl7c_flags & (NL7C_ENABLED|NL7C_WAITWRITE)) ==
2842 	    NL7C_ENABLED) {
2843 		if (so->so_nl7c_uri) {
2844 			/*
2845 			 * Close uri processing for a previous request.
2846 			 */
2847 			nl7c_close(so);
2848 		}
2849 		if (nl7c_process(so,
2850 		    (so->so_state & (SS_NONBLOCK|SS_NDELAY)),
2851 		    (int)((tcp_t *)so->so_priv)->tcp_mss)) {
2852 			/*
2853 			 * NL7C has completed processing on the socket,
2854 			 * clear the enabled bit as no further NL7C
2855 			 * processing will be needed.
2856 			 */
2857 			so->so_nl7c_flags = 0;
2858 		}
2859 	}
2860 
2861 	/*
2862 	 * Only one reader is allowed at any given time. This is needed
2863 	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
2864 	 *
2865 	 * This is slightly different than BSD behavior in that it fails with
2866 	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
2867 	 * is single-threaded using sblock(), which is dropped while waiting
2868 	 * for data to appear. The difference shows up e.g. if one
2869 	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
2870 	 * does use nonblocking io and different threads are reading each
2871 	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
2872 	 * in this case as long as the read queue doesn't get empty.
2873 	 * In this implementation the thread using nonblocking io can
2874 	 * get an EWOULDBLOCK error due to the blocking thread executing
2875 	 * e.g. in the uiomove in kstrgetmsg.
2876 	 * This difference is not believed to be significant.
2877 	 */
2878 	error = so_lock_read_intr(so, uiop->uio_fmode);	/* Set SOREADLOCKED */
2879 	mutex_exit(&so->so_lock);
2880 	if (error)
2881 		return (error);
2882 
2883 	/*
2884 	 * Tell kstrgetmsg to not inspect the stream head errors until all
2885 	 * queued data has been consumed.
2886 	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
2887 	 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
2888 	 *
2889 	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
2890 	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
2891 	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
2892 	 */
2893 	pflag = MSG_ANY | MSG_DELAYERROR;
2894 	if (flags & MSG_PEEK) {
2895 		pflag |= MSG_IPEEK;
2896 		flags &= ~MSG_WAITALL;
2897 	}
2898 	if (so->so_mode & SM_ATOMIC)
2899 		pflag |= MSG_DISCARDTAIL;
2900 
2901 	if (flags & MSG_DONTWAIT)
2902 		timout = 0;
2903 	else
2904 		timout = -1;
2905 	opflag = pflag;
2906 	first = 1;
2907 
2908 	/*
2909 	 * If so saved NL7C rcv mblk_t(s) uiomove them first
2910 	 * else get'm from the streamhead.
2911 	 */
2912 retry:
2913 	saved_resid = uiop->uio_resid;
2914 	pri = 0;
2915 	mp = NULL;
2916 	if (so->so_nl7c_rcv_mp != NULL) {
2917 		error = nl7c_sorecv(so, &mp, uiop, &rval);
2918 	} else {
2919 		error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
2920 		    timout, &rval);
2921 	}
2922 	if (error) {
2923 		switch (error) {
2924 		case EINTR:
2925 		case EWOULDBLOCK:
2926 			if (!first)
2927 				error = 0;
2928 			break;
2929 		case ETIME:
2930 			/* Returned from kstrgetmsg when timeout expires */
2931 			if (!first)
2932 				error = 0;
2933 			else
2934 				error = EWOULDBLOCK;
2935 			break;
2936 		default:
2937 			eprintsoline(so, error);
2938 			break;
2939 		}
2940 		mutex_enter(&so->so_lock);
2941 		so_unlock_read(so);	/* Clear SOREADLOCKED */
2942 		mutex_exit(&so->so_lock);
2943 		return (error);
2944 	}
2945 	/*
2946 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
2947 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
2948 	 */
2949 	ASSERT(!(rval.r_val1 & MORECTL));
2950 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
2951 		msg->msg_flags |= MSG_TRUNC;
2952 
2953 	if (mp == NULL) {
2954 		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
2955 		/*
2956 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
2957 		 * The draft Posix socket spec states that the mark should
2958 		 * not be cleared when peeking. We follow the latter.
2959 		 */
2960 		if ((so->so_state &
2961 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
2962 		    (uiop->uio_resid != saved_resid) &&
2963 		    !(flags & MSG_PEEK)) {
2964 			sorecv_update_oobstate(so);
2965 		}
2966 
2967 		mutex_enter(&so->so_lock);
2968 		/* Set MSG_EOR based on MOREDATA */
2969 		if (!(rval.r_val1 & MOREDATA)) {
2970 			if (so->so_state & SS_SAVEDEOR) {
2971 				msg->msg_flags |= MSG_EOR;
2972 				so->so_state &= ~SS_SAVEDEOR;
2973 			}
2974 		}
2975 		/*
2976 		 * If some data was received (i.e. not EOF) and the
2977 		 * read/recv* has not been satisfied wait for some more.
2978 		 */
2979 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
2980 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
2981 			mutex_exit(&so->so_lock);
2982 			first = 0;
2983 			pflag = opflag | MSG_NOMARK;
2984 			goto retry;
2985 		}
2986 		so_unlock_read(so);	/* Clear SOREADLOCKED */
2987 		mutex_exit(&so->so_lock);
2988 		return (0);
2989 	}
2990 
2991 	/* strsock_proto has already verified length and alignment */
2992 	tpr = (union T_primitives *)mp->b_rptr;
2993 	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
2994 
2995 	switch (tpr->type) {
2996 	case T_DATA_IND: {
2997 		if ((so->so_state &
2998 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
2999 		    (uiop->uio_resid != saved_resid) &&
3000 		    !(flags & MSG_PEEK)) {
3001 			sorecv_update_oobstate(so);
3002 		}
3003 
3004 		/*
3005 		 * Set msg_flags to MSG_EOR based on
3006 		 * MORE_flag and MOREDATA.
3007 		 */
3008 		mutex_enter(&so->so_lock);
3009 		so->so_state &= ~SS_SAVEDEOR;
3010 		if (!(tpr->data_ind.MORE_flag & 1)) {
3011 			if (!(rval.r_val1 & MOREDATA))
3012 				msg->msg_flags |= MSG_EOR;
3013 			else
3014 				so->so_state |= SS_SAVEDEOR;
3015 		}
3016 		freemsg(mp);
3017 		/*
3018 		 * If some data was received (i.e. not EOF) and the
3019 		 * read/recv* has not been satisfied wait for some more.
3020 		 */
3021 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3022 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3023 			mutex_exit(&so->so_lock);
3024 			first = 0;
3025 			pflag = opflag | MSG_NOMARK;
3026 			goto retry;
3027 		}
3028 		so_unlock_read(so);	/* Clear SOREADLOCKED */
3029 		mutex_exit(&so->so_lock);
3030 		return (0);
3031 	}
3032 	case T_UNITDATA_IND: {
3033 		void *addr;
3034 		t_uscalar_t addrlen;
3035 		void *abuf;
3036 		t_uscalar_t optlen;
3037 		void *opt;
3038 
3039 		if ((so->so_state &
3040 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3041 		    (uiop->uio_resid != saved_resid) &&
3042 		    !(flags & MSG_PEEK)) {
3043 			sorecv_update_oobstate(so);
3044 		}
3045 
3046 		if (namelen != 0) {
3047 			/* Caller wants source address */
3048 			addrlen = tpr->unitdata_ind.SRC_length;
3049 			addr = sogetoff(mp,
3050 				tpr->unitdata_ind.SRC_offset,
3051 				addrlen, 1);
3052 			if (addr == NULL) {
3053 				freemsg(mp);
3054 				error = EPROTO;
3055 				eprintsoline(so, error);
3056 				goto err;
3057 			}
3058 			if (so->so_family == AF_UNIX) {
3059 				/*
3060 				 * Can not use the transport level address.
3061 				 * If there is a SO_SRCADDR option carrying
3062 				 * the socket level address it will be
3063 				 * extracted below.
3064 				 */
3065 				addr = NULL;
3066 				addrlen = 0;
3067 			}
3068 		}
3069 		optlen = tpr->unitdata_ind.OPT_length;
3070 		if (optlen != 0) {
3071 			t_uscalar_t ncontrollen;
3072 
3073 			/*
3074 			 * Extract any source address option.
3075 			 * Determine how large cmsg buffer is needed.
3076 			 */
3077 			opt = sogetoff(mp,
3078 				tpr->unitdata_ind.OPT_offset,
3079 				optlen, __TPI_ALIGN_SIZE);
3080 
3081 			if (opt == NULL) {
3082 				freemsg(mp);
3083 				error = EPROTO;
3084 				eprintsoline(so, error);
3085 				goto err;
3086 			}
3087 			if (so->so_family == AF_UNIX)
3088 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3089 			ncontrollen = so_cmsglen(mp, opt, optlen,
3090 						!(flags & MSG_XPG4_2));
3091 			if (controllen != 0)
3092 				controllen = ncontrollen;
3093 			else if (ncontrollen != 0)
3094 				msg->msg_flags |= MSG_CTRUNC;
3095 		} else {
3096 			controllen = 0;
3097 		}
3098 
3099 		if (namelen != 0) {
3100 			/*
3101 			 * Return address to caller.
3102 			 * Caller handles truncation if length
3103 			 * exceeds msg_namelen.
3104 			 * NOTE: AF_UNIX NUL termination is ensured by
3105 			 * the sender's copyin_name().
3106 			 */
3107 			abuf = kmem_alloc(addrlen, KM_SLEEP);
3108 
3109 			bcopy(addr, abuf, addrlen);
3110 			msg->msg_name = abuf;
3111 			msg->msg_namelen = addrlen;
3112 		}
3113 
3114 		if (controllen != 0) {
3115 			/*
3116 			 * Return control msg to caller.
3117 			 * Caller handles truncation if length
3118 			 * exceeds msg_controllen.
3119 			 */
3120 			control = kmem_alloc(controllen, KM_SLEEP);
3121 
3122 			error = so_opt2cmsg(mp, opt, optlen,
3123 					!(flags & MSG_XPG4_2),
3124 					control, controllen);
3125 			if (error) {
3126 				freemsg(mp);
3127 				if (msg->msg_namelen != 0)
3128 					kmem_free(msg->msg_name,
3129 						msg->msg_namelen);
3130 				kmem_free(control, controllen);
3131 				eprintsoline(so, error);
3132 				goto err;
3133 			}
3134 			msg->msg_control = control;
3135 			msg->msg_controllen = controllen;
3136 		}
3137 
3138 		freemsg(mp);
3139 		mutex_enter(&so->so_lock);
3140 		so_unlock_read(so);	/* Clear SOREADLOCKED */
3141 		mutex_exit(&so->so_lock);
3142 		return (0);
3143 	}
3144 	case T_OPTDATA_IND: {
3145 		struct T_optdata_req *tdr;
3146 		void *opt;
3147 		t_uscalar_t optlen;
3148 
3149 		if ((so->so_state &
3150 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3151 		    (uiop->uio_resid != saved_resid) &&
3152 		    !(flags & MSG_PEEK)) {
3153 			sorecv_update_oobstate(so);
3154 		}
3155 
3156 		tdr = (struct T_optdata_req *)mp->b_rptr;
3157 		optlen = tdr->OPT_length;
3158 		if (optlen != 0) {
3159 			t_uscalar_t ncontrollen;
3160 			/*
3161 			 * Determine how large cmsg buffer is needed.
3162 			 */
3163 			opt = sogetoff(mp,
3164 					tpr->optdata_ind.OPT_offset,
3165 					optlen, __TPI_ALIGN_SIZE);
3166 
3167 			if (opt == NULL) {
3168 				freemsg(mp);
3169 				error = EPROTO;
3170 				eprintsoline(so, error);
3171 				goto err;
3172 			}
3173 
3174 			ncontrollen = so_cmsglen(mp, opt, optlen,
3175 						!(flags & MSG_XPG4_2));
3176 			if (controllen != 0)
3177 				controllen = ncontrollen;
3178 			else if (ncontrollen != 0)
3179 				msg->msg_flags |= MSG_CTRUNC;
3180 		} else {
3181 			controllen = 0;
3182 		}
3183 
3184 		if (controllen != 0) {
3185 			/*
3186 			 * Return control msg to caller.
3187 			 * Caller handles truncation if length
3188 			 * exceeds msg_controllen.
3189 			 */
3190 			control = kmem_alloc(controllen, KM_SLEEP);
3191 
3192 			error = so_opt2cmsg(mp, opt, optlen,
3193 					!(flags & MSG_XPG4_2),
3194 					control, controllen);
3195 			if (error) {
3196 				freemsg(mp);
3197 				kmem_free(control, controllen);
3198 				eprintsoline(so, error);
3199 				goto err;
3200 			}
3201 			msg->msg_control = control;
3202 			msg->msg_controllen = controllen;
3203 		}
3204 
3205 		/*
3206 		 * Set msg_flags to MSG_EOR based on
3207 		 * DATA_flag and MOREDATA.
3208 		 */
3209 		mutex_enter(&so->so_lock);
3210 		so->so_state &= ~SS_SAVEDEOR;
3211 		if (!(tpr->data_ind.MORE_flag & 1)) {
3212 			if (!(rval.r_val1 & MOREDATA))
3213 				msg->msg_flags |= MSG_EOR;
3214 			else
3215 				so->so_state |= SS_SAVEDEOR;
3216 		}
3217 		freemsg(mp);
3218 		/*
3219 		 * If some data was received (i.e. not EOF) and the
3220 		 * read/recv* has not been satisfied wait for some more.
3221 		 * Not possible to wait if control info was received.
3222 		 */
3223 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3224 		    controllen == 0 &&
3225 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3226 			mutex_exit(&so->so_lock);
3227 			first = 0;
3228 			pflag = opflag | MSG_NOMARK;
3229 			goto retry;
3230 		}
3231 		so_unlock_read(so);	/* Clear SOREADLOCKED */
3232 		mutex_exit(&so->so_lock);
3233 		return (0);
3234 	}
3235 	case T_EXDATA_IND: {
3236 		dprintso(so, 1,
3237 			("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3238 			"state %s\n",
3239 			so->so_oobsigcnt, so->so_oobcnt,
3240 			saved_resid - uiop->uio_resid,
3241 			pr_state(so->so_state, so->so_mode)));
3242 		/*
3243 		 * kstrgetmsg handles MSGMARK so there is nothing to
3244 		 * inspect in the T_EXDATA_IND.
3245 		 * strsock_proto makes the stream head queue the T_EXDATA_IND
3246 		 * as a separate message with no M_DATA component. Furthermore,
3247 		 * the stream head does not consolidate M_DATA messages onto
3248 		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3249 		 * remains a message by itself. This is needed since MSGMARK
3250 		 * marks both the whole message as well as the last byte
3251 		 * of the message.
3252 		 */
3253 		freemsg(mp);
3254 		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
3255 		if (flags & MSG_PEEK) {
3256 			/*
3257 			 * Even though we are peeking we consume the
3258 			 * T_EXDATA_IND thereby moving the mark information
3259 			 * to SS_RCVATMARK. Then the oob code below will
3260 			 * retry the peeking kstrgetmsg.
3261 			 * Note that the stream head read queue is
3262 			 * never flushed without holding SOREADLOCKED
3263 			 * thus the T_EXDATA_IND can not disappear
3264 			 * underneath us.
3265 			 */
3266 			dprintso(so, 1,
3267 				("sotpi_recvmsg: consume EXDATA_IND "
3268 				"counts %d/%d state %s\n",
3269 				so->so_oobsigcnt,
3270 				so->so_oobcnt,
3271 				pr_state(so->so_state, so->so_mode)));
3272 
3273 			pflag = MSG_ANY | MSG_DELAYERROR;
3274 			if (so->so_mode & SM_ATOMIC)
3275 				pflag |= MSG_DISCARDTAIL;
3276 
3277 			pri = 0;
3278 			mp = NULL;
3279 
3280 			error = kstrgetmsg(SOTOV(so), &mp, uiop,
3281 				&pri, &pflag, (clock_t)-1, &rval);
3282 			ASSERT(uiop->uio_resid == saved_resid);
3283 
3284 			if (error) {
3285 #ifdef SOCK_DEBUG
3286 				if (error != EWOULDBLOCK && error != EINTR) {
3287 					eprintsoline(so, error);
3288 				}
3289 #endif /* SOCK_DEBUG */
3290 				mutex_enter(&so->so_lock);
3291 				so_unlock_read(so);	/* Clear SOREADLOCKED */
3292 				mutex_exit(&so->so_lock);
3293 				return (error);
3294 			}
3295 			ASSERT(mp);
3296 			tpr = (union T_primitives *)mp->b_rptr;
3297 			ASSERT(tpr->type == T_EXDATA_IND);
3298 			freemsg(mp);
3299 		} /* end "if (flags & MSG_PEEK)" */
3300 
3301 		/*
3302 		 * Decrement the number of queued and pending oob.
3303 		 *
3304 		 * SS_RCVATMARK is cleared when we read past a mark.
3305 		 * SS_HAVEOOBDATA is cleared when we've read past the
3306 		 * last mark.
3307 		 * SS_OOBPEND is cleared if we've read past the last
3308 		 * mark and no (new) SIGURG has been posted.
3309 		 */
3310 		mutex_enter(&so->so_lock);
3311 		ASSERT(so_verify_oobstate(so));
3312 		ASSERT(so->so_oobsigcnt >= so->so_oobcnt);
3313 		ASSERT(so->so_oobsigcnt > 0);
3314 		so->so_oobsigcnt--;
3315 		ASSERT(so->so_oobcnt > 0);
3316 		so->so_oobcnt--;
3317 		/*
3318 		 * Since the T_EXDATA_IND has been removed from the stream
3319 		 * head, but we have not read data past the mark,
3320 		 * sockfs needs to track that the socket is still at the mark.
3321 		 *
3322 		 * Since no data was received call kstrgetmsg again to wait
3323 		 * for data.
3324 		 */
3325 		so->so_state |= SS_RCVATMARK;
3326 		mutex_exit(&so->so_lock);
3327 		dprintso(so, 1,
3328 		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3329 		    so->so_oobsigcnt, so->so_oobcnt,
3330 		    pr_state(so->so_state, so->so_mode)));
3331 		pflag = opflag;
3332 		goto retry;
3333 	}
3334 	default:
3335 		ASSERT(0);
3336 		freemsg(mp);
3337 		error = EPROTO;
3338 		eprintsoline(so, error);
3339 		goto err;
3340 	}
3341 	/* NOTREACHED */
3342 err:
3343 	mutex_enter(&so->so_lock);
3344 	so_unlock_read(so);	/* Clear SOREADLOCKED */
3345 	mutex_exit(&so->so_lock);
3346 	return (error);
3347 }
3348 
3349 /*
3350  * Sending data with options on a datagram socket.
3351  * Assumes caller has verified that SS_ISBOUND etc. are set.
3352  */
3353 static int
3354 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3355     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3356 {
3357 	struct T_unitdata_req	tudr;
3358 	mblk_t			*mp;
3359 	int			error;
3360 	void			*addr;
3361 	socklen_t		addrlen;
3362 	void			*src;
3363 	socklen_t		srclen;
3364 	ssize_t			len;
3365 	int			size;
3366 	struct T_opthdr		toh;
3367 	struct fdbuf		*fdbuf;
3368 	t_uscalar_t		optlen;
3369 	void			*fds;
3370 	int			fdlen;
3371 
3372 	ASSERT(name && namelen);
3373 	ASSERT(control && controllen);
3374 
3375 	len = uiop->uio_resid;
3376 	if (len > (ssize_t)so->so_tidu_size) {
3377 		return (EMSGSIZE);
3378 	}
3379 
3380 	/*
3381 	 * For AF_UNIX the destination address is translated to an internal
3382 	 * name and the source address is passed as an option.
3383 	 * Also, file descriptors are passed as file pointers in an
3384 	 * option.
3385 	 */
3386 
3387 	/*
3388 	 * Length and family checks.
3389 	 */
3390 	error = so_addr_verify(so, name, namelen);
3391 	if (error) {
3392 		eprintsoline(so, error);
3393 		return (error);
3394 	}
3395 	if (so->so_family == AF_UNIX) {
3396 		if (so->so_state & SS_FADDR_NOXLATE) {
3397 			/*
3398 			 * Already have a transport internal address. Do not
3399 			 * pass any (transport internal) source address.
3400 			 */
3401 			addr = name;
3402 			addrlen = namelen;
3403 			src = NULL;
3404 			srclen = 0;
3405 		} else {
3406 			/*
3407 			 * Pass the sockaddr_un source address as an option
3408 			 * and translate the remote address.
3409 			 *
3410 			 * Note that this code does not prevent so_laddr_sa
3411 			 * from changing while it is being used. Thus
3412 			 * if an unbind+bind occurs concurrently with this
3413 			 * send the peer might see a partially new and a
3414 			 * partially old "from" address.
3415 			 */
3416 			src = so->so_laddr_sa;
3417 			srclen = (t_uscalar_t)so->so_laddr_len;
3418 			dprintso(so, 1,
3419 			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3420 			    srclen, src));
3421 			error = so_ux_addr_xlate(so, name, namelen,
3422 				(flags & MSG_XPG4_2),
3423 				&addr, &addrlen);
3424 			if (error) {
3425 				eprintsoline(so, error);
3426 				return (error);
3427 			}
3428 		}
3429 	} else {
3430 		addr = name;
3431 		addrlen = namelen;
3432 		src = NULL;
3433 		srclen = 0;
3434 	}
3435 	optlen = so_optlen(control, controllen,
3436 					!(flags & MSG_XPG4_2));
3437 	tudr.PRIM_type = T_UNITDATA_REQ;
3438 	tudr.DEST_length = addrlen;
3439 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3440 	if (srclen != 0)
3441 		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3442 		    _TPI_ALIGN_TOPT(srclen));
3443 	else
3444 		tudr.OPT_length = optlen;
3445 	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3446 				_TPI_ALIGN_TOPT(addrlen));
3447 
3448 	size = tudr.OPT_offset + tudr.OPT_length;
3449 
3450 	/*
3451 	 * File descriptors only when SM_FDPASSING set.
3452 	 */
3453 	error = so_getfdopt(control, controllen,
3454 			!(flags & MSG_XPG4_2), &fds, &fdlen);
3455 	if (error)
3456 		return (error);
3457 	if (fdlen != -1) {
3458 		if (!(so->so_mode & SM_FDPASSING))
3459 			return (EOPNOTSUPP);
3460 
3461 		error = fdbuf_create(fds, fdlen, &fdbuf);
3462 		if (error)
3463 			return (error);
3464 		mp = fdbuf_allocmsg(size, fdbuf);
3465 	} else {
3466 		mp = soallocproto(size, _ALLOC_INTR);
3467 		if (mp == NULL) {
3468 			/*
3469 			 * Caught a signal waiting for memory.
3470 			 * Let send* return EINTR.
3471 			 */
3472 			return (EINTR);
3473 		}
3474 	}
3475 	soappendmsg(mp, &tudr, sizeof (tudr));
3476 	soappendmsg(mp, addr, addrlen);
3477 	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3478 
3479 	if (fdlen != -1) {
3480 		ASSERT(fdbuf != NULL);
3481 		toh.level = SOL_SOCKET;
3482 		toh.name = SO_FILEP;
3483 		toh.len = fdbuf->fd_size +
3484 				(t_uscalar_t)sizeof (struct T_opthdr);
3485 		toh.status = 0;
3486 		soappendmsg(mp, &toh, sizeof (toh));
3487 		soappendmsg(mp, fdbuf, fdbuf->fd_size);
3488 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3489 	}
3490 	if (srclen != 0) {
3491 		/*
3492 		 * There is a AF_UNIX sockaddr_un to include as a source
3493 		 * address option.
3494 		 */
3495 		toh.level = SOL_SOCKET;
3496 		toh.name = SO_SRCADDR;
3497 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3498 		toh.status = 0;
3499 		soappendmsg(mp, &toh, sizeof (toh));
3500 		soappendmsg(mp, src, srclen);
3501 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3502 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3503 	}
3504 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3505 	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3506 	/* At most 3 bytes left in the message */
3507 	ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3508 	ASSERT(MBLKL(mp) <= (ssize_t)size);
3509 
3510 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3511 #ifdef C2_AUDIT
3512 	if (audit_active)
3513 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3514 #endif /* C2_AUDIT */
3515 
3516 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3517 #ifdef SOCK_DEBUG
3518 	if (error) {
3519 		eprintsoline(so, error);
3520 	}
3521 #endif /* SOCK_DEBUG */
3522 	return (error);
3523 }
3524 
3525 /*
3526  * Sending data with options on a connected stream socket.
3527  * Assumes caller has verified that SS_ISCONNECTED is set.
3528  */
3529 static int
3530 sosend_svccmsg(struct sonode *so,
3531 		struct uio *uiop,
3532 		int more,
3533 		void *control,
3534 		t_uscalar_t controllen,
3535 		int flags)
3536 {
3537 	struct T_optdata_req	tdr;
3538 	mblk_t			*mp;
3539 	int			error;
3540 	ssize_t			iosize;
3541 	int			first = 1;
3542 	int			size;
3543 	struct fdbuf		*fdbuf;
3544 	t_uscalar_t		optlen;
3545 	void			*fds;
3546 	int			fdlen;
3547 	struct T_opthdr		toh;
3548 
3549 	dprintso(so, 1,
3550 		("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3551 
3552 	/*
3553 	 * Has to be bound and connected. However, since no locks are
3554 	 * held the state could have changed after sotpi_sendmsg checked it
3555 	 * thus it is not possible to ASSERT on the state.
3556 	 */
3557 
3558 	/* Options on connection-oriented only when SM_OPTDATA set. */
3559 	if (!(so->so_mode & SM_OPTDATA))
3560 		return (EOPNOTSUPP);
3561 
3562 	do {
3563 		/*
3564 		 * Set the MORE flag if uio_resid does not fit in this
3565 		 * message or if the caller passed in "more".
3566 		 * Error for transports with zero tidu_size.
3567 		 */
3568 		tdr.PRIM_type = T_OPTDATA_REQ;
3569 		iosize = so->so_tidu_size;
3570 		if (iosize <= 0)
3571 			return (EMSGSIZE);
3572 		if (uiop->uio_resid > iosize) {
3573 			tdr.DATA_flag = 1;
3574 		} else {
3575 			if (more)
3576 				tdr.DATA_flag = 1;
3577 			else
3578 				tdr.DATA_flag = 0;
3579 			iosize = uiop->uio_resid;
3580 		}
3581 		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3582 			tdr.DATA_flag, iosize));
3583 
3584 		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3585 		tdr.OPT_length = optlen;
3586 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3587 
3588 		size = (int)sizeof (tdr) + optlen;
3589 		/*
3590 		 * File descriptors only when SM_FDPASSING set.
3591 		 */
3592 		error = so_getfdopt(control, controllen,
3593 				!(flags & MSG_XPG4_2), &fds, &fdlen);
3594 		if (error)
3595 			return (error);
3596 		if (fdlen != -1) {
3597 			if (!(so->so_mode & SM_FDPASSING))
3598 				return (EOPNOTSUPP);
3599 
3600 			error = fdbuf_create(fds, fdlen, &fdbuf);
3601 			if (error)
3602 				return (error);
3603 			mp = fdbuf_allocmsg(size, fdbuf);
3604 		} else {
3605 			mp = soallocproto(size, _ALLOC_INTR);
3606 			if (mp == NULL) {
3607 				/*
3608 				 * Caught a signal waiting for memory.
3609 				 * Let send* return EINTR.
3610 				 */
3611 				return (first ? EINTR : 0);
3612 			}
3613 		}
3614 		soappendmsg(mp, &tdr, sizeof (tdr));
3615 
3616 		if (fdlen != -1) {
3617 			ASSERT(fdbuf != NULL);
3618 			toh.level = SOL_SOCKET;
3619 			toh.name = SO_FILEP;
3620 			toh.len = fdbuf->fd_size +
3621 				(t_uscalar_t)sizeof (struct T_opthdr);
3622 			toh.status = 0;
3623 			soappendmsg(mp, &toh, sizeof (toh));
3624 			soappendmsg(mp, fdbuf, fdbuf->fd_size);
3625 			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3626 		}
3627 		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3628 		/* At most 3 bytes left in the message */
3629 		ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3630 		ASSERT(MBLKL(mp) <= (ssize_t)size);
3631 
3632 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3633 
3634 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3635 					0, MSG_BAND, 0);
3636 		if (error) {
3637 			if (!first && error == EWOULDBLOCK)
3638 				return (0);
3639 			eprintsoline(so, error);
3640 			return (error);
3641 		}
3642 		control = NULL;
3643 		first = 0;
3644 		if (uiop->uio_resid > 0) {
3645 			/*
3646 			 * Recheck for fatal errors. Fail write even though
3647 			 * some data have been written. This is consistent
3648 			 * with strwrite semantics and BSD sockets semantics.
3649 			 */
3650 			if (so->so_state & SS_CANTSENDMORE) {
3651 				tsignal(curthread, SIGPIPE);
3652 				eprintsoline(so, error);
3653 				return (EPIPE);
3654 			}
3655 			if (so->so_error != 0) {
3656 				mutex_enter(&so->so_lock);
3657 				error = sogeterr(so);
3658 				mutex_exit(&so->so_lock);
3659 				if (error != 0) {
3660 					eprintsoline(so, error);
3661 					return (error);
3662 				}
3663 			}
3664 		}
3665 	} while (uiop->uio_resid > 0);
3666 	return (0);
3667 }
3668 
3669 /*
3670  * Sending data on a datagram socket.
3671  * Assumes caller has verified that SS_ISBOUND etc. are set.
3672  *
3673  * For AF_UNIX the destination address is translated to an internal
3674  * name and the source address is passed as an option.
3675  */
3676 int
3677 sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
3678     struct uio *uiop, int flags)
3679 {
3680 	struct T_unitdata_req	tudr;
3681 	mblk_t			*mp;
3682 	int			error;
3683 	void			*addr;
3684 	socklen_t		addrlen;
3685 	void			*src;
3686 	socklen_t		srclen;
3687 	ssize_t			len;
3688 
3689 	ASSERT(name != NULL && namelen != 0);
3690 
3691 	len = uiop->uio_resid;
3692 	if (len > so->so_tidu_size) {
3693 		error = EMSGSIZE;
3694 		goto done;
3695 	}
3696 
3697 	/* Length and family checks */
3698 	error = so_addr_verify(so, name, namelen);
3699 	if (error != 0)
3700 		goto done;
3701 
3702 	if (so->so_state & SS_DIRECT)
3703 		return (sodgram_direct(so, name, namelen, uiop, flags));
3704 
3705 	if (so->so_family == AF_UNIX) {
3706 		if (so->so_state & SS_FADDR_NOXLATE) {
3707 			/*
3708 			 * Already have a transport internal address. Do not
3709 			 * pass any (transport internal) source address.
3710 			 */
3711 			addr = name;
3712 			addrlen = namelen;
3713 			src = NULL;
3714 			srclen = 0;
3715 		} else {
3716 			/*
3717 			 * Pass the sockaddr_un source address as an option
3718 			 * and translate the remote address.
3719 			 *
3720 			 * Note that this code does not prevent so_laddr_sa
3721 			 * from changing while it is being used. Thus
3722 			 * if an unbind+bind occurs concurrently with this
3723 			 * send the peer might see a partially new and a
3724 			 * partially old "from" address.
3725 			 */
3726 			src = so->so_laddr_sa;
3727 			srclen = (socklen_t)so->so_laddr_len;
3728 			dprintso(so, 1,
3729 				("sosend_dgram UNIX: srclen %d, src %p\n",
3730 				srclen, src));
3731 			error = so_ux_addr_xlate(so, name, namelen,
3732 				(flags & MSG_XPG4_2),
3733 				&addr, &addrlen);
3734 			if (error) {
3735 				eprintsoline(so, error);
3736 				goto done;
3737 			}
3738 		}
3739 	} else {
3740 		addr = name;
3741 		addrlen = namelen;
3742 		src = NULL;
3743 		srclen = 0;
3744 	}
3745 	tudr.PRIM_type = T_UNITDATA_REQ;
3746 	tudr.DEST_length = addrlen;
3747 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3748 	if (srclen == 0) {
3749 		tudr.OPT_length = 0;
3750 		tudr.OPT_offset = 0;
3751 
3752 		mp = soallocproto2(&tudr, sizeof (tudr),
3753 		    addr, addrlen, 0, _ALLOC_INTR);
3754 		if (mp == NULL) {
3755 			/*
3756 			 * Caught a signal waiting for memory.
3757 			 * Let send* return EINTR.
3758 			 */
3759 			error = EINTR;
3760 			goto done;
3761 		}
3762 	} else {
3763 		/*
3764 		 * There is a AF_UNIX sockaddr_un to include as a source
3765 		 * address option.
3766 		 */
3767 		struct T_opthdr toh;
3768 		ssize_t size;
3769 
3770 		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
3771 					_TPI_ALIGN_TOPT(srclen));
3772 		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3773 					_TPI_ALIGN_TOPT(addrlen));
3774 
3775 		toh.level = SOL_SOCKET;
3776 		toh.name = SO_SRCADDR;
3777 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3778 		toh.status = 0;
3779 
3780 		size = tudr.OPT_offset + tudr.OPT_length;
3781 		mp = soallocproto2(&tudr, sizeof (tudr),
3782 		    addr, addrlen, size, _ALLOC_INTR);
3783 		if (mp == NULL) {
3784 			/*
3785 			 * Caught a signal waiting for memory.
3786 			 * Let send* return EINTR.
3787 			 */
3788 			error = EINTR;
3789 			goto done;
3790 		}
3791 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3792 		soappendmsg(mp, &toh, sizeof (toh));
3793 		soappendmsg(mp, src, srclen);
3794 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3795 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3796 	}
3797 
3798 #ifdef C2_AUDIT
3799 	if (audit_active)
3800 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3801 #endif /* C2_AUDIT */
3802 
3803 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3804 done:
3805 #ifdef SOCK_DEBUG
3806 	if (error) {
3807 		eprintsoline(so, error);
3808 	}
3809 #endif /* SOCK_DEBUG */
3810 	return (error);
3811 }
3812 
3813 /*
3814  * Sending data on a connected stream socket.
3815  * Assumes caller has verified that SS_ISCONNECTED is set.
3816  */
3817 int
3818 sosend_svc(struct sonode *so,
3819 	struct uio *uiop,
3820 	t_scalar_t prim,
3821 	int more,
3822 	int sflag)
3823 {
3824 	struct T_data_req	tdr;
3825 	mblk_t			*mp;
3826 	int			error;
3827 	ssize_t			iosize;
3828 	int			first = 1;
3829 
3830 	dprintso(so, 1,
3831 		("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
3832 		so, uiop->uio_resid, prim, sflag));
3833 
3834 	/*
3835 	 * Has to be bound and connected. However, since no locks are
3836 	 * held the state could have changed after sotpi_sendmsg checked it
3837 	 * thus it is not possible to ASSERT on the state.
3838 	 */
3839 
3840 	do {
3841 		/*
3842 		 * Set the MORE flag if uio_resid does not fit in this
3843 		 * message or if the caller passed in "more".
3844 		 * Error for transports with zero tidu_size.
3845 		 */
3846 		tdr.PRIM_type = prim;
3847 		iosize = so->so_tidu_size;
3848 		if (iosize <= 0)
3849 			return (EMSGSIZE);
3850 		if (uiop->uio_resid > iosize) {
3851 			tdr.MORE_flag = 1;
3852 		} else {
3853 			if (more)
3854 				tdr.MORE_flag = 1;
3855 			else
3856 				tdr.MORE_flag = 0;
3857 			iosize = uiop->uio_resid;
3858 		}
3859 		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
3860 			prim, tdr.MORE_flag, iosize));
3861 		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR);
3862 		if (mp == NULL) {
3863 			/*
3864 			 * Caught a signal waiting for memory.
3865 			 * Let send* return EINTR.
3866 			 */
3867 			if (first)
3868 				return (EINTR);
3869 			else
3870 				return (0);
3871 		}
3872 
3873 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3874 					0, sflag | MSG_BAND, 0);
3875 		if (error) {
3876 			if (!first && error == EWOULDBLOCK)
3877 				return (0);
3878 			eprintsoline(so, error);
3879 			return (error);
3880 		}
3881 		first = 0;
3882 		if (uiop->uio_resid > 0) {
3883 			/*
3884 			 * Recheck for fatal errors. Fail write even though
3885 			 * some data have been written. This is consistent
3886 			 * with strwrite semantics and BSD sockets semantics.
3887 			 */
3888 			if (so->so_state & SS_CANTSENDMORE) {
3889 				tsignal(curthread, SIGPIPE);
3890 				eprintsoline(so, error);
3891 				return (EPIPE);
3892 			}
3893 			if (so->so_error != 0) {
3894 				mutex_enter(&so->so_lock);
3895 				error = sogeterr(so);
3896 				mutex_exit(&so->so_lock);
3897 				if (error != 0) {
3898 					eprintsoline(so, error);
3899 					return (error);
3900 				}
3901 			}
3902 		}
3903 	} while (uiop->uio_resid > 0);
3904 	return (0);
3905 }
3906 
3907 /*
3908  * Check the state for errors and call the appropriate send function.
3909  *
3910  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
3911  * this function issues a setsockopt to toggle SO_DONTROUTE before and
3912  * after sending the message.
3913  */
3914 static int
3915 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
3916 {
3917 	int		so_state;
3918 	int		so_mode;
3919 	int		error;
3920 	struct sockaddr *name;
3921 	t_uscalar_t	namelen;
3922 	int		dontroute;
3923 	int		flags;
3924 
3925 	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
3926 		so, msg, msg->msg_flags,
3927 		pr_state(so->so_state, so->so_mode), so->so_error));
3928 
3929 	mutex_enter(&so->so_lock);
3930 	so_state = so->so_state;
3931 
3932 	if (so_state & SS_CANTSENDMORE) {
3933 		mutex_exit(&so->so_lock);
3934 		tsignal(curthread, SIGPIPE);
3935 		return (EPIPE);
3936 	}
3937 
3938 	if (so->so_error != 0) {
3939 		error = sogeterr(so);
3940 		if (error != 0) {
3941 			mutex_exit(&so->so_lock);
3942 			return (error);
3943 		}
3944 	}
3945 
3946 	name = (struct sockaddr *)msg->msg_name;
3947 	namelen = msg->msg_namelen;
3948 
3949 	so_mode = so->so_mode;
3950 
3951 	if (name == NULL) {
3952 		if (!(so_state & SS_ISCONNECTED)) {
3953 			mutex_exit(&so->so_lock);
3954 			if (so_mode & SM_CONNREQUIRED)
3955 				return (ENOTCONN);
3956 			else
3957 				return (EDESTADDRREQ);
3958 		}
3959 		if (so_mode & SM_CONNREQUIRED) {
3960 			name = NULL;
3961 			namelen = 0;
3962 		} else {
3963 			/*
3964 			 * Note that this code does not prevent so_faddr_sa
3965 			 * from changing while it is being used. Thus
3966 			 * if an "unconnect"+connect occurs concurrently with
3967 			 * this send the datagram might be delivered to a
3968 			 * garbled address.
3969 			 */
3970 			ASSERT(so->so_faddr_sa);
3971 			name = so->so_faddr_sa;
3972 			namelen = (t_uscalar_t)so->so_faddr_len;
3973 		}
3974 	} else {
3975 		if (!(so_state & SS_ISCONNECTED) &&
3976 		    (so_mode & SM_CONNREQUIRED)) {
3977 			/* Required but not connected */
3978 			mutex_exit(&so->so_lock);
3979 			return (ENOTCONN);
3980 		}
3981 		/*
3982 		 * Ignore the address on connection-oriented sockets.
3983 		 * Just like BSD this code does not generate an error for
3984 		 * TCP (a CONNREQUIRED socket) when sending to an address
3985 		 * passed in with sendto/sendmsg. Instead the data is
3986 		 * delivered on the connection as if no address had been
3987 		 * supplied.
3988 		 */
3989 		if ((so_state & SS_ISCONNECTED) &&
3990 		    !(so_mode & SM_CONNREQUIRED)) {
3991 			mutex_exit(&so->so_lock);
3992 			return (EISCONN);
3993 		}
3994 		if (!(so_state & SS_ISBOUND)) {
3995 			so_lock_single(so);	/* Set SOLOCKED */
3996 			error = sotpi_bind(so, NULL, 0,
3997 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD);
3998 			so_unlock_single(so, SOLOCKED);
3999 			if (error) {
4000 				mutex_exit(&so->so_lock);
4001 				eprintsoline(so, error);
4002 				return (error);
4003 			}
4004 		}
4005 		/*
4006 		 * Handle delayed datagram errors. These are only queued
4007 		 * when the application sets SO_DGRAM_ERRIND.
4008 		 * Return the error if we are sending to the address
4009 		 * that was returned in the last T_UDERROR_IND.
4010 		 * If sending to some other address discard the delayed
4011 		 * error indication.
4012 		 */
4013 		if (so->so_delayed_error) {
4014 			struct T_uderror_ind	*tudi;
4015 			void			*addr;
4016 			t_uscalar_t		addrlen;
4017 			boolean_t		match = B_FALSE;
4018 
4019 			ASSERT(so->so_eaddr_mp);
4020 			error = so->so_delayed_error;
4021 			so->so_delayed_error = 0;
4022 			tudi = (struct T_uderror_ind *)so->so_eaddr_mp->b_rptr;
4023 			addrlen = tudi->DEST_length;
4024 			addr = sogetoff(so->so_eaddr_mp,
4025 					tudi->DEST_offset,
4026 					addrlen, 1);
4027 			ASSERT(addr);	/* Checked by strsock_proto */
4028 			switch (so->so_family) {
4029 			case AF_INET: {
4030 				/* Compare just IP address and port */
4031 				sin_t *sin1 = (sin_t *)name;
4032 				sin_t *sin2 = (sin_t *)addr;
4033 
4034 				if (addrlen == sizeof (sin_t) &&
4035 				    namelen == addrlen &&
4036 				    sin1->sin_port == sin2->sin_port &&
4037 				    sin1->sin_addr.s_addr ==
4038 				    sin2->sin_addr.s_addr)
4039 					match = B_TRUE;
4040 				break;
4041 			}
4042 			case AF_INET6: {
4043 				/* Compare just IP address and port. Not flow */
4044 				sin6_t *sin1 = (sin6_t *)name;
4045 				sin6_t *sin2 = (sin6_t *)addr;
4046 
4047 				if (addrlen == sizeof (sin6_t) &&
4048 				    namelen == addrlen &&
4049 				    sin1->sin6_port == sin2->sin6_port &&
4050 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4051 					&sin2->sin6_addr))
4052 					match = B_TRUE;
4053 				break;
4054 			}
4055 			case AF_UNIX:
4056 			default:
4057 				if (namelen == addrlen &&
4058 				    bcmp(name, addr, namelen) == 0)
4059 					match = B_TRUE;
4060 			}
4061 			if (match) {
4062 				freemsg(so->so_eaddr_mp);
4063 				so->so_eaddr_mp = NULL;
4064 				mutex_exit(&so->so_lock);
4065 #ifdef DEBUG
4066 				dprintso(so, 0,
4067 					("sockfs delayed error %d for %s\n",
4068 					error,
4069 					pr_addr(so->so_family, name, namelen)));
4070 #endif /* DEBUG */
4071 				return (error);
4072 			}
4073 			freemsg(so->so_eaddr_mp);
4074 			so->so_eaddr_mp = NULL;
4075 		}
4076 	}
4077 	mutex_exit(&so->so_lock);
4078 
4079 	flags = msg->msg_flags;
4080 	dontroute = 0;
4081 	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4082 		uint32_t	val;
4083 
4084 		val = 1;
4085 		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4086 					&val, (t_uscalar_t)sizeof (val));
4087 		if (error)
4088 			return (error);
4089 		dontroute = 1;
4090 	}
4091 
4092 	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4093 		error = EOPNOTSUPP;
4094 		goto done;
4095 	}
4096 	if (msg->msg_controllen != 0) {
4097 		if (!(so_mode & SM_CONNREQUIRED)) {
4098 			error = sosend_dgramcmsg(so, name, namelen, uiop,
4099 			    msg->msg_control, msg->msg_controllen, flags);
4100 		} else {
4101 			if (flags & MSG_OOB) {
4102 				/* Can't generate T_EXDATA_REQ with options */
4103 				error = EOPNOTSUPP;
4104 				goto done;
4105 			}
4106 			error = sosend_svccmsg(so, uiop,
4107 				!(flags & MSG_EOR),
4108 				msg->msg_control, msg->msg_controllen,
4109 				flags);
4110 		}
4111 		goto done;
4112 	}
4113 
4114 	if (!(so_mode & SM_CONNREQUIRED)) {
4115 		/*
4116 		 * If there is no SO_DONTROUTE to turn off return immediately
4117 		 * from send_dgram. This can allow tail-call optimizations.
4118 		 */
4119 		if (!dontroute) {
4120 			return (sosend_dgram(so, name, namelen, uiop, flags));
4121 		}
4122 		error = sosend_dgram(so, name, namelen, uiop, flags);
4123 	} else {
4124 		t_scalar_t prim;
4125 		int sflag;
4126 
4127 		/* Ignore msg_name in the connected state */
4128 		if (flags & MSG_OOB) {
4129 			prim = T_EXDATA_REQ;
4130 			/*
4131 			 * Send down T_EXDATA_REQ even if there is flow
4132 			 * control for data.
4133 			 */
4134 			sflag = MSG_IGNFLOW;
4135 		} else {
4136 			if (so_mode & SM_BYTESTREAM) {
4137 				/* Byte stream transport - use write */
4138 
4139 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4140 				/*
4141 				 * If there is no SO_DONTROUTE to turn off,
4142 				 * SS_DIRECT is on, and there is no flow
4143 				 * control, we can take the fast path.
4144 				 */
4145 				if (!dontroute &&
4146 				    (so_state & SS_DIRECT) &&
4147 				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4148 					return (sostream_direct(so, uiop,
4149 					    NULL, CRED()));
4150 				}
4151 				error = strwrite(SOTOV(so), uiop, CRED());
4152 				goto done;
4153 			}
4154 			prim = T_DATA_REQ;
4155 			sflag = 0;
4156 		}
4157 		/*
4158 		 * If there is no SO_DONTROUTE to turn off return immediately
4159 		 * from sosend_svc. This can allow tail-call optimizations.
4160 		 */
4161 		if (!dontroute)
4162 			return (sosend_svc(so, uiop, prim,
4163 				!(flags & MSG_EOR), sflag));
4164 		error = sosend_svc(so, uiop, prim,
4165 				!(flags & MSG_EOR), sflag);
4166 	}
4167 	ASSERT(dontroute);
4168 done:
4169 	if (dontroute) {
4170 		uint32_t	val;
4171 
4172 		val = 0;
4173 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4174 				&val, (t_uscalar_t)sizeof (val));
4175 	}
4176 	return (error);
4177 }
4178 
4179 /*
4180  * Sending data on a datagram socket.
4181  * Assumes caller has verified that SS_ISBOUND etc. are set.
 *
 * If the queue following the stream's sd_wrq is not flow controlled and
 * mcopyinuio() can copy in the user data, the resulting M_DATA message
 * and the destination address are passed directly to udp_wput_data().
 * Otherwise the data is sent using strwrite() (connected socket) or as
 * a T_UNITDATA_REQ using kstrputmsg().
 *
 * Returns 0 or an errno value.
4182  */
4183 /* ARGSUSED */
4184 static int
4185 sodgram_direct(struct sonode *so, struct sockaddr *name,
4186     socklen_t namelen, struct uio *uiop, int flags)
4187 {
4188 	struct T_unitdata_req	tudr;
4189 	mblk_t			*mp;
4190 	int			error = 0;
4191 	void			*addr;
4192 	socklen_t		addrlen;
4193 	ssize_t			len;
4194 	struct stdata		*stp = SOTOV(so)->v_stream;
4195 	int			so_state;
4196 	queue_t			*udp_wq;
4197 
4198 	ASSERT(name != NULL && namelen != 0);
4199 	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4200 	ASSERT(!(so->so_mode & SM_EXDATA));
4201 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4202 	ASSERT(SOTOV(so)->v_type == VSOCK);
4203 
4204 	/* Caller checked for proper length */
4205 	len = uiop->uio_resid;
4206 	ASSERT(len <= so->so_tidu_size);
4207 
4208 	/* Length and family checks have been done by caller */
4209 	ASSERT(name->sa_family == so->so_family);
4210 	ASSERT(so->so_family == AF_INET ||
4211 	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4212 	ASSERT(so->so_family == AF_INET6 ||
4213 	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4214 
4215 	addr = name;
4216 	addrlen = namelen;
4217 
4218 	if (stp->sd_sidp != NULL &&
4219 	    (error = straccess(stp, JCWRITE)) != 0)
4220 		goto done;
4221 
	/* Snapshot of so_state; so_lock is not taken in this routine */
4222 	so_state = so->so_state;
4223 
4224 	/*
4225 	 * For UDP we don't break up the copyin into smaller pieces
4226 	 * as in the TCP case.  That means if ENOMEM is returned by
4227 	 * mcopyinuio() then the uio vector has not been modified at
4228 	 * all and we fall back to either strwrite() or kstrputmsg()
4229 	 * below.  Note also that we never generate priority messages
4230 	 * from here.
4231 	 */
4232 	udp_wq = stp->sd_wrq->q_next;
4233 	if (canput(udp_wq) &&
4234 	    (mp = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4235 		ASSERT(DB_TYPE(mp) == M_DATA);
4236 		ASSERT(uiop->uio_resid == 0);
4237 #ifdef C2_AUDIT
4238 		if (audit_active)
4239 			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4240 #endif /* C2_AUDIT */
4241 		udp_wput_data(udp_wq, mp, addr, addrlen);
4242 		return (0);
4243 	}
	/* Not sent directly; only ENOMEM or a failed canput() continues */
4244 	if (error != 0 && error != ENOMEM)
4245 		return (error);
4246 
4247 	/*
4248 	 * For connected, let strwrite() handle the blocking case.
4249 	 * Otherwise we fall thru and use kstrputmsg().
4250 	 */
4251 	if (so_state & SS_ISCONNECTED)
4252 		return (strwrite(SOTOV(so), uiop, CRED()));
4253 
	/* Build a T_UNITDATA_REQ header followed by the destination address */
4254 	tudr.PRIM_type = T_UNITDATA_REQ;
4255 	tudr.DEST_length = addrlen;
4256 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4257 	tudr.OPT_length = 0;
4258 	tudr.OPT_offset = 0;
4259 
4260 	mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0, _ALLOC_INTR);
4261 	if (mp == NULL) {
4262 		/*
4263 		 * Caught a signal waiting for memory.
4264 		 * Let send* return EINTR.
4265 		 */
4266 		error = EINTR;
4267 		goto done;
4268 	}
4269 
4270 #ifdef C2_AUDIT
4271 	if (audit_active)
4272 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4273 #endif /* C2_AUDIT */
4274 
	/* Any ENOMEM left in error from mcopyinuio() is overwritten here */
4275 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4276 done:
4277 #ifdef SOCK_DEBUG
4278 	if (error != 0) {
4279 		eprintsoline(so, error);
4280 	}
4281 #endif /* SOCK_DEBUG */
4282 	return (error);
4283 }
4284 
/*
 * Write data on a byte-stream (SM_BYTESTREAM) socket by passing it
 * directly to tcp_wput() on the queue following the stream's sd_wrq.
 *
 * If uiop is NULL, mp must be non-NULL and is passed down as is.
 * Otherwise the user data described by uiop is copied in with
 * mcopyinuio() in pieces of at most sd_qn_maxpsz bytes (the whole
 * request if that is INFPSZ) and mp is only used as a local variable.
 * The routine hands off to strwrite()/strwrite_common() when any of the
 * sd_flag bits tested below is set, when mcopyinuio() reports ENOMEM,
 * or when canput() on the tcp queue fails with data still remaining.
 *
 * Returns 0 or an errno value.
 */
4285 int
4286 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4287 {
4288 	struct stdata *stp = SOTOV(so)->v_stream;
4289 	ssize_t iosize, rmax, maxblk;
4290 	queue_t *tcp_wq = stp->sd_wrq->q_next;
4291 	int error = 0, wflag = 0;
4292 
4293 	ASSERT(so->so_mode & SM_BYTESTREAM);
4294 	ASSERT(SOTOV(so)->v_type == VSOCK);
4295 
4296 	if (stp->sd_sidp != NULL &&
4297 	    (error = straccess(stp, JCWRITE)) != 0)
4298 		return (error);
4299 
4300 	if (uiop == NULL) {
4301 		/*
4302 		 * kstrwritemp() should have checked sd_flag and
4303 		 * flow-control before coming here.  If we end up
4304 		 * here it means that we can simply pass down the
4305 		 * data to tcp.
4306 		 */
4307 		ASSERT(mp != NULL);
4308 		tcp_wput(tcp_wq, mp);
4309 		return (0);
4310 	}
4311 
4312 	/* Fall back to strwrite() to do proper error handling */
4313 	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4314 		return (strwrite(SOTOV(so), uiop, cr));
4315 
4316 	rmax = stp->sd_qn_maxpsz;
4317 	ASSERT(rmax >= 0 || rmax == INFPSZ);
	/* Nothing to do for a zero maximum size or an empty request */
4318 	if (rmax == 0 || uiop->uio_resid <= 0)
4319 		return (0);
4320 
	/* No size limit: copy in the whole request in one piece */
4321 	if (rmax == INFPSZ)
4322 		rmax = uiop->uio_resid;
4323 
4324 	maxblk = stp->sd_maxblk;
4325 
4326 	for (;;) {
4327 		iosize = MIN(uiop->uio_resid, rmax);
4328 
4329 		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4330 		if (mp == NULL) {
4331 			/*
4332 			 * Fall back to strwrite() for ENOMEM; if this
4333 			 * is our first time in this routine and the uio
4334 			 * vector has not been modified, we will end up
4335 			 * calling strwrite() without any flag set.
4336 			 */
4337 			if (error == ENOMEM)
4338 				goto slow_send;
4339 			else
4340 				return (error);
4341 		}
4342 		ASSERT(uiop->uio_resid >= 0);
4343 		/*
4344 		 * If mp is non-NULL and ENOMEM is set, it means that
4345 		 * mcopyinuio() was able to break down some of the user
4346 		 * data into one or more mblks.  Send the partial data
4347 		 * to tcp and let the rest be handled in strwrite().
4348 		 */
4349 		ASSERT(error == 0 || error == ENOMEM);
4350 		tcp_wput(tcp_wq, mp);
4351 
		/* Data has been sent down; see the slow_send comment below */
4352 		wflag |= NOINTR;
4353 
4354 		if (uiop->uio_resid == 0) {	/* No more data; we're done */
4355 			ASSERT(error == 0);
4356 			break;
4357 		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4358 		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4359 slow_send:
4360 			/*
4361 			 * We were able to send down partial data using
4362 			 * the direct call interface, but are now relying
4363 			 * on strwrite() to handle the non-fastpath cases.
4364 			 * If the socket is blocking we will sleep in
4365 			 * strwaitq() until write is permitted, otherwise,
4366 			 * we will need to return the amount of bytes
4367 			 * written so far back to the app.  This is the
4368 			 * reason why we pass NOINTR flag to strwrite()
4369 			 * for non-blocking socket, because we don't want
4370 			 * to return EAGAIN when portion of the user data
4371 			 * has actually been sent down.
4372 			 */
4373 			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4374 		}
4375 	}
4376 	return (0);
4377 }
4378 
4379 /*
4380  * Update so_faddr by asking the transport (unless AF_UNIX or AF_NCA).
 *
 * Returns ENOTCONN if the socket is not connected and EINVAL if the
 * X/Open check for SS_CANTSENDMORE fails; otherwise returns 0, even if
 * the TI_GETPEERNAME ioctl fails, in which case so_faddr_sa is left
 * as it was.
4381  */
4382 int
4383 sotpi_getpeername(struct sonode *so)
4384 {
4385 	struct strbuf	strbuf;
4386 	int		error = 0, res;
4387 	void		*addr;
4388 	t_uscalar_t	addrlen;
4389 	k_sigset_t	smask;
4390 
4391 	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4392 		so, pr_state(so->so_state, so->so_mode)));
4393 
4394 	mutex_enter(&so->so_lock);
4395 	so_lock_single(so);	/* Set SOLOCKED */
4396 	if (!(so->so_state & SS_ISCONNECTED)) {
4397 		error = ENOTCONN;
4398 		goto done;
4399 	}
4400 	/* Added this check for X/Open */
4401 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4402 		error = EINVAL;
4403 		if (xnet_check_print) {
4404 			printf("sockfs: X/Open getpeername check => EINVAL\n");
4405 		}
4406 		goto done;
4407 	}
4408 #ifdef DEBUG
4409 	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4410 		pr_addr(so->so_family, so->so_faddr_sa,
4411 			(t_uscalar_t)so->so_faddr_len)));
4412 #endif /* DEBUG */
4413 
4414 	if (so->so_family == AF_UNIX || so->so_family == AF_NCA) {
4415 		/* Transport has different name space - return local info */
4416 		error = 0;
4417 		goto done;
4418 	}
4419 
4420 	ASSERT(so->so_faddr_sa);
4421 	/* Allocate local buffer to use with ioctl */
4422 	addrlen = (t_uscalar_t)so->so_faddr_maxlen;
	/* so_lock is dropped across the allocation and ioctl; SOLOCKED stays */
4423 	mutex_exit(&so->so_lock);
4424 	addr = kmem_alloc(addrlen, KM_SLEEP);
4425 
4426 	/*
4427 	 * Issue TI_GETPEERNAME with signals masked.
4428 	 * Put the result in so_faddr_sa so that getpeername works after
4429 	 * a shutdown(output).
4430 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is ignored
4431 	 * and the previously recorded so_faddr_sa is used (see below).
4432 	 */
4433 	strbuf.buf = addr;
4434 	strbuf.maxlen = addrlen;
4435 	strbuf.len = 0;
4436 
4437 	sigintr(&smask, 0);
4438 	res = 0;
4439 	ASSERT(CRED());
4440 	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4441 			0, K_TO_K, CRED(), &res);
4442 	sigunintr(&smask);
4443 
4444 	mutex_enter(&so->so_lock);
4445 	/*
4446 	 * If there is an error don't fail the getpeername and don't
4447 	 * record the error on the socket. Instead fall back on the recorded
4448 	 * so->so_faddr_sa.
4449 	 */
4450 	if (error) {
4451 		/*
4452 		 * Various stream head errors can be returned to the ioctl.
4453 		 * However, it is impossible to determine which ones of
4454 		 * these are really socket level errors that were incorrectly
4455 		 * consumed by the ioctl. Thus this code silently ignores the
4456 		 * error - the code explicitly does not reinstate the error
4457 		 * using soseterror().
4458 		 * Experiments have shown that at least this set of
4459 		 * errors are reported and should not be reinstated on the
4460 		 * socket:
4461 		 *	EINVAL	E.g. if an I_LINK was in effect when
4462 		 *		getpeername was called.
4463 		 *	EPIPE	The ioctl error semantics prefer the write
4464 		 *		side error over the read side error.
4465 		 *	ENOTCONN The transport just got disconnected but
4466 		 *		sockfs had not yet seen the T_DISCON_IND
4467 		 *		when issuing the ioctl.
4468 		 */
4469 		error = 0;
4470 	} else if (res == 0 && strbuf.len > 0 &&
4471 	    (so->so_state & SS_ISCONNECTED)) {
		/* State is rechecked since so_lock was dropped for the ioctl */
4472 		ASSERT(strbuf.len <= (int)so->so_faddr_maxlen);
4473 		so->so_faddr_len = (socklen_t)strbuf.len;
4474 		bcopy(addr, so->so_faddr_sa, so->so_faddr_len);
4475 		so->so_state |= SS_FADDR_VALID;
4476 	}
4477 	kmem_free(addr, addrlen);
4478 #ifdef DEBUG
4479 	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
4480 			pr_addr(so->so_family, so->so_faddr_sa,
4481 				(t_uscalar_t)so->so_faddr_len)));
4482 #endif /* DEBUG */
4483 done:
4484 	so_unlock_single(so, SOLOCKED);
4485 	mutex_exit(&so->so_lock);
4486 	return (error);
4487 }
4488 
4489 /*
4490  * Update so_laddr by asking the transport (unless AF_UNIX).
 *
 * For an unbound socket (other than AF_UNIX) so_laddr_sa is first zeroed,
 * with only the family filled in for AF_INET and AF_INET6.
 * Always returns 0; if the TI_GETMYNAME ioctl fails so_laddr_sa is left
 * as it was.
4491  */
4492 int
4493 sotpi_getsockname(struct sonode *so)
4494 {
4495 	struct strbuf	strbuf;
4496 	int		error = 0, res;
4497 	void		*addr;
4498 	t_uscalar_t	addrlen;
4499 	k_sigset_t	smask;
4500 
4501 	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
4502 		so, pr_state(so->so_state, so->so_mode)));
4503 
4504 	mutex_enter(&so->so_lock);
4505 	so_lock_single(so);	/* Set SOLOCKED */
4506 	if (!(so->so_state & SS_ISBOUND) && so->so_family != AF_UNIX) {
4507 		/* Return an all zero address except for the family */
4508 		if (so->so_family == AF_INET)
4509 			so->so_laddr_len = (socklen_t)sizeof (sin_t);
4510 		else if (so->so_family == AF_INET6)
4511 			so->so_laddr_len = (socklen_t)sizeof (sin6_t);
4512 		ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
4513 		bzero(so->so_laddr_sa, so->so_laddr_len);
4514 		/*
4515 		 * Can not assume there is a sa_family for all
4516 		 * protocol families.
4517 		 */
4518 		if (so->so_family == AF_INET || so->so_family == AF_INET6)
4519 			so->so_laddr_sa->sa_family = so->so_family;
4520 	}
4521 #ifdef DEBUG
4522 	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
4523 		pr_addr(so->so_family, so->so_laddr_sa,
4524 			(t_uscalar_t)so->so_laddr_len)));
4525 #endif /* DEBUG */
4526 	if (so->so_family == AF_UNIX) {
4527 		/* Transport has different name space - return local info */
4528 		error = 0;
4529 		goto done;
4530 	}
4531 	/* Allocate local buffer to use with ioctl */
4532 	addrlen = (t_uscalar_t)so->so_laddr_maxlen;
	/* so_lock is dropped across the allocation and ioctl; SOLOCKED stays */
4533 	mutex_exit(&so->so_lock);
4534 	addr = kmem_alloc(addrlen, KM_SLEEP);
4535 
4536 	/*
4537 	 * Issue TI_GETMYNAME with signals masked.
4538 	 * Put the result in so_laddr_sa so that getsockname works after
4539 	 * a shutdown(output).
4540 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is ignored
4541 	 * and the previously recorded so_laddr_sa is used (see below).
4542 	 */
4543 	strbuf.buf = addr;
4544 	strbuf.maxlen = addrlen;
4545 	strbuf.len = 0;
4546 
4547 	sigintr(&smask, 0);
4548 	res = 0;
4549 	ASSERT(CRED());
4550 	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
4551 			0, K_TO_K, CRED(), &res);
4552 	sigunintr(&smask);
4553 
4554 	mutex_enter(&so->so_lock);
4555 	/*
4556 	 * If there is an error don't fail the getsockname and don't
4557 	 * record the error on the socket. Instead fall back on the recorded
4558 	 * so->so_laddr_sa.
4559 	 */
4560 	if (error) {
4561 		/*
4562 		 * Various stream head errors can be returned to the ioctl.
4563 		 * However, it is impossible to determine which ones of
4564 		 * these are really socket level errors that were incorrectly
4565 		 * consumed by the ioctl. Thus this code silently ignores the
4566 		 * error - the code explicitly does not reinstate the error
4567 		 * using soseterror().
4568 		 * Experiments have shown that at least this set of
4569 		 * errors are reported and should not be reinstated on the
4570 		 * socket:
4571 		 *	EINVAL	E.g. if an I_LINK was in effect when
4572 		 *		getsockname was called.
4573 		 *	EPIPE	The ioctl error semantics prefer the write
4574 		 *		side error over the read side error.
4575 		 */
4576 		error = 0;
4577 	} else if (res == 0 && strbuf.len > 0 &&
4578 	    (so->so_state & SS_ISBOUND)) {
		/* State is rechecked since so_lock was dropped for the ioctl */
4579 		ASSERT(strbuf.len <= (int)so->so_laddr_maxlen);
4580 		so->so_laddr_len = (socklen_t)strbuf.len;
4581 		bcopy(addr, so->so_laddr_sa, so->so_laddr_len);
4582 		so->so_state |= SS_LADDR_VALID;
4583 	}
4584 	kmem_free(addr, addrlen);
4585 #ifdef DEBUG
4586 	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
4587 			pr_addr(so->so_family, so->so_laddr_sa,
4588 				(t_uscalar_t)so->so_laddr_len)));
4589 #endif /* DEBUG */
4590 done:
4591 	so_unlock_single(so, SOLOCKED);
4592 	mutex_exit(&so->so_lock);
4593 	return (error);
4594 }
4595 
4596 /*
4597  * Get socket options. For SOL_SOCKET options some options are handled
4598  * by the sockfs while others use the value recorded in the sonode as a
4599  * fallback should the T_SVR4_OPTMGMT_REQ fail.
4600  *
4601  * On return at most *optlenp bytes are copied to optval and *optlenp
 * is set to the number of bytes copied.
 *
 * Returns 0 or an errno value: EINVAL if *optlenp is too small for a
 * known SOL_SOCKET option, EPROTO if the T_OPTMGMT_ACK from the transport
 * is malformed and there is no fallback value, or the error from
 * kstrputmsg()/sowaitprim() when there is no fallback value.
4602  */
4603 int
4604 sotpi_getsockopt(struct sonode *so, int level, int option_name,
4605 		void *optval, socklen_t *optlenp, int flags)
4606 {
4607 	struct T_optmgmt_req	optmgmt_req;
4608 	struct T_optmgmt_ack	*optmgmt_ack;
4609 	struct opthdr		oh;
4610 	struct opthdr		*opt_res;
4611 	mblk_t			*mp = NULL;
4612 	int			error = 0;
4613 	void			*option = NULL;	/* Set if fallback value */
4614 	t_uscalar_t		maxlen = *optlenp;
4615 	t_uscalar_t		len;
4616 	uint32_t		value;
4617 
4618 	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
4619 			so, level, option_name, optval, optlenp,
4620 			pr_state(so->so_state, so->so_mode)));
4621 
4622 	mutex_enter(&so->so_lock);
4623 	so_lock_single(so);	/* Set SOLOCKED */
4624 
4625 	/*
4626 	 * Check for SOL_SOCKET options.
4627 	 * Certain SOL_SOCKET options are returned directly whereas
4628 	 * others only provide a default (fallback) value should
4629 	 * the T_SVR4_OPTMGMT_REQ fail.
4630 	 */
4631 	if (level == SOL_SOCKET) {
4632 		/* Check parameters */
4633 		switch (option_name) {
4634 		case SO_TYPE:
4635 		case SO_ERROR:
4636 		case SO_DEBUG:
4637 		case SO_ACCEPTCONN:
4638 		case SO_REUSEADDR:
4639 		case SO_KEEPALIVE:
4640 		case SO_DONTROUTE:
4641 		case SO_BROADCAST:
4642 		case SO_USELOOPBACK:
4643 		case SO_OOBINLINE:
4644 		case SO_SNDBUF:
4645 		case SO_RCVBUF:
4646 #ifdef notyet
4647 		case SO_SNDLOWAT:
4648 		case SO_RCVLOWAT:
4649 		case SO_SNDTIMEO:
4650 		case SO_RCVTIMEO:
4651 #endif /* notyet */
4652 		case SO_DGRAM_ERRIND:
4653 			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
4654 				error = EINVAL;
4655 				eprintsoline(so, error);
4656 				goto done2;
4657 			}
4658 			break;
4659 		case SO_LINGER:
4660 			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
4661 				error = EINVAL;
4662 				eprintsoline(so, error);
4663 				goto done2;
4664 			}
4665 			break;
4666 		}
4667 
4668 		len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
4669 
4670 		switch (option_name) {
4671 		case SO_TYPE:
4672 			value = so->so_type;
4673 			option = &value;
4674 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4675 
4676 		case SO_ERROR:
4677 			value = sogeterr(so);
4678 			option = &value;
4679 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4680 
4681 		case SO_ACCEPTCONN:
4682 			if (so->so_state & SS_ACCEPTCONN)
4683 				value = SO_ACCEPTCONN;
4684 			else
4685 				value = 0;
4686 #ifdef DEBUG
4687 			if (value) {
4688 				dprintso(so, 1,
4689 				    ("sotpi_getsockopt: 0x%x is set\n",
4690 				    option_name));
4691 			} else {
4692 				dprintso(so, 1,
4693 				    ("sotpi_getsockopt: 0x%x not set\n",
4694 				    option_name));
4695 			}
4696 #endif /* DEBUG */
4697 			option = &value;
4698 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4699 
4700 		case SO_DEBUG:
4701 		case SO_REUSEADDR:
4702 		case SO_KEEPALIVE:
4703 		case SO_DONTROUTE:
4704 		case SO_BROADCAST:
4705 		case SO_USELOOPBACK:
4706 		case SO_OOBINLINE:
4707 		case SO_DGRAM_ERRIND:
4708 			value = (so->so_options & option_name);
4709 #ifdef DEBUG
4710 			if (value) {
4711 				dprintso(so, 1,
4712 				    ("sotpi_getsockopt: 0x%x is set\n",
4713 				    option_name));
4714 			} else {
4715 				dprintso(so, 1,
4716 				    ("sotpi_getsockopt: 0x%x not set\n",
4717 				    option_name));
4718 			}
4719 #endif /* DEBUG */
4720 			option = &value;
4721 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4722 
4723 		/*
4724 		 * The following options are only returned by sockfs when the
4725 		 * T_SVR4_OPTMGMT_REQ fails.
4726 		 */
4727 		case SO_LINGER:
4728 			option = &so->so_linger;
4729 			len = (t_uscalar_t)sizeof (struct linger);
4730 			break;
4731 		case SO_SNDBUF: {
4732 			ssize_t lvalue;
4733 
4734 			/*
4735 			 * If the option has not been set then get a default
4736 			 * value from the next write-side queue. This value is
4737 			 * returned if the transport fails
4738 			 * the T_SVR4_OPTMGMT_REQ.
4739 			 */
4740 			lvalue = so->so_sndbuf;
4741 			if (lvalue == 0) {
4742 				mutex_exit(&so->so_lock);
4743 				(void) strqget(strvp2wq(SOTOV(so))->q_next,
4744 						QHIWAT, 0, &lvalue);
4745 				mutex_enter(&so->so_lock);
4746 				dprintso(so, 1,
4747 				    ("got SO_SNDBUF %ld from q\n", lvalue));
4748 			}
4749 			value = (int)lvalue;
4750 			option = &value;
4751 			len = (t_uscalar_t)sizeof (so->so_sndbuf);
4752 			break;
4753 		}
4754 		case SO_RCVBUF: {
4755 			ssize_t lvalue;
4756 
4757 			/*
4758 			 * If the option has not been set then get a default
4759 			 * value from the read queue. This value is
4760 			 * returned if the transport fails
4761 			 * the T_SVR4_OPTMGMT_REQ.
4762 			 *
4763 			 * XXX If SO_RCVBUF has been set and this is an
4764 			 * XPG 4.2 application then do not ask the transport
4765 			 * since the transport might adjust the value and not
4766 			 * return exactly what was set by the application.
4767 			 * For non-XPG 4.2 application we return the value
4768 			 * that the transport is actually using.
4769 			 */
4770 			lvalue = so->so_rcvbuf;
4771 			if (lvalue == 0) {
4772 				mutex_exit(&so->so_lock);
4773 				(void) strqget(RD(strvp2wq(SOTOV(so))),
4774 						QHIWAT, 0, &lvalue);
4775 				mutex_enter(&so->so_lock);
4776 				dprintso(so, 1,
4777 				    ("got SO_RCVBUF %ld from q\n", lvalue));
4778 			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
4779 				value = (int)lvalue;
4780 				option = &value;
4781 				goto copyout;	/* skip asking transport */
4782 			}
4783 			value = (int)lvalue;
4784 			option = &value;
4785 			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
4786 			break;
4787 		}
4788 #ifdef notyet
4789 		/*
4790 		 * We do not implement the semantics of these options
4791 		 * thus we shouldn't implement the options either.
4792 		 */
4793 		case SO_SNDLOWAT:
4794 			value = so->so_sndlowat;
4795 			option = &value;
4796 			break;
4797 		case SO_RCVLOWAT:
4798 			value = so->so_rcvlowat;
4799 			option = &value;
4800 			break;
4801 		case SO_SNDTIMEO:
4802 			value = so->so_sndtimeo;
4803 			option = &value;
4804 			break;
4805 		case SO_RCVTIMEO:
4806 			value = so->so_rcvtimeo;
4807 			option = &value;
4808 			break;
4809 #endif /* notyet */
4810 		}
4811 	}
4812 
4813 	if (so->so_family == AF_NCA) {
4814 		goto done2;
4815 	}
4816 
4817 	mutex_exit(&so->so_lock);
4818 
4819 	/* Send request */
4820 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
4821 	optmgmt_req.MGMT_flags = T_CHECK;
4822 	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
4823 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
4824 
4825 	oh.level = level;
4826 	oh.name = option_name;
4827 	oh.len = maxlen;
4828 
4829 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
4830 	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP);
4831 	/* Let option management work in the presence of data flow control */
4832 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
4833 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
4834 	mp = NULL;
4835 	mutex_enter(&so->so_lock);
4836 	if (error) {
4837 		eprintsoline(so, error);
4838 		goto done2;
4839 	}
4840 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
4841 	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
4842 	if (error) {
4843 		if (option != NULL) {
4844 			/* We have a fallback value */
4845 			error = 0;
4846 			goto copyout;
4847 		}
4848 		eprintsoline(so, error);
4849 		goto done2;
4850 	}
4851 	ASSERT(mp);
4852 	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
4853 	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
4854 			optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
4855 	if (opt_res == NULL) {
4856 		if (option != NULL) {
4857 			/* We have a fallback value */
4858 			error = 0;
4859 			goto copyout;
4860 		}
4861 		error = EPROTO;
4862 		eprintsoline(so, error);
4863 		goto done;
4864 	}
4865 	/* Bounds-check the option before dropping any fallback value */
4866 	if (((uintptr_t)&opt_res[1] + opt_res->len < (uintptr_t)&opt_res[1]) ||
4867 	    (uintptr_t)&opt_res[1] + opt_res->len > (uintptr_t)mp->b_wptr) {
4868 		if (option != NULL) {
4869 			/* We have a fallback value */
4870 			error = 0;
4871 			goto copyout;
4872 		}
4873 		error = EPROTO;
4874 		eprintsoline(so, error);
4875 		goto done;
4876 	}
4877 
4878 	/* The transport's value is valid; it supersedes any fallback */
4879 	option = &opt_res[1];
4880 	len = opt_res->len;
4881 
4882 copyout: {
4883 		t_uscalar_t size = MIN(len, maxlen);
4884 		bcopy(option, optval, size);
4885 		bcopy(&size, optlenp, sizeof (size));
4886 	}
4887 done:
4888 	freemsg(mp);
4889 done2:
4890 	so_unlock_single(so, SOLOCKED);
4891 	mutex_exit(&so->so_lock);
4892 	return (error);
4893 }
4894 
4895 /*
4896  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
4897  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
4898  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
4899  * setsockopt has to work even if the transport does not support the option.
4900  */
4901 int
4902 sotpi_setsockopt(struct sonode *so, int level, int option_name,
4903 	const void *optval, t_uscalar_t optlen)
4904 {
4905 	struct T_optmgmt_req	optmgmt_req;
4906 	struct opthdr		oh;
4907 	mblk_t			*mp;
4908 	int			error = 0;
4909 	boolean_t		handled = B_FALSE;
4910 
4911 	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
4912 			so, level, option_name, optval, optlen,
4913 			pr_state(so->so_state, so->so_mode)));
4914 
4915 
4916 	/* X/Open requires this check */
4917 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4918 		if (xnet_check_print)
4919 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
4920 		return (EINVAL);
4921 	}
4922 
4923 	/* Caller allocates aligned optval, or passes null */
4924 	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
4925 	/* If optval is null optlen is 0, and vice-versa */
4926 	ASSERT(optval != NULL || optlen == 0);
4927 	ASSERT(optlen != 0 || optval == NULL);
4928 
4929 	mutex_enter(&so->so_lock);
4930 	so_lock_single(so);	/* Set SOLOCKED */
4931 	mutex_exit(&so->so_lock);
4932 
4933 	if (so->so_family == AF_NCA) {
4934 		/* Ignore any flow control problems with the transport. */
4935 		mutex_enter(&so->so_lock);
4936 		goto done;
4937 	}
4938 
4939 	/*
4940 	 * For SOCKET or TCP level options, try to set it here itself
4941 	 * provided socket has not been popped and we know the tcp
4942 	 * structure (stored in so_priv).
4943 	 */
4944 	if ((level == SOL_SOCKET || level == IPPROTO_TCP) &&
4945 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
4946 	    (so->so_version == SOV_SOCKSTREAM) && (so->so_priv != NULL)) {
4947 		tcp_t		*tcp = so->so_priv;
4948 		boolean_t	onoff;
4949 
4950 #define	intvalue	(*(int32_t *)optval)
4951 
4952 		switch (level) {
4953 		case SOL_SOCKET:
4954 			switch (option_name) {		/* Check length param */
4955 			case SO_DEBUG:
4956 			case SO_REUSEADDR:
4957 			case SO_DONTROUTE:
4958 			case SO_BROADCAST:
4959 			case SO_USELOOPBACK:
4960 			case SO_OOBINLINE:
4961 			case SO_DGRAM_ERRIND:
4962 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
4963 					error = EINVAL;
4964 					eprintsoline(so, error);
4965 					mutex_enter(&so->so_lock);
4966 					goto done2;
4967 				}
4968 				ASSERT(optval);
4969 				onoff = intvalue != 0;
4970 				handled = B_TRUE;
4971 				break;
4972 			case SO_LINGER:
4973 				if (optlen !=
4974 				    (t_uscalar_t)sizeof (struct linger)) {
4975 					error = EINVAL;
4976 					eprintsoline(so, error);
4977 					mutex_enter(&so->so_lock);
4978 					goto done2;
4979 				}
4980 				ASSERT(optval);
4981 				handled = B_TRUE;
4982 				break;
4983 			}
4984 
4985 			switch (option_name) {			/* Do actions */
4986 			case SO_LINGER: {
4987 				struct linger *lgr = (struct linger *)optval;
4988 
4989 				if (lgr->l_onoff) {
4990 					tcp->tcp_linger = 1;
4991 					tcp->tcp_lingertime = lgr->l_linger;
4992 					so->so_linger.l_onoff = SO_LINGER;
4993 					so->so_options |= SO_LINGER;
4994 				} else {
4995 					tcp->tcp_linger = 0;
4996 					tcp->tcp_lingertime = 0;
4997 					so->so_linger.l_onoff = 0;
4998 					so->so_options &= ~SO_LINGER;
4999 				}
5000 				so->so_linger.l_linger = lgr->l_linger;
5001 				handled = B_TRUE;
5002 				break;
5003 			}
5004 			case SO_DEBUG:
5005 				tcp->tcp_debug = onoff;
5006 #ifdef SOCK_TEST
5007 				if (intvalue & 2)
5008 					sock_test_timelimit = 10 * hz;
5009 				else
5010 					sock_test_timelimit = 0;
5011 
5012 				if (intvalue & 4)
5013 					do_useracc = 0;
5014 				else
5015 					do_useracc = 1;
5016 #endif /* SOCK_TEST */
5017 				break;
5018 			case SO_DONTROUTE:
5019 				/*
5020 				 * SO_DONTROUTE, SO_USELOOPBACK and
5021 				 * SO_BROADCAST are only of interest to IP.
5022 				 * We track them here only so
5023 				 * that we can report their current value.
5024 				 */
5025 				tcp->tcp_dontroute = onoff;
5026 				if (onoff)
5027 					so->so_options |= option_name;
5028 				else
5029 					so->so_options &= ~option_name;
5030 				break;
5031 			case SO_USELOOPBACK:
5032 				tcp->tcp_useloopback = onoff;
5033 				if (onoff)
5034 					so->so_options |= option_name;
5035 				else
5036 					so->so_options &= ~option_name;
5037 				break;
5038 			case SO_BROADCAST:
5039 				tcp->tcp_broadcast = onoff;
5040 				if (onoff)
5041 					so->so_options |= option_name;
5042 				else
5043 					so->so_options &= ~option_name;
5044 				break;
5045 			case SO_REUSEADDR:
5046 				tcp->tcp_reuseaddr = onoff;
5047 				if (onoff)
5048 					so->so_options |= option_name;
5049 				else
5050 					so->so_options &= ~option_name;
5051 				break;
5052 			case SO_OOBINLINE:
5053 				tcp->tcp_oobinline = onoff;
5054 				if (onoff)
5055 					so->so_options |= option_name;
5056 				else
5057 					so->so_options &= ~option_name;
5058 				break;
5059 			case SO_DGRAM_ERRIND:
5060 				tcp->tcp_dgram_errind = onoff;
5061 				if (onoff)
5062 					so->so_options |= option_name;
5063 				else
5064 					so->so_options &= ~option_name;
5065 				break;
5066 			}
5067 			break;
5068 		case IPPROTO_TCP:
5069 			switch (option_name) {
5070 			case TCP_NODELAY:
5071 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5072 					error = EINVAL;
5073 					eprintsoline(so, error);
5074 					mutex_enter(&so->so_lock);
5075 					goto done2;
5076 				}
5077 				ASSERT(optval);
5078 				tcp->tcp_naglim = intvalue ? 1 : tcp->tcp_mss;
5079 				handled = B_TRUE;
5080 				break;
5081 			}
5082 			break;
5083 		default:
5084 			handled = B_FALSE;
5085 			break;
5086 		}
5087 	}
5088 
5089 	if (handled) {
5090 		mutex_enter(&so->so_lock);
5091 		goto done2;
5092 	}
5093 
5094 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5095 	optmgmt_req.MGMT_flags = T_NEGOTIATE;
5096 	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5097 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5098 
5099 	oh.level = level;
5100 	oh.name = option_name;
5101 	oh.len = optlen;
5102 
5103 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5104 	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP);
5105 	/* Let option management work in the presence of data flow control */
5106 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5107 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5108 	mp = NULL;
5109 	mutex_enter(&so->so_lock);
5110 	if (error) {
5111 		eprintsoline(so, error);
5112 		goto done;
5113 	}
5114 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5115 	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5116 	if (error) {
5117 		eprintsoline(so, error);
5118 		goto done;
5119 	}
5120 	ASSERT(mp);
5121 	/* No need to verify T_optmgmt_ack */
5122 	freemsg(mp);
5123 done:
5124 	/*
5125 	 * Check for SOL_SOCKET options and record their values.
5126 	 * If we know about a SOL_SOCKET parameter and the transport
5127 	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5128 	 * EPROTO) we let the setsockopt succeed.
5129 	 * EPROTO), or with EINVAL, we let the setsockopt succeed.
5130 	if (level == SOL_SOCKET) {
5131 		/* Check parameters */
5132 		switch (option_name) {
5133 		case SO_DEBUG:
5134 		case SO_REUSEADDR:
5135 		case SO_KEEPALIVE:
5136 		case SO_DONTROUTE:
5137 		case SO_BROADCAST:
5138 		case SO_USELOOPBACK:
5139 		case SO_OOBINLINE:
5140 		case SO_SNDBUF:
5141 		case SO_RCVBUF:
5142 #ifdef notyet
5143 		case SO_SNDLOWAT:
5144 		case SO_RCVLOWAT:
5145 		case SO_SNDTIMEO:
5146 		case SO_RCVTIMEO:
5147 #endif /* notyet */
5148 		case SO_DGRAM_ERRIND:
5149 			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5150 				error = EINVAL;
5151 				eprintsoline(so, error);
5152 				goto done2;
5153 			}
5154 			ASSERT(optval);
5155 			handled = B_TRUE;
5156 			break;
5157 		case SO_LINGER:
5158 			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5159 				error = EINVAL;
5160 				eprintsoline(so, error);
5161 				goto done2;
5162 			}
5163 			ASSERT(optval);
5164 			handled = B_TRUE;
5165 			break;
5166 		}
5167 
5168 #define	intvalue	(*(int32_t *)optval)
5169 
5170 		switch (option_name) {
5171 		case SO_TYPE:
5172 		case SO_ERROR:
5173 		case SO_ACCEPTCONN:
5174 			/* Can't be set */
5175 			error = ENOPROTOOPT;
5176 			goto done2;
5177 		case SO_LINGER: {
5178 			struct linger *l = (struct linger *)optval;
5179 
5180 			so->so_linger.l_linger = l->l_linger;
5181 			if (l->l_onoff) {
5182 				so->so_linger.l_onoff = SO_LINGER;
5183 				so->so_options |= SO_LINGER;
5184 			} else {
5185 				so->so_linger.l_onoff = 0;
5186 				so->so_options &= ~SO_LINGER;
5187 			}
5188 			break;
5189 		}
5190 
5191 		case SO_DEBUG:
5192 #ifdef SOCK_TEST
5193 			if (intvalue & 2)
5194 				sock_test_timelimit = 10 * hz;
5195 			else
5196 				sock_test_timelimit = 0;
5197 
5198 			if (intvalue & 4)
5199 				do_useracc = 0;
5200 			else
5201 				do_useracc = 1;
5202 #endif /* SOCK_TEST */
5203 			/* FALLTHRU */
5204 		case SO_REUSEADDR:
5205 		case SO_KEEPALIVE:
5206 		case SO_DONTROUTE:
5207 		case SO_BROADCAST:
5208 		case SO_USELOOPBACK:
5209 		case SO_OOBINLINE:
5210 		case SO_DGRAM_ERRIND:
5211 			if (intvalue != 0) {
5212 				dprintso(so, 1,
5213 					("sotpi_setsockopt: setting 0x%x\n",
5214 					option_name));
5215 				so->so_options |= option_name;
5216 			} else {
5217 				dprintso(so, 1,
5218 					("sotpi_setsockopt: clearing 0x%x\n",
5219 					option_name));
5220 				so->so_options &= ~option_name;
5221 			}
5222 			break;
5223 		/*
5224 		 * The following options are only returned by us when the
5225 		 * T_SVR4_OPTMGMT_REQ fails.
5226 		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5227 		 * since the transport might adjust the value and not
5228 		 * return exactly what was set by the application.
5229 		 */
5230 		case SO_SNDBUF:
5231 			so->so_sndbuf = intvalue;
5232 			break;
5233 		case SO_RCVBUF:
5234 			so->so_rcvbuf = intvalue;
5235 			break;
5236 #ifdef notyet
5237 		/*
5238 		 * We do not implement the semantics of these options
5239 		 * thus we shouldn't implement the options either.
5240 		 */
5241 		case SO_SNDLOWAT:
5242 			so->so_sndlowat = intvalue;
5243 			break;
5244 		case SO_RCVLOWAT:
5245 			so->so_rcvlowat = intvalue;
5246 			break;
5247 		case SO_SNDTIMEO:
5248 			so->so_sndtimeo = intvalue;
5249 			break;
5250 		case SO_RCVTIMEO:
5251 			so->so_rcvtimeo = intvalue;
5252 			break;
5253 #endif /* notyet */
5254 		}
5255 #undef	intvalue
5256 
5257 		if (error) {
5258 			if ((error == ENOPROTOOPT || error == EPROTO ||
5259 			    error == EINVAL) && handled) {
5260 				dprintso(so, 1,
5261 				    ("setsockopt: ignoring error %d for 0x%x\n",
5262 				    error, option_name));
5263 				error = 0;
5264 			}
5265 		}
5266 	}
5267 done2:
5268 ret:
5269 	so_unlock_single(so, SOLOCKED);
5270 	mutex_exit(&so->so_lock);
5271 	return (error);
5272 }
5273