xref: /illumos-gate/usr/src/uts/common/fs/sockfs/socktpi.c (revision d6bb6a8465e557cb946ef49d56ed3202f6218652)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/t_lock.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/buf.h>
34 #include <sys/conf.h>
35 #include <sys/cred.h>
36 #include <sys/kmem.h>
37 #include <sys/sysmacros.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/debug.h>
41 #include <sys/errno.h>
42 #include <sys/time.h>
43 #include <sys/file.h>
44 #include <sys/open.h>
45 #include <sys/user.h>
46 #include <sys/termios.h>
47 #include <sys/stream.h>
48 #include <sys/strsubr.h>
49 #include <sys/strsun.h>
50 #include <sys/ddi.h>
51 #include <sys/esunddi.h>
52 #include <sys/flock.h>
53 #include <sys/modctl.h>
54 #include <sys/vtrace.h>
55 #include <sys/cmn_err.h>
56 #include <sys/pathname.h>
57 
58 #include <sys/socket.h>
59 #include <sys/socketvar.h>
60 #include <sys/sockio.h>
61 #include <netinet/in.h>
62 #include <sys/un.h>
63 #include <sys/strsun.h>
64 
65 #include <sys/tiuser.h>
66 #define	_SUN_TPI_VERSION	2
67 #include <sys/tihdr.h>
68 #include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
69 
70 #include <c2/audit.h>
71 
72 #include <inet/common.h>
73 #include <inet/ip.h>
74 #include <inet/ip6.h>
75 #include <inet/tcp.h>
76 #include <inet/udp_impl.h>
77 
78 #include <fs/sockfs/nl7c.h>
79 #include <sys/zone.h>
80 
81 #include <inet/kssl/ksslapi.h>
82 
83 /*
84  * Possible failures when memory can't be allocated. The documented behavior:
85  *
86  * 		5.5:			4.X:		XNET:
87  * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
88  *							EINTR
89  *	(4.X does not document EINTR but returns it)
90  * bind:	ENOSR			-		ENOBUFS/ENOSR
91  * connect: 	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
92  * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
93  * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
94  *	(4.X getpeername and getsockname do not fail in practice)
95  * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
96  * listen:	-			-		ENOBUFS
97  * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
98  *							EINTR
99  * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
100  *							EINTR
101  * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
102  * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
103  * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
104  * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
105  *
106  * Resolution. When allocation fails:
107  *	recv: return EINTR
108  *	send: return EINTR
109  *	connect, accept: EINTR
110  *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
111  *	socket, socketpair: ENOBUFS
112  *	getpeername, getsockname: sleep
113  *	getsockopt, setsockopt: sleep
114  */
115 
/*
 * Knobs that make sockfs do something other than the standard TPI for
 * the AF_INET transports.  In SOCK_TEST builds they are real variables;
 * in every other build they are compile-time constants carrying the same
 * default values.
 *
 * solisten_tpi_tcp:
 *	TCP can handle an O_T_BIND_REQ with an increased backlog even
 *	though the transport is already bound.  This is needed to avoid
 *	losing the port number should listen() do a T_UNBIND_REQ followed
 *	by an O_T_BIND_REQ.
 *
 * soconnect_tpi_udp:
 *	UDP and ICMP can handle a T_CONN_REQ.  This is needed so that the
 *	sequence connect(), getsockname() returns the local IP address
 *	used to send packets to the connected-to destination.
 *
 * soconnect_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without first seeing an O_T_BIND_REQ
 *	(a performance optimization).  Set this to non-zero to send TPI
 *	conformant messages to TCP in this respect.
 *
 * soaccept_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without the acceptor being bound.
 *	This is a performance optimization that has been picked up in XTI.
 *
 * soaccept_tpi_multioptions:
 *	When the accepting socket inherits SOL_SOCKET options from the
 *	listener, send them as a single message for AF_INET{,6}.
 */
#ifndef SOCK_TEST
#define	solisten_tpi_tcp	0
#define	soconnect_tpi_udp	0
#define	soconnect_tpi_tcp	0
#define	soaccept_tpi_tcp	0
#define	soaccept_tpi_multioptions	1
#else /* SOCK_TEST */
int solisten_tpi_tcp = 0;
int soconnect_tpi_udp = 0;
int soconnect_tpi_tcp = 0;
int soaccept_tpi_tcp = 0;
int soaccept_tpi_multioptions = 1;
#endif /* SOCK_TEST */
158 
#if defined(SOCK_TEST)
/* Hooks that are only declared (and therefore only usable) in test builds. */
extern clock_t	sock_test_timelimit;
extern int	do_useracc;
#endif /* SOCK_TEST */
163 
/*
 * Switches for the checks that X/Open added.  Some of those checks might
 * have to be backed out to keep SunOS 4.X applications working.
 */

/* Non-zero prints a message when an X/Open state check causes EINVAL. */
int xnet_check_print = 0;

/* Non-zero disables the X/Open added checks. */
int xnet_skip_checks = 0;

/* NOTE(review): no use is visible in this part of the file - confirm. */
int xnet_truncate_print = 0;
171 
172 extern	void sigintr(k_sigset_t *, int);
173 extern	void sigunintr(k_sigset_t *);
174 
175 extern	void *nl7c_lookup_addr(void *, t_uscalar_t);
176 extern	void *nl7c_add_addr(void *, t_uscalar_t);
177 extern	void nl7c_listener_addr(void *, queue_t *);
178 
179 /* Sockets acting as an in-kernel SSL proxy */
180 extern mblk_t	*strsock_kssl_input(vnode_t *, mblk_t *, strwakeup_t *,
181 		    strsigset_t *, strsigset_t *, strpollset_t *);
182 extern mblk_t	*strsock_kssl_output(vnode_t *, mblk_t *, strwakeup_t *,
183 		    strsigset_t *, strsigset_t *, strpollset_t *);
184 
185 static int	sotpi_unbind(struct sonode *, int);
186 
187 /* TPI sockfs sonode operations */
188 static int	sotpi_accept(struct sonode *, int, struct sonode **);
189 static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
190 		    int);
191 static int	sotpi_connect(struct sonode *, const struct sockaddr *,
192 		    socklen_t, int, int);
193 static int	sotpi_listen(struct sonode *, int);
194 static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
195 		    struct uio *);
196 static int	sotpi_shutdown(struct sonode *, int);
197 static int	sotpi_getsockname(struct sonode *);
198 static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
199 		    struct uio *, void *, t_uscalar_t, int);
200 static int	sodgram_direct(struct sonode *, struct sockaddr *,
201 		    socklen_t, struct uio *, int);
202 
203 sonodeops_t sotpi_sonodeops = {
204 	sotpi_accept,		/* sop_accept		*/
205 	sotpi_bind,		/* sop_bind		*/
206 	sotpi_listen,		/* sop_listen		*/
207 	sotpi_connect,		/* sop_connect		*/
208 	sotpi_recvmsg,		/* sop_recvmsg		*/
209 	sotpi_sendmsg,		/* sop_sendmsg		*/
210 	sotpi_getpeername,	/* sop_getpeername	*/
211 	sotpi_getsockname,	/* sop_getsockname	*/
212 	sotpi_shutdown,		/* sop_shutdown		*/
213 	sotpi_getsockopt,	/* sop_getsockopt	*/
214 	sotpi_setsockopt	/* sop_setsockopt	*/
215 };
216 
217 /*
218  * Common create code for socket and accept. If tso is set the values
219  * from that node is used instead of issuing a T_INFO_REQ.
220  *
221  * Assumes that the caller has a VN_HOLD on accessvp.
222  * The VN_RELE will occur either when sotpi_create() fails or when
223  * the returned sonode is freed.
224  */
225 struct sonode *
226 sotpi_create(vnode_t *accessvp, int domain, int type, int protocol, int version,
227     struct sonode *tso, int *errorp)
228 {
229 	struct sonode	*so;
230 	vnode_t		*vp;
231 	int		flags, error;
232 
233 	ASSERT(accessvp != NULL);
234 	vp = makesockvp(accessvp, domain, type, protocol);
235 	ASSERT(vp != NULL);
236 	so = VTOSO(vp);
237 
238 	flags = FREAD|FWRITE;
239 
240 	if ((type == SOCK_STREAM || type == SOCK_DGRAM) &&
241 	    (domain == AF_INET || domain == AF_INET6) &&
242 	    (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP ||
243 	    protocol == IPPROTO_IP)) {
244 		/* Tell tcp or udp that it's talking to sockets */
245 		flags |= SO_SOCKSTR;
246 
247 		/*
248 		 * Here we indicate to socktpi_open() our attempt to
249 		 * make direct calls between sockfs and transport.
250 		 * The final decision is left to socktpi_open().
251 		 */
252 		so->so_state |= SS_DIRECT;
253 
254 		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
255 		if (so->so_type == SOCK_STREAM && tso != NULL) {
256 			if (tso->so_state & SS_DIRECT) {
257 				/*
258 				 * Inherit SS_DIRECT from listener and pass
259 				 * SO_ACCEPTOR open flag to tcp, indicating
260 				 * that this is an accept fast-path instance.
261 				 */
262 				flags |= SO_ACCEPTOR;
263 			} else {
264 				/*
265 				 * SS_DIRECT is not set on listener, meaning
266 				 * that the listener has been converted from
267 				 * a socket to a stream.  Ensure that the
268 				 * acceptor inherits these settings.
269 				 */
270 				so->so_state &= ~SS_DIRECT;
271 				flags &= ~SO_SOCKSTR;
272 			}
273 		}
274 	}
275 
276 	/*
277 	 * Tell local transport that it is talking to sockets.
278 	 */
279 	if (so->so_family == AF_UNIX) {
280 		flags |= SO_SOCKSTR;
281 	}
282 
283 	/* Initialize the kernel SSL proxy fields */
284 	so->so_kssl_type = KSSL_NO_PROXY;
285 	so->so_kssl_ent = NULL;
286 	so->so_kssl_ctx = NULL;
287 
288 	if (error = socktpi_open(&vp, flags, CRED())) {
289 		VN_RELE(vp);
290 		*errorp = error;
291 		return (NULL);
292 	}
293 
294 	if (error = so_strinit(so, tso)) {
295 		(void) VOP_CLOSE(vp, 0, 1, 0, CRED());
296 		VN_RELE(vp);
297 		*errorp = error;
298 		return (NULL);
299 	}
300 
301 	if (version == SOV_DEFAULT)
302 		version = so_default_version;
303 
304 	so->so_version = (short)version;
305 
306 	return (so);
307 }
308 
309 /*
310  * Bind the socket to an unspecified address in sockfs only.
311  * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
312  * required in all cases.
313  */
314 static void
315 so_automatic_bind(struct sonode *so)
316 {
317 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
318 
319 	ASSERT(MUTEX_HELD(&so->so_lock));
320 	ASSERT(!(so->so_state & SS_ISBOUND));
321 	ASSERT(so->so_unbind_mp);
322 
323 	ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
324 	bzero(so->so_laddr_sa, so->so_laddr_len);
325 	so->so_laddr_sa->sa_family = so->so_family;
326 	so->so_state |= SS_ISBOUND;
327 }
328 
329 
330 /*
331  * bind the socket.
332  *
333  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
334  * are passed in we allow rebinding. Note that for backwards compatibility
335  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
336  * Thus the rebinding code is currently not executed.
337  *
338  * The constraints for rebinding are:
339  * - it is a SOCK_DGRAM, or
340  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
341  *   and no listen() has been done.
342  * This rebinding code was added based on some language in the XNET book
343  * about not returning EINVAL if the protocol allows rebinding. However,
344  * this language is not present in the Posix socket draft. Thus maybe the
345  * rebinding logic should be deleted from the source.
346  *
347  * A null "name" can be used to unbind the socket if:
348  * - it is a SOCK_DGRAM, or
349  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
350  *   and no listen() has been done.
351  */
352 static int
353 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
354     socklen_t namelen, int backlog, int flags)
355 {
356 	struct T_bind_req	bind_req;
357 	struct T_bind_ack	*bind_ack;
358 	int			error = 0;
359 	mblk_t			*mp;
360 	void			*addr;
361 	t_uscalar_t		addrlen;
362 	int			unbind_on_err = 1;
363 	boolean_t		clear_acceptconn_on_err = B_FALSE;
364 	boolean_t		restore_backlog_on_err = B_FALSE;
365 	int			save_so_backlog;
366 	t_scalar_t		PRIM_type = O_T_BIND_REQ;
367 	boolean_t		tcp_udp_xport;
368 	void			*nl7c = NULL;
369 
370 	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
371 		so, name, namelen, backlog, flags,
372 		pr_state(so->so_state, so->so_mode)));
373 
374 	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
375 
376 	if (!(flags & _SOBIND_LOCK_HELD)) {
377 		mutex_enter(&so->so_lock);
378 		so_lock_single(so);	/* Set SOLOCKED */
379 	} else {
380 		ASSERT(MUTEX_HELD(&so->so_lock));
381 		ASSERT(so->so_flag & SOLOCKED);
382 	}
383 
384 	/*
385 	 * Make sure that there is a preallocated unbind_req message
386 	 * before binding. This message is allocated when the socket is
387 	 * created but it might have been consumed.
388 	 */
389 	if (so->so_unbind_mp == NULL) {
390 		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
391 		/* NOTE: holding so_lock while sleeping */
392 		so->so_unbind_mp =
393 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP);
394 	}
395 
396 	if (flags & _SOBIND_REBIND) {
397 		/*
398 		 * Called from solisten after doing an sotpi_unbind() or
399 		 * potentially without the unbind (latter for AF_INET{,6}).
400 		 */
401 		ASSERT(name == NULL && namelen == 0);
402 
403 		if (so->so_family == AF_UNIX) {
404 			ASSERT(so->so_ux_bound_vp);
405 			addr = &so->so_ux_laddr;
406 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
407 			dprintso(so, 1,
408 			("sobind rebind UNIX: addrlen %d, addr 0x%p, vp %p\n",
409 			    addrlen,
410 			    ((struct so_ux_addr *)addr)->soua_vp,
411 			    so->so_ux_bound_vp));
412 		} else {
413 			addr = so->so_laddr_sa;
414 			addrlen = (t_uscalar_t)so->so_laddr_len;
415 		}
416 	} else if (flags & _SOBIND_UNSPEC) {
417 		ASSERT(name == NULL && namelen == 0);
418 
419 		/*
420 		 * The caller checked SS_ISBOUND but not necessarily
421 		 * under so_lock
422 		 */
423 		if (so->so_state & SS_ISBOUND) {
424 			/* No error */
425 			goto done;
426 		}
427 
428 		/* Set an initial local address */
429 		switch (so->so_family) {
430 		case AF_UNIX:
431 			/*
432 			 * Use an address with same size as struct sockaddr
433 			 * just like BSD.
434 			 */
435 			so->so_laddr_len =
436 				(socklen_t)sizeof (struct sockaddr);
437 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
438 			bzero(so->so_laddr_sa, so->so_laddr_len);
439 			so->so_laddr_sa->sa_family = so->so_family;
440 
441 			/*
442 			 * Pass down an address with the implicit bind
443 			 * magic number and the rest all zeros.
444 			 * The transport will return a unique address.
445 			 */
446 			so->so_ux_laddr.soua_vp = NULL;
447 			so->so_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
448 			addr = &so->so_ux_laddr;
449 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
450 			break;
451 
452 		case AF_INET:
453 		case AF_INET6:
454 			/*
455 			 * An unspecified bind in TPI has a NULL address.
456 			 * Set the address in sockfs to have the sa_family.
457 			 */
458 			so->so_laddr_len = (so->so_family == AF_INET) ?
459 			    (socklen_t)sizeof (sin_t) :
460 			    (socklen_t)sizeof (sin6_t);
461 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
462 			bzero(so->so_laddr_sa, so->so_laddr_len);
463 			so->so_laddr_sa->sa_family = so->so_family;
464 			addr = NULL;
465 			addrlen = 0;
466 			break;
467 
468 		default:
469 			/*
470 			 * An unspecified bind in TPI has a NULL address.
471 			 * Set the address in sockfs to be zero length.
472 			 *
473 			 * Can not assume there is a sa_family for all
474 			 * protocol families. For example, AF_X25 does not
475 			 * have a family field.
476 			 */
477 			bzero(so->so_laddr_sa, so->so_laddr_len);
478 			so->so_laddr_len = 0;	/* XXX correct? */
479 			addr = NULL;
480 			addrlen = 0;
481 			break;
482 		}
483 
484 	} else {
485 		if (so->so_state & SS_ISBOUND) {
486 			/*
487 			 * If it is ok to rebind the socket, first unbind
488 			 * with the transport. A rebind to the NULL address
489 			 * is interpreted as an unbind.
490 			 * Note that a bind to NULL in BSD does unbind the
491 			 * socket but it fails with EINVAL.
492 			 * Note that regular sockets set SOV_SOCKBSD i.e.
493 			 * _SOBIND_SOCKBSD gets set here hence no type of
494 			 * socket does currently allow rebinding.
495 			 *
496 			 * If the name is NULL just do an unbind.
497 			 */
498 			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
499 			    name != NULL) {
500 				error = EINVAL;
501 				unbind_on_err = 0;
502 				eprintsoline(so, error);
503 				goto done;
504 			}
505 			if ((so->so_mode & SM_CONNREQUIRED) &&
506 			    (so->so_state & SS_CANTREBIND)) {
507 				error = EINVAL;
508 				unbind_on_err = 0;
509 				eprintsoline(so, error);
510 				goto done;
511 			}
512 			error = sotpi_unbind(so, 0);
513 			if (error) {
514 				eprintsoline(so, error);
515 				goto done;
516 			}
517 			ASSERT(!(so->so_state & SS_ISBOUND));
518 			if (name == NULL) {
519 				so->so_state &=
520 					~(SS_ISCONNECTED|SS_ISCONNECTING);
521 				goto done;
522 			}
523 		}
524 		/* X/Open requires this check */
525 		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
526 			if (xnet_check_print) {
527 				printf("sockfs: X/Open bind state check "
528 				    "caused EINVAL\n");
529 			}
530 			error = EINVAL;
531 			goto done;
532 		}
533 
534 		switch (so->so_family) {
535 		case AF_UNIX:
536 			/*
537 			 * All AF_UNIX addresses are nul terminated
538 			 * when copied (copyin_name) in so the minimum
539 			 * length is 3 bytes.
540 			 */
541 			if (name == NULL ||
542 			    (ssize_t)namelen <= sizeof (short) + 1) {
543 				error = EISDIR;
544 				eprintsoline(so, error);
545 				goto done;
546 			}
547 			/*
548 			 * Verify so_family matches the bound family.
549 			 * BSD does not check this for AF_UNIX resulting
550 			 * in funny mknods.
551 			 */
552 			if (name->sa_family != so->so_family) {
553 				error = EAFNOSUPPORT;
554 				goto done;
555 			}
556 			break;
557 		case AF_INET:
558 			if (name == NULL) {
559 				error = EINVAL;
560 				eprintsoline(so, error);
561 				goto done;
562 			}
563 			if ((size_t)namelen != sizeof (sin_t)) {
564 				error = name->sa_family != so->so_family ?
565 				    EAFNOSUPPORT : EINVAL;
566 				eprintsoline(so, error);
567 				goto done;
568 			}
569 			if ((flags & _SOBIND_XPG4_2) &&
570 			    (name->sa_family != so->so_family)) {
571 				/*
572 				 * This check has to be made for X/Open
573 				 * sockets however application failures have
574 				 * been observed when it is applied to
575 				 * all sockets.
576 				 */
577 				error = EAFNOSUPPORT;
578 				eprintsoline(so, error);
579 				goto done;
580 			}
581 			/*
582 			 * Force a zero sa_family to match so_family.
583 			 *
584 			 * Some programs like inetd(1M) don't set the
585 			 * family field. Other programs leave
586 			 * sin_family set to garbage - SunOS 4.X does
587 			 * not check the family field on a bind.
588 			 * We use the family field that
589 			 * was passed in to the socket() call.
590 			 */
591 			name->sa_family = so->so_family;
592 			break;
593 
594 		case AF_INET6: {
595 #ifdef DEBUG
596 			sin6_t *sin6 = (sin6_t *)name;
597 #endif /* DEBUG */
598 
599 			if (name == NULL) {
600 				error = EINVAL;
601 				eprintsoline(so, error);
602 				goto done;
603 			}
604 			if ((size_t)namelen != sizeof (sin6_t)) {
605 				error = name->sa_family != so->so_family ?
606 				    EAFNOSUPPORT : EINVAL;
607 				eprintsoline(so, error);
608 				goto done;
609 			}
610 			if (name->sa_family != so->so_family) {
611 				/*
612 				 * With IPv6 we require the family to match
613 				 * unlike in IPv4.
614 				 */
615 				error = EAFNOSUPPORT;
616 				eprintsoline(so, error);
617 				goto done;
618 			}
619 #ifdef DEBUG
620 			/*
621 			 * Verify that apps don't forget to clear
622 			 * sin6_scope_id etc
623 			 */
624 			if (sin6->sin6_scope_id != 0 &&
625 			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
626 				zcmn_err(getzoneid(), CE_WARN,
627 				    "bind with uninitialized sin6_scope_id "
628 				    "(%d) on socket. Pid = %d\n",
629 				    (int)sin6->sin6_scope_id,
630 				    (int)curproc->p_pid);
631 			}
632 			if (sin6->__sin6_src_id != 0) {
633 				zcmn_err(getzoneid(), CE_WARN,
634 				    "bind with uninitialized __sin6_src_id "
635 				    "(%d) on socket. Pid = %d\n",
636 				    (int)sin6->__sin6_src_id,
637 				    (int)curproc->p_pid);
638 			}
639 #endif /* DEBUG */
640 			break;
641 		}
642 		default:
643 			/*
644 			 * Don't do any length or sa_family check to allow
645 			 * non-sockaddr style addresses.
646 			 */
647 			if (name == NULL) {
648 				error = EINVAL;
649 				eprintsoline(so, error);
650 				goto done;
651 			}
652 			break;
653 		}
654 
655 		if (namelen > (t_uscalar_t)so->so_laddr_maxlen) {
656 			error = ENAMETOOLONG;
657 			eprintsoline(so, error);
658 			goto done;
659 		}
660 		/*
661 		 * Save local address.
662 		 */
663 		so->so_laddr_len = (socklen_t)namelen;
664 		ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
665 		bcopy(name, so->so_laddr_sa, namelen);
666 
667 		addr = so->so_laddr_sa;
668 		addrlen = (t_uscalar_t)so->so_laddr_len;
669 		switch (so->so_family) {
670 		case AF_INET6:
671 		case AF_INET:
672 			break;
673 		case AF_UNIX: {
674 			struct sockaddr_un *soun =
675 				(struct sockaddr_un *)so->so_laddr_sa;
676 			struct vnode *vp;
677 			struct vattr vattr;
678 
679 			ASSERT(so->so_ux_bound_vp == NULL);
680 			/*
681 			 * Create vnode for the specified path name.
682 			 * Keep vnode held with a reference in so_ux_bound_vp.
683 			 * Use the vnode pointer as the address used in the
684 			 * bind with the transport.
685 			 *
686 			 * Use the same mode as in BSD. In particular this does
687 			 * not observe the umask.
688 			 */
689 			/* MAXPATHLEN + soun_family + nul termination */
690 			if (so->so_laddr_len >
691 			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
692 				error = ENAMETOOLONG;
693 				eprintsoline(so, error);
694 				goto done;
695 			}
696 			vattr.va_type = VSOCK;
697 			vattr.va_mode = 0777 & ~u.u_cmask;
698 			vattr.va_mask = AT_TYPE|AT_MODE;
699 			/* NOTE: holding so_lock */
700 			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
701 						EXCL, 0, &vp, CRMKNOD, 0, 0);
702 			if (error) {
703 				if (error == EEXIST)
704 					error = EADDRINUSE;
705 				eprintsoline(so, error);
706 				goto done;
707 			}
708 			/*
709 			 * Establish pointer from the underlying filesystem
710 			 * vnode to the socket node.
711 			 * so_ux_bound_vp and v_stream->sd_vnode form the
712 			 * cross-linkage between the underlying filesystem
713 			 * node and the socket node.
714 			 */
715 			ASSERT(SOTOV(so)->v_stream);
716 			mutex_enter(&vp->v_lock);
717 			vp->v_stream = SOTOV(so)->v_stream;
718 			so->so_ux_bound_vp = vp;
719 			mutex_exit(&vp->v_lock);
720 
721 			/*
722 			 * Use the vnode pointer value as a unique address
723 			 * (together with the magic number to avoid conflicts
724 			 * with implicit binds) in the transport provider.
725 			 */
726 			so->so_ux_laddr.soua_vp = (void *)so->so_ux_bound_vp;
727 			so->so_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
728 			addr = &so->so_ux_laddr;
729 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
730 			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
731 			    addrlen,
732 			    ((struct so_ux_addr *)addr)->soua_vp));
733 			break;
734 		}
735 		} /* end switch (so->so_family) */
736 	}
737 
738 	/*
739 	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
740 	 * the transport can start passing up T_CONN_IND messages
741 	 * as soon as it receives the bind req and strsock_proto()
742 	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
743 	 */
744 	if (flags & _SOBIND_LISTEN) {
745 		if ((so->so_state & SS_ACCEPTCONN) == 0)
746 			clear_acceptconn_on_err = B_TRUE;
747 		save_so_backlog = so->so_backlog;
748 		restore_backlog_on_err = B_TRUE;
749 		so->so_state |= SS_ACCEPTCONN;
750 		so->so_backlog = backlog;
751 	}
752 
753 	/*
754 	 * If NL7C addr(s) have been configured check for addr/port match,
755 	 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
756 	 *
757 	 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
758 	 * family sockets only. If match mark as such.
759 	 */
760 	if ((nl7c_enabled && addr != NULL &&
761 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
762 	    (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
763 	    so->so_nl7c_flags == NL7C_AF_NCA) {
764 		/*
765 		 * NL7C is not supported in non-global zones,
766 		 * we enforce this restriction here.
767 		 */
768 		if (so->so_zoneid == GLOBAL_ZONEID) {
769 			/* An NL7C socket, mark it */
770 			so->so_nl7c_flags |= NL7C_ENABLED;
771 		} else
772 			nl7c = NULL;
773 	}
774 	/*
775 	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
776 	 * for other transports we will send in a O_T_BIND_REQ.
777 	 */
778 	if (tcp_udp_xport &&
779 	    (so->so_family == AF_INET || so->so_family == AF_INET6))
780 		PRIM_type = T_BIND_REQ;
781 
782 	bind_req.PRIM_type = PRIM_type;
783 	bind_req.ADDR_length = addrlen;
784 	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
785 	bind_req.CONIND_number = backlog;
786 	/* NOTE: holding so_lock while sleeping */
787 	mp = soallocproto2(&bind_req, sizeof (bind_req),
788 				addr, addrlen, 0, _ALLOC_SLEEP);
789 	so->so_state &= ~SS_LADDR_VALID;
790 
791 	/* Done using so_laddr_sa - can drop the lock */
792 	mutex_exit(&so->so_lock);
793 
794 	/*
795 	 * Intercept the bind_req message here to check if this <address/port>
796 	 * was configured as an SSL proxy server, or if another endpoint was
797 	 * already configured to act as a proxy for us.
798 	 */
799 	if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
800 	    so->so_type == SOCK_STREAM) {
801 
802 		if (so->so_kssl_ent != NULL) {
803 			kssl_release_ent(so->so_kssl_ent, so, so->so_kssl_type);
804 			so->so_kssl_ent = NULL;
805 		}
806 
807 		so->so_kssl_type = kssl_check_proxy(mp, so, &so->so_kssl_ent);
808 		switch (so->so_kssl_type) {
809 		case KSSL_NO_PROXY:
810 			break;
811 
812 		case KSSL_HAS_PROXY:
813 			mutex_enter(&so->so_lock);
814 			goto skip_transport;
815 
816 		case KSSL_IS_PROXY:
817 			break;
818 		}
819 	}
820 
821 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
822 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
823 	if (error) {
824 		eprintsoline(so, error);
825 		mutex_enter(&so->so_lock);
826 		goto done;
827 	}
828 
829 	mutex_enter(&so->so_lock);
830 	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
831 	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
832 	if (error) {
833 		eprintsoline(so, error);
834 		goto done;
835 	}
836 skip_transport:
837 	ASSERT(mp);
838 	/*
839 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
840 	 * strsock_proto while the lock was dropped above, the bind
841 	 * is allowed to complete.
842 	 */
843 
844 	/* Mark as bound. This will be undone if we detect errors below. */
845 	if (flags & _SOBIND_NOXLATE) {
846 		ASSERT(so->so_family == AF_UNIX);
847 		so->so_state |= SS_FADDR_NOXLATE;
848 	}
849 	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
850 	so->so_state |= SS_ISBOUND;
851 	ASSERT(so->so_unbind_mp);
852 
853 	/* note that we've already set SS_ACCEPTCONN above */
854 
855 	/*
856 	 * Recompute addrlen - an unspecified bind sent down an
857 	 * address of length zero but we expect the appropriate length
858 	 * in return.
859 	 */
860 	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
861 	    sizeof (so->so_ux_laddr) : so->so_laddr_len);
862 
863 	bind_ack = (struct T_bind_ack *)mp->b_rptr;
864 	/*
865 	 * The alignment restriction is really too strict but
866 	 * we want enough alignment to inspect the fields of
867 	 * a sockaddr_in.
868 	 */
869 	addr = sogetoff(mp, bind_ack->ADDR_offset,
870 			bind_ack->ADDR_length,
871 			__TPI_ALIGN_SIZE);
872 	if (addr == NULL) {
873 		freemsg(mp);
874 		error = EPROTO;
875 		eprintsoline(so, error);
876 		goto done;
877 	}
878 	if (!(flags & _SOBIND_UNSPEC)) {
879 		/*
880 		 * Verify that the transport didn't return something we
881 		 * did not want e.g. an address other than what we asked for.
882 		 *
883 		 * NOTE: These checks would go away if/when we switch to
884 		 * using the new TPI (in which the transport would fail
885 		 * the request instead of assigning a different address).
886 		 *
887 		 * NOTE2: For protocols that we don't know (i.e. any
888 		 * other than AF_INET6, AF_INET and AF_UNIX), we
889 		 * cannot know if the transport should be expected to
890 		 * return the same address as that requested.
891 		 *
892 		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
893 		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
894 		 *
895 		 * For example, in the case of netatalk it may be
896 		 * inappropriate for the transport to return the
897 		 * requested address (as it may have allocated a local
898 		 * port number in behaviour similar to that of an
899 		 * AF_INET bind request with a port number of zero).
900 		 *
901 		 * Given the definition of O_T_BIND_REQ, where the
902 		 * transport may bind to an address other than the
903 		 * requested address, it's not possible to determine
904 		 * whether a returned address that differs from the
905 		 * requested address is a reason to fail (because the
906 		 * requested address was not available) or succeed
907 		 * (because the transport allocated an appropriate
908 		 * address and/or port).
909 		 *
910 		 * sockfs currently requires that the transport return
911 		 * the requested address in the T_BIND_ACK, unless
912 		 * there is code here to allow for any discrepancy.
913 		 * Such code exists for AF_INET and AF_INET6.
914 		 *
915 		 * Netatalk chooses to return the requested address
916 		 * rather than the (correct) allocated address.  This
917 		 * means that netatalk violates the TPI specification
918 		 * (and would not function correctly if used from a
919 		 * TLI application), but it does mean that it works
920 		 * with sockfs.
921 		 *
922 		 * As noted above, using the newer XTI bind primitive
923 		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
924 		 * allow sockfs to be more sure about whether or not
925 		 * the bind request had succeeded (as transports are
926 		 * not permitted to bind to a different address than
927 		 * that requested - they must return failure).
928 		 * Unfortunately, support for T_BIND_REQ may not be
929 		 * present in all transport implementations (netatalk,
930 		 * for example, doesn't have it), making the
931 		 * transition difficult.
932 		 */
933 		if (bind_ack->ADDR_length != addrlen) {
934 			/* Assumes that the requested address was in use */
935 			freemsg(mp);
936 			error = EADDRINUSE;
937 			eprintsoline(so, error);
938 			goto done;
939 		}
940 
941 		switch (so->so_family) {
942 		case AF_INET6:
943 		case AF_INET: {
944 			sin_t *rname, *aname;
945 
946 			rname = (sin_t *)addr;
947 			aname = (sin_t *)so->so_laddr_sa;
948 
949 			/*
950 			 * Take advantage of the alignment
951 			 * of sin_port and sin6_port which fall
952 			 * in the same place in their data structures.
953 			 * Just use sin_port for either address family.
954 			 *
955 			 * This may become a problem if (heaven forbid)
956 			 * there's a separate ipv6port_reserved... :-P
957 			 *
958 			 * Binding to port 0 has the semantics of letting
959 			 * the transport bind to any port.
960 			 *
961 			 * If the transport is TCP or UDP since we had sent
962 			 * a T_BIND_REQ we would not get a port other than
963 			 * what we asked for.
964 			 */
965 			if (tcp_udp_xport) {
966 				/*
967 				 * Pick up the new port number if we bound to
968 				 * port 0.
969 				 */
970 				if (aname->sin_port == 0)
971 					aname->sin_port = rname->sin_port;
972 				so->so_state |= SS_LADDR_VALID;
973 				break;
974 			}
975 			if (aname->sin_port != 0 &&
976 			    aname->sin_port != rname->sin_port) {
977 				freemsg(mp);
978 				error = EADDRINUSE;
979 				eprintsoline(so, error);
980 				goto done;
981 			}
982 			/*
983 			 * Pick up the new port number if we bound to port 0.
984 			 */
985 			aname->sin_port = rname->sin_port;
986 
987 			/*
988 			 * Unfortunately, addresses aren't _quite_ the same.
989 			 */
990 			if (so->so_family == AF_INET) {
991 				if (aname->sin_addr.s_addr !=
992 				    rname->sin_addr.s_addr) {
993 					freemsg(mp);
994 					error = EADDRNOTAVAIL;
995 					eprintsoline(so, error);
996 					goto done;
997 				}
998 			} else {
999 				sin6_t *rname6 = (sin6_t *)rname;
1000 				sin6_t *aname6 = (sin6_t *)aname;
1001 
1002 				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1003 				    &rname6->sin6_addr)) {
1004 					freemsg(mp);
1005 					error = EADDRNOTAVAIL;
1006 					eprintsoline(so, error);
1007 					goto done;
1008 				}
1009 			}
1010 			break;
1011 		}
1012 		case AF_UNIX:
1013 			if (bcmp(addr, &so->so_ux_laddr, addrlen) != 0) {
1014 				freemsg(mp);
1015 				error = EADDRINUSE;
1016 				eprintsoline(so, error);
1017 				eprintso(so,
1018 					("addrlen %d, addr 0x%x, vp %p\n",
1019 					addrlen, *((int *)addr),
1020 					so->so_ux_bound_vp));
1021 				goto done;
1022 			}
1023 			so->so_state |= SS_LADDR_VALID;
1024 			break;
1025 		default:
1026 			/*
1027 			 * NOTE: This assumes that addresses can be
1028 			 * byte-compared for equivalence.
1029 			 */
1030 			if (bcmp(addr, so->so_laddr_sa, addrlen) != 0) {
1031 				freemsg(mp);
1032 				error = EADDRINUSE;
1033 				eprintsoline(so, error);
1034 				goto done;
1035 			}
1036 			/*
1037 			 * Don't mark SS_LADDR_VALID, as we cannot be
1038 			 * sure that the returned address is the real
1039 			 * bound address when talking to an unknown
1040 			 * transport.
1041 			 */
1042 			break;
1043 		}
1044 	} else {
1045 		/*
1046 		 * Save for returned address for getsockname.
1047 		 * Needed for unspecific bind unless transport supports
1048 		 * the TI_GETMYNAME ioctl.
1049 		 * Do this for AF_INET{,6} even though they do, as
1050 		 * caching info here is much better performance than
1051 		 * a TPI/STREAMS trip to the transport for getsockname.
1052 		 * Any which can't for some reason _must_ _not_ set
1053 		 * LADDR_VALID here for the caching version of getsockname
1054 		 * to not break;
1055 		 */
1056 		switch (so->so_family) {
1057 		case AF_UNIX:
1058 			/*
1059 			 * Record the address bound with the transport
1060 			 * for use by socketpair.
1061 			 */
1062 			bcopy(addr, &so->so_ux_laddr, addrlen);
1063 			so->so_state |= SS_LADDR_VALID;
1064 			break;
1065 		case AF_INET:
1066 		case AF_INET6:
1067 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
1068 			bcopy(addr, so->so_laddr_sa, so->so_laddr_len);
1069 			so->so_state |= SS_LADDR_VALID;
1070 			break;
1071 		default:
1072 			/*
1073 			 * Don't mark SS_LADDR_VALID, as we cannot be
1074 			 * sure that the returned address is the real
1075 			 * bound address when talking to an unknown
1076 			 * transport.
1077 			 */
1078 			break;
1079 		}
1080 	}
1081 
1082 	if (nl7c == NULL && (so->so_nl7c_flags & NL7C_AF_NCA) &&
1083 	    (so->so_nl7c_flags & NL7C_ENABLED)) {
1084 		/*
1085 		 * Was an AF_NCA bind() so add it to the addr list for
1086 		 * reporting purposes.
1087 		 */
1088 		nl7c = nl7c_add_addr(addr, addrlen);
1089 	}
1090 	if (nl7c != NULL) {
1091 		nl7c_listener_addr(nl7c, strvp2wq(SOTOV(so)));
1092 	}
1093 
1094 	freemsg(mp);
1095 
1096 done:
1097 	if (error) {
1098 		/* reset state & backlog to values held on entry */
1099 		if (clear_acceptconn_on_err == B_TRUE)
1100 			so->so_state &= ~SS_ACCEPTCONN;
1101 		if (restore_backlog_on_err == B_TRUE)
1102 			so->so_backlog = save_so_backlog;
1103 
1104 		if (unbind_on_err && so->so_state & SS_ISBOUND) {
1105 			int err;
1106 
1107 			err = sotpi_unbind(so, 0);
1108 			/* LINTED - statement has no consequent: if */
1109 			if (err) {
1110 				eprintsoline(so, error);
1111 			} else {
1112 				ASSERT(!(so->so_state & SS_ISBOUND));
1113 			}
1114 		}
1115 	}
1116 	if (!(flags & _SOBIND_LOCK_HELD)) {
1117 		so_unlock_single(so, SOLOCKED);
1118 		mutex_exit(&so->so_lock);
1119 	} else {
1120 		/* If the caller held the lock don't release it here */
1121 		ASSERT(MUTEX_HELD(&so->so_lock));
1122 		ASSERT(so->so_flag & SOLOCKED);
1123 	}
1124 	return (error);
1125 }
1126 
1127 /* bind the socket */
1128 static int
1129 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1130     int flags)
1131 {
1132 	if ((flags & _SOBIND_SOCKETPAIR) == 0)
1133 		return (sotpi_bindlisten(so, name, namelen, 0, flags));
1134 
1135 	flags &= ~_SOBIND_SOCKETPAIR;
1136 	return (sotpi_bindlisten(so, name, namelen, 1, flags));
1137 }
1138 
1139 /*
1140  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1141  * address, or when listen needs to unbind and bind.
1142  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1143  * so that a sobind can pick them up.
1144  */
1145 static int
1146 sotpi_unbind(struct sonode *so, int flags)
1147 {
1148 	struct T_unbind_req	unbind_req;
1149 	int			error = 0;
1150 	mblk_t			*mp;
1151 
1152 	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1153 			so, flags, pr_state(so->so_state, so->so_mode)));
1154 
1155 	ASSERT(MUTEX_HELD(&so->so_lock));
1156 	ASSERT(so->so_flag & SOLOCKED);
1157 
1158 	if (!(so->so_state & SS_ISBOUND)) {
1159 		error = EINVAL;
1160 		eprintsoline(so, error);
1161 		goto done;
1162 	}
1163 
1164 	mutex_exit(&so->so_lock);
1165 
1166 	/*
1167 	 * Flush the read and write side (except stream head read queue)
1168 	 * and send down T_UNBIND_REQ.
1169 	 */
1170 	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1171 
1172 	unbind_req.PRIM_type = T_UNBIND_REQ;
1173 	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1174 	    0, _ALLOC_SLEEP);
1175 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1176 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1177 	mutex_enter(&so->so_lock);
1178 	if (error) {
1179 		eprintsoline(so, error);
1180 		goto done;
1181 	}
1182 
1183 	error = sowaitokack(so, T_UNBIND_REQ);
1184 	if (error) {
1185 		eprintsoline(so, error);
1186 		goto done;
1187 	}
1188 
1189 	/*
1190 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1191 	 * strsock_proto while the lock was dropped above, the unbind
1192 	 * is allowed to complete.
1193 	 */
1194 	if (!(flags & _SOUNBIND_REBIND)) {
1195 		/*
1196 		 * Clear out bound address.
1197 		 */
1198 		vnode_t *vp;
1199 
1200 		if ((vp = so->so_ux_bound_vp) != NULL) {
1201 
1202 			/* Undo any SSL proxy setup */
1203 			if ((so->so_family == AF_INET ||
1204 			    so->so_family == AF_INET6) &&
1205 			    (so->so_type == SOCK_STREAM) &&
1206 			    (so->so_kssl_ent != NULL)) {
1207 				kssl_release_ent(so->so_kssl_ent, so,
1208 				    so->so_kssl_type);
1209 				so->so_kssl_ent = NULL;
1210 				so->so_kssl_type = KSSL_NO_PROXY;
1211 			}
1212 
1213 			so->so_ux_bound_vp = NULL;
1214 			vn_rele_stream(vp);
1215 		}
1216 		/* Clear out address */
1217 		so->so_laddr_len = 0;
1218 	}
1219 	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN|SS_LADDR_VALID);
1220 done:
1221 
1222 	/* If the caller held the lock don't release it here */
1223 	ASSERT(MUTEX_HELD(&so->so_lock));
1224 	ASSERT(so->so_flag & SOLOCKED);
1225 
1226 	return (error);
1227 }
1228 
1229 /*
1230  * listen on the socket.
1231  * For TPI conforming transports this has to first unbind with the transport
1232  * and then bind again using the new backlog.
1233  */
1234 int
1235 sotpi_listen(struct sonode *so, int backlog)
1236 {
1237 	int		error = 0;
1238 
1239 	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1240 		so, backlog, pr_state(so->so_state, so->so_mode)));
1241 
1242 	if (so->so_serv_type == T_CLTS)
1243 		return (EOPNOTSUPP);
1244 
1245 	/*
1246 	 * If the socket is ready to accept connections already, then
1247 	 * return without doing anything.  This avoids a problem where
1248 	 * a second listen() call fails if a connection is pending and
1249 	 * leaves the socket unbound. Only when we are not unbinding
1250 	 * with the transport can we safely increase the backlog.
1251 	 */
1252 	if (so->so_state & SS_ACCEPTCONN &&
1253 	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1254 		/*CONSTCOND*/
1255 		!solisten_tpi_tcp))
1256 		return (0);
1257 
1258 	if (so->so_state & SS_ISCONNECTED)
1259 		return (EINVAL);
1260 
1261 	mutex_enter(&so->so_lock);
1262 	so_lock_single(so);	/* Set SOLOCKED */
1263 
1264 	if (backlog < 0)
1265 		backlog = 0;
1266 	/*
1267 	 * Use the same qlimit as in BSD. BSD checks the qlimit
1268 	 * before queuing the next connection implying that a
1269 	 * listen(sock, 0) allows one connection to be queued.
1270 	 * BSD also uses 1.5 times the requested backlog.
1271 	 *
1272 	 * XNS Issue 4 required a strict interpretation of the backlog.
1273 	 * This has been waived subsequently for Issue 4 and the change
1274 	 * incorporated in XNS Issue 5. So we aren't required to do
1275 	 * anything special for XPG apps.
1276 	 */
1277 	if (backlog >= (INT_MAX - 1) / 3)
1278 		backlog = INT_MAX;
1279 	else
1280 		backlog = backlog * 3 / 2 + 1;
1281 
1282 	/*
1283 	 * If the listen doesn't change the backlog we do nothing.
1284 	 * This avoids an EPROTO error from the transport.
1285 	 */
1286 	if ((so->so_state & SS_ACCEPTCONN) &&
1287 	    so->so_backlog == backlog)
1288 		goto done;
1289 
1290 	if (!(so->so_state & SS_ISBOUND)) {
1291 		/*
1292 		 * Must have been explicitly bound in the UNIX domain.
1293 		 */
1294 		if (so->so_family == AF_UNIX) {
1295 			error = EINVAL;
1296 			goto done;
1297 		}
1298 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1299 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN);
1300 	} else if (backlog > 0) {
1301 		/*
1302 		 * AF_INET{,6} hack to avoid losing the port.
1303 		 * Assumes that all AF_INET{,6} transports can handle a
1304 		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1305 		 * has already bound thus it is possible to avoid the unbind.
1306 		 */
1307 		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1308 		    /*CONSTCOND*/
1309 		    !solisten_tpi_tcp)) {
1310 			error = sotpi_unbind(so, _SOUNBIND_REBIND);
1311 			if (error)
1312 				goto done;
1313 		}
1314 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1315 			    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN);
1316 	} else {
1317 		so->so_state |= SS_ACCEPTCONN;
1318 		so->so_backlog = backlog;
1319 	}
1320 	if (error)
1321 		goto done;
1322 	ASSERT(so->so_state & SS_ACCEPTCONN);
1323 done:
1324 	so_unlock_single(so, SOLOCKED);
1325 	mutex_exit(&so->so_lock);
1326 	return (error);
1327 }
1328 
1329 /*
1330  * Disconnect either a specified seqno or all (-1).
1331  * The former is used on listening sockets only.
1332  *
1333  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1334  * the current use of sodisconnect(seqno == -1) is only for shutdown
1335  * so there is no point (and potentially incorrect) to unbind.
1336  */
1337 int
1338 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1339 {
1340 	struct T_discon_req	discon_req;
1341 	int			error = 0;
1342 	mblk_t			*mp;
1343 
1344 	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1345 			so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1346 
1347 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1348 		mutex_enter(&so->so_lock);
1349 		so_lock_single(so);	/* Set SOLOCKED */
1350 	} else {
1351 		ASSERT(MUTEX_HELD(&so->so_lock));
1352 		ASSERT(so->so_flag & SOLOCKED);
1353 	}
1354 
1355 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1356 		error = EINVAL;
1357 		eprintsoline(so, error);
1358 		goto done;
1359 	}
1360 
1361 	mutex_exit(&so->so_lock);
1362 	/*
1363 	 * Flush the write side (unless this is a listener)
1364 	 * and then send down a T_DISCON_REQ.
1365 	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1366 	 * and other messages.)
1367 	 */
1368 	if (!(so->so_state & SS_ACCEPTCONN))
1369 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1370 
1371 	discon_req.PRIM_type = T_DISCON_REQ;
1372 	discon_req.SEQ_number = seqno;
1373 	mp = soallocproto1(&discon_req, sizeof (discon_req),
1374 	    0, _ALLOC_SLEEP);
1375 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1376 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1377 	mutex_enter(&so->so_lock);
1378 	if (error) {
1379 		eprintsoline(so, error);
1380 		goto done;
1381 	}
1382 
1383 	error = sowaitokack(so, T_DISCON_REQ);
1384 	if (error) {
1385 		eprintsoline(so, error);
1386 		goto done;
1387 	}
1388 	/*
1389 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1390 	 * strsock_proto while the lock was dropped above, the disconnect
1391 	 * is allowed to complete. However, it is not possible to
1392 	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1393 	 */
1394 	so->so_state &=
1395 	    ~(SS_ISCONNECTED|SS_ISCONNECTING|SS_LADDR_VALID|SS_FADDR_VALID);
1396 done:
1397 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1398 		so_unlock_single(so, SOLOCKED);
1399 		mutex_exit(&so->so_lock);
1400 	} else {
1401 		/* If the caller held the lock don't release it here */
1402 		ASSERT(MUTEX_HELD(&so->so_lock));
1403 		ASSERT(so->so_flag & SOLOCKED);
1404 	}
1405 	return (error);
1406 }
1407 
1408 int
1409 sotpi_accept(struct sonode *so, int fflag, struct sonode **nsop)
1410 {
1411 	struct T_conn_ind	*conn_ind;
1412 	struct T_conn_res	*conn_res;
1413 	int			error = 0;
1414 	mblk_t			*mp, *ctxmp;
1415 	struct sonode		*nso;
1416 	vnode_t			*nvp;
1417 	void			*src;
1418 	t_uscalar_t		srclen;
1419 	void			*opt;
1420 	t_uscalar_t		optlen;
1421 	t_scalar_t		PRIM_type;
1422 	t_scalar_t		SEQ_number;
1423 
1424 	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1425 		so, fflag, nsop, pr_state(so->so_state, so->so_mode)));
1426 
1427 	/*
1428 	 * Defer single-threading the accepting socket until
1429 	 * the T_CONN_IND has been received and parsed and the
1430 	 * new sonode has been opened.
1431 	 */
1432 
1433 	/* Check that we are not already connected */
1434 	if ((so->so_state & SS_ACCEPTCONN) == 0)
1435 		goto conn_bad;
1436 again:
1437 	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1438 		goto e_bad;
1439 
1440 	ASSERT(mp);
1441 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1442 	ctxmp = mp->b_cont;
1443 
1444 	/*
1445 	 * Save SEQ_number for error paths.
1446 	 */
1447 	SEQ_number = conn_ind->SEQ_number;
1448 
1449 	srclen = conn_ind->SRC_length;
1450 	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1451 	if (src == NULL) {
1452 		error = EPROTO;
1453 		freemsg(mp);
1454 		eprintsoline(so, error);
1455 		goto disconnect_unlocked;
1456 	}
1457 	optlen = conn_ind->OPT_length;
1458 	switch (so->so_family) {
1459 	case AF_INET:
1460 	case AF_INET6:
1461 		if ((optlen == sizeof (intptr_t)) &&
1462 		    ((so->so_state & SS_DIRECT) != 0)) {
1463 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
1464 			    &opt, conn_ind->OPT_length);
1465 		} else {
1466 			/*
1467 			 * The transport (in this case TCP) hasn't sent up
1468 			 * a pointer to an instance for the accept fast-path.
1469 			 * Disable fast-path completely because the call to
1470 			 * sotpi_create() below would otherwise create an
1471 			 * incomplete TCP instance, which would lead to
1472 			 * problems when sockfs sends a normal T_CONN_RES
1473 			 * message down the new stream.
1474 			 */
1475 			if (so->so_state & SS_DIRECT) {
1476 				int rval;
1477 				/*
1478 				 * For consistency we inform tcp to disable
1479 				 * direct interface on the listener, though
1480 				 * we can certainly live without doing this
1481 				 * because no data will ever travel upstream
1482 				 * on the listening socket.
1483 				 */
1484 				so->so_state &= ~SS_DIRECT;
1485 				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1486 				    0, 0, K_TO_K, CRED(), &rval);
1487 			}
1488 			opt = NULL;
1489 			optlen = 0;
1490 		}
1491 		break;
1492 	case AF_UNIX:
1493 	default:
1494 		if (optlen != 0) {
1495 			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1496 			    __TPI_ALIGN_SIZE);
1497 			if (opt == NULL) {
1498 				error = EPROTO;
1499 				freemsg(mp);
1500 				eprintsoline(so, error);
1501 				goto disconnect_unlocked;
1502 			}
1503 		}
1504 		if (so->so_family == AF_UNIX) {
1505 			if (!(so->so_state & SS_FADDR_NOXLATE)) {
1506 				src = NULL;
1507 				srclen = 0;
1508 			}
1509 			/* Extract src address from options */
1510 			if (optlen != 0)
1511 				so_getopt_srcaddr(opt, optlen, &src, &srclen);
1512 		}
1513 		break;
1514 	}
1515 
1516 	/*
1517 	 * Create the new socket.
1518 	 */
1519 	VN_HOLD(so->so_accessvp);
1520 	nso = sotpi_create(so->so_accessvp, so->so_family, so->so_type,
1521 			so->so_protocol, so->so_version, so, &error);
1522 	if (nso == NULL) {
1523 		ASSERT(error != 0);
1524 		/*
1525 		 * Accept can not fail with ENOBUFS. sotpi_create
1526 		 * sleeps waiting for memory until a signal is caught
1527 		 * so return EINTR.
1528 		 */
1529 		freemsg(mp);
1530 		if (error == ENOBUFS)
1531 			error = EINTR;
1532 		goto e_disc_unl;
1533 	}
1534 	nvp = SOTOV(nso);
1535 
1536 	/*
1537 	 * If the transport sent up an SSL connection context, then attach
1538 	 * it the new socket, and set the (sd_wputdatafunc)() and
1539 	 * (sd_rputdatafunc)() stream head hooks to intercept and process
1540 	 * SSL records.
1541 	 */
1542 	if (ctxmp != NULL) {
1543 		/*
1544 		 * This kssl_ctx_t is already held for us by the transport.
1545 		 * So, we don't need to do a kssl_hold_ctx() here.
1546 		 */
1547 		nso->so_kssl_ctx = *((kssl_ctx_t *)ctxmp->b_rptr);
1548 		freemsg(ctxmp);
1549 		mp->b_cont = NULL;
1550 		strsetrwputdatahooks(nvp, strsock_kssl_input,
1551 		    strsock_kssl_output);
1552 	}
1553 #ifdef DEBUG
1554 	/*
1555 	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1556 	 * it's inherited early to allow debugging of the accept code itself.
1557 	 */
1558 	nso->so_options |= so->so_options & SO_DEBUG;
1559 #endif /* DEBUG */
1560 
1561 	/*
1562 	 * Save the SRC address from the T_CONN_IND
1563 	 * for getpeername to work on AF_UNIX and on transports that do not
1564 	 * support TI_GETPEERNAME.
1565 	 *
1566 	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1567 	 * copyin_name().
1568 	 */
1569 	if (srclen > (t_uscalar_t)nso->so_faddr_maxlen) {
1570 		error = EINVAL;
1571 		freemsg(mp);
1572 		eprintsoline(so, error);
1573 		goto disconnect_vp_unlocked;
1574 	}
1575 	nso->so_faddr_len = (socklen_t)srclen;
1576 	ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
1577 	bcopy(src, nso->so_faddr_sa, srclen);
1578 	nso->so_state |= SS_FADDR_VALID;
1579 
1580 	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1581 	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1582 		cred_t *cr;
1583 
1584 		if ((cr = DB_CRED(mp)) != NULL) {
1585 			crhold(cr);
1586 			nso->so_peercred = cr;
1587 			nso->so_cpid = DB_CPID(mp);
1588 		}
1589 		freemsg(mp);
1590 
1591 		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1592 		    sizeof (intptr_t), 0, _ALLOC_INTR);
1593 		if (mp == NULL) {
1594 			/*
1595 			 * Accept can not fail with ENOBUFS.
1596 			 * A signal was caught so return EINTR.
1597 			 */
1598 			error = EINTR;
1599 			eprintsoline(so, error);
1600 			goto disconnect_vp_unlocked;
1601 		}
1602 		conn_res = (struct T_conn_res *)mp->b_rptr;
1603 	} else {
1604 		nso->so_peercred = DB_CRED(mp);
1605 		nso->so_cpid = DB_CPID(mp);
1606 		DB_CRED(mp) = NULL;
1607 
1608 		mp->b_rptr = DB_BASE(mp);
1609 		conn_res = (struct T_conn_res *)mp->b_rptr;
1610 		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1611 	}
1612 
1613 	/*
1614 	 * New socket must be bound at least in sockfs and, except for AF_INET,
1615 	 * (or AF_INET6) it also has to be bound in the transport provider.
1616 	 * After accepting the connection on nso so_laddr_sa will be set to
1617 	 * contain the same address as the listener's local address
1618 	 * so the address we bind to isn't important.
1619 	 */
1620 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1621 	    /*CONSTCOND*/
1622 	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1623 		/*
1624 		 * Optimization for AF_INET{,6} transports
1625 		 * that can handle a T_CONN_RES without being bound.
1626 		 */
1627 		mutex_enter(&nso->so_lock);
1628 		so_automatic_bind(nso);
1629 		mutex_exit(&nso->so_lock);
1630 	} else {
1631 		/* Perform NULL bind with the transport provider. */
1632 		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC)) != 0) {
1633 			ASSERT(error != ENOBUFS);
1634 			freemsg(mp);
1635 			eprintsoline(nso, error);
1636 			goto disconnect_vp_unlocked;
1637 		}
1638 	}
1639 
1640 	/*
1641 	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1642 	 * so that any data arriving on the new socket will cause the
1643 	 * appropriate signals to be delivered for the new socket.
1644 	 *
1645 	 * No other thread (except strsock_proto and strsock_misc)
1646 	 * can access the new socket thus we relax the locking.
1647 	 */
1648 	nso->so_pgrp = so->so_pgrp;
1649 	nso->so_state |= so->so_state & (SS_ASYNC|SS_FADDR_NOXLATE);
1650 
1651 	if (nso->so_pgrp != 0) {
1652 		if ((error = so_set_events(nso, nvp, CRED())) != 0) {
1653 			eprintsoline(nso, error);
1654 			error = 0;
1655 			nso->so_pgrp = 0;
1656 		}
1657 	}
1658 
1659 	/*
1660 	 * Make note of the socket level options. TCP and IP level options
1661 	 * are already inherited. We could do all this after accept is
1662 	 * successful but doing it here simplifies code and no harm done
1663 	 * for error case.
1664 	 */
1665 	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1666 	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1667 	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1668 	nso->so_sndbuf = so->so_sndbuf;
1669 	nso->so_rcvbuf = so->so_rcvbuf;
1670 	if (nso->so_options & SO_LINGER)
1671 		nso->so_linger = so->so_linger;
1672 
1673 	if ((so->so_state & SS_DIRECT) != 0) {
1674 		mblk_t *ack_mp;
1675 
1676 		ASSERT(nso->so_state & SS_DIRECT);
1677 		ASSERT(opt != NULL);
1678 
1679 		conn_res->OPT_length = optlen;
1680 		conn_res->OPT_offset = MBLKL(mp);
1681 		bcopy(&opt, mp->b_wptr, optlen);
1682 		mp->b_wptr += optlen;
1683 		conn_res->PRIM_type = T_CONN_RES;
1684 		conn_res->ACCEPTOR_id = 0;
1685 		PRIM_type = T_CONN_RES;
1686 
1687 		/* Send down the T_CONN_RES on acceptor STREAM */
1688 		error = kstrputmsg(SOTOV(nso), mp, NULL,
1689 		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1690 		if (error) {
1691 			mutex_enter(&so->so_lock);
1692 			so_lock_single(so);
1693 			eprintsoline(so, error);
1694 			goto disconnect_vp;
1695 		}
1696 		mutex_enter(&nso->so_lock);
1697 		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1698 		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1699 		if (error) {
1700 			mutex_exit(&nso->so_lock);
1701 			mutex_enter(&so->so_lock);
1702 			so_lock_single(so);
1703 			eprintsoline(so, error);
1704 			goto disconnect_vp;
1705 		}
1706 		if (nso->so_family == AF_INET) {
1707 			sin_t *sin;
1708 
1709 			sin = (sin_t *)(ack_mp->b_rptr +
1710 			    sizeof (struct T_ok_ack));
1711 			bcopy(sin, nso->so_laddr_sa, sizeof (sin_t));
1712 			nso->so_laddr_len = sizeof (sin_t);
1713 		} else {
1714 			sin6_t *sin6;
1715 
1716 			sin6 = (sin6_t *)(ack_mp->b_rptr +
1717 			    sizeof (struct T_ok_ack));
1718 			bcopy(sin6, nso->so_laddr_sa, sizeof (sin6_t));
1719 			nso->so_laddr_len = sizeof (sin6_t);
1720 		}
1721 		freemsg(ack_mp);
1722 
1723 		nso->so_state |= SS_ISCONNECTED | SS_LADDR_VALID;
1724 		nso->so_priv = opt;
1725 
1726 		if (so->so_nl7c_flags & NL7C_ENABLED) {
1727 			/*
1728 			 * An NL7C marked listen()er so the new socket
1729 			 * inherits the listen()er's NL7C state.
1730 			 *
1731 			 * When calling NL7C to process the new socket
1732 			 * pass the nonblocking i/o state of the listen
1733 			 * socket as this is the context we are in.
1734 			 */
1735 			nso->so_nl7c_flags = so->so_nl7c_flags;
1736 			if (nl7c_process(nso,
1737 			    (nso->so_state & (SS_NONBLOCK|SS_NDELAY)),
1738 			    (int)((tcp_t *)nso->so_priv)->tcp_mss)) {
1739 				/*
1740 				 * NL7C has completed processing on the
1741 				 * socket, close the socket and back to
1742 				 * the top to await the next T_CONN_IND.
1743 				 */
1744 				mutex_exit(&nso->so_lock);
1745 				(void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
1746 						CRED());
1747 				VN_RELE(nvp);
1748 				goto again;
1749 			}
1750 			/* Pass the new socket out */
1751 		}
1752 
1753 		mutex_exit(&nso->so_lock);
1754 
1755 		/*
1756 		 * Pass out new socket.
1757 		 */
1758 		if (nsop != NULL)
1759 			*nsop = nso;
1760 
1761 		return (0);
1762 	}
1763 
1764 	/*
1765 	 * Copy local address from listener.
1766 	 */
1767 	nso->so_laddr_len = so->so_laddr_len;
1768 	ASSERT(nso->so_laddr_len <= nso->so_laddr_maxlen);
1769 	bcopy(so->so_laddr_sa, nso->so_laddr_sa, nso->so_laddr_len);
1770 	nso->so_state |= SS_LADDR_VALID;
1771 
1772 	/*
1773 	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
1774 	 * which don't support the FireEngine accept fast-path. It is also
1775 	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
1776 	 * again. Neither sockfs nor TCP attempt to find out if some other
1777 	 * random module has been inserted in between (in which case we
1778 	 * should follow TLI accept behaviour). We blindly assume the worst
1779 	 * case and revert back to old behaviour i.e. TCP will not send us
1780 	 * any option (eager) and the accept should happen on the listener
1781 	 * queue. Any queued T_conn_ind have already got their options removed
1782 	 * by so_sock2_stream() when "sockmod" was I_POP'd.
1783 	 */
1784 	/*
1785 	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
1786 	 */
1787 	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
1788 #ifdef	_ILP32
1789 		queue_t	*q;
1790 
1791 		/*
1792 		 * Find read queue in driver
1793 		 * Can safely do this since we "own" nso/nvp.
1794 		 */
1795 		q = strvp2wq(nvp)->q_next;
1796 		while (SAMESTR(q))
1797 			q = q->q_next;
1798 		q = RD(q);
1799 		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
1800 #else
1801 		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
1802 #endif	/* _ILP32 */
1803 		conn_res->PRIM_type = O_T_CONN_RES;
1804 		PRIM_type = O_T_CONN_RES;
1805 	} else {
1806 		conn_res->ACCEPTOR_id = nso->so_acceptor_id;
1807 		conn_res->PRIM_type = T_CONN_RES;
1808 		PRIM_type = T_CONN_RES;
1809 	}
1810 	conn_res->SEQ_number = SEQ_number;
1811 	conn_res->OPT_length = 0;
1812 	conn_res->OPT_offset = 0;
1813 
1814 	mutex_enter(&so->so_lock);
1815 	so_lock_single(so);	/* Set SOLOCKED */
1816 	mutex_exit(&so->so_lock);
1817 
1818 	error = kstrputmsg(SOTOV(so), mp, NULL,
1819 	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1820 	mutex_enter(&so->so_lock);
1821 	if (error) {
1822 		eprintsoline(so, error);
1823 		goto disconnect_vp;
1824 	}
1825 	error = sowaitokack(so, PRIM_type);
1826 	if (error) {
1827 		eprintsoline(so, error);
1828 		goto disconnect_vp;
1829 	}
1830 	so_unlock_single(so, SOLOCKED);
1831 	mutex_exit(&so->so_lock);
1832 
1833 	nso->so_state |= SS_ISCONNECTED;
1834 
1835 	/*
1836 	 * Pass out new socket.
1837 	 */
1838 	if (nsop != NULL)
1839 		*nsop = nso;
1840 
1841 	return (0);
1842 
1843 
1844 eproto_disc_unl:
1845 	error = EPROTO;
1846 e_disc_unl:
1847 	eprintsoline(so, error);
1848 	goto disconnect_unlocked;
1849 
1850 pr_disc_vp_unl:
1851 	eprintsoline(so, error);
1852 disconnect_vp_unlocked:
1853 	(void) VOP_CLOSE(nvp, 0, 1, 0, CRED());
1854 	VN_RELE(nvp);
1855 disconnect_unlocked:
1856 	(void) sodisconnect(so, SEQ_number, 0);
1857 	return (error);
1858 
1859 pr_disc_vp:
1860 	eprintsoline(so, error);
1861 disconnect_vp:
1862 	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
1863 	so_unlock_single(so, SOLOCKED);
1864 	mutex_exit(&so->so_lock);
1865 	(void) VOP_CLOSE(nvp, 0, 1, 0, CRED());
1866 	VN_RELE(nvp);
1867 	return (error);
1868 
1869 conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
1870 	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
1871 	    ? EOPNOTSUPP : EINVAL;
1872 e_bad:
1873 	eprintsoline(so, error);
1874 	return (error);
1875 }
1876 
1877 /*
1878  * connect a socket.
1879  *
1880  * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
1881  * unconnect (by specifying a null address).
1882  */
1883 int
1884 sotpi_connect(struct sonode *so,
1885 	const struct sockaddr *name,
1886 	socklen_t namelen,
1887 	int fflag,
1888 	int flags)
1889 {
1890 	struct T_conn_req	conn_req;
1891 	int			error = 0;
1892 	mblk_t			*mp;
1893 	void			*src;
1894 	socklen_t		srclen;
1895 	void			*addr;
1896 	socklen_t		addrlen;
1897 	boolean_t		need_unlock;
1898 
1899 	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
1900 		so, name, namelen, fflag, flags,
1901 		pr_state(so->so_state, so->so_mode)));
1902 
1903 	/*
1904 	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
1905 	 * avoid sleeping for memory with SOLOCKED held.
1906 	 * We know that the T_CONN_REQ can't be larger than 2 * so_faddr_maxlen
1907 	 * + sizeof (struct T_opthdr).
1908 	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
1909 	 * exceed so_faddr_maxlen).
1910 	 */
1911 	mp = soallocproto(sizeof (struct T_conn_req) +
1912 	    2 * so->so_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR);
1913 	if (mp == NULL) {
1914 		/*
1915 		 * Connect can not fail with ENOBUFS. A signal was
1916 		 * caught so return EINTR.
1917 		 */
1918 		error = EINTR;
1919 		eprintsoline(so, error);
1920 		return (error);
1921 	}
1922 
1923 	mutex_enter(&so->so_lock);
1924 	/*
1925 	 * Make sure that there is a preallocated unbind_req
1926 	 * message before any binding. This message allocated when
1927 	 * the socket is created  but it might be have been
1928 	 * consumed.
1929 	 */
1930 	if (so->so_unbind_mp == NULL) {
1931 		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
1932 		/* NOTE: holding so_lock while sleeping */
1933 		so->so_unbind_mp =
1934 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR);
1935 		if (so->so_unbind_mp == NULL) {
1936 			error = EINTR;
1937 			need_unlock = B_FALSE;
1938 			goto done;
1939 		}
1940 	}
1941 
1942 	so_lock_single(so);	/* Set SOLOCKED */
1943 	need_unlock = B_TRUE;
1944 
1945 	/*
1946 	 * Can't have done a listen before connecting.
1947 	 */
1948 	if (so->so_state & SS_ACCEPTCONN) {
1949 		error = EOPNOTSUPP;
1950 		goto done;
1951 	}
1952 
1953 	/*
1954 	 * Must be bound with the transport
1955 	 */
1956 	if (!(so->so_state & SS_ISBOUND)) {
1957 		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1958 		    /*CONSTCOND*/
1959 		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
1960 			/*
1961 			 * Optimization for AF_INET{,6} transports
1962 			 * that can handle a T_CONN_REQ without being bound.
1963 			 */
1964 			so_automatic_bind(so);
1965 		} else {
1966 			error = sotpi_bind(so, NULL, 0,
1967 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD);
1968 			if (error)
1969 				goto done;
1970 		}
1971 		ASSERT(so->so_state & SS_ISBOUND);
1972 		flags |= _SOCONNECT_DID_BIND;
1973 	}
1974 
1975 	/*
1976 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
1977 	 * connect to a null address. This is the portable method to
1978 	 * unconnect a socket.
1979 	 */
1980 	if ((namelen >= sizeof (sa_family_t)) &&
1981 	    (name->sa_family == AF_UNSPEC)) {
1982 		name = NULL;
1983 		namelen = 0;
1984 	}
1985 
1986 	/*
1987 	 * Check that we are not already connected.
1988 	 * A connection-oriented socket cannot be reconnected.
1989 	 * A connected connection-less socket can be
1990 	 * - connected to a different address by a subsequent connect
1991 	 * - "unconnected" by a connect to the NULL address
1992 	 */
1993 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
1994 		ASSERT(!(flags & _SOCONNECT_DID_BIND));
1995 		if (so->so_mode & SM_CONNREQUIRED) {
1996 			/* Connection-oriented socket */
1997 			error = so->so_state & SS_ISCONNECTED ?
1998 			    EISCONN : EALREADY;
1999 			goto done;
2000 		}
2001 		/* Connection-less socket */
2002 		if (name == NULL) {
2003 			/*
2004 			 * Remove the connected state and clear SO_DGRAM_ERRIND
2005 			 * since it was set when the socket was connected.
2006 			 * If this is UDP also send down a T_DISCON_REQ.
2007 			 */
2008 			int val;
2009 
2010 			if ((so->so_family == AF_INET ||
2011 				so->so_family == AF_INET6) &&
2012 			    (so->so_type == SOCK_DGRAM ||
2013 				so->so_type == SOCK_RAW) &&
2014 			    /*CONSTCOND*/
2015 			    !soconnect_tpi_udp) {
2016 				/* XXX What about implicitly unbinding here? */
2017 				error = sodisconnect(so, -1,
2018 						_SODISCONNECT_LOCK_HELD);
2019 			} else {
2020 				so->so_state &=
2021 				    ~(SS_ISCONNECTED | SS_ISCONNECTING |
2022 				    SS_FADDR_VALID);
2023 				so->so_faddr_len = 0;
2024 			}
2025 
2026 			so_unlock_single(so, SOLOCKED);
2027 			mutex_exit(&so->so_lock);
2028 
2029 			val = 0;
2030 			(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2031 					&val, (t_uscalar_t)sizeof (val));
2032 
2033 			mutex_enter(&so->so_lock);
2034 			so_lock_single(so);	/* Set SOLOCKED */
2035 			goto done;
2036 		}
2037 	}
2038 	ASSERT(so->so_state & SS_ISBOUND);
2039 
2040 	if (name == NULL || namelen == 0) {
2041 		error = EINVAL;
2042 		goto done;
2043 	}
2044 	/*
2045 	 * Mark the socket if so_faddr_sa represents the transport level
2046 	 * address.
2047 	 */
2048 	if (flags & _SOCONNECT_NOXLATE) {
2049 		struct sockaddr_ux	*soaddr_ux;
2050 
2051 		ASSERT(so->so_family == AF_UNIX);
2052 		if (namelen != sizeof (struct sockaddr_ux)) {
2053 			error = EINVAL;
2054 			goto done;
2055 		}
2056 		soaddr_ux = (struct sockaddr_ux *)name;
2057 		name = (struct sockaddr *)&soaddr_ux->sou_addr;
2058 		namelen = sizeof (soaddr_ux->sou_addr);
2059 		so->so_state |= SS_FADDR_NOXLATE;
2060 	}
2061 
2062 	/*
2063 	 * Length and family checks.
2064 	 */
2065 	error = so_addr_verify(so, name, namelen);
2066 	if (error)
2067 		goto bad;
2068 
2069 	/*
2070 	 * Save foreign address. Needed for AF_UNIX as well as
2071 	 * transport providers that do not support TI_GETPEERNAME.
2072 	 * Also used for cached foreign address for TCP and UDP.
2073 	 */
2074 	if (namelen > (t_uscalar_t)so->so_faddr_maxlen) {
2075 		error = EINVAL;
2076 		goto done;
2077 	}
2078 	so->so_faddr_len = (socklen_t)namelen;
2079 	ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
2080 	bcopy(name, so->so_faddr_sa, namelen);
2081 	so->so_state |= SS_FADDR_VALID;
2082 
2083 	if (so->so_family == AF_UNIX) {
2084 		if (so->so_state & SS_FADDR_NOXLATE) {
2085 			/*
2086 			 * Already have a transport internal address. Do not
2087 			 * pass any (transport internal) source address.
2088 			 */
2089 			addr = so->so_faddr_sa;
2090 			addrlen = (t_uscalar_t)so->so_faddr_len;
2091 			src = NULL;
2092 			srclen = 0;
2093 		} else {
2094 			/*
2095 			 * Pass the sockaddr_un source address as an option
2096 			 * and translate the remote address.
2097 			 * Holding so_lock thus so_laddr_sa can not change.
2098 			 */
2099 			src = so->so_laddr_sa;
2100 			srclen = (t_uscalar_t)so->so_laddr_len;
2101 			dprintso(so, 1,
2102 				("sotpi_connect UNIX: srclen %d, src %p\n",
2103 				srclen, src));
2104 			error = so_ux_addr_xlate(so,
2105 				so->so_faddr_sa, (socklen_t)so->so_faddr_len,
2106 				(flags & _SOCONNECT_XPG4_2),
2107 				&addr, &addrlen);
2108 			if (error)
2109 				goto bad;
2110 		}
2111 	} else {
2112 		addr = so->so_faddr_sa;
2113 		addrlen = (t_uscalar_t)so->so_faddr_len;
2114 		src = NULL;
2115 		srclen = 0;
2116 	}
2117 	/*
2118 	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2119 	 * option which asks the transport provider to send T_UDERR_IND
2120 	 * messages. These T_UDERR_IND messages are used to return connected
2121 	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2122 	 *
2123 	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2124 	 * we send down a T_CONN_REQ. This is needed to let the
2125 	 * transport assign a local address that is consistent with
2126 	 * the remote address. Applications depend on a getsockname()
2127 	 * after a connect() to retrieve the "source" IP address for
2128 	 * the connected socket.  Invalidate the cached local address
2129 	 * to force getsockname() to enquire of the transport.
2130 	 */
2131 	if (!(so->so_mode & SM_CONNREQUIRED)) {
2132 		/*
2133 		 * Datagram socket.
2134 		 */
2135 		int32_t val;
2136 
2137 		so_unlock_single(so, SOLOCKED);
2138 		mutex_exit(&so->so_lock);
2139 
2140 		val = 1;
2141 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2142 					&val, (t_uscalar_t)sizeof (val));
2143 
2144 		mutex_enter(&so->so_lock);
2145 		so_lock_single(so);	/* Set SOLOCKED */
2146 		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2147 		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2148 		    soconnect_tpi_udp) {
2149 			soisconnected(so);
2150 			goto done;
2151 		}
2152 		/*
2153 		 * Send down T_CONN_REQ etc.
2154 		 * Clear fflag to avoid returning EWOULDBLOCK.
2155 		 */
2156 		fflag = 0;
2157 		ASSERT(so->so_family != AF_UNIX);
2158 		so->so_state &= ~SS_LADDR_VALID;
2159 	} else if (so->so_laddr_len != 0) {
2160 		/*
2161 		 * If the local address or port was "any" then it may be
2162 		 * changed by the transport as a result of the
2163 		 * connect.  Invalidate the cached version if we have one.
2164 		 */
2165 		switch (so->so_family) {
2166 		case AF_INET:
2167 			ASSERT(so->so_laddr_len == (socklen_t)sizeof (sin_t));
2168 			if (((sin_t *)so->so_laddr_sa)->sin_addr.s_addr ==
2169 			    INADDR_ANY ||
2170 			    ((sin_t *)so->so_laddr_sa)->sin_port == 0)
2171 				so->so_state &= ~SS_LADDR_VALID;
2172 			break;
2173 
2174 		case AF_INET6:
2175 			ASSERT(so->so_laddr_len == (socklen_t)sizeof (sin6_t));
2176 			if (IN6_IS_ADDR_UNSPECIFIED(
2177 			    &((sin6_t *)so->so_laddr_sa) ->sin6_addr) ||
2178 			    IN6_IS_ADDR_V4MAPPED_ANY(
2179 			    &((sin6_t *)so->so_laddr_sa)->sin6_addr) ||
2180 			    ((sin6_t *)so->so_laddr_sa)->sin6_port == 0)
2181 				    so->so_state &= ~SS_LADDR_VALID;
2182 			break;
2183 
2184 		default:
2185 			break;
2186 		}
2187 	}
2188 
2189 	/*
2190 	 * Check for failure of an earlier call
2191 	 */
2192 	if (so->so_error != 0)
2193 		goto so_bad;
2194 
2195 	/*
2196 	 * Send down T_CONN_REQ. Message was allocated above.
2197 	 */
2198 	conn_req.PRIM_type = T_CONN_REQ;
2199 	conn_req.DEST_length = addrlen;
2200 	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2201 	if (srclen == 0) {
2202 		conn_req.OPT_length = 0;
2203 		conn_req.OPT_offset = 0;
2204 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2205 		soappendmsg(mp, addr, addrlen);
2206 	} else {
2207 		/*
2208 		 * There is a AF_UNIX sockaddr_un to include as a source
2209 		 * address option.
2210 		 */
2211 		struct T_opthdr toh;
2212 
2213 		toh.level = SOL_SOCKET;
2214 		toh.name = SO_SRCADDR;
2215 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2216 		toh.status = 0;
2217 		conn_req.OPT_length =
2218 			(t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2219 		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2220 			_TPI_ALIGN_TOPT(addrlen));
2221 
2222 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2223 		soappendmsg(mp, addr, addrlen);
2224 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2225 		soappendmsg(mp, &toh, sizeof (toh));
2226 		soappendmsg(mp, src, srclen);
2227 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2228 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2229 	}
2230 	/*
2231 	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2232 	 * in order to have the right state when the T_CONN_CON shows up.
2233 	 */
2234 	soisconnecting(so);
2235 	mutex_exit(&so->so_lock);
2236 
2237 #ifdef C2_AUDIT
2238 	if (audit_active)
2239 		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2240 #endif /* C2_AUDIT */
2241 
2242 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2243 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2244 	mp = NULL;
2245 	mutex_enter(&so->so_lock);
2246 	if (error != 0)
2247 		goto bad;
2248 
2249 	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2250 		goto bad;
2251 
2252 	/* Allow other threads to access the socket */
2253 	so_unlock_single(so, SOLOCKED);
2254 	need_unlock = B_FALSE;
2255 
2256 	/*
2257 	 * Wait until we get a T_CONN_CON or an error
2258 	 */
2259 	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2260 		so_lock_single(so);	/* Set SOLOCKED */
2261 		need_unlock = B_TRUE;
2262 	}
2263 
2264 done:
2265 	freemsg(mp);
2266 	switch (error) {
2267 	case EINPROGRESS:
2268 	case EALREADY:
2269 	case EISCONN:
2270 	case EINTR:
2271 		/* Non-fatal errors */
2272 		so->so_state &= ~SS_LADDR_VALID;
2273 		/* FALLTHRU */
2274 	case 0:
2275 		break;
2276 
2277 	case EHOSTUNREACH:
2278 		if (flags & _SOCONNECT_XPG4_2) {
2279 			/*
2280 			 * X/Open specification contains a requirement that
2281 			 * ENETUNREACH be returned but does not require
2282 			 * EHOSTUNREACH. In order to keep the test suite
2283 			 * happy we mess with the errno here.
2284 			 */
2285 			error = ENETUNREACH;
2286 		}
2287 		/* FALLTHRU */
2288 
2289 	default:
2290 		ASSERT(need_unlock);
2291 		/*
2292 		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2293 		 * and invalidate local-address cache
2294 		 */
2295 		so->so_state &= ~(SS_ISCONNECTING | SS_LADDR_VALID);
2296 		/* A discon_ind might have already unbound us */
2297 		if ((flags & _SOCONNECT_DID_BIND) &&
2298 		    (so->so_state & SS_ISBOUND)) {
2299 			int err;
2300 
2301 			err = sotpi_unbind(so, 0);
2302 			/* LINTED - statement has no conseq */
2303 			if (err) {
2304 				eprintsoline(so, err);
2305 			}
2306 		}
2307 		break;
2308 	}
2309 	if (need_unlock)
2310 		so_unlock_single(so, SOLOCKED);
2311 	mutex_exit(&so->so_lock);
2312 	return (error);
2313 
2314 so_bad:	error = sogeterr(so);
2315 bad:	eprintsoline(so, error);
2316 	goto done;
2317 }
2318 
2319 int
2320 sotpi_shutdown(struct sonode *so, int how)
2321 {
2322 	struct T_ordrel_req	ordrel_req;
2323 	mblk_t			*mp;
2324 	uint_t			old_state, state_change;
2325 	int			error = 0;
2326 
2327 	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2328 		so, how, pr_state(so->so_state, so->so_mode)));
2329 
2330 	mutex_enter(&so->so_lock);
2331 	so_lock_single(so);	/* Set SOLOCKED */
2332 
2333 	/*
2334 	 * SunOS 4.X has no check for datagram sockets.
2335 	 * 5.X checks that it is connected (ENOTCONN)
2336 	 * X/Open requires that we check the connected state.
2337 	 */
2338 	if (!(so->so_state & SS_ISCONNECTED)) {
2339 		if (!xnet_skip_checks) {
2340 			error = ENOTCONN;
2341 			if (xnet_check_print) {
2342 				printf("sockfs: X/Open shutdown check "
2343 					"caused ENOTCONN\n");
2344 			}
2345 		}
2346 		goto done;
2347 	}
2348 	/*
2349 	 * Record the current state and then perform any state changes.
2350 	 * Then use the difference between the old and new states to
2351 	 * determine which messages need to be sent.
2352 	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2353 	 * duplicate calls to shutdown().
2354 	 */
2355 	old_state = so->so_state;
2356 
2357 	switch (how) {
2358 	case 0:
2359 		socantrcvmore(so);
2360 		break;
2361 	case 1:
2362 		socantsendmore(so);
2363 		break;
2364 	case 2:
2365 		socantsendmore(so);
2366 		socantrcvmore(so);
2367 		break;
2368 	default:
2369 		error = EINVAL;
2370 		goto done;
2371 	}
2372 
2373 	/*
2374 	 * Assumes that the SS_CANT* flags are never cleared in the above code.
2375 	 */
2376 	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2377 		(old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2378 	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2379 
2380 	switch (state_change) {
2381 	case 0:
2382 		dprintso(so, 1,
2383 		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
2384 		    so->so_state));
2385 		goto done;
2386 
2387 	case SS_CANTRCVMORE:
2388 		mutex_exit(&so->so_lock);
2389 		strseteof(SOTOV(so), 1);
2390 		/*
2391 		 * strseteof takes care of read side wakeups,
2392 		 * pollwakeups, and signals.
2393 		 */
2394 		/*
2395 		 * Get the read lock before flushing data to avoid problems
2396 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2397 		 */
2398 		mutex_enter(&so->so_lock);
2399 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2400 		mutex_exit(&so->so_lock);
2401 
2402 		/* Flush read side queue */
2403 		strflushrq(SOTOV(so), FLUSHALL);
2404 
2405 		mutex_enter(&so->so_lock);
2406 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2407 		break;
2408 
2409 	case SS_CANTSENDMORE:
2410 		mutex_exit(&so->so_lock);
2411 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2412 		mutex_enter(&so->so_lock);
2413 		break;
2414 
2415 	case SS_CANTSENDMORE|SS_CANTRCVMORE:
2416 		mutex_exit(&so->so_lock);
2417 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2418 		strseteof(SOTOV(so), 1);
2419 		/*
2420 		 * strseteof takes care of read side wakeups,
2421 		 * pollwakeups, and signals.
2422 		 */
2423 		/*
2424 		 * Get the read lock before flushing data to avoid problems
2425 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2426 		 */
2427 		mutex_enter(&so->so_lock);
2428 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2429 		mutex_exit(&so->so_lock);
2430 
2431 		/* Flush read side queue */
2432 		strflushrq(SOTOV(so), FLUSHALL);
2433 
2434 		mutex_enter(&so->so_lock);
2435 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2436 		break;
2437 	}
2438 
2439 	ASSERT(MUTEX_HELD(&so->so_lock));
2440 
2441 	/*
2442 	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2443 	 * was set due to this call and the new state has both of them set:
2444 	 *	Send the AF_UNIX close indication
2445 	 *	For T_COTS send a discon_ind
2446 	 *
2447 	 * If cantsend was set due to this call:
2448 	 *	For T_COTSORD send an ordrel_ind
2449 	 *
2450 	 * Note that for T_CLTS there is no message sent here.
2451 	 */
2452 	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2453 	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2454 		/*
2455 		 * For SunOS 4.X compatibility we tell the other end
2456 		 * that we are unable to receive at this point.
2457 		 */
2458 		if (so->so_family == AF_UNIX && so->so_serv_type != T_CLTS)
2459 			so_unix_close(so);
2460 
2461 		if (so->so_serv_type == T_COTS)
2462 			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2463 	}
2464 	if ((state_change & SS_CANTSENDMORE) &&
2465 	    (so->so_serv_type == T_COTS_ORD)) {
2466 		/* Send an orderly release */
2467 		ordrel_req.PRIM_type = T_ORDREL_REQ;
2468 
2469 		mutex_exit(&so->so_lock);
2470 		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2471 		    0, _ALLOC_SLEEP);
2472 		/*
2473 		 * Send down the T_ORDREL_REQ even if there is flow control.
2474 		 * This prevents shutdown from blocking.
2475 		 * Note that there is no T_OK_ACK for ordrel_req.
2476 		 */
2477 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2478 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2479 		mutex_enter(&so->so_lock);
2480 		if (error) {
2481 			eprintsoline(so, error);
2482 			goto done;
2483 		}
2484 	}
2485 
2486 done:
2487 	so_unlock_single(so, SOLOCKED);
2488 	mutex_exit(&so->so_lock);
2489 	return (error);
2490 }
2491 
2492 /*
2493  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2494  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2495  * that we have closed.
2496  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2497  * T_UNITDATA_REQ containing the same option.
2498  *
2499  * For SOCK_DGRAM half-connections (somebody connected to this end
2500  * but this end is not connect) we don't know where to send any
2501  * SO_UNIX_CLOSE.
2502  *
2503  * We have to ignore stream head errors just in case there has been
2504  * a shutdown(output).
2505  * Ignore any flow control to try to get the message more quickly to the peer.
2506  * While locally ignoring flow control solves the problem when there
2507  * is only the loopback transport on the stream it would not provide
2508  * the correct AF_UNIX socket semantics when one or more modules have
2509  * been pushed.
2510  */
2511 void
2512 so_unix_close(struct sonode *so)
2513 {
2514 	int		error;
2515 	struct T_opthdr	toh;
2516 	mblk_t		*mp;
2517 
2518 	ASSERT(MUTEX_HELD(&so->so_lock));
2519 
2520 	ASSERT(so->so_family == AF_UNIX);
2521 
2522 	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2523 	    (SS_ISCONNECTED|SS_ISBOUND))
2524 		return;
2525 
2526 	dprintso(so, 1, ("so_unix_close(%p) %s\n",
2527 		so, pr_state(so->so_state, so->so_mode)));
2528 
2529 	toh.level = SOL_SOCKET;
2530 	toh.name = SO_UNIX_CLOSE;
2531 
2532 	/* zero length + header */
2533 	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2534 	toh.status = 0;
2535 
2536 	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2537 		struct T_optdata_req tdr;
2538 
2539 		tdr.PRIM_type = T_OPTDATA_REQ;
2540 		tdr.DATA_flag = 0;
2541 
2542 		tdr.OPT_length = (t_scalar_t)sizeof (toh);
2543 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2544 
2545 		/* NOTE: holding so_lock while sleeping */
2546 		mp = soallocproto2(&tdr, sizeof (tdr),
2547 		    &toh, sizeof (toh), 0, _ALLOC_SLEEP);
2548 	} else {
2549 		struct T_unitdata_req	tudr;
2550 		void			*addr;
2551 		socklen_t		addrlen;
2552 		void			*src;
2553 		socklen_t		srclen;
2554 		struct T_opthdr		toh2;
2555 		t_scalar_t		size;
2556 
2557 		/* Connecteded DGRAM socket */
2558 
2559 		/*
2560 		 * For AF_UNIX the destination address is translated to
2561 		 * an internal name and the source address is passed as
2562 		 * an option.
2563 		 */
2564 		/*
2565 		 * Length and family checks.
2566 		 */
2567 		error = so_addr_verify(so, so->so_faddr_sa,
2568 					(t_uscalar_t)so->so_faddr_len);
2569 		if (error) {
2570 			eprintsoline(so, error);
2571 			return;
2572 		}
2573 		if (so->so_state & SS_FADDR_NOXLATE) {
2574 			/*
2575 			 * Already have a transport internal address. Do not
2576 			 * pass any (transport internal) source address.
2577 			 */
2578 			addr = so->so_faddr_sa;
2579 			addrlen = (t_uscalar_t)so->so_faddr_len;
2580 			src = NULL;
2581 			srclen = 0;
2582 		} else {
2583 			/*
2584 			 * Pass the sockaddr_un source address as an option
2585 			 * and translate the remote address.
2586 			 * Holding so_lock thus so_laddr_sa can not change.
2587 			 */
2588 			src = so->so_laddr_sa;
2589 			srclen = (socklen_t)so->so_laddr_len;
2590 			dprintso(so, 1,
2591 				("so_ux_close: srclen %d, src %p\n",
2592 				srclen, src));
2593 			error = so_ux_addr_xlate(so,
2594 				so->so_faddr_sa,
2595 				(socklen_t)so->so_faddr_len, 0,
2596 				&addr, &addrlen);
2597 			if (error) {
2598 				eprintsoline(so, error);
2599 				return;
2600 			}
2601 		}
2602 		tudr.PRIM_type = T_UNITDATA_REQ;
2603 		tudr.DEST_length = addrlen;
2604 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2605 		if (srclen == 0) {
2606 			tudr.OPT_length = (t_scalar_t)sizeof (toh);
2607 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2608 				_TPI_ALIGN_TOPT(addrlen));
2609 
2610 			size = tudr.OPT_offset + tudr.OPT_length;
2611 			/* NOTE: holding so_lock while sleeping */
2612 			mp = soallocproto2(&tudr, sizeof (tudr),
2613 			    addr, addrlen, size, _ALLOC_SLEEP);
2614 			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2615 			soappendmsg(mp, &toh, sizeof (toh));
2616 		} else {
2617 			/*
2618 			 * There is a AF_UNIX sockaddr_un to include as a
2619 			 * source address option.
2620 			 */
2621 			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2622 			    _TPI_ALIGN_TOPT(srclen));
2623 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2624 			    _TPI_ALIGN_TOPT(addrlen));
2625 
2626 			toh2.level = SOL_SOCKET;
2627 			toh2.name = SO_SRCADDR;
2628 			toh2.len = (t_uscalar_t)(srclen +
2629 					sizeof (struct T_opthdr));
2630 			toh2.status = 0;
2631 
2632 			size = tudr.OPT_offset + tudr.OPT_length;
2633 
2634 			/* NOTE: holding so_lock while sleeping */
2635 			mp = soallocproto2(&tudr, sizeof (tudr),
2636 			    addr, addrlen, size, _ALLOC_SLEEP);
2637 			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2638 			soappendmsg(mp, &toh, sizeof (toh));
2639 			soappendmsg(mp, &toh2, sizeof (toh2));
2640 			soappendmsg(mp, src, srclen);
2641 			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2642 		}
2643 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2644 	}
2645 	mutex_exit(&so->so_lock);
2646 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2647 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2648 	mutex_enter(&so->so_lock);
2649 }
2650 
2651 /*
2652  * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
2653  */
2654 int
2655 sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags)
2656 {
2657 	mblk_t		*mp, *nmp;
2658 	int		error;
2659 
2660 	dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", so, msg, flags));
2661 
2662 	/*
2663 	 * There is never any oob data with addresses or control since
2664 	 * the T_EXDATA_IND does not carry any options.
2665 	 */
2666 	msg->msg_controllen = 0;
2667 	msg->msg_namelen = 0;
2668 
2669 	mutex_enter(&so->so_lock);
2670 	ASSERT(so_verify_oobstate(so));
2671 	if ((so->so_options & SO_OOBINLINE) ||
2672 	    (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
2673 		dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
2674 		mutex_exit(&so->so_lock);
2675 		return (EINVAL);
2676 	}
2677 	if (!(so->so_state & SS_HAVEOOBDATA)) {
2678 		dprintso(so, 1, ("sorecvoob: no data yet\n"));
2679 		mutex_exit(&so->so_lock);
2680 		return (EWOULDBLOCK);
2681 	}
2682 	ASSERT(so->so_oobmsg != NULL);
2683 	mp = so->so_oobmsg;
2684 	if (flags & MSG_PEEK) {
2685 		/*
2686 		 * Since recv* can not return ENOBUFS we can not use dupmsg.
2687 		 * Instead we revert to the consolidation private
2688 		 * allocb_wait plus bcopy.
2689 		 */
2690 		mblk_t *mp1;
2691 
2692 		mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
2693 		ASSERT(mp1);
2694 
2695 		while (mp != NULL) {
2696 			ssize_t size;
2697 
2698 			size = MBLKL(mp);
2699 			bcopy(mp->b_rptr, mp1->b_wptr, size);
2700 			mp1->b_wptr += size;
2701 			ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
2702 			mp = mp->b_cont;
2703 		}
2704 		mp = mp1;
2705 	} else {
2706 		/*
2707 		 * Update the state indicating that the data has been consumed.
2708 		 * Keep SS_OOBPEND set until data is consumed past the mark.
2709 		 */
2710 		so->so_oobmsg = NULL;
2711 		so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
2712 	}
2713 	dprintso(so, 1,
2714 		("after recvoob(%p): counts %d/%d state %s\n",
2715 		so, so->so_oobsigcnt,
2716 		so->so_oobcnt, pr_state(so->so_state, so->so_mode)));
2717 	ASSERT(so_verify_oobstate(so));
2718 	mutex_exit(&so->so_lock);
2719 
2720 	error = 0;
2721 	nmp = mp;
2722 	while (nmp != NULL && uiop->uio_resid > 0) {
2723 		ssize_t n = MBLKL(nmp);
2724 
2725 		n = MIN(n, uiop->uio_resid);
2726 		if (n > 0)
2727 			error = uiomove(nmp->b_rptr, n,
2728 					UIO_READ, uiop);
2729 		if (error)
2730 			break;
2731 		nmp = nmp->b_cont;
2732 	}
2733 	freemsg(mp);
2734 	return (error);
2735 }
2736 
2737 /*
2738  * Called by sotpi_recvmsg when reading a non-zero amount of data.
2739  * In addition, the caller typically verifies that there is some
2740  * potential state to clear by checking
2741  *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2742  * before calling this routine.
2743  * Note that such a check can be made without holding so_lock since
2744  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2745  * decrements so_oobsigcnt.
2746  *
2747  * When data is read *after* the point that all pending
2748  * oob data has been consumed the oob indication is cleared.
2749  *
2750  * This logic keeps select/poll returning POLLRDBAND and
2751  * SIOCATMARK returning true until we have read past
2752  * the mark.
2753  */
2754 static void
2755 sorecv_update_oobstate(struct sonode *so)
2756 {
2757 	mutex_enter(&so->so_lock);
2758 	ASSERT(so_verify_oobstate(so));
2759 	dprintso(so, 1,
2760 		("sorecv_update_oobstate: counts %d/%d state %s\n",
2761 		so->so_oobsigcnt,
2762 		so->so_oobcnt, pr_state(so->so_state, so->so_mode)));
2763 	if (so->so_oobsigcnt == 0) {
2764 		/* No more pending oob indications */
2765 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2766 		freemsg(so->so_oobmsg);
2767 		so->so_oobmsg = NULL;
2768 	}
2769 	ASSERT(so_verify_oobstate(so));
2770 	mutex_exit(&so->so_lock);
2771 }
2772 
2773 /*
2774  * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
2775  */
2776 static int
2777 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
2778 {
2779 	int	error = 0;
2780 	mblk_t *tmp = NULL;
2781 	mblk_t *pmp = NULL;
2782 	mblk_t *nmp = so->so_nl7c_rcv_mp;
2783 
2784 	ASSERT(nmp != NULL);
2785 
2786 	while (nmp != NULL && uiop->uio_resid > 0) {
2787 		ssize_t n;
2788 
2789 		if (DB_TYPE(nmp) == M_DATA) {
2790 			/*
2791 			 * We have some data, uiomove up to resid bytes.
2792 			 */
2793 			n = MIN(MBLKL(nmp), uiop->uio_resid);
2794 			if (n > 0)
2795 				error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
2796 			if (error)
2797 				break;
2798 			nmp->b_rptr += n;
2799 			if (nmp->b_rptr == nmp->b_wptr) {
2800 				pmp = nmp;
2801 				nmp = nmp->b_cont;
2802 			}
2803 		} else {
2804 			/*
2805 			 * We only handle data, save for caller to handle.
2806 			 */
2807 			if (pmp != NULL) {
2808 				pmp->b_cont = nmp->b_cont;
2809 			}
2810 			nmp->b_cont = NULL;
2811 			if (*rmp == NULL) {
2812 				*rmp = nmp;
2813 			} else {
2814 				tmp->b_next = nmp;
2815 			}
2816 			nmp = nmp->b_cont;
2817 			tmp = nmp;
2818 		}
2819 	}
2820 	if (pmp != NULL) {
2821 		/* Free any mblk_t(s) which we have consumed */
2822 		pmp->b_cont = NULL;
2823 		freemsg(so->so_nl7c_rcv_mp);
2824 	}
2825 	if ((so->so_nl7c_rcv_mp = nmp) == NULL) {
2826 		/* Last mblk_t so return the saved rval from kstrgetmsg() */
2827 		rp->r_vals = so->so_nl7c_rcv_rval;
2828 		so->so_nl7c_rcv_rval = 0;
2829 	} else {
2830 		/* More mblk_t(s) to process so no rval to return */
2831 		rp->r_vals = 0;
2832 	}
2833 	return (error);
2834 }
2835 
2836 /*
2837  * Receive the next message on the queue.
2838  * If msg_controllen is non-zero when called the caller is interested in
2839  * any received control info (options).
2840  * If msg_namelen is non-zero when called the caller is interested in
2841  * any received source address.
2842  * The routine returns with msg_control and msg_name pointing to
2843  * kmem_alloc'ed memory which the caller has to free.
2844  */
2845 int
2846 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
2847 {
2848 	union T_primitives	*tpr;
2849 	mblk_t			*mp;
2850 	uchar_t			pri;
2851 	int			pflag, opflag;
2852 	void			*control;
2853 	t_uscalar_t		controllen;
2854 	t_uscalar_t		namelen;
2855 	int			so_state = so->so_state; /* Snapshot */
2856 	ssize_t			saved_resid;
2857 	int			error;
2858 	rval_t			rval;
2859 	int			flags;
2860 	clock_t			timout;
2861 	int			first;
2862 
2863 	flags = msg->msg_flags;
2864 	msg->msg_flags = 0;
2865 
2866 	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
2867 		so, msg, flags,
2868 		pr_state(so->so_state, so->so_mode), so->so_error));
2869 
2870 	/*
2871 	 * If we are not connected because we have never been connected
2872 	 * we return ENOTCONN. If we have been connected (but are no longer
2873 	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
2874 	 * the EOF.
2875 	 *
2876 	 * An alternative would be to post an ENOTCONN error in stream head
2877 	 * (read+write) and clear it when we're connected. However, that error
2878 	 * would cause incorrect poll/select behavior!
2879 	 */
2880 	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
2881 	    (so->so_mode & SM_CONNREQUIRED)) {
2882 		return (ENOTCONN);
2883 	}
2884 
2885 	/*
2886 	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
2887 	 * after checking that the read queue is empty) and returns zero.
2888 	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
2889 	 * is zero.
2890 	 */
2891 
2892 	if (flags & MSG_OOB) {
2893 		/* Check that the transport supports OOB */
2894 		if (!(so->so_mode & SM_EXDATA))
2895 			return (EOPNOTSUPP);
2896 		return (sorecvoob(so, msg, uiop, flags));
2897 	}
2898 
2899 	/*
2900 	 * Set msg_controllen and msg_namelen to zero here to make it
2901 	 * simpler in the cases that no control or name is returned.
2902 	 */
2903 	controllen = msg->msg_controllen;
2904 	namelen = msg->msg_namelen;
2905 	msg->msg_controllen = 0;
2906 	msg->msg_namelen = 0;
2907 
2908 	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
2909 		namelen, controllen));
2910 
2911 	/*
2912 	 * If an NL7C enabled socket and not waiting for write data.
2913 	 */
2914 	mutex_enter(&so->so_lock);
2915 	if ((so->so_nl7c_flags & (NL7C_ENABLED|NL7C_WAITWRITE)) ==
2916 	    NL7C_ENABLED) {
2917 		if (so->so_nl7c_uri) {
2918 			/*
2919 			 * Close uri processing for a previous request.
2920 			 */
2921 			nl7c_close(so);
2922 		}
2923 		if (nl7c_process(so,
2924 		    (so->so_state & (SS_NONBLOCK|SS_NDELAY)),
2925 		    (int)((tcp_t *)so->so_priv)->tcp_mss)) {
2926 			/*
2927 			 * NL7C has completed processing on the socket,
2928 			 * clear the enabled bit as no further NL7C
2929 			 * processing will be needed.
2930 			 */
2931 			so->so_nl7c_flags = 0;
2932 		}
2933 	}
2934 
2935 	/*
2936 	 * Only one reader is allowed at any given time. This is needed
2937 	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
2938 	 *
2939 	 * This is slightly different than BSD behavior in that it fails with
2940 	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
2941 	 * is single-threaded using sblock(), which is dropped while waiting
2942 	 * for data to appear. The difference shows up e.g. if one
2943 	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
2944 	 * does use nonblocking io and different threads are reading each
2945 	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
2946 	 * in this case as long as the read queue doesn't get empty.
2947 	 * In this implementation the thread using nonblocking io can
2948 	 * get an EWOULDBLOCK error due to the blocking thread executing
2949 	 * e.g. in the uiomove in kstrgetmsg.
2950 	 * This difference is not believed to be significant.
2951 	 */
2952 	error = so_lock_read_intr(so, uiop->uio_fmode);	/* Set SOREADLOCKED */
2953 	mutex_exit(&so->so_lock);
2954 	if (error)
2955 		return (error);
2956 
2957 	/*
2958 	 * Tell kstrgetmsg to not inspect the stream head errors until all
2959 	 * queued data has been consumed.
2960 	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
2961 	 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
2962 	 *
2963 	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
2964 	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
2965 	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
2966 	 */
2967 	pflag = MSG_ANY | MSG_DELAYERROR;
2968 	if (flags & MSG_PEEK) {
2969 		pflag |= MSG_IPEEK;
2970 		flags &= ~MSG_WAITALL;
2971 	}
2972 	if (so->so_mode & SM_ATOMIC)
2973 		pflag |= MSG_DISCARDTAIL;
2974 
2975 	if (flags & MSG_DONTWAIT)
2976 		timout = 0;
2977 	else
2978 		timout = -1;
2979 	opflag = pflag;
2980 	first = 1;
2981 
2982 	/*
2983 	 * If so saved NL7C rcv mblk_t(s) uiomove them first
2984 	 * else get'm from the streamhead.
2985 	 */
2986 retry:
2987 	saved_resid = uiop->uio_resid;
2988 	pri = 0;
2989 	mp = NULL;
2990 	if (so->so_nl7c_rcv_mp != NULL) {
2991 		error = nl7c_sorecv(so, &mp, uiop, &rval);
2992 	} else {
2993 		error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
2994 		    timout, &rval);
2995 	}
2996 	if (error) {
2997 		switch (error) {
2998 		case EINTR:
2999 		case EWOULDBLOCK:
3000 			if (!first)
3001 				error = 0;
3002 			break;
3003 		case ETIME:
3004 			/* Returned from kstrgetmsg when timeout expires */
3005 			if (!first)
3006 				error = 0;
3007 			else
3008 				error = EWOULDBLOCK;
3009 			break;
3010 		default:
3011 			eprintsoline(so, error);
3012 			break;
3013 		}
3014 		mutex_enter(&so->so_lock);
3015 		so_unlock_read(so);	/* Clear SOREADLOCKED */
3016 		mutex_exit(&so->so_lock);
3017 		return (error);
3018 	}
3019 	/*
3020 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3021 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
3022 	 */
3023 	ASSERT(!(rval.r_val1 & MORECTL));
3024 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3025 		msg->msg_flags |= MSG_TRUNC;
3026 
3027 	if (mp == NULL) {
3028 		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3029 		/*
3030 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3031 		 * The draft Posix socket spec states that the mark should
3032 		 * not be cleared when peeking. We follow the latter.
3033 		 */
3034 		if ((so->so_state &
3035 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3036 		    (uiop->uio_resid != saved_resid) &&
3037 		    !(flags & MSG_PEEK)) {
3038 			sorecv_update_oobstate(so);
3039 		}
3040 
3041 		mutex_enter(&so->so_lock);
3042 		/* Set MSG_EOR based on MOREDATA */
3043 		if (!(rval.r_val1 & MOREDATA)) {
3044 			if (so->so_state & SS_SAVEDEOR) {
3045 				msg->msg_flags |= MSG_EOR;
3046 				so->so_state &= ~SS_SAVEDEOR;
3047 			}
3048 		}
3049 		/*
3050 		 * If some data was received (i.e. not EOF) and the
3051 		 * read/recv* has not been satisfied wait for some more.
3052 		 */
3053 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3054 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3055 			mutex_exit(&so->so_lock);
3056 			first = 0;
3057 			pflag = opflag | MSG_NOMARK;
3058 			goto retry;
3059 		}
3060 		so_unlock_read(so);	/* Clear SOREADLOCKED */
3061 		mutex_exit(&so->so_lock);
3062 		return (0);
3063 	}
3064 
3065 	/* strsock_proto has already verified length and alignment */
3066 	tpr = (union T_primitives *)mp->b_rptr;
3067 	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3068 
3069 	switch (tpr->type) {
3070 	case T_DATA_IND: {
3071 		if ((so->so_state &
3072 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3073 		    (uiop->uio_resid != saved_resid) &&
3074 		    !(flags & MSG_PEEK)) {
3075 			sorecv_update_oobstate(so);
3076 		}
3077 
3078 		/*
3079 		 * Set msg_flags to MSG_EOR based on
3080 		 * MORE_flag and MOREDATA.
3081 		 */
3082 		mutex_enter(&so->so_lock);
3083 		so->so_state &= ~SS_SAVEDEOR;
3084 		if (!(tpr->data_ind.MORE_flag & 1)) {
3085 			if (!(rval.r_val1 & MOREDATA))
3086 				msg->msg_flags |= MSG_EOR;
3087 			else
3088 				so->so_state |= SS_SAVEDEOR;
3089 		}
3090 		freemsg(mp);
3091 		/*
3092 		 * If some data was received (i.e. not EOF) and the
3093 		 * read/recv* has not been satisfied wait for some more.
3094 		 */
3095 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3096 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3097 			mutex_exit(&so->so_lock);
3098 			first = 0;
3099 			pflag = opflag | MSG_NOMARK;
3100 			goto retry;
3101 		}
3102 		so_unlock_read(so);	/* Clear SOREADLOCKED */
3103 		mutex_exit(&so->so_lock);
3104 		return (0);
3105 	}
3106 	case T_UNITDATA_IND: {
3107 		void *addr;
3108 		t_uscalar_t addrlen;
3109 		void *abuf;
3110 		t_uscalar_t optlen;
3111 		void *opt;
3112 
3113 		if ((so->so_state &
3114 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3115 		    (uiop->uio_resid != saved_resid) &&
3116 		    !(flags & MSG_PEEK)) {
3117 			sorecv_update_oobstate(so);
3118 		}
3119 
3120 		if (namelen != 0) {
3121 			/* Caller wants source address */
3122 			addrlen = tpr->unitdata_ind.SRC_length;
3123 			addr = sogetoff(mp,
3124 				tpr->unitdata_ind.SRC_offset,
3125 				addrlen, 1);
3126 			if (addr == NULL) {
3127 				freemsg(mp);
3128 				error = EPROTO;
3129 				eprintsoline(so, error);
3130 				goto err;
3131 			}
3132 			if (so->so_family == AF_UNIX) {
3133 				/*
3134 				 * Can not use the transport level address.
3135 				 * If there is a SO_SRCADDR option carrying
3136 				 * the socket level address it will be
3137 				 * extracted below.
3138 				 */
3139 				addr = NULL;
3140 				addrlen = 0;
3141 			}
3142 		}
3143 		optlen = tpr->unitdata_ind.OPT_length;
3144 		if (optlen != 0) {
3145 			t_uscalar_t ncontrollen;
3146 
3147 			/*
3148 			 * Extract any source address option.
3149 			 * Determine how large cmsg buffer is needed.
3150 			 */
3151 			opt = sogetoff(mp,
3152 				tpr->unitdata_ind.OPT_offset,
3153 				optlen, __TPI_ALIGN_SIZE);
3154 
3155 			if (opt == NULL) {
3156 				freemsg(mp);
3157 				error = EPROTO;
3158 				eprintsoline(so, error);
3159 				goto err;
3160 			}
3161 			if (so->so_family == AF_UNIX)
3162 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3163 			ncontrollen = so_cmsglen(mp, opt, optlen,
3164 						!(flags & MSG_XPG4_2));
3165 			if (controllen != 0)
3166 				controllen = ncontrollen;
3167 			else if (ncontrollen != 0)
3168 				msg->msg_flags |= MSG_CTRUNC;
3169 		} else {
3170 			controllen = 0;
3171 		}
3172 
3173 		if (namelen != 0) {
3174 			/*
3175 			 * Return address to caller.
3176 			 * Caller handles truncation if length
3177 			 * exceeds msg_namelen.
3178 			 * NOTE: AF_UNIX NUL termination is ensured by
3179 			 * the sender's copyin_name().
3180 			 */
3181 			abuf = kmem_alloc(addrlen, KM_SLEEP);
3182 
3183 			bcopy(addr, abuf, addrlen);
3184 			msg->msg_name = abuf;
3185 			msg->msg_namelen = addrlen;
3186 		}
3187 
3188 		if (controllen != 0) {
3189 			/*
3190 			 * Return control msg to caller.
3191 			 * Caller handles truncation if length
3192 			 * exceeds msg_controllen.
3193 			 */
3194 			control = kmem_alloc(controllen, KM_SLEEP);
3195 
3196 			error = so_opt2cmsg(mp, opt, optlen,
3197 					!(flags & MSG_XPG4_2),
3198 					control, controllen);
3199 			if (error) {
3200 				freemsg(mp);
3201 				if (msg->msg_namelen != 0)
3202 					kmem_free(msg->msg_name,
3203 						msg->msg_namelen);
3204 				kmem_free(control, controllen);
3205 				eprintsoline(so, error);
3206 				goto err;
3207 			}
3208 			msg->msg_control = control;
3209 			msg->msg_controllen = controllen;
3210 		}
3211 
3212 		freemsg(mp);
3213 		mutex_enter(&so->so_lock);
3214 		so_unlock_read(so);	/* Clear SOREADLOCKED */
3215 		mutex_exit(&so->so_lock);
3216 		return (0);
3217 	}
3218 	case T_OPTDATA_IND: {
3219 		struct T_optdata_req *tdr;
3220 		void *opt;
3221 		t_uscalar_t optlen;
3222 
3223 		if ((so->so_state &
3224 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3225 		    (uiop->uio_resid != saved_resid) &&
3226 		    !(flags & MSG_PEEK)) {
3227 			sorecv_update_oobstate(so);
3228 		}
3229 
3230 		tdr = (struct T_optdata_req *)mp->b_rptr;
3231 		optlen = tdr->OPT_length;
3232 		if (optlen != 0) {
3233 			t_uscalar_t ncontrollen;
3234 			/*
3235 			 * Determine how large cmsg buffer is needed.
3236 			 */
3237 			opt = sogetoff(mp,
3238 					tpr->optdata_ind.OPT_offset,
3239 					optlen, __TPI_ALIGN_SIZE);
3240 
3241 			if (opt == NULL) {
3242 				freemsg(mp);
3243 				error = EPROTO;
3244 				eprintsoline(so, error);
3245 				goto err;
3246 			}
3247 
3248 			ncontrollen = so_cmsglen(mp, opt, optlen,
3249 						!(flags & MSG_XPG4_2));
3250 			if (controllen != 0)
3251 				controllen = ncontrollen;
3252 			else if (ncontrollen != 0)
3253 				msg->msg_flags |= MSG_CTRUNC;
3254 		} else {
3255 			controllen = 0;
3256 		}
3257 
3258 		if (controllen != 0) {
3259 			/*
3260 			 * Return control msg to caller.
3261 			 * Caller handles truncation if length
3262 			 * exceeds msg_controllen.
3263 			 */
3264 			control = kmem_alloc(controllen, KM_SLEEP);
3265 
3266 			error = so_opt2cmsg(mp, opt, optlen,
3267 					!(flags & MSG_XPG4_2),
3268 					control, controllen);
3269 			if (error) {
3270 				freemsg(mp);
3271 				kmem_free(control, controllen);
3272 				eprintsoline(so, error);
3273 				goto err;
3274 			}
3275 			msg->msg_control = control;
3276 			msg->msg_controllen = controllen;
3277 		}
3278 
3279 		/*
3280 		 * Set msg_flags to MSG_EOR based on
3281 		 * DATA_flag and MOREDATA.
3282 		 */
3283 		mutex_enter(&so->so_lock);
3284 		so->so_state &= ~SS_SAVEDEOR;
3285 		if (!(tpr->data_ind.MORE_flag & 1)) {
3286 			if (!(rval.r_val1 & MOREDATA))
3287 				msg->msg_flags |= MSG_EOR;
3288 			else
3289 				so->so_state |= SS_SAVEDEOR;
3290 		}
3291 		freemsg(mp);
3292 		/*
3293 		 * If some data was received (i.e. not EOF) and the
3294 		 * read/recv* has not been satisfied wait for some more.
3295 		 * Not possible to wait if control info was received.
3296 		 */
3297 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3298 		    controllen == 0 &&
3299 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3300 			mutex_exit(&so->so_lock);
3301 			first = 0;
3302 			pflag = opflag | MSG_NOMARK;
3303 			goto retry;
3304 		}
3305 		so_unlock_read(so);	/* Clear SOREADLOCKED */
3306 		mutex_exit(&so->so_lock);
3307 		return (0);
3308 	}
3309 	case T_EXDATA_IND: {
3310 		dprintso(so, 1,
3311 			("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3312 			"state %s\n",
3313 			so->so_oobsigcnt, so->so_oobcnt,
3314 			saved_resid - uiop->uio_resid,
3315 			pr_state(so->so_state, so->so_mode)));
3316 		/*
3317 		 * kstrgetmsg handles MSGMARK so there is nothing to
3318 		 * inspect in the T_EXDATA_IND.
3319 		 * strsock_proto makes the stream head queue the T_EXDATA_IND
3320 		 * as a separate message with no M_DATA component. Furthermore,
3321 		 * the stream head does not consolidate M_DATA messages onto
3322 		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3323 		 * remains a message by itself. This is needed since MSGMARK
3324 		 * marks both the whole message as well as the last byte
3325 		 * of the message.
3326 		 */
3327 		freemsg(mp);
3328 		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
3329 		if (flags & MSG_PEEK) {
3330 			/*
3331 			 * Even though we are peeking we consume the
3332 			 * T_EXDATA_IND thereby moving the mark information
3333 			 * to SS_RCVATMARK. Then the oob code below will
3334 			 * retry the peeking kstrgetmsg.
3335 			 * Note that the stream head read queue is
3336 			 * never flushed without holding SOREADLOCKED
3337 			 * thus the T_EXDATA_IND can not disappear
3338 			 * underneath us.
3339 			 */
3340 			dprintso(so, 1,
3341 				("sotpi_recvmsg: consume EXDATA_IND "
3342 				"counts %d/%d state %s\n",
3343 				so->so_oobsigcnt,
3344 				so->so_oobcnt,
3345 				pr_state(so->so_state, so->so_mode)));
3346 
3347 			pflag = MSG_ANY | MSG_DELAYERROR;
3348 			if (so->so_mode & SM_ATOMIC)
3349 				pflag |= MSG_DISCARDTAIL;
3350 
3351 			pri = 0;
3352 			mp = NULL;
3353 
3354 			error = kstrgetmsg(SOTOV(so), &mp, uiop,
3355 				&pri, &pflag, (clock_t)-1, &rval);
3356 			ASSERT(uiop->uio_resid == saved_resid);
3357 
3358 			if (error) {
3359 #ifdef SOCK_DEBUG
3360 				if (error != EWOULDBLOCK && error != EINTR) {
3361 					eprintsoline(so, error);
3362 				}
3363 #endif /* SOCK_DEBUG */
3364 				mutex_enter(&so->so_lock);
3365 				so_unlock_read(so);	/* Clear SOREADLOCKED */
3366 				mutex_exit(&so->so_lock);
3367 				return (error);
3368 			}
3369 			ASSERT(mp);
3370 			tpr = (union T_primitives *)mp->b_rptr;
3371 			ASSERT(tpr->type == T_EXDATA_IND);
3372 			freemsg(mp);
3373 		} /* end "if (flags & MSG_PEEK)" */
3374 
3375 		/*
3376 		 * Decrement the number of queued and pending oob.
3377 		 *
3378 		 * SS_RCVATMARK is cleared when we read past a mark.
3379 		 * SS_HAVEOOBDATA is cleared when we've read past the
3380 		 * last mark.
3381 		 * SS_OOBPEND is cleared if we've read past the last
3382 		 * mark and no (new) SIGURG has been posted.
3383 		 */
3384 		mutex_enter(&so->so_lock);
3385 		ASSERT(so_verify_oobstate(so));
3386 		ASSERT(so->so_oobsigcnt >= so->so_oobcnt);
3387 		ASSERT(so->so_oobsigcnt > 0);
3388 		so->so_oobsigcnt--;
3389 		ASSERT(so->so_oobcnt > 0);
3390 		so->so_oobcnt--;
3391 		/*
3392 		 * Since the T_EXDATA_IND has been removed from the stream
3393 		 * head, but we have not read data past the mark,
3394 		 * sockfs needs to track that the socket is still at the mark.
3395 		 *
3396 		 * Since no data was received call kstrgetmsg again to wait
3397 		 * for data.
3398 		 */
3399 		so->so_state |= SS_RCVATMARK;
3400 		mutex_exit(&so->so_lock);
3401 		dprintso(so, 1,
3402 		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3403 		    so->so_oobsigcnt, so->so_oobcnt,
3404 		    pr_state(so->so_state, so->so_mode)));
3405 		pflag = opflag;
3406 		goto retry;
3407 	}
3408 	default:
3409 		ASSERT(0);
3410 		freemsg(mp);
3411 		error = EPROTO;
3412 		eprintsoline(so, error);
3413 		goto err;
3414 	}
3415 	/* NOTREACHED */
3416 err:
3417 	mutex_enter(&so->so_lock);
3418 	so_unlock_read(so);	/* Clear SOREADLOCKED */
3419 	mutex_exit(&so->so_lock);
3420 	return (error);
3421 }
3422 
3423 /*
3424  * Sending data with options on a datagram socket.
3425  * Assumes caller has verified that SS_ISBOUND etc. are set.
3426  */
3427 static int
3428 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3429     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3430 {
3431 	struct T_unitdata_req	tudr;
3432 	mblk_t			*mp;
3433 	int			error;
3434 	void			*addr;
3435 	socklen_t		addrlen;
3436 	void			*src;
3437 	socklen_t		srclen;
3438 	ssize_t			len;
3439 	int			size;
3440 	struct T_opthdr		toh;
3441 	struct fdbuf		*fdbuf;
3442 	t_uscalar_t		optlen;
3443 	void			*fds;
3444 	int			fdlen;
3445 
3446 	ASSERT(name && namelen);
3447 	ASSERT(control && controllen);
3448 
3449 	len = uiop->uio_resid;
3450 	if (len > (ssize_t)so->so_tidu_size) {
3451 		return (EMSGSIZE);
3452 	}
3453 
3454 	/*
3455 	 * For AF_UNIX the destination address is translated to an internal
3456 	 * name and the source address is passed as an option.
3457 	 * Also, file descriptors are passed as file pointers in an
3458 	 * option.
3459 	 */
3460 
3461 	/*
3462 	 * Length and family checks.
3463 	 */
3464 	error = so_addr_verify(so, name, namelen);
3465 	if (error) {
3466 		eprintsoline(so, error);
3467 		return (error);
3468 	}
3469 	if (so->so_family == AF_UNIX) {
3470 		if (so->so_state & SS_FADDR_NOXLATE) {
3471 			/*
3472 			 * Already have a transport internal address. Do not
3473 			 * pass any (transport internal) source address.
3474 			 */
3475 			addr = name;
3476 			addrlen = namelen;
3477 			src = NULL;
3478 			srclen = 0;
3479 		} else {
3480 			/*
3481 			 * Pass the sockaddr_un source address as an option
3482 			 * and translate the remote address.
3483 			 *
3484 			 * Note that this code does not prevent so_laddr_sa
3485 			 * from changing while it is being used. Thus
3486 			 * if an unbind+bind occurs concurrently with this
3487 			 * send the peer might see a partially new and a
3488 			 * partially old "from" address.
3489 			 */
3490 			src = so->so_laddr_sa;
3491 			srclen = (t_uscalar_t)so->so_laddr_len;
3492 			dprintso(so, 1,
3493 			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3494 			    srclen, src));
3495 			error = so_ux_addr_xlate(so, name, namelen,
3496 				(flags & MSG_XPG4_2),
3497 				&addr, &addrlen);
3498 			if (error) {
3499 				eprintsoline(so, error);
3500 				return (error);
3501 			}
3502 		}
3503 	} else {
3504 		addr = name;
3505 		addrlen = namelen;
3506 		src = NULL;
3507 		srclen = 0;
3508 	}
3509 	optlen = so_optlen(control, controllen,
3510 					!(flags & MSG_XPG4_2));
3511 	tudr.PRIM_type = T_UNITDATA_REQ;
3512 	tudr.DEST_length = addrlen;
3513 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3514 	if (srclen != 0)
3515 		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3516 		    _TPI_ALIGN_TOPT(srclen));
3517 	else
3518 		tudr.OPT_length = optlen;
3519 	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3520 				_TPI_ALIGN_TOPT(addrlen));
3521 
3522 	size = tudr.OPT_offset + tudr.OPT_length;
3523 
3524 	/*
3525 	 * File descriptors only when SM_FDPASSING set.
3526 	 */
3527 	error = so_getfdopt(control, controllen,
3528 			!(flags & MSG_XPG4_2), &fds, &fdlen);
3529 	if (error)
3530 		return (error);
3531 	if (fdlen != -1) {
3532 		if (!(so->so_mode & SM_FDPASSING))
3533 			return (EOPNOTSUPP);
3534 
3535 		error = fdbuf_create(fds, fdlen, &fdbuf);
3536 		if (error)
3537 			return (error);
3538 		mp = fdbuf_allocmsg(size, fdbuf);
3539 	} else {
3540 		mp = soallocproto(size, _ALLOC_INTR);
3541 		if (mp == NULL) {
3542 			/*
3543 			 * Caught a signal waiting for memory.
3544 			 * Let send* return EINTR.
3545 			 */
3546 			return (EINTR);
3547 		}
3548 	}
3549 	soappendmsg(mp, &tudr, sizeof (tudr));
3550 	soappendmsg(mp, addr, addrlen);
3551 	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3552 
3553 	if (fdlen != -1) {
3554 		ASSERT(fdbuf != NULL);
3555 		toh.level = SOL_SOCKET;
3556 		toh.name = SO_FILEP;
3557 		toh.len = fdbuf->fd_size +
3558 				(t_uscalar_t)sizeof (struct T_opthdr);
3559 		toh.status = 0;
3560 		soappendmsg(mp, &toh, sizeof (toh));
3561 		soappendmsg(mp, fdbuf, fdbuf->fd_size);
3562 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3563 	}
3564 	if (srclen != 0) {
3565 		/*
3566 		 * There is a AF_UNIX sockaddr_un to include as a source
3567 		 * address option.
3568 		 */
3569 		toh.level = SOL_SOCKET;
3570 		toh.name = SO_SRCADDR;
3571 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3572 		toh.status = 0;
3573 		soappendmsg(mp, &toh, sizeof (toh));
3574 		soappendmsg(mp, src, srclen);
3575 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3576 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3577 	}
3578 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3579 	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3580 	/* At most 3 bytes left in the message */
3581 	ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3582 	ASSERT(MBLKL(mp) <= (ssize_t)size);
3583 
3584 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3585 #ifdef C2_AUDIT
3586 	if (audit_active)
3587 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3588 #endif /* C2_AUDIT */
3589 
3590 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3591 #ifdef SOCK_DEBUG
3592 	if (error) {
3593 		eprintsoline(so, error);
3594 	}
3595 #endif /* SOCK_DEBUG */
3596 	return (error);
3597 }
3598 
3599 /*
3600  * Sending data with options on a connected stream socket.
3601  * Assumes caller has verified that SS_ISCONNECTED is set.
3602  */
3603 static int
3604 sosend_svccmsg(struct sonode *so,
3605 		struct uio *uiop,
3606 		int more,
3607 		void *control,
3608 		t_uscalar_t controllen,
3609 		int flags)
3610 {
3611 	struct T_optdata_req	tdr;
3612 	mblk_t			*mp;
3613 	int			error;
3614 	ssize_t			iosize;
3615 	int			first = 1;
3616 	int			size;
3617 	struct fdbuf		*fdbuf;
3618 	t_uscalar_t		optlen;
3619 	void			*fds;
3620 	int			fdlen;
3621 	struct T_opthdr		toh;
3622 
3623 	dprintso(so, 1,
3624 		("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3625 
3626 	/*
3627 	 * Has to be bound and connected. However, since no locks are
3628 	 * held the state could have changed after sotpi_sendmsg checked it
3629 	 * thus it is not possible to ASSERT on the state.
3630 	 */
3631 
3632 	/* Options on connection-oriented only when SM_OPTDATA set. */
3633 	if (!(so->so_mode & SM_OPTDATA))
3634 		return (EOPNOTSUPP);
3635 
3636 	do {
3637 		/*
3638 		 * Set the MORE flag if uio_resid does not fit in this
3639 		 * message or if the caller passed in "more".
3640 		 * Error for transports with zero tidu_size.
3641 		 */
3642 		tdr.PRIM_type = T_OPTDATA_REQ;
3643 		iosize = so->so_tidu_size;
3644 		if (iosize <= 0)
3645 			return (EMSGSIZE);
3646 		if (uiop->uio_resid > iosize) {
3647 			tdr.DATA_flag = 1;
3648 		} else {
3649 			if (more)
3650 				tdr.DATA_flag = 1;
3651 			else
3652 				tdr.DATA_flag = 0;
3653 			iosize = uiop->uio_resid;
3654 		}
3655 		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3656 			tdr.DATA_flag, iosize));
3657 
3658 		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3659 		tdr.OPT_length = optlen;
3660 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3661 
3662 		size = (int)sizeof (tdr) + optlen;
3663 		/*
3664 		 * File descriptors only when SM_FDPASSING set.
3665 		 */
3666 		error = so_getfdopt(control, controllen,
3667 				!(flags & MSG_XPG4_2), &fds, &fdlen);
3668 		if (error)
3669 			return (error);
3670 		if (fdlen != -1) {
3671 			if (!(so->so_mode & SM_FDPASSING))
3672 				return (EOPNOTSUPP);
3673 
3674 			error = fdbuf_create(fds, fdlen, &fdbuf);
3675 			if (error)
3676 				return (error);
3677 			mp = fdbuf_allocmsg(size, fdbuf);
3678 		} else {
3679 			mp = soallocproto(size, _ALLOC_INTR);
3680 			if (mp == NULL) {
3681 				/*
3682 				 * Caught a signal waiting for memory.
3683 				 * Let send* return EINTR.
3684 				 */
3685 				return (first ? EINTR : 0);
3686 			}
3687 		}
3688 		soappendmsg(mp, &tdr, sizeof (tdr));
3689 
3690 		if (fdlen != -1) {
3691 			ASSERT(fdbuf != NULL);
3692 			toh.level = SOL_SOCKET;
3693 			toh.name = SO_FILEP;
3694 			toh.len = fdbuf->fd_size +
3695 				(t_uscalar_t)sizeof (struct T_opthdr);
3696 			toh.status = 0;
3697 			soappendmsg(mp, &toh, sizeof (toh));
3698 			soappendmsg(mp, fdbuf, fdbuf->fd_size);
3699 			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3700 		}
3701 		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3702 		/* At most 3 bytes left in the message */
3703 		ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3704 		ASSERT(MBLKL(mp) <= (ssize_t)size);
3705 
3706 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3707 
3708 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3709 					0, MSG_BAND, 0);
3710 		if (error) {
3711 			if (!first && error == EWOULDBLOCK)
3712 				return (0);
3713 			eprintsoline(so, error);
3714 			return (error);
3715 		}
3716 		control = NULL;
3717 		first = 0;
3718 		if (uiop->uio_resid > 0) {
3719 			/*
3720 			 * Recheck for fatal errors. Fail write even though
3721 			 * some data have been written. This is consistent
3722 			 * with strwrite semantics and BSD sockets semantics.
3723 			 */
3724 			if (so->so_state & SS_CANTSENDMORE) {
3725 				tsignal(curthread, SIGPIPE);
3726 				eprintsoline(so, error);
3727 				return (EPIPE);
3728 			}
3729 			if (so->so_error != 0) {
3730 				mutex_enter(&so->so_lock);
3731 				error = sogeterr(so);
3732 				mutex_exit(&so->so_lock);
3733 				if (error != 0) {
3734 					eprintsoline(so, error);
3735 					return (error);
3736 				}
3737 			}
3738 		}
3739 	} while (uiop->uio_resid > 0);
3740 	return (0);
3741 }
3742 
3743 /*
3744  * Sending data on a datagram socket.
3745  * Assumes caller has verified that SS_ISBOUND etc. are set.
3746  *
3747  * For AF_UNIX the destination address is translated to an internal
3748  * name and the source address is passed as an option.
3749  */
3750 int
3751 sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
3752     struct uio *uiop, int flags)
3753 {
3754 	struct T_unitdata_req	tudr;
3755 	mblk_t			*mp;
3756 	int			error;
3757 	void			*addr;
3758 	socklen_t		addrlen;
3759 	void			*src;
3760 	socklen_t		srclen;
3761 	ssize_t			len;
3762 
3763 	ASSERT(name != NULL && namelen != 0);
3764 
3765 	len = uiop->uio_resid;
3766 	if (len > so->so_tidu_size) {
3767 		error = EMSGSIZE;
3768 		goto done;
3769 	}
3770 
3771 	/* Length and family checks */
3772 	error = so_addr_verify(so, name, namelen);
3773 	if (error != 0)
3774 		goto done;
3775 
3776 	if (so->so_state & SS_DIRECT)
3777 		return (sodgram_direct(so, name, namelen, uiop, flags));
3778 
3779 	if (so->so_family == AF_UNIX) {
3780 		if (so->so_state & SS_FADDR_NOXLATE) {
3781 			/*
3782 			 * Already have a transport internal address. Do not
3783 			 * pass any (transport internal) source address.
3784 			 */
3785 			addr = name;
3786 			addrlen = namelen;
3787 			src = NULL;
3788 			srclen = 0;
3789 		} else {
3790 			/*
3791 			 * Pass the sockaddr_un source address as an option
3792 			 * and translate the remote address.
3793 			 *
3794 			 * Note that this code does not prevent so_laddr_sa
3795 			 * from changing while it is being used. Thus
3796 			 * if an unbind+bind occurs concurrently with this
3797 			 * send the peer might see a partially new and a
3798 			 * partially old "from" address.
3799 			 */
3800 			src = so->so_laddr_sa;
3801 			srclen = (socklen_t)so->so_laddr_len;
3802 			dprintso(so, 1,
3803 				("sosend_dgram UNIX: srclen %d, src %p\n",
3804 				srclen, src));
3805 			error = so_ux_addr_xlate(so, name, namelen,
3806 				(flags & MSG_XPG4_2),
3807 				&addr, &addrlen);
3808 			if (error) {
3809 				eprintsoline(so, error);
3810 				goto done;
3811 			}
3812 		}
3813 	} else {
3814 		addr = name;
3815 		addrlen = namelen;
3816 		src = NULL;
3817 		srclen = 0;
3818 	}
3819 	tudr.PRIM_type = T_UNITDATA_REQ;
3820 	tudr.DEST_length = addrlen;
3821 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3822 	if (srclen == 0) {
3823 		tudr.OPT_length = 0;
3824 		tudr.OPT_offset = 0;
3825 
3826 		mp = soallocproto2(&tudr, sizeof (tudr),
3827 		    addr, addrlen, 0, _ALLOC_INTR);
3828 		if (mp == NULL) {
3829 			/*
3830 			 * Caught a signal waiting for memory.
3831 			 * Let send* return EINTR.
3832 			 */
3833 			error = EINTR;
3834 			goto done;
3835 		}
3836 	} else {
3837 		/*
3838 		 * There is a AF_UNIX sockaddr_un to include as a source
3839 		 * address option.
3840 		 */
3841 		struct T_opthdr toh;
3842 		ssize_t size;
3843 
3844 		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
3845 					_TPI_ALIGN_TOPT(srclen));
3846 		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3847 					_TPI_ALIGN_TOPT(addrlen));
3848 
3849 		toh.level = SOL_SOCKET;
3850 		toh.name = SO_SRCADDR;
3851 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3852 		toh.status = 0;
3853 
3854 		size = tudr.OPT_offset + tudr.OPT_length;
3855 		mp = soallocproto2(&tudr, sizeof (tudr),
3856 		    addr, addrlen, size, _ALLOC_INTR);
3857 		if (mp == NULL) {
3858 			/*
3859 			 * Caught a signal waiting for memory.
3860 			 * Let send* return EINTR.
3861 			 */
3862 			error = EINTR;
3863 			goto done;
3864 		}
3865 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3866 		soappendmsg(mp, &toh, sizeof (toh));
3867 		soappendmsg(mp, src, srclen);
3868 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3869 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3870 	}
3871 
3872 #ifdef C2_AUDIT
3873 	if (audit_active)
3874 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3875 #endif /* C2_AUDIT */
3876 
3877 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3878 done:
3879 #ifdef SOCK_DEBUG
3880 	if (error) {
3881 		eprintsoline(so, error);
3882 	}
3883 #endif /* SOCK_DEBUG */
3884 	return (error);
3885 }
3886 
3887 /*
3888  * Sending data on a connected stream socket.
3889  * Assumes caller has verified that SS_ISCONNECTED is set.
3890  */
3891 int
3892 sosend_svc(struct sonode *so,
3893 	struct uio *uiop,
3894 	t_scalar_t prim,
3895 	int more,
3896 	int sflag)
3897 {
3898 	struct T_data_req	tdr;
3899 	mblk_t			*mp;
3900 	int			error;
3901 	ssize_t			iosize;
3902 	int			first = 1;
3903 
3904 	dprintso(so, 1,
3905 		("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
3906 		so, uiop->uio_resid, prim, sflag));
3907 
3908 	/*
3909 	 * Has to be bound and connected. However, since no locks are
3910 	 * held the state could have changed after sotpi_sendmsg checked it
3911 	 * thus it is not possible to ASSERT on the state.
3912 	 */
3913 
3914 	do {
3915 		/*
3916 		 * Set the MORE flag if uio_resid does not fit in this
3917 		 * message or if the caller passed in "more".
3918 		 * Error for transports with zero tidu_size.
3919 		 */
3920 		tdr.PRIM_type = prim;
3921 		iosize = so->so_tidu_size;
3922 		if (iosize <= 0)
3923 			return (EMSGSIZE);
3924 		if (uiop->uio_resid > iosize) {
3925 			tdr.MORE_flag = 1;
3926 		} else {
3927 			if (more)
3928 				tdr.MORE_flag = 1;
3929 			else
3930 				tdr.MORE_flag = 0;
3931 			iosize = uiop->uio_resid;
3932 		}
3933 		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
3934 			prim, tdr.MORE_flag, iosize));
3935 		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR);
3936 		if (mp == NULL) {
3937 			/*
3938 			 * Caught a signal waiting for memory.
3939 			 * Let send* return EINTR.
3940 			 */
3941 			if (first)
3942 				return (EINTR);
3943 			else
3944 				return (0);
3945 		}
3946 
3947 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3948 					0, sflag | MSG_BAND, 0);
3949 		if (error) {
3950 			if (!first && error == EWOULDBLOCK)
3951 				return (0);
3952 			eprintsoline(so, error);
3953 			return (error);
3954 		}
3955 		first = 0;
3956 		if (uiop->uio_resid > 0) {
3957 			/*
3958 			 * Recheck for fatal errors. Fail write even though
3959 			 * some data have been written. This is consistent
3960 			 * with strwrite semantics and BSD sockets semantics.
3961 			 */
3962 			if (so->so_state & SS_CANTSENDMORE) {
3963 				tsignal(curthread, SIGPIPE);
3964 				eprintsoline(so, error);
3965 				return (EPIPE);
3966 			}
3967 			if (so->so_error != 0) {
3968 				mutex_enter(&so->so_lock);
3969 				error = sogeterr(so);
3970 				mutex_exit(&so->so_lock);
3971 				if (error != 0) {
3972 					eprintsoline(so, error);
3973 					return (error);
3974 				}
3975 			}
3976 		}
3977 	} while (uiop->uio_resid > 0);
3978 	return (0);
3979 }
3980 
3981 /*
3982  * Check the state for errors and call the appropriate send function.
3983  *
3984  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
3985  * this function issues a setsockopt to toggle SO_DONTROUTE before and
3986  * after sending the message.
3987  */
3988 static int
3989 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
3990 {
3991 	int		so_state;
3992 	int		so_mode;
3993 	int		error;
3994 	struct sockaddr *name;
3995 	t_uscalar_t	namelen;
3996 	int		dontroute;
3997 	int		flags;
3998 
3999 	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4000 		so, msg, msg->msg_flags,
4001 		pr_state(so->so_state, so->so_mode), so->so_error));
4002 
4003 	mutex_enter(&so->so_lock);
4004 	so_state = so->so_state;
4005 
4006 	if (so_state & SS_CANTSENDMORE) {
4007 		mutex_exit(&so->so_lock);
4008 		tsignal(curthread, SIGPIPE);
4009 		return (EPIPE);
4010 	}
4011 
4012 	if (so->so_error != 0) {
4013 		error = sogeterr(so);
4014 		if (error != 0) {
4015 			mutex_exit(&so->so_lock);
4016 			return (error);
4017 		}
4018 	}
4019 
4020 	name = (struct sockaddr *)msg->msg_name;
4021 	namelen = msg->msg_namelen;
4022 
4023 	so_mode = so->so_mode;
4024 
4025 	if (name == NULL) {
4026 		if (!(so_state & SS_ISCONNECTED)) {
4027 			mutex_exit(&so->so_lock);
4028 			if (so_mode & SM_CONNREQUIRED)
4029 				return (ENOTCONN);
4030 			else
4031 				return (EDESTADDRREQ);
4032 		}
4033 		if (so_mode & SM_CONNREQUIRED) {
4034 			name = NULL;
4035 			namelen = 0;
4036 		} else {
4037 			/*
4038 			 * Note that this code does not prevent so_faddr_sa
4039 			 * from changing while it is being used. Thus
4040 			 * if an "unconnect"+connect occurs concurrently with
4041 			 * this send the datagram might be delivered to a
4042 			 * garbled address.
4043 			 */
4044 			ASSERT(so->so_faddr_sa);
4045 			name = so->so_faddr_sa;
4046 			namelen = (t_uscalar_t)so->so_faddr_len;
4047 		}
4048 	} else {
4049 		if (!(so_state & SS_ISCONNECTED) &&
4050 		    (so_mode & SM_CONNREQUIRED)) {
4051 			/* Required but not connected */
4052 			mutex_exit(&so->so_lock);
4053 			return (ENOTCONN);
4054 		}
4055 		/*
4056 		 * Ignore the address on connection-oriented sockets.
4057 		 * Just like BSD this code does not generate an error for
4058 		 * TCP (a CONNREQUIRED socket) when sending to an address
4059 		 * passed in with sendto/sendmsg. Instead the data is
4060 		 * delivered on the connection as if no address had been
4061 		 * supplied.
4062 		 */
4063 		if ((so_state & SS_ISCONNECTED) &&
4064 		    !(so_mode & SM_CONNREQUIRED)) {
4065 			mutex_exit(&so->so_lock);
4066 			return (EISCONN);
4067 		}
4068 		if (!(so_state & SS_ISBOUND)) {
4069 			so_lock_single(so);	/* Set SOLOCKED */
4070 			error = sotpi_bind(so, NULL, 0,
4071 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD);
4072 			so_unlock_single(so, SOLOCKED);
4073 			if (error) {
4074 				mutex_exit(&so->so_lock);
4075 				eprintsoline(so, error);
4076 				return (error);
4077 			}
4078 		}
4079 		/*
4080 		 * Handle delayed datagram errors. These are only queued
4081 		 * when the application sets SO_DGRAM_ERRIND.
4082 		 * Return the error if we are sending to the address
4083 		 * that was returned in the last T_UDERROR_IND.
4084 		 * If sending to some other address discard the delayed
4085 		 * error indication.
4086 		 */
4087 		if (so->so_delayed_error) {
4088 			struct T_uderror_ind	*tudi;
4089 			void			*addr;
4090 			t_uscalar_t		addrlen;
4091 			boolean_t		match = B_FALSE;
4092 
4093 			ASSERT(so->so_eaddr_mp);
4094 			error = so->so_delayed_error;
4095 			so->so_delayed_error = 0;
4096 			tudi = (struct T_uderror_ind *)so->so_eaddr_mp->b_rptr;
4097 			addrlen = tudi->DEST_length;
4098 			addr = sogetoff(so->so_eaddr_mp,
4099 					tudi->DEST_offset,
4100 					addrlen, 1);
4101 			ASSERT(addr);	/* Checked by strsock_proto */
4102 			switch (so->so_family) {
4103 			case AF_INET: {
4104 				/* Compare just IP address and port */
4105 				sin_t *sin1 = (sin_t *)name;
4106 				sin_t *sin2 = (sin_t *)addr;
4107 
4108 				if (addrlen == sizeof (sin_t) &&
4109 				    namelen == addrlen &&
4110 				    sin1->sin_port == sin2->sin_port &&
4111 				    sin1->sin_addr.s_addr ==
4112 				    sin2->sin_addr.s_addr)
4113 					match = B_TRUE;
4114 				break;
4115 			}
4116 			case AF_INET6: {
4117 				/* Compare just IP address and port. Not flow */
4118 				sin6_t *sin1 = (sin6_t *)name;
4119 				sin6_t *sin2 = (sin6_t *)addr;
4120 
4121 				if (addrlen == sizeof (sin6_t) &&
4122 				    namelen == addrlen &&
4123 				    sin1->sin6_port == sin2->sin6_port &&
4124 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4125 					&sin2->sin6_addr))
4126 					match = B_TRUE;
4127 				break;
4128 			}
4129 			case AF_UNIX:
4130 			default:
4131 				if (namelen == addrlen &&
4132 				    bcmp(name, addr, namelen) == 0)
4133 					match = B_TRUE;
4134 			}
4135 			if (match) {
4136 				freemsg(so->so_eaddr_mp);
4137 				so->so_eaddr_mp = NULL;
4138 				mutex_exit(&so->so_lock);
4139 #ifdef DEBUG
4140 				dprintso(so, 0,
4141 					("sockfs delayed error %d for %s\n",
4142 					error,
4143 					pr_addr(so->so_family, name, namelen)));
4144 #endif /* DEBUG */
4145 				return (error);
4146 			}
4147 			freemsg(so->so_eaddr_mp);
4148 			so->so_eaddr_mp = NULL;
4149 		}
4150 	}
4151 	mutex_exit(&so->so_lock);
4152 
4153 	flags = msg->msg_flags;
4154 	dontroute = 0;
4155 	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4156 		uint32_t	val;
4157 
4158 		val = 1;
4159 		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4160 					&val, (t_uscalar_t)sizeof (val));
4161 		if (error)
4162 			return (error);
4163 		dontroute = 1;
4164 	}
4165 
4166 	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4167 		error = EOPNOTSUPP;
4168 		goto done;
4169 	}
4170 	if (msg->msg_controllen != 0) {
4171 		if (!(so_mode & SM_CONNREQUIRED)) {
4172 			error = sosend_dgramcmsg(so, name, namelen, uiop,
4173 			    msg->msg_control, msg->msg_controllen, flags);
4174 		} else {
4175 			if (flags & MSG_OOB) {
4176 				/* Can't generate T_EXDATA_REQ with options */
4177 				error = EOPNOTSUPP;
4178 				goto done;
4179 			}
4180 			error = sosend_svccmsg(so, uiop,
4181 				!(flags & MSG_EOR),
4182 				msg->msg_control, msg->msg_controllen,
4183 				flags);
4184 		}
4185 		goto done;
4186 	}
4187 
4188 	if (!(so_mode & SM_CONNREQUIRED)) {
4189 		/*
4190 		 * If there is no SO_DONTROUTE to turn off return immediately
4191 		 * from send_dgram. This can allow tail-call optimizations.
4192 		 */
4193 		if (!dontroute) {
4194 			return (sosend_dgram(so, name, namelen, uiop, flags));
4195 		}
4196 		error = sosend_dgram(so, name, namelen, uiop, flags);
4197 	} else {
4198 		t_scalar_t prim;
4199 		int sflag;
4200 
4201 		/* Ignore msg_name in the connected state */
4202 		if (flags & MSG_OOB) {
4203 			prim = T_EXDATA_REQ;
4204 			/*
4205 			 * Send down T_EXDATA_REQ even if there is flow
4206 			 * control for data.
4207 			 */
4208 			sflag = MSG_IGNFLOW;
4209 		} else {
4210 			if (so_mode & SM_BYTESTREAM) {
4211 				/* Byte stream transport - use write */
4212 
4213 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4214 				/*
4215 				 * If there is no SO_DONTROUTE to turn off,
4216 				 * SS_DIRECT is on, and there is no flow
4217 				 * control, we can take the fast path.
4218 				 */
4219 				if (!dontroute &&
4220 				    (so_state & SS_DIRECT) &&
4221 				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4222 					return (sostream_direct(so, uiop,
4223 					    NULL, CRED()));
4224 				}
4225 				error = strwrite(SOTOV(so), uiop, CRED());
4226 				goto done;
4227 			}
4228 			prim = T_DATA_REQ;
4229 			sflag = 0;
4230 		}
4231 		/*
4232 		 * If there is no SO_DONTROUTE to turn off return immediately
4233 		 * from sosend_svc. This can allow tail-call optimizations.
4234 		 */
4235 		if (!dontroute)
4236 			return (sosend_svc(so, uiop, prim,
4237 				!(flags & MSG_EOR), sflag));
4238 		error = sosend_svc(so, uiop, prim,
4239 				!(flags & MSG_EOR), sflag);
4240 	}
4241 	ASSERT(dontroute);
4242 done:
4243 	if (dontroute) {
4244 		uint32_t	val;
4245 
4246 		val = 0;
4247 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4248 				&val, (t_uscalar_t)sizeof (val));
4249 	}
4250 	return (error);
4251 }
4252 
4253 /*
4254  * Sending data on a datagram socket.
4255  * Assumes caller has verified that SS_ISBOUND etc. are set.
4256  */
4257 /* ARGSUSED */
4258 static int
4259 sodgram_direct(struct sonode *so, struct sockaddr *name,
4260     socklen_t namelen, struct uio *uiop, int flags)
4261 {
4262 	struct T_unitdata_req	tudr;
4263 	mblk_t			*mp;
4264 	int			error = 0;
4265 	void			*addr;
4266 	socklen_t		addrlen;
4267 	ssize_t			len;
4268 	struct stdata		*stp = SOTOV(so)->v_stream;
4269 	int			so_state;
4270 	queue_t			*udp_wq;
4271 
4272 	ASSERT(name != NULL && namelen != 0);
4273 	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4274 	ASSERT(!(so->so_mode & SM_EXDATA));
4275 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4276 	ASSERT(SOTOV(so)->v_type == VSOCK);
4277 
4278 	/* Caller checked for proper length */
4279 	len = uiop->uio_resid;
4280 	ASSERT(len <= so->so_tidu_size);
4281 
4282 	/* Length and family checks have been done by caller */
4283 	ASSERT(name->sa_family == so->so_family);
4284 	ASSERT(so->so_family == AF_INET ||
4285 	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4286 	ASSERT(so->so_family == AF_INET6 ||
4287 	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4288 
4289 	addr = name;
4290 	addrlen = namelen;
4291 
4292 	if (stp->sd_sidp != NULL &&
4293 	    (error = straccess(stp, JCWRITE)) != 0)
4294 		goto done;
4295 
4296 	so_state = so->so_state;
4297 
4298 	/*
4299 	 * For UDP we don't break up the copyin into smaller pieces
4300 	 * as in the TCP case.  That means if ENOMEM is returned by
4301 	 * mcopyinuio() then the uio vector has not been modified at
4302 	 * all and we fallback to either strwrite() or kstrputmsg()
4303 	 * below.  Note also that we never generate priority messages
4304 	 * from here.
4305 	 */
4306 	udp_wq = stp->sd_wrq->q_next;
4307 	if (canput(udp_wq) &&
4308 	    (mp = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4309 		ASSERT(DB_TYPE(mp) == M_DATA);
4310 		ASSERT(uiop->uio_resid == 0);
4311 #ifdef C2_AUDIT
4312 		if (audit_active)
4313 			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4314 #endif /* C2_AUDIT */
4315 		udp_wput_data(udp_wq, mp, addr, addrlen);
4316 		return (0);
4317 	}
4318 	if (error != 0 && error != ENOMEM)
4319 		return (error);
4320 
4321 	/*
4322 	 * For connected, let strwrite() handle the blocking case.
4323 	 * Otherwise we fall thru and use kstrputmsg().
4324 	 */
4325 	if (so_state & SS_ISCONNECTED)
4326 		return (strwrite(SOTOV(so), uiop, CRED()));
4327 
4328 	tudr.PRIM_type = T_UNITDATA_REQ;
4329 	tudr.DEST_length = addrlen;
4330 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4331 	tudr.OPT_length = 0;
4332 	tudr.OPT_offset = 0;
4333 
4334 	mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0, _ALLOC_INTR);
4335 	if (mp == NULL) {
4336 		/*
4337 		 * Caught a signal waiting for memory.
4338 		 * Let send* return EINTR.
4339 		 */
4340 		error = EINTR;
4341 		goto done;
4342 	}
4343 
4344 #ifdef C2_AUDIT
4345 	if (audit_active)
4346 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4347 #endif /* C2_AUDIT */
4348 
4349 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4350 done:
4351 #ifdef SOCK_DEBUG
4352 	if (error != 0) {
4353 		eprintsoline(so, error);
4354 	}
4355 #endif /* SOCK_DEBUG */
4356 	return (error);
4357 }
4358 
4359 int
4360 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4361 {
4362 	struct stdata *stp = SOTOV(so)->v_stream;
4363 	ssize_t iosize, rmax, maxblk;
4364 	queue_t *tcp_wq = stp->sd_wrq->q_next;
4365 	mblk_t *newmp;
4366 	int error = 0, wflag = 0;
4367 
4368 	ASSERT(so->so_mode & SM_BYTESTREAM);
4369 	ASSERT(SOTOV(so)->v_type == VSOCK);
4370 
4371 	if (stp->sd_sidp != NULL &&
4372 	    (error = straccess(stp, JCWRITE)) != 0)
4373 		return (error);
4374 
4375 	if (uiop == NULL) {
4376 		/*
4377 		 * kstrwritemp() should have checked sd_flag and
4378 		 * flow-control before coming here.  If we end up
4379 		 * here it means that we can simply pass down the
4380 		 * data to tcp.
4381 		 */
4382 		ASSERT(mp != NULL);
4383 		if (stp->sd_wputdatafunc != NULL) {
4384 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4385 			    NULL, NULL, NULL);
4386 			if (newmp == NULL) {
4387 				/* The caller will free mp */
4388 				return (ECOMM);
4389 			}
4390 			mp = newmp;
4391 		}
4392 		tcp_wput(tcp_wq, mp);
4393 		return (0);
4394 	}
4395 
4396 	/* Fallback to strwrite() to do proper error handling */
4397 	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4398 		return (strwrite(SOTOV(so), uiop, cr));
4399 
4400 	rmax = stp->sd_qn_maxpsz;
4401 	ASSERT(rmax >= 0 || rmax == INFPSZ);
4402 	if (rmax == 0 || uiop->uio_resid <= 0)
4403 		return (0);
4404 
4405 	if (rmax == INFPSZ)
4406 		rmax = uiop->uio_resid;
4407 
4408 	maxblk = stp->sd_maxblk;
4409 
4410 	for (;;) {
4411 		iosize = MIN(uiop->uio_resid, rmax);
4412 
4413 		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4414 		if (mp == NULL) {
4415 			/*
4416 			 * Fallback to strwrite() for ENOMEM; if this
4417 			 * is our first time in this routine and the uio
4418 			 * vector has not been modified, we will end up
4419 			 * calling strwrite() without any flag set.
4420 			 */
4421 			if (error == ENOMEM)
4422 				goto slow_send;
4423 			else
4424 				return (error);
4425 		}
4426 		ASSERT(uiop->uio_resid >= 0);
4427 		/*
4428 		 * If mp is non-NULL and ENOMEM is set, it means that
4429 		 * mcopyinuio() was able to break down some of the user
4430 		 * data into one or more mblks.  Send the partial data
4431 		 * to tcp and let the rest be handled in strwrite().
4432 		 */
4433 		ASSERT(error == 0 || error == ENOMEM);
4434 		if (stp->sd_wputdatafunc != NULL) {
4435 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4436 			    NULL, NULL, NULL);
4437 			if (newmp == NULL) {
4438 				/* The caller will free mp */
4439 				return (ECOMM);
4440 			}
4441 			mp = newmp;
4442 		}
4443 		tcp_wput(tcp_wq, mp);
4444 
4445 		wflag |= NOINTR;
4446 
4447 		if (uiop->uio_resid == 0) {	/* No more data; we're done */
4448 			ASSERT(error == 0);
4449 			break;
4450 		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4451 		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4452 slow_send:
4453 			/*
4454 			 * We were able to send down partial data using
4455 			 * the direct call interface, but are now relying
4456 			 * on strwrite() to handle the non-fastpath cases.
4457 			 * If the socket is blocking we will sleep in
4458 			 * strwaitq() until write is permitted, otherwise,
4459 			 * we will need to return the amount of bytes
4460 			 * written so far back to the app.  This is the
4461 			 * reason why we pass NOINTR flag to strwrite()
4462 			 * for non-blocking socket, because we don't want
4463 			 * to return EAGAIN when portion of the user data
4464 			 * has actually been sent down.
4465 			 */
4466 			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4467 		}
4468 	}
4469 	return (0);
4470 }
4471 
4472 /*
4473  * Update so_faddr by asking the transport (unless AF_UNIX).
4474  */
4475 int
4476 sotpi_getpeername(struct sonode *so)
4477 {
4478 	struct strbuf	strbuf;
4479 	int		error = 0, res;
4480 	void		*addr;
4481 	t_uscalar_t	addrlen;
4482 	k_sigset_t	smask;
4483 
4484 	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4485 		so, pr_state(so->so_state, so->so_mode)));
4486 
4487 	mutex_enter(&so->so_lock);
4488 	so_lock_single(so);	/* Set SOLOCKED */
4489 	if (!(so->so_state & SS_ISCONNECTED)) {
4490 		error = ENOTCONN;
4491 		goto done;
4492 	}
4493 	/* Added this check for X/Open */
4494 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4495 		error = EINVAL;
4496 		if (xnet_check_print) {
4497 			printf("sockfs: X/Open getpeername check => EINVAL\n");
4498 		}
4499 		goto done;
4500 	}
4501 #ifdef DEBUG
4502 	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4503 		pr_addr(so->so_family, so->so_faddr_sa,
4504 			(t_uscalar_t)so->so_faddr_len)));
4505 #endif /* DEBUG */
4506 
4507 	if (so->so_family == AF_UNIX) {
4508 		/* Transport has different name space - return local info */
4509 		error = 0;
4510 		goto done;
4511 	}
4512 
4513 	ASSERT(so->so_faddr_sa);
4514 	/* Allocate local buffer to use with ioctl */
4515 	addrlen = (t_uscalar_t)so->so_faddr_maxlen;
4516 	mutex_exit(&so->so_lock);
4517 	addr = kmem_alloc(addrlen, KM_SLEEP);
4518 
4519 	/*
4520 	 * Issue TI_GETPEERNAME with signals masked.
4521 	 * Put the result in so_faddr_sa so that getpeername works after
4522 	 * a shutdown(output).
4523 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4524 	 * back to the socket.
4525 	 */
4526 	strbuf.buf = addr;
4527 	strbuf.maxlen = addrlen;
4528 	strbuf.len = 0;
4529 
4530 	sigintr(&smask, 0);
4531 	res = 0;
4532 	ASSERT(CRED());
4533 	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4534 			0, K_TO_K, CRED(), &res);
4535 	sigunintr(&smask);
4536 
4537 	mutex_enter(&so->so_lock);
4538 	/*
4539 	 * If there is an error record the error in so_error put don't fail
4540 	 * the getpeername. Instead fallback on the recorded
4541 	 * so->so_faddr_sa.
4542 	 */
4543 	if (error) {
4544 		/*
4545 		 * Various stream head errors can be returned to the ioctl.
4546 		 * However, it is impossible to determine which ones of
4547 		 * these are really socket level errors that were incorrectly
4548 		 * consumed by the ioctl. Thus this code silently ignores the
4549 		 * error - to code explicitly does not reinstate the error
4550 		 * using soseterror().
4551 		 * Experiments have shows that at least this set of
4552 		 * errors are reported and should not be reinstated on the
4553 		 * socket:
4554 		 *	EINVAL	E.g. if an I_LINK was in effect when
4555 		 *		getpeername was called.
4556 		 *	EPIPE	The ioctl error semantics prefer the write
4557 		 *		side error over the read side error.
4558 		 *	ENOTCONN The transport just got disconnected but
4559 		 *		sockfs had not yet seen the T_DISCON_IND
4560 		 *		when issuing the ioctl.
4561 		 */
4562 		error = 0;
4563 	} else if (res == 0 && strbuf.len > 0 &&
4564 	    (so->so_state & SS_ISCONNECTED)) {
4565 		ASSERT(strbuf.len <= (int)so->so_faddr_maxlen);
4566 		so->so_faddr_len = (socklen_t)strbuf.len;
4567 		bcopy(addr, so->so_faddr_sa, so->so_faddr_len);
4568 		so->so_state |= SS_FADDR_VALID;
4569 	}
4570 	kmem_free(addr, addrlen);
4571 #ifdef DEBUG
4572 	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
4573 			pr_addr(so->so_family, so->so_faddr_sa,
4574 				(t_uscalar_t)so->so_faddr_len)));
4575 #endif /* DEBUG */
4576 done:
4577 	so_unlock_single(so, SOLOCKED);
4578 	mutex_exit(&so->so_lock);
4579 	return (error);
4580 }
4581 
4582 /*
4583  * Update so_laddr by asking the transport (unless AF_UNIX).
4584  */
4585 int
4586 sotpi_getsockname(struct sonode *so)
4587 {
4588 	struct strbuf	strbuf;
4589 	int		error = 0, res;
4590 	void		*addr;
4591 	t_uscalar_t	addrlen;
4592 	k_sigset_t	smask;
4593 
4594 	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
4595 		so, pr_state(so->so_state, so->so_mode)));
4596 
4597 	mutex_enter(&so->so_lock);
4598 	so_lock_single(so);	/* Set SOLOCKED */
4599 	if (!(so->so_state & SS_ISBOUND) && so->so_family != AF_UNIX) {
4600 		/* Return an all zero address except for the family */
4601 		if (so->so_family == AF_INET)
4602 			so->so_laddr_len = (socklen_t)sizeof (sin_t);
4603 		else if (so->so_family == AF_INET6)
4604 			so->so_laddr_len = (socklen_t)sizeof (sin6_t);
4605 		ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
4606 		bzero(so->so_laddr_sa, so->so_laddr_len);
4607 		/*
4608 		 * Can not assume there is a sa_family for all
4609 		 * protocol families.
4610 		 */
4611 		if (so->so_family == AF_INET || so->so_family == AF_INET6)
4612 			so->so_laddr_sa->sa_family = so->so_family;
4613 	}
4614 #ifdef DEBUG
4615 	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
4616 		pr_addr(so->so_family, so->so_laddr_sa,
4617 			(t_uscalar_t)so->so_laddr_len)));
4618 #endif /* DEBUG */
4619 	if (so->so_family == AF_UNIX) {
4620 		/* Transport has different name space - return local info */
4621 		error = 0;
4622 		goto done;
4623 	}
4624 	if (!(so->so_state & SS_ISBOUND)) {
4625 		/* If not bound, then nothing to return. */
4626 		error = 0;
4627 		goto done;
4628 	}
4629 	/* Allocate local buffer to use with ioctl */
4630 	addrlen = (t_uscalar_t)so->so_laddr_maxlen;
4631 	mutex_exit(&so->so_lock);
4632 	addr = kmem_alloc(addrlen, KM_SLEEP);
4633 
4634 	/*
4635 	 * Issue TI_GETMYNAME with signals masked.
4636 	 * Put the result in so_laddr_sa so that getsockname works after
4637 	 * a shutdown(output).
4638 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4639 	 * back to the socket.
4640 	 */
4641 	strbuf.buf = addr;
4642 	strbuf.maxlen = addrlen;
4643 	strbuf.len = 0;
4644 
4645 	sigintr(&smask, 0);
4646 	res = 0;
4647 	ASSERT(CRED());
4648 	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
4649 			0, K_TO_K, CRED(), &res);
4650 	sigunintr(&smask);
4651 
4652 	mutex_enter(&so->so_lock);
4653 	/*
4654 	 * If there is an error record the error in so_error put don't fail
4655 	 * the getsockname. Instead fallback on the recorded
4656 	 * so->so_laddr_sa.
4657 	 */
4658 	if (error) {
4659 		/*
4660 		 * Various stream head errors can be returned to the ioctl.
4661 		 * However, it is impossible to determine which ones of
4662 		 * these are really socket level errors that were incorrectly
4663 		 * consumed by the ioctl. Thus this code silently ignores the
4664 		 * error - to code explicitly does not reinstate the error
4665 		 * using soseterror().
4666 		 * Experiments have shows that at least this set of
4667 		 * errors are reported and should not be reinstated on the
4668 		 * socket:
4669 		 *	EINVAL	E.g. if an I_LINK was in effect when
4670 		 *		getsockname was called.
4671 		 *	EPIPE	The ioctl error semantics prefer the write
4672 		 *		side error over the read side error.
4673 		 */
4674 		error = 0;
4675 	} else if (res == 0 && strbuf.len > 0 &&
4676 	    (so->so_state & SS_ISBOUND)) {
4677 		ASSERT(strbuf.len <= (int)so->so_laddr_maxlen);
4678 		so->so_laddr_len = (socklen_t)strbuf.len;
4679 		bcopy(addr, so->so_laddr_sa, so->so_laddr_len);
4680 		so->so_state |= SS_LADDR_VALID;
4681 	}
4682 	kmem_free(addr, addrlen);
4683 #ifdef DEBUG
4684 	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
4685 			pr_addr(so->so_family, so->so_laddr_sa,
4686 				(t_uscalar_t)so->so_laddr_len)));
4687 #endif /* DEBUG */
4688 done:
4689 	so_unlock_single(so, SOLOCKED);
4690 	mutex_exit(&so->so_lock);
4691 	return (error);
4692 }
4693 
4694 /*
4695  * Get socket options. For SOL_SOCKET options some options are handled
4696  * by the sockfs while others use the value recorded in the sonode as a
4697  * fallback should the T_SVR4_OPTMGMT_REQ fail.
4698  *
4699  * On the return most *optlenp bytes are copied to optval.
4700  */
4701 int
4702 sotpi_getsockopt(struct sonode *so, int level, int option_name,
4703 		void *optval, socklen_t *optlenp, int flags)
4704 {
4705 	struct T_optmgmt_req	optmgmt_req;
4706 	struct T_optmgmt_ack	*optmgmt_ack;
4707 	struct opthdr		oh;
4708 	struct opthdr		*opt_res;
4709 	mblk_t			*mp = NULL;
4710 	int			error = 0;
4711 	void			*option = NULL;	/* Set if fallback value */
4712 	t_uscalar_t		maxlen = *optlenp;
4713 	t_uscalar_t		len;
4714 	uint32_t		value;
4715 
4716 	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
4717 			so, level, option_name, optval, optlenp,
4718 			pr_state(so->so_state, so->so_mode)));
4719 
4720 	mutex_enter(&so->so_lock);
4721 	so_lock_single(so);	/* Set SOLOCKED */
4722 
4723 	/*
4724 	 * Check for SOL_SOCKET options.
4725 	 * Certain SOL_SOCKET options are returned directly whereas
4726 	 * others only provide a default (fallback) value should
4727 	 * the T_SVR4_OPTMGMT_REQ fail.
4728 	 */
4729 	if (level == SOL_SOCKET) {
4730 		/* Check parameters */
4731 		switch (option_name) {
4732 		case SO_TYPE:
4733 		case SO_ERROR:
4734 		case SO_DEBUG:
4735 		case SO_ACCEPTCONN:
4736 		case SO_REUSEADDR:
4737 		case SO_KEEPALIVE:
4738 		case SO_DONTROUTE:
4739 		case SO_BROADCAST:
4740 		case SO_USELOOPBACK:
4741 		case SO_OOBINLINE:
4742 		case SO_SNDBUF:
4743 		case SO_RCVBUF:
4744 #ifdef notyet
4745 		case SO_SNDLOWAT:
4746 		case SO_RCVLOWAT:
4747 		case SO_SNDTIMEO:
4748 		case SO_RCVTIMEO:
4749 #endif /* notyet */
4750 		case SO_DGRAM_ERRIND:
4751 			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
4752 				error = EINVAL;
4753 				eprintsoline(so, error);
4754 				goto done2;
4755 			}
4756 			break;
4757 		case SO_LINGER:
4758 			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
4759 				error = EINVAL;
4760 				eprintsoline(so, error);
4761 				goto done2;
4762 			}
4763 			break;
4764 		}
4765 
4766 		len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
4767 
4768 		switch (option_name) {
4769 		case SO_TYPE:
4770 			value = so->so_type;
4771 			option = &value;
4772 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4773 
4774 		case SO_ERROR:
4775 			value = sogeterr(so);
4776 			option = &value;
4777 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4778 
4779 		case SO_ACCEPTCONN:
4780 			if (so->so_state & SS_ACCEPTCONN)
4781 				value = SO_ACCEPTCONN;
4782 			else
4783 				value = 0;
4784 #ifdef DEBUG
4785 			if (value) {
4786 				dprintso(so, 1,
4787 				    ("sotpi_getsockopt: 0x%x is set\n",
4788 				    option_name));
4789 			} else {
4790 				dprintso(so, 1,
4791 				    ("sotpi_getsockopt: 0x%x not set\n",
4792 				    option_name));
4793 			}
4794 #endif /* DEBUG */
4795 			option = &value;
4796 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4797 
4798 		case SO_DEBUG:
4799 		case SO_REUSEADDR:
4800 		case SO_KEEPALIVE:
4801 		case SO_DONTROUTE:
4802 		case SO_BROADCAST:
4803 		case SO_USELOOPBACK:
4804 		case SO_OOBINLINE:
4805 		case SO_DGRAM_ERRIND:
4806 			value = (so->so_options & option_name);
4807 #ifdef DEBUG
4808 			if (value) {
4809 				dprintso(so, 1,
4810 				    ("sotpi_getsockopt: 0x%x is set\n",
4811 				    option_name));
4812 			} else {
4813 				dprintso(so, 1,
4814 				    ("sotpi_getsockopt: 0x%x not set\n",
4815 				    option_name));
4816 			}
4817 #endif /* DEBUG */
4818 			option = &value;
4819 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4820 
4821 		/*
4822 		 * The following options are only returned by sockfs when the
4823 		 * T_SVR4_OPTMGMT_REQ fails.
4824 		 */
4825 		case SO_LINGER:
4826 			option = &so->so_linger;
4827 			len = (t_uscalar_t)sizeof (struct linger);
4828 			break;
4829 		case SO_SNDBUF: {
4830 			ssize_t lvalue;
4831 
4832 			/*
4833 			 * If the option has not been set then get a default
4834 			 * value from the read queue. This value is
4835 			 * returned if the transport fails
4836 			 * the T_SVR4_OPTMGMT_REQ.
4837 			 */
4838 			lvalue = so->so_sndbuf;
4839 			if (lvalue == 0) {
4840 				mutex_exit(&so->so_lock);
4841 				(void) strqget(strvp2wq(SOTOV(so))->q_next,
4842 						QHIWAT, 0, &lvalue);
4843 				mutex_enter(&so->so_lock);
4844 				dprintso(so, 1,
4845 				    ("got SO_SNDBUF %ld from q\n", lvalue));
4846 			}
4847 			value = (int)lvalue;
4848 			option = &value;
4849 			len = (t_uscalar_t)sizeof (so->so_sndbuf);
4850 			break;
4851 		}
4852 		case SO_RCVBUF: {
4853 			ssize_t lvalue;
4854 
4855 			/*
4856 			 * If the option has not been set then get a default
4857 			 * value from the read queue. This value is
4858 			 * returned if the transport fails
4859 			 * the T_SVR4_OPTMGMT_REQ.
4860 			 *
4861 			 * XXX If SO_RCVBUF has been set and this is an
4862 			 * XPG 4.2 application then do not ask the transport
4863 			 * since the transport might adjust the value and not
4864 			 * return exactly what was set by the application.
4865 			 * For non-XPG 4.2 application we return the value
4866 			 * that the transport is actually using.
4867 			 */
4868 			lvalue = so->so_rcvbuf;
4869 			if (lvalue == 0) {
4870 				mutex_exit(&so->so_lock);
4871 				(void) strqget(RD(strvp2wq(SOTOV(so))),
4872 						QHIWAT, 0, &lvalue);
4873 				mutex_enter(&so->so_lock);
4874 				dprintso(so, 1,
4875 				    ("got SO_RCVBUF %ld from q\n", lvalue));
4876 			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
4877 				value = (int)lvalue;
4878 				option = &value;
4879 				goto copyout;	/* skip asking transport */
4880 			}
4881 			value = (int)lvalue;
4882 			option = &value;
4883 			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
4884 			break;
4885 		}
4886 #ifdef notyet
4887 		/*
4888 		 * We do not implement the semantics of these options
4889 		 * thus we shouldn't implement the options either.
4890 		 */
4891 		case SO_SNDLOWAT:
4892 			value = so->so_sndlowat;
4893 			option = &value;
4894 			break;
4895 		case SO_RCVLOWAT:
4896 			value = so->so_rcvlowat;
4897 			option = &value;
4898 			break;
4899 		case SO_SNDTIMEO:
4900 			value = so->so_sndtimeo;
4901 			option = &value;
4902 			break;
4903 		case SO_RCVTIMEO:
4904 			value = so->so_rcvtimeo;
4905 			option = &value;
4906 			break;
4907 #endif /* notyet */
4908 		}
4909 	}
4910 
4911 	mutex_exit(&so->so_lock);
4912 
4913 	/* Send request */
4914 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
4915 	optmgmt_req.MGMT_flags = T_CHECK;
4916 	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
4917 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
4918 
4919 	oh.level = level;
4920 	oh.name = option_name;
4921 	oh.len = maxlen;
4922 
4923 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
4924 	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP);
4925 	/* Let option management work in the presence of data flow control */
4926 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
4927 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
4928 	mp = NULL;
4929 	mutex_enter(&so->so_lock);
4930 	if (error) {
4931 		eprintsoline(so, error);
4932 		goto done2;
4933 	}
4934 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
4935 	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
4936 	if (error) {
4937 		if (option != NULL) {
4938 			/* We have a fallback value */
4939 			error = 0;
4940 			goto copyout;
4941 		}
4942 		eprintsoline(so, error);
4943 		goto done2;
4944 	}
4945 	ASSERT(mp);
4946 	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
4947 	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
4948 			optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
4949 	if (opt_res == NULL) {
4950 		if (option != NULL) {
4951 			/* We have a fallback value */
4952 			error = 0;
4953 			goto copyout;
4954 		}
4955 		error = EPROTO;
4956 		eprintsoline(so, error);
4957 		goto done;
4958 	}
4959 	option = &opt_res[1];
4960 
4961 	/* check to ensure that the option is within bounds */
4962 	if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
4963 		(uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
4964 		if (option != NULL) {
4965 			/* We have a fallback value */
4966 			error = 0;
4967 			goto copyout;
4968 		}
4969 		error = EPROTO;
4970 		eprintsoline(so, error);
4971 		goto done;
4972 	}
4973 
4974 	len = opt_res->len;
4975 
4976 copyout: {
4977 		t_uscalar_t size = MIN(len, maxlen);
4978 		bcopy(option, optval, size);
4979 		bcopy(&size, optlenp, sizeof (size));
4980 	}
4981 done:
4982 	freemsg(mp);
4983 done2:
4984 	so_unlock_single(so, SOLOCKED);
4985 	mutex_exit(&so->so_lock);
4986 	return (error);
4987 }
4988 
4989 /*
4990  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
4991  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
4992  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
4993  * setsockopt has to work even if the transport does not support the option.
4994  */
4995 int
4996 sotpi_setsockopt(struct sonode *so, int level, int option_name,
4997 	const void *optval, t_uscalar_t optlen)
4998 {
4999 	struct T_optmgmt_req	optmgmt_req;
5000 	struct opthdr		oh;
5001 	mblk_t			*mp;
5002 	int			error = 0;
5003 	boolean_t		handled = B_FALSE;
5004 
5005 	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5006 			so, level, option_name, optval, optlen,
5007 			pr_state(so->so_state, so->so_mode)));
5008 
5009 
5010 	/* X/Open requires this check */
5011 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5012 		if (xnet_check_print)
5013 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
5014 		return (EINVAL);
5015 	}
5016 
5017 	/* Caller allocates aligned optval, or passes null */
5018 	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
5019 	/* If optval is null optlen is 0, and vice-versa */
5020 	ASSERT(optval != NULL || optlen == 0);
5021 	ASSERT(optlen != 0 || optval == NULL);
5022 
5023 	mutex_enter(&so->so_lock);
5024 	so_lock_single(so);	/* Set SOLOCKED */
5025 	mutex_exit(&so->so_lock);
5026 
5027 	/*
5028 	 * For SOCKET or TCP level options, try to set it here itself
5029 	 * provided socket has not been popped and we know the tcp
5030 	 * structure (stored in so_priv).
5031 	 */
5032 	if ((level == SOL_SOCKET || level == IPPROTO_TCP) &&
5033 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
5034 	    (so->so_version == SOV_SOCKSTREAM) && (so->so_priv != NULL)) {
5035 		tcp_t		*tcp = so->so_priv;
5036 		boolean_t	onoff;
5037 
5038 #define	intvalue	(*(int32_t *)optval)
5039 
5040 		switch (level) {
5041 		case SOL_SOCKET:
5042 			switch (option_name) {		/* Check length param */
5043 			case SO_DEBUG:
5044 			case SO_REUSEADDR:
5045 			case SO_DONTROUTE:
5046 			case SO_BROADCAST:
5047 			case SO_USELOOPBACK:
5048 			case SO_OOBINLINE:
5049 			case SO_DGRAM_ERRIND:
5050 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5051 					error = EINVAL;
5052 					eprintsoline(so, error);
5053 					mutex_enter(&so->so_lock);
5054 					goto done2;
5055 				}
5056 				ASSERT(optval);
5057 				onoff = intvalue != 0;
5058 				handled = B_TRUE;
5059 				break;
5060 			case SO_LINGER:
5061 				if (optlen !=
5062 				    (t_uscalar_t)sizeof (struct linger)) {
5063 					error = EINVAL;
5064 					eprintsoline(so, error);
5065 					mutex_enter(&so->so_lock);
5066 					goto done2;
5067 				}
5068 				ASSERT(optval);
5069 				handled = B_TRUE;
5070 				break;
5071 			}
5072 
5073 			switch (option_name) {			/* Do actions */
5074 			case SO_LINGER: {
5075 				struct linger *lgr = (struct linger *)optval;
5076 
5077 				if (lgr->l_onoff) {
5078 					tcp->tcp_linger = 1;
5079 					tcp->tcp_lingertime = lgr->l_linger;
5080 					so->so_linger.l_onoff = SO_LINGER;
5081 					so->so_options |= SO_LINGER;
5082 				} else {
5083 					tcp->tcp_linger = 0;
5084 					tcp->tcp_lingertime = 0;
5085 					so->so_linger.l_onoff = 0;
5086 					so->so_options &= ~SO_LINGER;
5087 				}
5088 				so->so_linger.l_linger = lgr->l_linger;
5089 				handled = B_TRUE;
5090 				break;
5091 			}
5092 			case SO_DEBUG:
5093 				tcp->tcp_debug = onoff;
5094 #ifdef SOCK_TEST
5095 				if (intvalue & 2)
5096 					sock_test_timelimit = 10 * hz;
5097 				else
5098 					sock_test_timelimit = 0;
5099 
5100 				if (intvalue & 4)
5101 					do_useracc = 0;
5102 				else
5103 					do_useracc = 1;
5104 #endif /* SOCK_TEST */
5105 				break;
5106 			case SO_DONTROUTE:
5107 				/*
5108 				 * SO_DONTROUTE, SO_USELOOPBACK and
5109 				 * SO_BROADCAST are only of interest to IP.
5110 				 * We track them here only so
5111 				 * that we can report their current value.
5112 				 */
5113 				tcp->tcp_dontroute = onoff;
5114 				if (onoff)
5115 					so->so_options |= option_name;
5116 				else
5117 					so->so_options &= ~option_name;
5118 				break;
5119 			case SO_USELOOPBACK:
5120 				tcp->tcp_useloopback = onoff;
5121 				if (onoff)
5122 					so->so_options |= option_name;
5123 				else
5124 					so->so_options &= ~option_name;
5125 				break;
5126 			case SO_BROADCAST:
5127 				tcp->tcp_broadcast = onoff;
5128 				if (onoff)
5129 					so->so_options |= option_name;
5130 				else
5131 					so->so_options &= ~option_name;
5132 				break;
5133 			case SO_REUSEADDR:
5134 				tcp->tcp_reuseaddr = onoff;
5135 				if (onoff)
5136 					so->so_options |= option_name;
5137 				else
5138 					so->so_options &= ~option_name;
5139 				break;
5140 			case SO_OOBINLINE:
5141 				tcp->tcp_oobinline = onoff;
5142 				if (onoff)
5143 					so->so_options |= option_name;
5144 				else
5145 					so->so_options &= ~option_name;
5146 				break;
5147 			case SO_DGRAM_ERRIND:
5148 				tcp->tcp_dgram_errind = onoff;
5149 				if (onoff)
5150 					so->so_options |= option_name;
5151 				else
5152 					so->so_options &= ~option_name;
5153 				break;
5154 			}
5155 			break;
5156 		case IPPROTO_TCP:
5157 			switch (option_name) {
5158 			case TCP_NODELAY:
5159 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5160 					error = EINVAL;
5161 					eprintsoline(so, error);
5162 					mutex_enter(&so->so_lock);
5163 					goto done2;
5164 				}
5165 				ASSERT(optval);
5166 				tcp->tcp_naglim = intvalue ? 1 : tcp->tcp_mss;
5167 				handled = B_TRUE;
5168 				break;
5169 			}
5170 			break;
5171 		default:
5172 			handled = B_FALSE;
5173 			break;
5174 		}
5175 	}
5176 
5177 	if (handled) {
5178 		mutex_enter(&so->so_lock);
5179 		goto done2;
5180 	}
5181 
5182 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5183 	optmgmt_req.MGMT_flags = T_NEGOTIATE;
5184 	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5185 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5186 
5187 	oh.level = level;
5188 	oh.name = option_name;
5189 	oh.len = optlen;
5190 
5191 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5192 	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP);
5193 	/* Let option management work in the presence of data flow control */
5194 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5195 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5196 	mp = NULL;
5197 	mutex_enter(&so->so_lock);
5198 	if (error) {
5199 		eprintsoline(so, error);
5200 		goto done;
5201 	}
5202 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5203 	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5204 	if (error) {
5205 		eprintsoline(so, error);
5206 		goto done;
5207 	}
5208 	ASSERT(mp);
5209 	/* No need to verify T_optmgmt_ack */
5210 	freemsg(mp);
5211 done:
5212 	/*
5213 	 * Check for SOL_SOCKET options and record their values.
5214 	 * If we know about a SOL_SOCKET parameter and the transport
5215 	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5216 	 * EPROTO) we let the setsockopt succeed.
5217 	 */
5218 	if (level == SOL_SOCKET) {
5219 		/* Check parameters */
5220 		switch (option_name) {
5221 		case SO_DEBUG:
5222 		case SO_REUSEADDR:
5223 		case SO_KEEPALIVE:
5224 		case SO_DONTROUTE:
5225 		case SO_BROADCAST:
5226 		case SO_USELOOPBACK:
5227 		case SO_OOBINLINE:
5228 		case SO_SNDBUF:
5229 		case SO_RCVBUF:
5230 #ifdef notyet
5231 		case SO_SNDLOWAT:
5232 		case SO_RCVLOWAT:
5233 		case SO_SNDTIMEO:
5234 		case SO_RCVTIMEO:
5235 #endif /* notyet */
5236 		case SO_DGRAM_ERRIND:
5237 			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5238 				error = EINVAL;
5239 				eprintsoline(so, error);
5240 				goto done2;
5241 			}
5242 			ASSERT(optval);
5243 			handled = B_TRUE;
5244 			break;
5245 		case SO_LINGER:
5246 			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5247 				error = EINVAL;
5248 				eprintsoline(so, error);
5249 				goto done2;
5250 			}
5251 			ASSERT(optval);
5252 			handled = B_TRUE;
5253 			break;
5254 		}
5255 
5256 #define	intvalue	(*(int32_t *)optval)
5257 
5258 		switch (option_name) {
5259 		case SO_TYPE:
5260 		case SO_ERROR:
5261 		case SO_ACCEPTCONN:
5262 			/* Can't be set */
5263 			error = ENOPROTOOPT;
5264 			goto done2;
5265 		case SO_LINGER: {
5266 			struct linger *l = (struct linger *)optval;
5267 
5268 			so->so_linger.l_linger = l->l_linger;
5269 			if (l->l_onoff) {
5270 				so->so_linger.l_onoff = SO_LINGER;
5271 				so->so_options |= SO_LINGER;
5272 			} else {
5273 				so->so_linger.l_onoff = 0;
5274 				so->so_options &= ~SO_LINGER;
5275 			}
5276 			break;
5277 		}
5278 
5279 		case SO_DEBUG:
5280 #ifdef SOCK_TEST
5281 			if (intvalue & 2)
5282 				sock_test_timelimit = 10 * hz;
5283 			else
5284 				sock_test_timelimit = 0;
5285 
5286 			if (intvalue & 4)
5287 				do_useracc = 0;
5288 			else
5289 				do_useracc = 1;
5290 #endif /* SOCK_TEST */
5291 			/* FALLTHRU */
5292 		case SO_REUSEADDR:
5293 		case SO_KEEPALIVE:
5294 		case SO_DONTROUTE:
5295 		case SO_BROADCAST:
5296 		case SO_USELOOPBACK:
5297 		case SO_OOBINLINE:
5298 		case SO_DGRAM_ERRIND:
5299 			if (intvalue != 0) {
5300 				dprintso(so, 1,
5301 					("sotpi_setsockopt: setting 0x%x\n",
5302 					option_name));
5303 				so->so_options |= option_name;
5304 			} else {
5305 				dprintso(so, 1,
5306 					("sotpi_setsockopt: clearing 0x%x\n",
5307 					option_name));
5308 				so->so_options &= ~option_name;
5309 			}
5310 			break;
5311 		/*
5312 		 * The following options are only returned by us when the
5313 		 * T_SVR4_OPTMGMT_REQ fails.
5314 		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5315 		 * since the transport might adjust the value and not
5316 		 * return exactly what was set by the application.
5317 		 */
5318 		case SO_SNDBUF:
5319 			so->so_sndbuf = intvalue;
5320 			break;
5321 		case SO_RCVBUF:
5322 			so->so_rcvbuf = intvalue;
5323 			break;
5324 #ifdef notyet
5325 		/*
5326 		 * We do not implement the semantics of these options
5327 		 * thus we shouldn't implement the options either.
5328 		 */
5329 		case SO_SNDLOWAT:
5330 			so->so_sndlowat = intvalue;
5331 			break;
5332 		case SO_RCVLOWAT:
5333 			so->so_rcvlowat = intvalue;
5334 			break;
5335 		case SO_SNDTIMEO:
5336 			so->so_sndtimeo = intvalue;
5337 			break;
5338 		case SO_RCVTIMEO:
5339 			so->so_rcvtimeo = intvalue;
5340 			break;
5341 #endif /* notyet */
5342 		}
5343 #undef	intvalue
5344 
5345 		if (error) {
5346 			if ((error == ENOPROTOOPT || error == EPROTO ||
5347 			    error == EINVAL) && handled) {
5348 				dprintso(so, 1,
5349 				    ("setsockopt: ignoring error %d for 0x%x\n",
5350 				    error, option_name));
5351 				error = 0;
5352 			}
5353 		}
5354 	}
5355 done2:
5356 ret:
5357 	so_unlock_single(so, SOLOCKED);
5358 	mutex_exit(&so->so_lock);
5359 	return (error);
5360 }
5361