xref: /illumos-gate/usr/src/uts/common/fs/sockfs/socktpi.c (revision 032624d56c174c5c55126582b32e314a6af15522)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/t_lock.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/buf.h>
34 #include <sys/conf.h>
35 #include <sys/cred.h>
36 #include <sys/kmem.h>
37 #include <sys/sysmacros.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/debug.h>
41 #include <sys/errno.h>
42 #include <sys/time.h>
43 #include <sys/file.h>
44 #include <sys/open.h>
45 #include <sys/user.h>
46 #include <sys/termios.h>
47 #include <sys/stream.h>
48 #include <sys/strsubr.h>
49 #include <sys/strsun.h>
50 #include <sys/ddi.h>
51 #include <sys/esunddi.h>
52 #include <sys/flock.h>
53 #include <sys/modctl.h>
54 #include <sys/vtrace.h>
55 #include <sys/cmn_err.h>
56 #include <sys/pathname.h>
57 
58 #include <sys/socket.h>
59 #include <sys/socketvar.h>
60 #include <netinet/in.h>
61 #include <sys/un.h>
62 #include <sys/strsun.h>
63 
64 #include <sys/tiuser.h>
65 #define	_SUN_TPI_VERSION	2
66 #include <sys/tihdr.h>
67 #include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
68 
69 #include <c2/audit.h>
70 
71 #include <inet/common.h>
72 #include <inet/ip.h>
73 #include <inet/ip6.h>
74 #include <inet/tcp.h>
75 
76 #include <fs/sockfs/nl7c.h>
77 #include <sys/zone.h>
78 
79 /*
80  * Possible failures when memory can't be allocated. The documented behavior:
81  *
82  * 		5.5:			4.X:		XNET:
83  * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
84  *							EINTR
85  *	(4.X does not document EINTR but returns it)
86  * bind:	ENOSR			-		ENOBUFS/ENOSR
87  * connect: 	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
88  * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
89  * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
90  *	(4.X getpeername and getsockname do not fail in practice)
91  * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
92  * listen:	-			-		ENOBUFS
93  * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
94  *							EINTR
95  * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
96  *							EINTR
97  * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
98  * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
99  * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
100  * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
101  *
102  * Resolution. When allocation fails:
103  *	recv: return EINTR
104  *	send: return EINTR
105  *	connect, accept: EINTR
106  *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
107  *	socket, socketpair: ENOBUFS
108  *	getpeername, getsockname: sleep
109  *	getsockopt, setsockopt: sleep
110  */
111 
/*
 * Knobs that make sockfs deviate from the standard TPI exchange when
 * talking to the AF_INET transports.  In a regular build every knob is a
 * compile-time constant; when SOCK_TEST is defined they become global
 * variables so that a test kernel can change them.
 *
 * solisten_tpi_tcp:
 *	TCP can handle a O_T_BIND_REQ with an increased backlog even though
 *	the transport is already bound. This is needed to avoid losing the
 *	port number should listen() do a T_UNBIND_REQ followed by a
 *	O_T_BIND_REQ.
 *
 * soconnect_tpi_udp:
 *	UDP and ICMP can handle a T_CONN_REQ.
 *	This is needed so that connect() followed by getsockname() returns
 *	the local IP address used to send packets to the connected
 *	destination.
 *
 * soconnect_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
 *	Set this to non-zero to send TPI conformant messages to TCP in this
 *	respect. This is a performance optimization.
 *
 * soaccept_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without the acceptor being bound.
 *	This is a performance optimization that has been picked up in XTI.
 *
 * soaccept_tpi_multioptions:
 *	When inheriting SOL_SOCKET options from the listener to the accepting
 *	socket, send them as a single message for AF_INET{,6}.
 */
#ifndef SOCK_TEST

#define	solisten_tpi_tcp		0
#define	soconnect_tpi_udp		0
#define	soconnect_tpi_tcp		0
#define	soaccept_tpi_tcp		0
#define	soaccept_tpi_multioptions	1

#else /* SOCK_TEST */

int solisten_tpi_tcp = 0;
int soconnect_tpi_udp = 0;
int soconnect_tpi_tcp = 0;
int soaccept_tpi_tcp = 0;
int soaccept_tpi_multioptions = 1;

/* Further test-only hooks; both are defined outside this file section. */
extern int do_useracc;
extern clock_t sock_test_timelimit;

#endif /* SOCK_TEST */
159 
/*
 * Some of the checks added for X/Open conformance might have to be backed
 * out to keep SunOS 4.X applications working.  The flags below control
 * and trace those checks; all of them default to off.
 */

/* Set to non-zero to disable the X/Open added state checks. */
int xnet_skip_checks = 0;

/* Set to non-zero to print a message when an X/Open check fails a call. */
int xnet_check_print = 0;

/*
 * NOTE(review): not referenced in this part of the file; judging by the
 * name it enables a message when something is truncated - confirm at the
 * point of use.
 */
int xnet_truncate_print = 0;
167 
168 extern	void sigintr(k_sigset_t *, int);
169 extern	void sigunintr(k_sigset_t *);
170 
171 extern	void *nl7c_lookup_addr(void *, t_uscalar_t);
172 extern	void *nl7c_add_addr(void *, t_uscalar_t);
173 extern	void nl7c_listener_addr(void *, queue_t *);
174 
175 static int	sotpi_unbind(struct sonode *, int);
176 
177 /* TPI sockfs sonode operations */
178 static int	sotpi_accept(struct sonode *, int, struct sonode **);
179 static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
180 		    int);
181 static int	sotpi_connect(struct sonode *, const struct sockaddr *,
182 		    socklen_t, int, int);
183 static int	sotpi_listen(struct sonode *, int);
184 static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
185 		    struct uio *);
186 static int	sotpi_shutdown(struct sonode *, int);
187 static int	sotpi_getsockname(struct sonode *);
188 
189 sonodeops_t sotpi_sonodeops = {
190 	sotpi_accept,		/* sop_accept		*/
191 	sotpi_bind,		/* sop_bind		*/
192 	sotpi_listen,		/* sop_listen		*/
193 	sotpi_connect,		/* sop_connect		*/
194 	sotpi_recvmsg,		/* sop_recvmsg		*/
195 	sotpi_sendmsg,		/* sop_sendmsg		*/
196 	sotpi_getpeername,	/* sop_getpeername	*/
197 	sotpi_getsockname,	/* sop_getsockname	*/
198 	sotpi_shutdown,		/* sop_shutdown		*/
199 	sotpi_getsockopt,	/* sop_getsockopt	*/
200 	sotpi_setsockopt	/* sop_setsockopt	*/
201 };
202 
203 /*
204  * Common create code for socket and accept. If tso is set the values
205  * from that node is used instead of issuing a T_INFO_REQ.
206  *
207  * Assumes that the caller has a VN_HOLD on accessvp.
208  * The VN_RELE will occur either when sotpi_create() fails or when
209  * the returned sonode is freed.
210  */
211 struct sonode *
212 sotpi_create(vnode_t *accessvp, int domain, int type, int protocol, int version,
213     struct sonode *tso, int *errorp)
214 {
215 	struct sonode	*so;
216 	vnode_t		*vp;
217 	int		flags, error;
218 
219 	ASSERT(accessvp != NULL);
220 	vp = makesockvp(accessvp, domain, type, protocol);
221 	ASSERT(vp != NULL);
222 	so = VTOSO(vp);
223 
224 	flags = FREAD|FWRITE;
225 	if (tso != NULL) {
226 		if ((tso->so_state & (SS_TCP_FAST_ACCEPT)) != 0) {
227 			flags |= SO_ACCEPTOR|SO_SOCKSTR;
228 			so->so_state |= SS_TCP_FAST_ACCEPT;
229 		}
230 	} else {
231 		if ((so->so_type == SOCK_STREAM) &&
232 		    (so->so_family == AF_INET || so->so_family == AF_INET6)) {
233 			flags |= SO_SOCKSTR;
234 			so->so_state |= SS_TCP_FAST_ACCEPT;
235 		}
236 	}
237 
238 	/*
239 	 * Tell local transport that it is talking to sockets.
240 	 */
241 	if (so->so_family == AF_UNIX) {
242 		flags |= SO_SOCKSTR;
243 	}
244 
245 	if (error = socktpi_open(&vp, flags, CRED())) {
246 		VN_RELE(vp);
247 		*errorp = error;
248 		return (NULL);
249 	}
250 
251 	if (error = so_strinit(so, tso)) {
252 		(void) VOP_CLOSE(vp, 0, 1, 0, CRED());
253 		VN_RELE(vp);
254 		*errorp = error;
255 		return (NULL);
256 	}
257 
258 	if (version == SOV_DEFAULT)
259 		version = so_default_version;
260 
261 	so->so_version = (short)version;
262 	return (so);
263 }
264 
265 /*
266  * Bind the socket to an unspecified address in sockfs only.
267  * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
268  * required in all cases.
269  */
270 static void
271 so_automatic_bind(struct sonode *so)
272 {
273 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
274 
275 	ASSERT(MUTEX_HELD(&so->so_lock));
276 	ASSERT(!(so->so_state & SS_ISBOUND));
277 	ASSERT(so->so_unbind_mp);
278 
279 	ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
280 	bzero(so->so_laddr_sa, so->so_laddr_len);
281 	so->so_laddr_sa->sa_family = so->so_family;
282 	so->so_state |= SS_ISBOUND;
283 }
284 
285 
286 /*
287  * bind the socket.
288  *
289  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
290  * are passed in we allow rebinding. Note that for backwards compatibility
291  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
292  * Thus the rebinding code is currently not executed.
293  *
294  * The constraints for rebinding are:
295  * - it is a SOCK_DGRAM, or
296  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
297  *   and no listen() has been done.
298  * This rebinding code was added based on some language in the XNET book
299  * about not returning EINVAL if the protocol allows rebinding. However,
300  * this language is not present in the Posix socket draft. Thus maybe the
301  * rebinding logic should be deleted from the source.
302  *
303  * A null "name" can be used to unbind the socket if:
304  * - it is a SOCK_DGRAM, or
305  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
306  *   and no listen() has been done.
307  */
308 static int
309 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
310     socklen_t namelen, int backlog, int flags)
311 {
312 	struct T_bind_req	bind_req;
313 	struct T_bind_ack	*bind_ack;
314 	int			error = 0;
315 	mblk_t			*mp;
316 	void			*addr;
317 	t_uscalar_t		addrlen;
318 	int			unbind_on_err = 1;
319 	boolean_t		clear_acceptconn_on_err = B_FALSE;
320 	boolean_t		restore_backlog_on_err = B_FALSE;
321 	int			save_so_backlog;
322 	t_scalar_t		PRIM_type = O_T_BIND_REQ;
323 	boolean_t		tcp_udp_xport;
324 	void			*nl7c = NULL;
325 
326 	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
327 		so, name, namelen, backlog, flags,
328 		pr_state(so->so_state, so->so_mode)));
329 
330 	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
331 
332 	if (!(flags & _SOBIND_LOCK_HELD)) {
333 		mutex_enter(&so->so_lock);
334 		so_lock_single(so);	/* Set SOLOCKED */
335 	} else {
336 		ASSERT(MUTEX_HELD(&so->so_lock));
337 		ASSERT(so->so_flag & SOLOCKED);
338 	}
339 
340 	/*
341 	 * Make sure that there is a preallocated unbind_req message
342 	 * before binding. This message allocated when the socket is
343 	 * created  but it might be have been consumed.
344 	 */
345 	if (so->so_unbind_mp == NULL) {
346 		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
347 		/* NOTE: holding so_lock while sleeping */
348 		so->so_unbind_mp =
349 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP);
350 	}
351 
352 	if (flags & _SOBIND_REBIND) {
353 		/*
354 		 * Called from solisten after doing an sotpi_unbind() or
355 		 * potentially without the unbind (latter for AF_INET{,6}).
356 		 */
357 		ASSERT(name == NULL && namelen == 0);
358 
359 		if (so->so_family == AF_UNIX) {
360 			ASSERT(so->so_ux_bound_vp);
361 			addr = &so->so_ux_laddr;
362 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
363 			dprintso(so, 1,
364 			("sobind rebind UNIX: addrlen %d, addr 0x%p, vp %p\n",
365 			    addrlen,
366 			    ((struct so_ux_addr *)addr)->soua_vp,
367 			    so->so_ux_bound_vp));
368 		} else {
369 			addr = so->so_laddr_sa;
370 			addrlen = (t_uscalar_t)so->so_laddr_len;
371 		}
372 	} else if (flags & _SOBIND_UNSPEC) {
373 		ASSERT(name == NULL && namelen == 0);
374 
375 		/*
376 		 * The caller checked SS_ISBOUND but not necessarily
377 		 * under so_lock
378 		 */
379 		if (so->so_state & SS_ISBOUND) {
380 			/* No error */
381 			goto done;
382 		}
383 
384 		/* Set an initial local address */
385 		switch (so->so_family) {
386 		case AF_UNIX:
387 			/*
388 			 * Use an address with same size as struct sockaddr
389 			 * just like BSD.
390 			 */
391 			so->so_laddr_len =
392 				(socklen_t)sizeof (struct sockaddr);
393 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
394 			bzero(so->so_laddr_sa, so->so_laddr_len);
395 			so->so_laddr_sa->sa_family = so->so_family;
396 
397 			/*
398 			 * Pass down an address with the implicit bind
399 			 * magic number and the rest all zeros.
400 			 * The transport will return a unique address.
401 			 */
402 			so->so_ux_laddr.soua_vp = NULL;
403 			so->so_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
404 			addr = &so->so_ux_laddr;
405 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
406 			break;
407 
408 		case AF_INET:
409 		case AF_INET6:
410 			/*
411 			 * An unspecified bind in TPI has a NULL address.
412 			 * Set the address in sockfs to have the sa_family.
413 			 */
414 			so->so_laddr_len = (so->so_family == AF_INET) ?
415 			    (socklen_t)sizeof (sin_t) :
416 			    (socklen_t)sizeof (sin6_t);
417 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
418 			bzero(so->so_laddr_sa, so->so_laddr_len);
419 			so->so_laddr_sa->sa_family = so->so_family;
420 			addr = NULL;
421 			addrlen = 0;
422 			break;
423 
424 		default:
425 			/*
426 			 * An unspecified bind in TPI has a NULL address.
427 			 * Set the address in sockfs to be zero length.
428 			 *
429 			 * Can not assume there is a sa_family for all
430 			 * protocol families. For example, AF_X25 does not
431 			 * have a family field.
432 			 */
433 			so->so_laddr_len = 0;	/* XXX correct? */
434 			bzero(so->so_laddr_sa, so->so_laddr_len);
435 			addr = NULL;
436 			addrlen = 0;
437 			break;
438 		}
439 
440 	} else {
441 		if (so->so_state & SS_ISBOUND) {
442 			/*
443 			 * If it is ok to rebind the socket, first unbind
444 			 * with the transport. A rebind to the NULL address
445 			 * is interpreted as an unbind.
446 			 * Note that a bind to NULL in BSD does unbind the
447 			 * socket but it fails with EINVAL.
448 			 * Note that regular sockets set SOV_SOCKBSD i.e.
449 			 * _SOBIND_SOCKBSD gets set here hence no type of
450 			 * socket does currently allow rebinding.
451 			 *
452 			 * If the name is NULL just do an unbind.
453 			 */
454 			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
455 			    name != NULL) {
456 				error = EINVAL;
457 				unbind_on_err = 0;
458 				eprintsoline(so, error);
459 				goto done;
460 			}
461 			if ((so->so_mode & SM_CONNREQUIRED) &&
462 			    (so->so_state & SS_CANTREBIND)) {
463 				error = EINVAL;
464 				unbind_on_err = 0;
465 				eprintsoline(so, error);
466 				goto done;
467 			}
468 			error = sotpi_unbind(so, 0);
469 			if (error) {
470 				eprintsoline(so, error);
471 				goto done;
472 			}
473 			ASSERT(!(so->so_state & SS_ISBOUND));
474 			if (name == NULL) {
475 				so->so_state &=
476 					~(SS_ISCONNECTED|SS_ISCONNECTING);
477 				goto done;
478 			}
479 		}
480 		/* X/Open requires this check */
481 		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
482 			if (xnet_check_print) {
483 				printf("sockfs: X/Open bind state check "
484 				    "caused EINVAL\n");
485 			}
486 			error = EINVAL;
487 			goto done;
488 		}
489 
490 		switch (so->so_family) {
491 		case AF_UNIX:
492 			/*
493 			 * All AF_UNIX addresses are nul terminated
494 			 * when copied (copyin_name) in so the minimum
495 			 * length is 3 bytes.
496 			 */
497 			if (name == NULL ||
498 			    (ssize_t)namelen <= sizeof (short) + 1) {
499 				error = EISDIR;
500 				eprintsoline(so, error);
501 				goto done;
502 			}
503 			/*
504 			 * Verify so_family matches the bound family.
505 			 * BSD does not check this for AF_UNIX resulting
506 			 * in funny mknods.
507 			 */
508 			if (name->sa_family != so->so_family) {
509 				error = EAFNOSUPPORT;
510 				goto done;
511 			}
512 			break;
513 		case AF_INET:
514 			if (name == NULL) {
515 				error = EINVAL;
516 				eprintsoline(so, error);
517 				goto done;
518 			}
519 			if ((size_t)namelen != sizeof (sin_t)) {
520 				error = name->sa_family != so->so_family ?
521 				    EAFNOSUPPORT : EINVAL;
522 				eprintsoline(so, error);
523 				goto done;
524 			}
525 			if ((flags & _SOBIND_XPG4_2) &&
526 			    (name->sa_family != so->so_family)) {
527 				/*
528 				 * This check has to be made for X/Open
529 				 * sockets however application failures have
530 				 * been observed when it is applied to
531 				 * all sockets.
532 				 */
533 				error = EAFNOSUPPORT;
534 				eprintsoline(so, error);
535 				goto done;
536 			}
537 			/*
538 			 * Force a zero sa_family to match so_family.
539 			 *
540 			 * Some programs like inetd(1M) don't set the
541 			 * family field. Other programs leave
542 			 * sin_family set to garbage - SunOS 4.X does
543 			 * not check the family field on a bind.
544 			 * We use the family field that
545 			 * was passed in to the socket() call.
546 			 */
547 			name->sa_family = so->so_family;
548 			break;
549 
550 		case AF_INET6: {
551 #ifdef DEBUG
552 			sin6_t *sin6 = (sin6_t *)name;
553 #endif /* DEBUG */
554 
555 			if (name == NULL) {
556 				error = EINVAL;
557 				eprintsoline(so, error);
558 				goto done;
559 			}
560 			if ((size_t)namelen != sizeof (sin6_t)) {
561 				error = name->sa_family != so->so_family ?
562 				    EAFNOSUPPORT : EINVAL;
563 				eprintsoline(so, error);
564 				goto done;
565 			}
566 			if (name->sa_family != so->so_family) {
567 				/*
568 				 * With IPv6 we require the family to match
569 				 * unlike in IPv4.
570 				 */
571 				error = EAFNOSUPPORT;
572 				eprintsoline(so, error);
573 				goto done;
574 			}
575 #ifdef DEBUG
576 			/*
577 			 * Verify that apps don't forget to clear
578 			 * sin6_scope_id etc
579 			 */
580 			if (sin6->sin6_scope_id != 0 &&
581 			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
582 				cmn_err(CE_WARN,
583 				    "bind with uninitialized sin6_scope_id "
584 				    "(%d) on socket. Pid = %d\n",
585 				    (int)sin6->sin6_scope_id,
586 				    (int)curproc->p_pid);
587 			}
588 			if (sin6->__sin6_src_id != 0) {
589 				cmn_err(CE_WARN,
590 				    "bind with uninitialized __sin6_src_id "
591 				    "(%d) on socket. Pid = %d\n",
592 				    (int)sin6->__sin6_src_id,
593 				    (int)curproc->p_pid);
594 			}
595 #endif /* DEBUG */
596 			break;
597 		}
598 		default:
599 			/*
600 			 * Don't do any length or sa_family check to allow
601 			 * non-sockaddr style addresses.
602 			 */
603 			if (name == NULL) {
604 				error = EINVAL;
605 				eprintsoline(so, error);
606 				goto done;
607 			}
608 			break;
609 		}
610 
611 		if (namelen > (t_uscalar_t)so->so_laddr_maxlen) {
612 			error = ENAMETOOLONG;
613 			eprintsoline(so, error);
614 			goto done;
615 		}
616 		/*
617 		 * Save local address.
618 		 */
619 		so->so_laddr_len = (socklen_t)namelen;
620 		ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
621 		bcopy(name, so->so_laddr_sa, namelen);
622 
623 		addr = so->so_laddr_sa;
624 		addrlen = (t_uscalar_t)so->so_laddr_len;
625 		switch (so->so_family) {
626 		case AF_INET6:
627 		case AF_INET:
628 			break;
629 		case AF_UNIX: {
630 			struct sockaddr_un *soun =
631 				(struct sockaddr_un *)so->so_laddr_sa;
632 			struct vnode *vp;
633 			struct vattr vattr;
634 
635 			ASSERT(so->so_ux_bound_vp == NULL);
636 			/*
637 			 * Create vnode for the specified path name.
638 			 * Keep vnode held with a reference in so_ux_bound_vp.
639 			 * Use the vnode pointer as the address used in the
640 			 * bind with the transport.
641 			 *
642 			 * Use the same mode as in BSD. In particular this does
643 			 * not observe the umask.
644 			 */
645 			/* MAXPATHLEN + soun_family + nul termination */
646 			if (so->so_laddr_len >
647 			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
648 				error = ENAMETOOLONG;
649 				eprintsoline(so, error);
650 				goto done;
651 			}
652 			vattr.va_type = VSOCK;
653 			vattr.va_mode = 0777 & ~u.u_cmask;
654 			vattr.va_mask = AT_TYPE|AT_MODE;
655 			/* NOTE: holding so_lock */
656 			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
657 						EXCL, 0, &vp, CRMKNOD, 0, 0);
658 			if (error) {
659 				if (error == EEXIST)
660 					error = EADDRINUSE;
661 				eprintsoline(so, error);
662 				goto done;
663 			}
664 			/*
665 			 * Establish pointer from the underlying filesystem
666 			 * vnode to the socket node.
667 			 * so_ux_bound_vp and v_stream->sd_vnode form the
668 			 * cross-linkage between the underlying filesystem
669 			 * node and the socket node.
670 			 */
671 			ASSERT(SOTOV(so)->v_stream);
672 			mutex_enter(&vp->v_lock);
673 			vp->v_stream = SOTOV(so)->v_stream;
674 			so->so_ux_bound_vp = vp;
675 			mutex_exit(&vp->v_lock);
676 
677 			/*
678 			 * Use the vnode pointer value as a unique address
679 			 * (together with the magic number to avoid conflicts
680 			 * with implicit binds) in the transport provider.
681 			 */
682 			so->so_ux_laddr.soua_vp = (void *)so->so_ux_bound_vp;
683 			so->so_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
684 			addr = &so->so_ux_laddr;
685 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
686 			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
687 			    addrlen,
688 			    ((struct so_ux_addr *)addr)->soua_vp));
689 			break;
690 		}
691 		} /* end switch (so->so_family) */
692 	}
693 
694 	/*
695 	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
696 	 * the transport can start passing up T_CONN_IND messages
697 	 * as soon as it receives the bind req and strsock_proto()
698 	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
699 	 */
700 	if (flags & _SOBIND_LISTEN) {
701 		if ((so->so_state & SS_ACCEPTCONN) == 0)
702 			clear_acceptconn_on_err = B_TRUE;
703 		save_so_backlog = so->so_backlog;
704 		restore_backlog_on_err = B_TRUE;
705 		so->so_state |= SS_ACCEPTCONN;
706 		so->so_backlog = backlog;
707 	}
708 
709 	/*
710 	 * If NL7C addr(s) have been configured check for addr/port match,
711 	 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
712 	 *
713 	 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
714 	 * family sockets only. If match mark as such.
715 	 */
716 	if ((nl7c_enabled && addr != NULL &&
717 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
718 	    (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
719 	    so->so_nl7c_flags == NL7C_AF_NCA) {
720 		/*
721 		 * NL7C is not supported in non-global zones,
722 		 * we enforce this restriction here.
723 		 */
724 		if (so->so_zoneid == GLOBAL_ZONEID) {
725 			/* An NL7C socket, mark it */
726 			so->so_nl7c_flags |= NL7C_ENABLED;
727 		} else
728 			nl7c = NULL;
729 	}
730 	/*
731 	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
732 	 * for other transports we will send in a O_T_BIND_REQ.
733 	 */
734 	if (tcp_udp_xport &&
735 	    (so->so_family == AF_INET || so->so_family == AF_INET6))
736 		PRIM_type = T_BIND_REQ;
737 
738 	bind_req.PRIM_type = PRIM_type;
739 	bind_req.ADDR_length = addrlen;
740 	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
741 	bind_req.CONIND_number = backlog;
742 	/* NOTE: holding so_lock while sleeping */
743 	mp = soallocproto2(&bind_req, sizeof (bind_req),
744 				addr, addrlen, 0, _ALLOC_SLEEP);
745 	so->so_state &= ~SS_LADDR_VALID;
746 	/* Done using so_laddr_sa - can drop the lock */
747 	mutex_exit(&so->so_lock);
748 
749 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
750 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
751 	if (error) {
752 		eprintsoline(so, error);
753 		mutex_enter(&so->so_lock);
754 		goto done;
755 	}
756 
757 	mutex_enter(&so->so_lock);
758 	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
759 	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
760 	if (error) {
761 		eprintsoline(so, error);
762 		goto done;
763 	}
764 	ASSERT(mp);
765 	/*
766 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
767 	 * strsock_proto while the lock was dropped above, the bind
768 	 * is allowed to complete.
769 	 */
770 
771 	/* Mark as bound. This will be undone if we detect errors below. */
772 	if (flags & _SOBIND_NOXLATE) {
773 		ASSERT(so->so_family == AF_UNIX);
774 		so->so_state |= SS_FADDR_NOXLATE;
775 	}
776 	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
777 	so->so_state |= SS_ISBOUND;
778 	ASSERT(so->so_unbind_mp);
779 
780 	/* note that we've already set SS_ACCEPTCONN above */
781 
782 	/*
783 	 * Recompute addrlen - an unspecied bind sent down an
784 	 * address of length zero but we expect the appropriate length
785 	 * in return.
786 	 */
787 	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
788 	    sizeof (so->so_ux_laddr) : so->so_laddr_len);
789 
790 	bind_ack = (struct T_bind_ack *)mp->b_rptr;
791 	/*
792 	 * The alignment restriction is really too strict but
793 	 * we want enough alignment to inspect the fields of
794 	 * a sockaddr_in.
795 	 */
796 	addr = sogetoff(mp, bind_ack->ADDR_offset,
797 			bind_ack->ADDR_length,
798 			__TPI_ALIGN_SIZE);
799 	if (addr == NULL) {
800 		freemsg(mp);
801 		error = EPROTO;
802 		eprintsoline(so, error);
803 		goto done;
804 	}
805 	if (!(flags & _SOBIND_UNSPEC)) {
806 		/*
807 		 * Verify that the transport didn't return something we
808 		 * did not want e.g. an address other than what we asked for.
809 		 *
810 		 * NOTE: These checks would go away if/when we switch to
811 		 * using the new TPI (in which the transport would fail
812 		 * the request instead of assigning a different address).
813 		 *
814 		 * NOTE2: For protocols that we don't know (i.e. any
815 		 * other than AF_INET6, AF_INET and AF_UNIX), we
816 		 * cannot know if the transport should be expected to
817 		 * return the same address as that requested.
818 		 *
819 		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
820 		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
821 		 *
822 		 * For example, in the case of netatalk it may be
823 		 * inappropriate for the transport to return the
824 		 * requested address (as it may have allocated a local
825 		 * port number in behaviour similar to that of an
826 		 * AF_INET bind request with a port number of zero).
827 		 *
828 		 * Given the definition of O_T_BIND_REQ, where the
829 		 * transport may bind to an address other than the
830 		 * requested address, it's not possible to determine
831 		 * whether a returned address that differs from the
832 		 * requested address is a reason to fail (because the
833 		 * requested address was not available) or succeed
834 		 * (because the transport allocated an appropriate
835 		 * address and/or port).
836 		 *
837 		 * sockfs currently requires that the transport return
838 		 * the requested address in the T_BIND_ACK, unless
839 		 * there is code here to allow for any discrepancy.
840 		 * Such code exists for AF_INET and AF_INET6.
841 		 *
842 		 * Netatalk chooses to return the requested address
843 		 * rather than the (correct) allocated address.  This
844 		 * means that netatalk violates the TPI specification
845 		 * (and would not function correctly if used from a
846 		 * TLI application), but it does mean that it works
847 		 * with sockfs.
848 		 *
849 		 * As noted above, using the newer XTI bind primitive
850 		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
851 		 * allow sockfs to be more sure about whether or not
852 		 * the bind request had succeeded (as transports are
853 		 * not permitted to bind to a different address than
854 		 * that requested - they must return failure).
855 		 * Unfortunately, support for T_BIND_REQ may not be
856 		 * present in all transport implementations (netatalk,
857 		 * for example, doesn't have it), making the
858 		 * transition difficult.
859 		 */
860 		if (bind_ack->ADDR_length != addrlen) {
861 			/* Assumes that the requested address was in use */
862 			freemsg(mp);
863 			error = EADDRINUSE;
864 			eprintsoline(so, error);
865 			goto done;
866 		}
867 
868 		switch (so->so_family) {
869 		case AF_INET6:
870 		case AF_INET: {
871 			sin_t *rname, *aname;
872 
873 			rname = (sin_t *)addr;
874 			aname = (sin_t *)so->so_laddr_sa;
875 
876 			/*
877 			 * Take advantage of the alignment
878 			 * of sin_port and sin6_port which fall
879 			 * in the same place in their data structures.
880 			 * Just use sin_port for either address family.
881 			 *
882 			 * This may become a problem if (heaven forbid)
883 			 * there's a separate ipv6port_reserved... :-P
884 			 *
885 			 * Binding to port 0 has the semantics of letting
886 			 * the transport bind to any port.
887 			 *
888 			 * If the transport is TCP or UDP since we had sent
889 			 * a T_BIND_REQ we would not get a port other than
890 			 * what we asked for.
891 			 */
892 			if (tcp_udp_xport) {
893 				/*
894 				 * Pick up the new port number if we bound to
895 				 * port 0.
896 				 */
897 				if (aname->sin_port == 0)
898 					aname->sin_port = rname->sin_port;
899 				so->so_state |= SS_LADDR_VALID;
900 				break;
901 			}
902 			if (aname->sin_port != 0 &&
903 			    aname->sin_port != rname->sin_port) {
904 				freemsg(mp);
905 				error = EADDRINUSE;
906 				eprintsoline(so, error);
907 				goto done;
908 			}
909 			/*
910 			 * Pick up the new port number if we bound to port 0.
911 			 */
912 			aname->sin_port = rname->sin_port;
913 
914 			/*
915 			 * Unfortunately, addresses aren't _quite_ the same.
916 			 */
917 			if (so->so_family == AF_INET) {
918 				if (aname->sin_addr.s_addr !=
919 				    rname->sin_addr.s_addr) {
920 					freemsg(mp);
921 					error = EADDRNOTAVAIL;
922 					eprintsoline(so, error);
923 					goto done;
924 				}
925 			} else {
926 				sin6_t *rname6 = (sin6_t *)rname;
927 				sin6_t *aname6 = (sin6_t *)aname;
928 
929 				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
930 				    &rname6->sin6_addr)) {
931 					freemsg(mp);
932 					error = EADDRNOTAVAIL;
933 					eprintsoline(so, error);
934 					goto done;
935 				}
936 			}
937 			break;
938 		}
939 		case AF_UNIX:
940 			if (bcmp(addr, &so->so_ux_laddr, addrlen) != 0) {
941 				freemsg(mp);
942 				error = EADDRINUSE;
943 				eprintsoline(so, error);
944 				eprintso(so,
945 					("addrlen %d, addr 0x%x, vp %p\n",
946 					addrlen, *((int *)addr),
947 					so->so_ux_bound_vp));
948 				goto done;
949 			}
950 			so->so_state |= SS_LADDR_VALID;
951 			break;
952 		default:
953 			/*
954 			 * NOTE: This assumes that addresses can be
955 			 * byte-compared for equivalence.
956 			 */
957 			if (bcmp(addr, so->so_laddr_sa, addrlen) != 0) {
958 				freemsg(mp);
959 				error = EADDRINUSE;
960 				eprintsoline(so, error);
961 				goto done;
962 			}
963 			/*
964 			 * Don't mark SS_LADDR_VALID, as we cannot be
965 			 * sure that the returned address is the real
966 			 * bound address when talking to an unknown
967 			 * transport.
968 			 */
969 			break;
970 		}
971 	} else {
972 		/*
973 		 * Save for returned address for getsockname.
974 		 * Needed for unspecific bind unless transport supports
975 		 * the TI_GETMYNAME ioctl.
976 		 * Do this for AF_INET{,6} even though they do, as
977 		 * caching info here is much better performance than
978 		 * a TPI/STREAMS trip to the transport for getsockname.
979 		 * Any which can't for some reason _must_ _not_ set
980 		 * LADDR_VALID here for the caching version of getsockname
981 		 * to not break;
982 		 */
983 		switch (so->so_family) {
984 		case AF_UNIX:
985 			/*
986 			 * Record the address bound with the transport
987 			 * for use by socketpair.
988 			 */
989 			bcopy(addr, &so->so_ux_laddr, addrlen);
990 			so->so_state |= SS_LADDR_VALID;
991 			break;
992 		case AF_INET:
993 		case AF_INET6:
994 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
995 			bcopy(addr, so->so_laddr_sa, so->so_laddr_len);
996 			so->so_state |= SS_LADDR_VALID;
997 			break;
998 		default:
999 			/*
1000 			 * Don't mark SS_LADDR_VALID, as we cannot be
1001 			 * sure that the returned address is the real
1002 			 * bound address when talking to an unknown
1003 			 * transport.
1004 			 */
1005 			break;
1006 		}
1007 	}
1008 
1009 	if (nl7c == NULL && (so->so_nl7c_flags & NL7C_AF_NCA) &&
1010 	    (so->so_nl7c_flags & NL7C_ENABLED)) {
1011 		/*
1012 		 * Was an AF_NCA bind() so add it to the addr list for
1013 		 * reporting purposes.
1014 		 */
1015 		nl7c = nl7c_add_addr(addr, addrlen);
1016 	}
1017 	if (nl7c != NULL) {
1018 		nl7c_listener_addr(nl7c, strvp2wq(SOTOV(so)));
1019 	}
1020 
1021 	freemsg(mp);
1022 
1023 done:
1024 	if (error) {
1025 		/* reset state & backlog to values held on entry */
1026 		if (clear_acceptconn_on_err == B_TRUE)
1027 			so->so_state &= ~SS_ACCEPTCONN;
1028 		if (restore_backlog_on_err == B_TRUE)
1029 			so->so_backlog = save_so_backlog;
1030 
1031 		if (unbind_on_err && so->so_state & SS_ISBOUND) {
1032 			int err;
1033 
1034 			err = sotpi_unbind(so, 0);
1035 			/* LINTED - statement has no consequent: if */
1036 			if (err) {
1037 				eprintsoline(so, error);
1038 			} else {
1039 				ASSERT(!(so->so_state & SS_ISBOUND));
1040 			}
1041 		}
1042 	}
1043 	if (!(flags & _SOBIND_LOCK_HELD)) {
1044 		so_unlock_single(so, SOLOCKED);
1045 		mutex_exit(&so->so_lock);
1046 	} else {
1047 		/* If the caller held the lock don't release it here */
1048 		ASSERT(MUTEX_HELD(&so->so_lock));
1049 		ASSERT(so->so_flag & SOLOCKED);
1050 	}
1051 	return (error);
1052 }
1053 
1054 /* bind the socket */
1055 int
1056 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1057     int flags)
1058 {
1059 	if ((flags & _SOBIND_SOCKETPAIR) == 0)
1060 		return (sotpi_bindlisten(so, name, namelen, 0, flags));
1061 
1062 	flags &= ~_SOBIND_SOCKETPAIR;
1063 	return (sotpi_bindlisten(so, name, namelen, 1, flags));
1064 }
1065 
1066 /*
1067  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1068  * address, or when listen needs to unbind and bind.
1069  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1070  * so that a sobind can pick them up.
1071  */
1072 static int
1073 sotpi_unbind(struct sonode *so, int flags)
1074 {
1075 	struct T_unbind_req	unbind_req;
1076 	int			error = 0;
1077 	mblk_t			*mp;
1078 
1079 	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1080 			so, flags, pr_state(so->so_state, so->so_mode)));
1081 
1082 	ASSERT(MUTEX_HELD(&so->so_lock));
1083 	ASSERT(so->so_flag & SOLOCKED);
1084 
1085 	if (!(so->so_state & SS_ISBOUND)) {
1086 		error = EINVAL;
1087 		eprintsoline(so, error);
1088 		goto done;
1089 	}
1090 
1091 	mutex_exit(&so->so_lock);
1092 
1093 	/*
1094 	 * Flush the read and write side (except stream head read queue)
1095 	 * and send down T_UNBIND_REQ.
1096 	 */
1097 	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1098 
1099 	unbind_req.PRIM_type = T_UNBIND_REQ;
1100 	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1101 	    0, _ALLOC_SLEEP);
1102 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1103 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1104 	mutex_enter(&so->so_lock);
1105 	if (error) {
1106 		eprintsoline(so, error);
1107 		goto done;
1108 	}
1109 
1110 	error = sowaitokack(so, T_UNBIND_REQ);
1111 	if (error) {
1112 		eprintsoline(so, error);
1113 		goto done;
1114 	}
1115 
1116 	/*
1117 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1118 	 * strsock_proto while the lock was dropped above, the unbind
1119 	 * is allowed to complete.
1120 	 */
1121 	if (!(flags & _SOUNBIND_REBIND)) {
1122 		/*
1123 		 * Clear out bound address.
1124 		 */
1125 		vnode_t *vp;
1126 
1127 		if ((vp = so->so_ux_bound_vp) != NULL) {
1128 			ASSERT(vp->v_stream);
1129 			so->so_ux_bound_vp = NULL;
1130 			vn_rele_stream(vp);
1131 		}
1132 		/* Clear out address */
1133 		so->so_laddr_len = 0;
1134 	}
1135 	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN|SS_LADDR_VALID);
1136 done:
1137 	/* If the caller held the lock don't release it here */
1138 	ASSERT(MUTEX_HELD(&so->so_lock));
1139 	ASSERT(so->so_flag & SOLOCKED);
1140 
1141 	return (error);
1142 }
1143 
1144 /*
1145  * listen on the socket.
1146  * For TPI conforming transports this has to first unbind with the transport
1147  * and then bind again using the new backlog.
1148  */
1149 int
1150 sotpi_listen(struct sonode *so, int backlog)
1151 {
1152 	int		error = 0;
1153 
1154 	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1155 		so, backlog, pr_state(so->so_state, so->so_mode)));
1156 
1157 	if (so->so_serv_type == T_CLTS)
1158 		return (EOPNOTSUPP);
1159 
1160 	/*
1161 	 * If the socket is ready to accept connections already, then
1162 	 * return without doing anything.  This avoids a problem where
1163 	 * a second listen() call fails if a connection is pending and
1164 	 * leaves the socket unbound. Only when we are not unbinding
1165 	 * with the transport can we safely increase the backlog.
1166 	 */
1167 	if (so->so_state & SS_ACCEPTCONN &&
1168 	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1169 		/*CONSTCOND*/
1170 		!solisten_tpi_tcp))
1171 		return (0);
1172 
1173 	if (so->so_state & SS_ISCONNECTED)
1174 		return (EINVAL);
1175 
1176 	mutex_enter(&so->so_lock);
1177 	so_lock_single(so);	/* Set SOLOCKED */
1178 
1179 	if (backlog < 0)
1180 		backlog = 0;
1181 	/*
1182 	 * Use the same qlimit as in BSD. BSD checks the qlimit
1183 	 * before queuing the next connection implying that a
1184 	 * listen(sock, 0) allows one connection to be queued.
1185 	 * BSD also uses 1.5 times the requested backlog.
1186 	 *
1187 	 * XNS Issue 4 required a strict interpretation of the backlog.
1188 	 * This has been waived subsequently for Issue 4 and the change
1189 	 * incorporated in XNS Issue 5. So we aren't required to do
1190 	 * anything special for XPG apps.
1191 	 */
1192 	if (backlog >= (INT_MAX - 1) / 3)
1193 		backlog = INT_MAX;
1194 	else
1195 		backlog = backlog * 3 / 2 + 1;
1196 
1197 	/*
1198 	 * If the listen doesn't change the backlog we do nothing.
1199 	 * This avoids an EPROTO error from the transport.
1200 	 */
1201 	if ((so->so_state & SS_ACCEPTCONN) &&
1202 	    so->so_backlog == backlog)
1203 		goto done;
1204 
1205 	if (!(so->so_state & SS_ISBOUND)) {
1206 		/*
1207 		 * Must have been explicitly bound in the UNIX domain.
1208 		 */
1209 		if (so->so_family == AF_UNIX) {
1210 			error = EINVAL;
1211 			goto done;
1212 		}
1213 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1214 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN);
1215 	} else if (backlog > 0) {
1216 		/*
1217 		 * AF_INET{,6} hack to avoid losing the port.
1218 		 * Assumes that all AF_INET{,6} transports can handle a
1219 		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1220 		 * has already bound thus it is possible to avoid the unbind.
1221 		 */
1222 		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1223 		    /*CONSTCOND*/
1224 		    !solisten_tpi_tcp)) {
1225 			error = sotpi_unbind(so, _SOUNBIND_REBIND);
1226 			if (error)
1227 				goto done;
1228 		}
1229 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1230 			    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN);
1231 	} else {
1232 		so->so_state |= SS_ACCEPTCONN;
1233 		so->so_backlog = backlog;
1234 	}
1235 	if (error)
1236 		goto done;
1237 	ASSERT(so->so_state & SS_ACCEPTCONN);
1238 done:
1239 	so_unlock_single(so, SOLOCKED);
1240 	mutex_exit(&so->so_lock);
1241 	return (error);
1242 }
1243 
1244 /*
1245  * Disconnect either a specified seqno or all (-1).
1246  * The former is used on listening sockets only.
1247  *
1248  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1249  * the current use of sodisconnect(seqno == -1) is only for shutdown
1250  * so there is no point (and potentially incorrect) to unbind.
1251  */
1252 int
1253 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1254 {
1255 	struct T_discon_req	discon_req;
1256 	int			error = 0;
1257 	mblk_t			*mp;
1258 
1259 	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1260 			so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1261 
1262 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1263 		mutex_enter(&so->so_lock);
1264 		so_lock_single(so);	/* Set SOLOCKED */
1265 	} else {
1266 		ASSERT(MUTEX_HELD(&so->so_lock));
1267 		ASSERT(so->so_flag & SOLOCKED);
1268 	}
1269 
1270 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1271 		error = EINVAL;
1272 		eprintsoline(so, error);
1273 		goto done;
1274 	}
1275 
1276 	mutex_exit(&so->so_lock);
1277 	/*
1278 	 * Flush the write side (unless this is a listener)
1279 	 * and then send down a T_DISCON_REQ.
1280 	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1281 	 * and other messages.)
1282 	 */
1283 	if (!(so->so_state & SS_ACCEPTCONN))
1284 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1285 
1286 	discon_req.PRIM_type = T_DISCON_REQ;
1287 	discon_req.SEQ_number = seqno;
1288 	mp = soallocproto1(&discon_req, sizeof (discon_req),
1289 	    0, _ALLOC_SLEEP);
1290 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1291 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1292 	mutex_enter(&so->so_lock);
1293 	if (error) {
1294 		eprintsoline(so, error);
1295 		goto done;
1296 	}
1297 
1298 	error = sowaitokack(so, T_DISCON_REQ);
1299 	if (error) {
1300 		eprintsoline(so, error);
1301 		goto done;
1302 	}
1303 	/*
1304 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1305 	 * strsock_proto while the lock was dropped above, the disconnect
1306 	 * is allowed to complete. However, it is not possible to
1307 	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1308 	 */
1309 	so->so_state &=
1310 	    ~(SS_ISCONNECTED|SS_ISCONNECTING|SS_LADDR_VALID|SS_FADDR_VALID);
1311 done:
1312 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1313 		so_unlock_single(so, SOLOCKED);
1314 		mutex_exit(&so->so_lock);
1315 	} else {
1316 		/* If the caller held the lock don't release it here */
1317 		ASSERT(MUTEX_HELD(&so->so_lock));
1318 		ASSERT(so->so_flag & SOLOCKED);
1319 	}
1320 	return (error);
1321 }
1322 
1323 int
1324 sotpi_accept(struct sonode *so, int fflag, struct sonode **nsop)
1325 {
1326 	struct T_conn_ind	*conn_ind;
1327 	struct T_conn_res	*conn_res;
1328 	int			error = 0;
1329 	mblk_t			*mp;
1330 	struct sonode		*nso;
1331 	vnode_t			*nvp;
1332 	void			*src;
1333 	t_uscalar_t		srclen;
1334 	void			*opt;
1335 	t_uscalar_t		optlen;
1336 	t_scalar_t		PRIM_type;
1337 	t_scalar_t		SEQ_number;
1338 
1339 	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1340 		so, fflag, nsop, pr_state(so->so_state, so->so_mode)));
1341 
1342 	/*
1343 	 * Defer single-threading the accepting socket until
1344 	 * the T_CONN_IND has been received and parsed and the
1345 	 * new sonode has been opened.
1346 	 */
1347 
1348 	/* Check that we are not already connected */
1349 	if ((so->so_state & SS_ACCEPTCONN) == 0)
1350 		goto conn_bad;
1351 again:
1352 	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1353 		goto e_bad;
1354 
1355 	ASSERT(mp);
1356 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1357 	/*
1358 	 * Save SEQ_number for error paths.
1359 	 */
1360 	SEQ_number = conn_ind->SEQ_number;
1361 
1362 	srclen = conn_ind->SRC_length;
1363 	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1364 	if (src == NULL) {
1365 		error = EPROTO;
1366 		freemsg(mp);
1367 		eprintsoline(so, error);
1368 		goto disconnect_unlocked;
1369 	}
1370 	optlen = conn_ind->OPT_length;
1371 	switch (so->so_family) {
1372 	case AF_INET:
1373 	case AF_INET6:
1374 		if ((optlen == sizeof (intptr_t)) &&
1375 		    ((so->so_state & SS_TCP_FAST_ACCEPT) != 0)) {
1376 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
1377 			    &opt, conn_ind->OPT_length);
1378 		} else {
1379 			/*
1380 			 * The transport (in this case TCP) hasn't sent up
1381 			 * a pointer to an instance for the accept fast-path.
1382 			 * Disable fast-path completely because the call to
1383 			 * sotpi_create() below would otherwise create an
1384 			 * incomplete TCP instance, which would lead to
1385 			 * problems when sockfs sends a normal T_CONN_RES
1386 			 * message down the new stream.
1387 			 */
1388 			so->so_state &= ~SS_TCP_FAST_ACCEPT;
1389 			opt = NULL;
1390 			optlen = 0;
1391 		}
1392 		break;
1393 	case AF_UNIX:
1394 	default:
1395 		if (optlen != 0) {
1396 			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1397 			    __TPI_ALIGN_SIZE);
1398 			if (opt == NULL) {
1399 				error = EPROTO;
1400 				freemsg(mp);
1401 				eprintsoline(so, error);
1402 				goto disconnect_unlocked;
1403 			}
1404 		}
1405 		if (so->so_family == AF_UNIX) {
1406 			if (!(so->so_state & SS_FADDR_NOXLATE)) {
1407 				src = NULL;
1408 				srclen = 0;
1409 			}
1410 			/* Extract src address from options */
1411 			if (optlen != 0)
1412 				so_getopt_srcaddr(opt, optlen, &src, &srclen);
1413 		}
1414 		break;
1415 	}
1416 
1417 	/*
1418 	 * Create the new socket.
1419 	 */
1420 	VN_HOLD(so->so_accessvp);
1421 	nso = sotpi_create(so->so_accessvp, so->so_family, so->so_type,
1422 			so->so_protocol, so->so_version, so, &error);
1423 	if (nso == NULL) {
1424 		ASSERT(error != 0);
1425 		/*
1426 		 * Accept can not fail with ENOBUFS. sotpi_create
1427 		 * sleeps waiting for memory until a signal is caught
1428 		 * so return EINTR.
1429 		 */
1430 		freemsg(mp);
1431 		if (error == ENOBUFS)
1432 			error = EINTR;
1433 		goto e_disc_unl;
1434 	}
1435 	nvp = SOTOV(nso);
1436 
1437 #ifdef DEBUG
1438 	/*
1439 	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1440 	 * it's inherited early to allow debugging of the accept code itself.
1441 	 */
1442 	nso->so_options |= so->so_options & SO_DEBUG;
1443 #endif /* DEBUG */
1444 
1445 	/*
1446 	 * Save the SRC address from the T_CONN_IND
1447 	 * for getpeername to work on AF_UNIX and on transports that do not
1448 	 * support TI_GETPEERNAME.
1449 	 *
1450 	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1451 	 * copyin_name().
1452 	 */
1453 	if (srclen > (t_uscalar_t)nso->so_faddr_maxlen) {
1454 		error = EINVAL;
1455 		freemsg(mp);
1456 		eprintsoline(so, error);
1457 		goto disconnect_vp_unlocked;
1458 	}
1459 	nso->so_faddr_len = (socklen_t)srclen;
1460 	ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
1461 	bcopy(src, nso->so_faddr_sa, srclen);
1462 	nso->so_state |= SS_FADDR_VALID;
1463 
1464 	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1465 	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1466 		cred_t *cr;
1467 
1468 		if ((cr = DB_CRED(mp)) != NULL) {
1469 			crhold(cr);
1470 			nso->so_peercred = cr;
1471 			nso->so_cpid = DB_CPID(mp);
1472 		}
1473 		freemsg(mp);
1474 
1475 		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1476 		    sizeof (intptr_t), 0, _ALLOC_INTR);
1477 		if (mp == NULL) {
1478 			/*
1479 			 * Accept can not fail with ENOBUFS.
1480 			 * A signal was caught so return EINTR.
1481 			 */
1482 			error = EINTR;
1483 			eprintsoline(so, error);
1484 			goto disconnect_vp_unlocked;
1485 		}
1486 		conn_res = (struct T_conn_res *)mp->b_rptr;
1487 	} else {
1488 		nso->so_peercred = DB_CRED(mp);
1489 		nso->so_cpid = DB_CPID(mp);
1490 		DB_CRED(mp) = NULL;
1491 
1492 		mp->b_rptr = DB_BASE(mp);
1493 		conn_res = (struct T_conn_res *)mp->b_rptr;
1494 		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1495 	}
1496 
1497 	/*
1498 	 * New socket must be bound at least in sockfs and, except for AF_INET,
1499 	 * (or AF_INET6) it also has to be bound in the transport provider.
1500 	 * After accepting the connection on nso so_laddr_sa will be set to
1501 	 * contain the same address as the listener's local address
1502 	 * so the address we bind to isn't important.
1503 	 */
1504 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1505 	    /*CONSTCOND*/
1506 	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1507 		/*
1508 		 * Optimization for AF_INET{,6} transports
1509 		 * that can handle a T_CONN_RES without being bound.
1510 		 */
1511 		mutex_enter(&nso->so_lock);
1512 		so_automatic_bind(nso);
1513 		mutex_exit(&nso->so_lock);
1514 	} else {
1515 		/* Perform NULL bind with the transport provider. */
1516 		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC)) != 0) {
1517 			ASSERT(error != ENOBUFS);
1518 			freemsg(mp);
1519 			eprintsoline(nso, error);
1520 			goto disconnect_vp_unlocked;
1521 		}
1522 	}
1523 
1524 	/*
1525 	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1526 	 * so that any data arriving on the new socket will cause the
1527 	 * appropriate signals to be delivered for the new socket.
1528 	 *
1529 	 * No other thread (except strsock_proto and strsock_misc)
1530 	 * can access the new socket thus we relax the locking.
1531 	 */
1532 	nso->so_pgrp = so->so_pgrp;
1533 	nso->so_state |= so->so_state & (SS_ASYNC|SS_FADDR_NOXLATE);
1534 
1535 	if (nso->so_pgrp != 0) {
1536 		if ((error = so_set_events(nso, nvp, CRED())) != 0) {
1537 			eprintsoline(nso, error);
1538 			error = 0;
1539 			nso->so_pgrp = 0;
1540 		}
1541 	}
1542 
1543 	/*
1544 	 * Make note of the socket level options. TCP and IP level options
1545 	 * are already inherited. We could do all this after accept is
1546 	 * successful but doing it here simplifies code and no harm done
1547 	 * for error case.
1548 	 */
1549 	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1550 	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1551 	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1552 	nso->so_sndbuf = so->so_sndbuf;
1553 	nso->so_rcvbuf = so->so_rcvbuf;
1554 	if (nso->so_options & SO_LINGER)
1555 		nso->so_linger = so->so_linger;
1556 
1557 	if ((so->so_state & SS_TCP_FAST_ACCEPT) != 0) {
1558 		mblk_t *ack_mp;
1559 
1560 		ASSERT(opt != NULL);
1561 
1562 		conn_res->OPT_length = optlen;
1563 		conn_res->OPT_offset = MBLKL(mp);
1564 		bcopy(&opt, mp->b_wptr, optlen);
1565 		mp->b_wptr += optlen;
1566 		conn_res->PRIM_type = T_CONN_RES;
1567 		conn_res->ACCEPTOR_id = 0;
1568 		PRIM_type = T_CONN_RES;
1569 
1570 		/* Send down the T_CONN_RES on acceptor STREAM */
1571 		error = kstrputmsg(SOTOV(nso), mp, NULL,
1572 		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1573 		if (error) {
1574 			mutex_enter(&so->so_lock);
1575 			so_lock_single(so);
1576 			eprintsoline(so, error);
1577 			goto disconnect_vp;
1578 		}
1579 		mutex_enter(&nso->so_lock);
1580 		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1581 		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1582 		if (error) {
1583 			mutex_exit(&nso->so_lock);
1584 			mutex_enter(&so->so_lock);
1585 			so_lock_single(so);
1586 			eprintsoline(so, error);
1587 			goto disconnect_vp;
1588 		}
1589 		if (nso->so_family == AF_INET) {
1590 			sin_t *sin;
1591 
1592 			sin = (sin_t *)(ack_mp->b_rptr +
1593 			    sizeof (struct T_ok_ack));
1594 			bcopy(sin, nso->so_laddr_sa, sizeof (sin_t));
1595 			nso->so_laddr_len = sizeof (sin_t);
1596 		} else {
1597 			sin6_t *sin6;
1598 
1599 			sin6 = (sin6_t *)(ack_mp->b_rptr +
1600 			    sizeof (struct T_ok_ack));
1601 			bcopy(sin6, nso->so_laddr_sa, sizeof (sin6_t));
1602 			nso->so_laddr_len = sizeof (sin6_t);
1603 		}
1604 		freemsg(ack_mp);
1605 
1606 		nso->so_state |= SS_ISCONNECTED | SS_LADDR_VALID;
1607 		nso->so_priv = opt;
1608 
1609 		if (so->so_nl7c_flags & NL7C_ENABLED) {
1610 			/*
1611 			 * An NL7C marked listen()er so the new socket
1612 			 * inherits the listen()er's NL7C state.
1613 			 *
1614 			 * When calling NL7C to process the new socket
1615 			 * pass the nonblocking i/o state of the listen
1616 			 * socket as this is the context we are in.
1617 			 */
1618 			nso->so_nl7c_flags = so->so_nl7c_flags;
1619 			if (nl7c_process(nso,
1620 			    (nso->so_state & (SS_NONBLOCK|SS_NDELAY)),
1621 			    (int)((tcp_t *)nso->so_priv)->tcp_mss)) {
1622 				/*
1623 				 * NL7C has completed processing on the
1624 				 * socket, close the socket and back to
1625 				 * the top to await the next T_CONN_IND.
1626 				 */
1627 				mutex_exit(&nso->so_lock);
1628 				(void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
1629 						CRED());
1630 				VN_RELE(nvp);
1631 				goto again;
1632 			}
1633 			/* Pass the new socket out */
1634 		}
1635 
1636 		mutex_exit(&nso->so_lock);
1637 
1638 		/*
1639 		 * Pass out new socket.
1640 		 */
1641 		if (nsop != NULL)
1642 			*nsop = nso;
1643 
1644 		return (0);
1645 	}
1646 
1647 	/*
1648 	 * Copy local address from listener.
1649 	 */
1650 	nso->so_laddr_len = so->so_laddr_len;
1651 	ASSERT(nso->so_laddr_len <= nso->so_laddr_maxlen);
1652 	bcopy(so->so_laddr_sa, nso->so_laddr_sa, nso->so_laddr_len);
1653 	nso->so_state |= SS_LADDR_VALID;
1654 
1655 	/*
1656 	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
1657 	 * which don't support the FireEngine accept fast-path. It is also
1658 	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
1659 	 * again. Neither sockfs nor TCP attempt to find out if some other
1660 	 * random module has been inserted in between (in which case we
1661 	 * should follow TLI accept behaviour). We blindly assume the worst
1662 	 * case and revert back to old behaviour i.e. TCP will not send us
1663 	 * any option (eager) and the accept should happen on the listener
1664 	 * queue. Any queued T_conn_ind have already got their options removed
1665 	 * by so_sock2_stream() when "sockmod" was I_POP'd.
1666 	 */
1667 	/*
1668 	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
1669 	 */
1670 	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
1671 #ifdef	_ILP32
1672 		queue_t	*q;
1673 
1674 		/*
1675 		 * Find read queue in driver
1676 		 * Can safely do this since we "own" nso/nvp.
1677 		 */
1678 		q = strvp2wq(nvp)->q_next;
1679 		while (SAMESTR(q))
1680 			q = q->q_next;
1681 		q = RD(q);
1682 		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
1683 #else
1684 		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
1685 #endif	/* _ILP32 */
1686 		conn_res->PRIM_type = O_T_CONN_RES;
1687 		PRIM_type = O_T_CONN_RES;
1688 	} else {
1689 		conn_res->ACCEPTOR_id = nso->so_acceptor_id;
1690 		conn_res->PRIM_type = T_CONN_RES;
1691 		PRIM_type = T_CONN_RES;
1692 	}
1693 	conn_res->SEQ_number = SEQ_number;
1694 	conn_res->OPT_length = 0;
1695 	conn_res->OPT_offset = 0;
1696 
1697 	mutex_enter(&so->so_lock);
1698 	so_lock_single(so);	/* Set SOLOCKED */
1699 	mutex_exit(&so->so_lock);
1700 
1701 	error = kstrputmsg(SOTOV(so), mp, NULL,
1702 	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1703 	mutex_enter(&so->so_lock);
1704 	if (error) {
1705 		eprintsoline(so, error);
1706 		goto disconnect_vp;
1707 	}
1708 	error = sowaitokack(so, PRIM_type);
1709 	if (error) {
1710 		eprintsoline(so, error);
1711 		goto disconnect_vp;
1712 	}
1713 	so_unlock_single(so, SOLOCKED);
1714 	mutex_exit(&so->so_lock);
1715 
1716 	nso->so_state |= SS_ISCONNECTED;
1717 
1718 	/*
1719 	 * Pass out new socket.
1720 	 */
1721 	if (nsop != NULL)
1722 		*nsop = nso;
1723 
1724 	return (0);
1725 
1726 
1727 eproto_disc_unl:
1728 	error = EPROTO;
1729 e_disc_unl:
1730 	eprintsoline(so, error);
1731 	goto disconnect_unlocked;
1732 
1733 pr_disc_vp_unl:
1734 	eprintsoline(so, error);
1735 disconnect_vp_unlocked:
1736 	(void) VOP_CLOSE(nvp, 0, 1, 0, CRED());
1737 	VN_RELE(nvp);
1738 disconnect_unlocked:
1739 	(void) sodisconnect(so, SEQ_number, 0);
1740 	return (error);
1741 
1742 pr_disc_vp:
1743 	eprintsoline(so, error);
1744 disconnect_vp:
1745 	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
1746 	so_unlock_single(so, SOLOCKED);
1747 	mutex_exit(&so->so_lock);
1748 	(void) VOP_CLOSE(nvp, 0, 1, 0, CRED());
1749 	VN_RELE(nvp);
1750 	return (error);
1751 
1752 conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
1753 	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
1754 	    ? EOPNOTSUPP : EINVAL;
1755 e_bad:
1756 	eprintsoline(so, error);
1757 	return (error);
1758 }
1759 
1760 /*
1761  * connect a socket.
1762  *
1763  * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
1764  * unconnect (by specifying a null address).
1765  */
1766 int
1767 sotpi_connect(struct sonode *so,
1768 	const struct sockaddr *name,
1769 	socklen_t namelen,
1770 	int fflag,
1771 	int flags)
1772 {
1773 	struct T_conn_req	conn_req;
1774 	int			error = 0;
1775 	mblk_t			*mp;
1776 	void			*src;
1777 	socklen_t		srclen;
1778 	void			*addr;
1779 	socklen_t		addrlen;
1780 	boolean_t		need_unlock;
1781 
1782 	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
1783 		so, name, namelen, fflag, flags,
1784 		pr_state(so->so_state, so->so_mode)));
1785 
1786 	/*
1787 	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
1788 	 * avoid sleeping for memory with SOLOCKED held.
1789 	 * We know that the T_CONN_REQ can't be larger than 2 * so_faddr_maxlen
1790 	 * + sizeof (struct T_opthdr).
1791 	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
1792 	 * exceed so_faddr_maxlen).
1793 	 */
1794 	mp = soallocproto(sizeof (struct T_conn_req) +
1795 	    2 * so->so_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR);
1796 	if (mp == NULL) {
1797 		/*
1798 		 * Connect can not fail with ENOBUFS. A signal was
1799 		 * caught so return EINTR.
1800 		 */
1801 		error = EINTR;
1802 		eprintsoline(so, error);
1803 		return (error);
1804 	}
1805 
1806 	mutex_enter(&so->so_lock);
1807 	/*
1808 	 * Make sure that there is a preallocated unbind_req
1809 	 * message before any binding. This message is allocated when
1810 	 * the socket is created but it might have been
1811 	 * consumed.
1812 	 */
1813 	if (so->so_unbind_mp == NULL) {
1814 		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
1815 		/* NOTE: holding so_lock while sleeping */
1816 		so->so_unbind_mp =
1817 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR);
1818 		if (so->so_unbind_mp == NULL) {
1819 			error = EINTR;
1820 			need_unlock = B_FALSE;
1821 			goto done;
1822 		}
1823 	}
1824 
1825 	so_lock_single(so);	/* Set SOLOCKED */
1826 	need_unlock = B_TRUE;
1827 
1828 	/*
1829 	 * Can't have done a listen before connecting.
1830 	 */
1831 	if (so->so_state & SS_ACCEPTCONN) {
1832 		error = EOPNOTSUPP;
1833 		goto done;
1834 	}
1835 
1836 	/*
1837 	 * Must be bound with the transport
1838 	 */
1839 	if (!(so->so_state & SS_ISBOUND)) {
1840 		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1841 		    /*CONSTCOND*/
1842 		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
1843 			/*
1844 			 * Optimization for AF_INET{,6} transports
1845 			 * that can handle a T_CONN_REQ without being bound.
1846 			 */
1847 			so_automatic_bind(so);
1848 		} else {
1849 			error = sotpi_bind(so, NULL, 0,
1850 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD);
1851 			if (error)
1852 				goto done;
1853 		}
1854 		ASSERT(so->so_state & SS_ISBOUND);
1855 		flags |= _SOCONNECT_DID_BIND;
1856 	}
1857 
1858 	/*
1859 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
1860 	 * connect to a null address. This is the portable method to
1861 	 * unconnect a socket.
1862 	 */
1863 	if ((namelen >= sizeof (sa_family_t)) &&
1864 	    (name->sa_family == AF_UNSPEC)) {
1865 		name = NULL;
1866 		namelen = 0;
1867 	}
1868 
1869 	/*
1870 	 * Check that we are not already connected.
1871 	 * A connection-oriented socket cannot be reconnected.
1872 	 * A connected connection-less socket can be
1873 	 * - connected to a different address by a subsequent connect
1874 	 * - "unconnected" by a connect to the NULL address
1875 	 */
1876 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
1877 		ASSERT(!(flags & _SOCONNECT_DID_BIND));
1878 		if (so->so_mode & SM_CONNREQUIRED) {
1879 			/* Connection-oriented socket */
1880 			error = so->so_state & SS_ISCONNECTED ?
1881 			    EISCONN : EALREADY;
1882 			goto done;
1883 		}
1884 		/* Connection-less socket */
1885 		if (name == NULL) {
1886 			/*
1887 			 * Remove the connected state and clear SO_DGRAM_ERRIND
1888 			 * since it was set when the socket was connected.
1889 			 * If this is UDP also send down a T_DISCON_REQ.
1890 			 */
1891 			int val;
1892 
1893 			if ((so->so_family == AF_INET ||
1894 				so->so_family == AF_INET6) &&
1895 			    (so->so_type == SOCK_DGRAM ||
1896 				so->so_type == SOCK_RAW) &&
1897 			    /*CONSTCOND*/
1898 			    !soconnect_tpi_udp) {
1899 				/* XXX What about implicitly unbinding here? */
1900 				error = sodisconnect(so, -1,
1901 						_SODISCONNECT_LOCK_HELD);
1902 			} else {
1903 				so->so_state &=
1904 				    ~(SS_ISCONNECTED | SS_ISCONNECTING |
1905 				    SS_FADDR_VALID);
1906 				so->so_faddr_len = 0;
1907 			}
1908 
1909 			so_unlock_single(so, SOLOCKED);
1910 			mutex_exit(&so->so_lock);
1911 
1912 			val = 0;
1913 			(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
1914 					&val, (t_uscalar_t)sizeof (val));
1915 
1916 			mutex_enter(&so->so_lock);
1917 			so_lock_single(so);	/* Set SOLOCKED */
1918 			goto done;
1919 		}
1920 	}
1921 	ASSERT(so->so_state & SS_ISBOUND);
1922 
1923 	if (name == NULL || namelen == 0) {
1924 		error = EINVAL;
1925 		goto done;
1926 	}
1927 	/*
1928 	 * Mark the socket if so_faddr_sa represents the transport level
1929 	 * address.
1930 	 */
1931 	if (flags & _SOCONNECT_NOXLATE) {
1932 		struct sockaddr_ux	*soaddr_ux;
1933 
1934 		ASSERT(so->so_family == AF_UNIX);
1935 		if (namelen != sizeof (struct sockaddr_ux)) {
1936 			error = EINVAL;
1937 			goto done;
1938 		}
1939 		soaddr_ux = (struct sockaddr_ux *)name;
1940 		name = (struct sockaddr *)&soaddr_ux->sou_addr;
1941 		namelen = sizeof (soaddr_ux->sou_addr);
1942 		so->so_state |= SS_FADDR_NOXLATE;
1943 	}
1944 
1945 	/*
1946 	 * Length and family checks.
1947 	 */
1948 	error = so_addr_verify(so, name, namelen);
1949 	if (error)
1950 		goto bad;
1951 
1952 	/*
1953 	 * Save foreign address. Needed for AF_UNIX as well as
1954 	 * transport providers that do not support TI_GETPEERNAME.
1955 	 * Also used for cached foreign address for TCP and UDP.
1956 	 */
1957 	if (namelen > (t_uscalar_t)so->so_faddr_maxlen) {
1958 		error = EINVAL;
1959 		goto done;
1960 	}
1961 	so->so_faddr_len = (socklen_t)namelen;
1962 	ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
1963 	bcopy(name, so->so_faddr_sa, namelen);
1964 	so->so_state |= SS_FADDR_VALID;
1965 
1966 	if (so->so_family == AF_UNIX) {
1967 		if (so->so_state & SS_FADDR_NOXLATE) {
1968 			/*
1969 			 * Already have a transport internal address. Do not
1970 			 * pass any (transport internal) source address.
1971 			 */
1972 			addr = so->so_faddr_sa;
1973 			addrlen = (t_uscalar_t)so->so_faddr_len;
1974 			src = NULL;
1975 			srclen = 0;
1976 		} else {
1977 			/*
1978 			 * Pass the sockaddr_un source address as an option
1979 			 * and translate the remote address.
1980 			 * Holding so_lock thus so_laddr_sa can not change.
1981 			 */
1982 			src = so->so_laddr_sa;
1983 			srclen = (t_uscalar_t)so->so_laddr_len;
1984 			dprintso(so, 1,
1985 				("sotpi_connect UNIX: srclen %d, src %p\n",
1986 				srclen, src));
1987 			error = so_ux_addr_xlate(so,
1988 				so->so_faddr_sa, (socklen_t)so->so_faddr_len,
1989 				(flags & _SOCONNECT_XPG4_2),
1990 				&addr, &addrlen);
1991 			if (error)
1992 				goto bad;
1993 		}
1994 	} else {
1995 		addr = so->so_faddr_sa;
1996 		addrlen = (t_uscalar_t)so->so_faddr_len;
1997 		src = NULL;
1998 		srclen = 0;
1999 	}
2000 	/*
2001 	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2002 	 * option which asks the transport provider to send T_UDERR_IND
2003 	 * messages. These T_UDERR_IND messages are used to return connected
2004 	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2005 	 *
2006 	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2007 	 * we send down a T_CONN_REQ. This is needed to let the
2008 	 * transport assign a local address that is consistent with
2009 	 * the remote address. Applications depend on a getsockname()
2010 	 * after a connect() to retrieve the "source" IP address for
2011 	 * the connected socket.  Invalidate the cached local address
2012 	 * to force getsockname() to enquire of the transport.
2013 	 */
2014 	if (!(so->so_mode & SM_CONNREQUIRED)) {
2015 		/*
2016 		 * Datagram socket.
2017 		 */
2018 		int32_t val;
2019 
2020 		so_unlock_single(so, SOLOCKED);
2021 		mutex_exit(&so->so_lock);
2022 
2023 		val = 1;
2024 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2025 					&val, (t_uscalar_t)sizeof (val));
2026 
2027 		mutex_enter(&so->so_lock);
2028 		so_lock_single(so);	/* Set SOLOCKED */
2029 		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2030 		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2031 		    soconnect_tpi_udp) {
2032 			soisconnected(so);
2033 			goto done;
2034 		}
2035 		/*
2036 		 * Send down T_CONN_REQ etc.
2037 		 * Clear fflag to avoid returning EWOULDBLOCK.
2038 		 */
2039 		fflag = 0;
2040 		ASSERT(so->so_family != AF_UNIX);
2041 		so->so_state &= ~SS_LADDR_VALID;
2042 	} else if (so->so_laddr_len != 0) {
2043 		/*
2044 		 * If the local address or port was "any" then it may be
2045 		 * changed by the transport as a result of the
2046 		 * connect.  Invalidate the cached version if we have one.
2047 		 */
2048 		switch (so->so_family) {
2049 		case AF_INET:
2050 			ASSERT(so->so_laddr_len == (socklen_t)sizeof (sin_t));
2051 			if (((sin_t *)so->so_laddr_sa)->sin_addr.s_addr ==
2052 			    INADDR_ANY ||
2053 			    ((sin_t *)so->so_laddr_sa)->sin_port == 0)
2054 				so->so_state &= ~SS_LADDR_VALID;
2055 			break;
2056 
2057 		case AF_INET6:
2058 			ASSERT(so->so_laddr_len == (socklen_t)sizeof (sin6_t));
2059 			if (IN6_IS_ADDR_UNSPECIFIED(
2060 			    &((sin6_t *)so->so_laddr_sa) ->sin6_addr) ||
2061 			    IN6_IS_ADDR_V4MAPPED_ANY(
2062 			    &((sin6_t *)so->so_laddr_sa)->sin6_addr) ||
2063 			    ((sin6_t *)so->so_laddr_sa)->sin6_port == 0)
2064 				    so->so_state &= ~SS_LADDR_VALID;
2065 			break;
2066 
2067 		default:
2068 			break;
2069 		}
2070 	}
2071 
2072 	/*
2073 	 * Check for failure of an earlier call
2074 	 */
2075 	if (so->so_error != 0)
2076 		goto so_bad;
2077 
2078 	/*
2079 	 * Send down T_CONN_REQ. Message was allocated above.
2080 	 */
2081 	conn_req.PRIM_type = T_CONN_REQ;
2082 	conn_req.DEST_length = addrlen;
2083 	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2084 	if (srclen == 0) {
2085 		conn_req.OPT_length = 0;
2086 		conn_req.OPT_offset = 0;
2087 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2088 		soappendmsg(mp, addr, addrlen);
2089 	} else {
2090 		/*
2091 		 * There is a AF_UNIX sockaddr_un to include as a source
2092 		 * address option.
2093 		 */
2094 		struct T_opthdr toh;
2095 
2096 		toh.level = SOL_SOCKET;
2097 		toh.name = SO_SRCADDR;
2098 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2099 		toh.status = 0;
2100 		conn_req.OPT_length =
2101 			(t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2102 		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2103 			_TPI_ALIGN_TOPT(addrlen));
2104 
2105 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2106 		soappendmsg(mp, addr, addrlen);
2107 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2108 		soappendmsg(mp, &toh, sizeof (toh));
2109 		soappendmsg(mp, src, srclen);
2110 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2111 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2112 	}
2113 	/*
2114 	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2115 	 * in order to have the right state when the T_CONN_CON shows up.
2116 	 */
2117 	soisconnecting(so);
2118 	mutex_exit(&so->so_lock);
2119 
2120 #ifdef C2_AUDIT
2121 	if (audit_active)
2122 		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2123 #endif /* C2_AUDIT */
2124 
2125 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2126 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2127 	mp = NULL;
2128 	mutex_enter(&so->so_lock);
2129 	if (error != 0)
2130 		goto bad;
2131 
2132 	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2133 		goto bad;
2134 
2135 	/* Allow other threads to access the socket */
2136 	so_unlock_single(so, SOLOCKED);
2137 	need_unlock = B_FALSE;
2138 
2139 	/*
2140 	 * Wait until we get a T_CONN_CON or an error
2141 	 */
2142 	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2143 		so_lock_single(so);	/* Set SOLOCKED */
2144 		need_unlock = B_TRUE;
2145 	}
2146 
2147 done:
2148 	freemsg(mp);
2149 	switch (error) {
2150 	case EINPROGRESS:
2151 	case EALREADY:
2152 	case EISCONN:
2153 	case EINTR:
2154 		/* Non-fatal errors */
2155 		so->so_state &= ~SS_LADDR_VALID;
2156 		/* FALLTHRU */
2157 	case 0:
2158 		break;
2159 
2160 	case EHOSTUNREACH:
2161 		if (flags & _SOCONNECT_XPG4_2) {
2162 			/*
2163 			 * X/Open specification contains a requirement that
2164 			 * ENETUNREACH be returned but does not require
2165 			 * EHOSTUNREACH. In order to keep the test suite
2166 			 * happy we mess with the errno here.
2167 			 */
2168 			error = ENETUNREACH;
2169 		}
2170 		/* FALLTHRU */
2171 
2172 	default:
2173 		ASSERT(need_unlock);
2174 		/*
2175 		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2176 		 * and invalidate local-address cache
2177 		 */
2178 		so->so_state &= ~(SS_ISCONNECTING | SS_LADDR_VALID);
2179 		/* A discon_ind might have already unbound us */
2180 		if ((flags & _SOCONNECT_DID_BIND) &&
2181 		    (so->so_state & SS_ISBOUND)) {
2182 			int err;
2183 
2184 			err = sotpi_unbind(so, 0);
2185 			/* LINTED - statement has no conseq */
2186 			if (err) {
2187 				eprintsoline(so, err);
2188 			}
2189 		}
2190 		break;
2191 	}
2192 	if (need_unlock)
2193 		so_unlock_single(so, SOLOCKED);
2194 	mutex_exit(&so->so_lock);
2195 	return (error);
2196 
2197 so_bad:	error = sogeterr(so);
2198 bad:	eprintsoline(so, error);
2199 	goto done;
2200 }
2201 
2202 int
2203 sotpi_shutdown(struct sonode *so, int how)
2204 {
2205 	struct T_ordrel_req	ordrel_req;
2206 	mblk_t			*mp;
2207 	uint_t			old_state, state_change;
2208 	int			error = 0;
2209 
2210 	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2211 		so, how, pr_state(so->so_state, so->so_mode)));
2212 
2213 	mutex_enter(&so->so_lock);
2214 	so_lock_single(so);	/* Set SOLOCKED */
2215 
2216 	/*
2217 	 * SunOS 4.X has no check for datagram sockets.
2218 	 * 5.X checks that it is connected (ENOTCONN)
2219 	 * X/Open requires that we check the connected state.
2220 	 */
2221 	if (!(so->so_state & SS_ISCONNECTED)) {
2222 		if (!xnet_skip_checks) {
2223 			error = ENOTCONN;
2224 			if (xnet_check_print) {
2225 				printf("sockfs: X/Open shutdown check "
2226 					"caused ENOTCONN\n");
2227 			}
2228 		}
2229 		goto done;
2230 	}
2231 	/*
2232 	 * Record the current state and then perform any state changes.
2233 	 * Then use the difference between the old and new states to
2234 	 * determine which messages need to be sent.
2235 	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2236 	 * duplicate calls to shutdown().
2237 	 */
2238 	old_state = so->so_state;
2239 
2240 	switch (how) {
2241 	case 0:
2242 		socantrcvmore(so);
2243 		break;
2244 	case 1:
2245 		socantsendmore(so);
2246 		break;
2247 	case 2:
2248 		socantsendmore(so);
2249 		socantrcvmore(so);
2250 		break;
2251 	default:
2252 		error = EINVAL;
2253 		goto done;
2254 	}
2255 
2256 	/*
2257 	 * Assumes that the SS_CANT* flags are never cleared in the above code.
2258 	 */
2259 	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2260 		(old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2261 	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2262 
2263 	switch (state_change) {
2264 	case 0:
2265 		dprintso(so, 1,
2266 		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
2267 		    so->so_state));
2268 		goto done;
2269 
2270 	case SS_CANTRCVMORE:
2271 		mutex_exit(&so->so_lock);
2272 		strseteof(SOTOV(so), 1);
2273 		/*
2274 		 * strseteof takes care of read side wakeups,
2275 		 * pollwakeups, and signals.
2276 		 */
2277 		/*
2278 		 * Get the read lock before flushing data to avoid problems
2279 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2280 		 */
2281 		mutex_enter(&so->so_lock);
2282 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2283 		mutex_exit(&so->so_lock);
2284 
2285 		/* Flush read side queue */
2286 		strflushrq(SOTOV(so), FLUSHALL);
2287 
2288 		mutex_enter(&so->so_lock);
2289 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2290 		break;
2291 
2292 	case SS_CANTSENDMORE:
2293 		mutex_exit(&so->so_lock);
2294 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2295 		mutex_enter(&so->so_lock);
2296 		break;
2297 
2298 	case SS_CANTSENDMORE|SS_CANTRCVMORE:
2299 		mutex_exit(&so->so_lock);
2300 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2301 		strseteof(SOTOV(so), 1);
2302 		/*
2303 		 * strseteof takes care of read side wakeups,
2304 		 * pollwakeups, and signals.
2305 		 */
2306 		/*
2307 		 * Get the read lock before flushing data to avoid problems
2308 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2309 		 */
2310 		mutex_enter(&so->so_lock);
2311 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2312 		mutex_exit(&so->so_lock);
2313 
2314 		/* Flush read side queue */
2315 		strflushrq(SOTOV(so), FLUSHALL);
2316 
2317 		mutex_enter(&so->so_lock);
2318 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2319 		break;
2320 	}
2321 
2322 	ASSERT(MUTEX_HELD(&so->so_lock));
2323 
2324 	/*
2325 	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2326 	 * was set due to this call and the new state has both of them set:
2327 	 *	Send the AF_UNIX close indication
2328 	 *	For T_COTS send a discon_ind
2329 	 *
2330 	 * If cantsend was set due to this call:
2331 	 *	For T_COTSORD send an ordrel_ind
2332 	 *
2333 	 * Note that for T_CLTS there is no message sent here.
2334 	 */
2335 	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2336 	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2337 		/*
2338 		 * For SunOS 4.X compatibility we tell the other end
2339 		 * that we are unable to receive at this point.
2340 		 */
2341 		if (so->so_family == AF_UNIX && so->so_serv_type != T_CLTS)
2342 			so_unix_close(so);
2343 
2344 		if (so->so_serv_type == T_COTS)
2345 			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2346 	}
2347 	if ((state_change & SS_CANTSENDMORE) &&
2348 	    (so->so_serv_type == T_COTS_ORD)) {
2349 		/* Send an orderly release */
2350 		ordrel_req.PRIM_type = T_ORDREL_REQ;
2351 
2352 		mutex_exit(&so->so_lock);
2353 		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2354 		    0, _ALLOC_SLEEP);
2355 		/*
2356 		 * Send down the T_ORDREL_REQ even if there is flow control.
2357 		 * This prevents shutdown from blocking.
2358 		 * Note that there is no T_OK_ACK for ordrel_req.
2359 		 */
2360 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2361 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2362 		mutex_enter(&so->so_lock);
2363 		if (error) {
2364 			eprintsoline(so, error);
2365 			goto done;
2366 		}
2367 	}
2368 
2369 done:
2370 	so_unlock_single(so, SOLOCKED);
2371 	mutex_exit(&so->so_lock);
2372 	return (error);
2373 }
2374 
2375 /*
2376  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2377  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2378  * that we have closed.
2379  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2380  * T_UNITDATA_REQ containing the same option.
2381  *
2382  * For SOCK_DGRAM half-connections (somebody connected to this end
2383  * but this end is not connect) we don't know where to send any
2384  * SO_UNIX_CLOSE.
2385  *
2386  * We have to ignore stream head errors just in case there has been
2387  * a shutdown(output).
2388  * Ignore any flow control to try to get the message more quickly to the peer.
2389  * While locally ignoring flow control solves the problem when there
2390  * is only the loopback transport on the stream it would not provide
2391  * the correct AF_UNIX socket semantics when one or more modules have
2392  * been pushed.
2393  */
2394 void
2395 so_unix_close(struct sonode *so)
2396 {
2397 	int		error;
2398 	struct T_opthdr	toh;
2399 	mblk_t		*mp;
2400 
2401 	ASSERT(MUTEX_HELD(&so->so_lock));
2402 
2403 	ASSERT(so->so_family == AF_UNIX);
2404 
2405 	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2406 	    (SS_ISCONNECTED|SS_ISBOUND))
2407 		return;
2408 
2409 	dprintso(so, 1, ("so_unix_close(%p) %s\n",
2410 		so, pr_state(so->so_state, so->so_mode)));
2411 
2412 	toh.level = SOL_SOCKET;
2413 	toh.name = SO_UNIX_CLOSE;
2414 
2415 	/* zero length + header */
2416 	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2417 	toh.status = 0;
2418 
2419 	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2420 		struct T_optdata_req tdr;
2421 
2422 		tdr.PRIM_type = T_OPTDATA_REQ;
2423 		tdr.DATA_flag = 0;
2424 
2425 		tdr.OPT_length = (t_scalar_t)sizeof (toh);
2426 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2427 
2428 		/* NOTE: holding so_lock while sleeping */
2429 		mp = soallocproto2(&tdr, sizeof (tdr),
2430 		    &toh, sizeof (toh), 0, _ALLOC_SLEEP);
2431 	} else {
2432 		struct T_unitdata_req	tudr;
2433 		void			*addr;
2434 		socklen_t		addrlen;
2435 		void			*src;
2436 		socklen_t		srclen;
2437 		struct T_opthdr		toh2;
2438 		t_scalar_t		size;
2439 
2440 		/* Connecteded DGRAM socket */
2441 
2442 		/*
2443 		 * For AF_UNIX the destination address is translated to
2444 		 * an internal name and the source address is passed as
2445 		 * an option.
2446 		 */
2447 		/*
2448 		 * Length and family checks.
2449 		 */
2450 		error = so_addr_verify(so, so->so_faddr_sa,
2451 					(t_uscalar_t)so->so_faddr_len);
2452 		if (error) {
2453 			eprintsoline(so, error);
2454 			return;
2455 		}
2456 		if (so->so_state & SS_FADDR_NOXLATE) {
2457 			/*
2458 			 * Already have a transport internal address. Do not
2459 			 * pass any (transport internal) source address.
2460 			 */
2461 			addr = so->so_faddr_sa;
2462 			addrlen = (t_uscalar_t)so->so_faddr_len;
2463 			src = NULL;
2464 			srclen = 0;
2465 		} else {
2466 			/*
2467 			 * Pass the sockaddr_un source address as an option
2468 			 * and translate the remote address.
2469 			 * Holding so_lock thus so_laddr_sa can not change.
2470 			 */
2471 			src = so->so_laddr_sa;
2472 			srclen = (socklen_t)so->so_laddr_len;
2473 			dprintso(so, 1,
2474 				("so_ux_close: srclen %d, src %p\n",
2475 				srclen, src));
2476 			error = so_ux_addr_xlate(so,
2477 				so->so_faddr_sa,
2478 				(socklen_t)so->so_faddr_len, 0,
2479 				&addr, &addrlen);
2480 			if (error) {
2481 				eprintsoline(so, error);
2482 				return;
2483 			}
2484 		}
2485 		tudr.PRIM_type = T_UNITDATA_REQ;
2486 		tudr.DEST_length = addrlen;
2487 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2488 		if (srclen == 0) {
2489 			tudr.OPT_length = (t_scalar_t)sizeof (toh);
2490 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2491 				_TPI_ALIGN_TOPT(addrlen));
2492 
2493 			size = tudr.OPT_offset + tudr.OPT_length;
2494 			/* NOTE: holding so_lock while sleeping */
2495 			mp = soallocproto2(&tudr, sizeof (tudr),
2496 			    addr, addrlen, size, _ALLOC_SLEEP);
2497 			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2498 			soappendmsg(mp, &toh, sizeof (toh));
2499 		} else {
2500 			/*
2501 			 * There is a AF_UNIX sockaddr_un to include as a
2502 			 * source address option.
2503 			 */
2504 			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2505 			    _TPI_ALIGN_TOPT(srclen));
2506 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2507 			    _TPI_ALIGN_TOPT(addrlen));
2508 
2509 			toh2.level = SOL_SOCKET;
2510 			toh2.name = SO_SRCADDR;
2511 			toh2.len = (t_uscalar_t)(srclen +
2512 					sizeof (struct T_opthdr));
2513 			toh2.status = 0;
2514 
2515 			size = tudr.OPT_offset + tudr.OPT_length;
2516 
2517 			/* NOTE: holding so_lock while sleeping */
2518 			mp = soallocproto2(&tudr, sizeof (tudr),
2519 			    addr, addrlen, size, _ALLOC_SLEEP);
2520 			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2521 			soappendmsg(mp, &toh, sizeof (toh));
2522 			soappendmsg(mp, &toh2, sizeof (toh2));
2523 			soappendmsg(mp, src, srclen);
2524 			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2525 		}
2526 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2527 	}
2528 	mutex_exit(&so->so_lock);
2529 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2530 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2531 	mutex_enter(&so->so_lock);
2532 }
2533 
2534 /*
2535  * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
2536  */
2537 int
2538 sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags)
2539 {
2540 	mblk_t		*mp, *nmp;
2541 	int		error;
2542 
2543 	dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", so, msg, flags));
2544 
2545 	/*
2546 	 * There is never any oob data with addresses or control since
2547 	 * the T_EXDATA_IND does not carry any options.
2548 	 */
2549 	msg->msg_controllen = 0;
2550 	msg->msg_namelen = 0;
2551 
2552 	mutex_enter(&so->so_lock);
2553 	ASSERT(so_verify_oobstate(so));
2554 	if ((so->so_options & SO_OOBINLINE) ||
2555 	    (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
2556 		dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
2557 		mutex_exit(&so->so_lock);
2558 		return (EINVAL);
2559 	}
2560 	if (!(so->so_state & SS_HAVEOOBDATA)) {
2561 		dprintso(so, 1, ("sorecvoob: no data yet\n"));
2562 		mutex_exit(&so->so_lock);
2563 		return (EWOULDBLOCK);
2564 	}
2565 	ASSERT(so->so_oobmsg != NULL);
2566 	mp = so->so_oobmsg;
2567 	if (flags & MSG_PEEK) {
2568 		/*
2569 		 * Since recv* can not return ENOBUFS we can not use dupmsg.
2570 		 * Instead we revert to the consolidation private
2571 		 * allocb_wait plus bcopy.
2572 		 */
2573 		mblk_t *mp1;
2574 
2575 		mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
2576 		ASSERT(mp1);
2577 
2578 		while (mp != NULL) {
2579 			ssize_t size;
2580 
2581 			size = MBLKL(mp);
2582 			bcopy(mp->b_rptr, mp1->b_wptr, size);
2583 			mp1->b_wptr += size;
2584 			ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
2585 			mp = mp->b_cont;
2586 		}
2587 		mp = mp1;
2588 	} else {
2589 		/*
2590 		 * Update the state indicating that the data has been consumed.
2591 		 * Keep SS_OOBPEND set until data is consumed past the mark.
2592 		 */
2593 		so->so_oobmsg = NULL;
2594 		so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
2595 	}
2596 	dprintso(so, 1,
2597 		("after recvoob(%p): counts %d/%d state %s\n",
2598 		so, so->so_oobsigcnt,
2599 		so->so_oobcnt, pr_state(so->so_state, so->so_mode)));
2600 	ASSERT(so_verify_oobstate(so));
2601 	mutex_exit(&so->so_lock);
2602 
2603 	error = 0;
2604 	nmp = mp;
2605 	while (nmp != NULL && uiop->uio_resid > 0) {
2606 		ssize_t n = MBLKL(nmp);
2607 
2608 		n = MIN(n, uiop->uio_resid);
2609 		if (n > 0)
2610 			error = uiomove(nmp->b_rptr, n,
2611 					UIO_READ, uiop);
2612 		if (error)
2613 			break;
2614 		nmp = nmp->b_cont;
2615 	}
2616 	freemsg(mp);
2617 	return (error);
2618 }
2619 
2620 /*
2621  * Called by sotpi_recvmsg when reading a non-zero amount of data.
2622  * In addition, the caller typically verifies that there is some
2623  * potential state to clear by checking
2624  *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2625  * before calling this routine.
2626  * Note that such a check can be made without holding so_lock since
2627  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2628  * decrements so_oobsigcnt.
2629  *
2630  * When data is read *after* the point that all pending
2631  * oob data has been consumed the oob indication is cleared.
2632  *
2633  * This logic keeps select/poll returning POLLRDBAND and
2634  * SIOCATMARK returning true until we have read past
2635  * the mark.
2636  */
2637 static void
2638 sorecv_update_oobstate(struct sonode *so)
2639 {
2640 	mutex_enter(&so->so_lock);
2641 	ASSERT(so_verify_oobstate(so));
2642 	dprintso(so, 1,
2643 		("sorecv_update_oobstate: counts %d/%d state %s\n",
2644 		so->so_oobsigcnt,
2645 		so->so_oobcnt, pr_state(so->so_state, so->so_mode)));
2646 	if (so->so_oobsigcnt == 0) {
2647 		/* No more pending oob indications */
2648 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2649 		freemsg(so->so_oobmsg);
2650 		so->so_oobmsg = NULL;
2651 	}
2652 	ASSERT(so_verify_oobstate(so));
2653 	mutex_exit(&so->so_lock);
2654 }
2655 
2656 /*
2657  * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
2658  */
2659 static int
2660 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
2661 {
2662 	int	error = 0;
2663 	mblk_t *tmp = NULL;
2664 	mblk_t *pmp = NULL;
2665 	mblk_t *nmp = so->so_nl7c_rcv_mp;
2666 
2667 	ASSERT(nmp != NULL);
2668 
2669 	while (nmp != NULL && uiop->uio_resid > 0) {
2670 		ssize_t n;
2671 
2672 		if (DB_TYPE(nmp) == M_DATA) {
2673 			/*
2674 			 * We have some data, uiomove up to resid bytes.
2675 			 */
2676 			n = MIN(MBLKL(nmp), uiop->uio_resid);
2677 			if (n > 0)
2678 				error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
2679 			if (error)
2680 				break;
2681 			nmp->b_rptr += n;
2682 			if (nmp->b_rptr == nmp->b_wptr) {
2683 				pmp = nmp;
2684 				nmp = nmp->b_cont;
2685 			}
2686 		} else {
2687 			/*
2688 			 * We only handle data, save for caller to handle.
2689 			 */
2690 			if (pmp != NULL) {
2691 				pmp->b_cont = nmp->b_cont;
2692 			}
2693 			nmp->b_cont = NULL;
2694 			if (*rmp == NULL) {
2695 				*rmp = nmp;
2696 			} else {
2697 				tmp->b_next = nmp;
2698 			}
2699 			nmp = nmp->b_cont;
2700 			tmp = nmp;
2701 		}
2702 	}
2703 	if (pmp != NULL) {
2704 		/* Free any mblk_t(s) which we have consumed */
2705 		pmp->b_cont = NULL;
2706 		freemsg(so->so_nl7c_rcv_mp);
2707 	}
2708 	if ((so->so_nl7c_rcv_mp = nmp) == NULL) {
2709 		/* Last mblk_t so return the saved rval from kstrgetmsg() */
2710 		rp->r_vals = so->so_nl7c_rcv_rval;
2711 		so->so_nl7c_rcv_rval = 0;
2712 	} else {
2713 		/* More mblk_t(s) to process so no rval to return */
2714 		rp->r_vals = 0;
2715 	}
2716 	return (error);
2717 }
2718 
2719 /*
2720  * Receive the next message on the queue.
2721  * If msg_controllen is non-zero when called the caller is interested in
2722  * any received control info (options).
2723  * If msg_namelen is non-zero when called the caller is interested in
2724  * any received source address.
2725  * The routine returns with msg_control and msg_name pointing to
2726  * kmem_alloc'ed memory which the caller has to free.
2727  */
2728 int
2729 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
2730 {
2731 	union T_primitives	*tpr;
2732 	mblk_t			*mp;
2733 	uchar_t			pri;
2734 	int			pflag, opflag;
2735 	void			*control;
2736 	t_uscalar_t		controllen;
2737 	t_uscalar_t		namelen;
2738 	int			so_state = so->so_state; /* Snapshot */
2739 	ssize_t			saved_resid;
2740 	int			error;
2741 	rval_t			rval;
2742 	int			flags;
2743 	clock_t			timout;
2744 	int			first;
2745 
2746 	flags = msg->msg_flags;
2747 	msg->msg_flags = 0;
2748 
2749 	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
2750 		so, msg, flags,
2751 		pr_state(so->so_state, so->so_mode), so->so_error));
2752 
2753 	/*
2754 	 * If we are not connected because we have never been connected
2755 	 * we return ENOTCONN. If we have been connected (but are no longer
2756 	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
2757 	 * the EOF.
2758 	 *
2759 	 * An alternative would be to post an ENOTCONN error in stream head
2760 	 * (read+write) and clear it when we're connected. However, that error
2761 	 * would cause incorrect poll/select behavior!
2762 	 */
2763 	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
2764 	    (so->so_mode & SM_CONNREQUIRED)) {
2765 		return (ENOTCONN);
2766 	}
2767 
2768 	/*
2769 	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
2770 	 * after checking that the read queue is empty) and returns zero.
2771 	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
2772 	 * is zero.
2773 	 */
2774 
2775 	if (flags & MSG_OOB) {
2776 		/* Check that the transport supports OOB */
2777 		if (!(so->so_mode & SM_EXDATA))
2778 			return (EOPNOTSUPP);
2779 		return (sorecvoob(so, msg, uiop, flags));
2780 	}
2781 
2782 	/*
2783 	 * Set msg_controllen and msg_namelen to zero here to make it
2784 	 * simpler in the cases that no control or name is returned.
2785 	 */
2786 	controllen = msg->msg_controllen;
2787 	namelen = msg->msg_namelen;
2788 	msg->msg_controllen = 0;
2789 	msg->msg_namelen = 0;
2790 
2791 	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
2792 		namelen, controllen));
2793 
2794 	/*
2795 	 * If an NL7C enabled socket and not waiting for write data.
2796 	 */
2797 	mutex_enter(&so->so_lock);
2798 	if ((so->so_nl7c_flags & (NL7C_ENABLED|NL7C_WAITWRITE)) ==
2799 	    NL7C_ENABLED) {
2800 		if (so->so_nl7c_uri) {
2801 			/*
2802 			 * Close uri processing for a previous request.
2803 			 */
2804 			nl7c_close(so);
2805 		}
2806 		if (nl7c_process(so,
2807 		    (so->so_state & (SS_NONBLOCK|SS_NDELAY)),
2808 		    (int)((tcp_t *)so->so_priv)->tcp_mss)) {
2809 			/*
2810 			 * NL7C has completed processing on the socket,
2811 			 * clear the enabled bit as no further NL7C
2812 			 * processing will be needed.
2813 			 */
2814 			so->so_nl7c_flags = 0;
2815 		}
2816 	}
2817 
2818 	/*
2819 	 * Only one reader is allowed at any given time. This is needed
2820 	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
2821 	 *
2822 	 * This is slightly different than BSD behavior in that it fails with
2823 	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
2824 	 * is single-threaded using sblock(), which is dropped while waiting
2825 	 * for data to appear. The difference shows up e.g. if one
2826 	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
2827 	 * does use nonblocking io and different threads are reading each
2828 	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
2829 	 * in this case as long as the read queue doesn't get empty.
2830 	 * In this implementation the thread using nonblocking io can
2831 	 * get an EWOULDBLOCK error due to the blocking thread executing
2832 	 * e.g. in the uiomove in kstrgetmsg.
2833 	 * This difference is not believed to be significant.
2834 	 */
2835 	error = so_lock_read_intr(so, uiop->uio_fmode);	/* Set SOREADLOCKED */
2836 	mutex_exit(&so->so_lock);
2837 	if (error)
2838 		return (error);
2839 
2840 	/*
2841 	 * Tell kstrgetmsg to not inspect the stream head errors until all
2842 	 * queued data has been consumed.
2843 	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
2844 	 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
2845 	 *
2846 	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
2847 	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
2848 	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
2849 	 */
2850 	pflag = MSG_ANY | MSG_DELAYERROR;
2851 	if (flags & MSG_PEEK) {
2852 		pflag |= MSG_IPEEK;
2853 		flags &= ~MSG_WAITALL;
2854 	}
2855 	if (so->so_mode & SM_ATOMIC)
2856 		pflag |= MSG_DISCARDTAIL;
2857 
2858 	if (flags & MSG_DONTWAIT)
2859 		timout = 0;
2860 	else
2861 		timout = -1;
2862 	opflag = pflag;
2863 	first = 1;
2864 
2865 	/*
2866 	 * If so saved NL7C rcv mblk_t(s) uiomove them first
2867 	 * else get'm from the streamhead.
2868 	 */
2869 retry:
2870 	saved_resid = uiop->uio_resid;
2871 	pri = 0;
2872 	mp = NULL;
2873 	if (so->so_nl7c_rcv_mp != NULL) {
2874 		error = nl7c_sorecv(so, &mp, uiop, &rval);
2875 	} else {
2876 		error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
2877 		    timout, &rval);
2878 	}
2879 	if (error) {
2880 		switch (error) {
2881 		case EINTR:
2882 		case EWOULDBLOCK:
2883 			if (!first)
2884 				error = 0;
2885 			break;
2886 		case ETIME:
2887 			/* Returned from kstrgetmsg when timeout expires */
2888 			if (!first)
2889 				error = 0;
2890 			else
2891 				error = EWOULDBLOCK;
2892 			break;
2893 		default:
2894 			eprintsoline(so, error);
2895 			break;
2896 		}
2897 		mutex_enter(&so->so_lock);
2898 		so_unlock_read(so);	/* Clear SOREADLOCKED */
2899 		mutex_exit(&so->so_lock);
2900 		return (error);
2901 	}
2902 	/*
2903 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
2904 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
2905 	 */
2906 	ASSERT(!(rval.r_val1 & MORECTL));
2907 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
2908 		msg->msg_flags |= MSG_TRUNC;
2909 
2910 	if (mp == NULL) {
2911 		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
2912 		/*
2913 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
2914 		 * The draft Posix socket spec states that the mark should
2915 		 * not be cleared when peeking. We follow the latter.
2916 		 */
2917 		if ((so->so_state &
2918 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
2919 		    (uiop->uio_resid != saved_resid) &&
2920 		    !(flags & MSG_PEEK)) {
2921 			sorecv_update_oobstate(so);
2922 		}
2923 
2924 		mutex_enter(&so->so_lock);
2925 		/* Set MSG_EOR based on MOREDATA */
2926 		if (!(rval.r_val1 & MOREDATA)) {
2927 			if (so->so_state & SS_SAVEDEOR) {
2928 				msg->msg_flags |= MSG_EOR;
2929 				so->so_state &= ~SS_SAVEDEOR;
2930 			}
2931 		}
2932 		/*
2933 		 * If some data was received (i.e. not EOF) and the
2934 		 * read/recv* has not been satisfied wait for some more.
2935 		 */
2936 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
2937 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
2938 			mutex_exit(&so->so_lock);
2939 			first = 0;
2940 			pflag = opflag | MSG_NOMARK;
2941 			goto retry;
2942 		}
2943 		so_unlock_read(so);	/* Clear SOREADLOCKED */
2944 		mutex_exit(&so->so_lock);
2945 		return (0);
2946 	}
2947 
2948 	/* strsock_proto has already verified length and alignment */
2949 	tpr = (union T_primitives *)mp->b_rptr;
2950 	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
2951 
2952 	switch (tpr->type) {
2953 	case T_DATA_IND: {
2954 		if ((so->so_state &
2955 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
2956 		    (uiop->uio_resid != saved_resid) &&
2957 		    !(flags & MSG_PEEK)) {
2958 			sorecv_update_oobstate(so);
2959 		}
2960 
2961 		/*
2962 		 * Set msg_flags to MSG_EOR based on
2963 		 * MORE_flag and MOREDATA.
2964 		 */
2965 		mutex_enter(&so->so_lock);
2966 		so->so_state &= ~SS_SAVEDEOR;
2967 		if (!(tpr->data_ind.MORE_flag & 1)) {
2968 			if (!(rval.r_val1 & MOREDATA))
2969 				msg->msg_flags |= MSG_EOR;
2970 			else
2971 				so->so_state |= SS_SAVEDEOR;
2972 		}
2973 		freemsg(mp);
2974 		/*
2975 		 * If some data was received (i.e. not EOF) and the
2976 		 * read/recv* has not been satisfied wait for some more.
2977 		 */
2978 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
2979 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
2980 			mutex_exit(&so->so_lock);
2981 			first = 0;
2982 			pflag = opflag | MSG_NOMARK;
2983 			goto retry;
2984 		}
2985 		so_unlock_read(so);	/* Clear SOREADLOCKED */
2986 		mutex_exit(&so->so_lock);
2987 		return (0);
2988 	}
2989 	case T_UNITDATA_IND: {
2990 		void *addr;
2991 		t_uscalar_t addrlen;
2992 		void *abuf;
2993 		t_uscalar_t optlen;
2994 		void *opt;
2995 
2996 		if ((so->so_state &
2997 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
2998 		    (uiop->uio_resid != saved_resid) &&
2999 		    !(flags & MSG_PEEK)) {
3000 			sorecv_update_oobstate(so);
3001 		}
3002 
3003 		if (namelen != 0) {
3004 			/* Caller wants source address */
3005 			addrlen = tpr->unitdata_ind.SRC_length;
3006 			addr = sogetoff(mp,
3007 				tpr->unitdata_ind.SRC_offset,
3008 				addrlen, 1);
3009 			if (addr == NULL) {
3010 				freemsg(mp);
3011 				error = EPROTO;
3012 				eprintsoline(so, error);
3013 				goto err;
3014 			}
3015 			if (so->so_family == AF_UNIX) {
3016 				/*
3017 				 * Can not use the transport level address.
3018 				 * If there is a SO_SRCADDR option carrying
3019 				 * the socket level address it will be
3020 				 * extracted below.
3021 				 */
3022 				addr = NULL;
3023 				addrlen = 0;
3024 			}
3025 		}
3026 		optlen = tpr->unitdata_ind.OPT_length;
3027 		if (optlen != 0) {
3028 			t_uscalar_t ncontrollen;
3029 
3030 			/*
3031 			 * Extract any source address option.
3032 			 * Determine how large cmsg buffer is needed.
3033 			 */
3034 			opt = sogetoff(mp,
3035 				tpr->unitdata_ind.OPT_offset,
3036 				optlen, __TPI_ALIGN_SIZE);
3037 
3038 			if (opt == NULL) {
3039 				freemsg(mp);
3040 				error = EPROTO;
3041 				eprintsoline(so, error);
3042 				goto err;
3043 			}
3044 			if (so->so_family == AF_UNIX)
3045 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3046 			ncontrollen = so_cmsglen(mp, opt, optlen,
3047 						!(flags & MSG_XPG4_2));
3048 			if (controllen != 0)
3049 				controllen = ncontrollen;
3050 			else if (ncontrollen != 0)
3051 				msg->msg_flags |= MSG_CTRUNC;
3052 		} else {
3053 			controllen = 0;
3054 		}
3055 
3056 		if (namelen != 0) {
3057 			/*
3058 			 * Return address to caller.
3059 			 * Caller handles truncation if length
3060 			 * exceeds msg_namelen.
3061 			 * NOTE: AF_UNIX NUL termination is ensured by
3062 			 * the sender's copyin_name().
3063 			 */
3064 			abuf = kmem_alloc(addrlen, KM_SLEEP);
3065 
3066 			bcopy(addr, abuf, addrlen);
3067 			msg->msg_name = abuf;
3068 			msg->msg_namelen = addrlen;
3069 		}
3070 
3071 		if (controllen != 0) {
3072 			/*
3073 			 * Return control msg to caller.
3074 			 * Caller handles truncation if length
3075 			 * exceeds msg_controllen.
3076 			 */
3077 			control = kmem_alloc(controllen, KM_SLEEP);
3078 
3079 			error = so_opt2cmsg(mp, opt, optlen,
3080 					!(flags & MSG_XPG4_2),
3081 					control, controllen);
3082 			if (error) {
3083 				freemsg(mp);
3084 				if (msg->msg_namelen != 0)
3085 					kmem_free(msg->msg_name,
3086 						msg->msg_namelen);
3087 				kmem_free(control, controllen);
3088 				eprintsoline(so, error);
3089 				goto err;
3090 			}
3091 			msg->msg_control = control;
3092 			msg->msg_controllen = controllen;
3093 		}
3094 
3095 		freemsg(mp);
3096 		mutex_enter(&so->so_lock);
3097 		so_unlock_read(so);	/* Clear SOREADLOCKED */
3098 		mutex_exit(&so->so_lock);
3099 		return (0);
3100 	}
3101 	case T_OPTDATA_IND: {
3102 		struct T_optdata_req *tdr;
3103 		void *opt;
3104 		t_uscalar_t optlen;
3105 
3106 		if ((so->so_state &
3107 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3108 		    (uiop->uio_resid != saved_resid) &&
3109 		    !(flags & MSG_PEEK)) {
3110 			sorecv_update_oobstate(so);
3111 		}
3112 
3113 		tdr = (struct T_optdata_req *)mp->b_rptr;
3114 		optlen = tdr->OPT_length;
3115 		if (optlen != 0) {
3116 			t_uscalar_t ncontrollen;
3117 			/*
3118 			 * Determine how large cmsg buffer is needed.
3119 			 */
3120 			opt = sogetoff(mp,
3121 					tpr->optdata_ind.OPT_offset,
3122 					optlen, __TPI_ALIGN_SIZE);
3123 
3124 			if (opt == NULL) {
3125 				freemsg(mp);
3126 				error = EPROTO;
3127 				eprintsoline(so, error);
3128 				goto err;
3129 			}
3130 
3131 			ncontrollen = so_cmsglen(mp, opt, optlen,
3132 						!(flags & MSG_XPG4_2));
3133 			if (controllen != 0)
3134 				controllen = ncontrollen;
3135 			else if (ncontrollen != 0)
3136 				msg->msg_flags |= MSG_CTRUNC;
3137 		} else {
3138 			controllen = 0;
3139 		}
3140 
3141 		if (controllen != 0) {
3142 			/*
3143 			 * Return control msg to caller.
3144 			 * Caller handles truncation if length
3145 			 * exceeds msg_controllen.
3146 			 */
3147 			control = kmem_alloc(controllen, KM_SLEEP);
3148 
3149 			error = so_opt2cmsg(mp, opt, optlen,
3150 					!(flags & MSG_XPG4_2),
3151 					control, controllen);
3152 			if (error) {
3153 				freemsg(mp);
3154 				kmem_free(control, controllen);
3155 				eprintsoline(so, error);
3156 				goto err;
3157 			}
3158 			msg->msg_control = control;
3159 			msg->msg_controllen = controllen;
3160 		}
3161 
3162 		/*
3163 		 * Set msg_flags to MSG_EOR based on
3164 		 * DATA_flag and MOREDATA.
3165 		 */
3166 		mutex_enter(&so->so_lock);
3167 		so->so_state &= ~SS_SAVEDEOR;
3168 		if (!(tpr->data_ind.MORE_flag & 1)) {
3169 			if (!(rval.r_val1 & MOREDATA))
3170 				msg->msg_flags |= MSG_EOR;
3171 			else
3172 				so->so_state |= SS_SAVEDEOR;
3173 		}
3174 		freemsg(mp);
3175 		/*
3176 		 * If some data was received (i.e. not EOF) and the
3177 		 * read/recv* has not been satisfied wait for some more.
3178 		 * Not possible to wait if control info was received.
3179 		 */
3180 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3181 		    controllen == 0 &&
3182 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3183 			mutex_exit(&so->so_lock);
3184 			first = 0;
3185 			pflag = opflag | MSG_NOMARK;
3186 			goto retry;
3187 		}
3188 		so_unlock_read(so);	/* Clear SOREADLOCKED */
3189 		mutex_exit(&so->so_lock);
3190 		return (0);
3191 	}
3192 	case T_EXDATA_IND: {
3193 		dprintso(so, 1,
3194 			("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3195 			"state %s\n",
3196 			so->so_oobsigcnt, so->so_oobcnt,
3197 			saved_resid - uiop->uio_resid,
3198 			pr_state(so->so_state, so->so_mode)));
3199 		/*
3200 		 * kstrgetmsg handles MSGMARK so there is nothing to
3201 		 * inspect in the T_EXDATA_IND.
3202 		 * strsock_proto makes the stream head queue the T_EXDATA_IND
3203 		 * as a separate message with no M_DATA component. Furthermore,
3204 		 * the stream head does not consolidate M_DATA messages onto
3205 		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3206 		 * remains a message by itself. This is needed since MSGMARK
3207 		 * marks both the whole message as well as the last byte
3208 		 * of the message.
3209 		 */
3210 		freemsg(mp);
3211 		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
3212 		if (flags & MSG_PEEK) {
3213 			/*
3214 			 * Even though we are peeking we consume the
3215 			 * T_EXDATA_IND thereby moving the mark information
3216 			 * to SS_RCVATMARK. Then the oob code below will
3217 			 * retry the peeking kstrgetmsg.
3218 			 * Note that the stream head read queue is
3219 			 * never flushed without holding SOREADLOCKED
3220 			 * thus the T_EXDATA_IND can not disappear
3221 			 * underneath us.
3222 			 */
3223 			dprintso(so, 1,
3224 				("sotpi_recvmsg: consume EXDATA_IND "
3225 				"counts %d/%d state %s\n",
3226 				so->so_oobsigcnt,
3227 				so->so_oobcnt,
3228 				pr_state(so->so_state, so->so_mode)));
3229 
3230 			pflag = MSG_ANY | MSG_DELAYERROR;
3231 			if (so->so_mode & SM_ATOMIC)
3232 				pflag |= MSG_DISCARDTAIL;
3233 
3234 			pri = 0;
3235 			mp = NULL;
3236 
3237 			error = kstrgetmsg(SOTOV(so), &mp, uiop,
3238 				&pri, &pflag, (clock_t)-1, &rval);
3239 			ASSERT(uiop->uio_resid == saved_resid);
3240 
3241 			if (error) {
3242 #ifdef SOCK_DEBUG
3243 				if (error != EWOULDBLOCK && error != EINTR) {
3244 					eprintsoline(so, error);
3245 				}
3246 #endif /* SOCK_DEBUG */
3247 				mutex_enter(&so->so_lock);
3248 				so_unlock_read(so);	/* Clear SOREADLOCKED */
3249 				mutex_exit(&so->so_lock);
3250 				return (error);
3251 			}
3252 			ASSERT(mp);
3253 			tpr = (union T_primitives *)mp->b_rptr;
3254 			ASSERT(tpr->type == T_EXDATA_IND);
3255 			freemsg(mp);
3256 		} /* end "if (flags & MSG_PEEK)" */
3257 
3258 		/*
3259 		 * Decrement the number of queued and pending oob.
3260 		 *
3261 		 * SS_RCVATMARK is cleared when we read past a mark.
3262 		 * SS_HAVEOOBDATA is cleared when we've read past the
3263 		 * last mark.
3264 		 * SS_OOBPEND is cleared if we've read past the last
3265 		 * mark and no (new) SIGURG has been posted.
3266 		 */
3267 		mutex_enter(&so->so_lock);
3268 		ASSERT(so_verify_oobstate(so));
3269 		ASSERT(so->so_oobsigcnt >= so->so_oobcnt);
3270 		ASSERT(so->so_oobsigcnt > 0);
3271 		so->so_oobsigcnt--;
3272 		ASSERT(so->so_oobcnt > 0);
3273 		so->so_oobcnt--;
3274 		/*
3275 		 * Since the T_EXDATA_IND has been removed from the stream
3276 		 * head, but we have not read data past the mark,
3277 		 * sockfs needs to track that the socket is still at the mark.
3278 		 *
3279 		 * Since no data was received call kstrgetmsg again to wait
3280 		 * for data.
3281 		 */
3282 		so->so_state |= SS_RCVATMARK;
3283 		mutex_exit(&so->so_lock);
3284 		dprintso(so, 1,
3285 		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3286 		    so->so_oobsigcnt, so->so_oobcnt,
3287 		    pr_state(so->so_state, so->so_mode)));
3288 		pflag = opflag;
3289 		goto retry;
3290 	}
3291 	default:
3292 		ASSERT(0);
3293 		freemsg(mp);
3294 		error = EPROTO;
3295 		eprintsoline(so, error);
3296 		goto err;
3297 	}
3298 	/* NOTREACHED */
3299 err:
3300 	mutex_enter(&so->so_lock);
3301 	so_unlock_read(so);	/* Clear SOREADLOCKED */
3302 	mutex_exit(&so->so_lock);
3303 	return (error);
3304 }
3305 
3306 /*
3307  * Sending data with options on a datagram socket.
3308  * Assumes caller has verified that SS_ISBOUND etc. are set.
3309  */
3310 static int
3311 sosend_dgramcmsg(struct sonode *so,
3312 		struct sockaddr *name,
3313 		t_uscalar_t namelen,
3314 		struct uio *uiop,
3315 		void *control,
3316 		t_uscalar_t controllen,
3317 		int flags)
3318 {
3319 	struct T_unitdata_req	tudr;
3320 	mblk_t			*mp;
3321 	int			error;
3322 	void			*addr;
3323 	socklen_t		addrlen;
3324 	void			*src;
3325 	socklen_t		srclen;
3326 	ssize_t			len;
3327 	int			size;
3328 	struct T_opthdr		toh;
3329 	struct fdbuf		*fdbuf;
3330 	t_uscalar_t		optlen;
3331 	void			*fds;
3332 	int			fdlen;
3333 
3334 	ASSERT(name && namelen);
3335 	ASSERT(control && controllen);
3336 
3337 	len = uiop->uio_resid;
3338 	if (len > (ssize_t)so->so_tidu_size) {
3339 		return (EMSGSIZE);
3340 	}
3341 
3342 	/*
3343 	 * For AF_UNIX the destination address is translated to an internal
3344 	 * name and the source address is passed as an option.
3345 	 * Also, file descriptors are passed as file pointers in an
3346 	 * option.
3347 	 */
3348 
3349 	/*
3350 	 * Length and family checks.
3351 	 */
3352 	error = so_addr_verify(so, name, namelen);
3353 	if (error) {
3354 		eprintsoline(so, error);
3355 		return (error);
3356 	}
3357 	if (so->so_family == AF_UNIX) {
3358 		if (so->so_state & SS_FADDR_NOXLATE) {
3359 			/*
3360 			 * Already have a transport internal address. Do not
3361 			 * pass any (transport internal) source address.
3362 			 */
3363 			addr = name;
3364 			addrlen = namelen;
3365 			src = NULL;
3366 			srclen = 0;
3367 		} else {
3368 			/*
3369 			 * Pass the sockaddr_un source address as an option
3370 			 * and translate the remote address.
3371 			 *
3372 			 * Note that this code does not prevent so_laddr_sa
3373 			 * from changing while it is being used. Thus
3374 			 * if an unbind+bind occurs concurrently with this
3375 			 * send the peer might see a partially new and a
3376 			 * partially old "from" address.
3377 			 */
3378 			src = so->so_laddr_sa;
3379 			srclen = (t_uscalar_t)so->so_laddr_len;
3380 			dprintso(so, 1,
3381 			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3382 			    srclen, src));
3383 			error = so_ux_addr_xlate(so, name, namelen,
3384 				(flags & MSG_XPG4_2),
3385 				&addr, &addrlen);
3386 			if (error) {
3387 				eprintsoline(so, error);
3388 				return (error);
3389 			}
3390 		}
3391 	} else {
3392 		addr = name;
3393 		addrlen = namelen;
3394 		src = NULL;
3395 		srclen = 0;
3396 	}
3397 	optlen = so_optlen(control, controllen,
3398 					!(flags & MSG_XPG4_2));
3399 	tudr.PRIM_type = T_UNITDATA_REQ;
3400 	tudr.DEST_length = addrlen;
3401 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3402 	if (srclen != 0)
3403 		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3404 		    _TPI_ALIGN_TOPT(srclen));
3405 	else
3406 		tudr.OPT_length = optlen;
3407 	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3408 				_TPI_ALIGN_TOPT(addrlen));
3409 
3410 	size = tudr.OPT_offset + tudr.OPT_length;
3411 
3412 	/*
3413 	 * File descriptors only when SM_FDPASSING set.
3414 	 */
3415 	error = so_getfdopt(control, controllen,
3416 			!(flags & MSG_XPG4_2), &fds, &fdlen);
3417 	if (error)
3418 		return (error);
3419 	if (fdlen != -1) {
3420 		if (!(so->so_mode & SM_FDPASSING))
3421 			return (EOPNOTSUPP);
3422 
3423 		error = fdbuf_create(fds, fdlen, &fdbuf);
3424 		if (error)
3425 			return (error);
3426 		mp = fdbuf_allocmsg(size, fdbuf);
3427 	} else {
3428 		mp = soallocproto(size, _ALLOC_INTR);
3429 		if (mp == NULL) {
3430 			/*
3431 			 * Caught a signal waiting for memory.
3432 			 * Let send* return EINTR.
3433 			 */
3434 			return (EINTR);
3435 		}
3436 	}
3437 	soappendmsg(mp, &tudr, sizeof (tudr));
3438 	soappendmsg(mp, addr, addrlen);
3439 	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3440 
3441 	if (fdlen != -1) {
3442 		ASSERT(fdbuf != NULL);
3443 		toh.level = SOL_SOCKET;
3444 		toh.name = SO_FILEP;
3445 		toh.len = fdbuf->fd_size +
3446 				(t_uscalar_t)sizeof (struct T_opthdr);
3447 		toh.status = 0;
3448 		soappendmsg(mp, &toh, sizeof (toh));
3449 		soappendmsg(mp, fdbuf, fdbuf->fd_size);
3450 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3451 	}
3452 	if (srclen != 0) {
3453 		/*
3454 		 * There is a AF_UNIX sockaddr_un to include as a source
3455 		 * address option.
3456 		 */
3457 		toh.level = SOL_SOCKET;
3458 		toh.name = SO_SRCADDR;
3459 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3460 		toh.status = 0;
3461 		soappendmsg(mp, &toh, sizeof (toh));
3462 		soappendmsg(mp, src, srclen);
3463 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3464 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3465 	}
3466 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3467 	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3468 	/* At most 3 bytes left in the message */
3469 	ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3470 	ASSERT(MBLKL(mp) <= (ssize_t)size);
3471 
3472 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3473 #ifdef C2_AUDIT
3474 	if (audit_active)
3475 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3476 #endif /* C2_AUDIT */
3477 
3478 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3479 #ifdef SOCK_DEBUG
3480 	if (error) {
3481 		eprintsoline(so, error);
3482 	}
3483 #endif /* SOCK_DEBUG */
3484 	return (error);
3485 }
3486 
3487 /*
3488  * Sending data with options on a connected stream socket.
3489  * Assumes caller has verified that SS_ISCONNECTED is set.
3490  */
3491 static int
3492 sosend_svccmsg(struct sonode *so,
3493 		struct uio *uiop,
3494 		int more,
3495 		void *control,
3496 		t_uscalar_t controllen,
3497 		int flags)
3498 {
3499 	struct T_optdata_req	tdr;
3500 	mblk_t			*mp;
3501 	int			error;
3502 	ssize_t			iosize;
3503 	int			first = 1;
3504 	int			size;
3505 	struct fdbuf		*fdbuf;
3506 	t_uscalar_t		optlen;
3507 	void			*fds;
3508 	int			fdlen;
3509 	struct T_opthdr		toh;
3510 
3511 	dprintso(so, 1,
3512 		("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3513 
3514 	/*
3515 	 * Has to be bound and connected. However, since no locks are
3516 	 * held the state could have changed after sotpi_sendmsg checked it
3517 	 * thus it is not possible to ASSERT on the state.
3518 	 */
3519 
3520 	/* Options on connection-oriented only when SM_OPTDATA set. */
3521 	if (!(so->so_mode & SM_OPTDATA))
3522 		return (EOPNOTSUPP);
3523 
3524 	do {
3525 		/*
3526 		 * Set the MORE flag if uio_resid does not fit in this
3527 		 * message or if the caller passed in "more".
3528 		 * Error for transports with zero tidu_size.
3529 		 */
3530 		tdr.PRIM_type = T_OPTDATA_REQ;
3531 		iosize = so->so_tidu_size;
3532 		if (iosize <= 0)
3533 			return (EMSGSIZE);
3534 		if (uiop->uio_resid > iosize) {
3535 			tdr.DATA_flag = 1;
3536 		} else {
3537 			if (more)
3538 				tdr.DATA_flag = 1;
3539 			else
3540 				tdr.DATA_flag = 0;
3541 			iosize = uiop->uio_resid;
3542 		}
3543 		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3544 			tdr.DATA_flag, iosize));
3545 
3546 		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3547 		tdr.OPT_length = optlen;
3548 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3549 
3550 		size = (int)sizeof (tdr) + optlen;
3551 		/*
3552 		 * File descriptors only when SM_FDPASSING set.
3553 		 */
3554 		error = so_getfdopt(control, controllen,
3555 				!(flags & MSG_XPG4_2), &fds, &fdlen);
3556 		if (error)
3557 			return (error);
3558 		if (fdlen != -1) {
3559 			if (!(so->so_mode & SM_FDPASSING))
3560 				return (EOPNOTSUPP);
3561 
3562 			error = fdbuf_create(fds, fdlen, &fdbuf);
3563 			if (error)
3564 				return (error);
3565 			mp = fdbuf_allocmsg(size, fdbuf);
3566 		} else {
3567 			mp = soallocproto(size, _ALLOC_INTR);
3568 			if (mp == NULL) {
3569 				/*
3570 				 * Caught a signal waiting for memory.
3571 				 * Let send* return EINTR.
3572 				 */
3573 				return (first ? EINTR : 0);
3574 			}
3575 		}
3576 		soappendmsg(mp, &tdr, sizeof (tdr));
3577 
3578 		if (fdlen != -1) {
3579 			ASSERT(fdbuf != NULL);
3580 			toh.level = SOL_SOCKET;
3581 			toh.name = SO_FILEP;
3582 			toh.len = fdbuf->fd_size +
3583 				(t_uscalar_t)sizeof (struct T_opthdr);
3584 			toh.status = 0;
3585 			soappendmsg(mp, &toh, sizeof (toh));
3586 			soappendmsg(mp, fdbuf, fdbuf->fd_size);
3587 			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3588 		}
3589 		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3590 		/* At most 3 bytes left in the message */
3591 		ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3592 		ASSERT(MBLKL(mp) <= (ssize_t)size);
3593 
3594 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3595 
3596 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3597 					0, MSG_BAND, 0);
3598 		if (error) {
3599 			if (!first && error == EWOULDBLOCK)
3600 				return (0);
3601 			eprintsoline(so, error);
3602 			return (error);
3603 		}
3604 		control = NULL;
3605 		first = 0;
3606 		if (uiop->uio_resid > 0) {
3607 			/*
3608 			 * Recheck for fatal errors. Fail write even though
3609 			 * some data have been written. This is consistent
3610 			 * with strwrite semantics and BSD sockets semantics.
3611 			 */
3612 			if (so->so_state & SS_CANTSENDMORE) {
3613 				tsignal(curthread, SIGPIPE);
3614 				eprintsoline(so, error);
3615 				return (EPIPE);
3616 			}
3617 			if (so->so_error != 0) {
3618 				mutex_enter(&so->so_lock);
3619 				error = sogeterr(so);
3620 				mutex_exit(&so->so_lock);
3621 				if (error != 0) {
3622 					eprintsoline(so, error);
3623 					return (error);
3624 				}
3625 			}
3626 		}
3627 	} while (uiop->uio_resid > 0);
3628 	return (0);
3629 }
3630 
3631 /*
3632  * Sending data on a datagram socket.
3633  * Assumes caller has verified that SS_ISBOUND etc. are set.
3634  *
3635  * For AF_UNIX the destination address is translated to an internal
3636  * name and the source address is passed as an option.
3637  */
3638 int
3639 sosend_dgram(struct sonode	*so,
3640 		struct sockaddr	*name,
3641 		socklen_t	namelen,
3642 		struct uio	*uiop,
3643 		int		flags)
3644 {
3645 	struct T_unitdata_req	tudr;
3646 	mblk_t			*mp;
3647 	int			error;
3648 	void			*addr;
3649 	socklen_t		addrlen;
3650 	void			*src;
3651 	socklen_t		srclen;
3652 	ssize_t			len;
3653 
3654 	ASSERT(name && namelen);
3655 
3656 	len = uiop->uio_resid;
3657 	if (len > so->so_tidu_size) {
3658 		error = EMSGSIZE;
3659 		goto done;
3660 	}
3661 
3662 	/*
3663 	 * Length and family checks.
3664 	 */
3665 	error = so_addr_verify(so, name, namelen);
3666 	if (error) {
3667 		eprintsoline(so, error);
3668 		goto done;
3669 	}
3670 	if (so->so_family == AF_UNIX) {
3671 		if (so->so_state & SS_FADDR_NOXLATE) {
3672 			/*
3673 			 * Already have a transport internal address. Do not
3674 			 * pass any (transport internal) source address.
3675 			 */
3676 			addr = name;
3677 			addrlen = namelen;
3678 			src = NULL;
3679 			srclen = 0;
3680 		} else {
3681 			/*
3682 			 * Pass the sockaddr_un source address as an option
3683 			 * and translate the remote address.
3684 			 *
3685 			 * Note that this code does not prevent so_laddr_sa
3686 			 * from changing while it is being used. Thus
3687 			 * if an unbind+bind occurs concurrently with this
3688 			 * send the peer might see a partially new and a
3689 			 * partially old "from" address.
3690 			 */
3691 			src = so->so_laddr_sa;
3692 			srclen = (socklen_t)so->so_laddr_len;
3693 			dprintso(so, 1,
3694 				("sosend_dgram UNIX: srclen %d, src %p\n",
3695 				srclen, src));
3696 			error = so_ux_addr_xlate(so, name, namelen,
3697 				(flags & MSG_XPG4_2),
3698 				&addr, &addrlen);
3699 			if (error) {
3700 				eprintsoline(so, error);
3701 				goto done;
3702 			}
3703 		}
3704 	} else {
3705 		addr = name;
3706 		addrlen = namelen;
3707 		src = NULL;
3708 		srclen = 0;
3709 	}
3710 	tudr.PRIM_type = T_UNITDATA_REQ;
3711 	tudr.DEST_length = addrlen;
3712 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3713 	if (srclen == 0) {
3714 		tudr.OPT_length = 0;
3715 		tudr.OPT_offset = 0;
3716 
3717 		mp = soallocproto2(&tudr, sizeof (tudr),
3718 		    addr, addrlen, 0, _ALLOC_INTR);
3719 		if (mp == NULL) {
3720 			/*
3721 			 * Caught a signal waiting for memory.
3722 			 * Let send* return EINTR.
3723 			 */
3724 			error = EINTR;
3725 			goto done;
3726 		}
3727 	} else {
3728 		/*
3729 		 * There is a AF_UNIX sockaddr_un to include as a source
3730 		 * address option.
3731 		 */
3732 		struct T_opthdr toh;
3733 		ssize_t size;
3734 
3735 		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
3736 					_TPI_ALIGN_TOPT(srclen));
3737 		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3738 					_TPI_ALIGN_TOPT(addrlen));
3739 
3740 		toh.level = SOL_SOCKET;
3741 		toh.name = SO_SRCADDR;
3742 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3743 		toh.status = 0;
3744 
3745 		size = tudr.OPT_offset + tudr.OPT_length;
3746 		mp = soallocproto2(&tudr, sizeof (tudr),
3747 		    addr, addrlen, size, _ALLOC_INTR);
3748 		if (mp == NULL) {
3749 			/*
3750 			 * Caught a signal waiting for memory.
3751 			 * Let send* return EINTR.
3752 			 */
3753 			error = EINTR;
3754 			goto done;
3755 		}
3756 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3757 		soappendmsg(mp, &toh, sizeof (toh));
3758 		soappendmsg(mp, src, srclen);
3759 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3760 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3761 	}
3762 
3763 #ifdef C2_AUDIT
3764 	if (audit_active)
3765 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3766 #endif /* C2_AUDIT */
3767 
3768 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3769 done:
3770 #ifdef SOCK_DEBUG
3771 	if (error) {
3772 		eprintsoline(so, error);
3773 	}
3774 #endif /* SOCK_DEBUG */
3775 	return (error);
3776 }
3777 
3778 /*
3779  * Sending data on a connected stream socket.
3780  * Assumes caller has verified that SS_ISCONNECTED is set.
3781  */
3782 int
3783 sosend_svc(struct sonode *so,
3784 	struct uio *uiop,
3785 	t_scalar_t prim,
3786 	int more,
3787 	int sflag)
3788 {
3789 	struct T_data_req	tdr;
3790 	mblk_t			*mp;
3791 	int			error;
3792 	ssize_t			iosize;
3793 	int			first = 1;
3794 
3795 	dprintso(so, 1,
3796 		("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
3797 		so, uiop->uio_resid, prim, sflag));
3798 
3799 	/*
3800 	 * Has to be bound and connected. However, since no locks are
3801 	 * held the state could have changed after sotpi_sendmsg checked it
3802 	 * thus it is not possible to ASSERT on the state.
3803 	 */
3804 
3805 	do {
3806 		/*
3807 		 * Set the MORE flag if uio_resid does not fit in this
3808 		 * message or if the caller passed in "more".
3809 		 * Error for transports with zero tidu_size.
3810 		 */
3811 		tdr.PRIM_type = prim;
3812 		iosize = so->so_tidu_size;
3813 		if (iosize <= 0)
3814 			return (EMSGSIZE);
3815 		if (uiop->uio_resid > iosize) {
3816 			tdr.MORE_flag = 1;
3817 		} else {
3818 			if (more)
3819 				tdr.MORE_flag = 1;
3820 			else
3821 				tdr.MORE_flag = 0;
3822 			iosize = uiop->uio_resid;
3823 		}
3824 		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
3825 			prim, tdr.MORE_flag, iosize));
3826 		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR);
3827 		if (mp == NULL) {
3828 			/*
3829 			 * Caught a signal waiting for memory.
3830 			 * Let send* return EINTR.
3831 			 */
3832 			if (first)
3833 				return (EINTR);
3834 			else
3835 				return (0);
3836 		}
3837 
3838 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3839 					0, sflag | MSG_BAND, 0);
3840 		if (error) {
3841 			if (!first && error == EWOULDBLOCK)
3842 				return (0);
3843 			eprintsoline(so, error);
3844 			return (error);
3845 		}
3846 		first = 0;
3847 		if (uiop->uio_resid > 0) {
3848 			/*
3849 			 * Recheck for fatal errors. Fail write even though
3850 			 * some data have been written. This is consistent
3851 			 * with strwrite semantics and BSD sockets semantics.
3852 			 */
3853 			if (so->so_state & SS_CANTSENDMORE) {
3854 				tsignal(curthread, SIGPIPE);
3855 				eprintsoline(so, error);
3856 				return (EPIPE);
3857 			}
3858 			if (so->so_error != 0) {
3859 				mutex_enter(&so->so_lock);
3860 				error = sogeterr(so);
3861 				mutex_exit(&so->so_lock);
3862 				if (error != 0) {
3863 					eprintsoline(so, error);
3864 					return (error);
3865 				}
3866 			}
3867 		}
3868 	} while (uiop->uio_resid > 0);
3869 	return (0);
3870 }
3871 
3872 /*
3873  * Check the state for errors and call the appropriate send function.
3874  *
3875  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
3876  * this function issues a setsockopt to toggle SO_DONTROUTE before and
3877  * after sending the message.
3878  */
3879 static int
3880 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
3881 {
3882 	int		so_state;
3883 	int		so_mode;
3884 	int		error;
3885 	struct sockaddr *name;
3886 	t_uscalar_t	namelen;
3887 	int		dontroute;
3888 	int		flags;
3889 
3890 	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
3891 		so, msg, msg->msg_flags,
3892 		pr_state(so->so_state, so->so_mode), so->so_error));
3893 
3894 	mutex_enter(&so->so_lock);
3895 	so_state = so->so_state;
3896 
3897 	if (so_state & SS_CANTSENDMORE) {
3898 		mutex_exit(&so->so_lock);
3899 		tsignal(curthread, SIGPIPE);
3900 		return (EPIPE);
3901 	}
3902 
3903 	if (so->so_error != 0) {
3904 		error = sogeterr(so);
3905 		if (error != 0) {
3906 			mutex_exit(&so->so_lock);
3907 			return (error);
3908 		}
3909 	}
3910 
3911 	name = (struct sockaddr *)msg->msg_name;
3912 	namelen = msg->msg_namelen;
3913 
3914 	so_mode = so->so_mode;
3915 
3916 	if (name == NULL) {
3917 		if (!(so_state & SS_ISCONNECTED)) {
3918 			mutex_exit(&so->so_lock);
3919 			if (so_mode & SM_CONNREQUIRED)
3920 				return (ENOTCONN);
3921 			else
3922 				return (EDESTADDRREQ);
3923 		}
3924 		if (so_mode & SM_CONNREQUIRED) {
3925 			name = NULL;
3926 			namelen = 0;
3927 		} else {
3928 			/*
3929 			 * Note that this code does not prevent so_faddr_sa
3930 			 * from changing while it is being used. Thus
3931 			 * if an "unconnect"+connect occurs concurrently with
3932 			 * this send the datagram might be delivered to a
3933 			 * garbled address.
3934 			 */
3935 			ASSERT(so->so_faddr_sa);
3936 			name = so->so_faddr_sa;
3937 			namelen = (t_uscalar_t)so->so_faddr_len;
3938 		}
3939 	} else {
3940 		if (!(so_state & SS_ISCONNECTED) &&
3941 		    (so_mode & SM_CONNREQUIRED)) {
3942 			/* Required but not connected */
3943 			mutex_exit(&so->so_lock);
3944 			return (ENOTCONN);
3945 		}
3946 		/*
3947 		 * Ignore the address on connection-oriented sockets.
3948 		 * Just like BSD this code does not generate an error for
3949 		 * TCP (a CONNREQUIRED socket) when sending to an address
3950 		 * passed in with sendto/sendmsg. Instead the data is
3951 		 * delivered on the connection as if no address had been
3952 		 * supplied.
3953 		 */
3954 		if ((so_state & SS_ISCONNECTED) &&
3955 		    !(so_mode & SM_CONNREQUIRED)) {
3956 			mutex_exit(&so->so_lock);
3957 			return (EISCONN);
3958 		}
3959 		if (!(so_state & SS_ISBOUND)) {
3960 			so_lock_single(so);	/* Set SOLOCKED */
3961 			error = sotpi_bind(so, NULL, 0,
3962 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD);
3963 			so_unlock_single(so, SOLOCKED);
3964 			if (error) {
3965 				mutex_exit(&so->so_lock);
3966 				eprintsoline(so, error);
3967 				return (error);
3968 			}
3969 		}
3970 		/*
3971 		 * Handle delayed datagram errors. These are only queued
3972 		 * when the application sets SO_DGRAM_ERRIND.
3973 		 * Return the error if we are sending to the address
3974 		 * that was returned in the last T_UDERROR_IND.
3975 		 * If sending to some other address discard the delayed
3976 		 * error indication.
3977 		 */
3978 		if (so->so_delayed_error) {
3979 			struct T_uderror_ind	*tudi;
3980 			void			*addr;
3981 			t_uscalar_t		addrlen;
3982 			boolean_t		match = B_FALSE;
3983 
3984 			ASSERT(so->so_eaddr_mp);
3985 			error = so->so_delayed_error;
3986 			so->so_delayed_error = 0;
3987 			tudi = (struct T_uderror_ind *)so->so_eaddr_mp->b_rptr;
3988 			addrlen = tudi->DEST_length;
3989 			addr = sogetoff(so->so_eaddr_mp,
3990 					tudi->DEST_offset,
3991 					addrlen, 1);
3992 			ASSERT(addr);	/* Checked by strsock_proto */
3993 			switch (so->so_family) {
3994 			case AF_INET: {
3995 				/* Compare just IP address and port */
3996 				sin_t *sin1 = (sin_t *)name;
3997 				sin_t *sin2 = (sin_t *)addr;
3998 
3999 				if (addrlen == sizeof (sin_t) &&
4000 				    namelen == addrlen &&
4001 				    sin1->sin_port == sin2->sin_port &&
4002 				    sin1->sin_addr.s_addr ==
4003 				    sin2->sin_addr.s_addr)
4004 					match = B_TRUE;
4005 				break;
4006 			}
4007 			case AF_INET6: {
4008 				/* Compare just IP address and port. Not flow */
4009 				sin6_t *sin1 = (sin6_t *)name;
4010 				sin6_t *sin2 = (sin6_t *)addr;
4011 
4012 				if (addrlen == sizeof (sin6_t) &&
4013 				    namelen == addrlen &&
4014 				    sin1->sin6_port == sin2->sin6_port &&
4015 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4016 					&sin2->sin6_addr))
4017 					match = B_TRUE;
4018 				break;
4019 			}
4020 			case AF_UNIX:
4021 			default:
4022 				if (namelen == addrlen &&
4023 				    bcmp(name, addr, namelen) == 0)
4024 					match = B_TRUE;
4025 			}
4026 			if (match) {
4027 				freemsg(so->so_eaddr_mp);
4028 				so->so_eaddr_mp = NULL;
4029 				mutex_exit(&so->so_lock);
4030 #ifdef DEBUG
4031 				dprintso(so, 0,
4032 					("sockfs delayed error %d for %s\n",
4033 					error,
4034 					pr_addr(so->so_family, name, namelen)));
4035 #endif /* DEBUG */
4036 				return (error);
4037 			}
4038 			freemsg(so->so_eaddr_mp);
4039 			so->so_eaddr_mp = NULL;
4040 		}
4041 	}
4042 	mutex_exit(&so->so_lock);
4043 
4044 	flags = msg->msg_flags;
4045 	dontroute = 0;
4046 	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4047 		uint32_t	val;
4048 
4049 		val = 1;
4050 		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4051 					&val, (t_uscalar_t)sizeof (val));
4052 		if (error)
4053 			return (error);
4054 		dontroute = 1;
4055 	}
4056 
4057 	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4058 		error = EOPNOTSUPP;
4059 		goto done;
4060 	}
4061 	if (msg->msg_controllen != 0) {
4062 		if (!(so_mode & SM_CONNREQUIRED)) {
4063 			error = sosend_dgramcmsg(so, name, namelen, uiop,
4064 				msg->msg_control, msg->msg_controllen,
4065 				flags);
4066 		} else {
4067 			if (flags & MSG_OOB) {
4068 				/* Can't generate T_EXDATA_REQ with options */
4069 				error = EOPNOTSUPP;
4070 				goto done;
4071 			}
4072 			error = sosend_svccmsg(so, uiop,
4073 				!(flags & MSG_EOR),
4074 				msg->msg_control, msg->msg_controllen,
4075 				flags);
4076 		}
4077 		goto done;
4078 	}
4079 
4080 	if (!(so_mode & SM_CONNREQUIRED)) {
4081 		/*
4082 		 * If there is no SO_DONTROUTE to turn off return immediately
4083 		 * from sosend_dgram. This can allow tail-call optimizations.
4084 		 */
4085 		if (!dontroute) {
4086 			return (sosend_dgram(so, name, namelen, uiop, flags));
4087 		}
4088 		error = sosend_dgram(so, name, namelen, uiop, flags);
4089 	} else {
4090 		t_scalar_t prim;
4091 		int sflag;
4092 
4093 		/* Ignore msg_name in the connected state */
4094 		if (flags & MSG_OOB) {
4095 			prim = T_EXDATA_REQ;
4096 			/*
4097 			 * Send down T_EXDATA_REQ even if there is flow
4098 			 * control for data.
4099 			 */
4100 			sflag = MSG_IGNFLOW;
4101 		} else {
4102 			if (so_mode & SM_BYTESTREAM) {
4103 				/* Byte stream transport - use write */
4104 
4105 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4106 				/*
4107 				 * If there is no SO_DONTROUTE to turn off
4108 				 * return immediately from strwrite. This can
4109 				 * allow tail-call optimizations.
4110 				 */
4111 				if (!dontroute)
4112 					return (strwrite(SOTOV(so), uiop,
4113 							CRED()));
4114 				error = strwrite(SOTOV(so), uiop, CRED());
4115 				goto done;
4116 			}
4117 			prim = T_DATA_REQ;
4118 			sflag = 0;
4119 		}
4120 		/*
4121 		 * If there is no SO_DONTROUTE to turn off return immediately
4122 		 * from sosend_svc. This can allow tail-call optimizations.
4123 		 */
4124 		if (!dontroute)
4125 			return (sosend_svc(so, uiop, prim,
4126 				!(flags & MSG_EOR), sflag));
4127 		error = sosend_svc(so, uiop, prim,
4128 				!(flags & MSG_EOR), sflag);
4129 	}
4130 	ASSERT(dontroute);
4131 done:
4132 	if (dontroute) {
4133 		uint32_t	val;
4134 
4135 		val = 0;
4136 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4137 				&val, (t_uscalar_t)sizeof (val));
4138 	}
4139 	return (error);
4140 }
4141 
4142 /*
4143  * Update so_faddr by asking the transport (unless AF_UNIX).
4144  */
4145 int
4146 sotpi_getpeername(struct sonode *so)
4147 {
4148 	struct strbuf	strbuf;
4149 	int		error = 0, res;
4150 	void		*addr;
4151 	t_uscalar_t	addrlen;
4152 	k_sigset_t	smask;
4153 
4154 	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4155 		so, pr_state(so->so_state, so->so_mode)));
4156 
4157 	mutex_enter(&so->so_lock);
4158 	so_lock_single(so);	/* Set SOLOCKED */
4159 	if (!(so->so_state & SS_ISCONNECTED)) {
4160 		error = ENOTCONN;
4161 		goto done;
4162 	}
4163 	/* Added this check for X/Open */
4164 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4165 		error = EINVAL;
4166 		if (xnet_check_print) {
4167 			printf("sockfs: X/Open getpeername check => EINVAL\n");
4168 		}
4169 		goto done;
4170 	}
4171 #ifdef DEBUG
4172 	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4173 		pr_addr(so->so_family, so->so_faddr_sa,
4174 			(t_uscalar_t)so->so_faddr_len)));
4175 #endif /* DEBUG */
4176 
4177 	if (so->so_family == AF_UNIX || so->so_family == AF_NCA) {
4178 		/* Transport has different name space - return local info */
4179 		error = 0;
4180 		goto done;
4181 	}
4182 
4183 	ASSERT(so->so_faddr_sa);
4184 	/* Allocate local buffer to use with ioctl */
4185 	addrlen = (t_uscalar_t)so->so_faddr_maxlen;
4186 	mutex_exit(&so->so_lock);
4187 	addr = kmem_alloc(addrlen, KM_SLEEP);
4188 
4189 	/*
4190 	 * Issue TI_GETPEERNAME with signals masked.
4191 	 * Put the result in so_faddr_sa so that getpeername works after
4192 	 * a shutdown(output).
4193 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4194 	 * back to the socket.
4195 	 */
4196 	strbuf.buf = addr;
4197 	strbuf.maxlen = addrlen;
4198 	strbuf.len = 0;
4199 
4200 	sigintr(&smask, 0);
4201 	res = 0;
4202 	ASSERT(CRED());
4203 	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4204 			0, K_TO_K, CRED(), &res);
4205 	sigunintr(&smask);
4206 
4207 	mutex_enter(&so->so_lock);
4208 	/*
4209 	 * If there is an error record the error in so_error put don't fail
4210 	 * the getpeername. Instead fallback on the recorded
4211 	 * so->so_faddr_sa.
4212 	 */
4213 	if (error) {
4214 		/*
4215 		 * Various stream head errors can be returned to the ioctl.
4216 		 * However, it is impossible to determine which ones of
4217 		 * these are really socket level errors that were incorrectly
4218 		 * consumed by the ioctl. Thus this code silently ignores the
4219 		 * error - to code explicitly does not reinstate the error
4220 		 * using soseterror().
4221 		 * Experiments have shows that at least this set of
4222 		 * errors are reported and should not be reinstated on the
4223 		 * socket:
4224 		 *	EINVAL	E.g. if an I_LINK was in effect when
4225 		 *		getpeername was called.
4226 		 *	EPIPE	The ioctl error semantics prefer the write
4227 		 *		side error over the read side error.
4228 		 *	ENOTCONN The transport just got disconnected but
4229 		 *		sockfs had not yet seen the T_DISCON_IND
4230 		 *		when issuing the ioctl.
4231 		 */
4232 		error = 0;
4233 	} else if (res == 0 && strbuf.len > 0 &&
4234 	    (so->so_state & SS_ISCONNECTED)) {
4235 		ASSERT(strbuf.len <= (int)so->so_faddr_maxlen);
4236 		so->so_faddr_len = (socklen_t)strbuf.len;
4237 		bcopy(addr, so->so_faddr_sa, so->so_faddr_len);
4238 		so->so_state |= SS_FADDR_VALID;
4239 	}
4240 	kmem_free(addr, addrlen);
4241 #ifdef DEBUG
4242 	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
4243 			pr_addr(so->so_family, so->so_faddr_sa,
4244 				(t_uscalar_t)so->so_faddr_len)));
4245 #endif /* DEBUG */
4246 done:
4247 	so_unlock_single(so, SOLOCKED);
4248 	mutex_exit(&so->so_lock);
4249 	return (error);
4250 }
4251 
4252 /*
4253  * Update so_laddr by asking the transport (unless AF_UNIX).
4254  */
4255 int
4256 sotpi_getsockname(struct sonode *so)
4257 {
4258 	struct strbuf	strbuf;
4259 	int		error = 0, res;
4260 	void		*addr;
4261 	t_uscalar_t	addrlen;
4262 	k_sigset_t	smask;
4263 
4264 	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
4265 		so, pr_state(so->so_state, so->so_mode)));
4266 
4267 	mutex_enter(&so->so_lock);
4268 	so_lock_single(so);	/* Set SOLOCKED */
4269 	if (!(so->so_state & SS_ISBOUND) && so->so_family != AF_UNIX) {
4270 		/* Return an all zero address except for the family */
4271 		if (so->so_family == AF_INET)
4272 			so->so_laddr_len = (socklen_t)sizeof (sin_t);
4273 		else if (so->so_family == AF_INET6)
4274 			so->so_laddr_len = (socklen_t)sizeof (sin6_t);
4275 		ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
4276 		bzero(so->so_laddr_sa, so->so_laddr_len);
4277 		/*
4278 		 * Can not assume there is a sa_family for all
4279 		 * protocol families.
4280 		 */
4281 		if (so->so_family == AF_INET || so->so_family == AF_INET6)
4282 			so->so_laddr_sa->sa_family = so->so_family;
4283 	}
4284 #ifdef DEBUG
4285 	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
4286 		pr_addr(so->so_family, so->so_laddr_sa,
4287 			(t_uscalar_t)so->so_laddr_len)));
4288 #endif /* DEBUG */
4289 	if (so->so_family == AF_UNIX) {
4290 		/* Transport has different name space - return local info */
4291 		error = 0;
4292 		goto done;
4293 	}
4294 	/* Allocate local buffer to use with ioctl */
4295 	addrlen = (t_uscalar_t)so->so_laddr_maxlen;
4296 	mutex_exit(&so->so_lock);
4297 	addr = kmem_alloc(addrlen, KM_SLEEP);
4298 
4299 	/*
4300 	 * Issue TI_GETMYNAME with signals masked.
4301 	 * Put the result in so_laddr_sa so that getsockname works after
4302 	 * a shutdown(output).
4303 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4304 	 * back to the socket.
4305 	 */
4306 	strbuf.buf = addr;
4307 	strbuf.maxlen = addrlen;
4308 	strbuf.len = 0;
4309 
4310 	sigintr(&smask, 0);
4311 	res = 0;
4312 	ASSERT(CRED());
4313 	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
4314 			0, K_TO_K, CRED(), &res);
4315 	sigunintr(&smask);
4316 
4317 	mutex_enter(&so->so_lock);
4318 	/*
4319 	 * If there is an error record the error in so_error put don't fail
4320 	 * the getsockname. Instead fallback on the recorded
4321 	 * so->so_laddr_sa.
4322 	 */
4323 	if (error) {
4324 		/*
4325 		 * Various stream head errors can be returned to the ioctl.
4326 		 * However, it is impossible to determine which ones of
4327 		 * these are really socket level errors that were incorrectly
4328 		 * consumed by the ioctl. Thus this code silently ignores the
4329 		 * error - to code explicitly does not reinstate the error
4330 		 * using soseterror().
4331 		 * Experiments have shows that at least this set of
4332 		 * errors are reported and should not be reinstated on the
4333 		 * socket:
4334 		 *	EINVAL	E.g. if an I_LINK was in effect when
4335 		 *		getsockname was called.
4336 		 *	EPIPE	The ioctl error semantics prefer the write
4337 		 *		side error over the read side error.
4338 		 */
4339 		error = 0;
4340 	} else if (res == 0 && strbuf.len > 0 &&
4341 	    (so->so_state & SS_ISBOUND)) {
4342 		ASSERT(strbuf.len <= (int)so->so_laddr_maxlen);
4343 		so->so_laddr_len = (socklen_t)strbuf.len;
4344 		bcopy(addr, so->so_laddr_sa, so->so_laddr_len);
4345 		so->so_state |= SS_LADDR_VALID;
4346 	}
4347 	kmem_free(addr, addrlen);
4348 #ifdef DEBUG
4349 	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
4350 			pr_addr(so->so_family, so->so_laddr_sa,
4351 				(t_uscalar_t)so->so_laddr_len)));
4352 #endif /* DEBUG */
4353 done:
4354 	so_unlock_single(so, SOLOCKED);
4355 	mutex_exit(&so->so_lock);
4356 	return (error);
4357 }
4358 
4359 /*
4360  * Get socket options. For SOL_SOCKET options some options are handled
4361  * by the sockfs while others use the value recorded in the sonode as a
4362  * fallback should the T_SVR4_OPTMGMT_REQ fail.
4363  *
4364  * On the return most *optlenp bytes are copied to optval.
4365  */
4366 int
4367 sotpi_getsockopt(struct sonode *so, int level, int option_name,
4368 		void *optval, socklen_t *optlenp, int flags)
4369 {
4370 	struct T_optmgmt_req	optmgmt_req;
4371 	struct T_optmgmt_ack	*optmgmt_ack;
4372 	struct opthdr		oh;
4373 	struct opthdr		*opt_res;
4374 	mblk_t			*mp = NULL;
4375 	int			error = 0;
4376 	void			*option = NULL;	/* Set if fallback value */
4377 	t_uscalar_t		maxlen = *optlenp;
4378 	t_uscalar_t		len;
4379 	uint32_t		value;
4380 
4381 	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
4382 			so, level, option_name, optval, optlenp,
4383 			pr_state(so->so_state, so->so_mode)));
4384 
4385 	mutex_enter(&so->so_lock);
4386 	so_lock_single(so);	/* Set SOLOCKED */
4387 
4388 	/*
4389 	 * Check for SOL_SOCKET options.
4390 	 * Certain SOL_SOCKET options are returned directly whereas
4391 	 * others only provide a default (fallback) value should
4392 	 * the T_SVR4_OPTMGMT_REQ fail.
4393 	 */
4394 	if (level == SOL_SOCKET) {
4395 		/* Check parameters */
4396 		switch (option_name) {
4397 		case SO_TYPE:
4398 		case SO_ERROR:
4399 		case SO_DEBUG:
4400 		case SO_ACCEPTCONN:
4401 		case SO_REUSEADDR:
4402 		case SO_KEEPALIVE:
4403 		case SO_DONTROUTE:
4404 		case SO_BROADCAST:
4405 		case SO_USELOOPBACK:
4406 		case SO_OOBINLINE:
4407 		case SO_SNDBUF:
4408 		case SO_RCVBUF:
4409 #ifdef notyet
4410 		case SO_SNDLOWAT:
4411 		case SO_RCVLOWAT:
4412 		case SO_SNDTIMEO:
4413 		case SO_RCVTIMEO:
4414 #endif /* notyet */
4415 		case SO_DGRAM_ERRIND:
4416 			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
4417 				error = EINVAL;
4418 				eprintsoline(so, error);
4419 				goto done2;
4420 			}
4421 			break;
4422 		case SO_LINGER:
4423 			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
4424 				error = EINVAL;
4425 				eprintsoline(so, error);
4426 				goto done2;
4427 			}
4428 			break;
4429 		}
4430 
4431 		len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
4432 
4433 		switch (option_name) {
4434 		case SO_TYPE:
4435 			value = so->so_type;
4436 			option = &value;
4437 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4438 
4439 		case SO_ERROR:
4440 			value = sogeterr(so);
4441 			option = &value;
4442 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4443 
4444 		case SO_ACCEPTCONN:
4445 			if (so->so_state & SS_ACCEPTCONN)
4446 				value = SO_ACCEPTCONN;
4447 			else
4448 				value = 0;
4449 #ifdef DEBUG
4450 			if (value) {
4451 				dprintso(so, 1,
4452 				    ("sotpi_getsockopt: 0x%x is set\n",
4453 				    option_name));
4454 			} else {
4455 				dprintso(so, 1,
4456 				    ("sotpi_getsockopt: 0x%x not set\n",
4457 				    option_name));
4458 			}
4459 #endif /* DEBUG */
4460 			option = &value;
4461 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4462 
4463 		case SO_DEBUG:
4464 		case SO_REUSEADDR:
4465 		case SO_KEEPALIVE:
4466 		case SO_DONTROUTE:
4467 		case SO_BROADCAST:
4468 		case SO_USELOOPBACK:
4469 		case SO_OOBINLINE:
4470 		case SO_DGRAM_ERRIND:
4471 			value = (so->so_options & option_name);
4472 #ifdef DEBUG
4473 			if (value) {
4474 				dprintso(so, 1,
4475 				    ("sotpi_getsockopt: 0x%x is set\n",
4476 				    option_name));
4477 			} else {
4478 				dprintso(so, 1,
4479 				    ("sotpi_getsockopt: 0x%x not set\n",
4480 				    option_name));
4481 			}
4482 #endif /* DEBUG */
4483 			option = &value;
4484 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4485 
4486 		/*
4487 		 * The following options are only returned by sockfs when the
4488 		 * T_SVR4_OPTMGMT_REQ fails.
4489 		 */
4490 		case SO_LINGER:
4491 			option = &so->so_linger;
4492 			len = (t_uscalar_t)sizeof (struct linger);
4493 			break;
4494 		case SO_SNDBUF: {
4495 			ssize_t lvalue;
4496 
4497 			/*
4498 			 * If the option has not been set then get a default
4499 			 * value from the read queue. This value is
4500 			 * returned if the transport fails
4501 			 * the T_SVR4_OPTMGMT_REQ.
4502 			 */
4503 			lvalue = so->so_sndbuf;
4504 			if (lvalue == 0) {
4505 				mutex_exit(&so->so_lock);
4506 				(void) strqget(strvp2wq(SOTOV(so))->q_next,
4507 						QHIWAT, 0, &lvalue);
4508 				mutex_enter(&so->so_lock);
4509 				dprintso(so, 1,
4510 				    ("got SO_SNDBUF %ld from q\n", lvalue));
4511 			}
4512 			value = (int)lvalue;
4513 			option = &value;
4514 			len = (t_uscalar_t)sizeof (so->so_sndbuf);
4515 			break;
4516 		}
4517 		case SO_RCVBUF: {
4518 			ssize_t lvalue;
4519 
4520 			/*
4521 			 * If the option has not been set then get a default
4522 			 * value from the read queue. This value is
4523 			 * returned if the transport fails
4524 			 * the T_SVR4_OPTMGMT_REQ.
4525 			 *
4526 			 * XXX If SO_RCVBUF has been set and this is an
4527 			 * XPG 4.2 application then do not ask the transport
4528 			 * since the transport might adjust the value and not
4529 			 * return exactly what was set by the application.
4530 			 * For non-XPG 4.2 application we return the value
4531 			 * that the transport is actually using.
4532 			 */
4533 			lvalue = so->so_rcvbuf;
4534 			if (lvalue == 0) {
4535 				mutex_exit(&so->so_lock);
4536 				(void) strqget(RD(strvp2wq(SOTOV(so))),
4537 						QHIWAT, 0, &lvalue);
4538 				mutex_enter(&so->so_lock);
4539 				dprintso(so, 1,
4540 				    ("got SO_RCVBUF %ld from q\n", lvalue));
4541 			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
4542 				value = (int)lvalue;
4543 				option = &value;
4544 				goto copyout;	/* skip asking transport */
4545 			}
4546 			value = (int)lvalue;
4547 			option = &value;
4548 			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
4549 			break;
4550 		}
4551 #ifdef notyet
4552 		/*
4553 		 * We do not implement the semantics of these options
4554 		 * thus we shouldn't implement the options either.
4555 		 */
4556 		case SO_SNDLOWAT:
4557 			value = so->so_sndlowat;
4558 			option = &value;
4559 			break;
4560 		case SO_RCVLOWAT:
4561 			value = so->so_rcvlowat;
4562 			option = &value;
4563 			break;
4564 		case SO_SNDTIMEO:
4565 			value = so->so_sndtimeo;
4566 			option = &value;
4567 			break;
4568 		case SO_RCVTIMEO:
4569 			value = so->so_rcvtimeo;
4570 			option = &value;
4571 			break;
4572 #endif /* notyet */
4573 		}
4574 	}
4575 
4576 	if (so->so_family == AF_NCA) {
4577 		goto done2;
4578 	}
4579 
4580 	mutex_exit(&so->so_lock);
4581 
4582 	/* Send request */
4583 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
4584 	optmgmt_req.MGMT_flags = T_CHECK;
4585 	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
4586 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
4587 
4588 	oh.level = level;
4589 	oh.name = option_name;
4590 	oh.len = maxlen;
4591 
4592 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
4593 	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP);
4594 	/* Let option management work in the presence of data flow control */
4595 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
4596 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
4597 	mp = NULL;
4598 	mutex_enter(&so->so_lock);
4599 	if (error) {
4600 		eprintsoline(so, error);
4601 		goto done2;
4602 	}
4603 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
4604 	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
4605 	if (error) {
4606 		if (option != NULL) {
4607 			/* We have a fallback value */
4608 			error = 0;
4609 			goto copyout;
4610 		}
4611 		eprintsoline(so, error);
4612 		goto done2;
4613 	}
4614 	ASSERT(mp);
4615 	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
4616 	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
4617 			optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
4618 	if (opt_res == NULL) {
4619 		if (option != NULL) {
4620 			/* We have a fallback value */
4621 			error = 0;
4622 			goto copyout;
4623 		}
4624 		error = EPROTO;
4625 		eprintsoline(so, error);
4626 		goto done;
4627 	}
4628 	option = &opt_res[1];
4629 
4630 	/* check to ensure that the option is within bounds */
4631 	if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
4632 		(uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
4633 		if (option != NULL) {
4634 			/* We have a fallback value */
4635 			error = 0;
4636 			goto copyout;
4637 		}
4638 		error = EPROTO;
4639 		eprintsoline(so, error);
4640 		goto done;
4641 	}
4642 
4643 	len = opt_res->len;
4644 
4645 copyout: {
4646 		t_uscalar_t size = MIN(len, maxlen);
4647 		bcopy(option, optval, size);
4648 		bcopy(&size, optlenp, sizeof (size));
4649 	}
4650 done:
4651 	freemsg(mp);
4652 done2:
4653 	so_unlock_single(so, SOLOCKED);
4654 	mutex_exit(&so->so_lock);
4655 	return (error);
4656 }
4657 
4658 /*
4659  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
4660  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
4661  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
4662  * setsockopt has to work even if the transport does not support the option.
4663  */
4664 int
4665 sotpi_setsockopt(struct sonode *so, int level, int option_name,
4666 	const void *optval, t_uscalar_t optlen)
4667 {
4668 	struct T_optmgmt_req	optmgmt_req;
4669 	struct opthdr		oh;
4670 	mblk_t			*mp;
4671 	int			error = 0;
4672 	boolean_t		handled = B_FALSE;
4673 
4674 	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
4675 			so, level, option_name, optval, optlen,
4676 			pr_state(so->so_state, so->so_mode)));
4677 
4678 
4679 	/* X/Open requires this check */
4680 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4681 		if (xnet_check_print)
4682 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
4683 		return (EINVAL);
4684 	}
4685 
4686 	/* Caller allocates aligned optval, or passes null */
4687 	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
4688 	/* If optval is null optlen is 0, and vice-versa */
4689 	ASSERT(optval != NULL || optlen == 0);
4690 	ASSERT(optlen != 0 || optval == NULL);
4691 
4692 	mutex_enter(&so->so_lock);
4693 	so_lock_single(so);	/* Set SOLOCKED */
4694 	mutex_exit(&so->so_lock);
4695 
4696 	if (so->so_family == AF_NCA) {
4697 		/* Ignore any flow control problems with the transport. */
4698 		mutex_enter(&so->so_lock);
4699 		goto done;
4700 	}
4701 
4702 	/*
4703 	 * For SOCKET or TCP level options, try to set it here itself
4704 	 * provided socket has not been popped and we know the tcp
4705 	 * structure (stored in so_priv).
4706 	 */
4707 	if ((level == SOL_SOCKET || level == IPPROTO_TCP) &&
4708 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
4709 	    (so->so_version == SOV_SOCKSTREAM) && (so->so_priv != NULL)) {
4710 		tcp_t		*tcp = so->so_priv;
4711 		boolean_t	onoff;
4712 
4713 #define	intvalue	(*(int32_t *)optval)
4714 
4715 		switch (level) {
4716 		case SOL_SOCKET:
4717 			switch (option_name) {		/* Check length param */
4718 			case SO_DEBUG:
4719 			case SO_REUSEADDR:
4720 			case SO_DONTROUTE:
4721 			case SO_BROADCAST:
4722 			case SO_USELOOPBACK:
4723 			case SO_OOBINLINE:
4724 			case SO_DGRAM_ERRIND:
4725 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
4726 					error = EINVAL;
4727 					eprintsoline(so, error);
4728 					mutex_enter(&so->so_lock);
4729 					goto done2;
4730 				}
4731 				ASSERT(optval);
4732 				onoff = intvalue != 0;
4733 				handled = B_TRUE;
4734 				break;
4735 			case SO_LINGER:
4736 				if (optlen !=
4737 				    (t_uscalar_t)sizeof (struct linger)) {
4738 					error = EINVAL;
4739 					eprintsoline(so, error);
4740 					mutex_enter(&so->so_lock);
4741 					goto done2;
4742 				}
4743 				ASSERT(optval);
4744 				handled = B_TRUE;
4745 				break;
4746 			}
4747 
4748 			switch (option_name) {			/* Do actions */
4749 			case SO_LINGER: {
4750 				struct linger *lgr = (struct linger *)optval;
4751 
4752 				if (lgr->l_onoff) {
4753 					tcp->tcp_linger = 1;
4754 					tcp->tcp_lingertime = lgr->l_linger;
4755 					so->so_linger.l_onoff = SO_LINGER;
4756 					so->so_options |= SO_LINGER;
4757 				} else {
4758 					tcp->tcp_linger = 0;
4759 					tcp->tcp_lingertime = 0;
4760 					so->so_linger.l_onoff = 0;
4761 					so->so_options &= ~SO_LINGER;
4762 				}
4763 				so->so_linger.l_linger = lgr->l_linger;
4764 				handled = B_TRUE;
4765 				break;
4766 			}
4767 			case SO_DEBUG:
4768 				tcp->tcp_debug = onoff;
4769 #ifdef SOCK_TEST
4770 				if (intvalue & 2)
4771 					sock_test_timelimit = 10 * hz;
4772 				else
4773 					sock_test_timelimit = 0;
4774 
4775 				if (intvalue & 4)
4776 					do_useracc = 0;
4777 				else
4778 					do_useracc = 1;
4779 #endif /* SOCK_TEST */
4780 				break;
4781 			case SO_DONTROUTE:
4782 				/*
4783 				 * SO_DONTROUTE, SO_USELOOPBACK and
4784 				 * SO_BROADCAST are only of interest to IP.
4785 				 * We track them here only so
4786 				 * that we can report their current value.
4787 				 */
4788 				tcp->tcp_dontroute = onoff;
4789 				if (onoff)
4790 					so->so_options |= option_name;
4791 				else
4792 					so->so_options &= ~option_name;
4793 				break;
4794 			case SO_USELOOPBACK:
4795 				tcp->tcp_useloopback = onoff;
4796 				if (onoff)
4797 					so->so_options |= option_name;
4798 				else
4799 					so->so_options &= ~option_name;
4800 				break;
4801 			case SO_BROADCAST:
4802 				tcp->tcp_broadcast = onoff;
4803 				if (onoff)
4804 					so->so_options |= option_name;
4805 				else
4806 					so->so_options &= ~option_name;
4807 				break;
4808 			case SO_REUSEADDR:
4809 				tcp->tcp_reuseaddr = onoff;
4810 				if (onoff)
4811 					so->so_options |= option_name;
4812 				else
4813 					so->so_options &= ~option_name;
4814 				break;
4815 			case SO_OOBINLINE:
4816 				tcp->tcp_oobinline = onoff;
4817 				if (onoff)
4818 					so->so_options |= option_name;
4819 				else
4820 					so->so_options &= ~option_name;
4821 				break;
4822 			case SO_DGRAM_ERRIND:
4823 				tcp->tcp_dgram_errind = onoff;
4824 				if (onoff)
4825 					so->so_options |= option_name;
4826 				else
4827 					so->so_options &= ~option_name;
4828 				break;
4829 			}
4830 			break;
4831 		case IPPROTO_TCP:
4832 			switch (option_name) {
4833 			case TCP_NODELAY:
4834 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
4835 					error = EINVAL;
4836 					eprintsoline(so, error);
4837 					mutex_enter(&so->so_lock);
4838 					goto done2;
4839 				}
4840 				ASSERT(optval);
4841 				tcp->tcp_naglim = intvalue ? 1 : tcp->tcp_mss;
4842 				handled = B_TRUE;
4843 				break;
4844 			}
4845 			break;
4846 		default:
4847 			handled = B_FALSE;
4848 			break;
4849 		}
4850 	}
4851 
4852 	if (handled) {
4853 		mutex_enter(&so->so_lock);
4854 		goto done2;
4855 	}
4856 
4857 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
4858 	optmgmt_req.MGMT_flags = T_NEGOTIATE;
4859 	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
4860 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
4861 
4862 	oh.level = level;
4863 	oh.name = option_name;
4864 	oh.len = optlen;
4865 
4866 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
4867 	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP);
4868 	/* Let option management work in the presence of data flow control */
4869 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
4870 			MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
4871 	mp = NULL;
4872 	mutex_enter(&so->so_lock);
4873 	if (error) {
4874 		eprintsoline(so, error);
4875 		goto done;
4876 	}
4877 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
4878 	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
4879 	if (error) {
4880 		eprintsoline(so, error);
4881 		goto done;
4882 	}
4883 	ASSERT(mp);
4884 	/* No need to verify T_optmgmt_ack */
4885 	freemsg(mp);
4886 done:
4887 	/*
4888 	 * Check for SOL_SOCKET options and record their values.
4889 	 * If we know about a SOL_SOCKET parameter and the transport
4890 	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
4891 	 * EPROTO) we let the setsockopt succeed.
4892 	 */
4893 	if (level == SOL_SOCKET) {
4894 		/* Check parameters */
4895 		switch (option_name) {
4896 		case SO_DEBUG:
4897 		case SO_REUSEADDR:
4898 		case SO_KEEPALIVE:
4899 		case SO_DONTROUTE:
4900 		case SO_BROADCAST:
4901 		case SO_USELOOPBACK:
4902 		case SO_OOBINLINE:
4903 		case SO_SNDBUF:
4904 		case SO_RCVBUF:
4905 #ifdef notyet
4906 		case SO_SNDLOWAT:
4907 		case SO_RCVLOWAT:
4908 		case SO_SNDTIMEO:
4909 		case SO_RCVTIMEO:
4910 #endif /* notyet */
4911 		case SO_DGRAM_ERRIND:
4912 			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
4913 				error = EINVAL;
4914 				eprintsoline(so, error);
4915 				goto done2;
4916 			}
4917 			ASSERT(optval);
4918 			handled = B_TRUE;
4919 			break;
4920 		case SO_LINGER:
4921 			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
4922 				error = EINVAL;
4923 				eprintsoline(so, error);
4924 				goto done2;
4925 			}
4926 			ASSERT(optval);
4927 			handled = B_TRUE;
4928 			break;
4929 		}
4930 
4931 #define	intvalue	(*(int32_t *)optval)
4932 
4933 		switch (option_name) {
4934 		case SO_TYPE:
4935 		case SO_ERROR:
4936 		case SO_ACCEPTCONN:
4937 			/* Can't be set */
4938 			error = ENOPROTOOPT;
4939 			goto done2;
4940 		case SO_LINGER: {
4941 			struct linger *l = (struct linger *)optval;
4942 
4943 			so->so_linger.l_linger = l->l_linger;
4944 			if (l->l_onoff) {
4945 				so->so_linger.l_onoff = SO_LINGER;
4946 				so->so_options |= SO_LINGER;
4947 			} else {
4948 				so->so_linger.l_onoff = 0;
4949 				so->so_options &= ~SO_LINGER;
4950 			}
4951 			break;
4952 		}
4953 
4954 		case SO_DEBUG:
4955 #ifdef SOCK_TEST
4956 			if (intvalue & 2)
4957 				sock_test_timelimit = 10 * hz;
4958 			else
4959 				sock_test_timelimit = 0;
4960 
4961 			if (intvalue & 4)
4962 				do_useracc = 0;
4963 			else
4964 				do_useracc = 1;
4965 #endif /* SOCK_TEST */
4966 			/* FALLTHRU */
4967 		case SO_REUSEADDR:
4968 		case SO_KEEPALIVE:
4969 		case SO_DONTROUTE:
4970 		case SO_BROADCAST:
4971 		case SO_USELOOPBACK:
4972 		case SO_OOBINLINE:
4973 		case SO_DGRAM_ERRIND:
4974 			if (intvalue != 0) {
4975 				dprintso(so, 1,
4976 					("sotpi_setsockopt: setting 0x%x\n",
4977 					option_name));
4978 				so->so_options |= option_name;
4979 			} else {
4980 				dprintso(so, 1,
4981 					("sotpi_setsockopt: clearing 0x%x\n",
4982 					option_name));
4983 				so->so_options &= ~option_name;
4984 			}
4985 			break;
4986 		/*
4987 		 * The following options are only returned by us when the
4988 		 * T_SVR4_OPTMGMT_REQ fails.
4989 		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
4990 		 * since the transport might adjust the value and not
4991 		 * return exactly what was set by the application.
4992 		 */
4993 		case SO_SNDBUF:
4994 			so->so_sndbuf = intvalue;
4995 			break;
4996 		case SO_RCVBUF:
4997 			so->so_rcvbuf = intvalue;
4998 			break;
4999 #ifdef notyet
5000 		/*
5001 		 * We do not implement the semantics of these options
5002 		 * thus we shouldn't implement the options either.
5003 		 */
5004 		case SO_SNDLOWAT:
5005 			so->so_sndlowat = intvalue;
5006 			break;
5007 		case SO_RCVLOWAT:
5008 			so->so_rcvlowat = intvalue;
5009 			break;
5010 		case SO_SNDTIMEO:
5011 			so->so_sndtimeo = intvalue;
5012 			break;
5013 		case SO_RCVTIMEO:
5014 			so->so_rcvtimeo = intvalue;
5015 			break;
5016 #endif /* notyet */
5017 		}
5018 #undef	intvalue
5019 
5020 		if (error) {
5021 			if ((error == ENOPROTOOPT || error == EPROTO ||
5022 			    error == EINVAL) && handled) {
5023 				dprintso(so, 1,
5024 				    ("setsockopt: ignoring error %d for 0x%x\n",
5025 				    error, option_name));
5026 				error = 0;
5027 			}
5028 		}
5029 	}
5030 done2:
5031 ret:
5032 	so_unlock_single(so, SOLOCKED);
5033 	mutex_exit(&so->so_lock);
5034 	return (error);
5035 }
5036