xref: /illumos-gate/usr/src/uts/common/fs/sockfs/socktpi.c (revision fc483d51bc48aa682ae329934252d02ccdc77469)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2015, Joyent, Inc.
25  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
26  * Copyright 2022 Garrett D'Amore
27  * Copyright 2024 Oxide Computer Company
28  */
29 
30 #include <sys/types.h>
31 #include <sys/t_lock.h>
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/buf.h>
35 #include <sys/conf.h>
36 #include <sys/cred.h>
37 #include <sys/kmem.h>
38 #include <sys/kmem_impl.h>
39 #include <sys/sysmacros.h>
40 #include <sys/vfs.h>
41 #include <sys/vnode.h>
42 #include <sys/debug.h>
43 #include <sys/errno.h>
44 #include <sys/time.h>
45 #include <sys/file.h>
46 #include <sys/open.h>
47 #include <sys/user.h>
48 #include <sys/termios.h>
49 #include <sys/stream.h>
50 #include <sys/strsubr.h>
51 #include <sys/strsun.h>
52 #include <sys/suntpi.h>
53 #include <sys/ddi.h>
54 #include <sys/esunddi.h>
55 #include <sys/flock.h>
56 #include <sys/modctl.h>
57 #include <sys/vtrace.h>
58 #include <sys/cmn_err.h>
59 #include <sys/pathname.h>
60 
61 #include <sys/socket.h>
62 #include <sys/socketvar.h>
63 #include <sys/sockio.h>
64 #include <netinet/in.h>
65 #include <sys/un.h>
66 #include <sys/strsun.h>
67 
68 #include <sys/tiuser.h>
69 #define	_SUN_TPI_VERSION	2
70 #include <sys/tihdr.h>
71 #include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
72 
73 #include <c2/audit.h>
74 
75 #include <inet/common.h>
76 #include <inet/ip.h>
77 #include <inet/ip6.h>
78 #include <inet/tcp.h>
79 #include <inet/udp_impl.h>
80 
81 #include <sys/zone.h>
82 
83 #include <fs/sockfs/sockcommon.h>
84 #include <fs/sockfs/socktpi.h>
85 #include <fs/sockfs/socktpi_impl.h>
86 
87 /*
88  * Possible failures when memory can't be allocated. The documented behavior:
89  *
90  *		5.5:			4.X:		XNET:
91  * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
92  *							EINTR
93  *	(4.X does not document EINTR but returns it)
94  * bind:	ENOSR			-		ENOBUFS/ENOSR
95  * connect:	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
96  * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
97  * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
98  *	(4.X getpeername and getsockname do not fail in practice)
99  * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
100  * listen:	-			-		ENOBUFS
101  * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
102  *							EINTR
103  * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
104  *							EINTR
105  * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
106  * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
107  * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
108  * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
109  *
110  * Resolution. When allocation fails:
111  *	recv: return EINTR
112  *	send: return EINTR
113  *	connect, accept: EINTR
114  *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
115  *	socket, socketpair: ENOBUFS
116  *	getpeername, getsockname: sleep
117  *	getsockopt, setsockopt: sleep
118  */
119 
120 #ifdef SOCK_TEST
121 /*
122  * Variables that make sockfs do something other than the standard TPI
123  * for the AF_INET transports.
124  *
125  * solisten_tpi_tcp:
126  *	TCP can handle a O_T_BIND_REQ with an increased backlog even though
127  *	the transport is already bound. This is needed to avoid losing the
128  *	port number should listen() do a T_UNBIND_REQ followed by a
129  *	O_T_BIND_REQ.
130  *
131  * soconnect_tpi_udp:
132  *	UDP and ICMP can handle a T_CONN_REQ.
133  *	This is needed to make the sequence of connect(), getsockname()
134  *	return the local IP address used to send packets to the connected to
135  *	destination.
136  *
137  * soconnect_tpi_tcp:
138  *	TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
139  *	Set this to non-zero to send TPI conformant messages to TCP in this
140  *	respect. This is a performance optimization.
141  *
142  * soaccept_tpi_tcp:
143  *	TCP can handle a T_CONN_REQ without the acceptor being bound.
144  *	This is a performance optimization that has been picked up in XTI.
145  *
146  * soaccept_tpi_multioptions:
147  *	When inheriting SOL_SOCKET options from the listener to the accepting
148  *	socket send them as a single message for AF_INET{,6}.
149  */
/* SOCK_TEST builds: patchable variables holding the defaults. */
150 int solisten_tpi_tcp = 0;
151 int soconnect_tpi_udp = 0;
152 int soconnect_tpi_tcp = 0;
153 int soaccept_tpi_tcp = 0;
154 int soaccept_tpi_multioptions = 1;
155 #else /* SOCK_TEST */
/*
 * Without SOCK_TEST the same names are compile-time constants carrying
 * the same default values as the variables above.
 */
156 #define	soconnect_tpi_tcp	0
157 #define	soconnect_tpi_udp	0
158 #define	solisten_tpi_tcp	0
159 #define	soaccept_tpi_tcp	0
160 #define	soaccept_tpi_multioptions	1
161 #endif /* SOCK_TEST */
162 
163 #ifdef SOCK_TEST
164 extern int do_useracc;
165 extern clock_t sock_test_timelimit;
166 #endif /* SOCK_TEST */
167 
168 extern uint32_t ucredsize;
169 
170 /*
171  * Some X/Open added checks might have to be backed out to keep SunOS 4.X
172  * applications working. Turn on this flag to disable these checks.
173  */
/*
 * Non-zero skips the X/Open-mandated state checks, e.g. the
 * SS_CANTSENDMORE check made when binding.
 */
173 int xnet_skip_checks = 0;
/* Non-zero prints a message when an X/Open check causes a failure. */
174 int xnet_check_print = 0;
/*
 * NOTE(review): not referenced in this part of the file; presumably
 * enables a message when a value is truncated -- confirm against its users.
 */
175 int xnet_truncate_print = 0;
177 
178 static void sotpi_destroy(struct sonode *);
179 static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
180     int, int *, cred_t *cr);
181 
182 static boolean_t	sotpi_info_create(struct sonode *, int);
183 static void		sotpi_info_init(struct sonode *);
184 static void		sotpi_info_fini(struct sonode *);
185 static void		sotpi_info_destroy(struct sonode *);
186 
187 /*
188  * Do direct function call to the transport layer below; this would
189  * also allow the transport to utilize read-side synchronous stream
190  * interface if necessary.  This is a /etc/system tunable that must
191  * not be modified on a running system.  By default this is enabled
192  * for performance reasons and may be disabled for debugging purposes.
193  */
194 boolean_t socktpi_direct = B_TRUE;
195 
/*
 * kmem caches that TPI sonodes are allocated from and freed to:
 * socktpi_unix_cache for AF_UNIX sockets, socktpi_cache for every other
 * address family.
 */
196 static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
197 
198 extern	void sigintr(k_sigset_t *, int);
199 extern	void sigunintr(k_sigset_t *);
200 
201 static int	sotpi_unbind(struct sonode *, int);
202 
203 /* TPI sockfs sonode operations */
204 int		sotpi_init(struct sonode *, struct sonode *, struct cred *,
205 		    int);
206 static int	sotpi_accept(struct sonode *, int, struct cred *,
207 		    struct sonode **);
208 static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
209 		    int, struct cred *);
210 static int	sotpi_listen(struct sonode *, int, struct cred *);
211 static int	sotpi_connect(struct sonode *, struct sockaddr *,
212 		    socklen_t, int, int, struct cred *);
213 extern int	sotpi_recvmsg(struct sonode *, struct nmsghdr *,
214 		    struct uio *, struct cred *);
215 static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
216 		    struct uio *, struct cred *);
217 static int	sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
218 		    struct cred *, mblk_t **);
219 static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
220 		    struct uio *, void *, t_uscalar_t, int);
221 static int	sodgram_direct(struct sonode *, struct sockaddr *,
222 		    socklen_t, struct uio *, int);
223 extern int	sotpi_getpeername(struct sonode *, struct sockaddr *,
224 		    socklen_t *, boolean_t, struct cred *);
225 static int	sotpi_getsockname(struct sonode *, struct sockaddr *,
226 		    socklen_t *, struct cred *);
227 static int	sotpi_shutdown(struct sonode *, int, struct cred *);
228 extern int	sotpi_getsockopt(struct sonode *, int, int, void *,
229 		    socklen_t *, int, struct cred *);
230 extern int	sotpi_setsockopt(struct sonode *, int, int, const void *,
231 		    socklen_t, struct cred *);
232 static int	sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
233 		    int32_t *);
234 static int	socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
235 		    struct cred *, int32_t *);
236 static int	sotpi_poll(struct sonode *, short, int, short *,
237 		    struct pollhead **);
238 static int	sotpi_close(struct sonode *, int, struct cred *);
239 
240 static int	i_sotpi_info_constructor(sotpi_info_t *);
241 static void	i_sotpi_info_destructor(sotpi_info_t *);
242 
/*
 * Operations vector for TPI sonodes.  sotpi_create() hands it to
 * sonode_init() and sotpi_destroy() asserts that so_ops still points at it.
 * The initializer is positional; the trailing comment on each line names
 * the slot that entry fills, so the order must not be changed.
 */
243 sonodeops_t sotpi_sonodeops = {
244 	sotpi_init,		/* sop_init		*/
245 	sotpi_accept,		/* sop_accept		*/
246 	sotpi_bind,		/* sop_bind		*/
247 	sotpi_listen,		/* sop_listen		*/
248 	sotpi_connect,		/* sop_connect		*/
249 	sotpi_recvmsg,		/* sop_recvmsg		*/
250 	sotpi_sendmsg,		/* sop_sendmsg		*/
251 	sotpi_sendmblk,		/* sop_sendmblk		*/
252 	sotpi_getpeername,	/* sop_getpeername	*/
253 	sotpi_getsockname,	/* sop_getsockname	*/
254 	sotpi_shutdown,		/* sop_shutdown		*/
255 	sotpi_getsockopt,	/* sop_getsockopt	*/
256 	sotpi_setsockopt,	/* sop_setsockopt	*/
257 	sotpi_ioctl,		/* sop_ioctl		*/
258 	sotpi_poll,		/* sop_poll		*/
259 	sotpi_close,		/* sop_close		*/
260 };
261 
262 /*
263  * Return a TPI socket vnode.
264  *
265  * Note that sockets assume that the driver will clone (either itself
266  * or by using the clone driver) i.e. a socket() call will always
267  * result in a new vnode being created.
268  */
269 
270 /*
271  * Common create code for socket and accept. If tso is set the values
272  * from that node is used instead of issuing a T_INFO_REQ.
273  */
274 
275 /* ARGSUSED */
276 static struct sonode *
sotpi_create(struct sockparams * sp,int family,int type,int protocol,int version,int sflags,int * errorp,cred_t * cr)277 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
278     int version, int sflags, int *errorp, cred_t *cr)
279 {
280 	struct sonode	*so;
281 	kmem_cache_t	*cp;
282 
283 	ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
284 
285 	/*
286 	 * to be compatible with old tpi socket implementation ignore
287 	 * sleep flag (sflags) passed in
288 	 */
289 	cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
290 	so = kmem_cache_alloc(cp, KM_SLEEP);
291 	if (so == NULL) {
292 		*errorp = ENOMEM;
293 		return (NULL);
294 	}
295 
296 	sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
297 	sotpi_info_init(so);
298 
299 	if (version == SOV_DEFAULT)
300 		version = so_default_version;
301 
302 	so->so_version = (short)version;
303 	*errorp = 0;
304 
305 	return (so);
306 }
307 
308 static void
sotpi_destroy(struct sonode * so)309 sotpi_destroy(struct sonode *so)
310 {
311 	kmem_cache_t *cp;
312 	struct sockparams *origsp;
313 
314 	/*
315 	 * If there is a new dealloc function (ie. smod_destroy_func),
316 	 * then it should check the correctness of the ops.
317 	 */
318 
319 	ASSERT(so->so_ops == &sotpi_sonodeops);
320 
321 	origsp = SOTOTPI(so)->sti_orig_sp;
322 
323 	sotpi_info_fini(so);
324 
325 	if (so->so_state & SS_FALLBACK_COMP) {
326 		/*
327 		 * A fallback happend, which means that a sotpi_info_t struct
328 		 * was allocated (as opposed to being allocated from the TPI
329 		 * sonode cache. Therefore we explicitly free the struct
330 		 * here.
331 		 */
332 		sotpi_info_destroy(so);
333 		ASSERT(origsp != NULL);
334 
335 		origsp->sp_smod_info->smod_sock_destroy_func(so);
336 		SOCKPARAMS_DEC_REF(origsp);
337 	} else {
338 		sonode_fini(so);
339 		cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
340 		    socktpi_cache;
341 		kmem_cache_free(cp, so);
342 	}
343 }
344 
345 /* ARGSUSED1 */
346 int
sotpi_init(struct sonode * so,struct sonode * tso,struct cred * cr,int flags)347 sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
348 {
349 	major_t maj;
350 	dev_t newdev;
351 	struct vnode *vp;
352 	int error = 0;
353 	struct stdata *stp;
354 
355 	sotpi_info_t *sti = SOTOTPI(so);
356 
357 	dprint(1, ("sotpi_init()\n"));
358 
359 	/*
360 	 * over write the sleep flag passed in but that is ok
361 	 * as tpi socket does not honor sleep flag.
362 	 */
363 	flags |= FREAD|FWRITE;
364 
365 	/*
366 	 * Record in so_flag that it is a clone.
367 	 */
368 	if (getmajor(sti->sti_dev) == clone_major)
369 		so->so_flag |= SOCLONE;
370 
371 	if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
372 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
373 	    (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
374 	    so->so_protocol == IPPROTO_IP)) {
375 		/* Tell tcp or udp that it's talking to sockets */
376 		flags |= SO_SOCKSTR;
377 
378 		/*
379 		 * Here we indicate to socktpi_open() our attempt to
380 		 * make direct calls between sockfs and transport.
381 		 * The final decision is left to socktpi_open().
382 		 */
383 		sti->sti_direct = 1;
384 
385 		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
386 		if (so->so_type == SOCK_STREAM && tso != NULL) {
387 			if (SOTOTPI(tso)->sti_direct) {
388 				/*
389 				 * Inherit sti_direct from listener and pass
390 				 * SO_ACCEPTOR open flag to tcp, indicating
391 				 * that this is an accept fast-path instance.
392 				 */
393 				flags |= SO_ACCEPTOR;
394 			} else {
395 				/*
396 				 * sti_direct is not set on listener, meaning
397 				 * that the listener has been converted from
398 				 * a socket to a stream.  Ensure that the
399 				 * acceptor inherits these settings.
400 				 */
401 				sti->sti_direct = 0;
402 				flags &= ~SO_SOCKSTR;
403 			}
404 		}
405 	}
406 
407 	/*
408 	 * Tell local transport that it is talking to sockets.
409 	 */
410 	if (so->so_family == AF_UNIX) {
411 		flags |= SO_SOCKSTR;
412 	}
413 
414 	vp = SOTOV(so);
415 	newdev = vp->v_rdev;
416 	maj = getmajor(newdev);
417 	ASSERT(STREAMSTAB(maj));
418 
419 	error = stropen(vp, &newdev, flags, cr);
420 
421 	stp = vp->v_stream;
422 	if (error == 0) {
423 		if (so->so_flag & SOCLONE)
424 			ASSERT(newdev != vp->v_rdev);
425 		mutex_enter(&so->so_lock);
426 		sti->sti_dev = newdev;
427 		vp->v_rdev = newdev;
428 		mutex_exit(&so->so_lock);
429 
430 		if (stp->sd_flag & STRISTTY) {
431 			/*
432 			 * this is a post SVR4 tty driver - a socket can not
433 			 * be a controlling terminal. Fail the open.
434 			 */
435 			(void) sotpi_close(so, flags, cr);
436 			return (ENOTTY);	/* XXX */
437 		}
438 
439 		ASSERT(stp->sd_wrq != NULL);
440 		sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
441 
442 		/*
443 		 * If caller is interested in doing direct function call
444 		 * interface to/from transport module, probe the module
445 		 * directly beneath the streamhead to see if it qualifies.
446 		 *
447 		 * We turn off the direct interface when qualifications fail.
448 		 * In the acceptor case, we simply turn off the sti_direct
449 		 * flag on the socket. We do the fallback after the accept
450 		 * has completed, before the new socket is returned to the
451 		 * application.
452 		 */
453 		if (sti->sti_direct) {
454 			queue_t *tq = stp->sd_wrq->q_next;
455 
456 			/*
457 			 * sti_direct is currently supported and tested
458 			 * only for tcp/udp; this is the main reason to
459 			 * have the following assertions.
460 			 */
461 			ASSERT(so->so_family == AF_INET ||
462 			    so->so_family == AF_INET6);
463 			ASSERT(so->so_protocol == IPPROTO_UDP ||
464 			    so->so_protocol == IPPROTO_TCP ||
465 			    so->so_protocol == IPPROTO_IP);
466 			ASSERT(so->so_type == SOCK_DGRAM ||
467 			    so->so_type == SOCK_STREAM);
468 
469 			/*
470 			 * Abort direct call interface if the module directly
471 			 * underneath the stream head is not defined with the
472 			 * _D_DIRECT flag.  This could happen in the tcp or
473 			 * udp case, when some other module is autopushed
474 			 * above it, or for some reasons the expected module
475 			 * isn't purely D_MP (which is the main requirement).
476 			 */
477 			if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
478 			    !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
479 				int rval;
480 
481 				/* Continue on without direct calls */
482 				sti->sti_direct = 0;
483 
484 				/*
485 				 * Cannot issue ioctl on fallback socket since
486 				 * there is no conn associated with the queue.
487 				 * The fallback downcall will notify the proto
488 				 * of the change.
489 				 */
490 				if (!(flags & SO_ACCEPTOR) &&
491 				    !(flags & SO_FALLBACK)) {
492 					if ((error = strioctl(vp,
493 					    _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
494 					    cr, &rval)) != 0) {
495 						(void) sotpi_close(so, flags,
496 						    cr);
497 						return (error);
498 					}
499 				}
500 			}
501 		}
502 
503 		if (flags & SO_FALLBACK) {
504 			/*
505 			 * The stream created does not have a conn.
506 			 * do stream set up after conn has been assigned
507 			 */
508 			return (error);
509 		}
510 		error = so_strinit(so, tso);
511 		if (error != 0) {
512 			(void) sotpi_close(so, flags, cr);
513 			return (error);
514 		}
515 
516 		/* Enable sendfile() on AF_UNIX streams */
517 		if (so->so_family == AF_UNIX && so->so_type == SOCK_STREAM) {
518 			mutex_enter(&so->so_lock);
519 			so->so_mode |= SM_SENDFILESUPP;
520 			mutex_exit(&so->so_lock);
521 		}
522 
523 		/* Wildcard */
524 		if (so->so_protocol != so->so_sockparams->sp_protocol) {
525 			int protocol = so->so_protocol;
526 			/*
527 			 * Issue SO_PROTOTYPE setsockopt.
528 			 */
529 			error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
530 			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
531 			if (error != 0) {
532 				(void) sotpi_close(so, flags, cr);
533 				/*
534 				 * Setsockopt often fails with ENOPROTOOPT but
535 				 * socket() should fail with
536 				 * EPROTONOSUPPORT/EPROTOTYPE.
537 				 */
538 				return (EPROTONOSUPPORT);
539 			}
540 		}
541 
542 	} else {
543 		/*
544 		 * While the same socket can not be reopened (unlike specfs)
545 		 * the stream head sets STREOPENFAIL when the autopush fails.
546 		 */
547 		if ((stp != NULL) &&
548 		    (stp->sd_flag & STREOPENFAIL)) {
549 			/*
550 			 * Open failed part way through.
551 			 */
552 			mutex_enter(&stp->sd_lock);
553 			stp->sd_flag &= ~STREOPENFAIL;
554 			mutex_exit(&stp->sd_lock);
555 			(void) sotpi_close(so, flags, cr);
556 			return (error);
557 			/*NOTREACHED*/
558 		}
559 		ASSERT(stp == NULL);
560 	}
561 	TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
562 	    "sockfs open:maj %d vp %p so %p error %d",
563 	    maj, vp, so, error);
564 	return (error);
565 }
566 
567 /*
568  * Bind the socket to an unspecified address in sockfs only.
569  * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
570  * required in all cases.
571  */
572 static void
so_automatic_bind(struct sonode * so)573 so_automatic_bind(struct sonode *so)
574 {
575 	sotpi_info_t *sti = SOTOTPI(so);
576 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
577 
578 	ASSERT(MUTEX_HELD(&so->so_lock));
579 	ASSERT(!(so->so_state & SS_ISBOUND));
580 	ASSERT(sti->sti_unbind_mp);
581 
582 	ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
583 	bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
584 	sti->sti_laddr_sa->sa_family = so->so_family;
585 	so->so_state |= SS_ISBOUND;
586 }
587 
588 
589 /*
590  * bind the socket.
591  *
592  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
593  * are passed in we allow rebinding. Note that for backwards compatibility
594  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
595  * Thus the rebinding code is currently not executed.
596  *
597  * The constraints for rebinding are:
598  * - it is a SOCK_DGRAM, or
599  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
600  *   and no listen() has been done.
601  * This rebinding code was added based on some language in the XNET book
602  * about not returning EINVAL if the protocol allows rebinding. However,
603  * this language is not present in the Posix socket draft. Thus maybe the
604  * rebinding logic should be deleted from the source.
605  *
606  * A null "name" can be used to unbind the socket if:
607  * - it is a SOCK_DGRAM, or
608  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
609  *   and no listen() has been done.
610  */
611 /* ARGSUSED */
612 static int
sotpi_bindlisten(struct sonode * so,struct sockaddr * name,socklen_t namelen,int backlog,int flags,struct cred * cr)613 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
614     socklen_t namelen, int backlog, int flags, struct cred *cr)
615 {
616 	struct T_bind_req	bind_req;
617 	struct T_bind_ack	*bind_ack;
618 	int			error = 0;
619 	mblk_t			*mp;
620 	void			*addr;
621 	t_uscalar_t		addrlen;
622 	int			unbind_on_err = 1;
623 	boolean_t		clear_acceptconn_on_err = B_FALSE;
624 	boolean_t		restore_backlog_on_err = B_FALSE;
625 	int			save_so_backlog = 0;
626 	t_scalar_t		PRIM_type = O_T_BIND_REQ;
627 	boolean_t		tcp_udp_xport;
628 	sotpi_info_t		*sti = SOTOTPI(so);
629 
630 	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
631 	    (void *)so, (void *)name, namelen, backlog, flags,
632 	    pr_state(so->so_state, so->so_mode)));
633 
634 	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
635 
636 	if (!(flags & _SOBIND_LOCK_HELD)) {
637 		mutex_enter(&so->so_lock);
638 		so_lock_single(so);	/* Set SOLOCKED */
639 	} else {
640 		ASSERT(MUTEX_HELD(&so->so_lock));
641 		ASSERT(so->so_flag & SOLOCKED);
642 	}
643 
644 	/*
645 	 * Make sure that there is a preallocated unbind_req message
646 	 * before binding. This message allocated when the socket is
647 	 * created  but it might be have been consumed.
648 	 */
649 	if (sti->sti_unbind_mp == NULL) {
650 		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
651 		/* NOTE: holding so_lock while sleeping */
652 		sti->sti_unbind_mp =
653 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
654 		    cr);
655 	}
656 
657 	if (flags & _SOBIND_REBIND) {
658 		/*
659 		 * Called from solisten after doing an sotpi_unbind() or
660 		 * potentially without the unbind (latter for AF_INET{,6}).
661 		 */
662 		ASSERT(name == NULL && namelen == 0);
663 
664 		if (so->so_family == AF_UNIX) {
665 			ASSERT(sti->sti_ux_bound_vp);
666 			addr = &sti->sti_ux_laddr;
667 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
668 			dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
669 			    "addr 0x%p, vp %p\n",
670 			    addrlen,
671 			    (void *)((struct so_ux_addr *)addr)->soua_vp,
672 			    (void *)sti->sti_ux_bound_vp));
673 		} else {
674 			addr = sti->sti_laddr_sa;
675 			addrlen = (t_uscalar_t)sti->sti_laddr_len;
676 		}
677 	} else if (flags & _SOBIND_UNSPEC) {
678 		ASSERT(name == NULL && namelen == 0);
679 
680 		/*
681 		 * The caller checked SS_ISBOUND but not necessarily
682 		 * under so_lock
683 		 */
684 		if (so->so_state & SS_ISBOUND) {
685 			/* No error */
686 			goto done;
687 		}
688 
689 		/* Set an initial local address */
690 		switch (so->so_family) {
691 		case AF_UNIX:
692 			/*
693 			 * Use an address with same size as struct sockaddr
694 			 * just like BSD.
695 			 */
696 			sti->sti_laddr_len =
697 			    (socklen_t)sizeof (struct sockaddr);
698 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
699 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
700 			sti->sti_laddr_sa->sa_family = so->so_family;
701 
702 			/*
703 			 * Pass down an address with the implicit bind
704 			 * magic number and the rest all zeros.
705 			 * The transport will return a unique address.
706 			 */
707 			sti->sti_ux_laddr.soua_vp = NULL;
708 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
709 			addr = &sti->sti_ux_laddr;
710 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
711 			break;
712 
713 		case AF_INET:
714 		case AF_INET6:
715 			/*
716 			 * An unspecified bind in TPI has a NULL address.
717 			 * Set the address in sockfs to have the sa_family.
718 			 */
719 			sti->sti_laddr_len = (so->so_family == AF_INET) ?
720 			    (socklen_t)sizeof (sin_t) :
721 			    (socklen_t)sizeof (sin6_t);
722 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
723 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
724 			sti->sti_laddr_sa->sa_family = so->so_family;
725 			addr = NULL;
726 			addrlen = 0;
727 			break;
728 
729 		default:
730 			/*
731 			 * An unspecified bind in TPI has a NULL address.
732 			 * Set the address in sockfs to be zero length.
733 			 *
734 			 * Can not assume there is a sa_family for all
735 			 * protocol families. For example, AF_X25 does not
736 			 * have a family field.
737 			 */
738 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
739 			sti->sti_laddr_len = 0;	/* XXX correct? */
740 			addr = NULL;
741 			addrlen = 0;
742 			break;
743 		}
744 
745 	} else {
746 		if (so->so_state & SS_ISBOUND) {
747 			/*
748 			 * If it is ok to rebind the socket, first unbind
749 			 * with the transport. A rebind to the NULL address
750 			 * is interpreted as an unbind.
751 			 * Note that a bind to NULL in BSD does unbind the
752 			 * socket but it fails with EINVAL.
753 			 * Note that regular sockets set SOV_SOCKBSD i.e.
754 			 * _SOBIND_SOCKBSD gets set here hence no type of
755 			 * socket does currently allow rebinding.
756 			 *
757 			 * If the name is NULL just do an unbind.
758 			 */
759 			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
760 			    name != NULL) {
761 				error = EINVAL;
762 				unbind_on_err = 0;
763 				eprintsoline(so, error);
764 				goto done;
765 			}
766 			if ((so->so_mode & SM_CONNREQUIRED) &&
767 			    (so->so_state & SS_CANTREBIND)) {
768 				error = EINVAL;
769 				unbind_on_err = 0;
770 				eprintsoline(so, error);
771 				goto done;
772 			}
773 			error = sotpi_unbind(so, 0);
774 			if (error) {
775 				eprintsoline(so, error);
776 				goto done;
777 			}
778 			ASSERT(!(so->so_state & SS_ISBOUND));
779 			if (name == NULL) {
780 				so->so_state &=
781 				    ~(SS_ISCONNECTED|SS_ISCONNECTING);
782 				goto done;
783 			}
784 		}
785 
786 		/* X/Open requires this check */
787 		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
788 			if (xnet_check_print) {
789 				printf("sockfs: X/Open bind state check "
790 				    "caused EINVAL\n");
791 			}
792 			error = EINVAL;
793 			goto done;
794 		}
795 
796 		switch (so->so_family) {
797 		case AF_UNIX:
798 			/*
799 			 * All AF_UNIX addresses are nul terminated
800 			 * when copied (copyin_name) in so the minimum
801 			 * length is 3 bytes.
802 			 */
803 			if (name == NULL ||
804 			    (ssize_t)namelen <= sizeof (short) + 1) {
805 				error = EISDIR;
806 				eprintsoline(so, error);
807 				goto done;
808 			}
809 			/*
810 			 * Verify so_family matches the bound family.
811 			 * BSD does not check this for AF_UNIX resulting
812 			 * in funny mknods.
813 			 */
814 			if (name->sa_family != so->so_family) {
815 				error = EAFNOSUPPORT;
816 				goto done;
817 			}
818 			break;
819 		case AF_INET:
820 			if (name == NULL) {
821 				error = EINVAL;
822 				eprintsoline(so, error);
823 				goto done;
824 			}
825 			if ((size_t)namelen != sizeof (sin_t)) {
826 				error = name->sa_family != so->so_family ?
827 				    EAFNOSUPPORT : EINVAL;
828 				eprintsoline(so, error);
829 				goto done;
830 			}
831 			if ((flags & _SOBIND_XPG4_2) &&
832 			    (name->sa_family != so->so_family)) {
833 				/*
834 				 * This check has to be made for X/Open
835 				 * sockets however application failures have
836 				 * been observed when it is applied to
837 				 * all sockets.
838 				 */
839 				error = EAFNOSUPPORT;
840 				eprintsoline(so, error);
841 				goto done;
842 			}
843 			/*
844 			 * Force a zero sa_family to match so_family.
845 			 *
846 			 * Some programs like inetd(8) don't set the
847 			 * family field. Other programs leave
848 			 * sin_family set to garbage - SunOS 4.X does
849 			 * not check the family field on a bind.
850 			 * We use the family field that
851 			 * was passed in to the socket() call.
852 			 */
853 			name->sa_family = so->so_family;
854 			break;
855 
856 		case AF_INET6: {
857 #ifdef DEBUG
858 			sin6_t *sin6 = (sin6_t *)name;
859 #endif /* DEBUG */
860 
861 			if (name == NULL) {
862 				error = EINVAL;
863 				eprintsoline(so, error);
864 				goto done;
865 			}
866 			if ((size_t)namelen != sizeof (sin6_t)) {
867 				error = name->sa_family != so->so_family ?
868 				    EAFNOSUPPORT : EINVAL;
869 				eprintsoline(so, error);
870 				goto done;
871 			}
872 			if (name->sa_family != so->so_family) {
873 				/*
874 				 * With IPv6 we require the family to match
875 				 * unlike in IPv4.
876 				 */
877 				error = EAFNOSUPPORT;
878 				eprintsoline(so, error);
879 				goto done;
880 			}
881 #ifdef DEBUG
882 			/*
883 			 * Verify that apps don't forget to clear
884 			 * sin6_scope_id etc
885 			 */
886 			if (sin6->sin6_scope_id != 0 &&
887 			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
888 				zcmn_err(getzoneid(), CE_WARN,
889 				    "bind with uninitialized sin6_scope_id "
890 				    "(%d) on socket. Pid = %d\n",
891 				    (int)sin6->sin6_scope_id,
892 				    (int)curproc->p_pid);
893 			}
894 			if (sin6->__sin6_src_id != 0) {
895 				zcmn_err(getzoneid(), CE_WARN,
896 				    "bind with uninitialized __sin6_src_id "
897 				    "(%d) on socket. Pid = %d\n",
898 				    (int)sin6->__sin6_src_id,
899 				    (int)curproc->p_pid);
900 			}
901 #endif /* DEBUG */
902 			break;
903 		}
904 		default:
905 			/*
906 			 * Don't do any length or sa_family check to allow
907 			 * non-sockaddr style addresses.
908 			 */
909 			if (name == NULL) {
910 				error = EINVAL;
911 				eprintsoline(so, error);
912 				goto done;
913 			}
914 			break;
915 		}
916 
917 		if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
918 			error = ENAMETOOLONG;
919 			eprintsoline(so, error);
920 			goto done;
921 		}
922 		/*
923 		 * Save local address.
924 		 */
925 		sti->sti_laddr_len = (socklen_t)namelen;
926 		ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
927 		bcopy(name, sti->sti_laddr_sa, namelen);
928 
929 		addr = sti->sti_laddr_sa;
930 		addrlen = (t_uscalar_t)sti->sti_laddr_len;
931 		switch (so->so_family) {
932 		case AF_INET6:
933 		case AF_INET:
934 			break;
935 		case AF_UNIX: {
936 			struct sockaddr_un *soun =
937 			    (struct sockaddr_un *)sti->sti_laddr_sa;
938 			struct vnode *vp, *rvp;
939 			struct vattr vattr;
940 
941 			ASSERT(sti->sti_ux_bound_vp == NULL);
942 			/*
943 			 * Create vnode for the specified path name.
944 			 * Keep vnode held with a reference in sti_ux_bound_vp.
945 			 * Use the vnode pointer as the address used in the
946 			 * bind with the transport.
947 			 *
948 			 * Use the same mode as in BSD. In particular this does
949 			 * not observe the umask.
950 			 */
951 			/* MAXPATHLEN + soun_family + nul termination */
952 			if (sti->sti_laddr_len >
953 			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
954 				error = ENAMETOOLONG;
955 				eprintsoline(so, error);
956 				goto done;
957 			}
958 			vattr.va_type = VSOCK;
959 			vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
960 			vattr.va_mask = AT_TYPE|AT_MODE;
961 			/* NOTE: holding so_lock */
962 			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
963 			    EXCL, 0, &vp, CRMKNOD, 0, 0);
964 			if (error) {
965 				if (error == EEXIST)
966 					error = EADDRINUSE;
967 				eprintsoline(so, error);
968 				goto done;
969 			}
970 			/*
971 			 * Establish pointer from the underlying filesystem
972 			 * vnode to the socket node.
973 			 * sti_ux_bound_vp and v_stream->sd_vnode form the
974 			 * cross-linkage between the underlying filesystem
975 			 * node and the socket node.
976 			 */
977 
978 			if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
979 				VN_HOLD(rvp);
980 				VN_RELE(vp);
981 				vp = rvp;
982 			}
983 
984 			ASSERT(SOTOV(so)->v_stream);
985 			mutex_enter(&vp->v_lock);
986 			vp->v_stream = SOTOV(so)->v_stream;
987 			sti->sti_ux_bound_vp = vp;
988 			mutex_exit(&vp->v_lock);
989 
990 			/*
991 			 * Use the vnode pointer value as a unique address
992 			 * (together with the magic number to avoid conflicts
993 			 * with implicit binds) in the transport provider.
994 			 */
995 			sti->sti_ux_laddr.soua_vp =
996 			    (void *)sti->sti_ux_bound_vp;
997 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
998 			addr = &sti->sti_ux_laddr;
999 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1000 			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1001 			    addrlen,
1002 			    (void *)((struct so_ux_addr *)addr)->soua_vp));
1003 			break;
1004 		}
1005 		} /* end switch (so->so_family) */
1006 	}
1007 
1008 	/*
1009 	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1010 	 * the transport can start passing up T_CONN_IND messages
1011 	 * as soon as it receives the bind req and strsock_proto()
1012 	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1013 	 */
1014 	if (flags & _SOBIND_LISTEN) {
1015 		if ((so->so_state & SS_ACCEPTCONN) == 0)
1016 			clear_acceptconn_on_err = B_TRUE;
1017 		save_so_backlog = so->so_backlog;
1018 		restore_backlog_on_err = B_TRUE;
1019 		so->so_state |= SS_ACCEPTCONN;
1020 		so->so_backlog = backlog;
1021 	}
1022 
1023 	/*
1024 	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1025 	 * for other transports we will send in a O_T_BIND_REQ.
1026 	 */
1027 	if (tcp_udp_xport &&
1028 	    (so->so_family == AF_INET || so->so_family == AF_INET6))
1029 		PRIM_type = T_BIND_REQ;
1030 
1031 	bind_req.PRIM_type = PRIM_type;
1032 	bind_req.ADDR_length = addrlen;
1033 	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1034 	bind_req.CONIND_number = backlog;
1035 	/* NOTE: holding so_lock while sleeping */
1036 	mp = soallocproto2(&bind_req, sizeof (bind_req),
1037 	    addr, addrlen, 0, _ALLOC_SLEEP, cr);
1038 	sti->sti_laddr_valid = 0;
1039 
1040 	/* Done using sti_laddr_sa - can drop the lock */
1041 	mutex_exit(&so->so_lock);
1042 
1043 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1044 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1045 	if (error) {
1046 		eprintsoline(so, error);
1047 		mutex_enter(&so->so_lock);
1048 		goto done;
1049 	}
1050 
1051 	mutex_enter(&so->so_lock);
1052 	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1053 	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1054 	if (error) {
1055 		eprintsoline(so, error);
1056 		goto done;
1057 	}
1058 	ASSERT(mp);
1059 	/*
1060 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1061 	 * strsock_proto while the lock was dropped above, the bind
1062 	 * is allowed to complete.
1063 	 */
1064 
1065 	/* Mark as bound. This will be undone if we detect errors below. */
1066 	if (flags & _SOBIND_NOXLATE) {
1067 		ASSERT(so->so_family == AF_UNIX);
1068 		sti->sti_faddr_noxlate = 1;
1069 	}
1070 	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1071 	so->so_state |= SS_ISBOUND;
1072 	ASSERT(sti->sti_unbind_mp);
1073 
1074 	/* note that we've already set SS_ACCEPTCONN above */
1075 
1076 	/*
1077 	 * Recompute addrlen - an unspecied bind sent down an
1078 	 * address of length zero but we expect the appropriate length
1079 	 * in return.
1080 	 */
1081 	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1082 	    sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1083 
1084 	bind_ack = (struct T_bind_ack *)mp->b_rptr;
1085 	/*
1086 	 * The alignment restriction is really too strict but
1087 	 * we want enough alignment to inspect the fields of
1088 	 * a sockaddr_in.
1089 	 */
1090 	addr = sogetoff(mp, bind_ack->ADDR_offset,
1091 	    bind_ack->ADDR_length,
1092 	    __TPI_ALIGN_SIZE);
1093 	if (addr == NULL) {
1094 		freemsg(mp);
1095 		error = EPROTO;
1096 		eprintsoline(so, error);
1097 		goto done;
1098 	}
1099 	if (!(flags & _SOBIND_UNSPEC)) {
1100 		/*
1101 		 * Verify that the transport didn't return something we
1102 		 * did not want e.g. an address other than what we asked for.
1103 		 *
1104 		 * NOTE: These checks would go away if/when we switch to
1105 		 * using the new TPI (in which the transport would fail
1106 		 * the request instead of assigning a different address).
1107 		 *
1108 		 * NOTE2: For protocols that we don't know (i.e. any
1109 		 * other than AF_INET6, AF_INET and AF_UNIX), we
1110 		 * cannot know if the transport should be expected to
1111 		 * return the same address as that requested.
1112 		 *
1113 		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1114 		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1115 		 *
1116 		 * For example, in the case of netatalk it may be
1117 		 * inappropriate for the transport to return the
1118 		 * requested address (as it may have allocated a local
1119 		 * port number in behaviour similar to that of an
1120 		 * AF_INET bind request with a port number of zero).
1121 		 *
1122 		 * Given the definition of O_T_BIND_REQ, where the
1123 		 * transport may bind to an address other than the
1124 		 * requested address, it's not possible to determine
1125 		 * whether a returned address that differs from the
1126 		 * requested address is a reason to fail (because the
1127 		 * requested address was not available) or succeed
1128 		 * (because the transport allocated an appropriate
1129 		 * address and/or port).
1130 		 *
1131 		 * sockfs currently requires that the transport return
1132 		 * the requested address in the T_BIND_ACK, unless
1133 		 * there is code here to allow for any discrepancy.
1134 		 * Such code exists for AF_INET and AF_INET6.
1135 		 *
1136 		 * Netatalk chooses to return the requested address
1137 		 * rather than the (correct) allocated address.  This
1138 		 * means that netatalk violates the TPI specification
1139 		 * (and would not function correctly if used from a
1140 		 * TLI application), but it does mean that it works
1141 		 * with sockfs.
1142 		 *
1143 		 * As noted above, using the newer XTI bind primitive
1144 		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1145 		 * allow sockfs to be more sure about whether or not
1146 		 * the bind request had succeeded (as transports are
1147 		 * not permitted to bind to a different address than
1148 		 * that requested - they must return failure).
1149 		 * Unfortunately, support for T_BIND_REQ may not be
1150 		 * present in all transport implementations (netatalk,
1151 		 * for example, doesn't have it), making the
1152 		 * transition difficult.
1153 		 */
1154 		if (bind_ack->ADDR_length != addrlen) {
1155 			/* Assumes that the requested address was in use */
1156 			freemsg(mp);
1157 			error = EADDRINUSE;
1158 			eprintsoline(so, error);
1159 			goto done;
1160 		}
1161 
1162 		switch (so->so_family) {
1163 		case AF_INET6:
1164 		case AF_INET: {
1165 			sin_t *rname, *aname;
1166 
1167 			rname = (sin_t *)addr;
1168 			aname = (sin_t *)sti->sti_laddr_sa;
1169 
1170 			/*
1171 			 * Take advantage of the alignment
1172 			 * of sin_port and sin6_port which fall
1173 			 * in the same place in their data structures.
1174 			 * Just use sin_port for either address family.
1175 			 *
1176 			 * This may become a problem if (heaven forbid)
1177 			 * there's a separate ipv6port_reserved... :-P
1178 			 *
1179 			 * Binding to port 0 has the semantics of letting
1180 			 * the transport bind to any port.
1181 			 *
1182 			 * If the transport is TCP or UDP since we had sent
1183 			 * a T_BIND_REQ we would not get a port other than
1184 			 * what we asked for.
1185 			 */
1186 			if (tcp_udp_xport) {
1187 				/*
1188 				 * Pick up the new port number if we bound to
1189 				 * port 0.
1190 				 */
1191 				if (aname->sin_port == 0)
1192 					aname->sin_port = rname->sin_port;
1193 				sti->sti_laddr_valid = 1;
1194 				break;
1195 			}
1196 			if (aname->sin_port != 0 &&
1197 			    aname->sin_port != rname->sin_port) {
1198 				freemsg(mp);
1199 				error = EADDRINUSE;
1200 				eprintsoline(so, error);
1201 				goto done;
1202 			}
1203 			/*
1204 			 * Pick up the new port number if we bound to port 0.
1205 			 */
1206 			aname->sin_port = rname->sin_port;
1207 
1208 			/*
1209 			 * Unfortunately, addresses aren't _quite_ the same.
1210 			 */
1211 			if (so->so_family == AF_INET) {
1212 				if (aname->sin_addr.s_addr !=
1213 				    rname->sin_addr.s_addr) {
1214 					freemsg(mp);
1215 					error = EADDRNOTAVAIL;
1216 					eprintsoline(so, error);
1217 					goto done;
1218 				}
1219 			} else {
1220 				sin6_t *rname6 = (sin6_t *)rname;
1221 				sin6_t *aname6 = (sin6_t *)aname;
1222 
1223 				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1224 				    &rname6->sin6_addr)) {
1225 					freemsg(mp);
1226 					error = EADDRNOTAVAIL;
1227 					eprintsoline(so, error);
1228 					goto done;
1229 				}
1230 			}
1231 			break;
1232 		}
1233 		case AF_UNIX:
1234 			if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1235 				freemsg(mp);
1236 				error = EADDRINUSE;
1237 				eprintsoline(so, error);
1238 				eprintso(so,
1239 				    ("addrlen %d, addr 0x%x, vp %p\n",
1240 				    addrlen, *((int *)addr),
1241 				    (void *)sti->sti_ux_bound_vp));
1242 				goto done;
1243 			}
1244 			sti->sti_laddr_valid = 1;
1245 			break;
1246 		default:
1247 			/*
1248 			 * NOTE: This assumes that addresses can be
1249 			 * byte-compared for equivalence.
1250 			 */
1251 			if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1252 				freemsg(mp);
1253 				error = EADDRINUSE;
1254 				eprintsoline(so, error);
1255 				goto done;
1256 			}
1257 			/*
1258 			 * Don't mark sti_laddr_valid, as we cannot be
1259 			 * sure that the returned address is the real
1260 			 * bound address when talking to an unknown
1261 			 * transport.
1262 			 */
1263 			break;
1264 		}
1265 	} else {
1266 		/*
1267 		 * Save for returned address for getsockname.
1268 		 * Needed for unspecific bind unless transport supports
1269 		 * the TI_GETMYNAME ioctl.
1270 		 * Do this for AF_INET{,6} even though they do, as
1271 		 * caching info here is much better performance than
1272 		 * a TPI/STREAMS trip to the transport for getsockname.
1273 		 * Any which can't for some reason _must_ _not_ set
1274 		 * sti_laddr_valid here for the caching version of
1275 		 * getsockname to not break;
1276 		 */
1277 		switch (so->so_family) {
1278 		case AF_UNIX:
1279 			/*
1280 			 * Record the address bound with the transport
1281 			 * for use by socketpair.
1282 			 */
1283 			bcopy(addr, &sti->sti_ux_laddr, addrlen);
1284 			sti->sti_laddr_valid = 1;
1285 			break;
1286 		case AF_INET:
1287 		case AF_INET6:
1288 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1289 			bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1290 			sti->sti_laddr_valid = 1;
1291 			break;
1292 		default:
1293 			/*
1294 			 * Don't mark sti_laddr_valid, as we cannot be
1295 			 * sure that the returned address is the real
1296 			 * bound address when talking to an unknown
1297 			 * transport.
1298 			 */
1299 			break;
1300 		}
1301 	}
1302 
1303 	freemsg(mp);
1304 
1305 done:
1306 	if (error) {
1307 		/* reset state & backlog to values held on entry */
1308 		if (clear_acceptconn_on_err == B_TRUE)
1309 			so->so_state &= ~SS_ACCEPTCONN;
1310 		if (restore_backlog_on_err == B_TRUE)
1311 			so->so_backlog = save_so_backlog;
1312 
1313 		if (unbind_on_err && so->so_state & SS_ISBOUND) {
1314 			int err;
1315 
1316 			err = sotpi_unbind(so, 0);
1317 			/* LINTED - statement has no consequent: if */
1318 			if (err) {
1319 				eprintsoline(so, error);
1320 			} else {
1321 				ASSERT(!(so->so_state & SS_ISBOUND));
1322 			}
1323 		}
1324 	}
1325 	if (!(flags & _SOBIND_LOCK_HELD)) {
1326 		so_unlock_single(so, SOLOCKED);
1327 		mutex_exit(&so->so_lock);
1328 	} else {
1329 		ASSERT(MUTEX_HELD(&so->so_lock));
1330 		ASSERT(so->so_flag & SOLOCKED);
1331 	}
1332 	return (error);
1333 }
1334 
1335 /* bind the socket */
1336 static int
sotpi_bind(struct sonode * so,struct sockaddr * name,socklen_t namelen,int flags,struct cred * cr)1337 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1338     int flags, struct cred *cr)
1339 {
1340 	if ((flags & _SOBIND_SOCKETPAIR) == 0)
1341 		return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1342 
1343 	flags &= ~_SOBIND_SOCKETPAIR;
1344 	return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1345 }
1346 
1347 /*
1348  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1349  * address, or when listen needs to unbind and bind.
1350  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1351  * so that a sobind can pick them up.
1352  */
1353 static int
sotpi_unbind(struct sonode * so,int flags)1354 sotpi_unbind(struct sonode *so, int flags)
1355 {
1356 	struct T_unbind_req	unbind_req;
1357 	int			error = 0;
1358 	mblk_t			*mp;
1359 	sotpi_info_t		*sti = SOTOTPI(so);
1360 
1361 	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1362 	    (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1363 
1364 	ASSERT(MUTEX_HELD(&so->so_lock));
1365 	ASSERT(so->so_flag & SOLOCKED);
1366 
1367 	if (!(so->so_state & SS_ISBOUND)) {
1368 		error = EINVAL;
1369 		eprintsoline(so, error);
1370 		goto done;
1371 	}
1372 
1373 	mutex_exit(&so->so_lock);
1374 
1375 	/*
1376 	 * Flush the read and write side (except stream head read queue)
1377 	 * and send down T_UNBIND_REQ.
1378 	 */
1379 	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1380 
1381 	unbind_req.PRIM_type = T_UNBIND_REQ;
1382 	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1383 	    0, _ALLOC_SLEEP, CRED());
1384 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1385 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1386 	mutex_enter(&so->so_lock);
1387 	if (error) {
1388 		eprintsoline(so, error);
1389 		goto done;
1390 	}
1391 
1392 	error = sowaitokack(so, T_UNBIND_REQ);
1393 	if (error) {
1394 		eprintsoline(so, error);
1395 		goto done;
1396 	}
1397 
1398 	/*
1399 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1400 	 * strsock_proto while the lock was dropped above, the unbind
1401 	 * is allowed to complete.
1402 	 */
1403 	if (!(flags & _SOUNBIND_REBIND)) {
1404 		/*
1405 		 * Clear out bound address.
1406 		 */
1407 		vnode_t *vp;
1408 
1409 		if ((vp = sti->sti_ux_bound_vp) != NULL) {
1410 			sti->sti_ux_bound_vp = NULL;
1411 			vn_rele_stream(vp);
1412 		}
1413 		/* Clear out address */
1414 		sti->sti_laddr_len = 0;
1415 	}
1416 	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1417 	sti->sti_laddr_valid = 0;
1418 
1419 done:
1420 
1421 	/* If the caller held the lock don't release it here */
1422 	ASSERT(MUTEX_HELD(&so->so_lock));
1423 	ASSERT(so->so_flag & SOLOCKED);
1424 
1425 	return (error);
1426 }
1427 
1428 /*
1429  * listen on the socket.
1430  * For TPI conforming transports this has to first unbind with the transport
1431  * and then bind again using the new backlog.
1432  */
1433 /* ARGSUSED */
1434 int
sotpi_listen(struct sonode * so,int backlog,struct cred * cr)1435 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1436 {
1437 	int		error = 0;
1438 	sotpi_info_t	*sti = SOTOTPI(so);
1439 
1440 	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1441 	    (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1442 
1443 	if (sti->sti_serv_type == T_CLTS)
1444 		return (EOPNOTSUPP);
1445 
1446 	/*
1447 	 * If the socket is ready to accept connections already, then
1448 	 * return without doing anything.  This avoids a problem where
1449 	 * a second listen() call fails if a connection is pending and
1450 	 * leaves the socket unbound. Only when we are not unbinding
1451 	 * with the transport can we safely increase the backlog.
1452 	 */
1453 	if (so->so_state & SS_ACCEPTCONN &&
1454 	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1455 	    /*CONSTCOND*/
1456 	    !solisten_tpi_tcp))
1457 		return (0);
1458 
1459 	if (so->so_state & SS_ISCONNECTED)
1460 		return (EINVAL);
1461 
1462 	mutex_enter(&so->so_lock);
1463 	so_lock_single(so);	/* Set SOLOCKED */
1464 
1465 	/*
1466 	 * If the listen doesn't change the backlog we do nothing.
1467 	 * This avoids an EPROTO error from the transport.
1468 	 */
1469 	if ((so->so_state & SS_ACCEPTCONN) &&
1470 	    so->so_backlog == backlog)
1471 		goto done;
1472 
1473 	if (!(so->so_state & SS_ISBOUND)) {
1474 		/*
1475 		 * Must have been explicitly bound in the UNIX domain.
1476 		 */
1477 		if (so->so_family == AF_UNIX) {
1478 			error = EINVAL;
1479 			goto done;
1480 		}
1481 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1482 		    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1483 	} else if (backlog > 0) {
1484 		/*
1485 		 * AF_INET{,6} hack to avoid losing the port.
1486 		 * Assumes that all AF_INET{,6} transports can handle a
1487 		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1488 		 * has already bound thus it is possible to avoid the unbind.
1489 		 */
1490 		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1491 		    /*CONSTCOND*/
1492 		    !solisten_tpi_tcp)) {
1493 			error = sotpi_unbind(so, _SOUNBIND_REBIND);
1494 			if (error)
1495 				goto done;
1496 		}
1497 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1498 		    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1499 	} else {
1500 		so->so_state |= SS_ACCEPTCONN;
1501 		so->so_backlog = backlog;
1502 	}
1503 	if (error)
1504 		goto done;
1505 	ASSERT(so->so_state & SS_ACCEPTCONN);
1506 done:
1507 	so_unlock_single(so, SOLOCKED);
1508 	mutex_exit(&so->so_lock);
1509 	return (error);
1510 }
1511 
1512 /*
1513  * Disconnect either a specified seqno or all (-1).
1514  * The former is used on listening sockets only.
1515  *
1516  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1517  * the current use of sodisconnect(seqno == -1) is only for shutdown
1518  * so there is no point (and potentially incorrect) to unbind.
1519  */
1520 static int
sodisconnect(struct sonode * so,t_scalar_t seqno,int flags)1521 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1522 {
1523 	struct T_discon_req	discon_req;
1524 	int			error = 0;
1525 	mblk_t			*mp;
1526 
1527 	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1528 	    (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1529 
1530 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1531 		mutex_enter(&so->so_lock);
1532 		so_lock_single(so);	/* Set SOLOCKED */
1533 	} else {
1534 		ASSERT(MUTEX_HELD(&so->so_lock));
1535 		ASSERT(so->so_flag & SOLOCKED);
1536 	}
1537 
1538 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1539 		error = EINVAL;
1540 		eprintsoline(so, error);
1541 		goto done;
1542 	}
1543 
1544 	mutex_exit(&so->so_lock);
1545 	/*
1546 	 * Flush the write side (unless this is a listener)
1547 	 * and then send down a T_DISCON_REQ.
1548 	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1549 	 * and other messages.)
1550 	 */
1551 	if (!(so->so_state & SS_ACCEPTCONN))
1552 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1553 
1554 	discon_req.PRIM_type = T_DISCON_REQ;
1555 	discon_req.SEQ_number = seqno;
1556 	mp = soallocproto1(&discon_req, sizeof (discon_req),
1557 	    0, _ALLOC_SLEEP, CRED());
1558 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1559 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1560 	mutex_enter(&so->so_lock);
1561 	if (error) {
1562 		eprintsoline(so, error);
1563 		goto done;
1564 	}
1565 
1566 	error = sowaitokack(so, T_DISCON_REQ);
1567 	if (error) {
1568 		eprintsoline(so, error);
1569 		goto done;
1570 	}
1571 	/*
1572 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1573 	 * strsock_proto while the lock was dropped above, the disconnect
1574 	 * is allowed to complete. However, it is not possible to
1575 	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1576 	 */
1577 	so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
1578 	SOTOTPI(so)->sti_laddr_valid = 0;
1579 	SOTOTPI(so)->sti_faddr_valid = 0;
1580 done:
1581 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1582 		so_unlock_single(so, SOLOCKED);
1583 		mutex_exit(&so->so_lock);
1584 	} else {
1585 		/* If the caller held the lock don't release it here */
1586 		ASSERT(MUTEX_HELD(&so->so_lock));
1587 		ASSERT(so->so_flag & SOLOCKED);
1588 	}
1589 	return (error);
1590 }
1591 
1592 /* ARGSUSED */
1593 int
sotpi_accept(struct sonode * so,int fflag,struct cred * cr,struct sonode ** nsop)1594 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1595     struct sonode **nsop)
1596 {
1597 	struct T_conn_ind	*conn_ind;
1598 	struct T_conn_res	*conn_res;
1599 	int			error = 0;
1600 	mblk_t			*mp, *ack_mp;
1601 	struct sonode		*nso;
1602 	vnode_t			*nvp;
1603 	void			*src;
1604 	t_uscalar_t		srclen;
1605 	void			*opt;
1606 	t_uscalar_t		optlen;
1607 	t_scalar_t		PRIM_type;
1608 	t_scalar_t		SEQ_number;
1609 	size_t			sinlen;
1610 	sotpi_info_t		*sti = SOTOTPI(so);
1611 	sotpi_info_t		*nsti;
1612 
1613 	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1614 	    (void *)so, fflag, (void *)nsop,
1615 	    pr_state(so->so_state, so->so_mode)));
1616 
1617 	/*
1618 	 * Defer single-threading the accepting socket until
1619 	 * the T_CONN_IND has been received and parsed and the
1620 	 * new sonode has been opened.
1621 	 */
1622 
1623 	/* Check that we are not already connected */
1624 	if ((so->so_state & SS_ACCEPTCONN) == 0)
1625 		goto conn_bad;
1626 
1627 	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1628 		goto e_bad;
1629 
1630 	ASSERT(mp != NULL);
1631 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1632 
1633 	/*
1634 	 * Save SEQ_number for error paths.
1635 	 */
1636 	SEQ_number = conn_ind->SEQ_number;
1637 
1638 	srclen = conn_ind->SRC_length;
1639 	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1640 	if (src == NULL) {
1641 		error = EPROTO;
1642 		freemsg(mp);
1643 		eprintsoline(so, error);
1644 		goto disconnect_unlocked;
1645 	}
1646 	optlen = conn_ind->OPT_length;
1647 	switch (so->so_family) {
1648 	case AF_INET:
1649 	case AF_INET6:
1650 		if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1651 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
1652 			    &opt, conn_ind->OPT_length);
1653 		} else {
1654 			/*
1655 			 * The transport (in this case TCP) hasn't sent up
1656 			 * a pointer to an instance for the accept fast-path.
1657 			 * Disable fast-path completely because the call to
1658 			 * sotpi_create() below would otherwise create an
1659 			 * incomplete TCP instance, which would lead to
1660 			 * problems when sockfs sends a normal T_CONN_RES
1661 			 * message down the new stream.
1662 			 */
1663 			if (sti->sti_direct) {
1664 				int rval;
1665 				/*
1666 				 * For consistency we inform tcp to disable
1667 				 * direct interface on the listener, though
1668 				 * we can certainly live without doing this
1669 				 * because no data will ever travel upstream
1670 				 * on the listening socket.
1671 				 */
1672 				sti->sti_direct = 0;
1673 				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1674 				    0, 0, K_TO_K, cr, &rval);
1675 			}
1676 			opt = NULL;
1677 			optlen = 0;
1678 		}
1679 		break;
1680 	case AF_UNIX:
1681 	default:
1682 		if (optlen != 0) {
1683 			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1684 			    __TPI_ALIGN_SIZE);
1685 			if (opt == NULL) {
1686 				error = EPROTO;
1687 				freemsg(mp);
1688 				eprintsoline(so, error);
1689 				goto disconnect_unlocked;
1690 			}
1691 		}
1692 		if (so->so_family == AF_UNIX) {
1693 			if (!sti->sti_faddr_noxlate) {
1694 				src = NULL;
1695 				srclen = 0;
1696 			}
1697 			/* Extract src address from options */
1698 			if (optlen != 0)
1699 				so_getopt_srcaddr(opt, optlen, &src, &srclen);
1700 		}
1701 		break;
1702 	}
1703 
1704 	/*
1705 	 * Create the new socket.
1706 	 */
1707 	nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1708 	if (nso == NULL) {
1709 		ASSERT(error != 0);
1710 		/*
1711 		 * Accept can not fail with ENOBUFS. sotpi_create
1712 		 * sleeps waiting for memory until a signal is caught
1713 		 * so return EINTR.
1714 		 */
1715 		freemsg(mp);
1716 		if (error == ENOBUFS)
1717 			error = EINTR;
1718 		goto e_disc_unl;
1719 	}
1720 	nvp = SOTOV(nso);
1721 	nsti = SOTOTPI(nso);
1722 
1723 #ifdef DEBUG
1724 	/*
1725 	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1726 	 * it's inherited early to allow debugging of the accept code itself.
1727 	 */
1728 	nso->so_options |= so->so_options & SO_DEBUG;
1729 #endif /* DEBUG */
1730 
1731 	/*
1732 	 * Save the SRC address from the T_CONN_IND
1733 	 * for getpeername to work on AF_UNIX and on transports that do not
1734 	 * support TI_GETPEERNAME.
1735 	 *
1736 	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1737 	 * copyin_name().
1738 	 */
1739 	if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1740 		error = EINVAL;
1741 		freemsg(mp);
1742 		eprintsoline(so, error);
1743 		goto disconnect_vp_unlocked;
1744 	}
1745 	nsti->sti_faddr_len = (socklen_t)srclen;
1746 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1747 	bcopy(src, nsti->sti_faddr_sa, srclen);
1748 	nsti->sti_faddr_valid = 1;
1749 
1750 	/*
1751 	 * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1752 	 */
1753 	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1754 	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1755 		cred_t	*cr;
1756 		pid_t	cpid;
1757 
1758 		cr = msg_getcred(mp, &cpid);
1759 		if (cr != NULL) {
1760 			crhold(cr);
1761 			nso->so_peercred = cr;
1762 			nso->so_cpid = cpid;
1763 		}
1764 		freemsg(mp);
1765 
1766 		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1767 		    sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1768 		if (mp == NULL) {
1769 			/*
1770 			 * Accept can not fail with ENOBUFS.
1771 			 * A signal was caught so return EINTR.
1772 			 */
1773 			error = EINTR;
1774 			eprintsoline(so, error);
1775 			goto disconnect_vp_unlocked;
1776 		}
1777 		conn_res = (struct T_conn_res *)mp->b_rptr;
1778 	} else {
1779 		/*
1780 		 * For efficency reasons we use msg_extractcred; no crhold
1781 		 * needed since db_credp is cleared (i.e., we move the cred
1782 		 * from the message to so_peercred.
1783 		 */
1784 		nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1785 
1786 		mp->b_rptr = DB_BASE(mp);
1787 		conn_res = (struct T_conn_res *)mp->b_rptr;
1788 		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1789 
1790 		mblk_setcred(mp, cr, curproc->p_pid);
1791 	}
1792 
1793 	/*
1794 	 * New socket must be bound at least in sockfs and, except for AF_INET,
1795 	 * (or AF_INET6) it also has to be bound in the transport provider.
1796 	 * We set the local address in the sonode from the T_OK_ACK of the
1797 	 * T_CONN_RES. For this reason the address we bind to here isn't
1798 	 * important.
1799 	 */
1800 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1801 	    /*CONSTCOND*/
1802 	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1803 		/*
1804 		 * Optimization for AF_INET{,6} transports
1805 		 * that can handle a T_CONN_RES without being bound.
1806 		 */
1807 		mutex_enter(&nso->so_lock);
1808 		so_automatic_bind(nso);
1809 		mutex_exit(&nso->so_lock);
1810 	} else {
1811 		/* Perform NULL bind with the transport provider. */
1812 		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1813 		    cr)) != 0) {
1814 			ASSERT(error != ENOBUFS);
1815 			freemsg(mp);
1816 			eprintsoline(nso, error);
1817 			goto disconnect_vp_unlocked;
1818 		}
1819 	}
1820 
1821 	/*
1822 	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1823 	 * so that any data arriving on the new socket will cause the
1824 	 * appropriate signals to be delivered for the new socket.
1825 	 *
1826 	 * No other thread (except strsock_proto and strsock_misc)
1827 	 * can access the new socket thus we relax the locking.
1828 	 */
1829 	nso->so_pgrp = so->so_pgrp;
1830 	nso->so_state |= so->so_state & SS_ASYNC;
1831 	nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1832 
1833 	if (nso->so_pgrp != 0) {
1834 		if ((error = so_set_events(nso, nvp, cr)) != 0) {
1835 			eprintsoline(nso, error);
1836 			error = 0;
1837 			nso->so_pgrp = 0;
1838 		}
1839 	}
1840 
1841 	/*
1842 	 * Make note of the socket level options. TCP and IP level options
1843 	 * are already inherited. We could do all this after accept is
1844 	 * successful but doing it here simplifies code and no harm done
1845 	 * for error case.
1846 	 */
1847 	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1848 	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1849 	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1850 	nso->so_sndbuf = so->so_sndbuf;
1851 	nso->so_rcvbuf = so->so_rcvbuf;
1852 	if (nso->so_options & SO_LINGER)
1853 		nso->so_linger = so->so_linger;
1854 
1855 	/*
1856 	 * Note that the following sti_direct code path should be
1857 	 * removed once we are confident that the direct sockets
1858 	 * do not result in any degradation.
1859 	 */
1860 	if (sti->sti_direct) {
1861 
1862 		ASSERT(opt != NULL);
1863 
1864 		conn_res->OPT_length = optlen;
1865 		conn_res->OPT_offset = MBLKL(mp);
1866 		bcopy(&opt, mp->b_wptr, optlen);
1867 		mp->b_wptr += optlen;
1868 		conn_res->PRIM_type = T_CONN_RES;
1869 		conn_res->ACCEPTOR_id = 0;
1870 		PRIM_type = T_CONN_RES;
1871 
1872 		/* Send down the T_CONN_RES on acceptor STREAM */
1873 		error = kstrputmsg(SOTOV(nso), mp, NULL,
1874 		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1875 		if (error) {
1876 			mutex_enter(&so->so_lock);
1877 			so_lock_single(so);
1878 			eprintsoline(so, error);
1879 			goto disconnect_vp;
1880 		}
1881 		mutex_enter(&nso->so_lock);
1882 		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1883 		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1884 		if (error) {
1885 			mutex_exit(&nso->so_lock);
1886 			mutex_enter(&so->so_lock);
1887 			so_lock_single(so);
1888 			eprintsoline(so, error);
1889 			goto disconnect_vp;
1890 		}
1891 		if (nso->so_family == AF_INET) {
1892 			sin_t *sin;
1893 
1894 			sin = (sin_t *)(ack_mp->b_rptr +
1895 			    sizeof (struct T_ok_ack));
1896 			bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
1897 			nsti->sti_laddr_len = sizeof (sin_t);
1898 		} else {
1899 			sin6_t *sin6;
1900 
1901 			sin6 = (sin6_t *)(ack_mp->b_rptr +
1902 			    sizeof (struct T_ok_ack));
1903 			bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
1904 			nsti->sti_laddr_len = sizeof (sin6_t);
1905 		}
1906 		freemsg(ack_mp);
1907 
1908 		nso->so_state |= SS_ISCONNECTED;
1909 		nso->so_proto_handle = (sock_lower_handle_t)opt;
1910 		nsti->sti_laddr_valid = 1;
1911 
1912 		mutex_exit(&nso->so_lock);
1913 
1914 		/*
1915 		 * It's possible, through the use of autopush for example,
1916 		 * that the acceptor stream may not support sti_direct
1917 		 * semantics. If the new socket does not support sti_direct
1918 		 * we issue a _SIOCSOCKFALLBACK to inform the transport
1919 		 * as we would in the I_PUSH case.
1920 		 */
1921 		if (nsti->sti_direct == 0) {
1922 			int	rval;
1923 
1924 			if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
1925 			    0, 0, K_TO_K, cr, &rval)) != 0) {
1926 				mutex_enter(&so->so_lock);
1927 				so_lock_single(so);
1928 				eprintsoline(so, error);
1929 				goto disconnect_vp;
1930 			}
1931 		}
1932 
1933 		/*
1934 		 * Pass out new socket.
1935 		 */
1936 		if (nsop != NULL)
1937 			*nsop = nso;
1938 
1939 		return (0);
1940 	}
1941 
1942 	/*
1943 	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
1944 	 * which don't support the FireEngine accept fast-path. It is also
1945 	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
1946 	 * again. Neither sockfs nor TCP attempt to find out if some other
1947 	 * random module has been inserted in between (in which case we
1948 	 * should follow TLI accept behaviour). We blindly assume the worst
1949 	 * case and revert back to old behaviour i.e. TCP will not send us
1950 	 * any option (eager) and the accept should happen on the listener
1951 	 * queue. Any queued T_conn_ind have already got their options removed
1952 	 * by so_sock2_stream() when "sockmod" was I_POP'd.
1953 	 */
1954 	/*
1955 	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
1956 	 */
1957 	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
1958 #ifdef	_ILP32
1959 		queue_t	*q;
1960 
1961 		/*
1962 		 * Find read queue in driver
1963 		 * Can safely do this since we "own" nso/nvp.
1964 		 */
1965 		q = strvp2wq(nvp)->q_next;
1966 		while (SAMESTR(q))
1967 			q = q->q_next;
1968 		q = RD(q);
1969 		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
1970 #else
1971 		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
1972 #endif	/* _ILP32 */
1973 		conn_res->PRIM_type = O_T_CONN_RES;
1974 		PRIM_type = O_T_CONN_RES;
1975 	} else {
1976 		conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
1977 		conn_res->PRIM_type = T_CONN_RES;
1978 		PRIM_type = T_CONN_RES;
1979 	}
1980 	conn_res->SEQ_number = SEQ_number;
1981 	conn_res->OPT_length = 0;
1982 	conn_res->OPT_offset = 0;
1983 
1984 	mutex_enter(&so->so_lock);
1985 	so_lock_single(so);	/* Set SOLOCKED */
1986 	mutex_exit(&so->so_lock);
1987 
1988 	error = kstrputmsg(SOTOV(so), mp, NULL,
1989 	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1990 	mutex_enter(&so->so_lock);
1991 	if (error) {
1992 		eprintsoline(so, error);
1993 		goto disconnect_vp;
1994 	}
1995 	error = sowaitprim(so, PRIM_type, T_OK_ACK,
1996 	    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1997 	if (error) {
1998 		eprintsoline(so, error);
1999 		goto disconnect_vp;
2000 	}
2001 	mutex_exit(&so->so_lock);
2002 	/*
2003 	 * If there is a sin/sin6 appended onto the T_OK_ACK use
2004 	 * that to set the local address. If this is not present
2005 	 * then we zero out the address and don't set the
2006 	 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
2007 	 * the pathname from the listening socket.
2008 	 * In the case where this is TCP or an AF_UNIX socket the
2009 	 * client side may have queued data or a T_ORDREL in the
2010 	 * transport. Having now sent the T_CONN_RES we may receive
2011 	 * those queued messages at any time. Hold the acceptor
2012 	 * so_lock until its state and laddr are finalized.
2013 	 */
2014 	mutex_enter(&nso->so_lock);
2015 	sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
2016 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
2017 	    MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2018 		ack_mp->b_rptr += sizeof (struct T_ok_ack);
2019 		bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2020 		nsti->sti_laddr_len = sinlen;
2021 		nsti->sti_laddr_valid = 1;
2022 	} else if (nso->so_family == AF_UNIX) {
2023 		ASSERT(so->so_family == AF_UNIX);
2024 		nsti->sti_laddr_len = sti->sti_laddr_len;
2025 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2026 		bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2027 		    nsti->sti_laddr_len);
2028 		nsti->sti_laddr_valid = 1;
2029 	} else {
2030 		nsti->sti_laddr_len = sti->sti_laddr_len;
2031 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2032 		bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2033 		nsti->sti_laddr_sa->sa_family = nso->so_family;
2034 	}
2035 	nso->so_state |= SS_ISCONNECTED;
2036 	mutex_exit(&nso->so_lock);
2037 
2038 	freemsg(ack_mp);
2039 
2040 	mutex_enter(&so->so_lock);
2041 	so_unlock_single(so, SOLOCKED);
2042 	mutex_exit(&so->so_lock);
2043 
2044 	/*
2045 	 * Pass out new socket.
2046 	 */
2047 	if (nsop != NULL)
2048 		*nsop = nso;
2049 
2050 	return (0);
2051 
2052 e_disc_unl:
2053 	eprintsoline(so, error);
2054 	goto disconnect_unlocked;
2055 
2056 disconnect_vp_unlocked:
2057 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2058 	VN_RELE(nvp);
2059 disconnect_unlocked:
2060 	(void) sodisconnect(so, SEQ_number, 0);
2061 	return (error);
2062 
2063 disconnect_vp:
2064 	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2065 	so_unlock_single(so, SOLOCKED);
2066 	mutex_exit(&so->so_lock);
2067 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2068 	VN_RELE(nvp);
2069 	return (error);
2070 
2071 conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2072 	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2073 	    ? EOPNOTSUPP : EINVAL;
2074 e_bad:
2075 	eprintsoline(so, error);
2076 	return (error);
2077 }
2078 
2079 /*
 * Connect a socket to the peer address given by "name".
 *
 * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
 * unconnect (by specifying a null address).
 *
 * Parameters:
 *   so      - socket to connect.
 *   name    - peer address.  An address whose family is AF_UNSPEC is
 *             treated like a NULL address.  For a connection-less socket
 *             that is already connected (or connecting) a NULL address
 *             removes the connection; in every other case a NULL or
 *             zero-length address fails with EINVAL.
 *   namelen - length of "name" in bytes.
 *   fflag   - handed to sowaitconnected(); forced to 0 on the datagram path
 *             that sends a T_CONN_REQ.
 *   flags   - _SOCONNECT_* flags.  _SOCONNECT_NOXLATE and _SOCONNECT_XPG4_2
 *             are examined here; _SOCONNECT_DID_BIND is or'ed in locally
 *             when this call had to bind the socket, so that a fatal error
 *             can undo the bind.
 *   cr      - credentials passed to the message allocations, to the
 *             implicit sotpi_bind() and to sotpi_setsockopt().
 *
 * Returns 0 on success, otherwise an errno value.  Values produced directly
 * by this function are EINTR (an allocation returned NULL), EOPNOTSUPP
 * (socket is listening), EISCONN / EALREADY (connection-oriented socket is
 * already connected / connecting) and EINVAL (bad address); other values
 * are passed through from the routines called here.
 *
 * Locking: so_lock and SOLOCKED are acquired here and are both released
 * again before returning.  SOLOCKED is dropped temporarily around the
 * sotpi_setsockopt() calls and while waiting in sowaitconnected().
 */
2085 int
sotpi_connect(struct sonode * so,struct sockaddr * name,socklen_t namelen,int fflag,int flags,struct cred * cr)2086 sotpi_connect(struct sonode *so,
2087     struct sockaddr *name,
2088     socklen_t namelen,
2089     int fflag,
2090     int flags,
2091     struct cred *cr)
2092 {
2093 	struct T_conn_req	conn_req;
2094 	int			error = 0;
2095 	mblk_t			*mp;
2096 	void			*src;
2097 	socklen_t		srclen;
2098 	void			*addr;
2099 	socklen_t		addrlen;
2100 	boolean_t		need_unlock;
2101 	sotpi_info_t		*sti = SOTOTPI(so);
2102 
2103 	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2104 	    (void *)so, (void *)name, namelen, fflag, flags,
2105 	    pr_state(so->so_state, so->so_mode)));
2106 
2107 	/*
2108 	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2109 	 * avoid sleeping for memory with SOLOCKED held.
2110 	 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2111 	 * + sizeof (struct T_opthdr).
2112 	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
2113 	 * exceed sti_faddr_maxlen).
2114 	 */
2115 	mp = soallocproto(sizeof (struct T_conn_req) +
2116 	    2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
2117 	    cr);
2118 	if (mp == NULL) {
2119 		/*
2120 		 * Connect can not fail with ENOBUFS. A signal was
2121 		 * caught so return EINTR.
2122 		 */
2123 		error = EINTR;
2124 		eprintsoline(so, error);
2125 		return (error);
2126 	}
2127 
2128 	mutex_enter(&so->so_lock);
2129 	/*
2130 	 * Make sure there is a preallocated T_unbind_req message
2131 	 * before any binding. This message is allocated when the
2132 	 * socket is created. Since another thread can consume
2133 	 * so_unbind_mp by the time we return from so_lock_single(),
2134 	 * we should check the availability of so_unbind_mp after
2135 	 * we return from so_lock_single().
2136 	 */
2137 
2138 	so_lock_single(so);	/* Set SOLOCKED */
	/*
	 * need_unlock tracks whether this thread currently holds SOLOCKED,
	 * so that the common exit path knows whether to release it.
	 */
2139 	need_unlock = B_TRUE;
2140 
2141 	if (sti->sti_unbind_mp == NULL) {
2142 		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2143 		/* NOTE: holding so_lock while sleeping */
2144 		sti->sti_unbind_mp =
2145 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
2146 		if (sti->sti_unbind_mp == NULL) {
2147 			error = EINTR;
2148 			goto done;
2149 		}
2150 	}
2151 
2152 	/*
2153 	 * Can't have done a listen before connecting.
2154 	 */
2155 	if (so->so_state & SS_ACCEPTCONN) {
2156 		error = EOPNOTSUPP;
2157 		goto done;
2158 	}
2159 
2160 	/*
2161 	 * Must be bound with the transport
2162 	 */
2163 	if (!(so->so_state & SS_ISBOUND)) {
2164 		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2165 		    /*CONSTCOND*/
2166 		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2167 			/*
2168 			 * Optimization for AF_INET{,6} transports
2169 			 * that can handle a T_CONN_REQ without being bound.
2170 			 */
2171 			so_automatic_bind(so);
2172 		} else {
2173 			error = sotpi_bind(so, NULL, 0,
2174 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2175 			if (error)
2176 				goto done;
2177 		}
2178 		ASSERT(so->so_state & SS_ISBOUND);
		/* Remember the implicit bind so a fatal error can undo it. */
2179 		flags |= _SOCONNECT_DID_BIND;
2180 	}
2181 
2182 	/*
2183 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
2184 	 * connect to a null address. This is the portable method to
2185 	 * unconnect a socket.
2186 	 */
2187 	if ((namelen >= sizeof (sa_family_t)) &&
2188 	    (name->sa_family == AF_UNSPEC)) {
2189 		name = NULL;
2190 		namelen = 0;
2191 	}
2192 
2193 	/*
2194 	 * Check that we are not already connected.
2195 	 * A connection-oriented socket cannot be reconnected.
2196 	 * A connected connection-less socket can be
2197 	 * - connected to a different address by a subsequent connect
2198 	 * - "unconnected" by a connect to the NULL address
2199 	 */
2200 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2201 		ASSERT(!(flags & _SOCONNECT_DID_BIND));
2202 		if (so->so_mode & SM_CONNREQUIRED) {
2203 			/* Connection-oriented socket */
2204 			error = so->so_state & SS_ISCONNECTED ?
2205 			    EISCONN : EALREADY;
2206 			goto done;
2207 		}
2208 		/* Connection-less socket */
2209 		if (name == NULL) {
2210 			/*
2211 			 * Remove the connected state and clear SO_DGRAM_ERRIND
2212 			 * since it was set when the socket was connected.
2213 			 * If this is UDP also send down a T_DISCON_REQ.
2214 			 */
2215 			int val;
2216 
2217 			if ((so->so_family == AF_INET ||
2218 			    so->so_family == AF_INET6) &&
2219 			    (so->so_type == SOCK_DGRAM ||
2220 			    so->so_type == SOCK_RAW) &&
2221 			    /*CONSTCOND*/
2222 			    !soconnect_tpi_udp) {
2223 				/* XXX What about implicitly unbinding here? */
2224 				error = sodisconnect(so, -1,
2225 				    _SODISCONNECT_LOCK_HELD);
2226 			} else {
2227 				so->so_state &=
2228 				    ~(SS_ISCONNECTED | SS_ISCONNECTING);
2229 				sti->sti_faddr_valid = 0;
2230 				sti->sti_faddr_len = 0;
2231 			}
2232 
2233 			/* Remove SOLOCKED since setsockopt will grab it */
2234 			so_unlock_single(so, SOLOCKED);
2235 			mutex_exit(&so->so_lock);
2236 
			/* Best effort: the setsockopt result is ignored. */
2237 			val = 0;
2238 			(void) sotpi_setsockopt(so, SOL_SOCKET,
2239 			    SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2240 			    cr);
2241 
2242 			mutex_enter(&so->so_lock);
2243 			so_lock_single(so);	/* Set SOLOCKED */
2244 			goto done;
2245 		}
2246 	}
2247 	ASSERT(so->so_state & SS_ISBOUND);
2248 
	/* From here on a real, non-empty peer address is required. */
2249 	if (name == NULL || namelen == 0) {
2250 		error = EINVAL;
2251 		goto done;
2252 	}
2253 	/*
2254 	 * Mark the socket if sti_faddr_sa represents the transport level
2255 	 * address.
2256 	 */
2257 	if (flags & _SOCONNECT_NOXLATE) {
2258 		struct sockaddr_ux	*soaddr_ux;
2259 
2260 		ASSERT(so->so_family == AF_UNIX);
2261 		if (namelen != sizeof (struct sockaddr_ux)) {
2262 			error = EINVAL;
2263 			goto done;
2264 		}
		/* Continue with just the address embedded in sockaddr_ux. */
2265 		soaddr_ux = (struct sockaddr_ux *)name;
2266 		name = (struct sockaddr *)&soaddr_ux->sou_addr;
2267 		namelen = sizeof (soaddr_ux->sou_addr);
2268 		sti->sti_faddr_noxlate = 1;
2269 	}
2270 
2271 	/*
2272 	 * Length and family checks.
2273 	 */
2274 	error = so_addr_verify(so, name, namelen);
2275 	if (error)
2276 		goto bad;
2277 
2278 	/*
2279 	 * Save foreign address. Needed for AF_UNIX as well as
2280 	 * transport providers that do not support TI_GETPEERNAME.
2281 	 * Also used for cached foreign address for TCP and UDP.
2282 	 */
2283 	if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2284 		error = EINVAL;
2285 		goto done;
2286 	}
2287 	sti->sti_faddr_len = (socklen_t)namelen;
2288 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2289 	bcopy(name, sti->sti_faddr_sa, namelen);
2290 	sti->sti_faddr_valid = 1;
2291 
	/*
	 * Work out the destination address (addr/addrlen) to place in the
	 * T_CONN_REQ and the optional source address (src/srclen) to pass
	 * as an option; only the translated AF_UNIX case supplies a source.
	 */
2292 	if (so->so_family == AF_UNIX) {
2293 		if (sti->sti_faddr_noxlate) {
2294 			/*
2295 			 * sti_faddr is a transport-level address, so
2296 			 * don't pass it as an option.  Do save it in
2297 			 * sti_ux_faddr, used for connected DG send.
2298 			 */
2299 			src = NULL;
2300 			srclen = 0;
2301 			addr = sti->sti_faddr_sa;
2302 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2303 			bcopy(addr, &sti->sti_ux_faddr,
2304 			    sizeof (sti->sti_ux_faddr));
2305 		} else {
2306 			/*
2307 			 * Pass the sockaddr_un source address as an option
2308 			 * and translate the remote address.
2309 			 * Holding so_lock thus sti_laddr_sa can not change.
2310 			 */
2311 			src = sti->sti_laddr_sa;
2312 			srclen = (t_uscalar_t)sti->sti_laddr_len;
2313 			dprintso(so, 1,
2314 			    ("sotpi_connect UNIX: srclen %d, src %p\n",
2315 			    srclen, src));
2316 			/*
2317 			 * Translate the destination address into our
2318 			 * internal form, and save it in sti_ux_faddr.
2319 			 * After this call, addr==&sti->sti_ux_taddr,
2320 			 * and we copy that to sti->sti_ux_faddr so
2321 			 * we save the connected peer address.
2322 			 */
2323 			error = so_ux_addr_xlate(so,
2324 			    sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2325 			    (flags & _SOCONNECT_XPG4_2),
2326 			    &addr, &addrlen);
2327 			if (error)
2328 				goto bad;
2329 			bcopy(&sti->sti_ux_taddr, &sti->sti_ux_faddr,
2330 			    sizeof (sti->sti_ux_faddr));
2331 		}
2332 	} else {
2333 		addr = sti->sti_faddr_sa;
2334 		addrlen = (t_uscalar_t)sti->sti_faddr_len;
2335 		src = NULL;
2336 		srclen = 0;
2337 	}
2338 	/*
2339 	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2340 	 * option which asks the transport provider to send T_UDERR_IND
2341 	 * messages. These T_UDERR_IND messages are used to return connected
2342 	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2343 	 *
2344 	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2345 	 * we send down a T_CONN_REQ. This is needed to let the
2346 	 * transport assign a local address that is consistent with
2347 	 * the remote address. Applications depend on a getsockname()
2348 	 * after a connect() to retrieve the "source" IP address for
2349 	 * the connected socket.  Invalidate the cached local address
2350 	 * to force getsockname() to enquire of the transport.
2351 	 */
2352 	if (!(so->so_mode & SM_CONNREQUIRED)) {
2353 		/*
2354 		 * Datagram socket.
2355 		 */
2356 		int32_t val;
2357 
		/* SOLOCKED is released around the setsockopt call. */
2358 		so_unlock_single(so, SOLOCKED);
2359 		mutex_exit(&so->so_lock);
2360 
2361 		val = 1;
2362 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2363 		    &val, (t_uscalar_t)sizeof (val), cr);
2364 
2365 		mutex_enter(&so->so_lock);
2366 		so_lock_single(so);	/* Set SOLOCKED */
		/*
		 * Everything except AF_INET{,6} SOCK_DGRAM/SOCK_RAW (or any
		 * socket when soconnect_tpi_udp is set) is marked connected
		 * right here, without sending a T_CONN_REQ.
		 */
2367 		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2368 		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2369 		    soconnect_tpi_udp) {
2370 			soisconnected(so);
2371 			goto done;
2372 		}
2373 		/*
2374 		 * Send down T_CONN_REQ etc.
2375 		 * Clear fflag to avoid returning EWOULDBLOCK.
2376 		 */
2377 		fflag = 0;
2378 		ASSERT(so->so_family != AF_UNIX);
2379 		sti->sti_laddr_valid = 0;
2380 	} else if (sti->sti_laddr_len != 0) {
2381 		/*
2382 		 * If the local address or port was "any" then it may be
2383 		 * changed by the transport as a result of the
2384 		 * connect.  Invalidate the cached version if we have one.
2385 		 */
2386 		switch (so->so_family) {
2387 		case AF_INET:
2388 			ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2389 			if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2390 			    INADDR_ANY ||
2391 			    ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2392 				sti->sti_laddr_valid = 0;
2393 			break;
2394 
2395 		case AF_INET6:
2396 			ASSERT(sti->sti_laddr_len ==
2397 			    (socklen_t)sizeof (sin6_t));
2398 			if (IN6_IS_ADDR_UNSPECIFIED(
2399 			    &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2400 			    IN6_IS_ADDR_V4MAPPED_ANY(
2401 			    &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2402 			    ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2403 				sti->sti_laddr_valid = 0;
2404 			break;
2405 
2406 		default:
2407 			break;
2408 		}
2409 	}
2410 
2411 	/*
2412 	 * Check for failure of an earlier call
2413 	 */
2414 	if (so->so_error != 0)
2415 		goto so_bad;
2416 
2417 	/*
2418 	 * Send down T_CONN_REQ. Message was allocated above.
2419 	 */
2420 	conn_req.PRIM_type = T_CONN_REQ;
2421 	conn_req.DEST_length = addrlen;
2422 	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2423 	if (srclen == 0) {
2424 		conn_req.OPT_length = 0;
2425 		conn_req.OPT_offset = 0;
2426 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2427 		soappendmsg(mp, addr, addrlen);
2428 	} else {
2429 		/*
2430 		 * There is a AF_UNIX sockaddr_un to include as a source
2431 		 * address option.
2432 		 */
2433 		struct T_opthdr toh;
2434 
2435 		toh.level = SOL_SOCKET;
2436 		toh.name = SO_SRCADDR;
2437 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2438 		toh.status = 0;
2439 		conn_req.OPT_length =
2440 		    (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2441 		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2442 		    _TPI_ALIGN_TOPT(addrlen));
2443 
		/*
		 * Layout: header, destination address, padding up to
		 * OPT_offset, option header, source address, padding.
		 */
2444 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2445 		soappendmsg(mp, addr, addrlen);
2446 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2447 		soappendmsg(mp, &toh, sizeof (toh));
2448 		soappendmsg(mp, src, srclen);
2449 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2450 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2451 	}
2452 	/*
2453 	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2454 	 * in order to have the right state when the T_CONN_CON shows up.
2455 	 */
2456 	soisconnecting(so);
2457 	mutex_exit(&so->so_lock);
2458 
2459 	if (AU_AUDITING())
2460 		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2461 
2462 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2463 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
	/*
	 * The message has been handed to kstrputmsg(); forget the pointer
	 * so that the freemsg() at "done" does not see it again.
	 */
2464 	mp = NULL;
2465 	mutex_enter(&so->so_lock);
2466 	if (error != 0)
2467 		goto bad;
2468 
2469 	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2470 		goto bad;
2471 
2472 	/* Allow other threads to access the socket */
2473 	so_unlock_single(so, SOLOCKED);
2474 	need_unlock = B_FALSE;
2475 
2476 	/*
2477 	 * Wait until we get a T_CONN_CON or an error
2478 	 */
2479 	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
		/* Retake SOLOCKED for the error handling below. */
2480 		so_lock_single(so);	/* Set SOLOCKED */
2481 		need_unlock = B_TRUE;
2482 	}
2483 
	/*
	 * Common exit path, reached with so_lock held.  mp is either the
	 * unused preallocated T_CONN_REQ or NULL.
	 */
2484 done:
2485 	freemsg(mp);
2486 	switch (error) {
2487 	case EINPROGRESS:
2488 	case EALREADY:
2489 	case EISCONN:
2490 	case EINTR:
2491 		/* Non-fatal errors */
2492 		sti->sti_laddr_valid = 0;
2493 		/* FALLTHRU */
2494 	case 0:
2495 		break;
2496 	default:
2497 		ASSERT(need_unlock);
2498 		/*
2499 		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2500 		 * and invalidate local-address cache
2501 		 */
2502 		so->so_state &= ~SS_ISCONNECTING;
2503 		sti->sti_laddr_valid = 0;
2504 		/* A discon_ind might have already unbound us */
2505 		if ((flags & _SOCONNECT_DID_BIND) &&
2506 		    (so->so_state & SS_ISBOUND)) {
2507 			int err;
2508 
			/* Undo the implicit bind performed by this call. */
2509 			err = sotpi_unbind(so, 0);
2510 			/* LINTED - statement has no conseq */
2511 			if (err) {
2512 				eprintsoline(so, err);
2513 			}
2514 		}
2515 		break;
2516 	}
	/* Drop SOLOCKED only if this thread still holds it. */
2517 	if (need_unlock)
2518 		so_unlock_single(so, SOLOCKED);
2519 	mutex_exit(&so->so_lock);
2520 	return (error);
2521 
	/*
	 * Error entry points: "so_bad" first fetches the error recorded on
	 * the socket via sogeterr(); "bad" logs the error and joins the
	 * common exit path.
	 */
2522 so_bad:	error = sogeterr(so, B_TRUE);
2523 bad:	eprintsoline(so, error);
2524 	goto done;
2525 }
2526 
2527 /* ARGSUSED */
2528 int
sotpi_shutdown(struct sonode * so,int how,struct cred * cr)2529 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2530 {
2531 	struct T_ordrel_req	ordrel_req;
2532 	mblk_t			*mp;
2533 	uint_t			old_state, state_change;
2534 	int			error = 0;
2535 	sotpi_info_t		*sti = SOTOTPI(so);
2536 
2537 	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2538 	    (void *)so, how, pr_state(so->so_state, so->so_mode)));
2539 
2540 	mutex_enter(&so->so_lock);
2541 	so_lock_single(so);	/* Set SOLOCKED */
2542 
2543 	/*
2544 	 * SunOS 4.X has no check for datagram sockets.
2545 	 * 5.X checks that it is connected (ENOTCONN)
2546 	 * X/Open requires that we check the connected state.
2547 	 */
2548 	if (!(so->so_state & SS_ISCONNECTED)) {
2549 		if (!xnet_skip_checks) {
2550 			error = ENOTCONN;
2551 			if (xnet_check_print) {
2552 				printf("sockfs: X/Open shutdown check "
2553 				    "caused ENOTCONN\n");
2554 			}
2555 		}
2556 		goto done;
2557 	}
2558 	/*
2559 	 * Record the current state and then perform any state changes.
2560 	 * Then use the difference between the old and new states to
2561 	 * determine which messages need to be sent.
2562 	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2563 	 * duplicate calls to shutdown().
2564 	 */
2565 	old_state = so->so_state;
2566 
2567 	switch (how) {
2568 	case 0:
2569 		socantrcvmore(so);
2570 		break;
2571 	case 1:
2572 		socantsendmore(so);
2573 		break;
2574 	case 2:
2575 		socantsendmore(so);
2576 		socantrcvmore(so);
2577 		break;
2578 	default:
2579 		error = EINVAL;
2580 		goto done;
2581 	}
2582 
2583 	/*
2584 	 * Assumes that the SS_CANT* flags are never cleared in the above code.
2585 	 */
2586 	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2587 	    (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2588 	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2589 
2590 	switch (state_change) {
2591 	case 0:
2592 		dprintso(so, 1,
2593 		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
2594 		    so->so_state));
2595 		goto done;
2596 
2597 	case SS_CANTRCVMORE:
2598 		mutex_exit(&so->so_lock);
2599 		strseteof(SOTOV(so), 1);
2600 		/*
2601 		 * strseteof takes care of read side wakeups,
2602 		 * pollwakeups, and signals.
2603 		 */
2604 		/*
2605 		 * Get the read lock before flushing data to avoid problems
2606 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2607 		 */
2608 		mutex_enter(&so->so_lock);
2609 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2610 		mutex_exit(&so->so_lock);
2611 
2612 		/* Flush read side queue */
2613 		strflushrq(SOTOV(so), FLUSHALL);
2614 
2615 		mutex_enter(&so->so_lock);
2616 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2617 		break;
2618 
2619 	case SS_CANTSENDMORE:
2620 		mutex_exit(&so->so_lock);
2621 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2622 		mutex_enter(&so->so_lock);
2623 		break;
2624 
2625 	case SS_CANTSENDMORE|SS_CANTRCVMORE:
2626 		mutex_exit(&so->so_lock);
2627 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2628 		strseteof(SOTOV(so), 1);
2629 		/*
2630 		 * strseteof takes care of read side wakeups,
2631 		 * pollwakeups, and signals.
2632 		 */
2633 		/*
2634 		 * Get the read lock before flushing data to avoid problems
2635 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2636 		 */
2637 		mutex_enter(&so->so_lock);
2638 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2639 		mutex_exit(&so->so_lock);
2640 
2641 		/* Flush read side queue */
2642 		strflushrq(SOTOV(so), FLUSHALL);
2643 
2644 		mutex_enter(&so->so_lock);
2645 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2646 		break;
2647 	}
2648 
2649 	ASSERT(MUTEX_HELD(&so->so_lock));
2650 
2651 	/*
2652 	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2653 	 * was set due to this call and the new state has both of them set:
2654 	 *	Send the AF_UNIX close indication
2655 	 *	For T_COTS send a discon_ind
2656 	 *
2657 	 * If cantsend was set due to this call:
2658 	 *	For T_COTSORD send an ordrel_ind
2659 	 *
2660 	 * Note that for T_CLTS there is no message sent here.
2661 	 */
2662 	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2663 	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2664 		/*
2665 		 * For SunOS 4.X compatibility we tell the other end
2666 		 * that we are unable to receive at this point.
2667 		 */
2668 		if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2669 			so_unix_close(so);
2670 
2671 		if (sti->sti_serv_type == T_COTS)
2672 			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2673 	}
2674 	if ((state_change & SS_CANTSENDMORE) &&
2675 	    (sti->sti_serv_type == T_COTS_ORD)) {
2676 		/* Send an orderly release */
2677 		ordrel_req.PRIM_type = T_ORDREL_REQ;
2678 
2679 		mutex_exit(&so->so_lock);
2680 		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2681 		    0, _ALLOC_SLEEP, cr);
2682 		/*
2683 		 * Send down the T_ORDREL_REQ even if there is flow control.
2684 		 * This prevents shutdown from blocking.
2685 		 * Note that there is no T_OK_ACK for ordrel_req.
2686 		 */
2687 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2688 		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2689 		mutex_enter(&so->so_lock);
2690 		if (error) {
2691 			eprintsoline(so, error);
2692 			goto done;
2693 		}
2694 	}
2695 
2696 done:
2697 	so_unlock_single(so, SOLOCKED);
2698 	mutex_exit(&so->so_lock);
2699 	return (error);
2700 }
2701 
2702 /*
2703  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2704  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2705  * that we have closed.
2706  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2707  * T_UNITDATA_REQ containing the same option.
2708  *
2709  * For SOCK_DGRAM half-connections (somebody connected to this end
2710  * but this end is not connect) we don't know where to send any
2711  * SO_UNIX_CLOSE.
2712  *
2713  * We have to ignore stream head errors just in case there has been
2714  * a shutdown(output).
2715  * Ignore any flow control to try to get the message more quickly to the peer.
2716  * While locally ignoring flow control solves the problem when there
2717  * is only the loopback transport on the stream it would not provide
2718  * the correct AF_UNIX socket semantics when one or more modules have
2719  * been pushed.
2720  */
2721 void
so_unix_close(struct sonode * so)2722 so_unix_close(struct sonode *so)
2723 {
2724 	struct T_opthdr	toh;
2725 	mblk_t		*mp;
2726 	sotpi_info_t	*sti = SOTOTPI(so);
2727 
2728 	ASSERT(MUTEX_HELD(&so->so_lock));
2729 
2730 	ASSERT(so->so_family == AF_UNIX);
2731 
2732 	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2733 	    (SS_ISCONNECTED|SS_ISBOUND))
2734 		return;
2735 
2736 	dprintso(so, 1, ("so_unix_close(%p) %s\n",
2737 	    (void *)so, pr_state(so->so_state, so->so_mode)));
2738 
2739 	toh.level = SOL_SOCKET;
2740 	toh.name = SO_UNIX_CLOSE;
2741 
2742 	/* zero length + header */
2743 	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2744 	toh.status = 0;
2745 
2746 	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2747 		struct T_optdata_req tdr;
2748 
2749 		tdr.PRIM_type = T_OPTDATA_REQ;
2750 		tdr.DATA_flag = 0;
2751 
2752 		tdr.OPT_length = (t_scalar_t)sizeof (toh);
2753 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2754 
2755 		/* NOTE: holding so_lock while sleeping */
2756 		mp = soallocproto2(&tdr, sizeof (tdr),
2757 		    &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2758 	} else {
2759 		struct T_unitdata_req	tudr;
2760 		void			*addr;
2761 		socklen_t		addrlen;
2762 		void			*src;
2763 		socklen_t		srclen;
2764 		struct T_opthdr		toh2;
2765 		t_scalar_t		size;
2766 
2767 		/*
2768 		 * We know this is an AF_UNIX connected DGRAM socket.
2769 		 * We therefore already have the destination address
2770 		 * in the internal form needed for this send.  This is
2771 		 * similar to the sosend_dgram call later in this file
2772 		 * when there's no user-specified destination address.
2773 		 */
2774 		if (sti->sti_faddr_noxlate) {
2775 			/*
2776 			 * Already have a transport internal address. Do not
2777 			 * pass any (transport internal) source address.
2778 			 */
2779 			addr = sti->sti_faddr_sa;
2780 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2781 			src = NULL;
2782 			srclen = 0;
2783 		} else {
2784 			/*
2785 			 * Pass the sockaddr_un source address as an option
2786 			 * and translate the remote address.
2787 			 * Holding so_lock thus sti_laddr_sa can not change.
2788 			 */
2789 			src = sti->sti_laddr_sa;
2790 			srclen = (socklen_t)sti->sti_laddr_len;
2791 			dprintso(so, 1,
2792 			    ("so_ux_close: srclen %d, src %p\n",
2793 			    srclen, src));
2794 			/*
2795 			 * Use the destination address saved in connect.
2796 			 */
2797 			addr = &sti->sti_ux_faddr;
2798 			addrlen = sizeof (sti->sti_ux_faddr);
2799 		}
2800 		tudr.PRIM_type = T_UNITDATA_REQ;
2801 		tudr.DEST_length = addrlen;
2802 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2803 		if (srclen == 0) {
2804 			tudr.OPT_length = (t_scalar_t)sizeof (toh);
2805 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2806 			    _TPI_ALIGN_TOPT(addrlen));
2807 
2808 			size = tudr.OPT_offset + tudr.OPT_length;
2809 			/* NOTE: holding so_lock while sleeping */
2810 			mp = soallocproto2(&tudr, sizeof (tudr),
2811 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
2812 			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2813 			soappendmsg(mp, &toh, sizeof (toh));
2814 		} else {
2815 			/*
2816 			 * There is a AF_UNIX sockaddr_un to include as a
2817 			 * source address option.
2818 			 */
2819 			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2820 			    _TPI_ALIGN_TOPT(srclen));
2821 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2822 			    _TPI_ALIGN_TOPT(addrlen));
2823 
2824 			toh2.level = SOL_SOCKET;
2825 			toh2.name = SO_SRCADDR;
2826 			toh2.len = (t_uscalar_t)(srclen +
2827 			    sizeof (struct T_opthdr));
2828 			toh2.status = 0;
2829 
2830 			size = tudr.OPT_offset + tudr.OPT_length;
2831 
2832 			/* NOTE: holding so_lock while sleeping */
2833 			mp = soallocproto2(&tudr, sizeof (tudr),
2834 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
2835 			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2836 			soappendmsg(mp, &toh, sizeof (toh));
2837 			soappendmsg(mp, &toh2, sizeof (toh2));
2838 			soappendmsg(mp, src, srclen);
2839 			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2840 		}
2841 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2842 	}
2843 	mutex_exit(&so->so_lock);
2844 	(void) kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2845 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2846 	mutex_enter(&so->so_lock);
2847 }
2848 
2849 /*
2850  * Called by sotpi_recvmsg when reading a non-zero amount of data.
2851  * In addition, the caller typically verifies that there is some
2852  * potential state to clear by checking
2853  *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2854  * before calling this routine.
2855  * Note that such a check can be made without holding so_lock since
2856  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2857  * decrements sti_oobsigcnt.
2858  *
2859  * When data is read *after* the point that all pending
2860  * oob data has been consumed the oob indication is cleared.
2861  *
2862  * This logic keeps select/poll returning POLLRDBAND and
2863  * SIOCATMARK returning true until we have read past
2864  * the mark.
2865  */
2866 static void
sorecv_update_oobstate(struct sonode * so)2867 sorecv_update_oobstate(struct sonode *so)
2868 {
2869 	sotpi_info_t *sti = SOTOTPI(so);
2870 
2871 	mutex_enter(&so->so_lock);
2872 	ASSERT(so_verify_oobstate(so));
2873 	dprintso(so, 1,
2874 	    ("sorecv_update_oobstate: counts %d/%d state %s\n",
2875 	    sti->sti_oobsigcnt,
2876 	    sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
2877 	if (sti->sti_oobsigcnt == 0) {
2878 		/* No more pending oob indications */
2879 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2880 		freemsg(so->so_oobmsg);
2881 		so->so_oobmsg = NULL;
2882 	}
2883 	ASSERT(so_verify_oobstate(so));
2884 	mutex_exit(&so->so_lock);
2885 }
2886 
2887 /*
2888  * Receive the next message on the queue.
2889  * If msg_controllen is non-zero when called the caller is interested in
2890  * any received control info (options).
2891  * If msg_namelen is non-zero when called the caller is interested in
2892  * any received source address.
2893  * The routine returns with msg_control and msg_name pointing to
2894  * kmem_alloc'ed memory which the caller has to free.
2895  */
2896 /* ARGSUSED */
2897 int
sotpi_recvmsg(struct sonode * so,struct nmsghdr * msg,struct uio * uiop,struct cred * cr)2898 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
2899     struct cred *cr)
2900 {
2901 	union T_primitives	*tpr;
2902 	mblk_t			*mp;
2903 	uchar_t			pri;
2904 	int			pflag, opflag;
2905 	void			*control;
2906 	t_uscalar_t		controllen;
2907 	t_uscalar_t		namelen;
2908 	int			so_state = so->so_state; /* Snapshot */
2909 	ssize_t			saved_resid;
2910 	rval_t			rval;
2911 	int			flags;
2912 	clock_t			timout;
2913 	int			error = 0;
2914 	sotpi_info_t		*sti = SOTOTPI(so);
2915 
2916 	flags = msg->msg_flags;
2917 	msg->msg_flags = 0;
2918 
2919 	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
2920 	    (void *)so, (void *)msg, flags,
2921 	    pr_state(so->so_state, so->so_mode), so->so_error));
2922 
2923 	if (so->so_version == SOV_STREAM) {
2924 		so_update_attrs(so, SOACC);
2925 		/* The imaginary "sockmod" has been popped - act as a stream */
2926 		return (strread(SOTOV(so), uiop, cr));
2927 	}
2928 
2929 	/*
2930 	 * If we are not connected because we have never been connected
2931 	 * we return ENOTCONN. If we have been connected (but are no longer
2932 	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
2933 	 * the EOF.
2934 	 *
2935 	 * An alternative would be to post an ENOTCONN error in stream head
2936 	 * (read+write) and clear it when we're connected. However, that error
2937 	 * would cause incorrect poll/select behavior!
2938 	 */
2939 	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
2940 	    (so->so_mode & SM_CONNREQUIRED)) {
2941 		return (ENOTCONN);
2942 	}
2943 
2944 	/*
2945 	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
2946 	 * after checking that the read queue is empty) and returns zero.
2947 	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
2948 	 * is zero.
2949 	 */
2950 
2951 	if (flags & MSG_OOB) {
2952 		/* Check that the transport supports OOB */
2953 		if (!(so->so_mode & SM_EXDATA))
2954 			return (EOPNOTSUPP);
2955 		so_update_attrs(so, SOACC);
2956 		return (sorecvoob(so, msg, uiop, flags,
2957 		    (so->so_options & SO_OOBINLINE)));
2958 	}
2959 
2960 	so_update_attrs(so, SOACC);
2961 
2962 	/*
2963 	 * Set msg_controllen and msg_namelen to zero here to make it
2964 	 * simpler in the cases that no control or name is returned.
2965 	 */
2966 	controllen = msg->msg_controllen;
2967 	namelen = msg->msg_namelen;
2968 	msg->msg_controllen = 0;
2969 	msg->msg_namelen = 0;
2970 
2971 	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
2972 	    namelen, controllen));
2973 
2974 	mutex_enter(&so->so_lock);
2975 	/*
2976 	 * Only one reader is allowed at any given time. This is needed
2977 	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
2978 	 *
2979 	 * This is slightly different that BSD behavior in that it fails with
2980 	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
2981 	 * is single-threaded using sblock(), which is dropped while waiting
2982 	 * for data to appear. The difference shows up e.g. if one
2983 	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
2984 	 * does use nonblocking io and different threads are reading each
2985 	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
2986 	 * in this case as long as the read queue doesn't get empty.
2987 	 * In this implementation the thread using nonblocking io can
2988 	 * get an EWOULDBLOCK error due to the blocking thread executing
2989 	 * e.g. in the uiomove in kstrgetmsg.
2990 	 * This difference is not believed to be significant.
2991 	 */
2992 	/* Set SOREADLOCKED */
2993 	error = so_lock_read_intr(so,
2994 	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
2995 	mutex_exit(&so->so_lock);
2996 	if (error)
2997 		return (error);
2998 
2999 	/*
3000 	 * Tell kstrgetmsg to not inspect the stream head errors until all
3001 	 * queued data has been consumed.
3002 	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3003 	 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3004 	 *
3005 	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3006 	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3007 	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3008 	 */
3009 	pflag = MSG_ANY | MSG_DELAYERROR;
3010 	if (flags & MSG_PEEK) {
3011 		pflag |= MSG_IPEEK;
3012 		flags &= ~MSG_WAITALL;
3013 	}
3014 	if (so->so_mode & SM_ATOMIC)
3015 		pflag |= MSG_DISCARDTAIL;
3016 
3017 	if (flags & MSG_DONTWAIT)
3018 		timout = 0;
3019 	else if (so->so_rcvtimeo != 0)
3020 		timout = TICK_TO_MSEC(so->so_rcvtimeo);
3021 	else
3022 		timout = -1;
3023 	opflag = pflag;
3024 retry:
3025 	saved_resid = uiop->uio_resid;
3026 	pri = 0;
3027 	mp = NULL;
3028 	error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3029 	    timout, &rval);
3030 	if (error != 0) {
3031 		/* kstrgetmsg returns ETIME when timeout expires */
3032 		if (error == ETIME)
3033 			error = EWOULDBLOCK;
3034 		goto out;
3035 	}
3036 	/*
3037 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3038 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
3039 	 */
3040 	ASSERT(!(rval.r_val1 & MORECTL));
3041 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3042 		msg->msg_flags |= MSG_TRUNC;
3043 
3044 	if (mp == NULL) {
3045 		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3046 		/*
3047 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3048 		 * The draft Posix socket spec states that the mark should
3049 		 * not be cleared when peeking. We follow the latter.
3050 		 */
3051 		if ((so->so_state &
3052 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3053 		    (uiop->uio_resid != saved_resid) &&
3054 		    !(flags & MSG_PEEK)) {
3055 			sorecv_update_oobstate(so);
3056 		}
3057 
3058 		mutex_enter(&so->so_lock);
3059 		/* Set MSG_EOR based on MOREDATA */
3060 		if (!(rval.r_val1 & MOREDATA)) {
3061 			if (so->so_state & SS_SAVEDEOR) {
3062 				msg->msg_flags |= MSG_EOR;
3063 				so->so_state &= ~SS_SAVEDEOR;
3064 			}
3065 		}
3066 		/*
3067 		 * If some data was received (i.e. not EOF) and the
3068 		 * read/recv* has not been satisfied wait for some more.
3069 		 */
3070 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3071 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3072 			mutex_exit(&so->so_lock);
3073 			pflag = opflag | MSG_NOMARK;
3074 			goto retry;
3075 		}
3076 		goto out_locked;
3077 	}
3078 
3079 	/* strsock_proto has already verified length and alignment */
3080 	tpr = (union T_primitives *)mp->b_rptr;
3081 	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3082 
3083 	switch (tpr->type) {
3084 	case T_DATA_IND: {
3085 		if ((so->so_state &
3086 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3087 		    (uiop->uio_resid != saved_resid) &&
3088 		    !(flags & MSG_PEEK)) {
3089 			sorecv_update_oobstate(so);
3090 		}
3091 
3092 		/*
3093 		 * Set msg_flags to MSG_EOR based on
3094 		 * MORE_flag and MOREDATA.
3095 		 */
3096 		mutex_enter(&so->so_lock);
3097 		so->so_state &= ~SS_SAVEDEOR;
3098 		if (!(tpr->data_ind.MORE_flag & 1)) {
3099 			if (!(rval.r_val1 & MOREDATA))
3100 				msg->msg_flags |= MSG_EOR;
3101 			else
3102 				so->so_state |= SS_SAVEDEOR;
3103 		}
3104 		freemsg(mp);
3105 		/*
3106 		 * If some data was received (i.e. not EOF) and the
3107 		 * read/recv* has not been satisfied wait for some more.
3108 		 */
3109 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3110 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3111 			mutex_exit(&so->so_lock);
3112 			pflag = opflag | MSG_NOMARK;
3113 			goto retry;
3114 		}
3115 		goto out_locked;
3116 	}
3117 	case T_UNITDATA_IND: {
3118 		void *addr;
3119 		t_uscalar_t addrlen;
3120 		void *abuf;
3121 		t_uscalar_t optlen;
3122 		void *opt;
3123 
3124 		if ((so->so_state &
3125 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3126 		    (uiop->uio_resid != saved_resid) &&
3127 		    !(flags & MSG_PEEK)) {
3128 			sorecv_update_oobstate(so);
3129 		}
3130 
3131 		if (namelen != 0) {
3132 			/* Caller wants source address */
3133 			addrlen = tpr->unitdata_ind.SRC_length;
3134 			addr = sogetoff(mp,
3135 			    tpr->unitdata_ind.SRC_offset,
3136 			    addrlen, 1);
3137 			if (addr == NULL) {
3138 				freemsg(mp);
3139 				error = EPROTO;
3140 				eprintsoline(so, error);
3141 				goto out;
3142 			}
3143 			if (so->so_family == AF_UNIX) {
3144 				/*
3145 				 * Can not use the transport level address.
3146 				 * If there is a SO_SRCADDR option carrying
3147 				 * the socket level address it will be
3148 				 * extracted below.
3149 				 */
3150 				addr = NULL;
3151 				addrlen = 0;
3152 			}
3153 		}
3154 		optlen = tpr->unitdata_ind.OPT_length;
3155 		if (optlen != 0) {
3156 			t_uscalar_t ncontrollen;
3157 
3158 			/*
3159 			 * Extract any source address option.
3160 			 * Determine how large cmsg buffer is needed.
3161 			 */
3162 			opt = sogetoff(mp,
3163 			    tpr->unitdata_ind.OPT_offset,
3164 			    optlen, __TPI_ALIGN_SIZE);
3165 
3166 			if (opt == NULL) {
3167 				freemsg(mp);
3168 				error = EPROTO;
3169 				eprintsoline(so, error);
3170 				goto out;
3171 			}
3172 			if (so->so_family == AF_UNIX)
3173 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3174 			ncontrollen = so_cmsglen(mp, opt, optlen,
3175 			    !(flags & MSG_XPG4_2));
3176 			if (controllen != 0)
3177 				controllen = ncontrollen;
3178 			else if (ncontrollen != 0)
3179 				msg->msg_flags |= MSG_CTRUNC;
3180 		} else {
3181 			controllen = 0;
3182 		}
3183 
3184 		if (namelen != 0) {
3185 			/*
3186 			 * Return address to caller.
3187 			 * Caller handles truncation if length
3188 			 * exceeds msg_namelen.
3189 			 * NOTE: AF_UNIX NUL termination is ensured by
3190 			 * the sender's copyin_name().
3191 			 */
3192 			abuf = kmem_alloc(addrlen, KM_SLEEP);
3193 
3194 			bcopy(addr, abuf, addrlen);
3195 			msg->msg_name = abuf;
3196 			msg->msg_namelen = addrlen;
3197 		}
3198 
3199 		if (controllen != 0) {
3200 			/*
3201 			 * Return control msg to caller.
3202 			 * Caller handles truncation if length
3203 			 * exceeds msg_controllen.
3204 			 */
3205 			control = kmem_zalloc(controllen, KM_SLEEP);
3206 
3207 			error = so_opt2cmsg(mp, opt, optlen, flags, control,
3208 			    controllen);
3209 			if (error) {
3210 				freemsg(mp);
3211 				if (msg->msg_namelen != 0)
3212 					kmem_free(msg->msg_name,
3213 					    msg->msg_namelen);
3214 				kmem_free(control, controllen);
3215 				eprintsoline(so, error);
3216 				goto out;
3217 			}
3218 			msg->msg_control = control;
3219 			msg->msg_controllen = controllen;
3220 		}
3221 
3222 		freemsg(mp);
3223 		goto out;
3224 	}
3225 	case T_OPTDATA_IND: {
3226 		struct T_optdata_req *tdr;
3227 		void *opt;
3228 		t_uscalar_t optlen;
3229 
3230 		if ((so->so_state &
3231 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3232 		    (uiop->uio_resid != saved_resid) &&
3233 		    !(flags & MSG_PEEK)) {
3234 			sorecv_update_oobstate(so);
3235 		}
3236 
3237 		tdr = (struct T_optdata_req *)mp->b_rptr;
3238 		optlen = tdr->OPT_length;
3239 		if (optlen != 0) {
3240 			t_uscalar_t ncontrollen;
3241 			/*
3242 			 * Determine how large cmsg buffer is needed.
3243 			 */
3244 			opt = sogetoff(mp,
3245 			    tpr->optdata_ind.OPT_offset,
3246 			    optlen, __TPI_ALIGN_SIZE);
3247 
3248 			if (opt == NULL) {
3249 				freemsg(mp);
3250 				error = EPROTO;
3251 				eprintsoline(so, error);
3252 				goto out;
3253 			}
3254 
3255 			ncontrollen = so_cmsglen(mp, opt, optlen,
3256 			    !(flags & MSG_XPG4_2));
3257 			if (controllen != 0)
3258 				controllen = ncontrollen;
3259 			else if (ncontrollen != 0)
3260 				msg->msg_flags |= MSG_CTRUNC;
3261 		} else {
3262 			controllen = 0;
3263 		}
3264 
3265 		if (controllen != 0) {
3266 			/*
3267 			 * Return control msg to caller.
3268 			 * Caller handles truncation if length
3269 			 * exceeds msg_controllen.
3270 			 */
3271 			control = kmem_zalloc(controllen, KM_SLEEP);
3272 
3273 			error = so_opt2cmsg(mp, opt, optlen, flags, control,
3274 			    controllen);
3275 			if (error) {
3276 				freemsg(mp);
3277 				kmem_free(control, controllen);
3278 				eprintsoline(so, error);
3279 				goto out;
3280 			}
3281 			msg->msg_control = control;
3282 			msg->msg_controllen = controllen;
3283 		}
3284 
3285 		/*
3286 		 * Set msg_flags to MSG_EOR based on
3287 		 * DATA_flag and MOREDATA.
3288 		 */
3289 		mutex_enter(&so->so_lock);
3290 		so->so_state &= ~SS_SAVEDEOR;
3291 		if (!(tpr->data_ind.MORE_flag & 1)) {
3292 			if (!(rval.r_val1 & MOREDATA))
3293 				msg->msg_flags |= MSG_EOR;
3294 			else
3295 				so->so_state |= SS_SAVEDEOR;
3296 		}
3297 		freemsg(mp);
3298 		/*
3299 		 * If some data was received (i.e. not EOF) and the
3300 		 * read/recv* has not been satisfied wait for some more.
3301 		 * Not possible to wait if control info was received.
3302 		 */
3303 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3304 		    controllen == 0 &&
3305 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3306 			mutex_exit(&so->so_lock);
3307 			pflag = opflag | MSG_NOMARK;
3308 			goto retry;
3309 		}
3310 		goto out_locked;
3311 	}
3312 	case T_EXDATA_IND: {
3313 		dprintso(so, 1,
3314 		    ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3315 		    "state %s\n",
3316 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3317 		    saved_resid - uiop->uio_resid,
3318 		    pr_state(so->so_state, so->so_mode)));
3319 		/*
3320 		 * kstrgetmsg handles MSGMARK so there is nothing to
3321 		 * inspect in the T_EXDATA_IND.
3322 		 * strsock_proto makes the stream head queue the T_EXDATA_IND
3323 		 * as a separate message with no M_DATA component. Furthermore,
3324 		 * the stream head does not consolidate M_DATA messages onto
3325 		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3326 		 * remains a message by itself. This is needed since MSGMARK
3327 		 * marks both the whole message as well as the last byte
3328 		 * of the message.
3329 		 */
3330 		freemsg(mp);
3331 		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
3332 		if (flags & MSG_PEEK) {
3333 			/*
3334 			 * Even though we are peeking we consume the
3335 			 * T_EXDATA_IND thereby moving the mark information
3336 			 * to SS_RCVATMARK. Then the oob code below will
3337 			 * retry the peeking kstrgetmsg.
3338 			 * Note that the stream head read queue is
3339 			 * never flushed without holding SOREADLOCKED
3340 			 * thus the T_EXDATA_IND can not disappear
3341 			 * underneath us.
3342 			 */
3343 			dprintso(so, 1,
3344 			    ("sotpi_recvmsg: consume EXDATA_IND "
3345 			    "counts %d/%d state %s\n",
3346 			    sti->sti_oobsigcnt,
3347 			    sti->sti_oobcnt,
3348 			    pr_state(so->so_state, so->so_mode)));
3349 
3350 			pflag = MSG_ANY | MSG_DELAYERROR;
3351 			if (so->so_mode & SM_ATOMIC)
3352 				pflag |= MSG_DISCARDTAIL;
3353 
3354 			pri = 0;
3355 			mp = NULL;
3356 
3357 			error = kstrgetmsg(SOTOV(so), &mp, uiop,
3358 			    &pri, &pflag, (clock_t)-1, &rval);
3359 			ASSERT(uiop->uio_resid == saved_resid);
3360 
3361 			if (error) {
3362 #ifdef SOCK_DEBUG
3363 				if (error != EWOULDBLOCK && error != EINTR) {
3364 					eprintsoline(so, error);
3365 				}
3366 #endif /* SOCK_DEBUG */
3367 				goto out;
3368 			}
3369 			ASSERT(mp);
3370 			tpr = (union T_primitives *)mp->b_rptr;
3371 			ASSERT(tpr->type == T_EXDATA_IND);
3372 			freemsg(mp);
3373 		} /* end "if (flags & MSG_PEEK)" */
3374 
3375 		/*
3376 		 * Decrement the number of queued and pending oob.
3377 		 *
3378 		 * SS_RCVATMARK is cleared when we read past a mark.
3379 		 * SS_HAVEOOBDATA is cleared when we've read past the
3380 		 * last mark.
3381 		 * SS_OOBPEND is cleared if we've read past the last
3382 		 * mark and no (new) SIGURG has been posted.
3383 		 */
3384 		mutex_enter(&so->so_lock);
3385 		ASSERT(so_verify_oobstate(so));
3386 		ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3387 		ASSERT(sti->sti_oobsigcnt > 0);
3388 		sti->sti_oobsigcnt--;
3389 		ASSERT(sti->sti_oobcnt > 0);
3390 		sti->sti_oobcnt--;
3391 		/*
3392 		 * Since the T_EXDATA_IND has been removed from the stream
3393 		 * head, but we have not read data past the mark,
3394 		 * sockfs needs to track that the socket is still at the mark.
3395 		 *
3396 		 * Since no data was received call kstrgetmsg again to wait
3397 		 * for data.
3398 		 */
3399 		so->so_state |= SS_RCVATMARK;
3400 		mutex_exit(&so->so_lock);
3401 		dprintso(so, 1,
3402 		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3403 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3404 		    pr_state(so->so_state, so->so_mode)));
3405 		pflag = opflag;
3406 		goto retry;
3407 	}
3408 	default:
3409 		cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3410 		    (void *)so, tpr->type, (void *)mp);
3411 		ASSERT(0);
3412 		freemsg(mp);
3413 		error = EPROTO;
3414 		eprintsoline(so, error);
3415 		goto out;
3416 	}
3417 	/* NOTREACHED */
3418 out:
3419 	mutex_enter(&so->so_lock);
3420 out_locked:
3421 	so_unlock_read(so);	/* Clear SOREADLOCKED */
3422 	mutex_exit(&so->so_lock);
3423 	return (error);
3424 }
3425 
3426 /*
3427  * Sending data with options on a datagram socket.
3428  * Assumes caller has verified that SS_ISBOUND etc. are set.
3429  *
3430  * For AF_UNIX the destination address may be already in
3431  * internal form, as indicated by sti->sti_faddr_noxlate
3432  * or the MSG_SENDTO_NOXLATE flag.  Otherwise we need to
3433  * translate the destination address to internal form.
3434  *
3435  * The source address is passed as an option.  If passing
3436  * file descriptors, those are passed as file pointers in
3437  * another option.
3438  */
3439 static int
sosend_dgramcmsg(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,void * control,t_uscalar_t controllen,int flags)3440 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3441     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3442 {
3443 	struct T_unitdata_req	tudr;
3444 	mblk_t			*mp;
3445 	int			error;
3446 	void			*addr;
3447 	socklen_t		addrlen;
3448 	void			*src;
3449 	socklen_t		srclen;
3450 	ssize_t			len;
3451 	int			size;
3452 	struct T_opthdr		toh;
3453 	struct fdbuf		*fdbuf;
3454 	t_uscalar_t		optlen;
3455 	void			*fds;
3456 	int			fdlen;
3457 	sotpi_info_t		*sti = SOTOTPI(so);
3458 
3459 	ASSERT(name && namelen);
3460 	ASSERT(control && controllen);
3461 
3462 	len = uiop->uio_resid;
3463 	if (len > (ssize_t)sti->sti_tidu_size) {
3464 		return (EMSGSIZE);
3465 	}
3466 
3467 	if (sti->sti_faddr_noxlate == 0 &&
3468 	    (flags & MSG_SENDTO_NOXLATE) == 0) {
3469 		/*
3470 		 * Length and family checks.
3471 		 * Don't verify internal form.
3472 		 */
3473 		error = so_addr_verify(so, name, namelen);
3474 		if (error) {
3475 			eprintsoline(so, error);
3476 			return (error);
3477 		}
3478 	}
3479 
3480 	if (so->so_family == AF_UNIX) {
3481 		if (sti->sti_faddr_noxlate) {
3482 			/*
3483 			 * Already have a transport internal address. Do not
3484 			 * pass any (transport internal) source address.
3485 			 */
3486 			addr = name;
3487 			addrlen = namelen;
3488 			src = NULL;
3489 			srclen = 0;
3490 		} else if (flags & MSG_SENDTO_NOXLATE) {
3491 			/*
3492 			 * Have an internal form dest. address.
3493 			 * Pass the source address as usual.
3494 			 */
3495 			addr = name;
3496 			addrlen = namelen;
3497 			src = sti->sti_laddr_sa;
3498 			srclen = (socklen_t)sti->sti_laddr_len;
3499 		} else {
3500 			/*
3501 			 * Pass the sockaddr_un source address as an option
3502 			 * and translate the remote address.
3503 			 *
3504 			 * Note that this code does not prevent sti_laddr_sa
3505 			 * from changing while it is being used. Thus
3506 			 * if an unbind+bind occurs concurrently with this
3507 			 * send the peer might see a partially new and a
3508 			 * partially old "from" address.
3509 			 */
3510 			src = sti->sti_laddr_sa;
3511 			srclen = (socklen_t)sti->sti_laddr_len;
3512 			dprintso(so, 1,
3513 			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3514 			    srclen, src));
3515 			/*
3516 			 * The sendmsg caller specified a destination
3517 			 * address, which we must translate into our
3518 			 * internal form.  addr = &sti->sti_ux_taddr
3519 			 */
3520 			error = so_ux_addr_xlate(so, name, namelen,
3521 			    (flags & MSG_XPG4_2),
3522 			    &addr, &addrlen);
3523 			if (error) {
3524 				eprintsoline(so, error);
3525 				return (error);
3526 			}
3527 		}
3528 	} else {
3529 		addr = name;
3530 		addrlen = namelen;
3531 		src = NULL;
3532 		srclen = 0;
3533 	}
3534 	optlen = so_optlen(control, controllen,
3535 	    !(flags & MSG_XPG4_2));
3536 	tudr.PRIM_type = T_UNITDATA_REQ;
3537 	tudr.DEST_length = addrlen;
3538 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3539 	if (srclen != 0)
3540 		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3541 		    _TPI_ALIGN_TOPT(srclen));
3542 	else
3543 		tudr.OPT_length = optlen;
3544 	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3545 	    _TPI_ALIGN_TOPT(addrlen));
3546 
3547 	size = tudr.OPT_offset + tudr.OPT_length;
3548 
3549 	/*
3550 	 * File descriptors only when SM_FDPASSING set.
3551 	 */
3552 	error = so_getfdopt(control, controllen,
3553 	    !(flags & MSG_XPG4_2), &fds, &fdlen);
3554 	if (error)
3555 		return (error);
3556 	if (fdlen != -1) {
3557 		if (!(so->so_mode & SM_FDPASSING))
3558 			return (EOPNOTSUPP);
3559 
3560 		error = fdbuf_create(fds, fdlen, &fdbuf);
3561 		if (error)
3562 			return (error);
3563 
3564 		/*
3565 		 * Pre-allocate enough additional space for lower level modules
3566 		 * to append an option (e.g. see tl_unitdata). The following
3567 		 * is enough extra space for the largest option we might append.
3568 		 */
3569 		size += sizeof (struct T_opthdr) + ucredsize;
3570 		mp = fdbuf_allocmsg(size, fdbuf);
3571 	} else {
3572 		mp = soallocproto(size, _ALLOC_INTR, CRED());
3573 		if (mp == NULL) {
3574 			/*
3575 			 * Caught a signal waiting for memory.
3576 			 * Let send* return EINTR.
3577 			 */
3578 			return (EINTR);
3579 		}
3580 	}
3581 	soappendmsg(mp, &tudr, sizeof (tudr));
3582 	soappendmsg(mp, addr, addrlen);
3583 	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3584 
3585 	if (fdlen != -1) {
3586 		ASSERT(fdbuf != NULL);
3587 		toh.level = SOL_SOCKET;
3588 		toh.name = SO_FILEP;
3589 		toh.len = fdbuf->fd_size +
3590 		    (t_uscalar_t)sizeof (struct T_opthdr);
3591 		toh.status = 0;
3592 		soappendmsg(mp, &toh, sizeof (toh));
3593 		soappendmsg(mp, fdbuf, fdbuf->fd_size);
3594 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3595 	}
3596 	if (srclen != 0) {
3597 		/*
3598 		 * There is a AF_UNIX sockaddr_un to include as a source
3599 		 * address option.
3600 		 */
3601 		toh.level = SOL_SOCKET;
3602 		toh.name = SO_SRCADDR;
3603 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3604 		toh.status = 0;
3605 		soappendmsg(mp, &toh, sizeof (toh));
3606 		soappendmsg(mp, src, srclen);
3607 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3608 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3609 	}
3610 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3611 	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3612 	/*
3613 	 * Normally at most 3 bytes left in the message, but we might have
3614 	 * allowed for extra space if we're passing fd's through.
3615 	 */
3616 	ASSERT(MBLKL(mp) <= (ssize_t)size);
3617 
3618 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3619 	if (AU_AUDITING())
3620 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3621 
3622 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3623 #ifdef SOCK_DEBUG
3624 	if (error) {
3625 		eprintsoline(so, error);
3626 	}
3627 #endif /* SOCK_DEBUG */
3628 	return (error);
3629 }
3630 
3631 /*
3632  * Sending data with options on a connected stream socket.
3633  * Assumes caller has verified that SS_ISCONNECTED is set.
3634  */
3635 static int
sosend_svccmsg(struct sonode * so,struct uio * uiop,int more,void * control,t_uscalar_t controllen,int flags)3636 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3637     t_uscalar_t controllen, int flags)
3638 {
3639 	struct T_optdata_req	tdr;
3640 	mblk_t			*mp;
3641 	int			error;
3642 	ssize_t			iosize;
3643 	int			size;
3644 	struct fdbuf		*fdbuf;
3645 	t_uscalar_t		optlen;
3646 	void			*fds;
3647 	int			fdlen;
3648 	struct T_opthdr		toh;
3649 	sotpi_info_t		*sti = SOTOTPI(so);
3650 
3651 	dprintso(so, 1,
3652 	    ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3653 
3654 	/*
3655 	 * Has to be bound and connected. However, since no locks are
3656 	 * held the state could have changed after sotpi_sendmsg checked it
3657 	 * thus it is not possible to ASSERT on the state.
3658 	 */
3659 
3660 	/* Options on connection-oriented only when SM_OPTDATA set. */
3661 	if (!(so->so_mode & SM_OPTDATA))
3662 		return (EOPNOTSUPP);
3663 
3664 	do {
3665 		/*
3666 		 * Set the MORE flag if uio_resid does not fit in this
3667 		 * message or if the caller passed in "more".
3668 		 * Error for transports with zero tidu_size.
3669 		 */
3670 		tdr.PRIM_type = T_OPTDATA_REQ;
3671 		iosize = sti->sti_tidu_size;
3672 		if (iosize <= 0)
3673 			return (EMSGSIZE);
3674 		if (uiop->uio_resid > iosize) {
3675 			tdr.DATA_flag = 1;
3676 		} else {
3677 			if (more)
3678 				tdr.DATA_flag = 1;
3679 			else
3680 				tdr.DATA_flag = 0;
3681 			iosize = uiop->uio_resid;
3682 		}
3683 		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3684 		    tdr.DATA_flag, iosize));
3685 
3686 		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3687 		tdr.OPT_length = optlen;
3688 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3689 
3690 		size = (int)sizeof (tdr) + optlen;
3691 		/*
3692 		 * File descriptors only when SM_FDPASSING set.
3693 		 */
3694 		error = so_getfdopt(control, controllen,
3695 		    !(flags & MSG_XPG4_2), &fds, &fdlen);
3696 		if (error)
3697 			return (error);
3698 		if (fdlen != -1) {
3699 			if (!(so->so_mode & SM_FDPASSING))
3700 				return (EOPNOTSUPP);
3701 
3702 			error = fdbuf_create(fds, fdlen, &fdbuf);
3703 			if (error)
3704 				return (error);
3705 
3706 			/*
3707 			 * Pre-allocate enough additional space for lower level
3708 			 * modules to append an option (e.g. see tl_unitdata).
3709 			 * The following is enough extra space for the largest
3710 			 * option we might append.
3711 			 */
3712 			size += sizeof (struct T_opthdr) + ucredsize;
3713 			mp = fdbuf_allocmsg(size, fdbuf);
3714 		} else {
3715 			mp = soallocproto(size, _ALLOC_INTR, CRED());
3716 			if (mp == NULL) {
3717 				/*
3718 				 * Caught a signal waiting for memory.
3719 				 * Let send* return EINTR.
3720 				 */
3721 				return (EINTR);
3722 			}
3723 		}
3724 		soappendmsg(mp, &tdr, sizeof (tdr));
3725 
3726 		if (fdlen != -1) {
3727 			ASSERT(fdbuf != NULL);
3728 			toh.level = SOL_SOCKET;
3729 			toh.name = SO_FILEP;
3730 			toh.len = fdbuf->fd_size +
3731 			    (t_uscalar_t)sizeof (struct T_opthdr);
3732 			toh.status = 0;
3733 			soappendmsg(mp, &toh, sizeof (toh));
3734 			soappendmsg(mp, fdbuf, fdbuf->fd_size);
3735 			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3736 		}
3737 		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3738 		/*
3739 		 * Normally at most 3 bytes left in the message, but we might
3740 		 * have allowed for extra space if we're passing fd's through.
3741 		 */
3742 		ASSERT(MBLKL(mp) <= (ssize_t)size);
3743 
3744 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3745 
3746 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3747 		    0, MSG_BAND, 0);
3748 		if (error) {
3749 			eprintsoline(so, error);
3750 			return (error);
3751 		}
3752 		control = NULL;
3753 		if (uiop->uio_resid > 0) {
3754 			/*
3755 			 * Recheck for fatal errors. Fail write even though
3756 			 * some data have been written. This is consistent
3757 			 * with strwrite semantics and BSD sockets semantics.
3758 			 */
3759 			if (so->so_state & SS_CANTSENDMORE) {
3760 				eprintsoline(so, error);
3761 				return (EPIPE);
3762 			}
3763 			if (so->so_error != 0) {
3764 				mutex_enter(&so->so_lock);
3765 				error = sogeterr(so, B_TRUE);
3766 				mutex_exit(&so->so_lock);
3767 				if (error != 0) {
3768 					eprintsoline(so, error);
3769 					return (error);
3770 				}
3771 			}
3772 		}
3773 	} while (uiop->uio_resid > 0);
3774 	return (0);
3775 }
3776 
3777 /*
3778  * Sending data on a datagram socket.
3779  * Assumes caller has verified that SS_ISBOUND etc. are set.
3780  *
3781  * For AF_UNIX the destination address may be already in
3782  * internal form, as indicated by sti->sti_faddr_noxlate
3783  * or the MSG_SENDTO_NOXLATE flag.  Otherwise we need to
3784  * translate the destination address to internal form.
3785  *
3786  * The source address is passed as an option.
3787  */
3788 int
sosend_dgram(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,int flags)3789 sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
3790     struct uio *uiop, int flags)
3791 {
3792 	struct T_unitdata_req	tudr;
3793 	mblk_t			*mp;
3794 	int			error;
3795 	void			*addr;
3796 	socklen_t		addrlen;
3797 	void			*src;
3798 	socklen_t		srclen;
3799 	ssize_t			len;
3800 	sotpi_info_t		*sti = SOTOTPI(so);
3801 
3802 	ASSERT(name != NULL && namelen != 0);
3803 
3804 	len = uiop->uio_resid;
3805 	if (len > sti->sti_tidu_size) {
3806 		error = EMSGSIZE;
3807 		goto done;
3808 	}
3809 
3810 	if (sti->sti_faddr_noxlate == 0 &&
3811 	    (flags & MSG_SENDTO_NOXLATE) == 0) {
3812 		/*
3813 		 * Length and family checks.
3814 		 * Don't verify internal form.
3815 		 */
3816 		error = so_addr_verify(so, name, namelen);
3817 		if (error != 0)
3818 			goto done;
3819 	}
3820 
3821 	if (sti->sti_direct)	/* Never on AF_UNIX */
3822 		return (sodgram_direct(so, name, namelen, uiop, flags));
3823 
3824 	if (so->so_family == AF_UNIX) {
3825 		if (sti->sti_faddr_noxlate) {
3826 			/*
3827 			 * Already have a transport internal address. Do not
3828 			 * pass any (transport internal) source address.
3829 			 */
3830 			addr = name;
3831 			addrlen = namelen;
3832 			src = NULL;
3833 			srclen = 0;
3834 		} else if (flags & MSG_SENDTO_NOXLATE) {
3835 			/*
3836 			 * Have an internal form dest. address.
3837 			 * Pass the source address as usual.
3838 			 */
3839 			addr = name;
3840 			addrlen = namelen;
3841 			src = sti->sti_laddr_sa;
3842 			srclen = (socklen_t)sti->sti_laddr_len;
3843 		} else {
3844 			/*
3845 			 * Pass the sockaddr_un source address as an option
3846 			 * and translate the remote address.
3847 			 *
3848 			 * Note that this code does not prevent sti_laddr_sa
3849 			 * from changing while it is being used. Thus
3850 			 * if an unbind+bind occurs concurrently with this
3851 			 * send the peer might see a partially new and a
3852 			 * partially old "from" address.
3853 			 */
3854 			src = sti->sti_laddr_sa;
3855 			srclen = (socklen_t)sti->sti_laddr_len;
3856 			dprintso(so, 1,
3857 			    ("sosend_dgram UNIX: srclen %d, src %p\n",
3858 			    srclen, src));
3859 			/*
3860 			 * The sendmsg caller specified a destination
3861 			 * address, which we must translate into our
3862 			 * internal form.  addr = &sti->sti_ux_taddr
3863 			 */
3864 			error = so_ux_addr_xlate(so, name, namelen,
3865 			    (flags & MSG_XPG4_2),
3866 			    &addr, &addrlen);
3867 			if (error) {
3868 				eprintsoline(so, error);
3869 				goto done;
3870 			}
3871 		}
3872 	} else {
3873 		addr = name;
3874 		addrlen = namelen;
3875 		src = NULL;
3876 		srclen = 0;
3877 	}
3878 	tudr.PRIM_type = T_UNITDATA_REQ;
3879 	tudr.DEST_length = addrlen;
3880 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3881 	if (srclen == 0) {
3882 		tudr.OPT_length = 0;
3883 		tudr.OPT_offset = 0;
3884 
3885 		mp = soallocproto2(&tudr, sizeof (tudr),
3886 		    addr, addrlen, 0, _ALLOC_INTR, CRED());
3887 		if (mp == NULL) {
3888 			/*
3889 			 * Caught a signal waiting for memory.
3890 			 * Let send* return EINTR.
3891 			 */
3892 			error = EINTR;
3893 			goto done;
3894 		}
3895 	} else {
3896 		/*
3897 		 * There is a AF_UNIX sockaddr_un to include as a source
3898 		 * address option.
3899 		 */
3900 		struct T_opthdr toh;
3901 		ssize_t size;
3902 
3903 		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
3904 		    _TPI_ALIGN_TOPT(srclen));
3905 		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3906 		    _TPI_ALIGN_TOPT(addrlen));
3907 
3908 		toh.level = SOL_SOCKET;
3909 		toh.name = SO_SRCADDR;
3910 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3911 		toh.status = 0;
3912 
3913 		size = tudr.OPT_offset + tudr.OPT_length;
3914 		mp = soallocproto2(&tudr, sizeof (tudr),
3915 		    addr, addrlen, size, _ALLOC_INTR, CRED());
3916 		if (mp == NULL) {
3917 			/*
3918 			 * Caught a signal waiting for memory.
3919 			 * Let send* return EINTR.
3920 			 */
3921 			error = EINTR;
3922 			goto done;
3923 		}
3924 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3925 		soappendmsg(mp, &toh, sizeof (toh));
3926 		soappendmsg(mp, src, srclen);
3927 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3928 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3929 	}
3930 
3931 	if (AU_AUDITING())
3932 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3933 
3934 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3935 done:
3936 #ifdef SOCK_DEBUG
3937 	if (error) {
3938 		eprintsoline(so, error);
3939 	}
3940 #endif /* SOCK_DEBUG */
3941 	return (error);
3942 }
3943 
3944 /*
3945  * Sending data on a connected stream socket.
3946  * Assumes caller has verified that SS_ISCONNECTED is set.
3947  */
3948 int
sosend_svc(struct sonode * so,struct uio * uiop,t_scalar_t prim,int more,int sflag)3949 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
3950     int sflag)
3951 {
3952 	struct T_data_req	tdr;
3953 	mblk_t			*mp;
3954 	int			error;
3955 	ssize_t			iosize;
3956 	sotpi_info_t		*sti = SOTOTPI(so);
3957 
3958 	dprintso(so, 1,
3959 	    ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
3960 	    (void *)so, uiop->uio_resid, prim, sflag));
3961 
3962 	/*
3963 	 * Has to be bound and connected. However, since no locks are
3964 	 * held the state could have changed after sotpi_sendmsg checked it
3965 	 * thus it is not possible to ASSERT on the state.
3966 	 */
3967 
3968 	do {
3969 		/*
3970 		 * Set the MORE flag if uio_resid does not fit in this
3971 		 * message or if the caller passed in "more".
3972 		 * Error for transports with zero tidu_size.
3973 		 */
3974 		tdr.PRIM_type = prim;
3975 		iosize = sti->sti_tidu_size;
3976 		if (iosize <= 0)
3977 			return (EMSGSIZE);
3978 		if (uiop->uio_resid > iosize) {
3979 			tdr.MORE_flag = 1;
3980 		} else {
3981 			if (more)
3982 				tdr.MORE_flag = 1;
3983 			else
3984 				tdr.MORE_flag = 0;
3985 			iosize = uiop->uio_resid;
3986 		}
3987 		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
3988 		    prim, tdr.MORE_flag, iosize));
3989 		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
3990 		if (mp == NULL) {
3991 			/*
3992 			 * Caught a signal waiting for memory.
3993 			 * Let send* return EINTR.
3994 			 */
3995 			return (EINTR);
3996 		}
3997 
3998 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3999 		    0, sflag | MSG_BAND, 0);
4000 		if (error) {
4001 			eprintsoline(so, error);
4002 			return (error);
4003 		}
4004 		if (uiop->uio_resid > 0) {
4005 			/*
4006 			 * Recheck for fatal errors. Fail write even though
4007 			 * some data have been written. This is consistent
4008 			 * with strwrite semantics and BSD sockets semantics.
4009 			 */
4010 			if (so->so_state & SS_CANTSENDMORE) {
4011 				eprintsoline(so, error);
4012 				return (EPIPE);
4013 			}
4014 			if (so->so_error != 0) {
4015 				mutex_enter(&so->so_lock);
4016 				error = sogeterr(so, B_TRUE);
4017 				mutex_exit(&so->so_lock);
4018 				if (error != 0) {
4019 					eprintsoline(so, error);
4020 					return (error);
4021 				}
4022 			}
4023 		}
4024 	} while (uiop->uio_resid > 0);
4025 	return (0);
4026 }
4027 
4028 /*
4029  * Check the state for errors and call the appropriate send function.
4030  *
4031  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4032  * this function issues a setsockopt to toggle SO_DONTROUTE before and
4033  * after sending the message.
4034  *
4035  * The caller may optionally specify a destination address, for either
4036  * stream or datagram sockets.  This table summarizes the cases:
4037  *
4038  *    Socket type    Dest. given    Connected    Result
4039  *    -----------    -----------    ---------    --------------
4040  *    Stream         *              Yes	         send to conn. addr.
4041  *    Stream         *              No           error ENOTCONN
4042  *    Dgram          yes            *            send to given addr.
4043  *    Dgram          no             yes          send to conn. addr.
4044  *    Dgram          no             no	         error EDESTADDRREQ
4045  *
4046  * There are subtleties around the destination address when using
4047  * AF_UNIX datagram sockets.  When the sendmsg call specifies the
4048  * destination address, it's in (struct sockaddr_un) form and we
4049  * need to translate it to our internal form (struct so_ux_addr).
4050  *
4051  * When the sendmsg call does not specify a destination address
4052  * we're using the peer address saved during sotpi_connect, and
4053  * that address is already in internal form.  In this case, the
4054  * (internal only) flag MSG_SENDTO_NOXLATE is set in the flags
4055  * passed to sosend_dgram or sosend_dgramcmsg to indicate that
4056  * those functions should skip translation to internal form.
4057  * Avoiding that translation is not only more efficient, but it's
4058  * also necessary when a process does a connect on an AF_UNIX
4059  * datagram socket and then drops privileges.  After the process
4060  * has dropped privileges, it may no longer be able to look up the
4061  * external name in the filesystem, but it should still be
4062  * able to send messages on the connected socket by leaving the
4063  * destination name unspecified.
4064  *
4065  * Yet more subtleties arise with sockets connected by socketpair(),
4066  * which puts internal form addresses in the fields where normally
4067  * the external form is found, and sets sti_faddr_noxlate=1, which
4068  * (like flag MSG_SENDTO_NOXLATE) causes the sosend_dgram functions
4069  * to skip translation of destination addresses to internal form.
4070  * However, beware that the flag sti_faddr_noxlate=1 also triggers
4071  * different behaviour almost everywhere AF_UNIX addresses appear.
4072  */
4073 static int
sotpi_sendmsg(struct sonode * so,struct nmsghdr * msg,struct uio * uiop,struct cred * cr)4074 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
4075     struct cred *cr)
4076 {
4077 	int		so_state;
4078 	int		so_mode;
4079 	int		error;
4080 	struct sockaddr *name;
4081 	t_uscalar_t	namelen;
4082 	int		dontroute;
4083 	int		flags;
4084 	sotpi_info_t	*sti = SOTOTPI(so);
4085 
4086 	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4087 	    (void *)so, (void *)msg, msg->msg_flags,
4088 	    pr_state(so->so_state, so->so_mode), so->so_error));
4089 
4090 	if (so->so_version == SOV_STREAM) {
4091 		/* The imaginary "sockmod" has been popped - act as a stream */
4092 		so_update_attrs(so, SOMOD);
4093 		return (strwrite(SOTOV(so), uiop, cr));
4094 	}
4095 
4096 	mutex_enter(&so->so_lock);
4097 	so_state = so->so_state;
4098 
4099 	if (so_state & SS_CANTSENDMORE) {
4100 		mutex_exit(&so->so_lock);
4101 		return (EPIPE);
4102 	}
4103 
4104 	if (so->so_error != 0) {
4105 		error = sogeterr(so, B_TRUE);
4106 		if (error != 0) {
4107 			mutex_exit(&so->so_lock);
4108 			return (error);
4109 		}
4110 	}
4111 
4112 	name = (struct sockaddr *)msg->msg_name;
4113 	namelen = msg->msg_namelen;
4114 	flags = msg->msg_flags;
4115 
4116 	/*
4117 	 * Historically, this function does not validate the flags
4118 	 * passed in, and any errant bits are ignored.  However,
4119 	 * we would not want any such errant flag bits accidently
4120 	 * being treated as one of the internal-only flags, so
4121 	 * clear the internal-only flag bits.
4122 	 */
4123 	flags &= ~MSG_SENDTO_NOXLATE;
4124 
4125 	so_mode = so->so_mode;
4126 
4127 	if (name == NULL) {
4128 		if (!(so_state & SS_ISCONNECTED)) {
4129 			mutex_exit(&so->so_lock);
4130 			if (so_mode & SM_CONNREQUIRED)
4131 				return (ENOTCONN);
4132 			else
4133 				return (EDESTADDRREQ);
4134 		}
4135 		/*
4136 		 * This is a connected socket.
4137 		 */
4138 		if (so_mode & SM_CONNREQUIRED) {
4139 			/*
4140 			 * This is a connected STREAM socket,
4141 			 * destination not specified.
4142 			 */
4143 			name = NULL;
4144 			namelen = 0;
4145 		} else {
4146 			/*
4147 			 * Datagram send on connected socket with
4148 			 * the destination name not specified.
4149 			 * Use the peer address from connect.
4150 			 */
4151 			if (so->so_family == AF_UNIX) {
4152 				/*
4153 				 * Use the (internal form) address saved
4154 				 * in sotpi_connect.  See above.
4155 				 */
4156 				name = (void *)&sti->sti_ux_faddr;
4157 				namelen = sizeof (sti->sti_ux_faddr);
4158 				flags |= MSG_SENDTO_NOXLATE;
4159 			} else {
4160 				ASSERT(sti->sti_faddr_sa);
4161 				name = sti->sti_faddr_sa;
4162 				namelen = (t_uscalar_t)sti->sti_faddr_len;
4163 			}
4164 		}
4165 	} else {
4166 		/*
4167 		 * Sendmsg specifies a destination name
4168 		 */
4169 		if (!(so_state & SS_ISCONNECTED) &&
4170 		    (so_mode & SM_CONNREQUIRED)) {
4171 			/* i.e. TCP not connected */
4172 			mutex_exit(&so->so_lock);
4173 			return (ENOTCONN);
4174 		}
4175 		/*
4176 		 * Ignore the address on connection-oriented sockets.
4177 		 * Just like BSD this code does not generate an error for
4178 		 * TCP (a CONNREQUIRED socket) when sending to an address
4179 		 * passed in with sendto/sendmsg. Instead the data is
4180 		 * delivered on the connection as if no address had been
4181 		 * supplied.
4182 		 */
4183 		if ((so_state & SS_ISCONNECTED) &&
4184 		    !(so_mode & SM_CONNREQUIRED)) {
4185 			mutex_exit(&so->so_lock);
4186 			return (EISCONN);
4187 		}
4188 		if (!(so_state & SS_ISBOUND)) {
4189 			so_lock_single(so);	/* Set SOLOCKED */
4190 			error = sotpi_bind(so, NULL, 0,
4191 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4192 			so_unlock_single(so, SOLOCKED);
4193 			if (error) {
4194 				mutex_exit(&so->so_lock);
4195 				eprintsoline(so, error);
4196 				return (error);
4197 			}
4198 		}
4199 		/*
4200 		 * Handle delayed datagram errors. These are only queued
4201 		 * when the application sets SO_DGRAM_ERRIND.
4202 		 * Return the error if we are sending to the address
4203 		 * that was returned in the last T_UDERROR_IND.
4204 		 * If sending to some other address discard the delayed
4205 		 * error indication.
4206 		 */
4207 		if (sti->sti_delayed_error) {
4208 			struct T_uderror_ind	*tudi;
4209 			void			*addr;
4210 			t_uscalar_t		addrlen;
4211 			boolean_t		match = B_FALSE;
4212 
4213 			ASSERT(sti->sti_eaddr_mp);
4214 			error = sti->sti_delayed_error;
4215 			sti->sti_delayed_error = 0;
4216 			tudi =
4217 			    (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4218 			addrlen = tudi->DEST_length;
4219 			addr = sogetoff(sti->sti_eaddr_mp,
4220 			    tudi->DEST_offset, addrlen, 1);
4221 			ASSERT(addr);	/* Checked by strsock_proto */
4222 			switch (so->so_family) {
4223 			case AF_INET: {
4224 				/* Compare just IP address and port */
4225 				sin_t *sin1 = (sin_t *)name;
4226 				sin_t *sin2 = (sin_t *)addr;
4227 
4228 				if (addrlen == sizeof (sin_t) &&
4229 				    namelen == addrlen &&
4230 				    sin1->sin_port == sin2->sin_port &&
4231 				    sin1->sin_addr.s_addr ==
4232 				    sin2->sin_addr.s_addr)
4233 					match = B_TRUE;
4234 				break;
4235 			}
4236 			case AF_INET6: {
4237 				/* Compare just IP address and port. Not flow */
4238 				sin6_t *sin1 = (sin6_t *)name;
4239 				sin6_t *sin2 = (sin6_t *)addr;
4240 
4241 				if (addrlen == sizeof (sin6_t) &&
4242 				    namelen == addrlen &&
4243 				    sin1->sin6_port == sin2->sin6_port &&
4244 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4245 				    &sin2->sin6_addr))
4246 					match = B_TRUE;
4247 				break;
4248 			}
4249 			case AF_UNIX:
4250 			default:
4251 				if (namelen == addrlen &&
4252 				    bcmp(name, addr, namelen) == 0)
4253 					match = B_TRUE;
4254 			}
4255 			if (match) {
4256 				freemsg(sti->sti_eaddr_mp);
4257 				sti->sti_eaddr_mp = NULL;
4258 				mutex_exit(&so->so_lock);
4259 #ifdef DEBUG
4260 				dprintso(so, 0,
4261 				    ("sockfs delayed error %d for %s\n",
4262 				    error,
4263 				    pr_addr(so->so_family, name, namelen)));
4264 #endif /* DEBUG */
4265 				return (error);
4266 			}
4267 			freemsg(sti->sti_eaddr_mp);
4268 			sti->sti_eaddr_mp = NULL;
4269 		}
4270 	}
4271 	mutex_exit(&so->so_lock);
4272 
4273 	dontroute = 0;
4274 	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4275 		uint32_t	val;
4276 
4277 		val = 1;
4278 		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4279 		    &val, (t_uscalar_t)sizeof (val), cr);
4280 		if (error)
4281 			return (error);
4282 		dontroute = 1;
4283 	}
4284 
4285 	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4286 		error = EOPNOTSUPP;
4287 		goto done;
4288 	}
4289 	if (msg->msg_controllen != 0) {
4290 		if (!(so_mode & SM_CONNREQUIRED)) {
4291 			so_update_attrs(so, SOMOD);
4292 			error = sosend_dgramcmsg(so, name, namelen, uiop,
4293 			    msg->msg_control, msg->msg_controllen, flags);
4294 		} else {
4295 			if (flags & MSG_OOB) {
4296 				/* Can't generate T_EXDATA_REQ with options */
4297 				error = EOPNOTSUPP;
4298 				goto done;
4299 			}
4300 			so_update_attrs(so, SOMOD);
4301 			error = sosend_svccmsg(so, uiop,
4302 			    !(flags & MSG_EOR),
4303 			    msg->msg_control, msg->msg_controllen,
4304 			    flags);
4305 		}
4306 		goto done;
4307 	}
4308 
4309 	so_update_attrs(so, SOMOD);
4310 	if (!(so_mode & SM_CONNREQUIRED)) {
4311 		/*
4312 		 * If there is no SO_DONTROUTE to turn off return immediately
4313 		 * from send_dgram. This can allow tail-call optimizations.
4314 		 */
4315 		if (!dontroute) {
4316 			return (sosend_dgram(so, name, namelen, uiop, flags));
4317 		}
4318 		error = sosend_dgram(so, name, namelen, uiop, flags);
4319 	} else {
4320 		t_scalar_t prim;
4321 		int sflag;
4322 
4323 		/* Ignore msg_name in the connected state */
4324 		if (flags & MSG_OOB) {
4325 			prim = T_EXDATA_REQ;
4326 			/*
4327 			 * Send down T_EXDATA_REQ even if there is flow
4328 			 * control for data.
4329 			 */
4330 			sflag = MSG_IGNFLOW;
4331 		} else {
4332 			if (so_mode & SM_BYTESTREAM) {
4333 				/* Byte stream transport - use write */
4334 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4335 
4336 				/* Send M_DATA messages */
4337 				/*
4338 				 * If there is no SO_DONTROUTE to turn off,
4339 				 * sti_direct is on, and there is no flow
4340 				 * control, we can take the fast path.
4341 				 */
4342 				if (!dontroute && sti->sti_direct != 0 &&
4343 				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4344 					return (sostream_direct(so, uiop,
4345 					    NULL, cr));
4346 				}
4347 				error = strwrite(SOTOV(so), uiop, cr);
4348 				goto done;
4349 			}
4350 			prim = T_DATA_REQ;
4351 			sflag = 0;
4352 		}
4353 		/*
4354 		 * If there is no SO_DONTROUTE to turn off return immediately
4355 		 * from sosend_svc. This can allow tail-call optimizations.
4356 		 */
4357 		if (!dontroute)
4358 			return (sosend_svc(so, uiop, prim,
4359 			    !(flags & MSG_EOR), sflag));
4360 		error = sosend_svc(so, uiop, prim,
4361 		    !(flags & MSG_EOR), sflag);
4362 	}
4363 	ASSERT(dontroute);
4364 done:
4365 	if (dontroute) {
4366 		uint32_t	val;
4367 
4368 		val = 0;
4369 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4370 		    &val, (t_uscalar_t)sizeof (val), cr);
4371 	}
4372 	return (error);
4373 }
4374 
4375 /*
4376  * kstrwritemp() has very similar semantics as that of strwrite().
4377  * The main difference is it obtains mblks from the caller and also
4378  * does not do any copy as done in strwrite() from user buffers to
4379  * kernel buffers.
4380  *
4381  * Currently, this routine is used by sendfile to send data allocated
4382  * within the kernel without any copying. This interface does not use the
4383  * synchronous stream interface as synch. stream interface implies
4384  * copying.
4385  */
4386 int
kstrwritemp(struct vnode * vp,mblk_t * mp,ushort_t fmode)4387 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4388 {
4389 	struct stdata *stp;
4390 	struct queue *wqp;
4391 	mblk_t *newmp;
4392 	char waitflag;
4393 	int tempmode;
4394 	int error = 0;
4395 	int done = 0;
4396 	struct sonode *so;
4397 	boolean_t direct;
4398 
4399 	ASSERT(vp->v_stream);
4400 	stp = vp->v_stream;
4401 
4402 	so = VTOSO(vp);
4403 	direct = _SOTOTPI(so)->sti_direct;
4404 
4405 	/*
4406 	 * This is the sockfs direct fast path. canputnext() need
4407 	 * not be accurate so we don't grab the sd_lock here. If
4408 	 * we get flow-controlled, we grab sd_lock just before the
4409 	 * do..while loop below to emulate what strwrite() does.
4410 	 */
4411 	wqp = stp->sd_wrq;
4412 	if (canputnext(wqp) && direct &&
4413 	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4414 		return (sostream_direct(so, NULL, mp, CRED()));
4415 	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4416 		/* Fast check of flags before acquiring the lock */
4417 		mutex_enter(&stp->sd_lock);
4418 		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4419 		mutex_exit(&stp->sd_lock);
4420 		if (error != 0) {
4421 			if (!(stp->sd_flag & STPLEX) &&
4422 			    (stp->sd_wput_opt & SW_SIGPIPE)) {
4423 				error = EPIPE;
4424 			}
4425 			return (error);
4426 		}
4427 	}
4428 
4429 	waitflag = WRITEWAIT;
4430 	if (stp->sd_flag & OLDNDELAY)
4431 		tempmode = fmode & ~FNDELAY;
4432 	else
4433 		tempmode = fmode;
4434 
4435 	mutex_enter(&stp->sd_lock);
4436 	do {
4437 		if (canputnext(wqp)) {
4438 			mutex_exit(&stp->sd_lock);
4439 			if (stp->sd_wputdatafunc != NULL) {
4440 				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4441 				    NULL, NULL, NULL);
4442 				if (newmp == NULL) {
4443 					/* The caller will free mp */
4444 					return (ECOMM);
4445 				}
4446 				mp = newmp;
4447 			}
4448 			putnext(wqp, mp);
4449 			return (0);
4450 		}
4451 		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4452 		    &done);
4453 	} while (error == 0 && !done);
4454 
4455 	mutex_exit(&stp->sd_lock);
4456 	/*
4457 	 * EAGAIN tells the application to try again. ENOMEM
4458 	 * is returned only if the memory allocation size
4459 	 * exceeds the physical limits of the system. ENOMEM
4460 	 * can't be true here.
4461 	 */
4462 	if (error == ENOMEM)
4463 		error = EAGAIN;
4464 	return (error);
4465 }
4466 
4467 /* ARGSUSED */
4468 static int
sotpi_sendmblk(struct sonode * so,struct nmsghdr * msg,int fflag,struct cred * cr,mblk_t ** mpp)4469 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4470     struct cred *cr, mblk_t **mpp)
4471 {
4472 	int error;
4473 
4474 	switch (so->so_family) {
4475 	case AF_INET:
4476 	case AF_INET6:
4477 	case AF_UNIX:
4478 		break;
4479 	default:
4480 		return (EAFNOSUPPORT);
4481 
4482 	}
4483 
4484 	if (so->so_state & SS_CANTSENDMORE)
4485 		return (EPIPE);
4486 
4487 	if (so->so_type != SOCK_STREAM)
4488 		return (EOPNOTSUPP);
4489 
4490 	if ((so->so_state & SS_ISCONNECTED) == 0)
4491 		return (ENOTCONN);
4492 
4493 	error = kstrwritemp(so->so_vnode, *mpp, fflag);
4494 	if (error == 0)
4495 		*mpp = NULL;
4496 	return (error);
4497 }
4498 
4499 /*
4500  * Sending data on a datagram socket.
4501  * Assumes caller has verified that SS_ISBOUND etc. are set.
4502  */
4503 /* ARGSUSED */
4504 static int
sodgram_direct(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,int flags)4505 sodgram_direct(struct sonode *so, struct sockaddr *name,
4506     socklen_t namelen, struct uio *uiop, int flags)
4507 {
4508 	struct T_unitdata_req	tudr;
4509 	mblk_t			*mp = NULL;
4510 	int			error = 0;
4511 	void			*addr;
4512 	socklen_t		addrlen;
4513 	ssize_t			len;
4514 	struct stdata		*stp = SOTOV(so)->v_stream;
4515 	int			so_state;
4516 	queue_t			*udp_wq;
4517 	boolean_t		connected;
4518 	mblk_t			*mpdata = NULL;
4519 	sotpi_info_t		*sti = SOTOTPI(so);
4520 	uint32_t		auditing = AU_AUDITING();
4521 
4522 	ASSERT(name != NULL && namelen != 0);
4523 	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4524 	ASSERT(!(so->so_mode & SM_EXDATA));
4525 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4526 	ASSERT(SOTOV(so)->v_type == VSOCK);
4527 
4528 	/* Caller checked for proper length */
4529 	len = uiop->uio_resid;
4530 	ASSERT(len <= sti->sti_tidu_size);
4531 
4532 	/* Length and family checks have been done by caller */
4533 	ASSERT(name->sa_family == so->so_family);
4534 	ASSERT(so->so_family == AF_INET ||
4535 	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4536 	ASSERT(so->so_family == AF_INET6 ||
4537 	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4538 
4539 	addr = name;
4540 	addrlen = namelen;
4541 
4542 	if (stp->sd_sidp != NULL &&
4543 	    (error = straccess(stp, JCWRITE)) != 0)
4544 		goto done;
4545 
4546 	so_state = so->so_state;
4547 
4548 	connected = so_state & SS_ISCONNECTED;
4549 	if (!connected) {
4550 		tudr.PRIM_type = T_UNITDATA_REQ;
4551 		tudr.DEST_length = addrlen;
4552 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4553 		tudr.OPT_length = 0;
4554 		tudr.OPT_offset = 0;
4555 
4556 		mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4557 		    _ALLOC_INTR, CRED());
4558 		if (mp == NULL) {
4559 			/*
4560 			 * Caught a signal waiting for memory.
4561 			 * Let send* return EINTR.
4562 			 */
4563 			error = EINTR;
4564 			goto done;
4565 		}
4566 	}
4567 
4568 	/*
4569 	 * For UDP we don't break up the copyin into smaller pieces
4570 	 * as in the TCP case.  That means if ENOMEM is returned by
4571 	 * mcopyinuio() then the uio vector has not been modified at
4572 	 * all and we fallback to either strwrite() or kstrputmsg()
4573 	 * below.  Note also that we never generate priority messages
4574 	 * from here.
4575 	 */
4576 	udp_wq = stp->sd_wrq->q_next;
4577 	if (canput(udp_wq) &&
4578 	    (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4579 		ASSERT(DB_TYPE(mpdata) == M_DATA);
4580 		ASSERT(uiop->uio_resid == 0);
4581 		if (!connected)
4582 			linkb(mp, mpdata);
4583 		else
4584 			mp = mpdata;
4585 		if (auditing)
4586 			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4587 
4588 		/* Always returns 0... */
4589 		return (udp_wput(udp_wq, mp));
4590 	}
4591 
4592 	ASSERT(mpdata == NULL);
4593 	if (error != 0 && error != ENOMEM) {
4594 		freemsg(mp);
4595 		return (error);
4596 	}
4597 
4598 	/*
4599 	 * For connected, let strwrite() handle the blocking case.
4600 	 * Otherwise we fall thru and use kstrputmsg().
4601 	 */
4602 	if (connected)
4603 		return (strwrite(SOTOV(so), uiop, CRED()));
4604 
4605 	if (auditing)
4606 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4607 
4608 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4609 done:
4610 #ifdef SOCK_DEBUG
4611 	if (error != 0) {
4612 		eprintsoline(so, error);
4613 	}
4614 #endif /* SOCK_DEBUG */
4615 	return (error);
4616 }
4617 
4618 int
sostream_direct(struct sonode * so,struct uio * uiop,mblk_t * mp,cred_t * cr)4619 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4620 {
4621 	struct stdata *stp = SOTOV(so)->v_stream;
4622 	ssize_t iosize, rmax, maxblk;
4623 	queue_t *tcp_wq = stp->sd_wrq->q_next;
4624 	mblk_t *newmp;
4625 	int error = 0, wflag = 0;
4626 
4627 	ASSERT(so->so_mode & SM_BYTESTREAM);
4628 	ASSERT(SOTOV(so)->v_type == VSOCK);
4629 
4630 	if (stp->sd_sidp != NULL &&
4631 	    (error = straccess(stp, JCWRITE)) != 0)
4632 		return (error);
4633 
4634 	if (uiop == NULL) {
4635 		/*
4636 		 * kstrwritemp() should have checked sd_flag and
4637 		 * flow-control before coming here.  If we end up
4638 		 * here it means that we can simply pass down the
4639 		 * data to tcp.
4640 		 */
4641 		ASSERT(mp != NULL);
4642 		if (stp->sd_wputdatafunc != NULL) {
4643 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4644 			    NULL, NULL, NULL);
4645 			if (newmp == NULL) {
4646 				/* The caller will free mp */
4647 				return (ECOMM);
4648 			}
4649 			mp = newmp;
4650 		}
4651 		/* Always returns 0... */
4652 		return (tcp_wput(tcp_wq, mp));
4653 	}
4654 
4655 	/* Fallback to strwrite() to do proper error handling */
4656 	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4657 		return (strwrite(SOTOV(so), uiop, cr));
4658 
4659 	rmax = stp->sd_qn_maxpsz;
4660 	ASSERT(rmax >= 0 || rmax == INFPSZ);
4661 	if (rmax == 0 || uiop->uio_resid <= 0)
4662 		return (0);
4663 
4664 	if (rmax == INFPSZ)
4665 		rmax = uiop->uio_resid;
4666 
4667 	maxblk = stp->sd_maxblk;
4668 
4669 	for (;;) {
4670 		iosize = MIN(uiop->uio_resid, rmax);
4671 
4672 		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4673 		if (mp == NULL) {
4674 			/*
4675 			 * Fallback to strwrite() for ENOMEM; if this
4676 			 * is our first time in this routine and the uio
4677 			 * vector has not been modified, we will end up
4678 			 * calling strwrite() without any flag set.
4679 			 */
4680 			if (error == ENOMEM)
4681 				goto slow_send;
4682 			else
4683 				return (error);
4684 		}
4685 		ASSERT(uiop->uio_resid >= 0);
4686 		/*
4687 		 * If mp is non-NULL and ENOMEM is set, it means that
4688 		 * mcopyinuio() was able to break down some of the user
4689 		 * data into one or more mblks.  Send the partial data
4690 		 * to tcp and let the rest be handled in strwrite().
4691 		 */
4692 		ASSERT(error == 0 || error == ENOMEM);
4693 		if (stp->sd_wputdatafunc != NULL) {
4694 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4695 			    NULL, NULL, NULL);
4696 			if (newmp == NULL) {
4697 				/* The caller will free mp */
4698 				return (ECOMM);
4699 			}
4700 			mp = newmp;
4701 		}
4702 		(void) tcp_wput(tcp_wq, mp);	/* Always returns 0 anyway. */
4703 
4704 		wflag |= NOINTR;
4705 
4706 		if (uiop->uio_resid == 0) {	/* No more data; we're done */
4707 			ASSERT(error == 0);
4708 			break;
4709 		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4710 		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4711 slow_send:
4712 			/*
4713 			 * We were able to send down partial data using
4714 			 * the direct call interface, but are now relying
4715 			 * on strwrite() to handle the non-fastpath cases.
4716 			 * If the socket is blocking we will sleep in
4717 			 * strwaitq() until write is permitted, otherwise,
4718 			 * we will need to return the amount of bytes
4719 			 * written so far back to the app.  This is the
4720 			 * reason why we pass NOINTR flag to strwrite()
4721 			 * for non-blocking socket, because we don't want
4722 			 * to return EAGAIN when portion of the user data
4723 			 * has actually been sent down.
4724 			 */
4725 			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4726 		}
4727 	}
4728 	return (0);
4729 }
4730 
4731 /*
4732  * Update sti_faddr by asking the transport (unless AF_UNIX).
4733  */
4734 /* ARGSUSED */
4735 int
sotpi_getpeername(struct sonode * so,struct sockaddr * name,socklen_t * namelen,boolean_t accept,struct cred * cr)4736 sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4737     boolean_t accept, struct cred *cr)
4738 {
4739 	struct strbuf	strbuf;
4740 	int		error = 0, res;
4741 	void		*addr;
4742 	t_uscalar_t	addrlen;
4743 	k_sigset_t	smask;
4744 	sotpi_info_t	*sti = SOTOTPI(so);
4745 
4746 	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4747 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4748 
4749 	ASSERT(*namelen > 0);
4750 	mutex_enter(&so->so_lock);
4751 	so_lock_single(so);	/* Set SOLOCKED */
4752 
4753 	if (accept) {
4754 		bcopy(sti->sti_faddr_sa, name,
4755 		    MIN(*namelen, sti->sti_faddr_len));
4756 		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4757 		goto done;
4758 	}
4759 
4760 	if (!(so->so_state & SS_ISCONNECTED)) {
4761 		error = ENOTCONN;
4762 		goto done;
4763 	}
4764 	/* Added this check for X/Open */
4765 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4766 		error = EINVAL;
4767 		if (xnet_check_print) {
4768 			printf("sockfs: X/Open getpeername check => EINVAL\n");
4769 		}
4770 		goto done;
4771 	}
4772 
4773 	if (sti->sti_faddr_valid) {
4774 		bcopy(sti->sti_faddr_sa, name,
4775 		    MIN(*namelen, sti->sti_faddr_len));
4776 		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4777 		goto done;
4778 	}
4779 
4780 #ifdef DEBUG
4781 	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4782 	    pr_addr(so->so_family, sti->sti_faddr_sa,
4783 	    (t_uscalar_t)sti->sti_faddr_len)));
4784 #endif /* DEBUG */
4785 
4786 	if (so->so_family == AF_UNIX) {
4787 		/* Transport has different name space - return local info */
4788 		if (sti->sti_faddr_noxlate)
4789 			*namelen = 0;
4790 		error = 0;
4791 		goto done;
4792 	}
4793 
4794 	ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
4795 
4796 	ASSERT(sti->sti_faddr_sa);
4797 	/* Allocate local buffer to use with ioctl */
4798 	addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
4799 	mutex_exit(&so->so_lock);
4800 	addr = kmem_alloc(addrlen, KM_SLEEP);
4801 
4802 	/*
4803 	 * Issue TI_GETPEERNAME with signals masked.
4804 	 * Put the result in sti_faddr_sa so that getpeername works after
4805 	 * a shutdown(output).
4806 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4807 	 * back to the socket.
4808 	 */
4809 	strbuf.buf = addr;
4810 	strbuf.maxlen = addrlen;
4811 	strbuf.len = 0;
4812 
4813 	sigintr(&smask, 0);
4814 	res = 0;
4815 	ASSERT(cr);
4816 	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4817 	    0, K_TO_K, cr, &res);
4818 	sigunintr(&smask);
4819 
4820 	mutex_enter(&so->so_lock);
4821 	/*
4822 	 * If there is an error record the error in so_error put don't fail
4823 	 * the getpeername. Instead fallback on the recorded
4824 	 * sti->sti_faddr_sa.
4825 	 */
4826 	if (error) {
4827 		/*
4828 		 * Various stream head errors can be returned to the ioctl.
4829 		 * However, it is impossible to determine which ones of
4830 		 * these are really socket level errors that were incorrectly
4831 		 * consumed by the ioctl. Thus this code silently ignores the
4832 		 * error - to code explicitly does not reinstate the error
4833 		 * using soseterror().
4834 		 * Experiments have shows that at least this set of
4835 		 * errors are reported and should not be reinstated on the
4836 		 * socket:
4837 		 *	EINVAL	E.g. if an I_LINK was in effect when
4838 		 *		getpeername was called.
4839 		 *	EPIPE	The ioctl error semantics prefer the write
4840 		 *		side error over the read side error.
4841 		 *	ENOTCONN The transport just got disconnected but
4842 		 *		sockfs had not yet seen the T_DISCON_IND
4843 		 *		when issuing the ioctl.
4844 		 */
4845 		error = 0;
4846 	} else if (res == 0 && strbuf.len > 0 &&
4847 	    (so->so_state & SS_ISCONNECTED)) {
4848 		ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
4849 		sti->sti_faddr_len = (socklen_t)strbuf.len;
4850 		bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
4851 		sti->sti_faddr_valid = 1;
4852 
4853 		bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
4854 		*namelen = sti->sti_faddr_len;
4855 	}
4856 	kmem_free(addr, addrlen);
4857 #ifdef DEBUG
4858 	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
4859 	    pr_addr(so->so_family, sti->sti_faddr_sa,
4860 	    (t_uscalar_t)sti->sti_faddr_len)));
4861 #endif /* DEBUG */
4862 done:
4863 	so_unlock_single(so, SOLOCKED);
4864 	mutex_exit(&so->so_lock);
4865 	return (error);
4866 }
4867 
4868 /*
4869  * Update sti_laddr by asking the transport (unless AF_UNIX).
4870  */
4871 int
sotpi_getsockname(struct sonode * so,struct sockaddr * name,socklen_t * namelen,struct cred * cr)4872 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4873     struct cred *cr)
4874 {
4875 	struct strbuf	strbuf;
4876 	int		error = 0, res;
4877 	void		*addr;
4878 	t_uscalar_t	addrlen;
4879 	k_sigset_t	smask;
4880 	sotpi_info_t	*sti = SOTOTPI(so);
4881 
4882 	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
4883 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4884 
4885 	ASSERT(*namelen > 0);
4886 	mutex_enter(&so->so_lock);
4887 	so_lock_single(so);	/* Set SOLOCKED */
4888 
4889 #ifdef DEBUG
4890 
4891 	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
4892 	    pr_addr(so->so_family, sti->sti_laddr_sa,
4893 	    (t_uscalar_t)sti->sti_laddr_len)));
4894 #endif /* DEBUG */
4895 	if (sti->sti_laddr_valid) {
4896 		bcopy(sti->sti_laddr_sa, name,
4897 		    MIN(*namelen, sti->sti_laddr_len));
4898 		*namelen = sti->sti_laddr_len;
4899 		goto done;
4900 	}
4901 
4902 	if (so->so_family == AF_UNIX) {
4903 		/*
4904 		 * Transport has different name space - return local info. If we
4905 		 * have enough space, let consumers know the family.
4906 		 */
4907 		if (*namelen >= sizeof (sa_family_t)) {
4908 			name->sa_family = AF_UNIX;
4909 			*namelen = sizeof (sa_family_t);
4910 		} else {
4911 			*namelen = 0;
4912 		}
4913 		error = 0;
4914 		goto done;
4915 	}
4916 	if (!(so->so_state & SS_ISBOUND)) {
4917 		/* If not bound, then nothing to return. */
4918 		error = 0;
4919 		goto done;
4920 	}
4921 
4922 	/* Allocate local buffer to use with ioctl */
4923 	addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
4924 	mutex_exit(&so->so_lock);
4925 	addr = kmem_alloc(addrlen, KM_SLEEP);
4926 
4927 	/*
4928 	 * Issue TI_GETMYNAME with signals masked.
4929 	 * Put the result in sti_laddr_sa so that getsockname works after
4930 	 * a shutdown(output).
4931 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4932 	 * back to the socket.
4933 	 */
4934 	strbuf.buf = addr;
4935 	strbuf.maxlen = addrlen;
4936 	strbuf.len = 0;
4937 
4938 	sigintr(&smask, 0);
4939 	res = 0;
4940 	ASSERT(cr);
4941 	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
4942 	    0, K_TO_K, cr, &res);
4943 	sigunintr(&smask);
4944 
4945 	mutex_enter(&so->so_lock);
4946 	/*
4947 	 * If there is an error record the error in so_error put don't fail
4948 	 * the getsockname. Instead fallback on the recorded
4949 	 * sti->sti_laddr_sa.
4950 	 */
4951 	if (error) {
4952 		/*
4953 		 * Various stream head errors can be returned to the ioctl.
4954 		 * However, it is impossible to determine which ones of
4955 		 * these are really socket level errors that were incorrectly
4956 		 * consumed by the ioctl. Thus this code silently ignores the
4957 		 * error - to code explicitly does not reinstate the error
4958 		 * using soseterror().
4959 		 * Experiments have shows that at least this set of
4960 		 * errors are reported and should not be reinstated on the
4961 		 * socket:
4962 		 *	EINVAL	E.g. if an I_LINK was in effect when
4963 		 *		getsockname was called.
4964 		 *	EPIPE	The ioctl error semantics prefer the write
4965 		 *		side error over the read side error.
4966 		 */
4967 		error = 0;
4968 	} else if (res == 0 && strbuf.len > 0 &&
4969 	    (so->so_state & SS_ISBOUND)) {
4970 		ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
4971 		sti->sti_laddr_len = (socklen_t)strbuf.len;
4972 		bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
4973 		sti->sti_laddr_valid = 1;
4974 
4975 		bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
4976 		*namelen = sti->sti_laddr_len;
4977 	}
4978 	kmem_free(addr, addrlen);
4979 #ifdef DEBUG
4980 	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
4981 	    pr_addr(so->so_family, sti->sti_laddr_sa,
4982 	    (t_uscalar_t)sti->sti_laddr_len)));
4983 #endif /* DEBUG */
4984 done:
4985 	so_unlock_single(so, SOLOCKED);
4986 	mutex_exit(&so->so_lock);
4987 	return (error);
4988 }
4989 
4990 /*
4991  * Get socket options. For SOL_SOCKET options some options are handled
4992  * by the sockfs while others use the value recorded in the sonode as a
4993  * fallback should the T_SVR4_OPTMGMT_REQ fail.
4994  *
4995  * On the return most *optlenp bytes are copied to optval.
4996  */
4997 /* ARGSUSED */
4998 int
sotpi_getsockopt(struct sonode * so,int level,int option_name,void * optval,socklen_t * optlenp,int flags,struct cred * cr)4999 sotpi_getsockopt(struct sonode *so, int level, int option_name,
5000     void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5001 {
5002 	struct T_optmgmt_req	optmgmt_req;
5003 	struct T_optmgmt_ack	*optmgmt_ack;
5004 	struct opthdr		oh;
5005 	struct opthdr		*opt_res;
5006 	mblk_t			*mp = NULL;
5007 	int			error = 0;
5008 	void			*option = NULL;	/* Set if fallback value */
5009 	t_uscalar_t		maxlen = *optlenp;
5010 	t_uscalar_t		len;
5011 	uint32_t		value;
5012 	struct timeval		tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5013 	struct timeval32	tmo_val32;
5014 	struct so_snd_bufinfo	snd_bufinfo;	/* used for zero copy */
5015 
5016 	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5017 	    (void *)so, level, option_name, optval, (void *)optlenp,
5018 	    pr_state(so->so_state, so->so_mode)));
5019 
5020 	mutex_enter(&so->so_lock);
5021 	so_lock_single(so);	/* Set SOLOCKED */
5022 
5023 	len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
5024 
5025 	/*
5026 	 * Check for SOL_SOCKET options.
5027 	 * Certain SOL_SOCKET options are returned directly whereas
5028 	 * others only provide a default (fallback) value should
5029 	 * the T_SVR4_OPTMGMT_REQ fail.
5030 	 */
5031 	if (level == SOL_SOCKET) {
5032 		/* Check parameters */
5033 		switch (option_name) {
5034 		case SO_TYPE:
5035 		case SO_ERROR:
5036 		case SO_DEBUG:
5037 		case SO_ACCEPTCONN:
5038 		case SO_REUSEADDR:
5039 		case SO_KEEPALIVE:
5040 		case SO_DONTROUTE:
5041 		case SO_BROADCAST:
5042 		case SO_USELOOPBACK:
5043 		case SO_OOBINLINE:
5044 		case SO_SNDBUF:
5045 		case SO_RCVBUF:
5046 #ifdef notyet
5047 		case SO_SNDLOWAT:
5048 		case SO_RCVLOWAT:
5049 #endif /* notyet */
5050 		case SO_DOMAIN:
5051 		case SO_DGRAM_ERRIND:
5052 		case SO_PROTOCOL:
5053 			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5054 				error = EINVAL;
5055 				eprintsoline(so, error);
5056 				goto done2;
5057 			}
5058 			break;
5059 		case SO_RCVTIMEO:
5060 		case SO_SNDTIMEO:
5061 			if (get_udatamodel() == DATAMODEL_NONE ||
5062 			    get_udatamodel() == DATAMODEL_NATIVE) {
5063 				if (maxlen < sizeof (struct timeval)) {
5064 					error = EINVAL;
5065 					eprintsoline(so, error);
5066 					goto done2;
5067 				}
5068 			} else {
5069 				if (maxlen < sizeof (struct timeval32)) {
5070 					error = EINVAL;
5071 					eprintsoline(so, error);
5072 					goto done2;
5073 				}
5074 
5075 			}
5076 			break;
5077 		case SO_LINGER:
5078 			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5079 				error = EINVAL;
5080 				eprintsoline(so, error);
5081 				goto done2;
5082 			}
5083 			break;
5084 		case SO_SND_BUFINFO:
5085 			if (maxlen < (t_uscalar_t)
5086 			    sizeof (struct so_snd_bufinfo)) {
5087 				error = EINVAL;
5088 				eprintsoline(so, error);
5089 				goto done2;
5090 			}
5091 			break;
5092 		}
5093 
5094 		switch (option_name) {
5095 		case SO_TYPE:
5096 			value = so->so_type;
5097 			option = &value;
5098 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5099 
5100 		case SO_ERROR:
5101 			value = sogeterr(so, B_TRUE);
5102 			option = &value;
5103 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5104 
5105 		case SO_ACCEPTCONN:
5106 			if (so->so_state & SS_ACCEPTCONN)
5107 				value = SO_ACCEPTCONN;
5108 			else
5109 				value = 0;
5110 #ifdef DEBUG
5111 			if (value) {
5112 				dprintso(so, 1,
5113 				    ("sotpi_getsockopt: 0x%x is set\n",
5114 				    option_name));
5115 			} else {
5116 				dprintso(so, 1,
5117 				    ("sotpi_getsockopt: 0x%x not set\n",
5118 				    option_name));
5119 			}
5120 #endif /* DEBUG */
5121 			option = &value;
5122 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5123 
5124 		case SO_DEBUG:
5125 		case SO_REUSEADDR:
5126 		case SO_KEEPALIVE:
5127 		case SO_DONTROUTE:
5128 		case SO_BROADCAST:
5129 		case SO_USELOOPBACK:
5130 		case SO_OOBINLINE:
5131 		case SO_DGRAM_ERRIND:
5132 			value = (so->so_options & option_name);
5133 #ifdef DEBUG
5134 			if (value) {
5135 				dprintso(so, 1,
5136 				    ("sotpi_getsockopt: 0x%x is set\n",
5137 				    option_name));
5138 			} else {
5139 				dprintso(so, 1,
5140 				    ("sotpi_getsockopt: 0x%x not set\n",
5141 				    option_name));
5142 			}
5143 #endif /* DEBUG */
5144 			option = &value;
5145 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5146 
5147 		/*
5148 		 * The following options are only returned by sockfs when the
5149 		 * T_SVR4_OPTMGMT_REQ fails.
5150 		 */
5151 		case SO_LINGER:
5152 			option = &so->so_linger;
5153 			len = (t_uscalar_t)sizeof (struct linger);
5154 			break;
5155 		case SO_SNDBUF: {
5156 			ssize_t lvalue;
5157 
5158 			/*
5159 			 * If the option has not been set then get a default
5160 			 * value from the read queue. This value is
5161 			 * returned if the transport fails
5162 			 * the T_SVR4_OPTMGMT_REQ.
5163 			 */
5164 			lvalue = so->so_sndbuf;
5165 			if (lvalue == 0) {
5166 				mutex_exit(&so->so_lock);
5167 				(void) strqget(strvp2wq(SOTOV(so))->q_next,
5168 				    QHIWAT, 0, &lvalue);
5169 				mutex_enter(&so->so_lock);
5170 				dprintso(so, 1,
5171 				    ("got SO_SNDBUF %ld from q\n", lvalue));
5172 			}
5173 			value = (int)lvalue;
5174 			option = &value;
5175 			len = (t_uscalar_t)sizeof (so->so_sndbuf);
5176 			break;
5177 		}
5178 		case SO_RCVBUF: {
5179 			ssize_t lvalue;
5180 
5181 			/*
5182 			 * If the option has not been set then get a default
5183 			 * value from the read queue. This value is
5184 			 * returned if the transport fails
5185 			 * the T_SVR4_OPTMGMT_REQ.
5186 			 *
5187 			 * XXX If SO_RCVBUF has been set and this is an
5188 			 * XPG 4.2 application then do not ask the transport
5189 			 * since the transport might adjust the value and not
5190 			 * return exactly what was set by the application.
5191 			 * For non-XPG 4.2 application we return the value
5192 			 * that the transport is actually using.
5193 			 */
5194 			lvalue = so->so_rcvbuf;
5195 			if (lvalue == 0) {
5196 				mutex_exit(&so->so_lock);
5197 				(void) strqget(RD(strvp2wq(SOTOV(so))),
5198 				    QHIWAT, 0, &lvalue);
5199 				mutex_enter(&so->so_lock);
5200 				dprintso(so, 1,
5201 				    ("got SO_RCVBUF %ld from q\n", lvalue));
5202 			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
5203 				value = (int)lvalue;
5204 				option = &value;
5205 				goto copyout;	/* skip asking transport */
5206 			}
5207 			value = (int)lvalue;
5208 			option = &value;
5209 			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5210 			break;
5211 		}
5212 		case SO_DOMAIN:
5213 			value = so->so_family;
5214 			option = &value;
5215 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5216 
5217 		case SO_PROTOCOL:
5218 			value = so->so_protocol;
5219 			option = &value;
5220 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5221 
5222 #ifdef notyet
5223 		/*
5224 		 * We do not implement the semantics of these options
5225 		 * thus we shouldn't implement the options either.
5226 		 */
5227 		case SO_SNDLOWAT:
5228 			value = so->so_sndlowat;
5229 			option = &value;
5230 			break;
5231 		case SO_RCVLOWAT:
5232 			value = so->so_rcvlowat;
5233 			option = &value;
5234 			break;
5235 #endif /* notyet */
5236 		case SO_SNDTIMEO:
5237 		case SO_RCVTIMEO: {
5238 			clock_t val;
5239 
5240 			if (option_name == SO_RCVTIMEO)
5241 				val = drv_hztousec(so->so_rcvtimeo);
5242 			else
5243 				val = drv_hztousec(so->so_sndtimeo);
5244 			tmo_val.tv_sec = val / (1000 * 1000);
5245 			tmo_val.tv_usec = val % (1000 * 1000);
5246 			if (get_udatamodel() == DATAMODEL_NONE ||
5247 			    get_udatamodel() == DATAMODEL_NATIVE) {
5248 				option = &tmo_val;
5249 				len = sizeof (struct timeval);
5250 			} else {
5251 				TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5252 				option = &tmo_val32;
5253 				len = sizeof (struct timeval32);
5254 			}
5255 			break;
5256 		}
5257 		case SO_SND_BUFINFO: {
5258 			snd_bufinfo.sbi_wroff =
5259 			    (so->so_proto_props).sopp_wroff;
5260 			snd_bufinfo.sbi_maxblk =
5261 			    (so->so_proto_props).sopp_maxblk;
5262 			snd_bufinfo.sbi_maxpsz =
5263 			    (so->so_proto_props).sopp_maxpsz;
5264 			snd_bufinfo.sbi_tail =
5265 			    (so->so_proto_props).sopp_tail;
5266 			option = &snd_bufinfo;
5267 			len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5268 			break;
5269 		}
5270 		}
5271 	}
5272 
5273 	mutex_exit(&so->so_lock);
5274 
5275 	/* Send request */
5276 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5277 	optmgmt_req.MGMT_flags = T_CHECK;
5278 	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5279 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5280 
5281 	oh.level = level;
5282 	oh.name = option_name;
5283 	oh.len = maxlen;
5284 
5285 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5286 	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5287 	/* Let option management work in the presence of data flow control */
5288 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5289 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5290 	mp = NULL;
5291 	mutex_enter(&so->so_lock);
5292 	if (error) {
5293 		eprintsoline(so, error);
5294 		goto done2;
5295 	}
5296 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5297 	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5298 	if (error) {
5299 		if (option != NULL) {
5300 			/* We have a fallback value */
5301 			error = 0;
5302 			goto copyout;
5303 		}
5304 		eprintsoline(so, error);
5305 		goto done2;
5306 	}
5307 	ASSERT(mp);
5308 	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5309 	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5310 	    optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5311 	if (opt_res == NULL) {
5312 		if (option != NULL) {
5313 			/* We have a fallback value */
5314 			error = 0;
5315 			goto copyout;
5316 		}
5317 		error = EPROTO;
5318 		eprintsoline(so, error);
5319 		goto done;
5320 	}
5321 	option = &opt_res[1];
5322 
5323 	/* check to ensure that the option is within bounds */
5324 	if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5325 	    (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5326 		if (option != NULL) {
5327 			/* We have a fallback value */
5328 			error = 0;
5329 			goto copyout;
5330 		}
5331 		error = EPROTO;
5332 		eprintsoline(so, error);
5333 		goto done;
5334 	}
5335 
5336 	len = opt_res->len;
5337 
5338 copyout: {
5339 		t_uscalar_t size = MIN(len, maxlen);
5340 		bcopy(option, optval, size);
5341 		bcopy(&size, optlenp, sizeof (size));
5342 	}
5343 done:
5344 	freemsg(mp);
5345 done2:
5346 	so_unlock_single(so, SOLOCKED);
5347 	mutex_exit(&so->so_lock);
5348 
5349 	return (error);
5350 }
5351 
5352 /*
5353  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5354  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5355  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5356  * setsockopt has to work even if the transport does not support the option.
5357  */
5358 /* ARGSUSED */
5359 int
sotpi_setsockopt(struct sonode * so,int level,int option_name,const void * optval,t_uscalar_t optlen,struct cred * cr)5360 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5361     const void *optval, t_uscalar_t optlen, struct cred *cr)
5362 {
5363 	struct T_optmgmt_req	optmgmt_req;
5364 	struct opthdr		oh;
5365 	mblk_t			*mp;
5366 	int			error = 0;
5367 	boolean_t		handled = B_FALSE;
5368 
5369 	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5370 	    (void *)so, level, option_name, optval, optlen,
5371 	    pr_state(so->so_state, so->so_mode)));
5372 
5373 	/* X/Open requires this check */
5374 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5375 		if (xnet_check_print)
5376 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
5377 		return (EINVAL);
5378 	}
5379 
5380 	mutex_enter(&so->so_lock);
5381 	so_lock_single(so);	/* Set SOLOCKED */
5382 	mutex_exit(&so->so_lock);
5383 
5384 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5385 	optmgmt_req.MGMT_flags = T_NEGOTIATE;
5386 	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5387 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5388 
5389 	oh.level = level;
5390 	oh.name = option_name;
5391 	oh.len = optlen;
5392 
5393 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5394 	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5395 	/* Let option management work in the presence of data flow control */
5396 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5397 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5398 	mp = NULL;
5399 	mutex_enter(&so->so_lock);
5400 	if (error) {
5401 		eprintsoline(so, error);
5402 		goto done2;
5403 	}
5404 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5405 	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5406 	if (error) {
5407 		eprintsoline(so, error);
5408 		goto done;
5409 	}
5410 	ASSERT(mp);
5411 	/* No need to verify T_optmgmt_ack */
5412 	freemsg(mp);
5413 done:
5414 	/*
5415 	 * Check for SOL_SOCKET options and record their values.
5416 	 * If we know about a SOL_SOCKET parameter and the transport
5417 	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5418 	 * EPROTO) we let the setsockopt succeed.
5419 	 */
5420 	if (level == SOL_SOCKET) {
5421 		/* Check parameters */
5422 		switch (option_name) {
5423 		case SO_DEBUG:
5424 		case SO_REUSEADDR:
5425 		case SO_KEEPALIVE:
5426 		case SO_DONTROUTE:
5427 		case SO_BROADCAST:
5428 		case SO_USELOOPBACK:
5429 		case SO_OOBINLINE:
5430 		case SO_SNDBUF:
5431 		case SO_RCVBUF:
5432 #ifdef notyet
5433 		case SO_SNDLOWAT:
5434 		case SO_RCVLOWAT:
5435 #endif /* notyet */
5436 		case SO_DGRAM_ERRIND:
5437 			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5438 				error = EINVAL;
5439 				eprintsoline(so, error);
5440 				goto done2;
5441 			}
5442 			ASSERT(optval);
5443 			handled = B_TRUE;
5444 			break;
5445 		case SO_SNDTIMEO:
5446 		case SO_RCVTIMEO:
5447 			if (get_udatamodel() == DATAMODEL_NONE ||
5448 			    get_udatamodel() == DATAMODEL_NATIVE) {
5449 				if (optlen != sizeof (struct timeval)) {
5450 					error = EINVAL;
5451 					eprintsoline(so, error);
5452 					goto done2;
5453 				}
5454 			} else {
5455 				if (optlen != sizeof (struct timeval32)) {
5456 					error = EINVAL;
5457 					eprintsoline(so, error);
5458 					goto done2;
5459 				}
5460 			}
5461 			ASSERT(optval);
5462 			handled = B_TRUE;
5463 			break;
5464 		case SO_LINGER:
5465 			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5466 				error = EINVAL;
5467 				eprintsoline(so, error);
5468 				goto done2;
5469 			}
5470 			ASSERT(optval);
5471 			handled = B_TRUE;
5472 			break;
5473 		}
5474 
5475 #define	intvalue	(*(int32_t *)optval)
5476 
5477 		switch (option_name) {
5478 		case SO_TYPE:
5479 		case SO_ERROR:
5480 		case SO_ACCEPTCONN:
5481 			/* Can't be set */
5482 			error = ENOPROTOOPT;
5483 			goto done2;
5484 		case SO_LINGER: {
5485 			struct linger *l = (struct linger *)optval;
5486 
5487 			so->so_linger.l_linger = l->l_linger;
5488 			if (l->l_onoff) {
5489 				so->so_linger.l_onoff = SO_LINGER;
5490 				so->so_options |= SO_LINGER;
5491 			} else {
5492 				so->so_linger.l_onoff = 0;
5493 				so->so_options &= ~SO_LINGER;
5494 			}
5495 			break;
5496 		}
5497 
5498 		case SO_DEBUG:
5499 #ifdef SOCK_TEST
5500 			if (intvalue & 2)
5501 				sock_test_timelimit = 10 * hz;
5502 			else
5503 				sock_test_timelimit = 0;
5504 
5505 			if (intvalue & 4)
5506 				do_useracc = 0;
5507 			else
5508 				do_useracc = 1;
5509 #endif /* SOCK_TEST */
5510 			/* FALLTHRU */
5511 		case SO_REUSEADDR:
5512 		case SO_KEEPALIVE:
5513 		case SO_DONTROUTE:
5514 		case SO_BROADCAST:
5515 		case SO_USELOOPBACK:
5516 		case SO_OOBINLINE:
5517 		case SO_DGRAM_ERRIND:
5518 			if (intvalue != 0) {
5519 				dprintso(so, 1,
5520 				    ("socket_setsockopt: setting 0x%x\n",
5521 				    option_name));
5522 				so->so_options |= option_name;
5523 			} else {
5524 				dprintso(so, 1,
5525 				    ("socket_setsockopt: clearing 0x%x\n",
5526 				    option_name));
5527 				so->so_options &= ~option_name;
5528 			}
5529 			break;
5530 		/*
5531 		 * The following options are only returned by us when the
5532 		 * transport layer fails.
5533 		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5534 		 * since the transport might adjust the value and not
5535 		 * return exactly what was set by the application.
5536 		 */
5537 		case SO_SNDBUF:
5538 			so->so_sndbuf = intvalue;
5539 			break;
5540 		case SO_RCVBUF:
5541 			so->so_rcvbuf = intvalue;
5542 			break;
5543 		case SO_RCVPSH:
5544 			so->so_rcv_timer_interval = intvalue;
5545 			break;
5546 #ifdef notyet
5547 		/*
5548 		 * We do not implement the semantics of these options
5549 		 * thus we shouldn't implement the options either.
5550 		 */
5551 		case SO_SNDLOWAT:
5552 			so->so_sndlowat = intvalue;
5553 			break;
5554 		case SO_RCVLOWAT:
5555 			so->so_rcvlowat = intvalue;
5556 			break;
5557 #endif /* notyet */
5558 		case SO_SNDTIMEO:
5559 		case SO_RCVTIMEO: {
5560 			struct timeval tl;
5561 			clock_t val;
5562 
5563 			if (get_udatamodel() == DATAMODEL_NONE ||
5564 			    get_udatamodel() == DATAMODEL_NATIVE) {
5565 				bcopy((struct timeval *)optval, &tl,
5566 				    sizeof (struct timeval));
5567 			} else {
5568 				TIMEVAL32_TO_TIMEVAL(&tl,
5569 				    (struct timeval32 *)optval);
5570 			}
5571 			val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5572 			if (option_name == SO_RCVTIMEO)
5573 				so->so_rcvtimeo = drv_usectohz(val);
5574 			else
5575 				so->so_sndtimeo = drv_usectohz(val);
5576 			break;
5577 		}
5578 		}
5579 #undef	intvalue
5580 
5581 		if (error) {
5582 			if ((error == ENOPROTOOPT || error == EPROTO ||
5583 			    error == EINVAL) && handled) {
5584 				dprintso(so, 1,
5585 				    ("setsockopt: ignoring error %d for 0x%x\n",
5586 				    error, option_name));
5587 				error = 0;
5588 			}
5589 		}
5590 	}
5591 done2:
5592 	so_unlock_single(so, SOLOCKED);
5593 	mutex_exit(&so->so_lock);
5594 	return (error);
5595 }
5596 
5597 /*
5598  * sotpi_close() is called when the last open reference goes away.
5599  */
5600 /* ARGSUSED */
5601 int
sotpi_close(struct sonode * so,int flag,struct cred * cr)5602 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5603 {
5604 	struct vnode *vp = SOTOV(so);
5605 	dev_t dev;
5606 	int error = 0;
5607 	sotpi_info_t *sti = SOTOTPI(so);
5608 
5609 	dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5610 	    (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5611 
5612 	dev = sti->sti_dev;
5613 
5614 	ASSERT(STREAMSTAB(getmajor(dev)));
5615 
5616 	mutex_enter(&so->so_lock);
5617 	so_lock_single(so);	/* Set SOLOCKED */
5618 
5619 	ASSERT(so_verify_oobstate(so));
5620 
5621 	if (vp->v_stream != NULL) {
5622 		vnode_t *ux_vp;
5623 
5624 		if (so->so_family == AF_UNIX) {
5625 			/* Could avoid this when CANTSENDMORE for !dgram */
5626 			so_unix_close(so);
5627 		}
5628 
5629 		mutex_exit(&so->so_lock);
5630 		/*
5631 		 * Disassemble the linkage from the AF_UNIX underlying file
5632 		 * system vnode to this socket (by atomically clearing
5633 		 * v_stream in vn_rele_stream) before strclose clears sd_vnode
5634 		 * and frees the stream head.
5635 		 */
5636 		if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5637 			ASSERT(ux_vp->v_stream);
5638 			sti->sti_ux_bound_vp = NULL;
5639 			vn_rele_stream(ux_vp);
5640 		}
5641 		error = strclose(vp, flag, cr);
5642 		vp->v_stream = NULL;
5643 		mutex_enter(&so->so_lock);
5644 	}
5645 
5646 	/*
5647 	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
5648 	 */
5649 	so_flush_discon_ind(so);
5650 
5651 	so_unlock_single(so, SOLOCKED);
5652 	mutex_exit(&so->so_lock);
5653 
5654 	/*
5655 	 * Needed for STREAMs.
5656 	 * Decrement the device driver's reference count for streams
5657 	 * opened via the clone dip. The driver was held in clone_open().
5658 	 * The absence of clone_close() forces this asymmetry.
5659 	 */
5660 	if (so->so_flag & SOCLONE)
5661 		ddi_rele_driver(getmajor(dev));
5662 
5663 	return (error);
5664 }
5665 
5666 static int
sotpi_ioctl(struct sonode * so,int cmd,intptr_t arg,int mode,struct cred * cr,int32_t * rvalp)5667 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
5668     struct cred *cr, int32_t *rvalp)
5669 {
5670 	struct vnode *vp = SOTOV(so);
5671 	sotpi_info_t *sti = SOTOTPI(so);
5672 	int error = 0;
5673 
5674 	dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
5675 	    cmd, arg, pr_state(so->so_state, so->so_mode)));
5676 
5677 	switch (cmd) {
5678 	case SIOCSQPTR:
5679 		/*
5680 		 * SIOCSQPTR is valid only when helper stream is created
5681 		 * by the protocol.
5682 		 */
5683 	case _I_INSERT:
5684 	case _I_REMOVE:
5685 		/*
5686 		 * Since there's no compelling reason to support these ioctls
5687 		 * on sockets, and doing so would increase the complexity
5688 		 * markedly, prevent it.
5689 		 */
5690 		return (EOPNOTSUPP);
5691 
5692 	case I_FIND:
5693 	case I_LIST:
5694 	case I_LOOK:
5695 	case I_POP:
5696 	case I_PUSH:
5697 		/*
5698 		 * To prevent races and inconsistencies between the actual
5699 		 * state of the stream and the state according to the sonode,
5700 		 * we serialize all operations which modify or operate on the
5701 		 * list of modules on the socket's stream.
5702 		 */
5703 		mutex_enter(&sti->sti_plumb_lock);
5704 		error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
5705 		mutex_exit(&sti->sti_plumb_lock);
5706 		return (error);
5707 
5708 	default:
5709 		if (so->so_version != SOV_STREAM)
5710 			break;
5711 
5712 		/*
5713 		 * The imaginary "sockmod" has been popped; act as a stream.
5714 		 */
5715 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5716 	}
5717 
5718 	ASSERT(so->so_version != SOV_STREAM);
5719 
5720 	/*
5721 	 * Process socket-specific ioctls.
5722 	 */
5723 	switch (cmd) {
5724 	case FIONBIO: {
5725 		int32_t value;
5726 
5727 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
5728 		    (mode & (int)FKIOCTL)))
5729 			return (EFAULT);
5730 
5731 		mutex_enter(&so->so_lock);
5732 		if (value) {
5733 			so->so_state |= SS_NDELAY;
5734 		} else {
5735 			so->so_state &= ~SS_NDELAY;
5736 		}
5737 		mutex_exit(&so->so_lock);
5738 		return (0);
5739 	}
5740 
5741 	case FIOASYNC: {
5742 		int32_t value;
5743 
5744 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
5745 		    (mode & (int)FKIOCTL)))
5746 			return (EFAULT);
5747 
5748 		mutex_enter(&so->so_lock);
5749 		/*
5750 		 * SS_ASYNC flag not already set correctly?
5751 		 * (!value != !(so->so_state & SS_ASYNC))
5752 		 * but some engineers find that too hard to read.
5753 		 */
5754 		if ((value == 0 && (so->so_state & SS_ASYNC) != 0) ||
5755 		    (value != 0 && (so->so_state & SS_ASYNC) == 0))
5756 			error = so_flip_async(so, vp, mode, cr);
5757 		mutex_exit(&so->so_lock);
5758 		return (error);
5759 	}
5760 
5761 	case SIOCSPGRP:
5762 	case FIOSETOWN: {
5763 		pid_t pgrp;
5764 
5765 		if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
5766 		    (mode & (int)FKIOCTL)))
5767 			return (EFAULT);
5768 
5769 		mutex_enter(&so->so_lock);
5770 		dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
5771 		/* Any change? */
5772 		if (pgrp != so->so_pgrp)
5773 			error = so_set_siggrp(so, vp, pgrp, mode, cr);
5774 		mutex_exit(&so->so_lock);
5775 		return (error);
5776 	}
5777 	case SIOCGPGRP:
5778 	case FIOGETOWN:
5779 		if (so_copyout(&so->so_pgrp, (void *)arg,
5780 		    sizeof (pid_t), (mode & (int)FKIOCTL)))
5781 			return (EFAULT);
5782 		return (0);
5783 
5784 	case SIOCATMARK: {
5785 		int retval;
5786 		uint_t so_state;
5787 
5788 		/*
5789 		 * strwaitmark has a finite timeout after which it
5790 		 * returns -1 if the mark state is undetermined.
5791 		 * In order to avoid any race between the mark state
5792 		 * in sockfs and the mark state in the stream head this
5793 		 * routine loops until the mark state can be determined
5794 		 * (or the urgent data indication has been removed by some
5795 		 * other thread).
5796 		 */
5797 		do {
5798 			mutex_enter(&so->so_lock);
5799 			so_state = so->so_state;
5800 			mutex_exit(&so->so_lock);
5801 			if (so_state & SS_RCVATMARK) {
5802 				retval = 1;
5803 			} else if (!(so_state & SS_OOBPEND)) {
5804 				/*
5805 				 * No SIGURG has been generated -- there is no
5806 				 * pending or present urgent data. Thus can't
5807 				 * possibly be at the mark.
5808 				 */
5809 				retval = 0;
5810 			} else {
5811 				/*
5812 				 * Have the stream head wait until there is
5813 				 * either some messages on the read queue, or
5814 				 * STRATMARK or STRNOTATMARK gets set. The
5815 				 * STRNOTATMARK flag is used so that the
5816 				 * transport can send up a MSGNOTMARKNEXT
5817 				 * M_DATA to indicate that it is not
5818 				 * at the mark and additional data is not about
5819 				 * to be send upstream.
5820 				 *
5821 				 * If the mark state is undetermined this will
5822 				 * return -1 and we will loop rechecking the
5823 				 * socket state.
5824 				 */
5825 				retval = strwaitmark(vp);
5826 			}
5827 		} while (retval == -1);
5828 
5829 		if (so_copyout(&retval, (void *)arg, sizeof (int),
5830 		    (mode & (int)FKIOCTL)))
5831 			return (EFAULT);
5832 		return (0);
5833 	}
5834 
5835 	case I_FDINSERT:
5836 	case I_SENDFD:
5837 	case I_RECVFD:
5838 	case I_ATMARK:
5839 	case _SIOCSOCKFALLBACK:
5840 		/*
5841 		 * These ioctls do not apply to sockets. I_FDINSERT can be
5842 		 * used to send M_PROTO messages without modifying the socket
5843 		 * state. I_SENDFD/RECVFD should not be used for socket file
5844 		 * descriptor passing since they assume a twisted stream.
5845 		 * SIOCATMARK must be used instead of I_ATMARK.
5846 		 *
5847 		 * _SIOCSOCKFALLBACK from an application should never be
5848 		 * processed.  It is only generated by socktpi_open() or
5849 		 * in response to I_POP or I_PUSH.
5850 		 */
5851 #ifdef DEBUG
5852 		zcmn_err(getzoneid(), CE_WARN,
5853 		    "Unsupported STREAMS ioctl 0x%x on socket. "
5854 		    "Pid = %d\n", cmd, curproc->p_pid);
5855 #endif /* DEBUG */
5856 		return (EOPNOTSUPP);
5857 
5858 	case _I_GETPEERCRED:
5859 		if ((mode & FKIOCTL) == 0)
5860 			return (EINVAL);
5861 
5862 		mutex_enter(&so->so_lock);
5863 		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
5864 			error = ENOTSUP;
5865 		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
5866 			error = ENOTCONN;
5867 		} else if (so->so_peercred != NULL) {
5868 			k_peercred_t *kp = (k_peercred_t *)arg;
5869 			kp->pc_cr = so->so_peercred;
5870 			kp->pc_cpid = so->so_cpid;
5871 			crhold(so->so_peercred);
5872 		} else {
5873 			error = EINVAL;
5874 		}
5875 		mutex_exit(&so->so_lock);
5876 		return (error);
5877 
5878 	default:
5879 		/*
5880 		 * Do the higher-order bits of the ioctl cmd indicate
5881 		 * that it is an I_* streams ioctl?
5882 		 */
5883 		if ((cmd & 0xffffff00U) == STR &&
5884 		    so->so_version == SOV_SOCKBSD) {
5885 #ifdef DEBUG
5886 			zcmn_err(getzoneid(), CE_WARN,
5887 			    "Unsupported STREAMS ioctl 0x%x on socket. "
5888 			    "Pid = %d\n", cmd, curproc->p_pid);
5889 #endif /* DEBUG */
5890 			return (EOPNOTSUPP);
5891 		}
5892 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5893 	}
5894 }
5895 
5896 /*
5897  * Handle plumbing-related ioctls.
5898  */
5899 static int
socktpi_plumbioctl(struct vnode * vp,int cmd,intptr_t arg,int mode,struct cred * cr,int32_t * rvalp)5900 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
5901     struct cred *cr, int32_t *rvalp)
5902 {
5903 	static const char sockmod_name[] = "sockmod";
5904 	struct sonode	*so = VTOSO(vp);
5905 	char		mname[FMNAMESZ + 1];
5906 	int		error;
5907 	sotpi_info_t	*sti = SOTOTPI(so);
5908 
5909 	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
5910 
5911 	if (so->so_version == SOV_SOCKBSD)
5912 		return (EOPNOTSUPP);
5913 
5914 	if (so->so_version == SOV_STREAM) {
5915 		/*
5916 		 * The imaginary "sockmod" has been popped - act as a stream.
5917 		 * If this is a push of sockmod then change back to a socket.
5918 		 */
5919 		if (cmd == I_PUSH) {
5920 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
5921 			    (void *)arg, mname, sizeof (mname), NULL);
5922 
5923 			if (error == 0 && strcmp(mname, sockmod_name) == 0) {
5924 				dprintso(so, 0, ("socktpi_ioctl: going to "
5925 				    "socket version\n"));
5926 				so_stream2sock(so);
5927 				return (0);
5928 			}
5929 		}
5930 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5931 	}
5932 
5933 	switch (cmd) {
5934 	case I_PUSH:
5935 		if (sti->sti_direct) {
5936 			mutex_enter(&so->so_lock);
5937 			so_lock_single(so);
5938 			mutex_exit(&so->so_lock);
5939 
5940 			error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
5941 			    cr, rvalp);
5942 
5943 			mutex_enter(&so->so_lock);
5944 			if (error == 0)
5945 				sti->sti_direct = 0;
5946 			so_unlock_single(so, SOLOCKED);
5947 			mutex_exit(&so->so_lock);
5948 
5949 			if (error != 0)
5950 				return (error);
5951 		}
5952 
5953 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5954 		if (error == 0)
5955 			sti->sti_pushcnt++;
5956 		return (error);
5957 
5958 	case I_POP:
5959 		if (sti->sti_pushcnt == 0) {
5960 			/* Emulate sockmod being popped */
5961 			dprintso(so, 0,
5962 			    ("socktpi_ioctl: going to STREAMS version\n"));
5963 			return (so_sock2stream(so));
5964 		}
5965 
5966 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5967 		if (error == 0)
5968 			sti->sti_pushcnt--;
5969 		return (error);
5970 
5971 	case I_LIST: {
5972 		struct str_mlist *kmlistp, *umlistp;
5973 		struct str_list	kstrlist;
5974 		ssize_t		kstrlistsize;
5975 		int		i, nmods;
5976 
5977 		STRUCT_DECL(str_list, ustrlist);
5978 		STRUCT_INIT(ustrlist, mode);
5979 
5980 		if (arg == 0) {
5981 			error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5982 			if (error == 0)
5983 				(*rvalp)++;	/* Add one for sockmod */
5984 			return (error);
5985 		}
5986 
5987 		error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
5988 		    STRUCT_SIZE(ustrlist), mode & FKIOCTL);
5989 		if (error != 0)
5990 			return (error);
5991 
5992 		nmods = STRUCT_FGET(ustrlist, sl_nmods);
5993 		if (nmods <= 0)
5994 			return (EINVAL);
5995 		/*
5996 		 * Ceiling nmods at nstrpush to prevent someone from
5997 		 * maliciously consuming lots of kernel memory.
5998 		 */
5999 		nmods = MIN(nmods, nstrpush);
6000 
6001 		kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6002 		kstrlist.sl_nmods = nmods;
6003 		kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6004 
6005 		error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6006 		    cr, rvalp);
6007 		if (error != 0)
6008 			goto done;
6009 
6010 		/*
6011 		 * Considering the module list as a 0-based array of sl_nmods
6012 		 * modules, sockmod should conceptually exist at slot
6013 		 * sti_pushcnt.  Insert sockmod at this location by sliding all
6014 		 * of the module names after so_pushcnt over by one.  We know
6015 		 * that there will be room to do this since we allocated
6016 		 * sl_modlist with an additional slot.
6017 		 */
6018 		for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6019 			kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6020 
6021 		(void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6022 		kstrlist.sl_nmods++;
6023 
6024 		/*
6025 		 * Copy all of the entries out to ustrlist.
6026 		 */
6027 		kmlistp = kstrlist.sl_modlist;
6028 		umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6029 		for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6030 			error = so_copyout(kmlistp++, umlistp++,
6031 			    sizeof (struct str_mlist), mode & FKIOCTL);
6032 			if (error != 0)
6033 				goto done;
6034 		}
6035 
6036 		error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6037 		    mode & FKIOCTL);
6038 		if (error == 0)
6039 			*rvalp = 0;
6040 	done:
6041 		kmem_free(kstrlist.sl_modlist, kstrlistsize);
6042 		return (error);
6043 	}
6044 	case I_LOOK:
6045 		if (sti->sti_pushcnt == 0) {
6046 			return (so_copyout(sockmod_name, (void *)arg,
6047 			    sizeof (sockmod_name), mode & FKIOCTL));
6048 		}
6049 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6050 
6051 	case I_FIND:
6052 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6053 		if (error && error != EINVAL)
6054 			return (error);
6055 
6056 		/* if not found and string was sockmod return 1 */
6057 		if (*rvalp == 0 || error == EINVAL) {
6058 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6059 			    (void *)arg, mname, sizeof (mname), NULL);
6060 			if (error == ENAMETOOLONG)
6061 				error = EINVAL;
6062 
6063 			if (error == 0 && strcmp(mname, sockmod_name) == 0)
6064 				*rvalp = 1;
6065 		}
6066 		return (error);
6067 
6068 	default:
6069 		panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6070 		break;
6071 	}
6072 
6073 	return (0);
6074 }
6075 
6076 /*
6077  * Wrapper around the streams poll routine that implements socket poll
6078  * semantics.
6079  * The sockfs never calls pollwakeup itself - the stream head take care
6080  * of all pollwakeups. Since sockfs never holds so_lock when calling the
6081  * stream head there can never be a deadlock due to holding so_lock across
6082  * pollwakeup and acquiring so_lock in this routine.
6083  *
6084  * However, since the performance of VOP_POLL is critical we avoid
6085  * acquiring so_lock here. This is based on two assumptions:
6086  *  - The poll implementation holds locks to serialize the VOP_POLL call
6087  *    and a pollwakeup for the same pollhead. This ensures that should
6088  *    e.g. so_state change during a socktpi_poll call the pollwakeup
6089  *    (which strsock_* and strrput conspire to issue) is issued after
6090  *    the state change. Thus the pollwakeup will block until VOP_POLL has
6091  *    returned and then wake up poll and have it call VOP_POLL again.
6092  *  - The reading of so_state without holding so_lock does not result in
6093  *    stale data that is older than the latest state change that has dropped
6094  *    so_lock. This is ensured by the mutex_exit issuing the appropriate
6095  *    memory barrier to force the data into the coherency domain.
6096  */
6097 static int
sotpi_poll(struct sonode * so,short events,int anyyet,short * reventsp,struct pollhead ** phpp)6098 sotpi_poll(
6099 	struct sonode	*so,
6100 	short		events,
6101 	int		anyyet,
6102 	short		*reventsp,
6103 	struct pollhead **phpp)
6104 {
6105 	short origevents = events;
6106 	struct vnode *vp = SOTOV(so);
6107 	int error;
6108 	int so_state = so->so_state;	/* snapshot */
6109 	sotpi_info_t *sti = SOTOTPI(so);
6110 
6111 	dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6112 	    (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6113 
6114 	ASSERT(vp->v_type == VSOCK);
6115 	ASSERT(vp->v_stream != NULL);
6116 
6117 	if (so->so_version == SOV_STREAM) {
6118 		/* The imaginary "sockmod" has been popped - act as a stream */
6119 		return (strpoll(vp->v_stream, events, anyyet,
6120 		    reventsp, phpp));
6121 	}
6122 
6123 	if (!(so_state & SS_ISCONNECTED) &&
6124 	    (so->so_mode & SM_CONNREQUIRED)) {
6125 		/* Not connected yet - turn off write side events */
6126 		events &= ~(POLLOUT|POLLWRBAND);
6127 	}
6128 	/*
6129 	 * Check for errors without calling strpoll if the caller wants them.
6130 	 * In sockets the errors are represented as input/output events
6131 	 * and there is no need to ask the stream head for this information.
6132 	 */
6133 	if (so->so_error != 0 &&
6134 	    ((POLLIN|POLLRDNORM|POLLOUT) & origevents)  != 0) {
6135 		*reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6136 		return (0);
6137 	}
6138 	/*
6139 	 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6140 	 * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
6141 	 * will not trigger a POLLIN event with POLLRDDATA set.
6142 	 * The handling of urgent data (causing POLLRDBAND) is done by
6143 	 * inspecting SS_OOBPEND below.
6144 	 */
6145 	events |= POLLRDDATA;
6146 
6147 	/*
6148 	 * After shutdown(output) a stream head write error is set.
6149 	 * However, we should not return output events.
6150 	 */
6151 	events |= POLLNOERR;
6152 	error = strpoll(vp->v_stream, events, anyyet,
6153 	    reventsp, phpp);
6154 	if (error)
6155 		return (error);
6156 
6157 	ASSERT(!(*reventsp & POLLERR));
6158 
6159 	/*
6160 	 * Notes on T_CONN_IND handling for sockets.
6161 	 *
6162 	 * If strpoll() returned without events, SR_POLLIN is guaranteed
6163 	 * to be set, ensuring any subsequent strrput() runs pollwakeup().
6164 	 *
6165 	 * Since the so_lock is not held, soqueueconnind() may have run
6166 	 * and a T_CONN_IND may be waiting. We now check for any queued
6167 	 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6168 	 * to ensure poll returns.
6169 	 *
6170 	 * However:
6171 	 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6172 	 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6173 	 * the following actions will occur; taken together they ensure the
6174 	 * syscall will return.
6175 	 *
6176 	 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6177 	 *    the accept() was run on a non-blocking socket sowaitconnind()
6178 	 *    may have already returned EWOULDBLOCK, so not be waiting to
6179 	 *    process the message. Additionally socktpi_poll() has probably
6180 	 *    proceeded past the sti_conn_ind_head check below.
6181 	 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6182 	 *    this thread,  however that could occur before poll_common()
6183 	 *    has entered cv_wait.
6184 	 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6185 	 *
6186 	 * Before proceeding to cv_wait() in poll_common() for an event,
6187 	 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6188 	 * and if set, re-calls strpoll() to ensure the late arriving
6189 	 * T_CONN_IND is recognized, and pollsys() returns.
6190 	 */
6191 
6192 	if (sti->sti_conn_ind_head != NULL)
6193 		*reventsp |= (POLLIN|POLLRDNORM) & events;
6194 
6195 	if (so->so_state & SS_CANTRCVMORE) {
6196 		*reventsp |= POLLRDHUP & events;
6197 
6198 		if (so->so_state & SS_CANTSENDMORE)
6199 			*reventsp |= POLLHUP;
6200 	}
6201 
6202 	if (so->so_state & SS_OOBPEND)
6203 		*reventsp |= POLLRDBAND & events;
6204 
6205 	return (0);
6206 }
6207 
6208 /*ARGSUSED*/
6209 static int
socktpi_constructor(void * buf,void * cdrarg,int kmflags)6210 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6211 {
6212 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6213 	int error = 0;
6214 
6215 	error = sonode_constructor(buf, cdrarg, kmflags);
6216 	if (error != 0)
6217 		return (error);
6218 
6219 	error = i_sotpi_info_constructor(&st->st_info);
6220 	if (error != 0)
6221 		sonode_destructor(buf, cdrarg);
6222 
6223 	st->st_sonode.so_priv = &st->st_info;
6224 
6225 	return (error);
6226 }
6227 
6228 /*ARGSUSED1*/
6229 static void
socktpi_destructor(void * buf,void * cdrarg)6230 socktpi_destructor(void *buf, void *cdrarg)
6231 {
6232 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6233 
6234 	ASSERT(st->st_sonode.so_priv == &st->st_info);
6235 	st->st_sonode.so_priv = NULL;
6236 
6237 	i_sotpi_info_destructor(&st->st_info);
6238 	sonode_destructor(buf, cdrarg);
6239 }
6240 
6241 static int
socktpi_unix_constructor(void * buf,void * cdrarg,int kmflags)6242 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6243 {
6244 	int retval;
6245 
6246 	if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6247 		struct sonode *so = (struct sonode *)buf;
6248 		sotpi_info_t *sti = SOTOTPI(so);
6249 
6250 		mutex_enter(&socklist.sl_lock);
6251 
6252 		sti->sti_next_so = socklist.sl_list;
6253 		sti->sti_prev_so = NULL;
6254 		if (sti->sti_next_so != NULL)
6255 			SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6256 		socklist.sl_list = so;
6257 
6258 		mutex_exit(&socklist.sl_lock);
6259 
6260 	}
6261 	return (retval);
6262 }
6263 
6264 static void
socktpi_unix_destructor(void * buf,void * cdrarg)6265 socktpi_unix_destructor(void *buf, void *cdrarg)
6266 {
6267 	struct sonode	*so = (struct sonode *)buf;
6268 	sotpi_info_t	*sti = SOTOTPI(so);
6269 
6270 	mutex_enter(&socklist.sl_lock);
6271 
6272 	if (sti->sti_next_so != NULL)
6273 		SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6274 	if (sti->sti_prev_so != NULL)
6275 		SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6276 	else
6277 		socklist.sl_list = sti->sti_next_so;
6278 
6279 	mutex_exit(&socklist.sl_lock);
6280 
6281 	socktpi_destructor(buf, cdrarg);
6282 }
6283 
6284 int
socktpi_init(void)6285 socktpi_init(void)
6286 {
6287 	/*
6288 	 * Create sonode caches.  We create a special one for AF_UNIX so
6289 	 * that we can track them for netstat(8).
6290 	 */
6291 	socktpi_cache = kmem_cache_create("socktpi_cache",
6292 	    sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6293 	    socktpi_destructor, NULL, NULL, NULL, 0);
6294 
6295 	socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6296 	    sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6297 	    socktpi_unix_destructor, NULL, NULL, NULL, 0);
6298 
6299 	return (0);
6300 }
6301 
6302 /*
6303  * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6304  *
6305  * Caller must still update state and mode using sotpi_update_state().
6306  */
6307 int
sotpi_convert_sonode(struct sonode * so,struct sockparams * newsp,boolean_t * direct,queue_t ** qp,struct cred * cr)6308 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6309     boolean_t *direct, queue_t **qp, struct cred *cr)
6310 {
6311 	sotpi_info_t *sti;
6312 	struct sockparams *origsp = so->so_sockparams;
6313 	sock_lower_handle_t handle = so->so_proto_handle;
6314 	struct stdata *stp;
6315 	struct vnode *vp;
6316 	queue_t *q;
6317 	int error = 0;
6318 
6319 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6320 	    SS_FALLBACK_PENDING);
6321 	ASSERT(SOCK_IS_NONSTR(so));
6322 
6323 	*qp = NULL;
6324 	*direct = B_FALSE;
6325 	so->so_sockparams = newsp;
6326 	/*
6327 	 * Allocate and initalize fields required by TPI.
6328 	 */
6329 	(void) sotpi_info_create(so, KM_SLEEP);
6330 	sotpi_info_init(so);
6331 
6332 	if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6333 		sotpi_info_fini(so);
6334 		sotpi_info_destroy(so);
6335 		return (error);
6336 	}
6337 	ASSERT(handle == so->so_proto_handle);
6338 	sti = SOTOTPI(so);
6339 	if (sti->sti_direct != 0)
6340 		*direct = B_TRUE;
6341 
6342 	/*
6343 	 * Keep the original sp around so we can properly dispose of the
6344 	 * sonode when the socket is being closed.
6345 	 */
6346 	sti->sti_orig_sp = origsp;
6347 
6348 	so_basic_strinit(so);	/* skips the T_CAPABILITY_REQ */
6349 	so_alloc_addr(so, so->so_max_addr_len);
6350 
6351 	/*
6352 	 * If the application has done a SIOCSPGRP, make sure the
6353 	 * STREAM head is aware. This needs to take place before
6354 	 * the protocol start sending up messages. Otherwise we
6355 	 * might miss to generate SIGPOLL.
6356 	 *
6357 	 * It is possible that the application will receive duplicate
6358 	 * signals if some were already generated for either data or
6359 	 * connection indications.
6360 	 */
6361 	if (so->so_pgrp != 0) {
6362 		if (so_set_events(so, so->so_vnode, cr) != 0)
6363 			so->so_pgrp = 0;
6364 	}
6365 
6366 	/*
6367 	 * Determine which queue to use.
6368 	 */
6369 	vp = SOTOV(so);
6370 	stp = vp->v_stream;
6371 	ASSERT(stp != NULL);
6372 	q = stp->sd_wrq->q_next;
6373 
6374 	/*
6375 	 * Skip any modules that may have been auto pushed when the device
6376 	 * was opened
6377 	 */
6378 	while (q->q_next != NULL)
6379 		q = q->q_next;
6380 	*qp = _RD(q);
6381 
6382 	/* This is now a STREAMS sockets */
6383 	so->so_not_str = B_FALSE;
6384 
6385 	return (error);
6386 }
6387 
6388 /*
6389  * Revert a TPI sonode. It is only allowed to revert the sonode during
6390  * the fallback process.
6391  */
6392 void
sotpi_revert_sonode(struct sonode * so,struct cred * cr)6393 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6394 {
6395 	vnode_t *vp = SOTOV(so);
6396 
6397 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6398 	    SS_FALLBACK_PENDING);
6399 	ASSERT(!SOCK_IS_NONSTR(so));
6400 	ASSERT(vp->v_stream != NULL);
6401 
6402 	strclean(vp);
6403 	(void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6404 
6405 	/*
6406 	 * Restore the original sockparams. The caller is responsible for
6407 	 * dropping the ref to the new sp.
6408 	 */
6409 	so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6410 
6411 	sotpi_info_fini(so);
6412 	sotpi_info_destroy(so);
6413 
6414 	/* This is no longer a STREAMS sockets */
6415 	so->so_not_str = B_TRUE;
6416 }
6417 
6418 void
sotpi_update_state(struct sonode * so,struct T_capability_ack * tcap,struct sockaddr * laddr,socklen_t laddrlen,struct sockaddr * faddr,socklen_t faddrlen,short opts)6419 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6420     struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6421     socklen_t faddrlen, short opts)
6422 {
6423 	sotpi_info_t *sti = SOTOTPI(so);
6424 
6425 	so_proc_tcapability_ack(so, tcap);
6426 
6427 	so->so_options |= opts;
6428 
6429 	/*
6430 	 * Determine whether the foreign and local address are valid
6431 	 */
6432 	if (laddrlen != 0) {
6433 		ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6434 		sti->sti_laddr_len = laddrlen;
6435 		bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6436 		sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6437 	}
6438 
6439 	if (faddrlen != 0) {
6440 		ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6441 		sti->sti_faddr_len = faddrlen;
6442 		bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6443 		sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6444 	}
6445 
6446 }
6447 
6448 /*
6449  * Allocate enough space to cache the local and foreign addresses.
6450  */
6451 void
so_alloc_addr(struct sonode * so,t_uscalar_t maxlen)6452 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6453 {
6454 	sotpi_info_t *sti = SOTOTPI(so);
6455 
6456 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6457 	ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6458 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6459 	    P2ROUNDUP(maxlen, KMEM_ALIGN);
6460 	so->so_max_addr_len = sti->sti_laddr_maxlen;
6461 	sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6462 	sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6463 	    + sti->sti_laddr_maxlen);
6464 
6465 	if (so->so_family == AF_UNIX) {
6466 		/*
6467 		 * Initialize AF_UNIX related fields.
6468 		 */
6469 		bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6470 		bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6471 	}
6472 }
6473 
6474 
6475 sotpi_info_t *
sotpi_sototpi(struct sonode * so)6476 sotpi_sototpi(struct sonode *so)
6477 {
6478 	sotpi_info_t *sti;
6479 
6480 	ASSERT(so != NULL);
6481 
6482 	sti = (sotpi_info_t *)so->so_priv;
6483 
6484 	ASSERT(sti != NULL);
6485 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6486 
6487 	return (sti);
6488 }
6489 
6490 static int
i_sotpi_info_constructor(sotpi_info_t * sti)6491 i_sotpi_info_constructor(sotpi_info_t *sti)
6492 {
6493 	sti->sti_magic		= SOTPI_INFO_MAGIC;
6494 	sti->sti_ack_mp		= NULL;
6495 	sti->sti_discon_ind_mp	= NULL;
6496 	sti->sti_ux_bound_vp	= NULL;
6497 	sti->sti_unbind_mp	= NULL;
6498 
6499 	sti->sti_conn_ind_head	= NULL;
6500 	sti->sti_conn_ind_tail	= NULL;
6501 
6502 	sti->sti_laddr_sa	= NULL;
6503 	sti->sti_faddr_sa	= NULL;
6504 
6505 	mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6506 	cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6507 
6508 	return (0);
6509 }
6510 
6511 static void
i_sotpi_info_destructor(sotpi_info_t * sti)6512 i_sotpi_info_destructor(sotpi_info_t *sti)
6513 {
6514 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6515 	ASSERT(sti->sti_ack_mp == NULL);
6516 	ASSERT(sti->sti_discon_ind_mp == NULL);
6517 	ASSERT(sti->sti_ux_bound_vp == NULL);
6518 	ASSERT(sti->sti_unbind_mp == NULL);
6519 
6520 	ASSERT(sti->sti_conn_ind_head == NULL);
6521 	ASSERT(sti->sti_conn_ind_tail == NULL);
6522 
6523 	ASSERT(sti->sti_laddr_sa == NULL);
6524 	ASSERT(sti->sti_faddr_sa == NULL);
6525 
6526 	mutex_destroy(&sti->sti_plumb_lock);
6527 	cv_destroy(&sti->sti_ack_cv);
6528 }
6529 
6530 /*
6531  * Creates and attaches TPI information to the given sonode
6532  */
6533 static boolean_t
sotpi_info_create(struct sonode * so,int kmflags)6534 sotpi_info_create(struct sonode *so, int kmflags)
6535 {
6536 	sotpi_info_t *sti;
6537 
6538 	ASSERT(so->so_priv == NULL);
6539 
6540 	if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6541 		return (B_FALSE);
6542 
6543 	if (i_sotpi_info_constructor(sti) != 0) {
6544 		kmem_free(sti, sizeof (*sti));
6545 		return (B_FALSE);
6546 	}
6547 
6548 	so->so_priv = (void *)sti;
6549 	return (B_TRUE);
6550 }
6551 
6552 /*
6553  * Initializes the TPI information.
6554  */
6555 static void
sotpi_info_init(struct sonode * so)6556 sotpi_info_init(struct sonode *so)
6557 {
6558 	struct vnode *vp = SOTOV(so);
6559 	sotpi_info_t *sti = SOTOTPI(so);
6560 	time_t now;
6561 
6562 	sti->sti_dev	= so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6563 	vp->v_rdev	= sti->sti_dev;
6564 
6565 	sti->sti_orig_sp = NULL;
6566 
6567 	sti->sti_pushcnt = 0;
6568 
6569 	now = gethrestime_sec();
6570 	sti->sti_atime	= now;
6571 	sti->sti_mtime	= now;
6572 	sti->sti_ctime	= now;
6573 
6574 	sti->sti_eaddr_mp = NULL;
6575 	sti->sti_delayed_error = 0;
6576 
6577 	sti->sti_provinfo = NULL;
6578 
6579 	sti->sti_oobcnt = 0;
6580 	sti->sti_oobsigcnt = 0;
6581 
6582 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6583 
6584 	sti->sti_laddr_sa	= 0;
6585 	sti->sti_faddr_sa	= 0;
6586 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6587 	sti->sti_laddr_len = sti->sti_faddr_len = 0;
6588 
6589 	sti->sti_laddr_valid = 0;
6590 	sti->sti_faddr_valid = 0;
6591 	sti->sti_faddr_noxlate = 0;
6592 
6593 	sti->sti_direct = 0;
6594 
6595 	ASSERT(sti->sti_ack_mp == NULL);
6596 	ASSERT(sti->sti_ux_bound_vp == NULL);
6597 	ASSERT(sti->sti_unbind_mp == NULL);
6598 
6599 	ASSERT(sti->sti_conn_ind_head == NULL);
6600 	ASSERT(sti->sti_conn_ind_tail == NULL);
6601 }
6602 
6603 /*
6604  * Given a sonode, grab the TPI info and free any data.
6605  */
6606 static void
sotpi_info_fini(struct sonode * so)6607 sotpi_info_fini(struct sonode *so)
6608 {
6609 	sotpi_info_t *sti = SOTOTPI(so);
6610 	mblk_t *mp;
6611 
6612 	ASSERT(sti->sti_discon_ind_mp == NULL);
6613 
6614 	if ((mp = sti->sti_conn_ind_head) != NULL) {
6615 		mblk_t *mp1;
6616 
6617 		while (mp) {
6618 			mp1 = mp->b_next;
6619 			mp->b_next = NULL;
6620 			freemsg(mp);
6621 			mp = mp1;
6622 		}
6623 		sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
6624 	}
6625 
6626 	/*
6627 	 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
6628 	 * indirect them.  It also uses so_count as a validity test.
6629 	 */
6630 	mutex_enter(&so->so_lock);
6631 
6632 	if (sti->sti_laddr_sa) {
6633 		ASSERT((caddr_t)sti->sti_faddr_sa ==
6634 		    (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
6635 		ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
6636 		sti->sti_laddr_valid = 0;
6637 		sti->sti_faddr_valid = 0;
6638 		kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
6639 		sti->sti_laddr_sa = NULL;
6640 		sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
6641 		sti->sti_faddr_sa = NULL;
6642 		sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
6643 	}
6644 
6645 	mutex_exit(&so->so_lock);
6646 
6647 	if ((mp = sti->sti_eaddr_mp) != NULL) {
6648 		freemsg(mp);
6649 		sti->sti_eaddr_mp = NULL;
6650 		sti->sti_delayed_error = 0;
6651 	}
6652 
6653 	if ((mp = sti->sti_ack_mp) != NULL) {
6654 		freemsg(mp);
6655 		sti->sti_ack_mp = NULL;
6656 	}
6657 
6658 	ASSERT(sti->sti_ux_bound_vp == NULL);
6659 	if ((mp = sti->sti_unbind_mp) != NULL) {
6660 		freemsg(mp);
6661 		sti->sti_unbind_mp = NULL;
6662 	}
6663 }
6664 
6665 /*
6666  * Destroys the TPI information attached to a sonode.
6667  */
6668 static void
sotpi_info_destroy(struct sonode * so)6669 sotpi_info_destroy(struct sonode *so)
6670 {
6671 	sotpi_info_t *sti = SOTOTPI(so);
6672 
6673 	i_sotpi_info_destructor(sti);
6674 	kmem_free(sti, sizeof (*sti));
6675 
6676 	so->so_priv = NULL;
6677 }
6678 
6679 /*
6680  * Create the global sotpi socket module entry. It will never be freed.
6681  */
6682 smod_info_t *
sotpi_smod_create(void)6683 sotpi_smod_create(void)
6684 {
6685 	smod_info_t *smodp;
6686 
6687 	smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
6688 	smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
6689 	(void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
6690 	/*
6691 	 * Initialize the smod_refcnt to 1 so it will never be freed.
6692 	 */
6693 	smodp->smod_refcnt = 1;
6694 	smodp->smod_uc_version = SOCK_UC_VERSION;
6695 	smodp->smod_dc_version = SOCK_DC_VERSION;
6696 	smodp->smod_sock_create_func = &sotpi_create;
6697 	smodp->smod_sock_destroy_func = &sotpi_destroy;
6698 	return (smodp);
6699 }
6700