xref: /illumos-gate/usr/src/uts/common/fs/sockfs/socktpi.c (revision 1c1f30a61162b465dc08830707c1c2509ecfc872)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2015, Joyent, Inc.
25  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
26  * Copyright 2022 Garrett D'Amore
27  * Copyright 2024 Oxide Computer Company
28  */
29 
30 #include <sys/types.h>
31 #include <sys/t_lock.h>
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/buf.h>
35 #include <sys/conf.h>
36 #include <sys/cred.h>
37 #include <sys/kmem.h>
38 #include <sys/kmem_impl.h>
39 #include <sys/sysmacros.h>
40 #include <sys/vfs.h>
41 #include <sys/vnode.h>
42 #include <sys/debug.h>
43 #include <sys/errno.h>
44 #include <sys/time.h>
45 #include <sys/file.h>
46 #include <sys/open.h>
47 #include <sys/user.h>
48 #include <sys/termios.h>
49 #include <sys/stream.h>
50 #include <sys/strsubr.h>
51 #include <sys/strsun.h>
52 #include <sys/suntpi.h>
53 #include <sys/ddi.h>
54 #include <sys/esunddi.h>
55 #include <sys/flock.h>
56 #include <sys/modctl.h>
57 #include <sys/vtrace.h>
58 #include <sys/cmn_err.h>
59 #include <sys/pathname.h>
60 
61 #include <sys/socket.h>
62 #include <sys/socketvar.h>
63 #include <sys/sockio.h>
64 #include <netinet/in.h>
65 #include <sys/un.h>
66 #include <sys/strsun.h>
67 
68 #include <sys/tiuser.h>
69 #define	_SUN_TPI_VERSION	2
70 #include <sys/tihdr.h>
71 #include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
72 
73 #include <c2/audit.h>
74 
75 #include <inet/common.h>
76 #include <inet/ip.h>
77 #include <inet/ip6.h>
78 #include <inet/tcp.h>
79 #include <inet/udp_impl.h>
80 
81 #include <sys/zone.h>
82 
83 #include <fs/sockfs/sockcommon.h>
84 #include <fs/sockfs/socktpi.h>
85 #include <fs/sockfs/socktpi_impl.h>
86 
87 /*
88  * Possible failures when memory can't be allocated. The documented behavior:
89  *
90  *		5.5:			4.X:		XNET:
91  * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
92  *							EINTR
93  *	(4.X does not document EINTR but returns it)
94  * bind:	ENOSR			-		ENOBUFS/ENOSR
95  * connect:	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
96  * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
97  * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
98  *	(4.X getpeername and getsockname do not fail in practice)
99  * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
100  * listen:	-			-		ENOBUFS
101  * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
102  *							EINTR
103  * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
104  *							EINTR
105  * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
106  * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
107  * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
108  * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
109  *
110  * Resolution. When allocation fails:
111  *	recv: return EINTR
112  *	send: return EINTR
113  *	connect, accept: EINTR
114  *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
115  *	socket, socketpair: ENOBUFS
116  *	getpeername, getsockname: sleep
117  *	getsockopt, setsockopt: sleep
118  */
119 
120 #ifdef SOCK_TEST
121 /*
122  * Variables that make sockfs do something other than the standard TPI
123  * for the AF_INET transports.
124  *
125  * solisten_tpi_tcp:
126  *	TCP can handle a O_T_BIND_REQ with an increased backlog even though
127  *	the transport is already bound. This is needed to avoid losing the
128  *	port number should listen() do a T_UNBIND_REQ followed by a
129  *	O_T_BIND_REQ.
130  *
131  * soconnect_tpi_udp:
132  *	UDP and ICMP can handle a T_CONN_REQ.
133  *	This is needed to make the sequence of connect(), getsockname()
134  *	return the local IP address used to send packets to the connected to
135  *	destination.
136  *
137  * soconnect_tpi_tcp:
138  *	TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
139  *	Set this to non-zero to send TPI conformant messages to TCP in this
140  *	respect. This is a performance optimization.
141  *
142  * soaccept_tpi_tcp:
143  *	TCP can handle a T_CONN_REQ without the acceptor being bound.
144  *	This is a performance optimization that has been picked up in XTI.
145  *
146  * soaccept_tpi_multioptions:
147  *	When inheriting SOL_SOCKET options from the listener to the accepting
148  *	socket send them as a single message for AF_INET{,6}.
149  */
150 int solisten_tpi_tcp = 0;
151 int soconnect_tpi_udp = 0;
152 int soconnect_tpi_tcp = 0;
153 int soaccept_tpi_tcp = 0;
154 int soaccept_tpi_multioptions = 1;
155 #else /* SOCK_TEST */
156 #define	soconnect_tpi_tcp	0
157 #define	soconnect_tpi_udp	0
158 #define	solisten_tpi_tcp	0
159 #define	soaccept_tpi_tcp	0
160 #define	soaccept_tpi_multioptions	1
161 #endif /* SOCK_TEST */
162 
163 #ifdef SOCK_TEST
164 extern int do_useracc;
165 extern clock_t sock_test_timelimit;
166 #endif /* SOCK_TEST */
167 
168 extern uint32_t ucredsize;
169 
170 /*
171  * Some X/Open added checks might have to be backed out to keep SunOS 4.X
172  * applications working. Turn on this flag to disable these checks.
173  */
174 int xnet_skip_checks = 0;
175 int xnet_check_print = 0;
176 int xnet_truncate_print = 0;
177 
178 static void sotpi_destroy(struct sonode *);
179 static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
180     int, int *, cred_t *cr);
181 
182 static boolean_t	sotpi_info_create(struct sonode *, int);
183 static void		sotpi_info_init(struct sonode *);
184 static void		sotpi_info_fini(struct sonode *);
185 static void		sotpi_info_destroy(struct sonode *);
186 
187 /*
188  * Do direct function call to the transport layer below; this would
189  * also allow the transport to utilize read-side synchronous stream
190  * interface if necessary.  This is a /etc/system tunable that must
191  * not be modified on a running system.  By default this is enabled
192  * for performance reasons and may be disabled for debugging purposes.
193  */
194 boolean_t socktpi_direct = B_TRUE;
195 
196 static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
197 
198 extern	void sigintr(k_sigset_t *, int);
199 extern	void sigunintr(k_sigset_t *);
200 
201 static int	sotpi_unbind(struct sonode *, int);
202 
203 /* TPI sockfs sonode operations */
204 int		sotpi_init(struct sonode *, struct sonode *, struct cred *,
205 		    int);
206 static int	sotpi_accept(struct sonode *, int, struct cred *,
207 		    struct sonode **);
208 static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
209 		    int, struct cred *);
210 static int	sotpi_listen(struct sonode *, int, struct cred *);
211 static int	sotpi_connect(struct sonode *, struct sockaddr *,
212 		    socklen_t, int, int, struct cred *);
213 extern int	sotpi_recvmsg(struct sonode *, struct nmsghdr *,
214 		    struct uio *, struct cred *);
215 static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
216 		    struct uio *, struct cred *);
217 static int	sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
218 		    struct cred *, mblk_t **);
219 static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
220 		    struct uio *, void *, t_uscalar_t, int);
221 static int	sodgram_direct(struct sonode *, struct sockaddr *,
222 		    socklen_t, struct uio *, int);
223 extern int	sotpi_getpeername(struct sonode *, struct sockaddr *,
224 		    socklen_t *, boolean_t, struct cred *);
225 static int	sotpi_getsockname(struct sonode *, struct sockaddr *,
226 		    socklen_t *, struct cred *);
227 static int	sotpi_shutdown(struct sonode *, int, struct cred *);
228 extern int	sotpi_getsockopt(struct sonode *, int, int, void *,
229 		    socklen_t *, int, struct cred *);
230 extern int	sotpi_setsockopt(struct sonode *, int, int, const void *,
231 		    socklen_t, struct cred *);
232 static int	sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
233 		    int32_t *);
234 static int	socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
235 		    struct cred *, int32_t *);
236 static int	sotpi_poll(struct sonode *, short, int, short *,
237 		    struct pollhead **);
238 static int	sotpi_close(struct sonode *, int, struct cred *);
239 
240 static int	i_sotpi_info_constructor(sotpi_info_t *);
241 static void	i_sotpi_info_destructor(sotpi_info_t *);
242 
243 sonodeops_t sotpi_sonodeops = {
244 	sotpi_init,		/* sop_init		*/
245 	sotpi_accept,		/* sop_accept		*/
246 	sotpi_bind,		/* sop_bind		*/
247 	sotpi_listen,		/* sop_listen		*/
248 	sotpi_connect,		/* sop_connect		*/
249 	sotpi_recvmsg,		/* sop_recvmsg		*/
250 	sotpi_sendmsg,		/* sop_sendmsg		*/
251 	sotpi_sendmblk,		/* sop_sendmblk		*/
252 	sotpi_getpeername,	/* sop_getpeername	*/
253 	sotpi_getsockname,	/* sop_getsockname	*/
254 	sotpi_shutdown,		/* sop_shutdown		*/
255 	sotpi_getsockopt,	/* sop_getsockopt	*/
256 	sotpi_setsockopt,	/* sop_setsockopt	*/
257 	sotpi_ioctl,		/* sop_ioctl		*/
258 	sotpi_poll,		/* sop_poll		*/
259 	sotpi_close,		/* sop_close		*/
260 };
261 
262 /*
263  * Return a TPI socket vnode.
264  *
265  * Note that sockets assume that the driver will clone (either itself
266  * or by using the clone driver) i.e. a socket() call will always
267  * result in a new vnode being created.
268  */
269 
270 /*
271  * Common create code for socket and accept. If tso is set the values
272  * from that node is used instead of issuing a T_INFO_REQ.
273  */
274 
275 /* ARGSUSED */
276 static struct sonode *
sotpi_create(struct sockparams * sp,int family,int type,int protocol,int version,int sflags,int * errorp,cred_t * cr)277 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
278     int version, int sflags, int *errorp, cred_t *cr)
279 {
280 	struct sonode	*so;
281 	kmem_cache_t	*cp;
282 
283 	ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
284 
285 	if (family == AF_NCA) {
286 		/*
287 		 * The request is for an NCA socket so for NL7C use the
288 		 * INET domain instead and mark NL7C_AF_NCA below.
289 		 */
290 		family = AF_INET;
291 		/*
292 		 * NL7C is not supported in the non-global zone,
293 		 * we enforce this restriction here.
294 		 */
295 		if (getzoneid() != GLOBAL_ZONEID) {
296 			*errorp = ENOTSUP;
297 			return (NULL);
298 		}
299 	}
300 
301 	/*
302 	 * to be compatible with old tpi socket implementation ignore
303 	 * sleep flag (sflags) passed in
304 	 */
305 	cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
306 	so = kmem_cache_alloc(cp, KM_SLEEP);
307 	if (so == NULL) {
308 		*errorp = ENOMEM;
309 		return (NULL);
310 	}
311 
312 	sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
313 	sotpi_info_init(so);
314 
315 	if (version == SOV_DEFAULT)
316 		version = so_default_version;
317 
318 	so->so_version = (short)version;
319 	*errorp = 0;
320 
321 	return (so);
322 }
323 
324 static void
sotpi_destroy(struct sonode * so)325 sotpi_destroy(struct sonode *so)
326 {
327 	kmem_cache_t *cp;
328 	struct sockparams *origsp;
329 
330 	/*
331 	 * If there is a new dealloc function (ie. smod_destroy_func),
332 	 * then it should check the correctness of the ops.
333 	 */
334 
335 	ASSERT(so->so_ops == &sotpi_sonodeops);
336 
337 	origsp = SOTOTPI(so)->sti_orig_sp;
338 
339 	sotpi_info_fini(so);
340 
341 	if (so->so_state & SS_FALLBACK_COMP) {
342 		/*
343 		 * A fallback happend, which means that a sotpi_info_t struct
344 		 * was allocated (as opposed to being allocated from the TPI
345 		 * sonode cache. Therefore we explicitly free the struct
346 		 * here.
347 		 */
348 		sotpi_info_destroy(so);
349 		ASSERT(origsp != NULL);
350 
351 		origsp->sp_smod_info->smod_sock_destroy_func(so);
352 		SOCKPARAMS_DEC_REF(origsp);
353 	} else {
354 		sonode_fini(so);
355 		cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
356 		    socktpi_cache;
357 		kmem_cache_free(cp, so);
358 	}
359 }
360 
361 /* ARGSUSED1 */
362 int
sotpi_init(struct sonode * so,struct sonode * tso,struct cred * cr,int flags)363 sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
364 {
365 	major_t maj;
366 	dev_t newdev;
367 	struct vnode *vp;
368 	int error = 0;
369 	struct stdata *stp;
370 
371 	sotpi_info_t *sti = SOTOTPI(so);
372 
373 	dprint(1, ("sotpi_init()\n"));
374 
375 	/*
376 	 * over write the sleep flag passed in but that is ok
377 	 * as tpi socket does not honor sleep flag.
378 	 */
379 	flags |= FREAD|FWRITE;
380 
381 	/*
382 	 * Record in so_flag that it is a clone.
383 	 */
384 	if (getmajor(sti->sti_dev) == clone_major)
385 		so->so_flag |= SOCLONE;
386 
387 	if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
388 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
389 	    (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
390 	    so->so_protocol == IPPROTO_IP)) {
391 		/* Tell tcp or udp that it's talking to sockets */
392 		flags |= SO_SOCKSTR;
393 
394 		/*
395 		 * Here we indicate to socktpi_open() our attempt to
396 		 * make direct calls between sockfs and transport.
397 		 * The final decision is left to socktpi_open().
398 		 */
399 		sti->sti_direct = 1;
400 
401 		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
402 		if (so->so_type == SOCK_STREAM && tso != NULL) {
403 			if (SOTOTPI(tso)->sti_direct) {
404 				/*
405 				 * Inherit sti_direct from listener and pass
406 				 * SO_ACCEPTOR open flag to tcp, indicating
407 				 * that this is an accept fast-path instance.
408 				 */
409 				flags |= SO_ACCEPTOR;
410 			} else {
411 				/*
412 				 * sti_direct is not set on listener, meaning
413 				 * that the listener has been converted from
414 				 * a socket to a stream.  Ensure that the
415 				 * acceptor inherits these settings.
416 				 */
417 				sti->sti_direct = 0;
418 				flags &= ~SO_SOCKSTR;
419 			}
420 		}
421 	}
422 
423 	/*
424 	 * Tell local transport that it is talking to sockets.
425 	 */
426 	if (so->so_family == AF_UNIX) {
427 		flags |= SO_SOCKSTR;
428 	}
429 
430 	vp = SOTOV(so);
431 	newdev = vp->v_rdev;
432 	maj = getmajor(newdev);
433 	ASSERT(STREAMSTAB(maj));
434 
435 	error = stropen(vp, &newdev, flags, cr);
436 
437 	stp = vp->v_stream;
438 	if (error == 0) {
439 		if (so->so_flag & SOCLONE)
440 			ASSERT(newdev != vp->v_rdev);
441 		mutex_enter(&so->so_lock);
442 		sti->sti_dev = newdev;
443 		vp->v_rdev = newdev;
444 		mutex_exit(&so->so_lock);
445 
446 		if (stp->sd_flag & STRISTTY) {
447 			/*
448 			 * this is a post SVR4 tty driver - a socket can not
449 			 * be a controlling terminal. Fail the open.
450 			 */
451 			(void) sotpi_close(so, flags, cr);
452 			return (ENOTTY);	/* XXX */
453 		}
454 
455 		ASSERT(stp->sd_wrq != NULL);
456 		sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
457 
458 		/*
459 		 * If caller is interested in doing direct function call
460 		 * interface to/from transport module, probe the module
461 		 * directly beneath the streamhead to see if it qualifies.
462 		 *
463 		 * We turn off the direct interface when qualifications fail.
464 		 * In the acceptor case, we simply turn off the sti_direct
465 		 * flag on the socket. We do the fallback after the accept
466 		 * has completed, before the new socket is returned to the
467 		 * application.
468 		 */
469 		if (sti->sti_direct) {
470 			queue_t *tq = stp->sd_wrq->q_next;
471 
472 			/*
473 			 * sti_direct is currently supported and tested
474 			 * only for tcp/udp; this is the main reason to
475 			 * have the following assertions.
476 			 */
477 			ASSERT(so->so_family == AF_INET ||
478 			    so->so_family == AF_INET6);
479 			ASSERT(so->so_protocol == IPPROTO_UDP ||
480 			    so->so_protocol == IPPROTO_TCP ||
481 			    so->so_protocol == IPPROTO_IP);
482 			ASSERT(so->so_type == SOCK_DGRAM ||
483 			    so->so_type == SOCK_STREAM);
484 
485 			/*
486 			 * Abort direct call interface if the module directly
487 			 * underneath the stream head is not defined with the
488 			 * _D_DIRECT flag.  This could happen in the tcp or
489 			 * udp case, when some other module is autopushed
490 			 * above it, or for some reasons the expected module
491 			 * isn't purely D_MP (which is the main requirement).
492 			 */
493 			if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
494 			    !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
495 				int rval;
496 
497 				/* Continue on without direct calls */
498 				sti->sti_direct = 0;
499 
500 				/*
501 				 * Cannot issue ioctl on fallback socket since
502 				 * there is no conn associated with the queue.
503 				 * The fallback downcall will notify the proto
504 				 * of the change.
505 				 */
506 				if (!(flags & SO_ACCEPTOR) &&
507 				    !(flags & SO_FALLBACK)) {
508 					if ((error = strioctl(vp,
509 					    _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
510 					    cr, &rval)) != 0) {
511 						(void) sotpi_close(so, flags,
512 						    cr);
513 						return (error);
514 					}
515 				}
516 			}
517 		}
518 
519 		if (flags & SO_FALLBACK) {
520 			/*
521 			 * The stream created does not have a conn.
522 			 * do stream set up after conn has been assigned
523 			 */
524 			return (error);
525 		}
526 		error = so_strinit(so, tso);
527 		if (error != 0) {
528 			(void) sotpi_close(so, flags, cr);
529 			return (error);
530 		}
531 
532 		/* Enable sendfile() on AF_UNIX streams */
533 		if (so->so_family == AF_UNIX && so->so_type == SOCK_STREAM) {
534 			mutex_enter(&so->so_lock);
535 			so->so_mode |= SM_SENDFILESUPP;
536 			mutex_exit(&so->so_lock);
537 		}
538 
539 		/* Wildcard */
540 		if (so->so_protocol != so->so_sockparams->sp_protocol) {
541 			int protocol = so->so_protocol;
542 			/*
543 			 * Issue SO_PROTOTYPE setsockopt.
544 			 */
545 			error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
546 			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
547 			if (error != 0) {
548 				(void) sotpi_close(so, flags, cr);
549 				/*
550 				 * Setsockopt often fails with ENOPROTOOPT but
551 				 * socket() should fail with
552 				 * EPROTONOSUPPORT/EPROTOTYPE.
553 				 */
554 				return (EPROTONOSUPPORT);
555 			}
556 		}
557 
558 	} else {
559 		/*
560 		 * While the same socket can not be reopened (unlike specfs)
561 		 * the stream head sets STREOPENFAIL when the autopush fails.
562 		 */
563 		if ((stp != NULL) &&
564 		    (stp->sd_flag & STREOPENFAIL)) {
565 			/*
566 			 * Open failed part way through.
567 			 */
568 			mutex_enter(&stp->sd_lock);
569 			stp->sd_flag &= ~STREOPENFAIL;
570 			mutex_exit(&stp->sd_lock);
571 			(void) sotpi_close(so, flags, cr);
572 			return (error);
573 			/*NOTREACHED*/
574 		}
575 		ASSERT(stp == NULL);
576 	}
577 	TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
578 	    "sockfs open:maj %d vp %p so %p error %d",
579 	    maj, vp, so, error);
580 	return (error);
581 }
582 
583 /*
584  * Bind the socket to an unspecified address in sockfs only.
585  * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
586  * required in all cases.
587  */
588 static void
so_automatic_bind(struct sonode * so)589 so_automatic_bind(struct sonode *so)
590 {
591 	sotpi_info_t *sti = SOTOTPI(so);
592 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
593 
594 	ASSERT(MUTEX_HELD(&so->so_lock));
595 	ASSERT(!(so->so_state & SS_ISBOUND));
596 	ASSERT(sti->sti_unbind_mp);
597 
598 	ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
599 	bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
600 	sti->sti_laddr_sa->sa_family = so->so_family;
601 	so->so_state |= SS_ISBOUND;
602 }
603 
604 
605 /*
606  * bind the socket.
607  *
608  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
609  * are passed in we allow rebinding. Note that for backwards compatibility
610  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
611  * Thus the rebinding code is currently not executed.
612  *
613  * The constraints for rebinding are:
614  * - it is a SOCK_DGRAM, or
615  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
616  *   and no listen() has been done.
617  * This rebinding code was added based on some language in the XNET book
618  * about not returning EINVAL it the protocol allows rebinding. However,
619  * this language is not present in the Posix socket draft. Thus maybe the
620  * rebinding logic should be deleted from the source.
621  *
622  * A null "name" can be used to unbind the socket if:
623  * - it is a SOCK_DGRAM, or
624  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
625  *   and no listen() has been done.
626  */
627 /* ARGSUSED */
628 static int
sotpi_bindlisten(struct sonode * so,struct sockaddr * name,socklen_t namelen,int backlog,int flags,struct cred * cr)629 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
630     socklen_t namelen, int backlog, int flags, struct cred *cr)
631 {
632 	struct T_bind_req	bind_req;
633 	struct T_bind_ack	*bind_ack;
634 	int			error = 0;
635 	mblk_t			*mp;
636 	void			*addr;
637 	t_uscalar_t		addrlen;
638 	int			unbind_on_err = 1;
639 	boolean_t		clear_acceptconn_on_err = B_FALSE;
640 	boolean_t		restore_backlog_on_err = B_FALSE;
641 	int			save_so_backlog = 0;
642 	t_scalar_t		PRIM_type = O_T_BIND_REQ;
643 	boolean_t		tcp_udp_xport;
644 	sotpi_info_t		*sti = SOTOTPI(so);
645 
646 	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
647 	    (void *)so, (void *)name, namelen, backlog, flags,
648 	    pr_state(so->so_state, so->so_mode)));
649 
650 	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
651 
652 	if (!(flags & _SOBIND_LOCK_HELD)) {
653 		mutex_enter(&so->so_lock);
654 		so_lock_single(so);	/* Set SOLOCKED */
655 	} else {
656 		ASSERT(MUTEX_HELD(&so->so_lock));
657 		ASSERT(so->so_flag & SOLOCKED);
658 	}
659 
660 	/*
661 	 * Make sure that there is a preallocated unbind_req message
662 	 * before binding. This message allocated when the socket is
663 	 * created  but it might be have been consumed.
664 	 */
665 	if (sti->sti_unbind_mp == NULL) {
666 		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
667 		/* NOTE: holding so_lock while sleeping */
668 		sti->sti_unbind_mp =
669 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
670 		    cr);
671 	}
672 
673 	if (flags & _SOBIND_REBIND) {
674 		/*
675 		 * Called from solisten after doing an sotpi_unbind() or
676 		 * potentially without the unbind (latter for AF_INET{,6}).
677 		 */
678 		ASSERT(name == NULL && namelen == 0);
679 
680 		if (so->so_family == AF_UNIX) {
681 			ASSERT(sti->sti_ux_bound_vp);
682 			addr = &sti->sti_ux_laddr;
683 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
684 			dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
685 			    "addr 0x%p, vp %p\n",
686 			    addrlen,
687 			    (void *)((struct so_ux_addr *)addr)->soua_vp,
688 			    (void *)sti->sti_ux_bound_vp));
689 		} else {
690 			addr = sti->sti_laddr_sa;
691 			addrlen = (t_uscalar_t)sti->sti_laddr_len;
692 		}
693 	} else if (flags & _SOBIND_UNSPEC) {
694 		ASSERT(name == NULL && namelen == 0);
695 
696 		/*
697 		 * The caller checked SS_ISBOUND but not necessarily
698 		 * under so_lock
699 		 */
700 		if (so->so_state & SS_ISBOUND) {
701 			/* No error */
702 			goto done;
703 		}
704 
705 		/* Set an initial local address */
706 		switch (so->so_family) {
707 		case AF_UNIX:
708 			/*
709 			 * Use an address with same size as struct sockaddr
710 			 * just like BSD.
711 			 */
712 			sti->sti_laddr_len =
713 			    (socklen_t)sizeof (struct sockaddr);
714 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
715 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
716 			sti->sti_laddr_sa->sa_family = so->so_family;
717 
718 			/*
719 			 * Pass down an address with the implicit bind
720 			 * magic number and the rest all zeros.
721 			 * The transport will return a unique address.
722 			 */
723 			sti->sti_ux_laddr.soua_vp = NULL;
724 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
725 			addr = &sti->sti_ux_laddr;
726 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
727 			break;
728 
729 		case AF_INET:
730 		case AF_INET6:
731 			/*
732 			 * An unspecified bind in TPI has a NULL address.
733 			 * Set the address in sockfs to have the sa_family.
734 			 */
735 			sti->sti_laddr_len = (so->so_family == AF_INET) ?
736 			    (socklen_t)sizeof (sin_t) :
737 			    (socklen_t)sizeof (sin6_t);
738 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
739 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
740 			sti->sti_laddr_sa->sa_family = so->so_family;
741 			addr = NULL;
742 			addrlen = 0;
743 			break;
744 
745 		default:
746 			/*
747 			 * An unspecified bind in TPI has a NULL address.
748 			 * Set the address in sockfs to be zero length.
749 			 *
750 			 * Can not assume there is a sa_family for all
751 			 * protocol families. For example, AF_X25 does not
752 			 * have a family field.
753 			 */
754 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
755 			sti->sti_laddr_len = 0;	/* XXX correct? */
756 			addr = NULL;
757 			addrlen = 0;
758 			break;
759 		}
760 
761 	} else {
762 		if (so->so_state & SS_ISBOUND) {
763 			/*
764 			 * If it is ok to rebind the socket, first unbind
765 			 * with the transport. A rebind to the NULL address
766 			 * is interpreted as an unbind.
767 			 * Note that a bind to NULL in BSD does unbind the
768 			 * socket but it fails with EINVAL.
769 			 * Note that regular sockets set SOV_SOCKBSD i.e.
770 			 * _SOBIND_SOCKBSD gets set here hence no type of
771 			 * socket does currently allow rebinding.
772 			 *
773 			 * If the name is NULL just do an unbind.
774 			 */
775 			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
776 			    name != NULL) {
777 				error = EINVAL;
778 				unbind_on_err = 0;
779 				eprintsoline(so, error);
780 				goto done;
781 			}
782 			if ((so->so_mode & SM_CONNREQUIRED) &&
783 			    (so->so_state & SS_CANTREBIND)) {
784 				error = EINVAL;
785 				unbind_on_err = 0;
786 				eprintsoline(so, error);
787 				goto done;
788 			}
789 			error = sotpi_unbind(so, 0);
790 			if (error) {
791 				eprintsoline(so, error);
792 				goto done;
793 			}
794 			ASSERT(!(so->so_state & SS_ISBOUND));
795 			if (name == NULL) {
796 				so->so_state &=
797 				    ~(SS_ISCONNECTED|SS_ISCONNECTING);
798 				goto done;
799 			}
800 		}
801 
802 		/* X/Open requires this check */
803 		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
804 			if (xnet_check_print) {
805 				printf("sockfs: X/Open bind state check "
806 				    "caused EINVAL\n");
807 			}
808 			error = EINVAL;
809 			goto done;
810 		}
811 
812 		switch (so->so_family) {
813 		case AF_UNIX:
814 			/*
815 			 * All AF_UNIX addresses are nul terminated
816 			 * when copied (copyin_name) in so the minimum
817 			 * length is 3 bytes.
818 			 */
819 			if (name == NULL ||
820 			    (ssize_t)namelen <= sizeof (short) + 1) {
821 				error = EISDIR;
822 				eprintsoline(so, error);
823 				goto done;
824 			}
825 			/*
826 			 * Verify so_family matches the bound family.
827 			 * BSD does not check this for AF_UNIX resulting
828 			 * in funny mknods.
829 			 */
830 			if (name->sa_family != so->so_family) {
831 				error = EAFNOSUPPORT;
832 				goto done;
833 			}
834 			break;
835 		case AF_INET:
836 			if (name == NULL) {
837 				error = EINVAL;
838 				eprintsoline(so, error);
839 				goto done;
840 			}
841 			if ((size_t)namelen != sizeof (sin_t)) {
842 				error = name->sa_family != so->so_family ?
843 				    EAFNOSUPPORT : EINVAL;
844 				eprintsoline(so, error);
845 				goto done;
846 			}
847 			if ((flags & _SOBIND_XPG4_2) &&
848 			    (name->sa_family != so->so_family)) {
849 				/*
850 				 * This check has to be made for X/Open
851 				 * sockets however application failures have
852 				 * been observed when it is applied to
853 				 * all sockets.
854 				 */
855 				error = EAFNOSUPPORT;
856 				eprintsoline(so, error);
857 				goto done;
858 			}
859 			/*
860 			 * Force a zero sa_family to match so_family.
861 			 *
862 			 * Some programs like inetd(8) don't set the
863 			 * family field. Other programs leave
864 			 * sin_family set to garbage - SunOS 4.X does
865 			 * not check the family field on a bind.
866 			 * We use the family field that
867 			 * was passed in to the socket() call.
868 			 */
869 			name->sa_family = so->so_family;
870 			break;
871 
872 		case AF_INET6: {
873 #ifdef DEBUG
874 			sin6_t *sin6 = (sin6_t *)name;
875 #endif /* DEBUG */
876 
877 			if (name == NULL) {
878 				error = EINVAL;
879 				eprintsoline(so, error);
880 				goto done;
881 			}
882 			if ((size_t)namelen != sizeof (sin6_t)) {
883 				error = name->sa_family != so->so_family ?
884 				    EAFNOSUPPORT : EINVAL;
885 				eprintsoline(so, error);
886 				goto done;
887 			}
888 			if (name->sa_family != so->so_family) {
889 				/*
890 				 * With IPv6 we require the family to match
891 				 * unlike in IPv4.
892 				 */
893 				error = EAFNOSUPPORT;
894 				eprintsoline(so, error);
895 				goto done;
896 			}
897 #ifdef DEBUG
898 			/*
899 			 * Verify that apps don't forget to clear
900 			 * sin6_scope_id etc
901 			 */
902 			if (sin6->sin6_scope_id != 0 &&
903 			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
904 				zcmn_err(getzoneid(), CE_WARN,
905 				    "bind with uninitialized sin6_scope_id "
906 				    "(%d) on socket. Pid = %d\n",
907 				    (int)sin6->sin6_scope_id,
908 				    (int)curproc->p_pid);
909 			}
910 			if (sin6->__sin6_src_id != 0) {
911 				zcmn_err(getzoneid(), CE_WARN,
912 				    "bind with uninitialized __sin6_src_id "
913 				    "(%d) on socket. Pid = %d\n",
914 				    (int)sin6->__sin6_src_id,
915 				    (int)curproc->p_pid);
916 			}
917 #endif /* DEBUG */
918 			break;
919 		}
920 		default:
921 			/*
922 			 * Don't do any length or sa_family check to allow
923 			 * non-sockaddr style addresses.
924 			 */
925 			if (name == NULL) {
926 				error = EINVAL;
927 				eprintsoline(so, error);
928 				goto done;
929 			}
930 			break;
931 		}
932 
933 		if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
934 			error = ENAMETOOLONG;
935 			eprintsoline(so, error);
936 			goto done;
937 		}
938 		/*
939 		 * Save local address.
940 		 */
941 		sti->sti_laddr_len = (socklen_t)namelen;
942 		ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
943 		bcopy(name, sti->sti_laddr_sa, namelen);
944 
945 		addr = sti->sti_laddr_sa;
946 		addrlen = (t_uscalar_t)sti->sti_laddr_len;
947 		switch (so->so_family) {
948 		case AF_INET6:
949 		case AF_INET:
950 			break;
951 		case AF_UNIX: {
952 			struct sockaddr_un *soun =
953 			    (struct sockaddr_un *)sti->sti_laddr_sa;
954 			struct vnode *vp, *rvp;
955 			struct vattr vattr;
956 
957 			ASSERT(sti->sti_ux_bound_vp == NULL);
958 			/*
959 			 * Create vnode for the specified path name.
960 			 * Keep vnode held with a reference in sti_ux_bound_vp.
961 			 * Use the vnode pointer as the address used in the
962 			 * bind with the transport.
963 			 *
964 			 * Use the same mode as in BSD. In particular this does
965 			 * not observe the umask.
966 			 */
967 			/* MAXPATHLEN + soun_family + nul termination */
968 			if (sti->sti_laddr_len >
969 			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
970 				error = ENAMETOOLONG;
971 				eprintsoline(so, error);
972 				goto done;
973 			}
974 			vattr.va_type = VSOCK;
975 			vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
976 			vattr.va_mask = AT_TYPE|AT_MODE;
977 			/* NOTE: holding so_lock */
978 			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
979 			    EXCL, 0, &vp, CRMKNOD, 0, 0);
980 			if (error) {
981 				if (error == EEXIST)
982 					error = EADDRINUSE;
983 				eprintsoline(so, error);
984 				goto done;
985 			}
986 			/*
987 			 * Establish pointer from the underlying filesystem
988 			 * vnode to the socket node.
989 			 * sti_ux_bound_vp and v_stream->sd_vnode form the
990 			 * cross-linkage between the underlying filesystem
991 			 * node and the socket node.
992 			 */
993 
994 			if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
995 				VN_HOLD(rvp);
996 				VN_RELE(vp);
997 				vp = rvp;
998 			}
999 
1000 			ASSERT(SOTOV(so)->v_stream);
1001 			mutex_enter(&vp->v_lock);
1002 			vp->v_stream = SOTOV(so)->v_stream;
1003 			sti->sti_ux_bound_vp = vp;
1004 			mutex_exit(&vp->v_lock);
1005 
1006 			/*
1007 			 * Use the vnode pointer value as a unique address
1008 			 * (together with the magic number to avoid conflicts
1009 			 * with implicit binds) in the transport provider.
1010 			 */
1011 			sti->sti_ux_laddr.soua_vp =
1012 			    (void *)sti->sti_ux_bound_vp;
1013 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
1014 			addr = &sti->sti_ux_laddr;
1015 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1016 			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1017 			    addrlen,
1018 			    (void *)((struct so_ux_addr *)addr)->soua_vp));
1019 			break;
1020 		}
1021 		} /* end switch (so->so_family) */
1022 	}
1023 
1024 	/*
1025 	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1026 	 * the transport can start passing up T_CONN_IND messages
1027 	 * as soon as it receives the bind req and strsock_proto()
1028 	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1029 	 */
1030 	if (flags & _SOBIND_LISTEN) {
1031 		if ((so->so_state & SS_ACCEPTCONN) == 0)
1032 			clear_acceptconn_on_err = B_TRUE;
1033 		save_so_backlog = so->so_backlog;
1034 		restore_backlog_on_err = B_TRUE;
1035 		so->so_state |= SS_ACCEPTCONN;
1036 		so->so_backlog = backlog;
1037 	}
1038 
1039 	/*
1040 	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1041 	 * for other transports we will send in a O_T_BIND_REQ.
1042 	 */
1043 	if (tcp_udp_xport &&
1044 	    (so->so_family == AF_INET || so->so_family == AF_INET6))
1045 		PRIM_type = T_BIND_REQ;
1046 
1047 	bind_req.PRIM_type = PRIM_type;
1048 	bind_req.ADDR_length = addrlen;
1049 	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1050 	bind_req.CONIND_number = backlog;
1051 	/* NOTE: holding so_lock while sleeping */
1052 	mp = soallocproto2(&bind_req, sizeof (bind_req),
1053 	    addr, addrlen, 0, _ALLOC_SLEEP, cr);
1054 	sti->sti_laddr_valid = 0;
1055 
1056 	/* Done using sti_laddr_sa - can drop the lock */
1057 	mutex_exit(&so->so_lock);
1058 
1059 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1060 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1061 	if (error) {
1062 		eprintsoline(so, error);
1063 		mutex_enter(&so->so_lock);
1064 		goto done;
1065 	}
1066 
1067 	mutex_enter(&so->so_lock);
1068 	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1069 	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1070 	if (error) {
1071 		eprintsoline(so, error);
1072 		goto done;
1073 	}
1074 	ASSERT(mp);
1075 	/*
1076 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1077 	 * strsock_proto while the lock was dropped above, the bind
1078 	 * is allowed to complete.
1079 	 */
1080 
1081 	/* Mark as bound. This will be undone if we detect errors below. */
1082 	if (flags & _SOBIND_NOXLATE) {
1083 		ASSERT(so->so_family == AF_UNIX);
1084 		sti->sti_faddr_noxlate = 1;
1085 	}
1086 	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1087 	so->so_state |= SS_ISBOUND;
1088 	ASSERT(sti->sti_unbind_mp);
1089 
1090 	/* note that we've already set SS_ACCEPTCONN above */
1091 
1092 	/*
1093 	 * Recompute addrlen - an unspecified bind sent down an
1094 	 * address of length zero but we expect the appropriate length
1095 	 * in return.
1096 	 */
1097 	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1098 	    sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1099 
1100 	bind_ack = (struct T_bind_ack *)mp->b_rptr;
1101 	/*
1102 	 * The alignment restriction is really too strict but
1103 	 * we want enough alignment to inspect the fields of
1104 	 * a sockaddr_in.
1105 	 */
1106 	addr = sogetoff(mp, bind_ack->ADDR_offset,
1107 	    bind_ack->ADDR_length,
1108 	    __TPI_ALIGN_SIZE);
1109 	if (addr == NULL) {
1110 		freemsg(mp);
1111 		error = EPROTO;
1112 		eprintsoline(so, error);
1113 		goto done;
1114 	}
1115 	if (!(flags & _SOBIND_UNSPEC)) {
1116 		/*
1117 		 * Verify that the transport didn't return something we
1118 		 * did not want e.g. an address other than what we asked for.
1119 		 *
1120 		 * NOTE: These checks would go away if/when we switch to
1121 		 * using the new TPI (in which the transport would fail
1122 		 * the request instead of assigning a different address).
1123 		 *
1124 		 * NOTE2: For protocols that we don't know (i.e. any
1125 		 * other than AF_INET6, AF_INET and AF_UNIX), we
1126 		 * cannot know if the transport should be expected to
1127 		 * return the same address as that requested.
1128 		 *
1129 		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1130 		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1131 		 *
1132 		 * For example, in the case of netatalk it may be
1133 		 * inappropriate for the transport to return the
1134 		 * requested address (as it may have allocated a local
1135 		 * port number in behaviour similar to that of an
1136 		 * AF_INET bind request with a port number of zero).
1137 		 *
1138 		 * Given the definition of O_T_BIND_REQ, where the
1139 		 * transport may bind to an address other than the
1140 		 * requested address, it's not possible to determine
1141 		 * whether a returned address that differs from the
1142 		 * requested address is a reason to fail (because the
1143 		 * requested address was not available) or succeed
1144 		 * (because the transport allocated an appropriate
1145 		 * address and/or port).
1146 		 *
1147 		 * sockfs currently requires that the transport return
1148 		 * the requested address in the T_BIND_ACK, unless
1149 		 * there is code here to allow for any discrepancy.
1150 		 * Such code exists for AF_INET and AF_INET6.
1151 		 *
1152 		 * Netatalk chooses to return the requested address
1153 		 * rather than the (correct) allocated address.  This
1154 		 * means that netatalk violates the TPI specification
1155 		 * (and would not function correctly if used from a
1156 		 * TLI application), but it does mean that it works
1157 		 * with sockfs.
1158 		 *
1159 		 * As noted above, using the newer XTI bind primitive
1160 		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1161 		 * allow sockfs to be more sure about whether or not
1162 		 * the bind request had succeeded (as transports are
1163 		 * not permitted to bind to a different address than
1164 		 * that requested - they must return failure).
1165 		 * Unfortunately, support for T_BIND_REQ may not be
1166 		 * present in all transport implementations (netatalk,
1167 		 * for example, doesn't have it), making the
1168 		 * transition difficult.
1169 		 */
1170 		if (bind_ack->ADDR_length != addrlen) {
1171 			/* Assumes that the requested address was in use */
1172 			freemsg(mp);
1173 			error = EADDRINUSE;
1174 			eprintsoline(so, error);
1175 			goto done;
1176 		}
1177 
1178 		switch (so->so_family) {
1179 		case AF_INET6:
1180 		case AF_INET: {
1181 			sin_t *rname, *aname;
1182 
1183 			rname = (sin_t *)addr;
1184 			aname = (sin_t *)sti->sti_laddr_sa;
1185 
1186 			/*
1187 			 * Take advantage of the alignment
1188 			 * of sin_port and sin6_port which fall
1189 			 * in the same place in their data structures.
1190 			 * Just use sin_port for either address family.
1191 			 *
1192 			 * This may become a problem if (heaven forbid)
1193 			 * there's a separate ipv6port_reserved... :-P
1194 			 *
1195 			 * Binding to port 0 has the semantics of letting
1196 			 * the transport bind to any port.
1197 			 *
1198 			 * If the transport is TCP or UDP since we had sent
1199 			 * a T_BIND_REQ we would not get a port other than
1200 			 * what we asked for.
1201 			 */
1202 			if (tcp_udp_xport) {
1203 				/*
1204 				 * Pick up the new port number if we bound to
1205 				 * port 0.
1206 				 */
1207 				if (aname->sin_port == 0)
1208 					aname->sin_port = rname->sin_port;
1209 				sti->sti_laddr_valid = 1;
1210 				break;
1211 			}
1212 			if (aname->sin_port != 0 &&
1213 			    aname->sin_port != rname->sin_port) {
1214 				freemsg(mp);
1215 				error = EADDRINUSE;
1216 				eprintsoline(so, error);
1217 				goto done;
1218 			}
1219 			/*
1220 			 * Pick up the new port number if we bound to port 0.
1221 			 */
1222 			aname->sin_port = rname->sin_port;
1223 
1224 			/*
1225 			 * Unfortunately, addresses aren't _quite_ the same.
1226 			 */
1227 			if (so->so_family == AF_INET) {
1228 				if (aname->sin_addr.s_addr !=
1229 				    rname->sin_addr.s_addr) {
1230 					freemsg(mp);
1231 					error = EADDRNOTAVAIL;
1232 					eprintsoline(so, error);
1233 					goto done;
1234 				}
1235 			} else {
1236 				sin6_t *rname6 = (sin6_t *)rname;
1237 				sin6_t *aname6 = (sin6_t *)aname;
1238 
1239 				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1240 				    &rname6->sin6_addr)) {
1241 					freemsg(mp);
1242 					error = EADDRNOTAVAIL;
1243 					eprintsoline(so, error);
1244 					goto done;
1245 				}
1246 			}
1247 			break;
1248 		}
1249 		case AF_UNIX:
1250 			if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1251 				freemsg(mp);
1252 				error = EADDRINUSE;
1253 				eprintsoline(so, error);
1254 				eprintso(so,
1255 				    ("addrlen %d, addr 0x%x, vp %p\n",
1256 				    addrlen, *((int *)addr),
1257 				    (void *)sti->sti_ux_bound_vp));
1258 				goto done;
1259 			}
1260 			sti->sti_laddr_valid = 1;
1261 			break;
1262 		default:
1263 			/*
1264 			 * NOTE: This assumes that addresses can be
1265 			 * byte-compared for equivalence.
1266 			 */
1267 			if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1268 				freemsg(mp);
1269 				error = EADDRINUSE;
1270 				eprintsoline(so, error);
1271 				goto done;
1272 			}
1273 			/*
1274 			 * Don't mark sti_laddr_valid, as we cannot be
1275 			 * sure that the returned address is the real
1276 			 * bound address when talking to an unknown
1277 			 * transport.
1278 			 */
1279 			break;
1280 		}
1281 	} else {
1282 		/*
1283 		 * Save the returned address for getsockname.
1284 		 * Needed for unspecific bind unless transport supports
1285 		 * the TI_GETMYNAME ioctl.
1286 		 * Do this for AF_INET{,6} even though they do, as
1287 		 * caching info here is much better performance than
1288 		 * a TPI/STREAMS trip to the transport for getsockname.
1289 		 * Any which can't for some reason _must_ _not_ set
1290 		 * sti_laddr_valid here for the caching version of
1291 		 * getsockname to not break;
1292 		 */
1293 		switch (so->so_family) {
1294 		case AF_UNIX:
1295 			/*
1296 			 * Record the address bound with the transport
1297 			 * for use by socketpair.
1298 			 */
1299 			bcopy(addr, &sti->sti_ux_laddr, addrlen);
1300 			sti->sti_laddr_valid = 1;
1301 			break;
1302 		case AF_INET:
1303 		case AF_INET6:
1304 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1305 			bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1306 			sti->sti_laddr_valid = 1;
1307 			break;
1308 		default:
1309 			/*
1310 			 * Don't mark sti_laddr_valid, as we cannot be
1311 			 * sure that the returned address is the real
1312 			 * bound address when talking to an unknown
1313 			 * transport.
1314 			 */
1315 			break;
1316 		}
1317 	}
1318 
1319 	freemsg(mp);
1320 
1321 done:
1322 	if (error) {
1323 		/* reset state & backlog to values held on entry */
1324 		if (clear_acceptconn_on_err == B_TRUE)
1325 			so->so_state &= ~SS_ACCEPTCONN;
1326 		if (restore_backlog_on_err == B_TRUE)
1327 			so->so_backlog = save_so_backlog;
1328 
1329 		if (unbind_on_err && so->so_state & SS_ISBOUND) {
1330 			int err;
1331 
1332 			err = sotpi_unbind(so, 0);
1333 			/* LINTED - statement has no consequent: if */
1334 			if (err) {
1335 				eprintsoline(so, error);
1336 			} else {
1337 				ASSERT(!(so->so_state & SS_ISBOUND));
1338 			}
1339 		}
1340 	}
1341 	if (!(flags & _SOBIND_LOCK_HELD)) {
1342 		so_unlock_single(so, SOLOCKED);
1343 		mutex_exit(&so->so_lock);
1344 	} else {
1345 		ASSERT(MUTEX_HELD(&so->so_lock));
1346 		ASSERT(so->so_flag & SOLOCKED);
1347 	}
1348 	return (error);
1349 }
1350 
1351 /* bind the socket */
1352 static int
sotpi_bind(struct sonode * so,struct sockaddr * name,socklen_t namelen,int flags,struct cred * cr)1353 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1354     int flags, struct cred *cr)
1355 {
1356 	if ((flags & _SOBIND_SOCKETPAIR) == 0)
1357 		return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1358 
1359 	flags &= ~_SOBIND_SOCKETPAIR;
1360 	return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1361 }
1362 
1363 /*
1364  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1365  * address, or when listen needs to unbind and bind.
1366  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1367  * so that a sobind can pick them up.
1368  */
1369 static int
sotpi_unbind(struct sonode * so,int flags)1370 sotpi_unbind(struct sonode *so, int flags)
1371 {
1372 	struct T_unbind_req	unbind_req;
1373 	int			error = 0;
1374 	mblk_t			*mp;
1375 	sotpi_info_t		*sti = SOTOTPI(so);
1376 
1377 	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1378 	    (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1379 
1380 	ASSERT(MUTEX_HELD(&so->so_lock));
1381 	ASSERT(so->so_flag & SOLOCKED);
1382 
1383 	if (!(so->so_state & SS_ISBOUND)) {
1384 		error = EINVAL;
1385 		eprintsoline(so, error);
1386 		goto done;
1387 	}
1388 
1389 	mutex_exit(&so->so_lock);
1390 
1391 	/*
1392 	 * Flush the read and write side (except stream head read queue)
1393 	 * and send down T_UNBIND_REQ.
1394 	 */
1395 	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1396 
1397 	unbind_req.PRIM_type = T_UNBIND_REQ;
1398 	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1399 	    0, _ALLOC_SLEEP, CRED());
1400 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1401 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1402 	mutex_enter(&so->so_lock);
1403 	if (error) {
1404 		eprintsoline(so, error);
1405 		goto done;
1406 	}
1407 
1408 	error = sowaitokack(so, T_UNBIND_REQ);
1409 	if (error) {
1410 		eprintsoline(so, error);
1411 		goto done;
1412 	}
1413 
1414 	/*
1415 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1416 	 * strsock_proto while the lock was dropped above, the unbind
1417 	 * is allowed to complete.
1418 	 */
1419 	if (!(flags & _SOUNBIND_REBIND)) {
1420 		/*
1421 		 * Clear out bound address.
1422 		 */
1423 		vnode_t *vp;
1424 
1425 		if ((vp = sti->sti_ux_bound_vp) != NULL) {
1426 			sti->sti_ux_bound_vp = NULL;
1427 			vn_rele_stream(vp);
1428 		}
1429 		/* Clear out address */
1430 		sti->sti_laddr_len = 0;
1431 	}
1432 	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1433 	sti->sti_laddr_valid = 0;
1434 
1435 done:
1436 
1437 	/* If the caller held the lock don't release it here */
1438 	ASSERT(MUTEX_HELD(&so->so_lock));
1439 	ASSERT(so->so_flag & SOLOCKED);
1440 
1441 	return (error);
1442 }
1443 
1444 /*
1445  * listen on the socket.
1446  * For TPI conforming transports this has to first unbind with the transport
1447  * and then bind again using the new backlog.
1448  */
1449 /* ARGSUSED */
1450 int
sotpi_listen(struct sonode * so,int backlog,struct cred * cr)1451 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1452 {
1453 	int		error = 0;
1454 	sotpi_info_t	*sti = SOTOTPI(so);
1455 
1456 	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1457 	    (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1458 
1459 	if (sti->sti_serv_type == T_CLTS)
1460 		return (EOPNOTSUPP);
1461 
1462 	/*
1463 	 * If the socket is ready to accept connections already, then
1464 	 * return without doing anything.  This avoids a problem where
1465 	 * a second listen() call fails if a connection is pending and
1466 	 * leaves the socket unbound. Only when we are not unbinding
1467 	 * with the transport can we safely increase the backlog.
1468 	 */
1469 	if (so->so_state & SS_ACCEPTCONN &&
1470 	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1471 	    /*CONSTCOND*/
1472 	    !solisten_tpi_tcp))
1473 		return (0);
1474 
1475 	if (so->so_state & SS_ISCONNECTED)
1476 		return (EINVAL);
1477 
1478 	mutex_enter(&so->so_lock);
1479 	so_lock_single(so);	/* Set SOLOCKED */
1480 
1481 	/*
1482 	 * If the listen doesn't change the backlog we do nothing.
1483 	 * This avoids an EPROTO error from the transport.
1484 	 */
1485 	if ((so->so_state & SS_ACCEPTCONN) &&
1486 	    so->so_backlog == backlog)
1487 		goto done;
1488 
1489 	if (!(so->so_state & SS_ISBOUND)) {
1490 		/*
1491 		 * Must have been explicitly bound in the UNIX domain.
1492 		 */
1493 		if (so->so_family == AF_UNIX) {
1494 			error = EINVAL;
1495 			goto done;
1496 		}
1497 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1498 		    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1499 	} else if (backlog > 0) {
1500 		/*
1501 		 * AF_INET{,6} hack to avoid losing the port.
1502 		 * Assumes that all AF_INET{,6} transports can handle a
1503 		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1504 		 * has already bound thus it is possible to avoid the unbind.
1505 		 */
1506 		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1507 		    /*CONSTCOND*/
1508 		    !solisten_tpi_tcp)) {
1509 			error = sotpi_unbind(so, _SOUNBIND_REBIND);
1510 			if (error)
1511 				goto done;
1512 		}
1513 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1514 		    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1515 	} else {
1516 		so->so_state |= SS_ACCEPTCONN;
1517 		so->so_backlog = backlog;
1518 	}
1519 	if (error)
1520 		goto done;
1521 	ASSERT(so->so_state & SS_ACCEPTCONN);
1522 done:
1523 	so_unlock_single(so, SOLOCKED);
1524 	mutex_exit(&so->so_lock);
1525 	return (error);
1526 }
1527 
1528 /*
1529  * Disconnect either a specified seqno or all (-1).
1530  * The former is used on listening sockets only.
1531  *
1532  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1533  * the current use of sodisconnect(seqno == -1) is only for shutdown
1534  * so there is no point (and potentially incorrect) to unbind.
1535  */
1536 static int
sodisconnect(struct sonode * so,t_scalar_t seqno,int flags)1537 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1538 {
1539 	struct T_discon_req	discon_req;
1540 	int			error = 0;
1541 	mblk_t			*mp;
1542 
1543 	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1544 	    (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1545 
1546 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1547 		mutex_enter(&so->so_lock);
1548 		so_lock_single(so);	/* Set SOLOCKED */
1549 	} else {
1550 		ASSERT(MUTEX_HELD(&so->so_lock));
1551 		ASSERT(so->so_flag & SOLOCKED);
1552 	}
1553 
1554 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1555 		error = EINVAL;
1556 		eprintsoline(so, error);
1557 		goto done;
1558 	}
1559 
1560 	mutex_exit(&so->so_lock);
1561 	/*
1562 	 * Flush the write side (unless this is a listener)
1563 	 * and then send down a T_DISCON_REQ.
1564 	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1565 	 * and other messages.)
1566 	 */
1567 	if (!(so->so_state & SS_ACCEPTCONN))
1568 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1569 
1570 	discon_req.PRIM_type = T_DISCON_REQ;
1571 	discon_req.SEQ_number = seqno;
1572 	mp = soallocproto1(&discon_req, sizeof (discon_req),
1573 	    0, _ALLOC_SLEEP, CRED());
1574 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1575 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1576 	mutex_enter(&so->so_lock);
1577 	if (error) {
1578 		eprintsoline(so, error);
1579 		goto done;
1580 	}
1581 
1582 	error = sowaitokack(so, T_DISCON_REQ);
1583 	if (error) {
1584 		eprintsoline(so, error);
1585 		goto done;
1586 	}
1587 	/*
1588 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1589 	 * strsock_proto while the lock was dropped above, the disconnect
1590 	 * is allowed to complete. However, it is not possible to
1591 	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1592 	 */
1593 	so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
1594 	SOTOTPI(so)->sti_laddr_valid = 0;
1595 	SOTOTPI(so)->sti_faddr_valid = 0;
1596 done:
1597 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1598 		so_unlock_single(so, SOLOCKED);
1599 		mutex_exit(&so->so_lock);
1600 	} else {
1601 		/* If the caller held the lock don't release it here */
1602 		ASSERT(MUTEX_HELD(&so->so_lock));
1603 		ASSERT(so->so_flag & SOLOCKED);
1604 	}
1605 	return (error);
1606 }
1607 
1608 /* ARGSUSED */
1609 int
sotpi_accept(struct sonode * so,int fflag,struct cred * cr,struct sonode ** nsop)1610 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1611     struct sonode **nsop)
1612 {
1613 	struct T_conn_ind	*conn_ind;
1614 	struct T_conn_res	*conn_res;
1615 	int			error = 0;
1616 	mblk_t			*mp, *ack_mp;
1617 	struct sonode		*nso;
1618 	vnode_t			*nvp;
1619 	void			*src;
1620 	t_uscalar_t		srclen;
1621 	void			*opt;
1622 	t_uscalar_t		optlen;
1623 	t_scalar_t		PRIM_type;
1624 	t_scalar_t		SEQ_number;
1625 	size_t			sinlen;
1626 	sotpi_info_t		*sti = SOTOTPI(so);
1627 	sotpi_info_t		*nsti;
1628 
1629 	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1630 	    (void *)so, fflag, (void *)nsop,
1631 	    pr_state(so->so_state, so->so_mode)));
1632 
1633 	/*
1634 	 * Defer single-threading the accepting socket until
1635 	 * the T_CONN_IND has been received and parsed and the
1636 	 * new sonode has been opened.
1637 	 */
1638 
1639 	/* Check that we are not already connected */
1640 	if ((so->so_state & SS_ACCEPTCONN) == 0)
1641 		goto conn_bad;
1642 
1643 	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1644 		goto e_bad;
1645 
1646 	ASSERT(mp != NULL);
1647 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1648 
1649 	/*
1650 	 * Save SEQ_number for error paths.
1651 	 */
1652 	SEQ_number = conn_ind->SEQ_number;
1653 
1654 	srclen = conn_ind->SRC_length;
1655 	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1656 	if (src == NULL) {
1657 		error = EPROTO;
1658 		freemsg(mp);
1659 		eprintsoline(so, error);
1660 		goto disconnect_unlocked;
1661 	}
1662 	optlen = conn_ind->OPT_length;
1663 	switch (so->so_family) {
1664 	case AF_INET:
1665 	case AF_INET6:
1666 		if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1667 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
1668 			    &opt, conn_ind->OPT_length);
1669 		} else {
1670 			/*
1671 			 * The transport (in this case TCP) hasn't sent up
1672 			 * a pointer to an instance for the accept fast-path.
1673 			 * Disable fast-path completely because the call to
1674 			 * sotpi_create() below would otherwise create an
1675 			 * incomplete TCP instance, which would lead to
1676 			 * problems when sockfs sends a normal T_CONN_RES
1677 			 * message down the new stream.
1678 			 */
1679 			if (sti->sti_direct) {
1680 				int rval;
1681 				/*
1682 				 * For consistency we inform tcp to disable
1683 				 * direct interface on the listener, though
1684 				 * we can certainly live without doing this
1685 				 * because no data will ever travel upstream
1686 				 * on the listening socket.
1687 				 */
1688 				sti->sti_direct = 0;
1689 				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1690 				    0, 0, K_TO_K, cr, &rval);
1691 			}
1692 			opt = NULL;
1693 			optlen = 0;
1694 		}
1695 		break;
1696 	case AF_UNIX:
1697 	default:
1698 		if (optlen != 0) {
1699 			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1700 			    __TPI_ALIGN_SIZE);
1701 			if (opt == NULL) {
1702 				error = EPROTO;
1703 				freemsg(mp);
1704 				eprintsoline(so, error);
1705 				goto disconnect_unlocked;
1706 			}
1707 		}
1708 		if (so->so_family == AF_UNIX) {
1709 			if (!sti->sti_faddr_noxlate) {
1710 				src = NULL;
1711 				srclen = 0;
1712 			}
1713 			/* Extract src address from options */
1714 			if (optlen != 0)
1715 				so_getopt_srcaddr(opt, optlen, &src, &srclen);
1716 		}
1717 		break;
1718 	}
1719 
1720 	/*
1721 	 * Create the new socket.
1722 	 */
1723 	nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1724 	if (nso == NULL) {
1725 		ASSERT(error != 0);
1726 		/*
1727 		 * Accept can not fail with ENOBUFS. sotpi_create
1728 		 * sleeps waiting for memory until a signal is caught
1729 		 * so return EINTR.
1730 		 */
1731 		freemsg(mp);
1732 		if (error == ENOBUFS)
1733 			error = EINTR;
1734 		goto e_disc_unl;
1735 	}
1736 	nvp = SOTOV(nso);
1737 	nsti = SOTOTPI(nso);
1738 
1739 #ifdef DEBUG
1740 	/*
1741 	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1742 	 * it's inherited early to allow debugging of the accept code itself.
1743 	 */
1744 	nso->so_options |= so->so_options & SO_DEBUG;
1745 #endif /* DEBUG */
1746 
1747 	/*
1748 	 * Save the SRC address from the T_CONN_IND
1749 	 * for getpeername to work on AF_UNIX and on transports that do not
1750 	 * support TI_GETPEERNAME.
1751 	 *
1752 	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1753 	 * copyin_name().
1754 	 */
1755 	if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1756 		error = EINVAL;
1757 		freemsg(mp);
1758 		eprintsoline(so, error);
1759 		goto disconnect_vp_unlocked;
1760 	}
1761 	nsti->sti_faddr_len = (socklen_t)srclen;
1762 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1763 	bcopy(src, nsti->sti_faddr_sa, srclen);
1764 	nsti->sti_faddr_valid = 1;
1765 
1766 	/*
1767 	 * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1768 	 */
1769 	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1770 	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1771 		cred_t	*cr;
1772 		pid_t	cpid;
1773 
1774 		cr = msg_getcred(mp, &cpid);
1775 		if (cr != NULL) {
1776 			crhold(cr);
1777 			nso->so_peercred = cr;
1778 			nso->so_cpid = cpid;
1779 		}
1780 		freemsg(mp);
1781 
1782 		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1783 		    sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1784 		if (mp == NULL) {
1785 			/*
1786 			 * Accept can not fail with ENOBUFS.
1787 			 * A signal was caught so return EINTR.
1788 			 */
1789 			error = EINTR;
1790 			eprintsoline(so, error);
1791 			goto disconnect_vp_unlocked;
1792 		}
1793 		conn_res = (struct T_conn_res *)mp->b_rptr;
1794 	} else {
1795 		/*
1796 		 * For efficiency reasons we use msg_extractcred; no crhold
1797 		 * needed since db_credp is cleared (i.e., we move the cred
1798 		 * from the message to so_peercred).
1799 		 */
1800 		nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1801 
1802 		mp->b_rptr = DB_BASE(mp);
1803 		conn_res = (struct T_conn_res *)mp->b_rptr;
1804 		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1805 
1806 		mblk_setcred(mp, cr, curproc->p_pid);
1807 	}
1808 
1809 	/*
1810 	 * New socket must be bound at least in sockfs and, except for AF_INET,
1811 	 * (or AF_INET6) it also has to be bound in the transport provider.
1812 	 * We set the local address in the sonode from the T_OK_ACK of the
1813 	 * T_CONN_RES. For this reason the address we bind to here isn't
1814 	 * important.
1815 	 */
1816 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1817 	    /*CONSTCOND*/
1818 	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1819 		/*
1820 		 * Optimization for AF_INET{,6} transports
1821 		 * that can handle a T_CONN_RES without being bound.
1822 		 */
1823 		mutex_enter(&nso->so_lock);
1824 		so_automatic_bind(nso);
1825 		mutex_exit(&nso->so_lock);
1826 	} else {
1827 		/* Perform NULL bind with the transport provider. */
1828 		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1829 		    cr)) != 0) {
1830 			ASSERT(error != ENOBUFS);
1831 			freemsg(mp);
1832 			eprintsoline(nso, error);
1833 			goto disconnect_vp_unlocked;
1834 		}
1835 	}
1836 
1837 	/*
1838 	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1839 	 * so that any data arriving on the new socket will cause the
1840 	 * appropriate signals to be delivered for the new socket.
1841 	 *
1842 	 * No other thread (except strsock_proto and strsock_misc)
1843 	 * can access the new socket thus we relax the locking.
1844 	 */
1845 	nso->so_pgrp = so->so_pgrp;
1846 	nso->so_state |= so->so_state & SS_ASYNC;
1847 	nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1848 
1849 	if (nso->so_pgrp != 0) {
1850 		if ((error = so_set_events(nso, nvp, cr)) != 0) {
1851 			eprintsoline(nso, error);
1852 			error = 0;
1853 			nso->so_pgrp = 0;
1854 		}
1855 	}
1856 
1857 	/*
1858 	 * Make note of the socket level options. TCP and IP level options
1859 	 * are already inherited. We could do all this after accept is
1860 	 * successful but doing it here simplifies code and no harm done
1861 	 * for error case.
1862 	 */
1863 	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1864 	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1865 	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1866 	nso->so_sndbuf = so->so_sndbuf;
1867 	nso->so_rcvbuf = so->so_rcvbuf;
1868 	if (nso->so_options & SO_LINGER)
1869 		nso->so_linger = so->so_linger;
1870 
1871 	/*
1872 	 * Note that the following sti_direct code path should be
1873 	 * removed once we are confident that the direct sockets
1874 	 * do not result in any degradation.
1875 	 */
1876 	if (sti->sti_direct) {
1877 
1878 		ASSERT(opt != NULL);
1879 
1880 		conn_res->OPT_length = optlen;
1881 		conn_res->OPT_offset = MBLKL(mp);
1882 		bcopy(&opt, mp->b_wptr, optlen);
1883 		mp->b_wptr += optlen;
1884 		conn_res->PRIM_type = T_CONN_RES;
1885 		conn_res->ACCEPTOR_id = 0;
1886 		PRIM_type = T_CONN_RES;
1887 
1888 		/* Send down the T_CONN_RES on acceptor STREAM */
1889 		error = kstrputmsg(SOTOV(nso), mp, NULL,
1890 		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1891 		if (error) {
1892 			mutex_enter(&so->so_lock);
1893 			so_lock_single(so);
1894 			eprintsoline(so, error);
1895 			goto disconnect_vp;
1896 		}
1897 		mutex_enter(&nso->so_lock);
1898 		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1899 		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1900 		if (error) {
1901 			mutex_exit(&nso->so_lock);
1902 			mutex_enter(&so->so_lock);
1903 			so_lock_single(so);
1904 			eprintsoline(so, error);
1905 			goto disconnect_vp;
1906 		}
1907 		if (nso->so_family == AF_INET) {
1908 			sin_t *sin;
1909 
1910 			sin = (sin_t *)(ack_mp->b_rptr +
1911 			    sizeof (struct T_ok_ack));
1912 			bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
1913 			nsti->sti_laddr_len = sizeof (sin_t);
1914 		} else {
1915 			sin6_t *sin6;
1916 
1917 			sin6 = (sin6_t *)(ack_mp->b_rptr +
1918 			    sizeof (struct T_ok_ack));
1919 			bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
1920 			nsti->sti_laddr_len = sizeof (sin6_t);
1921 		}
1922 		freemsg(ack_mp);
1923 
1924 		nso->so_state |= SS_ISCONNECTED;
1925 		nso->so_proto_handle = (sock_lower_handle_t)opt;
1926 		nsti->sti_laddr_valid = 1;
1927 
1928 		mutex_exit(&nso->so_lock);
1929 
1930 		/*
1931 		 * It's possible, through the use of autopush for example,
1932 		 * that the acceptor stream may not support sti_direct
1933 		 * semantics. If the new socket does not support sti_direct
1934 		 * we issue a _SIOCSOCKFALLBACK to inform the transport
1935 		 * as we would in the I_PUSH case.
1936 		 */
1937 		if (nsti->sti_direct == 0) {
1938 			int	rval;
1939 
1940 			if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
1941 			    0, 0, K_TO_K, cr, &rval)) != 0) {
1942 				mutex_enter(&so->so_lock);
1943 				so_lock_single(so);
1944 				eprintsoline(so, error);
1945 				goto disconnect_vp;
1946 			}
1947 		}
1948 
1949 		/*
1950 		 * Pass out new socket.
1951 		 */
1952 		if (nsop != NULL)
1953 			*nsop = nso;
1954 
1955 		return (0);
1956 	}
1957 
1958 	/*
1959 	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
1960 	 * which don't support the FireEngine accept fast-path. It is also
1961 	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
1962 	 * again. Neither sockfs nor TCP attempt to find out if some other
1963 	 * random module has been inserted in between (in which case we
1964 	 * should follow TLI accept behaviour). We blindly assume the worst
1965 	 * case and revert back to old behaviour i.e. TCP will not send us
1966 	 * any option (eager) and the accept should happen on the listener
1967 	 * queue. Any queued T_conn_ind have already got their options removed
1968 	 * by so_sock2_stream() when "sockmod" was I_POP'd.
1969 	 */
1970 	/*
1971 	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
1972 	 */
1973 	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
1974 #ifdef	_ILP32
1975 		queue_t	*q;
1976 
1977 		/*
1978 		 * Find read queue in driver
1979 		 * Can safely do this since we "own" nso/nvp.
1980 		 */
1981 		q = strvp2wq(nvp)->q_next;
1982 		while (SAMESTR(q))
1983 			q = q->q_next;
1984 		q = RD(q);
1985 		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
1986 #else
1987 		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
1988 #endif	/* _ILP32 */
1989 		conn_res->PRIM_type = O_T_CONN_RES;
1990 		PRIM_type = O_T_CONN_RES;
1991 	} else {
1992 		conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
1993 		conn_res->PRIM_type = T_CONN_RES;
1994 		PRIM_type = T_CONN_RES;
1995 	}
1996 	conn_res->SEQ_number = SEQ_number;
1997 	conn_res->OPT_length = 0;
1998 	conn_res->OPT_offset = 0;
1999 
2000 	mutex_enter(&so->so_lock);
2001 	so_lock_single(so);	/* Set SOLOCKED */
2002 	mutex_exit(&so->so_lock);
2003 
2004 	error = kstrputmsg(SOTOV(so), mp, NULL,
2005 	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2006 	mutex_enter(&so->so_lock);
2007 	if (error) {
2008 		eprintsoline(so, error);
2009 		goto disconnect_vp;
2010 	}
2011 	error = sowaitprim(so, PRIM_type, T_OK_ACK,
2012 	    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2013 	if (error) {
2014 		eprintsoline(so, error);
2015 		goto disconnect_vp;
2016 	}
2017 	mutex_exit(&so->so_lock);
2018 	/*
2019 	 * If there is a sin/sin6 appended onto the T_OK_ACK use
2020 	 * that to set the local address. If this is not present
2021 	 * then we zero out the address and don't set the
2022 	 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
2023 	 * the pathname from the listening socket.
2024 	 * In the case where this is TCP or an AF_UNIX socket the
2025 	 * client side may have queued data or a T_ORDREL in the
2026 	 * transport. Having now sent the T_CONN_RES we may receive
2027 	 * those queued messages at any time. Hold the acceptor
2028 	 * so_lock until its state and laddr are finalized.
2029 	 */
2030 	mutex_enter(&nso->so_lock);
2031 	sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
2032 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
2033 	    MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2034 		ack_mp->b_rptr += sizeof (struct T_ok_ack);
2035 		bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2036 		nsti->sti_laddr_len = sinlen;
2037 		nsti->sti_laddr_valid = 1;
2038 	} else if (nso->so_family == AF_UNIX) {
2039 		ASSERT(so->so_family == AF_UNIX);
2040 		nsti->sti_laddr_len = sti->sti_laddr_len;
2041 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2042 		bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2043 		    nsti->sti_laddr_len);
2044 		nsti->sti_laddr_valid = 1;
2045 	} else {
2046 		nsti->sti_laddr_len = sti->sti_laddr_len;
2047 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2048 		bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2049 		nsti->sti_laddr_sa->sa_family = nso->so_family;
2050 	}
2051 	nso->so_state |= SS_ISCONNECTED;
2052 	mutex_exit(&nso->so_lock);
2053 
2054 	freemsg(ack_mp);
2055 
2056 	mutex_enter(&so->so_lock);
2057 	so_unlock_single(so, SOLOCKED);
2058 	mutex_exit(&so->so_lock);
2059 
2060 	/*
2061 	 * Pass out new socket.
2062 	 */
2063 	if (nsop != NULL)
2064 		*nsop = nso;
2065 
2066 	return (0);
2067 
2068 e_disc_unl:
2069 	eprintsoline(so, error);
2070 	goto disconnect_unlocked;
2071 
2072 disconnect_vp_unlocked:
2073 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2074 	VN_RELE(nvp);
2075 disconnect_unlocked:
2076 	(void) sodisconnect(so, SEQ_number, 0);
2077 	return (error);
2078 
2079 disconnect_vp:
2080 	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2081 	so_unlock_single(so, SOLOCKED);
2082 	mutex_exit(&so->so_lock);
2083 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2084 	VN_RELE(nvp);
2085 	return (error);
2086 
2087 conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2088 	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2089 	    ? EOPNOTSUPP : EINVAL;
2090 e_bad:
2091 	eprintsoline(so, error);
2092 	return (error);
2093 }
2094 
2095 /*
2096  * connect a socket.
2097  *
2098  * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
2099  * unconnect (by specifying a null address).
2100  */
2101 int
sotpi_connect(struct sonode * so,struct sockaddr * name,socklen_t namelen,int fflag,int flags,struct cred * cr)2102 sotpi_connect(struct sonode *so,
2103     struct sockaddr *name,
2104     socklen_t namelen,
2105     int fflag,
2106     int flags,
2107     struct cred *cr)
2108 {
2109 	struct T_conn_req	conn_req;
2110 	int			error = 0;
2111 	mblk_t			*mp;
2112 	void			*src;
2113 	socklen_t		srclen;
2114 	void			*addr;
2115 	socklen_t		addrlen;
2116 	boolean_t		need_unlock;
2117 	sotpi_info_t		*sti = SOTOTPI(so);
2118 
2119 	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2120 	    (void *)so, (void *)name, namelen, fflag, flags,
2121 	    pr_state(so->so_state, so->so_mode)));
2122 
2123 	/*
2124 	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2125 	 * avoid sleeping for memory with SOLOCKED held.
2126 	 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2127 	 * + sizeof (struct T_opthdr).
2128 	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
2129 	 * exceed sti_faddr_maxlen).
2130 	 */
2131 	mp = soallocproto(sizeof (struct T_conn_req) +
2132 	    2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
2133 	    cr);
2134 	if (mp == NULL) {
2135 		/*
2136 		 * Connect can not fail with ENOBUFS. A signal was
2137 		 * caught so return EINTR.
2138 		 */
2139 		error = EINTR;
2140 		eprintsoline(so, error);
2141 		return (error);
2142 	}
2143 
2144 	mutex_enter(&so->so_lock);
2145 	/*
2146 	 * Make sure there is a preallocated T_unbind_req message
2147 	 * before any binding. This message is allocated when the
2148 	 * socket is created. Since another thread can consume
2149 	 * so_unbind_mp by the time we return from so_lock_single(),
2150 	 * we should check the availability of so_unbind_mp after
2151 	 * we return from so_lock_single().
2152 	 */
2153 
2154 	so_lock_single(so);	/* Set SOLOCKED */
2155 	need_unlock = B_TRUE;
2156 
2157 	if (sti->sti_unbind_mp == NULL) {
2158 		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2159 		/* NOTE: holding so_lock while sleeping */
2160 		sti->sti_unbind_mp =
2161 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
2162 		if (sti->sti_unbind_mp == NULL) {
2163 			error = EINTR;
2164 			goto done;
2165 		}
2166 	}
2167 
2168 	/*
2169 	 * Can't have done a listen before connecting.
2170 	 */
2171 	if (so->so_state & SS_ACCEPTCONN) {
2172 		error = EOPNOTSUPP;
2173 		goto done;
2174 	}
2175 
2176 	/*
2177 	 * Must be bound with the transport
2178 	 */
2179 	if (!(so->so_state & SS_ISBOUND)) {
2180 		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2181 		    /*CONSTCOND*/
2182 		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2183 			/*
2184 			 * Optimization for AF_INET{,6} transports
2185 			 * that can handle a T_CONN_REQ without being bound.
2186 			 */
2187 			so_automatic_bind(so);
2188 		} else {
2189 			error = sotpi_bind(so, NULL, 0,
2190 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2191 			if (error)
2192 				goto done;
2193 		}
2194 		ASSERT(so->so_state & SS_ISBOUND);
2195 		flags |= _SOCONNECT_DID_BIND;
2196 	}
2197 
2198 	/*
2199 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
2200 	 * connect to a null address. This is the portable method to
2201 	 * unconnect a socket.
2202 	 */
2203 	if ((namelen >= sizeof (sa_family_t)) &&
2204 	    (name->sa_family == AF_UNSPEC)) {
2205 		name = NULL;
2206 		namelen = 0;
2207 	}
2208 
2209 	/*
2210 	 * Check that we are not already connected.
2211 	 * A connection-oriented socket cannot be reconnected.
2212 	 * A connected connection-less socket can be
2213 	 * - connected to a different address by a subsequent connect
2214 	 * - "unconnected" by a connect to the NULL address
2215 	 */
2216 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2217 		ASSERT(!(flags & _SOCONNECT_DID_BIND));
2218 		if (so->so_mode & SM_CONNREQUIRED) {
2219 			/* Connection-oriented socket */
2220 			error = so->so_state & SS_ISCONNECTED ?
2221 			    EISCONN : EALREADY;
2222 			goto done;
2223 		}
2224 		/* Connection-less socket */
2225 		if (name == NULL) {
2226 			/*
2227 			 * Remove the connected state and clear SO_DGRAM_ERRIND
2228 			 * since it was set when the socket was connected.
2229 			 * If this is UDP also send down a T_DISCON_REQ.
2230 			 */
2231 			int val;
2232 
2233 			if ((so->so_family == AF_INET ||
2234 			    so->so_family == AF_INET6) &&
2235 			    (so->so_type == SOCK_DGRAM ||
2236 			    so->so_type == SOCK_RAW) &&
2237 			    /*CONSTCOND*/
2238 			    !soconnect_tpi_udp) {
2239 				/* XXX What about implicitly unbinding here? */
2240 				error = sodisconnect(so, -1,
2241 				    _SODISCONNECT_LOCK_HELD);
2242 			} else {
2243 				so->so_state &=
2244 				    ~(SS_ISCONNECTED | SS_ISCONNECTING);
2245 				sti->sti_faddr_valid = 0;
2246 				sti->sti_faddr_len = 0;
2247 			}
2248 
2249 			/* Remove SOLOCKED since setsockopt will grab it */
2250 			so_unlock_single(so, SOLOCKED);
2251 			mutex_exit(&so->so_lock);
2252 
2253 			val = 0;
2254 			(void) sotpi_setsockopt(so, SOL_SOCKET,
2255 			    SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2256 			    cr);
2257 
2258 			mutex_enter(&so->so_lock);
2259 			so_lock_single(so);	/* Set SOLOCKED */
2260 			goto done;
2261 		}
2262 	}
2263 	ASSERT(so->so_state & SS_ISBOUND);
2264 
2265 	if (name == NULL || namelen == 0) {
2266 		error = EINVAL;
2267 		goto done;
2268 	}
2269 	/*
2270 	 * Mark the socket if sti_faddr_sa represents the transport level
2271 	 * address.
2272 	 */
2273 	if (flags & _SOCONNECT_NOXLATE) {
2274 		struct sockaddr_ux	*soaddr_ux;
2275 
2276 		ASSERT(so->so_family == AF_UNIX);
2277 		if (namelen != sizeof (struct sockaddr_ux)) {
2278 			error = EINVAL;
2279 			goto done;
2280 		}
2281 		soaddr_ux = (struct sockaddr_ux *)name;
2282 		name = (struct sockaddr *)&soaddr_ux->sou_addr;
2283 		namelen = sizeof (soaddr_ux->sou_addr);
2284 		sti->sti_faddr_noxlate = 1;
2285 	}
2286 
2287 	/*
2288 	 * Length and family checks.
2289 	 */
2290 	error = so_addr_verify(so, name, namelen);
2291 	if (error)
2292 		goto bad;
2293 
2294 	/*
2295 	 * Save foreign address. Needed for AF_UNIX as well as
2296 	 * transport providers that do not support TI_GETPEERNAME.
2297 	 * Also used for cached foreign address for TCP and UDP.
2298 	 */
2299 	if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2300 		error = EINVAL;
2301 		goto done;
2302 	}
2303 	sti->sti_faddr_len = (socklen_t)namelen;
2304 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2305 	bcopy(name, sti->sti_faddr_sa, namelen);
2306 	sti->sti_faddr_valid = 1;
2307 
2308 	if (so->so_family == AF_UNIX) {
2309 		if (sti->sti_faddr_noxlate) {
2310 			/*
2311 			 * sti_faddr is a transport-level address, so
2312 			 * don't pass it as an option.  Do save it in
2313 			 * sti_ux_faddr, used for connected DG send.
2314 			 */
2315 			src = NULL;
2316 			srclen = 0;
2317 			addr = sti->sti_faddr_sa;
2318 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2319 			bcopy(addr, &sti->sti_ux_faddr,
2320 			    sizeof (sti->sti_ux_faddr));
2321 		} else {
2322 			/*
2323 			 * Pass the sockaddr_un source address as an option
2324 			 * and translate the remote address.
2325 			 * Holding so_lock thus sti_laddr_sa can not change.
2326 			 */
2327 			src = sti->sti_laddr_sa;
2328 			srclen = (t_uscalar_t)sti->sti_laddr_len;
2329 			dprintso(so, 1,
2330 			    ("sotpi_connect UNIX: srclen %d, src %p\n",
2331 			    srclen, src));
2332 			/*
2333 			 * Translate the destination address into our
2334 			 * internal form, and save it in sti_ux_faddr.
2335 			 * After this call, addr==&sti->sti_ux_taddr,
2336 			 * and we copy that to sti->sti_ux_faddr so
2337 			 * we save the connected peer address.
2338 			 */
2339 			error = so_ux_addr_xlate(so,
2340 			    sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2341 			    (flags & _SOCONNECT_XPG4_2),
2342 			    &addr, &addrlen);
2343 			if (error)
2344 				goto bad;
2345 			bcopy(&sti->sti_ux_taddr, &sti->sti_ux_faddr,
2346 			    sizeof (sti->sti_ux_faddr));
2347 		}
2348 	} else {
2349 		addr = sti->sti_faddr_sa;
2350 		addrlen = (t_uscalar_t)sti->sti_faddr_len;
2351 		src = NULL;
2352 		srclen = 0;
2353 	}
2354 	/*
2355 	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2356 	 * option which asks the transport provider to send T_UDERR_IND
2357 	 * messages. These T_UDERR_IND messages are used to return connected
2358 	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2359 	 *
2360 	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2361 	 * we send down a T_CONN_REQ. This is needed to let the
2362 	 * transport assign a local address that is consistent with
2363 	 * the remote address. Applications depend on a getsockname()
2364 	 * after a connect() to retrieve the "source" IP address for
2365 	 * the connected socket.  Invalidate the cached local address
2366 	 * to force getsockname() to enquire of the transport.
2367 	 */
2368 	if (!(so->so_mode & SM_CONNREQUIRED)) {
2369 		/*
2370 		 * Datagram socket.
2371 		 */
2372 		int32_t val;
2373 
2374 		so_unlock_single(so, SOLOCKED);
2375 		mutex_exit(&so->so_lock);
2376 
2377 		val = 1;
2378 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2379 		    &val, (t_uscalar_t)sizeof (val), cr);
2380 
2381 		mutex_enter(&so->so_lock);
2382 		so_lock_single(so);	/* Set SOLOCKED */
2383 		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2384 		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2385 		    soconnect_tpi_udp) {
2386 			soisconnected(so);
2387 			goto done;
2388 		}
2389 		/*
2390 		 * Send down T_CONN_REQ etc.
2391 		 * Clear fflag to avoid returning EWOULDBLOCK.
2392 		 */
2393 		fflag = 0;
2394 		ASSERT(so->so_family != AF_UNIX);
2395 		sti->sti_laddr_valid = 0;
2396 	} else if (sti->sti_laddr_len != 0) {
2397 		/*
2398 		 * If the local address or port was "any" then it may be
2399 		 * changed by the transport as a result of the
2400 		 * connect.  Invalidate the cached version if we have one.
2401 		 */
2402 		switch (so->so_family) {
2403 		case AF_INET:
2404 			ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2405 			if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2406 			    INADDR_ANY ||
2407 			    ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2408 				sti->sti_laddr_valid = 0;
2409 			break;
2410 
2411 		case AF_INET6:
2412 			ASSERT(sti->sti_laddr_len ==
2413 			    (socklen_t)sizeof (sin6_t));
2414 			if (IN6_IS_ADDR_UNSPECIFIED(
2415 			    &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2416 			    IN6_IS_ADDR_V4MAPPED_ANY(
2417 			    &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2418 			    ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2419 				sti->sti_laddr_valid = 0;
2420 			break;
2421 
2422 		default:
2423 			break;
2424 		}
2425 	}
2426 
2427 	/*
2428 	 * Check for failure of an earlier call
2429 	 */
2430 	if (so->so_error != 0)
2431 		goto so_bad;
2432 
2433 	/*
2434 	 * Send down T_CONN_REQ. Message was allocated above.
2435 	 */
2436 	conn_req.PRIM_type = T_CONN_REQ;
2437 	conn_req.DEST_length = addrlen;
2438 	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2439 	if (srclen == 0) {
2440 		conn_req.OPT_length = 0;
2441 		conn_req.OPT_offset = 0;
2442 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2443 		soappendmsg(mp, addr, addrlen);
2444 	} else {
2445 		/*
2446 		 * There is a AF_UNIX sockaddr_un to include as a source
2447 		 * address option.
2448 		 */
2449 		struct T_opthdr toh;
2450 
2451 		toh.level = SOL_SOCKET;
2452 		toh.name = SO_SRCADDR;
2453 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2454 		toh.status = 0;
2455 		conn_req.OPT_length =
2456 		    (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2457 		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2458 		    _TPI_ALIGN_TOPT(addrlen));
2459 
2460 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2461 		soappendmsg(mp, addr, addrlen);
2462 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2463 		soappendmsg(mp, &toh, sizeof (toh));
2464 		soappendmsg(mp, src, srclen);
2465 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2466 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2467 	}
2468 	/*
2469 	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2470 	 * in order to have the right state when the T_CONN_CON shows up.
2471 	 */
2472 	soisconnecting(so);
2473 	mutex_exit(&so->so_lock);
2474 
2475 	if (AU_AUDITING())
2476 		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2477 
2478 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2479 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2480 	mp = NULL;
2481 	mutex_enter(&so->so_lock);
2482 	if (error != 0)
2483 		goto bad;
2484 
2485 	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2486 		goto bad;
2487 
2488 	/* Allow other threads to access the socket */
2489 	so_unlock_single(so, SOLOCKED);
2490 	need_unlock = B_FALSE;
2491 
2492 	/*
2493 	 * Wait until we get a T_CONN_CON or an error
2494 	 */
2495 	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2496 		so_lock_single(so);	/* Set SOLOCKED */
2497 		need_unlock = B_TRUE;
2498 	}
2499 
2500 done:
2501 	freemsg(mp);
2502 	switch (error) {
2503 	case EINPROGRESS:
2504 	case EALREADY:
2505 	case EISCONN:
2506 	case EINTR:
2507 		/* Non-fatal errors */
2508 		sti->sti_laddr_valid = 0;
2509 		/* FALLTHRU */
2510 	case 0:
2511 		break;
2512 	default:
2513 		ASSERT(need_unlock);
2514 		/*
2515 		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2516 		 * and invalidate local-address cache
2517 		 */
2518 		so->so_state &= ~SS_ISCONNECTING;
2519 		sti->sti_laddr_valid = 0;
2520 		/* A discon_ind might have already unbound us */
2521 		if ((flags & _SOCONNECT_DID_BIND) &&
2522 		    (so->so_state & SS_ISBOUND)) {
2523 			int err;
2524 
2525 			err = sotpi_unbind(so, 0);
2526 			/* LINTED - statement has no conseq */
2527 			if (err) {
2528 				eprintsoline(so, err);
2529 			}
2530 		}
2531 		break;
2532 	}
2533 	if (need_unlock)
2534 		so_unlock_single(so, SOLOCKED);
2535 	mutex_exit(&so->so_lock);
2536 	return (error);
2537 
2538 so_bad:	error = sogeterr(so, B_TRUE);
2539 bad:	eprintsoline(so, error);
2540 	goto done;
2541 }
2542 
2543 /* ARGSUSED */
2544 int
sotpi_shutdown(struct sonode * so,int how,struct cred * cr)2545 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2546 {
2547 	struct T_ordrel_req	ordrel_req;
2548 	mblk_t			*mp;
2549 	uint_t			old_state, state_change;
2550 	int			error = 0;
2551 	sotpi_info_t		*sti = SOTOTPI(so);
2552 
2553 	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2554 	    (void *)so, how, pr_state(so->so_state, so->so_mode)));
2555 
2556 	mutex_enter(&so->so_lock);
2557 	so_lock_single(so);	/* Set SOLOCKED */
2558 
2559 	/*
2560 	 * SunOS 4.X has no check for datagram sockets.
2561 	 * 5.X checks that it is connected (ENOTCONN)
2562 	 * X/Open requires that we check the connected state.
2563 	 */
2564 	if (!(so->so_state & SS_ISCONNECTED)) {
2565 		if (!xnet_skip_checks) {
2566 			error = ENOTCONN;
2567 			if (xnet_check_print) {
2568 				printf("sockfs: X/Open shutdown check "
2569 				    "caused ENOTCONN\n");
2570 			}
2571 		}
2572 		goto done;
2573 	}
2574 	/*
2575 	 * Record the current state and then perform any state changes.
2576 	 * Then use the difference between the old and new states to
2577 	 * determine which messages need to be sent.
2578 	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2579 	 * duplicate calls to shutdown().
2580 	 */
2581 	old_state = so->so_state;
2582 
2583 	switch (how) {
2584 	case 0:
2585 		socantrcvmore(so);
2586 		break;
2587 	case 1:
2588 		socantsendmore(so);
2589 		break;
2590 	case 2:
2591 		socantsendmore(so);
2592 		socantrcvmore(so);
2593 		break;
2594 	default:
2595 		error = EINVAL;
2596 		goto done;
2597 	}
2598 
2599 	/*
2600 	 * Assumes that the SS_CANT* flags are never cleared in the above code.
2601 	 */
2602 	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2603 	    (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2604 	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2605 
2606 	switch (state_change) {
2607 	case 0:
2608 		dprintso(so, 1,
2609 		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
2610 		    so->so_state));
2611 		goto done;
2612 
2613 	case SS_CANTRCVMORE:
2614 		mutex_exit(&so->so_lock);
2615 		strseteof(SOTOV(so), 1);
2616 		/*
2617 		 * strseteof takes care of read side wakeups,
2618 		 * pollwakeups, and signals.
2619 		 */
2620 		/*
2621 		 * Get the read lock before flushing data to avoid problems
2622 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2623 		 */
2624 		mutex_enter(&so->so_lock);
2625 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2626 		mutex_exit(&so->so_lock);
2627 
2628 		/* Flush read side queue */
2629 		strflushrq(SOTOV(so), FLUSHALL);
2630 
2631 		mutex_enter(&so->so_lock);
2632 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2633 		break;
2634 
2635 	case SS_CANTSENDMORE:
2636 		mutex_exit(&so->so_lock);
2637 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2638 		mutex_enter(&so->so_lock);
2639 		break;
2640 
2641 	case SS_CANTSENDMORE|SS_CANTRCVMORE:
2642 		mutex_exit(&so->so_lock);
2643 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2644 		strseteof(SOTOV(so), 1);
2645 		/*
2646 		 * strseteof takes care of read side wakeups,
2647 		 * pollwakeups, and signals.
2648 		 */
2649 		/*
2650 		 * Get the read lock before flushing data to avoid problems
2651 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2652 		 */
2653 		mutex_enter(&so->so_lock);
2654 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2655 		mutex_exit(&so->so_lock);
2656 
2657 		/* Flush read side queue */
2658 		strflushrq(SOTOV(so), FLUSHALL);
2659 
2660 		mutex_enter(&so->so_lock);
2661 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2662 		break;
2663 	}
2664 
2665 	ASSERT(MUTEX_HELD(&so->so_lock));
2666 
2667 	/*
2668 	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2669 	 * was set due to this call and the new state has both of them set:
2670 	 *	Send the AF_UNIX close indication
2671 	 *	For T_COTS send a discon_ind
2672 	 *
2673 	 * If cantsend was set due to this call:
2674 	 *	For T_COTSORD send an ordrel_ind
2675 	 *
2676 	 * Note that for T_CLTS there is no message sent here.
2677 	 */
2678 	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2679 	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2680 		/*
2681 		 * For SunOS 4.X compatibility we tell the other end
2682 		 * that we are unable to receive at this point.
2683 		 */
2684 		if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2685 			so_unix_close(so);
2686 
2687 		if (sti->sti_serv_type == T_COTS)
2688 			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2689 	}
2690 	if ((state_change & SS_CANTSENDMORE) &&
2691 	    (sti->sti_serv_type == T_COTS_ORD)) {
2692 		/* Send an orderly release */
2693 		ordrel_req.PRIM_type = T_ORDREL_REQ;
2694 
2695 		mutex_exit(&so->so_lock);
2696 		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2697 		    0, _ALLOC_SLEEP, cr);
2698 		/*
2699 		 * Send down the T_ORDREL_REQ even if there is flow control.
2700 		 * This prevents shutdown from blocking.
2701 		 * Note that there is no T_OK_ACK for ordrel_req.
2702 		 */
2703 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2704 		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2705 		mutex_enter(&so->so_lock);
2706 		if (error) {
2707 			eprintsoline(so, error);
2708 			goto done;
2709 		}
2710 	}
2711 
2712 done:
2713 	so_unlock_single(so, SOLOCKED);
2714 	mutex_exit(&so->so_lock);
2715 	return (error);
2716 }
2717 
2718 /*
2719  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2720  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2721  * that we have closed.
2722  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2723  * T_UNITDATA_REQ containing the same option.
2724  *
2725  * For SOCK_DGRAM half-connections (somebody connected to this end
2726  * but this end is not connect) we don't know where to send any
2727  * SO_UNIX_CLOSE.
2728  *
2729  * We have to ignore stream head errors just in case there has been
2730  * a shutdown(output).
2731  * Ignore any flow control to try to get the message more quickly to the peer.
2732  * While locally ignoring flow control solves the problem when there
2733  * is only the loopback transport on the stream it would not provide
2734  * the correct AF_UNIX socket semantics when one or more modules have
2735  * been pushed.
2736  */
2737 void
so_unix_close(struct sonode * so)2738 so_unix_close(struct sonode *so)
2739 {
2740 	struct T_opthdr	toh;
2741 	mblk_t		*mp;
2742 	sotpi_info_t	*sti = SOTOTPI(so);
2743 
2744 	ASSERT(MUTEX_HELD(&so->so_lock));
2745 
2746 	ASSERT(so->so_family == AF_UNIX);
2747 
2748 	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2749 	    (SS_ISCONNECTED|SS_ISBOUND))
2750 		return;
2751 
2752 	dprintso(so, 1, ("so_unix_close(%p) %s\n",
2753 	    (void *)so, pr_state(so->so_state, so->so_mode)));
2754 
2755 	toh.level = SOL_SOCKET;
2756 	toh.name = SO_UNIX_CLOSE;
2757 
2758 	/* zero length + header */
2759 	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2760 	toh.status = 0;
2761 
2762 	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2763 		struct T_optdata_req tdr;
2764 
2765 		tdr.PRIM_type = T_OPTDATA_REQ;
2766 		tdr.DATA_flag = 0;
2767 
2768 		tdr.OPT_length = (t_scalar_t)sizeof (toh);
2769 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2770 
2771 		/* NOTE: holding so_lock while sleeping */
2772 		mp = soallocproto2(&tdr, sizeof (tdr),
2773 		    &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2774 	} else {
2775 		struct T_unitdata_req	tudr;
2776 		void			*addr;
2777 		socklen_t		addrlen;
2778 		void			*src;
2779 		socklen_t		srclen;
2780 		struct T_opthdr		toh2;
2781 		t_scalar_t		size;
2782 
2783 		/*
2784 		 * We know this is an AF_UNIX connected DGRAM socket.
2785 		 * We therefore already have the destination address
2786 		 * in the internal form needed for this send.  This is
2787 		 * similar to the sosend_dgram call later in this file
2788 		 * when there's no user-specified destination address.
2789 		 */
2790 		if (sti->sti_faddr_noxlate) {
2791 			/*
2792 			 * Already have a transport internal address. Do not
2793 			 * pass any (transport internal) source address.
2794 			 */
2795 			addr = sti->sti_faddr_sa;
2796 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2797 			src = NULL;
2798 			srclen = 0;
2799 		} else {
2800 			/*
2801 			 * Pass the sockaddr_un source address as an option
2802 			 * and translate the remote address.
2803 			 * Holding so_lock thus sti_laddr_sa can not change.
2804 			 */
2805 			src = sti->sti_laddr_sa;
2806 			srclen = (socklen_t)sti->sti_laddr_len;
2807 			dprintso(so, 1,
2808 			    ("so_ux_close: srclen %d, src %p\n",
2809 			    srclen, src));
2810 			/*
2811 			 * Use the destination address saved in connect.
2812 			 */
2813 			addr = &sti->sti_ux_faddr;
2814 			addrlen = sizeof (sti->sti_ux_faddr);
2815 		}
2816 		tudr.PRIM_type = T_UNITDATA_REQ;
2817 		tudr.DEST_length = addrlen;
2818 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2819 		if (srclen == 0) {
2820 			tudr.OPT_length = (t_scalar_t)sizeof (toh);
2821 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2822 			    _TPI_ALIGN_TOPT(addrlen));
2823 
2824 			size = tudr.OPT_offset + tudr.OPT_length;
2825 			/* NOTE: holding so_lock while sleeping */
2826 			mp = soallocproto2(&tudr, sizeof (tudr),
2827 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
2828 			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2829 			soappendmsg(mp, &toh, sizeof (toh));
2830 		} else {
2831 			/*
2832 			 * There is a AF_UNIX sockaddr_un to include as a
2833 			 * source address option.
2834 			 */
2835 			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2836 			    _TPI_ALIGN_TOPT(srclen));
2837 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2838 			    _TPI_ALIGN_TOPT(addrlen));
2839 
2840 			toh2.level = SOL_SOCKET;
2841 			toh2.name = SO_SRCADDR;
2842 			toh2.len = (t_uscalar_t)(srclen +
2843 			    sizeof (struct T_opthdr));
2844 			toh2.status = 0;
2845 
2846 			size = tudr.OPT_offset + tudr.OPT_length;
2847 
2848 			/* NOTE: holding so_lock while sleeping */
2849 			mp = soallocproto2(&tudr, sizeof (tudr),
2850 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
2851 			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2852 			soappendmsg(mp, &toh, sizeof (toh));
2853 			soappendmsg(mp, &toh2, sizeof (toh2));
2854 			soappendmsg(mp, src, srclen);
2855 			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2856 		}
2857 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2858 	}
2859 	mutex_exit(&so->so_lock);
2860 	(void) kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2861 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2862 	mutex_enter(&so->so_lock);
2863 }
2864 
2865 /*
2866  * Called by sotpi_recvmsg when reading a non-zero amount of data.
2867  * In addition, the caller typically verifies that there is some
2868  * potential state to clear by checking
2869  *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2870  * before calling this routine.
2871  * Note that such a check can be made without holding so_lock since
2872  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2873  * decrements sti_oobsigcnt.
2874  *
2875  * When data is read *after* the point that all pending
2876  * oob data has been consumed the oob indication is cleared.
2877  *
2878  * This logic keeps select/poll returning POLLRDBAND and
2879  * SIOCATMARK returning true until we have read past
2880  * the mark.
2881  */
2882 static void
sorecv_update_oobstate(struct sonode * so)2883 sorecv_update_oobstate(struct sonode *so)
2884 {
2885 	sotpi_info_t *sti = SOTOTPI(so);
2886 
2887 	mutex_enter(&so->so_lock);
2888 	ASSERT(so_verify_oobstate(so));
2889 	dprintso(so, 1,
2890 	    ("sorecv_update_oobstate: counts %d/%d state %s\n",
2891 	    sti->sti_oobsigcnt,
2892 	    sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
2893 	if (sti->sti_oobsigcnt == 0) {
2894 		/* No more pending oob indications */
2895 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2896 		freemsg(so->so_oobmsg);
2897 		so->so_oobmsg = NULL;
2898 	}
2899 	ASSERT(so_verify_oobstate(so));
2900 	mutex_exit(&so->so_lock);
2901 }
2902 
2903 /*
2904  * Receive the next message on the queue.
2905  * If msg_controllen is non-zero when called the caller is interested in
2906  * any received control info (options).
2907  * If msg_namelen is non-zero when called the caller is interested in
2908  * any received source address.
2909  * The routine returns with msg_control and msg_name pointing to
2910  * kmem_alloc'ed memory which the caller has to free.
2911  */
2912 /* ARGSUSED */
2913 int
sotpi_recvmsg(struct sonode * so,struct nmsghdr * msg,struct uio * uiop,struct cred * cr)2914 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
2915     struct cred *cr)
2916 {
2917 	union T_primitives	*tpr;
2918 	mblk_t			*mp;
2919 	uchar_t			pri;
2920 	int			pflag, opflag;
2921 	void			*control;
2922 	t_uscalar_t		controllen;
2923 	t_uscalar_t		namelen;
2924 	int			so_state = so->so_state; /* Snapshot */
2925 	ssize_t			saved_resid;
2926 	rval_t			rval;
2927 	int			flags;
2928 	clock_t			timout;
2929 	int			error = 0;
2930 	sotpi_info_t		*sti = SOTOTPI(so);
2931 
2932 	flags = msg->msg_flags;
2933 	msg->msg_flags = 0;
2934 
2935 	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
2936 	    (void *)so, (void *)msg, flags,
2937 	    pr_state(so->so_state, so->so_mode), so->so_error));
2938 
2939 	if (so->so_version == SOV_STREAM) {
2940 		so_update_attrs(so, SOACC);
2941 		/* The imaginary "sockmod" has been popped - act as a stream */
2942 		return (strread(SOTOV(so), uiop, cr));
2943 	}
2944 
2945 	/*
2946 	 * If we are not connected because we have never been connected
2947 	 * we return ENOTCONN. If we have been connected (but are no longer
2948 	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
2949 	 * the EOF.
2950 	 *
2951 	 * An alternative would be to post an ENOTCONN error in stream head
2952 	 * (read+write) and clear it when we're connected. However, that error
2953 	 * would cause incorrect poll/select behavior!
2954 	 */
2955 	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
2956 	    (so->so_mode & SM_CONNREQUIRED)) {
2957 		return (ENOTCONN);
2958 	}
2959 
2960 	/*
2961 	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
2962 	 * after checking that the read queue is empty) and returns zero.
2963 	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
2964 	 * is zero.
2965 	 */
2966 
2967 	if (flags & MSG_OOB) {
2968 		/* Check that the transport supports OOB */
2969 		if (!(so->so_mode & SM_EXDATA))
2970 			return (EOPNOTSUPP);
2971 		so_update_attrs(so, SOACC);
2972 		return (sorecvoob(so, msg, uiop, flags,
2973 		    (so->so_options & SO_OOBINLINE)));
2974 	}
2975 
2976 	so_update_attrs(so, SOACC);
2977 
2978 	/*
2979 	 * Set msg_controllen and msg_namelen to zero here to make it
2980 	 * simpler in the cases that no control or name is returned.
2981 	 */
2982 	controllen = msg->msg_controllen;
2983 	namelen = msg->msg_namelen;
2984 	msg->msg_controllen = 0;
2985 	msg->msg_namelen = 0;
2986 
2987 	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
2988 	    namelen, controllen));
2989 
2990 	mutex_enter(&so->so_lock);
2991 	/*
2992 	 * Only one reader is allowed at any given time. This is needed
2993 	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
2994 	 *
2995 	 * This is slightly different that BSD behavior in that it fails with
2996 	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
2997 	 * is single-threaded using sblock(), which is dropped while waiting
2998 	 * for data to appear. The difference shows up e.g. if one
2999 	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3000 	 * does use nonblocking io and different threads are reading each
3001 	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3002 	 * in this case as long as the read queue doesn't get empty.
3003 	 * In this implementation the thread using nonblocking io can
3004 	 * get an EWOULDBLOCK error due to the blocking thread executing
3005 	 * e.g. in the uiomove in kstrgetmsg.
3006 	 * This difference is not believed to be significant.
3007 	 */
3008 	/* Set SOREADLOCKED */
3009 	error = so_lock_read_intr(so,
3010 	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3011 	mutex_exit(&so->so_lock);
3012 	if (error)
3013 		return (error);
3014 
3015 	/*
3016 	 * Tell kstrgetmsg to not inspect the stream head errors until all
3017 	 * queued data has been consumed.
3018 	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3019 	 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3020 	 *
3021 	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3022 	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3023 	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3024 	 */
3025 	pflag = MSG_ANY | MSG_DELAYERROR;
3026 	if (flags & MSG_PEEK) {
3027 		pflag |= MSG_IPEEK;
3028 		flags &= ~MSG_WAITALL;
3029 	}
3030 	if (so->so_mode & SM_ATOMIC)
3031 		pflag |= MSG_DISCARDTAIL;
3032 
3033 	if (flags & MSG_DONTWAIT)
3034 		timout = 0;
3035 	else if (so->so_rcvtimeo != 0)
3036 		timout = TICK_TO_MSEC(so->so_rcvtimeo);
3037 	else
3038 		timout = -1;
3039 	opflag = pflag;
3040 retry:
3041 	saved_resid = uiop->uio_resid;
3042 	pri = 0;
3043 	mp = NULL;
3044 	error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3045 	    timout, &rval);
3046 	if (error != 0) {
3047 		/* kstrgetmsg returns ETIME when timeout expires */
3048 		if (error == ETIME)
3049 			error = EWOULDBLOCK;
3050 		goto out;
3051 	}
3052 	/*
3053 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3054 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
3055 	 */
3056 	ASSERT(!(rval.r_val1 & MORECTL));
3057 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3058 		msg->msg_flags |= MSG_TRUNC;
3059 
3060 	if (mp == NULL) {
3061 		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3062 		/*
3063 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3064 		 * The draft Posix socket spec states that the mark should
3065 		 * not be cleared when peeking. We follow the latter.
3066 		 */
3067 		if ((so->so_state &
3068 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3069 		    (uiop->uio_resid != saved_resid) &&
3070 		    !(flags & MSG_PEEK)) {
3071 			sorecv_update_oobstate(so);
3072 		}
3073 
3074 		mutex_enter(&so->so_lock);
3075 		/* Set MSG_EOR based on MOREDATA */
3076 		if (!(rval.r_val1 & MOREDATA)) {
3077 			if (so->so_state & SS_SAVEDEOR) {
3078 				msg->msg_flags |= MSG_EOR;
3079 				so->so_state &= ~SS_SAVEDEOR;
3080 			}
3081 		}
3082 		/*
3083 		 * If some data was received (i.e. not EOF) and the
3084 		 * read/recv* has not been satisfied wait for some more.
3085 		 */
3086 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3087 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3088 			mutex_exit(&so->so_lock);
3089 			pflag = opflag | MSG_NOMARK;
3090 			goto retry;
3091 		}
3092 		goto out_locked;
3093 	}
3094 
3095 	/* strsock_proto has already verified length and alignment */
3096 	tpr = (union T_primitives *)mp->b_rptr;
3097 	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3098 
3099 	switch (tpr->type) {
3100 	case T_DATA_IND: {
3101 		if ((so->so_state &
3102 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3103 		    (uiop->uio_resid != saved_resid) &&
3104 		    !(flags & MSG_PEEK)) {
3105 			sorecv_update_oobstate(so);
3106 		}
3107 
3108 		/*
3109 		 * Set msg_flags to MSG_EOR based on
3110 		 * MORE_flag and MOREDATA.
3111 		 */
3112 		mutex_enter(&so->so_lock);
3113 		so->so_state &= ~SS_SAVEDEOR;
3114 		if (!(tpr->data_ind.MORE_flag & 1)) {
3115 			if (!(rval.r_val1 & MOREDATA))
3116 				msg->msg_flags |= MSG_EOR;
3117 			else
3118 				so->so_state |= SS_SAVEDEOR;
3119 		}
3120 		freemsg(mp);
3121 		/*
3122 		 * If some data was received (i.e. not EOF) and the
3123 		 * read/recv* has not been satisfied wait for some more.
3124 		 */
3125 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3126 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3127 			mutex_exit(&so->so_lock);
3128 			pflag = opflag | MSG_NOMARK;
3129 			goto retry;
3130 		}
3131 		goto out_locked;
3132 	}
3133 	case T_UNITDATA_IND: {
3134 		void *addr;
3135 		t_uscalar_t addrlen;
3136 		void *abuf;
3137 		t_uscalar_t optlen;
3138 		void *opt;
3139 
3140 		if ((so->so_state &
3141 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3142 		    (uiop->uio_resid != saved_resid) &&
3143 		    !(flags & MSG_PEEK)) {
3144 			sorecv_update_oobstate(so);
3145 		}
3146 
3147 		if (namelen != 0) {
3148 			/* Caller wants source address */
3149 			addrlen = tpr->unitdata_ind.SRC_length;
3150 			addr = sogetoff(mp,
3151 			    tpr->unitdata_ind.SRC_offset,
3152 			    addrlen, 1);
3153 			if (addr == NULL) {
3154 				freemsg(mp);
3155 				error = EPROTO;
3156 				eprintsoline(so, error);
3157 				goto out;
3158 			}
3159 			if (so->so_family == AF_UNIX) {
3160 				/*
3161 				 * Can not use the transport level address.
3162 				 * If there is a SO_SRCADDR option carrying
3163 				 * the socket level address it will be
3164 				 * extracted below.
3165 				 */
3166 				addr = NULL;
3167 				addrlen = 0;
3168 			}
3169 		}
3170 		optlen = tpr->unitdata_ind.OPT_length;
3171 		if (optlen != 0) {
3172 			t_uscalar_t ncontrollen;
3173 
3174 			/*
3175 			 * Extract any source address option.
3176 			 * Determine how large cmsg buffer is needed.
3177 			 */
3178 			opt = sogetoff(mp,
3179 			    tpr->unitdata_ind.OPT_offset,
3180 			    optlen, __TPI_ALIGN_SIZE);
3181 
3182 			if (opt == NULL) {
3183 				freemsg(mp);
3184 				error = EPROTO;
3185 				eprintsoline(so, error);
3186 				goto out;
3187 			}
3188 			if (so->so_family == AF_UNIX)
3189 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3190 			ncontrollen = so_cmsglen(mp, opt, optlen,
3191 			    !(flags & MSG_XPG4_2));
3192 			if (controllen != 0)
3193 				controllen = ncontrollen;
3194 			else if (ncontrollen != 0)
3195 				msg->msg_flags |= MSG_CTRUNC;
3196 		} else {
3197 			controllen = 0;
3198 		}
3199 
3200 		if (namelen != 0) {
3201 			/*
3202 			 * Return address to caller.
3203 			 * Caller handles truncation if length
3204 			 * exceeds msg_namelen.
3205 			 * NOTE: AF_UNIX NUL termination is ensured by
3206 			 * the sender's copyin_name().
3207 			 */
3208 			abuf = kmem_alloc(addrlen, KM_SLEEP);
3209 
3210 			bcopy(addr, abuf, addrlen);
3211 			msg->msg_name = abuf;
3212 			msg->msg_namelen = addrlen;
3213 		}
3214 
3215 		if (controllen != 0) {
3216 			/*
3217 			 * Return control msg to caller.
3218 			 * Caller handles truncation if length
3219 			 * exceeds msg_controllen.
3220 			 */
3221 			control = kmem_zalloc(controllen, KM_SLEEP);
3222 
3223 			error = so_opt2cmsg(mp, opt, optlen, flags, control,
3224 			    controllen);
3225 			if (error) {
3226 				freemsg(mp);
3227 				if (msg->msg_namelen != 0)
3228 					kmem_free(msg->msg_name,
3229 					    msg->msg_namelen);
3230 				kmem_free(control, controllen);
3231 				eprintsoline(so, error);
3232 				goto out;
3233 			}
3234 			msg->msg_control = control;
3235 			msg->msg_controllen = controllen;
3236 		}
3237 
3238 		freemsg(mp);
3239 		goto out;
3240 	}
3241 	case T_OPTDATA_IND: {
3242 		struct T_optdata_req *tdr;
3243 		void *opt;
3244 		t_uscalar_t optlen;
3245 
3246 		if ((so->so_state &
3247 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3248 		    (uiop->uio_resid != saved_resid) &&
3249 		    !(flags & MSG_PEEK)) {
3250 			sorecv_update_oobstate(so);
3251 		}
3252 
3253 		tdr = (struct T_optdata_req *)mp->b_rptr;
3254 		optlen = tdr->OPT_length;
3255 		if (optlen != 0) {
3256 			t_uscalar_t ncontrollen;
3257 			/*
3258 			 * Determine how large cmsg buffer is needed.
3259 			 */
3260 			opt = sogetoff(mp,
3261 			    tpr->optdata_ind.OPT_offset,
3262 			    optlen, __TPI_ALIGN_SIZE);
3263 
3264 			if (opt == NULL) {
3265 				freemsg(mp);
3266 				error = EPROTO;
3267 				eprintsoline(so, error);
3268 				goto out;
3269 			}
3270 
3271 			ncontrollen = so_cmsglen(mp, opt, optlen,
3272 			    !(flags & MSG_XPG4_2));
3273 			if (controllen != 0)
3274 				controllen = ncontrollen;
3275 			else if (ncontrollen != 0)
3276 				msg->msg_flags |= MSG_CTRUNC;
3277 		} else {
3278 			controllen = 0;
3279 		}
3280 
3281 		if (controllen != 0) {
3282 			/*
3283 			 * Return control msg to caller.
3284 			 * Caller handles truncation if length
3285 			 * exceeds msg_controllen.
3286 			 */
3287 			control = kmem_zalloc(controllen, KM_SLEEP);
3288 
3289 			error = so_opt2cmsg(mp, opt, optlen, flags, control,
3290 			    controllen);
3291 			if (error) {
3292 				freemsg(mp);
3293 				kmem_free(control, controllen);
3294 				eprintsoline(so, error);
3295 				goto out;
3296 			}
3297 			msg->msg_control = control;
3298 			msg->msg_controllen = controllen;
3299 		}
3300 
3301 		/*
3302 		 * Set msg_flags to MSG_EOR based on
3303 		 * DATA_flag and MOREDATA.
3304 		 */
3305 		mutex_enter(&so->so_lock);
3306 		so->so_state &= ~SS_SAVEDEOR;
3307 		if (!(tpr->data_ind.MORE_flag & 1)) {
3308 			if (!(rval.r_val1 & MOREDATA))
3309 				msg->msg_flags |= MSG_EOR;
3310 			else
3311 				so->so_state |= SS_SAVEDEOR;
3312 		}
3313 		freemsg(mp);
3314 		/*
3315 		 * If some data was received (i.e. not EOF) and the
3316 		 * read/recv* has not been satisfied wait for some more.
3317 		 * Not possible to wait if control info was received.
3318 		 */
3319 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3320 		    controllen == 0 &&
3321 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3322 			mutex_exit(&so->so_lock);
3323 			pflag = opflag | MSG_NOMARK;
3324 			goto retry;
3325 		}
3326 		goto out_locked;
3327 	}
3328 	case T_EXDATA_IND: {
3329 		dprintso(so, 1,
3330 		    ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3331 		    "state %s\n",
3332 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3333 		    saved_resid - uiop->uio_resid,
3334 		    pr_state(so->so_state, so->so_mode)));
3335 		/*
3336 		 * kstrgetmsg handles MSGMARK so there is nothing to
3337 		 * inspect in the T_EXDATA_IND.
3338 		 * strsock_proto makes the stream head queue the T_EXDATA_IND
3339 		 * as a separate message with no M_DATA component. Furthermore,
3340 		 * the stream head does not consolidate M_DATA messages onto
3341 		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3342 		 * remains a message by itself. This is needed since MSGMARK
3343 		 * marks both the whole message as well as the last byte
3344 		 * of the message.
3345 		 */
3346 		freemsg(mp);
3347 		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
3348 		if (flags & MSG_PEEK) {
3349 			/*
3350 			 * Even though we are peeking we consume the
3351 			 * T_EXDATA_IND thereby moving the mark information
3352 			 * to SS_RCVATMARK. Then the oob code below will
3353 			 * retry the peeking kstrgetmsg.
3354 			 * Note that the stream head read queue is
3355 			 * never flushed without holding SOREADLOCKED
3356 			 * thus the T_EXDATA_IND can not disappear
3357 			 * underneath us.
3358 			 */
3359 			dprintso(so, 1,
3360 			    ("sotpi_recvmsg: consume EXDATA_IND "
3361 			    "counts %d/%d state %s\n",
3362 			    sti->sti_oobsigcnt,
3363 			    sti->sti_oobcnt,
3364 			    pr_state(so->so_state, so->so_mode)));
3365 
3366 			pflag = MSG_ANY | MSG_DELAYERROR;
3367 			if (so->so_mode & SM_ATOMIC)
3368 				pflag |= MSG_DISCARDTAIL;
3369 
3370 			pri = 0;
3371 			mp = NULL;
3372 
3373 			error = kstrgetmsg(SOTOV(so), &mp, uiop,
3374 			    &pri, &pflag, (clock_t)-1, &rval);
3375 			ASSERT(uiop->uio_resid == saved_resid);
3376 
3377 			if (error) {
3378 #ifdef SOCK_DEBUG
3379 				if (error != EWOULDBLOCK && error != EINTR) {
3380 					eprintsoline(so, error);
3381 				}
3382 #endif /* SOCK_DEBUG */
3383 				goto out;
3384 			}
3385 			ASSERT(mp);
3386 			tpr = (union T_primitives *)mp->b_rptr;
3387 			ASSERT(tpr->type == T_EXDATA_IND);
3388 			freemsg(mp);
3389 		} /* end "if (flags & MSG_PEEK)" */
3390 
3391 		/*
3392 		 * Decrement the number of queued and pending oob.
3393 		 *
3394 		 * SS_RCVATMARK is cleared when we read past a mark.
3395 		 * SS_HAVEOOBDATA is cleared when we've read past the
3396 		 * last mark.
3397 		 * SS_OOBPEND is cleared if we've read past the last
3398 		 * mark and no (new) SIGURG has been posted.
3399 		 */
3400 		mutex_enter(&so->so_lock);
3401 		ASSERT(so_verify_oobstate(so));
3402 		ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3403 		ASSERT(sti->sti_oobsigcnt > 0);
3404 		sti->sti_oobsigcnt--;
3405 		ASSERT(sti->sti_oobcnt > 0);
3406 		sti->sti_oobcnt--;
3407 		/*
3408 		 * Since the T_EXDATA_IND has been removed from the stream
3409 		 * head, but we have not read data past the mark,
3410 		 * sockfs needs to track that the socket is still at the mark.
3411 		 *
3412 		 * Since no data was received call kstrgetmsg again to wait
3413 		 * for data.
3414 		 */
3415 		so->so_state |= SS_RCVATMARK;
3416 		mutex_exit(&so->so_lock);
3417 		dprintso(so, 1,
3418 		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3419 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3420 		    pr_state(so->so_state, so->so_mode)));
3421 		pflag = opflag;
3422 		goto retry;
3423 	}
3424 	default:
3425 		cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3426 		    (void *)so, tpr->type, (void *)mp);
3427 		ASSERT(0);
3428 		freemsg(mp);
3429 		error = EPROTO;
3430 		eprintsoline(so, error);
3431 		goto out;
3432 	}
3433 	/* NOTREACHED */
3434 out:
3435 	mutex_enter(&so->so_lock);
3436 out_locked:
3437 	so_unlock_read(so);	/* Clear SOREADLOCKED */
3438 	mutex_exit(&so->so_lock);
3439 	return (error);
3440 }
3441 
3442 /*
3443  * Sending data with options on a datagram socket.
3444  * Assumes caller has verified that SS_ISBOUND etc. are set.
3445  *
3446  * For AF_UNIX the destination address may be already in
3447  * internal form, as indicated by sti->sti_faddr_noxlate
3448  * or the MSG_SENDTO_NOXLATE flag.  Otherwise we need to
3449  * translate the destination address to internal form.
3450  *
3451  * The source address is passed as an option.  If passing
3452  * file descriptors, those are passed as file pointers in
3453  * another option.
3454  */
3455 static int
sosend_dgramcmsg(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,void * control,t_uscalar_t controllen,int flags)3456 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3457     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3458 {
3459 	struct T_unitdata_req	tudr;
3460 	mblk_t			*mp;
3461 	int			error;
3462 	void			*addr;
3463 	socklen_t		addrlen;
3464 	void			*src;
3465 	socklen_t		srclen;
3466 	ssize_t			len;
3467 	int			size;
3468 	struct T_opthdr		toh;
3469 	struct fdbuf		*fdbuf;
3470 	t_uscalar_t		optlen;
3471 	void			*fds;
3472 	int			fdlen;
3473 	sotpi_info_t		*sti = SOTOTPI(so);
3474 
3475 	ASSERT(name && namelen);
3476 	ASSERT(control && controllen);
3477 
3478 	len = uiop->uio_resid;
3479 	if (len > (ssize_t)sti->sti_tidu_size) {
3480 		return (EMSGSIZE);
3481 	}
3482 
3483 	if (sti->sti_faddr_noxlate == 0 &&
3484 	    (flags & MSG_SENDTO_NOXLATE) == 0) {
3485 		/*
3486 		 * Length and family checks.
3487 		 * Don't verify internal form.
3488 		 */
3489 		error = so_addr_verify(so, name, namelen);
3490 		if (error) {
3491 			eprintsoline(so, error);
3492 			return (error);
3493 		}
3494 	}
3495 
3496 	if (so->so_family == AF_UNIX) {
3497 		if (sti->sti_faddr_noxlate) {
3498 			/*
3499 			 * Already have a transport internal address. Do not
3500 			 * pass any (transport internal) source address.
3501 			 */
3502 			addr = name;
3503 			addrlen = namelen;
3504 			src = NULL;
3505 			srclen = 0;
3506 		} else if (flags & MSG_SENDTO_NOXLATE) {
3507 			/*
3508 			 * Have an internal form dest. address.
3509 			 * Pass the source address as usual.
3510 			 */
3511 			addr = name;
3512 			addrlen = namelen;
3513 			src = sti->sti_laddr_sa;
3514 			srclen = (socklen_t)sti->sti_laddr_len;
3515 		} else {
3516 			/*
3517 			 * Pass the sockaddr_un source address as an option
3518 			 * and translate the remote address.
3519 			 *
3520 			 * Note that this code does not prevent sti_laddr_sa
3521 			 * from changing while it is being used. Thus
3522 			 * if an unbind+bind occurs concurrently with this
3523 			 * send the peer might see a partially new and a
3524 			 * partially old "from" address.
3525 			 */
3526 			src = sti->sti_laddr_sa;
3527 			srclen = (socklen_t)sti->sti_laddr_len;
3528 			dprintso(so, 1,
3529 			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3530 			    srclen, src));
3531 			/*
3532 			 * The sendmsg caller specified a destination
3533 			 * address, which we must translate into our
3534 			 * internal form.  addr = &sti->sti_ux_taddr
3535 			 */
3536 			error = so_ux_addr_xlate(so, name, namelen,
3537 			    (flags & MSG_XPG4_2),
3538 			    &addr, &addrlen);
3539 			if (error) {
3540 				eprintsoline(so, error);
3541 				return (error);
3542 			}
3543 		}
3544 	} else {
3545 		addr = name;
3546 		addrlen = namelen;
3547 		src = NULL;
3548 		srclen = 0;
3549 	}
3550 	optlen = so_optlen(control, controllen,
3551 	    !(flags & MSG_XPG4_2));
3552 	tudr.PRIM_type = T_UNITDATA_REQ;
3553 	tudr.DEST_length = addrlen;
3554 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3555 	if (srclen != 0)
3556 		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3557 		    _TPI_ALIGN_TOPT(srclen));
3558 	else
3559 		tudr.OPT_length = optlen;
3560 	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3561 	    _TPI_ALIGN_TOPT(addrlen));
3562 
3563 	size = tudr.OPT_offset + tudr.OPT_length;
3564 
3565 	/*
3566 	 * File descriptors only when SM_FDPASSING set.
3567 	 */
3568 	error = so_getfdopt(control, controllen,
3569 	    !(flags & MSG_XPG4_2), &fds, &fdlen);
3570 	if (error)
3571 		return (error);
3572 	if (fdlen != -1) {
3573 		if (!(so->so_mode & SM_FDPASSING))
3574 			return (EOPNOTSUPP);
3575 
3576 		error = fdbuf_create(fds, fdlen, &fdbuf);
3577 		if (error)
3578 			return (error);
3579 
3580 		/*
3581 		 * Pre-allocate enough additional space for lower level modules
3582 		 * to append an option (e.g. see tl_unitdata). The following
3583 		 * is enough extra space for the largest option we might append.
3584 		 */
3585 		size += sizeof (struct T_opthdr) + ucredsize;
3586 		mp = fdbuf_allocmsg(size, fdbuf);
3587 	} else {
3588 		mp = soallocproto(size, _ALLOC_INTR, CRED());
3589 		if (mp == NULL) {
3590 			/*
3591 			 * Caught a signal waiting for memory.
3592 			 * Let send* return EINTR.
3593 			 */
3594 			return (EINTR);
3595 		}
3596 	}
3597 	soappendmsg(mp, &tudr, sizeof (tudr));
3598 	soappendmsg(mp, addr, addrlen);
3599 	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3600 
3601 	if (fdlen != -1) {
3602 		ASSERT(fdbuf != NULL);
3603 		toh.level = SOL_SOCKET;
3604 		toh.name = SO_FILEP;
3605 		toh.len = fdbuf->fd_size +
3606 		    (t_uscalar_t)sizeof (struct T_opthdr);
3607 		toh.status = 0;
3608 		soappendmsg(mp, &toh, sizeof (toh));
3609 		soappendmsg(mp, fdbuf, fdbuf->fd_size);
3610 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3611 	}
3612 	if (srclen != 0) {
3613 		/*
3614 		 * There is a AF_UNIX sockaddr_un to include as a source
3615 		 * address option.
3616 		 */
3617 		toh.level = SOL_SOCKET;
3618 		toh.name = SO_SRCADDR;
3619 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3620 		toh.status = 0;
3621 		soappendmsg(mp, &toh, sizeof (toh));
3622 		soappendmsg(mp, src, srclen);
3623 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3624 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3625 	}
3626 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3627 	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3628 	/*
3629 	 * Normally at most 3 bytes left in the message, but we might have
3630 	 * allowed for extra space if we're passing fd's through.
3631 	 */
3632 	ASSERT(MBLKL(mp) <= (ssize_t)size);
3633 
3634 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3635 	if (AU_AUDITING())
3636 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3637 
3638 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3639 #ifdef SOCK_DEBUG
3640 	if (error) {
3641 		eprintsoline(so, error);
3642 	}
3643 #endif /* SOCK_DEBUG */
3644 	return (error);
3645 }
3646 
3647 /*
3648  * Sending data with options on a connected stream socket.
3649  * Assumes caller has verified that SS_ISCONNECTED is set.
3650  */
3651 static int
sosend_svccmsg(struct sonode * so,struct uio * uiop,int more,void * control,t_uscalar_t controllen,int flags)3652 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3653     t_uscalar_t controllen, int flags)
3654 {
3655 	struct T_optdata_req	tdr;
3656 	mblk_t			*mp;
3657 	int			error;
3658 	ssize_t			iosize;
3659 	int			size;
3660 	struct fdbuf		*fdbuf;
3661 	t_uscalar_t		optlen;
3662 	void			*fds;
3663 	int			fdlen;
3664 	struct T_opthdr		toh;
3665 	sotpi_info_t		*sti = SOTOTPI(so);
3666 
3667 	dprintso(so, 1,
3668 	    ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3669 
3670 	/*
3671 	 * Has to be bound and connected. However, since no locks are
3672 	 * held the state could have changed after sotpi_sendmsg checked it
3673 	 * thus it is not possible to ASSERT on the state.
3674 	 */
3675 
3676 	/* Options on connection-oriented only when SM_OPTDATA set. */
3677 	if (!(so->so_mode & SM_OPTDATA))
3678 		return (EOPNOTSUPP);
3679 
3680 	do {
3681 		/*
3682 		 * Set the MORE flag if uio_resid does not fit in this
3683 		 * message or if the caller passed in "more".
3684 		 * Error for transports with zero tidu_size.
3685 		 */
3686 		tdr.PRIM_type = T_OPTDATA_REQ;
3687 		iosize = sti->sti_tidu_size;
3688 		if (iosize <= 0)
3689 			return (EMSGSIZE);
3690 		if (uiop->uio_resid > iosize) {
3691 			tdr.DATA_flag = 1;
3692 		} else {
3693 			if (more)
3694 				tdr.DATA_flag = 1;
3695 			else
3696 				tdr.DATA_flag = 0;
3697 			iosize = uiop->uio_resid;
3698 		}
3699 		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3700 		    tdr.DATA_flag, iosize));
3701 
3702 		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3703 		tdr.OPT_length = optlen;
3704 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3705 
3706 		size = (int)sizeof (tdr) + optlen;
3707 		/*
3708 		 * File descriptors only when SM_FDPASSING set.
3709 		 */
3710 		error = so_getfdopt(control, controllen,
3711 		    !(flags & MSG_XPG4_2), &fds, &fdlen);
3712 		if (error)
3713 			return (error);
3714 		if (fdlen != -1) {
3715 			if (!(so->so_mode & SM_FDPASSING))
3716 				return (EOPNOTSUPP);
3717 
3718 			error = fdbuf_create(fds, fdlen, &fdbuf);
3719 			if (error)
3720 				return (error);
3721 
3722 			/*
3723 			 * Pre-allocate enough additional space for lower level
3724 			 * modules to append an option (e.g. see tl_unitdata).
3725 			 * The following is enough extra space for the largest
3726 			 * option we might append.
3727 			 */
3728 			size += sizeof (struct T_opthdr) + ucredsize;
3729 			mp = fdbuf_allocmsg(size, fdbuf);
3730 		} else {
3731 			mp = soallocproto(size, _ALLOC_INTR, CRED());
3732 			if (mp == NULL) {
3733 				/*
3734 				 * Caught a signal waiting for memory.
3735 				 * Let send* return EINTR.
3736 				 */
3737 				return (EINTR);
3738 			}
3739 		}
3740 		soappendmsg(mp, &tdr, sizeof (tdr));
3741 
3742 		if (fdlen != -1) {
3743 			ASSERT(fdbuf != NULL);
3744 			toh.level = SOL_SOCKET;
3745 			toh.name = SO_FILEP;
3746 			toh.len = fdbuf->fd_size +
3747 			    (t_uscalar_t)sizeof (struct T_opthdr);
3748 			toh.status = 0;
3749 			soappendmsg(mp, &toh, sizeof (toh));
3750 			soappendmsg(mp, fdbuf, fdbuf->fd_size);
3751 			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3752 		}
3753 		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3754 		/*
3755 		 * Normally at most 3 bytes left in the message, but we might
3756 		 * have allowed for extra space if we're passing fd's through.
3757 		 */
3758 		ASSERT(MBLKL(mp) <= (ssize_t)size);
3759 
3760 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3761 
3762 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3763 		    0, MSG_BAND, 0);
3764 		if (error) {
3765 			eprintsoline(so, error);
3766 			return (error);
3767 		}
3768 		control = NULL;
3769 		if (uiop->uio_resid > 0) {
3770 			/*
3771 			 * Recheck for fatal errors. Fail write even though
3772 			 * some data have been written. This is consistent
3773 			 * with strwrite semantics and BSD sockets semantics.
3774 			 */
3775 			if (so->so_state & SS_CANTSENDMORE) {
3776 				eprintsoline(so, error);
3777 				return (EPIPE);
3778 			}
3779 			if (so->so_error != 0) {
3780 				mutex_enter(&so->so_lock);
3781 				error = sogeterr(so, B_TRUE);
3782 				mutex_exit(&so->so_lock);
3783 				if (error != 0) {
3784 					eprintsoline(so, error);
3785 					return (error);
3786 				}
3787 			}
3788 		}
3789 	} while (uiop->uio_resid > 0);
3790 	return (0);
3791 }
3792 
3793 /*
3794  * Sending data on a datagram socket.
3795  * Assumes caller has verified that SS_ISBOUND etc. are set.
3796  *
3797  * For AF_UNIX the destination address may be already in
3798  * internal form, as indicated by sti->sti_faddr_noxlate
3799  * or the MSG_SENDTO_NOXLATE flag.  Otherwise we need to
3800  * translate the destination address to internal form.
3801  *
3802  * The source address is passed as an option.
3803  */
3804 int
sosend_dgram(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,int flags)3805 sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
3806     struct uio *uiop, int flags)
3807 {
3808 	struct T_unitdata_req	tudr;
3809 	mblk_t			*mp;
3810 	int			error;
3811 	void			*addr;
3812 	socklen_t		addrlen;
3813 	void			*src;
3814 	socklen_t		srclen;
3815 	ssize_t			len;
3816 	sotpi_info_t		*sti = SOTOTPI(so);
3817 
3818 	ASSERT(name != NULL && namelen != 0);
3819 
3820 	len = uiop->uio_resid;
3821 	if (len > sti->sti_tidu_size) {
3822 		error = EMSGSIZE;
3823 		goto done;
3824 	}
3825 
3826 	if (sti->sti_faddr_noxlate == 0 &&
3827 	    (flags & MSG_SENDTO_NOXLATE) == 0) {
3828 		/*
3829 		 * Length and family checks.
3830 		 * Don't verify internal form.
3831 		 */
3832 		error = so_addr_verify(so, name, namelen);
3833 		if (error != 0)
3834 			goto done;
3835 	}
3836 
3837 	if (sti->sti_direct)	/* Never on AF_UNIX */
3838 		return (sodgram_direct(so, name, namelen, uiop, flags));
3839 
3840 	if (so->so_family == AF_UNIX) {
3841 		if (sti->sti_faddr_noxlate) {
3842 			/*
3843 			 * Already have a transport internal address. Do not
3844 			 * pass any (transport internal) source address.
3845 			 */
3846 			addr = name;
3847 			addrlen = namelen;
3848 			src = NULL;
3849 			srclen = 0;
3850 		} else if (flags & MSG_SENDTO_NOXLATE) {
3851 			/*
3852 			 * Have an internal form dest. address.
3853 			 * Pass the source address as usual.
3854 			 */
3855 			addr = name;
3856 			addrlen = namelen;
3857 			src = sti->sti_laddr_sa;
3858 			srclen = (socklen_t)sti->sti_laddr_len;
3859 		} else {
3860 			/*
3861 			 * Pass the sockaddr_un source address as an option
3862 			 * and translate the remote address.
3863 			 *
3864 			 * Note that this code does not prevent sti_laddr_sa
3865 			 * from changing while it is being used. Thus
3866 			 * if an unbind+bind occurs concurrently with this
3867 			 * send the peer might see a partially new and a
3868 			 * partially old "from" address.
3869 			 */
3870 			src = sti->sti_laddr_sa;
3871 			srclen = (socklen_t)sti->sti_laddr_len;
3872 			dprintso(so, 1,
3873 			    ("sosend_dgram UNIX: srclen %d, src %p\n",
3874 			    srclen, src));
3875 			/*
3876 			 * The sendmsg caller specified a destination
3877 			 * address, which we must translate into our
3878 			 * internal form.  addr = &sti->sti_ux_taddr
3879 			 */
3880 			error = so_ux_addr_xlate(so, name, namelen,
3881 			    (flags & MSG_XPG4_2),
3882 			    &addr, &addrlen);
3883 			if (error) {
3884 				eprintsoline(so, error);
3885 				goto done;
3886 			}
3887 		}
3888 	} else {
3889 		addr = name;
3890 		addrlen = namelen;
3891 		src = NULL;
3892 		srclen = 0;
3893 	}
3894 	tudr.PRIM_type = T_UNITDATA_REQ;
3895 	tudr.DEST_length = addrlen;
3896 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3897 	if (srclen == 0) {
3898 		tudr.OPT_length = 0;
3899 		tudr.OPT_offset = 0;
3900 
3901 		mp = soallocproto2(&tudr, sizeof (tudr),
3902 		    addr, addrlen, 0, _ALLOC_INTR, CRED());
3903 		if (mp == NULL) {
3904 			/*
3905 			 * Caught a signal waiting for memory.
3906 			 * Let send* return EINTR.
3907 			 */
3908 			error = EINTR;
3909 			goto done;
3910 		}
3911 	} else {
3912 		/*
3913 		 * There is a AF_UNIX sockaddr_un to include as a source
3914 		 * address option.
3915 		 */
3916 		struct T_opthdr toh;
3917 		ssize_t size;
3918 
3919 		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
3920 		    _TPI_ALIGN_TOPT(srclen));
3921 		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3922 		    _TPI_ALIGN_TOPT(addrlen));
3923 
3924 		toh.level = SOL_SOCKET;
3925 		toh.name = SO_SRCADDR;
3926 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3927 		toh.status = 0;
3928 
3929 		size = tudr.OPT_offset + tudr.OPT_length;
3930 		mp = soallocproto2(&tudr, sizeof (tudr),
3931 		    addr, addrlen, size, _ALLOC_INTR, CRED());
3932 		if (mp == NULL) {
3933 			/*
3934 			 * Caught a signal waiting for memory.
3935 			 * Let send* return EINTR.
3936 			 */
3937 			error = EINTR;
3938 			goto done;
3939 		}
3940 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3941 		soappendmsg(mp, &toh, sizeof (toh));
3942 		soappendmsg(mp, src, srclen);
3943 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3944 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3945 	}
3946 
3947 	if (AU_AUDITING())
3948 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3949 
3950 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3951 done:
3952 #ifdef SOCK_DEBUG
3953 	if (error) {
3954 		eprintsoline(so, error);
3955 	}
3956 #endif /* SOCK_DEBUG */
3957 	return (error);
3958 }
3959 
3960 /*
3961  * Sending data on a connected stream socket.
3962  * Assumes caller has verified that SS_ISCONNECTED is set.
3963  */
3964 int
sosend_svc(struct sonode * so,struct uio * uiop,t_scalar_t prim,int more,int sflag)3965 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
3966     int sflag)
3967 {
3968 	struct T_data_req	tdr;
3969 	mblk_t			*mp;
3970 	int			error;
3971 	ssize_t			iosize;
3972 	sotpi_info_t		*sti = SOTOTPI(so);
3973 
3974 	dprintso(so, 1,
3975 	    ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
3976 	    (void *)so, uiop->uio_resid, prim, sflag));
3977 
3978 	/*
3979 	 * Has to be bound and connected. However, since no locks are
3980 	 * held the state could have changed after sotpi_sendmsg checked it
3981 	 * thus it is not possible to ASSERT on the state.
3982 	 */
3983 
3984 	do {
3985 		/*
3986 		 * Set the MORE flag if uio_resid does not fit in this
3987 		 * message or if the caller passed in "more".
3988 		 * Error for transports with zero tidu_size.
3989 		 */
3990 		tdr.PRIM_type = prim;
3991 		iosize = sti->sti_tidu_size;
3992 		if (iosize <= 0)
3993 			return (EMSGSIZE);
3994 		if (uiop->uio_resid > iosize) {
3995 			tdr.MORE_flag = 1;
3996 		} else {
3997 			if (more)
3998 				tdr.MORE_flag = 1;
3999 			else
4000 				tdr.MORE_flag = 0;
4001 			iosize = uiop->uio_resid;
4002 		}
4003 		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4004 		    prim, tdr.MORE_flag, iosize));
4005 		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
4006 		if (mp == NULL) {
4007 			/*
4008 			 * Caught a signal waiting for memory.
4009 			 * Let send* return EINTR.
4010 			 */
4011 			return (EINTR);
4012 		}
4013 
4014 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4015 		    0, sflag | MSG_BAND, 0);
4016 		if (error) {
4017 			eprintsoline(so, error);
4018 			return (error);
4019 		}
4020 		if (uiop->uio_resid > 0) {
4021 			/*
4022 			 * Recheck for fatal errors. Fail write even though
4023 			 * some data have been written. This is consistent
4024 			 * with strwrite semantics and BSD sockets semantics.
4025 			 */
4026 			if (so->so_state & SS_CANTSENDMORE) {
4027 				eprintsoline(so, error);
4028 				return (EPIPE);
4029 			}
4030 			if (so->so_error != 0) {
4031 				mutex_enter(&so->so_lock);
4032 				error = sogeterr(so, B_TRUE);
4033 				mutex_exit(&so->so_lock);
4034 				if (error != 0) {
4035 					eprintsoline(so, error);
4036 					return (error);
4037 				}
4038 			}
4039 		}
4040 	} while (uiop->uio_resid > 0);
4041 	return (0);
4042 }
4043 
/*
 * Check the state for errors and call the appropriate send function.
 *
 * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
 * this function issues a setsockopt to toggle SO_DONTROUTE before and
 * after sending the message.
 *
 * The caller may optionally specify a destination address, for either
 * stream or datagram sockets.  In this table "Stream" means the socket
 * mode has SM_CONNREQUIRED set and "Dgram" means it does not:
 *
 *    Socket type    Dest. given    Connected    Result
 *    -----------    -----------    ---------    --------------
 *    Stream         *              Yes          send to conn. addr.
 *    Stream         *              No           error ENOTCONN
 *    Dgram          yes            yes          error EISCONN
 *    Dgram          yes            no           send to given addr.
 *    Dgram          no             yes          send to conn. addr.
 *    Dgram          no             no           error EDESTADDRREQ
 *
 * There are subtleties around the destination address when using
 * AF_UNIX datagram sockets.  When the sendmsg call specifies the
 * destination address, it's in (struct sockaddr_un) form and we
 * need to translate it to our internal form (struct so_ux_addr).
 *
 * When the sendmsg call does not specify a destination address
 * we're using the peer address saved during sotpi_connect, and
 * that address is already in internal form.  In this case, the
 * (internal only) flag MSG_SENDTO_NOXLATE is set in the flags
 * passed to sosend_dgram or sosend_dgramcmsg to indicate that
 * those functions should skip translation to internal form.
 * Avoiding that translation is not only more efficient, but it's
 * also necessary when a process does a connect on an AF_UNIX
 * datagram socket and then drops privileges.  After the process
 * has dropped privileges, it may no longer be able to look up
 * the external name in the filesystem, but it should still be
 * able to send messages on the connected socket by leaving the
 * destination name unspecified.
 *
 * Yet more subtleties arise with sockets connected by socketpair(),
 * which puts internal form addresses in the fields where normally
 * the external form is found, and sets sti_faddr_noxlate=1, which
 * (like flag MSG_SENDTO_NOXLATE) causes the sosend_dgram functions
 * to skip translation of destination addresses to internal form.
 * However, beware that the flag sti_faddr_noxlate=1 also triggers
 * different behaviour almost everywhere AF_UNIX addresses appear.
 *
 * Other errors returned directly from here: EPIPE when SS_CANTSENDMORE
 * is set, a pending socket error from sogeterr(), a delayed datagram
 * error matching the destination, and EOPNOTSUPP for MSG_OOB on a
 * transport without SM_EXDATA (or MSG_OOB combined with control data).
 */
static int
sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
    struct cred *cr)
{
	int		so_state;
	int		so_mode;
	int		error;
	struct sockaddr *name;
	t_uscalar_t	namelen;
	int		dontroute;
	int		flags;
	sotpi_info_t	*sti = SOTOTPI(so);

	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
	    (void *)so, (void *)msg, msg->msg_flags,
	    pr_state(so->so_state, so->so_mode), so->so_error));

	if (so->so_version == SOV_STREAM) {
		/* The imaginary "sockmod" has been popped - act as a stream */
		so_update_attrs(so, SOMOD);
		return (strwrite(SOTOV(so), uiop, cr));
	}

	/*
	 * Snapshot the state under so_lock; the checks below all use
	 * this snapshot rather than re-reading so->so_state.
	 */
	mutex_enter(&so->so_lock);
	so_state = so->so_state;

	if (so_state & SS_CANTSENDMORE) {
		mutex_exit(&so->so_lock);
		return (EPIPE);
	}

	if (so->so_error != 0) {
		error = sogeterr(so, B_TRUE);
		if (error != 0) {
			mutex_exit(&so->so_lock);
			return (error);
		}
	}

	name = (struct sockaddr *)msg->msg_name;
	namelen = msg->msg_namelen;
	flags = msg->msg_flags;

	/*
	 * Historically, this function does not validate the flags
	 * passed in, and any errant bits are ignored.  However,
	 * we would not want any such errant flag bits accidentally
	 * being treated as one of the internal-only flags, so
	 * clear the internal-only flag bits.
	 */
	flags &= ~MSG_SENDTO_NOXLATE;

	so_mode = so->so_mode;

	if (name == NULL) {
		if (!(so_state & SS_ISCONNECTED)) {
			mutex_exit(&so->so_lock);
			if (so_mode & SM_CONNREQUIRED)
				return (ENOTCONN);
			else
				return (EDESTADDRREQ);
		}
		/*
		 * This is a connected socket.
		 */
		if (so_mode & SM_CONNREQUIRED) {
			/*
			 * This is a connected STREAM socket,
			 * destination not specified.
			 */
			name = NULL;
			namelen = 0;
		} else {
			/*
			 * Datagram send on connected socket with
			 * the destination name not specified.
			 * Use the peer address from connect.
			 */
			if (so->so_family == AF_UNIX) {
				/*
				 * Use the (internal form) address saved
				 * in sotpi_connect.  See above.
				 */
				name = (void *)&sti->sti_ux_faddr;
				namelen = sizeof (sti->sti_ux_faddr);
				flags |= MSG_SENDTO_NOXLATE;
			} else {
				ASSERT(sti->sti_faddr_sa);
				name = sti->sti_faddr_sa;
				namelen = (t_uscalar_t)sti->sti_faddr_len;
			}
		}
	} else {
		/*
		 * Sendmsg specifies a destination name
		 */
		if (!(so_state & SS_ISCONNECTED) &&
		    (so_mode & SM_CONNREQUIRED)) {
			/* i.e. TCP not connected */
			mutex_exit(&so->so_lock);
			return (ENOTCONN);
		}
		/*
		 * On connection-oriented (CONNREQUIRED) sockets the address
		 * is ignored.  Just like BSD this code does not generate an
		 * error for TCP when sending to an address passed in with
		 * sendto/sendmsg. Instead the data is delivered on the
		 * connection as if no address had been supplied.
		 *
		 * A connected socket that is not CONNREQUIRED (a connected
		 * datagram socket) is different: supplying a destination
		 * fails with EISCONN.
		 */
		if ((so_state & SS_ISCONNECTED) &&
		    !(so_mode & SM_CONNREQUIRED)) {
			mutex_exit(&so->so_lock);
			return (EISCONN);
		}
		/*
		 * Not yet bound: bind to an unspecified address first.
		 * so_lock is already held, hence _SOBIND_LOCK_HELD.
		 */
		if (!(so_state & SS_ISBOUND)) {
			so_lock_single(so);	/* Set SOLOCKED */
			error = sotpi_bind(so, NULL, 0,
			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
			so_unlock_single(so, SOLOCKED);
			if (error) {
				mutex_exit(&so->so_lock);
				eprintsoline(so, error);
				return (error);
			}
		}
		/*
		 * Handle delayed datagram errors. These are only queued
		 * when the application sets SO_DGRAM_ERRIND.
		 * Return the error if we are sending to the address
		 * that was returned in the last T_UDERROR_IND.
		 * If sending to some other address discard the delayed
		 * error indication.
		 */
		if (sti->sti_delayed_error) {
			struct T_uderror_ind	*tudi;
			void			*addr;
			t_uscalar_t		addrlen;
			boolean_t		match = B_FALSE;

			ASSERT(sti->sti_eaddr_mp);
			/* Consume the delayed error: it is reported once. */
			error = sti->sti_delayed_error;
			sti->sti_delayed_error = 0;
			tudi =
			    (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
			addrlen = tudi->DEST_length;
			addr = sogetoff(sti->sti_eaddr_mp,
			    tudi->DEST_offset, addrlen, 1);
			ASSERT(addr);	/* Checked by strsock_proto */
			switch (so->so_family) {
			case AF_INET: {
				/* Compare just IP address and port */
				sin_t *sin1 = (sin_t *)name;
				sin_t *sin2 = (sin_t *)addr;

				/*
				 * The length checks come first so the
				 * fields are only read from full-sized
				 * addresses.
				 */
				if (addrlen == sizeof (sin_t) &&
				    namelen == addrlen &&
				    sin1->sin_port == sin2->sin_port &&
				    sin1->sin_addr.s_addr ==
				    sin2->sin_addr.s_addr)
					match = B_TRUE;
				break;
			}
			case AF_INET6: {
				/* Compare just IP address and port. Not flow */
				sin6_t *sin1 = (sin6_t *)name;
				sin6_t *sin2 = (sin6_t *)addr;

				if (addrlen == sizeof (sin6_t) &&
				    namelen == addrlen &&
				    sin1->sin6_port == sin2->sin6_port &&
				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
				    &sin2->sin6_addr))
					match = B_TRUE;
				break;
			}
			case AF_UNIX:
			default:
				/* Any other family: whole-address compare. */
				if (namelen == addrlen &&
				    bcmp(name, addr, namelen) == 0)
					match = B_TRUE;
			}
			if (match) {
				freemsg(sti->sti_eaddr_mp);
				sti->sti_eaddr_mp = NULL;
				mutex_exit(&so->so_lock);
#ifdef DEBUG
				dprintso(so, 0,
				    ("sockfs delayed error %d for %s\n",
				    error,
				    pr_addr(so->so_family, name, namelen)));
#endif /* DEBUG */
				return (error);
			}
			/* Different destination: drop the saved indication. */
			freemsg(sti->sti_eaddr_mp);
			sti->sti_eaddr_mp = NULL;
		}
	}
	mutex_exit(&so->so_lock);

	/*
	 * "dontroute" records that SO_DONTROUTE was turned on here and
	 * therefore has to be turned off again at "done".
	 */
	dontroute = 0;
	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
		uint32_t	val;

		val = 1;
		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
		    &val, (t_uscalar_t)sizeof (val), cr);
		if (error)
			return (error);
		dontroute = 1;
	}

	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
		error = EOPNOTSUPP;
		goto done;
	}
	/* Control data present: use the *cmsg variants of the senders. */
	if (msg->msg_controllen != 0) {
		if (!(so_mode & SM_CONNREQUIRED)) {
			so_update_attrs(so, SOMOD);
			error = sosend_dgramcmsg(so, name, namelen, uiop,
			    msg->msg_control, msg->msg_controllen, flags);
		} else {
			if (flags & MSG_OOB) {
				/* Can't generate T_EXDATA_REQ with options */
				error = EOPNOTSUPP;
				goto done;
			}
			so_update_attrs(so, SOMOD);
			error = sosend_svccmsg(so, uiop,
			    !(flags & MSG_EOR),
			    msg->msg_control, msg->msg_controllen,
			    flags);
		}
		goto done;
	}

	so_update_attrs(so, SOMOD);
	if (!(so_mode & SM_CONNREQUIRED)) {
		/*
		 * If there is no SO_DONTROUTE to turn off return immediately
		 * from send_dgram. This can allow tail-call optimizations.
		 */
		if (!dontroute) {
			return (sosend_dgram(so, name, namelen, uiop, flags));
		}
		error = sosend_dgram(so, name, namelen, uiop, flags);
	} else {
		t_scalar_t prim;
		int sflag;

		/* Ignore msg_name in the connected state */
		if (flags & MSG_OOB) {
			prim = T_EXDATA_REQ;
			/*
			 * Send down T_EXDATA_REQ even if there is flow
			 * control for data.
			 */
			sflag = MSG_IGNFLOW;
		} else {
			if (so_mode & SM_BYTESTREAM) {
				/* Byte stream transport - use write */
				dprintso(so, 1, ("sotpi_sendmsg: write\n"));

				/* Send M_DATA messages */
				/*
				 * If there is no SO_DONTROUTE to turn off,
				 * sti_direct is on, and there is no flow
				 * control, we can take the fast path.
				 */
				if (!dontroute && sti->sti_direct != 0 &&
				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
					return (sostream_direct(so, uiop,
					    NULL, cr));
				}
				error = strwrite(SOTOV(so), uiop, cr);
				goto done;
			}
			prim = T_DATA_REQ;
			sflag = 0;
		}
		/*
		 * If there is no SO_DONTROUTE to turn off return immediately
		 * from sosend_svc. This can allow tail-call optimizations.
		 */
		if (!dontroute)
			return (sosend_svc(so, uiop, prim,
			    !(flags & MSG_EOR), sflag));
		error = sosend_svc(so, uiop, prim,
		    !(flags & MSG_EOR), sflag);
	}
	/*
	 * Every path that reaches this point without a goto had
	 * dontroute set; the !dontroute cases returned above.
	 */
	ASSERT(dontroute);
done:
	if (dontroute) {
		uint32_t	val;

		/* Best effort: the send result takes precedence. */
		val = 0;
		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
		    &val, (t_uscalar_t)sizeof (val), cr);
	}
	return (error);
}
4390 
4391 /*
4392  * kstrwritemp() has very similar semantics as that of strwrite().
4393  * The main difference is it obtains mblks from the caller and also
4394  * does not do any copy as done in strwrite() from user buffers to
4395  * kernel buffers.
4396  *
4397  * Currently, this routine is used by sendfile to send data allocated
4398  * within the kernel without any copying. This interface does not use the
4399  * synchronous stream interface as synch. stream interface implies
4400  * copying.
4401  */
4402 int
kstrwritemp(struct vnode * vp,mblk_t * mp,ushort_t fmode)4403 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4404 {
4405 	struct stdata *stp;
4406 	struct queue *wqp;
4407 	mblk_t *newmp;
4408 	char waitflag;
4409 	int tempmode;
4410 	int error = 0;
4411 	int done = 0;
4412 	struct sonode *so;
4413 	boolean_t direct;
4414 
4415 	ASSERT(vp->v_stream);
4416 	stp = vp->v_stream;
4417 
4418 	so = VTOSO(vp);
4419 	direct = _SOTOTPI(so)->sti_direct;
4420 
4421 	/*
4422 	 * This is the sockfs direct fast path. canputnext() need
4423 	 * not be accurate so we don't grab the sd_lock here. If
4424 	 * we get flow-controlled, we grab sd_lock just before the
4425 	 * do..while loop below to emulate what strwrite() does.
4426 	 */
4427 	wqp = stp->sd_wrq;
4428 	if (canputnext(wqp) && direct &&
4429 	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4430 		return (sostream_direct(so, NULL, mp, CRED()));
4431 	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4432 		/* Fast check of flags before acquiring the lock */
4433 		mutex_enter(&stp->sd_lock);
4434 		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4435 		mutex_exit(&stp->sd_lock);
4436 		if (error != 0) {
4437 			if (!(stp->sd_flag & STPLEX) &&
4438 			    (stp->sd_wput_opt & SW_SIGPIPE)) {
4439 				error = EPIPE;
4440 			}
4441 			return (error);
4442 		}
4443 	}
4444 
4445 	waitflag = WRITEWAIT;
4446 	if (stp->sd_flag & OLDNDELAY)
4447 		tempmode = fmode & ~FNDELAY;
4448 	else
4449 		tempmode = fmode;
4450 
4451 	mutex_enter(&stp->sd_lock);
4452 	do {
4453 		if (canputnext(wqp)) {
4454 			mutex_exit(&stp->sd_lock);
4455 			if (stp->sd_wputdatafunc != NULL) {
4456 				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4457 				    NULL, NULL, NULL);
4458 				if (newmp == NULL) {
4459 					/* The caller will free mp */
4460 					return (ECOMM);
4461 				}
4462 				mp = newmp;
4463 			}
4464 			putnext(wqp, mp);
4465 			return (0);
4466 		}
4467 		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4468 		    &done);
4469 	} while (error == 0 && !done);
4470 
4471 	mutex_exit(&stp->sd_lock);
4472 	/*
4473 	 * EAGAIN tells the application to try again. ENOMEM
4474 	 * is returned only if the memory allocation size
4475 	 * exceeds the physical limits of the system. ENOMEM
4476 	 * can't be true here.
4477 	 */
4478 	if (error == ENOMEM)
4479 		error = EAGAIN;
4480 	return (error);
4481 }
4482 
4483 /* ARGSUSED */
4484 static int
sotpi_sendmblk(struct sonode * so,struct nmsghdr * msg,int fflag,struct cred * cr,mblk_t ** mpp)4485 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4486     struct cred *cr, mblk_t **mpp)
4487 {
4488 	int error;
4489 
4490 	switch (so->so_family) {
4491 	case AF_INET:
4492 	case AF_INET6:
4493 	case AF_UNIX:
4494 		break;
4495 	default:
4496 		return (EAFNOSUPPORT);
4497 
4498 	}
4499 
4500 	if (so->so_state & SS_CANTSENDMORE)
4501 		return (EPIPE);
4502 
4503 	if (so->so_type != SOCK_STREAM)
4504 		return (EOPNOTSUPP);
4505 
4506 	if ((so->so_state & SS_ISCONNECTED) == 0)
4507 		return (ENOTCONN);
4508 
4509 	error = kstrwritemp(so->so_vnode, *mpp, fflag);
4510 	if (error == 0)
4511 		*mpp = NULL;
4512 	return (error);
4513 }
4514 
4515 /*
4516  * Sending data on a datagram socket.
4517  * Assumes caller has verified that SS_ISBOUND etc. are set.
4518  */
4519 /* ARGSUSED */
4520 static int
sodgram_direct(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,int flags)4521 sodgram_direct(struct sonode *so, struct sockaddr *name,
4522     socklen_t namelen, struct uio *uiop, int flags)
4523 {
4524 	struct T_unitdata_req	tudr;
4525 	mblk_t			*mp = NULL;
4526 	int			error = 0;
4527 	void			*addr;
4528 	socklen_t		addrlen;
4529 	ssize_t			len;
4530 	struct stdata		*stp = SOTOV(so)->v_stream;
4531 	int			so_state;
4532 	queue_t			*udp_wq;
4533 	boolean_t		connected;
4534 	mblk_t			*mpdata = NULL;
4535 	sotpi_info_t		*sti = SOTOTPI(so);
4536 	uint32_t		auditing = AU_AUDITING();
4537 
4538 	ASSERT(name != NULL && namelen != 0);
4539 	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4540 	ASSERT(!(so->so_mode & SM_EXDATA));
4541 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4542 	ASSERT(SOTOV(so)->v_type == VSOCK);
4543 
4544 	/* Caller checked for proper length */
4545 	len = uiop->uio_resid;
4546 	ASSERT(len <= sti->sti_tidu_size);
4547 
4548 	/* Length and family checks have been done by caller */
4549 	ASSERT(name->sa_family == so->so_family);
4550 	ASSERT(so->so_family == AF_INET ||
4551 	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4552 	ASSERT(so->so_family == AF_INET6 ||
4553 	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4554 
4555 	addr = name;
4556 	addrlen = namelen;
4557 
4558 	if (stp->sd_sidp != NULL &&
4559 	    (error = straccess(stp, JCWRITE)) != 0)
4560 		goto done;
4561 
4562 	so_state = so->so_state;
4563 
4564 	connected = so_state & SS_ISCONNECTED;
4565 	if (!connected) {
4566 		tudr.PRIM_type = T_UNITDATA_REQ;
4567 		tudr.DEST_length = addrlen;
4568 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4569 		tudr.OPT_length = 0;
4570 		tudr.OPT_offset = 0;
4571 
4572 		mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4573 		    _ALLOC_INTR, CRED());
4574 		if (mp == NULL) {
4575 			/*
4576 			 * Caught a signal waiting for memory.
4577 			 * Let send* return EINTR.
4578 			 */
4579 			error = EINTR;
4580 			goto done;
4581 		}
4582 	}
4583 
4584 	/*
4585 	 * For UDP we don't break up the copyin into smaller pieces
4586 	 * as in the TCP case.  That means if ENOMEM is returned by
4587 	 * mcopyinuio() then the uio vector has not been modified at
4588 	 * all and we fallback to either strwrite() or kstrputmsg()
4589 	 * below.  Note also that we never generate priority messages
4590 	 * from here.
4591 	 */
4592 	udp_wq = stp->sd_wrq->q_next;
4593 	if (canput(udp_wq) &&
4594 	    (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4595 		ASSERT(DB_TYPE(mpdata) == M_DATA);
4596 		ASSERT(uiop->uio_resid == 0);
4597 		if (!connected)
4598 			linkb(mp, mpdata);
4599 		else
4600 			mp = mpdata;
4601 		if (auditing)
4602 			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4603 
4604 		/* Always returns 0... */
4605 		return (udp_wput(udp_wq, mp));
4606 	}
4607 
4608 	ASSERT(mpdata == NULL);
4609 	if (error != 0 && error != ENOMEM) {
4610 		freemsg(mp);
4611 		return (error);
4612 	}
4613 
4614 	/*
4615 	 * For connected, let strwrite() handle the blocking case.
4616 	 * Otherwise we fall thru and use kstrputmsg().
4617 	 */
4618 	if (connected)
4619 		return (strwrite(SOTOV(so), uiop, CRED()));
4620 
4621 	if (auditing)
4622 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4623 
4624 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4625 done:
4626 #ifdef SOCK_DEBUG
4627 	if (error != 0) {
4628 		eprintsoline(so, error);
4629 	}
4630 #endif /* SOCK_DEBUG */
4631 	return (error);
4632 }
4633 
4634 int
sostream_direct(struct sonode * so,struct uio * uiop,mblk_t * mp,cred_t * cr)4635 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4636 {
4637 	struct stdata *stp = SOTOV(so)->v_stream;
4638 	ssize_t iosize, rmax, maxblk;
4639 	queue_t *tcp_wq = stp->sd_wrq->q_next;
4640 	mblk_t *newmp;
4641 	int error = 0, wflag = 0;
4642 
4643 	ASSERT(so->so_mode & SM_BYTESTREAM);
4644 	ASSERT(SOTOV(so)->v_type == VSOCK);
4645 
4646 	if (stp->sd_sidp != NULL &&
4647 	    (error = straccess(stp, JCWRITE)) != 0)
4648 		return (error);
4649 
4650 	if (uiop == NULL) {
4651 		/*
4652 		 * kstrwritemp() should have checked sd_flag and
4653 		 * flow-control before coming here.  If we end up
4654 		 * here it means that we can simply pass down the
4655 		 * data to tcp.
4656 		 */
4657 		ASSERT(mp != NULL);
4658 		if (stp->sd_wputdatafunc != NULL) {
4659 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4660 			    NULL, NULL, NULL);
4661 			if (newmp == NULL) {
4662 				/* The caller will free mp */
4663 				return (ECOMM);
4664 			}
4665 			mp = newmp;
4666 		}
4667 		/* Always returns 0... */
4668 		return (tcp_wput(tcp_wq, mp));
4669 	}
4670 
4671 	/* Fallback to strwrite() to do proper error handling */
4672 	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4673 		return (strwrite(SOTOV(so), uiop, cr));
4674 
4675 	rmax = stp->sd_qn_maxpsz;
4676 	ASSERT(rmax >= 0 || rmax == INFPSZ);
4677 	if (rmax == 0 || uiop->uio_resid <= 0)
4678 		return (0);
4679 
4680 	if (rmax == INFPSZ)
4681 		rmax = uiop->uio_resid;
4682 
4683 	maxblk = stp->sd_maxblk;
4684 
4685 	for (;;) {
4686 		iosize = MIN(uiop->uio_resid, rmax);
4687 
4688 		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4689 		if (mp == NULL) {
4690 			/*
4691 			 * Fallback to strwrite() for ENOMEM; if this
4692 			 * is our first time in this routine and the uio
4693 			 * vector has not been modified, we will end up
4694 			 * calling strwrite() without any flag set.
4695 			 */
4696 			if (error == ENOMEM)
4697 				goto slow_send;
4698 			else
4699 				return (error);
4700 		}
4701 		ASSERT(uiop->uio_resid >= 0);
4702 		/*
4703 		 * If mp is non-NULL and ENOMEM is set, it means that
4704 		 * mcopyinuio() was able to break down some of the user
4705 		 * data into one or more mblks.  Send the partial data
4706 		 * to tcp and let the rest be handled in strwrite().
4707 		 */
4708 		ASSERT(error == 0 || error == ENOMEM);
4709 		if (stp->sd_wputdatafunc != NULL) {
4710 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4711 			    NULL, NULL, NULL);
4712 			if (newmp == NULL) {
4713 				/* The caller will free mp */
4714 				return (ECOMM);
4715 			}
4716 			mp = newmp;
4717 		}
4718 		(void) tcp_wput(tcp_wq, mp);	/* Always returns 0 anyway. */
4719 
4720 		wflag |= NOINTR;
4721 
4722 		if (uiop->uio_resid == 0) {	/* No more data; we're done */
4723 			ASSERT(error == 0);
4724 			break;
4725 		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4726 		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4727 slow_send:
4728 			/*
4729 			 * We were able to send down partial data using
4730 			 * the direct call interface, but are now relying
4731 			 * on strwrite() to handle the non-fastpath cases.
4732 			 * If the socket is blocking we will sleep in
4733 			 * strwaitq() until write is permitted, otherwise,
4734 			 * we will need to return the amount of bytes
4735 			 * written so far back to the app.  This is the
4736 			 * reason why we pass NOINTR flag to strwrite()
4737 			 * for non-blocking socket, because we don't want
4738 			 * to return EAGAIN when portion of the user data
4739 			 * has actually been sent down.
4740 			 */
4741 			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4742 		}
4743 	}
4744 	return (0);
4745 }
4746 
4747 /*
4748  * Update sti_faddr by asking the transport (unless AF_UNIX).
4749  */
4750 /* ARGSUSED */
4751 int
sotpi_getpeername(struct sonode * so,struct sockaddr * name,socklen_t * namelen,boolean_t accept,struct cred * cr)4752 sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4753     boolean_t accept, struct cred *cr)
4754 {
4755 	struct strbuf	strbuf;
4756 	int		error = 0, res;
4757 	void		*addr;
4758 	t_uscalar_t	addrlen;
4759 	k_sigset_t	smask;
4760 	sotpi_info_t	*sti = SOTOTPI(so);
4761 
4762 	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4763 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4764 
4765 	ASSERT(*namelen > 0);
4766 	mutex_enter(&so->so_lock);
4767 	so_lock_single(so);	/* Set SOLOCKED */
4768 
4769 	if (accept) {
4770 		bcopy(sti->sti_faddr_sa, name,
4771 		    MIN(*namelen, sti->sti_faddr_len));
4772 		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4773 		goto done;
4774 	}
4775 
4776 	if (!(so->so_state & SS_ISCONNECTED)) {
4777 		error = ENOTCONN;
4778 		goto done;
4779 	}
4780 	/* Added this check for X/Open */
4781 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4782 		error = EINVAL;
4783 		if (xnet_check_print) {
4784 			printf("sockfs: X/Open getpeername check => EINVAL\n");
4785 		}
4786 		goto done;
4787 	}
4788 
4789 	if (sti->sti_faddr_valid) {
4790 		bcopy(sti->sti_faddr_sa, name,
4791 		    MIN(*namelen, sti->sti_faddr_len));
4792 		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4793 		goto done;
4794 	}
4795 
4796 #ifdef DEBUG
4797 	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4798 	    pr_addr(so->so_family, sti->sti_faddr_sa,
4799 	    (t_uscalar_t)sti->sti_faddr_len)));
4800 #endif /* DEBUG */
4801 
4802 	if (so->so_family == AF_UNIX) {
4803 		/* Transport has different name space - return local info */
4804 		if (sti->sti_faddr_noxlate)
4805 			*namelen = 0;
4806 		error = 0;
4807 		goto done;
4808 	}
4809 
4810 	ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
4811 
4812 	ASSERT(sti->sti_faddr_sa);
4813 	/* Allocate local buffer to use with ioctl */
4814 	addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
4815 	mutex_exit(&so->so_lock);
4816 	addr = kmem_alloc(addrlen, KM_SLEEP);
4817 
4818 	/*
4819 	 * Issue TI_GETPEERNAME with signals masked.
4820 	 * Put the result in sti_faddr_sa so that getpeername works after
4821 	 * a shutdown(output).
4822 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4823 	 * back to the socket.
4824 	 */
4825 	strbuf.buf = addr;
4826 	strbuf.maxlen = addrlen;
4827 	strbuf.len = 0;
4828 
4829 	sigintr(&smask, 0);
4830 	res = 0;
4831 	ASSERT(cr);
4832 	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4833 	    0, K_TO_K, cr, &res);
4834 	sigunintr(&smask);
4835 
4836 	mutex_enter(&so->so_lock);
4837 	/*
4838 	 * If there is an error record the error in so_error put don't fail
4839 	 * the getpeername. Instead fallback on the recorded
4840 	 * sti->sti_faddr_sa.
4841 	 */
4842 	if (error) {
4843 		/*
4844 		 * Various stream head errors can be returned to the ioctl.
4845 		 * However, it is impossible to determine which ones of
4846 		 * these are really socket level errors that were incorrectly
4847 		 * consumed by the ioctl. Thus this code silently ignores the
4848 		 * error - to code explicitly does not reinstate the error
4849 		 * using soseterror().
4850 		 * Experiments have shows that at least this set of
4851 		 * errors are reported and should not be reinstated on the
4852 		 * socket:
4853 		 *	EINVAL	E.g. if an I_LINK was in effect when
4854 		 *		getpeername was called.
4855 		 *	EPIPE	The ioctl error semantics prefer the write
4856 		 *		side error over the read side error.
4857 		 *	ENOTCONN The transport just got disconnected but
4858 		 *		sockfs had not yet seen the T_DISCON_IND
4859 		 *		when issuing the ioctl.
4860 		 */
4861 		error = 0;
4862 	} else if (res == 0 && strbuf.len > 0 &&
4863 	    (so->so_state & SS_ISCONNECTED)) {
4864 		ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
4865 		sti->sti_faddr_len = (socklen_t)strbuf.len;
4866 		bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
4867 		sti->sti_faddr_valid = 1;
4868 
4869 		bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
4870 		*namelen = sti->sti_faddr_len;
4871 	}
4872 	kmem_free(addr, addrlen);
4873 #ifdef DEBUG
4874 	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
4875 	    pr_addr(so->so_family, sti->sti_faddr_sa,
4876 	    (t_uscalar_t)sti->sti_faddr_len)));
4877 #endif /* DEBUG */
4878 done:
4879 	so_unlock_single(so, SOLOCKED);
4880 	mutex_exit(&so->so_lock);
4881 	return (error);
4882 }
4883 
4884 /*
4885  * Update sti_laddr by asking the transport (unless AF_UNIX).
4886  */
4887 int
sotpi_getsockname(struct sonode * so,struct sockaddr * name,socklen_t * namelen,struct cred * cr)4888 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4889     struct cred *cr)
4890 {
4891 	struct strbuf	strbuf;
4892 	int		error = 0, res;
4893 	void		*addr;
4894 	t_uscalar_t	addrlen;
4895 	k_sigset_t	smask;
4896 	sotpi_info_t	*sti = SOTOTPI(so);
4897 
4898 	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
4899 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4900 
4901 	ASSERT(*namelen > 0);
4902 	mutex_enter(&so->so_lock);
4903 	so_lock_single(so);	/* Set SOLOCKED */
4904 
4905 #ifdef DEBUG
4906 
4907 	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
4908 	    pr_addr(so->so_family, sti->sti_laddr_sa,
4909 	    (t_uscalar_t)sti->sti_laddr_len)));
4910 #endif /* DEBUG */
4911 	if (sti->sti_laddr_valid) {
4912 		bcopy(sti->sti_laddr_sa, name,
4913 		    MIN(*namelen, sti->sti_laddr_len));
4914 		*namelen = sti->sti_laddr_len;
4915 		goto done;
4916 	}
4917 
4918 	if (so->so_family == AF_UNIX) {
4919 		/*
4920 		 * Transport has different name space - return local info. If we
4921 		 * have enough space, let consumers know the family.
4922 		 */
4923 		if (*namelen >= sizeof (sa_family_t)) {
4924 			name->sa_family = AF_UNIX;
4925 			*namelen = sizeof (sa_family_t);
4926 		} else {
4927 			*namelen = 0;
4928 		}
4929 		error = 0;
4930 		goto done;
4931 	}
4932 	if (!(so->so_state & SS_ISBOUND)) {
4933 		/* If not bound, then nothing to return. */
4934 		error = 0;
4935 		goto done;
4936 	}
4937 
4938 	/* Allocate local buffer to use with ioctl */
4939 	addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
4940 	mutex_exit(&so->so_lock);
4941 	addr = kmem_alloc(addrlen, KM_SLEEP);
4942 
4943 	/*
4944 	 * Issue TI_GETMYNAME with signals masked.
4945 	 * Put the result in sti_laddr_sa so that getsockname works after
4946 	 * a shutdown(output).
4947 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4948 	 * back to the socket.
4949 	 */
4950 	strbuf.buf = addr;
4951 	strbuf.maxlen = addrlen;
4952 	strbuf.len = 0;
4953 
4954 	sigintr(&smask, 0);
4955 	res = 0;
4956 	ASSERT(cr);
4957 	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
4958 	    0, K_TO_K, cr, &res);
4959 	sigunintr(&smask);
4960 
4961 	mutex_enter(&so->so_lock);
4962 	/*
4963 	 * If there is an error record the error in so_error put don't fail
4964 	 * the getsockname. Instead fallback on the recorded
4965 	 * sti->sti_laddr_sa.
4966 	 */
4967 	if (error) {
4968 		/*
4969 		 * Various stream head errors can be returned to the ioctl.
4970 		 * However, it is impossible to determine which ones of
4971 		 * these are really socket level errors that were incorrectly
4972 		 * consumed by the ioctl. Thus this code silently ignores the
4973 		 * error - to code explicitly does not reinstate the error
4974 		 * using soseterror().
4975 		 * Experiments have shows that at least this set of
4976 		 * errors are reported and should not be reinstated on the
4977 		 * socket:
4978 		 *	EINVAL	E.g. if an I_LINK was in effect when
4979 		 *		getsockname was called.
4980 		 *	EPIPE	The ioctl error semantics prefer the write
4981 		 *		side error over the read side error.
4982 		 */
4983 		error = 0;
4984 	} else if (res == 0 && strbuf.len > 0 &&
4985 	    (so->so_state & SS_ISBOUND)) {
4986 		ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
4987 		sti->sti_laddr_len = (socklen_t)strbuf.len;
4988 		bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
4989 		sti->sti_laddr_valid = 1;
4990 
4991 		bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
4992 		*namelen = sti->sti_laddr_len;
4993 	}
4994 	kmem_free(addr, addrlen);
4995 #ifdef DEBUG
4996 	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
4997 	    pr_addr(so->so_family, sti->sti_laddr_sa,
4998 	    (t_uscalar_t)sti->sti_laddr_len)));
4999 #endif /* DEBUG */
5000 done:
5001 	so_unlock_single(so, SOLOCKED);
5002 	mutex_exit(&so->so_lock);
5003 	return (error);
5004 }
5005 
5006 /*
5007  * Get socket options. For SOL_SOCKET options some options are handled
5008  * by the sockfs while others use the value recorded in the sonode as a
5009  * fallback should the T_SVR4_OPTMGMT_REQ fail.
5010  *
5011  * On the return most *optlenp bytes are copied to optval.
5012  */
5013 /* ARGSUSED */
5014 int
sotpi_getsockopt(struct sonode * so,int level,int option_name,void * optval,socklen_t * optlenp,int flags,struct cred * cr)5015 sotpi_getsockopt(struct sonode *so, int level, int option_name,
5016     void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5017 {
5018 	struct T_optmgmt_req	optmgmt_req;
5019 	struct T_optmgmt_ack	*optmgmt_ack;
5020 	struct opthdr		oh;
5021 	struct opthdr		*opt_res;
5022 	mblk_t			*mp = NULL;
5023 	int			error = 0;
5024 	void			*option = NULL;	/* Set if fallback value */
5025 	t_uscalar_t		maxlen = *optlenp;
5026 	t_uscalar_t		len;
5027 	uint32_t		value;
5028 	struct timeval		tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5029 	struct timeval32	tmo_val32;
5030 	struct so_snd_bufinfo	snd_bufinfo;	/* used for zero copy */
5031 
5032 	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5033 	    (void *)so, level, option_name, optval, (void *)optlenp,
5034 	    pr_state(so->so_state, so->so_mode)));
5035 
5036 	mutex_enter(&so->so_lock);
5037 	so_lock_single(so);	/* Set SOLOCKED */
5038 
5039 	len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
5040 
5041 	/*
5042 	 * Check for SOL_SOCKET options.
5043 	 * Certain SOL_SOCKET options are returned directly whereas
5044 	 * others only provide a default (fallback) value should
5045 	 * the T_SVR4_OPTMGMT_REQ fail.
5046 	 */
5047 	if (level == SOL_SOCKET) {
5048 		/* Check parameters */
5049 		switch (option_name) {
5050 		case SO_TYPE:
5051 		case SO_ERROR:
5052 		case SO_DEBUG:
5053 		case SO_ACCEPTCONN:
5054 		case SO_REUSEADDR:
5055 		case SO_KEEPALIVE:
5056 		case SO_DONTROUTE:
5057 		case SO_BROADCAST:
5058 		case SO_USELOOPBACK:
5059 		case SO_OOBINLINE:
5060 		case SO_SNDBUF:
5061 		case SO_RCVBUF:
5062 #ifdef notyet
5063 		case SO_SNDLOWAT:
5064 		case SO_RCVLOWAT:
5065 #endif /* notyet */
5066 		case SO_DOMAIN:
5067 		case SO_DGRAM_ERRIND:
5068 		case SO_PROTOCOL:
5069 			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5070 				error = EINVAL;
5071 				eprintsoline(so, error);
5072 				goto done2;
5073 			}
5074 			break;
5075 		case SO_RCVTIMEO:
5076 		case SO_SNDTIMEO:
5077 			if (get_udatamodel() == DATAMODEL_NONE ||
5078 			    get_udatamodel() == DATAMODEL_NATIVE) {
5079 				if (maxlen < sizeof (struct timeval)) {
5080 					error = EINVAL;
5081 					eprintsoline(so, error);
5082 					goto done2;
5083 				}
5084 			} else {
5085 				if (maxlen < sizeof (struct timeval32)) {
5086 					error = EINVAL;
5087 					eprintsoline(so, error);
5088 					goto done2;
5089 				}
5090 
5091 			}
5092 			break;
5093 		case SO_LINGER:
5094 			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5095 				error = EINVAL;
5096 				eprintsoline(so, error);
5097 				goto done2;
5098 			}
5099 			break;
5100 		case SO_SND_BUFINFO:
5101 			if (maxlen < (t_uscalar_t)
5102 			    sizeof (struct so_snd_bufinfo)) {
5103 				error = EINVAL;
5104 				eprintsoline(so, error);
5105 				goto done2;
5106 			}
5107 			break;
5108 		}
5109 
5110 		switch (option_name) {
5111 		case SO_TYPE:
5112 			value = so->so_type;
5113 			option = &value;
5114 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5115 
5116 		case SO_ERROR:
5117 			value = sogeterr(so, B_TRUE);
5118 			option = &value;
5119 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5120 
5121 		case SO_ACCEPTCONN:
5122 			if (so->so_state & SS_ACCEPTCONN)
5123 				value = SO_ACCEPTCONN;
5124 			else
5125 				value = 0;
5126 #ifdef DEBUG
5127 			if (value) {
5128 				dprintso(so, 1,
5129 				    ("sotpi_getsockopt: 0x%x is set\n",
5130 				    option_name));
5131 			} else {
5132 				dprintso(so, 1,
5133 				    ("sotpi_getsockopt: 0x%x not set\n",
5134 				    option_name));
5135 			}
5136 #endif /* DEBUG */
5137 			option = &value;
5138 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5139 
5140 		case SO_DEBUG:
5141 		case SO_REUSEADDR:
5142 		case SO_KEEPALIVE:
5143 		case SO_DONTROUTE:
5144 		case SO_BROADCAST:
5145 		case SO_USELOOPBACK:
5146 		case SO_OOBINLINE:
5147 		case SO_DGRAM_ERRIND:
5148 			value = (so->so_options & option_name);
5149 #ifdef DEBUG
5150 			if (value) {
5151 				dprintso(so, 1,
5152 				    ("sotpi_getsockopt: 0x%x is set\n",
5153 				    option_name));
5154 			} else {
5155 				dprintso(so, 1,
5156 				    ("sotpi_getsockopt: 0x%x not set\n",
5157 				    option_name));
5158 			}
5159 #endif /* DEBUG */
5160 			option = &value;
5161 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5162 
5163 		/*
5164 		 * The following options are only returned by sockfs when the
5165 		 * T_SVR4_OPTMGMT_REQ fails.
5166 		 */
5167 		case SO_LINGER:
5168 			option = &so->so_linger;
5169 			len = (t_uscalar_t)sizeof (struct linger);
5170 			break;
5171 		case SO_SNDBUF: {
5172 			ssize_t lvalue;
5173 
5174 			/*
5175 			 * If the option has not been set then get a default
5176 			 * value from the read queue. This value is
5177 			 * returned if the transport fails
5178 			 * the T_SVR4_OPTMGMT_REQ.
5179 			 */
5180 			lvalue = so->so_sndbuf;
5181 			if (lvalue == 0) {
5182 				mutex_exit(&so->so_lock);
5183 				(void) strqget(strvp2wq(SOTOV(so))->q_next,
5184 				    QHIWAT, 0, &lvalue);
5185 				mutex_enter(&so->so_lock);
5186 				dprintso(so, 1,
5187 				    ("got SO_SNDBUF %ld from q\n", lvalue));
5188 			}
5189 			value = (int)lvalue;
5190 			option = &value;
5191 			len = (t_uscalar_t)sizeof (so->so_sndbuf);
5192 			break;
5193 		}
5194 		case SO_RCVBUF: {
5195 			ssize_t lvalue;
5196 
5197 			/*
5198 			 * If the option has not been set then get a default
5199 			 * value from the read queue. This value is
5200 			 * returned if the transport fails
5201 			 * the T_SVR4_OPTMGMT_REQ.
5202 			 *
5203 			 * XXX If SO_RCVBUF has been set and this is an
5204 			 * XPG 4.2 application then do not ask the transport
5205 			 * since the transport might adjust the value and not
5206 			 * return exactly what was set by the application.
5207 			 * For non-XPG 4.2 application we return the value
5208 			 * that the transport is actually using.
5209 			 */
5210 			lvalue = so->so_rcvbuf;
5211 			if (lvalue == 0) {
5212 				mutex_exit(&so->so_lock);
5213 				(void) strqget(RD(strvp2wq(SOTOV(so))),
5214 				    QHIWAT, 0, &lvalue);
5215 				mutex_enter(&so->so_lock);
5216 				dprintso(so, 1,
5217 				    ("got SO_RCVBUF %ld from q\n", lvalue));
5218 			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
5219 				value = (int)lvalue;
5220 				option = &value;
5221 				goto copyout;	/* skip asking transport */
5222 			}
5223 			value = (int)lvalue;
5224 			option = &value;
5225 			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5226 			break;
5227 		}
5228 		case SO_DOMAIN:
5229 			value = so->so_family;
5230 			option = &value;
5231 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5232 
5233 		case SO_PROTOCOL:
5234 			value = so->so_protocol;
5235 			option = &value;
5236 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5237 
5238 #ifdef notyet
5239 		/*
5240 		 * We do not implement the semantics of these options
5241 		 * thus we shouldn't implement the options either.
5242 		 */
5243 		case SO_SNDLOWAT:
5244 			value = so->so_sndlowat;
5245 			option = &value;
5246 			break;
5247 		case SO_RCVLOWAT:
5248 			value = so->so_rcvlowat;
5249 			option = &value;
5250 			break;
5251 #endif /* notyet */
5252 		case SO_SNDTIMEO:
5253 		case SO_RCVTIMEO: {
5254 			clock_t val;
5255 
5256 			if (option_name == SO_RCVTIMEO)
5257 				val = drv_hztousec(so->so_rcvtimeo);
5258 			else
5259 				val = drv_hztousec(so->so_sndtimeo);
5260 			tmo_val.tv_sec = val / (1000 * 1000);
5261 			tmo_val.tv_usec = val % (1000 * 1000);
5262 			if (get_udatamodel() == DATAMODEL_NONE ||
5263 			    get_udatamodel() == DATAMODEL_NATIVE) {
5264 				option = &tmo_val;
5265 				len = sizeof (struct timeval);
5266 			} else {
5267 				TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5268 				option = &tmo_val32;
5269 				len = sizeof (struct timeval32);
5270 			}
5271 			break;
5272 		}
5273 		case SO_SND_BUFINFO: {
5274 			snd_bufinfo.sbi_wroff =
5275 			    (so->so_proto_props).sopp_wroff;
5276 			snd_bufinfo.sbi_maxblk =
5277 			    (so->so_proto_props).sopp_maxblk;
5278 			snd_bufinfo.sbi_maxpsz =
5279 			    (so->so_proto_props).sopp_maxpsz;
5280 			snd_bufinfo.sbi_tail =
5281 			    (so->so_proto_props).sopp_tail;
5282 			option = &snd_bufinfo;
5283 			len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5284 			break;
5285 		}
5286 		}
5287 	}
5288 
5289 	mutex_exit(&so->so_lock);
5290 
5291 	/* Send request */
5292 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5293 	optmgmt_req.MGMT_flags = T_CHECK;
5294 	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5295 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5296 
5297 	oh.level = level;
5298 	oh.name = option_name;
5299 	oh.len = maxlen;
5300 
5301 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5302 	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5303 	/* Let option management work in the presence of data flow control */
5304 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5305 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5306 	mp = NULL;
5307 	mutex_enter(&so->so_lock);
5308 	if (error) {
5309 		eprintsoline(so, error);
5310 		goto done2;
5311 	}
5312 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5313 	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5314 	if (error) {
5315 		if (option != NULL) {
5316 			/* We have a fallback value */
5317 			error = 0;
5318 			goto copyout;
5319 		}
5320 		eprintsoline(so, error);
5321 		goto done2;
5322 	}
5323 	ASSERT(mp);
5324 	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5325 	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5326 	    optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5327 	if (opt_res == NULL) {
5328 		if (option != NULL) {
5329 			/* We have a fallback value */
5330 			error = 0;
5331 			goto copyout;
5332 		}
5333 		error = EPROTO;
5334 		eprintsoline(so, error);
5335 		goto done;
5336 	}
5337 	option = &opt_res[1];
5338 
5339 	/* check to ensure that the option is within bounds */
5340 	if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5341 	    (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5342 		if (option != NULL) {
5343 			/* We have a fallback value */
5344 			error = 0;
5345 			goto copyout;
5346 		}
5347 		error = EPROTO;
5348 		eprintsoline(so, error);
5349 		goto done;
5350 	}
5351 
5352 	len = opt_res->len;
5353 
5354 copyout: {
5355 		t_uscalar_t size = MIN(len, maxlen);
5356 		bcopy(option, optval, size);
5357 		bcopy(&size, optlenp, sizeof (size));
5358 	}
5359 done:
5360 	freemsg(mp);
5361 done2:
5362 	so_unlock_single(so, SOLOCKED);
5363 	mutex_exit(&so->so_lock);
5364 
5365 	return (error);
5366 }
5367 
5368 /*
5369  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5370  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5371  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5372  * setsockopt has to work even if the transport does not support the option.
5373  */
5374 /* ARGSUSED */
5375 int
sotpi_setsockopt(struct sonode * so,int level,int option_name,const void * optval,t_uscalar_t optlen,struct cred * cr)5376 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5377     const void *optval, t_uscalar_t optlen, struct cred *cr)
5378 {
5379 	struct T_optmgmt_req	optmgmt_req;
5380 	struct opthdr		oh;
5381 	mblk_t			*mp;
5382 	int			error = 0;
5383 	boolean_t		handled = B_FALSE;
5384 
5385 	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5386 	    (void *)so, level, option_name, optval, optlen,
5387 	    pr_state(so->so_state, so->so_mode)));
5388 
5389 	/* X/Open requires this check */
5390 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5391 		if (xnet_check_print)
5392 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
5393 		return (EINVAL);
5394 	}
5395 
5396 	mutex_enter(&so->so_lock);
5397 	so_lock_single(so);	/* Set SOLOCKED */
5398 	mutex_exit(&so->so_lock);
5399 
5400 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5401 	optmgmt_req.MGMT_flags = T_NEGOTIATE;
5402 	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5403 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5404 
5405 	oh.level = level;
5406 	oh.name = option_name;
5407 	oh.len = optlen;
5408 
5409 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5410 	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5411 	/* Let option management work in the presence of data flow control */
5412 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5413 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5414 	mp = NULL;
5415 	mutex_enter(&so->so_lock);
5416 	if (error) {
5417 		eprintsoline(so, error);
5418 		goto done2;
5419 	}
5420 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5421 	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5422 	if (error) {
5423 		eprintsoline(so, error);
5424 		goto done;
5425 	}
5426 	ASSERT(mp);
5427 	/* No need to verify T_optmgmt_ack */
5428 	freemsg(mp);
5429 done:
5430 	/*
5431 	 * Check for SOL_SOCKET options and record their values.
5432 	 * If we know about a SOL_SOCKET parameter and the transport
5433 	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5434 	 * EPROTO) we let the setsockopt succeed.
5435 	 */
5436 	if (level == SOL_SOCKET) {
5437 		/* Check parameters */
5438 		switch (option_name) {
5439 		case SO_DEBUG:
5440 		case SO_REUSEADDR:
5441 		case SO_KEEPALIVE:
5442 		case SO_DONTROUTE:
5443 		case SO_BROADCAST:
5444 		case SO_USELOOPBACK:
5445 		case SO_OOBINLINE:
5446 		case SO_SNDBUF:
5447 		case SO_RCVBUF:
5448 #ifdef notyet
5449 		case SO_SNDLOWAT:
5450 		case SO_RCVLOWAT:
5451 #endif /* notyet */
5452 		case SO_DGRAM_ERRIND:
5453 			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5454 				error = EINVAL;
5455 				eprintsoline(so, error);
5456 				goto done2;
5457 			}
5458 			ASSERT(optval);
5459 			handled = B_TRUE;
5460 			break;
5461 		case SO_SNDTIMEO:
5462 		case SO_RCVTIMEO:
5463 			if (get_udatamodel() == DATAMODEL_NONE ||
5464 			    get_udatamodel() == DATAMODEL_NATIVE) {
5465 				if (optlen != sizeof (struct timeval)) {
5466 					error = EINVAL;
5467 					eprintsoline(so, error);
5468 					goto done2;
5469 				}
5470 			} else {
5471 				if (optlen != sizeof (struct timeval32)) {
5472 					error = EINVAL;
5473 					eprintsoline(so, error);
5474 					goto done2;
5475 				}
5476 			}
5477 			ASSERT(optval);
5478 			handled = B_TRUE;
5479 			break;
5480 		case SO_LINGER:
5481 			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5482 				error = EINVAL;
5483 				eprintsoline(so, error);
5484 				goto done2;
5485 			}
5486 			ASSERT(optval);
5487 			handled = B_TRUE;
5488 			break;
5489 		}
5490 
5491 #define	intvalue	(*(int32_t *)optval)
5492 
5493 		switch (option_name) {
5494 		case SO_TYPE:
5495 		case SO_ERROR:
5496 		case SO_ACCEPTCONN:
5497 			/* Can't be set */
5498 			error = ENOPROTOOPT;
5499 			goto done2;
5500 		case SO_LINGER: {
5501 			struct linger *l = (struct linger *)optval;
5502 
5503 			so->so_linger.l_linger = l->l_linger;
5504 			if (l->l_onoff) {
5505 				so->so_linger.l_onoff = SO_LINGER;
5506 				so->so_options |= SO_LINGER;
5507 			} else {
5508 				so->so_linger.l_onoff = 0;
5509 				so->so_options &= ~SO_LINGER;
5510 			}
5511 			break;
5512 		}
5513 
5514 		case SO_DEBUG:
5515 #ifdef SOCK_TEST
5516 			if (intvalue & 2)
5517 				sock_test_timelimit = 10 * hz;
5518 			else
5519 				sock_test_timelimit = 0;
5520 
5521 			if (intvalue & 4)
5522 				do_useracc = 0;
5523 			else
5524 				do_useracc = 1;
5525 #endif /* SOCK_TEST */
5526 			/* FALLTHRU */
5527 		case SO_REUSEADDR:
5528 		case SO_KEEPALIVE:
5529 		case SO_DONTROUTE:
5530 		case SO_BROADCAST:
5531 		case SO_USELOOPBACK:
5532 		case SO_OOBINLINE:
5533 		case SO_DGRAM_ERRIND:
5534 			if (intvalue != 0) {
5535 				dprintso(so, 1,
5536 				    ("socket_setsockopt: setting 0x%x\n",
5537 				    option_name));
5538 				so->so_options |= option_name;
5539 			} else {
5540 				dprintso(so, 1,
5541 				    ("socket_setsockopt: clearing 0x%x\n",
5542 				    option_name));
5543 				so->so_options &= ~option_name;
5544 			}
5545 			break;
5546 		/*
5547 		 * The following options are only returned by us when the
5548 		 * transport layer fails.
5549 		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5550 		 * since the transport might adjust the value and not
5551 		 * return exactly what was set by the application.
5552 		 */
5553 		case SO_SNDBUF:
5554 			so->so_sndbuf = intvalue;
5555 			break;
5556 		case SO_RCVBUF:
5557 			so->so_rcvbuf = intvalue;
5558 			break;
5559 		case SO_RCVPSH:
5560 			so->so_rcv_timer_interval = intvalue;
5561 			break;
5562 #ifdef notyet
5563 		/*
5564 		 * We do not implement the semantics of these options
5565 		 * thus we shouldn't implement the options either.
5566 		 */
5567 		case SO_SNDLOWAT:
5568 			so->so_sndlowat = intvalue;
5569 			break;
5570 		case SO_RCVLOWAT:
5571 			so->so_rcvlowat = intvalue;
5572 			break;
5573 #endif /* notyet */
5574 		case SO_SNDTIMEO:
5575 		case SO_RCVTIMEO: {
5576 			struct timeval tl;
5577 			clock_t val;
5578 
5579 			if (get_udatamodel() == DATAMODEL_NONE ||
5580 			    get_udatamodel() == DATAMODEL_NATIVE) {
5581 				bcopy((struct timeval *)optval, &tl,
5582 				    sizeof (struct timeval));
5583 			} else {
5584 				TIMEVAL32_TO_TIMEVAL(&tl,
5585 				    (struct timeval32 *)optval);
5586 			}
5587 			val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5588 			if (option_name == SO_RCVTIMEO)
5589 				so->so_rcvtimeo = drv_usectohz(val);
5590 			else
5591 				so->so_sndtimeo = drv_usectohz(val);
5592 			break;
5593 		}
5594 		}
5595 #undef	intvalue
5596 
5597 		if (error) {
5598 			if ((error == ENOPROTOOPT || error == EPROTO ||
5599 			    error == EINVAL) && handled) {
5600 				dprintso(so, 1,
5601 				    ("setsockopt: ignoring error %d for 0x%x\n",
5602 				    error, option_name));
5603 				error = 0;
5604 			}
5605 		}
5606 	}
5607 done2:
5608 	so_unlock_single(so, SOLOCKED);
5609 	mutex_exit(&so->so_lock);
5610 	return (error);
5611 }
5612 
5613 /*
5614  * sotpi_close() is called when the last open reference goes away.
5615  */
5616 /* ARGSUSED */
5617 int
sotpi_close(struct sonode * so,int flag,struct cred * cr)5618 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5619 {
5620 	struct vnode *vp = SOTOV(so);
5621 	dev_t dev;
5622 	int error = 0;
5623 	sotpi_info_t *sti = SOTOTPI(so);
5624 
5625 	dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5626 	    (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5627 
5628 	dev = sti->sti_dev;
5629 
5630 	ASSERT(STREAMSTAB(getmajor(dev)));
5631 
5632 	mutex_enter(&so->so_lock);
5633 	so_lock_single(so);	/* Set SOLOCKED */
5634 
5635 	ASSERT(so_verify_oobstate(so));
5636 
5637 	if (vp->v_stream != NULL) {
5638 		vnode_t *ux_vp;
5639 
5640 		if (so->so_family == AF_UNIX) {
5641 			/* Could avoid this when CANTSENDMORE for !dgram */
5642 			so_unix_close(so);
5643 		}
5644 
5645 		mutex_exit(&so->so_lock);
5646 		/*
5647 		 * Disassemble the linkage from the AF_UNIX underlying file
5648 		 * system vnode to this socket (by atomically clearing
5649 		 * v_stream in vn_rele_stream) before strclose clears sd_vnode
5650 		 * and frees the stream head.
5651 		 */
5652 		if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5653 			ASSERT(ux_vp->v_stream);
5654 			sti->sti_ux_bound_vp = NULL;
5655 			vn_rele_stream(ux_vp);
5656 		}
5657 		error = strclose(vp, flag, cr);
5658 		vp->v_stream = NULL;
5659 		mutex_enter(&so->so_lock);
5660 	}
5661 
5662 	/*
5663 	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
5664 	 */
5665 	so_flush_discon_ind(so);
5666 
5667 	so_unlock_single(so, SOLOCKED);
5668 	mutex_exit(&so->so_lock);
5669 
5670 	/*
5671 	 * Needed for STREAMs.
5672 	 * Decrement the device driver's reference count for streams
5673 	 * opened via the clone dip. The driver was held in clone_open().
5674 	 * The absence of clone_close() forces this asymmetry.
5675 	 */
5676 	if (so->so_flag & SOCLONE)
5677 		ddi_rele_driver(getmajor(dev));
5678 
5679 	return (error);
5680 }
5681 
5682 static int
sotpi_ioctl(struct sonode * so,int cmd,intptr_t arg,int mode,struct cred * cr,int32_t * rvalp)5683 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
5684     struct cred *cr, int32_t *rvalp)
5685 {
5686 	struct vnode *vp = SOTOV(so);
5687 	sotpi_info_t *sti = SOTOTPI(so);
5688 	int error = 0;
5689 
5690 	dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
5691 	    cmd, arg, pr_state(so->so_state, so->so_mode)));
5692 
5693 	switch (cmd) {
5694 	case SIOCSQPTR:
5695 		/*
5696 		 * SIOCSQPTR is valid only when helper stream is created
5697 		 * by the protocol.
5698 		 */
5699 	case _I_INSERT:
5700 	case _I_REMOVE:
5701 		/*
5702 		 * Since there's no compelling reason to support these ioctls
5703 		 * on sockets, and doing so would increase the complexity
5704 		 * markedly, prevent it.
5705 		 */
5706 		return (EOPNOTSUPP);
5707 
5708 	case I_FIND:
5709 	case I_LIST:
5710 	case I_LOOK:
5711 	case I_POP:
5712 	case I_PUSH:
5713 		/*
5714 		 * To prevent races and inconsistencies between the actual
5715 		 * state of the stream and the state according to the sonode,
5716 		 * we serialize all operations which modify or operate on the
5717 		 * list of modules on the socket's stream.
5718 		 */
5719 		mutex_enter(&sti->sti_plumb_lock);
5720 		error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
5721 		mutex_exit(&sti->sti_plumb_lock);
5722 		return (error);
5723 
5724 	default:
5725 		if (so->so_version != SOV_STREAM)
5726 			break;
5727 
5728 		/*
5729 		 * The imaginary "sockmod" has been popped; act as a stream.
5730 		 */
5731 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5732 	}
5733 
5734 	ASSERT(so->so_version != SOV_STREAM);
5735 
5736 	/*
5737 	 * Process socket-specific ioctls.
5738 	 */
5739 	switch (cmd) {
5740 	case FIONBIO: {
5741 		int32_t value;
5742 
5743 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
5744 		    (mode & (int)FKIOCTL)))
5745 			return (EFAULT);
5746 
5747 		mutex_enter(&so->so_lock);
5748 		if (value) {
5749 			so->so_state |= SS_NDELAY;
5750 		} else {
5751 			so->so_state &= ~SS_NDELAY;
5752 		}
5753 		mutex_exit(&so->so_lock);
5754 		return (0);
5755 	}
5756 
5757 	case FIOASYNC: {
5758 		int32_t value;
5759 
5760 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
5761 		    (mode & (int)FKIOCTL)))
5762 			return (EFAULT);
5763 
5764 		mutex_enter(&so->so_lock);
5765 		/*
5766 		 * SS_ASYNC flag not already set correctly?
5767 		 * (!value != !(so->so_state & SS_ASYNC))
5768 		 * but some engineers find that too hard to read.
5769 		 */
5770 		if ((value == 0 && (so->so_state & SS_ASYNC) != 0) ||
5771 		    (value != 0 && (so->so_state & SS_ASYNC) == 0))
5772 			error = so_flip_async(so, vp, mode, cr);
5773 		mutex_exit(&so->so_lock);
5774 		return (error);
5775 	}
5776 
5777 	case SIOCSPGRP:
5778 	case FIOSETOWN: {
5779 		pid_t pgrp;
5780 
5781 		if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
5782 		    (mode & (int)FKIOCTL)))
5783 			return (EFAULT);
5784 
5785 		mutex_enter(&so->so_lock);
5786 		dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
5787 		/* Any change? */
5788 		if (pgrp != so->so_pgrp)
5789 			error = so_set_siggrp(so, vp, pgrp, mode, cr);
5790 		mutex_exit(&so->so_lock);
5791 		return (error);
5792 	}
5793 	case SIOCGPGRP:
5794 	case FIOGETOWN:
5795 		if (so_copyout(&so->so_pgrp, (void *)arg,
5796 		    sizeof (pid_t), (mode & (int)FKIOCTL)))
5797 			return (EFAULT);
5798 		return (0);
5799 
5800 	case SIOCATMARK: {
5801 		int retval;
5802 		uint_t so_state;
5803 
5804 		/*
5805 		 * strwaitmark has a finite timeout after which it
5806 		 * returns -1 if the mark state is undetermined.
5807 		 * In order to avoid any race between the mark state
5808 		 * in sockfs and the mark state in the stream head this
5809 		 * routine loops until the mark state can be determined
5810 		 * (or the urgent data indication has been removed by some
5811 		 * other thread).
5812 		 */
5813 		do {
5814 			mutex_enter(&so->so_lock);
5815 			so_state = so->so_state;
5816 			mutex_exit(&so->so_lock);
5817 			if (so_state & SS_RCVATMARK) {
5818 				retval = 1;
5819 			} else if (!(so_state & SS_OOBPEND)) {
5820 				/*
5821 				 * No SIGURG has been generated -- there is no
5822 				 * pending or present urgent data. Thus can't
5823 				 * possibly be at the mark.
5824 				 */
5825 				retval = 0;
5826 			} else {
5827 				/*
5828 				 * Have the stream head wait until there is
5829 				 * either some messages on the read queue, or
5830 				 * STRATMARK or STRNOTATMARK gets set. The
5831 				 * STRNOTATMARK flag is used so that the
5832 				 * transport can send up a MSGNOTMARKNEXT
5833 				 * M_DATA to indicate that it is not
5834 				 * at the mark and additional data is not about
5835 				 * to be send upstream.
5836 				 *
5837 				 * If the mark state is undetermined this will
5838 				 * return -1 and we will loop rechecking the
5839 				 * socket state.
5840 				 */
5841 				retval = strwaitmark(vp);
5842 			}
5843 		} while (retval == -1);
5844 
5845 		if (so_copyout(&retval, (void *)arg, sizeof (int),
5846 		    (mode & (int)FKIOCTL)))
5847 			return (EFAULT);
5848 		return (0);
5849 	}
5850 
5851 	case I_FDINSERT:
5852 	case I_SENDFD:
5853 	case I_RECVFD:
5854 	case I_ATMARK:
5855 	case _SIOCSOCKFALLBACK:
5856 		/*
5857 		 * These ioctls do not apply to sockets. I_FDINSERT can be
5858 		 * used to send M_PROTO messages without modifying the socket
5859 		 * state. I_SENDFD/RECVFD should not be used for socket file
5860 		 * descriptor passing since they assume a twisted stream.
5861 		 * SIOCATMARK must be used instead of I_ATMARK.
5862 		 *
5863 		 * _SIOCSOCKFALLBACK from an application should never be
5864 		 * processed.  It is only generated by socktpi_open() or
5865 		 * in response to I_POP or I_PUSH.
5866 		 */
5867 #ifdef DEBUG
5868 		zcmn_err(getzoneid(), CE_WARN,
5869 		    "Unsupported STREAMS ioctl 0x%x on socket. "
5870 		    "Pid = %d\n", cmd, curproc->p_pid);
5871 #endif /* DEBUG */
5872 		return (EOPNOTSUPP);
5873 
5874 	case _I_GETPEERCRED:
5875 		if ((mode & FKIOCTL) == 0)
5876 			return (EINVAL);
5877 
5878 		mutex_enter(&so->so_lock);
5879 		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
5880 			error = ENOTSUP;
5881 		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
5882 			error = ENOTCONN;
5883 		} else if (so->so_peercred != NULL) {
5884 			k_peercred_t *kp = (k_peercred_t *)arg;
5885 			kp->pc_cr = so->so_peercred;
5886 			kp->pc_cpid = so->so_cpid;
5887 			crhold(so->so_peercred);
5888 		} else {
5889 			error = EINVAL;
5890 		}
5891 		mutex_exit(&so->so_lock);
5892 		return (error);
5893 
5894 	default:
5895 		/*
5896 		 * Do the higher-order bits of the ioctl cmd indicate
5897 		 * that it is an I_* streams ioctl?
5898 		 */
5899 		if ((cmd & 0xffffff00U) == STR &&
5900 		    so->so_version == SOV_SOCKBSD) {
5901 #ifdef DEBUG
5902 			zcmn_err(getzoneid(), CE_WARN,
5903 			    "Unsupported STREAMS ioctl 0x%x on socket. "
5904 			    "Pid = %d\n", cmd, curproc->p_pid);
5905 #endif /* DEBUG */
5906 			return (EOPNOTSUPP);
5907 		}
5908 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5909 	}
5910 }
5911 
5912 /*
5913  * Handle plumbing-related ioctls.
5914  */
5915 static int
socktpi_plumbioctl(struct vnode * vp,int cmd,intptr_t arg,int mode,struct cred * cr,int32_t * rvalp)5916 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
5917     struct cred *cr, int32_t *rvalp)
5918 {
5919 	static const char sockmod_name[] = "sockmod";
5920 	struct sonode	*so = VTOSO(vp);
5921 	char		mname[FMNAMESZ + 1];
5922 	int		error;
5923 	sotpi_info_t	*sti = SOTOTPI(so);
5924 
5925 	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
5926 
5927 	if (so->so_version == SOV_SOCKBSD)
5928 		return (EOPNOTSUPP);
5929 
5930 	if (so->so_version == SOV_STREAM) {
5931 		/*
5932 		 * The imaginary "sockmod" has been popped - act as a stream.
5933 		 * If this is a push of sockmod then change back to a socket.
5934 		 */
5935 		if (cmd == I_PUSH) {
5936 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
5937 			    (void *)arg, mname, sizeof (mname), NULL);
5938 
5939 			if (error == 0 && strcmp(mname, sockmod_name) == 0) {
5940 				dprintso(so, 0, ("socktpi_ioctl: going to "
5941 				    "socket version\n"));
5942 				so_stream2sock(so);
5943 				return (0);
5944 			}
5945 		}
5946 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5947 	}
5948 
5949 	switch (cmd) {
5950 	case I_PUSH:
5951 		if (sti->sti_direct) {
5952 			mutex_enter(&so->so_lock);
5953 			so_lock_single(so);
5954 			mutex_exit(&so->so_lock);
5955 
5956 			error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
5957 			    cr, rvalp);
5958 
5959 			mutex_enter(&so->so_lock);
5960 			if (error == 0)
5961 				sti->sti_direct = 0;
5962 			so_unlock_single(so, SOLOCKED);
5963 			mutex_exit(&so->so_lock);
5964 
5965 			if (error != 0)
5966 				return (error);
5967 		}
5968 
5969 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5970 		if (error == 0)
5971 			sti->sti_pushcnt++;
5972 		return (error);
5973 
5974 	case I_POP:
5975 		if (sti->sti_pushcnt == 0) {
5976 			/* Emulate sockmod being popped */
5977 			dprintso(so, 0,
5978 			    ("socktpi_ioctl: going to STREAMS version\n"));
5979 			return (so_sock2stream(so));
5980 		}
5981 
5982 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5983 		if (error == 0)
5984 			sti->sti_pushcnt--;
5985 		return (error);
5986 
5987 	case I_LIST: {
5988 		struct str_mlist *kmlistp, *umlistp;
5989 		struct str_list	kstrlist;
5990 		ssize_t		kstrlistsize;
5991 		int		i, nmods;
5992 
5993 		STRUCT_DECL(str_list, ustrlist);
5994 		STRUCT_INIT(ustrlist, mode);
5995 
5996 		if (arg == 0) {
5997 			error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5998 			if (error == 0)
5999 				(*rvalp)++;	/* Add one for sockmod */
6000 			return (error);
6001 		}
6002 
6003 		error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
6004 		    STRUCT_SIZE(ustrlist), mode & FKIOCTL);
6005 		if (error != 0)
6006 			return (error);
6007 
6008 		nmods = STRUCT_FGET(ustrlist, sl_nmods);
6009 		if (nmods <= 0)
6010 			return (EINVAL);
6011 		/*
6012 		 * Ceiling nmods at nstrpush to prevent someone from
6013 		 * maliciously consuming lots of kernel memory.
6014 		 */
6015 		nmods = MIN(nmods, nstrpush);
6016 
6017 		kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6018 		kstrlist.sl_nmods = nmods;
6019 		kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6020 
6021 		error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6022 		    cr, rvalp);
6023 		if (error != 0)
6024 			goto done;
6025 
6026 		/*
6027 		 * Considering the module list as a 0-based array of sl_nmods
6028 		 * modules, sockmod should conceptually exist at slot
6029 		 * sti_pushcnt.  Insert sockmod at this location by sliding all
6030 		 * of the module names after so_pushcnt over by one.  We know
6031 		 * that there will be room to do this since we allocated
6032 		 * sl_modlist with an additional slot.
6033 		 */
6034 		for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6035 			kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6036 
6037 		(void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6038 		kstrlist.sl_nmods++;
6039 
6040 		/*
6041 		 * Copy all of the entries out to ustrlist.
6042 		 */
6043 		kmlistp = kstrlist.sl_modlist;
6044 		umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6045 		for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6046 			error = so_copyout(kmlistp++, umlistp++,
6047 			    sizeof (struct str_mlist), mode & FKIOCTL);
6048 			if (error != 0)
6049 				goto done;
6050 		}
6051 
6052 		error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6053 		    mode & FKIOCTL);
6054 		if (error == 0)
6055 			*rvalp = 0;
6056 	done:
6057 		kmem_free(kstrlist.sl_modlist, kstrlistsize);
6058 		return (error);
6059 	}
6060 	case I_LOOK:
6061 		if (sti->sti_pushcnt == 0) {
6062 			return (so_copyout(sockmod_name, (void *)arg,
6063 			    sizeof (sockmod_name), mode & FKIOCTL));
6064 		}
6065 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6066 
6067 	case I_FIND:
6068 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6069 		if (error && error != EINVAL)
6070 			return (error);
6071 
6072 		/* if not found and string was sockmod return 1 */
6073 		if (*rvalp == 0 || error == EINVAL) {
6074 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6075 			    (void *)arg, mname, sizeof (mname), NULL);
6076 			if (error == ENAMETOOLONG)
6077 				error = EINVAL;
6078 
6079 			if (error == 0 && strcmp(mname, sockmod_name) == 0)
6080 				*rvalp = 1;
6081 		}
6082 		return (error);
6083 
6084 	default:
6085 		panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6086 		break;
6087 	}
6088 
6089 	return (0);
6090 }
6091 
6092 /*
6093  * Wrapper around the streams poll routine that implements socket poll
6094  * semantics.
6095  * The sockfs never calls pollwakeup itself - the stream head take care
6096  * of all pollwakeups. Since sockfs never holds so_lock when calling the
6097  * stream head there can never be a deadlock due to holding so_lock across
6098  * pollwakeup and acquiring so_lock in this routine.
6099  *
6100  * However, since the performance of VOP_POLL is critical we avoid
6101  * acquiring so_lock here. This is based on two assumptions:
6102  *  - The poll implementation holds locks to serialize the VOP_POLL call
6103  *    and a pollwakeup for the same pollhead. This ensures that should
6104  *    e.g. so_state change during a socktpi_poll call the pollwakeup
6105  *    (which strsock_* and strrput conspire to issue) is issued after
6106  *    the state change. Thus the pollwakeup will block until VOP_POLL has
6107  *    returned and then wake up poll and have it call VOP_POLL again.
6108  *  - The reading of so_state without holding so_lock does not result in
6109  *    stale data that is older than the latest state change that has dropped
6110  *    so_lock. This is ensured by the mutex_exit issuing the appropriate
6111  *    memory barrier to force the data into the coherency domain.
6112  */
6113 static int
sotpi_poll(struct sonode * so,short events,int anyyet,short * reventsp,struct pollhead ** phpp)6114 sotpi_poll(
6115 	struct sonode	*so,
6116 	short		events,
6117 	int		anyyet,
6118 	short		*reventsp,
6119 	struct pollhead **phpp)
6120 {
6121 	short origevents = events;
6122 	struct vnode *vp = SOTOV(so);
6123 	int error;
6124 	int so_state = so->so_state;	/* snapshot */
6125 	sotpi_info_t *sti = SOTOTPI(so);
6126 
6127 	dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6128 	    (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6129 
6130 	ASSERT(vp->v_type == VSOCK);
6131 	ASSERT(vp->v_stream != NULL);
6132 
6133 	if (so->so_version == SOV_STREAM) {
6134 		/* The imaginary "sockmod" has been popped - act as a stream */
6135 		return (strpoll(vp->v_stream, events, anyyet,
6136 		    reventsp, phpp));
6137 	}
6138 
6139 	if (!(so_state & SS_ISCONNECTED) &&
6140 	    (so->so_mode & SM_CONNREQUIRED)) {
6141 		/* Not connected yet - turn off write side events */
6142 		events &= ~(POLLOUT|POLLWRBAND);
6143 	}
6144 	/*
6145 	 * Check for errors without calling strpoll if the caller wants them.
6146 	 * In sockets the errors are represented as input/output events
6147 	 * and there is no need to ask the stream head for this information.
6148 	 */
6149 	if (so->so_error != 0 &&
6150 	    ((POLLIN|POLLRDNORM|POLLOUT) & origevents)  != 0) {
6151 		*reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6152 		return (0);
6153 	}
6154 	/*
6155 	 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6156 	 * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
6157 	 * will not trigger a POLLIN event with POLLRDDATA set.
6158 	 * The handling of urgent data (causing POLLRDBAND) is done by
6159 	 * inspecting SS_OOBPEND below.
6160 	 */
6161 	events |= POLLRDDATA;
6162 
6163 	/*
6164 	 * After shutdown(output) a stream head write error is set.
6165 	 * However, we should not return output events.
6166 	 */
6167 	events |= POLLNOERR;
6168 	error = strpoll(vp->v_stream, events, anyyet,
6169 	    reventsp, phpp);
6170 	if (error)
6171 		return (error);
6172 
6173 	ASSERT(!(*reventsp & POLLERR));
6174 
6175 	/*
6176 	 * Notes on T_CONN_IND handling for sockets.
6177 	 *
6178 	 * If strpoll() returned without events, SR_POLLIN is guaranteed
6179 	 * to be set, ensuring any subsequent strrput() runs pollwakeup().
6180 	 *
6181 	 * Since the so_lock is not held, soqueueconnind() may have run
6182 	 * and a T_CONN_IND may be waiting. We now check for any queued
6183 	 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6184 	 * to ensure poll returns.
6185 	 *
6186 	 * However:
6187 	 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6188 	 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6189 	 * the following actions will occur; taken together they ensure the
6190 	 * syscall will return.
6191 	 *
6192 	 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6193 	 *    the accept() was run on a non-blocking socket sowaitconnind()
6194 	 *    may have already returned EWOULDBLOCK, so not be waiting to
6195 	 *    process the message. Additionally socktpi_poll() has probably
6196 	 *    proceeded past the sti_conn_ind_head check below.
6197 	 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6198 	 *    this thread,  however that could occur before poll_common()
6199 	 *    has entered cv_wait.
6200 	 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6201 	 *
6202 	 * Before proceeding to cv_wait() in poll_common() for an event,
6203 	 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6204 	 * and if set, re-calls strpoll() to ensure the late arriving
6205 	 * T_CONN_IND is recognized, and pollsys() returns.
6206 	 */
6207 
6208 	if (sti->sti_conn_ind_head != NULL)
6209 		*reventsp |= (POLLIN|POLLRDNORM) & events;
6210 
6211 	if (so->so_state & SS_CANTRCVMORE) {
6212 		*reventsp |= POLLRDHUP & events;
6213 
6214 		if (so->so_state & SS_CANTSENDMORE)
6215 			*reventsp |= POLLHUP;
6216 	}
6217 
6218 	if (so->so_state & SS_OOBPEND)
6219 		*reventsp |= POLLRDBAND & events;
6220 
6221 	return (0);
6222 }
6223 
6224 /*ARGSUSED*/
6225 static int
socktpi_constructor(void * buf,void * cdrarg,int kmflags)6226 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6227 {
6228 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6229 	int error = 0;
6230 
6231 	error = sonode_constructor(buf, cdrarg, kmflags);
6232 	if (error != 0)
6233 		return (error);
6234 
6235 	error = i_sotpi_info_constructor(&st->st_info);
6236 	if (error != 0)
6237 		sonode_destructor(buf, cdrarg);
6238 
6239 	st->st_sonode.so_priv = &st->st_info;
6240 
6241 	return (error);
6242 }
6243 
6244 /*ARGSUSED1*/
6245 static void
socktpi_destructor(void * buf,void * cdrarg)6246 socktpi_destructor(void *buf, void *cdrarg)
6247 {
6248 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6249 
6250 	ASSERT(st->st_sonode.so_priv == &st->st_info);
6251 	st->st_sonode.so_priv = NULL;
6252 
6253 	i_sotpi_info_destructor(&st->st_info);
6254 	sonode_destructor(buf, cdrarg);
6255 }
6256 
6257 static int
socktpi_unix_constructor(void * buf,void * cdrarg,int kmflags)6258 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6259 {
6260 	int retval;
6261 
6262 	if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6263 		struct sonode *so = (struct sonode *)buf;
6264 		sotpi_info_t *sti = SOTOTPI(so);
6265 
6266 		mutex_enter(&socklist.sl_lock);
6267 
6268 		sti->sti_next_so = socklist.sl_list;
6269 		sti->sti_prev_so = NULL;
6270 		if (sti->sti_next_so != NULL)
6271 			SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6272 		socklist.sl_list = so;
6273 
6274 		mutex_exit(&socklist.sl_lock);
6275 
6276 	}
6277 	return (retval);
6278 }
6279 
6280 static void
socktpi_unix_destructor(void * buf,void * cdrarg)6281 socktpi_unix_destructor(void *buf, void *cdrarg)
6282 {
6283 	struct sonode	*so = (struct sonode *)buf;
6284 	sotpi_info_t	*sti = SOTOTPI(so);
6285 
6286 	mutex_enter(&socklist.sl_lock);
6287 
6288 	if (sti->sti_next_so != NULL)
6289 		SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6290 	if (sti->sti_prev_so != NULL)
6291 		SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6292 	else
6293 		socklist.sl_list = sti->sti_next_so;
6294 
6295 	mutex_exit(&socklist.sl_lock);
6296 
6297 	socktpi_destructor(buf, cdrarg);
6298 }
6299 
6300 int
socktpi_init(void)6301 socktpi_init(void)
6302 {
6303 	/*
6304 	 * Create sonode caches.  We create a special one for AF_UNIX so
6305 	 * that we can track them for netstat(8).
6306 	 */
6307 	socktpi_cache = kmem_cache_create("socktpi_cache",
6308 	    sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6309 	    socktpi_destructor, NULL, NULL, NULL, 0);
6310 
6311 	socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6312 	    sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6313 	    socktpi_unix_destructor, NULL, NULL, NULL, 0);
6314 
6315 	return (0);
6316 }
6317 
6318 /*
6319  * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6320  *
6321  * Caller must still update state and mode using sotpi_update_state().
6322  */
6323 int
sotpi_convert_sonode(struct sonode * so,struct sockparams * newsp,boolean_t * direct,queue_t ** qp,struct cred * cr)6324 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6325     boolean_t *direct, queue_t **qp, struct cred *cr)
6326 {
6327 	sotpi_info_t *sti;
6328 	struct sockparams *origsp = so->so_sockparams;
6329 	sock_lower_handle_t handle = so->so_proto_handle;
6330 	struct stdata *stp;
6331 	struct vnode *vp;
6332 	queue_t *q;
6333 	int error = 0;
6334 
6335 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6336 	    SS_FALLBACK_PENDING);
6337 	ASSERT(SOCK_IS_NONSTR(so));
6338 
6339 	*qp = NULL;
6340 	*direct = B_FALSE;
6341 	so->so_sockparams = newsp;
6342 	/*
6343 	 * Allocate and initalize fields required by TPI.
6344 	 */
6345 	(void) sotpi_info_create(so, KM_SLEEP);
6346 	sotpi_info_init(so);
6347 
6348 	if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6349 		sotpi_info_fini(so);
6350 		sotpi_info_destroy(so);
6351 		return (error);
6352 	}
6353 	ASSERT(handle == so->so_proto_handle);
6354 	sti = SOTOTPI(so);
6355 	if (sti->sti_direct != 0)
6356 		*direct = B_TRUE;
6357 
6358 	/*
6359 	 * Keep the original sp around so we can properly dispose of the
6360 	 * sonode when the socket is being closed.
6361 	 */
6362 	sti->sti_orig_sp = origsp;
6363 
6364 	so_basic_strinit(so);	/* skips the T_CAPABILITY_REQ */
6365 	so_alloc_addr(so, so->so_max_addr_len);
6366 
6367 	/*
6368 	 * If the application has done a SIOCSPGRP, make sure the
6369 	 * STREAM head is aware. This needs to take place before
6370 	 * the protocol start sending up messages. Otherwise we
6371 	 * might miss to generate SIGPOLL.
6372 	 *
6373 	 * It is possible that the application will receive duplicate
6374 	 * signals if some were already generated for either data or
6375 	 * connection indications.
6376 	 */
6377 	if (so->so_pgrp != 0) {
6378 		if (so_set_events(so, so->so_vnode, cr) != 0)
6379 			so->so_pgrp = 0;
6380 	}
6381 
6382 	/*
6383 	 * Determine which queue to use.
6384 	 */
6385 	vp = SOTOV(so);
6386 	stp = vp->v_stream;
6387 	ASSERT(stp != NULL);
6388 	q = stp->sd_wrq->q_next;
6389 
6390 	/*
6391 	 * Skip any modules that may have been auto pushed when the device
6392 	 * was opened
6393 	 */
6394 	while (q->q_next != NULL)
6395 		q = q->q_next;
6396 	*qp = _RD(q);
6397 
6398 	/* This is now a STREAMS sockets */
6399 	so->so_not_str = B_FALSE;
6400 
6401 	return (error);
6402 }
6403 
6404 /*
6405  * Revert a TPI sonode. It is only allowed to revert the sonode during
6406  * the fallback process.
6407  */
6408 void
sotpi_revert_sonode(struct sonode * so,struct cred * cr)6409 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6410 {
6411 	vnode_t *vp = SOTOV(so);
6412 
6413 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6414 	    SS_FALLBACK_PENDING);
6415 	ASSERT(!SOCK_IS_NONSTR(so));
6416 	ASSERT(vp->v_stream != NULL);
6417 
6418 	strclean(vp);
6419 	(void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6420 
6421 	/*
6422 	 * Restore the original sockparams. The caller is responsible for
6423 	 * dropping the ref to the new sp.
6424 	 */
6425 	so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6426 
6427 	sotpi_info_fini(so);
6428 	sotpi_info_destroy(so);
6429 
6430 	/* This is no longer a STREAMS sockets */
6431 	so->so_not_str = B_TRUE;
6432 }
6433 
6434 void
sotpi_update_state(struct sonode * so,struct T_capability_ack * tcap,struct sockaddr * laddr,socklen_t laddrlen,struct sockaddr * faddr,socklen_t faddrlen,short opts)6435 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6436     struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6437     socklen_t faddrlen, short opts)
6438 {
6439 	sotpi_info_t *sti = SOTOTPI(so);
6440 
6441 	so_proc_tcapability_ack(so, tcap);
6442 
6443 	so->so_options |= opts;
6444 
6445 	/*
6446 	 * Determine whether the foreign and local address are valid
6447 	 */
6448 	if (laddrlen != 0) {
6449 		ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6450 		sti->sti_laddr_len = laddrlen;
6451 		bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6452 		sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6453 	}
6454 
6455 	if (faddrlen != 0) {
6456 		ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6457 		sti->sti_faddr_len = faddrlen;
6458 		bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6459 		sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6460 	}
6461 
6462 }
6463 
6464 /*
6465  * Allocate enough space to cache the local and foreign addresses.
6466  */
6467 void
so_alloc_addr(struct sonode * so,t_uscalar_t maxlen)6468 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6469 {
6470 	sotpi_info_t *sti = SOTOTPI(so);
6471 
6472 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6473 	ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6474 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6475 	    P2ROUNDUP(maxlen, KMEM_ALIGN);
6476 	so->so_max_addr_len = sti->sti_laddr_maxlen;
6477 	sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6478 	sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6479 	    + sti->sti_laddr_maxlen);
6480 
6481 	if (so->so_family == AF_UNIX) {
6482 		/*
6483 		 * Initialize AF_UNIX related fields.
6484 		 */
6485 		bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6486 		bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6487 	}
6488 }
6489 
6490 
6491 sotpi_info_t *
sotpi_sototpi(struct sonode * so)6492 sotpi_sototpi(struct sonode *so)
6493 {
6494 	sotpi_info_t *sti;
6495 
6496 	ASSERT(so != NULL);
6497 
6498 	sti = (sotpi_info_t *)so->so_priv;
6499 
6500 	ASSERT(sti != NULL);
6501 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6502 
6503 	return (sti);
6504 }
6505 
6506 static int
i_sotpi_info_constructor(sotpi_info_t * sti)6507 i_sotpi_info_constructor(sotpi_info_t *sti)
6508 {
6509 	sti->sti_magic		= SOTPI_INFO_MAGIC;
6510 	sti->sti_ack_mp		= NULL;
6511 	sti->sti_discon_ind_mp	= NULL;
6512 	sti->sti_ux_bound_vp	= NULL;
6513 	sti->sti_unbind_mp	= NULL;
6514 
6515 	sti->sti_conn_ind_head	= NULL;
6516 	sti->sti_conn_ind_tail	= NULL;
6517 
6518 	sti->sti_laddr_sa	= NULL;
6519 	sti->sti_faddr_sa	= NULL;
6520 
6521 	mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6522 	cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6523 
6524 	return (0);
6525 }
6526 
6527 static void
i_sotpi_info_destructor(sotpi_info_t * sti)6528 i_sotpi_info_destructor(sotpi_info_t *sti)
6529 {
6530 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6531 	ASSERT(sti->sti_ack_mp == NULL);
6532 	ASSERT(sti->sti_discon_ind_mp == NULL);
6533 	ASSERT(sti->sti_ux_bound_vp == NULL);
6534 	ASSERT(sti->sti_unbind_mp == NULL);
6535 
6536 	ASSERT(sti->sti_conn_ind_head == NULL);
6537 	ASSERT(sti->sti_conn_ind_tail == NULL);
6538 
6539 	ASSERT(sti->sti_laddr_sa == NULL);
6540 	ASSERT(sti->sti_faddr_sa == NULL);
6541 
6542 	mutex_destroy(&sti->sti_plumb_lock);
6543 	cv_destroy(&sti->sti_ack_cv);
6544 }
6545 
6546 /*
6547  * Creates and attaches TPI information to the given sonode
6548  */
6549 static boolean_t
sotpi_info_create(struct sonode * so,int kmflags)6550 sotpi_info_create(struct sonode *so, int kmflags)
6551 {
6552 	sotpi_info_t *sti;
6553 
6554 	ASSERT(so->so_priv == NULL);
6555 
6556 	if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6557 		return (B_FALSE);
6558 
6559 	if (i_sotpi_info_constructor(sti) != 0) {
6560 		kmem_free(sti, sizeof (*sti));
6561 		return (B_FALSE);
6562 	}
6563 
6564 	so->so_priv = (void *)sti;
6565 	return (B_TRUE);
6566 }
6567 
6568 /*
6569  * Initializes the TPI information.
6570  */
6571 static void
sotpi_info_init(struct sonode * so)6572 sotpi_info_init(struct sonode *so)
6573 {
6574 	struct vnode *vp = SOTOV(so);
6575 	sotpi_info_t *sti = SOTOTPI(so);
6576 	time_t now;
6577 
6578 	sti->sti_dev	= so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6579 	vp->v_rdev	= sti->sti_dev;
6580 
6581 	sti->sti_orig_sp = NULL;
6582 
6583 	sti->sti_pushcnt = 0;
6584 
6585 	now = gethrestime_sec();
6586 	sti->sti_atime	= now;
6587 	sti->sti_mtime	= now;
6588 	sti->sti_ctime	= now;
6589 
6590 	sti->sti_eaddr_mp = NULL;
6591 	sti->sti_delayed_error = 0;
6592 
6593 	sti->sti_provinfo = NULL;
6594 
6595 	sti->sti_oobcnt = 0;
6596 	sti->sti_oobsigcnt = 0;
6597 
6598 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6599 
6600 	sti->sti_laddr_sa	= 0;
6601 	sti->sti_faddr_sa	= 0;
6602 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6603 	sti->sti_laddr_len = sti->sti_faddr_len = 0;
6604 
6605 	sti->sti_laddr_valid = 0;
6606 	sti->sti_faddr_valid = 0;
6607 	sti->sti_faddr_noxlate = 0;
6608 
6609 	sti->sti_direct = 0;
6610 
6611 	ASSERT(sti->sti_ack_mp == NULL);
6612 	ASSERT(sti->sti_ux_bound_vp == NULL);
6613 	ASSERT(sti->sti_unbind_mp == NULL);
6614 
6615 	ASSERT(sti->sti_conn_ind_head == NULL);
6616 	ASSERT(sti->sti_conn_ind_tail == NULL);
6617 }
6618 
6619 /*
6620  * Given a sonode, grab the TPI info and free any data.
6621  */
6622 static void
sotpi_info_fini(struct sonode * so)6623 sotpi_info_fini(struct sonode *so)
6624 {
6625 	sotpi_info_t *sti = SOTOTPI(so);
6626 	mblk_t *mp;
6627 
6628 	ASSERT(sti->sti_discon_ind_mp == NULL);
6629 
6630 	if ((mp = sti->sti_conn_ind_head) != NULL) {
6631 		mblk_t *mp1;
6632 
6633 		while (mp) {
6634 			mp1 = mp->b_next;
6635 			mp->b_next = NULL;
6636 			freemsg(mp);
6637 			mp = mp1;
6638 		}
6639 		sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
6640 	}
6641 
6642 	/*
6643 	 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
6644 	 * indirect them.  It also uses so_count as a validity test.
6645 	 */
6646 	mutex_enter(&so->so_lock);
6647 
6648 	if (sti->sti_laddr_sa) {
6649 		ASSERT((caddr_t)sti->sti_faddr_sa ==
6650 		    (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
6651 		ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
6652 		sti->sti_laddr_valid = 0;
6653 		sti->sti_faddr_valid = 0;
6654 		kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
6655 		sti->sti_laddr_sa = NULL;
6656 		sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
6657 		sti->sti_faddr_sa = NULL;
6658 		sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
6659 	}
6660 
6661 	mutex_exit(&so->so_lock);
6662 
6663 	if ((mp = sti->sti_eaddr_mp) != NULL) {
6664 		freemsg(mp);
6665 		sti->sti_eaddr_mp = NULL;
6666 		sti->sti_delayed_error = 0;
6667 	}
6668 
6669 	if ((mp = sti->sti_ack_mp) != NULL) {
6670 		freemsg(mp);
6671 		sti->sti_ack_mp = NULL;
6672 	}
6673 
6674 	ASSERT(sti->sti_ux_bound_vp == NULL);
6675 	if ((mp = sti->sti_unbind_mp) != NULL) {
6676 		freemsg(mp);
6677 		sti->sti_unbind_mp = NULL;
6678 	}
6679 }
6680 
6681 /*
6682  * Destroys the TPI information attached to a sonode.
6683  */
6684 static void
sotpi_info_destroy(struct sonode * so)6685 sotpi_info_destroy(struct sonode *so)
6686 {
6687 	sotpi_info_t *sti = SOTOTPI(so);
6688 
6689 	i_sotpi_info_destructor(sti);
6690 	kmem_free(sti, sizeof (*sti));
6691 
6692 	so->so_priv = NULL;
6693 }
6694 
6695 /*
6696  * Create the global sotpi socket module entry. It will never be freed.
6697  */
6698 smod_info_t *
sotpi_smod_create(void)6699 sotpi_smod_create(void)
6700 {
6701 	smod_info_t *smodp;
6702 
6703 	smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
6704 	smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
6705 	(void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
6706 	/*
6707 	 * Initialize the smod_refcnt to 1 so it will never be freed.
6708 	 */
6709 	smodp->smod_refcnt = 1;
6710 	smodp->smod_uc_version = SOCK_UC_VERSION;
6711 	smodp->smod_dc_version = SOCK_DC_VERSION;
6712 	smodp->smod_sock_create_func = &sotpi_create;
6713 	smodp->smod_sock_destroy_func = &sotpi_destroy;
6714 	return (smodp);
6715 }
6716