xref: /titanic_54/usr/src/uts/common/fs/sockfs/socktpi.c (revision e359ab8683e0e1152d9f40bc35e1096870d76f60)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2015, Joyent, Inc.
25  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/t_lock.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/buf.h>
33 #include <sys/conf.h>
34 #include <sys/cred.h>
35 #include <sys/kmem.h>
36 #include <sys/kmem_impl.h>
37 #include <sys/sysmacros.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/debug.h>
41 #include <sys/errno.h>
42 #include <sys/time.h>
43 #include <sys/file.h>
44 #include <sys/open.h>
45 #include <sys/user.h>
46 #include <sys/termios.h>
47 #include <sys/stream.h>
48 #include <sys/strsubr.h>
49 #include <sys/strsun.h>
50 #include <sys/suntpi.h>
51 #include <sys/ddi.h>
52 #include <sys/esunddi.h>
53 #include <sys/flock.h>
54 #include <sys/modctl.h>
55 #include <sys/vtrace.h>
56 #include <sys/cmn_err.h>
57 #include <sys/pathname.h>
58 
59 #include <sys/socket.h>
60 #include <sys/socketvar.h>
61 #include <sys/sockio.h>
62 #include <netinet/in.h>
63 #include <sys/un.h>
64 #include <sys/strsun.h>
65 
66 #include <sys/tiuser.h>
67 #define	_SUN_TPI_VERSION	2
68 #include <sys/tihdr.h>
69 #include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
70 
71 #include <c2/audit.h>
72 
73 #include <inet/common.h>
74 #include <inet/ip.h>
75 #include <inet/ip6.h>
76 #include <inet/tcp.h>
77 #include <inet/udp_impl.h>
78 
79 #include <sys/zone.h>
80 
81 #include <fs/sockfs/nl7c.h>
82 #include <fs/sockfs/nl7curi.h>
83 
84 #include <fs/sockfs/sockcommon.h>
85 #include <fs/sockfs/socktpi.h>
86 #include <fs/sockfs/socktpi_impl.h>
87 
88 /*
89  * Possible failures when memory can't be allocated. The documented behavior:
90  *
91  * 		5.5:			4.X:		XNET:
92  * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
93  *							EINTR
94  *	(4.X does not document EINTR but returns it)
95  * bind:	ENOSR			-		ENOBUFS/ENOSR
96  * connect: 	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
97  * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
98  * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
99  *	(4.X getpeername and getsockname do not fail in practice)
100  * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
101  * listen:	-			-		ENOBUFS
102  * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
103  *							EINTR
104  * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
105  *							EINTR
106  * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
107  * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
108  * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
109  * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
110  *
111  * Resolution. When allocation fails:
112  *	recv: return EINTR
113  *	send: return EINTR
114  *	connect, accept: EINTR
115  *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
116  *	socket, socketpair: ENOBUFS
117  *	getpeername, getsockname: sleep
118  *	getsockopt, setsockopt: sleep
119  */
120 
121 #ifdef SOCK_TEST
122 /*
123  * Variables that make sockfs do something other than the standard TPI
124  * for the AF_INET transports.
125  *
126  * solisten_tpi_tcp:
127  *	TCP can handle a O_T_BIND_REQ with an increased backlog even though
128  *	the transport is already bound. This is needed to avoid losing the
129  *	port number should listen() do a T_UNBIND_REQ followed by a
130  *	O_T_BIND_REQ.
131  *
132  * soconnect_tpi_udp:
133  *	UDP and ICMP can handle a T_CONN_REQ.
134  *	This is needed to make the sequence of connect(), getsockname()
135  *	return the local IP address used to send packets to the connected to
136  *	destination.
137  *
138  * soconnect_tpi_tcp:
139  *	TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
140  *	Set this to non-zero to send TPI conformant messages to TCP in this
141  *	respect. This is a performance optimization.
142  *
143  * soaccept_tpi_tcp:
144  *	TCP can handle a T_CONN_REQ without the acceptor being bound.
145  *	This is a performance optimization that has been picked up in XTI.
146  *
147  * soaccept_tpi_multioptions:
148  *	When inheriting SOL_SOCKET options from the listener to the accepting
149  *	socket send them as a single message for AF_INET{,6}.
150  */
151 int solisten_tpi_tcp = 0;
152 int soconnect_tpi_udp = 0;
153 int soconnect_tpi_tcp = 0;
154 int soaccept_tpi_tcp = 0;
155 int soaccept_tpi_multioptions = 1;
156 #else /* SOCK_TEST */
157 #define	soconnect_tpi_tcp	0
158 #define	soconnect_tpi_udp	0
159 #define	solisten_tpi_tcp	0
160 #define	soaccept_tpi_tcp	0
161 #define	soaccept_tpi_multioptions	1
162 #endif /* SOCK_TEST */
163 
164 #ifdef SOCK_TEST
165 extern int do_useracc;
166 extern clock_t sock_test_timelimit;
167 #endif /* SOCK_TEST */
168 
169 /*
170  * Some X/Open added checks might have to be backed out to keep SunOS 4.X
171  * applications working. Turn on this flag to disable these checks.
172  */
173 int xnet_skip_checks = 0;
174 int xnet_check_print = 0;
175 int xnet_truncate_print = 0;
176 
177 static void sotpi_destroy(struct sonode *);
178 static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
179     int, int *, cred_t *cr);
180 
181 static boolean_t	sotpi_info_create(struct sonode *, int);
182 static void		sotpi_info_init(struct sonode *);
183 static void 		sotpi_info_fini(struct sonode *);
184 static void 		sotpi_info_destroy(struct sonode *);
185 
186 /*
187  * Do direct function call to the transport layer below; this would
188  * also allow the transport to utilize read-side synchronous stream
189  * interface if necessary.  This is a /etc/system tunable that must
190  * not be modified on a running system.  By default this is enabled
191  * for performance reasons and may be disabled for debugging purposes.
192  */
193 boolean_t socktpi_direct = B_TRUE;
194 
195 static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
196 
197 extern	void sigintr(k_sigset_t *, int);
198 extern	void sigunintr(k_sigset_t *);
199 
200 static int	sotpi_unbind(struct sonode *, int);
201 
202 /* TPI sockfs sonode operations */
203 int 		sotpi_init(struct sonode *, struct sonode *, struct cred *,
204 		    int);
205 static int	sotpi_accept(struct sonode *, int, struct cred *,
206 		    struct sonode **);
207 static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
208 		    int, struct cred *);
209 static int	sotpi_listen(struct sonode *, int, struct cred *);
210 static int	sotpi_connect(struct sonode *, struct sockaddr *,
211 		    socklen_t, int, int, struct cred *);
212 extern int	sotpi_recvmsg(struct sonode *, struct nmsghdr *,
213 		    struct uio *, struct cred *);
214 static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
215 		    struct uio *, struct cred *);
216 static int	sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
217 		    struct cred *, mblk_t **);
218 static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
219 		    struct uio *, void *, t_uscalar_t, int);
220 static int	sodgram_direct(struct sonode *, struct sockaddr *,
221 		    socklen_t, struct uio *, int);
222 extern int	sotpi_getpeername(struct sonode *, struct sockaddr *,
223 		    socklen_t *, boolean_t, struct cred *);
224 static int	sotpi_getsockname(struct sonode *, struct sockaddr *,
225 		    socklen_t *, struct cred *);
226 static int	sotpi_shutdown(struct sonode *, int, struct cred *);
227 extern int	sotpi_getsockopt(struct sonode *, int, int, void *,
228 		    socklen_t *, int, struct cred *);
229 extern int	sotpi_setsockopt(struct sonode *, int, int, const void *,
230 		    socklen_t, struct cred *);
231 static int 	sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
232 		    int32_t *);
233 static int 	socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
234 		    struct cred *, int32_t *);
235 static int 	sotpi_poll(struct sonode *, short, int, short *,
236 		    struct pollhead **);
237 static int 	sotpi_close(struct sonode *, int, struct cred *);
238 
239 static int	i_sotpi_info_constructor(sotpi_info_t *);
240 static void 	i_sotpi_info_destructor(sotpi_info_t *);
241 
/*
 * sonode operations vector for TPI sockets.
 *
 * The initializer is positional: each entry fills the sonodeops_t slot named
 * in the comment next to it, so the order must be kept in step with the
 * structure's declaration.  sotpi_create() hands the address of this table
 * to sonode_init(), and sotpi_destroy() asserts that so_ops still points
 * here before tearing a node down.
 */
242 sonodeops_t sotpi_sonodeops = {
243 	sotpi_init,		/* sop_init		*/
244 	sotpi_accept,		/* sop_accept		*/
245 	sotpi_bind,		/* sop_bind		*/
246 	sotpi_listen,		/* sop_listen		*/
247 	sotpi_connect,		/* sop_connect		*/
248 	sotpi_recvmsg,		/* sop_recvmsg		*/
249 	sotpi_sendmsg,		/* sop_sendmsg		*/
250 	sotpi_sendmblk,		/* sop_sendmblk		*/
251 	sotpi_getpeername,	/* sop_getpeername	*/
252 	sotpi_getsockname,	/* sop_getsockname	*/
253 	sotpi_shutdown,		/* sop_shutdown		*/
254 	sotpi_getsockopt,	/* sop_getsockopt	*/
255 	sotpi_setsockopt,	/* sop_setsockopt	*/
256 	sotpi_ioctl,		/* sop_ioctl		*/
257 	sotpi_poll,		/* sop_poll		*/
258 	sotpi_close,		/* sop_close		*/
259 };
260 
261 /*
262  * Return a TPI socket vnode.
263  *
264  * Note that sockets assume that the driver will clone (either itself
265  * or by using the clone driver) i.e. a socket() call will always
266  * result in a new vnode being created.
267  */
268 
269 /*
270  * Common create code for socket and accept. If tso is set the values
271  * from that node are used instead of issuing a T_INFO_REQ.
272  */
273 
274 /* ARGSUSED */
275 static struct sonode *
sotpi_create(struct sockparams * sp,int family,int type,int protocol,int version,int sflags,int * errorp,cred_t * cr)276 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
277     int version, int sflags, int *errorp, cred_t *cr)
278 {
279 	struct sonode	*so;
280 	kmem_cache_t 	*cp;
281 	int		sfamily = family;
282 
283 	ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
284 
285 	if (family == AF_NCA) {
286 		/*
287 		 * The request is for an NCA socket so for NL7C use the
288 		 * INET domain instead and mark NL7C_AF_NCA below.
289 		 */
290 		family = AF_INET;
291 		/*
292 		 * NL7C is not supported in the non-global zone,
293 		 * we enforce this restriction here.
294 		 */
295 		if (getzoneid() != GLOBAL_ZONEID) {
296 			*errorp = ENOTSUP;
297 			return (NULL);
298 		}
299 	}
300 
301 	/*
302 	 * to be compatible with old tpi socket implementation ignore
303 	 * sleep flag (sflags) passed in
304 	 */
305 	cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
306 	so = kmem_cache_alloc(cp, KM_SLEEP);
307 	if (so == NULL) {
308 		*errorp = ENOMEM;
309 		return (NULL);
310 	}
311 
312 	sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
313 	sotpi_info_init(so);
314 
315 	if (sfamily == AF_NCA) {
316 		SOTOTPI(so)->sti_nl7c_flags = NL7C_AF_NCA;
317 	}
318 
319 	if (version == SOV_DEFAULT)
320 		version = so_default_version;
321 
322 	so->so_version = (short)version;
323 	*errorp = 0;
324 
325 	return (so);
326 }
327 
328 static void
sotpi_destroy(struct sonode * so)329 sotpi_destroy(struct sonode *so)
330 {
331 	kmem_cache_t *cp;
332 	struct sockparams *origsp;
333 
334 	/*
335 	 * If there is a new dealloc function (ie. smod_destroy_func),
336 	 * then it should check the correctness of the ops.
337 	 */
338 
339 	ASSERT(so->so_ops == &sotpi_sonodeops);
340 
341 	origsp = SOTOTPI(so)->sti_orig_sp;
342 
343 	sotpi_info_fini(so);
344 
345 	if (so->so_state & SS_FALLBACK_COMP) {
346 		/*
347 		 * A fallback happend, which means that a sotpi_info_t struct
348 		 * was allocated (as opposed to being allocated from the TPI
349 		 * sonode cache. Therefore we explicitly free the struct
350 		 * here.
351 		 */
352 		sotpi_info_destroy(so);
353 		ASSERT(origsp != NULL);
354 
355 		origsp->sp_smod_info->smod_sock_destroy_func(so);
356 		SOCKPARAMS_DEC_REF(origsp);
357 	} else {
358 		sonode_fini(so);
359 		cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
360 		    socktpi_cache;
361 		kmem_cache_free(cp, so);
362 	}
363 }
364 
365 /* ARGSUSED1 */
366 int
sotpi_init(struct sonode * so,struct sonode * tso,struct cred * cr,int flags)367 sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
368 {
369 	major_t maj;
370 	dev_t newdev;
371 	struct vnode *vp;
372 	int error = 0;
373 	struct stdata *stp;
374 
375 	sotpi_info_t *sti = SOTOTPI(so);
376 
377 	dprint(1, ("sotpi_init()\n"));
378 
379 	/*
380 	 * over write the sleep flag passed in but that is ok
381 	 * as tpi socket does not honor sleep flag.
382 	 */
383 	flags |= FREAD|FWRITE;
384 
385 	/*
386 	 * Record in so_flag that it is a clone.
387 	 */
388 	if (getmajor(sti->sti_dev) == clone_major)
389 		so->so_flag |= SOCLONE;
390 
391 	if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
392 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
393 	    (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
394 	    so->so_protocol == IPPROTO_IP)) {
395 		/* Tell tcp or udp that it's talking to sockets */
396 		flags |= SO_SOCKSTR;
397 
398 		/*
399 		 * Here we indicate to socktpi_open() our attempt to
400 		 * make direct calls between sockfs and transport.
401 		 * The final decision is left to socktpi_open().
402 		 */
403 		sti->sti_direct = 1;
404 
405 		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
406 		if (so->so_type == SOCK_STREAM && tso != NULL) {
407 			if (SOTOTPI(tso)->sti_direct) {
408 				/*
409 				 * Inherit sti_direct from listener and pass
410 				 * SO_ACCEPTOR open flag to tcp, indicating
411 				 * that this is an accept fast-path instance.
412 				 */
413 				flags |= SO_ACCEPTOR;
414 			} else {
415 				/*
416 				 * sti_direct is not set on listener, meaning
417 				 * that the listener has been converted from
418 				 * a socket to a stream.  Ensure that the
419 				 * acceptor inherits these settings.
420 				 */
421 				sti->sti_direct = 0;
422 				flags &= ~SO_SOCKSTR;
423 			}
424 		}
425 	}
426 
427 	/*
428 	 * Tell local transport that it is talking to sockets.
429 	 */
430 	if (so->so_family == AF_UNIX) {
431 		flags |= SO_SOCKSTR;
432 	}
433 
434 	vp = SOTOV(so);
435 	newdev = vp->v_rdev;
436 	maj = getmajor(newdev);
437 	ASSERT(STREAMSTAB(maj));
438 
439 	error = stropen(vp, &newdev, flags, cr);
440 
441 	stp = vp->v_stream;
442 	if (error == 0) {
443 		if (so->so_flag & SOCLONE)
444 			ASSERT(newdev != vp->v_rdev);
445 		mutex_enter(&so->so_lock);
446 		sti->sti_dev = newdev;
447 		vp->v_rdev = newdev;
448 		mutex_exit(&so->so_lock);
449 
450 		if (stp->sd_flag & STRISTTY) {
451 			/*
452 			 * this is a post SVR4 tty driver - a socket can not
453 			 * be a controlling terminal. Fail the open.
454 			 */
455 			(void) sotpi_close(so, flags, cr);
456 			return (ENOTTY);	/* XXX */
457 		}
458 
459 		ASSERT(stp->sd_wrq != NULL);
460 		sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
461 
462 		/*
463 		 * If caller is interested in doing direct function call
464 		 * interface to/from transport module, probe the module
465 		 * directly beneath the streamhead to see if it qualifies.
466 		 *
467 		 * We turn off the direct interface when qualifications fail.
468 		 * In the acceptor case, we simply turn off the sti_direct
469 		 * flag on the socket. We do the fallback after the accept
470 		 * has completed, before the new socket is returned to the
471 		 * application.
472 		 */
473 		if (sti->sti_direct) {
474 			queue_t *tq = stp->sd_wrq->q_next;
475 
476 			/*
477 			 * sti_direct is currently supported and tested
478 			 * only for tcp/udp; this is the main reason to
479 			 * have the following assertions.
480 			 */
481 			ASSERT(so->so_family == AF_INET ||
482 			    so->so_family == AF_INET6);
483 			ASSERT(so->so_protocol == IPPROTO_UDP ||
484 			    so->so_protocol == IPPROTO_TCP ||
485 			    so->so_protocol == IPPROTO_IP);
486 			ASSERT(so->so_type == SOCK_DGRAM ||
487 			    so->so_type == SOCK_STREAM);
488 
489 			/*
490 			 * Abort direct call interface if the module directly
491 			 * underneath the stream head is not defined with the
492 			 * _D_DIRECT flag.  This could happen in the tcp or
493 			 * udp case, when some other module is autopushed
494 			 * above it, or for some reasons the expected module
495 			 * isn't purely D_MP (which is the main requirement).
496 			 */
497 			if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
498 			    !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
499 				int rval;
500 
501 				/* Continue on without direct calls */
502 				sti->sti_direct = 0;
503 
504 				/*
505 				 * Cannot issue ioctl on fallback socket since
506 				 * there is no conn associated with the queue.
507 				 * The fallback downcall will notify the proto
508 				 * of the change.
509 				 */
510 				if (!(flags & SO_ACCEPTOR) &&
511 				    !(flags & SO_FALLBACK)) {
512 					if ((error = strioctl(vp,
513 					    _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
514 					    cr, &rval)) != 0) {
515 						(void) sotpi_close(so, flags,
516 						    cr);
517 						return (error);
518 					}
519 				}
520 			}
521 		}
522 
523 		if (flags & SO_FALLBACK) {
524 			/*
525 			 * The stream created does not have a conn.
526 			 * do stream set up after conn has been assigned
527 			 */
528 			return (error);
529 		}
530 		if (error = so_strinit(so, tso)) {
531 			(void) sotpi_close(so, flags, cr);
532 			return (error);
533 		}
534 
535 		/* Wildcard */
536 		if (so->so_protocol != so->so_sockparams->sp_protocol) {
537 			int protocol = so->so_protocol;
538 			/*
539 			 * Issue SO_PROTOTYPE setsockopt.
540 			 */
541 			error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
542 			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
543 			if (error != 0) {
544 				(void) sotpi_close(so, flags, cr);
545 				/*
546 				 * Setsockopt often fails with ENOPROTOOPT but
547 				 * socket() should fail with
548 				 * EPROTONOSUPPORT/EPROTOTYPE.
549 				 */
550 				return (EPROTONOSUPPORT);
551 			}
552 		}
553 
554 	} else {
555 		/*
556 		 * While the same socket can not be reopened (unlike specfs)
557 		 * the stream head sets STREOPENFAIL when the autopush fails.
558 		 */
559 		if ((stp != NULL) &&
560 		    (stp->sd_flag & STREOPENFAIL)) {
561 			/*
562 			 * Open failed part way through.
563 			 */
564 			mutex_enter(&stp->sd_lock);
565 			stp->sd_flag &= ~STREOPENFAIL;
566 			mutex_exit(&stp->sd_lock);
567 			(void) sotpi_close(so, flags, cr);
568 			return (error);
569 			/*NOTREACHED*/
570 		}
571 		ASSERT(stp == NULL);
572 	}
573 	TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
574 	    "sockfs open:maj %d vp %p so %p error %d",
575 	    maj, vp, so, error);
576 	return (error);
577 }
578 
579 /*
580  * Bind the socket to an unspecified address in sockfs only.
581  * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
582  * required in all cases.
583  */
584 static void
so_automatic_bind(struct sonode * so)585 so_automatic_bind(struct sonode *so)
586 {
587 	sotpi_info_t *sti = SOTOTPI(so);
588 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
589 
590 	ASSERT(MUTEX_HELD(&so->so_lock));
591 	ASSERT(!(so->so_state & SS_ISBOUND));
592 	ASSERT(sti->sti_unbind_mp);
593 
594 	ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
595 	bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
596 	sti->sti_laddr_sa->sa_family = so->so_family;
597 	so->so_state |= SS_ISBOUND;
598 }
599 
600 
601 /*
602  * bind the socket.
603  *
604  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
605  * are passed in we allow rebinding. Note that for backwards compatibility
606  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
607  * Thus the rebinding code is currently not executed.
608  *
609  * The constraints for rebinding are:
610  * - it is a SOCK_DGRAM, or
611  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
612  *   and no listen() has been done.
613  * This rebinding code was added based on some language in the XNET book
614  * about not returning EINVAL if the protocol allows rebinding. However,
615  * this language is not present in the Posix socket draft. Thus maybe the
616  * rebinding logic should be deleted from the source.
617  *
618  * A null "name" can be used to unbind the socket if:
619  * - it is a SOCK_DGRAM, or
620  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
621  *   and no listen() has been done.
622  */
623 /* ARGSUSED */
624 static int
sotpi_bindlisten(struct sonode * so,struct sockaddr * name,socklen_t namelen,int backlog,int flags,struct cred * cr)625 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
626     socklen_t namelen, int backlog, int flags, struct cred *cr)
627 {
628 	struct T_bind_req	bind_req;
629 	struct T_bind_ack	*bind_ack;
630 	int			error = 0;
631 	mblk_t			*mp;
632 	void			*addr;
633 	t_uscalar_t		addrlen;
634 	int			unbind_on_err = 1;
635 	boolean_t		clear_acceptconn_on_err = B_FALSE;
636 	boolean_t		restore_backlog_on_err = B_FALSE;
637 	int			save_so_backlog;
638 	t_scalar_t		PRIM_type = O_T_BIND_REQ;
639 	boolean_t		tcp_udp_xport;
640 	void			*nl7c = NULL;
641 	sotpi_info_t		*sti = SOTOTPI(so);
642 
643 	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
644 	    (void *)so, (void *)name, namelen, backlog, flags,
645 	    pr_state(so->so_state, so->so_mode)));
646 
647 	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
648 
649 	if (!(flags & _SOBIND_LOCK_HELD)) {
650 		mutex_enter(&so->so_lock);
651 		so_lock_single(so);	/* Set SOLOCKED */
652 	} else {
653 		ASSERT(MUTEX_HELD(&so->so_lock));
654 		ASSERT(so->so_flag & SOLOCKED);
655 	}
656 
657 	/*
658 	 * Make sure that there is a preallocated unbind_req message
659 	 * before binding. This message allocated when the socket is
660 	 * created  but it might be have been consumed.
661 	 */
662 	if (sti->sti_unbind_mp == NULL) {
663 		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
664 		/* NOTE: holding so_lock while sleeping */
665 		sti->sti_unbind_mp =
666 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
667 		    cr);
668 	}
669 
670 	if (flags & _SOBIND_REBIND) {
671 		/*
672 		 * Called from solisten after doing an sotpi_unbind() or
673 		 * potentially without the unbind (latter for AF_INET{,6}).
674 		 */
675 		ASSERT(name == NULL && namelen == 0);
676 
677 		if (so->so_family == AF_UNIX) {
678 			ASSERT(sti->sti_ux_bound_vp);
679 			addr = &sti->sti_ux_laddr;
680 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
681 			dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
682 			    "addr 0x%p, vp %p\n",
683 			    addrlen,
684 			    (void *)((struct so_ux_addr *)addr)->soua_vp,
685 			    (void *)sti->sti_ux_bound_vp));
686 		} else {
687 			addr = sti->sti_laddr_sa;
688 			addrlen = (t_uscalar_t)sti->sti_laddr_len;
689 		}
690 	} else if (flags & _SOBIND_UNSPEC) {
691 		ASSERT(name == NULL && namelen == 0);
692 
693 		/*
694 		 * The caller checked SS_ISBOUND but not necessarily
695 		 * under so_lock
696 		 */
697 		if (so->so_state & SS_ISBOUND) {
698 			/* No error */
699 			goto done;
700 		}
701 
702 		/* Set an initial local address */
703 		switch (so->so_family) {
704 		case AF_UNIX:
705 			/*
706 			 * Use an address with same size as struct sockaddr
707 			 * just like BSD.
708 			 */
709 			sti->sti_laddr_len =
710 			    (socklen_t)sizeof (struct sockaddr);
711 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
712 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
713 			sti->sti_laddr_sa->sa_family = so->so_family;
714 
715 			/*
716 			 * Pass down an address with the implicit bind
717 			 * magic number and the rest all zeros.
718 			 * The transport will return a unique address.
719 			 */
720 			sti->sti_ux_laddr.soua_vp = NULL;
721 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
722 			addr = &sti->sti_ux_laddr;
723 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
724 			break;
725 
726 		case AF_INET:
727 		case AF_INET6:
728 			/*
729 			 * An unspecified bind in TPI has a NULL address.
730 			 * Set the address in sockfs to have the sa_family.
731 			 */
732 			sti->sti_laddr_len = (so->so_family == AF_INET) ?
733 			    (socklen_t)sizeof (sin_t) :
734 			    (socklen_t)sizeof (sin6_t);
735 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
736 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
737 			sti->sti_laddr_sa->sa_family = so->so_family;
738 			addr = NULL;
739 			addrlen = 0;
740 			break;
741 
742 		default:
743 			/*
744 			 * An unspecified bind in TPI has a NULL address.
745 			 * Set the address in sockfs to be zero length.
746 			 *
747 			 * Can not assume there is a sa_family for all
748 			 * protocol families. For example, AF_X25 does not
749 			 * have a family field.
750 			 */
751 			bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
752 			sti->sti_laddr_len = 0;	/* XXX correct? */
753 			addr = NULL;
754 			addrlen = 0;
755 			break;
756 		}
757 
758 	} else {
759 		if (so->so_state & SS_ISBOUND) {
760 			/*
761 			 * If it is ok to rebind the socket, first unbind
762 			 * with the transport. A rebind to the NULL address
763 			 * is interpreted as an unbind.
764 			 * Note that a bind to NULL in BSD does unbind the
765 			 * socket but it fails with EINVAL.
766 			 * Note that regular sockets set SOV_SOCKBSD i.e.
767 			 * _SOBIND_SOCKBSD gets set here hence no type of
768 			 * socket does currently allow rebinding.
769 			 *
770 			 * If the name is NULL just do an unbind.
771 			 */
772 			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
773 			    name != NULL) {
774 				error = EINVAL;
775 				unbind_on_err = 0;
776 				eprintsoline(so, error);
777 				goto done;
778 			}
779 			if ((so->so_mode & SM_CONNREQUIRED) &&
780 			    (so->so_state & SS_CANTREBIND)) {
781 				error = EINVAL;
782 				unbind_on_err = 0;
783 				eprintsoline(so, error);
784 				goto done;
785 			}
786 			error = sotpi_unbind(so, 0);
787 			if (error) {
788 				eprintsoline(so, error);
789 				goto done;
790 			}
791 			ASSERT(!(so->so_state & SS_ISBOUND));
792 			if (name == NULL) {
793 				so->so_state &=
794 				    ~(SS_ISCONNECTED|SS_ISCONNECTING);
795 				goto done;
796 			}
797 		}
798 
799 		/* X/Open requires this check */
800 		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
801 			if (xnet_check_print) {
802 				printf("sockfs: X/Open bind state check "
803 				    "caused EINVAL\n");
804 			}
805 			error = EINVAL;
806 			goto done;
807 		}
808 
809 		switch (so->so_family) {
810 		case AF_UNIX:
811 			/*
812 			 * All AF_UNIX addresses are nul terminated
813 			 * when copied (copyin_name) in so the minimum
814 			 * length is 3 bytes.
815 			 */
816 			if (name == NULL ||
817 			    (ssize_t)namelen <= sizeof (short) + 1) {
818 				error = EISDIR;
819 				eprintsoline(so, error);
820 				goto done;
821 			}
822 			/*
823 			 * Verify so_family matches the bound family.
824 			 * BSD does not check this for AF_UNIX resulting
825 			 * in funny mknods.
826 			 */
827 			if (name->sa_family != so->so_family) {
828 				error = EAFNOSUPPORT;
829 				goto done;
830 			}
831 			break;
832 		case AF_INET:
833 			if (name == NULL) {
834 				error = EINVAL;
835 				eprintsoline(so, error);
836 				goto done;
837 			}
838 			if ((size_t)namelen != sizeof (sin_t)) {
839 				error = name->sa_family != so->so_family ?
840 				    EAFNOSUPPORT : EINVAL;
841 				eprintsoline(so, error);
842 				goto done;
843 			}
844 			if ((flags & _SOBIND_XPG4_2) &&
845 			    (name->sa_family != so->so_family)) {
846 				/*
847 				 * This check has to be made for X/Open
848 				 * sockets however application failures have
849 				 * been observed when it is applied to
850 				 * all sockets.
851 				 */
852 				error = EAFNOSUPPORT;
853 				eprintsoline(so, error);
854 				goto done;
855 			}
856 			/*
857 			 * Force a zero sa_family to match so_family.
858 			 *
859 			 * Some programs like inetd(1M) don't set the
860 			 * family field. Other programs leave
861 			 * sin_family set to garbage - SunOS 4.X does
862 			 * not check the family field on a bind.
863 			 * We use the family field that
864 			 * was passed in to the socket() call.
865 			 */
866 			name->sa_family = so->so_family;
867 			break;
868 
869 		case AF_INET6: {
870 #ifdef DEBUG
871 			sin6_t *sin6 = (sin6_t *)name;
872 #endif /* DEBUG */
873 
874 			if (name == NULL) {
875 				error = EINVAL;
876 				eprintsoline(so, error);
877 				goto done;
878 			}
879 			if ((size_t)namelen != sizeof (sin6_t)) {
880 				error = name->sa_family != so->so_family ?
881 				    EAFNOSUPPORT : EINVAL;
882 				eprintsoline(so, error);
883 				goto done;
884 			}
885 			if (name->sa_family != so->so_family) {
886 				/*
887 				 * With IPv6 we require the family to match
888 				 * unlike in IPv4.
889 				 */
890 				error = EAFNOSUPPORT;
891 				eprintsoline(so, error);
892 				goto done;
893 			}
894 #ifdef DEBUG
895 			/*
896 			 * Verify that apps don't forget to clear
897 			 * sin6_scope_id etc
898 			 */
899 			if (sin6->sin6_scope_id != 0 &&
900 			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
901 				zcmn_err(getzoneid(), CE_WARN,
902 				    "bind with uninitialized sin6_scope_id "
903 				    "(%d) on socket. Pid = %d\n",
904 				    (int)sin6->sin6_scope_id,
905 				    (int)curproc->p_pid);
906 			}
907 			if (sin6->__sin6_src_id != 0) {
908 				zcmn_err(getzoneid(), CE_WARN,
909 				    "bind with uninitialized __sin6_src_id "
910 				    "(%d) on socket. Pid = %d\n",
911 				    (int)sin6->__sin6_src_id,
912 				    (int)curproc->p_pid);
913 			}
914 #endif /* DEBUG */
915 			break;
916 		}
917 		default:
918 			/*
919 			 * Don't do any length or sa_family check to allow
920 			 * non-sockaddr style addresses.
921 			 */
922 			if (name == NULL) {
923 				error = EINVAL;
924 				eprintsoline(so, error);
925 				goto done;
926 			}
927 			break;
928 		}
929 
930 		if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
931 			error = ENAMETOOLONG;
932 			eprintsoline(so, error);
933 			goto done;
934 		}
935 		/*
936 		 * Save local address.
937 		 */
938 		sti->sti_laddr_len = (socklen_t)namelen;
939 		ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
940 		bcopy(name, sti->sti_laddr_sa, namelen);
941 
942 		addr = sti->sti_laddr_sa;
943 		addrlen = (t_uscalar_t)sti->sti_laddr_len;
944 		switch (so->so_family) {
945 		case AF_INET6:
946 		case AF_INET:
947 			break;
948 		case AF_UNIX: {
949 			struct sockaddr_un *soun =
950 			    (struct sockaddr_un *)sti->sti_laddr_sa;
951 			struct vnode *vp, *rvp;
952 			struct vattr vattr;
953 
954 			ASSERT(sti->sti_ux_bound_vp == NULL);
955 			/*
956 			 * Create vnode for the specified path name.
957 			 * Keep vnode held with a reference in sti_ux_bound_vp.
958 			 * Use the vnode pointer as the address used in the
959 			 * bind with the transport.
960 			 *
961 			 * Use the same mode as in BSD. In particular this does
962 			 * not observe the umask.
963 			 */
964 			/* MAXPATHLEN + soun_family + nul termination */
965 			if (sti->sti_laddr_len >
966 			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
967 				error = ENAMETOOLONG;
968 				eprintsoline(so, error);
969 				goto done;
970 			}
971 			vattr.va_type = VSOCK;
972 			vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
973 			vattr.va_mask = AT_TYPE|AT_MODE;
974 			/* NOTE: holding so_lock */
975 			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
976 			    EXCL, 0, &vp, CRMKNOD, 0, 0);
977 			if (error) {
978 				if (error == EEXIST)
979 					error = EADDRINUSE;
980 				eprintsoline(so, error);
981 				goto done;
982 			}
983 			/*
984 			 * Establish pointer from the underlying filesystem
985 			 * vnode to the socket node.
986 			 * sti_ux_bound_vp and v_stream->sd_vnode form the
987 			 * cross-linkage between the underlying filesystem
988 			 * node and the socket node.
989 			 */
990 
991 			if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
992 				VN_HOLD(rvp);
993 				VN_RELE(vp);
994 				vp = rvp;
995 			}
996 
997 			ASSERT(SOTOV(so)->v_stream);
998 			mutex_enter(&vp->v_lock);
999 			vp->v_stream = SOTOV(so)->v_stream;
1000 			sti->sti_ux_bound_vp = vp;
1001 			mutex_exit(&vp->v_lock);
1002 
1003 			/*
1004 			 * Use the vnode pointer value as a unique address
1005 			 * (together with the magic number to avoid conflicts
1006 			 * with implicit binds) in the transport provider.
1007 			 */
1008 			sti->sti_ux_laddr.soua_vp =
1009 			    (void *)sti->sti_ux_bound_vp;
1010 			sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
1011 			addr = &sti->sti_ux_laddr;
1012 			addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1013 			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1014 			    addrlen,
1015 			    (void *)((struct so_ux_addr *)addr)->soua_vp));
1016 			break;
1017 		}
1018 		} /* end switch (so->so_family) */
1019 	}
1020 
1021 	/*
1022 	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1023 	 * the transport can start passing up T_CONN_IND messages
1024 	 * as soon as it receives the bind req and strsock_proto()
1025 	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1026 	 */
1027 	if (flags & _SOBIND_LISTEN) {
1028 		if ((so->so_state & SS_ACCEPTCONN) == 0)
1029 			clear_acceptconn_on_err = B_TRUE;
1030 		save_so_backlog = so->so_backlog;
1031 		restore_backlog_on_err = B_TRUE;
1032 		so->so_state |= SS_ACCEPTCONN;
1033 		so->so_backlog = backlog;
1034 	}
1035 
1036 	/*
1037 	 * If NL7C addr(s) have been configured check for addr/port match,
1038 	 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
1039 	 *
1040 	 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
1041 	 * family sockets only. If match mark as such.
1042 	 */
1043 	if (nl7c_enabled && ((addr != NULL &&
1044 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
1045 	    (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
1046 	    sti->sti_nl7c_flags == NL7C_AF_NCA)) {
1047 		/*
1048 		 * NL7C is not supported in non-global zones,
1049 		 * we enforce this restriction here.
1050 		 */
1051 		if (so->so_zoneid == GLOBAL_ZONEID) {
1052 			/* An NL7C socket, mark it */
1053 			sti->sti_nl7c_flags |= NL7C_ENABLED;
1054 			if (nl7c == NULL) {
1055 				/*
1056 				 * Was an AF_NCA bind() so add it to the
1057 				 * addr list for reporting purposes.
1058 				 */
1059 				nl7c = nl7c_add_addr(addr, addrlen);
1060 			}
1061 		} else
1062 			nl7c = NULL;
1063 	}
1064 
1065 	/*
1066 	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1067 	 * for other transports we will send in a O_T_BIND_REQ.
1068 	 */
1069 	if (tcp_udp_xport &&
1070 	    (so->so_family == AF_INET || so->so_family == AF_INET6))
1071 		PRIM_type = T_BIND_REQ;
1072 
1073 	bind_req.PRIM_type = PRIM_type;
1074 	bind_req.ADDR_length = addrlen;
1075 	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1076 	bind_req.CONIND_number = backlog;
1077 	/* NOTE: holding so_lock while sleeping */
1078 	mp = soallocproto2(&bind_req, sizeof (bind_req),
1079 	    addr, addrlen, 0, _ALLOC_SLEEP, cr);
1080 	sti->sti_laddr_valid = 0;
1081 
1082 	/* Done using sti_laddr_sa - can drop the lock */
1083 	mutex_exit(&so->so_lock);
1084 
1085 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1086 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1087 	if (error) {
1088 		eprintsoline(so, error);
1089 		mutex_enter(&so->so_lock);
1090 		goto done;
1091 	}
1092 
1093 	mutex_enter(&so->so_lock);
1094 	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1095 	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1096 	if (error) {
1097 		eprintsoline(so, error);
1098 		goto done;
1099 	}
1100 	ASSERT(mp);
1101 	/*
1102 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1103 	 * strsock_proto while the lock was dropped above, the bind
1104 	 * is allowed to complete.
1105 	 */
1106 
1107 	/* Mark as bound. This will be undone if we detect errors below. */
1108 	if (flags & _SOBIND_NOXLATE) {
1109 		ASSERT(so->so_family == AF_UNIX);
1110 		sti->sti_faddr_noxlate = 1;
1111 	}
1112 	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1113 	so->so_state |= SS_ISBOUND;
1114 	ASSERT(sti->sti_unbind_mp);
1115 
1116 	/* note that we've already set SS_ACCEPTCONN above */
1117 
1118 	/*
1119 	 * Recompute addrlen - an unspecied bind sent down an
1120 	 * address of length zero but we expect the appropriate length
1121 	 * in return.
1122 	 */
1123 	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1124 	    sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1125 
1126 	bind_ack = (struct T_bind_ack *)mp->b_rptr;
1127 	/*
1128 	 * The alignment restriction is really too strict but
1129 	 * we want enough alignment to inspect the fields of
1130 	 * a sockaddr_in.
1131 	 */
1132 	addr = sogetoff(mp, bind_ack->ADDR_offset,
1133 	    bind_ack->ADDR_length,
1134 	    __TPI_ALIGN_SIZE);
1135 	if (addr == NULL) {
1136 		freemsg(mp);
1137 		error = EPROTO;
1138 		eprintsoline(so, error);
1139 		goto done;
1140 	}
1141 	if (!(flags & _SOBIND_UNSPEC)) {
1142 		/*
1143 		 * Verify that the transport didn't return something we
1144 		 * did not want e.g. an address other than what we asked for.
1145 		 *
1146 		 * NOTE: These checks would go away if/when we switch to
1147 		 * using the new TPI (in which the transport would fail
1148 		 * the request instead of assigning a different address).
1149 		 *
1150 		 * NOTE2: For protocols that we don't know (i.e. any
1151 		 * other than AF_INET6, AF_INET and AF_UNIX), we
1152 		 * cannot know if the transport should be expected to
1153 		 * return the same address as that requested.
1154 		 *
1155 		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1156 		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1157 		 *
1158 		 * For example, in the case of netatalk it may be
1159 		 * inappropriate for the transport to return the
1160 		 * requested address (as it may have allocated a local
1161 		 * port number in behaviour similar to that of an
1162 		 * AF_INET bind request with a port number of zero).
1163 		 *
1164 		 * Given the definition of O_T_BIND_REQ, where the
1165 		 * transport may bind to an address other than the
1166 		 * requested address, it's not possible to determine
1167 		 * whether a returned address that differs from the
1168 		 * requested address is a reason to fail (because the
1169 		 * requested address was not available) or succeed
1170 		 * (because the transport allocated an appropriate
1171 		 * address and/or port).
1172 		 *
1173 		 * sockfs currently requires that the transport return
1174 		 * the requested address in the T_BIND_ACK, unless
1175 		 * there is code here to allow for any discrepancy.
1176 		 * Such code exists for AF_INET and AF_INET6.
1177 		 *
1178 		 * Netatalk chooses to return the requested address
1179 		 * rather than the (correct) allocated address.  This
1180 		 * means that netatalk violates the TPI specification
1181 		 * (and would not function correctly if used from a
1182 		 * TLI application), but it does mean that it works
1183 		 * with sockfs.
1184 		 *
1185 		 * As noted above, using the newer XTI bind primitive
1186 		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1187 		 * allow sockfs to be more sure about whether or not
1188 		 * the bind request had succeeded (as transports are
1189 		 * not permitted to bind to a different address than
1190 		 * that requested - they must return failure).
1191 		 * Unfortunately, support for T_BIND_REQ may not be
1192 		 * present in all transport implementations (netatalk,
1193 		 * for example, doesn't have it), making the
1194 		 * transition difficult.
1195 		 */
1196 		if (bind_ack->ADDR_length != addrlen) {
1197 			/* Assumes that the requested address was in use */
1198 			freemsg(mp);
1199 			error = EADDRINUSE;
1200 			eprintsoline(so, error);
1201 			goto done;
1202 		}
1203 
1204 		switch (so->so_family) {
1205 		case AF_INET6:
1206 		case AF_INET: {
1207 			sin_t *rname, *aname;
1208 
1209 			rname = (sin_t *)addr;
1210 			aname = (sin_t *)sti->sti_laddr_sa;
1211 
1212 			/*
1213 			 * Take advantage of the alignment
1214 			 * of sin_port and sin6_port which fall
1215 			 * in the same place in their data structures.
1216 			 * Just use sin_port for either address family.
1217 			 *
1218 			 * This may become a problem if (heaven forbid)
1219 			 * there's a separate ipv6port_reserved... :-P
1220 			 *
1221 			 * Binding to port 0 has the semantics of letting
1222 			 * the transport bind to any port.
1223 			 *
1224 			 * If the transport is TCP or UDP since we had sent
1225 			 * a T_BIND_REQ we would not get a port other than
1226 			 * what we asked for.
1227 			 */
1228 			if (tcp_udp_xport) {
1229 				/*
1230 				 * Pick up the new port number if we bound to
1231 				 * port 0.
1232 				 */
1233 				if (aname->sin_port == 0)
1234 					aname->sin_port = rname->sin_port;
1235 				sti->sti_laddr_valid = 1;
1236 				break;
1237 			}
1238 			if (aname->sin_port != 0 &&
1239 			    aname->sin_port != rname->sin_port) {
1240 				freemsg(mp);
1241 				error = EADDRINUSE;
1242 				eprintsoline(so, error);
1243 				goto done;
1244 			}
1245 			/*
1246 			 * Pick up the new port number if we bound to port 0.
1247 			 */
1248 			aname->sin_port = rname->sin_port;
1249 
1250 			/*
1251 			 * Unfortunately, addresses aren't _quite_ the same.
1252 			 */
1253 			if (so->so_family == AF_INET) {
1254 				if (aname->sin_addr.s_addr !=
1255 				    rname->sin_addr.s_addr) {
1256 					freemsg(mp);
1257 					error = EADDRNOTAVAIL;
1258 					eprintsoline(so, error);
1259 					goto done;
1260 				}
1261 			} else {
1262 				sin6_t *rname6 = (sin6_t *)rname;
1263 				sin6_t *aname6 = (sin6_t *)aname;
1264 
1265 				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1266 				    &rname6->sin6_addr)) {
1267 					freemsg(mp);
1268 					error = EADDRNOTAVAIL;
1269 					eprintsoline(so, error);
1270 					goto done;
1271 				}
1272 			}
1273 			break;
1274 		}
1275 		case AF_UNIX:
1276 			if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1277 				freemsg(mp);
1278 				error = EADDRINUSE;
1279 				eprintsoline(so, error);
1280 				eprintso(so,
1281 				    ("addrlen %d, addr 0x%x, vp %p\n",
1282 				    addrlen, *((int *)addr),
1283 				    (void *)sti->sti_ux_bound_vp));
1284 				goto done;
1285 			}
1286 			sti->sti_laddr_valid = 1;
1287 			break;
1288 		default:
1289 			/*
1290 			 * NOTE: This assumes that addresses can be
1291 			 * byte-compared for equivalence.
1292 			 */
1293 			if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1294 				freemsg(mp);
1295 				error = EADDRINUSE;
1296 				eprintsoline(so, error);
1297 				goto done;
1298 			}
1299 			/*
1300 			 * Don't mark sti_laddr_valid, as we cannot be
1301 			 * sure that the returned address is the real
1302 			 * bound address when talking to an unknown
1303 			 * transport.
1304 			 */
1305 			break;
1306 		}
1307 	} else {
1308 		/*
1309 		 * Save for returned address for getsockname.
1310 		 * Needed for unspecific bind unless transport supports
1311 		 * the TI_GETMYNAME ioctl.
1312 		 * Do this for AF_INET{,6} even though they do, as
1313 		 * caching info here is much better performance than
1314 		 * a TPI/STREAMS trip to the transport for getsockname.
1315 		 * Any which can't for some reason _must_ _not_ set
1316 		 * sti_laddr_valid here for the caching version of
1317 		 * getsockname to not break;
1318 		 */
1319 		switch (so->so_family) {
1320 		case AF_UNIX:
1321 			/*
1322 			 * Record the address bound with the transport
1323 			 * for use by socketpair.
1324 			 */
1325 			bcopy(addr, &sti->sti_ux_laddr, addrlen);
1326 			sti->sti_laddr_valid = 1;
1327 			break;
1328 		case AF_INET:
1329 		case AF_INET6:
1330 			ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1331 			bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1332 			sti->sti_laddr_valid = 1;
1333 			break;
1334 		default:
1335 			/*
1336 			 * Don't mark sti_laddr_valid, as we cannot be
1337 			 * sure that the returned address is the real
1338 			 * bound address when talking to an unknown
1339 			 * transport.
1340 			 */
1341 			break;
1342 		}
1343 	}
1344 
1345 	if (nl7c != NULL) {
1346 		/* Register listen()er sonode pointer with NL7C */
1347 		nl7c_listener_addr(nl7c, so);
1348 	}
1349 
1350 	freemsg(mp);
1351 
1352 done:
1353 	if (error) {
1354 		/* reset state & backlog to values held on entry */
1355 		if (clear_acceptconn_on_err == B_TRUE)
1356 			so->so_state &= ~SS_ACCEPTCONN;
1357 		if (restore_backlog_on_err == B_TRUE)
1358 			so->so_backlog = save_so_backlog;
1359 
1360 		if (unbind_on_err && so->so_state & SS_ISBOUND) {
1361 			int err;
1362 
1363 			err = sotpi_unbind(so, 0);
1364 			/* LINTED - statement has no consequent: if */
1365 			if (err) {
1366 				eprintsoline(so, error);
1367 			} else {
1368 				ASSERT(!(so->so_state & SS_ISBOUND));
1369 			}
1370 		}
1371 	}
1372 	if (!(flags & _SOBIND_LOCK_HELD)) {
1373 		so_unlock_single(so, SOLOCKED);
1374 		mutex_exit(&so->so_lock);
1375 	} else {
1376 		ASSERT(MUTEX_HELD(&so->so_lock));
1377 		ASSERT(so->so_flag & SOLOCKED);
1378 	}
1379 	return (error);
1380 }
1381 
1382 /* bind the socket */
1383 static int
sotpi_bind(struct sonode * so,struct sockaddr * name,socklen_t namelen,int flags,struct cred * cr)1384 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1385     int flags, struct cred *cr)
1386 {
1387 	if ((flags & _SOBIND_SOCKETPAIR) == 0)
1388 		return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1389 
1390 	flags &= ~_SOBIND_SOCKETPAIR;
1391 	return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1392 }
1393 
1394 /*
1395  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1396  * address, or when listen needs to unbind and bind.
1397  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1398  * so that a sobind can pick them up.
1399  */
1400 static int
sotpi_unbind(struct sonode * so,int flags)1401 sotpi_unbind(struct sonode *so, int flags)
1402 {
1403 	struct T_unbind_req	unbind_req;
1404 	int			error = 0;
1405 	mblk_t			*mp;
1406 	sotpi_info_t		*sti = SOTOTPI(so);
1407 
1408 	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1409 	    (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1410 
1411 	ASSERT(MUTEX_HELD(&so->so_lock));
1412 	ASSERT(so->so_flag & SOLOCKED);
1413 
1414 	if (!(so->so_state & SS_ISBOUND)) {
1415 		error = EINVAL;
1416 		eprintsoline(so, error);
1417 		goto done;
1418 	}
1419 
1420 	mutex_exit(&so->so_lock);
1421 
1422 	/*
1423 	 * Flush the read and write side (except stream head read queue)
1424 	 * and send down T_UNBIND_REQ.
1425 	 */
1426 	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1427 
1428 	unbind_req.PRIM_type = T_UNBIND_REQ;
1429 	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1430 	    0, _ALLOC_SLEEP, CRED());
1431 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1432 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1433 	mutex_enter(&so->so_lock);
1434 	if (error) {
1435 		eprintsoline(so, error);
1436 		goto done;
1437 	}
1438 
1439 	error = sowaitokack(so, T_UNBIND_REQ);
1440 	if (error) {
1441 		eprintsoline(so, error);
1442 		goto done;
1443 	}
1444 
1445 	/*
1446 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1447 	 * strsock_proto while the lock was dropped above, the unbind
1448 	 * is allowed to complete.
1449 	 */
1450 	if (!(flags & _SOUNBIND_REBIND)) {
1451 		/*
1452 		 * Clear out bound address.
1453 		 */
1454 		vnode_t *vp;
1455 
1456 		if ((vp = sti->sti_ux_bound_vp) != NULL) {
1457 			sti->sti_ux_bound_vp = NULL;
1458 			vn_rele_stream(vp);
1459 		}
1460 		/* Clear out address */
1461 		sti->sti_laddr_len = 0;
1462 	}
1463 	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1464 	sti->sti_laddr_valid = 0;
1465 
1466 done:
1467 
1468 	/* If the caller held the lock don't release it here */
1469 	ASSERT(MUTEX_HELD(&so->so_lock));
1470 	ASSERT(so->so_flag & SOLOCKED);
1471 
1472 	return (error);
1473 }
1474 
1475 /*
1476  * listen on the socket.
1477  * For TPI conforming transports this has to first unbind with the transport
1478  * and then bind again using the new backlog.
1479  */
1480 /* ARGSUSED */
1481 int
sotpi_listen(struct sonode * so,int backlog,struct cred * cr)1482 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1483 {
1484 	int		error = 0;
1485 	sotpi_info_t	*sti = SOTOTPI(so);
1486 
1487 	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1488 	    (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1489 
1490 	if (sti->sti_serv_type == T_CLTS)
1491 		return (EOPNOTSUPP);
1492 
1493 	/*
1494 	 * If the socket is ready to accept connections already, then
1495 	 * return without doing anything.  This avoids a problem where
1496 	 * a second listen() call fails if a connection is pending and
1497 	 * leaves the socket unbound. Only when we are not unbinding
1498 	 * with the transport can we safely increase the backlog.
1499 	 */
1500 	if (so->so_state & SS_ACCEPTCONN &&
1501 	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1502 	    /*CONSTCOND*/
1503 	    !solisten_tpi_tcp))
1504 		return (0);
1505 
1506 	if (so->so_state & SS_ISCONNECTED)
1507 		return (EINVAL);
1508 
1509 	mutex_enter(&so->so_lock);
1510 	so_lock_single(so);	/* Set SOLOCKED */
1511 
1512 	/*
1513 	 * If the listen doesn't change the backlog we do nothing.
1514 	 * This avoids an EPROTO error from the transport.
1515 	 */
1516 	if ((so->so_state & SS_ACCEPTCONN) &&
1517 	    so->so_backlog == backlog)
1518 		goto done;
1519 
1520 	if (!(so->so_state & SS_ISBOUND)) {
1521 		/*
1522 		 * Must have been explicitly bound in the UNIX domain.
1523 		 */
1524 		if (so->so_family == AF_UNIX) {
1525 			error = EINVAL;
1526 			goto done;
1527 		}
1528 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1529 		    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1530 	} else if (backlog > 0) {
1531 		/*
1532 		 * AF_INET{,6} hack to avoid losing the port.
1533 		 * Assumes that all AF_INET{,6} transports can handle a
1534 		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1535 		 * has already bound thus it is possible to avoid the unbind.
1536 		 */
1537 		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1538 		    /*CONSTCOND*/
1539 		    !solisten_tpi_tcp)) {
1540 			error = sotpi_unbind(so, _SOUNBIND_REBIND);
1541 			if (error)
1542 				goto done;
1543 		}
1544 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1545 		    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1546 	} else {
1547 		so->so_state |= SS_ACCEPTCONN;
1548 		so->so_backlog = backlog;
1549 	}
1550 	if (error)
1551 		goto done;
1552 	ASSERT(so->so_state & SS_ACCEPTCONN);
1553 done:
1554 	so_unlock_single(so, SOLOCKED);
1555 	mutex_exit(&so->so_lock);
1556 	return (error);
1557 }
1558 
1559 /*
1560  * Disconnect either a specified seqno or all (-1).
1561  * The former is used on listening sockets only.
1562  *
1563  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1564  * the current use of sodisconnect(seqno == -1) is only for shutdown
1565  * so there is no point (and potentially incorrect) to unbind.
1566  */
1567 static int
sodisconnect(struct sonode * so,t_scalar_t seqno,int flags)1568 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1569 {
1570 	struct T_discon_req	discon_req;
1571 	int			error = 0;
1572 	mblk_t			*mp;
1573 
1574 	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1575 	    (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1576 
1577 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1578 		mutex_enter(&so->so_lock);
1579 		so_lock_single(so);	/* Set SOLOCKED */
1580 	} else {
1581 		ASSERT(MUTEX_HELD(&so->so_lock));
1582 		ASSERT(so->so_flag & SOLOCKED);
1583 	}
1584 
1585 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1586 		error = EINVAL;
1587 		eprintsoline(so, error);
1588 		goto done;
1589 	}
1590 
1591 	mutex_exit(&so->so_lock);
1592 	/*
1593 	 * Flush the write side (unless this is a listener)
1594 	 * and then send down a T_DISCON_REQ.
1595 	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1596 	 * and other messages.)
1597 	 */
1598 	if (!(so->so_state & SS_ACCEPTCONN))
1599 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1600 
1601 	discon_req.PRIM_type = T_DISCON_REQ;
1602 	discon_req.SEQ_number = seqno;
1603 	mp = soallocproto1(&discon_req, sizeof (discon_req),
1604 	    0, _ALLOC_SLEEP, CRED());
1605 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1606 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1607 	mutex_enter(&so->so_lock);
1608 	if (error) {
1609 		eprintsoline(so, error);
1610 		goto done;
1611 	}
1612 
1613 	error = sowaitokack(so, T_DISCON_REQ);
1614 	if (error) {
1615 		eprintsoline(so, error);
1616 		goto done;
1617 	}
1618 	/*
1619 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1620 	 * strsock_proto while the lock was dropped above, the disconnect
1621 	 * is allowed to complete. However, it is not possible to
1622 	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1623 	 */
1624 	so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
1625 	SOTOTPI(so)->sti_laddr_valid = 0;
1626 	SOTOTPI(so)->sti_faddr_valid = 0;
1627 done:
1628 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1629 		so_unlock_single(so, SOLOCKED);
1630 		mutex_exit(&so->so_lock);
1631 	} else {
1632 		/* If the caller held the lock don't release it here */
1633 		ASSERT(MUTEX_HELD(&so->so_lock));
1634 		ASSERT(so->so_flag & SOLOCKED);
1635 	}
1636 	return (error);
1637 }
1638 
1639 /* ARGSUSED */
1640 int
sotpi_accept(struct sonode * so,int fflag,struct cred * cr,struct sonode ** nsop)1641 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1642     struct sonode **nsop)
1643 {
1644 	struct T_conn_ind	*conn_ind;
1645 	struct T_conn_res	*conn_res;
1646 	int			error = 0;
1647 	mblk_t			*mp, *ack_mp;
1648 	struct sonode		*nso;
1649 	vnode_t			*nvp;
1650 	void			*src;
1651 	t_uscalar_t		srclen;
1652 	void			*opt;
1653 	t_uscalar_t		optlen;
1654 	t_scalar_t		PRIM_type;
1655 	t_scalar_t		SEQ_number;
1656 	size_t			sinlen;
1657 	sotpi_info_t		*sti = SOTOTPI(so);
1658 	sotpi_info_t		*nsti;
1659 
1660 	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1661 	    (void *)so, fflag, (void *)nsop,
1662 	    pr_state(so->so_state, so->so_mode)));
1663 
1664 	/*
1665 	 * Defer single-threading the accepting socket until
1666 	 * the T_CONN_IND has been received and parsed and the
1667 	 * new sonode has been opened.
1668 	 */
1669 
1670 	/* Check that we are not already connected */
1671 	if ((so->so_state & SS_ACCEPTCONN) == 0)
1672 		goto conn_bad;
1673 again:
1674 	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1675 		goto e_bad;
1676 
1677 	ASSERT(mp != NULL);
1678 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1679 
1680 	/*
1681 	 * Save SEQ_number for error paths.
1682 	 */
1683 	SEQ_number = conn_ind->SEQ_number;
1684 
1685 	srclen = conn_ind->SRC_length;
1686 	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1687 	if (src == NULL) {
1688 		error = EPROTO;
1689 		freemsg(mp);
1690 		eprintsoline(so, error);
1691 		goto disconnect_unlocked;
1692 	}
1693 	optlen = conn_ind->OPT_length;
1694 	switch (so->so_family) {
1695 	case AF_INET:
1696 	case AF_INET6:
1697 		if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1698 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
1699 			    &opt, conn_ind->OPT_length);
1700 		} else {
1701 			/*
1702 			 * The transport (in this case TCP) hasn't sent up
1703 			 * a pointer to an instance for the accept fast-path.
1704 			 * Disable fast-path completely because the call to
1705 			 * sotpi_create() below would otherwise create an
1706 			 * incomplete TCP instance, which would lead to
1707 			 * problems when sockfs sends a normal T_CONN_RES
1708 			 * message down the new stream.
1709 			 */
1710 			if (sti->sti_direct) {
1711 				int rval;
1712 				/*
1713 				 * For consistency we inform tcp to disable
1714 				 * direct interface on the listener, though
1715 				 * we can certainly live without doing this
1716 				 * because no data will ever travel upstream
1717 				 * on the listening socket.
1718 				 */
1719 				sti->sti_direct = 0;
1720 				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1721 				    0, 0, K_TO_K, cr, &rval);
1722 			}
1723 			opt = NULL;
1724 			optlen = 0;
1725 		}
1726 		break;
1727 	case AF_UNIX:
1728 	default:
1729 		if (optlen != 0) {
1730 			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1731 			    __TPI_ALIGN_SIZE);
1732 			if (opt == NULL) {
1733 				error = EPROTO;
1734 				freemsg(mp);
1735 				eprintsoline(so, error);
1736 				goto disconnect_unlocked;
1737 			}
1738 		}
1739 		if (so->so_family == AF_UNIX) {
1740 			if (!sti->sti_faddr_noxlate) {
1741 				src = NULL;
1742 				srclen = 0;
1743 			}
1744 			/* Extract src address from options */
1745 			if (optlen != 0)
1746 				so_getopt_srcaddr(opt, optlen, &src, &srclen);
1747 		}
1748 		break;
1749 	}
1750 
1751 	/*
1752 	 * Create the new socket.
1753 	 */
1754 	nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1755 	if (nso == NULL) {
1756 		ASSERT(error != 0);
1757 		/*
1758 		 * Accept can not fail with ENOBUFS. sotpi_create
1759 		 * sleeps waiting for memory until a signal is caught
1760 		 * so return EINTR.
1761 		 */
1762 		freemsg(mp);
1763 		if (error == ENOBUFS)
1764 			error = EINTR;
1765 		goto e_disc_unl;
1766 	}
1767 	nvp = SOTOV(nso);
1768 	nsti = SOTOTPI(nso);
1769 
1770 #ifdef DEBUG
1771 	/*
1772 	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1773 	 * it's inherited early to allow debugging of the accept code itself.
1774 	 */
1775 	nso->so_options |= so->so_options & SO_DEBUG;
1776 #endif /* DEBUG */
1777 
1778 	/*
1779 	 * Save the SRC address from the T_CONN_IND
1780 	 * for getpeername to work on AF_UNIX and on transports that do not
1781 	 * support TI_GETPEERNAME.
1782 	 *
1783 	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1784 	 * copyin_name().
1785 	 */
1786 	if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1787 		error = EINVAL;
1788 		freemsg(mp);
1789 		eprintsoline(so, error);
1790 		goto disconnect_vp_unlocked;
1791 	}
1792 	nsti->sti_faddr_len = (socklen_t)srclen;
1793 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1794 	bcopy(src, nsti->sti_faddr_sa, srclen);
1795 	nsti->sti_faddr_valid = 1;
1796 
1797 	/*
1798 	 * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1799 	 */
1800 	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1801 	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1802 		cred_t	*cr;
1803 		pid_t	cpid;
1804 
1805 		cr = msg_getcred(mp, &cpid);
1806 		if (cr != NULL) {
1807 			crhold(cr);
1808 			nso->so_peercred = cr;
1809 			nso->so_cpid = cpid;
1810 		}
1811 		freemsg(mp);
1812 
1813 		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1814 		    sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1815 		if (mp == NULL) {
1816 			/*
1817 			 * Accept can not fail with ENOBUFS.
1818 			 * A signal was caught so return EINTR.
1819 			 */
1820 			error = EINTR;
1821 			eprintsoline(so, error);
1822 			goto disconnect_vp_unlocked;
1823 		}
1824 		conn_res = (struct T_conn_res *)mp->b_rptr;
1825 	} else {
1826 		/*
1827 		 * For efficency reasons we use msg_extractcred; no crhold
1828 		 * needed since db_credp is cleared (i.e., we move the cred
1829 		 * from the message to so_peercred.
1830 		 */
1831 		nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1832 
1833 		mp->b_rptr = DB_BASE(mp);
1834 		conn_res = (struct T_conn_res *)mp->b_rptr;
1835 		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1836 
1837 		mblk_setcred(mp, cr, curproc->p_pid);
1838 	}
1839 
1840 	/*
1841 	 * New socket must be bound at least in sockfs and, except for AF_INET,
1842 	 * (or AF_INET6) it also has to be bound in the transport provider.
1843 	 * We set the local address in the sonode from the T_OK_ACK of the
1844 	 * T_CONN_RES. For this reason the address we bind to here isn't
1845 	 * important.
1846 	 */
1847 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1848 	    /*CONSTCOND*/
1849 	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1850 		/*
1851 		 * Optimization for AF_INET{,6} transports
1852 		 * that can handle a T_CONN_RES without being bound.
1853 		 */
1854 		mutex_enter(&nso->so_lock);
1855 		so_automatic_bind(nso);
1856 		mutex_exit(&nso->so_lock);
1857 	} else {
1858 		/* Perform NULL bind with the transport provider. */
1859 		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1860 		    cr)) != 0) {
1861 			ASSERT(error != ENOBUFS);
1862 			freemsg(mp);
1863 			eprintsoline(nso, error);
1864 			goto disconnect_vp_unlocked;
1865 		}
1866 	}
1867 
1868 	/*
1869 	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1870 	 * so that any data arriving on the new socket will cause the
1871 	 * appropriate signals to be delivered for the new socket.
1872 	 *
1873 	 * No other thread (except strsock_proto and strsock_misc)
1874 	 * can access the new socket thus we relax the locking.
1875 	 */
1876 	nso->so_pgrp = so->so_pgrp;
1877 	nso->so_state |= so->so_state & SS_ASYNC;
1878 	nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1879 
1880 	if (nso->so_pgrp != 0) {
1881 		if ((error = so_set_events(nso, nvp, cr)) != 0) {
1882 			eprintsoline(nso, error);
1883 			error = 0;
1884 			nso->so_pgrp = 0;
1885 		}
1886 	}
1887 
1888 	/*
1889 	 * Make note of the socket level options. TCP and IP level options
1890 	 * are already inherited. We could do all this after accept is
1891 	 * successful but doing it here simplifies code and no harm done
1892 	 * for error case.
1893 	 */
1894 	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1895 	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1896 	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1897 	nso->so_sndbuf = so->so_sndbuf;
1898 	nso->so_rcvbuf = so->so_rcvbuf;
1899 	if (nso->so_options & SO_LINGER)
1900 		nso->so_linger = so->so_linger;
1901 
1902 	/*
1903 	 * Note that the following sti_direct code path should be
1904 	 * removed once we are confident that the direct sockets
1905 	 * do not result in any degradation.
1906 	 */
1907 	if (sti->sti_direct) {
1908 
1909 		ASSERT(opt != NULL);
1910 
1911 		conn_res->OPT_length = optlen;
1912 		conn_res->OPT_offset = MBLKL(mp);
1913 		bcopy(&opt, mp->b_wptr, optlen);
1914 		mp->b_wptr += optlen;
1915 		conn_res->PRIM_type = T_CONN_RES;
1916 		conn_res->ACCEPTOR_id = 0;
1917 		PRIM_type = T_CONN_RES;
1918 
1919 		/* Send down the T_CONN_RES on acceptor STREAM */
1920 		error = kstrputmsg(SOTOV(nso), mp, NULL,
1921 		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1922 		if (error) {
1923 			mutex_enter(&so->so_lock);
1924 			so_lock_single(so);
1925 			eprintsoline(so, error);
1926 			goto disconnect_vp;
1927 		}
1928 		mutex_enter(&nso->so_lock);
1929 		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1930 		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1931 		if (error) {
1932 			mutex_exit(&nso->so_lock);
1933 			mutex_enter(&so->so_lock);
1934 			so_lock_single(so);
1935 			eprintsoline(so, error);
1936 			goto disconnect_vp;
1937 		}
1938 		if (nso->so_family == AF_INET) {
1939 			sin_t *sin;
1940 
1941 			sin = (sin_t *)(ack_mp->b_rptr +
1942 			    sizeof (struct T_ok_ack));
1943 			bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
1944 			nsti->sti_laddr_len = sizeof (sin_t);
1945 		} else {
1946 			sin6_t *sin6;
1947 
1948 			sin6 = (sin6_t *)(ack_mp->b_rptr +
1949 			    sizeof (struct T_ok_ack));
1950 			bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
1951 			nsti->sti_laddr_len = sizeof (sin6_t);
1952 		}
1953 		freemsg(ack_mp);
1954 
1955 		nso->so_state |= SS_ISCONNECTED;
1956 		nso->so_proto_handle = (sock_lower_handle_t)opt;
1957 		nsti->sti_laddr_valid = 1;
1958 
1959 		if (sti->sti_nl7c_flags & NL7C_ENABLED) {
1960 			/*
1961 			 * A NL7C marked listen()er so the new socket
1962 			 * inherits the listen()er's NL7C state, except
1963 			 * for NL7C_POLLIN.
1964 			 *
1965 			 * Only call NL7C to process the new socket if
1966 			 * the listen socket allows blocking i/o.
1967 			 */
1968 			nsti->sti_nl7c_flags =
1969 			    sti->sti_nl7c_flags & (~NL7C_POLLIN);
1970 			if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
1971 				/*
1972 				 * Nonblocking accept() just make it
1973 				 * persist to defer processing to the
1974 				 * read-side syscall (e.g. read).
1975 				 */
1976 				nsti->sti_nl7c_flags |= NL7C_SOPERSIST;
1977 			} else if (nl7c_process(nso, B_FALSE)) {
1978 				/*
1979 				 * NL7C has completed processing on the
1980 				 * socket, close the socket and back to
1981 				 * the top to await the next T_CONN_IND.
1982 				 */
1983 				mutex_exit(&nso->so_lock);
1984 				(void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
1985 				    cr, NULL);
1986 				VN_RELE(nvp);
1987 				goto again;
1988 			}
1989 			/* Pass the new socket out */
1990 		}
1991 
1992 		mutex_exit(&nso->so_lock);
1993 
1994 		/*
1995 		 * It's possible, through the use of autopush for example,
1996 		 * that the acceptor stream may not support sti_direct
1997 		 * semantics. If the new socket does not support sti_direct
1998 		 * we issue a _SIOCSOCKFALLBACK to inform the transport
1999 		 * as we would in the I_PUSH case.
2000 		 */
2001 		if (nsti->sti_direct == 0) {
2002 			int	rval;
2003 
2004 			if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
2005 			    0, 0, K_TO_K, cr, &rval)) != 0) {
2006 				mutex_enter(&so->so_lock);
2007 				so_lock_single(so);
2008 				eprintsoline(so, error);
2009 				goto disconnect_vp;
2010 			}
2011 		}
2012 
2013 		/*
2014 		 * Pass out new socket.
2015 		 */
2016 		if (nsop != NULL)
2017 			*nsop = nso;
2018 
2019 		return (0);
2020 	}
2021 
2022 	/*
2023 	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
2024 	 * which don't support the FireEngine accept fast-path. It is also
2025 	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
2026 	 * again. Neither sockfs nor TCP attempt to find out if some other
2027 	 * random module has been inserted in between (in which case we
2028 	 * should follow TLI accept behaviour). We blindly assume the worst
2029 	 * case and revert back to old behaviour i.e. TCP will not send us
2030 	 * any option (eager) and the accept should happen on the listener
2031 	 * queue. Any queued T_conn_ind have already got their options removed
2032 	 * by so_sock2_stream() when "sockmod" was I_POP'd.
2033 	 */
2034 	/*
2035 	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
2036 	 */
2037 	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
2038 #ifdef	_ILP32
2039 		queue_t	*q;
2040 
2041 		/*
2042 		 * Find read queue in driver
2043 		 * Can safely do this since we "own" nso/nvp.
2044 		 */
2045 		q = strvp2wq(nvp)->q_next;
2046 		while (SAMESTR(q))
2047 			q = q->q_next;
2048 		q = RD(q);
2049 		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
2050 #else
2051 		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
2052 #endif	/* _ILP32 */
2053 		conn_res->PRIM_type = O_T_CONN_RES;
2054 		PRIM_type = O_T_CONN_RES;
2055 	} else {
2056 		conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
2057 		conn_res->PRIM_type = T_CONN_RES;
2058 		PRIM_type = T_CONN_RES;
2059 	}
2060 	conn_res->SEQ_number = SEQ_number;
2061 	conn_res->OPT_length = 0;
2062 	conn_res->OPT_offset = 0;
2063 
2064 	mutex_enter(&so->so_lock);
2065 	so_lock_single(so);	/* Set SOLOCKED */
2066 	mutex_exit(&so->so_lock);
2067 
2068 	error = kstrputmsg(SOTOV(so), mp, NULL,
2069 	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2070 	mutex_enter(&so->so_lock);
2071 	if (error) {
2072 		eprintsoline(so, error);
2073 		goto disconnect_vp;
2074 	}
2075 	error = sowaitprim(so, PRIM_type, T_OK_ACK,
2076 	    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2077 	if (error) {
2078 		eprintsoline(so, error);
2079 		goto disconnect_vp;
2080 	}
2081 	mutex_exit(&so->so_lock);
2082 	/*
2083 	 * If there is a sin/sin6 appended onto the T_OK_ACK use
2084 	 * that to set the local address. If this is not present
2085 	 * then we zero out the address and don't set the
2086 	 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
2087 	 * the pathname from the listening socket.
2088 	 * In the case where this is TCP or an AF_UNIX socket the
2089 	 * client side may have queued data or a T_ORDREL in the
2090 	 * transport. Having now sent the T_CONN_RES we may receive
2091 	 * those queued messages at any time. Hold the acceptor
2092 	 * so_lock until its state and laddr are finalized.
2093 	 */
2094 	mutex_enter(&nso->so_lock);
2095 	sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
2096 	if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
2097 	    MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2098 		ack_mp->b_rptr += sizeof (struct T_ok_ack);
2099 		bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2100 		nsti->sti_laddr_len = sinlen;
2101 		nsti->sti_laddr_valid = 1;
2102 	} else if (nso->so_family == AF_UNIX) {
2103 		ASSERT(so->so_family == AF_UNIX);
2104 		nsti->sti_laddr_len = sti->sti_laddr_len;
2105 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2106 		bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2107 		    nsti->sti_laddr_len);
2108 		nsti->sti_laddr_valid = 1;
2109 	} else {
2110 		nsti->sti_laddr_len = sti->sti_laddr_len;
2111 		ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2112 		bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2113 		nsti->sti_laddr_sa->sa_family = nso->so_family;
2114 	}
2115 	nso->so_state |= SS_ISCONNECTED;
2116 	mutex_exit(&nso->so_lock);
2117 
2118 	freemsg(ack_mp);
2119 
2120 	mutex_enter(&so->so_lock);
2121 	so_unlock_single(so, SOLOCKED);
2122 	mutex_exit(&so->so_lock);
2123 
2124 	/*
2125 	 * Pass out new socket.
2126 	 */
2127 	if (nsop != NULL)
2128 		*nsop = nso;
2129 
2130 	return (0);
2131 
2132 
2133 eproto_disc_unl:
2134 	error = EPROTO;
2135 e_disc_unl:
2136 	eprintsoline(so, error);
2137 	goto disconnect_unlocked;
2138 
2139 pr_disc_vp_unl:
2140 	eprintsoline(so, error);
2141 disconnect_vp_unlocked:
2142 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2143 	VN_RELE(nvp);
2144 disconnect_unlocked:
2145 	(void) sodisconnect(so, SEQ_number, 0);
2146 	return (error);
2147 
2148 pr_disc_vp:
2149 	eprintsoline(so, error);
2150 disconnect_vp:
2151 	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2152 	so_unlock_single(so, SOLOCKED);
2153 	mutex_exit(&so->so_lock);
2154 	(void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2155 	VN_RELE(nvp);
2156 	return (error);
2157 
2158 conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2159 	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2160 	    ? EOPNOTSUPP : EINVAL;
2161 e_bad:
2162 	eprintsoline(so, error);
2163 	return (error);
2164 }
2165 
2166 /*
2167  * connect a socket.
2168  *
2169  * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
2170  * unconnect (by specifying a null address).
2171  */
2172 int
sotpi_connect(struct sonode * so,struct sockaddr * name,socklen_t namelen,int fflag,int flags,struct cred * cr)2173 sotpi_connect(struct sonode *so,
2174     struct sockaddr *name,
2175     socklen_t namelen,
2176     int fflag,
2177     int flags,
2178     struct cred *cr)
2179 {
2180 	struct T_conn_req	conn_req;
2181 	int			error = 0;
2182 	mblk_t			*mp;
2183 	void			*src;
2184 	socklen_t		srclen;
2185 	void			*addr;
2186 	socklen_t		addrlen;
2187 	boolean_t		need_unlock;
2188 	sotpi_info_t		*sti = SOTOTPI(so);
2189 
2190 	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2191 	    (void *)so, (void *)name, namelen, fflag, flags,
2192 	    pr_state(so->so_state, so->so_mode)));
2193 
2194 	/*
2195 	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2196 	 * avoid sleeping for memory with SOLOCKED held.
2197 	 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2198 	 * + sizeof (struct T_opthdr).
2199 	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
2200 	 * exceed sti_faddr_maxlen).
2201 	 */
2202 	mp = soallocproto(sizeof (struct T_conn_req) +
2203 	    2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
2204 	    cr);
2205 	if (mp == NULL) {
2206 		/*
2207 		 * Connect can not fail with ENOBUFS. A signal was
2208 		 * caught so return EINTR.
2209 		 */
2210 		error = EINTR;
2211 		eprintsoline(so, error);
2212 		return (error);
2213 	}
2214 
2215 	mutex_enter(&so->so_lock);
2216 	/*
2217 	 * Make sure there is a preallocated T_unbind_req message
2218 	 * before any binding. This message is allocated when the
2219 	 * socket is created. Since another thread can consume
2220 	 * so_unbind_mp by the time we return from so_lock_single(),
2221 	 * we should check the availability of so_unbind_mp after
2222 	 * we return from so_lock_single().
2223 	 */
2224 
2225 	so_lock_single(so);	/* Set SOLOCKED */
2226 	need_unlock = B_TRUE;
2227 
2228 	if (sti->sti_unbind_mp == NULL) {
2229 		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2230 		/* NOTE: holding so_lock while sleeping */
2231 		sti->sti_unbind_mp =
2232 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
2233 		if (sti->sti_unbind_mp == NULL) {
2234 			error = EINTR;
2235 			goto done;
2236 		}
2237 	}
2238 
2239 	/*
2240 	 * Can't have done a listen before connecting.
2241 	 */
2242 	if (so->so_state & SS_ACCEPTCONN) {
2243 		error = EOPNOTSUPP;
2244 		goto done;
2245 	}
2246 
2247 	/*
2248 	 * Must be bound with the transport
2249 	 */
2250 	if (!(so->so_state & SS_ISBOUND)) {
2251 		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2252 		    /*CONSTCOND*/
2253 		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2254 			/*
2255 			 * Optimization for AF_INET{,6} transports
2256 			 * that can handle a T_CONN_REQ without being bound.
2257 			 */
2258 			so_automatic_bind(so);
2259 		} else {
2260 			error = sotpi_bind(so, NULL, 0,
2261 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2262 			if (error)
2263 				goto done;
2264 		}
2265 		ASSERT(so->so_state & SS_ISBOUND);
2266 		flags |= _SOCONNECT_DID_BIND;
2267 	}
2268 
2269 	/*
2270 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
2271 	 * connect to a null address. This is the portable method to
2272 	 * unconnect a socket.
2273 	 */
2274 	if ((namelen >= sizeof (sa_family_t)) &&
2275 	    (name->sa_family == AF_UNSPEC)) {
2276 		name = NULL;
2277 		namelen = 0;
2278 	}
2279 
2280 	/*
2281 	 * Check that we are not already connected.
2282 	 * A connection-oriented socket cannot be reconnected.
2283 	 * A connected connection-less socket can be
2284 	 * - connected to a different address by a subsequent connect
2285 	 * - "unconnected" by a connect to the NULL address
2286 	 */
2287 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2288 		ASSERT(!(flags & _SOCONNECT_DID_BIND));
2289 		if (so->so_mode & SM_CONNREQUIRED) {
2290 			/* Connection-oriented socket */
2291 			error = so->so_state & SS_ISCONNECTED ?
2292 			    EISCONN : EALREADY;
2293 			goto done;
2294 		}
2295 		/* Connection-less socket */
2296 		if (name == NULL) {
2297 			/*
2298 			 * Remove the connected state and clear SO_DGRAM_ERRIND
2299 			 * since it was set when the socket was connected.
2300 			 * If this is UDP also send down a T_DISCON_REQ.
2301 			 */
2302 			int val;
2303 
2304 			if ((so->so_family == AF_INET ||
2305 			    so->so_family == AF_INET6) &&
2306 			    (so->so_type == SOCK_DGRAM ||
2307 			    so->so_type == SOCK_RAW) &&
2308 			    /*CONSTCOND*/
2309 			    !soconnect_tpi_udp) {
2310 				/* XXX What about implicitly unbinding here? */
2311 				error = sodisconnect(so, -1,
2312 				    _SODISCONNECT_LOCK_HELD);
2313 			} else {
2314 				so->so_state &=
2315 				    ~(SS_ISCONNECTED | SS_ISCONNECTING);
2316 				sti->sti_faddr_valid = 0;
2317 				sti->sti_faddr_len = 0;
2318 			}
2319 
2320 			/* Remove SOLOCKED since setsockopt will grab it */
2321 			so_unlock_single(so, SOLOCKED);
2322 			mutex_exit(&so->so_lock);
2323 
2324 			val = 0;
2325 			(void) sotpi_setsockopt(so, SOL_SOCKET,
2326 			    SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2327 			    cr);
2328 
2329 			mutex_enter(&so->so_lock);
2330 			so_lock_single(so);	/* Set SOLOCKED */
2331 			goto done;
2332 		}
2333 	}
2334 	ASSERT(so->so_state & SS_ISBOUND);
2335 
2336 	if (name == NULL || namelen == 0) {
2337 		error = EINVAL;
2338 		goto done;
2339 	}
2340 	/*
2341 	 * Mark the socket if sti_faddr_sa represents the transport level
2342 	 * address.
2343 	 */
2344 	if (flags & _SOCONNECT_NOXLATE) {
2345 		struct sockaddr_ux	*soaddr_ux;
2346 
2347 		ASSERT(so->so_family == AF_UNIX);
2348 		if (namelen != sizeof (struct sockaddr_ux)) {
2349 			error = EINVAL;
2350 			goto done;
2351 		}
2352 		soaddr_ux = (struct sockaddr_ux *)name;
2353 		name = (struct sockaddr *)&soaddr_ux->sou_addr;
2354 		namelen = sizeof (soaddr_ux->sou_addr);
2355 		sti->sti_faddr_noxlate = 1;
2356 	}
2357 
2358 	/*
2359 	 * Length and family checks.
2360 	 */
2361 	error = so_addr_verify(so, name, namelen);
2362 	if (error)
2363 		goto bad;
2364 
2365 	/*
2366 	 * Save foreign address. Needed for AF_UNIX as well as
2367 	 * transport providers that do not support TI_GETPEERNAME.
2368 	 * Also used for cached foreign address for TCP and UDP.
2369 	 */
2370 	if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2371 		error = EINVAL;
2372 		goto done;
2373 	}
2374 	sti->sti_faddr_len = (socklen_t)namelen;
2375 	ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2376 	bcopy(name, sti->sti_faddr_sa, namelen);
2377 	sti->sti_faddr_valid = 1;
2378 
2379 	if (so->so_family == AF_UNIX) {
2380 		if (sti->sti_faddr_noxlate) {
2381 			/*
2382 			 * sti_faddr is a transport-level address, so
2383 			 * don't pass it as an option.  Do save it in
2384 			 * sti_ux_faddr, used for connected DG send.
2385 			 */
2386 			src = NULL;
2387 			srclen = 0;
2388 			addr = sti->sti_faddr_sa;
2389 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2390 			bcopy(addr, &sti->sti_ux_faddr,
2391 			    sizeof (sti->sti_ux_faddr));
2392 		} else {
2393 			/*
2394 			 * Pass the sockaddr_un source address as an option
2395 			 * and translate the remote address.
2396 			 * Holding so_lock thus sti_laddr_sa can not change.
2397 			 */
2398 			src = sti->sti_laddr_sa;
2399 			srclen = (t_uscalar_t)sti->sti_laddr_len;
2400 			dprintso(so, 1,
2401 			    ("sotpi_connect UNIX: srclen %d, src %p\n",
2402 			    srclen, src));
2403 			/*
2404 			 * Translate the destination address into our
2405 			 * internal form, and save it in sti_ux_faddr.
2406 			 * After this call, addr==&sti->sti_ux_taddr,
2407 			 * and we copy that to sti->sti_ux_faddr so
2408 			 * we save the connected peer address.
2409 			 */
2410 			error = so_ux_addr_xlate(so,
2411 			    sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2412 			    (flags & _SOCONNECT_XPG4_2),
2413 			    &addr, &addrlen);
2414 			if (error)
2415 				goto bad;
2416 			bcopy(&sti->sti_ux_taddr, &sti->sti_ux_faddr,
2417 			    sizeof (sti->sti_ux_faddr));
2418 		}
2419 	} else {
2420 		addr = sti->sti_faddr_sa;
2421 		addrlen = (t_uscalar_t)sti->sti_faddr_len;
2422 		src = NULL;
2423 		srclen = 0;
2424 	}
2425 	/*
2426 	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2427 	 * option which asks the transport provider to send T_UDERR_IND
2428 	 * messages. These T_UDERR_IND messages are used to return connected
2429 	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2430 	 *
2431 	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2432 	 * we send down a T_CONN_REQ. This is needed to let the
2433 	 * transport assign a local address that is consistent with
2434 	 * the remote address. Applications depend on a getsockname()
2435 	 * after a connect() to retrieve the "source" IP address for
2436 	 * the connected socket.  Invalidate the cached local address
2437 	 * to force getsockname() to enquire of the transport.
2438 	 */
2439 	if (!(so->so_mode & SM_CONNREQUIRED)) {
2440 		/*
2441 		 * Datagram socket.
2442 		 */
2443 		int32_t val;
2444 
2445 		so_unlock_single(so, SOLOCKED);
2446 		mutex_exit(&so->so_lock);
2447 
2448 		val = 1;
2449 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2450 		    &val, (t_uscalar_t)sizeof (val), cr);
2451 
2452 		mutex_enter(&so->so_lock);
2453 		so_lock_single(so);	/* Set SOLOCKED */
2454 		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2455 		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2456 		    soconnect_tpi_udp) {
2457 			soisconnected(so);
2458 			goto done;
2459 		}
2460 		/*
2461 		 * Send down T_CONN_REQ etc.
2462 		 * Clear fflag to avoid returning EWOULDBLOCK.
2463 		 */
2464 		fflag = 0;
2465 		ASSERT(so->so_family != AF_UNIX);
2466 		sti->sti_laddr_valid = 0;
2467 	} else if (sti->sti_laddr_len != 0) {
2468 		/*
2469 		 * If the local address or port was "any" then it may be
2470 		 * changed by the transport as a result of the
2471 		 * connect.  Invalidate the cached version if we have one.
2472 		 */
2473 		switch (so->so_family) {
2474 		case AF_INET:
2475 			ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2476 			if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2477 			    INADDR_ANY ||
2478 			    ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2479 				sti->sti_laddr_valid = 0;
2480 			break;
2481 
2482 		case AF_INET6:
2483 			ASSERT(sti->sti_laddr_len ==
2484 			    (socklen_t)sizeof (sin6_t));
2485 			if (IN6_IS_ADDR_UNSPECIFIED(
2486 			    &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2487 			    IN6_IS_ADDR_V4MAPPED_ANY(
2488 			    &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2489 			    ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2490 				sti->sti_laddr_valid = 0;
2491 			break;
2492 
2493 		default:
2494 			break;
2495 		}
2496 	}
2497 
2498 	/*
2499 	 * Check for failure of an earlier call
2500 	 */
2501 	if (so->so_error != 0)
2502 		goto so_bad;
2503 
2504 	/*
2505 	 * Send down T_CONN_REQ. Message was allocated above.
2506 	 */
2507 	conn_req.PRIM_type = T_CONN_REQ;
2508 	conn_req.DEST_length = addrlen;
2509 	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2510 	if (srclen == 0) {
2511 		conn_req.OPT_length = 0;
2512 		conn_req.OPT_offset = 0;
2513 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2514 		soappendmsg(mp, addr, addrlen);
2515 	} else {
2516 		/*
2517 		 * There is a AF_UNIX sockaddr_un to include as a source
2518 		 * address option.
2519 		 */
2520 		struct T_opthdr toh;
2521 
2522 		toh.level = SOL_SOCKET;
2523 		toh.name = SO_SRCADDR;
2524 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2525 		toh.status = 0;
2526 		conn_req.OPT_length =
2527 		    (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2528 		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2529 		    _TPI_ALIGN_TOPT(addrlen));
2530 
2531 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2532 		soappendmsg(mp, addr, addrlen);
2533 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2534 		soappendmsg(mp, &toh, sizeof (toh));
2535 		soappendmsg(mp, src, srclen);
2536 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2537 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2538 	}
2539 	/*
2540 	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2541 	 * in order to have the right state when the T_CONN_CON shows up.
2542 	 */
2543 	soisconnecting(so);
2544 	mutex_exit(&so->so_lock);
2545 
2546 	if (AU_AUDITING())
2547 		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2548 
2549 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2550 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2551 	mp = NULL;
2552 	mutex_enter(&so->so_lock);
2553 	if (error != 0)
2554 		goto bad;
2555 
2556 	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2557 		goto bad;
2558 
2559 	/* Allow other threads to access the socket */
2560 	so_unlock_single(so, SOLOCKED);
2561 	need_unlock = B_FALSE;
2562 
2563 	/*
2564 	 * Wait until we get a T_CONN_CON or an error
2565 	 */
2566 	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2567 		so_lock_single(so);	/* Set SOLOCKED */
2568 		need_unlock = B_TRUE;
2569 	}
2570 
2571 done:
2572 	freemsg(mp);
2573 	switch (error) {
2574 	case EINPROGRESS:
2575 	case EALREADY:
2576 	case EISCONN:
2577 	case EINTR:
2578 		/* Non-fatal errors */
2579 		sti->sti_laddr_valid = 0;
2580 		/* FALLTHRU */
2581 	case 0:
2582 		break;
2583 	default:
2584 		ASSERT(need_unlock);
2585 		/*
2586 		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2587 		 * and invalidate local-address cache
2588 		 */
2589 		so->so_state &= ~SS_ISCONNECTING;
2590 		sti->sti_laddr_valid = 0;
2591 		/* A discon_ind might have already unbound us */
2592 		if ((flags & _SOCONNECT_DID_BIND) &&
2593 		    (so->so_state & SS_ISBOUND)) {
2594 			int err;
2595 
2596 			err = sotpi_unbind(so, 0);
2597 			/* LINTED - statement has no conseq */
2598 			if (err) {
2599 				eprintsoline(so, err);
2600 			}
2601 		}
2602 		break;
2603 	}
2604 	if (need_unlock)
2605 		so_unlock_single(so, SOLOCKED);
2606 	mutex_exit(&so->so_lock);
2607 	return (error);
2608 
2609 so_bad:	error = sogeterr(so, B_TRUE);
2610 bad:	eprintsoline(so, error);
2611 	goto done;
2612 }
2613 
2614 /* ARGSUSED */
2615 int
sotpi_shutdown(struct sonode * so,int how,struct cred * cr)2616 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2617 {
2618 	struct T_ordrel_req	ordrel_req;
2619 	mblk_t			*mp;
2620 	uint_t			old_state, state_change;
2621 	int			error = 0;
2622 	sotpi_info_t		*sti = SOTOTPI(so);
2623 
2624 	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2625 	    (void *)so, how, pr_state(so->so_state, so->so_mode)));
2626 
2627 	mutex_enter(&so->so_lock);
2628 	so_lock_single(so);	/* Set SOLOCKED */
2629 
2630 	/*
2631 	 * SunOS 4.X has no check for datagram sockets.
2632 	 * 5.X checks that it is connected (ENOTCONN)
2633 	 * X/Open requires that we check the connected state.
2634 	 */
2635 	if (!(so->so_state & SS_ISCONNECTED)) {
2636 		if (!xnet_skip_checks) {
2637 			error = ENOTCONN;
2638 			if (xnet_check_print) {
2639 				printf("sockfs: X/Open shutdown check "
2640 				    "caused ENOTCONN\n");
2641 			}
2642 		}
2643 		goto done;
2644 	}
2645 	/*
2646 	 * Record the current state and then perform any state changes.
2647 	 * Then use the difference between the old and new states to
2648 	 * determine which messages need to be sent.
2649 	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2650 	 * duplicate calls to shutdown().
2651 	 */
2652 	old_state = so->so_state;
2653 
2654 	switch (how) {
2655 	case 0:
2656 		socantrcvmore(so);
2657 		break;
2658 	case 1:
2659 		socantsendmore(so);
2660 		break;
2661 	case 2:
2662 		socantsendmore(so);
2663 		socantrcvmore(so);
2664 		break;
2665 	default:
2666 		error = EINVAL;
2667 		goto done;
2668 	}
2669 
2670 	/*
2671 	 * Assumes that the SS_CANT* flags are never cleared in the above code.
2672 	 */
2673 	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2674 	    (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2675 	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2676 
2677 	switch (state_change) {
2678 	case 0:
2679 		dprintso(so, 1,
2680 		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
2681 		    so->so_state));
2682 		goto done;
2683 
2684 	case SS_CANTRCVMORE:
2685 		mutex_exit(&so->so_lock);
2686 		strseteof(SOTOV(so), 1);
2687 		/*
2688 		 * strseteof takes care of read side wakeups,
2689 		 * pollwakeups, and signals.
2690 		 */
2691 		/*
2692 		 * Get the read lock before flushing data to avoid problems
2693 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2694 		 */
2695 		mutex_enter(&so->so_lock);
2696 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2697 		mutex_exit(&so->so_lock);
2698 
2699 		/* Flush read side queue */
2700 		strflushrq(SOTOV(so), FLUSHALL);
2701 
2702 		mutex_enter(&so->so_lock);
2703 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2704 		break;
2705 
2706 	case SS_CANTSENDMORE:
2707 		mutex_exit(&so->so_lock);
2708 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2709 		mutex_enter(&so->so_lock);
2710 		break;
2711 
2712 	case SS_CANTSENDMORE|SS_CANTRCVMORE:
2713 		mutex_exit(&so->so_lock);
2714 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2715 		strseteof(SOTOV(so), 1);
2716 		/*
2717 		 * strseteof takes care of read side wakeups,
2718 		 * pollwakeups, and signals.
2719 		 */
2720 		/*
2721 		 * Get the read lock before flushing data to avoid problems
2722 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2723 		 */
2724 		mutex_enter(&so->so_lock);
2725 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2726 		mutex_exit(&so->so_lock);
2727 
2728 		/* Flush read side queue */
2729 		strflushrq(SOTOV(so), FLUSHALL);
2730 
2731 		mutex_enter(&so->so_lock);
2732 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2733 		break;
2734 	}
2735 
2736 	ASSERT(MUTEX_HELD(&so->so_lock));
2737 
2738 	/*
2739 	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2740 	 * was set due to this call and the new state has both of them set:
2741 	 *	Send the AF_UNIX close indication
2742 	 *	For T_COTS send a discon_ind
2743 	 *
2744 	 * If cantsend was set due to this call:
2745 	 *	For T_COTSORD send an ordrel_ind
2746 	 *
2747 	 * Note that for T_CLTS there is no message sent here.
2748 	 */
2749 	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2750 	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2751 		/*
2752 		 * For SunOS 4.X compatibility we tell the other end
2753 		 * that we are unable to receive at this point.
2754 		 */
2755 		if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2756 			so_unix_close(so);
2757 
2758 		if (sti->sti_serv_type == T_COTS)
2759 			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2760 	}
2761 	if ((state_change & SS_CANTSENDMORE) &&
2762 	    (sti->sti_serv_type == T_COTS_ORD)) {
2763 		/* Send an orderly release */
2764 		ordrel_req.PRIM_type = T_ORDREL_REQ;
2765 
2766 		mutex_exit(&so->so_lock);
2767 		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2768 		    0, _ALLOC_SLEEP, cr);
2769 		/*
2770 		 * Send down the T_ORDREL_REQ even if there is flow control.
2771 		 * This prevents shutdown from blocking.
2772 		 * Note that there is no T_OK_ACK for ordrel_req.
2773 		 */
2774 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2775 		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2776 		mutex_enter(&so->so_lock);
2777 		if (error) {
2778 			eprintsoline(so, error);
2779 			goto done;
2780 		}
2781 	}
2782 
2783 done:
2784 	so_unlock_single(so, SOLOCKED);
2785 	mutex_exit(&so->so_lock);
2786 	return (error);
2787 }
2788 
2789 /*
2790  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2791  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2792  * that we have closed.
2793  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2794  * T_UNITDATA_REQ containing the same option.
2795  *
2796  * For SOCK_DGRAM half-connections (somebody connected to this end
2797  * but this end is not connect) we don't know where to send any
2798  * SO_UNIX_CLOSE.
2799  *
2800  * We have to ignore stream head errors just in case there has been
2801  * a shutdown(output).
2802  * Ignore any flow control to try to get the message more quickly to the peer.
2803  * While locally ignoring flow control solves the problem when there
2804  * is only the loopback transport on the stream it would not provide
2805  * the correct AF_UNIX socket semantics when one or more modules have
2806  * been pushed.
2807  */
2808 void
so_unix_close(struct sonode * so)2809 so_unix_close(struct sonode *so)
2810 {
2811 	struct T_opthdr	toh;
2812 	mblk_t		*mp;
2813 	sotpi_info_t	*sti = SOTOTPI(so);
2814 
2815 	ASSERT(MUTEX_HELD(&so->so_lock));
2816 
2817 	ASSERT(so->so_family == AF_UNIX);
2818 
2819 	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2820 	    (SS_ISCONNECTED|SS_ISBOUND))
2821 		return;
2822 
2823 	dprintso(so, 1, ("so_unix_close(%p) %s\n",
2824 	    (void *)so, pr_state(so->so_state, so->so_mode)));
2825 
2826 	toh.level = SOL_SOCKET;
2827 	toh.name = SO_UNIX_CLOSE;
2828 
2829 	/* zero length + header */
2830 	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2831 	toh.status = 0;
2832 
2833 	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2834 		struct T_optdata_req tdr;
2835 
2836 		tdr.PRIM_type = T_OPTDATA_REQ;
2837 		tdr.DATA_flag = 0;
2838 
2839 		tdr.OPT_length = (t_scalar_t)sizeof (toh);
2840 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2841 
2842 		/* NOTE: holding so_lock while sleeping */
2843 		mp = soallocproto2(&tdr, sizeof (tdr),
2844 		    &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2845 	} else {
2846 		struct T_unitdata_req	tudr;
2847 		void			*addr;
2848 		socklen_t		addrlen;
2849 		void			*src;
2850 		socklen_t		srclen;
2851 		struct T_opthdr		toh2;
2852 		t_scalar_t		size;
2853 
2854 		/*
2855 		 * We know this is an AF_UNIX connected DGRAM socket.
2856 		 * We therefore already have the destination address
2857 		 * in the internal form needed for this send.  This is
2858 		 * similar to the sosend_dgram call later in this file
2859 		 * when there's no user-specified destination address.
2860 		 */
2861 		if (sti->sti_faddr_noxlate) {
2862 			/*
2863 			 * Already have a transport internal address. Do not
2864 			 * pass any (transport internal) source address.
2865 			 */
2866 			addr = sti->sti_faddr_sa;
2867 			addrlen = (t_uscalar_t)sti->sti_faddr_len;
2868 			src = NULL;
2869 			srclen = 0;
2870 		} else {
2871 			/*
2872 			 * Pass the sockaddr_un source address as an option
2873 			 * and translate the remote address.
2874 			 * Holding so_lock thus sti_laddr_sa can not change.
2875 			 */
2876 			src = sti->sti_laddr_sa;
2877 			srclen = (socklen_t)sti->sti_laddr_len;
2878 			dprintso(so, 1,
2879 			    ("so_ux_close: srclen %d, src %p\n",
2880 			    srclen, src));
2881 			/*
2882 			 * Use the destination address saved in connect.
2883 			 */
2884 			addr = &sti->sti_ux_faddr;
2885 			addrlen = sizeof (sti->sti_ux_faddr);
2886 		}
2887 		tudr.PRIM_type = T_UNITDATA_REQ;
2888 		tudr.DEST_length = addrlen;
2889 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2890 		if (srclen == 0) {
2891 			tudr.OPT_length = (t_scalar_t)sizeof (toh);
2892 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2893 			    _TPI_ALIGN_TOPT(addrlen));
2894 
2895 			size = tudr.OPT_offset + tudr.OPT_length;
2896 			/* NOTE: holding so_lock while sleeping */
2897 			mp = soallocproto2(&tudr, sizeof (tudr),
2898 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
2899 			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2900 			soappendmsg(mp, &toh, sizeof (toh));
2901 		} else {
2902 			/*
2903 			 * There is a AF_UNIX sockaddr_un to include as a
2904 			 * source address option.
2905 			 */
2906 			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2907 			    _TPI_ALIGN_TOPT(srclen));
2908 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2909 			    _TPI_ALIGN_TOPT(addrlen));
2910 
2911 			toh2.level = SOL_SOCKET;
2912 			toh2.name = SO_SRCADDR;
2913 			toh2.len = (t_uscalar_t)(srclen +
2914 			    sizeof (struct T_opthdr));
2915 			toh2.status = 0;
2916 
2917 			size = tudr.OPT_offset + tudr.OPT_length;
2918 
2919 			/* NOTE: holding so_lock while sleeping */
2920 			mp = soallocproto2(&tudr, sizeof (tudr),
2921 			    addr, addrlen, size, _ALLOC_SLEEP, CRED());
2922 			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2923 			soappendmsg(mp, &toh, sizeof (toh));
2924 			soappendmsg(mp, &toh2, sizeof (toh2));
2925 			soappendmsg(mp, src, srclen);
2926 			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2927 		}
2928 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2929 	}
2930 	mutex_exit(&so->so_lock);
2931 	(void) kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2932 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2933 	mutex_enter(&so->so_lock);
2934 }
2935 
2936 /*
2937  * Called by sotpi_recvmsg when reading a non-zero amount of data.
2938  * In addition, the caller typically verifies that there is some
2939  * potential state to clear by checking
2940  *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2941  * before calling this routine.
2942  * Note that such a check can be made without holding so_lock since
2943  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2944  * decrements sti_oobsigcnt.
2945  *
2946  * When data is read *after* the point that all pending
2947  * oob data has been consumed the oob indication is cleared.
2948  *
2949  * This logic keeps select/poll returning POLLRDBAND and
2950  * SIOCATMARK returning true until we have read past
2951  * the mark.
2952  */
2953 static void
sorecv_update_oobstate(struct sonode * so)2954 sorecv_update_oobstate(struct sonode *so)
2955 {
2956 	sotpi_info_t *sti = SOTOTPI(so);
2957 
2958 	mutex_enter(&so->so_lock);
2959 	ASSERT(so_verify_oobstate(so));
2960 	dprintso(so, 1,
2961 	    ("sorecv_update_oobstate: counts %d/%d state %s\n",
2962 	    sti->sti_oobsigcnt,
2963 	    sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
2964 	if (sti->sti_oobsigcnt == 0) {
2965 		/* No more pending oob indications */
2966 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2967 		freemsg(so->so_oobmsg);
2968 		so->so_oobmsg = NULL;
2969 	}
2970 	ASSERT(so_verify_oobstate(so));
2971 	mutex_exit(&so->so_lock);
2972 }
2973 
2974 /*
2975  * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
2976  */
2977 static int
nl7c_sorecv(struct sonode * so,mblk_t ** rmp,uio_t * uiop,rval_t * rp)2978 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
2979 {
2980 	sotpi_info_t *sti = SOTOTPI(so);
2981 	int	error = 0;
2982 	mblk_t *tmp = NULL;
2983 	mblk_t *pmp = NULL;
2984 	mblk_t *nmp = sti->sti_nl7c_rcv_mp;
2985 
2986 	ASSERT(nmp != NULL);
2987 
2988 	while (nmp != NULL && uiop->uio_resid > 0) {
2989 		ssize_t n;
2990 
2991 		if (DB_TYPE(nmp) == M_DATA) {
2992 			/*
2993 			 * We have some data, uiomove up to resid bytes.
2994 			 */
2995 			n = MIN(MBLKL(nmp), uiop->uio_resid);
2996 			if (n > 0)
2997 				error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
2998 			nmp->b_rptr += n;
2999 			if (nmp->b_rptr == nmp->b_wptr) {
3000 				pmp = nmp;
3001 				nmp = nmp->b_cont;
3002 			}
3003 			if (error)
3004 				break;
3005 		} else {
3006 			/*
3007 			 * We only handle data, save for caller to handle.
3008 			 */
3009 			if (pmp != NULL) {
3010 				pmp->b_cont = nmp->b_cont;
3011 			}
3012 			nmp->b_cont = NULL;
3013 			if (*rmp == NULL) {
3014 				*rmp = nmp;
3015 			} else {
3016 				tmp->b_cont = nmp;
3017 			}
3018 			nmp = nmp->b_cont;
3019 			tmp = nmp;
3020 		}
3021 	}
3022 	if (pmp != NULL) {
3023 		/* Free any mblk_t(s) which we have consumed */
3024 		pmp->b_cont = NULL;
3025 		freemsg(sti->sti_nl7c_rcv_mp);
3026 	}
3027 	if ((sti->sti_nl7c_rcv_mp = nmp) == NULL) {
3028 		/* Last mblk_t so return the saved kstrgetmsg() rval/error */
3029 		if (error == 0) {
3030 			rval_t	*p = (rval_t *)&sti->sti_nl7c_rcv_rval;
3031 
3032 			error = p->r_v.r_v2;
3033 			p->r_v.r_v2 = 0;
3034 		}
3035 		rp->r_vals = sti->sti_nl7c_rcv_rval;
3036 		sti->sti_nl7c_rcv_rval = 0;
3037 	} else {
3038 		/* More mblk_t(s) to process so no rval to return */
3039 		rp->r_vals = 0;
3040 	}
3041 	return (error);
3042 }
3043 /*
3044  * Receive the next message on the queue.
3045  * If msg_controllen is non-zero when called the caller is interested in
3046  * any received control info (options).
3047  * If msg_namelen is non-zero when called the caller is interested in
3048  * any received source address.
3049  * The routine returns with msg_control and msg_name pointing to
3050  * kmem_alloc'ed memory which the caller has to free.
3051  */
3052 /* ARGSUSED */
3053 int
sotpi_recvmsg(struct sonode * so,struct nmsghdr * msg,struct uio * uiop,struct cred * cr)3054 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
3055     struct cred *cr)
3056 {
3057 	union T_primitives	*tpr;
3058 	mblk_t			*mp;
3059 	uchar_t			pri;
3060 	int			pflag, opflag;
3061 	void			*control;
3062 	t_uscalar_t		controllen;
3063 	t_uscalar_t		namelen;
3064 	int			so_state = so->so_state; /* Snapshot */
3065 	ssize_t			saved_resid;
3066 	rval_t			rval;
3067 	int			flags;
3068 	clock_t			timout;
3069 	int			error = 0;
3070 	sotpi_info_t		*sti = SOTOTPI(so);
3071 
3072 	flags = msg->msg_flags;
3073 	msg->msg_flags = 0;
3074 
3075 	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
3076 	    (void *)so, (void *)msg, flags,
3077 	    pr_state(so->so_state, so->so_mode), so->so_error));
3078 
3079 	if (so->so_version == SOV_STREAM) {
3080 		so_update_attrs(so, SOACC);
3081 		/* The imaginary "sockmod" has been popped - act as a stream */
3082 		return (strread(SOTOV(so), uiop, cr));
3083 	}
3084 
3085 	/*
3086 	 * If we are not connected because we have never been connected
3087 	 * we return ENOTCONN. If we have been connected (but are no longer
3088 	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
3089 	 * the EOF.
3090 	 *
3091 	 * An alternative would be to post an ENOTCONN error in stream head
3092 	 * (read+write) and clear it when we're connected. However, that error
3093 	 * would cause incorrect poll/select behavior!
3094 	 */
3095 	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
3096 	    (so->so_mode & SM_CONNREQUIRED)) {
3097 		return (ENOTCONN);
3098 	}
3099 
3100 	/*
3101 	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
3102 	 * after checking that the read queue is empty) and returns zero.
3103 	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
3104 	 * is zero.
3105 	 */
3106 
3107 	if (flags & MSG_OOB) {
3108 		/* Check that the transport supports OOB */
3109 		if (!(so->so_mode & SM_EXDATA))
3110 			return (EOPNOTSUPP);
3111 		so_update_attrs(so, SOACC);
3112 		return (sorecvoob(so, msg, uiop, flags,
3113 		    (so->so_options & SO_OOBINLINE)));
3114 	}
3115 
3116 	so_update_attrs(so, SOACC);
3117 
3118 	/*
3119 	 * Set msg_controllen and msg_namelen to zero here to make it
3120 	 * simpler in the cases that no control or name is returned.
3121 	 */
3122 	controllen = msg->msg_controllen;
3123 	namelen = msg->msg_namelen;
3124 	msg->msg_controllen = 0;
3125 	msg->msg_namelen = 0;
3126 
3127 	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
3128 	    namelen, controllen));
3129 
3130 	mutex_enter(&so->so_lock);
3131 	/*
3132 	 * If an NL7C enabled socket and not waiting for write data.
3133 	 */
3134 	if ((sti->sti_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
3135 	    NL7C_ENABLED) {
3136 		if (sti->sti_nl7c_uri) {
3137 			/* Close uri processing for a previous request */
3138 			nl7c_close(so);
3139 		}
3140 		if ((so_state & SS_CANTRCVMORE) &&
3141 		    sti->sti_nl7c_rcv_mp == NULL) {
3142 			/* Nothing to process, EOF */
3143 			mutex_exit(&so->so_lock);
3144 			return (0);
3145 		} else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
3146 			/* Persistent NL7C socket, try to process request */
3147 			boolean_t ret;
3148 
3149 			ret = nl7c_process(so,
3150 			    (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
3151 			rval.r_vals = sti->sti_nl7c_rcv_rval;
3152 			error = rval.r_v.r_v2;
3153 			if (error) {
3154 				/* Error of some sort, return it */
3155 				mutex_exit(&so->so_lock);
3156 				return (error);
3157 			}
3158 			if (sti->sti_nl7c_flags &&
3159 			    ! (sti->sti_nl7c_flags & NL7C_WAITWRITE)) {
3160 				/*
3161 				 * Still an NL7C socket and no data
3162 				 * to pass up to the caller.
3163 				 */
3164 				mutex_exit(&so->so_lock);
3165 				if (ret) {
3166 					/* EOF */
3167 					return (0);
3168 				} else {
3169 					/* Need more data */
3170 					return (EAGAIN);
3171 				}
3172 			}
3173 		} else {
3174 			/*
3175 			 * Not persistent so no further NL7C processing.
3176 			 */
3177 			sti->sti_nl7c_flags = 0;
3178 		}
3179 	}
3180 	/*
3181 	 * Only one reader is allowed at any given time. This is needed
3182 	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
3183 	 *
3184 	 * This is slightly different that BSD behavior in that it fails with
3185 	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3186 	 * is single-threaded using sblock(), which is dropped while waiting
3187 	 * for data to appear. The difference shows up e.g. if one
3188 	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3189 	 * does use nonblocking io and different threads are reading each
3190 	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3191 	 * in this case as long as the read queue doesn't get empty.
3192 	 * In this implementation the thread using nonblocking io can
3193 	 * get an EWOULDBLOCK error due to the blocking thread executing
3194 	 * e.g. in the uiomove in kstrgetmsg.
3195 	 * This difference is not believed to be significant.
3196 	 */
3197 	/* Set SOREADLOCKED */
3198 	error = so_lock_read_intr(so,
3199 	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3200 	mutex_exit(&so->so_lock);
3201 	if (error)
3202 		return (error);
3203 
3204 	/*
3205 	 * Tell kstrgetmsg to not inspect the stream head errors until all
3206 	 * queued data has been consumed.
3207 	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3208 	 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3209 	 *
3210 	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3211 	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3212 	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3213 	 */
3214 	pflag = MSG_ANY | MSG_DELAYERROR;
3215 	if (flags & MSG_PEEK) {
3216 		pflag |= MSG_IPEEK;
3217 		flags &= ~MSG_WAITALL;
3218 	}
3219 	if (so->so_mode & SM_ATOMIC)
3220 		pflag |= MSG_DISCARDTAIL;
3221 
3222 	if (flags & MSG_DONTWAIT)
3223 		timout = 0;
3224 	else if (so->so_rcvtimeo != 0)
3225 		timout = TICK_TO_MSEC(so->so_rcvtimeo);
3226 	else
3227 		timout = -1;
3228 	opflag = pflag;
3229 retry:
3230 	saved_resid = uiop->uio_resid;
3231 	pri = 0;
3232 	mp = NULL;
3233 	if (sti->sti_nl7c_rcv_mp != NULL) {
3234 		/* Already kstrgetmsg()ed saved mblk(s) from NL7C */
3235 		error = nl7c_sorecv(so, &mp, uiop, &rval);
3236 	} else {
3237 		error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3238 		    timout, &rval);
3239 	}
3240 	if (error != 0) {
3241 		/* kstrgetmsg returns ETIME when timeout expires */
3242 		if (error == ETIME)
3243 			error = EWOULDBLOCK;
3244 		goto out;
3245 	}
3246 	/*
3247 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3248 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
3249 	 */
3250 	ASSERT(!(rval.r_val1 & MORECTL));
3251 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3252 		msg->msg_flags |= MSG_TRUNC;
3253 
3254 	if (mp == NULL) {
3255 		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3256 		/*
3257 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3258 		 * The draft Posix socket spec states that the mark should
3259 		 * not be cleared when peeking. We follow the latter.
3260 		 */
3261 		if ((so->so_state &
3262 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3263 		    (uiop->uio_resid != saved_resid) &&
3264 		    !(flags & MSG_PEEK)) {
3265 			sorecv_update_oobstate(so);
3266 		}
3267 
3268 		mutex_enter(&so->so_lock);
3269 		/* Set MSG_EOR based on MOREDATA */
3270 		if (!(rval.r_val1 & MOREDATA)) {
3271 			if (so->so_state & SS_SAVEDEOR) {
3272 				msg->msg_flags |= MSG_EOR;
3273 				so->so_state &= ~SS_SAVEDEOR;
3274 			}
3275 		}
3276 		/*
3277 		 * If some data was received (i.e. not EOF) and the
3278 		 * read/recv* has not been satisfied wait for some more.
3279 		 */
3280 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3281 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3282 			mutex_exit(&so->so_lock);
3283 			pflag = opflag | MSG_NOMARK;
3284 			goto retry;
3285 		}
3286 		goto out_locked;
3287 	}
3288 
3289 	/* strsock_proto has already verified length and alignment */
3290 	tpr = (union T_primitives *)mp->b_rptr;
3291 	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3292 
3293 	switch (tpr->type) {
3294 	case T_DATA_IND: {
3295 		if ((so->so_state &
3296 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3297 		    (uiop->uio_resid != saved_resid) &&
3298 		    !(flags & MSG_PEEK)) {
3299 			sorecv_update_oobstate(so);
3300 		}
3301 
3302 		/*
3303 		 * Set msg_flags to MSG_EOR based on
3304 		 * MORE_flag and MOREDATA.
3305 		 */
3306 		mutex_enter(&so->so_lock);
3307 		so->so_state &= ~SS_SAVEDEOR;
3308 		if (!(tpr->data_ind.MORE_flag & 1)) {
3309 			if (!(rval.r_val1 & MOREDATA))
3310 				msg->msg_flags |= MSG_EOR;
3311 			else
3312 				so->so_state |= SS_SAVEDEOR;
3313 		}
3314 		freemsg(mp);
3315 		/*
3316 		 * If some data was received (i.e. not EOF) and the
3317 		 * read/recv* has not been satisfied wait for some more.
3318 		 */
3319 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3320 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3321 			mutex_exit(&so->so_lock);
3322 			pflag = opflag | MSG_NOMARK;
3323 			goto retry;
3324 		}
3325 		goto out_locked;
3326 	}
3327 	case T_UNITDATA_IND: {
3328 		void *addr;
3329 		t_uscalar_t addrlen;
3330 		void *abuf;
3331 		t_uscalar_t optlen;
3332 		void *opt;
3333 
3334 		if ((so->so_state &
3335 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3336 		    (uiop->uio_resid != saved_resid) &&
3337 		    !(flags & MSG_PEEK)) {
3338 			sorecv_update_oobstate(so);
3339 		}
3340 
3341 		if (namelen != 0) {
3342 			/* Caller wants source address */
3343 			addrlen = tpr->unitdata_ind.SRC_length;
3344 			addr = sogetoff(mp,
3345 			    tpr->unitdata_ind.SRC_offset,
3346 			    addrlen, 1);
3347 			if (addr == NULL) {
3348 				freemsg(mp);
3349 				error = EPROTO;
3350 				eprintsoline(so, error);
3351 				goto out;
3352 			}
3353 			if (so->so_family == AF_UNIX) {
3354 				/*
3355 				 * Can not use the transport level address.
3356 				 * If there is a SO_SRCADDR option carrying
3357 				 * the socket level address it will be
3358 				 * extracted below.
3359 				 */
3360 				addr = NULL;
3361 				addrlen = 0;
3362 			}
3363 		}
3364 		optlen = tpr->unitdata_ind.OPT_length;
3365 		if (optlen != 0) {
3366 			t_uscalar_t ncontrollen;
3367 
3368 			/*
3369 			 * Extract any source address option.
3370 			 * Determine how large cmsg buffer is needed.
3371 			 */
3372 			opt = sogetoff(mp,
3373 			    tpr->unitdata_ind.OPT_offset,
3374 			    optlen, __TPI_ALIGN_SIZE);
3375 
3376 			if (opt == NULL) {
3377 				freemsg(mp);
3378 				error = EPROTO;
3379 				eprintsoline(so, error);
3380 				goto out;
3381 			}
3382 			if (so->so_family == AF_UNIX)
3383 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3384 			ncontrollen = so_cmsglen(mp, opt, optlen,
3385 			    !(flags & MSG_XPG4_2));
3386 			if (controllen != 0)
3387 				controllen = ncontrollen;
3388 			else if (ncontrollen != 0)
3389 				msg->msg_flags |= MSG_CTRUNC;
3390 		} else {
3391 			controllen = 0;
3392 		}
3393 
3394 		if (namelen != 0) {
3395 			/*
3396 			 * Return address to caller.
3397 			 * Caller handles truncation if length
3398 			 * exceeds msg_namelen.
3399 			 * NOTE: AF_UNIX NUL termination is ensured by
3400 			 * the sender's copyin_name().
3401 			 */
3402 			abuf = kmem_alloc(addrlen, KM_SLEEP);
3403 
3404 			bcopy(addr, abuf, addrlen);
3405 			msg->msg_name = abuf;
3406 			msg->msg_namelen = addrlen;
3407 		}
3408 
3409 		if (controllen != 0) {
3410 			/*
3411 			 * Return control msg to caller.
3412 			 * Caller handles truncation if length
3413 			 * exceeds msg_controllen.
3414 			 */
3415 			control = kmem_zalloc(controllen, KM_SLEEP);
3416 
3417 			error = so_opt2cmsg(mp, opt, optlen,
3418 			    !(flags & MSG_XPG4_2),
3419 			    control, controllen);
3420 			if (error) {
3421 				freemsg(mp);
3422 				if (msg->msg_namelen != 0)
3423 					kmem_free(msg->msg_name,
3424 					    msg->msg_namelen);
3425 				kmem_free(control, controllen);
3426 				eprintsoline(so, error);
3427 				goto out;
3428 			}
3429 			msg->msg_control = control;
3430 			msg->msg_controllen = controllen;
3431 		}
3432 
3433 		freemsg(mp);
3434 		goto out;
3435 	}
3436 	case T_OPTDATA_IND: {
3437 		struct T_optdata_req *tdr;
3438 		void *opt;
3439 		t_uscalar_t optlen;
3440 
3441 		if ((so->so_state &
3442 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3443 		    (uiop->uio_resid != saved_resid) &&
3444 		    !(flags & MSG_PEEK)) {
3445 			sorecv_update_oobstate(so);
3446 		}
3447 
3448 		tdr = (struct T_optdata_req *)mp->b_rptr;
3449 		optlen = tdr->OPT_length;
3450 		if (optlen != 0) {
3451 			t_uscalar_t ncontrollen;
3452 			/*
3453 			 * Determine how large cmsg buffer is needed.
3454 			 */
3455 			opt = sogetoff(mp,
3456 			    tpr->optdata_ind.OPT_offset,
3457 			    optlen, __TPI_ALIGN_SIZE);
3458 
3459 			if (opt == NULL) {
3460 				freemsg(mp);
3461 				error = EPROTO;
3462 				eprintsoline(so, error);
3463 				goto out;
3464 			}
3465 
3466 			ncontrollen = so_cmsglen(mp, opt, optlen,
3467 			    !(flags & MSG_XPG4_2));
3468 			if (controllen != 0)
3469 				controllen = ncontrollen;
3470 			else if (ncontrollen != 0)
3471 				msg->msg_flags |= MSG_CTRUNC;
3472 		} else {
3473 			controllen = 0;
3474 		}
3475 
3476 		if (controllen != 0) {
3477 			/*
3478 			 * Return control msg to caller.
3479 			 * Caller handles truncation if length
3480 			 * exceeds msg_controllen.
3481 			 */
3482 			control = kmem_zalloc(controllen, KM_SLEEP);
3483 
3484 			error = so_opt2cmsg(mp, opt, optlen,
3485 			    !(flags & MSG_XPG4_2),
3486 			    control, controllen);
3487 			if (error) {
3488 				freemsg(mp);
3489 				kmem_free(control, controllen);
3490 				eprintsoline(so, error);
3491 				goto out;
3492 			}
3493 			msg->msg_control = control;
3494 			msg->msg_controllen = controllen;
3495 		}
3496 
3497 		/*
3498 		 * Set msg_flags to MSG_EOR based on
3499 		 * DATA_flag and MOREDATA.
3500 		 */
3501 		mutex_enter(&so->so_lock);
3502 		so->so_state &= ~SS_SAVEDEOR;
3503 		if (!(tpr->data_ind.MORE_flag & 1)) {
3504 			if (!(rval.r_val1 & MOREDATA))
3505 				msg->msg_flags |= MSG_EOR;
3506 			else
3507 				so->so_state |= SS_SAVEDEOR;
3508 		}
3509 		freemsg(mp);
3510 		/*
3511 		 * If some data was received (i.e. not EOF) and the
3512 		 * read/recv* has not been satisfied wait for some more.
3513 		 * Not possible to wait if control info was received.
3514 		 */
3515 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3516 		    controllen == 0 &&
3517 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3518 			mutex_exit(&so->so_lock);
3519 			pflag = opflag | MSG_NOMARK;
3520 			goto retry;
3521 		}
3522 		goto out_locked;
3523 	}
3524 	case T_EXDATA_IND: {
3525 		dprintso(so, 1,
3526 		    ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3527 		    "state %s\n",
3528 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3529 		    saved_resid - uiop->uio_resid,
3530 		    pr_state(so->so_state, so->so_mode)));
3531 		/*
3532 		 * kstrgetmsg handles MSGMARK so there is nothing to
3533 		 * inspect in the T_EXDATA_IND.
3534 		 * strsock_proto makes the stream head queue the T_EXDATA_IND
3535 		 * as a separate message with no M_DATA component. Furthermore,
3536 		 * the stream head does not consolidate M_DATA messages onto
3537 		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3538 		 * remains a message by itself. This is needed since MSGMARK
3539 		 * marks both the whole message as well as the last byte
3540 		 * of the message.
3541 		 */
3542 		freemsg(mp);
3543 		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
3544 		if (flags & MSG_PEEK) {
3545 			/*
3546 			 * Even though we are peeking we consume the
3547 			 * T_EXDATA_IND thereby moving the mark information
3548 			 * to SS_RCVATMARK. Then the oob code below will
3549 			 * retry the peeking kstrgetmsg.
3550 			 * Note that the stream head read queue is
3551 			 * never flushed without holding SOREADLOCKED
3552 			 * thus the T_EXDATA_IND can not disappear
3553 			 * underneath us.
3554 			 */
3555 			dprintso(so, 1,
3556 			    ("sotpi_recvmsg: consume EXDATA_IND "
3557 			    "counts %d/%d state %s\n",
3558 			    sti->sti_oobsigcnt,
3559 			    sti->sti_oobcnt,
3560 			    pr_state(so->so_state, so->so_mode)));
3561 
3562 			pflag = MSG_ANY | MSG_DELAYERROR;
3563 			if (so->so_mode & SM_ATOMIC)
3564 				pflag |= MSG_DISCARDTAIL;
3565 
3566 			pri = 0;
3567 			mp = NULL;
3568 
3569 			error = kstrgetmsg(SOTOV(so), &mp, uiop,
3570 			    &pri, &pflag, (clock_t)-1, &rval);
3571 			ASSERT(uiop->uio_resid == saved_resid);
3572 
3573 			if (error) {
3574 #ifdef SOCK_DEBUG
3575 				if (error != EWOULDBLOCK && error != EINTR) {
3576 					eprintsoline(so, error);
3577 				}
3578 #endif /* SOCK_DEBUG */
3579 				goto out;
3580 			}
3581 			ASSERT(mp);
3582 			tpr = (union T_primitives *)mp->b_rptr;
3583 			ASSERT(tpr->type == T_EXDATA_IND);
3584 			freemsg(mp);
3585 		} /* end "if (flags & MSG_PEEK)" */
3586 
3587 		/*
3588 		 * Decrement the number of queued and pending oob.
3589 		 *
3590 		 * SS_RCVATMARK is cleared when we read past a mark.
3591 		 * SS_HAVEOOBDATA is cleared when we've read past the
3592 		 * last mark.
3593 		 * SS_OOBPEND is cleared if we've read past the last
3594 		 * mark and no (new) SIGURG has been posted.
3595 		 */
3596 		mutex_enter(&so->so_lock);
3597 		ASSERT(so_verify_oobstate(so));
3598 		ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3599 		ASSERT(sti->sti_oobsigcnt > 0);
3600 		sti->sti_oobsigcnt--;
3601 		ASSERT(sti->sti_oobcnt > 0);
3602 		sti->sti_oobcnt--;
3603 		/*
3604 		 * Since the T_EXDATA_IND has been removed from the stream
3605 		 * head, but we have not read data past the mark,
3606 		 * sockfs needs to track that the socket is still at the mark.
3607 		 *
3608 		 * Since no data was received call kstrgetmsg again to wait
3609 		 * for data.
3610 		 */
3611 		so->so_state |= SS_RCVATMARK;
3612 		mutex_exit(&so->so_lock);
3613 		dprintso(so, 1,
3614 		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3615 		    sti->sti_oobsigcnt, sti->sti_oobcnt,
3616 		    pr_state(so->so_state, so->so_mode)));
3617 		pflag = opflag;
3618 		goto retry;
3619 	}
3620 	default:
3621 		cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3622 		    (void *)so, tpr->type, (void *)mp);
3623 		ASSERT(0);
3624 		freemsg(mp);
3625 		error = EPROTO;
3626 		eprintsoline(so, error);
3627 		goto out;
3628 	}
3629 	/* NOTREACHED */
3630 out:
3631 	mutex_enter(&so->so_lock);
3632 out_locked:
3633 	so_unlock_read(so);	/* Clear SOREADLOCKED */
3634 	mutex_exit(&so->so_lock);
3635 	return (error);
3636 }
3637 
3638 /*
3639  * Sending data with options on a datagram socket.
3640  * Assumes caller has verified that SS_ISBOUND etc. are set.
3641  *
3642  * For AF_UNIX the destination address may be already in
3643  * internal form, as indicated by sti->sti_faddr_noxlate
3644  * or the MSG_SENDTO_NOXLATE flag.  Otherwise we need to
3645  * translate the destination address to internal form.
3646  *
3647  * The source address is passed as an option.  If passing
3648  * file descriptors, those are passed as file pointers in
3649  * another option.
3650  */
3651 static int
sosend_dgramcmsg(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,void * control,t_uscalar_t controllen,int flags)3652 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3653     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3654 {
3655 	struct T_unitdata_req	tudr;
3656 	mblk_t			*mp;
3657 	int			error;
3658 	void			*addr;
3659 	socklen_t		addrlen;
3660 	void			*src;
3661 	socklen_t		srclen;
3662 	ssize_t			len;
3663 	int			size;
3664 	struct T_opthdr		toh;
3665 	struct fdbuf		*fdbuf;
3666 	t_uscalar_t		optlen;
3667 	void			*fds;
3668 	int			fdlen;
3669 	sotpi_info_t		*sti = SOTOTPI(so);
3670 
3671 	ASSERT(name && namelen);
3672 	ASSERT(control && controllen);
3673 
3674 	len = uiop->uio_resid;
3675 	if (len > (ssize_t)sti->sti_tidu_size) {
3676 		return (EMSGSIZE);
3677 	}
3678 
3679 	if (sti->sti_faddr_noxlate == 0 &&
3680 	    (flags & MSG_SENDTO_NOXLATE) == 0) {
3681 		/*
3682 		 * Length and family checks.
3683 		 * Don't verify internal form.
3684 		 */
3685 		error = so_addr_verify(so, name, namelen);
3686 		if (error) {
3687 			eprintsoline(so, error);
3688 			return (error);
3689 		}
3690 	}
3691 
3692 	if (so->so_family == AF_UNIX) {
3693 		if (sti->sti_faddr_noxlate) {
3694 			/*
3695 			 * Already have a transport internal address. Do not
3696 			 * pass any (transport internal) source address.
3697 			 */
3698 			addr = name;
3699 			addrlen = namelen;
3700 			src = NULL;
3701 			srclen = 0;
3702 		} else if (flags & MSG_SENDTO_NOXLATE) {
3703 			/*
3704 			 * Have an internal form dest. address.
3705 			 * Pass the source address as usual.
3706 			 */
3707 			addr = name;
3708 			addrlen = namelen;
3709 			src = sti->sti_laddr_sa;
3710 			srclen = (socklen_t)sti->sti_laddr_len;
3711 		} else {
3712 			/*
3713 			 * Pass the sockaddr_un source address as an option
3714 			 * and translate the remote address.
3715 			 *
3716 			 * Note that this code does not prevent sti_laddr_sa
3717 			 * from changing while it is being used. Thus
3718 			 * if an unbind+bind occurs concurrently with this
3719 			 * send the peer might see a partially new and a
3720 			 * partially old "from" address.
3721 			 */
3722 			src = sti->sti_laddr_sa;
3723 			srclen = (socklen_t)sti->sti_laddr_len;
3724 			dprintso(so, 1,
3725 			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3726 			    srclen, src));
3727 			/*
3728 			 * The sendmsg caller specified a destination
3729 			 * address, which we must translate into our
3730 			 * internal form.  addr = &sti->sti_ux_taddr
3731 			 */
3732 			error = so_ux_addr_xlate(so, name, namelen,
3733 			    (flags & MSG_XPG4_2),
3734 			    &addr, &addrlen);
3735 			if (error) {
3736 				eprintsoline(so, error);
3737 				return (error);
3738 			}
3739 		}
3740 	} else {
3741 		addr = name;
3742 		addrlen = namelen;
3743 		src = NULL;
3744 		srclen = 0;
3745 	}
3746 	optlen = so_optlen(control, controllen,
3747 	    !(flags & MSG_XPG4_2));
3748 	tudr.PRIM_type = T_UNITDATA_REQ;
3749 	tudr.DEST_length = addrlen;
3750 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3751 	if (srclen != 0)
3752 		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3753 		    _TPI_ALIGN_TOPT(srclen));
3754 	else
3755 		tudr.OPT_length = optlen;
3756 	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3757 	    _TPI_ALIGN_TOPT(addrlen));
3758 
3759 	size = tudr.OPT_offset + tudr.OPT_length;
3760 
3761 	/*
3762 	 * File descriptors only when SM_FDPASSING set.
3763 	 */
3764 	error = so_getfdopt(control, controllen,
3765 	    !(flags & MSG_XPG4_2), &fds, &fdlen);
3766 	if (error)
3767 		return (error);
3768 	if (fdlen != -1) {
3769 		if (!(so->so_mode & SM_FDPASSING))
3770 			return (EOPNOTSUPP);
3771 
3772 		error = fdbuf_create(fds, fdlen, &fdbuf);
3773 		if (error)
3774 			return (error);
3775 		mp = fdbuf_allocmsg(size, fdbuf);
3776 	} else {
3777 		mp = soallocproto(size, _ALLOC_INTR, CRED());
3778 		if (mp == NULL) {
3779 			/*
3780 			 * Caught a signal waiting for memory.
3781 			 * Let send* return EINTR.
3782 			 */
3783 			return (EINTR);
3784 		}
3785 	}
3786 	soappendmsg(mp, &tudr, sizeof (tudr));
3787 	soappendmsg(mp, addr, addrlen);
3788 	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3789 
3790 	if (fdlen != -1) {
3791 		ASSERT(fdbuf != NULL);
3792 		toh.level = SOL_SOCKET;
3793 		toh.name = SO_FILEP;
3794 		toh.len = fdbuf->fd_size +
3795 		    (t_uscalar_t)sizeof (struct T_opthdr);
3796 		toh.status = 0;
3797 		soappendmsg(mp, &toh, sizeof (toh));
3798 		soappendmsg(mp, fdbuf, fdbuf->fd_size);
3799 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3800 	}
3801 	if (srclen != 0) {
3802 		/*
3803 		 * There is a AF_UNIX sockaddr_un to include as a source
3804 		 * address option.
3805 		 */
3806 		toh.level = SOL_SOCKET;
3807 		toh.name = SO_SRCADDR;
3808 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3809 		toh.status = 0;
3810 		soappendmsg(mp, &toh, sizeof (toh));
3811 		soappendmsg(mp, src, srclen);
3812 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3813 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3814 	}
3815 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3816 	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3817 	/* At most 3 bytes left in the message */
3818 	ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3819 	ASSERT(MBLKL(mp) <= (ssize_t)size);
3820 
3821 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3822 	if (AU_AUDITING())
3823 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3824 
3825 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3826 #ifdef SOCK_DEBUG
3827 	if (error) {
3828 		eprintsoline(so, error);
3829 	}
3830 #endif /* SOCK_DEBUG */
3831 	return (error);
3832 }
3833 
3834 /*
3835  * Sending data with options on a connected stream socket.
3836  * Assumes caller has verified that SS_ISCONNECTED is set.
3837  */
3838 static int
sosend_svccmsg(struct sonode * so,struct uio * uiop,int more,void * control,t_uscalar_t controllen,int flags)3839 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3840     t_uscalar_t controllen, int flags)
3841 {
3842 	struct T_optdata_req	tdr;
3843 	mblk_t			*mp;
3844 	int			error;
3845 	ssize_t			iosize;
3846 	int			size;
3847 	struct fdbuf		*fdbuf;
3848 	t_uscalar_t		optlen;
3849 	void			*fds;
3850 	int			fdlen;
3851 	struct T_opthdr		toh;
3852 	sotpi_info_t		*sti = SOTOTPI(so);
3853 
3854 	dprintso(so, 1,
3855 	    ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3856 
3857 	/*
3858 	 * Has to be bound and connected. However, since no locks are
3859 	 * held the state could have changed after sotpi_sendmsg checked it
3860 	 * thus it is not possible to ASSERT on the state.
3861 	 */
3862 
3863 	/* Options on connection-oriented only when SM_OPTDATA set. */
3864 	if (!(so->so_mode & SM_OPTDATA))
3865 		return (EOPNOTSUPP);
3866 
3867 	do {
3868 		/*
3869 		 * Set the MORE flag if uio_resid does not fit in this
3870 		 * message or if the caller passed in "more".
3871 		 * Error for transports with zero tidu_size.
3872 		 */
3873 		tdr.PRIM_type = T_OPTDATA_REQ;
3874 		iosize = sti->sti_tidu_size;
3875 		if (iosize <= 0)
3876 			return (EMSGSIZE);
3877 		if (uiop->uio_resid > iosize) {
3878 			tdr.DATA_flag = 1;
3879 		} else {
3880 			if (more)
3881 				tdr.DATA_flag = 1;
3882 			else
3883 				tdr.DATA_flag = 0;
3884 			iosize = uiop->uio_resid;
3885 		}
3886 		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3887 		    tdr.DATA_flag, iosize));
3888 
3889 		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3890 		tdr.OPT_length = optlen;
3891 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3892 
3893 		size = (int)sizeof (tdr) + optlen;
3894 		/*
3895 		 * File descriptors only when SM_FDPASSING set.
3896 		 */
3897 		error = so_getfdopt(control, controllen,
3898 		    !(flags & MSG_XPG4_2), &fds, &fdlen);
3899 		if (error)
3900 			return (error);
3901 		if (fdlen != -1) {
3902 			if (!(so->so_mode & SM_FDPASSING))
3903 				return (EOPNOTSUPP);
3904 
3905 			error = fdbuf_create(fds, fdlen, &fdbuf);
3906 			if (error)
3907 				return (error);
3908 			mp = fdbuf_allocmsg(size, fdbuf);
3909 		} else {
3910 			mp = soallocproto(size, _ALLOC_INTR, CRED());
3911 			if (mp == NULL) {
3912 				/*
3913 				 * Caught a signal waiting for memory.
3914 				 * Let send* return EINTR.
3915 				 */
3916 				return (EINTR);
3917 			}
3918 		}
3919 		soappendmsg(mp, &tdr, sizeof (tdr));
3920 
3921 		if (fdlen != -1) {
3922 			ASSERT(fdbuf != NULL);
3923 			toh.level = SOL_SOCKET;
3924 			toh.name = SO_FILEP;
3925 			toh.len = fdbuf->fd_size +
3926 			    (t_uscalar_t)sizeof (struct T_opthdr);
3927 			toh.status = 0;
3928 			soappendmsg(mp, &toh, sizeof (toh));
3929 			soappendmsg(mp, fdbuf, fdbuf->fd_size);
3930 			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3931 		}
3932 		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3933 		/* At most 3 bytes left in the message */
3934 		ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3935 		ASSERT(MBLKL(mp) <= (ssize_t)size);
3936 
3937 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3938 
3939 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3940 		    0, MSG_BAND, 0);
3941 		if (error) {
3942 			eprintsoline(so, error);
3943 			return (error);
3944 		}
3945 		control = NULL;
3946 		if (uiop->uio_resid > 0) {
3947 			/*
3948 			 * Recheck for fatal errors. Fail write even though
3949 			 * some data have been written. This is consistent
3950 			 * with strwrite semantics and BSD sockets semantics.
3951 			 */
3952 			if (so->so_state & SS_CANTSENDMORE) {
3953 				eprintsoline(so, error);
3954 				return (EPIPE);
3955 			}
3956 			if (so->so_error != 0) {
3957 				mutex_enter(&so->so_lock);
3958 				error = sogeterr(so, B_TRUE);
3959 				mutex_exit(&so->so_lock);
3960 				if (error != 0) {
3961 					eprintsoline(so, error);
3962 					return (error);
3963 				}
3964 			}
3965 		}
3966 	} while (uiop->uio_resid > 0);
3967 	return (0);
3968 }
3969 
3970 /*
3971  * Sending data on a datagram socket.
3972  * Assumes caller has verified that SS_ISBOUND etc. are set.
3973  *
3974  * For AF_UNIX the destination address may be already in
3975  * internal form, as indicated by sti->sti_faddr_noxlate
3976  * or the MSG_SENDTO_NOXLATE flag.  Otherwise we need to
3977  * translate the destination address to internal form.
3978  *
3979  * The source address is passed as an option.
3980  */
3981 int
sosend_dgram(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,int flags)3982 sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
3983     struct uio *uiop, int flags)
3984 {
3985 	struct T_unitdata_req	tudr;
3986 	mblk_t			*mp;
3987 	int			error;
3988 	void			*addr;
3989 	socklen_t		addrlen;
3990 	void			*src;
3991 	socklen_t		srclen;
3992 	ssize_t			len;
3993 	sotpi_info_t		*sti = SOTOTPI(so);
3994 
3995 	ASSERT(name != NULL && namelen != 0);
3996 
3997 	len = uiop->uio_resid;
3998 	if (len > sti->sti_tidu_size) {
3999 		error = EMSGSIZE;
4000 		goto done;
4001 	}
4002 
4003 	if (sti->sti_faddr_noxlate == 0 &&
4004 	    (flags & MSG_SENDTO_NOXLATE) == 0) {
4005 		/*
4006 		 * Length and family checks.
4007 		 * Don't verify internal form.
4008 		 */
4009 		error = so_addr_verify(so, name, namelen);
4010 		if (error != 0)
4011 			goto done;
4012 	}
4013 
4014 	if (sti->sti_direct)	/* Never on AF_UNIX */
4015 		return (sodgram_direct(so, name, namelen, uiop, flags));
4016 
4017 	if (so->so_family == AF_UNIX) {
4018 		if (sti->sti_faddr_noxlate) {
4019 			/*
4020 			 * Already have a transport internal address. Do not
4021 			 * pass any (transport internal) source address.
4022 			 */
4023 			addr = name;
4024 			addrlen = namelen;
4025 			src = NULL;
4026 			srclen = 0;
4027 		} else if (flags & MSG_SENDTO_NOXLATE) {
4028 			/*
4029 			 * Have an internal form dest. address.
4030 			 * Pass the source address as usual.
4031 			 */
4032 			addr = name;
4033 			addrlen = namelen;
4034 			src = sti->sti_laddr_sa;
4035 			srclen = (socklen_t)sti->sti_laddr_len;
4036 		} else {
4037 			/*
4038 			 * Pass the sockaddr_un source address as an option
4039 			 * and translate the remote address.
4040 			 *
4041 			 * Note that this code does not prevent sti_laddr_sa
4042 			 * from changing while it is being used. Thus
4043 			 * if an unbind+bind occurs concurrently with this
4044 			 * send the peer might see a partially new and a
4045 			 * partially old "from" address.
4046 			 */
4047 			src = sti->sti_laddr_sa;
4048 			srclen = (socklen_t)sti->sti_laddr_len;
4049 			dprintso(so, 1,
4050 			    ("sosend_dgram UNIX: srclen %d, src %p\n",
4051 			    srclen, src));
4052 			/*
4053 			 * The sendmsg caller specified a destination
4054 			 * address, which we must translate into our
4055 			 * internal form.  addr = &sti->sti_ux_taddr
4056 			 */
4057 			error = so_ux_addr_xlate(so, name, namelen,
4058 			    (flags & MSG_XPG4_2),
4059 			    &addr, &addrlen);
4060 			if (error) {
4061 				eprintsoline(so, error);
4062 				goto done;
4063 			}
4064 		}
4065 	} else {
4066 		addr = name;
4067 		addrlen = namelen;
4068 		src = NULL;
4069 		srclen = 0;
4070 	}
4071 	tudr.PRIM_type = T_UNITDATA_REQ;
4072 	tudr.DEST_length = addrlen;
4073 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4074 	if (srclen == 0) {
4075 		tudr.OPT_length = 0;
4076 		tudr.OPT_offset = 0;
4077 
4078 		mp = soallocproto2(&tudr, sizeof (tudr),
4079 		    addr, addrlen, 0, _ALLOC_INTR, CRED());
4080 		if (mp == NULL) {
4081 			/*
4082 			 * Caught a signal waiting for memory.
4083 			 * Let send* return EINTR.
4084 			 */
4085 			error = EINTR;
4086 			goto done;
4087 		}
4088 	} else {
4089 		/*
4090 		 * There is a AF_UNIX sockaddr_un to include as a source
4091 		 * address option.
4092 		 */
4093 		struct T_opthdr toh;
4094 		ssize_t size;
4095 
4096 		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
4097 		    _TPI_ALIGN_TOPT(srclen));
4098 		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
4099 		    _TPI_ALIGN_TOPT(addrlen));
4100 
4101 		toh.level = SOL_SOCKET;
4102 		toh.name = SO_SRCADDR;
4103 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
4104 		toh.status = 0;
4105 
4106 		size = tudr.OPT_offset + tudr.OPT_length;
4107 		mp = soallocproto2(&tudr, sizeof (tudr),
4108 		    addr, addrlen, size, _ALLOC_INTR, CRED());
4109 		if (mp == NULL) {
4110 			/*
4111 			 * Caught a signal waiting for memory.
4112 			 * Let send* return EINTR.
4113 			 */
4114 			error = EINTR;
4115 			goto done;
4116 		}
4117 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
4118 		soappendmsg(mp, &toh, sizeof (toh));
4119 		soappendmsg(mp, src, srclen);
4120 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
4121 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
4122 	}
4123 
4124 	if (AU_AUDITING())
4125 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4126 
4127 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4128 done:
4129 #ifdef SOCK_DEBUG
4130 	if (error) {
4131 		eprintsoline(so, error);
4132 	}
4133 #endif /* SOCK_DEBUG */
4134 	return (error);
4135 }
4136 
4137 /*
4138  * Sending data on a connected stream socket.
4139  * Assumes caller has verified that SS_ISCONNECTED is set.
4140  */
4141 int
sosend_svc(struct sonode * so,struct uio * uiop,t_scalar_t prim,int more,int sflag)4142 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
4143     int sflag)
4144 {
4145 	struct T_data_req	tdr;
4146 	mblk_t			*mp;
4147 	int			error;
4148 	ssize_t			iosize;
4149 	sotpi_info_t		*sti = SOTOTPI(so);
4150 
4151 	dprintso(so, 1,
4152 	    ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
4153 	    (void *)so, uiop->uio_resid, prim, sflag));
4154 
4155 	/*
4156 	 * Has to be bound and connected. However, since no locks are
4157 	 * held the state could have changed after sotpi_sendmsg checked it
4158 	 * thus it is not possible to ASSERT on the state.
4159 	 */
4160 
4161 	do {
4162 		/*
4163 		 * Set the MORE flag if uio_resid does not fit in this
4164 		 * message or if the caller passed in "more".
4165 		 * Error for transports with zero tidu_size.
4166 		 */
4167 		tdr.PRIM_type = prim;
4168 		iosize = sti->sti_tidu_size;
4169 		if (iosize <= 0)
4170 			return (EMSGSIZE);
4171 		if (uiop->uio_resid > iosize) {
4172 			tdr.MORE_flag = 1;
4173 		} else {
4174 			if (more)
4175 				tdr.MORE_flag = 1;
4176 			else
4177 				tdr.MORE_flag = 0;
4178 			iosize = uiop->uio_resid;
4179 		}
4180 		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4181 		    prim, tdr.MORE_flag, iosize));
4182 		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
4183 		if (mp == NULL) {
4184 			/*
4185 			 * Caught a signal waiting for memory.
4186 			 * Let send* return EINTR.
4187 			 */
4188 			return (EINTR);
4189 		}
4190 
4191 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4192 		    0, sflag | MSG_BAND, 0);
4193 		if (error) {
4194 			eprintsoline(so, error);
4195 			return (error);
4196 		}
4197 		if (uiop->uio_resid > 0) {
4198 			/*
4199 			 * Recheck for fatal errors. Fail write even though
4200 			 * some data have been written. This is consistent
4201 			 * with strwrite semantics and BSD sockets semantics.
4202 			 */
4203 			if (so->so_state & SS_CANTSENDMORE) {
4204 				eprintsoline(so, error);
4205 				return (EPIPE);
4206 			}
4207 			if (so->so_error != 0) {
4208 				mutex_enter(&so->so_lock);
4209 				error = sogeterr(so, B_TRUE);
4210 				mutex_exit(&so->so_lock);
4211 				if (error != 0) {
4212 					eprintsoline(so, error);
4213 					return (error);
4214 				}
4215 			}
4216 		}
4217 	} while (uiop->uio_resid > 0);
4218 	return (0);
4219 }
4220 
4221 /*
4222  * Check the state for errors and call the appropriate send function.
4223  *
4224  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4225  * this function issues a setsockopt to toggle SO_DONTROUTE before and
4226  * after sending the message.
4227  *
4228  * The caller may optionally specify a destination address, for either
4229  * stream or datagram sockets.  This table summarizes the cases:
4230  *
4231  *    Socket type    Dest. given    Connected    Result
4232  *    -----------    -----------    ---------    --------------
4233  *    Stream         *              Yes	         send to conn. addr.
4234  *    Stream         *              No           error ENOTCONN
4235  *    Dgram          yes            *            send to given addr.
4236  *    Dgram          no             yes          send to conn. addr.
4237  *    Dgram          no             no	         error EDESTADDRREQ
4238  *
4239  * There are subtleties around the destination address when using
4240  * AF_UNIX datagram sockets.  When the sendmsg call specifies the
4241  * destination address, it's in (struct sockaddr_un) form and we
4242  * need to translate it to our internal form (struct so_ux_addr).
4243  *
4244  * When the sendmsg call does not specify a destination address
4245  * we're using the peer address saved during sotpi_connect, and
4246  * that address is already in internal form.  In this case, the
4247  * (internal only) flag MSG_SENDTO_NOXLATE is set in the flags
4248  * passed to sosend_dgram or sosend_dgramcmsg to indicate that
4249  * those functions should skip translation to internal form.
4250  * Avoiding that translation is not only more efficient, but it's
4251  * also necessary when a process does a connect on an AF_UNIX
4252  * datagram socket and then drops privileges.  After the process
4253  * has dropped privileges, it may no longer be able to lookup the
4254  * the external name in the filesystem, but it should still be
4255  * able to send messages on the connected socket by leaving the
4256  * destination name unspecified.
4257  *
4258  * Yet more subtleties arise with sockets connected by socketpair(),
4259  * which puts internal form addresses in the fields where normally
4260  * the external form is found, and sets sti_faddr_noxlate=1, which
4261  * (like flag MSG_SENDTO_NOXLATE) causes the sosend_dgram functions
4262  * to skip translation of destination addresses to internal form.
4263  * However, beware that the flag sti_faddr_noxlate=1 also triggers
4264  * different behaviour almost everywhere AF_UNIX addresses appear.
4265  */
4266 static int
sotpi_sendmsg(struct sonode * so,struct nmsghdr * msg,struct uio * uiop,struct cred * cr)4267 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
4268     struct cred *cr)
4269 {
4270 	int		so_state;
4271 	int		so_mode;
4272 	int		error;
4273 	struct sockaddr *name;
4274 	t_uscalar_t	namelen;
4275 	int		dontroute;
4276 	int		flags;
4277 	sotpi_info_t	*sti = SOTOTPI(so);
4278 
4279 	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4280 	    (void *)so, (void *)msg, msg->msg_flags,
4281 	    pr_state(so->so_state, so->so_mode), so->so_error));
4282 
4283 	if (so->so_version == SOV_STREAM) {
4284 		/* The imaginary "sockmod" has been popped - act as a stream */
4285 		so_update_attrs(so, SOMOD);
4286 		return (strwrite(SOTOV(so), uiop, cr));
4287 	}
4288 
4289 	mutex_enter(&so->so_lock);
4290 	so_state = so->so_state;
4291 
4292 	if (so_state & SS_CANTSENDMORE) {
4293 		mutex_exit(&so->so_lock);
4294 		return (EPIPE);
4295 	}
4296 
4297 	if (so->so_error != 0) {
4298 		error = sogeterr(so, B_TRUE);
4299 		if (error != 0) {
4300 			mutex_exit(&so->so_lock);
4301 			return (error);
4302 		}
4303 	}
4304 
4305 	name = (struct sockaddr *)msg->msg_name;
4306 	namelen = msg->msg_namelen;
4307 	flags = msg->msg_flags;
4308 
4309 	/*
4310 	 * Historically, this function does not validate the flags
4311 	 * passed in, and any errant bits are ignored.  However,
4312 	 * we would not want any such errant flag bits accidently
4313 	 * being treated as one of the internal-only flags, so
4314 	 * clear the internal-only flag bits.
4315 	 */
4316 	flags &= ~MSG_SENDTO_NOXLATE;
4317 
4318 	so_mode = so->so_mode;
4319 
4320 	if (name == NULL) {
4321 		if (!(so_state & SS_ISCONNECTED)) {
4322 			mutex_exit(&so->so_lock);
4323 			if (so_mode & SM_CONNREQUIRED)
4324 				return (ENOTCONN);
4325 			else
4326 				return (EDESTADDRREQ);
4327 		}
4328 		/*
4329 		 * This is a connected socket.
4330 		 */
4331 		if (so_mode & SM_CONNREQUIRED) {
4332 			/*
4333 			 * This is a connected STREAM socket,
4334 			 * destination not specified.
4335 			 */
4336 			name = NULL;
4337 			namelen = 0;
4338 		} else {
4339 			/*
4340 			 * Datagram send on connected socket with
4341 			 * the destination name not specified.
4342 			 * Use the peer address from connect.
4343 			 */
4344 			if (so->so_family == AF_UNIX) {
4345 				/*
4346 				 * Use the (internal form) address saved
4347 				 * in sotpi_connect.  See above.
4348 				 */
4349 				name = (void *)&sti->sti_ux_faddr;
4350 				namelen = sizeof (sti->sti_ux_faddr);
4351 				flags |= MSG_SENDTO_NOXLATE;
4352 			} else {
4353 				ASSERT(sti->sti_faddr_sa);
4354 				name = sti->sti_faddr_sa;
4355 				namelen = (t_uscalar_t)sti->sti_faddr_len;
4356 			}
4357 		}
4358 	} else {
4359 		/*
4360 		 * Sendmsg specifies a destination name
4361 		 */
4362 		if (!(so_state & SS_ISCONNECTED) &&
4363 		    (so_mode & SM_CONNREQUIRED)) {
4364 			/* i.e. TCP not connected */
4365 			mutex_exit(&so->so_lock);
4366 			return (ENOTCONN);
4367 		}
4368 		/*
4369 		 * Ignore the address on connection-oriented sockets.
4370 		 * Just like BSD this code does not generate an error for
4371 		 * TCP (a CONNREQUIRED socket) when sending to an address
4372 		 * passed in with sendto/sendmsg. Instead the data is
4373 		 * delivered on the connection as if no address had been
4374 		 * supplied.
4375 		 */
4376 		if ((so_state & SS_ISCONNECTED) &&
4377 		    !(so_mode & SM_CONNREQUIRED)) {
4378 			mutex_exit(&so->so_lock);
4379 			return (EISCONN);
4380 		}
4381 		if (!(so_state & SS_ISBOUND)) {
4382 			so_lock_single(so);	/* Set SOLOCKED */
4383 			error = sotpi_bind(so, NULL, 0,
4384 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4385 			so_unlock_single(so, SOLOCKED);
4386 			if (error) {
4387 				mutex_exit(&so->so_lock);
4388 				eprintsoline(so, error);
4389 				return (error);
4390 			}
4391 		}
4392 		/*
4393 		 * Handle delayed datagram errors. These are only queued
4394 		 * when the application sets SO_DGRAM_ERRIND.
4395 		 * Return the error if we are sending to the address
4396 		 * that was returned in the last T_UDERROR_IND.
4397 		 * If sending to some other address discard the delayed
4398 		 * error indication.
4399 		 */
4400 		if (sti->sti_delayed_error) {
4401 			struct T_uderror_ind	*tudi;
4402 			void			*addr;
4403 			t_uscalar_t		addrlen;
4404 			boolean_t		match = B_FALSE;
4405 
4406 			ASSERT(sti->sti_eaddr_mp);
4407 			error = sti->sti_delayed_error;
4408 			sti->sti_delayed_error = 0;
4409 			tudi =
4410 			    (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4411 			addrlen = tudi->DEST_length;
4412 			addr = sogetoff(sti->sti_eaddr_mp,
4413 			    tudi->DEST_offset, addrlen, 1);
4414 			ASSERT(addr);	/* Checked by strsock_proto */
4415 			switch (so->so_family) {
4416 			case AF_INET: {
4417 				/* Compare just IP address and port */
4418 				sin_t *sin1 = (sin_t *)name;
4419 				sin_t *sin2 = (sin_t *)addr;
4420 
4421 				if (addrlen == sizeof (sin_t) &&
4422 				    namelen == addrlen &&
4423 				    sin1->sin_port == sin2->sin_port &&
4424 				    sin1->sin_addr.s_addr ==
4425 				    sin2->sin_addr.s_addr)
4426 					match = B_TRUE;
4427 				break;
4428 			}
4429 			case AF_INET6: {
4430 				/* Compare just IP address and port. Not flow */
4431 				sin6_t *sin1 = (sin6_t *)name;
4432 				sin6_t *sin2 = (sin6_t *)addr;
4433 
4434 				if (addrlen == sizeof (sin6_t) &&
4435 				    namelen == addrlen &&
4436 				    sin1->sin6_port == sin2->sin6_port &&
4437 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4438 				    &sin2->sin6_addr))
4439 					match = B_TRUE;
4440 				break;
4441 			}
4442 			case AF_UNIX:
4443 			default:
4444 				if (namelen == addrlen &&
4445 				    bcmp(name, addr, namelen) == 0)
4446 					match = B_TRUE;
4447 			}
4448 			if (match) {
4449 				freemsg(sti->sti_eaddr_mp);
4450 				sti->sti_eaddr_mp = NULL;
4451 				mutex_exit(&so->so_lock);
4452 #ifdef DEBUG
4453 				dprintso(so, 0,
4454 				    ("sockfs delayed error %d for %s\n",
4455 				    error,
4456 				    pr_addr(so->so_family, name, namelen)));
4457 #endif /* DEBUG */
4458 				return (error);
4459 			}
4460 			freemsg(sti->sti_eaddr_mp);
4461 			sti->sti_eaddr_mp = NULL;
4462 		}
4463 	}
4464 	mutex_exit(&so->so_lock);
4465 
4466 	dontroute = 0;
4467 	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4468 		uint32_t	val;
4469 
4470 		val = 1;
4471 		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4472 		    &val, (t_uscalar_t)sizeof (val), cr);
4473 		if (error)
4474 			return (error);
4475 		dontroute = 1;
4476 	}
4477 
4478 	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4479 		error = EOPNOTSUPP;
4480 		goto done;
4481 	}
4482 	if (msg->msg_controllen != 0) {
4483 		if (!(so_mode & SM_CONNREQUIRED)) {
4484 			so_update_attrs(so, SOMOD);
4485 			error = sosend_dgramcmsg(so, name, namelen, uiop,
4486 			    msg->msg_control, msg->msg_controllen, flags);
4487 		} else {
4488 			if (flags & MSG_OOB) {
4489 				/* Can't generate T_EXDATA_REQ with options */
4490 				error = EOPNOTSUPP;
4491 				goto done;
4492 			}
4493 			so_update_attrs(so, SOMOD);
4494 			error = sosend_svccmsg(so, uiop,
4495 			    !(flags & MSG_EOR),
4496 			    msg->msg_control, msg->msg_controllen,
4497 			    flags);
4498 		}
4499 		goto done;
4500 	}
4501 
4502 	so_update_attrs(so, SOMOD);
4503 	if (!(so_mode & SM_CONNREQUIRED)) {
4504 		/*
4505 		 * If there is no SO_DONTROUTE to turn off return immediately
4506 		 * from send_dgram. This can allow tail-call optimizations.
4507 		 */
4508 		if (!dontroute) {
4509 			return (sosend_dgram(so, name, namelen, uiop, flags));
4510 		}
4511 		error = sosend_dgram(so, name, namelen, uiop, flags);
4512 	} else {
4513 		t_scalar_t prim;
4514 		int sflag;
4515 
4516 		/* Ignore msg_name in the connected state */
4517 		if (flags & MSG_OOB) {
4518 			prim = T_EXDATA_REQ;
4519 			/*
4520 			 * Send down T_EXDATA_REQ even if there is flow
4521 			 * control for data.
4522 			 */
4523 			sflag = MSG_IGNFLOW;
4524 		} else {
4525 			if (so_mode & SM_BYTESTREAM) {
4526 				/* Byte stream transport - use write */
4527 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4528 
4529 				/* Send M_DATA messages */
4530 				if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
4531 				    (error = nl7c_data(so, uiop)) >= 0) {
4532 					/* NL7C consumed the data */
4533 					return (error);
4534 				}
4535 				/*
4536 				 * If there is no SO_DONTROUTE to turn off,
4537 				 * sti_direct is on, and there is no flow
4538 				 * control, we can take the fast path.
4539 				 */
4540 				if (!dontroute && sti->sti_direct != 0 &&
4541 				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4542 					return (sostream_direct(so, uiop,
4543 					    NULL, cr));
4544 				}
4545 				error = strwrite(SOTOV(so), uiop, cr);
4546 				goto done;
4547 			}
4548 			prim = T_DATA_REQ;
4549 			sflag = 0;
4550 		}
4551 		/*
4552 		 * If there is no SO_DONTROUTE to turn off return immediately
4553 		 * from sosend_svc. This can allow tail-call optimizations.
4554 		 */
4555 		if (!dontroute)
4556 			return (sosend_svc(so, uiop, prim,
4557 			    !(flags & MSG_EOR), sflag));
4558 		error = sosend_svc(so, uiop, prim,
4559 		    !(flags & MSG_EOR), sflag);
4560 	}
4561 	ASSERT(dontroute);
4562 done:
4563 	if (dontroute) {
4564 		uint32_t	val;
4565 
4566 		val = 0;
4567 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4568 		    &val, (t_uscalar_t)sizeof (val), cr);
4569 	}
4570 	return (error);
4571 }
4572 
4573 /*
4574  * kstrwritemp() has very similar semantics as that of strwrite().
4575  * The main difference is it obtains mblks from the caller and also
4576  * does not do any copy as done in strwrite() from user buffers to
4577  * kernel buffers.
4578  *
4579  * Currently, this routine is used by sendfile to send data allocated
4580  * within the kernel without any copying. This interface does not use the
4581  * synchronous stream interface as synch. stream interface implies
4582  * copying.
4583  */
4584 int
kstrwritemp(struct vnode * vp,mblk_t * mp,ushort_t fmode)4585 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4586 {
4587 	struct stdata *stp;
4588 	struct queue *wqp;
4589 	mblk_t *newmp;
4590 	char waitflag;
4591 	int tempmode;
4592 	int error = 0;
4593 	int done = 0;
4594 	struct sonode *so;
4595 	boolean_t direct;
4596 
4597 	ASSERT(vp->v_stream);
4598 	stp = vp->v_stream;
4599 
4600 	so = VTOSO(vp);
4601 	direct = _SOTOTPI(so)->sti_direct;
4602 
4603 	/*
4604 	 * This is the sockfs direct fast path. canputnext() need
4605 	 * not be accurate so we don't grab the sd_lock here. If
4606 	 * we get flow-controlled, we grab sd_lock just before the
4607 	 * do..while loop below to emulate what strwrite() does.
4608 	 */
4609 	wqp = stp->sd_wrq;
4610 	if (canputnext(wqp) && direct &&
4611 	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4612 		return (sostream_direct(so, NULL, mp, CRED()));
4613 	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4614 		/* Fast check of flags before acquiring the lock */
4615 		mutex_enter(&stp->sd_lock);
4616 		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4617 		mutex_exit(&stp->sd_lock);
4618 		if (error != 0) {
4619 			if (!(stp->sd_flag & STPLEX) &&
4620 			    (stp->sd_wput_opt & SW_SIGPIPE)) {
4621 				error = EPIPE;
4622 			}
4623 			return (error);
4624 		}
4625 	}
4626 
4627 	waitflag = WRITEWAIT;
4628 	if (stp->sd_flag & OLDNDELAY)
4629 		tempmode = fmode & ~FNDELAY;
4630 	else
4631 		tempmode = fmode;
4632 
4633 	mutex_enter(&stp->sd_lock);
4634 	do {
4635 		if (canputnext(wqp)) {
4636 			mutex_exit(&stp->sd_lock);
4637 			if (stp->sd_wputdatafunc != NULL) {
4638 				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4639 				    NULL, NULL, NULL);
4640 				if (newmp == NULL) {
4641 					/* The caller will free mp */
4642 					return (ECOMM);
4643 				}
4644 				mp = newmp;
4645 			}
4646 			putnext(wqp, mp);
4647 			return (0);
4648 		}
4649 		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4650 		    &done);
4651 	} while (error == 0 && !done);
4652 
4653 	mutex_exit(&stp->sd_lock);
4654 	/*
4655 	 * EAGAIN tells the application to try again. ENOMEM
4656 	 * is returned only if the memory allocation size
4657 	 * exceeds the physical limits of the system. ENOMEM
4658 	 * can't be true here.
4659 	 */
4660 	if (error == ENOMEM)
4661 		error = EAGAIN;
4662 	return (error);
4663 }
4664 
4665 /* ARGSUSED */
4666 static int
sotpi_sendmblk(struct sonode * so,struct nmsghdr * msg,int fflag,struct cred * cr,mblk_t ** mpp)4667 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4668     struct cred *cr, mblk_t **mpp)
4669 {
4670 	int error;
4671 
4672 	if (so->so_family != AF_INET && so->so_family != AF_INET6)
4673 		return (EAFNOSUPPORT);
4674 
4675 	if (so->so_state & SS_CANTSENDMORE)
4676 		return (EPIPE);
4677 
4678 	if (so->so_type != SOCK_STREAM)
4679 		return (EOPNOTSUPP);
4680 
4681 	if ((so->so_state & SS_ISCONNECTED) == 0)
4682 		return (ENOTCONN);
4683 
4684 	error = kstrwritemp(so->so_vnode, *mpp, fflag);
4685 	if (error == 0)
4686 		*mpp = NULL;
4687 	return (error);
4688 }
4689 
4690 /*
4691  * Sending data on a datagram socket.
4692  * Assumes caller has verified that SS_ISBOUND etc. are set.
4693  */
4694 /* ARGSUSED */
4695 static int
sodgram_direct(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,int flags)4696 sodgram_direct(struct sonode *so, struct sockaddr *name,
4697     socklen_t namelen, struct uio *uiop, int flags)
4698 {
4699 	struct T_unitdata_req	tudr;
4700 	mblk_t			*mp = NULL;
4701 	int			error = 0;
4702 	void			*addr;
4703 	socklen_t		addrlen;
4704 	ssize_t			len;
4705 	struct stdata		*stp = SOTOV(so)->v_stream;
4706 	int			so_state;
4707 	queue_t			*udp_wq;
4708 	boolean_t		connected;
4709 	mblk_t			*mpdata = NULL;
4710 	sotpi_info_t		*sti = SOTOTPI(so);
4711 	uint32_t		auditing = AU_AUDITING();
4712 
4713 	ASSERT(name != NULL && namelen != 0);
4714 	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4715 	ASSERT(!(so->so_mode & SM_EXDATA));
4716 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4717 	ASSERT(SOTOV(so)->v_type == VSOCK);
4718 
4719 	/* Caller checked for proper length */
4720 	len = uiop->uio_resid;
4721 	ASSERT(len <= sti->sti_tidu_size);
4722 
4723 	/* Length and family checks have been done by caller */
4724 	ASSERT(name->sa_family == so->so_family);
4725 	ASSERT(so->so_family == AF_INET ||
4726 	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4727 	ASSERT(so->so_family == AF_INET6 ||
4728 	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4729 
4730 	addr = name;
4731 	addrlen = namelen;
4732 
4733 	if (stp->sd_sidp != NULL &&
4734 	    (error = straccess(stp, JCWRITE)) != 0)
4735 		goto done;
4736 
4737 	so_state = so->so_state;
4738 
4739 	connected = so_state & SS_ISCONNECTED;
4740 	if (!connected) {
4741 		tudr.PRIM_type = T_UNITDATA_REQ;
4742 		tudr.DEST_length = addrlen;
4743 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4744 		tudr.OPT_length = 0;
4745 		tudr.OPT_offset = 0;
4746 
4747 		mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4748 		    _ALLOC_INTR, CRED());
4749 		if (mp == NULL) {
4750 			/*
4751 			 * Caught a signal waiting for memory.
4752 			 * Let send* return EINTR.
4753 			 */
4754 			error = EINTR;
4755 			goto done;
4756 		}
4757 	}
4758 
4759 	/*
4760 	 * For UDP we don't break up the copyin into smaller pieces
4761 	 * as in the TCP case.  That means if ENOMEM is returned by
4762 	 * mcopyinuio() then the uio vector has not been modified at
4763 	 * all and we fallback to either strwrite() or kstrputmsg()
4764 	 * below.  Note also that we never generate priority messages
4765 	 * from here.
4766 	 */
4767 	udp_wq = stp->sd_wrq->q_next;
4768 	if (canput(udp_wq) &&
4769 	    (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4770 		ASSERT(DB_TYPE(mpdata) == M_DATA);
4771 		ASSERT(uiop->uio_resid == 0);
4772 		if (!connected)
4773 			linkb(mp, mpdata);
4774 		else
4775 			mp = mpdata;
4776 		if (auditing)
4777 			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4778 
4779 		udp_wput(udp_wq, mp);
4780 		return (0);
4781 	}
4782 
4783 	ASSERT(mpdata == NULL);
4784 	if (error != 0 && error != ENOMEM) {
4785 		freemsg(mp);
4786 		return (error);
4787 	}
4788 
4789 	/*
4790 	 * For connected, let strwrite() handle the blocking case.
4791 	 * Otherwise we fall thru and use kstrputmsg().
4792 	 */
4793 	if (connected)
4794 		return (strwrite(SOTOV(so), uiop, CRED()));
4795 
4796 	if (auditing)
4797 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4798 
4799 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4800 done:
4801 #ifdef SOCK_DEBUG
4802 	if (error != 0) {
4803 		eprintsoline(so, error);
4804 	}
4805 #endif /* SOCK_DEBUG */
4806 	return (error);
4807 }
4808 
4809 int
sostream_direct(struct sonode * so,struct uio * uiop,mblk_t * mp,cred_t * cr)4810 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4811 {
4812 	struct stdata *stp = SOTOV(so)->v_stream;
4813 	ssize_t iosize, rmax, maxblk;
4814 	queue_t *tcp_wq = stp->sd_wrq->q_next;
4815 	mblk_t *newmp;
4816 	int error = 0, wflag = 0;
4817 
4818 	ASSERT(so->so_mode & SM_BYTESTREAM);
4819 	ASSERT(SOTOV(so)->v_type == VSOCK);
4820 
4821 	if (stp->sd_sidp != NULL &&
4822 	    (error = straccess(stp, JCWRITE)) != 0)
4823 		return (error);
4824 
4825 	if (uiop == NULL) {
4826 		/*
4827 		 * kstrwritemp() should have checked sd_flag and
4828 		 * flow-control before coming here.  If we end up
4829 		 * here it means that we can simply pass down the
4830 		 * data to tcp.
4831 		 */
4832 		ASSERT(mp != NULL);
4833 		if (stp->sd_wputdatafunc != NULL) {
4834 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4835 			    NULL, NULL, NULL);
4836 			if (newmp == NULL) {
4837 				/* The caller will free mp */
4838 				return (ECOMM);
4839 			}
4840 			mp = newmp;
4841 		}
4842 		tcp_wput(tcp_wq, mp);
4843 		return (0);
4844 	}
4845 
4846 	/* Fallback to strwrite() to do proper error handling */
4847 	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4848 		return (strwrite(SOTOV(so), uiop, cr));
4849 
4850 	rmax = stp->sd_qn_maxpsz;
4851 	ASSERT(rmax >= 0 || rmax == INFPSZ);
4852 	if (rmax == 0 || uiop->uio_resid <= 0)
4853 		return (0);
4854 
4855 	if (rmax == INFPSZ)
4856 		rmax = uiop->uio_resid;
4857 
4858 	maxblk = stp->sd_maxblk;
4859 
4860 	for (;;) {
4861 		iosize = MIN(uiop->uio_resid, rmax);
4862 
4863 		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4864 		if (mp == NULL) {
4865 			/*
4866 			 * Fallback to strwrite() for ENOMEM; if this
4867 			 * is our first time in this routine and the uio
4868 			 * vector has not been modified, we will end up
4869 			 * calling strwrite() without any flag set.
4870 			 */
4871 			if (error == ENOMEM)
4872 				goto slow_send;
4873 			else
4874 				return (error);
4875 		}
4876 		ASSERT(uiop->uio_resid >= 0);
4877 		/*
4878 		 * If mp is non-NULL and ENOMEM is set, it means that
4879 		 * mcopyinuio() was able to break down some of the user
4880 		 * data into one or more mblks.  Send the partial data
4881 		 * to tcp and let the rest be handled in strwrite().
4882 		 */
4883 		ASSERT(error == 0 || error == ENOMEM);
4884 		if (stp->sd_wputdatafunc != NULL) {
4885 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4886 			    NULL, NULL, NULL);
4887 			if (newmp == NULL) {
4888 				/* The caller will free mp */
4889 				return (ECOMM);
4890 			}
4891 			mp = newmp;
4892 		}
4893 		tcp_wput(tcp_wq, mp);
4894 
4895 		wflag |= NOINTR;
4896 
4897 		if (uiop->uio_resid == 0) {	/* No more data; we're done */
4898 			ASSERT(error == 0);
4899 			break;
4900 		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4901 		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4902 slow_send:
4903 			/*
4904 			 * We were able to send down partial data using
4905 			 * the direct call interface, but are now relying
4906 			 * on strwrite() to handle the non-fastpath cases.
4907 			 * If the socket is blocking we will sleep in
4908 			 * strwaitq() until write is permitted, otherwise,
4909 			 * we will need to return the amount of bytes
4910 			 * written so far back to the app.  This is the
4911 			 * reason why we pass NOINTR flag to strwrite()
4912 			 * for non-blocking socket, because we don't want
4913 			 * to return EAGAIN when portion of the user data
4914 			 * has actually been sent down.
4915 			 */
4916 			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4917 		}
4918 	}
4919 	return (0);
4920 }
4921 
4922 /*
4923  * Update sti_faddr by asking the transport (unless AF_UNIX).
4924  */
4925 /* ARGSUSED */
4926 int
sotpi_getpeername(struct sonode * so,struct sockaddr * name,socklen_t * namelen,boolean_t accept,struct cred * cr)4927 sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4928     boolean_t accept, struct cred *cr)
4929 {
4930 	struct strbuf	strbuf;
4931 	int		error = 0, res;
4932 	void		*addr;
4933 	t_uscalar_t	addrlen;
4934 	k_sigset_t	smask;
4935 	sotpi_info_t	*sti = SOTOTPI(so);
4936 
4937 	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4938 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4939 
4940 	ASSERT(*namelen > 0);
4941 	mutex_enter(&so->so_lock);
4942 	so_lock_single(so);	/* Set SOLOCKED */
4943 
4944 	if (accept) {
4945 		bcopy(sti->sti_faddr_sa, name,
4946 		    MIN(*namelen, sti->sti_faddr_len));
4947 		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4948 		goto done;
4949 	}
4950 
4951 	if (!(so->so_state & SS_ISCONNECTED)) {
4952 		error = ENOTCONN;
4953 		goto done;
4954 	}
4955 	/* Added this check for X/Open */
4956 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4957 		error = EINVAL;
4958 		if (xnet_check_print) {
4959 			printf("sockfs: X/Open getpeername check => EINVAL\n");
4960 		}
4961 		goto done;
4962 	}
4963 
4964 	if (sti->sti_faddr_valid) {
4965 		bcopy(sti->sti_faddr_sa, name,
4966 		    MIN(*namelen, sti->sti_faddr_len));
4967 		*namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4968 		goto done;
4969 	}
4970 
4971 #ifdef DEBUG
4972 	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4973 	    pr_addr(so->so_family, sti->sti_faddr_sa,
4974 	    (t_uscalar_t)sti->sti_faddr_len)));
4975 #endif /* DEBUG */
4976 
4977 	if (so->so_family == AF_UNIX) {
4978 		/* Transport has different name space - return local info */
4979 		if (sti->sti_faddr_noxlate)
4980 			*namelen = 0;
4981 		error = 0;
4982 		goto done;
4983 	}
4984 
4985 	ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
4986 
4987 	ASSERT(sti->sti_faddr_sa);
4988 	/* Allocate local buffer to use with ioctl */
4989 	addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
4990 	mutex_exit(&so->so_lock);
4991 	addr = kmem_alloc(addrlen, KM_SLEEP);
4992 
4993 	/*
4994 	 * Issue TI_GETPEERNAME with signals masked.
4995 	 * Put the result in sti_faddr_sa so that getpeername works after
4996 	 * a shutdown(output).
4997 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4998 	 * back to the socket.
4999 	 */
5000 	strbuf.buf = addr;
5001 	strbuf.maxlen = addrlen;
5002 	strbuf.len = 0;
5003 
5004 	sigintr(&smask, 0);
5005 	res = 0;
5006 	ASSERT(cr);
5007 	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
5008 	    0, K_TO_K, cr, &res);
5009 	sigunintr(&smask);
5010 
5011 	mutex_enter(&so->so_lock);
5012 	/*
5013 	 * If there is an error record the error in so_error put don't fail
5014 	 * the getpeername. Instead fallback on the recorded
5015 	 * sti->sti_faddr_sa.
5016 	 */
5017 	if (error) {
5018 		/*
5019 		 * Various stream head errors can be returned to the ioctl.
5020 		 * However, it is impossible to determine which ones of
5021 		 * these are really socket level errors that were incorrectly
5022 		 * consumed by the ioctl. Thus this code silently ignores the
5023 		 * error - to code explicitly does not reinstate the error
5024 		 * using soseterror().
5025 		 * Experiments have shows that at least this set of
5026 		 * errors are reported and should not be reinstated on the
5027 		 * socket:
5028 		 *	EINVAL	E.g. if an I_LINK was in effect when
5029 		 *		getpeername was called.
5030 		 *	EPIPE	The ioctl error semantics prefer the write
5031 		 *		side error over the read side error.
5032 		 *	ENOTCONN The transport just got disconnected but
5033 		 *		sockfs had not yet seen the T_DISCON_IND
5034 		 *		when issuing the ioctl.
5035 		 */
5036 		error = 0;
5037 	} else if (res == 0 && strbuf.len > 0 &&
5038 	    (so->so_state & SS_ISCONNECTED)) {
5039 		ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
5040 		sti->sti_faddr_len = (socklen_t)strbuf.len;
5041 		bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
5042 		sti->sti_faddr_valid = 1;
5043 
5044 		bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
5045 		*namelen = sti->sti_faddr_len;
5046 	}
5047 	kmem_free(addr, addrlen);
5048 #ifdef DEBUG
5049 	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
5050 	    pr_addr(so->so_family, sti->sti_faddr_sa,
5051 	    (t_uscalar_t)sti->sti_faddr_len)));
5052 #endif /* DEBUG */
5053 done:
5054 	so_unlock_single(so, SOLOCKED);
5055 	mutex_exit(&so->so_lock);
5056 	return (error);
5057 }
5058 
5059 /*
5060  * Update sti_laddr by asking the transport (unless AF_UNIX).
5061  */
5062 int
sotpi_getsockname(struct sonode * so,struct sockaddr * name,socklen_t * namelen,struct cred * cr)5063 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
5064     struct cred *cr)
5065 {
5066 	struct strbuf	strbuf;
5067 	int		error = 0, res;
5068 	void		*addr;
5069 	t_uscalar_t	addrlen;
5070 	k_sigset_t	smask;
5071 	sotpi_info_t	*sti = SOTOTPI(so);
5072 
5073 	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
5074 	    (void *)so, pr_state(so->so_state, so->so_mode)));
5075 
5076 	ASSERT(*namelen > 0);
5077 	mutex_enter(&so->so_lock);
5078 	so_lock_single(so);	/* Set SOLOCKED */
5079 
5080 #ifdef DEBUG
5081 
5082 	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
5083 	    pr_addr(so->so_family, sti->sti_laddr_sa,
5084 	    (t_uscalar_t)sti->sti_laddr_len)));
5085 #endif /* DEBUG */
5086 	if (sti->sti_laddr_valid) {
5087 		bcopy(sti->sti_laddr_sa, name,
5088 		    MIN(*namelen, sti->sti_laddr_len));
5089 		*namelen = sti->sti_laddr_len;
5090 		goto done;
5091 	}
5092 
5093 	if (so->so_family == AF_UNIX) {
5094 		/*
5095 		 * Transport has different name space - return local info. If we
5096 		 * have enough space, let consumers know the family.
5097 		 */
5098 		if (*namelen >= sizeof (sa_family_t)) {
5099 			name->sa_family = AF_UNIX;
5100 			*namelen = sizeof (sa_family_t);
5101 		} else {
5102 			*namelen = 0;
5103 		}
5104 		error = 0;
5105 		goto done;
5106 	}
5107 	if (!(so->so_state & SS_ISBOUND)) {
5108 		/* If not bound, then nothing to return. */
5109 		error = 0;
5110 		goto done;
5111 	}
5112 
5113 	/* Allocate local buffer to use with ioctl */
5114 	addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
5115 	mutex_exit(&so->so_lock);
5116 	addr = kmem_alloc(addrlen, KM_SLEEP);
5117 
5118 	/*
5119 	 * Issue TI_GETMYNAME with signals masked.
5120 	 * Put the result in sti_laddr_sa so that getsockname works after
5121 	 * a shutdown(output).
5122 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
5123 	 * back to the socket.
5124 	 */
5125 	strbuf.buf = addr;
5126 	strbuf.maxlen = addrlen;
5127 	strbuf.len = 0;
5128 
5129 	sigintr(&smask, 0);
5130 	res = 0;
5131 	ASSERT(cr);
5132 	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
5133 	    0, K_TO_K, cr, &res);
5134 	sigunintr(&smask);
5135 
5136 	mutex_enter(&so->so_lock);
5137 	/*
5138 	 * If there is an error record the error in so_error put don't fail
5139 	 * the getsockname. Instead fallback on the recorded
5140 	 * sti->sti_laddr_sa.
5141 	 */
5142 	if (error) {
5143 		/*
5144 		 * Various stream head errors can be returned to the ioctl.
5145 		 * However, it is impossible to determine which ones of
5146 		 * these are really socket level errors that were incorrectly
5147 		 * consumed by the ioctl. Thus this code silently ignores the
5148 		 * error - to code explicitly does not reinstate the error
5149 		 * using soseterror().
5150 		 * Experiments have shows that at least this set of
5151 		 * errors are reported and should not be reinstated on the
5152 		 * socket:
5153 		 *	EINVAL	E.g. if an I_LINK was in effect when
5154 		 *		getsockname was called.
5155 		 *	EPIPE	The ioctl error semantics prefer the write
5156 		 *		side error over the read side error.
5157 		 */
5158 		error = 0;
5159 	} else if (res == 0 && strbuf.len > 0 &&
5160 	    (so->so_state & SS_ISBOUND)) {
5161 		ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
5162 		sti->sti_laddr_len = (socklen_t)strbuf.len;
5163 		bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
5164 		sti->sti_laddr_valid = 1;
5165 
5166 		bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
5167 		*namelen = sti->sti_laddr_len;
5168 	}
5169 	kmem_free(addr, addrlen);
5170 #ifdef DEBUG
5171 	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
5172 	    pr_addr(so->so_family, sti->sti_laddr_sa,
5173 	    (t_uscalar_t)sti->sti_laddr_len)));
5174 #endif /* DEBUG */
5175 done:
5176 	so_unlock_single(so, SOLOCKED);
5177 	mutex_exit(&so->so_lock);
5178 	return (error);
5179 }
5180 
5181 /*
5182  * Get socket options. For SOL_SOCKET options some options are handled
5183  * by the sockfs while others use the value recorded in the sonode as a
5184  * fallback should the T_SVR4_OPTMGMT_REQ fail.
5185  *
5186  * On the return most *optlenp bytes are copied to optval.
5187  */
5188 /* ARGSUSED */
5189 int
sotpi_getsockopt(struct sonode * so,int level,int option_name,void * optval,socklen_t * optlenp,int flags,struct cred * cr)5190 sotpi_getsockopt(struct sonode *so, int level, int option_name,
5191     void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5192 {
5193 	struct T_optmgmt_req	optmgmt_req;
5194 	struct T_optmgmt_ack	*optmgmt_ack;
5195 	struct opthdr		oh;
5196 	struct opthdr		*opt_res;
5197 	mblk_t			*mp = NULL;
5198 	int			error = 0;
5199 	void			*option = NULL;	/* Set if fallback value */
5200 	t_uscalar_t		maxlen = *optlenp;
5201 	t_uscalar_t		len;
5202 	uint32_t		value;
5203 	struct timeval		tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5204 	struct timeval32	tmo_val32;
5205 	struct so_snd_bufinfo	snd_bufinfo;	/* used for zero copy */
5206 
5207 	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5208 	    (void *)so, level, option_name, optval, (void *)optlenp,
5209 	    pr_state(so->so_state, so->so_mode)));
5210 
5211 	mutex_enter(&so->so_lock);
5212 	so_lock_single(so);	/* Set SOLOCKED */
5213 
5214 	/*
5215 	 * Check for SOL_SOCKET options.
5216 	 * Certain SOL_SOCKET options are returned directly whereas
5217 	 * others only provide a default (fallback) value should
5218 	 * the T_SVR4_OPTMGMT_REQ fail.
5219 	 */
5220 	if (level == SOL_SOCKET) {
5221 		/* Check parameters */
5222 		switch (option_name) {
5223 		case SO_TYPE:
5224 		case SO_ERROR:
5225 		case SO_DEBUG:
5226 		case SO_ACCEPTCONN:
5227 		case SO_REUSEADDR:
5228 		case SO_REUSEPORT:
5229 		case SO_KEEPALIVE:
5230 		case SO_DONTROUTE:
5231 		case SO_BROADCAST:
5232 		case SO_USELOOPBACK:
5233 		case SO_OOBINLINE:
5234 		case SO_SNDBUF:
5235 		case SO_RCVBUF:
5236 #ifdef notyet
5237 		case SO_SNDLOWAT:
5238 		case SO_RCVLOWAT:
5239 #endif /* notyet */
5240 		case SO_DOMAIN:
5241 		case SO_DGRAM_ERRIND:
5242 			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5243 				error = EINVAL;
5244 				eprintsoline(so, error);
5245 				goto done2;
5246 			}
5247 			break;
5248 		case SO_RCVTIMEO:
5249 		case SO_SNDTIMEO:
5250 			if (get_udatamodel() == DATAMODEL_NONE ||
5251 			    get_udatamodel() == DATAMODEL_NATIVE) {
5252 				if (maxlen < sizeof (struct timeval)) {
5253 					error = EINVAL;
5254 					eprintsoline(so, error);
5255 					goto done2;
5256 				}
5257 			} else {
5258 				if (maxlen < sizeof (struct timeval32)) {
5259 					error = EINVAL;
5260 					eprintsoline(so, error);
5261 					goto done2;
5262 				}
5263 
5264 			}
5265 			break;
5266 		case SO_LINGER:
5267 			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5268 				error = EINVAL;
5269 				eprintsoline(so, error);
5270 				goto done2;
5271 			}
5272 			break;
5273 		case SO_SND_BUFINFO:
5274 			if (maxlen < (t_uscalar_t)
5275 			    sizeof (struct so_snd_bufinfo)) {
5276 				error = EINVAL;
5277 				eprintsoline(so, error);
5278 				goto done2;
5279 			}
5280 			break;
5281 		}
5282 
5283 		len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
5284 
5285 		switch (option_name) {
5286 		case SO_TYPE:
5287 			value = so->so_type;
5288 			option = &value;
5289 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5290 
5291 		case SO_ERROR:
5292 			value = sogeterr(so, B_TRUE);
5293 			option = &value;
5294 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5295 
5296 		case SO_ACCEPTCONN:
5297 			if (so->so_state & SS_ACCEPTCONN)
5298 				value = SO_ACCEPTCONN;
5299 			else
5300 				value = 0;
5301 #ifdef DEBUG
5302 			if (value) {
5303 				dprintso(so, 1,
5304 				    ("sotpi_getsockopt: 0x%x is set\n",
5305 				    option_name));
5306 			} else {
5307 				dprintso(so, 1,
5308 				    ("sotpi_getsockopt: 0x%x not set\n",
5309 				    option_name));
5310 			}
5311 #endif /* DEBUG */
5312 			option = &value;
5313 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5314 
5315 		case SO_DEBUG:
5316 		case SO_REUSEADDR:
5317 		case SO_REUSEPORT:
5318 		case SO_KEEPALIVE:
5319 		case SO_DONTROUTE:
5320 		case SO_BROADCAST:
5321 		case SO_USELOOPBACK:
5322 		case SO_OOBINLINE:
5323 		case SO_DGRAM_ERRIND:
5324 			value = (so->so_options & option_name);
5325 #ifdef DEBUG
5326 			if (value) {
5327 				dprintso(so, 1,
5328 				    ("sotpi_getsockopt: 0x%x is set\n",
5329 				    option_name));
5330 			} else {
5331 				dprintso(so, 1,
5332 				    ("sotpi_getsockopt: 0x%x not set\n",
5333 				    option_name));
5334 			}
5335 #endif /* DEBUG */
5336 			option = &value;
5337 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5338 
5339 		/*
5340 		 * The following options are only returned by sockfs when the
5341 		 * T_SVR4_OPTMGMT_REQ fails.
5342 		 */
5343 		case SO_LINGER:
5344 			option = &so->so_linger;
5345 			len = (t_uscalar_t)sizeof (struct linger);
5346 			break;
5347 		case SO_SNDBUF: {
5348 			ssize_t lvalue;
5349 
5350 			/*
5351 			 * If the option has not been set then get a default
5352 			 * value from the read queue. This value is
5353 			 * returned if the transport fails
5354 			 * the T_SVR4_OPTMGMT_REQ.
5355 			 */
5356 			lvalue = so->so_sndbuf;
5357 			if (lvalue == 0) {
5358 				mutex_exit(&so->so_lock);
5359 				(void) strqget(strvp2wq(SOTOV(so))->q_next,
5360 				    QHIWAT, 0, &lvalue);
5361 				mutex_enter(&so->so_lock);
5362 				dprintso(so, 1,
5363 				    ("got SO_SNDBUF %ld from q\n", lvalue));
5364 			}
5365 			value = (int)lvalue;
5366 			option = &value;
5367 			len = (t_uscalar_t)sizeof (so->so_sndbuf);
5368 			break;
5369 		}
5370 		case SO_RCVBUF: {
5371 			ssize_t lvalue;
5372 
5373 			/*
5374 			 * If the option has not been set then get a default
5375 			 * value from the read queue. This value is
5376 			 * returned if the transport fails
5377 			 * the T_SVR4_OPTMGMT_REQ.
5378 			 *
5379 			 * XXX If SO_RCVBUF has been set and this is an
5380 			 * XPG 4.2 application then do not ask the transport
5381 			 * since the transport might adjust the value and not
5382 			 * return exactly what was set by the application.
5383 			 * For non-XPG 4.2 application we return the value
5384 			 * that the transport is actually using.
5385 			 */
5386 			lvalue = so->so_rcvbuf;
5387 			if (lvalue == 0) {
5388 				mutex_exit(&so->so_lock);
5389 				(void) strqget(RD(strvp2wq(SOTOV(so))),
5390 				    QHIWAT, 0, &lvalue);
5391 				mutex_enter(&so->so_lock);
5392 				dprintso(so, 1,
5393 				    ("got SO_RCVBUF %ld from q\n", lvalue));
5394 			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
5395 				value = (int)lvalue;
5396 				option = &value;
5397 				goto copyout;	/* skip asking transport */
5398 			}
5399 			value = (int)lvalue;
5400 			option = &value;
5401 			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5402 			break;
5403 		}
5404 		case SO_DOMAIN:
5405 			value = so->so_family;
5406 			option = &value;
5407 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5408 
5409 #ifdef notyet
5410 		/*
5411 		 * We do not implement the semantics of these options
5412 		 * thus we shouldn't implement the options either.
5413 		 */
5414 		case SO_SNDLOWAT:
5415 			value = so->so_sndlowat;
5416 			option = &value;
5417 			break;
5418 		case SO_RCVLOWAT:
5419 			value = so->so_rcvlowat;
5420 			option = &value;
5421 			break;
5422 #endif /* notyet */
5423 		case SO_SNDTIMEO:
5424 		case SO_RCVTIMEO: {
5425 			clock_t val;
5426 
5427 			if (option_name == SO_RCVTIMEO)
5428 				val = drv_hztousec(so->so_rcvtimeo);
5429 			else
5430 				val = drv_hztousec(so->so_sndtimeo);
5431 			tmo_val.tv_sec = val / (1000 * 1000);
5432 			tmo_val.tv_usec = val % (1000 * 1000);
5433 			if (get_udatamodel() == DATAMODEL_NONE ||
5434 			    get_udatamodel() == DATAMODEL_NATIVE) {
5435 				option = &tmo_val;
5436 				len = sizeof (struct timeval);
5437 			} else {
5438 				TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5439 				option = &tmo_val32;
5440 				len = sizeof (struct timeval32);
5441 			}
5442 			break;
5443 		}
5444 		case SO_SND_BUFINFO: {
5445 			snd_bufinfo.sbi_wroff =
5446 			    (so->so_proto_props).sopp_wroff;
5447 			snd_bufinfo.sbi_maxblk =
5448 			    (so->so_proto_props).sopp_maxblk;
5449 			snd_bufinfo.sbi_maxpsz =
5450 			    (so->so_proto_props).sopp_maxpsz;
5451 			snd_bufinfo.sbi_tail =
5452 			    (so->so_proto_props).sopp_tail;
5453 			option = &snd_bufinfo;
5454 			len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5455 			break;
5456 		}
5457 		}
5458 	}
5459 
5460 	mutex_exit(&so->so_lock);
5461 
5462 	/* Send request */
5463 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5464 	optmgmt_req.MGMT_flags = T_CHECK;
5465 	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5466 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5467 
5468 	oh.level = level;
5469 	oh.name = option_name;
5470 	oh.len = maxlen;
5471 
5472 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5473 	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5474 	/* Let option management work in the presence of data flow control */
5475 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5476 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5477 	mp = NULL;
5478 	mutex_enter(&so->so_lock);
5479 	if (error) {
5480 		eprintsoline(so, error);
5481 		goto done2;
5482 	}
5483 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5484 	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5485 	if (error) {
5486 		if (option != NULL) {
5487 			/* We have a fallback value */
5488 			error = 0;
5489 			goto copyout;
5490 		}
5491 		eprintsoline(so, error);
5492 		goto done2;
5493 	}
5494 	ASSERT(mp);
5495 	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5496 	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5497 	    optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5498 	if (opt_res == NULL) {
5499 		if (option != NULL) {
5500 			/* We have a fallback value */
5501 			error = 0;
5502 			goto copyout;
5503 		}
5504 		error = EPROTO;
5505 		eprintsoline(so, error);
5506 		goto done;
5507 	}
5508 	option = &opt_res[1];
5509 
5510 	/* check to ensure that the option is within bounds */
5511 	if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5512 	    (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5513 		if (option != NULL) {
5514 			/* We have a fallback value */
5515 			error = 0;
5516 			goto copyout;
5517 		}
5518 		error = EPROTO;
5519 		eprintsoline(so, error);
5520 		goto done;
5521 	}
5522 
5523 	len = opt_res->len;
5524 
5525 copyout: {
5526 		t_uscalar_t size = MIN(len, maxlen);
5527 		bcopy(option, optval, size);
5528 		bcopy(&size, optlenp, sizeof (size));
5529 	}
5530 done:
5531 	freemsg(mp);
5532 done2:
5533 	so_unlock_single(so, SOLOCKED);
5534 	mutex_exit(&so->so_lock);
5535 
5536 	return (error);
5537 }
5538 
5539 /*
5540  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5541  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5542  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5543  * setsockopt has to work even if the transport does not support the option.
5544  */
5545 /* ARGSUSED */
5546 int
sotpi_setsockopt(struct sonode * so,int level,int option_name,const void * optval,t_uscalar_t optlen,struct cred * cr)5547 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5548     const void *optval, t_uscalar_t optlen, struct cred *cr)
5549 {
5550 	struct T_optmgmt_req	optmgmt_req;
5551 	struct opthdr		oh;
5552 	mblk_t			*mp;
5553 	int			error = 0;
5554 	boolean_t		handled = B_FALSE;
5555 
5556 	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5557 	    (void *)so, level, option_name, optval, optlen,
5558 	    pr_state(so->so_state, so->so_mode)));
5559 
5560 	/* X/Open requires this check */
5561 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5562 		if (xnet_check_print)
5563 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
5564 		return (EINVAL);
5565 	}
5566 
5567 	mutex_enter(&so->so_lock);
5568 	so_lock_single(so);	/* Set SOLOCKED */
5569 	mutex_exit(&so->so_lock);
5570 
5571 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5572 	optmgmt_req.MGMT_flags = T_NEGOTIATE;
5573 	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5574 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5575 
5576 	oh.level = level;
5577 	oh.name = option_name;
5578 	oh.len = optlen;
5579 
5580 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5581 	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5582 	/* Let option management work in the presence of data flow control */
5583 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5584 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5585 	mp = NULL;
5586 	mutex_enter(&so->so_lock);
5587 	if (error) {
5588 		eprintsoline(so, error);
5589 		goto done2;
5590 	}
5591 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5592 	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5593 	if (error) {
5594 		eprintsoline(so, error);
5595 		goto done;
5596 	}
5597 	ASSERT(mp);
5598 	/* No need to verify T_optmgmt_ack */
5599 	freemsg(mp);
5600 done:
5601 	/*
5602 	 * Check for SOL_SOCKET options and record their values.
5603 	 * If we know about a SOL_SOCKET parameter and the transport
5604 	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5605 	 * EPROTO) we let the setsockopt succeed.
5606 	 */
5607 	if (level == SOL_SOCKET) {
5608 		/* Check parameters */
5609 		switch (option_name) {
5610 		case SO_DEBUG:
5611 		case SO_REUSEADDR:
5612 		case SO_REUSEPORT:
5613 		case SO_KEEPALIVE:
5614 		case SO_DONTROUTE:
5615 		case SO_BROADCAST:
5616 		case SO_USELOOPBACK:
5617 		case SO_OOBINLINE:
5618 		case SO_SNDBUF:
5619 		case SO_RCVBUF:
5620 #ifdef notyet
5621 		case SO_SNDLOWAT:
5622 		case SO_RCVLOWAT:
5623 #endif /* notyet */
5624 		case SO_DGRAM_ERRIND:
5625 			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5626 				error = EINVAL;
5627 				eprintsoline(so, error);
5628 				goto done2;
5629 			}
5630 			ASSERT(optval);
5631 			handled = B_TRUE;
5632 			break;
5633 		case SO_SNDTIMEO:
5634 		case SO_RCVTIMEO:
5635 			if (get_udatamodel() == DATAMODEL_NONE ||
5636 			    get_udatamodel() == DATAMODEL_NATIVE) {
5637 				if (optlen != sizeof (struct timeval)) {
5638 					error = EINVAL;
5639 					eprintsoline(so, error);
5640 					goto done2;
5641 				}
5642 			} else {
5643 				if (optlen != sizeof (struct timeval32)) {
5644 					error = EINVAL;
5645 					eprintsoline(so, error);
5646 					goto done2;
5647 				}
5648 			}
5649 			ASSERT(optval);
5650 			handled = B_TRUE;
5651 			break;
5652 		case SO_LINGER:
5653 			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5654 				error = EINVAL;
5655 				eprintsoline(so, error);
5656 				goto done2;
5657 			}
5658 			ASSERT(optval);
5659 			handled = B_TRUE;
5660 			break;
5661 		}
5662 
5663 #define	intvalue	(*(int32_t *)optval)
5664 
5665 		switch (option_name) {
5666 		case SO_TYPE:
5667 		case SO_ERROR:
5668 		case SO_ACCEPTCONN:
5669 			/* Can't be set */
5670 			error = ENOPROTOOPT;
5671 			goto done2;
5672 		case SO_LINGER: {
5673 			struct linger *l = (struct linger *)optval;
5674 
5675 			so->so_linger.l_linger = l->l_linger;
5676 			if (l->l_onoff) {
5677 				so->so_linger.l_onoff = SO_LINGER;
5678 				so->so_options |= SO_LINGER;
5679 			} else {
5680 				so->so_linger.l_onoff = 0;
5681 				so->so_options &= ~SO_LINGER;
5682 			}
5683 			break;
5684 		}
5685 
5686 		case SO_DEBUG:
5687 #ifdef SOCK_TEST
5688 			if (intvalue & 2)
5689 				sock_test_timelimit = 10 * hz;
5690 			else
5691 				sock_test_timelimit = 0;
5692 
5693 			if (intvalue & 4)
5694 				do_useracc = 0;
5695 			else
5696 				do_useracc = 1;
5697 #endif /* SOCK_TEST */
5698 			/* FALLTHRU */
5699 		case SO_REUSEADDR:
5700 		case SO_REUSEPORT:
5701 		case SO_KEEPALIVE:
5702 		case SO_DONTROUTE:
5703 		case SO_BROADCAST:
5704 		case SO_USELOOPBACK:
5705 		case SO_OOBINLINE:
5706 		case SO_DGRAM_ERRIND:
5707 			if (intvalue != 0) {
5708 				dprintso(so, 1,
5709 				    ("socket_setsockopt: setting 0x%x\n",
5710 				    option_name));
5711 				so->so_options |= option_name;
5712 			} else {
5713 				dprintso(so, 1,
5714 				    ("socket_setsockopt: clearing 0x%x\n",
5715 				    option_name));
5716 				so->so_options &= ~option_name;
5717 			}
5718 			break;
5719 		/*
5720 		 * The following options are only returned by us when the
5721 		 * transport layer fails.
5722 		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5723 		 * since the transport might adjust the value and not
5724 		 * return exactly what was set by the application.
5725 		 */
5726 		case SO_SNDBUF:
5727 			so->so_sndbuf = intvalue;
5728 			break;
5729 		case SO_RCVBUF:
5730 			so->so_rcvbuf = intvalue;
5731 			break;
5732 		case SO_RCVPSH:
5733 			so->so_rcv_timer_interval = intvalue;
5734 			break;
5735 #ifdef notyet
5736 		/*
5737 		 * We do not implement the semantics of these options
5738 		 * thus we shouldn't implement the options either.
5739 		 */
5740 		case SO_SNDLOWAT:
5741 			so->so_sndlowat = intvalue;
5742 			break;
5743 		case SO_RCVLOWAT:
5744 			so->so_rcvlowat = intvalue;
5745 			break;
5746 #endif /* notyet */
5747 		case SO_SNDTIMEO:
5748 		case SO_RCVTIMEO: {
5749 			struct timeval tl;
5750 			clock_t val;
5751 
5752 			if (get_udatamodel() == DATAMODEL_NONE ||
5753 			    get_udatamodel() == DATAMODEL_NATIVE)
5754 				bcopy(&tl, (struct timeval *)optval,
5755 				    sizeof (struct timeval));
5756 			else
5757 				TIMEVAL32_TO_TIMEVAL(&tl,
5758 				    (struct timeval32 *)optval);
5759 			val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5760 			if (option_name == SO_RCVTIMEO)
5761 				so->so_rcvtimeo = drv_usectohz(val);
5762 			else
5763 				so->so_sndtimeo = drv_usectohz(val);
5764 			break;
5765 		}
5766 		}
5767 #undef	intvalue
5768 
5769 		if (error) {
5770 			if ((error == ENOPROTOOPT || error == EPROTO ||
5771 			    error == EINVAL) && handled) {
5772 				dprintso(so, 1,
5773 				    ("setsockopt: ignoring error %d for 0x%x\n",
5774 				    error, option_name));
5775 				error = 0;
5776 			}
5777 		}
5778 	}
5779 done2:
5780 	so_unlock_single(so, SOLOCKED);
5781 	mutex_exit(&so->so_lock);
5782 	return (error);
5783 }
5784 
5785 /*
5786  * sotpi_close() is called when the last open reference goes away.
5787  */
5788 /* ARGSUSED */
5789 int
sotpi_close(struct sonode * so,int flag,struct cred * cr)5790 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5791 {
5792 	struct vnode *vp = SOTOV(so);
5793 	dev_t dev;
5794 	int error = 0;
5795 	sotpi_info_t *sti = SOTOTPI(so);
5796 
5797 	dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5798 	    (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5799 
5800 	dev = sti->sti_dev;
5801 
5802 	ASSERT(STREAMSTAB(getmajor(dev)));
5803 
5804 	mutex_enter(&so->so_lock);
5805 	so_lock_single(so);	/* Set SOLOCKED */
5806 
5807 	ASSERT(so_verify_oobstate(so));
5808 
5809 	if (sti->sti_nl7c_flags & NL7C_ENABLED) {
5810 		sti->sti_nl7c_flags = 0;
5811 		nl7c_close(so);
5812 	}
5813 
5814 	if (vp->v_stream != NULL) {
5815 		vnode_t *ux_vp;
5816 
5817 		if (so->so_family == AF_UNIX) {
5818 			/* Could avoid this when CANTSENDMORE for !dgram */
5819 			so_unix_close(so);
5820 		}
5821 
5822 		mutex_exit(&so->so_lock);
5823 		/*
5824 		 * Disassemble the linkage from the AF_UNIX underlying file
5825 		 * system vnode to this socket (by atomically clearing
5826 		 * v_stream in vn_rele_stream) before strclose clears sd_vnode
5827 		 * and frees the stream head.
5828 		 */
5829 		if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5830 			ASSERT(ux_vp->v_stream);
5831 			sti->sti_ux_bound_vp = NULL;
5832 			vn_rele_stream(ux_vp);
5833 		}
5834 		error = strclose(vp, flag, cr);
5835 		vp->v_stream = NULL;
5836 		mutex_enter(&so->so_lock);
5837 	}
5838 
5839 	/*
5840 	 * Flush the T_DISCON_IND on sti_discon_ind_mp.
5841 	 */
5842 	so_flush_discon_ind(so);
5843 
5844 	so_unlock_single(so, SOLOCKED);
5845 	mutex_exit(&so->so_lock);
5846 
5847 	/*
5848 	 * Needed for STREAMs.
5849 	 * Decrement the device driver's reference count for streams
5850 	 * opened via the clone dip. The driver was held in clone_open().
5851 	 * The absence of clone_close() forces this asymmetry.
5852 	 */
5853 	if (so->so_flag & SOCLONE)
5854 		ddi_rele_driver(getmajor(dev));
5855 
5856 	return (error);
5857 }
5858 
5859 static int
sotpi_ioctl(struct sonode * so,int cmd,intptr_t arg,int mode,struct cred * cr,int32_t * rvalp)5860 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
5861     struct cred *cr, int32_t *rvalp)
5862 {
5863 	struct vnode *vp = SOTOV(so);
5864 	sotpi_info_t *sti = SOTOTPI(so);
5865 	int error = 0;
5866 
5867 	dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
5868 	    cmd, arg, pr_state(so->so_state, so->so_mode)));
5869 
5870 	switch (cmd) {
5871 	case SIOCSQPTR:
5872 		/*
5873 		 * SIOCSQPTR is valid only when helper stream is created
5874 		 * by the protocol.
5875 		 */
5876 	case _I_INSERT:
5877 	case _I_REMOVE:
5878 		/*
5879 		 * Since there's no compelling reason to support these ioctls
5880 		 * on sockets, and doing so would increase the complexity
5881 		 * markedly, prevent it.
5882 		 */
5883 		return (EOPNOTSUPP);
5884 
5885 	case I_FIND:
5886 	case I_LIST:
5887 	case I_LOOK:
5888 	case I_POP:
5889 	case I_PUSH:
5890 		/*
5891 		 * To prevent races and inconsistencies between the actual
5892 		 * state of the stream and the state according to the sonode,
5893 		 * we serialize all operations which modify or operate on the
5894 		 * list of modules on the socket's stream.
5895 		 */
5896 		mutex_enter(&sti->sti_plumb_lock);
5897 		error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
5898 		mutex_exit(&sti->sti_plumb_lock);
5899 		return (error);
5900 
5901 	default:
5902 		if (so->so_version != SOV_STREAM)
5903 			break;
5904 
5905 		/*
5906 		 * The imaginary "sockmod" has been popped; act as a stream.
5907 		 */
5908 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5909 	}
5910 
5911 	ASSERT(so->so_version != SOV_STREAM);
5912 
5913 	/*
5914 	 * Process socket-specific ioctls.
5915 	 */
5916 	switch (cmd) {
5917 	case FIONBIO: {
5918 		int32_t value;
5919 
5920 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
5921 		    (mode & (int)FKIOCTL)))
5922 			return (EFAULT);
5923 
5924 		mutex_enter(&so->so_lock);
5925 		if (value) {
5926 			so->so_state |= SS_NDELAY;
5927 		} else {
5928 			so->so_state &= ~SS_NDELAY;
5929 		}
5930 		mutex_exit(&so->so_lock);
5931 		return (0);
5932 	}
5933 
5934 	case FIOASYNC: {
5935 		int32_t value;
5936 
5937 		if (so_copyin((void *)arg, &value, sizeof (int32_t),
5938 		    (mode & (int)FKIOCTL)))
5939 			return (EFAULT);
5940 
5941 		mutex_enter(&so->so_lock);
5942 		/*
5943 		 * SS_ASYNC flag not already set correctly?
5944 		 * (!value != !(so->so_state & SS_ASYNC))
5945 		 * but some engineers find that too hard to read.
5946 		 */
5947 		if (value == 0 && (so->so_state & SS_ASYNC) != 0 ||
5948 		    value != 0 && (so->so_state & SS_ASYNC) == 0)
5949 			error = so_flip_async(so, vp, mode, cr);
5950 		mutex_exit(&so->so_lock);
5951 		return (error);
5952 	}
5953 
5954 	case SIOCSPGRP:
5955 	case FIOSETOWN: {
5956 		pid_t pgrp;
5957 
5958 		if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
5959 		    (mode & (int)FKIOCTL)))
5960 			return (EFAULT);
5961 
5962 		mutex_enter(&so->so_lock);
5963 		dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
5964 		/* Any change? */
5965 		if (pgrp != so->so_pgrp)
5966 			error = so_set_siggrp(so, vp, pgrp, mode, cr);
5967 		mutex_exit(&so->so_lock);
5968 		return (error);
5969 	}
5970 	case SIOCGPGRP:
5971 	case FIOGETOWN:
5972 		if (so_copyout(&so->so_pgrp, (void *)arg,
5973 		    sizeof (pid_t), (mode & (int)FKIOCTL)))
5974 			return (EFAULT);
5975 		return (0);
5976 
5977 	case SIOCATMARK: {
5978 		int retval;
5979 		uint_t so_state;
5980 
5981 		/*
5982 		 * strwaitmark has a finite timeout after which it
5983 		 * returns -1 if the mark state is undetermined.
5984 		 * In order to avoid any race between the mark state
5985 		 * in sockfs and the mark state in the stream head this
5986 		 * routine loops until the mark state can be determined
5987 		 * (or the urgent data indication has been removed by some
5988 		 * other thread).
5989 		 */
5990 		do {
5991 			mutex_enter(&so->so_lock);
5992 			so_state = so->so_state;
5993 			mutex_exit(&so->so_lock);
5994 			if (so_state & SS_RCVATMARK) {
5995 				retval = 1;
5996 			} else if (!(so_state & SS_OOBPEND)) {
5997 				/*
5998 				 * No SIGURG has been generated -- there is no
5999 				 * pending or present urgent data. Thus can't
6000 				 * possibly be at the mark.
6001 				 */
6002 				retval = 0;
6003 			} else {
6004 				/*
6005 				 * Have the stream head wait until there is
6006 				 * either some messages on the read queue, or
6007 				 * STRATMARK or STRNOTATMARK gets set. The
6008 				 * STRNOTATMARK flag is used so that the
6009 				 * transport can send up a MSGNOTMARKNEXT
6010 				 * M_DATA to indicate that it is not
6011 				 * at the mark and additional data is not about
6012 				 * to be send upstream.
6013 				 *
6014 				 * If the mark state is undetermined this will
6015 				 * return -1 and we will loop rechecking the
6016 				 * socket state.
6017 				 */
6018 				retval = strwaitmark(vp);
6019 			}
6020 		} while (retval == -1);
6021 
6022 		if (so_copyout(&retval, (void *)arg, sizeof (int),
6023 		    (mode & (int)FKIOCTL)))
6024 			return (EFAULT);
6025 		return (0);
6026 	}
6027 
6028 	case I_FDINSERT:
6029 	case I_SENDFD:
6030 	case I_RECVFD:
6031 	case I_ATMARK:
6032 	case _SIOCSOCKFALLBACK:
6033 		/*
6034 		 * These ioctls do not apply to sockets. I_FDINSERT can be
6035 		 * used to send M_PROTO messages without modifying the socket
6036 		 * state. I_SENDFD/RECVFD should not be used for socket file
6037 		 * descriptor passing since they assume a twisted stream.
6038 		 * SIOCATMARK must be used instead of I_ATMARK.
6039 		 *
6040 		 * _SIOCSOCKFALLBACK from an application should never be
6041 		 * processed.  It is only generated by socktpi_open() or
6042 		 * in response to I_POP or I_PUSH.
6043 		 */
6044 #ifdef DEBUG
6045 		zcmn_err(getzoneid(), CE_WARN,
6046 		    "Unsupported STREAMS ioctl 0x%x on socket. "
6047 		    "Pid = %d\n", cmd, curproc->p_pid);
6048 #endif /* DEBUG */
6049 		return (EOPNOTSUPP);
6050 
6051 	case _I_GETPEERCRED:
6052 		if ((mode & FKIOCTL) == 0)
6053 			return (EINVAL);
6054 
6055 		mutex_enter(&so->so_lock);
6056 		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
6057 			error = ENOTSUP;
6058 		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
6059 			error = ENOTCONN;
6060 		} else if (so->so_peercred != NULL) {
6061 			k_peercred_t *kp = (k_peercred_t *)arg;
6062 			kp->pc_cr = so->so_peercred;
6063 			kp->pc_cpid = so->so_cpid;
6064 			crhold(so->so_peercred);
6065 		} else {
6066 			error = EINVAL;
6067 		}
6068 		mutex_exit(&so->so_lock);
6069 		return (error);
6070 
6071 	default:
6072 		/*
6073 		 * Do the higher-order bits of the ioctl cmd indicate
6074 		 * that it is an I_* streams ioctl?
6075 		 */
6076 		if ((cmd & 0xffffff00U) == STR &&
6077 		    so->so_version == SOV_SOCKBSD) {
6078 #ifdef DEBUG
6079 			zcmn_err(getzoneid(), CE_WARN,
6080 			    "Unsupported STREAMS ioctl 0x%x on socket. "
6081 			    "Pid = %d\n", cmd, 	curproc->p_pid);
6082 #endif /* DEBUG */
6083 			return (EOPNOTSUPP);
6084 		}
6085 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6086 	}
6087 }
6088 
6089 /*
6090  * Handle plumbing-related ioctls.
6091  */
6092 static int
socktpi_plumbioctl(struct vnode * vp,int cmd,intptr_t arg,int mode,struct cred * cr,int32_t * rvalp)6093 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
6094     struct cred *cr, int32_t *rvalp)
6095 {
6096 	static const char sockmod_name[] = "sockmod";
6097 	struct sonode	*so = VTOSO(vp);
6098 	char		mname[FMNAMESZ + 1];
6099 	int		error;
6100 	sotpi_info_t	*sti = SOTOTPI(so);
6101 
6102 	ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
6103 
6104 	if (so->so_version == SOV_SOCKBSD)
6105 		return (EOPNOTSUPP);
6106 
6107 	if (so->so_version == SOV_STREAM) {
6108 		/*
6109 		 * The imaginary "sockmod" has been popped - act as a stream.
6110 		 * If this is a push of sockmod then change back to a socket.
6111 		 */
6112 		if (cmd == I_PUSH) {
6113 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6114 			    (void *)arg, mname, sizeof (mname), NULL);
6115 
6116 			if (error == 0 && strcmp(mname, sockmod_name) == 0) {
6117 				dprintso(so, 0, ("socktpi_ioctl: going to "
6118 				    "socket version\n"));
6119 				so_stream2sock(so);
6120 				return (0);
6121 			}
6122 		}
6123 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6124 	}
6125 
6126 	switch (cmd) {
6127 	case I_PUSH:
6128 		if (sti->sti_direct) {
6129 			mutex_enter(&so->so_lock);
6130 			so_lock_single(so);
6131 			mutex_exit(&so->so_lock);
6132 
6133 			error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
6134 			    cr, rvalp);
6135 
6136 			mutex_enter(&so->so_lock);
6137 			if (error == 0)
6138 				sti->sti_direct = 0;
6139 			so_unlock_single(so, SOLOCKED);
6140 			mutex_exit(&so->so_lock);
6141 
6142 			if (error != 0)
6143 				return (error);
6144 		}
6145 
6146 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6147 		if (error == 0)
6148 			sti->sti_pushcnt++;
6149 		return (error);
6150 
6151 	case I_POP:
6152 		if (sti->sti_pushcnt == 0) {
6153 			/* Emulate sockmod being popped */
6154 			dprintso(so, 0,
6155 			    ("socktpi_ioctl: going to STREAMS version\n"));
6156 			return (so_sock2stream(so));
6157 		}
6158 
6159 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6160 		if (error == 0)
6161 			sti->sti_pushcnt--;
6162 		return (error);
6163 
6164 	case I_LIST: {
6165 		struct str_mlist *kmlistp, *umlistp;
6166 		struct str_list	kstrlist;
6167 		ssize_t		kstrlistsize;
6168 		int		i, nmods;
6169 
6170 		STRUCT_DECL(str_list, ustrlist);
6171 		STRUCT_INIT(ustrlist, mode);
6172 
6173 		if (arg == NULL) {
6174 			error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6175 			if (error == 0)
6176 				(*rvalp)++;	/* Add one for sockmod */
6177 			return (error);
6178 		}
6179 
6180 		error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
6181 		    STRUCT_SIZE(ustrlist), mode & FKIOCTL);
6182 		if (error != 0)
6183 			return (error);
6184 
6185 		nmods = STRUCT_FGET(ustrlist, sl_nmods);
6186 		if (nmods <= 0)
6187 			return (EINVAL);
6188 		/*
6189 		 * Ceiling nmods at nstrpush to prevent someone from
6190 		 * maliciously consuming lots of kernel memory.
6191 		 */
6192 		nmods = MIN(nmods, nstrpush);
6193 
6194 		kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6195 		kstrlist.sl_nmods = nmods;
6196 		kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6197 
6198 		error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6199 		    cr, rvalp);
6200 		if (error != 0)
6201 			goto done;
6202 
6203 		/*
6204 		 * Considering the module list as a 0-based array of sl_nmods
6205 		 * modules, sockmod should conceptually exist at slot
6206 		 * sti_pushcnt.  Insert sockmod at this location by sliding all
6207 		 * of the module names after so_pushcnt over by one.  We know
6208 		 * that there will be room to do this since we allocated
6209 		 * sl_modlist with an additional slot.
6210 		 */
6211 		for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6212 			kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6213 
6214 		(void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6215 		kstrlist.sl_nmods++;
6216 
6217 		/*
6218 		 * Copy all of the entries out to ustrlist.
6219 		 */
6220 		kmlistp = kstrlist.sl_modlist;
6221 		umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6222 		for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6223 			error = so_copyout(kmlistp++, umlistp++,
6224 			    sizeof (struct str_mlist), mode & FKIOCTL);
6225 			if (error != 0)
6226 				goto done;
6227 		}
6228 
6229 		error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6230 		    mode & FKIOCTL);
6231 		if (error == 0)
6232 			*rvalp = 0;
6233 	done:
6234 		kmem_free(kstrlist.sl_modlist, kstrlistsize);
6235 		return (error);
6236 	}
6237 	case I_LOOK:
6238 		if (sti->sti_pushcnt == 0) {
6239 			return (so_copyout(sockmod_name, (void *)arg,
6240 			    sizeof (sockmod_name), mode & FKIOCTL));
6241 		}
6242 		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6243 
6244 	case I_FIND:
6245 		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6246 		if (error && error != EINVAL)
6247 			return (error);
6248 
6249 		/* if not found and string was sockmod return 1 */
6250 		if (*rvalp == 0 || error == EINVAL) {
6251 			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6252 			    (void *)arg, mname, sizeof (mname), NULL);
6253 			if (error == ENAMETOOLONG)
6254 				error = EINVAL;
6255 
6256 			if (error == 0 && strcmp(mname, sockmod_name) == 0)
6257 				*rvalp = 1;
6258 		}
6259 		return (error);
6260 
6261 	default:
6262 		panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6263 		break;
6264 	}
6265 
6266 	return (0);
6267 }
6268 
6269 /*
6270  * Wrapper around the streams poll routine that implements socket poll
6271  * semantics.
6272  * The sockfs never calls pollwakeup itself - the stream head take care
6273  * of all pollwakeups. Since sockfs never holds so_lock when calling the
6274  * stream head there can never be a deadlock due to holding so_lock across
6275  * pollwakeup and acquiring so_lock in this routine.
6276  *
6277  * However, since the performance of VOP_POLL is critical we avoid
6278  * acquiring so_lock here. This is based on two assumptions:
6279  *  - The poll implementation holds locks to serialize the VOP_POLL call
6280  *    and a pollwakeup for the same pollhead. This ensures that should
6281  *    e.g. so_state change during a socktpi_poll call the pollwakeup
6282  *    (which strsock_* and strrput conspire to issue) is issued after
6283  *    the state change. Thus the pollwakeup will block until VOP_POLL has
6284  *    returned and then wake up poll and have it call VOP_POLL again.
6285  *  - The reading of so_state without holding so_lock does not result in
6286  *    stale data that is older than the latest state change that has dropped
6287  *    so_lock. This is ensured by the mutex_exit issuing the appropriate
6288  *    memory barrier to force the data into the coherency domain.
6289  */
6290 static int
sotpi_poll(struct sonode * so,short events,int anyyet,short * reventsp,struct pollhead ** phpp)6291 sotpi_poll(
6292 	struct sonode	*so,
6293 	short		events,
6294 	int		anyyet,
6295 	short		*reventsp,
6296 	struct pollhead **phpp)
6297 {
6298 	short origevents = events;
6299 	struct vnode *vp = SOTOV(so);
6300 	int error;
6301 	int so_state = so->so_state;	/* snapshot */
6302 	sotpi_info_t *sti = SOTOTPI(so);
6303 
6304 	dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6305 	    (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6306 
6307 	ASSERT(vp->v_type == VSOCK);
6308 	ASSERT(vp->v_stream != NULL);
6309 
6310 	if (so->so_version == SOV_STREAM) {
6311 		/* The imaginary "sockmod" has been popped - act as a stream */
6312 		return (strpoll(vp->v_stream, events, anyyet,
6313 		    reventsp, phpp));
6314 	}
6315 
6316 	if (!(so_state & SS_ISCONNECTED) &&
6317 	    (so->so_mode & SM_CONNREQUIRED)) {
6318 		/* Not connected yet - turn off write side events */
6319 		events &= ~(POLLOUT|POLLWRBAND);
6320 	}
6321 	/*
6322 	 * Check for errors without calling strpoll if the caller wants them.
6323 	 * In sockets the errors are represented as input/output events
6324 	 * and there is no need to ask the stream head for this information.
6325 	 */
6326 	if (so->so_error != 0 &&
6327 	    ((POLLIN|POLLRDNORM|POLLOUT) & origevents)  != 0) {
6328 		*reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6329 		return (0);
6330 	}
6331 	/*
6332 	 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6333 	 * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
6334 	 * will not trigger a POLLIN event with POLLRDDATA set.
6335 	 * The handling of urgent data (causing POLLRDBAND) is done by
6336 	 * inspecting SS_OOBPEND below.
6337 	 */
6338 	events |= POLLRDDATA;
6339 
6340 	/*
6341 	 * After shutdown(output) a stream head write error is set.
6342 	 * However, we should not return output events.
6343 	 */
6344 	events |= POLLNOERR;
6345 	error = strpoll(vp->v_stream, events, anyyet,
6346 	    reventsp, phpp);
6347 	if (error)
6348 		return (error);
6349 
6350 	ASSERT(!(*reventsp & POLLERR));
6351 
6352 	/*
6353 	 * Notes on T_CONN_IND handling for sockets.
6354 	 *
6355 	 * If strpoll() returned without events, SR_POLLIN is guaranteed
6356 	 * to be set, ensuring any subsequent strrput() runs pollwakeup().
6357 	 *
6358 	 * Since the so_lock is not held, soqueueconnind() may have run
6359 	 * and a T_CONN_IND may be waiting. We now check for any queued
6360 	 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6361 	 * to ensure poll returns.
6362 	 *
6363 	 * However:
6364 	 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6365 	 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6366 	 * the following actions will occur; taken together they ensure the
6367 	 * syscall will return.
6368 	 *
6369 	 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6370 	 *    the accept() was run on a non-blocking socket sowaitconnind()
6371 	 *    may have already returned EWOULDBLOCK, so not be waiting to
6372 	 *    process the message. Additionally socktpi_poll() has probably
6373 	 *    proceeded past the sti_conn_ind_head check below.
6374 	 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6375 	 *    this thread,  however that could occur before poll_common()
6376 	 *    has entered cv_wait.
6377 	 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6378 	 *
6379 	 * Before proceeding to cv_wait() in poll_common() for an event,
6380 	 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6381 	 * and if set, re-calls strpoll() to ensure the late arriving
6382 	 * T_CONN_IND is recognized, and pollsys() returns.
6383 	 */
6384 
6385 	if (sti->sti_conn_ind_head != NULL)
6386 		*reventsp |= (POLLIN|POLLRDNORM) & events;
6387 
6388 	if (so->so_state & SS_CANTRCVMORE) {
6389 		*reventsp |= POLLRDHUP & events;
6390 
6391 		if (so->so_state & SS_CANTSENDMORE)
6392 			*reventsp |= POLLHUP;
6393 	}
6394 
6395 	if (so->so_state & SS_OOBPEND)
6396 		*reventsp |= POLLRDBAND & events;
6397 
6398 	if (sti->sti_nl7c_rcv_mp != NULL) {
6399 		*reventsp |= (POLLIN|POLLRDNORM) & events;
6400 	}
6401 	if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
6402 	    ((POLLIN|POLLRDNORM) & *reventsp)) {
6403 		sti->sti_nl7c_flags |= NL7C_POLLIN;
6404 	}
6405 
6406 	return (0);
6407 }
6408 
6409 /*ARGSUSED*/
6410 static int
socktpi_constructor(void * buf,void * cdrarg,int kmflags)6411 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6412 {
6413 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6414 	int error = 0;
6415 
6416 	error = sonode_constructor(buf, cdrarg, kmflags);
6417 	if (error != 0)
6418 		return (error);
6419 
6420 	error = i_sotpi_info_constructor(&st->st_info);
6421 	if (error != 0)
6422 		sonode_destructor(buf, cdrarg);
6423 
6424 	st->st_sonode.so_priv = &st->st_info;
6425 
6426 	return (error);
6427 }
6428 
6429 /*ARGSUSED1*/
6430 static void
socktpi_destructor(void * buf,void * cdrarg)6431 socktpi_destructor(void *buf, void *cdrarg)
6432 {
6433 	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6434 
6435 	ASSERT(st->st_sonode.so_priv == &st->st_info);
6436 	st->st_sonode.so_priv = NULL;
6437 
6438 	i_sotpi_info_destructor(&st->st_info);
6439 	sonode_destructor(buf, cdrarg);
6440 }
6441 
6442 static int
socktpi_unix_constructor(void * buf,void * cdrarg,int kmflags)6443 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6444 {
6445 	int retval;
6446 
6447 	if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6448 		struct sonode *so = (struct sonode *)buf;
6449 		sotpi_info_t *sti = SOTOTPI(so);
6450 
6451 		mutex_enter(&socklist.sl_lock);
6452 
6453 		sti->sti_next_so = socklist.sl_list;
6454 		sti->sti_prev_so = NULL;
6455 		if (sti->sti_next_so != NULL)
6456 			SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6457 		socklist.sl_list = so;
6458 
6459 		mutex_exit(&socklist.sl_lock);
6460 
6461 	}
6462 	return (retval);
6463 }
6464 
6465 static void
socktpi_unix_destructor(void * buf,void * cdrarg)6466 socktpi_unix_destructor(void *buf, void *cdrarg)
6467 {
6468 	struct sonode	*so = (struct sonode *)buf;
6469 	sotpi_info_t	*sti = SOTOTPI(so);
6470 
6471 	mutex_enter(&socklist.sl_lock);
6472 
6473 	if (sti->sti_next_so != NULL)
6474 		SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6475 	if (sti->sti_prev_so != NULL)
6476 		SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6477 	else
6478 		socklist.sl_list = sti->sti_next_so;
6479 
6480 	mutex_exit(&socklist.sl_lock);
6481 
6482 	socktpi_destructor(buf, cdrarg);
6483 }
6484 
6485 int
socktpi_init(void)6486 socktpi_init(void)
6487 {
6488 	/*
6489 	 * Create sonode caches.  We create a special one for AF_UNIX so
6490 	 * that we can track them for netstat(1m).
6491 	 */
6492 	socktpi_cache = kmem_cache_create("socktpi_cache",
6493 	    sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6494 	    socktpi_destructor, NULL, NULL, NULL, 0);
6495 
6496 	socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6497 	    sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6498 	    socktpi_unix_destructor, NULL, NULL, NULL, 0);
6499 
6500 	return (0);
6501 }
6502 
6503 /*
6504  * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6505  *
6506  * Caller must still update state and mode using sotpi_update_state().
6507  */
6508 int
sotpi_convert_sonode(struct sonode * so,struct sockparams * newsp,boolean_t * direct,queue_t ** qp,struct cred * cr)6509 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6510     boolean_t *direct, queue_t **qp, struct cred *cr)
6511 {
6512 	sotpi_info_t *sti;
6513 	struct sockparams *origsp = so->so_sockparams;
6514 	sock_lower_handle_t handle = so->so_proto_handle;
6515 	struct stdata *stp;
6516 	struct vnode *vp;
6517 	queue_t *q;
6518 	int error = 0;
6519 
6520 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6521 	    SS_FALLBACK_PENDING);
6522 	ASSERT(SOCK_IS_NONSTR(so));
6523 
6524 	*qp = NULL;
6525 	*direct = B_FALSE;
6526 	so->so_sockparams = newsp;
6527 	/*
6528 	 * Allocate and initalize fields required by TPI.
6529 	 */
6530 	(void) sotpi_info_create(so, KM_SLEEP);
6531 	sotpi_info_init(so);
6532 
6533 	if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6534 		sotpi_info_fini(so);
6535 		sotpi_info_destroy(so);
6536 		return (error);
6537 	}
6538 	ASSERT(handle == so->so_proto_handle);
6539 	sti = SOTOTPI(so);
6540 	if (sti->sti_direct != 0)
6541 		*direct = B_TRUE;
6542 
6543 	/*
6544 	 * Keep the original sp around so we can properly dispose of the
6545 	 * sonode when the socket is being closed.
6546 	 */
6547 	sti->sti_orig_sp = origsp;
6548 
6549 	so_basic_strinit(so);	/* skips the T_CAPABILITY_REQ */
6550 	so_alloc_addr(so, so->so_max_addr_len);
6551 
6552 	/*
6553 	 * If the application has done a SIOCSPGRP, make sure the
6554 	 * STREAM head is aware. This needs to take place before
6555 	 * the protocol start sending up messages. Otherwise we
6556 	 * might miss to generate SIGPOLL.
6557 	 *
6558 	 * It is possible that the application will receive duplicate
6559 	 * signals if some were already generated for either data or
6560 	 * connection indications.
6561 	 */
6562 	if (so->so_pgrp != 0) {
6563 		if (so_set_events(so, so->so_vnode, cr) != 0)
6564 			so->so_pgrp = 0;
6565 	}
6566 
6567 	/*
6568 	 * Determine which queue to use.
6569 	 */
6570 	vp = SOTOV(so);
6571 	stp = vp->v_stream;
6572 	ASSERT(stp != NULL);
6573 	q = stp->sd_wrq->q_next;
6574 
6575 	/*
6576 	 * Skip any modules that may have been auto pushed when the device
6577 	 * was opened
6578 	 */
6579 	while (q->q_next != NULL)
6580 		q = q->q_next;
6581 	*qp = _RD(q);
6582 
6583 	/* This is now a STREAMS sockets */
6584 	so->so_not_str = B_FALSE;
6585 
6586 	return (error);
6587 }
6588 
6589 /*
6590  * Revert a TPI sonode. It is only allowed to revert the sonode during
6591  * the fallback process.
6592  */
6593 void
sotpi_revert_sonode(struct sonode * so,struct cred * cr)6594 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6595 {
6596 	vnode_t *vp = SOTOV(so);
6597 
6598 	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6599 	    SS_FALLBACK_PENDING);
6600 	ASSERT(!SOCK_IS_NONSTR(so));
6601 	ASSERT(vp->v_stream != NULL);
6602 
6603 	strclean(vp);
6604 	(void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6605 
6606 	/*
6607 	 * Restore the original sockparams. The caller is responsible for
6608 	 * dropping the ref to the new sp.
6609 	 */
6610 	so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6611 
6612 	sotpi_info_fini(so);
6613 	sotpi_info_destroy(so);
6614 
6615 	/* This is no longer a STREAMS sockets */
6616 	so->so_not_str = B_TRUE;
6617 }
6618 
6619 void
sotpi_update_state(struct sonode * so,struct T_capability_ack * tcap,struct sockaddr * laddr,socklen_t laddrlen,struct sockaddr * faddr,socklen_t faddrlen,short opts)6620 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6621     struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6622     socklen_t faddrlen, short opts)
6623 {
6624 	sotpi_info_t *sti = SOTOTPI(so);
6625 
6626 	so_proc_tcapability_ack(so, tcap);
6627 
6628 	so->so_options |= opts;
6629 
6630 	/*
6631 	 * Determine whether the foreign and local address are valid
6632 	 */
6633 	if (laddrlen != 0) {
6634 		ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6635 		sti->sti_laddr_len = laddrlen;
6636 		bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6637 		sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6638 	}
6639 
6640 	if (faddrlen != 0) {
6641 		ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6642 		sti->sti_faddr_len = faddrlen;
6643 		bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6644 		sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6645 	}
6646 
6647 }
6648 
6649 /*
6650  * Allocate enough space to cache the local and foreign addresses.
6651  */
6652 void
so_alloc_addr(struct sonode * so,t_uscalar_t maxlen)6653 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6654 {
6655 	sotpi_info_t *sti = SOTOTPI(so);
6656 
6657 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6658 	ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6659 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6660 	    P2ROUNDUP(maxlen, KMEM_ALIGN);
6661 	so->so_max_addr_len = sti->sti_laddr_maxlen;
6662 	sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6663 	sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6664 	    + sti->sti_laddr_maxlen);
6665 
6666 	if (so->so_family == AF_UNIX) {
6667 		/*
6668 		 * Initialize AF_UNIX related fields.
6669 		 */
6670 		bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6671 		bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6672 	}
6673 }
6674 
6675 
6676 sotpi_info_t *
sotpi_sototpi(struct sonode * so)6677 sotpi_sototpi(struct sonode *so)
6678 {
6679 	sotpi_info_t *sti;
6680 
6681 	ASSERT(so != NULL);
6682 
6683 	sti = (sotpi_info_t *)so->so_priv;
6684 
6685 	ASSERT(sti != NULL);
6686 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6687 
6688 	return (sti);
6689 }
6690 
6691 static int
i_sotpi_info_constructor(sotpi_info_t * sti)6692 i_sotpi_info_constructor(sotpi_info_t *sti)
6693 {
6694 	sti->sti_magic		= SOTPI_INFO_MAGIC;
6695 	sti->sti_ack_mp		= NULL;
6696 	sti->sti_discon_ind_mp	= NULL;
6697 	sti->sti_ux_bound_vp	= NULL;
6698 	sti->sti_unbind_mp	= NULL;
6699 
6700 	sti->sti_conn_ind_head	= NULL;
6701 	sti->sti_conn_ind_tail	= NULL;
6702 
6703 	sti->sti_laddr_sa	= NULL;
6704 	sti->sti_faddr_sa	= NULL;
6705 
6706 	sti->sti_nl7c_flags	= 0;
6707 	sti->sti_nl7c_uri	= NULL;
6708 	sti->sti_nl7c_rcv_mp	= NULL;
6709 
6710 	mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6711 	cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6712 
6713 	return (0);
6714 }
6715 
6716 static void
i_sotpi_info_destructor(sotpi_info_t * sti)6717 i_sotpi_info_destructor(sotpi_info_t *sti)
6718 {
6719 	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6720 	ASSERT(sti->sti_ack_mp == NULL);
6721 	ASSERT(sti->sti_discon_ind_mp == NULL);
6722 	ASSERT(sti->sti_ux_bound_vp == NULL);
6723 	ASSERT(sti->sti_unbind_mp == NULL);
6724 
6725 	ASSERT(sti->sti_conn_ind_head == NULL);
6726 	ASSERT(sti->sti_conn_ind_tail == NULL);
6727 
6728 	ASSERT(sti->sti_laddr_sa == NULL);
6729 	ASSERT(sti->sti_faddr_sa == NULL);
6730 
6731 	ASSERT(sti->sti_nl7c_flags == 0);
6732 	ASSERT(sti->sti_nl7c_uri == NULL);
6733 	ASSERT(sti->sti_nl7c_rcv_mp == NULL);
6734 
6735 	mutex_destroy(&sti->sti_plumb_lock);
6736 	cv_destroy(&sti->sti_ack_cv);
6737 }
6738 
6739 /*
6740  * Creates and attaches TPI information to the given sonode
6741  */
6742 static boolean_t
sotpi_info_create(struct sonode * so,int kmflags)6743 sotpi_info_create(struct sonode *so, int kmflags)
6744 {
6745 	sotpi_info_t *sti;
6746 
6747 	ASSERT(so->so_priv == NULL);
6748 
6749 	if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6750 		return (B_FALSE);
6751 
6752 	if (i_sotpi_info_constructor(sti) != 0) {
6753 		kmem_free(sti, sizeof (*sti));
6754 		return (B_FALSE);
6755 	}
6756 
6757 	so->so_priv = (void *)sti;
6758 	return (B_TRUE);
6759 }
6760 
6761 /*
6762  * Initializes the TPI information.
6763  */
6764 static void
sotpi_info_init(struct sonode * so)6765 sotpi_info_init(struct sonode *so)
6766 {
6767 	struct vnode *vp = SOTOV(so);
6768 	sotpi_info_t *sti = SOTOTPI(so);
6769 	time_t now;
6770 
6771 	sti->sti_dev 	= so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6772 	vp->v_rdev	= sti->sti_dev;
6773 
6774 	sti->sti_orig_sp = NULL;
6775 
6776 	sti->sti_pushcnt = 0;
6777 
6778 	now = gethrestime_sec();
6779 	sti->sti_atime	= now;
6780 	sti->sti_mtime	= now;
6781 	sti->sti_ctime	= now;
6782 
6783 	sti->sti_eaddr_mp = NULL;
6784 	sti->sti_delayed_error = 0;
6785 
6786 	sti->sti_provinfo = NULL;
6787 
6788 	sti->sti_oobcnt = 0;
6789 	sti->sti_oobsigcnt = 0;
6790 
6791 	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6792 
6793 	sti->sti_laddr_sa	= 0;
6794 	sti->sti_faddr_sa	= 0;
6795 	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6796 	sti->sti_laddr_len = sti->sti_faddr_len = 0;
6797 
6798 	sti->sti_laddr_valid = 0;
6799 	sti->sti_faddr_valid = 0;
6800 	sti->sti_faddr_noxlate = 0;
6801 
6802 	sti->sti_direct = 0;
6803 
6804 	ASSERT(sti->sti_ack_mp == NULL);
6805 	ASSERT(sti->sti_ux_bound_vp == NULL);
6806 	ASSERT(sti->sti_unbind_mp == NULL);
6807 
6808 	ASSERT(sti->sti_conn_ind_head == NULL);
6809 	ASSERT(sti->sti_conn_ind_tail == NULL);
6810 }
6811 
6812 /*
6813  * Given a sonode, grab the TPI info and free any data.
6814  */
6815 static void
sotpi_info_fini(struct sonode * so)6816 sotpi_info_fini(struct sonode *so)
6817 {
6818 	sotpi_info_t *sti = SOTOTPI(so);
6819 	mblk_t *mp;
6820 
6821 	ASSERT(sti->sti_discon_ind_mp == NULL);
6822 
6823 	if ((mp = sti->sti_conn_ind_head) != NULL) {
6824 		mblk_t *mp1;
6825 
6826 		while (mp) {
6827 			mp1 = mp->b_next;
6828 			mp->b_next = NULL;
6829 			freemsg(mp);
6830 			mp = mp1;
6831 		}
6832 		sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
6833 	}
6834 
6835 	/*
6836 	 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
6837 	 * indirect them.  It also uses so_count as a validity test.
6838 	 */
6839 	mutex_enter(&so->so_lock);
6840 
6841 	if (sti->sti_laddr_sa) {
6842 		ASSERT((caddr_t)sti->sti_faddr_sa ==
6843 		    (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
6844 		ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
6845 		sti->sti_laddr_valid = 0;
6846 		sti->sti_faddr_valid = 0;
6847 		kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
6848 		sti->sti_laddr_sa = NULL;
6849 		sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
6850 		sti->sti_faddr_sa = NULL;
6851 		sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
6852 	}
6853 
6854 	mutex_exit(&so->so_lock);
6855 
6856 	if ((mp = sti->sti_eaddr_mp) != NULL) {
6857 		freemsg(mp);
6858 		sti->sti_eaddr_mp = NULL;
6859 		sti->sti_delayed_error = 0;
6860 	}
6861 
6862 	if ((mp = sti->sti_ack_mp) != NULL) {
6863 		freemsg(mp);
6864 		sti->sti_ack_mp = NULL;
6865 	}
6866 
6867 	if ((mp = sti->sti_nl7c_rcv_mp) != NULL) {
6868 		sti->sti_nl7c_rcv_mp = NULL;
6869 		freemsg(mp);
6870 	}
6871 	sti->sti_nl7c_rcv_rval = 0;
6872 	if (sti->sti_nl7c_uri != NULL) {
6873 		nl7c_urifree(so);
6874 		/* urifree() cleared nl7c_uri */
6875 	}
6876 	if (sti->sti_nl7c_flags) {
6877 		sti->sti_nl7c_flags = 0;
6878 	}
6879 
6880 	ASSERT(sti->sti_ux_bound_vp == NULL);
6881 	if ((mp = sti->sti_unbind_mp) != NULL) {
6882 		freemsg(mp);
6883 		sti->sti_unbind_mp = NULL;
6884 	}
6885 }
6886 
6887 /*
6888  * Destroys the TPI information attached to a sonode.
6889  */
6890 static void
sotpi_info_destroy(struct sonode * so)6891 sotpi_info_destroy(struct sonode *so)
6892 {
6893 	sotpi_info_t *sti = SOTOTPI(so);
6894 
6895 	i_sotpi_info_destructor(sti);
6896 	kmem_free(sti, sizeof (*sti));
6897 
6898 	so->so_priv = NULL;
6899 }
6900 
6901 /*
6902  * Create the global sotpi socket module entry. It will never be freed.
6903  */
6904 smod_info_t *
sotpi_smod_create(void)6905 sotpi_smod_create(void)
6906 {
6907 	smod_info_t *smodp;
6908 
6909 	smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
6910 	smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
6911 	(void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
6912 	/*
6913 	 * Initialize the smod_refcnt to 1 so it will never be freed.
6914 	 */
6915 	smodp->smod_refcnt = 1;
6916 	smodp->smod_uc_version = SOCK_UC_VERSION;
6917 	smodp->smod_dc_version = SOCK_DC_VERSION;
6918 	smodp->smod_sock_create_func = &sotpi_create;
6919 	smodp->smod_sock_destroy_func = &sotpi_destroy;
6920 	return (smodp);
6921 }
6922