xref: /titanic_50/usr/src/uts/common/fs/sockfs/sockcommon_sops.c (revision 3a7782fe8269426104107f8b4144794a995733f0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"@(#)sockcommon_sops.c	1.1	07/06/14 SMI"
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/sysmacros.h>
33 #include <sys/debug.h>
34 #include <sys/cmn_err.h>
35 
36 #include <sys/stropts.h>
37 #include <sys/socket.h>
38 #include <sys/socketvar.h>
39 
40 #define	_SUN_TPI_VERSION	2
41 #include <sys/tihdr.h>
42 #include <sys/sockio.h>
43 #include <sys/sodirect.h>
44 #include <sys/kmem_impl.h>
45 
46 #include <sys/strsubr.h>
47 #include <sys/strsun.h>
48 #include <sys/ddi.h>
49 #include <netinet/in.h>
50 #include <inet/ip.h>
51 
52 #include <fs/sockfs/sockcommon.h>
53 
54 #include <sys/socket_proto.h>
55 
56 #include <fs/sockfs/socktpi_impl.h>
57 #include <sys/tihdr.h>
58 #include <fs/sockfs/nl7c.h>
59 #include <inet/kssl/ksslapi.h>
60 
61 
62 extern int xnet_skip_checks;
63 extern int xnet_check_print;
64 
65 static void so_queue_oob(sock_upper_handle_t, mblk_t *, size_t);
66 
67 
/*
 * accept() entry point for sockets that do not support the operation.
 * Every argument is ignored; the call always fails with EOPNOTSUPP.
 */
/*ARGSUSED*/
int
so_accept_notsupp(struct sonode *lso, int fflag, struct cred *cr,
    struct sonode **nsop)
{
	int rv = EOPNOTSUPP;

	return (rv);
}
75 
/*
 * listen() entry point for sockets that do not support the operation.
 * Every argument is ignored; the call always fails with EOPNOTSUPP.
 */
/*ARGSUSED*/
int
so_listen_notsupp(struct sonode *so, int backlog, struct cred *cr)
{
	int rv = EOPNOTSUPP;

	return (rv);
}
82 
/*
 * getsockname() entry point for sockets that do not support the
 * operation. Neither `sa' nor `len' is touched; the call always fails
 * with EOPNOTSUPP.
 */
/*ARGSUSED*/
int
so_getsockname_notsupp(struct sonode *so, struct sockaddr *sa, socklen_t *len,
    struct cred *cr)
{
	int rv = EOPNOTSUPP;

	return (rv);
}
90 
91 /*ARGSUSED*/
92 int
93 so_getpeername_notsupp(struct sonode *so, struct sockaddr *addr,
94     socklen_t *addrlen, boolean_t accept, struct cred *cr)
95 {
96 	return (EOPNOTSUPP);
97 }
98 
/*
 * shutdown() entry point for sockets that do not support the operation.
 * Every argument is ignored; the call always fails with EOPNOTSUPP.
 */
/*ARGSUSED*/
int
so_shutdown_notsupp(struct sonode *so, int how, struct cred *cr)
{
	int rv = EOPNOTSUPP;

	return (rv);
}
105 
106 /*ARGSUSED*/
107 int
108 so_sendmblk_notsupp(struct sonode *so, struct msghdr *msg, int fflag,
109     struct cred *cr, mblk_t **mpp)
110 {
111 	return (EOPNOTSUPP);
112 }
113 
114 /*
115  * Generic Socket Ops
116  */
117 
/*
 * Generic socket initialization: delegates to socket_init_common() and
 * returns whatever it returns. Note the argument order differs from
 * this function's own (flags precede the credential in the callee).
 */
/* ARGSUSED */
int
so_init(struct sonode *so, struct sonode *pso, struct cred *cr, int flags)
{
	int rv;

	rv = socket_init_common(so, pso, flags, cr);
	return (rv);
}
124 
125 int
126 so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
127     int flags, struct cred *cr)
128 {
129 	int error;
130 
131 	SO_BLOCK_FALLBACK(so, SOP_BIND(so, name, namelen, flags, cr));
132 
133 	ASSERT(flags == _SOBIND_XPG4_2 || flags == _SOBIND_SOCKBSD);
134 
135 	/* X/Open requires this check */
136 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
137 		if (xnet_check_print) {
138 			printf("sockfs: X/Open bind state check "
139 			    "caused EINVAL\n");
140 		}
141 		error = EINVAL;
142 		goto done;
143 	}
144 
145 	/*
146 	 * a bind to a NULL address is interpreted as unbind. So just
147 	 * do the downcall.
148 	 */
149 	if (name == NULL)
150 		goto dobind;
151 
152 	switch (so->so_family) {
153 	case AF_INET:
154 		if ((size_t)namelen != sizeof (sin_t)) {
155 			error = name->sa_family != so->so_family ?
156 			    EAFNOSUPPORT : EINVAL;
157 			eprintsoline(so, error);
158 			goto done;
159 		}
160 
161 		if ((flags & _SOBIND_XPG4_2) &&
162 		    (name->sa_family != so->so_family)) {
163 			/*
164 			 * This check has to be made for X/Open
165 			 * sockets however application failures have
166 			 * been observed when it is applied to
167 			 * all sockets.
168 			 */
169 			error = EAFNOSUPPORT;
170 			eprintsoline(so, error);
171 			goto done;
172 		}
173 		/*
174 		 * Force a zero sa_family to match so_family.
175 		 *
176 		 * Some programs like inetd(1M) don't set the
177 		 * family field. Other programs leave
178 		 * sin_family set to garbage - SunOS 4.X does
179 		 * not check the family field on a bind.
180 		 * We use the family field that
181 		 * was passed in to the socket() call.
182 		 */
183 		name->sa_family = so->so_family;
184 		break;
185 
186 	case AF_INET6: {
187 #ifdef DEBUG
188 		sin6_t *sin6 = (sin6_t *)name;
189 #endif
190 		if ((size_t)namelen != sizeof (sin6_t)) {
191 			error = name->sa_family != so->so_family ?
192 			    EAFNOSUPPORT : EINVAL;
193 			eprintsoline(so, error);
194 			goto done;
195 		}
196 
197 		if (name->sa_family != so->so_family) {
198 			/*
199 			 * With IPv6 we require the family to match
200 			 * unlike in IPv4.
201 			 */
202 			error = EAFNOSUPPORT;
203 			eprintsoline(so, error);
204 			goto done;
205 		}
206 #ifdef DEBUG
207 		/*
208 		 * Verify that apps don't forget to clear
209 		 * sin6_scope_id etc
210 		 */
211 		if (sin6->sin6_scope_id != 0 &&
212 		    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
213 			zcmn_err(getzoneid(), CE_WARN,
214 			    "bind with uninitialized sin6_scope_id "
215 			    "(%d) on socket. Pid = %d\n",
216 			    (int)sin6->sin6_scope_id,
217 			    (int)curproc->p_pid);
218 		}
219 		if (sin6->__sin6_src_id != 0) {
220 			zcmn_err(getzoneid(), CE_WARN,
221 			    "bind with uninitialized __sin6_src_id "
222 			    "(%d) on socket. Pid = %d\n",
223 			    (int)sin6->__sin6_src_id,
224 			    (int)curproc->p_pid);
225 		}
226 #endif /* DEBUG */
227 
228 		break;
229 	}
230 	default:
231 		/* Just pass the request to the protocol */
232 		goto dobind;
233 	}
234 
235 	/*
236 	 * First we check if either NCA or KSSL has been enabled for
237 	 * the requested address, and if so, we fall back to TPI.
238 	 * If neither of those two services are enabled, then we just
239 	 * pass the request to the protocol.
240 	 *
241 	 * Note that KSSL can only be enabled on a socket if NCA is NOT
242 	 * enabled for that socket, hence the else-statement below.
243 	 */
244 	if (nl7c_enabled && ((so->so_family == AF_INET ||
245 	    so->so_family == AF_INET6) &&
246 	    nl7c_lookup_addr(name, namelen) != NULL)) {
247 		/*
248 		 * NL7C is not supported in non-global zones,
249 		 * we enforce this restriction here.
250 		 */
251 		if (so->so_zoneid == GLOBAL_ZONEID) {
252 			/* NCA should be used, so fall back to TPI */
253 			error = so_tpi_fallback(so, cr);
254 			SO_UNBLOCK_FALLBACK(so);
255 			if (error)
256 				return (error);
257 			else
258 				return (SOP_BIND(so, name, namelen, flags, cr));
259 		}
260 	} else if (so->so_type == SOCK_STREAM) {
261 		/* Check if KSSL has been configured for this address */
262 		kssl_ent_t ent;
263 		kssl_endpt_type_t type;
264 		struct T_bind_req bind_req;
265 		mblk_t *mp;
266 
267 		/*
268 		 * TODO: Check with KSSL team if we could add a function call
269 		 * that only queries whether KSSL is enabled for the given
270 		 * address.
271 		 */
272 		bind_req.PRIM_type = T_BIND_REQ;
273 		bind_req.ADDR_length = namelen;
274 		bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
275 		mp = soallocproto2(&bind_req, sizeof (bind_req),
276 		    name, namelen, 0, _ALLOC_SLEEP);
277 
278 		type = kssl_check_proxy(mp, so, &ent);
279 		freemsg(mp);
280 
281 		if (type != KSSL_NO_PROXY) {
282 			/*
283 			 * KSSL has been configured for this address, so
284 			 * we must fall back to TPI.
285 			 */
286 			kssl_release_ent(ent, so, type);
287 			error = so_tpi_fallback(so, cr);
288 			SO_UNBLOCK_FALLBACK(so);
289 			if (error)
290 				return (error);
291 			else
292 				return (SOP_BIND(so, name, namelen, flags, cr));
293 		}
294 	}
295 
296 dobind:
297 	error = (*so->so_downcalls->sd_bind)
298 	    (so->so_proto_handle, name, namelen, cr);
299 done:
300 	SO_UNBLOCK_FALLBACK(so);
301 
302 	return (error);
303 }
304 
305 int
306 so_listen(struct sonode *so, int backlog, struct cred *cr)
307 {
308 	int	error = 0;
309 
310 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
311 	SO_BLOCK_FALLBACK(so, SOP_LISTEN(so, backlog, cr));
312 
313 	error = (*so->so_downcalls->sd_listen)(so->so_proto_handle, backlog,
314 	    cr);
315 
316 	SO_UNBLOCK_FALLBACK(so);
317 
318 	return (error);
319 }
320 
321 
322 int
323 so_connect(struct sonode *so, const struct sockaddr *name,
324     socklen_t namelen, int fflag, int flags, struct cred *cr)
325 {
326 	int error = 0;
327 	sock_connid_t id;
328 
329 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
330 	SO_BLOCK_FALLBACK(so, SOP_CONNECT(so, name, namelen, fflag, flags, cr));
331 
332 	/*
333 	 * If there is a pending error, return error
334 	 * This can happen if a non blocking operation caused an error.
335 	 */
336 
337 	if (so->so_error != 0) {
338 		mutex_enter(&so->so_lock);
339 		error = sogeterr(so, B_TRUE);
340 		mutex_exit(&so->so_lock);
341 		if (error != 0)
342 			goto done;
343 	}
344 
345 	error = (*so->so_downcalls->sd_connect)(so->so_proto_handle,
346 	    name, namelen, &id, cr);
347 
348 	if (error == EINPROGRESS)
349 		error = so_wait_connected(so, fflag & (FNONBLOCK|FNDELAY), id);
350 
351 done:
352 	SO_UNBLOCK_FALLBACK(so);
353 	return (error);
354 }
355 
356 /*ARGSUSED*/
357 int
358 so_accept(struct sonode *so, int fflag, struct cred *cr, struct sonode **nsop)
359 {
360 	int error = 0;
361 	struct sonode *nso;
362 
363 	*nsop = NULL;
364 
365 	SO_BLOCK_FALLBACK(so, SOP_ACCEPT(so, fflag, cr, nsop));
366 	if ((so->so_state & SS_ACCEPTCONN) == 0) {
367 		SO_UNBLOCK_FALLBACK(so);
368 		return ((so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW) ?
369 		    EOPNOTSUPP : EINVAL);
370 	}
371 
372 	if ((error = so_acceptq_dequeue(so, (fflag & (FNONBLOCK|FNDELAY)),
373 	    &nso)) == 0) {
374 		ASSERT(nso != NULL);
375 
376 		/* finish the accept */
377 		error = (*so->so_downcalls->sd_accept)(so->so_proto_handle,
378 		    nso->so_proto_handle, (sock_upper_handle_t)nso, cr);
379 		if (error != 0) {
380 			(void) socket_close(nso, 0, cr);
381 			socket_destroy(nso);
382 		} else {
383 			*nsop = nso;
384 		}
385 	}
386 
387 	SO_UNBLOCK_FALLBACK(so);
388 	return (error);
389 }
390 
391 int
392 so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
393     struct cred *cr)
394 {
395 	int error, flags;
396 	boolean_t dontblock;
397 	ssize_t orig_resid;
398 	mblk_t  *mp;
399 
400 	SO_BLOCK_FALLBACK(so, SOP_SENDMSG(so, msg, uiop, cr));
401 
402 	flags = msg->msg_flags;
403 	error = 0;
404 	dontblock = (flags & MSG_DONTWAIT) ||
405 	    (uiop->uio_fmode & (FNONBLOCK|FNDELAY));
406 
407 	if (!(flags & MSG_XPG4_2) && msg->msg_controllen != 0) {
408 		/*
409 		 * Old way of passing fd's is not supported
410 		 */
411 		SO_UNBLOCK_FALLBACK(so);
412 		return (EOPNOTSUPP);
413 	}
414 
415 	if ((so->so_mode & SM_ATOMIC) &&
416 	    uiop->uio_resid > so->so_proto_props.sopp_maxpsz &&
417 	    so->so_proto_props.sopp_maxpsz != -1) {
418 		SO_UNBLOCK_FALLBACK(so);
419 		return (EMSGSIZE);
420 	}
421 
422 	/*
423 	 * For atomic sends we will only do one iteration.
424 	 */
425 	do {
426 		if (so->so_state & SS_CANTSENDMORE) {
427 			error = EPIPE;
428 			break;
429 		}
430 
431 		if (so->so_error != 0) {
432 			mutex_enter(&so->so_lock);
433 			error = sogeterr(so, B_TRUE);
434 			mutex_exit(&so->so_lock);
435 			if (error != 0)
436 				break;
437 		}
438 
439 		/*
440 		 * Send down OOB messages even if the send path is being
441 		 * flow controlled (assuming the protocol supports OOB data).
442 		 */
443 		if (flags & MSG_OOB) {
444 			if ((so->so_mode & SM_EXDATA) == 0) {
445 				error = EOPNOTSUPP;
446 				break;
447 			}
448 		} else if (so->so_snd_qfull) {
449 			/*
450 			 * Need to wait until the protocol is ready to receive
451 			 * more data for transmission.
452 			 */
453 			if ((error = so_snd_wait_qnotfull(so, dontblock)) != 0)
454 				break;
455 		}
456 
457 		/*
458 		 * Time to send data to the protocol. We either copy the
459 		 * data into mblks or pass the uio directly to the protocol.
460 		 * We decide what to do based on the available down calls.
461 		 */
462 		if (so->so_downcalls->sd_send_uio != NULL) {
463 			error = (*so->so_downcalls->sd_send_uio)
464 			    (so->so_proto_handle, uiop, msg, cr);
465 			if (error != 0)
466 				break;
467 		} else {
468 			/* save the resid in case of failure */
469 			orig_resid = uiop->uio_resid;
470 
471 			if ((mp = socopyinuio(uiop,
472 			    so->so_proto_props.sopp_maxpsz,
473 			    so->so_proto_props.sopp_wroff,
474 			    so->so_proto_props.sopp_maxblk,
475 			    so->so_proto_props.sopp_tail, &error)) == NULL) {
476 				break;
477 			}
478 			ASSERT(uiop->uio_resid >= 0);
479 
480 			error = (*so->so_downcalls->sd_send)
481 			    (so->so_proto_handle, mp, msg, cr);
482 			if (error != 0) {
483 				/*
484 				 * The send failed. We do not have to free the
485 				 * mblks, because that is the protocol's
486 				 * responsibility. However, uio_resid must
487 				 * remain accurate, so adjust that here.
488 				 */
489 				uiop->uio_resid = orig_resid;
490 					break;
491 			}
492 		}
493 	} while (uiop->uio_resid > 0);
494 
495 	SO_UNBLOCK_FALLBACK(so);
496 
497 	return (error);
498 }
499 
500 int
501 so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
502     struct cred *cr, mblk_t **mpp)
503 {
504 	int error;
505 	boolean_t dontblock;
506 	size_t size;
507 	mblk_t *mp = *mpp;
508 
509 	SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp));
510 
511 	error = 0;
512 	dontblock = (msg->msg_flags & MSG_DONTWAIT) ||
513 	    (fflag & (FNONBLOCK|FNDELAY));
514 	size = msgdsize(mp);
515 
516 	if ((so->so_mode & SM_SENDFILESUPP) == 0 ||
517 	    so->so_downcalls->sd_send == NULL) {
518 		SO_UNBLOCK_FALLBACK(so);
519 		return (EOPNOTSUPP);
520 	}
521 
522 	if ((so->so_mode & SM_ATOMIC) &&
523 	    size > so->so_proto_props.sopp_maxpsz &&
524 	    so->so_proto_props.sopp_maxpsz != -1) {
525 		SO_UNBLOCK_FALLBACK(so);
526 		return (EMSGSIZE);
527 	}
528 
529 	while (mp != NULL) {
530 		mblk_t *nmp, *last_mblk;
531 		size_t mlen;
532 
533 		if (so->so_state & SS_CANTSENDMORE) {
534 			error = EPIPE;
535 			break;
536 		}
537 		if (so->so_error != 0) {
538 			mutex_enter(&so->so_lock);
539 			error = sogeterr(so, B_TRUE);
540 			mutex_exit(&so->so_lock);
541 			if (error != 0)
542 				break;
543 		}
544 		if (so->so_snd_qfull) {
545 			/*
546 			 * Need to wait until the protocol is ready to receive
547 			 * more data for transmission.
548 			 */
549 			if ((error = so_snd_wait_qnotfull(so, dontblock)) != 0)
550 				break;
551 		}
552 
553 		/*
554 		 * We only allow so_maxpsz of data to be sent down to
555 		 * the protocol at time.
556 		 */
557 		mlen = MBLKL(mp);
558 		nmp = mp->b_cont;
559 		last_mblk = mp;
560 		while (nmp != NULL) {
561 			mlen += MBLKL(nmp);
562 			if (mlen > so->so_proto_props.sopp_maxpsz) {
563 				last_mblk->b_cont = NULL;
564 				break;
565 			}
566 			last_mblk = nmp;
567 			nmp = nmp->b_cont;
568 		}
569 
570 		error = (*so->so_downcalls->sd_send)
571 		    (so->so_proto_handle, mp, msg, cr);
572 		if (error != 0) {
573 			/*
574 			 * The send failed. The protocol will free the mblks
575 			 * that were sent down. Let the caller deal with the
576 			 * rest.
577 			 */
578 			*mpp = nmp;
579 			break;
580 		}
581 
582 		*mpp = mp = nmp;
583 	}
584 
585 	SO_UNBLOCK_FALLBACK(so);
586 
587 	return (error);
588 }
589 
590 int
591 so_shutdown(struct sonode *so, int how, struct cred *cr)
592 {
593 	int error;
594 
595 	SO_BLOCK_FALLBACK(so, SOP_SHUTDOWN(so, how, cr));
596 
597 	/*
598 	 * SunOS 4.X has no check for datagram sockets.
599 	 * 5.X checks that it is connected (ENOTCONN)
600 	 * X/Open requires that we check the connected state.
601 	 */
602 	if (!(so->so_state & SS_ISCONNECTED)) {
603 		if (!xnet_skip_checks) {
604 			error = ENOTCONN;
605 			if (xnet_check_print) {
606 				printf("sockfs: X/Open shutdown check "
607 				    "caused ENOTCONN\n");
608 			}
609 		}
610 		goto done;
611 	}
612 
613 	error = ((*so->so_downcalls->sd_shutdown)(so->so_proto_handle,
614 	    how, cr));
615 
616 	/*
617 	 * Protocol agreed to shutdown. We need to flush the
618 	 * receive buffer if the receive side is being shutdown.
619 	 */
620 	if (error == 0 && how != SHUT_WR) {
621 		mutex_enter(&so->so_lock);
622 		/* wait for active reader to finish */
623 		(void) so_lock_read(so, 0);
624 
625 		so_rcv_flush(so);
626 
627 		so_unlock_read(so);
628 		mutex_exit(&so->so_lock);
629 	}
630 
631 done:
632 	SO_UNBLOCK_FALLBACK(so);
633 	return (error);
634 }
635 
636 int
637 so_getsockname(struct sonode *so, struct sockaddr *addr,
638     socklen_t *addrlen, struct cred *cr)
639 {
640 	int error;
641 
642 	SO_BLOCK_FALLBACK(so, SOP_GETSOCKNAME(so, addr, addrlen, cr));
643 
644 	error = (*so->so_downcalls->sd_getsockname)
645 	    (so->so_proto_handle, addr, addrlen, cr);
646 
647 	SO_UNBLOCK_FALLBACK(so);
648 	return (error);
649 }
650 
651 int
652 so_getpeername(struct sonode *so, struct sockaddr *addr,
653     socklen_t *addrlen, boolean_t accept, struct cred *cr)
654 {
655 	int error;
656 
657 	SO_BLOCK_FALLBACK(so, SOP_GETPEERNAME(so, addr, addrlen, accept, cr));
658 
659 	if (accept) {
660 		error = (*so->so_downcalls->sd_getpeername)
661 		    (so->so_proto_handle, addr, addrlen, cr);
662 	} else if (!(so->so_state & SS_ISCONNECTED)) {
663 		error = ENOTCONN;
664 	} else if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
665 		/* Added this check for X/Open */
666 		error = EINVAL;
667 		if (xnet_check_print) {
668 			printf("sockfs: X/Open getpeername check => EINVAL\n");
669 		}
670 	} else {
671 		error = (*so->so_downcalls->sd_getpeername)
672 		    (so->so_proto_handle, addr, addrlen, cr);
673 	}
674 
675 	SO_UNBLOCK_FALLBACK(so);
676 	return (error);
677 }
678 
679 int
680 so_getsockopt(struct sonode *so, int level, int option_name,
681     void *optval, socklen_t *optlenp, int flags, struct cred *cr)
682 {
683 	int error = 0;
684 
685 	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
686 	SO_BLOCK_FALLBACK(so,
687 	    SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags, cr));
688 
689 	error = socket_getopt_common(so, level, option_name, optval, optlenp,
690 	    flags);
691 	if (error < 0) {
692 		error = (*so->so_downcalls->sd_getsockopt)
693 		    (so->so_proto_handle, level, option_name, optval, optlenp,
694 		    cr);
695 		if (error ==  ENOPROTOOPT) {
696 			if (level == SOL_SOCKET) {
697 				/*
698 				 * If a protocol does not support a particular
699 				 * socket option, set can fail (not allowed)
700 				 * but get can not fail. This is the previous
701 				 * sockfs bahvior.
702 				 */
703 				switch (option_name) {
704 				case SO_LINGER:
705 					if (*optlenp < (t_uscalar_t)
706 					    sizeof (struct linger)) {
707 						error = EINVAL;
708 						break;
709 					}
710 					error = 0;
711 					bzero(optval, sizeof (struct linger));
712 					*optlenp = sizeof (struct linger);
713 					break;
714 				case SO_RCVTIMEO:
715 				case SO_SNDTIMEO:
716 					if (*optlenp < (t_uscalar_t)
717 					    sizeof (struct timeval)) {
718 						error = EINVAL;
719 						break;
720 					}
721 					error = 0;
722 					bzero(optval, sizeof (struct timeval));
723 					*optlenp = sizeof (struct timeval);
724 					break;
725 				case SO_SND_BUFINFO:
726 					if (*optlenp < (t_uscalar_t)
727 					    sizeof (struct so_snd_bufinfo)) {
728 						error = EINVAL;
729 						break;
730 					}
731 					error = 0;
732 					bzero(optval,
733 					    sizeof (struct so_snd_bufinfo));
734 					*optlenp =
735 					    sizeof (struct so_snd_bufinfo);
736 					break;
737 				case SO_DEBUG:
738 				case SO_REUSEADDR:
739 				case SO_KEEPALIVE:
740 				case SO_DONTROUTE:
741 				case SO_BROADCAST:
742 				case SO_USELOOPBACK:
743 				case SO_OOBINLINE:
744 				case SO_DGRAM_ERRIND:
745 				case SO_SNDBUF:
746 				case SO_RCVBUF:
747 					error = 0;
748 					*((int32_t *)optval) = 0;
749 					*optlenp = sizeof (int32_t);
750 					break;
751 				default:
752 					break;
753 				}
754 			}
755 		}
756 	}
757 
758 	SO_UNBLOCK_FALLBACK(so);
759 	return (error);
760 }
761 
762 int
763 so_setsockopt(struct sonode *so, int level, int option_name,
764     const void *optval, socklen_t optlen, struct cred *cr)
765 {
766 	int error = 0;
767 	struct timeval tl;
768 	const void *opt = optval;
769 
770 	SO_BLOCK_FALLBACK(so,
771 	    SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
772 
773 	/* X/Open requires this check */
774 	if (so->so_state & SS_CANTSENDMORE && !xnet_skip_checks) {
775 		SO_UNBLOCK_FALLBACK(so);
776 		if (xnet_check_print)
777 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
778 		return (EINVAL);
779 	}
780 
781 	if (level == SOL_SOCKET) {
782 		switch (option_name) {
783 		case SO_RCVTIMEO:
784 		case SO_SNDTIMEO: {
785 			/*
786 			 * We pass down these two options to protocol in order
787 			 * to support some third part protocols which need to
788 			 * know them. For those protocols which don't care
789 			 * these two options, simply return 0.
790 			 */
791 			clock_t t_usec;
792 
793 			if (get_udatamodel() == DATAMODEL_NONE ||
794 			    get_udatamodel() == DATAMODEL_NATIVE) {
795 				if (optlen != sizeof (struct timeval)) {
796 					error = EINVAL;
797 					goto done;
798 				}
799 				bcopy((struct timeval *)optval, &tl,
800 				    sizeof (struct timeval));
801 			} else {
802 				if (optlen != sizeof (struct timeval32)) {
803 					error = EINVAL;
804 					goto done;
805 				}
806 				TIMEVAL32_TO_TIMEVAL(&tl,
807 				    (struct timeval32 *)optval);
808 			}
809 			opt = &tl;
810 			optlen = sizeof (tl);
811 			t_usec = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
812 			mutex_enter(&so->so_lock);
813 			if (option_name == SO_RCVTIMEO)
814 				so->so_rcvtimeo = drv_usectohz(t_usec);
815 			else
816 				so->so_sndtimeo = drv_usectohz(t_usec);
817 			mutex_exit(&so->so_lock);
818 			break;
819 		}
820 		case SO_RCVBUF:
821 			/*
822 			 * XXX XPG 4.2 applications retrieve SO_RCVBUF from
823 			 * sockfs since the transport might adjust the value
824 			 * and not return exactly what was set by the
825 			 * application.
826 			 */
827 			so->so_xpg_rcvbuf = *(int32_t *)optval;
828 			break;
829 		}
830 	}
831 	error = (*so->so_downcalls->sd_setsockopt)
832 	    (so->so_proto_handle, level, option_name, opt, optlen, cr);
833 done:
834 	SO_UNBLOCK_FALLBACK(so);
835 	return (error);
836 }
837 
838 int
839 so_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
840     struct cred *cr, int32_t *rvalp)
841 {
842 	int error = 0;
843 
844 	SO_BLOCK_FALLBACK(so, SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
845 
846 	/*
847 	 * If there is a pending error, return error
848 	 * This can happen if a non blocking operation caused an error.
849 	 */
850 	if (so->so_error != 0) {
851 		mutex_enter(&so->so_lock);
852 		error = sogeterr(so, B_TRUE);
853 		mutex_exit(&so->so_lock);
854 		if (error != 0)
855 			goto done;
856 	}
857 
858 	/*
859 	 * calling strioc can result in the socket falling back to TPI,
860 	 * if that is supported.
861 	 */
862 	if ((error = socket_ioctl_common(so, cmd, arg, mode, cr, rvalp)) < 0 &&
863 	    (error = socket_strioc_common(so, cmd, arg, mode, cr, rvalp)) < 0) {
864 		error = (*so->so_downcalls->sd_ioctl)(so->so_proto_handle,
865 		    cmd, arg, mode, rvalp, cr);
866 	}
867 
868 done:
869 	SO_UNBLOCK_FALLBACK(so);
870 
871 	return (error);
872 }
873 
874 int
875 so_poll(struct sonode *so, short events, int anyyet, short *reventsp,
876     struct pollhead **phpp)
877 {
878 	int state = so->so_state;
879 	*reventsp = 0;
880 
881 	if (so->so_error != 0 &&
882 	    ((POLLIN|POLLRDNORM|POLLOUT) & events)  != 0) {
883 		*reventsp = (POLLIN|POLLRDNORM|POLLOUT) & events;
884 		return (0);
885 	}
886 
887 	/*
888 	 * As long as there is buffer to send data, and the socket is
889 	 * in a state where it can send data (i.e., connected for
890 	 * connection oriented protocols), then turn on POLLOUT events
891 	 */
892 	if (!so->so_snd_qfull && ((so->so_mode & SM_CONNREQUIRED) == 0 ||
893 	    state & SS_ISCONNECTED)) {
894 		*reventsp |= POLLOUT & events;
895 	}
896 
897 	/*
898 	 * Turn on POLLIN whenever there is data on the receive queue,
899 	 * or the socket is in a state where no more data will be received.
900 	 * Also, if the socket is accepting connections, flip the bit if
901 	 * there is something on the queue.
902 	 *
903 	 * We do an initial check for events without holding locks. However,
904 	 * if there are no event available, then we redo the check for POLLIN
905 	 * events under the lock.
906 	 */
907 
908 	/* Pending connections */
909 	if (so->so_acceptq_len > 0)
910 		*reventsp |= (POLLIN|POLLRDNORM) & events;
911 
912 	/* Data */
913 	/* so_downcalls is null for sctp */
914 	if (so->so_downcalls != NULL && so->so_downcalls->sd_poll != NULL) {
915 		*reventsp |= (*so->so_downcalls->sd_poll)
916 		    (so->so_proto_handle, events & SO_PROTO_POLLEV, anyyet,
917 		    CRED()) & events;
918 		ASSERT((*reventsp & ~events) == 0);
919 		/* do not recheck events */
920 		events &= ~SO_PROTO_POLLEV;
921 	} else {
922 		if (SO_HAVE_DATA(so))
923 			*reventsp |= (POLLIN|POLLRDNORM) & events;
924 
925 		/* Urgent data */
926 		if ((state & SS_OOBPEND) != 0)
927 			*reventsp |= (POLLRDBAND) & events;
928 	}
929 
930 	if (!*reventsp && !anyyet) {
931 		/* Check for read events again, but this time under lock */
932 		if (events & (POLLIN|POLLRDNORM)) {
933 			mutex_enter(&so->so_lock);
934 			if (SO_HAVE_DATA(so) || so->so_acceptq_len > 0) {
935 				mutex_exit(&so->so_lock);
936 				*reventsp |= (POLLIN|POLLRDNORM) & events;
937 				return (0);
938 			} else {
939 				so->so_pollev |= SO_POLLEV_IN;
940 				mutex_exit(&so->so_lock);
941 			}
942 		}
943 		*phpp = &so->so_poll_list;
944 	}
945 	return (0);
946 }
947 
948 /*
949  * Generic Upcalls
950  */
951 void
952 so_connected(sock_upper_handle_t sock_handle, sock_connid_t id,
953     cred_t *peer_cred, pid_t peer_cpid)
954 {
955 	struct sonode *so = (struct sonode *)sock_handle;
956 
957 	mutex_enter(&so->so_lock);
958 	ASSERT(so->so_proto_handle != NULL);
959 
960 	if (peer_cred != NULL) {
961 		if (so->so_peercred != NULL)
962 			crfree(so->so_peercred);
963 		crhold(peer_cred);
964 		so->so_peercred = peer_cred;
965 		so->so_cpid = peer_cpid;
966 	}
967 
968 	so->so_proto_connid = id;
969 	soisconnected(so);
970 	/*
971 	 * Wake ones who're waiting for conn to become established.
972 	 */
973 	so_notify_connected(so);
974 }
975 
976 int
977 so_disconnected(sock_upper_handle_t sock_handle, sock_connid_t id, int error)
978 {
979 	struct sonode *so = (struct sonode *)sock_handle;
980 
981 	mutex_enter(&so->so_lock);
982 
983 	so->so_proto_connid = id;
984 	soisdisconnected(so, error);
985 	so_notify_disconnected(so, error);
986 
987 	return (0);
988 }
989 
990 void
991 so_opctl(sock_upper_handle_t sock_handle, sock_opctl_action_t action,
992     uintptr_t arg)
993 {
994 	struct sonode *so = (struct sonode *)sock_handle;
995 
996 	switch (action) {
997 	case SOCK_OPCTL_SHUT_SEND:
998 		mutex_enter(&so->so_lock);
999 		socantsendmore(so);
1000 		so_notify_disconnecting(so);
1001 		break;
1002 	case SOCK_OPCTL_SHUT_RECV: {
1003 		mutex_enter(&so->so_lock);
1004 		socantrcvmore(so);
1005 		so_notify_eof(so);
1006 		break;
1007 	}
1008 	case SOCK_OPCTL_ENAB_ACCEPT:
1009 		mutex_enter(&so->so_lock);
1010 		so->so_state |= SS_ACCEPTCONN;
1011 		so->so_backlog = (unsigned int)arg;
1012 		mutex_exit(&so->so_lock);
1013 		break;
1014 	default:
1015 		ASSERT(0);
1016 		break;
1017 	}
1018 }
1019 
1020 void
1021 so_txq_full(sock_upper_handle_t sock_handle, boolean_t qfull)
1022 {
1023 	struct sonode *so = (struct sonode *)sock_handle;
1024 
1025 	if (qfull) {
1026 		so_snd_qfull(so);
1027 	} else {
1028 		so_snd_qnotfull(so);
1029 		mutex_enter(&so->so_lock);
1030 		so_notify_writable(so);
1031 	}
1032 }
1033 
1034 sock_upper_handle_t
1035 so_newconn(sock_upper_handle_t parenthandle,
1036     sock_lower_handle_t proto_handle, sock_downcalls_t *sock_downcalls,
1037     struct cred *peer_cred, pid_t peer_cpid, sock_upcalls_t **sock_upcallsp)
1038 {
1039 	struct sonode	*so = (struct sonode *)parenthandle;
1040 	struct sonode	*nso;
1041 	int error;
1042 
1043 	ASSERT(proto_handle != NULL);
1044 
1045 	if ((so->so_state & SS_ACCEPTCONN) == 0 ||
1046 	    so->so_acceptq_len >= so->so_backlog)
1047 		return (NULL);
1048 
1049 	nso = socket_newconn(so, proto_handle, sock_downcalls, SOCKET_NOSLEEP,
1050 	    &error);
1051 	if (nso == NULL)
1052 		return (NULL);
1053 
1054 	if (peer_cred != NULL) {
1055 		crhold(peer_cred);
1056 		nso->so_peercred = peer_cred;
1057 		nso->so_cpid = peer_cpid;
1058 	}
1059 
1060 	(void) so_acceptq_enqueue(so, nso);
1061 	mutex_enter(&so->so_lock);
1062 	so_notify_newconn(so);
1063 
1064 	*sock_upcallsp = &so_upcalls;
1065 
1066 	return ((sock_upper_handle_t)nso);
1067 }
1068 
1069 void
1070 so_set_prop(sock_upper_handle_t sock_handle, struct sock_proto_props *soppp)
1071 {
1072 	struct sonode *so;
1073 
1074 	so = (struct sonode *)sock_handle;
1075 
1076 	mutex_enter(&so->so_lock);
1077 
1078 	if (soppp->sopp_flags & SOCKOPT_MAXBLK)
1079 		so->so_proto_props.sopp_maxblk = soppp->sopp_maxblk;
1080 	if (soppp->sopp_flags & SOCKOPT_WROFF)
1081 		so->so_proto_props.sopp_wroff = soppp->sopp_wroff;
1082 	if (soppp->sopp_flags & SOCKOPT_TAIL)
1083 		so->so_proto_props.sopp_tail = soppp->sopp_tail;
1084 	if (soppp->sopp_flags & SOCKOPT_RCVHIWAT)
1085 		so->so_proto_props.sopp_rxhiwat = soppp->sopp_rxhiwat;
1086 	if (soppp->sopp_flags & SOCKOPT_RCVLOWAT)
1087 		so->so_proto_props.sopp_rxlowat = soppp->sopp_rxlowat;
1088 	if (soppp->sopp_flags & SOCKOPT_MAXPSZ)
1089 		so->so_proto_props.sopp_maxpsz = soppp->sopp_maxpsz;
1090 	if (soppp->sopp_flags & SOCKOPT_MINPSZ)
1091 		so->so_proto_props.sopp_minpsz = soppp->sopp_minpsz;
1092 	if (soppp->sopp_flags & SOCKOPT_ZCOPY) {
1093 		if (soppp->sopp_zcopyflag & ZCVMSAFE) {
1094 			so->so_proto_props.sopp_zcopyflag |= STZCVMSAFE;
1095 			so->so_proto_props.sopp_zcopyflag &= ~STZCVMUNSAFE;
1096 		} else if (soppp->sopp_zcopyflag & ZCVMUNSAFE) {
1097 			so->so_proto_props.sopp_zcopyflag |= STZCVMUNSAFE;
1098 			so->so_proto_props.sopp_zcopyflag &= ~STZCVMSAFE;
1099 		}
1100 
1101 		if (soppp->sopp_zcopyflag & COPYCACHED) {
1102 			so->so_proto_props.sopp_zcopyflag |= STRCOPYCACHED;
1103 		}
1104 	}
1105 	if (soppp->sopp_flags & SOCKOPT_OOBINLINE)
1106 		so->so_proto_props.sopp_oobinline = soppp->sopp_oobinline;
1107 	if (soppp->sopp_flags & SOCKOPT_RCVTIMER)
1108 		so->so_proto_props.sopp_rcvtimer = soppp->sopp_rcvtimer;
1109 	if (soppp->sopp_flags & SOCKOPT_RCVTHRESH)
1110 		so->so_proto_props.sopp_rcvthresh = soppp->sopp_rcvthresh;
1111 	if (soppp->sopp_flags & SOCKOPT_MAXADDRLEN)
1112 		so->so_proto_props.sopp_maxaddrlen = soppp->sopp_maxaddrlen;
1113 
1114 	mutex_exit(&so->so_lock);
1115 
1116 #ifdef DEBUG
1117 	soppp->sopp_flags &= ~(SOCKOPT_MAXBLK | SOCKOPT_WROFF | SOCKOPT_TAIL |
1118 	    SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | SOCKOPT_MAXPSZ |
1119 	    SOCKOPT_ZCOPY | SOCKOPT_OOBINLINE | SOCKOPT_RCVTIMER |
1120 	    SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ);
1121 	ASSERT(soppp->sopp_flags == 0);
1122 #endif
1123 }
1124 
1125 /* ARGSUSED */
1126 ssize_t
1127 so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp,
1128     size_t msg_size, int flags, int *errorp,  boolean_t *force_pushp)
1129 {
1130 	struct sonode *so = (struct sonode *)sock_handle;
1131 	boolean_t force_push = B_TRUE;
1132 	int space_left;
1133 	sodirect_t *sodp = so->so_direct;
1134 
1135 	ASSERT(errorp != NULL);
1136 	*errorp = 0;
1137 	if (mp == NULL) {
1138 		if (msg_size > 0) {
1139 			ASSERT(so->so_downcalls->sd_recv_uio != NULL);
1140 			mutex_enter(&so->so_lock);
1141 			/* the notify functions will drop the lock */
1142 			if (flags & MSG_OOB)
1143 				so_notify_oobdata(so, IS_SO_OOB_INLINE(so));
1144 			else
1145 				so_notify_data(so, msg_size);
1146 			return (0);
1147 		}
1148 		/*
1149 		 * recv space check
1150 		 */
1151 		mutex_enter(&so->so_lock);
1152 		space_left = so->so_rcvbuf - so->so_rcv_queued;
1153 		if (space_left <= 0) {
1154 			so->so_flowctrld = B_TRUE;
1155 			*errorp = ENOSPC;
1156 			space_left = -1;
1157 		}
1158 		goto done_unlock;
1159 	}
1160 
1161 	ASSERT(mp->b_next == NULL);
1162 	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO);
1163 	ASSERT(msg_size == msgdsize(mp));
1164 
1165 	if (flags & MSG_OOB) {
1166 		so_queue_oob(sock_handle, mp, msg_size);
1167 		return (0);
1168 	}
1169 
1170 	if (force_pushp != NULL)
1171 		force_push = *force_pushp;
1172 
1173 	if (DB_TYPE(mp) == M_PROTO && !__TPI_PRIM_ISALIGNED(mp->b_rptr)) {
1174 		/* The read pointer is not aligned correctly for TPI */
1175 		zcmn_err(getzoneid(), CE_WARN,
1176 		    "sockfs: Unaligned TPI message received. rptr = %p\n",
1177 		    (void *)mp->b_rptr);
1178 		freemsg(mp);
1179 		mutex_enter(sodp->sod_lockp);
1180 		SOD_UIOAFINI(sodp);
1181 		mutex_exit(sodp->sod_lockp);
1182 
1183 		return (so->so_rcvbuf - so->so_rcv_queued);
1184 	}
1185 
1186 	mutex_enter(&so->so_lock);
1187 	if (so->so_state & (SS_FALLBACK_PENDING | SS_FALLBACK_COMP)) {
1188 		SOD_DISABLE(sodp);
1189 		mutex_exit(&so->so_lock);
1190 		*errorp = EOPNOTSUPP;
1191 		return (-1);
1192 	}
1193 	if (so->so_state & SS_CANTRCVMORE) {
1194 		freemsg(mp);
1195 		SOD_DISABLE(sodp);
1196 		mutex_exit(&so->so_lock);
1197 		return (0);
1198 	}
1199 
1200 	/* process the mblk via I/OAT if capable */
1201 	if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) {
1202 		if (DB_TYPE(mp) == M_DATA) {
1203 			(void) sod_uioa_mblk_init(sodp, mp, msg_size);
1204 		} else {
1205 			SOD_UIOAFINI(sodp);
1206 		}
1207 	}
1208 
1209 	if (mp->b_next == NULL) {
1210 		so_enqueue_msg(so, mp, msg_size);
1211 	} else {
1212 		do {
1213 			mblk_t *nmp;
1214 
1215 			if ((nmp = mp->b_next) != NULL) {
1216 				mp->b_next = NULL;
1217 			}
1218 			so_enqueue_msg(so, mp, msgdsize(mp));
1219 			mp = nmp;
1220 		} while (mp != NULL);
1221 	}
1222 
1223 	space_left = so->so_rcvbuf - so->so_rcv_queued;
1224 	if (space_left <= 0) {
1225 		so->so_flowctrld = B_TRUE;
1226 		*errorp = ENOSPC;
1227 		space_left = -1;
1228 	}
1229 
1230 	if (force_push || so->so_rcv_queued >= so->so_rcv_thresh ||
1231 	    so->so_rcv_queued >= so->so_rcv_wanted ||
1232 	    (sodp != NULL && so->so_rcv_queued >= sodp->sod_want)) {
1233 		SOCKET_TIMER_CANCEL(so);
1234 		/*
1235 		 * so_notify_data will release the lock
1236 		 */
1237 		so_notify_data(so, so->so_rcv_queued);
1238 
1239 		if (force_pushp != NULL)
1240 			*force_pushp = B_TRUE;
1241 		goto done;
1242 	} else if (so->so_rcv_timer_tid == 0) {
1243 		/* Make sure the recv push timer is running */
1244 		SOCKET_TIMER_START(so);
1245 	}
1246 
1247 done_unlock:
1248 	mutex_exit(&so->so_lock);
1249 done:
1250 	return (space_left);
1251 }
1252 
1253 /*
1254  * Set the offset of where the oob data is relative to the bytes in
1255  * queued. Also generate SIGURG
1256  */
1257 void
1258 so_signal_oob(sock_upper_handle_t sock_handle, ssize_t offset)
1259 {
1260 	struct sonode *so;
1261 
1262 	ASSERT(offset >= 0);
1263 	so = (struct sonode *)sock_handle;
1264 	mutex_enter(&so->so_lock);
1265 	SOD_UIOAFINI(so->so_direct);
1266 
1267 	/*
1268 	 * New urgent data on the way so forget about any old
1269 	 * urgent data.
1270 	 */
1271 	so->so_state &= ~(SS_HAVEOOBDATA|SS_HADOOBDATA);
1272 
1273 	/*
1274 	 * Record that urgent data is pending.
1275 	 */
1276 	so->so_state |= SS_OOBPEND;
1277 
1278 	if (so->so_oobmsg != NULL) {
1279 		dprintso(so, 1, ("sock: discarding old oob\n"));
1280 		freemsg(so->so_oobmsg);
1281 		so->so_oobmsg = NULL;
1282 	}
1283 
1284 	/*
1285 	 * set the offset where the urgent byte is
1286 	 */
1287 	so->so_oobmark = so->so_rcv_queued + offset;
1288 	if (so->so_oobmark == 0)
1289 		so->so_state |= SS_RCVATMARK;
1290 	else
1291 		so->so_state &= ~SS_RCVATMARK;
1292 
1293 	so_notify_oobsig(so);
1294 }
1295 
1296 /*
1297  * Queue the OOB byte
1298  */
1299 static void
1300 so_queue_oob(sock_upper_handle_t sock_handle, mblk_t *mp, size_t len)
1301 {
1302 	struct sonode *so;
1303 
1304 	so = (struct sonode *)sock_handle;
1305 	mutex_enter(&so->so_lock);
1306 	SOD_UIOAFINI(so->so_direct);
1307 
1308 	ASSERT(mp != NULL);
1309 	if (!IS_SO_OOB_INLINE(so)) {
1310 		so->so_oobmsg = mp;
1311 		so->so_state |= SS_HAVEOOBDATA;
1312 	} else {
1313 		so_enqueue_msg(so, mp, len);
1314 	}
1315 
1316 	so_notify_oobdata(so, IS_SO_OOB_INLINE(so));
1317 }
1318 
1319 int
1320 so_close(struct sonode *so, int flag, struct cred *cr)
1321 {
1322 	int error;
1323 
1324 	error = (*so->so_downcalls->sd_close)(so->so_proto_handle, flag, cr);
1325 
1326 	/*
1327 	 * At this point there will be no more upcalls from the protocol
1328 	 */
1329 	mutex_enter(&so->so_lock);
1330 
1331 	ASSERT(so_verify_oobstate(so));
1332 
1333 	so_rcv_flush(so);
1334 	mutex_exit(&so->so_lock);
1335 
1336 	return (error);
1337 }
1338 
1339 void
1340 so_zcopy_notify(sock_upper_handle_t sock_handle)
1341 {
1342 	struct sonode *so = (struct sonode *)sock_handle;
1343 
1344 	mutex_enter(&so->so_lock);
1345 	so->so_copyflag |= STZCNOTIFY;
1346 	cv_broadcast(&so->so_copy_cv);
1347 	mutex_exit(&so->so_lock);
1348 }
1349 
1350 void
1351 so_set_error(sock_upper_handle_t sock_handle, int error)
1352 {
1353 	struct sonode *so = (struct sonode *)sock_handle;
1354 
1355 	mutex_enter(&so->so_lock);
1356 
1357 	soseterror(so, error);
1358 
1359 	so_notify_error(so);
1360 }
1361 
1362 /*
1363  * so_recvmsg - read data from the socket
1364  *
1365  * There are two ways of obtaining data; either we ask the protocol to
1366  * copy directly into the supplied buffer, or we copy data from the
1367  * sonode's receive queue. The decision which one to use depends on
1368  * whether the protocol has a sd_recv_uio down call.
1369  */
1370 int
1371 so_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
1372     struct cred *cr)
1373 {
1374 	rval_t 		rval;
1375 	int 		flags = 0;
1376 	t_uscalar_t	controllen, namelen;
1377 	int 		error = 0;
1378 	int ret;
1379 	mblk_t		*mctlp = NULL;
1380 	union T_primitives *tpr;
1381 	void		*control;
1382 	ssize_t		saved_resid;
1383 	struct uio	*suiop;
1384 
1385 	SO_BLOCK_FALLBACK(so, SOP_RECVMSG(so, msg, uiop, cr));
1386 
1387 	if ((so->so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
1388 	    (so->so_mode & SM_CONNREQUIRED)) {
1389 		SO_UNBLOCK_FALLBACK(so);
1390 		return (ENOTCONN);
1391 	}
1392 
1393 	if (msg->msg_flags & MSG_PEEK)
1394 		msg->msg_flags &= ~MSG_WAITALL;
1395 
1396 	if (so->so_mode & SM_ATOMIC)
1397 		msg->msg_flags |= MSG_TRUNC;
1398 
1399 	if (msg->msg_flags & MSG_OOB) {
1400 		if ((so->so_mode & SM_EXDATA) == 0) {
1401 			error = EOPNOTSUPP;
1402 		} else if (so->so_downcalls->sd_recv_uio != NULL) {
1403 			error = (*so->so_downcalls->sd_recv_uio)
1404 			    (so->so_proto_handle, uiop, msg, cr);
1405 		} else {
1406 			error = sorecvoob(so, msg, uiop, msg->msg_flags,
1407 			    IS_SO_OOB_INLINE(so));
1408 		}
1409 		SO_UNBLOCK_FALLBACK(so);
1410 		return (error);
1411 	}
1412 
1413 	/*
1414 	 * If the protocol has the recv down call, then pass the request
1415 	 * down.
1416 	 */
1417 	if (so->so_downcalls->sd_recv_uio != NULL) {
1418 		error = (*so->so_downcalls->sd_recv_uio)
1419 		    (so->so_proto_handle, uiop, msg, cr);
1420 		SO_UNBLOCK_FALLBACK(so);
1421 		return (error);
1422 	}
1423 
1424 	/*
1425 	 * Reading data from the socket buffer
1426 	 */
1427 	flags = msg->msg_flags;
1428 	msg->msg_flags = 0;
1429 
1430 	/*
1431 	 * Set msg_controllen and msg_namelen to zero here to make it
1432 	 * simpler in the cases that no control or name is returned.
1433 	 */
1434 	controllen = msg->msg_controllen;
1435 	namelen = msg->msg_namelen;
1436 	msg->msg_controllen = 0;
1437 	msg->msg_namelen = 0;
1438 
1439 	mutex_enter(&so->so_lock);
1440 	/* Set SOREADLOCKED */
1441 	error = so_lock_read_intr(so,
1442 	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
1443 	mutex_exit(&so->so_lock);
1444 	if (error) {
1445 		SO_UNBLOCK_FALLBACK(so);
1446 		return (error);
1447 	}
1448 
1449 	suiop = sod_rcv_init(so, flags, &uiop);
1450 retry:
1451 	saved_resid = uiop->uio_resid;
1452 	error = so_dequeue_msg(so, &mctlp, uiop, &rval, flags);
1453 	if (error != 0) {
1454 		goto out;
1455 	}
1456 	/*
1457 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
1458 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
1459 	 */
1460 	ASSERT(!(rval.r_val1 & MORECTL));
1461 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
1462 		msg->msg_flags |= MSG_TRUNC;
1463 	if (mctlp == NULL) {
1464 		dprintso(so, 1, ("so_recvmsg: got M_DATA\n"));
1465 
1466 		mutex_enter(&so->so_lock);
1467 		/* Set MSG_EOR based on MOREDATA */
1468 		if (!(rval.r_val1 & MOREDATA)) {
1469 			if (so->so_state & SS_SAVEDEOR) {
1470 				msg->msg_flags |= MSG_EOR;
1471 				so->so_state &= ~SS_SAVEDEOR;
1472 			}
1473 		}
1474 		/*
1475 		 * If some data was received (i.e. not EOF) and the
1476 		 * read/recv* has not been satisfied wait for some more.
1477 		 */
1478 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
1479 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
1480 			mutex_exit(&so->so_lock);
1481 			goto retry;
1482 		}
1483 
1484 		goto out_locked;
1485 	}
1486 	/* strsock_proto has already verified length and alignment */
1487 	tpr = (union T_primitives *)mctlp->b_rptr;
1488 	dprintso(so, 1, ("so_recvmsg: type %d\n", tpr->type));
1489 	switch (tpr->type) {
1490 	case T_DATA_IND: {
1491 		/*
1492 		 * Set msg_flags to MSG_EOR based on
1493 		 * MORE_flag and MOREDATA.
1494 		 */
1495 		mutex_enter(&so->so_lock);
1496 		so->so_state &= ~SS_SAVEDEOR;
1497 		if (!(tpr->data_ind.MORE_flag & 1)) {
1498 			if (!(rval.r_val1 & MOREDATA))
1499 				msg->msg_flags |= MSG_EOR;
1500 			else
1501 				so->so_state |= SS_SAVEDEOR;
1502 		}
1503 		freemsg(mctlp);
1504 		/*
1505 		 * If some data was received (i.e. not EOF) and the
1506 		 * read/recv* has not been satisfied wait for some more.
1507 		 */
1508 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
1509 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
1510 			mutex_exit(&so->so_lock);
1511 			goto retry;
1512 		}
1513 		goto out_locked;
1514 	}
1515 	case T_UNITDATA_IND: {
1516 		void *addr;
1517 		t_uscalar_t addrlen;
1518 		void *abuf;
1519 		t_uscalar_t optlen;
1520 		void *opt;
1521 
1522 		if (namelen != 0) {
1523 			/* Caller wants source address */
1524 			addrlen = tpr->unitdata_ind.SRC_length;
1525 			addr = sogetoff(mctlp, tpr->unitdata_ind.SRC_offset,
1526 			    addrlen, 1);
1527 			if (addr == NULL) {
1528 				freemsg(mctlp);
1529 				error = EPROTO;
1530 				eprintsoline(so, error);
1531 				goto out;
1532 			}
1533 			ASSERT(so->so_family != AF_UNIX);
1534 		}
1535 		optlen = tpr->unitdata_ind.OPT_length;
1536 		if (optlen != 0) {
1537 			t_uscalar_t ncontrollen;
1538 
1539 			/*
1540 			 * Extract any source address option.
1541 			 * Determine how large cmsg buffer is needed.
1542 			 */
1543 			opt = sogetoff(mctlp, tpr->unitdata_ind.OPT_offset,
1544 			    optlen, __TPI_ALIGN_SIZE);
1545 
1546 			if (opt == NULL) {
1547 				freemsg(mctlp);
1548 				error = EPROTO;
1549 				eprintsoline(so, error);
1550 				goto out;
1551 			}
1552 			if (so->so_family == AF_UNIX)
1553 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
1554 			ncontrollen = so_cmsglen(mctlp, opt, optlen,
1555 			    !(flags & MSG_XPG4_2));
1556 			if (controllen != 0)
1557 				controllen = ncontrollen;
1558 			else if (ncontrollen != 0)
1559 				msg->msg_flags |= MSG_CTRUNC;
1560 		} else {
1561 			controllen = 0;
1562 		}
1563 
1564 		if (namelen != 0) {
1565 			/*
1566 			 * Return address to caller.
1567 			 * Caller handles truncation if length
1568 			 * exceeds msg_namelen.
1569 			 * NOTE: AF_UNIX NUL termination is ensured by
1570 			 * the sender's copyin_name().
1571 			 */
1572 			abuf = kmem_alloc(addrlen, KM_SLEEP);
1573 
1574 			bcopy(addr, abuf, addrlen);
1575 			msg->msg_name = abuf;
1576 			msg->msg_namelen = addrlen;
1577 		}
1578 
1579 		if (controllen != 0) {
1580 			/*
1581 			 * Return control msg to caller.
1582 			 * Caller handles truncation if length
1583 			 * exceeds msg_controllen.
1584 			 */
1585 			control = kmem_zalloc(controllen, KM_SLEEP);
1586 
1587 			error = so_opt2cmsg(mctlp, opt, optlen,
1588 			    !(flags & MSG_XPG4_2), control, controllen);
1589 			if (error) {
1590 				freemsg(mctlp);
1591 				if (msg->msg_namelen != 0)
1592 					kmem_free(msg->msg_name,
1593 					    msg->msg_namelen);
1594 				kmem_free(control, controllen);
1595 				eprintsoline(so, error);
1596 				goto out;
1597 			}
1598 			msg->msg_control = control;
1599 			msg->msg_controllen = controllen;
1600 		}
1601 
1602 		freemsg(mctlp);
1603 		goto out;
1604 	}
1605 	case T_OPTDATA_IND: {
1606 		struct T_optdata_req *tdr;
1607 		void *opt;
1608 		t_uscalar_t optlen;
1609 
1610 		tdr = (struct T_optdata_req *)mctlp->b_rptr;
1611 		optlen = tdr->OPT_length;
1612 		if (optlen != 0) {
1613 			t_uscalar_t ncontrollen;
1614 			/*
1615 			 * Determine how large cmsg buffer is needed.
1616 			 */
1617 			opt = sogetoff(mctlp,
1618 			    tpr->optdata_ind.OPT_offset, optlen,
1619 			    __TPI_ALIGN_SIZE);
1620 
1621 			if (opt == NULL) {
1622 				freemsg(mctlp);
1623 				error = EPROTO;
1624 				eprintsoline(so, error);
1625 				goto out;
1626 			}
1627 
1628 			ncontrollen = so_cmsglen(mctlp, opt, optlen,
1629 			    !(flags & MSG_XPG4_2));
1630 			if (controllen != 0)
1631 				controllen = ncontrollen;
1632 			else if (ncontrollen != 0)
1633 				msg->msg_flags |= MSG_CTRUNC;
1634 		} else {
1635 			controllen = 0;
1636 		}
1637 
1638 		if (controllen != 0) {
1639 			/*
1640 			 * Return control msg to caller.
1641 			 * Caller handles truncation if length
1642 			 * exceeds msg_controllen.
1643 			 */
1644 			control = kmem_zalloc(controllen, KM_SLEEP);
1645 
1646 			error = so_opt2cmsg(mctlp, opt, optlen,
1647 			    !(flags & MSG_XPG4_2), control, controllen);
1648 			if (error) {
1649 				freemsg(mctlp);
1650 				kmem_free(control, controllen);
1651 				eprintsoline(so, error);
1652 				goto out;
1653 			}
1654 			msg->msg_control = control;
1655 			msg->msg_controllen = controllen;
1656 		}
1657 
1658 		/*
1659 		 * Set msg_flags to MSG_EOR based on
1660 		 * DATA_flag and MOREDATA.
1661 		 */
1662 		mutex_enter(&so->so_lock);
1663 		so->so_state &= ~SS_SAVEDEOR;
1664 		if (!(tpr->data_ind.MORE_flag & 1)) {
1665 			if (!(rval.r_val1 & MOREDATA))
1666 				msg->msg_flags |= MSG_EOR;
1667 			else
1668 				so->so_state |= SS_SAVEDEOR;
1669 		}
1670 		freemsg(mctlp);
1671 		/*
1672 		 * If some data was received (i.e. not EOF) and the
1673 		 * read/recv* has not been satisfied wait for some more.
1674 		 * Not possible to wait if control info was received.
1675 		 */
1676 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
1677 		    controllen == 0 &&
1678 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
1679 			mutex_exit(&so->so_lock);
1680 			goto retry;
1681 		}
1682 		goto out_locked;
1683 	}
1684 	default:
1685 		cmn_err(CE_CONT, "so_recvmsg bad type %x \n",
1686 		    tpr->type);
1687 		freemsg(mctlp);
1688 		error = EPROTO;
1689 		ASSERT(0);
1690 	}
1691 out:
1692 	mutex_enter(&so->so_lock);
1693 out_locked:
1694 	/* The sod_lockp pointers to the sonode so_lock */
1695 	ret = sod_rcv_done(so, suiop, uiop);
1696 	if (ret != 0 && error == 0)
1697 		error = ret;
1698 
1699 	so_unlock_read(so);	/* Clear SOREADLOCKED */
1700 	mutex_exit(&so->so_lock);
1701 
1702 	SO_UNBLOCK_FALLBACK(so);
1703 
1704 	return (error);
1705 }
1706 
/*
 * The sonode operations vector built from the so_* functions. The
 * initializer is positional, so the order of the entries has to follow
 * the member order of sonodeops_t; the trailing comments name the
 * member each entry is meant for.
 */
sonodeops_t so_sonodeops = {
	so_init,		/* sop_init	*/
	so_accept,		/* sop_accept   */
	so_bind,		/* sop_bind	*/
	so_listen,		/* sop_listen   */
	so_connect,		/* sop_connect  */
	so_recvmsg,		/* sop_recvmsg  */
	so_sendmsg,		/* sop_sendmsg  */
	so_sendmblk,		/* sop_sendmblk */
	so_getpeername,		/* sop_getpeername */
	so_getsockname,		/* sop_getsockname */
	so_shutdown,		/* sop_shutdown */
	so_getsockopt,		/* sop_getsockopt */
	so_setsockopt,		/* sop_setsockopt */
	so_ioctl,		/* sop_ioctl    */
	so_poll,		/* sop_poll	*/
	so_close,		/* sop_close */
};
1725 
/*
 * The upcall vector built from the so_* upcall functions. The
 * initializer is positional, so the order of the entries has to follow
 * the member order of sock_upcalls_t.
 * NOTE(review): presumably this is the vector handed to the protocol
 * so that it can call back into sockfs - confirm against the users.
 */
sock_upcalls_t so_upcalls = {
	so_newconn,
	so_connected,
	so_disconnected,
	so_opctl,
	so_queue_msg,
	so_set_prop,
	so_txq_full,
	so_signal_oob,
	so_zcopy_notify,
	so_set_error
};
1738