xref: /titanic_50/usr/src/uts/common/fs/sockfs/sockcommon.c (revision 3db3491215579980a91e230cf21b20608fbb8259)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/sysmacros.h>
31 #include <sys/debug.h>
32 #include <sys/cmn_err.h>
33 #include <sys/vfs.h>
34 #include <sys/policy.h>
35 #include <sys/modctl.h>
36 
37 #include <sys/sunddi.h>
38 
39 #include <sys/strsun.h>
40 #include <sys/stropts.h>
41 #include <sys/strsubr.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/sodirect.h>
45 #include <sys/uio.h>
46 
47 #include <inet/ipclassifier.h>
48 #include <fs/sockfs/sockcommon.h>
49 #include <fs/sockfs/nl7c.h>
50 #include <fs/sockfs/socktpi.h>
51 #include <inet/ip.h>
52 
/*
 * Defined outside this file; none of the functions visible in this
 * file reference them.
 */
extern int xnet_skip_checks, xnet_check_print, xnet_truncate_print;

/*
 * kmem cache holding sodirect_t objects. It is created by sod_init(),
 * allocated from in sod_sock_init() and freed back to in sonode_fini().
 */
static struct kmem_cache *sock_sod_cache;
56 
57 /*
58  * Common socket access functions.
59  *
60  * Instead of accessing the sonode switch directly (i.e., SOP_xxx()),
61  * the socket_xxx() function should be used.
62  */
63 
64 /*
65  * Try to create a new sonode of the requested <family, type, protocol>.
66  */
67 /* ARGSUSED */
68 struct sonode *
69 socket_create(int family, int type, int protocol, char *devpath, char *mod,
70     int flags, int version, struct cred *cr, int *errorp)
71 {
72 	struct sonode *so;
73 	struct sockparams *sp = NULL;
74 
75 	/*
76 	 * Look for a sockparams entry that match the given criteria.
77 	 * solookup() returns with the entry held.
78 	 */
79 	*errorp = solookup(family, type, protocol, &sp);
80 	if (sp == NULL) {
81 		int kmflags = (flags == SOCKET_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
82 		/*
83 		 * There is no matching sockparams entry. An ephemeral entry is
84 		 * created if the caller specifies a device or a socket module.
85 		 */
86 		if (devpath != NULL) {
87 			sp = sockparams_hold_ephemeral_bydev(family, type,
88 			    protocol, devpath, kmflags, errorp);
89 		} else if (mod != NULL) {
90 			sp = sockparams_hold_ephemeral_bymod(family, type,
91 			    protocol, mod, kmflags, errorp);
92 		} else {
93 			return (NULL);
94 		}
95 
96 		if (sp == NULL)
97 			return (NULL);
98 	}
99 
100 	ASSERT(sp->sp_smod_info != NULL);
101 	ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP);
102 	so = sp->sp_smod_info->smod_sock_create_func(sp, family, type,
103 	    protocol, version, flags, errorp, cr);
104 	if (so == NULL) {
105 		SOCKPARAMS_DEC_REF(sp);
106 	} else {
107 		if ((*errorp = SOP_INIT(so, NULL, cr, flags)) == 0) {
108 			/* Cannot fail, only bumps so_count */
109 			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
110 		} else {
111 			socket_destroy(so);
112 			so = NULL;
113 		}
114 	}
115 	return (so);
116 }
117 
118 struct sonode *
119 socket_newconn(struct sonode *parent, sock_lower_handle_t lh,
120     sock_downcalls_t *dc, int flags, int *errorp)
121 {
122 	struct sonode *so;
123 	struct sockparams *sp;
124 	struct cred *cr;
125 
126 	if ((cr = CRED()) == NULL)
127 		cr = kcred;
128 
129 	sp = parent->so_sockparams;
130 	ASSERT(sp != NULL);
131 
132 	so = sp->sp_smod_info->smod_sock_create_func(sp, parent->so_family,
133 	    parent->so_type, parent->so_protocol, parent->so_version, flags,
134 	    errorp, cr);
135 	if (so != NULL) {
136 		SOCKPARAMS_INC_REF(sp);
137 
138 		so->so_proto_handle = lh;
139 		so->so_downcalls = dc;
140 		/*
141 		 * This function may be called in interrupt context, and CRED()
142 		 * will be NULL. In this case, pass in kcred.
143 		 */
144 		if ((*errorp = SOP_INIT(so, parent, cr, flags)) == 0) {
145 			/* Cannot fail, only bumps so_count */
146 			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
147 		} else  {
148 			socket_destroy(so);
149 			so = NULL;
150 		}
151 	}
152 
153 	return (so);
154 }
155 
156 /*
157  * Bind local endpoint.
158  */
159 int
160 socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
161     int flags, cred_t *cr)
162 {
163 	return (SOP_BIND(so, name, namelen, flags, cr));
164 }
165 
166 /*
167  * Turn socket into a listen socket.
168  */
169 int
170 socket_listen(struct sonode *so, int backlog, cred_t *cr)
171 {
172 	if (backlog < 0) {
173 		backlog = 0;
174 	}
175 
176 	/*
177 	 * Use the same qlimit as in BSD. BSD checks the qlimit
178 	 * before queuing the next connection implying that a
179 	 * listen(sock, 0) allows one connection to be queued.
180 	 * BSD also uses 1.5 times the requested backlog.
181 	 *
182 	 * XNS Issue 4 required a strict interpretation of the backlog.
183 	 * This has been waived subsequently for Issue 4 and the change
184 	 * incorporated in XNS Issue 5. So we aren't required to do
185 	 * anything special for XPG apps.
186 	 */
187 	if (backlog >= (INT_MAX - 1) / 3)
188 		backlog = INT_MAX;
189 	else
190 		backlog = backlog * 3 / 2 + 1;
191 
192 	return (SOP_LISTEN(so, backlog, cr));
193 }
194 
195 /*
196  * Accept incoming connection.
197  */
198 int
199 socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop)
200 {
201 	return (SOP_ACCEPT(lso, fflag, cr, nsop));
202 }
203 
204 /*
205  * Active open.
206  */
207 int
208 socket_connect(struct sonode *so, const struct sockaddr *name,
209     socklen_t namelen, int fflag, int flags, cred_t *cr)
210 {
211 	int error;
212 
213 	/*
214 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
215 	 * connect to a null address. This is the portable method to
216 	 * unconnect a socket.
217 	 */
218 	if ((namelen >= sizeof (sa_family_t)) &&
219 	    (name->sa_family == AF_UNSPEC)) {
220 		name = NULL;
221 		namelen = 0;
222 	}
223 
224 	error = SOP_CONNECT(so, name, namelen, fflag, flags, cr);
225 
226 	if (error == EHOSTUNREACH && flags & _SOCONNECT_XPG4_2) {
227 		/*
228 		 * X/Open specification contains a requirement that
229 		 * ENETUNREACH be returned but does not require
230 		 * EHOSTUNREACH. In order to keep the test suite
231 		 * happy we mess with the errno here.
232 		 */
233 		error = ENETUNREACH;
234 	}
235 
236 	return (error);
237 }
238 
239 /*
240  * Get address of remote node.
241  */
242 int
243 socket_getpeername(struct sonode *so, struct sockaddr *addr,
244     socklen_t *addrlen, boolean_t accept, cred_t *cr)
245 {
246 	ASSERT(*addrlen > 0);
247 	return (SOP_GETPEERNAME(so, addr, addrlen, accept, cr));
248 
249 }
250 
251 /*
252  * Get local address.
253  */
254 int
255 socket_getsockname(struct sonode *so, struct sockaddr *addr,
256     socklen_t *addrlen, cred_t *cr)
257 {
258 	return (SOP_GETSOCKNAME(so, addr, addrlen, cr));
259 
260 }
261 
262 /*
263  * Called from shutdown().
264  */
265 int
266 socket_shutdown(struct sonode *so, int how, cred_t *cr)
267 {
268 	return (SOP_SHUTDOWN(so, how, cr));
269 }
270 
271 /*
272  * Get socket options.
273  */
274 /*ARGSUSED*/
275 int
276 socket_getsockopt(struct sonode *so, int level, int option_name,
277     void *optval, socklen_t *optlenp, int flags, cred_t *cr)
278 {
279 	return (SOP_GETSOCKOPT(so, level, option_name, optval,
280 	    optlenp, flags, cr));
281 }
282 
283 /*
284  * Set socket options
285  */
286 int
287 socket_setsockopt(struct sonode *so, int level, int option_name,
288     const void *optval, t_uscalar_t optlen, cred_t *cr)
289 {
290 	/* Caller allocates aligned optval, or passes null */
291 	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
292 	/* If optval is null optlen is 0, and vice-versa */
293 	ASSERT(optval != NULL || optlen == 0);
294 	ASSERT(optlen != 0 || optval == NULL);
295 
296 	/* No options should be zero-length */
297 	if (optlen == 0)
298 		return (EINVAL);
299 
300 	return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
301 }
302 
303 int
304 socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
305     cred_t *cr)
306 {
307 	int error = 0;
308 	ssize_t orig_resid = uiop->uio_resid;
309 
310 	/*
311 	 * Do not bypass the cache if we are doing a local (AF_UNIX) write.
312 	 */
313 	if (so->so_family == AF_UNIX)
314 		uiop->uio_extflg |= UIO_COPY_CACHED;
315 	else
316 		uiop->uio_extflg &= ~UIO_COPY_CACHED;
317 
318 	error = SOP_SENDMSG(so, msg, uiop, cr);
319 	switch (error) {
320 	default:
321 		break;
322 	case EINTR:
323 	case ETIME:
324 	case EWOULDBLOCK:
325 		/* We did a partial send */
326 		if (uiop->uio_resid != orig_resid)
327 			error = 0;
328 		break;
329 	case EPIPE:
330 		if ((so->so_mode & SM_KERNEL) == 0)
331 			tsignal(curthread, SIGPIPE);
332 		break;
333 	}
334 
335 	return (error);
336 }
337 
338 int
339 socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
340     struct cred *cr, mblk_t **mpp)
341 {
342 	int error = 0;
343 
344 	error = SOP_SENDMBLK(so, msg, fflag, cr, mpp);
345 	if (error == EPIPE) {
346 		tsignal(curthread, SIGPIPE);
347 	}
348 	return (error);
349 }
350 
351 int
352 socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
353     cred_t *cr)
354 {
355 	int error;
356 	ssize_t orig_resid = uiop->uio_resid;
357 
358 	/*
359 	 * Do not bypass the cache when reading data, as the application
360 	 * is likely to access the data shortly.
361 	 */
362 	uiop->uio_extflg |= UIO_COPY_CACHED;
363 
364 	error = SOP_RECVMSG(so, msg, uiop, cr);
365 
366 	switch (error) {
367 	case EINTR:
368 	case ETIME:
369 	case EWOULDBLOCK:
370 		/* We did a partial read */
371 		if (uiop->uio_resid != orig_resid)
372 			error = 0;
373 		break;
374 	default:
375 		break;
376 	}
377 	return (error);
378 }
379 
/*
 * Issue an ioctl on `so'; forwards to SOP_IOCTL() and returns its result.
 */
int
socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	int error;

	error = SOP_IOCTL(so, cmd, arg, mode, cr, rvalp);
	return (error);
}
386 
/*
 * Poll `so'; forwards to SOP_POLL() and returns its result.
 */
int
socket_poll(struct sonode *so, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	int error;

	error = SOP_POLL(so, events, anyyet, reventsp, phpp);
	return (error);
}
393 
394 int
395 socket_close(struct sonode *so, int flag, struct cred *cr)
396 {
397 	return (VOP_CLOSE(SOTOV(so), flag, 1, 0, cr, NULL));
398 }
399 
400 int
401 socket_close_internal(struct sonode *so, int flag, cred_t *cr)
402 {
403 	ASSERT(so->so_count == 0);
404 
405 	return (SOP_CLOSE(so, flag, cr));
406 }
407 
/*
 * Destroy `so' by invalidating its vnode with vn_invalid() and then
 * dropping a vnode reference with VN_RELE().
 */
void
socket_destroy(struct sonode *so)
{
	struct vnode *vp = SOTOV(so);

	vn_invalid(vp);
	VN_RELE(vp);
}
414 
415 /* ARGSUSED */
416 void
417 socket_destroy_internal(struct sonode *so, cred_t *cr)
418 {
419 	struct sockparams *sp = so->so_sockparams;
420 	ASSERT(so->so_count == 0 && sp != NULL);
421 
422 	sp->sp_smod_info->smod_sock_destroy_func(so);
423 
424 	SOCKPARAMS_DEC_REF(sp);
425 }
426 
427 /*
428  * TODO Once the common vnode ops is available, then the vnops argument
429  * should be removed.
430  */
431 /*ARGSUSED*/
432 int
433 sonode_constructor(void *buf, void *cdrarg, int kmflags)
434 {
435 	struct sonode *so = buf;
436 	struct vnode *vp;
437 
438 	vp = so->so_vnode = vn_alloc(kmflags);
439 	if (vp == NULL) {
440 		return (-1);
441 	}
442 	vp->v_data = so;
443 	vn_setops(vp, socket_vnodeops);
444 
445 	so->so_priv 		= NULL;
446 	so->so_oobmsg		= NULL;
447 
448 	so->so_proto_handle	= NULL;
449 
450 	so->so_peercred 	= NULL;
451 
452 	so->so_rcv_queued	= 0;
453 	so->so_rcv_q_head 	= NULL;
454 	so->so_rcv_q_last_head 	= NULL;
455 	so->so_rcv_head		= NULL;
456 	so->so_rcv_last_head	= NULL;
457 	so->so_rcv_wanted	= 0;
458 	so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER;
459 	so->so_rcv_timer_tid	= 0;
460 	so->so_rcv_thresh	= 0;
461 
462 	so->so_acceptq_head	= NULL;
463 	so->so_acceptq_tail	= &so->so_acceptq_head;
464 	so->so_acceptq_next	= NULL;
465 	so->so_acceptq_len	= 0;
466 	so->so_backlog		= 0;
467 
468 	so->so_snd_qfull	= B_FALSE;
469 
470 	mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL);
471 	mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL);
472 	rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL);
473 	cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL);
474 	cv_init(&so->so_want_cv, NULL, CV_DEFAULT, NULL);
475 
476 	cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL);
477 	cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL);
478 	cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL);
479 	cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL);
480 	cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL);
481 
482 	return (0);
483 }
484 
485 /*ARGSUSED*/
486 void
487 sonode_destructor(void *buf, void *cdrarg)
488 {
489 	struct sonode *so = buf;
490 	struct vnode *vp = SOTOV(so);
491 
492 	ASSERT(so->so_priv == NULL);
493 	ASSERT(so->so_peercred == NULL);
494 
495 	ASSERT(so->so_oobmsg == NULL);
496 
497 	ASSERT(so->so_rcv_q_head == NULL);
498 
499 	ASSERT(so->so_acceptq_head == NULL);
500 	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
501 	ASSERT(so->so_acceptq_next == NULL);
502 
503 	ASSERT(vp->v_data == so);
504 	ASSERT(vn_matchops(vp, socket_vnodeops));
505 
506 	vn_free(vp);
507 
508 	mutex_destroy(&so->so_lock);
509 	mutex_destroy(&so->so_acceptq_lock);
510 	rw_destroy(&so->so_fallback_rwlock);
511 
512 	cv_destroy(&so->so_state_cv);
513 	cv_destroy(&so->so_want_cv);
514 	cv_destroy(&so->so_acceptq_cv);
515 	cv_destroy(&so->so_snd_cv);
516 	cv_destroy(&so->so_rcv_cv);
517 	cv_destroy(&so->so_closing_cv);
518 }
519 
520 void
521 sonode_init(struct sonode *so, struct sockparams *sp, int family,
522     int type, int protocol, sonodeops_t *sops)
523 {
524 	vnode_t *vp;
525 
526 	vp = SOTOV(so);
527 
528 	so->so_flag	= 0;
529 
530 	so->so_state	= 0;
531 	so->so_mode	= 0;
532 
533 	so->so_count	= 0;
534 
535 	so->so_family	= family;
536 	so->so_type	= type;
537 	so->so_protocol	= protocol;
538 
539 	SOCK_CONNID_INIT(so->so_proto_connid);
540 
541 	so->so_options	= 0;
542 	so->so_linger.l_onoff   = 0;
543 	so->so_linger.l_linger = 0;
544 	so->so_sndbuf	= 0;
545 	so->so_error	= 0;
546 	so->so_rcvtimeo	= 0;
547 	so->so_sndtimeo = 0;
548 	so->so_xpg_rcvbuf = 0;
549 
550 	ASSERT(so->so_oobmsg == NULL);
551 	so->so_oobmark	= 0;
552 	so->so_pgrp	= 0;
553 
554 	ASSERT(so->so_peercred == NULL);
555 
556 	so->so_zoneid = getzoneid();
557 
558 	so->so_sockparams = sp;
559 
560 	so->so_ops = sops;
561 
562 	so->so_not_str = (sops != &sotpi_sonodeops);
563 
564 	so->so_proto_handle = NULL;
565 
566 	so->so_downcalls = NULL;
567 
568 	so->so_copyflag = 0;
569 
570 	ASSERT(so->so_acceptq_head == NULL);
571 	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
572 	ASSERT(so->so_acceptq_next == NULL);
573 
574 	vn_reinit(vp);
575 	vp->v_vfsp	= rootvfs;
576 	vp->v_type	= VSOCK;
577 	vp->v_rdev	= sockdev;
578 
579 	so->so_rcv_queued = 0;
580 	so->so_rcv_q_head = NULL;
581 	so->so_rcv_q_last_head = NULL;
582 	so->so_rcv_head	= NULL;
583 	so->so_rcv_last_head = NULL;
584 
585 	so->so_snd_qfull = B_FALSE;
586 	so->so_minpsz = 0;
587 
588 	so->so_rcv_wakeup = B_FALSE;
589 	so->so_snd_wakeup = B_FALSE;
590 	so->so_flowctrld = B_FALSE;
591 
592 	so->so_pollev = 0;
593 	bzero(&so->so_poll_list, sizeof (so->so_poll_list));
594 	bzero(&so->so_proto_props, sizeof (struct sock_proto_props));
595 
596 	bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t));
597 	so->so_ksock_cb_arg = NULL;
598 
599 	so->so_max_addr_len = sizeof (struct sockaddr_storage);
600 
601 	so->so_direct = NULL;
602 
603 	vn_exists(vp);
604 }
605 
606 void
607 sonode_fini(struct sonode *so)
608 {
609 	mblk_t *mp;
610 	vnode_t *vp;
611 
612 	ASSERT(so->so_count == 0);
613 
614 	if (so->so_rcv_timer_tid) {
615 		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
616 		(void) untimeout(so->so_rcv_timer_tid);
617 		so->so_rcv_timer_tid = 0;
618 	}
619 
620 	so_acceptq_flush(so);
621 
622 	if ((mp = so->so_oobmsg) != NULL) {
623 		freemsg(mp);
624 		so->so_oobmsg = NULL;
625 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|
626 		    SS_RCVATMARK);
627 	}
628 
629 	if (so->so_poll_list.ph_list != NULL) {
630 		pollwakeup(&so->so_poll_list, POLLERR);
631 		pollhead_clean(&so->so_poll_list);
632 	}
633 
634 	if (so->so_direct != NULL) {
635 		sodirect_t *sodp = so->so_direct;
636 
637 		ASSERT(sodp->sod_uioafh == NULL);
638 
639 		so->so_direct = NULL;
640 		kmem_cache_free(sock_sod_cache, sodp);
641 	}
642 
643 	vp = SOTOV(so);
644 	vn_invalid(vp);
645 
646 	if (so->so_peercred != NULL) {
647 		crfree(so->so_peercred);
648 		so->so_peercred = NULL;
649 	}
650 }
651 
652 /*
653  * This function is called at the beginning of recvmsg().
654  *
655  * If I/OAT is enabled on this sonode, initialize the uioa state machine
656  * with state UIOA_ALLOC.
657  */
658 uio_t *
659 sod_rcv_init(struct sonode *so, int flags, struct uio **uiopp)
660 {
661 	struct uio *suiop;
662 	struct uio *uiop;
663 	sodirect_t *sodp = so->so_direct;
664 
665 	if (sodp == NULL)
666 		return (NULL);
667 
668 	suiop = NULL;
669 	uiop = *uiopp;
670 
671 	mutex_enter(sodp->sod_lockp);
672 	if (uiop->uio_resid >= uioasync.mincnt &&
673 	    sodp != NULL && (sodp->sod_state & SOD_ENABLED) &&
674 	    uioasync.enabled && !(flags & MSG_PEEK) &&
675 	    !(so->so_state & SS_CANTRCVMORE)) {
676 		/*
677 		 * Big enough I/O for uioa min setup and an sodirect socket
678 		 * and sodirect enabled and uioa enabled and I/O will be done
679 		 * and not EOF so initialize the sodirect_t uioa_t with "uiop".
680 		 */
681 		if (!uioainit(uiop, &sodp->sod_uioa)) {
682 			/*
683 			 * Successful uioainit() so the uio_t part of the
684 			 * uioa_t will be used for all uio_t work to follow,
685 			 * we return the original "uiop" in "suiop".
686 			 */
687 			suiop = uiop;
688 			*uiopp = (uio_t *)&sodp->sod_uioa;
689 			/*
690 			 * Before returning to the caller the passed in uio_t
691 			 * "uiop" will be updated via a call to uioafini()
692 			 * below.
693 			 *
694 			 * Note, the uioa.uioa_state isn't set to UIOA_ENABLED
695 			 * here as first we have to uioamove() any currently
696 			 * queued M_DATA mblk_t(s) so it will be done later.
697 			 */
698 		}
699 		/*
700 		 * In either uioainit() success or not case note the number
701 		 * of uio bytes the caller wants for sod framework and/or
702 		 * transport (e.g. TCP) strategy.
703 		 */
704 		sodp->sod_want = uiop->uio_resid;
705 	} else if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) {
706 		/*
707 		 * No uioa but still using sodirect so note the number of
708 		 * uio bytes the caller wants for sodirect framework and/or
709 		 * transport (e.g. TCP) strategy.
710 		 */
711 		sodp->sod_want = uiop->uio_resid;
712 	}
713 	mutex_exit(sodp->sod_lockp);
714 
715 	return (suiop);
716 }
717 
718 /*
719  * This function is called at the end of recvmsg(), it finializes all the I/OAT
720  * operations, and reset the uioa state to UIOA_ALLOC.
721  */
722 int
723 sod_rcv_done(struct sonode *so, struct uio *suiop, struct uio *uiop)
724 {
725 	int error = 0;
726 	sodirect_t *sodp = so->so_direct;
727 	mblk_t *mp;
728 
729 	if (sodp == NULL) {
730 		return (0);
731 	}
732 
733 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
734 	/* Finish any sodirect and uioa processing */
735 	if (suiop != NULL) {
736 		/* Finish any uioa_t processing */
737 
738 		ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
739 		error = uioafini(suiop, (uioa_t *)uiop);
740 		if ((mp = sodp->sod_uioafh) != NULL) {
741 			sodp->sod_uioafh = NULL;
742 			sodp->sod_uioaft = NULL;
743 			freemsg(mp);
744 		}
745 	}
746 	ASSERT(sodp->sod_uioafh == NULL);
747 	if (!(sodp->sod_state & SOD_WAKE_NOT)) {
748 		/* Awoke */
749 		sodp->sod_state &= SOD_WAKE_CLR;
750 		sodp->sod_state |= SOD_WAKE_NOT;
751 	}
752 	/* Last, clear sod_want value */
753 	sodp->sod_want = 0;
754 
755 	return (error);
756 }
757 
758 /*
759  * Schedule a uioamove() on a mblk. This is ususally called from
760  * protocols (e.g. TCP) on a I/OAT enabled sonode.
761  */
762 mblk_t *
763 sod_uioa_mblk_init(struct sodirect_s *sodp, mblk_t *mp, size_t msg_size)
764 {
765 	uioa_t *uioap = &sodp->sod_uioa;
766 	mblk_t *mp1 = mp;
767 	mblk_t *lmp = NULL;
768 
769 	ASSERT(DB_TYPE(mp) == M_DATA);
770 	ASSERT(msg_size == msgdsize(mp));
771 
772 	/* Caller must have lock held */
773 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
774 
775 	if (uioap->uioa_state & UIOA_ENABLED) {
776 		/* Uioa is enabled */
777 
778 		if (msg_size > uioap->uio_resid) {
779 			/*
780 			 * There isn't enough uio space for the mblk_t chain
781 			 * so disable uioa such that this and any additional
782 			 * mblk_t data is handled by the socket and schedule
783 			 * the socket for wakeup to finish this uioa.
784 			 */
785 			uioap->uioa_state &= UIOA_CLR;
786 			uioap->uioa_state |= UIOA_FINI;
787 			if (sodp->sod_state & SOD_WAKE_NOT) {
788 				sodp->sod_state &= SOD_WAKE_CLR;
789 				sodp->sod_state |= SOD_WAKE_NEED;
790 			}
791 			return (mp);
792 		}
793 		do {
794 			uint32_t	len = MBLKL(mp1);
795 
796 			if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) {
797 				/* Scheduled, mark dblk_t as such */
798 				DB_FLAGS(mp1) |= DBLK_UIOA;
799 			} else {
800 				/* Error, turn off async processing */
801 				uioap->uioa_state &= UIOA_CLR;
802 				uioap->uioa_state |= UIOA_FINI;
803 				break;
804 			}
805 			lmp = mp1;
806 		} while ((mp1 = mp1->b_cont) != NULL);
807 
808 		if (mp1 != NULL || uioap->uio_resid == 0) {
809 			/*
810 			 * Not all mblk_t(s) uioamoved (error) or all uio
811 			 * space has been consumed so schedule the socket
812 			 * for wakeup to finish this uio.
813 			 */
814 			sodp->sod_state &= SOD_WAKE_CLR;
815 			sodp->sod_state |= SOD_WAKE_NEED;
816 
817 			/* Break the mblk chain if neccessary. */
818 			if (mp1 != NULL && lmp != NULL) {
819 				mp->b_next = mp1;
820 				lmp->b_cont = NULL;
821 			}
822 		}
823 	}
824 	return (mp1);
825 }
826 
827 /*
828  * This function is called on a mblk that thas been successfully uioamoved().
829  */
830 void
831 sod_uioa_mblk_done(sodirect_t *sodp, mblk_t *bp)
832 {
833 	if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) {
834 		/*
835 		 * A uioa flaged mblk_t chain, already uio processed,
836 		 * add it to the sodirect uioa pending free list.
837 		 *
838 		 * Note, a b_cont chain headed by a DBLK_UIOA enable
839 		 * mblk_t must have all mblk_t(s) DBLK_UIOA enabled.
840 		 */
841 		mblk_t	*bpt = sodp->sod_uioaft;
842 
843 		ASSERT(sodp != NULL);
844 
845 		/*
846 		 * Add first mblk_t of "bp" chain to current sodirect uioa
847 		 * free list tail mblk_t, if any, else empty list so new head.
848 		 */
849 		if (bpt == NULL)
850 			sodp->sod_uioafh = bp;
851 		else
852 			bpt->b_cont = bp;
853 
854 		/*
855 		 * Walk mblk_t "bp" chain to find tail and adjust rptr of
856 		 * each to reflect that uioamove() has consumed all data.
857 		 */
858 		bpt = bp;
859 		for (;;) {
860 			ASSERT(bpt->b_datap->db_flags & DBLK_UIOA);
861 
862 			bpt->b_rptr = bpt->b_wptr;
863 			if (bpt->b_cont == NULL)
864 				break;
865 			bpt = bpt->b_cont;
866 		}
867 		/* New sodirect uioa free list tail */
868 		sodp->sod_uioaft = bpt;
869 
870 		/* Only dequeue once with data returned per uioa_t */
871 		if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) {
872 			sodp->sod_uioa.uioa_state &= UIOA_CLR;
873 			sodp->sod_uioa.uioa_state |= UIOA_FINI;
874 		}
875 	}
876 }
877 
/*
 * When transitioning from UIOA_INIT state to UIOA_ENABLE state in
 * recvmsg(), call this function on a non-STREAMS socket to schedule
 * uioamove() on the data that is already queued in this socket.
 *
 * The caller must hold sod_lockp, and `uiop' must be the sodirect uioa_t
 * (both asserted).
 *
 * The first b_cont chain of so_rcv_q_head is walked first. If that chain
 * was scheduled completely (or the queue is empty) and it has no b_next
 * sibling, the first chain of so_rcv_head is walked as well, once.
 *
 * Scheduling stops at the first mblk that is not M_DATA, does not fit in
 * the remaining uio space, would reach the out-of-band mark, or for which
 * uioamove() fails. In that case uioa is moved to UIOA_FINI and, if at
 * least one mblk of the current chain was processed, the chain is split
 * so that the processed mblks stay in front and the rest become the next
 * message on the same list.
 */
void
sod_uioa_so_init(struct sonode *so, struct sodirect_s *sodp, struct uio *uiop)
{
	uioa_t	*uioap = (uioa_t *)uiop;
	mblk_t	*lbp;		/* last mblk processed in the current chain */
	mblk_t	*wbp;		/* mblk currently being examined */
	mblk_t	*bp;		/* head of the chain being walked */
	int	len;
	int	error;
	boolean_t in_rcv_q = B_TRUE;	/* walking so_rcv_q_head, not so_rcv_head */

	ASSERT(MUTEX_HELD(sodp->sod_lockp));
	ASSERT(&sodp->sod_uioa == uioap);

	/*
	 * Walk the first b_cont chain of the receive queue
	 * and schedule any M_DATA mblk_t's for uio asynchronous move.
	 */
	bp = so->so_rcv_q_head;

again:
	/* Walk the chain */
	lbp = NULL;
	wbp = bp;

	do {
		/* Empty list: nothing to walk */
		if (bp == NULL)
			break;

		if (wbp->b_datap->db_type != M_DATA) {
			/* Not M_DATA, no more uioa */
			goto nouioa;
		}
		if ((len = wbp->b_wptr - wbp->b_rptr) > 0) {
			/* Have a M_DATA mblk_t with data */
			if (len > uioap->uio_resid || (so->so_oobmark > 0 &&
			    len + uioap->uioa_mbytes >= so->so_oobmark)) {
				/* Not enough uio space, or beyond oobmark */
				goto nouioa;
			}
			ASSERT(!(wbp->b_datap->db_flags & DBLK_UIOA));
			error = uioamove(wbp->b_rptr, len,
			    UIO_READ, uioap);
			if (!error) {
				/* Scheduled, mark dblk_t as such */
				wbp->b_datap->db_flags |= DBLK_UIOA;
			} else {
				/* Break the mblk chain */
				goto nouioa;
			}
		}
		/* Save last wbp processed */
		lbp = wbp;
	} while ((wbp = wbp->b_cont) != NULL);

	if (in_rcv_q && (bp == NULL || bp->b_next == NULL)) {
		/*
		 * We get here only once to process the sonode dump area
		 * if so_rcv_q_head is NULL or all the mblks have been
		 * successfully uioamove()d.
		 */
		in_rcv_q = B_FALSE;

		/* move to dump area */
		bp = so->so_rcv_head;
		goto again;
	}

	return;

nouioa:
	/* No more uioa */
	uioap->uioa_state &= UIOA_CLR;
	uioap->uioa_state |= UIOA_FINI;

	/*
	 * If we processed 1 or more mblk_t(s) then we need to split the
	 * current mblk_t chain in 2 so that all the uioamove()ed mblk_t(s)
	 * are in the current chain and the rest are in the following new
	 * chain.
	 */
	if (lbp != NULL) {
		/* New end of current chain */
		lbp->b_cont = NULL;

		/* Insert new chain wbp after bp */
		if ((wbp->b_next = bp->b_next) == NULL) {
			/*
			 * wbp is now the last message on its list, so the
			 * matching last-head pointer must follow it.
			 *
			 * No need to grab so_lock, since sod_lockp
			 * points to so_lock.
			 */
			if (in_rcv_q)
				so->so_rcv_q_last_head = wbp;
			else
				so->so_rcv_last_head = wbp;
		}
		bp->b_next = wbp;
		/* wbp takes over bp's old b_prev; bp's b_prev becomes lbp */
		bp->b_next->b_prev = bp->b_prev;
		bp->b_prev = lbp;
	}
}
984 
985 /*
986  * Initialize sodirect data structures on a socket.
987  */
988 void
989 sod_sock_init(struct sonode *so, struct stdata *stp, sod_enq_func enq_func,
990     sod_wakeup_func wake_func, kmutex_t *lockp)
991 {
992 	sodirect_t	*sodp;
993 
994 	ASSERT(so->so_direct == NULL);
995 
996 	so->so_state |= SS_SODIRECT;
997 
998 	sodp = kmem_cache_alloc(sock_sod_cache, KM_SLEEP);
999 	sodp->sod_state = SOD_ENABLED | SOD_WAKE_NOT;
1000 	sodp->sod_want = 0;
1001 	sodp->sod_q = (stp != NULL) ? RD(stp->sd_wrq) : NULL;
1002 	sodp->sod_enqueue = enq_func;
1003 	sodp->sod_wakeup = wake_func;
1004 	sodp->sod_uioafh = NULL;
1005 	sodp->sod_uioaft = NULL;
1006 	sodp->sod_lockp = lockp;
1007 	/*
1008 	 * Remainder of the sod_uioa members are left uninitialized
1009 	 * but will be initialized later by uioainit() before uioa
1010 	 * is enabled.
1011 	 */
1012 	sodp->sod_uioa.uioa_state = UIOA_ALLOC;
1013 	so->so_direct = sodp;
1014 	if (stp != NULL)
1015 		stp->sd_sodirect = sodp;
1016 }
1017 
1018 /*
1019  * Init the sodirect kmem cache while sockfs is loading.
1020  */
1021 void
1022 sod_init()
1023 {
1024 	/* Allocate sodirect_t kmem_cache */
1025 	sock_sod_cache = kmem_cache_create("sock_sod_cache",
1026 	    sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1027 }
1028 
1029 ssize_t
1030 sod_uioa_mblk(struct sonode *so, mblk_t *mp)
1031 {
1032 	sodirect_t *sodp = so->so_direct;
1033 
1034 	ASSERT(sodp != NULL);
1035 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
1036 
1037 	ASSERT(sodp->sod_state & SOD_ENABLED);
1038 	ASSERT(sodp->sod_uioa.uioa_state != (UIOA_ALLOC|UIOA_INIT));
1039 
1040 	ASSERT(sodp->sod_uioa.uioa_state & (UIOA_ENABLED|UIOA_FINI));
1041 
1042 	if (mp == NULL && so->so_rcv_q_head != NULL) {
1043 		mp = so->so_rcv_q_head;
1044 		ASSERT(mp->b_prev != NULL);
1045 		mp->b_prev = NULL;
1046 		so->so_rcv_q_head = mp->b_next;
1047 		if (so->so_rcv_q_head == NULL) {
1048 			so->so_rcv_q_last_head = NULL;
1049 		}
1050 		mp->b_next = NULL;
1051 	}
1052 
1053 	sod_uioa_mblk_done(sodp, mp);
1054 
1055 	if (so->so_rcv_q_head == NULL && so->so_rcv_head != NULL &&
1056 	    DB_TYPE(so->so_rcv_head) == M_DATA &&
1057 	    (DB_FLAGS(so->so_rcv_head) & DBLK_UIOA)) {
1058 		/* more arrived */
1059 		ASSERT(so->so_rcv_q_head == NULL);
1060 		mp = so->so_rcv_head;
1061 		so->so_rcv_head = mp->b_next;
1062 		if (so->so_rcv_head == NULL)
1063 			so->so_rcv_last_head = NULL;
1064 		mp->b_prev = mp->b_next = NULL;
1065 		sod_uioa_mblk_done(sodp, mp);
1066 	}
1067 
1068 #ifdef DEBUG
1069 	if (so->so_rcv_q_head != NULL) {
1070 		mblk_t *m = so->so_rcv_q_head;
1071 		while (m != NULL) {
1072 			if (DB_FLAGS(m) & DBLK_UIOA) {
1073 				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
1074 				    " in so_rcv_q_head.\n", (void *)m);
1075 			}
1076 			m = m->b_next;
1077 		}
1078 	}
1079 	if (so->so_rcv_head != NULL) {
1080 		mblk_t *m = so->so_rcv_head;
1081 		while (m != NULL) {
1082 			if (DB_FLAGS(m) & DBLK_UIOA) {
1083 				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
1084 				    " in so_rcv_head.\n", (void *)m);
1085 			}
1086 			m = m->b_next;
1087 		}
1088 	}
1089 #endif
1090 	return (sodp->sod_uioa.uioa_mbytes);
1091 }
1092