xref: /illumos-gate/usr/src/uts/common/fs/sockfs/sockcommon.c (revision 9b4e3ac25d882519cad3fc11f0c53b07f4e60536)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/sysmacros.h>
31 #include <sys/debug.h>
32 #include <sys/cmn_err.h>
33 #include <sys/vfs.h>
34 #include <sys/policy.h>
35 #include <sys/modctl.h>
36 
37 #include <sys/sunddi.h>
38 
39 #include <sys/strsun.h>
40 #include <sys/stropts.h>
41 #include <sys/strsubr.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/sodirect.h>
45 #include <sys/uio.h>
46 
47 #include <inet/ipclassifier.h>
48 #include <fs/sockfs/sockcommon.h>
49 #include <fs/sockfs/nl7c.h>
50 #include <inet/ip.h>
51 
52 extern int xnet_skip_checks, xnet_check_print, xnet_truncate_print;
53 
54 static struct kmem_cache *sock_sod_cache;
55 
56 /*
57  * Common socket access functions.
58  *
59  * Instead of accessing the sonode switch directly (i.e., SOP_xxx()),
60  * the socket_xxx() function should be used.
61  */
62 
63 /*
64  * Try to create a new sonode of the requested <family, type, protocol>.
65  */
66 /* ARGSUSED */
67 struct sonode *
68 socket_create(int family, int type, int protocol, char *devpath, char *mod,
69     int flags, int version, struct cred *cr, int *errorp)
70 {
71 	struct sonode *so;
72 	struct sockparams *sp = NULL;
73 
74 	/*
75 	 * Look for a sockparams entry that match the given criteria.
76 	 * solookup() returns with the entry held.
77 	 */
78 	*errorp = solookup(family, type, protocol, &sp);
79 	if (sp == NULL) {
80 		int kmflags = (flags == SOCKET_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
81 		/*
82 		 * There is no matching sockparams entry. An ephemeral entry is
83 		 * created if the caller specifies a device or a socket module.
84 		 */
85 		if (devpath != NULL) {
86 			sp = sockparams_hold_ephemeral_bydev(family, type,
87 			    protocol, devpath, kmflags, errorp);
88 		} else if (mod != NULL) {
89 			sp = sockparams_hold_ephemeral_bymod(family, type,
90 			    protocol, mod, kmflags, errorp);
91 		} else {
92 			return (NULL);
93 		}
94 
95 		if (sp == NULL)
96 			return (NULL);
97 	}
98 
99 	ASSERT(sp->sp_smod_info != NULL);
100 	ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP);
101 	so = sp->sp_smod_info->smod_sock_create_func(sp, family, type,
102 	    protocol, version, flags, errorp, cr);
103 	if (so == NULL) {
104 		SOCKPARAMS_DEC_REF(sp);
105 	} else {
106 		if ((*errorp = SOP_INIT(so, NULL, cr, flags)) == 0) {
107 			/* Cannot fail, only bumps so_count */
108 			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
109 		} else {
110 			socket_destroy(so);
111 			so = NULL;
112 		}
113 	}
114 	return (so);
115 }
116 
117 struct sonode *
118 socket_newconn(struct sonode *parent, sock_lower_handle_t lh,
119     sock_downcalls_t *dc, int flags, int *errorp)
120 {
121 	struct sonode *so;
122 	struct sockparams *sp;
123 	struct cred *cr;
124 
125 	if ((cr = CRED()) == NULL)
126 		cr = kcred;
127 
128 	sp = parent->so_sockparams;
129 	ASSERT(sp != NULL);
130 
131 	so = sp->sp_smod_info->smod_sock_create_func(sp, parent->so_family,
132 	    parent->so_type, parent->so_protocol, parent->so_version, flags,
133 	    errorp, cr);
134 	if (so != NULL) {
135 		SOCKPARAMS_INC_REF(sp);
136 
137 		so->so_proto_handle = lh;
138 		so->so_downcalls = dc;
139 		/*
140 		 * This function may be called in interrupt context, and CRED()
141 		 * will be NULL. In this case, pass in kcred.
142 		 */
143 		if ((*errorp = SOP_INIT(so, parent, cr, flags)) == 0) {
144 			/* Cannot fail, only bumps so_count */
145 			(void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
146 		} else  {
147 			socket_destroy(so);
148 			so = NULL;
149 		}
150 	}
151 
152 	return (so);
153 }
154 
155 /*
156  * Bind local endpoint.
157  */
158 int
159 socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
160     int flags, cred_t *cr)
161 {
162 	return (SOP_BIND(so, name, namelen, flags, cr));
163 }
164 
165 /*
166  * Turn socket into a listen socket.
167  */
168 int
169 socket_listen(struct sonode *so, int backlog, cred_t *cr)
170 {
171 	if (backlog < 0) {
172 		backlog = 0;
173 	}
174 
175 	/*
176 	 * Use the same qlimit as in BSD. BSD checks the qlimit
177 	 * before queuing the next connection implying that a
178 	 * listen(sock, 0) allows one connection to be queued.
179 	 * BSD also uses 1.5 times the requested backlog.
180 	 *
181 	 * XNS Issue 4 required a strict interpretation of the backlog.
182 	 * This has been waived subsequently for Issue 4 and the change
183 	 * incorporated in XNS Issue 5. So we aren't required to do
184 	 * anything special for XPG apps.
185 	 */
186 	if (backlog >= (INT_MAX - 1) / 3)
187 		backlog = INT_MAX;
188 	else
189 		backlog = backlog * 3 / 2 + 1;
190 
191 	return (SOP_LISTEN(so, backlog, cr));
192 }
193 
194 /*
195  * Accept incoming connection.
196  */
197 int
198 socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop)
199 {
200 	return (SOP_ACCEPT(lso, fflag, cr, nsop));
201 }
202 
203 /*
204  * Active open.
205  */
206 int
207 socket_connect(struct sonode *so, const struct sockaddr *name,
208     socklen_t namelen, int fflag, int flags, cred_t *cr)
209 {
210 	int error;
211 
212 	/*
213 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
214 	 * connect to a null address. This is the portable method to
215 	 * unconnect a socket.
216 	 */
217 	if ((namelen >= sizeof (sa_family_t)) &&
218 	    (name->sa_family == AF_UNSPEC)) {
219 		name = NULL;
220 		namelen = 0;
221 	}
222 
223 	error = SOP_CONNECT(so, name, namelen, fflag, flags, cr);
224 
225 	if (error == EHOSTUNREACH && flags & _SOCONNECT_XPG4_2) {
226 		/*
227 		 * X/Open specification contains a requirement that
228 		 * ENETUNREACH be returned but does not require
229 		 * EHOSTUNREACH. In order to keep the test suite
230 		 * happy we mess with the errno here.
231 		 */
232 		error = ENETUNREACH;
233 	}
234 
235 	return (error);
236 }
237 
238 /*
239  * Get address of remote node.
240  */
241 int
242 socket_getpeername(struct sonode *so, struct sockaddr *addr,
243     socklen_t *addrlen, boolean_t accept, cred_t *cr)
244 {
245 	ASSERT(*addrlen > 0);
246 	return (SOP_GETPEERNAME(so, addr, addrlen, accept, cr));
247 
248 }
249 
250 /*
251  * Get local address.
252  */
253 int
254 socket_getsockname(struct sonode *so, struct sockaddr *addr,
255     socklen_t *addrlen, cred_t *cr)
256 {
257 	return (SOP_GETSOCKNAME(so, addr, addrlen, cr));
258 
259 }
260 
261 /*
262  * Called from shutdown().
263  */
264 int
265 socket_shutdown(struct sonode *so, int how, cred_t *cr)
266 {
267 	return (SOP_SHUTDOWN(so, how, cr));
268 }
269 
270 /*
271  * Get socket options.
272  */
273 /*ARGSUSED*/
274 int
275 socket_getsockopt(struct sonode *so, int level, int option_name,
276     void *optval, socklen_t *optlenp, int flags, cred_t *cr)
277 {
278 	return (SOP_GETSOCKOPT(so, level, option_name, optval,
279 	    optlenp, flags, cr));
280 }
281 
282 /*
283  * Set socket options
284  */
285 int
286 socket_setsockopt(struct sonode *so, int level, int option_name,
287     const void *optval, t_uscalar_t optlen, cred_t *cr)
288 {
289 	/* Caller allocates aligned optval, or passes null */
290 	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
291 	/* If optval is null optlen is 0, and vice-versa */
292 	ASSERT(optval != NULL || optlen == 0);
293 	ASSERT(optlen != 0 || optval == NULL);
294 
295 	/* No options should be zero-length */
296 	if (optlen == 0)
297 		return (EINVAL);
298 
299 	return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
300 }
301 
302 int
303 socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
304     cred_t *cr)
305 {
306 	int error = 0;
307 	ssize_t orig_resid = uiop->uio_resid;
308 
309 	/*
310 	 * Do not bypass the cache if we are doing a local (AF_UNIX) write.
311 	 */
312 	if (so->so_family == AF_UNIX)
313 		uiop->uio_extflg |= UIO_COPY_CACHED;
314 	else
315 		uiop->uio_extflg &= ~UIO_COPY_CACHED;
316 
317 	error = SOP_SENDMSG(so, msg, uiop, cr);
318 	switch (error) {
319 	default:
320 		break;
321 	case EINTR:
322 	case ETIME:
323 	case EWOULDBLOCK:
324 		/* We did a partial send */
325 		if (uiop->uio_resid != orig_resid)
326 			error = 0;
327 		break;
328 	case EPIPE:
329 		if ((so->so_mode & SM_KERNEL) == 0)
330 			tsignal(curthread, SIGPIPE);
331 		break;
332 	}
333 
334 	return (error);
335 }
336 
337 int
338 socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
339     struct cred *cr, mblk_t **mpp)
340 {
341 	int error = 0;
342 
343 	error = SOP_SENDMBLK(so, msg, fflag, cr, mpp);
344 	if (error == EPIPE) {
345 		tsignal(curthread, SIGPIPE);
346 	}
347 	return (error);
348 }
349 
350 int
351 socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
352     cred_t *cr)
353 {
354 	int error;
355 	ssize_t orig_resid = uiop->uio_resid;
356 
357 	/*
358 	 * Do not bypass the cache when reading data, as the application
359 	 * is likely to access the data shortly.
360 	 */
361 	uiop->uio_extflg |= UIO_COPY_CACHED;
362 
363 	error = SOP_RECVMSG(so, msg, uiop, cr);
364 
365 	switch (error) {
366 	case EINTR:
367 	case ETIME:
368 	case EWOULDBLOCK:
369 		/* We did a partial read */
370 		if (uiop->uio_resid != orig_resid)
371 			error = 0;
372 		break;
373 	default:
374 		break;
375 	}
376 	return (error);
377 }
378 
379 int
380 socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
381     struct cred *cr, int32_t *rvalp)
382 {
383 	return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
384 }
385 
/*
 * Socket poll.  Hands the request to SOP_POLL() and returns its result.
 */
int
socket_poll(struct sonode *so, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	int error;

	error = SOP_POLL(so, events, anyyet, reventsp, phpp);
	return (error);
}
392 
393 int
394 socket_close(struct sonode *so, int flag, struct cred *cr)
395 {
396 	return (VOP_CLOSE(SOTOV(so), flag, 1, 0, cr, NULL));
397 }
398 
399 int
400 socket_close_internal(struct sonode *so, int flag, cred_t *cr)
401 {
402 	ASSERT(so->so_count == 0);
403 
404 	return (SOP_CLOSE(so, flag, cr));
405 }
406 
/*
 * Dispose of a sonode: mark its vnode invalid with vn_invalid() and then
 * release it with VN_RELE().
 */
void
socket_destroy(struct sonode *so)
{
	struct vnode *vp = SOTOV(so);

	vn_invalid(vp);
	VN_RELE(vp);
}
413 
414 /* ARGSUSED */
415 void
416 socket_destroy_internal(struct sonode *so, cred_t *cr)
417 {
418 	struct sockparams *sp = so->so_sockparams;
419 	ASSERT(so->so_count == 0 && sp != NULL);
420 
421 	sp->sp_smod_info->smod_sock_destroy_func(so);
422 
423 	SOCKPARAMS_DEC_REF(sp);
424 }
425 
426 /*
427  * TODO Once the common vnode ops is available, then the vnops argument
428  * should be removed.
429  */
430 /*ARGSUSED*/
431 int
432 sonode_constructor(void *buf, void *cdrarg, int kmflags)
433 {
434 	struct sonode *so = buf;
435 	struct vnode *vp;
436 
437 	vp = so->so_vnode = vn_alloc(kmflags);
438 	if (vp == NULL) {
439 		return (-1);
440 	}
441 	vp->v_data = so;
442 	vn_setops(vp, socket_vnodeops);
443 
444 	so->so_priv 		= NULL;
445 	so->so_oobmsg		= NULL;
446 
447 	so->so_proto_handle	= NULL;
448 
449 	so->so_peercred 	= NULL;
450 
451 	so->so_rcv_queued	= 0;
452 	so->so_rcv_q_head 	= NULL;
453 	so->so_rcv_q_last_head 	= NULL;
454 	so->so_rcv_head		= NULL;
455 	so->so_rcv_last_head	= NULL;
456 	so->so_rcv_wanted	= 0;
457 	so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER;
458 	so->so_rcv_timer_tid	= 0;
459 	so->so_rcv_thresh	= 0;
460 
461 	so->so_acceptq_head	= NULL;
462 	so->so_acceptq_tail	= &so->so_acceptq_head;
463 	so->so_acceptq_next	= NULL;
464 	so->so_acceptq_len	= 0;
465 	so->so_backlog		= 0;
466 
467 	so->so_snd_qfull	= B_FALSE;
468 
469 	mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL);
470 	mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL);
471 	rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL);
472 	cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL);
473 	cv_init(&so->so_want_cv, NULL, CV_DEFAULT, NULL);
474 
475 	cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL);
476 	cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL);
477 	cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL);
478 	cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL);
479 	cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL);
480 
481 	return (0);
482 }
483 
484 /*ARGSUSED*/
485 void
486 sonode_destructor(void *buf, void *cdrarg)
487 {
488 	struct sonode *so = buf;
489 	struct vnode *vp = SOTOV(so);
490 
491 	ASSERT(so->so_priv == NULL);
492 	ASSERT(so->so_peercred == NULL);
493 
494 	ASSERT(so->so_oobmsg == NULL);
495 
496 	ASSERT(so->so_rcv_q_head == NULL);
497 
498 	ASSERT(so->so_acceptq_head == NULL);
499 	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
500 	ASSERT(so->so_acceptq_next == NULL);
501 
502 	ASSERT(vp->v_data == so);
503 	ASSERT(vn_matchops(vp, socket_vnodeops));
504 
505 	vn_free(vp);
506 
507 	mutex_destroy(&so->so_lock);
508 	mutex_destroy(&so->so_acceptq_lock);
509 	rw_destroy(&so->so_fallback_rwlock);
510 
511 	cv_destroy(&so->so_state_cv);
512 	cv_destroy(&so->so_want_cv);
513 	cv_destroy(&so->so_acceptq_cv);
514 	cv_destroy(&so->so_snd_cv);
515 	cv_destroy(&so->so_rcv_cv);
516 	cv_destroy(&so->so_closing_cv);
517 }
518 
519 void
520 sonode_init(struct sonode *so, struct sockparams *sp, int family,
521     int type, int protocol, sonodeops_t *sops)
522 {
523 	vnode_t *vp;
524 
525 	vp = SOTOV(so);
526 
527 	so->so_flag	= 0;
528 
529 	so->so_state	= 0;
530 	so->so_mode	= 0;
531 
532 	so->so_count	= 0;
533 
534 	so->so_family	= family;
535 	so->so_type	= type;
536 	so->so_protocol	= protocol;
537 
538 	SOCK_CONNID_INIT(so->so_proto_connid);
539 
540 	so->so_options	= 0;
541 	so->so_linger.l_onoff   = 0;
542 	so->so_linger.l_linger = 0;
543 	so->so_sndbuf	= 0;
544 	so->so_error	= 0;
545 	so->so_rcvtimeo	= 0;
546 	so->so_sndtimeo = 0;
547 
548 	ASSERT(so->so_oobmsg == NULL);
549 	so->so_oobmark	= 0;
550 	so->so_pgrp	= 0;
551 
552 	ASSERT(so->so_peercred == NULL);
553 
554 	so->so_zoneid = getzoneid();
555 
556 	so->so_sockparams = sp;
557 
558 	so->so_ops = sops;
559 
560 	so->so_proto_handle = NULL;
561 
562 	so->so_downcalls = NULL;
563 
564 	so->so_copyflag = 0;
565 
566 	ASSERT(so->so_acceptq_head == NULL);
567 	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
568 	ASSERT(so->so_acceptq_next == NULL);
569 
570 	vn_reinit(vp);
571 	vp->v_vfsp	= rootvfs;
572 	vp->v_type	= VSOCK;
573 	vp->v_rdev	= sockdev;
574 
575 	so->so_rcv_queued = 0;
576 	so->so_rcv_q_head = NULL;
577 	so->so_rcv_q_last_head = NULL;
578 	so->so_rcv_head	= NULL;
579 	so->so_rcv_last_head = NULL;
580 
581 	so->so_snd_qfull = B_FALSE;
582 	so->so_minpsz = 0;
583 
584 	so->so_rcv_wakeup = B_FALSE;
585 	so->so_snd_wakeup = B_FALSE;
586 	so->so_flowctrld = B_FALSE;
587 
588 	so->so_pollev = 0;
589 	bzero(&so->so_poll_list, sizeof (so->so_poll_list));
590 	bzero(&so->so_proto_props, sizeof (struct sock_proto_props));
591 
592 	bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t));
593 	so->so_ksock_cb_arg = NULL;
594 
595 	so->so_max_addr_len = sizeof (struct sockaddr_storage);
596 
597 	so->so_direct = NULL;
598 
599 	vn_exists(vp);
600 }
601 
602 void
603 sonode_fini(struct sonode *so)
604 {
605 	mblk_t *mp;
606 	vnode_t *vp;
607 
608 	ASSERT(so->so_count == 0);
609 
610 	if (so->so_rcv_timer_tid) {
611 		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
612 		(void) untimeout(so->so_rcv_timer_tid);
613 		so->so_rcv_timer_tid = 0;
614 	}
615 
616 	so_acceptq_flush(so);
617 
618 #ifdef DEBUG
619 	mutex_enter(&so->so_lock);
620 	ASSERT(so_verify_oobstate(so));
621 	mutex_exit(&so->so_lock);
622 #endif /* DEBUG */
623 	if ((mp = so->so_oobmsg) != NULL) {
624 		freemsg(mp);
625 		so->so_oobmsg = NULL;
626 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|
627 		    SS_RCVATMARK);
628 	}
629 
630 	if (so->so_poll_list.ph_list != NULL) {
631 		pollwakeup(&so->so_poll_list, POLLERR);
632 		pollhead_clean(&so->so_poll_list);
633 	}
634 
635 	if (so->so_direct != NULL) {
636 		sodirect_t *sodp = so->so_direct;
637 
638 		ASSERT(sodp->sod_uioafh == NULL);
639 
640 		so->so_direct = NULL;
641 		kmem_cache_free(sock_sod_cache, sodp);
642 	}
643 
644 	vp = SOTOV(so);
645 	vn_invalid(vp);
646 
647 	if (so->so_peercred != NULL) {
648 		crfree(so->so_peercred);
649 		so->so_peercred = NULL;
650 	}
651 }
652 
653 /*
654  * This function is called at the beginning of recvmsg().
655  *
656  * If I/OAT is enabled on this sonode, initialize the uioa state machine
657  * with state UIOA_ALLOC.
658  */
659 uio_t *
660 sod_rcv_init(struct sonode *so, int flags, struct uio **uiopp)
661 {
662 	struct uio *suiop;
663 	struct uio *uiop;
664 	sodirect_t *sodp = so->so_direct;
665 
666 	if (sodp == NULL)
667 		return (NULL);
668 
669 	suiop = NULL;
670 	uiop = *uiopp;
671 
672 	mutex_enter(sodp->sod_lockp);
673 	if (uiop->uio_resid >= uioasync.mincnt &&
674 	    sodp != NULL && (sodp->sod_state & SOD_ENABLED) &&
675 	    uioasync.enabled && !(flags & MSG_PEEK) &&
676 	    !(so->so_state & SS_CANTRCVMORE)) {
677 		/*
678 		 * Big enough I/O for uioa min setup and an sodirect socket
679 		 * and sodirect enabled and uioa enabled and I/O will be done
680 		 * and not EOF so initialize the sodirect_t uioa_t with "uiop".
681 		 */
682 		if (!uioainit(uiop, &sodp->sod_uioa)) {
683 			/*
684 			 * Successful uioainit() so the uio_t part of the
685 			 * uioa_t will be used for all uio_t work to follow,
686 			 * we return the original "uiop" in "suiop".
687 			 */
688 			suiop = uiop;
689 			*uiopp = (uio_t *)&sodp->sod_uioa;
690 			/*
691 			 * Before returning to the caller the passed in uio_t
692 			 * "uiop" will be updated via a call to uioafini()
693 			 * below.
694 			 *
695 			 * Note, the uioa.uioa_state isn't set to UIOA_ENABLED
696 			 * here as first we have to uioamove() any currently
697 			 * queued M_DATA mblk_t(s) so it will be done later.
698 			 */
699 		}
700 		/*
701 		 * In either uioainit() success or not case note the number
702 		 * of uio bytes the caller wants for sod framework and/or
703 		 * transport (e.g. TCP) strategy.
704 		 */
705 		sodp->sod_want = uiop->uio_resid;
706 	} else if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) {
707 		/*
708 		 * No uioa but still using sodirect so note the number of
709 		 * uio bytes the caller wants for sodirect framework and/or
710 		 * transport (e.g. TCP) strategy.
711 		 */
712 		sodp->sod_want = uiop->uio_resid;
713 	}
714 	mutex_exit(sodp->sod_lockp);
715 
716 	return (suiop);
717 }
718 
719 /*
720  * This function is called at the end of recvmsg(), it finializes all the I/OAT
721  * operations, and reset the uioa state to UIOA_ALLOC.
722  */
723 int
724 sod_rcv_done(struct sonode *so, struct uio *suiop, struct uio *uiop)
725 {
726 	int error = 0;
727 	sodirect_t *sodp = so->so_direct;
728 	mblk_t *mp;
729 
730 	if (sodp == NULL) {
731 		return (0);
732 	}
733 
734 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
735 	/* Finish any sodirect and uioa processing */
736 	if (suiop != NULL) {
737 		/* Finish any uioa_t processing */
738 
739 		ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
740 		error = uioafini(suiop, (uioa_t *)uiop);
741 		if ((mp = sodp->sod_uioafh) != NULL) {
742 			sodp->sod_uioafh = NULL;
743 			sodp->sod_uioaft = NULL;
744 			freemsg(mp);
745 		}
746 	}
747 	ASSERT(sodp->sod_uioafh == NULL);
748 	if (!(sodp->sod_state & SOD_WAKE_NOT)) {
749 		/* Awoke */
750 		sodp->sod_state &= SOD_WAKE_CLR;
751 		sodp->sod_state |= SOD_WAKE_NOT;
752 	}
753 	/* Last, clear sod_want value */
754 	sodp->sod_want = 0;
755 
756 	return (error);
757 }
758 
759 /*
760  * Schedule a uioamove() on a mblk. This is ususally called from
761  * protocols (e.g. TCP) on a I/OAT enabled sonode.
762  */
763 mblk_t *
764 sod_uioa_mblk_init(struct sodirect_s *sodp, mblk_t *mp, size_t msg_size)
765 {
766 	uioa_t *uioap = &sodp->sod_uioa;
767 	mblk_t *mp1 = mp;
768 	mblk_t *lmp = NULL;
769 
770 	ASSERT(DB_TYPE(mp) == M_DATA);
771 	ASSERT(msg_size == msgdsize(mp));
772 
773 	/* Caller must have lock held */
774 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
775 
776 	if (uioap->uioa_state & UIOA_ENABLED) {
777 		/* Uioa is enabled */
778 
779 		if (msg_size > uioap->uio_resid) {
780 			/*
781 			 * There isn't enough uio space for the mblk_t chain
782 			 * so disable uioa such that this and any additional
783 			 * mblk_t data is handled by the socket and schedule
784 			 * the socket for wakeup to finish this uioa.
785 			 */
786 			uioap->uioa_state &= UIOA_CLR;
787 			uioap->uioa_state |= UIOA_FINI;
788 			if (sodp->sod_state & SOD_WAKE_NOT) {
789 				sodp->sod_state &= SOD_WAKE_CLR;
790 				sodp->sod_state |= SOD_WAKE_NEED;
791 			}
792 			return (mp);
793 		}
794 		do {
795 			uint32_t	len = MBLKL(mp1);
796 
797 			if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) {
798 				/* Scheduled, mark dblk_t as such */
799 				DB_FLAGS(mp1) |= DBLK_UIOA;
800 			} else {
801 				/* Error, turn off async processing */
802 				uioap->uioa_state &= UIOA_CLR;
803 				uioap->uioa_state |= UIOA_FINI;
804 				break;
805 			}
806 			lmp = mp1;
807 		} while ((mp1 = mp1->b_cont) != NULL);
808 
809 		if (mp1 != NULL || uioap->uio_resid == 0) {
810 			/*
811 			 * Not all mblk_t(s) uioamoved (error) or all uio
812 			 * space has been consumed so schedule the socket
813 			 * for wakeup to finish this uio.
814 			 */
815 			sodp->sod_state &= SOD_WAKE_CLR;
816 			sodp->sod_state |= SOD_WAKE_NEED;
817 
818 			/* Break the mblk chain if neccessary. */
819 			if (mp1 != NULL && lmp != NULL) {
820 				mp->b_next = mp1;
821 				lmp->b_cont = NULL;
822 			}
823 		}
824 	}
825 	return (mp1);
826 }
827 
828 /*
829  * This function is called on a mblk that thas been successfully uioamoved().
830  */
831 void
832 sod_uioa_mblk_done(sodirect_t *sodp, mblk_t *bp)
833 {
834 	if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) {
835 		/*
836 		 * A uioa flaged mblk_t chain, already uio processed,
837 		 * add it to the sodirect uioa pending free list.
838 		 *
839 		 * Note, a b_cont chain headed by a DBLK_UIOA enable
840 		 * mblk_t must have all mblk_t(s) DBLK_UIOA enabled.
841 		 */
842 		mblk_t	*bpt = sodp->sod_uioaft;
843 
844 		ASSERT(sodp != NULL);
845 
846 		/*
847 		 * Add first mblk_t of "bp" chain to current sodirect uioa
848 		 * free list tail mblk_t, if any, else empty list so new head.
849 		 */
850 		if (bpt == NULL)
851 			sodp->sod_uioafh = bp;
852 		else
853 			bpt->b_cont = bp;
854 
855 		/*
856 		 * Walk mblk_t "bp" chain to find tail and adjust rptr of
857 		 * each to reflect that uioamove() has consumed all data.
858 		 */
859 		bpt = bp;
860 		for (;;) {
861 			ASSERT(bpt->b_datap->db_flags & DBLK_UIOA);
862 
863 			bpt->b_rptr = bpt->b_wptr;
864 			if (bpt->b_cont == NULL)
865 				break;
866 			bpt = bpt->b_cont;
867 		}
868 		/* New sodirect uioa free list tail */
869 		sodp->sod_uioaft = bpt;
870 
871 		/* Only dequeue once with data returned per uioa_t */
872 		if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) {
873 			sodp->sod_uioa.uioa_state &= UIOA_CLR;
874 			sodp->sod_uioa.uioa_state |= UIOA_FINI;
875 		}
876 	}
877 }
878 
879 /*
880  * When transit from UIOA_INIT state to UIOA_ENABLE state in recvmsg(), call
881  * this function on a non-STREAMS socket to schedule uioamove() on the data
882  * that has already queued in this socket.
883  */
884 void
885 sod_uioa_so_init(struct sonode *so, struct sodirect_s *sodp, struct uio *uiop)
886 {
887 	uioa_t	*uioap = (uioa_t *)uiop;
888 	mblk_t	*lbp;
889 	mblk_t	*wbp;
890 	mblk_t	*bp;
891 	int	len;
892 	int	error;
893 	boolean_t in_rcv_q = B_TRUE;
894 
895 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
896 	ASSERT(&sodp->sod_uioa == uioap);
897 
898 	/*
899 	 * Walk first b_cont chain in sod_q
900 	 * and schedule any M_DATA mblk_t's for uio asynchronous move.
901 	 */
902 	bp = so->so_rcv_q_head;
903 
904 again:
905 	/* Walk the chain */
906 	lbp = NULL;
907 	wbp = bp;
908 
909 	do {
910 		if (bp == NULL)
911 			break;
912 
913 		if (wbp->b_datap->db_type != M_DATA) {
914 			/* Not M_DATA, no more uioa */
915 			goto nouioa;
916 		}
917 		if ((len = wbp->b_wptr - wbp->b_rptr) > 0) {
918 			/* Have a M_DATA mblk_t with data */
919 			if (len > uioap->uio_resid || (so->so_oobmark > 0 &&
920 			    len + uioap->uioa_mbytes >= so->so_oobmark)) {
921 				/* Not enough uio sapce, or beyond oobmark */
922 				goto nouioa;
923 			}
924 			ASSERT(!(wbp->b_datap->db_flags & DBLK_UIOA));
925 			error = uioamove(wbp->b_rptr, len,
926 			    UIO_READ, uioap);
927 			if (!error) {
928 				/* Scheduled, mark dblk_t as such */
929 				wbp->b_datap->db_flags |= DBLK_UIOA;
930 			} else {
931 				/* Break the mblk chain */
932 				goto nouioa;
933 			}
934 		}
935 		/* Save last wbp processed */
936 		lbp = wbp;
937 	} while ((wbp = wbp->b_cont) != NULL);
938 
939 	if (in_rcv_q && (bp == NULL || bp->b_next == NULL)) {
940 		/*
941 		 * We get here only once to process the sonode dump area
942 		 * if so_rcv_q_head is NULL or all the mblks have been
943 		 * successfully uioamoved()ed.
944 		 */
945 		in_rcv_q = B_FALSE;
946 
947 		/* move to dump area */
948 		bp = so->so_rcv_head;
949 		goto again;
950 	}
951 
952 	return;
953 
954 nouioa:
955 	/* No more uioa */
956 	uioap->uioa_state &= UIOA_CLR;
957 	uioap->uioa_state |= UIOA_FINI;
958 
959 	/*
960 	 * If we processed 1 or more mblk_t(s) then we need to split the
961 	 * current mblk_t chain in 2 so that all the uioamove()ed mblk_t(s)
962 	 * are in the current chain and the rest are in the following new
963 	 * chain.
964 	 */
965 	if (lbp != NULL) {
966 		/* New end of current chain */
967 		lbp->b_cont = NULL;
968 
969 		/* Insert new chain wbp after bp */
970 		if ((wbp->b_next = bp->b_next) == NULL) {
971 			/*
972 			 * No need to grab so_lock, since sod_lockp
973 			 * points to so_lock.
974 			 */
975 			if (in_rcv_q)
976 				so->so_rcv_q_last_head = wbp;
977 			else
978 				so->so_rcv_last_head = wbp;
979 		}
980 		bp->b_next = wbp;
981 		bp->b_next->b_prev = bp->b_prev;
982 		bp->b_prev = lbp;
983 	}
984 }
985 
986 /*
987  * Initialize sodirect data structures on a socket.
988  */
989 void
990 sod_sock_init(struct sonode *so, struct stdata *stp, sod_enq_func enq_func,
991     sod_wakeup_func wake_func, kmutex_t *lockp)
992 {
993 	sodirect_t	*sodp;
994 
995 	ASSERT(so->so_direct == NULL);
996 
997 	so->so_state |= SS_SODIRECT;
998 
999 	sodp = kmem_cache_alloc(sock_sod_cache, KM_SLEEP);
1000 	sodp->sod_state = SOD_ENABLED | SOD_WAKE_NOT;
1001 	sodp->sod_want = 0;
1002 	sodp->sod_q = (stp != NULL) ? RD(stp->sd_wrq) : NULL;
1003 	sodp->sod_enqueue = enq_func;
1004 	sodp->sod_wakeup = wake_func;
1005 	sodp->sod_uioafh = NULL;
1006 	sodp->sod_uioaft = NULL;
1007 	sodp->sod_lockp = lockp;
1008 	/*
1009 	 * Remainder of the sod_uioa members are left uninitialized
1010 	 * but will be initialized later by uioainit() before uioa
1011 	 * is enabled.
1012 	 */
1013 	sodp->sod_uioa.uioa_state = UIOA_ALLOC;
1014 	so->so_direct = sodp;
1015 	if (stp != NULL)
1016 		stp->sd_sodirect = sodp;
1017 }
1018 
1019 /*
1020  * Init the sodirect kmem cache while sockfs is loading.
1021  */
1022 void
1023 sod_init()
1024 {
1025 	/* Allocate sodirect_t kmem_cache */
1026 	sock_sod_cache = kmem_cache_create("sock_sod_cache",
1027 	    sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1028 }
1029 
1030 ssize_t
1031 sod_uioa_mblk(struct sonode *so, mblk_t *mp)
1032 {
1033 	sodirect_t *sodp = so->so_direct;
1034 
1035 	ASSERT(sodp != NULL);
1036 	ASSERT(MUTEX_HELD(sodp->sod_lockp));
1037 
1038 	ASSERT(sodp->sod_state & SOD_ENABLED);
1039 	ASSERT(sodp->sod_uioa.uioa_state != (UIOA_ALLOC|UIOA_INIT));
1040 
1041 	ASSERT(sodp->sod_uioa.uioa_state & (UIOA_ENABLED|UIOA_FINI));
1042 
1043 	if (mp == NULL && so->so_rcv_q_head != NULL) {
1044 		mp = so->so_rcv_q_head;
1045 		ASSERT(mp->b_prev != NULL);
1046 		mp->b_prev = NULL;
1047 		so->so_rcv_q_head = mp->b_next;
1048 		if (so->so_rcv_q_head == NULL) {
1049 			so->so_rcv_q_last_head = NULL;
1050 		}
1051 		mp->b_next = NULL;
1052 	}
1053 
1054 	sod_uioa_mblk_done(sodp, mp);
1055 
1056 	if (so->so_rcv_q_head == NULL && so->so_rcv_head != NULL &&
1057 	    DB_TYPE(so->so_rcv_head) == M_DATA &&
1058 	    (DB_FLAGS(so->so_rcv_head) & DBLK_UIOA)) {
1059 		/* more arrived */
1060 		ASSERT(so->so_rcv_q_head == NULL);
1061 		mp = so->so_rcv_head;
1062 		so->so_rcv_head = mp->b_next;
1063 		if (so->so_rcv_head == NULL)
1064 			so->so_rcv_last_head = NULL;
1065 		mp->b_prev = mp->b_next = NULL;
1066 		sod_uioa_mblk_done(sodp, mp);
1067 	}
1068 
1069 #ifdef DEBUG
1070 	if (so->so_rcv_q_head != NULL) {
1071 		mblk_t *m = so->so_rcv_q_head;
1072 		while (m != NULL) {
1073 			if (DB_FLAGS(m) & DBLK_UIOA) {
1074 				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
1075 				    " in so_rcv_q_head.\n", (void *)m);
1076 			}
1077 			m = m->b_next;
1078 		}
1079 	}
1080 	if (so->so_rcv_head != NULL) {
1081 		mblk_t *m = so->so_rcv_head;
1082 		while (m != NULL) {
1083 			if (DB_FLAGS(m) & DBLK_UIOA) {
1084 				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
1085 				    " in so_rcv_head.\n", (void *)m);
1086 			}
1087 			m = m->b_next;
1088 		}
1089 	}
1090 #endif
1091 	return (sodp->sod_uioa.uioa_mbytes);
1092 }
1093