xref: /freebsd/sys/kern/uipc_syscalls.c (revision a98ff317388a00b992f1bf8404dee596f9383f5e)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * sendfile(2) and related extensions:
6  * Copyright (c) 1998, David Greenman. All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 4. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
33  */
34 
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 
38 #include "opt_capsicum.h"
39 #include "opt_inet.h"
40 #include "opt_inet6.h"
41 #include "opt_sctp.h"
42 #include "opt_compat.h"
43 #include "opt_ktrace.h"
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/capability.h>
48 #include <sys/kernel.h>
49 #include <sys/lock.h>
50 #include <sys/mutex.h>
51 #include <sys/sysproto.h>
52 #include <sys/malloc.h>
53 #include <sys/filedesc.h>
54 #include <sys/event.h>
55 #include <sys/proc.h>
56 #include <sys/fcntl.h>
57 #include <sys/file.h>
58 #include <sys/filio.h>
59 #include <sys/jail.h>
60 #include <sys/mount.h>
61 #include <sys/mbuf.h>
62 #include <sys/protosw.h>
63 #include <sys/rwlock.h>
64 #include <sys/sf_buf.h>
65 #include <sys/sysent.h>
66 #include <sys/socket.h>
67 #include <sys/socketvar.h>
68 #include <sys/signalvar.h>
69 #include <sys/syscallsubr.h>
70 #include <sys/sysctl.h>
71 #include <sys/uio.h>
72 #include <sys/vnode.h>
73 #ifdef KTRACE
74 #include <sys/ktrace.h>
75 #endif
76 #ifdef COMPAT_FREEBSD32
77 #include <compat/freebsd32/freebsd32_util.h>
78 #endif
79 
80 #include <net/vnet.h>
81 
82 #include <security/audit/audit.h>
83 #include <security/mac/mac_framework.h>
84 
85 #include <vm/vm.h>
86 #include <vm/vm_param.h>
87 #include <vm/vm_object.h>
88 #include <vm/vm_page.h>
89 #include <vm/vm_pageout.h>
90 #include <vm/vm_kern.h>
91 #include <vm/vm_extern.h>
92 
93 #if defined(INET) || defined(INET6)
94 #ifdef SCTP
95 #include <netinet/sctp.h>
96 #include <netinet/sctp_peeloff.h>
97 #endif /* SCTP */
98 #endif /* INET || INET6 */
99 
100 /*
101  * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC
102  * and SOCK_NONBLOCK.
103  */
104 #define	ACCEPT4_INHERIT	0x1
105 #define	ACCEPT4_COMPAT	0x2
106 
/*
 * Forward declarations of file-local (static) helpers, so that the system
 * call entry points can reference them before their definitions.
 */
static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);

static int accept1(struct thread *td, int s, struct sockaddr *uname,
		   socklen_t *anamelen, int flags);
static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat);
static int getsockname1(struct thread *td, struct getsockname_args *uap,
			int compat);
static int getpeername1(struct thread *td, struct getpeername_args *uap,
			int compat);
117 
/*
 * sendfile statistics: one counter_u64_t per uint64_t-sized slot of
 * struct sfstat.
 */
counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
/*
 * NSFBUFS-related variables and associated sysctls
 */
int nsfbufs;		/* kern.ipc.nsfbufs (read-only tunable) */
int nsfbufspeak;	/* kern.ipc.nsfbufspeak (read-only) */
int nsfbufsused;	/* kern.ipc.nsfbufsused (read-only) */

SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
    "Maximum number of sendfile(2) sf_bufs available");
SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
    "Number of sendfile(2) sf_bufs at peak usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
    "Number of sendfile(2) sf_bufs in use");
132 
133 static void
134 sfstat_init(const void *unused)
135 {
136 
137 	COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
138 	    M_WAITOK);
139 }
140 SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);
141 
142 static int
143 sfstat_sysctl(SYSCTL_HANDLER_ARGS)
144 {
145 	struct sfstat s;
146 
147 	COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
148 	if (req->newptr)
149 		COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
150 	return (SYSCTL_OUT(req, &s, sizeof(s)));
151 }
152 SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
153     NULL, 0, sfstat_sysctl, "I", "sendfile statistics");
154 /*
155  * Convert a user file descriptor to a kernel file entry and check if required
156  * capability rights are present.
157  * A reference on the file entry is held upon returning.
158  */
159 static int
160 getsock_cap(struct filedesc *fdp, int fd, cap_rights_t rights,
161     struct file **fpp, u_int *fflagp)
162 {
163 	struct file *fp;
164 	int error;
165 
166 	error = fget_unlocked(fdp, fd, rights, 0, &fp, NULL);
167 	if (error != 0)
168 		return (error);
169 	if (fp->f_type != DTYPE_SOCKET) {
170 		fdrop(fp, curthread);
171 		return (ENOTSOCK);
172 	}
173 	if (fflagp != NULL)
174 		*fflagp = fp->f_flag;
175 	*fpp = fp;
176 	return (0);
177 }
178 
179 /*
180  * System call interface to the socket abstraction.
181  */
182 #if defined(COMPAT_43)
183 #define COMPAT_OLDSOCK
184 #endif
185 
186 int
187 sys_socket(td, uap)
188 	struct thread *td;
189 	struct socket_args /* {
190 		int	domain;
191 		int	type;
192 		int	protocol;
193 	} */ *uap;
194 {
195 	struct socket *so;
196 	struct file *fp;
197 	int fd, error, type, oflag, fflag;
198 
199 	AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);
200 
201 	type = uap->type;
202 	oflag = 0;
203 	fflag = 0;
204 	if ((type & SOCK_CLOEXEC) != 0) {
205 		type &= ~SOCK_CLOEXEC;
206 		oflag |= O_CLOEXEC;
207 	}
208 	if ((type & SOCK_NONBLOCK) != 0) {
209 		type &= ~SOCK_NONBLOCK;
210 		fflag |= FNONBLOCK;
211 	}
212 
213 #ifdef MAC
214 	error = mac_socket_check_create(td->td_ucred, uap->domain, type,
215 	    uap->protocol);
216 	if (error)
217 		return (error);
218 #endif
219 	error = falloc(td, &fp, &fd, oflag);
220 	if (error)
221 		return (error);
222 	/* An extra reference on `fp' has been held for us by falloc(). */
223 	error = socreate(uap->domain, &so, type, uap->protocol,
224 	    td->td_ucred, td);
225 	if (error) {
226 		fdclose(td->td_proc->p_fd, fp, fd, td);
227 	} else {
228 		finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
229 		if ((fflag & FNONBLOCK) != 0)
230 			(void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
231 		td->td_retval[0] = fd;
232 	}
233 	fdrop(fp, td);
234 	return (error);
235 }
236 
237 /* ARGSUSED */
238 int
239 sys_bind(td, uap)
240 	struct thread *td;
241 	struct bind_args /* {
242 		int	s;
243 		caddr_t	name;
244 		int	namelen;
245 	} */ *uap;
246 {
247 	struct sockaddr *sa;
248 	int error;
249 
250 	error = getsockaddr(&sa, uap->name, uap->namelen);
251 	if (error == 0) {
252 		error = kern_bind(td, uap->s, sa);
253 		free(sa, M_SONAME);
254 	}
255 	return (error);
256 }
257 
258 static int
259 kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
260 {
261 	struct socket *so;
262 	struct file *fp;
263 	int error;
264 
265 	AUDIT_ARG_FD(fd);
266 	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
267 	error = getsock_cap(td->td_proc->p_fd, fd, CAP_BIND, &fp, NULL);
268 	if (error)
269 		return (error);
270 	so = fp->f_data;
271 #ifdef KTRACE
272 	if (KTRPOINT(td, KTR_STRUCT))
273 		ktrsockaddr(sa);
274 #endif
275 #ifdef MAC
276 	error = mac_socket_check_bind(td->td_ucred, so, sa);
277 	if (error == 0) {
278 #endif
279 		if (dirfd == AT_FDCWD)
280 			error = sobind(so, sa, td);
281 		else
282 			error = sobindat(dirfd, so, sa, td);
283 #ifdef MAC
284 	}
285 #endif
286 	fdrop(fp, td);
287 	return (error);
288 }
289 
290 int
291 kern_bind(struct thread *td, int fd, struct sockaddr *sa)
292 {
293 
294 	return (kern_bindat(td, AT_FDCWD, fd, sa));
295 }
296 
297 /* ARGSUSED */
298 int
299 sys_bindat(td, uap)
300 	struct thread *td;
301 	struct bindat_args /* {
302 		int	fd;
303 		int	s;
304 		caddr_t	name;
305 		int	namelen;
306 	} */ *uap;
307 {
308 	struct sockaddr *sa;
309 	int error;
310 
311 	error = getsockaddr(&sa, uap->name, uap->namelen);
312 	if (error == 0) {
313 		error = kern_bindat(td, uap->fd, uap->s, sa);
314 		free(sa, M_SONAME);
315 	}
316 	return (error);
317 }
318 
319 /* ARGSUSED */
320 int
321 sys_listen(td, uap)
322 	struct thread *td;
323 	struct listen_args /* {
324 		int	s;
325 		int	backlog;
326 	} */ *uap;
327 {
328 	struct socket *so;
329 	struct file *fp;
330 	int error;
331 
332 	AUDIT_ARG_FD(uap->s);
333 	error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_LISTEN, &fp, NULL);
334 	if (error == 0) {
335 		so = fp->f_data;
336 #ifdef MAC
337 		error = mac_socket_check_listen(td->td_ucred, so);
338 		if (error == 0)
339 #endif
340 			error = solisten(so, uap->backlog, td);
341 		fdrop(fp, td);
342 	}
343 	return(error);
344 }
345 
346 /*
347  * accept1()
348  */
349 static int
350 accept1(td, s, uname, anamelen, flags)
351 	struct thread *td;
352 	int s;
353 	struct sockaddr *uname;
354 	socklen_t *anamelen;
355 	int flags;
356 {
357 	struct sockaddr *name;
358 	socklen_t namelen;
359 	struct file *fp;
360 	int error;
361 
362 	if (uname == NULL)
363 		return (kern_accept4(td, s, NULL, NULL, flags, NULL));
364 
365 	error = copyin(anamelen, &namelen, sizeof (namelen));
366 	if (error)
367 		return (error);
368 
369 	error = kern_accept4(td, s, &name, &namelen, flags, &fp);
370 
371 	/*
372 	 * return a namelen of zero for older code which might
373 	 * ignore the return value from accept.
374 	 */
375 	if (error) {
376 		(void) copyout(&namelen, anamelen, sizeof(*anamelen));
377 		return (error);
378 	}
379 
380 	if (error == 0 && uname != NULL) {
381 #ifdef COMPAT_OLDSOCK
382 		if (flags & ACCEPT4_COMPAT)
383 			((struct osockaddr *)name)->sa_family =
384 			    name->sa_family;
385 #endif
386 		error = copyout(name, uname, namelen);
387 	}
388 	if (error == 0)
389 		error = copyout(&namelen, anamelen,
390 		    sizeof(namelen));
391 	if (error)
392 		fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
393 	fdrop(fp, td);
394 	free(name, M_SONAME);
395 	return (error);
396 }
397 
398 int
399 kern_accept(struct thread *td, int s, struct sockaddr **name,
400     socklen_t *namelen, struct file **fp)
401 {
402 	return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
403 }
404 
/*
 * Accept one completed connection from listening socket descriptor `s'.
 *
 * td       calling thread; on success td->td_retval[0] is the new fd.
 * name     if not NULL, *name receives the peer sockaddr (or NULL when
 *          soaccept() supplied none).  accept1() releases it with
 *          free(..., M_SONAME).
 * namelen  in/out length, only touched when name is not NULL: clamped to
 *          the address's sa_len, or set to 0 if there is no address or
 *          soaccept() failed.
 * flags    ACCEPT4_INHERIT copies SS_NBIO and the SIGIO owner from the
 *          listening socket; otherwise SOCK_NONBLOCK selects non-blocking
 *          mode.  SOCK_CLOEXEC makes the new descriptor close-on-exec.
 * fp       if not NULL, receives a referenced file for the new socket on
 *          success (NULL on failure); the caller must fdrop() it.
 *
 * Returns 0 or an errno value: EINVAL if the socket is not listening,
 * EWOULDBLOCK if it is non-blocking with nothing queued, the msleep()
 * error if the wait is interrupted, or a pending so_error.
 */
int
kern_accept4(struct thread *td, int s, struct sockaddr **name,
    socklen_t *namelen, int flags, struct file **fp)
{
	struct filedesc *fdp;
	struct file *headfp, *nfp = NULL;
	struct sockaddr *sa = NULL;
	int error;
	struct socket *head, *so;
	int fd;
	u_int fflag;
	pid_t pgid;
	int tmp;

	if (name)
		*name = NULL;

	AUDIT_ARG_FD(s);
	fdp = td->td_proc->p_fd;
	error = getsock_cap(fdp, s, CAP_ACCEPT, &headfp, &fflag);
	if (error)
		return (error);
	head = headfp->f_data;
	if ((head->so_options & SO_ACCEPTCONN) == 0) {
		error = EINVAL;
		goto done;
	}
#ifdef MAC
	error = mac_socket_check_accept(td->td_ucred, head);
	if (error != 0)
		goto done;
#endif
	/* Reserve the descriptor before dequeueing a connection. */
	error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0);
	if (error)
		goto done;
	ACCEPT_LOCK();
	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
		ACCEPT_UNLOCK();
		error = EWOULDBLOCK;
		goto noconnection;
	}
	/* Sleep until a connection completes or an error is posted. */
	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
			head->so_error = ECONNABORTED;
			break;
		}
		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
		    "accept", 0);
		if (error) {
			ACCEPT_UNLOCK();
			goto noconnection;
		}
	}
	if (head->so_error) {
		/* Report the pending error once and clear it. */
		error = head->so_error;
		head->so_error = 0;
		ACCEPT_UNLOCK();
		goto noconnection;
	}
	so = TAILQ_FIRST(&head->so_comp);
	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));

	/*
	 * Before changing the flags on the socket, we have to bump the
	 * reference count.  Otherwise, if the protocol calls sofree(),
	 * the socket will be released due to a zero refcount.
	 */
	SOCK_LOCK(so);			/* soref() and so_state update */
	soref(so);			/* file descriptor reference */

	/* Detach the connection from the listen queue. */
	TAILQ_REMOVE(&head->so_comp, so, so_list);
	head->so_qlen--;
	if (flags & ACCEPT4_INHERIT)
		so->so_state |= (head->so_state & SS_NBIO);
	else
		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
	so->so_qstate &= ~SQ_COMP;
	so->so_head = NULL;

	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();

	/* An extra reference on `nfp' has been held for us by falloc(). */
	td->td_retval[0] = fd;

	/* connection has been removed from the listen queue */
	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);

	if (flags & ACCEPT4_INHERIT) {
		/* Propagate the listening socket's SIGIO owner, if any. */
		pgid = fgetown(&head->so_sigio);
		if (pgid != 0)
			fsetown(pgid, &so->so_sigio);
	} else {
		/* Start from the listener's flags minus NONBLOCK/ASYNC. */
		fflag &= ~(FNONBLOCK | FASYNC);
		if (flags & SOCK_NONBLOCK)
			fflag |= FNONBLOCK;
	}

	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
	/* Sync socket nonblocking/async state with file flags */
	tmp = fflag & FNONBLOCK;
	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
	tmp = fflag & FASYNC;
	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
	sa = 0;
	error = soaccept(so, &sa);
	if (error) {
		/*
		 * return a namelen of zero for older code which might
		 * ignore the return value from accept.
		 */
		if (name)
			*namelen = 0;
		goto noconnection;
	}
	if (sa == NULL) {
		/* Success without an address: *name stays NULL. */
		if (name)
			*namelen = 0;
		goto done;
	}
	AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
	if (name) {
		/* check sa_len before it is destroyed */
		if (*namelen > sa->sa_len)
			*namelen = sa->sa_len;
#ifdef KTRACE
		if (KTRPOINT(td, KTR_STRUCT))
			ktrsockaddr(sa);
#endif
		/* Hand the address to the caller; do not free it below. */
		*name = sa;
		sa = NULL;
	}
noconnection:
	if (sa)
		free(sa, M_SONAME);

	/*
	 * close the new descriptor, assuming someone hasn't ripped it
	 * out from under us.
	 */
	if (error)
		fdclose(fdp, nfp, fd, td);

	/*
	 * Release explicitly held references before returning.  We return
	 * a reference on nfp to the caller on success if they request it.
	 */
done:
	if (fp != NULL) {
		if (error == 0) {
			*fp = nfp;
			nfp = NULL;
		} else
			*fp = NULL;
	}
	if (nfp != NULL)
		fdrop(nfp, td);
	fdrop(headfp, td);
	return (error);
}
566 
567 int
568 sys_accept(td, uap)
569 	struct thread *td;
570 	struct accept_args *uap;
571 {
572 
573 	return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
574 }
575 
576 int
577 sys_accept4(td, uap)
578 	struct thread *td;
579 	struct accept4_args *uap;
580 {
581 	if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
582 		return (EINVAL);
583 
584 	return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
585 }
586 
#ifdef COMPAT_OLDSOCK
/*
 * Old-socket-ABI accept: same as accept(2), but ACCEPT4_COMPAT makes
 * accept1() rewrite the returned address in struct osockaddr layout.
 */
int
oaccept(struct thread *td, struct accept_args *uap)
{
	const int flags = ACCEPT4_INHERIT | ACCEPT4_COMPAT;

	return (accept1(td, uap->s, uap->name, uap->anamelen, flags));
}
#endif /* COMPAT_OLDSOCK */
598 
599 /* ARGSUSED */
600 int
601 sys_connect(td, uap)
602 	struct thread *td;
603 	struct connect_args /* {
604 		int	s;
605 		caddr_t	name;
606 		int	namelen;
607 	} */ *uap;
608 {
609 	struct sockaddr *sa;
610 	int error;
611 
612 	error = getsockaddr(&sa, uap->name, uap->namelen);
613 	if (error == 0) {
614 		error = kern_connect(td, uap->s, sa);
615 		free(sa, M_SONAME);
616 	}
617 	return (error);
618 }
619 
/*
 * Connect socket descriptor `fd' to the kernel-space address `sa'.  With
 * dirfd equal to AT_FDCWD this goes through soconnect(); any other dirfd
 * is passed to soconnectat().  Requires CAP_CONNECT on the descriptor.
 *
 * Returns 0 or an errno value: EALREADY if a connect is already in
 * progress, EINPROGRESS if the socket is non-blocking and the connection
 * has not completed yet, EINTR if the wait was interrupted (ERESTART is
 * mapped to EINTR), or the socket's pending so_error.
 */
static int
kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
{
	struct socket *so;
	struct file *fp;
	int error;
	int interrupted = 0;	/* msleep() returned EINTR or ERESTART */

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
	error = getsock_cap(td->td_proc->p_fd, fd, CAP_CONNECT, &fp, NULL);
	if (error)
		return (error);
	so = fp->f_data;
	if (so->so_state & SS_ISCONNECTING) {
		error = EALREADY;
		goto done1;
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(sa);
#endif
#ifdef MAC
	error = mac_socket_check_connect(td->td_ucred, so, sa);
	if (error)
		goto bad;
#endif
	if (dirfd == AT_FDCWD)
		error = soconnect(so, sa, td);
	else
		error = soconnectat(dirfd, so, sa, td);
	if (error)
		goto bad;
	/* Non-blocking socket: report progress and leave state untouched. */
	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
		error = EINPROGRESS;
		goto done1;
	}
	/* Blocking socket: wait for completion, an error, or a signal. */
	SOCK_LOCK(so);
	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
		    "connec", 0);
		if (error) {
			if (error == EINTR || error == ERESTART)
				interrupted = 1;
			break;
		}
	}
	if (error == 0) {
		/* Collect and clear the socket's pending error. */
		error = so->so_error;
		so->so_error = 0;
	}
	SOCK_UNLOCK(so);
bad:
	/* An interrupted connect keeps SS_ISCONNECTING set. */
	if (!interrupted)
		so->so_state &= ~SS_ISCONNECTING;
	if (error == ERESTART)
		error = EINTR;
done1:
	fdrop(fp, td);
	return (error);
}
681 
682 int
683 kern_connect(struct thread *td, int fd, struct sockaddr *sa)
684 {
685 
686 	return (kern_connectat(td, AT_FDCWD, fd, sa));
687 }
688 
689 /* ARGSUSED */
690 int
691 sys_connectat(td, uap)
692 	struct thread *td;
693 	struct connectat_args /* {
694 		int	fd;
695 		int	s;
696 		caddr_t	name;
697 		int	namelen;
698 	} */ *uap;
699 {
700 	struct sockaddr *sa;
701 	int error;
702 
703 	error = getsockaddr(&sa, uap->name, uap->namelen);
704 	if (error == 0) {
705 		error = kern_connectat(td, uap->fd, uap->s, sa);
706 		free(sa, M_SONAME);
707 	}
708 	return (error);
709 }
710 
/*
 * Create a pair of connected sockets and install them in two new file
 * descriptors.
 *
 * td        calling thread.
 * domain, type, protocol
 *           as for socket(2); SOCK_CLOEXEC and SOCK_NONBLOCK may be or'ed
 *           into `type' and are stripped before the sockets are created.
 * rsv       kernel array of two ints receiving the new descriptors.
 *
 * Returns 0 or an errno value.  On failure everything created so far is
 * torn down by the free4..free1 ladder, in reverse order of creation.
 */
int
kern_socketpair(struct thread *td, int domain, int type, int protocol,
    int *rsv)
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *fp1, *fp2;
	struct socket *so1, *so2;
	int fd, error, oflag, fflag;

	AUDIT_ARG_SOCKET(domain, type, protocol);

	/* Split the SOCK_* modifier bits off the socket type. */
	oflag = 0;
	fflag = 0;
	if ((type & SOCK_CLOEXEC) != 0) {
		type &= ~SOCK_CLOEXEC;
		oflag |= O_CLOEXEC;
	}
	if ((type & SOCK_NONBLOCK) != 0) {
		type &= ~SOCK_NONBLOCK;
		fflag |= FNONBLOCK;
	}
#ifdef MAC
	/* We might want to have a separate check for socket pairs. */
	error = mac_socket_check_create(td->td_ucred, domain, type,
	    protocol);
	if (error)
		return (error);
#endif
	error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
	if (error)
		return (error);
	error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
	if (error)
		goto free1;
	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
	error = falloc(td, &fp1, &fd, oflag);
	if (error)
		goto free2;
	rsv[0] = fd;
	fp1->f_data = so1;	/* so1 already has ref count */
	error = falloc(td, &fp2, &fd, oflag);
	if (error)
		goto free3;
	fp2->f_data = so2;	/* so2 already has ref count */
	rsv[1] = fd;
	error = soconnect2(so1, so2);
	if (error)
		goto free4;
	if (type == SOCK_DGRAM) {
		/*
		 * Datagram socket connection is asymmetric.
		 */
		 error = soconnect2(so2, so1);
		 if (error)
			goto free4;
	}
	/* Both ends are connected: turn the files into socket files. */
	finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
	    &socketops);
	finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
	    &socketops);
	if ((fflag & FNONBLOCK) != 0) {
		(void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
		(void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
	}
	fdrop(fp1, td);
	fdrop(fp2, td);
	return (0);
free4:
	/* Release the second descriptor and our reference on its file. */
	fdclose(fdp, fp2, rsv[1], td);
	fdrop(fp2, td);
free3:
	/* Release the first descriptor and our reference on its file. */
	fdclose(fdp, fp1, rsv[0], td);
	fdrop(fp1, td);
free2:
	if (so2 != NULL)
		(void)soclose(so2);
free1:
	if (so1 != NULL)
		(void)soclose(so1);
	return (error);
}
792 
793 int
794 sys_socketpair(struct thread *td, struct socketpair_args *uap)
795 {
796 	int error, sv[2];
797 
798 	error = kern_socketpair(td, uap->domain, uap->type,
799 	    uap->protocol, sv);
800 	if (error)
801 		return (error);
802 	error = copyout(sv, uap->rsv, 2 * sizeof(int));
803 	if (error) {
804 		(void)kern_close(td, sv[0]);
805 		(void)kern_close(td, sv[1]);
806 	}
807 	return (error);
808 }
809 
810 static int
811 sendit(td, s, mp, flags)
812 	struct thread *td;
813 	int s;
814 	struct msghdr *mp;
815 	int flags;
816 {
817 	struct mbuf *control;
818 	struct sockaddr *to;
819 	int error;
820 
821 #ifdef CAPABILITY_MODE
822 	if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
823 		return (ECAPMODE);
824 #endif
825 
826 	if (mp->msg_name != NULL) {
827 		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
828 		if (error) {
829 			to = NULL;
830 			goto bad;
831 		}
832 		mp->msg_name = to;
833 	} else {
834 		to = NULL;
835 	}
836 
837 	if (mp->msg_control) {
838 		if (mp->msg_controllen < sizeof(struct cmsghdr)
839 #ifdef COMPAT_OLDSOCK
840 		    && mp->msg_flags != MSG_COMPAT
841 #endif
842 		) {
843 			error = EINVAL;
844 			goto bad;
845 		}
846 		error = sockargs(&control, mp->msg_control,
847 		    mp->msg_controllen, MT_CONTROL);
848 		if (error)
849 			goto bad;
850 #ifdef COMPAT_OLDSOCK
851 		if (mp->msg_flags == MSG_COMPAT) {
852 			struct cmsghdr *cm;
853 
854 			M_PREPEND(control, sizeof(*cm), M_WAITOK);
855 			cm = mtod(control, struct cmsghdr *);
856 			cm->cmsg_len = control->m_len;
857 			cm->cmsg_level = SOL_SOCKET;
858 			cm->cmsg_type = SCM_RIGHTS;
859 		}
860 #endif
861 	} else {
862 		control = NULL;
863 	}
864 
865 	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
866 
867 bad:
868 	if (to)
869 		free(to, M_SONAME);
870 	return (error);
871 }
872 
873 int
874 kern_sendit(td, s, mp, flags, control, segflg)
875 	struct thread *td;
876 	int s;
877 	struct msghdr *mp;
878 	int flags;
879 	struct mbuf *control;
880 	enum uio_seg segflg;
881 {
882 	struct file *fp;
883 	struct uio auio;
884 	struct iovec *iov;
885 	struct socket *so;
886 	int i, error;
887 	ssize_t len;
888 	cap_rights_t rights;
889 #ifdef KTRACE
890 	struct uio *ktruio = NULL;
891 #endif
892 
893 	AUDIT_ARG_FD(s);
894 	rights = CAP_SEND;
895 	if (mp->msg_name != NULL) {
896 		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
897 		rights |= CAP_CONNECT;
898 	}
899 	error = getsock_cap(td->td_proc->p_fd, s, rights, &fp, NULL);
900 	if (error)
901 		return (error);
902 	so = (struct socket *)fp->f_data;
903 
904 #ifdef KTRACE
905 	if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
906 		ktrsockaddr(mp->msg_name);
907 #endif
908 #ifdef MAC
909 	if (mp->msg_name != NULL) {
910 		error = mac_socket_check_connect(td->td_ucred, so,
911 		    mp->msg_name);
912 		if (error)
913 			goto bad;
914 	}
915 	error = mac_socket_check_send(td->td_ucred, so);
916 	if (error)
917 		goto bad;
918 #endif
919 
920 	auio.uio_iov = mp->msg_iov;
921 	auio.uio_iovcnt = mp->msg_iovlen;
922 	auio.uio_segflg = segflg;
923 	auio.uio_rw = UIO_WRITE;
924 	auio.uio_td = td;
925 	auio.uio_offset = 0;			/* XXX */
926 	auio.uio_resid = 0;
927 	iov = mp->msg_iov;
928 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
929 		if ((auio.uio_resid += iov->iov_len) < 0) {
930 			error = EINVAL;
931 			goto bad;
932 		}
933 	}
934 #ifdef KTRACE
935 	if (KTRPOINT(td, KTR_GENIO))
936 		ktruio = cloneuio(&auio);
937 #endif
938 	len = auio.uio_resid;
939 	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
940 	if (error) {
941 		if (auio.uio_resid != len && (error == ERESTART ||
942 		    error == EINTR || error == EWOULDBLOCK))
943 			error = 0;
944 		/* Generation of SIGPIPE can be controlled per socket */
945 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
946 		    !(flags & MSG_NOSIGNAL)) {
947 			PROC_LOCK(td->td_proc);
948 			tdsignal(td, SIGPIPE);
949 			PROC_UNLOCK(td->td_proc);
950 		}
951 	}
952 	if (error == 0)
953 		td->td_retval[0] = len - auio.uio_resid;
954 #ifdef KTRACE
955 	if (ktruio != NULL) {
956 		ktruio->uio_resid = td->td_retval[0];
957 		ktrgenio(s, UIO_WRITE, ktruio, error);
958 	}
959 #endif
960 bad:
961 	fdrop(fp, td);
962 	return (error);
963 }
964 
965 int
966 sys_sendto(td, uap)
967 	struct thread *td;
968 	struct sendto_args /* {
969 		int	s;
970 		caddr_t	buf;
971 		size_t	len;
972 		int	flags;
973 		caddr_t	to;
974 		int	tolen;
975 	} */ *uap;
976 {
977 	struct msghdr msg;
978 	struct iovec aiov;
979 	int error;
980 
981 	msg.msg_name = uap->to;
982 	msg.msg_namelen = uap->tolen;
983 	msg.msg_iov = &aiov;
984 	msg.msg_iovlen = 1;
985 	msg.msg_control = 0;
986 #ifdef COMPAT_OLDSOCK
987 	msg.msg_flags = 0;
988 #endif
989 	aiov.iov_base = uap->buf;
990 	aiov.iov_len = uap->len;
991 	error = sendit(td, uap->s, &msg, uap->flags);
992 	return (error);
993 }
994 
995 #ifdef COMPAT_OLDSOCK
996 int
997 osend(td, uap)
998 	struct thread *td;
999 	struct osend_args /* {
1000 		int	s;
1001 		caddr_t	buf;
1002 		int	len;
1003 		int	flags;
1004 	} */ *uap;
1005 {
1006 	struct msghdr msg;
1007 	struct iovec aiov;
1008 	int error;
1009 
1010 	msg.msg_name = 0;
1011 	msg.msg_namelen = 0;
1012 	msg.msg_iov = &aiov;
1013 	msg.msg_iovlen = 1;
1014 	aiov.iov_base = uap->buf;
1015 	aiov.iov_len = uap->len;
1016 	msg.msg_control = 0;
1017 	msg.msg_flags = 0;
1018 	error = sendit(td, uap->s, &msg, uap->flags);
1019 	return (error);
1020 }
1021 
1022 int
1023 osendmsg(td, uap)
1024 	struct thread *td;
1025 	struct osendmsg_args /* {
1026 		int	s;
1027 		caddr_t	msg;
1028 		int	flags;
1029 	} */ *uap;
1030 {
1031 	struct msghdr msg;
1032 	struct iovec *iov;
1033 	int error;
1034 
1035 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1036 	if (error)
1037 		return (error);
1038 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1039 	if (error)
1040 		return (error);
1041 	msg.msg_iov = iov;
1042 	msg.msg_flags = MSG_COMPAT;
1043 	error = sendit(td, uap->s, &msg, uap->flags);
1044 	free(iov, M_IOV);
1045 	return (error);
1046 }
1047 #endif
1048 
1049 int
1050 sys_sendmsg(td, uap)
1051 	struct thread *td;
1052 	struct sendmsg_args /* {
1053 		int	s;
1054 		caddr_t	msg;
1055 		int	flags;
1056 	} */ *uap;
1057 {
1058 	struct msghdr msg;
1059 	struct iovec *iov;
1060 	int error;
1061 
1062 	error = copyin(uap->msg, &msg, sizeof (msg));
1063 	if (error)
1064 		return (error);
1065 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1066 	if (error)
1067 		return (error);
1068 	msg.msg_iov = iov;
1069 #ifdef COMPAT_OLDSOCK
1070 	msg.msg_flags = 0;
1071 #endif
1072 	error = sendit(td, uap->s, &msg, uap->flags);
1073 	free(iov, M_IOV);
1074 	return (error);
1075 }
1076 
/*
 * Receive a message on socket descriptor `s' into the buffers described
 * by `mp'.
 *
 * td        calling thread; on success td->td_retval[0] is the number of
 *           data bytes received.
 * mp        message header.  msg_iov/msg_iovlen describe the data buffers
 *           (always treated as user-space addresses here).  If msg_name
 *           is set, the source address is stored there and msg_namelen
 *           is updated.  msg_flags is passed to soreceive() by reference.
 * fromseg   address space of mp->msg_name: UIO_USERSPACE uses copyout(),
 *           anything else uses bcopy().
 * controlp  if not NULL, receives the control mbuf chain on success and
 *           nothing is copied to mp->msg_control.  If NULL and
 *           mp->msg_control is set, control data is copied out there,
 *           with MSG_CTRUNC set when it does not fit.
 *
 * Requires CAP_RECV on the descriptor.  Returns 0 or an errno value;
 * EINVAL if the summed iovec lengths overflow.  A partial transfer
 * interrupted by ERESTART, EINTR or EWOULDBLOCK is reported as success.
 */
int
kern_recvit(td, s, mp, fromseg, controlp)
	struct thread *td;
	int s;
	struct msghdr *mp;
	enum uio_seg fromseg;
	struct mbuf **controlp;
{
	struct uio auio;
	struct iovec *iov;
	int i;
	ssize_t len;
	int error;
	struct mbuf *m, *control = NULL;
	caddr_t ctlbuf;
	struct file *fp;
	struct socket *so;
	struct sockaddr *fromsa = NULL;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	if (controlp != NULL)
		*controlp = NULL;

	AUDIT_ARG_FD(s);
	error = getsock_cap(td->td_proc->p_fd, s, CAP_RECV, &fp, NULL);
	if (error)
		return (error);
	so = fp->f_data;

#ifdef MAC
	error = mac_socket_check_receive(td->td_ucred, so);
	if (error) {
		fdrop(fp, td);
		return (error);
	}
#endif

	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	/* Total up the buffer space, rejecting totals that go negative. */
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			fdrop(fp, td);
			return (EINVAL);
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	len = auio.uio_resid;
	/* Only ask for control data if somebody wants it. */
	error = soreceive(so, &fromsa, &auio, NULL,
	    (mp->msg_control || controlp) ? &control : NULL,
	    &mp->msg_flags);
	if (error) {
		/* A partial transfer that was interrupted counts as success. */
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	if (fromsa != NULL)
		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = len - auio.uio_resid;
		ktrgenio(s, UIO_READ, ktruio, error);
	}
#endif
	if (error)
		goto out;
	td->td_retval[0] = len - auio.uio_resid;
	if (mp->msg_name) {
		/* From here `len' is reused for the address length. */
		len = mp->msg_namelen;
		if (len <= 0 || fromsa == NULL)
			len = 0;
		else {
			/* save sa_len before it is destroyed by MSG_COMPAT */
			len = MIN(len, fromsa->sa_len);
#ifdef COMPAT_OLDSOCK
			if (mp->msg_flags & MSG_COMPAT)
				((struct osockaddr *)fromsa)->sa_family =
				    fromsa->sa_family;
#endif
			if (fromseg == UIO_USERSPACE) {
				error = copyout(fromsa, mp->msg_name,
				    (unsigned)len);
				if (error)
					goto out;
			} else
				bcopy(fromsa, mp->msg_name, len);
		}
		mp->msg_namelen = len;
	}
	if (mp->msg_control && controlp == NULL) {
#ifdef COMPAT_OLDSOCK
		/*
		 * We assume that old recvmsg calls won't receive access
		 * rights and other control info, esp. as control info
		 * is always optional and those options didn't exist in 4.3.
		 * If we receive rights, trim the cmsghdr; anything else
		 * is tossed.
		 */
		if (control && mp->msg_flags & MSG_COMPAT) {
			if (mtod(control, struct cmsghdr *)->cmsg_level !=
			    SOL_SOCKET ||
			    mtod(control, struct cmsghdr *)->cmsg_type !=
			    SCM_RIGHTS) {
				mp->msg_controllen = 0;
				goto out;
			}
			control->m_len -= sizeof (struct cmsghdr);
			control->m_data += sizeof (struct cmsghdr);
		}
#endif
		/* From here `len' is the space left in the control buffer. */
		len = mp->msg_controllen;
		m = control;
		mp->msg_controllen = 0;
		ctlbuf = mp->msg_control;

		/* Copy out mbuf by mbuf until data or space runs out. */
		while (m && len > 0) {
			unsigned int tocopy;

			if (len >= m->m_len)
				tocopy = m->m_len;
			else {
				mp->msg_flags |= MSG_CTRUNC;
				tocopy = len;
			}

			if ((error = copyout(mtod(m, caddr_t),
					ctlbuf, tocopy)) != 0)
				goto out;

			ctlbuf += tocopy;
			len -= tocopy;
			m = m->m_next;
		}
		/* Report how many control bytes were actually copied. */
		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
	}
out:
	fdrop(fp, td);
#ifdef KTRACE
	if (fromsa && KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(fromsa);
#endif
	if (fromsa)
		free(fromsa, M_SONAME);

	/* Hand the control chain to the caller, or free it. */
	if (error == 0 && controlp != NULL)
		*controlp = control;
	else  if (control)
		m_freem(control);

	return (error);
}
1238 
1239 static int
1240 recvit(td, s, mp, namelenp)
1241 	struct thread *td;
1242 	int s;
1243 	struct msghdr *mp;
1244 	void *namelenp;
1245 {
1246 	int error;
1247 
1248 	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
1249 	if (error)
1250 		return (error);
1251 	if (namelenp) {
1252 		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
1253 #ifdef COMPAT_OLDSOCK
1254 		if (mp->msg_flags & MSG_COMPAT)
1255 			error = 0;	/* old recvfrom didn't check */
1256 #endif
1257 	}
1258 	return (error);
1259 }
1260 
1261 int
1262 sys_recvfrom(td, uap)
1263 	struct thread *td;
1264 	struct recvfrom_args /* {
1265 		int	s;
1266 		caddr_t	buf;
1267 		size_t	len;
1268 		int	flags;
1269 		struct sockaddr * __restrict	from;
1270 		socklen_t * __restrict fromlenaddr;
1271 	} */ *uap;
1272 {
1273 	struct msghdr msg;
1274 	struct iovec aiov;
1275 	int error;
1276 
1277 	if (uap->fromlenaddr) {
1278 		error = copyin(uap->fromlenaddr,
1279 		    &msg.msg_namelen, sizeof (msg.msg_namelen));
1280 		if (error)
1281 			goto done2;
1282 	} else {
1283 		msg.msg_namelen = 0;
1284 	}
1285 	msg.msg_name = uap->from;
1286 	msg.msg_iov = &aiov;
1287 	msg.msg_iovlen = 1;
1288 	aiov.iov_base = uap->buf;
1289 	aiov.iov_len = uap->len;
1290 	msg.msg_control = 0;
1291 	msg.msg_flags = uap->flags;
1292 	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
1293 done2:
1294 	return(error);
1295 }
1296 
1297 #ifdef COMPAT_OLDSOCK
1298 int
1299 orecvfrom(td, uap)
1300 	struct thread *td;
1301 	struct recvfrom_args *uap;
1302 {
1303 
1304 	uap->flags |= MSG_COMPAT;
1305 	return (sys_recvfrom(td, uap));
1306 }
1307 #endif
1308 
1309 #ifdef COMPAT_OLDSOCK
1310 int
1311 orecv(td, uap)
1312 	struct thread *td;
1313 	struct orecv_args /* {
1314 		int	s;
1315 		caddr_t	buf;
1316 		int	len;
1317 		int	flags;
1318 	} */ *uap;
1319 {
1320 	struct msghdr msg;
1321 	struct iovec aiov;
1322 	int error;
1323 
1324 	msg.msg_name = 0;
1325 	msg.msg_namelen = 0;
1326 	msg.msg_iov = &aiov;
1327 	msg.msg_iovlen = 1;
1328 	aiov.iov_base = uap->buf;
1329 	aiov.iov_len = uap->len;
1330 	msg.msg_control = 0;
1331 	msg.msg_flags = uap->flags;
1332 	error = recvit(td, uap->s, &msg, NULL);
1333 	return (error);
1334 }
1335 
1336 /*
1337  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
1338  * overlays the new one, missing only the flags, and with the (old) access
1339  * rights where the control fields are now.
1340  */
1341 int
1342 orecvmsg(td, uap)
1343 	struct thread *td;
1344 	struct orecvmsg_args /* {
1345 		int	s;
1346 		struct	omsghdr *msg;
1347 		int	flags;
1348 	} */ *uap;
1349 {
1350 	struct msghdr msg;
1351 	struct iovec *iov;
1352 	int error;
1353 
1354 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1355 	if (error)
1356 		return (error);
1357 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1358 	if (error)
1359 		return (error);
1360 	msg.msg_flags = uap->flags | MSG_COMPAT;
1361 	msg.msg_iov = iov;
1362 	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
1363 	if (msg.msg_controllen && error == 0)
1364 		error = copyout(&msg.msg_controllen,
1365 		    &uap->msg->msg_accrightslen, sizeof (int));
1366 	free(iov, M_IOV);
1367 	return (error);
1368 }
1369 #endif
1370 
1371 int
1372 sys_recvmsg(td, uap)
1373 	struct thread *td;
1374 	struct recvmsg_args /* {
1375 		int	s;
1376 		struct	msghdr *msg;
1377 		int	flags;
1378 	} */ *uap;
1379 {
1380 	struct msghdr msg;
1381 	struct iovec *uiov, *iov;
1382 	int error;
1383 
1384 	error = copyin(uap->msg, &msg, sizeof (msg));
1385 	if (error)
1386 		return (error);
1387 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1388 	if (error)
1389 		return (error);
1390 	msg.msg_flags = uap->flags;
1391 #ifdef COMPAT_OLDSOCK
1392 	msg.msg_flags &= ~MSG_COMPAT;
1393 #endif
1394 	uiov = msg.msg_iov;
1395 	msg.msg_iov = iov;
1396 	error = recvit(td, uap->s, &msg, NULL);
1397 	if (error == 0) {
1398 		msg.msg_iov = uiov;
1399 		error = copyout(&msg, uap->msg, sizeof(msg));
1400 	}
1401 	free(iov, M_IOV);
1402 	return (error);
1403 }
1404 
1405 /* ARGSUSED */
1406 int
1407 sys_shutdown(td, uap)
1408 	struct thread *td;
1409 	struct shutdown_args /* {
1410 		int	s;
1411 		int	how;
1412 	} */ *uap;
1413 {
1414 	struct socket *so;
1415 	struct file *fp;
1416 	int error;
1417 
1418 	AUDIT_ARG_FD(uap->s);
1419 	error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_SHUTDOWN, &fp,
1420 	    NULL);
1421 	if (error == 0) {
1422 		so = fp->f_data;
1423 		error = soshutdown(so, uap->how);
1424 		fdrop(fp, td);
1425 	}
1426 	return (error);
1427 }
1428 
1429 /* ARGSUSED */
1430 int
1431 sys_setsockopt(td, uap)
1432 	struct thread *td;
1433 	struct setsockopt_args /* {
1434 		int	s;
1435 		int	level;
1436 		int	name;
1437 		caddr_t	val;
1438 		int	valsize;
1439 	} */ *uap;
1440 {
1441 
1442 	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
1443 	    uap->val, UIO_USERSPACE, uap->valsize));
1444 }
1445 
1446 int
1447 kern_setsockopt(td, s, level, name, val, valseg, valsize)
1448 	struct thread *td;
1449 	int s;
1450 	int level;
1451 	int name;
1452 	void *val;
1453 	enum uio_seg valseg;
1454 	socklen_t valsize;
1455 {
1456 	int error;
1457 	struct socket *so;
1458 	struct file *fp;
1459 	struct sockopt sopt;
1460 
1461 	if (val == NULL && valsize != 0)
1462 		return (EFAULT);
1463 	if ((int)valsize < 0)
1464 		return (EINVAL);
1465 
1466 	sopt.sopt_dir = SOPT_SET;
1467 	sopt.sopt_level = level;
1468 	sopt.sopt_name = name;
1469 	sopt.sopt_val = val;
1470 	sopt.sopt_valsize = valsize;
1471 	switch (valseg) {
1472 	case UIO_USERSPACE:
1473 		sopt.sopt_td = td;
1474 		break;
1475 	case UIO_SYSSPACE:
1476 		sopt.sopt_td = NULL;
1477 		break;
1478 	default:
1479 		panic("kern_setsockopt called with bad valseg");
1480 	}
1481 
1482 	AUDIT_ARG_FD(s);
1483 	error = getsock_cap(td->td_proc->p_fd, s, CAP_SETSOCKOPT, &fp, NULL);
1484 	if (error == 0) {
1485 		so = fp->f_data;
1486 		error = sosetopt(so, &sopt);
1487 		fdrop(fp, td);
1488 	}
1489 	return(error);
1490 }
1491 
1492 /* ARGSUSED */
1493 int
1494 sys_getsockopt(td, uap)
1495 	struct thread *td;
1496 	struct getsockopt_args /* {
1497 		int	s;
1498 		int	level;
1499 		int	name;
1500 		void * __restrict	val;
1501 		socklen_t * __restrict avalsize;
1502 	} */ *uap;
1503 {
1504 	socklen_t valsize;
1505 	int	error;
1506 
1507 	if (uap->val) {
1508 		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
1509 		if (error)
1510 			return (error);
1511 	}
1512 
1513 	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
1514 	    uap->val, UIO_USERSPACE, &valsize);
1515 
1516 	if (error == 0)
1517 		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
1518 	return (error);
1519 }
1520 
1521 /*
1522  * Kernel version of getsockopt.
1523  * optval can be a userland or userspace. optlen is always a kernel pointer.
1524  */
1525 int
1526 kern_getsockopt(td, s, level, name, val, valseg, valsize)
1527 	struct thread *td;
1528 	int s;
1529 	int level;
1530 	int name;
1531 	void *val;
1532 	enum uio_seg valseg;
1533 	socklen_t *valsize;
1534 {
1535 	int error;
1536 	struct  socket *so;
1537 	struct file *fp;
1538 	struct	sockopt sopt;
1539 
1540 	if (val == NULL)
1541 		*valsize = 0;
1542 	if ((int)*valsize < 0)
1543 		return (EINVAL);
1544 
1545 	sopt.sopt_dir = SOPT_GET;
1546 	sopt.sopt_level = level;
1547 	sopt.sopt_name = name;
1548 	sopt.sopt_val = val;
1549 	sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
1550 	switch (valseg) {
1551 	case UIO_USERSPACE:
1552 		sopt.sopt_td = td;
1553 		break;
1554 	case UIO_SYSSPACE:
1555 		sopt.sopt_td = NULL;
1556 		break;
1557 	default:
1558 		panic("kern_getsockopt called with bad valseg");
1559 	}
1560 
1561 	AUDIT_ARG_FD(s);
1562 	error = getsock_cap(td->td_proc->p_fd, s, CAP_GETSOCKOPT, &fp, NULL);
1563 	if (error == 0) {
1564 		so = fp->f_data;
1565 		error = sogetopt(so, &sopt);
1566 		*valsize = sopt.sopt_valsize;
1567 		fdrop(fp, td);
1568 	}
1569 	return (error);
1570 }
1571 
1572 /*
1573  * getsockname1() - Get socket name.
1574  */
1575 /* ARGSUSED */
1576 static int
1577 getsockname1(td, uap, compat)
1578 	struct thread *td;
1579 	struct getsockname_args /* {
1580 		int	fdes;
1581 		struct sockaddr * __restrict asa;
1582 		socklen_t * __restrict alen;
1583 	} */ *uap;
1584 	int compat;
1585 {
1586 	struct sockaddr *sa;
1587 	socklen_t len;
1588 	int error;
1589 
1590 	error = copyin(uap->alen, &len, sizeof(len));
1591 	if (error)
1592 		return (error);
1593 
1594 	error = kern_getsockname(td, uap->fdes, &sa, &len);
1595 	if (error)
1596 		return (error);
1597 
1598 	if (len != 0) {
1599 #ifdef COMPAT_OLDSOCK
1600 		if (compat)
1601 			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1602 #endif
1603 		error = copyout(sa, uap->asa, (u_int)len);
1604 	}
1605 	free(sa, M_SONAME);
1606 	if (error == 0)
1607 		error = copyout(&len, uap->alen, sizeof(len));
1608 	return (error);
1609 }
1610 
1611 int
1612 kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
1613     socklen_t *alen)
1614 {
1615 	struct socket *so;
1616 	struct file *fp;
1617 	socklen_t len;
1618 	int error;
1619 
1620 	AUDIT_ARG_FD(fd);
1621 	error = getsock_cap(td->td_proc->p_fd, fd, CAP_GETSOCKNAME, &fp, NULL);
1622 	if (error)
1623 		return (error);
1624 	so = fp->f_data;
1625 	*sa = NULL;
1626 	CURVNET_SET(so->so_vnet);
1627 	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
1628 	CURVNET_RESTORE();
1629 	if (error)
1630 		goto bad;
1631 	if (*sa == NULL)
1632 		len = 0;
1633 	else
1634 		len = MIN(*alen, (*sa)->sa_len);
1635 	*alen = len;
1636 #ifdef KTRACE
1637 	if (KTRPOINT(td, KTR_STRUCT))
1638 		ktrsockaddr(*sa);
1639 #endif
1640 bad:
1641 	fdrop(fp, td);
1642 	if (error && *sa) {
1643 		free(*sa, M_SONAME);
1644 		*sa = NULL;
1645 	}
1646 	return (error);
1647 }
1648 
/*
 * getsockname(2) system call: getsockname1() without the old-sockaddr
 * conversion (compat == 0).
 */
int
sys_getsockname(struct thread *td, struct getsockname_args *uap)
{
	const int compat = 0;

	return (getsockname1(td, uap, compat));
}
1657 
1658 #ifdef COMPAT_OLDSOCK
/*
 * Old getsockname entry point: getsockname1() with the old-sockaddr
 * conversion requested (compat == 1).
 */
int
ogetsockname(struct thread *td, struct getsockname_args *uap)
{
	const int compat = 1;

	return (getsockname1(td, uap, compat));
}
1667 #endif /* COMPAT_OLDSOCK */
1668 
1669 /*
1670  * getpeername1() - Get name of peer for connected socket.
1671  */
1672 /* ARGSUSED */
1673 static int
1674 getpeername1(td, uap, compat)
1675 	struct thread *td;
1676 	struct getpeername_args /* {
1677 		int	fdes;
1678 		struct sockaddr * __restrict	asa;
1679 		socklen_t * __restrict	alen;
1680 	} */ *uap;
1681 	int compat;
1682 {
1683 	struct sockaddr *sa;
1684 	socklen_t len;
1685 	int error;
1686 
1687 	error = copyin(uap->alen, &len, sizeof (len));
1688 	if (error)
1689 		return (error);
1690 
1691 	error = kern_getpeername(td, uap->fdes, &sa, &len);
1692 	if (error)
1693 		return (error);
1694 
1695 	if (len != 0) {
1696 #ifdef COMPAT_OLDSOCK
1697 		if (compat)
1698 			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1699 #endif
1700 		error = copyout(sa, uap->asa, (u_int)len);
1701 	}
1702 	free(sa, M_SONAME);
1703 	if (error == 0)
1704 		error = copyout(&len, uap->alen, sizeof(len));
1705 	return (error);
1706 }
1707 
1708 int
1709 kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
1710     socklen_t *alen)
1711 {
1712 	struct socket *so;
1713 	struct file *fp;
1714 	socklen_t len;
1715 	int error;
1716 
1717 	AUDIT_ARG_FD(fd);
1718 	error = getsock_cap(td->td_proc->p_fd, fd, CAP_GETPEERNAME, &fp, NULL);
1719 	if (error)
1720 		return (error);
1721 	so = fp->f_data;
1722 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1723 		error = ENOTCONN;
1724 		goto done;
1725 	}
1726 	*sa = NULL;
1727 	CURVNET_SET(so->so_vnet);
1728 	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
1729 	CURVNET_RESTORE();
1730 	if (error)
1731 		goto bad;
1732 	if (*sa == NULL)
1733 		len = 0;
1734 	else
1735 		len = MIN(*alen, (*sa)->sa_len);
1736 	*alen = len;
1737 #ifdef KTRACE
1738 	if (KTRPOINT(td, KTR_STRUCT))
1739 		ktrsockaddr(*sa);
1740 #endif
1741 bad:
1742 	if (error && *sa) {
1743 		free(*sa, M_SONAME);
1744 		*sa = NULL;
1745 	}
1746 done:
1747 	fdrop(fp, td);
1748 	return (error);
1749 }
1750 
/*
 * getpeername(2) system call: getpeername1() without the old-sockaddr
 * conversion (compat == 0).
 */
int
sys_getpeername(struct thread *td, struct getpeername_args *uap)
{
	const int compat = 0;

	return (getpeername1(td, uap, compat));
}
1759 
1760 #ifdef COMPAT_OLDSOCK
/*
 * Old getpeername entry point: getpeername1() with the old-sockaddr
 * conversion requested (compat == 1).
 */
int
ogetpeername(struct thread *td, struct ogetpeername_args *uap)
{
	struct getpeername_args *args;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	args = (struct getpeername_args *)uap;
	return (getpeername1(td, args, 1));
}
1770 #endif /* COMPAT_OLDSOCK */
1771 
1772 int
1773 sockargs(mp, buf, buflen, type)
1774 	struct mbuf **mp;
1775 	caddr_t buf;
1776 	int buflen, type;
1777 {
1778 	struct sockaddr *sa;
1779 	struct mbuf *m;
1780 	int error;
1781 
1782 	if (buflen > MLEN) {
1783 #ifdef COMPAT_OLDSOCK
1784 		if (type == MT_SONAME && buflen <= 112)
1785 			buflen = MLEN;		/* unix domain compat. hack */
1786 		else
1787 #endif
1788 			if (buflen > MCLBYTES)
1789 				return (EINVAL);
1790 	}
1791 	m = m_get2(buflen, M_WAITOK, type, 0);
1792 	m->m_len = buflen;
1793 	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1794 	if (error)
1795 		(void) m_free(m);
1796 	else {
1797 		*mp = m;
1798 		if (type == MT_SONAME) {
1799 			sa = mtod(m, struct sockaddr *);
1800 
1801 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1802 			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1803 				sa->sa_family = sa->sa_len;
1804 #endif
1805 			sa->sa_len = buflen;
1806 		}
1807 	}
1808 	return (error);
1809 }
1810 
1811 int
1812 getsockaddr(namp, uaddr, len)
1813 	struct sockaddr **namp;
1814 	caddr_t uaddr;
1815 	size_t len;
1816 {
1817 	struct sockaddr *sa;
1818 	int error;
1819 
1820 	if (len > SOCK_MAXADDRLEN)
1821 		return (ENAMETOOLONG);
1822 	if (len < offsetof(struct sockaddr, sa_data[0]))
1823 		return (EINVAL);
1824 	sa = malloc(len, M_SONAME, M_WAITOK);
1825 	error = copyin(uaddr, sa, len);
1826 	if (error) {
1827 		free(sa, M_SONAME);
1828 	} else {
1829 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1830 		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1831 			sa->sa_family = sa->sa_len;
1832 #endif
1833 		sa->sa_len = len;
1834 		*namp = sa;
1835 	}
1836 	return (error);
1837 }
1838 
1839 #include <sys/condvar.h>
1840 
1841 struct sendfile_sync {
1842 	struct mtx	mtx;
1843 	struct cv	cv;
1844 	unsigned	count;
1845 };
1846 
1847 /*
1848  * Detach mapped page and release resources back to the system.
1849  */
1850 void
1851 sf_buf_mext(void *addr, void *args)
1852 {
1853 	vm_page_t m;
1854 	struct sendfile_sync *sfs;
1855 
1856 	m = sf_buf_page(args);
1857 	sf_buf_free(args);
1858 	vm_page_lock(m);
1859 	vm_page_unwire(m, 0);
1860 	/*
1861 	 * Check for the object going away on us. This can
1862 	 * happen since we don't hold a reference to it.
1863 	 * If so, we're responsible for freeing the page.
1864 	 */
1865 	if (m->wire_count == 0 && m->object == NULL)
1866 		vm_page_free(m);
1867 	vm_page_unlock(m);
1868 	if (addr == NULL)
1869 		return;
1870 	sfs = addr;
1871 	mtx_lock(&sfs->mtx);
1872 	KASSERT(sfs->count> 0, ("Sendfile sync botchup count == 0"));
1873 	if (--sfs->count == 0)
1874 		cv_signal(&sfs->cv);
1875 	mtx_unlock(&sfs->mtx);
1876 }
1877 
/*
 * sendfile(2)
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Transmit the file 'fd', beginning at 'offset', over the socket 's'.
 * At most 'nbytes' bytes of the file are sent; nbytes == 0 means "up to
 * EOF".  A header and/or trailer may optionally be added to the socket
 * output, and when 'sbytes' is given the total number of bytes sent is
 * stored there.
 *
 * This is the current entry point, i.e. do_sendfile() with compat == 0.
 */
int
sys_sendfile(struct thread *td, struct sendfile_args *uap)
{
	const int compat = 0;

	return (do_sendfile(td, uap, compat));
}
1895 
1896 static int
1897 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
1898 {
1899 	struct sf_hdtr hdtr;
1900 	struct uio *hdr_uio, *trl_uio;
1901 	int error;
1902 
1903 	hdr_uio = trl_uio = NULL;
1904 
1905 	if (uap->hdtr != NULL) {
1906 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1907 		if (error)
1908 			goto out;
1909 		if (hdtr.headers != NULL) {
1910 			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
1911 			if (error)
1912 				goto out;
1913 		}
1914 		if (hdtr.trailers != NULL) {
1915 			error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
1916 			if (error)
1917 				goto out;
1918 
1919 		}
1920 	}
1921 
1922 	error = kern_sendfile(td, uap, hdr_uio, trl_uio, compat);
1923 out:
1924 	if (hdr_uio)
1925 		free(hdr_uio, M_IOV);
1926 	if (trl_uio)
1927 		free(trl_uio, M_IOV);
1928 	return (error);
1929 }
1930 
1931 #ifdef COMPAT_FREEBSD4
1932 int
1933 freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
1934 {
1935 	struct sendfile_args args;
1936 
1937 	args.fd = uap->fd;
1938 	args.s = uap->s;
1939 	args.offset = uap->offset;
1940 	args.nbytes = uap->nbytes;
1941 	args.hdtr = uap->hdtr;
1942 	args.sbytes = uap->sbytes;
1943 	args.flags = uap->flags;
1944 
1945 	return (do_sendfile(td, &args, 1));
1946 }
1947 #endif /* COMPAT_FREEBSD4 */
1948 
/*
 * kern_sendfile() - in-kernel implementation of sendfile(2).
 *
 * Sends data from the file referenced by uap->fd, starting at uap->offset,
 * to the socket uap->s.  uap->nbytes limits the amount of file data; when
 * it is 0 the file size reported by VOP_GETATTR() is used instead.
 *
 *	hdr_uio	optional header data; it is converted to an mbuf chain and
 *		the first file data is appended to that chain (may be NULL).
 *	trl_uio	optional trailer data, written with kern_writev() once the
 *		main loop has been left through "break" (may be NULL).
 *	compat	when non-zero, uap->nbytes is reduced by the header size.
 *
 * If uap->sbytes is non-NULL the running total of bytes sent is copied out
 * to it on every exit path; the result of that copyout is not checked.
 * Returns 0 or an errno value; ERESTART is reported as EINTR.
 */
1949 int
1950 kern_sendfile(struct thread *td, struct sendfile_args *uap,
1951     struct uio *hdr_uio, struct uio *trl_uio, int compat)
1952 {
1953 	struct file *sock_fp;
1954 	struct vnode *vp;
1955 	struct vm_object *obj = NULL;
1956 	struct socket *so = NULL;
1957 	struct mbuf *m = NULL;
1958 	struct sf_buf *sf;
1959 	struct vm_page *pg;
1960 	struct vattr va;
1961 	off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0;
1962 	int error, hdrlen = 0, mnw = 0;
1963 	int bsize;
1964 	struct sendfile_sync *sfs = NULL;
1965 
1966 	/*
1967 	 * The file descriptor must be a regular file and have a
1968 	 * backing VM object.
1969 	 * File offset must be positive.  If it goes beyond EOF
1970 	 * we send only the header/trailer and no payload data.
1971 	 */
1972 	AUDIT_ARG_FD(uap->fd);
1973 	/*
1974 	 * sendfile(2) can start at any offset within a file so we require
1975 	 * CAP_READ+CAP_SEEK = CAP_PREAD.
1976 	 */
1977 	if ((error = fgetvp_read(td, uap->fd, CAP_PREAD, &vp)) != 0)
1978 		goto out;
1979 	vn_lock(vp, LK_SHARED | LK_RETRY);
1980 	if (vp->v_type == VREG) {
1981 		bsize = vp->v_mount->mnt_stat.f_iosize;
1982 		if (uap->nbytes == 0) {
1983 			error = VOP_GETATTR(vp, &va, td->td_ucred);
1984 			if (error != 0) {
1985 				VOP_UNLOCK(vp, 0);
1986 				obj = NULL;
1987 				goto out;
1988 			}
1989 			rem = va.va_size;
1990 		} else
1991 			rem = uap->nbytes;
1992 		obj = vp->v_object;
1993 		if (obj != NULL) {
1994 			/*
1995 			 * Temporarily increase the backing VM
1996 			 * object's reference count so that a forced
1997 			 * reclamation of its vnode does not
1998 			 * immediately destroy it.
1999 			 */
2000 			VM_OBJECT_WLOCK(obj);
2001 			if ((obj->flags & OBJ_DEAD) == 0) {
2002 				vm_object_reference_locked(obj);
2003 				VM_OBJECT_WUNLOCK(obj);
2004 			} else {
2005 				VM_OBJECT_WUNLOCK(obj);
2006 				obj = NULL;
2007 			}
2008 		}
2009 	} else
2010 		bsize = 0;	/* silence gcc */
2011 	VOP_UNLOCK(vp, 0);
	/* obj is still NULL here for non-regular files and dead objects. */
2012 	if (obj == NULL) {
2013 		error = EINVAL;
2014 		goto out;
2015 	}
2016 	if (uap->offset < 0) {
2017 		error = EINVAL;
2018 		goto out;
2019 	}
2020 
2021 	/*
2022 	 * The socket must be a stream socket and connected.
2023 	 * Remember if it is a blocking or non-blocking socket.
2024 	 */
2025 	if ((error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_SEND,
2026 	    &sock_fp, NULL)) != 0)
2027 		goto out;
2028 	so = sock_fp->f_data;
2029 	if (so->so_type != SOCK_STREAM) {
2030 		error = EINVAL;
2031 		goto out;
2032 	}
2033 	if ((so->so_state & SS_ISCONNECTED) == 0) {
2034 		error = ENOTCONN;
2035 		goto out;
2036 	}
2037 	/*
2038 	 * Do not wait on memory allocations but return ENOMEM for
2039 	 * caller to retry later.
2040 	 * XXX: Experimental.
2041 	 */
	/*
	 * NOTE(review): the mnw failure paths below set EAGAIN, not
	 * ENOMEM as the comment above says - confirm which is intended.
	 */
2042 	if (uap->flags & SF_MNOWAIT)
2043 		mnw = 1;
2044 
2045 	if (uap->flags & SF_SYNC) {
2046 		sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO);
2047 		mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
2048 		cv_init(&sfs->cv, "sendfile");
2049 	}
2050 
2051 #ifdef MAC
2052 	error = mac_socket_check_send(td->td_ucred, so);
2053 	if (error)
2054 		goto out;
2055 #endif
2056 
2057 	/* If headers are specified copy them into mbufs. */
2058 	if (hdr_uio != NULL) {
2059 		hdr_uio->uio_td = td;
2060 		hdr_uio->uio_rw = UIO_WRITE;
2061 		if (hdr_uio->uio_resid > 0) {
2062 			/*
2063 			 * In FBSD < 5.0 the nbytes to send also included
2064 			 * the header.  If compat is specified subtract the
2065 			 * header size from nbytes.
2066 			 */
2067 			if (compat) {
2068 				if (uap->nbytes > hdr_uio->uio_resid)
2069 					uap->nbytes -= hdr_uio->uio_resid;
2070 				else
2071 					uap->nbytes = 0;
2072 			}
2073 			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
2074 			    0, 0, 0);
2075 			if (m == NULL) {
2076 				error = mnw ? EAGAIN : ENOBUFS;
2077 				goto out;
2078 			}
2079 			hdrlen = m_length(m, NULL);
2080 		}
2081 	}
2082 
2083 	/*
2084 	 * Protect against multiple writers to the socket.
2085 	 *
2086 	 * XXXRW: Historically this has assumed non-interruptibility, so now
2087 	 * we implement that, but possibly shouldn't.
2088 	 */
2089 	(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
2090 
2091 	/*
2092 	 * Loop through the pages of the file, starting with the requested
2093 	 * offset. Get a file page (do I/O if necessary), map the file page
2094 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
2095 	 * it on the socket.
2096 	 * This is done in two loops.  The inner loop turns as many pages
2097 	 * as it can, up to available socket buffer space, without blocking
2098 	 * into mbufs to have it bulk delivered into the socket send buffer.
2099 	 * The outer loop checks the state and available space of the socket
2100 	 * and takes care of the overall progress.
2101 	 */
2102 	for (off = uap->offset; ; ) {
2103 		struct mbuf *mtail;
2104 		int loopbytes;
2105 		int space;
2106 		int done;
2107 
		/*
		 * Stop once the requested byte count has been sent.  va is
		 * only consulted when nbytes == 0, the case in which it was
		 * filled in by VOP_GETATTR() above.
		 */
2108 		if ((uap->nbytes != 0 && uap->nbytes == fsbytes) ||
2109 		    (uap->nbytes == 0 && va.va_size == fsbytes))
2110 			break;
2111 
2112 		mtail = NULL;
2113 		loopbytes = 0;
2114 		space = 0;
2115 		done = 0;
2116 
2117 		/*
2118 		 * Check the socket state for ongoing connection,
2119 		 * no errors and space in socket buffer.
2120 		 * If space is low allow for the remainder of the
2121 		 * file to be processed if it fits the socket buffer.
2122 		 * Otherwise block in waiting for sufficient space
2123 		 * to proceed, or if the socket is nonblocking, return
2124 		 * to userland with EAGAIN while reporting how far
2125 		 * we've come.
2126 		 * We wait until the socket buffer has significant free
2127 		 * space to do bulk sends.  This makes good use of file
2128 		 * system read ahead and allows packet segmentation
2129 		 * offloading hardware to take over lots of work.  If
2130 		 * we were not careful here we would send off only one
2131 		 * sfbuf at a time.
2132 		 */
2133 		SOCKBUF_LOCK(&so->so_snd);
2134 		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
2135 			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
2136 retry_space:
2137 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2138 			error = EPIPE;
2139 			SOCKBUF_UNLOCK(&so->so_snd);
2140 			goto done;
2141 		} else if (so->so_error) {
2142 			error = so->so_error;
2143 			so->so_error = 0;
2144 			SOCKBUF_UNLOCK(&so->so_snd);
2145 			goto done;
2146 		}
2147 		space = sbspace(&so->so_snd);
2148 		if (space < rem &&
2149 		    (space <= 0 ||
2150 		     space < so->so_snd.sb_lowat)) {
2151 			if (so->so_state & SS_NBIO) {
2152 				SOCKBUF_UNLOCK(&so->so_snd);
2153 				error = EAGAIN;
2154 				goto done;
2155 			}
2156 			/*
2157 			 * sbwait drops the lock while sleeping.
2158 			 * When we loop back to retry_space the
2159 			 * state may have changed and we retest
2160 			 * for it.
2161 			 */
2162 			error = sbwait(&so->so_snd);
2163 			/*
2164 			 * An error from sbwait usually indicates that we've
2165 			 * been interrupted by a signal. If we've sent anything
2166 			 * then return bytes sent, otherwise return the error.
2167 			 */
2168 			if (error) {
2169 				SOCKBUF_UNLOCK(&so->so_snd);
2170 				goto done;
2171 			}
2172 			goto retry_space;
2173 		}
2174 		SOCKBUF_UNLOCK(&so->so_snd);
2175 
2176 		/*
2177 		 * Reduce space in the socket buffer by the size of
2178 		 * the header mbuf chain.
2179 		 * hdrlen is set to 0 after the first loop.
2180 		 */
2181 		space -= hdrlen;
2182 
2183 		error = vn_lock(vp, LK_SHARED);
2184 		if (error != 0)
2185 			goto done;
2186 		error = VOP_GETATTR(vp, &va, td->td_ucred);
2187 		if (error != 0 || off >= va.va_size) {
2188 			VOP_UNLOCK(vp, 0);
2189 			goto done;
2190 		}
2191 
2192 		/*
2193 		 * Loop and construct maximum sized mbuf chain to be bulk
2194 		 * dumped into socket buffer.
2195 		 */
2196 		while (space > loopbytes) {
2197 			vm_pindex_t pindex;
2198 			vm_offset_t pgoff;
2199 			struct mbuf *m0;
2200 
2201 			/*
2202 			 * Calculate the amount to transfer.
2203 			 * Not to exceed a page, the EOF,
2204 			 * or the passed in nbytes.
2205 			 */
2206 			pgoff = (vm_offset_t)(off & PAGE_MASK);
2207 			if (uap->nbytes)
2208 				rem = (uap->nbytes - fsbytes - loopbytes);
2209 			else
2210 				rem = va.va_size -
2211 				    uap->offset - fsbytes - loopbytes;
2212 			xfsize = omin(PAGE_SIZE - pgoff, rem);
2213 			xfsize = omin(space - loopbytes, xfsize);
2214 			if (xfsize <= 0) {
2215 				done = 1;		/* all data sent */
2216 				break;
2217 			}
2218 
2219 			/*
2220 			 * Attempt to look up the page.  Allocate
2221 			 * if not found or wait and loop if busy.
2222 			 */
2223 			pindex = OFF_TO_IDX(off);
2224 			VM_OBJECT_WLOCK(obj);
2225 			pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
2226 			    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_RETRY);
2227 
2228 			/*
2229 			 * Check if page is valid for what we need,
2230 			 * otherwise initiate I/O.
2231 			 * If we already turned some pages into mbufs,
2232 			 * send them off before we come here again and
2233 			 * block.
2234 			 */
			/*
			 * Only the first branch drops the object lock; the
			 * two error branches leave it held so that the
			 * "if (error)" block below can release it.
			 */
2235 			if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
2236 				VM_OBJECT_WUNLOCK(obj);
2237 			else if (m != NULL)
2238 				error = EAGAIN;	/* send what we already got */
2239 			else if (uap->flags & SF_NODISKIO)
2240 				error = EBUSY;
2241 			else {
2242 				ssize_t resid;
2243 
2244 				/*
2245 				 * Ensure that our page is still around
2246 				 * when the I/O completes.
2247 				 */
2248 				vm_page_io_start(pg);
2249 				VM_OBJECT_WUNLOCK(obj);
2250 
2251 				/*
2252 				 * Get the page from backing store.
2253 				 * XXXMAC: Because we don't have fp->f_cred
2254 				 * here, we pass in NOCRED.  This is probably
2255 				 * wrong, but is consistent with our original
2256 				 * implementation.
2257 				 */
2258 				error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
2259 				    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
2260 				    IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
2261 				    td->td_ucred, NOCRED, &resid, td);
2262 				VM_OBJECT_WLOCK(obj);
2263 				vm_page_io_finish(pg);
2264 				if (!error)
2265 					VM_OBJECT_WUNLOCK(obj);
2266 				SFSTAT_INC(sf_iocnt);
2267 			}
2268 			if (error) {
2269 				vm_page_lock(pg);
2270 				vm_page_unwire(pg, 0);
2271 				/*
2272 				 * See if anyone else might know about
2273 				 * this page.  If not and it is not valid,
2274 				 * then free it.
2275 				 */
2276 				if (pg->wire_count == 0 && pg->valid == 0 &&
2277 				    pg->busy == 0 && !(pg->oflags & VPO_BUSY))
2278 					vm_page_free(pg);
2279 				vm_page_unlock(pg);
2280 				VM_OBJECT_WUNLOCK(obj);
2281 				if (error == EAGAIN)
2282 					error = 0;	/* not a real error */
2283 				break;
2284 			}
2285 
2286 			/*
2287 			 * Get a sendfile buf.  When allocating the
2288 			 * first buffer for mbuf chain, we usually
2289 			 * wait as long as necessary, but this wait
2290 			 * can be interrupted.  For subsequent
2291 			 * buffers, do not sleep, since several
2292 			 * threads might exhaust the buffers and then
2293 			 * deadlock.
2294 			 */
2295 			sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
2296 			    SFB_CATCH);
2297 			if (sf == NULL) {
2298 				SFSTAT_INC(sf_allocfail);
2299 				vm_page_lock(pg);
2300 				vm_page_unwire(pg, 0);
2301 				KASSERT(pg->object != NULL,
2302 				    ("kern_sendfile: object disappeared"));
2303 				vm_page_unlock(pg);
2304 				if (m == NULL)
2305 					error = (mnw ? EAGAIN : EINTR);
2306 				break;
2307 			}
2308 
2309 			/*
2310 			 * Get an mbuf and set it up as having
2311 			 * external storage.
2312 			 */
2313 			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
2314 			if (m0 == NULL) {
2315 				error = (mnw ? EAGAIN : ENOBUFS);
				/* NULL: sfs->count was not bumped for this buffer. */
2316 				sf_buf_mext(NULL, sf);
2317 				break;
2318 			}
2319 			if (m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE,
2320 			    sf_buf_mext, sfs, sf, M_RDONLY, EXT_SFBUF,
2321 			    (mnw ? M_NOWAIT : M_WAITOK)) != 0) {
2322 				error = (mnw ? EAGAIN : ENOBUFS);
2323 				sf_buf_mext(NULL, sf);
2324 				m_freem(m0);
2325 				break;
2326 			}
2327 			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
2328 			m0->m_len = xfsize;
2329 
2330 			/* Append to mbuf chain. */
2331 			if (mtail != NULL)
2332 				mtail->m_next = m0;
2333 			else if (m != NULL)
2334 				m_last(m)->m_next = m0;
2335 			else
2336 				m = m0;
2337 			mtail = m0;
2338 
2339 			/* Keep track of bits processed. */
2340 			loopbytes += xfsize;
2341 			off += xfsize;
2342 
			/* One count per mbuf; sf_buf_mext() drops it again. */
2343 			if (sfs != NULL) {
2344 				mtx_lock(&sfs->mtx);
2345 				sfs->count++;
2346 				mtx_unlock(&sfs->mtx);
2347 			}
2348 		}
2349 
2350 		VOP_UNLOCK(vp, 0);
2351 
2352 		/* Add the buffer chain to the socket buffer. */
2353 		if (m != NULL) {
2354 			int mlen, err;
2355 
2356 			mlen = m_length(m, NULL);
2357 			SOCKBUF_LOCK(&so->so_snd);
2358 			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2359 				error = EPIPE;
2360 				SOCKBUF_UNLOCK(&so->so_snd);
2361 				goto done;
2362 			}
2363 			SOCKBUF_UNLOCK(&so->so_snd);
2364 			CURVNET_SET(so->so_vnet);
2365 			/* Avoid error aliasing. */
2366 			err = (*so->so_proto->pr_usrreqs->pru_send)
2367 				    (so, 0, m, NULL, NULL, td);
2368 			CURVNET_RESTORE();
2369 			if (err == 0) {
2370 				/*
2371 				 * We need two counters to get the
2372 				 * file offset and nbytes to send
2373 				 * right:
2374 				 * - sbytes contains the total amount
2375 				 *   of bytes sent, including headers.
2376 				 * - fsbytes contains the total amount
2377 				 *   of bytes sent from the file.
2378 				 */
2379 				sbytes += mlen;
2380 				fsbytes += mlen;
2381 				if (hdrlen) {
2382 					fsbytes -= hdrlen;
2383 					hdrlen = 0;
2384 				}
2385 			} else if (error == 0)
2386 				error = err;
2387 			m = NULL;	/* pru_send always consumes */
2388 		}
2389 
2390 		/* Quit outer loop on error or when we're done. */
2391 		if (done)
2392 			break;
2393 		if (error)
2394 			goto done;
2395 	}
2396 
2397 	/*
2398 	 * Send trailers. Wimp out and use writev(2).
2399 	 */
2400 	if (trl_uio != NULL) {
2401 		sbunlock(&so->so_snd);
2402 		error = kern_writev(td, uap->s, trl_uio);
2403 		if (error == 0)
2404 			sbytes += td->td_retval[0];
2405 		goto out;
2406 	}
2407 
2408 done:
2409 	sbunlock(&so->so_snd);
2410 out:
2411 	/*
2412 	 * If there was no error we have to clear td->td_retval[0]
2413 	 * because it may have been set by writev.
2414 	 */
2415 	if (error == 0) {
2416 		td->td_retval[0] = 0;
2417 	}
2418 	if (uap->sbytes != NULL) {
2419 		copyout(&sbytes, uap->sbytes, sizeof(off_t));
2420 	}
2421 	if (obj != NULL)
2422 		vm_object_deallocate(obj);
	/*
	 * NOTE(review): vp has no initializer, so this test presumably
	 * relies on fgetvp_read() storing NULL on failure - confirm.
	 */
2423 	if (vp != NULL)
2424 		vrele(vp);
	/* so is set only after getsock_cap() succeeded, so sock_fp is valid. */
2425 	if (so)
2426 		fdrop(sock_fp, td);
2427 	if (m)
2428 		m_freem(m);
2429 
	/*
	 * SF_SYNC: sleep on the condition variable that sf_buf_mext()
	 * signals when the count reaches zero, then tear the state down.
	 */
2430 	if (sfs != NULL) {
2431 		mtx_lock(&sfs->mtx);
2432 		if (sfs->count != 0)
2433 			cv_wait(&sfs->cv, &sfs->mtx);
2434 		KASSERT(sfs->count == 0, ("sendfile sync still busy"));
2435 		cv_destroy(&sfs->cv);
2436 		mtx_destroy(&sfs->mtx);
2437 		free(sfs, M_TEMP);
2438 	}
2439 
2440 	if (error == ERESTART)
2441 		error = EINTR;
2442 
2443 	return (error);
2444 }
2445 
2446 /*
2447  * SCTP syscalls.
2448  * Functionality only compiled in if SCTP is defined in the kernel Makefile,
2449  * otherwise all return EOPNOTSUPP.
2450  * XXX: We should make this loadable one day.
2451  */
2452 int
2453 sys_sctp_peeloff(td, uap)
2454 	struct thread *td;
2455 	struct sctp_peeloff_args /* {
2456 		int	sd;
2457 		caddr_t	name;
2458 	} */ *uap;
2459 {
2460 #if (defined(INET) || defined(INET6)) && defined(SCTP)
2461 	struct file *nfp = NULL;
2462 	int error;
2463 	struct socket *head, *so;
2464 	int fd;
2465 	u_int fflag;
2466 
2467 	AUDIT_ARG_FD(uap->sd);
2468 	error = fgetsock(td, uap->sd, CAP_PEELOFF, &head, &fflag);
2469 	if (error)
2470 		goto done2;
2471 	if (head->so_proto->pr_protocol != IPPROTO_SCTP) {
2472 		error = EOPNOTSUPP;
2473 		goto done;
2474 	}
2475 	error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
2476 	if (error)
2477 		goto done;
2478 	/*
2479 	 * At this point we know we do have a assoc to pull
2480 	 * we proceed to get the fd setup. This may block
2481 	 * but that is ok.
2482 	 */
2483 
2484 	error = falloc(td, &nfp, &fd, 0);
2485 	if (error)
2486 		goto done;
2487 	td->td_retval[0] = fd;
2488 
2489 	CURVNET_SET(head->so_vnet);
2490 	so = sonewconn(head, SS_ISCONNECTED);
2491 	if (so == NULL) {
2492 		error = ENOMEM;
2493 		goto noconnection;
2494 	}
2495 	/*
2496 	 * Before changing the flags on the socket, we have to bump the
2497 	 * reference count.  Otherwise, if the protocol calls sofree(),
2498 	 * the socket will be released due to a zero refcount.
2499 	 */
2500         SOCK_LOCK(so);
2501         soref(so);                      /* file descriptor reference */
2502         SOCK_UNLOCK(so);
2503 
2504 	ACCEPT_LOCK();
2505 
2506 	TAILQ_REMOVE(&head->so_comp, so, so_list);
2507 	head->so_qlen--;
2508 	so->so_state |= (head->so_state & SS_NBIO);
2509 	so->so_state &= ~SS_NOFDREF;
2510 	so->so_qstate &= ~SQ_COMP;
2511 	so->so_head = NULL;
2512 	ACCEPT_UNLOCK();
2513 	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
2514 	error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
2515 	if (error)
2516 		goto noconnection;
2517 	if (head->so_sigio != NULL)
2518 		fsetown(fgetown(&head->so_sigio), &so->so_sigio);
2519 
2520 noconnection:
2521 	/*
2522 	 * close the new descriptor, assuming someone hasn't ripped it
2523 	 * out from under us.
2524 	 */
2525 	if (error)
2526 		fdclose(td->td_proc->p_fd, nfp, fd, td);
2527 
2528 	/*
2529 	 * Release explicitly held references before returning.
2530 	 */
2531 	CURVNET_RESTORE();
2532 done:
2533 	if (nfp != NULL)
2534 		fdrop(nfp, td);
2535 	fputsock(head);
2536 done2:
2537 	return (error);
2538 #else  /* SCTP */
2539 	return (EOPNOTSUPP);
2540 #endif /* SCTP */
2541 }
2542 
2543 int
2544 sys_sctp_generic_sendmsg (td, uap)
2545 	struct thread *td;
2546 	struct sctp_generic_sendmsg_args /* {
2547 		int sd,
2548 		caddr_t msg,
2549 		int mlen,
2550 		caddr_t to,
2551 		__socklen_t tolen,
2552 		struct sctp_sndrcvinfo *sinfo,
2553 		int flags
2554 	} */ *uap;
2555 {
2556 #if (defined(INET) || defined(INET6)) && defined(SCTP)
2557 	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
2558 	struct socket *so;
2559 	struct file *fp = NULL;
2560 	int error = 0, len;
2561 	struct sockaddr *to = NULL;
2562 #ifdef KTRACE
2563 	struct uio *ktruio = NULL;
2564 #endif
2565 	struct uio auio;
2566 	struct iovec iov[1];
2567 	cap_rights_t rights;
2568 
2569 	if (uap->sinfo) {
2570 		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
2571 		if (error)
2572 			return (error);
2573 		u_sinfo = &sinfo;
2574 	}
2575 
2576 	rights = CAP_SEND;
2577 	if (uap->tolen) {
2578 		error = getsockaddr(&to, uap->to, uap->tolen);
2579 		if (error) {
2580 			to = NULL;
2581 			goto sctp_bad2;
2582 		}
2583 		rights |= CAP_CONNECT;
2584 	}
2585 
2586 	AUDIT_ARG_FD(uap->sd);
2587 	error = getsock_cap(td->td_proc->p_fd, uap->sd, rights, &fp, NULL);
2588 	if (error)
2589 		goto sctp_bad;
2590 #ifdef KTRACE
2591 	if (to && (KTRPOINT(td, KTR_STRUCT)))
2592 		ktrsockaddr(to);
2593 #endif
2594 
2595 	iov[0].iov_base = uap->msg;
2596 	iov[0].iov_len = uap->mlen;
2597 
2598 	so = (struct socket *)fp->f_data;
2599 	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
2600 		error = EOPNOTSUPP;
2601 		goto sctp_bad;
2602 	}
2603 #ifdef MAC
2604 	error = mac_socket_check_send(td->td_ucred, so);
2605 	if (error)
2606 		goto sctp_bad;
2607 #endif /* MAC */
2608 
2609 	auio.uio_iov =  iov;
2610 	auio.uio_iovcnt = 1;
2611 	auio.uio_segflg = UIO_USERSPACE;
2612 	auio.uio_rw = UIO_WRITE;
2613 	auio.uio_td = td;
2614 	auio.uio_offset = 0;			/* XXX */
2615 	auio.uio_resid = 0;
2616 	len = auio.uio_resid = uap->mlen;
2617 	CURVNET_SET(so->so_vnet);
2618 	error = sctp_lower_sosend(so, to, &auio,
2619 		    (struct mbuf *)NULL, (struct mbuf *)NULL,
2620 		    uap->flags, u_sinfo, td);
2621 	CURVNET_RESTORE();
2622 	if (error) {
2623 		if (auio.uio_resid != len && (error == ERESTART ||
2624 		    error == EINTR || error == EWOULDBLOCK))
2625 			error = 0;
2626 		/* Generation of SIGPIPE can be controlled per socket. */
2627 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
2628 		    !(uap->flags & MSG_NOSIGNAL)) {
2629 			PROC_LOCK(td->td_proc);
2630 			tdsignal(td, SIGPIPE);
2631 			PROC_UNLOCK(td->td_proc);
2632 		}
2633 	}
2634 	if (error == 0)
2635 		td->td_retval[0] = len - auio.uio_resid;
2636 #ifdef KTRACE
2637 	if (ktruio != NULL) {
2638 		ktruio->uio_resid = td->td_retval[0];
2639 		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
2640 	}
2641 #endif /* KTRACE */
2642 sctp_bad:
2643 	if (fp)
2644 		fdrop(fp, td);
2645 sctp_bad2:
2646 	if (to)
2647 		free(to, M_SONAME);
2648 	return (error);
2649 #else  /* SCTP */
2650 	return (EOPNOTSUPP);
2651 #endif /* SCTP */
2652 }
2653 
2654 int
2655 sys_sctp_generic_sendmsg_iov(td, uap)
2656 	struct thread *td;
2657 	struct sctp_generic_sendmsg_iov_args /* {
2658 		int sd,
2659 		struct iovec *iov,
2660 		int iovlen,
2661 		caddr_t to,
2662 		__socklen_t tolen,
2663 		struct sctp_sndrcvinfo *sinfo,
2664 		int flags
2665 	} */ *uap;
2666 {
2667 #if (defined(INET) || defined(INET6)) && defined(SCTP)
2668 	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
2669 	struct socket *so;
2670 	struct file *fp = NULL;
2671 	int error=0, i;
2672 	ssize_t len;
2673 	struct sockaddr *to = NULL;
2674 #ifdef KTRACE
2675 	struct uio *ktruio = NULL;
2676 #endif
2677 	struct uio auio;
2678 	struct iovec *iov, *tiov;
2679 	cap_rights_t rights;
2680 
2681 	if (uap->sinfo) {
2682 		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
2683 		if (error)
2684 			return (error);
2685 		u_sinfo = &sinfo;
2686 	}
2687 	rights = CAP_SEND;
2688 	if (uap->tolen) {
2689 		error = getsockaddr(&to, uap->to, uap->tolen);
2690 		if (error) {
2691 			to = NULL;
2692 			goto sctp_bad2;
2693 		}
2694 		rights |= CAP_CONNECT;
2695 	}
2696 
2697 	AUDIT_ARG_FD(uap->sd);
2698 	error = getsock_cap(td->td_proc->p_fd, uap->sd, rights, &fp, NULL);
2699 	if (error)
2700 		goto sctp_bad1;
2701 
2702 #ifdef COMPAT_FREEBSD32
2703 	if (SV_CURPROC_FLAG(SV_ILP32))
2704 		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
2705 		    uap->iovlen, &iov, EMSGSIZE);
2706 	else
2707 #endif
2708 		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
2709 	if (error)
2710 		goto sctp_bad1;
2711 #ifdef KTRACE
2712 	if (to && (KTRPOINT(td, KTR_STRUCT)))
2713 		ktrsockaddr(to);
2714 #endif
2715 
2716 	so = (struct socket *)fp->f_data;
2717 	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
2718 		error = EOPNOTSUPP;
2719 		goto sctp_bad;
2720 	}
2721 #ifdef MAC
2722 	error = mac_socket_check_send(td->td_ucred, so);
2723 	if (error)
2724 		goto sctp_bad;
2725 #endif /* MAC */
2726 
2727 	auio.uio_iov = iov;
2728 	auio.uio_iovcnt = uap->iovlen;
2729 	auio.uio_segflg = UIO_USERSPACE;
2730 	auio.uio_rw = UIO_WRITE;
2731 	auio.uio_td = td;
2732 	auio.uio_offset = 0;			/* XXX */
2733 	auio.uio_resid = 0;
2734 	tiov = iov;
2735 	for (i = 0; i <uap->iovlen; i++, tiov++) {
2736 		if ((auio.uio_resid += tiov->iov_len) < 0) {
2737 			error = EINVAL;
2738 			goto sctp_bad;
2739 		}
2740 	}
2741 	len = auio.uio_resid;
2742 	CURVNET_SET(so->so_vnet);
2743 	error = sctp_lower_sosend(so, to, &auio,
2744 		    (struct mbuf *)NULL, (struct mbuf *)NULL,
2745 		    uap->flags, u_sinfo, td);
2746 	CURVNET_RESTORE();
2747 	if (error) {
2748 		if (auio.uio_resid != len && (error == ERESTART ||
2749 		    error == EINTR || error == EWOULDBLOCK))
2750 			error = 0;
2751 		/* Generation of SIGPIPE can be controlled per socket */
2752 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
2753 		    !(uap->flags & MSG_NOSIGNAL)) {
2754 			PROC_LOCK(td->td_proc);
2755 			tdsignal(td, SIGPIPE);
2756 			PROC_UNLOCK(td->td_proc);
2757 		}
2758 	}
2759 	if (error == 0)
2760 		td->td_retval[0] = len - auio.uio_resid;
2761 #ifdef KTRACE
2762 	if (ktruio != NULL) {
2763 		ktruio->uio_resid = td->td_retval[0];
2764 		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
2765 	}
2766 #endif /* KTRACE */
2767 sctp_bad:
2768 	free(iov, M_IOV);
2769 sctp_bad1:
2770 	if (fp)
2771 		fdrop(fp, td);
2772 sctp_bad2:
2773 	if (to)
2774 		free(to, M_SONAME);
2775 	return (error);
2776 #else  /* SCTP */
2777 	return (EOPNOTSUPP);
2778 #endif /* SCTP */
2779 }
2780 
2781 int
2782 sys_sctp_generic_recvmsg(td, uap)
2783 	struct thread *td;
2784 	struct sctp_generic_recvmsg_args /* {
2785 		int sd,
2786 		struct iovec *iov,
2787 		int iovlen,
2788 		struct sockaddr *from,
2789 		__socklen_t *fromlenaddr,
2790 		struct sctp_sndrcvinfo *sinfo,
2791 		int *msg_flags
2792 	} */ *uap;
2793 {
2794 #if (defined(INET) || defined(INET6)) && defined(SCTP)
2795 	uint8_t sockbufstore[256];
2796 	struct uio auio;
2797 	struct iovec *iov, *tiov;
2798 	struct sctp_sndrcvinfo sinfo;
2799 	struct socket *so;
2800 	struct file *fp = NULL;
2801 	struct sockaddr *fromsa;
2802 	int fromlen;
2803 	ssize_t len;
2804 	int i, msg_flags;
2805 	int error = 0;
2806 #ifdef KTRACE
2807 	struct uio *ktruio = NULL;
2808 #endif
2809 
2810 	AUDIT_ARG_FD(uap->sd);
2811 	error = getsock_cap(td->td_proc->p_fd, uap->sd, CAP_RECV, &fp, NULL);
2812 	if (error) {
2813 		return (error);
2814 	}
2815 #ifdef COMPAT_FREEBSD32
2816 	if (SV_CURPROC_FLAG(SV_ILP32))
2817 		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
2818 		    uap->iovlen, &iov, EMSGSIZE);
2819 	else
2820 #endif
2821 		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
2822 	if (error)
2823 		goto out1;
2824 
2825 	so = fp->f_data;
2826 	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
2827 		error = EOPNOTSUPP;
2828 		goto out;
2829 	}
2830 #ifdef MAC
2831 	error = mac_socket_check_receive(td->td_ucred, so);
2832 	if (error) {
2833 		goto out;
2834 	}
2835 #endif /* MAC */
2836 
2837 	if (uap->fromlenaddr) {
2838 		error = copyin(uap->fromlenaddr,
2839 		    &fromlen, sizeof (fromlen));
2840 		if (error) {
2841 			goto out;
2842 		}
2843 	} else {
2844 		fromlen = 0;
2845 	}
2846 	if (uap->msg_flags) {
2847 		error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
2848 		if (error) {
2849 			goto out;
2850 		}
2851 	} else {
2852 		msg_flags = 0;
2853 	}
2854 	auio.uio_iov = iov;
2855 	auio.uio_iovcnt = uap->iovlen;
2856 	auio.uio_segflg = UIO_USERSPACE;
2857 	auio.uio_rw = UIO_READ;
2858 	auio.uio_td = td;
2859 	auio.uio_offset = 0;			/* XXX */
2860 	auio.uio_resid = 0;
2861 	tiov = iov;
2862 	for (i = 0; i <uap->iovlen; i++, tiov++) {
2863 		if ((auio.uio_resid += tiov->iov_len) < 0) {
2864 			error = EINVAL;
2865 			goto out;
2866 		}
2867 	}
2868 	len = auio.uio_resid;
2869 	fromsa = (struct sockaddr *)sockbufstore;
2870 
2871 #ifdef KTRACE
2872 	if (KTRPOINT(td, KTR_GENIO))
2873 		ktruio = cloneuio(&auio);
2874 #endif /* KTRACE */
2875 	memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo));
2876 	CURVNET_SET(so->so_vnet);
2877 	error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
2878 		    fromsa, fromlen, &msg_flags,
2879 		    (struct sctp_sndrcvinfo *)&sinfo, 1);
2880 	CURVNET_RESTORE();
2881 	if (error) {
2882 		if (auio.uio_resid != len && (error == ERESTART ||
2883 		    error == EINTR || error == EWOULDBLOCK))
2884 			error = 0;
2885 	} else {
2886 		if (uap->sinfo)
2887 			error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
2888 	}
2889 #ifdef KTRACE
2890 	if (ktruio != NULL) {
2891 		ktruio->uio_resid = len - auio.uio_resid;
2892 		ktrgenio(uap->sd, UIO_READ, ktruio, error);
2893 	}
2894 #endif /* KTRACE */
2895 	if (error)
2896 		goto out;
2897 	td->td_retval[0] = len - auio.uio_resid;
2898 
2899 	if (fromlen && uap->from) {
2900 		len = fromlen;
2901 		if (len <= 0 || fromsa == 0)
2902 			len = 0;
2903 		else {
2904 			len = MIN(len, fromsa->sa_len);
2905 			error = copyout(fromsa, uap->from, (size_t)len);
2906 			if (error)
2907 				goto out;
2908 		}
2909 		error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
2910 		if (error) {
2911 			goto out;
2912 		}
2913 	}
2914 #ifdef KTRACE
2915 	if (KTRPOINT(td, KTR_STRUCT))
2916 		ktrsockaddr(fromsa);
2917 #endif
2918 	if (uap->msg_flags) {
2919 		error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
2920 		if (error) {
2921 			goto out;
2922 		}
2923 	}
2924 out:
2925 	free(iov, M_IOV);
2926 out1:
2927 	if (fp)
2928 		fdrop(fp, td);
2929 
2930 	return (error);
2931 #else  /* SCTP */
2932 	return (EOPNOTSUPP);
2933 #endif /* SCTP */
2934 }
2935