xref: /freebsd/sys/kern/uipc_syscalls.c (revision d940bfec8c329dd82d8d54efebd81c8aa420503b)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * sendfile(2) and related extensions:
6  * Copyright (c) 1998, David Greenman. All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 4. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
33  */
34 
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 
38 #include "opt_capsicum.h"
39 #include "opt_inet.h"
40 #include "opt_inet6.h"
41 #include "opt_sctp.h"
42 #include "opt_compat.h"
43 #include "opt_ktrace.h"
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/capability.h>
48 #include <sys/kernel.h>
49 #include <sys/lock.h>
50 #include <sys/mutex.h>
51 #include <sys/sysproto.h>
52 #include <sys/malloc.h>
53 #include <sys/filedesc.h>
54 #include <sys/event.h>
55 #include <sys/proc.h>
56 #include <sys/fcntl.h>
57 #include <sys/file.h>
58 #include <sys/filio.h>
59 #include <sys/jail.h>
60 #include <sys/mount.h>
61 #include <sys/mbuf.h>
62 #include <sys/protosw.h>
63 #include <sys/rwlock.h>
64 #include <sys/sf_buf.h>
65 #include <sys/sysent.h>
66 #include <sys/socket.h>
67 #include <sys/socketvar.h>
68 #include <sys/signalvar.h>
69 #include <sys/syscallsubr.h>
70 #include <sys/sysctl.h>
71 #include <sys/uio.h>
72 #include <sys/vnode.h>
73 #ifdef KTRACE
74 #include <sys/ktrace.h>
75 #endif
76 #ifdef COMPAT_FREEBSD32
77 #include <compat/freebsd32/freebsd32_util.h>
78 #endif
79 
80 #include <net/vnet.h>
81 
82 #include <security/audit/audit.h>
83 #include <security/mac/mac_framework.h>
84 
85 #include <vm/vm.h>
86 #include <vm/vm_param.h>
87 #include <vm/vm_object.h>
88 #include <vm/vm_page.h>
89 #include <vm/vm_pageout.h>
90 #include <vm/vm_kern.h>
91 #include <vm/vm_extern.h>
92 
93 #if defined(INET) || defined(INET6)
94 #ifdef SCTP
95 #include <netinet/sctp.h>
96 #include <netinet/sctp_peeloff.h>
97 #endif /* SCTP */
98 #endif /* INET || INET6 */
99 
/*
 * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC
 * and SOCK_NONBLOCK.
 *
 * ACCEPT4_INHERIT: the accepted socket takes over the listening socket's
 * non-blocking state and SIGIO owner instead of honouring SOCK_NONBLOCK.
 * ACCEPT4_COMPAT: accept1() rewrites the returned address through a
 * struct osockaddr view before copying it out (COMPAT_OLDSOCK only).
 */
#define	ACCEPT4_INHERIT	0x1
#define	ACCEPT4_COMPAT	0x2

/* Userland-facing wrappers around kern_sendit() and kern_recvit(). */
static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);

/* Common back ends shared by the native and compat system call entries. */
static int accept1(struct thread *td, int s, struct sockaddr *uname,
		   socklen_t *anamelen, int flags);
static int do_sendfile(struct thread *td, struct sendfile_args *uap,
		   int compat);
static int getsockname1(struct thread *td, struct getsockname_args *uap,
			int compat);
static int getpeername1(struct thread *td, struct getpeername_args *uap,
			int compat);
118 
/*
 * One counter per uint64_t-sized slot of struct sfstat; allocated by
 * sfstat_init() and exported by sfstat_sysctl().
 */
counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];

/*
 * sendfile(2)-related variables and associated sysctls
 */
int nsfbufs;			/* kern.ipc.nsfbufs (read-only tunable) */
int nsfbufspeak;		/* kern.ipc.nsfbufspeak (read-only) */
int nsfbufsused;		/* kern.ipc.nsfbufsused (read-only) */
static int sfreadahead = 1;	/* kern.ipc.sfreadahead (read-write) */

SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
    "Maximum number of sendfile(2) sf_bufs available");
SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
    "Number of sendfile(2) sf_bufs at peak usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
    "Number of sendfile(2) sf_bufs in use");
SYSCTL_INT(_kern_ipc, OID_AUTO, sfreadahead, CTLFLAG_RW, &sfreadahead, 0,
    "Number of sendfile(2) read-ahead MAXBSIZE blocks");
137 
138 
139 static void
140 sfstat_init(const void *unused)
141 {
142 
143 	COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
144 	    M_WAITOK);
145 }
146 SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);
147 
148 static int
149 sfstat_sysctl(SYSCTL_HANDLER_ARGS)
150 {
151 	struct sfstat s;
152 
153 	COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
154 	if (req->newptr)
155 		COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
156 	return (SYSCTL_OUT(req, &s, sizeof(s)));
157 }
158 SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
159     NULL, 0, sfstat_sysctl, "I", "sendfile statistics");
160 
161 /*
162  * Convert a user file descriptor to a kernel file entry and check if required
163  * capability rights are present.
164  * A reference on the file entry is held upon returning.
165  */
166 static int
167 getsock_cap(struct filedesc *fdp, int fd, cap_rights_t rights,
168     struct file **fpp, u_int *fflagp)
169 {
170 	struct file *fp;
171 	int error;
172 
173 	error = fget_unlocked(fdp, fd, rights, 0, &fp, NULL);
174 	if (error != 0)
175 		return (error);
176 	if (fp->f_type != DTYPE_SOCKET) {
177 		fdrop(fp, curthread);
178 		return (ENOTSOCK);
179 	}
180 	if (fflagp != NULL)
181 		*fflagp = fp->f_flag;
182 	*fpp = fp;
183 	return (0);
184 }
185 
/*
 * System call interface to the socket abstraction.
 *
 * COMPAT_OLDSOCK is switched on by COMPAT_43.  In this file it guards the
 * old-style entry points (oaccept, osend, osendmsg) and the MSG_COMPAT /
 * struct osockaddr handling.
 */
#if defined(COMPAT_43)
#define COMPAT_OLDSOCK
#endif
192 
193 int
194 sys_socket(td, uap)
195 	struct thread *td;
196 	struct socket_args /* {
197 		int	domain;
198 		int	type;
199 		int	protocol;
200 	} */ *uap;
201 {
202 	struct socket *so;
203 	struct file *fp;
204 	int fd, error, type, oflag, fflag;
205 
206 	AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);
207 
208 	type = uap->type;
209 	oflag = 0;
210 	fflag = 0;
211 	if ((type & SOCK_CLOEXEC) != 0) {
212 		type &= ~SOCK_CLOEXEC;
213 		oflag |= O_CLOEXEC;
214 	}
215 	if ((type & SOCK_NONBLOCK) != 0) {
216 		type &= ~SOCK_NONBLOCK;
217 		fflag |= FNONBLOCK;
218 	}
219 
220 #ifdef MAC
221 	error = mac_socket_check_create(td->td_ucred, uap->domain, type,
222 	    uap->protocol);
223 	if (error)
224 		return (error);
225 #endif
226 	error = falloc(td, &fp, &fd, oflag);
227 	if (error)
228 		return (error);
229 	/* An extra reference on `fp' has been held for us by falloc(). */
230 	error = socreate(uap->domain, &so, type, uap->protocol,
231 	    td->td_ucred, td);
232 	if (error) {
233 		fdclose(td->td_proc->p_fd, fp, fd, td);
234 	} else {
235 		finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
236 		if ((fflag & FNONBLOCK) != 0)
237 			(void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
238 		td->td_retval[0] = fd;
239 	}
240 	fdrop(fp, td);
241 	return (error);
242 }
243 
244 /* ARGSUSED */
245 int
246 sys_bind(td, uap)
247 	struct thread *td;
248 	struct bind_args /* {
249 		int	s;
250 		caddr_t	name;
251 		int	namelen;
252 	} */ *uap;
253 {
254 	struct sockaddr *sa;
255 	int error;
256 
257 	error = getsockaddr(&sa, uap->name, uap->namelen);
258 	if (error == 0) {
259 		error = kern_bind(td, uap->s, sa);
260 		free(sa, M_SONAME);
261 	}
262 	return (error);
263 }
264 
265 static int
266 kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
267 {
268 	struct socket *so;
269 	struct file *fp;
270 	int error;
271 
272 	AUDIT_ARG_FD(fd);
273 	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
274 	error = getsock_cap(td->td_proc->p_fd, fd, CAP_BIND, &fp, NULL);
275 	if (error)
276 		return (error);
277 	so = fp->f_data;
278 #ifdef KTRACE
279 	if (KTRPOINT(td, KTR_STRUCT))
280 		ktrsockaddr(sa);
281 #endif
282 #ifdef MAC
283 	error = mac_socket_check_bind(td->td_ucred, so, sa);
284 	if (error == 0) {
285 #endif
286 		if (dirfd == AT_FDCWD)
287 			error = sobind(so, sa, td);
288 		else
289 			error = sobindat(dirfd, so, sa, td);
290 #ifdef MAC
291 	}
292 #endif
293 	fdrop(fp, td);
294 	return (error);
295 }
296 
297 int
298 kern_bind(struct thread *td, int fd, struct sockaddr *sa)
299 {
300 
301 	return (kern_bindat(td, AT_FDCWD, fd, sa));
302 }
303 
304 /* ARGSUSED */
305 int
306 sys_bindat(td, uap)
307 	struct thread *td;
308 	struct bindat_args /* {
309 		int	fd;
310 		int	s;
311 		caddr_t	name;
312 		int	namelen;
313 	} */ *uap;
314 {
315 	struct sockaddr *sa;
316 	int error;
317 
318 	error = getsockaddr(&sa, uap->name, uap->namelen);
319 	if (error == 0) {
320 		error = kern_bindat(td, uap->fd, uap->s, sa);
321 		free(sa, M_SONAME);
322 	}
323 	return (error);
324 }
325 
326 /* ARGSUSED */
327 int
328 sys_listen(td, uap)
329 	struct thread *td;
330 	struct listen_args /* {
331 		int	s;
332 		int	backlog;
333 	} */ *uap;
334 {
335 	struct socket *so;
336 	struct file *fp;
337 	int error;
338 
339 	AUDIT_ARG_FD(uap->s);
340 	error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_LISTEN, &fp, NULL);
341 	if (error == 0) {
342 		so = fp->f_data;
343 #ifdef MAC
344 		error = mac_socket_check_listen(td->td_ucred, so);
345 		if (error == 0)
346 #endif
347 			error = solisten(so, uap->backlog, td);
348 		fdrop(fp, td);
349 	}
350 	return(error);
351 }
352 
353 /*
354  * accept1()
355  */
356 static int
357 accept1(td, s, uname, anamelen, flags)
358 	struct thread *td;
359 	int s;
360 	struct sockaddr *uname;
361 	socklen_t *anamelen;
362 	int flags;
363 {
364 	struct sockaddr *name;
365 	socklen_t namelen;
366 	struct file *fp;
367 	int error;
368 
369 	if (uname == NULL)
370 		return (kern_accept4(td, s, NULL, NULL, flags, NULL));
371 
372 	error = copyin(anamelen, &namelen, sizeof (namelen));
373 	if (error)
374 		return (error);
375 
376 	error = kern_accept4(td, s, &name, &namelen, flags, &fp);
377 
378 	/*
379 	 * return a namelen of zero for older code which might
380 	 * ignore the return value from accept.
381 	 */
382 	if (error) {
383 		(void) copyout(&namelen, anamelen, sizeof(*anamelen));
384 		return (error);
385 	}
386 
387 	if (error == 0 && uname != NULL) {
388 #ifdef COMPAT_OLDSOCK
389 		if (flags & ACCEPT4_COMPAT)
390 			((struct osockaddr *)name)->sa_family =
391 			    name->sa_family;
392 #endif
393 		error = copyout(name, uname, namelen);
394 	}
395 	if (error == 0)
396 		error = copyout(&namelen, anamelen,
397 		    sizeof(namelen));
398 	if (error)
399 		fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
400 	fdrop(fp, td);
401 	free(name, M_SONAME);
402 	return (error);
403 }
404 
405 int
406 kern_accept(struct thread *td, int s, struct sockaddr **name,
407     socklen_t *namelen, struct file **fp)
408 {
409 	return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
410 }
411 
/*
 * Take one completed connection off listening socket descriptor `s' and
 * install it in a new descriptor.
 *
 * td		calling thread; on success td->td_retval[0] is the new fd.
 * s		listening socket descriptor (needs CAP_ACCEPT).
 * name		if not NULL, *name receives the peer address (released by
 *		callers with free(..., M_SONAME)) or NULL when there is none.
 * namelen	in: caller's buffer size; out: clamped to the address
 *		length, or 0 when no address is returned.  Only touched when
 *		`name' is not NULL.
 * flags	ACCEPT4_INHERIT, SOCK_CLOEXEC, SOCK_NONBLOCK.
 * fp		if not NULL, *fp receives a referenced struct file for the
 *		new descriptor on success (caller must fdrop() it) and NULL
 *		on failure.
 *
 * Returns 0 or an errno value.  On failure after the descriptor was
 * allocated, the descriptor is closed again.
 */
int
kern_accept4(struct thread *td, int s, struct sockaddr **name,
    socklen_t *namelen, int flags, struct file **fp)
{
	struct filedesc *fdp;
	struct file *headfp, *nfp = NULL;
	struct sockaddr *sa = NULL;
	int error;
	struct socket *head, *so;
	int fd;
	u_int fflag;
	pid_t pgid;
	int tmp;

	if (name)
		*name = NULL;

	AUDIT_ARG_FD(s);
	fdp = td->td_proc->p_fd;
	/* fflag starts out as the listening file's f_flag. */
	error = getsock_cap(fdp, s, CAP_ACCEPT, &headfp, &fflag);
	if (error)
		return (error);
	head = headfp->f_data;
	/* Only sockets that have been put into listening state qualify. */
	if ((head->so_options & SO_ACCEPTCONN) == 0) {
		error = EINVAL;
		goto done;
	}
#ifdef MAC
	error = mac_socket_check_accept(td->td_ucred, head);
	if (error != 0)
		goto done;
#endif
	/* Reserve the descriptor up front, before sleeping for a connection. */
	error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0);
	if (error)
		goto done;
	ACCEPT_LOCK();
	/* Non-blocking listener with nothing queued: do not sleep. */
	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
		ACCEPT_UNLOCK();
		error = EWOULDBLOCK;
		goto noconnection;
	}
	/* Sleep until a connection completes or an error is posted. */
	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
			head->so_error = ECONNABORTED;
			break;
		}
		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
		    "accept", 0);
		if (error) {
			ACCEPT_UNLOCK();
			goto noconnection;
		}
	}
	/* A pending socket error is reported once and then cleared. */
	if (head->so_error) {
		error = head->so_error;
		head->so_error = 0;
		ACCEPT_UNLOCK();
		goto noconnection;
	}
	so = TAILQ_FIRST(&head->so_comp);
	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));

	/*
	 * Before changing the flags on the socket, we have to bump the
	 * reference count.  Otherwise, if the protocol calls sofree(),
	 * the socket will be released due to a zero refcount.
	 */
	SOCK_LOCK(so);			/* soref() and so_state update */
	soref(so);			/* file descriptor reference */

	/* Detach the connection from the listener's completed queue. */
	TAILQ_REMOVE(&head->so_comp, so, so_list);
	head->so_qlen--;
	/* Non-blocking state: inherited from the listener or set from flags. */
	if (flags & ACCEPT4_INHERIT)
		so->so_state |= (head->so_state & SS_NBIO);
	else
		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
	so->so_qstate &= ~SQ_COMP;
	so->so_head = NULL;

	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();

	/* An extra reference on `nfp' has been held for us by falloc(). */
	td->td_retval[0] = fd;

	/* connection has been removed from the listen queue */
	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);

	if (flags & ACCEPT4_INHERIT) {
		/* Hand the listener's SIGIO owner on to the new socket. */
		pgid = fgetown(&head->so_sigio);
		if (pgid != 0)
			fsetown(pgid, &so->so_sigio);
	} else {
		/* Do not inherit; only SOCK_NONBLOCK can turn FNONBLOCK on. */
		fflag &= ~(FNONBLOCK | FASYNC);
		if (flags & SOCK_NONBLOCK)
			fflag |= FNONBLOCK;
	}

	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
	/* Sync socket nonblocking/async state with file flags */
	tmp = fflag & FNONBLOCK;
	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
	tmp = fflag & FASYNC;
	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
	sa = 0;
	error = soaccept(so, &sa);
	if (error) {
		/*
		 * return a namelen of zero for older code which might
		 * ignore the return value from accept.
		 */
		if (name)
			*namelen = 0;
		goto noconnection;
	}
	/*
	 * Success without a peer address: report a zero length.  This skips
	 * the noconnection cleanup, which has nothing to do here (sa is
	 * NULL and error is 0).
	 */
	if (sa == NULL) {
		if (name)
			*namelen = 0;
		goto done;
	}
	AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
	if (name) {
		/* check sa_len before it is destroyed */
		if (*namelen > sa->sa_len)
			*namelen = sa->sa_len;
#ifdef KTRACE
		if (KTRPOINT(td, KTR_STRUCT))
			ktrsockaddr(sa);
#endif
		/* Ownership of the address moves to the caller. */
		*name = sa;
		sa = NULL;
	}
noconnection:
	/* Address not handed to the caller (or never wanted): release it. */
	if (sa)
		free(sa, M_SONAME);

	/*
	 * close the new descriptor, assuming someone hasn't ripped it
	 * out from under us.
	 */
	if (error)
		fdclose(fdp, nfp, fd, td);

	/*
	 * Release explicitly held references before returning.  We return
	 * a reference on nfp to the caller on success if they request it.
	 */
done:
	if (fp != NULL) {
		if (error == 0) {
			*fp = nfp;
			nfp = NULL;
		} else
			*fp = NULL;
	}
	if (nfp != NULL)
		fdrop(nfp, td);
	fdrop(headfp, td);
	return (error);
}
573 
574 int
575 sys_accept(td, uap)
576 	struct thread *td;
577 	struct accept_args *uap;
578 {
579 
580 	return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
581 }
582 
583 int
584 sys_accept4(td, uap)
585 	struct thread *td;
586 	struct accept4_args *uap;
587 {
588 	if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
589 		return (EINVAL);
590 
591 	return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
592 }
593 
594 #ifdef COMPAT_OLDSOCK
595 int
596 oaccept(td, uap)
597 	struct thread *td;
598 	struct accept_args *uap;
599 {
600 
601 	return (accept1(td, uap->s, uap->name, uap->anamelen,
602 	    ACCEPT4_INHERIT | ACCEPT4_COMPAT));
603 }
604 #endif /* COMPAT_OLDSOCK */
605 
606 /* ARGSUSED */
607 int
608 sys_connect(td, uap)
609 	struct thread *td;
610 	struct connect_args /* {
611 		int	s;
612 		caddr_t	name;
613 		int	namelen;
614 	} */ *uap;
615 {
616 	struct sockaddr *sa;
617 	int error;
618 
619 	error = getsockaddr(&sa, uap->name, uap->namelen);
620 	if (error == 0) {
621 		error = kern_connect(td, uap->s, sa);
622 		free(sa, M_SONAME);
623 	}
624 	return (error);
625 }
626 
/*
 * Connect the socket behind descriptor `fd' to the kernel-space address
 * `sa'.  With dirfd == AT_FDCWD this goes through soconnect(); any other
 * value is passed to soconnectat().  The descriptor needs CAP_CONNECT.
 *
 * Returns 0 or an errno value, notably:
 *   EALREADY     a connect is already in progress on the socket;
 *   EINPROGRESS  the socket is non-blocking and the connect is pending;
 *   EINTR        the wait was interrupted (ERESTART is reported as EINTR).
 */
static int
kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
{
	struct socket *so;
	struct file *fp;
	int error;
	int interrupted = 0;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
	error = getsock_cap(td->td_proc->p_fd, fd, CAP_CONNECT, &fp, NULL);
	if (error)
		return (error);
	so = fp->f_data;
	/* An earlier connect is still pending; SS_ISCONNECTING is left as is. */
	if (so->so_state & SS_ISCONNECTING) {
		error = EALREADY;
		goto done1;
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(sa);
#endif
#ifdef MAC
	error = mac_socket_check_connect(td->td_ucred, so, sa);
	if (error)
		goto bad;
#endif
	if (dirfd == AT_FDCWD)
		error = soconnect(so, sa, td);
	else
		error = soconnectat(dirfd, so, sa, td);
	if (error)
		goto bad;
	/* Non-blocking socket: report the pending connect without waiting. */
	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
		error = EINPROGRESS;
		goto done1;
	}
	SOCK_LOCK(so);
	/* Wait until the connect finishes or an error is posted. */
	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
		    "connec", 0);
		if (error) {
			if (error == EINTR || error == ERESTART)
				interrupted = 1;
			break;
		}
	}
	/* The wait ended normally: collect and clear the socket error. */
	if (error == 0) {
		error = so->so_error;
		so->so_error = 0;
	}
	SOCK_UNLOCK(so);
bad:
	/*
	 * After an interrupted wait SS_ISCONNECTING is left set, so a later
	 * connect on this socket takes the EALREADY path above.
	 */
	if (!interrupted)
		so->so_state &= ~SS_ISCONNECTING;
	if (error == ERESTART)
		error = EINTR;
done1:
	fdrop(fp, td);
	return (error);
}
688 
689 int
690 kern_connect(struct thread *td, int fd, struct sockaddr *sa)
691 {
692 
693 	return (kern_connectat(td, AT_FDCWD, fd, sa));
694 }
695 
696 /* ARGSUSED */
697 int
698 sys_connectat(td, uap)
699 	struct thread *td;
700 	struct connectat_args /* {
701 		int	fd;
702 		int	s;
703 		caddr_t	name;
704 		int	namelen;
705 	} */ *uap;
706 {
707 	struct sockaddr *sa;
708 	int error;
709 
710 	error = getsockaddr(&sa, uap->name, uap->namelen);
711 	if (error == 0) {
712 		error = kern_connectat(td, uap->fd, uap->s, sa);
713 		free(sa, M_SONAME);
714 	}
715 	return (error);
716 }
717 
/*
 * Create two sockets of the given domain/type/protocol, connect them to
 * each other and install each in a new descriptor.
 *
 * `type' may carry SOCK_CLOEXEC and/or SOCK_NONBLOCK; both are stripped
 * before the sockets are created and applied to both descriptors.
 * rsv[0] and rsv[1] (kernel memory) receive the descriptors.  They are
 * written as soon as each descriptor is allocated, so on failure they may
 * hold numbers of descriptors that have already been closed again.
 *
 * Returns 0 or an errno value; on failure everything created so far is
 * torn down in reverse order through the free4..free1 labels.
 */
int
kern_socketpair(struct thread *td, int domain, int type, int protocol,
    int *rsv)
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *fp1, *fp2;
	struct socket *so1, *so2;
	int fd, error, oflag, fflag;

	AUDIT_ARG_SOCKET(domain, type, protocol);

	/* Split the modifier bits off the socket type. */
	oflag = 0;
	fflag = 0;
	if ((type & SOCK_CLOEXEC) != 0) {
		type &= ~SOCK_CLOEXEC;
		oflag |= O_CLOEXEC;
	}
	if ((type & SOCK_NONBLOCK) != 0) {
		type &= ~SOCK_NONBLOCK;
		fflag |= FNONBLOCK;
	}
#ifdef MAC
	/* We might want to have a separate check for socket pairs. */
	error = mac_socket_check_create(td->td_ucred, domain, type,
	    protocol);
	if (error)
		return (error);
#endif
	error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
	if (error)
		return (error);
	error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
	if (error)
		goto free1;
	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
	error = falloc(td, &fp1, &fd, oflag);
	if (error)
		goto free2;
	rsv[0] = fd;
	fp1->f_data = so1;	/* so1 already has ref count */
	error = falloc(td, &fp2, &fd, oflag);
	if (error)
		goto free3;
	fp2->f_data = so2;	/* so2 already has ref count */
	rsv[1] = fd;
	error = soconnect2(so1, so2);
	if (error)
		goto free4;
	if (type == SOCK_DGRAM) {
		/*
		 * Datagram socket connection is asymmetric.
		 */
		 error = soconnect2(so2, so1);
		 if (error)
			goto free4;
	}
	/* Both ends are connected: turn the files into real socket files. */
	finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
	    &socketops);
	finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
	    &socketops);
	if ((fflag & FNONBLOCK) != 0) {
		(void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
		(void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
	}
	/* Drop the extra references taken by falloc(). */
	fdrop(fp1, td);
	fdrop(fp2, td);
	return (0);
	/*
	 * Error unwinding.  finit() has not run on these paths, so the
	 * sockets are closed explicitly below rather than through the
	 * files.
	 */
free4:
	fdclose(fdp, fp2, rsv[1], td);
	fdrop(fp2, td);
free3:
	fdclose(fdp, fp1, rsv[0], td);
	fdrop(fp1, td);
free2:
	if (so2 != NULL)
		(void)soclose(so2);
free1:
	if (so1 != NULL)
		(void)soclose(so1);
	return (error);
}
799 
800 int
801 sys_socketpair(struct thread *td, struct socketpair_args *uap)
802 {
803 	int error, sv[2];
804 
805 	error = kern_socketpair(td, uap->domain, uap->type,
806 	    uap->protocol, sv);
807 	if (error)
808 		return (error);
809 	error = copyout(sv, uap->rsv, 2 * sizeof(int));
810 	if (error) {
811 		(void)kern_close(td, sv[0]);
812 		(void)kern_close(td, sv[1]);
813 	}
814 	return (error);
815 }
816 
817 static int
818 sendit(td, s, mp, flags)
819 	struct thread *td;
820 	int s;
821 	struct msghdr *mp;
822 	int flags;
823 {
824 	struct mbuf *control;
825 	struct sockaddr *to;
826 	int error;
827 
828 #ifdef CAPABILITY_MODE
829 	if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
830 		return (ECAPMODE);
831 #endif
832 
833 	if (mp->msg_name != NULL) {
834 		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
835 		if (error) {
836 			to = NULL;
837 			goto bad;
838 		}
839 		mp->msg_name = to;
840 	} else {
841 		to = NULL;
842 	}
843 
844 	if (mp->msg_control) {
845 		if (mp->msg_controllen < sizeof(struct cmsghdr)
846 #ifdef COMPAT_OLDSOCK
847 		    && mp->msg_flags != MSG_COMPAT
848 #endif
849 		) {
850 			error = EINVAL;
851 			goto bad;
852 		}
853 		error = sockargs(&control, mp->msg_control,
854 		    mp->msg_controllen, MT_CONTROL);
855 		if (error)
856 			goto bad;
857 #ifdef COMPAT_OLDSOCK
858 		if (mp->msg_flags == MSG_COMPAT) {
859 			struct cmsghdr *cm;
860 
861 			M_PREPEND(control, sizeof(*cm), M_WAITOK);
862 			cm = mtod(control, struct cmsghdr *);
863 			cm->cmsg_len = control->m_len;
864 			cm->cmsg_level = SOL_SOCKET;
865 			cm->cmsg_type = SCM_RIGHTS;
866 		}
867 #endif
868 	} else {
869 		control = NULL;
870 	}
871 
872 	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
873 
874 bad:
875 	if (to)
876 		free(to, M_SONAME);
877 	return (error);
878 }
879 
880 int
881 kern_sendit(td, s, mp, flags, control, segflg)
882 	struct thread *td;
883 	int s;
884 	struct msghdr *mp;
885 	int flags;
886 	struct mbuf *control;
887 	enum uio_seg segflg;
888 {
889 	struct file *fp;
890 	struct uio auio;
891 	struct iovec *iov;
892 	struct socket *so;
893 	int i, error;
894 	ssize_t len;
895 	cap_rights_t rights;
896 #ifdef KTRACE
897 	struct uio *ktruio = NULL;
898 #endif
899 
900 	AUDIT_ARG_FD(s);
901 	rights = CAP_SEND;
902 	if (mp->msg_name != NULL) {
903 		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
904 		rights |= CAP_CONNECT;
905 	}
906 	error = getsock_cap(td->td_proc->p_fd, s, rights, &fp, NULL);
907 	if (error)
908 		return (error);
909 	so = (struct socket *)fp->f_data;
910 
911 #ifdef KTRACE
912 	if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
913 		ktrsockaddr(mp->msg_name);
914 #endif
915 #ifdef MAC
916 	if (mp->msg_name != NULL) {
917 		error = mac_socket_check_connect(td->td_ucred, so,
918 		    mp->msg_name);
919 		if (error)
920 			goto bad;
921 	}
922 	error = mac_socket_check_send(td->td_ucred, so);
923 	if (error)
924 		goto bad;
925 #endif
926 
927 	auio.uio_iov = mp->msg_iov;
928 	auio.uio_iovcnt = mp->msg_iovlen;
929 	auio.uio_segflg = segflg;
930 	auio.uio_rw = UIO_WRITE;
931 	auio.uio_td = td;
932 	auio.uio_offset = 0;			/* XXX */
933 	auio.uio_resid = 0;
934 	iov = mp->msg_iov;
935 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
936 		if ((auio.uio_resid += iov->iov_len) < 0) {
937 			error = EINVAL;
938 			goto bad;
939 		}
940 	}
941 #ifdef KTRACE
942 	if (KTRPOINT(td, KTR_GENIO))
943 		ktruio = cloneuio(&auio);
944 #endif
945 	len = auio.uio_resid;
946 	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
947 	if (error) {
948 		if (auio.uio_resid != len && (error == ERESTART ||
949 		    error == EINTR || error == EWOULDBLOCK))
950 			error = 0;
951 		/* Generation of SIGPIPE can be controlled per socket */
952 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
953 		    !(flags & MSG_NOSIGNAL)) {
954 			PROC_LOCK(td->td_proc);
955 			tdsignal(td, SIGPIPE);
956 			PROC_UNLOCK(td->td_proc);
957 		}
958 	}
959 	if (error == 0)
960 		td->td_retval[0] = len - auio.uio_resid;
961 #ifdef KTRACE
962 	if (ktruio != NULL) {
963 		ktruio->uio_resid = td->td_retval[0];
964 		ktrgenio(s, UIO_WRITE, ktruio, error);
965 	}
966 #endif
967 bad:
968 	fdrop(fp, td);
969 	return (error);
970 }
971 
972 int
973 sys_sendto(td, uap)
974 	struct thread *td;
975 	struct sendto_args /* {
976 		int	s;
977 		caddr_t	buf;
978 		size_t	len;
979 		int	flags;
980 		caddr_t	to;
981 		int	tolen;
982 	} */ *uap;
983 {
984 	struct msghdr msg;
985 	struct iovec aiov;
986 	int error;
987 
988 	msg.msg_name = uap->to;
989 	msg.msg_namelen = uap->tolen;
990 	msg.msg_iov = &aiov;
991 	msg.msg_iovlen = 1;
992 	msg.msg_control = 0;
993 #ifdef COMPAT_OLDSOCK
994 	msg.msg_flags = 0;
995 #endif
996 	aiov.iov_base = uap->buf;
997 	aiov.iov_len = uap->len;
998 	error = sendit(td, uap->s, &msg, uap->flags);
999 	return (error);
1000 }
1001 
1002 #ifdef COMPAT_OLDSOCK
1003 int
1004 osend(td, uap)
1005 	struct thread *td;
1006 	struct osend_args /* {
1007 		int	s;
1008 		caddr_t	buf;
1009 		int	len;
1010 		int	flags;
1011 	} */ *uap;
1012 {
1013 	struct msghdr msg;
1014 	struct iovec aiov;
1015 	int error;
1016 
1017 	msg.msg_name = 0;
1018 	msg.msg_namelen = 0;
1019 	msg.msg_iov = &aiov;
1020 	msg.msg_iovlen = 1;
1021 	aiov.iov_base = uap->buf;
1022 	aiov.iov_len = uap->len;
1023 	msg.msg_control = 0;
1024 	msg.msg_flags = 0;
1025 	error = sendit(td, uap->s, &msg, uap->flags);
1026 	return (error);
1027 }
1028 
1029 int
1030 osendmsg(td, uap)
1031 	struct thread *td;
1032 	struct osendmsg_args /* {
1033 		int	s;
1034 		caddr_t	msg;
1035 		int	flags;
1036 	} */ *uap;
1037 {
1038 	struct msghdr msg;
1039 	struct iovec *iov;
1040 	int error;
1041 
1042 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1043 	if (error)
1044 		return (error);
1045 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1046 	if (error)
1047 		return (error);
1048 	msg.msg_iov = iov;
1049 	msg.msg_flags = MSG_COMPAT;
1050 	error = sendit(td, uap->s, &msg, uap->flags);
1051 	free(iov, M_IOV);
1052 	return (error);
1053 }
1054 #endif
1055 
1056 int
1057 sys_sendmsg(td, uap)
1058 	struct thread *td;
1059 	struct sendmsg_args /* {
1060 		int	s;
1061 		caddr_t	msg;
1062 		int	flags;
1063 	} */ *uap;
1064 {
1065 	struct msghdr msg;
1066 	struct iovec *iov;
1067 	int error;
1068 
1069 	error = copyin(uap->msg, &msg, sizeof (msg));
1070 	if (error)
1071 		return (error);
1072 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1073 	if (error)
1074 		return (error);
1075 	msg.msg_iov = iov;
1076 #ifdef COMPAT_OLDSOCK
1077 	msg.msg_flags = 0;
1078 #endif
1079 	error = sendit(td, uap->s, &msg, uap->flags);
1080 	free(iov, M_IOV);
1081 	return (error);
1082 }
1083 
/*
 * Receive a message on socket descriptor `s' into the buffers described
 * by `mp'.
 *
 * td		calling thread; on success td->td_retval[0] is the number
 *		of data bytes received.
 * s		socket descriptor (needs CAP_RECV).
 * mp		message header.  msg_iov is always treated as user-space
 *		memory.  msg_flags is passed to soreceive() by address.
 *		msg_namelen and msg_controllen are updated to what was
 *		actually stored.
 * fromseg	where mp->msg_name lives: UIO_USERSPACE uses copyout(),
 *		anything else bcopy().
 * controlp	if not NULL, control data is not copied to mp->msg_control;
 *		instead *controlp receives the mbuf chain on success (NULL
 *		otherwise).
 *
 * Returns 0 or an errno value.  A partial transfer cut short by ERESTART,
 * EINTR or EWOULDBLOCK is reported as success.
 */
int
kern_recvit(td, s, mp, fromseg, controlp)
	struct thread *td;
	int s;
	struct msghdr *mp;
	enum uio_seg fromseg;
	struct mbuf **controlp;
{
	struct uio auio;
	struct iovec *iov;
	int i;
	ssize_t len;
	int error;
	struct mbuf *m, *control = NULL;
	caddr_t ctlbuf;
	struct file *fp;
	struct socket *so;
	struct sockaddr *fromsa = NULL;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	if (controlp != NULL)
		*controlp = NULL;

	AUDIT_ARG_FD(s);
	error = getsock_cap(td->td_proc->p_fd, s, CAP_RECV, &fp, NULL);
	if (error)
		return (error);
	so = fp->f_data;

#ifdef MAC
	error = mac_socket_check_receive(td->td_ucred, so);
	if (error) {
		fdrop(fp, td);
		return (error);
	}
#endif

	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	/* Total the iovec lengths, rejecting a sum that goes negative. */
	iov = mp->msg_iov;
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			fdrop(fp, td);
			return (EINVAL);
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	len = auio.uio_resid;
	/* Control data is only requested when somebody will consume it. */
	error = soreceive(so, &fromsa, &auio, NULL,
	    (mp->msg_control || controlp) ? &control : NULL,
	    &mp->msg_flags);
	if (error) {
		/* Some data arrived before the interruption: short read. */
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	if (fromsa != NULL)
		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = len - auio.uio_resid;
		ktrgenio(s, UIO_READ, ktruio, error);
	}
#endif
	if (error)
		goto out;
	td->td_retval[0] = len - auio.uio_resid;
	/* Return the sender's address, truncated to the caller's buffer. */
	if (mp->msg_name) {
		len = mp->msg_namelen;
		if (len <= 0 || fromsa == NULL)
			len = 0;
		else {
			/* save sa_len before it is destroyed by MSG_COMPAT */
			len = MIN(len, fromsa->sa_len);
#ifdef COMPAT_OLDSOCK
			if (mp->msg_flags & MSG_COMPAT)
				((struct osockaddr *)fromsa)->sa_family =
				    fromsa->sa_family;
#endif
			if (fromseg == UIO_USERSPACE) {
				error = copyout(fromsa, mp->msg_name,
				    (unsigned)len);
				if (error)
					goto out;
			} else
				bcopy(fromsa, mp->msg_name, len);
		}
		mp->msg_namelen = len;
	}
	/* Copy control data out only if the caller did not ask for the chain. */
	if (mp->msg_control && controlp == NULL) {
#ifdef COMPAT_OLDSOCK
		/*
		 * We assume that old recvmsg calls won't receive access
		 * rights and other control info, esp. as control info
		 * is always optional and those options didn't exist in 4.3.
		 * If we receive rights, trim the cmsghdr; anything else
		 * is tossed.
		 */
		if (control && mp->msg_flags & MSG_COMPAT) {
			if (mtod(control, struct cmsghdr *)->cmsg_level !=
			    SOL_SOCKET ||
			    mtod(control, struct cmsghdr *)->cmsg_type !=
			    SCM_RIGHTS) {
				mp->msg_controllen = 0;
				goto out;
			}
			control->m_len -= sizeof (struct cmsghdr);
			control->m_data += sizeof (struct cmsghdr);
		}
#endif
		len = mp->msg_controllen;
		m = control;
		mp->msg_controllen = 0;
		ctlbuf = mp->msg_control;

		/* Walk the chain until it ends or the user buffer is full. */
		while (m && len > 0) {
			unsigned int tocopy;

			if (len >= m->m_len)
				tocopy = m->m_len;
			else {
				/* This mbuf does not fit: flag the truncation. */
				mp->msg_flags |= MSG_CTRUNC;
				tocopy = len;
			}

			if ((error = copyout(mtod(m, caddr_t),
					ctlbuf, tocopy)) != 0)
				goto out;

			ctlbuf += tocopy;
			len -= tocopy;
			m = m->m_next;
		}
		/* Report how many control bytes were actually stored. */
		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
	}
out:
	fdrop(fp, td);
#ifdef KTRACE
	if (fromsa && KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(fromsa);
#endif
	if (fromsa)
		free(fromsa, M_SONAME);

	/* Hand the chain to the caller on success, otherwise free it. */
	if (error == 0 && controlp != NULL)
		*controlp = control;
	else  if (control)
		m_freem(control);

	return (error);
}
1245 
1246 static int
1247 recvit(td, s, mp, namelenp)
1248 	struct thread *td;
1249 	int s;
1250 	struct msghdr *mp;
1251 	void *namelenp;
1252 {
1253 	int error;
1254 
1255 	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
1256 	if (error)
1257 		return (error);
1258 	if (namelenp) {
1259 		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
1260 #ifdef COMPAT_OLDSOCK
1261 		if (mp->msg_flags & MSG_COMPAT)
1262 			error = 0;	/* old recvfrom didn't check */
1263 #endif
1264 	}
1265 	return (error);
1266 }
1267 
1268 int
1269 sys_recvfrom(td, uap)
1270 	struct thread *td;
1271 	struct recvfrom_args /* {
1272 		int	s;
1273 		caddr_t	buf;
1274 		size_t	len;
1275 		int	flags;
1276 		struct sockaddr * __restrict	from;
1277 		socklen_t * __restrict fromlenaddr;
1278 	} */ *uap;
1279 {
1280 	struct msghdr msg;
1281 	struct iovec aiov;
1282 	int error;
1283 
1284 	if (uap->fromlenaddr) {
1285 		error = copyin(uap->fromlenaddr,
1286 		    &msg.msg_namelen, sizeof (msg.msg_namelen));
1287 		if (error)
1288 			goto done2;
1289 	} else {
1290 		msg.msg_namelen = 0;
1291 	}
1292 	msg.msg_name = uap->from;
1293 	msg.msg_iov = &aiov;
1294 	msg.msg_iovlen = 1;
1295 	aiov.iov_base = uap->buf;
1296 	aiov.iov_len = uap->len;
1297 	msg.msg_control = 0;
1298 	msg.msg_flags = uap->flags;
1299 	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
1300 done2:
1301 	return(error);
1302 }
1303 
1304 #ifdef COMPAT_OLDSOCK
1305 int
1306 orecvfrom(td, uap)
1307 	struct thread *td;
1308 	struct recvfrom_args *uap;
1309 {
1310 
1311 	uap->flags |= MSG_COMPAT;
1312 	return (sys_recvfrom(td, uap));
1313 }
1314 #endif
1315 
1316 #ifdef COMPAT_OLDSOCK
1317 int
1318 orecv(td, uap)
1319 	struct thread *td;
1320 	struct orecv_args /* {
1321 		int	s;
1322 		caddr_t	buf;
1323 		int	len;
1324 		int	flags;
1325 	} */ *uap;
1326 {
1327 	struct msghdr msg;
1328 	struct iovec aiov;
1329 	int error;
1330 
1331 	msg.msg_name = 0;
1332 	msg.msg_namelen = 0;
1333 	msg.msg_iov = &aiov;
1334 	msg.msg_iovlen = 1;
1335 	aiov.iov_base = uap->buf;
1336 	aiov.iov_len = uap->len;
1337 	msg.msg_control = 0;
1338 	msg.msg_flags = uap->flags;
1339 	error = recvit(td, uap->s, &msg, NULL);
1340 	return (error);
1341 }
1342 
1343 /*
1344  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
1345  * overlays the new one, missing only the flags, and with the (old) access
1346  * rights where the control fields are now.
1347  */
1348 int
1349 orecvmsg(td, uap)
1350 	struct thread *td;
1351 	struct orecvmsg_args /* {
1352 		int	s;
1353 		struct	omsghdr *msg;
1354 		int	flags;
1355 	} */ *uap;
1356 {
1357 	struct msghdr msg;
1358 	struct iovec *iov;
1359 	int error;
1360 
1361 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1362 	if (error)
1363 		return (error);
1364 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1365 	if (error)
1366 		return (error);
1367 	msg.msg_flags = uap->flags | MSG_COMPAT;
1368 	msg.msg_iov = iov;
1369 	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
1370 	if (msg.msg_controllen && error == 0)
1371 		error = copyout(&msg.msg_controllen,
1372 		    &uap->msg->msg_accrightslen, sizeof (int));
1373 	free(iov, M_IOV);
1374 	return (error);
1375 }
1376 #endif
1377 
1378 int
1379 sys_recvmsg(td, uap)
1380 	struct thread *td;
1381 	struct recvmsg_args /* {
1382 		int	s;
1383 		struct	msghdr *msg;
1384 		int	flags;
1385 	} */ *uap;
1386 {
1387 	struct msghdr msg;
1388 	struct iovec *uiov, *iov;
1389 	int error;
1390 
1391 	error = copyin(uap->msg, &msg, sizeof (msg));
1392 	if (error)
1393 		return (error);
1394 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1395 	if (error)
1396 		return (error);
1397 	msg.msg_flags = uap->flags;
1398 #ifdef COMPAT_OLDSOCK
1399 	msg.msg_flags &= ~MSG_COMPAT;
1400 #endif
1401 	uiov = msg.msg_iov;
1402 	msg.msg_iov = iov;
1403 	error = recvit(td, uap->s, &msg, NULL);
1404 	if (error == 0) {
1405 		msg.msg_iov = uiov;
1406 		error = copyout(&msg, uap->msg, sizeof(msg));
1407 	}
1408 	free(iov, M_IOV);
1409 	return (error);
1410 }
1411 
1412 /* ARGSUSED */
1413 int
1414 sys_shutdown(td, uap)
1415 	struct thread *td;
1416 	struct shutdown_args /* {
1417 		int	s;
1418 		int	how;
1419 	} */ *uap;
1420 {
1421 	struct socket *so;
1422 	struct file *fp;
1423 	int error;
1424 
1425 	AUDIT_ARG_FD(uap->s);
1426 	error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_SHUTDOWN, &fp,
1427 	    NULL);
1428 	if (error == 0) {
1429 		so = fp->f_data;
1430 		error = soshutdown(so, uap->how);
1431 		fdrop(fp, td);
1432 	}
1433 	return (error);
1434 }
1435 
1436 /* ARGSUSED */
1437 int
1438 sys_setsockopt(td, uap)
1439 	struct thread *td;
1440 	struct setsockopt_args /* {
1441 		int	s;
1442 		int	level;
1443 		int	name;
1444 		caddr_t	val;
1445 		int	valsize;
1446 	} */ *uap;
1447 {
1448 
1449 	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
1450 	    uap->val, UIO_USERSPACE, uap->valsize));
1451 }
1452 
1453 int
1454 kern_setsockopt(td, s, level, name, val, valseg, valsize)
1455 	struct thread *td;
1456 	int s;
1457 	int level;
1458 	int name;
1459 	void *val;
1460 	enum uio_seg valseg;
1461 	socklen_t valsize;
1462 {
1463 	int error;
1464 	struct socket *so;
1465 	struct file *fp;
1466 	struct sockopt sopt;
1467 
1468 	if (val == NULL && valsize != 0)
1469 		return (EFAULT);
1470 	if ((int)valsize < 0)
1471 		return (EINVAL);
1472 
1473 	sopt.sopt_dir = SOPT_SET;
1474 	sopt.sopt_level = level;
1475 	sopt.sopt_name = name;
1476 	sopt.sopt_val = val;
1477 	sopt.sopt_valsize = valsize;
1478 	switch (valseg) {
1479 	case UIO_USERSPACE:
1480 		sopt.sopt_td = td;
1481 		break;
1482 	case UIO_SYSSPACE:
1483 		sopt.sopt_td = NULL;
1484 		break;
1485 	default:
1486 		panic("kern_setsockopt called with bad valseg");
1487 	}
1488 
1489 	AUDIT_ARG_FD(s);
1490 	error = getsock_cap(td->td_proc->p_fd, s, CAP_SETSOCKOPT, &fp, NULL);
1491 	if (error == 0) {
1492 		so = fp->f_data;
1493 		error = sosetopt(so, &sopt);
1494 		fdrop(fp, td);
1495 	}
1496 	return(error);
1497 }
1498 
1499 /* ARGSUSED */
1500 int
1501 sys_getsockopt(td, uap)
1502 	struct thread *td;
1503 	struct getsockopt_args /* {
1504 		int	s;
1505 		int	level;
1506 		int	name;
1507 		void * __restrict	val;
1508 		socklen_t * __restrict avalsize;
1509 	} */ *uap;
1510 {
1511 	socklen_t valsize;
1512 	int	error;
1513 
1514 	if (uap->val) {
1515 		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
1516 		if (error)
1517 			return (error);
1518 	}
1519 
1520 	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
1521 	    uap->val, UIO_USERSPACE, &valsize);
1522 
1523 	if (error == 0)
1524 		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
1525 	return (error);
1526 }
1527 
1528 /*
1529  * Kernel version of getsockopt.
1530  * optval can be a userland or userspace. optlen is always a kernel pointer.
1531  */
1532 int
1533 kern_getsockopt(td, s, level, name, val, valseg, valsize)
1534 	struct thread *td;
1535 	int s;
1536 	int level;
1537 	int name;
1538 	void *val;
1539 	enum uio_seg valseg;
1540 	socklen_t *valsize;
1541 {
1542 	int error;
1543 	struct  socket *so;
1544 	struct file *fp;
1545 	struct	sockopt sopt;
1546 
1547 	if (val == NULL)
1548 		*valsize = 0;
1549 	if ((int)*valsize < 0)
1550 		return (EINVAL);
1551 
1552 	sopt.sopt_dir = SOPT_GET;
1553 	sopt.sopt_level = level;
1554 	sopt.sopt_name = name;
1555 	sopt.sopt_val = val;
1556 	sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
1557 	switch (valseg) {
1558 	case UIO_USERSPACE:
1559 		sopt.sopt_td = td;
1560 		break;
1561 	case UIO_SYSSPACE:
1562 		sopt.sopt_td = NULL;
1563 		break;
1564 	default:
1565 		panic("kern_getsockopt called with bad valseg");
1566 	}
1567 
1568 	AUDIT_ARG_FD(s);
1569 	error = getsock_cap(td->td_proc->p_fd, s, CAP_GETSOCKOPT, &fp, NULL);
1570 	if (error == 0) {
1571 		so = fp->f_data;
1572 		error = sogetopt(so, &sopt);
1573 		*valsize = sopt.sopt_valsize;
1574 		fdrop(fp, td);
1575 	}
1576 	return (error);
1577 }
1578 
1579 /*
1580  * getsockname1() - Get socket name.
1581  */
1582 /* ARGSUSED */
1583 static int
1584 getsockname1(td, uap, compat)
1585 	struct thread *td;
1586 	struct getsockname_args /* {
1587 		int	fdes;
1588 		struct sockaddr * __restrict asa;
1589 		socklen_t * __restrict alen;
1590 	} */ *uap;
1591 	int compat;
1592 {
1593 	struct sockaddr *sa;
1594 	socklen_t len;
1595 	int error;
1596 
1597 	error = copyin(uap->alen, &len, sizeof(len));
1598 	if (error)
1599 		return (error);
1600 
1601 	error = kern_getsockname(td, uap->fdes, &sa, &len);
1602 	if (error)
1603 		return (error);
1604 
1605 	if (len != 0) {
1606 #ifdef COMPAT_OLDSOCK
1607 		if (compat)
1608 			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1609 #endif
1610 		error = copyout(sa, uap->asa, (u_int)len);
1611 	}
1612 	free(sa, M_SONAME);
1613 	if (error == 0)
1614 		error = copyout(&len, uap->alen, sizeof(len));
1615 	return (error);
1616 }
1617 
1618 int
1619 kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
1620     socklen_t *alen)
1621 {
1622 	struct socket *so;
1623 	struct file *fp;
1624 	socklen_t len;
1625 	int error;
1626 
1627 	AUDIT_ARG_FD(fd);
1628 	error = getsock_cap(td->td_proc->p_fd, fd, CAP_GETSOCKNAME, &fp, NULL);
1629 	if (error)
1630 		return (error);
1631 	so = fp->f_data;
1632 	*sa = NULL;
1633 	CURVNET_SET(so->so_vnet);
1634 	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
1635 	CURVNET_RESTORE();
1636 	if (error)
1637 		goto bad;
1638 	if (*sa == NULL)
1639 		len = 0;
1640 	else
1641 		len = MIN(*alen, (*sa)->sa_len);
1642 	*alen = len;
1643 #ifdef KTRACE
1644 	if (KTRPOINT(td, KTR_STRUCT))
1645 		ktrsockaddr(*sa);
1646 #endif
1647 bad:
1648 	fdrop(fp, td);
1649 	if (error && *sa) {
1650 		free(*sa, M_SONAME);
1651 		*sa = NULL;
1652 	}
1653 	return (error);
1654 }
1655 
/*
 * getsockname(2): current-format entry point for getsockname1().
 */
int
sys_getsockname(struct thread *td, struct getsockname_args *uap)
{
	const int compat = 0;

	return (getsockname1(td, uap, compat));
}
1664 
1665 #ifdef COMPAT_OLDSOCK
/*
 * Old getsockname: same as sys_getsockname() but asks getsockname1() to
 * return the address in the old sockaddr layout.
 */
int
ogetsockname(struct thread *td, struct getsockname_args *uap)
{
	const int compat = 1;

	return (getsockname1(td, uap, compat));
}
1674 #endif /* COMPAT_OLDSOCK */
1675 
1676 /*
1677  * getpeername1() - Get name of peer for connected socket.
1678  */
1679 /* ARGSUSED */
1680 static int
1681 getpeername1(td, uap, compat)
1682 	struct thread *td;
1683 	struct getpeername_args /* {
1684 		int	fdes;
1685 		struct sockaddr * __restrict	asa;
1686 		socklen_t * __restrict	alen;
1687 	} */ *uap;
1688 	int compat;
1689 {
1690 	struct sockaddr *sa;
1691 	socklen_t len;
1692 	int error;
1693 
1694 	error = copyin(uap->alen, &len, sizeof (len));
1695 	if (error)
1696 		return (error);
1697 
1698 	error = kern_getpeername(td, uap->fdes, &sa, &len);
1699 	if (error)
1700 		return (error);
1701 
1702 	if (len != 0) {
1703 #ifdef COMPAT_OLDSOCK
1704 		if (compat)
1705 			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1706 #endif
1707 		error = copyout(sa, uap->asa, (u_int)len);
1708 	}
1709 	free(sa, M_SONAME);
1710 	if (error == 0)
1711 		error = copyout(&len, uap->alen, sizeof(len));
1712 	return (error);
1713 }
1714 
1715 int
1716 kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
1717     socklen_t *alen)
1718 {
1719 	struct socket *so;
1720 	struct file *fp;
1721 	socklen_t len;
1722 	int error;
1723 
1724 	AUDIT_ARG_FD(fd);
1725 	error = getsock_cap(td->td_proc->p_fd, fd, CAP_GETPEERNAME, &fp, NULL);
1726 	if (error)
1727 		return (error);
1728 	so = fp->f_data;
1729 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1730 		error = ENOTCONN;
1731 		goto done;
1732 	}
1733 	*sa = NULL;
1734 	CURVNET_SET(so->so_vnet);
1735 	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
1736 	CURVNET_RESTORE();
1737 	if (error)
1738 		goto bad;
1739 	if (*sa == NULL)
1740 		len = 0;
1741 	else
1742 		len = MIN(*alen, (*sa)->sa_len);
1743 	*alen = len;
1744 #ifdef KTRACE
1745 	if (KTRPOINT(td, KTR_STRUCT))
1746 		ktrsockaddr(*sa);
1747 #endif
1748 bad:
1749 	if (error && *sa) {
1750 		free(*sa, M_SONAME);
1751 		*sa = NULL;
1752 	}
1753 done:
1754 	fdrop(fp, td);
1755 	return (error);
1756 }
1757 
/*
 * getpeername(2): current-format entry point for getpeername1().
 */
int
sys_getpeername(struct thread *td, struct getpeername_args *uap)
{
	const int compat = 0;

	return (getpeername1(td, uap, compat));
}
1766 
1767 #ifdef COMPAT_OLDSOCK
/*
 * Old getpeername: same as sys_getpeername() but asks getpeername1() to
 * return the address in the old sockaddr layout.
 */
int
ogetpeername(struct thread *td, struct ogetpeername_args *uap)
{
	struct getpeername_args *args;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	args = (struct getpeername_args *)uap;
	return (getpeername1(td, args, 1));
}
1777 #endif /* COMPAT_OLDSOCK */
1778 
1779 int
1780 sockargs(mp, buf, buflen, type)
1781 	struct mbuf **mp;
1782 	caddr_t buf;
1783 	int buflen, type;
1784 {
1785 	struct sockaddr *sa;
1786 	struct mbuf *m;
1787 	int error;
1788 
1789 	if (buflen > MLEN) {
1790 #ifdef COMPAT_OLDSOCK
1791 		if (type == MT_SONAME && buflen <= 112)
1792 			buflen = MLEN;		/* unix domain compat. hack */
1793 		else
1794 #endif
1795 			if (buflen > MCLBYTES)
1796 				return (EINVAL);
1797 	}
1798 	m = m_get2(buflen, M_WAITOK, type, 0);
1799 	m->m_len = buflen;
1800 	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1801 	if (error)
1802 		(void) m_free(m);
1803 	else {
1804 		*mp = m;
1805 		if (type == MT_SONAME) {
1806 			sa = mtod(m, struct sockaddr *);
1807 
1808 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1809 			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1810 				sa->sa_family = sa->sa_len;
1811 #endif
1812 			sa->sa_len = buflen;
1813 		}
1814 	}
1815 	return (error);
1816 }
1817 
1818 int
1819 getsockaddr(namp, uaddr, len)
1820 	struct sockaddr **namp;
1821 	caddr_t uaddr;
1822 	size_t len;
1823 {
1824 	struct sockaddr *sa;
1825 	int error;
1826 
1827 	if (len > SOCK_MAXADDRLEN)
1828 		return (ENAMETOOLONG);
1829 	if (len < offsetof(struct sockaddr, sa_data[0]))
1830 		return (EINVAL);
1831 	sa = malloc(len, M_SONAME, M_WAITOK);
1832 	error = copyin(uaddr, sa, len);
1833 	if (error) {
1834 		free(sa, M_SONAME);
1835 	} else {
1836 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1837 		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1838 			sa->sa_family = sa->sa_len;
1839 #endif
1840 		sa->sa_len = len;
1841 		*namp = sa;
1842 	}
1843 	return (error);
1844 }
1845 
1846 #include <sys/condvar.h>
1847 
1848 struct sendfile_sync {
1849 	struct mtx	mtx;
1850 	struct cv	cv;
1851 	unsigned	count;
1852 };
1853 
1854 /*
1855  * Detach mapped page and release resources back to the system.
1856  */
1857 int
1858 sf_buf_mext(struct mbuf *mb, void *addr, void *args)
1859 {
1860 	vm_page_t m;
1861 	struct sendfile_sync *sfs;
1862 
1863 	m = sf_buf_page(args);
1864 	sf_buf_free(args);
1865 	vm_page_lock(m);
1866 	vm_page_unwire(m, 0);
1867 	/*
1868 	 * Check for the object going away on us. This can
1869 	 * happen since we don't hold a reference to it.
1870 	 * If so, we're responsible for freeing the page.
1871 	 */
1872 	if (m->wire_count == 0 && m->object == NULL)
1873 		vm_page_free(m);
1874 	vm_page_unlock(m);
1875 	if (addr == NULL)
1876 		return (EXT_FREE_OK);
1877 	sfs = addr;
1878 	mtx_lock(&sfs->mtx);
1879 	KASSERT(sfs->count> 0, ("Sendfile sync botchup count == 0"));
1880 	if (--sfs->count == 0)
1881 		cv_signal(&sfs->cv);
1882 	mtx_unlock(&sfs->mtx);
1883 	return (EXT_FREE_OK);
1884 }
1885 
/*
 * sendfile(2)
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Send the file behind descriptor 'fd', beginning at 'offset', to the
 * socket 's'.  'nbytes' limits how much of the file is sent; zero means
 * send until EOF.  A header and/or trailer may be added to the socket
 * output through 'hdtr'.  If 'sbytes' is given, the total number of bytes
 * sent is written to *sbytes.
 */
int
sys_sendfile(struct thread *td, struct sendfile_args *uap)
{
	const int compat = 0;

	return (do_sendfile(td, uap, compat));
}
1903 
1904 static int
1905 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
1906 {
1907 	struct sf_hdtr hdtr;
1908 	struct uio *hdr_uio, *trl_uio;
1909 	struct file *fp;
1910 	int error;
1911 
1912 	if (uap->offset < 0)
1913 		return (EINVAL);
1914 
1915 	hdr_uio = trl_uio = NULL;
1916 
1917 	if (uap->hdtr != NULL) {
1918 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1919 		if (error)
1920 			goto out;
1921 		if (hdtr.headers != NULL) {
1922 			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
1923 			if (error)
1924 				goto out;
1925 		}
1926 		if (hdtr.trailers != NULL) {
1927 			error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
1928 			if (error)
1929 				goto out;
1930 
1931 		}
1932 	}
1933 
1934 	AUDIT_ARG_FD(uap->fd);
1935 
1936 	/*
1937 	 * sendfile(2) can start at any offset within a file so we require
1938 	 * CAP_READ+CAP_SEEK = CAP_PREAD.
1939 	 */
1940 	if ((error = fget_read(td, uap->fd, CAP_PREAD, &fp)) != 0)
1941 		goto out;
1942 
1943 	error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset,
1944 	    uap->nbytes, uap->sbytes, uap->flags, compat ? SFK_COMPAT : 0, td);
1945 	fdrop(fp, td);
1946 
1947 out:
1948 	if (hdr_uio)
1949 		free(hdr_uio, M_IOV);
1950 	if (trl_uio)
1951 		free(trl_uio, M_IOV);
1952 	return (error);
1953 }
1954 
1955 #ifdef COMPAT_FREEBSD4
1956 int
1957 freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
1958 {
1959 	struct sendfile_args args;
1960 
1961 	args.fd = uap->fd;
1962 	args.s = uap->s;
1963 	args.offset = uap->offset;
1964 	args.nbytes = uap->nbytes;
1965 	args.hdtr = uap->hdtr;
1966 	args.sbytes = uap->sbytes;
1967 	args.flags = uap->flags;
1968 
1969 	return (do_sendfile(td, &args, 1));
1970 }
1971 #endif /* COMPAT_FREEBSD4 */
1972 
1973 int
1974 vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
1975     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
1976     int kflags, struct thread *td)
1977 {
1978 	struct vnode *vp = fp->f_vnode;
1979 	struct file *sock_fp;
1980 	struct vm_object *obj = NULL;
1981 	struct socket *so = NULL;
1982 	struct mbuf *m = NULL;
1983 	struct sf_buf *sf;
1984 	struct vm_page *pg;
1985 	struct vattr va;
1986 	off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0;
1987 	int error, hdrlen = 0, mnw = 0;
1988 	int bsize;
1989 	struct sendfile_sync *sfs = NULL;
1990 
1991 	vn_lock(vp, LK_SHARED | LK_RETRY);
1992 	if (vp->v_type == VREG) {
1993 		bsize = vp->v_mount->mnt_stat.f_iosize;
1994 		if (nbytes == 0) {
1995 			error = VOP_GETATTR(vp, &va, td->td_ucred);
1996 			if (error != 0) {
1997 				VOP_UNLOCK(vp, 0);
1998 				obj = NULL;
1999 				goto out;
2000 			}
2001 			rem = va.va_size;
2002 		} else
2003 			rem = nbytes;
2004 		obj = vp->v_object;
2005 		if (obj != NULL) {
2006 			/*
2007 			 * Temporarily increase the backing VM
2008 			 * object's reference count so that a forced
2009 			 * reclamation of its vnode does not
2010 			 * immediately destroy it.
2011 			 */
2012 			VM_OBJECT_WLOCK(obj);
2013 			if ((obj->flags & OBJ_DEAD) == 0) {
2014 				vm_object_reference_locked(obj);
2015 				VM_OBJECT_WUNLOCK(obj);
2016 			} else {
2017 				VM_OBJECT_WUNLOCK(obj);
2018 				obj = NULL;
2019 			}
2020 		}
2021 	} else
2022 		bsize = 0;	/* silence gcc */
2023 	VOP_UNLOCK(vp, 0);
2024 	if (obj == NULL) {
2025 		error = EINVAL;
2026 		goto out;
2027 	}
2028 
2029 	/*
2030 	 * The socket must be a stream socket and connected.
2031 	 * Remember if it a blocking or non-blocking socket.
2032 	 */
2033 	if ((error = getsock_cap(td->td_proc->p_fd, sockfd, CAP_SEND,
2034 	    &sock_fp, NULL)) != 0)
2035 		goto out;
2036 	so = sock_fp->f_data;
2037 	if (so->so_type != SOCK_STREAM) {
2038 		error = EINVAL;
2039 		goto out;
2040 	}
2041 	if ((so->so_state & SS_ISCONNECTED) == 0) {
2042 		error = ENOTCONN;
2043 		goto out;
2044 	}
2045 	/*
2046 	 * Do not wait on memory allocations but return ENOMEM for
2047 	 * caller to retry later.
2048 	 * XXX: Experimental.
2049 	 */
2050 	if (flags & SF_MNOWAIT)
2051 		mnw = 1;
2052 
2053 	if (flags & SF_SYNC) {
2054 		sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO);
2055 		mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
2056 		cv_init(&sfs->cv, "sendfile");
2057 	}
2058 
2059 #ifdef MAC
2060 	error = mac_socket_check_send(td->td_ucred, so);
2061 	if (error)
2062 		goto out;
2063 #endif
2064 
2065 	/* If headers are specified copy them into mbufs. */
2066 	if (hdr_uio != NULL) {
2067 		hdr_uio->uio_td = td;
2068 		hdr_uio->uio_rw = UIO_WRITE;
2069 		if (hdr_uio->uio_resid > 0) {
2070 			/*
2071 			 * In FBSD < 5.0 the nbytes to send also included
2072 			 * the header.  If compat is specified subtract the
2073 			 * header size from nbytes.
2074 			 */
2075 			if (kflags & SFK_COMPAT) {
2076 				if (nbytes > hdr_uio->uio_resid)
2077 					nbytes -= hdr_uio->uio_resid;
2078 				else
2079 					nbytes = 0;
2080 			}
2081 			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
2082 			    0, 0, 0);
2083 			if (m == NULL) {
2084 				error = mnw ? EAGAIN : ENOBUFS;
2085 				goto out;
2086 			}
2087 			hdrlen = m_length(m, NULL);
2088 		}
2089 	}
2090 
2091 	/*
2092 	 * Protect against multiple writers to the socket.
2093 	 *
2094 	 * XXXRW: Historically this has assumed non-interruptibility, so now
2095 	 * we implement that, but possibly shouldn't.
2096 	 */
2097 	(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
2098 
2099 	/*
2100 	 * Loop through the pages of the file, starting with the requested
2101 	 * offset. Get a file page (do I/O if necessary), map the file page
2102 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
2103 	 * it on the socket.
2104 	 * This is done in two loops.  The inner loop turns as many pages
2105 	 * as it can, up to available socket buffer space, without blocking
2106 	 * into mbufs to have it bulk delivered into the socket send buffer.
2107 	 * The outer loop checks the state and available space of the socket
2108 	 * and takes care of the overall progress.
2109 	 */
2110 	for (off = offset; ; ) {
2111 		struct mbuf *mtail;
2112 		int loopbytes;
2113 		int space;
2114 		int done;
2115 
2116 		if ((nbytes != 0 && nbytes == fsbytes) ||
2117 		    (nbytes == 0 && va.va_size == fsbytes))
2118 			break;
2119 
2120 		mtail = NULL;
2121 		loopbytes = 0;
2122 		space = 0;
2123 		done = 0;
2124 
2125 		/*
2126 		 * Check the socket state for ongoing connection,
2127 		 * no errors and space in socket buffer.
2128 		 * If space is low allow for the remainder of the
2129 		 * file to be processed if it fits the socket buffer.
2130 		 * Otherwise block in waiting for sufficient space
2131 		 * to proceed, or if the socket is nonblocking, return
2132 		 * to userland with EAGAIN while reporting how far
2133 		 * we've come.
2134 		 * We wait until the socket buffer has significant free
2135 		 * space to do bulk sends.  This makes good use of file
2136 		 * system read ahead and allows packet segmentation
2137 		 * offloading hardware to take over lots of work.  If
2138 		 * we were not careful here we would send off only one
2139 		 * sfbuf at a time.
2140 		 */
2141 		SOCKBUF_LOCK(&so->so_snd);
2142 		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
2143 			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
2144 retry_space:
2145 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2146 			error = EPIPE;
2147 			SOCKBUF_UNLOCK(&so->so_snd);
2148 			goto done;
2149 		} else if (so->so_error) {
2150 			error = so->so_error;
2151 			so->so_error = 0;
2152 			SOCKBUF_UNLOCK(&so->so_snd);
2153 			goto done;
2154 		}
2155 		space = sbspace(&so->so_snd);
2156 		if (space < rem &&
2157 		    (space <= 0 ||
2158 		     space < so->so_snd.sb_lowat)) {
2159 			if (so->so_state & SS_NBIO) {
2160 				SOCKBUF_UNLOCK(&so->so_snd);
2161 				error = EAGAIN;
2162 				goto done;
2163 			}
2164 			/*
2165 			 * sbwait drops the lock while sleeping.
2166 			 * When we loop back to retry_space the
2167 			 * state may have changed and we retest
2168 			 * for it.
2169 			 */
2170 			error = sbwait(&so->so_snd);
2171 			/*
2172 			 * An error from sbwait usually indicates that we've
2173 			 * been interrupted by a signal. If we've sent anything
2174 			 * then return bytes sent, otherwise return the error.
2175 			 */
2176 			if (error) {
2177 				SOCKBUF_UNLOCK(&so->so_snd);
2178 				goto done;
2179 			}
2180 			goto retry_space;
2181 		}
2182 		SOCKBUF_UNLOCK(&so->so_snd);
2183 
2184 		/*
2185 		 * Reduce space in the socket buffer by the size of
2186 		 * the header mbuf chain.
2187 		 * hdrlen is set to 0 after the first loop.
2188 		 */
2189 		space -= hdrlen;
2190 
2191 		error = vn_lock(vp, LK_SHARED);
2192 		if (error != 0)
2193 			goto done;
2194 		error = VOP_GETATTR(vp, &va, td->td_ucred);
2195 		if (error != 0 || off >= va.va_size) {
2196 			VOP_UNLOCK(vp, 0);
2197 			goto done;
2198 		}
2199 
2200 		/*
2201 		 * Loop and construct maximum sized mbuf chain to be bulk
2202 		 * dumped into socket buffer.
2203 		 */
2204 		while (space > loopbytes) {
2205 			vm_pindex_t pindex;
2206 			vm_offset_t pgoff;
2207 			struct mbuf *m0;
2208 
2209 			/*
2210 			 * Calculate the amount to transfer.
2211 			 * Not to exceed a page, the EOF,
2212 			 * or the passed in nbytes.
2213 			 */
2214 			pgoff = (vm_offset_t)(off & PAGE_MASK);
2215 			if (nbytes)
2216 				rem = (nbytes - fsbytes - loopbytes);
2217 			else
2218 				rem = va.va_size -
2219 				    offset - fsbytes - loopbytes;
2220 			xfsize = omin(PAGE_SIZE - pgoff, rem);
2221 			xfsize = omin(space - loopbytes, xfsize);
2222 			if (xfsize <= 0) {
2223 				done = 1;		/* all data sent */
2224 				break;
2225 			}
2226 
2227 			/*
2228 			 * Attempt to look up the page.  Allocate
2229 			 * if not found or wait and loop if busy.
2230 			 */
2231 			pindex = OFF_TO_IDX(off);
2232 			VM_OBJECT_WLOCK(obj);
2233 			pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
2234 			    VM_ALLOC_IGN_SBUSY | VM_ALLOC_NORMAL |
2235 			    VM_ALLOC_WIRED);
2236 
2237 			/*
2238 			 * Check if page is valid for what we need,
2239 			 * otherwise initiate I/O.
2240 			 * If we already turned some pages into mbufs,
2241 			 * send them off before we come here again and
2242 			 * block.
2243 			 */
2244 			if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
2245 				VM_OBJECT_WUNLOCK(obj);
2246 			else if (m != NULL)
2247 				error = EAGAIN;	/* send what we already got */
2248 			else if (flags & SF_NODISKIO)
2249 				error = EBUSY;
2250 			else {
2251 				ssize_t resid;
2252 				int readahead = sfreadahead * MAXBSIZE;
2253 
2254 				VM_OBJECT_WUNLOCK(obj);
2255 
2256 				/*
2257 				 * Get the page from backing store.
2258 				 * XXXMAC: Because we don't have fp->f_cred
2259 				 * here, we pass in NOCRED.  This is probably
2260 				 * wrong, but is consistent with our original
2261 				 * implementation.
2262 				 */
2263 				error = vn_rdwr(UIO_READ, vp, NULL, readahead,
2264 				    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
2265 				    IO_VMIO | ((readahead / bsize) << IO_SEQSHIFT),
2266 				    td->td_ucred, NOCRED, &resid, td);
2267 				SFSTAT_INC(sf_iocnt);
2268 				if (error)
2269 					VM_OBJECT_WLOCK(obj);
2270 			}
2271 			if (error) {
2272 				vm_page_lock(pg);
2273 				vm_page_unwire(pg, 0);
2274 				/*
2275 				 * See if anyone else might know about
2276 				 * this page.  If not and it is not valid,
2277 				 * then free it.
2278 				 */
2279 				if (pg->wire_count == 0 && pg->valid == 0 &&
2280 				    !vm_page_busied(pg))
2281 					vm_page_free(pg);
2282 				vm_page_unlock(pg);
2283 				VM_OBJECT_WUNLOCK(obj);
2284 				if (error == EAGAIN)
2285 					error = 0;	/* not a real error */
2286 				break;
2287 			}
2288 
2289 			/*
2290 			 * Get a sendfile buf.  When allocating the
2291 			 * first buffer for mbuf chain, we usually
2292 			 * wait as long as necessary, but this wait
2293 			 * can be interrupted.  For consequent
2294 			 * buffers, do not sleep, since several
2295 			 * threads might exhaust the buffers and then
2296 			 * deadlock.
2297 			 */
2298 			sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
2299 			    SFB_CATCH);
2300 			if (sf == NULL) {
2301 				SFSTAT_INC(sf_allocfail);
2302 				vm_page_lock(pg);
2303 				vm_page_unwire(pg, 0);
2304 				KASSERT(pg->object != NULL,
2305 				    ("%s: object disappeared", __func__));
2306 				vm_page_unlock(pg);
2307 				if (m == NULL)
2308 					error = (mnw ? EAGAIN : EINTR);
2309 				break;
2310 			}
2311 
2312 			/*
2313 			 * Get an mbuf and set it up as having
2314 			 * external storage.
2315 			 */
2316 			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
2317 			if (m0 == NULL) {
2318 				error = (mnw ? EAGAIN : ENOBUFS);
2319 				(void)sf_buf_mext(NULL, NULL, sf);
2320 				break;
2321 			}
2322 			if (m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE,
2323 			    sf_buf_mext, sfs, sf, M_RDONLY, EXT_SFBUF,
2324 			    (mnw ? M_NOWAIT : M_WAITOK)) != 0) {
2325 				error = (mnw ? EAGAIN : ENOBUFS);
2326 				(void)sf_buf_mext(NULL, NULL, sf);
2327 				m_freem(m0);
2328 				break;
2329 			}
2330 			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
2331 			m0->m_len = xfsize;
2332 
2333 			/* Append to mbuf chain. */
2334 			if (mtail != NULL)
2335 				mtail->m_next = m0;
2336 			else if (m != NULL)
2337 				m_last(m)->m_next = m0;
2338 			else
2339 				m = m0;
2340 			mtail = m0;
2341 
2342 			/* Keep track of bits processed. */
2343 			loopbytes += xfsize;
2344 			off += xfsize;
2345 
2346 			if (sfs != NULL) {
2347 				mtx_lock(&sfs->mtx);
2348 				sfs->count++;
2349 				mtx_unlock(&sfs->mtx);
2350 			}
2351 		}
2352 
2353 		VOP_UNLOCK(vp, 0);
2354 
2355 		/* Add the buffer chain to the socket buffer. */
2356 		if (m != NULL) {
2357 			int mlen, err;
2358 
2359 			mlen = m_length(m, NULL);
2360 			SOCKBUF_LOCK(&so->so_snd);
2361 			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2362 				error = EPIPE;
2363 				SOCKBUF_UNLOCK(&so->so_snd);
2364 				goto done;
2365 			}
2366 			SOCKBUF_UNLOCK(&so->so_snd);
2367 			CURVNET_SET(so->so_vnet);
2368 			/* Avoid error aliasing. */
2369 			err = (*so->so_proto->pr_usrreqs->pru_send)
2370 				    (so, 0, m, NULL, NULL, td);
2371 			CURVNET_RESTORE();
2372 			if (err == 0) {
2373 				/*
2374 				 * We need two counters to get the
2375 				 * file offset and nbytes to send
2376 				 * right:
2377 				 * - sbytes contains the total amount
2378 				 *   of bytes sent, including headers.
2379 				 * - fsbytes contains the total amount
2380 				 *   of bytes sent from the file.
2381 				 */
2382 				sbytes += mlen;
2383 				fsbytes += mlen;
2384 				if (hdrlen) {
2385 					fsbytes -= hdrlen;
2386 					hdrlen = 0;
2387 				}
2388 			} else if (error == 0)
2389 				error = err;
2390 			m = NULL;	/* pru_send always consumes */
2391 		}
2392 
2393 		/* Quit outer loop on error or when we're done. */
2394 		if (done)
2395 			break;
2396 		if (error)
2397 			goto done;
2398 	}
2399 
2400 	/*
2401 	 * Send trailers. Wimp out and use writev(2).
2402 	 */
2403 	if (trl_uio != NULL) {
2404 		sbunlock(&so->so_snd);
2405 		error = kern_writev(td, sockfd, trl_uio);
2406 		if (error == 0)
2407 			sbytes += td->td_retval[0];
2408 		goto out;
2409 	}
2410 
2411 done:
2412 	sbunlock(&so->so_snd);
2413 out:
2414 	/*
2415 	 * If there was no error we have to clear td->td_retval[0]
2416 	 * because it may have been set by writev.
2417 	 */
2418 	if (error == 0) {
2419 		td->td_retval[0] = 0;
2420 	}
2421 	if (sent != NULL) {
2422 		copyout(&sbytes, sent, sizeof(off_t));
2423 	}
2424 	if (obj != NULL)
2425 		vm_object_deallocate(obj);
2426 	if (so)
2427 		fdrop(sock_fp, td);
2428 	if (m)
2429 		m_freem(m);
2430 
2431 	if (sfs != NULL) {
2432 		mtx_lock(&sfs->mtx);
2433 		if (sfs->count != 0)
2434 			cv_wait(&sfs->cv, &sfs->mtx);
2435 		KASSERT(sfs->count == 0, ("sendfile sync still busy"));
2436 		cv_destroy(&sfs->cv);
2437 		mtx_destroy(&sfs->mtx);
2438 		free(sfs, M_TEMP);
2439 	}
2440 
2441 	if (error == ERESTART)
2442 		error = EINTR;
2443 
2444 	return (error);
2445 }
2446 
2447 /*
2448  * SCTP syscalls.
2449  * Functionality only compiled in if SCTP is defined in the kernel Makefile,
2450  * otherwise all return EOPNOTSUPP.
2451  * XXX: We should make this loadable one day.
2452  */
/*
 * Peel the association identified by uap->name off the SCTP socket uap->sd
 * onto a freshly created socket.  On success the descriptor of the new
 * socket is left in td->td_retval[0].
 *
 * Returns 0 on success or an errno value: EOPNOTSUPP when the descriptor is
 * not an SCTP socket (or SCTP is not compiled in), ENOMEM when no new socket
 * can be created, otherwise whatever the lookup, descriptor allocation or
 * SCTP layer reported.
 */
int
sys_sctp_peeloff(td, uap)
	struct thread *td;
	struct sctp_peeloff_args /* {
		int	sd;
		caddr_t	name;
	} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
	struct socket *parent, *newso;
	struct file *newfp = NULL;
	u_int parentflags;
	int error, newfd;

	AUDIT_ARG_FD(uap->sd);
	error = fgetsock(td, uap->sd, CAP_PEELOFF, &parent, &parentflags);
	if (error != 0)
		goto out;
	if (parent->so_proto->pr_protocol != IPPROTO_SCTP) {
		error = EOPNOTSUPP;
		goto release;
	}
	error = sctp_can_peel_off(parent, (sctp_assoc_t)uap->name);
	if (error != 0)
		goto release;

	/*
	 * The association exists and may be peeled off, so set up the
	 * descriptor for it.  This may block, which is acceptable here.
	 */
	error = falloc(td, &newfp, &newfd, 0);
	if (error != 0)
		goto release;
	td->td_retval[0] = newfd;

	CURVNET_SET(parent->so_vnet);
	newso = sonewconn(parent, SS_ISCONNECTED);
	if (newso == NULL) {
		error = ENOMEM;
		goto closefd;
	}

	/*
	 * Take the file descriptor's reference on the new socket before its
	 * flags are touched; otherwise a sofree() from the protocol would
	 * release it because of a zero reference count.
	 */
	SOCK_LOCK(newso);
	soref(newso);
	SOCK_UNLOCK(newso);

	/* Unhook the new socket from the parent's completed queue. */
	ACCEPT_LOCK();
	TAILQ_REMOVE(&parent->so_comp, newso, so_list);
	parent->so_qlen--;
	newso->so_state |= (parent->so_state & SS_NBIO);
	newso->so_state &= ~SS_NOFDREF;
	newso->so_qstate &= ~SQ_COMP;
	newso->so_head = NULL;
	ACCEPT_UNLOCK();

	finit(newfp, parentflags, DTYPE_SOCKET, newso, &socketops);
	error = sctp_do_peeloff(parent, newso, (sctp_assoc_t)uap->name);

	/* Carry the parent's SIGIO owner over to the new socket. */
	if (error == 0 && parent->so_sigio != NULL)
		fsetown(fgetown(&parent->so_sigio), &newso->so_sigio);

closefd:
	/*
	 * On failure close the new descriptor, assuming nobody has ripped
	 * it out from under us in the meantime.
	 */
	if (error != 0)
		fdclose(td->td_proc->p_fd, newfp, newfd, td);

	CURVNET_RESTORE();
release:
	/* Drop the references explicitly held by this function. */
	if (newfp != NULL)
		fdrop(newfp, td);
	fputsock(parent);
out:
	return (error);
#else  /* SCTP */
	return (EOPNOTSUPP);
#endif /* SCTP */
}
2543 
/*
 * Send one user buffer (uap->msg, uap->mlen bytes) on the SCTP socket
 * uap->sd.
 *
 * The optional sctp_sndrcvinfo (uap->sinfo) and destination address
 * (uap->to, uap->tolen) are copied in first.  The descriptor is looked up
 * with CAP_SEND, plus CAP_CONNECT when a destination address is supplied.
 * The data is described by a single-element uio and handed to
 * sctp_lower_sosend().
 *
 * On success td->td_retval[0] holds the number of bytes consumed from the
 * uio.  A transfer interrupted (ERESTART, EINTR, EWOULDBLOCK) after some
 * data was consumed is reported as a short successful send.  EPIPE raises
 * SIGPIPE unless SO_NOSIGPIPE or MSG_NOSIGNAL is in effect.
 *
 * Returns 0 or an errno value; EOPNOTSUPP if the socket is not SCTP or SCTP
 * support is not compiled in.
 */
int
sys_sctp_generic_sendmsg (td, uap)
	struct thread *td;
	struct sctp_generic_sendmsg_args /* {
		int sd,
		caddr_t msg,
		int mlen,
		caddr_t to,
		__socklen_t tolen,
		struct sctp_sndrcvinfo *sinfo,
		int flags
	} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
	struct socket *so;
	struct file *fp = NULL;
	int error = 0, len;
	struct sockaddr *to = NULL;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	struct uio auio;
	struct iovec iov[1];
	cap_rights_t rights;

	if (uap->sinfo) {
		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
		if (error)
			return (error);
		u_sinfo = &sinfo;
	}

	rights = CAP_SEND;
	if (uap->tolen) {
		error = getsockaddr(&to, uap->to, uap->tolen);
		if (error) {
			to = NULL;
			goto sctp_bad2;
		}
		/* Sending to an explicit address also needs CAP_CONNECT. */
		rights |= CAP_CONNECT;
	}

	AUDIT_ARG_FD(uap->sd);
	error = getsock_cap(td->td_proc->p_fd, uap->sd, rights, &fp, NULL);
	if (error)
		goto sctp_bad;
#ifdef KTRACE
	if (to && (KTRPOINT(td, KTR_STRUCT)))
		ktrsockaddr(to);
#endif

	iov[0].iov_base = uap->msg;
	iov[0].iov_len = uap->mlen;

	so = (struct socket *)fp->f_data;
	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
		error = EOPNOTSUPP;
		goto sctp_bad;
	}
#ifdef MAC
	error = mac_socket_check_send(td->td_ucred, so);
	if (error)
		goto sctp_bad;
#endif /* MAC */

	auio.uio_iov =  iov;
	auio.uio_iovcnt = 1;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	len = auio.uio_resid = uap->mlen;
#ifdef KTRACE
	/*
	 * Snapshot the request before the send modifies auio, so that the
	 * transfer can be logged below.  Without this ktruio stays NULL and
	 * the KTR_GENIO record is never produced.
	 */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif /* KTRACE */
	CURVNET_SET(so->so_vnet);
	error = sctp_lower_sosend(so, to, &auio,
		    (struct mbuf *)NULL, (struct mbuf *)NULL,
		    uap->flags, u_sinfo, td);
	CURVNET_RESTORE();
	if (error) {
		/* A partial transfer that was interrupted counts as success. */
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Generation of SIGPIPE can be controlled per socket. */
		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
		    !(uap->flags & MSG_NOSIGNAL)) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	if (error == 0)
		td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = td->td_retval[0];
		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
	}
#endif /* KTRACE */
sctp_bad:
	if (fp)
		fdrop(fp, td);
sctp_bad2:
	if (to)
		free(to, M_SONAME);
	return (error);
#else  /* SCTP */
	return (EOPNOTSUPP);
#endif /* SCTP */
}
2654 
/*
 * Scatter/gather variant of sctp_generic_sendmsg: send the data described
 * by the user iovec array (uap->iov, uap->iovlen entries) on the SCTP
 * socket uap->sd.
 *
 * The optional sctp_sndrcvinfo and destination address are copied in, the
 * descriptor is looked up with CAP_SEND (plus CAP_CONNECT when an address
 * is supplied), and the iovec array is copied in (through the 32-bit
 * compat path for ILP32 processes).  The summed iovec lengths must not go
 * negative, otherwise EINVAL is returned.
 *
 * On success td->td_retval[0] holds the number of bytes consumed from the
 * uio.  A transfer interrupted (ERESTART, EINTR, EWOULDBLOCK) after some
 * data was consumed is reported as a short successful send.  EPIPE raises
 * SIGPIPE unless SO_NOSIGPIPE or MSG_NOSIGNAL is in effect.
 *
 * Returns 0 or an errno value; EOPNOTSUPP if the socket is not SCTP or SCTP
 * support is not compiled in.
 */
int
sys_sctp_generic_sendmsg_iov(td, uap)
	struct thread *td;
	struct sctp_generic_sendmsg_iov_args /* {
		int sd,
		struct iovec *iov,
		int iovlen,
		caddr_t to,
		__socklen_t tolen,
		struct sctp_sndrcvinfo *sinfo,
		int flags
	} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
	struct socket *so;
	struct file *fp = NULL;
	int error=0, i;
	ssize_t len;
	struct sockaddr *to = NULL;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	struct uio auio;
	struct iovec *iov, *tiov;
	cap_rights_t rights;

	if (uap->sinfo) {
		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
		if (error)
			return (error);
		u_sinfo = &sinfo;
	}
	rights = CAP_SEND;
	if (uap->tolen) {
		error = getsockaddr(&to, uap->to, uap->tolen);
		if (error) {
			to = NULL;
			goto sctp_bad2;
		}
		/* Sending to an explicit address also needs CAP_CONNECT. */
		rights |= CAP_CONNECT;
	}

	AUDIT_ARG_FD(uap->sd);
	error = getsock_cap(td->td_proc->p_fd, uap->sd, rights, &fp, NULL);
	if (error)
		goto sctp_bad1;

#ifdef COMPAT_FREEBSD32
	if (SV_CURPROC_FLAG(SV_ILP32))
		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
		    uap->iovlen, &iov, EMSGSIZE);
	else
#endif
		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
	if (error)
		goto sctp_bad1;
#ifdef KTRACE
	if (to && (KTRPOINT(td, KTR_STRUCT)))
		ktrsockaddr(to);
#endif

	so = (struct socket *)fp->f_data;
	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
		error = EOPNOTSUPP;
		goto sctp_bad;
	}
#ifdef MAC
	error = mac_socket_check_send(td->td_ucred, so);
	if (error)
		goto sctp_bad;
#endif /* MAC */

	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	/* Total up the request, rejecting a sum that goes negative. */
	tiov = iov;
	for (i = 0; i <uap->iovlen; i++, tiov++) {
		if ((auio.uio_resid += tiov->iov_len) < 0) {
			error = EINVAL;
			goto sctp_bad;
		}
	}
	len = auio.uio_resid;
#ifdef KTRACE
	/*
	 * Snapshot the request before the send modifies auio, so that the
	 * transfer can be logged below.  Without this ktruio stays NULL and
	 * the KTR_GENIO record is never produced.
	 */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif /* KTRACE */
	CURVNET_SET(so->so_vnet);
	error = sctp_lower_sosend(so, to, &auio,
		    (struct mbuf *)NULL, (struct mbuf *)NULL,
		    uap->flags, u_sinfo, td);
	CURVNET_RESTORE();
	if (error) {
		/* A partial transfer that was interrupted counts as success. */
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Generation of SIGPIPE can be controlled per socket */
		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
		    !(uap->flags & MSG_NOSIGNAL)) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	if (error == 0)
		td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = td->td_retval[0];
		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
	}
#endif /* KTRACE */
sctp_bad:
	free(iov, M_IOV);
sctp_bad1:
	if (fp)
		fdrop(fp, td);
sctp_bad2:
	if (to)
		free(to, M_SONAME);
	return (error);
#else  /* SCTP */
	return (EOPNOTSUPP);
#endif /* SCTP */
}
2781 
/*
 * Receive a message from the SCTP socket uap->sd into the user iovec array
 * (uap->iov, uap->iovlen entries).
 *
 * The descriptor is looked up with CAP_RECV and the iovec array is copied
 * in (through the 32-bit compat path for ILP32 processes).  The caller's
 * address buffer length (*uap->fromlenaddr) and message flags
 * (*uap->msg_flags) are read when those pointers are non-NULL.  The data is
 * received through sctp_sorecvmsg().
 *
 * On success:
 *  - td->td_retval[0] holds the number of bytes received;
 *  - the sctp_sndrcvinfo is copied to uap->sinfo when non-NULL;
 *  - the source address, truncated to the caller's buffer length, is copied
 *    to uap->from and its length to uap->fromlenaddr when both a length and
 *    uap->from were supplied;
 *  - the resulting message flags are copied to uap->msg_flags when non-NULL.
 *
 * A receive interrupted (ERESTART, EINTR, EWOULDBLOCK) after some data was
 * transferred is reported as a short successful receive.
 *
 * Returns 0 or an errno value; EINVAL if the summed iovec lengths go
 * negative, EOPNOTSUPP if the socket is not SCTP or SCTP support is not
 * compiled in.
 */
int
sys_sctp_generic_recvmsg(td, uap)
	struct thread *td;
	struct sctp_generic_recvmsg_args /* {
		int sd,
		struct iovec *iov,
		int iovlen,
		struct sockaddr *from,
		__socklen_t *fromlenaddr,
		struct sctp_sndrcvinfo *sinfo,
		int *msg_flags
	} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
	uint8_t sockbufstore[256];
	struct uio auio;
	struct iovec *iov, *tiov;
	struct sctp_sndrcvinfo sinfo;
	struct socket *so;
	struct file *fp = NULL;
	struct sockaddr *fromsa;
	socklen_t salen;
	int fromlen;
	ssize_t len;
	int i, msg_flags;
	int error = 0;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	AUDIT_ARG_FD(uap->sd);
	error = getsock_cap(td->td_proc->p_fd, uap->sd, CAP_RECV, &fp, NULL);
	if (error) {
		return (error);
	}
#ifdef COMPAT_FREEBSD32
	if (SV_CURPROC_FLAG(SV_ILP32))
		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
		    uap->iovlen, &iov, EMSGSIZE);
	else
#endif
		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
	if (error)
		goto out1;

	so = fp->f_data;
	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
		error = EOPNOTSUPP;
		goto out;
	}
#ifdef MAC
	error = mac_socket_check_receive(td->td_ucred, so);
	if (error) {
		goto out;
	}
#endif /* MAC */

	if (uap->fromlenaddr) {
		error = copyin(uap->fromlenaddr,
		    &fromlen, sizeof (fromlen));
		if (error) {
			goto out;
		}
	} else {
		fromlen = 0;
	}
	if (uap->msg_flags) {
		error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
		if (error) {
			goto out;
		}
	} else {
		msg_flags = 0;
	}
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	/* Total up the request, rejecting a sum that goes negative. */
	tiov = iov;
	for (i = 0; i <uap->iovlen; i++, tiov++) {
		if ((auio.uio_resid += tiov->iov_len) < 0) {
			error = EINVAL;
			goto out;
		}
	}
	len = auio.uio_resid;

	/*
	 * Start from a zeroed address buffer: if the protocol stores no
	 * source address, sa_len then reads as 0 and no stale kernel stack
	 * contents can be copied out to the caller below.
	 */
	memset(sockbufstore, 0, sizeof(sockbufstore));
	fromsa = (struct sockaddr *)sockbufstore;

#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif /* KTRACE */
	memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo));
	CURVNET_SET(so->so_vnet);
	error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
		    fromsa, fromlen, &msg_flags,
		    (struct sctp_sndrcvinfo *)&sinfo, 1);
	CURVNET_RESTORE();
	if (error) {
		/* A partial transfer that was interrupted counts as success. */
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	} else {
		if (uap->sinfo)
			error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
	}
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = len - auio.uio_resid;
		ktrgenio(uap->sd, UIO_READ, ktruio, error);
	}
#endif /* KTRACE */
	if (error)
		goto out;
	td->td_retval[0] = len - auio.uio_resid;

	if (fromlen && uap->from) {
		/*
		 * Report the address length through a socklen_t so that the
		 * copyout below transfers exactly the object the caller
		 * expects, independent of the width and byte order of
		 * ssize_t.
		 */
		if (fromlen <= 0)
			salen = 0;
		else {
			salen = (socklen_t)MIN(fromlen, fromsa->sa_len);
			error = copyout(fromsa, uap->from, (size_t)salen);
			if (error)
				goto out;
		}
		error = copyout(&salen, uap->fromlenaddr, sizeof (salen));
		if (error) {
			goto out;
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(fromsa);
#endif
	if (uap->msg_flags) {
		error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
		if (error) {
			goto out;
		}
	}
out:
	free(iov, M_IOV);
out1:
	if (fp)
		fdrop(fp, td);

	return (error);
#else  /* SCTP */
	return (EOPNOTSUPP);
#endif /* SCTP */
}
2936