xref: /freebsd/sys/kern/uipc_syscalls.c (revision 3fc9e2c36555140de248a0b4def91bbfa44d7c2c)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * sendfile(2) and related extensions:
6  * Copyright (c) 1998, David Greenman. All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 4. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
33  */
34 
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 
38 #include "opt_capsicum.h"
39 #include "opt_inet.h"
40 #include "opt_inet6.h"
41 #include "opt_sctp.h"
42 #include "opt_compat.h"
43 #include "opt_ktrace.h"
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/capability.h>
48 #include <sys/kernel.h>
49 #include <sys/lock.h>
50 #include <sys/mutex.h>
51 #include <sys/sysproto.h>
52 #include <sys/malloc.h>
53 #include <sys/filedesc.h>
54 #include <sys/event.h>
55 #include <sys/proc.h>
56 #include <sys/fcntl.h>
57 #include <sys/file.h>
58 #include <sys/filio.h>
59 #include <sys/jail.h>
60 #include <sys/mount.h>
61 #include <sys/mbuf.h>
62 #include <sys/protosw.h>
63 #include <sys/rwlock.h>
64 #include <sys/sf_buf.h>
65 #include <sys/sysent.h>
66 #include <sys/socket.h>
67 #include <sys/socketvar.h>
68 #include <sys/signalvar.h>
69 #include <sys/syscallsubr.h>
70 #include <sys/sysctl.h>
71 #include <sys/uio.h>
72 #include <sys/vnode.h>
73 #ifdef KTRACE
74 #include <sys/ktrace.h>
75 #endif
76 #ifdef COMPAT_FREEBSD32
77 #include <compat/freebsd32/freebsd32_util.h>
78 #endif
79 
80 #include <net/vnet.h>
81 
82 #include <security/audit/audit.h>
83 #include <security/mac/mac_framework.h>
84 
85 #include <vm/vm.h>
86 #include <vm/vm_param.h>
87 #include <vm/vm_object.h>
88 #include <vm/vm_page.h>
89 #include <vm/vm_pageout.h>
90 #include <vm/vm_kern.h>
91 #include <vm/vm_extern.h>
92 
93 #if defined(INET) || defined(INET6)
94 #ifdef SCTP
95 #include <netinet/sctp.h>
96 #include <netinet/sctp_peeloff.h>
97 #endif /* SCTP */
98 #endif /* INET || INET6 */
99 
100 /*
101  * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC
102  * and SOCK_NONBLOCK.
103  */
/*
 * ACCEPT4_INHERIT: kern_accept4() copies the listener's SS_NBIO state and
 * its sigio owner onto the accepted socket.
 * ACCEPT4_COMPAT: under COMPAT_OLDSOCK, accept1() stores the returned
 * address family in osockaddr layout before copying it out.
 */
104 #define	ACCEPT4_INHERIT	0x1
105 #define	ACCEPT4_COMPAT	0x2
106 
/* Forward declarations for file-local helpers. */
107 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
108 static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
109 
110 static int accept1(struct thread *td, int s, struct sockaddr *uname,
111 		   socklen_t *anamelen, int flags);
112 static int do_sendfile(struct thread *td, struct sendfile_args *uap,
113 		   int compat);
114 static int getsockname1(struct thread *td, struct getsockname_args *uap,
115 			int compat);
116 static int getpeername1(struct thread *td, struct getpeername_args *uap,
117 			int compat);
118 
/*
 * One counter per uint64_t-sized slot of struct sfstat; allocated by
 * sfstat_init() and exported/reset by sfstat_sysctl().
 */
119 counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
120 
121 /*
122  * sendfile(2)-related variables and associated sysctls
123  */
124 int nsfbufs;
125 int nsfbufspeak;
126 int nsfbufsused;
127 static int sfreadahead = 1;
128 
/*
 * kern.ipc.* knobs: nsfbufs is a read-only tunable, nsfbufspeak and
 * nsfbufsused are read-only, sfreadahead is writable at run time.
 */
129 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
130     "Maximum number of sendfile(2) sf_bufs available");
131 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
132     "Number of sendfile(2) sf_bufs at peak usage");
133 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
134     "Number of sendfile(2) sf_bufs in use");
135 SYSCTL_INT(_kern_ipc, OID_AUTO, sfreadahead, CTLFLAG_RW, &sfreadahead, 0,
136     "Number of sendfile(2) read-ahead MAXBSIZE blocks");
137 
138 
139 static void
140 sfstat_init(const void *unused)
141 {
142 
143 	COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
144 	    M_WAITOK);
145 }
146 SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);
147 
148 static int
149 sfstat_sysctl(SYSCTL_HANDLER_ARGS)
150 {
151 	struct sfstat s;
152 
153 	COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
154 	if (req->newptr)
155 		COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
156 	return (SYSCTL_OUT(req, &s, sizeof(s)));
157 }
158 SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
159     NULL, 0, sfstat_sysctl, "I", "sendfile statistics");
160 
161 /*
162  * Convert a user file descriptor to a kernel file entry and check if required
163  * capability rights are present.
164  * A reference on the file entry is held upon returning.
165  */
166 static int
167 getsock_cap(struct filedesc *fdp, int fd, cap_rights_t *rightsp,
168     struct file **fpp, u_int *fflagp)
169 {
170 	struct file *fp;
171 	int error;
172 
173 	error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
174 	if (error != 0)
175 		return (error);
176 	if (fp->f_type != DTYPE_SOCKET) {
177 		fdrop(fp, curthread);
178 		return (ENOTSOCK);
179 	}
180 	if (fflagp != NULL)
181 		*fflagp = fp->f_flag;
182 	*fpp = fp;
183 	return (0);
184 }
185 
186 /*
187  * System call interface to the socket abstraction.
188  */
189 #if defined(COMPAT_43)
190 #define COMPAT_OLDSOCK
191 #endif
192 
193 int
194 sys_socket(td, uap)
195 	struct thread *td;
196 	struct socket_args /* {
197 		int	domain;
198 		int	type;
199 		int	protocol;
200 	} */ *uap;
201 {
202 	struct socket *so;
203 	struct file *fp;
204 	int fd, error, type, oflag, fflag;
205 
206 	AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);
207 
208 	type = uap->type;
209 	oflag = 0;
210 	fflag = 0;
211 	if ((type & SOCK_CLOEXEC) != 0) {
212 		type &= ~SOCK_CLOEXEC;
213 		oflag |= O_CLOEXEC;
214 	}
215 	if ((type & SOCK_NONBLOCK) != 0) {
216 		type &= ~SOCK_NONBLOCK;
217 		fflag |= FNONBLOCK;
218 	}
219 
220 #ifdef MAC
221 	error = mac_socket_check_create(td->td_ucred, uap->domain, type,
222 	    uap->protocol);
223 	if (error != 0)
224 		return (error);
225 #endif
226 	error = falloc(td, &fp, &fd, oflag);
227 	if (error != 0)
228 		return (error);
229 	/* An extra reference on `fp' has been held for us by falloc(). */
230 	error = socreate(uap->domain, &so, type, uap->protocol,
231 	    td->td_ucred, td);
232 	if (error != 0) {
233 		fdclose(td->td_proc->p_fd, fp, fd, td);
234 	} else {
235 		finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
236 		if ((fflag & FNONBLOCK) != 0)
237 			(void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
238 		td->td_retval[0] = fd;
239 	}
240 	fdrop(fp, td);
241 	return (error);
242 }
243 
244 /* ARGSUSED */
245 int
246 sys_bind(td, uap)
247 	struct thread *td;
248 	struct bind_args /* {
249 		int	s;
250 		caddr_t	name;
251 		int	namelen;
252 	} */ *uap;
253 {
254 	struct sockaddr *sa;
255 	int error;
256 
257 	error = getsockaddr(&sa, uap->name, uap->namelen);
258 	if (error == 0) {
259 		error = kern_bind(td, uap->s, sa);
260 		free(sa, M_SONAME);
261 	}
262 	return (error);
263 }
264 
265 static int
266 kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
267 {
268 	struct socket *so;
269 	struct file *fp;
270 	cap_rights_t rights;
271 	int error;
272 
273 	AUDIT_ARG_FD(fd);
274 	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
275 	error = getsock_cap(td->td_proc->p_fd, fd,
276 	    cap_rights_init(&rights, CAP_BIND), &fp, NULL);
277 	if (error != 0)
278 		return (error);
279 	so = fp->f_data;
280 #ifdef KTRACE
281 	if (KTRPOINT(td, KTR_STRUCT))
282 		ktrsockaddr(sa);
283 #endif
284 #ifdef MAC
285 	error = mac_socket_check_bind(td->td_ucred, so, sa);
286 	if (error == 0) {
287 #endif
288 		if (dirfd == AT_FDCWD)
289 			error = sobind(so, sa, td);
290 		else
291 			error = sobindat(dirfd, so, sa, td);
292 #ifdef MAC
293 	}
294 #endif
295 	fdrop(fp, td);
296 	return (error);
297 }
298 
299 int
300 kern_bind(struct thread *td, int fd, struct sockaddr *sa)
301 {
302 
303 	return (kern_bindat(td, AT_FDCWD, fd, sa));
304 }
305 
306 /* ARGSUSED */
307 int
308 sys_bindat(td, uap)
309 	struct thread *td;
310 	struct bindat_args /* {
311 		int	fd;
312 		int	s;
313 		caddr_t	name;
314 		int	namelen;
315 	} */ *uap;
316 {
317 	struct sockaddr *sa;
318 	int error;
319 
320 	error = getsockaddr(&sa, uap->name, uap->namelen);
321 	if (error == 0) {
322 		error = kern_bindat(td, uap->fd, uap->s, sa);
323 		free(sa, M_SONAME);
324 	}
325 	return (error);
326 }
327 
328 /* ARGSUSED */
329 int
330 sys_listen(td, uap)
331 	struct thread *td;
332 	struct listen_args /* {
333 		int	s;
334 		int	backlog;
335 	} */ *uap;
336 {
337 	struct socket *so;
338 	struct file *fp;
339 	cap_rights_t rights;
340 	int error;
341 
342 	AUDIT_ARG_FD(uap->s);
343 	error = getsock_cap(td->td_proc->p_fd, uap->s,
344 	    cap_rights_init(&rights, CAP_LISTEN), &fp, NULL);
345 	if (error == 0) {
346 		so = fp->f_data;
347 #ifdef MAC
348 		error = mac_socket_check_listen(td->td_ucred, so);
349 		if (error == 0)
350 #endif
351 			error = solisten(so, uap->backlog, td);
352 		fdrop(fp, td);
353 	}
354 	return(error);
355 }
356 
357 /*
358  * accept1()
359  */
360 static int
361 accept1(td, s, uname, anamelen, flags)
362 	struct thread *td;
363 	int s;
364 	struct sockaddr *uname;
365 	socklen_t *anamelen;
366 	int flags;
367 {
368 	struct sockaddr *name;
369 	socklen_t namelen;
370 	struct file *fp;
371 	int error;
372 
373 	if (uname == NULL)
374 		return (kern_accept4(td, s, NULL, NULL, flags, NULL));
375 
376 	error = copyin(anamelen, &namelen, sizeof (namelen));
377 	if (error != 0)
378 		return (error);
379 
380 	error = kern_accept4(td, s, &name, &namelen, flags, &fp);
381 
382 	/*
383 	 * return a namelen of zero for older code which might
384 	 * ignore the return value from accept.
385 	 */
386 	if (error != 0) {
387 		(void) copyout(&namelen, anamelen, sizeof(*anamelen));
388 		return (error);
389 	}
390 
391 	if (error == 0 && uname != NULL) {
392 #ifdef COMPAT_OLDSOCK
393 		if (flags & ACCEPT4_COMPAT)
394 			((struct osockaddr *)name)->sa_family =
395 			    name->sa_family;
396 #endif
397 		error = copyout(name, uname, namelen);
398 	}
399 	if (error == 0)
400 		error = copyout(&namelen, anamelen,
401 		    sizeof(namelen));
402 	if (error != 0)
403 		fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
404 	fdrop(fp, td);
405 	free(name, M_SONAME);
406 	return (error);
407 }
408 
409 int
410 kern_accept(struct thread *td, int s, struct sockaddr **name,
411     socklen_t *namelen, struct file **fp)
412 {
413 	return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
414 }
415 
/*
 * Accept a pending connection on the listening socket behind descriptor
 * `s'.
 *
 * A new descriptor is allocated with falloc() and stored in
 * td->td_retval[0].  If `name' is non-NULL, the address produced by
 * soaccept() is handed to the caller through *name (accept1() frees it
 * with M_SONAME) and *namelen is clamped to its sa_len; *namelen is set
 * to 0 when soaccept() fails or yields no address.  If `fp' is non-NULL,
 * *fp receives a referenced file pointer for the new descriptor on
 * success and NULL on failure.
 *
 * `flags' may contain ACCEPT4_INHERIT, SOCK_CLOEXEC and SOCK_NONBLOCK.
 * Returns 0 or an errno value; on failure after falloc() the new
 * descriptor is closed again.
 */
416 int
417 kern_accept4(struct thread *td, int s, struct sockaddr **name,
418     socklen_t *namelen, int flags, struct file **fp)
419 {
420 	struct filedesc *fdp;
421 	struct file *headfp, *nfp = NULL;
422 	struct sockaddr *sa = NULL;
423 	struct socket *head, *so;
424 	cap_rights_t rights;
425 	u_int fflag;
426 	pid_t pgid;
427 	int error, fd, tmp;
428 
429 	if (name != NULL)
430 		*name = NULL;
431 
432 	AUDIT_ARG_FD(s);
433 	fdp = td->td_proc->p_fd;
434 	error = getsock_cap(fdp, s, cap_rights_init(&rights, CAP_ACCEPT),
435 	    &headfp, &fflag);
436 	if (error != 0)
437 		return (error);
438 	head = headfp->f_data;
	/* Only a socket that is listening (SO_ACCEPTCONN) can accept. */
439 	if ((head->so_options & SO_ACCEPTCONN) == 0) {
440 		error = EINVAL;
441 		goto done;
442 	}
443 #ifdef MAC
444 	error = mac_socket_check_accept(td->td_ucred, head);
445 	if (error != 0)
446 		goto done;
447 #endif
448 	error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0);
449 	if (error != 0)
450 		goto done;
451 	ACCEPT_LOCK();
	/* Non-blocking listener with nothing queued: fail immediately. */
452 	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
453 		ACCEPT_UNLOCK();
454 		error = EWOULDBLOCK;
455 		goto noconnection;
456 	}
	/*
	 * Sleep on so_timeo until the completed queue is non-empty or an
	 * error is posted on the listener.  A listener that can no longer
	 * receive is reported as ECONNABORTED.
	 */
457 	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
458 		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
459 			head->so_error = ECONNABORTED;
460 			break;
461 		}
462 		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
463 		    "accept", 0);
464 		if (error != 0) {
465 			ACCEPT_UNLOCK();
466 			goto noconnection;
467 		}
468 	}
	/* Report and clear a pending listener error. */
469 	if (head->so_error) {
470 		error = head->so_error;
471 		head->so_error = 0;
472 		ACCEPT_UNLOCK();
473 		goto noconnection;
474 	}
475 	so = TAILQ_FIRST(&head->so_comp);
476 	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
477 	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
478 
479 	/*
480 	 * Before changing the flags on the socket, we have to bump the
481 	 * reference count.  Otherwise, if the protocol calls sofree(),
482 	 * the socket will be released due to a zero refcount.
483 	 */
484 	SOCK_LOCK(so);			/* soref() and so_state update */
485 	soref(so);			/* file descriptor reference */
486 
	/*
	 * Detach the connection from the listener's completed queue and
	 * pick its non-blocking state: inherited from the listener with
	 * ACCEPT4_INHERIT, otherwise taken from SOCK_NONBLOCK.
	 */
487 	TAILQ_REMOVE(&head->so_comp, so, so_list);
488 	head->so_qlen--;
489 	if (flags & ACCEPT4_INHERIT)
490 		so->so_state |= (head->so_state & SS_NBIO);
491 	else
492 		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
493 	so->so_qstate &= ~SQ_COMP;
494 	so->so_head = NULL;
495 
496 	SOCK_UNLOCK(so);
497 	ACCEPT_UNLOCK();
498 
499 	/* An extra reference on `nfp' has been held for us by falloc(). */
500 	td->td_retval[0] = fd;
501 
502 	/* connection has been removed from the listen queue */
503 	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
504 
	/*
	 * With ACCEPT4_INHERIT the listener's sigio owner is propagated and
	 * the listener's file flags are reused as-is; otherwise FNONBLOCK
	 * and FASYNC are cleared and FNONBLOCK is set only on request.
	 */
505 	if (flags & ACCEPT4_INHERIT) {
506 		pgid = fgetown(&head->so_sigio);
507 		if (pgid != 0)
508 			fsetown(pgid, &so->so_sigio);
509 	} else {
510 		fflag &= ~(FNONBLOCK | FASYNC);
511 		if (flags & SOCK_NONBLOCK)
512 			fflag |= FNONBLOCK;
513 	}
514 
515 	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
516 	/* Sync socket nonblocking/async state with file flags */
517 	tmp = fflag & FNONBLOCK;
518 	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
519 	tmp = fflag & FASYNC;
520 	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
521 	sa = 0;
522 	error = soaccept(so, &sa);
523 	if (error != 0) {
524 		/*
525 		 * return a namelen of zero for older code which might
526 		 * ignore the return value from accept.
527 		 */
528 		if (name)
529 			*namelen = 0;
530 		goto noconnection;
531 	}
	/* Success, but the protocol supplied no address. */
532 	if (sa == NULL) {
533 		if (name)
534 			*namelen = 0;
535 		goto done;
536 	}
537 	AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
538 	if (name) {
539 		/* check sa_len before it is destroyed */
540 		if (*namelen > sa->sa_len)
541 			*namelen = sa->sa_len;
542 #ifdef KTRACE
543 		if (KTRPOINT(td, KTR_STRUCT))
544 			ktrsockaddr(sa);
545 #endif
		/* Ownership of the address moves to the caller. */
546 		*name = sa;
547 		sa = NULL;
548 	}
549 noconnection:
	/* `sa' is non-NULL here only if it was not handed to the caller. */
550 	free(sa, M_SONAME);
551 
552 	/*
553 	 * close the new descriptor, assuming someone hasn't ripped it
554 	 * out from under us.
555 	 */
556 	if (error != 0)
557 		fdclose(fdp, nfp, fd, td);
558 
559 	/*
560 	 * Release explicitly held references before returning.  We return
561 	 * a reference on nfp to the caller on success if they request it.
562 	 */
563 done:
564 	if (fp != NULL) {
565 		if (error == 0) {
566 			*fp = nfp;
567 			nfp = NULL;
568 		} else
569 			*fp = NULL;
570 	}
571 	if (nfp != NULL)
572 		fdrop(nfp, td);
573 	fdrop(headfp, td);
574 	return (error);
575 }
576 
577 int
578 sys_accept(td, uap)
579 	struct thread *td;
580 	struct accept_args *uap;
581 {
582 
583 	return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
584 }
585 
586 int
587 sys_accept4(td, uap)
588 	struct thread *td;
589 	struct accept4_args *uap;
590 {
591 
592 	if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
593 		return (EINVAL);
594 
595 	return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
596 }
597 
598 #ifdef COMPAT_OLDSOCK
599 int
600 oaccept(td, uap)
601 	struct thread *td;
602 	struct accept_args *uap;
603 {
604 
605 	return (accept1(td, uap->s, uap->name, uap->anamelen,
606 	    ACCEPT4_INHERIT | ACCEPT4_COMPAT));
607 }
608 #endif /* COMPAT_OLDSOCK */
609 
610 /* ARGSUSED */
611 int
612 sys_connect(td, uap)
613 	struct thread *td;
614 	struct connect_args /* {
615 		int	s;
616 		caddr_t	name;
617 		int	namelen;
618 	} */ *uap;
619 {
620 	struct sockaddr *sa;
621 	int error;
622 
623 	error = getsockaddr(&sa, uap->name, uap->namelen);
624 	if (error == 0) {
625 		error = kern_connect(td, uap->s, sa);
626 		free(sa, M_SONAME);
627 	}
628 	return (error);
629 }
630 
/*
 * Connect the socket behind descriptor `fd' to the kernel-space address
 * `sa'.  With dirfd == AT_FDCWD soconnect() is used, otherwise
 * soconnectat() with `dirfd'.  The descriptor must carry CAP_CONNECT.
 *
 * Returns EALREADY if a connect is already in progress, EINPROGRESS for a
 * non-blocking socket whose connect has not completed yet; otherwise the
 * thread sleeps until the attempt finishes and the socket's pending error
 * (cleared on read) is returned.  ERESTART is reported as EINTR.
 */
631 static int
632 kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
633 {
634 	struct socket *so;
635 	struct file *fp;
636 	cap_rights_t rights;
637 	int error, interrupted = 0;
638 
639 	AUDIT_ARG_FD(fd);
640 	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
641 	error = getsock_cap(td->td_proc->p_fd, fd,
642 	    cap_rights_init(&rights, CAP_CONNECT), &fp, NULL);
643 	if (error != 0)
644 		return (error);
645 	so = fp->f_data;
	/* A connect is already under way; leave SS_ISCONNECTING alone. */
646 	if (so->so_state & SS_ISCONNECTING) {
647 		error = EALREADY;
648 		goto done1;
649 	}
650 #ifdef KTRACE
651 	if (KTRPOINT(td, KTR_STRUCT))
652 		ktrsockaddr(sa);
653 #endif
654 #ifdef MAC
655 	error = mac_socket_check_connect(td->td_ucred, so, sa);
656 	if (error != 0)
657 		goto bad;
658 #endif
659 	if (dirfd == AT_FDCWD)
660 		error = soconnect(so, sa, td);
661 	else
662 		error = soconnectat(dirfd, so, sa, td);
663 	if (error != 0)
664 		goto bad;
	/* Non-blocking socket still connecting: report and return. */
665 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
666 		error = EINPROGRESS;
667 		goto done1;
668 	}
669 	SOCK_LOCK(so);
	/* Wait on so_timeo until the connect completes or an error is set. */
670 	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
671 		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
672 		    "connec", 0);
673 		if (error != 0) {
674 			if (error == EINTR || error == ERESTART)
675 				interrupted = 1;
676 			break;
677 		}
678 	}
679 	if (error == 0) {
680 		error = so->so_error;
681 		so->so_error = 0;
682 	}
683 	SOCK_UNLOCK(so);
684 bad:
	/*
	 * SS_ISCONNECTING stays set only when the sleep above ended with
	 * EINTR or ERESTART; in every other case reaching here it is cleared.
	 */
685 	if (!interrupted)
686 		so->so_state &= ~SS_ISCONNECTING;
687 	if (error == ERESTART)
688 		error = EINTR;
689 done1:
690 	fdrop(fp, td);
691 	return (error);
692 }
693 
694 int
695 kern_connect(struct thread *td, int fd, struct sockaddr *sa)
696 {
697 
698 	return (kern_connectat(td, AT_FDCWD, fd, sa));
699 }
700 
701 /* ARGSUSED */
702 int
703 sys_connectat(td, uap)
704 	struct thread *td;
705 	struct connectat_args /* {
706 		int	fd;
707 		int	s;
708 		caddr_t	name;
709 		int	namelen;
710 	} */ *uap;
711 {
712 	struct sockaddr *sa;
713 	int error;
714 
715 	error = getsockaddr(&sa, uap->name, uap->namelen);
716 	if (error == 0) {
717 		error = kern_connectat(td, uap->fd, uap->s, sa);
718 		free(sa, M_SONAME);
719 	}
720 	return (error);
721 }
722 
/*
 * Create a pair of connected sockets and install them in two new file
 * descriptors, returned in rsv[0] and rsv[1].
 *
 * SOCK_CLOEXEC and SOCK_NONBLOCK may be or'ed into `type'; they are
 * stripped before the sockets are created and applied to the descriptors
 * and files instead.  Returns 0 or an errno value; on failure everything
 * created so far is torn down again through the free4..free1 labels.
 */
723 int
724 kern_socketpair(struct thread *td, int domain, int type, int protocol,
725     int *rsv)
726 {
727 	struct filedesc *fdp = td->td_proc->p_fd;
728 	struct file *fp1, *fp2;
729 	struct socket *so1, *so2;
730 	int fd, error, oflag, fflag;
731 
732 	AUDIT_ARG_SOCKET(domain, type, protocol);
733 
734 	oflag = 0;
735 	fflag = 0;
736 	if ((type & SOCK_CLOEXEC) != 0) {
737 		type &= ~SOCK_CLOEXEC;
738 		oflag |= O_CLOEXEC;
739 	}
740 	if ((type & SOCK_NONBLOCK) != 0) {
741 		type &= ~SOCK_NONBLOCK;
742 		fflag |= FNONBLOCK;
743 	}
744 #ifdef MAC
745 	/* We might want to have a separate check for socket pairs. */
746 	error = mac_socket_check_create(td->td_ucred, domain, type,
747 	    protocol);
748 	if (error != 0)
749 		return (error);
750 #endif
751 	error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
752 	if (error != 0)
753 		return (error);
754 	error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
755 	if (error != 0)
756 		goto free1;
757 	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
758 	error = falloc(td, &fp1, &fd, oflag);
759 	if (error != 0)
760 		goto free2;
761 	rsv[0] = fd;
762 	fp1->f_data = so1;	/* so1 already has ref count */
763 	error = falloc(td, &fp2, &fd, oflag);
764 	if (error != 0)
765 		goto free3;
766 	fp2->f_data = so2;	/* so2 already has ref count */
767 	rsv[1] = fd;
768 	error = soconnect2(so1, so2);
769 	if (error != 0)
770 		goto free4;
771 	if (type == SOCK_DGRAM) {
772 		/*
773 		 * Datagram socket connection is asymmetric.
774 		 */
775 		 error = soconnect2(so2, so1);
776 		 if (error != 0)
777 			goto free4;
778 	}
	/* Both ends are connected: finish initializing the file entries. */
779 	finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
780 	    &socketops);
781 	finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
782 	    &socketops);
783 	if ((fflag & FNONBLOCK) != 0) {
784 		(void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
785 		(void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
786 	}
787 	fdrop(fp1, td);
788 	fdrop(fp2, td);
789 	return (0);
	/*
	 * Error unwinding, newest resource first: second descriptor, first
	 * descriptor, second socket, first socket.
	 */
790 free4:
791 	fdclose(fdp, fp2, rsv[1], td);
792 	fdrop(fp2, td);
793 free3:
794 	fdclose(fdp, fp1, rsv[0], td);
795 	fdrop(fp1, td);
796 free2:
797 	if (so2 != NULL)
798 		(void)soclose(so2);
799 free1:
800 	if (so1 != NULL)
801 		(void)soclose(so1);
802 	return (error);
803 }
804 
805 int
806 sys_socketpair(struct thread *td, struct socketpair_args *uap)
807 {
808 	int error, sv[2];
809 
810 	error = kern_socketpair(td, uap->domain, uap->type,
811 	    uap->protocol, sv);
812 	if (error != 0)
813 		return (error);
814 	error = copyout(sv, uap->rsv, 2 * sizeof(int));
815 	if (error != 0) {
816 		(void)kern_close(td, sv[0]);
817 		(void)kern_close(td, sv[1]);
818 	}
819 	return (error);
820 }
821 
822 static int
823 sendit(td, s, mp, flags)
824 	struct thread *td;
825 	int s;
826 	struct msghdr *mp;
827 	int flags;
828 {
829 	struct mbuf *control;
830 	struct sockaddr *to;
831 	int error;
832 
833 #ifdef CAPABILITY_MODE
834 	if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
835 		return (ECAPMODE);
836 #endif
837 
838 	if (mp->msg_name != NULL) {
839 		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
840 		if (error != 0) {
841 			to = NULL;
842 			goto bad;
843 		}
844 		mp->msg_name = to;
845 	} else {
846 		to = NULL;
847 	}
848 
849 	if (mp->msg_control) {
850 		if (mp->msg_controllen < sizeof(struct cmsghdr)
851 #ifdef COMPAT_OLDSOCK
852 		    && mp->msg_flags != MSG_COMPAT
853 #endif
854 		) {
855 			error = EINVAL;
856 			goto bad;
857 		}
858 		error = sockargs(&control, mp->msg_control,
859 		    mp->msg_controllen, MT_CONTROL);
860 		if (error != 0)
861 			goto bad;
862 #ifdef COMPAT_OLDSOCK
863 		if (mp->msg_flags == MSG_COMPAT) {
864 			struct cmsghdr *cm;
865 
866 			M_PREPEND(control, sizeof(*cm), M_WAITOK);
867 			cm = mtod(control, struct cmsghdr *);
868 			cm->cmsg_len = control->m_len;
869 			cm->cmsg_level = SOL_SOCKET;
870 			cm->cmsg_type = SCM_RIGHTS;
871 		}
872 #endif
873 	} else {
874 		control = NULL;
875 	}
876 
877 	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
878 
879 bad:
880 	free(to, M_SONAME);
881 	return (error);
882 }
883 
884 int
885 kern_sendit(td, s, mp, flags, control, segflg)
886 	struct thread *td;
887 	int s;
888 	struct msghdr *mp;
889 	int flags;
890 	struct mbuf *control;
891 	enum uio_seg segflg;
892 {
893 	struct file *fp;
894 	struct uio auio;
895 	struct iovec *iov;
896 	struct socket *so;
897 	cap_rights_t rights;
898 #ifdef KTRACE
899 	struct uio *ktruio = NULL;
900 #endif
901 	ssize_t len;
902 	int i, error;
903 
904 	AUDIT_ARG_FD(s);
905 	cap_rights_init(&rights, CAP_SEND);
906 	if (mp->msg_name != NULL) {
907 		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
908 		cap_rights_set(&rights, CAP_CONNECT);
909 	}
910 	error = getsock_cap(td->td_proc->p_fd, s, &rights, &fp, NULL);
911 	if (error != 0)
912 		return (error);
913 	so = (struct socket *)fp->f_data;
914 
915 #ifdef KTRACE
916 	if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
917 		ktrsockaddr(mp->msg_name);
918 #endif
919 #ifdef MAC
920 	if (mp->msg_name != NULL) {
921 		error = mac_socket_check_connect(td->td_ucred, so,
922 		    mp->msg_name);
923 		if (error != 0)
924 			goto bad;
925 	}
926 	error = mac_socket_check_send(td->td_ucred, so);
927 	if (error != 0)
928 		goto bad;
929 #endif
930 
931 	auio.uio_iov = mp->msg_iov;
932 	auio.uio_iovcnt = mp->msg_iovlen;
933 	auio.uio_segflg = segflg;
934 	auio.uio_rw = UIO_WRITE;
935 	auio.uio_td = td;
936 	auio.uio_offset = 0;			/* XXX */
937 	auio.uio_resid = 0;
938 	iov = mp->msg_iov;
939 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
940 		if ((auio.uio_resid += iov->iov_len) < 0) {
941 			error = EINVAL;
942 			goto bad;
943 		}
944 	}
945 #ifdef KTRACE
946 	if (KTRPOINT(td, KTR_GENIO))
947 		ktruio = cloneuio(&auio);
948 #endif
949 	len = auio.uio_resid;
950 	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
951 	if (error != 0) {
952 		if (auio.uio_resid != len && (error == ERESTART ||
953 		    error == EINTR || error == EWOULDBLOCK))
954 			error = 0;
955 		/* Generation of SIGPIPE can be controlled per socket */
956 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
957 		    !(flags & MSG_NOSIGNAL)) {
958 			PROC_LOCK(td->td_proc);
959 			tdsignal(td, SIGPIPE);
960 			PROC_UNLOCK(td->td_proc);
961 		}
962 	}
963 	if (error == 0)
964 		td->td_retval[0] = len - auio.uio_resid;
965 #ifdef KTRACE
966 	if (ktruio != NULL) {
967 		ktruio->uio_resid = td->td_retval[0];
968 		ktrgenio(s, UIO_WRITE, ktruio, error);
969 	}
970 #endif
971 bad:
972 	fdrop(fp, td);
973 	return (error);
974 }
975 
976 int
977 sys_sendto(td, uap)
978 	struct thread *td;
979 	struct sendto_args /* {
980 		int	s;
981 		caddr_t	buf;
982 		size_t	len;
983 		int	flags;
984 		caddr_t	to;
985 		int	tolen;
986 	} */ *uap;
987 {
988 	struct msghdr msg;
989 	struct iovec aiov;
990 
991 	msg.msg_name = uap->to;
992 	msg.msg_namelen = uap->tolen;
993 	msg.msg_iov = &aiov;
994 	msg.msg_iovlen = 1;
995 	msg.msg_control = 0;
996 #ifdef COMPAT_OLDSOCK
997 	msg.msg_flags = 0;
998 #endif
999 	aiov.iov_base = uap->buf;
1000 	aiov.iov_len = uap->len;
1001 	return (sendit(td, uap->s, &msg, uap->flags));
1002 }
1003 
1004 #ifdef COMPAT_OLDSOCK
1005 int
1006 osend(td, uap)
1007 	struct thread *td;
1008 	struct osend_args /* {
1009 		int	s;
1010 		caddr_t	buf;
1011 		int	len;
1012 		int	flags;
1013 	} */ *uap;
1014 {
1015 	struct msghdr msg;
1016 	struct iovec aiov;
1017 
1018 	msg.msg_name = 0;
1019 	msg.msg_namelen = 0;
1020 	msg.msg_iov = &aiov;
1021 	msg.msg_iovlen = 1;
1022 	aiov.iov_base = uap->buf;
1023 	aiov.iov_len = uap->len;
1024 	msg.msg_control = 0;
1025 	msg.msg_flags = 0;
1026 	return (sendit(td, uap->s, &msg, uap->flags));
1027 }
1028 
1029 int
1030 osendmsg(td, uap)
1031 	struct thread *td;
1032 	struct osendmsg_args /* {
1033 		int	s;
1034 		caddr_t	msg;
1035 		int	flags;
1036 	} */ *uap;
1037 {
1038 	struct msghdr msg;
1039 	struct iovec *iov;
1040 	int error;
1041 
1042 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1043 	if (error != 0)
1044 		return (error);
1045 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1046 	if (error != 0)
1047 		return (error);
1048 	msg.msg_iov = iov;
1049 	msg.msg_flags = MSG_COMPAT;
1050 	error = sendit(td, uap->s, &msg, uap->flags);
1051 	free(iov, M_IOV);
1052 	return (error);
1053 }
1054 #endif
1055 
1056 int
1057 sys_sendmsg(td, uap)
1058 	struct thread *td;
1059 	struct sendmsg_args /* {
1060 		int	s;
1061 		caddr_t	msg;
1062 		int	flags;
1063 	} */ *uap;
1064 {
1065 	struct msghdr msg;
1066 	struct iovec *iov;
1067 	int error;
1068 
1069 	error = copyin(uap->msg, &msg, sizeof (msg));
1070 	if (error != 0)
1071 		return (error);
1072 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1073 	if (error != 0)
1074 		return (error);
1075 	msg.msg_iov = iov;
1076 #ifdef COMPAT_OLDSOCK
1077 	msg.msg_flags = 0;
1078 #endif
1079 	error = sendit(td, uap->s, &msg, uap->flags);
1080 	free(iov, M_IOV);
1081 	return (error);
1082 }
1083 
/*
 * Receive a message on the socket behind descriptor `s' into the buffers
 * described by mp->msg_iov / msg_iovlen.  The descriptor must carry
 * CAP_RECV.
 *
 * If mp->msg_name is non-NULL the source address is copied to it (with
 * copyout() when `fromseg' is UIO_USERSPACE, with bcopy() otherwise) and
 * mp->msg_namelen is updated to the number of bytes stored.  Control
 * data is requested from soreceive() when either mp->msg_control or
 * `controlp' is set: with `controlp' non-NULL the mbuf chain is returned
 * through *controlp on success; otherwise it is copied out to
 * mp->msg_control, setting MSG_CTRUNC on truncation, and freed.
 *
 * On success the byte count is stored in td->td_retval[0].  A partial
 * transfer interrupted by ERESTART, EINTR or EWOULDBLOCK counts as
 * success.  Returns 0 or an errno value.
 */
1084 int
1085 kern_recvit(td, s, mp, fromseg, controlp)
1086 	struct thread *td;
1087 	int s;
1088 	struct msghdr *mp;
1089 	enum uio_seg fromseg;
1090 	struct mbuf **controlp;
1091 {
1092 	struct uio auio;
1093 	struct iovec *iov;
1094 	struct mbuf *m, *control = NULL;
1095 	caddr_t ctlbuf;
1096 	struct file *fp;
1097 	struct socket *so;
1098 	struct sockaddr *fromsa = NULL;
1099 	cap_rights_t rights;
1100 #ifdef KTRACE
1101 	struct uio *ktruio = NULL;
1102 #endif
1103 	ssize_t len;
1104 	int error, i;
1105 
1106 	if (controlp != NULL)
1107 		*controlp = NULL;
1108 
1109 	AUDIT_ARG_FD(s);
1110 	error = getsock_cap(td->td_proc->p_fd, s,
1111 	    cap_rights_init(&rights, CAP_RECV), &fp, NULL);
1112 	if (error != 0)
1113 		return (error);
1114 	so = fp->f_data;
1115 
1116 #ifdef MAC
1117 	error = mac_socket_check_receive(td->td_ucred, so);
1118 	if (error != 0) {
1119 		fdrop(fp, td);
1120 		return (error);
1121 	}
1122 #endif
1123 
1124 	auio.uio_iov = mp->msg_iov;
1125 	auio.uio_iovcnt = mp->msg_iovlen;
1126 	auio.uio_segflg = UIO_USERSPACE;
1127 	auio.uio_rw = UIO_READ;
1128 	auio.uio_td = td;
1129 	auio.uio_offset = 0;			/* XXX */
1130 	auio.uio_resid = 0;
1131 	iov = mp->msg_iov;
	/* Total the iovec lengths, rejecting a sum that goes negative. */
1132 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
1133 		if ((auio.uio_resid += iov->iov_len) < 0) {
1134 			fdrop(fp, td);
1135 			return (EINVAL);
1136 		}
1137 	}
1138 #ifdef KTRACE
1139 	if (KTRPOINT(td, KTR_GENIO))
1140 		ktruio = cloneuio(&auio);
1141 #endif
1142 	len = auio.uio_resid;
1143 	error = soreceive(so, &fromsa, &auio, NULL,
1144 	    (mp->msg_control || controlp) ? &control : NULL,
1145 	    &mp->msg_flags);
1146 	if (error != 0) {
		/* Some data arrived before the interruption: not an error. */
1147 		if (auio.uio_resid != len && (error == ERESTART ||
1148 		    error == EINTR || error == EWOULDBLOCK))
1149 			error = 0;
1150 	}
1151 	if (fromsa != NULL)
1152 		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
1153 #ifdef KTRACE
1154 	if (ktruio != NULL) {
1155 		ktruio->uio_resid = len - auio.uio_resid;
1156 		ktrgenio(s, UIO_READ, ktruio, error);
1157 	}
1158 #endif
1159 	if (error != 0)
1160 		goto out;
1161 	td->td_retval[0] = len - auio.uio_resid;
	/* Hand the source address back, truncated to the caller's buffer. */
1162 	if (mp->msg_name) {
1163 		len = mp->msg_namelen;
1164 		if (len <= 0 || fromsa == NULL)
1165 			len = 0;
1166 		else {
1167 			/* save sa_len before it is destroyed by MSG_COMPAT */
1168 			len = MIN(len, fromsa->sa_len);
1169 #ifdef COMPAT_OLDSOCK
1170 			if (mp->msg_flags & MSG_COMPAT)
1171 				((struct osockaddr *)fromsa)->sa_family =
1172 				    fromsa->sa_family;
1173 #endif
1174 			if (fromseg == UIO_USERSPACE) {
1175 				error = copyout(fromsa, mp->msg_name,
1176 				    (unsigned)len);
1177 				if (error != 0)
1178 					goto out;
1179 			} else
1180 				bcopy(fromsa, mp->msg_name, len);
1181 		}
1182 		mp->msg_namelen = len;
1183 	}
	/*
	 * Copy control data out to the caller's buffer only when the caller
	 * did not ask for the raw mbuf chain via `controlp'.
	 */
1184 	if (mp->msg_control && controlp == NULL) {
1185 #ifdef COMPAT_OLDSOCK
1186 		/*
1187 		 * We assume that old recvmsg calls won't receive access
1188 		 * rights and other control info, esp. as control info
1189 		 * is always optional and those options didn't exist in 4.3.
1190 		 * If we receive rights, trim the cmsghdr; anything else
1191 		 * is tossed.
1192 		 */
1193 		if (control && mp->msg_flags & MSG_COMPAT) {
1194 			if (mtod(control, struct cmsghdr *)->cmsg_level !=
1195 			    SOL_SOCKET ||
1196 			    mtod(control, struct cmsghdr *)->cmsg_type !=
1197 			    SCM_RIGHTS) {
1198 				mp->msg_controllen = 0;
1199 				goto out;
1200 			}
1201 			control->m_len -= sizeof (struct cmsghdr);
1202 			control->m_data += sizeof (struct cmsghdr);
1203 		}
1204 #endif
1205 		len = mp->msg_controllen;
1206 		m = control;
1207 		mp->msg_controllen = 0;
1208 		ctlbuf = mp->msg_control;
1209 
		/*
		 * Walk the control chain, copying each mbuf until the chain
		 * or the caller's buffer is exhausted; a short buffer sets
		 * MSG_CTRUNC.
		 */
1210 		while (m && len > 0) {
1211 			unsigned int tocopy;
1212 
1213 			if (len >= m->m_len)
1214 				tocopy = m->m_len;
1215 			else {
1216 				mp->msg_flags |= MSG_CTRUNC;
1217 				tocopy = len;
1218 			}
1219 
1220 			if ((error = copyout(mtod(m, caddr_t),
1221 					ctlbuf, tocopy)) != 0)
1222 				goto out;
1223 
1224 			ctlbuf += tocopy;
1225 			len -= tocopy;
1226 			m = m->m_next;
1227 		}
		/* Report how many control bytes were actually stored. */
1228 		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
1229 	}
1230 out:
1231 	fdrop(fp, td);
1232 #ifdef KTRACE
1233 	if (fromsa && KTRPOINT(td, KTR_STRUCT))
1234 		ktrsockaddr(fromsa);
1235 #endif
1236 	free(fromsa, M_SONAME);
1237 
	/*
	 * The control chain goes to the caller only on success and only if
	 * requested through `controlp'; otherwise it is freed here.
	 */
1238 	if (error == 0 && controlp != NULL)
1239 		*controlp = control;
1240 	else  if (control)
1241 		m_freem(control);
1242 
1243 	return (error);
1244 }
1245 
1246 static int
1247 recvit(td, s, mp, namelenp)
1248 	struct thread *td;
1249 	int s;
1250 	struct msghdr *mp;
1251 	void *namelenp;
1252 {
1253 	int error;
1254 
1255 	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
1256 	if (error != 0)
1257 		return (error);
1258 	if (namelenp != NULL) {
1259 		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
1260 #ifdef COMPAT_OLDSOCK
1261 		if (mp->msg_flags & MSG_COMPAT)
1262 			error = 0;	/* old recvfrom didn't check */
1263 #endif
1264 	}
1265 	return (error);
1266 }
1267 
1268 int
1269 sys_recvfrom(td, uap)
1270 	struct thread *td;
1271 	struct recvfrom_args /* {
1272 		int	s;
1273 		caddr_t	buf;
1274 		size_t	len;
1275 		int	flags;
1276 		struct sockaddr * __restrict	from;
1277 		socklen_t * __restrict fromlenaddr;
1278 	} */ *uap;
1279 {
1280 	struct msghdr msg;
1281 	struct iovec aiov;
1282 	int error;
1283 
1284 	if (uap->fromlenaddr) {
1285 		error = copyin(uap->fromlenaddr,
1286 		    &msg.msg_namelen, sizeof (msg.msg_namelen));
1287 		if (error != 0)
1288 			goto done2;
1289 	} else {
1290 		msg.msg_namelen = 0;
1291 	}
1292 	msg.msg_name = uap->from;
1293 	msg.msg_iov = &aiov;
1294 	msg.msg_iovlen = 1;
1295 	aiov.iov_base = uap->buf;
1296 	aiov.iov_len = uap->len;
1297 	msg.msg_control = 0;
1298 	msg.msg_flags = uap->flags;
1299 	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
1300 done2:
1301 	return (error);
1302 }
1303 
#ifdef COMPAT_OLDSOCK
/*
 * Old recvfrom: identical to sys_recvfrom() except that MSG_COMPAT is
 * forced on in the caller-supplied flags first.
 */
int
orecvfrom(struct thread *td, struct recvfrom_args *uap)
{

	uap->flags = uap->flags | MSG_COMPAT;
	return (sys_recvfrom(td, uap));
}
#endif
1315 
1316 #ifdef COMPAT_OLDSOCK
1317 int
1318 orecv(td, uap)
1319 	struct thread *td;
1320 	struct orecv_args /* {
1321 		int	s;
1322 		caddr_t	buf;
1323 		int	len;
1324 		int	flags;
1325 	} */ *uap;
1326 {
1327 	struct msghdr msg;
1328 	struct iovec aiov;
1329 
1330 	msg.msg_name = 0;
1331 	msg.msg_namelen = 0;
1332 	msg.msg_iov = &aiov;
1333 	msg.msg_iovlen = 1;
1334 	aiov.iov_base = uap->buf;
1335 	aiov.iov_len = uap->len;
1336 	msg.msg_control = 0;
1337 	msg.msg_flags = uap->flags;
1338 	return (recvit(td, uap->s, &msg, NULL));
1339 }
1340 
1341 /*
1342  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
1343  * overlays the new one, missing only the flags, and with the (old) access
1344  * rights where the control fields are now.
1345  */
1346 int
1347 orecvmsg(td, uap)
1348 	struct thread *td;
1349 	struct orecvmsg_args /* {
1350 		int	s;
1351 		struct	omsghdr *msg;
1352 		int	flags;
1353 	} */ *uap;
1354 {
1355 	struct msghdr msg;
1356 	struct iovec *iov;
1357 	int error;
1358 
1359 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1360 	if (error != 0)
1361 		return (error);
1362 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1363 	if (error != 0)
1364 		return (error);
1365 	msg.msg_flags = uap->flags | MSG_COMPAT;
1366 	msg.msg_iov = iov;
1367 	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
1368 	if (msg.msg_controllen && error == 0)
1369 		error = copyout(&msg.msg_controllen,
1370 		    &uap->msg->msg_accrightslen, sizeof (int));
1371 	free(iov, M_IOV);
1372 	return (error);
1373 }
1374 #endif
1375 
1376 int
1377 sys_recvmsg(td, uap)
1378 	struct thread *td;
1379 	struct recvmsg_args /* {
1380 		int	s;
1381 		struct	msghdr *msg;
1382 		int	flags;
1383 	} */ *uap;
1384 {
1385 	struct msghdr msg;
1386 	struct iovec *uiov, *iov;
1387 	int error;
1388 
1389 	error = copyin(uap->msg, &msg, sizeof (msg));
1390 	if (error != 0)
1391 		return (error);
1392 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1393 	if (error != 0)
1394 		return (error);
1395 	msg.msg_flags = uap->flags;
1396 #ifdef COMPAT_OLDSOCK
1397 	msg.msg_flags &= ~MSG_COMPAT;
1398 #endif
1399 	uiov = msg.msg_iov;
1400 	msg.msg_iov = iov;
1401 	error = recvit(td, uap->s, &msg, NULL);
1402 	if (error == 0) {
1403 		msg.msg_iov = uiov;
1404 		error = copyout(&msg, uap->msg, sizeof(msg));
1405 	}
1406 	free(iov, M_IOV);
1407 	return (error);
1408 }
1409 
1410 /* ARGSUSED */
1411 int
1412 sys_shutdown(td, uap)
1413 	struct thread *td;
1414 	struct shutdown_args /* {
1415 		int	s;
1416 		int	how;
1417 	} */ *uap;
1418 {
1419 	struct socket *so;
1420 	struct file *fp;
1421 	cap_rights_t rights;
1422 	int error;
1423 
1424 	AUDIT_ARG_FD(uap->s);
1425 	error = getsock_cap(td->td_proc->p_fd, uap->s,
1426 	    cap_rights_init(&rights, CAP_SHUTDOWN), &fp, NULL);
1427 	if (error == 0) {
1428 		so = fp->f_data;
1429 		error = soshutdown(so, uap->how);
1430 		fdrop(fp, td);
1431 	}
1432 	return (error);
1433 }
1434 
1435 /* ARGSUSED */
1436 int
1437 sys_setsockopt(td, uap)
1438 	struct thread *td;
1439 	struct setsockopt_args /* {
1440 		int	s;
1441 		int	level;
1442 		int	name;
1443 		caddr_t	val;
1444 		int	valsize;
1445 	} */ *uap;
1446 {
1447 
1448 	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
1449 	    uap->val, UIO_USERSPACE, uap->valsize));
1450 }
1451 
1452 int
1453 kern_setsockopt(td, s, level, name, val, valseg, valsize)
1454 	struct thread *td;
1455 	int s;
1456 	int level;
1457 	int name;
1458 	void *val;
1459 	enum uio_seg valseg;
1460 	socklen_t valsize;
1461 {
1462 	struct socket *so;
1463 	struct file *fp;
1464 	struct sockopt sopt;
1465 	cap_rights_t rights;
1466 	int error;
1467 
1468 	if (val == NULL && valsize != 0)
1469 		return (EFAULT);
1470 	if ((int)valsize < 0)
1471 		return (EINVAL);
1472 
1473 	sopt.sopt_dir = SOPT_SET;
1474 	sopt.sopt_level = level;
1475 	sopt.sopt_name = name;
1476 	sopt.sopt_val = val;
1477 	sopt.sopt_valsize = valsize;
1478 	switch (valseg) {
1479 	case UIO_USERSPACE:
1480 		sopt.sopt_td = td;
1481 		break;
1482 	case UIO_SYSSPACE:
1483 		sopt.sopt_td = NULL;
1484 		break;
1485 	default:
1486 		panic("kern_setsockopt called with bad valseg");
1487 	}
1488 
1489 	AUDIT_ARG_FD(s);
1490 	error = getsock_cap(td->td_proc->p_fd, s,
1491 	    cap_rights_init(&rights, CAP_SETSOCKOPT), &fp, NULL);
1492 	if (error == 0) {
1493 		so = fp->f_data;
1494 		error = sosetopt(so, &sopt);
1495 		fdrop(fp, td);
1496 	}
1497 	return(error);
1498 }
1499 
1500 /* ARGSUSED */
1501 int
1502 sys_getsockopt(td, uap)
1503 	struct thread *td;
1504 	struct getsockopt_args /* {
1505 		int	s;
1506 		int	level;
1507 		int	name;
1508 		void * __restrict	val;
1509 		socklen_t * __restrict avalsize;
1510 	} */ *uap;
1511 {
1512 	socklen_t valsize;
1513 	int error;
1514 
1515 	if (uap->val) {
1516 		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
1517 		if (error != 0)
1518 			return (error);
1519 	}
1520 
1521 	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
1522 	    uap->val, UIO_USERSPACE, &valsize);
1523 
1524 	if (error == 0)
1525 		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
1526 	return (error);
1527 }
1528 
1529 /*
1530  * Kernel version of getsockopt.
1531  * optval can be a userland or userspace. optlen is always a kernel pointer.
1532  */
1533 int
1534 kern_getsockopt(td, s, level, name, val, valseg, valsize)
1535 	struct thread *td;
1536 	int s;
1537 	int level;
1538 	int name;
1539 	void *val;
1540 	enum uio_seg valseg;
1541 	socklen_t *valsize;
1542 {
1543 	struct socket *so;
1544 	struct file *fp;
1545 	struct sockopt sopt;
1546 	cap_rights_t rights;
1547 	int error;
1548 
1549 	if (val == NULL)
1550 		*valsize = 0;
1551 	if ((int)*valsize < 0)
1552 		return (EINVAL);
1553 
1554 	sopt.sopt_dir = SOPT_GET;
1555 	sopt.sopt_level = level;
1556 	sopt.sopt_name = name;
1557 	sopt.sopt_val = val;
1558 	sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
1559 	switch (valseg) {
1560 	case UIO_USERSPACE:
1561 		sopt.sopt_td = td;
1562 		break;
1563 	case UIO_SYSSPACE:
1564 		sopt.sopt_td = NULL;
1565 		break;
1566 	default:
1567 		panic("kern_getsockopt called with bad valseg");
1568 	}
1569 
1570 	AUDIT_ARG_FD(s);
1571 	error = getsock_cap(td->td_proc->p_fd, s,
1572 	    cap_rights_init(&rights, CAP_GETSOCKOPT), &fp, NULL);
1573 	if (error == 0) {
1574 		so = fp->f_data;
1575 		error = sogetopt(so, &sopt);
1576 		*valsize = sopt.sopt_valsize;
1577 		fdrop(fp, td);
1578 	}
1579 	return (error);
1580 }
1581 
1582 /*
1583  * getsockname1() - Get socket name.
1584  */
1585 /* ARGSUSED */
1586 static int
1587 getsockname1(td, uap, compat)
1588 	struct thread *td;
1589 	struct getsockname_args /* {
1590 		int	fdes;
1591 		struct sockaddr * __restrict asa;
1592 		socklen_t * __restrict alen;
1593 	} */ *uap;
1594 	int compat;
1595 {
1596 	struct sockaddr *sa;
1597 	socklen_t len;
1598 	int error;
1599 
1600 	error = copyin(uap->alen, &len, sizeof(len));
1601 	if (error != 0)
1602 		return (error);
1603 
1604 	error = kern_getsockname(td, uap->fdes, &sa, &len);
1605 	if (error != 0)
1606 		return (error);
1607 
1608 	if (len != 0) {
1609 #ifdef COMPAT_OLDSOCK
1610 		if (compat)
1611 			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1612 #endif
1613 		error = copyout(sa, uap->asa, (u_int)len);
1614 	}
1615 	free(sa, M_SONAME);
1616 	if (error == 0)
1617 		error = copyout(&len, uap->alen, sizeof(len));
1618 	return (error);
1619 }
1620 
1621 int
1622 kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
1623     socklen_t *alen)
1624 {
1625 	struct socket *so;
1626 	struct file *fp;
1627 	cap_rights_t rights;
1628 	socklen_t len;
1629 	int error;
1630 
1631 	AUDIT_ARG_FD(fd);
1632 	error = getsock_cap(td->td_proc->p_fd, fd,
1633 	    cap_rights_init(&rights, CAP_GETSOCKNAME), &fp, NULL);
1634 	if (error != 0)
1635 		return (error);
1636 	so = fp->f_data;
1637 	*sa = NULL;
1638 	CURVNET_SET(so->so_vnet);
1639 	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
1640 	CURVNET_RESTORE();
1641 	if (error != 0)
1642 		goto bad;
1643 	if (*sa == NULL)
1644 		len = 0;
1645 	else
1646 		len = MIN(*alen, (*sa)->sa_len);
1647 	*alen = len;
1648 #ifdef KTRACE
1649 	if (KTRPOINT(td, KTR_STRUCT))
1650 		ktrsockaddr(*sa);
1651 #endif
1652 bad:
1653 	fdrop(fp, td);
1654 	if (error != 0 && *sa != NULL) {
1655 		free(*sa, M_SONAME);
1656 		*sa = NULL;
1657 	}
1658 	return (error);
1659 }
1660 
/*
 * getsockname(2) system call: getsockname1() without the old-sockaddr
 * compatibility conversion.
 */
int
sys_getsockname(struct thread *td, struct getsockname_args *uap)
{
	int error;

	error = getsockname1(td, uap, 0);
	return (error);
}
1669 
#ifdef COMPAT_OLDSOCK
/*
 * Old getsockname: getsockname1() with the old-sockaddr compatibility
 * conversion enabled.
 */
int
ogetsockname(struct thread *td, struct getsockname_args *uap)
{
	int error;

	error = getsockname1(td, uap, 1);
	return (error);
}
#endif /* COMPAT_OLDSOCK */
1680 
1681 /*
1682  * getpeername1() - Get name of peer for connected socket.
1683  */
1684 /* ARGSUSED */
1685 static int
1686 getpeername1(td, uap, compat)
1687 	struct thread *td;
1688 	struct getpeername_args /* {
1689 		int	fdes;
1690 		struct sockaddr * __restrict	asa;
1691 		socklen_t * __restrict	alen;
1692 	} */ *uap;
1693 	int compat;
1694 {
1695 	struct sockaddr *sa;
1696 	socklen_t len;
1697 	int error;
1698 
1699 	error = copyin(uap->alen, &len, sizeof (len));
1700 	if (error != 0)
1701 		return (error);
1702 
1703 	error = kern_getpeername(td, uap->fdes, &sa, &len);
1704 	if (error != 0)
1705 		return (error);
1706 
1707 	if (len != 0) {
1708 #ifdef COMPAT_OLDSOCK
1709 		if (compat)
1710 			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1711 #endif
1712 		error = copyout(sa, uap->asa, (u_int)len);
1713 	}
1714 	free(sa, M_SONAME);
1715 	if (error == 0)
1716 		error = copyout(&len, uap->alen, sizeof(len));
1717 	return (error);
1718 }
1719 
1720 int
1721 kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
1722     socklen_t *alen)
1723 {
1724 	struct socket *so;
1725 	struct file *fp;
1726 	cap_rights_t rights;
1727 	socklen_t len;
1728 	int error;
1729 
1730 	AUDIT_ARG_FD(fd);
1731 	error = getsock_cap(td->td_proc->p_fd, fd,
1732 	    cap_rights_init(&rights, CAP_GETPEERNAME), &fp, NULL);
1733 	if (error != 0)
1734 		return (error);
1735 	so = fp->f_data;
1736 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1737 		error = ENOTCONN;
1738 		goto done;
1739 	}
1740 	*sa = NULL;
1741 	CURVNET_SET(so->so_vnet);
1742 	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
1743 	CURVNET_RESTORE();
1744 	if (error != 0)
1745 		goto bad;
1746 	if (*sa == NULL)
1747 		len = 0;
1748 	else
1749 		len = MIN(*alen, (*sa)->sa_len);
1750 	*alen = len;
1751 #ifdef KTRACE
1752 	if (KTRPOINT(td, KTR_STRUCT))
1753 		ktrsockaddr(*sa);
1754 #endif
1755 bad:
1756 	if (error != 0 && *sa != NULL) {
1757 		free(*sa, M_SONAME);
1758 		*sa = NULL;
1759 	}
1760 done:
1761 	fdrop(fp, td);
1762 	return (error);
1763 }
1764 
/*
 * getpeername(2) system call: getpeername1() without the old-sockaddr
 * compatibility conversion.
 */
int
sys_getpeername(struct thread *td, struct getpeername_args *uap)
{
	int error;

	error = getpeername1(td, uap, 0);
	return (error);
}
1773 
#ifdef COMPAT_OLDSOCK
/*
 * Old getpeername: getpeername1() with the old-sockaddr compatibility
 * conversion enabled.
 */
int
ogetpeername(struct thread *td, struct ogetpeername_args *uap)
{
	struct getpeername_args *args;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	args = (struct getpeername_args *)uap;
	return (getpeername1(td, args, 1));
}
#endif /* COMPAT_OLDSOCK */
1785 
1786 int
1787 sockargs(mp, buf, buflen, type)
1788 	struct mbuf **mp;
1789 	caddr_t buf;
1790 	int buflen, type;
1791 {
1792 	struct sockaddr *sa;
1793 	struct mbuf *m;
1794 	int error;
1795 
1796 	if (buflen > MLEN) {
1797 #ifdef COMPAT_OLDSOCK
1798 		if (type == MT_SONAME && buflen <= 112)
1799 			buflen = MLEN;		/* unix domain compat. hack */
1800 		else
1801 #endif
1802 			if (buflen > MCLBYTES)
1803 				return (EINVAL);
1804 	}
1805 	m = m_get2(buflen, M_WAITOK, type, 0);
1806 	m->m_len = buflen;
1807 	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1808 	if (error != 0)
1809 		(void) m_free(m);
1810 	else {
1811 		*mp = m;
1812 		if (type == MT_SONAME) {
1813 			sa = mtod(m, struct sockaddr *);
1814 
1815 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1816 			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1817 				sa->sa_family = sa->sa_len;
1818 #endif
1819 			sa->sa_len = buflen;
1820 		}
1821 	}
1822 	return (error);
1823 }
1824 
1825 int
1826 getsockaddr(namp, uaddr, len)
1827 	struct sockaddr **namp;
1828 	caddr_t uaddr;
1829 	size_t len;
1830 {
1831 	struct sockaddr *sa;
1832 	int error;
1833 
1834 	if (len > SOCK_MAXADDRLEN)
1835 		return (ENAMETOOLONG);
1836 	if (len < offsetof(struct sockaddr, sa_data[0]))
1837 		return (EINVAL);
1838 	sa = malloc(len, M_SONAME, M_WAITOK);
1839 	error = copyin(uaddr, sa, len);
1840 	if (error != 0) {
1841 		free(sa, M_SONAME);
1842 	} else {
1843 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1844 		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1845 			sa->sa_family = sa->sa_len;
1846 #endif
1847 		sa->sa_len = len;
1848 		*namp = sa;
1849 	}
1850 	return (error);
1851 }
1852 
1853 #include <sys/condvar.h>
1854 
/*
 * Bookkeeping shared between vn_sendfile() and sf_buf_mext() when SF_SYNC
 * is requested: vn_sendfile() increments 'count' for every sf_buf-backed
 * mbuf it builds, sf_buf_mext() decrements it when such a buffer is
 * released, and vn_sendfile() sleeps on 'cv' before returning if the count
 * has not yet reached zero.
 */
1855 struct sendfile_sync {
1856 	struct mtx	mtx;	/* held around every access to 'count' */
1857 	struct cv	cv;	/* signalled when 'count' drops to zero */
1858 	unsigned	count;	/* sf_bufs handed out and not yet released */
1859 };
1860 
1861 /*
1862  * Detach mapped page and release resources back to the system.
1863  */
1864 int
1865 sf_buf_mext(struct mbuf *mb, void *addr, void *args)
1866 {
1867 	vm_page_t m;
1868 	struct sendfile_sync *sfs;
1869 
1870 	m = sf_buf_page(args);
1871 	sf_buf_free(args);
1872 	vm_page_lock(m);
1873 	vm_page_unwire(m, 0);
1874 	/*
1875 	 * Check for the object going away on us. This can
1876 	 * happen since we don't hold a reference to it.
1877 	 * If so, we're responsible for freeing the page.
1878 	 */
1879 	if (m->wire_count == 0 && m->object == NULL)
1880 		vm_page_free(m);
1881 	vm_page_unlock(m);
1882 	if (addr == NULL)
1883 		return (EXT_FREE_OK);
1884 	sfs = addr;
1885 	mtx_lock(&sfs->mtx);
1886 	KASSERT(sfs->count> 0, ("Sendfile sync botchup count == 0"));
1887 	if (--sfs->count == 0)
1888 		cv_signal(&sfs->cv);
1889 	mtx_unlock(&sfs->mtx);
1890 	return (EXT_FREE_OK);
1891 }
1892 
/*
 * sendfile(2)
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Transmit the file referenced by 'fd', beginning at 'offset', over the
 * socket 's'.  'nbytes' bounds the amount of file data sent; 0 means "up
 * to EOF".  A header and/or trailer may optionally be added to the socket
 * output, and the total number of bytes sent is written to *sbytes when
 * that pointer is supplied.
 */
int
sys_sendfile(struct thread *td, struct sendfile_args *uap)
{
	int error;

	error = do_sendfile(td, uap, 0);
	return (error);
}
1910 
1911 static int
1912 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
1913 {
1914 	struct sf_hdtr hdtr;
1915 	struct uio *hdr_uio, *trl_uio;
1916 	struct file *fp;
1917 	cap_rights_t rights;
1918 	int error;
1919 
1920 	if (uap->offset < 0)
1921 		return (EINVAL);
1922 
1923 	hdr_uio = trl_uio = NULL;
1924 
1925 	if (uap->hdtr != NULL) {
1926 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1927 		if (error != 0)
1928 			goto out;
1929 		if (hdtr.headers != NULL) {
1930 			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
1931 			if (error != 0)
1932 				goto out;
1933 		}
1934 		if (hdtr.trailers != NULL) {
1935 			error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
1936 			if (error != 0)
1937 				goto out;
1938 
1939 		}
1940 	}
1941 
1942 	AUDIT_ARG_FD(uap->fd);
1943 
1944 	/*
1945 	 * sendfile(2) can start at any offset within a file so we require
1946 	 * CAP_READ+CAP_SEEK = CAP_PREAD.
1947 	 */
1948 	if ((error = fget_read(td, uap->fd,
1949 	    cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) {
1950 		goto out;
1951 	}
1952 
1953 	error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset,
1954 	    uap->nbytes, uap->sbytes, uap->flags, compat ? SFK_COMPAT : 0, td);
1955 	fdrop(fp, td);
1956 
1957 out:
1958 	free(hdr_uio, M_IOV);
1959 	free(trl_uio, M_IOV);
1960 	return (error);
1961 }
1962 
#ifdef COMPAT_FREEBSD4
/*
 * FreeBSD 4 compatible sendfile: repackages the arguments as a
 * struct sendfile_args and runs do_sendfile() in compat mode.
 */
int
freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
{
	struct sendfile_args args;
	int error;

	args.flags = uap->flags;
	args.sbytes = uap->sbytes;
	args.hdtr = uap->hdtr;
	args.nbytes = uap->nbytes;
	args.offset = uap->offset;
	args.s = uap->s;
	args.fd = uap->fd;

	error = do_sendfile(td, &args, 1);
	return (error);
}
#endif /* COMPAT_FREEBSD4 */
1980 
/*
 * vn_sendfile() - send the contents of the vnode behind 'fp' to the socket
 * 'sockfd', optionally preceded by the data described by 'hdr_uio' and
 * followed by the data described by 'trl_uio'.
 *
 * Transmission starts at file offset 'offset'.  'nbytes' limits the number
 * of file bytes sent, 0 meaning "up to the end of the file".  'flags'
 * carries the caller's SF_* flags (SF_MNOWAIT, SF_SYNC and SF_NODISKIO are
 * examined here) and 'kflags' the SFK_* flags (only SFK_COMPAT is
 * examined).  When 'sent' is not NULL the running byte total 'sbytes' is
 * copied out to it on every return path; the result of that copyout() is
 * not checked.
 *
 * The vnode must be a regular file with a backing VM object that is not
 * marked OBJ_DEAD (EINVAL otherwise).  The socket must be of type
 * SOCK_STREAM (EINVAL) and connected (ENOTCONN).
 *
 * Returns 0 or an errno value; ERESTART is converted to EINTR.
 */
1981 int
1982 vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
1983     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
1984     int kflags, struct thread *td)
1985 {
1986 	struct vnode *vp = fp->f_vnode;
1987 	struct file *sock_fp;
1988 	struct vm_object *obj = NULL;
1989 	struct socket *so = NULL;
1990 	struct mbuf *m = NULL;
1991 	struct sf_buf *sf;
1992 	struct vm_page *pg;
1993 	struct vattr va;
1994 	struct sendfile_sync *sfs = NULL;
1995 	cap_rights_t rights;
1996 	off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0;
1997 	int bsize, error, hdrlen = 0, mnw = 0;
1998 
	/*
	 * Validate the source file and, for a regular file, take a
	 * reference on its VM object.  'obj' stays NULL for anything that
	 * cannot be sent, which is rejected right after the unlock.
	 */
1999 	vn_lock(vp, LK_SHARED | LK_RETRY);
2000 	if (vp->v_type == VREG) {
2001 		bsize = vp->v_mount->mnt_stat.f_iosize;
2002 		if (nbytes == 0) {
2003 			error = VOP_GETATTR(vp, &va, td->td_ucred);
2004 			if (error != 0) {
2005 				VOP_UNLOCK(vp, 0);
2006 				obj = NULL;
2007 				goto out;
2008 			}
2009 			rem = va.va_size;
2010 		} else
2011 			rem = nbytes;
2012 		obj = vp->v_object;
2013 		if (obj != NULL) {
2014 			/*
2015 			 * Temporarily increase the backing VM
2016 			 * object's reference count so that a forced
2017 			 * reclamation of its vnode does not
2018 			 * immediately destroy it.
2019 			 */
2020 			VM_OBJECT_WLOCK(obj);
2021 			if ((obj->flags & OBJ_DEAD) == 0) {
2022 				vm_object_reference_locked(obj);
2023 				VM_OBJECT_WUNLOCK(obj);
2024 			} else {
2025 				VM_OBJECT_WUNLOCK(obj);
2026 				obj = NULL;
2027 			}
2028 		}
2029 	} else
2030 		bsize = 0;	/* silence gcc */
2031 	VOP_UNLOCK(vp, 0);
2032 	if (obj == NULL) {
2033 		error = EINVAL;
2034 		goto out;
2035 	}
2036 
2037 	/*
2038 	 * The socket must be a stream socket and connected.
2039 	 * Remember if it a blocking or non-blocking socket.
2040 	 */
2041 	error = getsock_cap(td->td_proc->p_fd, sockfd,
2042 	    cap_rights_init(&rights, CAP_SEND), &sock_fp, NULL);
2043 	if (error != 0)
2044 		goto out;
2045 	so = sock_fp->f_data;
2046 	if (so->so_type != SOCK_STREAM) {
2047 		error = EINVAL;
2048 		goto out;
2049 	}
2050 	if ((so->so_state & SS_ISCONNECTED) == 0) {
2051 		error = ENOTCONN;
2052 		goto out;
2053 	}
2054 	/*
2055 	 * Do not wait on memory allocations but return ENOMEM for
2056 	 * caller to retry later.
2057 	 * XXX: Experimental.
2058 	 */
2059 	if (flags & SF_MNOWAIT)
2060 		mnw = 1;
2061 
	/*
	 * SF_SYNC: allocate the counter that the cleanup code at 'out'
	 * waits on until it has dropped back to zero.
	 */
2062 	if (flags & SF_SYNC) {
2063 		sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO);
2064 		mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
2065 		cv_init(&sfs->cv, "sendfile");
2066 	}
2067 
2068 #ifdef MAC
2069 	error = mac_socket_check_send(td->td_ucred, so);
2070 	if (error != 0)
2071 		goto out;
2072 #endif
2073 
2074 	/* If headers are specified copy them into mbufs. */
2075 	if (hdr_uio != NULL) {
2076 		hdr_uio->uio_td = td;
2077 		hdr_uio->uio_rw = UIO_WRITE;
2078 		if (hdr_uio->uio_resid > 0) {
2079 			/*
2080 			 * In FBSD < 5.0 the nbytes to send also included
2081 			 * the header.  If compat is specified subtract the
2082 			 * header size from nbytes.
2083 			 */
2084 			if (kflags & SFK_COMPAT) {
2085 				if (nbytes > hdr_uio->uio_resid)
2086 					nbytes -= hdr_uio->uio_resid;
2087 				else
2088 					nbytes = 0;
2089 			}
2090 			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
2091 			    0, 0, 0);
2092 			if (m == NULL) {
2093 				error = mnw ? EAGAIN : ENOBUFS;
2094 				goto out;
2095 			}
2096 			hdrlen = m_length(m, NULL);
2097 		}
2098 	}
2099 
2100 	/*
2101 	 * Protect against multiple writers to the socket.
2102 	 *
2103 	 * XXXRW: Historically this has assumed non-interruptibility, so now
2104 	 * we implement that, but possibly shouldn't.
2105 	 */
2106 	(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
2107 
2108 	/*
2109 	 * Loop through the pages of the file, starting with the requested
2110 	 * offset. Get a file page (do I/O if necessary), map the file page
2111 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
2112 	 * it on the socket.
2113 	 * This is done in two loops.  The inner loop turns as many pages
2114 	 * as it can, up to available socket buffer space, without blocking
2115 	 * into mbufs to have it bulk delivered into the socket send buffer.
2116 	 * The outer loop checks the state and available space of the socket
2117 	 * and takes care of the overall progress.
2118 	 */
2119 	for (off = offset; ; ) {
2120 		struct mbuf *mtail;
2121 		int loopbytes;
2122 		int space;
2123 		int done;
2124 
		/*
		 * NOTE(review): for nbytes == 0 this exit test compares
		 * va.va_size with fsbytes without taking 'offset' into
		 * account, unlike the 'rem' computation in the inner loop;
		 * confirm that this is intended.
		 */
2125 		if ((nbytes != 0 && nbytes == fsbytes) ||
2126 		    (nbytes == 0 && va.va_size == fsbytes))
2127 			break;
2128 
2129 		mtail = NULL;
2130 		loopbytes = 0;
2131 		space = 0;
2132 		done = 0;
2133 
2134 		/*
2135 		 * Check the socket state for ongoing connection,
2136 		 * no errors and space in socket buffer.
2137 		 * If space is low allow for the remainder of the
2138 		 * file to be processed if it fits the socket buffer.
2139 		 * Otherwise block in waiting for sufficient space
2140 		 * to proceed, or if the socket is nonblocking, return
2141 		 * to userland with EAGAIN while reporting how far
2142 		 * we've come.
2143 		 * We wait until the socket buffer has significant free
2144 		 * space to do bulk sends.  This makes good use of file
2145 		 * system read ahead and allows packet segmentation
2146 		 * offloading hardware to take over lots of work.  If
2147 		 * we were not careful here we would send off only one
2148 		 * sfbuf at a time.
2149 		 */
2150 		SOCKBUF_LOCK(&so->so_snd);
2151 		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
2152 			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
2153 retry_space:
2154 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2155 			error = EPIPE;
2156 			SOCKBUF_UNLOCK(&so->so_snd);
2157 			goto done;
2158 		} else if (so->so_error) {
2159 			error = so->so_error;
2160 			so->so_error = 0;
2161 			SOCKBUF_UNLOCK(&so->so_snd);
2162 			goto done;
2163 		}
2164 		space = sbspace(&so->so_snd);
2165 		if (space < rem &&
2166 		    (space <= 0 ||
2167 		     space < so->so_snd.sb_lowat)) {
2168 			if (so->so_state & SS_NBIO) {
2169 				SOCKBUF_UNLOCK(&so->so_snd);
2170 				error = EAGAIN;
2171 				goto done;
2172 			}
2173 			/*
2174 			 * sbwait drops the lock while sleeping.
2175 			 * When we loop back to retry_space the
2176 			 * state may have changed and we retest
2177 			 * for it.
2178 			 */
2179 			error = sbwait(&so->so_snd);
2180 			/*
2181 			 * An error from sbwait usually indicates that we've
2182 			 * been interrupted by a signal. If we've sent anything
2183 			 * then return bytes sent, otherwise return the error.
2184 			 */
2185 			if (error != 0) {
2186 				SOCKBUF_UNLOCK(&so->so_snd);
2187 				goto done;
2188 			}
2189 			goto retry_space;
2190 		}
2191 		SOCKBUF_UNLOCK(&so->so_snd);
2192 
2193 		/*
2194 		 * Reduce space in the socket buffer by the size of
2195 		 * the header mbuf chain.
2196 		 * hdrlen is set to 0 after the first loop.
2197 		 */
2198 		space -= hdrlen;
2199 
		/*
		 * Re-read the file attributes under the vnode lock on every
		 * pass; the loop is left through 'done' once 'off' has
		 * reached the current file size.
		 */
2200 		error = vn_lock(vp, LK_SHARED);
2201 		if (error != 0)
2202 			goto done;
2203 		error = VOP_GETATTR(vp, &va, td->td_ucred);
2204 		if (error != 0 || off >= va.va_size) {
2205 			VOP_UNLOCK(vp, 0);
2206 			goto done;
2207 		}
2208 
2209 		/*
2210 		 * Loop and construct maximum sized mbuf chain to be bulk
2211 		 * dumped into socket buffer.
2212 		 */
2213 		while (space > loopbytes) {
2214 			vm_pindex_t pindex;
2215 			vm_offset_t pgoff;
2216 			struct mbuf *m0;
2217 
2218 			/*
2219 			 * Calculate the amount to transfer.
2220 			 * Not to exceed a page, the EOF,
2221 			 * or the passed in nbytes.
2222 			 */
2223 			pgoff = (vm_offset_t)(off & PAGE_MASK);
2224 			if (nbytes)
2225 				rem = (nbytes - fsbytes - loopbytes);
2226 			else
2227 				rem = va.va_size -
2228 				    offset - fsbytes - loopbytes;
2229 			xfsize = omin(PAGE_SIZE - pgoff, rem);
2230 			xfsize = omin(space - loopbytes, xfsize);
2231 			if (xfsize <= 0) {
2232 				done = 1;		/* all data sent */
2233 				break;
2234 			}
2235 
2236 			/*
2237 			 * Attempt to look up the page.  Allocate
2238 			 * if not found or wait and loop if busy.
2239 			 */
2240 			pindex = OFF_TO_IDX(off);
2241 			VM_OBJECT_WLOCK(obj);
2242 			pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
2243 			    VM_ALLOC_IGN_SBUSY | VM_ALLOC_NORMAL |
2244 			    VM_ALLOC_WIRED);
2245 
2246 			/*
2247 			 * Check if page is valid for what we need,
2248 			 * otherwise initiate I/O.
2249 			 * If we already turned some pages into mbufs,
2250 			 * send them off before we come here again and
2251 			 * block.
2252 			 */
			/*
			 * Locking note: the two branches below that only set
			 * 'error' leave the object lock held, and the read
			 * branch re-takes it on failure, so that the error
			 * handling that follows can drop it in every case.
			 */
2253 			if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
2254 				VM_OBJECT_WUNLOCK(obj);
2255 			else if (m != NULL)
2256 				error = EAGAIN;	/* send what we already got */
2257 			else if (flags & SF_NODISKIO)
2258 				error = EBUSY;
2259 			else {
2260 				ssize_t resid;
2261 				int readahead = sfreadahead * MAXBSIZE;
2262 
2263 				VM_OBJECT_WUNLOCK(obj);
2264 
2265 				/*
2266 				 * Get the page from backing store.
2267 				 * XXXMAC: Because we don't have fp->f_cred
2268 				 * here, we pass in NOCRED.  This is probably
2269 				 * wrong, but is consistent with our original
2270 				 * implementation.
2271 				 */
2272 				error = vn_rdwr(UIO_READ, vp, NULL, readahead,
2273 				    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
2274 				    IO_VMIO | ((readahead / bsize) << IO_SEQSHIFT),
2275 				    td->td_ucred, NOCRED, &resid, td);
2276 				SFSTAT_INC(sf_iocnt);
2277 				if (error != 0)
2278 					VM_OBJECT_WLOCK(obj);
2279 			}
2280 			if (error != 0) {
2281 				vm_page_lock(pg);
2282 				vm_page_unwire(pg, 0);
2283 				/*
2284 				 * See if anyone else might know about
2285 				 * this page.  If not and it is not valid,
2286 				 * then free it.
2287 				 */
2288 				if (pg->wire_count == 0 && pg->valid == 0 &&
2289 				    !vm_page_busied(pg))
2290 					vm_page_free(pg);
2291 				vm_page_unlock(pg);
2292 				VM_OBJECT_WUNLOCK(obj);
2293 				if (error == EAGAIN)
2294 					error = 0;	/* not a real error */
2295 				break;
2296 			}
2297 
2298 			/*
2299 			 * Get a sendfile buf.  When allocating the
2300 			 * first buffer for mbuf chain, we usually
2301 			 * wait as long as necessary, but this wait
2302 			 * can be interrupted.  For consequent
2303 			 * buffers, do not sleep, since several
2304 			 * threads might exhaust the buffers and then
2305 			 * deadlock.
2306 			 */
2307 			sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
2308 			    SFB_CATCH);
2309 			if (sf == NULL) {
2310 				SFSTAT_INC(sf_allocfail);
2311 				vm_page_lock(pg);
2312 				vm_page_unwire(pg, 0);
2313 				KASSERT(pg->object != NULL,
2314 				    ("%s: object disappeared", __func__));
2315 				vm_page_unlock(pg);
2316 				if (m == NULL)
2317 					error = (mnw ? EAGAIN : EINTR);
2318 				break;
2319 			}
2320 
2321 			/*
2322 			 * Get an mbuf and set it up as having
2323 			 * external storage.
2324 			 */
2325 			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
2326 			if (m0 == NULL) {
2327 				error = (mnw ? EAGAIN : ENOBUFS);
				/* Release the sf_buf and its page by hand. */
2328 				(void)sf_buf_mext(NULL, NULL, sf);
2329 				break;
2330 			}
2331 			if (m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE,
2332 			    sf_buf_mext, sfs, sf, M_RDONLY, EXT_SFBUF,
2333 			    (mnw ? M_NOWAIT : M_WAITOK)) != 0) {
2334 				error = (mnw ? EAGAIN : ENOBUFS);
2335 				(void)sf_buf_mext(NULL, NULL, sf);
2336 				m_freem(m0);
2337 				break;
2338 			}
2339 			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
2340 			m0->m_len = xfsize;
2341 
2342 			/* Append to mbuf chain. */
2343 			if (mtail != NULL)
2344 				mtail->m_next = m0;
2345 			else if (m != NULL)
2346 				m_last(m)->m_next = m0;
2347 			else
2348 				m = m0;
2349 			mtail = m0;
2350 
2351 			/* Keep track of bits processed. */
2352 			loopbytes += xfsize;
2353 			off += xfsize;
2354 
			/* SF_SYNC: account for the sf_buf just attached. */
2355 			if (sfs != NULL) {
2356 				mtx_lock(&sfs->mtx);
2357 				sfs->count++;
2358 				mtx_unlock(&sfs->mtx);
2359 			}
2360 		}
2361 
2362 		VOP_UNLOCK(vp, 0);
2363 
2364 		/* Add the buffer chain to the socket buffer. */
2365 		if (m != NULL) {
2366 			int mlen, err;
2367 
2368 			mlen = m_length(m, NULL);
2369 			SOCKBUF_LOCK(&so->so_snd);
2370 			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2371 				error = EPIPE;
2372 				SOCKBUF_UNLOCK(&so->so_snd);
2373 				goto done;
2374 			}
2375 			SOCKBUF_UNLOCK(&so->so_snd);
2376 			CURVNET_SET(so->so_vnet);
2377 			/* Avoid error aliasing. */
2378 			err = (*so->so_proto->pr_usrreqs->pru_send)
2379 				    (so, 0, m, NULL, NULL, td);
2380 			CURVNET_RESTORE();
2381 			if (err == 0) {
2382 				/*
2383 				 * We need two counters to get the
2384 				 * file offset and nbytes to send
2385 				 * right:
2386 				 * - sbytes contains the total amount
2387 				 *   of bytes sent, including headers.
2388 				 * - fsbytes contains the total amount
2389 				 *   of bytes sent from the file.
2390 				 */
2391 				sbytes += mlen;
2392 				fsbytes += mlen;
2393 				if (hdrlen) {
2394 					fsbytes -= hdrlen;
2395 					hdrlen = 0;
2396 				}
2397 			} else if (error == 0)
2398 				error = err;
2399 			m = NULL;	/* pru_send always consumes */
2400 		}
2401 
2402 		/* Quit outer loop on error or when we're done. */
2403 		if (done)
2404 			break;
2405 		if (error != 0)
2406 			goto done;
2407 	}
2408 
2409 	/*
2410 	 * Send trailers. Wimp out and use writev(2).
2411 	 */
2412 	if (trl_uio != NULL) {
2413 		sbunlock(&so->so_snd);
2414 		error = kern_writev(td, sockfd, trl_uio);
2415 		if (error == 0)
2416 			sbytes += td->td_retval[0];
2417 		goto out;
2418 	}
2419 
2420 done:
2421 	sbunlock(&so->so_snd);
2422 out:
2423 	/*
2424 	 * If there was no error we have to clear td->td_retval[0]
2425 	 * because it may have been set by writev.
2426 	 */
2427 	if (error == 0) {
2428 		td->td_retval[0] = 0;
2429 	}
2430 	if (sent != NULL) {
2431 		copyout(&sbytes, sent, sizeof(off_t));
2432 	}
2433 	if (obj != NULL)
2434 		vm_object_deallocate(obj);
2435 	if (so)
2436 		fdrop(sock_fp, td);
2437 	if (m)
2438 		m_freem(m);
2439 
	/*
	 * SF_SYNC: sleep until sf_buf_mext() has brought the count of
	 * outstanding sf_bufs back to zero, then tear the state down.
	 */
2440 	if (sfs != NULL) {
2441 		mtx_lock(&sfs->mtx);
2442 		if (sfs->count != 0)
2443 			cv_wait(&sfs->cv, &sfs->mtx);
2444 		KASSERT(sfs->count == 0, ("sendfile sync still busy"));
2445 		cv_destroy(&sfs->cv);
2446 		mtx_destroy(&sfs->mtx);
2447 		free(sfs, M_TEMP);
2448 	}
2449 
2450 	if (error == ERESTART)
2451 		error = EINTR;
2452 
2453 	return (error);
2454 }
2455 
2456 /*
2457  * SCTP syscalls.
2458  * Functionality only compiled in if SCTP is defined in the kernel Makefile,
2459  * otherwise all return EOPNOTSUPP.
2460  * XXX: We should make this loadable one day.
2461  */
2462 int
2463 sys_sctp_peeloff(td, uap)
2464 	struct thread *td;
2465 	struct sctp_peeloff_args /* {
2466 		int	sd;
2467 		caddr_t	name;
2468 	} */ *uap;
2469 {
2470 #if (defined(INET) || defined(INET6)) && defined(SCTP)
2471 	struct file *nfp = NULL;
2472 	struct socket *head, *so;
2473 	cap_rights_t rights;
2474 	u_int fflag;
2475 	int error, fd;
2476 
2477 	AUDIT_ARG_FD(uap->sd);
2478 	error = fgetsock(td, uap->sd, cap_rights_init(&rights, CAP_PEELOFF),
2479 	    &head, &fflag);
2480 	if (error != 0)
2481 		goto done2;
2482 	if (head->so_proto->pr_protocol != IPPROTO_SCTP) {
2483 		error = EOPNOTSUPP;
2484 		goto done;
2485 	}
2486 	error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
2487 	if (error != 0)
2488 		goto done;
2489 	/*
2490 	 * At this point we know we do have a assoc to pull
2491 	 * we proceed to get the fd setup. This may block
2492 	 * but that is ok.
2493 	 */
2494 
2495 	error = falloc(td, &nfp, &fd, 0);
2496 	if (error != 0)
2497 		goto done;
2498 	td->td_retval[0] = fd;
2499 
2500 	CURVNET_SET(head->so_vnet);
2501 	so = sonewconn(head, SS_ISCONNECTED);
2502 	if (so == NULL) {
2503 		error = ENOMEM;
2504 		goto noconnection;
2505 	}
2506 	/*
2507 	 * Before changing the flags on the socket, we have to bump the
2508 	 * reference count.  Otherwise, if the protocol calls sofree(),
2509 	 * the socket will be released due to a zero refcount.
2510 	 */
2511         SOCK_LOCK(so);
2512         soref(so);                      /* file descriptor reference */
2513         SOCK_UNLOCK(so);
2514 
2515 	ACCEPT_LOCK();
2516 
2517 	TAILQ_REMOVE(&head->so_comp, so, so_list);
2518 	head->so_qlen--;
2519 	so->so_state |= (head->so_state & SS_NBIO);
2520 	so->so_state &= ~SS_NOFDREF;
2521 	so->so_qstate &= ~SQ_COMP;
2522 	so->so_head = NULL;
2523 	ACCEPT_UNLOCK();
2524 	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
2525 	error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
2526 	if (error != 0)
2527 		goto noconnection;
2528 	if (head->so_sigio != NULL)
2529 		fsetown(fgetown(&head->so_sigio), &so->so_sigio);
2530 
2531 noconnection:
2532 	/*
2533 	 * close the new descriptor, assuming someone hasn't ripped it
2534 	 * out from under us.
2535 	 */
2536 	if (error != 0)
2537 		fdclose(td->td_proc->p_fd, nfp, fd, td);
2538 
2539 	/*
2540 	 * Release explicitly held references before returning.
2541 	 */
2542 	CURVNET_RESTORE();
2543 done:
2544 	if (nfp != NULL)
2545 		fdrop(nfp, td);
2546 	fputsock(head);
2547 done2:
2548 	return (error);
2549 #else  /* SCTP */
2550 	return (EOPNOTSUPP);
2551 #endif /* SCTP */
2552 }
2553 
2554 int
2555 sys_sctp_generic_sendmsg (td, uap)
2556 	struct thread *td;
2557 	struct sctp_generic_sendmsg_args /* {
2558 		int sd,
2559 		caddr_t msg,
2560 		int mlen,
2561 		caddr_t to,
2562 		__socklen_t tolen,
2563 		struct sctp_sndrcvinfo *sinfo,
2564 		int flags
2565 	} */ *uap;
2566 {
2567 #if (defined(INET) || defined(INET6)) && defined(SCTP)
2568 	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
2569 	struct socket *so;
2570 	struct file *fp = NULL;
2571 	struct sockaddr *to = NULL;
2572 #ifdef KTRACE
2573 	struct uio *ktruio = NULL;
2574 #endif
2575 	struct uio auio;
2576 	struct iovec iov[1];
2577 	cap_rights_t rights;
2578 	int error = 0, len;
2579 
2580 	if (uap->sinfo != NULL) {
2581 		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
2582 		if (error != 0)
2583 			return (error);
2584 		u_sinfo = &sinfo;
2585 	}
2586 
2587 	cap_rights_init(&rights, CAP_SEND);
2588 	if (uap->tolen != 0) {
2589 		error = getsockaddr(&to, uap->to, uap->tolen);
2590 		if (error != 0) {
2591 			to = NULL;
2592 			goto sctp_bad2;
2593 		}
2594 		cap_rights_set(&rights, CAP_CONNECT);
2595 	}
2596 
2597 	AUDIT_ARG_FD(uap->sd);
2598 	error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL);
2599 	if (error != 0)
2600 		goto sctp_bad;
2601 #ifdef KTRACE
2602 	if (to && (KTRPOINT(td, KTR_STRUCT)))
2603 		ktrsockaddr(to);
2604 #endif
2605 
2606 	iov[0].iov_base = uap->msg;
2607 	iov[0].iov_len = uap->mlen;
2608 
2609 	so = (struct socket *)fp->f_data;
2610 	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
2611 		error = EOPNOTSUPP;
2612 		goto sctp_bad;
2613 	}
2614 #ifdef MAC
2615 	error = mac_socket_check_send(td->td_ucred, so);
2616 	if (error != 0)
2617 		goto sctp_bad;
2618 #endif /* MAC */
2619 
2620 	auio.uio_iov =  iov;
2621 	auio.uio_iovcnt = 1;
2622 	auio.uio_segflg = UIO_USERSPACE;
2623 	auio.uio_rw = UIO_WRITE;
2624 	auio.uio_td = td;
2625 	auio.uio_offset = 0;			/* XXX */
2626 	auio.uio_resid = 0;
2627 	len = auio.uio_resid = uap->mlen;
2628 	CURVNET_SET(so->so_vnet);
2629 	error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL,
2630 	    (struct mbuf *)NULL, uap->flags, u_sinfo, td);
2631 	CURVNET_RESTORE();
2632 	if (error != 0) {
2633 		if (auio.uio_resid != len && (error == ERESTART ||
2634 		    error == EINTR || error == EWOULDBLOCK))
2635 			error = 0;
2636 		/* Generation of SIGPIPE can be controlled per socket. */
2637 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
2638 		    !(uap->flags & MSG_NOSIGNAL)) {
2639 			PROC_LOCK(td->td_proc);
2640 			tdsignal(td, SIGPIPE);
2641 			PROC_UNLOCK(td->td_proc);
2642 		}
2643 	}
2644 	if (error == 0)
2645 		td->td_retval[0] = len - auio.uio_resid;
2646 #ifdef KTRACE
2647 	if (ktruio != NULL) {
2648 		ktruio->uio_resid = td->td_retval[0];
2649 		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
2650 	}
2651 #endif /* KTRACE */
2652 sctp_bad:
2653 	if (fp != NULL)
2654 		fdrop(fp, td);
2655 sctp_bad2:
2656 	free(to, M_SONAME);
2657 	return (error);
2658 #else  /* SCTP */
2659 	return (EOPNOTSUPP);
2660 #endif /* SCTP */
2661 }
2662 
2663 int
2664 sys_sctp_generic_sendmsg_iov(td, uap)
2665 	struct thread *td;
2666 	struct sctp_generic_sendmsg_iov_args /* {
2667 		int sd,
2668 		struct iovec *iov,
2669 		int iovlen,
2670 		caddr_t to,
2671 		__socklen_t tolen,
2672 		struct sctp_sndrcvinfo *sinfo,
2673 		int flags
2674 	} */ *uap;
2675 {
2676 #if (defined(INET) || defined(INET6)) && defined(SCTP)
2677 	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
2678 	struct socket *so;
2679 	struct file *fp = NULL;
2680 	struct sockaddr *to = NULL;
2681 #ifdef KTRACE
2682 	struct uio *ktruio = NULL;
2683 #endif
2684 	struct uio auio;
2685 	struct iovec *iov, *tiov;
2686 	cap_rights_t rights;
2687 	ssize_t len;
2688 	int error, i;
2689 
2690 	if (uap->sinfo != NULL) {
2691 		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
2692 		if (error != 0)
2693 			return (error);
2694 		u_sinfo = &sinfo;
2695 	}
2696 	cap_rights_init(&rights, CAP_SEND);
2697 	if (uap->tolen != 0) {
2698 		error = getsockaddr(&to, uap->to, uap->tolen);
2699 		if (error != 0) {
2700 			to = NULL;
2701 			goto sctp_bad2;
2702 		}
2703 		cap_rights_set(&rights, CAP_CONNECT);
2704 	}
2705 
2706 	AUDIT_ARG_FD(uap->sd);
2707 	error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL);
2708 	if (error != 0)
2709 		goto sctp_bad1;
2710 
2711 #ifdef COMPAT_FREEBSD32
2712 	if (SV_CURPROC_FLAG(SV_ILP32))
2713 		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
2714 		    uap->iovlen, &iov, EMSGSIZE);
2715 	else
2716 #endif
2717 		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
2718 	if (error != 0)
2719 		goto sctp_bad1;
2720 #ifdef KTRACE
2721 	if (to && (KTRPOINT(td, KTR_STRUCT)))
2722 		ktrsockaddr(to);
2723 #endif
2724 
2725 	so = (struct socket *)fp->f_data;
2726 	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
2727 		error = EOPNOTSUPP;
2728 		goto sctp_bad;
2729 	}
2730 #ifdef MAC
2731 	error = mac_socket_check_send(td->td_ucred, so);
2732 	if (error != 0)
2733 		goto sctp_bad;
2734 #endif /* MAC */
2735 
2736 	auio.uio_iov = iov;
2737 	auio.uio_iovcnt = uap->iovlen;
2738 	auio.uio_segflg = UIO_USERSPACE;
2739 	auio.uio_rw = UIO_WRITE;
2740 	auio.uio_td = td;
2741 	auio.uio_offset = 0;			/* XXX */
2742 	auio.uio_resid = 0;
2743 	tiov = iov;
2744 	for (i = 0; i <uap->iovlen; i++, tiov++) {
2745 		if ((auio.uio_resid += tiov->iov_len) < 0) {
2746 			error = EINVAL;
2747 			goto sctp_bad;
2748 		}
2749 	}
2750 	len = auio.uio_resid;
2751 	CURVNET_SET(so->so_vnet);
2752 	error = sctp_lower_sosend(so, to, &auio,
2753 		    (struct mbuf *)NULL, (struct mbuf *)NULL,
2754 		    uap->flags, u_sinfo, td);
2755 	CURVNET_RESTORE();
2756 	if (error != 0) {
2757 		if (auio.uio_resid != len && (error == ERESTART ||
2758 		    error == EINTR || error == EWOULDBLOCK))
2759 			error = 0;
2760 		/* Generation of SIGPIPE can be controlled per socket */
2761 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
2762 		    !(uap->flags & MSG_NOSIGNAL)) {
2763 			PROC_LOCK(td->td_proc);
2764 			tdsignal(td, SIGPIPE);
2765 			PROC_UNLOCK(td->td_proc);
2766 		}
2767 	}
2768 	if (error == 0)
2769 		td->td_retval[0] = len - auio.uio_resid;
2770 #ifdef KTRACE
2771 	if (ktruio != NULL) {
2772 		ktruio->uio_resid = td->td_retval[0];
2773 		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
2774 	}
2775 #endif /* KTRACE */
2776 sctp_bad:
2777 	free(iov, M_IOV);
2778 sctp_bad1:
2779 	if (fp != NULL)
2780 		fdrop(fp, td);
2781 sctp_bad2:
2782 	free(to, M_SONAME);
2783 	return (error);
2784 #else  /* SCTP */
2785 	return (EOPNOTSUPP);
2786 #endif /* SCTP */
2787 }
2788 
2789 int
2790 sys_sctp_generic_recvmsg(td, uap)
2791 	struct thread *td;
2792 	struct sctp_generic_recvmsg_args /* {
2793 		int sd,
2794 		struct iovec *iov,
2795 		int iovlen,
2796 		struct sockaddr *from,
2797 		__socklen_t *fromlenaddr,
2798 		struct sctp_sndrcvinfo *sinfo,
2799 		int *msg_flags
2800 	} */ *uap;
2801 {
2802 #if (defined(INET) || defined(INET6)) && defined(SCTP)
2803 	uint8_t sockbufstore[256];
2804 	struct uio auio;
2805 	struct iovec *iov, *tiov;
2806 	struct sctp_sndrcvinfo sinfo;
2807 	struct socket *so;
2808 	struct file *fp = NULL;
2809 	struct sockaddr *fromsa;
2810 	cap_rights_t rights;
2811 #ifdef KTRACE
2812 	struct uio *ktruio = NULL;
2813 #endif
2814 	ssize_t len;
2815 	int error, fromlen, i, msg_flags;
2816 
2817 	AUDIT_ARG_FD(uap->sd);
2818 	error = getsock_cap(td->td_proc->p_fd, uap->sd,
2819 	    cap_rights_init(&rights, CAP_RECV), &fp, NULL);
2820 	if (error != 0)
2821 		return (error);
2822 #ifdef COMPAT_FREEBSD32
2823 	if (SV_CURPROC_FLAG(SV_ILP32))
2824 		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
2825 		    uap->iovlen, &iov, EMSGSIZE);
2826 	else
2827 #endif
2828 		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
2829 	if (error != 0)
2830 		goto out1;
2831 
2832 	so = fp->f_data;
2833 	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
2834 		error = EOPNOTSUPP;
2835 		goto out;
2836 	}
2837 #ifdef MAC
2838 	error = mac_socket_check_receive(td->td_ucred, so);
2839 	if (error != 0)
2840 		goto out;
2841 #endif /* MAC */
2842 
2843 	if (uap->fromlenaddr != NULL) {
2844 		error = copyin(uap->fromlenaddr, &fromlen, sizeof (fromlen));
2845 		if (error != 0)
2846 			goto out;
2847 	} else {
2848 		fromlen = 0;
2849 	}
2850 	if (uap->msg_flags) {
2851 		error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
2852 		if (error != 0)
2853 			goto out;
2854 	} else {
2855 		msg_flags = 0;
2856 	}
2857 	auio.uio_iov = iov;
2858 	auio.uio_iovcnt = uap->iovlen;
2859 	auio.uio_segflg = UIO_USERSPACE;
2860 	auio.uio_rw = UIO_READ;
2861 	auio.uio_td = td;
2862 	auio.uio_offset = 0;			/* XXX */
2863 	auio.uio_resid = 0;
2864 	tiov = iov;
2865 	for (i = 0; i <uap->iovlen; i++, tiov++) {
2866 		if ((auio.uio_resid += tiov->iov_len) < 0) {
2867 			error = EINVAL;
2868 			goto out;
2869 		}
2870 	}
2871 	len = auio.uio_resid;
2872 	fromsa = (struct sockaddr *)sockbufstore;
2873 
2874 #ifdef KTRACE
2875 	if (KTRPOINT(td, KTR_GENIO))
2876 		ktruio = cloneuio(&auio);
2877 #endif /* KTRACE */
2878 	memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo));
2879 	CURVNET_SET(so->so_vnet);
2880 	error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
2881 		    fromsa, fromlen, &msg_flags,
2882 		    (struct sctp_sndrcvinfo *)&sinfo, 1);
2883 	CURVNET_RESTORE();
2884 	if (error != 0) {
2885 		if (auio.uio_resid != len && (error == ERESTART ||
2886 		    error == EINTR || error == EWOULDBLOCK))
2887 			error = 0;
2888 	} else {
2889 		if (uap->sinfo)
2890 			error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
2891 	}
2892 #ifdef KTRACE
2893 	if (ktruio != NULL) {
2894 		ktruio->uio_resid = len - auio.uio_resid;
2895 		ktrgenio(uap->sd, UIO_READ, ktruio, error);
2896 	}
2897 #endif /* KTRACE */
2898 	if (error != 0)
2899 		goto out;
2900 	td->td_retval[0] = len - auio.uio_resid;
2901 
2902 	if (fromlen && uap->from) {
2903 		len = fromlen;
2904 		if (len <= 0 || fromsa == 0)
2905 			len = 0;
2906 		else {
2907 			len = MIN(len, fromsa->sa_len);
2908 			error = copyout(fromsa, uap->from, (size_t)len);
2909 			if (error != 0)
2910 				goto out;
2911 		}
2912 		error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
2913 		if (error != 0)
2914 			goto out;
2915 	}
2916 #ifdef KTRACE
2917 	if (KTRPOINT(td, KTR_STRUCT))
2918 		ktrsockaddr(fromsa);
2919 #endif
2920 	if (uap->msg_flags) {
2921 		error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
2922 		if (error != 0)
2923 			goto out;
2924 	}
2925 out:
2926 	free(iov, M_IOV);
2927 out1:
2928 	if (fp != NULL)
2929 		fdrop(fp, td);
2930 
2931 	return (error);
2932 #else  /* SCTP */
2933 	return (EOPNOTSUPP);
2934 #endif /* SCTP */
2935 }
2936