xref: /freebsd/sys/kern/uipc_syscalls.c (revision 6f9c8e5b074419423648ffb89b83fd2f257e90b7)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * sendfile(2) and related extensions:
6  * Copyright (c) 1998, David Greenman. All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 4. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
33  */
34 
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 
38 #include "opt_inet.h"
39 #include "opt_inet6.h"
40 #include "opt_sctp.h"
41 #include "opt_compat.h"
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/kernel.h>
47 #include <sys/lock.h>
48 #include <sys/mutex.h>
49 #include <sys/sysproto.h>
50 #include <sys/malloc.h>
51 #include <sys/filedesc.h>
52 #include <sys/event.h>
53 #include <sys/proc.h>
54 #include <sys/fcntl.h>
55 #include <sys/file.h>
56 #include <sys/filio.h>
57 #include <sys/jail.h>
58 #include <sys/mount.h>
59 #include <sys/mbuf.h>
60 #include <sys/protosw.h>
61 #include <sys/sf_buf.h>
62 #include <sys/sysent.h>
63 #include <sys/socket.h>
64 #include <sys/socketvar.h>
65 #include <sys/signalvar.h>
66 #include <sys/syscallsubr.h>
67 #include <sys/sysctl.h>
68 #include <sys/uio.h>
69 #include <sys/vnode.h>
70 #ifdef KTRACE
71 #include <sys/ktrace.h>
72 #endif
73 #ifdef COMPAT_FREEBSD32
74 #include <compat/freebsd32/freebsd32_util.h>
75 #endif
76 
77 #include <net/vnet.h>
78 
79 #include <security/audit/audit.h>
80 #include <security/mac/mac_framework.h>
81 
82 #include <vm/vm.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_page.h>
85 #include <vm/vm_pageout.h>
86 #include <vm/vm_kern.h>
87 #include <vm/vm_extern.h>
88 
89 #if defined(INET) || defined(INET6)
90 #ifdef SCTP
91 #include <netinet/sctp.h>
92 #include <netinet/sctp_peeloff.h>
93 #endif /* SCTP */
94 #endif /* INET || INET6 */
95 
/*
 * Forward declarations of file-local helpers.
 *
 * sendit() and recvit() are the common back ends of the send and receive
 * system calls defined in this file.
 */
static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);

/*
 * accept1() treats a non-zero `compat' as a request for the old
 * (COMPAT_OLDSOCK) sockaddr layout.  The remaining helpers are defined
 * outside this part of the file; presumably `compat' means the same there.
 */
static int accept1(struct thread *td, struct accept_args *uap, int compat);
static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat);
static int getsockname1(struct thread *td, struct getsockname_args *uap,
			int compat);
static int getpeername1(struct thread *td, struct getpeername_args *uap,
			int compat);
105 
/*
 * NSFBUFS-related variables and associated sysctls.  All three counters
 * are exported read-only below the kern.ipc sysctl node; nsfbufs is
 * additionally flagged CTLFLAG_RDTUN.
 */
int nsfbufs;		/* maximum number of sendfile(2) sf_bufs available */
int nsfbufspeak;	/* number of sendfile(2) sf_bufs at peak usage */
int nsfbufsused;	/* number of sendfile(2) sf_bufs in use */

SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
    "Maximum number of sendfile(2) sf_bufs available");
SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
    "Number of sendfile(2) sf_bufs at peak usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
    "Number of sendfile(2) sf_bufs in use");
119 
120 /*
121  * Convert a user file descriptor to a kernel file entry.  A reference on the
122  * file entry is held upon returning.  This is lighter weight than
123  * fgetsock(), which bumps the socket reference drops the file reference
124  * count instead, as this approach avoids several additional mutex operations
125  * associated with the additional reference count.  If requested, return the
126  * open file flags.
127  */
128 static int
129 getsock(struct filedesc *fdp, int fd, struct file **fpp, u_int *fflagp)
130 {
131 	struct file *fp;
132 	int error;
133 
134 	fp = NULL;
135 	if (fdp == NULL || (fp = fget_unlocked(fdp, fd)) == NULL) {
136 		error = EBADF;
137 	} else if (fp->f_type != DTYPE_SOCKET) {
138 		fdrop(fp, curthread);
139 		fp = NULL;
140 		error = ENOTSOCK;
141 	} else {
142 		if (fflagp != NULL)
143 			*fflagp = fp->f_flag;
144 		error = 0;
145 	}
146 	*fpp = fp;
147 	return (error);
148 }
149 
150 /*
151  * System call interface to the socket abstraction.
152  */
153 #if defined(COMPAT_43)
154 #define COMPAT_OLDSOCK
155 #endif
156 
157 int
158 socket(td, uap)
159 	struct thread *td;
160 	struct socket_args /* {
161 		int	domain;
162 		int	type;
163 		int	protocol;
164 	} */ *uap;
165 {
166 	struct filedesc *fdp;
167 	struct socket *so;
168 	struct file *fp;
169 	int fd, error;
170 
171 	AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);
172 #ifdef MAC
173 	error = mac_socket_check_create(td->td_ucred, uap->domain, uap->type,
174 	    uap->protocol);
175 	if (error)
176 		return (error);
177 #endif
178 	fdp = td->td_proc->p_fd;
179 	error = falloc(td, &fp, &fd, 0);
180 	if (error)
181 		return (error);
182 	/* An extra reference on `fp' has been held for us by falloc(). */
183 	error = socreate(uap->domain, &so, uap->type, uap->protocol,
184 	    td->td_ucred, td);
185 	if (error) {
186 		fdclose(fdp, fp, fd, td);
187 	} else {
188 		finit(fp, FREAD | FWRITE, DTYPE_SOCKET, so, &socketops);
189 		td->td_retval[0] = fd;
190 	}
191 	fdrop(fp, td);
192 	return (error);
193 }
194 
195 /* ARGSUSED */
196 int
197 bind(td, uap)
198 	struct thread *td;
199 	struct bind_args /* {
200 		int	s;
201 		caddr_t	name;
202 		int	namelen;
203 	} */ *uap;
204 {
205 	struct sockaddr *sa;
206 	int error;
207 
208 	if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
209 		return (error);
210 
211 	error = kern_bind(td, uap->s, sa);
212 	free(sa, M_SONAME);
213 	return (error);
214 }
215 
216 int
217 kern_bind(td, fd, sa)
218 	struct thread *td;
219 	int fd;
220 	struct sockaddr *sa;
221 {
222 	struct socket *so;
223 	struct file *fp;
224 	int error;
225 
226 	AUDIT_ARG_FD(fd);
227 	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
228 	if (error)
229 		return (error);
230 	so = fp->f_data;
231 #ifdef KTRACE
232 	if (KTRPOINT(td, KTR_STRUCT))
233 		ktrsockaddr(sa);
234 #endif
235 #ifdef MAC
236 	error = mac_socket_check_bind(td->td_ucred, so, sa);
237 	if (error == 0)
238 #endif
239 		error = sobind(so, sa, td);
240 	fdrop(fp, td);
241 	return (error);
242 }
243 
244 /* ARGSUSED */
245 int
246 listen(td, uap)
247 	struct thread *td;
248 	struct listen_args /* {
249 		int	s;
250 		int	backlog;
251 	} */ *uap;
252 {
253 	struct socket *so;
254 	struct file *fp;
255 	int error;
256 
257 	AUDIT_ARG_FD(uap->s);
258 	error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL);
259 	if (error == 0) {
260 		so = fp->f_data;
261 #ifdef MAC
262 		error = mac_socket_check_listen(td->td_ucred, so);
263 		if (error == 0)
264 #endif
265 			error = solisten(so, uap->backlog, td);
266 		fdrop(fp, td);
267 	}
268 	return(error);
269 }
270 
271 /*
272  * accept1()
273  */
274 static int
275 accept1(td, uap, compat)
276 	struct thread *td;
277 	struct accept_args /* {
278 		int	s;
279 		struct sockaddr	* __restrict name;
280 		socklen_t	* __restrict anamelen;
281 	} */ *uap;
282 	int compat;
283 {
284 	struct sockaddr *name;
285 	socklen_t namelen;
286 	struct file *fp;
287 	int error;
288 
289 	if (uap->name == NULL)
290 		return (kern_accept(td, uap->s, NULL, NULL, NULL));
291 
292 	error = copyin(uap->anamelen, &namelen, sizeof (namelen));
293 	if (error)
294 		return (error);
295 
296 	error = kern_accept(td, uap->s, &name, &namelen, &fp);
297 
298 	/*
299 	 * return a namelen of zero for older code which might
300 	 * ignore the return value from accept.
301 	 */
302 	if (error) {
303 		(void) copyout(&namelen,
304 		    uap->anamelen, sizeof(*uap->anamelen));
305 		return (error);
306 	}
307 
308 	if (error == 0 && name != NULL) {
309 #ifdef COMPAT_OLDSOCK
310 		if (compat)
311 			((struct osockaddr *)name)->sa_family =
312 			    name->sa_family;
313 #endif
314 		error = copyout(name, uap->name, namelen);
315 	}
316 	if (error == 0)
317 		error = copyout(&namelen, uap->anamelen,
318 		    sizeof(namelen));
319 	if (error)
320 		fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
321 	fdrop(fp, td);
322 	free(name, M_SONAME);
323 	return (error);
324 }
325 
326 int
327 kern_accept(struct thread *td, int s, struct sockaddr **name,
328     socklen_t *namelen, struct file **fp)
329 {
330 	struct filedesc *fdp;
331 	struct file *headfp, *nfp = NULL;
332 	struct sockaddr *sa = NULL;
333 	int error;
334 	struct socket *head, *so;
335 	int fd;
336 	u_int fflag;
337 	pid_t pgid;
338 	int tmp;
339 
340 	if (name) {
341 		*name = NULL;
342 		if (*namelen < 0)
343 			return (EINVAL);
344 	}
345 
346 	AUDIT_ARG_FD(s);
347 	fdp = td->td_proc->p_fd;
348 	error = getsock(fdp, s, &headfp, &fflag);
349 	if (error)
350 		return (error);
351 	head = headfp->f_data;
352 	if ((head->so_options & SO_ACCEPTCONN) == 0) {
353 		error = EINVAL;
354 		goto done;
355 	}
356 #ifdef MAC
357 	error = mac_socket_check_accept(td->td_ucred, head);
358 	if (error != 0)
359 		goto done;
360 #endif
361 	error = falloc(td, &nfp, &fd, 0);
362 	if (error)
363 		goto done;
364 	ACCEPT_LOCK();
365 	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
366 		ACCEPT_UNLOCK();
367 		error = EWOULDBLOCK;
368 		goto noconnection;
369 	}
370 	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
371 		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
372 			head->so_error = ECONNABORTED;
373 			break;
374 		}
375 		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
376 		    "accept", 0);
377 		if (error) {
378 			ACCEPT_UNLOCK();
379 			goto noconnection;
380 		}
381 	}
382 	if (head->so_error) {
383 		error = head->so_error;
384 		head->so_error = 0;
385 		ACCEPT_UNLOCK();
386 		goto noconnection;
387 	}
388 	so = TAILQ_FIRST(&head->so_comp);
389 	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
390 	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
391 
392 	/*
393 	 * Before changing the flags on the socket, we have to bump the
394 	 * reference count.  Otherwise, if the protocol calls sofree(),
395 	 * the socket will be released due to a zero refcount.
396 	 */
397 	SOCK_LOCK(so);			/* soref() and so_state update */
398 	soref(so);			/* file descriptor reference */
399 
400 	TAILQ_REMOVE(&head->so_comp, so, so_list);
401 	head->so_qlen--;
402 	so->so_state |= (head->so_state & SS_NBIO);
403 	so->so_qstate &= ~SQ_COMP;
404 	so->so_head = NULL;
405 
406 	SOCK_UNLOCK(so);
407 	ACCEPT_UNLOCK();
408 
409 	/* An extra reference on `nfp' has been held for us by falloc(). */
410 	td->td_retval[0] = fd;
411 
412 	/* connection has been removed from the listen queue */
413 	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
414 
415 	pgid = fgetown(&head->so_sigio);
416 	if (pgid != 0)
417 		fsetown(pgid, &so->so_sigio);
418 
419 	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
420 	/* Sync socket nonblocking/async state with file flags */
421 	tmp = fflag & FNONBLOCK;
422 	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
423 	tmp = fflag & FASYNC;
424 	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
425 	sa = 0;
426 	error = soaccept(so, &sa);
427 	if (error) {
428 		/*
429 		 * return a namelen of zero for older code which might
430 		 * ignore the return value from accept.
431 		 */
432 		if (name)
433 			*namelen = 0;
434 		goto noconnection;
435 	}
436 	if (sa == NULL) {
437 		if (name)
438 			*namelen = 0;
439 		goto done;
440 	}
441 	if (name) {
442 		/* check sa_len before it is destroyed */
443 		if (*namelen > sa->sa_len)
444 			*namelen = sa->sa_len;
445 #ifdef KTRACE
446 		if (KTRPOINT(td, KTR_STRUCT))
447 			ktrsockaddr(sa);
448 #endif
449 		*name = sa;
450 		sa = NULL;
451 	}
452 noconnection:
453 	if (sa)
454 		free(sa, M_SONAME);
455 
456 	/*
457 	 * close the new descriptor, assuming someone hasn't ripped it
458 	 * out from under us.
459 	 */
460 	if (error)
461 		fdclose(fdp, nfp, fd, td);
462 
463 	/*
464 	 * Release explicitly held references before returning.  We return
465 	 * a reference on nfp to the caller on success if they request it.
466 	 */
467 done:
468 	if (fp != NULL) {
469 		if (error == 0) {
470 			*fp = nfp;
471 			nfp = NULL;
472 		} else
473 			*fp = NULL;
474 	}
475 	if (nfp != NULL)
476 		fdrop(nfp, td);
477 	fdrop(headfp, td);
478 	return (error);
479 }
480 
/*
 * accept(2) entry point: run the shared accept1() body with compat
 * handling disabled (compat == 0).
 */
int
accept(struct thread *td, struct accept_args *uap)
{
	int error;

	error = accept1(td, uap, 0);
	return (error);
}
489 
490 #ifdef COMPAT_OLDSOCK
/*
 * Old-style accept entry point: run the shared accept1() body with compat
 * handling enabled (compat == 1), so the peer address is returned in the
 * old osockaddr layout.
 */
int
oaccept(struct thread *td, struct accept_args *uap)
{
	int error;

	error = accept1(td, uap, 1);
	return (error);
}
499 #endif /* COMPAT_OLDSOCK */
500 
501 /* ARGSUSED */
502 int
503 connect(td, uap)
504 	struct thread *td;
505 	struct connect_args /* {
506 		int	s;
507 		caddr_t	name;
508 		int	namelen;
509 	} */ *uap;
510 {
511 	struct sockaddr *sa;
512 	int error;
513 
514 	error = getsockaddr(&sa, uap->name, uap->namelen);
515 	if (error)
516 		return (error);
517 
518 	error = kern_connect(td, uap->s, sa);
519 	free(sa, M_SONAME);
520 	return (error);
521 }
522 
523 
524 int
525 kern_connect(td, fd, sa)
526 	struct thread *td;
527 	int fd;
528 	struct sockaddr *sa;
529 {
530 	struct socket *so;
531 	struct file *fp;
532 	int error;
533 	int interrupted = 0;
534 
535 	AUDIT_ARG_FD(fd);
536 	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
537 	if (error)
538 		return (error);
539 	so = fp->f_data;
540 	if (so->so_state & SS_ISCONNECTING) {
541 		error = EALREADY;
542 		goto done1;
543 	}
544 #ifdef KTRACE
545 	if (KTRPOINT(td, KTR_STRUCT))
546 		ktrsockaddr(sa);
547 #endif
548 #ifdef MAC
549 	error = mac_socket_check_connect(td->td_ucred, so, sa);
550 	if (error)
551 		goto bad;
552 #endif
553 	error = soconnect(so, sa, td);
554 	if (error)
555 		goto bad;
556 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
557 		error = EINPROGRESS;
558 		goto done1;
559 	}
560 	SOCK_LOCK(so);
561 	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
562 		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
563 		    "connec", 0);
564 		if (error) {
565 			if (error == EINTR || error == ERESTART)
566 				interrupted = 1;
567 			break;
568 		}
569 	}
570 	if (error == 0) {
571 		error = so->so_error;
572 		so->so_error = 0;
573 	}
574 	SOCK_UNLOCK(so);
575 bad:
576 	if (!interrupted)
577 		so->so_state &= ~SS_ISCONNECTING;
578 	if (error == ERESTART)
579 		error = EINTR;
580 done1:
581 	fdrop(fp, td);
582 	return (error);
583 }
584 
/*
 * Create a pair of connected sockets and install each in a new descriptor.
 *
 * td        calling thread.
 * domain, type, protocol
 *           passed unchanged to socreate() for both sockets.
 * rsv       kernel-space array of two ints; rsv[0] and rsv[1] receive the
 *           two new descriptors.  Both slots may already have been written
 *           when an error is returned.
 *
 * Returns 0 on success.  On failure the error from the MAC check,
 * socreate(), falloc() or soconnect2() is returned after everything
 * created so far has been undone through the free4..free1 labels, which
 * run in reverse order of construction.
 */
int
kern_socketpair(struct thread *td, int domain, int type, int protocol,
    int *rsv)
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *fp1, *fp2;
	struct socket *so1, *so2;
	int fd, error;

	AUDIT_ARG_SOCKET(domain, type, protocol);
#ifdef MAC
	/* We might want to have a separate check for socket pairs. */
	error = mac_socket_check_create(td->td_ucred, domain, type,
	    protocol);
	if (error)
		return (error);
#endif
	error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
	if (error)
		return (error);
	error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
	if (error)
		goto free1;
	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
	error = falloc(td, &fp1, &fd, 0);
	if (error)
		goto free2;
	rsv[0] = fd;
	fp1->f_data = so1;	/* so1 already has ref count */
	error = falloc(td, &fp2, &fd, 0);
	if (error)
		goto free3;
	fp2->f_data = so2;	/* so2 already has ref count */
	rsv[1] = fd;
	error = soconnect2(so1, so2);
	if (error)
		goto free4;
	if (type == SOCK_DGRAM) {
		/*
		 * Datagram socket connection is asymmetric.
		 */
		 error = soconnect2(so2, so1);
		 if (error)
			goto free4;
	}
	/* Only now are the files given their socket type and operations. */
	finit(fp1, FREAD | FWRITE, DTYPE_SOCKET, fp1->f_data, &socketops);
	finit(fp2, FREAD | FWRITE, DTYPE_SOCKET, fp2->f_data, &socketops);
	/* Drop the extra references taken by falloc(). */
	fdrop(fp1, td);
	fdrop(fp2, td);
	return (0);
free4:
	/* Undo the second descriptor. */
	fdclose(fdp, fp2, rsv[1], td);
	fdrop(fp2, td);
free3:
	/* Undo the first descriptor. */
	fdclose(fdp, fp1, rsv[0], td);
	fdrop(fp1, td);
free2:
	/* finit() was never reached, so the sockets are closed explicitly. */
	if (so2 != NULL)
		(void)soclose(so2);
free1:
	if (so1 != NULL)
		(void)soclose(so1);
	return (error);
}
649 
650 int
651 socketpair(struct thread *td, struct socketpair_args *uap)
652 {
653 	int error, sv[2];
654 
655 	error = kern_socketpair(td, uap->domain, uap->type,
656 	    uap->protocol, sv);
657 	if (error)
658 		return (error);
659 	error = copyout(sv, uap->rsv, 2 * sizeof(int));
660 	if (error) {
661 		(void)kern_close(td, sv[0]);
662 		(void)kern_close(td, sv[1]);
663 	}
664 	return (error);
665 }
666 
667 static int
668 sendit(td, s, mp, flags)
669 	struct thread *td;
670 	int s;
671 	struct msghdr *mp;
672 	int flags;
673 {
674 	struct mbuf *control;
675 	struct sockaddr *to;
676 	int error;
677 
678 	if (mp->msg_name != NULL) {
679 		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
680 		if (error) {
681 			to = NULL;
682 			goto bad;
683 		}
684 		mp->msg_name = to;
685 	} else {
686 		to = NULL;
687 	}
688 
689 	if (mp->msg_control) {
690 		if (mp->msg_controllen < sizeof(struct cmsghdr)
691 #ifdef COMPAT_OLDSOCK
692 		    && mp->msg_flags != MSG_COMPAT
693 #endif
694 		) {
695 			error = EINVAL;
696 			goto bad;
697 		}
698 		error = sockargs(&control, mp->msg_control,
699 		    mp->msg_controllen, MT_CONTROL);
700 		if (error)
701 			goto bad;
702 #ifdef COMPAT_OLDSOCK
703 		if (mp->msg_flags == MSG_COMPAT) {
704 			struct cmsghdr *cm;
705 
706 			M_PREPEND(control, sizeof(*cm), M_WAIT);
707 			cm = mtod(control, struct cmsghdr *);
708 			cm->cmsg_len = control->m_len;
709 			cm->cmsg_level = SOL_SOCKET;
710 			cm->cmsg_type = SCM_RIGHTS;
711 		}
712 #endif
713 	} else {
714 		control = NULL;
715 	}
716 
717 	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
718 
719 bad:
720 	if (to)
721 		free(to, M_SONAME);
722 	return (error);
723 }
724 
725 int
726 kern_sendit(td, s, mp, flags, control, segflg)
727 	struct thread *td;
728 	int s;
729 	struct msghdr *mp;
730 	int flags;
731 	struct mbuf *control;
732 	enum uio_seg segflg;
733 {
734 	struct file *fp;
735 	struct uio auio;
736 	struct iovec *iov;
737 	struct socket *so;
738 	int i;
739 	int len, error;
740 #ifdef KTRACE
741 	struct uio *ktruio = NULL;
742 #endif
743 
744 	AUDIT_ARG_FD(s);
745 	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
746 	if (error)
747 		return (error);
748 	so = (struct socket *)fp->f_data;
749 
750 #ifdef KTRACE
751 	if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
752 		ktrsockaddr(mp->msg_name);
753 #endif
754 #ifdef MAC
755 	if (mp->msg_name != NULL) {
756 		error = mac_socket_check_connect(td->td_ucred, so,
757 		    mp->msg_name);
758 		if (error)
759 			goto bad;
760 	}
761 	error = mac_socket_check_send(td->td_ucred, so);
762 	if (error)
763 		goto bad;
764 #endif
765 
766 	auio.uio_iov = mp->msg_iov;
767 	auio.uio_iovcnt = mp->msg_iovlen;
768 	auio.uio_segflg = segflg;
769 	auio.uio_rw = UIO_WRITE;
770 	auio.uio_td = td;
771 	auio.uio_offset = 0;			/* XXX */
772 	auio.uio_resid = 0;
773 	iov = mp->msg_iov;
774 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
775 		if ((auio.uio_resid += iov->iov_len) < 0) {
776 			error = EINVAL;
777 			goto bad;
778 		}
779 	}
780 #ifdef KTRACE
781 	if (KTRPOINT(td, KTR_GENIO))
782 		ktruio = cloneuio(&auio);
783 #endif
784 	len = auio.uio_resid;
785 	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
786 	if (error) {
787 		if (auio.uio_resid != len && (error == ERESTART ||
788 		    error == EINTR || error == EWOULDBLOCK))
789 			error = 0;
790 		/* Generation of SIGPIPE can be controlled per socket */
791 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
792 		    !(flags & MSG_NOSIGNAL)) {
793 			PROC_LOCK(td->td_proc);
794 			tdsignal(td, SIGPIPE);
795 			PROC_UNLOCK(td->td_proc);
796 		}
797 	}
798 	if (error == 0)
799 		td->td_retval[0] = len - auio.uio_resid;
800 #ifdef KTRACE
801 	if (ktruio != NULL) {
802 		ktruio->uio_resid = td->td_retval[0];
803 		ktrgenio(s, UIO_WRITE, ktruio, error);
804 	}
805 #endif
806 bad:
807 	fdrop(fp, td);
808 	return (error);
809 }
810 
811 int
812 sendto(td, uap)
813 	struct thread *td;
814 	struct sendto_args /* {
815 		int	s;
816 		caddr_t	buf;
817 		size_t	len;
818 		int	flags;
819 		caddr_t	to;
820 		int	tolen;
821 	} */ *uap;
822 {
823 	struct msghdr msg;
824 	struct iovec aiov;
825 	int error;
826 
827 	msg.msg_name = uap->to;
828 	msg.msg_namelen = uap->tolen;
829 	msg.msg_iov = &aiov;
830 	msg.msg_iovlen = 1;
831 	msg.msg_control = 0;
832 #ifdef COMPAT_OLDSOCK
833 	msg.msg_flags = 0;
834 #endif
835 	aiov.iov_base = uap->buf;
836 	aiov.iov_len = uap->len;
837 	error = sendit(td, uap->s, &msg, uap->flags);
838 	return (error);
839 }
840 
841 #ifdef COMPAT_OLDSOCK
842 int
843 osend(td, uap)
844 	struct thread *td;
845 	struct osend_args /* {
846 		int	s;
847 		caddr_t	buf;
848 		int	len;
849 		int	flags;
850 	} */ *uap;
851 {
852 	struct msghdr msg;
853 	struct iovec aiov;
854 	int error;
855 
856 	msg.msg_name = 0;
857 	msg.msg_namelen = 0;
858 	msg.msg_iov = &aiov;
859 	msg.msg_iovlen = 1;
860 	aiov.iov_base = uap->buf;
861 	aiov.iov_len = uap->len;
862 	msg.msg_control = 0;
863 	msg.msg_flags = 0;
864 	error = sendit(td, uap->s, &msg, uap->flags);
865 	return (error);
866 }
867 
868 int
869 osendmsg(td, uap)
870 	struct thread *td;
871 	struct osendmsg_args /* {
872 		int	s;
873 		caddr_t	msg;
874 		int	flags;
875 	} */ *uap;
876 {
877 	struct msghdr msg;
878 	struct iovec *iov;
879 	int error;
880 
881 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
882 	if (error)
883 		return (error);
884 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
885 	if (error)
886 		return (error);
887 	msg.msg_iov = iov;
888 	msg.msg_flags = MSG_COMPAT;
889 	error = sendit(td, uap->s, &msg, uap->flags);
890 	free(iov, M_IOV);
891 	return (error);
892 }
893 #endif
894 
895 int
896 sendmsg(td, uap)
897 	struct thread *td;
898 	struct sendmsg_args /* {
899 		int	s;
900 		caddr_t	msg;
901 		int	flags;
902 	} */ *uap;
903 {
904 	struct msghdr msg;
905 	struct iovec *iov;
906 	int error;
907 
908 	error = copyin(uap->msg, &msg, sizeof (msg));
909 	if (error)
910 		return (error);
911 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
912 	if (error)
913 		return (error);
914 	msg.msg_iov = iov;
915 #ifdef COMPAT_OLDSOCK
916 	msg.msg_flags = 0;
917 #endif
918 	error = sendit(td, uap->s, &msg, uap->flags);
919 	free(iov, M_IOV);
920 	return (error);
921 }
922 
/*
 * Receive a message on the socket behind descriptor `s' into the buffers
 * described by `mp'.
 *
 * td        calling thread; on success the number of bytes received is
 *           stored in td->td_retval[0].
 * s         socket descriptor.
 * mp        message header.  msg_iov/msg_iovlen describe the data
 *           buffers, which are always treated as user-space memory.
 *           msg_flags is passed to soreceive() by reference.  On success
 *           msg_namelen and, when control data is copied out here,
 *           msg_controllen are updated to the lengths actually stored.
 * fromseg   address space of mp->msg_name: UIO_USERSPACE makes the source
 *           address go out with copyout(), anything else with bcopy().
 * controlp  optional.  If not NULL, the received control mbuf chain is
 *           handed to the caller through *controlp on success instead of
 *           being copied to mp->msg_control; *controlp is NULL on failure.
 *
 * Returns 0 on success, the getsock() or MAC error, EINVAL if the iovec
 * lengths overflow, the soreceive() error, or a copyout() error.
 * ERESTART, EINTR and EWOULDBLOCK are turned into success if part of the
 * data was received.
 */
int
kern_recvit(td, s, mp, fromseg, controlp)
	struct thread *td;
	int s;
	struct msghdr *mp;
	enum uio_seg fromseg;
	struct mbuf **controlp;
{
	struct uio auio;
	struct iovec *iov;
	int i;
	socklen_t len;
	int error;
	struct mbuf *m, *control = 0;
	caddr_t ctlbuf;
	struct file *fp;
	struct socket *so;
	struct sockaddr *fromsa = 0;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Make sure the caller sees NULL on every failure path. */
	if (controlp != NULL)
		*controlp = NULL;

	AUDIT_ARG_FD(s);
	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
	if (error)
		return (error);
	so = fp->f_data;

#ifdef MAC
	error = mac_socket_check_receive(td->td_ucred, so);
	if (error) {
		fdrop(fp, td);
		return (error);
	}
#endif

	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	/* Total the request size, rejecting lengths that overflow. */
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			fdrop(fp, td);
			return (EINVAL);
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	/* `len' first holds the requested byte count, later other lengths. */
	len = auio.uio_resid;
	/* Control data is only requested when somebody wants it. */
	error = soreceive(so, &fromsa, &auio, (struct mbuf **)0,
	    (mp->msg_control || controlp) ? &control : (struct mbuf **)0,
	    &mp->msg_flags);
	if (error) {
		/* A partial transfer cut short by a signal is a success. */
		if (auio.uio_resid != (int)len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = (int)len - auio.uio_resid;
		ktrgenio(s, UIO_READ, ktruio, error);
	}
#endif
	if (error)
		goto out;
	td->td_retval[0] = (int)len - auio.uio_resid;
	if (mp->msg_name) {
		/* Return the source address, truncated to the caller's size. */
		len = mp->msg_namelen;
		if (len <= 0 || fromsa == 0)
			len = 0;
		else {
			/* save sa_len before it is destroyed by MSG_COMPAT */
			len = MIN(len, fromsa->sa_len);
#ifdef COMPAT_OLDSOCK
			if (mp->msg_flags & MSG_COMPAT)
				((struct osockaddr *)fromsa)->sa_family =
				    fromsa->sa_family;
#endif
			if (fromseg == UIO_USERSPACE) {
				error = copyout(fromsa, mp->msg_name,
				    (unsigned)len);
				if (error)
					goto out;
			} else
				bcopy(fromsa, mp->msg_name, len);
		}
		mp->msg_namelen = len;
	}
	if (mp->msg_control && controlp == NULL) {
#ifdef COMPAT_OLDSOCK
		/*
		 * We assume that old recvmsg calls won't receive access
		 * rights and other control info, esp. as control info
		 * is always optional and those options didn't exist in 4.3.
		 * If we receive rights, trim the cmsghdr; anything else
		 * is tossed.
		 */
		if (control && mp->msg_flags & MSG_COMPAT) {
			if (mtod(control, struct cmsghdr *)->cmsg_level !=
			    SOL_SOCKET ||
			    mtod(control, struct cmsghdr *)->cmsg_type !=
			    SCM_RIGHTS) {
				mp->msg_controllen = 0;
				goto out;
			}
			control->m_len -= sizeof (struct cmsghdr);
			control->m_data += sizeof (struct cmsghdr);
		}
#endif
		/* `len' is now the space left in the caller's control buffer. */
		len = mp->msg_controllen;
		m = control;
		mp->msg_controllen = 0;
		ctlbuf = mp->msg_control;

		/* Copy the chain out mbuf by mbuf until the space runs out. */
		while (m && len > 0) {
			unsigned int tocopy;

			if (len >= m->m_len)
				tocopy = m->m_len;
			else {
				/* Does not fit: copy what fits and flag it. */
				mp->msg_flags |= MSG_CTRUNC;
				tocopy = len;
			}

			if ((error = copyout(mtod(m, caddr_t),
					ctlbuf, tocopy)) != 0)
				goto out;

			ctlbuf += tocopy;
			len -= tocopy;
			m = m->m_next;
		}
		/* Report how many control bytes were actually stored. */
		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
	}
out:
	fdrop(fp, td);
#ifdef KTRACE
	if (fromsa && KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(fromsa);
#endif
	if (fromsa)
		free(fromsa, M_SONAME);

	/* Hand the control chain to the caller on success, else free it. */
	if (error == 0 && controlp != NULL)
		*controlp = control;
	else  if (control)
		m_freem(control);

	return (error);
}
1082 
1083 static int
1084 recvit(td, s, mp, namelenp)
1085 	struct thread *td;
1086 	int s;
1087 	struct msghdr *mp;
1088 	void *namelenp;
1089 {
1090 	int error;
1091 
1092 	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
1093 	if (error)
1094 		return (error);
1095 	if (namelenp) {
1096 		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
1097 #ifdef COMPAT_OLDSOCK
1098 		if (mp->msg_flags & MSG_COMPAT)
1099 			error = 0;	/* old recvfrom didn't check */
1100 #endif
1101 	}
1102 	return (error);
1103 }
1104 
1105 int
1106 recvfrom(td, uap)
1107 	struct thread *td;
1108 	struct recvfrom_args /* {
1109 		int	s;
1110 		caddr_t	buf;
1111 		size_t	len;
1112 		int	flags;
1113 		struct sockaddr * __restrict	from;
1114 		socklen_t * __restrict fromlenaddr;
1115 	} */ *uap;
1116 {
1117 	struct msghdr msg;
1118 	struct iovec aiov;
1119 	int error;
1120 
1121 	if (uap->fromlenaddr) {
1122 		error = copyin(uap->fromlenaddr,
1123 		    &msg.msg_namelen, sizeof (msg.msg_namelen));
1124 		if (error)
1125 			goto done2;
1126 	} else {
1127 		msg.msg_namelen = 0;
1128 	}
1129 	msg.msg_name = uap->from;
1130 	msg.msg_iov = &aiov;
1131 	msg.msg_iovlen = 1;
1132 	aiov.iov_base = uap->buf;
1133 	aiov.iov_len = uap->len;
1134 	msg.msg_control = 0;
1135 	msg.msg_flags = uap->flags;
1136 	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
1137 done2:
1138 	return(error);
1139 }
1140 
1141 #ifdef COMPAT_OLDSOCK
1142 int
1143 orecvfrom(td, uap)
1144 	struct thread *td;
1145 	struct recvfrom_args *uap;
1146 {
1147 
1148 	uap->flags |= MSG_COMPAT;
1149 	return (recvfrom(td, uap));
1150 }
1151 #endif
1152 
1153 #ifdef COMPAT_OLDSOCK
1154 int
1155 orecv(td, uap)
1156 	struct thread *td;
1157 	struct orecv_args /* {
1158 		int	s;
1159 		caddr_t	buf;
1160 		int	len;
1161 		int	flags;
1162 	} */ *uap;
1163 {
1164 	struct msghdr msg;
1165 	struct iovec aiov;
1166 	int error;
1167 
1168 	msg.msg_name = 0;
1169 	msg.msg_namelen = 0;
1170 	msg.msg_iov = &aiov;
1171 	msg.msg_iovlen = 1;
1172 	aiov.iov_base = uap->buf;
1173 	aiov.iov_len = uap->len;
1174 	msg.msg_control = 0;
1175 	msg.msg_flags = uap->flags;
1176 	error = recvit(td, uap->s, &msg, NULL);
1177 	return (error);
1178 }
1179 
1180 /*
1181  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
1182  * overlays the new one, missing only the flags, and with the (old) access
1183  * rights where the control fields are now.
1184  */
1185 int
1186 orecvmsg(td, uap)
1187 	struct thread *td;
1188 	struct orecvmsg_args /* {
1189 		int	s;
1190 		struct	omsghdr *msg;
1191 		int	flags;
1192 	} */ *uap;
1193 {
1194 	struct msghdr msg;
1195 	struct iovec *iov;
1196 	int error;
1197 
1198 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1199 	if (error)
1200 		return (error);
1201 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1202 	if (error)
1203 		return (error);
1204 	msg.msg_flags = uap->flags | MSG_COMPAT;
1205 	msg.msg_iov = iov;
1206 	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
1207 	if (msg.msg_controllen && error == 0)
1208 		error = copyout(&msg.msg_controllen,
1209 		    &uap->msg->msg_accrightslen, sizeof (int));
1210 	free(iov, M_IOV);
1211 	return (error);
1212 }
1213 #endif
1214 
1215 int
1216 recvmsg(td, uap)
1217 	struct thread *td;
1218 	struct recvmsg_args /* {
1219 		int	s;
1220 		struct	msghdr *msg;
1221 		int	flags;
1222 	} */ *uap;
1223 {
1224 	struct msghdr msg;
1225 	struct iovec *uiov, *iov;
1226 	int error;
1227 
1228 	error = copyin(uap->msg, &msg, sizeof (msg));
1229 	if (error)
1230 		return (error);
1231 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1232 	if (error)
1233 		return (error);
1234 	msg.msg_flags = uap->flags;
1235 #ifdef COMPAT_OLDSOCK
1236 	msg.msg_flags &= ~MSG_COMPAT;
1237 #endif
1238 	uiov = msg.msg_iov;
1239 	msg.msg_iov = iov;
1240 	error = recvit(td, uap->s, &msg, NULL);
1241 	if (error == 0) {
1242 		msg.msg_iov = uiov;
1243 		error = copyout(&msg, uap->msg, sizeof(msg));
1244 	}
1245 	free(iov, M_IOV);
1246 	return (error);
1247 }
1248 
1249 /* ARGSUSED */
1250 int
1251 shutdown(td, uap)
1252 	struct thread *td;
1253 	struct shutdown_args /* {
1254 		int	s;
1255 		int	how;
1256 	} */ *uap;
1257 {
1258 	struct socket *so;
1259 	struct file *fp;
1260 	int error;
1261 
1262 	AUDIT_ARG_FD(uap->s);
1263 	error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL);
1264 	if (error == 0) {
1265 		so = fp->f_data;
1266 		error = soshutdown(so, uap->how);
1267 		fdrop(fp, td);
1268 	}
1269 	return (error);
1270 }
1271 
1272 /* ARGSUSED */
1273 int
1274 setsockopt(td, uap)
1275 	struct thread *td;
1276 	struct setsockopt_args /* {
1277 		int	s;
1278 		int	level;
1279 		int	name;
1280 		caddr_t	val;
1281 		int	valsize;
1282 	} */ *uap;
1283 {
1284 
1285 	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
1286 	    uap->val, UIO_USERSPACE, uap->valsize));
1287 }
1288 
1289 int
1290 kern_setsockopt(td, s, level, name, val, valseg, valsize)
1291 	struct thread *td;
1292 	int s;
1293 	int level;
1294 	int name;
1295 	void *val;
1296 	enum uio_seg valseg;
1297 	socklen_t valsize;
1298 {
1299 	int error;
1300 	struct socket *so;
1301 	struct file *fp;
1302 	struct sockopt sopt;
1303 
1304 	if (val == NULL && valsize != 0)
1305 		return (EFAULT);
1306 	if ((int)valsize < 0)
1307 		return (EINVAL);
1308 
1309 	sopt.sopt_dir = SOPT_SET;
1310 	sopt.sopt_level = level;
1311 	sopt.sopt_name = name;
1312 	sopt.sopt_val = val;
1313 	sopt.sopt_valsize = valsize;
1314 	switch (valseg) {
1315 	case UIO_USERSPACE:
1316 		sopt.sopt_td = td;
1317 		break;
1318 	case UIO_SYSSPACE:
1319 		sopt.sopt_td = NULL;
1320 		break;
1321 	default:
1322 		panic("kern_setsockopt called with bad valseg");
1323 	}
1324 
1325 	AUDIT_ARG_FD(s);
1326 	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
1327 	if (error == 0) {
1328 		so = fp->f_data;
1329 		error = sosetopt(so, &sopt);
1330 		fdrop(fp, td);
1331 	}
1332 	return(error);
1333 }
1334 
1335 /* ARGSUSED */
1336 int
1337 getsockopt(td, uap)
1338 	struct thread *td;
1339 	struct getsockopt_args /* {
1340 		int	s;
1341 		int	level;
1342 		int	name;
1343 		void * __restrict	val;
1344 		socklen_t * __restrict avalsize;
1345 	} */ *uap;
1346 {
1347 	socklen_t valsize;
1348 	int	error;
1349 
1350 	if (uap->val) {
1351 		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
1352 		if (error)
1353 			return (error);
1354 	}
1355 
1356 	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
1357 	    uap->val, UIO_USERSPACE, &valsize);
1358 
1359 	if (error == 0)
1360 		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
1361 	return (error);
1362 }
1363 
1364 /*
1365  * Kernel version of getsockopt.
1366  * optval can be a userland or userspace. optlen is always a kernel pointer.
1367  */
1368 int
1369 kern_getsockopt(td, s, level, name, val, valseg, valsize)
1370 	struct thread *td;
1371 	int s;
1372 	int level;
1373 	int name;
1374 	void *val;
1375 	enum uio_seg valseg;
1376 	socklen_t *valsize;
1377 {
1378 	int error;
1379 	struct  socket *so;
1380 	struct file *fp;
1381 	struct	sockopt sopt;
1382 
1383 	if (val == NULL)
1384 		*valsize = 0;
1385 	if ((int)*valsize < 0)
1386 		return (EINVAL);
1387 
1388 	sopt.sopt_dir = SOPT_GET;
1389 	sopt.sopt_level = level;
1390 	sopt.sopt_name = name;
1391 	sopt.sopt_val = val;
1392 	sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
1393 	switch (valseg) {
1394 	case UIO_USERSPACE:
1395 		sopt.sopt_td = td;
1396 		break;
1397 	case UIO_SYSSPACE:
1398 		sopt.sopt_td = NULL;
1399 		break;
1400 	default:
1401 		panic("kern_getsockopt called with bad valseg");
1402 	}
1403 
1404 	AUDIT_ARG_FD(s);
1405 	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
1406 	if (error == 0) {
1407 		so = fp->f_data;
1408 		error = sogetopt(so, &sopt);
1409 		*valsize = sopt.sopt_valsize;
1410 		fdrop(fp, td);
1411 	}
1412 	return (error);
1413 }
1414 
1415 /*
1416  * getsockname1() - Get socket name.
1417  */
1418 /* ARGSUSED */
1419 static int
1420 getsockname1(td, uap, compat)
1421 	struct thread *td;
1422 	struct getsockname_args /* {
1423 		int	fdes;
1424 		struct sockaddr * __restrict asa;
1425 		socklen_t * __restrict alen;
1426 	} */ *uap;
1427 	int compat;
1428 {
1429 	struct sockaddr *sa;
1430 	socklen_t len;
1431 	int error;
1432 
1433 	error = copyin(uap->alen, &len, sizeof(len));
1434 	if (error)
1435 		return (error);
1436 
1437 	error = kern_getsockname(td, uap->fdes, &sa, &len);
1438 	if (error)
1439 		return (error);
1440 
1441 	if (len != 0) {
1442 #ifdef COMPAT_OLDSOCK
1443 		if (compat)
1444 			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1445 #endif
1446 		error = copyout(sa, uap->asa, (u_int)len);
1447 	}
1448 	free(sa, M_SONAME);
1449 	if (error == 0)
1450 		error = copyout(&len, uap->alen, sizeof(len));
1451 	return (error);
1452 }
1453 
1454 int
1455 kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
1456     socklen_t *alen)
1457 {
1458 	struct socket *so;
1459 	struct file *fp;
1460 	socklen_t len;
1461 	int error;
1462 
1463 	if (*alen < 0)
1464 		return (EINVAL);
1465 
1466 	AUDIT_ARG_FD(fd);
1467 	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
1468 	if (error)
1469 		return (error);
1470 	so = fp->f_data;
1471 	*sa = NULL;
1472 	CURVNET_SET(so->so_vnet);
1473 	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
1474 	CURVNET_RESTORE();
1475 	if (error)
1476 		goto bad;
1477 	if (*sa == NULL)
1478 		len = 0;
1479 	else
1480 		len = MIN(*alen, (*sa)->sa_len);
1481 	*alen = len;
1482 #ifdef KTRACE
1483 	if (KTRPOINT(td, KTR_STRUCT))
1484 		ktrsockaddr(*sa);
1485 #endif
1486 bad:
1487 	fdrop(fp, td);
1488 	if (error && *sa) {
1489 		free(*sa, M_SONAME);
1490 		*sa = NULL;
1491 	}
1492 	return (error);
1493 }
1494 
/*
 * getsockname system call: getsockname1() without old-socket
 * compatibility handling.
 */
int
getsockname(struct thread *td, struct getsockname_args *uap)
{
	const int compat = 0;

	return (getsockname1(td, uap, compat));
}
1503 
#ifdef COMPAT_OLDSOCK
/*
 * Old getsockname system call: getsockname1() with old-socket
 * compatibility handling enabled.
 */
int
ogetsockname(struct thread *td, struct getsockname_args *uap)
{
	const int compat = 1;

	return (getsockname1(td, uap, compat));
}
#endif /* COMPAT_OLDSOCK */
1514 
1515 /*
1516  * getpeername1() - Get name of peer for connected socket.
1517  */
1518 /* ARGSUSED */
1519 static int
1520 getpeername1(td, uap, compat)
1521 	struct thread *td;
1522 	struct getpeername_args /* {
1523 		int	fdes;
1524 		struct sockaddr * __restrict	asa;
1525 		socklen_t * __restrict	alen;
1526 	} */ *uap;
1527 	int compat;
1528 {
1529 	struct sockaddr *sa;
1530 	socklen_t len;
1531 	int error;
1532 
1533 	error = copyin(uap->alen, &len, sizeof (len));
1534 	if (error)
1535 		return (error);
1536 
1537 	error = kern_getpeername(td, uap->fdes, &sa, &len);
1538 	if (error)
1539 		return (error);
1540 
1541 	if (len != 0) {
1542 #ifdef COMPAT_OLDSOCK
1543 		if (compat)
1544 			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1545 #endif
1546 		error = copyout(sa, uap->asa, (u_int)len);
1547 	}
1548 	free(sa, M_SONAME);
1549 	if (error == 0)
1550 		error = copyout(&len, uap->alen, sizeof(len));
1551 	return (error);
1552 }
1553 
1554 int
1555 kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
1556     socklen_t *alen)
1557 {
1558 	struct socket *so;
1559 	struct file *fp;
1560 	socklen_t len;
1561 	int error;
1562 
1563 	if (*alen < 0)
1564 		return (EINVAL);
1565 
1566 	AUDIT_ARG_FD(fd);
1567 	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
1568 	if (error)
1569 		return (error);
1570 	so = fp->f_data;
1571 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1572 		error = ENOTCONN;
1573 		goto done;
1574 	}
1575 	*sa = NULL;
1576 	CURVNET_SET(so->so_vnet);
1577 	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
1578 	CURVNET_RESTORE();
1579 	if (error)
1580 		goto bad;
1581 	if (*sa == NULL)
1582 		len = 0;
1583 	else
1584 		len = MIN(*alen, (*sa)->sa_len);
1585 	*alen = len;
1586 #ifdef KTRACE
1587 	if (KTRPOINT(td, KTR_STRUCT))
1588 		ktrsockaddr(*sa);
1589 #endif
1590 bad:
1591 	if (error && *sa) {
1592 		free(*sa, M_SONAME);
1593 		*sa = NULL;
1594 	}
1595 done:
1596 	fdrop(fp, td);
1597 	return (error);
1598 }
1599 
/*
 * getpeername system call: getpeername1() without old-socket
 * compatibility handling.
 */
int
getpeername(struct thread *td, struct getpeername_args *uap)
{
	const int compat = 0;

	return (getpeername1(td, uap, compat));
}
1608 
#ifdef COMPAT_OLDSOCK
/*
 * Old getpeername system call: getpeername1() with old-socket
 * compatibility handling enabled.
 */
int
ogetpeername(struct thread *td, struct ogetpeername_args *uap)
{
	struct getpeername_args *args;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	args = (struct getpeername_args *)uap;
	return (getpeername1(td, args, 1));
}
#endif /* COMPAT_OLDSOCK */
1620 
1621 int
1622 sockargs(mp, buf, buflen, type)
1623 	struct mbuf **mp;
1624 	caddr_t buf;
1625 	int buflen, type;
1626 {
1627 	struct sockaddr *sa;
1628 	struct mbuf *m;
1629 	int error;
1630 
1631 	if ((u_int)buflen > MLEN) {
1632 #ifdef COMPAT_OLDSOCK
1633 		if (type == MT_SONAME && (u_int)buflen <= 112)
1634 			buflen = MLEN;		/* unix domain compat. hack */
1635 		else
1636 #endif
1637 			if ((u_int)buflen > MCLBYTES)
1638 				return (EINVAL);
1639 	}
1640 	m = m_get(M_WAIT, type);
1641 	if ((u_int)buflen > MLEN)
1642 		MCLGET(m, M_WAIT);
1643 	m->m_len = buflen;
1644 	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1645 	if (error)
1646 		(void) m_free(m);
1647 	else {
1648 		*mp = m;
1649 		if (type == MT_SONAME) {
1650 			sa = mtod(m, struct sockaddr *);
1651 
1652 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1653 			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1654 				sa->sa_family = sa->sa_len;
1655 #endif
1656 			sa->sa_len = buflen;
1657 		}
1658 	}
1659 	return (error);
1660 }
1661 
1662 int
1663 getsockaddr(namp, uaddr, len)
1664 	struct sockaddr **namp;
1665 	caddr_t uaddr;
1666 	size_t len;
1667 {
1668 	struct sockaddr *sa;
1669 	int error;
1670 
1671 	if (len > SOCK_MAXADDRLEN)
1672 		return (ENAMETOOLONG);
1673 	if (len < offsetof(struct sockaddr, sa_data[0]))
1674 		return (EINVAL);
1675 	sa = malloc(len, M_SONAME, M_WAITOK);
1676 	error = copyin(uaddr, sa, len);
1677 	if (error) {
1678 		free(sa, M_SONAME);
1679 	} else {
1680 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1681 		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1682 			sa->sa_family = sa->sa_len;
1683 #endif
1684 		sa->sa_len = len;
1685 		*namp = sa;
1686 	}
1687 	return (error);
1688 }
1689 
1690 #include <sys/condvar.h>
1691 
/*
 * Completion tracking for sendfile(2) with SF_SYNC.  kern_sendfile()
 * allocates one of these when SF_SYNC is set, bumps count for each sf_buf
 * mbuf it queues, and before returning waits on cv until count is zero.
 * sf_buf_mext() decrements count as each such buffer is released.
 */
1692 struct sendfile_sync {
1693 	struct mtx	mtx;	/* held around every access to count */
1694 	struct cv	cv;	/* signalled when count drops to zero */
1695 	unsigned 	count;	/* sf_buf mbufs queued, not yet released */
1696 };
1697 
1698 /*
1699  * Detach mapped page and release resources back to the system.
1700  */
1701 void
1702 sf_buf_mext(void *addr, void *args)
1703 {
1704 	vm_page_t m;
1705 	struct sendfile_sync *sfs;
1706 
1707 	m = sf_buf_page(args);
1708 	sf_buf_free(args);
1709 	vm_page_lock(m);
1710 	vm_page_unwire(m, 0);
1711 	/*
1712 	 * Check for the object going away on us. This can
1713 	 * happen since we don't hold a reference to it.
1714 	 * If so, we're responsible for freeing the page.
1715 	 */
1716 	if (m->wire_count == 0 && m->object == NULL)
1717 		vm_page_free(m);
1718 	vm_page_unlock(m);
1719 	if (addr == NULL)
1720 		return;
1721 	sfs = addr;
1722 	mtx_lock(&sfs->mtx);
1723 	KASSERT(sfs->count> 0, ("Sendfile sync botchup count == 0"));
1724 	if (--sfs->count == 0)
1725 		cv_signal(&sfs->cv);
1726 	mtx_unlock(&sfs->mtx);
1727 }
1728 
/*
 * sendfile(2)
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Send a file specified by 'fd' and starting at 'offset' to a socket
 * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes ==
 * 0.  Optionally add a header and/or trailer to the socket output.  If
 * specified, write the total number of bytes sent into *sbytes.
 *
 * This entry point runs do_sendfile() with compat disabled.
 */
int
sendfile(struct thread *td, struct sendfile_args *uap)
{
	const int compat = 0;

	return (do_sendfile(td, uap, compat));
}
1746 
1747 static int
1748 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
1749 {
1750 	struct sf_hdtr hdtr;
1751 	struct uio *hdr_uio, *trl_uio;
1752 	int error;
1753 
1754 	hdr_uio = trl_uio = NULL;
1755 
1756 	if (uap->hdtr != NULL) {
1757 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1758 		if (error)
1759 			goto out;
1760 		if (hdtr.headers != NULL) {
1761 			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
1762 			if (error)
1763 				goto out;
1764 		}
1765 		if (hdtr.trailers != NULL) {
1766 			error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
1767 			if (error)
1768 				goto out;
1769 
1770 		}
1771 	}
1772 
1773 	error = kern_sendfile(td, uap, hdr_uio, trl_uio, compat);
1774 out:
1775 	if (hdr_uio)
1776 		free(hdr_uio, M_IOV);
1777 	if (trl_uio)
1778 		free(trl_uio, M_IOV);
1779 	return (error);
1780 }
1781 
#ifdef COMPAT_FREEBSD4
/*
 * FreeBSD 4 compatible sendfile: repackages the arguments as a
 * struct sendfile_args and runs do_sendfile() with compat enabled.
 */
int
freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
{
	struct sendfile_args nargs;

	/* Source and destination descriptors. */
	nargs.fd = uap->fd;
	nargs.s = uap->s;

	/* Extent of the file to send. */
	nargs.offset = uap->offset;
	nargs.nbytes = uap->nbytes;

	/* Optional header/trailer, byte-count result and flags. */
	nargs.hdtr = uap->hdtr;
	nargs.sbytes = uap->sbytes;
	nargs.flags = uap->flags;

	return (do_sendfile(td, &nargs, 1));
}
#endif /* COMPAT_FREEBSD4 */
1799 
1800 int
1801 kern_sendfile(struct thread *td, struct sendfile_args *uap,
1802     struct uio *hdr_uio, struct uio *trl_uio, int compat)
1803 {
1804 	struct file *sock_fp;
1805 	struct vnode *vp;
1806 	struct vm_object *obj = NULL;
1807 	struct socket *so = NULL;
1808 	struct mbuf *m = NULL;
1809 	struct sf_buf *sf;
1810 	struct vm_page *pg;
1811 	off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0;
1812 	int error, hdrlen = 0, mnw = 0;
1813 	int vfslocked;
1814 	struct sendfile_sync *sfs = NULL;
1815 
1816 	/*
1817 	 * The file descriptor must be a regular file and have a
1818 	 * backing VM object.
1819 	 * File offset must be positive.  If it goes beyond EOF
1820 	 * we send only the header/trailer and no payload data.
1821 	 */
1822 	AUDIT_ARG_FD(uap->fd);
1823 	if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
1824 		goto out;
1825 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1826 	vn_lock(vp, LK_SHARED | LK_RETRY);
1827 	if (vp->v_type == VREG) {
1828 		obj = vp->v_object;
1829 		if (obj != NULL) {
1830 			/*
1831 			 * Temporarily increase the backing VM
1832 			 * object's reference count so that a forced
1833 			 * reclamation of its vnode does not
1834 			 * immediately destroy it.
1835 			 */
1836 			VM_OBJECT_LOCK(obj);
1837 			if ((obj->flags & OBJ_DEAD) == 0) {
1838 				vm_object_reference_locked(obj);
1839 				VM_OBJECT_UNLOCK(obj);
1840 			} else {
1841 				VM_OBJECT_UNLOCK(obj);
1842 				obj = NULL;
1843 			}
1844 		}
1845 	}
1846 	VOP_UNLOCK(vp, 0);
1847 	VFS_UNLOCK_GIANT(vfslocked);
1848 	if (obj == NULL) {
1849 		error = EINVAL;
1850 		goto out;
1851 	}
1852 	if (uap->offset < 0) {
1853 		error = EINVAL;
1854 		goto out;
1855 	}
1856 
1857 	/*
1858 	 * The socket must be a stream socket and connected.
1859 	 * Remember if it a blocking or non-blocking socket.
1860 	 */
1861 	if ((error = getsock(td->td_proc->p_fd, uap->s, &sock_fp,
1862 	    NULL)) != 0)
1863 		goto out;
1864 	so = sock_fp->f_data;
1865 	if (so->so_type != SOCK_STREAM) {
1866 		error = EINVAL;
1867 		goto out;
1868 	}
1869 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1870 		error = ENOTCONN;
1871 		goto out;
1872 	}
1873 	/*
1874 	 * Do not wait on memory allocations but return ENOMEM for
1875 	 * caller to retry later.
1876 	 * XXX: Experimental.
1877 	 */
1878 	if (uap->flags & SF_MNOWAIT)
1879 		mnw = 1;
1880 
1881 	if (uap->flags & SF_SYNC) {
1882 		sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO);
1883 		mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
1884 		cv_init(&sfs->cv, "sendfile");
1885 	}
1886 
1887 #ifdef MAC
1888 	error = mac_socket_check_send(td->td_ucred, so);
1889 	if (error)
1890 		goto out;
1891 #endif
1892 
1893 	/* If headers are specified copy them into mbufs. */
1894 	if (hdr_uio != NULL) {
1895 		hdr_uio->uio_td = td;
1896 		hdr_uio->uio_rw = UIO_WRITE;
1897 		if (hdr_uio->uio_resid > 0) {
1898 			/*
1899 			 * In FBSD < 5.0 the nbytes to send also included
1900 			 * the header.  If compat is specified subtract the
1901 			 * header size from nbytes.
1902 			 */
1903 			if (compat) {
1904 				if (uap->nbytes > hdr_uio->uio_resid)
1905 					uap->nbytes -= hdr_uio->uio_resid;
1906 				else
1907 					uap->nbytes = 0;
1908 			}
1909 			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
1910 			    0, 0, 0);
1911 			if (m == NULL) {
1912 				error = mnw ? EAGAIN : ENOBUFS;
1913 				goto out;
1914 			}
1915 			hdrlen = m_length(m, NULL);
1916 		}
1917 	}
1918 
1919 	/*
1920 	 * Protect against multiple writers to the socket.
1921 	 *
1922 	 * XXXRW: Historically this has assumed non-interruptibility, so now
1923 	 * we implement that, but possibly shouldn't.
1924 	 */
1925 	(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
1926 
1927 	/*
1928 	 * Loop through the pages of the file, starting with the requested
1929 	 * offset. Get a file page (do I/O if necessary), map the file page
1930 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1931 	 * it on the socket.
1932 	 * This is done in two loops.  The inner loop turns as many pages
1933 	 * as it can, up to available socket buffer space, without blocking
1934 	 * into mbufs to have it bulk delivered into the socket send buffer.
1935 	 * The outer loop checks the state and available space of the socket
1936 	 * and takes care of the overall progress.
1937 	 */
1938 	for (off = uap->offset, rem = uap->nbytes; ; ) {
1939 		int loopbytes = 0;
1940 		int space = 0;
1941 		int done = 0;
1942 
1943 		/*
1944 		 * Check the socket state for ongoing connection,
1945 		 * no errors and space in socket buffer.
1946 		 * If space is low allow for the remainder of the
1947 		 * file to be processed if it fits the socket buffer.
1948 		 * Otherwise block in waiting for sufficient space
1949 		 * to proceed, or if the socket is nonblocking, return
1950 		 * to userland with EAGAIN while reporting how far
1951 		 * we've come.
1952 		 * We wait until the socket buffer has significant free
1953 		 * space to do bulk sends.  This makes good use of file
1954 		 * system read ahead and allows packet segmentation
1955 		 * offloading hardware to take over lots of work.  If
1956 		 * we were not careful here we would send off only one
1957 		 * sfbuf at a time.
1958 		 */
1959 		SOCKBUF_LOCK(&so->so_snd);
1960 		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
1961 			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
1962 retry_space:
1963 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1964 			error = EPIPE;
1965 			SOCKBUF_UNLOCK(&so->so_snd);
1966 			goto done;
1967 		} else if (so->so_error) {
1968 			error = so->so_error;
1969 			so->so_error = 0;
1970 			SOCKBUF_UNLOCK(&so->so_snd);
1971 			goto done;
1972 		}
1973 		space = sbspace(&so->so_snd);
1974 		if (space < rem &&
1975 		    (space <= 0 ||
1976 		     space < so->so_snd.sb_lowat)) {
1977 			if (so->so_state & SS_NBIO) {
1978 				SOCKBUF_UNLOCK(&so->so_snd);
1979 				error = EAGAIN;
1980 				goto done;
1981 			}
1982 			/*
1983 			 * sbwait drops the lock while sleeping.
1984 			 * When we loop back to retry_space the
1985 			 * state may have changed and we retest
1986 			 * for it.
1987 			 */
1988 			error = sbwait(&so->so_snd);
1989 			/*
1990 			 * An error from sbwait usually indicates that we've
1991 			 * been interrupted by a signal. If we've sent anything
1992 			 * then return bytes sent, otherwise return the error.
1993 			 */
1994 			if (error) {
1995 				SOCKBUF_UNLOCK(&so->so_snd);
1996 				goto done;
1997 			}
1998 			goto retry_space;
1999 		}
2000 		SOCKBUF_UNLOCK(&so->so_snd);
2001 
2002 		/*
2003 		 * Reduce space in the socket buffer by the size of
2004 		 * the header mbuf chain.
2005 		 * hdrlen is set to 0 after the first loop.
2006 		 */
2007 		space -= hdrlen;
2008 
2009 		/*
2010 		 * Loop and construct maximum sized mbuf chain to be bulk
2011 		 * dumped into socket buffer.
2012 		 */
2013 		while (space > loopbytes) {
2014 			vm_pindex_t pindex;
2015 			vm_offset_t pgoff;
2016 			struct mbuf *m0;
2017 
2018 			VM_OBJECT_LOCK(obj);
2019 			/*
2020 			 * Calculate the amount to transfer.
2021 			 * Not to exceed a page, the EOF,
2022 			 * or the passed in nbytes.
2023 			 */
2024 			pgoff = (vm_offset_t)(off & PAGE_MASK);
2025 			xfsize = omin(PAGE_SIZE - pgoff,
2026 			    obj->un_pager.vnp.vnp_size - uap->offset -
2027 			    fsbytes - loopbytes);
2028 			if (uap->nbytes)
2029 				rem = (uap->nbytes - fsbytes - loopbytes);
2030 			else
2031 				rem = obj->un_pager.vnp.vnp_size -
2032 				    uap->offset - fsbytes - loopbytes;
2033 			xfsize = omin(rem, xfsize);
2034 			xfsize = omin(space - loopbytes, xfsize);
2035 			if (xfsize <= 0) {
2036 				VM_OBJECT_UNLOCK(obj);
2037 				done = 1;		/* all data sent */
2038 				break;
2039 			}
2040 
2041 			/*
2042 			 * Attempt to look up the page.  Allocate
2043 			 * if not found or wait and loop if busy.
2044 			 */
2045 			pindex = OFF_TO_IDX(off);
2046 			pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
2047 			    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_RETRY);
2048 
2049 			/*
2050 			 * Check if page is valid for what we need,
2051 			 * otherwise initiate I/O.
2052 			 * If we already turned some pages into mbufs,
2053 			 * send them off before we come here again and
2054 			 * block.
2055 			 */
2056 			if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
2057 				VM_OBJECT_UNLOCK(obj);
2058 			else if (m != NULL)
2059 				error = EAGAIN;	/* send what we already got */
2060 			else if (uap->flags & SF_NODISKIO)
2061 				error = EBUSY;
2062 			else {
2063 				int bsize, resid;
2064 
2065 				/*
2066 				 * Ensure that our page is still around
2067 				 * when the I/O completes.
2068 				 */
2069 				vm_page_io_start(pg);
2070 				VM_OBJECT_UNLOCK(obj);
2071 
2072 				/*
2073 				 * Get the page from backing store.
2074 				 */
2075 				vfslocked = VFS_LOCK_GIANT(vp->v_mount);
2076 				error = vn_lock(vp, LK_SHARED);
2077 				if (error != 0)
2078 					goto after_read;
2079 				bsize = vp->v_mount->mnt_stat.f_iosize;
2080 
2081 				/*
2082 				 * XXXMAC: Because we don't have fp->f_cred
2083 				 * here, we pass in NOCRED.  This is probably
2084 				 * wrong, but is consistent with our original
2085 				 * implementation.
2086 				 */
2087 				error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
2088 				    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
2089 				    IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
2090 				    td->td_ucred, NOCRED, &resid, td);
2091 				VOP_UNLOCK(vp, 0);
2092 			after_read:
2093 				VFS_UNLOCK_GIANT(vfslocked);
2094 				VM_OBJECT_LOCK(obj);
2095 				vm_page_io_finish(pg);
2096 				if (!error)
2097 					VM_OBJECT_UNLOCK(obj);
2098 				mbstat.sf_iocnt++;
2099 			}
2100 			if (error) {
2101 				vm_page_lock(pg);
2102 				vm_page_unwire(pg, 0);
2103 				/*
2104 				 * See if anyone else might know about
2105 				 * this page.  If not and it is not valid,
2106 				 * then free it.
2107 				 */
2108 				if (pg->wire_count == 0 && pg->valid == 0 &&
2109 				    pg->busy == 0 && !(pg->oflags & VPO_BUSY))
2110 					vm_page_free(pg);
2111 				vm_page_unlock(pg);
2112 				VM_OBJECT_UNLOCK(obj);
2113 				if (error == EAGAIN)
2114 					error = 0;	/* not a real error */
2115 				break;
2116 			}
2117 
2118 			/*
2119 			 * Get a sendfile buf.  When allocating the
2120 			 * first buffer for mbuf chain, we usually
2121 			 * wait as long as necessary, but this wait
2122 			 * can be interrupted.  For consequent
2123 			 * buffers, do not sleep, since several
2124 			 * threads might exhaust the buffers and then
2125 			 * deadlock.
2126 			 */
2127 			sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
2128 			    SFB_CATCH);
2129 			if (sf == NULL) {
2130 				mbstat.sf_allocfail++;
2131 				vm_page_lock(pg);
2132 				vm_page_unwire(pg, 0);
2133 				KASSERT(pg->object != NULL,
2134 				    ("kern_sendfile: object disappeared"));
2135 				vm_page_unlock(pg);
2136 				if (m == NULL)
2137 					error = (mnw ? EAGAIN : EINTR);
2138 				break;
2139 			}
2140 
2141 			/*
2142 			 * Get an mbuf and set it up as having
2143 			 * external storage.
2144 			 */
2145 			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
2146 			if (m0 == NULL) {
2147 				error = (mnw ? EAGAIN : ENOBUFS);
2148 				sf_buf_mext((void *)sf_buf_kva(sf), sf);
2149 				break;
2150 			}
2151 			MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext,
2152 			    sfs, sf, M_RDONLY, EXT_SFBUF);
2153 			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
2154 			m0->m_len = xfsize;
2155 
2156 			/* Append to mbuf chain. */
2157 			if (m != NULL)
2158 				m_cat(m, m0);
2159 			else
2160 				m = m0;
2161 
2162 			/* Keep track of bits processed. */
2163 			loopbytes += xfsize;
2164 			off += xfsize;
2165 
2166 			if (sfs != NULL) {
2167 				mtx_lock(&sfs->mtx);
2168 				sfs->count++;
2169 				mtx_unlock(&sfs->mtx);
2170 			}
2171 		}
2172 
2173 		/* Add the buffer chain to the socket buffer. */
2174 		if (m != NULL) {
2175 			int mlen, err;
2176 
2177 			mlen = m_length(m, NULL);
2178 			SOCKBUF_LOCK(&so->so_snd);
2179 			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2180 				error = EPIPE;
2181 				SOCKBUF_UNLOCK(&so->so_snd);
2182 				goto done;
2183 			}
2184 			SOCKBUF_UNLOCK(&so->so_snd);
2185 			CURVNET_SET(so->so_vnet);
2186 			/* Avoid error aliasing. */
2187 			err = (*so->so_proto->pr_usrreqs->pru_send)
2188 				    (so, 0, m, NULL, NULL, td);
2189 			CURVNET_RESTORE();
2190 			if (err == 0) {
2191 				/*
2192 				 * We need two counters to get the
2193 				 * file offset and nbytes to send
2194 				 * right:
2195 				 * - sbytes contains the total amount
2196 				 *   of bytes sent, including headers.
2197 				 * - fsbytes contains the total amount
2198 				 *   of bytes sent from the file.
2199 				 */
2200 				sbytes += mlen;
2201 				fsbytes += mlen;
2202 				if (hdrlen) {
2203 					fsbytes -= hdrlen;
2204 					hdrlen = 0;
2205 				}
2206 			} else if (error == 0)
2207 				error = err;
2208 			m = NULL;	/* pru_send always consumes */
2209 		}
2210 
2211 		/* Quit outer loop on error or when we're done. */
2212 		if (done)
2213 			break;
2214 		if (error)
2215 			goto done;
2216 	}
2217 
2218 	/*
2219 	 * Send trailers. Wimp out and use writev(2).
2220 	 */
2221 	if (trl_uio != NULL) {
2222 		sbunlock(&so->so_snd);
2223 		error = kern_writev(td, uap->s, trl_uio);
2224 		if (error == 0)
2225 			sbytes += td->td_retval[0];
2226 		goto out;
2227 	}
2228 
2229 done:
2230 	sbunlock(&so->so_snd);
2231 out:
2232 	/*
2233 	 * If there was no error we have to clear td->td_retval[0]
2234 	 * because it may have been set by writev.
2235 	 */
2236 	if (error == 0) {
2237 		td->td_retval[0] = 0;
2238 	}
2239 	if (uap->sbytes != NULL) {
2240 		copyout(&sbytes, uap->sbytes, sizeof(off_t));
2241 	}
2242 	if (obj != NULL)
2243 		vm_object_deallocate(obj);
2244 	if (vp != NULL) {
2245 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
2246 		vrele(vp);
2247 		VFS_UNLOCK_GIANT(vfslocked);
2248 	}
2249 	if (so)
2250 		fdrop(sock_fp, td);
2251 	if (m)
2252 		m_freem(m);
2253 
2254 	if (sfs != NULL) {
2255 		mtx_lock(&sfs->mtx);
2256 		if (sfs->count != 0)
2257 			cv_wait(&sfs->cv, &sfs->mtx);
2258 		KASSERT(sfs->count == 0, ("sendfile sync still busy"));
2259 		cv_destroy(&sfs->cv);
2260 		mtx_destroy(&sfs->mtx);
2261 		free(sfs, M_TEMP);
2262 	}
2263 
2264 	if (error == ERESTART)
2265 		error = EINTR;
2266 
2267 	return (error);
2268 }
2269 
2270 /*
2271  * SCTP syscalls.
2272  * Functionality only compiled in if SCTP is defined in the kernel Makefile,
2273  * otherwise all return EOPNOTSUPP.
2274  * XXX: We should make this loadable one day.
2275  */
2276 int
2277 sctp_peeloff(td, uap)
2278 	struct thread *td;
2279 	struct sctp_peeloff_args /* {
2280 		int	sd;
2281 		caddr_t	name;
2282 	} */ *uap;
2283 {
2284 #if (defined(INET) || defined(INET6)) && defined(SCTP)
2285 	struct filedesc *fdp;
2286 	struct file *nfp = NULL;
2287 	int error;
2288 	struct socket *head, *so;
2289 	int fd;
2290 	u_int fflag;
2291 
2292 	fdp = td->td_proc->p_fd;
2293 	AUDIT_ARG_FD(uap->sd);
2294 	error = fgetsock(td, uap->sd, &head, &fflag);
2295 	if (error)
2296 		goto done2;
2297 	error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
2298 	if (error)
2299 		goto done2;
2300 	/*
2301 	 * At this point we know we do have a assoc to pull
2302 	 * we proceed to get the fd setup. This may block
2303 	 * but that is ok.
2304 	 */
2305 
2306 	error = falloc(td, &nfp, &fd, 0);
2307 	if (error)
2308 		goto done;
2309 	td->td_retval[0] = fd;
2310 
2311 	CURVNET_SET(head->so_vnet);
2312 	so = sonewconn(head, SS_ISCONNECTED);
2313 	if (so == NULL)
2314 		goto noconnection;
2315 	/*
2316 	 * Before changing the flags on the socket, we have to bump the
2317 	 * reference count.  Otherwise, if the protocol calls sofree(),
2318 	 * the socket will be released due to a zero refcount.
2319 	 */
2320         SOCK_LOCK(so);
2321         soref(so);                      /* file descriptor reference */
2322         SOCK_UNLOCK(so);
2323 
2324 	ACCEPT_LOCK();
2325 
2326 	TAILQ_REMOVE(&head->so_comp, so, so_list);
2327 	head->so_qlen--;
2328 	so->so_state |= (head->so_state & SS_NBIO);
2329 	so->so_state &= ~SS_NOFDREF;
2330 	so->so_qstate &= ~SQ_COMP;
2331 	so->so_head = NULL;
2332 	ACCEPT_UNLOCK();
2333 	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
2334 	error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
2335 	if (error)
2336 		goto noconnection;
2337 	if (head->so_sigio != NULL)
2338 		fsetown(fgetown(&head->so_sigio), &so->so_sigio);
2339 
2340 noconnection:
2341 	/*
2342 	 * close the new descriptor, assuming someone hasn't ripped it
2343 	 * out from under us.
2344 	 */
2345 	if (error)
2346 		fdclose(fdp, nfp, fd, td);
2347 
2348 	/*
2349 	 * Release explicitly held references before returning.
2350 	 */
2351 	CURVNET_RESTORE();
2352 done:
2353 	if (nfp != NULL)
2354 		fdrop(nfp, td);
2355 	fputsock(head);
2356 done2:
2357 	return (error);
2358 #else  /* SCTP */
2359 	return (EOPNOTSUPP);
2360 #endif /* SCTP */
2361 }
2362 
2363 int
2364 sctp_generic_sendmsg (td, uap)
2365 	struct thread *td;
2366 	struct sctp_generic_sendmsg_args /* {
2367 		int sd,
2368 		caddr_t msg,
2369 		int mlen,
2370 		caddr_t to,
2371 		__socklen_t tolen,
2372 		struct sctp_sndrcvinfo *sinfo,
2373 		int flags
2374 	} */ *uap;
2375 {
2376 #if (defined(INET) || defined(INET6)) && defined(SCTP)
2377 	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
2378 	struct socket *so;
2379 	struct file *fp = NULL;
2380 	int error = 0, len;
2381 	struct sockaddr *to = NULL;
2382 #ifdef KTRACE
2383 	struct uio *ktruio = NULL;
2384 #endif
2385 	struct uio auio;
2386 	struct iovec iov[1];
2387 
2388 	if (uap->sinfo) {
2389 		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
2390 		if (error)
2391 			return (error);
2392 		u_sinfo = &sinfo;
2393 	}
2394 	if (uap->tolen) {
2395 		error = getsockaddr(&to, uap->to, uap->tolen);
2396 		if (error) {
2397 			to = NULL;
2398 			goto sctp_bad2;
2399 		}
2400 	}
2401 
2402 	AUDIT_ARG_FD(uap->sd);
2403 	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
2404 	if (error)
2405 		goto sctp_bad;
2406 #ifdef KTRACE
2407 	if (to && (KTRPOINT(td, KTR_STRUCT)))
2408 		ktrsockaddr(to);
2409 #endif
2410 
2411 	iov[0].iov_base = uap->msg;
2412 	iov[0].iov_len = uap->mlen;
2413 
2414 	so = (struct socket *)fp->f_data;
2415 #ifdef MAC
2416 	error = mac_socket_check_send(td->td_ucred, so);
2417 	if (error)
2418 		goto sctp_bad;
2419 #endif /* MAC */
2420 
2421 	auio.uio_iov =  iov;
2422 	auio.uio_iovcnt = 1;
2423 	auio.uio_segflg = UIO_USERSPACE;
2424 	auio.uio_rw = UIO_WRITE;
2425 	auio.uio_td = td;
2426 	auio.uio_offset = 0;			/* XXX */
2427 	auio.uio_resid = 0;
2428 	len = auio.uio_resid = uap->mlen;
2429 	CURVNET_SET(so->so_vnet);
2430 	error = sctp_lower_sosend(so, to, &auio,
2431 		    (struct mbuf *)NULL, (struct mbuf *)NULL,
2432 		    uap->flags, u_sinfo, td);
2433 	CURVNET_RESTORE();
2434 	if (error) {
2435 		if (auio.uio_resid != len && (error == ERESTART ||
2436 		    error == EINTR || error == EWOULDBLOCK))
2437 			error = 0;
2438 		/* Generation of SIGPIPE can be controlled per socket. */
2439 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
2440 		    !(uap->flags & MSG_NOSIGNAL)) {
2441 			PROC_LOCK(td->td_proc);
2442 			tdsignal(td, SIGPIPE);
2443 			PROC_UNLOCK(td->td_proc);
2444 		}
2445 	}
2446 	if (error == 0)
2447 		td->td_retval[0] = len - auio.uio_resid;
2448 #ifdef KTRACE
2449 	if (ktruio != NULL) {
2450 		ktruio->uio_resid = td->td_retval[0];
2451 		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
2452 	}
2453 #endif /* KTRACE */
2454 sctp_bad:
2455 	if (fp)
2456 		fdrop(fp, td);
2457 sctp_bad2:
2458 	if (to)
2459 		free(to, M_SONAME);
2460 	return (error);
2461 #else  /* SCTP */
2462 	return (EOPNOTSUPP);
2463 #endif /* SCTP */
2464 }
2465 
2466 int
2467 sctp_generic_sendmsg_iov(td, uap)
2468 	struct thread *td;
2469 	struct sctp_generic_sendmsg_iov_args /* {
2470 		int sd,
2471 		struct iovec *iov,
2472 		int iovlen,
2473 		caddr_t to,
2474 		__socklen_t tolen,
2475 		struct sctp_sndrcvinfo *sinfo,
2476 		int flags
2477 	} */ *uap;
2478 {
2479 #if (defined(INET) || defined(INET6)) && defined(SCTP)
2480 	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
2481 	struct socket *so;
2482 	struct file *fp = NULL;
2483 	int error=0, len, i;
2484 	struct sockaddr *to = NULL;
2485 #ifdef KTRACE
2486 	struct uio *ktruio = NULL;
2487 #endif
2488 	struct uio auio;
2489 	struct iovec *iov, *tiov;
2490 
2491 	if (uap->sinfo) {
2492 		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
2493 		if (error)
2494 			return (error);
2495 		u_sinfo = &sinfo;
2496 	}
2497 	if (uap->tolen) {
2498 		error = getsockaddr(&to, uap->to, uap->tolen);
2499 		if (error) {
2500 			to = NULL;
2501 			goto sctp_bad2;
2502 		}
2503 	}
2504 
2505 	AUDIT_ARG_FD(uap->sd);
2506 	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
2507 	if (error)
2508 		goto sctp_bad1;
2509 
2510 #ifdef COMPAT_FREEBSD32
2511 	if (SV_CURPROC_FLAG(SV_ILP32))
2512 		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
2513 		    uap->iovlen, &iov, EMSGSIZE);
2514 	else
2515 #endif
2516 		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
2517 	if (error)
2518 		goto sctp_bad1;
2519 #ifdef KTRACE
2520 	if (to && (KTRPOINT(td, KTR_STRUCT)))
2521 		ktrsockaddr(to);
2522 #endif
2523 
2524 	so = (struct socket *)fp->f_data;
2525 #ifdef MAC
2526 	error = mac_socket_check_send(td->td_ucred, so);
2527 	if (error)
2528 		goto sctp_bad;
2529 #endif /* MAC */
2530 
2531 	auio.uio_iov = iov;
2532 	auio.uio_iovcnt = uap->iovlen;
2533 	auio.uio_segflg = UIO_USERSPACE;
2534 	auio.uio_rw = UIO_WRITE;
2535 	auio.uio_td = td;
2536 	auio.uio_offset = 0;			/* XXX */
2537 	auio.uio_resid = 0;
2538 	tiov = iov;
2539 	for (i = 0; i <uap->iovlen; i++, tiov++) {
2540 		if ((auio.uio_resid += tiov->iov_len) < 0) {
2541 			error = EINVAL;
2542 			goto sctp_bad;
2543 		}
2544 	}
2545 	len = auio.uio_resid;
2546 	CURVNET_SET(so->so_vnet);
2547 	error = sctp_lower_sosend(so, to, &auio,
2548 		    (struct mbuf *)NULL, (struct mbuf *)NULL,
2549 		    uap->flags, u_sinfo, td);
2550 	CURVNET_RESTORE();
2551 	if (error) {
2552 		if (auio.uio_resid != len && (error == ERESTART ||
2553 		    error == EINTR || error == EWOULDBLOCK))
2554 			error = 0;
2555 		/* Generation of SIGPIPE can be controlled per socket */
2556 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
2557 		    !(uap->flags & MSG_NOSIGNAL)) {
2558 			PROC_LOCK(td->td_proc);
2559 			tdsignal(td, SIGPIPE);
2560 			PROC_UNLOCK(td->td_proc);
2561 		}
2562 	}
2563 	if (error == 0)
2564 		td->td_retval[0] = len - auio.uio_resid;
2565 #ifdef KTRACE
2566 	if (ktruio != NULL) {
2567 		ktruio->uio_resid = td->td_retval[0];
2568 		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
2569 	}
2570 #endif /* KTRACE */
2571 sctp_bad:
2572 	free(iov, M_IOV);
2573 sctp_bad1:
2574 	if (fp)
2575 		fdrop(fp, td);
2576 sctp_bad2:
2577 	if (to)
2578 		free(to, M_SONAME);
2579 	return (error);
2580 #else  /* SCTP */
2581 	return (EOPNOTSUPP);
2582 #endif /* SCTP */
2583 }
2584 
2585 int
2586 sctp_generic_recvmsg(td, uap)
2587 	struct thread *td;
2588 	struct sctp_generic_recvmsg_args /* {
2589 		int sd,
2590 		struct iovec *iov,
2591 		int iovlen,
2592 		struct sockaddr *from,
2593 		__socklen_t *fromlenaddr,
2594 		struct sctp_sndrcvinfo *sinfo,
2595 		int *msg_flags
2596 	} */ *uap;
2597 {
2598 #if (defined(INET) || defined(INET6)) && defined(SCTP)
2599 	uint8_t sockbufstore[256];
2600 	struct uio auio;
2601 	struct iovec *iov, *tiov;
2602 	struct sctp_sndrcvinfo sinfo;
2603 	struct socket *so;
2604 	struct file *fp = NULL;
2605 	struct sockaddr *fromsa;
2606 	int fromlen;
2607 	int len, i, msg_flags;
2608 	int error = 0;
2609 #ifdef KTRACE
2610 	struct uio *ktruio = NULL;
2611 #endif
2612 
2613 	AUDIT_ARG_FD(uap->sd);
2614 	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
2615 	if (error) {
2616 		return (error);
2617 	}
2618 #ifdef COMPAT_FREEBSD32
2619 	if (SV_CURPROC_FLAG(SV_ILP32))
2620 		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
2621 		    uap->iovlen, &iov, EMSGSIZE);
2622 	else
2623 #endif
2624 		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
2625 	if (error)
2626 		goto out1;
2627 
2628 	so = fp->f_data;
2629 #ifdef MAC
2630 	error = mac_socket_check_receive(td->td_ucred, so);
2631 	if (error) {
2632 		goto out;
2633 	}
2634 #endif /* MAC */
2635 
2636 	if (uap->fromlenaddr) {
2637 		error = copyin(uap->fromlenaddr,
2638 		    &fromlen, sizeof (fromlen));
2639 		if (error) {
2640 			goto out;
2641 		}
2642 	} else {
2643 		fromlen = 0;
2644 	}
2645 	if (uap->msg_flags) {
2646 		error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
2647 		if (error) {
2648 			goto out;
2649 		}
2650 	} else {
2651 		msg_flags = 0;
2652 	}
2653 	auio.uio_iov = iov;
2654 	auio.uio_iovcnt = uap->iovlen;
2655   	auio.uio_segflg = UIO_USERSPACE;
2656 	auio.uio_rw = UIO_READ;
2657 	auio.uio_td = td;
2658 	auio.uio_offset = 0;			/* XXX */
2659 	auio.uio_resid = 0;
2660 	tiov = iov;
2661 	for (i = 0; i <uap->iovlen; i++, tiov++) {
2662 		if ((auio.uio_resid += tiov->iov_len) < 0) {
2663 			error = EINVAL;
2664 			goto out;
2665 		}
2666 	}
2667 	len = auio.uio_resid;
2668 	fromsa = (struct sockaddr *)sockbufstore;
2669 
2670 #ifdef KTRACE
2671 	if (KTRPOINT(td, KTR_GENIO))
2672 		ktruio = cloneuio(&auio);
2673 #endif /* KTRACE */
2674 	memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo));
2675 	CURVNET_SET(so->so_vnet);
2676 	error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
2677 		    fromsa, fromlen, &msg_flags,
2678 		    (struct sctp_sndrcvinfo *)&sinfo, 1);
2679 	CURVNET_RESTORE();
2680 	if (error) {
2681 		if (auio.uio_resid != (int)len && (error == ERESTART ||
2682 		    error == EINTR || error == EWOULDBLOCK))
2683 			error = 0;
2684 	} else {
2685 		if (uap->sinfo)
2686 			error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
2687 	}
2688 #ifdef KTRACE
2689 	if (ktruio != NULL) {
2690 		ktruio->uio_resid = (int)len - auio.uio_resid;
2691 		ktrgenio(uap->sd, UIO_READ, ktruio, error);
2692 	}
2693 #endif /* KTRACE */
2694 	if (error)
2695 		goto out;
2696 	td->td_retval[0] = (int)len - auio.uio_resid;
2697 
2698 	if (fromlen && uap->from) {
2699 		len = fromlen;
2700 		if (len <= 0 || fromsa == 0)
2701 			len = 0;
2702 		else {
2703 			len = MIN(len, fromsa->sa_len);
2704 			error = copyout(fromsa, uap->from, (unsigned)len);
2705 			if (error)
2706 				goto out;
2707 		}
2708 		error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
2709 		if (error) {
2710 			goto out;
2711 		}
2712 	}
2713 #ifdef KTRACE
2714 	if (KTRPOINT(td, KTR_STRUCT))
2715 		ktrsockaddr(fromsa);
2716 #endif
2717 	if (uap->msg_flags) {
2718 		error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
2719 		if (error) {
2720 			goto out;
2721 		}
2722 	}
2723 out:
2724 	free(iov, M_IOV);
2725 out1:
2726 	if (fp)
2727 		fdrop(fp, td);
2728 
2729 	return (error);
2730 #else  /* SCTP */
2731 	return (EOPNOTSUPP);
2732 #endif /* SCTP */
2733 }
2734