xref: /freebsd/sys/kern/uipc_syscalls.c (revision db612abe8df3355d1eb23bb3b50fdd97bc21e979)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * sendfile(2) and related extensions:
6  * Copyright (c) 1998, David Greenman. All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 4. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
33  */
34 
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 
38 #include "opt_sctp.h"
39 #include "opt_compat.h"
40 #include "opt_ktrace.h"
41 #include "opt_mac.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/lock.h>
47 #include <sys/mutex.h>
48 #include <sys/sysproto.h>
49 #include <sys/malloc.h>
50 #include <sys/filedesc.h>
51 #include <sys/event.h>
52 #include <sys/proc.h>
53 #include <sys/fcntl.h>
54 #include <sys/file.h>
55 #include <sys/filio.h>
56 #include <sys/mount.h>
57 #include <sys/mbuf.h>
58 #include <sys/protosw.h>
59 #include <sys/sf_buf.h>
60 #include <sys/socket.h>
61 #include <sys/socketvar.h>
62 #include <sys/signalvar.h>
63 #include <sys/syscallsubr.h>
64 #include <sys/sysctl.h>
65 #include <sys/uio.h>
66 #include <sys/vnode.h>
67 #ifdef KTRACE
68 #include <sys/ktrace.h>
69 #endif
70 
71 #include <security/mac/mac_framework.h>
72 
73 #include <vm/vm.h>
74 #include <vm/vm_object.h>
75 #include <vm/vm_page.h>
76 #include <vm/vm_pageout.h>
77 #include <vm/vm_kern.h>
78 #include <vm/vm_extern.h>
79 
80 #ifdef SCTP
81 #include <netinet/sctp.h>
82 #include <netinet/sctp_peeloff.h>
83 #endif /* SCTP */
84 
/*
 * Prototypes for the helpers that are private to this file.
 */
static int	accept1(struct thread *td, struct accept_args *uap, int compat);
static int	do_sendfile(struct thread *td, struct sendfile_args *uap,
		    int compat);
static int	getpeername1(struct thread *td, struct getpeername_args *uap,
		    int compat);
static int	getsockname1(struct thread *td, struct getsockname_args *uap,
		    int compat);
static int	recvit(struct thread *td, int s, struct msghdr *mp,
		    void *namelenp);
static int	sendit(struct thread *td, int s, struct msghdr *mp, int flags);
94 
95 /*
96  * NSFBUFS-related variables and associated sysctls
97  */
98 int nsfbufs;
99 int nsfbufspeak;
100 int nsfbufsused;
101 
102 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
103     "Maximum number of sendfile(2) sf_bufs available");
104 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
105     "Number of sendfile(2) sf_bufs at peak usage");
106 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
107     "Number of sendfile(2) sf_bufs in use");
108 
109 /*
110  * Convert a user file descriptor to a kernel file entry.  A reference on the
111  * file entry is held upon returning.  This is lighter weight than
112  * fgetsock(), which bumps the socket reference drops the file reference
113  * count instead, as this approach avoids several additional mutex operations
114  * associated with the additional reference count.  If requested, return the
115  * open file flags.
116  */
117 static int
118 getsock(struct filedesc *fdp, int fd, struct file **fpp, u_int *fflagp)
119 {
120 	struct file *fp;
121 	int error;
122 
123 	fp = NULL;
124 	if (fdp == NULL)
125 		error = EBADF;
126 	else {
127 		FILEDESC_SLOCK(fdp);
128 		fp = fget_locked(fdp, fd);
129 		if (fp == NULL)
130 			error = EBADF;
131 		else if (fp->f_type != DTYPE_SOCKET) {
132 			fp = NULL;
133 			error = ENOTSOCK;
134 		} else {
135 			fhold(fp);
136 			if (fflagp != NULL)
137 				*fflagp = fp->f_flag;
138 			error = 0;
139 		}
140 		FILEDESC_SUNLOCK(fdp);
141 	}
142 	*fpp = fp;
143 	return (error);
144 }
145 
146 /*
147  * System call interface to the socket abstraction.
148  */
149 #if defined(COMPAT_43)
150 #define COMPAT_OLDSOCK
151 #endif
152 
153 int
154 socket(td, uap)
155 	struct thread *td;
156 	struct socket_args /* {
157 		int	domain;
158 		int	type;
159 		int	protocol;
160 	} */ *uap;
161 {
162 	struct filedesc *fdp;
163 	struct socket *so;
164 	struct file *fp;
165 	int fd, error;
166 
167 #ifdef MAC
168 	error = mac_socket_check_create(td->td_ucred, uap->domain, uap->type,
169 	    uap->protocol);
170 	if (error)
171 		return (error);
172 #endif
173 	fdp = td->td_proc->p_fd;
174 	error = falloc(td, &fp, &fd);
175 	if (error)
176 		return (error);
177 	/* An extra reference on `fp' has been held for us by falloc(). */
178 	error = socreate(uap->domain, &so, uap->type, uap->protocol,
179 	    td->td_ucred, td);
180 	if (error) {
181 		fdclose(fdp, fp, fd, td);
182 	} else {
183 		finit(fp, FREAD | FWRITE, DTYPE_SOCKET, so, &socketops);
184 		td->td_retval[0] = fd;
185 	}
186 	fdrop(fp, td);
187 	return (error);
188 }
189 
190 /* ARGSUSED */
191 int
192 bind(td, uap)
193 	struct thread *td;
194 	struct bind_args /* {
195 		int	s;
196 		caddr_t	name;
197 		int	namelen;
198 	} */ *uap;
199 {
200 	struct sockaddr *sa;
201 	int error;
202 
203 	if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
204 		return (error);
205 
206 	error = kern_bind(td, uap->s, sa);
207 	free(sa, M_SONAME);
208 	return (error);
209 }
210 
211 int
212 kern_bind(td, fd, sa)
213 	struct thread *td;
214 	int fd;
215 	struct sockaddr *sa;
216 {
217 	struct socket *so;
218 	struct file *fp;
219 	int error;
220 
221 	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
222 	if (error)
223 		return (error);
224 	so = fp->f_data;
225 #ifdef KTRACE
226 	if (KTRPOINT(td, KTR_STRUCT))
227 		ktrsockaddr(sa);
228 #endif
229 #ifdef MAC
230 	SOCK_LOCK(so);
231 	error = mac_socket_check_bind(td->td_ucred, so, sa);
232 	SOCK_UNLOCK(so);
233 	if (error)
234 		goto done;
235 #endif
236 	error = sobind(so, sa, td);
237 #ifdef MAC
238 done:
239 #endif
240 	fdrop(fp, td);
241 	return (error);
242 }
243 
244 /* ARGSUSED */
245 int
246 listen(td, uap)
247 	struct thread *td;
248 	struct listen_args /* {
249 		int	s;
250 		int	backlog;
251 	} */ *uap;
252 {
253 	struct socket *so;
254 	struct file *fp;
255 	int error;
256 
257 	error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL);
258 	if (error == 0) {
259 		so = fp->f_data;
260 #ifdef MAC
261 		SOCK_LOCK(so);
262 		error = mac_socket_check_listen(td->td_ucred, so);
263 		SOCK_UNLOCK(so);
264 		if (error)
265 			goto done;
266 #endif
267 		error = solisten(so, uap->backlog, td);
268 #ifdef MAC
269 done:
270 #endif
271 		fdrop(fp, td);
272 	}
273 	return(error);
274 }
275 
276 /*
277  * accept1()
278  */
279 static int
280 accept1(td, uap, compat)
281 	struct thread *td;
282 	struct accept_args /* {
283 		int	s;
284 		struct sockaddr	* __restrict name;
285 		socklen_t	* __restrict anamelen;
286 	} */ *uap;
287 	int compat;
288 {
289 	struct sockaddr *name;
290 	socklen_t namelen;
291 	struct file *fp;
292 	int error;
293 
294 	if (uap->name == NULL)
295 		return (kern_accept(td, uap->s, NULL, NULL, NULL));
296 
297 	error = copyin(uap->anamelen, &namelen, sizeof (namelen));
298 	if (error)
299 		return (error);
300 
301 	error = kern_accept(td, uap->s, &name, &namelen, &fp);
302 
303 	/*
304 	 * return a namelen of zero for older code which might
305 	 * ignore the return value from accept.
306 	 */
307 	if (error) {
308 		(void) copyout(&namelen,
309 		    uap->anamelen, sizeof(*uap->anamelen));
310 		return (error);
311 	}
312 
313 	if (error == 0 && name != NULL) {
314 #ifdef COMPAT_OLDSOCK
315 		if (compat)
316 			((struct osockaddr *)name)->sa_family =
317 			    name->sa_family;
318 #endif
319 		error = copyout(name, uap->name, namelen);
320 	}
321 	if (error == 0)
322 		error = copyout(&namelen, uap->anamelen,
323 		    sizeof(namelen));
324 	if (error)
325 		fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
326 	fdrop(fp, td);
327 	free(name, M_SONAME);
328 	return (error);
329 }
330 
331 int
332 kern_accept(struct thread *td, int s, struct sockaddr **name,
333     socklen_t *namelen, struct file **fp)
334 {
335 	struct filedesc *fdp;
336 	struct file *headfp, *nfp = NULL;
337 	struct sockaddr *sa = NULL;
338 	int error;
339 	struct socket *head, *so;
340 	int fd;
341 	u_int fflag;
342 	pid_t pgid;
343 	int tmp;
344 
345 	if (name) {
346 		*name = NULL;
347 		if (*namelen < 0)
348 			return (EINVAL);
349 	}
350 
351 	fdp = td->td_proc->p_fd;
352 	error = getsock(fdp, s, &headfp, &fflag);
353 	if (error)
354 		return (error);
355 	head = headfp->f_data;
356 	if ((head->so_options & SO_ACCEPTCONN) == 0) {
357 		error = EINVAL;
358 		goto done;
359 	}
360 #ifdef MAC
361 	SOCK_LOCK(head);
362 	error = mac_socket_check_accept(td->td_ucred, head);
363 	SOCK_UNLOCK(head);
364 	if (error != 0)
365 		goto done;
366 #endif
367 	error = falloc(td, &nfp, &fd);
368 	if (error)
369 		goto done;
370 	ACCEPT_LOCK();
371 	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
372 		ACCEPT_UNLOCK();
373 		error = EWOULDBLOCK;
374 		goto noconnection;
375 	}
376 	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
377 		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
378 			head->so_error = ECONNABORTED;
379 			break;
380 		}
381 		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
382 		    "accept", 0);
383 		if (error) {
384 			ACCEPT_UNLOCK();
385 			goto noconnection;
386 		}
387 	}
388 	if (head->so_error) {
389 		error = head->so_error;
390 		head->so_error = 0;
391 		ACCEPT_UNLOCK();
392 		goto noconnection;
393 	}
394 	so = TAILQ_FIRST(&head->so_comp);
395 	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
396 	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
397 
398 	/*
399 	 * Before changing the flags on the socket, we have to bump the
400 	 * reference count.  Otherwise, if the protocol calls sofree(),
401 	 * the socket will be released due to a zero refcount.
402 	 */
403 	SOCK_LOCK(so);			/* soref() and so_state update */
404 	soref(so);			/* file descriptor reference */
405 
406 	TAILQ_REMOVE(&head->so_comp, so, so_list);
407 	head->so_qlen--;
408 	so->so_state |= (head->so_state & SS_NBIO);
409 	so->so_qstate &= ~SQ_COMP;
410 	so->so_head = NULL;
411 
412 	SOCK_UNLOCK(so);
413 	ACCEPT_UNLOCK();
414 
415 	/* An extra reference on `nfp' has been held for us by falloc(). */
416 	td->td_retval[0] = fd;
417 
418 	/* connection has been removed from the listen queue */
419 	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
420 
421 	pgid = fgetown(&head->so_sigio);
422 	if (pgid != 0)
423 		fsetown(pgid, &so->so_sigio);
424 
425 	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
426 	/* Sync socket nonblocking/async state with file flags */
427 	tmp = fflag & FNONBLOCK;
428 	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
429 	tmp = fflag & FASYNC;
430 	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
431 	sa = 0;
432 	error = soaccept(so, &sa);
433 	if (error) {
434 		/*
435 		 * return a namelen of zero for older code which might
436 		 * ignore the return value from accept.
437 		 */
438 		if (name)
439 			*namelen = 0;
440 		goto noconnection;
441 	}
442 	if (sa == NULL) {
443 		if (name)
444 			*namelen = 0;
445 		goto done;
446 	}
447 	if (name) {
448 		/* check sa_len before it is destroyed */
449 		if (*namelen > sa->sa_len)
450 			*namelen = sa->sa_len;
451 #ifdef KTRACE
452 		if (KTRPOINT(td, KTR_STRUCT))
453 			ktrsockaddr(sa);
454 #endif
455 		*name = sa;
456 		sa = NULL;
457 	}
458 noconnection:
459 	if (sa)
460 		FREE(sa, M_SONAME);
461 
462 	/*
463 	 * close the new descriptor, assuming someone hasn't ripped it
464 	 * out from under us.
465 	 */
466 	if (error)
467 		fdclose(fdp, nfp, fd, td);
468 
469 	/*
470 	 * Release explicitly held references before returning.  We return
471 	 * a reference on nfp to the caller on success if they request it.
472 	 */
473 done:
474 	if (fp != NULL) {
475 		if (error == 0) {
476 			*fp = nfp;
477 			nfp = NULL;
478 		} else
479 			*fp = NULL;
480 	}
481 	if (nfp != NULL)
482 		fdrop(nfp, td);
483 	fdrop(headfp, td);
484 	return (error);
485 }
486 
/*
 * accept(2): accept1() without the old-socket address conversion.
 */
int
accept(struct thread *td, struct accept_args *uap)
{
	const int compat = 0;

	return (accept1(td, uap, compat));
}
495 
496 #ifdef COMPAT_OLDSOCK
/*
 * Old-socket accept: accept1() with the old sockaddr conversion enabled.
 */
int
oaccept(struct thread *td, struct accept_args *uap)
{
	const int compat = 1;

	return (accept1(td, uap, compat));
}
505 #endif /* COMPAT_OLDSOCK */
506 
507 /* ARGSUSED */
508 int
509 connect(td, uap)
510 	struct thread *td;
511 	struct connect_args /* {
512 		int	s;
513 		caddr_t	name;
514 		int	namelen;
515 	} */ *uap;
516 {
517 	struct sockaddr *sa;
518 	int error;
519 
520 	error = getsockaddr(&sa, uap->name, uap->namelen);
521 	if (error)
522 		return (error);
523 
524 	error = kern_connect(td, uap->s, sa);
525 	free(sa, M_SONAME);
526 	return (error);
527 }
528 
529 
530 int
531 kern_connect(td, fd, sa)
532 	struct thread *td;
533 	int fd;
534 	struct sockaddr *sa;
535 {
536 	struct socket *so;
537 	struct file *fp;
538 	int error;
539 	int interrupted = 0;
540 
541 	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
542 	if (error)
543 		return (error);
544 	so = fp->f_data;
545 	if (so->so_state & SS_ISCONNECTING) {
546 		error = EALREADY;
547 		goto done1;
548 	}
549 #ifdef KTRACE
550 	if (KTRPOINT(td, KTR_STRUCT))
551 		ktrsockaddr(sa);
552 #endif
553 #ifdef MAC
554 	SOCK_LOCK(so);
555 	error = mac_socket_check_connect(td->td_ucred, so, sa);
556 	SOCK_UNLOCK(so);
557 	if (error)
558 		goto bad;
559 #endif
560 	error = soconnect(so, sa, td);
561 	if (error)
562 		goto bad;
563 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
564 		error = EINPROGRESS;
565 		goto done1;
566 	}
567 	SOCK_LOCK(so);
568 	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
569 		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
570 		    "connec", 0);
571 		if (error) {
572 			if (error == EINTR || error == ERESTART)
573 				interrupted = 1;
574 			break;
575 		}
576 	}
577 	if (error == 0) {
578 		error = so->so_error;
579 		so->so_error = 0;
580 	}
581 	SOCK_UNLOCK(so);
582 bad:
583 	if (!interrupted)
584 		so->so_state &= ~SS_ISCONNECTING;
585 	if (error == ERESTART)
586 		error = EINTR;
587 done1:
588 	fdrop(fp, td);
589 	return (error);
590 }
591 
592 int
593 socketpair(td, uap)
594 	struct thread *td;
595 	struct socketpair_args /* {
596 		int	domain;
597 		int	type;
598 		int	protocol;
599 		int	*rsv;
600 	} */ *uap;
601 {
602 	struct filedesc *fdp = td->td_proc->p_fd;
603 	struct file *fp1, *fp2;
604 	struct socket *so1, *so2;
605 	int fd, error, sv[2];
606 
607 #ifdef MAC
608 	/* We might want to have a separate check for socket pairs. */
609 	error = mac_socket_check_create(td->td_ucred, uap->domain, uap->type,
610 	    uap->protocol);
611 	if (error)
612 		return (error);
613 #endif
614 
615 	error = socreate(uap->domain, &so1, uap->type, uap->protocol,
616 	    td->td_ucred, td);
617 	if (error)
618 		return (error);
619 	error = socreate(uap->domain, &so2, uap->type, uap->protocol,
620 	    td->td_ucred, td);
621 	if (error)
622 		goto free1;
623 	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
624 	error = falloc(td, &fp1, &fd);
625 	if (error)
626 		goto free2;
627 	sv[0] = fd;
628 	fp1->f_data = so1;	/* so1 already has ref count */
629 	error = falloc(td, &fp2, &fd);
630 	if (error)
631 		goto free3;
632 	fp2->f_data = so2;	/* so2 already has ref count */
633 	sv[1] = fd;
634 	error = soconnect2(so1, so2);
635 	if (error)
636 		goto free4;
637 	if (uap->type == SOCK_DGRAM) {
638 		/*
639 		 * Datagram socket connection is asymmetric.
640 		 */
641 		 error = soconnect2(so2, so1);
642 		 if (error)
643 			goto free4;
644 	}
645 	finit(fp1, FREAD | FWRITE, DTYPE_SOCKET, fp1->f_data, &socketops);
646 	finit(fp2, FREAD | FWRITE, DTYPE_SOCKET, fp2->f_data, &socketops);
647 	so1 = so2 = NULL;
648 	error = copyout(sv, uap->rsv, 2 * sizeof (int));
649 	if (error)
650 		goto free4;
651 	fdrop(fp1, td);
652 	fdrop(fp2, td);
653 	return (0);
654 free4:
655 	fdclose(fdp, fp2, sv[1], td);
656 	fdrop(fp2, td);
657 free3:
658 	fdclose(fdp, fp1, sv[0], td);
659 	fdrop(fp1, td);
660 free2:
661 	if (so2 != NULL)
662 		(void)soclose(so2);
663 free1:
664 	if (so1 != NULL)
665 		(void)soclose(so1);
666 	return (error);
667 }
668 
669 static int
670 sendit(td, s, mp, flags)
671 	struct thread *td;
672 	int s;
673 	struct msghdr *mp;
674 	int flags;
675 {
676 	struct mbuf *control;
677 	struct sockaddr *to;
678 	int error;
679 
680 	if (mp->msg_name != NULL) {
681 		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
682 		if (error) {
683 			to = NULL;
684 			goto bad;
685 		}
686 		mp->msg_name = to;
687 	} else {
688 		to = NULL;
689 	}
690 
691 	if (mp->msg_control) {
692 		if (mp->msg_controllen < sizeof(struct cmsghdr)
693 #ifdef COMPAT_OLDSOCK
694 		    && mp->msg_flags != MSG_COMPAT
695 #endif
696 		) {
697 			error = EINVAL;
698 			goto bad;
699 		}
700 		error = sockargs(&control, mp->msg_control,
701 		    mp->msg_controllen, MT_CONTROL);
702 		if (error)
703 			goto bad;
704 #ifdef COMPAT_OLDSOCK
705 		if (mp->msg_flags == MSG_COMPAT) {
706 			struct cmsghdr *cm;
707 
708 			M_PREPEND(control, sizeof(*cm), M_WAIT);
709 			cm = mtod(control, struct cmsghdr *);
710 			cm->cmsg_len = control->m_len;
711 			cm->cmsg_level = SOL_SOCKET;
712 			cm->cmsg_type = SCM_RIGHTS;
713 		}
714 #endif
715 	} else {
716 		control = NULL;
717 	}
718 
719 	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
720 
721 bad:
722 	if (to)
723 		FREE(to, M_SONAME);
724 	return (error);
725 }
726 
727 int
728 kern_sendit(td, s, mp, flags, control, segflg)
729 	struct thread *td;
730 	int s;
731 	struct msghdr *mp;
732 	int flags;
733 	struct mbuf *control;
734 	enum uio_seg segflg;
735 {
736 	struct file *fp;
737 	struct uio auio;
738 	struct iovec *iov;
739 	struct socket *so;
740 	int i;
741 	int len, error;
742 #ifdef KTRACE
743 	struct uio *ktruio = NULL;
744 #endif
745 
746 	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
747 	if (error)
748 		return (error);
749 	so = (struct socket *)fp->f_data;
750 
751 #ifdef MAC
752 	SOCK_LOCK(so);
753 	error = mac_socket_check_send(td->td_ucred, so);
754 	SOCK_UNLOCK(so);
755 	if (error)
756 		goto bad;
757 #endif
758 
759 	auio.uio_iov = mp->msg_iov;
760 	auio.uio_iovcnt = mp->msg_iovlen;
761 	auio.uio_segflg = segflg;
762 	auio.uio_rw = UIO_WRITE;
763 	auio.uio_td = td;
764 	auio.uio_offset = 0;			/* XXX */
765 	auio.uio_resid = 0;
766 	iov = mp->msg_iov;
767 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
768 		if ((auio.uio_resid += iov->iov_len) < 0) {
769 			error = EINVAL;
770 			goto bad;
771 		}
772 	}
773 #ifdef KTRACE
774 	if (KTRPOINT(td, KTR_GENIO))
775 		ktruio = cloneuio(&auio);
776 #endif
777 	len = auio.uio_resid;
778 	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
779 	if (error) {
780 		if (auio.uio_resid != len && (error == ERESTART ||
781 		    error == EINTR || error == EWOULDBLOCK))
782 			error = 0;
783 		/* Generation of SIGPIPE can be controlled per socket */
784 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
785 		    !(flags & MSG_NOSIGNAL)) {
786 			PROC_LOCK(td->td_proc);
787 			psignal(td->td_proc, SIGPIPE);
788 			PROC_UNLOCK(td->td_proc);
789 		}
790 	}
791 	if (error == 0)
792 		td->td_retval[0] = len - auio.uio_resid;
793 #ifdef KTRACE
794 	if (ktruio != NULL) {
795 		ktruio->uio_resid = td->td_retval[0];
796 		ktrgenio(s, UIO_WRITE, ktruio, error);
797 	}
798 #endif
799 bad:
800 	fdrop(fp, td);
801 	return (error);
802 }
803 
804 int
805 sendto(td, uap)
806 	struct thread *td;
807 	struct sendto_args /* {
808 		int	s;
809 		caddr_t	buf;
810 		size_t	len;
811 		int	flags;
812 		caddr_t	to;
813 		int	tolen;
814 	} */ *uap;
815 {
816 	struct msghdr msg;
817 	struct iovec aiov;
818 	int error;
819 
820 	msg.msg_name = uap->to;
821 	msg.msg_namelen = uap->tolen;
822 	msg.msg_iov = &aiov;
823 	msg.msg_iovlen = 1;
824 	msg.msg_control = 0;
825 #ifdef COMPAT_OLDSOCK
826 	msg.msg_flags = 0;
827 #endif
828 	aiov.iov_base = uap->buf;
829 	aiov.iov_len = uap->len;
830 	error = sendit(td, uap->s, &msg, uap->flags);
831 	return (error);
832 }
833 
834 #ifdef COMPAT_OLDSOCK
835 int
836 osend(td, uap)
837 	struct thread *td;
838 	struct osend_args /* {
839 		int	s;
840 		caddr_t	buf;
841 		int	len;
842 		int	flags;
843 	} */ *uap;
844 {
845 	struct msghdr msg;
846 	struct iovec aiov;
847 	int error;
848 
849 	msg.msg_name = 0;
850 	msg.msg_namelen = 0;
851 	msg.msg_iov = &aiov;
852 	msg.msg_iovlen = 1;
853 	aiov.iov_base = uap->buf;
854 	aiov.iov_len = uap->len;
855 	msg.msg_control = 0;
856 	msg.msg_flags = 0;
857 	error = sendit(td, uap->s, &msg, uap->flags);
858 	return (error);
859 }
860 
861 int
862 osendmsg(td, uap)
863 	struct thread *td;
864 	struct osendmsg_args /* {
865 		int	s;
866 		caddr_t	msg;
867 		int	flags;
868 	} */ *uap;
869 {
870 	struct msghdr msg;
871 	struct iovec *iov;
872 	int error;
873 
874 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
875 	if (error)
876 		return (error);
877 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
878 	if (error)
879 		return (error);
880 	msg.msg_iov = iov;
881 	msg.msg_flags = MSG_COMPAT;
882 	error = sendit(td, uap->s, &msg, uap->flags);
883 	free(iov, M_IOV);
884 	return (error);
885 }
886 #endif
887 
888 int
889 sendmsg(td, uap)
890 	struct thread *td;
891 	struct sendmsg_args /* {
892 		int	s;
893 		caddr_t	msg;
894 		int	flags;
895 	} */ *uap;
896 {
897 	struct msghdr msg;
898 	struct iovec *iov;
899 	int error;
900 
901 	error = copyin(uap->msg, &msg, sizeof (msg));
902 	if (error)
903 		return (error);
904 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
905 	if (error)
906 		return (error);
907 	msg.msg_iov = iov;
908 #ifdef COMPAT_OLDSOCK
909 	msg.msg_flags = 0;
910 #endif
911 	error = sendit(td, uap->s, &msg, uap->flags);
912 	free(iov, M_IOV);
913 	return (error);
914 }
915 
916 int
917 kern_recvit(td, s, mp, fromseg, controlp)
918 	struct thread *td;
919 	int s;
920 	struct msghdr *mp;
921 	enum uio_seg fromseg;
922 	struct mbuf **controlp;
923 {
924 	struct uio auio;
925 	struct iovec *iov;
926 	int i;
927 	socklen_t len;
928 	int error;
929 	struct mbuf *m, *control = 0;
930 	caddr_t ctlbuf;
931 	struct file *fp;
932 	struct socket *so;
933 	struct sockaddr *fromsa = 0;
934 #ifdef KTRACE
935 	struct uio *ktruio = NULL;
936 #endif
937 
938 	if(controlp != NULL)
939 		*controlp = 0;
940 
941 	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
942 	if (error)
943 		return (error);
944 	so = fp->f_data;
945 
946 #ifdef MAC
947 	SOCK_LOCK(so);
948 	error = mac_socket_check_receive(td->td_ucred, so);
949 	SOCK_UNLOCK(so);
950 	if (error) {
951 		fdrop(fp, td);
952 		return (error);
953 	}
954 #endif
955 
956 	auio.uio_iov = mp->msg_iov;
957 	auio.uio_iovcnt = mp->msg_iovlen;
958 	auio.uio_segflg = UIO_USERSPACE;
959 	auio.uio_rw = UIO_READ;
960 	auio.uio_td = td;
961 	auio.uio_offset = 0;			/* XXX */
962 	auio.uio_resid = 0;
963 	iov = mp->msg_iov;
964 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
965 		if ((auio.uio_resid += iov->iov_len) < 0) {
966 			fdrop(fp, td);
967 			return (EINVAL);
968 		}
969 	}
970 #ifdef KTRACE
971 	if (KTRPOINT(td, KTR_GENIO))
972 		ktruio = cloneuio(&auio);
973 #endif
974 	len = auio.uio_resid;
975 	error = soreceive(so, &fromsa, &auio, (struct mbuf **)0,
976 	    (mp->msg_control || controlp) ? &control : (struct mbuf **)0,
977 	    &mp->msg_flags);
978 	if (error) {
979 		if (auio.uio_resid != (int)len && (error == ERESTART ||
980 		    error == EINTR || error == EWOULDBLOCK))
981 			error = 0;
982 	}
983 #ifdef KTRACE
984 	if (ktruio != NULL) {
985 		ktruio->uio_resid = (int)len - auio.uio_resid;
986 		ktrgenio(s, UIO_READ, ktruio, error);
987 	}
988 #endif
989 	if (error)
990 		goto out;
991 	td->td_retval[0] = (int)len - auio.uio_resid;
992 	if (mp->msg_name) {
993 		len = mp->msg_namelen;
994 		if (len <= 0 || fromsa == 0)
995 			len = 0;
996 		else {
997 			/* save sa_len before it is destroyed by MSG_COMPAT */
998 			len = MIN(len, fromsa->sa_len);
999 #ifdef COMPAT_OLDSOCK
1000 			if (mp->msg_flags & MSG_COMPAT)
1001 				((struct osockaddr *)fromsa)->sa_family =
1002 				    fromsa->sa_family;
1003 #endif
1004 			if (fromseg == UIO_USERSPACE) {
1005 				error = copyout(fromsa, mp->msg_name,
1006 				    (unsigned)len);
1007 				if (error)
1008 					goto out;
1009 			} else
1010 				bcopy(fromsa, mp->msg_name, len);
1011 		}
1012 		mp->msg_namelen = len;
1013 	}
1014 	if (mp->msg_control && controlp == NULL) {
1015 #ifdef COMPAT_OLDSOCK
1016 		/*
1017 		 * We assume that old recvmsg calls won't receive access
1018 		 * rights and other control info, esp. as control info
1019 		 * is always optional and those options didn't exist in 4.3.
1020 		 * If we receive rights, trim the cmsghdr; anything else
1021 		 * is tossed.
1022 		 */
1023 		if (control && mp->msg_flags & MSG_COMPAT) {
1024 			if (mtod(control, struct cmsghdr *)->cmsg_level !=
1025 			    SOL_SOCKET ||
1026 			    mtod(control, struct cmsghdr *)->cmsg_type !=
1027 			    SCM_RIGHTS) {
1028 				mp->msg_controllen = 0;
1029 				goto out;
1030 			}
1031 			control->m_len -= sizeof (struct cmsghdr);
1032 			control->m_data += sizeof (struct cmsghdr);
1033 		}
1034 #endif
1035 		len = mp->msg_controllen;
1036 		m = control;
1037 		mp->msg_controllen = 0;
1038 		ctlbuf = mp->msg_control;
1039 
1040 		while (m && len > 0) {
1041 			unsigned int tocopy;
1042 
1043 			if (len >= m->m_len)
1044 				tocopy = m->m_len;
1045 			else {
1046 				mp->msg_flags |= MSG_CTRUNC;
1047 				tocopy = len;
1048 			}
1049 
1050 			if ((error = copyout(mtod(m, caddr_t),
1051 					ctlbuf, tocopy)) != 0)
1052 				goto out;
1053 
1054 			ctlbuf += tocopy;
1055 			len -= tocopy;
1056 			m = m->m_next;
1057 		}
1058 		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
1059 	}
1060 out:
1061 	fdrop(fp, td);
1062 #ifdef KTRACE
1063 	if (fromsa && KTRPOINT(td, KTR_STRUCT))
1064 		ktrsockaddr(fromsa);
1065 #endif
1066 	if (fromsa)
1067 		FREE(fromsa, M_SONAME);
1068 
1069 	if (error == 0 && controlp != NULL)
1070 		*controlp = control;
1071 	else  if (control)
1072 		m_freem(control);
1073 
1074 	return (error);
1075 }
1076 
1077 static int
1078 recvit(td, s, mp, namelenp)
1079 	struct thread *td;
1080 	int s;
1081 	struct msghdr *mp;
1082 	void *namelenp;
1083 {
1084 	int error;
1085 
1086 	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
1087 	if (error)
1088 		return (error);
1089 	if (namelenp) {
1090 		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
1091 #ifdef COMPAT_OLDSOCK
1092 		if (mp->msg_flags & MSG_COMPAT)
1093 			error = 0;	/* old recvfrom didn't check */
1094 #endif
1095 	}
1096 	return (error);
1097 }
1098 
1099 int
1100 recvfrom(td, uap)
1101 	struct thread *td;
1102 	struct recvfrom_args /* {
1103 		int	s;
1104 		caddr_t	buf;
1105 		size_t	len;
1106 		int	flags;
1107 		struct sockaddr * __restrict	from;
1108 		socklen_t * __restrict fromlenaddr;
1109 	} */ *uap;
1110 {
1111 	struct msghdr msg;
1112 	struct iovec aiov;
1113 	int error;
1114 
1115 	if (uap->fromlenaddr) {
1116 		error = copyin(uap->fromlenaddr,
1117 		    &msg.msg_namelen, sizeof (msg.msg_namelen));
1118 		if (error)
1119 			goto done2;
1120 	} else {
1121 		msg.msg_namelen = 0;
1122 	}
1123 	msg.msg_name = uap->from;
1124 	msg.msg_iov = &aiov;
1125 	msg.msg_iovlen = 1;
1126 	aiov.iov_base = uap->buf;
1127 	aiov.iov_len = uap->len;
1128 	msg.msg_control = 0;
1129 	msg.msg_flags = uap->flags;
1130 	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
1131 done2:
1132 	return(error);
1133 }
1134 
1135 #ifdef COMPAT_OLDSOCK
1136 int
1137 orecvfrom(td, uap)
1138 	struct thread *td;
1139 	struct recvfrom_args *uap;
1140 {
1141 
1142 	uap->flags |= MSG_COMPAT;
1143 	return (recvfrom(td, uap));
1144 }
1145 #endif
1146 
1147 #ifdef COMPAT_OLDSOCK
1148 int
1149 orecv(td, uap)
1150 	struct thread *td;
1151 	struct orecv_args /* {
1152 		int	s;
1153 		caddr_t	buf;
1154 		int	len;
1155 		int	flags;
1156 	} */ *uap;
1157 {
1158 	struct msghdr msg;
1159 	struct iovec aiov;
1160 	int error;
1161 
1162 	msg.msg_name = 0;
1163 	msg.msg_namelen = 0;
1164 	msg.msg_iov = &aiov;
1165 	msg.msg_iovlen = 1;
1166 	aiov.iov_base = uap->buf;
1167 	aiov.iov_len = uap->len;
1168 	msg.msg_control = 0;
1169 	msg.msg_flags = uap->flags;
1170 	error = recvit(td, uap->s, &msg, NULL);
1171 	return (error);
1172 }
1173 
1174 /*
1175  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
1176  * overlays the new one, missing only the flags, and with the (old) access
1177  * rights where the control fields are now.
1178  */
1179 int
1180 orecvmsg(td, uap)
1181 	struct thread *td;
1182 	struct orecvmsg_args /* {
1183 		int	s;
1184 		struct	omsghdr *msg;
1185 		int	flags;
1186 	} */ *uap;
1187 {
1188 	struct msghdr msg;
1189 	struct iovec *iov;
1190 	int error;
1191 
1192 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1193 	if (error)
1194 		return (error);
1195 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1196 	if (error)
1197 		return (error);
1198 	msg.msg_flags = uap->flags | MSG_COMPAT;
1199 	msg.msg_iov = iov;
1200 	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
1201 	if (msg.msg_controllen && error == 0)
1202 		error = copyout(&msg.msg_controllen,
1203 		    &uap->msg->msg_accrightslen, sizeof (int));
1204 	free(iov, M_IOV);
1205 	return (error);
1206 }
1207 #endif
1208 
1209 int
1210 recvmsg(td, uap)
1211 	struct thread *td;
1212 	struct recvmsg_args /* {
1213 		int	s;
1214 		struct	msghdr *msg;
1215 		int	flags;
1216 	} */ *uap;
1217 {
1218 	struct msghdr msg;
1219 	struct iovec *uiov, *iov;
1220 	int error;
1221 
1222 	error = copyin(uap->msg, &msg, sizeof (msg));
1223 	if (error)
1224 		return (error);
1225 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1226 	if (error)
1227 		return (error);
1228 	msg.msg_flags = uap->flags;
1229 #ifdef COMPAT_OLDSOCK
1230 	msg.msg_flags &= ~MSG_COMPAT;
1231 #endif
1232 	uiov = msg.msg_iov;
1233 	msg.msg_iov = iov;
1234 	error = recvit(td, uap->s, &msg, NULL);
1235 	if (error == 0) {
1236 		msg.msg_iov = uiov;
1237 		error = copyout(&msg, uap->msg, sizeof(msg));
1238 	}
1239 	free(iov, M_IOV);
1240 	return (error);
1241 }
1242 
1243 /* ARGSUSED */
1244 int
1245 shutdown(td, uap)
1246 	struct thread *td;
1247 	struct shutdown_args /* {
1248 		int	s;
1249 		int	how;
1250 	} */ *uap;
1251 {
1252 	struct socket *so;
1253 	struct file *fp;
1254 	int error;
1255 
1256 	error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL);
1257 	if (error == 0) {
1258 		so = fp->f_data;
1259 		error = soshutdown(so, uap->how);
1260 		fdrop(fp, td);
1261 	}
1262 	return (error);
1263 }
1264 
1265 /* ARGSUSED */
1266 int
1267 setsockopt(td, uap)
1268 	struct thread *td;
1269 	struct setsockopt_args /* {
1270 		int	s;
1271 		int	level;
1272 		int	name;
1273 		caddr_t	val;
1274 		int	valsize;
1275 	} */ *uap;
1276 {
1277 
1278 	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
1279 	    uap->val, UIO_USERSPACE, uap->valsize));
1280 }
1281 
1282 int
1283 kern_setsockopt(td, s, level, name, val, valseg, valsize)
1284 	struct thread *td;
1285 	int s;
1286 	int level;
1287 	int name;
1288 	void *val;
1289 	enum uio_seg valseg;
1290 	socklen_t valsize;
1291 {
1292 	int error;
1293 	struct socket *so;
1294 	struct file *fp;
1295 	struct sockopt sopt;
1296 
1297 	if (val == NULL && valsize != 0)
1298 		return (EFAULT);
1299 	if ((int)valsize < 0)
1300 		return (EINVAL);
1301 
1302 	sopt.sopt_dir = SOPT_SET;
1303 	sopt.sopt_level = level;
1304 	sopt.sopt_name = name;
1305 	sopt.sopt_val = val;
1306 	sopt.sopt_valsize = valsize;
1307 	switch (valseg) {
1308 	case UIO_USERSPACE:
1309 		sopt.sopt_td = td;
1310 		break;
1311 	case UIO_SYSSPACE:
1312 		sopt.sopt_td = NULL;
1313 		break;
1314 	default:
1315 		panic("kern_setsockopt called with bad valseg");
1316 	}
1317 
1318 	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
1319 	if (error == 0) {
1320 		so = fp->f_data;
1321 		error = sosetopt(so, &sopt);
1322 		fdrop(fp, td);
1323 	}
1324 	return(error);
1325 }
1326 
1327 /* ARGSUSED */
1328 int
1329 getsockopt(td, uap)
1330 	struct thread *td;
1331 	struct getsockopt_args /* {
1332 		int	s;
1333 		int	level;
1334 		int	name;
1335 		void * __restrict	val;
1336 		socklen_t * __restrict avalsize;
1337 	} */ *uap;
1338 {
1339 	socklen_t valsize;
1340 	int	error;
1341 
1342 	if (uap->val) {
1343 		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
1344 		if (error)
1345 			return (error);
1346 	}
1347 
1348 	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
1349 	    uap->val, UIO_USERSPACE, &valsize);
1350 
1351 	if (error == 0)
1352 		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
1353 	return (error);
1354 }
1355 
1356 /*
1357  * Kernel version of getsockopt.
1358  * optval can be a userland or userspace. optlen is always a kernel pointer.
1359  */
1360 int
1361 kern_getsockopt(td, s, level, name, val, valseg, valsize)
1362 	struct thread *td;
1363 	int s;
1364 	int level;
1365 	int name;
1366 	void *val;
1367 	enum uio_seg valseg;
1368 	socklen_t *valsize;
1369 {
1370 	int error;
1371 	struct  socket *so;
1372 	struct file *fp;
1373 	struct	sockopt sopt;
1374 
1375 	if (val == NULL)
1376 		*valsize = 0;
1377 	if ((int)*valsize < 0)
1378 		return (EINVAL);
1379 
1380 	sopt.sopt_dir = SOPT_GET;
1381 	sopt.sopt_level = level;
1382 	sopt.sopt_name = name;
1383 	sopt.sopt_val = val;
1384 	sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
1385 	switch (valseg) {
1386 	case UIO_USERSPACE:
1387 		sopt.sopt_td = td;
1388 		break;
1389 	case UIO_SYSSPACE:
1390 		sopt.sopt_td = NULL;
1391 		break;
1392 	default:
1393 		panic("kern_getsockopt called with bad valseg");
1394 	}
1395 
1396 	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
1397 	if (error == 0) {
1398 		so = fp->f_data;
1399 		error = sogetopt(so, &sopt);
1400 		*valsize = sopt.sopt_valsize;
1401 		fdrop(fp, td);
1402 	}
1403 	return (error);
1404 }
1405 
1406 /*
1407  * getsockname1() - Get socket name.
1408  */
1409 /* ARGSUSED */
1410 static int
1411 getsockname1(td, uap, compat)
1412 	struct thread *td;
1413 	struct getsockname_args /* {
1414 		int	fdes;
1415 		struct sockaddr * __restrict asa;
1416 		socklen_t * __restrict alen;
1417 	} */ *uap;
1418 	int compat;
1419 {
1420 	struct sockaddr *sa;
1421 	socklen_t len;
1422 	int error;
1423 
1424 	error = copyin(uap->alen, &len, sizeof(len));
1425 	if (error)
1426 		return (error);
1427 
1428 	error = kern_getsockname(td, uap->fdes, &sa, &len);
1429 	if (error)
1430 		return (error);
1431 
1432 	if (len != 0) {
1433 #ifdef COMPAT_OLDSOCK
1434 		if (compat)
1435 			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1436 #endif
1437 		error = copyout(sa, uap->asa, (u_int)len);
1438 	}
1439 	free(sa, M_SONAME);
1440 	if (error == 0)
1441 		error = copyout(&len, uap->alen, sizeof(len));
1442 	return (error);
1443 }
1444 
1445 int
1446 kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
1447     socklen_t *alen)
1448 {
1449 	struct socket *so;
1450 	struct file *fp;
1451 	socklen_t len;
1452 	int error;
1453 
1454 	if (*alen < 0)
1455 		return (EINVAL);
1456 
1457 	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
1458 	if (error)
1459 		return (error);
1460 	so = fp->f_data;
1461 	*sa = NULL;
1462 	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
1463 	if (error)
1464 		goto bad;
1465 	if (*sa == NULL)
1466 		len = 0;
1467 	else
1468 		len = MIN(*alen, (*sa)->sa_len);
1469 	*alen = len;
1470 #ifdef KTRACE
1471 	if (KTRPOINT(td, KTR_STRUCT))
1472 		ktrsockaddr(*sa);
1473 #endif
1474 bad:
1475 	fdrop(fp, td);
1476 	if (error && *sa) {
1477 		free(*sa, M_SONAME);
1478 		*sa = NULL;
1479 	}
1480 	return (error);
1481 }
1482 
/*
 * getsockname(2): native entry point; runs getsockname1() with the
 * compatibility flag cleared.
 */
int
getsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(td, uap, 0);
	return (error);
}
1491 
1492 #ifdef COMPAT_OLDSOCK
/*
 * Old-socket compatibility entry point; runs getsockname1() with the
 * compatibility flag set.
 */
int
ogetsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(td, uap, 1);
	return (error);
}
1501 #endif /* COMPAT_OLDSOCK */
1502 
1503 /*
1504  * getpeername1() - Get name of peer for connected socket.
1505  */
1506 /* ARGSUSED */
1507 static int
1508 getpeername1(td, uap, compat)
1509 	struct thread *td;
1510 	struct getpeername_args /* {
1511 		int	fdes;
1512 		struct sockaddr * __restrict	asa;
1513 		socklen_t * __restrict	alen;
1514 	} */ *uap;
1515 	int compat;
1516 {
1517 	struct sockaddr *sa;
1518 	socklen_t len;
1519 	int error;
1520 
1521 	error = copyin(uap->alen, &len, sizeof (len));
1522 	if (error)
1523 		return (error);
1524 
1525 	error = kern_getpeername(td, uap->fdes, &sa, &len);
1526 	if (error)
1527 		return (error);
1528 
1529 	if (len != 0) {
1530 #ifdef COMPAT_OLDSOCK
1531 		if (compat)
1532 			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1533 #endif
1534 		error = copyout(sa, uap->asa, (u_int)len);
1535 	}
1536 	free(sa, M_SONAME);
1537 	if (error == 0)
1538 		error = copyout(&len, uap->alen, sizeof(len));
1539 	return (error);
1540 }
1541 
1542 int
1543 kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
1544     socklen_t *alen)
1545 {
1546 	struct socket *so;
1547 	struct file *fp;
1548 	socklen_t len;
1549 	int error;
1550 
1551 	if (*alen < 0)
1552 		return (EINVAL);
1553 
1554 	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
1555 	if (error)
1556 		return (error);
1557 	so = fp->f_data;
1558 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1559 		error = ENOTCONN;
1560 		goto done;
1561 	}
1562 	*sa = NULL;
1563 	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
1564 	if (error)
1565 		goto bad;
1566 	if (*sa == NULL)
1567 		len = 0;
1568 	else
1569 		len = MIN(*alen, (*sa)->sa_len);
1570 	*alen = len;
1571 #ifdef KTRACE
1572 	if (KTRPOINT(td, KTR_STRUCT))
1573 		ktrsockaddr(*sa);
1574 #endif
1575 bad:
1576 	if (error && *sa) {
1577 		free(*sa, M_SONAME);
1578 		*sa = NULL;
1579 	}
1580 done:
1581 	fdrop(fp, td);
1582 	return (error);
1583 }
1584 
/*
 * getpeername(2): native entry point; runs getpeername1() with the
 * compatibility flag cleared.
 */
int
getpeername(td, uap)
	struct thread *td;
	struct getpeername_args *uap;
{
	int error;

	error = getpeername1(td, uap, 0);
	return (error);
}
1593 
1594 #ifdef COMPAT_OLDSOCK
/*
 * Old-socket compatibility entry point; runs getpeername1() with the
 * compatibility flag set.
 */
int
ogetpeername(td, uap)
	struct thread *td;
	struct ogetpeername_args *uap;
{
	struct getpeername_args *gpa;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	gpa = (struct getpeername_args *)uap;
	return (getpeername1(td, gpa, 1));
}
1604 #endif /* COMPAT_OLDSOCK */
1605 
1606 int
1607 sockargs(mp, buf, buflen, type)
1608 	struct mbuf **mp;
1609 	caddr_t buf;
1610 	int buflen, type;
1611 {
1612 	struct sockaddr *sa;
1613 	struct mbuf *m;
1614 	int error;
1615 
1616 	if ((u_int)buflen > MLEN) {
1617 #ifdef COMPAT_OLDSOCK
1618 		if (type == MT_SONAME && (u_int)buflen <= 112)
1619 			buflen = MLEN;		/* unix domain compat. hack */
1620 		else
1621 #endif
1622 			if ((u_int)buflen > MCLBYTES)
1623 				return (EINVAL);
1624 	}
1625 	m = m_get(M_WAIT, type);
1626 	if ((u_int)buflen > MLEN)
1627 		MCLGET(m, M_WAIT);
1628 	m->m_len = buflen;
1629 	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1630 	if (error)
1631 		(void) m_free(m);
1632 	else {
1633 		*mp = m;
1634 		if (type == MT_SONAME) {
1635 			sa = mtod(m, struct sockaddr *);
1636 
1637 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1638 			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1639 				sa->sa_family = sa->sa_len;
1640 #endif
1641 			sa->sa_len = buflen;
1642 		}
1643 	}
1644 	return (error);
1645 }
1646 
1647 int
1648 getsockaddr(namp, uaddr, len)
1649 	struct sockaddr **namp;
1650 	caddr_t uaddr;
1651 	size_t len;
1652 {
1653 	struct sockaddr *sa;
1654 	int error;
1655 
1656 	if (len > SOCK_MAXADDRLEN)
1657 		return (ENAMETOOLONG);
1658 	if (len < offsetof(struct sockaddr, sa_data[0]))
1659 		return (EINVAL);
1660 	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1661 	error = copyin(uaddr, sa, len);
1662 	if (error) {
1663 		FREE(sa, M_SONAME);
1664 	} else {
1665 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1666 		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1667 			sa->sa_family = sa->sa_len;
1668 #endif
1669 		sa->sa_len = len;
1670 		*namp = sa;
1671 	}
1672 	return (error);
1673 }
1674 
1675 #include <sys/condvar.h>
1676 
/*
 * Completion tracking for sendfile(2) requests made with SF_SYNC.
 * kern_sendfile() allocates one of these, bumps 'count' for every sf_buf
 * backed mbuf it queues, and finally sleeps on 'cv' until sf_buf_mext()
 * has dropped the count back to zero.
 */
struct sendfile_sync {
	struct mtx	mtx;	/* held around every access to count */
	struct cv	cv;	/* signalled when count reaches zero */
	unsigned 	count;	/* sf_buf mbufs queued and not yet released */
};
1682 
1683 /*
1684  * Detach mapped page and release resources back to the system.
1685  */
1686 void
1687 sf_buf_mext(void *addr, void *args)
1688 {
1689 	vm_page_t m;
1690 	struct sendfile_sync *sfs;
1691 
1692 	m = sf_buf_page(args);
1693 	sf_buf_free(args);
1694 	vm_page_lock_queues();
1695 	vm_page_unwire(m, 0);
1696 	/*
1697 	 * Check for the object going away on us. This can
1698 	 * happen since we don't hold a reference to it.
1699 	 * If so, we're responsible for freeing the page.
1700 	 */
1701 	if (m->wire_count == 0 && m->object == NULL)
1702 		vm_page_free(m);
1703 	vm_page_unlock_queues();
1704 	if (addr == NULL)
1705 		return;
1706 	sfs = addr;
1707 	mtx_lock(&sfs->mtx);
1708 	KASSERT(sfs->count> 0, ("Sendfile sync botchup count == 0"));
1709 	if (--sfs->count == 0)
1710 		cv_signal(&sfs->cv);
1711 	mtx_unlock(&sfs->mtx);
1712 }
1713 
/*
 * sendfile(2)
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Transmit the file behind 'fd', beginning at 'offset', over the socket
 * 's'.  At most 'nbytes' bytes of the file are sent; nbytes == 0 means
 * "up to EOF".  A header and/or trailer may optionally be added to the
 * socket output.  When 'sbytes' is given, the total number of bytes sent
 * is stored through it.
 *
 * This is the native entry point: it runs do_sendfile() with the compat
 * flag cleared.
 */
int
sendfile(struct thread *td, struct sendfile_args *uap)
{
	int error;

	error = do_sendfile(td, uap, 0);
	return (error);
}
1731 
1732 static int
1733 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
1734 {
1735 	struct sf_hdtr hdtr;
1736 	struct uio *hdr_uio, *trl_uio;
1737 	int error;
1738 
1739 	hdr_uio = trl_uio = NULL;
1740 
1741 	if (uap->hdtr != NULL) {
1742 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1743 		if (error)
1744 			goto out;
1745 		if (hdtr.headers != NULL) {
1746 			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
1747 			if (error)
1748 				goto out;
1749 		}
1750 		if (hdtr.trailers != NULL) {
1751 			error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
1752 			if (error)
1753 				goto out;
1754 
1755 		}
1756 	}
1757 
1758 	error = kern_sendfile(td, uap, hdr_uio, trl_uio, compat);
1759 out:
1760 	if (hdr_uio)
1761 		free(hdr_uio, M_IOV);
1762 	if (trl_uio)
1763 		free(trl_uio, M_IOV);
1764 	return (error);
1765 }
1766 
1767 #ifdef COMPAT_FREEBSD4
1768 int
1769 freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
1770 {
1771 	struct sendfile_args args;
1772 
1773 	args.fd = uap->fd;
1774 	args.s = uap->s;
1775 	args.offset = uap->offset;
1776 	args.nbytes = uap->nbytes;
1777 	args.hdtr = uap->hdtr;
1778 	args.sbytes = uap->sbytes;
1779 	args.flags = uap->flags;
1780 
1781 	return (do_sendfile(td, &args, 1));
1782 }
1783 #endif /* COMPAT_FREEBSD4 */
1784 
1785 int
1786 kern_sendfile(struct thread *td, struct sendfile_args *uap,
1787     struct uio *hdr_uio, struct uio *trl_uio, int compat)
1788 {
1789 	struct file *sock_fp;
1790 	struct vnode *vp;
1791 	struct vm_object *obj = NULL;
1792 	struct socket *so = NULL;
1793 	struct mbuf *m = NULL;
1794 	struct sf_buf *sf;
1795 	struct vm_page *pg;
1796 	off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0;
1797 	int error, hdrlen = 0, mnw = 0;
1798 	int vfslocked;
1799 	struct sendfile_sync *sfs = NULL;
1800 
1801 	/*
1802 	 * The file descriptor must be a regular file and have a
1803 	 * backing VM object.
1804 	 * File offset must be positive.  If it goes beyond EOF
1805 	 * we send only the header/trailer and no payload data.
1806 	 */
1807 	if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
1808 		goto out;
1809 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1810 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1811 	if (vp->v_type == VREG) {
1812 		obj = vp->v_object;
1813 		if (obj != NULL) {
1814 			/*
1815 			 * Temporarily increase the backing VM
1816 			 * object's reference count so that a forced
1817 			 * reclamation of its vnode does not
1818 			 * immediately destroy it.
1819 			 */
1820 			VM_OBJECT_LOCK(obj);
1821 			if ((obj->flags & OBJ_DEAD) == 0) {
1822 				vm_object_reference_locked(obj);
1823 				VM_OBJECT_UNLOCK(obj);
1824 			} else {
1825 				VM_OBJECT_UNLOCK(obj);
1826 				obj = NULL;
1827 			}
1828 		}
1829 	}
1830 	VOP_UNLOCK(vp, 0);
1831 	VFS_UNLOCK_GIANT(vfslocked);
1832 	if (obj == NULL) {
1833 		error = EINVAL;
1834 		goto out;
1835 	}
1836 	if (uap->offset < 0) {
1837 		error = EINVAL;
1838 		goto out;
1839 	}
1840 
1841 	/*
1842 	 * The socket must be a stream socket and connected.
1843 	 * Remember if it a blocking or non-blocking socket.
1844 	 */
1845 	if ((error = getsock(td->td_proc->p_fd, uap->s, &sock_fp,
1846 	    NULL)) != 0)
1847 		goto out;
1848 	so = sock_fp->f_data;
1849 	if (so->so_type != SOCK_STREAM) {
1850 		error = EINVAL;
1851 		goto out;
1852 	}
1853 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1854 		error = ENOTCONN;
1855 		goto out;
1856 	}
1857 	/*
1858 	 * Do not wait on memory allocations but return ENOMEM for
1859 	 * caller to retry later.
1860 	 * XXX: Experimental.
1861 	 */
1862 	if (uap->flags & SF_MNOWAIT)
1863 		mnw = 1;
1864 
1865 	if (uap->flags & SF_SYNC) {
1866 		sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK);
1867 		memset(sfs, 0, sizeof *sfs);
1868 		mtx_init(&sfs->mtx, "sendfile", MTX_DEF, 0);
1869 		cv_init(&sfs->cv, "sendfile");
1870 	}
1871 
1872 #ifdef MAC
1873 	SOCK_LOCK(so);
1874 	error = mac_socket_check_send(td->td_ucred, so);
1875 	SOCK_UNLOCK(so);
1876 	if (error)
1877 		goto out;
1878 #endif
1879 
1880 	/* If headers are specified copy them into mbufs. */
1881 	if (hdr_uio != NULL) {
1882 		hdr_uio->uio_td = td;
1883 		hdr_uio->uio_rw = UIO_WRITE;
1884 		if (hdr_uio->uio_resid > 0) {
1885 			/*
1886 			 * In FBSD < 5.0 the nbytes to send also included
1887 			 * the header.  If compat is specified subtract the
1888 			 * header size from nbytes.
1889 			 */
1890 			if (compat) {
1891 				if (uap->nbytes > hdr_uio->uio_resid)
1892 					uap->nbytes -= hdr_uio->uio_resid;
1893 				else
1894 					uap->nbytes = 0;
1895 			}
1896 			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
1897 			    0, 0, 0);
1898 			if (m == NULL) {
1899 				error = mnw ? EAGAIN : ENOBUFS;
1900 				goto out;
1901 			}
1902 			hdrlen = m_length(m, NULL);
1903 		}
1904 	}
1905 
1906 	/*
1907 	 * Protect against multiple writers to the socket.
1908 	 *
1909 	 * XXXRW: Historically this has assumed non-interruptibility, so now
1910 	 * we implement that, but possibly shouldn't.
1911 	 */
1912 	(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
1913 
1914 	/*
1915 	 * Loop through the pages of the file, starting with the requested
1916 	 * offset. Get a file page (do I/O if necessary), map the file page
1917 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1918 	 * it on the socket.
1919 	 * This is done in two loops.  The inner loop turns as many pages
1920 	 * as it can, up to available socket buffer space, without blocking
1921 	 * into mbufs to have it bulk delivered into the socket send buffer.
1922 	 * The outer loop checks the state and available space of the socket
1923 	 * and takes care of the overall progress.
1924 	 */
1925 	for (off = uap->offset, rem = uap->nbytes; ; ) {
1926 		int loopbytes = 0;
1927 		int space = 0;
1928 		int done = 0;
1929 
1930 		/*
1931 		 * Check the socket state for ongoing connection,
1932 		 * no errors and space in socket buffer.
1933 		 * If space is low allow for the remainder of the
1934 		 * file to be processed if it fits the socket buffer.
1935 		 * Otherwise block in waiting for sufficient space
1936 		 * to proceed, or if the socket is nonblocking, return
1937 		 * to userland with EAGAIN while reporting how far
1938 		 * we've come.
1939 		 * We wait until the socket buffer has significant free
1940 		 * space to do bulk sends.  This makes good use of file
1941 		 * system read ahead and allows packet segmentation
1942 		 * offloading hardware to take over lots of work.  If
1943 		 * we were not careful here we would send off only one
1944 		 * sfbuf at a time.
1945 		 */
1946 		SOCKBUF_LOCK(&so->so_snd);
1947 		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
1948 			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
1949 retry_space:
1950 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
1951 			error = EPIPE;
1952 			SOCKBUF_UNLOCK(&so->so_snd);
1953 			goto done;
1954 		} else if (so->so_error) {
1955 			error = so->so_error;
1956 			so->so_error = 0;
1957 			SOCKBUF_UNLOCK(&so->so_snd);
1958 			goto done;
1959 		}
1960 		space = sbspace(&so->so_snd);
1961 		if (space < rem &&
1962 		    (space <= 0 ||
1963 		     space < so->so_snd.sb_lowat)) {
1964 			if (so->so_state & SS_NBIO) {
1965 				SOCKBUF_UNLOCK(&so->so_snd);
1966 				error = EAGAIN;
1967 				goto done;
1968 			}
1969 			/*
1970 			 * sbwait drops the lock while sleeping.
1971 			 * When we loop back to retry_space the
1972 			 * state may have changed and we retest
1973 			 * for it.
1974 			 */
1975 			error = sbwait(&so->so_snd);
1976 			/*
1977 			 * An error from sbwait usually indicates that we've
1978 			 * been interrupted by a signal. If we've sent anything
1979 			 * then return bytes sent, otherwise return the error.
1980 			 */
1981 			if (error) {
1982 				SOCKBUF_UNLOCK(&so->so_snd);
1983 				goto done;
1984 			}
1985 			goto retry_space;
1986 		}
1987 		SOCKBUF_UNLOCK(&so->so_snd);
1988 
1989 		/*
1990 		 * Reduce space in the socket buffer by the size of
1991 		 * the header mbuf chain.
1992 		 * hdrlen is set to 0 after the first loop.
1993 		 */
1994 		space -= hdrlen;
1995 
1996 		/*
1997 		 * Loop and construct maximum sized mbuf chain to be bulk
1998 		 * dumped into socket buffer.
1999 		 */
2000 		while(space > loopbytes) {
2001 			vm_pindex_t pindex;
2002 			vm_offset_t pgoff;
2003 			struct mbuf *m0;
2004 
2005 			VM_OBJECT_LOCK(obj);
2006 			/*
2007 			 * Calculate the amount to transfer.
2008 			 * Not to exceed a page, the EOF,
2009 			 * or the passed in nbytes.
2010 			 */
2011 			pgoff = (vm_offset_t)(off & PAGE_MASK);
2012 			xfsize = omin(PAGE_SIZE - pgoff,
2013 			    obj->un_pager.vnp.vnp_size - uap->offset -
2014 			    fsbytes - loopbytes);
2015 			if (uap->nbytes)
2016 				rem = (uap->nbytes - fsbytes - loopbytes);
2017 			else
2018 				rem = obj->un_pager.vnp.vnp_size -
2019 				    uap->offset - fsbytes - loopbytes;
2020 			xfsize = omin(rem, xfsize);
2021 			if (xfsize <= 0) {
2022 				VM_OBJECT_UNLOCK(obj);
2023 				done = 1;		/* all data sent */
2024 				break;
2025 			}
2026 			/*
2027 			 * Don't overflow the send buffer.
2028 			 * Stop here and send out what we've
2029 			 * already got.
2030 			 */
2031 			if (space < loopbytes + xfsize) {
2032 				VM_OBJECT_UNLOCK(obj);
2033 				break;
2034 			}
2035 
2036 			/*
2037 			 * Attempt to look up the page.  Allocate
2038 			 * if not found or wait and loop if busy.
2039 			 */
2040 			pindex = OFF_TO_IDX(off);
2041 			pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
2042 			    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_RETRY);
2043 
2044 			/*
2045 			 * Check if page is valid for what we need,
2046 			 * otherwise initiate I/O.
2047 			 * If we already turned some pages into mbufs,
2048 			 * send them off before we come here again and
2049 			 * block.
2050 			 */
2051 			if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
2052 				VM_OBJECT_UNLOCK(obj);
2053 			else if (m != NULL)
2054 				error = EAGAIN;	/* send what we already got */
2055 			else if (uap->flags & SF_NODISKIO)
2056 				error = EBUSY;
2057 			else {
2058 				int bsize, resid;
2059 
2060 				/*
2061 				 * Ensure that our page is still around
2062 				 * when the I/O completes.
2063 				 */
2064 				vm_page_io_start(pg);
2065 				VM_OBJECT_UNLOCK(obj);
2066 
2067 				/*
2068 				 * Get the page from backing store.
2069 				 */
2070 				bsize = vp->v_mount->mnt_stat.f_iosize;
2071 				vfslocked = VFS_LOCK_GIANT(vp->v_mount);
2072 				vn_lock(vp, LK_SHARED | LK_RETRY);
2073 
2074 				/*
2075 				 * XXXMAC: Because we don't have fp->f_cred
2076 				 * here, we pass in NOCRED.  This is probably
2077 				 * wrong, but is consistent with our original
2078 				 * implementation.
2079 				 */
2080 				error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
2081 				    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
2082 				    IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
2083 				    td->td_ucred, NOCRED, &resid, td);
2084 				VOP_UNLOCK(vp, 0);
2085 				VFS_UNLOCK_GIANT(vfslocked);
2086 				VM_OBJECT_LOCK(obj);
2087 				vm_page_io_finish(pg);
2088 				if (!error)
2089 					VM_OBJECT_UNLOCK(obj);
2090 				mbstat.sf_iocnt++;
2091 			}
2092 			if (error) {
2093 				vm_page_lock_queues();
2094 				vm_page_unwire(pg, 0);
2095 				/*
2096 				 * See if anyone else might know about
2097 				 * this page.  If not and it is not valid,
2098 				 * then free it.
2099 				 */
2100 				if (pg->wire_count == 0 && pg->valid == 0 &&
2101 				    pg->busy == 0 && !(pg->oflags & VPO_BUSY) &&
2102 				    pg->hold_count == 0) {
2103 					vm_page_free(pg);
2104 				}
2105 				vm_page_unlock_queues();
2106 				VM_OBJECT_UNLOCK(obj);
2107 				if (error == EAGAIN)
2108 					error = 0;	/* not a real error */
2109 				break;
2110 			}
2111 
2112 			/*
2113 			 * Get a sendfile buf.  We usually wait as long
2114 			 * as necessary, but this wait can be interrupted.
2115 			 */
2116 			if ((sf = sf_buf_alloc(pg,
2117 			    (mnw ? SFB_NOWAIT : SFB_CATCH))) == NULL) {
2118 				mbstat.sf_allocfail++;
2119 				vm_page_lock_queues();
2120 				vm_page_unwire(pg, 0);
2121 				/*
2122 				 * XXX: Not same check as above!?
2123 				 */
2124 				if (pg->wire_count == 0 && pg->object == NULL)
2125 					vm_page_free(pg);
2126 				vm_page_unlock_queues();
2127 				error = (mnw ? EAGAIN : EINTR);
2128 				break;
2129 			}
2130 
2131 			/*
2132 			 * Get an mbuf and set it up as having
2133 			 * external storage.
2134 			 */
2135 			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
2136 			if (m0 == NULL) {
2137 				error = (mnw ? EAGAIN : ENOBUFS);
2138 				sf_buf_mext((void *)sf_buf_kva(sf), sf);
2139 				break;
2140 			}
2141 			MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext,
2142 			    sfs, sf, M_RDONLY, EXT_SFBUF);
2143 			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
2144 			m0->m_len = xfsize;
2145 
2146 			/* Append to mbuf chain. */
2147 			if (m != NULL)
2148 				m_cat(m, m0);
2149 			else
2150 				m = m0;
2151 
2152 			/* Keep track of bits processed. */
2153 			loopbytes += xfsize;
2154 			off += xfsize;
2155 
2156 			if (sfs != NULL) {
2157 				mtx_lock(&sfs->mtx);
2158 				sfs->count++;
2159 				mtx_unlock(&sfs->mtx);
2160 			}
2161 		}
2162 
2163 		/* Add the buffer chain to the socket buffer. */
2164 		if (m != NULL) {
2165 			int mlen, err;
2166 
2167 			mlen = m_length(m, NULL);
2168 			SOCKBUF_LOCK(&so->so_snd);
2169 			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2170 				error = EPIPE;
2171 				SOCKBUF_UNLOCK(&so->so_snd);
2172 				goto done;
2173 			}
2174 			SOCKBUF_UNLOCK(&so->so_snd);
2175 			/* Avoid error aliasing. */
2176 			err = (*so->so_proto->pr_usrreqs->pru_send)
2177 				    (so, 0, m, NULL, NULL, td);
2178 			if (err == 0) {
2179 				/*
2180 				 * We need two counters to get the
2181 				 * file offset and nbytes to send
2182 				 * right:
2183 				 * - sbytes contains the total amount
2184 				 *   of bytes sent, including headers.
2185 				 * - fsbytes contains the total amount
2186 				 *   of bytes sent from the file.
2187 				 */
2188 				sbytes += mlen;
2189 				fsbytes += mlen;
2190 				if (hdrlen) {
2191 					fsbytes -= hdrlen;
2192 					hdrlen = 0;
2193 				}
2194 			} else if (error == 0)
2195 				error = err;
2196 			m = NULL;	/* pru_send always consumes */
2197 		}
2198 
2199 		/* Quit outer loop on error or when we're done. */
2200 		if (done)
2201 			break;
2202 		if (error)
2203 			goto done;
2204 	}
2205 
2206 	/*
2207 	 * Send trailers. Wimp out and use writev(2).
2208 	 */
2209 	if (trl_uio != NULL) {
2210 		error = kern_writev(td, uap->s, trl_uio);
2211 		if (error)
2212 			goto done;
2213 		sbytes += td->td_retval[0];
2214 	}
2215 
2216 done:
2217 	sbunlock(&so->so_snd);
2218 out:
2219 	/*
2220 	 * If there was no error we have to clear td->td_retval[0]
2221 	 * because it may have been set by writev.
2222 	 */
2223 	if (error == 0) {
2224 		td->td_retval[0] = 0;
2225 	}
2226 	if (uap->sbytes != NULL) {
2227 		copyout(&sbytes, uap->sbytes, sizeof(off_t));
2228 	}
2229 	if (obj != NULL)
2230 		vm_object_deallocate(obj);
2231 	if (vp != NULL) {
2232 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
2233 		vrele(vp);
2234 		VFS_UNLOCK_GIANT(vfslocked);
2235 	}
2236 	if (so)
2237 		fdrop(sock_fp, td);
2238 	if (m)
2239 		m_freem(m);
2240 
2241 	if (sfs != NULL) {
2242 		mtx_lock(&sfs->mtx);
2243 		if (sfs->count != 0)
2244 			cv_wait(&sfs->cv, &sfs->mtx);
2245 		KASSERT(sfs->count == 0, ("sendfile sync still busy"));
2246 		cv_destroy(&sfs->cv);
2247 		mtx_destroy(&sfs->mtx);
2248 		free(sfs, M_TEMP);
2249 	}
2250 
2251 	if (error == ERESTART)
2252 		error = EINTR;
2253 
2254 	return (error);
2255 }
2256 
2257 /*
2258  * SCTP syscalls.
2259  * Functionality only compiled in if SCTP is defined in the kernel Makefile,
2260  * otherwise all return EOPNOTSUPP.
2261  * XXX: We should make this loadable one day.
2262  */
2263 int
2264 sctp_peeloff(td, uap)
2265 	struct thread *td;
2266 	struct sctp_peeloff_args /* {
2267 		int	sd;
2268 		caddr_t	name;
2269 	} */ *uap;
2270 {
2271 #ifdef SCTP
2272 	struct filedesc *fdp;
2273 	struct file *nfp = NULL;
2274 	int error;
2275 	struct socket *head, *so;
2276 	int fd;
2277 	u_int fflag;
2278 
2279 	fdp = td->td_proc->p_fd;
2280 	error = fgetsock(td, uap->sd, &head, &fflag);
2281 	if (error)
2282 		goto done2;
2283 	error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
2284 	if (error)
2285 		goto done2;
2286 	/*
2287 	 * At this point we know we do have a assoc to pull
2288 	 * we proceed to get the fd setup. This may block
2289 	 * but that is ok.
2290 	 */
2291 
2292 	error = falloc(td, &nfp, &fd);
2293 	if (error)
2294 		goto done;
2295 	td->td_retval[0] = fd;
2296 
2297 	so = sonewconn(head, SS_ISCONNECTED);
2298 	if (so == NULL)
2299 		goto noconnection;
2300 	/*
2301 	 * Before changing the flags on the socket, we have to bump the
2302 	 * reference count.  Otherwise, if the protocol calls sofree(),
2303 	 * the socket will be released due to a zero refcount.
2304 	 */
2305         SOCK_LOCK(so);
2306         soref(so);                      /* file descriptor reference */
2307         SOCK_UNLOCK(so);
2308 
2309 	ACCEPT_LOCK();
2310 
2311 	TAILQ_REMOVE(&head->so_comp, so, so_list);
2312 	head->so_qlen--;
2313 	so->so_state |= (head->so_state & SS_NBIO);
2314 	so->so_state &= ~SS_NOFDREF;
2315 	so->so_qstate &= ~SQ_COMP;
2316 	so->so_head = NULL;
2317 	ACCEPT_UNLOCK();
2318 	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
2319 	error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
2320 	if (error)
2321 		goto noconnection;
2322 	if (head->so_sigio != NULL)
2323 		fsetown(fgetown(&head->so_sigio), &so->so_sigio);
2324 
2325 noconnection:
2326 	/*
2327 	 * close the new descriptor, assuming someone hasn't ripped it
2328 	 * out from under us.
2329 	 */
2330 	if (error)
2331 		fdclose(fdp, nfp, fd, td);
2332 
2333 	/*
2334 	 * Release explicitly held references before returning.
2335 	 */
2336 done:
2337 	if (nfp != NULL)
2338 		fdrop(nfp, td);
2339 	fputsock(head);
2340 done2:
2341 	return (error);
2342 #else  /* SCTP */
2343 	return (EOPNOTSUPP);
2344 #endif /* SCTP */
2345 }
2346 
2347 int
2348 sctp_generic_sendmsg (td, uap)
2349 	struct thread *td;
2350 	struct sctp_generic_sendmsg_args /* {
2351 		int sd,
2352 		caddr_t msg,
2353 		int mlen,
2354 		caddr_t to,
2355 		__socklen_t tolen,
2356 		struct sctp_sndrcvinfo *sinfo,
2357 		int flags
2358 	} */ *uap;
2359 {
2360 #ifdef SCTP
2361 	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
2362 	struct socket *so;
2363 	struct file *fp = NULL;
2364 	int use_rcvinfo = 1;
2365 	int error = 0, len;
2366 	struct sockaddr *to = NULL;
2367 #ifdef KTRACE
2368 	struct uio *ktruio = NULL;
2369 #endif
2370 	struct uio auio;
2371 	struct iovec iov[1];
2372 
2373 	if (uap->sinfo) {
2374 		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
2375 		if (error)
2376 			return (error);
2377 		u_sinfo = &sinfo;
2378 	}
2379 	if (uap->tolen) {
2380 		error = getsockaddr(&to, uap->to, uap->tolen);
2381 		if (error) {
2382 			to = NULL;
2383 			goto sctp_bad2;
2384 		}
2385 	}
2386 
2387 	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
2388 	if (error)
2389 		goto sctp_bad;
2390 #ifdef KTRACE
2391 	if (KTRPOINT(td, KTR_STRUCT))
2392 		ktrsockaddr(to);
2393 #endif
2394 
2395 	iov[0].iov_base = uap->msg;
2396 	iov[0].iov_len = uap->mlen;
2397 
2398 	so = (struct socket *)fp->f_data;
2399 #ifdef MAC
2400 	SOCK_LOCK(so);
2401 	error = mac_socket_check_send(td->td_ucred, so);
2402 	SOCK_UNLOCK(so);
2403 	if (error)
2404 		goto sctp_bad;
2405 #endif /* MAC */
2406 
2407 	auio.uio_iov =  iov;
2408 	auio.uio_iovcnt = 1;
2409 	auio.uio_segflg = UIO_USERSPACE;
2410 	auio.uio_rw = UIO_WRITE;
2411 	auio.uio_td = td;
2412 	auio.uio_offset = 0;			/* XXX */
2413 	auio.uio_resid = 0;
2414 	len = auio.uio_resid = uap->mlen;
2415 	error = sctp_lower_sosend(so, to, &auio,
2416 		    (struct mbuf *)NULL, (struct mbuf *)NULL,
2417 		    uap->flags, use_rcvinfo, u_sinfo, td);
2418 	if (error) {
2419 		if (auio.uio_resid != len && (error == ERESTART ||
2420 		    error == EINTR || error == EWOULDBLOCK))
2421 			error = 0;
2422 		/* Generation of SIGPIPE can be controlled per socket. */
2423 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
2424 		    !(uap->flags & MSG_NOSIGNAL)) {
2425 			PROC_LOCK(td->td_proc);
2426 			psignal(td->td_proc, SIGPIPE);
2427 			PROC_UNLOCK(td->td_proc);
2428 		}
2429 	}
2430 	if (error == 0)
2431 		td->td_retval[0] = len - auio.uio_resid;
2432 #ifdef KTRACE
2433 	if (ktruio != NULL) {
2434 		ktruio->uio_resid = td->td_retval[0];
2435 		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
2436 	}
2437 #endif /* KTRACE */
2438 sctp_bad:
2439 	if (fp)
2440 		fdrop(fp, td);
2441 sctp_bad2:
2442 	if (to)
2443 		free(to, M_SONAME);
2444 	return (error);
2445 #else  /* SCTP */
2446 	return (EOPNOTSUPP);
2447 #endif /* SCTP */
2448 }
2449 
2450 int
2451 sctp_generic_sendmsg_iov(td, uap)
2452 	struct thread *td;
2453 	struct sctp_generic_sendmsg_iov_args /* {
2454 		int sd,
2455 		struct iovec *iov,
2456 		int iovlen,
2457 		caddr_t to,
2458 		__socklen_t tolen,
2459 		struct sctp_sndrcvinfo *sinfo,
2460 		int flags
2461 	} */ *uap;
2462 {
2463 #ifdef SCTP
2464 	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
2465 	struct socket *so;
2466 	struct file *fp = NULL;
2467 	int use_rcvinfo = 1;
2468 	int error=0, len, i;
2469 	struct sockaddr *to = NULL;
2470 #ifdef KTRACE
2471 	struct uio *ktruio = NULL;
2472 #endif
2473 	struct uio auio;
2474 	struct iovec *iov, *tiov;
2475 
2476 	if (uap->sinfo) {
2477 		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
2478 		if (error)
2479 			return (error);
2480 		u_sinfo = &sinfo;
2481 	}
2482 	if (uap->tolen) {
2483 		error = getsockaddr(&to, uap->to, uap->tolen);
2484 		if (error) {
2485 			to = NULL;
2486 			goto sctp_bad2;
2487 		}
2488 	}
2489 
2490 	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
2491 	if (error)
2492 		goto sctp_bad1;
2493 
2494 	error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
2495 	if (error)
2496 		goto sctp_bad1;
2497 #ifdef KTRACE
2498 	if (KTRPOINT(td, KTR_STRUCT))
2499 		ktrsockaddr(to);
2500 #endif
2501 
2502 	so = (struct socket *)fp->f_data;
2503 #ifdef MAC
2504 	SOCK_LOCK(so);
2505 	error = mac_socket_check_send(td->td_ucred, so);
2506 	SOCK_UNLOCK(so);
2507 	if (error)
2508 		goto sctp_bad;
2509 #endif /* MAC */
2510 
2511 	auio.uio_iov =  iov;
2512 	auio.uio_iovcnt = uap->iovlen;
2513 	auio.uio_segflg = UIO_USERSPACE;
2514 	auio.uio_rw = UIO_WRITE;
2515 	auio.uio_td = td;
2516 	auio.uio_offset = 0;			/* XXX */
2517 	auio.uio_resid = 0;
2518 	tiov = iov;
2519 	for (i = 0; i <uap->iovlen; i++, tiov++) {
2520 		if ((auio.uio_resid += tiov->iov_len) < 0) {
2521 			error = EINVAL;
2522 			goto sctp_bad;
2523 		}
2524 	}
2525 	len = auio.uio_resid;
2526 	error = sctp_lower_sosend(so, to, &auio,
2527 		    (struct mbuf *)NULL, (struct mbuf *)NULL,
2528 		    uap->flags, use_rcvinfo, u_sinfo, td);
2529 	if (error) {
2530 		if (auio.uio_resid != len && (error == ERESTART ||
2531 		    error == EINTR || error == EWOULDBLOCK))
2532 			error = 0;
2533 		/* Generation of SIGPIPE can be controlled per socket */
2534 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
2535 		    !(uap->flags & MSG_NOSIGNAL)) {
2536 			PROC_LOCK(td->td_proc);
2537 			psignal(td->td_proc, SIGPIPE);
2538 			PROC_UNLOCK(td->td_proc);
2539 		}
2540 	}
2541 	if (error == 0)
2542 		td->td_retval[0] = len - auio.uio_resid;
2543 #ifdef KTRACE
2544 	if (ktruio != NULL) {
2545 		ktruio->uio_resid = td->td_retval[0];
2546 		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
2547 	}
2548 #endif /* KTRACE */
2549 sctp_bad:
2550 	free(iov, M_IOV);
2551 sctp_bad1:
2552 	if (fp)
2553 		fdrop(fp, td);
2554 sctp_bad2:
2555 	if (to)
2556 		free(to, M_SONAME);
2557 	return (error);
2558 #else  /* SCTP */
2559 	return (EOPNOTSUPP);
2560 #endif /* SCTP */
2561 }
2562 
2563 int
2564 sctp_generic_recvmsg(td, uap)
2565 	struct thread *td;
2566 	struct sctp_generic_recvmsg_args /* {
2567 		int sd,
2568 		struct iovec *iov,
2569 		int iovlen,
2570 		struct sockaddr *from,
2571 		__socklen_t *fromlenaddr,
2572 		struct sctp_sndrcvinfo *sinfo,
2573 		int *msg_flags
2574 	} */ *uap;
2575 {
2576 #ifdef SCTP
2577 	u_int8_t sockbufstore[256];
2578 	struct uio auio;
2579 	struct iovec *iov, *tiov;
2580 	struct sctp_sndrcvinfo sinfo;
2581 	struct socket *so;
2582 	struct file *fp = NULL;
2583 	struct sockaddr *fromsa;
2584 	int fromlen;
2585 	int len, i, msg_flags;
2586 	int error = 0;
2587 #ifdef KTRACE
2588 	struct uio *ktruio = NULL;
2589 #endif
2590 	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
2591 	if (error) {
2592 		return (error);
2593 	}
2594 	error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
2595 	if (error) {
2596 		goto out1;
2597 	}
2598 
2599 	so = fp->f_data;
2600 #ifdef MAC
2601 	SOCK_LOCK(so);
2602 	error = mac_socket_check_receive(td->td_ucred, so);
2603 	SOCK_UNLOCK(so);
2604 	if (error) {
2605 		goto out;
2606 		return (error);
2607 	}
2608 #endif /* MAC */
2609 
2610 	if (uap->fromlenaddr) {
2611 		error = copyin(uap->fromlenaddr,
2612 		    &fromlen, sizeof (fromlen));
2613 		if (error) {
2614 			goto out;
2615 		}
2616 	} else {
2617 		fromlen = 0;
2618 	}
2619 	if(uap->msg_flags) {
2620 		error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
2621 		if (error) {
2622 			goto out;
2623 		}
2624 	} else {
2625 		msg_flags = 0;
2626 	}
2627 	auio.uio_iov = iov;
2628 	auio.uio_iovcnt = uap->iovlen;
2629   	auio.uio_segflg = UIO_USERSPACE;
2630 	auio.uio_rw = UIO_READ;
2631 	auio.uio_td = td;
2632 	auio.uio_offset = 0;			/* XXX */
2633 	auio.uio_resid = 0;
2634 	tiov = iov;
2635 	for (i = 0; i <uap->iovlen; i++, tiov++) {
2636 		if ((auio.uio_resid += tiov->iov_len) < 0) {
2637 			error = EINVAL;
2638 			goto out;
2639 		}
2640 	}
2641 	len = auio.uio_resid;
2642 	fromsa = (struct sockaddr *)sockbufstore;
2643 
2644 #ifdef KTRACE
2645 	if (KTRPOINT(td, KTR_GENIO))
2646 		ktruio = cloneuio(&auio);
2647 #endif /* KTRACE */
2648 	error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
2649 		    fromsa, fromlen, &msg_flags,
2650 		    (struct sctp_sndrcvinfo *)&sinfo, 1);
2651 	if (error) {
2652 		if (auio.uio_resid != (int)len && (error == ERESTART ||
2653 		    error == EINTR || error == EWOULDBLOCK))
2654 			error = 0;
2655 	} else {
2656 		if (uap->sinfo)
2657 			error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
2658 	}
2659 #ifdef KTRACE
2660 	if (ktruio != NULL) {
2661 		ktruio->uio_resid = (int)len - auio.uio_resid;
2662 		ktrgenio(uap->sd, UIO_READ, ktruio, error);
2663 	}
2664 #endif /* KTRACE */
2665 	if (error)
2666 		goto out;
2667 	td->td_retval[0] = (int)len - auio.uio_resid;
2668 
2669 	if (fromlen && uap->from) {
2670 		len = fromlen;
2671 		if (len <= 0 || fromsa == 0)
2672 			len = 0;
2673 		else {
2674 			len = MIN(len, fromsa->sa_len);
2675 			error = copyout(fromsa, uap->from, (unsigned)len);
2676 			if (error)
2677 				goto out;
2678 		}
2679 		error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
2680 		if (error) {
2681 			goto out;
2682 		}
2683 	}
2684 #ifdef KTRACE
2685 	if (KTRPOINT(td, KTR_STRUCT))
2686 		ktrsockaddr(fromsa);
2687 #endif
2688 	if (uap->msg_flags) {
2689 		error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
2690 		if (error) {
2691 			goto out;
2692 		}
2693 	}
2694 out:
2695 	free(iov, M_IOV);
2696 out1:
2697 	if (fp)
2698 		fdrop(fp, td);
2699 
2700 	return (error);
2701 #else  /* SCTP */
2702 	return (EOPNOTSUPP);
2703 #endif /* SCTP */
2704 }
2705