xref: /freebsd/sys/kern/uipc_syscalls.c (revision f856af0466c076beef4ea9b15d088e1119a945b8)
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * sendfile(2) and related extensions:
6  * Copyright (c) 1998, David Greenman. All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 4. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
33  */
34 
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 
38 #include "opt_sctp.h"
39 #include "opt_compat.h"
40 #include "opt_ktrace.h"
41 #include "opt_mac.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/lock.h>
47 #include <sys/mutex.h>
48 #include <sys/sysproto.h>
49 #include <sys/malloc.h>
50 #include <sys/filedesc.h>
51 #include <sys/event.h>
52 #include <sys/proc.h>
53 #include <sys/fcntl.h>
54 #include <sys/file.h>
55 #include <sys/filio.h>
56 #include <sys/mount.h>
57 #include <sys/mbuf.h>
58 #include <sys/protosw.h>
59 #include <sys/sf_buf.h>
60 #include <sys/socket.h>
61 #include <sys/socketvar.h>
62 #include <sys/signalvar.h>
63 #include <sys/syscallsubr.h>
64 #include <sys/sysctl.h>
65 #include <sys/uio.h>
66 #include <sys/vnode.h>
67 #ifdef KTRACE
68 #include <sys/ktrace.h>
69 #endif
70 
71 #include <security/mac/mac_framework.h>
72 
73 #include <vm/vm.h>
74 #include <vm/vm_object.h>
75 #include <vm/vm_page.h>
76 #include <vm/vm_pageout.h>
77 #include <vm/vm_kern.h>
78 #include <vm/vm_extern.h>
79 
80 #ifdef SCTP
81 #include <netinet/sctp.h>
82 #include <netinet/sctp_peeloff.h>
83 #endif /* SCTP */
84 
85 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
86 static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
87 
88 static int accept1(struct thread *td, struct accept_args *uap, int compat);
89 static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat);
90 static int getsockname1(struct thread *td, struct getsockname_args *uap,
91 			int compat);
92 static int getpeername1(struct thread *td, struct getpeername_args *uap,
93 			int compat);
94 
95 /*
96  * NSFBUFS-related variables and associated sysctls
97  */
98 int nsfbufs;
99 int nsfbufspeak;
100 int nsfbufsused;
101 
102 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
103     "Maximum number of sendfile(2) sf_bufs available");
104 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
105     "Number of sendfile(2) sf_bufs at peak usage");
106 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
107     "Number of sendfile(2) sf_bufs in use");
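
/*
 * Usage note (illustrative only; the values below are examples, not
 * defaults from this file): nsfbufs is a read-only tunable
 * (CTLFLAG_RDTUN), so it can only be set at boot, e.g. in
 * /boot/loader.conf:
 *
 *	kern.ipc.nsfbufs="6656"
 *
 * The peak and in-use counters can be inspected at run time with
 * sysctl(8), e.g. "sysctl kern.ipc.nsfbufspeak kern.ipc.nsfbufsused".
 */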
108 
109 /*
110  * Convert a user file descriptor to a kernel file entry.  A reference on the
111  * file entry is held upon returning.  This is lighter weight than
112  * fgetsock(), which bumps the socket reference count and drops the file
113  * reference count instead; this approach avoids several additional mutex
114  * operations associated with the extra reference count.  If requested, return the
115  * open file flags.
116  */
117 static int
118 getsock(struct filedesc *fdp, int fd, struct file **fpp, u_int *fflagp)
119 {
120 	struct file *fp;
121 	int error;
122 
123 	fp = NULL;
124 	if (fdp == NULL)
125 		error = EBADF;
126 	else {
127 		FILEDESC_LOCK_FAST(fdp);
128 		fp = fget_locked(fdp, fd);
129 		if (fp == NULL)
130 			error = EBADF;
131 		else if (fp->f_type != DTYPE_SOCKET) {
132 			fp = NULL;
133 			error = ENOTSOCK;
134 		} else {
135 			fhold(fp);
136 			if (fflagp != NULL)
137 				*fflagp = fp->f_flag;
138 			error = 0;
139 		}
140 		FILEDESC_UNLOCK_FAST(fdp);
141 	}
142 	*fpp = fp;
143 	return (error);
144 }
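
/*
 * A minimal sketch of the calling convention used throughout this file
 * (not a new consumer; the variable names match the surrounding code):
 * pass the process's filedesc and the descriptor, check the error, use
 * fp->f_data as the socket, and fdrop() the file when done:
 *
 *	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
 *	if (error)
 *		return (error);
 *	so = fp->f_data;
 *	... operate on so ...
 *	fdrop(fp, td);
 */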
145 
146 /*
147  * System call interface to the socket abstraction.
148  */
149 #if defined(COMPAT_43)
150 #define COMPAT_OLDSOCK
151 #endif
152 
153 /*
154  * MPSAFE
155  */
156 int
157 socket(td, uap)
158 	struct thread *td;
159 	register struct socket_args /* {
160 		int	domain;
161 		int	type;
162 		int	protocol;
163 	} */ *uap;
164 {
165 	struct filedesc *fdp;
166 	struct socket *so;
167 	struct file *fp;
168 	int fd, error;
169 
170 #ifdef MAC
171 	error = mac_check_socket_create(td->td_ucred, uap->domain, uap->type,
172 	    uap->protocol);
173 	if (error)
174 		return (error);
175 #endif
176 	fdp = td->td_proc->p_fd;
177 	error = falloc(td, &fp, &fd);
178 	if (error)
179 		return (error);
180 	/* An extra reference on `fp' has been held for us by falloc(). */
181 	NET_LOCK_GIANT();
182 	error = socreate(uap->domain, &so, uap->type, uap->protocol,
183 	    td->td_ucred, td);
184 	NET_UNLOCK_GIANT();
185 	if (error) {
186 		fdclose(fdp, fp, fd, td);
187 	} else {
188 		FILEDESC_LOCK_FAST(fdp);
189 		fp->f_data = so;	/* already has ref count */
190 		fp->f_flag = FREAD|FWRITE;
191 		fp->f_ops = &socketops;
192 		fp->f_type = DTYPE_SOCKET;
193 		FILEDESC_UNLOCK_FAST(fdp);
194 		td->td_retval[0] = fd;
195 	}
196 	fdrop(fp, td);
197 	return (error);
198 }
199 
200 /*
201  * MPSAFE
202  */
203 /* ARGSUSED */
204 int
205 bind(td, uap)
206 	struct thread *td;
207 	register struct bind_args /* {
208 		int	s;
209 		caddr_t	name;
210 		int	namelen;
211 	} */ *uap;
212 {
213 	struct sockaddr *sa;
214 	int error;
215 
216 	if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
217 		return (error);
218 
219 	error = kern_bind(td, uap->s, sa);
220 	free(sa, M_SONAME);
221 	return (error);
222 }
223 
224 int
225 kern_bind(td, fd, sa)
226 	struct thread *td;
227 	int fd;
228 	struct sockaddr *sa;
229 {
230 	struct socket *so;
231 	struct file *fp;
232 	int error;
233 
234 	NET_LOCK_GIANT();
235 	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
236 	if (error)
237 		goto done2;
238 	so = fp->f_data;
239 #ifdef MAC
240 	SOCK_LOCK(so);
241 	error = mac_check_socket_bind(td->td_ucred, so, sa);
242 	SOCK_UNLOCK(so);
243 	if (error)
244 		goto done1;
245 #endif
246 	error = sobind(so, sa, td);
247 #ifdef MAC
248 done1:
249 #endif
250 	fdrop(fp, td);
251 done2:
252 	NET_UNLOCK_GIANT();
253 	return (error);
254 }
255 
256 /*
257  * MPSAFE
258  */
259 /* ARGSUSED */
260 int
261 listen(td, uap)
262 	struct thread *td;
263 	register struct listen_args /* {
264 		int	s;
265 		int	backlog;
266 	} */ *uap;
267 {
268 	struct socket *so;
269 	struct file *fp;
270 	int error;
271 
272 	NET_LOCK_GIANT();
273 	error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL);
274 	if (error == 0) {
275 		so = fp->f_data;
276 #ifdef MAC
277 		SOCK_LOCK(so);
278 		error = mac_check_socket_listen(td->td_ucred, so);
279 		SOCK_UNLOCK(so);
280 		if (error)
281 			goto done;
282 #endif
283 		error = solisten(so, uap->backlog, td);
284 #ifdef MAC
285 done:
286 #endif
287 		fdrop(fp, td);
288 	}
289 	NET_UNLOCK_GIANT();
290 	return(error);
291 }
292 
293 /*
294  * accept1()
295  * MPSAFE
296  */
297 static int
298 accept1(td, uap, compat)
299 	struct thread *td;
300 	register struct accept_args /* {
301 		int	s;
302 		struct sockaddr	* __restrict name;
303 		socklen_t	* __restrict anamelen;
304 	} */ *uap;
305 	int compat;
306 {
307 	struct sockaddr *name;
308 	socklen_t namelen;
309 	struct file *fp;
310 	int error;
311 
312 	if (uap->name == NULL)
313 		return (kern_accept(td, uap->s, NULL, NULL, NULL));
314 
315 	error = copyin(uap->anamelen, &namelen, sizeof (namelen));
316 	if (error)
317 		return (error);
318 
319 	error = kern_accept(td, uap->s, &name, &namelen, &fp);
320 
321 	/*
322 	 * return a namelen of zero for older code which might
323 	 * ignore the return value from accept.
324 	 */
325 	if (error) {
326 		(void) copyout(&namelen,
327 		    uap->anamelen, sizeof(*uap->anamelen));
328 		return (error);
329 	}
330 
331 	if (error == 0 && name != NULL) {
332 #ifdef COMPAT_OLDSOCK
333 		if (compat)
334 			((struct osockaddr *)name)->sa_family =
335 			    name->sa_family;
336 #endif
337 		error = copyout(name, uap->name, namelen);
338 	}
339 	if (error == 0)
340 		error = copyout(&namelen, uap->anamelen,
341 		    sizeof(namelen));
342 	if (error)
343 		fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
344 	fdrop(fp, td);
345 	free(name, M_SONAME);
346 	return (error);
347 }
348 
349 int
350 kern_accept(struct thread *td, int s, struct sockaddr **name,
351     socklen_t *namelen, struct file **fp)
352 {
353 	struct filedesc *fdp;
354 	struct file *headfp, *nfp = NULL;
355 	struct sockaddr *sa = NULL;
356 	int error;
357 	struct socket *head, *so;
358 	int fd;
359 	u_int fflag;
360 	pid_t pgid;
361 	int tmp;
362 
363 	if (name) {
364 		*name = NULL;
365 		if (*namelen < 0)
366 			return (EINVAL);
367 	}
368 
369 	fdp = td->td_proc->p_fd;
370 	NET_LOCK_GIANT();
371 	error = getsock(fdp, s, &headfp, &fflag);
372 	if (error)
373 		goto done2;
374 	head = headfp->f_data;
375 	if ((head->so_options & SO_ACCEPTCONN) == 0) {
376 		error = EINVAL;
377 		goto done;
378 	}
379 #ifdef MAC
380 	SOCK_LOCK(head);
381 	error = mac_check_socket_accept(td->td_ucred, head);
382 	SOCK_UNLOCK(head);
383 	if (error != 0)
384 		goto done;
385 #endif
386 	error = falloc(td, &nfp, &fd);
387 	if (error)
388 		goto done;
389 	ACCEPT_LOCK();
390 	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
391 		ACCEPT_UNLOCK();
392 		error = EWOULDBLOCK;
393 		goto noconnection;
394 	}
395 	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
396 		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
397 			head->so_error = ECONNABORTED;
398 			break;
399 		}
400 		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
401 		    "accept", 0);
402 		if (error) {
403 			ACCEPT_UNLOCK();
404 			goto noconnection;
405 		}
406 	}
407 	if (head->so_error) {
408 		error = head->so_error;
409 		head->so_error = 0;
410 		ACCEPT_UNLOCK();
411 		goto noconnection;
412 	}
413 	so = TAILQ_FIRST(&head->so_comp);
414 	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
415 	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
416 
417 	/*
418 	 * Before changing the flags on the socket, we have to bump the
419 	 * reference count.  Otherwise, if the protocol calls sofree(),
420 	 * the socket will be released due to a zero refcount.
421 	 */
422 	SOCK_LOCK(so);			/* soref() and so_state update */
423 	soref(so);			/* file descriptor reference */
424 
425 	TAILQ_REMOVE(&head->so_comp, so, so_list);
426 	head->so_qlen--;
427 	so->so_state |= (head->so_state & SS_NBIO);
428 	so->so_qstate &= ~SQ_COMP;
429 	so->so_head = NULL;
430 
431 	SOCK_UNLOCK(so);
432 	ACCEPT_UNLOCK();
433 
434 	/* An extra reference on `nfp' has been held for us by falloc(). */
435 	td->td_retval[0] = fd;
436 
437 	/* connection has been removed from the listen queue */
438 	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
439 
440 	pgid = fgetown(&head->so_sigio);
441 	if (pgid != 0)
442 		fsetown(pgid, &so->so_sigio);
443 
444 	FILE_LOCK(nfp);
445 	nfp->f_data = so;	/* nfp has ref count from falloc */
446 	nfp->f_flag = fflag;
447 	nfp->f_ops = &socketops;
448 	nfp->f_type = DTYPE_SOCKET;
449 	FILE_UNLOCK(nfp);
450 	/* Sync socket nonblocking/async state with file flags */
451 	tmp = fflag & FNONBLOCK;
452 	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
453 	tmp = fflag & FASYNC;
454 	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
455 	sa = 0;
456 	error = soaccept(so, &sa);
457 	if (error) {
458 		/*
459 		 * return a namelen of zero for older code which might
460 		 * ignore the return value from accept.
461 		 */
462 		if (name)
463 			*namelen = 0;
464 		goto noconnection;
465 	}
466 	if (sa == NULL) {
467 		if (name)
468 			*namelen = 0;
469 		goto done;
470 	}
471 	if (name) {
472 		/* check sa_len before it is destroyed */
473 		if (*namelen > sa->sa_len)
474 			*namelen = sa->sa_len;
475 		*name = sa;
476 		sa = NULL;
477 	}
478 noconnection:
479 	if (sa)
480 		FREE(sa, M_SONAME);
481 
482 	/*
483 	 * close the new descriptor, assuming someone hasn't ripped it
484 	 * out from under us.
485 	 */
486 	if (error)
487 		fdclose(fdp, nfp, fd, td);
488 
489 	/*
490 	 * Release explicitly held references before returning.  We return
491 	 * a reference on nfp to the caller on success if they request it.
492 	 */
493 done:
494 	if (fp != NULL) {
495 		if (error == 0) {
496 			*fp = nfp;
497 			nfp = NULL;
498 		} else
499 			*fp = NULL;
500 	}
501 	if (nfp != NULL)
502 		fdrop(nfp, td);
503 	fdrop(headfp, td);
504 done2:
505 	NET_UNLOCK_GIANT();
506 	return (error);
507 }
508 
509 /*
510  * MPSAFE (accept1() is MPSAFE)
511  */
512 int
513 accept(td, uap)
514 	struct thread *td;
515 	struct accept_args *uap;
516 {
517 
518 	return (accept1(td, uap, 0));
519 }
520 
521 #ifdef COMPAT_OLDSOCK
522 /*
523  * MPSAFE (accept1() is MPSAFE)
524  */
525 int
526 oaccept(td, uap)
527 	struct thread *td;
528 	struct accept_args *uap;
529 {
530 
531 	return (accept1(td, uap, 1));
532 }
533 #endif /* COMPAT_OLDSOCK */
534 
535 /*
536  * MPSAFE
537  */
538 /* ARGSUSED */
539 int
540 connect(td, uap)
541 	struct thread *td;
542 	register struct connect_args /* {
543 		int	s;
544 		caddr_t	name;
545 		int	namelen;
546 	} */ *uap;
547 {
548 	struct sockaddr *sa;
549 	int error;
550 
551 	error = getsockaddr(&sa, uap->name, uap->namelen);
552 	if (error)
553 		return (error);
554 
555 	error = kern_connect(td, uap->s, sa);
556 	free(sa, M_SONAME);
557 	return (error);
558 }
559 
560 
561 int
562 kern_connect(td, fd, sa)
563 	struct thread *td;
564 	int fd;
565 	struct sockaddr *sa;
566 {
567 	struct socket *so;
568 	struct file *fp;
569 	int error;
570 	int interrupted = 0;
571 
572 	NET_LOCK_GIANT();
573 	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
574 	if (error)
575 		goto done2;
576 	so = fp->f_data;
577 	if (so->so_state & SS_ISCONNECTING) {
578 		error = EALREADY;
579 		goto done1;
580 	}
581 #ifdef MAC
582 	SOCK_LOCK(so);
583 	error = mac_check_socket_connect(td->td_ucred, so, sa);
584 	SOCK_UNLOCK(so);
585 	if (error)
586 		goto bad;
587 #endif
588 	error = soconnect(so, sa, td);
589 	if (error)
590 		goto bad;
591 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
592 		error = EINPROGRESS;
593 		goto done1;
594 	}
595 	SOCK_LOCK(so);
596 	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
597 		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
598 		    "connec", 0);
599 		if (error) {
600 			if (error == EINTR || error == ERESTART)
601 				interrupted = 1;
602 			break;
603 		}
604 	}
605 	if (error == 0) {
606 		error = so->so_error;
607 		so->so_error = 0;
608 	}
609 	SOCK_UNLOCK(so);
610 bad:
611 	if (!interrupted)
612 		so->so_state &= ~SS_ISCONNECTING;
613 	if (error == ERESTART)
614 		error = EINTR;
615 done1:
616 	fdrop(fp, td);
617 done2:
618 	NET_UNLOCK_GIANT();
619 	return (error);
620 }
621 
622 /*
623  * MPSAFE
624  */
625 int
626 socketpair(td, uap)
627 	struct thread *td;
628 	register struct socketpair_args /* {
629 		int	domain;
630 		int	type;
631 		int	protocol;
632 		int	*rsv;
633 	} */ *uap;
634 {
635 	register struct filedesc *fdp = td->td_proc->p_fd;
636 	struct file *fp1, *fp2;
637 	struct socket *so1, *so2;
638 	int fd, error, sv[2];
639 
640 #ifdef MAC
641 	/* We might want to have a separate check for socket pairs. */
642 	error = mac_check_socket_create(td->td_ucred, uap->domain, uap->type,
643 	    uap->protocol);
644 	if (error)
645 		return (error);
646 #endif
647 
648 	NET_LOCK_GIANT();
649 	error = socreate(uap->domain, &so1, uap->type, uap->protocol,
650 	    td->td_ucred, td);
651 	if (error)
652 		goto done2;
653 	error = socreate(uap->domain, &so2, uap->type, uap->protocol,
654 	    td->td_ucred, td);
655 	if (error)
656 		goto free1;
657 	/* An extra reference on `fp1' and `fp2' is held for us by falloc(). */
658 	error = falloc(td, &fp1, &fd);
659 	if (error)
660 		goto free2;
661 	sv[0] = fd;
662 	fp1->f_data = so1;	/* so1 already has ref count */
663 	error = falloc(td, &fp2, &fd);
664 	if (error)
665 		goto free3;
666 	fp2->f_data = so2;	/* so2 already has ref count */
667 	sv[1] = fd;
668 	error = soconnect2(so1, so2);
669 	if (error)
670 		goto free4;
671 	if (uap->type == SOCK_DGRAM) {
672 		/*
673 		 * Datagram socket connection is asymmetric.
674 		 */
675 		 error = soconnect2(so2, so1);
676 		 if (error)
677 			goto free4;
678 	}
679 	FILE_LOCK(fp1);
680 	fp1->f_flag = FREAD|FWRITE;
681 	fp1->f_ops = &socketops;
682 	fp1->f_type = DTYPE_SOCKET;
683 	FILE_UNLOCK(fp1);
684 	FILE_LOCK(fp2);
685 	fp2->f_flag = FREAD|FWRITE;
686 	fp2->f_ops = &socketops;
687 	fp2->f_type = DTYPE_SOCKET;
688 	FILE_UNLOCK(fp2);
689 	error = copyout(sv, uap->rsv, 2 * sizeof (int));
690 	fdrop(fp1, td);
691 	fdrop(fp2, td);
692 	goto done2;
693 free4:
694 	fdclose(fdp, fp2, sv[1], td);
695 	fdrop(fp2, td);
696 free3:
697 	fdclose(fdp, fp1, sv[0], td);
698 	fdrop(fp1, td);
699 free2:
700 	(void)soclose(so2);
701 free1:
702 	(void)soclose(so1);
703 done2:
704 	NET_UNLOCK_GIANT();
705 	return (error);
706 }
707 
708 static int
709 sendit(td, s, mp, flags)
710 	register struct thread *td;
711 	int s;
712 	register struct msghdr *mp;
713 	int flags;
714 {
715 	struct mbuf *control;
716 	struct sockaddr *to;
717 	int error;
718 
719 	if (mp->msg_name != NULL) {
720 		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
721 		if (error) {
722 			to = NULL;
723 			goto bad;
724 		}
725 		mp->msg_name = to;
726 	} else {
727 		to = NULL;
728 	}
729 
730 	if (mp->msg_control) {
731 		if (mp->msg_controllen < sizeof(struct cmsghdr)
732 #ifdef COMPAT_OLDSOCK
733 		    && mp->msg_flags != MSG_COMPAT
734 #endif
735 		) {
736 			error = EINVAL;
737 			goto bad;
738 		}
739 		error = sockargs(&control, mp->msg_control,
740 		    mp->msg_controllen, MT_CONTROL);
741 		if (error)
742 			goto bad;
743 #ifdef COMPAT_OLDSOCK
744 		if (mp->msg_flags == MSG_COMPAT) {
745 			register struct cmsghdr *cm;
746 
747 			M_PREPEND(control, sizeof(*cm), M_TRYWAIT);
748 			if (control == 0) {
749 				error = ENOBUFS;
750 				goto bad;
751 			} else {
752 				cm = mtod(control, struct cmsghdr *);
753 				cm->cmsg_len = control->m_len;
754 				cm->cmsg_level = SOL_SOCKET;
755 				cm->cmsg_type = SCM_RIGHTS;
756 			}
757 		}
758 #endif
759 	} else {
760 		control = NULL;
761 	}
762 
763 	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
764 
765 bad:
766 	if (to)
767 		FREE(to, M_SONAME);
768 	return (error);
769 }
770 
771 int
772 kern_sendit(td, s, mp, flags, control, segflg)
773 	struct thread *td;
774 	int s;
775 	struct msghdr *mp;
776 	int flags;
777 	struct mbuf *control;
778 	enum uio_seg segflg;
779 {
780 	struct file *fp;
781 	struct uio auio;
782 	struct iovec *iov;
783 	struct socket *so;
784 	int i;
785 	int len, error;
786 #ifdef KTRACE
787 	struct uio *ktruio = NULL;
788 #endif
789 
790 	NET_LOCK_GIANT();
791 	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
792 	if (error)
793 		goto bad2;
794 	so = (struct socket *)fp->f_data;
795 
796 #ifdef MAC
797 	SOCK_LOCK(so);
798 	error = mac_check_socket_send(td->td_ucred, so);
799 	SOCK_UNLOCK(so);
800 	if (error)
801 		goto bad;
802 #endif
803 
804 	auio.uio_iov = mp->msg_iov;
805 	auio.uio_iovcnt = mp->msg_iovlen;
806 	auio.uio_segflg = segflg;
807 	auio.uio_rw = UIO_WRITE;
808 	auio.uio_td = td;
809 	auio.uio_offset = 0;			/* XXX */
810 	auio.uio_resid = 0;
811 	iov = mp->msg_iov;
812 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
813 		if ((auio.uio_resid += iov->iov_len) < 0) {
814 			error = EINVAL;
815 			goto bad;
816 		}
817 	}
818 #ifdef KTRACE
819 	if (KTRPOINT(td, KTR_GENIO))
820 		ktruio = cloneuio(&auio);
821 #endif
822 	len = auio.uio_resid;
823 	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
824 	if (error) {
825 		if (auio.uio_resid != len && (error == ERESTART ||
826 		    error == EINTR || error == EWOULDBLOCK))
827 			error = 0;
828 		/* Generation of SIGPIPE can be controlled per socket */
829 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
830 		    !(flags & MSG_NOSIGNAL)) {
831 			PROC_LOCK(td->td_proc);
832 			psignal(td->td_proc, SIGPIPE);
833 			PROC_UNLOCK(td->td_proc);
834 		}
835 	}
836 	if (error == 0)
837 		td->td_retval[0] = len - auio.uio_resid;
838 #ifdef KTRACE
839 	if (ktruio != NULL) {
840 		ktruio->uio_resid = td->td_retval[0];
841 		ktrgenio(s, UIO_WRITE, ktruio, error);
842 	}
843 #endif
844 bad:
845 	fdrop(fp, td);
846 bad2:
847 	NET_UNLOCK_GIANT();
848 	return (error);
849 }
850 
851 /*
852  * MPSAFE
853  */
854 int
855 sendto(td, uap)
856 	struct thread *td;
857 	register struct sendto_args /* {
858 		int	s;
859 		caddr_t	buf;
860 		size_t	len;
861 		int	flags;
862 		caddr_t	to;
863 		int	tolen;
864 	} */ *uap;
865 {
866 	struct msghdr msg;
867 	struct iovec aiov;
868 	int error;
869 
870 	msg.msg_name = uap->to;
871 	msg.msg_namelen = uap->tolen;
872 	msg.msg_iov = &aiov;
873 	msg.msg_iovlen = 1;
874 	msg.msg_control = 0;
875 #ifdef COMPAT_OLDSOCK
876 	msg.msg_flags = 0;
877 #endif
878 	aiov.iov_base = uap->buf;
879 	aiov.iov_len = uap->len;
880 	error = sendit(td, uap->s, &msg, uap->flags);
881 	return (error);
882 }
883 
884 #ifdef COMPAT_OLDSOCK
885 /*
886  * MPSAFE
887  */
888 int
889 osend(td, uap)
890 	struct thread *td;
891 	register struct osend_args /* {
892 		int	s;
893 		caddr_t	buf;
894 		int	len;
895 		int	flags;
896 	} */ *uap;
897 {
898 	struct msghdr msg;
899 	struct iovec aiov;
900 	int error;
901 
902 	msg.msg_name = 0;
903 	msg.msg_namelen = 0;
904 	msg.msg_iov = &aiov;
905 	msg.msg_iovlen = 1;
906 	aiov.iov_base = uap->buf;
907 	aiov.iov_len = uap->len;
908 	msg.msg_control = 0;
909 	msg.msg_flags = 0;
910 	error = sendit(td, uap->s, &msg, uap->flags);
911 	return (error);
912 }
913 
914 /*
915  * MPSAFE
916  */
917 int
918 osendmsg(td, uap)
919 	struct thread *td;
920 	struct osendmsg_args /* {
921 		int	s;
922 		caddr_t	msg;
923 		int	flags;
924 	} */ *uap;
925 {
926 	struct msghdr msg;
927 	struct iovec *iov;
928 	int error;
929 
930 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
931 	if (error)
932 		return (error);
933 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
934 	if (error)
935 		return (error);
936 	msg.msg_iov = iov;
937 	msg.msg_flags = MSG_COMPAT;
938 	error = sendit(td, uap->s, &msg, uap->flags);
939 	free(iov, M_IOV);
940 	return (error);
941 }
942 #endif
943 
944 /*
945  * MPSAFE
946  */
947 int
948 sendmsg(td, uap)
949 	struct thread *td;
950 	struct sendmsg_args /* {
951 		int	s;
952 		caddr_t	msg;
953 		int	flags;
954 	} */ *uap;
955 {
956 	struct msghdr msg;
957 	struct iovec *iov;
958 	int error;
959 
960 	error = copyin(uap->msg, &msg, sizeof (msg));
961 	if (error)
962 		return (error);
963 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
964 	if (error)
965 		return (error);
966 	msg.msg_iov = iov;
967 #ifdef COMPAT_OLDSOCK
968 	msg.msg_flags = 0;
969 #endif
970 	error = sendit(td, uap->s, &msg, uap->flags);
971 	free(iov, M_IOV);
972 	return (error);
973 }
974 
975 int
976 kern_recvit(td, s, mp, fromseg, controlp)
977 	struct thread *td;
978 	int s;
979 	struct msghdr *mp;
980 	enum uio_seg fromseg;
981 	struct mbuf **controlp;
982 {
983 	struct uio auio;
984 	struct iovec *iov;
985 	int i;
986 	socklen_t len;
987 	int error;
988 	struct mbuf *m, *control = 0;
989 	caddr_t ctlbuf;
990 	struct file *fp;
991 	struct socket *so;
992 	struct sockaddr *fromsa = 0;
993 #ifdef KTRACE
994 	struct uio *ktruio = NULL;
995 #endif
996 
997 	if(controlp != NULL)
998 		*controlp = 0;
999 
1000 	NET_LOCK_GIANT();
1001 	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
1002 	if (error) {
1003 		NET_UNLOCK_GIANT();
1004 		return (error);
1005 	}
1006 	so = fp->f_data;
1007 
1008 #ifdef MAC
1009 	SOCK_LOCK(so);
1010 	error = mac_check_socket_receive(td->td_ucred, so);
1011 	SOCK_UNLOCK(so);
1012 	if (error) {
1013 		fdrop(fp, td);
1014 		NET_UNLOCK_GIANT();
1015 		return (error);
1016 	}
1017 #endif
1018 
1019 	auio.uio_iov = mp->msg_iov;
1020 	auio.uio_iovcnt = mp->msg_iovlen;
1021 	auio.uio_segflg = UIO_USERSPACE;
1022 	auio.uio_rw = UIO_READ;
1023 	auio.uio_td = td;
1024 	auio.uio_offset = 0;			/* XXX */
1025 	auio.uio_resid = 0;
1026 	iov = mp->msg_iov;
1027 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
1028 		if ((auio.uio_resid += iov->iov_len) < 0) {
1029 			fdrop(fp, td);
1030 			NET_UNLOCK_GIANT();
1031 			return (EINVAL);
1032 		}
1033 	}
1034 #ifdef KTRACE
1035 	if (KTRPOINT(td, KTR_GENIO))
1036 		ktruio = cloneuio(&auio);
1037 #endif
1038 	len = auio.uio_resid;
1039 	error = soreceive(so, &fromsa, &auio, (struct mbuf **)0,
1040 	    (mp->msg_control || controlp) ? &control : (struct mbuf **)0,
1041 	    &mp->msg_flags);
1042 	if (error) {
1043 		if (auio.uio_resid != (int)len && (error == ERESTART ||
1044 		    error == EINTR || error == EWOULDBLOCK))
1045 			error = 0;
1046 	}
1047 #ifdef KTRACE
1048 	if (ktruio != NULL) {
1049 		ktruio->uio_resid = (int)len - auio.uio_resid;
1050 		ktrgenio(s, UIO_READ, ktruio, error);
1051 	}
1052 #endif
1053 	if (error)
1054 		goto out;
1055 	td->td_retval[0] = (int)len - auio.uio_resid;
1056 	if (mp->msg_name) {
1057 		len = mp->msg_namelen;
1058 		if (len <= 0 || fromsa == 0)
1059 			len = 0;
1060 		else {
1061 			/* save sa_len before it is destroyed by MSG_COMPAT */
1062 			len = MIN(len, fromsa->sa_len);
1063 #ifdef COMPAT_OLDSOCK
1064 			if (mp->msg_flags & MSG_COMPAT)
1065 				((struct osockaddr *)fromsa)->sa_family =
1066 				    fromsa->sa_family;
1067 #endif
1068 			if (fromseg == UIO_USERSPACE) {
1069 				error = copyout(fromsa, mp->msg_name,
1070 				    (unsigned)len);
1071 				if (error)
1072 					goto out;
1073 			} else
1074 				bcopy(fromsa, mp->msg_name, len);
1075 		}
1076 		mp->msg_namelen = len;
1077 	}
1078 	if (mp->msg_control && controlp == NULL) {
1079 #ifdef COMPAT_OLDSOCK
1080 		/*
1081 		 * We assume that old recvmsg calls won't receive access
1082 		 * rights and other control info, esp. as control info
1083 		 * is always optional and those options didn't exist in 4.3.
1084 		 * If we receive rights, trim the cmsghdr; anything else
1085 		 * is tossed.
1086 		 */
1087 		if (control && mp->msg_flags & MSG_COMPAT) {
1088 			if (mtod(control, struct cmsghdr *)->cmsg_level !=
1089 			    SOL_SOCKET ||
1090 			    mtod(control, struct cmsghdr *)->cmsg_type !=
1091 			    SCM_RIGHTS) {
1092 				mp->msg_controllen = 0;
1093 				goto out;
1094 			}
1095 			control->m_len -= sizeof (struct cmsghdr);
1096 			control->m_data += sizeof (struct cmsghdr);
1097 		}
1098 #endif
1099 		len = mp->msg_controllen;
1100 		m = control;
1101 		mp->msg_controllen = 0;
1102 		ctlbuf = mp->msg_control;
1103 
1104 		while (m && len > 0) {
1105 			unsigned int tocopy;
1106 
1107 			if (len >= m->m_len)
1108 				tocopy = m->m_len;
1109 			else {
1110 				mp->msg_flags |= MSG_CTRUNC;
1111 				tocopy = len;
1112 			}
1113 
1114 			if ((error = copyout(mtod(m, caddr_t),
1115 					ctlbuf, tocopy)) != 0)
1116 				goto out;
1117 
1118 			ctlbuf += tocopy;
1119 			len -= tocopy;
1120 			m = m->m_next;
1121 		}
1122 		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
1123 	}
1124 out:
1125 	fdrop(fp, td);
1126 	NET_UNLOCK_GIANT();
1127 	if (fromsa)
1128 		FREE(fromsa, M_SONAME);
1129 
1130 	if (error == 0 && controlp != NULL)
1131 		*controlp = control;
1132 	else  if (control)
1133 		m_freem(control);
1134 
1135 	return (error);
1136 }
1137 
1138 static int
1139 recvit(td, s, mp, namelenp)
1140 	struct thread *td;
1141 	int s;
1142 	struct msghdr *mp;
1143 	void *namelenp;
1144 {
1145 	int error;
1146 
1147 	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
1148 	if (error)
1149 		return (error);
1150 	if (namelenp) {
1151 		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
1152 #ifdef COMPAT_OLDSOCK
1153 		if (mp->msg_flags & MSG_COMPAT)
1154 			error = 0;	/* old recvfrom didn't check */
1155 #endif
1156 	}
1157 	return (error);
1158 }
1159 
1160 /*
1161  * MPSAFE
1162  */
1163 int
1164 recvfrom(td, uap)
1165 	struct thread *td;
1166 	register struct recvfrom_args /* {
1167 		int	s;
1168 		caddr_t	buf;
1169 		size_t	len;
1170 		int	flags;
1171 		struct sockaddr * __restrict	from;
1172 		socklen_t * __restrict fromlenaddr;
1173 	} */ *uap;
1174 {
1175 	struct msghdr msg;
1176 	struct iovec aiov;
1177 	int error;
1178 
1179 	if (uap->fromlenaddr) {
1180 		error = copyin(uap->fromlenaddr,
1181 		    &msg.msg_namelen, sizeof (msg.msg_namelen));
1182 		if (error)
1183 			goto done2;
1184 	} else {
1185 		msg.msg_namelen = 0;
1186 	}
1187 	msg.msg_name = uap->from;
1188 	msg.msg_iov = &aiov;
1189 	msg.msg_iovlen = 1;
1190 	aiov.iov_base = uap->buf;
1191 	aiov.iov_len = uap->len;
1192 	msg.msg_control = 0;
1193 	msg.msg_flags = uap->flags;
1194 	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
1195 done2:
1196 	return(error);
1197 }
1198 
1199 #ifdef COMPAT_OLDSOCK
1200 /*
1201  * MPSAFE
1202  */
1203 int
1204 orecvfrom(td, uap)
1205 	struct thread *td;
1206 	struct recvfrom_args *uap;
1207 {
1208 
1209 	uap->flags |= MSG_COMPAT;
1210 	return (recvfrom(td, uap));
1211 }
1212 #endif
1213 
1214 
1215 #ifdef COMPAT_OLDSOCK
1216 /*
1217  * MPSAFE
1218  */
1219 int
1220 orecv(td, uap)
1221 	struct thread *td;
1222 	register struct orecv_args /* {
1223 		int	s;
1224 		caddr_t	buf;
1225 		int	len;
1226 		int	flags;
1227 	} */ *uap;
1228 {
1229 	struct msghdr msg;
1230 	struct iovec aiov;
1231 	int error;
1232 
1233 	msg.msg_name = 0;
1234 	msg.msg_namelen = 0;
1235 	msg.msg_iov = &aiov;
1236 	msg.msg_iovlen = 1;
1237 	aiov.iov_base = uap->buf;
1238 	aiov.iov_len = uap->len;
1239 	msg.msg_control = 0;
1240 	msg.msg_flags = uap->flags;
1241 	error = recvit(td, uap->s, &msg, NULL);
1242 	return (error);
1243 }
1244 
1245 /*
1246  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
1247  * overlays the new one, missing only the flags, and with the (old) access
1248  * rights where the control fields are now.
1249  *
1250  * MPSAFE
1251  */
1252 int
1253 orecvmsg(td, uap)
1254 	struct thread *td;
1255 	struct orecvmsg_args /* {
1256 		int	s;
1257 		struct	omsghdr *msg;
1258 		int	flags;
1259 	} */ *uap;
1260 {
1261 	struct msghdr msg;
1262 	struct iovec *iov;
1263 	int error;
1264 
1265 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1266 	if (error)
1267 		return (error);
1268 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1269 	if (error)
1270 		return (error);
1271 	msg.msg_flags = uap->flags | MSG_COMPAT;
1272 	msg.msg_iov = iov;
1273 	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
1274 	if (msg.msg_controllen && error == 0)
1275 		error = copyout(&msg.msg_controllen,
1276 		    &uap->msg->msg_accrightslen, sizeof (int));
1277 	free(iov, M_IOV);
1278 	return (error);
1279 }
1280 #endif
1281 
1282 /*
1283  * MPSAFE
1284  */
1285 int
1286 recvmsg(td, uap)
1287 	struct thread *td;
1288 	struct recvmsg_args /* {
1289 		int	s;
1290 		struct	msghdr *msg;
1291 		int	flags;
1292 	} */ *uap;
1293 {
1294 	struct msghdr msg;
1295 	struct iovec *uiov, *iov;
1296 	int error;
1297 
1298 	error = copyin(uap->msg, &msg, sizeof (msg));
1299 	if (error)
1300 		return (error);
1301 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1302 	if (error)
1303 		return (error);
1304 	msg.msg_flags = uap->flags;
1305 #ifdef COMPAT_OLDSOCK
1306 	msg.msg_flags &= ~MSG_COMPAT;
1307 #endif
1308 	uiov = msg.msg_iov;
1309 	msg.msg_iov = iov;
1310 	error = recvit(td, uap->s, &msg, NULL);
1311 	if (error == 0) {
1312 		msg.msg_iov = uiov;
1313 		error = copyout(&msg, uap->msg, sizeof(msg));
1314 	}
1315 	free(iov, M_IOV);
1316 	return (error);
1317 }
1318 
1319 /*
1320  * MPSAFE
1321  */
1322 /* ARGSUSED */
1323 int
1324 shutdown(td, uap)
1325 	struct thread *td;
1326 	register struct shutdown_args /* {
1327 		int	s;
1328 		int	how;
1329 	} */ *uap;
1330 {
1331 	struct socket *so;
1332 	struct file *fp;
1333 	int error;
1334 
1335 	NET_LOCK_GIANT();
1336 	error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL);
1337 	if (error == 0) {
1338 		so = fp->f_data;
1339 		error = soshutdown(so, uap->how);
1340 		fdrop(fp, td);
1341 	}
1342 	NET_UNLOCK_GIANT();
1343 	return (error);
1344 }
1345 
1346 /*
1347  * MPSAFE
1348  */
1349 /* ARGSUSED */
1350 int
1351 setsockopt(td, uap)
1352 	struct thread *td;
1353 	register struct setsockopt_args /* {
1354 		int	s;
1355 		int	level;
1356 		int	name;
1357 		caddr_t	val;
1358 		int	valsize;
1359 	} */ *uap;
1360 {
1361 
1362 	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
1363 	    uap->val, UIO_USERSPACE, uap->valsize));
1364 }
1365 
1366 int
1367 kern_setsockopt(td, s, level, name, val, valseg, valsize)
1368 	struct thread *td;
1369 	int s;
1370 	int level;
1371 	int name;
1372 	void *val;
1373 	enum uio_seg valseg;
1374 	socklen_t valsize;
1375 {
1376 	int error;
1377 	struct socket *so;
1378 	struct file *fp;
1379 	struct sockopt sopt;
1380 
1381 	if (val == NULL && valsize != 0)
1382 		return (EFAULT);
1383 	if ((int)valsize < 0)
1384 		return (EINVAL);
1385 
1386 	sopt.sopt_dir = SOPT_SET;
1387 	sopt.sopt_level = level;
1388 	sopt.sopt_name = name;
1389 	sopt.sopt_val = val;
1390 	sopt.sopt_valsize = valsize;
1391 	switch (valseg) {
1392 	case UIO_USERSPACE:
1393 		sopt.sopt_td = td;
1394 		break;
1395 	case UIO_SYSSPACE:
1396 		sopt.sopt_td = NULL;
1397 		break;
1398 	default:
1399 		panic("kern_setsockopt called with bad valseg");
1400 	}
1401 
1402 	NET_LOCK_GIANT();
1403 	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
1404 	if (error == 0) {
1405 		so = fp->f_data;
1406 		error = sosetopt(so, &sopt);
1407 		fdrop(fp, td);
1408 	}
1409 	NET_UNLOCK_GIANT();
1410 	return(error);
1411 }
1412 
1413 /*
1414  * MPSAFE
1415  */
1416 /* ARGSUSED */
1417 int
1418 getsockopt(td, uap)
1419 	struct thread *td;
1420 	register struct getsockopt_args /* {
1421 		int	s;
1422 		int	level;
1423 		int	name;
1424 		void * __restrict	val;
1425 		socklen_t * __restrict avalsize;
1426 	} */ *uap;
1427 {
1428 	socklen_t valsize;
1429 	int	error;
1430 
1431 	if (uap->val) {
1432 		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
1433 		if (error)
1434 			return (error);
1435 	}
1436 
1437 	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
1438 	    uap->val, UIO_USERSPACE, &valsize);
1439 
1440 	if (error == 0)
1441 		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
1442 	return (error);
1443 }
1444 
1445 /*
1446  * Kernel version of getsockopt.
1447  * optval can be a userland or kernel pointer.  optlen is always a kernel pointer.
1448  */
1449 int
1450 kern_getsockopt(td, s, level, name, val, valseg, valsize)
1451 	struct thread *td;
1452 	int s;
1453 	int level;
1454 	int name;
1455 	void *val;
1456 	enum uio_seg valseg;
1457 	socklen_t *valsize;
1458 {
1459 	int error;
1460 	struct  socket *so;
1461 	struct file *fp;
1462 	struct	sockopt sopt;
1463 
1464 	if (val == NULL)
1465 		*valsize = 0;
1466 	if ((int)*valsize < 0)
1467 		return (EINVAL);
1468 
1469 	sopt.sopt_dir = SOPT_GET;
1470 	sopt.sopt_level = level;
1471 	sopt.sopt_name = name;
1472 	sopt.sopt_val = val;
1473 	sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
1474 	switch (valseg) {
1475 	case UIO_USERSPACE:
1476 		sopt.sopt_td = td;
1477 		break;
1478 	case UIO_SYSSPACE:
1479 		sopt.sopt_td = NULL;
1480 		break;
1481 	default:
1482 		panic("kern_getsockopt called with bad valseg");
1483 	}
1484 
1485 	NET_LOCK_GIANT();
1486 	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
1487 	if (error == 0) {
1488 		so = fp->f_data;
1489 		error = sogetopt(so, &sopt);
1490 		*valsize = sopt.sopt_valsize;
1491 		fdrop(fp, td);
1492 	}
1493 	NET_UNLOCK_GIANT();
1494 	return (error);
1495 }
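
/*
 * A minimal sketch of an in-kernel caller using kern_getsockopt() with a
 * kernel buffer (UIO_SYSSPACE); the variable names here are hypothetical
 * and not taken from this file:
 *
 *	int sndbuf;
 *	socklen_t optlen = sizeof(sndbuf);
 *
 *	error = kern_getsockopt(td, s, SOL_SOCKET, SO_SNDBUF,
 *	    &sndbuf, UIO_SYSSPACE, &optlen);
 *	if (error == 0)
 *		... sndbuf now holds the socket's send buffer size ...
 */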
1496 
1497 /*
1498  * getsockname1() - Get socket name.
1499  *
1500  * MPSAFE
1501  */
1502 /* ARGSUSED */
1503 static int
1504 getsockname1(td, uap, compat)
1505 	struct thread *td;
1506 	register struct getsockname_args /* {
1507 		int	fdes;
1508 		struct sockaddr * __restrict asa;
1509 		socklen_t * __restrict alen;
1510 	} */ *uap;
1511 	int compat;
1512 {
1513 	struct sockaddr *sa;
1514 	socklen_t len;
1515 	int error;
1516 
1517 	error = copyin(uap->alen, &len, sizeof(len));
1518 	if (error)
1519 		return (error);
1520 
1521 	error = kern_getsockname(td, uap->fdes, &sa, &len);
1522 	if (error)
1523 		return (error);
1524 
1525 	if (len != 0) {
1526 #ifdef COMPAT_OLDSOCK
1527 		if (compat)
1528 			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1529 #endif
1530 		error = copyout(sa, uap->asa, (u_int)len);
1531 	}
1532 	free(sa, M_SONAME);
1533 	if (error == 0)
1534 		error = copyout(&len, uap->alen, sizeof(len));
1535 	return (error);
1536 }
1537 
1538 int
1539 kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
1540     socklen_t *alen)
1541 {
1542 	struct socket *so;
1543 	struct file *fp;
1544 	socklen_t len;
1545 	int error;
1546 
1547 	if (*alen < 0)
1548 		return (EINVAL);
1549 
1550 	NET_LOCK_GIANT();
1551 	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
1552 	if (error)
1553 		goto done;
1554 	so = fp->f_data;
1555 	*sa = NULL;
1556 	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
1557 	if (error)
1558 		goto bad;
1559 	if (*sa == NULL)
1560 		len = 0;
1561 	else
1562 		len = MIN(*alen, (*sa)->sa_len);
1563 	*alen = len;
1564 bad:
1565 	fdrop(fp, td);
1566 	if (error && *sa) {
1567 		free(*sa, M_SONAME);
1568 		*sa = NULL;
1569 	}
1570 done:
1571 	NET_UNLOCK_GIANT();
1572 	return (error);
1573 }
1574 
1575 /*
1576  * MPSAFE
1577  */
1578 int
1579 getsockname(td, uap)
1580 	struct thread *td;
1581 	struct getsockname_args *uap;
1582 {
1583 
1584 	return (getsockname1(td, uap, 0));
1585 }
1586 
1587 #ifdef COMPAT_OLDSOCK
1588 /*
1589  * MPSAFE
1590  */
1591 int
1592 ogetsockname(td, uap)
1593 	struct thread *td;
1594 	struct getsockname_args *uap;
1595 {
1596 
1597 	return (getsockname1(td, uap, 1));
1598 }
1599 #endif /* COMPAT_OLDSOCK */
1600 
1601 /*
1602  * getpeername1() - Get name of peer for connected socket.
1603  *
1604  * MPSAFE
1605  */
1606 /* ARGSUSED */
1607 static int
1608 getpeername1(td, uap, compat)
1609 	struct thread *td;
1610 	register struct getpeername_args /* {
1611 		int	fdes;
1612 		struct sockaddr * __restrict	asa;
1613 		socklen_t * __restrict	alen;
1614 	} */ *uap;
1615 	int compat;
1616 {
1617 	struct sockaddr *sa;
1618 	socklen_t len;
1619 	int error;
1620 
1621 	error = copyin(uap->alen, &len, sizeof (len));
1622 	if (error)
1623 		return (error);
1624 
1625 	error = kern_getpeername(td, uap->fdes, &sa, &len);
1626 	if (error)
1627 		return (error);
1628 
1629 	if (len != 0) {
1630 #ifdef COMPAT_OLDSOCK
1631 		if (compat)
1632 			((struct osockaddr *)sa)->sa_family = sa->sa_family;
1633 #endif
1634 		error = copyout(sa, uap->asa, (u_int)len);
1635 	}
1636 	free(sa, M_SONAME);
1637 	if (error == 0)
1638 		error = copyout(&len, uap->alen, sizeof(len));
1639 	return (error);
1640 }
1641 
1642 int
1643 kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
1644     socklen_t *alen)
1645 {
1646 	struct socket *so;
1647 	struct file *fp;
1648 	socklen_t len;
1649 	int error;
1650 
1651 	if (*alen < 0)
1652 		return (EINVAL);
1653 
1654 	NET_LOCK_GIANT();
1655 	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
1656 	if (error)
1657 		goto done2;
1658 	so = fp->f_data;
1659 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1660 		error = ENOTCONN;
1661 		goto done1;
1662 	}
1663 	*sa = NULL;
1664 	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
1665 	if (error)
1666 		goto bad;
1667 	if (*sa == NULL)
1668 		len = 0;
1669 	else
1670 		len = MIN(*alen, (*sa)->sa_len);
1671 	*alen = len;
1672 bad:
1673 	if (error && *sa) {
1674 		free(*sa, M_SONAME);
1675 		*sa = NULL;
1676 	}
1677 done1:
1678 	fdrop(fp, td);
1679 done2:
1680 	NET_UNLOCK_GIANT();
1681 	return (error);
1682 }
1683 
1684 /*
1685  * MPSAFE
1686  */
1687 int
1688 getpeername(td, uap)
1689 	struct thread *td;
1690 	struct getpeername_args *uap;
1691 {
1692 
1693 	return (getpeername1(td, uap, 0));
1694 }
1695 
1696 #ifdef COMPAT_OLDSOCK
1697 /*
1698  * MPSAFE
1699  */
1700 int
1701 ogetpeername(td, uap)
1702 	struct thread *td;
1703 	struct ogetpeername_args *uap;
1704 {
1705 
1706 	/* XXX uap should have type `getpeername_args *' to begin with. */
1707 	return (getpeername1(td, (struct getpeername_args *)uap, 1));
1708 }
1709 #endif /* COMPAT_OLDSOCK */
1710 
1711 int
1712 sockargs(mp, buf, buflen, type)
1713 	struct mbuf **mp;
1714 	caddr_t buf;
1715 	int buflen, type;
1716 {
1717 	register struct sockaddr *sa;
1718 	register struct mbuf *m;
1719 	int error;
1720 
1721 	if ((u_int)buflen > MLEN) {
1722 #ifdef COMPAT_OLDSOCK
1723 		if (type == MT_SONAME && (u_int)buflen <= 112)
1724 			buflen = MLEN;		/* unix domain compat. hack */
1725 		else
1726 #endif
1727 			if ((u_int)buflen > MCLBYTES)
1728 				return (EINVAL);
1729 	}
1730 	m = m_get(M_TRYWAIT, type);
1731 	if (m == NULL)
1732 		return (ENOBUFS);
1733 	if ((u_int)buflen > MLEN) {
1734 		MCLGET(m, M_TRYWAIT);
1735 		if ((m->m_flags & M_EXT) == 0) {
1736 			m_free(m);
1737 			return (ENOBUFS);
1738 		}
1739 	}
1740 	m->m_len = buflen;
1741 	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1742 	if (error)
1743 		(void) m_free(m);
1744 	else {
1745 		*mp = m;
1746 		if (type == MT_SONAME) {
1747 			sa = mtod(m, struct sockaddr *);
1748 
1749 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1750 			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1751 				sa->sa_family = sa->sa_len;
1752 #endif
1753 			sa->sa_len = buflen;
1754 		}
1755 	}
1756 	return (error);
1757 }
1758 
1759 int
1760 getsockaddr(namp, uaddr, len)
1761 	struct sockaddr **namp;
1762 	caddr_t uaddr;
1763 	size_t len;
1764 {
1765 	struct sockaddr *sa;
1766 	int error;
1767 
1768 	if (len > SOCK_MAXADDRLEN)
1769 		return (ENAMETOOLONG);
1770 	if (len < offsetof(struct sockaddr, sa_data[0]))
1771 		return (EINVAL);
1772 	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1773 	error = copyin(uaddr, sa, len);
1774 	if (error) {
1775 		FREE(sa, M_SONAME);
1776 	} else {
1777 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1778 		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1779 			sa->sa_family = sa->sa_len;
1780 #endif
1781 		sa->sa_len = len;
1782 		*namp = sa;
1783 	}
1784 	return (error);
1785 }
1786 
1787 /*
1788  * Detach mapped page and release resources back to the system.
1789  */
1790 void
1791 sf_buf_mext(void *addr, void *args)
1792 {
1793 	vm_page_t m;
1794 
1795 	m = sf_buf_page(args);
1796 	sf_buf_free(args);
1797 	vm_page_lock_queues();
1798 	vm_page_unwire(m, 0);
1799 	/*
1800 	 * Check for the object going away on us. This can
1801 	 * happen since we don't hold a reference to it.
1802 	 * If so, we're responsible for freeing the page.
1803 	 */
1804 	if (m->wire_count == 0 && m->object == NULL)
1805 		vm_page_free(m);
1806 	vm_page_unlock_queues();
1807 }
1808 
1809 /*
1810  * sendfile(2)
1811  *
1812  * MPSAFE
1813  *
1814  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
1815  *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
1816  *
1817  * Send a file specified by 'fd' and starting at 'offset' to a socket
1818  * specified by 's'. Send only 'nbytes' of the file or until EOF if
1819  * nbytes == 0. Optionally add a header and/or trailer to the socket
1820  * output. If specified, write the total number of bytes sent into *sbytes.
1821  *
1822  */
1823 int
1824 sendfile(struct thread *td, struct sendfile_args *uap)
1825 {
1826 
1827 	return (do_sendfile(td, uap, 0));
1828 }
1829 
1830 static int
1831 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
1832 {
1833 	struct sf_hdtr hdtr;
1834 	struct uio *hdr_uio, *trl_uio;
1835 	int error;
1836 
1837 	hdr_uio = trl_uio = NULL;
1838 
1839 	if (uap->hdtr != NULL) {
1840 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1841 		if (error)
1842 			goto out;
1843 		if (hdtr.headers != NULL) {
1844 			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
1845 			if (error)
1846 				goto out;
1847 		}
1848 		if (hdtr.trailers != NULL) {
1849 			error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
1850 			if (error)
1851 				goto out;
1852 
1853 		}
1854 	}
1855 
1856 	error = kern_sendfile(td, uap, hdr_uio, trl_uio, compat);
1857 out:
1858 	if (hdr_uio)
1859 		free(hdr_uio, M_IOV);
1860 	if (trl_uio)
1861 		free(trl_uio, M_IOV);
1862 	return (error);
1863 }
1864 
1865 #ifdef COMPAT_FREEBSD4
1866 int
1867 freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
1868 {
1869 	struct sendfile_args args;
1870 
1871 	args.fd = uap->fd;
1872 	args.s = uap->s;
1873 	args.offset = uap->offset;
1874 	args.nbytes = uap->nbytes;
1875 	args.hdtr = uap->hdtr;
1876 	args.sbytes = uap->sbytes;
1877 	args.flags = uap->flags;
1878 
1879 	return (do_sendfile(td, &args, 1));
1880 }
1881 #endif /* COMPAT_FREEBSD4 */
1882 
1883 int
1884 kern_sendfile(struct thread *td, struct sendfile_args *uap,
1885     struct uio *hdr_uio, struct uio *trl_uio, int compat)
1886 {
1887 	struct file *sock_fp;
1888 	struct vnode *vp;
1889 	struct vm_object *obj = NULL;
1890 	struct socket *so = NULL;
1891 	struct mbuf *m = NULL;
1892 	struct sf_buf *sf;
1893 	struct vm_page *pg;
1894 	off_t off, xfsize, sbytes = 0, rem = 0;
1895 	int error, mnw = 0;
1896 	int vfslocked;
1897 
1898 	NET_LOCK_GIANT();
1899 
1900 	/*
1901 	 * The file descriptor must be a regular file and have a
1902 	 * backing VM object.
1903 	 * File offset must not be negative.  If it goes beyond EOF
1904 	 * we send only the header/trailer and no payload data.
1905 	 */
1906 	if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
1907 		goto out;
1908 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1909 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1910 	obj = vp->v_object;
1911 	if (obj != NULL) {
1912 		/*
1913 		 * Temporarily increase the backing VM object's reference
1914 		 * count so that a forced reclamation of its vnode does not
1915 		 * immediately destroy it.
1916 		 */
1917 		VM_OBJECT_LOCK(obj);
1918 		if ((obj->flags & OBJ_DEAD) == 0) {
1919 			vm_object_reference_locked(obj);
1920 			VM_OBJECT_UNLOCK(obj);
1921 		} else {
1922 			VM_OBJECT_UNLOCK(obj);
1923 			obj = NULL;
1924 		}
1925 	}
1926 	VOP_UNLOCK(vp, 0, td);
1927 	VFS_UNLOCK_GIANT(vfslocked);
1928 	if (obj == NULL) {
1929 		error = EINVAL;
1930 		goto out;
1931 	}
1932 	if (uap->offset < 0) {
1933 		error = EINVAL;
1934 		goto out;
1935 	}
1936 
1937 	/*
1938 	 * The socket must be a stream socket and connected.
1939 	 * Remember if it is a blocking or non-blocking socket.
1940 	 */
1941 	if ((error = getsock(td->td_proc->p_fd, uap->s, &sock_fp,
1942 	    NULL)) != 0)
1943 		goto out;
1944 	so = sock_fp->f_data;
1945 	if (so->so_type != SOCK_STREAM) {
1946 		error = EINVAL;
1947 		goto out;
1948 	}
1949 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1950 		error = ENOTCONN;
1951 		goto out;
1952 	}
1953 	/*
1954 	 * Do not wait on memory allocations but return EAGAIN for
1955 	 * the caller to retry later.
1956 	 * XXX: Experimental.
1957 	 */
1958 	if (uap->flags & SF_MNOWAIT)
1959 		mnw = 1;
1960 
1961 #ifdef MAC
1962 	SOCK_LOCK(so);
1963 	error = mac_check_socket_send(td->td_ucred, so);
1964 	SOCK_UNLOCK(so);
1965 	if (error)
1966 		goto out;
1967 #endif
1968 
1969 	/* If headers are specified copy them into mbufs. */
1970 	if (hdr_uio != NULL) {
1971 		hdr_uio->uio_td = td;
1972 		hdr_uio->uio_rw = UIO_WRITE;
1973 		if (hdr_uio->uio_resid > 0) {
1974 			/*
1975 			 * In FBSD < 5.0 the nbytes to send also included
1976 			 * the header.  If compat is specified subtract the
1977 			 * header size from nbytes.
1978 			 */
1979 			if (compat) {
1980 				if (uap->nbytes > hdr_uio->uio_resid)
1981 					uap->nbytes -= hdr_uio->uio_resid;
1982 				else
1983 					uap->nbytes = 0;
1984 			}
1985 			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
1986 			    0, 0, 0);
1987 			if (m == NULL) {
1988 				error = mnw ? EAGAIN : ENOBUFS;
1989 				goto out;
1990 			}
1991 		}
1992 	}
1993 
1994 	/* Protect against multiple writers to the socket. */
1995 	SOCKBUF_LOCK(&so->so_snd);
1996 	(void) sblock(&so->so_snd, M_WAITOK);
1997 	SOCKBUF_UNLOCK(&so->so_snd);
1998 
1999 	/*
2000 	 * Loop through the pages of the file, starting with the requested
2001 	 * offset. Get a file page (do I/O if necessary), map the file page
2002 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
2003 	 * it on the socket.
2004 	 * This is done in two loops.  The inner loop turns as many pages
2005 	 * as it can into mbufs, up to the available socket buffer space,
2006 	 * without blocking, so they are bulk delivered into the socket send buffer.
2007 	 * The outer loop checks the state and available space of the socket
2008 	 * and takes care of the overall progress.
2009 	 */
2010 	for (off = uap->offset; ; ) {
2011 		int loopbytes = 0;
2012 		int space = 0;
2013 		int done = 0;
2014 
2015 		/*
2016 		 * Check the socket state for ongoing connection,
2017 		 * no errors and space in socket buffer.
2018 		 * If space is low allow for the remainder of the
2019 		 * file to be processed if it fits the socket buffer.
2020 		 * Otherwise block waiting for sufficient space
2021 		 * to proceed, or if the socket is nonblocking, return
2022 		 * to userland with EAGAIN while reporting how far
2023 		 * we've come.
2024 		 * We wait until the socket buffer has significant free
2025 		 * space to do bulk sends.  This makes good use of file
2026 		 * system read ahead and allows packet segmentation
2027 		 * offloading hardware to take over lots of work.  If
2028 		 * we were not careful here we would send off only one
2029 		 * sfbuf at a time.
2030 		 */
2031 		SOCKBUF_LOCK(&so->so_snd);
2032 		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
2033 			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
2034 retry_space:
2035 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2036 			error = EPIPE;
2037 			SOCKBUF_UNLOCK(&so->so_snd);
2038 			goto done;
2039 		} else if (so->so_error) {
2040 			error = so->so_error;
2041 			so->so_error = 0;
2042 			SOCKBUF_UNLOCK(&so->so_snd);
2043 			goto done;
2044 		}
2045 		space = sbspace(&so->so_snd);
2046 		if (space < rem &&
2047 		    (space <= 0 ||
2048 		     space < so->so_snd.sb_lowat)) {
2049 			if (so->so_state & SS_NBIO) {
2050 				SOCKBUF_UNLOCK(&so->so_snd);
2051 				error = EAGAIN;
2052 				goto done;
2053 			}
2054 			/*
2055 			 * sbwait drops the lock while sleeping.
2056 			 * When we loop back to retry_space the
2057 			 * state may have changed and we retest
2058 			 * for it.
2059 			 */
2060 			error = sbwait(&so->so_snd);
2061 			/*
2062 			 * An error from sbwait usually indicates that we've
2063 			 * been interrupted by a signal. If we've sent anything
2064 			 * then return bytes sent, otherwise return the error.
2065 			 */
2066 			if (error) {
2067 				SOCKBUF_UNLOCK(&so->so_snd);
2068 				goto done;
2069 			}
2070 			goto retry_space;
2071 		}
2072 		SOCKBUF_UNLOCK(&so->so_snd);
2073 
2074 		/*
2075 		 * Loop and construct maximum sized mbuf chain to be bulk
2076 		 * dumped into socket buffer.
2077 		 */
2078 		while(space > loopbytes) {
2079 			vm_pindex_t pindex;
2080 			vm_offset_t pgoff;
2081 			struct mbuf *m0;
2082 
2083 			VM_OBJECT_LOCK(obj);
2084 			/*
2085 			 * Calculate the amount to transfer.
2086 			 * It must not exceed a page, the EOF,
2087 			 * or the passed-in nbytes.
2088 			 */
2089 			pgoff = (vm_offset_t)(off & PAGE_MASK);
2090 			xfsize = omin(PAGE_SIZE - pgoff,
2091 			    obj->un_pager.vnp.vnp_size - off -
2092 			    sbytes - loopbytes);
2093 			if (uap->nbytes)
2094 				rem = (uap->nbytes - sbytes - loopbytes);
2095 			else
2096 				rem = obj->un_pager.vnp.vnp_size - off -
2097 				    sbytes - loopbytes;
2098 			xfsize = omin(rem, xfsize);
2099 			if (xfsize <= 0) {
2100 				VM_OBJECT_UNLOCK(obj);
2101 				done = 1;		/* all data sent */
2102 				break;
2103 			}
2104 			/*
2105 			 * Don't overflow the send buffer.
2106 			 * Stop here and send out what we've
2107 			 * already got.
2108 			 */
2109 			if (space < loopbytes + xfsize) {
2110 				VM_OBJECT_UNLOCK(obj);
2111 				break;
2112 			}
2113 retry_lookup:
2114 			/*
2115 			 * Attempt to look up the page.
2116 			 * Allocate if not found or
2117 			 * wait and loop if busy.
2118 			 */
2119 			pindex = OFF_TO_IDX(off);
2120 			pg = vm_page_lookup(obj, pindex);
2121 			if (pg == NULL) {
2122 				pg = vm_page_alloc(obj, pindex,
2123 				    VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL |
2124 				    VM_ALLOC_WIRED);
2125 				if (pg == NULL) {
2126 					VM_OBJECT_UNLOCK(obj);
2127 					VM_WAIT;
2128 					VM_OBJECT_LOCK(obj);
2129 					goto retry_lookup;
2130 				}
2131 			} else if (vm_page_sleep_if_busy(pg, TRUE, "sfpbsy"))
2132 				goto retry_lookup;
2133 			else {
2134 				/*
2135 				 * Wire the page so it does not get
2136 				 * ripped out from under us.
2137 				 */
2138 				vm_page_lock_queues();
2139 				vm_page_wire(pg);
2140 				vm_page_unlock_queues();
2141 			}
2142 
2143 			/*
2144 			 * Check if page is valid for what we need,
2145 			 * otherwise initiate I/O.
2146 			 * If we already turned some pages into mbufs,
2147 			 * send them off before we come here again and
2148 			 * block.
2149 			 */
2150 			if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
2151 				VM_OBJECT_UNLOCK(obj);
2152 			else if (m != NULL)
2153 				error = EAGAIN;	/* send what we already got */
2154 			else if (uap->flags & SF_NODISKIO)
2155 				error = EBUSY;
2156 			else {
2157 				int bsize, resid;
2158 
2159 				/*
2160 				 * Ensure that our page is still around
2161 				 * when the I/O completes.
2162 				 */
2163 				vm_page_io_start(pg);
2164 				VM_OBJECT_UNLOCK(obj);
2165 
2166 				/*
2167 				 * Get the page from backing store.
2168 				 */
2169 				bsize = vp->v_mount->mnt_stat.f_iosize;
2170 				vfslocked = VFS_LOCK_GIANT(vp->v_mount);
2171 				vn_lock(vp, LK_SHARED | LK_RETRY, td);
2172 
2173 				/*
2174 				 * XXXMAC: Because we don't have fp->f_cred
2175 				 * here, we pass in NOCRED.  This is probably
2176 				 * wrong, but is consistent with our original
2177 				 * implementation.
2178 				 */
2179 				error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
2180 				    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
2181 				    IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
2182 				    td->td_ucred, NOCRED, &resid, td);
2183 				VOP_UNLOCK(vp, 0, td);
2184 				VFS_UNLOCK_GIANT(vfslocked);
2185 				VM_OBJECT_LOCK(obj);
2186 				vm_page_io_finish(pg);
2187 				if (!error)
2188 					VM_OBJECT_UNLOCK(obj);
2189 				mbstat.sf_iocnt++;
2190 			}
2191 			if (error) {
2192 				vm_page_lock_queues();
2193 				vm_page_unwire(pg, 0);
2194 				/*
2195 				 * See if anyone else might know about
2196 				 * this page.  If not and it is not valid,
2197 				 * then free it.
2198 				 */
2199 				if (pg->wire_count == 0 && pg->valid == 0 &&
2200 				    pg->busy == 0 && !(pg->oflags & VPO_BUSY) &&
2201 				    pg->hold_count == 0) {
2202 					vm_page_free(pg);
2203 				}
2204 				vm_page_unlock_queues();
2205 				VM_OBJECT_UNLOCK(obj);
2206 				if (error == EAGAIN)
2207 					error = 0;	/* not a real error */
2208 				break;
2209 			}
2210 
2211 			/*
2212 			 * Get a sendfile buf.  We usually wait as long
2213 			 * as necessary, but this wait can be interrupted.
2214 			 */
2215 			if ((sf = sf_buf_alloc(pg,
2216 			    (mnw ? SFB_NOWAIT : SFB_CATCH))) == NULL) {
2217 				mbstat.sf_allocfail++;
2218 				vm_page_lock_queues();
2219 				vm_page_unwire(pg, 0);
2220 				/*
2221 				 * XXX: Not same check as above!?
2222 				 */
2223 				if (pg->wire_count == 0 && pg->object == NULL)
2224 					vm_page_free(pg);
2225 				vm_page_unlock_queues();
2226 				error = (mnw ? EAGAIN : EINTR);
2227 				break;
2228 			}
2229 
2230 			/*
2231 			 * Get an mbuf and set it up as having
2232 			 * external storage.
2233 			 */
2234 			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
2235 			if (m0 == NULL) {
2236 				error = (mnw ? EAGAIN : ENOBUFS);
2237 				sf_buf_mext((void *)sf_buf_kva(sf), sf);
2238 				break;
2239 			}
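			/*
			 * Attach the sf_buf's KVA as read-only external
			 * storage.  sf_buf_mext() is the free routine that
			 * releases the sf_buf and unwires the page once the
			 * mbuf is freed; m_data/m_len below select just the
			 * requested window within the page.
			 */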
2240 			MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext,
2241 			    sf, M_RDONLY, EXT_SFBUF);
2242 			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
2243 			m0->m_len = xfsize;
2244 
2245 			/* Append to mbuf chain. */
2246 			if (m != NULL)
2247 				m_cat(m, m0);
2248 			else
2249 				m = m0;
2250 
2251 			/* Keep track of bytes processed. */
2252 			loopbytes += xfsize;
2253 			off += xfsize;
2254 		}
2255 
2256 		/* Add the buffer chain to the socket buffer. */
2257 		if (m != NULL) {
2258 			int mlen;
2259 
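			/*
			 * Record the chain length now; pru_send() below
			 * consumes the mbufs, so it cannot be measured
			 * afterwards.
			 */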
2260 			mlen = m_length(m, NULL);
2261 			SOCKBUF_LOCK(&so->so_snd);
2262 			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2263 				error = EPIPE;
2264 				SOCKBUF_UNLOCK(&so->so_snd);
2265 				goto done;
2266 			}
2267 			SOCKBUF_UNLOCK(&so->so_snd);
2268 			error = (*so->so_proto->pr_usrreqs->pru_send)
2269 				    (so, 0, m, NULL, NULL, td);
2270 			if (!error)
2271 				sbytes += mlen;
2272 			m = NULL;	/* pru_send always consumes */
2273 		}
2274 
2275 		/* Quit outer loop on error or when we're done. */
2276 		if (error || done)
2277 			goto done;
2278 	}
2279 
2280 	/*
2281 	 * Send trailers. Wimp out and use writev(2).
2282 	 */
2283 	if (trl_uio != NULL) {
2284 		error = kern_writev(td, uap->s, trl_uio);
2285 		if (error)
2286 			goto done;
2287 		sbytes += td->td_retval[0];
2288 	}
2289 
2290 done:
2291 	SOCKBUF_LOCK(&so->so_snd);
2292 	sbunlock(&so->so_snd);
2293 	SOCKBUF_UNLOCK(&so->so_snd);
2294 out:
2295 	/*
2296 	 * If there was no error we have to clear td->td_retval[0]
2297 	 * because it may have been set by writev.
2298 	 */
2299 	if (error == 0) {
2300 		td->td_retval[0] = 0;
2301 	}
2302 	if (uap->sbytes != NULL) {
2303 		copyout(&sbytes, uap->sbytes, sizeof(off_t));
2304 	}
2305 	if (obj != NULL)
2306 		vm_object_deallocate(obj);
2307 	if (vp != NULL) {
2308 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
2309 		vrele(vp);
2310 		VFS_UNLOCK_GIANT(vfslocked);
2311 	}
2312 	if (so)
2313 		fdrop(sock_fp, td);
2314 	if (m)
2315 		m_freem(m);
2316 
2317 	NET_UNLOCK_GIANT();
2318 
2319 	if (error == ERESTART)
2320 		error = EINTR;
2321 
2322 	return (error);
2323 }
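
/*
 * Illustrative userland sketch (not part of this file; "filefd" and
 * "sockfd" are placeholders): the trailer handling above corresponds to
 * the trailers member of the sf_hdtr argument described in sendfile(2).
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <sys/uio.h>
 *	#include <err.h>
 *
 *	struct iovec trl = { .iov_base = "\r\n", .iov_len = 2 };
 *	struct sf_hdtr hdtr = { .trailers = &trl, .trl_cnt = 1 };
 *	off_t sbytes;
 *
 *	if (sendfile(filefd, sockfd, 0, 0, &hdtr, &sbytes, 0) == -1)
 *		warn("sendfile");
 */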
2324 
2325 /*
2326  * SCTP syscalls.
2327  * Functionality is compiled in only if SCTP is enabled in the kernel
2328  * configuration ("options SCTP"); otherwise these all return EOPNOTSUPP.
2329  * XXX: We should make this loadable one day.
2330  */
2331 int
2332 sctp_peeloff(td, uap)
2333 	struct thread *td;
2334 	struct sctp_peeloff_args /* {
2335 		int	sd;
2336 		caddr_t	name;
2337 	} */ *uap;
2338 {
2339 #ifdef SCTP
2340 	struct filedesc *fdp;
2341 	struct file *nfp = NULL;
2342 	int error;
2343 	struct socket *head, *so;
2344 	int fd;
2345 	u_int fflag;
2346 
2347 	fdp = td->td_proc->p_fd;
2348 	error = fgetsock(td, uap->sd, &head, &fflag);
2349 	if (error)
2350 		goto done2;
2351 	error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
2352 	if (error)
2353 		goto done2;
2354 	/*
2355 	 * At this point we know we have an association to peel off,
2356 	 * so we proceed to set up the new file descriptor.  This may
2357 	 * block, but that is OK.
2358 	 */
2359 
2360 	error = falloc(td, &nfp, &fd);
2361 	if (error)
2362 		goto done;
2363 	td->td_retval[0] = fd;
2364 
2365 	so = sonewconn(head, SS_ISCONNECTED);
2366 	if (so == NULL)
2367 		goto noconnection;
2368 	/*
2369 	 * Before changing the flags on the socket, we have to bump the
2370 	 * reference count.  Otherwise, if the protocol calls sofree(),
2371 	 * the socket will be released due to a zero refcount.
2372 	 */
2373 	SOCK_LOCK(so);
2374 	soref(so);			/* file descriptor reference */
2375 	SOCK_UNLOCK(so);
2376 
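	/*
	 * Take the new socket off the head's completed-connection queue,
	 * much as accept1() does, so it can no longer be returned by
	 * accept(2) on the original one-to-many socket.
	 */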
2377 	ACCEPT_LOCK();
2378 
2379 	TAILQ_REMOVE(&head->so_comp, so, so_list);
2380 	head->so_qlen--;
2381 	so->so_state |= (head->so_state & SS_NBIO);
2382 	so->so_state &= ~SS_NOFDREF;
2383 	so->so_qstate &= ~SQ_COMP;
2384 	so->so_head = NULL;
2385 
2386 	ACCEPT_UNLOCK();
2387 
2388 	error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
2389 	if (error)
2390 		goto noconnection;
2391 	if (head->so_sigio != NULL)
2392 		fsetown(fgetown(&head->so_sigio), &so->so_sigio);
2393 
2394 	FILE_LOCK(nfp);
2395 	nfp->f_data = so;
2396 	nfp->f_flag = fflag;
2397 	nfp->f_ops = &socketops;
2398 	nfp->f_type = DTYPE_SOCKET;
2399 	FILE_UNLOCK(nfp);
2400 
2401 noconnection:
2402 	/*
2403 	 * If an error occurred, close the new descriptor, assuming
2404 	 * someone hasn't ripped it out from under us.
2405 	 */
2406 	if (error)
2407 		fdclose(fdp, nfp, fd, td);
2408 
2409 	/*
2410 	 * Release explicitly held references before returning.
2411 	 */
2412 done:
2413 	if (nfp != NULL)
2414 		fdrop(nfp, td);
2415 	fputsock(head);
2416 done2:
2417 	return (error);
2418 #else  /* SCTP */
2419 	return (EOPNOTSUPP);
2420 #endif /* SCTP */
2421 }
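
/*
 * Illustrative userland sketch (not part of this file; "sd" and "assoc_id"
 * are placeholders): the syscall above backs sctp_peeloff(2), which detaches
 * a single association from a one-to-many SCTP socket and returns it as a
 * new one-to-one socket descriptor.
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <netinet/sctp.h>
 *	#include <err.h>
 *
 *	int peeled = sctp_peeloff(sd, assoc_id);
 *	if (peeled == -1)
 *		warn("sctp_peeloff");
 */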
2422 
2423 int
2424 sctp_generic_sendmsg(td, uap)
2425 	struct thread *td;
2426 	struct sctp_generic_sendmsg_args /* {
2427 		int sd,
2428 		caddr_t msg,
2429 		int mlen,
2430 		caddr_t to,
2431 		__socklen_t tolen,
2432 		struct sctp_sndrcvinfo *sinfo,
2433 		int flags
2434 	} */ *uap;
2435 {
2436 #ifdef SCTP
2437 	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
2438 	struct socket *so;
2439 	struct file *fp;
2440 	int use_rcvinfo = 1;
2441 	int error = 0, len;
2442 	struct sockaddr *to = NULL;
2443 #ifdef KTRACE
2444 	struct uio *ktruio = NULL;
2445 #endif
2446 	struct uio auio;
2447 	struct iovec iov[1];
2448 
2449 	if (uap->sinfo) {
2450 		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
2451 		if (error)
2452 			return (error);
2453 		u_sinfo = &sinfo;
2454 	}
2455 	if (uap->tolen) {
2456 		error = getsockaddr(&to, uap->to, uap->tolen);
2457 		if (error) {
2458 			to = NULL;
2459 			goto sctp_bad2;
2460 		}
2461 	}
2462 
2463 	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
2464 	if (error)
2465 		goto sctp_bad;
2466 
2467 	iov[0].iov_base = uap->msg;
2468 	iov[0].iov_len = uap->mlen;
2469 
2470 	so = (struct socket *)fp->f_data;
2471 #ifdef MAC
2472 	SOCK_LOCK(so);
2473 	error = mac_check_socket_send(td->td_ucred, so);
2474 	SOCK_UNLOCK(so);
2475 	if (error)
2476 		goto sctp_bad;
2477 #endif /* MAC */
2478 
2479 	auio.uio_iov = iov;
2480 	auio.uio_iovcnt = 1;
2481 	auio.uio_segflg = UIO_USERSPACE;
2482 	auio.uio_rw = UIO_WRITE;
2483 	auio.uio_td = td;
2484 	auio.uio_offset = 0;			/* XXX */
2485 	auio.uio_resid = 0;
2486 	len = auio.uio_resid = uap->mlen;
#ifdef KTRACE
	/* Snapshot the uio for ktrace before the send consumes it. */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif /* KTRACE */
2487 	error = sctp_lower_sosend(so, to, &auio,
2488 		    (struct mbuf *)NULL, (struct mbuf *)NULL,
2489 		    uap->flags, use_rcvinfo, u_sinfo, td);
2490 	if (error) {
2491 		if (auio.uio_resid != len && (error == ERESTART ||
2492 		    error == EINTR || error == EWOULDBLOCK))
2493 			error = 0;
2494 		/* Generation of SIGPIPE can be controlled per socket. */
2495 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
2496 		    !(uap->flags & MSG_NOSIGNAL)) {
2497 			PROC_LOCK(td->td_proc);
2498 			psignal(td->td_proc, SIGPIPE);
2499 			PROC_UNLOCK(td->td_proc);
2500 		}
2501 	}
2502 	if (error == 0)
2503 		td->td_retval[0] = len - auio.uio_resid;
2504 #ifdef KTRACE
2505 	if (ktruio != NULL) {
2506 		ktruio->uio_resid = td->td_retval[0];
2507 		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
2508 	}
2509 #endif /* KTRACE */
2510 sctp_bad:
2511 	fdrop(fp, td);
2512 sctp_bad2:
2513 	if (to)
2514 		free(to, M_SONAME);
2515 	return (error);
2516 #else  /* SCTP */
2517 	return (EOPNOTSUPP);
2518 #endif /* SCTP */
2519 }
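
/*
 * Illustrative userland sketch (not part of this file; "sd", "buf", "to"
 * and the remaining arguments are placeholders, and the libc mapping
 * described here is an assumption): this syscall is the kernel side of the
 * sctp_sendmsg(3) wrapper, which packs the ppid/flags/stream arguments into
 * the sctp_sndrcvinfo passed in via uap->sinfo.
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <netinet/sctp.h>
 *
 *	ssize_t n = sctp_sendmsg(sd, buf, buflen,
 *	    (struct sockaddr *)&to, tolen,
 *	    ppid, 0, stream_no, 0, 0);
 */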
2520 
2521 int
2522 sctp_generic_sendmsg_iov(td, uap)
2523 	struct thread *td;
2524 	struct sctp_generic_sendmsg_iov_args /* {
2525 		int sd,
2526 		struct iovec *iov,
2527 		int iovlen,
2528 		caddr_t to,
2529 		__socklen_t tolen,
2530 		struct sctp_sndrcvinfo *sinfo,
2531 		int flags
2532 	} */ *uap;
2533 {
2534 #ifdef SCTP
2535 	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
2536 	struct socket *so;
2537 	struct file *fp;
2538 	int use_rcvinfo = 1;
2539 	int error = 0, len, i;
2540 	struct sockaddr *to = NULL;
2541 #ifdef KTRACE
2542 	struct uio *ktruio = NULL;
2543 #endif
2544 	struct uio auio;
2545 	struct iovec *iov, *tiov;
2546 
2547 	if (uap->sinfo) {
2548 		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
2549 		if (error)
2550 			return (error);
2551 		u_sinfo = &sinfo;
2552 	}
2553 	if (uap->tolen) {
2554 		error = getsockaddr(&to, uap->to, uap->tolen);
2555 		if (error) {
2556 			to = NULL;
2557 			goto sctp_bad2;
2558 		}
2559 	}
2560 
2561 	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
2562 	if (error)
2563 		goto sctp_bad1;
2564 
2565 	error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
2566 	if (error)
2567 		goto sctp_bad1;
2568 
2569 	so = (struct socket *)fp->f_data;
2570 #ifdef MAC
2571 	SOCK_LOCK(so);
2572 	error = mac_check_socket_send(td->td_ucred, so);
2573 	SOCK_UNLOCK(so);
2574 	if (error)
2575 		goto sctp_bad;
2576 #endif /* MAC */
2577 
2578 	auio.uio_iov = iov;
2579 	auio.uio_iovcnt = uap->iovlen;
2580 	auio.uio_segflg = UIO_USERSPACE;
2581 	auio.uio_rw = UIO_WRITE;
2582 	auio.uio_td = td;
2583 	auio.uio_offset = 0;			/* XXX */
2584 	auio.uio_resid = 0;
2585 	tiov = iov;
2586 	for (i = 0; i < uap->iovlen; i++, tiov++) {
2587 		if ((auio.uio_resid += tiov->iov_len) < 0) {
2588 			error = EINVAL;
2589 			goto sctp_bad;
2590 		}
2591 	}
2592 	len = auio.uio_resid;
#ifdef KTRACE
	/* Snapshot the uio for ktrace before the send consumes it. */
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif /* KTRACE */
2593 	error = sctp_lower_sosend(so, to, &auio,
2594 		    (struct mbuf *)NULL, (struct mbuf *)NULL,
2595 		    uap->flags, use_rcvinfo, u_sinfo, td);
2596 	if (error) {
2597 		if (auio.uio_resid != len && (error == ERESTART ||
2598 		    error == EINTR || error == EWOULDBLOCK))
2599 			error = 0;
2600 		/* Generation of SIGPIPE can be controlled per socket. */
2601 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
2602 		    !(uap->flags & MSG_NOSIGNAL)) {
2603 			PROC_LOCK(td->td_proc);
2604 			psignal(td->td_proc, SIGPIPE);
2605 			PROC_UNLOCK(td->td_proc);
2606 		}
2607 	}
2608 	if (error == 0)
2609 		td->td_retval[0] = len - auio.uio_resid;
2610 #ifdef KTRACE
2611 	if (ktruio != NULL) {
2612 		ktruio->uio_resid = td->td_retval[0];
2613 		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
2614 	}
2615 #endif /* KTRACE */
2616 sctp_bad:
2617 	free(iov, M_IOV);
2618 sctp_bad1:
2619 	fdrop(fp, td);
2620 sctp_bad2:
2621 	if (to)
2622 		free(to, M_SONAME);
2623 	return (error);
2624 #else  /* SCTP */
2625 	return (EOPNOTSUPP);
2626 #endif /* SCTP */
2627 }
2628 
2629 int
2630 sctp_generic_recvmsg(td, uap)
2631 	struct thread *td;
2632 	struct sctp_generic_recvmsg_args /* {
2633 		int sd,
2634 		struct iovec *iov,
2635 		int iovlen,
2636 		struct sockaddr *from,
2637 		__socklen_t *fromlenaddr,
2638 		struct sctp_sndrcvinfo *sinfo,
2639 		int *msg_flags
2640 	} */ *uap;
2641 {
2642 #ifdef SCTP
2643 	u_int8_t sockbufstore[256];
2644 	struct uio auio;
2645 	struct iovec *iov, *tiov;
2646 	struct sctp_sndrcvinfo sinfo;
2647 	struct socket *so;
2648 	struct file *fp;
2649 	struct sockaddr *fromsa;
2650 	int fromlen;
2651 	int len, i, msg_flags = 0;
2652 	int error = 0;
2653 #ifdef KTRACE
2654 	struct uio *ktruio = NULL;
2655 #endif
2656 	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
2657 	if (error) {
2658 		return (error);
2659 	}
2660 	error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
2661 	if (error) {
2662 		goto out1;
2663 	}
2664 
2665 	so = fp->f_data;
2666 #ifdef MAC
2667 	SOCK_LOCK(so);
2668 	error = mac_check_socket_receive(td->td_ucred, so);
2669 	SOCK_UNLOCK(so);
2670 	if (error) {
2671 		goto out;
2673 	}
2674 #endif /* MAC */
2675 
2676 	if (uap->fromlenaddr) {
2677 		error = copyin(uap->fromlenaddr,
2678 		    &fromlen, sizeof (fromlen));
2679 		if (error) {
2680 			goto out;
2681 		}
2682 	} else {
2683 		fromlen = 0;
2684 	}
2685 
2686 	auio.uio_iov = iov;
2687 	auio.uio_iovcnt = uap->iovlen;
2688 	auio.uio_segflg = UIO_USERSPACE;
2689 	auio.uio_rw = UIO_READ;
2690 	auio.uio_td = td;
2691 	auio.uio_offset = 0;			/* XXX */
2692 	auio.uio_resid = 0;
2693 	tiov = iov;
2694 	for (i = 0; i < uap->iovlen; i++, tiov++) {
2695 		if ((auio.uio_resid += tiov->iov_len) < 0) {
2696 			error = EINVAL;
2697 			goto out;
2698 		}
2699 	}
2700 	len = auio.uio_resid;
2701 	fromsa = (struct sockaddr *)sockbufstore;
2702 #ifdef KTRACE
2703 	if (KTRPOINT(td, KTR_GENIO))
2704 		ktruio = cloneuio(&auio);
2705 #endif /* KTRACE */
2706 	error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
2707 		    fromsa, fromlen, &msg_flags,
2708 		    (struct sctp_sndrcvinfo *)&sinfo, 1);
2709 	if (error) {
2710 		if (auio.uio_resid != (int)len && (error == ERESTART ||
2711 		    error == EINTR || error == EWOULDBLOCK))
2712 			error = 0;
2713 	} else {
2714 		if (uap->sinfo)
2715 			error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
2716 	}
2717 #ifdef KTRACE
2718 	if (ktruio != NULL) {
2719 		ktruio->uio_resid = (int)len - auio.uio_resid;
2720 		ktrgenio(uap->sd, UIO_READ, ktruio, error);
2721 	}
2722 #endif /* KTRACE */
2723 	if (error)
2724 		goto out;
2725 	td->td_retval[0] = (int)len - auio.uio_resid;
2726 
2727 	if (fromlen && uap->from) {
2728 		len = fromlen;
2729 		if (len <= 0 || fromsa == NULL)
2730 			len = 0;
2731 		else {
2732 			len = MIN(len, fromsa->sa_len);
2733 			error = copyout(fromsa, uap->from, (unsigned)len);
2734 			if (error)
2735 				goto out;
2736 		}
2737 		error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
2738 		if (error) {
2739 			goto out;
2740 		}
2741 	}
2742 	if (uap->msg_flags) {
2743 		error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
2744 		if (error) {
2745 			goto out;
2746 		}
2747 	}
2748 out:
2749 	free(iov, M_IOV);
2750 out1:
2751 	fdrop(fp, td);
2752 	return (error);
2753 #else  /* SCTP */
2754 	return (EOPNOTSUPP);
2755 #endif /* SCTP */
2756 }
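
/*
 * Illustrative userland sketch (not part of this file; "sd" is a
 * placeholder, and the libc mapping described here is an assumption): this
 * syscall is the kernel side of the sctp_recvmsg(3) wrapper, which returns
 * the sender's address, the per-message sctp_sndrcvinfo and the final
 * msg_flags alongside the data.
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <netinet/sctp.h>
 *
 *	struct sockaddr_storage from;
 *	socklen_t fromlen = sizeof(from);
 *	struct sctp_sndrcvinfo sinfo;
 *	int msg_flags = 0;
 *	char buf[2048];
 *
 *	ssize_t n = sctp_recvmsg(sd, buf, sizeof(buf),
 *	    (struct sockaddr *)&from, &fromlen, &sinfo, &msg_flags);
 */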
2757