xref: /freebsd/sys/kern/uipc_syscalls.c (revision b601c69bdbe8755d26570261d7fd4c02ee4eff74)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * sendfile(2) and related extensions:
6  * Copyright (c) 1998, David Greenman. All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
37  * $FreeBSD$
38  */
39 
40 #include "opt_compat.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/sysproto.h>
47 #include <sys/malloc.h>
48 #include <sys/filedesc.h>
49 #include <sys/event.h>
50 #include <sys/proc.h>
51 #include <sys/fcntl.h>
52 #include <sys/file.h>
53 #include <sys/mbuf.h>
54 #include <sys/protosw.h>
55 #include <sys/socket.h>
56 #include <sys/socketvar.h>
57 #include <sys/signalvar.h>
58 #include <sys/uio.h>
59 #include <sys/vnode.h>
60 #include <sys/lock.h>
61 #include <sys/mount.h>
62 #ifdef KTRACE
63 #include <sys/ktrace.h>
64 #endif
65 #include <vm/vm.h>
66 #include <vm/vm_object.h>
67 #include <vm/vm_page.h>
68 #include <vm/vm_pageout.h>
69 #include <vm/vm_kern.h>
70 #include <vm/vm_extern.h>
71 
72 static void sf_buf_init(void *arg);
73 SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
74 static struct sf_buf *sf_buf_alloc(void);
75 static void sf_buf_ref(caddr_t addr, u_int size);
76 static void sf_buf_free(caddr_t addr, u_int size);
77 
78 static int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags));
79 static int recvit __P((struct proc *p, int s, struct msghdr *mp,
80 		       caddr_t namelenp));
81 
82 static int accept1 __P((struct proc *p, struct accept_args *uap, int compat));
83 static int getsockname1 __P((struct proc *p, struct getsockname_args *uap,
84 			     int compat));
85 static int getpeername1 __P((struct proc *p, struct getpeername_args *uap,
86 			     int compat));
87 
88 static SLIST_HEAD(, sf_buf) sf_freelist;
89 static vm_offset_t sf_base;
90 static struct sf_buf *sf_bufs;
91 static int sf_buf_alloc_want;
92 
93 /*
94  * System call interface to the socket abstraction.
95  */
96 #if defined(COMPAT_43) || defined(COMPAT_SUNOS)
97 #define COMPAT_OLDSOCK
98 #endif
99 
100 extern	struct fileops socketops;
101 
102 int
103 socket(p, uap)
104 	struct proc *p;
105 	register struct socket_args /* {
106 		int	domain;
107 		int	type;
108 		int	protocol;
109 	} */ *uap;
110 {
111 	struct filedesc *fdp = p->p_fd;
112 	struct socket *so;
113 	struct file *fp;
114 	int fd, error;
115 
116 	error = falloc(p, &fp, &fd);
117 	if (error)
118 		return (error);
119 	error = socreate(uap->domain, &so, uap->type, uap->protocol, p);
120 	if (error) {
121 		fdp->fd_ofiles[fd] = 0;
122 		ffree(fp);
123 	} else {
124 		fp->f_data = (caddr_t)so;
125 		fp->f_flag = FREAD|FWRITE;
126 		fp->f_ops = &socketops;
127 		fp->f_type = DTYPE_SOCKET;
128 		p->p_retval[0] = fd;
129 	}
130 	return (error);
131 }
132 
133 /* ARGSUSED */
134 int
135 bind(p, uap)
136 	struct proc *p;
137 	register struct bind_args /* {
138 		int	s;
139 		caddr_t	name;
140 		int	namelen;
141 	} */ *uap;
142 {
143 	struct file *fp;
144 	struct sockaddr *sa;
145 	int error;
146 
147 	error = getsock(p->p_fd, uap->s, &fp);
148 	if (error)
149 		return (error);
150 	error = getsockaddr(&sa, uap->name, uap->namelen);
151 	if (error)
152 		return (error);
153 	error = sobind((struct socket *)fp->f_data, sa, p);
154 	FREE(sa, M_SONAME);
155 	return (error);
156 }
157 
158 /* ARGSUSED */
159 int
160 listen(p, uap)
161 	struct proc *p;
162 	register struct listen_args /* {
163 		int	s;
164 		int	backlog;
165 	} */ *uap;
166 {
167 	struct file *fp;
168 	int error;
169 
170 	error = getsock(p->p_fd, uap->s, &fp);
171 	if (error)
172 		return (error);
173 	return (solisten((struct socket *)fp->f_data, uap->backlog, p));
174 }
175 
176 static int
177 accept1(p, uap, compat)
178 	struct proc *p;
179 	register struct accept_args /* {
180 		int	s;
181 		caddr_t	name;
182 		int	*anamelen;
183 	} */ *uap;
184 	int compat;
185 {
186 	struct filedesc *fdp = p->p_fd;
187 	struct file *fp;
188 	struct sockaddr *sa;
189 	int namelen, error, s;
190 	struct socket *head, *so;
191 	int fd;
192 	short fflag;		/* type must match fp->f_flag */
193 
194 	if (uap->name) {
195 		error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen,
196 			sizeof (namelen));
197 		if(error)
198 			return (error);
199 	}
200 	error = getsock(fdp, uap->s, &fp);
201 	if (error)
202 		return (error);
203 	s = splnet();
204 	head = (struct socket *)fp->f_data;
205 	if ((head->so_options & SO_ACCEPTCONN) == 0) {
206 		splx(s);
207 		return (EINVAL);
208 	}
209 	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
210 		splx(s);
211 		return (EWOULDBLOCK);
212 	}
213 	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
214 		if (head->so_state & SS_CANTRCVMORE) {
215 			head->so_error = ECONNABORTED;
216 			break;
217 		}
218 		error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH,
219 		    "accept", 0);
220 		if (error) {
221 			splx(s);
222 			return (error);
223 		}
224 	}
225 	if (head->so_error) {
226 		error = head->so_error;
227 		head->so_error = 0;
228 		splx(s);
229 		return (error);
230 	}
231 
232 	/*
233 	 * At this point we know that there is at least one connection
234 	 * ready to be accepted. Remove it from the queue prior to
235 	 * allocating the file descriptor for it since falloc() may
236 	 * block allowing another process to accept the connection
237 	 * instead.
238 	 */
239 	so = TAILQ_FIRST(&head->so_comp);
240 	TAILQ_REMOVE(&head->so_comp, so, so_list);
241 	head->so_qlen--;
242 
243 	fflag = fp->f_flag;
244 	error = falloc(p, &fp, &fd);
245 	if (error) {
246 		/*
247 		 * Probably ran out of file descriptors. Put the
248 		 * unaccepted connection back onto the queue and
249 		 * do another wakeup so some other process might
250 		 * have a chance at it.
251 		 */
252 		TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
253 		head->so_qlen++;
254 		wakeup_one(&head->so_timeo);
255 		splx(s);
256 		return (error);
257 	} else
258 		p->p_retval[0] = fd;
259 
260 	/* connection has been removed from the listen queue */
261 	KNOTE(&head->so_rcv.sb_sel.si_note, 0);
262 
263 	so->so_state &= ~SS_COMP;
264 	so->so_head = NULL;
265 	if (head->so_sigio != NULL)
266 		fsetown(fgetown(head->so_sigio), &so->so_sigio);
267 
268 	fp->f_data = (caddr_t)so;
269 	fp->f_flag = fflag;
270 	fp->f_ops = &socketops;
271 	fp->f_type = DTYPE_SOCKET;
272 	sa = 0;
273 	(void) soaccept(so, &sa);
274 	if (sa == 0) {
275 		namelen = 0;
276 		if (uap->name)
277 			goto gotnoname;
278 		splx(s);
279 		return 0;
280 	}
281 	if (uap->name) {
282 		/* check sa_len before it is destroyed */
283 		if (namelen > sa->sa_len)
284 			namelen = sa->sa_len;
285 #ifdef COMPAT_OLDSOCK
286 		if (compat)
287 			((struct osockaddr *)sa)->sa_family =
288 			    sa->sa_family;
289 #endif
290 		error = copyout(sa, (caddr_t)uap->name, (u_int)namelen);
291 		if (!error)
292 gotnoname:
293 			error = copyout((caddr_t)&namelen,
294 			    (caddr_t)uap->anamelen, sizeof (*uap->anamelen));
295 	}
296 	if (sa)
297 		FREE(sa, M_SONAME);
298 	if (error) {
299 		fdp->fd_ofiles[fd] = 0;
300 		ffree(fp);
301 	}
302 	splx(s);
303 	return (error);
304 }
305 
/*
 * accept system call: accept1() with the compatibility flag cleared.
 */
int
accept(p, uap)
	struct proc *p;
	struct accept_args *uap;
{
	int error;

	error = accept1(p, uap, 0);
	return (error);
}
314 
315 #ifdef COMPAT_OLDSOCK
/*
 * Old-style accept system call: accept1() with the compatibility flag set.
 */
int
oaccept(p, uap)
	struct proc *p;
	struct accept_args *uap;
{
	int error;

	error = accept1(p, uap, 1);
	return (error);
}
324 #endif /* COMPAT_OLDSOCK */
325 
326 /* ARGSUSED */
327 int
328 connect(p, uap)
329 	struct proc *p;
330 	register struct connect_args /* {
331 		int	s;
332 		caddr_t	name;
333 		int	namelen;
334 	} */ *uap;
335 {
336 	struct file *fp;
337 	register struct socket *so;
338 	struct sockaddr *sa;
339 	int error, s;
340 
341 	error = getsock(p->p_fd, uap->s, &fp);
342 	if (error)
343 		return (error);
344 	so = (struct socket *)fp->f_data;
345 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING))
346 		return (EALREADY);
347 	error = getsockaddr(&sa, uap->name, uap->namelen);
348 	if (error)
349 		return (error);
350 	error = soconnect(so, sa, p);
351 	if (error)
352 		goto bad;
353 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
354 		FREE(sa, M_SONAME);
355 		return (EINPROGRESS);
356 	}
357 	s = splnet();
358 	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
359 		error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH,
360 		    "connec", 0);
361 		if (error)
362 			break;
363 	}
364 	if (error == 0) {
365 		error = so->so_error;
366 		so->so_error = 0;
367 	}
368 	splx(s);
369 bad:
370 	so->so_state &= ~SS_ISCONNECTING;
371 	FREE(sa, M_SONAME);
372 	if (error == ERESTART)
373 		error = EINTR;
374 	return (error);
375 }
376 
377 int
378 socketpair(p, uap)
379 	struct proc *p;
380 	register struct socketpair_args /* {
381 		int	domain;
382 		int	type;
383 		int	protocol;
384 		int	*rsv;
385 	} */ *uap;
386 {
387 	register struct filedesc *fdp = p->p_fd;
388 	struct file *fp1, *fp2;
389 	struct socket *so1, *so2;
390 	int fd, error, sv[2];
391 
392 	error = socreate(uap->domain, &so1, uap->type, uap->protocol, p);
393 	if (error)
394 		return (error);
395 	error = socreate(uap->domain, &so2, uap->type, uap->protocol, p);
396 	if (error)
397 		goto free1;
398 	error = falloc(p, &fp1, &fd);
399 	if (error)
400 		goto free2;
401 	sv[0] = fd;
402 	fp1->f_data = (caddr_t)so1;
403 	error = falloc(p, &fp2, &fd);
404 	if (error)
405 		goto free3;
406 	fp2->f_data = (caddr_t)so2;
407 	sv[1] = fd;
408 	error = soconnect2(so1, so2);
409 	if (error)
410 		goto free4;
411 	if (uap->type == SOCK_DGRAM) {
412 		/*
413 		 * Datagram socket connection is asymmetric.
414 		 */
415 		 error = soconnect2(so2, so1);
416 		 if (error)
417 			goto free4;
418 	}
419 	fp1->f_flag = fp2->f_flag = FREAD|FWRITE;
420 	fp1->f_ops = fp2->f_ops = &socketops;
421 	fp1->f_type = fp2->f_type = DTYPE_SOCKET;
422 	error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int));
423 	return (error);
424 free4:
425 	fdp->fd_ofiles[sv[1]] = 0;
426 	ffree(fp2);
427 free3:
428 	fdp->fd_ofiles[sv[0]] = 0;
429 	ffree(fp1);
430 free2:
431 	(void)soclose(so2);
432 free1:
433 	(void)soclose(so1);
434 	return (error);
435 }
436 
437 static int
438 sendit(p, s, mp, flags)
439 	register struct proc *p;
440 	int s;
441 	register struct msghdr *mp;
442 	int flags;
443 {
444 	struct file *fp;
445 	struct uio auio;
446 	register struct iovec *iov;
447 	register int i;
448 	struct mbuf *control;
449 	struct sockaddr *to;
450 	int len, error;
451 	struct socket *so;
452 #ifdef KTRACE
453 	struct iovec *ktriov = NULL;
454 	struct uio ktruio;
455 #endif
456 
457 	error = getsock(p->p_fd, s, &fp);
458 	if (error)
459 		return (error);
460 	auio.uio_iov = mp->msg_iov;
461 	auio.uio_iovcnt = mp->msg_iovlen;
462 	auio.uio_segflg = UIO_USERSPACE;
463 	auio.uio_rw = UIO_WRITE;
464 	auio.uio_procp = p;
465 	auio.uio_offset = 0;			/* XXX */
466 	auio.uio_resid = 0;
467 	iov = mp->msg_iov;
468 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
469 		if ((auio.uio_resid += iov->iov_len) < 0)
470 			return (EINVAL);
471 	}
472 	if (mp->msg_name) {
473 		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
474 		if (error)
475 			return (error);
476 	} else
477 		to = 0;
478 	if (mp->msg_control) {
479 		if (mp->msg_controllen < sizeof(struct cmsghdr)
480 #ifdef COMPAT_OLDSOCK
481 		    && mp->msg_flags != MSG_COMPAT
482 #endif
483 		) {
484 			error = EINVAL;
485 			goto bad;
486 		}
487 		error = sockargs(&control, mp->msg_control,
488 		    mp->msg_controllen, MT_CONTROL);
489 		if (error)
490 			goto bad;
491 #ifdef COMPAT_OLDSOCK
492 		if (mp->msg_flags == MSG_COMPAT) {
493 			register struct cmsghdr *cm;
494 
495 			M_PREPEND(control, sizeof(*cm), M_WAIT);
496 			if (control == 0) {
497 				error = ENOBUFS;
498 				goto bad;
499 			} else {
500 				cm = mtod(control, struct cmsghdr *);
501 				cm->cmsg_len = control->m_len;
502 				cm->cmsg_level = SOL_SOCKET;
503 				cm->cmsg_type = SCM_RIGHTS;
504 			}
505 		}
506 #endif
507 	} else
508 		control = 0;
509 #ifdef KTRACE
510 	if (KTRPOINT(p, KTR_GENIO)) {
511 		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
512 
513 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
514 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
515 		ktruio = auio;
516 	}
517 #endif
518 	len = auio.uio_resid;
519 	so = (struct socket *)fp->f_data;
520 	error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control,
521 						     flags, p);
522 	if (error) {
523 		if (auio.uio_resid != len && (error == ERESTART ||
524 		    error == EINTR || error == EWOULDBLOCK))
525 			error = 0;
526 		if (error == EPIPE)
527 			psignal(p, SIGPIPE);
528 	}
529 	if (error == 0)
530 		p->p_retval[0] = len - auio.uio_resid;
531 #ifdef KTRACE
532 	if (ktriov != NULL) {
533 		if (error == 0) {
534 			ktruio.uio_iov = ktriov;
535 			ktruio.uio_resid = p->p_retval[0];
536 			ktrgenio(p->p_tracep, s, UIO_WRITE, &ktruio, error);
537 		}
538 		FREE(ktriov, M_TEMP);
539 	}
540 #endif
541 bad:
542 	if (to)
543 		FREE(to, M_SONAME);
544 	return (error);
545 }
546 
547 int
548 sendto(p, uap)
549 	struct proc *p;
550 	register struct sendto_args /* {
551 		int	s;
552 		caddr_t	buf;
553 		size_t	len;
554 		int	flags;
555 		caddr_t	to;
556 		int	tolen;
557 	} */ *uap;
558 {
559 	struct msghdr msg;
560 	struct iovec aiov;
561 
562 	msg.msg_name = uap->to;
563 	msg.msg_namelen = uap->tolen;
564 	msg.msg_iov = &aiov;
565 	msg.msg_iovlen = 1;
566 	msg.msg_control = 0;
567 #ifdef COMPAT_OLDSOCK
568 	msg.msg_flags = 0;
569 #endif
570 	aiov.iov_base = uap->buf;
571 	aiov.iov_len = uap->len;
572 	return (sendit(p, uap->s, &msg, uap->flags));
573 }
574 
575 #ifdef COMPAT_OLDSOCK
576 int
577 osend(p, uap)
578 	struct proc *p;
579 	register struct osend_args /* {
580 		int	s;
581 		caddr_t	buf;
582 		int	len;
583 		int	flags;
584 	} */ *uap;
585 {
586 	struct msghdr msg;
587 	struct iovec aiov;
588 
589 	msg.msg_name = 0;
590 	msg.msg_namelen = 0;
591 	msg.msg_iov = &aiov;
592 	msg.msg_iovlen = 1;
593 	aiov.iov_base = uap->buf;
594 	aiov.iov_len = uap->len;
595 	msg.msg_control = 0;
596 	msg.msg_flags = 0;
597 	return (sendit(p, uap->s, &msg, uap->flags));
598 }
599 
600 int
601 osendmsg(p, uap)
602 	struct proc *p;
603 	register struct osendmsg_args /* {
604 		int	s;
605 		caddr_t	msg;
606 		int	flags;
607 	} */ *uap;
608 {
609 	struct msghdr msg;
610 	struct iovec aiov[UIO_SMALLIOV], *iov;
611 	int error;
612 
613 	error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr));
614 	if (error)
615 		return (error);
616 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
617 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
618 			return (EMSGSIZE);
619 		MALLOC(iov, struct iovec *,
620 		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
621 		      M_WAITOK);
622 	} else
623 		iov = aiov;
624 	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
625 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
626 	if (error)
627 		goto done;
628 	msg.msg_flags = MSG_COMPAT;
629 	msg.msg_iov = iov;
630 	error = sendit(p, uap->s, &msg, uap->flags);
631 done:
632 	if (iov != aiov)
633 		FREE(iov, M_IOV);
634 	return (error);
635 }
636 #endif
637 
638 int
639 sendmsg(p, uap)
640 	struct proc *p;
641 	register struct sendmsg_args /* {
642 		int	s;
643 		caddr_t	msg;
644 		int	flags;
645 	} */ *uap;
646 {
647 	struct msghdr msg;
648 	struct iovec aiov[UIO_SMALLIOV], *iov;
649 	int error;
650 
651 	error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg));
652 	if (error)
653 		return (error);
654 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
655 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
656 			return (EMSGSIZE);
657 		MALLOC(iov, struct iovec *,
658 		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
659 		       M_WAITOK);
660 	} else
661 		iov = aiov;
662 	if (msg.msg_iovlen &&
663 	    (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
664 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
665 		goto done;
666 	msg.msg_iov = iov;
667 #ifdef COMPAT_OLDSOCK
668 	msg.msg_flags = 0;
669 #endif
670 	error = sendit(p, uap->s, &msg, uap->flags);
671 done:
672 	if (iov != aiov)
673 		FREE(iov, M_IOV);
674 	return (error);
675 }
676 
677 static int
678 recvit(p, s, mp, namelenp)
679 	register struct proc *p;
680 	int s;
681 	register struct msghdr *mp;
682 	caddr_t namelenp;
683 {
684 	struct file *fp;
685 	struct uio auio;
686 	register struct iovec *iov;
687 	register int i;
688 	int len, error;
689 	struct mbuf *m, *control = 0;
690 	caddr_t ctlbuf;
691 	struct socket *so;
692 	struct sockaddr *fromsa = 0;
693 #ifdef KTRACE
694 	struct iovec *ktriov = NULL;
695 	struct uio ktruio;
696 #endif
697 
698 	error = getsock(p->p_fd, s, &fp);
699 	if (error)
700 		return (error);
701 	auio.uio_iov = mp->msg_iov;
702 	auio.uio_iovcnt = mp->msg_iovlen;
703 	auio.uio_segflg = UIO_USERSPACE;
704 	auio.uio_rw = UIO_READ;
705 	auio.uio_procp = p;
706 	auio.uio_offset = 0;			/* XXX */
707 	auio.uio_resid = 0;
708 	iov = mp->msg_iov;
709 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
710 		if ((auio.uio_resid += iov->iov_len) < 0)
711 			return (EINVAL);
712 	}
713 #ifdef KTRACE
714 	if (KTRPOINT(p, KTR_GENIO)) {
715 		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
716 
717 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
718 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
719 		ktruio = auio;
720 	}
721 #endif
722 	len = auio.uio_resid;
723 	so = (struct socket *)fp->f_data;
724 	error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
725 	    (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
726 	    &mp->msg_flags);
727 	if (error) {
728 		if (auio.uio_resid != len && (error == ERESTART ||
729 		    error == EINTR || error == EWOULDBLOCK))
730 			error = 0;
731 	}
732 #ifdef KTRACE
733 	if (ktriov != NULL) {
734 		if (error == 0) {
735 			ktruio.uio_iov = ktriov;
736 			ktruio.uio_resid = len - auio.uio_resid;
737 			ktrgenio(p->p_tracep, s, UIO_READ, &ktruio, error);
738 		}
739 		FREE(ktriov, M_TEMP);
740 	}
741 #endif
742 	if (error)
743 		goto out;
744 	p->p_retval[0] = len - auio.uio_resid;
745 	if (mp->msg_name) {
746 		len = mp->msg_namelen;
747 		if (len <= 0 || fromsa == 0)
748 			len = 0;
749 		else {
750 #ifndef MIN
751 #define MIN(a,b) ((a)>(b)?(b):(a))
752 #endif
753 			/* save sa_len before it is destroyed by MSG_COMPAT */
754 			len = MIN(len, fromsa->sa_len);
755 #ifdef COMPAT_OLDSOCK
756 			if (mp->msg_flags & MSG_COMPAT)
757 				((struct osockaddr *)fromsa)->sa_family =
758 				    fromsa->sa_family;
759 #endif
760 			error = copyout(fromsa,
761 			    (caddr_t)mp->msg_name, (unsigned)len);
762 			if (error)
763 				goto out;
764 		}
765 		mp->msg_namelen = len;
766 		if (namelenp &&
767 		    (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) {
768 #ifdef COMPAT_OLDSOCK
769 			if (mp->msg_flags & MSG_COMPAT)
770 				error = 0;	/* old recvfrom didn't check */
771 			else
772 #endif
773 			goto out;
774 		}
775 	}
776 	if (mp->msg_control) {
777 #ifdef COMPAT_OLDSOCK
778 		/*
779 		 * We assume that old recvmsg calls won't receive access
780 		 * rights and other control info, esp. as control info
781 		 * is always optional and those options didn't exist in 4.3.
782 		 * If we receive rights, trim the cmsghdr; anything else
783 		 * is tossed.
784 		 */
785 		if (control && mp->msg_flags & MSG_COMPAT) {
786 			if (mtod(control, struct cmsghdr *)->cmsg_level !=
787 			    SOL_SOCKET ||
788 			    mtod(control, struct cmsghdr *)->cmsg_type !=
789 			    SCM_RIGHTS) {
790 				mp->msg_controllen = 0;
791 				goto out;
792 			}
793 			control->m_len -= sizeof (struct cmsghdr);
794 			control->m_data += sizeof (struct cmsghdr);
795 		}
796 #endif
797 		len = mp->msg_controllen;
798 		m = control;
799 		mp->msg_controllen = 0;
800 		ctlbuf = (caddr_t) mp->msg_control;
801 
802 		while (m && len > 0) {
803 			unsigned int tocopy;
804 
805 			if (len >= m->m_len)
806 				tocopy = m->m_len;
807 			else {
808 				mp->msg_flags |= MSG_CTRUNC;
809 				tocopy = len;
810 			}
811 
812 			if ((error = copyout((caddr_t)mtod(m, caddr_t),
813 					ctlbuf, tocopy)) != 0)
814 				goto out;
815 
816 			ctlbuf += tocopy;
817 			len -= tocopy;
818 			m = m->m_next;
819 		}
820 		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
821 	}
822 out:
823 	if (fromsa)
824 		FREE(fromsa, M_SONAME);
825 	if (control)
826 		m_freem(control);
827 	return (error);
828 }
829 
830 int
831 recvfrom(p, uap)
832 	struct proc *p;
833 	register struct recvfrom_args /* {
834 		int	s;
835 		caddr_t	buf;
836 		size_t	len;
837 		int	flags;
838 		caddr_t	from;
839 		int	*fromlenaddr;
840 	} */ *uap;
841 {
842 	struct msghdr msg;
843 	struct iovec aiov;
844 	int error;
845 
846 	if (uap->fromlenaddr) {
847 		error = copyin((caddr_t)uap->fromlenaddr,
848 		    (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen));
849 		if (error)
850 			return (error);
851 	} else
852 		msg.msg_namelen = 0;
853 	msg.msg_name = uap->from;
854 	msg.msg_iov = &aiov;
855 	msg.msg_iovlen = 1;
856 	aiov.iov_base = uap->buf;
857 	aiov.iov_len = uap->len;
858 	msg.msg_control = 0;
859 	msg.msg_flags = uap->flags;
860 	return (recvit(p, uap->s, &msg, (caddr_t)uap->fromlenaddr));
861 }
862 
863 #ifdef COMPAT_OLDSOCK
864 int
865 orecvfrom(p, uap)
866 	struct proc *p;
867 	struct recvfrom_args *uap;
868 {
869 
870 	uap->flags |= MSG_COMPAT;
871 	return (recvfrom(p, uap));
872 }
873 #endif
874 
875 
876 #ifdef COMPAT_OLDSOCK
877 int
878 orecv(p, uap)
879 	struct proc *p;
880 	register struct orecv_args /* {
881 		int	s;
882 		caddr_t	buf;
883 		int	len;
884 		int	flags;
885 	} */ *uap;
886 {
887 	struct msghdr msg;
888 	struct iovec aiov;
889 
890 	msg.msg_name = 0;
891 	msg.msg_namelen = 0;
892 	msg.msg_iov = &aiov;
893 	msg.msg_iovlen = 1;
894 	aiov.iov_base = uap->buf;
895 	aiov.iov_len = uap->len;
896 	msg.msg_control = 0;
897 	msg.msg_flags = uap->flags;
898 	return (recvit(p, uap->s, &msg, (caddr_t)0));
899 }
900 
901 /*
902  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
903  * overlays the new one, missing only the flags, and with the (old) access
904  * rights where the control fields are now.
905  */
906 int
907 orecvmsg(p, uap)
908 	struct proc *p;
909 	register struct orecvmsg_args /* {
910 		int	s;
911 		struct	omsghdr *msg;
912 		int	flags;
913 	} */ *uap;
914 {
915 	struct msghdr msg;
916 	struct iovec aiov[UIO_SMALLIOV], *iov;
917 	int error;
918 
919 	error = copyin((caddr_t)uap->msg, (caddr_t)&msg,
920 	    sizeof (struct omsghdr));
921 	if (error)
922 		return (error);
923 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
924 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
925 			return (EMSGSIZE);
926 		MALLOC(iov, struct iovec *,
927 		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
928 		      M_WAITOK);
929 	} else
930 		iov = aiov;
931 	msg.msg_flags = uap->flags | MSG_COMPAT;
932 	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
933 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
934 	if (error)
935 		goto done;
936 	msg.msg_iov = iov;
937 	error = recvit(p, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen);
938 
939 	if (msg.msg_controllen && error == 0)
940 		error = copyout((caddr_t)&msg.msg_controllen,
941 		    (caddr_t)&uap->msg->msg_accrightslen, sizeof (int));
942 done:
943 	if (iov != aiov)
944 		FREE(iov, M_IOV);
945 	return (error);
946 }
947 #endif
948 
949 int
950 recvmsg(p, uap)
951 	struct proc *p;
952 	register struct recvmsg_args /* {
953 		int	s;
954 		struct	msghdr *msg;
955 		int	flags;
956 	} */ *uap;
957 {
958 	struct msghdr msg;
959 	struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
960 	register int error;
961 
962 	error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg));
963 	if (error)
964 		return (error);
965 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
966 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
967 			return (EMSGSIZE);
968 		MALLOC(iov, struct iovec *,
969 		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
970 		       M_WAITOK);
971 	} else
972 		iov = aiov;
973 #ifdef COMPAT_OLDSOCK
974 	msg.msg_flags = uap->flags &~ MSG_COMPAT;
975 #else
976 	msg.msg_flags = uap->flags;
977 #endif
978 	uiov = msg.msg_iov;
979 	msg.msg_iov = iov;
980 	error = copyin((caddr_t)uiov, (caddr_t)iov,
981 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
982 	if (error)
983 		goto done;
984 	error = recvit(p, uap->s, &msg, (caddr_t)0);
985 	if (!error) {
986 		msg.msg_iov = uiov;
987 		error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg));
988 	}
989 done:
990 	if (iov != aiov)
991 		FREE(iov, M_IOV);
992 	return (error);
993 }
994 
995 /* ARGSUSED */
996 int
997 shutdown(p, uap)
998 	struct proc *p;
999 	register struct shutdown_args /* {
1000 		int	s;
1001 		int	how;
1002 	} */ *uap;
1003 {
1004 	struct file *fp;
1005 	int error;
1006 
1007 	error = getsock(p->p_fd, uap->s, &fp);
1008 	if (error)
1009 		return (error);
1010 	return (soshutdown((struct socket *)fp->f_data, uap->how));
1011 }
1012 
1013 /* ARGSUSED */
1014 int
1015 setsockopt(p, uap)
1016 	struct proc *p;
1017 	register struct setsockopt_args /* {
1018 		int	s;
1019 		int	level;
1020 		int	name;
1021 		caddr_t	val;
1022 		int	valsize;
1023 	} */ *uap;
1024 {
1025 	struct file *fp;
1026 	struct sockopt sopt;
1027 	int error;
1028 
1029 	if (uap->val == 0 && uap->valsize != 0)
1030 		return (EFAULT);
1031 	if (uap->valsize < 0)
1032 		return (EINVAL);
1033 
1034 	error = getsock(p->p_fd, uap->s, &fp);
1035 	if (error)
1036 		return (error);
1037 
1038 	sopt.sopt_dir = SOPT_SET;
1039 	sopt.sopt_level = uap->level;
1040 	sopt.sopt_name = uap->name;
1041 	sopt.sopt_val = uap->val;
1042 	sopt.sopt_valsize = uap->valsize;
1043 	sopt.sopt_p = p;
1044 
1045 	return (sosetopt((struct socket *)fp->f_data, &sopt));
1046 }
1047 
1048 /* ARGSUSED */
1049 int
1050 getsockopt(p, uap)
1051 	struct proc *p;
1052 	register struct getsockopt_args /* {
1053 		int	s;
1054 		int	level;
1055 		int	name;
1056 		caddr_t	val;
1057 		int	*avalsize;
1058 	} */ *uap;
1059 {
1060 	int	valsize, error;
1061 	struct	file *fp;
1062 	struct	sockopt sopt;
1063 
1064 	error = getsock(p->p_fd, uap->s, &fp);
1065 	if (error)
1066 		return (error);
1067 	if (uap->val) {
1068 		error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize,
1069 		    sizeof (valsize));
1070 		if (error)
1071 			return (error);
1072 		if (valsize < 0)
1073 			return (EINVAL);
1074 	} else
1075 		valsize = 0;
1076 
1077 	sopt.sopt_dir = SOPT_GET;
1078 	sopt.sopt_level = uap->level;
1079 	sopt.sopt_name = uap->name;
1080 	sopt.sopt_val = uap->val;
1081 	sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
1082 	sopt.sopt_p = p;
1083 
1084 	error = sogetopt((struct socket *)fp->f_data, &sopt);
1085 	if (error == 0) {
1086 		valsize = sopt.sopt_valsize;
1087 		error = copyout((caddr_t)&valsize,
1088 				(caddr_t)uap->avalsize, sizeof (valsize));
1089 	}
1090 	return (error);
1091 }
1092 
1093 /*
1094  * Get socket name.
1095  */
1096 /* ARGSUSED */
1097 static int
1098 getsockname1(p, uap, compat)
1099 	struct proc *p;
1100 	register struct getsockname_args /* {
1101 		int	fdes;
1102 		caddr_t	asa;
1103 		int	*alen;
1104 	} */ *uap;
1105 	int compat;
1106 {
1107 	struct file *fp;
1108 	register struct socket *so;
1109 	struct sockaddr *sa;
1110 	int len, error;
1111 
1112 	error = getsock(p->p_fd, uap->fdes, &fp);
1113 	if (error)
1114 		return (error);
1115 	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1116 	if (error)
1117 		return (error);
1118 	so = (struct socket *)fp->f_data;
1119 	sa = 0;
1120 	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
1121 	if (error)
1122 		goto bad;
1123 	if (sa == 0) {
1124 		len = 0;
1125 		goto gotnothing;
1126 	}
1127 
1128 	len = MIN(len, sa->sa_len);
1129 #ifdef COMPAT_OLDSOCK
1130 	if (compat)
1131 		((struct osockaddr *)sa)->sa_family = sa->sa_family;
1132 #endif
1133 	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1134 	if (error == 0)
1135 gotnothing:
1136 		error = copyout((caddr_t)&len, (caddr_t)uap->alen,
1137 		    sizeof (len));
1138 bad:
1139 	if (sa)
1140 		FREE(sa, M_SONAME);
1141 	return (error);
1142 }
1143 
/*
 * getsockname system call: getsockname1() with the compatibility flag
 * cleared.
 */
int
getsockname(p, uap)
	struct proc *p;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(p, uap, 0);
	return (error);
}
1152 
1153 #ifdef COMPAT_OLDSOCK
/*
 * Old getsockname system call: getsockname1() with the compatibility
 * flag set.
 */
int
ogetsockname(p, uap)
	struct proc *p;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(p, uap, 1);
	return (error);
}
1162 #endif /* COMPAT_OLDSOCK */
1163 
1164 /*
1165  * Get name of peer for connected socket.
1166  */
1167 /* ARGSUSED */
1168 static int
1169 getpeername1(p, uap, compat)
1170 	struct proc *p;
1171 	register struct getpeername_args /* {
1172 		int	fdes;
1173 		caddr_t	asa;
1174 		int	*alen;
1175 	} */ *uap;
1176 	int compat;
1177 {
1178 	struct file *fp;
1179 	register struct socket *so;
1180 	struct sockaddr *sa;
1181 	int len, error;
1182 
1183 	error = getsock(p->p_fd, uap->fdes, &fp);
1184 	if (error)
1185 		return (error);
1186 	so = (struct socket *)fp->f_data;
1187 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0)
1188 		return (ENOTCONN);
1189 	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1190 	if (error)
1191 		return (error);
1192 	sa = 0;
1193 	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
1194 	if (error)
1195 		goto bad;
1196 	if (sa == 0) {
1197 		len = 0;
1198 		goto gotnothing;
1199 	}
1200 	len = MIN(len, sa->sa_len);
1201 #ifdef COMPAT_OLDSOCK
1202 	if (compat)
1203 		((struct osockaddr *)sa)->sa_family =
1204 		    sa->sa_family;
1205 #endif
1206 	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1207 	if (error)
1208 		goto bad;
1209 gotnothing:
1210 	error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len));
1211 bad:
1212 	if (sa) FREE(sa, M_SONAME);
1213 	return (error);
1214 }
1215 
/*
 * getpeername system call: getpeername1() with the compatibility flag
 * cleared.
 */
int
getpeername(p, uap)
	struct proc *p;
	struct getpeername_args *uap;
{
	int error;

	error = getpeername1(p, uap, 0);
	return (error);
}
1224 
1225 #ifdef COMPAT_OLDSOCK
/*
 * Old-socket-ABI variant of getpeername (COMPAT_OLDSOCK only): hands the
 * request to getpeername1() with the compat flag set.
 */
int
ogetpeername(struct proc *p, struct ogetpeername_args *uap)
{

	/* XXX uap should have type `getpeername_args *' to begin with. */
	return (getpeername1(p, (struct getpeername_args *)uap, 1));
}
1235 #endif /* COMPAT_OLDSOCK */
1236 
1237 int
1238 sockargs(mp, buf, buflen, type)
1239 	struct mbuf **mp;
1240 	caddr_t buf;
1241 	int buflen, type;
1242 {
1243 	register struct sockaddr *sa;
1244 	register struct mbuf *m;
1245 	int error;
1246 
1247 	if ((u_int)buflen > MLEN) {
1248 #ifdef COMPAT_OLDSOCK
1249 		if (type == MT_SONAME && (u_int)buflen <= 112)
1250 			buflen = MLEN;		/* unix domain compat. hack */
1251 		else
1252 #endif
1253 		return (EINVAL);
1254 	}
1255 	m = m_get(M_WAIT, type);
1256 	if (m == NULL)
1257 		return (ENOBUFS);
1258 	m->m_len = buflen;
1259 	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1260 	if (error)
1261 		(void) m_free(m);
1262 	else {
1263 		*mp = m;
1264 		if (type == MT_SONAME) {
1265 			sa = mtod(m, struct sockaddr *);
1266 
1267 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1268 			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1269 				sa->sa_family = sa->sa_len;
1270 #endif
1271 			sa->sa_len = buflen;
1272 		}
1273 	}
1274 	return (error);
1275 }
1276 
1277 int
1278 getsockaddr(namp, uaddr, len)
1279 	struct sockaddr **namp;
1280 	caddr_t uaddr;
1281 	size_t len;
1282 {
1283 	struct sockaddr *sa;
1284 	int error;
1285 
1286 	if (len > SOCK_MAXADDRLEN)
1287 		return ENAMETOOLONG;
1288 	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1289 	error = copyin(uaddr, sa, len);
1290 	if (error) {
1291 		FREE(sa, M_SONAME);
1292 	} else {
1293 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1294 		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1295 			sa->sa_family = sa->sa_len;
1296 #endif
1297 		sa->sa_len = len;
1298 		*namp = sa;
1299 	}
1300 	return error;
1301 }
1302 
1303 int
1304 getsock(fdp, fdes, fpp)
1305 	struct filedesc *fdp;
1306 	int fdes;
1307 	struct file **fpp;
1308 {
1309 	register struct file *fp;
1310 
1311 	if ((unsigned)fdes >= fdp->fd_nfiles ||
1312 	    (fp = fdp->fd_ofiles[fdes]) == NULL)
1313 		return (EBADF);
1314 	if (fp->f_type != DTYPE_SOCKET)
1315 		return (ENOTSOCK);
1316 	*fpp = fp;
1317 	return (0);
1318 }
1319 
1320 /*
1321  * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
1322  * XXX - The sf_buf functions are currently private to sendfile(2), so have
1323  * been made static, but may be useful in the future for doing zero-copy in
1324  * other parts of the networking code.
1325  */
1326 static void
1327 sf_buf_init(void *arg)
1328 {
1329 	int i;
1330 
1331 	SLIST_INIT(&sf_freelist);
1332 	sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
1333 	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, M_NOWAIT);
1334 	bzero(sf_bufs, nsfbufs * sizeof(struct sf_buf));
1335 	for (i = 0; i < nsfbufs; i++) {
1336 		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
1337 		SLIST_INSERT_HEAD(&sf_freelist, &sf_bufs[i], free_list);
1338 	}
1339 }
1340 
1341 /*
1342  * Get an sf_buf from the freelist. Will block if none are available.
1343  */
1344 static struct sf_buf *
1345 sf_buf_alloc()
1346 {
1347 	struct sf_buf *sf;
1348 	int s;
1349 
1350 	s = splimp();
1351 	while ((sf = SLIST_FIRST(&sf_freelist)) == NULL) {
1352 		sf_buf_alloc_want = 1;
1353 		tsleep(&sf_freelist, PVM, "sfbufa", 0);
1354 	}
1355 	SLIST_REMOVE_HEAD(&sf_freelist, free_list);
1356 	splx(s);
1357 	sf->refcnt = 1;
1358 	return (sf);
1359 }
1360 
1361 #define dtosf(x)	(&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])
1362 static void
1363 sf_buf_ref(caddr_t addr, u_int size)
1364 {
1365 	struct sf_buf *sf;
1366 
1367 	sf = dtosf(addr);
1368 	if (sf->refcnt == 0)
1369 		panic("sf_buf_ref: referencing a free sf_buf");
1370 	sf->refcnt++;
1371 }
1372 
1373 /*
1374  * Lose a reference to an sf_buf. When none left, detach mapped page
1375  * and release resources back to the system.
1376  *
1377  * Must be called at splimp.
1378  */
1379 static void
1380 sf_buf_free(caddr_t addr, u_int size)
1381 {
1382 	struct sf_buf *sf;
1383 	struct vm_page *m;
1384 	int s;
1385 
1386 	sf = dtosf(addr);
1387 	if (sf->refcnt == 0)
1388 		panic("sf_buf_free: freeing free sf_buf");
1389 	sf->refcnt--;
1390 	if (sf->refcnt == 0) {
1391 		pmap_qremove((vm_offset_t)addr, 1);
1392 		m = sf->m;
1393 		s = splvm();
1394 		vm_page_unwire(m, 0);
1395 		/*
1396 		 * Check for the object going away on us. This can
1397 		 * happen since we don't hold a reference to it.
1398 		 * If so, we're responsible for freeing the page.
1399 		 */
1400 		if (m->wire_count == 0 && m->object == NULL)
1401 			vm_page_free(m);
1402 		splx(s);
1403 		sf->m = NULL;
1404 		SLIST_INSERT_HEAD(&sf_freelist, sf, free_list);
1405 		if (sf_buf_alloc_want) {
1406 			sf_buf_alloc_want = 0;
1407 			wakeup(&sf_freelist);
1408 		}
1409 	}
1410 }
1411 
1412 /*
1413  * sendfile(2).
1414  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
1415  *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
1416  *
1417  * Send a file specified by 'fd' and starting at 'offset' to a socket
1418  * specified by 's'. Send only 'nbytes' of the file or until EOF if
1419  * nbytes == 0. Optionally add a header and/or trailer to the socket
1420  * output. If specified, write the total number of bytes sent into *sbytes.
1421  */
1422 int
1423 sendfile(struct proc *p, struct sendfile_args *uap)
1424 {
1425 	struct file *fp;
1426 	struct filedesc *fdp = p->p_fd;
1427 	struct vnode *vp;
1428 	struct vm_object *obj;
1429 	struct socket *so;
1430 	struct mbuf *m;
1431 	struct sf_buf *sf;
1432 	struct vm_page *pg;
1433 	struct writev_args nuap;
1434 	struct sf_hdtr hdtr;
1435 	off_t off, xfsize, sbytes = 0;
1436 	int error = 0, s;
1437 
1438 	vp = NULL;
1439 	/*
1440 	 * Do argument checking. Must be a regular file in, stream
1441 	 * type and connected socket out, positive offset.
1442 	 */
1443 	fp = getfp(fdp, uap->fd, FREAD);
1444 	if (fp == NULL) {
1445 		error = EBADF;
1446 		goto done;
1447 	}
1448 	if (fp->f_type != DTYPE_VNODE) {
1449 		error = EINVAL;
1450 		goto done;
1451 	}
1452 	vp = (struct vnode *)fp->f_data;
1453 	vref(vp);
1454 	obj = vp->v_object;
1455 	if (vp->v_type != VREG || obj == NULL) {
1456 		error = EINVAL;
1457 		goto done;
1458 	}
1459 	error = getsock(p->p_fd, uap->s, &fp);
1460 	if (error)
1461 		goto done;
1462 	so = (struct socket *)fp->f_data;
1463 	if (so->so_type != SOCK_STREAM) {
1464 		error = EINVAL;
1465 		goto done;
1466 	}
1467 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1468 		error = ENOTCONN;
1469 		goto done;
1470 	}
1471 	if (uap->offset < 0) {
1472 		error = EINVAL;
1473 		goto done;
1474 	}
1475 
1476 	/*
1477 	 * If specified, get the pointer to the sf_hdtr struct for
1478 	 * any headers/trailers.
1479 	 */
1480 	if (uap->hdtr != NULL) {
1481 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1482 		if (error)
1483 			goto done;
1484 		/*
1485 		 * Send any headers. Wimp out and use writev(2).
1486 		 */
1487 		if (hdtr.headers != NULL) {
1488 			nuap.fd = uap->s;
1489 			nuap.iovp = hdtr.headers;
1490 			nuap.iovcnt = hdtr.hdr_cnt;
1491 			error = writev(p, &nuap);
1492 			if (error)
1493 				goto done;
1494 			sbytes += p->p_retval[0];
1495 		}
1496 	}
1497 
1498 	/*
1499 	 * Protect against multiple writers to the socket.
1500 	 */
1501 	(void) sblock(&so->so_snd, M_WAITOK);
1502 
1503 	/*
1504 	 * Loop through the pages in the file, starting with the requested
1505 	 * offset. Get a file page (do I/O if necessary), map the file page
1506 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1507 	 * it on the socket.
1508 	 */
1509 	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
1510 		vm_pindex_t pindex;
1511 		vm_offset_t pgoff;
1512 
1513 		pindex = OFF_TO_IDX(off);
1514 retry_lookup:
1515 		/*
1516 		 * Calculate the amount to transfer. Not to exceed a page,
1517 		 * the EOF, or the passed in nbytes.
1518 		 */
1519 		xfsize = obj->un_pager.vnp.vnp_size - off;
1520 		if (xfsize > PAGE_SIZE)
1521 			xfsize = PAGE_SIZE;
1522 		pgoff = (vm_offset_t)(off & PAGE_MASK);
1523 		if (PAGE_SIZE - pgoff < xfsize)
1524 			xfsize = PAGE_SIZE - pgoff;
1525 		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
1526 			xfsize = uap->nbytes - sbytes;
1527 		if (xfsize <= 0)
1528 			break;
1529 		/*
1530 		 * Optimize the non-blocking case by looking at the socket space
1531 		 * before going to the extra work of constituting the sf_buf.
1532 		 */
1533 		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
1534 			if (so->so_state & SS_CANTSENDMORE)
1535 				error = EPIPE;
1536 			else
1537 				error = EAGAIN;
1538 			sbunlock(&so->so_snd);
1539 			goto done;
1540 		}
1541 		/*
1542 		 * Attempt to look up the page.
1543 		 *
1544 		 *	Allocate if not found
1545 		 *
1546 		 *	Wait and loop if busy.
1547 		 */
1548 		pg = vm_page_lookup(obj, pindex);
1549 
1550 		if (pg == NULL) {
1551 			pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
1552 			if (pg == NULL) {
1553 				VM_WAIT;
1554 				goto retry_lookup;
1555 			}
1556 			vm_page_wakeup(pg);
1557 		} else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) {
1558 			goto retry_lookup;
1559 		}
1560 
1561 		/*
1562 		 * Wire the page so it does not get ripped out from under
1563 		 * us.
1564 		 */
1565 
1566 		vm_page_wire(pg);
1567 
1568 		/*
1569 		 * If page is not valid for what we need, initiate I/O
1570 		 */
1571 
1572 		if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
1573 			struct uio auio;
1574 			struct iovec aiov;
1575 			int bsize;
1576 
1577 			/*
1578 			 * Ensure that our page is still around when the I/O
1579 			 * completes.
1580 			 */
1581 			vm_page_io_start(pg);
1582 
1583 			/*
1584 			 * Get the page from backing store.
1585 			 */
1586 			bsize = vp->v_mount->mnt_stat.f_iosize;
1587 			auio.uio_iov = &aiov;
1588 			auio.uio_iovcnt = 1;
1589 			aiov.iov_base = 0;
1590 			aiov.iov_len = MAXBSIZE;
1591 			auio.uio_resid = MAXBSIZE;
1592 			auio.uio_offset = trunc_page(off);
1593 			auio.uio_segflg = UIO_NOCOPY;
1594 			auio.uio_rw = UIO_READ;
1595 			auio.uio_procp = p;
1596 			vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
1597 			error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16),
1598 			        p->p_ucred);
1599 			VOP_UNLOCK(vp, 0, p);
1600 			vm_page_flag_clear(pg, PG_ZERO);
1601 			vm_page_io_finish(pg);
1602 			if (error) {
1603 				vm_page_unwire(pg, 0);
1604 				/*
1605 				 * See if anyone else might know about this page.
1606 				 * If not and it is not valid, then free it.
1607 				 */
1608 				if (pg->wire_count == 0 && pg->valid == 0 &&
1609 				    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
1610 				    pg->hold_count == 0)
1611 					vm_page_free(pg);
1612 				sbunlock(&so->so_snd);
1613 				goto done;
1614 			}
1615 		}
1616 
1617 		/*
1618 		 * Allocate a kernel virtual page and insert the physical page
1619 		 * into it.
1620 		 */
1621 
1622 		sf = sf_buf_alloc();
1623 		sf->m = pg;
1624 		pmap_qenter(sf->kva, &pg, 1);
1625 		/*
1626 		 * Get an mbuf header and set it up as having external storage.
1627 		 */
1628 		MGETHDR(m, M_WAIT, MT_DATA);
1629 		if (m == NULL) {
1630 			error = ENOBUFS;
1631 			goto done;
1632 		}
1633 		m->m_ext.ext_free = sf_buf_free;
1634 		m->m_ext.ext_ref = sf_buf_ref;
1635 		m->m_ext.ext_buf = (void *)sf->kva;
1636 		m->m_ext.ext_size = PAGE_SIZE;
1637 		m->m_data = (char *) sf->kva + pgoff;
1638 		m->m_flags |= M_EXT;
1639 		m->m_pkthdr.len = m->m_len = xfsize;
1640 		/*
1641 		 * Add the buffer to the socket buffer chain.
1642 		 */
1643 		s = splnet();
1644 retry_space:
1645 		/*
1646 		 * Make sure that the socket is still able to take more data.
1647 		 * CANTSENDMORE being true usually means that the connection
1648 		 * was closed. so_error is true when an error was sensed after
1649 		 * a previous send.
1650 		 * The state is checked after the page mapping and buffer
1651 		 * allocation above since those operations may block and make
1652 		 * any socket checks stale. From this point forward, nothing
1653 		 * blocks before the pru_send (or more accurately, any blocking
1654 		 * results in a loop back to here to re-check).
1655 		 */
1656 		if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
1657 			if (so->so_state & SS_CANTSENDMORE) {
1658 				error = EPIPE;
1659 			} else {
1660 				error = so->so_error;
1661 				so->so_error = 0;
1662 			}
1663 			m_freem(m);
1664 			sbunlock(&so->so_snd);
1665 			splx(s);
1666 			goto done;
1667 		}
1668 		/*
1669 		 * Wait for socket space to become available. We do this just
1670 		 * after checking the connection state above in order to avoid
1671 		 * a race condition with sbwait().
1672 		 */
1673 		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
1674 			if (so->so_state & SS_NBIO) {
1675 				m_freem(m);
1676 				sbunlock(&so->so_snd);
1677 				splx(s);
1678 				error = EAGAIN;
1679 				goto done;
1680 			}
1681 			error = sbwait(&so->so_snd);
1682 			/*
1683 			 * An error from sbwait usually indicates that we've
1684 			 * been interrupted by a signal. If we've sent anything
1685 			 * then return bytes sent, otherwise return the error.
1686 			 */
1687 			if (error) {
1688 				m_freem(m);
1689 				sbunlock(&so->so_snd);
1690 				splx(s);
1691 				goto done;
1692 			}
1693 			goto retry_space;
1694 		}
1695 		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p);
1696 		splx(s);
1697 		if (error) {
1698 			sbunlock(&so->so_snd);
1699 			goto done;
1700 		}
1701 	}
1702 	sbunlock(&so->so_snd);
1703 
1704 	/*
1705 	 * Send trailers. Wimp out and use writev(2).
1706 	 */
1707 	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
1708 			nuap.fd = uap->s;
1709 			nuap.iovp = hdtr.trailers;
1710 			nuap.iovcnt = hdtr.trl_cnt;
1711 			error = writev(p, &nuap);
1712 			if (error)
1713 				goto done;
1714 			sbytes += p->p_retval[0];
1715 	}
1716 
1717 done:
1718 	if (uap->sbytes != NULL) {
1719 		copyout(&sbytes, uap->sbytes, sizeof(off_t));
1720 	}
1721 	if (vp)
1722 		vrele(vp);
1723 	return (error);
1724 }
1725