xref: /freebsd/sys/kern/uipc_syscalls.c (revision a1a4f1a0d87b594d3f17a97dc0127eec1417e6f6)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * sendfile(2) and related extensions:
6  * Copyright (c) 1998, David Greenman. All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
37  * $FreeBSD$
38  */
39 
40 #include "opt_compat.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/sysproto.h>
47 #include <sys/malloc.h>
48 #include <sys/filedesc.h>
49 #include <sys/proc.h>
50 #include <sys/fcntl.h>
51 #include <sys/file.h>
52 #include <sys/mbuf.h>
53 #include <sys/protosw.h>
54 #include <sys/socket.h>
55 #include <sys/socketvar.h>
56 #include <sys/signalvar.h>
57 #include <sys/uio.h>
58 #include <sys/vnode.h>
59 #include <sys/lock.h>
60 #include <sys/mount.h>
61 #ifdef KTRACE
62 #include <sys/ktrace.h>
63 #endif
64 #include <vm/vm.h>
65 #include <vm/vm_prot.h>
66 #include <vm/vm_object.h>
67 #include <vm/vm_page.h>
68 #include <vm/vm_pager.h>
69 #include <vm/vm_pageout.h>
70 #include <vm/vm_kern.h>
71 #include <vm/vm_extern.h>
72 #include <machine/limits.h>
73 
74 static void sf_buf_init(void *arg);
75 SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
76 static struct sf_buf *sf_buf_alloc(void);
77 static void sf_buf_ref(caddr_t addr, u_int size);
78 static void sf_buf_free(caddr_t addr, u_int size);
79 
80 static int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags));
81 static int recvit __P((struct proc *p, int s, struct msghdr *mp,
82 		       caddr_t namelenp));
83 
84 static int accept1 __P((struct proc *p, struct accept_args *uap, int compat));
85 static int getsockname1 __P((struct proc *p, struct getsockname_args *uap,
86 			     int compat));
87 static int getpeername1 __P((struct proc *p, struct getpeername_args *uap,
88 			     int compat));
89 
90 static SLIST_HEAD(, sf_buf) sf_freelist;
91 static vm_offset_t sf_base;
92 static struct sf_buf *sf_bufs;
93 static int sf_buf_alloc_want;
94 
95 /*
96  * System call interface to the socket abstraction.
97  */
98 #if defined(COMPAT_43) || defined(COMPAT_SUNOS)
99 #define COMPAT_OLDSOCK
100 #endif
101 
102 extern	struct fileops socketops;
103 
104 int
105 socket(p, uap)
106 	struct proc *p;
107 	register struct socket_args /* {
108 		int	domain;
109 		int	type;
110 		int	protocol;
111 	} */ *uap;
112 {
113 	struct filedesc *fdp = p->p_fd;
114 	struct socket *so;
115 	struct file *fp;
116 	int fd, error;
117 
118 	error = falloc(p, &fp, &fd);
119 	if (error)
120 		return (error);
121 	error = socreate(uap->domain, &so, uap->type, uap->protocol, p);
122 	if (error) {
123 		fdp->fd_ofiles[fd] = 0;
124 		ffree(fp);
125 	} else {
126 		fp->f_data = (caddr_t)so;
127 		fp->f_flag = FREAD|FWRITE;
128 		fp->f_ops = &socketops;
129 		fp->f_type = DTYPE_SOCKET;
130 		p->p_retval[0] = fd;
131 	}
132 	return (error);
133 }
134 
135 /* ARGSUSED */
136 int
137 bind(p, uap)
138 	struct proc *p;
139 	register struct bind_args /* {
140 		int	s;
141 		caddr_t	name;
142 		int	namelen;
143 	} */ *uap;
144 {
145 	struct file *fp;
146 	struct sockaddr *sa;
147 	int error;
148 
149 	error = getsock(p->p_fd, uap->s, &fp);
150 	if (error)
151 		return (error);
152 	error = getsockaddr(&sa, uap->name, uap->namelen);
153 	if (error)
154 		return (error);
155 	error = sobind((struct socket *)fp->f_data, sa, p);
156 	FREE(sa, M_SONAME);
157 	return (error);
158 }
159 
160 /* ARGSUSED */
161 int
162 listen(p, uap)
163 	struct proc *p;
164 	register struct listen_args /* {
165 		int	s;
166 		int	backlog;
167 	} */ *uap;
168 {
169 	struct file *fp;
170 	int error;
171 
172 	error = getsock(p->p_fd, uap->s, &fp);
173 	if (error)
174 		return (error);
175 	return (solisten((struct socket *)fp->f_data, uap->backlog, p));
176 }
177 
178 static int
179 accept1(p, uap, compat)
180 	struct proc *p;
181 	register struct accept_args /* {
182 		int	s;
183 		caddr_t	name;
184 		int	*anamelen;
185 	} */ *uap;
186 	int compat;
187 {
188 	struct file *fp;
189 	struct sockaddr *sa;
190 	int namelen, error, s;
191 	struct socket *head, *so;
192 	int fd;
193 	short fflag;		/* type must match fp->f_flag */
194 
195 	if (uap->name) {
196 		error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen,
197 			sizeof (namelen));
198 		if(error)
199 			return (error);
200 	}
201 	error = getsock(p->p_fd, uap->s, &fp);
202 	if (error)
203 		return (error);
204 	s = splnet();
205 	head = (struct socket *)fp->f_data;
206 	if ((head->so_options & SO_ACCEPTCONN) == 0) {
207 		splx(s);
208 		return (EINVAL);
209 	}
210 	if ((head->so_state & SS_NBIO) && head->so_comp.tqh_first == NULL) {
211 		splx(s);
212 		return (EWOULDBLOCK);
213 	}
214 	while (head->so_comp.tqh_first == NULL && head->so_error == 0) {
215 		if (head->so_state & SS_CANTRCVMORE) {
216 			head->so_error = ECONNABORTED;
217 			break;
218 		}
219 		error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH,
220 		    "accept", 0);
221 		if (error) {
222 			splx(s);
223 			return (error);
224 		}
225 	}
226 	if (head->so_error) {
227 		error = head->so_error;
228 		head->so_error = 0;
229 		splx(s);
230 		return (error);
231 	}
232 
233 	/*
234 	 * At this point we know that there is at least one connection
235 	 * ready to be accepted. Remove it from the queue prior to
236 	 * allocating the file descriptor for it since falloc() may
237 	 * block allowing another process to accept the connection
238 	 * instead.
239 	 */
240 	so = head->so_comp.tqh_first;
241 	TAILQ_REMOVE(&head->so_comp, so, so_list);
242 	head->so_qlen--;
243 
244 	fflag = fp->f_flag;
245 	error = falloc(p, &fp, &fd);
246 	if (error) {
247 		/*
248 		 * Probably ran out of file descriptors. Put the
249 		 * unaccepted connection back onto the queue and
250 		 * do another wakeup so some other process might
251 		 * have a chance at it.
252 		 */
253 		TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
254 		head->so_qlen++;
255 		wakeup_one(&head->so_timeo);
256 		splx(s);
257 		return (error);
258 	} else
259 		p->p_retval[0] = fd;
260 
261 	so->so_state &= ~SS_COMP;
262 	so->so_head = NULL;
263 	if (head->so_sigio != NULL)
264 		fsetown(fgetown(head->so_sigio), &so->so_sigio);
265 
266 	fp->f_data = (caddr_t)so;
267 	fp->f_flag = fflag;
268 	fp->f_ops = &socketops;
269 	fp->f_type = DTYPE_SOCKET;
270 	sa = 0;
271 	(void) soaccept(so, &sa);
272 	if (sa == 0) {
273 		namelen = 0;
274 		if (uap->name)
275 			goto gotnoname;
276 		return 0;
277 	}
278 	if (uap->name) {
279 		/* check sa_len before it is destroyed */
280 		if (namelen > sa->sa_len)
281 			namelen = sa->sa_len;
282 #ifdef COMPAT_OLDSOCK
283 		if (compat)
284 			((struct osockaddr *)sa)->sa_family =
285 			    sa->sa_family;
286 #endif
287 		error = copyout(sa, (caddr_t)uap->name, (u_int)namelen);
288 		if (!error)
289 gotnoname:
290 			error = copyout((caddr_t)&namelen,
291 			    (caddr_t)uap->anamelen, sizeof (*uap->anamelen));
292 	}
293 	if (sa)
294 		FREE(sa, M_SONAME);
295 	splx(s);
296 	return (error);
297 }
298 
/*
 * accept(2): accept1() with the compatibility rewrite of the returned
 * address disabled.
 */
int
accept(p, uap)
	struct proc *p;
	struct accept_args *uap;
{
	int error;

	error = accept1(p, uap, 0);
	return (error);
}
307 
308 #ifdef COMPAT_OLDSOCK
/*
 * Old-socket accept: accept1() with compat set, so the returned address
 * is rewritten into the struct osockaddr layout.
 */
int
oaccept(p, uap)
	struct proc *p;
	struct accept_args *uap;
{
	int error;

	error = accept1(p, uap, 1);
	return (error);
}
317 #endif /* COMPAT_OLDSOCK */
318 
319 /* ARGSUSED */
320 int
321 connect(p, uap)
322 	struct proc *p;
323 	register struct connect_args /* {
324 		int	s;
325 		caddr_t	name;
326 		int	namelen;
327 	} */ *uap;
328 {
329 	struct file *fp;
330 	register struct socket *so;
331 	struct sockaddr *sa;
332 	int error, s;
333 
334 	error = getsock(p->p_fd, uap->s, &fp);
335 	if (error)
336 		return (error);
337 	so = (struct socket *)fp->f_data;
338 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING))
339 		return (EALREADY);
340 	error = getsockaddr(&sa, uap->name, uap->namelen);
341 	if (error)
342 		return (error);
343 	error = soconnect(so, sa, p);
344 	if (error)
345 		goto bad;
346 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
347 		FREE(sa, M_SONAME);
348 		return (EINPROGRESS);
349 	}
350 	s = splnet();
351 	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
352 		error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH,
353 		    "connec", 0);
354 		if (error)
355 			break;
356 	}
357 	if (error == 0) {
358 		error = so->so_error;
359 		so->so_error = 0;
360 	}
361 	splx(s);
362 bad:
363 	so->so_state &= ~SS_ISCONNECTING;
364 	FREE(sa, M_SONAME);
365 	if (error == ERESTART)
366 		error = EINTR;
367 	return (error);
368 }
369 
370 int
371 socketpair(p, uap)
372 	struct proc *p;
373 	register struct socketpair_args /* {
374 		int	domain;
375 		int	type;
376 		int	protocol;
377 		int	*rsv;
378 	} */ *uap;
379 {
380 	register struct filedesc *fdp = p->p_fd;
381 	struct file *fp1, *fp2;
382 	struct socket *so1, *so2;
383 	int fd, error, sv[2];
384 
385 	error = socreate(uap->domain, &so1, uap->type, uap->protocol, p);
386 	if (error)
387 		return (error);
388 	error = socreate(uap->domain, &so2, uap->type, uap->protocol, p);
389 	if (error)
390 		goto free1;
391 	error = falloc(p, &fp1, &fd);
392 	if (error)
393 		goto free2;
394 	sv[0] = fd;
395 	fp1->f_data = (caddr_t)so1;
396 	error = falloc(p, &fp2, &fd);
397 	if (error)
398 		goto free3;
399 	fp2->f_data = (caddr_t)so2;
400 	sv[1] = fd;
401 	error = soconnect2(so1, so2);
402 	if (error)
403 		goto free4;
404 	if (uap->type == SOCK_DGRAM) {
405 		/*
406 		 * Datagram socket connection is asymmetric.
407 		 */
408 		 error = soconnect2(so2, so1);
409 		 if (error)
410 			goto free4;
411 	}
412 	fp1->f_flag = fp2->f_flag = FREAD|FWRITE;
413 	fp1->f_ops = fp2->f_ops = &socketops;
414 	fp1->f_type = fp2->f_type = DTYPE_SOCKET;
415 	error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int));
416 	return (error);
417 free4:
418 	fdp->fd_ofiles[sv[1]] = 0;
419 	ffree(fp2);
420 free3:
421 	fdp->fd_ofiles[sv[0]] = 0;
422 	ffree(fp1);
423 free2:
424 	(void)soclose(so2);
425 free1:
426 	(void)soclose(so1);
427 	return (error);
428 }
429 
430 static int
431 sendit(p, s, mp, flags)
432 	register struct proc *p;
433 	int s;
434 	register struct msghdr *mp;
435 	int flags;
436 {
437 	struct file *fp;
438 	struct uio auio;
439 	register struct iovec *iov;
440 	register int i;
441 	struct mbuf *control;
442 	struct sockaddr *to;
443 	int len, error;
444 	struct socket *so;
445 #ifdef KTRACE
446 	struct iovec *ktriov = NULL;
447 #endif
448 
449 	error = getsock(p->p_fd, s, &fp);
450 	if (error)
451 		return (error);
452 	auio.uio_iov = mp->msg_iov;
453 	auio.uio_iovcnt = mp->msg_iovlen;
454 	auio.uio_segflg = UIO_USERSPACE;
455 	auio.uio_rw = UIO_WRITE;
456 	auio.uio_procp = p;
457 	auio.uio_offset = 0;			/* XXX */
458 	auio.uio_resid = 0;
459 	iov = mp->msg_iov;
460 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
461 		if ((auio.uio_resid += iov->iov_len) < 0)
462 			return (EINVAL);
463 	}
464 	if (mp->msg_name) {
465 		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
466 		if (error)
467 			return (error);
468 	} else
469 		to = 0;
470 	if (mp->msg_control) {
471 		if (mp->msg_controllen < sizeof(struct cmsghdr)
472 #ifdef COMPAT_OLDSOCK
473 		    && mp->msg_flags != MSG_COMPAT
474 #endif
475 		) {
476 			error = EINVAL;
477 			goto bad;
478 		}
479 		error = sockargs(&control, mp->msg_control,
480 		    mp->msg_controllen, MT_CONTROL);
481 		if (error)
482 			goto bad;
483 #ifdef COMPAT_OLDSOCK
484 		if (mp->msg_flags == MSG_COMPAT) {
485 			register struct cmsghdr *cm;
486 
487 			M_PREPEND(control, sizeof(*cm), M_WAIT);
488 			if (control == 0) {
489 				error = ENOBUFS;
490 				goto bad;
491 			} else {
492 				cm = mtod(control, struct cmsghdr *);
493 				cm->cmsg_len = control->m_len;
494 				cm->cmsg_level = SOL_SOCKET;
495 				cm->cmsg_type = SCM_RIGHTS;
496 			}
497 		}
498 #endif
499 	} else
500 		control = 0;
501 #ifdef KTRACE
502 	if (KTRPOINT(p, KTR_GENIO)) {
503 		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
504 
505 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
506 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
507 	}
508 #endif
509 	len = auio.uio_resid;
510 	so = (struct socket *)fp->f_data;
511 	error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control,
512 						     flags, p);
513 	if (error) {
514 		if (auio.uio_resid != len && (error == ERESTART ||
515 		    error == EINTR || error == EWOULDBLOCK))
516 			error = 0;
517 		if (error == EPIPE)
518 			psignal(p, SIGPIPE);
519 	}
520 	if (error == 0)
521 		p->p_retval[0] = len - auio.uio_resid;
522 #ifdef KTRACE
523 	if (ktriov != NULL) {
524 		if (error == 0)
525 			ktrgenio(p->p_tracep, s, UIO_WRITE,
526 				ktriov, p->p_retval[0], error);
527 		FREE(ktriov, M_TEMP);
528 	}
529 #endif
530 bad:
531 	if (to)
532 		FREE(to, M_SONAME);
533 	return (error);
534 }
535 
536 int
537 sendto(p, uap)
538 	struct proc *p;
539 	register struct sendto_args /* {
540 		int	s;
541 		caddr_t	buf;
542 		size_t	len;
543 		int	flags;
544 		caddr_t	to;
545 		int	tolen;
546 	} */ *uap;
547 {
548 	struct msghdr msg;
549 	struct iovec aiov;
550 
551 	msg.msg_name = uap->to;
552 	msg.msg_namelen = uap->tolen;
553 	msg.msg_iov = &aiov;
554 	msg.msg_iovlen = 1;
555 	msg.msg_control = 0;
556 #ifdef COMPAT_OLDSOCK
557 	msg.msg_flags = 0;
558 #endif
559 	aiov.iov_base = uap->buf;
560 	aiov.iov_len = uap->len;
561 	return (sendit(p, uap->s, &msg, uap->flags));
562 }
563 
564 #ifdef COMPAT_OLDSOCK
565 int
566 osend(p, uap)
567 	struct proc *p;
568 	register struct osend_args /* {
569 		int	s;
570 		caddr_t	buf;
571 		int	len;
572 		int	flags;
573 	} */ *uap;
574 {
575 	struct msghdr msg;
576 	struct iovec aiov;
577 
578 	msg.msg_name = 0;
579 	msg.msg_namelen = 0;
580 	msg.msg_iov = &aiov;
581 	msg.msg_iovlen = 1;
582 	aiov.iov_base = uap->buf;
583 	aiov.iov_len = uap->len;
584 	msg.msg_control = 0;
585 	msg.msg_flags = 0;
586 	return (sendit(p, uap->s, &msg, uap->flags));
587 }
588 
589 int
590 osendmsg(p, uap)
591 	struct proc *p;
592 	register struct osendmsg_args /* {
593 		int	s;
594 		caddr_t	msg;
595 		int	flags;
596 	} */ *uap;
597 {
598 	struct msghdr msg;
599 	struct iovec aiov[UIO_SMALLIOV], *iov;
600 	int error;
601 
602 	error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr));
603 	if (error)
604 		return (error);
605 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
606 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
607 			return (EMSGSIZE);
608 		MALLOC(iov, struct iovec *,
609 		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
610 		      M_WAITOK);
611 	} else
612 		iov = aiov;
613 	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
614 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
615 	if (error)
616 		goto done;
617 	msg.msg_flags = MSG_COMPAT;
618 	msg.msg_iov = iov;
619 	error = sendit(p, uap->s, &msg, uap->flags);
620 done:
621 	if (iov != aiov)
622 		FREE(iov, M_IOV);
623 	return (error);
624 }
625 #endif
626 
627 int
628 sendmsg(p, uap)
629 	struct proc *p;
630 	register struct sendmsg_args /* {
631 		int	s;
632 		caddr_t	msg;
633 		int	flags;
634 	} */ *uap;
635 {
636 	struct msghdr msg;
637 	struct iovec aiov[UIO_SMALLIOV], *iov;
638 	int error;
639 
640 	error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg));
641 	if (error)
642 		return (error);
643 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
644 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
645 			return (EMSGSIZE);
646 		MALLOC(iov, struct iovec *,
647 		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
648 		       M_WAITOK);
649 	} else
650 		iov = aiov;
651 	if (msg.msg_iovlen &&
652 	    (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
653 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
654 		goto done;
655 	msg.msg_iov = iov;
656 #ifdef COMPAT_OLDSOCK
657 	msg.msg_flags = 0;
658 #endif
659 	error = sendit(p, uap->s, &msg, uap->flags);
660 done:
661 	if (iov != aiov)
662 		FREE(iov, M_IOV);
663 	return (error);
664 }
665 
666 static int
667 recvit(p, s, mp, namelenp)
668 	register struct proc *p;
669 	int s;
670 	register struct msghdr *mp;
671 	caddr_t namelenp;
672 {
673 	struct file *fp;
674 	struct uio auio;
675 	register struct iovec *iov;
676 	register int i;
677 	int len, error;
678 	struct mbuf *m, *control = 0;
679 	caddr_t ctlbuf;
680 	struct socket *so;
681 	struct sockaddr *fromsa = 0;
682 #ifdef KTRACE
683 	struct iovec *ktriov = NULL;
684 #endif
685 
686 	error = getsock(p->p_fd, s, &fp);
687 	if (error)
688 		return (error);
689 	auio.uio_iov = mp->msg_iov;
690 	auio.uio_iovcnt = mp->msg_iovlen;
691 	auio.uio_segflg = UIO_USERSPACE;
692 	auio.uio_rw = UIO_READ;
693 	auio.uio_procp = p;
694 	auio.uio_offset = 0;			/* XXX */
695 	auio.uio_resid = 0;
696 	iov = mp->msg_iov;
697 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
698 		if ((auio.uio_resid += iov->iov_len) < 0)
699 			return (EINVAL);
700 	}
701 #ifdef KTRACE
702 	if (KTRPOINT(p, KTR_GENIO)) {
703 		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
704 
705 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
706 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
707 	}
708 #endif
709 	len = auio.uio_resid;
710 	so = (struct socket *)fp->f_data;
711 	error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
712 	    (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
713 	    &mp->msg_flags);
714 	if (error) {
715 		if (auio.uio_resid != len && (error == ERESTART ||
716 		    error == EINTR || error == EWOULDBLOCK))
717 			error = 0;
718 	}
719 #ifdef KTRACE
720 	if (ktriov != NULL) {
721 		if (error == 0)
722 			ktrgenio(p->p_tracep, s, UIO_READ,
723 				ktriov, len - auio.uio_resid, error);
724 		FREE(ktriov, M_TEMP);
725 	}
726 #endif
727 	if (error)
728 		goto out;
729 	p->p_retval[0] = len - auio.uio_resid;
730 	if (mp->msg_name) {
731 		len = mp->msg_namelen;
732 		if (len <= 0 || fromsa == 0)
733 			len = 0;
734 		else {
735 #ifndef MIN
736 #define MIN(a,b) ((a)>(b)?(b):(a))
737 #endif
738 			/* save sa_len before it is destroyed by MSG_COMPAT */
739 			len = MIN(len, fromsa->sa_len);
740 #ifdef COMPAT_OLDSOCK
741 			if (mp->msg_flags & MSG_COMPAT)
742 				((struct osockaddr *)fromsa)->sa_family =
743 				    fromsa->sa_family;
744 #endif
745 			error = copyout(fromsa,
746 			    (caddr_t)mp->msg_name, (unsigned)len);
747 			if (error)
748 				goto out;
749 		}
750 		mp->msg_namelen = len;
751 		if (namelenp &&
752 		    (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) {
753 #ifdef COMPAT_OLDSOCK
754 			if (mp->msg_flags & MSG_COMPAT)
755 				error = 0;	/* old recvfrom didn't check */
756 			else
757 #endif
758 			goto out;
759 		}
760 	}
761 	if (mp->msg_control) {
762 #ifdef COMPAT_OLDSOCK
763 		/*
764 		 * We assume that old recvmsg calls won't receive access
765 		 * rights and other control info, esp. as control info
766 		 * is always optional and those options didn't exist in 4.3.
767 		 * If we receive rights, trim the cmsghdr; anything else
768 		 * is tossed.
769 		 */
770 		if (control && mp->msg_flags & MSG_COMPAT) {
771 			if (mtod(control, struct cmsghdr *)->cmsg_level !=
772 			    SOL_SOCKET ||
773 			    mtod(control, struct cmsghdr *)->cmsg_type !=
774 			    SCM_RIGHTS) {
775 				mp->msg_controllen = 0;
776 				goto out;
777 			}
778 			control->m_len -= sizeof (struct cmsghdr);
779 			control->m_data += sizeof (struct cmsghdr);
780 		}
781 #endif
782 		len = mp->msg_controllen;
783 		m = control;
784 		mp->msg_controllen = 0;
785 		ctlbuf = (caddr_t) mp->msg_control;
786 
787 		while (m && len > 0) {
788 			unsigned int tocopy;
789 
790 			if (len >= m->m_len)
791 				tocopy = m->m_len;
792 			else {
793 				mp->msg_flags |= MSG_CTRUNC;
794 				tocopy = len;
795 			}
796 
797 			if ((error = copyout((caddr_t)mtod(m, caddr_t),
798 					ctlbuf, tocopy)) != 0)
799 				goto out;
800 
801 			ctlbuf += tocopy;
802 			len -= tocopy;
803 			m = m->m_next;
804 		}
805 		mp->msg_controllen = ctlbuf - mp->msg_control;
806 	}
807 out:
808 	if (fromsa)
809 		FREE(fromsa, M_SONAME);
810 	if (control)
811 		m_freem(control);
812 	return (error);
813 }
814 
815 int
816 recvfrom(p, uap)
817 	struct proc *p;
818 	register struct recvfrom_args /* {
819 		int	s;
820 		caddr_t	buf;
821 		size_t	len;
822 		int	flags;
823 		caddr_t	from;
824 		int	*fromlenaddr;
825 	} */ *uap;
826 {
827 	struct msghdr msg;
828 	struct iovec aiov;
829 	int error;
830 
831 	if (uap->fromlenaddr) {
832 		error = copyin((caddr_t)uap->fromlenaddr,
833 		    (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen));
834 		if (error)
835 			return (error);
836 	} else
837 		msg.msg_namelen = 0;
838 	msg.msg_name = uap->from;
839 	msg.msg_iov = &aiov;
840 	msg.msg_iovlen = 1;
841 	aiov.iov_base = uap->buf;
842 	aiov.iov_len = uap->len;
843 	msg.msg_control = 0;
844 	msg.msg_flags = uap->flags;
845 	return (recvit(p, uap->s, &msg, (caddr_t)uap->fromlenaddr));
846 }
847 
848 #ifdef COMPAT_OLDSOCK
849 int
850 orecvfrom(p, uap)
851 	struct proc *p;
852 	struct recvfrom_args *uap;
853 {
854 
855 	uap->flags |= MSG_COMPAT;
856 	return (recvfrom(p, uap));
857 }
858 #endif
859 
860 
861 #ifdef COMPAT_OLDSOCK
862 int
863 orecv(p, uap)
864 	struct proc *p;
865 	register struct orecv_args /* {
866 		int	s;
867 		caddr_t	buf;
868 		int	len;
869 		int	flags;
870 	} */ *uap;
871 {
872 	struct msghdr msg;
873 	struct iovec aiov;
874 
875 	msg.msg_name = 0;
876 	msg.msg_namelen = 0;
877 	msg.msg_iov = &aiov;
878 	msg.msg_iovlen = 1;
879 	aiov.iov_base = uap->buf;
880 	aiov.iov_len = uap->len;
881 	msg.msg_control = 0;
882 	msg.msg_flags = uap->flags;
883 	return (recvit(p, uap->s, &msg, (caddr_t)0));
884 }
885 
886 /*
887  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
888  * overlays the new one, missing only the flags, and with the (old) access
889  * rights where the control fields are now.
890  */
891 int
892 orecvmsg(p, uap)
893 	struct proc *p;
894 	register struct orecvmsg_args /* {
895 		int	s;
896 		struct	omsghdr *msg;
897 		int	flags;
898 	} */ *uap;
899 {
900 	struct msghdr msg;
901 	struct iovec aiov[UIO_SMALLIOV], *iov;
902 	int error;
903 
904 	error = copyin((caddr_t)uap->msg, (caddr_t)&msg,
905 	    sizeof (struct omsghdr));
906 	if (error)
907 		return (error);
908 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
909 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
910 			return (EMSGSIZE);
911 		MALLOC(iov, struct iovec *,
912 		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
913 		      M_WAITOK);
914 	} else
915 		iov = aiov;
916 	msg.msg_flags = uap->flags | MSG_COMPAT;
917 	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
918 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
919 	if (error)
920 		goto done;
921 	msg.msg_iov = iov;
922 	error = recvit(p, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen);
923 
924 	if (msg.msg_controllen && error == 0)
925 		error = copyout((caddr_t)&msg.msg_controllen,
926 		    (caddr_t)&uap->msg->msg_accrightslen, sizeof (int));
927 done:
928 	if (iov != aiov)
929 		FREE(iov, M_IOV);
930 	return (error);
931 }
932 #endif
933 
934 int
935 recvmsg(p, uap)
936 	struct proc *p;
937 	register struct recvmsg_args /* {
938 		int	s;
939 		struct	msghdr *msg;
940 		int	flags;
941 	} */ *uap;
942 {
943 	struct msghdr msg;
944 	struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
945 	register int error;
946 
947 	error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg));
948 	if (error)
949 		return (error);
950 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
951 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
952 			return (EMSGSIZE);
953 		MALLOC(iov, struct iovec *,
954 		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
955 		       M_WAITOK);
956 	} else
957 		iov = aiov;
958 #ifdef COMPAT_OLDSOCK
959 	msg.msg_flags = uap->flags &~ MSG_COMPAT;
960 #else
961 	msg.msg_flags = uap->flags;
962 #endif
963 	uiov = msg.msg_iov;
964 	msg.msg_iov = iov;
965 	error = copyin((caddr_t)uiov, (caddr_t)iov,
966 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
967 	if (error)
968 		goto done;
969 	error = recvit(p, uap->s, &msg, (caddr_t)0);
970 	if (!error) {
971 		msg.msg_iov = uiov;
972 		error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg));
973 	}
974 done:
975 	if (iov != aiov)
976 		FREE(iov, M_IOV);
977 	return (error);
978 }
979 
980 /* ARGSUSED */
981 int
982 shutdown(p, uap)
983 	struct proc *p;
984 	register struct shutdown_args /* {
985 		int	s;
986 		int	how;
987 	} */ *uap;
988 {
989 	struct file *fp;
990 	int error;
991 
992 	error = getsock(p->p_fd, uap->s, &fp);
993 	if (error)
994 		return (error);
995 	return (soshutdown((struct socket *)fp->f_data, uap->how));
996 }
997 
998 /* ARGSUSED */
999 int
1000 setsockopt(p, uap)
1001 	struct proc *p;
1002 	register struct setsockopt_args /* {
1003 		int	s;
1004 		int	level;
1005 		int	name;
1006 		caddr_t	val;
1007 		int	valsize;
1008 	} */ *uap;
1009 {
1010 	struct file *fp;
1011 	struct sockopt sopt;
1012 	int error;
1013 
1014 	if (uap->val == 0 && uap->valsize != 0)
1015 		return (EFAULT);
1016 	if (uap->valsize < 0)
1017 		return (EINVAL);
1018 
1019 	error = getsock(p->p_fd, uap->s, &fp);
1020 	if (error)
1021 		return (error);
1022 
1023 	sopt.sopt_dir = SOPT_SET;
1024 	sopt.sopt_level = uap->level;
1025 	sopt.sopt_name = uap->name;
1026 	sopt.sopt_val = uap->val;
1027 	sopt.sopt_valsize = uap->valsize;
1028 	sopt.sopt_p = p;
1029 
1030 	return (sosetopt((struct socket *)fp->f_data, &sopt));
1031 }
1032 
1033 /* ARGSUSED */
1034 int
1035 getsockopt(p, uap)
1036 	struct proc *p;
1037 	register struct getsockopt_args /* {
1038 		int	s;
1039 		int	level;
1040 		int	name;
1041 		caddr_t	val;
1042 		int	*avalsize;
1043 	} */ *uap;
1044 {
1045 	int	valsize, error;
1046 	struct	file *fp;
1047 	struct	sockopt sopt;
1048 
1049 	error = getsock(p->p_fd, uap->s, &fp);
1050 	if (error)
1051 		return (error);
1052 	if (uap->val) {
1053 		error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize,
1054 		    sizeof (valsize));
1055 		if (error)
1056 			return (error);
1057 		if (valsize < 0)
1058 			return (EINVAL);
1059 	} else
1060 		valsize = 0;
1061 
1062 	sopt.sopt_dir = SOPT_GET;
1063 	sopt.sopt_level = uap->level;
1064 	sopt.sopt_name = uap->name;
1065 	sopt.sopt_val = uap->val;
1066 	sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
1067 	sopt.sopt_p = p;
1068 
1069 	error = sogetopt((struct socket *)fp->f_data, &sopt);
1070 	if (error == 0) {
1071 		valsize = sopt.sopt_valsize;
1072 		error = copyout((caddr_t)&valsize,
1073 				(caddr_t)uap->avalsize, sizeof (valsize));
1074 	}
1075 	return (error);
1076 }
1077 
1078 /*
1079  * Get socket name.
1080  */
1081 /* ARGSUSED */
1082 static int
1083 getsockname1(p, uap, compat)
1084 	struct proc *p;
1085 	register struct getsockname_args /* {
1086 		int	fdes;
1087 		caddr_t	asa;
1088 		int	*alen;
1089 	} */ *uap;
1090 	int compat;
1091 {
1092 	struct file *fp;
1093 	register struct socket *so;
1094 	struct sockaddr *sa;
1095 	int len, error;
1096 
1097 	error = getsock(p->p_fd, uap->fdes, &fp);
1098 	if (error)
1099 		return (error);
1100 	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1101 	if (error)
1102 		return (error);
1103 	so = (struct socket *)fp->f_data;
1104 	sa = 0;
1105 	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
1106 	if (error)
1107 		goto bad;
1108 	if (sa == 0) {
1109 		len = 0;
1110 		goto gotnothing;
1111 	}
1112 
1113 	len = MIN(len, sa->sa_len);
1114 #ifdef COMPAT_OLDSOCK
1115 	if (compat)
1116 		((struct osockaddr *)sa)->sa_family = sa->sa_family;
1117 #endif
1118 	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1119 	if (error == 0)
1120 gotnothing:
1121 		error = copyout((caddr_t)&len, (caddr_t)uap->alen,
1122 		    sizeof (len));
1123 bad:
1124 	if (sa)
1125 		FREE(sa, M_SONAME);
1126 	return (error);
1127 }
1128 
/*
 * getsockname(2): getsockname1() with the compatibility rewrite of the
 * returned address disabled.
 */
int
getsockname(p, uap)
	struct proc *p;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(p, uap, 0);
	return (error);
}
1137 
1138 #ifdef COMPAT_OLDSOCK
/*
 * Old getsockname: getsockname1() with compat set, so the returned
 * address is rewritten into the struct osockaddr layout.
 */
int
ogetsockname(p, uap)
	struct proc *p;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(p, uap, 1);
	return (error);
}
1147 #endif /* COMPAT_OLDSOCK */
1148 
1149 /*
1150  * Get name of peer for connected socket.
1151  */
1152 /* ARGSUSED */
1153 static int
1154 getpeername1(p, uap, compat)
1155 	struct proc *p;
1156 	register struct getpeername_args /* {
1157 		int	fdes;
1158 		caddr_t	asa;
1159 		int	*alen;
1160 	} */ *uap;
1161 	int compat;
1162 {
1163 	struct file *fp;
1164 	register struct socket *so;
1165 	struct sockaddr *sa;
1166 	int len, error;
1167 
1168 	error = getsock(p->p_fd, uap->fdes, &fp);
1169 	if (error)
1170 		return (error);
1171 	so = (struct socket *)fp->f_data;
1172 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0)
1173 		return (ENOTCONN);
1174 	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1175 	if (error)
1176 		return (error);
1177 	sa = 0;
1178 	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
1179 	if (error)
1180 		goto bad;
1181 	if (sa == 0) {
1182 		len = 0;
1183 		goto gotnothing;
1184 	}
1185 	len = MIN(len, sa->sa_len);
1186 #ifdef COMPAT_OLDSOCK
1187 	if (compat)
1188 		((struct osockaddr *)sa)->sa_family =
1189 		    sa->sa_family;
1190 #endif
1191 	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1192 	if (error)
1193 		goto bad;
1194 gotnothing:
1195 	error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len));
1196 bad:
1197 	if (sa) FREE(sa, M_SONAME);
1198 	return (error);
1199 }
1200 
/*
 * getpeername(2): getpeername1() with the compatibility rewrite of the
 * returned address disabled.
 */
int
getpeername(p, uap)
	struct proc *p;
	struct getpeername_args *uap;
{
	int error;

	error = getpeername1(p, uap, 0);
	return (error);
}
1209 
1210 #ifdef COMPAT_OLDSOCK
/*
 * Old getpeername: getpeername1() with compat set, so the returned
 * address is rewritten into the struct osockaddr layout.
 */
int
ogetpeername(p, uap)
	struct proc *p;
	struct ogetpeername_args *uap;
{
	struct getpeername_args *args;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	args = (struct getpeername_args *)uap;
	return (getpeername1(p, args, 1));
}
1220 #endif /* COMPAT_OLDSOCK */
1221 
1222 int
1223 sockargs(mp, buf, buflen, type)
1224 	struct mbuf **mp;
1225 	caddr_t buf;
1226 	int buflen, type;
1227 {
1228 	register struct sockaddr *sa;
1229 	register struct mbuf *m;
1230 	int error;
1231 
1232 	if ((u_int)buflen > MLEN) {
1233 #ifdef COMPAT_OLDSOCK
1234 		if (type == MT_SONAME && (u_int)buflen <= 112)
1235 			buflen = MLEN;		/* unix domain compat. hack */
1236 		else
1237 #endif
1238 		return (EINVAL);
1239 	}
1240 	m = m_get(M_WAIT, type);
1241 	if (m == NULL)
1242 		return (ENOBUFS);
1243 	m->m_len = buflen;
1244 	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1245 	if (error)
1246 		(void) m_free(m);
1247 	else {
1248 		*mp = m;
1249 		if (type == MT_SONAME) {
1250 			sa = mtod(m, struct sockaddr *);
1251 
1252 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1253 			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1254 				sa->sa_family = sa->sa_len;
1255 #endif
1256 			sa->sa_len = buflen;
1257 		}
1258 	}
1259 	return (error);
1260 }
1261 
1262 int
1263 getsockaddr(namp, uaddr, len)
1264 	struct sockaddr **namp;
1265 	caddr_t uaddr;
1266 	size_t len;
1267 {
1268 	struct sockaddr *sa;
1269 	int error;
1270 
1271 	if (len > SOCK_MAXADDRLEN)
1272 		return ENAMETOOLONG;
1273 	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1274 	error = copyin(uaddr, sa, len);
1275 	if (error) {
1276 		FREE(sa, M_SONAME);
1277 	} else {
1278 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1279 		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1280 			sa->sa_family = sa->sa_len;
1281 #endif
1282 		sa->sa_len = len;
1283 		*namp = sa;
1284 	}
1285 	return error;
1286 }
1287 
1288 int
1289 getsock(fdp, fdes, fpp)
1290 	struct filedesc *fdp;
1291 	int fdes;
1292 	struct file **fpp;
1293 {
1294 	register struct file *fp;
1295 
1296 	if ((unsigned)fdes >= fdp->fd_nfiles ||
1297 	    (fp = fdp->fd_ofiles[fdes]) == NULL)
1298 		return (EBADF);
1299 	if (fp->f_type != DTYPE_SOCKET)
1300 		return (ENOTSOCK);
1301 	*fpp = fp;
1302 	return (0);
1303 }
1304 
1305 /*
1306  * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
1307  * XXX - The sf_buf functions are currently private to sendfile(2), so have
1308  * been made static, but may be useful in the future for doing zero-copy in
1309  * other parts of the networking code.
1310  */
1311 static void
1312 sf_buf_init(void *arg)
1313 {
1314 	int i;
1315 
1316 	SLIST_INIT(&sf_freelist);
1317 	sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
1318 	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, M_NOWAIT);
1319 	bzero(sf_bufs, nsfbufs * sizeof(struct sf_buf));
1320 	for (i = 0; i < nsfbufs; i++) {
1321 		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
1322 		SLIST_INSERT_HEAD(&sf_freelist, &sf_bufs[i], free_list);
1323 	}
1324 }
1325 
1326 /*
1327  * Get an sf_buf from the freelist. Will block if none are available.
1328  */
1329 static struct sf_buf *
1330 sf_buf_alloc()
1331 {
1332 	struct sf_buf *sf;
1333 	int s;
1334 
1335 	s = splimp();
1336 	while ((sf = SLIST_FIRST(&sf_freelist)) == NULL) {
1337 		sf_buf_alloc_want = 1;
1338 		tsleep(&sf_freelist, PVM, "sfbufa", 0);
1339 	}
1340 	SLIST_REMOVE_HEAD(&sf_freelist, free_list);
1341 	splx(s);
1342 	sf->refcnt = 1;
1343 	return (sf);
1344 }
1345 
1346 #define dtosf(x)	(&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])
1347 static void
1348 sf_buf_ref(caddr_t addr, u_int size)
1349 {
1350 	struct sf_buf *sf;
1351 
1352 	sf = dtosf(addr);
1353 	if (sf->refcnt == 0)
1354 		panic("sf_buf_ref: referencing a free sf_buf");
1355 	sf->refcnt++;
1356 }
1357 
1358 /*
1359  * Lose a reference to an sf_buf. When none left, detach mapped page
1360  * and release resources back to the system.
1361  *
1362  * Must be called at splimp.
1363  */
1364 static void
1365 sf_buf_free(caddr_t addr, u_int size)
1366 {
1367 	struct sf_buf *sf;
1368 	struct vm_page *m;
1369 	int s;
1370 
1371 	sf = dtosf(addr);
1372 	if (sf->refcnt == 0)
1373 		panic("sf_buf_free: freeing free sf_buf");
1374 	sf->refcnt--;
1375 	if (sf->refcnt == 0) {
1376 		pmap_qremove((vm_offset_t)addr, 1);
1377 		m = sf->m;
1378 		s = splvm();
1379 		vm_page_unwire(m, 0);
1380 		/*
1381 		 * Check for the object going away on us. This can
1382 		 * happen since we don't hold a reference to it.
1383 		 * If so, we're responsible for freeing the page.
1384 		 */
1385 		if (m->wire_count == 0 && m->object == NULL)
1386 			vm_page_free(m);
1387 		splx(s);
1388 		sf->m = NULL;
1389 		SLIST_INSERT_HEAD(&sf_freelist, sf, free_list);
1390 		if (sf_buf_alloc_want) {
1391 			sf_buf_alloc_want = 0;
1392 			wakeup(&sf_freelist);
1393 		}
1394 	}
1395 }
1396 
1397 /*
1398  * sendfile(2).
1399  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
1400  *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
1401  *
1402  * Send a file specified by 'fd' and starting at 'offset' to a socket
1403  * specified by 's'. Send only 'nbytes' of the file or until EOF if
1404  * nbytes == 0. Optionally add a header and/or trailer to the socket
1405  * output. If specified, write the total number of bytes sent into *sbytes.
1406  */
1407 int
1408 sendfile(struct proc *p, struct sendfile_args *uap)
1409 {
1410 	struct file *fp;
1411 	struct filedesc *fdp = p->p_fd;
1412 	struct vnode *vp;
1413 	struct vm_object *obj;
1414 	struct socket *so;
1415 	struct mbuf *m;
1416 	struct sf_buf *sf;
1417 	struct vm_page *pg;
1418 	struct writev_args nuap;
1419 	struct sf_hdtr hdtr;
1420 	off_t off, xfsize, sbytes = 0;
1421 	int error = 0, s;
1422 
1423 	/*
1424 	 * Do argument checking. Must be a regular file in, stream
1425 	 * type and connected socket out, positive offset.
1426 	 */
1427 	if (((u_int)uap->fd) >= fdp->fd_nfiles ||
1428 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
1429 	    (fp->f_flag & FREAD) == 0) {
1430 		error = EBADF;
1431 		goto done;
1432 	}
1433 	if (fp->f_type != DTYPE_VNODE) {
1434 		error = EINVAL;
1435 		goto done;
1436 	}
1437 	vp = (struct vnode *)fp->f_data;
1438 	obj = vp->v_object;
1439 	if (vp->v_type != VREG || obj == NULL) {
1440 		error = EINVAL;
1441 		goto done;
1442 	}
1443 	error = getsock(p->p_fd, uap->s, &fp);
1444 	if (error)
1445 		goto done;
1446 	so = (struct socket *)fp->f_data;
1447 	if (so->so_type != SOCK_STREAM) {
1448 		error = EINVAL;
1449 		goto done;
1450 	}
1451 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1452 		error = ENOTCONN;
1453 		goto done;
1454 	}
1455 	if (uap->offset < 0) {
1456 		error = EINVAL;
1457 		goto done;
1458 	}
1459 
1460 	/*
1461 	 * If specified, get the pointer to the sf_hdtr struct for
1462 	 * any headers/trailers.
1463 	 */
1464 	if (uap->hdtr != NULL) {
1465 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1466 		if (error)
1467 			goto done;
1468 		/*
1469 		 * Send any headers. Wimp out and use writev(2).
1470 		 */
1471 		if (hdtr.headers != NULL) {
1472 			nuap.fd = uap->s;
1473 			nuap.iovp = hdtr.headers;
1474 			nuap.iovcnt = hdtr.hdr_cnt;
1475 			error = writev(p, &nuap);
1476 			if (error)
1477 				goto done;
1478 			sbytes += p->p_retval[0];
1479 		}
1480 	}
1481 
1482 	/*
1483 	 * Protect against multiple writers to the socket.
1484 	 */
1485 	(void) sblock(&so->so_snd, M_WAITOK);
1486 
1487 	/*
1488 	 * Loop through the pages in the file, starting with the requested
1489 	 * offset. Get a file page (do I/O if necessary), map the file page
1490 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1491 	 * it on the socket.
1492 	 */
1493 	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
1494 		vm_pindex_t pindex;
1495 		vm_offset_t pgoff;
1496 
1497 		pindex = OFF_TO_IDX(off);
1498 retry_lookup:
1499 		/*
1500 		 * Calculate the amount to transfer. Not to exceed a page,
1501 		 * the EOF, or the passed in nbytes.
1502 		 */
1503 		xfsize = obj->un_pager.vnp.vnp_size - off;
1504 		if (xfsize > PAGE_SIZE)
1505 			xfsize = PAGE_SIZE;
1506 		pgoff = (vm_offset_t)(off & PAGE_MASK);
1507 		if (PAGE_SIZE - pgoff < xfsize)
1508 			xfsize = PAGE_SIZE - pgoff;
1509 		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
1510 			xfsize = uap->nbytes - sbytes;
1511 		if (xfsize <= 0)
1512 			break;
1513 		/*
1514 		 * Optimize the non-blocking case by looking at the socket space
1515 		 * before going to the extra work of constituting the sf_buf.
1516 		 */
1517 		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
1518 			if (so->so_state & SS_CANTSENDMORE)
1519 				error = EPIPE;
1520 			else
1521 				error = EAGAIN;
1522 			sbunlock(&so->so_snd);
1523 			goto done;
1524 		}
1525 		/*
1526 		 * Attempt to look up the page.
1527 		 *
1528 		 *	Allocate if not found
1529 		 *
1530 		 *	Wait and loop if busy.
1531 		 */
1532 		pg = vm_page_lookup(obj, pindex);
1533 
1534 		if (pg == NULL) {
1535 			pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
1536 			if (pg == NULL) {
1537 				VM_WAIT;
1538 				goto retry_lookup;
1539 			}
1540 			vm_page_wakeup(pg);
1541 		} else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) {
1542 			goto retry_lookup;
1543 		}
1544 
1545 		/*
1546 		 * Wire the page so it does not get ripped out from under
1547 		 * us.
1548 		 */
1549 
1550 		vm_page_wire(pg);
1551 
1552 		/*
1553 		 * If page is not valid for what we need, initiate I/O
1554 		 */
1555 
1556 		if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
1557 			struct uio auio;
1558 			struct iovec aiov;
1559 			int bsize;
1560 
1561 			/*
1562 			 * Ensure that our page is still around when the I/O
1563 			 * completes.
1564 			 */
1565 			vm_page_io_start(pg);
1566 
1567 			/*
1568 			 * Get the page from backing store.
1569 			 */
1570 			bsize = vp->v_mount->mnt_stat.f_iosize;
1571 			auio.uio_iov = &aiov;
1572 			auio.uio_iovcnt = 1;
1573 			aiov.iov_base = 0;
1574 			aiov.iov_len = MAXBSIZE;
1575 			auio.uio_resid = MAXBSIZE;
1576 			auio.uio_offset = trunc_page(off);
1577 			auio.uio_segflg = UIO_NOCOPY;
1578 			auio.uio_rw = UIO_READ;
1579 			auio.uio_procp = p;
1580 			vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
1581 			error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16),
1582 			        p->p_ucred);
1583 			VOP_UNLOCK(vp, 0, p);
1584 			vm_page_flag_clear(pg, PG_ZERO);
1585 			vm_page_io_finish(pg);
1586 			if (error) {
1587 				vm_page_unwire(pg, 0);
1588 				/*
1589 				 * See if anyone else might know about this page.
1590 				 * If not and it is not valid, then free it.
1591 				 */
1592 				if (pg->wire_count == 0 && pg->valid == 0 &&
1593 				    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
1594 				    pg->hold_count == 0)
1595 					vm_page_free(pg);
1596 				sbunlock(&so->so_snd);
1597 				goto done;
1598 			}
1599 		}
1600 
1601 		/*
1602 		 * Allocate a kernel virtual page and insert the physical page
1603 		 * into it.
1604 		 */
1605 
1606 		sf = sf_buf_alloc();
1607 		sf->m = pg;
1608 		pmap_qenter(sf->kva, &pg, 1);
1609 		/*
1610 		 * Get an mbuf header and set it up as having external storage.
1611 		 */
1612 		MGETHDR(m, M_WAIT, MT_DATA);
1613 		m->m_ext.ext_free = sf_buf_free;
1614 		m->m_ext.ext_ref = sf_buf_ref;
1615 		m->m_ext.ext_buf = (void *)sf->kva;
1616 		m->m_ext.ext_size = PAGE_SIZE;
1617 		m->m_data = (char *) sf->kva + pgoff;
1618 		m->m_flags |= M_EXT;
1619 		m->m_pkthdr.len = m->m_len = xfsize;
1620 		/*
1621 		 * Add the buffer to the socket buffer chain.
1622 		 */
1623 		s = splnet();
1624 retry_space:
1625 		/*
1626 		 * Make sure that the socket is still able to take more data.
1627 		 * CANTSENDMORE being true usually means that the connection
1628 		 * was closed. so_error is true when an error was sensed after
1629 		 * a previous send.
1630 		 * The state is checked after the page mapping and buffer
1631 		 * allocation above since those operations may block and make
1632 		 * any socket checks stale. From this point forward, nothing
1633 		 * blocks before the pru_send (or more accurately, any blocking
1634 		 * results in a loop back to here to re-check).
1635 		 */
1636 		if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
1637 			if (so->so_state & SS_CANTSENDMORE) {
1638 				error = EPIPE;
1639 			} else {
1640 				error = so->so_error;
1641 				so->so_error = 0;
1642 			}
1643 			m_freem(m);
1644 			sbunlock(&so->so_snd);
1645 			splx(s);
1646 			goto done;
1647 		}
1648 		/*
1649 		 * Wait for socket space to become available. We do this just
1650 		 * after checking the connection state above in order to avoid
1651 		 * a race condition with sbwait().
1652 		 */
1653 		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
1654 			if (so->so_state & SS_NBIO) {
1655 				m_freem(m);
1656 				sbunlock(&so->so_snd);
1657 				splx(s);
1658 				error = EAGAIN;
1659 				goto done;
1660 			}
1661 			error = sbwait(&so->so_snd);
1662 			/*
1663 			 * An error from sbwait usually indicates that we've
1664 			 * been interrupted by a signal. If we've sent anything
1665 			 * then return bytes sent, otherwise return the error.
1666 			 */
1667 			if (error) {
1668 				m_freem(m);
1669 				sbunlock(&so->so_snd);
1670 				splx(s);
1671 				goto done;
1672 			}
1673 			goto retry_space;
1674 		}
1675 		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p);
1676 		splx(s);
1677 		if (error) {
1678 			sbunlock(&so->so_snd);
1679 			goto done;
1680 		}
1681 	}
1682 	sbunlock(&so->so_snd);
1683 
1684 	/*
1685 	 * Send trailers. Wimp out and use writev(2).
1686 	 */
1687 	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
1688 			nuap.fd = uap->s;
1689 			nuap.iovp = hdtr.trailers;
1690 			nuap.iovcnt = hdtr.trl_cnt;
1691 			error = writev(p, &nuap);
1692 			if (error)
1693 				goto done;
1694 			sbytes += p->p_retval[0];
1695 	}
1696 
1697 done:
1698 	if (uap->sbytes != NULL) {
1699 		copyout(&sbytes, uap->sbytes, sizeof(off_t));
1700 	}
1701 	return (error);
1702 }
1703