xref: /freebsd/sys/kern/uipc_syscalls.c (revision 23f282aa31e9b6fceacd449020e936e98d6f2298)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * sendfile(2) and related extensions:
6  * Copyright (c) 1998, David Greenman. All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
37  * $FreeBSD$
38  */
39 
40 #include "opt_compat.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/sysproto.h>
47 #include <sys/malloc.h>
48 #include <sys/filedesc.h>
49 #include <sys/event.h>
50 #include <sys/proc.h>
51 #include <sys/fcntl.h>
52 #include <sys/file.h>
53 #include <sys/mbuf.h>
54 #include <sys/protosw.h>
55 #include <sys/socket.h>
56 #include <sys/socketvar.h>
57 #include <sys/signalvar.h>
58 #include <sys/uio.h>
59 #include <sys/vnode.h>
60 #include <sys/lock.h>
61 #include <sys/mount.h>
62 #ifdef KTRACE
63 #include <sys/ktrace.h>
64 #endif
65 #include <vm/vm.h>
66 #include <vm/vm_object.h>
67 #include <vm/vm_page.h>
68 #include <vm/vm_pageout.h>
69 #include <vm/vm_kern.h>
70 #include <vm/vm_extern.h>
71 
72 static void sf_buf_init(void *arg);
73 SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
74 static struct sf_buf *sf_buf_alloc(void);
75 static void sf_buf_ref(caddr_t addr, u_int size);
76 static void sf_buf_free(caddr_t addr, u_int size);
77 
78 static int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags));
79 static int recvit __P((struct proc *p, int s, struct msghdr *mp,
80 		       caddr_t namelenp));
81 
82 static int accept1 __P((struct proc *p, struct accept_args *uap, int compat));
83 static int getsockname1 __P((struct proc *p, struct getsockname_args *uap,
84 			     int compat));
85 static int getpeername1 __P((struct proc *p, struct getpeername_args *uap,
86 			     int compat));
87 
88 static SLIST_HEAD(, sf_buf) sf_freelist;
89 static vm_offset_t sf_base;
90 static struct sf_buf *sf_bufs;
91 static int sf_buf_alloc_want;
92 
93 /*
94  * System call interface to the socket abstraction.
95  */
96 #if defined(COMPAT_43) || defined(COMPAT_SUNOS)
97 #define COMPAT_OLDSOCK
98 #endif
99 
100 extern	struct fileops socketops;
101 
102 int
103 socket(p, uap)
104 	struct proc *p;
105 	register struct socket_args /* {
106 		int	domain;
107 		int	type;
108 		int	protocol;
109 	} */ *uap;
110 {
111 	struct filedesc *fdp = p->p_fd;
112 	struct socket *so;
113 	struct file *fp;
114 	int fd, error;
115 
116 	error = falloc(p, &fp, &fd);
117 	if (error)
118 		return (error);
119 	error = socreate(uap->domain, &so, uap->type, uap->protocol, p);
120 	if (error) {
121 		fdp->fd_ofiles[fd] = 0;
122 		ffree(fp);
123 	} else {
124 		fp->f_data = (caddr_t)so;
125 		fp->f_flag = FREAD|FWRITE;
126 		fp->f_ops = &socketops;
127 		fp->f_type = DTYPE_SOCKET;
128 		p->p_retval[0] = fd;
129 	}
130 	return (error);
131 }
132 
133 /* ARGSUSED */
134 int
135 bind(p, uap)
136 	struct proc *p;
137 	register struct bind_args /* {
138 		int	s;
139 		caddr_t	name;
140 		int	namelen;
141 	} */ *uap;
142 {
143 	struct file *fp;
144 	struct sockaddr *sa;
145 	int error;
146 
147 	error = getsock(p->p_fd, uap->s, &fp);
148 	if (error)
149 		return (error);
150 	error = getsockaddr(&sa, uap->name, uap->namelen);
151 	if (error)
152 		return (error);
153 	error = sobind((struct socket *)fp->f_data, sa, p);
154 	FREE(sa, M_SONAME);
155 	return (error);
156 }
157 
158 /* ARGSUSED */
159 int
160 listen(p, uap)
161 	struct proc *p;
162 	register struct listen_args /* {
163 		int	s;
164 		int	backlog;
165 	} */ *uap;
166 {
167 	struct file *fp;
168 	int error;
169 
170 	error = getsock(p->p_fd, uap->s, &fp);
171 	if (error)
172 		return (error);
173 	return (solisten((struct socket *)fp->f_data, uap->backlog, p));
174 }
175 
176 static int
177 accept1(p, uap, compat)
178 	struct proc *p;
179 	register struct accept_args /* {
180 		int	s;
181 		caddr_t	name;
182 		int	*anamelen;
183 	} */ *uap;
184 	int compat;
185 {
186 	struct filedesc *fdp = p->p_fd;
187 	struct file *fp;
188 	struct sockaddr *sa;
189 	int namelen, error, s;
190 	struct socket *head, *so;
191 	int fd;
192 	short fflag;		/* type must match fp->f_flag */
193 
194 	if (uap->name) {
195 		error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen,
196 			sizeof (namelen));
197 		if(error)
198 			return (error);
199 	}
200 	error = getsock(fdp, uap->s, &fp);
201 	if (error)
202 		return (error);
203 	s = splnet();
204 	head = (struct socket *)fp->f_data;
205 	if ((head->so_options & SO_ACCEPTCONN) == 0) {
206 		splx(s);
207 		return (EINVAL);
208 	}
209 	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
210 		splx(s);
211 		return (EWOULDBLOCK);
212 	}
213 	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
214 		if (head->so_state & SS_CANTRCVMORE) {
215 			head->so_error = ECONNABORTED;
216 			break;
217 		}
218 		error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH,
219 		    "accept", 0);
220 		if (error) {
221 			splx(s);
222 			return (error);
223 		}
224 	}
225 	if (head->so_error) {
226 		error = head->so_error;
227 		head->so_error = 0;
228 		splx(s);
229 		return (error);
230 	}
231 
232 	/*
233 	 * At this point we know that there is at least one connection
234 	 * ready to be accepted. Remove it from the queue prior to
235 	 * allocating the file descriptor for it since falloc() may
236 	 * block allowing another process to accept the connection
237 	 * instead.
238 	 */
239 	so = TAILQ_FIRST(&head->so_comp);
240 	TAILQ_REMOVE(&head->so_comp, so, so_list);
241 	head->so_qlen--;
242 
243 	fflag = fp->f_flag;
244 	error = falloc(p, &fp, &fd);
245 	if (error) {
246 		/*
247 		 * Probably ran out of file descriptors. Put the
248 		 * unaccepted connection back onto the queue and
249 		 * do another wakeup so some other process might
250 		 * have a chance at it.
251 		 */
252 		TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
253 		head->so_qlen++;
254 		wakeup_one(&head->so_timeo);
255 		splx(s);
256 		return (error);
257 	} else
258 		p->p_retval[0] = fd;
259 
260 	/* connection has been removed from the listen queue */
261 	KNOTE(&head->so_rcv.sb_sel.si_note, 0);
262 
263 	so->so_state &= ~SS_COMP;
264 	so->so_head = NULL;
265 	if (head->so_sigio != NULL)
266 		fsetown(fgetown(head->so_sigio), &so->so_sigio);
267 
268 	fp->f_data = (caddr_t)so;
269 	fp->f_flag = fflag;
270 	fp->f_ops = &socketops;
271 	fp->f_type = DTYPE_SOCKET;
272 	sa = 0;
273 	(void) soaccept(so, &sa);
274 	if (sa == 0) {
275 		namelen = 0;
276 		if (uap->name)
277 			goto gotnoname;
278 		splx(s);
279 		return 0;
280 	}
281 	if (uap->name) {
282 		/* check sa_len before it is destroyed */
283 		if (namelen > sa->sa_len)
284 			namelen = sa->sa_len;
285 #ifdef COMPAT_OLDSOCK
286 		if (compat)
287 			((struct osockaddr *)sa)->sa_family =
288 			    sa->sa_family;
289 #endif
290 		error = copyout(sa, (caddr_t)uap->name, (u_int)namelen);
291 		if (!error)
292 gotnoname:
293 			error = copyout((caddr_t)&namelen,
294 			    (caddr_t)uap->anamelen, sizeof (*uap->anamelen));
295 	}
296 	if (sa)
297 		FREE(sa, M_SONAME);
298 	if (error) {
299 		fdp->fd_ofiles[fd] = 0;
300 		ffree(fp);
301 	}
302 	splx(s);
303 	return (error);
304 }
305 
/*
 * accept system call: accept1() with the address returned in its native
 * sockaddr form (compat == 0).
 */
int
accept(p, uap)
	struct proc *p;
	struct accept_args *uap;
{
	int error;

	error = accept1(p, uap, 0);
	return (error);
}
314 
#ifdef COMPAT_OLDSOCK
/*
 * Old-style accept: accept1() with compat == 1, which makes it rewrite the
 * returned address through struct osockaddr before copying it out.
 */
int
oaccept(p, uap)
	struct proc *p;
	struct accept_args *uap;
{
	int error;

	error = accept1(p, uap, 1);
	return (error);
}
#endif /* COMPAT_OLDSOCK */
325 
326 /* ARGSUSED */
327 int
328 connect(p, uap)
329 	struct proc *p;
330 	register struct connect_args /* {
331 		int	s;
332 		caddr_t	name;
333 		int	namelen;
334 	} */ *uap;
335 {
336 	struct file *fp;
337 	register struct socket *so;
338 	struct sockaddr *sa;
339 	int error, s;
340 
341 	error = getsock(p->p_fd, uap->s, &fp);
342 	if (error)
343 		return (error);
344 	so = (struct socket *)fp->f_data;
345 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING))
346 		return (EALREADY);
347 	error = getsockaddr(&sa, uap->name, uap->namelen);
348 	if (error)
349 		return (error);
350 	error = soconnect(so, sa, p);
351 	if (error)
352 		goto bad;
353 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
354 		FREE(sa, M_SONAME);
355 		return (EINPROGRESS);
356 	}
357 	s = splnet();
358 	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
359 		error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH,
360 		    "connec", 0);
361 		if (error)
362 			break;
363 	}
364 	if (error == 0) {
365 		error = so->so_error;
366 		so->so_error = 0;
367 	}
368 	splx(s);
369 bad:
370 	so->so_state &= ~SS_ISCONNECTING;
371 	FREE(sa, M_SONAME);
372 	if (error == ERESTART)
373 		error = EINTR;
374 	return (error);
375 }
376 
377 int
378 socketpair(p, uap)
379 	struct proc *p;
380 	register struct socketpair_args /* {
381 		int	domain;
382 		int	type;
383 		int	protocol;
384 		int	*rsv;
385 	} */ *uap;
386 {
387 	register struct filedesc *fdp = p->p_fd;
388 	struct file *fp1, *fp2;
389 	struct socket *so1, *so2;
390 	int fd, error, sv[2];
391 
392 	error = socreate(uap->domain, &so1, uap->type, uap->protocol, p);
393 	if (error)
394 		return (error);
395 	error = socreate(uap->domain, &so2, uap->type, uap->protocol, p);
396 	if (error)
397 		goto free1;
398 	error = falloc(p, &fp1, &fd);
399 	if (error)
400 		goto free2;
401 	sv[0] = fd;
402 	fp1->f_data = (caddr_t)so1;
403 	error = falloc(p, &fp2, &fd);
404 	if (error)
405 		goto free3;
406 	fp2->f_data = (caddr_t)so2;
407 	sv[1] = fd;
408 	error = soconnect2(so1, so2);
409 	if (error)
410 		goto free4;
411 	if (uap->type == SOCK_DGRAM) {
412 		/*
413 		 * Datagram socket connection is asymmetric.
414 		 */
415 		 error = soconnect2(so2, so1);
416 		 if (error)
417 			goto free4;
418 	}
419 	fp1->f_flag = fp2->f_flag = FREAD|FWRITE;
420 	fp1->f_ops = fp2->f_ops = &socketops;
421 	fp1->f_type = fp2->f_type = DTYPE_SOCKET;
422 	error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int));
423 	return (error);
424 free4:
425 	fdp->fd_ofiles[sv[1]] = 0;
426 	ffree(fp2);
427 free3:
428 	fdp->fd_ofiles[sv[0]] = 0;
429 	ffree(fp1);
430 free2:
431 	(void)soclose(so2);
432 free1:
433 	(void)soclose(so1);
434 	return (error);
435 }
436 
437 static int
438 sendit(p, s, mp, flags)
439 	register struct proc *p;
440 	int s;
441 	register struct msghdr *mp;
442 	int flags;
443 {
444 	struct file *fp;
445 	struct uio auio;
446 	register struct iovec *iov;
447 	register int i;
448 	struct mbuf *control;
449 	struct sockaddr *to;
450 	int len, error;
451 	struct socket *so;
452 #ifdef KTRACE
453 	struct iovec *ktriov = NULL;
454 #endif
455 
456 	error = getsock(p->p_fd, s, &fp);
457 	if (error)
458 		return (error);
459 	auio.uio_iov = mp->msg_iov;
460 	auio.uio_iovcnt = mp->msg_iovlen;
461 	auio.uio_segflg = UIO_USERSPACE;
462 	auio.uio_rw = UIO_WRITE;
463 	auio.uio_procp = p;
464 	auio.uio_offset = 0;			/* XXX */
465 	auio.uio_resid = 0;
466 	iov = mp->msg_iov;
467 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
468 		if ((auio.uio_resid += iov->iov_len) < 0)
469 			return (EINVAL);
470 	}
471 	if (mp->msg_name) {
472 		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
473 		if (error)
474 			return (error);
475 	} else
476 		to = 0;
477 	if (mp->msg_control) {
478 		if (mp->msg_controllen < sizeof(struct cmsghdr)
479 #ifdef COMPAT_OLDSOCK
480 		    && mp->msg_flags != MSG_COMPAT
481 #endif
482 		) {
483 			error = EINVAL;
484 			goto bad;
485 		}
486 		error = sockargs(&control, mp->msg_control,
487 		    mp->msg_controllen, MT_CONTROL);
488 		if (error)
489 			goto bad;
490 #ifdef COMPAT_OLDSOCK
491 		if (mp->msg_flags == MSG_COMPAT) {
492 			register struct cmsghdr *cm;
493 
494 			M_PREPEND(control, sizeof(*cm), M_WAIT);
495 			if (control == 0) {
496 				error = ENOBUFS;
497 				goto bad;
498 			} else {
499 				cm = mtod(control, struct cmsghdr *);
500 				cm->cmsg_len = control->m_len;
501 				cm->cmsg_level = SOL_SOCKET;
502 				cm->cmsg_type = SCM_RIGHTS;
503 			}
504 		}
505 #endif
506 	} else
507 		control = 0;
508 #ifdef KTRACE
509 	if (KTRPOINT(p, KTR_GENIO)) {
510 		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
511 
512 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
513 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
514 	}
515 #endif
516 	len = auio.uio_resid;
517 	so = (struct socket *)fp->f_data;
518 	error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control,
519 						     flags, p);
520 	if (error) {
521 		if (auio.uio_resid != len && (error == ERESTART ||
522 		    error == EINTR || error == EWOULDBLOCK))
523 			error = 0;
524 		if (error == EPIPE)
525 			psignal(p, SIGPIPE);
526 	}
527 	if (error == 0)
528 		p->p_retval[0] = len - auio.uio_resid;
529 #ifdef KTRACE
530 	if (ktriov != NULL) {
531 		if (error == 0)
532 			ktrgenio(p->p_tracep, s, UIO_WRITE,
533 				ktriov, p->p_retval[0], error);
534 		FREE(ktriov, M_TEMP);
535 	}
536 #endif
537 bad:
538 	if (to)
539 		FREE(to, M_SONAME);
540 	return (error);
541 }
542 
543 int
544 sendto(p, uap)
545 	struct proc *p;
546 	register struct sendto_args /* {
547 		int	s;
548 		caddr_t	buf;
549 		size_t	len;
550 		int	flags;
551 		caddr_t	to;
552 		int	tolen;
553 	} */ *uap;
554 {
555 	struct msghdr msg;
556 	struct iovec aiov;
557 
558 	msg.msg_name = uap->to;
559 	msg.msg_namelen = uap->tolen;
560 	msg.msg_iov = &aiov;
561 	msg.msg_iovlen = 1;
562 	msg.msg_control = 0;
563 #ifdef COMPAT_OLDSOCK
564 	msg.msg_flags = 0;
565 #endif
566 	aiov.iov_base = uap->buf;
567 	aiov.iov_len = uap->len;
568 	return (sendit(p, uap->s, &msg, uap->flags));
569 }
570 
571 #ifdef COMPAT_OLDSOCK
572 int
573 osend(p, uap)
574 	struct proc *p;
575 	register struct osend_args /* {
576 		int	s;
577 		caddr_t	buf;
578 		int	len;
579 		int	flags;
580 	} */ *uap;
581 {
582 	struct msghdr msg;
583 	struct iovec aiov;
584 
585 	msg.msg_name = 0;
586 	msg.msg_namelen = 0;
587 	msg.msg_iov = &aiov;
588 	msg.msg_iovlen = 1;
589 	aiov.iov_base = uap->buf;
590 	aiov.iov_len = uap->len;
591 	msg.msg_control = 0;
592 	msg.msg_flags = 0;
593 	return (sendit(p, uap->s, &msg, uap->flags));
594 }
595 
596 int
597 osendmsg(p, uap)
598 	struct proc *p;
599 	register struct osendmsg_args /* {
600 		int	s;
601 		caddr_t	msg;
602 		int	flags;
603 	} */ *uap;
604 {
605 	struct msghdr msg;
606 	struct iovec aiov[UIO_SMALLIOV], *iov;
607 	int error;
608 
609 	error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr));
610 	if (error)
611 		return (error);
612 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
613 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
614 			return (EMSGSIZE);
615 		MALLOC(iov, struct iovec *,
616 		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
617 		      M_WAITOK);
618 	} else
619 		iov = aiov;
620 	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
621 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
622 	if (error)
623 		goto done;
624 	msg.msg_flags = MSG_COMPAT;
625 	msg.msg_iov = iov;
626 	error = sendit(p, uap->s, &msg, uap->flags);
627 done:
628 	if (iov != aiov)
629 		FREE(iov, M_IOV);
630 	return (error);
631 }
632 #endif
633 
634 int
635 sendmsg(p, uap)
636 	struct proc *p;
637 	register struct sendmsg_args /* {
638 		int	s;
639 		caddr_t	msg;
640 		int	flags;
641 	} */ *uap;
642 {
643 	struct msghdr msg;
644 	struct iovec aiov[UIO_SMALLIOV], *iov;
645 	int error;
646 
647 	error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg));
648 	if (error)
649 		return (error);
650 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
651 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
652 			return (EMSGSIZE);
653 		MALLOC(iov, struct iovec *,
654 		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
655 		       M_WAITOK);
656 	} else
657 		iov = aiov;
658 	if (msg.msg_iovlen &&
659 	    (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
660 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
661 		goto done;
662 	msg.msg_iov = iov;
663 #ifdef COMPAT_OLDSOCK
664 	msg.msg_flags = 0;
665 #endif
666 	error = sendit(p, uap->s, &msg, uap->flags);
667 done:
668 	if (iov != aiov)
669 		FREE(iov, M_IOV);
670 	return (error);
671 }
672 
673 static int
674 recvit(p, s, mp, namelenp)
675 	register struct proc *p;
676 	int s;
677 	register struct msghdr *mp;
678 	caddr_t namelenp;
679 {
680 	struct file *fp;
681 	struct uio auio;
682 	register struct iovec *iov;
683 	register int i;
684 	int len, error;
685 	struct mbuf *m, *control = 0;
686 	caddr_t ctlbuf;
687 	struct socket *so;
688 	struct sockaddr *fromsa = 0;
689 #ifdef KTRACE
690 	struct iovec *ktriov = NULL;
691 #endif
692 
693 	error = getsock(p->p_fd, s, &fp);
694 	if (error)
695 		return (error);
696 	auio.uio_iov = mp->msg_iov;
697 	auio.uio_iovcnt = mp->msg_iovlen;
698 	auio.uio_segflg = UIO_USERSPACE;
699 	auio.uio_rw = UIO_READ;
700 	auio.uio_procp = p;
701 	auio.uio_offset = 0;			/* XXX */
702 	auio.uio_resid = 0;
703 	iov = mp->msg_iov;
704 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
705 		if ((auio.uio_resid += iov->iov_len) < 0)
706 			return (EINVAL);
707 	}
708 #ifdef KTRACE
709 	if (KTRPOINT(p, KTR_GENIO)) {
710 		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
711 
712 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
713 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
714 	}
715 #endif
716 	len = auio.uio_resid;
717 	so = (struct socket *)fp->f_data;
718 	error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
719 	    (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
720 	    &mp->msg_flags);
721 	if (error) {
722 		if (auio.uio_resid != len && (error == ERESTART ||
723 		    error == EINTR || error == EWOULDBLOCK))
724 			error = 0;
725 	}
726 #ifdef KTRACE
727 	if (ktriov != NULL) {
728 		if (error == 0)
729 			ktrgenio(p->p_tracep, s, UIO_READ,
730 				ktriov, len - auio.uio_resid, error);
731 		FREE(ktriov, M_TEMP);
732 	}
733 #endif
734 	if (error)
735 		goto out;
736 	p->p_retval[0] = len - auio.uio_resid;
737 	if (mp->msg_name) {
738 		len = mp->msg_namelen;
739 		if (len <= 0 || fromsa == 0)
740 			len = 0;
741 		else {
742 #ifndef MIN
743 #define MIN(a,b) ((a)>(b)?(b):(a))
744 #endif
745 			/* save sa_len before it is destroyed by MSG_COMPAT */
746 			len = MIN(len, fromsa->sa_len);
747 #ifdef COMPAT_OLDSOCK
748 			if (mp->msg_flags & MSG_COMPAT)
749 				((struct osockaddr *)fromsa)->sa_family =
750 				    fromsa->sa_family;
751 #endif
752 			error = copyout(fromsa,
753 			    (caddr_t)mp->msg_name, (unsigned)len);
754 			if (error)
755 				goto out;
756 		}
757 		mp->msg_namelen = len;
758 		if (namelenp &&
759 		    (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) {
760 #ifdef COMPAT_OLDSOCK
761 			if (mp->msg_flags & MSG_COMPAT)
762 				error = 0;	/* old recvfrom didn't check */
763 			else
764 #endif
765 			goto out;
766 		}
767 	}
768 	if (mp->msg_control) {
769 #ifdef COMPAT_OLDSOCK
770 		/*
771 		 * We assume that old recvmsg calls won't receive access
772 		 * rights and other control info, esp. as control info
773 		 * is always optional and those options didn't exist in 4.3.
774 		 * If we receive rights, trim the cmsghdr; anything else
775 		 * is tossed.
776 		 */
777 		if (control && mp->msg_flags & MSG_COMPAT) {
778 			if (mtod(control, struct cmsghdr *)->cmsg_level !=
779 			    SOL_SOCKET ||
780 			    mtod(control, struct cmsghdr *)->cmsg_type !=
781 			    SCM_RIGHTS) {
782 				mp->msg_controllen = 0;
783 				goto out;
784 			}
785 			control->m_len -= sizeof (struct cmsghdr);
786 			control->m_data += sizeof (struct cmsghdr);
787 		}
788 #endif
789 		len = mp->msg_controllen;
790 		m = control;
791 		mp->msg_controllen = 0;
792 		ctlbuf = (caddr_t) mp->msg_control;
793 
794 		while (m && len > 0) {
795 			unsigned int tocopy;
796 
797 			if (len >= m->m_len)
798 				tocopy = m->m_len;
799 			else {
800 				mp->msg_flags |= MSG_CTRUNC;
801 				tocopy = len;
802 			}
803 
804 			if ((error = copyout((caddr_t)mtod(m, caddr_t),
805 					ctlbuf, tocopy)) != 0)
806 				goto out;
807 
808 			ctlbuf += tocopy;
809 			len -= tocopy;
810 			m = m->m_next;
811 		}
812 		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
813 	}
814 out:
815 	if (fromsa)
816 		FREE(fromsa, M_SONAME);
817 	if (control)
818 		m_freem(control);
819 	return (error);
820 }
821 
822 int
823 recvfrom(p, uap)
824 	struct proc *p;
825 	register struct recvfrom_args /* {
826 		int	s;
827 		caddr_t	buf;
828 		size_t	len;
829 		int	flags;
830 		caddr_t	from;
831 		int	*fromlenaddr;
832 	} */ *uap;
833 {
834 	struct msghdr msg;
835 	struct iovec aiov;
836 	int error;
837 
838 	if (uap->fromlenaddr) {
839 		error = copyin((caddr_t)uap->fromlenaddr,
840 		    (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen));
841 		if (error)
842 			return (error);
843 	} else
844 		msg.msg_namelen = 0;
845 	msg.msg_name = uap->from;
846 	msg.msg_iov = &aiov;
847 	msg.msg_iovlen = 1;
848 	aiov.iov_base = uap->buf;
849 	aiov.iov_len = uap->len;
850 	msg.msg_control = 0;
851 	msg.msg_flags = uap->flags;
852 	return (recvit(p, uap->s, &msg, (caddr_t)uap->fromlenaddr));
853 }
854 
#ifdef COMPAT_OLDSOCK
/*
 * Old-style recvfrom: set MSG_COMPAT in the caller's flags and reuse
 * recvfrom().
 */
int
orecvfrom(p, uap)
	struct proc *p;
	struct recvfrom_args *uap;
{
	int error;

	uap->flags |= MSG_COMPAT;
	error = recvfrom(p, uap);
	return (error);
}
#endif
866 
867 
868 #ifdef COMPAT_OLDSOCK
869 int
870 orecv(p, uap)
871 	struct proc *p;
872 	register struct orecv_args /* {
873 		int	s;
874 		caddr_t	buf;
875 		int	len;
876 		int	flags;
877 	} */ *uap;
878 {
879 	struct msghdr msg;
880 	struct iovec aiov;
881 
882 	msg.msg_name = 0;
883 	msg.msg_namelen = 0;
884 	msg.msg_iov = &aiov;
885 	msg.msg_iovlen = 1;
886 	aiov.iov_base = uap->buf;
887 	aiov.iov_len = uap->len;
888 	msg.msg_control = 0;
889 	msg.msg_flags = uap->flags;
890 	return (recvit(p, uap->s, &msg, (caddr_t)0));
891 }
892 
893 /*
894  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
895  * overlays the new one, missing only the flags, and with the (old) access
896  * rights where the control fields are now.
897  */
898 int
899 orecvmsg(p, uap)
900 	struct proc *p;
901 	register struct orecvmsg_args /* {
902 		int	s;
903 		struct	omsghdr *msg;
904 		int	flags;
905 	} */ *uap;
906 {
907 	struct msghdr msg;
908 	struct iovec aiov[UIO_SMALLIOV], *iov;
909 	int error;
910 
911 	error = copyin((caddr_t)uap->msg, (caddr_t)&msg,
912 	    sizeof (struct omsghdr));
913 	if (error)
914 		return (error);
915 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
916 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
917 			return (EMSGSIZE);
918 		MALLOC(iov, struct iovec *,
919 		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
920 		      M_WAITOK);
921 	} else
922 		iov = aiov;
923 	msg.msg_flags = uap->flags | MSG_COMPAT;
924 	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
925 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
926 	if (error)
927 		goto done;
928 	msg.msg_iov = iov;
929 	error = recvit(p, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen);
930 
931 	if (msg.msg_controllen && error == 0)
932 		error = copyout((caddr_t)&msg.msg_controllen,
933 		    (caddr_t)&uap->msg->msg_accrightslen, sizeof (int));
934 done:
935 	if (iov != aiov)
936 		FREE(iov, M_IOV);
937 	return (error);
938 }
939 #endif
940 
941 int
942 recvmsg(p, uap)
943 	struct proc *p;
944 	register struct recvmsg_args /* {
945 		int	s;
946 		struct	msghdr *msg;
947 		int	flags;
948 	} */ *uap;
949 {
950 	struct msghdr msg;
951 	struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
952 	register int error;
953 
954 	error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg));
955 	if (error)
956 		return (error);
957 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
958 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
959 			return (EMSGSIZE);
960 		MALLOC(iov, struct iovec *,
961 		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
962 		       M_WAITOK);
963 	} else
964 		iov = aiov;
965 #ifdef COMPAT_OLDSOCK
966 	msg.msg_flags = uap->flags &~ MSG_COMPAT;
967 #else
968 	msg.msg_flags = uap->flags;
969 #endif
970 	uiov = msg.msg_iov;
971 	msg.msg_iov = iov;
972 	error = copyin((caddr_t)uiov, (caddr_t)iov,
973 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
974 	if (error)
975 		goto done;
976 	error = recvit(p, uap->s, &msg, (caddr_t)0);
977 	if (!error) {
978 		msg.msg_iov = uiov;
979 		error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg));
980 	}
981 done:
982 	if (iov != aiov)
983 		FREE(iov, M_IOV);
984 	return (error);
985 }
986 
987 /* ARGSUSED */
988 int
989 shutdown(p, uap)
990 	struct proc *p;
991 	register struct shutdown_args /* {
992 		int	s;
993 		int	how;
994 	} */ *uap;
995 {
996 	struct file *fp;
997 	int error;
998 
999 	error = getsock(p->p_fd, uap->s, &fp);
1000 	if (error)
1001 		return (error);
1002 	return (soshutdown((struct socket *)fp->f_data, uap->how));
1003 }
1004 
1005 /* ARGSUSED */
1006 int
1007 setsockopt(p, uap)
1008 	struct proc *p;
1009 	register struct setsockopt_args /* {
1010 		int	s;
1011 		int	level;
1012 		int	name;
1013 		caddr_t	val;
1014 		int	valsize;
1015 	} */ *uap;
1016 {
1017 	struct file *fp;
1018 	struct sockopt sopt;
1019 	int error;
1020 
1021 	if (uap->val == 0 && uap->valsize != 0)
1022 		return (EFAULT);
1023 	if (uap->valsize < 0)
1024 		return (EINVAL);
1025 
1026 	error = getsock(p->p_fd, uap->s, &fp);
1027 	if (error)
1028 		return (error);
1029 
1030 	sopt.sopt_dir = SOPT_SET;
1031 	sopt.sopt_level = uap->level;
1032 	sopt.sopt_name = uap->name;
1033 	sopt.sopt_val = uap->val;
1034 	sopt.sopt_valsize = uap->valsize;
1035 	sopt.sopt_p = p;
1036 
1037 	return (sosetopt((struct socket *)fp->f_data, &sopt));
1038 }
1039 
1040 /* ARGSUSED */
1041 int
1042 getsockopt(p, uap)
1043 	struct proc *p;
1044 	register struct getsockopt_args /* {
1045 		int	s;
1046 		int	level;
1047 		int	name;
1048 		caddr_t	val;
1049 		int	*avalsize;
1050 	} */ *uap;
1051 {
1052 	int	valsize, error;
1053 	struct	file *fp;
1054 	struct	sockopt sopt;
1055 
1056 	error = getsock(p->p_fd, uap->s, &fp);
1057 	if (error)
1058 		return (error);
1059 	if (uap->val) {
1060 		error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize,
1061 		    sizeof (valsize));
1062 		if (error)
1063 			return (error);
1064 		if (valsize < 0)
1065 			return (EINVAL);
1066 	} else
1067 		valsize = 0;
1068 
1069 	sopt.sopt_dir = SOPT_GET;
1070 	sopt.sopt_level = uap->level;
1071 	sopt.sopt_name = uap->name;
1072 	sopt.sopt_val = uap->val;
1073 	sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
1074 	sopt.sopt_p = p;
1075 
1076 	error = sogetopt((struct socket *)fp->f_data, &sopt);
1077 	if (error == 0) {
1078 		valsize = sopt.sopt_valsize;
1079 		error = copyout((caddr_t)&valsize,
1080 				(caddr_t)uap->avalsize, sizeof (valsize));
1081 	}
1082 	return (error);
1083 }
1084 
1085 /*
1086  * Get socket name.
1087  */
1088 /* ARGSUSED */
1089 static int
1090 getsockname1(p, uap, compat)
1091 	struct proc *p;
1092 	register struct getsockname_args /* {
1093 		int	fdes;
1094 		caddr_t	asa;
1095 		int	*alen;
1096 	} */ *uap;
1097 	int compat;
1098 {
1099 	struct file *fp;
1100 	register struct socket *so;
1101 	struct sockaddr *sa;
1102 	int len, error;
1103 
1104 	error = getsock(p->p_fd, uap->fdes, &fp);
1105 	if (error)
1106 		return (error);
1107 	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1108 	if (error)
1109 		return (error);
1110 	so = (struct socket *)fp->f_data;
1111 	sa = 0;
1112 	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
1113 	if (error)
1114 		goto bad;
1115 	if (sa == 0) {
1116 		len = 0;
1117 		goto gotnothing;
1118 	}
1119 
1120 	len = MIN(len, sa->sa_len);
1121 #ifdef COMPAT_OLDSOCK
1122 	if (compat)
1123 		((struct osockaddr *)sa)->sa_family = sa->sa_family;
1124 #endif
1125 	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1126 	if (error == 0)
1127 gotnothing:
1128 		error = copyout((caddr_t)&len, (caddr_t)uap->alen,
1129 		    sizeof (len));
1130 bad:
1131 	if (sa)
1132 		FREE(sa, M_SONAME);
1133 	return (error);
1134 }
1135 
/*
 * getsockname system call: getsockname1() with the address returned in its
 * native sockaddr form (compat == 0).
 */
int
getsockname(p, uap)
	struct proc *p;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(p, uap, 0);
	return (error);
}
1144 
#ifdef COMPAT_OLDSOCK
/*
 * Old-style getsockname: getsockname1() with compat == 1, which makes it
 * rewrite the returned address through struct osockaddr.
 */
int
ogetsockname(p, uap)
	struct proc *p;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(p, uap, 1);
	return (error);
}
#endif /* COMPAT_OLDSOCK */
1155 
1156 /*
1157  * Get name of peer for connected socket.
1158  */
1159 /* ARGSUSED */
1160 static int
1161 getpeername1(p, uap, compat)
1162 	struct proc *p;
1163 	register struct getpeername_args /* {
1164 		int	fdes;
1165 		caddr_t	asa;
1166 		int	*alen;
1167 	} */ *uap;
1168 	int compat;
1169 {
1170 	struct file *fp;
1171 	register struct socket *so;
1172 	struct sockaddr *sa;
1173 	int len, error;
1174 
1175 	error = getsock(p->p_fd, uap->fdes, &fp);
1176 	if (error)
1177 		return (error);
1178 	so = (struct socket *)fp->f_data;
1179 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0)
1180 		return (ENOTCONN);
1181 	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1182 	if (error)
1183 		return (error);
1184 	sa = 0;
1185 	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
1186 	if (error)
1187 		goto bad;
1188 	if (sa == 0) {
1189 		len = 0;
1190 		goto gotnothing;
1191 	}
1192 	len = MIN(len, sa->sa_len);
1193 #ifdef COMPAT_OLDSOCK
1194 	if (compat)
1195 		((struct osockaddr *)sa)->sa_family =
1196 		    sa->sa_family;
1197 #endif
1198 	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1199 	if (error)
1200 		goto bad;
1201 gotnothing:
1202 	error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len));
1203 bad:
1204 	if (sa) FREE(sa, M_SONAME);
1205 	return (error);
1206 }
1207 
/*
 * getpeername system call: getpeername1() with the address returned in its
 * native sockaddr form (compat == 0).
 */
int
getpeername(p, uap)
	struct proc *p;
	struct getpeername_args *uap;
{
	int error;

	error = getpeername1(p, uap, 0);
	return (error);
}
1216 
#ifdef COMPAT_OLDSOCK
/*
 * Old-style getpeername: getpeername1() with compat == 1, which makes it
 * rewrite the returned address through struct osockaddr.
 */
int
ogetpeername(p, uap)
	struct proc *p;
	struct ogetpeername_args *uap;
{
	struct getpeername_args *args;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	args = (struct getpeername_args *)uap;
	return (getpeername1(p, args, 1));
}
#endif /* COMPAT_OLDSOCK */
1228 
1229 int
1230 sockargs(mp, buf, buflen, type)
1231 	struct mbuf **mp;
1232 	caddr_t buf;
1233 	int buflen, type;
1234 {
1235 	register struct sockaddr *sa;
1236 	register struct mbuf *m;
1237 	int error;
1238 
1239 	if ((u_int)buflen > MLEN) {
1240 #ifdef COMPAT_OLDSOCK
1241 		if (type == MT_SONAME && (u_int)buflen <= 112)
1242 			buflen = MLEN;		/* unix domain compat. hack */
1243 		else
1244 #endif
1245 		return (EINVAL);
1246 	}
1247 	m = m_get(M_WAIT, type);
1248 	if (m == NULL)
1249 		return (ENOBUFS);
1250 	m->m_len = buflen;
1251 	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1252 	if (error)
1253 		(void) m_free(m);
1254 	else {
1255 		*mp = m;
1256 		if (type == MT_SONAME) {
1257 			sa = mtod(m, struct sockaddr *);
1258 
1259 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1260 			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1261 				sa->sa_family = sa->sa_len;
1262 #endif
1263 			sa->sa_len = buflen;
1264 		}
1265 	}
1266 	return (error);
1267 }
1268 
1269 int
1270 getsockaddr(namp, uaddr, len)
1271 	struct sockaddr **namp;
1272 	caddr_t uaddr;
1273 	size_t len;
1274 {
1275 	struct sockaddr *sa;
1276 	int error;
1277 
1278 	if (len > SOCK_MAXADDRLEN)
1279 		return ENAMETOOLONG;
1280 	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1281 	error = copyin(uaddr, sa, len);
1282 	if (error) {
1283 		FREE(sa, M_SONAME);
1284 	} else {
1285 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1286 		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1287 			sa->sa_family = sa->sa_len;
1288 #endif
1289 		sa->sa_len = len;
1290 		*namp = sa;
1291 	}
1292 	return error;
1293 }
1294 
1295 int
1296 getsock(fdp, fdes, fpp)
1297 	struct filedesc *fdp;
1298 	int fdes;
1299 	struct file **fpp;
1300 {
1301 	register struct file *fp;
1302 
1303 	if ((unsigned)fdes >= fdp->fd_nfiles ||
1304 	    (fp = fdp->fd_ofiles[fdes]) == NULL)
1305 		return (EBADF);
1306 	if (fp->f_type != DTYPE_SOCKET)
1307 		return (ENOTSOCK);
1308 	*fpp = fp;
1309 	return (0);
1310 }
1311 
1312 /*
1313  * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
1314  * XXX - The sf_buf functions are currently private to sendfile(2), so have
1315  * been made static, but may be useful in the future for doing zero-copy in
1316  * other parts of the networking code.
1317  */
1318 static void
1319 sf_buf_init(void *arg)
1320 {
1321 	int i;
1322 
1323 	SLIST_INIT(&sf_freelist);
1324 	sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
1325 	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, M_NOWAIT);
1326 	bzero(sf_bufs, nsfbufs * sizeof(struct sf_buf));
1327 	for (i = 0; i < nsfbufs; i++) {
1328 		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
1329 		SLIST_INSERT_HEAD(&sf_freelist, &sf_bufs[i], free_list);
1330 	}
1331 }
1332 
1333 /*
1334  * Get an sf_buf from the freelist. Will block if none are available.
1335  */
1336 static struct sf_buf *
1337 sf_buf_alloc()
1338 {
1339 	struct sf_buf *sf;
1340 	int s;
1341 
1342 	s = splimp();
1343 	while ((sf = SLIST_FIRST(&sf_freelist)) == NULL) {
1344 		sf_buf_alloc_want = 1;
1345 		tsleep(&sf_freelist, PVM, "sfbufa", 0);
1346 	}
1347 	SLIST_REMOVE_HEAD(&sf_freelist, free_list);
1348 	splx(s);
1349 	sf->refcnt = 1;
1350 	return (sf);
1351 }
1352 
1353 #define dtosf(x)	(&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])
1354 static void
1355 sf_buf_ref(caddr_t addr, u_int size)
1356 {
1357 	struct sf_buf *sf;
1358 
1359 	sf = dtosf(addr);
1360 	if (sf->refcnt == 0)
1361 		panic("sf_buf_ref: referencing a free sf_buf");
1362 	sf->refcnt++;
1363 }
1364 
1365 /*
1366  * Lose a reference to an sf_buf. When none left, detach mapped page
1367  * and release resources back to the system.
1368  *
1369  * Must be called at splimp.
1370  */
1371 static void
1372 sf_buf_free(caddr_t addr, u_int size)
1373 {
1374 	struct sf_buf *sf;
1375 	struct vm_page *m;
1376 	int s;
1377 
1378 	sf = dtosf(addr);
1379 	if (sf->refcnt == 0)
1380 		panic("sf_buf_free: freeing free sf_buf");
1381 	sf->refcnt--;
1382 	if (sf->refcnt == 0) {
1383 		pmap_qremove((vm_offset_t)addr, 1);
1384 		m = sf->m;
1385 		s = splvm();
1386 		vm_page_unwire(m, 0);
1387 		/*
1388 		 * Check for the object going away on us. This can
1389 		 * happen since we don't hold a reference to it.
1390 		 * If so, we're responsible for freeing the page.
1391 		 */
1392 		if (m->wire_count == 0 && m->object == NULL)
1393 			vm_page_free(m);
1394 		splx(s);
1395 		sf->m = NULL;
1396 		SLIST_INSERT_HEAD(&sf_freelist, sf, free_list);
1397 		if (sf_buf_alloc_want) {
1398 			sf_buf_alloc_want = 0;
1399 			wakeup(&sf_freelist);
1400 		}
1401 	}
1402 }
1403 
1404 /*
1405  * sendfile(2).
1406  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
1407  *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
1408  *
1409  * Send a file specified by 'fd' and starting at 'offset' to a socket
1410  * specified by 's'. Send only 'nbytes' of the file or until EOF if
1411  * nbytes == 0. Optionally add a header and/or trailer to the socket
1412  * output. If specified, write the total number of bytes sent into *sbytes.
1413  */
1414 int
1415 sendfile(struct proc *p, struct sendfile_args *uap)
1416 {
1417 	struct file *fp;
1418 	struct filedesc *fdp = p->p_fd;
1419 	struct vnode *vp;
1420 	struct vm_object *obj;
1421 	struct socket *so;
1422 	struct mbuf *m;
1423 	struct sf_buf *sf;
1424 	struct vm_page *pg;
1425 	struct writev_args nuap;
1426 	struct sf_hdtr hdtr;
1427 	off_t off, xfsize, sbytes = 0;
1428 	int error = 0, s;
1429 
1430 	vp = NULL;
1431 	/*
1432 	 * Do argument checking. Must be a regular file in, stream
1433 	 * type and connected socket out, positive offset.
1434 	 */
1435 	if (((u_int)uap->fd) >= fdp->fd_nfiles ||
1436 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
1437 	    (fp->f_flag & FREAD) == 0) {
1438 		error = EBADF;
1439 		goto done;
1440 	}
1441 	if (fp->f_type != DTYPE_VNODE) {
1442 		error = EINVAL;
1443 		goto done;
1444 	}
1445 	vp = (struct vnode *)fp->f_data;
1446 	vref(vp);
1447 	obj = vp->v_object;
1448 	if (vp->v_type != VREG || obj == NULL) {
1449 		error = EINVAL;
1450 		goto done;
1451 	}
1452 	error = getsock(p->p_fd, uap->s, &fp);
1453 	if (error)
1454 		goto done;
1455 	so = (struct socket *)fp->f_data;
1456 	if (so->so_type != SOCK_STREAM) {
1457 		error = EINVAL;
1458 		goto done;
1459 	}
1460 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1461 		error = ENOTCONN;
1462 		goto done;
1463 	}
1464 	if (uap->offset < 0) {
1465 		error = EINVAL;
1466 		goto done;
1467 	}
1468 
1469 	/*
1470 	 * If specified, get the pointer to the sf_hdtr struct for
1471 	 * any headers/trailers.
1472 	 */
1473 	if (uap->hdtr != NULL) {
1474 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1475 		if (error)
1476 			goto done;
1477 		/*
1478 		 * Send any headers. Wimp out and use writev(2).
1479 		 */
1480 		if (hdtr.headers != NULL) {
1481 			nuap.fd = uap->s;
1482 			nuap.iovp = hdtr.headers;
1483 			nuap.iovcnt = hdtr.hdr_cnt;
1484 			error = writev(p, &nuap);
1485 			if (error)
1486 				goto done;
1487 			sbytes += p->p_retval[0];
1488 		}
1489 	}
1490 
1491 	/*
1492 	 * Protect against multiple writers to the socket.
1493 	 */
1494 	(void) sblock(&so->so_snd, M_WAITOK);
1495 
1496 	/*
1497 	 * Loop through the pages in the file, starting with the requested
1498 	 * offset. Get a file page (do I/O if necessary), map the file page
1499 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1500 	 * it on the socket.
1501 	 */
1502 	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
1503 		vm_pindex_t pindex;
1504 		vm_offset_t pgoff;
1505 
1506 		pindex = OFF_TO_IDX(off);
1507 retry_lookup:
1508 		/*
1509 		 * Calculate the amount to transfer. Not to exceed a page,
1510 		 * the EOF, or the passed in nbytes.
1511 		 */
1512 		xfsize = obj->un_pager.vnp.vnp_size - off;
1513 		if (xfsize > PAGE_SIZE)
1514 			xfsize = PAGE_SIZE;
1515 		pgoff = (vm_offset_t)(off & PAGE_MASK);
1516 		if (PAGE_SIZE - pgoff < xfsize)
1517 			xfsize = PAGE_SIZE - pgoff;
1518 		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
1519 			xfsize = uap->nbytes - sbytes;
1520 		if (xfsize <= 0)
1521 			break;
1522 		/*
1523 		 * Optimize the non-blocking case by looking at the socket space
1524 		 * before going to the extra work of constituting the sf_buf.
1525 		 */
1526 		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
1527 			if (so->so_state & SS_CANTSENDMORE)
1528 				error = EPIPE;
1529 			else
1530 				error = EAGAIN;
1531 			sbunlock(&so->so_snd);
1532 			goto done;
1533 		}
1534 		/*
1535 		 * Attempt to look up the page.
1536 		 *
1537 		 *	Allocate if not found
1538 		 *
1539 		 *	Wait and loop if busy.
1540 		 */
1541 		pg = vm_page_lookup(obj, pindex);
1542 
1543 		if (pg == NULL) {
1544 			pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
1545 			if (pg == NULL) {
1546 				VM_WAIT;
1547 				goto retry_lookup;
1548 			}
1549 			vm_page_wakeup(pg);
1550 		} else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) {
1551 			goto retry_lookup;
1552 		}
1553 
1554 		/*
1555 		 * Wire the page so it does not get ripped out from under
1556 		 * us.
1557 		 */
1558 
1559 		vm_page_wire(pg);
1560 
1561 		/*
1562 		 * If page is not valid for what we need, initiate I/O
1563 		 */
1564 
1565 		if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
1566 			struct uio auio;
1567 			struct iovec aiov;
1568 			int bsize;
1569 
1570 			/*
1571 			 * Ensure that our page is still around when the I/O
1572 			 * completes.
1573 			 */
1574 			vm_page_io_start(pg);
1575 
1576 			/*
1577 			 * Get the page from backing store.
1578 			 */
1579 			bsize = vp->v_mount->mnt_stat.f_iosize;
1580 			auio.uio_iov = &aiov;
1581 			auio.uio_iovcnt = 1;
1582 			aiov.iov_base = 0;
1583 			aiov.iov_len = MAXBSIZE;
1584 			auio.uio_resid = MAXBSIZE;
1585 			auio.uio_offset = trunc_page(off);
1586 			auio.uio_segflg = UIO_NOCOPY;
1587 			auio.uio_rw = UIO_READ;
1588 			auio.uio_procp = p;
1589 			vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
1590 			error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16),
1591 			        p->p_ucred);
1592 			VOP_UNLOCK(vp, 0, p);
1593 			vm_page_flag_clear(pg, PG_ZERO);
1594 			vm_page_io_finish(pg);
1595 			if (error) {
1596 				vm_page_unwire(pg, 0);
1597 				/*
1598 				 * See if anyone else might know about this page.
1599 				 * If not and it is not valid, then free it.
1600 				 */
1601 				if (pg->wire_count == 0 && pg->valid == 0 &&
1602 				    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
1603 				    pg->hold_count == 0)
1604 					vm_page_free(pg);
1605 				sbunlock(&so->so_snd);
1606 				goto done;
1607 			}
1608 		}
1609 
1610 		/*
1611 		 * Allocate a kernel virtual page and insert the physical page
1612 		 * into it.
1613 		 */
1614 
1615 		sf = sf_buf_alloc();
1616 		sf->m = pg;
1617 		pmap_qenter(sf->kva, &pg, 1);
1618 		/*
1619 		 * Get an mbuf header and set it up as having external storage.
1620 		 */
1621 		MGETHDR(m, M_WAIT, MT_DATA);
1622 		if (m == NULL) {
1623 			error = ENOBUFS;
1624 			goto done;
1625 		}
1626 		m->m_ext.ext_free = sf_buf_free;
1627 		m->m_ext.ext_ref = sf_buf_ref;
1628 		m->m_ext.ext_buf = (void *)sf->kva;
1629 		m->m_ext.ext_size = PAGE_SIZE;
1630 		m->m_data = (char *) sf->kva + pgoff;
1631 		m->m_flags |= M_EXT;
1632 		m->m_pkthdr.len = m->m_len = xfsize;
1633 		/*
1634 		 * Add the buffer to the socket buffer chain.
1635 		 */
1636 		s = splnet();
1637 retry_space:
1638 		/*
1639 		 * Make sure that the socket is still able to take more data.
1640 		 * CANTSENDMORE being true usually means that the connection
1641 		 * was closed. so_error is true when an error was sensed after
1642 		 * a previous send.
1643 		 * The state is checked after the page mapping and buffer
1644 		 * allocation above since those operations may block and make
1645 		 * any socket checks stale. From this point forward, nothing
1646 		 * blocks before the pru_send (or more accurately, any blocking
1647 		 * results in a loop back to here to re-check).
1648 		 */
1649 		if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
1650 			if (so->so_state & SS_CANTSENDMORE) {
1651 				error = EPIPE;
1652 			} else {
1653 				error = so->so_error;
1654 				so->so_error = 0;
1655 			}
1656 			m_freem(m);
1657 			sbunlock(&so->so_snd);
1658 			splx(s);
1659 			goto done;
1660 		}
1661 		/*
1662 		 * Wait for socket space to become available. We do this just
1663 		 * after checking the connection state above in order to avoid
1664 		 * a race condition with sbwait().
1665 		 */
1666 		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
1667 			if (so->so_state & SS_NBIO) {
1668 				m_freem(m);
1669 				sbunlock(&so->so_snd);
1670 				splx(s);
1671 				error = EAGAIN;
1672 				goto done;
1673 			}
1674 			error = sbwait(&so->so_snd);
1675 			/*
1676 			 * An error from sbwait usually indicates that we've
1677 			 * been interrupted by a signal. If we've sent anything
1678 			 * then return bytes sent, otherwise return the error.
1679 			 */
1680 			if (error) {
1681 				m_freem(m);
1682 				sbunlock(&so->so_snd);
1683 				splx(s);
1684 				goto done;
1685 			}
1686 			goto retry_space;
1687 		}
1688 		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p);
1689 		splx(s);
1690 		if (error) {
1691 			sbunlock(&so->so_snd);
1692 			goto done;
1693 		}
1694 	}
1695 	sbunlock(&so->so_snd);
1696 
1697 	/*
1698 	 * Send trailers. Wimp out and use writev(2).
1699 	 */
1700 	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
1701 			nuap.fd = uap->s;
1702 			nuap.iovp = hdtr.trailers;
1703 			nuap.iovcnt = hdtr.trl_cnt;
1704 			error = writev(p, &nuap);
1705 			if (error)
1706 				goto done;
1707 			sbytes += p->p_retval[0];
1708 	}
1709 
1710 done:
1711 	if (uap->sbytes != NULL) {
1712 		copyout(&sbytes, uap->sbytes, sizeof(off_t));
1713 	}
1714 	if (vp)
1715 		vrele(vp);
1716 	return (error);
1717 }
1718