xref: /freebsd/sys/kern/uipc_syscalls.c (revision 2ad872c5794e4c26fdf6ed219ad3f09ca0d5304a)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * sendfile(2) and related extensions:
6  * Copyright (c) 1998, David Greenman. All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
37  * $Id: uipc_syscalls.c,v 1.48 1998/12/03 12:35:47 dg Exp $
38  */
39 
40 #include "opt_compat.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/sysproto.h>
47 #include <sys/malloc.h>
48 #include <sys/filedesc.h>
49 #include <sys/proc.h>
50 #include <sys/fcntl.h>
51 #include <sys/file.h>
52 #include <sys/mbuf.h>
53 #include <sys/protosw.h>
54 #include <sys/socket.h>
55 #include <sys/socketvar.h>
56 #include <sys/signalvar.h>
57 #include <sys/uio.h>
58 #include <sys/vnode.h>
59 #include <sys/lock.h>
60 #include <sys/mount.h>
61 #ifdef KTRACE
62 #include <sys/ktrace.h>
63 #endif
64 #include <vm/vm.h>
65 #include <vm/vm_prot.h>
66 #include <vm/vm_object.h>
67 #include <vm/vm_page.h>
68 #include <vm/vm_pager.h>
69 #include <vm/vm_pageout.h>
70 #include <vm/vm_kern.h>
71 #include <vm/vm_extern.h>
72 #include <machine/limits.h>
73 
74 static void sf_buf_init(void *arg);
75 SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
76 static struct sf_buf *sf_buf_alloc(void);
77 static void sf_buf_ref(caddr_t addr, u_int size);
78 static void sf_buf_free(caddr_t addr, u_int size);
79 
80 static int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags));
81 static int recvit __P((struct proc *p, int s, struct msghdr *mp,
82 		       caddr_t namelenp));
83 
84 static int accept1 __P((struct proc *p, struct accept_args *uap, int compat));
85 static int getsockname1 __P((struct proc *p, struct getsockname_args *uap,
86 			     int compat));
87 static int getpeername1 __P((struct proc *p, struct getpeername_args *uap,
88 			     int compat));
89 
90 static SLIST_HEAD(, sf_buf) sf_freelist;
91 static vm_offset_t sf_base;
92 static struct sf_buf *sf_bufs;
93 static int sf_buf_alloc_want;
94 
95 /*
96  * System call interface to the socket abstraction.
97  */
98 #if defined(COMPAT_43) || defined(COMPAT_SUNOS)
99 #define COMPAT_OLDSOCK
100 #endif
101 
102 extern	struct fileops socketops;
103 
104 int
105 socket(p, uap)
106 	struct proc *p;
107 	register struct socket_args /* {
108 		int	domain;
109 		int	type;
110 		int	protocol;
111 	} */ *uap;
112 {
113 	struct filedesc *fdp = p->p_fd;
114 	struct socket *so;
115 	struct file *fp;
116 	int fd, error;
117 
118 	error = falloc(p, &fp, &fd);
119 	if (error)
120 		return (error);
121 	fp->f_flag = FREAD|FWRITE;
122 	fp->f_type = DTYPE_SOCKET;
123 	fp->f_ops = &socketops;
124 	error = socreate(uap->domain, &so, uap->type, uap->protocol, p);
125 	if (error) {
126 		fdp->fd_ofiles[fd] = 0;
127 		ffree(fp);
128 	} else {
129 		fp->f_data = (caddr_t)so;
130 		p->p_retval[0] = fd;
131 	}
132 	return (error);
133 }
134 
135 /* ARGSUSED */
136 int
137 bind(p, uap)
138 	struct proc *p;
139 	register struct bind_args /* {
140 		int	s;
141 		caddr_t	name;
142 		int	namelen;
143 	} */ *uap;
144 {
145 	struct file *fp;
146 	struct sockaddr *sa;
147 	int error;
148 
149 	error = getsock(p->p_fd, uap->s, &fp);
150 	if (error)
151 		return (error);
152 	error = getsockaddr(&sa, uap->name, uap->namelen);
153 	if (error)
154 		return (error);
155 	error = sobind((struct socket *)fp->f_data, sa, p);
156 	FREE(sa, M_SONAME);
157 	return (error);
158 }
159 
160 /* ARGSUSED */
161 int
162 listen(p, uap)
163 	struct proc *p;
164 	register struct listen_args /* {
165 		int	s;
166 		int	backlog;
167 	} */ *uap;
168 {
169 	struct file *fp;
170 	int error;
171 
172 	error = getsock(p->p_fd, uap->s, &fp);
173 	if (error)
174 		return (error);
175 	return (solisten((struct socket *)fp->f_data, uap->backlog, p));
176 }
177 
178 static int
179 accept1(p, uap, compat)
180 	struct proc *p;
181 	register struct accept_args /* {
182 		int	s;
183 		caddr_t	name;
184 		int	*anamelen;
185 	} */ *uap;
186 	int compat;
187 {
188 	struct file *fp;
189 	struct sockaddr *sa;
190 	int namelen, error, s;
191 	struct socket *head, *so;
192 	int fd;
193 	short fflag;		/* type must match fp->f_flag */
194 
195 	if (uap->name) {
196 		error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen,
197 			sizeof (namelen));
198 		if(error)
199 			return (error);
200 	}
201 	error = getsock(p->p_fd, uap->s, &fp);
202 	if (error)
203 		return (error);
204 	s = splnet();
205 	head = (struct socket *)fp->f_data;
206 	if ((head->so_options & SO_ACCEPTCONN) == 0) {
207 		splx(s);
208 		return (EINVAL);
209 	}
210 	if ((head->so_state & SS_NBIO) && head->so_comp.tqh_first == NULL) {
211 		splx(s);
212 		return (EWOULDBLOCK);
213 	}
214 	while (head->so_comp.tqh_first == NULL && head->so_error == 0) {
215 		if (head->so_state & SS_CANTRCVMORE) {
216 			head->so_error = ECONNABORTED;
217 			break;
218 		}
219 		error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH,
220 		    "accept", 0);
221 		if (error) {
222 			splx(s);
223 			return (error);
224 		}
225 	}
226 	if (head->so_error) {
227 		error = head->so_error;
228 		head->so_error = 0;
229 		splx(s);
230 		return (error);
231 	}
232 
233 	/*
234 	 * At this point we know that there is at least one connection
235 	 * ready to be accepted. Remove it from the queue prior to
236 	 * allocating the file descriptor for it since falloc() may
237 	 * block allowing another process to accept the connection
238 	 * instead.
239 	 */
240 	so = head->so_comp.tqh_first;
241 	TAILQ_REMOVE(&head->so_comp, so, so_list);
242 	head->so_qlen--;
243 
244 	fflag = fp->f_flag;
245 	error = falloc(p, &fp, &fd);
246 	if (error) {
247 		/*
248 		 * Probably ran out of file descriptors. Put the
249 		 * unaccepted connection back onto the queue and
250 		 * do another wakeup so some other process might
251 		 * have a chance at it.
252 		 */
253 		TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
254 		head->so_qlen++;
255 		wakeup_one(&head->so_timeo);
256 		splx(s);
257 		return (error);
258 	} else
259 		p->p_retval[0] = fd;
260 
261 	so->so_state &= ~SS_COMP;
262 	so->so_head = NULL;
263 	if (head->so_sigio != NULL)
264 		fsetown(fgetown(head->so_sigio), &so->so_sigio);
265 
266 	fp->f_type = DTYPE_SOCKET;
267 	fp->f_flag = fflag;
268 	fp->f_ops = &socketops;
269 	fp->f_data = (caddr_t)so;
270 	sa = 0;
271 	(void) soaccept(so, &sa);
272 	if (sa == 0) {
273 		namelen = 0;
274 		if (uap->name)
275 			goto gotnoname;
276 		return 0;
277 	}
278 	if (uap->name) {
279 		/* check sa_len before it is destroyed */
280 		if (namelen > sa->sa_len)
281 			namelen = sa->sa_len;
282 #ifdef COMPAT_OLDSOCK
283 		if (compat)
284 			((struct osockaddr *)sa)->sa_family =
285 			    sa->sa_family;
286 #endif
287 		error = copyout(sa, (caddr_t)uap->name, (u_int)namelen);
288 		if (!error)
289 gotnoname:
290 			error = copyout((caddr_t)&namelen,
291 			    (caddr_t)uap->anamelen, sizeof (*uap->anamelen));
292 	}
293 	FREE(sa, M_SONAME);
294 	splx(s);
295 	return (error);
296 }
297 
/*
 * accept() system call: accept1() with the compat flag clear, so the
 * peer address is returned in the current sockaddr layout.
 */
int
accept(p, uap)
	struct proc *p;
	struct accept_args *uap;
{
	int error;

	error = accept1(p, uap, 0);
	return (error);
}
306 
307 #ifdef COMPAT_OLDSOCK
/*
 * Old-ABI accept(): accept1() with the compat flag set, so the peer
 * address is returned in the old sockaddr layout.
 */
int
oaccept(p, uap)
	struct proc *p;
	struct accept_args *uap;
{
	int error;

	error = accept1(p, uap, 1);
	return (error);
}
316 #endif /* COMPAT_OLDSOCK */
317 
318 /* ARGSUSED */
319 int
320 connect(p, uap)
321 	struct proc *p;
322 	register struct connect_args /* {
323 		int	s;
324 		caddr_t	name;
325 		int	namelen;
326 	} */ *uap;
327 {
328 	struct file *fp;
329 	register struct socket *so;
330 	struct sockaddr *sa;
331 	int error, s;
332 
333 	error = getsock(p->p_fd, uap->s, &fp);
334 	if (error)
335 		return (error);
336 	so = (struct socket *)fp->f_data;
337 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING))
338 		return (EALREADY);
339 	error = getsockaddr(&sa, uap->name, uap->namelen);
340 	if (error)
341 		return (error);
342 	error = soconnect(so, sa, p);
343 	if (error)
344 		goto bad;
345 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
346 		FREE(sa, M_SONAME);
347 		return (EINPROGRESS);
348 	}
349 	s = splnet();
350 	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
351 		error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH,
352 		    "connec", 0);
353 		if (error)
354 			break;
355 	}
356 	if (error == 0) {
357 		error = so->so_error;
358 		so->so_error = 0;
359 	}
360 	splx(s);
361 bad:
362 	so->so_state &= ~SS_ISCONNECTING;
363 	FREE(sa, M_SONAME);
364 	if (error == ERESTART)
365 		error = EINTR;
366 	return (error);
367 }
368 
369 int
370 socketpair(p, uap)
371 	struct proc *p;
372 	register struct socketpair_args /* {
373 		int	domain;
374 		int	type;
375 		int	protocol;
376 		int	*rsv;
377 	} */ *uap;
378 {
379 	register struct filedesc *fdp = p->p_fd;
380 	struct file *fp1, *fp2;
381 	struct socket *so1, *so2;
382 	int fd, error, sv[2];
383 
384 	error = socreate(uap->domain, &so1, uap->type, uap->protocol, p);
385 	if (error)
386 		return (error);
387 	error = socreate(uap->domain, &so2, uap->type, uap->protocol, p);
388 	if (error)
389 		goto free1;
390 	error = falloc(p, &fp1, &fd);
391 	if (error)
392 		goto free2;
393 	sv[0] = fd;
394 	fp1->f_flag = FREAD|FWRITE;
395 	fp1->f_type = DTYPE_SOCKET;
396 	fp1->f_ops = &socketops;
397 	fp1->f_data = (caddr_t)so1;
398 	error = falloc(p, &fp2, &fd);
399 	if (error)
400 		goto free3;
401 	fp2->f_flag = FREAD|FWRITE;
402 	fp2->f_type = DTYPE_SOCKET;
403 	fp2->f_ops = &socketops;
404 	fp2->f_data = (caddr_t)so2;
405 	sv[1] = fd;
406 	error = soconnect2(so1, so2);
407 	if (error)
408 		goto free4;
409 	if (uap->type == SOCK_DGRAM) {
410 		/*
411 		 * Datagram socket connection is asymmetric.
412 		 */
413 		 error = soconnect2(so2, so1);
414 		 if (error)
415 			goto free4;
416 	}
417 	error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int));
418 	return (error);
419 free4:
420 	ffree(fp2);
421 	fdp->fd_ofiles[sv[1]] = 0;
422 free3:
423 	ffree(fp1);
424 	fdp->fd_ofiles[sv[0]] = 0;
425 free2:
426 	(void)soclose(so2);
427 free1:
428 	(void)soclose(so1);
429 	return (error);
430 }
431 
432 static int
433 sendit(p, s, mp, flags)
434 	register struct proc *p;
435 	int s;
436 	register struct msghdr *mp;
437 	int flags;
438 {
439 	struct file *fp;
440 	struct uio auio;
441 	register struct iovec *iov;
442 	register int i;
443 	struct mbuf *control;
444 	struct sockaddr *to;
445 	int len, error;
446 	struct socket *so;
447 #ifdef KTRACE
448 	struct iovec *ktriov = NULL;
449 #endif
450 
451 	error = getsock(p->p_fd, s, &fp);
452 	if (error)
453 		return (error);
454 	auio.uio_iov = mp->msg_iov;
455 	auio.uio_iovcnt = mp->msg_iovlen;
456 	auio.uio_segflg = UIO_USERSPACE;
457 	auio.uio_rw = UIO_WRITE;
458 	auio.uio_procp = p;
459 	auio.uio_offset = 0;			/* XXX */
460 	auio.uio_resid = 0;
461 	iov = mp->msg_iov;
462 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
463 		if ((auio.uio_resid += iov->iov_len) < 0)
464 			return (EINVAL);
465 	}
466 	if (mp->msg_name) {
467 		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
468 		if (error)
469 			return (error);
470 	} else
471 		to = 0;
472 	if (mp->msg_control) {
473 		if (mp->msg_controllen < sizeof(struct cmsghdr)
474 #ifdef COMPAT_OLDSOCK
475 		    && mp->msg_flags != MSG_COMPAT
476 #endif
477 		) {
478 			error = EINVAL;
479 			goto bad;
480 		}
481 		error = sockargs(&control, mp->msg_control,
482 		    mp->msg_controllen, MT_CONTROL);
483 		if (error)
484 			goto bad;
485 #ifdef COMPAT_OLDSOCK
486 		if (mp->msg_flags == MSG_COMPAT) {
487 			register struct cmsghdr *cm;
488 
489 			M_PREPEND(control, sizeof(*cm), M_WAIT);
490 			if (control == 0) {
491 				error = ENOBUFS;
492 				goto bad;
493 			} else {
494 				cm = mtod(control, struct cmsghdr *);
495 				cm->cmsg_len = control->m_len;
496 				cm->cmsg_level = SOL_SOCKET;
497 				cm->cmsg_type = SCM_RIGHTS;
498 			}
499 		}
500 #endif
501 	} else
502 		control = 0;
503 #ifdef KTRACE
504 	if (KTRPOINT(p, KTR_GENIO)) {
505 		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
506 
507 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
508 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
509 	}
510 #endif
511 	len = auio.uio_resid;
512 	so = (struct socket *)fp->f_data;
513 	error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control,
514 						     flags, p);
515 	if (error) {
516 		if (auio.uio_resid != len && (error == ERESTART ||
517 		    error == EINTR || error == EWOULDBLOCK))
518 			error = 0;
519 		if (error == EPIPE)
520 			psignal(p, SIGPIPE);
521 	}
522 	if (error == 0)
523 		p->p_retval[0] = len - auio.uio_resid;
524 #ifdef KTRACE
525 	if (ktriov != NULL) {
526 		if (error == 0)
527 			ktrgenio(p->p_tracep, s, UIO_WRITE,
528 				ktriov, p->p_retval[0], error);
529 		FREE(ktriov, M_TEMP);
530 	}
531 #endif
532 bad:
533 	if (to)
534 		FREE(to, M_SONAME);
535 	return (error);
536 }
537 
538 int
539 sendto(p, uap)
540 	struct proc *p;
541 	register struct sendto_args /* {
542 		int	s;
543 		caddr_t	buf;
544 		size_t	len;
545 		int	flags;
546 		caddr_t	to;
547 		int	tolen;
548 	} */ *uap;
549 {
550 	struct msghdr msg;
551 	struct iovec aiov;
552 
553 	msg.msg_name = uap->to;
554 	msg.msg_namelen = uap->tolen;
555 	msg.msg_iov = &aiov;
556 	msg.msg_iovlen = 1;
557 	msg.msg_control = 0;
558 #ifdef COMPAT_OLDSOCK
559 	msg.msg_flags = 0;
560 #endif
561 	aiov.iov_base = uap->buf;
562 	aiov.iov_len = uap->len;
563 	return (sendit(p, uap->s, &msg, uap->flags));
564 }
565 
566 #ifdef COMPAT_OLDSOCK
567 int
568 osend(p, uap)
569 	struct proc *p;
570 	register struct osend_args /* {
571 		int	s;
572 		caddr_t	buf;
573 		int	len;
574 		int	flags;
575 	} */ *uap;
576 {
577 	struct msghdr msg;
578 	struct iovec aiov;
579 
580 	msg.msg_name = 0;
581 	msg.msg_namelen = 0;
582 	msg.msg_iov = &aiov;
583 	msg.msg_iovlen = 1;
584 	aiov.iov_base = uap->buf;
585 	aiov.iov_len = uap->len;
586 	msg.msg_control = 0;
587 	msg.msg_flags = 0;
588 	return (sendit(p, uap->s, &msg, uap->flags));
589 }
590 
591 int
592 osendmsg(p, uap)
593 	struct proc *p;
594 	register struct osendmsg_args /* {
595 		int	s;
596 		caddr_t	msg;
597 		int	flags;
598 	} */ *uap;
599 {
600 	struct msghdr msg;
601 	struct iovec aiov[UIO_SMALLIOV], *iov;
602 	int error;
603 
604 	error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr));
605 	if (error)
606 		return (error);
607 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
608 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
609 			return (EMSGSIZE);
610 		MALLOC(iov, struct iovec *,
611 		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
612 		      M_WAITOK);
613 	} else
614 		iov = aiov;
615 	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
616 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
617 	if (error)
618 		goto done;
619 	msg.msg_flags = MSG_COMPAT;
620 	msg.msg_iov = iov;
621 	error = sendit(p, uap->s, &msg, uap->flags);
622 done:
623 	if (iov != aiov)
624 		FREE(iov, M_IOV);
625 	return (error);
626 }
627 #endif
628 
629 int
630 sendmsg(p, uap)
631 	struct proc *p;
632 	register struct sendmsg_args /* {
633 		int	s;
634 		caddr_t	msg;
635 		int	flags;
636 	} */ *uap;
637 {
638 	struct msghdr msg;
639 	struct iovec aiov[UIO_SMALLIOV], *iov;
640 	int error;
641 
642 	error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg));
643 	if (error)
644 		return (error);
645 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
646 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
647 			return (EMSGSIZE);
648 		MALLOC(iov, struct iovec *,
649 		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
650 		       M_WAITOK);
651 	} else
652 		iov = aiov;
653 	if (msg.msg_iovlen &&
654 	    (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
655 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
656 		goto done;
657 	msg.msg_iov = iov;
658 #ifdef COMPAT_OLDSOCK
659 	msg.msg_flags = 0;
660 #endif
661 	error = sendit(p, uap->s, &msg, uap->flags);
662 done:
663 	if (iov != aiov)
664 		FREE(iov, M_IOV);
665 	return (error);
666 }
667 
668 static int
669 recvit(p, s, mp, namelenp)
670 	register struct proc *p;
671 	int s;
672 	register struct msghdr *mp;
673 	caddr_t namelenp;
674 {
675 	struct file *fp;
676 	struct uio auio;
677 	register struct iovec *iov;
678 	register int i;
679 	int len, error;
680 	struct mbuf *m, *control = 0;
681 	caddr_t ctlbuf;
682 	struct socket *so;
683 	struct sockaddr *fromsa = 0;
684 #ifdef KTRACE
685 	struct iovec *ktriov = NULL;
686 #endif
687 
688 	error = getsock(p->p_fd, s, &fp);
689 	if (error)
690 		return (error);
691 	auio.uio_iov = mp->msg_iov;
692 	auio.uio_iovcnt = mp->msg_iovlen;
693 	auio.uio_segflg = UIO_USERSPACE;
694 	auio.uio_rw = UIO_READ;
695 	auio.uio_procp = p;
696 	auio.uio_offset = 0;			/* XXX */
697 	auio.uio_resid = 0;
698 	iov = mp->msg_iov;
699 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
700 		if ((auio.uio_resid += iov->iov_len) < 0)
701 			return (EINVAL);
702 	}
703 #ifdef KTRACE
704 	if (KTRPOINT(p, KTR_GENIO)) {
705 		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
706 
707 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
708 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
709 	}
710 #endif
711 	len = auio.uio_resid;
712 	so = (struct socket *)fp->f_data;
713 	error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
714 	    (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
715 	    &mp->msg_flags);
716 	if (error) {
717 		if (auio.uio_resid != len && (error == ERESTART ||
718 		    error == EINTR || error == EWOULDBLOCK))
719 			error = 0;
720 	}
721 #ifdef KTRACE
722 	if (ktriov != NULL) {
723 		if (error == 0)
724 			ktrgenio(p->p_tracep, s, UIO_READ,
725 				ktriov, len - auio.uio_resid, error);
726 		FREE(ktriov, M_TEMP);
727 	}
728 #endif
729 	if (error)
730 		goto out;
731 	p->p_retval[0] = len - auio.uio_resid;
732 	if (mp->msg_name) {
733 		len = mp->msg_namelen;
734 		if (len <= 0 || fromsa == 0)
735 			len = 0;
736 		else {
737 #ifndef MIN
738 #define MIN(a,b) ((a)>(b)?(b):(a))
739 #endif
740 			/* save sa_len before it is destroyed by MSG_COMPAT */
741 			len = MIN(len, fromsa->sa_len);
742 #ifdef COMPAT_OLDSOCK
743 			if (mp->msg_flags & MSG_COMPAT)
744 				((struct osockaddr *)fromsa)->sa_family =
745 				    fromsa->sa_family;
746 #endif
747 			error = copyout(fromsa,
748 			    (caddr_t)mp->msg_name, (unsigned)len);
749 			if (error)
750 				goto out;
751 		}
752 		mp->msg_namelen = len;
753 		if (namelenp &&
754 		    (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) {
755 #ifdef COMPAT_OLDSOCK
756 			if (mp->msg_flags & MSG_COMPAT)
757 				error = 0;	/* old recvfrom didn't check */
758 			else
759 #endif
760 			goto out;
761 		}
762 	}
763 	if (mp->msg_control) {
764 #ifdef COMPAT_OLDSOCK
765 		/*
766 		 * We assume that old recvmsg calls won't receive access
767 		 * rights and other control info, esp. as control info
768 		 * is always optional and those options didn't exist in 4.3.
769 		 * If we receive rights, trim the cmsghdr; anything else
770 		 * is tossed.
771 		 */
772 		if (control && mp->msg_flags & MSG_COMPAT) {
773 			if (mtod(control, struct cmsghdr *)->cmsg_level !=
774 			    SOL_SOCKET ||
775 			    mtod(control, struct cmsghdr *)->cmsg_type !=
776 			    SCM_RIGHTS) {
777 				mp->msg_controllen = 0;
778 				goto out;
779 			}
780 			control->m_len -= sizeof (struct cmsghdr);
781 			control->m_data += sizeof (struct cmsghdr);
782 		}
783 #endif
784 		len = mp->msg_controllen;
785 		m = control;
786 		mp->msg_controllen = 0;
787 		ctlbuf = (caddr_t) mp->msg_control;
788 
789 		while (m && len > 0) {
790 			unsigned int tocopy;
791 
792 			if (len >= m->m_len)
793 				tocopy = m->m_len;
794 			else {
795 				mp->msg_flags |= MSG_CTRUNC;
796 				tocopy = len;
797 			}
798 
799 			if (error = copyout((caddr_t)mtod(m, caddr_t),
800 					ctlbuf, tocopy))
801 				goto out;
802 
803 			ctlbuf += tocopy;
804 			len -= tocopy;
805 			m = m->m_next;
806 		}
807 		mp->msg_controllen = ctlbuf - mp->msg_control;
808 	}
809 out:
810 	if (fromsa)
811 		FREE(fromsa, M_SONAME);
812 	if (control)
813 		m_freem(control);
814 	return (error);
815 }
816 
817 int
818 recvfrom(p, uap)
819 	struct proc *p;
820 	register struct recvfrom_args /* {
821 		int	s;
822 		caddr_t	buf;
823 		size_t	len;
824 		int	flags;
825 		caddr_t	from;
826 		int	*fromlenaddr;
827 	} */ *uap;
828 {
829 	struct msghdr msg;
830 	struct iovec aiov;
831 	int error;
832 
833 	if (uap->fromlenaddr) {
834 		error = copyin((caddr_t)uap->fromlenaddr,
835 		    (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen));
836 		if (error)
837 			return (error);
838 	} else
839 		msg.msg_namelen = 0;
840 	msg.msg_name = uap->from;
841 	msg.msg_iov = &aiov;
842 	msg.msg_iovlen = 1;
843 	aiov.iov_base = uap->buf;
844 	aiov.iov_len = uap->len;
845 	msg.msg_control = 0;
846 	msg.msg_flags = uap->flags;
847 	return (recvit(p, uap->s, &msg, (caddr_t)uap->fromlenaddr));
848 }
849 
850 #ifdef COMPAT_OLDSOCK
851 int
852 orecvfrom(p, uap)
853 	struct proc *p;
854 	struct recvfrom_args *uap;
855 {
856 
857 	uap->flags |= MSG_COMPAT;
858 	return (recvfrom(p, uap));
859 }
860 #endif
861 
862 
863 #ifdef COMPAT_OLDSOCK
864 int
865 orecv(p, uap)
866 	struct proc *p;
867 	register struct orecv_args /* {
868 		int	s;
869 		caddr_t	buf;
870 		int	len;
871 		int	flags;
872 	} */ *uap;
873 {
874 	struct msghdr msg;
875 	struct iovec aiov;
876 
877 	msg.msg_name = 0;
878 	msg.msg_namelen = 0;
879 	msg.msg_iov = &aiov;
880 	msg.msg_iovlen = 1;
881 	aiov.iov_base = uap->buf;
882 	aiov.iov_len = uap->len;
883 	msg.msg_control = 0;
884 	msg.msg_flags = uap->flags;
885 	return (recvit(p, uap->s, &msg, (caddr_t)0));
886 }
887 
888 /*
889  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
890  * overlays the new one, missing only the flags, and with the (old) access
891  * rights where the control fields are now.
892  */
893 int
894 orecvmsg(p, uap)
895 	struct proc *p;
896 	register struct orecvmsg_args /* {
897 		int	s;
898 		struct	omsghdr *msg;
899 		int	flags;
900 	} */ *uap;
901 {
902 	struct msghdr msg;
903 	struct iovec aiov[UIO_SMALLIOV], *iov;
904 	int error;
905 
906 	error = copyin((caddr_t)uap->msg, (caddr_t)&msg,
907 	    sizeof (struct omsghdr));
908 	if (error)
909 		return (error);
910 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
911 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
912 			return (EMSGSIZE);
913 		MALLOC(iov, struct iovec *,
914 		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
915 		      M_WAITOK);
916 	} else
917 		iov = aiov;
918 	msg.msg_flags = uap->flags | MSG_COMPAT;
919 	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
920 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
921 	if (error)
922 		goto done;
923 	msg.msg_iov = iov;
924 	error = recvit(p, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen);
925 
926 	if (msg.msg_controllen && error == 0)
927 		error = copyout((caddr_t)&msg.msg_controllen,
928 		    (caddr_t)&uap->msg->msg_accrightslen, sizeof (int));
929 done:
930 	if (iov != aiov)
931 		FREE(iov, M_IOV);
932 	return (error);
933 }
934 #endif
935 
936 int
937 recvmsg(p, uap)
938 	struct proc *p;
939 	register struct recvmsg_args /* {
940 		int	s;
941 		struct	msghdr *msg;
942 		int	flags;
943 	} */ *uap;
944 {
945 	struct msghdr msg;
946 	struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
947 	register int error;
948 
949 	error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg));
950 	if (error)
951 		return (error);
952 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
953 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
954 			return (EMSGSIZE);
955 		MALLOC(iov, struct iovec *,
956 		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
957 		       M_WAITOK);
958 	} else
959 		iov = aiov;
960 #ifdef COMPAT_OLDSOCK
961 	msg.msg_flags = uap->flags &~ MSG_COMPAT;
962 #else
963 	msg.msg_flags = uap->flags;
964 #endif
965 	uiov = msg.msg_iov;
966 	msg.msg_iov = iov;
967 	error = copyin((caddr_t)uiov, (caddr_t)iov,
968 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
969 	if (error)
970 		goto done;
971 	error = recvit(p, uap->s, &msg, (caddr_t)0);
972 	if (!error) {
973 		msg.msg_iov = uiov;
974 		error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg));
975 	}
976 done:
977 	if (iov != aiov)
978 		FREE(iov, M_IOV);
979 	return (error);
980 }
981 
982 /* ARGSUSED */
983 int
984 shutdown(p, uap)
985 	struct proc *p;
986 	register struct shutdown_args /* {
987 		int	s;
988 		int	how;
989 	} */ *uap;
990 {
991 	struct file *fp;
992 	int error;
993 
994 	error = getsock(p->p_fd, uap->s, &fp);
995 	if (error)
996 		return (error);
997 	return (soshutdown((struct socket *)fp->f_data, uap->how));
998 }
999 
1000 /* ARGSUSED */
1001 int
1002 setsockopt(p, uap)
1003 	struct proc *p;
1004 	register struct setsockopt_args /* {
1005 		int	s;
1006 		int	level;
1007 		int	name;
1008 		caddr_t	val;
1009 		int	valsize;
1010 	} */ *uap;
1011 {
1012 	struct file *fp;
1013 	struct sockopt sopt;
1014 	int error;
1015 
1016 	if (uap->val == 0 && uap->valsize != 0)
1017 		return (EFAULT);
1018 	if (uap->valsize < 0)
1019 		return (EINVAL);
1020 
1021 	error = getsock(p->p_fd, uap->s, &fp);
1022 	if (error)
1023 		return (error);
1024 
1025 	sopt.sopt_dir = SOPT_SET;
1026 	sopt.sopt_level = uap->level;
1027 	sopt.sopt_name = uap->name;
1028 	sopt.sopt_val = uap->val;
1029 	sopt.sopt_valsize = uap->valsize;
1030 	sopt.sopt_p = p;
1031 
1032 	return (sosetopt((struct socket *)fp->f_data, &sopt));
1033 }
1034 
1035 /* ARGSUSED */
1036 int
1037 getsockopt(p, uap)
1038 	struct proc *p;
1039 	register struct getsockopt_args /* {
1040 		int	s;
1041 		int	level;
1042 		int	name;
1043 		caddr_t	val;
1044 		int	*avalsize;
1045 	} */ *uap;
1046 {
1047 	int	valsize, error;
1048 	struct	file *fp;
1049 	struct	sockopt sopt;
1050 
1051 	error = getsock(p->p_fd, uap->s, &fp);
1052 	if (error)
1053 		return (error);
1054 	if (uap->val) {
1055 		error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize,
1056 		    sizeof (valsize));
1057 		if (error)
1058 			return (error);
1059 		if (valsize < 0)
1060 			return (EINVAL);
1061 	} else
1062 		valsize = 0;
1063 
1064 	sopt.sopt_dir = SOPT_GET;
1065 	sopt.sopt_level = uap->level;
1066 	sopt.sopt_name = uap->name;
1067 	sopt.sopt_val = uap->val;
1068 	sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
1069 	sopt.sopt_p = p;
1070 
1071 	error = sogetopt((struct socket *)fp->f_data, &sopt);
1072 	if (error == 0) {
1073 		valsize = sopt.sopt_valsize;
1074 		error = copyout((caddr_t)&valsize,
1075 				(caddr_t)uap->avalsize, sizeof (valsize));
1076 	}
1077 	return (error);
1078 }
1079 
1080 /*
1081  * Get socket name.
1082  */
1083 /* ARGSUSED */
1084 static int
1085 getsockname1(p, uap, compat)
1086 	struct proc *p;
1087 	register struct getsockname_args /* {
1088 		int	fdes;
1089 		caddr_t	asa;
1090 		int	*alen;
1091 	} */ *uap;
1092 	int compat;
1093 {
1094 	struct file *fp;
1095 	register struct socket *so;
1096 	struct sockaddr *sa;
1097 	int len, error;
1098 
1099 	error = getsock(p->p_fd, uap->fdes, &fp);
1100 	if (error)
1101 		return (error);
1102 	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1103 	if (error)
1104 		return (error);
1105 	so = (struct socket *)fp->f_data;
1106 	sa = 0;
1107 	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
1108 	if (error)
1109 		goto bad;
1110 	if (sa == 0) {
1111 		len = 0;
1112 		goto gotnothing;
1113 	}
1114 
1115 	len = MIN(len, sa->sa_len);
1116 #ifdef COMPAT_OLDSOCK
1117 	if (compat)
1118 		((struct osockaddr *)sa)->sa_family = sa->sa_family;
1119 #endif
1120 	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1121 	if (error == 0)
1122 gotnothing:
1123 		error = copyout((caddr_t)&len, (caddr_t)uap->alen,
1124 		    sizeof (len));
1125 bad:
1126 	if (sa)
1127 		FREE(sa, M_SONAME);
1128 	return (error);
1129 }
1130 
/*
 * getsockname() system call: getsockname1() with the compat flag clear,
 * so the address is returned in the current sockaddr layout.
 */
int
getsockname(p, uap)
	struct proc *p;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(p, uap, 0);
	return (error);
}
1139 
1140 #ifdef COMPAT_OLDSOCK
/*
 * Old-ABI getsockname(): getsockname1() with the compat flag set, so
 * the address is returned in the old sockaddr layout.
 */
int
ogetsockname(p, uap)
	struct proc *p;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(p, uap, 1);
	return (error);
}
1149 #endif /* COMPAT_OLDSOCK */
1150 
1151 /*
1152  * Get name of peer for connected socket.
1153  */
1154 /* ARGSUSED */
1155 static int
1156 getpeername1(p, uap, compat)
1157 	struct proc *p;
1158 	register struct getpeername_args /* {
1159 		int	fdes;
1160 		caddr_t	asa;
1161 		int	*alen;
1162 	} */ *uap;
1163 	int compat;
1164 {
1165 	struct file *fp;
1166 	register struct socket *so;
1167 	struct sockaddr *sa;
1168 	int len, error;
1169 
1170 	error = getsock(p->p_fd, uap->fdes, &fp);
1171 	if (error)
1172 		return (error);
1173 	so = (struct socket *)fp->f_data;
1174 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0)
1175 		return (ENOTCONN);
1176 	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1177 	if (error)
1178 		return (error);
1179 	sa = 0;
1180 	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
1181 	if (error)
1182 		goto bad;
1183 	if (sa == 0) {
1184 		len = 0;
1185 		goto gotnothing;
1186 	}
1187 	len = MIN(len, sa->sa_len);
1188 #ifdef COMPAT_OLDSOCK
1189 	if (compat)
1190 		((struct osockaddr *)sa)->sa_family =
1191 		    sa->sa_family;
1192 #endif
1193 	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1194 	if (error)
1195 		goto bad;
1196 gotnothing:
1197 	error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len));
1198 bad:
1199 	if (sa) FREE(sa, M_SONAME);
1200 	return (error);
1201 }
1202 
/*
 * getpeername() system call: getpeername1() with the compat flag clear,
 * so the address is returned in the current sockaddr layout.
 */
int
getpeername(p, uap)
	struct proc *p;
	struct getpeername_args *uap;
{
	int error;

	error = getpeername1(p, uap, 0);
	return (error);
}
1211 
1212 #ifdef COMPAT_OLDSOCK
/*
 * Old-ABI getpeername(): getpeername1() with the compat flag set, so
 * the address is returned in the old sockaddr layout.
 */
int
ogetpeername(p, uap)
	struct proc *p;
	struct ogetpeername_args *uap;
{
	struct getpeername_args *args;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	args = (struct getpeername_args *)uap;
	return (getpeername1(p, args, 1));
}
1222 #endif /* COMPAT_OLDSOCK */
1223 
1224 int
1225 sockargs(mp, buf, buflen, type)
1226 	struct mbuf **mp;
1227 	caddr_t buf;
1228 	int buflen, type;
1229 {
1230 	register struct sockaddr *sa;
1231 	register struct mbuf *m;
1232 	int error;
1233 
1234 	if ((u_int)buflen > MLEN) {
1235 #ifdef COMPAT_OLDSOCK
1236 		if (type == MT_SONAME && (u_int)buflen <= 112)
1237 			buflen = MLEN;		/* unix domain compat. hack */
1238 		else
1239 #endif
1240 		return (EINVAL);
1241 	}
1242 	m = m_get(M_WAIT, type);
1243 	if (m == NULL)
1244 		return (ENOBUFS);
1245 	m->m_len = buflen;
1246 	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1247 	if (error)
1248 		(void) m_free(m);
1249 	else {
1250 		*mp = m;
1251 		if (type == MT_SONAME) {
1252 			sa = mtod(m, struct sockaddr *);
1253 
1254 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1255 			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1256 				sa->sa_family = sa->sa_len;
1257 #endif
1258 			sa->sa_len = buflen;
1259 		}
1260 	}
1261 	return (error);
1262 }
1263 
1264 int
1265 getsockaddr(namp, uaddr, len)
1266 	struct sockaddr **namp;
1267 	caddr_t uaddr;
1268 	size_t len;
1269 {
1270 	struct sockaddr *sa;
1271 	int error;
1272 
1273 	if (len > SOCK_MAXADDRLEN)
1274 		return ENAMETOOLONG;
1275 	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1276 	error = copyin(uaddr, sa, len);
1277 	if (error) {
1278 		FREE(sa, M_SONAME);
1279 	} else {
1280 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1281 		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1282 			sa->sa_family = sa->sa_len;
1283 #endif
1284 		sa->sa_len = len;
1285 		*namp = sa;
1286 	}
1287 	return error;
1288 }
1289 
1290 int
1291 getsock(fdp, fdes, fpp)
1292 	struct filedesc *fdp;
1293 	int fdes;
1294 	struct file **fpp;
1295 {
1296 	register struct file *fp;
1297 
1298 	if ((unsigned)fdes >= fdp->fd_nfiles ||
1299 	    (fp = fdp->fd_ofiles[fdes]) == NULL)
1300 		return (EBADF);
1301 	if (fp->f_type != DTYPE_SOCKET)
1302 		return (ENOTSOCK);
1303 	*fpp = fp;
1304 	return (0);
1305 }
1306 
1307 /*
1308  * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
1309  * XXX - The sf_buf functions are currently private to sendfile(2), so have
1310  * been made static, but may be useful in the future for doing zero-copy in
1311  * other parts of the networking code.
1312  */
1313 static void
1314 sf_buf_init(void *arg)
1315 {
1316 	int i;
1317 
1318 	SLIST_INIT(&sf_freelist);
1319 	sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
1320 	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, M_NOWAIT);
1321 	bzero(sf_bufs, nsfbufs * sizeof(struct sf_buf));
1322 	for (i = 0; i < nsfbufs; i++) {
1323 		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
1324 		SLIST_INSERT_HEAD(&sf_freelist, &sf_bufs[i], free_list);
1325 	}
1326 }
1327 
1328 /*
1329  * Get an sf_buf from the freelist. Will block if none are available.
1330  */
1331 static struct sf_buf *
1332 sf_buf_alloc()
1333 {
1334 	struct sf_buf *sf;
1335 	int s;
1336 
1337 	s = splimp();
1338 	while ((sf = SLIST_FIRST(&sf_freelist)) == NULL) {
1339 		sf_buf_alloc_want = 1;
1340 		tsleep(&sf_freelist, PVM, "sfbufa", 0);
1341 	}
1342 	SLIST_REMOVE_HEAD(&sf_freelist, free_list);
1343 	splx(s);
1344 	sf->refcnt = 1;
1345 	return (sf);
1346 }
1347 
1348 #define dtosf(x)	(&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])
1349 static void
1350 sf_buf_ref(caddr_t addr, u_int size)
1351 {
1352 	struct sf_buf *sf;
1353 
1354 	sf = dtosf(addr);
1355 	if (sf->refcnt == 0)
1356 		panic("sf_buf_ref: referencing a free sf_buf");
1357 	sf->refcnt++;
1358 }
1359 
1360 /*
1361  * Lose a reference to an sf_buf. When none left, detach mapped page
1362  * and release resources back to the system.
1363  *
1364  * Must be called at splimp.
1365  */
1366 static void
1367 sf_buf_free(caddr_t addr, u_int size)
1368 {
1369 	struct sf_buf *sf;
1370 	struct vm_page *m;
1371 	int s;
1372 
1373 	sf = dtosf(addr);
1374 	if (sf->refcnt == 0)
1375 		panic("sf_buf_free: freeing free sf_buf");
1376 	sf->refcnt--;
1377 	if (sf->refcnt == 0) {
1378 		pmap_qremove((vm_offset_t)addr, 1);
1379 		m = sf->m;
1380 		s = splvm();
1381 		vm_page_unwire(m, 0);
1382 		/*
1383 		 * Check for the object going away on us. This can
1384 		 * happen since we don't hold a reference to it.
1385 		 * If so, we're responsible for freeing the page.
1386 		 */
1387 		if (m->wire_count == 0 && m->object == NULL)
1388 			vm_page_free(m);
1389 		splx(s);
1390 		sf->m = NULL;
1391 		SLIST_INSERT_HEAD(&sf_freelist, sf, free_list);
1392 		if (sf_buf_alloc_want) {
1393 			sf_buf_alloc_want = 0;
1394 			wakeup(&sf_freelist);
1395 		}
1396 	}
1397 }
1398 
1399 /*
1400  * sendfile(2).
1401  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
1402  *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
1403  *
1404  * Send a file specified by 'fd' and starting at 'offset' to a socket
1405  * specified by 's'. Send only 'nbytes' of the file or until EOF if
1406  * nbytes == 0. Optionally add a header and/or trailer to the socket
1407  * output. If specified, write the total number of bytes sent into *sbytes.
1408  */
1409 int
1410 sendfile(struct proc *p, struct sendfile_args *uap)
1411 {
1412 	struct file *fp;
1413 	struct filedesc *fdp = p->p_fd;
1414 	struct vnode *vp;
1415 	struct vm_object *obj;
1416 	struct socket *so;
1417 	struct mbuf *m;
1418 	struct sf_buf *sf;
1419 	struct vm_page *pg;
1420 	struct writev_args nuap;
1421 	struct sf_hdtr hdtr;
1422 	off_t off, xfsize, sbytes = 0;
1423 	int error = 0, s;
1424 
1425 	/*
1426 	 * Do argument checking. Must be a regular file in, stream
1427 	 * type and connected socket out, positive offset.
1428 	 */
1429 	if (((u_int)uap->fd) >= fdp->fd_nfiles ||
1430 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
1431 	    (fp->f_flag & FREAD) == 0) {
1432 		error = EBADF;
1433 		goto done;
1434 	}
1435 	if (fp->f_type != DTYPE_VNODE) {
1436 		error = EINVAL;
1437 		goto done;
1438 	}
1439 	vp = (struct vnode *)fp->f_data;
1440 	obj = vp->v_object;
1441 	if (vp->v_type != VREG || obj == NULL) {
1442 		error = EINVAL;
1443 		goto done;
1444 	}
1445 	error = getsock(p->p_fd, uap->s, &fp);
1446 	if (error)
1447 		goto done;
1448 	so = (struct socket *)fp->f_data;
1449 	if (so->so_type != SOCK_STREAM) {
1450 		error = EINVAL;
1451 		goto done;
1452 	}
1453 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1454 		error = ENOTCONN;
1455 		goto done;
1456 	}
1457 	if (uap->offset < 0) {
1458 		error = EINVAL;
1459 		goto done;
1460 	}
1461 
1462 	/*
1463 	 * If specified, get the pointer to the sf_hdtr struct for
1464 	 * any headers/trailers.
1465 	 */
1466 	if (uap->hdtr != NULL) {
1467 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1468 		if (error)
1469 			goto done;
1470 		/*
1471 		 * Send any headers. Wimp out and use writev(2).
1472 		 */
1473 		if (hdtr.headers != NULL) {
1474 			nuap.fd = uap->s;
1475 			nuap.iovp = hdtr.headers;
1476 			nuap.iovcnt = hdtr.hdr_cnt;
1477 			error = writev(p, &nuap);
1478 			if (error)
1479 				goto done;
1480 			sbytes += p->p_retval[0];
1481 		}
1482 	}
1483 
1484 	/*
1485 	 * Protect against multiple writers to the socket.
1486 	 */
1487 	(void) sblock(&so->so_snd, M_WAITOK);
1488 
1489 	/*
1490 	 * Loop through the pages in the file, starting with the requested
1491 	 * offset. Get a file page (do I/O if necessary), map the file page
1492 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1493 	 * it on the socket.
1494 	 */
1495 	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
1496 		vm_pindex_t pindex;
1497 		vm_offset_t pgoff;
1498 
1499 		pindex = OFF_TO_IDX(off);
1500 retry_lookup:
1501 		/*
1502 		 * Calculate the amount to transfer. Not to exceed a page,
1503 		 * the EOF, or the passed in nbytes.
1504 		 */
1505 		xfsize = obj->un_pager.vnp.vnp_size - off;
1506 		if (xfsize > PAGE_SIZE)
1507 			xfsize = PAGE_SIZE;
1508 		pgoff = (vm_offset_t)(off & PAGE_MASK);
1509 		if (PAGE_SIZE - pgoff < xfsize)
1510 			xfsize = PAGE_SIZE - pgoff;
1511 		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
1512 			xfsize = uap->nbytes - sbytes;
1513 		if (xfsize <= 0)
1514 			break;
1515 		/*
1516 		 * Optimize the non-blocking case by looking at the socket space
1517 		 * before going to the extra work of constituting the sf_buf.
1518 		 */
1519 		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
1520 			if (so->so_state & SS_CANTSENDMORE)
1521 				error = EPIPE;
1522 			else
1523 				error = EAGAIN;
1524 			sbunlock(&so->so_snd);
1525 			goto done;
1526 		}
1527 		/*
1528 		 * Attempt to look up the page. If the page doesn't exist or the
1529 		 * part we're interested in isn't valid, then read it from disk.
1530 		 * If some other part of the kernel has this page (i.e. it's busy),
1531 		 * then disk I/O may be occuring on it, so wait and retry.
1532 		 */
1533 		pg = vm_page_lookup(obj, pindex);
1534 		if (pg == NULL || (!(pg->flags & PG_BUSY) && !pg->busy &&
1535 		    !vm_page_is_valid(pg, pgoff, xfsize))) {
1536 			struct uio auio;
1537 			struct iovec aiov;
1538 			int bsize;
1539 
1540 			if (pg == NULL) {
1541 				pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
1542 				if (pg == NULL) {
1543 					VM_WAIT;
1544 					goto retry_lookup;
1545 				}
1546 				vm_page_flag_clear(pg, PG_BUSY);
1547 			}
1548 			/*
1549 			 * Ensure that our page is still around when the I/O completes.
1550 			 */
1551 			vm_page_io_start(pg);
1552 			vm_page_wire(pg);
1553 			/*
1554 			 * Get the page from backing store.
1555 			 */
1556 			bsize = vp->v_mount->mnt_stat.f_iosize;
1557 			auio.uio_iov = &aiov;
1558 			auio.uio_iovcnt = 1;
1559 			aiov.iov_base = 0;
1560 			aiov.iov_len = MAXBSIZE;
1561 			auio.uio_resid = MAXBSIZE;
1562 			auio.uio_offset = trunc_page(off);
1563 			auio.uio_segflg = UIO_NOCOPY;
1564 			auio.uio_rw = UIO_READ;
1565 			auio.uio_procp = p;
1566 			vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
1567 			error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16),
1568 			        p->p_ucred);
1569 			VOP_UNLOCK(vp, 0, p);
1570 			vm_page_flag_clear(pg, PG_ZERO);
1571 			vm_page_io_finish(pg);
1572 			if (error) {
1573 				vm_page_unwire(pg, 0);
1574 				/*
1575 				 * See if anyone else might know about this page.
1576 				 * If not and it is not valid, then free it.
1577 				 */
1578 				if (pg->wire_count == 0 && pg->valid == 0 &&
1579 				    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
1580 				    pg->hold_count == 0)
1581 					vm_page_free(pg);
1582 				sbunlock(&so->so_snd);
1583 				goto done;
1584 			}
1585 		} else {
1586 			if ((pg->flags & PG_BUSY) || pg->busy)  {
1587 				s = splvm();
1588 				if ((pg->flags & PG_BUSY) || pg->busy) {
1589 					/*
1590 					 * Page is busy. Wait and retry.
1591 					 */
1592 					vm_page_flag_set(pg, PG_WANTED);
1593 					tsleep(pg, PVM, "sfpbsy", 0);
1594 					splx(s);
1595 					goto retry_lookup;
1596 				}
1597 				splx(s);
1598 			}
1599 			/*
1600 			 * Protect from having the page ripped out from beneath us.
1601 			 */
1602 			vm_page_wire(pg);
1603 		}
1604 		/*
1605 		 * Allocate a kernel virtual page and insert the physical page
1606 		 * into it.
1607 		 */
1608 		sf = sf_buf_alloc();
1609 		sf->m = pg;
1610 		pmap_qenter(sf->kva, &pg, 1);
1611 		/*
1612 		 * Get an mbuf header and set it up as having external storage.
1613 		 */
1614 		MGETHDR(m, M_WAIT, MT_DATA);
1615 		m->m_ext.ext_free = sf_buf_free;
1616 		m->m_ext.ext_ref = sf_buf_ref;
1617 		m->m_ext.ext_buf = (void *)sf->kva;
1618 		m->m_ext.ext_size = PAGE_SIZE;
1619 		m->m_data = (char *) sf->kva + pgoff;
1620 		m->m_flags |= M_EXT;
1621 		m->m_pkthdr.len = m->m_len = xfsize;
1622 		/*
1623 		 * Add the buffer to the socket buffer chain.
1624 		 */
1625 		s = splnet();
1626 retry_space:
1627 		/*
1628 		 * Make sure that the socket is still able to take more data.
1629 		 * CANTSENDMORE being true usually means that the connection
1630 		 * was closed. so_error is true when an error was sensed after
1631 		 * a previous send.
1632 		 * The state is checked after the page mapping and buffer
1633 		 * allocation above since those operations may block and make
1634 		 * any socket checks stale. From this point forward, nothing
1635 		 * blocks before the pru_send (or more accurately, any blocking
1636 		 * results in a loop back to here to re-check).
1637 		 */
1638 		if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
1639 			if (so->so_state & SS_CANTSENDMORE) {
1640 				error = EPIPE;
1641 			} else {
1642 				error = so->so_error;
1643 				so->so_error = 0;
1644 			}
1645 			m_freem(m);
1646 			sbunlock(&so->so_snd);
1647 			splx(s);
1648 			goto done;
1649 		}
1650 		/*
1651 		 * Wait for socket space to become available. We do this just
1652 		 * after checking the connection state above in order to avoid
1653 		 * a race condition with sbwait().
1654 		 */
1655 		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
1656 			if (so->so_state & SS_NBIO) {
1657 				m_freem(m);
1658 				sbunlock(&so->so_snd);
1659 				splx(s);
1660 				error = EAGAIN;
1661 				goto done;
1662 			}
1663 			error = sbwait(&so->so_snd);
1664 			/*
1665 			 * An error from sbwait usually indicates that we've
1666 			 * been interrupted by a signal. If we've sent anything
1667 			 * then return bytes sent, otherwise return the error.
1668 			 */
1669 			if (error) {
1670 				m_freem(m);
1671 				sbunlock(&so->so_snd);
1672 				splx(s);
1673 				goto done;
1674 			}
1675 			goto retry_space;
1676 		}
1677 		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p);
1678 		splx(s);
1679 		if (error) {
1680 			sbunlock(&so->so_snd);
1681 			goto done;
1682 		}
1683 	}
1684 	sbunlock(&so->so_snd);
1685 
1686 	/*
1687 	 * Send trailers. Wimp out and use writev(2).
1688 	 */
1689 	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
1690 			nuap.fd = uap->s;
1691 			nuap.iovp = hdtr.trailers;
1692 			nuap.iovcnt = hdtr.trl_cnt;
1693 			error = writev(p, &nuap);
1694 			if (error)
1695 				goto done;
1696 			sbytes += p->p_retval[0];
1697 	}
1698 
1699 done:
1700 	if (uap->sbytes != NULL) {
1701 		copyout(&sbytes, uap->sbytes, sizeof(off_t));
1702 	}
1703 	return (error);
1704 }
1705