xref: /freebsd/sys/kern/uipc_syscalls.c (revision c678bc4f13a340ad88debe321afd0097db2590cb)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * sendfile(2) and related extensions:
6  * Copyright (c) 1998, David Greenman. All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
37  * $FreeBSD$
38  */
39 
40 #include "opt_compat.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/lock.h>
47 #include <sys/mutex.h>
48 #include <sys/sysproto.h>
49 #include <sys/malloc.h>
50 #include <sys/filedesc.h>
51 #include <sys/event.h>
52 #include <sys/proc.h>
53 #include <sys/fcntl.h>
54 #include <sys/file.h>
55 #include <sys/lock.h>
56 #include <sys/mount.h>
57 #include <sys/mbuf.h>
58 #include <sys/protosw.h>
59 #include <sys/socket.h>
60 #include <sys/socketvar.h>
61 #include <sys/signalvar.h>
62 #include <sys/uio.h>
63 #include <sys/vnode.h>
64 #ifdef KTRACE
65 #include <sys/ktrace.h>
66 #endif
67 
68 #include <vm/vm.h>
69 #include <vm/vm_object.h>
70 #include <vm/vm_page.h>
71 #include <vm/vm_pageout.h>
72 #include <vm/vm_kern.h>
73 #include <vm/vm_extern.h>
74 
/*
 * sf_buf helpers.  sf_buf_init() is registered through SYSINIT at
 * SI_SUB_MBUF / SI_ORDER_ANY; the definitions of these three functions
 * are not in this part of the file.
 */
75 static void sf_buf_init(void *arg);
76 SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
77 static struct sf_buf *sf_buf_alloc(void);
78 static void sf_buf_free(caddr_t addr, void *args);
79 
/*
 * Shared workers: sendit()/recvit() back the send and receive system
 * calls; the *1() routines take a `compat' flag so the regular and
 * old-style entry points can share one implementation.
 */
80 static int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags));
81 static int recvit __P((struct proc *p, int s, struct msghdr *mp,
82 		       caddr_t namelenp));
83 
84 static int accept1 __P((struct proc *p, struct accept_args *uap, int compat));
85 static int getsockname1 __P((struct proc *p, struct getsockname_args *uap,
86 			     int compat));
87 static int getpeername1 __P((struct proc *p, struct getpeername_args *uap,
88 			     int compat));
89 
90 /*
91  * Expanded sf_freelist head: really an SLIST_HEAD() in disguise, bundling
92  * the sf_buf free list head together with the sf_lock mutex.
93  */
/* Free list of sf_bufs, paired with a mutex in the same structure. */
94 static struct {
95 	SLIST_HEAD(, sf_buf) sf_head;
96 	struct mtx sf_lock;
97 } sf_freelist;
98 
/*
 * sf_buf bookkeeping.  The code using these is outside this view;
 * NOTE(review): presumably sf_base is the base address of the sf_buf
 * mapping area, sf_bufs the sf_buf array, and sf_buf_alloc_want a count
 * of waiters -- confirm against sf_buf_init()/sf_buf_alloc().
 */
99 static vm_offset_t sf_base;
100 static struct sf_buf *sf_bufs;
101 static u_int sf_buf_alloc_want;
102 
103 /*
104  * System call interface to the socket abstraction.
105  */
/*
 * COMPAT_OLDSOCK is defined when either COMPAT_43 or COMPAT_SUNOS is
 * configured; it guards the old-style (o*) entry points and the
 * osockaddr/MSG_COMPAT handling in this file.
 */
106 #if defined(COMPAT_43) || defined(COMPAT_SUNOS)
107 #define COMPAT_OLDSOCK
108 #endif
109 
/* File operations vector installed in f_ops for socket descriptors. */
110 extern	struct fileops socketops;
111 
112 int
113 socket(p, uap)
114 	struct proc *p;
115 	register struct socket_args /* {
116 		int	domain;
117 		int	type;
118 		int	protocol;
119 	} */ *uap;
120 {
121 	struct filedesc *fdp = p->p_fd;
122 	struct socket *so;
123 	struct file *fp;
124 	int fd, error;
125 
126 	error = falloc(p, &fp, &fd);
127 	if (error)
128 		return (error);
129 	fhold(fp);
130 	error = socreate(uap->domain, &so, uap->type, uap->protocol, p);
131 	if (error) {
132 		if (fdp->fd_ofiles[fd] == fp) {
133 			fdp->fd_ofiles[fd] = NULL;
134 			fdrop(fp, p);
135 		}
136 	} else {
137 		fp->f_data = (caddr_t)so;
138 		fp->f_flag = FREAD|FWRITE;
139 		fp->f_ops = &socketops;
140 		fp->f_type = DTYPE_SOCKET;
141 		p->p_retval[0] = fd;
142 	}
143 	fdrop(fp, p);
144 	return (error);
145 }
146 
147 /* ARGSUSED */
148 int
149 bind(p, uap)
150 	struct proc *p;
151 	register struct bind_args /* {
152 		int	s;
153 		caddr_t	name;
154 		int	namelen;
155 	} */ *uap;
156 {
157 	struct file *fp;
158 	struct sockaddr *sa;
159 	int error;
160 
161 	error = holdsock(p->p_fd, uap->s, &fp);
162 	if (error)
163 		return (error);
164 	error = getsockaddr(&sa, uap->name, uap->namelen);
165 	if (error) {
166 		fdrop(fp, p);
167 		return (error);
168 	}
169 	error = sobind((struct socket *)fp->f_data, sa, p);
170 	FREE(sa, M_SONAME);
171 	fdrop(fp, p);
172 	return (error);
173 }
174 
175 /* ARGSUSED */
176 int
177 listen(p, uap)
178 	struct proc *p;
179 	register struct listen_args /* {
180 		int	s;
181 		int	backlog;
182 	} */ *uap;
183 {
184 	struct file *fp;
185 	int error;
186 
187 	error = holdsock(p->p_fd, uap->s, &fp);
188 	if (error)
189 		return (error);
190 	error = solisten((struct socket *)fp->f_data, uap->backlog, p);
191 	fdrop(fp, p);
192 	return(error);
193 }
194 
195 static int
196 accept1(p, uap, compat)
197 	struct proc *p;
198 	register struct accept_args /* {
199 		int	s;
200 		caddr_t	name;
201 		int	*anamelen;
202 	} */ *uap;
203 	int compat;
204 {
205 	struct filedesc *fdp = p->p_fd;
206 	struct file *lfp = NULL;
207 	struct file *nfp = NULL;
208 	struct sockaddr *sa;
209 	int namelen, error, s;
210 	struct socket *head, *so;
211 	int fd;
212 	short fflag;		/* type must match fp->f_flag */
213 
214 	if (uap->name) {
215 		error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen,
216 			sizeof (namelen));
217 		if(error)
218 			return (error);
219 	}
220 	error = holdsock(fdp, uap->s, &lfp);
221 	if (error)
222 		return (error);
223 	s = splnet();
224 	head = (struct socket *)lfp->f_data;
225 	if ((head->so_options & SO_ACCEPTCONN) == 0) {
226 		splx(s);
227 		error = EINVAL;
228 		goto done;
229 	}
230 	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
231 		splx(s);
232 		error = EWOULDBLOCK;
233 		goto done;
234 	}
235 	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
236 		if (head->so_state & SS_CANTRCVMORE) {
237 			head->so_error = ECONNABORTED;
238 			break;
239 		}
240 		error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH,
241 		    "accept", 0);
242 		if (error) {
243 			splx(s);
244 			goto done;
245 		}
246 	}
247 	if (head->so_error) {
248 		error = head->so_error;
249 		head->so_error = 0;
250 		splx(s);
251 		goto done;
252 	}
253 
254 	/*
255 	 * At this point we know that there is at least one connection
256 	 * ready to be accepted. Remove it from the queue prior to
257 	 * allocating the file descriptor for it since falloc() may
258 	 * block allowing another process to accept the connection
259 	 * instead.
260 	 */
261 	so = TAILQ_FIRST(&head->so_comp);
262 	TAILQ_REMOVE(&head->so_comp, so, so_list);
263 	head->so_qlen--;
264 
265 	fflag = lfp->f_flag;
266 	error = falloc(p, &nfp, &fd);
267 	if (error) {
268 		/*
269 		 * Probably ran out of file descriptors. Put the
270 		 * unaccepted connection back onto the queue and
271 		 * do another wakeup so some other process might
272 		 * have a chance at it.
273 		 */
274 		TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
275 		head->so_qlen++;
276 		wakeup_one(&head->so_timeo);
277 		splx(s);
278 		goto done;
279 	}
280 	fhold(nfp);
281 	p->p_retval[0] = fd;
282 
283 	/* connection has been removed from the listen queue */
284 	KNOTE(&head->so_rcv.sb_sel.si_note, 0);
285 
286 	so->so_state &= ~SS_COMP;
287 	so->so_head = NULL;
288 	if (head->so_sigio != NULL)
289 		fsetown(fgetown(head->so_sigio), &so->so_sigio);
290 
291 	nfp->f_data = (caddr_t)so;
292 	nfp->f_flag = fflag;
293 	nfp->f_ops = &socketops;
294 	nfp->f_type = DTYPE_SOCKET;
295 	sa = 0;
296 	error = soaccept(so, &sa);
297 	if (error) {
298 		/*
299 		 * return a namelen of zero for older code which might
300 	 	 * ignore the return value from accept.
301 		 */
302 		if (uap->name != NULL) {
303 			namelen = 0;
304 			(void) copyout((caddr_t)&namelen,
305 			    (caddr_t)uap->anamelen, sizeof(*uap->anamelen));
306 		}
307 		goto noconnection;
308 	}
309 	if (sa == NULL) {
310 		namelen = 0;
311 		if (uap->name)
312 			goto gotnoname;
313 		splx(s);
314 		error = 0;
315 		goto done;
316 	}
317 	if (uap->name) {
318 		/* check sa_len before it is destroyed */
319 		if (namelen > sa->sa_len)
320 			namelen = sa->sa_len;
321 #ifdef COMPAT_OLDSOCK
322 		if (compat)
323 			((struct osockaddr *)sa)->sa_family =
324 			    sa->sa_family;
325 #endif
326 		error = copyout(sa, (caddr_t)uap->name, (u_int)namelen);
327 		if (!error)
328 gotnoname:
329 			error = copyout((caddr_t)&namelen,
330 			    (caddr_t)uap->anamelen, sizeof (*uap->anamelen));
331 	}
332 noconnection:
333 	if (sa)
334 		FREE(sa, M_SONAME);
335 
336 	/*
337 	 * close the new descriptor, assuming someone hasn't ripped it
338 	 * out from under us.
339 	 */
340 	if (error) {
341 		if (fdp->fd_ofiles[fd] == nfp) {
342 			fdp->fd_ofiles[fd] = NULL;
343 			fdrop(nfp, p);
344 		}
345 	}
346 	splx(s);
347 
348 	/*
349 	 * Release explicitly held references before returning.
350 	 */
351 done:
352 	if (nfp != NULL)
353 		fdrop(nfp, p);
354 	fdrop(lfp, p);
355 	return (error);
356 }
357 
/*
 * accept(2) entry point: accept1() with the compat flag clear.
 */
int
accept(p, uap)
	struct proc *p;
	struct accept_args *uap;
{
	int error;

	error = accept1(p, uap, 0);
	return (error);
}
366 
367 #ifdef COMPAT_OLDSOCK
/*
 * Old-style accept entry point: accept1() with the compat flag set.
 */
int
oaccept(p, uap)
	struct proc *p;
	struct accept_args *uap;
{
	int error;

	error = accept1(p, uap, 1);
	return (error);
}
376 #endif /* COMPAT_OLDSOCK */
377 
378 /* ARGSUSED */
379 int
380 connect(p, uap)
381 	struct proc *p;
382 	register struct connect_args /* {
383 		int	s;
384 		caddr_t	name;
385 		int	namelen;
386 	} */ *uap;
387 {
388 	struct file *fp;
389 	register struct socket *so;
390 	struct sockaddr *sa;
391 	int error, s;
392 
393 	error = holdsock(p->p_fd, uap->s, &fp);
394 	if (error)
395 		return (error);
396 	so = (struct socket *)fp->f_data;
397 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
398 		error = EALREADY;
399 		goto done;
400 	}
401 	error = getsockaddr(&sa, uap->name, uap->namelen);
402 	if (error)
403 		goto done;
404 	error = soconnect(so, sa, p);
405 	if (error)
406 		goto bad;
407 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
408 		FREE(sa, M_SONAME);
409 		error = EINPROGRESS;
410 		goto done;
411 	}
412 	s = splnet();
413 	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
414 		error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH,
415 		    "connec", 0);
416 		if (error)
417 			break;
418 	}
419 	if (error == 0) {
420 		error = so->so_error;
421 		so->so_error = 0;
422 	}
423 	splx(s);
424 bad:
425 	so->so_state &= ~SS_ISCONNECTING;
426 	FREE(sa, M_SONAME);
427 	if (error == ERESTART)
428 		error = EINTR;
429 done:
430 	fdrop(fp, p);
431 	return (error);
432 }
433 
434 int
435 socketpair(p, uap)
436 	struct proc *p;
437 	register struct socketpair_args /* {
438 		int	domain;
439 		int	type;
440 		int	protocol;
441 		int	*rsv;
442 	} */ *uap;
443 {
444 	register struct filedesc *fdp = p->p_fd;
445 	struct file *fp1, *fp2;
446 	struct socket *so1, *so2;
447 	int fd, error, sv[2];
448 
449 	error = socreate(uap->domain, &so1, uap->type, uap->protocol, p);
450 	if (error)
451 		return (error);
452 	error = socreate(uap->domain, &so2, uap->type, uap->protocol, p);
453 	if (error)
454 		goto free1;
455 	error = falloc(p, &fp1, &fd);
456 	if (error)
457 		goto free2;
458 	fhold(fp1);
459 	sv[0] = fd;
460 	fp1->f_data = (caddr_t)so1;
461 	error = falloc(p, &fp2, &fd);
462 	if (error)
463 		goto free3;
464 	fhold(fp2);
465 	fp2->f_data = (caddr_t)so2;
466 	sv[1] = fd;
467 	error = soconnect2(so1, so2);
468 	if (error)
469 		goto free4;
470 	if (uap->type == SOCK_DGRAM) {
471 		/*
472 		 * Datagram socket connection is asymmetric.
473 		 */
474 		 error = soconnect2(so2, so1);
475 		 if (error)
476 			goto free4;
477 	}
478 	fp1->f_flag = fp2->f_flag = FREAD|FWRITE;
479 	fp1->f_ops = fp2->f_ops = &socketops;
480 	fp1->f_type = fp2->f_type = DTYPE_SOCKET;
481 	error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int));
482 	fdrop(fp1, p);
483 	fdrop(fp2, p);
484 	return (error);
485 free4:
486 	if (fdp->fd_ofiles[sv[1]] == fp2) {
487 		fdp->fd_ofiles[sv[1]] = NULL;
488 		fdrop(fp2, p);
489 	}
490 	fdrop(fp2, p);
491 free3:
492 	if (fdp->fd_ofiles[sv[0]] == fp1) {
493 		fdp->fd_ofiles[sv[0]] = NULL;
494 		fdrop(fp1, p);
495 	}
496 	fdrop(fp1, p);
497 free2:
498 	(void)soclose(so2);
499 free1:
500 	(void)soclose(so1);
501 	return (error);
502 }
503 
504 static int
505 sendit(p, s, mp, flags)
506 	register struct proc *p;
507 	int s;
508 	register struct msghdr *mp;
509 	int flags;
510 {
511 	struct file *fp;
512 	struct uio auio;
513 	register struct iovec *iov;
514 	register int i;
515 	struct mbuf *control;
516 	struct sockaddr *to;
517 	int len, error;
518 	struct socket *so;
519 #ifdef KTRACE
520 	struct iovec *ktriov = NULL;
521 	struct uio ktruio;
522 #endif
523 
524 	error = holdsock(p->p_fd, s, &fp);
525 	if (error)
526 		return (error);
527 	auio.uio_iov = mp->msg_iov;
528 	auio.uio_iovcnt = mp->msg_iovlen;
529 	auio.uio_segflg = UIO_USERSPACE;
530 	auio.uio_rw = UIO_WRITE;
531 	auio.uio_procp = p;
532 	auio.uio_offset = 0;			/* XXX */
533 	auio.uio_resid = 0;
534 	iov = mp->msg_iov;
535 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
536 		if ((auio.uio_resid += iov->iov_len) < 0) {
537 			fdrop(fp, p);
538 			return (EINVAL);
539 		}
540 	}
541 	if (mp->msg_name) {
542 		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
543 		if (error) {
544 			fdrop(fp, p);
545 			return (error);
546 		}
547 	} else {
548 		to = 0;
549 	}
550 	if (mp->msg_control) {
551 		if (mp->msg_controllen < sizeof(struct cmsghdr)
552 #ifdef COMPAT_OLDSOCK
553 		    && mp->msg_flags != MSG_COMPAT
554 #endif
555 		) {
556 			error = EINVAL;
557 			goto bad;
558 		}
559 		error = sockargs(&control, mp->msg_control,
560 		    mp->msg_controllen, MT_CONTROL);
561 		if (error)
562 			goto bad;
563 #ifdef COMPAT_OLDSOCK
564 		if (mp->msg_flags == MSG_COMPAT) {
565 			register struct cmsghdr *cm;
566 
567 			M_PREPEND(control, sizeof(*cm), M_TRYWAIT);
568 			if (control == 0) {
569 				error = ENOBUFS;
570 				goto bad;
571 			} else {
572 				cm = mtod(control, struct cmsghdr *);
573 				cm->cmsg_len = control->m_len;
574 				cm->cmsg_level = SOL_SOCKET;
575 				cm->cmsg_type = SCM_RIGHTS;
576 			}
577 		}
578 #endif
579 	} else {
580 		control = 0;
581 	}
582 #ifdef KTRACE
583 	if (KTRPOINT(p, KTR_GENIO)) {
584 		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
585 
586 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
587 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
588 		ktruio = auio;
589 	}
590 #endif
591 	len = auio.uio_resid;
592 	so = (struct socket *)fp->f_data;
593 	error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control,
594 						     flags, p);
595 	if (error) {
596 		if (auio.uio_resid != len && (error == ERESTART ||
597 		    error == EINTR || error == EWOULDBLOCK))
598 			error = 0;
599 		if (error == EPIPE) {
600 			PROC_LOCK(p);
601 			psignal(p, SIGPIPE);
602 			PROC_UNLOCK(p);
603 		}
604 	}
605 	if (error == 0)
606 		p->p_retval[0] = len - auio.uio_resid;
607 #ifdef KTRACE
608 	if (ktriov != NULL) {
609 		if (error == 0) {
610 			ktruio.uio_iov = ktriov;
611 			ktruio.uio_resid = p->p_retval[0];
612 			ktrgenio(p->p_tracep, s, UIO_WRITE, &ktruio, error);
613 		}
614 		FREE(ktriov, M_TEMP);
615 	}
616 #endif
617 bad:
618 	fdrop(fp, p);
619 	if (to)
620 		FREE(to, M_SONAME);
621 	return (error);
622 }
623 
624 int
625 sendto(p, uap)
626 	struct proc *p;
627 	register struct sendto_args /* {
628 		int	s;
629 		caddr_t	buf;
630 		size_t	len;
631 		int	flags;
632 		caddr_t	to;
633 		int	tolen;
634 	} */ *uap;
635 {
636 	struct msghdr msg;
637 	struct iovec aiov;
638 
639 	msg.msg_name = uap->to;
640 	msg.msg_namelen = uap->tolen;
641 	msg.msg_iov = &aiov;
642 	msg.msg_iovlen = 1;
643 	msg.msg_control = 0;
644 #ifdef COMPAT_OLDSOCK
645 	msg.msg_flags = 0;
646 #endif
647 	aiov.iov_base = uap->buf;
648 	aiov.iov_len = uap->len;
649 	return (sendit(p, uap->s, &msg, uap->flags));
650 }
651 
652 #ifdef COMPAT_OLDSOCK
653 int
654 osend(p, uap)
655 	struct proc *p;
656 	register struct osend_args /* {
657 		int	s;
658 		caddr_t	buf;
659 		int	len;
660 		int	flags;
661 	} */ *uap;
662 {
663 	struct msghdr msg;
664 	struct iovec aiov;
665 
666 	msg.msg_name = 0;
667 	msg.msg_namelen = 0;
668 	msg.msg_iov = &aiov;
669 	msg.msg_iovlen = 1;
670 	aiov.iov_base = uap->buf;
671 	aiov.iov_len = uap->len;
672 	msg.msg_control = 0;
673 	msg.msg_flags = 0;
674 	return (sendit(p, uap->s, &msg, uap->flags));
675 }
676 
677 int
678 osendmsg(p, uap)
679 	struct proc *p;
680 	register struct osendmsg_args /* {
681 		int	s;
682 		caddr_t	msg;
683 		int	flags;
684 	} */ *uap;
685 {
686 	struct msghdr msg;
687 	struct iovec aiov[UIO_SMALLIOV], *iov;
688 	int error;
689 
690 	error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr));
691 	if (error)
692 		return (error);
693 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
694 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
695 			return (EMSGSIZE);
696 		MALLOC(iov, struct iovec *,
697 		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
698 		      M_WAITOK);
699 	} else
700 		iov = aiov;
701 	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
702 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
703 	if (error)
704 		goto done;
705 	msg.msg_flags = MSG_COMPAT;
706 	msg.msg_iov = iov;
707 	error = sendit(p, uap->s, &msg, uap->flags);
708 done:
709 	if (iov != aiov)
710 		FREE(iov, M_IOV);
711 	return (error);
712 }
713 #endif
714 
715 int
716 sendmsg(p, uap)
717 	struct proc *p;
718 	register struct sendmsg_args /* {
719 		int	s;
720 		caddr_t	msg;
721 		int	flags;
722 	} */ *uap;
723 {
724 	struct msghdr msg;
725 	struct iovec aiov[UIO_SMALLIOV], *iov;
726 	int error;
727 
728 	error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg));
729 	if (error)
730 		return (error);
731 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
732 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
733 			return (EMSGSIZE);
734 		MALLOC(iov, struct iovec *,
735 		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
736 		       M_WAITOK);
737 	} else
738 		iov = aiov;
739 	if (msg.msg_iovlen &&
740 	    (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
741 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
742 		goto done;
743 	msg.msg_iov = iov;
744 #ifdef COMPAT_OLDSOCK
745 	msg.msg_flags = 0;
746 #endif
747 	error = sendit(p, uap->s, &msg, uap->flags);
748 done:
749 	if (iov != aiov)
750 		FREE(iov, M_IOV);
751 	return (error);
752 }
753 
754 static int
755 recvit(p, s, mp, namelenp)
756 	register struct proc *p;
757 	int s;
758 	register struct msghdr *mp;
759 	caddr_t namelenp;
760 {
761 	struct file *fp;
762 	struct uio auio;
763 	register struct iovec *iov;
764 	register int i;
765 	int len, error;
766 	struct mbuf *m, *control = 0;
767 	caddr_t ctlbuf;
768 	struct socket *so;
769 	struct sockaddr *fromsa = 0;
770 #ifdef KTRACE
771 	struct iovec *ktriov = NULL;
772 	struct uio ktruio;
773 #endif
774 
775 	error = holdsock(p->p_fd, s, &fp);
776 	if (error)
777 		return (error);
778 	auio.uio_iov = mp->msg_iov;
779 	auio.uio_iovcnt = mp->msg_iovlen;
780 	auio.uio_segflg = UIO_USERSPACE;
781 	auio.uio_rw = UIO_READ;
782 	auio.uio_procp = p;
783 	auio.uio_offset = 0;			/* XXX */
784 	auio.uio_resid = 0;
785 	iov = mp->msg_iov;
786 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
787 		if ((auio.uio_resid += iov->iov_len) < 0) {
788 			fdrop(fp, p);
789 			return (EINVAL);
790 		}
791 	}
792 #ifdef KTRACE
793 	if (KTRPOINT(p, KTR_GENIO)) {
794 		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
795 
796 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
797 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
798 		ktruio = auio;
799 	}
800 #endif
801 	len = auio.uio_resid;
802 	so = (struct socket *)fp->f_data;
803 	error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
804 	    (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
805 	    &mp->msg_flags);
806 	if (error) {
807 		if (auio.uio_resid != len && (error == ERESTART ||
808 		    error == EINTR || error == EWOULDBLOCK))
809 			error = 0;
810 	}
811 #ifdef KTRACE
812 	if (ktriov != NULL) {
813 		if (error == 0) {
814 			ktruio.uio_iov = ktriov;
815 			ktruio.uio_resid = len - auio.uio_resid;
816 			ktrgenio(p->p_tracep, s, UIO_READ, &ktruio, error);
817 		}
818 		FREE(ktriov, M_TEMP);
819 	}
820 #endif
821 	if (error)
822 		goto out;
823 	p->p_retval[0] = len - auio.uio_resid;
824 	if (mp->msg_name) {
825 		len = mp->msg_namelen;
826 		if (len <= 0 || fromsa == 0)
827 			len = 0;
828 		else {
829 #ifndef MIN
830 #define MIN(a,b) ((a)>(b)?(b):(a))
831 #endif
832 			/* save sa_len before it is destroyed by MSG_COMPAT */
833 			len = MIN(len, fromsa->sa_len);
834 #ifdef COMPAT_OLDSOCK
835 			if (mp->msg_flags & MSG_COMPAT)
836 				((struct osockaddr *)fromsa)->sa_family =
837 				    fromsa->sa_family;
838 #endif
839 			error = copyout(fromsa,
840 			    (caddr_t)mp->msg_name, (unsigned)len);
841 			if (error)
842 				goto out;
843 		}
844 		mp->msg_namelen = len;
845 		if (namelenp &&
846 		    (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) {
847 #ifdef COMPAT_OLDSOCK
848 			if (mp->msg_flags & MSG_COMPAT)
849 				error = 0;	/* old recvfrom didn't check */
850 			else
851 #endif
852 			goto out;
853 		}
854 	}
855 	if (mp->msg_control) {
856 #ifdef COMPAT_OLDSOCK
857 		/*
858 		 * We assume that old recvmsg calls won't receive access
859 		 * rights and other control info, esp. as control info
860 		 * is always optional and those options didn't exist in 4.3.
861 		 * If we receive rights, trim the cmsghdr; anything else
862 		 * is tossed.
863 		 */
864 		if (control && mp->msg_flags & MSG_COMPAT) {
865 			if (mtod(control, struct cmsghdr *)->cmsg_level !=
866 			    SOL_SOCKET ||
867 			    mtod(control, struct cmsghdr *)->cmsg_type !=
868 			    SCM_RIGHTS) {
869 				mp->msg_controllen = 0;
870 				goto out;
871 			}
872 			control->m_len -= sizeof (struct cmsghdr);
873 			control->m_data += sizeof (struct cmsghdr);
874 		}
875 #endif
876 		len = mp->msg_controllen;
877 		m = control;
878 		mp->msg_controllen = 0;
879 		ctlbuf = (caddr_t) mp->msg_control;
880 
881 		while (m && len > 0) {
882 			unsigned int tocopy;
883 
884 			if (len >= m->m_len)
885 				tocopy = m->m_len;
886 			else {
887 				mp->msg_flags |= MSG_CTRUNC;
888 				tocopy = len;
889 			}
890 
891 			if ((error = copyout((caddr_t)mtod(m, caddr_t),
892 					ctlbuf, tocopy)) != 0)
893 				goto out;
894 
895 			ctlbuf += tocopy;
896 			len -= tocopy;
897 			m = m->m_next;
898 		}
899 		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
900 	}
901 out:
902 	fdrop(fp, p);
903 	if (fromsa)
904 		FREE(fromsa, M_SONAME);
905 	if (control)
906 		m_freem(control);
907 	return (error);
908 }
909 
910 int
911 recvfrom(p, uap)
912 	struct proc *p;
913 	register struct recvfrom_args /* {
914 		int	s;
915 		caddr_t	buf;
916 		size_t	len;
917 		int	flags;
918 		caddr_t	from;
919 		int	*fromlenaddr;
920 	} */ *uap;
921 {
922 	struct msghdr msg;
923 	struct iovec aiov;
924 	int error;
925 
926 	if (uap->fromlenaddr) {
927 		error = copyin((caddr_t)uap->fromlenaddr,
928 		    (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen));
929 		if (error)
930 			return (error);
931 	} else
932 		msg.msg_namelen = 0;
933 	msg.msg_name = uap->from;
934 	msg.msg_iov = &aiov;
935 	msg.msg_iovlen = 1;
936 	aiov.iov_base = uap->buf;
937 	aiov.iov_len = uap->len;
938 	msg.msg_control = 0;
939 	msg.msg_flags = uap->flags;
940 	return (recvit(p, uap->s, &msg, (caddr_t)uap->fromlenaddr));
941 }
942 
943 #ifdef COMPAT_OLDSOCK
944 int
945 orecvfrom(p, uap)
946 	struct proc *p;
947 	struct recvfrom_args *uap;
948 {
949 
950 	uap->flags |= MSG_COMPAT;
951 	return (recvfrom(p, uap));
952 }
953 #endif
954 
955 
956 #ifdef COMPAT_OLDSOCK
957 int
958 orecv(p, uap)
959 	struct proc *p;
960 	register struct orecv_args /* {
961 		int	s;
962 		caddr_t	buf;
963 		int	len;
964 		int	flags;
965 	} */ *uap;
966 {
967 	struct msghdr msg;
968 	struct iovec aiov;
969 
970 	msg.msg_name = 0;
971 	msg.msg_namelen = 0;
972 	msg.msg_iov = &aiov;
973 	msg.msg_iovlen = 1;
974 	aiov.iov_base = uap->buf;
975 	aiov.iov_len = uap->len;
976 	msg.msg_control = 0;
977 	msg.msg_flags = uap->flags;
978 	return (recvit(p, uap->s, &msg, (caddr_t)0));
979 }
980 
981 /*
982  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
983  * overlays the new one, missing only the flags, and with the (old) access
984  * rights where the control fields are now.
985  */
986 int
987 orecvmsg(p, uap)
988 	struct proc *p;
989 	register struct orecvmsg_args /* {
990 		int	s;
991 		struct	omsghdr *msg;
992 		int	flags;
993 	} */ *uap;
994 {
995 	struct msghdr msg;
996 	struct iovec aiov[UIO_SMALLIOV], *iov;
997 	int error;
998 
999 	error = copyin((caddr_t)uap->msg, (caddr_t)&msg,
1000 	    sizeof (struct omsghdr));
1001 	if (error)
1002 		return (error);
1003 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
1004 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
1005 			return (EMSGSIZE);
1006 		MALLOC(iov, struct iovec *,
1007 		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
1008 		      M_WAITOK);
1009 	} else
1010 		iov = aiov;
1011 	msg.msg_flags = uap->flags | MSG_COMPAT;
1012 	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
1013 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
1014 	if (error)
1015 		goto done;
1016 	msg.msg_iov = iov;
1017 	error = recvit(p, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen);
1018 
1019 	if (msg.msg_controllen && error == 0)
1020 		error = copyout((caddr_t)&msg.msg_controllen,
1021 		    (caddr_t)&uap->msg->msg_accrightslen, sizeof (int));
1022 done:
1023 	if (iov != aiov)
1024 		FREE(iov, M_IOV);
1025 	return (error);
1026 }
1027 #endif
1028 
1029 int
1030 recvmsg(p, uap)
1031 	struct proc *p;
1032 	register struct recvmsg_args /* {
1033 		int	s;
1034 		struct	msghdr *msg;
1035 		int	flags;
1036 	} */ *uap;
1037 {
1038 	struct msghdr msg;
1039 	struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
1040 	register int error;
1041 
1042 	error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg));
1043 	if (error)
1044 		return (error);
1045 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
1046 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
1047 			return (EMSGSIZE);
1048 		MALLOC(iov, struct iovec *,
1049 		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
1050 		       M_WAITOK);
1051 	} else
1052 		iov = aiov;
1053 #ifdef COMPAT_OLDSOCK
1054 	msg.msg_flags = uap->flags &~ MSG_COMPAT;
1055 #else
1056 	msg.msg_flags = uap->flags;
1057 #endif
1058 	uiov = msg.msg_iov;
1059 	msg.msg_iov = iov;
1060 	error = copyin((caddr_t)uiov, (caddr_t)iov,
1061 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
1062 	if (error)
1063 		goto done;
1064 	error = recvit(p, uap->s, &msg, (caddr_t)0);
1065 	if (!error) {
1066 		msg.msg_iov = uiov;
1067 		error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg));
1068 	}
1069 done:
1070 	if (iov != aiov)
1071 		FREE(iov, M_IOV);
1072 	return (error);
1073 }
1074 
1075 /* ARGSUSED */
1076 int
1077 shutdown(p, uap)
1078 	struct proc *p;
1079 	register struct shutdown_args /* {
1080 		int	s;
1081 		int	how;
1082 	} */ *uap;
1083 {
1084 	struct file *fp;
1085 	int error;
1086 
1087 	error = holdsock(p->p_fd, uap->s, &fp);
1088 	if (error)
1089 		return (error);
1090 	error = soshutdown((struct socket *)fp->f_data, uap->how);
1091 	fdrop(fp, p);
1092 	return(error);
1093 }
1094 
1095 /* ARGSUSED */
1096 int
1097 setsockopt(p, uap)
1098 	struct proc *p;
1099 	register struct setsockopt_args /* {
1100 		int	s;
1101 		int	level;
1102 		int	name;
1103 		caddr_t	val;
1104 		int	valsize;
1105 	} */ *uap;
1106 {
1107 	struct file *fp;
1108 	struct sockopt sopt;
1109 	int error;
1110 
1111 	if (uap->val == 0 && uap->valsize != 0)
1112 		return (EFAULT);
1113 	if (uap->valsize < 0)
1114 		return (EINVAL);
1115 
1116 	error = holdsock(p->p_fd, uap->s, &fp);
1117 	if (error)
1118 		return (error);
1119 
1120 	sopt.sopt_dir = SOPT_SET;
1121 	sopt.sopt_level = uap->level;
1122 	sopt.sopt_name = uap->name;
1123 	sopt.sopt_val = uap->val;
1124 	sopt.sopt_valsize = uap->valsize;
1125 	sopt.sopt_p = p;
1126 	error = sosetopt((struct socket *)fp->f_data, &sopt);
1127 	fdrop(fp, p);
1128 	return(error);
1129 }
1130 
1131 /* ARGSUSED */
1132 int
1133 getsockopt(p, uap)
1134 	struct proc *p;
1135 	register struct getsockopt_args /* {
1136 		int	s;
1137 		int	level;
1138 		int	name;
1139 		caddr_t	val;
1140 		int	*avalsize;
1141 	} */ *uap;
1142 {
1143 	int	valsize, error;
1144 	struct	file *fp;
1145 	struct	sockopt sopt;
1146 
1147 	error = holdsock(p->p_fd, uap->s, &fp);
1148 	if (error)
1149 		return (error);
1150 	if (uap->val) {
1151 		error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize,
1152 		    sizeof (valsize));
1153 		if (error) {
1154 			fdrop(fp, p);
1155 			return (error);
1156 		}
1157 		if (valsize < 0) {
1158 			fdrop(fp, p);
1159 			return (EINVAL);
1160 		}
1161 	} else {
1162 		valsize = 0;
1163 	}
1164 
1165 	sopt.sopt_dir = SOPT_GET;
1166 	sopt.sopt_level = uap->level;
1167 	sopt.sopt_name = uap->name;
1168 	sopt.sopt_val = uap->val;
1169 	sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
1170 	sopt.sopt_p = p;
1171 
1172 	error = sogetopt((struct socket *)fp->f_data, &sopt);
1173 	if (error == 0) {
1174 		valsize = sopt.sopt_valsize;
1175 		error = copyout((caddr_t)&valsize,
1176 				(caddr_t)uap->avalsize, sizeof (valsize));
1177 	}
1178 	fdrop(fp, p);
1179 	return (error);
1180 }
1181 
1182 /*
1183  * Get socket name.
1184  */
1185 /* ARGSUSED */
1186 static int
1187 getsockname1(p, uap, compat)
1188 	struct proc *p;
1189 	register struct getsockname_args /* {
1190 		int	fdes;
1191 		caddr_t	asa;
1192 		int	*alen;
1193 	} */ *uap;
1194 	int compat;
1195 {
1196 	struct file *fp;
1197 	register struct socket *so;
1198 	struct sockaddr *sa;
1199 	int len, error;
1200 
1201 	error = holdsock(p->p_fd, uap->fdes, &fp);
1202 	if (error)
1203 		return (error);
1204 	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1205 	if (error) {
1206 		fdrop(fp, p);
1207 		return (error);
1208 	}
1209 	so = (struct socket *)fp->f_data;
1210 	sa = 0;
1211 	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
1212 	if (error)
1213 		goto bad;
1214 	if (sa == 0) {
1215 		len = 0;
1216 		goto gotnothing;
1217 	}
1218 
1219 	len = MIN(len, sa->sa_len);
1220 #ifdef COMPAT_OLDSOCK
1221 	if (compat)
1222 		((struct osockaddr *)sa)->sa_family = sa->sa_family;
1223 #endif
1224 	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1225 	if (error == 0)
1226 gotnothing:
1227 		error = copyout((caddr_t)&len, (caddr_t)uap->alen,
1228 		    sizeof (len));
1229 bad:
1230 	if (sa)
1231 		FREE(sa, M_SONAME);
1232 	fdrop(fp, p);
1233 	return (error);
1234 }
1235 
/*
 * System call entry point for getsockname: forwards to getsockname1()
 * with the compat flag cleared.
 */
int
getsockname(struct proc *p, struct getsockname_args *uap)
{
	int error;

	error = getsockname1(p, uap, 0);
	return (error);
}
1244 
#ifdef COMPAT_OLDSOCK
/*
 * Old-socket compatibility entry point for getsockname: forwards to
 * getsockname1() with the compat flag set.
 */
int
ogetsockname(struct proc *p, struct getsockname_args *uap)
{
	int error;

	error = getsockname1(p, uap, 1);
	return (error);
}
#endif /* COMPAT_OLDSOCK */
1255 
1256 /*
1257  * Get name of peer for connected socket.
1258  */
1259 /* ARGSUSED */
1260 static int
1261 getpeername1(p, uap, compat)
1262 	struct proc *p;
1263 	register struct getpeername_args /* {
1264 		int	fdes;
1265 		caddr_t	asa;
1266 		int	*alen;
1267 	} */ *uap;
1268 	int compat;
1269 {
1270 	struct file *fp;
1271 	register struct socket *so;
1272 	struct sockaddr *sa;
1273 	int len, error;
1274 
1275 	error = holdsock(p->p_fd, uap->fdes, &fp);
1276 	if (error)
1277 		return (error);
1278 	so = (struct socket *)fp->f_data;
1279 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1280 		fdrop(fp, p);
1281 		return (ENOTCONN);
1282 	}
1283 	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1284 	if (error) {
1285 		fdrop(fp, p);
1286 		return (error);
1287 	}
1288 	sa = 0;
1289 	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
1290 	if (error)
1291 		goto bad;
1292 	if (sa == 0) {
1293 		len = 0;
1294 		goto gotnothing;
1295 	}
1296 	len = MIN(len, sa->sa_len);
1297 #ifdef COMPAT_OLDSOCK
1298 	if (compat)
1299 		((struct osockaddr *)sa)->sa_family =
1300 		    sa->sa_family;
1301 #endif
1302 	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1303 	if (error)
1304 		goto bad;
1305 gotnothing:
1306 	error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len));
1307 bad:
1308 	if (sa)
1309 		FREE(sa, M_SONAME);
1310 	fdrop(fp, p);
1311 	return (error);
1312 }
1313 
/*
 * System call entry point for getpeername: forwards to getpeername1()
 * with the compat flag cleared.
 */
int
getpeername(struct proc *p, struct getpeername_args *uap)
{
	int error;

	error = getpeername1(p, uap, 0);
	return (error);
}
1322 
#ifdef COMPAT_OLDSOCK
/*
 * Old-socket compatibility entry point for getpeername: forwards to
 * getpeername1() with the compat flag set.
 */
int
ogetpeername(struct proc *p, struct ogetpeername_args *uap)
{
	struct getpeername_args *args;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	args = (struct getpeername_args *)uap;
	return (getpeername1(p, args, 1));
}
#endif /* COMPAT_OLDSOCK */
1334 
1335 int
1336 sockargs(mp, buf, buflen, type)
1337 	struct mbuf **mp;
1338 	caddr_t buf;
1339 	int buflen, type;
1340 {
1341 	register struct sockaddr *sa;
1342 	register struct mbuf *m;
1343 	int error;
1344 
1345 	if ((u_int)buflen > MLEN) {
1346 #ifdef COMPAT_OLDSOCK
1347 		if (type == MT_SONAME && (u_int)buflen <= 112)
1348 			buflen = MLEN;		/* unix domain compat. hack */
1349 		else
1350 #endif
1351 		return (EINVAL);
1352 	}
1353 	m = m_get(M_TRYWAIT, type);
1354 	if (m == NULL)
1355 		return (ENOBUFS);
1356 	m->m_len = buflen;
1357 	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1358 	if (error)
1359 		(void) m_free(m);
1360 	else {
1361 		*mp = m;
1362 		if (type == MT_SONAME) {
1363 			sa = mtod(m, struct sockaddr *);
1364 
1365 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1366 			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1367 				sa->sa_family = sa->sa_len;
1368 #endif
1369 			sa->sa_len = buflen;
1370 		}
1371 	}
1372 	return (error);
1373 }
1374 
1375 int
1376 getsockaddr(namp, uaddr, len)
1377 	struct sockaddr **namp;
1378 	caddr_t uaddr;
1379 	size_t len;
1380 {
1381 	struct sockaddr *sa;
1382 	int error;
1383 
1384 	if (len > SOCK_MAXADDRLEN)
1385 		return ENAMETOOLONG;
1386 	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1387 	error = copyin(uaddr, sa, len);
1388 	if (error) {
1389 		FREE(sa, M_SONAME);
1390 	} else {
1391 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1392 		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1393 			sa->sa_family = sa->sa_len;
1394 #endif
1395 		sa->sa_len = len;
1396 		*namp = sa;
1397 	}
1398 	return error;
1399 }
1400 
1401 /*
1402  * holdsock() - load the struct file pointer associated
1403  * with a socket into *fpp.  If an error occurs, non-zero
1404  * will be returned and *fpp will be set to NULL.
1405  */
1406 int
1407 holdsock(fdp, fdes, fpp)
1408 	struct filedesc *fdp;
1409 	int fdes;
1410 	struct file **fpp;
1411 {
1412 	register struct file *fp = NULL;
1413 	int error = 0;
1414 
1415 	if ((unsigned)fdes >= fdp->fd_nfiles ||
1416 	    (fp = fdp->fd_ofiles[fdes]) == NULL) {
1417 		error = EBADF;
1418 	} else if (fp->f_type != DTYPE_SOCKET) {
1419 		error = ENOTSOCK;
1420 		fp = NULL;
1421 	} else {
1422 		fhold(fp);
1423 	}
1424 	*fpp = fp;
1425 	return(error);
1426 }
1427 
1428 /*
1429  * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
1430  * XXX - The sf_buf functions are currently private to sendfile(2), so have
1431  * been made static, but may be useful in the future for doing zero-copy in
1432  * other parts of the networking code.
1433  */
1434 static void
1435 sf_buf_init(void *arg)
1436 {
1437 	int i;
1438 
1439 	mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", MTX_DEF);
1440 	mtx_lock(&sf_freelist.sf_lock);
1441 	SLIST_INIT(&sf_freelist.sf_head);
1442 	sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
1443 	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP,
1444 	    M_NOWAIT | M_ZERO);
1445 	for (i = 0; i < nsfbufs; i++) {
1446 		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
1447 		SLIST_INSERT_HEAD(&sf_freelist.sf_head, &sf_bufs[i], free_list);
1448 	}
1449 	sf_buf_alloc_want = 0;
1450 	mtx_unlock(&sf_freelist.sf_lock);
1451 }
1452 
1453 /*
1454  * Get an sf_buf from the freelist. Will block if none are available.
1455  */
1456 static struct sf_buf *
1457 sf_buf_alloc()
1458 {
1459 	struct sf_buf *sf;
1460 	int error;
1461 
1462 	mtx_lock(&sf_freelist.sf_lock);
1463 	while ((sf = SLIST_FIRST(&sf_freelist.sf_head)) == NULL) {
1464 		sf_buf_alloc_want++;
1465 		error = msleep(&sf_freelist, &sf_freelist.sf_lock, PVM|PCATCH,
1466 		    "sfbufa", 0);
1467 		sf_buf_alloc_want--;
1468 
1469 		/*
1470 		 * If we got a signal, don't risk going back to sleep.
1471 		 */
1472 		if (error)
1473 			break;
1474 	}
1475 	SLIST_REMOVE_HEAD(&sf_freelist.sf_head, free_list);
1476 	mtx_unlock(&sf_freelist.sf_lock);
1477 	return (sf);
1478 }
1479 
1480 #define dtosf(x)	(&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])
1481 
1482 /*
1483  * Detatch mapped page and release resources back to the system.
1484  */
1485 static void
1486 sf_buf_free(caddr_t addr, void *args)
1487 {
1488 	struct sf_buf *sf;
1489 	struct vm_page *m;
1490 
1491 	sf = dtosf(addr);
1492 	mtx_lock(&vm_mtx);
1493 	pmap_qremove((vm_offset_t)addr, 1);
1494 	m = sf->m;
1495 	vm_page_unwire(m, 0);
1496 	/*
1497 	 * Check for the object going away on us. This can
1498 	 * happen since we don't hold a reference to it.
1499 	 * If so, we're responsible for freeing the page.
1500 	 */
1501 	if (m->wire_count == 0 && m->object == NULL)
1502 		vm_page_free(m);
1503 	mtx_unlock(&vm_mtx);
1504 	sf->m = NULL;
1505 	mtx_lock(&sf_freelist.sf_lock);
1506 	SLIST_INSERT_HEAD(&sf_freelist.sf_head, sf, free_list);
1507 	if (sf_buf_alloc_want > 0)
1508 		wakeup_one(&sf_freelist);
1509 	mtx_unlock(&sf_freelist.sf_lock);
1510 }
1511 
1512 /*
1513  * sendfile(2)
1514  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
1515  *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
1516  *
1517  * Send a file specified by 'fd' and starting at 'offset' to a socket
1518  * specified by 's'. Send only 'nbytes' of the file or until EOF if
1519  * nbytes == 0. Optionally add a header and/or trailer to the socket
1520  * output. If specified, write the total number of bytes sent into *sbytes.
1521  */
1522 int
1523 sendfile(struct proc *p, struct sendfile_args *uap)
1524 {
1525 	struct file *fp;
1526 	struct filedesc *fdp = p->p_fd;
1527 	struct vnode *vp;
1528 	struct vm_object *obj;
1529 	struct socket *so;
1530 	struct mbuf *m;
1531 	struct sf_buf *sf;
1532 	struct vm_page *pg;
1533 	struct writev_args nuap;
1534 	struct sf_hdtr hdtr;
1535 	off_t off, xfsize, sbytes = 0;
1536 	int error = 0, s;
1537 
1538 	vp = NULL;
1539 	/*
1540 	 * Do argument checking. Must be a regular file in, stream
1541 	 * type and connected socket out, positive offset.
1542 	 */
1543 	fp = holdfp(fdp, uap->fd, FREAD);
1544 	if (fp == NULL) {
1545 		error = EBADF;
1546 		goto done;
1547 	}
1548 	if (fp->f_type != DTYPE_VNODE) {
1549 		error = EINVAL;
1550 		goto done;
1551 	}
1552 	vp = (struct vnode *)fp->f_data;
1553 	vref(vp);
1554 	if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) {
1555 		error = EINVAL;
1556 		goto done;
1557 	}
1558 	fdrop(fp, p);
1559 	error = holdsock(p->p_fd, uap->s, &fp);
1560 	if (error)
1561 		goto done;
1562 	so = (struct socket *)fp->f_data;
1563 	if (so->so_type != SOCK_STREAM) {
1564 		error = EINVAL;
1565 		goto done;
1566 	}
1567 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1568 		error = ENOTCONN;
1569 		goto done;
1570 	}
1571 	if (uap->offset < 0) {
1572 		error = EINVAL;
1573 		goto done;
1574 	}
1575 
1576 	/*
1577 	 * If specified, get the pointer to the sf_hdtr struct for
1578 	 * any headers/trailers.
1579 	 */
1580 	if (uap->hdtr != NULL) {
1581 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1582 		if (error)
1583 			goto done;
1584 		/*
1585 		 * Send any headers. Wimp out and use writev(2).
1586 		 */
1587 		if (hdtr.headers != NULL) {
1588 			nuap.fd = uap->s;
1589 			nuap.iovp = hdtr.headers;
1590 			nuap.iovcnt = hdtr.hdr_cnt;
1591 			error = writev(p, &nuap);
1592 			if (error)
1593 				goto done;
1594 			sbytes += p->p_retval[0];
1595 		}
1596 	}
1597 
1598 	/*
1599 	 * Protect against multiple writers to the socket.
1600 	 */
1601 	(void) sblock(&so->so_snd, M_WAITOK);
1602 
1603 	/*
1604 	 * Loop through the pages in the file, starting with the requested
1605 	 * offset. Get a file page (do I/O if necessary), map the file page
1606 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1607 	 * it on the socket.
1608 	 */
1609 	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
1610 		vm_pindex_t pindex;
1611 		vm_offset_t pgoff;
1612 
1613 		pindex = OFF_TO_IDX(off);
1614 retry_lookup:
1615 		/*
1616 		 * Calculate the amount to transfer. Not to exceed a page,
1617 		 * the EOF, or the passed in nbytes.
1618 		 */
1619 		xfsize = obj->un_pager.vnp.vnp_size - off;
1620 		if (xfsize > PAGE_SIZE)
1621 			xfsize = PAGE_SIZE;
1622 		pgoff = (vm_offset_t)(off & PAGE_MASK);
1623 		if (PAGE_SIZE - pgoff < xfsize)
1624 			xfsize = PAGE_SIZE - pgoff;
1625 		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
1626 			xfsize = uap->nbytes - sbytes;
1627 		if (xfsize <= 0)
1628 			break;
1629 		/*
1630 		 * Optimize the non-blocking case by looking at the socket space
1631 		 * before going to the extra work of constituting the sf_buf.
1632 		 */
1633 		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
1634 			if (so->so_state & SS_CANTSENDMORE)
1635 				error = EPIPE;
1636 			else
1637 				error = EAGAIN;
1638 			sbunlock(&so->so_snd);
1639 			goto done;
1640 		}
1641 		/*
1642 		 * Attempt to look up the page.
1643 		 *
1644 		 *	Allocate if not found
1645 		 *
1646 		 *	Wait and loop if busy.
1647 		 */
1648 		mtx_lock(&vm_mtx);
1649 		pg = vm_page_lookup(obj, pindex);
1650 
1651 		if (pg == NULL) {
1652 			pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
1653 			if (pg == NULL) {
1654 				VM_WAIT;
1655 				mtx_unlock(&vm_mtx);
1656 				goto retry_lookup;
1657 			}
1658 			vm_page_wakeup(pg);
1659 		} else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) {
1660 			mtx_unlock(&vm_mtx);
1661 			goto retry_lookup;
1662 		}
1663 
1664 		/*
1665 		 * Wire the page so it does not get ripped out from under
1666 		 * us.
1667 		 */
1668 
1669 		vm_page_wire(pg);
1670 
1671 		/*
1672 		 * If page is not valid for what we need, initiate I/O
1673 		 */
1674 
1675 		if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
1676 			struct uio auio;
1677 			struct iovec aiov;
1678 			int bsize;
1679 
1680 			/*
1681 			 * Ensure that our page is still around when the I/O
1682 			 * completes.
1683 			 */
1684 			vm_page_io_start(pg);
1685 			mtx_unlock(&vm_mtx);
1686 
1687 			/*
1688 			 * Get the page from backing store.
1689 			 */
1690 			bsize = vp->v_mount->mnt_stat.f_iosize;
1691 			auio.uio_iov = &aiov;
1692 			auio.uio_iovcnt = 1;
1693 			aiov.iov_base = 0;
1694 			aiov.iov_len = MAXBSIZE;
1695 			auio.uio_resid = MAXBSIZE;
1696 			auio.uio_offset = trunc_page(off);
1697 			auio.uio_segflg = UIO_NOCOPY;
1698 			auio.uio_rw = UIO_READ;
1699 			auio.uio_procp = p;
1700 			vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
1701 			error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16),
1702 			        p->p_ucred);
1703 			VOP_UNLOCK(vp, 0, p);
1704 			mtx_lock(&vm_mtx);
1705 			vm_page_flag_clear(pg, PG_ZERO);
1706 			vm_page_io_finish(pg);
1707 			if (error) {
1708 				vm_page_unwire(pg, 0);
1709 				/*
1710 				 * See if anyone else might know about this page.
1711 				 * If not and it is not valid, then free it.
1712 				 */
1713 				if (pg->wire_count == 0 && pg->valid == 0 &&
1714 				    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
1715 				    pg->hold_count == 0) {
1716 					vm_page_busy(pg);
1717 					vm_page_free(pg);
1718 				}
1719 				mtx_unlock(&vm_mtx);
1720 				sbunlock(&so->so_snd);
1721 				goto done;
1722 			}
1723 		}
1724 
1725 
1726 		/*
1727 		 * Get a sendfile buf. We usually wait as long as necessary,
1728 		 * but this wait can be interrupted.
1729 		 */
1730 		mtx_unlock(&vm_mtx);
1731 		if ((sf = sf_buf_alloc()) == NULL) {
1732 			mtx_lock(&vm_mtx);
1733 			vm_page_unwire(pg, 0);
1734 			if (pg->wire_count == 0 && pg->object == NULL)
1735 				vm_page_free(pg);
1736 			mtx_unlock(&vm_mtx);
1737 			sbunlock(&so->so_snd);
1738 			error = EINTR;
1739 			goto done;
1740 		}
1741 
1742 		/*
1743 		 * Allocate a kernel virtual page and insert the physical page
1744 		 * into it.
1745 		 */
1746 		mtx_lock(&vm_mtx);
1747 		sf->m = pg;
1748 		pmap_qenter(sf->kva, &pg, 1);
1749 		mtx_unlock(&vm_mtx);
1750 		/*
1751 		 * Get an mbuf header and set it up as having external storage.
1752 		 */
1753 		MGETHDR(m, M_TRYWAIT, MT_DATA);
1754 		if (m == NULL) {
1755 			error = ENOBUFS;
1756 			sf_buf_free((void *)sf->kva, NULL);
1757 			sbunlock(&so->so_snd);
1758 			goto done;
1759 		}
1760 		/*
1761 		 * Setup external storage for mbuf.
1762 		 */
1763 		MEXTADD(m, sf->kva, PAGE_SIZE, sf_buf_free, NULL, M_RDONLY,
1764 		    EXT_SFBUF);
1765 		m->m_data = (char *) sf->kva + pgoff;
1766 		m->m_pkthdr.len = m->m_len = xfsize;
1767 		/*
1768 		 * Add the buffer to the socket buffer chain.
1769 		 */
1770 		s = splnet();
1771 retry_space:
1772 		/*
1773 		 * Make sure that the socket is still able to take more data.
1774 		 * CANTSENDMORE being true usually means that the connection
1775 		 * was closed. so_error is true when an error was sensed after
1776 		 * a previous send.
1777 		 * The state is checked after the page mapping and buffer
1778 		 * allocation above since those operations may block and make
1779 		 * any socket checks stale. From this point forward, nothing
1780 		 * blocks before the pru_send (or more accurately, any blocking
1781 		 * results in a loop back to here to re-check).
1782 		 */
1783 		if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
1784 			if (so->so_state & SS_CANTSENDMORE) {
1785 				error = EPIPE;
1786 			} else {
1787 				error = so->so_error;
1788 				so->so_error = 0;
1789 			}
1790 			m_freem(m);
1791 			sbunlock(&so->so_snd);
1792 			splx(s);
1793 			goto done;
1794 		}
1795 		/*
1796 		 * Wait for socket space to become available. We do this just
1797 		 * after checking the connection state above in order to avoid
1798 		 * a race condition with sbwait().
1799 		 */
1800 		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
1801 			if (so->so_state & SS_NBIO) {
1802 				m_freem(m);
1803 				sbunlock(&so->so_snd);
1804 				splx(s);
1805 				error = EAGAIN;
1806 				goto done;
1807 			}
1808 			error = sbwait(&so->so_snd);
1809 			/*
1810 			 * An error from sbwait usually indicates that we've
1811 			 * been interrupted by a signal. If we've sent anything
1812 			 * then return bytes sent, otherwise return the error.
1813 			 */
1814 			if (error) {
1815 				m_freem(m);
1816 				sbunlock(&so->so_snd);
1817 				splx(s);
1818 				goto done;
1819 			}
1820 			goto retry_space;
1821 		}
1822 		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p);
1823 		splx(s);
1824 		if (error) {
1825 			sbunlock(&so->so_snd);
1826 			goto done;
1827 		}
1828 	}
1829 	sbunlock(&so->so_snd);
1830 
1831 	/*
1832 	 * Send trailers. Wimp out and use writev(2).
1833 	 */
1834 	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
1835 			nuap.fd = uap->s;
1836 			nuap.iovp = hdtr.trailers;
1837 			nuap.iovcnt = hdtr.trl_cnt;
1838 			error = writev(p, &nuap);
1839 			if (error)
1840 				goto done;
1841 			sbytes += p->p_retval[0];
1842 	}
1843 
1844 done:
1845 	/*
1846 	 * If there was no error we have to clear p->p_retval[0]
1847 	 * because it may have been set by writev.
1848 	 */
1849 	if (error == 0) {
1850 		p->p_retval[0] = 0;
1851 	}
1852 	if (uap->sbytes != NULL) {
1853 		copyout(&sbytes, uap->sbytes, sizeof(off_t));
1854 	}
1855 	if (vp)
1856 		vrele(vp);
1857 	if (fp)
1858 		fdrop(fp, p);
1859 	return (error);
1860 }
1861