xref: /freebsd/sys/kern/uipc_syscalls.c (revision 11f0b352e05306cf6f1f85e9087022c0a92624a3)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * sendfile(2) and related extensions:
6  * Copyright (c) 1998, David Greenman. All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
37  * $FreeBSD$
38  */
39 
40 #include "opt_compat.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/lock.h>
47 #include <sys/mutex.h>
48 #include <sys/sysproto.h>
49 #include <sys/malloc.h>
50 #include <sys/filedesc.h>
51 #include <sys/event.h>
52 #include <sys/proc.h>
53 #include <sys/fcntl.h>
54 #include <sys/file.h>
55 #include <sys/lock.h>
56 #include <sys/mount.h>
57 #include <sys/mbuf.h>
58 #include <sys/protosw.h>
59 #include <sys/socket.h>
60 #include <sys/socketvar.h>
61 #include <sys/signalvar.h>
62 #include <sys/uio.h>
63 #include <sys/vnode.h>
64 #ifdef KTRACE
65 #include <sys/ktrace.h>
66 #endif
67 
68 #include <vm/vm.h>
69 #include <vm/vm_object.h>
70 #include <vm/vm_page.h>
71 #include <vm/vm_pageout.h>
72 #include <vm/vm_kern.h>
73 #include <vm/vm_extern.h>
74 
75 static void sf_buf_init(void *arg);
76 SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
77 struct sf_buf *sf_buf_alloc(void);
78 void sf_buf_free(void *addr, void *args);
79 
80 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
81 static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
82 
83 static int accept1(struct thread *td, struct accept_args *uap, int compat);
84 static int getsockname1(struct thread *td, struct getsockname_args *uap,
85 			int compat);
86 static int getpeername1(struct thread *td, struct getpeername_args *uap,
87 			int compat);
88 
89 /*
90  * Expanded sf_freelist head. Really an SLIST_HEAD() in disguise, with the
91  * sf_freelist head with the sf_lock mutex.
92  */
93 static struct {
94 	SLIST_HEAD(, sf_buf) sf_head;
95 	struct mtx sf_lock;
96 } sf_freelist;
97 
98 vm_offset_t sf_base;
99 struct sf_buf *sf_bufs;
100 u_int sf_buf_alloc_want;
101 
/*
 * System call interface to the socket abstraction.
 *
 * COMPAT_OLDSOCK is defined whenever COMPAT_43 or COMPAT_SUNOS is
 * configured; it guards the old-style entry points (oaccept(), osend(),
 * orecvmsg(), ...) and the old sockaddr conversions in this file.
 */
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#define	COMPAT_OLDSOCK
#endif

/* File operations vector this file installs on socket-backed files. */
extern struct fileops socketops;
110 
111 /*
112  * MPSAFE
113  */
114 int
115 socket(td, uap)
116 	struct thread *td;
117 	register struct socket_args /* {
118 		int	domain;
119 		int	type;
120 		int	protocol;
121 	} */ *uap;
122 {
123 	struct filedesc *fdp;
124 	struct socket *so;
125 	struct file *fp;
126 	int fd, error;
127 
128 	mtx_lock(&Giant);
129 	fdp = td->td_proc->p_fd;
130 	error = falloc(td, &fp, &fd);
131 	if (error)
132 		goto done2;
133 	fhold(fp);
134 	error = socreate(uap->domain, &so, uap->type, uap->protocol,
135 	    td->td_ucred, td);
136 	FILEDESC_LOCK(fdp);
137 	if (error) {
138 		if (fdp->fd_ofiles[fd] == fp) {
139 			fdp->fd_ofiles[fd] = NULL;
140 			FILEDESC_UNLOCK(fdp);
141 			fdrop(fp, td);
142 		} else
143 			FILEDESC_UNLOCK(fdp);
144 	} else {
145 		fp->f_data = so;	/* already has ref count */
146 		fp->f_flag = FREAD|FWRITE;
147 		fp->f_ops = &socketops;
148 		fp->f_type = DTYPE_SOCKET;
149 		FILEDESC_UNLOCK(fdp);
150 		td->td_retval[0] = fd;
151 	}
152 	fdrop(fp, td);
153 done2:
154 	mtx_unlock(&Giant);
155 	return (error);
156 }
157 
158 /*
159  * MPSAFE
160  */
161 /* ARGSUSED */
162 int
163 bind(td, uap)
164 	struct thread *td;
165 	register struct bind_args /* {
166 		int	s;
167 		caddr_t	name;
168 		int	namelen;
169 	} */ *uap;
170 {
171 	struct socket *so;
172 	struct sockaddr *sa;
173 	int error;
174 
175 	mtx_lock(&Giant);
176 	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
177 		goto done2;
178 	if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
179 		goto done1;
180 	error = sobind(so, sa, td);
181 	FREE(sa, M_SONAME);
182 done1:
183 	fputsock(so);
184 done2:
185 	mtx_unlock(&Giant);
186 	return (error);
187 }
188 
189 /*
190  * MPSAFE
191  */
192 /* ARGSUSED */
193 int
194 listen(td, uap)
195 	struct thread *td;
196 	register struct listen_args /* {
197 		int	s;
198 		int	backlog;
199 	} */ *uap;
200 {
201 	struct socket *so;
202 	int error;
203 
204 	mtx_lock(&Giant);
205 	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
206 		error = solisten(so, uap->backlog, td);
207 		fputsock(so);
208 	}
209 	mtx_unlock(&Giant);
210 	return(error);
211 }
212 
213 /*
214  * accept1()
215  * MPSAFE
216  */
217 static int
218 accept1(td, uap, compat)
219 	struct thread *td;
220 	register struct accept_args /* {
221 		int	s;
222 		caddr_t	name;
223 		int	*anamelen;
224 	} */ *uap;
225 	int compat;
226 {
227 	struct filedesc *fdp;
228 	struct file *nfp = NULL;
229 	struct sockaddr *sa;
230 	int namelen, error, s;
231 	struct socket *head, *so;
232 	int fd;
233 	u_int fflag;
234 
235 	mtx_lock(&Giant);
236 	fdp = td->td_proc->p_fd;
237 	if (uap->name) {
238 		error = copyin(uap->anamelen, &namelen, sizeof (namelen));
239 		if(error)
240 			goto done2;
241 	}
242 	error = fgetsock(td, uap->s, &head, &fflag);
243 	if (error)
244 		goto done2;
245 	s = splnet();
246 	if ((head->so_options & SO_ACCEPTCONN) == 0) {
247 		splx(s);
248 		error = EINVAL;
249 		goto done;
250 	}
251 	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
252 		splx(s);
253 		error = EWOULDBLOCK;
254 		goto done;
255 	}
256 	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
257 		if (head->so_state & SS_CANTRCVMORE) {
258 			head->so_error = ECONNABORTED;
259 			break;
260 		}
261 		error = tsleep(&head->so_timeo, PSOCK | PCATCH,
262 		    "accept", 0);
263 		if (error) {
264 			splx(s);
265 			goto done;
266 		}
267 	}
268 	if (head->so_error) {
269 		error = head->so_error;
270 		head->so_error = 0;
271 		splx(s);
272 		goto done;
273 	}
274 
275 	/*
276 	 * At this point we know that there is at least one connection
277 	 * ready to be accepted. Remove it from the queue prior to
278 	 * allocating the file descriptor for it since falloc() may
279 	 * block allowing another process to accept the connection
280 	 * instead.
281 	 */
282 	so = TAILQ_FIRST(&head->so_comp);
283 	TAILQ_REMOVE(&head->so_comp, so, so_list);
284 	head->so_qlen--;
285 
286 	error = falloc(td, &nfp, &fd);
287 	if (error) {
288 		/*
289 		 * Probably ran out of file descriptors. Put the
290 		 * unaccepted connection back onto the queue and
291 		 * do another wakeup so some other process might
292 		 * have a chance at it.
293 		 */
294 		TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
295 		head->so_qlen++;
296 		wakeup_one(&head->so_timeo);
297 		splx(s);
298 		goto done;
299 	}
300 	fhold(nfp);
301 	td->td_retval[0] = fd;
302 
303 	/* connection has been removed from the listen queue */
304 	KNOTE(&head->so_rcv.sb_sel.si_note, 0);
305 
306 	so->so_state &= ~SS_COMP;
307 	so->so_head = NULL;
308 	if (head->so_sigio != NULL)
309 		fsetown(fgetown(head->so_sigio), &so->so_sigio);
310 
311 	FILE_LOCK(nfp);
312 	soref(so);			/* file descriptor reference */
313 	nfp->f_data = so;		/* nfp has ref count from falloc */
314 	nfp->f_flag = fflag;
315 	nfp->f_ops = &socketops;
316 	nfp->f_type = DTYPE_SOCKET;
317 	FILE_UNLOCK(nfp);
318 	sa = 0;
319 	error = soaccept(so, &sa);
320 	if (error) {
321 		/*
322 		 * return a namelen of zero for older code which might
323 	 	 * ignore the return value from accept.
324 		 */
325 		if (uap->name != NULL) {
326 			namelen = 0;
327 			(void) copyout(&namelen,
328 			    uap->anamelen, sizeof(*uap->anamelen));
329 		}
330 		goto noconnection;
331 	}
332 	if (sa == NULL) {
333 		namelen = 0;
334 		if (uap->name)
335 			goto gotnoname;
336 		splx(s);
337 		error = 0;
338 		goto done;
339 	}
340 	if (uap->name) {
341 		/* check sa_len before it is destroyed */
342 		if (namelen > sa->sa_len)
343 			namelen = sa->sa_len;
344 #ifdef COMPAT_OLDSOCK
345 		if (compat)
346 			((struct osockaddr *)sa)->sa_family =
347 			    sa->sa_family;
348 #endif
349 		error = copyout(sa, uap->name, (u_int)namelen);
350 		if (!error)
351 gotnoname:
352 			error = copyout(&namelen,
353 			    uap->anamelen, sizeof (*uap->anamelen));
354 	}
355 noconnection:
356 	if (sa)
357 		FREE(sa, M_SONAME);
358 
359 	/*
360 	 * close the new descriptor, assuming someone hasn't ripped it
361 	 * out from under us.
362 	 */
363 	if (error) {
364 		FILEDESC_LOCK(fdp);
365 		if (fdp->fd_ofiles[fd] == nfp) {
366 			fdp->fd_ofiles[fd] = NULL;
367 			FILEDESC_UNLOCK(fdp);
368 			fdrop(nfp, td);
369 		} else {
370 			FILEDESC_UNLOCK(fdp);
371 		}
372 	}
373 	splx(s);
374 
375 	/*
376 	 * Release explicitly held references before returning.
377 	 */
378 done:
379 	if (nfp != NULL)
380 		fdrop(nfp, td);
381 	fputsock(head);
382 done2:
383 	mtx_unlock(&Giant);
384 	return (error);
385 }
386 
/*
 * accept() - native entry point; runs accept1() with compat == 0.
 *
 * MPSAFE (accept1() is MPSAFE)
 */
int
accept(struct thread *td, struct accept_args *uap)
{
	int error;

	error = accept1(td, uap, 0);
	return (error);
}
398 
#ifdef COMPAT_OLDSOCK
/*
 * oaccept() - old-style entry point; runs accept1() with compat == 1.
 *
 * MPSAFE (accept1() is MPSAFE)
 */
int
oaccept(struct thread *td, struct accept_args *uap)
{
	int error;

	error = accept1(td, uap, 1);
	return (error);
}
#endif /* COMPAT_OLDSOCK */
412 
413 /*
414  * MPSAFE
415  */
416 /* ARGSUSED */
417 int
418 connect(td, uap)
419 	struct thread *td;
420 	register struct connect_args /* {
421 		int	s;
422 		caddr_t	name;
423 		int	namelen;
424 	} */ *uap;
425 {
426 	struct socket *so;
427 	struct sockaddr *sa;
428 	int error, s;
429 
430 	mtx_lock(&Giant);
431 	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
432 		goto done2;
433 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
434 		error = EALREADY;
435 		goto done1;
436 	}
437 	error = getsockaddr(&sa, uap->name, uap->namelen);
438 	if (error)
439 		goto done1;
440 	error = soconnect(so, sa, td);
441 	if (error)
442 		goto bad;
443 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
444 		FREE(sa, M_SONAME);
445 		error = EINPROGRESS;
446 		goto done1;
447 	}
448 	s = splnet();
449 	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
450 		error = tsleep(&so->so_timeo, PSOCK | PCATCH, "connec", 0);
451 		if (error)
452 			break;
453 	}
454 	if (error == 0) {
455 		error = so->so_error;
456 		so->so_error = 0;
457 	}
458 	splx(s);
459 bad:
460 	so->so_state &= ~SS_ISCONNECTING;
461 	FREE(sa, M_SONAME);
462 	if (error == ERESTART)
463 		error = EINTR;
464 done1:
465 	fputsock(so);
466 done2:
467 	mtx_unlock(&Giant);
468 	return (error);
469 }
470 
471 /*
472  * MPSAFE
473  */
474 int
475 socketpair(td, uap)
476 	struct thread *td;
477 	register struct socketpair_args /* {
478 		int	domain;
479 		int	type;
480 		int	protocol;
481 		int	*rsv;
482 	} */ *uap;
483 {
484 	register struct filedesc *fdp = td->td_proc->p_fd;
485 	struct file *fp1, *fp2;
486 	struct socket *so1, *so2;
487 	int fd, error, sv[2];
488 
489 	mtx_lock(&Giant);
490 	error = socreate(uap->domain, &so1, uap->type, uap->protocol,
491 	    td->td_ucred, td);
492 	if (error)
493 		goto done2;
494 	error = socreate(uap->domain, &so2, uap->type, uap->protocol,
495 	    td->td_ucred, td);
496 	if (error)
497 		goto free1;
498 	error = falloc(td, &fp1, &fd);
499 	if (error)
500 		goto free2;
501 	fhold(fp1);
502 	sv[0] = fd;
503 	fp1->f_data = so1;		/* so1 already has ref count */
504 	error = falloc(td, &fp2, &fd);
505 	if (error)
506 		goto free3;
507 	fhold(fp2);
508 	fp2->f_data = so2;		/* so2 already has ref count */
509 	sv[1] = fd;
510 	error = soconnect2(so1, so2);
511 	if (error)
512 		goto free4;
513 	if (uap->type == SOCK_DGRAM) {
514 		/*
515 		 * Datagram socket connection is asymmetric.
516 		 */
517 		 error = soconnect2(so2, so1);
518 		 if (error)
519 			goto free4;
520 	}
521 	FILE_LOCK(fp1);
522 	fp1->f_flag = FREAD|FWRITE;
523 	fp1->f_ops = &socketops;
524 	fp1->f_type = DTYPE_SOCKET;
525 	FILE_UNLOCK(fp1);
526 	FILE_LOCK(fp2);
527 	fp2->f_flag = FREAD|FWRITE;
528 	fp2->f_ops = &socketops;
529 	fp2->f_type = DTYPE_SOCKET;
530 	FILE_UNLOCK(fp2);
531 	error = copyout(sv, uap->rsv, 2 * sizeof (int));
532 	fdrop(fp1, td);
533 	fdrop(fp2, td);
534 	goto done2;
535 free4:
536 	FILEDESC_LOCK(fdp);
537 	if (fdp->fd_ofiles[sv[1]] == fp2) {
538 		fdp->fd_ofiles[sv[1]] = NULL;
539 		FILEDESC_UNLOCK(fdp);
540 		fdrop(fp2, td);
541 	} else
542 		FILEDESC_UNLOCK(fdp);
543 	fdrop(fp2, td);
544 free3:
545 	FILEDESC_LOCK(fdp);
546 	if (fdp->fd_ofiles[sv[0]] == fp1) {
547 		fdp->fd_ofiles[sv[0]] = NULL;
548 		FILEDESC_UNLOCK(fdp);
549 		fdrop(fp1, td);
550 	} else
551 		FILEDESC_UNLOCK(fdp);
552 	fdrop(fp1, td);
553 free2:
554 	(void)soclose(so2);
555 free1:
556 	(void)soclose(so1);
557 done2:
558 	mtx_unlock(&Giant);
559 	return (error);
560 }
561 
562 static int
563 sendit(td, s, mp, flags)
564 	register struct thread *td;
565 	int s;
566 	register struct msghdr *mp;
567 	int flags;
568 {
569 	struct uio auio;
570 	register struct iovec *iov;
571 	register int i;
572 	struct mbuf *control;
573 	struct sockaddr *to = NULL;
574 	int len, error;
575 	struct socket *so;
576 #ifdef KTRACE
577 	struct iovec *ktriov = NULL;
578 	struct uio ktruio;
579 	int iovlen;
580 #endif
581 
582 	if ((error = fgetsock(td, s, &so, NULL)) != 0)
583 		return (error);
584 	auio.uio_iov = mp->msg_iov;
585 	auio.uio_iovcnt = mp->msg_iovlen;
586 	auio.uio_segflg = UIO_USERSPACE;
587 	auio.uio_rw = UIO_WRITE;
588 	auio.uio_td = td;
589 	auio.uio_offset = 0;			/* XXX */
590 	auio.uio_resid = 0;
591 	iov = mp->msg_iov;
592 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
593 		if ((auio.uio_resid += iov->iov_len) < 0) {
594 			error = EINVAL;
595 			goto bad;
596 		}
597 	}
598 	if (mp->msg_name) {
599 		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
600 		if (error)
601 			goto bad;
602 	}
603 	if (mp->msg_control) {
604 		if (mp->msg_controllen < sizeof(struct cmsghdr)
605 #ifdef COMPAT_OLDSOCK
606 		    && mp->msg_flags != MSG_COMPAT
607 #endif
608 		) {
609 			error = EINVAL;
610 			goto bad;
611 		}
612 		error = sockargs(&control, mp->msg_control,
613 		    mp->msg_controllen, MT_CONTROL);
614 		if (error)
615 			goto bad;
616 #ifdef COMPAT_OLDSOCK
617 		if (mp->msg_flags == MSG_COMPAT) {
618 			register struct cmsghdr *cm;
619 
620 			M_PREPEND(control, sizeof(*cm), M_TRYWAIT);
621 			if (control == 0) {
622 				error = ENOBUFS;
623 				goto bad;
624 			} else {
625 				cm = mtod(control, struct cmsghdr *);
626 				cm->cmsg_len = control->m_len;
627 				cm->cmsg_level = SOL_SOCKET;
628 				cm->cmsg_type = SCM_RIGHTS;
629 			}
630 		}
631 #endif
632 	} else {
633 		control = 0;
634 	}
635 #ifdef KTRACE
636 	if (KTRPOINT(td, KTR_GENIO)) {
637 		iovlen = auio.uio_iovcnt * sizeof (struct iovec);
638 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
639 		bcopy(auio.uio_iov, ktriov, iovlen);
640 		ktruio = auio;
641 	}
642 #endif
643 	len = auio.uio_resid;
644 	error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control,
645 						     flags, td);
646 	if (error) {
647 		if (auio.uio_resid != len && (error == ERESTART ||
648 		    error == EINTR || error == EWOULDBLOCK))
649 			error = 0;
650 		/* Generation of SIGPIPE can be controlled per socket */
651 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE)) {
652 			PROC_LOCK(td->td_proc);
653 			psignal(td->td_proc, SIGPIPE);
654 			PROC_UNLOCK(td->td_proc);
655 		}
656 	}
657 	if (error == 0)
658 		td->td_retval[0] = len - auio.uio_resid;
659 #ifdef KTRACE
660 	if (ktriov != NULL) {
661 		if (error == 0) {
662 			ktruio.uio_iov = ktriov;
663 			ktruio.uio_resid = td->td_retval[0];
664 			ktrgenio(s, UIO_WRITE, &ktruio, error);
665 		}
666 		FREE(ktriov, M_TEMP);
667 	}
668 #endif
669 bad:
670 	fputsock(so);
671 	if (to)
672 		FREE(to, M_SONAME);
673 	return (error);
674 }
675 
676 /*
677  * MPSAFE
678  */
679 int
680 sendto(td, uap)
681 	struct thread *td;
682 	register struct sendto_args /* {
683 		int	s;
684 		caddr_t	buf;
685 		size_t	len;
686 		int	flags;
687 		caddr_t	to;
688 		int	tolen;
689 	} */ *uap;
690 {
691 	struct msghdr msg;
692 	struct iovec aiov;
693 	int error;
694 
695 	msg.msg_name = uap->to;
696 	msg.msg_namelen = uap->tolen;
697 	msg.msg_iov = &aiov;
698 	msg.msg_iovlen = 1;
699 	msg.msg_control = 0;
700 #ifdef COMPAT_OLDSOCK
701 	msg.msg_flags = 0;
702 #endif
703 	aiov.iov_base = uap->buf;
704 	aiov.iov_len = uap->len;
705 	mtx_lock(&Giant);
706 	error = sendit(td, uap->s, &msg, uap->flags);
707 	mtx_unlock(&Giant);
708 	return (error);
709 }
710 
711 #ifdef COMPAT_OLDSOCK
712 /*
713  * MPSAFE
714  */
715 int
716 osend(td, uap)
717 	struct thread *td;
718 	register struct osend_args /* {
719 		int	s;
720 		caddr_t	buf;
721 		int	len;
722 		int	flags;
723 	} */ *uap;
724 {
725 	struct msghdr msg;
726 	struct iovec aiov;
727 	int error;
728 
729 	msg.msg_name = 0;
730 	msg.msg_namelen = 0;
731 	msg.msg_iov = &aiov;
732 	msg.msg_iovlen = 1;
733 	aiov.iov_base = uap->buf;
734 	aiov.iov_len = uap->len;
735 	msg.msg_control = 0;
736 	msg.msg_flags = 0;
737 	mtx_lock(&Giant);
738 	error = sendit(td, uap->s, &msg, uap->flags);
739 	mtx_unlock(&Giant);
740 	return (error);
741 }
742 
743 /*
744  * MPSAFE
745  */
746 int
747 osendmsg(td, uap)
748 	struct thread *td;
749 	register struct osendmsg_args /* {
750 		int	s;
751 		caddr_t	msg;
752 		int	flags;
753 	} */ *uap;
754 {
755 	struct msghdr msg;
756 	struct iovec aiov[UIO_SMALLIOV], *iov;
757 	int error;
758 
759 	mtx_lock(&Giant);
760 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
761 	if (error)
762 		goto done2;
763 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
764 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
765 			error = EMSGSIZE;
766 			goto done2;
767 		}
768 		MALLOC(iov, struct iovec *,
769 		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
770 		      M_WAITOK);
771 	} else {
772 		iov = aiov;
773 	}
774 	error = copyin(msg.msg_iov, iov,
775 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
776 	if (error)
777 		goto done;
778 	msg.msg_flags = MSG_COMPAT;
779 	msg.msg_iov = iov;
780 	error = sendit(td, uap->s, &msg, uap->flags);
781 done:
782 	if (iov != aiov)
783 		FREE(iov, M_IOV);
784 done2:
785 	mtx_unlock(&Giant);
786 	return (error);
787 }
788 #endif
789 
790 /*
791  * MPSAFE
792  */
793 int
794 sendmsg(td, uap)
795 	struct thread *td;
796 	register struct sendmsg_args /* {
797 		int	s;
798 		caddr_t	msg;
799 		int	flags;
800 	} */ *uap;
801 {
802 	struct msghdr msg;
803 	struct iovec aiov[UIO_SMALLIOV], *iov;
804 	int error;
805 
806 	mtx_lock(&Giant);
807 	error = copyin(uap->msg, &msg, sizeof (msg));
808 	if (error)
809 		goto done2;
810 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
811 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
812 			error = EMSGSIZE;
813 			goto done2;
814 		}
815 		MALLOC(iov, struct iovec *,
816 		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
817 		       M_WAITOK);
818 	} else {
819 		iov = aiov;
820 	}
821 	if (msg.msg_iovlen &&
822 	    (error = copyin(msg.msg_iov, iov,
823 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
824 		goto done;
825 	msg.msg_iov = iov;
826 #ifdef COMPAT_OLDSOCK
827 	msg.msg_flags = 0;
828 #endif
829 	error = sendit(td, uap->s, &msg, uap->flags);
830 done:
831 	if (iov != aiov)
832 		FREE(iov, M_IOV);
833 done2:
834 	mtx_unlock(&Giant);
835 	return (error);
836 }
837 
838 static int
839 recvit(td, s, mp, namelenp)
840 	register struct thread *td;
841 	int s;
842 	register struct msghdr *mp;
843 	void *namelenp;
844 {
845 	struct uio auio;
846 	register struct iovec *iov;
847 	register int i;
848 	int len, error;
849 	struct mbuf *m, *control = 0;
850 	caddr_t ctlbuf;
851 	struct socket *so;
852 	struct sockaddr *fromsa = 0;
853 #ifdef KTRACE
854 	struct iovec *ktriov = NULL;
855 	struct uio ktruio;
856 	int iovlen;
857 #endif
858 
859 	if ((error = fgetsock(td, s, &so, NULL)) != 0)
860 		return (error);
861 	auio.uio_iov = mp->msg_iov;
862 	auio.uio_iovcnt = mp->msg_iovlen;
863 	auio.uio_segflg = UIO_USERSPACE;
864 	auio.uio_rw = UIO_READ;
865 	auio.uio_td = td;
866 	auio.uio_offset = 0;			/* XXX */
867 	auio.uio_resid = 0;
868 	iov = mp->msg_iov;
869 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
870 		if ((auio.uio_resid += iov->iov_len) < 0) {
871 			fputsock(so);
872 			return (EINVAL);
873 		}
874 	}
875 #ifdef KTRACE
876 	if (KTRPOINT(td, KTR_GENIO)) {
877 		iovlen = auio.uio_iovcnt * sizeof (struct iovec);
878 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
879 		bcopy(auio.uio_iov, ktriov, iovlen);
880 		ktruio = auio;
881 	}
882 #endif
883 	len = auio.uio_resid;
884 	error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
885 	    (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
886 	    &mp->msg_flags);
887 	if (error) {
888 		if (auio.uio_resid != len && (error == ERESTART ||
889 		    error == EINTR || error == EWOULDBLOCK))
890 			error = 0;
891 	}
892 #ifdef KTRACE
893 	if (ktriov != NULL) {
894 		if (error == 0) {
895 			ktruio.uio_iov = ktriov;
896 			ktruio.uio_resid = len - auio.uio_resid;
897 			ktrgenio(s, UIO_READ, &ktruio, error);
898 		}
899 		FREE(ktriov, M_TEMP);
900 	}
901 #endif
902 	if (error)
903 		goto out;
904 	td->td_retval[0] = len - auio.uio_resid;
905 	if (mp->msg_name) {
906 		len = mp->msg_namelen;
907 		if (len <= 0 || fromsa == 0)
908 			len = 0;
909 		else {
910 #ifndef MIN
911 #define MIN(a,b) ((a)>(b)?(b):(a))
912 #endif
913 			/* save sa_len before it is destroyed by MSG_COMPAT */
914 			len = MIN(len, fromsa->sa_len);
915 #ifdef COMPAT_OLDSOCK
916 			if (mp->msg_flags & MSG_COMPAT)
917 				((struct osockaddr *)fromsa)->sa_family =
918 				    fromsa->sa_family;
919 #endif
920 			error = copyout(fromsa, mp->msg_name, (unsigned)len);
921 			if (error)
922 				goto out;
923 		}
924 		mp->msg_namelen = len;
925 		if (namelenp &&
926 		    (error = copyout(&len, namelenp, sizeof (int)))) {
927 #ifdef COMPAT_OLDSOCK
928 			if (mp->msg_flags & MSG_COMPAT)
929 				error = 0;	/* old recvfrom didn't check */
930 			else
931 #endif
932 			goto out;
933 		}
934 	}
935 	if (mp->msg_control) {
936 #ifdef COMPAT_OLDSOCK
937 		/*
938 		 * We assume that old recvmsg calls won't receive access
939 		 * rights and other control info, esp. as control info
940 		 * is always optional and those options didn't exist in 4.3.
941 		 * If we receive rights, trim the cmsghdr; anything else
942 		 * is tossed.
943 		 */
944 		if (control && mp->msg_flags & MSG_COMPAT) {
945 			if (mtod(control, struct cmsghdr *)->cmsg_level !=
946 			    SOL_SOCKET ||
947 			    mtod(control, struct cmsghdr *)->cmsg_type !=
948 			    SCM_RIGHTS) {
949 				mp->msg_controllen = 0;
950 				goto out;
951 			}
952 			control->m_len -= sizeof (struct cmsghdr);
953 			control->m_data += sizeof (struct cmsghdr);
954 		}
955 #endif
956 		len = mp->msg_controllen;
957 		m = control;
958 		mp->msg_controllen = 0;
959 		ctlbuf = mp->msg_control;
960 
961 		while (m && len > 0) {
962 			unsigned int tocopy;
963 
964 			if (len >= m->m_len)
965 				tocopy = m->m_len;
966 			else {
967 				mp->msg_flags |= MSG_CTRUNC;
968 				tocopy = len;
969 			}
970 
971 			if ((error = copyout(mtod(m, caddr_t),
972 					ctlbuf, tocopy)) != 0)
973 				goto out;
974 
975 			ctlbuf += tocopy;
976 			len -= tocopy;
977 			m = m->m_next;
978 		}
979 		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
980 	}
981 out:
982 	fputsock(so);
983 	if (fromsa)
984 		FREE(fromsa, M_SONAME);
985 	if (control)
986 		m_freem(control);
987 	return (error);
988 }
989 
990 /*
991  * MPSAFE
992  */
993 int
994 recvfrom(td, uap)
995 	struct thread *td;
996 	register struct recvfrom_args /* {
997 		int	s;
998 		caddr_t	buf;
999 		size_t	len;
1000 		int	flags;
1001 		caddr_t	from;
1002 		int	*fromlenaddr;
1003 	} */ *uap;
1004 {
1005 	struct msghdr msg;
1006 	struct iovec aiov;
1007 	int error;
1008 
1009 	mtx_lock(&Giant);
1010 	if (uap->fromlenaddr) {
1011 		error = copyin(uap->fromlenaddr,
1012 		    &msg.msg_namelen, sizeof (msg.msg_namelen));
1013 		if (error)
1014 			goto done2;
1015 	} else {
1016 		msg.msg_namelen = 0;
1017 	}
1018 	msg.msg_name = uap->from;
1019 	msg.msg_iov = &aiov;
1020 	msg.msg_iovlen = 1;
1021 	aiov.iov_base = uap->buf;
1022 	aiov.iov_len = uap->len;
1023 	msg.msg_control = 0;
1024 	msg.msg_flags = uap->flags;
1025 	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
1026 done2:
1027 	mtx_unlock(&Giant);
1028 	return(error);
1029 }
1030 
#ifdef COMPAT_OLDSOCK
/*
 * orecvfrom() - old-style recvfrom: sets MSG_COMPAT in the caller's
 * flags and defers to recvfrom().
 *
 * MPSAFE
 */
int
orecvfrom(struct thread *td, struct recvfrom_args *uap)
{
	int error;

	uap->flags = uap->flags | MSG_COMPAT;
	error = recvfrom(td, uap);
	return (error);
}
#endif
1045 
1046 
1047 #ifdef COMPAT_OLDSOCK
1048 /*
1049  * MPSAFE
1050  */
1051 int
1052 orecv(td, uap)
1053 	struct thread *td;
1054 	register struct orecv_args /* {
1055 		int	s;
1056 		caddr_t	buf;
1057 		int	len;
1058 		int	flags;
1059 	} */ *uap;
1060 {
1061 	struct msghdr msg;
1062 	struct iovec aiov;
1063 	int error;
1064 
1065 	mtx_lock(&Giant);
1066 	msg.msg_name = 0;
1067 	msg.msg_namelen = 0;
1068 	msg.msg_iov = &aiov;
1069 	msg.msg_iovlen = 1;
1070 	aiov.iov_base = uap->buf;
1071 	aiov.iov_len = uap->len;
1072 	msg.msg_control = 0;
1073 	msg.msg_flags = uap->flags;
1074 	error = recvit(td, uap->s, &msg, NULL);
1075 	mtx_unlock(&Giant);
1076 	return (error);
1077 }
1078 
1079 /*
1080  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
1081  * overlays the new one, missing only the flags, and with the (old) access
1082  * rights where the control fields are now.
1083  *
1084  * MPSAFE
1085  */
1086 int
1087 orecvmsg(td, uap)
1088 	struct thread *td;
1089 	register struct orecvmsg_args /* {
1090 		int	s;
1091 		struct	omsghdr *msg;
1092 		int	flags;
1093 	} */ *uap;
1094 {
1095 	struct msghdr msg;
1096 	struct iovec aiov[UIO_SMALLIOV], *iov;
1097 	int error;
1098 
1099 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1100 	if (error)
1101 		return (error);
1102 
1103 	mtx_lock(&Giant);
1104 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
1105 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
1106 			error = EMSGSIZE;
1107 			goto done2;
1108 		}
1109 		MALLOC(iov, struct iovec *,
1110 		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
1111 		      M_WAITOK);
1112 	} else {
1113 		iov = aiov;
1114 	}
1115 	msg.msg_flags = uap->flags | MSG_COMPAT;
1116 	error = copyin(msg.msg_iov, iov,
1117 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
1118 	if (error)
1119 		goto done;
1120 	msg.msg_iov = iov;
1121 	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
1122 
1123 	if (msg.msg_controllen && error == 0)
1124 		error = copyout(&msg.msg_controllen,
1125 		    &uap->msg->msg_accrightslen, sizeof (int));
1126 done:
1127 	if (iov != aiov)
1128 		FREE(iov, M_IOV);
1129 done2:
1130 	mtx_unlock(&Giant);
1131 	return (error);
1132 }
1133 #endif
1134 
1135 /*
1136  * MPSAFE
1137  */
1138 int
1139 recvmsg(td, uap)
1140 	struct thread *td;
1141 	register struct recvmsg_args /* {
1142 		int	s;
1143 		struct	msghdr *msg;
1144 		int	flags;
1145 	} */ *uap;
1146 {
1147 	struct msghdr msg;
1148 	struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
1149 	register int error;
1150 
1151 	mtx_lock(&Giant);
1152 	error = copyin(uap->msg, &msg, sizeof (msg));
1153 	if (error)
1154 		goto done2;
1155 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
1156 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
1157 			error = EMSGSIZE;
1158 			goto done2;
1159 		}
1160 		MALLOC(iov, struct iovec *,
1161 		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
1162 		       M_WAITOK);
1163 	} else {
1164 		iov = aiov;
1165 	}
1166 #ifdef COMPAT_OLDSOCK
1167 	msg.msg_flags = uap->flags &~ MSG_COMPAT;
1168 #else
1169 	msg.msg_flags = uap->flags;
1170 #endif
1171 	uiov = msg.msg_iov;
1172 	msg.msg_iov = iov;
1173 	error = copyin(uiov, iov,
1174 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
1175 	if (error)
1176 		goto done;
1177 	error = recvit(td, uap->s, &msg, NULL);
1178 	if (!error) {
1179 		msg.msg_iov = uiov;
1180 		error = copyout(&msg, uap->msg, sizeof(msg));
1181 	}
1182 done:
1183 	if (iov != aiov)
1184 		FREE(iov, M_IOV);
1185 done2:
1186 	mtx_unlock(&Giant);
1187 	return (error);
1188 }
1189 
1190 /*
1191  * MPSAFE
1192  */
1193 /* ARGSUSED */
1194 int
1195 shutdown(td, uap)
1196 	struct thread *td;
1197 	register struct shutdown_args /* {
1198 		int	s;
1199 		int	how;
1200 	} */ *uap;
1201 {
1202 	struct socket *so;
1203 	int error;
1204 
1205 	mtx_lock(&Giant);
1206 	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
1207 		error = soshutdown(so, uap->how);
1208 		fputsock(so);
1209 	}
1210 	mtx_unlock(&Giant);
1211 	return(error);
1212 }
1213 
1214 /*
1215  * MPSAFE
1216  */
1217 /* ARGSUSED */
1218 int
1219 setsockopt(td, uap)
1220 	struct thread *td;
1221 	register struct setsockopt_args /* {
1222 		int	s;
1223 		int	level;
1224 		int	name;
1225 		caddr_t	val;
1226 		int	valsize;
1227 	} */ *uap;
1228 {
1229 	struct socket *so;
1230 	struct sockopt sopt;
1231 	int error;
1232 
1233 	if (uap->val == 0 && uap->valsize != 0)
1234 		return (EFAULT);
1235 	if (uap->valsize < 0)
1236 		return (EINVAL);
1237 
1238 	mtx_lock(&Giant);
1239 	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
1240 		sopt.sopt_dir = SOPT_SET;
1241 		sopt.sopt_level = uap->level;
1242 		sopt.sopt_name = uap->name;
1243 		sopt.sopt_val = uap->val;
1244 		sopt.sopt_valsize = uap->valsize;
1245 		sopt.sopt_td = td;
1246 		error = sosetopt(so, &sopt);
1247 		fputsock(so);
1248 	}
1249 	mtx_unlock(&Giant);
1250 	return(error);
1251 }
1252 
1253 /*
1254  * MPSAFE
1255  */
1256 /* ARGSUSED */
1257 int
1258 getsockopt(td, uap)
1259 	struct thread *td;
1260 	register struct getsockopt_args /* {
1261 		int	s;
1262 		int	level;
1263 		int	name;
1264 		caddr_t	val;
1265 		int	*avalsize;
1266 	} */ *uap;
1267 {
1268 	int	valsize, error;
1269 	struct  socket *so;
1270 	struct	sockopt sopt;
1271 
1272 	mtx_lock(&Giant);
1273 	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
1274 		goto done2;
1275 	if (uap->val) {
1276 		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
1277 		if (error)
1278 			goto done1;
1279 		if (valsize < 0) {
1280 			error = EINVAL;
1281 			goto done1;
1282 		}
1283 	} else {
1284 		valsize = 0;
1285 	}
1286 
1287 	sopt.sopt_dir = SOPT_GET;
1288 	sopt.sopt_level = uap->level;
1289 	sopt.sopt_name = uap->name;
1290 	sopt.sopt_val = uap->val;
1291 	sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
1292 	sopt.sopt_td = td;
1293 
1294 	error = sogetopt(so, &sopt);
1295 	if (error == 0) {
1296 		valsize = sopt.sopt_valsize;
1297 		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
1298 	}
1299 done1:
1300 	fputsock(so);
1301 done2:
1302 	mtx_unlock(&Giant);
1303 	return (error);
1304 }
1305 
1306 /*
1307  * getsockname1() - Get socket name.
1308  *
1309  * MPSAFE
1310  */
1311 /* ARGSUSED */
1312 static int
1313 getsockname1(td, uap, compat)
1314 	struct thread *td;
1315 	register struct getsockname_args /* {
1316 		int	fdes;
1317 		caddr_t	asa;
1318 		int	*alen;
1319 	} */ *uap;
1320 	int compat;
1321 {
1322 	struct socket *so;
1323 	struct sockaddr *sa;
1324 	int len, error;
1325 
1326 	mtx_lock(&Giant);
1327 	if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0)
1328 		goto done2;
1329 	error = copyin(uap->alen, &len, sizeof (len));
1330 	if (error)
1331 		goto done1;
1332 	sa = 0;
1333 	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
1334 	if (error)
1335 		goto bad;
1336 	if (sa == 0) {
1337 		len = 0;
1338 		goto gotnothing;
1339 	}
1340 
1341 	len = MIN(len, sa->sa_len);
1342 #ifdef COMPAT_OLDSOCK
1343 	if (compat)
1344 		((struct osockaddr *)sa)->sa_family = sa->sa_family;
1345 #endif
1346 	error = copyout(sa, uap->asa, (u_int)len);
1347 	if (error == 0)
1348 gotnothing:
1349 		error = copyout(&len, uap->alen, sizeof (len));
1350 bad:
1351 	if (sa)
1352 		FREE(sa, M_SONAME);
1353 done1:
1354 	fputsock(so);
1355 done2:
1356 	mtx_unlock(&Giant);
1357 	return (error);
1358 }
1359 
/*
 * getsockname(2) system call: getsockname1() with compat set to 0.
 *
 * MPSAFE
 */
int
getsockname(struct thread *td, struct getsockname_args *uap)
{
	int rv;

	rv = getsockname1(td, uap, 0);
	return (rv);
}
1371 
#ifdef COMPAT_OLDSOCK
/*
 * Old-style getsockname entry point, compiled only with COMPAT_OLDSOCK:
 * getsockname1() with compat set to 1.
 *
 * MPSAFE
 */
int
ogetsockname(struct thread *td, struct getsockname_args *uap)
{
	int rv;

	rv = getsockname1(td, uap, 1);
	return (rv);
}
#endif /* COMPAT_OLDSOCK */
1385 
1386 /*
1387  * getpeername1() - Get name of peer for connected socket.
1388  *
1389  * MPSAFE
1390  */
1391 /* ARGSUSED */
1392 static int
1393 getpeername1(td, uap, compat)
1394 	struct thread *td;
1395 	register struct getpeername_args /* {
1396 		int	fdes;
1397 		caddr_t	asa;
1398 		int	*alen;
1399 	} */ *uap;
1400 	int compat;
1401 {
1402 	struct socket *so;
1403 	struct sockaddr *sa;
1404 	int len, error;
1405 
1406 	mtx_lock(&Giant);
1407 	if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0)
1408 		goto done2;
1409 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1410 		error = ENOTCONN;
1411 		goto done1;
1412 	}
1413 	error = copyin(uap->alen, &len, sizeof (len));
1414 	if (error)
1415 		goto done1;
1416 	sa = 0;
1417 	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
1418 	if (error)
1419 		goto bad;
1420 	if (sa == 0) {
1421 		len = 0;
1422 		goto gotnothing;
1423 	}
1424 	len = MIN(len, sa->sa_len);
1425 #ifdef COMPAT_OLDSOCK
1426 	if (compat)
1427 		((struct osockaddr *)sa)->sa_family =
1428 		    sa->sa_family;
1429 #endif
1430 	error = copyout(sa, uap->asa, (u_int)len);
1431 	if (error)
1432 		goto bad;
1433 gotnothing:
1434 	error = copyout(&len, uap->alen, sizeof (len));
1435 bad:
1436 	if (sa)
1437 		FREE(sa, M_SONAME);
1438 done1:
1439 	fputsock(so);
1440 done2:
1441 	mtx_unlock(&Giant);
1442 	return (error);
1443 }
1444 
/*
 * getpeername(2) system call: getpeername1() with compat set to 0.
 *
 * MPSAFE
 */
int
getpeername(struct thread *td, struct getpeername_args *uap)
{
	int rv;

	rv = getpeername1(td, uap, 0);
	return (rv);
}
1456 
#ifdef COMPAT_OLDSOCK
/*
 * Old-style getpeername entry point, compiled only with COMPAT_OLDSOCK:
 * getpeername1() with compat set to 1.
 *
 * MPSAFE
 */
int
ogetpeername(struct thread *td, struct ogetpeername_args *uap)
{
	struct getpeername_args *args;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	args = (struct getpeername_args *)uap;
	return (getpeername1(td, args, 1));
}
#endif /* COMPAT_OLDSOCK */
1471 
1472 int
1473 sockargs(mp, buf, buflen, type)
1474 	struct mbuf **mp;
1475 	caddr_t buf;
1476 	int buflen, type;
1477 {
1478 	register struct sockaddr *sa;
1479 	register struct mbuf *m;
1480 	int error;
1481 
1482 	if ((u_int)buflen > MLEN) {
1483 #ifdef COMPAT_OLDSOCK
1484 		if (type == MT_SONAME && (u_int)buflen <= 112)
1485 			buflen = MLEN;		/* unix domain compat. hack */
1486 		else
1487 #endif
1488 		return (EINVAL);
1489 	}
1490 	m = m_get(M_TRYWAIT, type);
1491 	if (m == NULL)
1492 		return (ENOBUFS);
1493 	m->m_len = buflen;
1494 	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1495 	if (error)
1496 		(void) m_free(m);
1497 	else {
1498 		*mp = m;
1499 		if (type == MT_SONAME) {
1500 			sa = mtod(m, struct sockaddr *);
1501 
1502 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1503 			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1504 				sa->sa_family = sa->sa_len;
1505 #endif
1506 			sa->sa_len = buflen;
1507 		}
1508 	}
1509 	return (error);
1510 }
1511 
1512 int
1513 getsockaddr(namp, uaddr, len)
1514 	struct sockaddr **namp;
1515 	caddr_t uaddr;
1516 	size_t len;
1517 {
1518 	struct sockaddr *sa;
1519 	int error;
1520 
1521 	if (len > SOCK_MAXADDRLEN)
1522 		return ENAMETOOLONG;
1523 	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1524 	error = copyin(uaddr, sa, len);
1525 	if (error) {
1526 		FREE(sa, M_SONAME);
1527 	} else {
1528 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1529 		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1530 			sa->sa_family = sa->sa_len;
1531 #endif
1532 		sa->sa_len = len;
1533 		*namp = sa;
1534 	}
1535 	return error;
1536 }
1537 
1538 /*
1539  * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
1540  * XXX - The sf_buf functions are currently private to sendfile(2), so have
1541  * been made static, but may be useful in the future for doing zero-copy in
1542  * other parts of the networking code.
1543  */
1544 static void
1545 sf_buf_init(void *arg)
1546 {
1547 	int i;
1548 
1549 	mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF);
1550 	mtx_lock(&sf_freelist.sf_lock);
1551 	SLIST_INIT(&sf_freelist.sf_head);
1552 	sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
1553 	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP,
1554 	    M_NOWAIT | M_ZERO);
1555 	for (i = 0; i < nsfbufs; i++) {
1556 		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
1557 		SLIST_INSERT_HEAD(&sf_freelist.sf_head, &sf_bufs[i], free_list);
1558 	}
1559 	sf_buf_alloc_want = 0;
1560 	mtx_unlock(&sf_freelist.sf_lock);
1561 }
1562 
1563 /*
1564  * Get an sf_buf from the freelist. Will block if none are available.
1565  */
1566 struct sf_buf *
1567 sf_buf_alloc()
1568 {
1569 	struct sf_buf *sf;
1570 	int error;
1571 
1572 	mtx_lock(&sf_freelist.sf_lock);
1573 	while ((sf = SLIST_FIRST(&sf_freelist.sf_head)) == NULL) {
1574 		sf_buf_alloc_want++;
1575 		error = msleep(&sf_freelist, &sf_freelist.sf_lock, PVM|PCATCH,
1576 		    "sfbufa", 0);
1577 		sf_buf_alloc_want--;
1578 
1579 		/*
1580 		 * If we got a signal, don't risk going back to sleep.
1581 		 */
1582 		if (error)
1583 			break;
1584 	}
1585 	if (sf != NULL)
1586 		SLIST_REMOVE_HEAD(&sf_freelist.sf_head, free_list);
1587 	mtx_unlock(&sf_freelist.sf_lock);
1588 	return (sf);
1589 }
1590 
1591 #define dtosf(x)	(&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])
1592 
1593 /*
1594  * Detatch mapped page and release resources back to the system.
1595  */
1596 void
1597 sf_buf_free(void *addr, void *args)
1598 {
1599 	struct sf_buf *sf;
1600 	struct vm_page *m;
1601 
1602 	GIANT_REQUIRED;
1603 
1604 	sf = dtosf(addr);
1605 	pmap_qremove((vm_offset_t)addr, 1);
1606 	m = sf->m;
1607 	vm_page_unwire(m, 0);
1608 	/*
1609 	 * Check for the object going away on us. This can
1610 	 * happen since we don't hold a reference to it.
1611 	 * If so, we're responsible for freeing the page.
1612 	 */
1613 	if (m->wire_count == 0 && m->object == NULL)
1614 		vm_page_free(m);
1615 	sf->m = NULL;
1616 	mtx_lock(&sf_freelist.sf_lock);
1617 	SLIST_INSERT_HEAD(&sf_freelist.sf_head, sf, free_list);
1618 	if (sf_buf_alloc_want > 0)
1619 		wakeup_one(&sf_freelist);
1620 	mtx_unlock(&sf_freelist.sf_lock);
1621 }
1622 
1623 /*
1624  * sendfile(2)
1625  *
1626  * MPSAFE
1627  *
1628  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
1629  *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
1630  *
1631  * Send a file specified by 'fd' and starting at 'offset' to a socket
1632  * specified by 's'. Send only 'nbytes' of the file or until EOF if
1633  * nbytes == 0. Optionally add a header and/or trailer to the socket
1634  * output. If specified, write the total number of bytes sent into *sbytes.
1635  *
      * Argument checks, in the order performed:
      *	- 'fd' must be a VREG vnode with a VM object, otherwise EINVAL;
      *	- 's' must be a SOCK_STREAM socket (EINVAL) with SS_ISCONNECTED
      *	  set (ENOTCONN);
      *	- a negative 'offset' yields EINVAL.
      *
      * Headers and trailers listed in *hdtr are sent with writev() on 's';
      * their byte counts are included in the value stored in *sbytes.
      * Every exit goes through the 'done' label, so *sbytes is written on
      * the error paths as well; the result of that copyout() is not
      * checked.  On success td->td_retval[0] is reset to 0.  'flags' is
      * not referenced by this function.  The whole call runs under Giant.
      *
1636  */
1637 int
1638 sendfile(struct thread *td, struct sendfile_args *uap)
1639 {
     	/*
     	 * NOTE(review): vp has no initializer but is tested at 'done'.
     	 * That is only safe if fgetvp_read() stores to it on failure too
     	 * -- confirm against its definition.
     	 */
1640 	struct vnode *vp;
1641 	struct vm_object *obj;
1642 	struct socket *so = NULL;
1643 	struct mbuf *m;
1644 	struct sf_buf *sf;
1645 	struct vm_page *pg;
1646 	struct writev_args nuap;
1647 	struct sf_hdtr hdtr;
     	/* sbytes counts file bytes queued; hdtr_size header/trailer bytes. */
1648 	off_t off, xfsize, hdtr_size, sbytes = 0;
1649 	int error, s;
1650 
1651 	mtx_lock(&Giant);
1652 
1653 	hdtr_size = 0;
1654 
1655 	/*
1656 	 * The descriptor must be a regular file and have a backing VM object.
1657 	 */
1658 	if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
1659 		goto done;
1660 	if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) {
1661 		error = EINVAL;
1662 		goto done;
1663 	}
1664 	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
1665 		goto done;
1666 	if (so->so_type != SOCK_STREAM) {
1667 		error = EINVAL;
1668 		goto done;
1669 	}
1670 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1671 		error = ENOTCONN;
1672 		goto done;
1673 	}
1674 	if (uap->offset < 0) {
1675 		error = EINVAL;
1676 		goto done;
1677 	}
1678 
1679 	/*
1680 	 * If specified, get the pointer to the sf_hdtr struct for
1681 	 * any headers/trailers.
1682 	 */
1683 	if (uap->hdtr != NULL) {
1684 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1685 		if (error)
1686 			goto done;
1687 		/*
1688 		 * Send any headers. Wimp out and use writev(2).
1689 		 */
1690 		if (hdtr.headers != NULL) {
1691 			nuap.fd = uap->s;
1692 			nuap.iovp = hdtr.headers;
1693 			nuap.iovcnt = hdtr.hdr_cnt;
1694 			error = writev(td, &nuap);
1695 			if (error)
1696 				goto done;
     			/* writev() leaves its byte count in td_retval[0]. */
1697 			hdtr_size += td->td_retval[0];
1698 		}
1699 	}
1700 
1701 	/*
1702 	 * Protect against multiple writers to the socket.
1703 	 */
     	/*
     	 * NOTE(review): the return value of sblock() is discarded, so a
     	 * failed lock attempt would go unnoticed and the sbunlock() calls
     	 * below would still run -- confirm whether sblock() can fail here.
     	 */
1704 	(void) sblock(&so->so_snd, M_WAITOK);
1705 
1706 	/*
1707 	 * Loop through the pages in the file, starting with the requested
1708 	 * offset. Get a file page (do I/O if necessary), map the file page
1709 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1710 	 * it on the socket.
1711 	 */
     	/*
     	 * Each iteration sends at most one page; every error exit inside
     	 * the loop calls sbunlock() itself before jumping to 'done'.
     	 */
1712 	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
1713 		vm_pindex_t pindex;
1714 		vm_offset_t pgoff;
1715 
1716 		pindex = OFF_TO_IDX(off);
1717 retry_lookup:
1718 		/*
1719 		 * Calculate the amount to transfer. Not to exceed a page,
1720 		 * the EOF, or the passed in nbytes.
1721 		 */
1722 		xfsize = obj->un_pager.vnp.vnp_size - off;
1723 		if (xfsize > PAGE_SIZE)
1724 			xfsize = PAGE_SIZE;
1725 		pgoff = (vm_offset_t)(off & PAGE_MASK);
1726 		if (PAGE_SIZE - pgoff < xfsize)
1727 			xfsize = PAGE_SIZE - pgoff;
1728 		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
1729 			xfsize = uap->nbytes - sbytes;
     		/* Nothing left: EOF reached or nbytes satisfied. */
1730 		if (xfsize <= 0)
1731 			break;
1732 		/*
1733 		 * Optimize the non-blocking case by looking at the socket space
1734 		 * before going to the extra work of constituting the sf_buf.
1735 		 */
1736 		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
1737 			if (so->so_state & SS_CANTSENDMORE)
1738 				error = EPIPE;
1739 			else
1740 				error = EAGAIN;
1741 			sbunlock(&so->so_snd);
1742 			goto done;
1743 		}
1744 		/*
1745 		 * Attempt to look up the page.
1746 		 *
1747 		 *	Allocate if not found
1748 		 *
1749 		 *	Wait and loop if busy.
1750 		 */
1751 		pg = vm_page_lookup(obj, pindex);
1752 
1753 		if (pg == NULL) {
1754 			pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
1755 			if (pg == NULL) {
1756 				VM_WAIT;
1757 				goto retry_lookup;
1758 			}
1759 			vm_page_wakeup(pg);
1760 		} else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) {
1761 			goto retry_lookup;
1762 		}
1763 
1764 		/*
1765 		 * Wire the page so it does not get ripped out from under
1766 		 * us.
1767 		 */
1768 
1769 		vm_page_wire(pg);
1770 
1771 		/*
1772 		 * If page is not valid for what we need, initiate I/O
1773 		 */
1774 
1775 		if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
1776 			int bsize;
1777 
1778 			/*
1779 			 * Ensure that our page is still around when the I/O
1780 			 * completes.
1781 			 */
1782 			vm_page_io_start(pg);
1783 
1784 			/*
1785 			 * Get the page from backing store.
1786 			 */
     			/*
     			 * The read is issued with a NULL buffer and
     			 * UIO_NOCOPY, so no data is returned to this
     			 * function; presumably it is done only to make
     			 * pg valid -- TODO confirm.
     			 */
1787 			bsize = vp->v_mount->mnt_stat.f_iosize;
1788 			vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td);
1789 			error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
1790 			    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
1791 			    IO_VMIO | ((MAXBSIZE / bsize) << 16),
1792 			    td->td_ucred, NULL, td);
1793 			VOP_UNLOCK(vp, 0, td);
1794 			vm_page_flag_clear(pg, PG_ZERO);
1795 			vm_page_io_finish(pg);
1796 			if (error) {
1797 				vm_page_unwire(pg, 0);
1798 				/*
1799 				 * See if anyone else might know about this page.
1800 				 * If not and it is not valid, then free it.
1801 				 */
1802 				if (pg->wire_count == 0 && pg->valid == 0 &&
1803 				    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
1804 				    pg->hold_count == 0) {
1805 					vm_page_busy(pg);
1806 					vm_page_free(pg);
1807 				}
1808 				sbunlock(&so->so_snd);
1809 				goto done;
1810 			}
1811 		}
1812 
1813 
1814 		/*
1815 		 * Get a sendfile buf. We usually wait as long as necessary,
1816 		 * but this wait can be interrupted.
1817 		 */
     		/*
     		 * sf_buf_alloc() returns NULL when its msleep() returned
     		 * an error; that is reported to the caller as EINTR.
     		 */
1818 		if ((sf = sf_buf_alloc()) == NULL) {
1819 			vm_page_unwire(pg, 0);
1820 			if (pg->wire_count == 0 && pg->object == NULL)
1821 				vm_page_free(pg);
1822 			sbunlock(&so->so_snd);
1823 			error = EINTR;
1824 			goto done;
1825 		}
1826 
1827 		/*
1828 		 * Allocate a kernel virtual page and insert the physical page
1829 		 * into it.
1830 		 */
1831 		sf->m = pg;
1832 		pmap_qenter(sf->kva, &pg, 1);
1833 		/*
1834 		 * Get an mbuf header and set it up as having external storage.
1835 		 */
1836 		MGETHDR(m, M_TRYWAIT, MT_DATA);
1837 		if (m == NULL) {
1838 			error = ENOBUFS;
     			/* Unmaps and unwires pg and returns sf to the pool. */
1839 			sf_buf_free((void *)sf->kva, NULL);
1840 			sbunlock(&so->so_snd);
1841 			goto done;
1842 		}
1843 		/*
1844 		 * Setup external storage for mbuf.
1845 		 */
     		/*
     		 * sf_buf_free is registered as the free routine for the
     		 * external storage; the error paths below only call
     		 * m_freem(m), presumably relying on it to release sf and
     		 * the page -- TODO confirm.
     		 */
1846 		MEXTADD(m, sf->kva, PAGE_SIZE, sf_buf_free, NULL, M_RDONLY,
1847 		    EXT_SFBUF);
1848 		m->m_data = (char *) sf->kva + pgoff;
1849 		m->m_pkthdr.len = m->m_len = xfsize;
1850 		/*
1851 		 * Add the buffer to the socket buffer chain.
1852 		 */
1853 		s = splnet();
1854 retry_space:
1855 		/*
1856 		 * Make sure that the socket is still able to take more data.
1857 		 * CANTSENDMORE being true usually means that the connection
1858 		 * was closed. so_error is true when an error was sensed after
1859 		 * a previous send.
1860 		 * The state is checked after the page mapping and buffer
1861 		 * allocation above since those operations may block and make
1862 		 * any socket checks stale. From this point forward, nothing
1863 		 * blocks before the pru_send (or more accurately, any blocking
1864 		 * results in a loop back to here to re-check).
1865 		 */
1866 		if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
1867 			if (so->so_state & SS_CANTSENDMORE) {
1868 				error = EPIPE;
1869 			} else {
1870 				error = so->so_error;
1871 				so->so_error = 0;
1872 			}
1873 			m_freem(m);
1874 			sbunlock(&so->so_snd);
1875 			splx(s);
1876 			goto done;
1877 		}
1878 		/*
1879 		 * Wait for socket space to become available. We do this just
1880 		 * after checking the connection state above in order to avoid
1881 		 * a race condition with sbwait().
1882 		 */
1883 		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
1884 			if (so->so_state & SS_NBIO) {
1885 				m_freem(m);
1886 				sbunlock(&so->so_snd);
1887 				splx(s);
1888 				error = EAGAIN;
1889 				goto done;
1890 			}
1891 			error = sbwait(&so->so_snd);
1892 			/*
1893 			 * An error from sbwait usually indicates that we've
1894 			 * been interrupted by a signal. If we've sent anything
1895 			 * then return bytes sent, otherwise return the error.
1896 			 */
     			/*
     			 * NOTE(review): the code below does not
     			 * distinguish those two cases -- it always jumps
     			 * to 'done' with error set; the byte count is
     			 * only reported through *sbytes.
     			 */
1897 			if (error) {
1898 				m_freem(m);
1899 				sbunlock(&so->so_snd);
1900 				splx(s);
1901 				goto done;
1902 			}
1903 			goto retry_space;
1904 		}
1905 		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td);
1906 		splx(s);
1907 		if (error) {
1908 			sbunlock(&so->so_snd);
1909 			goto done;
1910 		}
1911 	}
1912 	sbunlock(&so->so_snd);
1913 
1914 	/*
1915 	 * Send trailers. Wimp out and use writev(2).
1916 	 */
1917 	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
1918 			nuap.fd = uap->s;
1919 			nuap.iovp = hdtr.trailers;
1920 			nuap.iovcnt = hdtr.trl_cnt;
1921 			error = writev(td, &nuap);
1922 			if (error)
1923 				goto done;
1924 			hdtr_size += td->td_retval[0];
1925 	}
1926 
1927 done:
1928 	/*
1929 	 * If there was no error we have to clear td->td_retval[0]
1930 	 * because it may have been set by writev.
1931 	 */
1932 	if (error == 0) {
1933 		td->td_retval[0] = 0;
1934 	}
     	/*
     	 * Report file bytes plus header/trailer bytes, on success and on
     	 * error alike.  The copyout() result is ignored.
     	 */
1935 	if (uap->sbytes != NULL) {
1936 		sbytes += hdtr_size;
1937 		copyout(&sbytes, uap->sbytes, sizeof(off_t));
1938 	}
1939 	if (vp)
1940 		vrele(vp);
1941 	if (so)
1942 		fputsock(so);
1943 	mtx_unlock(&Giant);
1944 	return (error);
1945 }
1946