xref: /freebsd/sys/kern/uipc_syscalls.c (revision 2228a60af1845742127b7aa079e0c9a1cdfd3dfe)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * sendfile(2) and related extensions:
6  * Copyright (c) 1998, David Greenman. All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
37  * $FreeBSD$
38  */
39 
40 #include "opt_compat.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/lock.h>
47 #include <sys/mutex.h>
48 #include <sys/sysproto.h>
49 #include <sys/malloc.h>
50 #include <sys/filedesc.h>
51 #include <sys/event.h>
52 #include <sys/proc.h>
53 #include <sys/fcntl.h>
54 #include <sys/file.h>
55 #include <sys/lock.h>
56 #include <sys/mount.h>
57 #include <sys/mbuf.h>
58 #include <sys/protosw.h>
59 #include <sys/socket.h>
60 #include <sys/socketvar.h>
61 #include <sys/signalvar.h>
62 #include <sys/uio.h>
63 #include <sys/vnode.h>
64 #ifdef KTRACE
65 #include <sys/ktrace.h>
66 #endif
67 
68 #include <vm/vm.h>
69 #include <vm/vm_object.h>
70 #include <vm/vm_page.h>
71 #include <vm/vm_pageout.h>
72 #include <vm/vm_kern.h>
73 #include <vm/vm_extern.h>
74 
75 static void sf_buf_init(void *arg);
76 SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
77 static struct sf_buf *sf_buf_alloc(void);
78 static void sf_buf_free(caddr_t addr, void *args);
79 
80 static int sendit __P((struct thread *td, int s, struct msghdr *mp, int flags));
81 static int recvit __P((struct thread *td, int s, struct msghdr *mp,
82 		       caddr_t namelenp));
83 
84 static int accept1 __P((struct thread *td, struct accept_args *uap, int compat));
85 static int getsockname1 __P((struct thread *td, struct getsockname_args *uap,
86 			     int compat));
87 static int getpeername1 __P((struct thread *td, struct getpeername_args *uap,
88 			     int compat));
89 
90 /*
91  * Free list of sf_bufs: an SLIST_HEAD() expanded by hand so that the
92  * list head and the sf_lock mutex sit together in one structure.
93  */
94 static struct {
95 	SLIST_HEAD(, sf_buf) sf_head;
96 	struct mtx sf_lock;
97 } sf_freelist;
98 
99 static vm_offset_t sf_base;
100 static struct sf_buf *sf_bufs;
101 static u_int sf_buf_alloc_want;
102 
103 /*
104  * System call interface to the socket abstraction.
105  */
106 #if defined(COMPAT_43) || defined(COMPAT_SUNOS)
107 #define COMPAT_OLDSOCK
108 #endif
109 
110 extern	struct fileops socketops;
111 
112 /*
113  * MPSAFE
114  */
115 int
116 socket(td, uap)
117 	struct thread *td;
118 	register struct socket_args /* {
119 		int	domain;
120 		int	type;
121 		int	protocol;
122 	} */ *uap;
123 {
124 	struct filedesc *fdp;
125 	struct socket *so;
126 	struct file *fp;
127 	int fd, error;
128 
129 	mtx_lock(&Giant);
130 	fdp = td->td_proc->p_fd;
131 	error = falloc(td, &fp, &fd);
132 	if (error)
133 		goto done2;
134 	fhold(fp);
135 	error = socreate(uap->domain, &so, uap->type, uap->protocol,
136 	    td->td_proc->p_ucred, td);
137 	if (error) {
138 		if (fdp->fd_ofiles[fd] == fp) {
139 			fdp->fd_ofiles[fd] = NULL;
140 			fdrop(fp, td);
141 		}
142 	} else {
143 		fp->f_data = (caddr_t)so;	/* already has ref count */
144 		fp->f_flag = FREAD|FWRITE;
145 		fp->f_ops = &socketops;
146 		fp->f_type = DTYPE_SOCKET;
147 		td->td_retval[0] = fd;
148 	}
149 	fdrop(fp, td);
150 done2:
151 	mtx_unlock(&Giant);
152 	return (error);
153 }
154 
155 /*
156  * MPSAFE
157  */
158 /* ARGSUSED */
159 int
160 bind(td, uap)
161 	struct thread *td;
162 	register struct bind_args /* {
163 		int	s;
164 		caddr_t	name;
165 		int	namelen;
166 	} */ *uap;
167 {
168 	struct sockaddr *sa;
169 	struct socket *sp;
170 	int error;
171 
172 	mtx_lock(&Giant);
173 	if ((error = fgetsock(td, uap->s, &sp, NULL)) != 0)
174 		goto done2;
175 	if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
176 		goto done1;
177 	error = sobind(sp, sa, td);
178 	FREE(sa, M_SONAME);
179 done1:
180 	fputsock(sp);
181 done2:
182 	mtx_unlock(&Giant);
183 	return (error);
184 }
185 
186 /*
187  * MPSAFE
188  */
189 /* ARGSUSED */
190 int
191 listen(td, uap)
192 	struct thread *td;
193 	register struct listen_args /* {
194 		int	s;
195 		int	backlog;
196 	} */ *uap;
197 {
198 	struct socket *sp;
199 	int error;
200 
201 	mtx_lock(&Giant);
202 	if ((error = fgetsock(td, uap->s, &sp, NULL)) == 0) {
203 		error = solisten(sp, uap->backlog, td);
204 		fputsock(sp);
205 	}
206 	mtx_unlock(&Giant);
207 	return(error);
208 }
209 
210 /*
211  * accept1()
212  * MPSAFE
213  */
214 static int
215 accept1(td, uap, compat)
216 	struct thread *td;
217 	register struct accept_args /* {
218 		int	s;
219 		caddr_t	name;
220 		int	*anamelen;
221 	} */ *uap;
222 	int compat;
223 {
224 	struct filedesc *fdp;
225 	struct file *nfp = NULL;
226 	struct sockaddr *sa;
227 	int namelen, error, s;
228 	struct socket *head, *so;
229 	int fd;
230 	u_int fflag;
231 
232 	mtx_lock(&Giant);
233 	fdp = td->td_proc->p_fd;
234 	if (uap->name) {
235 		error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen,
236 			sizeof (namelen));
237 		if(error)
238 			goto done2;
239 	}
240 	error = fgetsock(td, uap->s, &head, &fflag);
241 	if (error)
242 		goto done2;
243 	s = splnet();
244 	if ((head->so_options & SO_ACCEPTCONN) == 0) {
245 		splx(s);
246 		error = EINVAL;
247 		goto done;
248 	}
249 	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
250 		splx(s);
251 		error = EWOULDBLOCK;
252 		goto done;
253 	}
254 	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
255 		if (head->so_state & SS_CANTRCVMORE) {
256 			head->so_error = ECONNABORTED;
257 			break;
258 		}
259 		error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH,
260 		    "accept", 0);
261 		if (error) {
262 			splx(s);
263 			goto done;
264 		}
265 	}
266 	if (head->so_error) {
267 		error = head->so_error;
268 		head->so_error = 0;
269 		splx(s);
270 		goto done;
271 	}
272 
273 	/*
274 	 * At this point we know that there is at least one connection
275 	 * ready to be accepted. Remove it from the queue prior to
276 	 * allocating the file descriptor for it since falloc() may
277 	 * block allowing another process to accept the connection
278 	 * instead.
279 	 */
280 	so = TAILQ_FIRST(&head->so_comp);
281 	TAILQ_REMOVE(&head->so_comp, so, so_list);
282 	head->so_qlen--;
283 
284 	error = falloc(td, &nfp, &fd);
285 	if (error) {
286 		/*
287 		 * Probably ran out of file descriptors. Put the
288 		 * unaccepted connection back onto the queue and
289 		 * do another wakeup so some other process might
290 		 * have a chance at it.
291 		 */
292 		TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
293 		head->so_qlen++;
294 		wakeup_one(&head->so_timeo);
295 		splx(s);
296 		goto done;
297 	}
298 	fhold(nfp);
299 	td->td_retval[0] = fd;
300 
301 	/* connection has been removed from the listen queue */
302 	KNOTE(&head->so_rcv.sb_sel.si_note, 0);
303 
304 	so->so_state &= ~SS_COMP;
305 	so->so_head = NULL;
306 	if (head->so_sigio != NULL)
307 		fsetown(fgetown(head->so_sigio), &so->so_sigio);
308 
309 	soref(so);			/* file descriptor reference */
310 	nfp->f_data = (caddr_t)so;	/* nfp has ref count from falloc */
311 	nfp->f_flag = fflag;
312 	nfp->f_ops = &socketops;
313 	nfp->f_type = DTYPE_SOCKET;
314 	sa = 0;
315 	error = soaccept(so, &sa);
316 	if (error) {
317 		/*
318 		 * return a namelen of zero for older code which might
319 	 	 * ignore the return value from accept.
320 		 */
321 		if (uap->name != NULL) {
322 			namelen = 0;
323 			(void) copyout((caddr_t)&namelen,
324 			    (caddr_t)uap->anamelen, sizeof(*uap->anamelen));
325 		}
326 		goto noconnection;
327 	}
328 	if (sa == NULL) {
329 		namelen = 0;
330 		if (uap->name)
331 			goto gotnoname;
332 		splx(s);
333 		error = 0;
334 		goto done;
335 	}
336 	if (uap->name) {
337 		/* check sa_len before it is destroyed */
338 		if (namelen > sa->sa_len)
339 			namelen = sa->sa_len;
340 #ifdef COMPAT_OLDSOCK
341 		if (compat)
342 			((struct osockaddr *)sa)->sa_family =
343 			    sa->sa_family;
344 #endif
345 		error = copyout(sa, (caddr_t)uap->name, (u_int)namelen);
346 		if (!error)
347 gotnoname:
348 			error = copyout((caddr_t)&namelen,
349 			    (caddr_t)uap->anamelen, sizeof (*uap->anamelen));
350 	}
351 noconnection:
352 	if (sa)
353 		FREE(sa, M_SONAME);
354 
355 	/*
356 	 * close the new descriptor, assuming someone hasn't ripped it
357 	 * out from under us.
358 	 */
359 	if (error) {
360 		if (fdp->fd_ofiles[fd] == nfp) {
361 			fdp->fd_ofiles[fd] = NULL;
362 			fdrop(nfp, td);
363 		}
364 	}
365 	splx(s);
366 
367 	/*
368 	 * Release explicitly held references before returning.
369 	 */
370 done:
371 	if (nfp != NULL)
372 		fdrop(nfp, td);
373 	fputsock(head);
374 done2:
375 	mtx_unlock(&Giant);
376 	return (error);
377 }
378 
/*
 * accept(2) entry point: accept1() with the compat flag clear.
 *
 * MPSAFE (accept1() is MPSAFE)
 */
int
accept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{
	int error;

	error = accept1(td, uap, 0);
	return (error);
}
390 
#ifdef COMPAT_OLDSOCK
/*
 * Old-style accept entry point: accept1() with the compat flag set, so
 * the returned address uses the osockaddr layout.
 *
 * MPSAFE (accept1() is MPSAFE)
 */
int
oaccept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{
	int error;

	error = accept1(td, uap, 1);
	return (error);
}
#endif /* COMPAT_OLDSOCK */
404 
405 /*
406  * MPSAFE
407  */
408 /* ARGSUSED */
409 int
410 connect(td, uap)
411 	struct thread *td;
412 	register struct connect_args /* {
413 		int	s;
414 		caddr_t	name;
415 		int	namelen;
416 	} */ *uap;
417 {
418 	struct socket *so;
419 	struct sockaddr *sa;
420 	int error, s;
421 
422 	mtx_lock(&Giant);
423 	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
424 		goto done2;
425 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
426 		error = EALREADY;
427 		goto done1;
428 	}
429 	error = getsockaddr(&sa, uap->name, uap->namelen);
430 	if (error)
431 		goto done1;
432 	error = soconnect(so, sa, td);
433 	if (error)
434 		goto bad;
435 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
436 		FREE(sa, M_SONAME);
437 		error = EINPROGRESS;
438 		goto done1;
439 	}
440 	s = splnet();
441 	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
442 		error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH, "connec", 0);
443 		if (error)
444 			break;
445 	}
446 	if (error == 0) {
447 		error = so->so_error;
448 		so->so_error = 0;
449 	}
450 	splx(s);
451 bad:
452 	so->so_state &= ~SS_ISCONNECTING;
453 	FREE(sa, M_SONAME);
454 	if (error == ERESTART)
455 		error = EINTR;
456 done1:
457 	fputsock(so);
458 done2:
459 	mtx_unlock(&Giant);
460 	return (error);
461 }
462 
463 /*
464  * MPSAFE
465  */
466 int
467 socketpair(td, uap)
468 	struct thread *td;
469 	register struct socketpair_args /* {
470 		int	domain;
471 		int	type;
472 		int	protocol;
473 		int	*rsv;
474 	} */ *uap;
475 {
476 	register struct filedesc *fdp = td->td_proc->p_fd;
477 	struct file *fp1, *fp2;
478 	struct socket *so1, *so2;
479 	int fd, error, sv[2];
480 
481 	mtx_lock(&Giant);
482 	error = socreate(uap->domain, &so1, uap->type, uap->protocol,
483 	    td->td_proc->p_ucred, td);
484 	if (error)
485 		goto done2;
486 	error = socreate(uap->domain, &so2, uap->type, uap->protocol,
487 	    td->td_proc->p_ucred, td);
488 	if (error)
489 		goto free1;
490 	error = falloc(td, &fp1, &fd);
491 	if (error)
492 		goto free2;
493 	fhold(fp1);
494 	sv[0] = fd;
495 	fp1->f_data = (caddr_t)so1;	/* so1 already has ref count */
496 	error = falloc(td, &fp2, &fd);
497 	if (error)
498 		goto free3;
499 	fhold(fp2);
500 	fp2->f_data = (caddr_t)so2;	/* so2 already has ref count */
501 	sv[1] = fd;
502 	error = soconnect2(so1, so2);
503 	if (error)
504 		goto free4;
505 	if (uap->type == SOCK_DGRAM) {
506 		/*
507 		 * Datagram socket connection is asymmetric.
508 		 */
509 		 error = soconnect2(so2, so1);
510 		 if (error)
511 			goto free4;
512 	}
513 	fp1->f_flag = fp2->f_flag = FREAD|FWRITE;
514 	fp1->f_ops = fp2->f_ops = &socketops;
515 	fp1->f_type = fp2->f_type = DTYPE_SOCKET;
516 	error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int));
517 	fdrop(fp1, td);
518 	fdrop(fp2, td);
519 	goto done2;
520 free4:
521 	if (fdp->fd_ofiles[sv[1]] == fp2) {
522 		fdp->fd_ofiles[sv[1]] = NULL;
523 		fdrop(fp2, td);
524 	}
525 	fdrop(fp2, td);
526 free3:
527 	if (fdp->fd_ofiles[sv[0]] == fp1) {
528 		fdp->fd_ofiles[sv[0]] = NULL;
529 		fdrop(fp1, td);
530 	}
531 	fdrop(fp1, td);
532 free2:
533 	(void)soclose(so2);
534 free1:
535 	(void)soclose(so1);
536 done2:
537 	mtx_unlock(&Giant);
538 	return (error);
539 }
540 
541 static int
542 sendit(td, s, mp, flags)
543 	register struct thread *td;
544 	int s;
545 	register struct msghdr *mp;
546 	int flags;
547 {
548 	struct uio auio;
549 	register struct iovec *iov;
550 	register int i;
551 	struct mbuf *control;
552 	struct sockaddr *to = NULL;
553 	int len, error;
554 	struct socket *so;
555 #ifdef KTRACE
556 	struct iovec *ktriov = NULL;
557 	struct uio ktruio;
558 #endif
559 
560 	if ((error = fgetsock(td, s, &so, NULL)) != 0)
561 		return (error);
562 	auio.uio_iov = mp->msg_iov;
563 	auio.uio_iovcnt = mp->msg_iovlen;
564 	auio.uio_segflg = UIO_USERSPACE;
565 	auio.uio_rw = UIO_WRITE;
566 	auio.uio_td = td;
567 	auio.uio_offset = 0;			/* XXX */
568 	auio.uio_resid = 0;
569 	iov = mp->msg_iov;
570 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
571 		if ((auio.uio_resid += iov->iov_len) < 0) {
572 			error = EINVAL;
573 			goto bad;
574 		}
575 	}
576 	if (mp->msg_name) {
577 		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
578 		if (error)
579 			goto bad;
580 	}
581 	if (mp->msg_control) {
582 		if (mp->msg_controllen < sizeof(struct cmsghdr)
583 #ifdef COMPAT_OLDSOCK
584 		    && mp->msg_flags != MSG_COMPAT
585 #endif
586 		) {
587 			error = EINVAL;
588 			goto bad;
589 		}
590 		error = sockargs(&control, mp->msg_control,
591 		    mp->msg_controllen, MT_CONTROL);
592 		if (error)
593 			goto bad;
594 #ifdef COMPAT_OLDSOCK
595 		if (mp->msg_flags == MSG_COMPAT) {
596 			register struct cmsghdr *cm;
597 
598 			M_PREPEND(control, sizeof(*cm), M_TRYWAIT);
599 			if (control == 0) {
600 				error = ENOBUFS;
601 				goto bad;
602 			} else {
603 				cm = mtod(control, struct cmsghdr *);
604 				cm->cmsg_len = control->m_len;
605 				cm->cmsg_level = SOL_SOCKET;
606 				cm->cmsg_type = SCM_RIGHTS;
607 			}
608 		}
609 #endif
610 	} else {
611 		control = 0;
612 	}
613 #ifdef KTRACE
614 	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
615 		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
616 
617 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
618 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
619 		ktruio = auio;
620 	}
621 #endif
622 	len = auio.uio_resid;
623 	error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control,
624 						     flags, td);
625 	if (error) {
626 		if (auio.uio_resid != len && (error == ERESTART ||
627 		    error == EINTR || error == EWOULDBLOCK))
628 			error = 0;
629 		if (error == EPIPE) {
630 			PROC_LOCK(td->td_proc);
631 			psignal(td->td_proc, SIGPIPE);
632 			PROC_UNLOCK(td->td_proc);
633 		}
634 	}
635 	if (error == 0)
636 		td->td_retval[0] = len - auio.uio_resid;
637 #ifdef KTRACE
638 	if (ktriov != NULL) {
639 		if (error == 0) {
640 			ktruio.uio_iov = ktriov;
641 			ktruio.uio_resid = td->td_retval[0];
642 			ktrgenio(td->td_proc->p_tracep, s, UIO_WRITE, &ktruio, error);
643 		}
644 		FREE(ktriov, M_TEMP);
645 	}
646 #endif
647 bad:
648 	fputsock(so);
649 	if (to)
650 		FREE(to, M_SONAME);
651 	return (error);
652 }
653 
654 /*
655  * MPSAFE
656  */
657 int
658 sendto(td, uap)
659 	struct thread *td;
660 	register struct sendto_args /* {
661 		int	s;
662 		caddr_t	buf;
663 		size_t	len;
664 		int	flags;
665 		caddr_t	to;
666 		int	tolen;
667 	} */ *uap;
668 {
669 	struct msghdr msg;
670 	struct iovec aiov;
671 	int error;
672 
673 	msg.msg_name = uap->to;
674 	msg.msg_namelen = uap->tolen;
675 	msg.msg_iov = &aiov;
676 	msg.msg_iovlen = 1;
677 	msg.msg_control = 0;
678 #ifdef COMPAT_OLDSOCK
679 	msg.msg_flags = 0;
680 #endif
681 	aiov.iov_base = uap->buf;
682 	aiov.iov_len = uap->len;
683 	mtx_lock(&Giant);
684 	error = sendit(td, uap->s, &msg, uap->flags);
685 	mtx_unlock(&Giant);
686 	return (error);
687 }
688 
689 #ifdef COMPAT_OLDSOCK
690 /*
691  * MPSAFE
692  */
693 int
694 osend(td, uap)
695 	struct thread *td;
696 	register struct osend_args /* {
697 		int	s;
698 		caddr_t	buf;
699 		int	len;
700 		int	flags;
701 	} */ *uap;
702 {
703 	struct msghdr msg;
704 	struct iovec aiov;
705 	int error;
706 
707 	msg.msg_name = 0;
708 	msg.msg_namelen = 0;
709 	msg.msg_iov = &aiov;
710 	msg.msg_iovlen = 1;
711 	aiov.iov_base = uap->buf;
712 	aiov.iov_len = uap->len;
713 	msg.msg_control = 0;
714 	msg.msg_flags = 0;
715 	mtx_lock(&Giant);
716 	error = sendit(td, uap->s, &msg, uap->flags);
717 	mtx_unlock(&Giant);
718 	return (error);
719 }
720 
721 /*
722  * MPSAFE
723  */
724 int
725 osendmsg(td, uap)
726 	struct thread *td;
727 	register struct osendmsg_args /* {
728 		int	s;
729 		caddr_t	msg;
730 		int	flags;
731 	} */ *uap;
732 {
733 	struct msghdr msg;
734 	struct iovec aiov[UIO_SMALLIOV], *iov;
735 	int error;
736 
737 	mtx_lock(&Giant);
738 	error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr));
739 	if (error)
740 		goto done2;
741 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
742 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
743 			error = EMSGSIZE;
744 			goto done2;
745 		}
746 		MALLOC(iov, struct iovec *,
747 		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
748 		      M_WAITOK);
749 	} else {
750 		iov = aiov;
751 	}
752 	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
753 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
754 	if (error)
755 		goto done;
756 	msg.msg_flags = MSG_COMPAT;
757 	msg.msg_iov = iov;
758 	error = sendit(td, uap->s, &msg, uap->flags);
759 done:
760 	if (iov != aiov)
761 		FREE(iov, M_IOV);
762 done2:
763 	mtx_unlock(&Giant);
764 	return (error);
765 }
766 #endif
767 
768 /*
769  * MPSAFE
770  */
771 int
772 sendmsg(td, uap)
773 	struct thread *td;
774 	register struct sendmsg_args /* {
775 		int	s;
776 		caddr_t	msg;
777 		int	flags;
778 	} */ *uap;
779 {
780 	struct msghdr msg;
781 	struct iovec aiov[UIO_SMALLIOV], *iov;
782 	int error;
783 
784 	mtx_lock(&Giant);
785 	error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg));
786 	if (error)
787 		goto done2;
788 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
789 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
790 			error = EMSGSIZE;
791 			goto done2;
792 		}
793 		MALLOC(iov, struct iovec *,
794 		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
795 		       M_WAITOK);
796 	} else {
797 		iov = aiov;
798 	}
799 	if (msg.msg_iovlen &&
800 	    (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
801 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
802 		goto done;
803 	msg.msg_iov = iov;
804 #ifdef COMPAT_OLDSOCK
805 	msg.msg_flags = 0;
806 #endif
807 	error = sendit(td, uap->s, &msg, uap->flags);
808 done:
809 	if (iov != aiov)
810 		FREE(iov, M_IOV);
811 done2:
812 	mtx_unlock(&Giant);
813 	return (error);
814 }
815 
816 static int
817 recvit(td, s, mp, namelenp)
818 	register struct thread *td;
819 	int s;
820 	register struct msghdr *mp;
821 	caddr_t namelenp;
822 {
823 	struct uio auio;
824 	register struct iovec *iov;
825 	register int i;
826 	int len, error;
827 	struct mbuf *m, *control = 0;
828 	caddr_t ctlbuf;
829 	struct socket *so;
830 	struct sockaddr *fromsa = 0;
831 #ifdef KTRACE
832 	struct iovec *ktriov = NULL;
833 	struct uio ktruio;
834 #endif
835 
836 	if ((error = fgetsock(td, s, &so, NULL)) != 0)
837 		return (error);
838 	auio.uio_iov = mp->msg_iov;
839 	auio.uio_iovcnt = mp->msg_iovlen;
840 	auio.uio_segflg = UIO_USERSPACE;
841 	auio.uio_rw = UIO_READ;
842 	auio.uio_td = td;
843 	auio.uio_offset = 0;			/* XXX */
844 	auio.uio_resid = 0;
845 	iov = mp->msg_iov;
846 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
847 		if ((auio.uio_resid += iov->iov_len) < 0) {
848 			fputsock(so);
849 			return (EINVAL);
850 		}
851 	}
852 #ifdef KTRACE
853 	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
854 		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
855 
856 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
857 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
858 		ktruio = auio;
859 	}
860 #endif
861 	len = auio.uio_resid;
862 	error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
863 	    (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
864 	    &mp->msg_flags);
865 	if (error) {
866 		if (auio.uio_resid != len && (error == ERESTART ||
867 		    error == EINTR || error == EWOULDBLOCK))
868 			error = 0;
869 	}
870 #ifdef KTRACE
871 	if (ktriov != NULL) {
872 		if (error == 0) {
873 			ktruio.uio_iov = ktriov;
874 			ktruio.uio_resid = len - auio.uio_resid;
875 			ktrgenio(td->td_proc->p_tracep, s, UIO_READ, &ktruio, error);
876 		}
877 		FREE(ktriov, M_TEMP);
878 	}
879 #endif
880 	if (error)
881 		goto out;
882 	td->td_retval[0] = len - auio.uio_resid;
883 	if (mp->msg_name) {
884 		len = mp->msg_namelen;
885 		if (len <= 0 || fromsa == 0)
886 			len = 0;
887 		else {
888 #ifndef MIN
889 #define MIN(a,b) ((a)>(b)?(b):(a))
890 #endif
891 			/* save sa_len before it is destroyed by MSG_COMPAT */
892 			len = MIN(len, fromsa->sa_len);
893 #ifdef COMPAT_OLDSOCK
894 			if (mp->msg_flags & MSG_COMPAT)
895 				((struct osockaddr *)fromsa)->sa_family =
896 				    fromsa->sa_family;
897 #endif
898 			error = copyout(fromsa,
899 			    (caddr_t)mp->msg_name, (unsigned)len);
900 			if (error)
901 				goto out;
902 		}
903 		mp->msg_namelen = len;
904 		if (namelenp &&
905 		    (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) {
906 #ifdef COMPAT_OLDSOCK
907 			if (mp->msg_flags & MSG_COMPAT)
908 				error = 0;	/* old recvfrom didn't check */
909 			else
910 #endif
911 			goto out;
912 		}
913 	}
914 	if (mp->msg_control) {
915 #ifdef COMPAT_OLDSOCK
916 		/*
917 		 * We assume that old recvmsg calls won't receive access
918 		 * rights and other control info, esp. as control info
919 		 * is always optional and those options didn't exist in 4.3.
920 		 * If we receive rights, trim the cmsghdr; anything else
921 		 * is tossed.
922 		 */
923 		if (control && mp->msg_flags & MSG_COMPAT) {
924 			if (mtod(control, struct cmsghdr *)->cmsg_level !=
925 			    SOL_SOCKET ||
926 			    mtod(control, struct cmsghdr *)->cmsg_type !=
927 			    SCM_RIGHTS) {
928 				mp->msg_controllen = 0;
929 				goto out;
930 			}
931 			control->m_len -= sizeof (struct cmsghdr);
932 			control->m_data += sizeof (struct cmsghdr);
933 		}
934 #endif
935 		len = mp->msg_controllen;
936 		m = control;
937 		mp->msg_controllen = 0;
938 		ctlbuf = (caddr_t) mp->msg_control;
939 
940 		while (m && len > 0) {
941 			unsigned int tocopy;
942 
943 			if (len >= m->m_len)
944 				tocopy = m->m_len;
945 			else {
946 				mp->msg_flags |= MSG_CTRUNC;
947 				tocopy = len;
948 			}
949 
950 			if ((error = copyout((caddr_t)mtod(m, caddr_t),
951 					ctlbuf, tocopy)) != 0)
952 				goto out;
953 
954 			ctlbuf += tocopy;
955 			len -= tocopy;
956 			m = m->m_next;
957 		}
958 		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
959 	}
960 out:
961 	fputsock(so);
962 	if (fromsa)
963 		FREE(fromsa, M_SONAME);
964 	if (control)
965 		m_freem(control);
966 	return (error);
967 }
968 
969 /*
970  * MPSAFE
971  */
972 int
973 recvfrom(td, uap)
974 	struct thread *td;
975 	register struct recvfrom_args /* {
976 		int	s;
977 		caddr_t	buf;
978 		size_t	len;
979 		int	flags;
980 		caddr_t	from;
981 		int	*fromlenaddr;
982 	} */ *uap;
983 {
984 	struct msghdr msg;
985 	struct iovec aiov;
986 	int error;
987 
988 	mtx_lock(&Giant);
989 	if (uap->fromlenaddr) {
990 		error = copyin((caddr_t)uap->fromlenaddr,
991 		    (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen));
992 		if (error)
993 			goto done2;
994 	} else {
995 		msg.msg_namelen = 0;
996 	}
997 	msg.msg_name = uap->from;
998 	msg.msg_iov = &aiov;
999 	msg.msg_iovlen = 1;
1000 	aiov.iov_base = uap->buf;
1001 	aiov.iov_len = uap->len;
1002 	msg.msg_control = 0;
1003 	msg.msg_flags = uap->flags;
1004 	error = recvit(td, uap->s, &msg, (caddr_t)uap->fromlenaddr);
1005 done2:
1006 	mtx_unlock(&Giant);
1007 	return(error);
1008 }
1009 
1010 #ifdef COMPAT_OLDSOCK
1011 /*
1012  * MPSAFE
1013  */
1014 int
1015 orecvfrom(td, uap)
1016 	struct thread *td;
1017 	struct recvfrom_args *uap;
1018 {
1019 
1020 	uap->flags |= MSG_COMPAT;
1021 	return (recvfrom(td, uap));
1022 }
1023 #endif
1024 
1025 
1026 #ifdef COMPAT_OLDSOCK
1027 /*
1028  * MPSAFE
1029  */
1030 int
1031 orecv(td, uap)
1032 	struct thread *td;
1033 	register struct orecv_args /* {
1034 		int	s;
1035 		caddr_t	buf;
1036 		int	len;
1037 		int	flags;
1038 	} */ *uap;
1039 {
1040 	struct msghdr msg;
1041 	struct iovec aiov;
1042 	int error;
1043 
1044 	mtx_lock(&Giant);
1045 	msg.msg_name = 0;
1046 	msg.msg_namelen = 0;
1047 	msg.msg_iov = &aiov;
1048 	msg.msg_iovlen = 1;
1049 	aiov.iov_base = uap->buf;
1050 	aiov.iov_len = uap->len;
1051 	msg.msg_control = 0;
1052 	msg.msg_flags = uap->flags;
1053 	error = recvit(td, uap->s, &msg, (caddr_t)0);
1054 	mtx_unlock(&Giant);
1055 	return (error);
1056 }
1057 
1058 /*
1059  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
1060  * overlays the new one, missing only the flags, and with the (old) access
1061  * rights where the control fields are now.
1062  *
1063  * MPSAFE
1064  */
1065 int
1066 orecvmsg(td, uap)
1067 	struct thread *td;
1068 	register struct orecvmsg_args /* {
1069 		int	s;
1070 		struct	omsghdr *msg;
1071 		int	flags;
1072 	} */ *uap;
1073 {
1074 	struct msghdr msg;
1075 	struct iovec aiov[UIO_SMALLIOV], *iov;
1076 	int error;
1077 
1078 	error = copyin((caddr_t)uap->msg, (caddr_t)&msg,
1079 	    sizeof (struct omsghdr));
1080 	if (error)
1081 		return (error);
1082 
1083 	mtx_lock(&Giant);
1084 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
1085 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
1086 			error = EMSGSIZE;
1087 			goto done2;
1088 		}
1089 		MALLOC(iov, struct iovec *,
1090 		      sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
1091 		      M_WAITOK);
1092 	} else {
1093 		iov = aiov;
1094 	}
1095 	msg.msg_flags = uap->flags | MSG_COMPAT;
1096 	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
1097 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
1098 	if (error)
1099 		goto done;
1100 	msg.msg_iov = iov;
1101 	error = recvit(td, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen);
1102 
1103 	if (msg.msg_controllen && error == 0)
1104 		error = copyout((caddr_t)&msg.msg_controllen,
1105 		    (caddr_t)&uap->msg->msg_accrightslen, sizeof (int));
1106 done:
1107 	if (iov != aiov)
1108 		FREE(iov, M_IOV);
1109 done2:
1110 	mtx_unlock(&Giant);
1111 	return (error);
1112 }
1113 #endif
1114 
1115 /*
1116  * MPSAFE
1117  */
1118 int
1119 recvmsg(td, uap)
1120 	struct thread *td;
1121 	register struct recvmsg_args /* {
1122 		int	s;
1123 		struct	msghdr *msg;
1124 		int	flags;
1125 	} */ *uap;
1126 {
1127 	struct msghdr msg;
1128 	struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
1129 	register int error;
1130 
1131 	mtx_lock(&Giant);
1132 	error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg));
1133 	if (error)
1134 		goto done2;
1135 	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
1136 		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
1137 			error = EMSGSIZE;
1138 			goto done2;
1139 		}
1140 		MALLOC(iov, struct iovec *,
1141 		       sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
1142 		       M_WAITOK);
1143 	} else {
1144 		iov = aiov;
1145 	}
1146 #ifdef COMPAT_OLDSOCK
1147 	msg.msg_flags = uap->flags &~ MSG_COMPAT;
1148 #else
1149 	msg.msg_flags = uap->flags;
1150 #endif
1151 	uiov = msg.msg_iov;
1152 	msg.msg_iov = iov;
1153 	error = copyin((caddr_t)uiov, (caddr_t)iov,
1154 	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
1155 	if (error)
1156 		goto done;
1157 	error = recvit(td, uap->s, &msg, (caddr_t)0);
1158 	if (!error) {
1159 		msg.msg_iov = uiov;
1160 		error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg));
1161 	}
1162 done:
1163 	if (iov != aiov)
1164 		FREE(iov, M_IOV);
1165 done2:
1166 	mtx_unlock(&Giant);
1167 	return (error);
1168 }
1169 
1170 /*
1171  * MPSAFE
1172  */
1173 /* ARGSUSED */
1174 int
1175 shutdown(td, uap)
1176 	struct thread *td;
1177 	register struct shutdown_args /* {
1178 		int	s;
1179 		int	how;
1180 	} */ *uap;
1181 {
1182 	struct socket *so;
1183 	int error;
1184 
1185 	mtx_lock(&Giant);
1186 	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
1187 		error = soshutdown(so, uap->how);
1188 		fputsock(so);
1189 	}
1190 	mtx_unlock(&Giant);
1191 	return(error);
1192 }
1193 
1194 /*
1195  * MPSAFE
1196  */
1197 /* ARGSUSED */
1198 int
1199 setsockopt(td, uap)
1200 	struct thread *td;
1201 	register struct setsockopt_args /* {
1202 		int	s;
1203 		int	level;
1204 		int	name;
1205 		caddr_t	val;
1206 		int	valsize;
1207 	} */ *uap;
1208 {
1209 	struct socket *so;
1210 	struct sockopt sopt;
1211 	int error;
1212 
1213 	if (uap->val == 0 && uap->valsize != 0)
1214 		return (EFAULT);
1215 	if (uap->valsize < 0)
1216 		return (EINVAL);
1217 
1218 	mtx_lock(&Giant);
1219 	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
1220 		sopt.sopt_dir = SOPT_SET;
1221 		sopt.sopt_level = uap->level;
1222 		sopt.sopt_name = uap->name;
1223 		sopt.sopt_val = uap->val;
1224 		sopt.sopt_valsize = uap->valsize;
1225 		sopt.sopt_td = td;
1226 		error = sosetopt(so, &sopt);
1227 		fputsock(so);
1228 	}
1229 	mtx_unlock(&Giant);
1230 	return(error);
1231 }
1232 
1233 /*
1234  * MPSAFE
1235  */
1236 /* ARGSUSED */
1237 int
1238 getsockopt(td, uap)
1239 	struct thread *td;
1240 	register struct getsockopt_args /* {
1241 		int	s;
1242 		int	level;
1243 		int	name;
1244 		caddr_t	val;
1245 		int	*avalsize;
1246 	} */ *uap;
1247 {
1248 	int	valsize, error;
1249 	struct  socket *so;
1250 	struct	sockopt sopt;
1251 
1252 	mtx_lock(&Giant);
1253 	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
1254 		goto done2;
1255 	if (uap->val) {
1256 		error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize,
1257 		    sizeof (valsize));
1258 		if (error)
1259 			goto done1;
1260 		if (valsize < 0) {
1261 			error = EINVAL;
1262 			goto done1;
1263 		}
1264 	} else {
1265 		valsize = 0;
1266 	}
1267 
1268 	sopt.sopt_dir = SOPT_GET;
1269 	sopt.sopt_level = uap->level;
1270 	sopt.sopt_name = uap->name;
1271 	sopt.sopt_val = uap->val;
1272 	sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
1273 	sopt.sopt_td = td;
1274 
1275 	error = sogetopt(so, &sopt);
1276 	if (error == 0) {
1277 		valsize = sopt.sopt_valsize;
1278 		error = copyout((caddr_t)&valsize,
1279 				(caddr_t)uap->avalsize, sizeof (valsize));
1280 	}
1281 done1:
1282 	fputsock(so);
1283 done2:
1284 	mtx_unlock(&Giant);
1285 	return (error);
1286 }
1287 
1288 /*
1289  * getsockname1() - Get socket name.
1290  *
1291  * MPSAFE
1292  */
1293 /* ARGSUSED */
1294 static int
1295 getsockname1(td, uap, compat)
1296 	struct thread *td;
1297 	register struct getsockname_args /* {
1298 		int	fdes;
1299 		caddr_t	asa;
1300 		int	*alen;
1301 	} */ *uap;
1302 	int compat;
1303 {
1304 	struct socket *so;
1305 	struct sockaddr *sa;
1306 	int len, error;
1307 
1308 	mtx_lock(&Giant);
1309 	if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0)
1310 		goto done2;
1311 	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1312 	if (error)
1313 		goto done1;
1314 	sa = 0;
1315 	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
1316 	if (error)
1317 		goto bad;
1318 	if (sa == 0) {
1319 		len = 0;
1320 		goto gotnothing;
1321 	}
1322 
1323 	len = MIN(len, sa->sa_len);
1324 #ifdef COMPAT_OLDSOCK
1325 	if (compat)
1326 		((struct osockaddr *)sa)->sa_family = sa->sa_family;
1327 #endif
1328 	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1329 	if (error == 0)
1330 gotnothing:
1331 		error = copyout((caddr_t)&len, (caddr_t)uap->alen,
1332 		    sizeof (len));
1333 bad:
1334 	if (sa)
1335 		FREE(sa, M_SONAME);
1336 done1:
1337 	fputsock(so);
1338 done2:
1339 	mtx_unlock(&Giant);
1340 	return (error);
1341 }
1342 
/*
 * getsockname system call: getsockname1() with the compat flag clear.
 *
 * MPSAFE
 */
int
getsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(td, uap, 0);
	return (error);
}
1354 
#ifdef COMPAT_OLDSOCK
/*
 * Old-socket-ABI variant of getsockname: getsockname1() with the compat
 * flag set.
 *
 * MPSAFE
 */
int
ogetsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(td, uap, 1);
	return (error);
}
#endif /* COMPAT_OLDSOCK */
1368 
1369 /*
1370  * getpeername1() - Get name of peer for connected socket.
1371  *
1372  * MPSAFE
1373  */
1374 /* ARGSUSED */
1375 static int
1376 getpeername1(td, uap, compat)
1377 	struct thread *td;
1378 	register struct getpeername_args /* {
1379 		int	fdes;
1380 		caddr_t	asa;
1381 		int	*alen;
1382 	} */ *uap;
1383 	int compat;
1384 {
1385 	struct socket *so;
1386 	struct sockaddr *sa;
1387 	int len, error;
1388 
1389 	mtx_lock(&Giant);
1390 	if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0)
1391 		goto done2;
1392 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1393 		error = ENOTCONN;
1394 		goto done1;
1395 	}
1396 	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1397 	if (error)
1398 		goto done1;
1399 	sa = 0;
1400 	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
1401 	if (error)
1402 		goto bad;
1403 	if (sa == 0) {
1404 		len = 0;
1405 		goto gotnothing;
1406 	}
1407 	len = MIN(len, sa->sa_len);
1408 #ifdef COMPAT_OLDSOCK
1409 	if (compat)
1410 		((struct osockaddr *)sa)->sa_family =
1411 		    sa->sa_family;
1412 #endif
1413 	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1414 	if (error)
1415 		goto bad;
1416 gotnothing:
1417 	error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len));
1418 bad:
1419 	if (sa)
1420 		FREE(sa, M_SONAME);
1421 done1:
1422 	fputsock(so);
1423 done2:
1424 	mtx_unlock(&Giant);
1425 	return (error);
1426 }
1427 
/*
 * getpeername system call: getpeername1() with the compat flag clear.
 *
 * MPSAFE
 */
int
getpeername(td, uap)
	struct thread *td;
	struct getpeername_args *uap;
{
	int error;

	error = getpeername1(td, uap, 0);
	return (error);
}
1439 
#ifdef COMPAT_OLDSOCK
/*
 * Old-socket-ABI variant of getpeername: getpeername1() with the compat
 * flag set.
 *
 * MPSAFE
 */
int
ogetpeername(td, uap)
	struct thread *td;
	struct ogetpeername_args *uap;
{
	struct getpeername_args *args;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	args = (struct getpeername_args *)uap;
	return (getpeername1(td, args, 1));
}
#endif /* COMPAT_OLDSOCK */
1454 
1455 int
1456 sockargs(mp, buf, buflen, type)
1457 	struct mbuf **mp;
1458 	caddr_t buf;
1459 	int buflen, type;
1460 {
1461 	register struct sockaddr *sa;
1462 	register struct mbuf *m;
1463 	int error;
1464 
1465 	if ((u_int)buflen > MLEN) {
1466 #ifdef COMPAT_OLDSOCK
1467 		if (type == MT_SONAME && (u_int)buflen <= 112)
1468 			buflen = MLEN;		/* unix domain compat. hack */
1469 		else
1470 #endif
1471 		return (EINVAL);
1472 	}
1473 	m = m_get(M_TRYWAIT, type);
1474 	if (m == NULL)
1475 		return (ENOBUFS);
1476 	m->m_len = buflen;
1477 	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1478 	if (error)
1479 		(void) m_free(m);
1480 	else {
1481 		*mp = m;
1482 		if (type == MT_SONAME) {
1483 			sa = mtod(m, struct sockaddr *);
1484 
1485 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1486 			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1487 				sa->sa_family = sa->sa_len;
1488 #endif
1489 			sa->sa_len = buflen;
1490 		}
1491 	}
1492 	return (error);
1493 }
1494 
1495 int
1496 getsockaddr(namp, uaddr, len)
1497 	struct sockaddr **namp;
1498 	caddr_t uaddr;
1499 	size_t len;
1500 {
1501 	struct sockaddr *sa;
1502 	int error;
1503 
1504 	if (len > SOCK_MAXADDRLEN)
1505 		return ENAMETOOLONG;
1506 	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1507 	error = copyin(uaddr, sa, len);
1508 	if (error) {
1509 		FREE(sa, M_SONAME);
1510 	} else {
1511 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1512 		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1513 			sa->sa_family = sa->sa_len;
1514 #endif
1515 		sa->sa_len = len;
1516 		*namp = sa;
1517 	}
1518 	return error;
1519 }
1520 
1521 /*
1522  * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
1523  * XXX - The sf_buf functions are currently private to sendfile(2), so have
1524  * been made static, but may be useful in the future for doing zero-copy in
1525  * other parts of the networking code.
1526  */
1527 static void
1528 sf_buf_init(void *arg)
1529 {
1530 	int i;
1531 
1532 	mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", MTX_DEF);
1533 	mtx_lock(&sf_freelist.sf_lock);
1534 	SLIST_INIT(&sf_freelist.sf_head);
1535 	sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
1536 	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP,
1537 	    M_NOWAIT | M_ZERO);
1538 	for (i = 0; i < nsfbufs; i++) {
1539 		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
1540 		SLIST_INSERT_HEAD(&sf_freelist.sf_head, &sf_bufs[i], free_list);
1541 	}
1542 	sf_buf_alloc_want = 0;
1543 	mtx_unlock(&sf_freelist.sf_lock);
1544 }
1545 
1546 /*
1547  * Get an sf_buf from the freelist. Will block if none are available.
1548  */
1549 static struct sf_buf *
1550 sf_buf_alloc()
1551 {
1552 	struct sf_buf *sf;
1553 	int error;
1554 
1555 	mtx_lock(&sf_freelist.sf_lock);
1556 	while ((sf = SLIST_FIRST(&sf_freelist.sf_head)) == NULL) {
1557 		sf_buf_alloc_want++;
1558 		error = msleep(&sf_freelist, &sf_freelist.sf_lock, PVM|PCATCH,
1559 		    "sfbufa", 0);
1560 		sf_buf_alloc_want--;
1561 
1562 		/*
1563 		 * If we got a signal, don't risk going back to sleep.
1564 		 */
1565 		if (error)
1566 			break;
1567 	}
1568 	if (sf != NULL)
1569 		SLIST_REMOVE_HEAD(&sf_freelist.sf_head, free_list);
1570 	mtx_unlock(&sf_freelist.sf_lock);
1571 	return (sf);
1572 }
1573 
1574 #define dtosf(x)	(&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])
1575 
1576 /*
1577  * Detatch mapped page and release resources back to the system.
1578  */
1579 static void
1580 sf_buf_free(caddr_t addr, void *args)
1581 {
1582 	struct sf_buf *sf;
1583 	struct vm_page *m;
1584 
1585 	GIANT_REQUIRED;
1586 
1587 	sf = dtosf(addr);
1588 	pmap_qremove((vm_offset_t)addr, 1);
1589 	m = sf->m;
1590 	vm_page_unwire(m, 0);
1591 	/*
1592 	 * Check for the object going away on us. This can
1593 	 * happen since we don't hold a reference to it.
1594 	 * If so, we're responsible for freeing the page.
1595 	 */
1596 	if (m->wire_count == 0 && m->object == NULL)
1597 		vm_page_free(m);
1598 	sf->m = NULL;
1599 	mtx_lock(&sf_freelist.sf_lock);
1600 	SLIST_INSERT_HEAD(&sf_freelist.sf_head, sf, free_list);
1601 	if (sf_buf_alloc_want > 0)
1602 		wakeup_one(&sf_freelist);
1603 	mtx_unlock(&sf_freelist.sf_lock);
1604 }
1605 
1606 /*
1607  * sendfile(2)
1608  *
1609  * MPSAFE
1610  *
1611  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
1612  *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
1613  *
1614  * Send a file specified by 'fd' and starting at 'offset' to a socket
1615  * specified by 's'. Send only 'nbytes' of the file or until EOF if
1616  * nbytes == 0. Optionally add a header and/or trailer to the socket
1617  * output. If specified, write the total number of bytes sent into *sbytes.
1618  *
1619  */
1620 int
1621 sendfile(struct thread *td, struct sendfile_args *uap)
1622 {
1623 	struct vnode *vp;
1624 	struct vm_object *obj;
1625 	struct socket *so = NULL;
1626 	struct mbuf *m;
1627 	struct sf_buf *sf;
1628 	struct vm_page *pg;
1629 	struct writev_args nuap;
1630 	struct sf_hdtr hdtr;
1631 	off_t off, xfsize, sbytes = 0;
1632 	int error, s;
1633 
1634 	mtx_lock(&Giant);
1635 
1636 	/*
1637 	 * The descriptor must be a regular file and have a backing VM object.
1638 	 */
1639 	if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
1640 		goto done;
1641 	if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) {
1642 		error = EINVAL;
1643 		goto done;
1644 	}
1645 	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
1646 		goto done;
1647 	if (so->so_type != SOCK_STREAM) {
1648 		error = EINVAL;
1649 		goto done;
1650 	}
1651 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1652 		error = ENOTCONN;
1653 		goto done;
1654 	}
1655 	if (uap->offset < 0) {
1656 		error = EINVAL;
1657 		goto done;
1658 	}
1659 
1660 	/*
1661 	 * If specified, get the pointer to the sf_hdtr struct for
1662 	 * any headers/trailers.
1663 	 */
1664 	if (uap->hdtr != NULL) {
1665 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1666 		if (error)
1667 			goto done;
1668 		/*
1669 		 * Send any headers. Wimp out and use writev(2).
1670 		 */
1671 		if (hdtr.headers != NULL) {
1672 			nuap.fd = uap->s;
1673 			nuap.iovp = hdtr.headers;
1674 			nuap.iovcnt = hdtr.hdr_cnt;
1675 			error = writev(td, &nuap);
1676 			if (error)
1677 				goto done;
1678 			sbytes += td->td_retval[0];
1679 		}
1680 	}
1681 
1682 	/*
1683 	 * Protect against multiple writers to the socket.
1684 	 */
1685 	(void) sblock(&so->so_snd, M_WAITOK);
1686 
1687 	/*
1688 	 * Loop through the pages in the file, starting with the requested
1689 	 * offset. Get a file page (do I/O if necessary), map the file page
1690 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1691 	 * it on the socket.
1692 	 */
1693 	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
1694 		vm_pindex_t pindex;
1695 		vm_offset_t pgoff;
1696 
1697 		pindex = OFF_TO_IDX(off);
1698 retry_lookup:
1699 		/*
1700 		 * Calculate the amount to transfer. Not to exceed a page,
1701 		 * the EOF, or the passed in nbytes.
1702 		 */
1703 		xfsize = obj->un_pager.vnp.vnp_size - off;
1704 		if (xfsize > PAGE_SIZE)
1705 			xfsize = PAGE_SIZE;
1706 		pgoff = (vm_offset_t)(off & PAGE_MASK);
1707 		if (PAGE_SIZE - pgoff < xfsize)
1708 			xfsize = PAGE_SIZE - pgoff;
1709 		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
1710 			xfsize = uap->nbytes - sbytes;
1711 		if (xfsize <= 0)
1712 			break;
1713 		/*
1714 		 * Optimize the non-blocking case by looking at the socket space
1715 		 * before going to the extra work of constituting the sf_buf.
1716 		 */
1717 		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
1718 			if (so->so_state & SS_CANTSENDMORE)
1719 				error = EPIPE;
1720 			else
1721 				error = EAGAIN;
1722 			sbunlock(&so->so_snd);
1723 			goto done;
1724 		}
1725 		/*
1726 		 * Attempt to look up the page.
1727 		 *
1728 		 *	Allocate if not found
1729 		 *
1730 		 *	Wait and loop if busy.
1731 		 */
1732 		pg = vm_page_lookup(obj, pindex);
1733 
1734 		if (pg == NULL) {
1735 			pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
1736 			if (pg == NULL) {
1737 				VM_WAIT;
1738 				goto retry_lookup;
1739 			}
1740 			vm_page_wakeup(pg);
1741 		} else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) {
1742 			goto retry_lookup;
1743 		}
1744 
1745 		/*
1746 		 * Wire the page so it does not get ripped out from under
1747 		 * us.
1748 		 */
1749 
1750 		vm_page_wire(pg);
1751 
1752 		/*
1753 		 * If page is not valid for what we need, initiate I/O
1754 		 */
1755 
1756 		if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
1757 			struct uio auio;
1758 			struct iovec aiov;
1759 			int bsize;
1760 
1761 			/*
1762 			 * Ensure that our page is still around when the I/O
1763 			 * completes.
1764 			 */
1765 			vm_page_io_start(pg);
1766 
1767 			/*
1768 			 * Get the page from backing store.
1769 			 */
1770 			bsize = vp->v_mount->mnt_stat.f_iosize;
1771 			auio.uio_iov = &aiov;
1772 			auio.uio_iovcnt = 1;
1773 			aiov.iov_base = 0;
1774 			aiov.iov_len = MAXBSIZE;
1775 			auio.uio_resid = MAXBSIZE;
1776 			auio.uio_offset = trunc_page(off);
1777 			auio.uio_segflg = UIO_NOCOPY;
1778 			auio.uio_rw = UIO_READ;
1779 			auio.uio_td = td;
1780 			vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td);
1781 			error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16),
1782 			        td->td_proc->p_ucred);
1783 			VOP_UNLOCK(vp, 0, td);
1784 			vm_page_flag_clear(pg, PG_ZERO);
1785 			vm_page_io_finish(pg);
1786 			if (error) {
1787 				vm_page_unwire(pg, 0);
1788 				/*
1789 				 * See if anyone else might know about this page.
1790 				 * If not and it is not valid, then free it.
1791 				 */
1792 				if (pg->wire_count == 0 && pg->valid == 0 &&
1793 				    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
1794 				    pg->hold_count == 0) {
1795 					vm_page_busy(pg);
1796 					vm_page_free(pg);
1797 				}
1798 				sbunlock(&so->so_snd);
1799 				goto done;
1800 			}
1801 		}
1802 
1803 
1804 		/*
1805 		 * Get a sendfile buf. We usually wait as long as necessary,
1806 		 * but this wait can be interrupted.
1807 		 */
1808 		if ((sf = sf_buf_alloc()) == NULL) {
1809 			vm_page_unwire(pg, 0);
1810 			if (pg->wire_count == 0 && pg->object == NULL)
1811 				vm_page_free(pg);
1812 			sbunlock(&so->so_snd);
1813 			error = EINTR;
1814 			goto done;
1815 		}
1816 
1817 		/*
1818 		 * Allocate a kernel virtual page and insert the physical page
1819 		 * into it.
1820 		 */
1821 		sf->m = pg;
1822 		pmap_qenter(sf->kva, &pg, 1);
1823 		/*
1824 		 * Get an mbuf header and set it up as having external storage.
1825 		 */
1826 		MGETHDR(m, M_TRYWAIT, MT_DATA);
1827 		if (m == NULL) {
1828 			error = ENOBUFS;
1829 			sf_buf_free((void *)sf->kva, NULL);
1830 			sbunlock(&so->so_snd);
1831 			goto done;
1832 		}
1833 		/*
1834 		 * Setup external storage for mbuf.
1835 		 */
1836 		MEXTADD(m, sf->kva, PAGE_SIZE, sf_buf_free, NULL, M_RDONLY,
1837 		    EXT_SFBUF);
1838 		m->m_data = (char *) sf->kva + pgoff;
1839 		m->m_pkthdr.len = m->m_len = xfsize;
1840 		/*
1841 		 * Add the buffer to the socket buffer chain.
1842 		 */
1843 		s = splnet();
1844 retry_space:
1845 		/*
1846 		 * Make sure that the socket is still able to take more data.
1847 		 * CANTSENDMORE being true usually means that the connection
1848 		 * was closed. so_error is true when an error was sensed after
1849 		 * a previous send.
1850 		 * The state is checked after the page mapping and buffer
1851 		 * allocation above since those operations may block and make
1852 		 * any socket checks stale. From this point forward, nothing
1853 		 * blocks before the pru_send (or more accurately, any blocking
1854 		 * results in a loop back to here to re-check).
1855 		 */
1856 		if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
1857 			if (so->so_state & SS_CANTSENDMORE) {
1858 				error = EPIPE;
1859 			} else {
1860 				error = so->so_error;
1861 				so->so_error = 0;
1862 			}
1863 			m_freem(m);
1864 			sbunlock(&so->so_snd);
1865 			splx(s);
1866 			goto done;
1867 		}
1868 		/*
1869 		 * Wait for socket space to become available. We do this just
1870 		 * after checking the connection state above in order to avoid
1871 		 * a race condition with sbwait().
1872 		 */
1873 		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
1874 			if (so->so_state & SS_NBIO) {
1875 				m_freem(m);
1876 				sbunlock(&so->so_snd);
1877 				splx(s);
1878 				error = EAGAIN;
1879 				goto done;
1880 			}
1881 			error = sbwait(&so->so_snd);
1882 			/*
1883 			 * An error from sbwait usually indicates that we've
1884 			 * been interrupted by a signal. If we've sent anything
1885 			 * then return bytes sent, otherwise return the error.
1886 			 */
1887 			if (error) {
1888 				m_freem(m);
1889 				sbunlock(&so->so_snd);
1890 				splx(s);
1891 				goto done;
1892 			}
1893 			goto retry_space;
1894 		}
1895 		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td);
1896 		splx(s);
1897 		if (error) {
1898 			sbunlock(&so->so_snd);
1899 			goto done;
1900 		}
1901 	}
1902 	sbunlock(&so->so_snd);
1903 
1904 	/*
1905 	 * Send trailers. Wimp out and use writev(2).
1906 	 */
1907 	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
1908 			nuap.fd = uap->s;
1909 			nuap.iovp = hdtr.trailers;
1910 			nuap.iovcnt = hdtr.trl_cnt;
1911 			error = writev(td, &nuap);
1912 			if (error)
1913 				goto done;
1914 			sbytes += td->td_retval[0];
1915 	}
1916 
1917 done:
1918 	/*
1919 	 * If there was no error we have to clear td->td_retval[0]
1920 	 * because it may have been set by writev.
1921 	 */
1922 	if (error == 0) {
1923 		td->td_retval[0] = 0;
1924 	}
1925 	if (uap->sbytes != NULL) {
1926 		copyout(&sbytes, uap->sbytes, sizeof(off_t));
1927 	}
1928 	if (vp)
1929 		vrele(vp);
1930 	if (so)
1931 		fputsock(so);
1932 	mtx_unlock(&Giant);
1933 	return (error);
1934 }
1935 
1936