1 /* 2 * Copyright (c) 1982, 1986, 1989, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * sendfile(2) and related extensions: 6 * Copyright (c) 1998, David Greenman. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 37 * $FreeBSD$ 38 */ 39 40 #include "opt_compat.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/kernel.h> 46 #include <sys/lock.h> 47 #include <sys/mutex.h> 48 #include <sys/sysproto.h> 49 #include <sys/malloc.h> 50 #include <sys/filedesc.h> 51 #include <sys/event.h> 52 #include <sys/proc.h> 53 #include <sys/fcntl.h> 54 #include <sys/file.h> 55 #include <sys/lock.h> 56 #include <sys/mount.h> 57 #include <sys/mbuf.h> 58 #include <sys/protosw.h> 59 #include <sys/socket.h> 60 #include <sys/socketvar.h> 61 #include <sys/signalvar.h> 62 #include <sys/uio.h> 63 #include <sys/vnode.h> 64 #ifdef KTRACE 65 #include <sys/ktrace.h> 66 #endif 67 68 #include <vm/vm.h> 69 #include <vm/vm_object.h> 70 #include <vm/vm_page.h> 71 #include <vm/vm_pageout.h> 72 #include <vm/vm_kern.h> 73 #include <vm/vm_extern.h> 74 75 static void sf_buf_init(void *arg); 76 SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL) 77 struct sf_buf *sf_buf_alloc(void); 78 void sf_buf_free(void *addr, void *args); 79 80 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); 81 static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); 82 83 static int accept1(struct thread *td, struct accept_args *uap, int compat); 84 static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat); 85 static int getsockname1(struct thread *td, struct getsockname_args *uap, 86 int compat); 87 static int getpeername1(struct thread *td, struct getpeername_args *uap, 88 int compat); 89 90 /* 91 * Expanded sf_freelist head. Really an SLIST_HEAD() in disguise, with the 92 * sf_freelist head with the sf_lock mutex. 93 */ 94 static struct { 95 SLIST_HEAD(, sf_buf) sf_head; 96 struct mtx sf_lock; 97 } sf_freelist; 98 99 vm_offset_t sf_base; 100 struct sf_buf *sf_bufs; 101 u_int sf_buf_alloc_want; 102 103 /* 104 * System call interface to the socket abstraction. 105 */ 106 #if defined(COMPAT_43) || defined(COMPAT_SUNOS) 107 #define COMPAT_OLDSOCK 108 #endif 109 110 extern struct fileops socketops; 111 112 /* 113 * MPSAFE 114 */ 115 int 116 socket(td, uap) 117 struct thread *td; 118 register struct socket_args /* { 119 int domain; 120 int type; 121 int protocol; 122 } */ *uap; 123 { 124 struct filedesc *fdp; 125 struct socket *so; 126 struct file *fp; 127 int fd, error; 128 129 mtx_lock(&Giant); 130 fdp = td->td_proc->p_fd; 131 error = falloc(td, &fp, &fd); 132 if (error) 133 goto done2; 134 fhold(fp); 135 error = socreate(uap->domain, &so, uap->type, uap->protocol, 136 td->td_ucred, td); 137 FILEDESC_LOCK(fdp); 138 if (error) { 139 if (fdp->fd_ofiles[fd] == fp) { 140 fdp->fd_ofiles[fd] = NULL; 141 FILEDESC_UNLOCK(fdp); 142 fdrop(fp, td); 143 } else 144 FILEDESC_UNLOCK(fdp); 145 } else { 146 fp->f_data = so; /* already has ref count */ 147 fp->f_flag = FREAD|FWRITE; 148 fp->f_ops = &socketops; 149 fp->f_type = DTYPE_SOCKET; 150 FILEDESC_UNLOCK(fdp); 151 td->td_retval[0] = fd; 152 } 153 fdrop(fp, td); 154 done2: 155 mtx_unlock(&Giant); 156 return (error); 157 } 158 159 /* 160 * MPSAFE 161 */ 162 /* ARGSUSED */ 163 int 164 bind(td, uap) 165 struct thread *td; 166 register struct bind_args /* { 167 int s; 168 caddr_t name; 169 int namelen; 170 } */ *uap; 171 { 172 struct socket *so; 173 struct sockaddr *sa; 174 int error; 175 176 mtx_lock(&Giant); 177 if ((error = fgetsock(td, uap->s, &so, NULL)) != 0) 178 goto done2; 179 if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0) 180 goto done1; 181 error = sobind(so, sa, td); 182 FREE(sa, M_SONAME); 183 done1: 184 fputsock(so); 185 done2: 186 mtx_unlock(&Giant); 187 return (error); 188 } 189 190 /* 191 * MPSAFE 192 */ 193 /* ARGSUSED */ 194 int 195 listen(td, uap) 196 struct thread *td; 197 register struct listen_args /* { 198 int s; 199 int backlog; 200 } */ *uap; 201 { 202 struct socket *so; 203 int error; 204 205 mtx_lock(&Giant); 206 if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) { 207 error = solisten(so, uap->backlog, td); 208 fputsock(so); 209 } 210 mtx_unlock(&Giant); 211 return(error); 212 } 213 214 /* 215 * accept1() 216 * MPSAFE 217 */ 218 static int 219 accept1(td, uap, compat) 220 struct thread *td; 221 register struct accept_args /* { 222 int s; 223 caddr_t name; 224 int *anamelen; 225 } */ *uap; 226 int compat; 227 { 228 struct filedesc *fdp; 229 struct file *nfp = NULL; 230 struct sockaddr *sa; 231 int namelen, error, s; 232 struct socket *head, *so; 233 int fd; 234 u_int fflag; 235 236 mtx_lock(&Giant); 237 fdp = td->td_proc->p_fd; 238 if (uap->name) { 239 error = copyin(uap->anamelen, &namelen, sizeof (namelen)); 240 if(error) 241 goto done2; 242 } 243 error = fgetsock(td, uap->s, &head, &fflag); 244 if (error) 245 goto done2; 246 s = splnet(); 247 if ((head->so_options & SO_ACCEPTCONN) == 0) { 248 splx(s); 249 error = EINVAL; 250 goto done; 251 } 252 if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { 253 splx(s); 254 error = EWOULDBLOCK; 255 goto done; 256 } 257 while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { 258 if (head->so_state & SS_CANTRCVMORE) { 259 head->so_error = ECONNABORTED; 260 break; 261 } 262 error = tsleep(&head->so_timeo, PSOCK | PCATCH, 263 "accept", 0); 264 if (error) { 265 splx(s); 266 goto done; 267 } 268 } 269 if (head->so_error) { 270 error = head->so_error; 271 head->so_error = 0; 272 splx(s); 273 goto done; 274 } 275 276 /* 277 * At this point we know that there is at least one connection 278 * ready to be accepted. Remove it from the queue prior to 279 * allocating the file descriptor for it since falloc() may 280 * block allowing another process to accept the connection 281 * instead. 282 */ 283 so = TAILQ_FIRST(&head->so_comp); 284 TAILQ_REMOVE(&head->so_comp, so, so_list); 285 head->so_qlen--; 286 287 error = falloc(td, &nfp, &fd); 288 if (error) { 289 /* 290 * Probably ran out of file descriptors. Put the 291 * unaccepted connection back onto the queue and 292 * do another wakeup so some other process might 293 * have a chance at it. 294 */ 295 TAILQ_INSERT_HEAD(&head->so_comp, so, so_list); 296 head->so_qlen++; 297 wakeup_one(&head->so_timeo); 298 splx(s); 299 goto done; 300 } 301 fhold(nfp); 302 td->td_retval[0] = fd; 303 304 /* connection has been removed from the listen queue */ 305 KNOTE(&head->so_rcv.sb_sel.si_note, 0); 306 307 so->so_state &= ~SS_COMP; 308 so->so_head = NULL; 309 if (head->so_sigio != NULL) 310 fsetown(fgetown(head->so_sigio), &so->so_sigio); 311 312 FILE_LOCK(nfp); 313 soref(so); /* file descriptor reference */ 314 nfp->f_data = so; /* nfp has ref count from falloc */ 315 nfp->f_flag = fflag; 316 nfp->f_ops = &socketops; 317 nfp->f_type = DTYPE_SOCKET; 318 FILE_UNLOCK(nfp); 319 sa = 0; 320 error = soaccept(so, &sa); 321 if (error) { 322 /* 323 * return a namelen of zero for older code which might 324 * ignore the return value from accept. 325 */ 326 if (uap->name != NULL) { 327 namelen = 0; 328 (void) copyout(&namelen, 329 uap->anamelen, sizeof(*uap->anamelen)); 330 } 331 goto noconnection; 332 } 333 if (sa == NULL) { 334 namelen = 0; 335 if (uap->name) 336 goto gotnoname; 337 splx(s); 338 error = 0; 339 goto done; 340 } 341 if (uap->name) { 342 /* check sa_len before it is destroyed */ 343 if (namelen > sa->sa_len) 344 namelen = sa->sa_len; 345 #ifdef COMPAT_OLDSOCK 346 if (compat) 347 ((struct osockaddr *)sa)->sa_family = 348 sa->sa_family; 349 #endif 350 error = copyout(sa, uap->name, (u_int)namelen); 351 if (!error) 352 gotnoname: 353 error = copyout(&namelen, 354 uap->anamelen, sizeof (*uap->anamelen)); 355 } 356 noconnection: 357 if (sa) 358 FREE(sa, M_SONAME); 359 360 /* 361 * close the new descriptor, assuming someone hasn't ripped it 362 * out from under us. 363 */ 364 if (error) { 365 FILEDESC_LOCK(fdp); 366 if (fdp->fd_ofiles[fd] == nfp) { 367 fdp->fd_ofiles[fd] = NULL; 368 FILEDESC_UNLOCK(fdp); 369 fdrop(nfp, td); 370 } else { 371 FILEDESC_UNLOCK(fdp); 372 } 373 } 374 splx(s); 375 376 /* 377 * Release explicitly held references before returning. 378 */ 379 done: 380 if (nfp != NULL) 381 fdrop(nfp, td); 382 fputsock(head); 383 done2: 384 mtx_unlock(&Giant); 385 return (error); 386 } 387 388 /* 389 * MPSAFE (accept1() is MPSAFE) 390 */ 391 int 392 accept(td, uap) 393 struct thread *td; 394 struct accept_args *uap; 395 { 396 397 return (accept1(td, uap, 0)); 398 } 399 400 #ifdef COMPAT_OLDSOCK 401 /* 402 * MPSAFE (accept1() is MPSAFE) 403 */ 404 int 405 oaccept(td, uap) 406 struct thread *td; 407 struct accept_args *uap; 408 { 409 410 return (accept1(td, uap, 1)); 411 } 412 #endif /* COMPAT_OLDSOCK */ 413 414 /* 415 * MPSAFE 416 */ 417 /* ARGSUSED */ 418 int 419 connect(td, uap) 420 struct thread *td; 421 register struct connect_args /* { 422 int s; 423 caddr_t name; 424 int namelen; 425 } */ *uap; 426 { 427 struct socket *so; 428 struct sockaddr *sa; 429 int error, s; 430 431 mtx_lock(&Giant); 432 if ((error = fgetsock(td, uap->s, &so, NULL)) != 0) 433 goto done2; 434 if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { 435 error = EALREADY; 436 goto done1; 437 } 438 error = getsockaddr(&sa, uap->name, uap->namelen); 439 if (error) 440 goto done1; 441 error = soconnect(so, sa, td); 442 if (error) 443 goto bad; 444 if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { 445 FREE(sa, M_SONAME); 446 error = EINPROGRESS; 447 goto done1; 448 } 449 s = splnet(); 450 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { 451 error = tsleep(&so->so_timeo, PSOCK | PCATCH, "connec", 0); 452 if (error) 453 break; 454 } 455 if (error == 0) { 456 error = so->so_error; 457 so->so_error = 0; 458 } 459 splx(s); 460 bad: 461 so->so_state &= ~SS_ISCONNECTING; 462 FREE(sa, M_SONAME); 463 if (error == ERESTART) 464 error = EINTR; 465 done1: 466 fputsock(so); 467 done2: 468 mtx_unlock(&Giant); 469 return (error); 470 } 471 472 /* 473 * MPSAFE 474 */ 475 int 476 socketpair(td, uap) 477 struct thread *td; 478 register struct socketpair_args /* { 479 int domain; 480 int type; 481 int protocol; 482 int *rsv; 483 } */ *uap; 484 { 485 register struct filedesc *fdp = td->td_proc->p_fd; 486 struct file *fp1, *fp2; 487 struct socket *so1, *so2; 488 int fd, error, sv[2]; 489 490 mtx_lock(&Giant); 491 error = socreate(uap->domain, &so1, uap->type, uap->protocol, 492 td->td_ucred, td); 493 if (error) 494 goto done2; 495 error = socreate(uap->domain, &so2, uap->type, uap->protocol, 496 td->td_ucred, td); 497 if (error) 498 goto free1; 499 error = falloc(td, &fp1, &fd); 500 if (error) 501 goto free2; 502 fhold(fp1); 503 sv[0] = fd; 504 fp1->f_data = so1; /* so1 already has ref count */ 505 error = falloc(td, &fp2, &fd); 506 if (error) 507 goto free3; 508 fhold(fp2); 509 fp2->f_data = so2; /* so2 already has ref count */ 510 sv[1] = fd; 511 error = soconnect2(so1, so2); 512 if (error) 513 goto free4; 514 if (uap->type == SOCK_DGRAM) { 515 /* 516 * Datagram socket connection is asymmetric. 517 */ 518 error = soconnect2(so2, so1); 519 if (error) 520 goto free4; 521 } 522 FILE_LOCK(fp1); 523 fp1->f_flag = FREAD|FWRITE; 524 fp1->f_ops = &socketops; 525 fp1->f_type = DTYPE_SOCKET; 526 FILE_UNLOCK(fp1); 527 FILE_LOCK(fp2); 528 fp2->f_flag = FREAD|FWRITE; 529 fp2->f_ops = &socketops; 530 fp2->f_type = DTYPE_SOCKET; 531 FILE_UNLOCK(fp2); 532 error = copyout(sv, uap->rsv, 2 * sizeof (int)); 533 fdrop(fp1, td); 534 fdrop(fp2, td); 535 goto done2; 536 free4: 537 FILEDESC_LOCK(fdp); 538 if (fdp->fd_ofiles[sv[1]] == fp2) { 539 fdp->fd_ofiles[sv[1]] = NULL; 540 FILEDESC_UNLOCK(fdp); 541 fdrop(fp2, td); 542 } else 543 FILEDESC_UNLOCK(fdp); 544 fdrop(fp2, td); 545 free3: 546 FILEDESC_LOCK(fdp); 547 if (fdp->fd_ofiles[sv[0]] == fp1) { 548 fdp->fd_ofiles[sv[0]] = NULL; 549 FILEDESC_UNLOCK(fdp); 550 fdrop(fp1, td); 551 } else 552 FILEDESC_UNLOCK(fdp); 553 fdrop(fp1, td); 554 free2: 555 (void)soclose(so2); 556 free1: 557 (void)soclose(so1); 558 done2: 559 mtx_unlock(&Giant); 560 return (error); 561 } 562 563 static int 564 sendit(td, s, mp, flags) 565 register struct thread *td; 566 int s; 567 register struct msghdr *mp; 568 int flags; 569 { 570 struct uio auio; 571 register struct iovec *iov; 572 register int i; 573 struct mbuf *control; 574 struct sockaddr *to = NULL; 575 int len, error; 576 struct socket *so; 577 #ifdef KTRACE 578 struct iovec *ktriov = NULL; 579 struct uio ktruio; 580 int iovlen; 581 #endif 582 583 if ((error = fgetsock(td, s, &so, NULL)) != 0) 584 return (error); 585 auio.uio_iov = mp->msg_iov; 586 auio.uio_iovcnt = mp->msg_iovlen; 587 auio.uio_segflg = UIO_USERSPACE; 588 auio.uio_rw = UIO_WRITE; 589 auio.uio_td = td; 590 auio.uio_offset = 0; /* XXX */ 591 auio.uio_resid = 0; 592 iov = mp->msg_iov; 593 for (i = 0; i < mp->msg_iovlen; i++, iov++) { 594 if ((auio.uio_resid += iov->iov_len) < 0) { 595 error = EINVAL; 596 goto bad; 597 } 598 } 599 if (mp->msg_name) { 600 error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); 601 if (error) 602 goto bad; 603 } 604 if (mp->msg_control) { 605 if (mp->msg_controllen < sizeof(struct cmsghdr) 606 #ifdef COMPAT_OLDSOCK 607 && mp->msg_flags != MSG_COMPAT 608 #endif 609 ) { 610 error = EINVAL; 611 goto bad; 612 } 613 error = sockargs(&control, mp->msg_control, 614 mp->msg_controllen, MT_CONTROL); 615 if (error) 616 goto bad; 617 #ifdef COMPAT_OLDSOCK 618 if (mp->msg_flags == MSG_COMPAT) { 619 register struct cmsghdr *cm; 620 621 M_PREPEND(control, sizeof(*cm), M_TRYWAIT); 622 if (control == 0) { 623 error = ENOBUFS; 624 goto bad; 625 } else { 626 cm = mtod(control, struct cmsghdr *); 627 cm->cmsg_len = control->m_len; 628 cm->cmsg_level = SOL_SOCKET; 629 cm->cmsg_type = SCM_RIGHTS; 630 } 631 } 632 #endif 633 } else { 634 control = 0; 635 } 636 #ifdef KTRACE 637 if (KTRPOINT(td, KTR_GENIO)) { 638 iovlen = auio.uio_iovcnt * sizeof (struct iovec); 639 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 640 bcopy(auio.uio_iov, ktriov, iovlen); 641 ktruio = auio; 642 } 643 #endif 644 len = auio.uio_resid; 645 error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control, 646 flags, td); 647 if (error) { 648 if (auio.uio_resid != len && (error == ERESTART || 649 error == EINTR || error == EWOULDBLOCK)) 650 error = 0; 651 /* Generation of SIGPIPE can be controlled per socket */ 652 if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE)) { 653 PROC_LOCK(td->td_proc); 654 psignal(td->td_proc, SIGPIPE); 655 PROC_UNLOCK(td->td_proc); 656 } 657 } 658 if (error == 0) 659 td->td_retval[0] = len - auio.uio_resid; 660 #ifdef KTRACE 661 if (ktriov != NULL) { 662 if (error == 0) { 663 ktruio.uio_iov = ktriov; 664 ktruio.uio_resid = td->td_retval[0]; 665 ktrgenio(s, UIO_WRITE, &ktruio, error); 666 } 667 FREE(ktriov, M_TEMP); 668 } 669 #endif 670 bad: 671 fputsock(so); 672 if (to) 673 FREE(to, M_SONAME); 674 return (error); 675 } 676 677 /* 678 * MPSAFE 679 */ 680 int 681 sendto(td, uap) 682 struct thread *td; 683 register struct sendto_args /* { 684 int s; 685 caddr_t buf; 686 size_t len; 687 int flags; 688 caddr_t to; 689 int tolen; 690 } */ *uap; 691 { 692 struct msghdr msg; 693 struct iovec aiov; 694 int error; 695 696 msg.msg_name = uap->to; 697 msg.msg_namelen = uap->tolen; 698 msg.msg_iov = &aiov; 699 msg.msg_iovlen = 1; 700 msg.msg_control = 0; 701 #ifdef COMPAT_OLDSOCK 702 msg.msg_flags = 0; 703 #endif 704 aiov.iov_base = uap->buf; 705 aiov.iov_len = uap->len; 706 mtx_lock(&Giant); 707 error = sendit(td, uap->s, &msg, uap->flags); 708 mtx_unlock(&Giant); 709 return (error); 710 } 711 712 #ifdef COMPAT_OLDSOCK 713 /* 714 * MPSAFE 715 */ 716 int 717 osend(td, uap) 718 struct thread *td; 719 register struct osend_args /* { 720 int s; 721 caddr_t buf; 722 int len; 723 int flags; 724 } */ *uap; 725 { 726 struct msghdr msg; 727 struct iovec aiov; 728 int error; 729 730 msg.msg_name = 0; 731 msg.msg_namelen = 0; 732 msg.msg_iov = &aiov; 733 msg.msg_iovlen = 1; 734 aiov.iov_base = uap->buf; 735 aiov.iov_len = uap->len; 736 msg.msg_control = 0; 737 msg.msg_flags = 0; 738 mtx_lock(&Giant); 739 error = sendit(td, uap->s, &msg, uap->flags); 740 mtx_unlock(&Giant); 741 return (error); 742 } 743 744 /* 745 * MPSAFE 746 */ 747 int 748 osendmsg(td, uap) 749 struct thread *td; 750 register struct osendmsg_args /* { 751 int s; 752 caddr_t msg; 753 int flags; 754 } */ *uap; 755 { 756 struct msghdr msg; 757 struct iovec aiov[UIO_SMALLIOV], *iov; 758 int error; 759 760 mtx_lock(&Giant); 761 error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); 762 if (error) 763 goto done2; 764 if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { 765 if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) { 766 error = EMSGSIZE; 767 goto done2; 768 } 769 MALLOC(iov, struct iovec *, 770 sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, 771 M_WAITOK); 772 } else { 773 iov = aiov; 774 } 775 error = copyin(msg.msg_iov, iov, 776 (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); 777 if (error) 778 goto done; 779 msg.msg_flags = MSG_COMPAT; 780 msg.msg_iov = iov; 781 error = sendit(td, uap->s, &msg, uap->flags); 782 done: 783 if (iov != aiov) 784 FREE(iov, M_IOV); 785 done2: 786 mtx_unlock(&Giant); 787 return (error); 788 } 789 #endif 790 791 /* 792 * MPSAFE 793 */ 794 int 795 sendmsg(td, uap) 796 struct thread *td; 797 register struct sendmsg_args /* { 798 int s; 799 caddr_t msg; 800 int flags; 801 } */ *uap; 802 { 803 struct msghdr msg; 804 struct iovec aiov[UIO_SMALLIOV], *iov; 805 int error; 806 807 mtx_lock(&Giant); 808 error = copyin(uap->msg, &msg, sizeof (msg)); 809 if (error) 810 goto done2; 811 if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { 812 if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) { 813 error = EMSGSIZE; 814 goto done2; 815 } 816 MALLOC(iov, struct iovec *, 817 sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, 818 M_WAITOK); 819 } else { 820 iov = aiov; 821 } 822 if (msg.msg_iovlen && 823 (error = copyin(msg.msg_iov, iov, 824 (unsigned)(msg.msg_iovlen * sizeof (struct iovec))))) 825 goto done; 826 msg.msg_iov = iov; 827 #ifdef COMPAT_OLDSOCK 828 msg.msg_flags = 0; 829 #endif 830 error = sendit(td, uap->s, &msg, uap->flags); 831 done: 832 if (iov != aiov) 833 FREE(iov, M_IOV); 834 done2: 835 mtx_unlock(&Giant); 836 return (error); 837 } 838 839 static int 840 recvit(td, s, mp, namelenp) 841 register struct thread *td; 842 int s; 843 register struct msghdr *mp; 844 void *namelenp; 845 { 846 struct uio auio; 847 register struct iovec *iov; 848 register int i; 849 int len, error; 850 struct mbuf *m, *control = 0; 851 caddr_t ctlbuf; 852 struct socket *so; 853 struct sockaddr *fromsa = 0; 854 #ifdef KTRACE 855 struct iovec *ktriov = NULL; 856 struct uio ktruio; 857 int iovlen; 858 #endif 859 860 if ((error = fgetsock(td, s, &so, NULL)) != 0) 861 return (error); 862 auio.uio_iov = mp->msg_iov; 863 auio.uio_iovcnt = mp->msg_iovlen; 864 auio.uio_segflg = UIO_USERSPACE; 865 auio.uio_rw = UIO_READ; 866 auio.uio_td = td; 867 auio.uio_offset = 0; /* XXX */ 868 auio.uio_resid = 0; 869 iov = mp->msg_iov; 870 for (i = 0; i < mp->msg_iovlen; i++, iov++) { 871 if ((auio.uio_resid += iov->iov_len) < 0) { 872 fputsock(so); 873 return (EINVAL); 874 } 875 } 876 #ifdef KTRACE 877 if (KTRPOINT(td, KTR_GENIO)) { 878 iovlen = auio.uio_iovcnt * sizeof (struct iovec); 879 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 880 bcopy(auio.uio_iov, ktriov, iovlen); 881 ktruio = auio; 882 } 883 #endif 884 len = auio.uio_resid; 885 error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio, 886 (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0, 887 &mp->msg_flags); 888 if (error) { 889 if (auio.uio_resid != len && (error == ERESTART || 890 error == EINTR || error == EWOULDBLOCK)) 891 error = 0; 892 } 893 #ifdef KTRACE 894 if (ktriov != NULL) { 895 if (error == 0) { 896 ktruio.uio_iov = ktriov; 897 ktruio.uio_resid = len - auio.uio_resid; 898 ktrgenio(s, UIO_READ, &ktruio, error); 899 } 900 FREE(ktriov, M_TEMP); 901 } 902 #endif 903 if (error) 904 goto out; 905 td->td_retval[0] = len - auio.uio_resid; 906 if (mp->msg_name) { 907 len = mp->msg_namelen; 908 if (len <= 0 || fromsa == 0) 909 len = 0; 910 else { 911 #ifndef MIN 912 #define MIN(a,b) ((a)>(b)?(b):(a)) 913 #endif 914 /* save sa_len before it is destroyed by MSG_COMPAT */ 915 len = MIN(len, fromsa->sa_len); 916 #ifdef COMPAT_OLDSOCK 917 if (mp->msg_flags & MSG_COMPAT) 918 ((struct osockaddr *)fromsa)->sa_family = 919 fromsa->sa_family; 920 #endif 921 error = copyout(fromsa, mp->msg_name, (unsigned)len); 922 if (error) 923 goto out; 924 } 925 mp->msg_namelen = len; 926 if (namelenp && 927 (error = copyout(&len, namelenp, sizeof (int)))) { 928 #ifdef COMPAT_OLDSOCK 929 if (mp->msg_flags & MSG_COMPAT) 930 error = 0; /* old recvfrom didn't check */ 931 else 932 #endif 933 goto out; 934 } 935 } 936 if (mp->msg_control) { 937 #ifdef COMPAT_OLDSOCK 938 /* 939 * We assume that old recvmsg calls won't receive access 940 * rights and other control info, esp. as control info 941 * is always optional and those options didn't exist in 4.3. 942 * If we receive rights, trim the cmsghdr; anything else 943 * is tossed. 944 */ 945 if (control && mp->msg_flags & MSG_COMPAT) { 946 if (mtod(control, struct cmsghdr *)->cmsg_level != 947 SOL_SOCKET || 948 mtod(control, struct cmsghdr *)->cmsg_type != 949 SCM_RIGHTS) { 950 mp->msg_controllen = 0; 951 goto out; 952 } 953 control->m_len -= sizeof (struct cmsghdr); 954 control->m_data += sizeof (struct cmsghdr); 955 } 956 #endif 957 len = mp->msg_controllen; 958 m = control; 959 mp->msg_controllen = 0; 960 ctlbuf = mp->msg_control; 961 962 while (m && len > 0) { 963 unsigned int tocopy; 964 965 if (len >= m->m_len) 966 tocopy = m->m_len; 967 else { 968 mp->msg_flags |= MSG_CTRUNC; 969 tocopy = len; 970 } 971 972 if ((error = copyout(mtod(m, caddr_t), 973 ctlbuf, tocopy)) != 0) 974 goto out; 975 976 ctlbuf += tocopy; 977 len -= tocopy; 978 m = m->m_next; 979 } 980 mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; 981 } 982 out: 983 fputsock(so); 984 if (fromsa) 985 FREE(fromsa, M_SONAME); 986 if (control) 987 m_freem(control); 988 return (error); 989 } 990 991 /* 992 * MPSAFE 993 */ 994 int 995 recvfrom(td, uap) 996 struct thread *td; 997 register struct recvfrom_args /* { 998 int s; 999 caddr_t buf; 1000 size_t len; 1001 int flags; 1002 caddr_t from; 1003 int *fromlenaddr; 1004 } */ *uap; 1005 { 1006 struct msghdr msg; 1007 struct iovec aiov; 1008 int error; 1009 1010 mtx_lock(&Giant); 1011 if (uap->fromlenaddr) { 1012 error = copyin(uap->fromlenaddr, 1013 &msg.msg_namelen, sizeof (msg.msg_namelen)); 1014 if (error) 1015 goto done2; 1016 } else { 1017 msg.msg_namelen = 0; 1018 } 1019 msg.msg_name = uap->from; 1020 msg.msg_iov = &aiov; 1021 msg.msg_iovlen = 1; 1022 aiov.iov_base = uap->buf; 1023 aiov.iov_len = uap->len; 1024 msg.msg_control = 0; 1025 msg.msg_flags = uap->flags; 1026 error = recvit(td, uap->s, &msg, uap->fromlenaddr); 1027 done2: 1028 mtx_unlock(&Giant); 1029 return(error); 1030 } 1031 1032 #ifdef COMPAT_OLDSOCK 1033 /* 1034 * MPSAFE 1035 */ 1036 int 1037 orecvfrom(td, uap) 1038 struct thread *td; 1039 struct recvfrom_args *uap; 1040 { 1041 1042 uap->flags |= MSG_COMPAT; 1043 return (recvfrom(td, uap)); 1044 } 1045 #endif 1046 1047 1048 #ifdef COMPAT_OLDSOCK 1049 /* 1050 * MPSAFE 1051 */ 1052 int 1053 orecv(td, uap) 1054 struct thread *td; 1055 register struct orecv_args /* { 1056 int s; 1057 caddr_t buf; 1058 int len; 1059 int flags; 1060 } */ *uap; 1061 { 1062 struct msghdr msg; 1063 struct iovec aiov; 1064 int error; 1065 1066 mtx_lock(&Giant); 1067 msg.msg_name = 0; 1068 msg.msg_namelen = 0; 1069 msg.msg_iov = &aiov; 1070 msg.msg_iovlen = 1; 1071 aiov.iov_base = uap->buf; 1072 aiov.iov_len = uap->len; 1073 msg.msg_control = 0; 1074 msg.msg_flags = uap->flags; 1075 error = recvit(td, uap->s, &msg, NULL); 1076 mtx_unlock(&Giant); 1077 return (error); 1078 } 1079 1080 /* 1081 * Old recvmsg. This code takes advantage of the fact that the old msghdr 1082 * overlays the new one, missing only the flags, and with the (old) access 1083 * rights where the control fields are now. 1084 * 1085 * MPSAFE 1086 */ 1087 int 1088 orecvmsg(td, uap) 1089 struct thread *td; 1090 register struct orecvmsg_args /* { 1091 int s; 1092 struct omsghdr *msg; 1093 int flags; 1094 } */ *uap; 1095 { 1096 struct msghdr msg; 1097 struct iovec aiov[UIO_SMALLIOV], *iov; 1098 int error; 1099 1100 error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); 1101 if (error) 1102 return (error); 1103 1104 mtx_lock(&Giant); 1105 if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { 1106 if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) { 1107 error = EMSGSIZE; 1108 goto done2; 1109 } 1110 MALLOC(iov, struct iovec *, 1111 sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, 1112 M_WAITOK); 1113 } else { 1114 iov = aiov; 1115 } 1116 msg.msg_flags = uap->flags | MSG_COMPAT; 1117 error = copyin(msg.msg_iov, iov, 1118 (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); 1119 if (error) 1120 goto done; 1121 msg.msg_iov = iov; 1122 error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); 1123 1124 if (msg.msg_controllen && error == 0) 1125 error = copyout(&msg.msg_controllen, 1126 &uap->msg->msg_accrightslen, sizeof (int)); 1127 done: 1128 if (iov != aiov) 1129 FREE(iov, M_IOV); 1130 done2: 1131 mtx_unlock(&Giant); 1132 return (error); 1133 } 1134 #endif 1135 1136 /* 1137 * MPSAFE 1138 */ 1139 int 1140 recvmsg(td, uap) 1141 struct thread *td; 1142 register struct recvmsg_args /* { 1143 int s; 1144 struct msghdr *msg; 1145 int flags; 1146 } */ *uap; 1147 { 1148 struct msghdr msg; 1149 struct iovec aiov[UIO_SMALLIOV], *uiov, *iov; 1150 register int error; 1151 1152 mtx_lock(&Giant); 1153 error = copyin(uap->msg, &msg, sizeof (msg)); 1154 if (error) 1155 goto done2; 1156 if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { 1157 if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) { 1158 error = EMSGSIZE; 1159 goto done2; 1160 } 1161 MALLOC(iov, struct iovec *, 1162 sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, 1163 M_WAITOK); 1164 } else { 1165 iov = aiov; 1166 } 1167 #ifdef COMPAT_OLDSOCK 1168 msg.msg_flags = uap->flags &~ MSG_COMPAT; 1169 #else 1170 msg.msg_flags = uap->flags; 1171 #endif 1172 uiov = msg.msg_iov; 1173 msg.msg_iov = iov; 1174 error = copyin(uiov, iov, 1175 (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); 1176 if (error) 1177 goto done; 1178 error = recvit(td, uap->s, &msg, NULL); 1179 if (!error) { 1180 msg.msg_iov = uiov; 1181 error = copyout(&msg, uap->msg, sizeof(msg)); 1182 } 1183 done: 1184 if (iov != aiov) 1185 FREE(iov, M_IOV); 1186 done2: 1187 mtx_unlock(&Giant); 1188 return (error); 1189 } 1190 1191 /* 1192 * MPSAFE 1193 */ 1194 /* ARGSUSED */ 1195 int 1196 shutdown(td, uap) 1197 struct thread *td; 1198 register struct shutdown_args /* { 1199 int s; 1200 int how; 1201 } */ *uap; 1202 { 1203 struct socket *so; 1204 int error; 1205 1206 mtx_lock(&Giant); 1207 if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) { 1208 error = soshutdown(so, uap->how); 1209 fputsock(so); 1210 } 1211 mtx_unlock(&Giant); 1212 return(error); 1213 } 1214 1215 /* 1216 * MPSAFE 1217 */ 1218 /* ARGSUSED */ 1219 int 1220 setsockopt(td, uap) 1221 struct thread *td; 1222 register struct setsockopt_args /* { 1223 int s; 1224 int level; 1225 int name; 1226 caddr_t val; 1227 int valsize; 1228 } */ *uap; 1229 { 1230 struct socket *so; 1231 struct sockopt sopt; 1232 int error; 1233 1234 if (uap->val == 0 && uap->valsize != 0) 1235 return (EFAULT); 1236 if (uap->valsize < 0) 1237 return (EINVAL); 1238 1239 mtx_lock(&Giant); 1240 if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) { 1241 sopt.sopt_dir = SOPT_SET; 1242 sopt.sopt_level = uap->level; 1243 sopt.sopt_name = uap->name; 1244 sopt.sopt_val = uap->val; 1245 sopt.sopt_valsize = uap->valsize; 1246 sopt.sopt_td = td; 1247 error = sosetopt(so, &sopt); 1248 fputsock(so); 1249 } 1250 mtx_unlock(&Giant); 1251 return(error); 1252 } 1253 1254 /* 1255 * MPSAFE 1256 */ 1257 /* ARGSUSED */ 1258 int 1259 getsockopt(td, uap) 1260 struct thread *td; 1261 register struct getsockopt_args /* { 1262 int s; 1263 int level; 1264 int name; 1265 caddr_t val; 1266 int *avalsize; 1267 } */ *uap; 1268 { 1269 int valsize, error; 1270 struct socket *so; 1271 struct sockopt sopt; 1272 1273 mtx_lock(&Giant); 1274 if ((error = fgetsock(td, uap->s, &so, NULL)) != 0) 1275 goto done2; 1276 if (uap->val) { 1277 error = copyin(uap->avalsize, &valsize, sizeof (valsize)); 1278 if (error) 1279 goto done1; 1280 if (valsize < 0) { 1281 error = EINVAL; 1282 goto done1; 1283 } 1284 } else { 1285 valsize = 0; 1286 } 1287 1288 sopt.sopt_dir = SOPT_GET; 1289 sopt.sopt_level = uap->level; 1290 sopt.sopt_name = uap->name; 1291 sopt.sopt_val = uap->val; 1292 sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */ 1293 sopt.sopt_td = td; 1294 1295 error = sogetopt(so, &sopt); 1296 if (error == 0) { 1297 valsize = sopt.sopt_valsize; 1298 error = copyout(&valsize, uap->avalsize, sizeof (valsize)); 1299 } 1300 done1: 1301 fputsock(so); 1302 done2: 1303 mtx_unlock(&Giant); 1304 return (error); 1305 } 1306 1307 /* 1308 * getsockname1() - Get socket name. 1309 * 1310 * MPSAFE 1311 */ 1312 /* ARGSUSED */ 1313 static int 1314 getsockname1(td, uap, compat) 1315 struct thread *td; 1316 register struct getsockname_args /* { 1317 int fdes; 1318 caddr_t asa; 1319 int *alen; 1320 } */ *uap; 1321 int compat; 1322 { 1323 struct socket *so; 1324 struct sockaddr *sa; 1325 int len, error; 1326 1327 mtx_lock(&Giant); 1328 if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0) 1329 goto done2; 1330 error = copyin(uap->alen, &len, sizeof (len)); 1331 if (error) 1332 goto done1; 1333 sa = 0; 1334 error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa); 1335 if (error) 1336 goto bad; 1337 if (sa == 0) { 1338 len = 0; 1339 goto gotnothing; 1340 } 1341 1342 len = MIN(len, sa->sa_len); 1343 #ifdef COMPAT_OLDSOCK 1344 if (compat) 1345 ((struct osockaddr *)sa)->sa_family = sa->sa_family; 1346 #endif 1347 error = copyout(sa, uap->asa, (u_int)len); 1348 if (error == 0) 1349 gotnothing: 1350 error = copyout(&len, uap->alen, sizeof (len)); 1351 bad: 1352 if (sa) 1353 FREE(sa, M_SONAME); 1354 done1: 1355 fputsock(so); 1356 done2: 1357 mtx_unlock(&Giant); 1358 return (error); 1359 } 1360 1361 /* 1362 * MPSAFE 1363 */ 1364 int 1365 getsockname(td, uap) 1366 struct thread *td; 1367 struct getsockname_args *uap; 1368 { 1369 1370 return (getsockname1(td, uap, 0)); 1371 } 1372 1373 #ifdef COMPAT_OLDSOCK 1374 /* 1375 * MPSAFE 1376 */ 1377 int 1378 ogetsockname(td, uap) 1379 struct thread *td; 1380 struct getsockname_args *uap; 1381 { 1382 1383 return (getsockname1(td, uap, 1)); 1384 } 1385 #endif /* COMPAT_OLDSOCK */ 1386 1387 /* 1388 * getpeername1() - Get name of peer for connected socket. 1389 * 1390 * MPSAFE 1391 */ 1392 /* ARGSUSED */ 1393 static int 1394 getpeername1(td, uap, compat) 1395 struct thread *td; 1396 register struct getpeername_args /* { 1397 int fdes; 1398 caddr_t asa; 1399 int *alen; 1400 } */ *uap; 1401 int compat; 1402 { 1403 struct socket *so; 1404 struct sockaddr *sa; 1405 int len, error; 1406 1407 mtx_lock(&Giant); 1408 if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0) 1409 goto done2; 1410 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { 1411 error = ENOTCONN; 1412 goto done1; 1413 } 1414 error = copyin(uap->alen, &len, sizeof (len)); 1415 if (error) 1416 goto done1; 1417 sa = 0; 1418 error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa); 1419 if (error) 1420 goto bad; 1421 if (sa == 0) { 1422 len = 0; 1423 goto gotnothing; 1424 } 1425 len = MIN(len, sa->sa_len); 1426 #ifdef COMPAT_OLDSOCK 1427 if (compat) 1428 ((struct osockaddr *)sa)->sa_family = 1429 sa->sa_family; 1430 #endif 1431 error = copyout(sa, uap->asa, (u_int)len); 1432 if (error) 1433 goto bad; 1434 gotnothing: 1435 error = copyout(&len, uap->alen, sizeof (len)); 1436 bad: 1437 if (sa) 1438 FREE(sa, M_SONAME); 1439 done1: 1440 fputsock(so); 1441 done2: 1442 mtx_unlock(&Giant); 1443 return (error); 1444 } 1445 1446 /* 1447 * MPSAFE 1448 */ 1449 int 1450 getpeername(td, uap) 1451 struct thread *td; 1452 struct getpeername_args *uap; 1453 { 1454 1455 return (getpeername1(td, uap, 0)); 1456 } 1457 1458 #ifdef COMPAT_OLDSOCK 1459 /* 1460 * MPSAFE 1461 */ 1462 int 1463 ogetpeername(td, uap) 1464 struct thread *td; 1465 struct ogetpeername_args *uap; 1466 { 1467 1468 /* XXX uap should have type `getpeername_args *' to begin with. */ 1469 return (getpeername1(td, (struct getpeername_args *)uap, 1)); 1470 } 1471 #endif /* COMPAT_OLDSOCK */ 1472 1473 int 1474 sockargs(mp, buf, buflen, type) 1475 struct mbuf **mp; 1476 caddr_t buf; 1477 int buflen, type; 1478 { 1479 register struct sockaddr *sa; 1480 register struct mbuf *m; 1481 int error; 1482 1483 if ((u_int)buflen > MLEN) { 1484 #ifdef COMPAT_OLDSOCK 1485 if (type == MT_SONAME && (u_int)buflen <= 112) 1486 buflen = MLEN; /* unix domain compat. hack */ 1487 else 1488 #endif 1489 return (EINVAL); 1490 } 1491 m = m_get(M_TRYWAIT, type); 1492 if (m == NULL) 1493 return (ENOBUFS); 1494 m->m_len = buflen; 1495 error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); 1496 if (error) 1497 (void) m_free(m); 1498 else { 1499 *mp = m; 1500 if (type == MT_SONAME) { 1501 sa = mtod(m, struct sockaddr *); 1502 1503 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN 1504 if (sa->sa_family == 0 && sa->sa_len < AF_MAX) 1505 sa->sa_family = sa->sa_len; 1506 #endif 1507 sa->sa_len = buflen; 1508 } 1509 } 1510 return (error); 1511 } 1512 1513 int 1514 getsockaddr(namp, uaddr, len) 1515 struct sockaddr **namp; 1516 caddr_t uaddr; 1517 size_t len; 1518 { 1519 struct sockaddr *sa; 1520 int error; 1521 1522 if (len > SOCK_MAXADDRLEN) 1523 return ENAMETOOLONG; 1524 MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK); 1525 error = copyin(uaddr, sa, len); 1526 if (error) { 1527 FREE(sa, M_SONAME); 1528 } else { 1529 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN 1530 if (sa->sa_family == 0 && sa->sa_len < AF_MAX) 1531 sa->sa_family = sa->sa_len; 1532 #endif 1533 sa->sa_len = len; 1534 *namp = sa; 1535 } 1536 return error; 1537 } 1538 1539 /* 1540 * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-)) 1541 * XXX - The sf_buf functions are currently private to sendfile(2), so have 1542 * been made static, but may be useful in the future for doing zero-copy in 1543 * other parts of the networking code. 1544 */ 1545 static void 1546 sf_buf_init(void *arg) 1547 { 1548 int i; 1549 1550 mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF); 1551 mtx_lock(&sf_freelist.sf_lock); 1552 SLIST_INIT(&sf_freelist.sf_head); 1553 sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE); 1554 sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, 1555 M_NOWAIT | M_ZERO); 1556 for (i = 0; i < nsfbufs; i++) { 1557 sf_bufs[i].kva = sf_base + i * PAGE_SIZE; 1558 SLIST_INSERT_HEAD(&sf_freelist.sf_head, &sf_bufs[i], free_list); 1559 } 1560 sf_buf_alloc_want = 0; 1561 mtx_unlock(&sf_freelist.sf_lock); 1562 } 1563 1564 /* 1565 * Get an sf_buf from the freelist. Will block if none are available. 1566 */ 1567 struct sf_buf * 1568 sf_buf_alloc() 1569 { 1570 struct sf_buf *sf; 1571 int error; 1572 1573 mtx_lock(&sf_freelist.sf_lock); 1574 while ((sf = SLIST_FIRST(&sf_freelist.sf_head)) == NULL) { 1575 sf_buf_alloc_want++; 1576 error = msleep(&sf_freelist, &sf_freelist.sf_lock, PVM|PCATCH, 1577 "sfbufa", 0); 1578 sf_buf_alloc_want--; 1579 1580 /* 1581 * If we got a signal, don't risk going back to sleep. 1582 */ 1583 if (error) 1584 break; 1585 } 1586 if (sf != NULL) 1587 SLIST_REMOVE_HEAD(&sf_freelist.sf_head, free_list); 1588 mtx_unlock(&sf_freelist.sf_lock); 1589 return (sf); 1590 } 1591 1592 #define dtosf(x) (&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT]) 1593 1594 /* 1595 * Detatch mapped page and release resources back to the system. 1596 */ 1597 void 1598 sf_buf_free(void *addr, void *args) 1599 { 1600 struct sf_buf *sf; 1601 struct vm_page *m; 1602 1603 GIANT_REQUIRED; 1604 1605 sf = dtosf(addr); 1606 pmap_qremove((vm_offset_t)addr, 1); 1607 m = sf->m; 1608 vm_page_lock_queues(); 1609 vm_page_unwire(m, 0); 1610 /* 1611 * Check for the object going away on us. This can 1612 * happen since we don't hold a reference to it. 1613 * If so, we're responsible for freeing the page. 1614 */ 1615 if (m->wire_count == 0 && m->object == NULL) 1616 vm_page_free(m); 1617 vm_page_unlock_queues(); 1618 sf->m = NULL; 1619 mtx_lock(&sf_freelist.sf_lock); 1620 SLIST_INSERT_HEAD(&sf_freelist.sf_head, sf, free_list); 1621 if (sf_buf_alloc_want > 0) 1622 wakeup_one(&sf_freelist); 1623 mtx_unlock(&sf_freelist.sf_lock); 1624 } 1625 1626 /* 1627 * sendfile(2) 1628 * 1629 * MPSAFE 1630 * 1631 * int sendfile(int fd, int s, off_t offset, size_t nbytes, 1632 * struct sf_hdtr *hdtr, off_t *sbytes, int flags) 1633 * 1634 * Send a file specified by 'fd' and starting at 'offset' to a socket 1635 * specified by 's'. Send only 'nbytes' of the file or until EOF if 1636 * nbytes == 0. Optionally add a header and/or trailer to the socket 1637 * output. If specified, write the total number of bytes sent into *sbytes. 1638 * 1639 */ 1640 int 1641 sendfile(struct thread *td, struct sendfile_args *uap) 1642 { 1643 1644 return (do_sendfile(td, uap, 0)); 1645 } 1646 1647 #ifdef COMPAT_FREEBSD4 1648 int 1649 freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap) 1650 { 1651 struct sendfile_args args; 1652 1653 args.fd = uap->fd; 1654 args.s = uap->s; 1655 args.offset = uap->offset; 1656 args.nbytes = uap->nbytes; 1657 args.hdtr = uap->hdtr; 1658 args.sbytes = uap->sbytes; 1659 args.flags = uap->flags; 1660 1661 return (do_sendfile(td, &args, 1)); 1662 } 1663 #endif /* COMPAT_FREEBSD4 */ 1664 1665 static int 1666 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat) 1667 { 1668 struct vnode *vp; 1669 struct vm_object *obj; 1670 struct socket *so = NULL; 1671 struct mbuf *m; 1672 struct sf_buf *sf; 1673 struct vm_page *pg; 1674 struct writev_args nuap; 1675 struct sf_hdtr hdtr; 1676 off_t off, xfsize, hdtr_size, sbytes = 0; 1677 int error, s; 1678 1679 mtx_lock(&Giant); 1680 1681 hdtr_size = 0; 1682 1683 /* 1684 * The descriptor must be a regular file and have a backing VM object. 1685 */ 1686 if ((error = fgetvp_read(td, uap->fd, &vp)) != 0) 1687 goto done; 1688 if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) { 1689 error = EINVAL; 1690 goto done; 1691 } 1692 if ((error = fgetsock(td, uap->s, &so, NULL)) != 0) 1693 goto done; 1694 if (so->so_type != SOCK_STREAM) { 1695 error = EINVAL; 1696 goto done; 1697 } 1698 if ((so->so_state & SS_ISCONNECTED) == 0) { 1699 error = ENOTCONN; 1700 goto done; 1701 } 1702 if (uap->offset < 0) { 1703 error = EINVAL; 1704 goto done; 1705 } 1706 1707 /* 1708 * If specified, get the pointer to the sf_hdtr struct for 1709 * any headers/trailers. 1710 */ 1711 if (uap->hdtr != NULL) { 1712 error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); 1713 if (error) 1714 goto done; 1715 /* 1716 * Send any headers. Wimp out and use writev(2). 1717 */ 1718 if (hdtr.headers != NULL) { 1719 nuap.fd = uap->s; 1720 nuap.iovp = hdtr.headers; 1721 nuap.iovcnt = hdtr.hdr_cnt; 1722 error = writev(td, &nuap); 1723 if (error) 1724 goto done; 1725 if (compat) 1726 sbytes += td->td_retval[0]; 1727 else 1728 hdtr_size += td->td_retval[0]; 1729 } 1730 } 1731 1732 /* 1733 * Protect against multiple writers to the socket. 1734 */ 1735 (void) sblock(&so->so_snd, M_WAITOK); 1736 1737 /* 1738 * Loop through the pages in the file, starting with the requested 1739 * offset. Get a file page (do I/O if necessary), map the file page 1740 * into an sf_buf, attach an mbuf header to the sf_buf, and queue 1741 * it on the socket. 1742 */ 1743 for (off = uap->offset; ; off += xfsize, sbytes += xfsize) { 1744 vm_pindex_t pindex; 1745 vm_offset_t pgoff; 1746 1747 pindex = OFF_TO_IDX(off); 1748 retry_lookup: 1749 /* 1750 * Calculate the amount to transfer. Not to exceed a page, 1751 * the EOF, or the passed in nbytes. 1752 */ 1753 xfsize = obj->un_pager.vnp.vnp_size - off; 1754 if (xfsize > PAGE_SIZE) 1755 xfsize = PAGE_SIZE; 1756 pgoff = (vm_offset_t)(off & PAGE_MASK); 1757 if (PAGE_SIZE - pgoff < xfsize) 1758 xfsize = PAGE_SIZE - pgoff; 1759 if (uap->nbytes && xfsize > (uap->nbytes - sbytes)) 1760 xfsize = uap->nbytes - sbytes; 1761 if (xfsize <= 0) 1762 break; 1763 /* 1764 * Optimize the non-blocking case by looking at the socket space 1765 * before going to the extra work of constituting the sf_buf. 1766 */ 1767 if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) { 1768 if (so->so_state & SS_CANTSENDMORE) 1769 error = EPIPE; 1770 else 1771 error = EAGAIN; 1772 sbunlock(&so->so_snd); 1773 goto done; 1774 } 1775 /* 1776 * Attempt to look up the page. 1777 * 1778 * Allocate if not found 1779 * 1780 * Wait and loop if busy. 1781 */ 1782 pg = vm_page_lookup(obj, pindex); 1783 1784 if (pg == NULL) { 1785 pg = vm_page_alloc(obj, pindex, 1786 VM_ALLOC_NORMAL | VM_ALLOC_WIRED); 1787 if (pg == NULL) { 1788 VM_WAIT; 1789 goto retry_lookup; 1790 } 1791 vm_page_wakeup(pg); 1792 } else { 1793 if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) 1794 goto retry_lookup; 1795 /* 1796 * Wire the page so it does not get ripped out from 1797 * under us. 1798 */ 1799 vm_page_lock_queues(); 1800 vm_page_wire(pg); 1801 vm_page_unlock_queues(); 1802 } 1803 1804 /* 1805 * If page is not valid for what we need, initiate I/O 1806 */ 1807 1808 if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) { 1809 int bsize; 1810 1811 /* 1812 * Ensure that our page is still around when the I/O 1813 * completes. 1814 */ 1815 vm_page_io_start(pg); 1816 1817 /* 1818 * Get the page from backing store. 1819 */ 1820 bsize = vp->v_mount->mnt_stat.f_iosize; 1821 vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td); 1822 error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE, 1823 trunc_page(off), UIO_NOCOPY, IO_NODELOCKED | 1824 IO_VMIO | ((MAXBSIZE / bsize) << 16), 1825 td->td_ucred, NULL, td); 1826 VOP_UNLOCK(vp, 0, td); 1827 vm_page_flag_clear(pg, PG_ZERO); 1828 vm_page_io_finish(pg); 1829 if (error) { 1830 vm_page_lock_queues(); 1831 vm_page_unwire(pg, 0); 1832 /* 1833 * See if anyone else might know about this page. 1834 * If not and it is not valid, then free it. 1835 */ 1836 if (pg->wire_count == 0 && pg->valid == 0 && 1837 pg->busy == 0 && !(pg->flags & PG_BUSY) && 1838 pg->hold_count == 0) { 1839 vm_page_busy(pg); 1840 vm_page_free(pg); 1841 } 1842 vm_page_unlock_queues(); 1843 sbunlock(&so->so_snd); 1844 goto done; 1845 } 1846 } 1847 1848 1849 /* 1850 * Get a sendfile buf. We usually wait as long as necessary, 1851 * but this wait can be interrupted. 1852 */ 1853 if ((sf = sf_buf_alloc()) == NULL) { 1854 vm_page_lock_queues(); 1855 vm_page_unwire(pg, 0); 1856 if (pg->wire_count == 0 && pg->object == NULL) 1857 vm_page_free(pg); 1858 vm_page_unlock_queues(); 1859 sbunlock(&so->so_snd); 1860 error = EINTR; 1861 goto done; 1862 } 1863 1864 /* 1865 * Allocate a kernel virtual page and insert the physical page 1866 * into it. 1867 */ 1868 sf->m = pg; 1869 pmap_qenter(sf->kva, &pg, 1); 1870 /* 1871 * Get an mbuf header and set it up as having external storage. 1872 */ 1873 MGETHDR(m, M_TRYWAIT, MT_DATA); 1874 if (m == NULL) { 1875 error = ENOBUFS; 1876 sf_buf_free((void *)sf->kva, NULL); 1877 sbunlock(&so->so_snd); 1878 goto done; 1879 } 1880 /* 1881 * Setup external storage for mbuf. 1882 */ 1883 MEXTADD(m, sf->kva, PAGE_SIZE, sf_buf_free, NULL, M_RDONLY, 1884 EXT_SFBUF); 1885 m->m_data = (char *) sf->kva + pgoff; 1886 m->m_pkthdr.len = m->m_len = xfsize; 1887 /* 1888 * Add the buffer to the socket buffer chain. 1889 */ 1890 s = splnet(); 1891 retry_space: 1892 /* 1893 * Make sure that the socket is still able to take more data. 1894 * CANTSENDMORE being true usually means that the connection 1895 * was closed. so_error is true when an error was sensed after 1896 * a previous send. 1897 * The state is checked after the page mapping and buffer 1898 * allocation above since those operations may block and make 1899 * any socket checks stale. From this point forward, nothing 1900 * blocks before the pru_send (or more accurately, any blocking 1901 * results in a loop back to here to re-check). 1902 */ 1903 if ((so->so_state & SS_CANTSENDMORE) || so->so_error) { 1904 if (so->so_state & SS_CANTSENDMORE) { 1905 error = EPIPE; 1906 } else { 1907 error = so->so_error; 1908 so->so_error = 0; 1909 } 1910 m_freem(m); 1911 sbunlock(&so->so_snd); 1912 splx(s); 1913 goto done; 1914 } 1915 /* 1916 * Wait for socket space to become available. We do this just 1917 * after checking the connection state above in order to avoid 1918 * a race condition with sbwait(). 1919 */ 1920 if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) { 1921 if (so->so_state & SS_NBIO) { 1922 m_freem(m); 1923 sbunlock(&so->so_snd); 1924 splx(s); 1925 error = EAGAIN; 1926 goto done; 1927 } 1928 error = sbwait(&so->so_snd); 1929 /* 1930 * An error from sbwait usually indicates that we've 1931 * been interrupted by a signal. If we've sent anything 1932 * then return bytes sent, otherwise return the error. 1933 */ 1934 if (error) { 1935 m_freem(m); 1936 sbunlock(&so->so_snd); 1937 splx(s); 1938 goto done; 1939 } 1940 goto retry_space; 1941 } 1942 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td); 1943 splx(s); 1944 if (error) { 1945 sbunlock(&so->so_snd); 1946 goto done; 1947 } 1948 } 1949 sbunlock(&so->so_snd); 1950 1951 /* 1952 * Send trailers. Wimp out and use writev(2). 1953 */ 1954 if (uap->hdtr != NULL && hdtr.trailers != NULL) { 1955 nuap.fd = uap->s; 1956 nuap.iovp = hdtr.trailers; 1957 nuap.iovcnt = hdtr.trl_cnt; 1958 error = writev(td, &nuap); 1959 if (error) 1960 goto done; 1961 if (compat) 1962 sbytes += td->td_retval[0]; 1963 else 1964 hdtr_size += td->td_retval[0]; 1965 } 1966 1967 done: 1968 /* 1969 * If there was no error we have to clear td->td_retval[0] 1970 * because it may have been set by writev. 1971 */ 1972 if (error == 0) { 1973 td->td_retval[0] = 0; 1974 } 1975 if (uap->sbytes != NULL) { 1976 if (!compat) 1977 sbytes += hdtr_size; 1978 copyout(&sbytes, uap->sbytes, sizeof(off_t)); 1979 } 1980 if (vp) 1981 vrele(vp); 1982 if (so) 1983 fputsock(so); 1984 mtx_unlock(&Giant); 1985 return (error); 1986 } 1987