/*
 * Copyright (c) 1982, 1986, 1989, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * sendfile(2) and related extensions:
 * Copyright (c) 1998, David Greenman. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
 * $FreeBSD$
 */

#include "opt_compat.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/malloc.h>
#include <sys/filedesc.h>
#include <sys/event.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

static void sf_buf_init(void *arg);
SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
struct sf_buf *sf_buf_alloc(void);
void sf_buf_free(void *addr, void *args);

static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);

static int accept1(struct thread *td, struct accept_args *uap, int compat);
static int getsockname1(struct thread *td, struct getsockname_args *uap,
	int compat);
static int getpeername1(struct thread *td, struct getpeername_args *uap,
	int compat);

/*
 * Expanded sf_freelist head.  Really an SLIST_HEAD() in disguise, with the
 * sf_freelist head paired with the sf_lock mutex that protects it.
 */
static struct {
	SLIST_HEAD(, sf_buf) sf_head;
	struct mtx sf_lock;
} sf_freelist;

vm_offset_t sf_base;
struct sf_buf *sf_bufs;
u_int sf_buf_alloc_want;

/*
 * System call interface to the socket abstraction.
 */
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#define COMPAT_OLDSOCK
#endif

extern	struct fileops socketops;

/*
 * MPSAFE
 */
int
socket(td, uap)
	struct thread *td;
	register struct socket_args /* {
		int	domain;
		int	type;
		int	protocol;
	} */ *uap;
{
	struct filedesc *fdp;
	struct socket *so;
	struct file *fp;
	int fd, error;

	mtx_lock(&Giant);
	fdp = td->td_proc->p_fd;
	error = falloc(td, &fp, &fd);
	if (error)
		goto done2;
	fhold(fp);
	error = socreate(uap->domain, &so, uap->type, uap->protocol,
	    td->td_ucred, td);
	FILEDESC_LOCK(fdp);
	if (error) {
		if (fdp->fd_ofiles[fd] == fp) {
			fdp->fd_ofiles[fd] = NULL;
			FILEDESC_UNLOCK(fdp);
			fdrop(fp, td);
		} else
			FILEDESC_UNLOCK(fdp);
	} else {
		fp->f_data = so;	/* already has ref count */
		fp->f_flag = FREAD|FWRITE;
		fp->f_ops = &socketops;
		fp->f_type = DTYPE_SOCKET;
		FILEDESC_UNLOCK(fdp);
		td->td_retval[0] = fd;
	}
	fdrop(fp, td);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * MPSAFE
 */
/* ARGSUSED */
int
bind(td, uap)
	struct thread *td;
	register struct bind_args /* {
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct socket *so;
	struct sockaddr *sa;
	int error;

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
		goto done2;
	if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
		goto done1;
	error = sobind(so, sa, td);
	FREE(sa, M_SONAME);
done1:
	fputsock(so);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * MPSAFE
 */
/* ARGSUSED */
int
listen(td, uap)
	struct thread *td;
	register struct listen_args /* {
		int	s;
		int	backlog;
	} */ *uap;
{
	struct socket *so;
	int error;

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
		error = solisten(so, uap->backlog, td);
		fputsock(so);
	}
	mtx_unlock(&Giant);
	return (error);
}

/*
 * accept1()
 * MPSAFE
 */
static int
accept1(td, uap, compat)
	struct thread *td;
	register struct accept_args /* {
		int	s;
		caddr_t	name;
		int	*anamelen;
	} */ *uap;
	int compat;
{
	struct filedesc *fdp;
	struct file *nfp = NULL;
	struct sockaddr *sa;
	int namelen, error, s;
	struct socket *head, *so;
	int fd;
	u_int fflag;

	mtx_lock(&Giant);
	fdp = td->td_proc->p_fd;
	if (uap->name) {
		error = copyin(uap->anamelen, &namelen, sizeof (namelen));
		if (error)
			goto done2;
	}
	error = fgetsock(td, uap->s, &head, &fflag);
	if (error)
		goto done2;
	s = splnet();
	if ((head->so_options & SO_ACCEPTCONN) == 0) {
		splx(s);
		error = EINVAL;
		goto done;
	}
	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
		splx(s);
		error = EWOULDBLOCK;
		goto done;
	}
	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
		if (head->so_state & SS_CANTRCVMORE) {
			head->so_error = ECONNABORTED;
			break;
		}
		error = tsleep(&head->so_timeo, PSOCK | PCATCH,
		    "accept", 0);
		if (error) {
			splx(s);
			goto done;
		}
	}
	if (head->so_error) {
		error = head->so_error;
		head->so_error = 0;
		splx(s);
		goto done;
	}

	/*
	 * At this point we know that there is at least one connection
	 * ready to be accepted.  Remove it from the queue prior to
	 * allocating the file descriptor for it since falloc() may
	 * block allowing another process to accept the connection
	 * instead.
	 */
	so = TAILQ_FIRST(&head->so_comp);
	TAILQ_REMOVE(&head->so_comp, so, so_list);
	head->so_qlen--;

	error = falloc(td, &nfp, &fd);
	if (error) {
		/*
		 * Probably ran out of file descriptors.  Put the
		 * unaccepted connection back onto the queue and
		 * do another wakeup so some other process might
		 * have a chance at it.
		 */
		TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
		head->so_qlen++;
		wakeup_one(&head->so_timeo);
		splx(s);
		goto done;
	}
	fhold(nfp);
	td->td_retval[0] = fd;

	/* connection has been removed from the listen queue */
	KNOTE(&head->so_rcv.sb_sel.si_note, 0);

	so->so_state &= ~SS_COMP;
	so->so_head = NULL;
	if (head->so_sigio != NULL)
		fsetown(fgetown(head->so_sigio), &so->so_sigio);

	FILE_LOCK(nfp);
	soref(so);		/* file descriptor reference */
	nfp->f_data = so;	/* nfp has ref count from falloc */
	nfp->f_flag = fflag;
	nfp->f_ops = &socketops;
	nfp->f_type = DTYPE_SOCKET;
	FILE_UNLOCK(nfp);
	sa = 0;
	error = soaccept(so, &sa);
	if (error) {
		/*
		 * return a namelen of zero for older code which might
		 * ignore the return value from accept.
		 */
		if (uap->name != NULL) {
			namelen = 0;
			(void) copyout(&namelen,
			    uap->anamelen, sizeof(*uap->anamelen));
		}
		goto noconnection;
	}
	if (sa == NULL) {
		namelen = 0;
		if (uap->name)
			goto gotnoname;
		splx(s);
		error = 0;
		goto done;
	}
	if (uap->name) {
		/* check sa_len before it is destroyed */
		if (namelen > sa->sa_len)
			namelen = sa->sa_len;
#ifdef COMPAT_OLDSOCK
		if (compat)
			((struct osockaddr *)sa)->sa_family =
			    sa->sa_family;
#endif
		error = copyout(sa, uap->name, (u_int)namelen);
		if (!error)
gotnoname:
			error = copyout(&namelen,
			    uap->anamelen, sizeof (*uap->anamelen));
	}
noconnection:
	if (sa)
		FREE(sa, M_SONAME);

	/*
	 * close the new descriptor, assuming someone hasn't ripped it
	 * out from under us.
	 */
	if (error) {
		FILEDESC_LOCK(fdp);
		if (fdp->fd_ofiles[fd] == nfp) {
			fdp->fd_ofiles[fd] = NULL;
			FILEDESC_UNLOCK(fdp);
			fdrop(nfp, td);
		} else {
			FILEDESC_UNLOCK(fdp);
		}
	}
	splx(s);

	/*
	 * Release explicitly held references before returning.
	 */
done:
	if (nfp != NULL)
		fdrop(nfp, td);
	fputsock(head);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * MPSAFE (accept1() is MPSAFE)
 */
int
accept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{

	return (accept1(td, uap, 0));
}

#ifdef COMPAT_OLDSOCK
/*
 * MPSAFE (accept1() is MPSAFE)
 */
int
oaccept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{

	return (accept1(td, uap, 1));
}
#endif /* COMPAT_OLDSOCK */

/*
 * MPSAFE
 */
/* ARGSUSED */
int
connect(td, uap)
	struct thread *td;
	register struct connect_args /* {
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct socket *so;
	struct sockaddr *sa;
	int error, s;

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
		goto done2;
	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
		error = EALREADY;
		goto done1;
	}
	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error)
		goto done1;
	error = soconnect(so, sa, td);
	if (error)
		goto bad;
	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
		FREE(sa, M_SONAME);
		error = EINPROGRESS;
		goto done1;
	}
	s = splnet();
	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
		error = tsleep(&so->so_timeo, PSOCK | PCATCH, "connec", 0);
		if (error)
			break;
	}
	if (error == 0) {
		error = so->so_error;
		so->so_error = 0;
	}
	splx(s);
bad:
	so->so_state &= ~SS_ISCONNECTING;
	FREE(sa, M_SONAME);
	if (error == ERESTART)
		error = EINTR;
done1:
	fputsock(so);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * MPSAFE
 */
int
socketpair(td, uap)
	struct thread *td;
	register struct socketpair_args /* {
		int	domain;
		int	type;
		int	protocol;
		int	*rsv;
	} */ *uap;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	struct file *fp1, *fp2;
	struct socket *so1, *so2;
	int fd, error, sv[2];

	mtx_lock(&Giant);
	error = socreate(uap->domain, &so1, uap->type, uap->protocol,
	    td->td_ucred, td);
	if (error)
		goto done2;
	error = socreate(uap->domain, &so2, uap->type, uap->protocol,
	    td->td_ucred, td);
	if (error)
		goto free1;
	error = falloc(td, &fp1, &fd);
	if (error)
		goto free2;
	fhold(fp1);
	sv[0] = fd;
	fp1->f_data = so1;	/* so1 already has ref count */
	error = falloc(td, &fp2, &fd);
	if (error)
		goto free3;
	fhold(fp2);
	fp2->f_data = so2;	/* so2 already has ref count */
	sv[1] = fd;
	error = soconnect2(so1, so2);
	if (error)
		goto free4;
	if (uap->type == SOCK_DGRAM) {
		/*
		 * Datagram socket connection is asymmetric.
		 */
		error = soconnect2(so2, so1);
		if (error)
			goto free4;
	}
	FILE_LOCK(fp1);
	fp1->f_flag = FREAD|FWRITE;
	fp1->f_ops = &socketops;
	fp1->f_type = DTYPE_SOCKET;
	FILE_UNLOCK(fp1);
	FILE_LOCK(fp2);
	fp2->f_flag = FREAD|FWRITE;
	fp2->f_ops = &socketops;
	fp2->f_type = DTYPE_SOCKET;
	FILE_UNLOCK(fp2);
	error = copyout(sv, uap->rsv, 2 * sizeof (int));
	fdrop(fp1, td);
	fdrop(fp2, td);
	goto done2;
free4:
	FILEDESC_LOCK(fdp);
	if (fdp->fd_ofiles[sv[1]] == fp2) {
		fdp->fd_ofiles[sv[1]] = NULL;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp2, td);
	} else
		FILEDESC_UNLOCK(fdp);
	fdrop(fp2, td);
free3:
	FILEDESC_LOCK(fdp);
	if (fdp->fd_ofiles[sv[0]] == fp1) {
		fdp->fd_ofiles[sv[0]] = NULL;
		FILEDESC_UNLOCK(fdp);
		fdrop(fp1, td);
	} else
		FILEDESC_UNLOCK(fdp);
	fdrop(fp1, td);
free2:
	(void)soclose(so2);
free1:
	(void)soclose(so1);
done2:
	mtx_unlock(&Giant);
	return (error);
}

static int
sendit(td, s, mp, flags)
	register struct thread *td;
	int s;
	register struct msghdr *mp;
	int flags;
{
	struct uio auio;
	register struct iovec *iov;
	register int i;
	struct mbuf *control;
	struct sockaddr *to = NULL;
	int len, error;
	struct socket *so;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
	int iovlen;
#endif

	if ((error = fgetsock(td, s, &so, NULL)) != 0)
		return (error);
	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			error = EINVAL;
			goto bad;
		}
	}
	if (mp->msg_name) {
		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
		if (error)
			goto bad;
	}
	if (mp->msg_control) {
		if (mp->msg_controllen < sizeof(struct cmsghdr)
#ifdef COMPAT_OLDSOCK
		    && mp->msg_flags != MSG_COMPAT
#endif
		) {
			error = EINVAL;
			goto bad;
		}
		error = sockargs(&control, mp->msg_control,
		    mp->msg_controllen, MT_CONTROL);
		if (error)
			goto bad;
#ifdef COMPAT_OLDSOCK
		if (mp->msg_flags == MSG_COMPAT) {
			register struct cmsghdr *cm;

			M_PREPEND(control, sizeof(*cm), M_TRYWAIT);
			if (control == 0) {
				error = ENOBUFS;
				goto bad;
			} else {
				cm = mtod(control, struct cmsghdr *);
				cm->cmsg_len = control->m_len;
				cm->cmsg_level = SOL_SOCKET;
				cm->cmsg_type = SCM_RIGHTS;
			}
		}
#endif
	} else {
		control = 0;
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO)) {
		iovlen = auio.uio_iovcnt * sizeof (struct iovec);
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy(auio.uio_iov, ktriov, iovlen);
		ktruio = auio;
	}
#endif
	len = auio.uio_resid;
	error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control,
	    flags, td);
	if (error) {
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Generation of SIGPIPE can be controlled per socket */
		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE)) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	if (error == 0)
		td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = td->td_retval[0];
			ktrgenio(s, UIO_WRITE, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
bad:
	fputsock(so);
	if (to)
		FREE(to, M_SONAME);
	return (error);
}

/*
 * MPSAFE
 */
int
sendto(td, uap)
	struct thread *td;
	register struct sendto_args /* {
		int	s;
		caddr_t	buf;
		size_t	len;
		int	flags;
		caddr_t	to;
		int	tolen;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;
	int error;

	msg.msg_name = uap->to;
	msg.msg_namelen = uap->tolen;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	msg.msg_control = 0;
#ifdef COMPAT_OLDSOCK
	msg.msg_flags = 0;
#endif
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	mtx_lock(&Giant);
	error = sendit(td, uap->s, &msg, uap->flags);
	mtx_unlock(&Giant);
	return (error);
}

#ifdef COMPAT_OLDSOCK
/*
 * MPSAFE
 */
int
osend(td, uap)
	struct thread *td;
	register struct osend_args /* {
		int	s;
		caddr_t	buf;
		int	len;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;
	int error;

	msg.msg_name = 0;
	msg.msg_namelen = 0;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	msg.msg_control = 0;
	msg.msg_flags = 0;
	mtx_lock(&Giant);
	error = sendit(td, uap->s, &msg, uap->flags);
	mtx_unlock(&Giant);
	return (error);
}

/*
 * MPSAFE
 */
int
osendmsg(td, uap)
	struct thread *td;
	register struct osendmsg_args /* {
		int	s;
		caddr_t	msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov[UIO_SMALLIOV], *iov;
	int error;

	mtx_lock(&Giant);
	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
	if (error)
		goto done2;
	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
			error = EMSGSIZE;
			goto done2;
		}
		MALLOC(iov, struct iovec *,
		    sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
		    M_WAITOK);
	} else {
		iov = aiov;
	}
	error = copyin(msg.msg_iov, iov,
	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
	if (error)
		goto done;
	msg.msg_flags = MSG_COMPAT;
	msg.msg_iov = iov;
	error = sendit(td, uap->s, &msg, uap->flags);
done:
	if (iov != aiov)
		FREE(iov, M_IOV);
done2:
	mtx_unlock(&Giant);
	return (error);
}
#endif

/*
 * MPSAFE
 */
int
sendmsg(td, uap)
	struct thread *td;
	register struct sendmsg_args /* {
		int	s;
		caddr_t	msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov[UIO_SMALLIOV], *iov;
	int error;

	mtx_lock(&Giant);
	error = copyin(uap->msg, &msg, sizeof (msg));
	if (error)
		goto done2;
	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
			error = EMSGSIZE;
			goto done2;
		}
		MALLOC(iov, struct iovec *,
		    sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
		    M_WAITOK);
	} else {
		iov = aiov;
	}
	if (msg.msg_iovlen &&
	    (error = copyin(msg.msg_iov, iov,
	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
		goto done;
	msg.msg_iov = iov;
#ifdef COMPAT_OLDSOCK
	msg.msg_flags = 0;
#endif
	error = sendit(td, uap->s, &msg, uap->flags);
done:
	if (iov != aiov)
		FREE(iov, M_IOV);
done2:
	mtx_unlock(&Giant);
	return (error);
}

static int
recvit(td, s, mp, namelenp)
	register struct thread *td;
	int s;
	register struct msghdr *mp;
	void *namelenp;
{
	struct uio auio;
	register struct iovec *iov;
	register int i;
	int len, error;
	struct mbuf *m, *control = 0;
	caddr_t ctlbuf;
	struct socket *so;
	struct sockaddr *fromsa = 0;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
	int iovlen;
#endif

	if ((error = fgetsock(td, s, &so, NULL)) != 0)
		return (error);
	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			fputsock(so);
			return (EINVAL);
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO)) {
		iovlen = auio.uio_iovcnt * sizeof (struct iovec);
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy(auio.uio_iov, ktriov, iovlen);
		ktruio = auio;
	}
#endif
	len = auio.uio_resid;
	error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
	    (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
	    &mp->msg_flags);
	if (error) {
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio.uio_resid;
			ktrgenio(s, UIO_READ, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error)
		goto out;
	td->td_retval[0] = len - auio.uio_resid;
	if (mp->msg_name) {
		len = mp->msg_namelen;
		if (len <= 0 || fromsa == 0)
			len = 0;
		else {
#ifndef MIN
#define MIN(a,b) ((a)>(b)?(b):(a))
#endif
			/* save sa_len before it is destroyed by MSG_COMPAT */
			len = MIN(len, fromsa->sa_len);
#ifdef COMPAT_OLDSOCK
			if (mp->msg_flags & MSG_COMPAT)
				((struct osockaddr *)fromsa)->sa_family =
				    fromsa->sa_family;
#endif
			error = copyout(fromsa, mp->msg_name, (unsigned)len);
			if (error)
				goto out;
		}
		mp->msg_namelen = len;
		if (namelenp &&
		    (error = copyout(&len, namelenp, sizeof (int)))) {
#ifdef COMPAT_OLDSOCK
			if (mp->msg_flags & MSG_COMPAT)
				error = 0;	/* old recvfrom didn't check */
			else
#endif
				goto out;
		}
	}
	if (mp->msg_control) {
#ifdef COMPAT_OLDSOCK
		/*
		 * We assume that old recvmsg calls won't receive access
		 * rights and other control info, esp. as control info
		 * is always optional and those options didn't exist in 4.3.
		 * If we receive rights, trim the cmsghdr; anything else
		 * is tossed.
		 */
		if (control && mp->msg_flags & MSG_COMPAT) {
			if (mtod(control, struct cmsghdr *)->cmsg_level !=
			    SOL_SOCKET ||
			    mtod(control, struct cmsghdr *)->cmsg_type !=
			    SCM_RIGHTS) {
				mp->msg_controllen = 0;
				goto out;
			}
			control->m_len -= sizeof (struct cmsghdr);
			control->m_data += sizeof (struct cmsghdr);
		}
#endif
		len = mp->msg_controllen;
		m = control;
		mp->msg_controllen = 0;
		ctlbuf = mp->msg_control;

		while (m && len > 0) {
			unsigned int tocopy;

			if (len >= m->m_len)
				tocopy = m->m_len;
			else {
				mp->msg_flags |= MSG_CTRUNC;
				tocopy = len;
			}

			if ((error = copyout(mtod(m, caddr_t),
			    ctlbuf, tocopy)) != 0)
				goto out;

			ctlbuf += tocopy;
			len -= tocopy;
			m = m->m_next;
		}
		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
	}
out:
	fputsock(so);
	if (fromsa)
		FREE(fromsa, M_SONAME);
	if (control)
		m_freem(control);
	return (error);
}

/*
 * MPSAFE
 */
int
recvfrom(td, uap)
	struct thread *td;
	register struct recvfrom_args /* {
		int	s;
		caddr_t	buf;
		size_t	len;
		int	flags;
		caddr_t	from;
		int	*fromlenaddr;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;
	int error;

	mtx_lock(&Giant);
	if (uap->fromlenaddr) {
		error = copyin(uap->fromlenaddr,
		    &msg.msg_namelen, sizeof (msg.msg_namelen));
		if (error)
			goto done2;
	} else {
		msg.msg_namelen = 0;
	}
	msg.msg_name = uap->from;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	msg.msg_control = 0;
	msg.msg_flags = uap->flags;
	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
done2:
	mtx_unlock(&Giant);
	return (error);
}

#ifdef COMPAT_OLDSOCK
/*
 * MPSAFE
 */
int
orecvfrom(td, uap)
	struct thread *td;
	struct recvfrom_args *uap;
{

	uap->flags |= MSG_COMPAT;
	return (recvfrom(td, uap));
}
#endif


#ifdef COMPAT_OLDSOCK
/*
 * MPSAFE
 */
int
orecv(td, uap)
	struct thread *td;
	register struct orecv_args /* {
		int	s;
		caddr_t	buf;
		int	len;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;
	int error;

	mtx_lock(&Giant);
	msg.msg_name = 0;
	msg.msg_namelen = 0;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	msg.msg_control = 0;
	msg.msg_flags = uap->flags;
	error = recvit(td, uap->s, &msg, NULL);
	mtx_unlock(&Giant);
	return (error);
}
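
/*
 * For orientation, the 4.3BSD-era msghdr that the compatibility shims here
 * rely on looks roughly like this (an illustrative sketch only; the
 * authoritative definitions live in the socket headers, not in this file):
 *
 *	struct omsghdr {
 *		caddr_t		msg_name;	-- same as msghdr
 *		int		msg_namelen;	-- same as msghdr
 *		struct iovec	*msg_iov;	-- same as msghdr
 *		int		msg_iovlen;	-- same as msghdr
 *		caddr_t		msg_accrights;	-- where msg_control now sits
 *		int		msg_accrightslen; -- where msg_controllen now sits
 *	};					-- and there is no msg_flags
 */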

/*
 * Old recvmsg.  This code takes advantage of the fact that the old msghdr
 * overlays the new one, missing only the flags, and with the (old) access
 * rights where the control fields are now.
 *
 * MPSAFE
 */
int
orecvmsg(td, uap)
	struct thread *td;
	register struct orecvmsg_args /* {
		int	s;
		struct	omsghdr *msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov[UIO_SMALLIOV], *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
	if (error)
		return (error);

	mtx_lock(&Giant);
	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
			error = EMSGSIZE;
			goto done2;
		}
		MALLOC(iov, struct iovec *,
		    sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
		    M_WAITOK);
	} else {
		iov = aiov;
	}
	msg.msg_flags = uap->flags | MSG_COMPAT;
	error = copyin(msg.msg_iov, iov,
	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
	if (error)
		goto done;
	msg.msg_iov = iov;
	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);

	if (msg.msg_controllen && error == 0)
		error = copyout(&msg.msg_controllen,
		    &uap->msg->msg_accrightslen, sizeof (int));
done:
	if (iov != aiov)
		FREE(iov, M_IOV);
done2:
	mtx_unlock(&Giant);
	return (error);
}
#endif

/*
 * MPSAFE
 */
int
recvmsg(td, uap)
	struct thread *td;
	register struct recvmsg_args /* {
		int	s;
		struct	msghdr *msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
	register int error;

	mtx_lock(&Giant);
	error = copyin(uap->msg, &msg, sizeof (msg));
	if (error)
		goto done2;
	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
			error = EMSGSIZE;
			goto done2;
		}
		MALLOC(iov, struct iovec *,
		    sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
		    M_WAITOK);
	} else {
		iov = aiov;
	}
#ifdef COMPAT_OLDSOCK
	msg.msg_flags = uap->flags &~ MSG_COMPAT;
#else
	msg.msg_flags = uap->flags;
#endif
	uiov = msg.msg_iov;
	msg.msg_iov = iov;
	error = copyin(uiov, iov,
	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
	if (error)
		goto done;
	error = recvit(td, uap->s, &msg, NULL);
	if (!error) {
		msg.msg_iov = uiov;
		error = copyout(&msg, uap->msg, sizeof(msg));
	}
done:
	if (iov != aiov)
		FREE(iov, M_IOV);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * MPSAFE
 */
/* ARGSUSED */
int
shutdown(td, uap)
	struct thread *td;
	register struct shutdown_args /* {
		int	s;
		int	how;
	} */ *uap;
{
	struct socket *so;
	int error;

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
		error = soshutdown(so, uap->how);
		fputsock(so);
	}
	mtx_unlock(&Giant);
	return (error);
}

/*
 * MPSAFE
 */
/* ARGSUSED */
int
setsockopt(td, uap)
	struct thread *td;
	register struct setsockopt_args /* {
		int	s;
		int	level;
		int	name;
		caddr_t	val;
		int	valsize;
	} */ *uap;
{
	struct socket *so;
	struct sockopt sopt;
	int error;

	if (uap->val == 0 && uap->valsize != 0)
		return (EFAULT);
	if (uap->valsize < 0)
		return (EINVAL);

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
		sopt.sopt_dir = SOPT_SET;
		sopt.sopt_level = uap->level;
		sopt.sopt_name = uap->name;
		sopt.sopt_val = uap->val;
		sopt.sopt_valsize = uap->valsize;
		sopt.sopt_td = td;
		error = sosetopt(so, &sopt);
		fputsock(so);
	}
	mtx_unlock(&Giant);
	return (error);
}

/*
 * MPSAFE
 */
/* ARGSUSED */
int
getsockopt(td, uap)
	struct thread *td;
	register struct getsockopt_args /* {
		int	s;
		int	level;
		int	name;
		caddr_t	val;
		int	*avalsize;
	} */ *uap;
{
	int valsize, error;
	struct socket *so;
	struct sockopt sopt;

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
		goto done2;
	if (uap->val) {
		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
		if (error)
			goto done1;
		if (valsize < 0) {
			error = EINVAL;
			goto done1;
		}
	} else {
		valsize = 0;
	}

	sopt.sopt_dir = SOPT_GET;
	sopt.sopt_level = uap->level;
	sopt.sopt_name = uap->name;
	sopt.sopt_val = uap->val;
	sopt.sopt_valsize = (size_t)valsize;	/* checked non-negative above */
	sopt.sopt_td = td;

	error = sogetopt(so, &sopt);
	if (error == 0) {
		valsize = sopt.sopt_valsize;
		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
	}
done1:
	fputsock(so);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * getsockname1() - Get socket name.
 *
 * MPSAFE
 */
/* ARGSUSED */
static int
getsockname1(td, uap, compat)
	struct thread *td;
	register struct getsockname_args /* {
		int	fdes;
		caddr_t	asa;
		int	*alen;
	} */ *uap;
	int compat;
{
	struct socket *so;
	struct sockaddr *sa;
	int len, error;

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0)
		goto done2;
	error = copyin(uap->alen, &len, sizeof (len));
	if (error)
		goto done1;
	sa = 0;
	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
	if (error)
		goto bad;
	if (sa == 0) {
		len = 0;
		goto gotnothing;
	}

	len = MIN(len, sa->sa_len);
#ifdef COMPAT_OLDSOCK
	if (compat)
		((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
	error = copyout(sa, uap->asa, (u_int)len);
	if (error == 0)
gotnothing:
		error = copyout(&len, uap->alen, sizeof (len));
bad:
	if (sa)
		FREE(sa, M_SONAME);
done1:
	fputsock(so);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * MPSAFE
 */
int
getsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{

	return (getsockname1(td, uap, 0));
}

#ifdef COMPAT_OLDSOCK
/*
 * MPSAFE
 */
int
ogetsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{

	return (getsockname1(td, uap, 1));
}
#endif /* COMPAT_OLDSOCK */

/*
 * getpeername1() - Get name of peer for connected socket.
 *
 * MPSAFE
 */
/* ARGSUSED */
static int
getpeername1(td, uap, compat)
	struct thread *td;
	register struct getpeername_args /* {
		int	fdes;
		caddr_t	asa;
		int	*alen;
	} */ *uap;
	int compat;
{
	struct socket *so;
	struct sockaddr *sa;
	int len, error;

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0)
		goto done2;
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
		error = ENOTCONN;
		goto done1;
	}
	error = copyin(uap->alen, &len, sizeof (len));
	if (error)
		goto done1;
	sa = 0;
	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
	if (error)
		goto bad;
	if (sa == 0) {
		len = 0;
		goto gotnothing;
	}
	len = MIN(len, sa->sa_len);
#ifdef COMPAT_OLDSOCK
	if (compat)
		((struct osockaddr *)sa)->sa_family =
		    sa->sa_family;
#endif
	error = copyout(sa, uap->asa, (u_int)len);
	if (error)
		goto bad;
gotnothing:
	error = copyout(&len, uap->alen, sizeof (len));
bad:
	if (sa)
		FREE(sa, M_SONAME);
done1:
	fputsock(so);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * MPSAFE
 */
int
getpeername(td, uap)
	struct thread *td;
	struct getpeername_args *uap;
{

	return (getpeername1(td, uap, 0));
}

#ifdef COMPAT_OLDSOCK
/*
 * MPSAFE
 */
int
ogetpeername(td, uap)
	struct thread *td;
	struct ogetpeername_args *uap;
{

	/* XXX uap should have type `getpeername_args *' to begin with. */
	return (getpeername1(td, (struct getpeername_args *)uap, 1));
}
#endif /* COMPAT_OLDSOCK */

int
sockargs(mp, buf, buflen, type)
	struct mbuf **mp;
	caddr_t buf;
	int buflen, type;
{
	register struct sockaddr *sa;
	register struct mbuf *m;
	int error;

	if ((u_int)buflen > MLEN) {
#ifdef COMPAT_OLDSOCK
		if (type == MT_SONAME && (u_int)buflen <= 112)
			buflen = MLEN;		/* unix domain compat. hack */
		else
#endif
			return (EINVAL);
	}
	m = m_get(M_TRYWAIT, type);
	if (m == NULL)
		return (ENOBUFS);
	m->m_len = buflen;
	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
	if (error)
		(void) m_free(m);
	else {
		*mp = m;
		if (type == MT_SONAME) {
			sa = mtod(m, struct sockaddr *);

#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
				sa->sa_family = sa->sa_len;
#endif
			sa->sa_len = buflen;
		}
	}
	return (error);
}

int
getsockaddr(namp, uaddr, len)
	struct sockaddr **namp;
	caddr_t uaddr;
	size_t len;
{
	struct sockaddr *sa;
	int error;

	if (len > SOCK_MAXADDRLEN)
		return (ENAMETOOLONG);
	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
	error = copyin(uaddr, sa, len);
	if (error) {
		FREE(sa, M_SONAME);
	} else {
#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
			sa->sa_family = sa->sa_len;
#endif
		sa->sa_len = len;
		*namp = sa;
	}
	return (error);
}
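
/*
 * The sf_buf code below assumes an sf_buf that pairs a preallocated kernel
 * virtual address slot with the vm_page currently mapped at it, roughly as
 * follows (an illustrative sketch only; the real declaration lives in a
 * shared header, not in this file):
 *
 *	struct sf_buf {
 *		SLIST_ENTRY(sf_buf) free_list;	-- free-list linkage
 *		struct vm_page *m;		-- page currently mapped here
 *		vm_offset_t kva;		-- fixed KVA of this slot
 *	};
 */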

/*
 * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
 * XXX - The sf_buf functions are currently private to sendfile(2), so have
 * been made static, but may be useful in the future for doing zero-copy in
 * other parts of the networking code.
 */
static void
sf_buf_init(void *arg)
{
	int i;

	mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", NULL, MTX_DEF);
	mtx_lock(&sf_freelist.sf_lock);
	SLIST_INIT(&sf_freelist.sf_head);
	sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP,
	    M_NOWAIT | M_ZERO);
	for (i = 0; i < nsfbufs; i++) {
		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
		SLIST_INSERT_HEAD(&sf_freelist.sf_head, &sf_bufs[i], free_list);
	}
	sf_buf_alloc_want = 0;
	mtx_unlock(&sf_freelist.sf_lock);
}

/*
 * Get an sf_buf from the freelist.  Will block if none are available.
 */
struct sf_buf *
sf_buf_alloc()
{
	struct sf_buf *sf;
	int error;

	mtx_lock(&sf_freelist.sf_lock);
	while ((sf = SLIST_FIRST(&sf_freelist.sf_head)) == NULL) {
		sf_buf_alloc_want++;
		error = msleep(&sf_freelist, &sf_freelist.sf_lock, PVM|PCATCH,
		    "sfbufa", 0);
		sf_buf_alloc_want--;

		/*
		 * If we got a signal, don't risk going back to sleep.
		 */
		if (error)
			break;
	}
	if (sf != NULL)
		SLIST_REMOVE_HEAD(&sf_freelist.sf_head, free_list);
	mtx_unlock(&sf_freelist.sf_lock);
	return (sf);
}

#define dtosf(x)	(&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])

/*
 * Detach mapped page and release resources back to the system.
 */
void
sf_buf_free(void *addr, void *args)
{
	struct sf_buf *sf;
	struct vm_page *m;

	GIANT_REQUIRED;

	sf = dtosf(addr);
	pmap_qremove((vm_offset_t)addr, 1);
	m = sf->m;
	vm_page_unwire(m, 0);
	/*
	 * Check for the object going away on us.  This can
	 * happen since we don't hold a reference to it.
	 * If so, we're responsible for freeing the page.
	 */
	if (m->wire_count == 0 && m->object == NULL)
		vm_page_free(m);
	sf->m = NULL;
	mtx_lock(&sf_freelist.sf_lock);
	SLIST_INSERT_HEAD(&sf_freelist.sf_head, sf, free_list);
	if (sf_buf_alloc_want > 0)
		wakeup_one(&sf_freelist);
	mtx_unlock(&sf_freelist.sf_lock);
}

/*
 * sendfile(2)
 *
 * MPSAFE
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *		struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Send a file specified by 'fd' and starting at 'offset' to a socket
 * specified by 's'.  Send only 'nbytes' of the file or until EOF if
 * nbytes == 0.  Optionally add a header and/or trailer to the socket
 * output.  If specified, write the total number of bytes sent into *sbytes.
 *
 */
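/*
 * A minimal userland sketch of the call (illustrative only), assuming 'fd'
 * is an open regular file and 's' is a connected SOCK_STREAM socket, which
 * matches the checks made in the function below:
 *
 *	off_t sbytes;
 *
 *	if (sendfile(fd, s, 0, 0, NULL, &sbytes, 0) == -1)
 *		err(1, "sendfile");
 *	-- on success, sbytes holds the total number of bytes sent
 */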
int
sendfile(struct thread *td, struct sendfile_args *uap)
{
	struct vnode *vp;
	struct vm_object *obj;
	struct socket *so = NULL;
	struct mbuf *m;
	struct sf_buf *sf;
	struct vm_page *pg;
	struct writev_args nuap;
	struct sf_hdtr hdtr;
	off_t off, xfsize, hdtr_size, sbytes = 0;
	int error, s;

	mtx_lock(&Giant);

	hdtr_size = 0;

	/*
	 * The descriptor must be a regular file and have a backing VM object.
	 */
	if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
		goto done;
	if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) {
		error = EINVAL;
		goto done;
	}
	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
		goto done;
	if (so->so_type != SOCK_STREAM) {
		error = EINVAL;
		goto done;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto done;
	}
	if (uap->offset < 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If specified, get the pointer to the sf_hdtr struct for
	 * any headers/trailers.
	 */
	if (uap->hdtr != NULL) {
		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
		if (error)
			goto done;
		/*
		 * Send any headers.  Wimp out and use writev(2).
		 */
		if (hdtr.headers != NULL) {
			nuap.fd = uap->s;
			nuap.iovp = hdtr.headers;
			nuap.iovcnt = hdtr.hdr_cnt;
			error = writev(td, &nuap);
			if (error)
				goto done;
			hdtr_size += td->td_retval[0];
		}
	}

	/*
	 * Protect against multiple writers to the socket.
	 */
	(void) sblock(&so->so_snd, M_WAITOK);

	/*
	 * Loop through the pages in the file, starting with the requested
	 * offset.  Get a file page (do I/O if necessary), map the file page
	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
	 * it on the socket.
	 */
	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
		vm_pindex_t pindex;
		vm_offset_t pgoff;

		pindex = OFF_TO_IDX(off);
retry_lookup:
		/*
		 * Calculate the amount to transfer.  It must not exceed a
		 * page, the EOF, or the passed-in nbytes.
		 */
		xfsize = obj->un_pager.vnp.vnp_size - off;
		if (xfsize > PAGE_SIZE)
			xfsize = PAGE_SIZE;
		pgoff = (vm_offset_t)(off & PAGE_MASK);
		if (PAGE_SIZE - pgoff < xfsize)
			xfsize = PAGE_SIZE - pgoff;
		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
			xfsize = uap->nbytes - sbytes;
		if (xfsize <= 0)
			break;
		/*
		 * Optimize the non-blocking case by looking at the socket space
		 * before going to the extra work of constituting the sf_buf.
		 */
		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
			if (so->so_state & SS_CANTSENDMORE)
				error = EPIPE;
			else
				error = EAGAIN;
			sbunlock(&so->so_snd);
			goto done;
		}
		/*
		 * Attempt to look up the page.
		 *
		 *	Allocate if not found.
		 *
		 *	Wait and loop if busy.
		 */
		pg = vm_page_lookup(obj, pindex);

		if (pg == NULL) {
			pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
			if (pg == NULL) {
				VM_WAIT;
				goto retry_lookup;
			}
			vm_page_wakeup(pg);
		} else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) {
			goto retry_lookup;
		}

		/*
		 * Wire the page so it does not get ripped out from under
		 * us.
		 */
		vm_page_wire(pg);

		/*
		 * If page is not valid for what we need, initiate I/O.
		 */
		if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
			int bsize;

			/*
			 * Ensure that our page is still around when the I/O
			 * completes.
			 */
			vm_page_io_start(pg);

			/*
			 * Get the page from backing store.
			 */
			bsize = vp->v_mount->mnt_stat.f_iosize;
			vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td);
			error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
			    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
			    IO_VMIO | ((MAXBSIZE / bsize) << 16),
			    td->td_ucred, NULL, td);
			VOP_UNLOCK(vp, 0, td);
			vm_page_flag_clear(pg, PG_ZERO);
			vm_page_io_finish(pg);
			if (error) {
				vm_page_unwire(pg, 0);
				/*
				 * See if anyone else might know about this page.
				 * If not and it is not valid, then free it.
				 */
				if (pg->wire_count == 0 && pg->valid == 0 &&
				    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
				    pg->hold_count == 0) {
					vm_page_busy(pg);
					vm_page_free(pg);
				}
				sbunlock(&so->so_snd);
				goto done;
			}
		}

		/*
		 * Get a sendfile buf.  We usually wait as long as necessary,
		 * but this wait can be interrupted.
		 */
		if ((sf = sf_buf_alloc()) == NULL) {
			vm_page_unwire(pg, 0);
			if (pg->wire_count == 0 && pg->object == NULL)
				vm_page_free(pg);
			sbunlock(&so->so_snd);
			error = EINTR;
			goto done;
		}

		/*
		 * Allocate a kernel virtual page and insert the physical page
		 * into it.
		 */
		sf->m = pg;
		pmap_qenter(sf->kva, &pg, 1);
		/*
		 * Get an mbuf header and set it up as having external storage.
		 */
		MGETHDR(m, M_TRYWAIT, MT_DATA);
		if (m == NULL) {
			error = ENOBUFS;
			sf_buf_free((void *)sf->kva, NULL);
			sbunlock(&so->so_snd);
			goto done;
		}
		/*
		 * Set up external storage for the mbuf.
		 */
		MEXTADD(m, sf->kva, PAGE_SIZE, sf_buf_free, NULL, M_RDONLY,
		    EXT_SFBUF);
		m->m_data = (char *) sf->kva + pgoff;
		m->m_pkthdr.len = m->m_len = xfsize;
		/*
		 * Add the buffer to the socket buffer chain.
		 */
		s = splnet();
retry_space:
		/*
		 * Make sure that the socket is still able to take more data.
		 * CANTSENDMORE being true usually means that the connection
		 * was closed.  so_error is true when an error was sensed after
		 * a previous send.
		 * The state is checked after the page mapping and buffer
		 * allocation above since those operations may block and make
		 * any socket checks stale.  From this point forward, nothing
		 * blocks before the pru_send (or more accurately, any blocking
		 * results in a loop back to here to re-check).
		 */
		if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
			if (so->so_state & SS_CANTSENDMORE) {
				error = EPIPE;
			} else {
				error = so->so_error;
				so->so_error = 0;
			}
			m_freem(m);
			sbunlock(&so->so_snd);
			splx(s);
			goto done;
		}
		/*
		 * Wait for socket space to become available.  We do this just
		 * after checking the connection state above in order to avoid
		 * a race condition with sbwait().
		 */
		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
			if (so->so_state & SS_NBIO) {
				m_freem(m);
				sbunlock(&so->so_snd);
				splx(s);
				error = EAGAIN;
				goto done;
			}
			error = sbwait(&so->so_snd);
			/*
			 * An error from sbwait usually indicates that we've
			 * been interrupted by a signal.  If we've sent anything
			 * then return bytes sent, otherwise return the error.
			 */
			if (error) {
				m_freem(m);
				sbunlock(&so->so_snd);
				splx(s);
				goto done;
			}
			goto retry_space;
		}
		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td);
		splx(s);
		if (error) {
			sbunlock(&so->so_snd);
			goto done;
		}
	}
	sbunlock(&so->so_snd);

	/*
	 * Send trailers.  Wimp out and use writev(2).
	 */
	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
		nuap.fd = uap->s;
		nuap.iovp = hdtr.trailers;
		nuap.iovcnt = hdtr.trl_cnt;
		error = writev(td, &nuap);
		if (error)
			goto done;
		hdtr_size += td->td_retval[0];
	}

done:
	/*
	 * If there was no error we have to clear td->td_retval[0]
	 * because it may have been set by writev.
	 */
	if (error == 0) {
		td->td_retval[0] = 0;
	}
	if (uap->sbytes != NULL) {
		sbytes += hdtr_size;
		copyout(&sbytes, uap->sbytes, sizeof(off_t));
	}
	if (vp)
		vrele(vp);
	if (so)
		fputsock(so);
	mtx_unlock(&Giant);
	return (error);
}