/*
 * Copyright (c) 1982, 1986, 1989, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * sendfile(2) and related extensions:
 * Copyright (c) 1998, David Greenman. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
 * $FreeBSD$
 */

#include "opt_compat.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/malloc.h>
#include <sys/filedesc.h>
#include <sys/event.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

static void sf_buf_init(void *arg);
SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
static struct sf_buf *sf_buf_alloc(void);
static void sf_buf_free(caddr_t addr, void *args);

static int sendit __P((struct thread *td, int s, struct msghdr *mp, int flags));
static int recvit __P((struct thread *td, int s, struct msghdr *mp,
		       caddr_t namelenp));

static int accept1 __P((struct thread *td, struct accept_args *uap, int compat));
static int getsockname1 __P((struct thread *td, struct getsockname_args *uap,
			     int compat));
static int getpeername1 __P((struct thread *td, struct getpeername_args *uap,
			     int compat));

/*
 * Expanded sf_freelist head.  Really an SLIST_HEAD() in disguise, with the
 * sf_freelist head protected by the sf_lock mutex.
 */
static struct {
	SLIST_HEAD(, sf_buf) sf_head;
	struct mtx sf_lock;
} sf_freelist;

static vm_offset_t sf_base;
static struct sf_buf *sf_bufs;
static u_int sf_buf_alloc_want;

/*
 * System call interface to the socket abstraction.
 */
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#define COMPAT_OLDSOCK
#endif

extern struct fileops socketops;

/*
 * MPSAFE
 */
int
socket(td, uap)
	struct thread *td;
	register struct socket_args /* {
		int	domain;
		int	type;
		int	protocol;
	} */ *uap;
{
	struct filedesc *fdp;
	struct socket *so;
	struct file *fp;
	int fd, error;

	mtx_lock(&Giant);
	fdp = td->td_proc->p_fd;
	error = falloc(td, &fp, &fd);
	if (error)
		goto done2;
	fhold(fp);
	error = socreate(uap->domain, &so, uap->type, uap->protocol, td);
	if (error) {
		if (fdp->fd_ofiles[fd] == fp) {
			fdp->fd_ofiles[fd] = NULL;
			fdrop(fp, td);
		}
	} else {
		fp->f_data = (caddr_t)so;	/* already has ref count */
		fp->f_flag = FREAD|FWRITE;
		fp->f_ops = &socketops;
		fp->f_type = DTYPE_SOCKET;
		td->td_retval[0] = fd;
	}
	fdrop(fp, td);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * MPSAFE
 */
/* ARGSUSED */
int
bind(td, uap)
	struct thread *td;
	register struct bind_args /* {
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	struct socket *sp;
	int error;

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->s, &sp, NULL)) != 0)
		goto done2;
	if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
		goto done1;
	error = sobind(sp, sa, td);
	FREE(sa, M_SONAME);
done1:
	fputsock(sp);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * MPSAFE
 */
/* ARGSUSED */
int
listen(td, uap)
	struct thread *td;
	register struct listen_args /* {
		int	s;
		int	backlog;
	} */ *uap;
{
	struct socket *sp;
	int error;

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->s, &sp, NULL)) == 0) {
		error = solisten(sp, uap->backlog, td);
		fputsock(sp);
	}
	mtx_unlock(&Giant);
	return (error);
}

/*
 * accept1()
 * MPSAFE
 */
static int
accept1(td, uap, compat)
	struct thread *td;
	register struct accept_args /* {
		int	s;
		caddr_t	name;
		int	*anamelen;
	} */ *uap;
	int compat;
{
	struct filedesc *fdp;
	struct file *nfp = NULL;
	struct sockaddr *sa;
	int namelen, error, s;
	struct socket *head, *so;
	int fd;
	u_int fflag;

	mtx_lock(&Giant);
	fdp = td->td_proc->p_fd;
	if (uap->name) {
		error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen,
		    sizeof (namelen));
		if (error)
			goto done2;
	}
	error = fgetsock(td, uap->s, &head, &fflag);
	if (error)
		goto done2;
	s = splnet();
	if ((head->so_options & SO_ACCEPTCONN) == 0) {
		splx(s);
		error = EINVAL;
		goto done;
	}
	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
		splx(s);
		error = EWOULDBLOCK;
		goto done;
	}
	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
		if (head->so_state & SS_CANTRCVMORE) {
			head->so_error = ECONNABORTED;
			break;
		}
		error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH,
		    "accept", 0);
		if (error) {
			splx(s);
			goto done;
		}
	}
	if (head->so_error) {
		error = head->so_error;
		head->so_error = 0;
		splx(s);
		goto done;
	}

	/*
	 * At this point we know that there is at least one connection
	 * ready to be accepted.  Remove it from the queue prior to
	 * allocating the file descriptor for it since falloc() may
	 * block allowing another process to accept the connection
	 * instead.
	 */
	so = TAILQ_FIRST(&head->so_comp);
	TAILQ_REMOVE(&head->so_comp, so, so_list);
	head->so_qlen--;

	error = falloc(td, &nfp, &fd);
	if (error) {
		/*
		 * Probably ran out of file descriptors.  Put the
		 * unaccepted connection back onto the queue and
		 * do another wakeup so some other process might
		 * have a chance at it.
		 */
		TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
		head->so_qlen++;
		wakeup_one(&head->so_timeo);
		splx(s);
		goto done;
	}
	fhold(nfp);
	td->td_retval[0] = fd;

	/* connection has been removed from the listen queue */
	KNOTE(&head->so_rcv.sb_sel.si_note, 0);

	so->so_state &= ~SS_COMP;
	so->so_head = NULL;
	if (head->so_sigio != NULL)
		fsetown(fgetown(head->so_sigio), &so->so_sigio);

	soref(so);			/* file descriptor reference */
	nfp->f_data = (caddr_t)so;	/* nfp has ref count from falloc */
	nfp->f_flag = fflag;
	nfp->f_ops = &socketops;
	nfp->f_type = DTYPE_SOCKET;
	sa = 0;
	error = soaccept(so, &sa);
	if (error) {
		/*
		 * return a namelen of zero for older code which might
		 * ignore the return value from accept.
		 */
		if (uap->name != NULL) {
			namelen = 0;
			(void) copyout((caddr_t)&namelen,
			    (caddr_t)uap->anamelen, sizeof(*uap->anamelen));
		}
		goto noconnection;
	}
	if (sa == NULL) {
		namelen = 0;
		if (uap->name)
			goto gotnoname;
		splx(s);
		error = 0;
		goto done;
	}
	if (uap->name) {
		/* check sa_len before it is destroyed */
		if (namelen > sa->sa_len)
			namelen = sa->sa_len;
#ifdef COMPAT_OLDSOCK
		if (compat)
			((struct osockaddr *)sa)->sa_family =
			    sa->sa_family;
#endif
		error = copyout(sa, (caddr_t)uap->name, (u_int)namelen);
		if (!error)
gotnoname:
			error = copyout((caddr_t)&namelen,
			    (caddr_t)uap->anamelen, sizeof (*uap->anamelen));
	}
noconnection:
	if (sa)
		FREE(sa, M_SONAME);

	/*
	 * close the new descriptor, assuming someone hasn't ripped it
	 * out from under us.
	 */
	if (error) {
		if (fdp->fd_ofiles[fd] == nfp) {
			fdp->fd_ofiles[fd] = NULL;
			fdrop(nfp, td);
		}
	}
	splx(s);

	/*
	 * Release explicitly held references before returning.
	 */
done:
	if (nfp != NULL)
		fdrop(nfp, td);
	fputsock(head);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * MPSAFE (accept1() is MPSAFE)
 */
int
accept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{

	return (accept1(td, uap, 0));
}

#ifdef COMPAT_OLDSOCK
/*
 * MPSAFE (accept1() is MPSAFE)
 */
int
oaccept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{

	return (accept1(td, uap, 1));
}
#endif /* COMPAT_OLDSOCK */

/*
 * MPSAFE
 */
/* ARGSUSED */
int
connect(td, uap)
	struct thread *td;
	register struct connect_args /* {
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct socket *so;
	struct sockaddr *sa;
	int error, s;

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
		goto done2;
	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
		error = EALREADY;
		goto done1;
	}
	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error)
		goto done1;
	error = soconnect(so, sa, td);
	if (error)
		goto bad;
	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
		FREE(sa, M_SONAME);
		error = EINPROGRESS;
		goto done1;
	}
	s = splnet();
	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
		error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH, "connec", 0);
		if (error)
			break;
	}
	if (error == 0) {
		error = so->so_error;
		so->so_error = 0;
	}
	splx(s);
bad:
	so->so_state &= ~SS_ISCONNECTING;
	FREE(sa, M_SONAME);
	if (error == ERESTART)
		error = EINTR;
done1:
	fputsock(so);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * MPSAFE
 */
int
socketpair(td, uap)
	struct thread *td;
	register struct socketpair_args /* {
		int	domain;
		int	type;
		int	protocol;
		int	*rsv;
	} */ *uap;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	struct file *fp1, *fp2;
	struct socket *so1, *so2;
	int fd, error, sv[2];

	mtx_lock(&Giant);
	error = socreate(uap->domain, &so1, uap->type, uap->protocol, td);
	if (error)
		goto done2;
	error = socreate(uap->domain, &so2, uap->type, uap->protocol, td);
	if (error)
		goto free1;
	error = falloc(td, &fp1, &fd);
	if (error)
		goto free2;
	fhold(fp1);
	sv[0] = fd;
	fp1->f_data = (caddr_t)so1;	/* so1 already has ref count */
	error = falloc(td, &fp2, &fd);
	if (error)
		goto free3;
	fhold(fp2);
	fp2->f_data = (caddr_t)so2;	/* so2 already has ref count */
	sv[1] = fd;
	error = soconnect2(so1, so2);
	if (error)
		goto free4;
	if (uap->type == SOCK_DGRAM) {
		/*
		 * Datagram socket connection is asymmetric.
		 */
		error = soconnect2(so2, so1);
		if (error)
			goto free4;
	}
	fp1->f_flag = fp2->f_flag = FREAD|FWRITE;
	fp1->f_ops = fp2->f_ops = &socketops;
	fp1->f_type = fp2->f_type = DTYPE_SOCKET;
	error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int));
	fdrop(fp1, td);
	fdrop(fp2, td);
	goto done2;
free4:
	if (fdp->fd_ofiles[sv[1]] == fp2) {
		fdp->fd_ofiles[sv[1]] = NULL;
		fdrop(fp2, td);
	}
	fdrop(fp2, td);
free3:
	if (fdp->fd_ofiles[sv[0]] == fp1) {
		fdp->fd_ofiles[sv[0]] = NULL;
		fdrop(fp1, td);
	}
	fdrop(fp1, td);
free2:
	(void)soclose(so2);
free1:
	(void)soclose(so1);
done2:
	mtx_unlock(&Giant);
	return (error);
}

static int
sendit(td, s, mp, flags)
	register struct thread *td;
	int s;
	register struct msghdr *mp;
	int flags;
{
	struct uio auio;
	register struct iovec *iov;
	register int i;
	struct mbuf *control;
	struct sockaddr *to = NULL;
	int len, error;
	struct socket *so;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	if ((error = fgetsock(td, s, &so, NULL)) != 0)
		return (error);
	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			error = EINVAL;
			goto bad;
		}
	}
	if (mp->msg_name) {
		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
		if (error)
			goto bad;
	}
	if (mp->msg_control) {
		if (mp->msg_controllen < sizeof(struct cmsghdr)
#ifdef COMPAT_OLDSOCK
		    && mp->msg_flags != MSG_COMPAT
#endif
		) {
			error = EINVAL;
			goto bad;
		}
		error = sockargs(&control, mp->msg_control,
		    mp->msg_controllen, MT_CONTROL);
		if (error)
			goto bad;
#ifdef COMPAT_OLDSOCK
		if (mp->msg_flags == MSG_COMPAT) {
			register struct cmsghdr *cm;

			M_PREPEND(control, sizeof(*cm), M_TRYWAIT);
			if (control == 0) {
				error = ENOBUFS;
				goto bad;
			} else {
				cm = mtod(control, struct cmsghdr *);
				cm->cmsg_len = control->m_len;
				cm->cmsg_level = SOL_SOCKET;
				cm->cmsg_type = SCM_RIGHTS;
			}
		}
#endif
	} else {
		control = 0;
	}
#ifdef KTRACE
	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = auio;
	}
#endif
	len = auio.uio_resid;
	error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control,
	    flags, td);
	if (error) {
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	if (error == 0)
		td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = td->td_retval[0];
			ktrgenio(td->td_proc->p_tracep, s, UIO_WRITE, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
bad:
	fputsock(so);
	if (to)
		FREE(to, M_SONAME);
	return (error);
}

/*
 * MPSAFE
 */
int
sendto(td, uap)
	struct thread *td;
	register struct sendto_args /* {
		int	s;
		caddr_t	buf;
		size_t	len;
		int	flags;
		caddr_t	to;
		int	tolen;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;
	int error;

	msg.msg_name = uap->to;
	msg.msg_namelen = uap->tolen;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	msg.msg_control = 0;
#ifdef COMPAT_OLDSOCK
	msg.msg_flags = 0;
#endif
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	mtx_lock(&Giant);
	error = sendit(td, uap->s, &msg, uap->flags);
	mtx_unlock(&Giant);
	return (error);
}

#ifdef COMPAT_OLDSOCK
/*
 * MPSAFE
 */
int
osend(td, uap)
	struct thread *td;
	register struct osend_args /* {
		int	s;
		caddr_t	buf;
		int	len;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;
	int error;

	msg.msg_name = 0;
	msg.msg_namelen = 0;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	msg.msg_control = 0;
	msg.msg_flags = 0;
	mtx_lock(&Giant);
	error = sendit(td, uap->s, &msg, uap->flags);
	mtx_unlock(&Giant);
	return (error);
}

/*
 * MPSAFE
 */
int
osendmsg(td, uap)
	struct thread *td;
	register struct osendmsg_args /* {
		int	s;
		caddr_t	msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov[UIO_SMALLIOV], *iov;
	int error;

	mtx_lock(&Giant);
	error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr));
	if (error)
		goto done2;
	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
			error = EMSGSIZE;
			goto done2;
		}
		MALLOC(iov, struct iovec *,
		    sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
		    M_WAITOK);
	} else {
		iov = aiov;
	}
	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
	if (error)
		goto done;
	msg.msg_flags = MSG_COMPAT;
	msg.msg_iov = iov;
	error = sendit(td, uap->s, &msg, uap->flags);
done:
	if (iov != aiov)
		FREE(iov, M_IOV);
done2:
	mtx_unlock(&Giant);
	return (error);
}
#endif

/*
 * MPSAFE
 */
int
sendmsg(td, uap)
	struct thread *td;
	register struct sendmsg_args /* {
		int	s;
		caddr_t	msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov[UIO_SMALLIOV], *iov;
	int error;

	mtx_lock(&Giant);
	error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg));
	if (error)
		goto done2;
	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
			error = EMSGSIZE;
			goto done2;
		}
		MALLOC(iov, struct iovec *,
		    sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
		    M_WAITOK);
	} else {
		iov = aiov;
	}
	if (msg.msg_iovlen &&
	    (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
		goto done;
	msg.msg_iov = iov;
#ifdef COMPAT_OLDSOCK
	msg.msg_flags = 0;
#endif
	error = sendit(td, uap->s, &msg, uap->flags);
done:
	if (iov != aiov)
		FREE(iov, M_IOV);
done2:
	mtx_unlock(&Giant);
	return (error);
}

static int
recvit(td, s, mp, namelenp)
	register struct thread *td;
	int s;
	register struct msghdr *mp;
	caddr_t namelenp;
{
	struct uio auio;
	register struct iovec *iov;
	register int i;
	int len, error;
	struct mbuf *m, *control = 0;
	caddr_t ctlbuf;
	struct socket *so;
	struct sockaddr *fromsa = 0;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	if ((error = fgetsock(td, s, &so, NULL)) != 0)
		return (error);
	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			fputsock(so);
			return (EINVAL);
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td->td_proc, KTR_GENIO)) {
		int iovlen = auio.uio_iovcnt * sizeof (struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = auio;
	}
#endif
	len = auio.uio_resid;
	error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
	    (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
	    &mp->msg_flags);
	if (error) {
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio.uio_resid;
			ktrgenio(td->td_proc->p_tracep, s, UIO_READ, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error)
		goto out;
	td->td_retval[0] = len - auio.uio_resid;
	if (mp->msg_name) {
		len = mp->msg_namelen;
		if (len <= 0 || fromsa == 0)
			len = 0;
		else {
#ifndef MIN
#define MIN(a,b) ((a)>(b)?(b):(a))
#endif
			/* save sa_len before it is destroyed by MSG_COMPAT */
			len = MIN(len, fromsa->sa_len);
#ifdef COMPAT_OLDSOCK
			if (mp->msg_flags & MSG_COMPAT)
				((struct osockaddr *)fromsa)->sa_family =
				    fromsa->sa_family;
#endif
			error = copyout(fromsa,
			    (caddr_t)mp->msg_name, (unsigned)len);
			if (error)
				goto out;
		}
		mp->msg_namelen = len;
		if (namelenp &&
		    (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) {
#ifdef COMPAT_OLDSOCK
			if (mp->msg_flags & MSG_COMPAT)
				error = 0;	/* old recvfrom didn't check */
			else
#endif
				goto out;
		}
	}
	if (mp->msg_control) {
#ifdef COMPAT_OLDSOCK
		/*
		 * We assume that old recvmsg calls won't receive access
		 * rights and other control info, esp. as control info
		 * is always optional and those options didn't exist in 4.3.
		 * If we receive rights, trim the cmsghdr; anything else
		 * is tossed.
		 */
		if (control && mp->msg_flags & MSG_COMPAT) {
			if (mtod(control, struct cmsghdr *)->cmsg_level !=
			    SOL_SOCKET ||
			    mtod(control, struct cmsghdr *)->cmsg_type !=
			    SCM_RIGHTS) {
				mp->msg_controllen = 0;
				goto out;
			}
			control->m_len -= sizeof (struct cmsghdr);
			control->m_data += sizeof (struct cmsghdr);
		}
#endif
		len = mp->msg_controllen;
		m = control;
		mp->msg_controllen = 0;
		ctlbuf = (caddr_t) mp->msg_control;

		while (m && len > 0) {
			unsigned int tocopy;

			if (len >= m->m_len)
				tocopy = m->m_len;
			else {
				mp->msg_flags |= MSG_CTRUNC;
				tocopy = len;
			}

			if ((error = copyout((caddr_t)mtod(m, caddr_t),
			    ctlbuf, tocopy)) != 0)
				goto out;

			ctlbuf += tocopy;
			len -= tocopy;
			m = m->m_next;
		}
		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
	}
out:
	fputsock(so);
	if (fromsa)
		FREE(fromsa, M_SONAME);
	if (control)
		m_freem(control);
	return (error);
}

/*
 * MPSAFE
 */
int
recvfrom(td, uap)
	struct thread *td;
	register struct recvfrom_args /* {
		int	s;
		caddr_t	buf;
		size_t	len;
		int	flags;
		caddr_t	from;
		int	*fromlenaddr;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;
	int error;

	mtx_lock(&Giant);
	if (uap->fromlenaddr) {
		error = copyin((caddr_t)uap->fromlenaddr,
		    (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen));
		if (error)
			goto done2;
	} else {
		msg.msg_namelen = 0;
	}
	msg.msg_name = uap->from;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	msg.msg_control = 0;
	msg.msg_flags = uap->flags;
	error = recvit(td, uap->s, &msg, (caddr_t)uap->fromlenaddr);
done2:
	mtx_unlock(&Giant);
	return (error);
}

#ifdef COMPAT_OLDSOCK
/*
 * MPSAFE
 */
int
orecvfrom(td, uap)
	struct thread *td;
	struct recvfrom_args *uap;
{

	uap->flags |= MSG_COMPAT;
	return (recvfrom(td, uap));
}
#endif


#ifdef COMPAT_OLDSOCK
/*
 * MPSAFE
 */
int
orecv(td, uap)
	struct thread *td;
	register struct orecv_args /* {
		int	s;
		caddr_t	buf;
		int	len;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;
	int error;

	mtx_lock(&Giant);
	msg.msg_name = 0;
	msg.msg_namelen = 0;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	msg.msg_control = 0;
	msg.msg_flags = uap->flags;
	error = recvit(td, uap->s, &msg, (caddr_t)0);
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Old recvmsg.  This code takes advantage of the fact that the old msghdr
 * overlays the new one, missing only the flags, and with the (old) access
 * rights where the control fields are now.
 *
 * MPSAFE
 */
int
orecvmsg(td, uap)
	struct thread *td;
	register struct orecvmsg_args /* {
		int	s;
		struct	omsghdr *msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov[UIO_SMALLIOV], *iov;
	int error;

	error = copyin((caddr_t)uap->msg, (caddr_t)&msg,
	    sizeof (struct omsghdr));
	if (error)
		return (error);

	mtx_lock(&Giant);
	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
			error = EMSGSIZE;
			goto done2;
		}
		MALLOC(iov, struct iovec *,
		    sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
		    M_WAITOK);
	} else {
		iov = aiov;
	}
	msg.msg_flags = uap->flags | MSG_COMPAT;
	error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
	if (error)
		goto done;
	msg.msg_iov = iov;
	error = recvit(td, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen);

	if (msg.msg_controllen && error == 0)
		error = copyout((caddr_t)&msg.msg_controllen,
		    (caddr_t)&uap->msg->msg_accrightslen, sizeof (int));
done:
	if (iov != aiov)
		FREE(iov, M_IOV);
done2:
	mtx_unlock(&Giant);
	return (error);
}
#endif

/*
 * MPSAFE
 */
int
recvmsg(td, uap)
	struct thread *td;
	register struct recvmsg_args /* {
		int	s;
		struct	msghdr *msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
	register int error;

	mtx_lock(&Giant);
	error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg));
	if (error)
		goto done2;
	if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
		if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) {
			error = EMSGSIZE;
			goto done2;
		}
		MALLOC(iov, struct iovec *,
		    sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
		    M_WAITOK);
	} else {
		iov = aiov;
	}
#ifdef COMPAT_OLDSOCK
	msg.msg_flags = uap->flags &~ MSG_COMPAT;
#else
	msg.msg_flags = uap->flags;
#endif
	uiov = msg.msg_iov;
	msg.msg_iov = iov;
	error = copyin((caddr_t)uiov, (caddr_t)iov,
	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
	if (error)
		goto done;
	error = recvit(td, uap->s, &msg, (caddr_t)0);
	if (!error) {
		msg.msg_iov = uiov;
		error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg));
	}
done:
	if (iov != aiov)
		FREE(iov, M_IOV);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * MPSAFE
 */
/* ARGSUSED */
int
shutdown(td, uap)
	struct thread *td;
	register struct shutdown_args /* {
		int	s;
		int	how;
	} */ *uap;
{
	struct socket *so;
	int error;

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
		error = soshutdown(so, uap->how);
		fputsock(so);
	}
	mtx_unlock(&Giant);
	return (error);
}

/*
 * MPSAFE
 */
/* ARGSUSED */
int
setsockopt(td, uap)
	struct thread *td;
	register struct setsockopt_args /* {
		int	s;
		int	level;
		int	name;
		caddr_t	val;
		int	valsize;
	} */ *uap;
{
	struct socket *so;
	struct sockopt sopt;
	int error;

	if (uap->val == 0 && uap->valsize != 0)
		return (EFAULT);
	if (uap->valsize < 0)
		return (EINVAL);

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
		sopt.sopt_dir = SOPT_SET;
		sopt.sopt_level = uap->level;
		sopt.sopt_name = uap->name;
		sopt.sopt_val = uap->val;
		sopt.sopt_valsize = uap->valsize;
		sopt.sopt_td = td;
		error = sosetopt(so, &sopt);
		fputsock(so);
	}
	mtx_unlock(&Giant);
	return (error);
}

/*
 * MPSAFE
 */
/* ARGSUSED */
int
getsockopt(td, uap)
	struct thread *td;
	register struct getsockopt_args /* {
		int	s;
		int	level;
		int	name;
		caddr_t	val;
		int	*avalsize;
	} */ *uap;
{
	int valsize, error;
	struct socket *so;
	struct sockopt sopt;

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
		goto done2;
	if (uap->val) {
		error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize,
		    sizeof (valsize));
		if (error)
			goto done1;
		if (valsize < 0) {
			error = EINVAL;
			goto done1;
		}
	} else {
		valsize = 0;
	}

	sopt.sopt_dir = SOPT_GET;
	sopt.sopt_level = uap->level;
	sopt.sopt_name = uap->name;
	sopt.sopt_val = uap->val;
	sopt.sopt_valsize = (size_t)valsize;	/* checked non-negative above */
	sopt.sopt_td = td;

	error = sogetopt(so, &sopt);
	if (error == 0) {
		valsize = sopt.sopt_valsize;
		error = copyout((caddr_t)&valsize,
		    (caddr_t)uap->avalsize, sizeof (valsize));
	}
done1:
	fputsock(so);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * getsockname1() - Get socket name.
 *
 * MPSAFE
 */
/* ARGSUSED */
static int
getsockname1(td, uap, compat)
	struct thread *td;
	register struct getsockname_args /* {
		int	fdes;
		caddr_t	asa;
		int	*alen;
	} */ *uap;
	int compat;
{
	struct socket *so;
	struct sockaddr *sa;
	int len, error;

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0)
		goto done2;
	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
	if (error)
		goto done1;
	sa = 0;
	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
	if (error)
		goto bad;
	if (sa == 0) {
		len = 0;
		goto gotnothing;
	}

	len = MIN(len, sa->sa_len);
#ifdef COMPAT_OLDSOCK
	if (compat)
		((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
	if (error == 0)
gotnothing:
		error = copyout((caddr_t)&len, (caddr_t)uap->alen,
		    sizeof (len));
bad:
	if (sa)
		FREE(sa, M_SONAME);
done1:
	fputsock(so);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * MPSAFE
 */
int
getsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{

	return (getsockname1(td, uap, 0));
}

#ifdef COMPAT_OLDSOCK
/*
 * MPSAFE
 */
int
ogetsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{

	return (getsockname1(td, uap, 1));
}
#endif /* COMPAT_OLDSOCK */

/*
 * getpeername1() - Get name of peer for connected socket.
 *
 * MPSAFE
 */
/* ARGSUSED */
static int
getpeername1(td, uap, compat)
	struct thread *td;
	register struct getpeername_args /* {
		int	fdes;
		caddr_t	asa;
		int	*alen;
	} */ *uap;
	int compat;
{
	struct socket *so;
	struct sockaddr *sa;
	int len, error;

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0)
		goto done2;
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
		error = ENOTCONN;
		goto done1;
	}
	error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
	if (error)
		goto done1;
	sa = 0;
	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
	if (error)
		goto bad;
	if (sa == 0) {
		len = 0;
		goto gotnothing;
	}
	len = MIN(len, sa->sa_len);
#ifdef COMPAT_OLDSOCK
	if (compat)
		((struct osockaddr *)sa)->sa_family =
		    sa->sa_family;
#endif
	error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
	if (error)
		goto bad;
gotnothing:
	error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len));
bad:
	if (sa)
		FREE(sa, M_SONAME);
done1:
	fputsock(so);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * MPSAFE
 */
int
getpeername(td, uap)
	struct thread *td;
	struct getpeername_args *uap;
{

	return (getpeername1(td, uap, 0));
}

#ifdef COMPAT_OLDSOCK
/*
 * MPSAFE
 */
int
ogetpeername(td, uap)
	struct thread *td;
	struct ogetpeername_args *uap;
{

	/* XXX uap should have type `getpeername_args *' to begin with. */
	return (getpeername1(td, (struct getpeername_args *)uap, 1));
}
#endif /* COMPAT_OLDSOCK */

int
sockargs(mp, buf, buflen, type)
	struct mbuf **mp;
	caddr_t buf;
	int buflen, type;
{
	register struct sockaddr *sa;
	register struct mbuf *m;
	int error;

	if ((u_int)buflen > MLEN) {
#ifdef COMPAT_OLDSOCK
		if (type == MT_SONAME && (u_int)buflen <= 112)
			buflen = MLEN;		/* unix domain compat. hack */
		else
#endif
			return (EINVAL);
	}
	m = m_get(M_TRYWAIT, type);
	if (m == NULL)
		return (ENOBUFS);
	m->m_len = buflen;
	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
	if (error)
		(void) m_free(m);
	else {
		*mp = m;
		if (type == MT_SONAME) {
			sa = mtod(m, struct sockaddr *);

#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
				sa->sa_family = sa->sa_len;
#endif
			sa->sa_len = buflen;
		}
	}
	return (error);
}

int
getsockaddr(namp, uaddr, len)
	struct sockaddr **namp;
	caddr_t uaddr;
	size_t len;
{
	struct sockaddr *sa;
	int error;

	if (len > SOCK_MAXADDRLEN)
		return (ENAMETOOLONG);
	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
	error = copyin(uaddr, sa, len);
	if (error) {
		FREE(sa, M_SONAME);
	} else {
#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
			sa->sa_family = sa->sa_len;
#endif
		sa->sa_len = len;
		*namp = sa;
	}
	return (error);
}

/*
 * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
 * XXX - The sf_buf functions are currently private to sendfile(2), so have
 * been made static, but may be useful in the future for doing zero-copy in
 * other parts of the networking code.
 */
static void
sf_buf_init(void *arg)
{
	int i;

	mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", MTX_DEF);
	mtx_lock(&sf_freelist.sf_lock);
	SLIST_INIT(&sf_freelist.sf_head);
	sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
	sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP,
	    M_NOWAIT | M_ZERO);
	for (i = 0; i < nsfbufs; i++) {
		sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
		SLIST_INSERT_HEAD(&sf_freelist.sf_head, &sf_bufs[i], free_list);
	}
	sf_buf_alloc_want = 0;
	mtx_unlock(&sf_freelist.sf_lock);
}

/*
 * Get an sf_buf from the freelist.  Will block if none are available.
 */
static struct sf_buf *
sf_buf_alloc()
{
	struct sf_buf *sf;
	int error;

	mtx_lock(&sf_freelist.sf_lock);
	while ((sf = SLIST_FIRST(&sf_freelist.sf_head)) == NULL) {
		sf_buf_alloc_want++;
		error = msleep(&sf_freelist, &sf_freelist.sf_lock, PVM|PCATCH,
		    "sfbufa", 0);
		sf_buf_alloc_want--;

		/*
		 * If we got a signal, don't risk going back to sleep.
		 */
		if (error)
			break;
	}
	if (sf != NULL)
		SLIST_REMOVE_HEAD(&sf_freelist.sf_head, free_list);
	mtx_unlock(&sf_freelist.sf_lock);
	return (sf);
}

#define dtosf(x)	(&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])

/*
 * Detach mapped page and release resources back to the system.
 */
static void
sf_buf_free(caddr_t addr, void *args)
{
	struct sf_buf *sf;
	struct vm_page *m;

	GIANT_REQUIRED;

	sf = dtosf(addr);
	pmap_qremove((vm_offset_t)addr, 1);
	m = sf->m;
	vm_page_unwire(m, 0);
	/*
	 * Check for the object going away on us.  This can
	 * happen since we don't hold a reference to it.
	 * If so, we're responsible for freeing the page.
	 */
	if (m->wire_count == 0 && m->object == NULL)
		vm_page_free(m);
	sf->m = NULL;
	mtx_lock(&sf_freelist.sf_lock);
	SLIST_INSERT_HEAD(&sf_freelist.sf_head, sf, free_list);
	if (sf_buf_alloc_want > 0)
		wakeup_one(&sf_freelist);
	mtx_unlock(&sf_freelist.sf_lock);
}

/*
 * sendfile(2)
 *
 * MPSAFE
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *		struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Send a file specified by 'fd' and starting at 'offset' to a socket
 * specified by 's'.  Send only 'nbytes' of the file or until EOF if
 * nbytes == 0.  Optionally add a header and/or trailer to the socket
 * output.  If specified, write the total number of bytes sent into *sbytes.
 *
 */
int
sendfile(struct thread *td, struct sendfile_args *uap)
{
	struct vnode *vp;
	struct vm_object *obj;
	struct socket *so = NULL;
	struct mbuf *m;
	struct sf_buf *sf;
	struct vm_page *pg;
	struct writev_args nuap;
	struct sf_hdtr hdtr;
	off_t off, xfsize, sbytes = 0;
	int error, s;

	mtx_lock(&Giant);

	/*
	 * The descriptor must be a regular file and have a backing VM object.
	 */
	if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
		goto done;
	if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) {
		error = EINVAL;
		goto done;
	}
	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
		goto done;
	if (so->so_type != SOCK_STREAM) {
		error = EINVAL;
		goto done;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto done;
	}
	if (uap->offset < 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If specified, get the pointer to the sf_hdtr struct for
	 * any headers/trailers.
	 */
	if (uap->hdtr != NULL) {
		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
		if (error)
			goto done;
		/*
		 * Send any headers.  Wimp out and use writev(2).
		 */
		if (hdtr.headers != NULL) {
			nuap.fd = uap->s;
			nuap.iovp = hdtr.headers;
			nuap.iovcnt = hdtr.hdr_cnt;
			error = writev(td, &nuap);
			if (error)
				goto done;
			sbytes += td->td_retval[0];
		}
	}

	/*
	 * Protect against multiple writers to the socket.
	 */
	(void) sblock(&so->so_snd, M_WAITOK);

	/*
	 * Loop through the pages in the file, starting with the requested
	 * offset.  Get a file page (do I/O if necessary), map the file page
	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
	 * it on the socket.
	 */
	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
		vm_pindex_t pindex;
		vm_offset_t pgoff;

		pindex = OFF_TO_IDX(off);
retry_lookup:
		/*
		 * Calculate the amount to transfer.  Not to exceed a page,
		 * the EOF, or the passed in nbytes.
		 */
		xfsize = obj->un_pager.vnp.vnp_size - off;
		if (xfsize > PAGE_SIZE)
			xfsize = PAGE_SIZE;
		pgoff = (vm_offset_t)(off & PAGE_MASK);
		if (PAGE_SIZE - pgoff < xfsize)
			xfsize = PAGE_SIZE - pgoff;
		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
			xfsize = uap->nbytes - sbytes;
		if (xfsize <= 0)
			break;
		/*
		 * Optimize the non-blocking case by looking at the socket space
		 * before going to the extra work of constituting the sf_buf.
		 */
		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
			if (so->so_state & SS_CANTSENDMORE)
				error = EPIPE;
			else
				error = EAGAIN;
			sbunlock(&so->so_snd);
			goto done;
		}
		/*
		 * Attempt to look up the page.
		 *
		 *	Allocate if not found.
		 *
		 *	Wait and loop if busy.
		 */
		pg = vm_page_lookup(obj, pindex);

		if (pg == NULL) {
			pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
			if (pg == NULL) {
				VM_WAIT;
				goto retry_lookup;
			}
			vm_page_wakeup(pg);
		} else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) {
			goto retry_lookup;
		}

		/*
		 * Wire the page so it does not get ripped out from under
		 * us.
		 */
		vm_page_wire(pg);

		/*
		 * If page is not valid for what we need, initiate I/O.
		 */
		if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
			struct uio auio;
			struct iovec aiov;
			int bsize;

			/*
			 * Ensure that our page is still around when the I/O
			 * completes.
			 */
			vm_page_io_start(pg);

			/*
			 * Get the page from backing store.
			 */
			bsize = vp->v_mount->mnt_stat.f_iosize;
			auio.uio_iov = &aiov;
			auio.uio_iovcnt = 1;
			aiov.iov_base = 0;
			aiov.iov_len = MAXBSIZE;
			auio.uio_resid = MAXBSIZE;
			auio.uio_offset = trunc_page(off);
			auio.uio_segflg = UIO_NOCOPY;
			auio.uio_rw = UIO_READ;
			auio.uio_td = td;
			vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td);
			error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16),
			    td->td_proc->p_ucred);
			VOP_UNLOCK(vp, 0, td);
			vm_page_flag_clear(pg, PG_ZERO);
			vm_page_io_finish(pg);
			if (error) {
				vm_page_unwire(pg, 0);
				/*
				 * See if anyone else might know about this page.
				 * If not and it is not valid, then free it.
				 */
				if (pg->wire_count == 0 && pg->valid == 0 &&
				    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
				    pg->hold_count == 0) {
					vm_page_busy(pg);
					vm_page_free(pg);
				}
				sbunlock(&so->so_snd);
				goto done;
			}
		}

		/*
		 * Get a sendfile buf.  We usually wait as long as necessary,
		 * but this wait can be interrupted.
		 */
		if ((sf = sf_buf_alloc()) == NULL) {
			vm_page_unwire(pg, 0);
			if (pg->wire_count == 0 && pg->object == NULL)
				vm_page_free(pg);
			sbunlock(&so->so_snd);
			error = EINTR;
			goto done;
		}

		/*
		 * Allocate a kernel virtual page and insert the physical page
		 * into it.
		 */
		sf->m = pg;
		pmap_qenter(sf->kva, &pg, 1);
		/*
		 * Get an mbuf header and set it up as having external storage.
		 */
		MGETHDR(m, M_TRYWAIT, MT_DATA);
		if (m == NULL) {
			error = ENOBUFS;
			sf_buf_free((void *)sf->kva, NULL);
			sbunlock(&so->so_snd);
			goto done;
		}
		/*
		 * Setup external storage for mbuf.
		 */
		MEXTADD(m, sf->kva, PAGE_SIZE, sf_buf_free, NULL, M_RDONLY,
		    EXT_SFBUF);
		m->m_data = (char *) sf->kva + pgoff;
		m->m_pkthdr.len = m->m_len = xfsize;
		/*
		 * Add the buffer to the socket buffer chain.
		 */
		s = splnet();
retry_space:
		/*
		 * Make sure that the socket is still able to take more data.
		 * CANTSENDMORE being true usually means that the connection
		 * was closed.  so_error is true when an error was sensed after
		 * a previous send.
		 * The state is checked after the page mapping and buffer
		 * allocation above since those operations may block and make
		 * any socket checks stale.  From this point forward, nothing
		 * blocks before the pru_send (or more accurately, any blocking
		 * results in a loop back to here to re-check).
		 */
		if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
			if (so->so_state & SS_CANTSENDMORE) {
				error = EPIPE;
			} else {
				error = so->so_error;
				so->so_error = 0;
			}
			m_freem(m);
			sbunlock(&so->so_snd);
			splx(s);
			goto done;
		}
		/*
		 * Wait for socket space to become available.  We do this just
		 * after checking the connection state above in order to avoid
		 * a race condition with sbwait().
		 */
		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
			if (so->so_state & SS_NBIO) {
				m_freem(m);
				sbunlock(&so->so_snd);
				splx(s);
				error = EAGAIN;
				goto done;
			}
			error = sbwait(&so->so_snd);
			/*
			 * An error from sbwait usually indicates that we've
			 * been interrupted by a signal.  If we've sent anything
			 * then return bytes sent, otherwise return the error.
			 */
			if (error) {
				m_freem(m);
				sbunlock(&so->so_snd);
				splx(s);
				goto done;
			}
			goto retry_space;
		}
		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td);
		splx(s);
		if (error) {
			sbunlock(&so->so_snd);
			goto done;
		}
	}
	sbunlock(&so->so_snd);

	/*
	 * Send trailers.  Wimp out and use writev(2).
	 */
	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
		nuap.fd = uap->s;
		nuap.iovp = hdtr.trailers;
		nuap.iovcnt = hdtr.trl_cnt;
		error = writev(td, &nuap);
		if (error)
			goto done;
		sbytes += td->td_retval[0];
	}

done:
	/*
	 * If there was no error we have to clear td->td_retval[0]
	 * because it may have been set by writev.
	 */
	if (error == 0) {
		td->td_retval[0] = 0;
	}
	if (uap->sbytes != NULL) {
		copyout(&sbytes, uap->sbytes, sizeof(off_t));
	}
	if (vp)
		vrele(vp);
	if (so)
		fputsock(so);
	mtx_unlock(&Giant);
	return (error);
}