1 /* 2 * Copyright (c) 1982, 1986, 1989, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * sendfile(2) and related extensions: 6 * Copyright (c) 1998, David Greenman. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 37 * $FreeBSD$ 38 */ 39 40 #include "opt_compat.h" 41 #include "opt_ktrace.h" 42 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/kernel.h> 46 #include <sys/lock.h> 47 #include <sys/mutex.h> 48 #include <sys/sysproto.h> 49 #include <sys/malloc.h> 50 #include <sys/filedesc.h> 51 #include <sys/event.h> 52 #include <sys/proc.h> 53 #include <sys/fcntl.h> 54 #include <sys/file.h> 55 #include <sys/lock.h> 56 #include <sys/mount.h> 57 #include <sys/mbuf.h> 58 #include <sys/protosw.h> 59 #include <sys/socket.h> 60 #include <sys/socketvar.h> 61 #include <sys/signalvar.h> 62 #include <sys/uio.h> 63 #include <sys/vnode.h> 64 #ifdef KTRACE 65 #include <sys/ktrace.h> 66 #endif 67 68 #include <vm/vm.h> 69 #include <vm/vm_object.h> 70 #include <vm/vm_page.h> 71 #include <vm/vm_pageout.h> 72 #include <vm/vm_kern.h> 73 #include <vm/vm_extern.h> 74 75 static void sf_buf_init(void *arg); 76 SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL) 77 static struct sf_buf *sf_buf_alloc(void); 78 static void sf_buf_free(caddr_t addr, void *args); 79 80 static int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags)); 81 static int recvit __P((struct proc *p, int s, struct msghdr *mp, 82 caddr_t namelenp)); 83 84 static int accept1 __P((struct proc *p, struct accept_args *uap, int compat)); 85 static int getsockname1 __P((struct proc *p, struct getsockname_args *uap, 86 int compat)); 87 static int getpeername1 __P((struct proc *p, struct getpeername_args *uap, 88 int compat)); 89 90 /* 91 * Expanded sf_freelist head. Really an SLIST_HEAD() in disguise, with the 92 * sf_freelist head with the sf_lock mutex. 93 */ 94 static struct { 95 SLIST_HEAD(, sf_buf) sf_head; 96 struct mtx sf_lock; 97 } sf_freelist; 98 99 static vm_offset_t sf_base; 100 static struct sf_buf *sf_bufs; 101 static u_int sf_buf_alloc_want; 102 103 /* 104 * System call interface to the socket abstraction. 105 */ 106 #if defined(COMPAT_43) || defined(COMPAT_SUNOS) 107 #define COMPAT_OLDSOCK 108 #endif 109 110 extern struct fileops socketops; 111 112 int 113 socket(p, uap) 114 struct proc *p; 115 register struct socket_args /* { 116 int domain; 117 int type; 118 int protocol; 119 } */ *uap; 120 { 121 struct filedesc *fdp = p->p_fd; 122 struct socket *so; 123 struct file *fp; 124 int fd, error; 125 126 error = falloc(p, &fp, &fd); 127 if (error) 128 return (error); 129 fhold(fp); 130 error = socreate(uap->domain, &so, uap->type, uap->protocol, p); 131 if (error) { 132 if (fdp->fd_ofiles[fd] == fp) { 133 fdp->fd_ofiles[fd] = NULL; 134 fdrop(fp, p); 135 } 136 } else { 137 fp->f_data = (caddr_t)so; 138 fp->f_flag = FREAD|FWRITE; 139 fp->f_ops = &socketops; 140 fp->f_type = DTYPE_SOCKET; 141 p->p_retval[0] = fd; 142 } 143 fdrop(fp, p); 144 return (error); 145 } 146 147 /* ARGSUSED */ 148 int 149 bind(p, uap) 150 struct proc *p; 151 register struct bind_args /* { 152 int s; 153 caddr_t name; 154 int namelen; 155 } */ *uap; 156 { 157 struct file *fp; 158 struct sockaddr *sa; 159 int error; 160 161 error = holdsock(p->p_fd, uap->s, &fp); 162 if (error) 163 return (error); 164 error = getsockaddr(&sa, uap->name, uap->namelen); 165 if (error) { 166 fdrop(fp, p); 167 return (error); 168 } 169 error = sobind((struct socket *)fp->f_data, sa, p); 170 FREE(sa, M_SONAME); 171 fdrop(fp, p); 172 return (error); 173 } 174 175 /* ARGSUSED */ 176 int 177 listen(p, uap) 178 struct proc *p; 179 register struct listen_args /* { 180 int s; 181 int backlog; 182 } */ *uap; 183 { 184 struct file *fp; 185 int error; 186 187 error = holdsock(p->p_fd, uap->s, &fp); 188 if (error) 189 return (error); 190 error = solisten((struct socket *)fp->f_data, uap->backlog, p); 191 fdrop(fp, p); 192 return(error); 193 } 194 195 static int 196 accept1(p, uap, compat) 197 struct proc *p; 198 register struct accept_args /* { 199 int s; 200 caddr_t name; 201 int *anamelen; 202 } */ *uap; 203 int compat; 204 { 205 struct filedesc *fdp = p->p_fd; 206 struct file *lfp = NULL; 207 struct file *nfp = NULL; 208 struct sockaddr *sa; 209 int namelen, error, s; 210 struct socket *head, *so; 211 int fd; 212 short fflag; /* type must match fp->f_flag */ 213 214 if (uap->name) { 215 error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen, 216 sizeof (namelen)); 217 if(error) 218 return (error); 219 } 220 error = holdsock(fdp, uap->s, &lfp); 221 if (error) 222 return (error); 223 s = splnet(); 224 head = (struct socket *)lfp->f_data; 225 if ((head->so_options & SO_ACCEPTCONN) == 0) { 226 splx(s); 227 error = EINVAL; 228 goto done; 229 } 230 if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { 231 splx(s); 232 error = EWOULDBLOCK; 233 goto done; 234 } 235 while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { 236 if (head->so_state & SS_CANTRCVMORE) { 237 head->so_error = ECONNABORTED; 238 break; 239 } 240 error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH, 241 "accept", 0); 242 if (error) { 243 splx(s); 244 goto done; 245 } 246 } 247 if (head->so_error) { 248 error = head->so_error; 249 head->so_error = 0; 250 splx(s); 251 goto done; 252 } 253 254 /* 255 * At this point we know that there is at least one connection 256 * ready to be accepted. Remove it from the queue prior to 257 * allocating the file descriptor for it since falloc() may 258 * block allowing another process to accept the connection 259 * instead. 260 */ 261 so = TAILQ_FIRST(&head->so_comp); 262 TAILQ_REMOVE(&head->so_comp, so, so_list); 263 head->so_qlen--; 264 265 fflag = lfp->f_flag; 266 error = falloc(p, &nfp, &fd); 267 if (error) { 268 /* 269 * Probably ran out of file descriptors. Put the 270 * unaccepted connection back onto the queue and 271 * do another wakeup so some other process might 272 * have a chance at it. 273 */ 274 TAILQ_INSERT_HEAD(&head->so_comp, so, so_list); 275 head->so_qlen++; 276 wakeup_one(&head->so_timeo); 277 splx(s); 278 goto done; 279 } 280 fhold(nfp); 281 p->p_retval[0] = fd; 282 283 /* connection has been removed from the listen queue */ 284 KNOTE(&head->so_rcv.sb_sel.si_note, 0); 285 286 so->so_state &= ~SS_COMP; 287 so->so_head = NULL; 288 if (head->so_sigio != NULL) 289 fsetown(fgetown(head->so_sigio), &so->so_sigio); 290 291 nfp->f_data = (caddr_t)so; 292 nfp->f_flag = fflag; 293 nfp->f_ops = &socketops; 294 nfp->f_type = DTYPE_SOCKET; 295 sa = 0; 296 error = soaccept(so, &sa); 297 if (error) { 298 /* 299 * return a namelen of zero for older code which might 300 * ignore the return value from accept. 301 */ 302 if (uap->name != NULL) { 303 namelen = 0; 304 (void) copyout((caddr_t)&namelen, 305 (caddr_t)uap->anamelen, sizeof(*uap->anamelen)); 306 } 307 goto noconnection; 308 } 309 if (sa == NULL) { 310 namelen = 0; 311 if (uap->name) 312 goto gotnoname; 313 splx(s); 314 error = 0; 315 goto done; 316 } 317 if (uap->name) { 318 /* check sa_len before it is destroyed */ 319 if (namelen > sa->sa_len) 320 namelen = sa->sa_len; 321 #ifdef COMPAT_OLDSOCK 322 if (compat) 323 ((struct osockaddr *)sa)->sa_family = 324 sa->sa_family; 325 #endif 326 error = copyout(sa, (caddr_t)uap->name, (u_int)namelen); 327 if (!error) 328 gotnoname: 329 error = copyout((caddr_t)&namelen, 330 (caddr_t)uap->anamelen, sizeof (*uap->anamelen)); 331 } 332 noconnection: 333 if (sa) 334 FREE(sa, M_SONAME); 335 336 /* 337 * close the new descriptor, assuming someone hasn't ripped it 338 * out from under us. 339 */ 340 if (error) { 341 if (fdp->fd_ofiles[fd] == nfp) { 342 fdp->fd_ofiles[fd] = NULL; 343 fdrop(nfp, p); 344 } 345 } 346 splx(s); 347 348 /* 349 * Release explicitly held references before returning. 350 */ 351 done: 352 if (nfp != NULL) 353 fdrop(nfp, p); 354 fdrop(lfp, p); 355 return (error); 356 } 357 358 int 359 accept(p, uap) 360 struct proc *p; 361 struct accept_args *uap; 362 { 363 364 return (accept1(p, uap, 0)); 365 } 366 367 #ifdef COMPAT_OLDSOCK 368 int 369 oaccept(p, uap) 370 struct proc *p; 371 struct accept_args *uap; 372 { 373 374 return (accept1(p, uap, 1)); 375 } 376 #endif /* COMPAT_OLDSOCK */ 377 378 /* ARGSUSED */ 379 int 380 connect(p, uap) 381 struct proc *p; 382 register struct connect_args /* { 383 int s; 384 caddr_t name; 385 int namelen; 386 } */ *uap; 387 { 388 struct file *fp; 389 register struct socket *so; 390 struct sockaddr *sa; 391 int error, s; 392 393 error = holdsock(p->p_fd, uap->s, &fp); 394 if (error) 395 return (error); 396 so = (struct socket *)fp->f_data; 397 if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { 398 error = EALREADY; 399 goto done; 400 } 401 error = getsockaddr(&sa, uap->name, uap->namelen); 402 if (error) 403 goto done; 404 error = soconnect(so, sa, p); 405 if (error) 406 goto bad; 407 if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { 408 FREE(sa, M_SONAME); 409 error = EINPROGRESS; 410 goto done; 411 } 412 s = splnet(); 413 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { 414 error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH, 415 "connec", 0); 416 if (error) 417 break; 418 } 419 if (error == 0) { 420 error = so->so_error; 421 so->so_error = 0; 422 } 423 splx(s); 424 bad: 425 so->so_state &= ~SS_ISCONNECTING; 426 FREE(sa, M_SONAME); 427 if (error == ERESTART) 428 error = EINTR; 429 done: 430 fdrop(fp, p); 431 return (error); 432 } 433 434 int 435 socketpair(p, uap) 436 struct proc *p; 437 register struct socketpair_args /* { 438 int domain; 439 int type; 440 int protocol; 441 int *rsv; 442 } */ *uap; 443 { 444 register struct filedesc *fdp = p->p_fd; 445 struct file *fp1, *fp2; 446 struct socket *so1, *so2; 447 int fd, error, sv[2]; 448 449 error = socreate(uap->domain, &so1, uap->type, uap->protocol, p); 450 if (error) 451 return (error); 452 error = socreate(uap->domain, &so2, uap->type, uap->protocol, p); 453 if (error) 454 goto free1; 455 error = falloc(p, &fp1, &fd); 456 if (error) 457 goto free2; 458 fhold(fp1); 459 sv[0] = fd; 460 fp1->f_data = (caddr_t)so1; 461 error = falloc(p, &fp2, &fd); 462 if (error) 463 goto free3; 464 fhold(fp2); 465 fp2->f_data = (caddr_t)so2; 466 sv[1] = fd; 467 error = soconnect2(so1, so2); 468 if (error) 469 goto free4; 470 if (uap->type == SOCK_DGRAM) { 471 /* 472 * Datagram socket connection is asymmetric. 473 */ 474 error = soconnect2(so2, so1); 475 if (error) 476 goto free4; 477 } 478 fp1->f_flag = fp2->f_flag = FREAD|FWRITE; 479 fp1->f_ops = fp2->f_ops = &socketops; 480 fp1->f_type = fp2->f_type = DTYPE_SOCKET; 481 error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int)); 482 fdrop(fp1, p); 483 fdrop(fp2, p); 484 return (error); 485 free4: 486 if (fdp->fd_ofiles[sv[1]] == fp2) { 487 fdp->fd_ofiles[sv[1]] = NULL; 488 fdrop(fp2, p); 489 } 490 fdrop(fp2, p); 491 free3: 492 if (fdp->fd_ofiles[sv[0]] == fp1) { 493 fdp->fd_ofiles[sv[0]] = NULL; 494 fdrop(fp1, p); 495 } 496 fdrop(fp1, p); 497 free2: 498 (void)soclose(so2); 499 free1: 500 (void)soclose(so1); 501 return (error); 502 } 503 504 static int 505 sendit(p, s, mp, flags) 506 register struct proc *p; 507 int s; 508 register struct msghdr *mp; 509 int flags; 510 { 511 struct file *fp; 512 struct uio auio; 513 register struct iovec *iov; 514 register int i; 515 struct mbuf *control; 516 struct sockaddr *to; 517 int len, error; 518 struct socket *so; 519 #ifdef KTRACE 520 struct iovec *ktriov = NULL; 521 struct uio ktruio; 522 #endif 523 524 error = holdsock(p->p_fd, s, &fp); 525 if (error) 526 return (error); 527 auio.uio_iov = mp->msg_iov; 528 auio.uio_iovcnt = mp->msg_iovlen; 529 auio.uio_segflg = UIO_USERSPACE; 530 auio.uio_rw = UIO_WRITE; 531 auio.uio_procp = p; 532 auio.uio_offset = 0; /* XXX */ 533 auio.uio_resid = 0; 534 iov = mp->msg_iov; 535 for (i = 0; i < mp->msg_iovlen; i++, iov++) { 536 if ((auio.uio_resid += iov->iov_len) < 0) { 537 fdrop(fp, p); 538 return (EINVAL); 539 } 540 } 541 if (mp->msg_name) { 542 error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); 543 if (error) { 544 fdrop(fp, p); 545 return (error); 546 } 547 } else { 548 to = 0; 549 } 550 if (mp->msg_control) { 551 if (mp->msg_controllen < sizeof(struct cmsghdr) 552 #ifdef COMPAT_OLDSOCK 553 && mp->msg_flags != MSG_COMPAT 554 #endif 555 ) { 556 error = EINVAL; 557 goto bad; 558 } 559 error = sockargs(&control, mp->msg_control, 560 mp->msg_controllen, MT_CONTROL); 561 if (error) 562 goto bad; 563 #ifdef COMPAT_OLDSOCK 564 if (mp->msg_flags == MSG_COMPAT) { 565 register struct cmsghdr *cm; 566 567 M_PREPEND(control, sizeof(*cm), M_TRYWAIT); 568 if (control == 0) { 569 error = ENOBUFS; 570 goto bad; 571 } else { 572 cm = mtod(control, struct cmsghdr *); 573 cm->cmsg_len = control->m_len; 574 cm->cmsg_level = SOL_SOCKET; 575 cm->cmsg_type = SCM_RIGHTS; 576 } 577 } 578 #endif 579 } else { 580 control = 0; 581 } 582 #ifdef KTRACE 583 if (KTRPOINT(p, KTR_GENIO)) { 584 int iovlen = auio.uio_iovcnt * sizeof (struct iovec); 585 586 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 587 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); 588 ktruio = auio; 589 } 590 #endif 591 len = auio.uio_resid; 592 so = (struct socket *)fp->f_data; 593 error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control, 594 flags, p); 595 if (error) { 596 if (auio.uio_resid != len && (error == ERESTART || 597 error == EINTR || error == EWOULDBLOCK)) 598 error = 0; 599 if (error == EPIPE) { 600 PROC_LOCK(p); 601 psignal(p, SIGPIPE); 602 PROC_UNLOCK(p); 603 } 604 } 605 if (error == 0) 606 p->p_retval[0] = len - auio.uio_resid; 607 #ifdef KTRACE 608 if (ktriov != NULL) { 609 if (error == 0) { 610 ktruio.uio_iov = ktriov; 611 ktruio.uio_resid = p->p_retval[0]; 612 ktrgenio(p->p_tracep, s, UIO_WRITE, &ktruio, error); 613 } 614 FREE(ktriov, M_TEMP); 615 } 616 #endif 617 bad: 618 fdrop(fp, p); 619 if (to) 620 FREE(to, M_SONAME); 621 return (error); 622 } 623 624 int 625 sendto(p, uap) 626 struct proc *p; 627 register struct sendto_args /* { 628 int s; 629 caddr_t buf; 630 size_t len; 631 int flags; 632 caddr_t to; 633 int tolen; 634 } */ *uap; 635 { 636 struct msghdr msg; 637 struct iovec aiov; 638 639 msg.msg_name = uap->to; 640 msg.msg_namelen = uap->tolen; 641 msg.msg_iov = &aiov; 642 msg.msg_iovlen = 1; 643 msg.msg_control = 0; 644 #ifdef COMPAT_OLDSOCK 645 msg.msg_flags = 0; 646 #endif 647 aiov.iov_base = uap->buf; 648 aiov.iov_len = uap->len; 649 return (sendit(p, uap->s, &msg, uap->flags)); 650 } 651 652 #ifdef COMPAT_OLDSOCK 653 int 654 osend(p, uap) 655 struct proc *p; 656 register struct osend_args /* { 657 int s; 658 caddr_t buf; 659 int len; 660 int flags; 661 } */ *uap; 662 { 663 struct msghdr msg; 664 struct iovec aiov; 665 666 msg.msg_name = 0; 667 msg.msg_namelen = 0; 668 msg.msg_iov = &aiov; 669 msg.msg_iovlen = 1; 670 aiov.iov_base = uap->buf; 671 aiov.iov_len = uap->len; 672 msg.msg_control = 0; 673 msg.msg_flags = 0; 674 return (sendit(p, uap->s, &msg, uap->flags)); 675 } 676 677 int 678 osendmsg(p, uap) 679 struct proc *p; 680 register struct osendmsg_args /* { 681 int s; 682 caddr_t msg; 683 int flags; 684 } */ *uap; 685 { 686 struct msghdr msg; 687 struct iovec aiov[UIO_SMALLIOV], *iov; 688 int error; 689 690 error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr)); 691 if (error) 692 return (error); 693 if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { 694 if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) 695 return (EMSGSIZE); 696 MALLOC(iov, struct iovec *, 697 sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, 698 M_WAITOK); 699 } else 700 iov = aiov; 701 error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, 702 (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); 703 if (error) 704 goto done; 705 msg.msg_flags = MSG_COMPAT; 706 msg.msg_iov = iov; 707 error = sendit(p, uap->s, &msg, uap->flags); 708 done: 709 if (iov != aiov) 710 FREE(iov, M_IOV); 711 return (error); 712 } 713 #endif 714 715 int 716 sendmsg(p, uap) 717 struct proc *p; 718 register struct sendmsg_args /* { 719 int s; 720 caddr_t msg; 721 int flags; 722 } */ *uap; 723 { 724 struct msghdr msg; 725 struct iovec aiov[UIO_SMALLIOV], *iov; 726 int error; 727 728 error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg)); 729 if (error) 730 return (error); 731 if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { 732 if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) 733 return (EMSGSIZE); 734 MALLOC(iov, struct iovec *, 735 sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, 736 M_WAITOK); 737 } else 738 iov = aiov; 739 if (msg.msg_iovlen && 740 (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, 741 (unsigned)(msg.msg_iovlen * sizeof (struct iovec))))) 742 goto done; 743 msg.msg_iov = iov; 744 #ifdef COMPAT_OLDSOCK 745 msg.msg_flags = 0; 746 #endif 747 error = sendit(p, uap->s, &msg, uap->flags); 748 done: 749 if (iov != aiov) 750 FREE(iov, M_IOV); 751 return (error); 752 } 753 754 static int 755 recvit(p, s, mp, namelenp) 756 register struct proc *p; 757 int s; 758 register struct msghdr *mp; 759 caddr_t namelenp; 760 { 761 struct file *fp; 762 struct uio auio; 763 register struct iovec *iov; 764 register int i; 765 int len, error; 766 struct mbuf *m, *control = 0; 767 caddr_t ctlbuf; 768 struct socket *so; 769 struct sockaddr *fromsa = 0; 770 #ifdef KTRACE 771 struct iovec *ktriov = NULL; 772 struct uio ktruio; 773 #endif 774 775 error = holdsock(p->p_fd, s, &fp); 776 if (error) 777 return (error); 778 auio.uio_iov = mp->msg_iov; 779 auio.uio_iovcnt = mp->msg_iovlen; 780 auio.uio_segflg = UIO_USERSPACE; 781 auio.uio_rw = UIO_READ; 782 auio.uio_procp = p; 783 auio.uio_offset = 0; /* XXX */ 784 auio.uio_resid = 0; 785 iov = mp->msg_iov; 786 for (i = 0; i < mp->msg_iovlen; i++, iov++) { 787 if ((auio.uio_resid += iov->iov_len) < 0) { 788 fdrop(fp, p); 789 return (EINVAL); 790 } 791 } 792 #ifdef KTRACE 793 if (KTRPOINT(p, KTR_GENIO)) { 794 int iovlen = auio.uio_iovcnt * sizeof (struct iovec); 795 796 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 797 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); 798 ktruio = auio; 799 } 800 #endif 801 len = auio.uio_resid; 802 so = (struct socket *)fp->f_data; 803 error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio, 804 (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0, 805 &mp->msg_flags); 806 if (error) { 807 if (auio.uio_resid != len && (error == ERESTART || 808 error == EINTR || error == EWOULDBLOCK)) 809 error = 0; 810 } 811 #ifdef KTRACE 812 if (ktriov != NULL) { 813 if (error == 0) { 814 ktruio.uio_iov = ktriov; 815 ktruio.uio_resid = len - auio.uio_resid; 816 ktrgenio(p->p_tracep, s, UIO_READ, &ktruio, error); 817 } 818 FREE(ktriov, M_TEMP); 819 } 820 #endif 821 if (error) 822 goto out; 823 p->p_retval[0] = len - auio.uio_resid; 824 if (mp->msg_name) { 825 len = mp->msg_namelen; 826 if (len <= 0 || fromsa == 0) 827 len = 0; 828 else { 829 #ifndef MIN 830 #define MIN(a,b) ((a)>(b)?(b):(a)) 831 #endif 832 /* save sa_len before it is destroyed by MSG_COMPAT */ 833 len = MIN(len, fromsa->sa_len); 834 #ifdef COMPAT_OLDSOCK 835 if (mp->msg_flags & MSG_COMPAT) 836 ((struct osockaddr *)fromsa)->sa_family = 837 fromsa->sa_family; 838 #endif 839 error = copyout(fromsa, 840 (caddr_t)mp->msg_name, (unsigned)len); 841 if (error) 842 goto out; 843 } 844 mp->msg_namelen = len; 845 if (namelenp && 846 (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) { 847 #ifdef COMPAT_OLDSOCK 848 if (mp->msg_flags & MSG_COMPAT) 849 error = 0; /* old recvfrom didn't check */ 850 else 851 #endif 852 goto out; 853 } 854 } 855 if (mp->msg_control) { 856 #ifdef COMPAT_OLDSOCK 857 /* 858 * We assume that old recvmsg calls won't receive access 859 * rights and other control info, esp. as control info 860 * is always optional and those options didn't exist in 4.3. 861 * If we receive rights, trim the cmsghdr; anything else 862 * is tossed. 863 */ 864 if (control && mp->msg_flags & MSG_COMPAT) { 865 if (mtod(control, struct cmsghdr *)->cmsg_level != 866 SOL_SOCKET || 867 mtod(control, struct cmsghdr *)->cmsg_type != 868 SCM_RIGHTS) { 869 mp->msg_controllen = 0; 870 goto out; 871 } 872 control->m_len -= sizeof (struct cmsghdr); 873 control->m_data += sizeof (struct cmsghdr); 874 } 875 #endif 876 len = mp->msg_controllen; 877 m = control; 878 mp->msg_controllen = 0; 879 ctlbuf = (caddr_t) mp->msg_control; 880 881 while (m && len > 0) { 882 unsigned int tocopy; 883 884 if (len >= m->m_len) 885 tocopy = m->m_len; 886 else { 887 mp->msg_flags |= MSG_CTRUNC; 888 tocopy = len; 889 } 890 891 if ((error = copyout((caddr_t)mtod(m, caddr_t), 892 ctlbuf, tocopy)) != 0) 893 goto out; 894 895 ctlbuf += tocopy; 896 len -= tocopy; 897 m = m->m_next; 898 } 899 mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; 900 } 901 out: 902 fdrop(fp, p); 903 if (fromsa) 904 FREE(fromsa, M_SONAME); 905 if (control) 906 m_freem(control); 907 return (error); 908 } 909 910 int 911 recvfrom(p, uap) 912 struct proc *p; 913 register struct recvfrom_args /* { 914 int s; 915 caddr_t buf; 916 size_t len; 917 int flags; 918 caddr_t from; 919 int *fromlenaddr; 920 } */ *uap; 921 { 922 struct msghdr msg; 923 struct iovec aiov; 924 int error; 925 926 if (uap->fromlenaddr) { 927 error = copyin((caddr_t)uap->fromlenaddr, 928 (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen)); 929 if (error) 930 return (error); 931 } else 932 msg.msg_namelen = 0; 933 msg.msg_name = uap->from; 934 msg.msg_iov = &aiov; 935 msg.msg_iovlen = 1; 936 aiov.iov_base = uap->buf; 937 aiov.iov_len = uap->len; 938 msg.msg_control = 0; 939 msg.msg_flags = uap->flags; 940 return (recvit(p, uap->s, &msg, (caddr_t)uap->fromlenaddr)); 941 } 942 943 #ifdef COMPAT_OLDSOCK 944 int 945 orecvfrom(p, uap) 946 struct proc *p; 947 struct recvfrom_args *uap; 948 { 949 950 uap->flags |= MSG_COMPAT; 951 return (recvfrom(p, uap)); 952 } 953 #endif 954 955 956 #ifdef COMPAT_OLDSOCK 957 int 958 orecv(p, uap) 959 struct proc *p; 960 register struct orecv_args /* { 961 int s; 962 caddr_t buf; 963 int len; 964 int flags; 965 } */ *uap; 966 { 967 struct msghdr msg; 968 struct iovec aiov; 969 970 msg.msg_name = 0; 971 msg.msg_namelen = 0; 972 msg.msg_iov = &aiov; 973 msg.msg_iovlen = 1; 974 aiov.iov_base = uap->buf; 975 aiov.iov_len = uap->len; 976 msg.msg_control = 0; 977 msg.msg_flags = uap->flags; 978 return (recvit(p, uap->s, &msg, (caddr_t)0)); 979 } 980 981 /* 982 * Old recvmsg. This code takes advantage of the fact that the old msghdr 983 * overlays the new one, missing only the flags, and with the (old) access 984 * rights where the control fields are now. 985 */ 986 int 987 orecvmsg(p, uap) 988 struct proc *p; 989 register struct orecvmsg_args /* { 990 int s; 991 struct omsghdr *msg; 992 int flags; 993 } */ *uap; 994 { 995 struct msghdr msg; 996 struct iovec aiov[UIO_SMALLIOV], *iov; 997 int error; 998 999 error = copyin((caddr_t)uap->msg, (caddr_t)&msg, 1000 sizeof (struct omsghdr)); 1001 if (error) 1002 return (error); 1003 if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { 1004 if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) 1005 return (EMSGSIZE); 1006 MALLOC(iov, struct iovec *, 1007 sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, 1008 M_WAITOK); 1009 } else 1010 iov = aiov; 1011 msg.msg_flags = uap->flags | MSG_COMPAT; 1012 error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, 1013 (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); 1014 if (error) 1015 goto done; 1016 msg.msg_iov = iov; 1017 error = recvit(p, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen); 1018 1019 if (msg.msg_controllen && error == 0) 1020 error = copyout((caddr_t)&msg.msg_controllen, 1021 (caddr_t)&uap->msg->msg_accrightslen, sizeof (int)); 1022 done: 1023 if (iov != aiov) 1024 FREE(iov, M_IOV); 1025 return (error); 1026 } 1027 #endif 1028 1029 int 1030 recvmsg(p, uap) 1031 struct proc *p; 1032 register struct recvmsg_args /* { 1033 int s; 1034 struct msghdr *msg; 1035 int flags; 1036 } */ *uap; 1037 { 1038 struct msghdr msg; 1039 struct iovec aiov[UIO_SMALLIOV], *uiov, *iov; 1040 register int error; 1041 1042 error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg)); 1043 if (error) 1044 return (error); 1045 if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { 1046 if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) 1047 return (EMSGSIZE); 1048 MALLOC(iov, struct iovec *, 1049 sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, 1050 M_WAITOK); 1051 } else 1052 iov = aiov; 1053 #ifdef COMPAT_OLDSOCK 1054 msg.msg_flags = uap->flags &~ MSG_COMPAT; 1055 #else 1056 msg.msg_flags = uap->flags; 1057 #endif 1058 uiov = msg.msg_iov; 1059 msg.msg_iov = iov; 1060 error = copyin((caddr_t)uiov, (caddr_t)iov, 1061 (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); 1062 if (error) 1063 goto done; 1064 error = recvit(p, uap->s, &msg, (caddr_t)0); 1065 if (!error) { 1066 msg.msg_iov = uiov; 1067 error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg)); 1068 } 1069 done: 1070 if (iov != aiov) 1071 FREE(iov, M_IOV); 1072 return (error); 1073 } 1074 1075 /* ARGSUSED */ 1076 int 1077 shutdown(p, uap) 1078 struct proc *p; 1079 register struct shutdown_args /* { 1080 int s; 1081 int how; 1082 } */ *uap; 1083 { 1084 struct file *fp; 1085 int error; 1086 1087 error = holdsock(p->p_fd, uap->s, &fp); 1088 if (error) 1089 return (error); 1090 error = soshutdown((struct socket *)fp->f_data, uap->how); 1091 fdrop(fp, p); 1092 return(error); 1093 } 1094 1095 /* ARGSUSED */ 1096 int 1097 setsockopt(p, uap) 1098 struct proc *p; 1099 register struct setsockopt_args /* { 1100 int s; 1101 int level; 1102 int name; 1103 caddr_t val; 1104 int valsize; 1105 } */ *uap; 1106 { 1107 struct file *fp; 1108 struct sockopt sopt; 1109 int error; 1110 1111 if (uap->val == 0 && uap->valsize != 0) 1112 return (EFAULT); 1113 if (uap->valsize < 0) 1114 return (EINVAL); 1115 1116 error = holdsock(p->p_fd, uap->s, &fp); 1117 if (error) 1118 return (error); 1119 1120 sopt.sopt_dir = SOPT_SET; 1121 sopt.sopt_level = uap->level; 1122 sopt.sopt_name = uap->name; 1123 sopt.sopt_val = uap->val; 1124 sopt.sopt_valsize = uap->valsize; 1125 sopt.sopt_p = p; 1126 error = sosetopt((struct socket *)fp->f_data, &sopt); 1127 fdrop(fp, p); 1128 return(error); 1129 } 1130 1131 /* ARGSUSED */ 1132 int 1133 getsockopt(p, uap) 1134 struct proc *p; 1135 register struct getsockopt_args /* { 1136 int s; 1137 int level; 1138 int name; 1139 caddr_t val; 1140 int *avalsize; 1141 } */ *uap; 1142 { 1143 int valsize, error; 1144 struct file *fp; 1145 struct sockopt sopt; 1146 1147 error = holdsock(p->p_fd, uap->s, &fp); 1148 if (error) 1149 return (error); 1150 if (uap->val) { 1151 error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize, 1152 sizeof (valsize)); 1153 if (error) { 1154 fdrop(fp, p); 1155 return (error); 1156 } 1157 if (valsize < 0) { 1158 fdrop(fp, p); 1159 return (EINVAL); 1160 } 1161 } else { 1162 valsize = 0; 1163 } 1164 1165 sopt.sopt_dir = SOPT_GET; 1166 sopt.sopt_level = uap->level; 1167 sopt.sopt_name = uap->name; 1168 sopt.sopt_val = uap->val; 1169 sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */ 1170 sopt.sopt_p = p; 1171 1172 error = sogetopt((struct socket *)fp->f_data, &sopt); 1173 if (error == 0) { 1174 valsize = sopt.sopt_valsize; 1175 error = copyout((caddr_t)&valsize, 1176 (caddr_t)uap->avalsize, sizeof (valsize)); 1177 } 1178 fdrop(fp, p); 1179 return (error); 1180 } 1181 1182 /* 1183 * Get socket name. 1184 */ 1185 /* ARGSUSED */ 1186 static int 1187 getsockname1(p, uap, compat) 1188 struct proc *p; 1189 register struct getsockname_args /* { 1190 int fdes; 1191 caddr_t asa; 1192 int *alen; 1193 } */ *uap; 1194 int compat; 1195 { 1196 struct file *fp; 1197 register struct socket *so; 1198 struct sockaddr *sa; 1199 int len, error; 1200 1201 error = holdsock(p->p_fd, uap->fdes, &fp); 1202 if (error) 1203 return (error); 1204 error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len)); 1205 if (error) { 1206 fdrop(fp, p); 1207 return (error); 1208 } 1209 so = (struct socket *)fp->f_data; 1210 sa = 0; 1211 error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa); 1212 if (error) 1213 goto bad; 1214 if (sa == 0) { 1215 len = 0; 1216 goto gotnothing; 1217 } 1218 1219 len = MIN(len, sa->sa_len); 1220 #ifdef COMPAT_OLDSOCK 1221 if (compat) 1222 ((struct osockaddr *)sa)->sa_family = sa->sa_family; 1223 #endif 1224 error = copyout(sa, (caddr_t)uap->asa, (u_int)len); 1225 if (error == 0) 1226 gotnothing: 1227 error = copyout((caddr_t)&len, (caddr_t)uap->alen, 1228 sizeof (len)); 1229 bad: 1230 if (sa) 1231 FREE(sa, M_SONAME); 1232 fdrop(fp, p); 1233 return (error); 1234 } 1235 1236 int 1237 getsockname(p, uap) 1238 struct proc *p; 1239 struct getsockname_args *uap; 1240 { 1241 1242 return (getsockname1(p, uap, 0)); 1243 } 1244 1245 #ifdef COMPAT_OLDSOCK 1246 int 1247 ogetsockname(p, uap) 1248 struct proc *p; 1249 struct getsockname_args *uap; 1250 { 1251 1252 return (getsockname1(p, uap, 1)); 1253 } 1254 #endif /* COMPAT_OLDSOCK */ 1255 1256 /* 1257 * Get name of peer for connected socket. 1258 */ 1259 /* ARGSUSED */ 1260 static int 1261 getpeername1(p, uap, compat) 1262 struct proc *p; 1263 register struct getpeername_args /* { 1264 int fdes; 1265 caddr_t asa; 1266 int *alen; 1267 } */ *uap; 1268 int compat; 1269 { 1270 struct file *fp; 1271 register struct socket *so; 1272 struct sockaddr *sa; 1273 int len, error; 1274 1275 error = holdsock(p->p_fd, uap->fdes, &fp); 1276 if (error) 1277 return (error); 1278 so = (struct socket *)fp->f_data; 1279 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { 1280 fdrop(fp, p); 1281 return (ENOTCONN); 1282 } 1283 error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len)); 1284 if (error) { 1285 fdrop(fp, p); 1286 return (error); 1287 } 1288 sa = 0; 1289 error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa); 1290 if (error) 1291 goto bad; 1292 if (sa == 0) { 1293 len = 0; 1294 goto gotnothing; 1295 } 1296 len = MIN(len, sa->sa_len); 1297 #ifdef COMPAT_OLDSOCK 1298 if (compat) 1299 ((struct osockaddr *)sa)->sa_family = 1300 sa->sa_family; 1301 #endif 1302 error = copyout(sa, (caddr_t)uap->asa, (u_int)len); 1303 if (error) 1304 goto bad; 1305 gotnothing: 1306 error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len)); 1307 bad: 1308 if (sa) 1309 FREE(sa, M_SONAME); 1310 fdrop(fp, p); 1311 return (error); 1312 } 1313 1314 int 1315 getpeername(p, uap) 1316 struct proc *p; 1317 struct getpeername_args *uap; 1318 { 1319 1320 return (getpeername1(p, uap, 0)); 1321 } 1322 1323 #ifdef COMPAT_OLDSOCK 1324 int 1325 ogetpeername(p, uap) 1326 struct proc *p; 1327 struct ogetpeername_args *uap; 1328 { 1329 1330 /* XXX uap should have type `getpeername_args *' to begin with. */ 1331 return (getpeername1(p, (struct getpeername_args *)uap, 1)); 1332 } 1333 #endif /* COMPAT_OLDSOCK */ 1334 1335 int 1336 sockargs(mp, buf, buflen, type) 1337 struct mbuf **mp; 1338 caddr_t buf; 1339 int buflen, type; 1340 { 1341 register struct sockaddr *sa; 1342 register struct mbuf *m; 1343 int error; 1344 1345 if ((u_int)buflen > MLEN) { 1346 #ifdef COMPAT_OLDSOCK 1347 if (type == MT_SONAME && (u_int)buflen <= 112) 1348 buflen = MLEN; /* unix domain compat. hack */ 1349 else 1350 #endif 1351 return (EINVAL); 1352 } 1353 m = m_get(M_TRYWAIT, type); 1354 if (m == NULL) 1355 return (ENOBUFS); 1356 m->m_len = buflen; 1357 error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); 1358 if (error) 1359 (void) m_free(m); 1360 else { 1361 *mp = m; 1362 if (type == MT_SONAME) { 1363 sa = mtod(m, struct sockaddr *); 1364 1365 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN 1366 if (sa->sa_family == 0 && sa->sa_len < AF_MAX) 1367 sa->sa_family = sa->sa_len; 1368 #endif 1369 sa->sa_len = buflen; 1370 } 1371 } 1372 return (error); 1373 } 1374 1375 int 1376 getsockaddr(namp, uaddr, len) 1377 struct sockaddr **namp; 1378 caddr_t uaddr; 1379 size_t len; 1380 { 1381 struct sockaddr *sa; 1382 int error; 1383 1384 if (len > SOCK_MAXADDRLEN) 1385 return ENAMETOOLONG; 1386 MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK); 1387 error = copyin(uaddr, sa, len); 1388 if (error) { 1389 FREE(sa, M_SONAME); 1390 } else { 1391 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN 1392 if (sa->sa_family == 0 && sa->sa_len < AF_MAX) 1393 sa->sa_family = sa->sa_len; 1394 #endif 1395 sa->sa_len = len; 1396 *namp = sa; 1397 } 1398 return error; 1399 } 1400 1401 /* 1402 * holdsock() - load the struct file pointer associated 1403 * with a socket into *fpp. If an error occurs, non-zero 1404 * will be returned and *fpp will be set to NULL. 1405 */ 1406 int 1407 holdsock(fdp, fdes, fpp) 1408 struct filedesc *fdp; 1409 int fdes; 1410 struct file **fpp; 1411 { 1412 register struct file *fp = NULL; 1413 int error = 0; 1414 1415 if ((unsigned)fdes >= fdp->fd_nfiles || 1416 (fp = fdp->fd_ofiles[fdes]) == NULL) { 1417 error = EBADF; 1418 } else if (fp->f_type != DTYPE_SOCKET) { 1419 error = ENOTSOCK; 1420 fp = NULL; 1421 } else { 1422 fhold(fp); 1423 } 1424 *fpp = fp; 1425 return(error); 1426 } 1427 1428 /* 1429 * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-)) 1430 * XXX - The sf_buf functions are currently private to sendfile(2), so have 1431 * been made static, but may be useful in the future for doing zero-copy in 1432 * other parts of the networking code. 1433 */ 1434 static void 1435 sf_buf_init(void *arg) 1436 { 1437 int i; 1438 1439 mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", MTX_DEF); 1440 mtx_lock(&sf_freelist.sf_lock); 1441 SLIST_INIT(&sf_freelist.sf_head); 1442 sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE); 1443 sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, 1444 M_NOWAIT | M_ZERO); 1445 for (i = 0; i < nsfbufs; i++) { 1446 sf_bufs[i].kva = sf_base + i * PAGE_SIZE; 1447 SLIST_INSERT_HEAD(&sf_freelist.sf_head, &sf_bufs[i], free_list); 1448 } 1449 sf_buf_alloc_want = 0; 1450 mtx_unlock(&sf_freelist.sf_lock); 1451 } 1452 1453 /* 1454 * Get an sf_buf from the freelist. Will block if none are available. 1455 */ 1456 static struct sf_buf * 1457 sf_buf_alloc() 1458 { 1459 struct sf_buf *sf; 1460 int error; 1461 1462 mtx_lock(&sf_freelist.sf_lock); 1463 while ((sf = SLIST_FIRST(&sf_freelist.sf_head)) == NULL) { 1464 sf_buf_alloc_want++; 1465 error = msleep(&sf_freelist, &sf_freelist.sf_lock, PVM|PCATCH, 1466 "sfbufa", 0); 1467 sf_buf_alloc_want--; 1468 1469 /* 1470 * If we got a signal, don't risk going back to sleep. 1471 */ 1472 if (error) 1473 break; 1474 } 1475 SLIST_REMOVE_HEAD(&sf_freelist.sf_head, free_list); 1476 mtx_unlock(&sf_freelist.sf_lock); 1477 return (sf); 1478 } 1479 1480 #define dtosf(x) (&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT]) 1481 1482 /* 1483 * Detatch mapped page and release resources back to the system. 1484 */ 1485 static void 1486 sf_buf_free(caddr_t addr, void *args) 1487 { 1488 struct sf_buf *sf; 1489 struct vm_page *m; 1490 1491 sf = dtosf(addr); 1492 mtx_lock(&vm_mtx); 1493 pmap_qremove((vm_offset_t)addr, 1); 1494 m = sf->m; 1495 vm_page_unwire(m, 0); 1496 /* 1497 * Check for the object going away on us. This can 1498 * happen since we don't hold a reference to it. 1499 * If so, we're responsible for freeing the page. 1500 */ 1501 if (m->wire_count == 0 && m->object == NULL) 1502 vm_page_free(m); 1503 mtx_unlock(&vm_mtx); 1504 sf->m = NULL; 1505 mtx_lock(&sf_freelist.sf_lock); 1506 SLIST_INSERT_HEAD(&sf_freelist.sf_head, sf, free_list); 1507 if (sf_buf_alloc_want > 0) 1508 wakeup_one(&sf_freelist); 1509 mtx_unlock(&sf_freelist.sf_lock); 1510 } 1511 1512 /* 1513 * sendfile(2) 1514 * int sendfile(int fd, int s, off_t offset, size_t nbytes, 1515 * struct sf_hdtr *hdtr, off_t *sbytes, int flags) 1516 * 1517 * Send a file specified by 'fd' and starting at 'offset' to a socket 1518 * specified by 's'. Send only 'nbytes' of the file or until EOF if 1519 * nbytes == 0. Optionally add a header and/or trailer to the socket 1520 * output. If specified, write the total number of bytes sent into *sbytes. 1521 */ 1522 int 1523 sendfile(struct proc *p, struct sendfile_args *uap) 1524 { 1525 struct file *fp; 1526 struct filedesc *fdp = p->p_fd; 1527 struct vnode *vp; 1528 struct vm_object *obj; 1529 struct socket *so; 1530 struct mbuf *m; 1531 struct sf_buf *sf; 1532 struct vm_page *pg; 1533 struct writev_args nuap; 1534 struct sf_hdtr hdtr; 1535 off_t off, xfsize, sbytes = 0; 1536 int error = 0, s; 1537 1538 vp = NULL; 1539 /* 1540 * Do argument checking. Must be a regular file in, stream 1541 * type and connected socket out, positive offset. 1542 */ 1543 fp = holdfp(fdp, uap->fd, FREAD); 1544 if (fp == NULL) { 1545 error = EBADF; 1546 goto done; 1547 } 1548 if (fp->f_type != DTYPE_VNODE) { 1549 error = EINVAL; 1550 goto done; 1551 } 1552 vp = (struct vnode *)fp->f_data; 1553 vref(vp); 1554 if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) { 1555 error = EINVAL; 1556 goto done; 1557 } 1558 fdrop(fp, p); 1559 error = holdsock(p->p_fd, uap->s, &fp); 1560 if (error) 1561 goto done; 1562 so = (struct socket *)fp->f_data; 1563 if (so->so_type != SOCK_STREAM) { 1564 error = EINVAL; 1565 goto done; 1566 } 1567 if ((so->so_state & SS_ISCONNECTED) == 0) { 1568 error = ENOTCONN; 1569 goto done; 1570 } 1571 if (uap->offset < 0) { 1572 error = EINVAL; 1573 goto done; 1574 } 1575 1576 /* 1577 * If specified, get the pointer to the sf_hdtr struct for 1578 * any headers/trailers. 1579 */ 1580 if (uap->hdtr != NULL) { 1581 error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); 1582 if (error) 1583 goto done; 1584 /* 1585 * Send any headers. Wimp out and use writev(2). 1586 */ 1587 if (hdtr.headers != NULL) { 1588 nuap.fd = uap->s; 1589 nuap.iovp = hdtr.headers; 1590 nuap.iovcnt = hdtr.hdr_cnt; 1591 error = writev(p, &nuap); 1592 if (error) 1593 goto done; 1594 sbytes += p->p_retval[0]; 1595 } 1596 } 1597 1598 /* 1599 * Protect against multiple writers to the socket. 1600 */ 1601 (void) sblock(&so->so_snd, M_WAITOK); 1602 1603 /* 1604 * Loop through the pages in the file, starting with the requested 1605 * offset. Get a file page (do I/O if necessary), map the file page 1606 * into an sf_buf, attach an mbuf header to the sf_buf, and queue 1607 * it on the socket. 1608 */ 1609 for (off = uap->offset; ; off += xfsize, sbytes += xfsize) { 1610 vm_pindex_t pindex; 1611 vm_offset_t pgoff; 1612 1613 pindex = OFF_TO_IDX(off); 1614 retry_lookup: 1615 /* 1616 * Calculate the amount to transfer. Not to exceed a page, 1617 * the EOF, or the passed in nbytes. 1618 */ 1619 xfsize = obj->un_pager.vnp.vnp_size - off; 1620 if (xfsize > PAGE_SIZE) 1621 xfsize = PAGE_SIZE; 1622 pgoff = (vm_offset_t)(off & PAGE_MASK); 1623 if (PAGE_SIZE - pgoff < xfsize) 1624 xfsize = PAGE_SIZE - pgoff; 1625 if (uap->nbytes && xfsize > (uap->nbytes - sbytes)) 1626 xfsize = uap->nbytes - sbytes; 1627 if (xfsize <= 0) 1628 break; 1629 /* 1630 * Optimize the non-blocking case by looking at the socket space 1631 * before going to the extra work of constituting the sf_buf. 1632 */ 1633 if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) { 1634 if (so->so_state & SS_CANTSENDMORE) 1635 error = EPIPE; 1636 else 1637 error = EAGAIN; 1638 sbunlock(&so->so_snd); 1639 goto done; 1640 } 1641 /* 1642 * Attempt to look up the page. 1643 * 1644 * Allocate if not found 1645 * 1646 * Wait and loop if busy. 1647 */ 1648 mtx_lock(&vm_mtx); 1649 pg = vm_page_lookup(obj, pindex); 1650 1651 if (pg == NULL) { 1652 pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL); 1653 if (pg == NULL) { 1654 VM_WAIT; 1655 mtx_unlock(&vm_mtx); 1656 goto retry_lookup; 1657 } 1658 vm_page_wakeup(pg); 1659 } else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) { 1660 mtx_unlock(&vm_mtx); 1661 goto retry_lookup; 1662 } 1663 1664 /* 1665 * Wire the page so it does not get ripped out from under 1666 * us. 1667 */ 1668 1669 vm_page_wire(pg); 1670 1671 /* 1672 * If page is not valid for what we need, initiate I/O 1673 */ 1674 1675 if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) { 1676 struct uio auio; 1677 struct iovec aiov; 1678 int bsize; 1679 1680 /* 1681 * Ensure that our page is still around when the I/O 1682 * completes. 1683 */ 1684 vm_page_io_start(pg); 1685 mtx_unlock(&vm_mtx); 1686 1687 /* 1688 * Get the page from backing store. 1689 */ 1690 bsize = vp->v_mount->mnt_stat.f_iosize; 1691 auio.uio_iov = &aiov; 1692 auio.uio_iovcnt = 1; 1693 aiov.iov_base = 0; 1694 aiov.iov_len = MAXBSIZE; 1695 auio.uio_resid = MAXBSIZE; 1696 auio.uio_offset = trunc_page(off); 1697 auio.uio_segflg = UIO_NOCOPY; 1698 auio.uio_rw = UIO_READ; 1699 auio.uio_procp = p; 1700 vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p); 1701 error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16), 1702 p->p_ucred); 1703 VOP_UNLOCK(vp, 0, p); 1704 mtx_lock(&vm_mtx); 1705 vm_page_flag_clear(pg, PG_ZERO); 1706 vm_page_io_finish(pg); 1707 if (error) { 1708 vm_page_unwire(pg, 0); 1709 /* 1710 * See if anyone else might know about this page. 1711 * If not and it is not valid, then free it. 1712 */ 1713 if (pg->wire_count == 0 && pg->valid == 0 && 1714 pg->busy == 0 && !(pg->flags & PG_BUSY) && 1715 pg->hold_count == 0) { 1716 vm_page_busy(pg); 1717 vm_page_free(pg); 1718 } 1719 mtx_unlock(&vm_mtx); 1720 sbunlock(&so->so_snd); 1721 goto done; 1722 } 1723 } 1724 1725 1726 /* 1727 * Get a sendfile buf. We usually wait as long as necessary, 1728 * but this wait can be interrupted. 1729 */ 1730 mtx_unlock(&vm_mtx); 1731 if ((sf = sf_buf_alloc()) == NULL) { 1732 mtx_lock(&vm_mtx); 1733 vm_page_unwire(pg, 0); 1734 if (pg->wire_count == 0 && pg->object == NULL) 1735 vm_page_free(pg); 1736 mtx_unlock(&vm_mtx); 1737 sbunlock(&so->so_snd); 1738 error = EINTR; 1739 goto done; 1740 } 1741 1742 /* 1743 * Allocate a kernel virtual page and insert the physical page 1744 * into it. 1745 */ 1746 mtx_lock(&vm_mtx); 1747 sf->m = pg; 1748 pmap_qenter(sf->kva, &pg, 1); 1749 mtx_unlock(&vm_mtx); 1750 /* 1751 * Get an mbuf header and set it up as having external storage. 1752 */ 1753 MGETHDR(m, M_TRYWAIT, MT_DATA); 1754 if (m == NULL) { 1755 error = ENOBUFS; 1756 sf_buf_free((void *)sf->kva, NULL); 1757 sbunlock(&so->so_snd); 1758 goto done; 1759 } 1760 /* 1761 * Setup external storage for mbuf. 1762 */ 1763 MEXTADD(m, sf->kva, PAGE_SIZE, sf_buf_free, NULL, M_RDONLY, 1764 EXT_SFBUF); 1765 m->m_data = (char *) sf->kva + pgoff; 1766 m->m_pkthdr.len = m->m_len = xfsize; 1767 /* 1768 * Add the buffer to the socket buffer chain. 1769 */ 1770 s = splnet(); 1771 retry_space: 1772 /* 1773 * Make sure that the socket is still able to take more data. 1774 * CANTSENDMORE being true usually means that the connection 1775 * was closed. so_error is true when an error was sensed after 1776 * a previous send. 1777 * The state is checked after the page mapping and buffer 1778 * allocation above since those operations may block and make 1779 * any socket checks stale. From this point forward, nothing 1780 * blocks before the pru_send (or more accurately, any blocking 1781 * results in a loop back to here to re-check). 1782 */ 1783 if ((so->so_state & SS_CANTSENDMORE) || so->so_error) { 1784 if (so->so_state & SS_CANTSENDMORE) { 1785 error = EPIPE; 1786 } else { 1787 error = so->so_error; 1788 so->so_error = 0; 1789 } 1790 m_freem(m); 1791 sbunlock(&so->so_snd); 1792 splx(s); 1793 goto done; 1794 } 1795 /* 1796 * Wait for socket space to become available. We do this just 1797 * after checking the connection state above in order to avoid 1798 * a race condition with sbwait(). 1799 */ 1800 if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) { 1801 if (so->so_state & SS_NBIO) { 1802 m_freem(m); 1803 sbunlock(&so->so_snd); 1804 splx(s); 1805 error = EAGAIN; 1806 goto done; 1807 } 1808 error = sbwait(&so->so_snd); 1809 /* 1810 * An error from sbwait usually indicates that we've 1811 * been interrupted by a signal. If we've sent anything 1812 * then return bytes sent, otherwise return the error. 1813 */ 1814 if (error) { 1815 m_freem(m); 1816 sbunlock(&so->so_snd); 1817 splx(s); 1818 goto done; 1819 } 1820 goto retry_space; 1821 } 1822 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p); 1823 splx(s); 1824 if (error) { 1825 sbunlock(&so->so_snd); 1826 goto done; 1827 } 1828 } 1829 sbunlock(&so->so_snd); 1830 1831 /* 1832 * Send trailers. Wimp out and use writev(2). 1833 */ 1834 if (uap->hdtr != NULL && hdtr.trailers != NULL) { 1835 nuap.fd = uap->s; 1836 nuap.iovp = hdtr.trailers; 1837 nuap.iovcnt = hdtr.trl_cnt; 1838 error = writev(p, &nuap); 1839 if (error) 1840 goto done; 1841 sbytes += p->p_retval[0]; 1842 } 1843 1844 done: 1845 /* 1846 * If there was no error we have to clear p->p_retval[0] 1847 * because it may have been set by writev. 1848 */ 1849 if (error == 0) { 1850 p->p_retval[0] = 0; 1851 } 1852 if (uap->sbytes != NULL) { 1853 copyout(&sbytes, uap->sbytes, sizeof(off_t)); 1854 } 1855 if (vp) 1856 vrele(vp); 1857 if (fp) 1858 fdrop(fp, p); 1859 return (error); 1860 } 1861