/*-
 * Copyright (c) 1982, 1986, 1989, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * sendfile(2) and related extensions:
 * Copyright (c) 1998, David Greenman.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_capsicum.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_compat.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/condvar.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/malloc.h>
#include <sys/filedesc.h>
#include <sys/event.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/jail.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/rwlock.h>
#include <sys/sf_buf.h>
#include <sys/sf_sync.h>
#include <sys/sf_base.h>
#include <sys/sysent.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32_util.h>
#endif

#include <net/vnet.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>

/*
 * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC
 * and SOCK_NONBLOCK.
 */
#define	ACCEPT4_INHERIT	0x1
#define	ACCEPT4_COMPAT	0x2

static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);

static int accept1(struct thread *td, int s, struct sockaddr *uname,
	    socklen_t *anamelen, int flags);
static int do_sendfile(struct thread *td, struct sendfile_args *uap,
	    int compat);
static int getsockname1(struct thread *td, struct getsockname_args *uap,
	    int compat);
static int getpeername1(struct thread *td, struct getpeername_args *uap,
	    int compat);

counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];

static int filt_sfsync_attach(struct knote *kn);
static void filt_sfsync_detach(struct knote *kn);
static int filt_sfsync(struct knote *kn, long hint);

/*
 * sendfile(2)-related variables and associated sysctls
 */
static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0,
    "sendfile(2) tunables");
static int sfreadahead = 1;
SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW,
    &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks");

#ifdef SFSYNC_DEBUG
static int sf_sync_debug = 0;
SYSCTL_INT(_debug, OID_AUTO, sf_sync_debug, CTLFLAG_RW,
    &sf_sync_debug, 0, "Output debugging during sf_sync lifecycle");
#define	SFSYNC_DPRINTF(s, ...)				\
		do {					\
			if (sf_sync_debug)		\
				printf((s), ##__VA_ARGS__); \
		} while (0)
#else
#define	SFSYNC_DPRINTF(c, ...)
#endif

static uma_zone_t	zone_sfsync;

static struct filterops sendfile_filtops = {
	.f_isfd = 0,
	.f_attach = filt_sfsync_attach,
	.f_detach = filt_sfsync_detach,
	.f_event = filt_sfsync,
};

static void
sfstat_init(const void *unused)
{

	COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
	    M_WAITOK);
}
SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);

static void
sf_sync_init(const void *unused)
{

	zone_sfsync = uma_zcreate("sendfile_sync", sizeof(struct sendfile_sync),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
	kqueue_add_filteropts(EVFILT_SENDFILE, &sendfile_filtops);
}
SYSINIT(sf_sync, SI_SUB_MBUF, SI_ORDER_FIRST, sf_sync_init, NULL);

static int
sfstat_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct sfstat s;

	COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
	if (req->newptr)
		COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
	return (SYSCTL_OUT(req, &s, sizeof(s)));
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
    NULL, 0, sfstat_sysctl, "I", "sendfile statistics");

/*
 * Convert a user file descriptor to a kernel file entry and check if required
 * capability rights are present.
 * A reference on the file entry is held upon returning.
 */
int
getsock_cap(struct filedesc *fdp, int fd, cap_rights_t *rightsp,
    struct file **fpp, u_int *fflagp)
{
	struct file *fp;
	int error;

	error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
	if (error != 0)
		return (error);
	if (fp->f_type != DTYPE_SOCKET) {
		fdrop(fp, curthread);
		return (ENOTSOCK);
	}
	if (fflagp != NULL)
		*fflagp = fp->f_flag;
	*fpp = fp;
	return (0);
}

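/*
 * Illustrative userland sketch (not part of this file): the counters
 * exported by sfstat_sysctl() above can be read with sysctlbyname(3),
 * and writing any value resets them, matching the CTLFLAG_RW handler.
 * The struct sfstat layout comes from the system headers; the field
 * used below (sf_iocnt) is one example and may differ between releases.
 *
 *	struct sfstat s;
 *	size_t len = sizeof(s);
 *
 *	if (sysctlbyname("kern.ipc.sfstat", &s, &len, NULL, 0) == 0)
 *		printf("sendfile I/O ops: %ju\n", (uintmax_t)s.sf_iocnt);
 */
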
/*
 * System call interface to the socket abstraction.
 */
#if defined(COMPAT_43)
#define	COMPAT_OLDSOCK
#endif

int
sys_socket(td, uap)
	struct thread *td;
	struct socket_args /* {
		int	domain;
		int	type;
		int	protocol;
	} */ *uap;
{
	struct socket *so;
	struct file *fp;
	int fd, error, type, oflag, fflag;

	AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);

	type = uap->type;
	oflag = 0;
	fflag = 0;
	if ((type & SOCK_CLOEXEC) != 0) {
		type &= ~SOCK_CLOEXEC;
		oflag |= O_CLOEXEC;
	}
	if ((type & SOCK_NONBLOCK) != 0) {
		type &= ~SOCK_NONBLOCK;
		fflag |= FNONBLOCK;
	}

#ifdef MAC
	error = mac_socket_check_create(td->td_ucred, uap->domain, type,
	    uap->protocol);
	if (error != 0)
		return (error);
#endif
	error = falloc(td, &fp, &fd, oflag);
	if (error != 0)
		return (error);
	/* An extra reference on `fp' has been held for us by falloc(). */
	error = socreate(uap->domain, &so, type, uap->protocol,
	    td->td_ucred, td);
	if (error != 0) {
		fdclose(td->td_proc->p_fd, fp, fd, td);
	} else {
		finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
		if ((fflag & FNONBLOCK) != 0)
			(void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
		td->td_retval[0] = fd;
	}
	fdrop(fp, td);
	return (error);
}

/* ARGSUSED */
int
sys_bind(td, uap)
	struct thread *td;
	struct bind_args /* {
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_bind(td, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}

static int
kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
	error = getsock_cap(td->td_proc->p_fd, fd,
	    cap_rights_init(&rights, CAP_BIND), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(sa);
#endif
#ifdef MAC
	error = mac_socket_check_bind(td->td_ucred, so, sa);
	if (error == 0) {
#endif
		if (dirfd == AT_FDCWD)
			error = sobind(so, sa, td);
		else
			error = sobindat(dirfd, so, sa, td);
#ifdef MAC
	}
#endif
	fdrop(fp, td);
	return (error);
}

int
kern_bind(struct thread *td, int fd, struct sockaddr *sa)
{

	return (kern_bindat(td, AT_FDCWD, fd, sa));
}

/* ARGSUSED */
int
sys_bindat(td, uap)
	struct thread *td;
	struct bindat_args /* {
		int	fd;
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_bindat(td, uap->fd, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}

/* ARGSUSED */
int
sys_listen(td, uap)
	struct thread *td;
	struct listen_args /* {
		int	s;
		int	backlog;
	} */ *uap;
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(uap->s);
	error = getsock_cap(td->td_proc->p_fd, uap->s,
	    cap_rights_init(&rights, CAP_LISTEN), &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
#ifdef MAC
		error = mac_socket_check_listen(td->td_ucred, so);
		if (error == 0)
#endif
			error = solisten(so, uap->backlog, td);
		fdrop(fp, td);
	}
	return (error);
}

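/*
 * Illustrative userland sketch (not part of this file): the SOCK_CLOEXEC
 * and SOCK_NONBLOCK handling in sys_socket() above strips both flags from
 * the type word before socreate() sees it, so they may be OR'd directly
 * into socket(2)'s second argument:
 *
 *	struct sockaddr_in sin = { .sin_len = sizeof(sin),
 *	    .sin_family = AF_INET, .sin_port = htons(8080) };
 *	int s;
 *
 *	s = socket(PF_INET, SOCK_STREAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0);
 *	if (s != -1 && bind(s, (struct sockaddr *)&sin, sizeof(sin)) == 0)
 *		listen(s, 128);
 */
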
/*
 * accept1()
 */
static int
accept1(td, s, uname, anamelen, flags)
	struct thread *td;
	int s;
	struct sockaddr *uname;
	socklen_t *anamelen;
	int flags;
{
	struct sockaddr *name;
	socklen_t namelen;
	struct file *fp;
	int error;

	if (uname == NULL)
		return (kern_accept4(td, s, NULL, NULL, flags, NULL));

	error = copyin(anamelen, &namelen, sizeof (namelen));
	if (error != 0)
		return (error);

	error = kern_accept4(td, s, &name, &namelen, flags, &fp);
	if (error != 0)
		return (error);

	/* uname is known non-NULL and error is zero at this point. */
#ifdef COMPAT_OLDSOCK
	if (flags & ACCEPT4_COMPAT)
		((struct osockaddr *)name)->sa_family =
		    name->sa_family;
#endif
	error = copyout(name, uname, namelen);
	if (error == 0)
		error = copyout(&namelen, anamelen,
		    sizeof(namelen));
	if (error != 0)
		fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
	fdrop(fp, td);
	free(name, M_SONAME);
	return (error);
}

int
kern_accept(struct thread *td, int s, struct sockaddr **name,
    socklen_t *namelen, struct file **fp)
{

	return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
}

int
kern_accept4(struct thread *td, int s, struct sockaddr **name,
    socklen_t *namelen, int flags, struct file **fp)
{
	struct filedesc *fdp;
	struct file *headfp, *nfp = NULL;
	struct sockaddr *sa = NULL;
	struct socket *head, *so;
	cap_rights_t rights;
	u_int fflag;
	pid_t pgid;
	int error, fd, tmp;

	if (name != NULL)
		*name = NULL;

	AUDIT_ARG_FD(s);
	fdp = td->td_proc->p_fd;
	error = getsock_cap(fdp, s, cap_rights_init(&rights, CAP_ACCEPT),
	    &headfp, &fflag);
	if (error != 0)
		return (error);
	head = headfp->f_data;
	if ((head->so_options & SO_ACCEPTCONN) == 0) {
		error = EINVAL;
		goto done;
	}
#ifdef MAC
	error = mac_socket_check_accept(td->td_ucred, head);
	if (error != 0)
		goto done;
#endif
	error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0);
	if (error != 0)
		goto done;
	ACCEPT_LOCK();
	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
		ACCEPT_UNLOCK();
		error = EWOULDBLOCK;
		goto noconnection;
	}
	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
			head->so_error = ECONNABORTED;
			break;
		}
		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
		    "accept", 0);
		if (error != 0) {
			ACCEPT_UNLOCK();
			goto noconnection;
		}
	}
	if (head->so_error) {
		error = head->so_error;
		head->so_error = 0;
		ACCEPT_UNLOCK();
		goto noconnection;
	}
	so = TAILQ_FIRST(&head->so_comp);
	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));

	/*
	 * Before changing the flags on the socket, we have to bump the
	 * reference count.  Otherwise, if the protocol calls sofree(),
	 * the socket will be released due to a zero refcount.
	 */
	SOCK_LOCK(so);			/* soref() and so_state update */
	soref(so);			/* file descriptor reference */

	TAILQ_REMOVE(&head->so_comp, so, so_list);
	head->so_qlen--;
	if (flags & ACCEPT4_INHERIT)
		so->so_state |= (head->so_state & SS_NBIO);
	else
		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
	so->so_qstate &= ~SQ_COMP;
	so->so_head = NULL;

	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();

	/* An extra reference on `nfp' has been held for us by falloc(). */
	td->td_retval[0] = fd;

	/* connection has been removed from the listen queue */
	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);

	if (flags & ACCEPT4_INHERIT) {
		pgid = fgetown(&head->so_sigio);
		if (pgid != 0)
			fsetown(pgid, &so->so_sigio);
	} else {
		fflag &= ~(FNONBLOCK | FASYNC);
		if (flags & SOCK_NONBLOCK)
			fflag |= FNONBLOCK;
	}

	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
	/* Sync socket nonblocking/async state with file flags */
	tmp = fflag & FNONBLOCK;
	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
	tmp = fflag & FASYNC;
	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
	sa = NULL;
	error = soaccept(so, &sa);
	if (error != 0)
		goto noconnection;
	if (sa == NULL) {
		if (name)
			*namelen = 0;
		goto done;
	}
	AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
	if (name) {
		/* check sa_len before it is destroyed */
		if (*namelen > sa->sa_len)
			*namelen = sa->sa_len;
#ifdef KTRACE
		if (KTRPOINT(td, KTR_STRUCT))
			ktrsockaddr(sa);
#endif
		*name = sa;
		sa = NULL;
	}
noconnection:
	free(sa, M_SONAME);

	/*
	 * close the new descriptor, assuming someone hasn't ripped it
	 * out from under us.
	 */
	if (error != 0)
		fdclose(fdp, nfp, fd, td);

	/*
	 * Release explicitly held references before returning.  We return
	 * a reference on nfp to the caller on success if they request it.
	 */
done:
	if (fp != NULL) {
		if (error == 0) {
			*fp = nfp;
			nfp = NULL;
		} else
			*fp = NULL;
	}
	if (nfp != NULL)
		fdrop(nfp, td);
	fdrop(headfp, td);
	return (error);
}

int
sys_accept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{

	return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
}

int
sys_accept4(td, uap)
	struct thread *td;
	struct accept4_args *uap;
{

	if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return (EINVAL);

	return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
}

#ifdef COMPAT_OLDSOCK
int
oaccept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{

	return (accept1(td, uap->s, uap->name, uap->anamelen,
	    ACCEPT4_INHERIT | ACCEPT4_COMPAT));
}
#endif /* COMPAT_OLDSOCK */

/* ARGSUSED */
int
sys_connect(td, uap)
	struct thread *td;
	struct connect_args /* {
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_connect(td, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}

static int
kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error, interrupted = 0;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
	error = getsock_cap(td->td_proc->p_fd, fd,
	    cap_rights_init(&rights, CAP_CONNECT), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
	if (so->so_state & SS_ISCONNECTING) {
		error = EALREADY;
		goto done1;
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(sa);
#endif
#ifdef MAC
	error = mac_socket_check_connect(td->td_ucred, so, sa);
	if (error != 0)
		goto bad;
#endif
	if (dirfd == AT_FDCWD)
		error = soconnect(so, sa, td);
	else
		error = soconnectat(dirfd, so, sa, td);
	if (error != 0)
		goto bad;
	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
		error = EINPROGRESS;
		goto done1;
	}
	SOCK_LOCK(so);
	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
		    "connec", 0);
		if (error != 0) {
			if (error == EINTR || error == ERESTART)
				interrupted = 1;
			break;
		}
	}
	if (error == 0) {
		error = so->so_error;
		so->so_error = 0;
	}
	SOCK_UNLOCK(so);
bad:
	if (!interrupted)
		so->so_state &= ~SS_ISCONNECTING;
	if (error == ERESTART)
		error = EINTR;
done1:
	fdrop(fp, td);
	return (error);
}

int
kern_connect(struct thread *td, int fd, struct sockaddr *sa)
{

	return (kern_connectat(td, AT_FDCWD, fd, sa));
}

/* ARGSUSED */
int
sys_connectat(td, uap)
	struct thread *td;
	struct connectat_args /* {
		int	fd;
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_connectat(td, uap->fd, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}

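/*
 * Illustrative userland sketch (not part of this file): kern_connectat()
 * above returns EINPROGRESS rather than sleeping when SS_NBIO is set, so
 * a non-blocking connect is typically completed by polling for
 * writability and then reading SO_ERROR:
 *
 *	int err;
 *	socklen_t elen = sizeof(err);
 *	struct pollfd pfd = { .fd = s, .events = POLLOUT };
 *
 *	if (connect(s, sa, sa->sa_len) == -1 && errno == EINPROGRESS) {
 *		poll(&pfd, 1, -1);
 *		getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &elen);
 *	}
 */
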
int
kern_socketpair(struct thread *td, int domain, int type, int protocol,
    int *rsv)
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *fp1, *fp2;
	struct socket *so1, *so2;
	int fd, error, oflag, fflag;

	AUDIT_ARG_SOCKET(domain, type, protocol);

	oflag = 0;
	fflag = 0;
	if ((type & SOCK_CLOEXEC) != 0) {
		type &= ~SOCK_CLOEXEC;
		oflag |= O_CLOEXEC;
	}
	if ((type & SOCK_NONBLOCK) != 0) {
		type &= ~SOCK_NONBLOCK;
		fflag |= FNONBLOCK;
	}
#ifdef MAC
	/* We might want to have a separate check for socket pairs. */
	error = mac_socket_check_create(td->td_ucred, domain, type,
	    protocol);
	if (error != 0)
		return (error);
#endif
	error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
	if (error != 0)
		return (error);
	error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
	if (error != 0)
		goto free1;
	/* On success extra reference to `fp1' and `fp2' is set by falloc. */
	error = falloc(td, &fp1, &fd, oflag);
	if (error != 0)
		goto free2;
	rsv[0] = fd;
	fp1->f_data = so1;	/* so1 already has ref count */
	error = falloc(td, &fp2, &fd, oflag);
	if (error != 0)
		goto free3;
	fp2->f_data = so2;	/* so2 already has ref count */
	rsv[1] = fd;
	error = soconnect2(so1, so2);
	if (error != 0)
		goto free4;
	if (type == SOCK_DGRAM) {
		/*
		 * Datagram socket connection is asymmetric.
		 */
		error = soconnect2(so2, so1);
		if (error != 0)
			goto free4;
	}
	finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
	    &socketops);
	finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
	    &socketops);
	if ((fflag & FNONBLOCK) != 0) {
		(void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
		(void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
	}
	fdrop(fp1, td);
	fdrop(fp2, td);
	return (0);
free4:
	fdclose(fdp, fp2, rsv[1], td);
	fdrop(fp2, td);
free3:
	fdclose(fdp, fp1, rsv[0], td);
	fdrop(fp1, td);
free2:
	if (so2 != NULL)
		(void)soclose(so2);
free1:
	if (so1 != NULL)
		(void)soclose(so1);
	return (error);
}

int
sys_socketpair(struct thread *td, struct socketpair_args *uap)
{
	int error, sv[2];

	error = kern_socketpair(td, uap->domain, uap->type,
	    uap->protocol, sv);
	if (error != 0)
		return (error);
	error = copyout(sv, uap->rsv, 2 * sizeof(int));
	if (error != 0) {
		(void)kern_close(td, sv[0]);
		(void)kern_close(td, sv[1]);
	}
	return (error);
}

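/*
 * Illustrative userland sketch (not part of this file): kern_socketpair()
 * connects the two sockets in both directions for SOCK_DGRAM, so either
 * descriptor returned by socketpair(2) may send to the other:
 *
 *	int sv[2];
 *	char buf[4];
 *
 *	if (socketpair(PF_LOCAL, SOCK_DGRAM, 0, sv) == 0) {
 *		write(sv[0], "ping", 4);
 *		read(sv[1], buf, sizeof(buf));
 *	}
 */
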
static int
sendit(td, s, mp, flags)
	struct thread *td;
	int s;
	struct msghdr *mp;
	int flags;
{
	struct mbuf *control;
	struct sockaddr *to;
	int error;

#ifdef CAPABILITY_MODE
	if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
		return (ECAPMODE);
#endif

	if (mp->msg_name != NULL) {
		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
		if (error != 0) {
			to = NULL;
			goto bad;
		}
		mp->msg_name = to;
	} else {
		to = NULL;
	}

	if (mp->msg_control) {
		if (mp->msg_controllen < sizeof(struct cmsghdr)
#ifdef COMPAT_OLDSOCK
		    && mp->msg_flags != MSG_COMPAT
#endif
		) {
			error = EINVAL;
			goto bad;
		}
		error = sockargs(&control, mp->msg_control,
		    mp->msg_controllen, MT_CONTROL);
		if (error != 0)
			goto bad;
#ifdef COMPAT_OLDSOCK
		if (mp->msg_flags == MSG_COMPAT) {
			struct cmsghdr *cm;

			M_PREPEND(control, sizeof(*cm), M_WAITOK);
			cm = mtod(control, struct cmsghdr *);
			cm->cmsg_len = control->m_len;
			cm->cmsg_level = SOL_SOCKET;
			cm->cmsg_type = SCM_RIGHTS;
		}
#endif
	} else {
		control = NULL;
	}

	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);

bad:
	free(to, M_SONAME);
	return (error);
}

int
kern_sendit(td, s, mp, flags, control, segflg)
	struct thread *td;
	int s;
	struct msghdr *mp;
	int flags;
	struct mbuf *control;
	enum uio_seg segflg;
{
	struct file *fp;
	struct uio auio;
	struct iovec *iov;
	struct socket *so;
	cap_rights_t rights;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	ssize_t len;
	int i, error;

	AUDIT_ARG_FD(s);
	cap_rights_init(&rights, CAP_SEND);
	if (mp->msg_name != NULL) {
		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
		cap_rights_set(&rights, CAP_CONNECT);
	}
	error = getsock_cap(td->td_proc->p_fd, s, &rights, &fp, NULL);
	if (error != 0)
		return (error);
	so = (struct socket *)fp->f_data;

#ifdef KTRACE
	if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(mp->msg_name);
#endif
#ifdef MAC
	if (mp->msg_name != NULL) {
		error = mac_socket_check_connect(td->td_ucred, so,
		    mp->msg_name);
		if (error != 0)
			goto bad;
	}
	error = mac_socket_check_send(td->td_ucred, so);
	if (error != 0)
		goto bad;
#endif

	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = segflg;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			error = EINVAL;
			goto bad;
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	len = auio.uio_resid;
	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
	if (error != 0) {
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Generation of SIGPIPE can be controlled per socket */
		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
		    !(flags & MSG_NOSIGNAL)) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	if (error == 0)
		td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = td->td_retval[0];
		ktrgenio(s, UIO_WRITE, ktruio, error);
	}
#endif
bad:
	fdrop(fp, td);
	return (error);
}

int
sys_sendto(td, uap)
	struct thread *td;
	struct sendto_args /* {
		int	s;
		caddr_t	buf;
		size_t	len;
		int	flags;
		caddr_t	to;
		int	tolen;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;

	msg.msg_name = uap->to;
	msg.msg_namelen = uap->tolen;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	msg.msg_control = 0;
#ifdef COMPAT_OLDSOCK
	msg.msg_flags = 0;
#endif
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	return (sendit(td, uap->s, &msg, uap->flags));
}

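/*
 * Illustrative userland sketch (not part of this file; `s' and
 * `fd_to_pass' are assumed to exist): sendit() above rejects control
 * data shorter than a struct cmsghdr, so a descriptor passed over a
 * PF_LOCAL socket is framed with the CMSG_* macros:
 *
 *	char c = 'x';
 *	struct iovec iov = { .iov_base = &c, .iov_len = 1 };
 *	union { struct cmsghdr hdr; char buf[CMSG_SPACE(sizeof(int))]; } cm;
 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *	    .msg_control = cm.buf, .msg_controllen = CMSG_SPACE(sizeof(int)) };
 *	struct cmsghdr *cmp = CMSG_FIRSTHDR(&msg);
 *
 *	cmp->cmsg_len = CMSG_LEN(sizeof(int));
 *	cmp->cmsg_level = SOL_SOCKET;
 *	cmp->cmsg_type = SCM_RIGHTS;
 *	memcpy(CMSG_DATA(cmp), &fd_to_pass, sizeof(int));
 *	sendmsg(s, &msg, 0);
 */
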
#ifdef COMPAT_OLDSOCK
int
osend(td, uap)
	struct thread *td;
	struct osend_args /* {
		int	s;
		caddr_t	buf;
		int	len;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;

	msg.msg_name = 0;
	msg.msg_namelen = 0;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	msg.msg_control = 0;
	msg.msg_flags = 0;
	return (sendit(td, uap->s, &msg, uap->flags));
}

int
osendmsg(td, uap)
	struct thread *td;
	struct osendmsg_args /* {
		int	s;
		caddr_t	msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_iov = iov;
	msg.msg_flags = MSG_COMPAT;
	error = sendit(td, uap->s, &msg, uap->flags);
	free(iov, M_IOV);
	return (error);
}
#endif

int
sys_sendmsg(td, uap)
	struct thread *td;
	struct sendmsg_args /* {
		int	s;
		caddr_t	msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (msg));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_iov = iov;
#ifdef COMPAT_OLDSOCK
	msg.msg_flags = 0;
#endif
	error = sendit(td, uap->s, &msg, uap->flags);
	free(iov, M_IOV);
	return (error);
}

int
kern_recvit(td, s, mp, fromseg, controlp)
	struct thread *td;
	int s;
	struct msghdr *mp;
	enum uio_seg fromseg;
	struct mbuf **controlp;
{
	struct uio auio;
	struct iovec *iov;
	struct mbuf *m, *control = NULL;
	caddr_t ctlbuf;
	struct file *fp;
	struct socket *so;
	struct sockaddr *fromsa = NULL;
	cap_rights_t rights;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	ssize_t len;
	int error, i;

	if (controlp != NULL)
		*controlp = NULL;

	AUDIT_ARG_FD(s);
	error = getsock_cap(td->td_proc->p_fd, s,
	    cap_rights_init(&rights, CAP_RECV), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;

#ifdef MAC
	error = mac_socket_check_receive(td->td_ucred, so);
	if (error != 0) {
		fdrop(fp, td);
		return (error);
	}
#endif

	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			fdrop(fp, td);
			return (EINVAL);
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	len = auio.uio_resid;
	error = soreceive(so, &fromsa, &auio, NULL,
	    (mp->msg_control || controlp) ? &control : NULL,
	    &mp->msg_flags);
	if (error != 0) {
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	if (fromsa != NULL)
		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = len - auio.uio_resid;
		ktrgenio(s, UIO_READ, ktruio, error);
	}
#endif
	if (error != 0)
		goto out;
	td->td_retval[0] = len - auio.uio_resid;
	if (mp->msg_name) {
		len = mp->msg_namelen;
		if (len <= 0 || fromsa == NULL)
			len = 0;
		else {
			/* save sa_len before it is destroyed by MSG_COMPAT */
			len = MIN(len, fromsa->sa_len);
#ifdef COMPAT_OLDSOCK
			if (mp->msg_flags & MSG_COMPAT)
				((struct osockaddr *)fromsa)->sa_family =
				    fromsa->sa_family;
#endif
			if (fromseg == UIO_USERSPACE) {
				error = copyout(fromsa, mp->msg_name,
				    (unsigned)len);
				if (error != 0)
					goto out;
			} else
				bcopy(fromsa, mp->msg_name, len);
		}
		mp->msg_namelen = len;
	}
	if (mp->msg_control && controlp == NULL) {
#ifdef COMPAT_OLDSOCK
		/*
		 * We assume that old recvmsg calls won't receive access
		 * rights and other control info, esp. as control info
		 * is always optional and those options didn't exist in 4.3.
		 * If we receive rights, trim the cmsghdr; anything else
		 * is tossed.
		 */
		if (control && mp->msg_flags & MSG_COMPAT) {
			if (mtod(control, struct cmsghdr *)->cmsg_level !=
			    SOL_SOCKET ||
			    mtod(control, struct cmsghdr *)->cmsg_type !=
			    SCM_RIGHTS) {
				mp->msg_controllen = 0;
				goto out;
			}
			control->m_len -= sizeof (struct cmsghdr);
			control->m_data += sizeof (struct cmsghdr);
		}
#endif
		len = mp->msg_controllen;
		m = control;
		mp->msg_controllen = 0;
		ctlbuf = mp->msg_control;

		while (m && len > 0) {
			unsigned int tocopy;

			if (len >= m->m_len)
				tocopy = m->m_len;
			else {
				mp->msg_flags |= MSG_CTRUNC;
				tocopy = len;
			}

			if ((error = copyout(mtod(m, caddr_t),
			    ctlbuf, tocopy)) != 0)
				goto out;

			ctlbuf += tocopy;
			len -= tocopy;
			m = m->m_next;
		}
		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
	}
out:
	fdrop(fp, td);
#ifdef KTRACE
	if (fromsa && KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(fromsa);
#endif
	free(fromsa, M_SONAME);

	if (error == 0 && controlp != NULL)
		*controlp = control;
	else if (control)
		m_freem(control);

	return (error);
}

static int
recvit(td, s, mp, namelenp)
	struct thread *td;
	int s;
	struct msghdr *mp;
	void *namelenp;
{
	int error;

	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
	if (error != 0)
		return (error);
	if (namelenp != NULL) {
		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
#ifdef COMPAT_OLDSOCK
		if (mp->msg_flags & MSG_COMPAT)
			error = 0;	/* old recvfrom didn't check */
#endif
	}
	return (error);
}

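/*
 * Illustrative userland sketch (not part of this file), the receive-side
 * counterpart of the example above: kern_recvit() sets MSG_CTRUNC when
 * the user control buffer is too small, so callers should size it with
 * CMSG_SPACE() and check the flag:
 *
 *	union { struct cmsghdr hdr; char buf[CMSG_SPACE(sizeof(int))]; } cm;
 *	char c;
 *	struct iovec iov = { .iov_base = &c, .iov_len = 1 };
 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *	    .msg_control = cm.buf, .msg_controllen = sizeof(cm.buf) };
 *	struct cmsghdr *cmp;
 *	int newfd = -1;
 *
 *	if (recvmsg(s, &msg, 0) >= 0 && !(msg.msg_flags & MSG_CTRUNC)) {
 *		for (cmp = CMSG_FIRSTHDR(&msg); cmp != NULL;
 *		    cmp = CMSG_NXTHDR(&msg, cmp))
 *			if (cmp->cmsg_level == SOL_SOCKET &&
 *			    cmp->cmsg_type == SCM_RIGHTS)
 *				memcpy(&newfd, CMSG_DATA(cmp), sizeof(int));
 *	}
 */
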
int
sys_recvfrom(td, uap)
	struct thread *td;
	struct recvfrom_args /* {
		int	s;
		caddr_t	buf;
		size_t	len;
		int	flags;
		struct sockaddr * __restrict	from;
		socklen_t * __restrict fromlenaddr;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;
	int error;

	if (uap->fromlenaddr) {
		error = copyin(uap->fromlenaddr,
		    &msg.msg_namelen, sizeof (msg.msg_namelen));
		if (error != 0)
			goto done2;
	} else {
		msg.msg_namelen = 0;
	}
	msg.msg_name = uap->from;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	msg.msg_control = 0;
	msg.msg_flags = uap->flags;
	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
done2:
	return (error);
}

#ifdef COMPAT_OLDSOCK
int
orecvfrom(td, uap)
	struct thread *td;
	struct recvfrom_args *uap;
{

	uap->flags |= MSG_COMPAT;
	return (sys_recvfrom(td, uap));
}
#endif

#ifdef COMPAT_OLDSOCK
int
orecv(td, uap)
	struct thread *td;
	struct orecv_args /* {
		int	s;
		caddr_t	buf;
		int	len;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;

	msg.msg_name = 0;
	msg.msg_namelen = 0;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	msg.msg_control = 0;
	msg.msg_flags = uap->flags;
	return (recvit(td, uap->s, &msg, NULL));
}

/*
 * Old recvmsg.  This code takes advantage of the fact that the old msghdr
 * overlays the new one, missing only the flags, and with the (old) access
 * rights where the control fields are now.
 */
int
orecvmsg(td, uap)
	struct thread *td;
	struct orecvmsg_args /* {
		int	s;
		struct	omsghdr *msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_flags = uap->flags | MSG_COMPAT;
	msg.msg_iov = iov;
	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
	if (msg.msg_controllen && error == 0)
		error = copyout(&msg.msg_controllen,
		    &uap->msg->msg_accrightslen, sizeof (int));
	free(iov, M_IOV);
	return (error);
}
#endif

int
sys_recvmsg(td, uap)
	struct thread *td;
	struct recvmsg_args /* {
		int	s;
		struct	msghdr *msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *uiov, *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (msg));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_flags = uap->flags;
#ifdef COMPAT_OLDSOCK
	msg.msg_flags &= ~MSG_COMPAT;
#endif
	uiov = msg.msg_iov;
	msg.msg_iov = iov;
	error = recvit(td, uap->s, &msg, NULL);
	if (error == 0) {
		msg.msg_iov = uiov;
		error = copyout(&msg, uap->msg, sizeof(msg));
	}
	free(iov, M_IOV);
	return (error);
}

/* ARGSUSED */
int
sys_shutdown(td, uap)
	struct thread *td;
	struct shutdown_args /* {
		int	s;
		int	how;
	} */ *uap;
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(uap->s);
	error = getsock_cap(td->td_proc->p_fd, uap->s,
	    cap_rights_init(&rights, CAP_SHUTDOWN), &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
		error = soshutdown(so, uap->how);
		fdrop(fp, td);
	}
	return (error);
}

/* ARGSUSED */
int
sys_setsockopt(td, uap)
	struct thread *td;
	struct setsockopt_args /* {
		int	s;
		int	level;
		int	name;
		caddr_t	val;
		int	valsize;
	} */ *uap;
{

	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
	    uap->val, UIO_USERSPACE, uap->valsize));
}

int
kern_setsockopt(td, s, level, name, val, valseg, valsize)
	struct thread *td;
	int s;
	int level;
	int name;
	void *val;
	enum uio_seg valseg;
	socklen_t valsize;
{
	struct socket *so;
	struct file *fp;
	struct sockopt sopt;
	cap_rights_t rights;
	int error;

	if (val == NULL && valsize != 0)
		return (EFAULT);
	if ((int)valsize < 0)
		return (EINVAL);

	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = level;
	sopt.sopt_name = name;
	sopt.sopt_val = val;
	sopt.sopt_valsize = valsize;
	switch (valseg) {
	case UIO_USERSPACE:
		sopt.sopt_td = td;
		break;
	case UIO_SYSSPACE:
		sopt.sopt_td = NULL;
		break;
	default:
		panic("kern_setsockopt called with bad valseg");
	}

	AUDIT_ARG_FD(s);
	error = getsock_cap(td->td_proc->p_fd, s,
	    cap_rights_init(&rights, CAP_SETSOCKOPT), &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
		error = sosetopt(so, &sopt);
		fdrop(fp, td);
	}
	return (error);
}

/* ARGSUSED */
int
sys_getsockopt(td, uap)
	struct thread *td;
	struct getsockopt_args /* {
		int	s;
		int	level;
		int	name;
		void * __restrict	val;
		socklen_t * __restrict avalsize;
	} */ *uap;
{
	socklen_t valsize;
	int error;

	if (uap->val) {
		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
		if (error != 0)
			return (error);
	}

	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
	    uap->val, UIO_USERSPACE, &valsize);

	if (error == 0)
		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
	return (error);
}

/*
 * Kernel version of getsockopt.
 * optval can be a userland or kernel pointer, as selected by valseg;
 * optlen is always a kernel pointer.
 */
int
kern_getsockopt(td, s, level, name, val, valseg, valsize)
	struct thread *td;
	int s;
	int level;
	int name;
	void *val;
	enum uio_seg valseg;
	socklen_t *valsize;
{
	struct socket *so;
	struct file *fp;
	struct sockopt sopt;
	cap_rights_t rights;
	int error;

	if (val == NULL)
		*valsize = 0;
	if ((int)*valsize < 0)
		return (EINVAL);

	sopt.sopt_dir = SOPT_GET;
	sopt.sopt_level = level;
	sopt.sopt_name = name;
	sopt.sopt_val = val;
	sopt.sopt_valsize = (size_t)*valsize;	/* checked non-negative above */
	switch (valseg) {
	case UIO_USERSPACE:
		sopt.sopt_td = td;
		break;
	case UIO_SYSSPACE:
		sopt.sopt_td = NULL;
		break;
	default:
		panic("kern_getsockopt called with bad valseg");
	}

	AUDIT_ARG_FD(s);
	error = getsock_cap(td->td_proc->p_fd, s,
	    cap_rights_init(&rights, CAP_GETSOCKOPT), &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
		error = sogetopt(so, &sopt);
		*valsize = sopt.sopt_valsize;
		fdrop(fp, td);
	}
	return (error);
}

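/*
 * Illustrative in-kernel sketch (not part of this file; `fd' is an
 * assumed socket descriptor): with UIO_SYSSPACE the sockopt machinery
 * above treats the value as a kernel pointer (sopt_td == NULL), which is
 * how in-kernel consumers set options without a copyin:
 *
 *	int one = 1;
 *	int error;
 *
 *	error = kern_setsockopt(curthread, fd, SOL_SOCKET, SO_REUSEADDR,
 *	    &one, UIO_SYSSPACE, sizeof(one));
 */
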
/*
 * getsockname1() - Get socket name.
 */
/* ARGSUSED */
static int
getsockname1(td, uap, compat)
	struct thread *td;
	struct getsockname_args /* {
		int	fdes;
		struct sockaddr * __restrict asa;
		socklen_t * __restrict alen;
	} */ *uap;
	int compat;
{
	struct sockaddr *sa;
	socklen_t len;
	int error;

	error = copyin(uap->alen, &len, sizeof(len));
	if (error != 0)
		return (error);

	error = kern_getsockname(td, uap->fdes, &sa, &len);
	if (error != 0)
		return (error);

	if (len != 0) {
#ifdef COMPAT_OLDSOCK
		if (compat)
			((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
		error = copyout(sa, uap->asa, (u_int)len);
	}
	free(sa, M_SONAME);
	if (error == 0)
		error = copyout(&len, uap->alen, sizeof(len));
	return (error);
}

int
kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
    socklen_t *alen)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	socklen_t len;
	int error;

	AUDIT_ARG_FD(fd);
	error = getsock_cap(td->td_proc->p_fd, fd,
	    cap_rights_init(&rights, CAP_GETSOCKNAME), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
	*sa = NULL;
	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
	CURVNET_RESTORE();
	if (error != 0)
		goto bad;
	if (*sa == NULL)
		len = 0;
	else
		len = MIN(*alen, (*sa)->sa_len);
	*alen = len;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(*sa);
#endif
bad:
	fdrop(fp, td);
	if (error != 0 && *sa != NULL) {
		free(*sa, M_SONAME);
		*sa = NULL;
	}
	return (error);
}

int
sys_getsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{

	return (getsockname1(td, uap, 0));
}

#ifdef COMPAT_OLDSOCK
int
ogetsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{

	return (getsockname1(td, uap, 1));
}
#endif /* COMPAT_OLDSOCK */

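/*
 * Illustrative userland sketch (not part of this file): after binding to
 * port 0, getsockname(2), serviced by kern_getsockname() above, reports
 * the ephemeral port the kernel chose; len is value/result and is
 * truncated to the address's sa_len:
 *
 *	struct sockaddr_in sin;
 *	socklen_t len = sizeof(sin);
 *
 *	if (getsockname(s, (struct sockaddr *)&sin, &len) == 0)
 *		printf("bound to port %d\n", ntohs(sin.sin_port));
 */
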
/*
 * getpeername1() - Get name of peer for connected socket.
 */
/* ARGSUSED */
static int
getpeername1(td, uap, compat)
	struct thread *td;
	struct getpeername_args /* {
		int	fdes;
		struct sockaddr * __restrict	asa;
		socklen_t * __restrict	alen;
	} */ *uap;
	int compat;
{
	struct sockaddr *sa;
	socklen_t len;
	int error;

	error = copyin(uap->alen, &len, sizeof (len));
	if (error != 0)
		return (error);

	error = kern_getpeername(td, uap->fdes, &sa, &len);
	if (error != 0)
		return (error);

	if (len != 0) {
#ifdef COMPAT_OLDSOCK
		if (compat)
			((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
		error = copyout(sa, uap->asa, (u_int)len);
	}
	free(sa, M_SONAME);
	if (error == 0)
		error = copyout(&len, uap->alen, sizeof(len));
	return (error);
}

int
kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
    socklen_t *alen)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	socklen_t len;
	int error;

	AUDIT_ARG_FD(fd);
	error = getsock_cap(td->td_proc->p_fd, fd,
	    cap_rights_init(&rights, CAP_GETPEERNAME), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
		error = ENOTCONN;
		goto done;
	}
	*sa = NULL;
	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
	CURVNET_RESTORE();
	if (error != 0)
		goto bad;
	if (*sa == NULL)
		len = 0;
	else
		len = MIN(*alen, (*sa)->sa_len);
	*alen = len;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(*sa);
#endif
bad:
	if (error != 0 && *sa != NULL) {
		free(*sa, M_SONAME);
		*sa = NULL;
	}
done:
	fdrop(fp, td);
	return (error);
}

int
sys_getpeername(td, uap)
	struct thread *td;
	struct getpeername_args *uap;
{

	return (getpeername1(td, uap, 0));
}

#ifdef COMPAT_OLDSOCK
int
ogetpeername(td, uap)
	struct thread *td;
	struct ogetpeername_args *uap;
{

	/* XXX uap should have type `getpeername_args *' to begin with. */
	return (getpeername1(td, (struct getpeername_args *)uap, 1));
}
#endif /* COMPAT_OLDSOCK */

int
sockargs(mp, buf, buflen, type)
	struct mbuf **mp;
	caddr_t buf;
	int buflen, type;
{
	struct sockaddr *sa;
	struct mbuf *m;
	int error;

	if (buflen > MLEN) {
#ifdef COMPAT_OLDSOCK
		if (type == MT_SONAME && buflen <= 112)
			buflen = MLEN;		/* unix domain compat. hack */
		else
#endif
			if (buflen > MCLBYTES)
				return (EINVAL);
	}
	m = m_get2(buflen, M_WAITOK, type, 0);
	m->m_len = buflen;
	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
	if (error != 0)
		(void) m_free(m);
	else {
		*mp = m;
		if (type == MT_SONAME) {
			sa = mtod(m, struct sockaddr *);

#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
				sa->sa_family = sa->sa_len;
#endif
			sa->sa_len = buflen;
		}
	}
	return (error);
}

int
getsockaddr(namp, uaddr, len)
	struct sockaddr **namp;
	caddr_t uaddr;
	size_t len;
{
	struct sockaddr *sa;
	int error;

	if (len > SOCK_MAXADDRLEN)
		return (ENAMETOOLONG);
	if (len < offsetof(struct sockaddr, sa_data[0]))
		return (EINVAL);
	sa = malloc(len, M_SONAME, M_WAITOK);
	error = copyin(uaddr, sa, len);
	if (error != 0) {
		free(sa, M_SONAME);
	} else {
#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
			sa->sa_family = sa->sa_len;
#endif
		sa->sa_len = len;
		*namp = sa;
	}
	return (error);
}

static int
filt_sfsync_attach(struct knote *kn)
{
	struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_sdata;
	struct knlist *knl = &sfs->klist;

	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);

	/*
	 * Validate that we actually received this via the kernel API.
	 */
	if ((kn->kn_flags & EV_FLAG1) == 0)
		return (EPERM);

	kn->kn_ptr.p_v = sfs;
	kn->kn_flags &= ~EV_FLAG1;

	knl->kl_lock(knl->kl_lockarg);
	/*
	 * If we're in the "freeing" state,
	 * don't allow the add.  That way we don't
	 * end up racing with some other thread that
	 * is trying to finish some setup.
	 */
	if (sfs->state == SF_STATE_FREEING) {
		knl->kl_unlock(knl->kl_lockarg);
		return (EINVAL);
	}
	knlist_add(&sfs->klist, kn, 1);
	knl->kl_unlock(knl->kl_lockarg);

	return (0);
}

/*
 * Called when a knote is being detached.
 */
static void
filt_sfsync_detach(struct knote *kn)
{
	struct knlist *knl;
	struct sendfile_sync *sfs;
	int do_free = 0;

	sfs = kn->kn_ptr.p_v;
	knl = &sfs->klist;

	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);

	knl->kl_lock(knl->kl_lockarg);
	if (!knlist_empty(knl))
		knlist_remove(knl, kn, 1);

	/*
	 * If the list is empty _AND_ the refcount is 0
	 * _AND_ we've finished the setup phase and now
	 * we're in the running phase, we can free the
	 * underlying sendfile_sync.
	 *
	 * But we shouldn't do it before finishing the
	 * underlying divorce from the knote.
	 *
	 * So, we have the sfsync lock held; transition
	 * it to "freeing", then unlock, then free
	 * normally.
	 */
	if (knlist_empty(knl)) {
		if (sfs->state == SF_STATE_COMPLETED && sfs->count == 0) {
			SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, "
			    "count==0, empty list: time to free!\n",
			    __func__,
			    (unsigned long long) curthread->td_tid,
			    sfs);
			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
			do_free = 1;
		}
	}
	knl->kl_unlock(knl->kl_lockarg);

	/*
	 * Only call free if we're the one who has transitioned things
	 * to free.  Otherwise we could race with another thread that
	 * is currently tearing things down.
	 */
	if (do_free == 1) {
		SFSYNC_DPRINTF("%s: (%llu) sfs=%p, %s:%d\n",
		    __func__,
		    (unsigned long long) curthread->td_tid,
		    sfs,
		    __FILE__,
		    __LINE__);
		sf_sync_free(sfs);
	}
}

static int
filt_sfsync(struct knote *kn, long hint)
{
	struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_ptr.p_v;
	int ret;

	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);

	/*
	 * XXX add a lock assertion here!
	 */
	ret = (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED);

	return (ret);
}

/*
 * Add more references to a vm_page + sf_buf + sendfile_sync.
 */
void
sf_ext_ref(void *arg1, void *arg2)
{
	struct sf_buf *sf = arg1;
	struct sendfile_sync *sfs = arg2;
	vm_page_t pg = sf_buf_page(sf);

	sf_buf_ref(sf);

	vm_page_lock(pg);
	vm_page_wire(pg);
	vm_page_unlock(pg);

	if (sfs != NULL) {
		mtx_lock(&sfs->mtx);
		KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
		sfs->count++;
		mtx_unlock(&sfs->mtx);
	}
}

/*
 * Detach mapped page and release resources back to the system.
 */
void
sf_ext_free(void *arg1, void *arg2)
{
	struct sf_buf *sf = arg1;
	struct sendfile_sync *sfs = arg2;
	vm_page_t pg = sf_buf_page(sf);

	sf_buf_free(sf);

	vm_page_lock(pg);
	vm_page_unwire(pg, PQ_INACTIVE);
	/*
	 * Check for the object going away on us.  This can
	 * happen since we don't hold a reference to it.
	 * If so, we're responsible for freeing the page.
	 */
	if (pg->wire_count == 0 && pg->object == NULL)
		vm_page_free(pg);
	vm_page_unlock(pg);

	if (sfs != NULL)
		sf_sync_deref(sfs);
}

/*
 * Called to remove a reference to a sf_sync object.
 *
 * This is generally done during the mbuf free path to signify
 * that one of the mbufs in the transaction has been completed.
 *
 * If we're doing SF_SYNC and the refcount is zero then we'll wake
 * up any waiters.
 *
 * If we're doing SF_KQUEUE and the refcount is zero then we'll
 * fire off the knote.
 */
void
sf_sync_deref(struct sendfile_sync *sfs)
{
	int do_free = 0;

	if (sfs == NULL)
		return;

	mtx_lock(&sfs->mtx);
	KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
	sfs->count--;

	/*
	 * Only fire off the wakeup / kqueue notification if
	 * we are in the running state.
	 */
	if (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED) {
		if (sfs->flags & SF_SYNC)
			cv_signal(&sfs->cv);

		if (sfs->flags & SF_KQUEUE) {
			SFSYNC_DPRINTF("%s: (%llu) sfs=%p: knote!\n",
			    __func__,
			    (unsigned long long) curthread->td_tid,
			    sfs);
			KNOTE_LOCKED(&sfs->klist, 1);
		}

		/*
		 * If we're not waiting around for a sync,
		 * check if the knote list is empty.
		 * If it is, we transition to free.
		 *
		 * XXX I think it's about time I added some state
		 * or flag that says whether we're supposed to be
		 * waiting around until we've done a signal.
		 *
		 * XXX Ie, the reason that I don't free it here
		 * is because the caller will free the last reference,
		 * not us.  That should be codified in some flag
		 * that indicates "self-free" rather than checking
		 * for SF_SYNC all the time.
		 */
		if ((sfs->flags & SF_SYNC) == 0 && knlist_empty(&sfs->klist)) {
			SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, "
			    "count==0, empty list: time to free!\n",
			    __func__,
			    (unsigned long long) curthread->td_tid,
			    sfs);
			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
			do_free = 1;
		}

	}
	mtx_unlock(&sfs->mtx);

	/*
	 * Attempt to do a free here.
	 *
	 * We do this outside of the lock because it may destroy the
	 * lock in question as it frees things.  We can optimise this
	 * later.
	 *
	 * XXX yes, we should make it a requirement to hold the
	 * lock across sf_sync_free().
	 */
	if (do_free == 1) {
		SFSYNC_DPRINTF("%s: (%llu) sfs=%p\n",
		    __func__,
		    (unsigned long long) curthread->td_tid,
		    sfs);
		sf_sync_free(sfs);
	}
}

/*
 * Allocate a sendfile_sync state structure.
 *
 * For now this only knows about the "sleep" sync, but later it will
 * grow various other personalities.
 */
struct sendfile_sync *
sf_sync_alloc(uint32_t flags)
{
	struct sendfile_sync *sfs;

	sfs = uma_zalloc(zone_sfsync, M_WAITOK | M_ZERO);
	mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
	cv_init(&sfs->cv, "sendfile");
	sfs->flags = flags;
	sfs->state = SF_STATE_SETUP;
	knlist_init_mtx(&sfs->klist, &sfs->mtx);

	SFSYNC_DPRINTF("%s: sfs=%p, flags=0x%08x\n", __func__, sfs, sfs->flags);

	return (sfs);
}

/*
 * Take a reference to a sfsync instance.
 *
 * This has to map 1:1 to free calls coming in via sf_ext_free(),
 * so typically this will be referenced once for each mbuf allocated.
 */
void
sf_sync_ref(struct sendfile_sync *sfs)
{

	if (sfs == NULL)
		return;

	mtx_lock(&sfs->mtx);
	sfs->count++;
	mtx_unlock(&sfs->mtx);
}

void
sf_sync_syscall_wait(struct sendfile_sync *sfs)
{

	if (sfs == NULL)
		return;

	KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!",
	    __func__,
	    sfs));

	/*
	 * If we're not requested to wait during the syscall,
	 * don't bother waiting.
	 */
	if ((sfs->flags & SF_SYNC) == 0)
		goto out;

	/*
	 * This is a bit suboptimal and confusing, so bear with me.
	 *
	 * Ideally sf_sync_syscall_wait() will wait until
	 * all pending mbuf transmit operations are done.
	 * This means that when sendfile becomes async, it'll
	 * run in the background and will transition from
	 * RUNNING to COMPLETED when it's finished acquiring
	 * new things to send.  Then, when the mbufs finish
	 * sending, COMPLETED + sfs->count == 0 is enough to
	 * know that no further work is being done.
	 *
	 * So, we will sleep on both RUNNING and COMPLETED.
	 * It's up to the (in progress) async sendfile loop
	 * to transition the sf_sync from RUNNING to
	 * COMPLETED so the wakeup above will actually
	 * do the cv_signal() call.
	 */
	if (sfs->state != SF_STATE_COMPLETED && sfs->state != SF_STATE_RUNNING)
		goto out;

	if (sfs->count != 0)
		cv_wait(&sfs->cv, &sfs->mtx);
	KASSERT(sfs->count == 0, ("sendfile sync still busy"));

out:
	return;
}

/*
 * Free an sf_sync if it's appropriate to.
 */
void
sf_sync_free(struct sendfile_sync *sfs)
{

	if (sfs == NULL)
		return;

	SFSYNC_DPRINTF("%s: (%lld) sfs=%p; called; state=%d, flags=0x%08x "
	    "count=%d\n",
	    __func__,
	    (long long) curthread->td_tid,
	    sfs,
	    sfs->state,
	    sfs->flags,
	    sfs->count);

	mtx_lock(&sfs->mtx);

	/*
	 * We keep the sf_sync around if the state is active,
	 * we are doing kqueue notification and we have active
	 * knotes.
	 *
	 * If the caller wants to free us right this second it
	 * should transition this to the freeing state.
	 *
	 * So, complain loudly if they break this rule.
	 */
	if (sfs->state != SF_STATE_FREEING) {
		printf("%s: (%llu) sfs=%p; not freeing; let's wait!\n",
		    __func__,
		    (unsigned long long) curthread->td_tid,
		    sfs);
		mtx_unlock(&sfs->mtx);
		return;
	}

	KASSERT(sfs->count == 0, ("sendfile sync still busy"));
	cv_destroy(&sfs->cv);
	/*
	 * This doesn't call knlist_detach() on each knote; it just frees
	 * the entire list.
	 */
	knlist_delete(&sfs->klist, curthread, 1);
	mtx_destroy(&sfs->mtx);
	SFSYNC_DPRINTF("%s: (%llu) sfs=%p; freeing\n",
	    __func__,
	    (unsigned long long) curthread->td_tid,
	    sfs);
	uma_zfree(zone_sfsync, sfs);
}

/*
 * Setup a sf_sync to post a kqueue notification when things are complete.
 */
int
sf_sync_kqueue_setup(struct sendfile_sync *sfs, struct sf_hdtr_kq *sfkq)
{
	struct kevent kev;
	int error;

	sfs->flags |= SF_KQUEUE;

	/* Check the flags are valid */
	if ((sfkq->kq_flags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0)
		return (EINVAL);

	SFSYNC_DPRINTF("%s: sfs=%p: kqfd=%d, flags=0x%08x, ident=%p, udata=%p\n",
	    __func__,
	    sfs,
	    sfkq->kq_fd,
	    sfkq->kq_flags,
	    (void *) sfkq->kq_ident,
	    (void *) sfkq->kq_udata);

	/* Setup and register a knote on the given kqfd. */
	kev.ident = (uintptr_t) sfkq->kq_ident;
	kev.filter = EVFILT_SENDFILE;
	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | sfkq->kq_flags;
	kev.data = (intptr_t) sfs;
	kev.udata = sfkq->kq_udata;

	error = kqfd_register(sfkq->kq_fd, &kev, curthread, 1);
	if (error != 0) {
		SFSYNC_DPRINTF("%s: returned %d\n", __func__, error);
	}
	return (error);
}

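/*
 * Illustrative userland sketch (not part of this file, and this
 * EVFILT_SENDFILE interface is experimental): sf_sync_kqueue_setup()
 * above registers the knote itself via kqfd_register(), marking it with
 * EV_FLAG1 so filt_sfsync_attach() can tell it came from the kernel.
 * The caller only supplies the kqueue descriptor, ident and udata in
 * struct sf_hdtr_kq, then collects the completion event later
 * (handle_completion() below is a placeholder):
 *
 *	struct kevent ev;
 *
 *	if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1 &&
 *	    ev.filter == EVFILT_SENDFILE)
 *		handle_completion(ev.ident, ev.udata);
 */
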
void
sf_sync_set_state(struct sendfile_sync *sfs, sendfile_sync_state_t state,
    int islocked)
{
	sendfile_sync_state_t old_state;

	if (!islocked)
		mtx_lock(&sfs->mtx);

	/*
	 * Update our current state.
	 */
	old_state = sfs->state;
	sfs->state = state;
	SFSYNC_DPRINTF("%s: (%llu) sfs=%p; going from %d to %d\n",
	    __func__,
	    (unsigned long long) curthread->td_tid,
	    sfs,
	    old_state,
	    state);

	/*
	 * If we're transitioning from RUNNING to COMPLETED and the count is
	 * zero, then post the knote.  The caller may have completed the
	 * send before we updated the state to COMPLETED and we need to make
	 * sure this is communicated.
	 */
	if (old_state == SF_STATE_RUNNING &&
	    state == SF_STATE_COMPLETED &&
	    sfs->count == 0 &&
	    (sfs->flags & SF_KQUEUE) != 0) {
		SFSYNC_DPRINTF("%s: (%llu) sfs=%p: triggering knote!\n",
		    __func__,
		    (unsigned long long) curthread->td_tid,
		    sfs);
		KNOTE_LOCKED(&sfs->klist, 1);
	}

	if (!islocked)
		mtx_unlock(&sfs->mtx);
}

/*
 * Set the retval/errno for the given transaction.
 *
 * This will eventually/ideally be used when the KNOTE is fired off
 * to signify the completion of this transaction.
 *
 * The sfsync lock should be held before entering this function.
 */
void
sf_sync_set_retval(struct sendfile_sync *sfs, off_t retval, int xerrno)
{

	KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!",
	    __func__,
	    sfs));

	SFSYNC_DPRINTF("%s: (%llu) sfs=%p: errno=%d, retval=%jd\n",
	    __func__,
	    (unsigned long long) curthread->td_tid,
	    sfs,
	    xerrno,
	    (intmax_t) retval);

	sfs->retval = retval;
	sfs->xerrno = xerrno;
}

/*
 * sendfile(2)
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *	struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Send a file specified by 'fd' and starting at 'offset' to a socket
 * specified by 's'.  Send only 'nbytes' of the file, or until EOF if
 * nbytes == 0.  Optionally add a header and/or trailer to the socket
 * output.  If specified, write the total number of bytes sent into
 * *sbytes.
 */
int
sys_sendfile(struct thread *td, struct sendfile_args *uap)
{

	return (do_sendfile(td, uap, 0));
}
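/*
 * Example (userland sketch; error handling abbreviated): plain
 * sendfile(2) usage as documented above, prepending a single
 * header iovec to the file data and collecting the byte count:
 *
 *	struct iovec hiov = { .iov_base = hdr, .iov_len = hdrlen };
 *	struct sf_hdtr hdtr = {
 *		.headers = &hiov, .hdr_cnt = 1,
 *		.trailers = NULL, .trl_cnt = 0,
 *	};
 *	off_t sent;
 *
 *	if (sendfile(file_fd, sock_fd, 0, 0, &hdtr, &sent, 0) == -1)
 *		err(1, "sendfile");
 */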
int
_do_sendfile(struct thread *td, int src_fd, int sock_fd, int flags,
    int compat, off_t offset, size_t nbytes, off_t *sbytes,
    struct uio *hdr_uio,
    struct uio *trl_uio, struct sf_hdtr_kq *hdtr_kq)
{
	cap_rights_t rights;
	struct sendfile_sync *sfs = NULL;
	struct file *fp;
	int error;
	int do_kqueue = 0;
	int do_free = 0;

	AUDIT_ARG_FD(src_fd);

	if (hdtr_kq != NULL)
		do_kqueue = 1;

	/*
	 * sendfile(2) can start at any offset within a file so we require
	 * CAP_READ+CAP_SEEK = CAP_PREAD.
	 */
	if ((error = fget_read(td, src_fd,
	    cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) {
		goto out;
	}

	/*
	 * If SF_KQUEUE is set but we haven't copied in anything for
	 * kqueue data, error out.
	 */
	if (flags & SF_KQUEUE && do_kqueue == 0) {
		SFSYNC_DPRINTF("%s: SF_KQUEUE but no KQUEUE data!\n", __func__);
		error = EINVAL;
		fdrop(fp, td);
		goto out;
	}

	/*
	 * If we need to wait for completion, initialise the sfsync
	 * state here.
	 */
	if (flags & (SF_SYNC | SF_KQUEUE))
		sfs = sf_sync_alloc(flags & (SF_SYNC | SF_KQUEUE));

	if (flags & SF_KQUEUE) {
		error = sf_sync_kqueue_setup(sfs, hdtr_kq);
		if (error) {
			SFSYNC_DPRINTF("%s: (%llu) error; sfs=%p\n",
			    __func__,
			    (unsigned long long) curthread->td_tid,
			    sfs);
			sf_sync_set_state(sfs, SF_STATE_FREEING, 0);
			sf_sync_free(sfs);
			fdrop(fp, td);
			goto out;
		}
	}

	/*
	 * Do the sendfile call.
	 *
	 * If this fails, it'll free the mbuf chain which will free up the
	 * sendfile_sync references.
	 */
	error = fo_sendfile(fp, sock_fd, hdr_uio, trl_uio, offset,
	    nbytes, sbytes, flags, compat ? SFK_COMPAT : 0, sfs, td);

	/*
	 * If the sendfile call succeeded, transition the sf_sync state
	 * to RUNNING, then COMPLETED.
	 *
	 * If the sendfile call failed, then the sendfile call may have
	 * actually sent some data first - so we check to see whether
	 * any data was sent.  If some data was queued (ie, count > 0)
	 * then we can't call free; we have to wait until the partial
	 * transaction completes before we continue along.
	 *
	 * This has the side effect of firing off the knote
	 * if the refcount has hit zero by the time we get here.
	 */
	if (sfs != NULL) {
		mtx_lock(&sfs->mtx);
		if (error == 0 || sfs->count > 0) {
			/*
			 * When it's time to do async sendfile, the transition
			 * to RUNNING signifies that we're actually actively
			 * adding and completing mbufs.  When the last disk
			 * buffer is read (ie, when we're not doing any
			 * further read IO and all subsequent stuff is mbuf
			 * transmissions) we'll transition to COMPLETED
			 * and when the final mbuf is freed, the completion
			 * will be signaled.
			 */
			sf_sync_set_state(sfs, SF_STATE_RUNNING, 1);

			/*
			 * Set the retval before we signal completed.
			 * If we do it the other way around then
			 * transitioning to COMPLETED may post the knote
			 * before the return status is set!
			 *
			 * XXX for now, errno is always 0, as we don't post
			 * knotes if sendfile failed.  Maybe that'll change
			 * later.
			 */
			sf_sync_set_retval(sfs, *sbytes, error);

			/*
			 * And now transition to completed, which will kick
			 * off the knote if required.
			 */
			sf_sync_set_state(sfs, SF_STATE_COMPLETED, 1);
		} else {
			/*
			 * The error is non-zero and no mbufs are
			 * outstanding (sfs->count == 0), so nothing else
			 * will ever wake things up.  Free the state now.
			 */
			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
			do_free = 1;
		}

		/*
		 * Next, wait if appropriate.
		 */
		sf_sync_syscall_wait(sfs);

		/*
		 * If we're not doing kqueue notifications, we can
		 * transition this immediately to the freeing state.
		 */
		if ((sfs->flags & SF_KQUEUE) == 0) {
			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
			do_free = 1;
		}

		mtx_unlock(&sfs->mtx);
	}

	/*
	 * If do_free is set, free here.
	 *
	 * When we're doing sleep-only notification with no kqueue
	 * notification, this is also our only chance to free.
	 */
	if (sfs != NULL && do_free == 1) {
		sf_sync_free(sfs);
	}

	/*
	 * XXX Should we wait until the send has completed before freeing
	 * the source file handle?  That was the previous behaviour, but
	 * it is not obviously required; the page references have been
	 * wired down already.
	 */
	fdrop(fp, td);

out:
	/* Return error */
	return (error);
}
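/*
 * Example (userland sketch): because the byte count is reported via
 * *sbytes even when the call fails, a caller on a non-blocking socket
 * can resume after EAGAIN from where it left off.  wait_for_writable()
 * is a hypothetical helper, e.g. poll(2) waiting for POLLOUT:
 *
 *	off_t off = 0, sent;
 *
 *	while (off < filesize) {
 *		if (sendfile(file_fd, sock_fd, off, filesize - off,
 *		    NULL, &sent, 0) == -1) {
 *			if (errno != EAGAIN)
 *				err(1, "sendfile");
 *			off += sent;
 *			wait_for_writable(sock_fd);
 *			continue;
 *		}
 *		off += sent;
 *	}
 */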
static int
do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
{
	struct sf_hdtr hdtr;
	struct sf_hdtr_kq hdtr_kq;
	struct uio *hdr_uio, *trl_uio;
	int error;
	off_t sbytes;
	int do_kqueue = 0;

	/*
	 * The file offset must be non-negative.  If it points beyond
	 * EOF we send only the header/trailer and no payload data.
	 */
	if (uap->offset < 0)
		return (EINVAL);

	hdr_uio = trl_uio = NULL;

	if (uap->hdtr != NULL) {
		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
		if (error != 0)
			goto out;
		if (hdtr.headers != NULL) {
			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
			if (error != 0)
				goto out;
		}
		if (hdtr.trailers != NULL) {
			error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
			if (error != 0)
				goto out;
		}

		/*
		 * If SF_KQUEUE is set, then we need to also copy in
		 * the kqueue data that follows the normal hdtr set
		 * and set do_kqueue=1.
		 */
		if (uap->flags & SF_KQUEUE) {
			error = copyin(((char *) uap->hdtr) + sizeof(hdtr),
			    &hdtr_kq,
			    sizeof(hdtr_kq));
			if (error != 0)
				goto out;
			do_kqueue = 1;
		}
	}

	/*
	 * Call sendfile.  Only pass the kqueue data down if it was
	 * actually copied in above; hdtr_kq is uninitialised otherwise.
	 */
	error = _do_sendfile(td, uap->fd, uap->s, uap->flags, compat,
	    uap->offset, uap->nbytes, &sbytes, hdr_uio, trl_uio,
	    do_kqueue ? &hdtr_kq : NULL);

	/*
	 * Copy the byte count out even on error; partial progress is
	 * reported this way (e.g. after EAGAIN).
	 */
	if (uap->sbytes != NULL) {
		if (copyout(&sbytes, uap->sbytes, sizeof(off_t)) != 0 &&
		    error == 0)
			error = EFAULT;
	}
out:
	free(hdr_uio, M_IOV);
	free(trl_uio, M_IOV);
	return (error);
}

#ifdef COMPAT_FREEBSD4
int
freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
{
	struct sendfile_args args;

	args.fd = uap->fd;
	args.s = uap->s;
	args.offset = uap->offset;
	args.nbytes = uap->nbytes;
	args.hdtr = uap->hdtr;
	args.sbytes = uap->sbytes;
	args.flags = uap->flags;

	return (do_sendfile(td, &args, 1));
}
#endif /* COMPAT_FREEBSD4 */
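/*
 * Example (userland sketch; the back-to-back layout is taken from the
 * copyin logic above, member names from sf_sync_kqueue_setup(), and
 * no padding between the members is assumed): with SF_KQUEUE set, the
 * kernel reads a struct sf_hdtr_kq placed directly after the struct
 * sf_hdtr, so both can be passed in one buffer:
 *
 *	struct {
 *		struct sf_hdtr hdtr;
 *		struct sf_hdtr_kq kq;
 *	} h;
 *
 *	memset(&h, 0, sizeof(h));
 *	h.kq.kq_fd = kq;			// kqueue to notify
 *	h.kq.kq_flags = EV_ONESHOT;		// EV_CLEAR/EV_DISPATCH also valid
 *	h.kq.kq_ident = (uintptr_t)token;	// returned as ev.ident
 *	h.kq.kq_udata = udata;			// returned as ev.udata
 *
 *	if (sendfile(file_fd, sock_fd, 0, 0, (struct sf_hdtr *)&h,
 *	    &sent, SF_KQUEUE) == -1)
 *		err(1, "sendfile");
 */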
static int
sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd,
    off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res)
{
	vm_page_t m;
	vm_pindex_t pindex;
	ssize_t resid;
	int error, readahead, rv;

	pindex = OFF_TO_IDX(off);
	VM_OBJECT_WLOCK(obj);
	m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY |
	    VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL);

	/*
	 * Check if the page is valid for what we need, otherwise
	 * initiate I/O.
	 *
	 * A non-zero nd argument prevents disk I/O; instead we return
	 * nd to the caller.  In particular, if we have already turned
	 * some pages into mbufs, nd == EAGAIN, and the main function
	 * sends those pages before we come back here and block.
	 */
	if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) {
		if (vp == NULL)
			vm_page_xunbusy(m);
		VM_OBJECT_WUNLOCK(obj);
		*res = m;
		return (0);
	} else if (nd != 0) {
		if (vp == NULL)
			vm_page_xunbusy(m);
		error = nd;
		goto free_page;
	}

	/*
	 * Get the page from backing store.
	 */
	error = 0;
	if (vp != NULL) {
		VM_OBJECT_WUNLOCK(obj);
		readahead = sfreadahead * MAXBSIZE;

		/*
		 * Use vn_rdwr() instead of the pager interface for
		 * the vnode, to allow the read-ahead.
		 *
		 * XXXMAC: Because we don't have fp->f_cred here, we
		 * pass in NOCRED.  This is probably wrong, but is
		 * consistent with our original implementation.
		 */
		error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off),
		    UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead /
		    bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td);
		SFSTAT_INC(sf_iocnt);
		VM_OBJECT_WLOCK(obj);
	} else {
		if (vm_pager_has_page(obj, pindex, NULL, NULL)) {
			rv = vm_pager_get_pages(obj, &m, 1, 0);
			SFSTAT_INC(sf_iocnt);
			m = vm_page_lookup(obj, pindex);
			if (m == NULL)
				error = EIO;
			else if (rv != VM_PAGER_OK) {
				vm_page_lock(m);
				vm_page_free(m);
				vm_page_unlock(m);
				m = NULL;
				error = EIO;
			}
		} else {
			pmap_zero_page(m);
			m->valid = VM_PAGE_BITS_ALL;
			m->dirty = 0;
		}
		if (m != NULL)
			vm_page_xunbusy(m);
	}
	if (error == 0) {
		*res = m;
	} else if (m != NULL) {
free_page:
		vm_page_lock(m);
		vm_page_unwire(m, PQ_INACTIVE);

		/*
		 * See if anyone else might know about this page.  If
		 * not and it is not valid, then free it.
		 */
		if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m))
			vm_page_free(m);
		vm_page_unlock(m);
	}
	KASSERT(error != 0 || (m->wire_count > 0 &&
	    vm_page_is_valid(m, off & PAGE_MASK, xfsize)),
	    ("wrong page state m %p off %#jx xfsize %d", m, (uintmax_t)off,
	    xfsize));
	VM_OBJECT_WUNLOCK(obj);
	return (error);
}

static int
sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
    struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size,
    int *bsize)
{
	struct vattr va;
	vm_object_t obj;
	struct vnode *vp;
	struct shmfd *shmfd;
	int error;

	vp = *vp_res = NULL;
	obj = NULL;
	shmfd = *shmfd_res = NULL;
	*bsize = 0;

	/*
	 * The file descriptor must be a regular file and have a
	 * backing VM object.
	 */
	if (fp->f_type == DTYPE_VNODE) {
		vp = fp->f_vnode;
		vn_lock(vp, LK_SHARED | LK_RETRY);
		if (vp->v_type != VREG) {
			error = EINVAL;
			goto out;
		}
		*bsize = vp->v_mount->mnt_stat.f_iosize;
		error = VOP_GETATTR(vp, &va, td->td_ucred);
		if (error != 0)
			goto out;
		*obj_size = va.va_size;
		obj = vp->v_object;
		if (obj == NULL) {
			error = EINVAL;
			goto out;
		}
	} else if (fp->f_type == DTYPE_SHM) {
		shmfd = fp->f_data;
		obj = shmfd->shm_object;
		*obj_size = shmfd->shm_size;
	} else {
		error = EINVAL;
		goto out;
	}

	VM_OBJECT_WLOCK(obj);
	if ((obj->flags & OBJ_DEAD) != 0) {
		VM_OBJECT_WUNLOCK(obj);
		error = EBADF;
		goto out;
	}

	/*
	 * Temporarily increase the backing VM object's reference
	 * count so that a forced reclamation of its vnode does not
	 * immediately destroy it.
	 */
	vm_object_reference_locked(obj);
	VM_OBJECT_WUNLOCK(obj);
	*obj_res = obj;
	*vp_res = vp;
	*shmfd_res = shmfd;

out:
	if (vp != NULL)
		VOP_UNLOCK(vp, 0);
	return (error);
}
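/*
 * Example (userland sketch; error handling abbreviated):
 * sendfile_getobj() above accepts POSIX shared memory objects as well
 * as regular files, so an anonymous shm object can be streamed
 * directly:
 *
 *	int fd = shm_open(SHM_ANON, O_RDWR, 0600);
 *	off_t sent;
 *
 *	ftruncate(fd, len);
 *	// ... fill the object via mmap(2) ...
 *	if (sendfile(fd, sock_fd, 0, len, NULL, &sent, 0) == -1)
 *		err(1, "sendfile");
 */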
static int
kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
    struct socket **so)
{
	cap_rights_t rights;
	int error;

	*sock_fp = NULL;
	*so = NULL;

	/*
	 * The socket must be a stream socket and connected.
	 */
	error = getsock_cap(td->td_proc->p_fd, s, cap_rights_init(&rights,
	    CAP_SEND), sock_fp, NULL);
	if (error != 0)
		return (error);
	*so = (*sock_fp)->f_data;
	if ((*so)->so_type != SOCK_STREAM)
		return (EINVAL);
	if (((*so)->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	return (0);
}

int
vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    int kflags, struct sendfile_sync *sfs, struct thread *td)
{
	struct file *sock_fp;
	struct vnode *vp;
	struct vm_object *obj;
	struct socket *so;
	struct mbuf *m;
	struct sf_buf *sf;
	struct vm_page *pg;
	struct shmfd *shmfd;
	struct vattr va;
	off_t off, xfsize, fsbytes, sbytes, rem, obj_size;
	int error, bsize, nd, hdrlen, mnw;

	pg = NULL;
	obj = NULL;
	so = NULL;
	m = NULL;
	fsbytes = sbytes = 0;
	hdrlen = mnw = 0;
	rem = nbytes;
	obj_size = 0;

	error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
	if (error != 0)
		return (error);
	if (rem == 0)
		rem = obj_size;

	error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so);
	if (error != 0)
		goto out;

	/*
	 * Do not wait on memory allocations, but return ENOMEM for
	 * the caller to retry later.
	 * XXX: Experimental.
	 */
	if (flags & SF_MNOWAIT)
		mnw = 1;

#ifdef MAC
	error = mac_socket_check_send(td->td_ucred, so);
	if (error != 0)
		goto out;
#endif

	/* If headers are specified copy them into mbufs. */
	if (hdr_uio != NULL) {
		hdr_uio->uio_td = td;
		hdr_uio->uio_rw = UIO_WRITE;
		if (hdr_uio->uio_resid > 0) {
			/*
			 * In FreeBSD < 5.0 the nbytes to send also
			 * included the header.  If compat is specified,
			 * subtract the header size from nbytes.
			 */
			if (kflags & SFK_COMPAT) {
				if (nbytes > hdr_uio->uio_resid)
					nbytes -= hdr_uio->uio_resid;
				else
					nbytes = 0;
			}
			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
			    0, 0, 0);
			if (m == NULL) {
				error = mnw ? EAGAIN : ENOBUFS;
				goto out;
			}
			hdrlen = m_length(m, NULL);
		}
	}
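	/*
	 * Worked example of the SFK_COMPAT adjustment above, with
	 * illustrative numbers: a FreeBSD 4 caller passing nbytes = 4096
	 * and a 128-byte header expects 128 header bytes plus 3968 bytes
	 * of file data on the wire, so nbytes is reduced to
	 * 4096 - 128 = 3968 before the file loop below runs.
	 */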
	/*
	 * Protect against multiple writers to the socket.
	 *
	 * XXXRW: Historically this has assumed non-interruptibility, so now
	 * we implement that, but possibly shouldn't.
	 */
	(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);

	/*
	 * Loop through the pages of the file, starting with the requested
	 * offset.  Get a file page (do I/O if necessary), map the file page
	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
	 * it on the socket.
	 *
	 * This is done in two loops.  The inner loop turns as many pages
	 * as it can, up to the available socket buffer space, into mbufs
	 * without blocking, so that they can be bulk delivered into the
	 * socket send buffer.  The outer loop checks the state and
	 * available space of the socket and takes care of the overall
	 * progress.
	 */
	for (off = offset; ; ) {
		struct mbuf *mtail;
		int loopbytes;
		int space;
		int done;

		if ((nbytes != 0 && nbytes == fsbytes) ||
		    (nbytes == 0 && obj_size == fsbytes))
			break;

		mtail = NULL;
		loopbytes = 0;
		space = 0;
		done = 0;

		/*
		 * Check the socket state for an ongoing connection,
		 * no errors and space in the socket buffer.
		 * If space is low, allow for the remainder of the
		 * file to be processed if it fits the socket buffer.
		 * Otherwise block waiting for sufficient space to
		 * proceed, or, if the socket is nonblocking, return
		 * to userland with EAGAIN while reporting how far
		 * we've come.
		 *
		 * We wait until the socket buffer has significant free
		 * space to do bulk sends.  This makes good use of file
		 * system read-ahead and allows packet segmentation
		 * offloading hardware to take over lots of work.  If
		 * we were not careful here we would send off only one
		 * sf_buf at a time.
		 */
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
retry_space:
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			error = EPIPE;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto done;
		} else if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto done;
		}
		space = sbspace(&so->so_snd);
		if (space < rem &&
		    (space <= 0 ||
		     space < so->so_snd.sb_lowat)) {
			if (so->so_state & SS_NBIO) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EAGAIN;
				goto done;
			}
			/*
			 * sbwait() drops the lock while sleeping.
			 * When we loop back to retry_space the
			 * state may have changed and we retest
			 * for it.
			 */
			error = sbwait(&so->so_snd);
			/*
			 * An error from sbwait() usually indicates that
			 * we've been interrupted by a signal.  If we've
			 * sent anything then return bytes sent, otherwise
			 * return the error.
			 */
			if (error != 0) {
				SOCKBUF_UNLOCK(&so->so_snd);
				goto done;
			}
			goto retry_space;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
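		/*
		 * Worked example of the low watermark adjustment above,
		 * with illustrative numbers: for sb_hiwat = 64 KiB the
		 * low watermark becomes 32 KiB, so once the buffer fills
		 * we sleep until at least 32 KiB has drained, then hand
		 * the socket another large mbuf chain in one go rather
		 * than waking up to append a single page-sized sf_buf
		 * at a time.
		 */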
		/*
		 * Reduce the space in the socket buffer by the size of
		 * the header mbuf chain.
		 * hdrlen is set to 0 after the first loop.
		 */
		space -= hdrlen;

		if (vp != NULL) {
			error = vn_lock(vp, LK_SHARED);
			if (error != 0)
				goto done;
			error = VOP_GETATTR(vp, &va, td->td_ucred);
			if (error != 0 || off >= va.va_size) {
				VOP_UNLOCK(vp, 0);
				goto done;
			}
			obj_size = va.va_size;
		}

		/*
		 * Loop and construct a maximum sized mbuf chain to be
		 * bulk dumped into the socket buffer.
		 */
		while (space > loopbytes) {
			vm_offset_t pgoff;
			struct mbuf *m0;

			/*
			 * Calculate the amount to transfer: not to exceed
			 * a page, the EOF, or the passed-in nbytes.
			 */
			pgoff = (vm_offset_t)(off & PAGE_MASK);
			rem = obj_size - offset;
			if (nbytes != 0)
				rem = omin(rem, nbytes);
			rem -= fsbytes + loopbytes;
			xfsize = omin(PAGE_SIZE - pgoff, rem);
			xfsize = omin(space - loopbytes, xfsize);
			if (xfsize <= 0) {
				done = 1;	/* all data sent */
				break;
			}

			/*
			 * Attempt to look up the page.  Allocate
			 * if not found, or wait and loop if busy.
			 */
			if (m != NULL)
				nd = EAGAIN;	/* send what we already got */
			else if ((flags & SF_NODISKIO) != 0)
				nd = EBUSY;
			else
				nd = 0;
			error = sendfile_readpage(obj, vp, nd, off,
			    xfsize, bsize, td, &pg);
			if (error != 0) {
				if (error == EAGAIN)
					error = 0;	/* not a real error */
				break;
			}

			/*
			 * Get a sendfile buf.  When allocating the
			 * first buffer for the mbuf chain, we usually
			 * wait as long as necessary, but this wait
			 * can be interrupted.  For subsequent
			 * buffers, do not sleep, since several
			 * threads might exhaust the buffers and then
			 * deadlock.
			 */
			sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
			    SFB_CATCH);
			if (sf == NULL) {
				SFSTAT_INC(sf_allocfail);
				vm_page_lock(pg);
				vm_page_unwire(pg, PQ_INACTIVE);
				KASSERT(pg->object != NULL,
				    ("%s: object disappeared", __func__));
				vm_page_unlock(pg);
				if (m == NULL)
					error = (mnw ? EAGAIN : EINTR);
				break;
			}

			/*
			 * Get an mbuf and set it up as having
			 * external storage.
			 */
			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
			if (m0 == NULL) {
				error = (mnw ? EAGAIN : ENOBUFS);
				sf_ext_free(sf, NULL);
				break;
			}
			/*
			 * Attach EXT_SFBUF external storage.
			 */
			m0->m_ext.ext_buf = (caddr_t)sf_buf_kva(sf);
			m0->m_ext.ext_size = PAGE_SIZE;
			m0->m_ext.ext_arg1 = sf;
			m0->m_ext.ext_arg2 = sfs;
			m0->m_ext.ext_type = EXT_SFBUF;
			m0->m_ext.ext_flags = 0;
			m0->m_flags |= (M_EXT | M_RDONLY);
			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
			m0->m_len = xfsize;

			/* Append to mbuf chain. */
			if (mtail != NULL)
				mtail->m_next = m0;
			else if (m != NULL)
				m_last(m)->m_next = m0;
			else
				m = m0;
			mtail = m0;

			/* Keep track of bytes processed. */
			loopbytes += xfsize;
			off += xfsize;

			/*
			 * XXX eventually this should be a sfsync
			 * method call!
			 */
			if (sfs != NULL)
				sf_sync_ref(sfs);
		}

		if (vp != NULL)
			VOP_UNLOCK(vp, 0);

		/* Add the buffer chain to the socket buffer. */
		if (m != NULL) {
			int mlen, err;

			mlen = m_length(m, NULL);
			SOCKBUF_LOCK(&so->so_snd);
			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
				error = EPIPE;
				SOCKBUF_UNLOCK(&so->so_snd);
				goto done;
			}
			SOCKBUF_UNLOCK(&so->so_snd);
			CURVNET_SET(so->so_vnet);
			/* Avoid error aliasing. */
			err = (*so->so_proto->pr_usrreqs->pru_send)
			    (so, 0, m, NULL, NULL, td);
			CURVNET_RESTORE();
			if (err == 0) {
				/*
				 * We need two counters to get the
				 * file offset and nbytes to send
				 * right:
				 * - sbytes contains the total amount
				 *   of bytes sent, including headers.
				 * - fsbytes contains the total amount
				 *   of bytes sent from the file.
				 */
				sbytes += mlen;
				fsbytes += mlen;
				if (hdrlen) {
					fsbytes -= hdrlen;
					hdrlen = 0;
				}
			} else if (error == 0)
				error = err;
			m = NULL;	/* pru_send always consumes */
		}

		/* Quit the outer loop on error or when we're done. */
		if (done)
			break;
		if (error != 0)
			goto done;
	}
	/*
	 * Send trailers.  Wimp out and use writev(2).
	 */
	if (trl_uio != NULL) {
		sbunlock(&so->so_snd);
		error = kern_writev(td, sockfd, trl_uio);
		if (error == 0)
			sbytes += td->td_retval[0];
		goto out;
	}

done:
	sbunlock(&so->so_snd);
out:
	/*
	 * If there was no error we have to clear td->td_retval[0]
	 * because it may have been set by writev.
	 */
	if (error == 0)
		td->td_retval[0] = 0;
	if (sent != NULL)
		*sent = sbytes;
	if (obj != NULL)
		vm_object_deallocate(obj);
	if (so != NULL)
		fdrop(sock_fp, td);
	if (m != NULL)
		m_freem(m);

	if (error == ERESTART)
		error = EINTR;

	return (error);
}
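/*
 * Example (sketch; the .fo_sendfile member name is inferred from the
 * fo_sendfile() call in _do_sendfile() above): vn_sendfile() is not
 * called directly by the syscall layer, but dispatched through the
 * per-file-type operations vector, roughly wired up as follows for
 * vnodes (see the fileops in vfs_vnops.c):
 *
 *	struct fileops vnops = {
 *		...
 *		.fo_sendfile = vn_sendfile,
 *		...
 *	};
 */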