1 /*- 2 * Copyright (c) 1982, 1986, 1989, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * sendfile(2) and related extensions: 6 * Copyright (c) 1998, David Greenman. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 4. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 33 */ 34 35 #include <sys/cdefs.h> 36 __FBSDID("$FreeBSD$"); 37 38 #include "opt_capsicum.h" 39 #include "opt_inet.h" 40 #include "opt_inet6.h" 41 #include "opt_compat.h" 42 #include "opt_ktrace.h" 43 44 #include <sys/param.h> 45 #include <sys/systm.h> 46 #include <sys/capsicum.h> 47 #include <sys/condvar.h> 48 #include <sys/kernel.h> 49 #include <sys/lock.h> 50 #include <sys/mutex.h> 51 #include <sys/sysproto.h> 52 #include <sys/malloc.h> 53 #include <sys/filedesc.h> 54 #include <sys/event.h> 55 #include <sys/proc.h> 56 #include <sys/fcntl.h> 57 #include <sys/file.h> 58 #include <sys/filio.h> 59 #include <sys/jail.h> 60 #include <sys/mman.h> 61 #include <sys/mount.h> 62 #include <sys/mbuf.h> 63 #include <sys/protosw.h> 64 #include <sys/rwlock.h> 65 #include <sys/sf_buf.h> 66 #include <sys/sysent.h> 67 #include <sys/socket.h> 68 #include <sys/socketvar.h> 69 #include <sys/signalvar.h> 70 #include <sys/syscallsubr.h> 71 #include <sys/sysctl.h> 72 #include <sys/uio.h> 73 #include <sys/vnode.h> 74 #ifdef KTRACE 75 #include <sys/ktrace.h> 76 #endif 77 #ifdef COMPAT_FREEBSD32 78 #include <compat/freebsd32/freebsd32_util.h> 79 #endif 80 81 #include <net/vnet.h> 82 83 #include <security/audit/audit.h> 84 #include <security/mac/mac_framework.h> 85 86 #include <vm/vm.h> 87 #include <vm/vm_param.h> 88 #include <vm/vm_object.h> 89 #include <vm/vm_page.h> 90 #include <vm/vm_pager.h> 91 #include <vm/vm_kern.h> 92 #include <vm/vm_extern.h> 93 #include <vm/uma.h> 94 95 /* 96 * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC 97 * and SOCK_NONBLOCK. 
 */
#define	ACCEPT4_INHERIT	0x1	/* new fd inherits nonblock/async/owner state */
#define	ACCEPT4_COMPAT	0x2	/* return a 4.3BSD-style osockaddr */

/* Common back ends for the send*(2) and recv*(2) system call families. */
static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);

static int accept1(struct thread *td, int s, struct sockaddr *uname,
	   socklen_t *anamelen, int flags);
static int do_sendfile(struct thread *td, struct sendfile_args *uap,
	   int compat);
static int getsockname1(struct thread *td, struct getsockname_args *uap,
	   int compat);
static int getpeername1(struct thread *td, struct getpeername_args *uap,
	   int compat);

/* Per-CPU counters backing the kern.ipc.sfstat sendfile statistics. */
counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];

/*
 * Allocate the sendfile statistics counters at boot, before any
 * sendfile(2) traffic can occur.
 */
static void
sfstat_init(const void *unused)
{

	COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
	    M_WAITOK);
}
SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);

/*
 * Sysctl handler for kern.ipc.sfstat: copy out an aggregated snapshot of
 * the per-CPU counters; a write to the OID additionally zeroes them.
 */
static int
sfstat_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct sfstat s;

	COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
	if (req->newptr)
		/* Any write request resets the statistics. */
		COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
	return (SYSCTL_OUT(req, &s, sizeof(s)));
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
    NULL, 0, sfstat_sysctl, "I", "sendfile statistics");

/*
 * Convert a user file descriptor to a kernel file entry and check if required
 * capability rights are present.
 * A reference on the file entry is held upon returning.
 */
int
getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp,
    struct file **fpp, u_int *fflagp)
{
	struct file *fp;
	int error;

	error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp, NULL);
	if (error != 0)
		return (error);
	if (fp->f_type != DTYPE_SOCKET) {
		/* Valid descriptor, but it does not refer to a socket. */
		fdrop(fp, td);
		return (ENOTSOCK);
	}
	if (fflagp != NULL)
		*fflagp = fp->f_flag;
	/* Caller owns the reference on *fpp and must fdrop() it. */
	*fpp = fp;
	return (0);
}

/*
 * System call interface to the socket abstraction.
 */
#if defined(COMPAT_43)
#define COMPAT_OLDSOCK
#endif

/*
 * socket(2): create an unbound socket and return its descriptor.
 * SOCK_CLOEXEC/SOCK_NONBLOCK may be OR'ed into the type argument; they are
 * stripped here and translated to descriptor/file flags.
 */
int
sys_socket(td, uap)
	struct thread *td;
	struct socket_args /* {
		int	domain;
		int	type;
		int	protocol;
	} */ *uap;
{
	struct socket *so;
	struct file *fp;
	int fd, error, type, oflag, fflag;

	AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);

	type = uap->type;
	oflag = 0;
	fflag = 0;
	if ((type & SOCK_CLOEXEC) != 0) {
		type &= ~SOCK_CLOEXEC;
		oflag |= O_CLOEXEC;
	}
	if ((type & SOCK_NONBLOCK) != 0) {
		type &= ~SOCK_NONBLOCK;
		fflag |= FNONBLOCK;
	}

#ifdef MAC
	error = mac_socket_check_create(td->td_ucred, uap->domain, type,
	    uap->protocol);
	if (error != 0)
		return (error);
#endif
	error = falloc(td, &fp, &fd, oflag);
	if (error != 0)
		return (error);
	/* An extra reference on `fp' has been held for us by falloc(). */
	error = socreate(uap->domain, &so, type, uap->protocol,
	    td->td_ucred, td);
	if (error != 0) {
		/* Creation failed; take the descriptor back out of the table. */
		fdclose(td, fp, fd);
	} else {
		finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
		if ((fflag & FNONBLOCK) != 0)
			(void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
		td->td_retval[0] = fd;
	}
	/* Drop falloc()'s extra reference; the fd table keeps its own. */
	fdrop(fp, td);
	return (error);
}

/* ARGSUSED */
/*
 * bind(2): assign a local address to a socket.  AF_LOCAL pathnames are
 * resolved relative to the current working directory (AT_FDCWD).
 */
int
sys_bind(td, uap)
	struct thread *td;
	struct bind_args /* {
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_bindat(td, AT_FDCWD, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}

/*
 * Common code for bind(2) and bindat(2).  `dirfd' is the directory
 * descriptor AF_LOCAL pathnames are resolved against (AT_FDCWD for the
 * plain bind(2) path).  `sa' is a kernel copy of the address, owned and
 * freed by the caller.
 */
int
kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
	error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_BIND),
	    &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(sa);
#endif
#ifdef MAC
	error = mac_socket_check_bind(td->td_ucred, so, sa);
	if (error == 0) {
#endif
		if (dirfd == AT_FDCWD)
			error = sobind(so, sa, td);
		else
			error = sobindat(dirfd, so, sa, td);
#ifdef MAC
	}
#endif
	fdrop(fp, td);
	return (error);
}

/* ARGSUSED */
/*
 * bindat(2): like bind(2), but AF_LOCAL pathnames are resolved relative
 * to the directory descriptor `fd'.
 */
int
sys_bindat(td, uap)
	struct thread *td;
	struct bindat_args /* {
		int	fd;
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_bindat(td, uap->fd, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}

/* ARGSUSED */
int
sys_listen(td, uap)
	struct thread *td;
	struct listen_args /* {
		int	s;
		int	backlog;
	} */ *uap;
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(uap->s);
	error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_LISTEN),
	    &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
#ifdef MAC
		error = mac_socket_check_listen(td->td_ucred, so);
		if (error == 0)
#endif
			error = solisten(so, uap->backlog, td);
		fdrop(fp, td);
	}
	return(error);
}

/*
 * accept1(): common user-level entry point for accept(2), accept4(2) and
 * the 4.3BSD oaccept().  Copies the address length in, calls
 * kern_accept4(), and copies the peer address and length back out.
 * NOTE(review): at the `error == 0 && uname != NULL' test both conditions
 * always hold (both were checked above with early returns) — redundant
 * but harmless.
 */
static int
accept1(td, s, uname, anamelen, flags)
	struct thread *td;
	int s;
	struct sockaddr *uname;
	socklen_t *anamelen;
	int flags;
{
	struct sockaddr *name;
	socklen_t namelen;
	struct file *fp;
	int error;

	if (uname == NULL)
		return (kern_accept4(td, s, NULL, NULL, flags, NULL));

	error = copyin(anamelen, &namelen, sizeof (namelen));
	if (error != 0)
		return (error);

	error = kern_accept4(td, s, &name, &namelen, flags, &fp);

	if (error != 0)
		return (error);

	if (error == 0 && uname != NULL) {
#ifdef COMPAT_OLDSOCK
		if (flags & ACCEPT4_COMPAT)
			/* Overlay the old osockaddr family field. */
			((struct osockaddr *)name)->sa_family =
			    name->sa_family;
#endif
		error = copyout(name, uname, namelen);
	}
	if (error == 0)
		error = copyout(&namelen, anamelen,
		    sizeof(namelen));
	if (error != 0)
		/* Copyout failed; revoke the descriptor we just installed. */
		fdclose(td, fp, td->td_retval[0]);
	fdrop(fp, td);
	free(name, M_SONAME);
	return (error);
}

/*
 * Kernel-internal accept with historic accept(2) semantics: the new
 * socket inherits the listener's nonblocking state and signal owner.
 */
int
kern_accept(struct thread *td, int s, struct sockaddr **name,
    socklen_t *namelen, struct file **fp)
{
	return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
}

/*
 * Core of accept(2)/accept4(2): wait for a completed connection on the
 * listening socket `s', detach it from the completion queue, and install
 * it as a new descriptor.  On success the peer address is returned in
 * malloc'ed *name (caller frees, M_SONAME) and, if `fp' is non-NULL, a
 * held reference to the new file in *fp.
 */
int
kern_accept4(struct thread *td, int s, struct sockaddr **name,
    socklen_t *namelen, int flags, struct file **fp)
{
	struct file *headfp, *nfp = NULL;
	struct sockaddr *sa = NULL;
	struct socket *head, *so;
	cap_rights_t rights;
	u_int fflag;
	pid_t pgid;
	int error, fd, tmp;

	if (name != NULL)
		*name = NULL;

	AUDIT_ARG_FD(s);
	error = getsock_cap(td, s, cap_rights_init(&rights, CAP_ACCEPT),
	    &headfp, &fflag);
	if (error != 0)
		return (error);
	head = headfp->f_data;
	if ((head->so_options & SO_ACCEPTCONN) == 0) {
		/* Socket is not listening. */
		error = EINVAL;
		goto done;
	}
#ifdef MAC
	error = mac_socket_check_accept(td->td_ucred, head);
	if (error != 0)
		goto done;
#endif
	/* Allocate the descriptor before sleeping so failure is cheap. */
	error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0);
	if (error != 0)
		goto done;
	ACCEPT_LOCK();
	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
		ACCEPT_UNLOCK();
		error = EWOULDBLOCK;
		goto noconnection;
	}
	/* Sleep until a completed connection arrives or an error is set. */
	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
			head->so_error = ECONNABORTED;
			break;
		}
		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
		    "accept", 0);
		if (error != 0) {
			ACCEPT_UNLOCK();
			goto noconnection;
		}
	}
	if (head->so_error) {
		error = head->so_error;
		head->so_error = 0;
		ACCEPT_UNLOCK();
		goto noconnection;
	}
	so = TAILQ_FIRST(&head->so_comp);
	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));

	/*
	 * Before changing the flags on the socket, we have to bump the
	 * reference count.  Otherwise, if the protocol calls sofree(),
	 * the socket will be released due to a zero refcount.
	 */
	SOCK_LOCK(so);			/* soref() and so_state update */
	soref(so);			/* file descriptor reference */

	TAILQ_REMOVE(&head->so_comp, so, so_list);
	head->so_qlen--;
	if (flags & ACCEPT4_INHERIT)
		so->so_state |= (head->so_state & SS_NBIO);
	else
		so->so_state |= (flags & SOCK_NONBLOCK) ?
		    SS_NBIO : 0;
	so->so_qstate &= ~SQ_COMP;
	so->so_head = NULL;

	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();

	/* An extra reference on `nfp' has been held for us by falloc(). */
	td->td_retval[0] = fd;

	/* connection has been removed from the listen queue */
	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);

	if (flags & ACCEPT4_INHERIT) {
		/* Historic accept(2): inherit the listener's signal owner. */
		pgid = fgetown(&head->so_sigio);
		if (pgid != 0)
			fsetown(pgid, &so->so_sigio);
	} else {
		/* accept4(2): state comes from the flags argument only. */
		fflag &= ~(FNONBLOCK | FASYNC);
		if (flags & SOCK_NONBLOCK)
			fflag |= FNONBLOCK;
	}

	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
	/* Sync socket nonblocking/async state with file flags */
	tmp = fflag & FNONBLOCK;
	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
	tmp = fflag & FASYNC;
	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
	sa = 0;
	error = soaccept(so, &sa);
	if (error != 0)
		goto noconnection;
	if (sa == NULL) {
		/* Protocol supplied no address; report a zero length. */
		if (name)
			*namelen = 0;
		goto done;
	}
	AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
	if (name) {
		/* check sa_len before it is destroyed */
		if (*namelen > sa->sa_len)
			*namelen = sa->sa_len;
#ifdef KTRACE
		if (KTRPOINT(td, KTR_STRUCT))
			ktrsockaddr(sa);
#endif
		/* Transfer ownership of `sa' to the caller. */
		*name = sa;
		sa = NULL;
	}
noconnection:
	free(sa, M_SONAME);

	/*
	 * close the new descriptor, assuming someone hasn't ripped it
	 * out from under us.
	 */
	if (error != 0)
		fdclose(td, nfp, fd);

	/*
	 * Release explicitly held references before returning.  We return
	 * a reference on nfp to the caller on success if they request it.
	 */
done:
	if (fp != NULL) {
		if (error == 0) {
			*fp = nfp;
			nfp = NULL;
		} else
			*fp = NULL;
	}
	if (nfp != NULL)
		fdrop(nfp, td);
	fdrop(headfp, td);
	return (error);
}

/* accept(2): historic semantics — new fd inherits the listener's state. */
int
sys_accept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{

	return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
}

/* accept4(2): only SOCK_CLOEXEC and SOCK_NONBLOCK flags are accepted. */
int
sys_accept4(td, uap)
	struct thread *td;
	struct accept4_args *uap;
{

	if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return (EINVAL);

	return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
}

#ifdef COMPAT_OLDSOCK
/* 4.3BSD accept(): returns an osockaddr-style address. */
int
oaccept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{

	return (accept1(td, uap->s, uap->name, uap->anamelen,
	    ACCEPT4_INHERIT | ACCEPT4_COMPAT));
}
#endif /* COMPAT_OLDSOCK */

/* ARGSUSED */
/*
 * connect(2): copy the address in and connect relative to AT_FDCWD.
 */
int
sys_connect(td, uap)
	struct thread *td;
	struct connect_args /* {
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_connectat(td, AT_FDCWD, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}

/*
 * Common code for connect(2) and connectat(2).  Initiates the connection
 * and, on a blocking socket, sleeps until it completes or fails.  `sa' is
 * a kernel copy of the address, owned and freed by the caller.
 */
int
kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error, interrupted = 0;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
	error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_CONNECT),
	    &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
	if (so->so_state & SS_ISCONNECTING) {
		/* A connect is already in progress on this socket. */
		error = EALREADY;
		goto done1;
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(sa);
#endif
#ifdef MAC
	error = mac_socket_check_connect(td->td_ucred, so, sa);
	if (error != 0)
		goto
bad;
#endif
	if (dirfd == AT_FDCWD)
		error = soconnect(so, sa, td);
	else
		error = soconnectat(dirfd, so, sa, td);
	if (error != 0)
		goto bad;
	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
		/* Nonblocking socket: report connect-in-progress. */
		error = EINPROGRESS;
		goto done1;
	}
	SOCK_LOCK(so);
	/* Blocking socket: wait for the connection to complete or fail. */
	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
		    "connec", 0);
		if (error != 0) {
			if (error == EINTR || error == ERESTART)
				interrupted = 1;
			break;
		}
	}
	if (error == 0) {
		error = so->so_error;
		so->so_error = 0;
	}
	SOCK_UNLOCK(so);
bad:
	/*
	 * If the sleep was interrupted, leave SS_ISCONNECTING set so the
	 * connect may still complete asynchronously.
	 */
	if (!interrupted)
		so->so_state &= ~SS_ISCONNECTING;
	if (error == ERESTART)
		error = EINTR;
done1:
	fdrop(fp, td);
	return (error);
}

/* ARGSUSED */
/*
 * connectat(2): like connect(2), but AF_LOCAL pathnames are resolved
 * relative to the directory descriptor `fd'.
 */
int
sys_connectat(td, uap)
	struct thread *td;
	struct connectat_args /* {
		int	fd;
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_connectat(td, uap->fd, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}

/*
 * Core of socketpair(2): create two connected sockets and install them as
 * descriptors rsv[0] and rsv[1].  On failure every partially constructed
 * resource is unwound via the free1..free4 ladder below.
 */
int
kern_socketpair(struct thread *td, int domain, int type, int protocol,
    int *rsv)
{
	struct file *fp1, *fp2;
	struct socket *so1, *so2;
	int fd, error, oflag, fflag;

	AUDIT_ARG_SOCKET(domain, type, protocol);

	oflag = 0;
	fflag = 0;
	if ((type & SOCK_CLOEXEC) != 0) {
		type &= ~SOCK_CLOEXEC;
		oflag |= O_CLOEXEC;
	}
	if ((type & SOCK_NONBLOCK) != 0) {
		type &= ~SOCK_NONBLOCK;
		fflag |= FNONBLOCK;
	}
#ifdef MAC
	/* We might want to have a separate check for socket pairs. */
	error = mac_socket_check_create(td->td_ucred, domain, type,
	    protocol);
	if (error != 0)
		return (error);
#endif
	error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
	if (error != 0)
		return (error);
	error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
	if (error != 0)
		goto free1;
	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
	error = falloc(td, &fp1, &fd, oflag);
	if (error != 0)
		goto free2;
	rsv[0] = fd;
	fp1->f_data = so1;	/* so1 already has ref count */
	error = falloc(td, &fp2, &fd, oflag);
	if (error != 0)
		goto free3;
	fp2->f_data = so2;	/* so2 already has ref count */
	rsv[1] = fd;
	error = soconnect2(so1, so2);
	if (error != 0)
		goto free4;
	if (type == SOCK_DGRAM) {
		/*
		 * Datagram socket connection is asymmetric.
		 */
		error = soconnect2(so2, so1);
		if (error != 0)
			goto free4;
	}
	finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
	    &socketops);
	finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
	    &socketops);
	if ((fflag & FNONBLOCK) != 0) {
		(void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
		(void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
	}
	fdrop(fp1, td);
	fdrop(fp2, td);
	return (0);
free4:
	fdclose(td, fp2, rsv[1]);
	fdrop(fp2, td);
free3:
	fdclose(td, fp1, rsv[0]);
	fdrop(fp1, td);
free2:
	if (so2 != NULL)
		(void)soclose(so2);
free1:
	if (so1 != NULL)
		(void)soclose(so1);
	return (error);
}

/*
 * socketpair(2): thin wrapper that copies the two descriptors out and
 * closes them again if the copyout fails.
 */
int
sys_socketpair(struct thread *td, struct socketpair_args *uap)
{
	int error, sv[2];

	error = kern_socketpair(td, uap->domain, uap->type,
	    uap->protocol, sv);
	if (error != 0)
		return (error);
	error = copyout(sv, uap->rsv, 2 * sizeof(int));
	if (error != 0) {
		/* Copyout failed; the descriptors are useless to userland. */
		(void)kern_close(td, sv[0]);
		(void)kern_close(td, sv[1]);
	}
	return (error);
}

/*
 * Userland front end for the send*(2) family: copy in the destination
 * address and control data, then hand off to kern_sendit().
 */
static int
sendit(td, s, mp, flags)
	struct thread *td;
	int s;
	struct msghdr *mp;
	int flags;
{
	struct mbuf *control;
	struct sockaddr *to;
	int error;

#ifdef CAPABILITY_MODE
	/* In capability mode an explicit destination address is forbidden. */
	if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
		return (ECAPMODE);
#endif

	if (mp->msg_name != NULL) {
		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
		if (error != 0) {
			to = NULL;
			goto bad;
		}
		/* msg_name now points at the kernel copy. */
		mp->msg_name = to;
	} else {
		to = NULL;
	}

	if (mp->msg_control) {
		if (mp->msg_controllen < sizeof(struct cmsghdr)
#ifdef COMPAT_OLDSOCK
		    && mp->msg_flags != MSG_COMPAT
#endif
		) {
			error = EINVAL;
			goto bad;
		}
		error = sockargs(&control, mp->msg_control,
		    mp->msg_controllen, MT_CONTROL);
		if (error != 0)
			goto bad;
#ifdef COMPAT_OLDSOCK
		if (mp->msg_flags == MSG_COMPAT) {
			/* Old access rights: prepend a SCM_RIGHTS cmsghdr. */
			struct cmsghdr *cm;

			M_PREPEND(control, sizeof(*cm), M_WAITOK);
			cm = mtod(control, struct cmsghdr *);
			cm->cmsg_len = control->m_len;
			cm->cmsg_level = SOL_SOCKET;
			cm->cmsg_type = SCM_RIGHTS;
		}
#endif
	} else {
		control = NULL;
	}

	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);

bad:
	free(to, M_SONAME);
	return (error);
}

/*
 * Kernel-internal sendmsg: build a uio over the iovec, call sosend(), and
 * translate short writes / signals into the historic errno semantics.
 * Consumes `control' (sosend frees it on all paths).
 */
int
kern_sendit(td, s, mp, flags, control, segflg)
	struct thread *td;
	int s;
	struct msghdr *mp;
	int flags;
	struct mbuf *control;
	enum uio_seg segflg;
{
	struct file *fp;
	struct uio auio;
	struct iovec *iov;
	struct socket *so;
	cap_rights_t rights;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	ssize_t len;
	int i, error;

	AUDIT_ARG_FD(s);
	cap_rights_init(&rights, CAP_SEND);
	if (mp->msg_name != NULL) {
		/* An explicit destination implies an implicit connect. */
		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
		cap_rights_set(&rights, CAP_CONNECT);
	}
	error = getsock_cap(td, s, &rights, &fp, NULL);
	if (error != 0)
		return (error);
	so = (struct socket *)fp->f_data;

#ifdef KTRACE
	if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(mp->msg_name);
#endif
#ifdef MAC
	if (mp->msg_name != NULL) {
		error = mac_socket_check_connect(td->td_ucred, so,
		    mp->msg_name);
		if (error != 0)
			goto bad;
	}
	error = mac_socket_check_send(td->td_ucred, so);
	if (error != 0)
		goto bad;
#endif

	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = segflg;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	/* Sum the iovec lengths, rejecting ssize_t overflow. */
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			error = EINVAL;
			goto bad;
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	len = auio.uio_resid;
	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
	if (error != 0) {
		/* A partial write followed by a signal still succeeds. */
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Generation of SIGPIPE can be controlled per socket */
		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
		    !(flags & MSG_NOSIGNAL)) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	if (error == 0)
		td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = td->td_retval[0];
		ktrgenio(s, UIO_WRITE, ktruio, error);
	}
#endif
bad:
	fdrop(fp, td);
	return (error);
}

/*
 * sendto(2): send a single buffer, optionally to an explicit address.
 * Builds a msghdr on the stack and hands off to sendit().
 */
int
sys_sendto(td, uap)
	struct thread *td;
	struct sendto_args /* {
		int	s;
		caddr_t	buf;
		size_t	len;
		int	flags;
		caddr_t	to;
		int	tolen;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;

	msg.msg_name = uap->to;
	msg.msg_namelen = uap->tolen;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	msg.msg_control = 0;
#ifdef COMPAT_OLDSOCK
	msg.msg_flags = 0;
#endif
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	return (sendit(td, uap->s, &msg, uap->flags));
}

#ifdef COMPAT_OLDSOCK
/* 4.3BSD send(): sendto(2) without a destination address. */
int
osend(td, uap)
	struct thread *td;
	struct osend_args /* {
		int	s;
		caddr_t	buf;
		int	len;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;

	msg.msg_name = 0;
	msg.msg_namelen = 0;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	msg.msg_control = 0;
	msg.msg_flags = 0;
	return (sendit(td, uap->s, &msg, uap->flags));
}

/*
 * 4.3BSD sendmsg(): copies in an omsghdr and marks the message
 * MSG_COMPAT so sendit() applies the old access-rights layout.
 */
int
osendmsg(td, uap)
	struct thread *td;
	struct osendmsg_args /* {
		int	s;
		caddr_t	msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *iov;
	int error;

	/* The omsghdr overlays the modern msghdr minus msg_flags. */
	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_iov = iov;
	msg.msg_flags = MSG_COMPAT;
	error = sendit(td, uap->s, &msg, uap->flags);
	free(iov, M_IOV);
	return (error);
}
#endif

/*
 * sendmsg(2): copy in the msghdr and its iovec, then hand off to sendit().
 */
int
sys_sendmsg(td, uap)
	struct thread *td;
	struct sendmsg_args /* {
		int	s;
		caddr_t	msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (msg));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_iov = iov;
#ifdef COMPAT_OLDSOCK
	msg.msg_flags = 0;
#endif
	error = sendit(td, uap->s, &msg, uap->flags);
	free(iov, M_IOV);
	return (error);
}

/*
 * Kernel-internal recvmsg: build a uio over the iovec, call soreceive(),
 * and copy the source address and control data back to the caller.  If
 * `controlp' is non-NULL the raw control mbuf chain is handed to the
 * caller instead of being copied out (caller then owns and frees it).
 */
int
kern_recvit(td, s, mp, fromseg, controlp)
	struct thread *td;
	int s;
	struct msghdr *mp;
	enum uio_seg fromseg;
	struct mbuf **controlp;
{
	struct uio auio;
	struct iovec *iov;
	struct mbuf *m, *control = NULL;
	caddr_t ctlbuf;
	struct file *fp;
	struct socket *so;
	struct sockaddr *fromsa = NULL;
	cap_rights_t rights;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	ssize_t len;
	int error, i;

	if (controlp != NULL)
		*controlp = NULL;

	AUDIT_ARG_FD(s);
	error = getsock_cap(td, s, cap_rights_init(&rights, CAP_RECV),
	    &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;

#ifdef MAC
	error = mac_socket_check_receive(td->td_ucred, so);
	if (error != 0) {
		fdrop(fp, td);
		return (error);
	}
#endif

	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	/* Sum the iovec lengths, rejecting ssize_t overflow. */
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			fdrop(fp, td);
			return (EINVAL);
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	len = auio.uio_resid;
	error = soreceive(so, &fromsa, &auio, NULL,
	    (mp->msg_control || controlp) ? &control : NULL,
	    &mp->msg_flags);
	if (error != 0) {
		/* A partial read followed by a signal still succeeds. */
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	if (fromsa != NULL)
		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = len - auio.uio_resid;
		ktrgenio(s, UIO_READ, ktruio, error);
	}
#endif
	if (error != 0)
		goto out;
	td->td_retval[0] = len - auio.uio_resid;
	if (mp->msg_name) {
		len = mp->msg_namelen;
		if (len <= 0 || fromsa == NULL)
			len = 0;
		else {
			/* save sa_len before it is destroyed by MSG_COMPAT */
			len = MIN(len, fromsa->sa_len);
#ifdef COMPAT_OLDSOCK
			if (mp->msg_flags & MSG_COMPAT)
				/* Overlay the old osockaddr family field. */
				((struct osockaddr *)fromsa)->sa_family =
				    fromsa->sa_family;
#endif
			if (fromseg == UIO_USERSPACE) {
				error = copyout(fromsa, mp->msg_name,
				    (unsigned)len);
				if (error != 0)
					goto out;
			} else
				bcopy(fromsa, mp->msg_name, len);
		}
		mp->msg_namelen = len;
	}
	if (mp->msg_control && controlp == NULL) {
#ifdef COMPAT_OLDSOCK
		/*
		 * We assume that old recvmsg calls won't receive access
		 * rights and other control info, esp. as control info
		 * is always optional and those options didn't exist in 4.3.
		 * If we receive rights, trim the cmsghdr; anything else
		 * is tossed.
		 */
		if (control && mp->msg_flags & MSG_COMPAT) {
			if (mtod(control, struct cmsghdr *)->cmsg_level !=
			    SOL_SOCKET ||
			    mtod(control, struct cmsghdr *)->cmsg_type !=
			    SCM_RIGHTS) {
				mp->msg_controllen = 0;
				goto out;
			}
			control->m_len -= sizeof (struct cmsghdr);
			control->m_data += sizeof (struct cmsghdr);
		}
#endif
		/* Copy the control mbuf chain out, truncating if needed. */
		len = mp->msg_controllen;
		m = control;
		mp->msg_controllen = 0;
		ctlbuf = mp->msg_control;

		while (m && len > 0) {
			unsigned int tocopy;

			if (len >= m->m_len)
				tocopy = m->m_len;
			else {
				/* User buffer is too small: flag truncation. */
				mp->msg_flags |= MSG_CTRUNC;
				tocopy = len;
			}

			if ((error = copyout(mtod(m, caddr_t),
			    ctlbuf, tocopy)) != 0)
				goto out;

			ctlbuf += tocopy;
			len -= tocopy;
			m = m->m_next;
		}
		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
	}
out:
	fdrop(fp, td);
#ifdef KTRACE
	if (fromsa && KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(fromsa);
#endif
	free(fromsa, M_SONAME);

	if (error == 0 && controlp != NULL)
		*controlp = control;
	else if (control)
		m_freem(control);

	return (error);
}

/*
 * Userland front end for the recv*(2) family: call kern_recvit() and
 * copy the updated address length back out if requested.
 */
static int
recvit(td, s, mp, namelenp)
	struct thread *td;
	int s;
	struct msghdr *mp;
	void *namelenp;
{
	int error;

	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
	if (error != 0)
		return (error);
	if (namelenp != NULL) {
		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
#ifdef COMPAT_OLDSOCK
		if (mp->msg_flags & MSG_COMPAT)
			error = 0;	/* old recvfrom didn't check */
#endif
	}
	return (error);
}

/*
 * recvfrom(2): receive a single buffer, optionally recording the source
 * address.  Builds a msghdr on the stack and hands off to recvit().
 */
int
sys_recvfrom(td, uap)
	struct thread *td;
	struct recvfrom_args /* {
		int	s;
		caddr_t	buf;
		size_t	len;
		int	flags;
		struct sockaddr * __restrict	from;
		socklen_t * __restrict fromlenaddr;
	} */ *uap;
{
1227 struct msghdr msg; 1228 struct iovec aiov; 1229 int error; 1230 1231 if (uap->fromlenaddr) { 1232 error = copyin(uap->fromlenaddr, 1233 &msg.msg_namelen, sizeof (msg.msg_namelen)); 1234 if (error != 0) 1235 goto done2; 1236 } else { 1237 msg.msg_namelen = 0; 1238 } 1239 msg.msg_name = uap->from; 1240 msg.msg_iov = &aiov; 1241 msg.msg_iovlen = 1; 1242 aiov.iov_base = uap->buf; 1243 aiov.iov_len = uap->len; 1244 msg.msg_control = 0; 1245 msg.msg_flags = uap->flags; 1246 error = recvit(td, uap->s, &msg, uap->fromlenaddr); 1247 done2: 1248 return (error); 1249 } 1250 1251 #ifdef COMPAT_OLDSOCK 1252 int 1253 orecvfrom(td, uap) 1254 struct thread *td; 1255 struct recvfrom_args *uap; 1256 { 1257 1258 uap->flags |= MSG_COMPAT; 1259 return (sys_recvfrom(td, uap)); 1260 } 1261 #endif 1262 1263 #ifdef COMPAT_OLDSOCK 1264 int 1265 orecv(td, uap) 1266 struct thread *td; 1267 struct orecv_args /* { 1268 int s; 1269 caddr_t buf; 1270 int len; 1271 int flags; 1272 } */ *uap; 1273 { 1274 struct msghdr msg; 1275 struct iovec aiov; 1276 1277 msg.msg_name = 0; 1278 msg.msg_namelen = 0; 1279 msg.msg_iov = &aiov; 1280 msg.msg_iovlen = 1; 1281 aiov.iov_base = uap->buf; 1282 aiov.iov_len = uap->len; 1283 msg.msg_control = 0; 1284 msg.msg_flags = uap->flags; 1285 return (recvit(td, uap->s, &msg, NULL)); 1286 } 1287 1288 /* 1289 * Old recvmsg. This code takes advantage of the fact that the old msghdr 1290 * overlays the new one, missing only the flags, and with the (old) access 1291 * rights where the control fields are now. 
 */
int
orecvmsg(td, uap)
	struct thread *td;
	struct orecvmsg_args /* {
		int	s;
		struct omsghdr *msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_flags = uap->flags | MSG_COMPAT;
	msg.msg_iov = iov;
	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
	/* Old ABI reports returned access-rights size in msg_accrightslen. */
	if (msg.msg_controllen && error == 0)
		error = copyout(&msg.msg_controllen,
		    &uap->msg->msg_accrightslen, sizeof (int));
	free(iov, M_IOV);
	return (error);
}
#endif

/*
 * recvmsg(2): copy in the user msghdr and iovec, receive, then copy the
 * updated msghdr (name/control lengths and flags) back out to the user.
 */
int
sys_recvmsg(td, uap)
	struct thread *td;
	struct recvmsg_args /* {
		int	s;
		struct msghdr *msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *uiov, *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (msg));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_flags = uap->flags;
#ifdef COMPAT_OLDSOCK
	/* MSG_COMPAT is kernel-internal; never accept it from user flags. */
	msg.msg_flags &= ~MSG_COMPAT;
#endif
	uiov = msg.msg_iov;
	msg.msg_iov = iov;
	error = recvit(td, uap->s, &msg, NULL);
	if (error == 0) {
		/* Restore the user's iov pointer before copying out. */
		msg.msg_iov = uiov;
		error = copyout(&msg, uap->msg, sizeof(msg));
	}
	free(iov, M_IOV);
	return (error);
}

/*
 * shutdown(2).
 */
/* ARGSUSED */
int
sys_shutdown(td, uap)
	struct thread *td;
	struct shutdown_args /* {
		int	s;
		int	how;
	} */ *uap;
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(uap->s);
	error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_SHUTDOWN),
	    &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
		error = soshutdown(so, uap->how);
		/*
		 * Previous versions did not return ENOTCONN, but 0 in
		 * case the socket was not connected. Some important
		 * programs like syslogd up to r279016, 2015-02-19,
		 * still depend on this behavior.
		 */
		if (error == ENOTCONN &&
		    td->td_proc->p_osrel < P_OSREL_SHUTDOWN_ENOTCONN)
			error = 0;
		fdrop(fp, td);
	}
	return (error);
}

/*
 * setsockopt(2): thin wrapper around kern_setsockopt() with a user-space
 * option buffer.
 */
/* ARGSUSED */
int
sys_setsockopt(td, uap)
	struct thread *td;
	struct setsockopt_args /* {
		int	s;
		int	level;
		int	name;
		caddr_t	val;
		int	valsize;
	} */ *uap;
{

	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
	    uap->val, UIO_USERSPACE, uap->valsize));
}

/*
 * Kernel version of setsockopt(2).  'val' may point into user or kernel
 * space, as selected by 'valseg'.
 */
int
kern_setsockopt(td, s, level, name, val, valseg, valsize)
	struct thread *td;
	int s;
	int level;
	int name;
	void *val;
	enum uio_seg valseg;
	socklen_t valsize;
{
	struct socket *so;
	struct file *fp;
	struct sockopt sopt;
	cap_rights_t rights;
	int error;

	if (val == NULL && valsize != 0)
		return (EFAULT);
	/* Reject sizes that would be negative as an int. */
	if ((int)valsize < 0)
		return (EINVAL);

	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = level;
	sopt.sopt_name = name;
	sopt.sopt_val = val;
	sopt.sopt_valsize = valsize;
	switch (valseg) {
	case UIO_USERSPACE:
		sopt.sopt_td = td;
		break;
	case UIO_SYSSPACE:
		/* NULL sopt_td marks sopt_val as a kernel address. */
		sopt.sopt_td = NULL;
		break;
	default:
		panic("kern_setsockopt called with bad valseg");
	}

	AUDIT_ARG_FD(s);
	error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SETSOCKOPT),
	    &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
		error = sosetopt(so, &sopt);
		fdrop(fp, td);
	}
	return(error);
}

/*
 * getsockopt(2): copy in the caller's buffer size, fetch the option, and
 * copy the (possibly shortened) size back out to *avalsize.
 */
/* ARGSUSED */
int
sys_getsockopt(td, uap)
	struct thread *td;
	struct getsockopt_args /* {
		int	s;
		int	level;
		int	name;
		void * __restrict	val;
		socklen_t * __restrict avalsize;
	} */ *uap;
{
	socklen_t valsize;
	int error;

	if (uap->val) {
		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
		if (error != 0)
			return (error);
	}
	/* If val is NULL, kern_getsockopt() zeroes valsize before use. */

	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
	    uap->val, UIO_USERSPACE, &valsize);

	if (error == 0)
		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
	return (error);
}

/*
 * Kernel version of getsockopt(2).
 * optval can be a userland or kernel address, as selected by 'valseg';
 * optlen is always a kernel pointer.
 */
int
kern_getsockopt(td, s, level, name, val, valseg, valsize)
	struct thread *td;
	int s;
	int level;
	int name;
	void *val;
	enum uio_seg valseg;
	socklen_t *valsize;
{
	struct socket *so;
	struct file *fp;
	struct sockopt sopt;
	cap_rights_t rights;
	int error;

	if (val == NULL)
		*valsize = 0;
	if ((int)*valsize < 0)
		return (EINVAL);

	sopt.sopt_dir = SOPT_GET;
	sopt.sopt_level = level;
	sopt.sopt_name = name;
	sopt.sopt_val = val;
	sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
	switch (valseg) {
	case UIO_USERSPACE:
		sopt.sopt_td = td;
		break;
	case UIO_SYSSPACE:
		/* NULL sopt_td marks sopt_val as a kernel address. */
		sopt.sopt_td = NULL;
		break;
	default:
		panic("kern_getsockopt called with bad valseg");
	}

	AUDIT_ARG_FD(s);
	error = getsock_cap(td, s, cap_rights_init(&rights, CAP_GETSOCKOPT),
	    &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
		error = sogetopt(so, &sopt);
		/* Report the actual option length back to the caller. */
		*valsize = sopt.sopt_valsize;
		fdrop(fp, td);
	}
	return (error);
}

/*
 * getsockname1() - Get socket name.
 */
/* ARGSUSED */
static int
getsockname1(td, uap, compat)
	struct thread *td;
	struct getsockname_args /* {
		int	fdes;
		struct sockaddr * __restrict asa;
		socklen_t * __restrict alen;
	} */ *uap;
	int compat;
{
	struct sockaddr *sa;
	socklen_t len;
	int error;

	error = copyin(uap->alen, &len, sizeof(len));
	if (error != 0)
		return (error);

	error = kern_getsockname(td, uap->fdes, &sa, &len);
	if (error != 0)
		return (error);

	if (len != 0) {
#ifdef COMPAT_OLDSOCK
		/* Convert to the old-style (osockaddr) family field. */
		if (compat)
			((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
		error = copyout(sa, uap->asa, (u_int)len);
	}
	free(sa, M_SONAME);
	if (error == 0)
		error = copyout(&len, uap->alen, sizeof(len));
	return (error);
}

/*
 * Kernel variant of getsockname(2): returns the local address in *sa
 * (freed with M_SONAME by the caller) and clamps *alen to the actual
 * address length.  On error *sa is left NULL.
 */
int
kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
    socklen_t *alen)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	socklen_t len;
	int error;

	AUDIT_ARG_FD(fd);
	error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETSOCKNAME),
	    &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
	*sa = NULL;
	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
	CURVNET_RESTORE();
	if (error != 0)
		goto bad;
	if (*sa == NULL)
		len = 0;
	else
		len = MIN(*alen, (*sa)->sa_len);
	*alen = len;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(*sa);
#endif
bad:
	fdrop(fp, td);
	if (error != 0 && *sa != NULL) {
		free(*sa, M_SONAME);
		*sa = NULL;
	}
	return (error);
}

int
sys_getsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{

	return (getsockname1(td, uap, 0));
}

#ifdef COMPAT_OLDSOCK
int
ogetsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{

	return (getsockname1(td, uap, 1));
}
#endif /* COMPAT_OLDSOCK */

/*
 * getpeername1() - Get name of peer for connected socket.
 */
/* ARGSUSED */
static int
getpeername1(td, uap, compat)
	struct thread *td;
	struct getpeername_args /* {
		int	fdes;
		struct sockaddr * __restrict	asa;
		socklen_t * __restrict	alen;
	} */ *uap;
	int compat;
{
	struct sockaddr *sa;
	socklen_t len;
	int error;

	error = copyin(uap->alen, &len, sizeof (len));
	if (error != 0)
		return (error);

	error = kern_getpeername(td, uap->fdes, &sa, &len);
	if (error != 0)
		return (error);

	if (len != 0) {
#ifdef COMPAT_OLDSOCK
		/* Convert to the old-style (osockaddr) family field. */
		if (compat)
			((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
		error = copyout(sa, uap->asa, (u_int)len);
	}
	free(sa, M_SONAME);
	if (error == 0)
		error = copyout(&len, uap->alen, sizeof(len));
	return (error);
}

/*
 * Kernel variant of getpeername(2): like kern_getsockname(), but
 * requires a connected (or confirming) socket and returns the peer's
 * address.
 */
int
kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
    socklen_t *alen)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	socklen_t len;
	int error;

	AUDIT_ARG_FD(fd);
	error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETPEERNAME),
	    &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
		error = ENOTCONN;
		goto done;
	}
	*sa = NULL;
	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
	CURVNET_RESTORE();
	if (error != 0)
		goto bad;
	if (*sa == NULL)
		len = 0;
	else
		len = MIN(*alen, (*sa)->sa_len);
	*alen = len;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(*sa);
#endif
bad:
	if (error != 0 && *sa != NULL) {
		free(*sa, M_SONAME);
		*sa = NULL;
	}
done:
	fdrop(fp, td);
	return (error);
}

int
sys_getpeername(td, uap)
	struct thread *td;
	struct getpeername_args *uap;
{

	return (getpeername1(td, uap, 0));
}

#ifdef COMPAT_OLDSOCK
int
ogetpeername(td, uap)
	struct thread *td;
	struct ogetpeername_args *uap;
{

	/* XXX uap should have type `getpeername_args *' to begin with. */
	return (getpeername1(td, (struct getpeername_args *)uap, 1));
}
#endif /* COMPAT_OLDSOCK */

/*
 * Copy a user buffer of at most MCLBYTES into a single mbuf; for
 * MT_SONAME the buffer is treated as a sockaddr and its sa_len is
 * forced to the copied length.  On success *mp holds the new mbuf.
 */
int
sockargs(mp, buf, buflen, type)
	struct mbuf **mp;
	caddr_t buf;
	int buflen, type;
{
	struct sockaddr *sa;
	struct mbuf *m;
	int error;

	if (buflen > MLEN) {
#ifdef COMPAT_OLDSOCK
		if (type == MT_SONAME && buflen <= 112)
			buflen = MLEN;		/* unix domain compat. hack */
		else
#endif
			if (buflen > MCLBYTES)
				return (EINVAL);
	}
	m = m_get2(buflen, M_WAITOK, type, 0);
	m->m_len = buflen;
	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
	if (error != 0)
		(void) m_free(m);
	else {
		*mp = m;
		if (type == MT_SONAME) {
			sa = mtod(m, struct sockaddr *);

#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
			/* Old sockaddr had the family in the length byte. */
			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
				sa->sa_family = sa->sa_len;
#endif
			sa->sa_len = buflen;
		}
	}
	return (error);
}

/*
 * Copy a user sockaddr of 'len' bytes into a freshly allocated
 * M_SONAME buffer (caller frees); sa_len is forced to 'len'.
 */
int
getsockaddr(namp, uaddr, len)
	struct sockaddr **namp;
	caddr_t uaddr;
	size_t len;
{
	struct sockaddr *sa;
	int error;

	if (len > SOCK_MAXADDRLEN)
		return (ENAMETOOLONG);
	if (len < offsetof(struct sockaddr, sa_data[0]))
		return (EINVAL);
	sa = malloc(len, M_SONAME, M_WAITOK);
	error = copyin(uaddr, sa, len);
	if (error != 0) {
		free(sa, M_SONAME);
	} else {
#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
		/* Old sockaddr had the family in the length byte. */
		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
			sa->sa_family = sa->sa_len;
#endif
		sa->sa_len = len;
		*namp = sa;
	}
	return (error);
}

/*
 * Completion tracking for SF_SYNC sendfile(2): 'count' outstanding
 * buffers are accounted under 'mtx'; the release that drops it to zero
 * signals 'cv'.
 */
struct sendfile_sync {
	struct mtx	mtx;
	struct cv	cv;
	unsigned	count;
};

/*
 * Add more references to a vm_page + sf_buf + sendfile_sync.
 */
void
sf_ext_ref(void *arg1, void *arg2)
{
	struct sf_buf *sf = arg1;
	struct sendfile_sync *sfs = arg2;
	vm_page_t pg = sf_buf_page(sf);

	sf_buf_ref(sf);

	vm_page_lock(pg);
	vm_page_wire(pg);
	vm_page_unlock(pg);

	if (sfs != NULL) {
		mtx_lock(&sfs->mtx);
		KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
		sfs->count++;
		mtx_unlock(&sfs->mtx);
	}
}

/*
 * Detach mapped page and release resources back to the system.
 */
void
sf_ext_free(void *arg1, void *arg2)
{
	struct sf_buf *sf = arg1;
	struct sendfile_sync *sfs = arg2;
	vm_page_t pg = sf_buf_page(sf);

	sf_buf_free(sf);

	vm_page_lock(pg);
	/*
	 * Check for the object going away on us.  This can
	 * happen since we don't hold a reference to it.
	 * If so, we're responsible for freeing the page.
	 */
	if (vm_page_unwire(pg, PQ_INACTIVE) && pg->object == NULL)
		vm_page_free(pg);
	vm_page_unlock(pg);

	if (sfs != NULL) {
		mtx_lock(&sfs->mtx);
		KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
		if (--sfs->count == 0)
			cv_signal(&sfs->cv);
		mtx_unlock(&sfs->mtx);
	}
}

/*
 * Same as above, but forces the page to be detached from the object
 * and go into free pool.
 */
void
sf_ext_free_nocache(void *arg1, void *arg2)
{
	struct sf_buf *sf = arg1;
	struct sendfile_sync *sfs = arg2;
	vm_page_t pg = sf_buf_page(sf);

	sf_buf_free(sf);

	vm_page_lock(pg);
	if (vm_page_unwire(pg, PQ_NONE)) {
		vm_object_t obj;

		/* Try to free the page, but only if it is cheap to. */
		if ((obj = pg->object) == NULL)
			vm_page_free(pg);
		else if (!vm_page_xbusied(pg) && VM_OBJECT_TRYWLOCK(obj)) {
			vm_page_free(pg);
			VM_OBJECT_WUNLOCK(obj);
		} else
			/* Too expensive to free now; let pagedaemon do it. */
			vm_page_deactivate(pg);
	}
	vm_page_unlock(pg);

	if (sfs != NULL) {
		mtx_lock(&sfs->mtx);
		KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
		if (--sfs->count == 0)
			cv_signal(&sfs->cv);
		mtx_unlock(&sfs->mtx);
	}
}

/*
 * sendfile(2)
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Send a file specified by 'fd' and starting at 'offset' to a socket
 * specified by 's'.  Send only 'nbytes' of the file or until EOF if nbytes ==
 * 0.  Optionally add a header and/or trailer to the socket output.  If
 * specified, write the total number of bytes sent into *sbytes.
 */
int
sys_sendfile(struct thread *td, struct sendfile_args *uap)
{

	return (do_sendfile(td, uap, 0));
}

/*
 * Common sendfile(2) entry: copy in the optional header/trailer
 * descriptors, check capability rights on the file descriptor, and
 * dispatch to fo_sendfile().  'compat' selects the pre-5.0 nbytes
 * accounting (SFK_COMPAT).
 */
static int
do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
{
	struct sf_hdtr hdtr;
	struct uio *hdr_uio, *trl_uio;
	struct file *fp;
	cap_rights_t rights;
	off_t sbytes;
	int error;

	/*
	 * File offset must be positive.  If it goes beyond EOF
	 * we send only the header/trailer and no payload data.
	 */
	if (uap->offset < 0)
		return (EINVAL);

	hdr_uio = trl_uio = NULL;

	if (uap->hdtr != NULL) {
		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
		if (error != 0)
			goto out;
		if (hdtr.headers != NULL) {
			error = copyinuio(hdtr.headers, hdtr.hdr_cnt,
			    &hdr_uio);
			if (error != 0)
				goto out;
		}
		if (hdtr.trailers != NULL) {
			error = copyinuio(hdtr.trailers, hdtr.trl_cnt,
			    &trl_uio);
			if (error != 0)
				goto out;
		}
	}

	AUDIT_ARG_FD(uap->fd);

	/*
	 * sendfile(2) can start at any offset within a file so we require
	 * CAP_READ+CAP_SEEK = CAP_PREAD.
	 */
	if ((error = fget_read(td, uap->fd,
	    cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) {
		goto out;
	}

	error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset,
	    uap->nbytes, &sbytes, uap->flags, compat ? SFK_COMPAT : 0, td);
	fdrop(fp, td);

	/* NOTE(review): copyout result deliberately(?) ignored — verify. */
	if (uap->sbytes != NULL)
		copyout(&sbytes, uap->sbytes, sizeof(off_t));

out:
	free(hdr_uio, M_IOV);
	free(trl_uio, M_IOV);
	return (error);
}

#ifdef COMPAT_FREEBSD4
/*
 * Pre-5.0 sendfile(2): same arguments, but nbytes also covered the
 * headers; flagged to do_sendfile() via 'compat'.
 */
int
freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
{
	struct sendfile_args args;

	args.fd = uap->fd;
	args.s = uap->s;
	args.offset = uap->offset;
	args.nbytes = uap->nbytes;
	args.hdtr = uap->hdtr;
	args.sbytes = uap->sbytes;
	args.flags = uap->flags;

	return (do_sendfile(td, &args, 1));
}
#endif /* COMPAT_FREEBSD4 */

/*
 * How much data to put into page i of n.
 * Only first and last pages are special.
 */
static inline off_t
xfsize(int i, int n, off_t off, off_t len)
{

	/* First page: from 'off' to the end of the page, capped by len. */
	if (i == 0)
		return (omin(PAGE_SIZE - (off & PAGE_MASK), len));

	/* Last page: the tail fragment, if the range is not page-aligned. */
	if (i == n - 1 && ((off + len) & PAGE_MASK) > 0)
		return ((off + len) & PAGE_MASK);

	/* Interior pages are always full. */
	return (PAGE_SIZE);
}

/*
 * Offset within object for i page.
 */
static inline vm_offset_t
vmoff(int i, off_t off)
{

	/* Page 0 starts at the (possibly unaligned) request offset. */
	if (i == 0)
		return ((vm_offset_t)off);

	return (trunc_page(off + i * PAGE_SIZE));
}

/*
 * Pretend as if we don't have enough space, subtract xfsize() of
 * all pages that failed.
 */
static inline void
fixspace(int old, int new, off_t off, int *space)
{

	KASSERT(old > new, ("%s: old %d new %d", __func__, old, new));

	/* Subtract last one. */
	*space -= xfsize(old - 1, old, off, *space);
	old--;

	if (new == old)
		/* There was only one page. */
		return;

	/* Subtract first one. */
	if (new == 0) {
		*space -= xfsize(0, old, off, *space);
		new++;
	}

	/* Rest of pages are full sized. */
	*space -= (old - new) * PAGE_SIZE;

	KASSERT(*space >= 0, ("%s: space went backwards", __func__));
}

/*
 * Structure describing a single sendfile(2) I/O, which may consist of
 * several underlying pager I/Os.
 *
 * The syscall context allocates the structure and initializes 'nios'
 * to 1.  As sendfile_swapin() runs through pages and starts asynchronous
 * paging operations, it increments 'nios'.
 *
 * Every I/O completion calls sf_iodone(), which decrements the 'nios', and
 * the syscall also calls sf_iodone() after allocating all mbufs, linking them
 * and sending to socket.  Whoever reaches zero 'nios' is responsible to
 * call pru_ready on the socket, to notify it of readyness of the data.
 */
struct sf_io {
	volatile u_int	nios;		/* outstanding pager I/Os + 1 */
	u_int		error;		/* last pager I/O error, if any */
	int		npages;
	struct file	*sock_fp;
	struct mbuf	*m;		/* head of not-ready mbuf chain */
	vm_page_t	pa[];		/* pages backing this I/O */
};

/*
 * Pager I/O completion; the final release of 'nios' either marks the
 * mbuf chain ready on the socket or tears the socket down on error.
 */
static void
sf_iodone(void *arg, vm_page_t *pg, int count, int error)
{
	struct sf_io *sfio = arg;
	struct socket *so;

	for (int i = 0; i < count; i++)
		vm_page_xunbusy(pg[i]);

	if (error)
		sfio->error = error;

	if (!refcount_release(&sfio->nios))
		return;

	so = sfio->sock_fp->f_data;

	if (sfio->error) {
		struct mbuf *m;

		/*
		 * I/O operation failed.  The state of data in the socket
		 * is now inconsistent, and all what we can do is to tear
		 * it down. Protocol abort method would tear down protocol
		 * state, free all ready mbufs and detach not ready ones.
		 * We will free the mbufs corresponding to this I/O manually.
		 *
		 * The socket would be marked with EIO and made available
		 * for read, so that application receives EIO on next
		 * syscall and eventually closes the socket.
		 */
		so->so_proto->pr_usrreqs->pru_abort(so);
		so->so_error = EIO;

		m = sfio->m;
		for (int i = 0; i < sfio->npages; i++)
			m = m_free(m);
	} else {
		CURVNET_SET(so->so_vnet);
		(void )(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m,
		    sfio->npages);
		CURVNET_RESTORE();
	}

	/* XXXGL: curthread */
	fdrop(sfio->sock_fp, curthread);
	free(sfio, M_TEMP);
}

/*
 * Iterate through pages vector and request paging for non-valid pages.
 */
static int
sendfile_swapin(vm_object_t obj, struct sf_io *sfio, off_t off, off_t len,
    int npages, int rhpages, int flags)
{
	vm_page_t *pa = sfio->pa;
	int nios;

	nios = 0;
	/* With SF_NODISKIO we must not sleep grabbing pages. */
	flags = (flags & SF_NODISKIO) ? VM_ALLOC_NOWAIT : 0;

	/*
	 * First grab all the pages and wire them.  Note that we grab
	 * only required pages.  Readahead pages are dealt with later.
	 */
	VM_OBJECT_WLOCK(obj);
	for (int i = 0; i < npages; i++) {
		pa[i] = vm_page_grab(obj, OFF_TO_IDX(vmoff(i, off)),
		    VM_ALLOC_WIRED | VM_ALLOC_NORMAL | flags);
		if (pa[i] == NULL) {
			/* Grab failed (nowait): trim the request here. */
			npages = i;
			rhpages = 0;
			break;
		}
	}

	for (int i = 0; i < npages;) {
		int j, a, count, rv;

		/* Skip valid pages. */
		if (vm_page_is_valid(pa[i], vmoff(i, off) & PAGE_MASK,
		    xfsize(i, npages, off, len))) {
			vm_page_xunbusy(pa[i]);
			SFSTAT_INC(sf_pages_valid);
			i++;
			continue;
		}

		/*
		 * Now 'i' points to first invalid page, iterate further
		 * to make 'j' point at first valid after a bunch of
		 * invalid ones.
		 */
		for (j = i + 1; j < npages; j++)
			if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK,
			    xfsize(j, npages, off, len))) {
				SFSTAT_INC(sf_pages_valid);
				break;
			}

		/*
		 * Now we got region of invalid pages between 'i' and 'j'.
		 * Check that they belong to pager.  They may not be there,
		 * which is a regular situation for shmem pager.  For vnode
		 * pager this happens only in case of sparse file.
		 *
		 * Important feature of vm_pager_has_page() is the hint
		 * stored in 'a', about how many pages we can pagein after
		 * this page in a single I/O.
		 */
		while (!vm_pager_has_page(obj, OFF_TO_IDX(vmoff(i, off)),
		    NULL, &a) && i < j) {
			/* Hole in the file: supply a zeroed page. */
			pmap_zero_page(pa[i]);
			pa[i]->valid = VM_PAGE_BITS_ALL;
			pa[i]->dirty = 0;
			vm_page_xunbusy(pa[i]);
			i++;
		}
		if (i == j)
			continue;

		/*
		 * We want to pagein as many pages as possible, limited only
		 * by the 'a' hint and actual request.
		 *
		 * We should not pagein into already valid page, thus if
		 * 'j' didn't reach last page, trim by that page.
		 *
		 * When the pagein fulfils the request, also specify readahead.
		 */
		if (j < npages)
			a = min(a, j - i - 1);
		count = min(a + 1, npages - i);

		/* Each async pagein holds a reference on sfio. */
		refcount_acquire(&sfio->nios);
		rv = vm_pager_get_pages_async(obj, pa + i, count, NULL,
		    i + count == npages ? &rhpages : NULL,
		    &sf_iodone, sfio);
		KASSERT(rv == VM_PAGER_OK, ("%s: pager fail obj %p page %p",
		    __func__, obj, pa[i]));

		SFSTAT_INC(sf_iocnt);
		SFSTAT_ADD(sf_pages_read, count);
		if (i + count == npages)
			SFSTAT_ADD(sf_rhpages_read, rhpages);

#ifdef INVARIANTS
		for (j = i; j < i + count && j < npages; j++)
			KASSERT(pa[j] == vm_page_lookup(obj,
			    OFF_TO_IDX(vmoff(j, off))),
			    ("pa[j] %p lookup %p\n", pa[j],
			    vm_page_lookup(obj, OFF_TO_IDX(vmoff(j, off)))));
#endif
		i += count;
		nios++;
	}

	VM_OBJECT_WUNLOCK(obj);

	if (nios == 0 && npages != 0)
		SFSTAT_INC(sf_noiocnt);

	return (nios);
}

/*
 * Resolve the file being sent to its backing VM object: either a
 * regular vnode (returning its block size and attributes-derived size)
 * or a POSIX shm object.
 */
static int
sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
    struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size,
    int *bsize)
{
	struct vattr va;
	vm_object_t obj;
	struct vnode *vp;
	struct shmfd *shmfd;
	int error;

	vp = *vp_res = NULL;
	obj = NULL;
	shmfd = *shmfd_res = NULL;
	*bsize = 0;

	/*
	 * The file descriptor must be a regular file and have a
	 * backing VM object.
	 */
	if (fp->f_type == DTYPE_VNODE) {
		vp = fp->f_vnode;
		vn_lock(vp, LK_SHARED | LK_RETRY);
		if (vp->v_type != VREG) {
			error = EINVAL;
			goto out;
		}
		*bsize = vp->v_mount->mnt_stat.f_iosize;
		error = VOP_GETATTR(vp, &va, td->td_ucred);
		if (error != 0)
			goto out;
		*obj_size = va.va_size;
		obj = vp->v_object;
		if (obj == NULL) {
			error = EINVAL;
			goto out;
		}
	} else if (fp->f_type == DTYPE_SHM) {
		error = 0;
		shmfd = fp->f_data;
		obj = shmfd->shm_object;
		*obj_size = shmfd->shm_size;
	} else {
		error = EINVAL;
		goto out;
	}

	VM_OBJECT_WLOCK(obj);
	if ((obj->flags & OBJ_DEAD) != 0) {
		VM_OBJECT_WUNLOCK(obj);
		error = EBADF;
		goto out;
	}

	/*
	 * Temporarily increase the backing VM object's reference
	 * count so that a forced reclamation of its vnode does not
	 * immediately destroy it.
	 */
	vm_object_reference_locked(obj);
	VM_OBJECT_WUNLOCK(obj);
	*obj_res = obj;
	*vp_res = vp;
	*shmfd_res = shmfd;

out:
	if (vp != NULL)
		VOP_UNLOCK(vp, 0);
	return (error);
}

/*
 * Look up the destination socket for sendfile(2); on success *sock_fp
 * holds a file reference (caller drops) and *so points at the socket.
 */
static int
kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
    struct socket **so)
{
	cap_rights_t rights;
	int error;

	*sock_fp = NULL;
	*so = NULL;

	/*
	 * The socket must be a stream socket and connected.
	 */
	error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SEND),
	    sock_fp, NULL);
	if (error != 0)
		return (error);
	*so = (*sock_fp)->f_data;
	if ((*so)->so_type != SOCK_STREAM)
		return (EINVAL);
	if (((*so)->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	return (0);
}

/*
 * fo_sendfile() implementation for vnode- and shm-backed descriptors:
 * maps file pages into sf_bufs, wraps them in external mbufs and queues
 * them on the socket, with optional header/trailer chains.
 */
int
vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    int kflags, struct thread *td)
{
	struct file *sock_fp;
	struct vnode *vp;
	struct vm_object *obj;
	struct socket *so;
	struct mbuf *m, *mh, *mhtail;
	struct sf_buf *sf;
	struct shmfd *shmfd;
	struct sendfile_sync *sfs;
	struct vattr va;
	off_t off, sbytes, rem, obj_size;
	int error, softerr, bsize, hdrlen;

	obj = NULL;
	so = NULL;
	m = mh = NULL;
	sfs = NULL;
	sbytes = 0;
	softerr = 0;

	error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
	if (error != 0)
		return (error);

	error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so);
	if (error != 0)
		goto out;

#ifdef MAC
	error = mac_socket_check_send(td->td_ucred, so);
	if (error != 0)
		goto out;
#endif

	SFSTAT_INC(sf_syscalls);
	SFSTAT_ADD(sf_rhpages_requested, SF_READAHEAD(flags));

	if (flags & SF_SYNC) {
		sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO);
		mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
		cv_init(&sfs->cv, "sendfile");
	}

	/* If headers are specified copy them into mbufs. */
	if (hdr_uio != NULL && hdr_uio->uio_resid > 0) {
		hdr_uio->uio_td = td;
		hdr_uio->uio_rw = UIO_WRITE;
		/*
		 * In FBSD < 5.0 the nbytes to send also included
		 * the header.  If compat is specified subtract the
		 * header size from nbytes.
2403 */ 2404 if (kflags & SFK_COMPAT) { 2405 if (nbytes > hdr_uio->uio_resid) 2406 nbytes -= hdr_uio->uio_resid; 2407 else 2408 nbytes = 0; 2409 } 2410 mh = m_uiotombuf(hdr_uio, M_WAITOK, 0, 0, 0); 2411 hdrlen = m_length(mh, &mhtail); 2412 } else 2413 hdrlen = 0; 2414 2415 rem = nbytes ? omin(nbytes, obj_size - offset) : obj_size - offset; 2416 2417 /* 2418 * Protect against multiple writers to the socket. 2419 * 2420 * XXXRW: Historically this has assumed non-interruptibility, so now 2421 * we implement that, but possibly shouldn't. 2422 */ 2423 (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR); 2424 2425 /* 2426 * Loop through the pages of the file, starting with the requested 2427 * offset. Get a file page (do I/O if necessary), map the file page 2428 * into an sf_buf, attach an mbuf header to the sf_buf, and queue 2429 * it on the socket. 2430 * This is done in two loops. The inner loop turns as many pages 2431 * as it can, up to available socket buffer space, without blocking 2432 * into mbufs to have it bulk delivered into the socket send buffer. 2433 * The outer loop checks the state and available space of the socket 2434 * and takes care of the overall progress. 2435 */ 2436 for (off = offset; rem > 0; ) { 2437 struct sf_io *sfio; 2438 vm_page_t *pa; 2439 struct mbuf *mtail; 2440 int nios, space, npages, rhpages; 2441 2442 mtail = NULL; 2443 /* 2444 * Check the socket state for ongoing connection, 2445 * no errors and space in socket buffer. 2446 * If space is low allow for the remainder of the 2447 * file to be processed if it fits the socket buffer. 2448 * Otherwise block in waiting for sufficient space 2449 * to proceed, or if the socket is nonblocking, return 2450 * to userland with EAGAIN while reporting how far 2451 * we've come. 2452 * We wait until the socket buffer has significant free 2453 * space to do bulk sends. 
This makes good use of file 2454 * system read ahead and allows packet segmentation 2455 * offloading hardware to take over lots of work. If 2456 * we were not careful here we would send off only one 2457 * sfbuf at a time. 2458 */ 2459 SOCKBUF_LOCK(&so->so_snd); 2460 if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2) 2461 so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; 2462 retry_space: 2463 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2464 error = EPIPE; 2465 SOCKBUF_UNLOCK(&so->so_snd); 2466 goto done; 2467 } else if (so->so_error) { 2468 error = so->so_error; 2469 so->so_error = 0; 2470 SOCKBUF_UNLOCK(&so->so_snd); 2471 goto done; 2472 } 2473 space = sbspace(&so->so_snd); 2474 if (space < rem && 2475 (space <= 0 || 2476 space < so->so_snd.sb_lowat)) { 2477 if (so->so_state & SS_NBIO) { 2478 SOCKBUF_UNLOCK(&so->so_snd); 2479 error = EAGAIN; 2480 goto done; 2481 } 2482 /* 2483 * sbwait drops the lock while sleeping. 2484 * When we loop back to retry_space the 2485 * state may have changed and we retest 2486 * for it. 2487 */ 2488 error = sbwait(&so->so_snd); 2489 /* 2490 * An error from sbwait usually indicates that we've 2491 * been interrupted by a signal. If we've sent anything 2492 * then return bytes sent, otherwise return the error. 2493 */ 2494 if (error != 0) { 2495 SOCKBUF_UNLOCK(&so->so_snd); 2496 goto done; 2497 } 2498 goto retry_space; 2499 } 2500 SOCKBUF_UNLOCK(&so->so_snd); 2501 2502 /* 2503 * Reduce space in the socket buffer by the size of 2504 * the header mbuf chain. 2505 * hdrlen is set to 0 after the first loop. 
2506 */ 2507 space -= hdrlen; 2508 2509 if (vp != NULL) { 2510 error = vn_lock(vp, LK_SHARED); 2511 if (error != 0) 2512 goto done; 2513 error = VOP_GETATTR(vp, &va, td->td_ucred); 2514 if (error != 0 || off >= va.va_size) { 2515 VOP_UNLOCK(vp, 0); 2516 goto done; 2517 } 2518 if (va.va_size != obj_size) { 2519 if (nbytes == 0) 2520 rem += va.va_size - obj_size; 2521 else if (offset + nbytes > va.va_size) 2522 rem -= (offset + nbytes - va.va_size); 2523 obj_size = va.va_size; 2524 } 2525 } 2526 2527 if (space > rem) 2528 space = rem; 2529 2530 npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE); 2531 2532 /* 2533 * Calculate maximum allowed number of pages for readahead 2534 * at this iteration. First, we allow readahead up to "rem". 2535 * If application wants more, let it be, but there is no 2536 * reason to go above MAXPHYS. Also check against "obj_size", 2537 * since vm_pager_has_page() can hint beyond EOF. 2538 */ 2539 rhpages = howmany(rem + (off & PAGE_MASK), PAGE_SIZE) - npages; 2540 rhpages += SF_READAHEAD(flags); 2541 rhpages = min(howmany(MAXPHYS, PAGE_SIZE), rhpages); 2542 rhpages = min(howmany(obj_size - trunc_page(off), PAGE_SIZE) - 2543 npages, rhpages); 2544 2545 sfio = malloc(sizeof(struct sf_io) + 2546 npages * sizeof(vm_page_t), M_TEMP, M_WAITOK); 2547 refcount_init(&sfio->nios, 1); 2548 sfio->error = 0; 2549 2550 nios = sendfile_swapin(obj, sfio, off, space, npages, rhpages, 2551 flags); 2552 2553 /* 2554 * Loop and construct maximum sized mbuf chain to be bulk 2555 * dumped into socket buffer. 2556 */ 2557 pa = sfio->pa; 2558 for (int i = 0; i < npages; i++) { 2559 struct mbuf *m0; 2560 2561 /* 2562 * If a page wasn't grabbed successfully, then 2563 * trim the array. Can happen only with SF_NODISKIO. 2564 */ 2565 if (pa[i] == NULL) { 2566 SFSTAT_INC(sf_busy); 2567 fixspace(npages, i, off, &space); 2568 npages = i; 2569 softerr = EBUSY; 2570 break; 2571 } 2572 2573 /* 2574 * Get a sendfile buf. 
When allocating the 2575 * first buffer for mbuf chain, we usually 2576 * wait as long as necessary, but this wait 2577 * can be interrupted. For consequent 2578 * buffers, do not sleep, since several 2579 * threads might exhaust the buffers and then 2580 * deadlock. 2581 */ 2582 sf = sf_buf_alloc(pa[i], 2583 m != NULL ? SFB_NOWAIT : SFB_CATCH); 2584 if (sf == NULL) { 2585 SFSTAT_INC(sf_allocfail); 2586 for (int j = i; j < npages; j++) { 2587 vm_page_lock(pa[j]); 2588 vm_page_unwire(pa[j], PQ_INACTIVE); 2589 vm_page_unlock(pa[j]); 2590 } 2591 if (m == NULL) 2592 softerr = ENOBUFS; 2593 fixspace(npages, i, off, &space); 2594 npages = i; 2595 break; 2596 } 2597 2598 m0 = m_get(M_WAITOK, MT_DATA); 2599 m0->m_ext.ext_buf = (char *)sf_buf_kva(sf); 2600 m0->m_ext.ext_size = PAGE_SIZE; 2601 m0->m_ext.ext_arg1 = sf; 2602 m0->m_ext.ext_arg2 = sfs; 2603 /* 2604 * SF_NOCACHE sets the page as being freed upon send. 2605 * However, we ignore it for the last page in 'space', 2606 * if the page is truncated, and we got more data to 2607 * send (rem > space), or if we have readahead 2608 * configured (rhpages > 0). 2609 */ 2610 if ((flags & SF_NOCACHE) == 0 || 2611 (i == npages - 1 && 2612 ((off + space) & PAGE_MASK) && 2613 (rem > space || rhpages > 0))) 2614 m0->m_ext.ext_type = EXT_SFBUF; 2615 else 2616 m0->m_ext.ext_type = EXT_SFBUF_NOCACHE; 2617 m0->m_ext.ext_flags = 0; 2618 m0->m_flags |= (M_EXT | M_RDONLY); 2619 if (nios) 2620 m0->m_flags |= M_NOTREADY; 2621 m0->m_data = (char *)sf_buf_kva(sf) + 2622 (vmoff(i, off) & PAGE_MASK); 2623 m0->m_len = xfsize(i, npages, off, space); 2624 2625 if (i == 0) 2626 sfio->m = m0; 2627 2628 /* Append to mbuf chain. */ 2629 if (mtail != NULL) 2630 mtail->m_next = m0; 2631 else 2632 m = m0; 2633 mtail = m0; 2634 2635 if (sfs != NULL) { 2636 mtx_lock(&sfs->mtx); 2637 sfs->count++; 2638 mtx_unlock(&sfs->mtx); 2639 } 2640 } 2641 2642 if (vp != NULL) 2643 VOP_UNLOCK(vp, 0); 2644 2645 /* Keep track of bytes processed. 
*/ 2646 off += space; 2647 rem -= space; 2648 2649 /* Prepend header, if any. */ 2650 if (hdrlen) { 2651 mhtail->m_next = m; 2652 m = mh; 2653 mh = NULL; 2654 } 2655 2656 if (m == NULL) { 2657 KASSERT(softerr, ("%s: m NULL, no error", __func__)); 2658 error = softerr; 2659 free(sfio, M_TEMP); 2660 goto done; 2661 } 2662 2663 /* Add the buffer chain to the socket buffer. */ 2664 KASSERT(m_length(m, NULL) == space + hdrlen, 2665 ("%s: mlen %u space %d hdrlen %d", 2666 __func__, m_length(m, NULL), space, hdrlen)); 2667 2668 CURVNET_SET(so->so_vnet); 2669 if (nios == 0) { 2670 /* 2671 * If sendfile_swapin() didn't initiate any I/Os, 2672 * which happens if all data is cached in VM, then 2673 * we can send data right now without the 2674 * PRUS_NOTREADY flag. 2675 */ 2676 free(sfio, M_TEMP); 2677 error = (*so->so_proto->pr_usrreqs->pru_send) 2678 (so, 0, m, NULL, NULL, td); 2679 } else { 2680 sfio->sock_fp = sock_fp; 2681 sfio->npages = npages; 2682 fhold(sock_fp); 2683 error = (*so->so_proto->pr_usrreqs->pru_send) 2684 (so, PRUS_NOTREADY, m, NULL, NULL, td); 2685 sf_iodone(sfio, NULL, 0, 0); 2686 } 2687 CURVNET_RESTORE(); 2688 2689 m = NULL; /* pru_send always consumes */ 2690 if (error) 2691 goto done; 2692 sbytes += space + hdrlen; 2693 if (hdrlen) 2694 hdrlen = 0; 2695 if (softerr) { 2696 error = softerr; 2697 goto done; 2698 } 2699 } 2700 2701 /* 2702 * Send trailers. Wimp out and use writev(2). 2703 */ 2704 if (trl_uio != NULL) { 2705 sbunlock(&so->so_snd); 2706 error = kern_writev(td, sockfd, trl_uio); 2707 if (error == 0) 2708 sbytes += td->td_retval[0]; 2709 goto out; 2710 } 2711 2712 done: 2713 sbunlock(&so->so_snd); 2714 out: 2715 /* 2716 * If there was no error we have to clear td->td_retval[0] 2717 * because it may have been set by writev. 
2718 */ 2719 if (error == 0) { 2720 td->td_retval[0] = 0; 2721 } 2722 if (sent != NULL) { 2723 (*sent) = sbytes; 2724 } 2725 if (obj != NULL) 2726 vm_object_deallocate(obj); 2727 if (so) 2728 fdrop(sock_fp, td); 2729 if (m) 2730 m_freem(m); 2731 if (mh) 2732 m_freem(mh); 2733 2734 if (sfs != NULL) { 2735 mtx_lock(&sfs->mtx); 2736 if (sfs->count != 0) 2737 cv_wait(&sfs->cv, &sfs->mtx); 2738 KASSERT(sfs->count == 0, ("sendfile sync still busy")); 2739 cv_destroy(&sfs->cv); 2740 mtx_destroy(&sfs->mtx); 2741 free(sfs, M_TEMP); 2742 } 2743 2744 if (error == ERESTART) 2745 error = EINTR; 2746 2747 return (error); 2748 } 2749