1 /*- 2 * Copyright (c) 1982, 1986, 1989, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * sendfile(2) and related extensions: 6 * Copyright (c) 1998, David Greenman. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 4. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 33 */ 34 35 #include <sys/cdefs.h> 36 __FBSDID("$FreeBSD$"); 37 38 #include "opt_capsicum.h" 39 #include "opt_inet.h" 40 #include "opt_inet6.h" 41 #include "opt_compat.h" 42 #include "opt_ktrace.h" 43 44 #include <sys/param.h> 45 #include <sys/systm.h> 46 #include <sys/capsicum.h> 47 #include <sys/condvar.h> 48 #include <sys/kernel.h> 49 #include <sys/lock.h> 50 #include <sys/mutex.h> 51 #include <sys/sysproto.h> 52 #include <sys/malloc.h> 53 #include <sys/filedesc.h> 54 #include <sys/event.h> 55 #include <sys/proc.h> 56 #include <sys/fcntl.h> 57 #include <sys/file.h> 58 #include <sys/filio.h> 59 #include <sys/jail.h> 60 #include <sys/mman.h> 61 #include <sys/mount.h> 62 #include <sys/mbuf.h> 63 #include <sys/protosw.h> 64 #include <sys/rwlock.h> 65 #include <sys/sf_buf.h> 66 #include <sys/sysent.h> 67 #include <sys/socket.h> 68 #include <sys/socketvar.h> 69 #include <sys/signalvar.h> 70 #include <sys/syscallsubr.h> 71 #include <sys/sysctl.h> 72 #include <sys/uio.h> 73 #include <sys/vnode.h> 74 #ifdef KTRACE 75 #include <sys/ktrace.h> 76 #endif 77 #ifdef COMPAT_FREEBSD32 78 #include <compat/freebsd32/freebsd32_util.h> 79 #endif 80 81 #include <net/vnet.h> 82 83 #include <security/audit/audit.h> 84 #include <security/mac/mac_framework.h> 85 86 #include <vm/vm.h> 87 #include <vm/vm_param.h> 88 #include <vm/vm_object.h> 89 #include <vm/vm_page.h> 90 #include <vm/vm_pager.h> 91 #include <vm/vm_kern.h> 92 #include <vm/vm_extern.h> 93 #include <vm/uma.h> 94 95 /* 96 * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC 97 * and SOCK_NONBLOCK. 
98 */ 99 #define ACCEPT4_INHERIT 0x1 100 #define ACCEPT4_COMPAT 0x2 101 102 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); 103 static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); 104 105 static int accept1(struct thread *td, int s, struct sockaddr *uname, 106 socklen_t *anamelen, int flags); 107 static int do_sendfile(struct thread *td, struct sendfile_args *uap, 108 int compat); 109 static int getsockname1(struct thread *td, struct getsockname_args *uap, 110 int compat); 111 static int getpeername1(struct thread *td, struct getpeername_args *uap, 112 int compat); 113 114 counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)]; 115 116 /* 117 * sendfile(2)-related variables and associated sysctls 118 */ 119 static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0, 120 "sendfile(2) tunables"); 121 static int sfreadahead = 1; 122 SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW, 123 &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks"); 124 125 static void 126 sfstat_init(const void *unused) 127 { 128 129 COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t), 130 M_WAITOK); 131 } 132 SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL); 133 134 static int 135 sfstat_sysctl(SYSCTL_HANDLER_ARGS) 136 { 137 struct sfstat s; 138 139 COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t)); 140 if (req->newptr) 141 COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t)); 142 return (SYSCTL_OUT(req, &s, sizeof(s))); 143 } 144 SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW, 145 NULL, 0, sfstat_sysctl, "I", "sendfile statistics"); 146 147 /* 148 * Convert a user file descriptor to a kernel file entry and check if required 149 * capability rights are present. 150 * A reference on the file entry is held upon returning. 
151 */ 152 int 153 getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, 154 struct file **fpp, u_int *fflagp) 155 { 156 struct file *fp; 157 int error; 158 159 error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp, NULL); 160 if (error != 0) 161 return (error); 162 if (fp->f_type != DTYPE_SOCKET) { 163 fdrop(fp, td); 164 return (ENOTSOCK); 165 } 166 if (fflagp != NULL) 167 *fflagp = fp->f_flag; 168 *fpp = fp; 169 return (0); 170 } 171 172 /* 173 * System call interface to the socket abstraction. 174 */ 175 #if defined(COMPAT_43) 176 #define COMPAT_OLDSOCK 177 #endif 178 179 int 180 sys_socket(td, uap) 181 struct thread *td; 182 struct socket_args /* { 183 int domain; 184 int type; 185 int protocol; 186 } */ *uap; 187 { 188 struct socket *so; 189 struct file *fp; 190 int fd, error, type, oflag, fflag; 191 192 AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol); 193 194 type = uap->type; 195 oflag = 0; 196 fflag = 0; 197 if ((type & SOCK_CLOEXEC) != 0) { 198 type &= ~SOCK_CLOEXEC; 199 oflag |= O_CLOEXEC; 200 } 201 if ((type & SOCK_NONBLOCK) != 0) { 202 type &= ~SOCK_NONBLOCK; 203 fflag |= FNONBLOCK; 204 } 205 206 #ifdef MAC 207 error = mac_socket_check_create(td->td_ucred, uap->domain, type, 208 uap->protocol); 209 if (error != 0) 210 return (error); 211 #endif 212 error = falloc(td, &fp, &fd, oflag); 213 if (error != 0) 214 return (error); 215 /* An extra reference on `fp' has been held for us by falloc(). 
*/ 216 error = socreate(uap->domain, &so, type, uap->protocol, 217 td->td_ucred, td); 218 if (error != 0) { 219 fdclose(td, fp, fd); 220 } else { 221 finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops); 222 if ((fflag & FNONBLOCK) != 0) 223 (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td); 224 td->td_retval[0] = fd; 225 } 226 fdrop(fp, td); 227 return (error); 228 } 229 230 /* ARGSUSED */ 231 int 232 sys_bind(td, uap) 233 struct thread *td; 234 struct bind_args /* { 235 int s; 236 caddr_t name; 237 int namelen; 238 } */ *uap; 239 { 240 struct sockaddr *sa; 241 int error; 242 243 error = getsockaddr(&sa, uap->name, uap->namelen); 244 if (error == 0) { 245 error = kern_bindat(td, AT_FDCWD, uap->s, sa); 246 free(sa, M_SONAME); 247 } 248 return (error); 249 } 250 251 int 252 kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) 253 { 254 struct socket *so; 255 struct file *fp; 256 cap_rights_t rights; 257 int error; 258 259 AUDIT_ARG_FD(fd); 260 AUDIT_ARG_SOCKADDR(td, dirfd, sa); 261 error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_BIND), 262 &fp, NULL); 263 if (error != 0) 264 return (error); 265 so = fp->f_data; 266 #ifdef KTRACE 267 if (KTRPOINT(td, KTR_STRUCT)) 268 ktrsockaddr(sa); 269 #endif 270 #ifdef MAC 271 error = mac_socket_check_bind(td->td_ucred, so, sa); 272 if (error == 0) { 273 #endif 274 if (dirfd == AT_FDCWD) 275 error = sobind(so, sa, td); 276 else 277 error = sobindat(dirfd, so, sa, td); 278 #ifdef MAC 279 } 280 #endif 281 fdrop(fp, td); 282 return (error); 283 } 284 285 /* ARGSUSED */ 286 int 287 sys_bindat(td, uap) 288 struct thread *td; 289 struct bindat_args /* { 290 int fd; 291 int s; 292 caddr_t name; 293 int namelen; 294 } */ *uap; 295 { 296 struct sockaddr *sa; 297 int error; 298 299 error = getsockaddr(&sa, uap->name, uap->namelen); 300 if (error == 0) { 301 error = kern_bindat(td, uap->fd, uap->s, sa); 302 free(sa, M_SONAME); 303 } 304 return (error); 305 } 306 307 /* ARGSUSED */ 308 int 309 
sys_listen(td, uap) 310 struct thread *td; 311 struct listen_args /* { 312 int s; 313 int backlog; 314 } */ *uap; 315 { 316 struct socket *so; 317 struct file *fp; 318 cap_rights_t rights; 319 int error; 320 321 AUDIT_ARG_FD(uap->s); 322 error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_LISTEN), 323 &fp, NULL); 324 if (error == 0) { 325 so = fp->f_data; 326 #ifdef MAC 327 error = mac_socket_check_listen(td->td_ucred, so); 328 if (error == 0) 329 #endif 330 error = solisten(so, uap->backlog, td); 331 fdrop(fp, td); 332 } 333 return(error); 334 } 335 336 /* 337 * accept1() 338 */ 339 static int 340 accept1(td, s, uname, anamelen, flags) 341 struct thread *td; 342 int s; 343 struct sockaddr *uname; 344 socklen_t *anamelen; 345 int flags; 346 { 347 struct sockaddr *name; 348 socklen_t namelen; 349 struct file *fp; 350 int error; 351 352 if (uname == NULL) 353 return (kern_accept4(td, s, NULL, NULL, flags, NULL)); 354 355 error = copyin(anamelen, &namelen, sizeof (namelen)); 356 if (error != 0) 357 return (error); 358 359 error = kern_accept4(td, s, &name, &namelen, flags, &fp); 360 361 if (error != 0) 362 return (error); 363 364 if (error == 0 && uname != NULL) { 365 #ifdef COMPAT_OLDSOCK 366 if (flags & ACCEPT4_COMPAT) 367 ((struct osockaddr *)name)->sa_family = 368 name->sa_family; 369 #endif 370 error = copyout(name, uname, namelen); 371 } 372 if (error == 0) 373 error = copyout(&namelen, anamelen, 374 sizeof(namelen)); 375 if (error != 0) 376 fdclose(td, fp, td->td_retval[0]); 377 fdrop(fp, td); 378 free(name, M_SONAME); 379 return (error); 380 } 381 382 int 383 kern_accept(struct thread *td, int s, struct sockaddr **name, 384 socklen_t *namelen, struct file **fp) 385 { 386 return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp)); 387 } 388 389 int 390 kern_accept4(struct thread *td, int s, struct sockaddr **name, 391 socklen_t *namelen, int flags, struct file **fp) 392 { 393 struct file *headfp, *nfp = NULL; 394 struct sockaddr *sa = NULL; 395 
struct socket *head, *so; 396 cap_rights_t rights; 397 u_int fflag; 398 pid_t pgid; 399 int error, fd, tmp; 400 401 if (name != NULL) 402 *name = NULL; 403 404 AUDIT_ARG_FD(s); 405 error = getsock_cap(td, s, cap_rights_init(&rights, CAP_ACCEPT), 406 &headfp, &fflag); 407 if (error != 0) 408 return (error); 409 head = headfp->f_data; 410 if ((head->so_options & SO_ACCEPTCONN) == 0) { 411 error = EINVAL; 412 goto done; 413 } 414 #ifdef MAC 415 error = mac_socket_check_accept(td->td_ucred, head); 416 if (error != 0) 417 goto done; 418 #endif 419 error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0); 420 if (error != 0) 421 goto done; 422 ACCEPT_LOCK(); 423 if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { 424 ACCEPT_UNLOCK(); 425 error = EWOULDBLOCK; 426 goto noconnection; 427 } 428 while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { 429 if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { 430 head->so_error = ECONNABORTED; 431 break; 432 } 433 error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, 434 "accept", 0); 435 if (error != 0) { 436 ACCEPT_UNLOCK(); 437 goto noconnection; 438 } 439 } 440 if (head->so_error) { 441 error = head->so_error; 442 head->so_error = 0; 443 ACCEPT_UNLOCK(); 444 goto noconnection; 445 } 446 so = TAILQ_FIRST(&head->so_comp); 447 KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); 448 KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); 449 450 /* 451 * Before changing the flags on the socket, we have to bump the 452 * reference count. Otherwise, if the protocol calls sofree(), 453 * the socket will be released due to a zero refcount. 454 */ 455 SOCK_LOCK(so); /* soref() and so_state update */ 456 soref(so); /* file descriptor reference */ 457 458 TAILQ_REMOVE(&head->so_comp, so, so_list); 459 head->so_qlen--; 460 if (flags & ACCEPT4_INHERIT) 461 so->so_state |= (head->so_state & SS_NBIO); 462 else 463 so->so_state |= (flags & SOCK_NONBLOCK) ? 
SS_NBIO : 0; 464 so->so_qstate &= ~SQ_COMP; 465 so->so_head = NULL; 466 467 SOCK_UNLOCK(so); 468 ACCEPT_UNLOCK(); 469 470 /* An extra reference on `nfp' has been held for us by falloc(). */ 471 td->td_retval[0] = fd; 472 473 /* connection has been removed from the listen queue */ 474 KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); 475 476 if (flags & ACCEPT4_INHERIT) { 477 pgid = fgetown(&head->so_sigio); 478 if (pgid != 0) 479 fsetown(pgid, &so->so_sigio); 480 } else { 481 fflag &= ~(FNONBLOCK | FASYNC); 482 if (flags & SOCK_NONBLOCK) 483 fflag |= FNONBLOCK; 484 } 485 486 finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); 487 /* Sync socket nonblocking/async state with file flags */ 488 tmp = fflag & FNONBLOCK; 489 (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); 490 tmp = fflag & FASYNC; 491 (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); 492 sa = 0; 493 error = soaccept(so, &sa); 494 if (error != 0) 495 goto noconnection; 496 if (sa == NULL) { 497 if (name) 498 *namelen = 0; 499 goto done; 500 } 501 AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa); 502 if (name) { 503 /* check sa_len before it is destroyed */ 504 if (*namelen > sa->sa_len) 505 *namelen = sa->sa_len; 506 #ifdef KTRACE 507 if (KTRPOINT(td, KTR_STRUCT)) 508 ktrsockaddr(sa); 509 #endif 510 *name = sa; 511 sa = NULL; 512 } 513 noconnection: 514 free(sa, M_SONAME); 515 516 /* 517 * close the new descriptor, assuming someone hasn't ripped it 518 * out from under us. 519 */ 520 if (error != 0) 521 fdclose(td, nfp, fd); 522 523 /* 524 * Release explicitly held references before returning. We return 525 * a reference on nfp to the caller on success if they request it. 
526 */ 527 done: 528 if (fp != NULL) { 529 if (error == 0) { 530 *fp = nfp; 531 nfp = NULL; 532 } else 533 *fp = NULL; 534 } 535 if (nfp != NULL) 536 fdrop(nfp, td); 537 fdrop(headfp, td); 538 return (error); 539 } 540 541 int 542 sys_accept(td, uap) 543 struct thread *td; 544 struct accept_args *uap; 545 { 546 547 return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT)); 548 } 549 550 int 551 sys_accept4(td, uap) 552 struct thread *td; 553 struct accept4_args *uap; 554 { 555 556 if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) 557 return (EINVAL); 558 559 return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags)); 560 } 561 562 #ifdef COMPAT_OLDSOCK 563 int 564 oaccept(td, uap) 565 struct thread *td; 566 struct accept_args *uap; 567 { 568 569 return (accept1(td, uap->s, uap->name, uap->anamelen, 570 ACCEPT4_INHERIT | ACCEPT4_COMPAT)); 571 } 572 #endif /* COMPAT_OLDSOCK */ 573 574 /* ARGSUSED */ 575 int 576 sys_connect(td, uap) 577 struct thread *td; 578 struct connect_args /* { 579 int s; 580 caddr_t name; 581 int namelen; 582 } */ *uap; 583 { 584 struct sockaddr *sa; 585 int error; 586 587 error = getsockaddr(&sa, uap->name, uap->namelen); 588 if (error == 0) { 589 error = kern_connectat(td, AT_FDCWD, uap->s, sa); 590 free(sa, M_SONAME); 591 } 592 return (error); 593 } 594 595 int 596 kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) 597 { 598 struct socket *so; 599 struct file *fp; 600 cap_rights_t rights; 601 int error, interrupted = 0; 602 603 AUDIT_ARG_FD(fd); 604 AUDIT_ARG_SOCKADDR(td, dirfd, sa); 605 error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_CONNECT), 606 &fp, NULL); 607 if (error != 0) 608 return (error); 609 so = fp->f_data; 610 if (so->so_state & SS_ISCONNECTING) { 611 error = EALREADY; 612 goto done1; 613 } 614 #ifdef KTRACE 615 if (KTRPOINT(td, KTR_STRUCT)) 616 ktrsockaddr(sa); 617 #endif 618 #ifdef MAC 619 error = mac_socket_check_connect(td->td_ucred, so, sa); 620 if (error != 0) 621 goto 
bad; 622 #endif 623 if (dirfd == AT_FDCWD) 624 error = soconnect(so, sa, td); 625 else 626 error = soconnectat(dirfd, so, sa, td); 627 if (error != 0) 628 goto bad; 629 if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { 630 error = EINPROGRESS; 631 goto done1; 632 } 633 SOCK_LOCK(so); 634 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { 635 error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, 636 "connec", 0); 637 if (error != 0) { 638 if (error == EINTR || error == ERESTART) 639 interrupted = 1; 640 break; 641 } 642 } 643 if (error == 0) { 644 error = so->so_error; 645 so->so_error = 0; 646 } 647 SOCK_UNLOCK(so); 648 bad: 649 if (!interrupted) 650 so->so_state &= ~SS_ISCONNECTING; 651 if (error == ERESTART) 652 error = EINTR; 653 done1: 654 fdrop(fp, td); 655 return (error); 656 } 657 658 /* ARGSUSED */ 659 int 660 sys_connectat(td, uap) 661 struct thread *td; 662 struct connectat_args /* { 663 int fd; 664 int s; 665 caddr_t name; 666 int namelen; 667 } */ *uap; 668 { 669 struct sockaddr *sa; 670 int error; 671 672 error = getsockaddr(&sa, uap->name, uap->namelen); 673 if (error == 0) { 674 error = kern_connectat(td, uap->fd, uap->s, sa); 675 free(sa, M_SONAME); 676 } 677 return (error); 678 } 679 680 int 681 kern_socketpair(struct thread *td, int domain, int type, int protocol, 682 int *rsv) 683 { 684 struct file *fp1, *fp2; 685 struct socket *so1, *so2; 686 int fd, error, oflag, fflag; 687 688 AUDIT_ARG_SOCKET(domain, type, protocol); 689 690 oflag = 0; 691 fflag = 0; 692 if ((type & SOCK_CLOEXEC) != 0) { 693 type &= ~SOCK_CLOEXEC; 694 oflag |= O_CLOEXEC; 695 } 696 if ((type & SOCK_NONBLOCK) != 0) { 697 type &= ~SOCK_NONBLOCK; 698 fflag |= FNONBLOCK; 699 } 700 #ifdef MAC 701 /* We might want to have a separate check for socket pairs. 
*/ 702 error = mac_socket_check_create(td->td_ucred, domain, type, 703 protocol); 704 if (error != 0) 705 return (error); 706 #endif 707 error = socreate(domain, &so1, type, protocol, td->td_ucred, td); 708 if (error != 0) 709 return (error); 710 error = socreate(domain, &so2, type, protocol, td->td_ucred, td); 711 if (error != 0) 712 goto free1; 713 /* On success extra reference to `fp1' and 'fp2' is set by falloc. */ 714 error = falloc(td, &fp1, &fd, oflag); 715 if (error != 0) 716 goto free2; 717 rsv[0] = fd; 718 fp1->f_data = so1; /* so1 already has ref count */ 719 error = falloc(td, &fp2, &fd, oflag); 720 if (error != 0) 721 goto free3; 722 fp2->f_data = so2; /* so2 already has ref count */ 723 rsv[1] = fd; 724 error = soconnect2(so1, so2); 725 if (error != 0) 726 goto free4; 727 if (type == SOCK_DGRAM) { 728 /* 729 * Datagram socket connection is asymmetric. 730 */ 731 error = soconnect2(so2, so1); 732 if (error != 0) 733 goto free4; 734 } 735 finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data, 736 &socketops); 737 finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data, 738 &socketops); 739 if ((fflag & FNONBLOCK) != 0) { 740 (void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td); 741 (void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td); 742 } 743 fdrop(fp1, td); 744 fdrop(fp2, td); 745 return (0); 746 free4: 747 fdclose(td, fp2, rsv[1]); 748 fdrop(fp2, td); 749 free3: 750 fdclose(td, fp1, rsv[0]); 751 fdrop(fp1, td); 752 free2: 753 if (so2 != NULL) 754 (void)soclose(so2); 755 free1: 756 if (so1 != NULL) 757 (void)soclose(so1); 758 return (error); 759 } 760 761 int 762 sys_socketpair(struct thread *td, struct socketpair_args *uap) 763 { 764 int error, sv[2]; 765 766 error = kern_socketpair(td, uap->domain, uap->type, 767 uap->protocol, sv); 768 if (error != 0) 769 return (error); 770 error = copyout(sv, uap->rsv, 2 * sizeof(int)); 771 if (error != 0) { 772 (void)kern_close(td, sv[0]); 773 (void)kern_close(td, sv[1]); 774 } 775 return 
(error); 776 } 777 778 static int 779 sendit(td, s, mp, flags) 780 struct thread *td; 781 int s; 782 struct msghdr *mp; 783 int flags; 784 { 785 struct mbuf *control; 786 struct sockaddr *to; 787 int error; 788 789 #ifdef CAPABILITY_MODE 790 if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL)) 791 return (ECAPMODE); 792 #endif 793 794 if (mp->msg_name != NULL) { 795 error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); 796 if (error != 0) { 797 to = NULL; 798 goto bad; 799 } 800 mp->msg_name = to; 801 } else { 802 to = NULL; 803 } 804 805 if (mp->msg_control) { 806 if (mp->msg_controllen < sizeof(struct cmsghdr) 807 #ifdef COMPAT_OLDSOCK 808 && mp->msg_flags != MSG_COMPAT 809 #endif 810 ) { 811 error = EINVAL; 812 goto bad; 813 } 814 error = sockargs(&control, mp->msg_control, 815 mp->msg_controllen, MT_CONTROL); 816 if (error != 0) 817 goto bad; 818 #ifdef COMPAT_OLDSOCK 819 if (mp->msg_flags == MSG_COMPAT) { 820 struct cmsghdr *cm; 821 822 M_PREPEND(control, sizeof(*cm), M_WAITOK); 823 cm = mtod(control, struct cmsghdr *); 824 cm->cmsg_len = control->m_len; 825 cm->cmsg_level = SOL_SOCKET; 826 cm->cmsg_type = SCM_RIGHTS; 827 } 828 #endif 829 } else { 830 control = NULL; 831 } 832 833 error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE); 834 835 bad: 836 free(to, M_SONAME); 837 return (error); 838 } 839 840 int 841 kern_sendit(td, s, mp, flags, control, segflg) 842 struct thread *td; 843 int s; 844 struct msghdr *mp; 845 int flags; 846 struct mbuf *control; 847 enum uio_seg segflg; 848 { 849 struct file *fp; 850 struct uio auio; 851 struct iovec *iov; 852 struct socket *so; 853 cap_rights_t rights; 854 #ifdef KTRACE 855 struct uio *ktruio = NULL; 856 #endif 857 ssize_t len; 858 int i, error; 859 860 AUDIT_ARG_FD(s); 861 cap_rights_init(&rights, CAP_SEND); 862 if (mp->msg_name != NULL) { 863 AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name); 864 cap_rights_set(&rights, CAP_CONNECT); 865 } 866 error = getsock_cap(td, s, &rights, &fp, NULL); 867 if (error 
!= 0) 868 return (error); 869 so = (struct socket *)fp->f_data; 870 871 #ifdef KTRACE 872 if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT)) 873 ktrsockaddr(mp->msg_name); 874 #endif 875 #ifdef MAC 876 if (mp->msg_name != NULL) { 877 error = mac_socket_check_connect(td->td_ucred, so, 878 mp->msg_name); 879 if (error != 0) 880 goto bad; 881 } 882 error = mac_socket_check_send(td->td_ucred, so); 883 if (error != 0) 884 goto bad; 885 #endif 886 887 auio.uio_iov = mp->msg_iov; 888 auio.uio_iovcnt = mp->msg_iovlen; 889 auio.uio_segflg = segflg; 890 auio.uio_rw = UIO_WRITE; 891 auio.uio_td = td; 892 auio.uio_offset = 0; /* XXX */ 893 auio.uio_resid = 0; 894 iov = mp->msg_iov; 895 for (i = 0; i < mp->msg_iovlen; i++, iov++) { 896 if ((auio.uio_resid += iov->iov_len) < 0) { 897 error = EINVAL; 898 goto bad; 899 } 900 } 901 #ifdef KTRACE 902 if (KTRPOINT(td, KTR_GENIO)) 903 ktruio = cloneuio(&auio); 904 #endif 905 len = auio.uio_resid; 906 error = sosend(so, mp->msg_name, &auio, 0, control, flags, td); 907 if (error != 0) { 908 if (auio.uio_resid != len && (error == ERESTART || 909 error == EINTR || error == EWOULDBLOCK)) 910 error = 0; 911 /* Generation of SIGPIPE can be controlled per socket */ 912 if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && 913 !(flags & MSG_NOSIGNAL)) { 914 PROC_LOCK(td->td_proc); 915 tdsignal(td, SIGPIPE); 916 PROC_UNLOCK(td->td_proc); 917 } 918 } 919 if (error == 0) 920 td->td_retval[0] = len - auio.uio_resid; 921 #ifdef KTRACE 922 if (ktruio != NULL) { 923 ktruio->uio_resid = td->td_retval[0]; 924 ktrgenio(s, UIO_WRITE, ktruio, error); 925 } 926 #endif 927 bad: 928 fdrop(fp, td); 929 return (error); 930 } 931 932 int 933 sys_sendto(td, uap) 934 struct thread *td; 935 struct sendto_args /* { 936 int s; 937 caddr_t buf; 938 size_t len; 939 int flags; 940 caddr_t to; 941 int tolen; 942 } */ *uap; 943 { 944 struct msghdr msg; 945 struct iovec aiov; 946 947 msg.msg_name = uap->to; 948 msg.msg_namelen = uap->tolen; 949 msg.msg_iov = &aiov; 
950 msg.msg_iovlen = 1; 951 msg.msg_control = 0; 952 #ifdef COMPAT_OLDSOCK 953 msg.msg_flags = 0; 954 #endif 955 aiov.iov_base = uap->buf; 956 aiov.iov_len = uap->len; 957 return (sendit(td, uap->s, &msg, uap->flags)); 958 } 959 960 #ifdef COMPAT_OLDSOCK 961 int 962 osend(td, uap) 963 struct thread *td; 964 struct osend_args /* { 965 int s; 966 caddr_t buf; 967 int len; 968 int flags; 969 } */ *uap; 970 { 971 struct msghdr msg; 972 struct iovec aiov; 973 974 msg.msg_name = 0; 975 msg.msg_namelen = 0; 976 msg.msg_iov = &aiov; 977 msg.msg_iovlen = 1; 978 aiov.iov_base = uap->buf; 979 aiov.iov_len = uap->len; 980 msg.msg_control = 0; 981 msg.msg_flags = 0; 982 return (sendit(td, uap->s, &msg, uap->flags)); 983 } 984 985 int 986 osendmsg(td, uap) 987 struct thread *td; 988 struct osendmsg_args /* { 989 int s; 990 caddr_t msg; 991 int flags; 992 } */ *uap; 993 { 994 struct msghdr msg; 995 struct iovec *iov; 996 int error; 997 998 error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); 999 if (error != 0) 1000 return (error); 1001 error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); 1002 if (error != 0) 1003 return (error); 1004 msg.msg_iov = iov; 1005 msg.msg_flags = MSG_COMPAT; 1006 error = sendit(td, uap->s, &msg, uap->flags); 1007 free(iov, M_IOV); 1008 return (error); 1009 } 1010 #endif 1011 1012 int 1013 sys_sendmsg(td, uap) 1014 struct thread *td; 1015 struct sendmsg_args /* { 1016 int s; 1017 caddr_t msg; 1018 int flags; 1019 } */ *uap; 1020 { 1021 struct msghdr msg; 1022 struct iovec *iov; 1023 int error; 1024 1025 error = copyin(uap->msg, &msg, sizeof (msg)); 1026 if (error != 0) 1027 return (error); 1028 error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); 1029 if (error != 0) 1030 return (error); 1031 msg.msg_iov = iov; 1032 #ifdef COMPAT_OLDSOCK 1033 msg.msg_flags = 0; 1034 #endif 1035 error = sendit(td, uap->s, &msg, uap->flags); 1036 free(iov, M_IOV); 1037 return (error); 1038 } 1039 1040 int 1041 kern_recvit(td, s, mp, fromseg, 
controlp)
	struct thread *td;
	int s;
	struct msghdr *mp;
	enum uio_seg fromseg;
	struct mbuf **controlp;
{
	/*
	 * Receive a message on the socket behind descriptor s into the
	 * buffers described by mp.
	 *
	 * The data buffers in mp->msg_iov are always treated as user-space
	 * memory (uio_segflg is UIO_USERSPACE); fromseg only selects whether
	 * the source address is delivered to mp->msg_name with copyout() or
	 * with bcopy().
	 *
	 * Control data: when controlp is non-NULL the received control mbuf
	 * chain is handed to the caller through *controlp on success;
	 * otherwise, if mp->msg_control is set, it is copied out there and
	 * MSG_CTRUNC is raised if it does not fit.  In every other case the
	 * chain is freed here.
	 *
	 * On success td->td_retval[0] holds the number of data bytes
	 * received, and mp->msg_namelen / mp->msg_controllen are updated to
	 * the amounts actually delivered.  Returns 0 or an errno value.
	 */
	struct uio auio;
	struct iovec *iov;
	struct mbuf *m, *control = NULL;
	caddr_t ctlbuf;
	struct file *fp;
	struct socket *so;
	struct sockaddr *fromsa = NULL;
	cap_rights_t rights;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	ssize_t len;
	int error, i;

	if (controlp != NULL)
		*controlp = NULL;

	AUDIT_ARG_FD(s);
	error = getsock_cap(td, s, cap_rights_init(&rights, CAP_RECV),
	    &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;

#ifdef MAC
	error = mac_socket_check_receive(td->td_ucred, so);
	if (error != 0) {
		fdrop(fp, td);
		return (error);
	}
#endif

	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	/* Total the iovec lengths; a negative running sum means overflow. */
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			fdrop(fp, td);
			return (EINVAL);
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	len = auio.uio_resid;
	/* Control data is only requested when somebody will consume it. */
	error = soreceive(so, &fromsa, &auio, NULL,
	    (mp->msg_control || controlp) ? &control : NULL,
	    &mp->msg_flags);
	if (error != 0) {
		/* Some data arrived before the interruption: not an error. */
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	if (fromsa != NULL)
		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = len - auio.uio_resid;
		ktrgenio(s, UIO_READ, ktruio, error);
	}
#endif
	if (error != 0)
		goto out;
	td->td_retval[0] = len - auio.uio_resid;
	/* From here on `len' is reused for the address, then control, length. */
	if (mp->msg_name) {
		len = mp->msg_namelen;
		if (len <= 0 || fromsa == NULL)
			len = 0;
		else {
			/* save sa_len before it is destroyed by MSG_COMPAT */
			len = MIN(len, fromsa->sa_len);
#ifdef COMPAT_OLDSOCK
			if (mp->msg_flags & MSG_COMPAT)
				((struct osockaddr *)fromsa)->sa_family =
				    fromsa->sa_family;
#endif
			if (fromseg == UIO_USERSPACE) {
				error = copyout(fromsa, mp->msg_name,
				    (unsigned)len);
				if (error != 0)
					goto out;
			} else
				bcopy(fromsa, mp->msg_name, len);
		}
		mp->msg_namelen = len;
	}
	if (mp->msg_control && controlp == NULL) {
#ifdef COMPAT_OLDSOCK
		/*
		 * We assume that old recvmsg calls won't receive access
		 * rights and other control info, esp. as control info
		 * is always optional and those options didn't exist in 4.3.
		 * If we receive rights, trim the cmsghdr; anything else
		 * is tossed.
		 */
		if (control && mp->msg_flags & MSG_COMPAT) {
			if (mtod(control, struct cmsghdr *)->cmsg_level !=
			    SOL_SOCKET ||
			    mtod(control, struct cmsghdr *)->cmsg_type !=
			    SCM_RIGHTS) {
				mp->msg_controllen = 0;
				goto out;
			}
			control->m_len -= sizeof (struct cmsghdr);
			control->m_data += sizeof (struct cmsghdr);
		}
#endif
		len = mp->msg_controllen;
		m = control;
		mp->msg_controllen = 0;
		ctlbuf = mp->msg_control;

		/* Copy out mbuf by mbuf until the user buffer is exhausted. */
		while (m && len > 0) {
			unsigned int tocopy;

			if (len >= m->m_len)
				tocopy = m->m_len;
			else {
				/* Buffer too small: truncate and say so. */
				mp->msg_flags |= MSG_CTRUNC;
				tocopy = len;
			}

			if ((error = copyout(mtod(m, caddr_t),
			    ctlbuf, tocopy)) != 0)
				goto out;

			ctlbuf += tocopy;
			len -= tocopy;
			m = m->m_next;
		}
		/* Report how many control bytes were actually delivered. */
		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
	}
out:
	fdrop(fp, td);
#ifdef KTRACE
	if (fromsa && KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(fromsa);
#endif
	free(fromsa, M_SONAME);

	/* Hand the control chain to the caller on success, else free it. */
	if (error == 0 && controlp != NULL)
		*controlp = control;
	else  if (control)
		m_freem(control);

	return (error);
}

/*
 * Common user-space front end of the recv*() system calls: receive with
 * kern_recvit() and, when namelenp is non-NULL, copy the resulting address
 * length (a socklen_t) out to it.  For MSG_COMPAT callers a failure of
 * that copyout is ignored.
 */
static int
recvit(td, s, mp, namelenp)
	struct thread *td;
	int s;
	struct msghdr *mp;
	void *namelenp;
{
	int error;

	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
	if (error != 0)
		return (error);
	if (namelenp != NULL) {
		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
#ifdef COMPAT_OLDSOCK
		if (mp->msg_flags & MSG_COMPAT)
			error = 0;	/* old recvfrom didn't check */
#endif
	}
	return (error);
}

int
sys_recvfrom(td, uap)
	struct thread *td;
	struct recvfrom_args /* {
		int	s;
		caddr_t	buf;
		size_t	len;
		int	flags;
		struct sockaddr * __restrict	from;
		socklen_t * __restrict	fromlenaddr;
	} */ *uap;
{
1236 struct msghdr msg; 1237 struct iovec aiov; 1238 int error; 1239 1240 if (uap->fromlenaddr) { 1241 error = copyin(uap->fromlenaddr, 1242 &msg.msg_namelen, sizeof (msg.msg_namelen)); 1243 if (error != 0) 1244 goto done2; 1245 } else { 1246 msg.msg_namelen = 0; 1247 } 1248 msg.msg_name = uap->from; 1249 msg.msg_iov = &aiov; 1250 msg.msg_iovlen = 1; 1251 aiov.iov_base = uap->buf; 1252 aiov.iov_len = uap->len; 1253 msg.msg_control = 0; 1254 msg.msg_flags = uap->flags; 1255 error = recvit(td, uap->s, &msg, uap->fromlenaddr); 1256 done2: 1257 return (error); 1258 } 1259 1260 #ifdef COMPAT_OLDSOCK 1261 int 1262 orecvfrom(td, uap) 1263 struct thread *td; 1264 struct recvfrom_args *uap; 1265 { 1266 1267 uap->flags |= MSG_COMPAT; 1268 return (sys_recvfrom(td, uap)); 1269 } 1270 #endif 1271 1272 #ifdef COMPAT_OLDSOCK 1273 int 1274 orecv(td, uap) 1275 struct thread *td; 1276 struct orecv_args /* { 1277 int s; 1278 caddr_t buf; 1279 int len; 1280 int flags; 1281 } */ *uap; 1282 { 1283 struct msghdr msg; 1284 struct iovec aiov; 1285 1286 msg.msg_name = 0; 1287 msg.msg_namelen = 0; 1288 msg.msg_iov = &aiov; 1289 msg.msg_iovlen = 1; 1290 aiov.iov_base = uap->buf; 1291 aiov.iov_len = uap->len; 1292 msg.msg_control = 0; 1293 msg.msg_flags = uap->flags; 1294 return (recvit(td, uap->s, &msg, NULL)); 1295 } 1296 1297 /* 1298 * Old recvmsg. This code takes advantage of the fact that the old msghdr 1299 * overlays the new one, missing only the flags, and with the (old) access 1300 * rights where the control fields are now. 
1301 */ 1302 int 1303 orecvmsg(td, uap) 1304 struct thread *td; 1305 struct orecvmsg_args /* { 1306 int s; 1307 struct omsghdr *msg; 1308 int flags; 1309 } */ *uap; 1310 { 1311 struct msghdr msg; 1312 struct iovec *iov; 1313 int error; 1314 1315 error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); 1316 if (error != 0) 1317 return (error); 1318 error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); 1319 if (error != 0) 1320 return (error); 1321 msg.msg_flags = uap->flags | MSG_COMPAT; 1322 msg.msg_iov = iov; 1323 error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); 1324 if (msg.msg_controllen && error == 0) 1325 error = copyout(&msg.msg_controllen, 1326 &uap->msg->msg_accrightslen, sizeof (int)); 1327 free(iov, M_IOV); 1328 return (error); 1329 } 1330 #endif 1331 1332 int 1333 sys_recvmsg(td, uap) 1334 struct thread *td; 1335 struct recvmsg_args /* { 1336 int s; 1337 struct msghdr *msg; 1338 int flags; 1339 } */ *uap; 1340 { 1341 struct msghdr msg; 1342 struct iovec *uiov, *iov; 1343 int error; 1344 1345 error = copyin(uap->msg, &msg, sizeof (msg)); 1346 if (error != 0) 1347 return (error); 1348 error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); 1349 if (error != 0) 1350 return (error); 1351 msg.msg_flags = uap->flags; 1352 #ifdef COMPAT_OLDSOCK 1353 msg.msg_flags &= ~MSG_COMPAT; 1354 #endif 1355 uiov = msg.msg_iov; 1356 msg.msg_iov = iov; 1357 error = recvit(td, uap->s, &msg, NULL); 1358 if (error == 0) { 1359 msg.msg_iov = uiov; 1360 error = copyout(&msg, uap->msg, sizeof(msg)); 1361 } 1362 free(iov, M_IOV); 1363 return (error); 1364 } 1365 1366 /* ARGSUSED */ 1367 int 1368 sys_shutdown(td, uap) 1369 struct thread *td; 1370 struct shutdown_args /* { 1371 int s; 1372 int how; 1373 } */ *uap; 1374 { 1375 struct socket *so; 1376 struct file *fp; 1377 cap_rights_t rights; 1378 int error; 1379 1380 AUDIT_ARG_FD(uap->s); 1381 error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_SHUTDOWN), 1382 &fp, NULL); 1383 if (error == 0) { 
1384 so = fp->f_data; 1385 error = soshutdown(so, uap->how); 1386 /* 1387 * Previous versions did not return ENOTCONN, but 0 in 1388 * case the socket was not connected. Some important 1389 * programs like syslogd up to r279016, 2015-02-19, 1390 * still depend on this behavior. 1391 */ 1392 if (error == ENOTCONN && 1393 td->td_proc->p_osrel < P_OSREL_SHUTDOWN_ENOTCONN) 1394 error = 0; 1395 fdrop(fp, td); 1396 } 1397 return (error); 1398 } 1399 1400 /* ARGSUSED */ 1401 int 1402 sys_setsockopt(td, uap) 1403 struct thread *td; 1404 struct setsockopt_args /* { 1405 int s; 1406 int level; 1407 int name; 1408 caddr_t val; 1409 int valsize; 1410 } */ *uap; 1411 { 1412 1413 return (kern_setsockopt(td, uap->s, uap->level, uap->name, 1414 uap->val, UIO_USERSPACE, uap->valsize)); 1415 } 1416 1417 int 1418 kern_setsockopt(td, s, level, name, val, valseg, valsize) 1419 struct thread *td; 1420 int s; 1421 int level; 1422 int name; 1423 void *val; 1424 enum uio_seg valseg; 1425 socklen_t valsize; 1426 { 1427 struct socket *so; 1428 struct file *fp; 1429 struct sockopt sopt; 1430 cap_rights_t rights; 1431 int error; 1432 1433 if (val == NULL && valsize != 0) 1434 return (EFAULT); 1435 if ((int)valsize < 0) 1436 return (EINVAL); 1437 1438 sopt.sopt_dir = SOPT_SET; 1439 sopt.sopt_level = level; 1440 sopt.sopt_name = name; 1441 sopt.sopt_val = val; 1442 sopt.sopt_valsize = valsize; 1443 switch (valseg) { 1444 case UIO_USERSPACE: 1445 sopt.sopt_td = td; 1446 break; 1447 case UIO_SYSSPACE: 1448 sopt.sopt_td = NULL; 1449 break; 1450 default: 1451 panic("kern_setsockopt called with bad valseg"); 1452 } 1453 1454 AUDIT_ARG_FD(s); 1455 error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SETSOCKOPT), 1456 &fp, NULL); 1457 if (error == 0) { 1458 so = fp->f_data; 1459 error = sosetopt(so, &sopt); 1460 fdrop(fp, td); 1461 } 1462 return(error); 1463 } 1464 1465 /* ARGSUSED */ 1466 int 1467 sys_getsockopt(td, uap) 1468 struct thread *td; 1469 struct getsockopt_args /* { 1470 int s; 1471 int 
level; 1472 int name; 1473 void * __restrict val; 1474 socklen_t * __restrict avalsize; 1475 } */ *uap; 1476 { 1477 socklen_t valsize; 1478 int error; 1479 1480 if (uap->val) { 1481 error = copyin(uap->avalsize, &valsize, sizeof (valsize)); 1482 if (error != 0) 1483 return (error); 1484 } 1485 1486 error = kern_getsockopt(td, uap->s, uap->level, uap->name, 1487 uap->val, UIO_USERSPACE, &valsize); 1488 1489 if (error == 0) 1490 error = copyout(&valsize, uap->avalsize, sizeof (valsize)); 1491 return (error); 1492 } 1493 1494 /* 1495 * Kernel version of getsockopt. 1496 * optval can be a userland or userspace. optlen is always a kernel pointer. 1497 */ 1498 int 1499 kern_getsockopt(td, s, level, name, val, valseg, valsize) 1500 struct thread *td; 1501 int s; 1502 int level; 1503 int name; 1504 void *val; 1505 enum uio_seg valseg; 1506 socklen_t *valsize; 1507 { 1508 struct socket *so; 1509 struct file *fp; 1510 struct sockopt sopt; 1511 cap_rights_t rights; 1512 int error; 1513 1514 if (val == NULL) 1515 *valsize = 0; 1516 if ((int)*valsize < 0) 1517 return (EINVAL); 1518 1519 sopt.sopt_dir = SOPT_GET; 1520 sopt.sopt_level = level; 1521 sopt.sopt_name = name; 1522 sopt.sopt_val = val; 1523 sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */ 1524 switch (valseg) { 1525 case UIO_USERSPACE: 1526 sopt.sopt_td = td; 1527 break; 1528 case UIO_SYSSPACE: 1529 sopt.sopt_td = NULL; 1530 break; 1531 default: 1532 panic("kern_getsockopt called with bad valseg"); 1533 } 1534 1535 AUDIT_ARG_FD(s); 1536 error = getsock_cap(td, s, cap_rights_init(&rights, CAP_GETSOCKOPT), 1537 &fp, NULL); 1538 if (error == 0) { 1539 so = fp->f_data; 1540 error = sogetopt(so, &sopt); 1541 *valsize = sopt.sopt_valsize; 1542 fdrop(fp, td); 1543 } 1544 return (error); 1545 } 1546 1547 /* 1548 * getsockname1() - Get socket name. 
1549 */ 1550 /* ARGSUSED */ 1551 static int 1552 getsockname1(td, uap, compat) 1553 struct thread *td; 1554 struct getsockname_args /* { 1555 int fdes; 1556 struct sockaddr * __restrict asa; 1557 socklen_t * __restrict alen; 1558 } */ *uap; 1559 int compat; 1560 { 1561 struct sockaddr *sa; 1562 socklen_t len; 1563 int error; 1564 1565 error = copyin(uap->alen, &len, sizeof(len)); 1566 if (error != 0) 1567 return (error); 1568 1569 error = kern_getsockname(td, uap->fdes, &sa, &len); 1570 if (error != 0) 1571 return (error); 1572 1573 if (len != 0) { 1574 #ifdef COMPAT_OLDSOCK 1575 if (compat) 1576 ((struct osockaddr *)sa)->sa_family = sa->sa_family; 1577 #endif 1578 error = copyout(sa, uap->asa, (u_int)len); 1579 } 1580 free(sa, M_SONAME); 1581 if (error == 0) 1582 error = copyout(&len, uap->alen, sizeof(len)); 1583 return (error); 1584 } 1585 1586 int 1587 kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, 1588 socklen_t *alen) 1589 { 1590 struct socket *so; 1591 struct file *fp; 1592 cap_rights_t rights; 1593 socklen_t len; 1594 int error; 1595 1596 AUDIT_ARG_FD(fd); 1597 error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETSOCKNAME), 1598 &fp, NULL); 1599 if (error != 0) 1600 return (error); 1601 so = fp->f_data; 1602 *sa = NULL; 1603 CURVNET_SET(so->so_vnet); 1604 error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); 1605 CURVNET_RESTORE(); 1606 if (error != 0) 1607 goto bad; 1608 if (*sa == NULL) 1609 len = 0; 1610 else 1611 len = MIN(*alen, (*sa)->sa_len); 1612 *alen = len; 1613 #ifdef KTRACE 1614 if (KTRPOINT(td, KTR_STRUCT)) 1615 ktrsockaddr(*sa); 1616 #endif 1617 bad: 1618 fdrop(fp, td); 1619 if (error != 0 && *sa != NULL) { 1620 free(*sa, M_SONAME); 1621 *sa = NULL; 1622 } 1623 return (error); 1624 } 1625 1626 int 1627 sys_getsockname(td, uap) 1628 struct thread *td; 1629 struct getsockname_args *uap; 1630 { 1631 1632 return (getsockname1(td, uap, 0)); 1633 } 1634 1635 #ifdef COMPAT_OLDSOCK 1636 int 1637 ogetsockname(td, uap) 
1638 struct thread *td; 1639 struct getsockname_args *uap; 1640 { 1641 1642 return (getsockname1(td, uap, 1)); 1643 } 1644 #endif /* COMPAT_OLDSOCK */ 1645 1646 /* 1647 * getpeername1() - Get name of peer for connected socket. 1648 */ 1649 /* ARGSUSED */ 1650 static int 1651 getpeername1(td, uap, compat) 1652 struct thread *td; 1653 struct getpeername_args /* { 1654 int fdes; 1655 struct sockaddr * __restrict asa; 1656 socklen_t * __restrict alen; 1657 } */ *uap; 1658 int compat; 1659 { 1660 struct sockaddr *sa; 1661 socklen_t len; 1662 int error; 1663 1664 error = copyin(uap->alen, &len, sizeof (len)); 1665 if (error != 0) 1666 return (error); 1667 1668 error = kern_getpeername(td, uap->fdes, &sa, &len); 1669 if (error != 0) 1670 return (error); 1671 1672 if (len != 0) { 1673 #ifdef COMPAT_OLDSOCK 1674 if (compat) 1675 ((struct osockaddr *)sa)->sa_family = sa->sa_family; 1676 #endif 1677 error = copyout(sa, uap->asa, (u_int)len); 1678 } 1679 free(sa, M_SONAME); 1680 if (error == 0) 1681 error = copyout(&len, uap->alen, sizeof(len)); 1682 return (error); 1683 } 1684 1685 int 1686 kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, 1687 socklen_t *alen) 1688 { 1689 struct socket *so; 1690 struct file *fp; 1691 cap_rights_t rights; 1692 socklen_t len; 1693 int error; 1694 1695 AUDIT_ARG_FD(fd); 1696 error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETPEERNAME), 1697 &fp, NULL); 1698 if (error != 0) 1699 return (error); 1700 so = fp->f_data; 1701 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { 1702 error = ENOTCONN; 1703 goto done; 1704 } 1705 *sa = NULL; 1706 CURVNET_SET(so->so_vnet); 1707 error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); 1708 CURVNET_RESTORE(); 1709 if (error != 0) 1710 goto bad; 1711 if (*sa == NULL) 1712 len = 0; 1713 else 1714 len = MIN(*alen, (*sa)->sa_len); 1715 *alen = len; 1716 #ifdef KTRACE 1717 if (KTRPOINT(td, KTR_STRUCT)) 1718 ktrsockaddr(*sa); 1719 #endif 1720 bad: 1721 if (error != 0 && *sa 
!= NULL) { 1722 free(*sa, M_SONAME); 1723 *sa = NULL; 1724 } 1725 done: 1726 fdrop(fp, td); 1727 return (error); 1728 } 1729 1730 int 1731 sys_getpeername(td, uap) 1732 struct thread *td; 1733 struct getpeername_args *uap; 1734 { 1735 1736 return (getpeername1(td, uap, 0)); 1737 } 1738 1739 #ifdef COMPAT_OLDSOCK 1740 int 1741 ogetpeername(td, uap) 1742 struct thread *td; 1743 struct ogetpeername_args *uap; 1744 { 1745 1746 /* XXX uap should have type `getpeername_args *' to begin with. */ 1747 return (getpeername1(td, (struct getpeername_args *)uap, 1)); 1748 } 1749 #endif /* COMPAT_OLDSOCK */ 1750 1751 int 1752 sockargs(mp, buf, buflen, type) 1753 struct mbuf **mp; 1754 caddr_t buf; 1755 int buflen, type; 1756 { 1757 struct sockaddr *sa; 1758 struct mbuf *m; 1759 int error; 1760 1761 if (buflen > MLEN) { 1762 #ifdef COMPAT_OLDSOCK 1763 if (type == MT_SONAME && buflen <= 112) 1764 buflen = MLEN; /* unix domain compat. hack */ 1765 else 1766 #endif 1767 if (buflen > MCLBYTES) 1768 return (EINVAL); 1769 } 1770 m = m_get2(buflen, M_WAITOK, type, 0); 1771 m->m_len = buflen; 1772 error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); 1773 if (error != 0) 1774 (void) m_free(m); 1775 else { 1776 *mp = m; 1777 if (type == MT_SONAME) { 1778 sa = mtod(m, struct sockaddr *); 1779 1780 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN 1781 if (sa->sa_family == 0 && sa->sa_len < AF_MAX) 1782 sa->sa_family = sa->sa_len; 1783 #endif 1784 sa->sa_len = buflen; 1785 } 1786 } 1787 return (error); 1788 } 1789 1790 int 1791 getsockaddr(namp, uaddr, len) 1792 struct sockaddr **namp; 1793 caddr_t uaddr; 1794 size_t len; 1795 { 1796 struct sockaddr *sa; 1797 int error; 1798 1799 if (len > SOCK_MAXADDRLEN) 1800 return (ENAMETOOLONG); 1801 if (len < offsetof(struct sockaddr, sa_data[0])) 1802 return (EINVAL); 1803 sa = malloc(len, M_SONAME, M_WAITOK); 1804 error = copyin(uaddr, sa, len); 1805 if (error != 0) { 1806 free(sa, M_SONAME); 1807 } else { 1808 #if defined(COMPAT_OLDSOCK) && 
BYTE_ORDER != BIG_ENDIAN 1809 if (sa->sa_family == 0 && sa->sa_len < AF_MAX) 1810 sa->sa_family = sa->sa_len; 1811 #endif 1812 sa->sa_len = len; 1813 *namp = sa; 1814 } 1815 return (error); 1816 } 1817 1818 struct sendfile_sync { 1819 struct mtx mtx; 1820 struct cv cv; 1821 unsigned count; 1822 }; 1823 1824 /* 1825 * Add more references to a vm_page + sf_buf + sendfile_sync. 1826 */ 1827 void 1828 sf_ext_ref(void *arg1, void *arg2) 1829 { 1830 struct sf_buf *sf = arg1; 1831 struct sendfile_sync *sfs = arg2; 1832 vm_page_t pg = sf_buf_page(sf); 1833 1834 sf_buf_ref(sf); 1835 1836 vm_page_lock(pg); 1837 vm_page_wire(pg); 1838 vm_page_unlock(pg); 1839 1840 if (sfs != NULL) { 1841 mtx_lock(&sfs->mtx); 1842 KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0")); 1843 sfs->count++; 1844 mtx_unlock(&sfs->mtx); 1845 } 1846 } 1847 1848 /* 1849 * Detach mapped page and release resources back to the system. 1850 */ 1851 void 1852 sf_ext_free(void *arg1, void *arg2) 1853 { 1854 struct sf_buf *sf = arg1; 1855 struct sendfile_sync *sfs = arg2; 1856 vm_page_t pg = sf_buf_page(sf); 1857 1858 sf_buf_free(sf); 1859 1860 vm_page_lock(pg); 1861 vm_page_unwire(pg, PQ_INACTIVE); 1862 /* 1863 * Check for the object going away on us. This can 1864 * happen since we don't hold a reference to it. 1865 * If so, we're responsible for freeing the page. 1866 */ 1867 if (pg->wire_count == 0 && pg->object == NULL) 1868 vm_page_free(pg); 1869 vm_page_unlock(pg); 1870 1871 if (sfs != NULL) { 1872 mtx_lock(&sfs->mtx); 1873 KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0")); 1874 if (--sfs->count == 0) 1875 cv_signal(&sfs->cv); 1876 mtx_unlock(&sfs->mtx); 1877 } 1878 } 1879 1880 /* 1881 * sendfile(2) 1882 * 1883 * int sendfile(int fd, int s, off_t offset, size_t nbytes, 1884 * struct sf_hdtr *hdtr, off_t *sbytes, int flags) 1885 * 1886 * Send a file specified by 'fd' and starting at 'offset' to a socket 1887 * specified by 's'. 
Send only 'nbytes' of the file or until EOF if nbytes == 1888 * 0. Optionally add a header and/or trailer to the socket output. If 1889 * specified, write the total number of bytes sent into *sbytes. 1890 */ 1891 int 1892 sys_sendfile(struct thread *td, struct sendfile_args *uap) 1893 { 1894 1895 return (do_sendfile(td, uap, 0)); 1896 } 1897 1898 static int 1899 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat) 1900 { 1901 struct sf_hdtr hdtr; 1902 struct uio *hdr_uio, *trl_uio; 1903 struct file *fp; 1904 cap_rights_t rights; 1905 off_t sbytes; 1906 int error; 1907 1908 /* 1909 * File offset must be positive. If it goes beyond EOF 1910 * we send only the header/trailer and no payload data. 1911 */ 1912 if (uap->offset < 0) 1913 return (EINVAL); 1914 1915 hdr_uio = trl_uio = NULL; 1916 1917 if (uap->hdtr != NULL) { 1918 error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); 1919 if (error != 0) 1920 goto out; 1921 if (hdtr.headers != NULL) { 1922 error = copyinuio(hdtr.headers, hdtr.hdr_cnt, 1923 &hdr_uio); 1924 if (error != 0) 1925 goto out; 1926 } 1927 if (hdtr.trailers != NULL) { 1928 error = copyinuio(hdtr.trailers, hdtr.trl_cnt, 1929 &trl_uio); 1930 if (error != 0) 1931 goto out; 1932 } 1933 } 1934 1935 AUDIT_ARG_FD(uap->fd); 1936 1937 /* 1938 * sendfile(2) can start at any offset within a file so we require 1939 * CAP_READ+CAP_SEEK = CAP_PREAD. 1940 */ 1941 if ((error = fget_read(td, uap->fd, 1942 cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) { 1943 goto out; 1944 } 1945 1946 error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset, 1947 uap->nbytes, &sbytes, uap->flags, compat ? 
SFK_COMPAT : 0, td); 1948 fdrop(fp, td); 1949 1950 if (uap->sbytes != NULL) 1951 copyout(&sbytes, uap->sbytes, sizeof(off_t)); 1952 1953 out: 1954 free(hdr_uio, M_IOV); 1955 free(trl_uio, M_IOV); 1956 return (error); 1957 } 1958 1959 #ifdef COMPAT_FREEBSD4 1960 int 1961 freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap) 1962 { 1963 struct sendfile_args args; 1964 1965 args.fd = uap->fd; 1966 args.s = uap->s; 1967 args.offset = uap->offset; 1968 args.nbytes = uap->nbytes; 1969 args.hdtr = uap->hdtr; 1970 args.sbytes = uap->sbytes; 1971 args.flags = uap->flags; 1972 1973 return (do_sendfile(td, &args, 1)); 1974 } 1975 #endif /* COMPAT_FREEBSD4 */ 1976 1977 static int 1978 sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd, 1979 off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res) 1980 { 1981 vm_page_t m; 1982 vm_pindex_t pindex; 1983 ssize_t resid; 1984 int error, readahead, rv; 1985 1986 pindex = OFF_TO_IDX(off); 1987 VM_OBJECT_WLOCK(obj); 1988 m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY | 1989 VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL); 1990 1991 /* 1992 * Check if page is valid for what we need, otherwise initiate I/O. 1993 * 1994 * The non-zero nd argument prevents disk I/O, instead we 1995 * return the caller what he specified in nd. In particular, 1996 * if we already turned some pages into mbufs, nd == EAGAIN 1997 * and the main function send them the pages before we come 1998 * here again and block. 1999 */ 2000 if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) { 2001 if (vp == NULL) 2002 vm_page_xunbusy(m); 2003 VM_OBJECT_WUNLOCK(obj); 2004 *res = m; 2005 return (0); 2006 } else if (nd != 0) { 2007 if (vp == NULL) 2008 vm_page_xunbusy(m); 2009 error = nd; 2010 goto free_page; 2011 } 2012 2013 /* 2014 * Get the page from backing store. 
2015 */ 2016 error = 0; 2017 if (vp != NULL) { 2018 VM_OBJECT_WUNLOCK(obj); 2019 readahead = sfreadahead * MAXBSIZE; 2020 2021 /* 2022 * Use vn_rdwr() instead of the pager interface for 2023 * the vnode, to allow the read-ahead. 2024 * 2025 * XXXMAC: Because we don't have fp->f_cred here, we 2026 * pass in NOCRED. This is probably wrong, but is 2027 * consistent with our original implementation. 2028 */ 2029 error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off), 2030 UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead / 2031 bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td); 2032 SFSTAT_INC(sf_iocnt); 2033 VM_OBJECT_WLOCK(obj); 2034 } else { 2035 if (vm_pager_has_page(obj, pindex, NULL, NULL)) { 2036 rv = vm_pager_get_pages(obj, &m, 1, 0); 2037 SFSTAT_INC(sf_iocnt); 2038 if (rv != VM_PAGER_OK) { 2039 vm_page_lock(m); 2040 vm_page_free(m); 2041 vm_page_unlock(m); 2042 m = NULL; 2043 error = EIO; 2044 } 2045 } else { 2046 pmap_zero_page(m); 2047 m->valid = VM_PAGE_BITS_ALL; 2048 m->dirty = 0; 2049 } 2050 if (m != NULL) 2051 vm_page_xunbusy(m); 2052 } 2053 if (error == 0) { 2054 *res = m; 2055 } else if (m != NULL) { 2056 free_page: 2057 vm_page_lock(m); 2058 vm_page_unwire(m, PQ_INACTIVE); 2059 2060 /* 2061 * See if anyone else might know about this page. If 2062 * not and it is not valid, then free it. 
2063 */ 2064 if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m)) 2065 vm_page_free(m); 2066 vm_page_unlock(m); 2067 } 2068 KASSERT(error != 0 || (m->wire_count > 0 && 2069 vm_page_is_valid(m, off & PAGE_MASK, xfsize)), 2070 ("wrong page state m %p off %#jx xfsize %d", m, (uintmax_t)off, 2071 xfsize)); 2072 VM_OBJECT_WUNLOCK(obj); 2073 return (error); 2074 } 2075 2076 static int 2077 sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res, 2078 struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size, 2079 int *bsize) 2080 { 2081 struct vattr va; 2082 vm_object_t obj; 2083 struct vnode *vp; 2084 struct shmfd *shmfd; 2085 int error; 2086 2087 vp = *vp_res = NULL; 2088 obj = NULL; 2089 shmfd = *shmfd_res = NULL; 2090 *bsize = 0; 2091 2092 /* 2093 * The file descriptor must be a regular file and have a 2094 * backing VM object. 2095 */ 2096 if (fp->f_type == DTYPE_VNODE) { 2097 vp = fp->f_vnode; 2098 vn_lock(vp, LK_SHARED | LK_RETRY); 2099 if (vp->v_type != VREG) { 2100 error = EINVAL; 2101 goto out; 2102 } 2103 *bsize = vp->v_mount->mnt_stat.f_iosize; 2104 error = VOP_GETATTR(vp, &va, td->td_ucred); 2105 if (error != 0) 2106 goto out; 2107 *obj_size = va.va_size; 2108 obj = vp->v_object; 2109 if (obj == NULL) { 2110 error = EINVAL; 2111 goto out; 2112 } 2113 } else if (fp->f_type == DTYPE_SHM) { 2114 error = 0; 2115 shmfd = fp->f_data; 2116 obj = shmfd->shm_object; 2117 *obj_size = shmfd->shm_size; 2118 } else { 2119 error = EINVAL; 2120 goto out; 2121 } 2122 2123 VM_OBJECT_WLOCK(obj); 2124 if ((obj->flags & OBJ_DEAD) != 0) { 2125 VM_OBJECT_WUNLOCK(obj); 2126 error = EBADF; 2127 goto out; 2128 } 2129 2130 /* 2131 * Temporarily increase the backing VM object's reference 2132 * count so that a forced reclamation of its vnode does not 2133 * immediately destroy it. 
2134 */ 2135 vm_object_reference_locked(obj); 2136 VM_OBJECT_WUNLOCK(obj); 2137 *obj_res = obj; 2138 *vp_res = vp; 2139 *shmfd_res = shmfd; 2140 2141 out: 2142 if (vp != NULL) 2143 VOP_UNLOCK(vp, 0); 2144 return (error); 2145 } 2146 2147 static int 2148 kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp, 2149 struct socket **so) 2150 { 2151 cap_rights_t rights; 2152 int error; 2153 2154 *sock_fp = NULL; 2155 *so = NULL; 2156 2157 /* 2158 * The socket must be a stream socket and connected. 2159 */ 2160 error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SEND), 2161 sock_fp, NULL); 2162 if (error != 0) 2163 return (error); 2164 *so = (*sock_fp)->f_data; 2165 if ((*so)->so_type != SOCK_STREAM) 2166 return (EINVAL); 2167 if (((*so)->so_state & SS_ISCONNECTED) == 0) 2168 return (ENOTCONN); 2169 return (0); 2170 } 2171 2172 int 2173 vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, 2174 struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, 2175 int kflags, struct thread *td) 2176 { 2177 struct file *sock_fp; 2178 struct vnode *vp; 2179 struct vm_object *obj; 2180 struct socket *so; 2181 struct mbuf *m; 2182 struct sf_buf *sf; 2183 struct vm_page *pg; 2184 struct shmfd *shmfd; 2185 struct sendfile_sync *sfs; 2186 struct vattr va; 2187 off_t off, xfsize, fsbytes, sbytes, rem, obj_size; 2188 int error, bsize, nd, hdrlen, mnw; 2189 2190 pg = NULL; 2191 obj = NULL; 2192 so = NULL; 2193 m = NULL; 2194 sfs = NULL; 2195 fsbytes = sbytes = 0; 2196 hdrlen = mnw = 0; 2197 rem = nbytes; 2198 obj_size = 0; 2199 2200 error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize); 2201 if (error != 0) 2202 return (error); 2203 if (rem == 0) 2204 rem = obj_size; 2205 2206 error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so); 2207 if (error != 0) 2208 goto out; 2209 2210 /* 2211 * Do not wait on memory allocations but return ENOMEM for 2212 * caller to retry later. 2213 * XXX: Experimental. 
2214 */ 2215 if (flags & SF_MNOWAIT) 2216 mnw = 1; 2217 2218 if (flags & SF_SYNC) { 2219 sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO); 2220 mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF); 2221 cv_init(&sfs->cv, "sendfile"); 2222 } 2223 2224 #ifdef MAC 2225 error = mac_socket_check_send(td->td_ucred, so); 2226 if (error != 0) 2227 goto out; 2228 #endif 2229 2230 /* If headers are specified copy them into mbufs. */ 2231 if (hdr_uio != NULL) { 2232 hdr_uio->uio_td = td; 2233 hdr_uio->uio_rw = UIO_WRITE; 2234 if (hdr_uio->uio_resid > 0) { 2235 /* 2236 * In FBSD < 5.0 the nbytes to send also included 2237 * the header. If compat is specified subtract the 2238 * header size from nbytes. 2239 */ 2240 if (kflags & SFK_COMPAT) { 2241 if (nbytes > hdr_uio->uio_resid) 2242 nbytes -= hdr_uio->uio_resid; 2243 else 2244 nbytes = 0; 2245 } 2246 m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK), 2247 0, 0, 0); 2248 if (m == NULL) { 2249 error = mnw ? EAGAIN : ENOBUFS; 2250 goto out; 2251 } 2252 hdrlen = m_length(m, NULL); 2253 } 2254 } 2255 2256 /* 2257 * Protect against multiple writers to the socket. 2258 * 2259 * XXXRW: Historically this has assumed non-interruptibility, so now 2260 * we implement that, but possibly shouldn't. 2261 */ 2262 (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR); 2263 2264 /* 2265 * Loop through the pages of the file, starting with the requested 2266 * offset. Get a file page (do I/O if necessary), map the file page 2267 * into an sf_buf, attach an mbuf header to the sf_buf, and queue 2268 * it on the socket. 2269 * This is done in two loops. The inner loop turns as many pages 2270 * as it can, up to available socket buffer space, without blocking 2271 * into mbufs to have it bulk delivered into the socket send buffer. 2272 * The outer loop checks the state and available space of the socket 2273 * and takes care of the overall progress. 
2274 */ 2275 for (off = offset; ; ) { 2276 struct mbuf *mtail; 2277 int loopbytes; 2278 int space; 2279 int done; 2280 2281 if ((nbytes != 0 && nbytes == fsbytes) || 2282 (nbytes == 0 && obj_size == fsbytes)) 2283 break; 2284 2285 mtail = NULL; 2286 loopbytes = 0; 2287 space = 0; 2288 done = 0; 2289 2290 /* 2291 * Check the socket state for ongoing connection, 2292 * no errors and space in socket buffer. 2293 * If space is low allow for the remainder of the 2294 * file to be processed if it fits the socket buffer. 2295 * Otherwise block in waiting for sufficient space 2296 * to proceed, or if the socket is nonblocking, return 2297 * to userland with EAGAIN while reporting how far 2298 * we've come. 2299 * We wait until the socket buffer has significant free 2300 * space to do bulk sends. This makes good use of file 2301 * system read ahead and allows packet segmentation 2302 * offloading hardware to take over lots of work. If 2303 * we were not careful here we would send off only one 2304 * sfbuf at a time. 2305 */ 2306 SOCKBUF_LOCK(&so->so_snd); 2307 if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2) 2308 so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; 2309 retry_space: 2310 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2311 error = EPIPE; 2312 SOCKBUF_UNLOCK(&so->so_snd); 2313 goto done; 2314 } else if (so->so_error) { 2315 error = so->so_error; 2316 so->so_error = 0; 2317 SOCKBUF_UNLOCK(&so->so_snd); 2318 goto done; 2319 } 2320 space = sbspace(&so->so_snd); 2321 if (space < rem && 2322 (space <= 0 || 2323 space < so->so_snd.sb_lowat)) { 2324 if (so->so_state & SS_NBIO) { 2325 SOCKBUF_UNLOCK(&so->so_snd); 2326 error = EAGAIN; 2327 goto done; 2328 } 2329 /* 2330 * sbwait drops the lock while sleeping. 2331 * When we loop back to retry_space the 2332 * state may have changed and we retest 2333 * for it. 2334 */ 2335 error = sbwait(&so->so_snd); 2336 /* 2337 * An error from sbwait usually indicates that we've 2338 * been interrupted by a signal. 
If we've sent anything 2339 * then return bytes sent, otherwise return the error. 2340 */ 2341 if (error != 0) { 2342 SOCKBUF_UNLOCK(&so->so_snd); 2343 goto done; 2344 } 2345 goto retry_space; 2346 } 2347 SOCKBUF_UNLOCK(&so->so_snd); 2348 2349 /* 2350 * Reduce space in the socket buffer by the size of 2351 * the header mbuf chain. 2352 * hdrlen is set to 0 after the first loop. 2353 */ 2354 space -= hdrlen; 2355 2356 if (vp != NULL) { 2357 error = vn_lock(vp, LK_SHARED); 2358 if (error != 0) 2359 goto done; 2360 error = VOP_GETATTR(vp, &va, td->td_ucred); 2361 if (error != 0 || off >= va.va_size) { 2362 VOP_UNLOCK(vp, 0); 2363 goto done; 2364 } 2365 obj_size = va.va_size; 2366 } 2367 2368 /* 2369 * Loop and construct maximum sized mbuf chain to be bulk 2370 * dumped into socket buffer. 2371 */ 2372 while (space > loopbytes) { 2373 vm_offset_t pgoff; 2374 struct mbuf *m0; 2375 2376 /* 2377 * Calculate the amount to transfer. 2378 * Not to exceed a page, the EOF, 2379 * or the passed in nbytes. 2380 */ 2381 pgoff = (vm_offset_t)(off & PAGE_MASK); 2382 rem = obj_size - offset; 2383 if (nbytes != 0) 2384 rem = omin(rem, nbytes); 2385 rem -= fsbytes + loopbytes; 2386 xfsize = omin(PAGE_SIZE - pgoff, rem); 2387 xfsize = omin(space - loopbytes, xfsize); 2388 if (xfsize <= 0) { 2389 done = 1; /* all data sent */ 2390 break; 2391 } 2392 2393 /* 2394 * Attempt to look up the page. Allocate 2395 * if not found or wait and loop if busy. 2396 */ 2397 if (m != NULL) 2398 nd = EAGAIN; /* send what we already got */ 2399 else if ((flags & SF_NODISKIO) != 0) 2400 nd = EBUSY; 2401 else 2402 nd = 0; 2403 error = sendfile_readpage(obj, vp, nd, off, 2404 xfsize, bsize, td, &pg); 2405 if (error != 0) { 2406 if (error == EAGAIN) 2407 error = 0; /* not a real error */ 2408 break; 2409 } 2410 2411 /* 2412 * Get a sendfile buf. When allocating the 2413 * first buffer for mbuf chain, we usually 2414 * wait as long as necessary, but this wait 2415 * can be interrupted. 
For consequent 2416 * buffers, do not sleep, since several 2417 * threads might exhaust the buffers and then 2418 * deadlock. 2419 */ 2420 sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT : 2421 SFB_CATCH); 2422 if (sf == NULL) { 2423 SFSTAT_INC(sf_allocfail); 2424 vm_page_lock(pg); 2425 vm_page_unwire(pg, PQ_INACTIVE); 2426 KASSERT(pg->object != NULL, 2427 ("%s: object disappeared", __func__)); 2428 vm_page_unlock(pg); 2429 if (m == NULL) 2430 error = (mnw ? EAGAIN : EINTR); 2431 break; 2432 } 2433 2434 /* 2435 * Get an mbuf and set it up as having 2436 * external storage. 2437 */ 2438 m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA); 2439 if (m0 == NULL) { 2440 error = (mnw ? EAGAIN : ENOBUFS); 2441 sf_ext_free(sf, NULL); 2442 break; 2443 } 2444 /* 2445 * Attach EXT_SFBUF external storage. 2446 */ 2447 m0->m_ext.ext_buf = (caddr_t )sf_buf_kva(sf); 2448 m0->m_ext.ext_size = PAGE_SIZE; 2449 m0->m_ext.ext_arg1 = sf; 2450 m0->m_ext.ext_arg2 = sfs; 2451 m0->m_ext.ext_type = EXT_SFBUF; 2452 m0->m_ext.ext_flags = 0; 2453 m0->m_flags |= (M_EXT|M_RDONLY); 2454 m0->m_data = (char *)sf_buf_kva(sf) + pgoff; 2455 m0->m_len = xfsize; 2456 2457 /* Append to mbuf chain. */ 2458 if (mtail != NULL) 2459 mtail->m_next = m0; 2460 else if (m != NULL) 2461 m_last(m)->m_next = m0; 2462 else 2463 m = m0; 2464 mtail = m0; 2465 2466 /* Keep track of bits processed. */ 2467 loopbytes += xfsize; 2468 off += xfsize; 2469 2470 if (sfs != NULL) { 2471 mtx_lock(&sfs->mtx); 2472 sfs->count++; 2473 mtx_unlock(&sfs->mtx); 2474 } 2475 } 2476 2477 if (vp != NULL) 2478 VOP_UNLOCK(vp, 0); 2479 2480 /* Add the buffer chain to the socket buffer. */ 2481 if (m != NULL) { 2482 int mlen, err; 2483 2484 mlen = m_length(m, NULL); 2485 SOCKBUF_LOCK(&so->so_snd); 2486 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2487 error = EPIPE; 2488 SOCKBUF_UNLOCK(&so->so_snd); 2489 goto done; 2490 } 2491 SOCKBUF_UNLOCK(&so->so_snd); 2492 CURVNET_SET(so->so_vnet); 2493 /* Avoid error aliasing. 
*/ 2494 err = (*so->so_proto->pr_usrreqs->pru_send) 2495 (so, 0, m, NULL, NULL, td); 2496 CURVNET_RESTORE(); 2497 if (err == 0) { 2498 /* 2499 * We need two counters to get the 2500 * file offset and nbytes to send 2501 * right: 2502 * - sbytes contains the total amount 2503 * of bytes sent, including headers. 2504 * - fsbytes contains the total amount 2505 * of bytes sent from the file. 2506 */ 2507 sbytes += mlen; 2508 fsbytes += mlen; 2509 if (hdrlen) { 2510 fsbytes -= hdrlen; 2511 hdrlen = 0; 2512 } 2513 } else if (error == 0) 2514 error = err; 2515 m = NULL; /* pru_send always consumes */ 2516 } 2517 2518 /* Quit outer loop on error or when we're done. */ 2519 if (done) 2520 break; 2521 if (error != 0) 2522 goto done; 2523 } 2524 2525 /* 2526 * Send trailers. Wimp out and use writev(2). 2527 */ 2528 if (trl_uio != NULL) { 2529 sbunlock(&so->so_snd); 2530 error = kern_writev(td, sockfd, trl_uio); 2531 if (error == 0) 2532 sbytes += td->td_retval[0]; 2533 goto out; 2534 } 2535 2536 done: 2537 sbunlock(&so->so_snd); 2538 out: 2539 /* 2540 * If there was no error we have to clear td->td_retval[0] 2541 * because it may have been set by writev. 2542 */ 2543 if (error == 0) { 2544 td->td_retval[0] = 0; 2545 } 2546 if (sent != NULL) { 2547 (*sent) = sbytes; 2548 } 2549 if (obj != NULL) 2550 vm_object_deallocate(obj); 2551 if (so) 2552 fdrop(sock_fp, td); 2553 if (m) 2554 m_freem(m); 2555 2556 if (sfs != NULL) { 2557 mtx_lock(&sfs->mtx); 2558 if (sfs->count != 0) 2559 cv_wait(&sfs->cv, &sfs->mtx); 2560 KASSERT(sfs->count == 0, ("sendfile sync still busy")); 2561 cv_destroy(&sfs->cv); 2562 mtx_destroy(&sfs->mtx); 2563 free(sfs, M_TEMP); 2564 } 2565 2566 if (error == ERESTART) 2567 error = EINTR; 2568 2569 return (error); 2570 } 2571