1 /*- 2 * Copyright (c) 1982, 1986, 1989, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * sendfile(2) and related extensions: 6 * Copyright (c) 1998, David Greenman. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 4. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 33 */ 34 35 #include <sys/cdefs.h> 36 __FBSDID("$FreeBSD$"); 37 38 #include "opt_capsicum.h" 39 #include "opt_inet.h" 40 #include "opt_inet6.h" 41 #include "opt_compat.h" 42 #include "opt_ktrace.h" 43 44 #include <sys/param.h> 45 #include <sys/systm.h> 46 #include <sys/capsicum.h> 47 #include <sys/condvar.h> 48 #include <sys/kernel.h> 49 #include <sys/lock.h> 50 #include <sys/mutex.h> 51 #include <sys/sysproto.h> 52 #include <sys/malloc.h> 53 #include <sys/filedesc.h> 54 #include <sys/event.h> 55 #include <sys/proc.h> 56 #include <sys/fcntl.h> 57 #include <sys/file.h> 58 #include <sys/filio.h> 59 #include <sys/jail.h> 60 #include <sys/mman.h> 61 #include <sys/mount.h> 62 #include <sys/mbuf.h> 63 #include <sys/protosw.h> 64 #include <sys/rwlock.h> 65 #include <sys/sf_buf.h> 66 #include <sys/sysent.h> 67 #include <sys/socket.h> 68 #include <sys/socketvar.h> 69 #include <sys/signalvar.h> 70 #include <sys/syscallsubr.h> 71 #include <sys/sysctl.h> 72 #include <sys/uio.h> 73 #include <sys/vnode.h> 74 #ifdef KTRACE 75 #include <sys/ktrace.h> 76 #endif 77 #ifdef COMPAT_FREEBSD32 78 #include <compat/freebsd32/freebsd32_util.h> 79 #endif 80 81 #include <net/vnet.h> 82 83 #include <security/audit/audit.h> 84 #include <security/mac/mac_framework.h> 85 86 #include <vm/vm.h> 87 #include <vm/vm_param.h> 88 #include <vm/vm_object.h> 89 #include <vm/vm_page.h> 90 #include <vm/vm_pager.h> 91 #include <vm/vm_kern.h> 92 #include <vm/vm_extern.h> 93 #include <vm/uma.h> 94 95 /* 96 * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC 97 * and SOCK_NONBLOCK. 98 */ 99 #define ACCEPT4_INHERIT 0x1 100 #define ACCEPT4_COMPAT 0x2 101 102 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); 103 static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); 104 105 static int accept1(struct thread *td, int s, struct sockaddr *uname, 106 socklen_t *anamelen, int flags); 107 static int do_sendfile(struct thread *td, struct sendfile_args *uap, 108 int compat); 109 static int getsockname1(struct thread *td, struct getsockname_args *uap, 110 int compat); 111 static int getpeername1(struct thread *td, struct getpeername_args *uap, 112 int compat); 113 114 counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)]; 115 116 /* 117 * sendfile(2)-related variables and associated sysctls 118 */ 119 static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0, 120 "sendfile(2) tunables"); 121 static int sfreadahead = 1; 122 SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW, 123 &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks"); 124 125 static void 126 sfstat_init(const void *unused) 127 { 128 129 COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t), 130 M_WAITOK); 131 } 132 SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL); 133 134 static int 135 sfstat_sysctl(SYSCTL_HANDLER_ARGS) 136 { 137 struct sfstat s; 138 139 COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t)); 140 if (req->newptr) 141 COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t)); 142 return (SYSCTL_OUT(req, &s, sizeof(s))); 143 } 144 SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW, 145 NULL, 0, sfstat_sysctl, "I", "sendfile statistics"); 146 147 /* 148 * Convert a user file descriptor to a kernel file entry and check if required 149 * capability rights are present. 150 * A reference on the file entry is held upon returning. 151 */ 152 int 153 getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, 154 struct file **fpp, u_int *fflagp) 155 { 156 struct file *fp; 157 int error; 158 159 error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp, NULL); 160 if (error != 0) 161 return (error); 162 if (fp->f_type != DTYPE_SOCKET) { 163 fdrop(fp, td); 164 return (ENOTSOCK); 165 } 166 if (fflagp != NULL) 167 *fflagp = fp->f_flag; 168 *fpp = fp; 169 return (0); 170 } 171 172 /* 173 * System call interface to the socket abstraction. 174 */ 175 #if defined(COMPAT_43) 176 #define COMPAT_OLDSOCK 177 #endif 178 179 int 180 sys_socket(td, uap) 181 struct thread *td; 182 struct socket_args /* { 183 int domain; 184 int type; 185 int protocol; 186 } */ *uap; 187 { 188 struct socket *so; 189 struct file *fp; 190 int fd, error, type, oflag, fflag; 191 192 AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol); 193 194 type = uap->type; 195 oflag = 0; 196 fflag = 0; 197 if ((type & SOCK_CLOEXEC) != 0) { 198 type &= ~SOCK_CLOEXEC; 199 oflag |= O_CLOEXEC; 200 } 201 if ((type & SOCK_NONBLOCK) != 0) { 202 type &= ~SOCK_NONBLOCK; 203 fflag |= FNONBLOCK; 204 } 205 206 #ifdef MAC 207 error = mac_socket_check_create(td->td_ucred, uap->domain, type, 208 uap->protocol); 209 if (error != 0) 210 return (error); 211 #endif 212 error = falloc(td, &fp, &fd, oflag); 213 if (error != 0) 214 return (error); 215 /* An extra reference on `fp' has been held for us by falloc(). */ 216 error = socreate(uap->domain, &so, type, uap->protocol, 217 td->td_ucred, td); 218 if (error != 0) { 219 fdclose(td, fp, fd); 220 } else { 221 finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops); 222 if ((fflag & FNONBLOCK) != 0) 223 (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td); 224 td->td_retval[0] = fd; 225 } 226 fdrop(fp, td); 227 return (error); 228 } 229 230 /* ARGSUSED */ 231 int 232 sys_bind(td, uap) 233 struct thread *td; 234 struct bind_args /* { 235 int s; 236 caddr_t name; 237 int namelen; 238 } */ *uap; 239 { 240 struct sockaddr *sa; 241 int error; 242 243 error = getsockaddr(&sa, uap->name, uap->namelen); 244 if (error == 0) { 245 error = kern_bindat(td, AT_FDCWD, uap->s, sa); 246 free(sa, M_SONAME); 247 } 248 return (error); 249 } 250 251 int 252 kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) 253 { 254 struct socket *so; 255 struct file *fp; 256 cap_rights_t rights; 257 int error; 258 259 AUDIT_ARG_FD(fd); 260 AUDIT_ARG_SOCKADDR(td, dirfd, sa); 261 error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_BIND), 262 &fp, NULL); 263 if (error != 0) 264 return (error); 265 so = fp->f_data; 266 #ifdef KTRACE 267 if (KTRPOINT(td, KTR_STRUCT)) 268 ktrsockaddr(sa); 269 #endif 270 #ifdef MAC 271 error = mac_socket_check_bind(td->td_ucred, so, sa); 272 if (error == 0) { 273 #endif 274 if (dirfd == AT_FDCWD) 275 error = sobind(so, sa, td); 276 else 277 error = sobindat(dirfd, so, sa, td); 278 #ifdef MAC 279 } 280 #endif 281 fdrop(fp, td); 282 return (error); 283 } 284 285 /* ARGSUSED */ 286 int 287 sys_bindat(td, uap) 288 struct thread *td; 289 struct bindat_args /* { 290 int fd; 291 int s; 292 caddr_t name; 293 int namelen; 294 } */ *uap; 295 { 296 struct sockaddr *sa; 297 int error; 298 299 error = getsockaddr(&sa, uap->name, uap->namelen); 300 if (error == 0) { 301 error = kern_bindat(td, uap->fd, uap->s, sa); 302 free(sa, M_SONAME); 303 } 304 return (error); 305 } 306 307 /* ARGSUSED */ 308 int 309 sys_listen(td, uap) 310 struct thread *td; 311 struct listen_args /* { 312 int s; 313 int backlog; 314 } */ *uap; 315 { 316 struct socket *so; 317 struct file *fp; 318 cap_rights_t rights; 319 int error; 320 321 AUDIT_ARG_FD(uap->s); 322 error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_LISTEN), 323 &fp, NULL); 324 if (error == 0) { 325 so = fp->f_data; 326 #ifdef MAC 327 error = mac_socket_check_listen(td->td_ucred, so); 328 if (error == 0) 329 #endif 330 error = solisten(so, uap->backlog, td); 331 fdrop(fp, td); 332 } 333 return(error); 334 } 335 336 /* 337 * accept1() 338 */ 339 static int 340 accept1(td, s, uname, anamelen, flags) 341 struct thread *td; 342 int s; 343 struct sockaddr *uname; 344 socklen_t *anamelen; 345 int flags; 346 { 347 struct sockaddr *name; 348 socklen_t namelen; 349 struct file *fp; 350 int error; 351 352 if (uname == NULL) 353 return (kern_accept4(td, s, NULL, NULL, flags, NULL)); 354 355 error = copyin(anamelen, &namelen, sizeof (namelen)); 356 if (error != 0) 357 return (error); 358 359 error = kern_accept4(td, s, &name, &namelen, flags, &fp); 360 361 if (error != 0) 362 return (error); 363 364 if (error == 0 && uname != NULL) { 365 #ifdef COMPAT_OLDSOCK 366 if (flags & ACCEPT4_COMPAT) 367 ((struct osockaddr *)name)->sa_family = 368 name->sa_family; 369 #endif 370 error = copyout(name, uname, namelen); 371 } 372 if (error == 0) 373 error = copyout(&namelen, anamelen, 374 sizeof(namelen)); 375 if (error != 0) 376 fdclose(td, fp, td->td_retval[0]); 377 fdrop(fp, td); 378 free(name, M_SONAME); 379 return (error); 380 } 381 382 int 383 kern_accept(struct thread *td, int s, struct sockaddr **name, 384 socklen_t *namelen, struct file **fp) 385 { 386 return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp)); 387 } 388 389 int 390 kern_accept4(struct thread *td, int s, struct sockaddr **name, 391 socklen_t *namelen, int flags, struct file **fp) 392 { 393 struct file *headfp, *nfp = NULL; 394 struct sockaddr *sa = NULL; 395 struct socket *head, *so; 396 cap_rights_t rights; 397 u_int fflag; 398 pid_t pgid; 399 int error, fd, tmp; 400 401 if (name != NULL) 402 *name = NULL; 403 404 AUDIT_ARG_FD(s); 405 error = getsock_cap(td, s, cap_rights_init(&rights, CAP_ACCEPT), 406 &headfp, &fflag); 407 if (error != 0) 408 return (error); 409 head = headfp->f_data; 410 if ((head->so_options & SO_ACCEPTCONN) == 0) { 411 error = EINVAL; 412 goto done; 413 } 414 #ifdef MAC 415 error = mac_socket_check_accept(td->td_ucred, head); 416 if (error != 0) 417 goto done; 418 #endif 419 error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0); 420 if (error != 0) 421 goto done; 422 ACCEPT_LOCK(); 423 if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { 424 ACCEPT_UNLOCK(); 425 error = EWOULDBLOCK; 426 goto noconnection; 427 } 428 while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { 429 if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { 430 head->so_error = ECONNABORTED; 431 break; 432 } 433 error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, 434 "accept", 0); 435 if (error != 0) { 436 ACCEPT_UNLOCK(); 437 goto noconnection; 438 } 439 } 440 if (head->so_error) { 441 error = head->so_error; 442 head->so_error = 0; 443 ACCEPT_UNLOCK(); 444 goto noconnection; 445 } 446 so = TAILQ_FIRST(&head->so_comp); 447 KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); 448 KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); 449 450 /* 451 * Before changing the flags on the socket, we have to bump the 452 * reference count. Otherwise, if the protocol calls sofree(), 453 * the socket will be released due to a zero refcount. 454 */ 455 SOCK_LOCK(so); /* soref() and so_state update */ 456 soref(so); /* file descriptor reference */ 457 458 TAILQ_REMOVE(&head->so_comp, so, so_list); 459 head->so_qlen--; 460 if (flags & ACCEPT4_INHERIT) 461 so->so_state |= (head->so_state & SS_NBIO); 462 else 463 so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; 464 so->so_qstate &= ~SQ_COMP; 465 so->so_head = NULL; 466 467 SOCK_UNLOCK(so); 468 ACCEPT_UNLOCK(); 469 470 /* An extra reference on `nfp' has been held for us by falloc(). */ 471 td->td_retval[0] = fd; 472 473 /* connection has been removed from the listen queue */ 474 KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); 475 476 if (flags & ACCEPT4_INHERIT) { 477 pgid = fgetown(&head->so_sigio); 478 if (pgid != 0) 479 fsetown(pgid, &so->so_sigio); 480 } else { 481 fflag &= ~(FNONBLOCK | FASYNC); 482 if (flags & SOCK_NONBLOCK) 483 fflag |= FNONBLOCK; 484 } 485 486 finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); 487 /* Sync socket nonblocking/async state with file flags */ 488 tmp = fflag & FNONBLOCK; 489 (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); 490 tmp = fflag & FASYNC; 491 (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); 492 sa = 0; 493 error = soaccept(so, &sa); 494 if (error != 0) 495 goto noconnection; 496 if (sa == NULL) { 497 if (name) 498 *namelen = 0; 499 goto done; 500 } 501 AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa); 502 if (name) { 503 /* check sa_len before it is destroyed */ 504 if (*namelen > sa->sa_len) 505 *namelen = sa->sa_len; 506 #ifdef KTRACE 507 if (KTRPOINT(td, KTR_STRUCT)) 508 ktrsockaddr(sa); 509 #endif 510 *name = sa; 511 sa = NULL; 512 } 513 noconnection: 514 free(sa, M_SONAME); 515 516 /* 517 * close the new descriptor, assuming someone hasn't ripped it 518 * out from under us. 519 */ 520 if (error != 0) 521 fdclose(td, nfp, fd); 522 523 /* 524 * Release explicitly held references before returning. We return 525 * a reference on nfp to the caller on success if they request it. 526 */ 527 done: 528 if (fp != NULL) { 529 if (error == 0) { 530 *fp = nfp; 531 nfp = NULL; 532 } else 533 *fp = NULL; 534 } 535 if (nfp != NULL) 536 fdrop(nfp, td); 537 fdrop(headfp, td); 538 return (error); 539 } 540 541 int 542 sys_accept(td, uap) 543 struct thread *td; 544 struct accept_args *uap; 545 { 546 547 return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT)); 548 } 549 550 int 551 sys_accept4(td, uap) 552 struct thread *td; 553 struct accept4_args *uap; 554 { 555 556 if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) 557 return (EINVAL); 558 559 return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags)); 560 } 561 562 #ifdef COMPAT_OLDSOCK 563 int 564 oaccept(td, uap) 565 struct thread *td; 566 struct accept_args *uap; 567 { 568 569 return (accept1(td, uap->s, uap->name, uap->anamelen, 570 ACCEPT4_INHERIT | ACCEPT4_COMPAT)); 571 } 572 #endif /* COMPAT_OLDSOCK */ 573 574 /* ARGSUSED */ 575 int 576 sys_connect(td, uap) 577 struct thread *td; 578 struct connect_args /* { 579 int s; 580 caddr_t name; 581 int namelen; 582 } */ *uap; 583 { 584 struct sockaddr *sa; 585 int error; 586 587 error = getsockaddr(&sa, uap->name, uap->namelen); 588 if (error == 0) { 589 error = kern_connectat(td, AT_FDCWD, uap->s, sa); 590 free(sa, M_SONAME); 591 } 592 return (error); 593 } 594 595 int 596 kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) 597 { 598 struct socket *so; 599 struct file *fp; 600 cap_rights_t rights; 601 int error, interrupted = 0; 602 603 AUDIT_ARG_FD(fd); 604 AUDIT_ARG_SOCKADDR(td, dirfd, sa); 605 error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_CONNECT), 606 &fp, NULL); 607 if (error != 0) 608 return (error); 609 so = fp->f_data; 610 if (so->so_state & SS_ISCONNECTING) { 611 error = EALREADY; 612 goto done1; 613 } 614 #ifdef KTRACE 615 if (KTRPOINT(td, KTR_STRUCT)) 616 ktrsockaddr(sa); 617 #endif 618 #ifdef MAC 619 error = mac_socket_check_connect(td->td_ucred, so, sa); 620 if (error != 0) 621 goto bad; 622 #endif 623 if (dirfd == AT_FDCWD) 624 error = soconnect(so, sa, td); 625 else 626 error = soconnectat(dirfd, so, sa, td); 627 if (error != 0) 628 goto bad; 629 if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { 630 error = EINPROGRESS; 631 goto done1; 632 } 633 SOCK_LOCK(so); 634 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { 635 error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, 636 "connec", 0); 637 if (error != 0) { 638 if (error == EINTR || error == ERESTART) 639 interrupted = 1; 640 break; 641 } 642 } 643 if (error == 0) { 644 error = so->so_error; 645 so->so_error = 0; 646 } 647 SOCK_UNLOCK(so); 648 bad: 649 if (!interrupted) 650 so->so_state &= ~SS_ISCONNECTING; 651 if (error == ERESTART) 652 error = EINTR; 653 done1: 654 fdrop(fp, td); 655 return (error); 656 } 657 658 /* ARGSUSED */ 659 int 660 sys_connectat(td, uap) 661 struct thread *td; 662 struct connectat_args /* { 663 int fd; 664 int s; 665 caddr_t name; 666 int namelen; 667 } */ *uap; 668 { 669 struct sockaddr *sa; 670 int error; 671 672 error = getsockaddr(&sa, uap->name, uap->namelen); 673 if (error == 0) { 674 error = kern_connectat(td, uap->fd, uap->s, sa); 675 free(sa, M_SONAME); 676 } 677 return (error); 678 } 679 680 int 681 kern_socketpair(struct thread *td, int domain, int type, int protocol, 682 int *rsv) 683 { 684 struct file *fp1, *fp2; 685 struct socket *so1, *so2; 686 int fd, error, oflag, fflag; 687 688 AUDIT_ARG_SOCKET(domain, type, protocol); 689 690 oflag = 0; 691 fflag = 0; 692 if ((type & SOCK_CLOEXEC) != 0) { 693 type &= ~SOCK_CLOEXEC; 694 oflag |= O_CLOEXEC; 695 } 696 if ((type & SOCK_NONBLOCK) != 0) { 697 type &= ~SOCK_NONBLOCK; 698 fflag |= FNONBLOCK; 699 } 700 #ifdef MAC 701 /* We might want to have a separate check for socket pairs. */ 702 error = mac_socket_check_create(td->td_ucred, domain, type, 703 protocol); 704 if (error != 0) 705 return (error); 706 #endif 707 error = socreate(domain, &so1, type, protocol, td->td_ucred, td); 708 if (error != 0) 709 return (error); 710 error = socreate(domain, &so2, type, protocol, td->td_ucred, td); 711 if (error != 0) 712 goto free1; 713 /* On success extra reference to `fp1' and 'fp2' is set by falloc. */ 714 error = falloc(td, &fp1, &fd, oflag); 715 if (error != 0) 716 goto free2; 717 rsv[0] = fd; 718 fp1->f_data = so1; /* so1 already has ref count */ 719 error = falloc(td, &fp2, &fd, oflag); 720 if (error != 0) 721 goto free3; 722 fp2->f_data = so2; /* so2 already has ref count */ 723 rsv[1] = fd; 724 error = soconnect2(so1, so2); 725 if (error != 0) 726 goto free4; 727 if (type == SOCK_DGRAM) { 728 /* 729 * Datagram socket connection is asymmetric. 730 */ 731 error = soconnect2(so2, so1); 732 if (error != 0) 733 goto free4; 734 } 735 finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data, 736 &socketops); 737 finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data, 738 &socketops); 739 if ((fflag & FNONBLOCK) != 0) { 740 (void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td); 741 (void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td); 742 } 743 fdrop(fp1, td); 744 fdrop(fp2, td); 745 return (0); 746 free4: 747 fdclose(td, fp2, rsv[1]); 748 fdrop(fp2, td); 749 free3: 750 fdclose(td, fp1, rsv[0]); 751 fdrop(fp1, td); 752 free2: 753 if (so2 != NULL) 754 (void)soclose(so2); 755 free1: 756 if (so1 != NULL) 757 (void)soclose(so1); 758 return (error); 759 } 760 761 int 762 sys_socketpair(struct thread *td, struct socketpair_args *uap) 763 { 764 int error, sv[2]; 765 766 error = kern_socketpair(td, uap->domain, uap->type, 767 uap->protocol, sv); 768 if (error != 0) 769 return (error); 770 error = copyout(sv, uap->rsv, 2 * sizeof(int)); 771 if (error != 0) { 772 (void)kern_close(td, sv[0]); 773 (void)kern_close(td, sv[1]); 774 } 775 return (error); 776 } 777 778 static int 779 sendit(td, s, mp, flags) 780 struct thread *td; 781 int s; 782 struct msghdr *mp; 783 int flags; 784 { 785 struct mbuf *control; 786 struct sockaddr *to; 787 int error; 788 789 #ifdef CAPABILITY_MODE 790 if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL)) 791 return (ECAPMODE); 792 #endif 793 794 if (mp->msg_name != NULL) { 795 error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); 796 if (error != 0) { 797 to = NULL; 798 goto bad; 799 } 800 mp->msg_name = to; 801 } else { 802 to = NULL; 803 } 804 805 if (mp->msg_control) { 806 if (mp->msg_controllen < sizeof(struct cmsghdr) 807 #ifdef COMPAT_OLDSOCK 808 && mp->msg_flags != MSG_COMPAT 809 #endif 810 ) { 811 error = EINVAL; 812 goto bad; 813 } 814 error = sockargs(&control, mp->msg_control, 815 mp->msg_controllen, MT_CONTROL); 816 if (error != 0) 817 goto bad; 818 #ifdef COMPAT_OLDSOCK 819 if (mp->msg_flags == MSG_COMPAT) { 820 struct cmsghdr *cm; 821 822 M_PREPEND(control, sizeof(*cm), M_WAITOK); 823 cm = mtod(control, struct cmsghdr *); 824 cm->cmsg_len = control->m_len; 825 cm->cmsg_level = SOL_SOCKET; 826 cm->cmsg_type = SCM_RIGHTS; 827 } 828 #endif 829 } else { 830 control = NULL; 831 } 832 833 error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE); 834 835 bad: 836 free(to, M_SONAME); 837 return (error); 838 } 839 840 int 841 kern_sendit(td, s, mp, flags, control, segflg) 842 struct thread *td; 843 int s; 844 struct msghdr *mp; 845 int flags; 846 struct mbuf *control; 847 enum uio_seg segflg; 848 { 849 struct file *fp; 850 struct uio auio; 851 struct iovec *iov; 852 struct socket *so; 853 cap_rights_t rights; 854 #ifdef KTRACE 855 struct uio *ktruio = NULL; 856 #endif 857 ssize_t len; 858 int i, error; 859 860 AUDIT_ARG_FD(s); 861 cap_rights_init(&rights, CAP_SEND); 862 if (mp->msg_name != NULL) { 863 AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name); 864 cap_rights_set(&rights, CAP_CONNECT); 865 } 866 error = getsock_cap(td, s, &rights, &fp, NULL); 867 if (error != 0) 868 return (error); 869 so = (struct socket *)fp->f_data; 870 871 #ifdef KTRACE 872 if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT)) 873 ktrsockaddr(mp->msg_name); 874 #endif 875 #ifdef MAC 876 if (mp->msg_name != NULL) { 877 error = mac_socket_check_connect(td->td_ucred, so, 878 mp->msg_name); 879 if (error != 0) 880 goto bad; 881 } 882 error = mac_socket_check_send(td->td_ucred, so); 883 if (error != 0) 884 goto bad; 885 #endif 886 887 auio.uio_iov = mp->msg_iov; 888 auio.uio_iovcnt = mp->msg_iovlen; 889 auio.uio_segflg = segflg; 890 auio.uio_rw = UIO_WRITE; 891 auio.uio_td = td; 892 auio.uio_offset = 0; /* XXX */ 893 auio.uio_resid = 0; 894 iov = mp->msg_iov; 895 for (i = 0; i < mp->msg_iovlen; i++, iov++) { 896 if ((auio.uio_resid += iov->iov_len) < 0) { 897 error = EINVAL; 898 goto bad; 899 } 900 } 901 #ifdef KTRACE 902 if (KTRPOINT(td, KTR_GENIO)) 903 ktruio = cloneuio(&auio); 904 #endif 905 len = auio.uio_resid; 906 error = sosend(so, mp->msg_name, &auio, 0, control, flags, td); 907 if (error != 0) { 908 if (auio.uio_resid != len && (error == ERESTART || 909 error == EINTR || error == EWOULDBLOCK)) 910 error = 0; 911 /* Generation of SIGPIPE can be controlled per socket */ 912 if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && 913 !(flags & MSG_NOSIGNAL)) { 914 PROC_LOCK(td->td_proc); 915 tdsignal(td, SIGPIPE); 916 PROC_UNLOCK(td->td_proc); 917 } 918 } 919 if (error == 0) 920 td->td_retval[0] = len - auio.uio_resid; 921 #ifdef KTRACE 922 if (ktruio != NULL) { 923 ktruio->uio_resid = td->td_retval[0]; 924 ktrgenio(s, UIO_WRITE, ktruio, error); 925 } 926 #endif 927 bad: 928 fdrop(fp, td); 929 return (error); 930 } 931 932 int 933 sys_sendto(td, uap) 934 struct thread *td; 935 struct sendto_args /* { 936 int s; 937 caddr_t buf; 938 size_t len; 939 int flags; 940 caddr_t to; 941 int tolen; 942 } */ *uap; 943 { 944 struct msghdr msg; 945 struct iovec aiov; 946 947 msg.msg_name = uap->to; 948 msg.msg_namelen = uap->tolen; 949 msg.msg_iov = &aiov; 950 msg.msg_iovlen = 1; 951 msg.msg_control = 0; 952 #ifdef COMPAT_OLDSOCK 953 msg.msg_flags = 0; 954 #endif 955 aiov.iov_base = uap->buf; 956 aiov.iov_len = uap->len; 957 return (sendit(td, uap->s, &msg, uap->flags)); 958 } 959 960 #ifdef COMPAT_OLDSOCK 961 int 962 osend(td, uap) 963 struct thread *td; 964 struct osend_args /* { 965 int s; 966 caddr_t buf; 967 int len; 968 int flags; 969 } */ *uap; 970 { 971 struct msghdr msg; 972 struct iovec aiov; 973 974 msg.msg_name = 0; 975 msg.msg_namelen = 0; 976 msg.msg_iov = &aiov; 977 msg.msg_iovlen = 1; 978 aiov.iov_base = uap->buf; 979 aiov.iov_len = uap->len; 980 msg.msg_control = 0; 981 msg.msg_flags = 0; 982 return (sendit(td, uap->s, &msg, uap->flags)); 983 } 984 985 int 986 osendmsg(td, uap) 987 struct thread *td; 988 struct osendmsg_args /* { 989 int s; 990 caddr_t msg; 991 int flags; 992 } */ *uap; 993 { 994 struct msghdr msg; 995 struct iovec *iov; 996 int error; 997 998 error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); 999 if (error != 0) 1000 return (error); 1001 error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); 1002 if (error != 0) 1003 return (error); 1004 msg.msg_iov = iov; 1005 msg.msg_flags = MSG_COMPAT; 1006 error = sendit(td, uap->s, &msg, uap->flags); 1007 free(iov, M_IOV); 1008 return (error); 1009 } 1010 #endif 1011 1012 int 1013 sys_sendmsg(td, uap) 1014 struct thread *td; 1015 struct sendmsg_args /* { 1016 int s; 1017 caddr_t msg; 1018 int flags; 1019 } */ *uap; 1020 { 1021 struct msghdr msg; 1022 struct iovec *iov; 1023 int error; 1024 1025 error = copyin(uap->msg, &msg, sizeof (msg)); 1026 if (error != 0) 1027 return (error); 1028 error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); 1029 if (error != 0) 1030 return (error); 1031 msg.msg_iov = iov; 1032 #ifdef COMPAT_OLDSOCK 1033 msg.msg_flags = 0; 1034 #endif 1035 error = sendit(td, uap->s, &msg, uap->flags); 1036 free(iov, M_IOV); 1037 return (error); 1038 } 1039 1040 int 1041 kern_recvit(td, s, mp, fromseg, controlp) 1042 struct thread *td; 1043 int s; 1044 struct msghdr *mp; 1045 enum uio_seg fromseg; 1046 struct mbuf **controlp; 1047 { 1048 struct uio auio; 1049 struct iovec *iov; 1050 struct mbuf *m, *control = NULL; 1051 caddr_t ctlbuf; 1052 struct file *fp; 1053 struct socket *so; 1054 struct sockaddr *fromsa = NULL; 1055 cap_rights_t rights; 1056 #ifdef KTRACE 1057 struct uio *ktruio = NULL; 1058 #endif 1059 ssize_t len; 1060 int error, i; 1061 1062 if (controlp != NULL) 1063 *controlp = NULL; 1064 1065 AUDIT_ARG_FD(s); 1066 error = getsock_cap(td, s, cap_rights_init(&rights, CAP_RECV), 1067 &fp, NULL); 1068 if (error != 0) 1069 return (error); 1070 so = fp->f_data; 1071 1072 #ifdef MAC 1073 error = mac_socket_check_receive(td->td_ucred, so); 1074 if (error != 0) { 1075 fdrop(fp, td); 1076 return (error); 1077 } 1078 #endif 1079 1080 auio.uio_iov = mp->msg_iov; 1081 auio.uio_iovcnt = mp->msg_iovlen; 1082 auio.uio_segflg = UIO_USERSPACE; 1083 auio.uio_rw = UIO_READ; 1084 auio.uio_td = td; 1085 auio.uio_offset = 0; /* XXX */ 1086 auio.uio_resid = 0; 1087 iov = mp->msg_iov; 1088 for (i = 0; i < mp->msg_iovlen; i++, iov++) { 1089 if ((auio.uio_resid += iov->iov_len) < 0) { 1090 fdrop(fp, td); 1091 return (EINVAL); 1092 } 1093 } 1094 #ifdef KTRACE 1095 if (KTRPOINT(td, KTR_GENIO)) 1096 ktruio = cloneuio(&auio); 1097 #endif 1098 len = auio.uio_resid; 1099 error = soreceive(so, &fromsa, &auio, NULL, 1100 (mp->msg_control || controlp) ? &control : NULL, 1101 &mp->msg_flags); 1102 if (error != 0) { 1103 if (auio.uio_resid != len && (error == ERESTART || 1104 error == EINTR || error == EWOULDBLOCK)) 1105 error = 0; 1106 } 1107 if (fromsa != NULL) 1108 AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa); 1109 #ifdef KTRACE 1110 if (ktruio != NULL) { 1111 ktruio->uio_resid = len - auio.uio_resid; 1112 ktrgenio(s, UIO_READ, ktruio, error); 1113 } 1114 #endif 1115 if (error != 0) 1116 goto out; 1117 td->td_retval[0] = len - auio.uio_resid; 1118 if (mp->msg_name) { 1119 len = mp->msg_namelen; 1120 if (len <= 0 || fromsa == NULL) 1121 len = 0; 1122 else { 1123 /* save sa_len before it is destroyed by MSG_COMPAT */ 1124 len = MIN(len, fromsa->sa_len); 1125 #ifdef COMPAT_OLDSOCK 1126 if (mp->msg_flags & MSG_COMPAT) 1127 ((struct osockaddr *)fromsa)->sa_family = 1128 fromsa->sa_family; 1129 #endif 1130 if (fromseg == UIO_USERSPACE) { 1131 error = copyout(fromsa, mp->msg_name, 1132 (unsigned)len); 1133 if (error != 0) 1134 goto out; 1135 } else 1136 bcopy(fromsa, mp->msg_name, len); 1137 } 1138 mp->msg_namelen = len; 1139 } 1140 if (mp->msg_control && controlp == NULL) { 1141 #ifdef COMPAT_OLDSOCK 1142 /* 1143 * We assume that old recvmsg calls won't receive access 1144 * rights and other control info, esp. as control info 1145 * is always optional and those options didn't exist in 4.3. 1146 * If we receive rights, trim the cmsghdr; anything else 1147 * is tossed. 1148 */ 1149 if (control && mp->msg_flags & MSG_COMPAT) { 1150 if (mtod(control, struct cmsghdr *)->cmsg_level != 1151 SOL_SOCKET || 1152 mtod(control, struct cmsghdr *)->cmsg_type != 1153 SCM_RIGHTS) { 1154 mp->msg_controllen = 0; 1155 goto out; 1156 } 1157 control->m_len -= sizeof (struct cmsghdr); 1158 control->m_data += sizeof (struct cmsghdr); 1159 } 1160 #endif 1161 len = mp->msg_controllen; 1162 m = control; 1163 mp->msg_controllen = 0; 1164 ctlbuf = mp->msg_control; 1165 1166 while (m && len > 0) { 1167 unsigned int tocopy; 1168 1169 if (len >= m->m_len) 1170 tocopy = m->m_len; 1171 else { 1172 mp->msg_flags |= MSG_CTRUNC; 1173 tocopy = len; 1174 } 1175 1176 if ((error = copyout(mtod(m, caddr_t), 1177 ctlbuf, tocopy)) != 0) 1178 goto out; 1179 1180 ctlbuf += tocopy; 1181 len -= tocopy; 1182 m = m->m_next; 1183 } 1184 mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; 1185 } 1186 out: 1187 fdrop(fp, td); 1188 #ifdef KTRACE 1189 if (fromsa && KTRPOINT(td, KTR_STRUCT)) 1190 ktrsockaddr(fromsa); 1191 #endif 1192 free(fromsa, M_SONAME); 1193 1194 if (error == 0 && controlp != NULL) 1195 *controlp = control; 1196 else if (control) 1197 m_freem(control); 1198 1199 return (error); 1200 } 1201 1202 static int 1203 recvit(td, s, mp, namelenp) 1204 struct thread *td; 1205 int s; 1206 struct msghdr *mp; 1207 void *namelenp; 1208 { 1209 int error; 1210 1211 error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL); 1212 if (error != 0) 1213 return (error); 1214 if (namelenp != NULL) { 1215 error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); 1216 #ifdef COMPAT_OLDSOCK 1217 if (mp->msg_flags & MSG_COMPAT) 1218 error = 0; /* old recvfrom didn't check */ 1219 #endif 1220 } 1221 return (error); 1222 } 1223 1224 int 1225 sys_recvfrom(td, uap) 1226 struct thread *td; 1227 struct recvfrom_args /* { 1228 int s; 1229 caddr_t buf; 1230 size_t len; 1231 int flags; 1232 struct sockaddr * __restrict from; 1233 socklen_t * __restrict fromlenaddr; 1234 } */ *uap; 1235 { 1236 struct msghdr msg; 1237 struct iovec aiov; 1238 int error; 1239 1240 if (uap->fromlenaddr) { 1241 error = copyin(uap->fromlenaddr, 1242 &msg.msg_namelen, sizeof (msg.msg_namelen)); 1243 if (error != 0) 1244 goto done2; 1245 } else { 1246 msg.msg_namelen = 0; 1247 } 1248 msg.msg_name = uap->from; 1249 msg.msg_iov = &aiov; 1250 msg.msg_iovlen = 1; 1251 aiov.iov_base = uap->buf; 1252 aiov.iov_len = uap->len; 1253 msg.msg_control = 0; 1254 msg.msg_flags = uap->flags; 1255 error = recvit(td, uap->s, &msg, uap->fromlenaddr); 1256 done2: 1257 return (error); 1258 } 1259 1260 #ifdef COMPAT_OLDSOCK 1261 int 1262 orecvfrom(td, uap) 1263 struct thread *td; 1264 struct recvfrom_args *uap; 1265 { 1266 1267 uap->flags |= MSG_COMPAT; 1268 return (sys_recvfrom(td, uap)); 1269 } 1270 #endif 1271 1272 #ifdef COMPAT_OLDSOCK 1273 int 1274 orecv(td, uap) 1275 struct thread *td; 1276 struct orecv_args /* { 1277 int s; 1278 caddr_t buf; 1279 int len; 1280 int flags; 1281 } */ *uap; 1282 { 1283 struct msghdr msg; 1284 struct iovec aiov; 1285 1286 msg.msg_name = 0; 1287 msg.msg_namelen = 0; 1288 msg.msg_iov = &aiov; 1289 msg.msg_iovlen = 1; 1290 aiov.iov_base = uap->buf; 1291 aiov.iov_len = uap->len; 1292 msg.msg_control = 0; 1293 msg.msg_flags = uap->flags; 1294 return (recvit(td, uap->s, &msg, NULL)); 1295 } 1296 1297 /* 1298 * Old recvmsg. This code takes advantage of the fact that the old msghdr 1299 * overlays the new one, missing only the flags, and with the (old) access 1300 * rights where the control fields are now. 1301 */ 1302 int 1303 orecvmsg(td, uap) 1304 struct thread *td; 1305 struct orecvmsg_args /* { 1306 int s; 1307 struct omsghdr *msg; 1308 int flags; 1309 } */ *uap; 1310 { 1311 struct msghdr msg; 1312 struct iovec *iov; 1313 int error; 1314 1315 error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); 1316 if (error != 0) 1317 return (error); 1318 error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); 1319 if (error != 0) 1320 return (error); 1321 msg.msg_flags = uap->flags | MSG_COMPAT; 1322 msg.msg_iov = iov; 1323 error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); 1324 if (msg.msg_controllen && error == 0) 1325 error = copyout(&msg.msg_controllen, 1326 &uap->msg->msg_accrightslen, sizeof (int)); 1327 free(iov, M_IOV); 1328 return (error); 1329 } 1330 #endif 1331 1332 int 1333 sys_recvmsg(td, uap) 1334 struct thread *td; 1335 struct recvmsg_args /* { 1336 int s; 1337 struct msghdr *msg; 1338 int flags; 1339 } */ *uap; 1340 { 1341 struct msghdr msg; 1342 struct iovec *uiov, *iov; 1343 int error; 1344 1345 error = copyin(uap->msg, &msg, sizeof (msg)); 1346 if (error != 0) 1347 return (error); 1348 error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); 1349 if (error != 0) 1350 return (error); 1351 msg.msg_flags = uap->flags; 1352 #ifdef COMPAT_OLDSOCK 1353 msg.msg_flags &= ~MSG_COMPAT; 1354 #endif 1355 uiov = msg.msg_iov; 1356 msg.msg_iov = iov; 1357 error = recvit(td, uap->s, &msg, NULL); 1358 if (error == 0) { 1359 msg.msg_iov = uiov; 1360 error = copyout(&msg, uap->msg, sizeof(msg)); 1361 } 1362 free(iov, M_IOV); 1363 return (error); 1364 } 1365 1366 /* ARGSUSED */ 1367 int 1368 sys_shutdown(td, uap) 1369 struct thread *td; 1370 struct shutdown_args /* { 1371 int s; 1372 int how; 1373 } */ *uap; 1374 { 1375 struct socket *so; 1376 struct file *fp; 1377 cap_rights_t rights; 1378 int error; 1379 1380 AUDIT_ARG_FD(uap->s); 1381 error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_SHUTDOWN), 1382 &fp, NULL); 1383 if (error == 0) { 1384 so = fp->f_data; 1385 error = soshutdown(so, uap->how); 1386 fdrop(fp, td); 1387 } 1388 return (error); 1389 } 1390 1391 /* ARGSUSED */ 1392 int 1393 sys_setsockopt(td, uap) 1394 struct thread *td; 1395 struct setsockopt_args /* { 1396 int s; 1397 int level; 1398 int name; 1399 caddr_t val; 1400 int valsize; 1401 } */ *uap; 1402 { 1403 1404 return (kern_setsockopt(td, uap->s, uap->level, uap->name, 1405 uap->val, UIO_USERSPACE, uap->valsize)); 1406 } 1407 1408 int 1409 kern_setsockopt(td, s, level, name, val, valseg, valsize) 1410 struct thread *td; 1411 int s; 1412 int level; 1413 int name; 1414 void *val; 1415 enum uio_seg valseg; 1416 socklen_t valsize; 1417 { 1418 struct socket *so; 1419 struct file *fp; 1420 struct sockopt sopt; 1421 cap_rights_t rights; 1422 int error; 1423 1424 if (val == NULL && valsize != 0) 1425 return (EFAULT); 1426 if ((int)valsize < 0) 1427 return (EINVAL); 1428 1429 sopt.sopt_dir = SOPT_SET; 1430 sopt.sopt_level = level; 1431 sopt.sopt_name = name; 1432 sopt.sopt_val = val; 1433 sopt.sopt_valsize = valsize; 1434 switch (valseg) { 1435 case UIO_USERSPACE: 1436 sopt.sopt_td = td; 1437 break; 1438 case UIO_SYSSPACE: 1439 sopt.sopt_td = NULL; 1440 break; 1441 default: 1442 panic("kern_setsockopt called with bad valseg"); 1443 } 1444 1445 AUDIT_ARG_FD(s); 1446 error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SETSOCKOPT), 1447 &fp, NULL); 1448 if (error == 0) { 1449 so = fp->f_data; 1450 error = sosetopt(so, &sopt); 1451 fdrop(fp, td); 1452 } 1453 return(error); 1454 } 1455 1456 /* ARGSUSED */ 1457 int 1458 sys_getsockopt(td, uap) 1459 struct thread *td; 1460 struct getsockopt_args /* { 1461 int s; 1462 int level; 1463 int name; 1464 void * __restrict val; 1465 socklen_t * __restrict avalsize; 1466 } */ *uap; 1467 { 1468 socklen_t valsize; 1469 int error; 1470 1471 if (uap->val) { 1472 error = copyin(uap->avalsize, &valsize, sizeof (valsize)); 1473 if (error != 0) 1474 return (error); 1475 } 1476 1477 error = kern_getsockopt(td, uap->s, uap->level, uap->name, 1478 uap->val, UIO_USERSPACE, &valsize); 1479 1480 if (error == 0) 1481 error = copyout(&valsize, uap->avalsize, sizeof (valsize)); 1482 return (error); 1483 } 1484 1485 /* 1486 * Kernel version of getsockopt. 1487 * optval can be a userland or userspace. optlen is always a kernel pointer. 1488 */ 1489 int 1490 kern_getsockopt(td, s, level, name, val, valseg, valsize) 1491 struct thread *td; 1492 int s; 1493 int level; 1494 int name; 1495 void *val; 1496 enum uio_seg valseg; 1497 socklen_t *valsize; 1498 { 1499 struct socket *so; 1500 struct file *fp; 1501 struct sockopt sopt; 1502 cap_rights_t rights; 1503 int error; 1504 1505 if (val == NULL) 1506 *valsize = 0; 1507 if ((int)*valsize < 0) 1508 return (EINVAL); 1509 1510 sopt.sopt_dir = SOPT_GET; 1511 sopt.sopt_level = level; 1512 sopt.sopt_name = name; 1513 sopt.sopt_val = val; 1514 sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */ 1515 switch (valseg) { 1516 case UIO_USERSPACE: 1517 sopt.sopt_td = td; 1518 break; 1519 case UIO_SYSSPACE: 1520 sopt.sopt_td = NULL; 1521 break; 1522 default: 1523 panic("kern_getsockopt called with bad valseg"); 1524 } 1525 1526 AUDIT_ARG_FD(s); 1527 error = getsock_cap(td, s, cap_rights_init(&rights, CAP_GETSOCKOPT), 1528 &fp, NULL); 1529 if (error == 0) { 1530 so = fp->f_data; 1531 error = sogetopt(so, &sopt); 1532 *valsize = sopt.sopt_valsize; 1533 fdrop(fp, td); 1534 } 1535 return (error); 1536 } 1537 1538 /* 1539 * getsockname1() - Get socket name. 1540 */ 1541 /* ARGSUSED */ 1542 static int 1543 getsockname1(td, uap, compat) 1544 struct thread *td; 1545 struct getsockname_args /* { 1546 int fdes; 1547 struct sockaddr * __restrict asa; 1548 socklen_t * __restrict alen; 1549 } */ *uap; 1550 int compat; 1551 { 1552 struct sockaddr *sa; 1553 socklen_t len; 1554 int error; 1555 1556 error = copyin(uap->alen, &len, sizeof(len)); 1557 if (error != 0) 1558 return (error); 1559 1560 error = kern_getsockname(td, uap->fdes, &sa, &len); 1561 if (error != 0) 1562 return (error); 1563 1564 if (len != 0) { 1565 #ifdef COMPAT_OLDSOCK 1566 if (compat) 1567 ((struct osockaddr *)sa)->sa_family = sa->sa_family; 1568 #endif 1569 error = copyout(sa, uap->asa, (u_int)len); 1570 } 1571 free(sa, M_SONAME); 1572 if (error == 0) 1573 error = copyout(&len, uap->alen, sizeof(len)); 1574 return (error); 1575 } 1576 1577 int 1578 kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, 1579 socklen_t *alen) 1580 { 1581 struct socket *so; 1582 struct file *fp; 1583 cap_rights_t rights; 1584 socklen_t len; 1585 int error; 1586 1587 AUDIT_ARG_FD(fd); 1588 error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETSOCKNAME), 1589 &fp, NULL); 1590 if (error != 0) 1591 return (error); 1592 so = fp->f_data; 1593 *sa = NULL; 1594 CURVNET_SET(so->so_vnet); 1595 error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); 1596 CURVNET_RESTORE(); 1597 if (error != 0) 1598 goto bad; 1599 if (*sa == NULL) 1600 len = 0; 1601 else 1602 len = MIN(*alen, (*sa)->sa_len); 1603 *alen = len; 1604 #ifdef KTRACE 1605 if (KTRPOINT(td, KTR_STRUCT)) 1606 ktrsockaddr(*sa); 1607 #endif 1608 bad: 1609 fdrop(fp, td); 1610 if (error != 0 && *sa != NULL) { 1611 free(*sa, M_SONAME); 1612 *sa = NULL; 1613 } 1614 return (error); 1615 } 1616 1617 int 1618 sys_getsockname(td, uap) 1619 struct thread *td; 1620 struct getsockname_args *uap; 1621 { 1622 1623 return (getsockname1(td, uap, 0)); 1624 } 1625 1626 #ifdef COMPAT_OLDSOCK 1627 int 1628 ogetsockname(td, uap) 1629 struct thread *td; 1630 struct getsockname_args *uap; 1631 { 1632 1633 return (getsockname1(td, uap, 1)); 1634 } 1635 #endif /* COMPAT_OLDSOCK */ 1636 1637 /* 1638 * getpeername1() - Get name of peer for connected socket. 1639 */ 1640 /* ARGSUSED */ 1641 static int 1642 getpeername1(td, uap, compat) 1643 struct thread *td; 1644 struct getpeername_args /* { 1645 int fdes; 1646 struct sockaddr * __restrict asa; 1647 socklen_t * __restrict alen; 1648 } */ *uap; 1649 int compat; 1650 { 1651 struct sockaddr *sa; 1652 socklen_t len; 1653 int error; 1654 1655 error = copyin(uap->alen, &len, sizeof (len)); 1656 if (error != 0) 1657 return (error); 1658 1659 error = kern_getpeername(td, uap->fdes, &sa, &len); 1660 if (error != 0) 1661 return (error); 1662 1663 if (len != 0) { 1664 #ifdef COMPAT_OLDSOCK 1665 if (compat) 1666 ((struct osockaddr *)sa)->sa_family = sa->sa_family; 1667 #endif 1668 error = copyout(sa, uap->asa, (u_int)len); 1669 } 1670 free(sa, M_SONAME); 1671 if (error == 0) 1672 error = copyout(&len, uap->alen, sizeof(len)); 1673 return (error); 1674 } 1675 1676 int 1677 kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, 1678 socklen_t *alen) 1679 { 1680 struct socket *so; 1681 struct file *fp; 1682 cap_rights_t rights; 1683 socklen_t len; 1684 int error; 1685 1686 AUDIT_ARG_FD(fd); 1687 error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETPEERNAME), 1688 &fp, NULL); 1689 if (error != 0) 1690 return (error); 1691 so = fp->f_data; 1692 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { 1693 error = ENOTCONN; 1694 goto done; 1695 } 1696 *sa = NULL; 1697 CURVNET_SET(so->so_vnet); 1698 error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); 1699 CURVNET_RESTORE(); 1700 if (error != 0) 1701 goto bad; 1702 if (*sa == NULL) 1703 len = 0; 1704 else 1705 len = MIN(*alen, (*sa)->sa_len); 1706 *alen = len; 1707 #ifdef KTRACE 1708 if (KTRPOINT(td, KTR_STRUCT)) 1709 ktrsockaddr(*sa); 1710 #endif 1711 bad: 1712 if (error != 0 && *sa != NULL) { 1713 free(*sa, M_SONAME); 1714 *sa = NULL; 1715 } 1716 done: 1717 fdrop(fp, td); 1718 return (error); 1719 } 1720 1721 int 1722 sys_getpeername(td, uap) 1723 struct thread *td; 1724 struct getpeername_args *uap; 1725 { 1726 1727 return (getpeername1(td, uap, 0)); 1728 } 1729 1730 #ifdef COMPAT_OLDSOCK 1731 int 1732 ogetpeername(td, uap) 1733 struct thread *td; 1734 struct ogetpeername_args *uap; 1735 { 1736 1737 /* XXX uap should have type `getpeername_args *' to begin with. */ 1738 return (getpeername1(td, (struct getpeername_args *)uap, 1)); 1739 } 1740 #endif /* COMPAT_OLDSOCK */ 1741 1742 int 1743 sockargs(mp, buf, buflen, type) 1744 struct mbuf **mp; 1745 caddr_t buf; 1746 int buflen, type; 1747 { 1748 struct sockaddr *sa; 1749 struct mbuf *m; 1750 int error; 1751 1752 if (buflen > MLEN) { 1753 #ifdef COMPAT_OLDSOCK 1754 if (type == MT_SONAME && buflen <= 112) 1755 buflen = MLEN; /* unix domain compat. hack */ 1756 else 1757 #endif 1758 if (buflen > MCLBYTES) 1759 return (EINVAL); 1760 } 1761 m = m_get2(buflen, M_WAITOK, type, 0); 1762 m->m_len = buflen; 1763 error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); 1764 if (error != 0) 1765 (void) m_free(m); 1766 else { 1767 *mp = m; 1768 if (type == MT_SONAME) { 1769 sa = mtod(m, struct sockaddr *); 1770 1771 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN 1772 if (sa->sa_family == 0 && sa->sa_len < AF_MAX) 1773 sa->sa_family = sa->sa_len; 1774 #endif 1775 sa->sa_len = buflen; 1776 } 1777 } 1778 return (error); 1779 } 1780 1781 int 1782 getsockaddr(namp, uaddr, len) 1783 struct sockaddr **namp; 1784 caddr_t uaddr; 1785 size_t len; 1786 { 1787 struct sockaddr *sa; 1788 int error; 1789 1790 if (len > SOCK_MAXADDRLEN) 1791 return (ENAMETOOLONG); 1792 if (len < offsetof(struct sockaddr, sa_data[0])) 1793 return (EINVAL); 1794 sa = malloc(len, M_SONAME, M_WAITOK); 1795 error = copyin(uaddr, sa, len); 1796 if (error != 0) { 1797 free(sa, M_SONAME); 1798 } else { 1799 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN 1800 if (sa->sa_family == 0 && sa->sa_len < AF_MAX) 1801 sa->sa_family = sa->sa_len; 1802 #endif 1803 sa->sa_len = len; 1804 *namp = sa; 1805 } 1806 return (error); 1807 } 1808 1809 struct sendfile_sync { 1810 struct mtx mtx; 1811 struct cv cv; 1812 unsigned count; 1813 }; 1814 1815 /* 1816 * Add more references to a vm_page + sf_buf + sendfile_sync. 1817 */ 1818 void 1819 sf_ext_ref(void *arg1, void *arg2) 1820 { 1821 struct sf_buf *sf = arg1; 1822 struct sendfile_sync *sfs = arg2; 1823 vm_page_t pg = sf_buf_page(sf); 1824 1825 sf_buf_ref(sf); 1826 1827 vm_page_lock(pg); 1828 vm_page_wire(pg); 1829 vm_page_unlock(pg); 1830 1831 if (sfs != NULL) { 1832 mtx_lock(&sfs->mtx); 1833 KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0")); 1834 sfs->count++; 1835 mtx_unlock(&sfs->mtx); 1836 } 1837 } 1838 1839 /* 1840 * Detach mapped page and release resources back to the system. 1841 */ 1842 void 1843 sf_ext_free(void *arg1, void *arg2) 1844 { 1845 struct sf_buf *sf = arg1; 1846 struct sendfile_sync *sfs = arg2; 1847 vm_page_t pg = sf_buf_page(sf); 1848 1849 sf_buf_free(sf); 1850 1851 vm_page_lock(pg); 1852 vm_page_unwire(pg, PQ_INACTIVE); 1853 /* 1854 * Check for the object going away on us. This can 1855 * happen since we don't hold a reference to it. 1856 * If so, we're responsible for freeing the page. 1857 */ 1858 if (pg->wire_count == 0 && pg->object == NULL) 1859 vm_page_free(pg); 1860 vm_page_unlock(pg); 1861 1862 if (sfs != NULL) { 1863 mtx_lock(&sfs->mtx); 1864 KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0")); 1865 if (--sfs->count == 0) 1866 cv_signal(&sfs->cv); 1867 mtx_unlock(&sfs->mtx); 1868 } 1869 } 1870 1871 /* 1872 * sendfile(2) 1873 * 1874 * int sendfile(int fd, int s, off_t offset, size_t nbytes, 1875 * struct sf_hdtr *hdtr, off_t *sbytes, int flags) 1876 * 1877 * Send a file specified by 'fd' and starting at 'offset' to a socket 1878 * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes == 1879 * 0. Optionally add a header and/or trailer to the socket output. If 1880 * specified, write the total number of bytes sent into *sbytes. 1881 */ 1882 int 1883 sys_sendfile(struct thread *td, struct sendfile_args *uap) 1884 { 1885 1886 return (do_sendfile(td, uap, 0)); 1887 } 1888 1889 static int 1890 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat) 1891 { 1892 struct sf_hdtr hdtr; 1893 struct uio *hdr_uio, *trl_uio; 1894 struct file *fp; 1895 cap_rights_t rights; 1896 off_t sbytes; 1897 int error; 1898 1899 /* 1900 * File offset must be positive. If it goes beyond EOF 1901 * we send only the header/trailer and no payload data. 1902 */ 1903 if (uap->offset < 0) 1904 return (EINVAL); 1905 1906 hdr_uio = trl_uio = NULL; 1907 1908 if (uap->hdtr != NULL) { 1909 error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); 1910 if (error != 0) 1911 goto out; 1912 if (hdtr.headers != NULL) { 1913 error = copyinuio(hdtr.headers, hdtr.hdr_cnt, 1914 &hdr_uio); 1915 if (error != 0) 1916 goto out; 1917 } 1918 if (hdtr.trailers != NULL) { 1919 error = copyinuio(hdtr.trailers, hdtr.trl_cnt, 1920 &trl_uio); 1921 if (error != 0) 1922 goto out; 1923 } 1924 } 1925 1926 AUDIT_ARG_FD(uap->fd); 1927 1928 /* 1929 * sendfile(2) can start at any offset within a file so we require 1930 * CAP_READ+CAP_SEEK = CAP_PREAD. 1931 */ 1932 if ((error = fget_read(td, uap->fd, 1933 cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) { 1934 goto out; 1935 } 1936 1937 error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset, 1938 uap->nbytes, &sbytes, uap->flags, compat ? SFK_COMPAT : 0, td); 1939 fdrop(fp, td); 1940 1941 if (uap->sbytes != NULL) 1942 copyout(&sbytes, uap->sbytes, sizeof(off_t)); 1943 1944 out: 1945 free(hdr_uio, M_IOV); 1946 free(trl_uio, M_IOV); 1947 return (error); 1948 } 1949 1950 #ifdef COMPAT_FREEBSD4 1951 int 1952 freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap) 1953 { 1954 struct sendfile_args args; 1955 1956 args.fd = uap->fd; 1957 args.s = uap->s; 1958 args.offset = uap->offset; 1959 args.nbytes = uap->nbytes; 1960 args.hdtr = uap->hdtr; 1961 args.sbytes = uap->sbytes; 1962 args.flags = uap->flags; 1963 1964 return (do_sendfile(td, &args, 1)); 1965 } 1966 #endif /* COMPAT_FREEBSD4 */ 1967 1968 static int 1969 sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd, 1970 off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res) 1971 { 1972 vm_page_t m; 1973 vm_pindex_t pindex; 1974 ssize_t resid; 1975 int error, readahead, rv; 1976 1977 pindex = OFF_TO_IDX(off); 1978 VM_OBJECT_WLOCK(obj); 1979 m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY | 1980 VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL); 1981 1982 /* 1983 * Check if page is valid for what we need, otherwise initiate I/O. 1984 * 1985 * The non-zero nd argument prevents disk I/O, instead we 1986 * return the caller what he specified in nd. In particular, 1987 * if we already turned some pages into mbufs, nd == EAGAIN 1988 * and the main function send them the pages before we come 1989 * here again and block. 1990 */ 1991 if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) { 1992 if (vp == NULL) 1993 vm_page_xunbusy(m); 1994 VM_OBJECT_WUNLOCK(obj); 1995 *res = m; 1996 return (0); 1997 } else if (nd != 0) { 1998 if (vp == NULL) 1999 vm_page_xunbusy(m); 2000 error = nd; 2001 goto free_page; 2002 } 2003 2004 /* 2005 * Get the page from backing store. 2006 */ 2007 error = 0; 2008 if (vp != NULL) { 2009 VM_OBJECT_WUNLOCK(obj); 2010 readahead = sfreadahead * MAXBSIZE; 2011 2012 /* 2013 * Use vn_rdwr() instead of the pager interface for 2014 * the vnode, to allow the read-ahead. 2015 * 2016 * XXXMAC: Because we don't have fp->f_cred here, we 2017 * pass in NOCRED. This is probably wrong, but is 2018 * consistent with our original implementation. 2019 */ 2020 error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off), 2021 UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead / 2022 bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td); 2023 SFSTAT_INC(sf_iocnt); 2024 VM_OBJECT_WLOCK(obj); 2025 } else { 2026 if (vm_pager_has_page(obj, pindex, NULL, NULL)) { 2027 rv = vm_pager_get_pages(obj, &m, 1, 0); 2028 SFSTAT_INC(sf_iocnt); 2029 if (rv != VM_PAGER_OK) { 2030 vm_page_lock(m); 2031 vm_page_free(m); 2032 vm_page_unlock(m); 2033 m = NULL; 2034 error = EIO; 2035 } 2036 } else { 2037 pmap_zero_page(m); 2038 m->valid = VM_PAGE_BITS_ALL; 2039 m->dirty = 0; 2040 } 2041 if (m != NULL) 2042 vm_page_xunbusy(m); 2043 } 2044 if (error == 0) { 2045 *res = m; 2046 } else if (m != NULL) { 2047 free_page: 2048 vm_page_lock(m); 2049 vm_page_unwire(m, PQ_INACTIVE); 2050 2051 /* 2052 * See if anyone else might know about this page. If 2053 * not and it is not valid, then free it. 2054 */ 2055 if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m)) 2056 vm_page_free(m); 2057 vm_page_unlock(m); 2058 } 2059 KASSERT(error != 0 || (m->wire_count > 0 && 2060 vm_page_is_valid(m, off & PAGE_MASK, xfsize)), 2061 ("wrong page state m %p off %#jx xfsize %d", m, (uintmax_t)off, 2062 xfsize)); 2063 VM_OBJECT_WUNLOCK(obj); 2064 return (error); 2065 } 2066 2067 static int 2068 sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res, 2069 struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size, 2070 int *bsize) 2071 { 2072 struct vattr va; 2073 vm_object_t obj; 2074 struct vnode *vp; 2075 struct shmfd *shmfd; 2076 int error; 2077 2078 vp = *vp_res = NULL; 2079 obj = NULL; 2080 shmfd = *shmfd_res = NULL; 2081 *bsize = 0; 2082 2083 /* 2084 * The file descriptor must be a regular file and have a 2085 * backing VM object. 2086 */ 2087 if (fp->f_type == DTYPE_VNODE) { 2088 vp = fp->f_vnode; 2089 vn_lock(vp, LK_SHARED | LK_RETRY); 2090 if (vp->v_type != VREG) { 2091 error = EINVAL; 2092 goto out; 2093 } 2094 *bsize = vp->v_mount->mnt_stat.f_iosize; 2095 error = VOP_GETATTR(vp, &va, td->td_ucred); 2096 if (error != 0) 2097 goto out; 2098 *obj_size = va.va_size; 2099 obj = vp->v_object; 2100 if (obj == NULL) { 2101 error = EINVAL; 2102 goto out; 2103 } 2104 } else if (fp->f_type == DTYPE_SHM) { 2105 error = 0; 2106 shmfd = fp->f_data; 2107 obj = shmfd->shm_object; 2108 *obj_size = shmfd->shm_size; 2109 } else { 2110 error = EINVAL; 2111 goto out; 2112 } 2113 2114 VM_OBJECT_WLOCK(obj); 2115 if ((obj->flags & OBJ_DEAD) != 0) { 2116 VM_OBJECT_WUNLOCK(obj); 2117 error = EBADF; 2118 goto out; 2119 } 2120 2121 /* 2122 * Temporarily increase the backing VM object's reference 2123 * count so that a forced reclamation of its vnode does not 2124 * immediately destroy it. 2125 */ 2126 vm_object_reference_locked(obj); 2127 VM_OBJECT_WUNLOCK(obj); 2128 *obj_res = obj; 2129 *vp_res = vp; 2130 *shmfd_res = shmfd; 2131 2132 out: 2133 if (vp != NULL) 2134 VOP_UNLOCK(vp, 0); 2135 return (error); 2136 } 2137 2138 static int 2139 kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp, 2140 struct socket **so) 2141 { 2142 cap_rights_t rights; 2143 int error; 2144 2145 *sock_fp = NULL; 2146 *so = NULL; 2147 2148 /* 2149 * The socket must be a stream socket and connected. 2150 */ 2151 error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SEND), 2152 sock_fp, NULL); 2153 if (error != 0) 2154 return (error); 2155 *so = (*sock_fp)->f_data; 2156 if ((*so)->so_type != SOCK_STREAM) 2157 return (EINVAL); 2158 if (((*so)->so_state & SS_ISCONNECTED) == 0) 2159 return (ENOTCONN); 2160 return (0); 2161 } 2162 2163 int 2164 vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, 2165 struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, 2166 int kflags, struct thread *td) 2167 { 2168 struct file *sock_fp; 2169 struct vnode *vp; 2170 struct vm_object *obj; 2171 struct socket *so; 2172 struct mbuf *m; 2173 struct sf_buf *sf; 2174 struct vm_page *pg; 2175 struct shmfd *shmfd; 2176 struct sendfile_sync *sfs; 2177 struct vattr va; 2178 off_t off, xfsize, fsbytes, sbytes, rem, obj_size; 2179 int error, bsize, nd, hdrlen, mnw; 2180 2181 pg = NULL; 2182 obj = NULL; 2183 so = NULL; 2184 m = NULL; 2185 sfs = NULL; 2186 fsbytes = sbytes = 0; 2187 hdrlen = mnw = 0; 2188 rem = nbytes; 2189 obj_size = 0; 2190 2191 error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize); 2192 if (error != 0) 2193 return (error); 2194 if (rem == 0) 2195 rem = obj_size; 2196 2197 error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so); 2198 if (error != 0) 2199 goto out; 2200 2201 /* 2202 * Do not wait on memory allocations but return ENOMEM for 2203 * caller to retry later. 2204 * XXX: Experimental. 2205 */ 2206 if (flags & SF_MNOWAIT) 2207 mnw = 1; 2208 2209 if (flags & SF_SYNC) { 2210 sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO); 2211 mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF); 2212 cv_init(&sfs->cv, "sendfile"); 2213 } 2214 2215 #ifdef MAC 2216 error = mac_socket_check_send(td->td_ucred, so); 2217 if (error != 0) 2218 goto out; 2219 #endif 2220 2221 /* If headers are specified copy them into mbufs. */ 2222 if (hdr_uio != NULL) { 2223 hdr_uio->uio_td = td; 2224 hdr_uio->uio_rw = UIO_WRITE; 2225 if (hdr_uio->uio_resid > 0) { 2226 /* 2227 * In FBSD < 5.0 the nbytes to send also included 2228 * the header. If compat is specified subtract the 2229 * header size from nbytes. 2230 */ 2231 if (kflags & SFK_COMPAT) { 2232 if (nbytes > hdr_uio->uio_resid) 2233 nbytes -= hdr_uio->uio_resid; 2234 else 2235 nbytes = 0; 2236 } 2237 m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK), 2238 0, 0, 0); 2239 if (m == NULL) { 2240 error = mnw ? EAGAIN : ENOBUFS; 2241 goto out; 2242 } 2243 hdrlen = m_length(m, NULL); 2244 } 2245 } 2246 2247 /* 2248 * Protect against multiple writers to the socket. 2249 * 2250 * XXXRW: Historically this has assumed non-interruptibility, so now 2251 * we implement that, but possibly shouldn't. 2252 */ 2253 (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR); 2254 2255 /* 2256 * Loop through the pages of the file, starting with the requested 2257 * offset. Get a file page (do I/O if necessary), map the file page 2258 * into an sf_buf, attach an mbuf header to the sf_buf, and queue 2259 * it on the socket. 2260 * This is done in two loops. The inner loop turns as many pages 2261 * as it can, up to available socket buffer space, without blocking 2262 * into mbufs to have it bulk delivered into the socket send buffer. 2263 * The outer loop checks the state and available space of the socket 2264 * and takes care of the overall progress. 2265 */ 2266 for (off = offset; ; ) { 2267 struct mbuf *mtail; 2268 int loopbytes; 2269 int space; 2270 int done; 2271 2272 if ((nbytes != 0 && nbytes == fsbytes) || 2273 (nbytes == 0 && obj_size == fsbytes)) 2274 break; 2275 2276 mtail = NULL; 2277 loopbytes = 0; 2278 space = 0; 2279 done = 0; 2280 2281 /* 2282 * Check the socket state for ongoing connection, 2283 * no errors and space in socket buffer. 2284 * If space is low allow for the remainder of the 2285 * file to be processed if it fits the socket buffer. 2286 * Otherwise block in waiting for sufficient space 2287 * to proceed, or if the socket is nonblocking, return 2288 * to userland with EAGAIN while reporting how far 2289 * we've come. 2290 * We wait until the socket buffer has significant free 2291 * space to do bulk sends. This makes good use of file 2292 * system read ahead and allows packet segmentation 2293 * offloading hardware to take over lots of work. If 2294 * we were not careful here we would send off only one 2295 * sfbuf at a time. 2296 */ 2297 SOCKBUF_LOCK(&so->so_snd); 2298 if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2) 2299 so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; 2300 retry_space: 2301 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2302 error = EPIPE; 2303 SOCKBUF_UNLOCK(&so->so_snd); 2304 goto done; 2305 } else if (so->so_error) { 2306 error = so->so_error; 2307 so->so_error = 0; 2308 SOCKBUF_UNLOCK(&so->so_snd); 2309 goto done; 2310 } 2311 space = sbspace(&so->so_snd); 2312 if (space < rem && 2313 (space <= 0 || 2314 space < so->so_snd.sb_lowat)) { 2315 if (so->so_state & SS_NBIO) { 2316 SOCKBUF_UNLOCK(&so->so_snd); 2317 error = EAGAIN; 2318 goto done; 2319 } 2320 /* 2321 * sbwait drops the lock while sleeping. 2322 * When we loop back to retry_space the 2323 * state may have changed and we retest 2324 * for it. 2325 */ 2326 error = sbwait(&so->so_snd); 2327 /* 2328 * An error from sbwait usually indicates that we've 2329 * been interrupted by a signal. If we've sent anything 2330 * then return bytes sent, otherwise return the error. 2331 */ 2332 if (error != 0) { 2333 SOCKBUF_UNLOCK(&so->so_snd); 2334 goto done; 2335 } 2336 goto retry_space; 2337 } 2338 SOCKBUF_UNLOCK(&so->so_snd); 2339 2340 /* 2341 * Reduce space in the socket buffer by the size of 2342 * the header mbuf chain. 2343 * hdrlen is set to 0 after the first loop. 2344 */ 2345 space -= hdrlen; 2346 2347 if (vp != NULL) { 2348 error = vn_lock(vp, LK_SHARED); 2349 if (error != 0) 2350 goto done; 2351 error = VOP_GETATTR(vp, &va, td->td_ucred); 2352 if (error != 0 || off >= va.va_size) { 2353 VOP_UNLOCK(vp, 0); 2354 goto done; 2355 } 2356 obj_size = va.va_size; 2357 } 2358 2359 /* 2360 * Loop and construct maximum sized mbuf chain to be bulk 2361 * dumped into socket buffer. 2362 */ 2363 while (space > loopbytes) { 2364 vm_offset_t pgoff; 2365 struct mbuf *m0; 2366 2367 /* 2368 * Calculate the amount to transfer. 2369 * Not to exceed a page, the EOF, 2370 * or the passed in nbytes. 2371 */ 2372 pgoff = (vm_offset_t)(off & PAGE_MASK); 2373 rem = obj_size - offset; 2374 if (nbytes != 0) 2375 rem = omin(rem, nbytes); 2376 rem -= fsbytes + loopbytes; 2377 xfsize = omin(PAGE_SIZE - pgoff, rem); 2378 xfsize = omin(space - loopbytes, xfsize); 2379 if (xfsize <= 0) { 2380 done = 1; /* all data sent */ 2381 break; 2382 } 2383 2384 /* 2385 * Attempt to look up the page. Allocate 2386 * if not found or wait and loop if busy. 2387 */ 2388 if (m != NULL) 2389 nd = EAGAIN; /* send what we already got */ 2390 else if ((flags & SF_NODISKIO) != 0) 2391 nd = EBUSY; 2392 else 2393 nd = 0; 2394 error = sendfile_readpage(obj, vp, nd, off, 2395 xfsize, bsize, td, &pg); 2396 if (error != 0) { 2397 if (error == EAGAIN) 2398 error = 0; /* not a real error */ 2399 break; 2400 } 2401 2402 /* 2403 * Get a sendfile buf. When allocating the 2404 * first buffer for mbuf chain, we usually 2405 * wait as long as necessary, but this wait 2406 * can be interrupted. For consequent 2407 * buffers, do not sleep, since several 2408 * threads might exhaust the buffers and then 2409 * deadlock. 2410 */ 2411 sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT : 2412 SFB_CATCH); 2413 if (sf == NULL) { 2414 SFSTAT_INC(sf_allocfail); 2415 vm_page_lock(pg); 2416 vm_page_unwire(pg, PQ_INACTIVE); 2417 KASSERT(pg->object != NULL, 2418 ("%s: object disappeared", __func__)); 2419 vm_page_unlock(pg); 2420 if (m == NULL) 2421 error = (mnw ? EAGAIN : EINTR); 2422 break; 2423 } 2424 2425 /* 2426 * Get an mbuf and set it up as having 2427 * external storage. 2428 */ 2429 m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA); 2430 if (m0 == NULL) { 2431 error = (mnw ? EAGAIN : ENOBUFS); 2432 sf_ext_free(sf, NULL); 2433 break; 2434 } 2435 /* 2436 * Attach EXT_SFBUF external storage. 2437 */ 2438 m0->m_ext.ext_buf = (caddr_t )sf_buf_kva(sf); 2439 m0->m_ext.ext_size = PAGE_SIZE; 2440 m0->m_ext.ext_arg1 = sf; 2441 m0->m_ext.ext_arg2 = sfs; 2442 m0->m_ext.ext_type = EXT_SFBUF; 2443 m0->m_ext.ext_flags = 0; 2444 m0->m_flags |= (M_EXT|M_RDONLY); 2445 m0->m_data = (char *)sf_buf_kva(sf) + pgoff; 2446 m0->m_len = xfsize; 2447 2448 /* Append to mbuf chain. */ 2449 if (mtail != NULL) 2450 mtail->m_next = m0; 2451 else if (m != NULL) 2452 m_last(m)->m_next = m0; 2453 else 2454 m = m0; 2455 mtail = m0; 2456 2457 /* Keep track of bits processed. */ 2458 loopbytes += xfsize; 2459 off += xfsize; 2460 2461 if (sfs != NULL) { 2462 mtx_lock(&sfs->mtx); 2463 sfs->count++; 2464 mtx_unlock(&sfs->mtx); 2465 } 2466 } 2467 2468 if (vp != NULL) 2469 VOP_UNLOCK(vp, 0); 2470 2471 /* Add the buffer chain to the socket buffer. */ 2472 if (m != NULL) { 2473 int mlen, err; 2474 2475 mlen = m_length(m, NULL); 2476 SOCKBUF_LOCK(&so->so_snd); 2477 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2478 error = EPIPE; 2479 SOCKBUF_UNLOCK(&so->so_snd); 2480 goto done; 2481 } 2482 SOCKBUF_UNLOCK(&so->so_snd); 2483 CURVNET_SET(so->so_vnet); 2484 /* Avoid error aliasing. */ 2485 err = (*so->so_proto->pr_usrreqs->pru_send) 2486 (so, 0, m, NULL, NULL, td); 2487 CURVNET_RESTORE(); 2488 if (err == 0) { 2489 /* 2490 * We need two counters to get the 2491 * file offset and nbytes to send 2492 * right: 2493 * - sbytes contains the total amount 2494 * of bytes sent, including headers. 2495 * - fsbytes contains the total amount 2496 * of bytes sent from the file. 2497 */ 2498 sbytes += mlen; 2499 fsbytes += mlen; 2500 if (hdrlen) { 2501 fsbytes -= hdrlen; 2502 hdrlen = 0; 2503 } 2504 } else if (error == 0) 2505 error = err; 2506 m = NULL; /* pru_send always consumes */ 2507 } 2508 2509 /* Quit outer loop on error or when we're done. */ 2510 if (done) 2511 break; 2512 if (error != 0) 2513 goto done; 2514 } 2515 2516 /* 2517 * Send trailers. Wimp out and use writev(2). 2518 */ 2519 if (trl_uio != NULL) { 2520 sbunlock(&so->so_snd); 2521 error = kern_writev(td, sockfd, trl_uio); 2522 if (error == 0) 2523 sbytes += td->td_retval[0]; 2524 goto out; 2525 } 2526 2527 done: 2528 sbunlock(&so->so_snd); 2529 out: 2530 /* 2531 * If there was no error we have to clear td->td_retval[0] 2532 * because it may have been set by writev. 2533 */ 2534 if (error == 0) { 2535 td->td_retval[0] = 0; 2536 } 2537 if (sent != NULL) { 2538 (*sent) = sbytes; 2539 } 2540 if (obj != NULL) 2541 vm_object_deallocate(obj); 2542 if (so) 2543 fdrop(sock_fp, td); 2544 if (m) 2545 m_freem(m); 2546 2547 if (sfs != NULL) { 2548 mtx_lock(&sfs->mtx); 2549 if (sfs->count != 0) 2550 cv_wait(&sfs->cv, &sfs->mtx); 2551 KASSERT(sfs->count == 0, ("sendfile sync still busy")); 2552 cv_destroy(&sfs->cv); 2553 mtx_destroy(&sfs->mtx); 2554 free(sfs, M_TEMP); 2555 } 2556 2557 if (error == ERESTART) 2558 error = EINTR; 2559 2560 return (error); 2561 } 2562