1 /*- 2 * Copyright (c) 1982, 1986, 1989, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * sendfile(2) and related extensions: 6 * Copyright (c) 1998, David Greenman. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 4. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 33 */ 34 35 #include <sys/cdefs.h> 36 __FBSDID("$FreeBSD$"); 37 38 #include "opt_capsicum.h" 39 #include "opt_inet.h" 40 #include "opt_inet6.h" 41 #include "opt_compat.h" 42 #include "opt_ktrace.h" 43 44 #include <sys/param.h> 45 #include <sys/systm.h> 46 #include <sys/capsicum.h> 47 #include <sys/condvar.h> 48 #include <sys/kernel.h> 49 #include <sys/lock.h> 50 #include <sys/mutex.h> 51 #include <sys/sysproto.h> 52 #include <sys/malloc.h> 53 #include <sys/filedesc.h> 54 #include <sys/event.h> 55 #include <sys/proc.h> 56 #include <sys/fcntl.h> 57 #include <sys/file.h> 58 #include <sys/filio.h> 59 #include <sys/jail.h> 60 #include <sys/mman.h> 61 #include <sys/mount.h> 62 #include <sys/mbuf.h> 63 #include <sys/protosw.h> 64 #include <sys/rwlock.h> 65 #include <sys/sf_buf.h> 66 #include <sys/sysent.h> 67 #include <sys/socket.h> 68 #include <sys/socketvar.h> 69 #include <sys/signalvar.h> 70 #include <sys/syscallsubr.h> 71 #include <sys/sysctl.h> 72 #include <sys/uio.h> 73 #include <sys/vnode.h> 74 #ifdef KTRACE 75 #include <sys/ktrace.h> 76 #endif 77 #ifdef COMPAT_FREEBSD32 78 #include <compat/freebsd32/freebsd32_util.h> 79 #endif 80 81 #include <net/vnet.h> 82 83 #include <security/audit/audit.h> 84 #include <security/mac/mac_framework.h> 85 86 #include <vm/vm.h> 87 #include <vm/vm_param.h> 88 #include <vm/vm_object.h> 89 #include <vm/vm_page.h> 90 #include <vm/vm_pager.h> 91 #include <vm/vm_kern.h> 92 #include <vm/vm_extern.h> 93 #include <vm/uma.h> 94 95 /* 96 * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC 97 * and SOCK_NONBLOCK. 98 */ 99 #define ACCEPT4_INHERIT 0x1 100 #define ACCEPT4_COMPAT 0x2 101 102 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); 103 static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); 104 105 static int accept1(struct thread *td, int s, struct sockaddr *uname, 106 socklen_t *anamelen, int flags); 107 static int do_sendfile(struct thread *td, struct sendfile_args *uap, 108 int compat); 109 static int getsockname1(struct thread *td, struct getsockname_args *uap, 110 int compat); 111 static int getpeername1(struct thread *td, struct getpeername_args *uap, 112 int compat); 113 114 counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)]; 115 116 /* 117 * sendfile(2)-related variables and associated sysctls 118 */ 119 static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0, 120 "sendfile(2) tunables"); 121 static int sfreadahead = 1; 122 SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW, 123 &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks"); 124 125 static void 126 sfstat_init(const void *unused) 127 { 128 129 COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t), 130 M_WAITOK); 131 } 132 SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL); 133 134 static int 135 sfstat_sysctl(SYSCTL_HANDLER_ARGS) 136 { 137 struct sfstat s; 138 139 COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t)); 140 if (req->newptr) 141 COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t)); 142 return (SYSCTL_OUT(req, &s, sizeof(s))); 143 } 144 SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW, 145 NULL, 0, sfstat_sysctl, "I", "sendfile statistics"); 146 147 /* 148 * Convert a user file descriptor to a kernel file entry and check if required 149 * capability rights are present. 150 * A reference on the file entry is held upon returning. 151 */ 152 int 153 getsock_cap(struct filedesc *fdp, int fd, cap_rights_t *rightsp, 154 struct file **fpp, u_int *fflagp) 155 { 156 struct file *fp; 157 int error; 158 159 error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL); 160 if (error != 0) 161 return (error); 162 if (fp->f_type != DTYPE_SOCKET) { 163 fdrop(fp, curthread); 164 return (ENOTSOCK); 165 } 166 if (fflagp != NULL) 167 *fflagp = fp->f_flag; 168 *fpp = fp; 169 return (0); 170 } 171 172 /* 173 * System call interface to the socket abstraction. 174 */ 175 #if defined(COMPAT_43) 176 #define COMPAT_OLDSOCK 177 #endif 178 179 int 180 sys_socket(td, uap) 181 struct thread *td; 182 struct socket_args /* { 183 int domain; 184 int type; 185 int protocol; 186 } */ *uap; 187 { 188 struct socket *so; 189 struct file *fp; 190 int fd, error, type, oflag, fflag; 191 192 AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol); 193 194 type = uap->type; 195 oflag = 0; 196 fflag = 0; 197 if ((type & SOCK_CLOEXEC) != 0) { 198 type &= ~SOCK_CLOEXEC; 199 oflag |= O_CLOEXEC; 200 } 201 if ((type & SOCK_NONBLOCK) != 0) { 202 type &= ~SOCK_NONBLOCK; 203 fflag |= FNONBLOCK; 204 } 205 206 #ifdef MAC 207 error = mac_socket_check_create(td->td_ucred, uap->domain, type, 208 uap->protocol); 209 if (error != 0) 210 return (error); 211 #endif 212 error = falloc(td, &fp, &fd, oflag); 213 if (error != 0) 214 return (error); 215 /* An extra reference on `fp' has been held for us by falloc(). */ 216 error = socreate(uap->domain, &so, type, uap->protocol, 217 td->td_ucred, td); 218 if (error != 0) { 219 fdclose(td->td_proc->p_fd, fp, fd, td); 220 } else { 221 finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops); 222 if ((fflag & FNONBLOCK) != 0) 223 (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td); 224 td->td_retval[0] = fd; 225 } 226 fdrop(fp, td); 227 return (error); 228 } 229 230 /* ARGSUSED */ 231 int 232 sys_bind(td, uap) 233 struct thread *td; 234 struct bind_args /* { 235 int s; 236 caddr_t name; 237 int namelen; 238 } */ *uap; 239 { 240 struct sockaddr *sa; 241 int error; 242 243 error = getsockaddr(&sa, uap->name, uap->namelen); 244 if (error == 0) { 245 error = kern_bind(td, uap->s, sa); 246 free(sa, M_SONAME); 247 } 248 return (error); 249 } 250 251 static int 252 kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) 253 { 254 struct socket *so; 255 struct file *fp; 256 cap_rights_t rights; 257 int error; 258 259 AUDIT_ARG_FD(fd); 260 AUDIT_ARG_SOCKADDR(td, dirfd, sa); 261 error = getsock_cap(td->td_proc->p_fd, fd, 262 cap_rights_init(&rights, CAP_BIND), &fp, NULL); 263 if (error != 0) 264 return (error); 265 so = fp->f_data; 266 #ifdef KTRACE 267 if (KTRPOINT(td, KTR_STRUCT)) 268 ktrsockaddr(sa); 269 #endif 270 #ifdef MAC 271 error = mac_socket_check_bind(td->td_ucred, so, sa); 272 if (error == 0) { 273 #endif 274 if (dirfd == AT_FDCWD) 275 error = sobind(so, sa, td); 276 else 277 error = sobindat(dirfd, so, sa, td); 278 #ifdef MAC 279 } 280 #endif 281 fdrop(fp, td); 282 return (error); 283 } 284 285 int 286 kern_bind(struct thread *td, int fd, struct sockaddr *sa) 287 { 288 289 return (kern_bindat(td, AT_FDCWD, fd, sa)); 290 } 291 292 /* ARGSUSED */ 293 int 294 sys_bindat(td, uap) 295 struct thread *td; 296 struct bindat_args /* { 297 int fd; 298 int s; 299 caddr_t name; 300 int namelen; 301 } */ *uap; 302 { 303 struct sockaddr *sa; 304 int error; 305 306 error = getsockaddr(&sa, uap->name, uap->namelen); 307 if (error == 0) { 308 error = kern_bindat(td, uap->fd, uap->s, sa); 309 free(sa, M_SONAME); 310 } 311 return (error); 312 } 313 314 /* ARGSUSED */ 315 int 316 sys_listen(td, uap) 317 struct thread *td; 318 struct listen_args /* { 319 int s; 320 int backlog; 321 } */ *uap; 322 { 323 struct socket *so; 324 struct file *fp; 325 cap_rights_t rights; 326 int error; 327 328 AUDIT_ARG_FD(uap->s); 329 error = getsock_cap(td->td_proc->p_fd, uap->s, 330 cap_rights_init(&rights, CAP_LISTEN), &fp, NULL); 331 if (error == 0) { 332 so = fp->f_data; 333 #ifdef MAC 334 error = mac_socket_check_listen(td->td_ucred, so); 335 if (error == 0) 336 #endif 337 error = solisten(so, uap->backlog, td); 338 fdrop(fp, td); 339 } 340 return(error); 341 } 342 343 /* 344 * accept1() 345 */ 346 static int 347 accept1(td, s, uname, anamelen, flags) 348 struct thread *td; 349 int s; 350 struct sockaddr *uname; 351 socklen_t *anamelen; 352 int flags; 353 { 354 struct sockaddr *name; 355 socklen_t namelen; 356 struct file *fp; 357 int error; 358 359 if (uname == NULL) 360 return (kern_accept4(td, s, NULL, NULL, flags, NULL)); 361 362 error = copyin(anamelen, &namelen, sizeof (namelen)); 363 if (error != 0) 364 return (error); 365 366 error = kern_accept4(td, s, &name, &namelen, flags, &fp); 367 368 if (error != 0) 369 return (error); 370 371 if (error == 0 && uname != NULL) { 372 #ifdef COMPAT_OLDSOCK 373 if (flags & ACCEPT4_COMPAT) 374 ((struct osockaddr *)name)->sa_family = 375 name->sa_family; 376 #endif 377 error = copyout(name, uname, namelen); 378 } 379 if (error == 0) 380 error = copyout(&namelen, anamelen, 381 sizeof(namelen)); 382 if (error != 0) 383 fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td); 384 fdrop(fp, td); 385 free(name, M_SONAME); 386 return (error); 387 } 388 389 int 390 kern_accept(struct thread *td, int s, struct sockaddr **name, 391 socklen_t *namelen, struct file **fp) 392 { 393 return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp)); 394 } 395 396 int 397 kern_accept4(struct thread *td, int s, struct sockaddr **name, 398 socklen_t *namelen, int flags, struct file **fp) 399 { 400 struct filedesc *fdp; 401 struct file *headfp, *nfp = NULL; 402 struct sockaddr *sa = NULL; 403 struct socket *head, *so; 404 cap_rights_t rights; 405 u_int fflag; 406 pid_t pgid; 407 int error, fd, tmp; 408 409 if (name != NULL) 410 *name = NULL; 411 412 AUDIT_ARG_FD(s); 413 fdp = td->td_proc->p_fd; 414 error = getsock_cap(fdp, s, cap_rights_init(&rights, CAP_ACCEPT), 415 &headfp, &fflag); 416 if (error != 0) 417 return (error); 418 head = headfp->f_data; 419 if ((head->so_options & SO_ACCEPTCONN) == 0) { 420 error = EINVAL; 421 goto done; 422 } 423 #ifdef MAC 424 error = mac_socket_check_accept(td->td_ucred, head); 425 if (error != 0) 426 goto done; 427 #endif 428 error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0); 429 if (error != 0) 430 goto done; 431 ACCEPT_LOCK(); 432 if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { 433 ACCEPT_UNLOCK(); 434 error = EWOULDBLOCK; 435 goto noconnection; 436 } 437 while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { 438 if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { 439 head->so_error = ECONNABORTED; 440 break; 441 } 442 error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, 443 "accept", 0); 444 if (error != 0) { 445 ACCEPT_UNLOCK(); 446 goto noconnection; 447 } 448 } 449 if (head->so_error) { 450 error = head->so_error; 451 head->so_error = 0; 452 ACCEPT_UNLOCK(); 453 goto noconnection; 454 } 455 so = TAILQ_FIRST(&head->so_comp); 456 KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); 457 KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); 458 459 /* 460 * Before changing the flags on the socket, we have to bump the 461 * reference count. Otherwise, if the protocol calls sofree(), 462 * the socket will be released due to a zero refcount. 463 */ 464 SOCK_LOCK(so); /* soref() and so_state update */ 465 soref(so); /* file descriptor reference */ 466 467 TAILQ_REMOVE(&head->so_comp, so, so_list); 468 head->so_qlen--; 469 if (flags & ACCEPT4_INHERIT) 470 so->so_state |= (head->so_state & SS_NBIO); 471 else 472 so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; 473 so->so_qstate &= ~SQ_COMP; 474 so->so_head = NULL; 475 476 SOCK_UNLOCK(so); 477 ACCEPT_UNLOCK(); 478 479 /* An extra reference on `nfp' has been held for us by falloc(). */ 480 td->td_retval[0] = fd; 481 482 /* connection has been removed from the listen queue */ 483 KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); 484 485 if (flags & ACCEPT4_INHERIT) { 486 pgid = fgetown(&head->so_sigio); 487 if (pgid != 0) 488 fsetown(pgid, &so->so_sigio); 489 } else { 490 fflag &= ~(FNONBLOCK | FASYNC); 491 if (flags & SOCK_NONBLOCK) 492 fflag |= FNONBLOCK; 493 } 494 495 finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); 496 /* Sync socket nonblocking/async state with file flags */ 497 tmp = fflag & FNONBLOCK; 498 (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); 499 tmp = fflag & FASYNC; 500 (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); 501 sa = 0; 502 error = soaccept(so, &sa); 503 if (error != 0) 504 goto noconnection; 505 if (sa == NULL) { 506 if (name) 507 *namelen = 0; 508 goto done; 509 } 510 AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa); 511 if (name) { 512 /* check sa_len before it is destroyed */ 513 if (*namelen > sa->sa_len) 514 *namelen = sa->sa_len; 515 #ifdef KTRACE 516 if (KTRPOINT(td, KTR_STRUCT)) 517 ktrsockaddr(sa); 518 #endif 519 *name = sa; 520 sa = NULL; 521 } 522 noconnection: 523 free(sa, M_SONAME); 524 525 /* 526 * close the new descriptor, assuming someone hasn't ripped it 527 * out from under us. 528 */ 529 if (error != 0) 530 fdclose(fdp, nfp, fd, td); 531 532 /* 533 * Release explicitly held references before returning. We return 534 * a reference on nfp to the caller on success if they request it. 535 */ 536 done: 537 if (fp != NULL) { 538 if (error == 0) { 539 *fp = nfp; 540 nfp = NULL; 541 } else 542 *fp = NULL; 543 } 544 if (nfp != NULL) 545 fdrop(nfp, td); 546 fdrop(headfp, td); 547 return (error); 548 } 549 550 int 551 sys_accept(td, uap) 552 struct thread *td; 553 struct accept_args *uap; 554 { 555 556 return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT)); 557 } 558 559 int 560 sys_accept4(td, uap) 561 struct thread *td; 562 struct accept4_args *uap; 563 { 564 565 if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) 566 return (EINVAL); 567 568 return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags)); 569 } 570 571 #ifdef COMPAT_OLDSOCK 572 int 573 oaccept(td, uap) 574 struct thread *td; 575 struct accept_args *uap; 576 { 577 578 return (accept1(td, uap->s, uap->name, uap->anamelen, 579 ACCEPT4_INHERIT | ACCEPT4_COMPAT)); 580 } 581 #endif /* COMPAT_OLDSOCK */ 582 583 /* ARGSUSED */ 584 int 585 sys_connect(td, uap) 586 struct thread *td; 587 struct connect_args /* { 588 int s; 589 caddr_t name; 590 int namelen; 591 } */ *uap; 592 { 593 struct sockaddr *sa; 594 int error; 595 596 error = getsockaddr(&sa, uap->name, uap->namelen); 597 if (error == 0) { 598 error = kern_connect(td, uap->s, sa); 599 free(sa, M_SONAME); 600 } 601 return (error); 602 } 603 604 static int 605 kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) 606 { 607 struct socket *so; 608 struct file *fp; 609 cap_rights_t rights; 610 int error, interrupted = 0; 611 612 AUDIT_ARG_FD(fd); 613 AUDIT_ARG_SOCKADDR(td, dirfd, sa); 614 error = getsock_cap(td->td_proc->p_fd, fd, 615 cap_rights_init(&rights, CAP_CONNECT), &fp, NULL); 616 if (error != 0) 617 return (error); 618 so = fp->f_data; 619 if (so->so_state & SS_ISCONNECTING) { 620 error = EALREADY; 621 goto done1; 622 } 623 #ifdef KTRACE 624 if (KTRPOINT(td, KTR_STRUCT)) 625 ktrsockaddr(sa); 626 #endif 627 #ifdef MAC 628 error = mac_socket_check_connect(td->td_ucred, so, sa); 629 if (error != 0) 630 goto bad; 631 #endif 632 if (dirfd == AT_FDCWD) 633 error = soconnect(so, sa, td); 634 else 635 error = soconnectat(dirfd, so, sa, td); 636 if (error != 0) 637 goto bad; 638 if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { 639 error = EINPROGRESS; 640 goto done1; 641 } 642 SOCK_LOCK(so); 643 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { 644 error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, 645 "connec", 0); 646 if (error != 0) { 647 if (error == EINTR || error == ERESTART) 648 interrupted = 1; 649 break; 650 } 651 } 652 if (error == 0) { 653 error = so->so_error; 654 so->so_error = 0; 655 } 656 SOCK_UNLOCK(so); 657 bad: 658 if (!interrupted) 659 so->so_state &= ~SS_ISCONNECTING; 660 if (error == ERESTART) 661 error = EINTR; 662 done1: 663 fdrop(fp, td); 664 return (error); 665 } 666 667 int 668 kern_connect(struct thread *td, int fd, struct sockaddr *sa) 669 { 670 671 return (kern_connectat(td, AT_FDCWD, fd, sa)); 672 } 673 674 /* ARGSUSED */ 675 int 676 sys_connectat(td, uap) 677 struct thread *td; 678 struct connectat_args /* { 679 int fd; 680 int s; 681 caddr_t name; 682 int namelen; 683 } */ *uap; 684 { 685 struct sockaddr *sa; 686 int error; 687 688 error = getsockaddr(&sa, uap->name, uap->namelen); 689 if (error == 0) { 690 error = kern_connectat(td, uap->fd, uap->s, sa); 691 free(sa, M_SONAME); 692 } 693 return (error); 694 } 695 696 int 697 kern_socketpair(struct thread *td, int domain, int type, int protocol, 698 int *rsv) 699 { 700 struct filedesc *fdp = td->td_proc->p_fd; 701 struct file *fp1, *fp2; 702 struct socket *so1, *so2; 703 int fd, error, oflag, fflag; 704 705 AUDIT_ARG_SOCKET(domain, type, protocol); 706 707 oflag = 0; 708 fflag = 0; 709 if ((type & SOCK_CLOEXEC) != 0) { 710 type &= ~SOCK_CLOEXEC; 711 oflag |= O_CLOEXEC; 712 } 713 if ((type & SOCK_NONBLOCK) != 0) { 714 type &= ~SOCK_NONBLOCK; 715 fflag |= FNONBLOCK; 716 } 717 #ifdef MAC 718 /* We might want to have a separate check for socket pairs. */ 719 error = mac_socket_check_create(td->td_ucred, domain, type, 720 protocol); 721 if (error != 0) 722 return (error); 723 #endif 724 error = socreate(domain, &so1, type, protocol, td->td_ucred, td); 725 if (error != 0) 726 return (error); 727 error = socreate(domain, &so2, type, protocol, td->td_ucred, td); 728 if (error != 0) 729 goto free1; 730 /* On success extra reference to `fp1' and 'fp2' is set by falloc. */ 731 error = falloc(td, &fp1, &fd, oflag); 732 if (error != 0) 733 goto free2; 734 rsv[0] = fd; 735 fp1->f_data = so1; /* so1 already has ref count */ 736 error = falloc(td, &fp2, &fd, oflag); 737 if (error != 0) 738 goto free3; 739 fp2->f_data = so2; /* so2 already has ref count */ 740 rsv[1] = fd; 741 error = soconnect2(so1, so2); 742 if (error != 0) 743 goto free4; 744 if (type == SOCK_DGRAM) { 745 /* 746 * Datagram socket connection is asymmetric. 747 */ 748 error = soconnect2(so2, so1); 749 if (error != 0) 750 goto free4; 751 } 752 finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data, 753 &socketops); 754 finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data, 755 &socketops); 756 if ((fflag & FNONBLOCK) != 0) { 757 (void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td); 758 (void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td); 759 } 760 fdrop(fp1, td); 761 fdrop(fp2, td); 762 return (0); 763 free4: 764 fdclose(fdp, fp2, rsv[1], td); 765 fdrop(fp2, td); 766 free3: 767 fdclose(fdp, fp1, rsv[0], td); 768 fdrop(fp1, td); 769 free2: 770 if (so2 != NULL) 771 (void)soclose(so2); 772 free1: 773 if (so1 != NULL) 774 (void)soclose(so1); 775 return (error); 776 } 777 778 int 779 sys_socketpair(struct thread *td, struct socketpair_args *uap) 780 { 781 int error, sv[2]; 782 783 error = kern_socketpair(td, uap->domain, uap->type, 784 uap->protocol, sv); 785 if (error != 0) 786 return (error); 787 error = copyout(sv, uap->rsv, 2 * sizeof(int)); 788 if (error != 0) { 789 (void)kern_close(td, sv[0]); 790 (void)kern_close(td, sv[1]); 791 } 792 return (error); 793 } 794 795 static int 796 sendit(td, s, mp, flags) 797 struct thread *td; 798 int s; 799 struct msghdr *mp; 800 int flags; 801 { 802 struct mbuf *control; 803 struct sockaddr *to; 804 int error; 805 806 #ifdef CAPABILITY_MODE 807 if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL)) 808 return (ECAPMODE); 809 #endif 810 811 if (mp->msg_name != NULL) { 812 error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); 813 if (error != 0) { 814 to = NULL; 815 goto bad; 816 } 817 mp->msg_name = to; 818 } else { 819 to = NULL; 820 } 821 822 if (mp->msg_control) { 823 if (mp->msg_controllen < sizeof(struct cmsghdr) 824 #ifdef COMPAT_OLDSOCK 825 && mp->msg_flags != MSG_COMPAT 826 #endif 827 ) { 828 error = EINVAL; 829 goto bad; 830 } 831 error = sockargs(&control, mp->msg_control, 832 mp->msg_controllen, MT_CONTROL); 833 if (error != 0) 834 goto bad; 835 #ifdef COMPAT_OLDSOCK 836 if (mp->msg_flags == MSG_COMPAT) { 837 struct cmsghdr *cm; 838 839 M_PREPEND(control, sizeof(*cm), M_WAITOK); 840 cm = mtod(control, struct cmsghdr *); 841 cm->cmsg_len = control->m_len; 842 cm->cmsg_level = SOL_SOCKET; 843 cm->cmsg_type = SCM_RIGHTS; 844 } 845 #endif 846 } else { 847 control = NULL; 848 } 849 850 error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE); 851 852 bad: 853 free(to, M_SONAME); 854 return (error); 855 } 856 857 int 858 kern_sendit(td, s, mp, flags, control, segflg) 859 struct thread *td; 860 int s; 861 struct msghdr *mp; 862 int flags; 863 struct mbuf *control; 864 enum uio_seg segflg; 865 { 866 struct file *fp; 867 struct uio auio; 868 struct iovec *iov; 869 struct socket *so; 870 cap_rights_t rights; 871 #ifdef KTRACE 872 struct uio *ktruio = NULL; 873 #endif 874 ssize_t len; 875 int i, error; 876 877 AUDIT_ARG_FD(s); 878 cap_rights_init(&rights, CAP_SEND); 879 if (mp->msg_name != NULL) { 880 AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name); 881 cap_rights_set(&rights, CAP_CONNECT); 882 } 883 error = getsock_cap(td->td_proc->p_fd, s, &rights, &fp, NULL); 884 if (error != 0) 885 return (error); 886 so = (struct socket *)fp->f_data; 887 888 #ifdef KTRACE 889 if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT)) 890 ktrsockaddr(mp->msg_name); 891 #endif 892 #ifdef MAC 893 if (mp->msg_name != NULL) { 894 error = mac_socket_check_connect(td->td_ucred, so, 895 mp->msg_name); 896 if (error != 0) 897 goto bad; 898 } 899 error = mac_socket_check_send(td->td_ucred, so); 900 if (error != 0) 901 goto bad; 902 #endif 903 904 auio.uio_iov = mp->msg_iov; 905 auio.uio_iovcnt = mp->msg_iovlen; 906 auio.uio_segflg = segflg; 907 auio.uio_rw = UIO_WRITE; 908 auio.uio_td = td; 909 auio.uio_offset = 0; /* XXX */ 910 auio.uio_resid = 0; 911 iov = mp->msg_iov; 912 for (i = 0; i < mp->msg_iovlen; i++, iov++) { 913 if ((auio.uio_resid += iov->iov_len) < 0) { 914 error = EINVAL; 915 goto bad; 916 } 917 } 918 #ifdef KTRACE 919 if (KTRPOINT(td, KTR_GENIO)) 920 ktruio = cloneuio(&auio); 921 #endif 922 len = auio.uio_resid; 923 error = sosend(so, mp->msg_name, &auio, 0, control, flags, td); 924 if (error != 0) { 925 if (auio.uio_resid != len && (error == ERESTART || 926 error == EINTR || error == EWOULDBLOCK)) 927 error = 0; 928 /* Generation of SIGPIPE can be controlled per socket */ 929 if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && 930 !(flags & MSG_NOSIGNAL)) { 931 PROC_LOCK(td->td_proc); 932 tdsignal(td, SIGPIPE); 933 PROC_UNLOCK(td->td_proc); 934 } 935 } 936 if (error == 0) 937 td->td_retval[0] = len - auio.uio_resid; 938 #ifdef KTRACE 939 if (ktruio != NULL) { 940 ktruio->uio_resid = td->td_retval[0]; 941 ktrgenio(s, UIO_WRITE, ktruio, error); 942 } 943 #endif 944 bad: 945 fdrop(fp, td); 946 return (error); 947 } 948 949 int 950 sys_sendto(td, uap) 951 struct thread *td; 952 struct sendto_args /* { 953 int s; 954 caddr_t buf; 955 size_t len; 956 int flags; 957 caddr_t to; 958 int tolen; 959 } */ *uap; 960 { 961 struct msghdr msg; 962 struct iovec aiov; 963 964 msg.msg_name = uap->to; 965 msg.msg_namelen = uap->tolen; 966 msg.msg_iov = &aiov; 967 msg.msg_iovlen = 1; 968 msg.msg_control = 0; 969 #ifdef COMPAT_OLDSOCK 970 msg.msg_flags = 0; 971 #endif 972 aiov.iov_base = uap->buf; 973 aiov.iov_len = uap->len; 974 return (sendit(td, uap->s, &msg, uap->flags)); 975 } 976 977 #ifdef COMPAT_OLDSOCK 978 int 979 osend(td, uap) 980 struct thread *td; 981 struct osend_args /* { 982 int s; 983 caddr_t buf; 984 int len; 985 int flags; 986 } */ *uap; 987 { 988 struct msghdr msg; 989 struct iovec aiov; 990 991 msg.msg_name = 0; 992 msg.msg_namelen = 0; 993 msg.msg_iov = &aiov; 994 msg.msg_iovlen = 1; 995 aiov.iov_base = uap->buf; 996 aiov.iov_len = uap->len; 997 msg.msg_control = 0; 998 msg.msg_flags = 0; 999 return (sendit(td, uap->s, &msg, uap->flags)); 1000 } 1001 1002 int 1003 osendmsg(td, uap) 1004 struct thread *td; 1005 struct osendmsg_args /* { 1006 int s; 1007 caddr_t msg; 1008 int flags; 1009 } */ *uap; 1010 { 1011 struct msghdr msg; 1012 struct iovec *iov; 1013 int error; 1014 1015 error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); 1016 if (error != 0) 1017 return (error); 1018 error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); 1019 if (error != 0) 1020 return (error); 1021 msg.msg_iov = iov; 1022 msg.msg_flags = MSG_COMPAT; 1023 error = sendit(td, uap->s, &msg, uap->flags); 1024 free(iov, M_IOV); 1025 return (error); 1026 } 1027 #endif 1028 1029 int 1030 sys_sendmsg(td, uap) 1031 struct thread *td; 1032 struct sendmsg_args /* { 1033 int s; 1034 caddr_t msg; 1035 int flags; 1036 } */ *uap; 1037 { 1038 struct msghdr msg; 1039 struct iovec *iov; 1040 int error; 1041 1042 error = copyin(uap->msg, &msg, sizeof (msg)); 1043 if (error != 0) 1044 return (error); 1045 error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); 1046 if (error != 0) 1047 return (error); 1048 msg.msg_iov = iov; 1049 #ifdef COMPAT_OLDSOCK 1050 msg.msg_flags = 0; 1051 #endif 1052 error = sendit(td, uap->s, &msg, uap->flags); 1053 free(iov, M_IOV); 1054 return (error); 1055 } 1056 1057 int 1058 kern_recvit(td, s, mp, fromseg, controlp) 1059 struct thread *td; 1060 int s; 1061 struct msghdr *mp; 1062 enum uio_seg fromseg; 1063 struct mbuf **controlp; 1064 { 1065 struct uio auio; 1066 struct iovec *iov; 1067 struct mbuf *m, *control = NULL; 1068 caddr_t ctlbuf; 1069 struct file *fp; 1070 struct socket *so; 1071 struct sockaddr *fromsa = NULL; 1072 cap_rights_t rights; 1073 #ifdef KTRACE 1074 struct uio *ktruio = NULL; 1075 #endif 1076 ssize_t len; 1077 int error, i; 1078 1079 if (controlp != NULL) 1080 *controlp = NULL; 1081 1082 AUDIT_ARG_FD(s); 1083 error = getsock_cap(td->td_proc->p_fd, s, 1084 cap_rights_init(&rights, CAP_RECV), &fp, NULL); 1085 if (error != 0) 1086 return (error); 1087 so = fp->f_data; 1088 1089 #ifdef MAC 1090 error = mac_socket_check_receive(td->td_ucred, so); 1091 if (error != 0) { 1092 fdrop(fp, td); 1093 return (error); 1094 } 1095 #endif 1096 1097 auio.uio_iov = mp->msg_iov; 1098 auio.uio_iovcnt = mp->msg_iovlen; 1099 auio.uio_segflg = UIO_USERSPACE; 1100 auio.uio_rw = UIO_READ; 1101 auio.uio_td = td; 1102 auio.uio_offset = 0; /* XXX */ 1103 auio.uio_resid = 0; 1104 iov = mp->msg_iov; 1105 for (i = 0; i < mp->msg_iovlen; i++, iov++) { 1106 if ((auio.uio_resid += iov->iov_len) < 0) { 1107 fdrop(fp, td); 1108 return (EINVAL); 1109 } 1110 } 1111 #ifdef KTRACE 1112 if (KTRPOINT(td, KTR_GENIO)) 1113 ktruio = cloneuio(&auio); 1114 #endif 1115 len = auio.uio_resid; 1116 error = soreceive(so, &fromsa, &auio, NULL, 1117 (mp->msg_control || controlp) ? &control : NULL, 1118 &mp->msg_flags); 1119 if (error != 0) { 1120 if (auio.uio_resid != len && (error == ERESTART || 1121 error == EINTR || error == EWOULDBLOCK)) 1122 error = 0; 1123 } 1124 if (fromsa != NULL) 1125 AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa); 1126 #ifdef KTRACE 1127 if (ktruio != NULL) { 1128 ktruio->uio_resid = len - auio.uio_resid; 1129 ktrgenio(s, UIO_READ, ktruio, error); 1130 } 1131 #endif 1132 if (error != 0) 1133 goto out; 1134 td->td_retval[0] = len - auio.uio_resid; 1135 if (mp->msg_name) { 1136 len = mp->msg_namelen; 1137 if (len <= 0 || fromsa == NULL) 1138 len = 0; 1139 else { 1140 /* save sa_len before it is destroyed by MSG_COMPAT */ 1141 len = MIN(len, fromsa->sa_len); 1142 #ifdef COMPAT_OLDSOCK 1143 if (mp->msg_flags & MSG_COMPAT) 1144 ((struct osockaddr *)fromsa)->sa_family = 1145 fromsa->sa_family; 1146 #endif 1147 if (fromseg == UIO_USERSPACE) { 1148 error = copyout(fromsa, mp->msg_name, 1149 (unsigned)len); 1150 if (error != 0) 1151 goto out; 1152 } else 1153 bcopy(fromsa, mp->msg_name, len); 1154 } 1155 mp->msg_namelen = len; 1156 } 1157 if (mp->msg_control && controlp == NULL) { 1158 #ifdef COMPAT_OLDSOCK 1159 /* 1160 * We assume that old recvmsg calls won't receive access 1161 * rights and other control info, esp. as control info 1162 * is always optional and those options didn't exist in 4.3. 1163 * If we receive rights, trim the cmsghdr; anything else 1164 * is tossed. 1165 */ 1166 if (control && mp->msg_flags & MSG_COMPAT) { 1167 if (mtod(control, struct cmsghdr *)->cmsg_level != 1168 SOL_SOCKET || 1169 mtod(control, struct cmsghdr *)->cmsg_type != 1170 SCM_RIGHTS) { 1171 mp->msg_controllen = 0; 1172 goto out; 1173 } 1174 control->m_len -= sizeof (struct cmsghdr); 1175 control->m_data += sizeof (struct cmsghdr); 1176 } 1177 #endif 1178 len = mp->msg_controllen; 1179 m = control; 1180 mp->msg_controllen = 0; 1181 ctlbuf = mp->msg_control; 1182 1183 while (m && len > 0) { 1184 unsigned int tocopy; 1185 1186 if (len >= m->m_len) 1187 tocopy = m->m_len; 1188 else { 1189 mp->msg_flags |= MSG_CTRUNC; 1190 tocopy = len; 1191 } 1192 1193 if ((error = copyout(mtod(m, caddr_t), 1194 ctlbuf, tocopy)) != 0) 1195 goto out; 1196 1197 ctlbuf += tocopy; 1198 len -= tocopy; 1199 m = m->m_next; 1200 } 1201 mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; 1202 } 1203 out: 1204 fdrop(fp, td); 1205 #ifdef KTRACE 1206 if (fromsa && KTRPOINT(td, KTR_STRUCT)) 1207 ktrsockaddr(fromsa); 1208 #endif 1209 free(fromsa, M_SONAME); 1210 1211 if (error == 0 && controlp != NULL) 1212 *controlp = control; 1213 else if (control) 1214 m_freem(control); 1215 1216 return (error); 1217 } 1218 1219 static int 1220 recvit(td, s, mp, namelenp) 1221 struct thread *td; 1222 int s; 1223 struct msghdr *mp; 1224 void *namelenp; 1225 { 1226 int error; 1227 1228 error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL); 1229 if (error != 0) 1230 return (error); 1231 if (namelenp != NULL) { 1232 error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); 1233 #ifdef COMPAT_OLDSOCK 1234 if (mp->msg_flags & MSG_COMPAT) 1235 error = 0; /* old recvfrom didn't check */ 1236 #endif 1237 } 1238 return (error); 1239 } 1240 1241 int 1242 sys_recvfrom(td, uap) 1243 struct thread *td; 1244 struct recvfrom_args /* { 1245 int s; 1246 caddr_t buf; 1247 size_t len; 1248 int flags; 1249 struct sockaddr * __restrict from; 1250 socklen_t * __restrict fromlenaddr; 1251 } */ *uap; 1252 { 1253 struct msghdr msg; 1254 struct iovec aiov; 1255 int error; 1256 1257 if (uap->fromlenaddr) { 1258 error = copyin(uap->fromlenaddr, 1259 &msg.msg_namelen, sizeof (msg.msg_namelen)); 1260 if (error != 0) 1261 goto done2; 1262 } else { 1263 msg.msg_namelen = 0; 1264 } 1265 msg.msg_name = uap->from; 1266 msg.msg_iov = &aiov; 1267 msg.msg_iovlen = 1; 1268 aiov.iov_base = uap->buf; 1269 aiov.iov_len = uap->len; 1270 msg.msg_control = 0; 1271 msg.msg_flags = uap->flags; 1272 error = recvit(td, uap->s, &msg, uap->fromlenaddr); 1273 done2: 1274 return (error); 1275 } 1276 1277 #ifdef COMPAT_OLDSOCK 1278 int 1279 orecvfrom(td, uap) 1280 struct thread *td; 1281 struct recvfrom_args *uap; 1282 { 1283 1284 uap->flags |= MSG_COMPAT; 1285 return (sys_recvfrom(td, uap)); 1286 } 1287 #endif 1288 1289 #ifdef COMPAT_OLDSOCK 1290 int 1291 orecv(td, uap) 1292 struct thread *td; 1293 struct orecv_args /* { 1294 int s; 1295 caddr_t buf; 1296 int len; 1297 int flags; 1298 } */ *uap; 1299 { 1300 struct msghdr msg; 1301 struct iovec aiov; 1302 1303 msg.msg_name = 0; 1304 msg.msg_namelen = 0; 1305 msg.msg_iov = &aiov; 1306 msg.msg_iovlen = 1; 1307 aiov.iov_base = uap->buf; 1308 aiov.iov_len = uap->len; 1309 msg.msg_control = 0; 1310 msg.msg_flags = uap->flags; 1311 return (recvit(td, uap->s, &msg, NULL)); 1312 } 1313 1314 /* 1315 * Old recvmsg. This code takes advantage of the fact that the old msghdr 1316 * overlays the new one, missing only the flags, and with the (old) access 1317 * rights where the control fields are now. 1318 */ 1319 int 1320 orecvmsg(td, uap) 1321 struct thread *td; 1322 struct orecvmsg_args /* { 1323 int s; 1324 struct omsghdr *msg; 1325 int flags; 1326 } */ *uap; 1327 { 1328 struct msghdr msg; 1329 struct iovec *iov; 1330 int error; 1331 1332 error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); 1333 if (error != 0) 1334 return (error); 1335 error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); 1336 if (error != 0) 1337 return (error); 1338 msg.msg_flags = uap->flags | MSG_COMPAT; 1339 msg.msg_iov = iov; 1340 error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); 1341 if (msg.msg_controllen && error == 0) 1342 error = copyout(&msg.msg_controllen, 1343 &uap->msg->msg_accrightslen, sizeof (int)); 1344 free(iov, M_IOV); 1345 return (error); 1346 } 1347 #endif 1348 1349 int 1350 sys_recvmsg(td, uap) 1351 struct thread *td; 1352 struct recvmsg_args /* { 1353 int s; 1354 struct msghdr *msg; 1355 int flags; 1356 } */ *uap; 1357 { 1358 struct msghdr msg; 1359 struct iovec *uiov, *iov; 1360 int error; 1361 1362 error = copyin(uap->msg, &msg, sizeof (msg)); 1363 if (error != 0) 1364 return (error); 1365 error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); 1366 if (error != 0) 1367 return (error); 1368 msg.msg_flags = uap->flags; 1369 #ifdef COMPAT_OLDSOCK 1370 msg.msg_flags &= ~MSG_COMPAT; 1371 #endif 1372 uiov = msg.msg_iov; 1373 msg.msg_iov = iov; 1374 error = recvit(td, uap->s, &msg, NULL); 1375 if (error == 0) { 1376 msg.msg_iov = uiov; 1377 error = copyout(&msg, uap->msg, sizeof(msg)); 1378 } 1379 free(iov, M_IOV); 1380 return (error); 1381 } 1382 1383 /* ARGSUSED */ 1384 int 1385 sys_shutdown(td, uap) 1386 struct thread *td; 1387 struct shutdown_args /* { 1388 int s; 1389 int how; 1390 } */ *uap; 1391 { 1392 struct socket *so; 1393 struct file *fp; 1394 cap_rights_t rights; 1395 int error; 1396 1397 AUDIT_ARG_FD(uap->s); 1398 error = getsock_cap(td->td_proc->p_fd, uap->s, 1399 cap_rights_init(&rights, CAP_SHUTDOWN), &fp, NULL); 1400 if (error == 0) { 1401 so = fp->f_data; 1402 error = soshutdown(so, uap->how); 1403 fdrop(fp, td); 1404 } 1405 return (error); 1406 } 1407 1408 /* ARGSUSED */ 1409 int 1410 sys_setsockopt(td, uap) 1411 struct thread *td; 1412 struct setsockopt_args /* { 1413 int s; 1414 int level; 1415 int name; 1416 caddr_t val; 1417 int valsize; 1418 } */ *uap; 1419 { 1420 1421 return (kern_setsockopt(td, uap->s, uap->level, uap->name, 1422 uap->val, UIO_USERSPACE, uap->valsize)); 1423 } 1424 1425 int 1426 kern_setsockopt(td, s, level, name, val, valseg, valsize) 1427 struct thread *td; 1428 int s; 1429 int level; 1430 int name; 1431 void *val; 1432 enum uio_seg valseg; 1433 socklen_t valsize; 1434 { 1435 struct socket *so; 1436 struct file *fp; 1437 struct sockopt sopt; 1438 cap_rights_t rights; 1439 int error; 1440 1441 if (val == NULL && valsize != 0) 1442 return (EFAULT); 1443 if ((int)valsize < 0) 1444 return (EINVAL); 1445 1446 sopt.sopt_dir = SOPT_SET; 1447 sopt.sopt_level = level; 1448 sopt.sopt_name = name; 1449 sopt.sopt_val = val; 1450 sopt.sopt_valsize = valsize; 1451 switch (valseg) { 1452 case UIO_USERSPACE: 1453 sopt.sopt_td = td; 1454 break; 1455 case UIO_SYSSPACE: 1456 sopt.sopt_td = NULL; 1457 break; 1458 default: 1459 panic("kern_setsockopt called with bad valseg"); 1460 } 1461 1462 AUDIT_ARG_FD(s); 1463 error = getsock_cap(td->td_proc->p_fd, s, 1464 cap_rights_init(&rights, CAP_SETSOCKOPT), &fp, NULL); 1465 if (error == 0) { 1466 so = fp->f_data; 1467 error = sosetopt(so, &sopt); 1468 fdrop(fp, td); 1469 } 1470 return(error); 1471 } 1472 1473 /* ARGSUSED */ 1474 int 1475 sys_getsockopt(td, uap) 1476 struct thread *td; 1477 struct getsockopt_args /* { 1478 int s; 1479 int level; 1480 int name; 1481 void * __restrict val; 1482 socklen_t * __restrict avalsize; 1483 } */ *uap; 1484 { 1485 socklen_t valsize; 1486 int error; 1487 1488 if (uap->val) { 1489 error = copyin(uap->avalsize, &valsize, sizeof (valsize)); 1490 if (error != 0) 1491 return (error); 1492 } 1493 1494 error = kern_getsockopt(td, uap->s, uap->level, uap->name, 1495 uap->val, UIO_USERSPACE, &valsize); 1496 1497 if (error == 0) 1498 error = copyout(&valsize, uap->avalsize, sizeof (valsize)); 1499 return (error); 1500 } 1501 1502 /* 1503 * Kernel version of getsockopt. 1504 * optval can be a userland or userspace. optlen is always a kernel pointer. 1505 */ 1506 int 1507 kern_getsockopt(td, s, level, name, val, valseg, valsize) 1508 struct thread *td; 1509 int s; 1510 int level; 1511 int name; 1512 void *val; 1513 enum uio_seg valseg; 1514 socklen_t *valsize; 1515 { 1516 struct socket *so; 1517 struct file *fp; 1518 struct sockopt sopt; 1519 cap_rights_t rights; 1520 int error; 1521 1522 if (val == NULL) 1523 *valsize = 0; 1524 if ((int)*valsize < 0) 1525 return (EINVAL); 1526 1527 sopt.sopt_dir = SOPT_GET; 1528 sopt.sopt_level = level; 1529 sopt.sopt_name = name; 1530 sopt.sopt_val = val; 1531 sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */ 1532 switch (valseg) { 1533 case UIO_USERSPACE: 1534 sopt.sopt_td = td; 1535 break; 1536 case UIO_SYSSPACE: 1537 sopt.sopt_td = NULL; 1538 break; 1539 default: 1540 panic("kern_getsockopt called with bad valseg"); 1541 } 1542 1543 AUDIT_ARG_FD(s); 1544 error = getsock_cap(td->td_proc->p_fd, s, 1545 cap_rights_init(&rights, CAP_GETSOCKOPT), &fp, NULL); 1546 if (error == 0) { 1547 so = fp->f_data; 1548 error = sogetopt(so, &sopt); 1549 *valsize = sopt.sopt_valsize; 1550 fdrop(fp, td); 1551 } 1552 return (error); 1553 } 1554 1555 /* 1556 * getsockname1() - Get socket name. 1557 */ 1558 /* ARGSUSED */ 1559 static int 1560 getsockname1(td, uap, compat) 1561 struct thread *td; 1562 struct getsockname_args /* { 1563 int fdes; 1564 struct sockaddr * __restrict asa; 1565 socklen_t * __restrict alen; 1566 } */ *uap; 1567 int compat; 1568 { 1569 struct sockaddr *sa; 1570 socklen_t len; 1571 int error; 1572 1573 error = copyin(uap->alen, &len, sizeof(len)); 1574 if (error != 0) 1575 return (error); 1576 1577 error = kern_getsockname(td, uap->fdes, &sa, &len); 1578 if (error != 0) 1579 return (error); 1580 1581 if (len != 0) { 1582 #ifdef COMPAT_OLDSOCK 1583 if (compat) 1584 ((struct osockaddr *)sa)->sa_family = sa->sa_family; 1585 #endif 1586 error = copyout(sa, uap->asa, (u_int)len); 1587 } 1588 free(sa, M_SONAME); 1589 if (error == 0) 1590 error = copyout(&len, uap->alen, sizeof(len)); 1591 return (error); 1592 } 1593 1594 int 1595 kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, 1596 socklen_t *alen) 1597 { 1598 struct socket *so; 1599 struct file *fp; 1600 cap_rights_t rights; 1601 socklen_t len; 1602 int error; 1603 1604 AUDIT_ARG_FD(fd); 1605 error = getsock_cap(td->td_proc->p_fd, fd, 1606 cap_rights_init(&rights, CAP_GETSOCKNAME), &fp, NULL); 1607 if (error != 0) 1608 return (error); 1609 so = fp->f_data; 1610 *sa = NULL; 1611 CURVNET_SET(so->so_vnet); 1612 error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); 1613 CURVNET_RESTORE(); 1614 if (error != 0) 1615 goto bad; 1616 if (*sa == NULL) 1617 len = 0; 1618 else 1619 len = MIN(*alen, (*sa)->sa_len); 1620 *alen = len; 1621 #ifdef KTRACE 1622 if (KTRPOINT(td, KTR_STRUCT)) 1623 ktrsockaddr(*sa); 1624 #endif 1625 bad: 1626 fdrop(fp, td); 1627 if (error != 0 && *sa != NULL) { 1628 free(*sa, M_SONAME); 1629 *sa = NULL; 1630 } 1631 return (error); 1632 } 1633 1634 int 1635 sys_getsockname(td, uap) 1636 struct thread *td; 1637 struct getsockname_args *uap; 1638 { 1639 1640 return (getsockname1(td, uap, 0)); 1641 } 1642 1643 #ifdef COMPAT_OLDSOCK 1644 int 1645 ogetsockname(td, uap) 1646 struct thread *td; 1647 struct getsockname_args *uap; 1648 { 1649 1650 return (getsockname1(td, uap, 1)); 1651 } 1652 #endif /* COMPAT_OLDSOCK */ 1653 1654 /* 1655 * getpeername1() - Get name of peer for connected socket. 1656 */ 1657 /* ARGSUSED */ 1658 static int 1659 getpeername1(td, uap, compat) 1660 struct thread *td; 1661 struct getpeername_args /* { 1662 int fdes; 1663 struct sockaddr * __restrict asa; 1664 socklen_t * __restrict alen; 1665 } */ *uap; 1666 int compat; 1667 { 1668 struct sockaddr *sa; 1669 socklen_t len; 1670 int error; 1671 1672 error = copyin(uap->alen, &len, sizeof (len)); 1673 if (error != 0) 1674 return (error); 1675 1676 error = kern_getpeername(td, uap->fdes, &sa, &len); 1677 if (error != 0) 1678 return (error); 1679 1680 if (len != 0) { 1681 #ifdef COMPAT_OLDSOCK 1682 if (compat) 1683 ((struct osockaddr *)sa)->sa_family = sa->sa_family; 1684 #endif 1685 error = copyout(sa, uap->asa, (u_int)len); 1686 } 1687 free(sa, M_SONAME); 1688 if (error == 0) 1689 error = copyout(&len, uap->alen, sizeof(len)); 1690 return (error); 1691 } 1692 1693 int 1694 kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, 1695 socklen_t *alen) 1696 { 1697 struct socket *so; 1698 struct file *fp; 1699 cap_rights_t rights; 1700 socklen_t len; 1701 int error; 1702 1703 AUDIT_ARG_FD(fd); 1704 error = getsock_cap(td->td_proc->p_fd, fd, 1705 cap_rights_init(&rights, CAP_GETPEERNAME), &fp, NULL); 1706 if (error != 0) 1707 return (error); 1708 so = fp->f_data; 1709 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { 1710 error = ENOTCONN; 1711 goto done; 1712 } 1713 *sa = NULL; 1714 CURVNET_SET(so->so_vnet); 1715 error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); 1716 CURVNET_RESTORE(); 1717 if (error != 0) 1718 goto bad; 1719 if (*sa == NULL) 1720 len = 0; 1721 else 1722 len = MIN(*alen, (*sa)->sa_len); 1723 *alen = len; 1724 #ifdef KTRACE 1725 if (KTRPOINT(td, KTR_STRUCT)) 1726 ktrsockaddr(*sa); 1727 #endif 1728 bad: 1729 if (error != 0 && *sa != NULL) { 1730 free(*sa, M_SONAME); 1731 *sa = NULL; 1732 } 1733 done: 1734 fdrop(fp, td); 1735 return (error); 1736 } 1737 1738 int 1739 sys_getpeername(td, uap) 1740 struct thread *td; 1741 struct getpeername_args *uap; 1742 { 1743 1744 return (getpeername1(td, uap, 0)); 1745 } 1746 1747 #ifdef COMPAT_OLDSOCK 1748 int 1749 ogetpeername(td, uap) 1750 struct thread *td; 1751 struct ogetpeername_args *uap; 1752 { 1753 1754 /* XXX uap should have type `getpeername_args *' to begin with. */ 1755 return (getpeername1(td, (struct getpeername_args *)uap, 1)); 1756 } 1757 #endif /* COMPAT_OLDSOCK */ 1758 1759 int 1760 sockargs(mp, buf, buflen, type) 1761 struct mbuf **mp; 1762 caddr_t buf; 1763 int buflen, type; 1764 { 1765 struct sockaddr *sa; 1766 struct mbuf *m; 1767 int error; 1768 1769 if (buflen > MLEN) { 1770 #ifdef COMPAT_OLDSOCK 1771 if (type == MT_SONAME && buflen <= 112) 1772 buflen = MLEN; /* unix domain compat. hack */ 1773 else 1774 #endif 1775 if (buflen > MCLBYTES) 1776 return (EINVAL); 1777 } 1778 m = m_get2(buflen, M_WAITOK, type, 0); 1779 m->m_len = buflen; 1780 error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); 1781 if (error != 0) 1782 (void) m_free(m); 1783 else { 1784 *mp = m; 1785 if (type == MT_SONAME) { 1786 sa = mtod(m, struct sockaddr *); 1787 1788 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN 1789 if (sa->sa_family == 0 && sa->sa_len < AF_MAX) 1790 sa->sa_family = sa->sa_len; 1791 #endif 1792 sa->sa_len = buflen; 1793 } 1794 } 1795 return (error); 1796 } 1797 1798 int 1799 getsockaddr(namp, uaddr, len) 1800 struct sockaddr **namp; 1801 caddr_t uaddr; 1802 size_t len; 1803 { 1804 struct sockaddr *sa; 1805 int error; 1806 1807 if (len > SOCK_MAXADDRLEN) 1808 return (ENAMETOOLONG); 1809 if (len < offsetof(struct sockaddr, sa_data[0])) 1810 return (EINVAL); 1811 sa = malloc(len, M_SONAME, M_WAITOK); 1812 error = copyin(uaddr, sa, len); 1813 if (error != 0) { 1814 free(sa, M_SONAME); 1815 } else { 1816 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN 1817 if (sa->sa_family == 0 && sa->sa_len < AF_MAX) 1818 sa->sa_family = sa->sa_len; 1819 #endif 1820 sa->sa_len = len; 1821 *namp = sa; 1822 } 1823 return (error); 1824 } 1825 1826 struct sendfile_sync { 1827 struct mtx mtx; 1828 struct cv cv; 1829 unsigned count; 1830 }; 1831 1832 /* 1833 * Add more references to a vm_page + sf_buf + sendfile_sync. 1834 */ 1835 void 1836 sf_ext_ref(void *arg1, void *arg2) 1837 { 1838 struct sf_buf *sf = arg1; 1839 struct sendfile_sync *sfs = arg2; 1840 vm_page_t pg = sf_buf_page(sf); 1841 1842 sf_buf_ref(sf); 1843 1844 vm_page_lock(pg); 1845 vm_page_wire(pg); 1846 vm_page_unlock(pg); 1847 1848 if (sfs != NULL) { 1849 mtx_lock(&sfs->mtx); 1850 KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0")); 1851 sfs->count++; 1852 mtx_unlock(&sfs->mtx); 1853 } 1854 } 1855 1856 /* 1857 * Detach mapped page and release resources back to the system. 1858 */ 1859 void 1860 sf_ext_free(void *arg1, void *arg2) 1861 { 1862 struct sf_buf *sf = arg1; 1863 struct sendfile_sync *sfs = arg2; 1864 vm_page_t pg = sf_buf_page(sf); 1865 1866 sf_buf_free(sf); 1867 1868 vm_page_lock(pg); 1869 vm_page_unwire(pg, PQ_INACTIVE); 1870 /* 1871 * Check for the object going away on us. This can 1872 * happen since we don't hold a reference to it. 1873 * If so, we're responsible for freeing the page. 1874 */ 1875 if (pg->wire_count == 0 && pg->object == NULL) 1876 vm_page_free(pg); 1877 vm_page_unlock(pg); 1878 1879 if (sfs != NULL) { 1880 mtx_lock(&sfs->mtx); 1881 KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0")); 1882 if (--sfs->count == 0) 1883 cv_signal(&sfs->cv); 1884 mtx_unlock(&sfs->mtx); 1885 } 1886 } 1887 1888 /* 1889 * sendfile(2) 1890 * 1891 * int sendfile(int fd, int s, off_t offset, size_t nbytes, 1892 * struct sf_hdtr *hdtr, off_t *sbytes, int flags) 1893 * 1894 * Send a file specified by 'fd' and starting at 'offset' to a socket 1895 * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes == 1896 * 0. Optionally add a header and/or trailer to the socket output. If 1897 * specified, write the total number of bytes sent into *sbytes. 1898 */ 1899 int 1900 sys_sendfile(struct thread *td, struct sendfile_args *uap) 1901 { 1902 1903 return (do_sendfile(td, uap, 0)); 1904 } 1905 1906 static int 1907 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat) 1908 { 1909 struct sf_hdtr hdtr; 1910 struct uio *hdr_uio, *trl_uio; 1911 struct file *fp; 1912 cap_rights_t rights; 1913 off_t sbytes; 1914 int error; 1915 1916 /* 1917 * File offset must be positive. If it goes beyond EOF 1918 * we send only the header/trailer and no payload data. 1919 */ 1920 if (uap->offset < 0) 1921 return (EINVAL); 1922 1923 hdr_uio = trl_uio = NULL; 1924 1925 if (uap->hdtr != NULL) { 1926 error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); 1927 if (error != 0) 1928 goto out; 1929 if (hdtr.headers != NULL) { 1930 error = copyinuio(hdtr.headers, hdtr.hdr_cnt, 1931 &hdr_uio); 1932 if (error != 0) 1933 goto out; 1934 } 1935 if (hdtr.trailers != NULL) { 1936 error = copyinuio(hdtr.trailers, hdtr.trl_cnt, 1937 &trl_uio); 1938 if (error != 0) 1939 goto out; 1940 } 1941 } 1942 1943 AUDIT_ARG_FD(src_fd); 1944 1945 /* 1946 * sendfile(2) can start at any offset within a file so we require 1947 * CAP_READ+CAP_SEEK = CAP_PREAD. 1948 */ 1949 if ((error = fget_read(td, uap->fd, 1950 cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) { 1951 goto out; 1952 } 1953 1954 error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset, 1955 uap->nbytes, &sbytes, uap->flags, compat ? SFK_COMPAT : 0, td); 1956 fdrop(fp, td); 1957 1958 if (uap->sbytes != NULL) 1959 copyout(&sbytes, uap->sbytes, sizeof(off_t)); 1960 1961 out: 1962 free(hdr_uio, M_IOV); 1963 free(trl_uio, M_IOV); 1964 return (error); 1965 } 1966 1967 #ifdef COMPAT_FREEBSD4 1968 int 1969 freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap) 1970 { 1971 struct sendfile_args args; 1972 1973 args.fd = uap->fd; 1974 args.s = uap->s; 1975 args.offset = uap->offset; 1976 args.nbytes = uap->nbytes; 1977 args.hdtr = uap->hdtr; 1978 args.sbytes = uap->sbytes; 1979 args.flags = uap->flags; 1980 1981 return (do_sendfile(td, &args, 1)); 1982 } 1983 #endif /* COMPAT_FREEBSD4 */ 1984 1985 static int 1986 sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd, 1987 off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res) 1988 { 1989 vm_page_t m; 1990 vm_pindex_t pindex; 1991 ssize_t resid; 1992 int error, readahead, rv; 1993 1994 pindex = OFF_TO_IDX(off); 1995 VM_OBJECT_WLOCK(obj); 1996 m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY | 1997 VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL); 1998 1999 /* 2000 * Check if page is valid for what we need, otherwise initiate I/O. 2001 * 2002 * The non-zero nd argument prevents disk I/O, instead we 2003 * return the caller what he specified in nd. In particular, 2004 * if we already turned some pages into mbufs, nd == EAGAIN 2005 * and the main function send them the pages before we come 2006 * here again and block. 2007 */ 2008 if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) { 2009 if (vp == NULL) 2010 vm_page_xunbusy(m); 2011 VM_OBJECT_WUNLOCK(obj); 2012 *res = m; 2013 return (0); 2014 } else if (nd != 0) { 2015 if (vp == NULL) 2016 vm_page_xunbusy(m); 2017 error = nd; 2018 goto free_page; 2019 } 2020 2021 /* 2022 * Get the page from backing store. 2023 */ 2024 error = 0; 2025 if (vp != NULL) { 2026 VM_OBJECT_WUNLOCK(obj); 2027 readahead = sfreadahead * MAXBSIZE; 2028 2029 /* 2030 * Use vn_rdwr() instead of the pager interface for 2031 * the vnode, to allow the read-ahead. 2032 * 2033 * XXXMAC: Because we don't have fp->f_cred here, we 2034 * pass in NOCRED. This is probably wrong, but is 2035 * consistent with our original implementation. 2036 */ 2037 error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off), 2038 UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead / 2039 bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td); 2040 SFSTAT_INC(sf_iocnt); 2041 VM_OBJECT_WLOCK(obj); 2042 } else { 2043 if (vm_pager_has_page(obj, pindex, NULL, NULL)) { 2044 rv = vm_pager_get_pages(obj, &m, 1, 0); 2045 SFSTAT_INC(sf_iocnt); 2046 m = vm_page_lookup(obj, pindex); 2047 if (m == NULL) 2048 error = EIO; 2049 else if (rv != VM_PAGER_OK) { 2050 vm_page_lock(m); 2051 vm_page_free(m); 2052 vm_page_unlock(m); 2053 m = NULL; 2054 error = EIO; 2055 } 2056 } else { 2057 pmap_zero_page(m); 2058 m->valid = VM_PAGE_BITS_ALL; 2059 m->dirty = 0; 2060 } 2061 if (m != NULL) 2062 vm_page_xunbusy(m); 2063 } 2064 if (error == 0) { 2065 *res = m; 2066 } else if (m != NULL) { 2067 free_page: 2068 vm_page_lock(m); 2069 vm_page_unwire(m, PQ_INACTIVE); 2070 2071 /* 2072 * See if anyone else might know about this page. If 2073 * not and it is not valid, then free it. 2074 */ 2075 if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m)) 2076 vm_page_free(m); 2077 vm_page_unlock(m); 2078 } 2079 KASSERT(error != 0 || (m->wire_count > 0 && 2080 vm_page_is_valid(m, off & PAGE_MASK, xfsize)), 2081 ("wrong page state m %p off %#jx xfsize %d", m, (uintmax_t)off, 2082 xfsize)); 2083 VM_OBJECT_WUNLOCK(obj); 2084 return (error); 2085 } 2086 2087 static int 2088 sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res, 2089 struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size, 2090 int *bsize) 2091 { 2092 struct vattr va; 2093 vm_object_t obj; 2094 struct vnode *vp; 2095 struct shmfd *shmfd; 2096 int error; 2097 2098 vp = *vp_res = NULL; 2099 obj = NULL; 2100 shmfd = *shmfd_res = NULL; 2101 *bsize = 0; 2102 2103 /* 2104 * The file descriptor must be a regular file and have a 2105 * backing VM object. 2106 */ 2107 if (fp->f_type == DTYPE_VNODE) { 2108 vp = fp->f_vnode; 2109 vn_lock(vp, LK_SHARED | LK_RETRY); 2110 if (vp->v_type != VREG) { 2111 error = EINVAL; 2112 goto out; 2113 } 2114 *bsize = vp->v_mount->mnt_stat.f_iosize; 2115 error = VOP_GETATTR(vp, &va, td->td_ucred); 2116 if (error != 0) 2117 goto out; 2118 *obj_size = va.va_size; 2119 obj = vp->v_object; 2120 if (obj == NULL) { 2121 error = EINVAL; 2122 goto out; 2123 } 2124 } else if (fp->f_type == DTYPE_SHM) { 2125 shmfd = fp->f_data; 2126 obj = shmfd->shm_object; 2127 *obj_size = shmfd->shm_size; 2128 } else { 2129 error = EINVAL; 2130 goto out; 2131 } 2132 2133 VM_OBJECT_WLOCK(obj); 2134 if ((obj->flags & OBJ_DEAD) != 0) { 2135 VM_OBJECT_WUNLOCK(obj); 2136 error = EBADF; 2137 goto out; 2138 } 2139 2140 /* 2141 * Temporarily increase the backing VM object's reference 2142 * count so that a forced reclamation of its vnode does not 2143 * immediately destroy it. 2144 */ 2145 vm_object_reference_locked(obj); 2146 VM_OBJECT_WUNLOCK(obj); 2147 *obj_res = obj; 2148 *vp_res = vp; 2149 *shmfd_res = shmfd; 2150 2151 out: 2152 if (vp != NULL) 2153 VOP_UNLOCK(vp, 0); 2154 return (error); 2155 } 2156 2157 static int 2158 kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp, 2159 struct socket **so) 2160 { 2161 cap_rights_t rights; 2162 int error; 2163 2164 *sock_fp = NULL; 2165 *so = NULL; 2166 2167 /* 2168 * The socket must be a stream socket and connected. 2169 */ 2170 error = getsock_cap(td->td_proc->p_fd, s, cap_rights_init(&rights, 2171 CAP_SEND), sock_fp, NULL); 2172 if (error != 0) 2173 return (error); 2174 *so = (*sock_fp)->f_data; 2175 if ((*so)->so_type != SOCK_STREAM) 2176 return (EINVAL); 2177 if (((*so)->so_state & SS_ISCONNECTED) == 0) 2178 return (ENOTCONN); 2179 return (0); 2180 } 2181 2182 int 2183 vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, 2184 struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, 2185 int kflags, struct thread *td) 2186 { 2187 struct file *sock_fp; 2188 struct vnode *vp; 2189 struct vm_object *obj; 2190 struct socket *so; 2191 struct mbuf *m; 2192 struct sf_buf *sf; 2193 struct vm_page *pg; 2194 struct shmfd *shmfd; 2195 struct sendfile_sync *sfs; 2196 struct vattr va; 2197 off_t off, xfsize, fsbytes, sbytes, rem, obj_size; 2198 int error, bsize, nd, hdrlen, mnw; 2199 2200 pg = NULL; 2201 obj = NULL; 2202 so = NULL; 2203 m = NULL; 2204 sfs = NULL; 2205 fsbytes = sbytes = 0; 2206 hdrlen = mnw = 0; 2207 rem = nbytes; 2208 obj_size = 0; 2209 2210 error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize); 2211 if (error != 0) 2212 return (error); 2213 if (rem == 0) 2214 rem = obj_size; 2215 2216 error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so); 2217 if (error != 0) 2218 goto out; 2219 2220 /* 2221 * Do not wait on memory allocations but return ENOMEM for 2222 * caller to retry later. 2223 * XXX: Experimental. 2224 */ 2225 if (flags & SF_MNOWAIT) 2226 mnw = 1; 2227 2228 if (flags & SF_SYNC) { 2229 sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO); 2230 mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF); 2231 cv_init(&sfs->cv, "sendfile"); 2232 } 2233 2234 #ifdef MAC 2235 error = mac_socket_check_send(td->td_ucred, so); 2236 if (error != 0) 2237 goto out; 2238 #endif 2239 2240 /* If headers are specified copy them into mbufs. */ 2241 if (hdr_uio != NULL) { 2242 hdr_uio->uio_td = td; 2243 hdr_uio->uio_rw = UIO_WRITE; 2244 if (hdr_uio->uio_resid > 0) { 2245 /* 2246 * In FBSD < 5.0 the nbytes to send also included 2247 * the header. If compat is specified subtract the 2248 * header size from nbytes. 2249 */ 2250 if (kflags & SFK_COMPAT) { 2251 if (nbytes > hdr_uio->uio_resid) 2252 nbytes -= hdr_uio->uio_resid; 2253 else 2254 nbytes = 0; 2255 } 2256 m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK), 2257 0, 0, 0); 2258 if (m == NULL) { 2259 error = mnw ? EAGAIN : ENOBUFS; 2260 goto out; 2261 } 2262 hdrlen = m_length(m, NULL); 2263 } 2264 } 2265 2266 /* 2267 * Protect against multiple writers to the socket. 2268 * 2269 * XXXRW: Historically this has assumed non-interruptibility, so now 2270 * we implement that, but possibly shouldn't. 2271 */ 2272 (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR); 2273 2274 /* 2275 * Loop through the pages of the file, starting with the requested 2276 * offset. Get a file page (do I/O if necessary), map the file page 2277 * into an sf_buf, attach an mbuf header to the sf_buf, and queue 2278 * it on the socket. 2279 * This is done in two loops. The inner loop turns as many pages 2280 * as it can, up to available socket buffer space, without blocking 2281 * into mbufs to have it bulk delivered into the socket send buffer. 2282 * The outer loop checks the state and available space of the socket 2283 * and takes care of the overall progress. 2284 */ 2285 for (off = offset; ; ) { 2286 struct mbuf *mtail; 2287 int loopbytes; 2288 int space; 2289 int done; 2290 2291 if ((nbytes != 0 && nbytes == fsbytes) || 2292 (nbytes == 0 && obj_size == fsbytes)) 2293 break; 2294 2295 mtail = NULL; 2296 loopbytes = 0; 2297 space = 0; 2298 done = 0; 2299 2300 /* 2301 * Check the socket state for ongoing connection, 2302 * no errors and space in socket buffer. 2303 * If space is low allow for the remainder of the 2304 * file to be processed if it fits the socket buffer. 2305 * Otherwise block in waiting for sufficient space 2306 * to proceed, or if the socket is nonblocking, return 2307 * to userland with EAGAIN while reporting how far 2308 * we've come. 2309 * We wait until the socket buffer has significant free 2310 * space to do bulk sends. This makes good use of file 2311 * system read ahead and allows packet segmentation 2312 * offloading hardware to take over lots of work. If 2313 * we were not careful here we would send off only one 2314 * sfbuf at a time. 2315 */ 2316 SOCKBUF_LOCK(&so->so_snd); 2317 if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2) 2318 so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; 2319 retry_space: 2320 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2321 error = EPIPE; 2322 SOCKBUF_UNLOCK(&so->so_snd); 2323 goto done; 2324 } else if (so->so_error) { 2325 error = so->so_error; 2326 so->so_error = 0; 2327 SOCKBUF_UNLOCK(&so->so_snd); 2328 goto done; 2329 } 2330 space = sbspace(&so->so_snd); 2331 if (space < rem && 2332 (space <= 0 || 2333 space < so->so_snd.sb_lowat)) { 2334 if (so->so_state & SS_NBIO) { 2335 SOCKBUF_UNLOCK(&so->so_snd); 2336 error = EAGAIN; 2337 goto done; 2338 } 2339 /* 2340 * sbwait drops the lock while sleeping. 2341 * When we loop back to retry_space the 2342 * state may have changed and we retest 2343 * for it. 2344 */ 2345 error = sbwait(&so->so_snd); 2346 /* 2347 * An error from sbwait usually indicates that we've 2348 * been interrupted by a signal. If we've sent anything 2349 * then return bytes sent, otherwise return the error. 2350 */ 2351 if (error != 0) { 2352 SOCKBUF_UNLOCK(&so->so_snd); 2353 goto done; 2354 } 2355 goto retry_space; 2356 } 2357 SOCKBUF_UNLOCK(&so->so_snd); 2358 2359 /* 2360 * Reduce space in the socket buffer by the size of 2361 * the header mbuf chain. 2362 * hdrlen is set to 0 after the first loop. 2363 */ 2364 space -= hdrlen; 2365 2366 if (vp != NULL) { 2367 error = vn_lock(vp, LK_SHARED); 2368 if (error != 0) 2369 goto done; 2370 error = VOP_GETATTR(vp, &va, td->td_ucred); 2371 if (error != 0 || off >= va.va_size) { 2372 VOP_UNLOCK(vp, 0); 2373 goto done; 2374 } 2375 obj_size = va.va_size; 2376 } 2377 2378 /* 2379 * Loop and construct maximum sized mbuf chain to be bulk 2380 * dumped into socket buffer. 2381 */ 2382 while (space > loopbytes) { 2383 vm_offset_t pgoff; 2384 struct mbuf *m0; 2385 2386 /* 2387 * Calculate the amount to transfer. 2388 * Not to exceed a page, the EOF, 2389 * or the passed in nbytes. 2390 */ 2391 pgoff = (vm_offset_t)(off & PAGE_MASK); 2392 rem = obj_size - offset; 2393 if (nbytes != 0) 2394 rem = omin(rem, nbytes); 2395 rem -= fsbytes + loopbytes; 2396 xfsize = omin(PAGE_SIZE - pgoff, rem); 2397 xfsize = omin(space - loopbytes, xfsize); 2398 if (xfsize <= 0) { 2399 done = 1; /* all data sent */ 2400 break; 2401 } 2402 2403 /* 2404 * Attempt to look up the page. Allocate 2405 * if not found or wait and loop if busy. 2406 */ 2407 if (m != NULL) 2408 nd = EAGAIN; /* send what we already got */ 2409 else if ((flags & SF_NODISKIO) != 0) 2410 nd = EBUSY; 2411 else 2412 nd = 0; 2413 error = sendfile_readpage(obj, vp, nd, off, 2414 xfsize, bsize, td, &pg); 2415 if (error != 0) { 2416 if (error == EAGAIN) 2417 error = 0; /* not a real error */ 2418 break; 2419 } 2420 2421 /* 2422 * Get a sendfile buf. When allocating the 2423 * first buffer for mbuf chain, we usually 2424 * wait as long as necessary, but this wait 2425 * can be interrupted. For consequent 2426 * buffers, do not sleep, since several 2427 * threads might exhaust the buffers and then 2428 * deadlock. 2429 */ 2430 sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT : 2431 SFB_CATCH); 2432 if (sf == NULL) { 2433 SFSTAT_INC(sf_allocfail); 2434 vm_page_lock(pg); 2435 vm_page_unwire(pg, PQ_INACTIVE); 2436 KASSERT(pg->object != NULL, 2437 ("%s: object disappeared", __func__)); 2438 vm_page_unlock(pg); 2439 if (m == NULL) 2440 error = (mnw ? EAGAIN : EINTR); 2441 break; 2442 } 2443 2444 /* 2445 * Get an mbuf and set it up as having 2446 * external storage. 2447 */ 2448 m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA); 2449 if (m0 == NULL) { 2450 error = (mnw ? EAGAIN : ENOBUFS); 2451 sf_ext_free(sf, NULL); 2452 break; 2453 } 2454 /* 2455 * Attach EXT_SFBUF external storage. 2456 */ 2457 m0->m_ext.ext_buf = (caddr_t )sf_buf_kva(sf); 2458 m0->m_ext.ext_size = PAGE_SIZE; 2459 m0->m_ext.ext_arg1 = sf; 2460 m0->m_ext.ext_arg2 = sfs; 2461 m0->m_ext.ext_type = EXT_SFBUF; 2462 m0->m_ext.ext_flags = 0; 2463 m0->m_flags |= (M_EXT|M_RDONLY); 2464 m0->m_data = (char *)sf_buf_kva(sf) + pgoff; 2465 m0->m_len = xfsize; 2466 2467 /* Append to mbuf chain. */ 2468 if (mtail != NULL) 2469 mtail->m_next = m0; 2470 else if (m != NULL) 2471 m_last(m)->m_next = m0; 2472 else 2473 m = m0; 2474 mtail = m0; 2475 2476 /* Keep track of bits processed. */ 2477 loopbytes += xfsize; 2478 off += xfsize; 2479 2480 if (sfs != NULL) { 2481 mtx_lock(&sfs->mtx); 2482 sfs->count++; 2483 mtx_unlock(&sfs->mtx); 2484 } 2485 } 2486 2487 if (vp != NULL) 2488 VOP_UNLOCK(vp, 0); 2489 2490 /* Add the buffer chain to the socket buffer. */ 2491 if (m != NULL) { 2492 int mlen, err; 2493 2494 mlen = m_length(m, NULL); 2495 SOCKBUF_LOCK(&so->so_snd); 2496 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2497 error = EPIPE; 2498 SOCKBUF_UNLOCK(&so->so_snd); 2499 goto done; 2500 } 2501 SOCKBUF_UNLOCK(&so->so_snd); 2502 CURVNET_SET(so->so_vnet); 2503 /* Avoid error aliasing. */ 2504 err = (*so->so_proto->pr_usrreqs->pru_send) 2505 (so, 0, m, NULL, NULL, td); 2506 CURVNET_RESTORE(); 2507 if (err == 0) { 2508 /* 2509 * We need two counters to get the 2510 * file offset and nbytes to send 2511 * right: 2512 * - sbytes contains the total amount 2513 * of bytes sent, including headers. 2514 * - fsbytes contains the total amount 2515 * of bytes sent from the file. 2516 */ 2517 sbytes += mlen; 2518 fsbytes += mlen; 2519 if (hdrlen) { 2520 fsbytes -= hdrlen; 2521 hdrlen = 0; 2522 } 2523 } else if (error == 0) 2524 error = err; 2525 m = NULL; /* pru_send always consumes */ 2526 } 2527 2528 /* Quit outer loop on error or when we're done. */ 2529 if (done) 2530 break; 2531 if (error != 0) 2532 goto done; 2533 } 2534 2535 /* 2536 * Send trailers. Wimp out and use writev(2). 2537 */ 2538 if (trl_uio != NULL) { 2539 sbunlock(&so->so_snd); 2540 error = kern_writev(td, sockfd, trl_uio); 2541 if (error == 0) 2542 sbytes += td->td_retval[0]; 2543 goto out; 2544 } 2545 2546 done: 2547 sbunlock(&so->so_snd); 2548 out: 2549 /* 2550 * If there was no error we have to clear td->td_retval[0] 2551 * because it may have been set by writev. 2552 */ 2553 if (error == 0) { 2554 td->td_retval[0] = 0; 2555 } 2556 if (sent != NULL) { 2557 (*sent) = sbytes; 2558 } 2559 if (obj != NULL) 2560 vm_object_deallocate(obj); 2561 if (so) 2562 fdrop(sock_fp, td); 2563 if (m) 2564 m_freem(m); 2565 2566 if (sfs != NULL) { 2567 mtx_lock(&sfs->mtx); 2568 if (sfs->count != 0) 2569 cv_wait(&sfs->cv, &sfs->mtx); 2570 KASSERT(sfs->count == 0, ("sendfile sync still busy")); 2571 cv_destroy(&sfs->cv); 2572 mtx_destroy(&sfs->mtx); 2573 free(sfs, M_TEMP); 2574 } 2575 2576 if (error == ERESTART) 2577 error = EINTR; 2578 2579 return (error); 2580 } 2581