/*-
 * Copyright (c) 1982, 1986, 1989, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * sendfile(2) and related extensions:
 * Copyright (c) 1998, David Greenman. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_capsicum.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_sctp.h"
#include "opt_compat.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/condvar.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/malloc.h>
#include <sys/filedesc.h>
#include <sys/event.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/jail.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/rwlock.h>
#include <sys/sf_buf.h>
#include <sys/sf_sync.h>
#include <sys/sf_base.h>
#include <sys/sysent.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32_util.h>
#endif

#include <net/vnet.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>

#if defined(INET) || defined(INET6)
#ifdef SCTP
#include <netinet/sctp.h>
#include <netinet/sctp_peeloff.h>
#endif /* SCTP */
#endif /* INET || INET6 */

/*
 * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC
 * and SOCK_NONBLOCK.
 */
#define	ACCEPT4_INHERIT	0x1
#define	ACCEPT4_COMPAT	0x2

static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);

static int accept1(struct thread *td, int s, struct sockaddr *uname,
	    socklen_t *anamelen, int flags);
static int do_sendfile(struct thread *td, struct sendfile_args *uap,
	    int compat);
static int getsockname1(struct thread *td, struct getsockname_args *uap,
	    int compat);
static int getpeername1(struct thread *td, struct getpeername_args *uap,
	    int compat);

counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];

static int filt_sfsync_attach(struct knote *kn);
static void filt_sfsync_detach(struct knote *kn);
static int filt_sfsync(struct knote *kn, long hint);

/*
 * sendfile(2)-related variables and associated sysctls
 */
static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0,
    "sendfile(2) tunables");
static int sfreadahead = 1;
SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW,
    &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks");

#ifdef SFSYNC_DEBUG
static int sf_sync_debug = 0;
SYSCTL_INT(_debug, OID_AUTO, sf_sync_debug, CTLFLAG_RW,
    &sf_sync_debug, 0, "Output debugging during sf_sync lifecycle");
#define	SFSYNC_DPRINTF(s, ...)				\
	do {						\
		if (sf_sync_debug)			\
			printf((s), ##__VA_ARGS__);	\
	} while (0)
#else
#define	SFSYNC_DPRINTF(c, ...)
#endif

static uma_zone_t	zone_sfsync;

static struct filterops sendfile_filtops = {
	.f_isfd = 0,
	.f_attach = filt_sfsync_attach,
	.f_detach = filt_sfsync_detach,
	.f_event = filt_sfsync,
};

static void
sfstat_init(const void *unused)
{

	COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
	    M_WAITOK);
}
SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);

static void
sf_sync_init(const void *unused)
{

	zone_sfsync = uma_zcreate("sendfile_sync",
	    sizeof(struct sendfile_sync), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_CACHE, 0);
	kqueue_add_filteropts(EVFILT_SENDFILE, &sendfile_filtops);
}
SYSINIT(sf_sync, SI_SUB_MBUF, SI_ORDER_FIRST, sf_sync_init, NULL);

static int
sfstat_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct sfstat s;

	COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
	if (req->newptr)
		COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
	return (SYSCTL_OUT(req, &s, sizeof(s)));
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
    NULL, 0, sfstat_sysctl, "I", "sendfile statistics");
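
/*
 * Example (userland, illustrative only; not compiled here): the opaque
 * kern.ipc.sfstat node exported above can be read with sysctl(3), and
 * writing anything to it zeroes the counters, matching the req->newptr
 * check in sfstat_sysctl().  The sf_iocnt field name is assumed from
 * struct sfstat in <sys/sf_buf.h>; adjust to the shipped header.
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <sys/sf_buf.h>
 *	#include <err.h>
 *	#include <stdio.h>
 *
 *	struct sfstat ss;
 *	size_t len = sizeof(ss);
 *
 *	if (sysctlbyname("kern.ipc.sfstat", &ss, &len, NULL, 0) == -1)
 *		err(1, "sysctlbyname");
 *	printf("sendfile disk I/O operations: %ju\n", (uintmax_t)ss.sf_iocnt);
 */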

/*
 * Convert a user file descriptor to a kernel file entry and check if required
 * capability rights are present.
 * A reference on the file entry is held upon returning.
 */
static int
getsock_cap(struct filedesc *fdp, int fd, cap_rights_t *rightsp,
    struct file **fpp, u_int *fflagp)
{
	struct file *fp;
	int error;

	error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
	if (error != 0)
		return (error);
	if (fp->f_type != DTYPE_SOCKET) {
		fdrop(fp, curthread);
		return (ENOTSOCK);
	}
	if (fflagp != NULL)
		*fflagp = fp->f_flag;
	*fpp = fp;
	return (0);
}

/*
 * System call interface to the socket abstraction.
 */
#if defined(COMPAT_43)
#define	COMPAT_OLDSOCK
#endif

int
sys_socket(td, uap)
	struct thread *td;
	struct socket_args /* {
		int	domain;
		int	type;
		int	protocol;
	} */ *uap;
{
	struct socket *so;
	struct file *fp;
	int fd, error, type, oflag, fflag;

	AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);

	type = uap->type;
	oflag = 0;
	fflag = 0;
	if ((type & SOCK_CLOEXEC) != 0) {
		type &= ~SOCK_CLOEXEC;
		oflag |= O_CLOEXEC;
	}
	if ((type & SOCK_NONBLOCK) != 0) {
		type &= ~SOCK_NONBLOCK;
		fflag |= FNONBLOCK;
	}

#ifdef MAC
	error = mac_socket_check_create(td->td_ucred, uap->domain, type,
	    uap->protocol);
	if (error != 0)
		return (error);
#endif
	error = falloc(td, &fp, &fd, oflag);
	if (error != 0)
		return (error);
	/* An extra reference on `fp' has been held for us by falloc(). */
	error = socreate(uap->domain, &so, type, uap->protocol,
	    td->td_ucred, td);
	if (error != 0) {
		fdclose(td->td_proc->p_fd, fp, fd, td);
	} else {
		finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
		if ((fflag & FNONBLOCK) != 0)
			(void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
		td->td_retval[0] = fd;
	}
	fdrop(fp, td);
	return (error);
}
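
/*
 * Example (userland, illustrative): SOCK_CLOEXEC and SOCK_NONBLOCK are
 * or'd into the type argument and stripped by sys_socket() above before
 * the protocol layer ever sees the type.
 *
 *	#include <sys/socket.h>
 *	#include <err.h>
 *
 *	int s;
 *
 *	s = socket(PF_INET, SOCK_STREAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0);
 *	if (s == -1)
 *		err(1, "socket");
 */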

/* ARGSUSED */
int
sys_bind(td, uap)
	struct thread *td;
	struct bind_args /* {
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_bind(td, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}

static int
kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
	error = getsock_cap(td->td_proc->p_fd, fd,
	    cap_rights_init(&rights, CAP_BIND), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(sa);
#endif
#ifdef MAC
	error = mac_socket_check_bind(td->td_ucred, so, sa);
	if (error == 0) {
#endif
		if (dirfd == AT_FDCWD)
			error = sobind(so, sa, td);
		else
			error = sobindat(dirfd, so, sa, td);
#ifdef MAC
	}
#endif
	fdrop(fp, td);
	return (error);
}

int
kern_bind(struct thread *td, int fd, struct sockaddr *sa)
{

	return (kern_bindat(td, AT_FDCWD, fd, sa));
}

/* ARGSUSED */
int
sys_bindat(td, uap)
	struct thread *td;
	struct bindat_args /* {
		int	fd;
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_bindat(td, uap->fd, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}
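
/*
 * Example (userland, illustrative): bindat(2) resolves an AF_UNIX path
 * relative to a directory descriptor, which is what the dirfd != AT_FDCWD
 * branch in kern_bindat() above services.  The "/var/run" directory and
 * "example.sock" name are placeholders.
 *
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <err.h>
 *
 *	struct sockaddr_un sun;
 *	int dfd, s;
 *
 *	dfd = open("/var/run", O_RDONLY | O_DIRECTORY);
 *	s = socket(PF_LOCAL, SOCK_STREAM, 0);
 *	memset(&sun, 0, sizeof(sun));
 *	sun.sun_family = AF_LOCAL;
 *	strlcpy(sun.sun_path, "example.sock", sizeof(sun.sun_path));
 *	sun.sun_len = SUN_LEN(&sun);
 *	if (bindat(dfd, s, (struct sockaddr *)&sun, SUN_LEN(&sun)) == -1)
 *		err(1, "bindat");
 */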

/* ARGSUSED */
int
sys_listen(td, uap)
	struct thread *td;
	struct listen_args /* {
		int	s;
		int	backlog;
	} */ *uap;
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(uap->s);
	error = getsock_cap(td->td_proc->p_fd, uap->s,
	    cap_rights_init(&rights, CAP_LISTEN), &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
#ifdef MAC
		error = mac_socket_check_listen(td->td_ucred, so);
		if (error == 0)
#endif
			error = solisten(so, uap->backlog, td);
		fdrop(fp, td);
	}
	return (error);
}

/*
 * accept1()
 */
static int
accept1(td, s, uname, anamelen, flags)
	struct thread *td;
	int s;
	struct sockaddr *uname;
	socklen_t *anamelen;
	int flags;
{
	struct sockaddr *name;
	socklen_t namelen;
	struct file *fp;
	int error;

	if (uname == NULL)
		return (kern_accept4(td, s, NULL, NULL, flags, NULL));

	error = copyin(anamelen, &namelen, sizeof (namelen));
	if (error != 0)
		return (error);

	error = kern_accept4(td, s, &name, &namelen, flags, &fp);
	if (error != 0)
		return (error);

	/* `uname' is known to be non-NULL here; see the check above. */
#ifdef COMPAT_OLDSOCK
	if (flags & ACCEPT4_COMPAT)
		((struct osockaddr *)name)->sa_family =
		    name->sa_family;
#endif
	error = copyout(name, uname, namelen);
	if (error == 0)
		error = copyout(&namelen, anamelen,
		    sizeof(namelen));
	if (error != 0)
		fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
	fdrop(fp, td);
	free(name, M_SONAME);
	return (error);
}

int
kern_accept(struct thread *td, int s, struct sockaddr **name,
    socklen_t *namelen, struct file **fp)
{

	return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
}

int
kern_accept4(struct thread *td, int s, struct sockaddr **name,
    socklen_t *namelen, int flags, struct file **fp)
{
	struct filedesc *fdp;
	struct file *headfp, *nfp = NULL;
	struct sockaddr *sa = NULL;
	struct socket *head, *so;
	cap_rights_t rights;
	u_int fflag;
	pid_t pgid;
	int error, fd, tmp;

	if (name != NULL)
		*name = NULL;

	AUDIT_ARG_FD(s);
	fdp = td->td_proc->p_fd;
	error = getsock_cap(fdp, s, cap_rights_init(&rights, CAP_ACCEPT),
	    &headfp, &fflag);
	if (error != 0)
		return (error);
	head = headfp->f_data;
	if ((head->so_options & SO_ACCEPTCONN) == 0) {
		error = EINVAL;
		goto done;
	}
#ifdef MAC
	error = mac_socket_check_accept(td->td_ucred, head);
	if (error != 0)
		goto done;
#endif
	error = falloc(td, &nfp, &fd,
	    (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0);
	if (error != 0)
		goto done;
	ACCEPT_LOCK();
	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
		ACCEPT_UNLOCK();
		error = EWOULDBLOCK;
		goto noconnection;
	}
	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
			head->so_error = ECONNABORTED;
			break;
		}
		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
		    "accept", 0);
		if (error != 0) {
			ACCEPT_UNLOCK();
			goto noconnection;
		}
	}
	if (head->so_error) {
		error = head->so_error;
		head->so_error = 0;
		ACCEPT_UNLOCK();
		goto noconnection;
	}
	so = TAILQ_FIRST(&head->so_comp);
	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));

	/*
	 * Before changing the flags on the socket, we have to bump the
	 * reference count.  Otherwise, if the protocol calls sofree(),
	 * the socket will be released due to a zero refcount.
	 */
	SOCK_LOCK(so);			/* soref() and so_state update */
	soref(so);			/* file descriptor reference */

	TAILQ_REMOVE(&head->so_comp, so, so_list);
	head->so_qlen--;
	if (flags & ACCEPT4_INHERIT)
		so->so_state |= (head->so_state & SS_NBIO);
	else
		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
	so->so_qstate &= ~SQ_COMP;
	so->so_head = NULL;

	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();

	/* An extra reference on `nfp' has been held for us by falloc(). */
	td->td_retval[0] = fd;

	/* Connection has been removed from the listen queue. */
	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);

	if (flags & ACCEPT4_INHERIT) {
		pgid = fgetown(&head->so_sigio);
		if (pgid != 0)
			fsetown(pgid, &so->so_sigio);
	} else {
		fflag &= ~(FNONBLOCK | FASYNC);
		if (flags & SOCK_NONBLOCK)
			fflag |= FNONBLOCK;
	}

	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
	/* Sync socket nonblocking/async state with file flags. */
	tmp = fflag & FNONBLOCK;
	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
	tmp = fflag & FASYNC;
	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
	sa = NULL;
	error = soaccept(so, &sa);
	if (error != 0)
		goto noconnection;
	if (sa == NULL) {
		if (name)
			*namelen = 0;
		goto done;
	}
	AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
	if (name) {
		/* Check sa_len before it is destroyed. */
		if (*namelen > sa->sa_len)
			*namelen = sa->sa_len;
#ifdef KTRACE
		if (KTRPOINT(td, KTR_STRUCT))
			ktrsockaddr(sa);
#endif
		*name = sa;
		sa = NULL;
	}
noconnection:
	free(sa, M_SONAME);

	/*
	 * Close the new descriptor, assuming someone hasn't ripped it
	 * out from under us.
	 */
	if (error != 0)
		fdclose(fdp, nfp, fd, td);

	/*
	 * Release explicitly held references before returning.  We return
	 * a reference on nfp to the caller on success if they request it.
	 */
done:
	if (fp != NULL) {
		if (error == 0) {
			*fp = nfp;
			nfp = NULL;
		} else
			*fp = NULL;
	}
	if (nfp != NULL)
		fdrop(nfp, td);
	fdrop(headfp, td);
	return (error);
}

int
sys_accept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{

	return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
}

int
sys_accept4(td, uap)
	struct thread *td;
	struct accept4_args *uap;
{

	if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return (EINVAL);

	return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
}

#ifdef COMPAT_OLDSOCK
int
oaccept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{

	return (accept1(td, uap->s, uap->name, uap->anamelen,
	    ACCEPT4_INHERIT | ACCEPT4_COMPAT));
}
#endif /* COMPAT_OLDSOCK */
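
/*
 * Example (userland, illustrative): accept4(2) reaches kern_accept4()
 * with SOCK_* flags instead of ACCEPT4_INHERIT, so the new socket's
 * nonblocking state is set explicitly rather than inherited from the
 * listening socket `s'.
 *
 *	#include <sys/socket.h>
 *	#include <err.h>
 *
 *	struct sockaddr_storage ss;
 *	socklen_t slen = sizeof(ss);
 *	int ns;
 *
 *	ns = accept4(s, (struct sockaddr *)&ss, &slen,
 *	    SOCK_CLOEXEC | SOCK_NONBLOCK);
 *	if (ns == -1)
 *		err(1, "accept4");
 */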

/* ARGSUSED */
int
sys_connect(td, uap)
	struct thread *td;
	struct connect_args /* {
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_connect(td, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}

static int
kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error, interrupted = 0;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
	error = getsock_cap(td->td_proc->p_fd, fd,
	    cap_rights_init(&rights, CAP_CONNECT), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
	if (so->so_state & SS_ISCONNECTING) {
		error = EALREADY;
		goto done1;
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(sa);
#endif
#ifdef MAC
	error = mac_socket_check_connect(td->td_ucred, so, sa);
	if (error != 0)
		goto bad;
#endif
	if (dirfd == AT_FDCWD)
		error = soconnect(so, sa, td);
	else
		error = soconnectat(dirfd, so, sa, td);
	if (error != 0)
		goto bad;
	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
		error = EINPROGRESS;
		goto done1;
	}
	SOCK_LOCK(so);
	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
		    "connec", 0);
		if (error != 0) {
			if (error == EINTR || error == ERESTART)
				interrupted = 1;
			break;
		}
	}
	if (error == 0) {
		error = so->so_error;
		so->so_error = 0;
	}
	SOCK_UNLOCK(so);
bad:
	if (!interrupted)
		so->so_state &= ~SS_ISCONNECTING;
	if (error == ERESTART)
		error = EINTR;
done1:
	fdrop(fp, td);
	return (error);
}

int
kern_connect(struct thread *td, int fd, struct sockaddr *sa)
{

	return (kern_connectat(td, AT_FDCWD, fd, sa));
}

/* ARGSUSED */
int
sys_connectat(td, uap)
	struct thread *td;
	struct connectat_args /* {
		int	fd;
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_connectat(td, uap->fd, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}
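
/*
 * Example (userland, illustrative): on a nonblocking socket,
 * kern_connectat() above returns EINPROGRESS instead of sleeping on
 * so_timeo; completion is then observed by polling for writability and
 * fetching SO_ERROR.  `s' and `sin' are assumed to be a nonblocking
 * socket and a prepared address.
 *
 *	#include <sys/socket.h>
 *	#include <poll.h>
 *	#include <errno.h>
 *
 *	if (connect(s, (struct sockaddr *)&sin, sizeof(sin)) == -1 &&
 *	    errno == EINPROGRESS) {
 *		struct pollfd pfd = { .fd = s, .events = POLLOUT };
 *		int soerr;
 *		socklen_t elen = sizeof(soerr);
 *
 *		(void)poll(&pfd, 1, -1);
 *		getsockopt(s, SOL_SOCKET, SO_ERROR, &soerr, &elen);
 *	}
 */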

int
kern_socketpair(struct thread *td, int domain, int type, int protocol,
    int *rsv)
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *fp1, *fp2;
	struct socket *so1, *so2;
	int fd, error, oflag, fflag;

	AUDIT_ARG_SOCKET(domain, type, protocol);

	oflag = 0;
	fflag = 0;
	if ((type & SOCK_CLOEXEC) != 0) {
		type &= ~SOCK_CLOEXEC;
		oflag |= O_CLOEXEC;
	}
	if ((type & SOCK_NONBLOCK) != 0) {
		type &= ~SOCK_NONBLOCK;
		fflag |= FNONBLOCK;
	}
#ifdef MAC
	/* We might want to have a separate check for socket pairs. */
	error = mac_socket_check_create(td->td_ucred, domain, type,
	    protocol);
	if (error != 0)
		return (error);
#endif
	error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
	if (error != 0)
		return (error);
	error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
	if (error != 0)
		goto free1;
	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
	error = falloc(td, &fp1, &fd, oflag);
	if (error != 0)
		goto free2;
	rsv[0] = fd;
	fp1->f_data = so1;	/* so1 already has ref count */
	error = falloc(td, &fp2, &fd, oflag);
	if (error != 0)
		goto free3;
	fp2->f_data = so2;	/* so2 already has ref count */
	rsv[1] = fd;
	error = soconnect2(so1, so2);
	if (error != 0)
		goto free4;
	if (type == SOCK_DGRAM) {
		/*
		 * Datagram socket connection is asymmetric.
		 */
		error = soconnect2(so2, so1);
		if (error != 0)
			goto free4;
	}
	finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
	    &socketops);
	finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
	    &socketops);
	if ((fflag & FNONBLOCK) != 0) {
		(void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
		(void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
	}
	fdrop(fp1, td);
	fdrop(fp2, td);
	return (0);
free4:
	fdclose(fdp, fp2, rsv[1], td);
	fdrop(fp2, td);
free3:
	fdclose(fdp, fp1, rsv[0], td);
	fdrop(fp1, td);
free2:
	if (so2 != NULL)
		(void)soclose(so2);
free1:
	if (so1 != NULL)
		(void)soclose(so1);
	return (error);
}

int
sys_socketpair(struct thread *td, struct socketpair_args *uap)
{
	int error, sv[2];

	error = kern_socketpair(td, uap->domain, uap->type,
	    uap->protocol, sv);
	if (error != 0)
		return (error);
	error = copyout(sv, uap->rsv, 2 * sizeof(int));
	if (error != 0) {
		(void)kern_close(td, sv[0]);
		(void)kern_close(td, sv[1]);
	}
	return (error);
}
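
/*
 * Example (userland, illustrative): kern_socketpair() above backs
 * socketpair(2); for SOCK_DGRAM it cross-connects both endpoints so
 * either side may transmit.
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *	#include <err.h>
 *
 *	int sv[2];
 *
 *	if (socketpair(PF_LOCAL, SOCK_DGRAM, 0, sv) == -1)
 *		err(1, "socketpair");
 *	write(sv[0], "ping", 4);
 *	write(sv[1], "pong", 4);
 */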

static int
sendit(td, s, mp, flags)
	struct thread *td;
	int s;
	struct msghdr *mp;
	int flags;
{
	struct mbuf *control;
	struct sockaddr *to;
	int error;

#ifdef CAPABILITY_MODE
	if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
		return (ECAPMODE);
#endif

	if (mp->msg_name != NULL) {
		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
		if (error != 0) {
			to = NULL;
			goto bad;
		}
		mp->msg_name = to;
	} else {
		to = NULL;
	}

	if (mp->msg_control) {
		if (mp->msg_controllen < sizeof(struct cmsghdr)
#ifdef COMPAT_OLDSOCK
		    && mp->msg_flags != MSG_COMPAT
#endif
		) {
			error = EINVAL;
			goto bad;
		}
		error = sockargs(&control, mp->msg_control,
		    mp->msg_controllen, MT_CONTROL);
		if (error != 0)
			goto bad;
#ifdef COMPAT_OLDSOCK
		if (mp->msg_flags == MSG_COMPAT) {
			struct cmsghdr *cm;

			M_PREPEND(control, sizeof(*cm), M_WAITOK);
			cm = mtod(control, struct cmsghdr *);
			cm->cmsg_len = control->m_len;
			cm->cmsg_level = SOL_SOCKET;
			cm->cmsg_type = SCM_RIGHTS;
		}
#endif
	} else {
		control = NULL;
	}

	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);

bad:
	free(to, M_SONAME);
	return (error);
}

int
kern_sendit(td, s, mp, flags, control, segflg)
	struct thread *td;
	int s;
	struct msghdr *mp;
	int flags;
	struct mbuf *control;
	enum uio_seg segflg;
{
	struct file *fp;
	struct uio auio;
	struct iovec *iov;
	struct socket *so;
	cap_rights_t rights;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	ssize_t len;
	int i, error;

	AUDIT_ARG_FD(s);
	cap_rights_init(&rights, CAP_SEND);
	if (mp->msg_name != NULL) {
		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
		cap_rights_set(&rights, CAP_CONNECT);
	}
	error = getsock_cap(td->td_proc->p_fd, s, &rights, &fp, NULL);
	if (error != 0)
		return (error);
	so = (struct socket *)fp->f_data;

#ifdef KTRACE
	if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(mp->msg_name);
#endif
#ifdef MAC
	if (mp->msg_name != NULL) {
		error = mac_socket_check_connect(td->td_ucred, so,
		    mp->msg_name);
		if (error != 0)
			goto bad;
	}
	error = mac_socket_check_send(td->td_ucred, so);
	if (error != 0)
		goto bad;
#endif

	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = segflg;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			error = EINVAL;
			goto bad;
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	len = auio.uio_resid;
	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
	if (error != 0) {
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Generation of SIGPIPE can be controlled per socket. */
		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
		    !(flags & MSG_NOSIGNAL)) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	if (error == 0)
		td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = td->td_retval[0];
		ktrgenio(s, UIO_WRITE, ktruio, error);
	}
#endif
bad:
	fdrop(fp, td);
	return (error);
}

int
sys_sendto(td, uap)
	struct thread *td;
	struct sendto_args /* {
		int	s;
		caddr_t	buf;
		size_t	len;
		int	flags;
		caddr_t	to;
		int	tolen;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;

	msg.msg_name = uap->to;
	msg.msg_namelen = uap->tolen;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	msg.msg_control = 0;
#ifdef COMPAT_OLDSOCK
	msg.msg_flags = 0;
#endif
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	return (sendit(td, uap->s, &msg, uap->flags));
}

#ifdef COMPAT_OLDSOCK
int
osend(td, uap)
	struct thread *td;
	struct osend_args /* {
		int	s;
		caddr_t	buf;
		int	len;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;

	msg.msg_name = 0;
	msg.msg_namelen = 0;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	msg.msg_control = 0;
	msg.msg_flags = 0;
	return (sendit(td, uap->s, &msg, uap->flags));
}

int
osendmsg(td, uap)
	struct thread *td;
	struct osendmsg_args /* {
		int	s;
		caddr_t	msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_iov = iov;
	msg.msg_flags = MSG_COMPAT;
	error = sendit(td, uap->s, &msg, uap->flags);
	free(iov, M_IOV);
	return (error);
}
#endif

int
sys_sendmsg(td, uap)
	struct thread *td;
	struct sendmsg_args /* {
		int	s;
		caddr_t	msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (msg));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_iov = iov;
#ifdef COMPAT_OLDSOCK
	msg.msg_flags = 0;
#endif
	error = sendit(td, uap->s, &msg, uap->flags);
	free(iov, M_IOV);
	return (error);
}
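
/*
 * Example (userland, illustrative): a sendmsg(2) call carrying a file
 * descriptor.  The msg_control buffer built here is what sendit() above
 * copies in through sockargs(..., MT_CONTROL).  `s' and `fd_to_pass'
 * are hypothetical placeholders.
 *
 *	#include <sys/socket.h>
 *	#include <string.h>
 *	#include <err.h>
 *
 *	union {
 *		struct cmsghdr hdr;
 *		char buf[CMSG_SPACE(sizeof(int))];
 *	} cmsg;
 *	struct msghdr msg;
 *	char c = 'x';
 *	struct iovec iov = { .iov_base = &c, .iov_len = 1 };
 *	struct cmsghdr *cm;
 *
 *	memset(&msg, 0, sizeof(msg));
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = cmsg.buf;
 *	msg.msg_controllen = CMSG_SPACE(sizeof(int));
 *	cm = CMSG_FIRSTHDR(&msg);
 *	cm->cmsg_len = CMSG_LEN(sizeof(int));
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type = SCM_RIGHTS;
 *	memcpy(CMSG_DATA(cm), &fd_to_pass, sizeof(int));
 *	if (sendmsg(s, &msg, 0) == -1)
 *		err(1, "sendmsg");
 */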

int
kern_recvit(td, s, mp, fromseg, controlp)
	struct thread *td;
	int s;
	struct msghdr *mp;
	enum uio_seg fromseg;
	struct mbuf **controlp;
{
	struct uio auio;
	struct iovec *iov;
	struct mbuf *m, *control = NULL;
	caddr_t ctlbuf;
	struct file *fp;
	struct socket *so;
	struct sockaddr *fromsa = NULL;
	cap_rights_t rights;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	ssize_t len;
	int error, i;

	if (controlp != NULL)
		*controlp = NULL;

	AUDIT_ARG_FD(s);
	error = getsock_cap(td->td_proc->p_fd, s,
	    cap_rights_init(&rights, CAP_RECV), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;

#ifdef MAC
	error = mac_socket_check_receive(td->td_ucred, so);
	if (error != 0) {
		fdrop(fp, td);
		return (error);
	}
#endif

	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			fdrop(fp, td);
			return (EINVAL);
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	len = auio.uio_resid;
	error = soreceive(so, &fromsa, &auio, NULL,
	    (mp->msg_control || controlp) ? &control : NULL,
	    &mp->msg_flags);
	if (error != 0) {
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	if (fromsa != NULL)
		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = len - auio.uio_resid;
		ktrgenio(s, UIO_READ, ktruio, error);
	}
#endif
	if (error != 0)
		goto out;
	td->td_retval[0] = len - auio.uio_resid;
	if (mp->msg_name) {
		len = mp->msg_namelen;
		if (len <= 0 || fromsa == NULL)
			len = 0;
		else {
			/* Save sa_len before it is destroyed by MSG_COMPAT. */
			len = MIN(len, fromsa->sa_len);
#ifdef COMPAT_OLDSOCK
			if (mp->msg_flags & MSG_COMPAT)
				((struct osockaddr *)fromsa)->sa_family =
				    fromsa->sa_family;
#endif
			if (fromseg == UIO_USERSPACE) {
				error = copyout(fromsa, mp->msg_name,
				    (unsigned)len);
				if (error != 0)
					goto out;
			} else
				bcopy(fromsa, mp->msg_name, len);
		}
		mp->msg_namelen = len;
	}
	if (mp->msg_control && controlp == NULL) {
#ifdef COMPAT_OLDSOCK
		/*
		 * We assume that old recvmsg calls won't receive access
		 * rights and other control info, esp. as control info
		 * is always optional and those options didn't exist in 4.3.
		 * If we receive rights, trim the cmsghdr; anything else
		 * is tossed.
		 */
		if (control && mp->msg_flags & MSG_COMPAT) {
			if (mtod(control, struct cmsghdr *)->cmsg_level !=
			    SOL_SOCKET ||
			    mtod(control, struct cmsghdr *)->cmsg_type !=
			    SCM_RIGHTS) {
				mp->msg_controllen = 0;
				goto out;
			}
			control->m_len -= sizeof (struct cmsghdr);
			control->m_data += sizeof (struct cmsghdr);
		}
#endif
		len = mp->msg_controllen;
		m = control;
		mp->msg_controllen = 0;
		ctlbuf = mp->msg_control;

		while (m && len > 0) {
			unsigned int tocopy;

			if (len >= m->m_len)
				tocopy = m->m_len;
			else {
				mp->msg_flags |= MSG_CTRUNC;
				tocopy = len;
			}

			if ((error = copyout(mtod(m, caddr_t),
			    ctlbuf, tocopy)) != 0)
				goto out;

			ctlbuf += tocopy;
			len -= tocopy;
			m = m->m_next;
		}
		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
	}
out:
	fdrop(fp, td);
#ifdef KTRACE
	if (fromsa && KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(fromsa);
#endif
	free(fromsa, M_SONAME);

	if (error == 0 && controlp != NULL)
		*controlp = control;
	else if (control)
		m_freem(control);

	return (error);
}

static int
recvit(td, s, mp, namelenp)
	struct thread *td;
	int s;
	struct msghdr *mp;
	void *namelenp;
{
	int error;

	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
	if (error != 0)
		return (error);
	if (namelenp != NULL) {
		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
#ifdef COMPAT_OLDSOCK
		if (mp->msg_flags & MSG_COMPAT)
			error = 0;	/* old recvfrom didn't check */
#endif
	}
	return (error);
}

int
sys_recvfrom(td, uap)
	struct thread *td;
	struct recvfrom_args /* {
		int	s;
		caddr_t	buf;
		size_t	len;
		int	flags;
		struct sockaddr * __restrict	from;
		socklen_t * __restrict fromlenaddr;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;
	int error;

	if (uap->fromlenaddr) {
		error = copyin(uap->fromlenaddr,
		    &msg.msg_namelen, sizeof (msg.msg_namelen));
		if (error != 0)
			goto done2;
	} else {
		msg.msg_namelen = 0;
	}
	msg.msg_name = uap->from;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	msg.msg_control = 0;
	msg.msg_flags = uap->flags;
	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
done2:
	return (error);
}

#ifdef COMPAT_OLDSOCK
int
orecvfrom(td, uap)
	struct thread *td;
	struct recvfrom_args *uap;
{

	uap->flags |= MSG_COMPAT;
	return (sys_recvfrom(td, uap));
}
#endif

#ifdef COMPAT_OLDSOCK
int
orecv(td, uap)
	struct thread *td;
	struct orecv_args /* {
		int	s;
		caddr_t	buf;
		int	len;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;

	msg.msg_name = 0;
	msg.msg_namelen = 0;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	msg.msg_control = 0;
	msg.msg_flags = uap->flags;
	return (recvit(td, uap->s, &msg, NULL));
}

/*
 * Old recvmsg.  This code takes advantage of the fact that the old msghdr
 * overlays the new one, missing only the flags, and with the (old) access
 * rights where the control fields are now.
 */
int
orecvmsg(td, uap)
	struct thread *td;
	struct orecvmsg_args /* {
		int	s;
		struct omsghdr *msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_flags = uap->flags | MSG_COMPAT;
	msg.msg_iov = iov;
	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
	if (msg.msg_controllen && error == 0)
		error = copyout(&msg.msg_controllen,
		    &uap->msg->msg_accrightslen, sizeof (int));
	free(iov, M_IOV);
	return (error);
}
#endif

int
sys_recvmsg(td, uap)
	struct thread *td;
	struct recvmsg_args /* {
		int	s;
		struct	msghdr *msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *uiov, *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (msg));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_flags = uap->flags;
#ifdef COMPAT_OLDSOCK
	msg.msg_flags &= ~MSG_COMPAT;
#endif
	uiov = msg.msg_iov;
	msg.msg_iov = iov;
	error = recvit(td, uap->s, &msg, NULL);
	if (error == 0) {
		msg.msg_iov = uiov;
		error = copyout(&msg, uap->msg, sizeof(msg));
	}
	free(iov, M_IOV);
	return (error);
}
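
/*
 * Example (userland, illustrative): receiving a descriptor sent with
 * SCM_RIGHTS.  MSG_CTRUNC in msg_flags corresponds to kern_recvit()
 * above running out of msg_control space while copying the control
 * chain out.  `s' is a hypothetical connected PF_LOCAL socket.
 *
 *	#include <sys/socket.h>
 *	#include <string.h>
 *	#include <err.h>
 *
 *	union {
 *		struct cmsghdr hdr;
 *		char buf[CMSG_SPACE(sizeof(int))];
 *	} cmsg;
 *	struct msghdr msg;
 *	char c;
 *	struct iovec iov = { .iov_base = &c, .iov_len = 1 };
 *	struct cmsghdr *cm;
 *	int newfd = -1;
 *
 *	memset(&msg, 0, sizeof(msg));
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = cmsg.buf;
 *	msg.msg_controllen = sizeof(cmsg.buf);
 *	if (recvmsg(s, &msg, 0) == -1)
 *		err(1, "recvmsg");
 *	if (msg.msg_flags & MSG_CTRUNC)
 *		errx(1, "control data truncated");
 *	cm = CMSG_FIRSTHDR(&msg);
 *	if (cm != NULL && cm->cmsg_type == SCM_RIGHTS)
 *		memcpy(&newfd, CMSG_DATA(cm), sizeof(newfd));
 */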

/* ARGSUSED */
int
sys_shutdown(td, uap)
	struct thread *td;
	struct shutdown_args /* {
		int	s;
		int	how;
	} */ *uap;
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(uap->s);
	error = getsock_cap(td->td_proc->p_fd, uap->s,
	    cap_rights_init(&rights, CAP_SHUTDOWN), &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
		error = soshutdown(so, uap->how);
		fdrop(fp, td);
	}
	return (error);
}

/* ARGSUSED */
int
sys_setsockopt(td, uap)
	struct thread *td;
	struct setsockopt_args /* {
		int	s;
		int	level;
		int	name;
		caddr_t	val;
		int	valsize;
	} */ *uap;
{

	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
	    uap->val, UIO_USERSPACE, uap->valsize));
}

int
kern_setsockopt(td, s, level, name, val, valseg, valsize)
	struct thread *td;
	int s;
	int level;
	int name;
	void *val;
	enum uio_seg valseg;
	socklen_t valsize;
{
	struct socket *so;
	struct file *fp;
	struct sockopt sopt;
	cap_rights_t rights;
	int error;

	if (val == NULL && valsize != 0)
		return (EFAULT);
	if ((int)valsize < 0)
		return (EINVAL);

	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = level;
	sopt.sopt_name = name;
	sopt.sopt_val = val;
	sopt.sopt_valsize = valsize;
	switch (valseg) {
	case UIO_USERSPACE:
		sopt.sopt_td = td;
		break;
	case UIO_SYSSPACE:
		sopt.sopt_td = NULL;
		break;
	default:
		panic("kern_setsockopt called with bad valseg");
	}

	AUDIT_ARG_FD(s);
	error = getsock_cap(td->td_proc->p_fd, s,
	    cap_rights_init(&rights, CAP_SETSOCKOPT), &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
		error = sosetopt(so, &sopt);
		fdrop(fp, td);
	}
	return (error);
}

/* ARGSUSED */
int
sys_getsockopt(td, uap)
	struct thread *td;
	struct getsockopt_args /* {
		int	s;
		int	level;
		int	name;
		void * __restrict	val;
		socklen_t * __restrict avalsize;
	} */ *uap;
{
	socklen_t valsize;
	int error;

	if (uap->val) {
		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
		if (error != 0)
			return (error);
	}

	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
	    uap->val, UIO_USERSPACE, &valsize);

	if (error == 0)
		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
	return (error);
}

/*
 * Kernel version of getsockopt.
 * optval can be a userland or kernel address; optlen is always a kernel
 * pointer.
 */
int
kern_getsockopt(td, s, level, name, val, valseg, valsize)
	struct thread *td;
	int s;
	int level;
	int name;
	void *val;
	enum uio_seg valseg;
	socklen_t *valsize;
{
	struct socket *so;
	struct file *fp;
	struct sockopt sopt;
	cap_rights_t rights;
	int error;

	if (val == NULL)
		*valsize = 0;
	if ((int)*valsize < 0)
		return (EINVAL);

	sopt.sopt_dir = SOPT_GET;
	sopt.sopt_level = level;
	sopt.sopt_name = name;
	sopt.sopt_val = val;
	sopt.sopt_valsize = (size_t)*valsize;	/* checked non-negative above */
	switch (valseg) {
	case UIO_USERSPACE:
		sopt.sopt_td = td;
		break;
	case UIO_SYSSPACE:
		sopt.sopt_td = NULL;
		break;
	default:
		panic("kern_getsockopt called with bad valseg");
	}

	AUDIT_ARG_FD(s);
	error = getsock_cap(td->td_proc->p_fd, s,
	    cap_rights_init(&rights, CAP_GETSOCKOPT), &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
		error = sogetopt(so, &sopt);
		*valsize = sopt.sopt_valsize;
		fdrop(fp, td);
	}
	return (error);
}
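
/*
 * Example (kernel, illustrative sketch): in-kernel consumers pass
 * UIO_SYSSPACE so that sopt_td is left NULL and the sockopt code treats
 * sopt_val as a kernel address rather than copying across the user
 * boundary.  A hypothetical caller with a thread `td' and socket
 * descriptor `s' might set SO_KEEPALIVE like so:
 *
 *	int one = 1;
 *
 *	error = kern_setsockopt(td, s, SOL_SOCKET, SO_KEEPALIVE,
 *	    &one, UIO_SYSSPACE, sizeof(one));
 */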

/*
 * getsockname1() - Get socket name.
 */
/* ARGSUSED */
static int
getsockname1(td, uap, compat)
	struct thread *td;
	struct getsockname_args /* {
		int	fdes;
		struct sockaddr * __restrict asa;
		socklen_t * __restrict	alen;
	} */ *uap;
	int compat;
{
	struct sockaddr *sa;
	socklen_t len;
	int error;

	error = copyin(uap->alen, &len, sizeof(len));
	if (error != 0)
		return (error);

	error = kern_getsockname(td, uap->fdes, &sa, &len);
	if (error != 0)
		return (error);

	if (len != 0) {
#ifdef COMPAT_OLDSOCK
		if (compat)
			((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
		error = copyout(sa, uap->asa, (u_int)len);
	}
	free(sa, M_SONAME);
	if (error == 0)
		error = copyout(&len, uap->alen, sizeof(len));
	return (error);
}

int
kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
    socklen_t *alen)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	socklen_t len;
	int error;

	AUDIT_ARG_FD(fd);
	error = getsock_cap(td->td_proc->p_fd, fd,
	    cap_rights_init(&rights, CAP_GETSOCKNAME), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
	*sa = NULL;
	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
	CURVNET_RESTORE();
	if (error != 0)
		goto bad;
	if (*sa == NULL)
		len = 0;
	else
		len = MIN(*alen, (*sa)->sa_len);
	*alen = len;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(*sa);
#endif
bad:
	fdrop(fp, td);
	if (error != 0 && *sa != NULL) {
		free(*sa, M_SONAME);
		*sa = NULL;
	}
	return (error);
}

int
sys_getsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{

	return (getsockname1(td, uap, 0));
}

#ifdef COMPAT_OLDSOCK
int
ogetsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{

	return (getsockname1(td, uap, 1));
}
#endif /* COMPAT_OLDSOCK */

/*
 * getpeername1() - Get name of peer for connected socket.
 */
/* ARGSUSED */
static int
getpeername1(td, uap, compat)
	struct thread *td;
	struct getpeername_args /* {
		int	fdes;
		struct sockaddr * __restrict	asa;
		socklen_t * __restrict	alen;
	} */ *uap;
	int compat;
{
	struct sockaddr *sa;
	socklen_t len;
	int error;

	error = copyin(uap->alen, &len, sizeof (len));
	if (error != 0)
		return (error);

	error = kern_getpeername(td, uap->fdes, &sa, &len);
	if (error != 0)
		return (error);

	if (len != 0) {
#ifdef COMPAT_OLDSOCK
		if (compat)
			((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
		error = copyout(sa, uap->asa, (u_int)len);
	}
	free(sa, M_SONAME);
	if (error == 0)
		error = copyout(&len, uap->alen, sizeof(len));
	return (error);
}

int
kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
    socklen_t *alen)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	socklen_t len;
	int error;

	AUDIT_ARG_FD(fd);
	error = getsock_cap(td->td_proc->p_fd, fd,
	    cap_rights_init(&rights, CAP_GETPEERNAME), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
		error = ENOTCONN;
		goto done;
	}
	*sa = NULL;
	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
	CURVNET_RESTORE();
	if (error != 0)
		goto bad;
	if (*sa == NULL)
		len = 0;
	else
		len = MIN(*alen, (*sa)->sa_len);
	*alen = len;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(*sa);
#endif
bad:
	if (error != 0 && *sa != NULL) {
		free(*sa, M_SONAME);
		*sa = NULL;
	}
done:
	fdrop(fp, td);
	return (error);
}

int
sys_getpeername(td, uap)
	struct thread *td;
	struct getpeername_args *uap;
{

	return (getpeername1(td, uap, 0));
}

#ifdef COMPAT_OLDSOCK
int
ogetpeername(td, uap)
	struct thread *td;
	struct ogetpeername_args *uap;
{

	/* XXX uap should have type `getpeername_args *' to begin with. */
	return (getpeername1(td, (struct getpeername_args *)uap, 1));
}
#endif /* COMPAT_OLDSOCK */

int
sockargs(mp, buf, buflen, type)
	struct mbuf **mp;
	caddr_t buf;
	int buflen, type;
{
	struct sockaddr *sa;
	struct mbuf *m;
	int error;

	if (buflen > MLEN) {
#ifdef COMPAT_OLDSOCK
		if (type == MT_SONAME && buflen <= 112)
			buflen = MLEN;		/* unix domain compat. hack */
		else
#endif
			if (buflen > MCLBYTES)
				return (EINVAL);
	}
	m = m_get2(buflen, M_WAITOK, type, 0);
	m->m_len = buflen;
	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
	if (error != 0)
		(void) m_free(m);
	else {
		*mp = m;
		if (type == MT_SONAME) {
			sa = mtod(m, struct sockaddr *);

#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
				sa->sa_family = sa->sa_len;
#endif
			sa->sa_len = buflen;
		}
	}
	return (error);
}

int
getsockaddr(namp, uaddr, len)
	struct sockaddr **namp;
	caddr_t uaddr;
	size_t len;
{
	struct sockaddr *sa;
	int error;

	if (len > SOCK_MAXADDRLEN)
		return (ENAMETOOLONG);
	if (len < offsetof(struct sockaddr, sa_data[0]))
		return (EINVAL);
	sa = malloc(len, M_SONAME, M_WAITOK);
	error = copyin(uaddr, sa, len);
	if (error != 0) {
		free(sa, M_SONAME);
	} else {
#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
			sa->sa_family = sa->sa_len;
#endif
		sa->sa_len = len;
		*namp = sa;
	}
	return (error);
}

static int
filt_sfsync_attach(struct knote *kn)
{
	struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_sdata;
	struct knlist *knl = &sfs->klist;

	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);

	/*
	 * Validate that we actually received this via the kernel API.
	 */
	if ((kn->kn_flags & EV_FLAG1) == 0)
		return (EPERM);

	kn->kn_ptr.p_v = sfs;
	kn->kn_flags &= ~EV_FLAG1;

	knl->kl_lock(knl->kl_lockarg);
	/*
	 * If we're in the "freeing" state, don't allow the add.
	 * That way we don't end up racing with some other thread
	 * that is trying to finish some setup.
	 */
	if (sfs->state == SF_STATE_FREEING) {
		knl->kl_unlock(knl->kl_lockarg);
		return (EINVAL);
	}
	knlist_add(&sfs->klist, kn, 1);
	knl->kl_unlock(knl->kl_lockarg);

	return (0);
}

/*
 * Called when a knote is being detached.
 */
static void
filt_sfsync_detach(struct knote *kn)
{
	struct knlist *knl;
	struct sendfile_sync *sfs;
	int do_free = 0;

	sfs = kn->kn_ptr.p_v;
	knl = &sfs->klist;

	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);

	knl->kl_lock(knl->kl_lockarg);
	if (!knlist_empty(knl))
		knlist_remove(knl, kn, 1);

	/*
	 * If the list is empty _AND_ the refcount is 0 _AND_ we've
	 * finished the setup phase and now we're in the running phase,
	 * we can free the underlying sendfile_sync.
	 *
	 * But we shouldn't do it before finishing the underlying divorce
	 * from the knote.
	 *
	 * So, we have the sfsync lock held; transition it to "freeing",
	 * then unlock, then free normally.
	 */
	if (knlist_empty(knl)) {
		if (sfs->state == SF_STATE_COMPLETED && sfs->count == 0) {
			SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, "
			    "count==0, empty list: time to free!\n",
			    __func__,
			    (unsigned long long) curthread->td_tid,
			    sfs);
			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
			do_free = 1;
		}
	}
	knl->kl_unlock(knl->kl_lockarg);

	/*
	 * Only call free if we're the one who has transitioned things
	 * to free.  Otherwise we could race with another thread that
	 * is currently tearing things down.
	 */
	if (do_free == 1) {
		SFSYNC_DPRINTF("%s: (%llu) sfs=%p, %s:%d\n",
		    __func__,
		    (unsigned long long) curthread->td_tid,
		    sfs,
		    __FILE__,
		    __LINE__);
		sf_sync_free(sfs);
	}
}

static int
filt_sfsync(struct knote *kn, long hint)
{
	struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_ptr.p_v;
	int ret;

	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);

	/*
	 * XXX add a lock assertion here!
	 */
	ret = (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED);

	return (ret);
}

/*
 * Add more references to a vm_page + sf_buf + sendfile_sync.
 */
void
sf_ext_ref(void *arg1, void *arg2)
{
	struct sf_buf *sf = arg1;
	struct sendfile_sync *sfs = arg2;
	vm_page_t pg = sf_buf_page(sf);

	/* XXXGL: there should be sf_buf_ref() */
	sf_buf_alloc(sf_buf_page(sf), SFB_NOWAIT);

	vm_page_lock(pg);
	vm_page_wire(pg);
	vm_page_unlock(pg);

	if (sfs != NULL) {
		mtx_lock(&sfs->mtx);
		KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
		sfs->count++;
		mtx_unlock(&sfs->mtx);
	}
}

/*
 * Detach mapped page and release resources back to the system.
 */
void
sf_ext_free(void *arg1, void *arg2)
{
	struct sf_buf *sf = arg1;
	struct sendfile_sync *sfs = arg2;
	vm_page_t pg = sf_buf_page(sf);

	sf_buf_free(sf);

	vm_page_lock(pg);
	vm_page_unwire(pg, PQ_INACTIVE);
	/*
	 * Check for the object going away on us.  This can
	 * happen since we don't hold a reference to it.
	 * If so, we're responsible for freeing the page.
	 */
	if (pg->wire_count == 0 && pg->object == NULL)
		vm_page_free(pg);
	vm_page_unlock(pg);

	if (sfs != NULL)
		sf_sync_deref(sfs);
}

/*
 * Called to remove a reference to a sf_sync object.
 *
 * This is generally done during the mbuf free path to signify
 * that one of the mbufs in the transaction has been completed.
 *
 * If we're doing SF_SYNC and the refcount is zero then we'll wake
 * up any waiters.
 *
 * If we're doing SF_KQUEUE and the refcount is zero then we'll
 * fire off the knote.
 */
void
sf_sync_deref(struct sendfile_sync *sfs)
{
	int do_free = 0;

	if (sfs == NULL)
		return;

	mtx_lock(&sfs->mtx);
	KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
	sfs->count--;

	/*
	 * Only fire off the wakeup / kqueue notification if
	 * we are in the running state.
	 */
	if (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED) {
		if (sfs->flags & SF_SYNC)
			cv_signal(&sfs->cv);

		if (sfs->flags & SF_KQUEUE) {
			SFSYNC_DPRINTF("%s: (%llu) sfs=%p: knote!\n",
			    __func__,
			    (unsigned long long) curthread->td_tid,
			    sfs);
			KNOTE_LOCKED(&sfs->klist, 1);
		}

		/*
		 * If we're not waiting around for a sync,
		 * check if the knote list is empty.
		 * If it is, we transition to free.
		 *
		 * XXX I think it's about time I added some state
		 * or flag that says whether we're supposed to be
		 * waiting around until we've done a signal.
		 *
		 * XXX Ie, the reason that I don't free it here
		 * is because the caller will free the last reference,
		 * not us.  That should be codified in some flag
		 * that indicates "self-free" rather than checking
		 * for SF_SYNC all the time.
		 */
		if ((sfs->flags & SF_SYNC) == 0 && knlist_empty(&sfs->klist)) {
			SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, "
			    "count==0, empty list: time to free!\n",
			    __func__,
			    (unsigned long long) curthread->td_tid,
			    sfs);
			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
			do_free = 1;
		}
	}
	mtx_unlock(&sfs->mtx);

	/*
	 * Attempt to do a free here.
	 *
	 * We do this outside of the lock because it may destroy the
	 * lock in question as it frees things.  We can optimise this
	 * later.
	 *
	 * XXX yes, we should make it a requirement to hold the
	 * lock across sf_sync_free().
	 */
	if (do_free == 1) {
		SFSYNC_DPRINTF("%s: (%llu) sfs=%p\n",
		    __func__,
		    (unsigned long long) curthread->td_tid,
		    sfs);
		sf_sync_free(sfs);
	}
}

/*
 * Allocate a sendfile_sync state structure.
 *
 * For now this only knows about the "sleep" sync, but later it will
 * grow various other personalities.
 */
struct sendfile_sync *
sf_sync_alloc(uint32_t flags)
{
	struct sendfile_sync *sfs;

	sfs = uma_zalloc(zone_sfsync, M_WAITOK | M_ZERO);
	mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
	cv_init(&sfs->cv, "sendfile");
	sfs->flags = flags;
	sfs->state = SF_STATE_SETUP;
	knlist_init_mtx(&sfs->klist, &sfs->mtx);

	SFSYNC_DPRINTF("%s: sfs=%p, flags=0x%08x\n", __func__, sfs, sfs->flags);

	return (sfs);
}

/*
 * Take a reference to a sfsync instance.
 *
 * This has to map 1:1 to free calls coming in via sf_ext_free(),
 * so typically this will be referenced once for each mbuf allocated.
 */
void
sf_sync_ref(struct sendfile_sync *sfs)
{

	if (sfs == NULL)
		return;

	mtx_lock(&sfs->mtx);
	sfs->count++;
	mtx_unlock(&sfs->mtx);
}

void
sf_sync_syscall_wait(struct sendfile_sync *sfs)
{

	if (sfs == NULL)
		return;

	KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!",
	    __func__,
	    sfs));

	/*
	 * If we're not requested to wait during the syscall,
	 * don't bother waiting.
	 */
	if ((sfs->flags & SF_SYNC) == 0)
		goto out;

	/*
	 * This is a bit suboptimal and confusing, so bear with me.
	 *
	 * Ideally sf_sync_syscall_wait() will wait until
	 * all pending mbuf transmit operations are done.
	 * This means that when sendfile becomes async, it'll
	 * run in the background and will transition from
	 * RUNNING to COMPLETED when it's finished acquiring
	 * new things to send.  Then, when the mbufs finish
	 * sending, COMPLETED + sfs->count == 0 is enough to
	 * know that no further work is being done.
	 *
	 * So, we will sleep on both RUNNING and COMPLETED.
	 * It's up to the (in progress) async sendfile loop
	 * to transition the sf_sync from RUNNING to
	 * COMPLETED so the wakeup above will actually
	 * do the cv_signal() call.
	 */
	if (sfs->state != SF_STATE_COMPLETED && sfs->state != SF_STATE_RUNNING)
		goto out;

	if (sfs->count != 0)
		cv_wait(&sfs->cv, &sfs->mtx);
	KASSERT(sfs->count == 0, ("sendfile sync still busy"));

out:
	return;
}
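
/*
 * For reference, the sendfile_sync lifecycle used by the routines above
 * and below (a summary of this implementation, not an API contract):
 *
 *	SETUP -> RUNNING -> COMPLETED -> FREEING
 *
 * sf_sync_alloc() starts in SF_STATE_SETUP; the sendfile path moves the
 * structure to SF_STATE_RUNNING once mbufs are in flight and to
 * SF_STATE_COMPLETED once no further work will be queued.  The last
 * reference dropped via sf_sync_deref(), or the final knote detach,
 * moves COMPLETED to SF_STATE_FREEING, after which sf_sync_free()
 * tears the structure down.
 */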

/*
 * Free an sf_sync if it's appropriate to.
 */
void
sf_sync_free(struct sendfile_sync *sfs)
{

	if (sfs == NULL)
		return;

	SFSYNC_DPRINTF("%s: (%lld) sfs=%p; called; state=%d, flags=0x%08x "
	    "count=%d\n",
	    __func__,
	    (long long) curthread->td_tid,
	    sfs,
	    sfs->state,
	    sfs->flags,
	    sfs->count);

	mtx_lock(&sfs->mtx);

	/*
	 * We keep the sf_sync around if the state is active,
	 * we are doing kqueue notification and we have active
	 * knotes.
	 *
	 * If the caller wants to free us right this second it
	 * should transition this to the freeing state.
	 *
	 * So, complain loudly if they break this rule.
	 */
	if (sfs->state != SF_STATE_FREEING) {
		printf("%s: (%llu) sfs=%p; not freeing; let's wait!\n",
		    __func__,
		    (unsigned long long) curthread->td_tid,
		    sfs);
		mtx_unlock(&sfs->mtx);
		return;
	}

	KASSERT(sfs->count == 0, ("sendfile sync still busy"));
	cv_destroy(&sfs->cv);
	/*
	 * This doesn't call knlist_detach() on each knote; it just frees
	 * the entire list.
	 */
	knlist_delete(&sfs->klist, curthread, 1);
	mtx_destroy(&sfs->mtx);
	SFSYNC_DPRINTF("%s: (%llu) sfs=%p; freeing\n",
	    __func__,
	    (unsigned long long) curthread->td_tid,
	    sfs);
	uma_zfree(zone_sfsync, sfs);
}

/*
 * Set up a sf_sync to post a kqueue notification when things are complete.
 */
int
sf_sync_kqueue_setup(struct sendfile_sync *sfs, struct sf_hdtr_kq *sfkq)
{
	struct kevent kev;
	int error;

	sfs->flags |= SF_KQUEUE;

	/* Check the flags are valid. */
	if ((sfkq->kq_flags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0)
		return (EINVAL);

	SFSYNC_DPRINTF("%s: sfs=%p: kqfd=%d, flags=0x%08x, ident=%p, udata=%p\n",
	    __func__,
	    sfs,
	    sfkq->kq_fd,
	    sfkq->kq_flags,
	    (void *) sfkq->kq_ident,
	    (void *) sfkq->kq_udata);

	/* Set up and register a knote on the given kqfd. */
	kev.ident = (uintptr_t) sfkq->kq_ident;
	kev.filter = EVFILT_SENDFILE;
	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | sfkq->kq_flags;
	kev.data = (intptr_t) sfs;
	kev.udata = sfkq->kq_udata;

	error = kqfd_register(sfkq->kq_fd, &kev, curthread, 1);
	if (error != 0) {
		SFSYNC_DPRINTF("%s: returned %d\n", __func__, error);
	}
	return (error);
}
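
/*
 * Example (userland, illustrative and heavily hedged): a consumer of
 * this experimental kqueue completion path would pass SF_KQUEUE to
 * sendfile(2) together with an sf_hdtr_kq describing the target kqueue,
 * then collect the EVFILT_SENDFILE event registered by
 * sf_sync_kqueue_setup() above.  The member names (kq_fd, kq_ident,
 * kq_flags, kq_udata) are taken from the usage in this file; the
 * userland structure layout is whatever revision of <sys/socket.h>
 * ships this interface.
 *
 *	struct kevent kev;
 *	int kq;
 *
 *	kq = kqueue();
 *	(issue sendfile(2) with SF_KQUEUE, kq_fd = kq, kq_ident = token)
 *	if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1 &&
 *	    kev.filter == EVFILT_SENDFILE)
 *		printf("transaction %lu completed\n", (u_long)kev.ident);
 */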
islocked) 2344 mtx_unlock(&sfs->mtx); 2345 } 2346 2347 /* 2348 * Set the retval/errno for the given transaction. 2349 * 2350 * This will eventually/ideally be used when the KNOTE is fired off 2351 * to signify the completion of this transaction. 2352 * 2353 * The sfsync lock should be held before entering this function. 2354 */ 2355 void 2356 sf_sync_set_retval(struct sendfile_sync *sfs, off_t retval, int xerrno) 2357 { 2358 2359 KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!", 2360 __func__, 2361 sfs)); 2362 2363 SFSYNC_DPRINTF("%s: (%llu) sfs=%p: errno=%d, retval=%jd\n", 2364 __func__, 2365 (unsigned long long) curthread->td_tid, 2366 sfs, 2367 xerrno, 2368 (intmax_t) retval); 2369 2370 sfs->retval = retval; 2371 sfs->xerrno = xerrno; 2372 } 2373 2374 /* 2375 * sendfile(2) 2376 * 2377 * int sendfile(int fd, int s, off_t offset, size_t nbytes, 2378 * struct sf_hdtr *hdtr, off_t *sbytes, int flags) 2379 * 2380 * Send a file specified by 'fd' and starting at 'offset' to a socket 2381 * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes == 2382 * 0. Optionally add a header and/or trailer to the socket output. If 2383 * specified, write the total number of bytes sent into *sbytes. 2384 */ 2385 int 2386 sys_sendfile(struct thread *td, struct sendfile_args *uap) 2387 { 2388 2389 return (do_sendfile(td, uap, 0)); 2390 } 2391 2392 int 2393 _do_sendfile(struct thread *td, int src_fd, int sock_fd, int flags, 2394 int compat, off_t offset, size_t nbytes, off_t *sbytes, 2395 struct uio *hdr_uio, 2396 struct uio *trl_uio, struct sf_hdtr_kq *hdtr_kq) 2397 { 2398 cap_rights_t rights; 2399 struct sendfile_sync *sfs = NULL; 2400 struct file *fp; 2401 int error; 2402 int do_kqueue = 0; 2403 int do_free = 0; 2404 2405 AUDIT_ARG_FD(src_fd); 2406 2407 if (hdtr_kq != NULL) 2408 do_kqueue = 1; 2409 2410 /* 2411 * sendfile(2) can start at any offset within a file so we require 2412 * CAP_READ+CAP_SEEK = CAP_PREAD. 2413 */ 2414 if ((error = fget_read(td, src_fd, 2415 cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) { 2416 goto out; 2417 } 2418 2419 /* 2420 * If SF_KQUEUE is set but we haven't copied in anything for 2421 * kqueue data, error out; don't leak the file reference. 2422 */ 2423 if ((flags & SF_KQUEUE) != 0 && do_kqueue == 0) { 2424 SFSYNC_DPRINTF("%s: SF_KQUEUE but no KQUEUE data!\n", __func__); 2425 error = EINVAL; fdrop(fp, td); goto out; 2426 } 2427 2428 /* 2429 * If we need to wait for completion, initialise the sfsync 2430 * state here. 2431 */ 2432 if (flags & (SF_SYNC | SF_KQUEUE)) 2433 sfs = sf_sync_alloc(flags & (SF_SYNC | SF_KQUEUE)); 2434 2435 if (flags & SF_KQUEUE) { 2436 error = sf_sync_kqueue_setup(sfs, hdtr_kq); 2437 if (error) { 2438 SFSYNC_DPRINTF("%s: (%llu) error; sfs=%p\n", 2439 __func__, 2440 (unsigned long long) curthread->td_tid, 2441 sfs); 2442 sf_sync_set_state(sfs, SF_STATE_FREEING, 0); 2443 sf_sync_free(sfs); 2444 fdrop(fp, td); goto out; 2445 } 2446 } 2447 2448 /* 2449 * Do the sendfile call. 2450 * 2451 * If this fails, it'll free the mbuf chain which will free up the 2452 * sendfile_sync references. 2453 */ 2454 error = fo_sendfile(fp, sock_fd, hdr_uio, trl_uio, offset, 2455 nbytes, sbytes, flags, compat ? SFK_COMPAT : 0, sfs, td); 2456 2457 /* 2458 * If the sendfile call succeeded, transition the sf_sync state 2459 * to RUNNING, then COMPLETED. 2460 * 2461 * If the sendfile call failed, then the sendfile call may have 2462 * actually sent some data first - so we check to see whether 2463 * any data was sent.
If some data was queued (ie, count > 0) 2464 * then we can't call free; we have to wait until the partial 2465 * transaction completes before we continue along. 2466 * 2467 * This has the side effect of firing off the knote 2468 * if the refcount has hit zero by the time we get here. 2469 */ 2470 if (sfs != NULL) { 2471 mtx_lock(&sfs->mtx); 2472 if (error == 0 || sfs->count > 0) { 2473 /* 2474 * When it's time to do async sendfile, the transition 2475 * to RUNNING signifies that we're actually actively 2476 * adding and completing mbufs. When the last disk 2477 * buffer is read (ie, when we're not doing any 2478 * further read IO and all subsequent stuff is mbuf 2479 * transmissions) we'll transition to COMPLETED 2480 * and when the final mbuf is freed, the completion 2481 * will be signaled. 2482 */ 2483 sf_sync_set_state(sfs, SF_STATE_RUNNING, 1); 2484 2485 /* 2486 * Set the retval before we signal completed. 2487 * If we do it the other way around then transitioning to 2488 * COMPLETED may post the knote before you set the return 2489 * status! 2490 * 2491 * XXX for now, errno is always 0, as we don't post 2492 * knotes if sendfile failed. Maybe that'll change later. 2493 */ 2494 sf_sync_set_retval(sfs, *sbytes, error); 2495 2496 /* 2497 * And now transition to completed, which will kick off 2498 * the knote if required. 2499 */ 2500 sf_sync_set_state(sfs, SF_STATE_COMPLETED, 1); 2501 } else { 2502 /* 2503 * The error is non-zero and sfs->count is zero, so 2504 * nothing else will come along later to wake us up. 2505 * Thus, free. 2506 */ 2507 sf_sync_set_state(sfs, SF_STATE_FREEING, 1); 2508 do_free = 1; 2509 } 2510 2511 /* 2512 * Next - wait if appropriate. 2513 */ 2514 sf_sync_syscall_wait(sfs); 2515 2516 /* 2517 * If we're not doing kqueue notifications, we can 2518 * transition this immediately to the freeing state. 2519 */ 2520 if ((sfs->flags & SF_KQUEUE) == 0) { 2521 sf_sync_set_state(sfs, SF_STATE_FREEING, 1); 2522 do_free = 1; 2523 } 2524 2525 mtx_unlock(&sfs->mtx); 2526 } 2527 2528 /* 2529 * If do_free is set, free here. 2530 * 2531 * If we're not doing kqueue notification (ie, it's just sleep 2532 * notification) we also free here; it's the only chance we have. 2533 */ 2534 if (sfs != NULL && do_free == 1) { 2535 sf_sync_free(sfs); 2536 } 2537 2538 /* 2539 * XXX Should we wait until the send has completed before freeing the source 2540 * file handle? It's the previous behaviour, sure, but is it required? 2541 * We've wired down the page references after all. 2542 */ 2543 fdrop(fp, td); 2544 2545 out: 2546 /* Return error */ 2547 return (error); 2548 } 2549 2550 2551 static int 2552 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat) 2553 { 2554 struct sf_hdtr hdtr; 2555 struct sf_hdtr_kq hdtr_kq; 2556 struct uio *hdr_uio, *trl_uio; 2557 int error; 2558 off_t sbytes = 0; 2559 int do_kqueue = 0; 2560 2561 /* 2562 * File offset must be non-negative. If it goes beyond EOF 2563 * we send only the header/trailer and no payload data.
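 *
 * For reference, a minimal illustrative userland call (needs
 * <sys/types.h>, <sys/socket.h>, <sys/uio.h> and <err.h>) that
 * sends the whole file and fetches the byte count:
 *
 *	off_t sbytes;
 *
 *	if (sendfile(file_fd, sock_fd, 0, 0, NULL, &sbytes, 0) == -1)
 *		err(1, "sendfile");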
2564 */ 2565 if (uap->offset < 0) 2566 return (EINVAL); 2567 2568 hdr_uio = trl_uio = NULL; 2569 2570 if (uap->hdtr != NULL) { 2571 error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); 2572 if (error != 0) 2573 goto out; 2574 if (hdtr.headers != NULL) { 2575 error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio); 2576 if (error != 0) 2577 goto out; 2578 } 2579 if (hdtr.trailers != NULL) { 2580 error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio); 2581 if (error != 0) 2582 goto out; 2583 } 2584 2585 /* 2586 * If SF_KQUEUE is set, then we need to also copy in 2587 * the kqueue data after the normal hdtr set and set 2588 * do_kqueue=1. 2589 */ 2590 if (uap->flags & SF_KQUEUE) { 2591 error = copyin(((char *) uap->hdtr) + sizeof(hdtr), 2592 &hdtr_kq, 2593 sizeof(hdtr_kq)); 2594 if (error != 0) 2595 goto out; 2596 do_kqueue = 1; 2597 } 2598 } 2599 2600 /* Call sendfile; only pass the kqueue data if we copied it in. */ 2601 error = _do_sendfile(td, uap->fd, uap->s, uap->flags, compat, 2602 uap->offset, uap->nbytes, &sbytes, hdr_uio, trl_uio, do_kqueue != 0 ? &hdtr_kq : NULL); 2603 2604 if (uap->sbytes != NULL) { 2605 copyout(&sbytes, uap->sbytes, sizeof(off_t)); 2606 } 2607 out: 2608 free(hdr_uio, M_IOV); 2609 free(trl_uio, M_IOV); 2610 return (error); 2611 } 2612 2613 #ifdef COMPAT_FREEBSD4 2614 int 2615 freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap) 2616 { 2617 struct sendfile_args args; 2618 2619 args.fd = uap->fd; 2620 args.s = uap->s; 2621 args.offset = uap->offset; 2622 args.nbytes = uap->nbytes; 2623 args.hdtr = uap->hdtr; 2624 args.sbytes = uap->sbytes; 2625 args.flags = uap->flags; 2626 2627 return (do_sendfile(td, &args, 1)); 2628 } 2629 #endif /* COMPAT_FREEBSD4 */ 2630 2631 static int 2632 sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd, 2633 off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res) 2634 { 2635 vm_page_t m; 2636 vm_pindex_t pindex; 2637 ssize_t resid; 2638 int error, readahead, rv; 2639 2640 pindex = OFF_TO_IDX(off); 2641 VM_OBJECT_WLOCK(obj); 2642 m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY | 2643 VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL); 2644 2645 /* 2646 * Check if page is valid for what we need, otherwise initiate I/O. 2647 * 2648 * The non-zero nd argument prevents disk I/O; instead we return 2649 * to the caller what was specified in nd. In particular, 2650 * if we already turned some pages into mbufs, nd == EAGAIN 2651 * and the main function sends those pages before we come 2652 * here again and block. 2653 */ 2654 if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) { 2655 if (vp == NULL) 2656 vm_page_xunbusy(m); 2657 VM_OBJECT_WUNLOCK(obj); 2658 *res = m; 2659 return (0); 2660 } else if (nd != 0) { 2661 if (vp == NULL) 2662 vm_page_xunbusy(m); 2663 error = nd; 2664 goto free_page; 2665 } 2666 2667 /* 2668 * Get the page from backing store. 2669 */ 2670 error = 0; 2671 if (vp != NULL) { 2672 VM_OBJECT_WUNLOCK(obj); 2673 readahead = sfreadahead * MAXBSIZE; 2674 2675 /* 2676 * Use vn_rdwr() instead of the pager interface for 2677 * the vnode, to allow the read-ahead. 2678 * 2679 * XXXMAC: Because we don't have fp->f_cred here, we 2680 * pass in NOCRED. This is probably wrong, but is 2681 * consistent with our original implementation.
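 *
 * Note that with UIO_NOCOPY no caller buffer is filled in; the
 * read simply populates the pages of the backing VM object,
 * whose validity we re-check before returning below.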
2682 */ 2683 error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off), 2684 UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead / 2685 bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td); 2686 SFSTAT_INC(sf_iocnt); 2687 VM_OBJECT_WLOCK(obj); 2688 } else { 2689 if (vm_pager_has_page(obj, pindex, NULL, NULL)) { 2690 rv = vm_pager_get_pages(obj, &m, 1, 0); 2691 SFSTAT_INC(sf_iocnt); 2692 m = vm_page_lookup(obj, pindex); 2693 if (m == NULL) 2694 error = EIO; 2695 else if (rv != VM_PAGER_OK) { 2696 vm_page_lock(m); 2697 vm_page_free(m); 2698 vm_page_unlock(m); 2699 m = NULL; 2700 error = EIO; 2701 } 2702 } else { 2703 pmap_zero_page(m); 2704 m->valid = VM_PAGE_BITS_ALL; 2705 m->dirty = 0; 2706 } 2707 if (m != NULL) 2708 vm_page_xunbusy(m); 2709 } 2710 if (error == 0) { 2711 *res = m; 2712 } else if (m != NULL) { 2713 free_page: 2714 vm_page_lock(m); 2715 vm_page_unwire(m, PQ_INACTIVE); 2716 2717 /* 2718 * See if anyone else might know about this page. If 2719 * not and it is not valid, then free it. 2720 */ 2721 if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m)) 2722 vm_page_free(m); 2723 vm_page_unlock(m); 2724 } 2725 KASSERT(error != 0 || (m->wire_count > 0 && 2726 vm_page_is_valid(m, off & PAGE_MASK, xfsize)), 2727 ("wrong page state m %p off %#jx xfsize %d", m, (uintmax_t)off, 2728 xfsize)); 2729 VM_OBJECT_WUNLOCK(obj); 2730 return (error); 2731 } 2732 2733 static int 2734 sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res, 2735 struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size, 2736 int *bsize) 2737 { 2738 struct vattr va; 2739 vm_object_t obj; 2740 struct vnode *vp; 2741 struct shmfd *shmfd; 2742 int error; 2743 2744 vp = *vp_res = NULL; 2745 obj = NULL; 2746 shmfd = *shmfd_res = NULL; 2747 *bsize = 0; 2748 2749 /* 2750 * The file descriptor must be a regular file and have a 2751 * backing VM object. 2752 */ 2753 if (fp->f_type == DTYPE_VNODE) { 2754 vp = fp->f_vnode; 2755 vn_lock(vp, LK_SHARED | LK_RETRY); 2756 if (vp->v_type != VREG) { 2757 error = EINVAL; 2758 goto out; 2759 } 2760 *bsize = vp->v_mount->mnt_stat.f_iosize; 2761 error = VOP_GETATTR(vp, &va, td->td_ucred); 2762 if (error != 0) 2763 goto out; 2764 *obj_size = va.va_size; 2765 obj = vp->v_object; 2766 if (obj == NULL) { 2767 error = EINVAL; 2768 goto out; 2769 } 2770 } else if (fp->f_type == DTYPE_SHM) { 2771 shmfd = fp->f_data; 2772 obj = shmfd->shm_object; 2773 *obj_size = shmfd->shm_size; 2774 } else { 2775 error = EINVAL; 2776 goto out; 2777 } 2778 2779 VM_OBJECT_WLOCK(obj); 2780 if ((obj->flags & OBJ_DEAD) != 0) { 2781 VM_OBJECT_WUNLOCK(obj); 2782 error = EBADF; 2783 goto out; 2784 } 2785 2786 /* 2787 * Temporarily increase the backing VM object's reference 2788 * count so that a forced reclamation of its vnode does not 2789 * immediately destroy it. 2790 */ 2791 vm_object_reference_locked(obj); 2792 VM_OBJECT_WUNLOCK(obj); 2793 *obj_res = obj; 2794 *vp_res = vp; 2795 *shmfd_res = shmfd; 2796 2797 out: 2798 if (vp != NULL) 2799 VOP_UNLOCK(vp, 0); 2800 return (error); 2801 } 2802 2803 static int 2804 kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp, 2805 struct socket **so) 2806 { 2807 cap_rights_t rights; 2808 int error; 2809 2810 *sock_fp = NULL; 2811 *so = NULL; 2812 2813 /* 2814 * The socket must be a stream socket and connected. 
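 * We ask getsock_cap() for CAP_SEND rights; a socket that is
 * not SOCK_STREAM fails with EINVAL and an unconnected one
 * with ENOTCONN, as checked below.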
2815 */ 2816 error = getsock_cap(td->td_proc->p_fd, s, cap_rights_init(&rights, 2817 CAP_SEND), sock_fp, NULL); 2818 if (error != 0) 2819 return (error); 2820 *so = (*sock_fp)->f_data; 2821 if ((*so)->so_type != SOCK_STREAM) 2822 return (EINVAL); 2823 if (((*so)->so_state & SS_ISCONNECTED) == 0) 2824 return (ENOTCONN); 2825 return (0); 2826 } 2827 2828 int 2829 vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, 2830 struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, 2831 int kflags, struct sendfile_sync *sfs, struct thread *td) 2832 { 2833 struct file *sock_fp; 2834 struct vnode *vp; 2835 struct vm_object *obj; 2836 struct socket *so; 2837 struct mbuf *m; 2838 struct sf_buf *sf; 2839 struct vm_page *pg; 2840 struct shmfd *shmfd; 2841 struct vattr va; 2842 off_t off, xfsize, fsbytes, sbytes, rem, obj_size; 2843 int error, bsize, nd, hdrlen, mnw; 2844 2845 pg = NULL; 2846 obj = NULL; 2847 so = NULL; 2848 m = NULL; 2849 fsbytes = sbytes = 0; 2850 hdrlen = mnw = 0; 2851 rem = nbytes; 2852 obj_size = 0; 2853 2854 error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize); 2855 if (error != 0) 2856 return (error); 2857 if (rem == 0) 2858 rem = obj_size; 2859 2860 error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so); 2861 if (error != 0) 2862 goto out; 2863 2864 /* 2865 * Do not wait on memory allocations but return ENOMEM for 2866 * caller to retry later. 2867 * XXX: Experimental. 2868 */ 2869 if (flags & SF_MNOWAIT) 2870 mnw = 1; 2871 2872 #ifdef MAC 2873 error = mac_socket_check_send(td->td_ucred, so); 2874 if (error != 0) 2875 goto out; 2876 #endif 2877 2878 /* If headers are specified copy them into mbufs. */ 2879 if (hdr_uio != NULL) { 2880 hdr_uio->uio_td = td; 2881 hdr_uio->uio_rw = UIO_WRITE; 2882 if (hdr_uio->uio_resid > 0) { 2883 /* 2884 * In FBSD < 5.0 the nbytes to send also included 2885 * the header. If compat is specified subtract the 2886 * header size from nbytes. 2887 */ 2888 if (kflags & SFK_COMPAT) { 2889 if (nbytes > hdr_uio->uio_resid) 2890 nbytes -= hdr_uio->uio_resid; 2891 else 2892 nbytes = 0; 2893 } 2894 m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK), 2895 0, 0, 0); 2896 if (m == NULL) { 2897 error = mnw ? EAGAIN : ENOBUFS; 2898 goto out; 2899 } 2900 hdrlen = m_length(m, NULL); 2901 } 2902 } 2903 2904 /* 2905 * Protect against multiple writers to the socket. 2906 * 2907 * XXXRW: Historically this has assumed non-interruptibility, so now 2908 * we implement that, but possibly shouldn't. 2909 */ 2910 (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR); 2911 2912 /* 2913 * Loop through the pages of the file, starting with the requested 2914 * offset. Get a file page (do I/O if necessary), map the file page 2915 * into an sf_buf, attach an mbuf header to the sf_buf, and queue 2916 * it on the socket. 2917 * This is done in two loops. The inner loop turns as many pages 2918 * as it can, up to available socket buffer space, without blocking 2919 * into mbufs to have it bulk delivered into the socket send buffer. 2920 * The outer loop checks the state and available space of the socket 2921 * and takes care of the overall progress. 
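 *
 * In rough sketch form (error handling, header/trailer and
 * accounting elided):
 *
 *	for (off = offset; !done; ) {
 *		(wait for socket buffer space)
 *		while (space > loopbytes) {
 *			(sendfile_readpage() -> page)
 *			(sf_buf_alloc() -> sf)
 *			(append an EXT_SFBUF mbuf pointing at sf)
 *		}
 *		(pru_send() the assembled chain)
 *	}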
2922 */ 2923 for (off = offset; ; ) { 2924 struct mbuf *mtail; 2925 int loopbytes; 2926 int space; 2927 int done; 2928 2929 if ((nbytes != 0 && nbytes == fsbytes) || 2930 (nbytes == 0 && obj_size == fsbytes)) 2931 break; 2932 2933 mtail = NULL; 2934 loopbytes = 0; 2935 space = 0; 2936 done = 0; 2937 2938 /* 2939 * Check the socket state for ongoing connection, 2940 * no errors and space in socket buffer. 2941 * If space is low allow for the remainder of the 2942 * file to be processed if it fits the socket buffer. 2943 * Otherwise block in waiting for sufficient space 2944 * to proceed, or if the socket is nonblocking, return 2945 * to userland with EAGAIN while reporting how far 2946 * we've come. 2947 * We wait until the socket buffer has significant free 2948 * space to do bulk sends. This makes good use of file 2949 * system read ahead and allows packet segmentation 2950 * offloading hardware to take over lots of work. If 2951 * we were not careful here we would send off only one 2952 * sfbuf at a time. 2953 */ 2954 SOCKBUF_LOCK(&so->so_snd); 2955 if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2) 2956 so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; 2957 retry_space: 2958 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2959 error = EPIPE; 2960 SOCKBUF_UNLOCK(&so->so_snd); 2961 goto done; 2962 } else if (so->so_error) { 2963 error = so->so_error; 2964 so->so_error = 0; 2965 SOCKBUF_UNLOCK(&so->so_snd); 2966 goto done; 2967 } 2968 space = sbspace(&so->so_snd); 2969 if (space < rem && 2970 (space <= 0 || 2971 space < so->so_snd.sb_lowat)) { 2972 if (so->so_state & SS_NBIO) { 2973 SOCKBUF_UNLOCK(&so->so_snd); 2974 error = EAGAIN; 2975 goto done; 2976 } 2977 /* 2978 * sbwait drops the lock while sleeping. 2979 * When we loop back to retry_space the 2980 * state may have changed and we retest 2981 * for it. 2982 */ 2983 error = sbwait(&so->so_snd); 2984 /* 2985 * An error from sbwait usually indicates that we've 2986 * been interrupted by a signal. If we've sent anything 2987 * then return bytes sent, otherwise return the error. 2988 */ 2989 if (error != 0) { 2990 SOCKBUF_UNLOCK(&so->so_snd); 2991 goto done; 2992 } 2993 goto retry_space; 2994 } 2995 SOCKBUF_UNLOCK(&so->so_snd); 2996 2997 /* 2998 * Reduce space in the socket buffer by the size of 2999 * the header mbuf chain. 3000 * hdrlen is set to 0 after the first loop. 3001 */ 3002 space -= hdrlen; 3003 3004 if (vp != NULL) { 3005 error = vn_lock(vp, LK_SHARED); 3006 if (error != 0) 3007 goto done; 3008 error = VOP_GETATTR(vp, &va, td->td_ucred); 3009 if (error != 0 || off >= va.va_size) { 3010 VOP_UNLOCK(vp, 0); 3011 goto done; 3012 } 3013 obj_size = va.va_size; 3014 } 3015 3016 /* 3017 * Loop and construct maximum sized mbuf chain to be bulk 3018 * dumped into socket buffer. 3019 */ 3020 while (space > loopbytes) { 3021 vm_offset_t pgoff; 3022 struct mbuf *m0; 3023 3024 /* 3025 * Calculate the amount to transfer. 3026 * Not to exceed a page, the EOF, 3027 * or the passed in nbytes. 3028 */ 3029 pgoff = (vm_offset_t)(off & PAGE_MASK); 3030 rem = obj_size - offset; 3031 if (nbytes != 0) 3032 rem = omin(rem, nbytes); 3033 rem -= fsbytes + loopbytes; 3034 xfsize = omin(PAGE_SIZE - pgoff, rem); 3035 xfsize = omin(space - loopbytes, xfsize); 3036 if (xfsize <= 0) { 3037 done = 1; /* all data sent */ 3038 break; 3039 } 3040 3041 /* 3042 * Attempt to look up the page. Allocate 3043 * if not found or wait and loop if busy. 
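 * A non-zero 'nd' makes sendfile_readpage() fail instead of
 * doing disk I/O: EAGAIN when we already hold an mbuf chain
 * (so it is sent first), EBUSY when SF_NODISKIO was passed.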
3044 */ 3045 if (m != NULL) 3046 nd = EAGAIN; /* send what we already got */ 3047 else if ((flags & SF_NODISKIO) != 0) 3048 nd = EBUSY; 3049 else 3050 nd = 0; 3051 error = sendfile_readpage(obj, vp, nd, off, 3052 xfsize, bsize, td, &pg); 3053 if (error != 0) { 3054 if (error == EAGAIN) 3055 error = 0; /* not a real error */ 3056 break; 3057 } 3058 3059 /* 3060 * Get a sendfile buf. When allocating the 3061 * first buffer for mbuf chain, we usually 3062 * wait as long as necessary, but this wait 3063 * can be interrupted. For consequent 3064 * buffers, do not sleep, since several 3065 * threads might exhaust the buffers and then 3066 * deadlock. 3067 */ 3068 sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT : 3069 SFB_CATCH); 3070 if (sf == NULL) { 3071 SFSTAT_INC(sf_allocfail); 3072 vm_page_lock(pg); 3073 vm_page_unwire(pg, PQ_INACTIVE); 3074 KASSERT(pg->object != NULL, 3075 ("%s: object disappeared", __func__)); 3076 vm_page_unlock(pg); 3077 if (m == NULL) 3078 error = (mnw ? EAGAIN : EINTR); 3079 break; 3080 } 3081 3082 /* 3083 * Get an mbuf and set it up as having 3084 * external storage. 3085 */ 3086 m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA); 3087 if (m0 == NULL) { 3088 error = (mnw ? EAGAIN : ENOBUFS); 3089 sf_ext_free(sf, NULL); 3090 break; 3091 } 3092 /* 3093 * Attach EXT_SFBUF external storage. 3094 */ 3095 m0->m_ext.ext_buf = (caddr_t )sf_buf_kva(sf); 3096 m0->m_ext.ext_size = PAGE_SIZE; 3097 m0->m_ext.ext_arg1 = sf; 3098 m0->m_ext.ext_arg2 = sfs; 3099 m0->m_ext.ext_type = EXT_SFBUF; 3100 m0->m_ext.ext_flags = 0; 3101 m0->m_flags |= (M_EXT|M_RDONLY); 3102 m0->m_data = (char *)sf_buf_kva(sf) + pgoff; 3103 m0->m_len = xfsize; 3104 3105 /* Append to mbuf chain. */ 3106 if (mtail != NULL) 3107 mtail->m_next = m0; 3108 else if (m != NULL) 3109 m_last(m)->m_next = m0; 3110 else 3111 m = m0; 3112 mtail = m0; 3113 3114 /* Keep track of bits processed. */ 3115 loopbytes += xfsize; 3116 off += xfsize; 3117 3118 /* 3119 * XXX eventually this should be a sfsync 3120 * method call! 3121 */ 3122 if (sfs != NULL) 3123 sf_sync_ref(sfs); 3124 } 3125 3126 if (vp != NULL) 3127 VOP_UNLOCK(vp, 0); 3128 3129 /* Add the buffer chain to the socket buffer. */ 3130 if (m != NULL) { 3131 int mlen, err; 3132 3133 mlen = m_length(m, NULL); 3134 SOCKBUF_LOCK(&so->so_snd); 3135 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 3136 error = EPIPE; 3137 SOCKBUF_UNLOCK(&so->so_snd); 3138 goto done; 3139 } 3140 SOCKBUF_UNLOCK(&so->so_snd); 3141 CURVNET_SET(so->so_vnet); 3142 /* Avoid error aliasing. */ 3143 err = (*so->so_proto->pr_usrreqs->pru_send) 3144 (so, 0, m, NULL, NULL, td); 3145 CURVNET_RESTORE(); 3146 if (err == 0) { 3147 /* 3148 * We need two counters to get the 3149 * file offset and nbytes to send 3150 * right: 3151 * - sbytes contains the total amount 3152 * of bytes sent, including headers. 3153 * - fsbytes contains the total amount 3154 * of bytes sent from the file. 3155 */ 3156 sbytes += mlen; 3157 fsbytes += mlen; 3158 if (hdrlen) { 3159 fsbytes -= hdrlen; 3160 hdrlen = 0; 3161 } 3162 } else if (error == 0) 3163 error = err; 3164 m = NULL; /* pru_send always consumes */ 3165 } 3166 3167 /* Quit outer loop on error or when we're done. */ 3168 if (done) 3169 break; 3170 if (error != 0) 3171 goto done; 3172 } 3173 3174 /* 3175 * Send trailers. Wimp out and use writev(2). 
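 * kern_writev() leaves the number of bytes written in
 * td->td_retval[0], which is added to sbytes below.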
3176 */ 3177 if (trl_uio != NULL) { 3178 sbunlock(&so->so_snd); 3179 error = kern_writev(td, sockfd, trl_uio); 3180 if (error == 0) 3181 sbytes += td->td_retval[0]; 3182 goto out; 3183 } 3184 3185 done: 3186 sbunlock(&so->so_snd); 3187 out: 3188 /* 3189 * If there was no error we have to clear td->td_retval[0] 3190 * because it may have been set by writev. 3191 */ 3192 if (error == 0) { 3193 td->td_retval[0] = 0; 3194 } 3195 if (sent != NULL) { 3196 (*sent) = sbytes; 3197 } 3198 if (obj != NULL) 3199 vm_object_deallocate(obj); 3200 if (so) 3201 fdrop(sock_fp, td); 3202 if (m) 3203 m_freem(m); 3204 3205 if (error == ERESTART) 3206 error = EINTR; 3207 3208 return (error); 3209 } 3210 3211 /* 3212 * SCTP syscalls. 3213 * Functionality only compiled in if SCTP is defined in the kernel Makefile, 3214 * otherwise all return EOPNOTSUPP. 3215 * XXX: We should make this loadable one day. 3216 */ 3217 int 3218 sys_sctp_peeloff(td, uap) 3219 struct thread *td; 3220 struct sctp_peeloff_args /* { 3221 int sd; 3222 caddr_t name; 3223 } */ *uap; 3224 { 3225 #if (defined(INET) || defined(INET6)) && defined(SCTP) 3226 struct file *nfp = NULL; 3227 struct socket *head, *so; 3228 cap_rights_t rights; 3229 u_int fflag; 3230 int error, fd; 3231 3232 AUDIT_ARG_FD(uap->sd); 3233 error = fgetsock(td, uap->sd, cap_rights_init(&rights, CAP_PEELOFF), 3234 &head, &fflag); 3235 if (error != 0) 3236 goto done2; 3237 if (head->so_proto->pr_protocol != IPPROTO_SCTP) { 3238 error = EOPNOTSUPP; 3239 goto done; 3240 } 3241 error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name); 3242 if (error != 0) 3243 goto done; 3244 /* 3245 * At this point we know we do have a assoc to pull 3246 * we proceed to get the fd setup. This may block 3247 * but that is ok. 3248 */ 3249 3250 error = falloc(td, &nfp, &fd, 0); 3251 if (error != 0) 3252 goto done; 3253 td->td_retval[0] = fd; 3254 3255 CURVNET_SET(head->so_vnet); 3256 so = sonewconn(head, SS_ISCONNECTED); 3257 if (so == NULL) { 3258 error = ENOMEM; 3259 goto noconnection; 3260 } 3261 /* 3262 * Before changing the flags on the socket, we have to bump the 3263 * reference count. Otherwise, if the protocol calls sofree(), 3264 * the socket will be released due to a zero refcount. 3265 */ 3266 SOCK_LOCK(so); 3267 soref(so); /* file descriptor reference */ 3268 SOCK_UNLOCK(so); 3269 3270 ACCEPT_LOCK(); 3271 3272 TAILQ_REMOVE(&head->so_comp, so, so_list); 3273 head->so_qlen--; 3274 so->so_state |= (head->so_state & SS_NBIO); 3275 so->so_state &= ~SS_NOFDREF; 3276 so->so_qstate &= ~SQ_COMP; 3277 so->so_head = NULL; 3278 ACCEPT_UNLOCK(); 3279 finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); 3280 error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name); 3281 if (error != 0) 3282 goto noconnection; 3283 if (head->so_sigio != NULL) 3284 fsetown(fgetown(&head->so_sigio), &so->so_sigio); 3285 3286 noconnection: 3287 /* 3288 * close the new descriptor, assuming someone hasn't ripped it 3289 * out from under us. 3290 */ 3291 if (error != 0) 3292 fdclose(td->td_proc->p_fd, nfp, fd, td); 3293 3294 /* 3295 * Release explicitly held references before returning. 
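 * These are the file reference on 'nfp' taken by falloc() and
 * the socket reference on 'head' taken by fgetsock().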
3296 */ 3297 CURVNET_RESTORE(); 3298 done: 3299 if (nfp != NULL) 3300 fdrop(nfp, td); 3301 fputsock(head); 3302 done2: 3303 return (error); 3304 #else /* SCTP */ 3305 return (EOPNOTSUPP); 3306 #endif /* SCTP */ 3307 } 3308 3309 int 3310 sys_sctp_generic_sendmsg (td, uap) 3311 struct thread *td; 3312 struct sctp_generic_sendmsg_args /* { 3313 int sd, 3314 caddr_t msg, 3315 int mlen, 3316 caddr_t to, 3317 __socklen_t tolen, 3318 struct sctp_sndrcvinfo *sinfo, 3319 int flags 3320 } */ *uap; 3321 { 3322 #if (defined(INET) || defined(INET6)) && defined(SCTP) 3323 struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL; 3324 struct socket *so; 3325 struct file *fp = NULL; 3326 struct sockaddr *to = NULL; 3327 #ifdef KTRACE 3328 struct uio *ktruio = NULL; 3329 #endif 3330 struct uio auio; 3331 struct iovec iov[1]; 3332 cap_rights_t rights; 3333 int error = 0, len; 3334 3335 if (uap->sinfo != NULL) { 3336 error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); 3337 if (error != 0) 3338 return (error); 3339 u_sinfo = &sinfo; 3340 } 3341 3342 cap_rights_init(&rights, CAP_SEND); 3343 if (uap->tolen != 0) { 3344 error = getsockaddr(&to, uap->to, uap->tolen); 3345 if (error != 0) { 3346 to = NULL; 3347 goto sctp_bad2; 3348 } 3349 cap_rights_set(&rights, CAP_CONNECT); 3350 } 3351 3352 AUDIT_ARG_FD(uap->sd); 3353 error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL); 3354 if (error != 0) 3355 goto sctp_bad; 3356 #ifdef KTRACE 3357 if (to && (KTRPOINT(td, KTR_STRUCT))) 3358 ktrsockaddr(to); 3359 #endif 3360 3361 iov[0].iov_base = uap->msg; 3362 iov[0].iov_len = uap->mlen; 3363 3364 so = (struct socket *)fp->f_data; 3365 if (so->so_proto->pr_protocol != IPPROTO_SCTP) { 3366 error = EOPNOTSUPP; 3367 goto sctp_bad; 3368 } 3369 #ifdef MAC 3370 error = mac_socket_check_send(td->td_ucred, so); 3371 if (error != 0) 3372 goto sctp_bad; 3373 #endif /* MAC */ 3374 3375 auio.uio_iov = iov; 3376 auio.uio_iovcnt = 1; 3377 auio.uio_segflg = UIO_USERSPACE; 3378 auio.uio_rw = UIO_WRITE; 3379 auio.uio_td = td; 3380 auio.uio_offset = 0; /* XXX */ 3381 auio.uio_resid = 0; 3382 len = auio.uio_resid = uap->mlen; 3383 CURVNET_SET(so->so_vnet); 3384 error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL, 3385 (struct mbuf *)NULL, uap->flags, u_sinfo, td); 3386 CURVNET_RESTORE(); 3387 if (error != 0) { 3388 if (auio.uio_resid != len && (error == ERESTART || 3389 error == EINTR || error == EWOULDBLOCK)) 3390 error = 0; 3391 /* Generation of SIGPIPE can be controlled per socket. 
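 * It is suppressed when the SO_NOSIGPIPE socket option is set
 * or when the caller passed MSG_NOSIGNAL, e.g. from userland:
 *
 *	int on = 1;
 *
 *	(void)setsockopt(sd, SOL_SOCKET, SO_NOSIGPIPE, &on,
 *	    sizeof(on));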
*/ 3392 if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && 3393 !(uap->flags & MSG_NOSIGNAL)) { 3394 PROC_LOCK(td->td_proc); 3395 tdsignal(td, SIGPIPE); 3396 PROC_UNLOCK(td->td_proc); 3397 } 3398 } 3399 if (error == 0) 3400 td->td_retval[0] = len - auio.uio_resid; 3401 #ifdef KTRACE 3402 if (ktruio != NULL) { 3403 ktruio->uio_resid = td->td_retval[0]; 3404 ktrgenio(uap->sd, UIO_WRITE, ktruio, error); 3405 } 3406 #endif /* KTRACE */ 3407 sctp_bad: 3408 if (fp != NULL) 3409 fdrop(fp, td); 3410 sctp_bad2: 3411 free(to, M_SONAME); 3412 return (error); 3413 #else /* SCTP */ 3414 return (EOPNOTSUPP); 3415 #endif /* SCTP */ 3416 } 3417 3418 int 3419 sys_sctp_generic_sendmsg_iov(td, uap) 3420 struct thread *td; 3421 struct sctp_generic_sendmsg_iov_args /* { 3422 int sd, 3423 struct iovec *iov, 3424 int iovlen, 3425 caddr_t to, 3426 __socklen_t tolen, 3427 struct sctp_sndrcvinfo *sinfo, 3428 int flags 3429 } */ *uap; 3430 { 3431 #if (defined(INET) || defined(INET6)) && defined(SCTP) 3432 struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL; 3433 struct socket *so; 3434 struct file *fp = NULL; 3435 struct sockaddr *to = NULL; 3436 #ifdef KTRACE 3437 struct uio *ktruio = NULL; 3438 #endif 3439 struct uio auio; 3440 struct iovec *iov, *tiov; 3441 cap_rights_t rights; 3442 ssize_t len; 3443 int error, i; 3444 3445 if (uap->sinfo != NULL) { 3446 error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); 3447 if (error != 0) 3448 return (error); 3449 u_sinfo = &sinfo; 3450 } 3451 cap_rights_init(&rights, CAP_SEND); 3452 if (uap->tolen != 0) { 3453 error = getsockaddr(&to, uap->to, uap->tolen); 3454 if (error != 0) { 3455 to = NULL; 3456 goto sctp_bad2; 3457 } 3458 cap_rights_set(&rights, CAP_CONNECT); 3459 } 3460 3461 AUDIT_ARG_FD(uap->sd); 3462 error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL); 3463 if (error != 0) 3464 goto sctp_bad1; 3465 3466 #ifdef COMPAT_FREEBSD32 3467 if (SV_CURPROC_FLAG(SV_ILP32)) 3468 error = freebsd32_copyiniov((struct iovec32 *)uap->iov, 3469 uap->iovlen, &iov, EMSGSIZE); 3470 else 3471 #endif 3472 error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE); 3473 if (error != 0) 3474 goto sctp_bad1; 3475 #ifdef KTRACE 3476 if (to && (KTRPOINT(td, KTR_STRUCT))) 3477 ktrsockaddr(to); 3478 #endif 3479 3480 so = (struct socket *)fp->f_data; 3481 if (so->so_proto->pr_protocol != IPPROTO_SCTP) { 3482 error = EOPNOTSUPP; 3483 goto sctp_bad; 3484 } 3485 #ifdef MAC 3486 error = mac_socket_check_send(td->td_ucred, so); 3487 if (error != 0) 3488 goto sctp_bad; 3489 #endif /* MAC */ 3490 3491 auio.uio_iov = iov; 3492 auio.uio_iovcnt = uap->iovlen; 3493 auio.uio_segflg = UIO_USERSPACE; 3494 auio.uio_rw = UIO_WRITE; 3495 auio.uio_td = td; 3496 auio.uio_offset = 0; /* XXX */ 3497 auio.uio_resid = 0; 3498 tiov = iov; 3499 for (i = 0; i <uap->iovlen; i++, tiov++) { 3500 if ((auio.uio_resid += tiov->iov_len) < 0) { 3501 error = EINVAL; 3502 goto sctp_bad; 3503 } 3504 } 3505 len = auio.uio_resid; 3506 CURVNET_SET(so->so_vnet); 3507 error = sctp_lower_sosend(so, to, &auio, 3508 (struct mbuf *)NULL, (struct mbuf *)NULL, 3509 uap->flags, u_sinfo, td); 3510 CURVNET_RESTORE(); 3511 if (error != 0) { 3512 if (auio.uio_resid != len && (error == ERESTART || 3513 error == EINTR || error == EWOULDBLOCK)) 3514 error = 0; 3515 /* Generation of SIGPIPE can be controlled per socket */ 3516 if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && 3517 !(uap->flags & MSG_NOSIGNAL)) { 3518 PROC_LOCK(td->td_proc); 3519 tdsignal(td, SIGPIPE); 3520 PROC_UNLOCK(td->td_proc); 3521 } 3522 } 3523 if (error == 
0) 3524 td->td_retval[0] = len - auio.uio_resid; 3525 #ifdef KTRACE 3526 if (ktruio != NULL) { 3527 ktruio->uio_resid = td->td_retval[0]; 3528 ktrgenio(uap->sd, UIO_WRITE, ktruio, error); 3529 } 3530 #endif /* KTRACE */ 3531 sctp_bad: 3532 free(iov, M_IOV); 3533 sctp_bad1: 3534 if (fp != NULL) 3535 fdrop(fp, td); 3536 sctp_bad2: 3537 free(to, M_SONAME); 3538 return (error); 3539 #else /* SCTP */ 3540 return (EOPNOTSUPP); 3541 #endif /* SCTP */ 3542 } 3543 3544 int 3545 sys_sctp_generic_recvmsg(td, uap) 3546 struct thread *td; 3547 struct sctp_generic_recvmsg_args /* { 3548 int sd, 3549 struct iovec *iov, 3550 int iovlen, 3551 struct sockaddr *from, 3552 __socklen_t *fromlenaddr, 3553 struct sctp_sndrcvinfo *sinfo, 3554 int *msg_flags 3555 } */ *uap; 3556 { 3557 #if (defined(INET) || defined(INET6)) && defined(SCTP) 3558 uint8_t sockbufstore[256]; 3559 struct uio auio; 3560 struct iovec *iov, *tiov; 3561 struct sctp_sndrcvinfo sinfo; 3562 struct socket *so; 3563 struct file *fp = NULL; 3564 struct sockaddr *fromsa; 3565 cap_rights_t rights; 3566 #ifdef KTRACE 3567 struct uio *ktruio = NULL; 3568 #endif 3569 ssize_t len; 3570 int error, fromlen, i, msg_flags; 3571 3572 AUDIT_ARG_FD(uap->sd); 3573 error = getsock_cap(td->td_proc->p_fd, uap->sd, 3574 cap_rights_init(&rights, CAP_RECV), &fp, NULL); 3575 if (error != 0) 3576 return (error); 3577 #ifdef COMPAT_FREEBSD32 3578 if (SV_CURPROC_FLAG(SV_ILP32)) 3579 error = freebsd32_copyiniov((struct iovec32 *)uap->iov, 3580 uap->iovlen, &iov, EMSGSIZE); 3581 else 3582 #endif 3583 error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE); 3584 if (error != 0) 3585 goto out1; 3586 3587 so = fp->f_data; 3588 if (so->so_proto->pr_protocol != IPPROTO_SCTP) { 3589 error = EOPNOTSUPP; 3590 goto out; 3591 } 3592 #ifdef MAC 3593 error = mac_socket_check_receive(td->td_ucred, so); 3594 if (error != 0) 3595 goto out; 3596 #endif /* MAC */ 3597 3598 if (uap->fromlenaddr != NULL) { 3599 error = copyin(uap->fromlenaddr, &fromlen, sizeof (fromlen)); 3600 if (error != 0) 3601 goto out; 3602 } else { 3603 fromlen = 0; 3604 } 3605 if (uap->msg_flags) { 3606 error = copyin(uap->msg_flags, &msg_flags, sizeof (int)); 3607 if (error != 0) 3608 goto out; 3609 } else { 3610 msg_flags = 0; 3611 } 3612 auio.uio_iov = iov; 3613 auio.uio_iovcnt = uap->iovlen; 3614 auio.uio_segflg = UIO_USERSPACE; 3615 auio.uio_rw = UIO_READ; 3616 auio.uio_td = td; 3617 auio.uio_offset = 0; /* XXX */ 3618 auio.uio_resid = 0; 3619 tiov = iov; 3620 for (i = 0; i <uap->iovlen; i++, tiov++) { 3621 if ((auio.uio_resid += tiov->iov_len) < 0) { 3622 error = EINVAL; 3623 goto out; 3624 } 3625 } 3626 len = auio.uio_resid; 3627 fromsa = (struct sockaddr *)sockbufstore; 3628 3629 #ifdef KTRACE 3630 if (KTRPOINT(td, KTR_GENIO)) 3631 ktruio = cloneuio(&auio); 3632 #endif /* KTRACE */ 3633 memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo)); 3634 CURVNET_SET(so->so_vnet); 3635 error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL, 3636 fromsa, fromlen, &msg_flags, 3637 (struct sctp_sndrcvinfo *)&sinfo, 1); 3638 CURVNET_RESTORE(); 3639 if (error != 0) { 3640 if (auio.uio_resid != len && (error == ERESTART || 3641 error == EINTR || error == EWOULDBLOCK)) 3642 error = 0; 3643 } else { 3644 if (uap->sinfo) 3645 error = copyout(&sinfo, uap->sinfo, sizeof (sinfo)); 3646 } 3647 #ifdef KTRACE 3648 if (ktruio != NULL) { 3649 ktruio->uio_resid = len - auio.uio_resid; 3650 ktrgenio(uap->sd, UIO_READ, ktruio, error); 3651 } 3652 #endif /* KTRACE */ 3653 if (error != 0) 3654 goto out; 3655 td->td_retval[0] = len - 
auio.uio_resid; 3656 3657 if (fromlen && uap->from) { 3658 len = fromlen; 3659 if (len <= 0 || fromsa == NULL) 3660 len = 0; 3661 else { 3662 len = MIN(len, fromsa->sa_len); 3663 error = copyout(fromsa, uap->from, (size_t)len); 3664 if (error != 0) 3665 goto out; 3666 } 3667 /* Copy the length out via the int-sized 'fromlen', not the wider 'len'. */ fromlen = len; error = copyout(&fromlen, uap->fromlenaddr, sizeof (socklen_t)); 3668 if (error != 0) 3669 goto out; 3670 } 3671 #ifdef KTRACE 3672 if (KTRPOINT(td, KTR_STRUCT)) 3673 ktrsockaddr(fromsa); 3674 #endif 3675 if (uap->msg_flags) { 3676 error = copyout(&msg_flags, uap->msg_flags, sizeof (int)); 3677 if (error != 0) 3678 goto out; 3679 } 3680 out: 3681 free(iov, M_IOV); 3682 out1: 3683 if (fp != NULL) 3684 fdrop(fp, td); 3685 3686 return (error); 3687 #else /* SCTP */ 3688 return (EOPNOTSUPP); 3689 #endif /* SCTP */ 3690 } 3691
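/*
 * For reference, an illustrative userland sketch of the SF_KQUEUE
 * extension above. It assumes the user memory layout implied by
 * do_sendfile(): a struct sf_hdtr immediately followed by a
 * struct sf_hdtr_kq. Variable names are hypothetical; only
 * EV_CLEAR, EV_DISPATCH and EV_ONESHOT are accepted in kq_flags.
 *
 *	struct {
 *		struct sf_hdtr hdtr;
 *		struct sf_hdtr_kq kq;
 *	} h;
 *	struct kevent ev;
 *	off_t sbytes;
 *	int kq;
 *
 *	kq = kqueue();
 *	memset(&h, 0, sizeof(h));
 *	h.kq.kq_fd = kq;
 *	h.kq.kq_flags = EV_ONESHOT;
 *	h.kq.kq_ident = 1;
 *	if (sendfile(file_fd, sock_fd, 0, 0, &h.hdtr, &sbytes,
 *	    SF_KQUEUE) == -1)
 *		err(1, "sendfile");
 *	if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1 &&
 *	    ev.filter == EVFILT_SENDFILE)
 *		;	(the transfer has fully drained to the socket)
 */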