/*-
 * Copyright (c) 1982, 1986, 1989, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * sendfile(2) and related extensions:
 * Copyright (c) 1998, David Greenman. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_capsicum.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_sctp.h"
#include "opt_compat.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/condvar.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/malloc.h>
#include <sys/filedesc.h>
#include <sys/event.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/jail.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/rwlock.h>
#include <sys/sf_buf.h>
#include <sys/sf_sync.h>
#include <sys/sf_base.h>
#include <sys/sysent.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32_util.h>
#endif

#include <net/vnet.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>

#if defined(INET) || defined(INET6)
#ifdef SCTP
#include <netinet/sctp.h>
#include <netinet/sctp_peeloff.h>
#endif /* SCTP */
#endif /* INET || INET6 */
/*
 * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC
 * and SOCK_NONBLOCK.
 */
#define	ACCEPT4_INHERIT	0x1
#define	ACCEPT4_COMPAT	0x2

static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);

static int accept1(struct thread *td, int s, struct sockaddr *uname,
    socklen_t *anamelen, int flags);
static int do_sendfile(struct thread *td, struct sendfile_args *uap,
    int compat);
static int getsockname1(struct thread *td, struct getsockname_args *uap,
    int compat);
static int getpeername1(struct thread *td, struct getpeername_args *uap,
    int compat);

counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];

static int filt_sfsync_attach(struct knote *kn);
static void filt_sfsync_detach(struct knote *kn);
static int filt_sfsync(struct knote *kn, long hint);

/*
 * sendfile(2)-related variables and associated sysctls
 */
static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0,
    "sendfile(2) tunables");
static int sfreadahead = 1;
SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW,
    &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks");

#ifdef SFSYNC_DEBUG
static int sf_sync_debug = 0;
SYSCTL_INT(_debug, OID_AUTO, sf_sync_debug, CTLFLAG_RW,
    &sf_sync_debug, 0, "Output debugging during sf_sync lifecycle");
#define	SFSYNC_DPRINTF(s, ...)				\
		do {					\
			if (sf_sync_debug)		\
				printf((s), ##__VA_ARGS__); \
		} while (0)
#else
#define	SFSYNC_DPRINTF(c, ...)
#endif

static uma_zone_t	zone_sfsync;

static struct filterops sendfile_filtops = {
	.f_isfd = 0,
	.f_attach = filt_sfsync_attach,
	.f_detach = filt_sfsync_detach,
	.f_event = filt_sfsync,
};

static void
sfstat_init(const void *unused)
{

	COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
	    M_WAITOK);
}
SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);

static void
sf_sync_init(const void *unused)
{

	zone_sfsync = uma_zcreate("sendfile_sync", sizeof(struct sendfile_sync),
	    NULL, NULL,
	    NULL, NULL,
	    UMA_ALIGN_CACHE,
	    0);
	kqueue_add_filteropts(EVFILT_SENDFILE, &sendfile_filtops);
}
SYSINIT(sf_sync, SI_SUB_MBUF, SI_ORDER_FIRST, sf_sync_init, NULL);

static int
sfstat_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct sfstat s;

	COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
	if (req->newptr)
		COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
	return (SYSCTL_OUT(req, &s, sizeof(s)));
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
    NULL, 0, sfstat_sysctl, "I", "sendfile statistics");
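
/*
 * Example (illustrative sketch, not part of the original file): the
 * handler above exports the counters as one struct and zeroes them on
 * any write to the OID.  Assuming the sf_iocnt member of struct sfstat,
 * a userland reader might look like:
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	struct sfstat s;
 *	size_t len = sizeof(s);
 *	if (sysctlbyname("kern.ipc.sfstat", &s, &len, NULL, 0) == 0)
 *		printf("sendfile disk I/Os: %ju\n", (uintmax_t)s.sf_iocnt);
 */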

/*
 * Convert a user file descriptor to a kernel file entry and check if required
 * capability rights are present.
 * A reference on the file entry is held upon returning.
 */
static int
getsock_cap(struct filedesc *fdp, int fd, cap_rights_t *rightsp,
    struct file **fpp, u_int *fflagp)
{
	struct file *fp;
	int error;

	error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
	if (error != 0)
		return (error);
	if (fp->f_type != DTYPE_SOCKET) {
		fdrop(fp, curthread);
		return (ENOTSOCK);
	}
	if (fflagp != NULL)
		*fflagp = fp->f_flag;
	*fpp = fp;
	return (0);
}

/*
 * System call interface to the socket abstraction.
 */
#if defined(COMPAT_43)
#define	COMPAT_OLDSOCK
#endif

int
sys_socket(td, uap)
	struct thread *td;
	struct socket_args /* {
		int	domain;
		int	type;
		int	protocol;
	} */ *uap;
{
	struct socket *so;
	struct file *fp;
	int fd, error, type, oflag, fflag;

	AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);

	type = uap->type;
	oflag = 0;
	fflag = 0;
	if ((type & SOCK_CLOEXEC) != 0) {
		type &= ~SOCK_CLOEXEC;
		oflag |= O_CLOEXEC;
	}
	if ((type & SOCK_NONBLOCK) != 0) {
		type &= ~SOCK_NONBLOCK;
		fflag |= FNONBLOCK;
	}

#ifdef MAC
	error = mac_socket_check_create(td->td_ucred, uap->domain, type,
	    uap->protocol);
	if (error != 0)
		return (error);
#endif
	error = falloc(td, &fp, &fd, oflag);
	if (error != 0)
		return (error);
	/* An extra reference on `fp' has been held for us by falloc(). */
	error = socreate(uap->domain, &so, type, uap->protocol,
	    td->td_ucred, td);
	if (error != 0) {
		fdclose(td->td_proc->p_fd, fp, fd, td);
	} else {
		finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
		if ((fflag & FNONBLOCK) != 0)
			(void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
		td->td_retval[0] = fd;
	}
	fdrop(fp, td);
	return (error);
}

/* ARGSUSED */
int
sys_bind(td, uap)
	struct thread *td;
	struct bind_args /* {
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_bind(td, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}

static int
kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
	error = getsock_cap(td->td_proc->p_fd, fd,
	    cap_rights_init(&rights, CAP_BIND), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(sa);
#endif
#ifdef MAC
	error = mac_socket_check_bind(td->td_ucred, so, sa);
	if (error == 0) {
#endif
		if (dirfd == AT_FDCWD)
			error = sobind(so, sa, td);
		else
			error = sobindat(dirfd, so, sa, td);
#ifdef MAC
	}
#endif
	fdrop(fp, td);
	return (error);
}

int
kern_bind(struct thread *td, int fd, struct sockaddr *sa)
{

	return (kern_bindat(td, AT_FDCWD, fd, sa));
}

/* ARGSUSED */
int
sys_bindat(td, uap)
	struct thread *td;
	struct bindat_args /* {
		int	fd;
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_bindat(td, uap->fd, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}
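
/*
 * Example (illustrative sketch, not part of the original file): bindat(2)
 * resolves an AF_UNIX path relative to a directory descriptor, much like
 * openat(2) does for files.  A userland caller might do:
 *
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *	#include <fcntl.h>
 *	#include <string.h>
 *
 *	struct sockaddr_un sun;
 *	int dfd = open("/var/run", O_RDONLY | O_DIRECTORY);
 *	int s = socket(AF_UNIX, SOCK_STREAM, 0);
 *	memset(&sun, 0, sizeof(sun));
 *	sun.sun_family = AF_UNIX;
 *	strlcpy(sun.sun_path, "example.sock", sizeof(sun.sun_path));
 *	sun.sun_len = SUN_LEN(&sun);
 *	bindat(dfd, s, (struct sockaddr *)&sun, sun.sun_len);
 */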

/* ARGSUSED */
int
sys_listen(td, uap)
	struct thread *td;
	struct listen_args /* {
		int	s;
		int	backlog;
	} */ *uap;
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(uap->s);
	error = getsock_cap(td->td_proc->p_fd, uap->s,
	    cap_rights_init(&rights, CAP_LISTEN), &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
#ifdef MAC
		error = mac_socket_check_listen(td->td_ucred, so);
		if (error == 0)
#endif
			error = solisten(so, uap->backlog, td);
		fdrop(fp, td);
	}
	return (error);
}

/*
 * accept1()
 */
static int
accept1(td, s, uname, anamelen, flags)
	struct thread *td;
	int s;
	struct sockaddr *uname;
	socklen_t *anamelen;
	int flags;
{
	struct sockaddr *name;
	socklen_t namelen;
	struct file *fp;
	int error;

	if (uname == NULL)
		return (kern_accept4(td, s, NULL, NULL, flags, NULL));

	error = copyin(anamelen, &namelen, sizeof (namelen));
	if (error != 0)
		return (error);

	error = kern_accept4(td, s, &name, &namelen, flags, &fp);

	if (error != 0)
		return (error);

	if (error == 0 && uname != NULL) {
#ifdef COMPAT_OLDSOCK
		if (flags & ACCEPT4_COMPAT)
			((struct osockaddr *)name)->sa_family =
			    name->sa_family;
#endif
		error = copyout(name, uname, namelen);
	}
	if (error == 0)
		error = copyout(&namelen, anamelen,
		    sizeof(namelen));
	if (error != 0)
		fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
	fdrop(fp, td);
	free(name, M_SONAME);
	return (error);
}

int
kern_accept(struct thread *td, int s, struct sockaddr **name,
    socklen_t *namelen, struct file **fp)
{
	return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
}

int
kern_accept4(struct thread *td, int s, struct sockaddr **name,
    socklen_t *namelen, int flags, struct file **fp)
{
	struct filedesc *fdp;
	struct file *headfp, *nfp = NULL;
	struct sockaddr *sa = NULL;
	struct socket *head, *so;
	cap_rights_t rights;
	u_int fflag;
	pid_t pgid;
	int error, fd, tmp;

	if (name != NULL)
		*name = NULL;

	AUDIT_ARG_FD(s);
	fdp = td->td_proc->p_fd;
	error = getsock_cap(fdp, s, cap_rights_init(&rights, CAP_ACCEPT),
	    &headfp, &fflag);
	if (error != 0)
		return (error);
	head = headfp->f_data;
	if ((head->so_options & SO_ACCEPTCONN) == 0) {
		error = EINVAL;
		goto done;
	}
#ifdef MAC
	error = mac_socket_check_accept(td->td_ucred, head);
	if (error != 0)
		goto done;
#endif
	error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0);
	if (error != 0)
		goto done;
	ACCEPT_LOCK();
	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
		ACCEPT_UNLOCK();
		error = EWOULDBLOCK;
		goto noconnection;
	}
	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
			head->so_error = ECONNABORTED;
			break;
		}
		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
		    "accept", 0);
		if (error != 0) {
			ACCEPT_UNLOCK();
			goto noconnection;
		}
	}
	if (head->so_error) {
		error = head->so_error;
		head->so_error = 0;
		ACCEPT_UNLOCK();
		goto noconnection;
	}
	so = TAILQ_FIRST(&head->so_comp);
	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));

	/*
	 * Before changing the flags on the socket, we have to bump the
	 * reference count.  Otherwise, if the protocol calls sofree(),
	 * the socket will be released due to a zero refcount.
	 */
	SOCK_LOCK(so);			/* soref() and so_state update */
	soref(so);			/* file descriptor reference */

	TAILQ_REMOVE(&head->so_comp, so, so_list);
	head->so_qlen--;
	if (flags & ACCEPT4_INHERIT)
		so->so_state |= (head->so_state & SS_NBIO);
	else
		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
	so->so_qstate &= ~SQ_COMP;
	so->so_head = NULL;

	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();

	/* An extra reference on `nfp' has been held for us by falloc(). */
	td->td_retval[0] = fd;

	/* connection has been removed from the listen queue */
	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);

	if (flags & ACCEPT4_INHERIT) {
		pgid = fgetown(&head->so_sigio);
		if (pgid != 0)
			fsetown(pgid, &so->so_sigio);
	} else {
		fflag &= ~(FNONBLOCK | FASYNC);
		if (flags & SOCK_NONBLOCK)
			fflag |= FNONBLOCK;
	}

	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
	/* Sync socket nonblocking/async state with file flags */
	tmp = fflag & FNONBLOCK;
	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
	tmp = fflag & FASYNC;
	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
	sa = NULL;
	error = soaccept(so, &sa);
	if (error != 0)
		goto noconnection;
	if (sa == NULL) {
		if (name)
			*namelen = 0;
		goto done;
	}
	AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
	if (name) {
		/* check sa_len before it is destroyed */
		if (*namelen > sa->sa_len)
			*namelen = sa->sa_len;
#ifdef KTRACE
		if (KTRPOINT(td, KTR_STRUCT))
			ktrsockaddr(sa);
#endif
		*name = sa;
		sa = NULL;
	}
noconnection:
	free(sa, M_SONAME);

	/*
	 * close the new descriptor, assuming someone hasn't ripped it
	 * out from under us.
	 */
	if (error != 0)
		fdclose(fdp, nfp, fd, td);

	/*
	 * Release explicitly held references before returning.  We return
	 * a reference on nfp to the caller on success if they request it.
	 */
done:
	if (fp != NULL) {
		if (error == 0) {
			*fp = nfp;
			nfp = NULL;
		} else
			*fp = NULL;
	}
	if (nfp != NULL)
		fdrop(nfp, td);
	fdrop(headfp, td);
	return (error);
}

int
sys_accept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{

	return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
}

int
sys_accept4(td, uap)
	struct thread *td;
	struct accept4_args *uap;
{

	if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return (EINVAL);

	return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
}

#ifdef COMPAT_OLDSOCK
int
oaccept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{

	return (accept1(td, uap->s, uap->name, uap->anamelen,
	    ACCEPT4_INHERIT | ACCEPT4_COMPAT));
}
#endif /* COMPAT_OLDSOCK */

/* ARGSUSED */
int
sys_connect(td, uap)
	struct thread *td;
	struct connect_args /* {
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_connect(td, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}

static int
kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error, interrupted = 0;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
	error = getsock_cap(td->td_proc->p_fd, fd,
	    cap_rights_init(&rights, CAP_CONNECT), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
	if (so->so_state & SS_ISCONNECTING) {
		error = EALREADY;
		goto done1;
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(sa);
#endif
#ifdef MAC
	error = mac_socket_check_connect(td->td_ucred, so, sa);
	if (error != 0)
		goto bad;
#endif
	if (dirfd == AT_FDCWD)
		error = soconnect(so, sa, td);
	else
		error = soconnectat(dirfd, so, sa, td);
	if (error != 0)
		goto bad;
	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
		error = EINPROGRESS;
		goto done1;
	}
	SOCK_LOCK(so);
	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
		    "connec", 0);
		if (error != 0) {
			if (error == EINTR || error == ERESTART)
				interrupted = 1;
			break;
		}
	}
	if (error == 0) {
		error = so->so_error;
		so->so_error = 0;
	}
	SOCK_UNLOCK(so);
bad:
	if (!interrupted)
		so->so_state &= ~SS_ISCONNECTING;
	if (error == ERESTART)
		error = EINTR;
done1:
	fdrop(fp, td);
	return (error);
}

int
kern_connect(struct thread *td, int fd, struct sockaddr *sa)
{

	return (kern_connectat(td, AT_FDCWD, fd, sa));
}

/* ARGSUSED */
int
sys_connectat(td, uap)
	struct thread *td;
	struct connectat_args /* {
		int	fd;
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_connectat(td, uap->fd, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}
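
/*
 * Example (illustrative sketch, not part of the original file): with
 * accept4(2), the new descriptor's nonblocking/close-on-exec state comes
 * from the flags argument rather than being inherited from the listener
 * (the ACCEPT4_INHERIT path above):
 *
 *	#include <sys/socket.h>
 *	#include <err.h>
 *
 *	int c = accept4(s, NULL, NULL, SOCK_NONBLOCK | SOCK_CLOEXEC);
 *	if (c == -1)
 *		err(1, "accept4");
 */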

int
kern_socketpair(struct thread *td, int domain, int type, int protocol,
    int *rsv)
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *fp1, *fp2;
	struct socket *so1, *so2;
	int fd, error, oflag, fflag;

	AUDIT_ARG_SOCKET(domain, type, protocol);

	oflag = 0;
	fflag = 0;
	if ((type & SOCK_CLOEXEC) != 0) {
		type &= ~SOCK_CLOEXEC;
		oflag |= O_CLOEXEC;
	}
	if ((type & SOCK_NONBLOCK) != 0) {
		type &= ~SOCK_NONBLOCK;
		fflag |= FNONBLOCK;
	}
#ifdef MAC
	/* We might want to have a separate check for socket pairs. */
	error = mac_socket_check_create(td->td_ucred, domain, type,
	    protocol);
	if (error != 0)
		return (error);
#endif
	error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
	if (error != 0)
		return (error);
	error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
	if (error != 0)
		goto free1;
	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
	error = falloc(td, &fp1, &fd, oflag);
	if (error != 0)
		goto free2;
	rsv[0] = fd;
	fp1->f_data = so1;	/* so1 already has ref count */
	error = falloc(td, &fp2, &fd, oflag);
	if (error != 0)
		goto free3;
	fp2->f_data = so2;	/* so2 already has ref count */
	rsv[1] = fd;
	error = soconnect2(so1, so2);
	if (error != 0)
		goto free4;
	if (type == SOCK_DGRAM) {
		/*
		 * Datagram socket connection is asymmetric.
		 */
		error = soconnect2(so2, so1);
		if (error != 0)
			goto free4;
	}
	finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
	    &socketops);
	finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
	    &socketops);
	if ((fflag & FNONBLOCK) != 0) {
		(void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
		(void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
	}
	fdrop(fp1, td);
	fdrop(fp2, td);
	return (0);
free4:
	fdclose(fdp, fp2, rsv[1], td);
	fdrop(fp2, td);
free3:
	fdclose(fdp, fp1, rsv[0], td);
	fdrop(fp1, td);
free2:
	if (so2 != NULL)
		(void)soclose(so2);
free1:
	if (so1 != NULL)
		(void)soclose(so1);
	return (error);
}

int
sys_socketpair(struct thread *td, struct socketpair_args *uap)
{
	int error, sv[2];

	error = kern_socketpair(td, uap->domain, uap->type,
	    uap->protocol, sv);
	if (error != 0)
		return (error);
	error = copyout(sv, uap->rsv, 2 * sizeof(int));
	if (error != 0) {
		(void)kern_close(td, sv[0]);
		(void)kern_close(td, sv[1]);
	}
	return (error);
}
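
/*
 * Example (illustrative sketch, not part of the original file): the
 * SOCK_CLOEXEC/SOCK_NONBLOCK stripping above means both descriptors of a
 * pair can be created with the extra flags in a single call:
 *
 *	#include <sys/socket.h>
 *	#include <err.h>
 *
 *	int sv[2];
 *	if (socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, sv) == -1)
 *		err(1, "socketpair");
 */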

static int
sendit(td, s, mp, flags)
	struct thread *td;
	int s;
	struct msghdr *mp;
	int flags;
{
	struct mbuf *control;
	struct sockaddr *to;
	int error;

#ifdef CAPABILITY_MODE
	if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
		return (ECAPMODE);
#endif

	if (mp->msg_name != NULL) {
		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
		if (error != 0) {
			to = NULL;
			goto bad;
		}
		mp->msg_name = to;
	} else {
		to = NULL;
	}

	if (mp->msg_control) {
		if (mp->msg_controllen < sizeof(struct cmsghdr)
#ifdef COMPAT_OLDSOCK
		    && mp->msg_flags != MSG_COMPAT
#endif
		) {
			error = EINVAL;
			goto bad;
		}
		error = sockargs(&control, mp->msg_control,
		    mp->msg_controllen, MT_CONTROL);
		if (error != 0)
			goto bad;
#ifdef COMPAT_OLDSOCK
		if (mp->msg_flags == MSG_COMPAT) {
			struct cmsghdr *cm;

			M_PREPEND(control, sizeof(*cm), M_WAITOK);
			cm = mtod(control, struct cmsghdr *);
			cm->cmsg_len = control->m_len;
			cm->cmsg_level = SOL_SOCKET;
			cm->cmsg_type = SCM_RIGHTS;
		}
#endif
	} else {
		control = NULL;
	}

	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);

bad:
	free(to, M_SONAME);
	return (error);
}

int
kern_sendit(td, s, mp, flags, control, segflg)
	struct thread *td;
	int s;
	struct msghdr *mp;
	int flags;
	struct mbuf *control;
	enum uio_seg segflg;
{
	struct file *fp;
	struct uio auio;
	struct iovec *iov;
	struct socket *so;
	cap_rights_t rights;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	ssize_t len;
	int i, error;

	AUDIT_ARG_FD(s);
	cap_rights_init(&rights, CAP_SEND);
	if (mp->msg_name != NULL) {
		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
		cap_rights_set(&rights, CAP_CONNECT);
	}
	error = getsock_cap(td->td_proc->p_fd, s, &rights, &fp, NULL);
	if (error != 0)
		return (error);
	so = (struct socket *)fp->f_data;

#ifdef KTRACE
	if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(mp->msg_name);
#endif
#ifdef MAC
	if (mp->msg_name != NULL) {
		error = mac_socket_check_connect(td->td_ucred, so,
		    mp->msg_name);
		if (error != 0)
			goto bad;
	}
	error = mac_socket_check_send(td->td_ucred, so);
	if (error != 0)
		goto bad;
#endif

	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = segflg;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			error = EINVAL;
			goto bad;
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	len = auio.uio_resid;
	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
	if (error != 0) {
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Generation of SIGPIPE can be controlled per socket */
		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
		    !(flags & MSG_NOSIGNAL)) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	if (error == 0)
		td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = td->td_retval[0];
		ktrgenio(s, UIO_WRITE, ktruio, error);
	}
#endif
bad:
	fdrop(fp, td);
	return (error);
}

int
sys_sendto(td, uap)
	struct thread *td;
	struct sendto_args /* {
		int	s;
		caddr_t	buf;
		size_t	len;
		int	flags;
		caddr_t	to;
		int	tolen;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;

	msg.msg_name = uap->to;
	msg.msg_namelen = uap->tolen;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	msg.msg_control = 0;
#ifdef COMPAT_OLDSOCK
	msg.msg_flags = 0;
#endif
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	return (sendit(td, uap->s, &msg, uap->flags));
}

#ifdef COMPAT_OLDSOCK
int
osend(td, uap)
	struct thread *td;
	struct osend_args /* {
		int	s;
		caddr_t	buf;
		int	len;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;

	msg.msg_name = 0;
	msg.msg_namelen = 0;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	msg.msg_control = 0;
	msg.msg_flags = 0;
	return (sendit(td, uap->s, &msg, uap->flags));
}

int
osendmsg(td, uap)
	struct thread *td;
	struct osendmsg_args /* {
		int	s;
		caddr_t	msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_iov = iov;
	msg.msg_flags = MSG_COMPAT;
	error = sendit(td, uap->s, &msg, uap->flags);
	free(iov, M_IOV);
	return (error);
}
#endif

int
sys_sendmsg(td, uap)
	struct thread *td;
	struct sendmsg_args /* {
		int	s;
		caddr_t	msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (msg));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_iov = iov;
#ifdef COMPAT_OLDSOCK
	msg.msg_flags = 0;
#endif
	error = sendit(td, uap->s, &msg, uap->flags);
	free(iov, M_IOV);
	return (error);
}
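
/*
 * Example (illustrative sketch, not part of the original file): the
 * msg_control path in sendit() above expects a buffer of cmsghdr
 * records, which is how userland passes a descriptor over a unix
 * socket.  Here fd_to_pass is a hypothetical descriptor variable:
 *
 *	#include <sys/socket.h>
 *	#include <string.h>
 *
 *	union { struct cmsghdr hdr; char buf[CMSG_SPACE(sizeof(int))]; } cm;
 *	char c = 0;
 *	struct iovec iov = { .iov_base = &c, .iov_len = 1 };
 *	struct msghdr msg;
 *	struct cmsghdr *cmp;
 *
 *	memset(&msg, 0, sizeof(msg));
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = cm.buf;
 *	msg.msg_controllen = sizeof(cm.buf);
 *	cmp = CMSG_FIRSTHDR(&msg);
 *	cmp->cmsg_len = CMSG_LEN(sizeof(int));
 *	cmp->cmsg_level = SOL_SOCKET;
 *	cmp->cmsg_type = SCM_RIGHTS;
 *	memcpy(CMSG_DATA(cmp), &fd_to_pass, sizeof(int));
 *	sendmsg(s, &msg, 0);
 */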

int
kern_recvit(td, s, mp, fromseg, controlp)
	struct thread *td;
	int s;
	struct msghdr *mp;
	enum uio_seg fromseg;
	struct mbuf **controlp;
{
	struct uio auio;
	struct iovec *iov;
	struct mbuf *m, *control = NULL;
	caddr_t ctlbuf;
	struct file *fp;
	struct socket *so;
	struct sockaddr *fromsa = NULL;
	cap_rights_t rights;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	ssize_t len;
	int error, i;

	if (controlp != NULL)
		*controlp = NULL;

	AUDIT_ARG_FD(s);
	error = getsock_cap(td->td_proc->p_fd, s,
	    cap_rights_init(&rights, CAP_RECV), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;

#ifdef MAC
	error = mac_socket_check_receive(td->td_ucred, so);
	if (error != 0) {
		fdrop(fp, td);
		return (error);
	}
#endif

	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			fdrop(fp, td);
			return (EINVAL);
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	len = auio.uio_resid;
	error = soreceive(so, &fromsa, &auio, NULL,
	    (mp->msg_control || controlp) ? &control : NULL,
	    &mp->msg_flags);
	if (error != 0) {
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	if (fromsa != NULL)
		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = len - auio.uio_resid;
		ktrgenio(s, UIO_READ, ktruio, error);
	}
#endif
	if (error != 0)
		goto out;
	td->td_retval[0] = len - auio.uio_resid;
	if (mp->msg_name) {
		len = mp->msg_namelen;
		if (len <= 0 || fromsa == NULL)
			len = 0;
		else {
			/* save sa_len before it is destroyed by MSG_COMPAT */
			len = MIN(len, fromsa->sa_len);
#ifdef COMPAT_OLDSOCK
			if (mp->msg_flags & MSG_COMPAT)
				((struct osockaddr *)fromsa)->sa_family =
				    fromsa->sa_family;
#endif
			if (fromseg == UIO_USERSPACE) {
				error = copyout(fromsa, mp->msg_name,
				    (unsigned)len);
				if (error != 0)
					goto out;
			} else
				bcopy(fromsa, mp->msg_name, len);
		}
		mp->msg_namelen = len;
	}
	if (mp->msg_control && controlp == NULL) {
#ifdef COMPAT_OLDSOCK
		/*
		 * We assume that old recvmsg calls won't receive access
		 * rights and other control info, esp. as control info
		 * is always optional and those options didn't exist in 4.3.
		 * If we receive rights, trim the cmsghdr; anything else
		 * is tossed.
		 */
		if (control && mp->msg_flags & MSG_COMPAT) {
			if (mtod(control, struct cmsghdr *)->cmsg_level !=
			    SOL_SOCKET ||
			    mtod(control, struct cmsghdr *)->cmsg_type !=
			    SCM_RIGHTS) {
				mp->msg_controllen = 0;
				goto out;
			}
			control->m_len -= sizeof (struct cmsghdr);
			control->m_data += sizeof (struct cmsghdr);
		}
#endif
		len = mp->msg_controllen;
		m = control;
		mp->msg_controllen = 0;
		ctlbuf = mp->msg_control;

		while (m && len > 0) {
			unsigned int tocopy;

			if (len >= m->m_len)
				tocopy = m->m_len;
			else {
				mp->msg_flags |= MSG_CTRUNC;
				tocopy = len;
			}

			if ((error = copyout(mtod(m, caddr_t),
			    ctlbuf, tocopy)) != 0)
				goto out;

			ctlbuf += tocopy;
			len -= tocopy;
			m = m->m_next;
		}
		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
	}
out:
	fdrop(fp, td);
#ifdef KTRACE
	if (fromsa && KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(fromsa);
#endif
	free(fromsa, M_SONAME);

	if (error == 0 && controlp != NULL)
		*controlp = control;
	else if (control)
		m_freem(control);

	return (error);
}

static int
recvit(td, s, mp, namelenp)
	struct thread *td;
	int s;
	struct msghdr *mp;
	void *namelenp;
{
	int error;

	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
	if (error != 0)
		return (error);
	if (namelenp != NULL) {
		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
#ifdef COMPAT_OLDSOCK
		if (mp->msg_flags & MSG_COMPAT)
			error = 0;	/* old recvfrom didn't check */
#endif
	}
	return (error);
}
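
/*
 * Example (illustrative sketch, not part of the original file): because
 * the control-copy loop above sets MSG_CTRUNC when the user's control
 * buffer is too small, a careful receiver checks the flag before
 * trusting any cmsg data:
 *
 *	#include <sys/socket.h>
 *	#include <string.h>
 *	#include <err.h>
 *
 *	char cbuf[CMSG_SPACE(sizeof(int))];
 *	struct msghdr msg;
 *	memset(&msg, 0, sizeof(msg));
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof(cbuf);
 *	if (recvmsg(s, &msg, 0) >= 0 && (msg.msg_flags & MSG_CTRUNC))
 *		warnx("control data truncated; enlarge msg_controllen");
 */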

int
sys_recvfrom(td, uap)
	struct thread *td;
	struct recvfrom_args /* {
		int	s;
		caddr_t	buf;
		size_t	len;
		int	flags;
		struct sockaddr * __restrict	from;
		socklen_t * __restrict fromlenaddr;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;
	int error;

	if (uap->fromlenaddr) {
		error = copyin(uap->fromlenaddr,
		    &msg.msg_namelen, sizeof (msg.msg_namelen));
		if (error != 0)
			goto done2;
	} else {
		msg.msg_namelen = 0;
	}
	msg.msg_name = uap->from;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	msg.msg_control = 0;
	msg.msg_flags = uap->flags;
	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
done2:
	return (error);
}

#ifdef COMPAT_OLDSOCK
int
orecvfrom(td, uap)
	struct thread *td;
	struct recvfrom_args *uap;
{

	uap->flags |= MSG_COMPAT;
	return (sys_recvfrom(td, uap));
}
#endif

#ifdef COMPAT_OLDSOCK
int
orecv(td, uap)
	struct thread *td;
	struct orecv_args /* {
		int	s;
		caddr_t	buf;
		int	len;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;

	msg.msg_name = 0;
	msg.msg_namelen = 0;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	msg.msg_control = 0;
	msg.msg_flags = uap->flags;
	return (recvit(td, uap->s, &msg, NULL));
}

/*
 * Old recvmsg.  This code takes advantage of the fact that the old msghdr
 * overlays the new one, missing only the flags, and with the (old) access
 * rights where the control fields are now.
 */
int
orecvmsg(td, uap)
	struct thread *td;
	struct orecvmsg_args /* {
		int	s;
		struct	omsghdr *msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_flags = uap->flags | MSG_COMPAT;
	msg.msg_iov = iov;
	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
	if (msg.msg_controllen && error == 0)
		error = copyout(&msg.msg_controllen,
		    &uap->msg->msg_accrightslen, sizeof (int));
	free(iov, M_IOV);
	return (error);
}
#endif

int
sys_recvmsg(td, uap)
	struct thread *td;
	struct recvmsg_args /* {
		int	s;
		struct	msghdr *msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *uiov, *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (msg));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_flags = uap->flags;
#ifdef COMPAT_OLDSOCK
	msg.msg_flags &= ~MSG_COMPAT;
#endif
	uiov = msg.msg_iov;
	msg.msg_iov = iov;
	error = recvit(td, uap->s, &msg, NULL);
	if (error == 0) {
		msg.msg_iov = uiov;
		error = copyout(&msg, uap->msg, sizeof(msg));
	}
	free(iov, M_IOV);
	return (error);
}

/* ARGSUSED */
int
sys_shutdown(td, uap)
	struct thread *td;
	struct shutdown_args /* {
		int	s;
		int	how;
	} */ *uap;
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(uap->s);
	error = getsock_cap(td->td_proc->p_fd, uap->s,
	    cap_rights_init(&rights, CAP_SHUTDOWN), &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
		error = soshutdown(so, uap->how);
		fdrop(fp, td);
	}
	return (error);
}

/* ARGSUSED */
int
sys_setsockopt(td, uap)
	struct thread *td;
	struct setsockopt_args /* {
		int	s;
		int	level;
		int	name;
		caddr_t	val;
		int	valsize;
	} */ *uap;
{

	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
	    uap->val, UIO_USERSPACE, uap->valsize));
}

int
kern_setsockopt(td, s, level, name, val, valseg, valsize)
	struct thread *td;
	int s;
	int level;
	int name;
	void *val;
	enum uio_seg valseg;
	socklen_t valsize;
{
	struct socket *so;
	struct file *fp;
	struct sockopt sopt;
	cap_rights_t rights;
	int error;

	if (val == NULL && valsize != 0)
		return (EFAULT);
	if ((int)valsize < 0)
		return (EINVAL);

	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = level;
	sopt.sopt_name = name;
	sopt.sopt_val = val;
	sopt.sopt_valsize = valsize;
	switch (valseg) {
	case UIO_USERSPACE:
		sopt.sopt_td = td;
		break;
	case UIO_SYSSPACE:
		sopt.sopt_td = NULL;
		break;
	default:
		panic("kern_setsockopt called with bad valseg");
	}

	AUDIT_ARG_FD(s);
	error = getsock_cap(td->td_proc->p_fd, s,
	    cap_rights_init(&rights, CAP_SETSOCKOPT), &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
		error = sosetopt(so, &sopt);
		fdrop(fp, td);
	}
	return (error);
}

/* ARGSUSED */
int
sys_getsockopt(td, uap)
	struct thread *td;
	struct getsockopt_args /* {
		int	s;
		int	level;
		int	name;
		void * __restrict	val;
		socklen_t * __restrict avalsize;
	} */ *uap;
{
	socklen_t valsize;
	int error;

	if (uap->val) {
		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
		if (error != 0)
			return (error);
	}

	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
	    uap->val, UIO_USERSPACE, &valsize);

	if (error == 0)
		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
	return (error);
}

/*
 * Kernel version of getsockopt.
 * optval can be a userland or kernel address; valseg says which.
 * optlen is always a kernel pointer.
 */
int
kern_getsockopt(td, s, level, name, val, valseg, valsize)
	struct thread *td;
	int s;
	int level;
	int name;
	void *val;
	enum uio_seg valseg;
	socklen_t *valsize;
{
	struct socket *so;
	struct file *fp;
	struct sockopt sopt;
	cap_rights_t rights;
	int error;

	if (val == NULL)
		*valsize = 0;
	if ((int)*valsize < 0)
		return (EINVAL);

	sopt.sopt_dir = SOPT_GET;
	sopt.sopt_level = level;
	sopt.sopt_name = name;
	sopt.sopt_val = val;
	sopt.sopt_valsize = (size_t)*valsize;	/* checked non-negative above */
	switch (valseg) {
	case UIO_USERSPACE:
		sopt.sopt_td = td;
		break;
	case UIO_SYSSPACE:
		sopt.sopt_td = NULL;
		break;
	default:
		panic("kern_getsockopt called with bad valseg");
	}

	AUDIT_ARG_FD(s);
	error = getsock_cap(td->td_proc->p_fd, s,
	    cap_rights_init(&rights, CAP_GETSOCKOPT), &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
		error = sogetopt(so, &sopt);
		*valsize = sopt.sopt_valsize;
		fdrop(fp, td);
	}
	return (error);
}
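
/*
 * Example (illustrative sketch, not part of the original file): in-kernel
 * consumers pass UIO_SYSSPACE so the option value is taken from a kernel
 * buffer rather than copied in from userland:
 *
 *	int one = 1;
 *	error = kern_setsockopt(td, s, SOL_SOCKET, SO_REUSEADDR,
 *	    &one, UIO_SYSSPACE, sizeof(one));
 */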

/*
 * getsockname1() - Get socket name.
 */
/* ARGSUSED */
static int
getsockname1(td, uap, compat)
	struct thread *td;
	struct getsockname_args /* {
		int	fdes;
		struct sockaddr * __restrict asa;
		socklen_t * __restrict	alen;
	} */ *uap;
	int compat;
{
	struct sockaddr *sa;
	socklen_t len;
	int error;

	error = copyin(uap->alen, &len, sizeof(len));
	if (error != 0)
		return (error);

	error = kern_getsockname(td, uap->fdes, &sa, &len);
	if (error != 0)
		return (error);

	if (len != 0) {
#ifdef COMPAT_OLDSOCK
		if (compat)
			((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
		error = copyout(sa, uap->asa, (u_int)len);
	}
	free(sa, M_SONAME);
	if (error == 0)
		error = copyout(&len, uap->alen, sizeof(len));
	return (error);
}

int
kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
    socklen_t *alen)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	socklen_t len;
	int error;

	AUDIT_ARG_FD(fd);
	error = getsock_cap(td->td_proc->p_fd, fd,
	    cap_rights_init(&rights, CAP_GETSOCKNAME), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
	*sa = NULL;
	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
	CURVNET_RESTORE();
	if (error != 0)
		goto bad;
	if (*sa == NULL)
		len = 0;
	else
		len = MIN(*alen, (*sa)->sa_len);
	*alen = len;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(*sa);
#endif
bad:
	fdrop(fp, td);
	if (error != 0 && *sa != NULL) {
		free(*sa, M_SONAME);
		*sa = NULL;
	}
	return (error);
}

int
sys_getsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{

	return (getsockname1(td, uap, 0));
}

#ifdef COMPAT_OLDSOCK
int
ogetsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{

	return (getsockname1(td, uap, 1));
}
#endif /* COMPAT_OLDSOCK */

/*
 * getpeername1() - Get name of peer for connected socket.
 */
/* ARGSUSED */
static int
getpeername1(td, uap, compat)
	struct thread *td;
	struct getpeername_args /* {
		int	fdes;
		struct sockaddr * __restrict	asa;
		socklen_t * __restrict	alen;
	} */ *uap;
	int compat;
{
	struct sockaddr *sa;
	socklen_t len;
	int error;

	error = copyin(uap->alen, &len, sizeof (len));
	if (error != 0)
		return (error);

	error = kern_getpeername(td, uap->fdes, &sa, &len);
	if (error != 0)
		return (error);

	if (len != 0) {
#ifdef COMPAT_OLDSOCK
		if (compat)
			((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
		error = copyout(sa, uap->asa, (u_int)len);
	}
	free(sa, M_SONAME);
	if (error == 0)
		error = copyout(&len, uap->alen, sizeof(len));
	return (error);
}

int
kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
    socklen_t *alen)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	socklen_t len;
	int error;

	AUDIT_ARG_FD(fd);
	error = getsock_cap(td->td_proc->p_fd, fd,
	    cap_rights_init(&rights, CAP_GETPEERNAME), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
		error = ENOTCONN;
		goto done;
	}
	*sa = NULL;
	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
	CURVNET_RESTORE();
	if (error != 0)
		goto bad;
	if (*sa == NULL)
		len = 0;
	else
		len = MIN(*alen, (*sa)->sa_len);
	*alen = len;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(*sa);
#endif
bad:
	if (error != 0 && *sa != NULL) {
		free(*sa, M_SONAME);
		*sa = NULL;
	}
done:
	fdrop(fp, td);
	return (error);
}

int
sys_getpeername(td, uap)
	struct thread *td;
	struct getpeername_args *uap;
{

	return (getpeername1(td, uap, 0));
}

#ifdef COMPAT_OLDSOCK
int
ogetpeername(td, uap)
	struct thread *td;
	struct ogetpeername_args *uap;
{

	/* XXX uap should have type `getpeername_args *' to begin with. */
	return (getpeername1(td, (struct getpeername_args *)uap, 1));
}
#endif /* COMPAT_OLDSOCK */

int
sockargs(mp, buf, buflen, type)
	struct mbuf **mp;
	caddr_t buf;
	int buflen, type;
{
	struct sockaddr *sa;
	struct mbuf *m;
	int error;

	if (buflen > MLEN) {
#ifdef COMPAT_OLDSOCK
		if (type == MT_SONAME && buflen <= 112)
			buflen = MLEN;		/* unix domain compat. hack */
		else
#endif
			if (buflen > MCLBYTES)
				return (EINVAL);
	}
	m = m_get2(buflen, M_WAITOK, type, 0);
	m->m_len = buflen;
	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
	if (error != 0)
		(void) m_free(m);
	else {
		*mp = m;
		if (type == MT_SONAME) {
			sa = mtod(m, struct sockaddr *);

#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
				sa->sa_family = sa->sa_len;
#endif
			sa->sa_len = buflen;
		}
	}
	return (error);
}

int
getsockaddr(namp, uaddr, len)
	struct sockaddr **namp;
	caddr_t uaddr;
	size_t len;
{
	struct sockaddr *sa;
	int error;

	if (len > SOCK_MAXADDRLEN)
		return (ENAMETOOLONG);
	if (len < offsetof(struct sockaddr, sa_data[0]))
		return (EINVAL);
	sa = malloc(len, M_SONAME, M_WAITOK);
	error = copyin(uaddr, sa, len);
	if (error != 0) {
		free(sa, M_SONAME);
	} else {
#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
			sa->sa_family = sa->sa_len;
#endif
		sa->sa_len = len;
		*namp = sa;
	}
	return (error);
}

static int
filt_sfsync_attach(struct knote *kn)
{
	struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_sdata;
	struct knlist *knl = &sfs->klist;

	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);

	/*
	 * Validate that we actually received this via the kernel API.
	 */
	if ((kn->kn_flags & EV_FLAG1) == 0)
		return (EPERM);

	kn->kn_ptr.p_v = sfs;
	kn->kn_flags &= ~EV_FLAG1;

	knl->kl_lock(knl->kl_lockarg);
	/*
	 * If we're in the "freeing" state,
	 * don't allow the add.  That way we don't
	 * end up racing with some other thread that
	 * is trying to finish some setup.
	 */
	if (sfs->state == SF_STATE_FREEING) {
		knl->kl_unlock(knl->kl_lockarg);
		return (EINVAL);
	}
	knlist_add(&sfs->klist, kn, 1);
	knl->kl_unlock(knl->kl_lockarg);

	return (0);
}

/*
 * Called when a knote is being detached.
 */
static void
filt_sfsync_detach(struct knote *kn)
{
	struct knlist *knl;
	struct sendfile_sync *sfs;
	int do_free = 0;

	sfs = kn->kn_ptr.p_v;
	knl = &sfs->klist;

	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);

	knl->kl_lock(knl->kl_lockarg);
	if (!knlist_empty(knl))
		knlist_remove(knl, kn, 1);

	/*
	 * If the list is empty _AND_ the refcount is 0
	 * _AND_ we've finished the setup phase and now
	 * we're in the running phase, we can free the
	 * underlying sendfile_sync.
	 *
	 * But we shouldn't do it before finishing the
	 * underlying divorce from the knote.
	 *
	 * So, we have the sfsync lock held; transition
	 * it to "freeing", then unlock, then free
	 * normally.
	 */
	if (knlist_empty(knl)) {
		if (sfs->state == SF_STATE_COMPLETED && sfs->count == 0) {
			SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, "
			    "count==0, empty list: time to free!\n",
			    __func__,
			    (unsigned long long) curthread->td_tid,
			    sfs);
			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
			do_free = 1;
		}
	}
	knl->kl_unlock(knl->kl_lockarg);

	/*
	 * Only call free if we're the one who has transitioned things
	 * to free.  Otherwise we could race with another thread that
	 * is currently tearing things down.
	 */
	if (do_free == 1) {
		SFSYNC_DPRINTF("%s: (%llu) sfs=%p, %s:%d\n",
		    __func__,
		    (unsigned long long) curthread->td_tid,
		    sfs,
		    __FILE__,
		    __LINE__);
		sf_sync_free(sfs);
	}
}

static int
filt_sfsync(struct knote *kn, long hint)
{
	struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_ptr.p_v;
	int ret;

	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);

	/*
	 * XXX add a lock assertion here!
	 */
	ret = (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED);

	return (ret);
}

/*
 * Add more references to a vm_page + sf_buf + sendfile_sync.
 */
void
sf_ext_ref(void *arg1, void *arg2)
{
	struct sf_buf *sf = arg1;
	struct sendfile_sync *sfs = arg2;
	vm_page_t pg = sf_buf_page(sf);

	sf_buf_ref(sf);

	vm_page_lock(pg);
	vm_page_wire(pg);
	vm_page_unlock(pg);

	if (sfs != NULL) {
		mtx_lock(&sfs->mtx);
		KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
		sfs->count++;
		mtx_unlock(&sfs->mtx);
	}
}

/*
 * Detach mapped page and release resources back to the system.
 */
void
sf_ext_free(void *arg1, void *arg2)
{
	struct sf_buf *sf = arg1;
	struct sendfile_sync *sfs = arg2;
	vm_page_t pg = sf_buf_page(sf);

	sf_buf_free(sf);

	vm_page_lock(pg);
	vm_page_unwire(pg, PQ_INACTIVE);
	/*
	 * Check for the object going away on us.  This can
	 * happen since we don't hold a reference to it.
	 * If so, we're responsible for freeing the page.
	 */
	if (pg->wire_count == 0 && pg->object == NULL)
		vm_page_free(pg);
	vm_page_unlock(pg);

	if (sfs != NULL)
		sf_sync_deref(sfs);
}

/*
 * Called to remove a reference to a sf_sync object.
 *
 * This is generally done during the mbuf free path to signify
 * that one of the mbufs in the transaction has been completed.
 *
 * If we're doing SF_SYNC and the refcount is zero then we'll wake
 * up any waiters.
 *
 * If we're doing SF_KQUEUE and the refcount is zero then we'll
 * fire off the knote.
 */
void
sf_sync_deref(struct sendfile_sync *sfs)
{
	int do_free = 0;

	if (sfs == NULL)
		return;

	mtx_lock(&sfs->mtx);
	KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
	sfs->count--;

	/*
	 * Only fire off the wakeup / kqueue notification if
	 * we are in the running state.
	 */
	if (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED) {
		if (sfs->flags & SF_SYNC)
			cv_signal(&sfs->cv);

		if (sfs->flags & SF_KQUEUE) {
			SFSYNC_DPRINTF("%s: (%llu) sfs=%p: knote!\n",
			    __func__,
			    (unsigned long long) curthread->td_tid,
			    sfs);
			KNOTE_LOCKED(&sfs->klist, 1);
		}

		/*
		 * If we're not waiting around for a sync,
		 * check if the knote list is empty.
		 * If it is, we transition to free.
		 *
		 * XXX I think it's about time I added some state
		 * or flag that says whether we're supposed to be
		 * waiting around until we've done a signal.
		 *
		 * XXX Ie, the reason that I don't free it here
		 * is because the caller will free the last reference,
		 * not us.  That should be codified in some flag
		 * that indicates "self-free" rather than checking
		 * for SF_SYNC all the time.
		 */
		if ((sfs->flags & SF_SYNC) == 0 && knlist_empty(&sfs->klist)) {
			SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, "
			    "count==0, empty list: time to free!\n",
			    __func__,
			    (unsigned long long) curthread->td_tid,
			    sfs);
			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
			do_free = 1;
		}

	}
	mtx_unlock(&sfs->mtx);

	/*
	 * Attempt to do a free here.
	 *
	 * We do this outside of the lock because it may destroy the
	 * lock in question as it frees things.  We can optimise this
	 * later.
	 *
	 * XXX yes, we should make it a requirement to hold the
	 * lock across sf_sync_free().
	 */
	if (do_free == 1) {
		SFSYNC_DPRINTF("%s: (%llu) sfs=%p\n",
		    __func__,
		    (unsigned long long) curthread->td_tid,
		    sfs);
		sf_sync_free(sfs);
	}
}

/*
 * Allocate a sendfile_sync state structure.
 *
 * For now this only knows about the "sleep" sync, but later it will
 * grow various other personalities.
 */
struct sendfile_sync *
sf_sync_alloc(uint32_t flags)
{
	struct sendfile_sync *sfs;

	sfs = uma_zalloc(zone_sfsync, M_WAITOK | M_ZERO);
	mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
	cv_init(&sfs->cv, "sendfile");
	sfs->flags = flags;
	sfs->state = SF_STATE_SETUP;
	knlist_init_mtx(&sfs->klist, &sfs->mtx);

	SFSYNC_DPRINTF("%s: sfs=%p, flags=0x%08x\n", __func__, sfs, sfs->flags);

	return (sfs);
}

/*
 * Take a reference to a sfsync instance.
 *
 * This has to map 1:1 to free calls coming in via sf_ext_free(),
 * so typically this will be referenced once for each mbuf allocated.
 */
void
sf_sync_ref(struct sendfile_sync *sfs)
{

	if (sfs == NULL)
		return;

	mtx_lock(&sfs->mtx);
	sfs->count++;
	mtx_unlock(&sfs->mtx);
}

void
sf_sync_syscall_wait(struct sendfile_sync *sfs)
{

	if (sfs == NULL)
		return;

	KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!",
	    __func__,
	    sfs));

	/*
	 * If we're not requested to wait during the syscall,
	 * don't bother waiting.
	 */
	if ((sfs->flags & SF_SYNC) == 0)
		goto out;

	/*
	 * This is a bit suboptimal and confusing, so bear with me.
	 *
	 * Ideally sf_sync_syscall_wait() will wait until
	 * all pending mbuf transmit operations are done.
	 * This means that when sendfile becomes async, it'll
	 * run in the background and will transition from
	 * RUNNING to COMPLETED when it's finished acquiring
	 * new things to send.  Then, when the mbufs finish
	 * sending, COMPLETED + sfs->count == 0 is enough to
	 * know that no further work is being done.
	 *
	 * So, we will sleep on both RUNNING and COMPLETED.
	 * It's up to the (in progress) async sendfile loop
	 * to transition the sf_sync from RUNNING to
	 * COMPLETED so the wakeup above will actually
	 * do the cv_signal() call.
	 */
	if (sfs->state != SF_STATE_COMPLETED && sfs->state != SF_STATE_RUNNING)
		goto out;

	if (sfs->count != 0)
		cv_wait(&sfs->cv, &sfs->mtx);
	KASSERT(sfs->count == 0, ("sendfile sync still busy"));

out:
	return;
}

/*
 * Free an sf_sync if it's appropriate to.
 */
void
sf_sync_free(struct sendfile_sync *sfs)
{

	if (sfs == NULL)
		return;

	SFSYNC_DPRINTF("%s: (%lld) sfs=%p; called; state=%d, flags=0x%08x "
	    "count=%d\n",
	    __func__,
	    (long long) curthread->td_tid,
	    sfs,
	    sfs->state,
	    sfs->flags,
	    sfs->count);

	mtx_lock(&sfs->mtx);

	/*
	 * We keep the sf_sync around if the state is active,
	 * we are doing kqueue notification and we have active
	 * knotes.
	 *
	 * If the caller wants to free us right this second it
	 * should transition this to the freeing state.
	 *
	 * So, complain loudly if they break this rule.
	 */
	if (sfs->state != SF_STATE_FREEING) {
		printf("%s: (%llu) sfs=%p; not freeing; let's wait!\n",
		    __func__,
		    (unsigned long long) curthread->td_tid,
		    sfs);
		mtx_unlock(&sfs->mtx);
		return;
	}

	KASSERT(sfs->count == 0, ("sendfile sync still busy"));
	cv_destroy(&sfs->cv);
	/*
	 * This doesn't call knlist_detach() on each knote; it just frees
	 * the entire list.
	 */
	knlist_delete(&sfs->klist, curthread, 1);
	mtx_destroy(&sfs->mtx);
	SFSYNC_DPRINTF("%s: (%llu) sfs=%p; freeing\n",
	    __func__,
	    (unsigned long long) curthread->td_tid,
	    sfs);
	uma_zfree(zone_sfsync, sfs);
}

/*
 * Setup a sf_sync to post a kqueue notification when things are complete.
 */
int
sf_sync_kqueue_setup(struct sendfile_sync *sfs, struct sf_hdtr_kq *sfkq)
{
	struct kevent kev;
	int error;

	sfs->flags |= SF_KQUEUE;

	/* Check the flags are valid */
	if ((sfkq->kq_flags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0)
		return (EINVAL);

	SFSYNC_DPRINTF("%s: sfs=%p: kqfd=%d, flags=0x%08x, ident=%p, udata=%p\n",
	    __func__,
	    sfs,
	    sfkq->kq_fd,
	    sfkq->kq_flags,
	    (void *) sfkq->kq_ident,
	    (void *) sfkq->kq_udata);

	/* Setup and register a knote on the given kqfd. */
	kev.ident = (uintptr_t) sfkq->kq_ident;
	kev.filter = EVFILT_SENDFILE;
	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | sfkq->kq_flags;
	kev.data = (intptr_t) sfs;
	kev.udata = sfkq->kq_udata;

	error = kqfd_register(sfkq->kq_fd, &kev, curthread, 1);
	if (error != 0) {
		SFSYNC_DPRINTF("%s: returned %d\n", __func__, error);
	}
	return (error);
}
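
/*
 * Example (illustrative sketch, not part of the original file): a
 * userland consumer of the EVFILT_SENDFILE filter registered above
 * waits for the completion event with kevent(2), matching on the ident
 * it supplied in the sf_hdtr_kq structure when issuing the sendfile:
 *
 *	#include <sys/event.h>
 *
 *	struct kevent ev;
 *	int kq = kqueue();
 *	(the kq descriptor and a chosen ident are handed to sendfile
 *	via the sf_hdtr_kq argument)
 *	if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1 &&
 *	    ev.filter == EVFILT_SENDFILE)
 *		handle_complete(ev.ident);	(hypothetical handler)
 */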
/*
 * Set the retval/errno for the given transaction.
 *
 * This will eventually/ideally be used when the KNOTE is fired off
 * to signify the completion of this transaction.
 *
 * The sfsync lock should be held before entering this function.
 */
void
sf_sync_set_retval(struct sendfile_sync *sfs, off_t retval, int xerrno)
{

	KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!",
	    __func__,
	    sfs));

	SFSYNC_DPRINTF("%s: (%llu) sfs=%p: errno=%d, retval=%jd\n",
	    __func__,
	    (unsigned long long) curthread->td_tid,
	    sfs,
	    xerrno,
	    (intmax_t) retval);

	sfs->retval = retval;
	sfs->xerrno = xerrno;
}

/*
 * sendfile(2)
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Send a file specified by 'fd' and starting at 'offset' to a socket
 * specified by 's'.  Send only 'nbytes' of the file or until EOF if
 * nbytes == 0.  Optionally add a header and/or trailer to the socket
 * output.  If specified, write the total number of bytes sent into
 * *sbytes.
 */
int
sys_sendfile(struct thread *td, struct sendfile_args *uap)
{

	return (do_sendfile(td, uap, 0));
}

int
_do_sendfile(struct thread *td, int src_fd, int sock_fd, int flags,
    int compat, off_t offset, size_t nbytes, off_t *sbytes,
    struct uio *hdr_uio,
    struct uio *trl_uio, struct sf_hdtr_kq *hdtr_kq)
{
	cap_rights_t rights;
	struct sendfile_sync *sfs = NULL;
	struct file *fp;
	int error;
	int do_kqueue = 0;
	int do_free = 0;

	AUDIT_ARG_FD(src_fd);

	if (hdtr_kq != NULL)
		do_kqueue = 1;

	/*
	 * sendfile(2) can start at any offset within a file so we require
	 * CAP_READ+CAP_SEEK = CAP_PREAD.
	 */
	if ((error = fget_read(td, src_fd,
	    cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) {
		goto out;
	}

	/*
	 * If SF_KQUEUE is set but we haven't copied in anything for
	 * kqueue data, error out.
	 */
	if ((flags & SF_KQUEUE) && do_kqueue == 0) {
		SFSYNC_DPRINTF("%s: SF_KQUEUE but no KQUEUE data!\n", __func__);
		error = EINVAL;
		fdrop(fp, td);
		goto out;
	}

	/*
	 * If we need to wait for completion, initialise the sfsync
	 * state here.
	 */
	if (flags & (SF_SYNC | SF_KQUEUE))
		sfs = sf_sync_alloc(flags & (SF_SYNC | SF_KQUEUE));

	if (flags & SF_KQUEUE) {
		error = sf_sync_kqueue_setup(sfs, hdtr_kq);
		if (error) {
			SFSYNC_DPRINTF("%s: (%llu) error; sfs=%p\n",
			    __func__,
			    (unsigned long long) curthread->td_tid,
			    sfs);
			sf_sync_set_state(sfs, SF_STATE_FREEING, 0);
			sf_sync_free(sfs);
			fdrop(fp, td);
			goto out;
		}
	}

	/*
	 * Do the sendfile call.
	 *
	 * If this fails, it'll free the mbuf chain which will free up the
	 * sendfile_sync references.
	 */
	error = fo_sendfile(fp, sock_fd, hdr_uio, trl_uio, offset,
	    nbytes, sbytes, flags, compat ? SFK_COMPAT : 0, sfs, td);
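
	/*
	 * A compact summary (illustrative, based on the transitions used
	 * in this file) of the sf_sync state machine driven below:
	 *
	 *	SETUP -> RUNNING -> COMPLETED -> FREEING
	 *	      \-> FREEING (kqueue setup failure, or nothing sent)
	 *
	 * The RUNNING -> COMPLETED transition posts the EVFILT_SENDFILE
	 * knote once the mbuf refcount (sfs->count) has drained to zero.
	 */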
	/*
	 * If the sendfile call succeeded, transition the sf_sync state
	 * to RUNNING, then COMPLETED.
	 *
	 * If the sendfile call failed, then the sendfile call may have
	 * actually sent some data first - so we check to see whether
	 * any data was sent.  If some data was queued (i.e., count > 0)
	 * then we can't call free; we have to wait until the partial
	 * transaction completes before we continue along.
	 *
	 * This has the side effect of firing off the knote
	 * if the refcount has hit zero by the time we get here.
	 */
	if (sfs != NULL) {
		mtx_lock(&sfs->mtx);
		if (error == 0 || sfs->count > 0) {
			/*
			 * When it's time to do async sendfile, the transition
			 * to RUNNING signifies that we're actually actively
			 * adding and completing mbufs.  When the last disk
			 * buffer is read (i.e., when we're not doing any
			 * further read IO and all subsequent stuff is mbuf
			 * transmissions) we'll transition to COMPLETED
			 * and when the final mbuf is freed, the completion
			 * will be signaled.
			 */
			sf_sync_set_state(sfs, SF_STATE_RUNNING, 1);

			/*
			 * Set the retval before we signal completed.
			 * If we do it the other way around then transitioning
			 * to COMPLETED may post the knote before you set the
			 * return status!
			 *
			 * XXX for now, errno is always 0, as we don't post
			 * knotes if sendfile failed.  Maybe that'll change
			 * later.
			 */
			sf_sync_set_retval(sfs, *sbytes, error);

			/*
			 * And now transition to completed, which will kick off
			 * the knote if required.
			 */
			sf_sync_set_state(sfs, SF_STATE_COMPLETED, 1);
		} else {
			/*
			 * The error isn't zero and sfs->count is zero, so
			 * nothing else will come along to wake things up.
			 * Thus, free.
			 */
			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
			do_free = 1;
		}

		/*
		 * Next - wait if appropriate.
		 */
		sf_sync_syscall_wait(sfs);

		/*
		 * If we're not doing kqueue notifications, we can
		 * transition this immediately to the freeing state.
		 */
		if ((sfs->flags & SF_KQUEUE) == 0) {
			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
			do_free = 1;
		}

		mtx_unlock(&sfs->mtx);
	}

	/*
	 * If do_free is set, free here.
	 *
	 * If we're not doing kqueue notification and it's just sleep
	 * notification, we also free here; it's the only chance we have.
	 */
	if (sfs != NULL && do_free == 1) {
		sf_sync_free(sfs);
	}

	/*
	 * XXX Should we wait until the send has completed before freeing
	 * the source file handle?  It's the previous behaviour, sure, but
	 * is it required?  We've wired down the page references after all.
	 */
	fdrop(fp, td);

out:
	/* Return error */
	return (error);
}


static int
do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
{
	struct sf_hdtr hdtr;
	struct sf_hdtr_kq hdtr_kq;
	struct uio *hdr_uio, *trl_uio;
	int error;
	off_t sbytes;
	int do_kqueue = 0;

	/*
	 * The file offset must be non-negative.  If it goes beyond EOF
	 * we send only the header/trailer and no payload data.
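	 * (For example, sendfile(fd, s, 0, 0, NULL, &sbytes, 0) transmits
	 * the entire file from offset 0 with no header or trailer.)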
	 */
	if (uap->offset < 0)
		return (EINVAL);

	hdr_uio = trl_uio = NULL;

	if (uap->hdtr != NULL) {
		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
		if (error != 0)
			goto out;
		if (hdtr.headers != NULL) {
			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
			if (error != 0)
				goto out;
		}
		if (hdtr.trailers != NULL) {
			error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
			if (error != 0)
				goto out;
		}

		/*
		 * If SF_KQUEUE is set, then we need to also copy in
		 * the kqueue data after the normal hdtr set and set
		 * do_kqueue=1.
		 */
		if (uap->flags & SF_KQUEUE) {
			error = copyin(((char *) uap->hdtr) + sizeof(hdtr),
			    &hdtr_kq,
			    sizeof(hdtr_kq));
			if (error != 0)
				goto out;
			do_kqueue = 1;
		}
	}

	/* Call sendfile */
	error = _do_sendfile(td, uap->fd, uap->s, uap->flags, compat,
	    uap->offset, uap->nbytes, &sbytes, hdr_uio, trl_uio,
	    do_kqueue ? &hdtr_kq : NULL);

	if (uap->sbytes != NULL) {
		copyout(&sbytes, uap->sbytes, sizeof(off_t));
	}
out:
	free(hdr_uio, M_IOV);
	free(trl_uio, M_IOV);
	return (error);
}

#ifdef COMPAT_FREEBSD4
int
freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
{
	struct sendfile_args args;

	args.fd = uap->fd;
	args.s = uap->s;
	args.offset = uap->offset;
	args.nbytes = uap->nbytes;
	args.hdtr = uap->hdtr;
	args.sbytes = uap->sbytes;
	args.flags = uap->flags;

	return (do_sendfile(td, &args, 1));
}
#endif /* COMPAT_FREEBSD4 */

static int
sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd,
    off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res)
{
	vm_page_t m;
	vm_pindex_t pindex;
	ssize_t resid;
	int error, readahead, rv;

	pindex = OFF_TO_IDX(off);
	VM_OBJECT_WLOCK(obj);
	m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY |
	    VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL);

	/*
	 * Check if the page is valid for what we need, otherwise initiate
	 * I/O.
	 *
	 * The non-zero nd argument prevents disk I/O; instead we return
	 * to the caller whatever was specified in nd.  In particular,
	 * if we already turned some pages into mbufs, nd == EAGAIN
	 * and the main function sends those pages before we come
	 * here again and block.
	 */
	if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) {
		if (vp == NULL)
			vm_page_xunbusy(m);
		VM_OBJECT_WUNLOCK(obj);
		*res = m;
		return (0);
	} else if (nd != 0) {
		if (vp == NULL)
			vm_page_xunbusy(m);
		error = nd;
		goto free_page;
	}

	/*
	 * Get the page from backing store.
	 */
	error = 0;
	if (vp != NULL) {
		VM_OBJECT_WUNLOCK(obj);
		readahead = sfreadahead * MAXBSIZE;

		/*
		 * Use vn_rdwr() instead of the pager interface for
		 * the vnode, to allow the read-ahead.
		 *
		 * XXXMAC: Because we don't have fp->f_cred here, we
		 * pass in NOCRED.  This is probably wrong, but is
		 * consistent with our original implementation.
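		 * (td->td_ucred below is the thread's active credential;
		 * fp->f_cred would be the credential the file was opened
		 * with, which is what a stricter implementation would use.)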
2681 */ 2682 error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off), 2683 UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead / 2684 bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td); 2685 SFSTAT_INC(sf_iocnt); 2686 VM_OBJECT_WLOCK(obj); 2687 } else { 2688 if (vm_pager_has_page(obj, pindex, NULL, NULL)) { 2689 rv = vm_pager_get_pages(obj, &m, 1, 0); 2690 SFSTAT_INC(sf_iocnt); 2691 m = vm_page_lookup(obj, pindex); 2692 if (m == NULL) 2693 error = EIO; 2694 else if (rv != VM_PAGER_OK) { 2695 vm_page_lock(m); 2696 vm_page_free(m); 2697 vm_page_unlock(m); 2698 m = NULL; 2699 error = EIO; 2700 } 2701 } else { 2702 pmap_zero_page(m); 2703 m->valid = VM_PAGE_BITS_ALL; 2704 m->dirty = 0; 2705 } 2706 if (m != NULL) 2707 vm_page_xunbusy(m); 2708 } 2709 if (error == 0) { 2710 *res = m; 2711 } else if (m != NULL) { 2712 free_page: 2713 vm_page_lock(m); 2714 vm_page_unwire(m, PQ_INACTIVE); 2715 2716 /* 2717 * See if anyone else might know about this page. If 2718 * not and it is not valid, then free it. 2719 */ 2720 if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m)) 2721 vm_page_free(m); 2722 vm_page_unlock(m); 2723 } 2724 KASSERT(error != 0 || (m->wire_count > 0 && 2725 vm_page_is_valid(m, off & PAGE_MASK, xfsize)), 2726 ("wrong page state m %p off %#jx xfsize %d", m, (uintmax_t)off, 2727 xfsize)); 2728 VM_OBJECT_WUNLOCK(obj); 2729 return (error); 2730 } 2731 2732 static int 2733 sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res, 2734 struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size, 2735 int *bsize) 2736 { 2737 struct vattr va; 2738 vm_object_t obj; 2739 struct vnode *vp; 2740 struct shmfd *shmfd; 2741 int error; 2742 2743 vp = *vp_res = NULL; 2744 obj = NULL; 2745 shmfd = *shmfd_res = NULL; 2746 *bsize = 0; 2747 2748 /* 2749 * The file descriptor must be a regular file and have a 2750 * backing VM object. 2751 */ 2752 if (fp->f_type == DTYPE_VNODE) { 2753 vp = fp->f_vnode; 2754 vn_lock(vp, LK_SHARED | LK_RETRY); 2755 if (vp->v_type != VREG) { 2756 error = EINVAL; 2757 goto out; 2758 } 2759 *bsize = vp->v_mount->mnt_stat.f_iosize; 2760 error = VOP_GETATTR(vp, &va, td->td_ucred); 2761 if (error != 0) 2762 goto out; 2763 *obj_size = va.va_size; 2764 obj = vp->v_object; 2765 if (obj == NULL) { 2766 error = EINVAL; 2767 goto out; 2768 } 2769 } else if (fp->f_type == DTYPE_SHM) { 2770 shmfd = fp->f_data; 2771 obj = shmfd->shm_object; 2772 *obj_size = shmfd->shm_size; 2773 } else { 2774 error = EINVAL; 2775 goto out; 2776 } 2777 2778 VM_OBJECT_WLOCK(obj); 2779 if ((obj->flags & OBJ_DEAD) != 0) { 2780 VM_OBJECT_WUNLOCK(obj); 2781 error = EBADF; 2782 goto out; 2783 } 2784 2785 /* 2786 * Temporarily increase the backing VM object's reference 2787 * count so that a forced reclamation of its vnode does not 2788 * immediately destroy it. 2789 */ 2790 vm_object_reference_locked(obj); 2791 VM_OBJECT_WUNLOCK(obj); 2792 *obj_res = obj; 2793 *vp_res = vp; 2794 *shmfd_res = shmfd; 2795 2796 out: 2797 if (vp != NULL) 2798 VOP_UNLOCK(vp, 0); 2799 return (error); 2800 } 2801 2802 static int 2803 kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp, 2804 struct socket **so) 2805 { 2806 cap_rights_t rights; 2807 int error; 2808 2809 *sock_fp = NULL; 2810 *so = NULL; 2811 2812 /* 2813 * The socket must be a stream socket and connected. 
2814 */ 2815 error = getsock_cap(td->td_proc->p_fd, s, cap_rights_init(&rights, 2816 CAP_SEND), sock_fp, NULL); 2817 if (error != 0) 2818 return (error); 2819 *so = (*sock_fp)->f_data; 2820 if ((*so)->so_type != SOCK_STREAM) 2821 return (EINVAL); 2822 if (((*so)->so_state & SS_ISCONNECTED) == 0) 2823 return (ENOTCONN); 2824 return (0); 2825 } 2826 2827 int 2828 vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, 2829 struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, 2830 int kflags, struct sendfile_sync *sfs, struct thread *td) 2831 { 2832 struct file *sock_fp; 2833 struct vnode *vp; 2834 struct vm_object *obj; 2835 struct socket *so; 2836 struct mbuf *m; 2837 struct sf_buf *sf; 2838 struct vm_page *pg; 2839 struct shmfd *shmfd; 2840 struct vattr va; 2841 off_t off, xfsize, fsbytes, sbytes, rem, obj_size; 2842 int error, bsize, nd, hdrlen, mnw; 2843 2844 pg = NULL; 2845 obj = NULL; 2846 so = NULL; 2847 m = NULL; 2848 fsbytes = sbytes = 0; 2849 hdrlen = mnw = 0; 2850 rem = nbytes; 2851 obj_size = 0; 2852 2853 error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize); 2854 if (error != 0) 2855 return (error); 2856 if (rem == 0) 2857 rem = obj_size; 2858 2859 error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so); 2860 if (error != 0) 2861 goto out; 2862 2863 /* 2864 * Do not wait on memory allocations but return ENOMEM for 2865 * caller to retry later. 2866 * XXX: Experimental. 2867 */ 2868 if (flags & SF_MNOWAIT) 2869 mnw = 1; 2870 2871 #ifdef MAC 2872 error = mac_socket_check_send(td->td_ucred, so); 2873 if (error != 0) 2874 goto out; 2875 #endif 2876 2877 /* If headers are specified copy them into mbufs. */ 2878 if (hdr_uio != NULL) { 2879 hdr_uio->uio_td = td; 2880 hdr_uio->uio_rw = UIO_WRITE; 2881 if (hdr_uio->uio_resid > 0) { 2882 /* 2883 * In FBSD < 5.0 the nbytes to send also included 2884 * the header. If compat is specified subtract the 2885 * header size from nbytes. 2886 */ 2887 if (kflags & SFK_COMPAT) { 2888 if (nbytes > hdr_uio->uio_resid) 2889 nbytes -= hdr_uio->uio_resid; 2890 else 2891 nbytes = 0; 2892 } 2893 m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK), 2894 0, 0, 0); 2895 if (m == NULL) { 2896 error = mnw ? EAGAIN : ENOBUFS; 2897 goto out; 2898 } 2899 hdrlen = m_length(m, NULL); 2900 } 2901 } 2902 2903 /* 2904 * Protect against multiple writers to the socket. 2905 * 2906 * XXXRW: Historically this has assumed non-interruptibility, so now 2907 * we implement that, but possibly shouldn't. 2908 */ 2909 (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR); 2910 2911 /* 2912 * Loop through the pages of the file, starting with the requested 2913 * offset. Get a file page (do I/O if necessary), map the file page 2914 * into an sf_buf, attach an mbuf header to the sf_buf, and queue 2915 * it on the socket. 2916 * This is done in two loops. The inner loop turns as many pages 2917 * as it can, up to available socket buffer space, without blocking 2918 * into mbufs to have it bulk delivered into the socket send buffer. 2919 * The outer loop checks the state and available space of the socket 2920 * and takes care of the overall progress. 
2921 */ 2922 for (off = offset; ; ) { 2923 struct mbuf *mtail; 2924 int loopbytes; 2925 int space; 2926 int done; 2927 2928 if ((nbytes != 0 && nbytes == fsbytes) || 2929 (nbytes == 0 && obj_size == fsbytes)) 2930 break; 2931 2932 mtail = NULL; 2933 loopbytes = 0; 2934 space = 0; 2935 done = 0; 2936 2937 /* 2938 * Check the socket state for ongoing connection, 2939 * no errors and space in socket buffer. 2940 * If space is low allow for the remainder of the 2941 * file to be processed if it fits the socket buffer. 2942 * Otherwise block in waiting for sufficient space 2943 * to proceed, or if the socket is nonblocking, return 2944 * to userland with EAGAIN while reporting how far 2945 * we've come. 2946 * We wait until the socket buffer has significant free 2947 * space to do bulk sends. This makes good use of file 2948 * system read ahead and allows packet segmentation 2949 * offloading hardware to take over lots of work. If 2950 * we were not careful here we would send off only one 2951 * sfbuf at a time. 2952 */ 2953 SOCKBUF_LOCK(&so->so_snd); 2954 if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2) 2955 so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; 2956 retry_space: 2957 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 2958 error = EPIPE; 2959 SOCKBUF_UNLOCK(&so->so_snd); 2960 goto done; 2961 } else if (so->so_error) { 2962 error = so->so_error; 2963 so->so_error = 0; 2964 SOCKBUF_UNLOCK(&so->so_snd); 2965 goto done; 2966 } 2967 space = sbspace(&so->so_snd); 2968 if (space < rem && 2969 (space <= 0 || 2970 space < so->so_snd.sb_lowat)) { 2971 if (so->so_state & SS_NBIO) { 2972 SOCKBUF_UNLOCK(&so->so_snd); 2973 error = EAGAIN; 2974 goto done; 2975 } 2976 /* 2977 * sbwait drops the lock while sleeping. 2978 * When we loop back to retry_space the 2979 * state may have changed and we retest 2980 * for it. 2981 */ 2982 error = sbwait(&so->so_snd); 2983 /* 2984 * An error from sbwait usually indicates that we've 2985 * been interrupted by a signal. If we've sent anything 2986 * then return bytes sent, otherwise return the error. 2987 */ 2988 if (error != 0) { 2989 SOCKBUF_UNLOCK(&so->so_snd); 2990 goto done; 2991 } 2992 goto retry_space; 2993 } 2994 SOCKBUF_UNLOCK(&so->so_snd); 2995 2996 /* 2997 * Reduce space in the socket buffer by the size of 2998 * the header mbuf chain. 2999 * hdrlen is set to 0 after the first loop. 3000 */ 3001 space -= hdrlen; 3002 3003 if (vp != NULL) { 3004 error = vn_lock(vp, LK_SHARED); 3005 if (error != 0) 3006 goto done; 3007 error = VOP_GETATTR(vp, &va, td->td_ucred); 3008 if (error != 0 || off >= va.va_size) { 3009 VOP_UNLOCK(vp, 0); 3010 goto done; 3011 } 3012 obj_size = va.va_size; 3013 } 3014 3015 /* 3016 * Loop and construct maximum sized mbuf chain to be bulk 3017 * dumped into socket buffer. 3018 */ 3019 while (space > loopbytes) { 3020 vm_offset_t pgoff; 3021 struct mbuf *m0; 3022 3023 /* 3024 * Calculate the amount to transfer. 3025 * Not to exceed a page, the EOF, 3026 * or the passed in nbytes. 3027 */ 3028 pgoff = (vm_offset_t)(off & PAGE_MASK); 3029 rem = obj_size - offset; 3030 if (nbytes != 0) 3031 rem = omin(rem, nbytes); 3032 rem -= fsbytes + loopbytes; 3033 xfsize = omin(PAGE_SIZE - pgoff, rem); 3034 xfsize = omin(space - loopbytes, xfsize); 3035 if (xfsize <= 0) { 3036 done = 1; /* all data sent */ 3037 break; 3038 } 3039 3040 /* 3041 * Attempt to look up the page. Allocate 3042 * if not found or wait and loop if busy. 
3043 */ 3044 if (m != NULL) 3045 nd = EAGAIN; /* send what we already got */ 3046 else if ((flags & SF_NODISKIO) != 0) 3047 nd = EBUSY; 3048 else 3049 nd = 0; 3050 error = sendfile_readpage(obj, vp, nd, off, 3051 xfsize, bsize, td, &pg); 3052 if (error != 0) { 3053 if (error == EAGAIN) 3054 error = 0; /* not a real error */ 3055 break; 3056 } 3057 3058 /* 3059 * Get a sendfile buf. When allocating the 3060 * first buffer for mbuf chain, we usually 3061 * wait as long as necessary, but this wait 3062 * can be interrupted. For consequent 3063 * buffers, do not sleep, since several 3064 * threads might exhaust the buffers and then 3065 * deadlock. 3066 */ 3067 sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT : 3068 SFB_CATCH); 3069 if (sf == NULL) { 3070 SFSTAT_INC(sf_allocfail); 3071 vm_page_lock(pg); 3072 vm_page_unwire(pg, PQ_INACTIVE); 3073 KASSERT(pg->object != NULL, 3074 ("%s: object disappeared", __func__)); 3075 vm_page_unlock(pg); 3076 if (m == NULL) 3077 error = (mnw ? EAGAIN : EINTR); 3078 break; 3079 } 3080 3081 /* 3082 * Get an mbuf and set it up as having 3083 * external storage. 3084 */ 3085 m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA); 3086 if (m0 == NULL) { 3087 error = (mnw ? EAGAIN : ENOBUFS); 3088 sf_ext_free(sf, NULL); 3089 break; 3090 } 3091 /* 3092 * Attach EXT_SFBUF external storage. 3093 */ 3094 m0->m_ext.ext_buf = (caddr_t )sf_buf_kva(sf); 3095 m0->m_ext.ext_size = PAGE_SIZE; 3096 m0->m_ext.ext_arg1 = sf; 3097 m0->m_ext.ext_arg2 = sfs; 3098 m0->m_ext.ext_type = EXT_SFBUF; 3099 m0->m_ext.ext_flags = 0; 3100 m0->m_flags |= (M_EXT|M_RDONLY); 3101 m0->m_data = (char *)sf_buf_kva(sf) + pgoff; 3102 m0->m_len = xfsize; 3103 3104 /* Append to mbuf chain. */ 3105 if (mtail != NULL) 3106 mtail->m_next = m0; 3107 else if (m != NULL) 3108 m_last(m)->m_next = m0; 3109 else 3110 m = m0; 3111 mtail = m0; 3112 3113 /* Keep track of bits processed. */ 3114 loopbytes += xfsize; 3115 off += xfsize; 3116 3117 /* 3118 * XXX eventually this should be a sfsync 3119 * method call! 3120 */ 3121 if (sfs != NULL) 3122 sf_sync_ref(sfs); 3123 } 3124 3125 if (vp != NULL) 3126 VOP_UNLOCK(vp, 0); 3127 3128 /* Add the buffer chain to the socket buffer. */ 3129 if (m != NULL) { 3130 int mlen, err; 3131 3132 mlen = m_length(m, NULL); 3133 SOCKBUF_LOCK(&so->so_snd); 3134 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 3135 error = EPIPE; 3136 SOCKBUF_UNLOCK(&so->so_snd); 3137 goto done; 3138 } 3139 SOCKBUF_UNLOCK(&so->so_snd); 3140 CURVNET_SET(so->so_vnet); 3141 /* Avoid error aliasing. */ 3142 err = (*so->so_proto->pr_usrreqs->pru_send) 3143 (so, 0, m, NULL, NULL, td); 3144 CURVNET_RESTORE(); 3145 if (err == 0) { 3146 /* 3147 * We need two counters to get the 3148 * file offset and nbytes to send 3149 * right: 3150 * - sbytes contains the total amount 3151 * of bytes sent, including headers. 3152 * - fsbytes contains the total amount 3153 * of bytes sent from the file. 3154 */ 3155 sbytes += mlen; 3156 fsbytes += mlen; 3157 if (hdrlen) { 3158 fsbytes -= hdrlen; 3159 hdrlen = 0; 3160 } 3161 } else if (error == 0) 3162 error = err; 3163 m = NULL; /* pru_send always consumes */ 3164 } 3165 3166 /* Quit outer loop on error or when we're done. */ 3167 if (done) 3168 break; 3169 if (error != 0) 3170 goto done; 3171 } 3172 3173 /* 3174 * Send trailers. Wimp out and use writev(2). 
3175 */ 3176 if (trl_uio != NULL) { 3177 sbunlock(&so->so_snd); 3178 error = kern_writev(td, sockfd, trl_uio); 3179 if (error == 0) 3180 sbytes += td->td_retval[0]; 3181 goto out; 3182 } 3183 3184 done: 3185 sbunlock(&so->so_snd); 3186 out: 3187 /* 3188 * If there was no error we have to clear td->td_retval[0] 3189 * because it may have been set by writev. 3190 */ 3191 if (error == 0) { 3192 td->td_retval[0] = 0; 3193 } 3194 if (sent != NULL) { 3195 (*sent) = sbytes; 3196 } 3197 if (obj != NULL) 3198 vm_object_deallocate(obj); 3199 if (so) 3200 fdrop(sock_fp, td); 3201 if (m) 3202 m_freem(m); 3203 3204 if (error == ERESTART) 3205 error = EINTR; 3206 3207 return (error); 3208 } 3209 3210 /* 3211 * SCTP syscalls. 3212 * Functionality only compiled in if SCTP is defined in the kernel Makefile, 3213 * otherwise all return EOPNOTSUPP. 3214 * XXX: We should make this loadable one day. 3215 */ 3216 int 3217 sys_sctp_peeloff(td, uap) 3218 struct thread *td; 3219 struct sctp_peeloff_args /* { 3220 int sd; 3221 caddr_t name; 3222 } */ *uap; 3223 { 3224 #if (defined(INET) || defined(INET6)) && defined(SCTP) 3225 struct file *nfp = NULL; 3226 struct socket *head, *so; 3227 cap_rights_t rights; 3228 u_int fflag; 3229 int error, fd; 3230 3231 AUDIT_ARG_FD(uap->sd); 3232 error = fgetsock(td, uap->sd, cap_rights_init(&rights, CAP_PEELOFF), 3233 &head, &fflag); 3234 if (error != 0) 3235 goto done2; 3236 if (head->so_proto->pr_protocol != IPPROTO_SCTP) { 3237 error = EOPNOTSUPP; 3238 goto done; 3239 } 3240 error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name); 3241 if (error != 0) 3242 goto done; 3243 /* 3244 * At this point we know we do have a assoc to pull 3245 * we proceed to get the fd setup. This may block 3246 * but that is ok. 3247 */ 3248 3249 error = falloc(td, &nfp, &fd, 0); 3250 if (error != 0) 3251 goto done; 3252 td->td_retval[0] = fd; 3253 3254 CURVNET_SET(head->so_vnet); 3255 so = sonewconn(head, SS_ISCONNECTED); 3256 if (so == NULL) { 3257 error = ENOMEM; 3258 goto noconnection; 3259 } 3260 /* 3261 * Before changing the flags on the socket, we have to bump the 3262 * reference count. Otherwise, if the protocol calls sofree(), 3263 * the socket will be released due to a zero refcount. 3264 */ 3265 SOCK_LOCK(so); 3266 soref(so); /* file descriptor reference */ 3267 SOCK_UNLOCK(so); 3268 3269 ACCEPT_LOCK(); 3270 3271 TAILQ_REMOVE(&head->so_comp, so, so_list); 3272 head->so_qlen--; 3273 so->so_state |= (head->so_state & SS_NBIO); 3274 so->so_state &= ~SS_NOFDREF; 3275 so->so_qstate &= ~SQ_COMP; 3276 so->so_head = NULL; 3277 ACCEPT_UNLOCK(); 3278 finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); 3279 error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name); 3280 if (error != 0) 3281 goto noconnection; 3282 if (head->so_sigio != NULL) 3283 fsetown(fgetown(&head->so_sigio), &so->so_sigio); 3284 3285 noconnection: 3286 /* 3287 * close the new descriptor, assuming someone hasn't ripped it 3288 * out from under us. 3289 */ 3290 if (error != 0) 3291 fdclose(td->td_proc->p_fd, nfp, fd, td); 3292 3293 /* 3294 * Release explicitly held references before returning. 
3295 */ 3296 CURVNET_RESTORE(); 3297 done: 3298 if (nfp != NULL) 3299 fdrop(nfp, td); 3300 fputsock(head); 3301 done2: 3302 return (error); 3303 #else /* SCTP */ 3304 return (EOPNOTSUPP); 3305 #endif /* SCTP */ 3306 } 3307 3308 int 3309 sys_sctp_generic_sendmsg (td, uap) 3310 struct thread *td; 3311 struct sctp_generic_sendmsg_args /* { 3312 int sd, 3313 caddr_t msg, 3314 int mlen, 3315 caddr_t to, 3316 __socklen_t tolen, 3317 struct sctp_sndrcvinfo *sinfo, 3318 int flags 3319 } */ *uap; 3320 { 3321 #if (defined(INET) || defined(INET6)) && defined(SCTP) 3322 struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL; 3323 struct socket *so; 3324 struct file *fp = NULL; 3325 struct sockaddr *to = NULL; 3326 #ifdef KTRACE 3327 struct uio *ktruio = NULL; 3328 #endif 3329 struct uio auio; 3330 struct iovec iov[1]; 3331 cap_rights_t rights; 3332 int error = 0, len; 3333 3334 if (uap->sinfo != NULL) { 3335 error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); 3336 if (error != 0) 3337 return (error); 3338 u_sinfo = &sinfo; 3339 } 3340 3341 cap_rights_init(&rights, CAP_SEND); 3342 if (uap->tolen != 0) { 3343 error = getsockaddr(&to, uap->to, uap->tolen); 3344 if (error != 0) { 3345 to = NULL; 3346 goto sctp_bad2; 3347 } 3348 cap_rights_set(&rights, CAP_CONNECT); 3349 } 3350 3351 AUDIT_ARG_FD(uap->sd); 3352 error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL); 3353 if (error != 0) 3354 goto sctp_bad; 3355 #ifdef KTRACE 3356 if (to && (KTRPOINT(td, KTR_STRUCT))) 3357 ktrsockaddr(to); 3358 #endif 3359 3360 iov[0].iov_base = uap->msg; 3361 iov[0].iov_len = uap->mlen; 3362 3363 so = (struct socket *)fp->f_data; 3364 if (so->so_proto->pr_protocol != IPPROTO_SCTP) { 3365 error = EOPNOTSUPP; 3366 goto sctp_bad; 3367 } 3368 #ifdef MAC 3369 error = mac_socket_check_send(td->td_ucred, so); 3370 if (error != 0) 3371 goto sctp_bad; 3372 #endif /* MAC */ 3373 3374 auio.uio_iov = iov; 3375 auio.uio_iovcnt = 1; 3376 auio.uio_segflg = UIO_USERSPACE; 3377 auio.uio_rw = UIO_WRITE; 3378 auio.uio_td = td; 3379 auio.uio_offset = 0; /* XXX */ 3380 auio.uio_resid = 0; 3381 len = auio.uio_resid = uap->mlen; 3382 CURVNET_SET(so->so_vnet); 3383 error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL, 3384 (struct mbuf *)NULL, uap->flags, u_sinfo, td); 3385 CURVNET_RESTORE(); 3386 if (error != 0) { 3387 if (auio.uio_resid != len && (error == ERESTART || 3388 error == EINTR || error == EWOULDBLOCK)) 3389 error = 0; 3390 /* Generation of SIGPIPE can be controlled per socket. 
		 */
		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
		    !(uap->flags & MSG_NOSIGNAL)) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	if (error == 0)
		td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = td->td_retval[0];
		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
	}
#endif /* KTRACE */
sctp_bad:
	if (fp != NULL)
		fdrop(fp, td);
sctp_bad2:
	free(to, M_SONAME);
	return (error);
#else  /* SCTP */
	return (EOPNOTSUPP);
#endif /* SCTP */
}

int
sys_sctp_generic_sendmsg_iov(td, uap)
	struct thread *td;
	struct sctp_generic_sendmsg_iov_args /* {
		int sd,
		struct iovec *iov,
		int iovlen,
		caddr_t to,
		__socklen_t tolen,
		struct sctp_sndrcvinfo *sinfo,
		int flags
	} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
	struct socket *so;
	struct file *fp = NULL;
	struct sockaddr *to = NULL;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	struct uio auio;
	struct iovec *iov, *tiov;
	cap_rights_t rights;
	ssize_t len;
	int error, i;

	if (uap->sinfo != NULL) {
		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
		if (error != 0)
			return (error);
		u_sinfo = &sinfo;
	}
	cap_rights_init(&rights, CAP_SEND);
	if (uap->tolen != 0) {
		error = getsockaddr(&to, uap->to, uap->tolen);
		if (error != 0) {
			to = NULL;
			goto sctp_bad2;
		}
		cap_rights_set(&rights, CAP_CONNECT);
	}

	AUDIT_ARG_FD(uap->sd);
	error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL);
	if (error != 0)
		goto sctp_bad1;

#ifdef COMPAT_FREEBSD32
	if (SV_CURPROC_FLAG(SV_ILP32))
		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
		    uap->iovlen, &iov, EMSGSIZE);
	else
#endif
		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
	if (error != 0)
		goto sctp_bad1;
#ifdef KTRACE
	if (to && (KTRPOINT(td, KTR_STRUCT)))
		ktrsockaddr(to);
#endif

	so = (struct socket *)fp->f_data;
	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
		error = EOPNOTSUPP;
		goto sctp_bad;
	}
#ifdef MAC
	error = mac_socket_check_send(td->td_ucred, so);
	if (error != 0)
		goto sctp_bad;
#endif /* MAC */

	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	tiov = iov;
	for (i = 0; i < uap->iovlen; i++, tiov++) {
		if ((auio.uio_resid += tiov->iov_len) < 0) {
			error = EINVAL;
			goto sctp_bad;
		}
	}
	len = auio.uio_resid;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif /* KTRACE */
	CURVNET_SET(so->so_vnet);
	error = sctp_lower_sosend(so, to, &auio,
	    (struct mbuf *)NULL, (struct mbuf *)NULL,
	    uap->flags, u_sinfo, td);
	CURVNET_RESTORE();
	if (error != 0) {
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Generation of SIGPIPE can be controlled per socket. */
		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
		    !(uap->flags & MSG_NOSIGNAL)) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	if (error == 0)
		td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = td->td_retval[0];
		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
	}
#endif /* KTRACE */
sctp_bad:
	free(iov, M_IOV);
sctp_bad1:
	if (fp != NULL)
		fdrop(fp, td);
sctp_bad2:
	free(to, M_SONAME);
	return (error);
#else  /* SCTP */
	return (EOPNOTSUPP);
#endif /* SCTP */
}

int
sys_sctp_generic_recvmsg(td, uap)
	struct thread *td;
	struct sctp_generic_recvmsg_args /* {
		int sd,
		struct iovec *iov,
		int iovlen,
		struct sockaddr *from,
		__socklen_t *fromlenaddr,
		struct sctp_sndrcvinfo *sinfo,
		int *msg_flags
	} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
	uint8_t sockbufstore[256];
	struct uio auio;
	struct iovec *iov, *tiov;
	struct sctp_sndrcvinfo sinfo;
	struct socket *so;
	struct file *fp = NULL;
	struct sockaddr *fromsa;
	cap_rights_t rights;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	ssize_t len;
	int error, fromlen, i, msg_flags;

	AUDIT_ARG_FD(uap->sd);
	error = getsock_cap(td->td_proc->p_fd, uap->sd,
	    cap_rights_init(&rights, CAP_RECV), &fp, NULL);
	if (error != 0)
		return (error);
#ifdef COMPAT_FREEBSD32
	if (SV_CURPROC_FLAG(SV_ILP32))
		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
		    uap->iovlen, &iov, EMSGSIZE);
	else
#endif
		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
	if (error != 0)
		goto out1;

	so = fp->f_data;
	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
		error = EOPNOTSUPP;
		goto out;
	}
#ifdef MAC
	error = mac_socket_check_receive(td->td_ucred, so);
	if (error != 0)
		goto out;
#endif /* MAC */

	if (uap->fromlenaddr != NULL) {
		error = copyin(uap->fromlenaddr, &fromlen, sizeof (fromlen));
		if (error != 0)
			goto out;
	} else {
		fromlen = 0;
	}
	if (uap->msg_flags) {
		error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
		if (error != 0)
			goto out;
	} else {
		msg_flags = 0;
	}
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	tiov = iov;
	for (i = 0; i < uap->iovlen; i++, tiov++) {
		if ((auio.uio_resid += tiov->iov_len) < 0) {
			error = EINVAL;
			goto out;
		}
	}
	len = auio.uio_resid;
	fromsa = (struct sockaddr *)sockbufstore;

#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif /* KTRACE */
	memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo));
	CURVNET_SET(so->so_vnet);
	error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
	    fromsa, fromlen, &msg_flags,
	    (struct sctp_sndrcvinfo *)&sinfo, 1);
	CURVNET_RESTORE();
	if (error != 0) {
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	} else {
		if (uap->sinfo)
			error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
	}
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = len - auio.uio_resid;
		ktrgenio(uap->sd, UIO_READ, ktruio, error);
	}
#endif /* KTRACE */
	if (error != 0)
		goto out;
	td->td_retval[0] = len - auio.uio_resid;
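
	/*
	 * Copy the peer address and its length back to userland if the
	 * caller asked for them.
	 */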
	if (fromlen && uap->from) {
		len = fromlen;
		if (len <= 0 || fromsa == NULL)
			len = 0;
		else {
			len = MIN(len, fromsa->sa_len);
			error = copyout(fromsa, uap->from, (size_t)len);
			if (error != 0)
				goto out;
		}
		error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
		if (error != 0)
			goto out;
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(fromsa);
#endif
	if (uap->msg_flags) {
		error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
		if (error != 0)
			goto out;
	}
out:
	free(iov, M_IOV);
out1:
	if (fp != NULL)
		fdrop(fp, td);

	return (error);
#else  /* SCTP */
	return (EOPNOTSUPP);
#endif /* SCTP */
}
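
/*
 * Illustrative userland sketch (not part of the kernel build): libc's
 * sctp_recvmsg(3) wrapper ends up in sys_sctp_generic_recvmsg() above.
 * The wrapper name and signature come from the userland library, not
 * this file, so treat them as an assumption of this sketch.
 */
#if 0
	char buf[2048];
	struct sockaddr_in from;
	socklen_t fromlen = sizeof(from);
	struct sctp_sndrcvinfo sinfo;
	int msg_flags = 0;
	ssize_t n;

	n = sctp_recvmsg(sd, buf, sizeof(buf), (struct sockaddr *)&from,
	    &fromlen, &sinfo, &msg_flags);
#endif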