1 /*- 2 * Copyright (c) 1982, 1986, 1989, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * sendfile(2) and related extensions: 6 * Copyright (c) 1998, David Greenman. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 4. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 33 */ 34 35 #include <sys/cdefs.h> 36 __FBSDID("$FreeBSD$"); 37 38 #include "opt_capsicum.h" 39 #include "opt_inet.h" 40 #include "opt_inet6.h" 41 #include "opt_sctp.h" 42 #include "opt_compat.h" 43 #include "opt_ktrace.h" 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/capsicum.h> 48 #include <sys/condvar.h> 49 #include <sys/kernel.h> 50 #include <sys/lock.h> 51 #include <sys/mutex.h> 52 #include <sys/sysproto.h> 53 #include <sys/malloc.h> 54 #include <sys/filedesc.h> 55 #include <sys/event.h> 56 #include <sys/proc.h> 57 #include <sys/fcntl.h> 58 #include <sys/file.h> 59 #include <sys/filio.h> 60 #include <sys/jail.h> 61 #include <sys/mman.h> 62 #include <sys/mount.h> 63 #include <sys/mbuf.h> 64 #include <sys/protosw.h> 65 #include <sys/rwlock.h> 66 #include <sys/sf_buf.h> 67 #include <sys/sf_sync.h> 68 #include <sys/sf_base.h> 69 #include <sys/sysent.h> 70 #include <sys/socket.h> 71 #include <sys/socketvar.h> 72 #include <sys/signalvar.h> 73 #include <sys/syscallsubr.h> 74 #include <sys/sysctl.h> 75 #include <sys/uio.h> 76 #include <sys/vnode.h> 77 #ifdef KTRACE 78 #include <sys/ktrace.h> 79 #endif 80 #ifdef COMPAT_FREEBSD32 81 #include <compat/freebsd32/freebsd32_util.h> 82 #endif 83 84 #include <net/vnet.h> 85 86 #include <security/audit/audit.h> 87 #include <security/mac/mac_framework.h> 88 89 #include <vm/vm.h> 90 #include <vm/vm_param.h> 91 #include <vm/vm_object.h> 92 #include <vm/vm_page.h> 93 #include <vm/vm_pager.h> 94 #include <vm/vm_kern.h> 95 #include <vm/vm_extern.h> 96 #include <vm/uma.h> 97 98 #if defined(INET) || defined(INET6) 99 #ifdef SCTP 100 #include <netinet/sctp.h> 101 #include <netinet/sctp_peeloff.h> 102 #endif /* SCTP */ 103 #endif /* INET || INET6 */ 104 105 /* 106 * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC 107 * and SOCK_NONBLOCK. 
 */
#define	ACCEPT4_INHERIT	0x1
#define	ACCEPT4_COMPAT	0x2

/* Forward declarations for the send/recv common paths below. */
static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);

static int accept1(struct thread *td, int s, struct sockaddr *uname,
    socklen_t *anamelen, int flags);
static int do_sendfile(struct thread *td, struct sendfile_args *uap,
    int compat);
static int getsockname1(struct thread *td, struct getsockname_args *uap,
    int compat);
static int getpeername1(struct thread *td, struct getpeername_args *uap,
    int compat);

/* Per-CPU sendfile(2) statistics, one counter per sfstat field. */
counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];

static int filt_sfsync_attach(struct knote *kn);
static void filt_sfsync_detach(struct knote *kn);
static int filt_sfsync(struct knote *kn, long hint);

/*
 * sendfile(2)-related variables and associated sysctls
 */
static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0,
    "sendfile(2) tunables");
static int sfreadahead = 1;
SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW,
    &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks");

#ifdef SFSYNC_DEBUG
static int sf_sync_debug = 0;
SYSCTL_INT(_debug, OID_AUTO, sf_sync_debug, CTLFLAG_RW,
    &sf_sync_debug, 0, "Output debugging during sf_sync lifecycle");
/* Debug printf, compiled away entirely unless SFSYNC_DEBUG is defined. */
#define	SFSYNC_DPRINTF(s, ...)				\
	do {						\
		if (sf_sync_debug)			\
			printf((s), ##__VA_ARGS__);	\
	} while (0)
#else
#define	SFSYNC_DPRINTF(c, ...)
#endif

static uma_zone_t	zone_sfsync;

/* kqueue filter ops for EVFILT_SENDFILE; not fd-based (f_isfd = 0). */
static struct filterops sendfile_filtops = {
	.f_isfd = 0,
	.f_attach = filt_sfsync_attach,
	.f_detach = filt_sfsync_detach,
	.f_event = filt_sfsync,
};

/*
 * Allocate the per-CPU sendfile statistics counters at boot.
 */
static void
sfstat_init(const void *unused)
{

	COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
	    M_WAITOK);
}
SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);

/*
 * Create the sendfile_sync UMA zone and register the EVFILT_SENDFILE
 * kqueue filter at boot.
 */
static void
sf_sync_init(const void *unused)
{

	zone_sfsync = uma_zcreate("sendfile_sync", sizeof(struct sendfile_sync),
	    NULL, NULL,
	    NULL, NULL,
	    UMA_ALIGN_CACHE,
	    0);
	kqueue_add_filteropts(EVFILT_SENDFILE, &sendfile_filtops);
}
SYSINIT(sf_sync, SI_SUB_MBUF, SI_ORDER_FIRST, sf_sync_init, NULL);

/*
 * Sysctl handler for kern.ipc.sfstat: snapshot the per-CPU counters into
 * a plain struct sfstat for userland; any write request zeroes the stats.
 */
static int
sfstat_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct sfstat s;

	COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
	if (req->newptr)
		COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
	return (SYSCTL_OUT(req, &s, sizeof(s)));
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
    NULL, 0, sfstat_sysctl, "I", "sendfile statistics");

/*
 * Convert a user file descriptor to a kernel file entry and check if required
 * capability rights are present.
 * A reference on the file entry is held upon returning; the caller is
 * responsible for the matching fdrop().
 */
static int
getsock_cap(struct filedesc *fdp, int fd, cap_rights_t *rightsp,
    struct file **fpp, u_int *fflagp)
{
	struct file *fp;
	int error;

	error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
	if (error != 0)
		return (error);
	/* Only socket descriptors are acceptable here. */
	if (fp->f_type != DTYPE_SOCKET) {
		fdrop(fp, curthread);
		return (ENOTSOCK);
	}
	if (fflagp != NULL)
		*fflagp = fp->f_flag;
	*fpp = fp;
	return (0);
}

/*
 * System call interface to the socket abstraction.
 */
#if defined(COMPAT_43)
#define	COMPAT_OLDSOCK
#endif

/*
 * socket(2): create a socket and return a new descriptor referencing it.
 * SOCK_CLOEXEC/SOCK_NONBLOCK may be OR'ed into the type argument and are
 * translated here into descriptor/file flags.
 */
int
sys_socket(td, uap)
	struct thread *td;
	struct socket_args /* {
		int	domain;
		int	type;
		int	protocol;
	} */ *uap;
{
	struct socket *so;
	struct file *fp;
	int fd, error, type, oflag, fflag;

	AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);

	type = uap->type;
	oflag = 0;
	fflag = 0;
	/* Strip flag bits out of the type before socreate() sees it. */
	if ((type & SOCK_CLOEXEC) != 0) {
		type &= ~SOCK_CLOEXEC;
		oflag |= O_CLOEXEC;
	}
	if ((type & SOCK_NONBLOCK) != 0) {
		type &= ~SOCK_NONBLOCK;
		fflag |= FNONBLOCK;
	}

#ifdef MAC
	error = mac_socket_check_create(td->td_ucred, uap->domain, type,
	    uap->protocol);
	if (error != 0)
		return (error);
#endif
	error = falloc(td, &fp, &fd, oflag);
	if (error != 0)
		return (error);
	/* An extra reference on `fp' has been held for us by falloc(). */
	error = socreate(uap->domain, &so, type, uap->protocol,
	    td->td_ucred, td);
	if (error != 0) {
		/* On failure, take the descriptor back out of the table. */
		fdclose(td->td_proc->p_fd, fp, fd, td);
	} else {
		finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
		if ((fflag & FNONBLOCK) != 0)
			(void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
		td->td_retval[0] = fd;
	}
	fdrop(fp, td);
	return (error);
}

/*
 * bind(2): copy in the user sockaddr and hand off to kern_bind().
 */
/* ARGSUSED */
int
sys_bind(td, uap)
	struct thread *td;
	struct bind_args /* {
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_bind(td, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}

/*
 * Common implementation for bind(2) and bindat(2).  dirfd is AT_FDCWD for
 * a plain bind, otherwise a directory descriptor relative to which a
 * local-domain socket address is interpreted.
 */
static int
kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
	error = getsock_cap(td->td_proc->p_fd, fd,
	    cap_rights_init(&rights, CAP_BIND), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(sa);
#endif
#ifdef MAC
	error = mac_socket_check_bind(td->td_ucred, so, sa);
	if (error == 0) {
#endif
		if (dirfd == AT_FDCWD)
			error = sobind(so, sa, td);
		else
			error = sobindat(dirfd, so, sa, td);
#ifdef MAC
	}
#endif
	fdrop(fp, td);
	return (error);
}

int
kern_bind(struct thread *td, int fd, struct sockaddr *sa)
{

	return (kern_bindat(td, AT_FDCWD, fd, sa));
}

/*
 * bindat(2): like bind(2) but relative to a directory descriptor.
 */
/* ARGSUSED */
int
sys_bindat(td, uap)
	struct thread *td;
	struct bindat_args /* {
		int	fd;
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_bindat(td, uap->fd, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}

/*
 * listen(2): mark the socket as accepting connections.
 */
/* ARGSUSED */
int
sys_listen(td, uap)
	struct thread *td;
	struct listen_args /* {
		int	s;
		int	backlog;
	} */ *uap;
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(uap->s);
	error = getsock_cap(td->td_proc->p_fd, uap->s,
	    cap_rights_init(&rights, CAP_LISTEN), &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
#ifdef MAC
		error = mac_socket_check_listen(td->td_ucred, so);
		if (error == 0)
#endif
			error = solisten(so, uap->backlog, td);
		fdrop(fp, td);
	}
	return(error);
}

/*
 * accept1(): common user-level wrapper for accept(2)/accept4(2)/oaccept().
 * Copies the peer name and length out to userland after kern_accept4()
 * does the real work.
 */
static int
accept1(td, s, uname, anamelen, flags)
	struct thread *td;
	int s;
	struct sockaddr *uname;
	socklen_t *anamelen;
	int flags;
{
	struct sockaddr *name;
	socklen_t namelen;
	struct file *fp;
	int error;

	if (uname == NULL)
		return (kern_accept4(td, s, NULL, NULL, flags, NULL));

	error = copyin(anamelen, &namelen, sizeof (namelen));
	if (error != 0)
		return (error);

	error = kern_accept4(td, s, &name, &namelen, flags, &fp);

	if (error != 0)
		return (error);

	if (error == 0 && uname != NULL) {
#ifdef COMPAT_OLDSOCK
		/* Old sockaddr layout: overwrite sa_len with the family. */
		if (flags & ACCEPT4_COMPAT)
			((struct osockaddr *)name)->sa_family =
			    name->sa_family;
#endif
		error = copyout(name, uname, namelen);
	}
	if (error == 0)
		error = copyout(&namelen, anamelen,
		    sizeof(namelen));
	/* A copyout failure closes the freshly accepted descriptor. */
	if (error != 0)
		fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
	fdrop(fp, td);
	free(name, M_SONAME);
	return (error);
}

int
kern_accept(struct thread *td, int s, struct sockaddr **name,
    socklen_t *namelen, struct file **fp)
{
	return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
}

/*
 * Common accept path: wait for a completed connection on the listening
 * socket `s', detach it from the completion queue, and install it as a
 * new descriptor.  On success *name/*namelen (if requested) receive the
 * peer address (caller frees with M_SONAME), and *fp (if non-NULL)
 * receives a referenced file pointer.
 */
int
kern_accept4(struct thread *td, int s, struct sockaddr **name,
    socklen_t *namelen, int flags, struct file **fp)
{
	struct filedesc *fdp;
	struct file *headfp, *nfp = NULL;
	struct sockaddr *sa = NULL;
	struct socket *head, *so;
	cap_rights_t rights;
	u_int fflag;
	pid_t pgid;
	int error, fd, tmp;

	if (name != NULL)
		*name = NULL;

	AUDIT_ARG_FD(s);
	fdp = td->td_proc->p_fd;
	error = getsock_cap(fdp, s, cap_rights_init(&rights, CAP_ACCEPT),
	    &headfp, &fflag);
	if (error != 0)
		return (error);
	head = headfp->f_data;
	/* accept() is only valid on sockets that have been listen()ed on. */
	if ((head->so_options & SO_ACCEPTCONN) == 0) {
		error = EINVAL;
		goto done;
	}
#ifdef MAC
	error = mac_socket_check_accept(td->td_ucred, head);
	if (error != 0)
		goto done;
#endif
	error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0);
	if (error != 0)
		goto done;
	ACCEPT_LOCK();
	/* Non-blocking socket with an empty completion queue: fail fast. */
	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
		ACCEPT_UNLOCK();
		error = EWOULDBLOCK;
		goto noconnection;
	}
	/* Sleep until a connection completes or the socket errors out. */
	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
			head->so_error = ECONNABORTED;
			break;
		}
		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
		    "accept", 0);
		if (error != 0) {
			ACCEPT_UNLOCK();
			goto noconnection;
		}
	}
	if (head->so_error) {
		error = head->so_error;
		head->so_error = 0;
		ACCEPT_UNLOCK();
		goto noconnection;
	}
	so = TAILQ_FIRST(&head->so_comp);
	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));

	/*
	 * Before changing the flags on the socket, we have to bump the
	 * reference count.  Otherwise, if the protocol calls sofree(),
	 * the socket will be released due to a zero refcount.
	 */
	SOCK_LOCK(so);			/* soref() and so_state update */
	soref(so);			/* file descriptor reference */

	TAILQ_REMOVE(&head->so_comp, so, so_list);
	head->so_qlen--;
	/* ACCEPT4_INHERIT: legacy accept(2) inherits O_NONBLOCK and owner. */
	if (flags & ACCEPT4_INHERIT)
		so->so_state |= (head->so_state & SS_NBIO);
	else
		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
	so->so_qstate &= ~SQ_COMP;
	so->so_head = NULL;

	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();

	/* An extra reference on `nfp' has been held for us by falloc(). */
	td->td_retval[0] = fd;

	/* connection has been removed from the listen queue */
	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);

	if (flags & ACCEPT4_INHERIT) {
		pgid = fgetown(&head->so_sigio);
		if (pgid != 0)
			fsetown(pgid, &so->so_sigio);
	} else {
		fflag &= ~(FNONBLOCK | FASYNC);
		if (flags & SOCK_NONBLOCK)
			fflag |= FNONBLOCK;
	}

	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
	/* Sync socket nonblocking/async state with file flags */
	tmp = fflag & FNONBLOCK;
	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
	tmp = fflag & FASYNC;
	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
	sa = 0;
	error = soaccept(so, &sa);
	if (error != 0)
		goto noconnection;
	if (sa == NULL) {
		if (name)
			*namelen = 0;
		goto done;
	}
	AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
	if (name) {
		/* check sa_len before it is destroyed */
		if (*namelen > sa->sa_len)
			*namelen = sa->sa_len;
#ifdef KTRACE
		if (KTRPOINT(td, KTR_STRUCT))
			ktrsockaddr(sa);
#endif
		/* Ownership of `sa' transfers to the caller. */
		*name = sa;
		sa = NULL;
	}
noconnection:
	free(sa, M_SONAME);

	/*
	 * close the new descriptor, assuming someone hasn't ripped it
	 * out from under us.
	 */
	if (error != 0)
		fdclose(fdp, nfp, fd, td);

	/*
	 * Release explicitly held references before returning.  We return
	 * a reference on nfp to the caller on success if they request it.
584 */ 585 done: 586 if (fp != NULL) { 587 if (error == 0) { 588 *fp = nfp; 589 nfp = NULL; 590 } else 591 *fp = NULL; 592 } 593 if (nfp != NULL) 594 fdrop(nfp, td); 595 fdrop(headfp, td); 596 return (error); 597 } 598 599 int 600 sys_accept(td, uap) 601 struct thread *td; 602 struct accept_args *uap; 603 { 604 605 return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT)); 606 } 607 608 int 609 sys_accept4(td, uap) 610 struct thread *td; 611 struct accept4_args *uap; 612 { 613 614 if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) 615 return (EINVAL); 616 617 return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags)); 618 } 619 620 #ifdef COMPAT_OLDSOCK 621 int 622 oaccept(td, uap) 623 struct thread *td; 624 struct accept_args *uap; 625 { 626 627 return (accept1(td, uap->s, uap->name, uap->anamelen, 628 ACCEPT4_INHERIT | ACCEPT4_COMPAT)); 629 } 630 #endif /* COMPAT_OLDSOCK */ 631 632 /* ARGSUSED */ 633 int 634 sys_connect(td, uap) 635 struct thread *td; 636 struct connect_args /* { 637 int s; 638 caddr_t name; 639 int namelen; 640 } */ *uap; 641 { 642 struct sockaddr *sa; 643 int error; 644 645 error = getsockaddr(&sa, uap->name, uap->namelen); 646 if (error == 0) { 647 error = kern_connect(td, uap->s, sa); 648 free(sa, M_SONAME); 649 } 650 return (error); 651 } 652 653 static int 654 kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) 655 { 656 struct socket *so; 657 struct file *fp; 658 cap_rights_t rights; 659 int error, interrupted = 0; 660 661 AUDIT_ARG_FD(fd); 662 AUDIT_ARG_SOCKADDR(td, dirfd, sa); 663 error = getsock_cap(td->td_proc->p_fd, fd, 664 cap_rights_init(&rights, CAP_CONNECT), &fp, NULL); 665 if (error != 0) 666 return (error); 667 so = fp->f_data; 668 if (so->so_state & SS_ISCONNECTING) { 669 error = EALREADY; 670 goto done1; 671 } 672 #ifdef KTRACE 673 if (KTRPOINT(td, KTR_STRUCT)) 674 ktrsockaddr(sa); 675 #endif 676 #ifdef MAC 677 error = mac_socket_check_connect(td->td_ucred, so, sa); 678 if (error != 
	    0)
		goto bad;
#endif
	if (dirfd == AT_FDCWD)
		error = soconnect(so, sa, td);
	else
		error = soconnectat(dirfd, so, sa, td);
	if (error != 0)
		goto bad;
	/* Non-blocking connect: report in-progress instead of sleeping. */
	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
		error = EINPROGRESS;
		goto done1;
	}
	SOCK_LOCK(so);
	/* Wait for the protocol to finish (or fail) the connection. */
	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
		    "connec", 0);
		if (error != 0) {
			if (error == EINTR || error == ERESTART)
				interrupted = 1;
			break;
		}
	}
	if (error == 0) {
		error = so->so_error;
		so->so_error = 0;
	}
	SOCK_UNLOCK(so);
bad:
	/*
	 * If the sleep was interrupted the connect is left in progress so
	 * a later connect() can pick it up; otherwise clear the state.
	 */
	if (!interrupted)
		so->so_state &= ~SS_ISCONNECTING;
	if (error == ERESTART)
		error = EINTR;
done1:
	fdrop(fp, td);
	return (error);
}

int
kern_connect(struct thread *td, int fd, struct sockaddr *sa)
{

	return (kern_connectat(td, AT_FDCWD, fd, sa));
}

/*
 * connectat(2): like connect(2) but relative to a directory descriptor.
 */
/* ARGSUSED */
int
sys_connectat(td, uap)
	struct thread *td;
	struct connectat_args /* {
		int	fd;
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_connectat(td, uap->fd, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}

/*
 * Common implementation for socketpair(2): create two connected sockets
 * and install them as descriptors rsv[0]/rsv[1].  On any failure all
 * partially created state is unwound via the free* labels.
 */
int
kern_socketpair(struct thread *td, int domain, int type, int protocol,
    int *rsv)
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *fp1, *fp2;
	struct socket *so1, *so2;
	int fd, error, oflag, fflag;

	AUDIT_ARG_SOCKET(domain, type, protocol);

	oflag = 0;
	fflag = 0;
	/* Strip flag bits out of the type, as in sys_socket(). */
	if ((type & SOCK_CLOEXEC) != 0) {
		type &= ~SOCK_CLOEXEC;
		oflag |= O_CLOEXEC;
	}
	if ((type & SOCK_NONBLOCK) != 0) {
		type &= ~SOCK_NONBLOCK;
		fflag |= FNONBLOCK;
	}
#ifdef MAC
	/* We might want to have a separate check for socket pairs. */
	error = mac_socket_check_create(td->td_ucred, domain, type,
	    protocol);
	if (error != 0)
		return (error);
#endif
	error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
	if (error != 0)
		return (error);
	error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
	if (error != 0)
		goto free1;
	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
	error = falloc(td, &fp1, &fd, oflag);
	if (error != 0)
		goto free2;
	rsv[0] = fd;
	fp1->f_data = so1;	/* so1 already has ref count */
	error = falloc(td, &fp2, &fd, oflag);
	if (error != 0)
		goto free3;
	fp2->f_data = so2;	/* so2 already has ref count */
	rsv[1] = fd;
	error = soconnect2(so1, so2);
	if (error != 0)
		goto free4;
	if (type == SOCK_DGRAM) {
		/*
		 * Datagram socket connection is asymmetric.
		 */
		error = soconnect2(so2, so1);
		if (error != 0)
			goto free4;
	}
	finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
	    &socketops);
	finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
	    &socketops);
	if ((fflag & FNONBLOCK) != 0) {
		(void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
		(void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
	}
	fdrop(fp1, td);
	fdrop(fp2, td);
	return (0);
free4:
	fdclose(fdp, fp2, rsv[1], td);
	fdrop(fp2, td);
free3:
	fdclose(fdp, fp1, rsv[0], td);
	fdrop(fp1, td);
free2:
	if (so2 != NULL)
		(void)soclose(so2);
free1:
	if (so1 != NULL)
		(void)soclose(so1);
	return (error);
}

/*
 * socketpair(2): thin wrapper that copies the two new descriptors out to
 * userland, closing them again if the copyout fails.
 */
int
sys_socketpair(struct thread *td, struct socketpair_args *uap)
{
	int error, sv[2];

	error = kern_socketpair(td, uap->domain, uap->type,
	    uap->protocol, sv);
	if (error != 0)
		return (error);
	error = copyout(sv, uap->rsv, 2 * sizeof(int));
	if (error != 0) {
		(void)kern_close(td, sv[0]);
		(void)kern_close(td,
		    sv[1]);
	}
	return (error);
}

/*
 * Common user-level send path for sendto(2)/sendmsg(2)/osend().
 * Copies in the destination address and control data, then calls
 * kern_sendit().
 */
static int
sendit(td, s, mp, flags)
	struct thread *td;
	int s;
	struct msghdr *mp;
	int flags;
{
	struct mbuf *control;
	struct sockaddr *to;
	int error;

#ifdef CAPABILITY_MODE
	/* Explicit destinations are forbidden in capability mode. */
	if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
		return (ECAPMODE);
#endif

	if (mp->msg_name != NULL) {
		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
		if (error != 0) {
			to = NULL;
			goto bad;
		}
		/* msg_name now points at the kernel copy. */
		mp->msg_name = to;
	} else {
		to = NULL;
	}

	if (mp->msg_control) {
		if (mp->msg_controllen < sizeof(struct cmsghdr)
#ifdef COMPAT_OLDSOCK
		    && mp->msg_flags != MSG_COMPAT
#endif
		) {
			error = EINVAL;
			goto bad;
		}
		error = sockargs(&control, mp->msg_control,
		    mp->msg_controllen, MT_CONTROL);
		if (error != 0)
			goto bad;
#ifdef COMPAT_OLDSOCK
		/*
		 * Old sendmsg passed bare access rights; wrap them in a
		 * modern SCM_RIGHTS cmsghdr.
		 */
		if (mp->msg_flags == MSG_COMPAT) {
			struct cmsghdr *cm;

			M_PREPEND(control, sizeof(*cm), M_WAITOK);
			cm = mtod(control, struct cmsghdr *);
			cm->cmsg_len = control->m_len;
			cm->cmsg_level = SOL_SOCKET;
			cm->cmsg_type = SCM_RIGHTS;
		}
#endif
	} else {
		control = NULL;
	}

	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);

bad:
	free(to, M_SONAME);
	return (error);
}

/*
 * Kernel-level send: builds a uio from the msghdr iovecs and hands the
 * data (plus optional control mbufs) to sosend().  Consumes `control'.
 */
int
kern_sendit(td, s, mp, flags, control, segflg)
	struct thread *td;
	int s;
	struct msghdr *mp;
	int flags;
	struct mbuf *control;
	enum uio_seg segflg;
{
	struct file *fp;
	struct uio auio;
	struct iovec *iov;
	struct socket *so;
	cap_rights_t rights;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	ssize_t len;
	int i, error;

	AUDIT_ARG_FD(s);
	cap_rights_init(&rights, CAP_SEND);
	/* Sending to an explicit address also requires CAP_CONNECT. */
	if (mp->msg_name != NULL) {
		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
		cap_rights_set(&rights, CAP_CONNECT);
	}
	error = getsock_cap(td->td_proc->p_fd, s, &rights, &fp, NULL);
	if (error != 0)
		return (error);
	so = (struct socket *)fp->f_data;

#ifdef KTRACE
	if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(mp->msg_name);
#endif
#ifdef MAC
	if (mp->msg_name != NULL) {
		error = mac_socket_check_connect(td->td_ucred, so,
		    mp->msg_name);
		if (error != 0)
			goto bad;
	}
	error = mac_socket_check_send(td->td_ucred, so);
	if (error != 0)
		goto bad;
#endif

	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = segflg;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	/* Sum iovec lengths, rejecting overflow into a negative resid. */
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			error = EINVAL;
			goto bad;
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	len = auio.uio_resid;
	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
	if (error != 0) {
		/* A partial transfer before interruption counts as success. */
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Generation of SIGPIPE can be controlled per socket */
		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
		    !(flags & MSG_NOSIGNAL)) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	if (error == 0)
		td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = td->td_retval[0];
		ktrgenio(s, UIO_WRITE, ktruio, error);
	}
#endif
bad:
	fdrop(fp, td);
	return (error);
}

/*
 * sendto(2): build a one-iovec msghdr and call sendit().
 */
int
sys_sendto(td, uap)
	struct thread *td;
	struct sendto_args /* {
		int	s;
		caddr_t	buf;
		size_t	len;
		int	flags;
		caddr_t	to;
		int	tolen;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;

	msg.msg_name = uap->to;
	msg.msg_namelen = uap->tolen;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	msg.msg_control = 0;
#ifdef COMPAT_OLDSOCK
	msg.msg_flags = 0;
#endif
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	return (sendit(td, uap->s, &msg, uap->flags));
}

#ifdef COMPAT_OLDSOCK
/*
 * 4.3BSD send(): no destination address.
 */
int
osend(td, uap)
	struct thread *td;
	struct osend_args /* {
		int	s;
		caddr_t	buf;
		int	len;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;

	msg.msg_name = 0;
	msg.msg_namelen = 0;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	msg.msg_control = 0;
	msg.msg_flags = 0;
	return (sendit(td, uap->s, &msg, uap->flags));
}

/*
 * 4.3BSD sendmsg(): reads the old omsghdr layout, marks MSG_COMPAT so
 * sendit() wraps any access rights in a cmsghdr.
 */
int
osendmsg(td, uap)
	struct thread *td;
	struct osendmsg_args /* {
		int	s;
		caddr_t	msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_iov = iov;
	msg.msg_flags = MSG_COMPAT;
	error = sendit(td, uap->s, &msg, uap->flags);
	free(iov, M_IOV);
	return (error);
}
#endif

/*
 * sendmsg(2): copy in the msghdr and iovec array, then call sendit().
 */
int
sys_sendmsg(td, uap)
	struct thread *td;
	struct sendmsg_args /* {
		int	s;
		caddr_t	msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (msg));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_iov = iov;
#ifdef COMPAT_OLDSOCK
	msg.msg_flags = 0;
#endif
	error = sendit(td, uap->s, &msg, uap->flags);
	free(iov, M_IOV);
	return (error);
}

/*
 * Kernel-level receive: builds a uio from the msghdr iovecs, calls
 * soreceive(), then copies the source address and control data back
 * according to `fromseg'/`controlp'.  If controlp is non-NULL the caller
 * takes ownership of the control mbuf chain instead of a user copyout.
 */
int
kern_recvit(td, s, mp, fromseg, controlp)
	struct thread *td;
	int s;
	struct msghdr *mp;
	enum uio_seg fromseg;
	struct mbuf **controlp;
{
	struct uio auio;
	struct iovec *iov;
	struct mbuf *m, *control = NULL;
	caddr_t ctlbuf;
	struct file *fp;
	struct socket *so;
	struct sockaddr *fromsa = NULL;
	cap_rights_t rights;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	ssize_t len;
	int error, i;

	if (controlp != NULL)
		*controlp = NULL;

	AUDIT_ARG_FD(s);
	error = getsock_cap(td->td_proc->p_fd, s,
	    cap_rights_init(&rights, CAP_RECV), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;

#ifdef MAC
	error = mac_socket_check_receive(td->td_ucred, so);
	if (error != 0) {
		fdrop(fp, td);
		return (error);
	}
#endif

	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	/* Sum iovec lengths, rejecting overflow into a negative resid. */
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			fdrop(fp, td);
			return (EINVAL);
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	len = auio.uio_resid;
	error = soreceive(so, &fromsa, &auio, NULL,
	    (mp->msg_control || controlp) ? &control : NULL,
	    &mp->msg_flags);
	if (error != 0) {
		/* A partial transfer before interruption counts as success. */
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	if (fromsa != NULL)
		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = len - auio.uio_resid;
		ktrgenio(s, UIO_READ, ktruio, error);
	}
#endif
	if (error != 0)
		goto out;
	td->td_retval[0] = len - auio.uio_resid;
	if (mp->msg_name) {
		len = mp->msg_namelen;
		if (len <= 0 || fromsa == NULL)
			len = 0;
		else {
			/* save sa_len before it is destroyed by MSG_COMPAT */
			len = MIN(len, fromsa->sa_len);
#ifdef COMPAT_OLDSOCK
			if (mp->msg_flags & MSG_COMPAT)
				((struct osockaddr *)fromsa)->sa_family =
				    fromsa->sa_family;
#endif
			if (fromseg == UIO_USERSPACE) {
				error = copyout(fromsa, mp->msg_name,
				    (unsigned)len);
				if (error != 0)
					goto out;
			} else
				bcopy(fromsa, mp->msg_name, len);
		}
		mp->msg_namelen = len;
	}
	if (mp->msg_control && controlp == NULL) {
#ifdef COMPAT_OLDSOCK
		/*
		 * We assume that old recvmsg calls won't receive access
		 * rights and other control info, esp. as control info
		 * is always optional and those options didn't exist in 4.3.
		 * If we receive rights, trim the cmsghdr; anything else
		 * is tossed.
		 */
		if (control && mp->msg_flags & MSG_COMPAT) {
			if (mtod(control, struct cmsghdr *)->cmsg_level !=
			    SOL_SOCKET ||
			    mtod(control, struct cmsghdr *)->cmsg_type !=
			    SCM_RIGHTS) {
				mp->msg_controllen = 0;
				goto out;
			}
			control->m_len -= sizeof (struct cmsghdr);
			control->m_data += sizeof (struct cmsghdr);
		}
#endif
		len = mp->msg_controllen;
		m = control;
		mp->msg_controllen = 0;
		ctlbuf = mp->msg_control;

		/* Copy out as much control data as fits the user buffer. */
		while (m && len > 0) {
			unsigned int tocopy;

			if (len >= m->m_len)
				tocopy = m->m_len;
			else {
				/* Buffer too small: flag the truncation. */
				mp->msg_flags |= MSG_CTRUNC;
				tocopy = len;
			}

			if ((error = copyout(mtod(m, caddr_t),
			    ctlbuf, tocopy)) != 0)
				goto out;

			ctlbuf += tocopy;
			len -= tocopy;
			m = m->m_next;
		}
		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
	}
out:
	fdrop(fp, td);
#ifdef KTRACE
	if (fromsa && KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(fromsa);
#endif
	free(fromsa, M_SONAME);

	if (error == 0 && controlp != NULL)
		*controlp = control;
	else if (control)
		m_freem(control);

	return (error);
}

/*
 * Common user-level receive wrapper: runs kern_recvit() and copies the
 * resulting address length back to userland if requested.
 */
static int
recvit(td, s, mp, namelenp)
	struct thread *td;
	int s;
	struct msghdr *mp;
	void *namelenp;
{
	int error;

	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
	if (error != 0)
		return (error);
	if (namelenp != NULL) {
		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
#ifdef COMPAT_OLDSOCK
		if (mp->msg_flags & MSG_COMPAT)
			error = 0;	/* old recvfrom didn't check */
#endif
	}
	return (error);
}

/*
 * recvfrom(2): build a one-iovec msghdr and call recvit().
 */
int
sys_recvfrom(td, uap)
	struct thread *td;
	struct recvfrom_args /* {
		int	s;
		caddr_t	buf;
		size_t	len;
		int	flags;
		struct sockaddr * __restrict	from;
		socklen_t * __restrict fromlenaddr;
	} */ *uap;
{
1302 struct msghdr msg; 1303 struct iovec aiov; 1304 int error; 1305 1306 if (uap->fromlenaddr) { 1307 error = copyin(uap->fromlenaddr, 1308 &msg.msg_namelen, sizeof (msg.msg_namelen)); 1309 if (error != 0) 1310 goto done2; 1311 } else { 1312 msg.msg_namelen = 0; 1313 } 1314 msg.msg_name = uap->from; 1315 msg.msg_iov = &aiov; 1316 msg.msg_iovlen = 1; 1317 aiov.iov_base = uap->buf; 1318 aiov.iov_len = uap->len; 1319 msg.msg_control = 0; 1320 msg.msg_flags = uap->flags; 1321 error = recvit(td, uap->s, &msg, uap->fromlenaddr); 1322 done2: 1323 return (error); 1324 } 1325 1326 #ifdef COMPAT_OLDSOCK 1327 int 1328 orecvfrom(td, uap) 1329 struct thread *td; 1330 struct recvfrom_args *uap; 1331 { 1332 1333 uap->flags |= MSG_COMPAT; 1334 return (sys_recvfrom(td, uap)); 1335 } 1336 #endif 1337 1338 #ifdef COMPAT_OLDSOCK 1339 int 1340 orecv(td, uap) 1341 struct thread *td; 1342 struct orecv_args /* { 1343 int s; 1344 caddr_t buf; 1345 int len; 1346 int flags; 1347 } */ *uap; 1348 { 1349 struct msghdr msg; 1350 struct iovec aiov; 1351 1352 msg.msg_name = 0; 1353 msg.msg_namelen = 0; 1354 msg.msg_iov = &aiov; 1355 msg.msg_iovlen = 1; 1356 aiov.iov_base = uap->buf; 1357 aiov.iov_len = uap->len; 1358 msg.msg_control = 0; 1359 msg.msg_flags = uap->flags; 1360 return (recvit(td, uap->s, &msg, NULL)); 1361 } 1362 1363 /* 1364 * Old recvmsg. This code takes advantage of the fact that the old msghdr 1365 * overlays the new one, missing only the flags, and with the (old) access 1366 * rights where the control fields are now. 
 */
int
orecvmsg(td, uap)
	struct thread *td;
	struct orecvmsg_args /* {
		int	s;
		struct	omsghdr *msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_flags = uap->flags | MSG_COMPAT;
	msg.msg_iov = iov;
	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
	/* The old interface reports control length as "access rights". */
	if (msg.msg_controllen && error == 0)
		error = copyout(&msg.msg_controllen,
		    &uap->msg->msg_accrightslen, sizeof (int));
	free(iov, M_IOV);
	return (error);
}
#endif

/*
 * recvmsg(2): copy in the user msghdr and its iovec array, receive via
 * recvit(), then copy the updated msghdr back out to user space.
 */
int
sys_recvmsg(td, uap)
	struct thread *td;
	struct recvmsg_args /* {
		int	s;
		struct	msghdr *msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *uiov, *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (msg));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_flags = uap->flags;
#ifdef COMPAT_OLDSOCK
	msg.msg_flags &= ~MSG_COMPAT;
#endif
	uiov = msg.msg_iov;
	msg.msg_iov = iov;
	error = recvit(td, uap->s, &msg, NULL);
	if (error == 0) {
		/* Restore the user's iovec pointer before copying out. */
		msg.msg_iov = uiov;
		error = copyout(&msg, uap->msg, sizeof(msg));
	}
	free(iov, M_IOV);
	return (error);
}

/*
 * shutdown(2): disable further sends and/or receives on a socket.
 */
/* ARGSUSED */
int
sys_shutdown(td, uap)
	struct thread *td;
	struct shutdown_args /* {
		int	s;
		int	how;
	} */ *uap;
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(uap->s);
	error = getsock_cap(td->td_proc->p_fd, uap->s,
	    cap_rights_init(&rights, CAP_SHUTDOWN), &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
		error = soshutdown(so, uap->how);
		fdrop(fp, td);
	}
	return (error);
}

/*
 * setsockopt(2): thin wrapper around kern_setsockopt() with a
 * user-space option value.
 */
/* ARGSUSED */
int
sys_setsockopt(td, uap)
	struct thread *td;
	struct setsockopt_args /* {
		int	s;
		int	level;
		int	name;
		caddr_t	val;
		int	valsize;
	} */ *uap;
{

	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
	    uap->val, UIO_USERSPACE, uap->valsize));
}

/*
 * Kernel version of setsockopt.  'val' may be a user or kernel address,
 * selected by 'valseg'.
 */
int
kern_setsockopt(td, s, level, name, val, valseg, valsize)
	struct thread *td;
	int s;
	int level;
	int name;
	void *val;
	enum uio_seg valseg;
	socklen_t valsize;
{
	struct socket *so;
	struct file *fp;
	struct sockopt sopt;
	cap_rights_t rights;
	int error;

	if (val == NULL && valsize != 0)
		return (EFAULT);
	if ((int)valsize < 0)
		return (EINVAL);

	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = level;
	sopt.sopt_name = name;
	sopt.sopt_val = val;
	sopt.sopt_valsize = valsize;
	switch (valseg) {
	case UIO_USERSPACE:
		sopt.sopt_td = td;
		break;
	case UIO_SYSSPACE:
		/* NULL sopt_td marks the value as a kernel address. */
		sopt.sopt_td = NULL;
		break;
	default:
		panic("kern_setsockopt called with bad valseg");
	}

	AUDIT_ARG_FD(s);
	error = getsock_cap(td->td_proc->p_fd, s,
	    cap_rights_init(&rights, CAP_SETSOCKOPT), &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
		error = sosetopt(so, &sopt);
		fdrop(fp, td);
	}
	return(error);
}

/*
 * getsockopt(2): copy in the option length, call kern_getsockopt(),
 * and copy the (possibly shortened) length back out.
 */
/* ARGSUSED */
int
sys_getsockopt(td, uap)
	struct thread *td;
	struct getsockopt_args /* {
		int	s;
		int	level;
		int	name;
		void * __restrict	val;
		socklen_t * __restrict avalsize;
	} */ *uap;
{
	socklen_t valsize;
	int error;

	if (uap->val) {
		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
		if (error != 0)
			return (error);
	}
	/* If val is NULL, kern_getsockopt() zeroes valsize for us. */

	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
	    uap->val, UIO_USERSPACE, &valsize);

	if (error == 0)
		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
	return (error);
}

/*
 * Kernel version of getsockopt.
 * optval can be a userland or kernel address (selected by valseg).
 * optlen is always a kernel pointer.
 */
int
kern_getsockopt(td, s, level, name, val, valseg, valsize)
	struct thread *td;
	int s;
	int level;
	int name;
	void *val;
	enum uio_seg valseg;
	socklen_t *valsize;
{
	struct socket *so;
	struct file *fp;
	struct sockopt sopt;
	cap_rights_t rights;
	int error;

	if (val == NULL)
		*valsize = 0;
	if ((int)*valsize < 0)
		return (EINVAL);

	sopt.sopt_dir = SOPT_GET;
	sopt.sopt_level = level;
	sopt.sopt_name = name;
	sopt.sopt_val = val;
	sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
	switch (valseg) {
	case UIO_USERSPACE:
		sopt.sopt_td = td;
		break;
	case UIO_SYSSPACE:
		/* NULL sopt_td marks the value as a kernel address. */
		sopt.sopt_td = NULL;
		break;
	default:
		panic("kern_getsockopt called with bad valseg");
	}

	AUDIT_ARG_FD(s);
	error = getsock_cap(td->td_proc->p_fd, s,
	    cap_rights_init(&rights, CAP_GETSOCKOPT), &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
		error = sogetopt(so, &sopt);
		/* Report the actual option length even on error. */
		*valsize = sopt.sopt_valsize;
		fdrop(fp, td);
	}
	return (error);
}

/*
 * getsockname1() - Get socket name.
 */
/* ARGSUSED */
static int
getsockname1(td, uap, compat)
	struct thread *td;
	struct getsockname_args /* {
		int	fdes;
		struct sockaddr * __restrict asa;
		socklen_t * __restrict alen;
	} */ *uap;
	int compat;
{
	struct sockaddr *sa;
	socklen_t len;
	int error;

	error = copyin(uap->alen, &len, sizeof(len));
	if (error != 0)
		return (error);

	error = kern_getsockname(td, uap->fdes, &sa, &len);
	if (error != 0)
		return (error);

	if (len != 0) {
#ifdef COMPAT_OLDSOCK
		/* Old interface overlays the family on sa_len. */
		if (compat)
			((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
		error = copyout(sa, uap->asa, (u_int)len);
	}
	free(sa, M_SONAME);
	if (error == 0)
		error = copyout(&len, uap->alen, sizeof(len));
	return (error);
}

/*
 * Fetch the local address of a socket.  On success *sa is malloc'd
 * (M_SONAME) and owned by the caller, and *alen is clamped to the
 * actual address length.
 */
int
kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
    socklen_t *alen)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	socklen_t len;
	int error;

	AUDIT_ARG_FD(fd);
	error = getsock_cap(td->td_proc->p_fd, fd,
	    cap_rights_init(&rights, CAP_GETSOCKNAME), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
	*sa = NULL;
	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
	CURVNET_RESTORE();
	if (error != 0)
		goto bad;
	if (*sa == NULL)
		len = 0;
	else
		len = MIN(*alen, (*sa)->sa_len);
	*alen = len;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(*sa);
#endif
bad:
	fdrop(fp, td);
	/* On error the caller gets nothing back; release the address. */
	if (error != 0 && *sa != NULL) {
		free(*sa, M_SONAME);
		*sa = NULL;
	}
	return (error);
}

int
sys_getsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{

	return (getsockname1(td, uap, 0));
}

#ifdef COMPAT_OLDSOCK
int
ogetsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{

	return (getsockname1(td, uap, 1));
}
#endif /* COMPAT_OLDSOCK */

/*
 * getpeername1() - Get name of peer for connected socket.
 */
/* ARGSUSED */
static int
getpeername1(td, uap, compat)
	struct thread *td;
	struct getpeername_args /* {
		int	fdes;
		struct sockaddr * __restrict	asa;
		socklen_t * __restrict	alen;
	} */ *uap;
	int compat;
{
	struct sockaddr *sa;
	socklen_t len;
	int error;

	error = copyin(uap->alen, &len, sizeof (len));
	if (error != 0)
		return (error);

	error = kern_getpeername(td, uap->fdes, &sa, &len);
	if (error != 0)
		return (error);

	if (len != 0) {
#ifdef COMPAT_OLDSOCK
		/* Old interface overlays the family on sa_len. */
		if (compat)
			((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
		error = copyout(sa, uap->asa, (u_int)len);
	}
	free(sa, M_SONAME);
	if (error == 0)
		error = copyout(&len, uap->alen, sizeof(len));
	return (error);
}

/*
 * Fetch the remote address of a connected socket.  On success *sa is
 * malloc'd (M_SONAME) and owned by the caller; fails with ENOTCONN if
 * the socket is not connected.
 */
int
kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
    socklen_t *alen)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	socklen_t len;
	int error;

	AUDIT_ARG_FD(fd);
	error = getsock_cap(td->td_proc->p_fd, fd,
	    cap_rights_init(&rights, CAP_GETPEERNAME), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
		error = ENOTCONN;
		goto done;
	}
	*sa = NULL;
	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
	CURVNET_RESTORE();
	if (error != 0)
		goto bad;
	if (*sa == NULL)
		len = 0;
	else
		len = MIN(*alen, (*sa)->sa_len);
	*alen = len;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(*sa);
#endif
bad:
	/* On error the caller gets nothing back; release the address. */
	if (error != 0 && *sa != NULL) {
		free(*sa, M_SONAME);
		*sa = NULL;
	}
done:
	fdrop(fp, td);
	return (error);
}

int
sys_getpeername(td, uap)
	struct thread *td;
	struct getpeername_args *uap;
{

	return (getpeername1(td, uap, 0));
}

#ifdef COMPAT_OLDSOCK
int
ogetpeername(td, uap)
	struct thread *td;
	struct ogetpeername_args *uap;
{

	/* XXX uap should have type `getpeername_args *' to begin with. */
	return (getpeername1(td, (struct getpeername_args *)uap, 1));
}
#endif /* COMPAT_OLDSOCK */

/*
 * Copy a user-supplied socket argument (usually a sockaddr) into a
 * freshly allocated mbuf of the requested type; *mp is set on success.
 */
int
sockargs(mp, buf, buflen, type)
	struct mbuf **mp;
	caddr_t buf;
	int buflen, type;
{
	struct sockaddr *sa;
	struct mbuf *m;
	int error;

	if (buflen > MLEN) {
#ifdef COMPAT_OLDSOCK
		if (type == MT_SONAME && buflen <= 112)
			buflen = MLEN;		/* unix domain compat. hack */
		else
#endif
			if (buflen > MCLBYTES)
				return (EINVAL);
	}
	m = m_get2(buflen, M_WAITOK, type, 0);
	m->m_len = buflen;
	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
	if (error != 0)
		(void) m_free(m);
	else {
		*mp = m;
		if (type == MT_SONAME) {
			sa = mtod(m, struct sockaddr *);

#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
			/* Old interface put the family where sa_len is. */
			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
				sa->sa_family = sa->sa_len;
#endif
			sa->sa_len = buflen;
		}
	}
	return (error);
}

/*
 * Copy a sockaddr from user space into a malloc'd (M_SONAME) buffer.
 * The caller owns and frees *namp on success.
 */
int
getsockaddr(namp, uaddr, len)
	struct sockaddr **namp;
	caddr_t uaddr;
	size_t len;
{
	struct sockaddr *sa;
	int error;

	if (len > SOCK_MAXADDRLEN)
		return (ENAMETOOLONG);
	if (len < offsetof(struct sockaddr, sa_data[0]))
		return (EINVAL);
	sa = malloc(len, M_SONAME, M_WAITOK);
	error = copyin(uaddr, sa, len);
	if (error != 0) {
		free(sa, M_SONAME);
	} else {
#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
		/* Old interface put the family where sa_len is. */
		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
			sa->sa_family = sa->sa_len;
#endif
		sa->sa_len = len;
		*namp = sa;
	}
	return (error);
}

/*
 * EVFILT_SENDFILE attach: bind the knote to the sendfile_sync passed
 * in via kn_sdata (set up by sf_sync_kqueue_setup()).
 */
static int
filt_sfsync_attach(struct knote *kn)
{
	struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_sdata;
	struct knlist *knl = &sfs->klist;

	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);

	/*
	 * Validate that we actually received this via the kernel API.
	 */
	if ((kn->kn_flags & EV_FLAG1) == 0)
		return (EPERM);

	kn->kn_ptr.p_v = sfs;
	kn->kn_flags &= ~EV_FLAG1;

	knl->kl_lock(knl->kl_lockarg);
	/*
	 * If we're in the "freeing" state,
	 * don't allow the add.  That way we don't
	 * end up racing with some other thread that
	 * is trying to finish some setup.
	 */
	if (sfs->state == SF_STATE_FREEING) {
		knl->kl_unlock(knl->kl_lockarg);
		return (EINVAL);
	}
	knlist_add(&sfs->klist, kn, 1);
	knl->kl_unlock(knl->kl_lockarg);

	return (0);
}

/*
 * Called when a knote is being detached.
 */
static void
filt_sfsync_detach(struct knote *kn)
{
	struct knlist *knl;
	struct sendfile_sync *sfs;
	int do_free = 0;

	sfs = kn->kn_ptr.p_v;
	knl = &sfs->klist;

	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);

	knl->kl_lock(knl->kl_lockarg);
	if (!knlist_empty(knl))
		knlist_remove(knl, kn, 1);

	/*
	 * If the list is empty _AND_ the refcount is 0
	 * _AND_ we've finished the setup phase and now
	 * we're in the running phase, we can free the
	 * underlying sendfile_sync.
	 *
	 * But we shouldn't do it before finishing the
	 * underlying divorce from the knote.
	 *
	 * So, we have the sfsync lock held; transition
	 * it to "freeing", then unlock, then free
	 * normally.
	 */
	if (knlist_empty(knl)) {
		if (sfs->state == SF_STATE_COMPLETED && sfs->count == 0) {
			SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, "
			    "count==0, empty list: time to free!\n",
			    __func__,
			    (unsigned long long) curthread->td_tid,
			    sfs);
			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
			do_free = 1;
		}
	}
	knl->kl_unlock(knl->kl_lockarg);

	/*
	 * Only call free if we're the one who has transitioned things
	 * to free.  Otherwise we could race with some other thread that
	 * is currently tearing things down.
	 */
	if (do_free == 1) {
		SFSYNC_DPRINTF("%s: (%llu) sfs=%p, %s:%d\n",
		    __func__,
		    (unsigned long long) curthread->td_tid,
		    sfs,
		    __FILE__,
		    __LINE__);
		sf_sync_free(sfs);
	}
}

/*
 * EVFILT_SENDFILE event predicate: fires once the transaction has
 * completed and no mbuf references remain.
 */
static int
filt_sfsync(struct knote *kn, long hint)
{
	struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_ptr.p_v;
	int ret;

	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);

	/*
	 * XXX add a lock assertion here!
	 */
	ret = (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED);

	return (ret);
}


/*
 * Detach mapped page and release resources back to the system.
 */
int
sf_buf_mext(struct mbuf *mb, void *addr, void *args)
{
	vm_page_t m;
	struct sendfile_sync *sfs;

	m = sf_buf_page(args);
	sf_buf_free(args);
	vm_page_lock(m);
	vm_page_unwire(m, 0);
	/*
	 * Check for the object going away on us.  This can
	 * happen since we don't hold a reference to it.
	 * If so, we're responsible for freeing the page.
	 */
	if (m->wire_count == 0 && m->object == NULL)
		vm_page_free(m);
	vm_page_unlock(m);
	if (addr != NULL) {
		sfs = addr;
		sf_sync_deref(sfs);
	}
	/*
	 * sfs may be invalid at this point, don't use it!
	 */
	return (EXT_FREE_OK);
}

/*
 * Called to remove a reference to a sf_sync object.
 *
 * This is generally done during the mbuf free path to signify
 * that one of the mbufs in the transaction has been completed.
 *
 * If we're doing SF_SYNC and the refcount is zero then we'll wake
 * up any waiters.
 *
 * IF we're doing SF_KQUEUE and the refcount is zero then we'll
 * fire off the knote.
 */
void
sf_sync_deref(struct sendfile_sync *sfs)
{
	int do_free = 0;

	if (sfs == NULL)
		return;

	mtx_lock(&sfs->mtx);
	KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
	sfs->count--;

	/*
	 * Only fire off the wakeup / kqueue notification if
	 * we are in the running state.
	 */
	if (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED) {
		if (sfs->flags & SF_SYNC)
			cv_signal(&sfs->cv);

		if (sfs->flags & SF_KQUEUE) {
			SFSYNC_DPRINTF("%s: (%llu) sfs=%p: knote!\n",
			    __func__,
			    (unsigned long long) curthread->td_tid,
			    sfs);
			KNOTE_LOCKED(&sfs->klist, 1);
		}

		/*
		 * If we're not waiting around for a sync,
		 * check if the knote list is empty.
		 * If it is, we transition to free.
		 *
		 * XXX I think it's about time I added some state
		 * or flag that says whether we're supposed to be
		 * waiting around until we've done a signal.
		 *
		 * XXX Ie, the reason that I don't free it here
		 * is because the caller will free the last reference,
		 * not us.  That should be codified in some flag
		 * that indicates "self-free" rather than checking
		 * for SF_SYNC all the time.
		 */
		if ((sfs->flags & SF_SYNC) == 0 && knlist_empty(&sfs->klist)) {
			SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, "
			    "count==0, empty list: time to free!\n",
			    __func__,
			    (unsigned long long) curthread->td_tid,
			    sfs);
			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
			do_free = 1;
		}

	}
	mtx_unlock(&sfs->mtx);

	/*
	 * Attempt to do a free here.
	 *
	 * We do this outside of the lock because it may destroy the
	 * lock in question as it frees things.  We can optimise this
	 * later.
	 *
	 * XXX yes, we should make it a requirement to hold the
	 * lock across sf_sync_free().
	 */
	if (do_free == 1) {
		SFSYNC_DPRINTF("%s: (%llu) sfs=%p\n",
		    __func__,
		    (unsigned long long) curthread->td_tid,
		    sfs);
		sf_sync_free(sfs);
	}
}

/*
 * Allocate a sendfile_sync state structure.
 *
 * For now this only knows about the "sleep" sync, but later it will
 * grow various other personalities.
 */
struct sendfile_sync *
sf_sync_alloc(uint32_t flags)
{
	struct sendfile_sync *sfs;

	sfs = uma_zalloc(zone_sfsync, M_WAITOK | M_ZERO);
	mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
	cv_init(&sfs->cv, "sendfile");
	sfs->flags = flags;
	sfs->state = SF_STATE_SETUP;
	/* The knote list shares the sf_sync mutex. */
	knlist_init_mtx(&sfs->klist, &sfs->mtx);

	SFSYNC_DPRINTF("%s: sfs=%p, flags=0x%08x\n", __func__, sfs, sfs->flags);

	return (sfs);
}

/*
 * Take a reference to a sfsync instance.
 *
 * This has to map 1:1 to free calls coming in via sf_buf_mext(),
 * so typically this will be referenced once for each mbuf allocated.
 */
void
sf_sync_ref(struct sendfile_sync *sfs)
{

	if (sfs == NULL)
		return;

	mtx_lock(&sfs->mtx);
	sfs->count++;
	mtx_unlock(&sfs->mtx);
}

/*
 * Optionally block the calling syscall until all in-flight mbufs for
 * this transaction have been freed.  The sf_sync mutex must be held.
 */
void
sf_sync_syscall_wait(struct sendfile_sync *sfs)
{

	if (sfs == NULL)
		return;

	KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!",
	    __func__,
	    sfs));

	/*
	 * If we're not requested to wait during the syscall,
	 * don't bother waiting.
	 */
	if ((sfs->flags & SF_SYNC) == 0)
		goto out;

	/*
	 * This is a bit suboptimal and confusing, so bear with me.
	 *
	 * Ideally sf_sync_syscall_wait() will wait until
	 * all pending mbuf transmit operations are done.
	 * This means that when sendfile becomes async, it'll
	 * run in the background and will transition from
	 * RUNNING to COMPLETED when it's finished acquiring
	 * new things to send.  Then, when the mbufs finish
	 * sending, COMPLETED + sfs->count == 0 is enough to
	 * know that no further work is being done.
	 *
	 * So, we will sleep on both RUNNING and COMPLETED.
	 * It's up to the (in progress) async sendfile loop
	 * to transition the sf_sync from RUNNING to
	 * COMPLETED so the wakeup above will actually
	 * do the cv_signal() call.
	 */
	if (sfs->state != SF_STATE_COMPLETED && sfs->state != SF_STATE_RUNNING)
		goto out;

	if (sfs->count != 0)
		cv_wait(&sfs->cv, &sfs->mtx);
	KASSERT(sfs->count == 0, ("sendfile sync still busy"));

out:
	return;
}

/*
 * Free an sf_sync if it's appropriate to.
 */
void
sf_sync_free(struct sendfile_sync *sfs)
{

	if (sfs == NULL)
		return;

	SFSYNC_DPRINTF("%s: (%lld) sfs=%p; called; state=%d, flags=0x%08x "
	    "count=%d\n",
	    __func__,
	    (long long) curthread->td_tid,
	    sfs,
	    sfs->state,
	    sfs->flags,
	    sfs->count);

	mtx_lock(&sfs->mtx);

	/*
	 * We keep the sf_sync around if the state is active,
	 * we are doing kqueue notification and we have active
	 * knotes.
	 *
	 * If the caller wants to free us right this second it
	 * should transition this to the freeing state.
	 *
	 * So, complain loudly if they break this rule.
	 */
	if (sfs->state != SF_STATE_FREEING) {
		printf("%s: (%llu) sfs=%p; not freeing; let's wait!\n",
		    __func__,
		    (unsigned long long) curthread->td_tid,
		    sfs);
		mtx_unlock(&sfs->mtx);
		return;
	}

	KASSERT(sfs->count == 0, ("sendfile sync still busy"));
	cv_destroy(&sfs->cv);
	/*
	 * This doesn't call knlist_detach() on each knote; it just frees
	 * the entire list.
	 */
	knlist_delete(&sfs->klist, curthread, 1);
	mtx_destroy(&sfs->mtx);
	SFSYNC_DPRINTF("%s: (%llu) sfs=%p; freeing\n",
	    __func__,
	    (unsigned long long) curthread->td_tid,
	    sfs);
	uma_zfree(zone_sfsync, sfs);
}

/*
 * Setup a sf_sync to post a kqueue notification when things are complete.
 */
int
sf_sync_kqueue_setup(struct sendfile_sync *sfs, struct sf_hdtr_kq *sfkq)
{
	struct kevent kev;
	int error;

	sfs->flags |= SF_KQUEUE;

	/* Check the flags are valid */
	if ((sfkq->kq_flags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0)
		return (EINVAL);

	SFSYNC_DPRINTF("%s: sfs=%p: kqfd=%d, flags=0x%08x, ident=%p, udata=%p\n",
	    __func__,
	    sfs,
	    sfkq->kq_fd,
	    sfkq->kq_flags,
	    (void *) sfkq->kq_ident,
	    (void *) sfkq->kq_udata);

	/* Setup and register a knote on the given kqfd. */
	kev.ident = (uintptr_t) sfkq->kq_ident;
	kev.filter = EVFILT_SENDFILE;
	/* EV_FLAG1 proves to filt_sfsync_attach() this came from the kernel. */
	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | sfkq->kq_flags;
	kev.data = (intptr_t) sfs;
	kev.udata = sfkq->kq_udata;

	error = kqfd_register(sfkq->kq_fd, &kev, curthread, 1);
	if (error != 0) {
		SFSYNC_DPRINTF("%s: returned %d\n", __func__, error);
	}
	return (error);
}

/*
 * Transition the sf_sync state machine; fires the completion knote
 * when the RUNNING -> COMPLETED edge is crossed with no mbufs left.
 */
void
sf_sync_set_state(struct sendfile_sync *sfs, sendfile_sync_state_t state,
    int islocked)
{
	sendfile_sync_state_t old_state;

	if (! islocked)
		mtx_lock(&sfs->mtx);

	/*
	 * Update our current state.
	 */
	old_state = sfs->state;
	sfs->state = state;
	SFSYNC_DPRINTF("%s: (%llu) sfs=%p; going from %d to %d\n",
	    __func__,
	    (unsigned long long) curthread->td_tid,
	    sfs,
	    old_state,
	    state);

	/*
	 * If we're transitioning from RUNNING to COMPLETED and the count is
	 * zero, then post the knote.  The caller may have completed the
	 * send before we updated the state to COMPLETED and we need to make
	 * sure this is communicated.
	 */
	if (old_state == SF_STATE_RUNNING
	    && state == SF_STATE_COMPLETED
	    && sfs->count == 0
	    && sfs->flags & SF_KQUEUE) {
		SFSYNC_DPRINTF("%s: (%llu) sfs=%p: triggering knote!\n",
		    __func__,
		    (unsigned long long) curthread->td_tid,
		    sfs);
		KNOTE_LOCKED(&sfs->klist, 1);
	}

	if (! islocked)
		mtx_unlock(&sfs->mtx);
}

/*
 * Set the retval/errno for the given transaction.
 *
 * This will eventually/ideally be used when the KNOTE is fired off
 * to signify the completion of this transaction.
 *
 * The sfsync lock should be held before entering this function.
 */
void
sf_sync_set_retval(struct sendfile_sync *sfs, off_t retval, int xerrno)
{

	KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!",
	    __func__,
	    sfs));

	SFSYNC_DPRINTF("%s: (%llu) sfs=%p: errno=%d, retval=%jd\n",
	    __func__,
	    (unsigned long long) curthread->td_tid,
	    sfs,
	    xerrno,
	    (intmax_t) retval);

	sfs->retval = retval;
	sfs->xerrno = xerrno;
}

/*
 * sendfile(2)
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *		struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Send a file specified by 'fd' and starting at 'offset' to a socket
 * specified by 's'.  Send only 'nbytes' of the file or until EOF if nbytes ==
 * 0.  Optionally add a header and/or trailer to the socket output.  If
 * specified, write the total number of bytes sent into *sbytes.
2364 */ 2365 int 2366 sys_sendfile(struct thread *td, struct sendfile_args *uap) 2367 { 2368 2369 return (do_sendfile(td, uap, 0)); 2370 } 2371 2372 int 2373 _do_sendfile(struct thread *td, int src_fd, int sock_fd, int flags, 2374 int compat, off_t offset, size_t nbytes, off_t *sbytes, 2375 struct uio *hdr_uio, 2376 struct uio *trl_uio, struct sf_hdtr_kq *hdtr_kq) 2377 { 2378 cap_rights_t rights; 2379 struct sendfile_sync *sfs = NULL; 2380 struct file *fp; 2381 int error; 2382 int do_kqueue = 0; 2383 int do_free = 0; 2384 2385 AUDIT_ARG_FD(src_fd); 2386 2387 if (hdtr_kq != NULL) 2388 do_kqueue = 1; 2389 2390 /* 2391 * sendfile(2) can start at any offset within a file so we require 2392 * CAP_READ+CAP_SEEK = CAP_PREAD. 2393 */ 2394 if ((error = fget_read(td, src_fd, 2395 cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) { 2396 goto out; 2397 } 2398 2399 /* 2400 * IF SF_KQUEUE is set but we haven't copied in anything for 2401 * kqueue data, error out. 2402 */ 2403 if (flags & SF_KQUEUE && do_kqueue == 0) { 2404 SFSYNC_DPRINTF("%s: SF_KQUEUE but no KQUEUE data!\n", __func__); 2405 goto out; 2406 } 2407 2408 /* 2409 * If we need to wait for completion, initialise the sfsync 2410 * state here. 2411 */ 2412 if (flags & (SF_SYNC | SF_KQUEUE)) 2413 sfs = sf_sync_alloc(flags & (SF_SYNC | SF_KQUEUE)); 2414 2415 if (flags & SF_KQUEUE) { 2416 error = sf_sync_kqueue_setup(sfs, hdtr_kq); 2417 if (error) { 2418 SFSYNC_DPRINTF("%s: (%llu) error; sfs=%p\n", 2419 __func__, 2420 (unsigned long long) curthread->td_tid, 2421 sfs); 2422 sf_sync_set_state(sfs, SF_STATE_FREEING, 0); 2423 sf_sync_free(sfs); 2424 goto out; 2425 } 2426 } 2427 2428 /* 2429 * Do the sendfile call. 2430 * 2431 * If this fails, it'll free the mbuf chain which will free up the 2432 * sendfile_sync references. 2433 */ 2434 error = fo_sendfile(fp, sock_fd, hdr_uio, trl_uio, offset, 2435 nbytes, sbytes, flags, compat ? 
SFK_COMPAT : 0, sfs, td); 2436 2437 /* 2438 * If the sendfile call succeeded, transition the sf_sync state 2439 * to RUNNING, then COMPLETED. 2440 * 2441 * If the sendfile call failed, then the sendfile call may have 2442 * actually sent some data first - so we check to see whether 2443 * any data was sent. If some data was queued (ie, count > 0) 2444 * then we can't call free; we have to wait until the partial 2445 * transaction completes before we continue along. 2446 * 2447 * This has the side effect of firing off the knote 2448 * if the refcount has hit zero by the time we get here. 2449 */ 2450 if (sfs != NULL) { 2451 mtx_lock(&sfs->mtx); 2452 if (error == 0 || sfs->count > 0) { 2453 /* 2454 * When it's time to do async sendfile, the transition 2455 * to RUNNING signifies that we're actually actively 2456 * adding and completing mbufs. When the last disk 2457 * buffer is read (ie, when we're not doing any 2458 * further read IO and all subsequent stuff is mbuf 2459 * transmissions) we'll transition to COMPLETED 2460 * and when the final mbuf is freed, the completion 2461 * will be signaled. 2462 */ 2463 sf_sync_set_state(sfs, SF_STATE_RUNNING, 1); 2464 2465 /* 2466 * Set the retval before we signal completed. 2467 * If we do it the other way around then transitioning to 2468 * COMPLETED may post the knote before you set the return 2469 * status! 2470 * 2471 * XXX for now, errno is always 0, as we don't post 2472 * knotes if sendfile failed. Maybe that'll change later. 2473 */ 2474 sf_sync_set_retval(sfs, *sbytes, error); 2475 2476 /* 2477 * And now transition to completed, which will kick off 2478 * the knote if required. 2479 */ 2480 sf_sync_set_state(sfs, SF_STATE_COMPLETED, 1); 2481 } else { 2482 /* 2483 * Error isn't zero, sfs_count is zero, so we 2484 * won't have some other thing to wake things up. 2485 * Thus free. 2486 */ 2487 sf_sync_set_state(sfs, SF_STATE_FREEING, 1); 2488 do_free = 1; 2489 } 2490 2491 /* 2492 * Next - wait if appropriate. 
	 */
	sf_sync_syscall_wait(sfs);

	/*
	 * If we're not doing kqueue notifications, we can
	 * transition this immediately to the freeing state.
	 */
	if ((sfs->flags & SF_KQUEUE) == 0) {
		sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
		do_free = 1;
	}

	mtx_unlock(&sfs->mtx);
	}

	/*
	 * If do_free is set, free here.
	 *
	 * If we're doing no-kqueue notification and it's just sleep
	 * notification, we also do free; it's the only chance we have.
	 */
	if (sfs != NULL && do_free == 1) {
		sf_sync_free(sfs);
	}

	/*
	 * XXX Should we wait until the send has completed before freeing the
	 * source file handle? It's the previous behaviour, sure, but is it
	 * required? We've wired down the page references after all.
	 */
	fdrop(fp, td);

out:
	/* Return error */
	return (error);
}


/*
 * Common front end for sendfile(2): copy in the optional sf_hdtr
 * header/trailer description (and, when SF_KQUEUE is set, the extended
 * sf_hdtr_kq that follows it in userland memory), then hand the request
 * off to _do_sendfile().
 *
 * "compat" selects the pre-FreeBSD 5.0 semantics where the byte count
 * accounted against uap->nbytes also included the header length.
 *
 * NOTE(review): the copyout() of sbytes below ignores its return value,
 * and do_kqueue is computed but never consumed afterwards (&hdtr_kq is
 * passed down unconditionally, uninitialized unless SF_KQUEUE was set)
 * -- confirm both are intentional.
 */
static int
do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
{
	struct sf_hdtr hdtr;
	struct sf_hdtr_kq hdtr_kq;
	struct uio *hdr_uio, *trl_uio;
	int error;
	off_t sbytes;
	int do_kqueue = 0;

	/*
	 * File offset must be positive. If it goes beyond EOF
	 * we send only the header/trailer and no payload data.
	 */
	if (uap->offset < 0)
		return (EINVAL);

	hdr_uio = trl_uio = NULL;

	if (uap->hdtr != NULL) {
		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
		if (error != 0)
			goto out;
		if (hdtr.headers != NULL) {
			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
			if (error != 0)
				goto out;
		}
		if (hdtr.trailers != NULL) {
			error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
			if (error != 0)
				goto out;
		}

		/*
		 * If SF_KQUEUE is set, then we need to also copy in
		 * the kqueue data after the normal hdtr set and set
		 * do_kqueue=1.
		 */
		if (uap->flags & SF_KQUEUE) {
			error = copyin(((char *) uap->hdtr) + sizeof(hdtr),
			    &hdtr_kq,
			    sizeof(hdtr_kq));
			if (error != 0)
				goto out;
			do_kqueue = 1;
		}
	}

	/* Call sendfile */
	error = _do_sendfile(td, uap->fd, uap->s, uap->flags, compat,
	    uap->offset, uap->nbytes, &sbytes, hdr_uio, trl_uio, &hdtr_kq);

	if (uap->sbytes != NULL) {
		copyout(&sbytes, uap->sbytes, sizeof(off_t));
	}
out:
	/* copyinuio() allocations; free(NULL, ...) is a no-op. */
	free(hdr_uio, M_IOV);
	free(trl_uio, M_IOV);
	return (error);
}

#ifdef COMPAT_FREEBSD4
/*
 * FreeBSD 4.x sendfile(2) compatibility shim: repackage the old
 * argument structure and invoke do_sendfile() with compat == 1 so the
 * header bytes are charged against nbytes as 4.x did.
 */
int
freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
{
	struct sendfile_args args;

	args.fd = uap->fd;
	args.s = uap->s;
	args.offset = uap->offset;
	args.nbytes = uap->nbytes;
	args.hdtr = uap->hdtr;
	args.sbytes = uap->sbytes;
	args.flags = uap->flags;

	return (do_sendfile(td, &args, 1));
}
#endif /* COMPAT_FREEBSD4 */

/*
 * Fetch the file page covering offset "off" from VM object "obj" and
 * return it wired in *res. For vnode-backed objects the page is read
 * via vn_rdwr() (with read-ahead); otherwise the pager is used, or the
 * page is zero-filled when the pager has no backing copy.
 */
static int
sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd,
    off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res)
{
	vm_page_t m;
	vm_pindex_t pindex;
	ssize_t resid;
	int error, readahead, rv;

	pindex = OFF_TO_IDX(off);
	VM_OBJECT_WLOCK(obj);
	/* Grab the page wired; for vnodes we don't keep it busied. */
	m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY |
	    VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL);

	/*
	 * Check if page is valid for what we need, otherwise initiate I/O.
	 *
	 * The non-zero nd argument prevents disk I/O, instead we
	 * return the caller what he specified in nd. In particular,
	 * if we already turned some pages into mbufs, nd == EAGAIN
	 * and the main function send them the pages before we come
	 * here again and block.
2633 */ 2634 if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) { 2635 if (vp == NULL) 2636 vm_page_xunbusy(m); 2637 VM_OBJECT_WUNLOCK(obj); 2638 *res = m; 2639 return (0); 2640 } else if (nd != 0) { 2641 if (vp == NULL) 2642 vm_page_xunbusy(m); 2643 error = nd; 2644 goto free_page; 2645 } 2646 2647 /* 2648 * Get the page from backing store. 2649 */ 2650 error = 0; 2651 if (vp != NULL) { 2652 VM_OBJECT_WUNLOCK(obj); 2653 readahead = sfreadahead * MAXBSIZE; 2654 2655 /* 2656 * Use vn_rdwr() instead of the pager interface for 2657 * the vnode, to allow the read-ahead. 2658 * 2659 * XXXMAC: Because we don't have fp->f_cred here, we 2660 * pass in NOCRED. This is probably wrong, but is 2661 * consistent with our original implementation. 2662 */ 2663 error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off), 2664 UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead / 2665 bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td); 2666 SFSTAT_INC(sf_iocnt); 2667 VM_OBJECT_WLOCK(obj); 2668 } else { 2669 if (vm_pager_has_page(obj, pindex, NULL, NULL)) { 2670 rv = vm_pager_get_pages(obj, &m, 1, 0); 2671 SFSTAT_INC(sf_iocnt); 2672 m = vm_page_lookup(obj, pindex); 2673 if (m == NULL) 2674 error = EIO; 2675 else if (rv != VM_PAGER_OK) { 2676 vm_page_lock(m); 2677 vm_page_free(m); 2678 vm_page_unlock(m); 2679 m = NULL; 2680 error = EIO; 2681 } 2682 } else { 2683 pmap_zero_page(m); 2684 m->valid = VM_PAGE_BITS_ALL; 2685 m->dirty = 0; 2686 } 2687 if (m != NULL) 2688 vm_page_xunbusy(m); 2689 } 2690 if (error == 0) { 2691 *res = m; 2692 } else if (m != NULL) { 2693 free_page: 2694 vm_page_lock(m); 2695 vm_page_unwire(m, 0); 2696 2697 /* 2698 * See if anyone else might know about this page. If 2699 * not and it is not valid, then free it. 
2700 */ 2701 if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m)) 2702 vm_page_free(m); 2703 vm_page_unlock(m); 2704 } 2705 KASSERT(error != 0 || (m->wire_count > 0 && 2706 vm_page_is_valid(m, off & PAGE_MASK, xfsize)), 2707 ("wrong page state m %p off %#jx xfsize %d", m, (uintmax_t)off, 2708 xfsize)); 2709 VM_OBJECT_WUNLOCK(obj); 2710 return (error); 2711 } 2712 2713 static int 2714 sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res, 2715 struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size, 2716 int *bsize) 2717 { 2718 struct vattr va; 2719 vm_object_t obj; 2720 struct vnode *vp; 2721 struct shmfd *shmfd; 2722 int error; 2723 2724 vp = *vp_res = NULL; 2725 obj = NULL; 2726 shmfd = *shmfd_res = NULL; 2727 *bsize = 0; 2728 2729 /* 2730 * The file descriptor must be a regular file and have a 2731 * backing VM object. 2732 */ 2733 if (fp->f_type == DTYPE_VNODE) { 2734 vp = fp->f_vnode; 2735 vn_lock(vp, LK_SHARED | LK_RETRY); 2736 if (vp->v_type != VREG) { 2737 error = EINVAL; 2738 goto out; 2739 } 2740 *bsize = vp->v_mount->mnt_stat.f_iosize; 2741 error = VOP_GETATTR(vp, &va, td->td_ucred); 2742 if (error != 0) 2743 goto out; 2744 *obj_size = va.va_size; 2745 obj = vp->v_object; 2746 if (obj == NULL) { 2747 error = EINVAL; 2748 goto out; 2749 } 2750 } else if (fp->f_type == DTYPE_SHM) { 2751 shmfd = fp->f_data; 2752 obj = shmfd->shm_object; 2753 *obj_size = shmfd->shm_size; 2754 } else { 2755 error = EINVAL; 2756 goto out; 2757 } 2758 2759 VM_OBJECT_WLOCK(obj); 2760 if ((obj->flags & OBJ_DEAD) != 0) { 2761 VM_OBJECT_WUNLOCK(obj); 2762 error = EBADF; 2763 goto out; 2764 } 2765 2766 /* 2767 * Temporarily increase the backing VM object's reference 2768 * count so that a forced reclamation of its vnode does not 2769 * immediately destroy it. 
	 */
	vm_object_reference_locked(obj);
	VM_OBJECT_WUNLOCK(obj);
	*obj_res = obj;
	*vp_res = vp;
	*shmfd_res = shmfd;

out:
	if (vp != NULL)
		VOP_UNLOCK(vp, 0);
	return (error);
}

/*
 * Resolve socket descriptor "s" into a held file (*sock_fp) and its
 * socket (*so). The socket must be a connected stream socket.
 *
 * NOTE(review): on the EINVAL/ENOTCONN paths *sock_fp is returned held
 * and *so is set; the caller (vn_sendfile) drops the reference via its
 * "if (so) fdrop(sock_fp, td)" cleanup -- any new caller must do the
 * same.
 */
static int
kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
    struct socket **so)
{
	cap_rights_t rights;
	int error;

	*sock_fp = NULL;
	*so = NULL;

	/*
	 * The socket must be a stream socket and connected.
	 */
	error = getsock_cap(td->td_proc->p_fd, s, cap_rights_init(&rights,
	    CAP_SEND), sock_fp, NULL);
	if (error != 0)
		return (error);
	*so = (*sock_fp)->f_data;
	if ((*so)->so_type != SOCK_STREAM)
		return (EINVAL);
	if (((*so)->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	return (0);
}

/*
 * Zero-copy sendfile backend: transmit the pages backing "fp" (a
 * regular file or shm object) to the stream socket "sockfd", wrapping
 * wired file pages in sf_buf-backed external mbufs. Optional header
 * (hdr_uio) and trailer (trl_uio) data are sent before/after the file
 * payload; trailers go via kern_writev(). *sent, if non-NULL, receives
 * the total byte count including headers. "sfs", if non-NULL, is the
 * sf_sync state used for completion tracking; each queued page takes a
 * reference via sf_sync_ref().
 */
int
vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    int kflags, struct sendfile_sync *sfs, struct thread *td)
{
	struct file *sock_fp;
	struct vnode *vp;
	struct vm_object *obj;
	struct socket *so;
	struct mbuf *m;
	struct sf_buf *sf;
	struct vm_page *pg;
	struct shmfd *shmfd;
	struct vattr va;
	off_t off, xfsize, fsbytes, sbytes, rem, obj_size;
	int error, bsize, nd, hdrlen, mnw;

	pg = NULL;
	obj = NULL;
	so = NULL;
	m = NULL;
	fsbytes = sbytes = 0;
	hdrlen = mnw = 0;
	rem = nbytes;
	obj_size = 0;

	error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
	if (error != 0)
		return (error);
	/* nbytes == 0 means "send to EOF". */
	if (rem == 0)
		rem = obj_size;

	error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so);
	if (error != 0)
		goto out;

	/*
	 * Do not wait on memory allocations but return ENOMEM for
	 * caller to retry later.
	 * XXX: Experimental.
	 */
	if (flags & SF_MNOWAIT)
		mnw = 1;

#ifdef MAC
	error = mac_socket_check_send(td->td_ucred, so);
	if (error != 0)
		goto out;
#endif

	/* If headers are specified copy them into mbufs. */
	if (hdr_uio != NULL) {
		hdr_uio->uio_td = td;
		hdr_uio->uio_rw = UIO_WRITE;
		if (hdr_uio->uio_resid > 0) {
			/*
			 * In FBSD < 5.0 the nbytes to send also included
			 * the header. If compat is specified subtract the
			 * header size from nbytes.
			 */
			if (kflags & SFK_COMPAT) {
				if (nbytes > hdr_uio->uio_resid)
					nbytes -= hdr_uio->uio_resid;
				else
					nbytes = 0;
			}
			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
			    0, 0, 0);
			if (m == NULL) {
				error = mnw ? EAGAIN : ENOBUFS;
				goto out;
			}
			hdrlen = m_length(m, NULL);
		}
	}

	/*
	 * Protect against multiple writers to the socket.
	 *
	 * XXXRW: Historically this has assumed non-interruptibility, so now
	 * we implement that, but possibly shouldn't.
	 */
	(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);

	/*
	 * Loop through the pages of the file, starting with the requested
	 * offset. Get a file page (do I/O if necessary), map the file page
	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
	 * it on the socket.
	 * This is done in two loops. The inner loop turns as many pages
	 * as it can, up to available socket buffer space, without blocking
	 * into mbufs to have it bulk delivered into the socket send buffer.
	 * The outer loop checks the state and available space of the socket
	 * and takes care of the overall progress.
	 */
	for (off = offset; ; ) {
		struct mbuf *mtail;
		int loopbytes;
		int space;
		int done;

		/* Stop when the requested (or whole-file) count is sent. */
		if ((nbytes != 0 && nbytes == fsbytes) ||
		    (nbytes == 0 && obj_size == fsbytes))
			break;

		mtail = NULL;
		loopbytes = 0;
		space = 0;
		done = 0;

		/*
		 * Check the socket state for ongoing connection,
		 * no errors and space in socket buffer.
		 * If space is low allow for the remainder of the
		 * file to be processed if it fits the socket buffer.
		 * Otherwise block in waiting for sufficient space
		 * to proceed, or if the socket is nonblocking, return
		 * to userland with EAGAIN while reporting how far
		 * we've come.
		 * We wait until the socket buffer has significant free
		 * space to do bulk sends. This makes good use of file
		 * system read ahead and allows packet segmentation
		 * offloading hardware to take over lots of work. If
		 * we were not careful here we would send off only one
		 * sfbuf at a time.
		 */
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
retry_space:
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			error = EPIPE;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto done;
		} else if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto done;
		}
		space = sbspace(&so->so_snd);
		if (space < rem &&
		    (space <= 0 ||
		     space < so->so_snd.sb_lowat)) {
			if (so->so_state & SS_NBIO) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EAGAIN;
				goto done;
			}
			/*
			 * sbwait drops the lock while sleeping.
			 * When we loop back to retry_space the
			 * state may have changed and we retest
			 * for it.
			 */
			error = sbwait(&so->so_snd);
			/*
			 * An error from sbwait usually indicates that we've
			 * been interrupted by a signal. If we've sent anything
			 * then return bytes sent, otherwise return the error.
			 */
			if (error != 0) {
				SOCKBUF_UNLOCK(&so->so_snd);
				goto done;
			}
			goto retry_space;
		}
		SOCKBUF_UNLOCK(&so->so_snd);

		/*
		 * Reduce space in the socket buffer by the size of
		 * the header mbuf chain.
		 * hdrlen is set to 0 after the first loop.
		 */
		space -= hdrlen;

		/*
		 * Re-check the file size under the vnode lock: it may
		 * have been truncated or extended since the last pass.
		 */
		if (vp != NULL) {
			error = vn_lock(vp, LK_SHARED);
			if (error != 0)
				goto done;
			error = VOP_GETATTR(vp, &va, td->td_ucred);
			if (error != 0 || off >= va.va_size) {
				VOP_UNLOCK(vp, 0);
				goto done;
			}
			obj_size = va.va_size;
		}

		/*
		 * Loop and construct maximum sized mbuf chain to be bulk
		 * dumped into socket buffer.
		 */
		while (space > loopbytes) {
			vm_offset_t pgoff;
			struct mbuf *m0;

			/*
			 * Calculate the amount to transfer.
			 * Not to exceed a page, the EOF,
			 * or the passed in nbytes.
			 */
			pgoff = (vm_offset_t)(off & PAGE_MASK);
			rem = obj_size - offset;
			if (nbytes != 0)
				rem = omin(rem, nbytes);
			rem -= fsbytes + loopbytes;
			xfsize = omin(PAGE_SIZE - pgoff, rem);
			xfsize = omin(space - loopbytes, xfsize);
			if (xfsize <= 0) {
				done = 1;		/* all data sent */
				break;
			}

			/*
			 * Attempt to look up the page. Allocate
			 * if not found or wait and loop if busy.
			 */
			if (m != NULL)
				nd = EAGAIN; /* send what we already got */
			else if ((flags & SF_NODISKIO) != 0)
				nd = EBUSY;
			else
				nd = 0;
			error = sendfile_readpage(obj, vp, nd, off,
			    xfsize, bsize, td, &pg);
			if (error != 0) {
				if (error == EAGAIN)
					error = 0;	/* not a real error */
				break;
			}

			/*
			 * Get a sendfile buf. When allocating the
			 * first buffer for mbuf chain, we usually
			 * wait as long as necessary, but this wait
			 * can be interrupted. For consequent
			 * buffers, do not sleep, since several
			 * threads might exhaust the buffers and then
			 * deadlock.
			 */
			sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
			    SFB_CATCH);
			if (sf == NULL) {
				SFSTAT_INC(sf_allocfail);
				vm_page_lock(pg);
				vm_page_unwire(pg, 0);
				KASSERT(pg->object != NULL,
				    ("%s: object disappeared", __func__));
				vm_page_unlock(pg);
				if (m == NULL)
					error = (mnw ? EAGAIN : EINTR);
				break;
			}

			/*
			 * Get an mbuf and set it up as having
			 * external storage.
			 */
			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
			if (m0 == NULL) {
				error = (mnw ? EAGAIN : ENOBUFS);
				(void)sf_buf_mext(NULL, NULL, sf);
				break;
			}
			if (m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE,
			    sf_buf_mext, sfs, sf, M_RDONLY, EXT_SFBUF,
			    (mnw ? M_NOWAIT : M_WAITOK)) != 0) {
				error = (mnw ? EAGAIN : ENOBUFS);
				(void)sf_buf_mext(NULL, NULL, sf);
				m_freem(m0);
				break;
			}
			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
			m0->m_len = xfsize;

			/* Append to mbuf chain. */
			if (mtail != NULL)
				mtail->m_next = m0;
			else if (m != NULL)
				m_last(m)->m_next = m0;
			else
				m = m0;
			mtail = m0;

			/* Keep track of bits processed. */
			loopbytes += xfsize;
			off += xfsize;

			/*
			 * XXX eventually this should be a sfsync
			 * method call!
			 */
			if (sfs != NULL)
				sf_sync_ref(sfs);
		}

		if (vp != NULL)
			VOP_UNLOCK(vp, 0);

		/* Add the buffer chain to the socket buffer. */
		if (m != NULL) {
			int mlen, err;

			mlen = m_length(m, NULL);
			SOCKBUF_LOCK(&so->so_snd);
			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
				error = EPIPE;
				SOCKBUF_UNLOCK(&so->so_snd);
				goto done;
			}
			SOCKBUF_UNLOCK(&so->so_snd);
			CURVNET_SET(so->so_vnet);
			/* Avoid error aliasing. */
			err = (*so->so_proto->pr_usrreqs->pru_send)
			    (so, 0, m, NULL, NULL, td);
			CURVNET_RESTORE();
			if (err == 0) {
				/*
				 * We need two counters to get the
				 * file offset and nbytes to send
				 * right:
				 * - sbytes contains the total amount
				 *   of bytes sent, including headers.
				 * - fsbytes contains the total amount
				 *   of bytes sent from the file.
				 */
				sbytes += mlen;
				fsbytes += mlen;
				if (hdrlen) {
					fsbytes -= hdrlen;
					hdrlen = 0;
				}
			} else if (error == 0)
				error = err;
			m = NULL;	/* pru_send always consumes */
		}

		/* Quit outer loop on error or when we're done. */
		if (done)
			break;
		if (error != 0)
			goto done;
	}

	/*
	 * Send trailers. Wimp out and use writev(2).
	 */
	if (trl_uio != NULL) {
		sbunlock(&so->so_snd);
		error = kern_writev(td, sockfd, trl_uio);
		if (error == 0)
			sbytes += td->td_retval[0];
		goto out;
	}

done:
	sbunlock(&so->so_snd);
out:
	/*
	 * If there was no error we have to clear td->td_retval[0]
	 * because it may have been set by writev.
	 */
	if (error == 0) {
		td->td_retval[0] = 0;
	}
	if (sent != NULL) {
		(*sent) = sbytes;
	}
	if (obj != NULL)
		vm_object_deallocate(obj);
	if (so)
		fdrop(sock_fp, td);
	if (m)
		m_freem(m);

	if (error == ERESTART)
		error = EINTR;

	return (error);
}

/*
 * SCTP syscalls.
 * Functionality only compiled in if SCTP is defined in the kernel Makefile,
 * otherwise all return EOPNOTSUPP.
 * XXX: We should make this loadable one day.
 */
/*
 * sctp_peeloff(2): detach ("peel off") the SCTP association identified
 * by uap->name from one-to-many socket uap->sd into a new one-to-one
 * socket, returning its descriptor in td->td_retval[0].
 */
int
sys_sctp_peeloff(td, uap)
	struct thread *td;
	struct sctp_peeloff_args /* {
		int sd;
		caddr_t name;
	} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
	struct file *nfp = NULL;
	struct socket *head, *so;
	cap_rights_t rights;
	u_int fflag;
	int error, fd;

	AUDIT_ARG_FD(uap->sd);
	error = fgetsock(td, uap->sd, cap_rights_init(&rights, CAP_PEELOFF),
	    &head, &fflag);
	if (error != 0)
		goto done2;
	if (head->so_proto->pr_protocol != IPPROTO_SCTP) {
		error = EOPNOTSUPP;
		goto done;
	}
	error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
	if (error != 0)
		goto done;
	/*
	 * At this point we know we do have a assoc to pull
	 * we proceed to get the fd setup. This may block
	 * but that is ok.
	 */

	error = falloc(td, &nfp, &fd, 0);
	if (error != 0)
		goto done;
	td->td_retval[0] = fd;

	CURVNET_SET(head->so_vnet);
	so = sonewconn(head, SS_ISCONNECTED);
	if (so == NULL) {
		error = ENOMEM;
		goto noconnection;
	}
	/*
	 * Before changing the flags on the socket, we have to bump the
	 * reference count. Otherwise, if the protocol calls sofree(),
	 * the socket will be released due to a zero refcount.
	 */
	SOCK_LOCK(so);
	soref(so);			/* file descriptor reference */
	SOCK_UNLOCK(so);

	/* Remove the new socket from the head's completion queue. */
	ACCEPT_LOCK();

	TAILQ_REMOVE(&head->so_comp, so, so_list);
	head->so_qlen--;
	so->so_state |= (head->so_state & SS_NBIO);
	so->so_state &= ~SS_NOFDREF;
	so->so_qstate &= ~SQ_COMP;
	so->so_head = NULL;
	ACCEPT_UNLOCK();
	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
	error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
	if (error != 0)
		goto noconnection;
	if (head->so_sigio != NULL)
		fsetown(fgetown(&head->so_sigio), &so->so_sigio);

noconnection:
	/*
	 * close the new descriptor, assuming someone hasn't ripped it
	 * out from under us.
	 */
	if (error != 0)
		fdclose(td->td_proc->p_fd, nfp, fd, td);

	/*
	 * Release explicitly held references before returning.
	 */
	CURVNET_RESTORE();
done:
	if (nfp != NULL)
		fdrop(nfp, td);
	fputsock(head);
done2:
	return (error);
#else /* SCTP */
	return (EOPNOTSUPP);
#endif /* SCTP */
}

/*
 * sctp_generic_sendmsg(2): send a single buffer on an SCTP socket,
 * optionally to an explicit address (uap->to) and with per-message
 * send parameters (uap->sinfo).
 *
 * NOTE(review): ktruio is declared but never assigned (no cloneuio()
 * call on this path), so the KTRACE ktrgenio block later in this
 * function is dead code -- confirm whether GENIO tracing was meant to
 * be wired up here.
 */
int
sys_sctp_generic_sendmsg (td, uap)
	struct thread *td;
	struct sctp_generic_sendmsg_args /* {
		int sd,
		caddr_t msg,
		int mlen,
		caddr_t to,
		__socklen_t tolen,
		struct sctp_sndrcvinfo *sinfo,
		int flags
	} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
	struct socket *so;
	struct file *fp = NULL;
	struct sockaddr *to = NULL;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	struct uio auio;
	struct iovec iov[1];
	cap_rights_t rights;
	int error = 0, len;

	if (uap->sinfo != NULL) {
		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
		if (error != 0)
			return (error);
		u_sinfo = &sinfo;
	}

	cap_rights_init(&rights, CAP_SEND);
	if (uap->tolen != 0) {
		error =
		    getsockaddr(&to, uap->to, uap->tolen);
		if (error != 0) {
			to = NULL;
			goto sctp_bad2;
		}
		/* Sending to an explicit address also needs CAP_CONNECT. */
		cap_rights_set(&rights, CAP_CONNECT);
	}

	AUDIT_ARG_FD(uap->sd);
	error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL);
	if (error != 0)
		goto sctp_bad;
#ifdef KTRACE
	if (to && (KTRPOINT(td, KTR_STRUCT)))
		ktrsockaddr(to);
#endif

	iov[0].iov_base = uap->msg;
	iov[0].iov_len = uap->mlen;

	so = (struct socket *)fp->f_data;
	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
		error = EOPNOTSUPP;
		goto sctp_bad;
	}
#ifdef MAC
	error = mac_socket_check_send(td->td_ucred, so);
	if (error != 0)
		goto sctp_bad;
#endif /* MAC */

	auio.uio_iov = iov;
	auio.uio_iovcnt = 1;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	len = auio.uio_resid = uap->mlen;
	CURVNET_SET(so->so_vnet);
	error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL,
	    (struct mbuf *)NULL, uap->flags, u_sinfo, td);
	CURVNET_RESTORE();
	if (error != 0) {
		/* A partial send interrupted by a signal is a success. */
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Generation of SIGPIPE can be controlled per socket. */
		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
		    !(uap->flags & MSG_NOSIGNAL)) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	if (error == 0)
		td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
	/* NOTE(review): ktruio is never set above; this block is dead. */
	if (ktruio != NULL) {
		ktruio->uio_resid = td->td_retval[0];
		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
	}
#endif /* KTRACE */
sctp_bad:
	if (fp != NULL)
		fdrop(fp, td);
sctp_bad2:
	free(to, M_SONAME);
	return (error);
#else /* SCTP */
	return (EOPNOTSUPP);
#endif /* SCTP */
}

/*
 * sctp_generic_sendmsg_iov(2): like sctp_generic_sendmsg(2) but the
 * payload is gathered from a user iovec array instead of one buffer.
 *
 * NOTE(review): as in sys_sctp_generic_sendmsg(), ktruio is declared
 * but never assigned, so the KTRACE block below is dead code.
 */
int
sys_sctp_generic_sendmsg_iov(td, uap)
	struct thread *td;
	struct sctp_generic_sendmsg_iov_args /* {
		int sd,
		struct iovec *iov,
		int iovlen,
		caddr_t to,
		__socklen_t tolen,
		struct sctp_sndrcvinfo *sinfo,
		int flags
	} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
	struct socket *so;
	struct file *fp = NULL;
	struct sockaddr *to = NULL;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	struct uio auio;
	struct iovec *iov, *tiov;
	cap_rights_t rights;
	ssize_t len;
	int error, i;

	if (uap->sinfo != NULL) {
		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
		if (error != 0)
			return (error);
		u_sinfo = &sinfo;
	}
	cap_rights_init(&rights, CAP_SEND);
	if (uap->tolen != 0) {
		error = getsockaddr(&to, uap->to, uap->tolen);
		if (error != 0) {
			to = NULL;
			goto sctp_bad2;
		}
		cap_rights_set(&rights, CAP_CONNECT);
	}

	AUDIT_ARG_FD(uap->sd);
	error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL);
	if (error != 0)
		goto sctp_bad1;

#ifdef COMPAT_FREEBSD32
	if (SV_CURPROC_FLAG(SV_ILP32))
		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
		    uap->iovlen, &iov, EMSGSIZE);
	else
#endif
		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
	if (error != 0)
		goto sctp_bad1;
#ifdef KTRACE
	if (to && (KTRPOINT(td, KTR_STRUCT)))
		ktrsockaddr(to);
#endif

	so = (struct socket *)fp->f_data;
	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
		error = EOPNOTSUPP;
		goto sctp_bad;
	}
#ifdef MAC
	error = mac_socket_check_send(td->td_ucred, so);
	if (error != 0)
		goto sctp_bad;
#endif /* MAC */

	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	tiov = iov;
	/* Sum iovec lengths, rejecting overflow into a negative resid. */
	for (i = 0; i < uap->iovlen; i++, tiov++) {
		if ((auio.uio_resid += tiov->iov_len) < 0) {
			error = EINVAL;
			goto sctp_bad;
		}
	}
	len = auio.uio_resid;
	CURVNET_SET(so->so_vnet);
	error = sctp_lower_sosend(so, to, &auio,
	    (struct mbuf *)NULL, (struct mbuf *)NULL,
	    uap->flags, u_sinfo, td);
	CURVNET_RESTORE();
	if (error != 0) {
		/* A partial send interrupted by a signal is a success. */
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Generation of SIGPIPE can be controlled per socket */
		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
		    !(uap->flags & MSG_NOSIGNAL)) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	if (error == 0)
		td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
	/* NOTE(review): ktruio is never set above; this block is dead. */
	if (ktruio != NULL) {
		ktruio->uio_resid = td->td_retval[0];
		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
	}
#endif /* KTRACE */
sctp_bad:
	free(iov, M_IOV);
sctp_bad1:
	if (fp != NULL)
		fdrop(fp, td);
sctp_bad2:
	free(to, M_SONAME);
	return (error);
#else /* SCTP */
	return (EOPNOTSUPP);
#endif /* SCTP */
}

/*
 * sctp_generic_recvmsg(2): receive one SCTP message into a user iovec
 * array, optionally returning the sender address (uap->from /
 * uap->fromlenaddr), per-message receive info (uap->sinfo) and the
 * resulting message flags (uap->msg_flags).
 */
int
sys_sctp_generic_recvmsg(td, uap)
	struct thread *td;
	struct sctp_generic_recvmsg_args /* {
		int sd,
		struct iovec *iov,
		int iovlen,
		struct sockaddr *from,
		__socklen_t *fromlenaddr,
		struct sctp_sndrcvinfo *sinfo,
		int *msg_flags
	} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
	uint8_t sockbufstore[256];	/* on-stack space for the peer address */
	struct uio auio;
	struct iovec *iov, *tiov;
	struct sctp_sndrcvinfo sinfo;
	struct socket *so;
	struct file *fp = NULL;
	struct sockaddr *fromsa;
	cap_rights_t rights;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	ssize_t len;
	int error, fromlen, i, msg_flags;

	AUDIT_ARG_FD(uap->sd);
	error = getsock_cap(td->td_proc->p_fd, uap->sd,
	    cap_rights_init(&rights, CAP_RECV), &fp, NULL);
	if (error != 0)
		return (error);
#ifdef COMPAT_FREEBSD32
	if (SV_CURPROC_FLAG(SV_ILP32))
		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
		    uap->iovlen, &iov, EMSGSIZE);
	else
#endif
		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
	if (error != 0)
		goto out1;

	so = fp->f_data;
	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
		error = EOPNOTSUPP;
		goto out;
	}
#ifdef MAC
	error = mac_socket_check_receive(td->td_ucred, so);
	if (error != 0)
		goto out;
#endif /* MAC */

	if (uap->fromlenaddr != NULL) {
		error = copyin(uap->fromlenaddr, &fromlen, sizeof (fromlen));
		if (error != 0)
			goto out;
	} else {
		fromlen = 0;
	}
	if (uap->msg_flags) {
		error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
		if (error != 0)
			goto out;
	} else {
		msg_flags = 0;
	}
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	tiov = iov;
	/* Sum iovec lengths, rejecting overflow into a negative resid. */
	for (i = 0; i < uap->iovlen; i++, tiov++) {
		if ((auio.uio_resid += tiov->iov_len) < 0) {
			error = EINVAL;
			goto out;
		}
	}
	len = auio.uio_resid;
	fromsa = (struct sockaddr *)sockbufstore;

#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif /* KTRACE */
	memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo));
	CURVNET_SET(so->so_vnet);
	error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
	    fromsa, fromlen, &msg_flags,
	    (struct sctp_sndrcvinfo *)&sinfo, 1);
	CURVNET_RESTORE();
	if (error != 0) {
		/* A partial receive interrupted by a signal is a success. */
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	} else {
		if (uap->sinfo)
			error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
	}
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = len - auio.uio_resid;
		ktrgenio(uap->sd, UIO_READ, ktruio, error);
	}
#endif /* KTRACE */
	if (error != 0)
		goto out;
	td->td_retval[0] = len - auio.uio_resid;

	/* Copy the sender address out, truncated to the caller's buffer. */
	if (fromlen && uap->from) {
		len = fromlen;
		if (len <= 0 || fromsa == 0)
			len = 0;
		else {
			len = MIN(len, fromsa->sa_len);
			error = copyout(fromsa, uap->from, (size_t)len);
			if (error != 0)
				goto out;
		}
		error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
		if (error != 0)
			goto out;
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(fromsa);
#endif
	if (uap->msg_flags) {
		error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
		if (error != 0)
			goto out;
	}
out:
	free(iov, M_IOV);
out1:
	if (fp != NULL)
		fdrop(fp, td);

	return (error);
#else /* SCTP */
	return (EOPNOTSUPP);
#endif /* SCTP */
}