1 /*- 2 * Copyright (c) 1982, 1986, 1989, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * sendfile(2) and related extensions: 6 * Copyright (c) 1998, David Greenman. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 4. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 33 */ 34 35 #include <sys/cdefs.h> 36 __FBSDID("$FreeBSD$"); 37 38 #include "opt_capsicum.h" 39 #include "opt_inet.h" 40 #include "opt_inet6.h" 41 #include "opt_sctp.h" 42 #include "opt_compat.h" 43 #include "opt_ktrace.h" 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/capability.h> 48 #include <sys/condvar.h> 49 #include <sys/kernel.h> 50 #include <sys/lock.h> 51 #include <sys/mutex.h> 52 #include <sys/sysproto.h> 53 #include <sys/malloc.h> 54 #include <sys/filedesc.h> 55 #include <sys/event.h> 56 #include <sys/proc.h> 57 #include <sys/fcntl.h> 58 #include <sys/file.h> 59 #include <sys/filio.h> 60 #include <sys/jail.h> 61 #include <sys/mman.h> 62 #include <sys/mount.h> 63 #include <sys/mbuf.h> 64 #include <sys/protosw.h> 65 #include <sys/rwlock.h> 66 #include <sys/sf_buf.h> 67 #include <sys/sf_sync.h> 68 #include <sys/sf_base.h> 69 #include <sys/sysent.h> 70 #include <sys/socket.h> 71 #include <sys/socketvar.h> 72 #include <sys/signalvar.h> 73 #include <sys/syscallsubr.h> 74 #include <sys/sysctl.h> 75 #include <sys/uio.h> 76 #include <sys/vnode.h> 77 #ifdef KTRACE 78 #include <sys/ktrace.h> 79 #endif 80 #ifdef COMPAT_FREEBSD32 81 #include <compat/freebsd32/freebsd32_util.h> 82 #endif 83 84 #include <net/vnet.h> 85 86 #include <security/audit/audit.h> 87 #include <security/mac/mac_framework.h> 88 89 #include <vm/vm.h> 90 #include <vm/vm_param.h> 91 #include <vm/vm_object.h> 92 #include <vm/vm_page.h> 93 #include <vm/vm_pager.h> 94 #include <vm/vm_kern.h> 95 #include <vm/vm_extern.h> 96 #include <vm/uma.h> 97 98 #if defined(INET) || defined(INET6) 99 #ifdef SCTP 100 #include <netinet/sctp.h> 101 #include <netinet/sctp_peeloff.h> 102 #endif /* SCTP */ 103 #endif /* INET || INET6 */ 104 105 /* 106 * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC 107 * and SOCK_NONBLOCK. 
 */
/* Inherit FNONBLOCK/FASYNC and SIGIO ownership from the listening socket. */
#define	ACCEPT4_INHERIT	0x1
/* Old 4.3BSD accept(): return only sa_family in the osockaddr header. */
#define	ACCEPT4_COMPAT	0x2

static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);

static int accept1(struct thread *td, int s, struct sockaddr *uname,
	    socklen_t *anamelen, int flags);
static int do_sendfile(struct thread *td, struct sendfile_args *uap,
	    int compat);
static int getsockname1(struct thread *td, struct getsockname_args *uap,
	    int compat);
static int getpeername1(struct thread *td, struct getpeername_args *uap,
	    int compat);

/* Per-CPU counters backing the kern.ipc.sfstat sysctl, one per sfstat field. */
counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];

static int filt_sfsync_attach(struct knote *kn);
static void filt_sfsync_detach(struct knote *kn);
static int filt_sfsync(struct knote *kn, long hint);

/*
 * sendfile(2)-related variables and associated sysctls
 */
static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0,
    "sendfile(2) tunables");
static int sfreadahead = 1;
SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW,
    &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks");

#ifdef SFSYNC_DEBUG
static int sf_sync_debug = 0;
SYSCTL_INT(_debug, OID_AUTO, sf_sync_debug, CTLFLAG_RW,
    &sf_sync_debug, 0, "Output debugging during sf_sync lifecycle");
#define	SFSYNC_DPRINTF(s, ...)				\
	do {						\
		if (sf_sync_debug)			\
			printf((s), ##__VA_ARGS__);	\
	} while (0)
#else
#define	SFSYNC_DPRINTF(c, ...)
#endif

/* UMA zone for struct sendfile_sync allocations. */
static uma_zone_t	zone_sfsync;

static struct filterops sendfile_filtops = {
	.f_isfd = 0,
	.f_attach = filt_sfsync_attach,
	.f_detach = filt_sfsync_detach,
	.f_event = filt_sfsync,
};

/*
 * Allocate the per-CPU sendfile statistics counters at boot.
 */
static void
sfstat_init(const void *unused)
{

	COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
	    M_WAITOK);
}
SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);

/*
 * Create the sendfile_sync UMA zone and register the EVFILT_SENDFILE
 * kqueue filter at boot.
 */
static void
sf_sync_init(const void *unused)
{

	zone_sfsync = uma_zcreate("sendfile_sync", sizeof(struct sendfile_sync),
	    NULL, NULL,
	    NULL, NULL,
	    UMA_ALIGN_CACHE,
	    0);
	kqueue_add_filteropts(EVFILT_SENDFILE, &sendfile_filtops);
}
SYSINIT(sf_sync, SI_SUB_MBUF, SI_ORDER_FIRST, sf_sync_init, NULL);

/*
 * Sysctl handler for kern.ipc.sfstat: read returns a snapshot of the
 * counters; any write zeroes them.
 */
static int
sfstat_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct sfstat s;

	COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
	if (req->newptr)
		COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
	return (SYSCTL_OUT(req, &s, sizeof(s)));
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
    NULL, 0, sfstat_sysctl, "I", "sendfile statistics");

/*
 * Convert a user file descriptor to a kernel file entry and check if required
 * capability rights are present.
 * A reference on the file entry is held upon returning.
 * Returns ENOTSOCK (dropping the reference) if the descriptor is not a
 * socket; on success *fpp holds the referenced file and, if fflagp is
 * non-NULL, *fflagp receives the file's f_flag.
 */
static int
getsock_cap(struct filedesc *fdp, int fd, cap_rights_t *rightsp,
    struct file **fpp, u_int *fflagp)
{
	struct file *fp;
	int error;

	error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
	if (error != 0)
		return (error);
	if (fp->f_type != DTYPE_SOCKET) {
		fdrop(fp, curthread);
		return (ENOTSOCK);
	}
	if (fflagp != NULL)
		*fflagp = fp->f_flag;
	*fpp = fp;
	return (0);
}

/*
 * System call interface to the socket abstraction.
 */
#if defined(COMPAT_43)
#define	COMPAT_OLDSOCK
#endif

/*
 * socket(2): create a socket descriptor.  SOCK_CLOEXEC/SOCK_NONBLOCK may be
 * OR'ed into the type and are translated to the corresponding fd/file flags.
 */
int
sys_socket(td, uap)
	struct thread *td;
	struct socket_args /* {
		int	domain;
		int	type;
		int	protocol;
	} */ *uap;
{
	struct socket *so;
	struct file *fp;
	int fd, error, type, oflag, fflag;

	AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);

	type = uap->type;
	oflag = 0;
	fflag = 0;
	if ((type & SOCK_CLOEXEC) != 0) {
		type &= ~SOCK_CLOEXEC;
		oflag |= O_CLOEXEC;
	}
	if ((type & SOCK_NONBLOCK) != 0) {
		type &= ~SOCK_NONBLOCK;
		fflag |= FNONBLOCK;
	}

#ifdef MAC
	error = mac_socket_check_create(td->td_ucred, uap->domain, type,
	    uap->protocol);
	if (error != 0)
		return (error);
#endif
	error = falloc(td, &fp, &fd, oflag);
	if (error != 0)
		return (error);
	/* An extra reference on `fp' has been held for us by falloc(). */
	error = socreate(uap->domain, &so, type, uap->protocol,
	    td->td_ucred, td);
	if (error != 0) {
		/* Socket creation failed; take the new fd back out. */
		fdclose(td->td_proc->p_fd, fp, fd, td);
	} else {
		finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
		if ((fflag & FNONBLOCK) != 0)
			(void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
		td->td_retval[0] = fd;
	}
	fdrop(fp, td);
	return (error);
}

/*
 * bind(2): copy the name in from user space and hand off to kern_bind().
 */
/* ARGSUSED */
int
sys_bind(td, uap)
	struct thread *td;
	struct bind_args /* {
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_bind(td, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}

/*
 * Common bind path for bind(2) and bindat(2).  dirfd is AT_FDCWD for a
 * plain bind, otherwise a directory descriptor passed through to
 * sobindat().  Requires CAP_BIND on the socket descriptor.
 */
static int
kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
	error = getsock_cap(td->td_proc->p_fd, fd,
	    cap_rights_init(&rights, CAP_BIND), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(sa);
#endif
#ifdef MAC
	error = mac_socket_check_bind(td->td_ucred, so, sa);
	if (error == 0) {
#endif
		if (dirfd == AT_FDCWD)
			error = sobind(so, sa, td);
		else
			error = sobindat(dirfd, so, sa, td);
#ifdef MAC
	}
#endif
	fdrop(fp, td);
	return (error);
}

int
kern_bind(struct thread *td, int fd, struct sockaddr *sa)
{

	return (kern_bindat(td, AT_FDCWD, fd, sa));
}

/*
 * bindat(2): bind relative to a directory descriptor.
 */
/* ARGSUSED */
int
sys_bindat(td, uap)
	struct thread *td;
	struct bindat_args /* {
		int	fd;
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_bindat(td, uap->fd, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}

/*
 * listen(2): mark a socket as accepting connections.
 * Requires CAP_LISTEN on the descriptor.
 */
/* ARGSUSED */
int
sys_listen(td, uap)
	struct thread *td;
	struct listen_args /* {
		int	s;
		int	backlog;
	} */ *uap;
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(uap->s);
	error = getsock_cap(td->td_proc->p_fd, uap->s,
	    cap_rights_init(&rights, CAP_LISTEN), &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
#ifdef MAC
		error = mac_socket_check_listen(td->td_ucred, so);
		if (error == 0)
#endif
			error = solisten(so, uap->backlog, td);
		fdrop(fp, td);
	}
	return(error);
}

/*
 * accept1()
 *
 * Common user-space-facing wrapper for accept(2), accept4(2) and the old
 * 4.3BSD accept: copies the caller's namelen in, calls kern_accept4(),
 * and copies the peer name and its length back out.
 */
static int
accept1(td, s, uname, anamelen, flags)
	struct thread *td;
	int s;
	struct sockaddr *uname;
	socklen_t *anamelen;
	int flags;
{
	struct sockaddr *name;
	socklen_t namelen;
	struct file *fp;
	int error;

	/* No name requested: no copyin/copyout bookkeeping needed. */
	if (uname == NULL)
		return (kern_accept4(td, s, NULL, NULL, flags, NULL));

	error = copyin(anamelen, &namelen, sizeof (namelen));
	if (error != 0)
		return (error);

	error = kern_accept4(td, s, &name, &namelen, flags, &fp);

	/*
	 * return a namelen of zero for older code which might
	 * ignore the return value from accept.
	 */
	if (error != 0) {
		(void) copyout(&namelen, anamelen, sizeof(*anamelen));
		return (error);
	}

	if (error == 0 && uname != NULL) {
#ifdef COMPAT_OLDSOCK
		/* Old accept: overwrite sa_len/sa_family with just family. */
		if (flags & ACCEPT4_COMPAT)
			((struct osockaddr *)name)->sa_family =
			    name->sa_family;
#endif
		error = copyout(name, uname, namelen);
	}
	if (error == 0)
		error = copyout(&namelen, anamelen,
		    sizeof(namelen));
	/* On copyout failure, undo the descriptor installed by accept. */
	if (error != 0)
		fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
	fdrop(fp, td);
	free(name, M_SONAME);
	return (error);
}

int
kern_accept(struct thread *td, int s, struct sockaddr **name,
    socklen_t *namelen, struct file **fp)
{
	return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
}

/*
 * Kernel side of accept(2)/accept4(2): wait for a completed connection on
 * the listening socket `s', detach it from the completion queue, install
 * it in a new file descriptor and optionally return the peer address.
 *
 * On success td->td_retval[0] holds the new fd; if name/namelen are
 * non-NULL they receive a malloc'ed sockaddr (caller frees, M_SONAME) and
 * its length; if fp is non-NULL it receives a referenced file pointer.
 * flags carries ACCEPT4_INHERIT/ACCEPT4_COMPAT plus SOCK_CLOEXEC and
 * SOCK_NONBLOCK.  Requires CAP_ACCEPT on the listening descriptor.
 */
int
kern_accept4(struct thread *td, int s, struct sockaddr **name,
    socklen_t *namelen, int flags, struct file **fp)
{
	struct filedesc *fdp;
	struct file *headfp, *nfp = NULL;
	struct sockaddr *sa = NULL;
	struct socket *head, *so;
	cap_rights_t rights;
	u_int fflag;
	pid_t pgid;
	int error, fd, tmp;

	if (name != NULL)
		*name = NULL;

	AUDIT_ARG_FD(s);
	fdp = td->td_proc->p_fd;
	error = getsock_cap(fdp, s, cap_rights_init(&rights, CAP_ACCEPT),
	    &headfp, &fflag);
	if (error != 0)
		return (error);
	head = headfp->f_data;
	if ((head->so_options & SO_ACCEPTCONN) == 0) {
		/* Not a listening socket. */
		error = EINVAL;
		goto done;
	}
#ifdef MAC
	error = mac_socket_check_accept(td->td_ucred, head);
	if (error != 0)
		goto done;
#endif
	/* Allocate the new descriptor before sleeping for a connection. */
	error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0);
	if (error != 0)
		goto done;
	ACCEPT_LOCK();
	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
		ACCEPT_UNLOCK();
		error = EWOULDBLOCK;
		goto noconnection;
	}
	/* Sleep until a completed connection arrives or an error is set. */
	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
			head->so_error = ECONNABORTED;
			break;
		}
		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
		    "accept", 0);
		if (error != 0) {
			ACCEPT_UNLOCK();
			goto noconnection;
		}
	}
	if (head->so_error) {
		error = head->so_error;
		head->so_error = 0;
		ACCEPT_UNLOCK();
		goto noconnection;
	}
	so = TAILQ_FIRST(&head->so_comp);
	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));

	/*
	 * Before changing the flags on the socket, we have to bump the
	 * reference count.  Otherwise, if the protocol calls sofree(),
	 * the socket will be released due to a zero refcount.
	 */
	SOCK_LOCK(so);			/* soref() and so_state update */
	soref(so);			/* file descriptor reference */

	TAILQ_REMOVE(&head->so_comp, so, so_list);
	head->so_qlen--;
	if (flags & ACCEPT4_INHERIT)
		so->so_state |= (head->so_state & SS_NBIO);
	else
		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
	so->so_qstate &= ~SQ_COMP;
	so->so_head = NULL;

	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();

	/* An extra reference on `nfp' has been held for us by falloc(). */
	td->td_retval[0] = fd;

	/* connection has been removed from the listen queue */
	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);

	if (flags & ACCEPT4_INHERIT) {
		/* Propagate SIGIO ownership from the listening socket. */
		pgid = fgetown(&head->so_sigio);
		if (pgid != 0)
			fsetown(pgid, &so->so_sigio);
	} else {
		fflag &= ~(FNONBLOCK | FASYNC);
		if (flags & SOCK_NONBLOCK)
			fflag |= FNONBLOCK;
	}

	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
	/* Sync socket nonblocking/async state with file flags */
	tmp = fflag & FNONBLOCK;
	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
	tmp = fflag & FASYNC;
	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
	sa = 0;
	error = soaccept(so, &sa);
	if (error != 0) {
		/*
		 * return a namelen of zero for older code which might
		 * ignore the return value from accept.
		 */
		if (name)
			*namelen = 0;
		goto noconnection;
	}
	if (sa == NULL) {
		if (name)
			*namelen = 0;
		goto done;
	}
	AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
	if (name) {
		/* check sa_len before it is destroyed */
		if (*namelen > sa->sa_len)
			*namelen = sa->sa_len;
#ifdef KTRACE
		if (KTRPOINT(td, KTR_STRUCT))
			ktrsockaddr(sa);
#endif
		*name = sa;
		sa = NULL;	/* ownership passed to caller */
	}
noconnection:
	free(sa, M_SONAME);

	/*
	 * close the new descriptor, assuming someone hasn't ripped it
	 * out from under us.
	 */
	if (error != 0)
		fdclose(fdp, nfp, fd, td);

	/*
	 * Release explicitly held references before returning.  We return
	 * a reference on nfp to the caller on success if they request it.
	 */
done:
	if (fp != NULL) {
		if (error == 0) {
			*fp = nfp;
			nfp = NULL;
		} else
			*fp = NULL;
	}
	if (nfp != NULL)
		fdrop(nfp, td);
	fdrop(headfp, td);
	return (error);
}

/*
 * accept(2): inherits nonblocking/async state and SIGIO owner from the
 * listening socket, per historical BSD semantics.
 */
int
sys_accept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{

	return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
}

/*
 * accept4(2): like accept(2) but nothing is inherited; SOCK_CLOEXEC and
 * SOCK_NONBLOCK may be requested explicitly.
 */
int
sys_accept4(td, uap)
	struct thread *td;
	struct accept4_args *uap;
{

	if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return (EINVAL);

	return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
}

#ifdef COMPAT_OLDSOCK
/*
 * Old 4.3BSD accept(): returns the address in osockaddr format.
 */
int
oaccept(td, uap)
	struct thread *td;
	struct accept_args *uap;
{

	return (accept1(td, uap->s, uap->name, uap->anamelen,
	    ACCEPT4_INHERIT | ACCEPT4_COMPAT));
}
#endif /* COMPAT_OLDSOCK */

/*
 * connect(2): copy the name in from user space and hand off to
 * kern_connect().
 */
/* ARGSUSED */
int
sys_connect(td, uap)
	struct thread *td;
	struct connect_args /* {
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_connect(td, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}

/*
 * Common connect path for connect(2) and connectat(2).  Starts the
 * connection and, unless the socket is nonblocking, sleeps until it
 * completes or fails.  Requires CAP_CONNECT on the descriptor.
 */
static int
kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error, interrupted = 0;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
	error = getsock_cap(td->td_proc->p_fd, fd,
	    cap_rights_init(&rights, CAP_CONNECT), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
	if (so->so_state & SS_ISCONNECTING) {
		error = EALREADY;
		goto done1;
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(sa);
#endif
#ifdef MAC
	error = mac_socket_check_connect(td->td_ucred, so, sa);
	if (error != 0)
		goto bad;
#endif
	if (dirfd == AT_FDCWD)
		error = soconnect(so, sa, td);
	else
		error = soconnectat(dirfd, so, sa, td);
	if (error != 0)
		goto bad;
	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
		/* Nonblocking connect in progress; report and return. */
		error = EINPROGRESS;
		goto done1;
	}
	SOCK_LOCK(so);
	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
		    "connec", 0);
		if (error != 0) {
			if (error == EINTR || error == ERESTART)
				interrupted = 1;
			break;
		}
	}
	if (error == 0) {
		error = so->so_error;
		so->so_error = 0;
	}
	SOCK_UNLOCK(so);
bad:
	/* Leave SS_ISCONNECTING set if the sleep was merely interrupted. */
	if (!interrupted)
		so->so_state &= ~SS_ISCONNECTING;
	if (error == ERESTART)
		error = EINTR;
done1:
	fdrop(fp, td);
	return (error);
}

int
kern_connect(struct thread *td, int fd, struct sockaddr *sa)
{

	return (kern_connectat(td, AT_FDCWD, fd, sa));
}

/*
 * connectat(2): connect relative to a directory descriptor.
 */
/* ARGSUSED */
int
sys_connectat(td, uap)
	struct thread *td;
	struct connectat_args /* {
		int	fd;
		int	s;
		caddr_t	name;
		int	namelen;
	} */ *uap;
{
	struct sockaddr *sa;
	int error;

	error = getsockaddr(&sa, uap->name, uap->namelen);
	if (error == 0) {
		error = kern_connectat(td, uap->fd, uap->s, sa);
		free(sa, M_SONAME);
	}
	return (error);
}

/*
 * Kernel side of socketpair(2): create two connected sockets and install
 * them in descriptors rsv[0]/rsv[1].  SOCK_CLOEXEC/SOCK_NONBLOCK in `type'
 * are honored.  On any failure all partially-created state is torn down
 * via the goto-cleanup chain.
 */
int
kern_socketpair(struct thread *td, int domain, int type, int protocol,
    int *rsv)
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *fp1, *fp2;
	struct socket *so1, *so2;
	int fd, error, oflag, fflag;

	AUDIT_ARG_SOCKET(domain, type, protocol);

	oflag = 0;
	fflag = 0;
	if ((type & SOCK_CLOEXEC) != 0) {
		type &= ~SOCK_CLOEXEC;
		oflag |= O_CLOEXEC;
	}
	if ((type & SOCK_NONBLOCK) != 0) {
		type &= ~SOCK_NONBLOCK;
		fflag |= FNONBLOCK;
	}
#ifdef MAC
	/* We might want to have a separate check for socket pairs. */
	error = mac_socket_check_create(td->td_ucred, domain, type,
	    protocol);
	if (error != 0)
		return (error);
#endif
	error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
	if (error != 0)
		return (error);
	error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
	if (error != 0)
		goto free1;
	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
	error = falloc(td, &fp1, &fd, oflag);
	if (error != 0)
		goto free2;
	rsv[0] = fd;
	fp1->f_data = so1;	/* so1 already has ref count */
	error = falloc(td, &fp2, &fd, oflag);
	if (error != 0)
		goto free3;
	fp2->f_data = so2;	/* so2 already has ref count */
	rsv[1] = fd;
	error = soconnect2(so1, so2);
	if (error != 0)
		goto free4;
	if (type == SOCK_DGRAM) {
		/*
		 * Datagram socket connection is asymmetric.
		 */
		error = soconnect2(so2, so1);
		if (error != 0)
			goto free4;
	}
	finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
	    &socketops);
	finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
	    &socketops);
	if ((fflag & FNONBLOCK) != 0) {
		(void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
		(void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
	}
	fdrop(fp1, td);
	fdrop(fp2, td);
	return (0);
free4:
	fdclose(fdp, fp2, rsv[1], td);
	fdrop(fp2, td);
free3:
	fdclose(fdp, fp1, rsv[0], td);
	fdrop(fp1, td);
free2:
	if (so2 != NULL)
		(void)soclose(so2);
free1:
	if (so1 != NULL)
		(void)soclose(so1);
	return (error);
}

/*
 * socketpair(2): thin wrapper that copies the two new descriptors out to
 * user space, closing them again if the copyout fails.
 */
int
sys_socketpair(struct thread *td, struct socketpair_args *uap)
{
	int error, sv[2];

	error = kern_socketpair(td, uap->domain, uap->type,
	    uap->protocol, sv);
	if (error != 0)
		return (error);
	error = copyout(sv, uap->rsv, 2 * sizeof(int));
	if (error != 0) {
		(void)kern_close(td, sv[0]);
		(void)kern_close(td,
		    sv[1]);
	}
	return (error);
}

/*
 * Common user-space entry for sendto(2)/sendmsg(2) and the old 4.3BSD
 * variants: copies in the destination address and control data, then
 * calls kern_sendit().  In capability mode, sends with an explicit
 * destination address are refused.
 */
static int
sendit(td, s, mp, flags)
	struct thread *td;
	int s;
	struct msghdr *mp;
	int flags;
{
	struct mbuf *control;
	struct sockaddr *to;
	int error;

#ifdef CAPABILITY_MODE
	if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
		return (ECAPMODE);
#endif

	if (mp->msg_name != NULL) {
		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
		if (error != 0) {
			to = NULL;
			goto bad;
		}
		mp->msg_name = to;
	} else {
		to = NULL;
	}

	if (mp->msg_control) {
		if (mp->msg_controllen < sizeof(struct cmsghdr)
#ifdef COMPAT_OLDSOCK
		    && mp->msg_flags != MSG_COMPAT
#endif
		) {
			error = EINVAL;
			goto bad;
		}
		error = sockargs(&control, mp->msg_control,
		    mp->msg_controllen, MT_CONTROL);
		if (error != 0)
			goto bad;
#ifdef COMPAT_OLDSOCK
		/* Old sendmsg: raw access rights; prepend a cmsghdr. */
		if (mp->msg_flags == MSG_COMPAT) {
			struct cmsghdr *cm;

			M_PREPEND(control, sizeof(*cm), M_WAITOK);
			cm = mtod(control, struct cmsghdr *);
			cm->cmsg_len = control->m_len;
			cm->cmsg_level = SOL_SOCKET;
			cm->cmsg_type = SCM_RIGHTS;
		}
#endif
	} else {
		control = NULL;
	}

	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);

bad:
	free(to, M_SONAME);
	return (error);
}

/*
 * Kernel side of the send calls: build a uio from the msghdr iovec and
 * hand it to sosend().  Requires CAP_SEND, plus CAP_CONNECT when a
 * destination address is given.  Delivers SIGPIPE on EPIPE unless
 * suppressed by SO_NOSIGPIPE or MSG_NOSIGNAL.
 */
int
kern_sendit(td, s, mp, flags, control, segflg)
	struct thread *td;
	int s;
	struct msghdr *mp;
	int flags;
	struct mbuf *control;
	enum uio_seg segflg;
{
	struct file *fp;
	struct uio auio;
	struct iovec *iov;
	struct socket *so;
	cap_rights_t rights;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	ssize_t len;
	int i, error;

	AUDIT_ARG_FD(s);
	cap_rights_init(&rights, CAP_SEND);
	if (mp->msg_name != NULL) {
		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
		cap_rights_set(&rights, CAP_CONNECT);
	}
	error = getsock_cap(td->td_proc->p_fd, s, &rights, &fp, NULL);
	if (error != 0)
		return (error);
	so = (struct socket *)fp->f_data;

#ifdef KTRACE
	if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(mp->msg_name);
#endif
#ifdef MAC
	if (mp->msg_name != NULL) {
		error = mac_socket_check_connect(td->td_ucred, so,
		    mp->msg_name);
		if (error != 0)
			goto bad;
	}
	error = mac_socket_check_send(td->td_ucred, so);
	if (error != 0)
		goto bad;
#endif

	auio.uio_iov = mp->msg_iov;
	auio.uio_iovcnt = mp->msg_iovlen;
	auio.uio_segflg = segflg;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	iov = mp->msg_iov;
	/* Sum the iovec lengths, rejecting overflow of the signed total. */
	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
		if ((auio.uio_resid += iov->iov_len) < 0) {
			error = EINVAL;
			goto bad;
		}
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif
	len = auio.uio_resid;
	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
	if (error != 0) {
		/* A partial transfer before interruption counts as success. */
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Generation of SIGPIPE can be controlled per socket */
		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
		    !(flags & MSG_NOSIGNAL)) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	if (error == 0)
		td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = td->td_retval[0];
		ktrgenio(s, UIO_WRITE, ktruio, error);
	}
#endif
bad:
	fdrop(fp, td);
	return (error);
}

/*
 * sendto(2): build a single-element msghdr and call sendit().
 */
int
sys_sendto(td, uap)
	struct thread *td;
	struct sendto_args /* {
		int	s;
		caddr_t	buf;
		size_t	len;
		int	flags;
		caddr_t	to;
		int	tolen;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;

	msg.msg_name = uap->to;
	msg.msg_namelen = uap->tolen;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	msg.msg_control = 0;
#ifdef COMPAT_OLDSOCK
	msg.msg_flags = 0;
#endif
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	return (sendit(td, uap->s, &msg, uap->flags));
}

#ifdef COMPAT_OLDSOCK
/*
 * Old 4.3BSD send(): no destination address.
 */
int
osend(td, uap)
	struct thread *td;
	struct osend_args /* {
		int	s;
		caddr_t	buf;
		int	len;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;

	msg.msg_name = 0;
	msg.msg_namelen = 0;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	msg.msg_control = 0;
	msg.msg_flags = 0;
	return (sendit(td, uap->s, &msg, uap->flags));
}

/*
 * Old 4.3BSD sendmsg(): reads an omsghdr and marks the message MSG_COMPAT
 * so sendit() wraps raw access rights in a cmsghdr.
 */
int
osendmsg(td, uap)
	struct thread *td;
	struct osendmsg_args /* {
		int	s;
		caddr_t	msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_iov = iov;
	msg.msg_flags = MSG_COMPAT;
	error = sendit(td, uap->s, &msg, uap->flags);
	free(iov, M_IOV);
	return (error);
}
#endif

/*
 * sendmsg(2): copy in the msghdr and its iovec, then call sendit().
 */
int
sys_sendmsg(td, uap)
	struct thread *td;
	struct sendmsg_args /* {
		int	s;
		caddr_t	msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (msg));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_iov = iov;
#ifdef COMPAT_OLDSOCK
	msg.msg_flags = 0;
#endif
	error = sendit(td, uap->s, &msg,
uap->flags); 1115 free(iov, M_IOV); 1116 return (error); 1117 } 1118 1119 int 1120 kern_recvit(td, s, mp, fromseg, controlp) 1121 struct thread *td; 1122 int s; 1123 struct msghdr *mp; 1124 enum uio_seg fromseg; 1125 struct mbuf **controlp; 1126 { 1127 struct uio auio; 1128 struct iovec *iov; 1129 struct mbuf *m, *control = NULL; 1130 caddr_t ctlbuf; 1131 struct file *fp; 1132 struct socket *so; 1133 struct sockaddr *fromsa = NULL; 1134 cap_rights_t rights; 1135 #ifdef KTRACE 1136 struct uio *ktruio = NULL; 1137 #endif 1138 ssize_t len; 1139 int error, i; 1140 1141 if (controlp != NULL) 1142 *controlp = NULL; 1143 1144 AUDIT_ARG_FD(s); 1145 error = getsock_cap(td->td_proc->p_fd, s, 1146 cap_rights_init(&rights, CAP_RECV), &fp, NULL); 1147 if (error != 0) 1148 return (error); 1149 so = fp->f_data; 1150 1151 #ifdef MAC 1152 error = mac_socket_check_receive(td->td_ucred, so); 1153 if (error != 0) { 1154 fdrop(fp, td); 1155 return (error); 1156 } 1157 #endif 1158 1159 auio.uio_iov = mp->msg_iov; 1160 auio.uio_iovcnt = mp->msg_iovlen; 1161 auio.uio_segflg = UIO_USERSPACE; 1162 auio.uio_rw = UIO_READ; 1163 auio.uio_td = td; 1164 auio.uio_offset = 0; /* XXX */ 1165 auio.uio_resid = 0; 1166 iov = mp->msg_iov; 1167 for (i = 0; i < mp->msg_iovlen; i++, iov++) { 1168 if ((auio.uio_resid += iov->iov_len) < 0) { 1169 fdrop(fp, td); 1170 return (EINVAL); 1171 } 1172 } 1173 #ifdef KTRACE 1174 if (KTRPOINT(td, KTR_GENIO)) 1175 ktruio = cloneuio(&auio); 1176 #endif 1177 len = auio.uio_resid; 1178 error = soreceive(so, &fromsa, &auio, NULL, 1179 (mp->msg_control || controlp) ? 
&control : NULL, 1180 &mp->msg_flags); 1181 if (error != 0) { 1182 if (auio.uio_resid != len && (error == ERESTART || 1183 error == EINTR || error == EWOULDBLOCK)) 1184 error = 0; 1185 } 1186 if (fromsa != NULL) 1187 AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa); 1188 #ifdef KTRACE 1189 if (ktruio != NULL) { 1190 ktruio->uio_resid = len - auio.uio_resid; 1191 ktrgenio(s, UIO_READ, ktruio, error); 1192 } 1193 #endif 1194 if (error != 0) 1195 goto out; 1196 td->td_retval[0] = len - auio.uio_resid; 1197 if (mp->msg_name) { 1198 len = mp->msg_namelen; 1199 if (len <= 0 || fromsa == NULL) 1200 len = 0; 1201 else { 1202 /* save sa_len before it is destroyed by MSG_COMPAT */ 1203 len = MIN(len, fromsa->sa_len); 1204 #ifdef COMPAT_OLDSOCK 1205 if (mp->msg_flags & MSG_COMPAT) 1206 ((struct osockaddr *)fromsa)->sa_family = 1207 fromsa->sa_family; 1208 #endif 1209 if (fromseg == UIO_USERSPACE) { 1210 error = copyout(fromsa, mp->msg_name, 1211 (unsigned)len); 1212 if (error != 0) 1213 goto out; 1214 } else 1215 bcopy(fromsa, mp->msg_name, len); 1216 } 1217 mp->msg_namelen = len; 1218 } 1219 if (mp->msg_control && controlp == NULL) { 1220 #ifdef COMPAT_OLDSOCK 1221 /* 1222 * We assume that old recvmsg calls won't receive access 1223 * rights and other control info, esp. as control info 1224 * is always optional and those options didn't exist in 4.3. 1225 * If we receive rights, trim the cmsghdr; anything else 1226 * is tossed. 
1227 */ 1228 if (control && mp->msg_flags & MSG_COMPAT) { 1229 if (mtod(control, struct cmsghdr *)->cmsg_level != 1230 SOL_SOCKET || 1231 mtod(control, struct cmsghdr *)->cmsg_type != 1232 SCM_RIGHTS) { 1233 mp->msg_controllen = 0; 1234 goto out; 1235 } 1236 control->m_len -= sizeof (struct cmsghdr); 1237 control->m_data += sizeof (struct cmsghdr); 1238 } 1239 #endif 1240 len = mp->msg_controllen; 1241 m = control; 1242 mp->msg_controllen = 0; 1243 ctlbuf = mp->msg_control; 1244 1245 while (m && len > 0) { 1246 unsigned int tocopy; 1247 1248 if (len >= m->m_len) 1249 tocopy = m->m_len; 1250 else { 1251 mp->msg_flags |= MSG_CTRUNC; 1252 tocopy = len; 1253 } 1254 1255 if ((error = copyout(mtod(m, caddr_t), 1256 ctlbuf, tocopy)) != 0) 1257 goto out; 1258 1259 ctlbuf += tocopy; 1260 len -= tocopy; 1261 m = m->m_next; 1262 } 1263 mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; 1264 } 1265 out: 1266 fdrop(fp, td); 1267 #ifdef KTRACE 1268 if (fromsa && KTRPOINT(td, KTR_STRUCT)) 1269 ktrsockaddr(fromsa); 1270 #endif 1271 free(fromsa, M_SONAME); 1272 1273 if (error == 0 && controlp != NULL) 1274 *controlp = control; 1275 else if (control) 1276 m_freem(control); 1277 1278 return (error); 1279 } 1280 1281 static int 1282 recvit(td, s, mp, namelenp) 1283 struct thread *td; 1284 int s; 1285 struct msghdr *mp; 1286 void *namelenp; 1287 { 1288 int error; 1289 1290 error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL); 1291 if (error != 0) 1292 return (error); 1293 if (namelenp != NULL) { 1294 error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); 1295 #ifdef COMPAT_OLDSOCK 1296 if (mp->msg_flags & MSG_COMPAT) 1297 error = 0; /* old recvfrom didn't check */ 1298 #endif 1299 } 1300 return (error); 1301 } 1302 1303 int 1304 sys_recvfrom(td, uap) 1305 struct thread *td; 1306 struct recvfrom_args /* { 1307 int s; 1308 caddr_t buf; 1309 size_t len; 1310 int flags; 1311 struct sockaddr * __restrict from; 1312 socklen_t * __restrict fromlenaddr; 1313 } */ *uap; 1314 { 
	struct msghdr msg;
	struct iovec aiov;
	int error;

	/* Read the caller's address-buffer size, if an address was asked for. */
	if (uap->fromlenaddr) {
		error = copyin(uap->fromlenaddr,
		    &msg.msg_namelen, sizeof (msg.msg_namelen));
		if (error != 0)
			goto done2;
	} else {
		msg.msg_namelen = 0;
	}
	msg.msg_name = uap->from;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	msg.msg_control = 0;
	msg.msg_flags = uap->flags;
	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
done2:
	return (error);
}

#ifdef COMPAT_OLDSOCK
/*
 * Old recvfrom(2): same as sys_recvfrom() with MSG_COMPAT forced on.
 */
int
orecvfrom(td, uap)
	struct thread *td;
	struct recvfrom_args *uap;
{

	uap->flags |= MSG_COMPAT;
	return (sys_recvfrom(td, uap));
}
#endif

#ifdef COMPAT_OLDSOCK
/*
 * Old recv(2): single-buffer receive with no source address reporting.
 */
int
orecv(td, uap)
	struct thread *td;
	struct orecv_args /* {
		int	s;
		caddr_t	buf;
		int	len;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec aiov;

	msg.msg_name = 0;
	msg.msg_namelen = 0;
	msg.msg_iov = &aiov;
	msg.msg_iovlen = 1;
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->len;
	msg.msg_control = 0;
	msg.msg_flags = uap->flags;
	return (recvit(td, uap->s, &msg, NULL));
}

/*
 * Old recvmsg.  This code takes advantage of the fact that the old msghdr
 * overlays the new one, missing only the flags, and with the (old) access
 * rights where the control fields are now.
 */
int
orecvmsg(td, uap)
	struct thread *td;
	struct orecvmsg_args /* {
		int	s;
		struct	omsghdr *msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_flags = uap->flags | MSG_COMPAT;
	msg.msg_iov = iov;
	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
	/* Old interface reports control length via msg_accrightslen. */
	if (msg.msg_controllen && error == 0)
		error = copyout(&msg.msg_controllen,
		    &uap->msg->msg_accrightslen, sizeof (int));
	free(iov, M_IOV);
	return (error);
}
#endif

/*
 * recvmsg(2): scatter receive with optional control data and source address.
 */
int
sys_recvmsg(td, uap)
	struct thread *td;
	struct recvmsg_args /* {
		int	s;
		struct	msghdr *msg;
		int	flags;
	} */ *uap;
{
	struct msghdr msg;
	struct iovec *uiov, *iov;
	int error;

	error = copyin(uap->msg, &msg, sizeof (msg));
	if (error != 0)
		return (error);
	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
	if (error != 0)
		return (error);
	msg.msg_flags = uap->flags;
#ifdef COMPAT_OLDSOCK
	/* Userland must not be able to smuggle in the kernel-only flag. */
	msg.msg_flags &= ~MSG_COMPAT;
#endif
	uiov = msg.msg_iov;
	msg.msg_iov = iov;
	error = recvit(td, uap->s, &msg, NULL);
	if (error == 0) {
		/* Restore the user's iov pointer before copying msg back. */
		msg.msg_iov = uiov;
		error = copyout(&msg, uap->msg, sizeof(msg));
	}
	free(iov, M_IOV);
	return (error);
}

/*
 * shutdown(2): disable sends and/or receives on a socket.
 */
/* ARGSUSED */
int
sys_shutdown(td, uap)
	struct thread *td;
	struct shutdown_args /* {
		int	s;
		int	how;
	} */ *uap;
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_FD(uap->s);
	error = getsock_cap(td->td_proc->p_fd, uap->s,
	    cap_rights_init(&rights, CAP_SHUTDOWN), &fp, NULL);
	if
	    (error == 0) {
		so = fp->f_data;
		error = soshutdown(so, uap->how);
		fdrop(fp, td);
	}
	return (error);
}

/*
 * setsockopt(2): thin wrapper around kern_setsockopt() with a user
 * address space option buffer.
 */
/* ARGSUSED */
int
sys_setsockopt(td, uap)
	struct thread *td;
	struct setsockopt_args /* {
		int	s;
		int	level;
		int	name;
		caddr_t	val;
		int	valsize;
	} */ *uap;
{

	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
	    uap->val, UIO_USERSPACE, uap->valsize));
}

/*
 * Kernel version of setsockopt(2); valseg says whether val points into
 * user or kernel space.
 */
int
kern_setsockopt(td, s, level, name, val, valseg, valsize)
	struct thread *td;
	int s;
	int level;
	int name;
	void *val;
	enum uio_seg valseg;
	socklen_t valsize;
{
	struct socket *so;
	struct file *fp;
	struct sockopt sopt;
	cap_rights_t rights;
	int error;

	if (val == NULL && valsize != 0)
		return (EFAULT);
	/* Reject sizes with the sign bit set (valsize is unsigned). */
	if ((int)valsize < 0)
		return (EINVAL);

	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = level;
	sopt.sopt_name = name;
	sopt.sopt_val = val;
	sopt.sopt_valsize = valsize;
	switch (valseg) {
	case UIO_USERSPACE:
		sopt.sopt_td = td;
		break;
	case UIO_SYSSPACE:
		sopt.sopt_td = NULL;
		break;
	default:
		panic("kern_setsockopt called with bad valseg");
	}

	AUDIT_ARG_FD(s);
	error = getsock_cap(td->td_proc->p_fd, s,
	    cap_rights_init(&rights, CAP_SETSOCKOPT), &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
		error = sosetopt(so, &sopt);
		fdrop(fp, td);
	}
	return(error);
}

/*
 * getsockopt(2): read an option value; the caller's *avalsize is a
 * value/result buffer-size argument.
 */
/* ARGSUSED */
int
sys_getsockopt(td, uap)
	struct thread *td;
	struct getsockopt_args /* {
		int	s;
		int	level;
		int	name;
		void * __restrict	val;
		socklen_t * __restrict avalsize;
	} */ *uap;
{
	socklen_t valsize;
	int error;

	/* valsize is left alone when val == NULL; kern_getsockopt zeroes it. */
	if (uap->val) {
		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
		if (error != 0)
			return (error);
	}
	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
	    uap->val, UIO_USERSPACE, &valsize);

	if (error == 0)
		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
	return (error);
}

/*
 * Kernel version of getsockopt.
 * optval can be a userland or userspace. optlen is always a kernel pointer.
 */
int
kern_getsockopt(td, s, level, name, val, valseg, valsize)
	struct thread *td;
	int s;
	int level;
	int name;
	void *val;
	enum uio_seg valseg;
	socklen_t *valsize;
{
	struct socket *so;
	struct file *fp;
	struct sockopt sopt;
	cap_rights_t rights;
	int error;

	/* A NULL buffer means "just query the size". */
	if (val == NULL)
		*valsize = 0;
	if ((int)*valsize < 0)
		return (EINVAL);

	sopt.sopt_dir = SOPT_GET;
	sopt.sopt_level = level;
	sopt.sopt_name = name;
	sopt.sopt_val = val;
	sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
	switch (valseg) {
	case UIO_USERSPACE:
		sopt.sopt_td = td;
		break;
	case UIO_SYSSPACE:
		sopt.sopt_td = NULL;
		break;
	default:
		panic("kern_getsockopt called with bad valseg");
	}

	AUDIT_ARG_FD(s);
	error = getsock_cap(td->td_proc->p_fd, s,
	    cap_rights_init(&rights, CAP_GETSOCKOPT), &fp, NULL);
	if (error == 0) {
		so = fp->f_data;
		error = sogetopt(so, &sopt);
		/* Report the actual option length back to the caller. */
		*valsize = sopt.sopt_valsize;
		fdrop(fp, td);
	}
	return (error);
}

/*
 * getsockname1() - Get socket name.
 */
/* ARGSUSED */
static int
getsockname1(td, uap, compat)
	struct thread *td;
	struct getsockname_args /* {
		int	fdes;
		struct	sockaddr * __restrict asa;
		socklen_t * __restrict alen;
	} */ *uap;
	int compat;
{
	struct sockaddr *sa;
	socklen_t len;
	int error;

	error = copyin(uap->alen, &len, sizeof(len));
	if (error != 0)
		return (error);

	error = kern_getsockname(td, uap->fdes, &sa, &len);
	if (error != 0)
		return (error);

	if (len != 0) {
#ifdef COMPAT_OLDSOCK
		/* Old ABI stores the family in the osockaddr layout. */
		if (compat)
			((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
		error = copyout(sa, uap->asa, (u_int)len);
	}
	free(sa, M_SONAME);
	if (error == 0)
		error = copyout(&len, uap->alen, sizeof(len));
	return (error);
}

/*
 * Kernel getsockname(): *sa is malloc'ed (M_SONAME) on success and owned
 * by the caller; *alen is clamped to the address's actual length.
 */
int
kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
    socklen_t *alen)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	socklen_t len;
	int error;

	AUDIT_ARG_FD(fd);
	error = getsock_cap(td->td_proc->p_fd, fd,
	    cap_rights_init(&rights, CAP_GETSOCKNAME), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
	*sa = NULL;
	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
	CURVNET_RESTORE();
	if (error != 0)
		goto bad;
	if (*sa == NULL)
		len = 0;
	else
		len = MIN(*alen, (*sa)->sa_len);
	*alen = len;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(*sa);
#endif
bad:
	fdrop(fp, td);
	if (error != 0 && *sa != NULL) {
		free(*sa, M_SONAME);
		*sa = NULL;
	}
	return (error);
}

/*
 * getsockname(2).
 */
int
sys_getsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{

	return (getsockname1(td, uap, 0));
}

#ifdef COMPAT_OLDSOCK
int
ogetsockname(td, uap)
	struct thread *td;
	struct getsockname_args *uap;
{

	return (getsockname1(td, uap, 1));
}
#endif /* COMPAT_OLDSOCK */

/*
 * getpeername1() - Get name of peer for connected socket.
 */
/* ARGSUSED */
static int
getpeername1(td, uap, compat)
	struct thread *td;
	struct getpeername_args /* {
		int	fdes;
		struct	sockaddr * __restrict	asa;
		socklen_t * __restrict	alen;
	} */ *uap;
	int compat;
{
	struct sockaddr *sa;
	socklen_t len;
	int error;

	error = copyin(uap->alen, &len, sizeof (len));
	if (error != 0)
		return (error);

	error = kern_getpeername(td, uap->fdes, &sa, &len);
	if (error != 0)
		return (error);

	if (len != 0) {
#ifdef COMPAT_OLDSOCK
		/* Old ABI stores the family in the osockaddr layout. */
		if (compat)
			((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
		error = copyout(sa, uap->asa, (u_int)len);
	}
	free(sa, M_SONAME);
	if (error == 0)
		error = copyout(&len, uap->alen, sizeof(len));
	return (error);
}

/*
 * Kernel getpeername(): like kern_getsockname() but for the remote
 * address; fails with ENOTCONN on an unconnected socket.  On success
 * *sa is malloc'ed (M_SONAME) and owned by the caller.
 */
int
kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
    socklen_t *alen)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	socklen_t len;
	int error;

	AUDIT_ARG_FD(fd);
	error = getsock_cap(td->td_proc->p_fd, fd,
	    cap_rights_init(&rights, CAP_GETPEERNAME), &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
		error = ENOTCONN;
		goto done;
	}
	*sa = NULL;
	CURVNET_SET(so->so_vnet);
	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
	CURVNET_RESTORE();
	if (error != 0)
		goto bad;
	if (*sa == NULL)
		len = 0;
	else
		len = MIN(*alen, (*sa)->sa_len);
	*alen = len;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(*sa);
#endif
bad:
	if (error != 0 && *sa != NULL) {
		free(*sa, M_SONAME);
		*sa = NULL;
	}
done:
	fdrop(fp, td);
	return (error);
}

/*
 * getpeername(2).
 */
int
sys_getpeername(td, uap)
	struct thread *td;
	struct getpeername_args *uap;
{

	return (getpeername1(td, uap, 0));
}

#ifdef COMPAT_OLDSOCK
int
ogetpeername(td, uap)
	struct thread *td;
	struct ogetpeername_args *uap;
{

	/* XXX uap should have type `getpeername_args *' to begin with. */
	return (getpeername1(td, (struct getpeername_args *)uap, 1));
}
#endif /* COMPAT_OLDSOCK */

/*
 * Copy a user buffer into a freshly-allocated mbuf of the given type.
 * For MT_SONAME the stored sa_len is forced to the copied length.
 */
int
sockargs(mp, buf, buflen, type)
	struct mbuf **mp;
	caddr_t buf;
	int buflen, type;
{
	struct sockaddr *sa;
	struct mbuf *m;
	int error;

	if (buflen > MLEN) {
#ifdef COMPAT_OLDSOCK
		if (type == MT_SONAME && buflen <= 112)
			buflen = MLEN;		/* unix domain compat. hack */
		else
#endif
			if (buflen > MCLBYTES)
				return (EINVAL);
	}
	m = m_get2(buflen, M_WAITOK, type, 0);
	m->m_len = buflen;
	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
	if (error != 0)
		(void) m_free(m);
	else {
		*mp = m;
		if (type == MT_SONAME) {
			sa = mtod(m, struct sockaddr *);

#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
			/* Old ABI put the family where sa_len now lives. */
			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
				sa->sa_family = sa->sa_len;
#endif
			sa->sa_len = buflen;
		}
	}
	return (error);
}

/*
 * Copy a sockaddr of the given length in from userland into a
 * malloc'ed (M_SONAME) buffer owned by the caller on success.
 */
int
getsockaddr(namp, uaddr, len)
	struct sockaddr **namp;
	caddr_t uaddr;
	size_t len;
{
	struct sockaddr *sa;
	int error;

	if (len > SOCK_MAXADDRLEN)
		return (ENAMETOOLONG);
	if (len < offsetof(struct sockaddr, sa_data[0]))
		return (EINVAL);
	sa = malloc(len, M_SONAME, M_WAITOK);
	error = copyin(uaddr, sa, len);
	if (error != 0) {
		free(sa, M_SONAME);
	} else {
#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
		/* Old ABI put the family where sa_len now lives. */
		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
			sa->sa_family = sa->sa_len;
#endif
		sa->sa_len = len;
		*namp = sa;
	}
	return (error);
}

/*
 * EVFILT_SENDFILE attach: bind a knote to the sendfile_sync passed in
 * kn_sdata via the in-kernel kevent registration (EV_FLAG1).
 */
static int
filt_sfsync_attach(struct knote *kn)
{
	struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_sdata;
	struct knlist *knl = &sfs->klist;

	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);

	/*
	 * Validate that we actually received this via the kernel API.
	 */
	if ((kn->kn_flags & EV_FLAG1) == 0)
		return (EPERM);

	kn->kn_ptr.p_v = sfs;
	kn->kn_flags &= ~EV_FLAG1;

	knl->kl_lock(knl->kl_lockarg);
	/*
	 * If we're in the "freeing" state,
	 * don't allow the add.  That way we don't
	 * end up racing with some other thread that
	 * is trying to finish some setup.
	 */
	if (sfs->state == SF_STATE_FREEING) {
		knl->kl_unlock(knl->kl_lockarg);
		return (EINVAL);
	}
	knlist_add(&sfs->klist, kn, 1);
	knl->kl_unlock(knl->kl_lockarg);

	return (0);
}

/*
 * Called when a knote is being detached.
 */
static void
filt_sfsync_detach(struct knote *kn)
{
	struct knlist *knl;
	struct sendfile_sync *sfs;
	int do_free = 0;

	sfs = kn->kn_ptr.p_v;
	knl = &sfs->klist;

	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);

	knl->kl_lock(knl->kl_lockarg);
	if (!knlist_empty(knl))
		knlist_remove(knl, kn, 1);

	/*
	 * If the list is empty _AND_ the refcount is 0
	 * _AND_ we've finished the setup phase and now
	 * we're in the running phase, we can free the
	 * underlying sendfile_sync.
	 *
	 * But we shouldn't do it before finishing the
	 * underlying divorce from the knote.
	 *
	 * So, we have the sfsync lock held; transition
	 * it to "freeing", then unlock, then free
	 * normally.
	 */
	if (knlist_empty(knl)) {
		if (sfs->state == SF_STATE_COMPLETED && sfs->count == 0) {
			SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, "
			    "count==0, empty list: time to free!\n",
			    __func__,
			    (unsigned long long) curthread->td_tid,
			    sfs);
			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
			do_free = 1;
		}
	}
	knl->kl_unlock(knl->kl_lockarg);

	/*
	 * Only call free if we're the one who has transitioned things
	 * to free.  Otherwise we could race with another thread that
	 * is currently tearing things down.
	 */
	if (do_free == 1) {
		SFSYNC_DPRINTF("%s: (%llu) sfs=%p, %s:%d\n",
		    __func__,
		    (unsigned long long) curthread->td_tid,
		    sfs,
		    __FILE__,
		    __LINE__);
		sf_sync_free(sfs);
	}
}

/*
 * EVFILT_SENDFILE event predicate: ready once the transaction has
 * completed and no mbuf references remain.
 */
static int
filt_sfsync(struct knote *kn, long hint)
{
	struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_ptr.p_v;
	int ret;

	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);

	/*
	 * XXX add a lock assertion here!
	 */
	ret = (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED);

	return (ret);
}


/*
 * Detach mapped page and release resources back to the system.
 */
int
sf_buf_mext(struct mbuf *mb, void *addr, void *args)
{
	vm_page_t m;
	struct sendfile_sync *sfs;

	m = sf_buf_page(args);
	sf_buf_free(args);
	vm_page_lock(m);
	vm_page_unwire(m, 0);
	/*
	 * Check for the object going away on us.  This can
	 * happen since we don't hold a reference to it.
	 * If so, we're responsible for freeing the page.
	 */
	if (m->wire_count == 0 && m->object == NULL)
		vm_page_free(m);
	vm_page_unlock(m);
	if (addr != NULL) {
		sfs = addr;
		sf_sync_deref(sfs);
	}
	/*
	 * sfs may be invalid at this point, don't use it!
	 */
	return (EXT_FREE_OK);
}

/*
 * Called to remove a reference to a sf_sync object.
 *
 * This is generally done during the mbuf free path to signify
 * that one of the mbufs in the transaction has been completed.
 *
 * If we're doing SF_SYNC and the refcount is zero then we'll wake
 * up any waiters.
 *
 * IF we're doing SF_KQUEUE and the refcount is zero then we'll
 * fire off the knote.
 */
void
sf_sync_deref(struct sendfile_sync *sfs)
{
	int do_free = 0;

	if (sfs == NULL)
		return;

	mtx_lock(&sfs->mtx);
	KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
	sfs->count--;

	/*
	 * Only fire off the wakeup / kqueue notification if
	 * we are in the running state.
	 */
	if (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED) {
		if (sfs->flags & SF_SYNC)
			cv_signal(&sfs->cv);

		if (sfs->flags & SF_KQUEUE) {
			SFSYNC_DPRINTF("%s: (%llu) sfs=%p: knote!\n",
			    __func__,
			    (unsigned long long) curthread->td_tid,
			    sfs);
			KNOTE_LOCKED(&sfs->klist, 1);
		}

		/*
		 * If we're not waiting around for a sync,
		 * check if the knote list is empty.
		 * If it is, we transition to free.
		 *
		 * XXX I think it's about time I added some state
		 * or flag that says whether we're supposed to be
		 * waiting around until we've done a signal.
		 *
		 * XXX Ie, the reason that I don't free it here
		 * is because the caller will free the last reference,
		 * not us.  That should be codified in some flag
		 * that indicates "self-free" rather than checking
		 * for SF_SYNC all the time.
		 */
		if ((sfs->flags & SF_SYNC) == 0 && knlist_empty(&sfs->klist)) {
			SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, "
			    "count==0, empty list: time to free!\n",
			    __func__,
			    (unsigned long long) curthread->td_tid,
			    sfs);
			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
			do_free = 1;
		}

	}
	mtx_unlock(&sfs->mtx);

	/*
	 * Attempt to do a free here.
	 *
	 * We do this outside of the lock because it may destroy the
	 * lock in question as it frees things.  We can optimise this
	 * later.
	 *
	 * XXX yes, we should make it a requirement to hold the
	 * lock across sf_sync_free().
	 */
	if (do_free == 1) {
		SFSYNC_DPRINTF("%s: (%llu) sfs=%p\n",
		    __func__,
		    (unsigned long long) curthread->td_tid,
		    sfs);
		sf_sync_free(sfs);
	}
}

/*
 * Allocate a sendfile_sync state structure.
 *
 * For now this only knows about the "sleep" sync, but later it will
 * grow various other personalities.
 */
struct sendfile_sync *
sf_sync_alloc(uint32_t flags)
{
	struct sendfile_sync *sfs;

	sfs = uma_zalloc(zone_sfsync, M_WAITOK | M_ZERO);
	mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
	cv_init(&sfs->cv, "sendfile");
	sfs->flags = flags;
	sfs->state = SF_STATE_SETUP;
	/* The knote list shares the sf_sync mutex. */
	knlist_init_mtx(&sfs->klist, &sfs->mtx);

	SFSYNC_DPRINTF("%s: sfs=%p, flags=0x%08x\n", __func__, sfs, sfs->flags);

	return (sfs);
}

/*
 * Take a reference to a sfsync instance.
 *
 * This has to map 1:1 to free calls coming in via sf_buf_mext(),
 * so typically this will be referenced once for each mbuf allocated.
 */
void
sf_sync_ref(struct sendfile_sync *sfs)
{

	if (sfs == NULL)
		return;

	mtx_lock(&sfs->mtx);
	sfs->count++;
	mtx_unlock(&sfs->mtx);
}

/*
 * Block the syscall, if SF_SYNC was requested, until all outstanding
 * mbuf references have been released.  Caller must hold sfs->mtx.
 */
void
sf_sync_syscall_wait(struct sendfile_sync *sfs)
{

	if (sfs == NULL)
		return;

	KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!",
	    __func__,
	    sfs));

	/*
	 * If we're not requested to wait during the syscall,
	 * don't bother waiting.
	 */
	if ((sfs->flags & SF_SYNC) == 0)
		goto out;

	/*
	 * This is a bit suboptimal and confusing, so bear with me.
	 *
	 * Ideally sf_sync_syscall_wait() will wait until
	 * all pending mbuf transmit operations are done.
	 * This means that when sendfile becomes async, it'll
	 * run in the background and will transition from
	 * RUNNING to COMPLETED when it's finished acquiring
	 * new things to send.  Then, when the mbufs finish
	 * sending, COMPLETED + sfs->count == 0 is enough to
	 * know that no further work is being done.
	 *
	 * So, we will sleep on both RUNNING and COMPLETED.
	 * It's up to the (in progress) async sendfile loop
	 * to transition the sf_sync from RUNNING to
	 * COMPLETED so the wakeup above will actually
	 * do the cv_signal() call.
	 */
	if (sfs->state != SF_STATE_COMPLETED && sfs->state != SF_STATE_RUNNING)
		goto out;

	if (sfs->count != 0)
		cv_wait(&sfs->cv, &sfs->mtx);
	KASSERT(sfs->count == 0, ("sendfile sync still busy"));

out:
	return;
}

/*
 * Free an sf_sync if it's appropriate to.
 */
void
sf_sync_free(struct sendfile_sync *sfs)
{

	if (sfs == NULL)
		return;

	SFSYNC_DPRINTF("%s: (%lld) sfs=%p; called; state=%d, flags=0x%08x "
	    "count=%d\n",
	    __func__,
	    (long long) curthread->td_tid,
	    sfs,
	    sfs->state,
	    sfs->flags,
	    sfs->count);

	mtx_lock(&sfs->mtx);

	/*
	 * We keep the sf_sync around if the state is active,
	 * we are doing kqueue notification and we have active
	 * knotes.
	 *
	 * If the caller wants to free us right this second it
	 * should transition this to the freeing state.
	 *
	 * So, complain loudly if they break this rule.
	 */
	if (sfs->state != SF_STATE_FREEING) {
		printf("%s: (%llu) sfs=%p; not freeing; let's wait!\n",
		    __func__,
		    (unsigned long long) curthread->td_tid,
		    sfs);
		mtx_unlock(&sfs->mtx);
		return;
	}

	KASSERT(sfs->count == 0, ("sendfile sync still busy"));
	cv_destroy(&sfs->cv);
	/*
	 * This doesn't call knlist_detach() on each knote; it just frees
	 * the entire list.
	 */
	knlist_delete(&sfs->klist, curthread, 1);
	mtx_destroy(&sfs->mtx);
	SFSYNC_DPRINTF("%s: (%llu) sfs=%p; freeing\n",
	    __func__,
	    (unsigned long long) curthread->td_tid,
	    sfs);
	uma_zfree(zone_sfsync, sfs);
}

/*
 * Setup a sf_sync to post a kqueue notification when things are complete.
 */
int
sf_sync_kqueue_setup(struct sendfile_sync *sfs, struct sf_hdtr_kq *sfkq)
{
	struct kevent kev;
	int error;

	sfs->flags |= SF_KQUEUE;

	/* Check the flags are valid */
	if ((sfkq->kq_flags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0)
		return (EINVAL);

	SFSYNC_DPRINTF("%s: sfs=%p: kqfd=%d, flags=0x%08x, ident=%p, udata=%p\n",
	    __func__,
	    sfs,
	    sfkq->kq_fd,
	    sfkq->kq_flags,
	    (void *) sfkq->kq_ident,
	    (void *) sfkq->kq_udata);

	/* Setup and register a knote on the given kqfd. */
	kev.ident = (uintptr_t) sfkq->kq_ident;
	kev.filter = EVFILT_SENDFILE;
	/* EV_FLAG1 marks this as an in-kernel registration for the filter. */
	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | sfkq->kq_flags;
	kev.data = (intptr_t) sfs;
	kev.udata = sfkq->kq_udata;

	error = kqfd_register(sfkq->kq_fd, &kev, curthread, 1);
	if (error != 0) {
		SFSYNC_DPRINTF("%s: returned %d\n", __func__, error);
	}
	return (error);
}

/*
 * Transition the sf_sync state machine, taking the lock unless the
 * caller says it already holds it (islocked).
 */
void
sf_sync_set_state(struct sendfile_sync *sfs, sendfile_sync_state_t state,
    int islocked)
{
	sendfile_sync_state_t old_state;

	if (! islocked)
		mtx_lock(&sfs->mtx);

	/*
	 * Update our current state.
	 */
	old_state = sfs->state;
	sfs->state = state;
	SFSYNC_DPRINTF("%s: (%llu) sfs=%p; going from %d to %d\n",
	    __func__,
	    (unsigned long long) curthread->td_tid,
	    sfs,
	    old_state,
	    state);

	/*
	 * If we're transitioning from RUNNING to COMPLETED and the count is
	 * zero, then post the knote.  The caller may have completed the
	 * send before we updated the state to COMPLETED and we need to make
	 * sure this is communicated.
	 */
	if (old_state == SF_STATE_RUNNING
	    && state == SF_STATE_COMPLETED
	    && sfs->count == 0
	    && sfs->flags & SF_KQUEUE) {
		SFSYNC_DPRINTF("%s: (%llu) sfs=%p: triggering knote!\n",
		    __func__,
		    (unsigned long long) curthread->td_tid,
		    sfs);
		KNOTE_LOCKED(&sfs->klist, 1);
	}

	if (! islocked)
		mtx_unlock(&sfs->mtx);
}

/*
 * Set the retval/errno for the given transaction.
 *
 * This will eventually/ideally be used when the KNOTE is fired off
 * to signify the completion of this transaction.
 *
 * The sfsync lock should be held before entering this function.
 */
void
sf_sync_set_retval(struct sendfile_sync *sfs, off_t retval, int xerrno)
{

	KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!",
	    __func__,
	    sfs));

	SFSYNC_DPRINTF("%s: (%llu) sfs=%p: errno=%d, retval=%jd\n",
	    __func__,
	    (unsigned long long) curthread->td_tid,
	    sfs,
	    xerrno,
	    (intmax_t) retval);

	sfs->retval = retval;
	sfs->xerrno = xerrno;
}

/*
 * sendfile(2)
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *		struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Send a file specified by 'fd' and starting at 'offset' to a socket
 * specified by 's'.  Send only 'nbytes' of the file or until EOF if nbytes ==
 * 0.  Optionally add a header and/or trailer to the socket output.  If
 * specified, write the total number of bytes sent into *sbytes.
2377 */ 2378 int 2379 sys_sendfile(struct thread *td, struct sendfile_args *uap) 2380 { 2381 2382 return (do_sendfile(td, uap, 0)); 2383 } 2384 2385 int 2386 _do_sendfile(struct thread *td, int src_fd, int sock_fd, int flags, 2387 int compat, off_t offset, size_t nbytes, off_t *sbytes, 2388 struct uio *hdr_uio, 2389 struct uio *trl_uio, struct sf_hdtr_kq *hdtr_kq) 2390 { 2391 cap_rights_t rights; 2392 struct sendfile_sync *sfs = NULL; 2393 struct file *fp; 2394 int error; 2395 int do_kqueue = 0; 2396 int do_free = 0; 2397 2398 AUDIT_ARG_FD(src_fd); 2399 2400 if (hdtr_kq != NULL) 2401 do_kqueue = 1; 2402 2403 /* 2404 * sendfile(2) can start at any offset within a file so we require 2405 * CAP_READ+CAP_SEEK = CAP_PREAD. 2406 */ 2407 if ((error = fget_read(td, src_fd, 2408 cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) { 2409 goto out; 2410 } 2411 2412 /* 2413 * IF SF_KQUEUE is set but we haven't copied in anything for 2414 * kqueue data, error out. 2415 */ 2416 if (flags & SF_KQUEUE && do_kqueue == 0) { 2417 SFSYNC_DPRINTF("%s: SF_KQUEUE but no KQUEUE data!\n", __func__); 2418 goto out; 2419 } 2420 2421 /* 2422 * If we need to wait for completion, initialise the sfsync 2423 * state here. 2424 */ 2425 if (flags & (SF_SYNC | SF_KQUEUE)) 2426 sfs = sf_sync_alloc(flags & (SF_SYNC | SF_KQUEUE)); 2427 2428 if (flags & SF_KQUEUE) { 2429 error = sf_sync_kqueue_setup(sfs, hdtr_kq); 2430 if (error) { 2431 SFSYNC_DPRINTF("%s: (%llu) error; sfs=%p\n", 2432 __func__, 2433 (unsigned long long) curthread->td_tid, 2434 sfs); 2435 sf_sync_set_state(sfs, SF_STATE_FREEING, 0); 2436 sf_sync_free(sfs); 2437 goto out; 2438 } 2439 } 2440 2441 /* 2442 * Do the sendfile call. 2443 * 2444 * If this fails, it'll free the mbuf chain which will free up the 2445 * sendfile_sync references. 2446 */ 2447 error = fo_sendfile(fp, sock_fd, hdr_uio, trl_uio, offset, 2448 nbytes, sbytes, flags, compat ? 
SFK_COMPAT : 0, sfs, td); 2449 2450 /* 2451 * If the sendfile call succeeded, transition the sf_sync state 2452 * to RUNNING, then COMPLETED. 2453 * 2454 * If the sendfile call failed, then the sendfile call may have 2455 * actually sent some data first - so we check to see whether 2456 * any data was sent. If some data was queued (ie, count > 0) 2457 * then we can't call free; we have to wait until the partial 2458 * transaction completes before we continue along. 2459 * 2460 * This has the side effect of firing off the knote 2461 * if the refcount has hit zero by the time we get here. 2462 */ 2463 if (sfs != NULL) { 2464 mtx_lock(&sfs->mtx); 2465 if (error == 0 || sfs->count > 0) { 2466 /* 2467 * When it's time to do async sendfile, the transition 2468 * to RUNNING signifies that we're actually actively 2469 * adding and completing mbufs. When the last disk 2470 * buffer is read (ie, when we're not doing any 2471 * further read IO and all subsequent stuff is mbuf 2472 * transmissions) we'll transition to COMPLETED 2473 * and when the final mbuf is freed, the completion 2474 * will be signaled. 2475 */ 2476 sf_sync_set_state(sfs, SF_STATE_RUNNING, 1); 2477 2478 /* 2479 * Set the retval before we signal completed. 2480 * If we do it the other way around then transitioning to 2481 * COMPLETED may post the knote before you set the return 2482 * status! 2483 * 2484 * XXX for now, errno is always 0, as we don't post 2485 * knotes if sendfile failed. Maybe that'll change later. 2486 */ 2487 sf_sync_set_retval(sfs, *sbytes, error); 2488 2489 /* 2490 * And now transition to completed, which will kick off 2491 * the knote if required. 2492 */ 2493 sf_sync_set_state(sfs, SF_STATE_COMPLETED, 1); 2494 } else { 2495 /* 2496 * Error isn't zero, sfs_count is zero, so we 2497 * won't have some other thing to wake things up. 2498 * Thus free. 2499 */ 2500 sf_sync_set_state(sfs, SF_STATE_FREEING, 1); 2501 do_free = 1; 2502 } 2503 2504 /* 2505 * Next - wait if appropriate. 
2506 */ 2507 sf_sync_syscall_wait(sfs); 2508 2509 /* 2510 * If we're not doing kqueue notifications, we can 2511 * transition this immediately to the freeing state. 2512 */ 2513 if ((sfs->flags & SF_KQUEUE) == 0) { 2514 sf_sync_set_state(sfs, SF_STATE_FREEING, 1); 2515 do_free = 1; 2516 } 2517 2518 mtx_unlock(&sfs->mtx); 2519 } 2520 2521 /* 2522 * If do_free is set, free here. 2523 * 2524 * If we're doing no-kqueue notification and it's just sleep notification, 2525 * we also do free; it's the only chance we have. 2526 */ 2527 if (sfs != NULL && do_free == 1) { 2528 sf_sync_free(sfs); 2529 } 2530 2531 /* 2532 * XXX Should we wait until the send has completed before freeing the source 2533 * file handle? It's the previous behaviour, sure, but is it required? 2534 * We've wired down the page references after all. 2535 */ 2536 fdrop(fp, td); 2537 2538 out: 2539 /* Return error */ 2540 return (error); 2541 } 2542 2543 2544 static int 2545 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat) 2546 { 2547 struct sf_hdtr hdtr; 2548 struct sf_hdtr_kq hdtr_kq; 2549 struct uio *hdr_uio, *trl_uio; 2550 int error; 2551 off_t sbytes; 2552 int do_kqueue = 0; 2553 2554 /* 2555 * File offset must be positive. If it goes beyond EOF 2556 * we send only the header/trailer and no payload data. 2557 */ 2558 if (uap->offset < 0) 2559 return (EINVAL); 2560 2561 hdr_uio = trl_uio = NULL; 2562 2563 if (uap->hdtr != NULL) { 2564 error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); 2565 if (error != 0) 2566 goto out; 2567 if (hdtr.headers != NULL) { 2568 error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio); 2569 if (error != 0) 2570 goto out; 2571 } 2572 if (hdtr.trailers != NULL) { 2573 error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio); 2574 if (error != 0) 2575 goto out; 2576 } 2577 2578 /* 2579 * If SF_KQUEUE is set, then we need to also copy in 2580 * the kqueue data after the normal hdtr set and set 2581 * do_kqueue=1. 
		 */
		if (uap->flags & SF_KQUEUE) {
			/*
			 * The sf_hdtr_kq is laid out in userland directly
			 * after the sf_hdtr, hence the pointer arithmetic.
			 *
			 * NOTE(review): if SF_KQUEUE is set but uap->hdtr is
			 * NULL, hdtr_kq is passed to _do_sendfile()
			 * uninitialized (do_kqueue stays 0) — confirm the
			 * callee ignores it in that case.
			 */
			error = copyin(((char *) uap->hdtr) + sizeof(hdtr),
			    &hdtr_kq,
			    sizeof(hdtr_kq));
			if (error != 0)
				goto out;
			do_kqueue = 1;
		}
	}

	/* Call sendfile */
	error = _do_sendfile(td, uap->fd, uap->s, uap->flags, compat,
	    uap->offset, uap->nbytes, &sbytes, hdr_uio, trl_uio, &hdtr_kq);

	if (uap->sbytes != NULL) {
		/*
		 * NOTE(review): the copyout() result is discarded here, so a
		 * faulting sbytes pointer is silently ignored — verify this
		 * matches the intended sendfile(2) contract.
		 */
		copyout(&sbytes, uap->sbytes, sizeof(off_t));
	}
out:
	/* free(9) tolerates NULL, so unconditional frees are safe here. */
	free(hdr_uio, M_IOV);
	free(trl_uio, M_IOV);
	return (error);
}

#ifdef COMPAT_FREEBSD4
/*
 * sendfile(2) entry point for FreeBSD 4 binaries: repackage the old
 * argument structure and call do_sendfile() with compat accounting.
 */
int
freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
{
	struct sendfile_args args;

	args.fd = uap->fd;
	args.s = uap->s;
	args.offset = uap->offset;
	args.nbytes = uap->nbytes;
	args.hdtr = uap->hdtr;
	args.sbytes = uap->sbytes;
	args.flags = uap->flags;

	return (do_sendfile(td, &args, 1));
}
#endif /* COMPAT_FREEBSD4 */

/*
 * Grab and wire the page of 'obj' backing byte offset 'off', initiating
 * I/O from the backing store (vnode read-ahead via vn_rdwr(), or the
 * pager) if the wanted range is not already valid.  On success the wired
 * page is returned in *res.  A non-zero 'nd' forbids blocking on disk
 * I/O and is returned instead (see comment below).
 */
static int
sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd,
    off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res)
{
	vm_page_t m;
	vm_pindex_t pindex;
	ssize_t resid;
	int error, readahead, rv;

	pindex = OFF_TO_IDX(off);
	VM_OBJECT_WLOCK(obj);
	m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY |
	    VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL);

	/*
	 * Check if page is valid for what we need, otherwise initiate I/O.
	 *
	 * The non-zero nd argument prevents disk I/O, instead we
	 * return the caller what he specified in nd.  In particular,
	 * if we already turned some pages into mbufs, nd == EAGAIN
	 * and the main function send them the pages before we come
	 * here again and block.
	 */
	if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) {
		if (vp == NULL)
			vm_page_xunbusy(m);
		VM_OBJECT_WUNLOCK(obj);
		*res = m;
		return (0);
	} else if (nd != 0) {
		if (vp == NULL)
			vm_page_xunbusy(m);
		error = nd;
		goto free_page;
	}

	/*
	 * Get the page from backing store.
	 */
	error = 0;
	if (vp != NULL) {
		VM_OBJECT_WUNLOCK(obj);
		readahead = sfreadahead * MAXBSIZE;

		/*
		 * Use vn_rdwr() instead of the pager interface for
		 * the vnode, to allow the read-ahead.
		 *
		 * XXXMAC: Because we don't have fp->f_cred here, we
		 * pass in NOCRED.  This is probably wrong, but is
		 * consistent with our original implementation.
		 */
		error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off),
		    UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead /
		    bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td);
		SFSTAT_INC(sf_iocnt);
		VM_OBJECT_WLOCK(obj);
	} else {
		if (vm_pager_has_page(obj, pindex, NULL, NULL)) {
			rv = vm_pager_get_pages(obj, &m, 1, 0);
			SFSTAT_INC(sf_iocnt);
			/* The pager may have replaced the page; look it up. */
			m = vm_page_lookup(obj, pindex);
			if (m == NULL)
				error = EIO;
			else if (rv != VM_PAGER_OK) {
				vm_page_lock(m);
				vm_page_free(m);
				vm_page_unlock(m);
				m = NULL;
				error = EIO;
			}
		} else {
			/* Page has no backing store: supply zeroes. */
			pmap_zero_page(m);
			m->valid = VM_PAGE_BITS_ALL;
			m->dirty = 0;
		}
		if (m != NULL)
			vm_page_xunbusy(m);
	}
	if (error == 0) {
		*res = m;
	} else if (m != NULL) {
free_page:
		vm_page_lock(m);
		vm_page_unwire(m, 0);

		/*
		 * See if anyone else might know about this page.  If
		 * not and it is not valid, then free it.
2713 */ 2714 if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m)) 2715 vm_page_free(m); 2716 vm_page_unlock(m); 2717 } 2718 KASSERT(error != 0 || (m->wire_count > 0 && 2719 vm_page_is_valid(m, off & PAGE_MASK, xfsize)), 2720 ("wrong page state m %p off %#jx xfsize %d", m, (uintmax_t)off, 2721 xfsize)); 2722 VM_OBJECT_WUNLOCK(obj); 2723 return (error); 2724 } 2725 2726 static int 2727 sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res, 2728 struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size, 2729 int *bsize) 2730 { 2731 struct vattr va; 2732 vm_object_t obj; 2733 struct vnode *vp; 2734 struct shmfd *shmfd; 2735 int error; 2736 2737 vp = *vp_res = NULL; 2738 obj = NULL; 2739 shmfd = *shmfd_res = NULL; 2740 *bsize = 0; 2741 2742 /* 2743 * The file descriptor must be a regular file and have a 2744 * backing VM object. 2745 */ 2746 if (fp->f_type == DTYPE_VNODE) { 2747 vp = fp->f_vnode; 2748 vn_lock(vp, LK_SHARED | LK_RETRY); 2749 if (vp->v_type != VREG) { 2750 error = EINVAL; 2751 goto out; 2752 } 2753 *bsize = vp->v_mount->mnt_stat.f_iosize; 2754 error = VOP_GETATTR(vp, &va, td->td_ucred); 2755 if (error != 0) 2756 goto out; 2757 *obj_size = va.va_size; 2758 obj = vp->v_object; 2759 if (obj == NULL) { 2760 error = EINVAL; 2761 goto out; 2762 } 2763 } else if (fp->f_type == DTYPE_SHM) { 2764 shmfd = fp->f_data; 2765 obj = shmfd->shm_object; 2766 *obj_size = shmfd->shm_size; 2767 } else { 2768 error = EINVAL; 2769 goto out; 2770 } 2771 2772 VM_OBJECT_WLOCK(obj); 2773 if ((obj->flags & OBJ_DEAD) != 0) { 2774 VM_OBJECT_WUNLOCK(obj); 2775 error = EBADF; 2776 goto out; 2777 } 2778 2779 /* 2780 * Temporarily increase the backing VM object's reference 2781 * count so that a forced reclamation of its vnode does not 2782 * immediately destroy it. 
2783 */ 2784 vm_object_reference_locked(obj); 2785 VM_OBJECT_WUNLOCK(obj); 2786 *obj_res = obj; 2787 *vp_res = vp; 2788 *shmfd_res = shmfd; 2789 2790 out: 2791 if (vp != NULL) 2792 VOP_UNLOCK(vp, 0); 2793 return (error); 2794 } 2795 2796 static int 2797 kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp, 2798 struct socket **so) 2799 { 2800 cap_rights_t rights; 2801 int error; 2802 2803 *sock_fp = NULL; 2804 *so = NULL; 2805 2806 /* 2807 * The socket must be a stream socket and connected. 2808 */ 2809 error = getsock_cap(td->td_proc->p_fd, s, cap_rights_init(&rights, 2810 CAP_SEND), sock_fp, NULL); 2811 if (error != 0) 2812 return (error); 2813 *so = (*sock_fp)->f_data; 2814 if ((*so)->so_type != SOCK_STREAM) 2815 return (EINVAL); 2816 if (((*so)->so_state & SS_ISCONNECTED) == 0) 2817 return (ENOTCONN); 2818 return (0); 2819 } 2820 2821 int 2822 vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, 2823 struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, 2824 int kflags, struct sendfile_sync *sfs, struct thread *td) 2825 { 2826 struct file *sock_fp; 2827 struct vnode *vp; 2828 struct vm_object *obj; 2829 struct socket *so; 2830 struct mbuf *m; 2831 struct sf_buf *sf; 2832 struct vm_page *pg; 2833 struct shmfd *shmfd; 2834 struct vattr va; 2835 off_t off, xfsize, fsbytes, sbytes, rem, obj_size; 2836 int error, bsize, nd, hdrlen, mnw; 2837 2838 pg = NULL; 2839 obj = NULL; 2840 so = NULL; 2841 m = NULL; 2842 fsbytes = sbytes = 0; 2843 hdrlen = mnw = 0; 2844 rem = nbytes; 2845 obj_size = 0; 2846 2847 error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize); 2848 if (error != 0) 2849 return (error); 2850 if (rem == 0) 2851 rem = obj_size; 2852 2853 error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so); 2854 if (error != 0) 2855 goto out; 2856 2857 /* 2858 * Do not wait on memory allocations but return ENOMEM for 2859 * caller to retry later. 2860 * XXX: Experimental. 
	 */
	if (flags & SF_MNOWAIT)
		mnw = 1;

#ifdef MAC
	error = mac_socket_check_send(td->td_ucred, so);
	if (error != 0)
		goto out;
#endif

	/* If headers are specified copy them into mbufs. */
	if (hdr_uio != NULL) {
		hdr_uio->uio_td = td;
		hdr_uio->uio_rw = UIO_WRITE;
		if (hdr_uio->uio_resid > 0) {
			/*
			 * In FBSD < 5.0 the nbytes to send also included
			 * the header.  If compat is specified subtract the
			 * header size from nbytes.
			 */
			if (kflags & SFK_COMPAT) {
				if (nbytes > hdr_uio->uio_resid)
					nbytes -= hdr_uio->uio_resid;
				else
					nbytes = 0;
			}
			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
			    0, 0, 0);
			if (m == NULL) {
				error = mnw ? EAGAIN : ENOBUFS;
				goto out;
			}
			hdrlen = m_length(m, NULL);
		}
	}

	/*
	 * Protect against multiple writers to the socket.
	 *
	 * XXXRW: Historically this has assumed non-interruptibility, so now
	 * we implement that, but possibly shouldn't.
	 */
	(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);

	/*
	 * Loop through the pages of the file, starting with the requested
	 * offset. Get a file page (do I/O if necessary), map the file page
	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
	 * it on the socket.
	 * This is done in two loops. The inner loop turns as many pages
	 * as it can, up to available socket buffer space, without blocking
	 * into mbufs to have it bulk delivered into the socket send buffer.
	 * The outer loop checks the state and available space of the socket
	 * and takes care of the overall progress.
	 */
	for (off = offset; ; ) {
		struct mbuf *mtail;
		int loopbytes;
		int space;
		int done;

		/* Stop when the requested range (or whole file) is sent. */
		if ((nbytes != 0 && nbytes == fsbytes) ||
		    (nbytes == 0 && obj_size == fsbytes))
			break;

		mtail = NULL;
		loopbytes = 0;
		space = 0;
		done = 0;

		/*
		 * Check the socket state for ongoing connection,
		 * no errors and space in socket buffer.
		 * If space is low allow for the remainder of the
		 * file to be processed if it fits the socket buffer.
		 * Otherwise block in waiting for sufficient space
		 * to proceed, or if the socket is nonblocking, return
		 * to userland with EAGAIN while reporting how far
		 * we've come.
		 * We wait until the socket buffer has significant free
		 * space to do bulk sends. This makes good use of file
		 * system read ahead and allows packet segmentation
		 * offloading hardware to take over lots of work. If
		 * we were not careful here we would send off only one
		 * sfbuf at a time.
		 */
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
retry_space:
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			error = EPIPE;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto done;
		} else if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto done;
		}
		space = sbspace(&so->so_snd);
		if (space < rem &&
		    (space <= 0 ||
		     space < so->so_snd.sb_lowat)) {
			if (so->so_state & SS_NBIO) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EAGAIN;
				goto done;
			}
			/*
			 * sbwait drops the lock while sleeping.
			 * When we loop back to retry_space the
			 * state may have changed and we retest
			 * for it.
			 */
			error = sbwait(&so->so_snd);
			/*
			 * An error from sbwait usually indicates that we've
			 * been interrupted by a signal. If we've sent anything
			 * then return bytes sent, otherwise return the error.
			 */
			if (error != 0) {
				SOCKBUF_UNLOCK(&so->so_snd);
				goto done;
			}
			goto retry_space;
		}
		SOCKBUF_UNLOCK(&so->so_snd);

		/*
		 * Reduce space in the socket buffer by the size of
		 * the header mbuf chain.
		 * hdrlen is set to 0 after the first loop.
		 */
		space -= hdrlen;

		/*
		 * Re-check the file size each round: the file may be
		 * truncated or extended while we are sending it.
		 */
		if (vp != NULL) {
			error = vn_lock(vp, LK_SHARED);
			if (error != 0)
				goto done;
			error = VOP_GETATTR(vp, &va, td->td_ucred);
			if (error != 0 || off >= va.va_size) {
				VOP_UNLOCK(vp, 0);
				goto done;
			}
			obj_size = va.va_size;
		}

		/*
		 * Loop and construct maximum sized mbuf chain to be bulk
		 * dumped into socket buffer.
		 */
		while (space > loopbytes) {
			vm_offset_t pgoff;
			struct mbuf *m0;

			/*
			 * Calculate the amount to transfer.
			 * Not to exceed a page, the EOF,
			 * or the passed in nbytes.
			 * (fsbytes + loopbytes is the distance already
			 * covered from the starting 'offset'.)
			 */
			pgoff = (vm_offset_t)(off & PAGE_MASK);
			rem = obj_size - offset;
			if (nbytes != 0)
				rem = omin(rem, nbytes);
			rem -= fsbytes + loopbytes;
			xfsize = omin(PAGE_SIZE - pgoff, rem);
			xfsize = omin(space - loopbytes, xfsize);
			if (xfsize <= 0) {
				done = 1;		/* all data sent */
				break;
			}

			/*
			 * Attempt to look up the page.  Allocate
			 * if not found or wait and loop if busy.
			 */
			if (m != NULL)
				nd = EAGAIN; /* send what we already got */
			else if ((flags & SF_NODISKIO) != 0)
				nd = EBUSY;
			else
				nd = 0;
			error = sendfile_readpage(obj, vp, nd, off,
			    xfsize, bsize, td, &pg);
			if (error != 0) {
				if (error == EAGAIN)
					error = 0;	/* not a real error */
				break;
			}

			/*
			 * Get a sendfile buf.  When allocating the
			 * first buffer for mbuf chain, we usually
			 * wait as long as necessary, but this wait
			 * can be interrupted.  For consequent
			 * buffers, do not sleep, since several
			 * threads might exhaust the buffers and then
			 * deadlock.
			 */
			sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
			    SFB_CATCH);
			if (sf == NULL) {
				SFSTAT_INC(sf_allocfail);
				vm_page_lock(pg);
				vm_page_unwire(pg, 0);
				KASSERT(pg->object != NULL,
				    ("%s: object disappeared", __func__));
				vm_page_unlock(pg);
				if (m == NULL)
					error = (mnw ? EAGAIN : EINTR);
				break;
			}

			/*
			 * Get an mbuf and set it up as having
			 * external storage.
			 */
			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
			if (m0 == NULL) {
				error = (mnw ? EAGAIN : ENOBUFS);
				(void)sf_buf_mext(NULL, NULL, sf);
				break;
			}
			if (m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE,
			    sf_buf_mext, sfs, sf, M_RDONLY, EXT_SFBUF,
			    (mnw ? M_NOWAIT : M_WAITOK)) != 0) {
				error = (mnw ? EAGAIN : ENOBUFS);
				(void)sf_buf_mext(NULL, NULL, sf);
				m_freem(m0);
				break;
			}
			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
			m0->m_len = xfsize;

			/* Append to mbuf chain. */
			if (mtail != NULL)
				mtail->m_next = m0;
			else if (m != NULL)
				m_last(m)->m_next = m0;
			else
				m = m0;
			mtail = m0;

			/* Keep track of bits processed. */
			loopbytes += xfsize;
			off += xfsize;

			/*
			 * XXX eventually this should be a sfsync
			 * method call!
			 */
			if (sfs != NULL)
				sf_sync_ref(sfs);
		}

		if (vp != NULL)
			VOP_UNLOCK(vp, 0);

		/* Add the buffer chain to the socket buffer. */
		if (m != NULL) {
			int mlen, err;

			mlen = m_length(m, NULL);
			SOCKBUF_LOCK(&so->so_snd);
			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
				error = EPIPE;
				SOCKBUF_UNLOCK(&so->so_snd);
				goto done;
			}
			SOCKBUF_UNLOCK(&so->so_snd);
			CURVNET_SET(so->so_vnet);
			/* Avoid error aliasing.
			 */
			err = (*so->so_proto->pr_usrreqs->pru_send)
			    (so, 0, m, NULL, NULL, td);
			CURVNET_RESTORE();
			if (err == 0) {
				/*
				 * We need two counters to get the
				 * file offset and nbytes to send
				 * right:
				 * - sbytes contains the total amount
				 *   of bytes sent, including headers.
				 * - fsbytes contains the total amount
				 *   of bytes sent from the file.
				 */
				sbytes += mlen;
				fsbytes += mlen;
				if (hdrlen) {
					fsbytes -= hdrlen;
					hdrlen = 0;
				}
			} else if (error == 0)
				error = err;
			m = NULL;	/* pru_send always consumes */
		}

		/* Quit outer loop on error or when we're done. */
		if (done)
			break;
		if (error != 0)
			goto done;
	}

	/*
	 * Send trailers. Wimp out and use writev(2).
	 */
	if (trl_uio != NULL) {
		sbunlock(&so->so_snd);
		error = kern_writev(td, sockfd, trl_uio);
		if (error == 0)
			sbytes += td->td_retval[0];
		goto out;
	}

done:
	sbunlock(&so->so_snd);
out:
	/*
	 * If there was no error we have to clear td->td_retval[0]
	 * because it may have been set by writev.
	 */
	if (error == 0) {
		td->td_retval[0] = 0;
	}
	if (sent != NULL) {
		(*sent) = sbytes;
	}
	if (obj != NULL)
		vm_object_deallocate(obj);
	/* sock_fp is valid whenever 'so' was set by kern_sendfile_getsock(). */
	if (so)
		fdrop(sock_fp, td);
	if (m)
		m_freem(m);

	if (error == ERESTART)
		error = EINTR;

	return (error);
}

/*
 * SCTP syscalls.
 * Functionality only compiled in if SCTP is defined in the kernel Makefile,
 * otherwise all return EOPNOTSUPP.
 * XXX: We should make this loadable one day.
 */
int
sys_sctp_peeloff(td, uap)
	struct thread *td;
	struct sctp_peeloff_args /* {
		int	sd;
		caddr_t	name;
	} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
	struct file *nfp = NULL;
	struct socket *head, *so;
	cap_rights_t rights;
	u_int fflag;
	int error, fd;

	AUDIT_ARG_FD(uap->sd);
	error = fgetsock(td, uap->sd, cap_rights_init(&rights, CAP_PEELOFF),
	    &head, &fflag);
	if (error != 0)
		goto done2;
	if (head->so_proto->pr_protocol != IPPROTO_SCTP) {
		error = EOPNOTSUPP;
		goto done;
	}
	/* uap->name carries the association id to peel off. */
	error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
	if (error != 0)
		goto done;
	/*
	 * At this point we know we do have a assoc to pull
	 * we proceed to get the fd setup.  This may block
	 * but that is ok.
	 */

	error = falloc(td, &nfp, &fd, 0);
	if (error != 0)
		goto done;
	td->td_retval[0] = fd;

	CURVNET_SET(head->so_vnet);
	so = sonewconn(head, SS_ISCONNECTED);
	if (so == NULL) {
		error = ENOMEM;
		goto noconnection;
	}
	/*
	 * Before changing the flags on the socket, we have to bump the
	 * reference count.  Otherwise, if the protocol calls sofree(),
	 * the socket will be released due to a zero refcount.
	 */
	SOCK_LOCK(so);
	soref(so);			/* file descriptor reference */
	SOCK_UNLOCK(so);

	ACCEPT_LOCK();

	/* Detach the new socket from the listen queue of 'head'. */
	TAILQ_REMOVE(&head->so_comp, so, so_list);
	head->so_qlen--;
	so->so_state |= (head->so_state & SS_NBIO);
	so->so_state &= ~SS_NOFDREF;
	so->so_qstate &= ~SQ_COMP;
	so->so_head = NULL;
	ACCEPT_UNLOCK();
	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
	error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
	if (error != 0)
		goto noconnection;
	if (head->so_sigio != NULL)
		fsetown(fgetown(&head->so_sigio), &so->so_sigio);

noconnection:
	/*
	 * close the new descriptor, assuming someone hasn't ripped it
	 * out from under us.
	 */
	if (error != 0)
		fdclose(td->td_proc->p_fd, nfp, fd, td);

	/*
	 * Release explicitly held references before returning.
	 */
	CURVNET_RESTORE();
done:
	if (nfp != NULL)
		fdrop(nfp, td);
	fputsock(head);
done2:
	return (error);
#else  /* SCTP */
	return (EOPNOTSUPP);
#endif /* SCTP */
}

/*
 * sctp_generic_sendmsg(2): send a single buffer on an SCTP socket,
 * optionally to an explicit address and with per-message sndrcvinfo.
 */
int
sys_sctp_generic_sendmsg (td, uap)
	struct thread *td;
	struct sctp_generic_sendmsg_args /* {
		int sd,
		caddr_t msg,
		int mlen,
		caddr_t to,
		__socklen_t tolen,
		struct sctp_sndrcvinfo *sinfo,
		int flags
	} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
	struct socket *so;
	struct file *fp = NULL;
	struct sockaddr *to = NULL;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	struct uio auio;
	struct iovec iov[1];
	cap_rights_t rights;
	int error = 0, len;

	if (uap->sinfo != NULL) {
		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
		if (error != 0)
			return (error);
		u_sinfo = &sinfo;
	}

	cap_rights_init(&rights, CAP_SEND);
	if (uap->tolen != 0) {
		/* Sending to an explicit address also needs CAP_CONNECT. */
		error = getsockaddr(&to, uap->to, uap->tolen);
		if (error != 0) {
			to = NULL;
			goto sctp_bad2;
		}
		cap_rights_set(&rights, CAP_CONNECT);
	}

	AUDIT_ARG_FD(uap->sd);
	error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL);
	if (error != 0)
		goto sctp_bad;
#ifdef KTRACE
	if (to && (KTRPOINT(td, KTR_STRUCT)))
		ktrsockaddr(to);
#endif

	iov[0].iov_base = uap->msg;
	iov[0].iov_len = uap->mlen;

	so = (struct socket *)fp->f_data;
	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
		error = EOPNOTSUPP;
		goto sctp_bad;
	}
#ifdef MAC
	error = mac_socket_check_send(td->td_ucred, so);
	if (error != 0)
		goto sctp_bad;
#endif /* MAC */

	auio.uio_iov = iov;
	auio.uio_iovcnt = 1;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	len = auio.uio_resid = uap->mlen;
	CURVNET_SET(so->so_vnet);
	error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL,
	    (struct mbuf *)NULL, uap->flags, u_sinfo, td);
	CURVNET_RESTORE();
	if (error != 0) {
		/* A partial send interrupted by a signal is not an error. */
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Generation of SIGPIPE can be controlled per socket.
		 */
		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
		    !(uap->flags & MSG_NOSIGNAL)) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	if (error == 0)
		td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
	/*
	 * NOTE(review): ktruio is never assigned in this function (no
	 * cloneuio() call as in sys_sctp_generic_recvmsg()), so this
	 * block is dead code — confirm whether KTR_GENIO tracing was
	 * intended here.
	 */
	if (ktruio != NULL) {
		ktruio->uio_resid = td->td_retval[0];
		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
	}
#endif /* KTRACE */
sctp_bad:
	if (fp != NULL)
		fdrop(fp, td);
sctp_bad2:
	free(to, M_SONAME);
	return (error);
#else  /* SCTP */
	return (EOPNOTSUPP);
#endif /* SCTP */
}

/*
 * sctp_generic_sendmsg_iov(2): like sctp_generic_sendmsg(2) but the
 * payload is gathered from a user iovec array.
 */
int
sys_sctp_generic_sendmsg_iov(td, uap)
	struct thread *td;
	struct sctp_generic_sendmsg_iov_args /* {
		int sd,
		struct iovec *iov,
		int iovlen,
		caddr_t to,
		__socklen_t tolen,
		struct sctp_sndrcvinfo *sinfo,
		int flags
	} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
	struct socket *so;
	struct file *fp = NULL;
	struct sockaddr *to = NULL;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	struct uio auio;
	struct iovec *iov, *tiov;
	cap_rights_t rights;
	ssize_t len;
	int error, i;

	if (uap->sinfo != NULL) {
		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
		if (error != 0)
			return (error);
		u_sinfo = &sinfo;
	}
	cap_rights_init(&rights, CAP_SEND);
	if (uap->tolen != 0) {
		/* Sending to an explicit address also needs CAP_CONNECT. */
		error = getsockaddr(&to, uap->to, uap->tolen);
		if (error != 0) {
			to = NULL;
			goto sctp_bad2;
		}
		cap_rights_set(&rights, CAP_CONNECT);
	}

	AUDIT_ARG_FD(uap->sd);
	error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL);
	if (error != 0)
		goto sctp_bad1;

#ifdef COMPAT_FREEBSD32
	if (SV_CURPROC_FLAG(SV_ILP32))
		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
		    uap->iovlen, &iov, EMSGSIZE);
	else
#endif
		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
	if (error != 0)
		goto sctp_bad1;
#ifdef KTRACE
	if (to && (KTRPOINT(td, KTR_STRUCT)))
		ktrsockaddr(to);
#endif

	so = (struct socket *)fp->f_data;
	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
		error = EOPNOTSUPP;
		goto sctp_bad;
	}
#ifdef MAC
	error = mac_socket_check_send(td->td_ucred, so);
	if (error != 0)
		goto sctp_bad;
#endif /* MAC */

	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	tiov = iov;
	/* Sum the iovec lengths, rejecting overflow into negative resid. */
	for (i = 0; i < uap->iovlen; i++, tiov++) {
		if ((auio.uio_resid += tiov->iov_len) < 0) {
			error = EINVAL;
			goto sctp_bad;
		}
	}
	len = auio.uio_resid;
	CURVNET_SET(so->so_vnet);
	error = sctp_lower_sosend(so, to, &auio,
	    (struct mbuf *)NULL, (struct mbuf *)NULL,
	    uap->flags, u_sinfo, td);
	CURVNET_RESTORE();
	if (error != 0) {
		/* A partial send interrupted by a signal is not an error. */
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Generation of SIGPIPE can be controlled per socket */
		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
		    !(uap->flags & MSG_NOSIGNAL)) {
			PROC_LOCK(td->td_proc);
			tdsignal(td, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	if (error == 0)
		td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
	/*
	 * NOTE(review): ktruio is never assigned in this function, so
	 * this block is dead code — see the matching note in
	 * sys_sctp_generic_sendmsg().
	 */
	if (ktruio != NULL) {
		ktruio->uio_resid = td->td_retval[0];
		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
	}
#endif /* KTRACE */
sctp_bad:
	free(iov, M_IOV);
sctp_bad1:
	if (fp != NULL)
		fdrop(fp, td);
sctp_bad2:
	free(to, M_SONAME);
	return (error);
#else /* SCTP */
	return (EOPNOTSUPP);
#endif /*
SCTP */
}

/*
 * sctp_generic_recvmsg(2): receive a message on an SCTP socket into a
 * user iovec array, optionally returning the sender address, the
 * sctp_sndrcvinfo and the message flags.
 */
int
sys_sctp_generic_recvmsg(td, uap)
	struct thread *td;
	struct sctp_generic_recvmsg_args /* {
		int sd,
		struct iovec *iov,
		int iovlen,
		struct sockaddr *from,
		__socklen_t *fromlenaddr,
		struct sctp_sndrcvinfo *sinfo,
		int *msg_flags
	} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
	uint8_t sockbufstore[256];	/* on-stack space for the from address */
	struct uio auio;
	struct iovec *iov, *tiov;
	struct sctp_sndrcvinfo sinfo;
	struct socket *so;
	struct file *fp = NULL;
	struct sockaddr *fromsa;
	cap_rights_t rights;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif
	ssize_t len;
	int error, fromlen, i, msg_flags;

	AUDIT_ARG_FD(uap->sd);
	error = getsock_cap(td->td_proc->p_fd, uap->sd,
	    cap_rights_init(&rights, CAP_RECV), &fp, NULL);
	if (error != 0)
		return (error);
#ifdef COMPAT_FREEBSD32
	if (SV_CURPROC_FLAG(SV_ILP32))
		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
		    uap->iovlen, &iov, EMSGSIZE);
	else
#endif
		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
	if (error != 0)
		goto out1;

	so = fp->f_data;
	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
		error = EOPNOTSUPP;
		goto out;
	}
#ifdef MAC
	error = mac_socket_check_receive(td->td_ucred, so);
	if (error != 0)
		goto out;
#endif /* MAC */

	if (uap->fromlenaddr != NULL) {
		error = copyin(uap->fromlenaddr, &fromlen, sizeof (fromlen));
		if (error != 0)
			goto out;
	} else {
		fromlen = 0;
	}
	if (uap->msg_flags) {
		error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
		if (error != 0)
			goto out;
	} else {
		msg_flags = 0;
	}
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovlen;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	auio.uio_offset = 0;			/* XXX */
	auio.uio_resid = 0;
	tiov = iov;
	/* Sum the iovec lengths, rejecting overflow into negative resid. */
	for (i = 0; i < uap->iovlen; i++, tiov++) {
		if ((auio.uio_resid += tiov->iov_len) < 0) {
			error = EINVAL;
			goto out;
		}
	}
	len = auio.uio_resid;
	fromsa = (struct sockaddr *)sockbufstore;

#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(&auio);
#endif /* KTRACE */
	memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo));
	CURVNET_SET(so->so_vnet);
	error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
		    fromsa, fromlen, &msg_flags,
		    (struct sctp_sndrcvinfo *)&sinfo, 1);
	CURVNET_RESTORE();
	if (error != 0) {
		/* A partial receive interrupted by a signal is not an error. */
		if (auio.uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	} else {
		if (uap->sinfo)
			error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
	}
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = len - auio.uio_resid;
		ktrgenio(uap->sd, UIO_READ, ktruio, error);
	}
#endif /* KTRACE */
	if (error != 0)
		goto out;
	td->td_retval[0] = len - auio.uio_resid;

	if (fromlen && uap->from) {
		len = fromlen;
		/*
		 * NOTE(review): fromsa always points at sockbufstore, so
		 * the 'fromsa == 0' test can never be true — presumably a
		 * leftover defensive check; confirm before removing.
		 */
		if (len <= 0 || fromsa == 0)
			len = 0;
		else {
			len = MIN(len, fromsa->sa_len);
			error = copyout(fromsa, uap->from, (size_t)len);
			if (error != 0)
				goto out;
		}
		error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
		if (error != 0)
			goto out;
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(fromsa);
#endif
	if (uap->msg_flags) {
		error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
		if (error != 0)
			goto out;
	}
out:
	free(iov, M_IOV);
out1:
	if (fp != NULL)
		fdrop(fp, td);

	return (error);
#else /* SCTP */
	return (EOPNOTSUPP);
#endif /* SCTP */
}