/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>
#include <sys/flock.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/vmsystm.h>
#include <sys/policy.h>

#include <sys/socket.h>
#include <sys/socketvar.h>

#include <sys/isa_defs.h>
#include <sys/inttypes.h>
#include <sys/systm.h>
#include <sys/cpuvar.h>
#include <sys/filio.h>
#include <sys/sendfile.h>
#include <sys/ddi.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kpm.h>

#include <fs/sockfs/nl7c.h>
#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/sockfilter_impl.h>
#include <fs/sockfs/socktpi.h>

#ifdef SOCK_TEST
int do_useracc = 1;		/* Controlled by setting SO_DEBUG to 4 */
#else
#define	do_useracc	1
#endif /* SOCK_TEST */

extern int	xnet_truncate_print;

extern void	nl7c_init(void);
extern int	sockfs_defer_nl7c_init;

/*
 * Note: DEF_IOV_MAX is defined and used as in "fs/vncalls.c", since
 * there is no formal definition of IOV_MAX.
 */
#define	MSG_MAXIOVLEN	16

/*
 * Kernel component of socket creation.
 *
 * The socket library determines which version number to use.
 * First the library calls this with a NULL devpath. If this fails
 * to find a transport (using solookup) the library will look in /etc/netconfig
 * for the appropriate transport. If one is found it will pass in the
 * devpath for the kernel to use.
 */
int
so_socket(int family, int type_w_flags, int protocol, char *devpath,
    int version)
{
	struct sonode *so;
	vnode_t *vp;
	struct file *fp;
	int fd;
	int error;
	int type;

	type = type_w_flags & SOCK_TYPE_MASK;
	if (devpath != NULL) {
		char *buf;
		size_t kdevpathlen = 0;

		buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		if ((error = copyinstr(devpath, buf,
		    MAXPATHLEN, &kdevpathlen)) != 0) {
			kmem_free(buf, MAXPATHLEN);
			return (set_errno(error));
		}
		so = socket_create(family, type, protocol, buf, NULL,
		    SOCKET_SLEEP, version, CRED(), &error);
		kmem_free(buf, MAXPATHLEN);
	} else {
		so = socket_create(family, type, protocol, NULL, NULL,
		    SOCKET_SLEEP, version, CRED(), &error);
	}
	if (so == NULL)
		return (set_errno(error));

	/* Allocate a file descriptor for the socket */
	vp = SOTOV(so);
	if (error = falloc(vp, FWRITE|FREAD, &fp, &fd)) {
		(void) socket_close(so, 0, CRED());
		socket_destroy(so);
		return (set_errno(error));
	}

	/*
	 * Now fill in the entries that falloc reserved
	 */
	mutex_exit(&fp->f_tlock);
	setf(fd, fp);
	if ((type_w_flags & SOCK_CLOEXEC) != 0) {
		f_setfd(fd, FD_CLOEXEC);
	}

	return (fd);
}
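
/*
 * Illustrative sketch (not part of this file's build): the library first
 * invokes the system call with a NULL devpath and only consults
 * /etc/netconfig on failure.  _so_socket() and lookup_netconfig_dev() are
 * hypothetical stand-ins for the libsocket internals:
 *
 *	fd = _so_socket(family, type, protocol, NULL, SOV_DEFAULT);
 *	if (fd == -1 && errno == EPROTONOSUPPORT) {
 *		char *devpath = lookup_netconfig_dev(family, type, protocol);
 *		if (devpath != NULL)
 *			fd = _so_socket(family, type, protocol, devpath,
 *			    SOV_DEFAULT);
 *	}
 */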

/*
 * Map from a file descriptor to a socket node.
 * Returns with the file descriptor held i.e. the caller has to
 * use releasef when done with the file descriptor.
 */
struct sonode *
getsonode(int sock, int *errorp, file_t **fpp)
{
	file_t *fp;
	vnode_t *vp;
	struct sonode *so;

	if ((fp = getf(sock)) == NULL) {
		*errorp = EBADF;
		eprintline(*errorp);
		return (NULL);
	}
	vp = fp->f_vnode;
	/* Check if it is a socket */
	if (vp->v_type != VSOCK) {
		releasef(sock);
		*errorp = ENOTSOCK;
		eprintline(*errorp);
		return (NULL);
	}
	/*
	 * Use the stream head to find the real socket vnode.
	 * This is needed when namefs sits above sockfs.
	 */
	if (vp->v_stream) {
		ASSERT(vp->v_stream->sd_vnode);
		vp = vp->v_stream->sd_vnode;

		so = VTOSO(vp);
		if (so->so_version == SOV_STREAM) {
			releasef(sock);
			*errorp = ENOTSOCK;
			eprintsoline(so, *errorp);
			return (NULL);
		}
	} else {
		so = VTOSO(vp);
	}
	if (fpp)
		*fpp = fp;
	return (so);
}

/*
 * Allocate and copyin a sockaddr.
 * Ensures NUL termination for AF_UNIX addresses by extending them
 * with one NUL byte if need be. Verifies that the length is not
 * excessive to prevent an application from consuming all of kernel
 * memory. Returns NULL when an error occurs.
 */
static struct sockaddr *
copyin_name(struct sonode *so, struct sockaddr *name, socklen_t *namelenp,
    int *errorp)
{
	char	*faddr;
	size_t	namelen = (size_t)*namelenp;

	ASSERT(namelen != 0);
	if (namelen > SO_MAXARGSIZE) {
		*errorp = EINVAL;
		eprintsoline(so, *errorp);
		return (NULL);
	}

	faddr = (char *)kmem_alloc(namelen, KM_SLEEP);
	if (copyin(name, faddr, namelen)) {
		kmem_free(faddr, namelen);
		*errorp = EFAULT;
		eprintsoline(so, *errorp);
		return (NULL);
	}

	/*
	 * Add space for NUL termination if needed.
	 * Do a quick check if the last byte is NUL.
	 */
	if (so->so_family == AF_UNIX && faddr[namelen - 1] != '\0') {
		/* Check if there is any NUL termination */
		size_t	i;
		int foundnull = 0;

		for (i = sizeof (name->sa_family); i < namelen; i++) {
			if (faddr[i] == '\0') {
				foundnull = 1;
				break;
			}
		}
		if (!foundnull) {
			/* Add extra byte for NUL padding */
			char *nfaddr;

			nfaddr = (char *)kmem_alloc(namelen + 1, KM_SLEEP);
			bcopy(faddr, nfaddr, namelen);
			kmem_free(faddr, namelen);

			/* NUL terminate */
			nfaddr[namelen] = '\0';
			namelen++;
			ASSERT((socklen_t)namelen == namelen);
			*namelenp = (socklen_t)namelen;
			faddr = nfaddr;
		}
	}
	return ((struct sockaddr *)faddr);
}

/*
 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
 */
static int
copyout_arg(void *uaddr, socklen_t ulen, void *ulenp,
    void *kaddr, socklen_t klen)
{
	if (uaddr != NULL) {
		if (ulen > klen)
			ulen = klen;

		if (ulen != 0) {
			if (copyout(kaddr, uaddr, ulen))
				return (EFAULT);
		}
	} else
		ulen = 0;

	if (ulenp != NULL) {
		if (copyout(&ulen, ulenp, sizeof (ulen)))
			return (EFAULT);
	}
	return (0);
}

/*
 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
 * If klen is greater than ulen it still uses the non-truncated
 * klen to update ulenp.
 */
static int
copyout_name(void *uaddr, socklen_t ulen, void *ulenp,
    void *kaddr, socklen_t klen)
{
	if (uaddr != NULL) {
		if (ulen >= klen)
			ulen = klen;
		else if (ulen != 0 && xnet_truncate_print) {
			printf("sockfs: truncating copyout of address using "
			    "XNET semantics for pid = %d. Lengths %d, %d\n",
			    curproc->p_pid, klen, ulen);
		}

		if (ulen != 0) {
			if (copyout(kaddr, uaddr, ulen))
				return (EFAULT);
		} else
			klen = 0;
	} else
		klen = 0;

	if (ulenp != NULL) {
		if (copyout(&klen, ulenp, sizeof (klen)))
			return (EFAULT);
	}
	return (0);
}
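
/*
 * Illustrative sketch of the XNET truncation semantics implemented by
 * copyout_name() above: when the caller's buffer is too small the address
 * is truncated, but the full (untruncated) length is still stored through
 * ulenp.  A hypothetical userland caller can therefore detect truncation
 * by comparing lengths:
 *
 *	struct sockaddr_storage ss;
 *	socklen_t len = 4;			(deliberately too small)
 *
 *	if (getpeername(fd, (struct sockaddr *)&ss, &len) == 0 &&
 *	    len > 4) {
 *		... the address was truncated; retry with a larger buffer
 *	}
 */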

/*
 * The socketpair() code in libsocket creates two sockets (using
 * the /etc/netconfig fallback if needed) before calling this routine
 * to connect the two sockets together.
 *
 * For a SOCK_STREAM socketpair a listener is needed - in that case this
 * routine will create a new file descriptor as part of accepting the
 * connection. The library socketpair() will check whether sv[0] has
 * changed, in which case it will close the replaced file descriptor.
 *
 * Note that this code could use the TPI feature of accepting the connection
 * on the listening endpoint. However, that would require significant changes
 * to soaccept.
 */
int
so_socketpair(int sv[2])
{
	int svs[2];
	struct sonode *so1, *so2;
	int error;
	struct sockaddr_ux *name;
	size_t namelen;
	sotpi_info_t *sti1;
	sotpi_info_t *sti2;

	dprint(1, ("so_socketpair(%p)\n", (void *)sv));

	error = useracc(sv, sizeof (svs), B_WRITE);
	if (error && do_useracc)
		return (set_errno(EFAULT));

	if (copyin(sv, svs, sizeof (svs)))
		return (set_errno(EFAULT));

	if ((so1 = getsonode(svs[0], &error, NULL)) == NULL)
		return (set_errno(error));

	if ((so2 = getsonode(svs[1], &error, NULL)) == NULL) {
		releasef(svs[0]);
		return (set_errno(error));
	}

	if (so1->so_family != AF_UNIX || so2->so_family != AF_UNIX) {
		error = EOPNOTSUPP;
		goto done;
	}

	sti1 = SOTOTPI(so1);
	sti2 = SOTOTPI(so2);

	/*
	 * The code below makes assumptions about the "sockfs" implementation.
	 * So make sure that the correct implementation is really used.
	 */
	ASSERT(so1->so_ops == &sotpi_sonodeops);
	ASSERT(so2->so_ops == &sotpi_sonodeops);

	if (so1->so_type == SOCK_DGRAM) {
		/*
		 * Bind both sockets and connect them with each other.
		 * Need to allocate name/namelen for soconnect.
		 */
		error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC, CRED());
		if (error) {
			eprintsoline(so1, error);
			goto done;
		}
		error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
		if (error) {
			eprintsoline(so2, error);
			goto done;
		}
		namelen = sizeof (struct sockaddr_ux);
		name = kmem_alloc(namelen, KM_SLEEP);
		name->sou_family = AF_UNIX;
		name->sou_addr = sti2->sti_ux_laddr;
		error = socket_connect(so1,
		    (struct sockaddr *)name,
		    (socklen_t)namelen,
		    0, _SOCONNECT_NOXLATE, CRED());
		if (error) {
			kmem_free(name, namelen);
			eprintsoline(so1, error);
			goto done;
		}
		name->sou_addr = sti1->sti_ux_laddr;
		error = socket_connect(so2,
		    (struct sockaddr *)name,
		    (socklen_t)namelen,
		    0, _SOCONNECT_NOXLATE, CRED());
		kmem_free(name, namelen);
		if (error) {
			eprintsoline(so2, error);
			goto done;
		}
		releasef(svs[0]);
		releasef(svs[1]);
	} else {
		/*
		 * Bind both sockets, with so1 being a listener.
		 * Connect so2 to so1 - nonblocking to avoid waiting for
		 * soaccept to complete.
		 * Accept a connection on so1. Pass out the new fd as sv[0].
		 * The library will detect the changed fd and close
		 * the original one.
		 */
		struct sonode *nso;
		struct vnode *nvp;
		struct file *nfp;
		int nfd;

		/*
		 * We could simply call socket_listen() here (which would do
		 * the binding automatically) if the code didn't rely on
		 * passing _SOBIND_NOXLATE to the TPI implementation of
		 * socket_bind().
		 */
		error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC|
		    _SOBIND_NOXLATE|_SOBIND_LISTEN|_SOBIND_SOCKETPAIR,
		    CRED());
		if (error) {
			eprintsoline(so1, error);
			goto done;
		}
		error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
		if (error) {
			eprintsoline(so2, error);
			goto done;
		}

		namelen = sizeof (struct sockaddr_ux);
		name = kmem_alloc(namelen, KM_SLEEP);
		name->sou_family = AF_UNIX;
		name->sou_addr = sti1->sti_ux_laddr;
		error = socket_connect(so2,
		    (struct sockaddr *)name,
		    (socklen_t)namelen,
		    FNONBLOCK, _SOCONNECT_NOXLATE, CRED());
		kmem_free(name, namelen);
		if (error) {
			if (error != EINPROGRESS) {
				eprintsoline(so2, error);
				goto done;
			}
		}

		error = socket_accept(so1, 0, CRED(), &nso);
		if (error) {
			eprintsoline(so1, error);
			goto done;
		}

		/* Wait for so2 to become SS_CONNECTED, ignoring signals. */
		mutex_enter(&so2->so_lock);
		error = sowaitconnected(so2, 0, 1);
		mutex_exit(&so2->so_lock);
		if (error != 0) {
			(void) socket_close(nso, 0, CRED());
			socket_destroy(nso);
			eprintsoline(so2, error);
			goto done;
		}

		nvp = SOTOV(nso);
		if (error = falloc(nvp, FWRITE|FREAD, &nfp, &nfd)) {
			(void) socket_close(nso, 0, CRED());
			socket_destroy(nso);
			eprintsoline(nso, error);
			goto done;
		}
		/*
		 * Fill in the entries that falloc reserved
		 */
		mutex_exit(&nfp->f_tlock);
		setf(nfd, nfp);

		releasef(svs[0]);
		releasef(svs[1]);
		svs[0] = nfd;

		/*
		 * The socketpair library routine will close the original
		 * svs[0] when this code passes out a different file
		 * descriptor.
		 */
		if (copyout(svs, sv, sizeof (svs))) {
			(void) closeandsetf(nfd, NULL);
			eprintline(EFAULT);
			return (set_errno(EFAULT));
		}
	}
	return (0);

done:
	releasef(svs[0]);
	releasef(svs[1]);
	return (set_errno(error));
}
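
/*
 * Illustrative sketch (userland, hypothetical): by the time
 * socketpair(3SOCKET) returns, the listener machinery above is invisible;
 * the caller simply sees two connected descriptors, and the library has
 * already closed the original sv[0] if so_socketpair() passed back a new
 * one:
 *
 *	int sv[2];
 *	char buf[4];
 *
 *	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0) {
 *		(void) write(sv[0], "ping", 4);
 *		(void) read(sv[1], buf, sizeof (buf));
 *	}
 */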

int
bind(int sock, struct sockaddr *name, socklen_t namelen, int version)
{
	struct sonode *so;
	int error;

	dprint(1, ("bind(%d, %p, %d)\n",
	    sock, (void *)name, namelen));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		return (set_errno(error));

	/* Allocate and copyin name */
	/*
	 * X/Open test does not expect EFAULT with NULL name and non-zero
	 * namelen.
	 */
	if (name != NULL && namelen != 0) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		name = copyin_name(so, name, &namelen, &error);
		if (name == NULL) {
			releasef(sock);
			return (set_errno(error));
		}
	} else {
		name = NULL;
		namelen = 0;
	}

	switch (version) {
	default:
		error = socket_bind(so, name, namelen, 0, CRED());
		break;
	case SOV_XPG4_2:
		error = socket_bind(so, name, namelen, _SOBIND_XPG4_2, CRED());
		break;
	case SOV_SOCKBSD:
		error = socket_bind(so, name, namelen, _SOBIND_SOCKBSD, CRED());
		break;
	}
done:
	releasef(sock);
	if (name != NULL)
		kmem_free(name, (size_t)namelen);

	if (error)
		return (set_errno(error));
	return (0);
}

/* ARGSUSED2 */
int
listen(int sock, int backlog, int version)
{
	struct sonode *so;
	int error;

	dprint(1, ("listen(%d, %d)\n",
	    sock, backlog));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		return (set_errno(error));

	error = socket_listen(so, backlog, CRED());

	releasef(sock);
	if (error)
		return (set_errno(error));
	return (0);
}

/*ARGSUSED3*/
int
accept(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
{
	struct sonode *so;
	file_t *fp;
	int error;
	socklen_t namelen;
	struct sonode *nso;
	struct vnode *nvp;
	struct file *nfp;
	int nfd;
	struct sockaddr *addrp;
	socklen_t addrlen;

	dprint(1, ("accept(%d, %p, %p)\n",
	    sock, (void *)name, (void *)namelenp));

	if ((so = getsonode(sock, &error, &fp)) == NULL)
		return (set_errno(error));

	if (name != NULL) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		if (copyin(namelenp, &namelen, sizeof (namelen))) {
			releasef(sock);
			return (set_errno(EFAULT));
		}
		if (namelen != 0) {
			error = useracc(name, (size_t)namelen, B_WRITE);
			if (error && do_useracc) {
				releasef(sock);
				return (set_errno(EFAULT));
			}
		} else
			name = NULL;
	} else {
		namelen = 0;
	}

	/*
	 * Allocate the user fd before socket_accept() in order to
	 * catch EMFILE errors before calling socket_accept().
	 */
	if ((nfd = ufalloc(0)) == -1) {
		eprintsoline(so, EMFILE);
		releasef(sock);
		return (set_errno(EMFILE));
	}
	error = socket_accept(so, fp->f_flag, CRED(), &nso);
	if (error) {
		setf(nfd, NULL);
		releasef(sock);
		return (set_errno(error));
	}

	nvp = SOTOV(nso);

	ASSERT(MUTEX_NOT_HELD(&nso->so_lock));
	if (namelen != 0) {
		addrlen = so->so_max_addr_len;
		addrp = (struct sockaddr *)kmem_alloc(addrlen, KM_SLEEP);

		if ((error = socket_getpeername(nso, (struct sockaddr *)addrp,
		    &addrlen, B_TRUE, CRED())) == 0) {
			error = copyout_name(name, namelen, namelenp,
			    addrp, addrlen);
		} else {
			ASSERT(error == EINVAL || error == ENOTCONN);
			error = ECONNABORTED;
		}
		kmem_free(addrp, so->so_max_addr_len);
	}

	if (error) {
		setf(nfd, NULL);
		(void) socket_close(nso, 0, CRED());
		socket_destroy(nso);
		releasef(sock);
		return (set_errno(error));
	}
	if (error = falloc(NULL, FWRITE|FREAD, &nfp, NULL)) {
		setf(nfd, NULL);
		(void) socket_close(nso, 0, CRED());
		socket_destroy(nso);
		eprintsoline(so, error);
		releasef(sock);
		return (set_errno(error));
	}
	/*
	 * Fill in the entries that falloc reserved
	 */
	nfp->f_vnode = nvp;
	mutex_exit(&nfp->f_tlock);
	setf(nfd, nfp);

	/*
	 * Copy FNDELAY and FNONBLOCK from listener to acceptor
	 */
	if (so->so_state & (SS_NDELAY|SS_NONBLOCK)) {
		uint_t oflag = nfp->f_flag;
		int arg = 0;

		if (so->so_state & SS_NONBLOCK)
			arg |= FNONBLOCK;
		else if (so->so_state & SS_NDELAY)
			arg |= FNDELAY;

		/*
		 * This code is a simplification of the F_SETFL code in
		 * fcntl(). Ignore any errors from VOP_SETFL.
		 */
		if ((error = VOP_SETFL(nvp, oflag, arg, nfp->f_cred, NULL))
		    != 0) {
			eprintsoline(so, error);
			error = 0;
		} else {
			mutex_enter(&nfp->f_tlock);
			nfp->f_flag &= ~FMASK | (FREAD|FWRITE);
			nfp->f_flag |= arg;
			mutex_exit(&nfp->f_tlock);
		}
	}
	releasef(sock);
	return (nfd);
}
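
/*
 * Illustrative sketch (userland, hypothetical): namelenp is read before
 * socket_accept() and rewritten afterwards with the peer's address length,
 * so callers must re-initialize it before every call:
 *
 *	struct sockaddr_storage ss;
 *	socklen_t len;
 *	int nfd;
 *
 *	for (;;) {
 *		len = sizeof (ss);		(re-arm before each accept)
 *		nfd = accept(lfd, (struct sockaddr *)&ss, &len);
 *		...
 *	}
 */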

int
connect(int sock, struct sockaddr *name, socklen_t namelen, int version)
{
	struct sonode *so;
	file_t *fp;
	int error;

	dprint(1, ("connect(%d, %p, %d)\n",
	    sock, (void *)name, namelen));

	if ((so = getsonode(sock, &error, &fp)) == NULL)
		return (set_errno(error));

	/* Allocate and copyin name */
	if (namelen != 0) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		name = copyin_name(so, name, &namelen, &error);
		if (name == NULL) {
			releasef(sock);
			return (set_errno(error));
		}
	} else
		name = NULL;

	error = socket_connect(so, name, namelen, fp->f_flag,
	    (version != SOV_XPG4_2) ? 0 : _SOCONNECT_XPG4_2, CRED());
	releasef(sock);
	if (name)
		kmem_free(name, (size_t)namelen);
	if (error)
		return (set_errno(error));
	return (0);
}

/*ARGSUSED2*/
int
shutdown(int sock, int how, int version)
{
	struct sonode *so;
	int error;

	dprint(1, ("shutdown(%d, %d)\n",
	    sock, how));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		return (set_errno(error));

	error = socket_shutdown(so, how, CRED());

	releasef(sock);
	if (error)
		return (set_errno(error));
	return (0);
}

/*
 * Common receive routine.
 */
static ssize_t
recvit(int sock,
	struct nmsghdr *msg,
	struct uio *uiop,
	int flags,
	socklen_t *namelenp,
	socklen_t *controllenp,
	int *flagsp)
{
	struct sonode *so;
	file_t *fp;
	void *name;
	socklen_t namelen;
	void *control;
	socklen_t controllen;
	ssize_t len;
	int error;

	if ((so = getsonode(sock, &error, &fp)) == NULL)
		return (set_errno(error));

	len = uiop->uio_resid;
	uiop->uio_fmode = fp->f_flag;
	uiop->uio_extflg = UIO_COPY_CACHED;

	name = msg->msg_name;
	namelen = msg->msg_namelen;
	control = msg->msg_control;
	controllen = msg->msg_controllen;

	msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
	    MSG_DONTWAIT | MSG_XPG4_2);

	error = socket_recvmsg(so, msg, uiop, CRED());
	if (error) {
		releasef(sock);
		return (set_errno(error));
	}
	lwp_stat_update(LWP_STAT_MSGRCV, 1);
	releasef(sock);

	error = copyout_name(name, namelen, namelenp,
	    msg->msg_name, msg->msg_namelen);
	if (error)
		goto err;

	if (flagsp != NULL) {
		/*
		 * Clear internal flag.
		 */
		msg->msg_flags &= ~MSG_XPG4_2;

		/*
		 * Determine MSG_CTRUNC. sorecvmsg sets MSG_CTRUNC only
		 * when controllen is zero and there is control data to
		 * copy out.
		 */
		if (controllen != 0 &&
		    (msg->msg_controllen > controllen || control == NULL)) {
			dprint(1, ("recvit: CTRUNC %d %d %p\n",
			    msg->msg_controllen, controllen, control));

			msg->msg_flags |= MSG_CTRUNC;
		}
		if (copyout(&msg->msg_flags, flagsp,
		    sizeof (msg->msg_flags))) {
			error = EFAULT;
			goto err;
		}
	}
	/*
	 * Note: This MUST be done last. There can be no "goto err" after this
	 * point since it could make so_closefds run twice on some part
	 * of the file descriptor array.
	 */
	if (controllen != 0) {
		if (!(flags & MSG_XPG4_2)) {
			/*
			 * Good old msg_accrights can only return a multiple
			 * of 4 bytes.
			 */
			controllen &= ~((int)sizeof (uint32_t) - 1);
		}
		error = copyout_arg(control, controllen, controllenp,
		    msg->msg_control, msg->msg_controllen);
		if (error)
			goto err;

		if (msg->msg_controllen > controllen || control == NULL) {
			if (control == NULL)
				controllen = 0;
			so_closefds(msg->msg_control, msg->msg_controllen,
			    !(flags & MSG_XPG4_2), controllen);
		}
	}
	if (msg->msg_namelen != 0)
		kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
	if (msg->msg_controllen != 0)
		kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
	return (len - uiop->uio_resid);

err:
	/*
	 * If we fail and the control part contains file descriptors,
	 * we have to close them.
	 */
	if (msg->msg_controllen != 0)
		so_closefds(msg->msg_control, msg->msg_controllen,
		    !(flags & MSG_XPG4_2), 0);
	if (msg->msg_namelen != 0)
		kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
	if (msg->msg_controllen != 0)
		kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
	return (set_errno(error));
}
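
/*
 * Illustrative sketch (userland, hypothetical): the MSG_CTRUNC logic above
 * surfaces to XPG 4.2 callers through msg_flags, so a receiver of control
 * data (e.g. passed file descriptors) can detect a control buffer that was
 * too small:
 *
 *	struct msghdr msg;
 *	char cbuf[64];			(possibly too small for the sender)
 *
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof (cbuf);
 *	if (recvmsg(fd, &msg, 0) >= 0 &&
 *	    (msg.msg_flags & MSG_CTRUNC)) {
 *		... control data, and any fds in it, was truncated
 *	}
 */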

/*
 * Native system call
 */
ssize_t
recv(int sock, void *buffer, size_t len, int flags)
{
	struct nmsghdr lmsg;
	struct uio auio;
	struct iovec aiov[1];

	dprint(1, ("recv(%d, %p, %ld, %d)\n",
	    sock, buffer, len, flags));

	if ((ssize_t)len < 0) {
		return (set_errno(EINVAL));
	}

	aiov[0].iov_base = buffer;
	aiov[0].iov_len = len;
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	lmsg.msg_namelen = 0;
	lmsg.msg_controllen = 0;
	lmsg.msg_flags = 0;
	return (recvit(sock, &lmsg, &auio, flags, NULL, NULL, NULL));
}

ssize_t
recvfrom(int sock, void *buffer, size_t len, int flags,
	struct sockaddr *name, socklen_t *namelenp)
{
	struct nmsghdr lmsg;
	struct uio auio;
	struct iovec aiov[1];

	dprint(1, ("recvfrom(%d, %p, %ld, %d, %p, %p)\n",
	    sock, buffer, len, flags, (void *)name, (void *)namelenp));

	if ((ssize_t)len < 0) {
		return (set_errno(EINVAL));
	}

	aiov[0].iov_base = buffer;
	aiov[0].iov_len = len;
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	lmsg.msg_name = (char *)name;
	if (namelenp != NULL) {
		if (copyin(namelenp, &lmsg.msg_namelen,
		    sizeof (lmsg.msg_namelen)))
			return (set_errno(EFAULT));
	} else {
		lmsg.msg_namelen = 0;
	}
	lmsg.msg_controllen = 0;
	lmsg.msg_flags = 0;

	return (recvit(sock, &lmsg, &auio, flags, namelenp, NULL, NULL));
}

/*
 * Uses the MSG_XPG4_2 flag to determine if the caller is using
 * struct omsghdr or struct nmsghdr.
 */
ssize_t
recvmsg(int sock, struct nmsghdr *msg, int flags)
{
	STRUCT_DECL(nmsghdr, u_lmsg);
	STRUCT_HANDLE(nmsghdr, umsgptr);
	struct nmsghdr lmsg;
	struct uio auio;
	struct iovec aiov[MSG_MAXIOVLEN];
	int iovcnt;
	ssize_t len;
	int i;
	int *flagsp;
	model_t	model;

	dprint(1, ("recvmsg(%d, %p, %d)\n",
	    sock, (void *)msg, flags));

	model = get_udatamodel();
	STRUCT_INIT(u_lmsg, model);
	STRUCT_SET_HANDLE(umsgptr, model, msg);

	if (flags & MSG_XPG4_2) {
		if (copyin(msg, STRUCT_BUF(u_lmsg), STRUCT_SIZE(u_lmsg)))
			return (set_errno(EFAULT));
		flagsp = STRUCT_FADDR(umsgptr, msg_flags);
	} else {
		/*
		 * Assumes that nmsghdr and omsghdr are identically shaped
		 * except for the added msg_flags field.
		 */
		if (copyin(msg, STRUCT_BUF(u_lmsg),
		    SIZEOF_STRUCT(omsghdr, model)))
			return (set_errno(EFAULT));
		STRUCT_FSET(u_lmsg, msg_flags, 0);
		flagsp = NULL;
	}

	/*
	 * The code below will kmem_alloc memory and hang it off the
	 * msg_control and msg_name fields, which forces us to copy the
	 * structure to its native form.
	 */
	lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
	lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
	lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
	lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
	lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
	lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
	lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);

	iovcnt = lmsg.msg_iovlen;

	if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
		return (set_errno(EMSGSIZE));
	}

#ifdef _SYSCALL32_IMPL
	/*
	 * 32-bit callers need to have their iovec expanded, while ensuring
	 * that they can't move more than 2Gbytes of data in a single call.
	 */
	if (model == DATAMODEL_ILP32) {
		struct iovec32 aiov32[MSG_MAXIOVLEN];
		ssize32_t count32;

		if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
		    iovcnt * sizeof (struct iovec32)))
			return (set_errno(EFAULT));

		count32 = 0;
		for (i = 0; i < iovcnt; i++) {
			ssize32_t iovlen32;

			iovlen32 = aiov32[i].iov_len;
			count32 += iovlen32;
			if (iovlen32 < 0 || count32 < 0)
				return (set_errno(EINVAL));
			aiov[i].iov_len = iovlen32;
			aiov[i].iov_base =
			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
		}
	} else
#endif /* _SYSCALL32_IMPL */
	if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) {
		return (set_errno(EFAULT));
	}
	len = 0;
	for (i = 0; i < iovcnt; i++) {
		ssize_t iovlen = aiov[i].iov_len;
		len += iovlen;
		if (iovlen < 0 || len < 0) {
			return (set_errno(EINVAL));
		}
	}
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	if (lmsg.msg_control != NULL &&
	    (do_useracc == 0 ||
	    useracc(lmsg.msg_control, lmsg.msg_controllen,
	    B_WRITE) != 0)) {
		return (set_errno(EFAULT));
	}

	return (recvit(sock, &lmsg, &auio, flags,
	    STRUCT_FADDR(umsgptr, msg_namelen),
	    STRUCT_FADDR(umsgptr, msg_controllen), flagsp));
}
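
/*
 * The ILP32 expansion above guards against requests summing to more than
 * 2 Gbytes by accumulating the lengths in a signed 32-bit counter and
 * failing when it wraps negative.  A minimal sketch of the same check in
 * isolation (hypothetical helper, not part of this file):
 *
 *	static int
 *	iov32_total_ok(const struct iovec32 *iov, int cnt)
 *	{
 *		ssize32_t total = 0;
 *		int i;
 *
 *		for (i = 0; i < cnt; i++) {
 *			if ((ssize32_t)iov[i].iov_len < 0)
 *				return (0);
 *			total += (ssize32_t)iov[i].iov_len;
 *			if (total < 0)
 *				return (0);	(signed wrap: too large)
 *		}
 *		return (1);
 *	}
 */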

/*
 * Common send function.
 */
static ssize_t
sendit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags)
{
	struct sonode *so;
	file_t *fp;
	void *name;
	socklen_t namelen;
	void *control;
	socklen_t controllen;
	ssize_t len;
	int error;

	if ((so = getsonode(sock, &error, &fp)) == NULL)
		return (set_errno(error));

	uiop->uio_fmode = fp->f_flag;

	if (so->so_family == AF_UNIX)
		uiop->uio_extflg = UIO_COPY_CACHED;
	else
		uiop->uio_extflg = UIO_COPY_DEFAULT;

	/* Allocate and copyin name and control */
	name = msg->msg_name;
	namelen = msg->msg_namelen;
	if (name != NULL && namelen != 0) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		name = copyin_name(so,
		    (struct sockaddr *)name,
		    &namelen, &error);
		if (name == NULL)
			goto done3;
		/* copyin_name NUL-terminates addresses for AF_UNIX */
		msg->msg_namelen = namelen;
		msg->msg_name = name;
	} else {
		msg->msg_name = name = NULL;
		msg->msg_namelen = namelen = 0;
	}

	control = msg->msg_control;
	controllen = msg->msg_controllen;
	if ((control != NULL) && (controllen != 0)) {
		/*
		 * Verify that the length is not excessive to prevent
		 * an application from consuming all of kernel memory.
		 */
		if (controllen > SO_MAXARGSIZE) {
			error = EINVAL;
			goto done2;
		}
		control = kmem_alloc(controllen, KM_SLEEP);

		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		if (copyin(msg->msg_control, control, controllen)) {
			error = EFAULT;
			goto done1;
		}
		msg->msg_control = control;
	} else {
		msg->msg_control = control = NULL;
		msg->msg_controllen = controllen = 0;
	}

	len = uiop->uio_resid;
	msg->msg_flags = flags;

	error = socket_sendmsg(so, msg, uiop, CRED());
done1:
	if (control != NULL)
		kmem_free(control, controllen);
done2:
	if (name != NULL)
		kmem_free(name, namelen);
done3:
	if (error != 0) {
		releasef(sock);
		return (set_errno(error));
	}
	lwp_stat_update(LWP_STAT_MSGSND, 1);
	releasef(sock);
	return (len - uiop->uio_resid);
}

/*
 * Native system call
 */
ssize_t
send(int sock, void *buffer, size_t len, int flags)
{
	struct nmsghdr lmsg;
	struct uio auio;
	struct iovec aiov[1];

	dprint(1, ("send(%d, %p, %ld, %d)\n",
	    sock, buffer, len, flags));

	if ((ssize_t)len < 0) {
		return (set_errno(EINVAL));
	}

	aiov[0].iov_base = buffer;
	aiov[0].iov_len = len;
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	lmsg.msg_name = NULL;
	lmsg.msg_control = NULL;
	if (!(flags & MSG_XPG4_2)) {
		/*
		 * In order to be compatible with the libsocket/sockmod
		 * implementation we set EOR for all send* calls.
		 */
		flags |= MSG_EOR;
	}
	return (sendit(sock, &lmsg, &auio, flags));
}

/*
 * Uses the MSG_XPG4_2 flag to determine if the caller is using
 * struct omsghdr or struct nmsghdr.
 */
ssize_t
sendmsg(int sock, struct nmsghdr *msg, int flags)
{
	struct nmsghdr lmsg;
	STRUCT_DECL(nmsghdr, u_lmsg);
	struct uio auio;
	struct iovec aiov[MSG_MAXIOVLEN];
	int iovcnt;
	ssize_t len;
	int i;
	model_t	model;

	dprint(1, ("sendmsg(%d, %p, %d)\n", sock, (void *)msg, flags));

	model = get_udatamodel();
	STRUCT_INIT(u_lmsg, model);

	if (flags & MSG_XPG4_2) {
		if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
		    STRUCT_SIZE(u_lmsg)))
			return (set_errno(EFAULT));
	} else {
		/*
		 * Assumes that nmsghdr and omsghdr are identically shaped
		 * except for the added msg_flags field.
		 */
		if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
		    SIZEOF_STRUCT(omsghdr, model)))
			return (set_errno(EFAULT));
		/*
		 * In order to be compatible with the libsocket/sockmod
		 * implementation we set EOR for all send* calls.
		 */
		flags |= MSG_EOR;
	}

	/*
	 * The code below will kmem_alloc memory and hang it off the
	 * msg_control and msg_name fields, which forces us to copy the
	 * structure to its native form.
	 */
	lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
	lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
	lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
	lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
	lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
	lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
	lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);

	iovcnt = lmsg.msg_iovlen;

	if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
		/*
		 * Unless this is XPG 4.2 we allow iovcnt == 0 to
		 * be compatible with SunOS 4.X and 4.4BSD.
		 */
		if (iovcnt != 0 || (flags & MSG_XPG4_2))
			return (set_errno(EMSGSIZE));
	}

#ifdef _SYSCALL32_IMPL
	/*
	 * 32-bit callers need to have their iovec expanded, while ensuring
	 * that they can't move more than 2Gbytes of data in a single call.
	 */
	if (model == DATAMODEL_ILP32) {
		struct iovec32 aiov32[MSG_MAXIOVLEN];
		ssize32_t count32;

		if (iovcnt != 0 &&
		    copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
		    iovcnt * sizeof (struct iovec32)))
			return (set_errno(EFAULT));

		count32 = 0;
		for (i = 0; i < iovcnt; i++) {
			ssize32_t iovlen32;

			iovlen32 = aiov32[i].iov_len;
			count32 += iovlen32;
			if (iovlen32 < 0 || count32 < 0)
				return (set_errno(EINVAL));
			aiov[i].iov_len = iovlen32;
			aiov[i].iov_base =
			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
		}
	} else
#endif /* _SYSCALL32_IMPL */
	if (iovcnt != 0 &&
	    copyin(lmsg.msg_iov, aiov,
	    (unsigned)iovcnt * sizeof (struct iovec))) {
		return (set_errno(EFAULT));
	}
	len = 0;
	for (i = 0; i < iovcnt; i++) {
		ssize_t iovlen = aiov[i].iov_len;
		len += iovlen;
		if (iovlen < 0 || len < 0) {
			return (set_errno(EINVAL));
		}
	}
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	return (sendit(sock, &lmsg, &auio, flags));
}

ssize_t
sendto(int sock, void *buffer, size_t len, int flags,
    struct sockaddr *name, socklen_t namelen)
{
	struct nmsghdr lmsg;
	struct uio auio;
	struct iovec aiov[1];

	dprint(1, ("sendto(%d, %p, %ld, %d, %p, %d)\n",
	    sock, buffer, len, flags, (void *)name, namelen));

	if ((ssize_t)len < 0) {
		return (set_errno(EINVAL));
	}

	aiov[0].iov_base = buffer;
	aiov[0].iov_len = len;
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	lmsg.msg_name = (char *)name;
	lmsg.msg_namelen = namelen;
	lmsg.msg_control = NULL;
	if (!(flags & MSG_XPG4_2)) {
		/*
		 * In order to be compatible with the libsocket/sockmod
		 * implementation we set EOR for all send* calls.
		 */
		flags |= MSG_EOR;
	}
	return (sendit(sock, &lmsg, &auio, flags));
}

/*ARGSUSED3*/
int
getpeername(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
{
	struct sonode *so;
	int error;
	socklen_t namelen;
	socklen_t sock_addrlen;
	struct sockaddr *sock_addrp;

	dprint(1, ("getpeername(%d, %p, %p)\n",
	    sock, (void *)name, (void *)namelenp));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		goto bad;

	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
	if (copyin(namelenp, &namelen, sizeof (namelen)) ||
	    (name == NULL && namelen != 0)) {
		error = EFAULT;
		goto rel_out;
	}
	sock_addrlen = so->so_max_addr_len;
	sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);

	if ((error = socket_getpeername(so, sock_addrp, &sock_addrlen,
	    B_FALSE, CRED())) == 0) {
		ASSERT(sock_addrlen <= so->so_max_addr_len);
		error = copyout_name(name, namelen, namelenp,
		    (void *)sock_addrp, sock_addrlen);
	}
	kmem_free(sock_addrp, so->so_max_addr_len);
rel_out:
	releasef(sock);
bad:
	return (error != 0 ? set_errno(error) : 0);
}

/*ARGSUSED3*/
int
getsockname(int sock, struct sockaddr *name,
	socklen_t *namelenp, int version)
{
	struct sonode *so;
	int error;
	socklen_t namelen, sock_addrlen;
	struct sockaddr *sock_addrp;

	dprint(1, ("getsockname(%d, %p, %p)\n",
	    sock, (void *)name, (void *)namelenp));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		goto bad;

	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
	if (copyin(namelenp, &namelen, sizeof (namelen)) ||
	    (name == NULL && namelen != 0)) {
		error = EFAULT;
		goto rel_out;
	}

	sock_addrlen = so->so_max_addr_len;
	sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
	if ((error = socket_getsockname(so, sock_addrp, &sock_addrlen,
	    CRED())) == 0) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		ASSERT(sock_addrlen <= so->so_max_addr_len);
		error = copyout_name(name, namelen, namelenp,
		    (void *)sock_addrp, sock_addrlen);
	}
	kmem_free(sock_addrp, so->so_max_addr_len);
rel_out:
	releasef(sock);
bad:
	return (error != 0 ? set_errno(error) : 0);
}

/*ARGSUSED5*/
int
getsockopt(int sock,
	int level,
	int option_name,
	void *option_value,
	socklen_t *option_lenp,
	int version)
{
	struct sonode *so;
	socklen_t optlen, optlen_res;
	void *optval;
	int error;

	dprint(1, ("getsockopt(%d, %d, %d, %p, %p)\n",
	    sock, level, option_name, option_value, (void *)option_lenp));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		return (set_errno(error));

	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
	if (copyin(option_lenp, &optlen, sizeof (optlen))) {
		releasef(sock);
		return (set_errno(EFAULT));
	}
	/*
	 * Verify that the length is not excessive to prevent
	 * an application from consuming all of kernel memory.
	 */
	if (optlen > SO_MAXARGSIZE) {
		error = EINVAL;
		releasef(sock);
		return (set_errno(error));
	}
	optval = kmem_alloc(optlen, KM_SLEEP);
	optlen_res = optlen;
	error = socket_getsockopt(so, level, option_name, optval,
	    &optlen_res, (version != SOV_XPG4_2) ? 0 : _SOGETSOCKOPT_XPG4_2,
	    CRED());
	releasef(sock);
	if (error) {
		kmem_free(optval, optlen);
		return (set_errno(error));
	}
	error = copyout_arg(option_value, optlen, option_lenp,
	    optval, optlen_res);
	kmem_free(optval, optlen);
	if (error)
		return (set_errno(error));
	return (0);
}

/*ARGSUSED5*/
int
setsockopt(int sock,
	int level,
	int option_name,
	void *option_value,
	socklen_t option_len,
	int version)
{
	struct sonode *so;
	intptr_t buffer[2];
	void *optval = NULL;
	int error;

	dprint(1, ("setsockopt(%d, %d, %d, %p, %d)\n",
	    sock, level, option_name, option_value, option_len));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		return (set_errno(error));

	if (option_value != NULL) {
		if (option_len != 0) {
			/*
			 * Verify that the length is not excessive to prevent
			 * an application from consuming all of kernel memory.
			 */
			if (option_len > SO_MAXARGSIZE) {
				error = EINVAL;
				goto done2;
			}
			optval = option_len <= sizeof (buffer) ?
			    &buffer : kmem_alloc((size_t)option_len, KM_SLEEP);
			ASSERT(MUTEX_NOT_HELD(&so->so_lock));
			if (copyin(option_value, optval, (size_t)option_len)) {
				error = EFAULT;
				goto done1;
			}
		}
	} else
		option_len = 0;

	error = socket_setsockopt(so, level, option_name, optval,
	    (t_uscalar_t)option_len, CRED());
done1:
	if (optval != buffer)
		kmem_free(optval, (size_t)option_len);
done2:
	releasef(sock);
	if (error)
		return (set_errno(error));
	return (0);
}
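
/*
 * Illustrative sketch (userland, hypothetical): option_lenp is both an
 * input (buffer size) and an output (amount of data returned), mirroring
 * the copyout_arg() behavior above:
 *
 *	int rcvbuf;
 *	socklen_t len = sizeof (rcvbuf);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, &len) == 0) {
 *		... len now holds the number of bytes actually stored
 *	}
 */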

static int
sockconf_add_sock(int family, int type, int protocol, char *name)
{
	int error = 0;
	char *kdevpath = NULL;
	char *kmodule = NULL;
	char *buf = NULL;
	size_t pathlen = 0;
	struct sockparams *sp;

	if (name == NULL)
		return (EINVAL);
	/*
	 * Copyin the name.
	 * This also makes it possible to check for overly long pathnames.
	 * Compress the space needed for the name before passing it
	 * to soconfig - soconfig will store the string until
	 * the configuration is removed.
	 */
	buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) {
		kmem_free(buf, MAXPATHLEN);
		return (error);
	}
	if (strncmp(buf, "/dev", strlen("/dev")) == 0) {
		/* For device */

		/*
		 * Special handling for NCA:
		 *
		 * DEV_NCA is never opened even if an application
		 * requests AF_NCA. The device opened is instead a
		 * predefined AF_INET transport (NCA_INET_DEV).
		 *
		 * Prior to Volo (PSARC/2007/587) NCA would determine
		 * the device using a lookup, which worked then because
		 * all protocols were based on TPI. Since TPI is no
		 * longer the default, we have to explicitly state
		 * which device to use.
		 */
		if (strcmp(buf, NCA_DEV) == 0) {
			/* only support entry <28, 2, 0> */
			if (family != AF_NCA || type != SOCK_STREAM ||
			    protocol != 0) {
				kmem_free(buf, MAXPATHLEN);
				return (EINVAL);
			}

			pathlen = strlen(NCA_INET_DEV) + 1;
			kdevpath = kmem_alloc(pathlen, KM_SLEEP);
			bcopy(NCA_INET_DEV, kdevpath, pathlen);
			kdevpath[pathlen - 1] = '\0';
		} else {
			kdevpath = kmem_alloc(pathlen, KM_SLEEP);
			bcopy(buf, kdevpath, pathlen);
			kdevpath[pathlen - 1] = '\0';
		}
	} else {
		/* For socket module */
		kmodule = kmem_alloc(pathlen, KM_SLEEP);
		bcopy(buf, kmodule, pathlen);
		kmodule[pathlen - 1] = '\0';
		pathlen = 0;
	}
	kmem_free(buf, MAXPATHLEN);

	/* sockparams_create frees mod name and devpath upon failure */
	sp = sockparams_create(family, type, protocol, kmodule,
	    kdevpath, pathlen, 0, KM_SLEEP, &error);
	if (sp != NULL) {
		error = sockparams_add(sp);
		if (error != 0)
			sockparams_destroy(sp);
	}

	return (error);
}

static int
sockconf_remove_sock(int family, int type, int protocol)
{
	return (sockparams_delete(family, type, protocol));
}

static int
sockconfig_remove_filter(const char *uname)
{
	char kname[SOF_MAXNAMELEN];
	size_t len;
	int error;
	sof_entry_t *ent;

	if ((error = copyinstr(uname, kname, SOF_MAXNAMELEN, &len)) != 0)
		return (error);

	ent = sof_entry_remove_by_name(kname);
	if (ent == NULL)
		return (ENXIO);

	mutex_enter(&ent->sofe_lock);
	ASSERT(!(ent->sofe_flags & SOFEF_CONDEMED));
	if (ent->sofe_refcnt == 0) {
		mutex_exit(&ent->sofe_lock);
		sof_entry_free(ent);
	} else {
		/* let the last socket free the filter */
		ent->sofe_flags |= SOFEF_CONDEMED;
		mutex_exit(&ent->sofe_lock);
	}

	return (0);
}

static int
sockconfig_add_filter(const char *uname, void *ufilpropp)
{
	struct sockconfig_filter_props filprop;
	sof_entry_t *ent;
	int error;
	size_t tuplesz, len;
	char hintbuf[SOF_MAXNAMELEN];

	ent = kmem_zalloc(sizeof (sof_entry_t), KM_SLEEP);
	mutex_init(&ent->sofe_lock, NULL, MUTEX_DEFAULT, NULL);

	if ((error = copyinstr(uname, ent->sofe_name, SOF_MAXNAMELEN,
	    &len)) != 0) {
		sof_entry_free(ent);
		return (error);
	}

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin(ufilpropp, &filprop, sizeof (filprop)) != 0) {
			sof_entry_free(ent);
			return (EFAULT);
		}
	}
#ifdef	_SYSCALL32_IMPL
	else {
		struct sockconfig_filter_props32 filprop32;

		if (copyin(ufilpropp, &filprop32, sizeof (filprop32)) != 0) {
			sof_entry_free(ent);
			return (EFAULT);
		}
		filprop.sfp_modname = (char *)(uintptr_t)filprop32.sfp_modname;
		filprop.sfp_autoattach = filprop32.sfp_autoattach;
		filprop.sfp_hint = filprop32.sfp_hint;
		filprop.sfp_hintarg = (char *)(uintptr_t)filprop32.sfp_hintarg;
		filprop.sfp_socktuple_cnt = filprop32.sfp_socktuple_cnt;
		filprop.sfp_socktuple =
		    (sof_socktuple_t *)(uintptr_t)filprop32.sfp_socktuple;
	}
#endif	/* _SYSCALL32_IMPL */

	if ((error = copyinstr(filprop.sfp_modname, ent->sofe_modname,
	    sizeof (ent->sofe_modname), &len)) != 0) {
		sof_entry_free(ent);
		return (error);
	}

	/*
	 * A filter must specify at least one socket tuple.
	 */
	if (filprop.sfp_socktuple_cnt == 0 ||
	    filprop.sfp_socktuple_cnt > SOF_MAXSOCKTUPLECNT) {
		sof_entry_free(ent);
		return (EINVAL);
	}
	ent->sofe_flags = filprop.sfp_autoattach ? SOFEF_AUTO : SOFEF_PROG;
	ent->sofe_hint = filprop.sfp_hint;

	/*
	 * Verify the hint, and copy in the hint argument, if necessary.
	 */
	switch (ent->sofe_hint) {
	case SOF_HINT_BEFORE:
	case SOF_HINT_AFTER:
		if ((error = copyinstr(filprop.sfp_hintarg, hintbuf,
		    sizeof (hintbuf), &len)) != 0) {
			sof_entry_free(ent);
			return (error);
		}
		ent->sofe_hintarg = kmem_alloc(len, KM_SLEEP);
		bcopy(hintbuf, ent->sofe_hintarg, len);
		/* FALLTHRU */
	case SOF_HINT_TOP:
	case SOF_HINT_BOTTOM:
		/* hints cannot be used with programmatic filters */
		if (ent->sofe_flags & SOFEF_PROG) {
			sof_entry_free(ent);
			return (EINVAL);
		}
		break;
	case SOF_HINT_NONE:
		break;
	default:
		/* bad hint value */
		sof_entry_free(ent);
		return (EINVAL);
	}

	ent->sofe_socktuple_cnt = filprop.sfp_socktuple_cnt;
	tuplesz = sizeof (sof_socktuple_t) * ent->sofe_socktuple_cnt;
	ent->sofe_socktuple = kmem_alloc(tuplesz, KM_SLEEP);

	if (get_udatamodel() == DATAMODEL_NATIVE) {
		if (copyin(filprop.sfp_socktuple, ent->sofe_socktuple,
		    tuplesz)) {
			sof_entry_free(ent);
			return (EFAULT);
		}
	}
#ifdef	_SYSCALL32_IMPL
	else {
		int i;
		caddr_t data = (caddr_t)filprop.sfp_socktuple;
		sof_socktuple_t	*tup = ent->sofe_socktuple;
		sof_socktuple32_t tup32;

		tup = ent->sofe_socktuple;
		for (i = 0; i < ent->sofe_socktuple_cnt; i++, tup++) {
			ASSERT(tup < ent->sofe_socktuple + tuplesz);

			if (copyin(data, &tup32, sizeof (tup32)) != 0) {
				sof_entry_free(ent);
				return (EFAULT);
			}
			tup->sofst_family = tup32.sofst_family;
			tup->sofst_type = tup32.sofst_type;
			tup->sofst_protocol = tup32.sofst_protocol;

			data += sizeof (tup32);
		}
	}
#endif	/* _SYSCALL32_IMPL */

	/* Sockets can start using the filter as soon as the filter is added */
	if ((error = sof_entry_add(ent)) != 0)
		sof_entry_free(ent);

	return (error);
}

/*
 * Socket configuration system call. It is used to add and remove
 * socket types.
 */
int
sockconfig(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
{
	int error = 0;

	if (secpolicy_net_config(CRED(), B_FALSE) != 0)
		return (set_errno(EPERM));

	if (sockfs_defer_nl7c_init) {
		nl7c_init();
		sockfs_defer_nl7c_init = 0;
	}

	switch (cmd) {
	case SOCKCONFIG_ADD_SOCK:
		error = sockconf_add_sock((int)(uintptr_t)arg1,
		    (int)(uintptr_t)arg2, (int)(uintptr_t)arg3, arg4);
		break;
	case SOCKCONFIG_REMOVE_SOCK:
		error = sockconf_remove_sock((int)(uintptr_t)arg1,
		    (int)(uintptr_t)arg2, (int)(uintptr_t)arg3);
		break;
	case SOCKCONFIG_ADD_FILTER:
		error = sockconfig_add_filter((const char *)arg1, arg2);
		break;
	case SOCKCONFIG_REMOVE_FILTER:
		error = sockconfig_remove_filter((const char *)arg1);
		break;
	default:
#ifdef	DEBUG
		cmn_err(CE_NOTE, "sockconfig: unknown subcommand %d", cmd);
#endif
		error = EINVAL;
		break;
	}

	if (error != 0) {
		eprintline(error);
		return (set_errno(error));
	}
	return (0);
}
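
/*
 * Illustrative sketch: soconfig(1M) is the usual consumer of this system
 * call.  A configuration line maps a <family, type, protocol> tuple either
 * to a socket module or to a STREAMS device (example entries for
 * illustration only; see sock2path(4) on a live system):
 *
 *	2	2	0	tcp		module name
 *						-> SOCKCONFIG_ADD_SOCK
 *	26	2	0	/dev/tcp6	device path
 *						-> SOCKCONFIG_ADD_SOCK
 *
 * sockconf_add_sock() distinguishes the two cases by checking for a
 * leading "/dev" prefix, as seen above.
 */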

/*
 * Sendfile is implemented through two schemes, direct I/O or by
 * caching in the filesystem page cache. We cache the input file by
 * default and use direct I/O only if sendfile_max_size is set
 * appropriately as explained below. Note that this logic is consistent
 * with other filesystems where caching is turned on by default
 * unless explicitly turned off by using the DIRECTIO ioctl.
 *
 * We choose a slightly different scheme here. One can turn off
 * caching by setting sendfile_max_size to 0. One can also enable
 * caching of files <= sendfile_max_size by setting sendfile_max_size
 * to an appropriate value. By default sendfile_max_size is set to the
 * maximum value so that all files are cached. In the future, we may
 * provide better interfaces for caching the file.
 *
 * Sendfile through Direct I/O (Zero copy)
 * --------------------------------------
 *
 * As disks are normally slower than the network, we can't have a
 * single thread that reads the disk and writes to the network. We
 * need to have parallelism. This is done by having the sendfile
 * thread create another thread that reads from the filesystem
 * and queues it for network processing. In this scheme, the data
 * is never copied anywhere, i.e., it is zero copy, unlike the other
 * scheme.
 *
 * We have a sendfile queue (snfq) where each sendfile
 * request (snf_req_t) is queued for processing by a thread. The number
 * of threads is dynamically allocated and they exit if they are idling
 * beyond a specified amount of time. When each request (snf_req_t) is
 * processed by a thread, it produces a number of mblk_t structures to
 * be consumed by the sendfile thread. snf_deque and snf_enque are
 * used for consuming and producing mblks. The size of the filesystem
 * read is determined by the tunable (sendfile_read_size). A single
 * mblk holds sendfile_read_size worth of data (except the last
 * read of the file) which is sent down as a whole to the network.
 * sendfile_read_size is set to 1 MB as this seems to be the optimal
 * value for the UFS filesystem backed by a striped storage array.
 *
 * Synchronisation between read (producer) and write (consumer) threads.
 * --------------------------------------------------------------------
 *
 * sr_lock protects sr_ib_head and sr_ib_tail. The lock is held while
 * adding and deleting items in this list. An error can happen at any
 * time during read or write. There could be unprocessed mblks in the
 * sr_ib_XXX list when a read or write error occurs. Whenever an error
 * is encountered, we need two things to happen:
 *
 * a) One of the threads needs to clean the mblks.
 * b) When one thread encounters an error, the other should stop.
 *
 * For (a), we don't want to penalize the reader thread as it could do
 * some useful work processing other requests. For (b), the error can
 * be detected by examining sr_read_error or sr_write_error.
 * sr_lock protects sr_read_error and sr_write_error. If both reader and
 * writer encounter errors, we need to report the write error back to
 * the application as that's what would have happened if the operations
 * were done sequentially. With this in mind, the following should work:
 *
 * - Check for errors before read or write.
 * - If the reader encounters an error, set the error in sr_read_error.
 *   Check sr_write_error; if it is set, send cv_signal as the writer is
 *   waiting for the reader to complete. If it is not set, the writer
 *   is either running sinking data to the network or blocked
 *   because of flow control. For handling the latter case, we
 *   always send a signal. In any case, it will examine sr_read_error
 *   and return. sr_read_error is marked with SR_READ_DONE to tell
 *   the writer that the reader is done in all the cases.
 * - If the writer encounters an error, set the error in sr_write_error.
 *   The reader thread is either blocked because of flow control or
 *   running reading data from the disk. For the former, we need to
 *   wake up the thread. Again, to keep it simple, we always wake up
 *   the reader thread. Then, wait for the read thread to complete
 *   if it is not done yet. Clean up and return.
 *
 * High and low water marks for the read thread.
 * --------------------------------------------
 *
 * If sendfile() is used to send data over a slow network, we need to
 * make sure that the read thread does not produce data at a faster
 * rate than the network. This can happen if the disk is faster than
 * the network. In such a case, we don't want to build a very large queue.
 * But we would still like to get all of the network throughput possible.
 * This implies that the network should never block waiting for data.
 * As there are a lot of disk throughput/network throughput combinations
 * possible, it is difficult to come up with an accurate number.
 * A typical 10K RPM disk has a max seek latency of 17 ms and rotational
 * latency of 3 ms for reading a disk block. Thus, the total latency to
 * initiate a new read, transfer data from the disk and queue for
 * transmission would take at most about 25 ms. Today's max transfer
 * rate for a network is 100 MB/sec. If the thread is blocked because
 * of flow control, it would take 25 ms to get new data ready for
 * transmission. We have to make sure that the network is not idling
 * while we are initiating new transfers. So, at 100 MB/sec, to keep the
 * network busy we would need 2.5 MB of data. Rounding off, we keep the
 * low water mark at 3 MB of data.
 * We need to pick a high water mark so that the woken up thread would
 * do considerable work before blocking again to prevent thrashing. Currently,
 * we pick this to be 10 times that of the low water mark.
 *
 * Sendfile with segmap caching (One copy from page cache to mblks).
 * ----------------------------------------------------------------
 *
 * We use the segmap cache for caching the file, if the size of file
 * is <= sendfile_max_size. In this case we don't use threads as VM
 * is reasonably fast enough to keep up with the network. If the underlying
 * transport allows, we call segmap_getmapflt() to map MAXBSIZE (8K) worth
 * of data into segmap space, and use the virtual address from segmap
 * directly through desballoc() to avoid copy. Once the transport is done
 * with the data, the mapping will be released through segmap_release()
 * called by the call-back routine.
 *
 * If zero-copy is not allowed by the transport, we simply call VOP_READ()
 * to copy the data from the filesystem into our temporary network buffer.
 *
 * To disable caching, set sendfile_max_size to 0.
 */

uint_t sendfile_read_size = 1024 * 1024;
#define	SENDFILE_REQ_LOWAT	(3 * 1024 * 1024)
uint_t sendfile_req_lowat = SENDFILE_REQ_LOWAT;
uint_t sendfile_req_hiwat = 10 * SENDFILE_REQ_LOWAT;
struct sendfile_stats sf_stats;
struct sendfile_queue *snfq;
clock_t snfq_timeout;
off64_t sendfile_max_size;

static void snf_enque(snf_req_t *, mblk_t *);
static mblk_t *snf_deque(snf_req_t *);

void
sendfile_init(void)
{
	snfq = kmem_zalloc(sizeof (struct sendfile_queue), KM_SLEEP);

	mutex_init(&snfq->snfq_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&snfq->snfq_cv, NULL, CV_DEFAULT, NULL);
	snfq->snfq_max_threads = max_ncpus;
	snfq_timeout = SNFQ_TIMEOUT;
	/* Cache all files by default. */
	sendfile_max_size = MAXOFFSET_T;
}

/*
 * Queues an mblk_t for network processing.
 */
static void
snf_enque(snf_req_t *sr, mblk_t *mp)
{
	mp->b_next = NULL;
	mutex_enter(&sr->sr_lock);
	if (sr->sr_mp_head == NULL) {
		sr->sr_mp_head = sr->sr_mp_tail = mp;
		cv_signal(&sr->sr_cv);
	} else {
		sr->sr_mp_tail->b_next = mp;
		sr->sr_mp_tail = mp;
	}
	sr->sr_qlen += MBLKL(mp);
	while ((sr->sr_qlen > sr->sr_hiwat) &&
	    (sr->sr_write_error == 0)) {
		sf_stats.ss_full_waits++;
		cv_wait(&sr->sr_cv, &sr->sr_lock);
	}
	mutex_exit(&sr->sr_lock);
}
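
/*
 * The water marks above implement classic producer throttling: the reader
 * (snf_enque) sleeps once sr_qlen exceeds sr_hiwat, and the writer
 * (snf_deque) wakes it once the queue drains below sr_lowat.  Plugging in
 * the figures from the comment block above (assumed numbers, for
 * illustration only):
 *
 *	restart latency ~ 25 ms, network ~ 100 MB/sec
 *	=> 0.025 s x 100 MB/s = 2.5 MB needed to keep the link busy
 *	=> sendfile_req_lowat = 3 MB (rounded up)
 *	=> sendfile_req_hiwat = 10 x lowat = 30 MB
 */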
	 * c) sr_read_error is set to SR_READ_DONE
	 *    and there are no mblks.
	 * d) sr_read_error is set to some error other
	 *    than SR_READ_DONE.
	 */

	while ((sr->sr_read_error == 0) && (sr->sr_mp_head == NULL)) {
		sf_stats.ss_empty_waits++;
		cv_wait(&sr->sr_cv, &sr->sr_lock);
	}
	/* Handle (a) and (b) first - the normal case. */
	if (((sr->sr_read_error & ~SR_READ_DONE) == 0) &&
	    (sr->sr_mp_head != NULL)) {
		mp = sr->sr_mp_head;
		sr->sr_mp_head = mp->b_next;
		sr->sr_qlen -= MBLKL(mp);
		if (sr->sr_qlen < sr->sr_lowat)
			cv_signal(&sr->sr_cv);
		mutex_exit(&sr->sr_lock);
		mp->b_next = NULL;
		return (mp);
	}
	/* Handle (c) and (d). */
	mutex_exit(&sr->sr_lock);
	return (NULL);
}

/*
 * Reads data from the filesystem and queues it for network processing.
 */
void
snf_async_read(snf_req_t *sr)
{
	size_t iosize;
	u_offset_t fileoff;
	u_offset_t size;
	int ret_size;
	int error = 0;	/* in case the loop below is never entered */
	file_t *fp;
	mblk_t *mp;
	struct vnode *vp;
	int extra = 0;
	int maxblk = 0;
	int wroff = 0;
	struct sonode *so;

	fp = sr->sr_fp;
	size = sr->sr_file_size;
	fileoff = sr->sr_file_off;

	/*
	 * Ignore the error for filesystems that don't support DIRECTIO.
	 */
	(void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON, 0,
	    kcred, NULL, NULL);

	vp = sr->sr_vp;
	if (vp->v_type == VSOCK) {
		stdata_t *stp;

		/*
		 * Get the extra space to insert a header and a trailer.
		 */
		so = VTOSO(vp);
		stp = vp->v_stream;
		if (stp == NULL) {
			wroff = so->so_proto_props.sopp_wroff;
			maxblk = so->so_proto_props.sopp_maxblk;
			extra = wroff + so->so_proto_props.sopp_tail;
		} else {
			wroff = (int)(stp->sd_wroff);
			maxblk = (int)(stp->sd_maxblk);
			extra = wroff + (int)(stp->sd_tail);
		}
	}

	while ((size != 0) && (sr->sr_write_error == 0)) {

		iosize = (int)MIN(sr->sr_maxpsz, size);

		/*
		 * Socket filters can limit the mblk size,
		 * so limit reads to maxblk if there are
		 * filters present.
		 */
		if (vp->v_type == VSOCK &&
		    so->so_filter_active > 0 && maxblk != INFPSZ)
			iosize = (int)MIN(iosize, maxblk);

		if (is_system_labeled()) {
			mp = allocb_cred(iosize + extra, CRED(),
			    curproc->p_pid);
		} else {
			mp = allocb(iosize + extra, BPRI_MED);
		}
		if (mp == NULL) {
			error = EAGAIN;
			break;
		}

		mp->b_rptr += wroff;

		ret_size = soreadfile(fp, mp->b_rptr, fileoff, &error, iosize);

		/* Error or reached EOF? */
		if ((error != 0) || (ret_size == 0)) {
			freeb(mp);
			break;
		}
		mp->b_wptr = mp->b_rptr + ret_size;

		snf_enque(sr, mp);
		size -= ret_size;
		fileoff += ret_size;
	}
	(void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_OFF, 0,
	    kcred, NULL, NULL);
	mutex_enter(&sr->sr_lock);
	sr->sr_read_error = error;
	sr->sr_read_error |= SR_READ_DONE;
	cv_signal(&sr->sr_cv);
	mutex_exit(&sr->sr_lock);
}

/*
 * Service thread: dequeues sendfile requests from snfq and runs
 * snf_async_read() for each one.
 */
void
snf_async_thread(void)
{
	snf_req_t *sr;
	callb_cpr_t cprinfo;
	clock_t time_left = 1;

	CALLB_CPR_INIT(&cprinfo, &snfq->snfq_lock, callb_generic_cpr, "snfq");

	mutex_enter(&snfq->snfq_lock);
	for (;;) {
		/*
		 * If we didn't find an entry, then block until woken up
		 * again, and then look through the queues again.
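		 * The wait is bounded by snfq_timeout; a thread that has
		 * been idle for the whole timeout exits (time_left <= 0
		 * below), so the service thread pool shrinks again once
		 * the sendfile load drops.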
		 */
		while ((sr = snfq->snfq_req_head) == NULL) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			if (time_left <= 0) {
				snfq->snfq_svc_threads--;
				CALLB_CPR_EXIT(&cprinfo);
				thread_exit();
				/* NOTREACHED */
			}
			snfq->snfq_idle_cnt++;

			time_left = cv_reltimedwait(&snfq->snfq_cv,
			    &snfq->snfq_lock, snfq_timeout, TR_CLOCK_TICK);
			snfq->snfq_idle_cnt--;

			CALLB_CPR_SAFE_END(&cprinfo, &snfq->snfq_lock);
		}
		snfq->snfq_req_head = sr->sr_next;
		snfq->snfq_req_cnt--;
		mutex_exit(&snfq->snfq_lock);
		snf_async_read(sr);
		mutex_enter(&snfq->snfq_lock);
	}
}

/*
 * Allocates a sendfile request and queues it on snfq, creating a new
 * service thread when the existing ones are all busy.
 */
snf_req_t *
create_thread(int operation, struct vnode *vp, file_t *fp,
    u_offset_t fileoff, u_offset_t size)
{
	snf_req_t *sr;
	stdata_t *stp;

	sr = (snf_req_t *)kmem_zalloc(sizeof (snf_req_t), KM_SLEEP);

	sr->sr_vp = vp;
	sr->sr_fp = fp;
	stp = vp->v_stream;

	/*
	 * Store sd_qn_maxpsz into sr_maxpsz while we have the stream
	 * head; the stream might be closed before the thread returns
	 * from snf_async_read.
	 */
	if (stp != NULL && stp->sd_qn_maxpsz > 0) {
		sr->sr_maxpsz = MIN(MAXBSIZE, stp->sd_qn_maxpsz);
	} else {
		sr->sr_maxpsz = MAXBSIZE;
	}

	sr->sr_operation = operation;
	sr->sr_file_off = fileoff;
	sr->sr_file_size = size;
	sr->sr_hiwat = sendfile_req_hiwat;
	sr->sr_lowat = sendfile_req_lowat;
	mutex_init(&sr->sr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&sr->sr_cv, NULL, CV_DEFAULT, NULL);
	/*
	 * See whether we need another thread for servicing this
	 * request. If there are already enough requests queued
	 * for the threads, create a new one, as long as we do not
	 * exceed snfq_max_threads.
	 */
	mutex_enter(&snfq->snfq_lock);
	if (snfq->snfq_req_cnt >= snfq->snfq_idle_cnt &&
	    snfq->snfq_svc_threads < snfq->snfq_max_threads) {
		(void) thread_create(NULL, 0, &snf_async_thread, 0, 0, &p0,
		    TS_RUN, minclsyspri);
		snfq->snfq_svc_threads++;
	}
	if (snfq->snfq_req_head == NULL) {
		snfq->snfq_req_head = snfq->snfq_req_tail = sr;
		cv_signal(&snfq->snfq_cv);
	} else {
		snfq->snfq_req_tail->sr_next = sr;
		snfq->snfq_req_tail = sr;
	}
	snfq->snfq_req_cnt++;
	mutex_exit(&snfq->snfq_lock);
	return (sr);
}

/*
 * Direct-I/O sendfile: a reader thread (snf_async_read) fills a queue
 * of mblks that this function, the writer, drains down the socket.
 */
int
snf_direct_io(file_t *fp, file_t *rfp, u_offset_t fileoff, u_offset_t size,
    ssize_t *count)
{
	snf_req_t *sr;
	mblk_t *mp;
	int iosize;
	int error = 0;
	short fflag;
	struct vnode *vp;
	int ksize;
	struct nmsghdr msg;

	ksize = 0;
	*count = 0;
	bzero(&msg, sizeof (msg));

	vp = fp->f_vnode;
	fflag = fp->f_flag;
	if ((sr = create_thread(READ_OP, vp, rfp, fileoff, size)) == NULL)
		return (EAGAIN);

	/*
	 * We check for read errors in snf_deque. It has to check
	 * for a successful READ_DONE and return NULL, and we might
	 * as well make an additional check there.
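	 *
	 * The loop below is the writer half of the protocol described
	 * in the big comment above: dequeue an mblk, send it, and on
	 * any failure record the error in sr_write_error so that the
	 * reader thread stops producing.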
	 */
	while ((mp = snf_deque(sr)) != NULL) {

		if (ISSIG(curthread, JUSTLOOKING)) {
			freeb(mp);
			error = EINTR;
			break;
		}
		iosize = MBLKL(mp);

		error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);

		if (error != 0) {
			if (mp != NULL)
				freeb(mp);
			break;
		}
		ksize += iosize;
	}
	*count = ksize;

	mutex_enter(&sr->sr_lock);
	sr->sr_write_error = error;
	/* See the big comment above on why we cv_signal here. */
	cv_signal(&sr->sr_cv);

	/* Always wait for the reader to complete. */
	while (!(sr->sr_read_error & SR_READ_DONE)) {
		cv_wait(&sr->sr_cv, &sr->sr_lock);
	}
	/* If there is no write error, check for a read error. */
	if (error == 0)
		error = (sr->sr_read_error & ~SR_READ_DONE);

	if (error != 0) {
		mblk_t *next_mp;

		mp = sr->sr_mp_head;
		while (mp != NULL) {
			next_mp = mp->b_next;
			mp->b_next = NULL;
			freeb(mp);
			mp = next_mp;
		}
	}
	mutex_exit(&sr->sr_lock);
	kmem_free(sr, sizeof (snf_req_t));
	return (error);
}

/* Maximum number of pages allocated by vpm for sendfile at a time */
#define	SNF_VPMMAXPGS	(VPMMAXPGS/2)

/*
 * Maximum number of elements in the list returned by vpm, including
 * NULL for the last entry.
 */
#define	SNF_MAXVMAPS	(SNF_VPMMAXPGS + 1)

typedef struct {
	unsigned int	snfv_ref;
	frtn_t		snfv_frtn;
	vnode_t		*snfv_vp;
	struct vmap	snfv_vml[SNF_MAXVMAPS];
} snf_vmap_desbinfo;

typedef struct {
	frtn_t		snfi_frtn;
	caddr_t		snfi_base;
	uint_t		snfi_mapoff;
	size_t		snfi_len;
	vnode_t		*snfi_vp;
} snf_smap_desbinfo;

/*
 * The callback function used for vpm-mapped mblks, called when the last
 * reference on the mblk is dropped. That normally happens when TCP
 * receives the ack, but it can also be the driver, due to lazy reclaim.
 */
void
snf_vmap_desbfree(snf_vmap_desbinfo *snfv)
{
	ASSERT(snfv->snfv_ref != 0);
	if (atomic_add_32_nv(&snfv->snfv_ref, -1) == 0) {
		vpm_unmap_pages(snfv->snfv_vml, S_READ);
		VN_RELE(snfv->snfv_vp);
		kmem_free(snfv, sizeof (snf_vmap_desbinfo));
	}
}

/*
 * The callback function used for segmap'ped mblks, called when the last
 * reference on the mblk is dropped. That normally happens when TCP
 * receives the ack, but it can also be the driver, due to lazy reclaim.
 */
void
snf_smap_desbfree(snf_smap_desbinfo *snfi)
{
	if (!IS_KPM_ADDR(snfi->snfi_base)) {
		/*
		 * We don't need to call segmap_fault(F_SOFTUNLOCK) for
		 * segmap_kpm as long as the latter never falls back to
		 * "use_segmap_range". (See segmap_getmapflt().)
		 *
		 * Using S_OTHER saves a redundant hat_setref() in
		 * segmap_unlock().
		 */
		(void) segmap_fault(kas.a_hat, segkmap,
		    (caddr_t)(uintptr_t)(((uintptr_t)snfi->snfi_base +
		    snfi->snfi_mapoff) & PAGEMASK), snfi->snfi_len,
		    F_SOFTUNLOCK, S_OTHER);
	}
	(void) segmap_release(segkmap, snfi->snfi_base, SM_DONTNEED);
	VN_RELE(snfi->snfi_vp);
	kmem_free(snfi, sizeof (*snfi));
}

/*
 * Use segmap or vpm instead of bcopy to send down a desballoca'ed mblk.
 * When segmap is used, the mblk contains a segmap slot of no more
 * than MAXBSIZE.
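 *
 * The core of the zero-copy send is roughly the following sketch; the
 * real code below additionally handles EDEADLK retries, mblk chaining
 * and error cleanup (names like desbinfo, mapped_addr and len are
 * illustrative placeholders, not the exact code):
 *
 *	frtn.free_func = snf_smap_desbfree;	(or snf_vmap_desbfree)
 *	frtn.free_arg = (caddr_t)desbinfo;
 *	mp = esballoca((uchar_t *)mapped_addr, len, BPRI_HI, &frtn);
 *	mp->b_datap->db_struioflag |= STRUIO_ZC;
 *	error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);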
 *
 * With vpm, a maximum of SNF_MAXVMAPS page-sized mappings can be obtained
 * in each iteration and sent by socket_sendmblk until an error occurs or
 * the requested size has been transferred. An mblk is esballoca'ed from
 * each mapped page, and a chain of these mblks is sent to the transport
 * layer. vpm will be called to unmap the pages when all mblks have been
 * freed by free_func.
 *
 * At the end of the whole sendfile() operation, we wait until the data
 * from the last mblk is acked by the transport before returning, so that
 * the caller of sendfile() can safely modify the file content.
 */
int
snf_segmap(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t total_size,
    ssize_t *count, boolean_t nowait)
{
	caddr_t base;
	int mapoff;
	vnode_t *vp;
	mblk_t *mp = NULL;
	int chain_size;
	int error;
	clock_t deadlk_wait;
	short fflag;
	int ksize;
	struct vattr va;
	boolean_t dowait = B_FALSE;
	struct nmsghdr msg;

	vp = fp->f_vnode;
	fflag = fp->f_flag;
	ksize = 0;
	bzero(&msg, sizeof (msg));

	for (;;) {
		if (ISSIG(curthread, JUSTLOOKING)) {
			error = EINTR;
			break;
		}

		if (vpm_enable) {
			snf_vmap_desbinfo *snfv;
			mblk_t *nmp;
			int mblk_size;
			int maxsize;
			int i;

			mapoff = fileoff & PAGEOFFSET;
			maxsize = MIN((SNF_VPMMAXPGS * PAGESIZE), total_size);

			snfv = kmem_zalloc(sizeof (snf_vmap_desbinfo),
			    KM_SLEEP);

			/*
			 * Get vpm mappings for maxsize with read access.
			 * If the pages aren't available yet, we get
			 * EDEADLK, so wait and try again a little later,
			 * using an increasing wait. We might be here a
			 * long time.
			 *
			 * If delay_sig returns EINTR, be sure to exit and
			 * pass it up to the caller.
			 */
			deadlk_wait = 0;
			while ((error = vpm_map_pages(fvp, fileoff,
			    (size_t)maxsize, (VPM_FETCHPAGE), snfv->snfv_vml,
			    SNF_MAXVMAPS, NULL, S_READ)) == EDEADLK) {
				deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
				if ((error = delay_sig(deadlk_wait)) != 0) {
					break;
				}
			}
			if (error != 0) {
				kmem_free(snfv, sizeof (snf_vmap_desbinfo));
				error = (error == EINTR) ? EINTR : EIO;
				goto out;
			}
			snfv->snfv_frtn.free_func = snf_vmap_desbfree;
			snfv->snfv_frtn.free_arg = (caddr_t)snfv;

			/* Construct the mblk chain from the page mappings. */
			chain_size = 0;
			for (i = 0; (snfv->snfv_vml[i].vs_addr != NULL) &&
			    total_size > 0; i++) {
				ASSERT(chain_size < maxsize);
				mblk_size = MIN(snfv->snfv_vml[i].vs_len -
				    mapoff, total_size);
				nmp = esballoca(
				    (uchar_t *)snfv->snfv_vml[i].vs_addr +
				    mapoff, mblk_size, BPRI_HI,
				    &snfv->snfv_frtn);

				/*
				 * We return EAGAIN after unmapping the pages
				 * if we cannot allocate the head of the
				 * chain. Otherwise, we continue sending the
				 * mblks constructed so far.
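				 *
				 * Each mblk allocated here takes a
				 * reference on snfv (snfv_ref below);
				 * snf_vmap_desbfree() only unmaps the
				 * pages once the last reference is gone.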
				 */
				if (nmp == NULL) {
					if (i == 0) {
						vpm_unmap_pages(snfv->snfv_vml,
						    S_READ);
						kmem_free(snfv,
						    sizeof (snf_vmap_desbinfo));
						error = EAGAIN;
						goto out;
					}
					break;
				}
				/* Mark this dblk with the zero-copy flag. */
				nmp->b_datap->db_struioflag |= STRUIO_ZC;
				nmp->b_wptr += mblk_size;
				chain_size += mblk_size;
				fileoff += mblk_size;
				total_size -= mblk_size;
				snfv->snfv_ref++;
				mapoff = 0;
				if (i > 0)
					linkb(mp, nmp);
				else
					mp = nmp;
			}
			VN_HOLD(fvp);
			snfv->snfv_vp = fvp;
		} else {
			/* vpm is not supported; fall back to segmap. */
			snf_smap_desbinfo *snfi;

			mapoff = fileoff & MAXBOFFSET;
			chain_size = MAXBSIZE - mapoff;
			if (chain_size > total_size)
				chain_size = total_size;
			/*
			 * We don't forcefault because we'll call
			 * segmap_fault(F_SOFTLOCK) next.
			 *
			 * S_READ will get the ref bit set (by either
			 * segmap_getmapflt() or segmap_fault()) and the
			 * page shared locked.
			 */
			base = segmap_getmapflt(segkmap, fvp, fileoff,
			    chain_size, segmap_kpm ? SM_FAULT : 0, S_READ);

			snfi = kmem_alloc(sizeof (*snfi), KM_SLEEP);
			snfi->snfi_len = (size_t)roundup(mapoff + chain_size,
			    PAGESIZE) - (mapoff & PAGEMASK);
			/*
			 * We must call segmap_fault() even for segmap_kpm
			 * because that's how errors get returned.
			 * (segmap_getmapflt() never fails but segmap_fault()
			 * does.)
			 *
			 * If the pages aren't available yet, we get
			 * EDEADLK, so wait and try again a little later,
			 * using an increasing wait. We might be here a
			 * long time.
			 *
			 * If delay_sig returns EINTR, be sure to exit and
			 * pass it up to the caller.
			 */
			deadlk_wait = 0;
			while ((error = FC_ERRNO(segmap_fault(kas.a_hat,
			    segkmap, (caddr_t)(uintptr_t)(((uintptr_t)base +
			    mapoff) & PAGEMASK), snfi->snfi_len, F_SOFTLOCK,
			    S_READ))) == EDEADLK) {
				deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
				if ((error = delay_sig(deadlk_wait)) != 0) {
					break;
				}
			}
			if (error != 0) {
				(void) segmap_release(segkmap, base, 0);
				kmem_free(snfi, sizeof (*snfi));
				error = (error == EINTR) ? EINTR : EIO;
				goto out;
			}
			snfi->snfi_frtn.free_func = snf_smap_desbfree;
			snfi->snfi_frtn.free_arg = (caddr_t)snfi;
			snfi->snfi_base = base;
			snfi->snfi_mapoff = mapoff;
			mp = esballoca((uchar_t *)base + mapoff, chain_size,
			    BPRI_HI, &snfi->snfi_frtn);

			if (mp == NULL) {
				(void) segmap_fault(kas.a_hat, segkmap,
				    (caddr_t)(uintptr_t)(((uintptr_t)base +
				    mapoff) & PAGEMASK), snfi->snfi_len,
				    F_SOFTUNLOCK, S_OTHER);
				(void) segmap_release(segkmap, base, 0);
				kmem_free(snfi, sizeof (*snfi));
				/* mp is NULL here; nothing to free. */
				error = EAGAIN;
				goto out;
			}
			VN_HOLD(fvp);
			snfi->snfi_vp = fvp;
			mp->b_wptr += chain_size;

			/* Mark this dblk with the zero-copy flag. */
			mp->b_datap->db_struioflag |= STRUIO_ZC;
			fileoff += chain_size;
			total_size -= chain_size;
		}

		if (total_size == 0 && !nowait) {
			ASSERT(!dowait);
			dowait = B_TRUE;
			mp->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
		}
		VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
		error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
		if (error != 0) {
			/*
			 * mp contains the mblks that were not sent by
			 * socket_sendmblk.
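			 * Everything ahead of mp has already been queued
			 * to the transport.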
			 * Use its size to update *count.
			 */
			*count = ksize + (chain_size - msgdsize(mp));
			if (mp != NULL)
				freemsg(mp);
			return (error);
		}
		ksize += chain_size;
		if (total_size == 0)
			goto done;

		(void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
		va.va_mask = AT_SIZE;
		error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
		if (error)
			break;
		/* Read as much as possible. */
		if (fileoff >= va.va_size)
			break;
		if (total_size + fileoff > va.va_size)
			total_size = va.va_size - fileoff;
	}
out:
	VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
done:
	*count = ksize;
	if (dowait) {
		stdata_t *stp;

		stp = vp->v_stream;
		if (stp == NULL) {
			struct sonode *so;

			so = VTOSO(vp);
			error = so_zcopy_wait(so);
		} else {
			mutex_enter(&stp->sd_lock);
			while (!(stp->sd_flag & STZCNOTIFY)) {
				if (cv_wait_sig(&stp->sd_zcopy_wait,
				    &stp->sd_lock) == 0) {
					error = EINTR;
					break;
				}
			}
			stp->sd_flag &= ~STZCNOTIFY;
			mutex_exit(&stp->sd_lock);
		}
	}
	return (error);
}

/*
 * Page-cache sendfile: VOP_READ() the file into freshly allocated
 * mblks (one copy) and send them down the socket.
 */
int
snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
    uint_t maxpsz, ssize_t *count)
{
	struct vnode *vp;
	mblk_t *mp;
	int iosize;
	int extra = 0;
	int error;
	short fflag;
	int ksize;
	int ioflag;
	struct uio auio;
	struct iovec aiov;
	struct vattr va;
	int maxblk = 0;
	int wroff = 0;
	struct sonode *so;
	struct nmsghdr msg;

	vp = fp->f_vnode;
	if (vp->v_type == VSOCK) {
		stdata_t *stp;

		/*
		 * Get the extra space to insert a header and a trailer.
		 */
		so = VTOSO(vp);
		stp = vp->v_stream;
		if (stp == NULL) {
			wroff = so->so_proto_props.sopp_wroff;
			maxblk = so->so_proto_props.sopp_maxblk;
			extra = wroff + so->so_proto_props.sopp_tail;
		} else {
			wroff = (int)(stp->sd_wroff);
			maxblk = (int)(stp->sd_maxblk);
			extra = wroff + (int)(stp->sd_tail);
		}
	}
	bzero(&msg, sizeof (msg));
	fflag = fp->f_flag;
	ksize = 0;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_llimit = MAXOFFSET_T;
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_CACHED;
	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
	/* If read sync is not asked for, filter sync flags. */
	if ((ioflag & FRSYNC) == 0)
		ioflag &= ~(FSYNC|FDSYNC);
	for (;;) {
		if (ISSIG(curthread, JUSTLOOKING)) {
			error = EINTR;
			break;
		}
		iosize = (int)MIN(maxpsz, size);

		/*
		 * Socket filters can limit the mblk size,
		 * so limit reads to maxblk if there are
		 * filters present.
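		 * (maxblk == INFPSZ means the transport has not set a
		 * limit.)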
		 */
		if (vp->v_type == VSOCK &&
		    so->so_filter_active > 0 && maxblk != INFPSZ)
			iosize = (int)MIN(iosize, maxblk);

		if (is_system_labeled()) {
			mp = allocb_cred(iosize + extra, CRED(),
			    curproc->p_pid);
		} else {
			mp = allocb(iosize + extra, BPRI_MED);
		}
		if (mp == NULL) {
			error = EAGAIN;
			break;
		}

		mp->b_rptr += wroff;

		aiov.iov_base = (caddr_t)mp->b_rptr;
		aiov.iov_len = iosize;
		auio.uio_loffset = fileoff;
		auio.uio_resid = iosize;

		error = VOP_READ(fvp, &auio, ioflag, fp->f_cred, NULL);
		iosize -= auio.uio_resid;

		if (error == EINTR && iosize != 0)
			error = 0;

		if (error != 0 || iosize == 0) {
			freeb(mp);
			break;
		}
		mp->b_wptr = mp->b_rptr + iosize;

		VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);

		error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);

		if (error != 0) {
			*count = ksize;
			if (mp != NULL)
				freeb(mp);
			return (error);
		}
		ksize += iosize;
		size -= iosize;
		if (size == 0)
			goto done;

		fileoff += iosize;
		(void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
		va.va_mask = AT_SIZE;
		error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
		if (error)
			break;
		/* Read as much as possible. */
		if (fileoff >= va.va_size)
			size = 0;
		else if (size + fileoff > va.va_size)
			size = va.va_size - fileoff;
	}
	VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
done:
	*count = ksize;
	return (error);
}

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * Largefile support for 32-bit applications only.
 */
int
sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv,
    ssize32_t *count32)
{
	ssize32_t sfv_len;
	u_offset_t sfv_off, va_size;
	struct vnode *vp, *fvp, *realvp;
	struct vattr va;
	stdata_t *stp;
	ssize_t count = 0;
	int error = 0;
	boolean_t dozcopy = B_FALSE;
	uint_t maxpsz;

	sfv_len = (ssize32_t)sfv->sfv_len;
	if (sfv_len < 0) {
		error = EINVAL;
		goto out;
	}

	if (sfv_len == 0)
		goto out;

	sfv_off = (u_offset_t)sfv->sfv_off;

	/* Same checks as in pread */
	if (sfv_off > MAXOFFSET_T) {
		error = EINVAL;
		goto out;
	}
	if (sfv_off + sfv_len > MAXOFFSET_T)
		sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

	/*
	 * There are no more checks on sfv_len, so we cast it to
	 * u_offset_t and share the snf_direct_io/snf_cache code between
	 * 32-bit and 64-bit.
	 *
	 * TODO: should we do nbl_need_check() like read()?
	 */
	if (sfv_len > sendfile_max_size) {
		sf_stats.ss_file_not_cached++;
		error = snf_direct_io(fp, rfp, sfv_off, (u_offset_t)sfv_len,
		    &count);
		goto out;
	}
	fvp = rfp->f_vnode;
	if (VOP_REALVP(fvp, &realvp, NULL) == 0)
		fvp = realvp;
	/*
	 * Grab the lock as a reader to prevent the file size
	 * from changing underneath us.
	 */
	(void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
	va.va_mask = AT_SIZE;
	error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
	va_size = va.va_size;
	if ((error != 0) || (va_size == 0) || (sfv_off >= va_size)) {
		VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
		goto out;
	}
	/*
	 * Read as much as possible.
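	 * If the request extends past the end of the file, just send
	 * what is there rather than failing.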
	 */
	if (sfv_off + sfv_len > va_size)
		sfv_len = va_size - sfv_off;

	vp = fp->f_vnode;
	stp = vp->v_stream;
	/*
	 * When the NOWAIT flag is not set, we enable zero-copy only if the
	 * transfer size is large enough. This prevents performance loss
	 * when the caller sends the file piece by piece.
	 */
	if (sfv_len >= MAXBSIZE && (sfv_len >= (va_size >> 1) ||
	    (sfv->sfv_flag & SFV_NOWAIT) || sfv_len >= 0x1000000) &&
	    !vn_has_flocks(fvp) && !(fvp->v_flag & VNOMAP)) {
		uint_t copyflag;

		copyflag = stp != NULL ? stp->sd_copyflag :
		    VTOSO(vp)->so_proto_props.sopp_zcopyflag;
		if ((copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) {
			int on = 1;

			if (socket_setsockopt(VTOSO(vp), SOL_SOCKET,
			    SO_SND_COPYAVOID, &on, sizeof (on), CRED()) == 0)
				dozcopy = B_TRUE;
		} else {
			dozcopy = copyflag & STZCVMSAFE;
		}
	}
	if (dozcopy) {
		sf_stats.ss_file_segmap++;
		error = snf_segmap(fp, fvp, sfv_off, (u_offset_t)sfv_len,
		    &count, ((sfv->sfv_flag & SFV_NOWAIT) != 0));
	} else {
		if (vp->v_type == VSOCK && stp == NULL) {
			sonode_t *so = VTOSO(vp);

			maxpsz = so->so_proto_props.sopp_maxpsz;
		} else if (stp != NULL) {
			maxpsz = stp->sd_qn_maxpsz;
		} else {
			maxpsz = maxphys;
		}

		if (maxpsz == INFPSZ)
			maxpsz = maxphys;
		else
			maxpsz = roundup(maxpsz, MAXBSIZE);
		sf_stats.ss_file_cached++;
		error = snf_cache(fp, fvp, sfv_off, (u_offset_t)sfv_len,
		    maxpsz, &count);
	}
out:
	releasef(sfv->sfv_fd);
	*count32 = (ssize32_t)count;
	return (error);
}
#endif

#ifdef _SYSCALL32_IMPL
/*
 * recv32(), recvfrom32(), send32(), sendto32(): intentionally return a
 * ssize_t rather than ssize32_t; see the comments above read32 for details.
 */

ssize_t
recv32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
{
	return (recv(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
}

ssize_t
recvfrom32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
    caddr32_t name, caddr32_t namelenp)
{
	return (recvfrom(sock, (void *)(uintptr_t)buffer, (ssize32_t)len,
	    flags, (void *)(uintptr_t)name, (void *)(uintptr_t)namelenp));
}

ssize_t
send32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
{
	return (send(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
}

ssize_t
sendto32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
    caddr32_t name, socklen_t namelen)
{
	return (sendto(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
	    (void *)(uintptr_t)name, namelen));
}
#endif	/* _SYSCALL32_IMPL */

/*
 * Function wrappers (mostly around the sonode switch) for
 * backward compatibility.
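 * These wrappers supply the current thread's credentials via CRED();
 * the underlying socket_*() functions take an explicit cred_t.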
2938 */ 2939 2940 int 2941 soaccept(struct sonode *so, int fflag, struct sonode **nsop) 2942 { 2943 return (socket_accept(so, fflag, CRED(), nsop)); 2944 } 2945 2946 int 2947 sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen, 2948 int backlog, int flags) 2949 { 2950 int error; 2951 2952 error = socket_bind(so, name, namelen, flags, CRED()); 2953 if (error == 0 && backlog != 0) 2954 return (socket_listen(so, backlog, CRED())); 2955 2956 return (error); 2957 } 2958 2959 int 2960 solisten(struct sonode *so, int backlog) 2961 { 2962 return (socket_listen(so, backlog, CRED())); 2963 } 2964 2965 int 2966 soconnect(struct sonode *so, struct sockaddr *name, socklen_t namelen, 2967 int fflag, int flags) 2968 { 2969 return (socket_connect(so, name, namelen, fflag, flags, CRED())); 2970 } 2971 2972 int 2973 sorecvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) 2974 { 2975 return (socket_recvmsg(so, msg, uiop, CRED())); 2976 } 2977 2978 int 2979 sosendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) 2980 { 2981 return (socket_sendmsg(so, msg, uiop, CRED())); 2982 } 2983 2984 int 2985 soshutdown(struct sonode *so, int how) 2986 { 2987 return (socket_shutdown(so, how, CRED())); 2988 } 2989 2990 int 2991 sogetsockopt(struct sonode *so, int level, int option_name, void *optval, 2992 socklen_t *optlenp, int flags) 2993 { 2994 return (socket_getsockopt(so, level, option_name, optval, optlenp, 2995 flags, CRED())); 2996 } 2997 2998 int 2999 sosetsockopt(struct sonode *so, int level, int option_name, const void *optval, 3000 t_uscalar_t optlen) 3001 { 3002 return (socket_setsockopt(so, level, option_name, optval, optlen, 3003 CRED())); 3004 } 3005 3006 /* 3007 * Because this is backward compatibility interface it only needs to be 3008 * able to handle the creation of TPI sockfs sockets. 3009 */ 3010 struct sonode * 3011 socreate(struct sockparams *sp, int family, int type, int protocol, int version, 3012 int *errorp) 3013 { 3014 struct sonode *so; 3015 3016 ASSERT(sp != NULL); 3017 3018 so = sp->sp_smod_info->smod_sock_create_func(sp, family, type, protocol, 3019 version, SOCKET_SLEEP, errorp, CRED()); 3020 if (so == NULL) { 3021 SOCKPARAMS_DEC_REF(sp); 3022 } else { 3023 if ((*errorp = SOP_INIT(so, NULL, CRED(), SOCKET_SLEEP)) == 0) { 3024 /* Cannot fail, only bumps so_count */ 3025 (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, CRED(), NULL); 3026 } else { 3027 socket_destroy(so); 3028 so = NULL; 3029 } 3030 } 3031 return (so); 3032 } 3033