1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/t_lock.h> 31 #include <sys/param.h> 32 #include <sys/systm.h> 33 #include <sys/buf.h> 34 #include <sys/conf.h> 35 #include <sys/cred.h> 36 #include <sys/kmem.h> 37 #include <sys/sysmacros.h> 38 #include <sys/vfs.h> 39 #include <sys/vnode.h> 40 #include <sys/debug.h> 41 #include <sys/errno.h> 42 #include <sys/time.h> 43 #include <sys/file.h> 44 #include <sys/open.h> 45 #include <sys/user.h> 46 #include <sys/termios.h> 47 #include <sys/stream.h> 48 #include <sys/strsubr.h> 49 #include <sys/strsun.h> 50 #include <sys/esunddi.h> 51 #include <sys/flock.h> 52 #include <sys/modctl.h> 53 #include <sys/cmn_err.h> 54 #include <sys/vmsystm.h> 55 #include <sys/policy.h> 56 57 #include <sys/socket.h> 58 #include <sys/socketvar.h> 59 #include <netinet/in.h> 60 #include <sys/un.h> 61 #include <inet/nca/ncadoorhdr.h> 62 63 #include <sys/isa_defs.h> 64 #include <sys/inttypes.h> 65 #include <sys/systm.h> 66 #include 
<sys/cpuvar.h> 67 #include <sys/atomic.h> 68 #include <sys/filio.h> 69 #include <sys/sendfile.h> 70 #include <sys/ddi.h> 71 #include <vm/seg.h> 72 #include <vm/seg_map.h> 73 #include <vm/seg_kpm.h> 74 #include <fs/sockfs/nl7c.h> 75 76 #ifdef SOCK_TEST 77 int do_useracc = 1; /* Controlled by setting SO_DEBUG to 4 */ 78 #else 79 #define do_useracc 1 80 #endif /* SOCK_TEST */ 81 82 extern int xnet_truncate_print; 83 84 /* 85 * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c" 86 * as there isn't a formal definition of IOV_MAX ??? 87 */ 88 #define MSG_MAXIOVLEN 16 89 90 /* 91 * Kernel component of socket creation. 92 * 93 * The socket library determines which version number to use. 94 * First the library calls this with a NULL devpath. If this fails 95 * to find a transport (using solookup) the library will look in /etc/netconfig 96 * for the appropriate transport. If one is found it will pass in the 97 * devpath for the kernel to use. 98 */ 99 int 100 so_socket(int domain, int type, int protocol, char *devpath, int version) 101 { 102 vnode_t *accessvp; 103 struct sonode *so; 104 vnode_t *vp; 105 struct file *fp; 106 int fd; 107 int error; 108 boolean_t wildcard = B_FALSE; 109 int saved_error = 0; 110 int sdomain = domain; 111 112 dprint(1, ("so_socket(%d,%d,%d,%p,%d)\n", 113 domain, type, protocol, devpath, version)); 114 115 if (domain == AF_NCA) { 116 /* 117 * The request is for an NCA socket so for NL7C use the 118 * INET domain instead and mark NL7C_AF_NCA below. 119 */ 120 domain = AF_INET; 121 /* 122 * NL7C is not supported in non-global zones, 123 * we enforce this restriction here. 124 */ 125 if (getzoneid() != GLOBAL_ZONEID) { 126 return (set_errno(ENOTSUP)); 127 } 128 } 129 130 accessvp = solookup(domain, type, protocol, devpath, &error); 131 if (accessvp == NULL) { 132 /* 133 * If there is either an EPROTONOSUPPORT or EPROTOTYPE error 134 * it makes sense doing the wildcard lookup since the 135 * protocol might not be in the table. 
136 */ 137 if (devpath != NULL || protocol == 0 || 138 !(error == EPROTONOSUPPORT || error == EPROTOTYPE)) 139 return (set_errno(error)); 140 141 saved_error = error; 142 143 /* 144 * Try wildcard lookup. Never use devpath for wildcards. 145 */ 146 accessvp = solookup(domain, type, 0, NULL, &error); 147 if (accessvp == NULL) { 148 /* 149 * Can't find in kernel table - have library 150 * fall back to /etc/netconfig and tell us 151 * the devpath (The library will do this if it didn't 152 * already pass in a devpath). 153 */ 154 if (saved_error != 0) 155 error = saved_error; 156 return (set_errno(error)); 157 } 158 wildcard = B_TRUE; 159 } 160 161 /* Check the device policy */ 162 if ((error = secpolicy_spec_open(CRED(), 163 accessvp, FREAD|FWRITE)) != 0) { 164 return (set_errno(error)); 165 } 166 167 if (domain == AF_NCA) { 168 so = sonca_create(accessvp, domain, type, protocol, version, 169 NULL, &error); 170 } else if (protocol == IPPROTO_SCTP) { 171 so = sosctp_create(accessvp, domain, type, protocol, version, 172 NULL, &error); 173 } else { 174 so = sotpi_create(accessvp, domain, type, protocol, version, 175 NULL, &error); 176 } 177 if (so == NULL) { 178 return (set_errno(error)); 179 } 180 if (sdomain == AF_NCA && domain == AF_INET) { 181 so->so_nl7c_flags = NL7C_AF_NCA; 182 } 183 vp = SOTOV(so); 184 185 if (wildcard) { 186 /* 187 * Issue SO_PROTOTYPE setsockopt. 188 */ 189 error = SOP_SETSOCKOPT(so, SOL_SOCKET, SO_PROTOTYPE, 190 &protocol, 191 (t_uscalar_t)sizeof (protocol)); 192 if (error) { 193 (void) VOP_CLOSE(vp, 0, 1, 0, CRED()); 194 VN_RELE(vp); 195 /* 196 * Setsockopt often fails with ENOPROTOOPT but socket() 197 * should fail with EPROTONOSUPPORT/EPROTOTYPE. 
198 */ 199 if (saved_error != 0 && error == ENOPROTOOPT) 200 error = saved_error; 201 else 202 error = EPROTONOSUPPORT; 203 return (set_errno(error)); 204 } 205 } 206 if (error = falloc(vp, FWRITE|FREAD, &fp, &fd)) { 207 (void) VOP_CLOSE(vp, 0, 1, 0, CRED()); 208 VN_RELE(vp); 209 return (set_errno(error)); 210 } 211 212 /* 213 * Now fill in the entries that falloc reserved 214 */ 215 mutex_exit(&fp->f_tlock); 216 setf(fd, fp); 217 218 return (fd); 219 } 220 221 /* 222 * Map from a file descriptor to a socket node. 223 * Returns with the file descriptor held i.e. the caller has to 224 * use releasef when done with the file descriptor. 225 */ 226 static struct sonode * 227 getsonode(int sock, int *errorp, file_t **fpp) 228 { 229 file_t *fp; 230 vnode_t *vp; 231 struct sonode *so; 232 233 if ((fp = getf(sock)) == NULL) { 234 *errorp = EBADF; 235 eprintline(*errorp); 236 return (NULL); 237 } 238 vp = fp->f_vnode; 239 /* Check if it is a socket */ 240 if (vp->v_type != VSOCK) { 241 releasef(sock); 242 *errorp = ENOTSOCK; 243 eprintline(*errorp); 244 return (NULL); 245 } 246 /* 247 * Use the stream head to find the real socket vnode. 248 * This is needed when namefs sits above sockfs. 249 */ 250 if (vp->v_stream) { 251 ASSERT(vp->v_stream->sd_vnode); 252 vp = vp->v_stream->sd_vnode; 253 254 so = VTOSO(vp); 255 if (so->so_version == SOV_STREAM) { 256 releasef(sock); 257 *errorp = ENOTSOCK; 258 eprintsoline(so, *errorp); 259 return (NULL); 260 } 261 } else { 262 so = VTOSO(vp); 263 } 264 if (fpp) 265 *fpp = fp; 266 return (so); 267 } 268 269 /* 270 * Allocate and copyin a sockaddr. 271 * Ensures NULL termination for AF_UNIX addresses by extending them 272 * with one NULL byte if need be. Verifies that the length is not 273 * excessive to prevent an application from consuming all of kernel 274 * memory. Returns NULL when an error occurred. 
275 */ 276 static struct sockaddr * 277 copyin_name(struct sonode *so, struct sockaddr *name, socklen_t *namelenp, 278 int *errorp) 279 { 280 char *faddr; 281 size_t namelen = (size_t)*namelenp; 282 283 ASSERT(namelen != 0); 284 if (namelen > SO_MAXARGSIZE) { 285 *errorp = EINVAL; 286 eprintsoline(so, *errorp); 287 return (NULL); 288 } 289 290 faddr = (char *)kmem_alloc(namelen, KM_SLEEP); 291 if (copyin(name, faddr, namelen)) { 292 kmem_free(faddr, namelen); 293 *errorp = EFAULT; 294 eprintsoline(so, *errorp); 295 return (NULL); 296 } 297 298 /* 299 * Add space for NULL termination if needed. 300 * Do a quick check if the last byte is NUL. 301 */ 302 if (so->so_family == AF_UNIX && faddr[namelen - 1] != '\0') { 303 /* Check if there is any NULL termination */ 304 size_t i; 305 int foundnull = 0; 306 307 for (i = sizeof (name->sa_family); i < namelen; i++) { 308 if (faddr[i] == '\0') { 309 foundnull = 1; 310 break; 311 } 312 } 313 if (!foundnull) { 314 /* Add extra byte for NUL padding */ 315 char *nfaddr; 316 317 nfaddr = (char *)kmem_alloc(namelen + 1, KM_SLEEP); 318 bcopy(faddr, nfaddr, namelen); 319 kmem_free(faddr, namelen); 320 321 /* NUL terminate */ 322 nfaddr[namelen] = '\0'; 323 namelen++; 324 ASSERT((socklen_t)namelen == namelen); 325 *namelenp = (socklen_t)namelen; 326 faddr = nfaddr; 327 } 328 } 329 return ((struct sockaddr *)faddr); 330 } 331 332 /* 333 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL. 334 */ 335 static int 336 copyout_arg(void *uaddr, socklen_t ulen, void *ulenp, 337 void *kaddr, socklen_t klen) 338 { 339 if (uaddr != NULL) { 340 if (ulen > klen) 341 ulen = klen; 342 343 if (ulen != 0) { 344 if (copyout(kaddr, uaddr, ulen)) 345 return (EFAULT); 346 } 347 } else 348 ulen = 0; 349 350 if (ulenp != NULL) { 351 if (copyout(&ulen, ulenp, sizeof (ulen))) 352 return (EFAULT); 353 } 354 return (0); 355 } 356 357 /* 358 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL. 
359 * If klen is greater than ulen it still uses the non-truncated 360 * klen to update ulenp. 361 */ 362 static int 363 copyout_name(void *uaddr, socklen_t ulen, void *ulenp, 364 void *kaddr, socklen_t klen) 365 { 366 if (uaddr != NULL) { 367 if (ulen >= klen) 368 ulen = klen; 369 else if (ulen != 0 && xnet_truncate_print) { 370 printf("sockfs: truncating copyout of address using " 371 "XNET semantics for pid = %d. Lengths %d, %d\n", 372 curproc->p_pid, klen, ulen); 373 } 374 375 if (ulen != 0) { 376 if (copyout(kaddr, uaddr, ulen)) 377 return (EFAULT); 378 } else 379 klen = 0; 380 } else 381 klen = 0; 382 383 if (ulenp != NULL) { 384 if (copyout(&klen, ulenp, sizeof (klen))) 385 return (EFAULT); 386 } 387 return (0); 388 } 389 390 /* 391 * The socketpair() code in libsocket creates two sockets (using 392 * the /etc/netconfig fallback if needed) before calling this routine 393 * to connect the two sockets together. 394 * 395 * For a SOCK_STREAM socketpair a listener is needed - in that case this 396 * routine will create a new file descriptor as part of accepting the 397 * connection. The library socketpair() will check if svs[2] has changed 398 * in which case it will close the changed fd. 399 * 400 * Note that this code could use the TPI feature of accepting the connection 401 * on the listening endpoint. However, that would require significant changes 402 * to soaccept. 
403 */ 404 int 405 so_socketpair(int sv[2]) 406 { 407 int svs[2]; 408 struct sonode *so1, *so2; 409 int error; 410 struct sockaddr_ux *name; 411 size_t namelen; 412 413 dprint(1, ("so_socketpair(%p)\n", sv)); 414 415 error = useracc(sv, sizeof (svs), B_WRITE); 416 if (error && do_useracc) 417 return (set_errno(EFAULT)); 418 419 if (copyin(sv, svs, sizeof (svs))) 420 return (set_errno(EFAULT)); 421 422 if ((so1 = getsonode(svs[0], &error, NULL)) == NULL) 423 return (set_errno(error)); 424 425 if ((so2 = getsonode(svs[1], &error, NULL)) == NULL) { 426 releasef(svs[0]); 427 return (set_errno(error)); 428 } 429 430 if (so1->so_family != AF_UNIX || so2->so_family != AF_UNIX) { 431 error = EOPNOTSUPP; 432 goto done; 433 } 434 435 /* 436 * The code below makes assumptions about the "sockfs" implementation. 437 * So make sure that the correct implementation is really used. 438 */ 439 ASSERT(so1->so_ops == &sotpi_sonodeops); 440 ASSERT(so2->so_ops == &sotpi_sonodeops); 441 442 if (so1->so_type == SOCK_DGRAM) { 443 /* 444 * Bind both sockets and connect them with each other. 445 * Need to allocate name/namelen for soconnect. 
446 */ 447 error = SOP_BIND(so1, NULL, 0, _SOBIND_UNSPEC); 448 if (error) { 449 eprintsoline(so1, error); 450 goto done; 451 } 452 error = SOP_BIND(so2, NULL, 0, _SOBIND_UNSPEC); 453 if (error) { 454 eprintsoline(so2, error); 455 goto done; 456 } 457 namelen = sizeof (struct sockaddr_ux); 458 name = kmem_alloc(namelen, KM_SLEEP); 459 name->sou_family = AF_UNIX; 460 name->sou_addr = so2->so_ux_laddr; 461 error = SOP_CONNECT(so1, 462 (struct sockaddr *)name, 463 (socklen_t)namelen, 464 0, _SOCONNECT_NOXLATE); 465 if (error) { 466 kmem_free(name, namelen); 467 eprintsoline(so1, error); 468 goto done; 469 } 470 name->sou_addr = so1->so_ux_laddr; 471 error = SOP_CONNECT(so2, 472 (struct sockaddr *)name, 473 (socklen_t)namelen, 474 0, _SOCONNECT_NOXLATE); 475 kmem_free(name, namelen); 476 if (error) { 477 eprintsoline(so2, error); 478 goto done; 479 } 480 releasef(svs[0]); 481 releasef(svs[1]); 482 } else { 483 /* 484 * Bind both sockets, with so1 being a listener. 485 * Connect so2 to so1 - nonblocking to avoid waiting for 486 * soaccept to complete. 487 * Accept a connection on so1. Pass out the new fd as sv[0]. 488 * The library will detect the changed fd and close 489 * the original one. 490 */ 491 struct sonode *nso; 492 struct vnode *nvp; 493 struct file *nfp; 494 int nfd; 495 496 /* 497 * We could simply call SOP_LISTEN() here (which would do the 498 * binding automatically) if the code didn't rely on passing 499 * _SOBIND_NOXLATE to the TPI implementation of SOP_BIND(). 
500 */ 501 error = SOP_BIND(so1, NULL, 0, _SOBIND_UNSPEC|_SOBIND_NOXLATE| 502 _SOBIND_LISTEN|_SOBIND_SOCKETPAIR); 503 if (error) { 504 eprintsoline(so1, error); 505 goto done; 506 } 507 error = SOP_BIND(so2, NULL, 0, _SOBIND_UNSPEC); 508 if (error) { 509 eprintsoline(so2, error); 510 goto done; 511 } 512 513 namelen = sizeof (struct sockaddr_ux); 514 name = kmem_alloc(namelen, KM_SLEEP); 515 name->sou_family = AF_UNIX; 516 name->sou_addr = so1->so_ux_laddr; 517 error = SOP_CONNECT(so2, 518 (struct sockaddr *)name, 519 (socklen_t)namelen, 520 FNONBLOCK, _SOCONNECT_NOXLATE); 521 kmem_free(name, namelen); 522 if (error) { 523 if (error != EINPROGRESS) { 524 eprintsoline(so2, error); 525 goto done; 526 } 527 } 528 529 error = SOP_ACCEPT(so1, 0, &nso); 530 if (error) { 531 eprintsoline(so1, error); 532 goto done; 533 } 534 535 /* wait for so2 being SS_CONNECTED ignoring signals */ 536 mutex_enter(&so2->so_lock); 537 error = sowaitconnected(so2, 0, 1); 538 mutex_exit(&so2->so_lock); 539 nvp = SOTOV(nso); 540 if (error != 0) { 541 (void) VOP_CLOSE(nvp, 0, 1, 0, CRED()); 542 VN_RELE(nvp); 543 eprintsoline(so2, error); 544 goto done; 545 } 546 547 if (error = falloc(nvp, FWRITE|FREAD, &nfp, &nfd)) { 548 (void) VOP_CLOSE(nvp, 0, 1, 0, CRED()); 549 VN_RELE(nvp); 550 eprintsoline(nso, error); 551 goto done; 552 } 553 /* 554 * fill in the entries that falloc reserved 555 */ 556 mutex_exit(&nfp->f_tlock); 557 setf(nfd, nfp); 558 559 releasef(svs[0]); 560 releasef(svs[1]); 561 svs[0] = nfd; 562 563 /* 564 * The socketpair library routine will close the original 565 * svs[0] when this code passes out a different file 566 * descriptor. 
567 */ 568 if (copyout(svs, sv, sizeof (svs))) { 569 (void) closeandsetf(nfd, NULL); 570 eprintline(EFAULT); 571 return (set_errno(EFAULT)); 572 } 573 } 574 return (0); 575 576 done: 577 releasef(svs[0]); 578 releasef(svs[1]); 579 return (set_errno(error)); 580 } 581 582 int 583 bind(int sock, struct sockaddr *name, socklen_t namelen, int version) 584 { 585 struct sonode *so; 586 int error; 587 588 dprint(1, ("bind(%d, %p, %d)\n", 589 sock, name, namelen)); 590 591 if ((so = getsonode(sock, &error, NULL)) == NULL) 592 return (set_errno(error)); 593 594 /* Allocate and copyin name */ 595 /* 596 * X/Open test does not expect EFAULT with NULL name and non-zero 597 * namelen. 598 */ 599 if (name != NULL && namelen != 0) { 600 ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 601 name = copyin_name(so, name, &namelen, &error); 602 if (name == NULL) { 603 releasef(sock); 604 return (set_errno(error)); 605 } 606 } else { 607 name = NULL; 608 namelen = 0; 609 } 610 611 switch (version) { 612 default: 613 error = SOP_BIND(so, name, namelen, 0); 614 break; 615 case SOV_XPG4_2: 616 error = SOP_BIND(so, name, namelen, _SOBIND_XPG4_2); 617 break; 618 case SOV_SOCKBSD: 619 error = SOP_BIND(so, name, namelen, _SOBIND_SOCKBSD); 620 break; 621 } 622 done: 623 releasef(sock); 624 if (name != NULL) 625 kmem_free(name, (size_t)namelen); 626 627 if (error) 628 return (set_errno(error)); 629 return (0); 630 } 631 632 /* ARGSUSED2 */ 633 int 634 listen(int sock, int backlog, int version) 635 { 636 struct sonode *so; 637 int error; 638 639 dprint(1, ("listen(%d, %d)\n", 640 sock, backlog)); 641 642 if ((so = getsonode(sock, &error, NULL)) == NULL) 643 return (set_errno(error)); 644 645 error = SOP_LISTEN(so, backlog); 646 647 releasef(sock); 648 if (error) 649 return (set_errno(error)); 650 return (0); 651 } 652 653 /*ARGSUSED3*/ 654 int 655 accept(int sock, struct sockaddr *name, socklen_t *namelenp, int version) 656 { 657 struct sonode *so; 658 file_t *fp; 659 int error; 660 socklen_t namelen; 661 
struct sonode *nso; 662 struct vnode *nvp; 663 struct file *nfp; 664 int nfd; 665 666 dprint(1, ("accept(%d, %p, %p)\n", 667 sock, name, namelenp)); 668 669 if ((so = getsonode(sock, &error, &fp)) == NULL) 670 return (set_errno(error)); 671 672 if (name != NULL) { 673 ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 674 if (copyin(namelenp, &namelen, sizeof (namelen))) { 675 releasef(sock); 676 return (set_errno(EFAULT)); 677 } 678 if (namelen != 0) { 679 error = useracc(name, (size_t)namelen, B_WRITE); 680 if (error && do_useracc) { 681 releasef(sock); 682 return (set_errno(EFAULT)); 683 } 684 } else 685 name = NULL; 686 } else { 687 namelen = 0; 688 } 689 690 /* 691 * Allocate the user fd before SOP_ACCEPT() in order to 692 * catch EMFILE errors before calling SOP_ACCEPT(). 693 */ 694 if ((nfd = ufalloc(0)) == -1) { 695 eprintsoline(so, EMFILE); 696 releasef(sock); 697 return (set_errno(EMFILE)); 698 } 699 error = SOP_ACCEPT(so, fp->f_flag, &nso); 700 releasef(sock); 701 if (error) { 702 setf(nfd, NULL); 703 return (set_errno(error)); 704 } 705 706 nvp = SOTOV(nso); 707 708 /* 709 * so_faddr_sa can not go away even though we are not holding so_lock. 710 * However, in theory its content could change from underneath us. 711 * But this is not possible in practice since it can only 712 * change due to either some socket system call 713 * or due to a T_CONN_CON being received from the stream head. 714 * Since the falloc/setf have not yet been done no thread 715 * can do any system call on nso and T_CONN_CON can not arrive 716 * on a socket that is already connected. 717 * Thus there is no reason to hold so_lock here. 718 * 719 * SOP_ACCEPT() is required to have set the valid bit for the faddr, 720 * but it could be instantly cleared by a disconnect from the transport. 721 * For that reason we ignore it here. 
722 */ 723 ASSERT(MUTEX_NOT_HELD(&nso->so_lock)); 724 error = copyout_name(name, namelen, namelenp, 725 nso->so_faddr_sa, (socklen_t)nso->so_faddr_len); 726 if (error) { 727 setf(nfd, NULL); 728 (void) VOP_CLOSE(nvp, 0, 1, 0, CRED()); 729 VN_RELE(nvp); 730 return (set_errno(error)); 731 } 732 if (error = falloc(NULL, FWRITE|FREAD, &nfp, NULL)) { 733 setf(nfd, NULL); 734 (void) VOP_CLOSE(nvp, 0, 1, 0, CRED()); 735 VN_RELE(nvp); 736 eprintsoline(so, error); 737 return (set_errno(error)); 738 } 739 /* 740 * fill in the entries that falloc reserved 741 */ 742 nfp->f_vnode = nvp; 743 mutex_exit(&nfp->f_tlock); 744 setf(nfd, nfp); 745 746 /* 747 * Copy FNDELAY and FNONBLOCK from listener to acceptor 748 */ 749 if (so->so_state & (SS_NDELAY|SS_NONBLOCK)) { 750 uint_t oflag = nfp->f_flag; 751 int arg = 0; 752 753 if (so->so_state & SS_NONBLOCK) 754 arg |= FNONBLOCK; 755 else if (so->so_state & SS_NDELAY) 756 arg |= FNDELAY; 757 758 /* 759 * This code is a simplification of the F_SETFL code in fcntl() 760 * Ignore any errors from VOP_SETFL. 
761 */ 762 if ((error = VOP_SETFL(nvp, oflag, arg, nfp->f_cred)) != 0) { 763 eprintsoline(so, error); 764 error = 0; 765 } else { 766 mutex_enter(&nfp->f_tlock); 767 nfp->f_flag &= ~FMASK | (FREAD|FWRITE); 768 nfp->f_flag |= arg; 769 mutex_exit(&nfp->f_tlock); 770 } 771 } 772 return (nfd); 773 } 774 775 int 776 connect(int sock, struct sockaddr *name, socklen_t namelen, int version) 777 { 778 struct sonode *so; 779 file_t *fp; 780 int error; 781 782 dprint(1, ("connect(%d, %p, %d)\n", 783 sock, name, namelen)); 784 785 if ((so = getsonode(sock, &error, &fp)) == NULL) 786 return (set_errno(error)); 787 788 /* Allocate and copyin name */ 789 if (namelen != 0) { 790 ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 791 name = copyin_name(so, name, &namelen, &error); 792 if (name == NULL) { 793 releasef(sock); 794 return (set_errno(error)); 795 } 796 } else 797 name = NULL; 798 799 error = SOP_CONNECT(so, name, namelen, fp->f_flag, 800 (version != SOV_XPG4_2) ? 0 : _SOCONNECT_XPG4_2); 801 releasef(sock); 802 if (name) 803 kmem_free(name, (size_t)namelen); 804 if (error) 805 return (set_errno(error)); 806 return (0); 807 } 808 809 /*ARGSUSED2*/ 810 int 811 shutdown(int sock, int how, int version) 812 { 813 struct sonode *so; 814 int error; 815 816 dprint(1, ("shutdown(%d, %d)\n", 817 sock, how)); 818 819 if ((so = getsonode(sock, &error, NULL)) == NULL) 820 return (set_errno(error)); 821 822 error = SOP_SHUTDOWN(so, how); 823 824 releasef(sock); 825 if (error) 826 return (set_errno(error)); 827 return (0); 828 } 829 830 /* 831 * Common receive routine. 
832 */ 833 static ssize_t 834 recvit(int sock, 835 struct nmsghdr *msg, 836 struct uio *uiop, 837 int flags, 838 socklen_t *namelenp, 839 socklen_t *controllenp, 840 int *flagsp) 841 { 842 struct sonode *so; 843 file_t *fp; 844 void *name; 845 socklen_t namelen; 846 void *control; 847 socklen_t controllen; 848 ssize_t len; 849 int error; 850 851 if ((so = getsonode(sock, &error, &fp)) == NULL) 852 return (set_errno(error)); 853 854 len = uiop->uio_resid; 855 uiop->uio_fmode = fp->f_flag; 856 uiop->uio_extflg = UIO_COPY_CACHED; 857 858 name = msg->msg_name; 859 namelen = msg->msg_namelen; 860 control = msg->msg_control; 861 controllen = msg->msg_controllen; 862 863 msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL | 864 MSG_DONTWAIT | MSG_XPG4_2); 865 866 error = SOP_RECVMSG(so, msg, uiop); 867 if (error) { 868 releasef(sock); 869 return (set_errno(error)); 870 } 871 lwp_stat_update(LWP_STAT_MSGRCV, 1); 872 so_update_attrs(so, SOACC); 873 releasef(sock); 874 875 error = copyout_name(name, namelen, namelenp, 876 msg->msg_name, msg->msg_namelen); 877 if (error) 878 goto err; 879 880 if (flagsp != NULL) { 881 /* 882 * Clear internal flag. 883 */ 884 msg->msg_flags &= ~MSG_XPG4_2; 885 886 /* 887 * Determine MSG_CTRUNC. sorecvmsg sets MSG_CTRUNC only 888 * when controllen is zero and there is control data to 889 * copy out. 890 */ 891 if (controllen != 0 && 892 (msg->msg_controllen > controllen || control == NULL)) { 893 dprint(1, ("recvit: CTRUNC %d %d %p\n", 894 msg->msg_controllen, controllen, control)); 895 896 msg->msg_flags |= MSG_CTRUNC; 897 } 898 if (copyout(&msg->msg_flags, flagsp, 899 sizeof (msg->msg_flags))) { 900 error = EFAULT; 901 goto err; 902 } 903 } 904 /* 905 * Note: This MUST be done last. There can be no "goto err" after this 906 * point since it could make so_closefds run twice on some part 907 * of the file descriptor array. 
908 */ 909 if (controllen != 0) { 910 if (!(flags & MSG_XPG4_2)) { 911 /* 912 * Good old msg_accrights can only return a multiple 913 * of 4 bytes. 914 */ 915 controllen &= ~((int)sizeof (uint32_t) - 1); 916 } 917 error = copyout_arg(control, controllen, controllenp, 918 msg->msg_control, msg->msg_controllen); 919 if (error) 920 goto err; 921 922 if (msg->msg_controllen > controllen || control == NULL) { 923 if (control == NULL) 924 controllen = 0; 925 so_closefds(msg->msg_control, msg->msg_controllen, 926 !(flags & MSG_XPG4_2), controllen); 927 } 928 } 929 if (msg->msg_namelen != 0) 930 kmem_free(msg->msg_name, (size_t)msg->msg_namelen); 931 if (msg->msg_controllen != 0) 932 kmem_free(msg->msg_control, (size_t)msg->msg_controllen); 933 return (len - uiop->uio_resid); 934 935 err: 936 /* 937 * If we fail and the control part contains file descriptors 938 * we have to close the fd's. 939 */ 940 if (msg->msg_controllen != 0) 941 so_closefds(msg->msg_control, msg->msg_controllen, 942 !(flags & MSG_XPG4_2), 0); 943 if (msg->msg_namelen != 0) 944 kmem_free(msg->msg_name, (size_t)msg->msg_namelen); 945 if (msg->msg_controllen != 0) 946 kmem_free(msg->msg_control, (size_t)msg->msg_controllen); 947 return (set_errno(error)); 948 } 949 950 /* 951 * Native system call 952 */ 953 ssize_t 954 recv(int sock, void *buffer, size_t len, int flags) 955 { 956 struct nmsghdr lmsg; 957 struct uio auio; 958 struct iovec aiov[1]; 959 960 dprint(1, ("recv(%d, %p, %ld, %d)\n", 961 sock, buffer, len, flags)); 962 963 if ((ssize_t)len < 0) { 964 return (set_errno(EINVAL)); 965 } 966 967 aiov[0].iov_base = buffer; 968 aiov[0].iov_len = len; 969 auio.uio_loffset = 0; 970 auio.uio_iov = aiov; 971 auio.uio_iovcnt = 1; 972 auio.uio_resid = len; 973 auio.uio_segflg = UIO_USERSPACE; 974 auio.uio_limit = 0; 975 976 lmsg.msg_namelen = 0; 977 lmsg.msg_controllen = 0; 978 lmsg.msg_flags = 0; 979 return (recvit(sock, &lmsg, &auio, flags, NULL, NULL, NULL)); 980 } 981 982 ssize_t 983 recvfrom(int sock, 
void *buffer, size_t len, int flags, 984 struct sockaddr *name, socklen_t *namelenp) 985 { 986 struct nmsghdr lmsg; 987 struct uio auio; 988 struct iovec aiov[1]; 989 990 dprint(1, ("recvfrom(%d, %p, %ld, %d, %p, %p)\n", 991 sock, buffer, len, flags, name, namelenp)); 992 993 if ((ssize_t)len < 0) { 994 return (set_errno(EINVAL)); 995 } 996 997 aiov[0].iov_base = buffer; 998 aiov[0].iov_len = len; 999 auio.uio_loffset = 0; 1000 auio.uio_iov = aiov; 1001 auio.uio_iovcnt = 1; 1002 auio.uio_resid = len; 1003 auio.uio_segflg = UIO_USERSPACE; 1004 auio.uio_limit = 0; 1005 1006 lmsg.msg_name = (char *)name; 1007 if (namelenp != NULL) { 1008 if (copyin(namelenp, &lmsg.msg_namelen, 1009 sizeof (lmsg.msg_namelen))) 1010 return (set_errno(EFAULT)); 1011 } else { 1012 lmsg.msg_namelen = 0; 1013 } 1014 lmsg.msg_controllen = 0; 1015 lmsg.msg_flags = 0; 1016 1017 return (recvit(sock, &lmsg, &auio, flags, namelenp, NULL, NULL)); 1018 } 1019 1020 /* 1021 * Uses the MSG_XPG4_2 flag to determine if the caller is using 1022 * struct omsghdr or struct nmsghdr. 1023 */ 1024 ssize_t 1025 recvmsg(int sock, struct nmsghdr *msg, int flags) 1026 { 1027 STRUCT_DECL(nmsghdr, u_lmsg); 1028 STRUCT_HANDLE(nmsghdr, umsgptr); 1029 struct nmsghdr lmsg; 1030 struct uio auio; 1031 struct iovec aiov[MSG_MAXIOVLEN]; 1032 int iovcnt; 1033 ssize_t len; 1034 int i; 1035 int *flagsp; 1036 model_t model; 1037 1038 dprint(1, ("recvmsg(%d, %p, %d)\n", 1039 sock, msg, flags)); 1040 1041 model = get_udatamodel(); 1042 STRUCT_INIT(u_lmsg, model); 1043 STRUCT_SET_HANDLE(umsgptr, model, msg); 1044 1045 if (flags & MSG_XPG4_2) { 1046 if (copyin(msg, STRUCT_BUF(u_lmsg), STRUCT_SIZE(u_lmsg))) 1047 return (set_errno(EFAULT)); 1048 flagsp = STRUCT_FADDR(umsgptr, msg_flags); 1049 } else { 1050 /* 1051 * Assumes that nmsghdr and omsghdr are identically shaped 1052 * except for the added msg_flags field. 
1053 */ 1054 if (copyin(msg, STRUCT_BUF(u_lmsg), 1055 SIZEOF_STRUCT(omsghdr, model))) 1056 return (set_errno(EFAULT)); 1057 STRUCT_FSET(u_lmsg, msg_flags, 0); 1058 flagsp = NULL; 1059 } 1060 1061 /* 1062 * Code below us will kmem_alloc memory and hang it 1063 * off msg_control and msg_name fields. This forces 1064 * us to copy the structure to its native form. 1065 */ 1066 lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name); 1067 lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen); 1068 lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov); 1069 lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen); 1070 lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control); 1071 lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen); 1072 lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags); 1073 1074 iovcnt = lmsg.msg_iovlen; 1075 1076 if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) { 1077 return (set_errno(EMSGSIZE)); 1078 } 1079 1080 #ifdef _SYSCALL32_IMPL 1081 /* 1082 * 32-bit callers need to have their iovec expanded, while ensuring 1083 * that they can't move more than 2Gbytes of data in a single call. 
1084 */ 1085 if (model == DATAMODEL_ILP32) { 1086 struct iovec32 aiov32[MSG_MAXIOVLEN]; 1087 ssize32_t count32; 1088 1089 if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, 1090 iovcnt * sizeof (struct iovec32))) 1091 return (set_errno(EFAULT)); 1092 1093 count32 = 0; 1094 for (i = 0; i < iovcnt; i++) { 1095 ssize32_t iovlen32; 1096 1097 iovlen32 = aiov32[i].iov_len; 1098 count32 += iovlen32; 1099 if (iovlen32 < 0 || count32 < 0) 1100 return (set_errno(EINVAL)); 1101 aiov[i].iov_len = iovlen32; 1102 aiov[i].iov_base = 1103 (caddr_t)(uintptr_t)aiov32[i].iov_base; 1104 } 1105 } else 1106 #endif /* _SYSCALL32_IMPL */ 1107 if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) { 1108 return (set_errno(EFAULT)); 1109 } 1110 len = 0; 1111 for (i = 0; i < iovcnt; i++) { 1112 ssize_t iovlen = aiov[i].iov_len; 1113 len += iovlen; 1114 if (iovlen < 0 || len < 0) { 1115 return (set_errno(EINVAL)); 1116 } 1117 } 1118 auio.uio_loffset = 0; 1119 auio.uio_iov = aiov; 1120 auio.uio_iovcnt = iovcnt; 1121 auio.uio_resid = len; 1122 auio.uio_segflg = UIO_USERSPACE; 1123 auio.uio_limit = 0; 1124 1125 if (lmsg.msg_control != NULL && 1126 (do_useracc == 0 || 1127 useracc(lmsg.msg_control, lmsg.msg_controllen, 1128 B_WRITE) != 0)) { 1129 return (set_errno(EFAULT)); 1130 } 1131 1132 return (recvit(sock, &lmsg, &auio, flags, 1133 STRUCT_FADDR(umsgptr, msg_namelen), 1134 STRUCT_FADDR(umsgptr, msg_controllen), flagsp)); 1135 } 1136 1137 /* 1138 * Common send function. 
1139 */ 1140 static ssize_t 1141 sendit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags) 1142 { 1143 struct sonode *so; 1144 file_t *fp; 1145 void *name; 1146 socklen_t namelen; 1147 void *control; 1148 socklen_t controllen; 1149 ssize_t len; 1150 int error; 1151 1152 if ((so = getsonode(sock, &error, &fp)) == NULL) 1153 return (set_errno(error)); 1154 1155 uiop->uio_fmode = fp->f_flag; 1156 1157 if (so->so_family == AF_UNIX) 1158 uiop->uio_extflg = UIO_COPY_CACHED; 1159 else 1160 uiop->uio_extflg = UIO_COPY_DEFAULT; 1161 1162 /* Allocate and copyin name and control */ 1163 name = msg->msg_name; 1164 namelen = msg->msg_namelen; 1165 if (name != NULL && namelen != 0) { 1166 ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 1167 name = copyin_name(so, 1168 (struct sockaddr *)name, 1169 &namelen, &error); 1170 if (name == NULL) 1171 goto done3; 1172 /* copyin_name null terminates addresses for AF_UNIX */ 1173 msg->msg_namelen = namelen; 1174 msg->msg_name = name; 1175 } else { 1176 msg->msg_name = name = NULL; 1177 msg->msg_namelen = namelen = 0; 1178 } 1179 1180 control = msg->msg_control; 1181 controllen = msg->msg_controllen; 1182 if ((control != NULL) && (controllen != 0)) { 1183 /* 1184 * Verify that the length is not excessive to prevent 1185 * an application from consuming all of kernel memory. 
1186 */ 1187 if (controllen > SO_MAXARGSIZE) { 1188 error = EINVAL; 1189 goto done2; 1190 } 1191 control = kmem_alloc(controllen, KM_SLEEP); 1192 1193 ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 1194 if (copyin(msg->msg_control, control, controllen)) { 1195 error = EFAULT; 1196 goto done1; 1197 } 1198 msg->msg_control = control; 1199 } else { 1200 msg->msg_control = control = NULL; 1201 msg->msg_controllen = controllen = 0; 1202 } 1203 1204 len = uiop->uio_resid; 1205 msg->msg_flags = flags; 1206 1207 error = SOP_SENDMSG(so, msg, uiop); 1208 done1: 1209 if (control != NULL) 1210 kmem_free(control, controllen); 1211 done2: 1212 if (name != NULL) 1213 kmem_free(name, namelen); 1214 done3: 1215 if (error != 0) { 1216 releasef(sock); 1217 return (set_errno(error)); 1218 } 1219 lwp_stat_update(LWP_STAT_MSGSND, 1); 1220 so_update_attrs(so, SOMOD); 1221 releasef(sock); 1222 return (len - uiop->uio_resid); 1223 } 1224 1225 /* 1226 * Native system call 1227 */ 1228 ssize_t 1229 send(int sock, void *buffer, size_t len, int flags) 1230 { 1231 struct nmsghdr lmsg; 1232 struct uio auio; 1233 struct iovec aiov[1]; 1234 1235 dprint(1, ("send(%d, %p, %ld, %d)\n", 1236 sock, buffer, len, flags)); 1237 1238 if ((ssize_t)len < 0) { 1239 return (set_errno(EINVAL)); 1240 } 1241 1242 aiov[0].iov_base = buffer; 1243 aiov[0].iov_len = len; 1244 auio.uio_loffset = 0; 1245 auio.uio_iov = aiov; 1246 auio.uio_iovcnt = 1; 1247 auio.uio_resid = len; 1248 auio.uio_segflg = UIO_USERSPACE; 1249 auio.uio_limit = 0; 1250 1251 lmsg.msg_name = NULL; 1252 lmsg.msg_control = NULL; 1253 if (!(flags & MSG_XPG4_2)) { 1254 /* 1255 * In order to be compatible with the libsocket/sockmod 1256 * implementation we set EOR for all send* calls. 1257 */ 1258 flags |= MSG_EOR; 1259 } 1260 return (sendit(sock, &lmsg, &auio, flags)); 1261 } 1262 1263 /* 1264 * Uses the MSG_XPG4_2 flag to determine if the caller is using 1265 * struct omsghdr or struct nmsghdr. 
1266 */ 1267 ssize_t 1268 sendmsg(int sock, struct nmsghdr *msg, int flags) 1269 { 1270 struct nmsghdr lmsg; 1271 STRUCT_DECL(nmsghdr, u_lmsg); 1272 struct uio auio; 1273 struct iovec aiov[MSG_MAXIOVLEN]; 1274 int iovcnt; 1275 ssize_t len; 1276 int i; 1277 model_t model; 1278 1279 dprint(1, ("sendmsg(%d, %p, %d)\n", sock, msg, flags)); 1280 1281 model = get_udatamodel(); 1282 STRUCT_INIT(u_lmsg, model); 1283 1284 if (flags & MSG_XPG4_2) { 1285 if (copyin(msg, (char *)STRUCT_BUF(u_lmsg), 1286 STRUCT_SIZE(u_lmsg))) 1287 return (set_errno(EFAULT)); 1288 } else { 1289 /* 1290 * Assumes that nmsghdr and omsghdr are identically shaped 1291 * except for the added msg_flags field. 1292 */ 1293 if (copyin(msg, (char *)STRUCT_BUF(u_lmsg), 1294 SIZEOF_STRUCT(omsghdr, model))) 1295 return (set_errno(EFAULT)); 1296 /* 1297 * In order to be compatible with the libsocket/sockmod 1298 * implementation we set EOR for all send* calls. 1299 */ 1300 flags |= MSG_EOR; 1301 } 1302 1303 /* 1304 * Code below us will kmem_alloc memory and hang it 1305 * off msg_control and msg_name fields. This forces 1306 * us to copy the structure to its native form. 1307 */ 1308 lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name); 1309 lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen); 1310 lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov); 1311 lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen); 1312 lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control); 1313 lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen); 1314 lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags); 1315 1316 iovcnt = lmsg.msg_iovlen; 1317 1318 if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) { 1319 /* 1320 * Unless this is XPG 4.2 we allow iovcnt == 0 to 1321 * be compatible with SunOS 4.X and 4.4BSD. 
1322 */ 1323 if (iovcnt != 0 || (flags & MSG_XPG4_2)) 1324 return (set_errno(EMSGSIZE)); 1325 } 1326 1327 #ifdef _SYSCALL32_IMPL 1328 /* 1329 * 32-bit callers need to have their iovec expanded, while ensuring 1330 * that they can't move more than 2Gbytes of data in a single call. 1331 */ 1332 if (model == DATAMODEL_ILP32) { 1333 struct iovec32 aiov32[MSG_MAXIOVLEN]; 1334 ssize32_t count32; 1335 1336 if (iovcnt != 0 && 1337 copyin((struct iovec32 *)lmsg.msg_iov, aiov32, 1338 iovcnt * sizeof (struct iovec32))) 1339 return (set_errno(EFAULT)); 1340 1341 count32 = 0; 1342 for (i = 0; i < iovcnt; i++) { 1343 ssize32_t iovlen32; 1344 1345 iovlen32 = aiov32[i].iov_len; 1346 count32 += iovlen32; 1347 if (iovlen32 < 0 || count32 < 0) 1348 return (set_errno(EINVAL)); 1349 aiov[i].iov_len = iovlen32; 1350 aiov[i].iov_base = 1351 (caddr_t)(uintptr_t)aiov32[i].iov_base; 1352 } 1353 } else 1354 #endif /* _SYSCALL32_IMPL */ 1355 if (iovcnt != 0 && 1356 copyin(lmsg.msg_iov, aiov, 1357 (unsigned)iovcnt * sizeof (struct iovec))) { 1358 return (set_errno(EFAULT)); 1359 } 1360 len = 0; 1361 for (i = 0; i < iovcnt; i++) { 1362 ssize_t iovlen = aiov[i].iov_len; 1363 len += iovlen; 1364 if (iovlen < 0 || len < 0) { 1365 return (set_errno(EINVAL)); 1366 } 1367 } 1368 auio.uio_loffset = 0; 1369 auio.uio_iov = aiov; 1370 auio.uio_iovcnt = iovcnt; 1371 auio.uio_resid = len; 1372 auio.uio_segflg = UIO_USERSPACE; 1373 auio.uio_limit = 0; 1374 1375 return (sendit(sock, &lmsg, &auio, flags)); 1376 } 1377 1378 ssize_t 1379 sendto(int sock, void *buffer, size_t len, int flags, 1380 struct sockaddr *name, socklen_t namelen) 1381 { 1382 struct nmsghdr lmsg; 1383 struct uio auio; 1384 struct iovec aiov[1]; 1385 1386 dprint(1, ("sendto(%d, %p, %ld, %d, %p, %d)\n", 1387 sock, buffer, len, flags, name, namelen)); 1388 1389 if ((ssize_t)len < 0) { 1390 return (set_errno(EINVAL)); 1391 } 1392 1393 aiov[0].iov_base = buffer; 1394 aiov[0].iov_len = len; 1395 auio.uio_loffset = 0; 1396 auio.uio_iov = aiov; 
1397 auio.uio_iovcnt = 1; 1398 auio.uio_resid = len; 1399 auio.uio_segflg = UIO_USERSPACE; 1400 auio.uio_limit = 0; 1401 1402 lmsg.msg_name = (char *)name; 1403 lmsg.msg_namelen = namelen; 1404 lmsg.msg_control = NULL; 1405 if (!(flags & MSG_XPG4_2)) { 1406 /* 1407 * In order to be compatible with the libsocket/sockmod 1408 * implementation we set EOR for all send* calls. 1409 */ 1410 flags |= MSG_EOR; 1411 } 1412 return (sendit(sock, &lmsg, &auio, flags)); 1413 } 1414 1415 /*ARGSUSED3*/ 1416 int 1417 getpeername(int sock, struct sockaddr *name, socklen_t *namelenp, int version) 1418 { 1419 struct sonode *so; 1420 int error; 1421 socklen_t namelen; 1422 union { 1423 struct sockaddr_in sin; 1424 struct sockaddr_in6 sin6; 1425 } sin; /* Temporary buffer, common case */ 1426 void *addr; /* Temporary buffer, uncommon case */ 1427 socklen_t addrlen, size; 1428 1429 dprint(1, ("getpeername(%d, %p, %p)\n", 1430 sock, name, namelenp)); 1431 1432 if ((so = getsonode(sock, &error, NULL)) == NULL) 1433 goto bad; 1434 1435 ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 1436 if (copyin(namelenp, &namelen, sizeof (namelen)) || 1437 (name == NULL && namelen != 0)) { 1438 error = EFAULT; 1439 goto rel_out; 1440 } 1441 /* 1442 * If a connect or accept has been done, unless we're an Xnet socket, 1443 * the remote address has already been updated in so_faddr_sa. 
1444 */ 1445 if (so->so_version != SOV_SOCKSTREAM && so->so_version != SOV_SOCKBSD || 1446 !(so->so_state & SS_FADDR_VALID)) { 1447 if ((error = SOP_GETPEERNAME(so)) != 0) 1448 goto rel_out; 1449 } 1450 1451 if (so->so_faddr_maxlen <= sizeof (sin)) { 1452 size = 0; 1453 addr = &sin; 1454 } else { 1455 /* 1456 * Allocate temporary to avoid holding so_lock across 1457 * copyout 1458 */ 1459 size = so->so_faddr_maxlen; 1460 addr = kmem_alloc(size, KM_SLEEP); 1461 } 1462 /* Prevent so_faddr_sa/len from changing while accessed */ 1463 mutex_enter(&so->so_lock); 1464 if (!(so->so_state & SS_ISCONNECTED)) { 1465 mutex_exit(&so->so_lock); 1466 error = ENOTCONN; 1467 goto free_out; 1468 } 1469 addrlen = so->so_faddr_len; 1470 bcopy(so->so_faddr_sa, addr, addrlen); 1471 mutex_exit(&so->so_lock); 1472 1473 ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 1474 error = copyout_name(name, namelen, namelenp, addr, 1475 (so->so_state & SS_FADDR_NOXLATE) ? 0 : addrlen); 1476 free_out: 1477 if (size != 0) 1478 kmem_free(addr, size); 1479 rel_out: 1480 releasef(sock); 1481 bad: return (error != 0 ? 
set_errno(error) : 0); 1482 } 1483 1484 /*ARGSUSED3*/ 1485 int 1486 getsockname(int sock, struct sockaddr *name, 1487 socklen_t *namelenp, int version) 1488 { 1489 struct sonode *so; 1490 int error; 1491 socklen_t namelen; 1492 union { 1493 struct sockaddr_in sin; 1494 struct sockaddr_in6 sin6; 1495 } sin; /* Temporary buffer, common case */ 1496 void *addr; /* Temporary buffer, uncommon case */ 1497 socklen_t addrlen, size; 1498 1499 dprint(1, ("getsockname(%d, %p, %p)\n", 1500 sock, name, namelenp)); 1501 1502 if ((so = getsonode(sock, &error, NULL)) == NULL) 1503 goto bad; 1504 1505 ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 1506 if (copyin(namelenp, &namelen, sizeof (namelen)) || 1507 (name == NULL && namelen != 0)) { 1508 error = EFAULT; 1509 goto rel_out; 1510 } 1511 1512 /* 1513 * If a bind or accept has been done, unless we're an Xnet endpoint, 1514 * the local address has already been updated in so_laddr_sa. 1515 */ 1516 if ((so->so_version != SOV_SOCKSTREAM && 1517 so->so_version != SOV_SOCKBSD) || 1518 !(so->so_state & SS_LADDR_VALID)) { 1519 if ((error = SOP_GETSOCKNAME(so)) != 0) 1520 goto rel_out; 1521 } 1522 1523 if (so->so_laddr_maxlen <= sizeof (sin)) { 1524 size = 0; 1525 addr = &sin; 1526 } else { 1527 /* 1528 * Allocate temporary to avoid holding so_lock across 1529 * copyout 1530 */ 1531 size = so->so_laddr_maxlen; 1532 addr = kmem_alloc(size, KM_SLEEP); 1533 } 1534 /* Prevent so_laddr_sa/len from changing while accessed */ 1535 mutex_enter(&so->so_lock); 1536 addrlen = so->so_laddr_len; 1537 bcopy(so->so_laddr_sa, addr, addrlen); 1538 mutex_exit(&so->so_lock); 1539 1540 ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 1541 error = copyout_name(name, namelen, namelenp, 1542 addr, addrlen); 1543 if (size != 0) 1544 kmem_free(addr, size); 1545 rel_out: 1546 releasef(sock); 1547 bad: return (error != 0 ? 
set_errno(error) : 0); 1548 } 1549 1550 /*ARGSUSED5*/ 1551 int 1552 getsockopt(int sock, 1553 int level, 1554 int option_name, 1555 void *option_value, 1556 socklen_t *option_lenp, 1557 int version) 1558 { 1559 struct sonode *so; 1560 socklen_t optlen, optlen_res; 1561 void *optval; 1562 int error; 1563 1564 dprint(1, ("getsockopt(%d, %d, %d, %p, %p)\n", 1565 sock, level, option_name, option_value, option_lenp)); 1566 1567 if ((so = getsonode(sock, &error, NULL)) == NULL) 1568 return (set_errno(error)); 1569 1570 ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 1571 if (copyin(option_lenp, &optlen, sizeof (optlen))) { 1572 releasef(sock); 1573 return (set_errno(EFAULT)); 1574 } 1575 /* 1576 * Verify that the length is not excessive to prevent 1577 * an application from consuming all of kernel memory. 1578 */ 1579 if (optlen > SO_MAXARGSIZE) { 1580 error = EINVAL; 1581 releasef(sock); 1582 return (set_errno(error)); 1583 } 1584 optval = kmem_alloc(optlen, KM_SLEEP); 1585 optlen_res = optlen; 1586 error = SOP_GETSOCKOPT(so, level, option_name, optval, 1587 &optlen_res, (version != SOV_XPG4_2) ? 
0 : _SOGETSOCKOPT_XPG4_2); 1588 releasef(sock); 1589 if (error) { 1590 kmem_free(optval, optlen); 1591 return (set_errno(error)); 1592 } 1593 error = copyout_arg(option_value, optlen, option_lenp, 1594 optval, optlen_res); 1595 kmem_free(optval, optlen); 1596 if (error) 1597 return (set_errno(error)); 1598 return (0); 1599 } 1600 1601 /*ARGSUSED5*/ 1602 int 1603 setsockopt(int sock, 1604 int level, 1605 int option_name, 1606 void *option_value, 1607 socklen_t option_len, 1608 int version) 1609 { 1610 struct sonode *so; 1611 intptr_t buffer[2]; 1612 void *optval = NULL; 1613 int error; 1614 1615 dprint(1, ("setsockopt(%d, %d, %d, %p, %d)\n", 1616 sock, level, option_name, option_value, option_len)); 1617 1618 if ((so = getsonode(sock, &error, NULL)) == NULL) 1619 return (set_errno(error)); 1620 1621 if (option_value != NULL) { 1622 if (option_len != 0) { 1623 /* 1624 * Verify that the length is not excessive to prevent 1625 * an application from consuming all of kernel memory. 1626 */ 1627 if (option_len > SO_MAXARGSIZE) { 1628 error = EINVAL; 1629 goto done2; 1630 } 1631 optval = option_len <= sizeof (buffer) ? 1632 &buffer : kmem_alloc((size_t)option_len, KM_SLEEP); 1633 ASSERT(MUTEX_NOT_HELD(&so->so_lock)); 1634 if (copyin(option_value, optval, (size_t)option_len)) { 1635 error = EFAULT; 1636 goto done1; 1637 } 1638 } 1639 } else 1640 option_len = 0; 1641 1642 error = SOP_SETSOCKOPT(so, level, option_name, optval, 1643 (t_uscalar_t)option_len); 1644 done1: 1645 if (optval != buffer) 1646 kmem_free(optval, (size_t)option_len); 1647 done2: 1648 releasef(sock); 1649 if (error) 1650 return (set_errno(error)); 1651 return (0); 1652 } 1653 1654 /* 1655 * Add config info when devpath is non-NULL; delete info when devpath is NULL. 1656 * devpath is a user address. 
1657 */ 1658 int 1659 sockconfig(int domain, int type, int protocol, char *devpath) 1660 { 1661 char *kdevpath; /* Copied in devpath string */ 1662 size_t kdevpathlen; 1663 int error = 0; 1664 1665 dprint(1, ("sockconfig(%d, %d, %d, %p)\n", 1666 domain, type, protocol, devpath)); 1667 1668 if (secpolicy_net_config(CRED(), B_FALSE) != 0) 1669 return (set_errno(EPERM)); 1670 1671 if (devpath == NULL) { 1672 /* Deleting an entry */ 1673 kdevpath = NULL; 1674 kdevpathlen = 0; 1675 } else { 1676 /* 1677 * Adding an entry. 1678 * Copyin the devpath. 1679 * This also makes it possible to check for too long pathnames. 1680 * Compress the space needed for the devpath before passing it 1681 * to soconfig - soconfig will store the string until 1682 * the configuration is removed. 1683 */ 1684 char *buf; 1685 1686 buf = kmem_alloc(MAXPATHLEN, KM_SLEEP); 1687 if ((error = copyinstr(devpath, buf, MAXPATHLEN, 1688 &kdevpathlen)) != 0) { 1689 kmem_free(buf, MAXPATHLEN); 1690 goto done; 1691 } 1692 1693 kdevpath = kmem_alloc(kdevpathlen, KM_SLEEP); 1694 bcopy(buf, kdevpath, kdevpathlen); 1695 kdevpath[kdevpathlen - 1] = '\0'; 1696 1697 kmem_free(buf, MAXPATHLEN); 1698 } 1699 error = soconfig(domain, type, protocol, kdevpath, (int)kdevpathlen); 1700 done: 1701 if (error) { 1702 eprintline(error); 1703 return (set_errno(error)); 1704 } 1705 return (0); 1706 } 1707 1708 1709 /* 1710 * Sendfile is implemented through two schemes, direct I/O or by 1711 * caching in the filesystem page cache. We cache the input file by 1712 * default and use direct I/O only if sendfile_max_size is set 1713 * appropriately as explained below. Note that this logic is consistent 1714 * with other filesystems where caching is turned on by default 1715 * unless explicitly turned off by using the DIRECTIO ioctl. 1716 * 1717 * We choose a slightly different scheme here. One can turn off 1718 * caching by setting sendfile_max_size to 0. 
 * One can also enable
 * caching of files <= sendfile_max_size by setting sendfile_max_size
 * to an appropriate value. By default sendfile_max_size is set to the
 * maximum value so that all files are cached. In future, we may provide
 * better interfaces for caching the file.
 *
 * Sendfile through Direct I/O (Zero copy)
 * --------------------------------------
 *
 * As disks are normally slower than the network, we can't have a
 * single thread that reads the disk and writes to the network. We
 * need to have parallelism. This is done by having the sendfile
 * thread create another thread that reads from the filesystem
 * and queues it for network processing. In this scheme, the data
 * is never copied anywhere, i.e. it is zero copy, unlike the other
 * scheme.
 *
 * We have a sendfile queue (snfq) where each sendfile
 * request (snf_req_t) is queued for processing by a thread. The number
 * of threads is dynamically allocated and they exit if they are idle
 * beyond a specified amount of time. When each request (snf_req_t) is
 * processed by a thread, it produces a number of mblk_t structures to
 * be consumed by the sendfile thread. snf_deque and snf_enque are
 * used for consuming and producing mblks. The size of the filesystem
 * read is determined by the tuneable (sendfile_read_size). A single
 * mblk holds sendfile_read_size worth of data (except the last
 * read of the file) which is sent down as a whole to the network.
 * sendfile_read_size is set to 1 MB as this seems to be the optimal
 * value for the UFS filesystem backed by a striped storage array.
 *
 * Synchronisation between read (producer) and write (consumer) threads.
 * --------------------------------------------------------------------
 *
 * sr_lock protects sr_mp_head and sr_mp_tail. The lock is held while
 * adding and deleting items in this list.
 * An error can happen at any time
 * during read or write. There could be unprocessed mblks in the
 * sr_mp_XXX list when a read or write error occurs. Whenever an error
 * is encountered, we need two things to happen :
 *
 * a) One of the threads needs to clean up the mblks.
 * b) When one thread encounters an error, the other should stop.
 *
 * For (a), we don't want to penalise the reader thread as it could do
 * some useful work processing other requests. For (b), the error can
 * be detected by examining sr_read_error or sr_write_error.
 * sr_lock protects sr_read_error and sr_write_error. If both reader and
 * writer encounter an error, we need to report the write error back to
 * the application as that's what would have happened if the operations
 * were done sequentially. With this in mind, the following should work :
 *
 *	- Check for errors before read or write.
 *	- If the reader encounters an error, set the error in sr_read_error.
 *	  Check sr_write_error; if it is set, send cv_signal as the writer
 *	  is waiting for the reader to complete. If it is not set, the
 *	  writer is either running, sinking data to the network, or blocked
 *	  because of flow control. For handling the latter case, we
 *	  always send a signal. In any case, it will examine sr_read_error
 *	  and return. sr_read_error is marked with SR_READ_DONE to tell
 *	  the writer that the reader is done in all the cases.
 *	- If the writer encounters an error, set the error in sr_write_error.
 *	  The reader thread is either blocked because of flow control or
 *	  running, reading data from the disk. For the former, we need to
 *	  wake up the thread. Again to keep it simple, we always wake up
 *	  the reader thread. Then, wait for the read thread to complete
 *	  if it is not done yet. Clean up and return.
 *
 * High and low water marks for the read thread.
 * --------------------------------------------
 *
 * If sendfile() is used to send data over a slow network, we need to
 * make sure that the read thread does not produce data at a faster
 * rate than the network. This can happen if the disk is faster than
 * the network. In such a case, we don't want to build a very large queue.
 * But we would still like to get all of the network throughput possible.
 * This implies that the network should never block waiting for data.
 * As there are a lot of disk throughput/network throughput combinations
 * possible, it is difficult to come up with an accurate number.
 * A typical 10K RPM disk has a max seek latency of 17ms and a rotational
 * latency of 3ms for reading a disk block. Thus, the total latency to
 * initiate a new read, transfer data from the disk and queue for
 * transmission would take about a max of 25ms. Today's max transfer rate
 * for network is 100MB/sec. If the thread is blocked because of flow
 * control, it would take 25ms to get new data ready for transmission.
 * We have to make sure that the network is not idling while we are
 * initiating new transfers. So, at 100MB/sec, to keep the network busy we
 * would need 2.5MB of data. Rounding off, we keep the low water mark to be
 * 3MB of data. We need to pick a high water mark so that the woken up
 * thread would do considerable work before blocking again to prevent
 * thrashing. Currently, we pick this to be 10 times that of the low water
 * mark.
 *
 * Sendfile with segmap caching (One copy from page cache to mblks).
 * ----------------------------------------------------------------
 *
 * We use the segmap cache for caching the file, if the size of file
 * is <= sendfile_max_size. In this case we don't use threads as VM
 * is reasonably fast enough to keep up with the network.
If the underlying 1814 * transport allows, we call segmap_getmapflt() to map MAXBSIZE (8K) worth 1815 * of data into segmap space, and use the virtual address from segmap 1816 * directly through desballoc() to avoid copy. Once the transport is done 1817 * with the data, the mapping will be released through segmap_release() 1818 * called by the call-back routine. 1819 * 1820 * If zero-copy is not allowed by the transport, we simply call VOP_READ() 1821 * to copy the data from the filesystem into our temporary network buffer. 1822 * 1823 * To disable caching, set sendfile_max_size to 0. 1824 */ 1825 1826 uint_t sendfile_read_size = 1024 * 1024; 1827 #define SENDFILE_REQ_LOWAT 3 * 1024 * 1024 1828 uint_t sendfile_req_lowat = SENDFILE_REQ_LOWAT; 1829 uint_t sendfile_req_hiwat = 10 * SENDFILE_REQ_LOWAT; 1830 struct sendfile_stats sf_stats; 1831 struct sendfile_queue *snfq; 1832 clock_t snfq_timeout; 1833 off64_t sendfile_max_size; 1834 1835 static void snf_enque(snf_req_t *, mblk_t *); 1836 static mblk_t *snf_deque(snf_req_t *); 1837 1838 void 1839 sendfile_init(void) 1840 { 1841 snfq = kmem_zalloc(sizeof (struct sendfile_queue), KM_SLEEP); 1842 1843 mutex_init(&snfq->snfq_lock, NULL, MUTEX_DEFAULT, NULL); 1844 cv_init(&snfq->snfq_cv, NULL, CV_DEFAULT, NULL); 1845 snfq->snfq_max_threads = max_ncpus; 1846 snfq_timeout = SNFQ_TIMEOUT; 1847 /* Cache all files by default. */ 1848 sendfile_max_size = MAXOFFSET_T; 1849 } 1850 1851 /* 1852 * Queues a mblk_t for network processing. 
1853 */ 1854 static void 1855 snf_enque(snf_req_t *sr, mblk_t *mp) 1856 { 1857 mp->b_next = NULL; 1858 mutex_enter(&sr->sr_lock); 1859 if (sr->sr_mp_head == NULL) { 1860 sr->sr_mp_head = sr->sr_mp_tail = mp; 1861 cv_signal(&sr->sr_cv); 1862 } else { 1863 sr->sr_mp_tail->b_next = mp; 1864 sr->sr_mp_tail = mp; 1865 } 1866 sr->sr_qlen += MBLKL(mp); 1867 while ((sr->sr_qlen > sr->sr_hiwat) && 1868 (sr->sr_write_error == 0)) { 1869 sf_stats.ss_full_waits++; 1870 cv_wait(&sr->sr_cv, &sr->sr_lock); 1871 } 1872 mutex_exit(&sr->sr_lock); 1873 } 1874 1875 /* 1876 * De-queues a mblk_t for network processing. 1877 */ 1878 static mblk_t * 1879 snf_deque(snf_req_t *sr) 1880 { 1881 mblk_t *mp; 1882 1883 mutex_enter(&sr->sr_lock); 1884 /* 1885 * If we have encountered an error on read or read is 1886 * completed and no more mblks, return NULL. 1887 * We need to check for NULL sr_mp_head also as 1888 * the reads could have completed and there is 1889 * nothing more to come. 1890 */ 1891 if (((sr->sr_read_error & ~SR_READ_DONE) != 0) || 1892 ((sr->sr_read_error & SR_READ_DONE) && 1893 sr->sr_mp_head == NULL)) { 1894 mutex_exit(&sr->sr_lock); 1895 return (NULL); 1896 } 1897 /* 1898 * To start with neither SR_READ_DONE is marked nor 1899 * the error is set. When we wake up from cv_wait, 1900 * following are the possibilities : 1901 * 1902 * a) sr_read_error is zero and mblks are queued. 1903 * b) sr_read_error is set to SR_READ_DONE 1904 * and mblks are queued. 1905 * c) sr_read_error is set to SR_READ_DONE 1906 * and no mblks. 1907 * d) sr_read_error is set to some error other 1908 * than SR_READ_DONE. 1909 */ 1910 1911 while ((sr->sr_read_error == 0) && (sr->sr_mp_head == NULL)) { 1912 sf_stats.ss_empty_waits++; 1913 cv_wait(&sr->sr_cv, &sr->sr_lock); 1914 } 1915 /* Handle (a) and (b) first - the normal case. 
*/ 1916 if (((sr->sr_read_error & ~SR_READ_DONE) == 0) && 1917 (sr->sr_mp_head != NULL)) { 1918 mp = sr->sr_mp_head; 1919 sr->sr_mp_head = mp->b_next; 1920 sr->sr_qlen -= MBLKL(mp); 1921 if (sr->sr_qlen < sr->sr_lowat) 1922 cv_signal(&sr->sr_cv); 1923 mutex_exit(&sr->sr_lock); 1924 mp->b_next = NULL; 1925 return (mp); 1926 } 1927 /* Handle (c) and (d). */ 1928 mutex_exit(&sr->sr_lock); 1929 return (NULL); 1930 } 1931 1932 /* 1933 * Reads data from the filesystem and queues it for network processing. 1934 */ 1935 void 1936 snf_async_read(snf_req_t *sr) 1937 { 1938 size_t iosize; 1939 u_offset_t fileoff; 1940 u_offset_t size; 1941 int ret_size; 1942 int error; 1943 file_t *fp; 1944 mblk_t *mp; 1945 1946 fp = sr->sr_fp; 1947 size = sr->sr_file_size; 1948 fileoff = sr->sr_file_off; 1949 1950 /* 1951 * Ignore the error for filesystems that doesn't support DIRECTIO. 1952 */ 1953 (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON, 0, 1954 kcred, NULL); 1955 1956 while ((size != 0) && (sr->sr_write_error == 0)) { 1957 1958 iosize = (int)MIN(sr->sr_maxpsz, size); 1959 1960 if ((mp = allocb(iosize, BPRI_MED)) == NULL) { 1961 error = EAGAIN; 1962 break; 1963 } 1964 ret_size = soreadfile(fp, mp->b_rptr, fileoff, &error, iosize); 1965 1966 /* Error or Reached EOF ? 
*/ 1967 if ((error != 0) || (ret_size == 0)) { 1968 freeb(mp); 1969 break; 1970 } 1971 mp->b_wptr = mp->b_rptr + ret_size; 1972 1973 snf_enque(sr, mp); 1974 size -= ret_size; 1975 fileoff += ret_size; 1976 } 1977 (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_OFF, 0, 1978 kcred, NULL); 1979 mutex_enter(&sr->sr_lock); 1980 sr->sr_read_error = error; 1981 sr->sr_read_error |= SR_READ_DONE; 1982 cv_signal(&sr->sr_cv); 1983 mutex_exit(&sr->sr_lock); 1984 } 1985 1986 void 1987 snf_async_thread(void) 1988 { 1989 snf_req_t *sr; 1990 callb_cpr_t cprinfo; 1991 clock_t time_left = 1; 1992 clock_t now; 1993 1994 CALLB_CPR_INIT(&cprinfo, &snfq->snfq_lock, callb_generic_cpr, "snfq"); 1995 1996 mutex_enter(&snfq->snfq_lock); 1997 for (;;) { 1998 /* 1999 * If we didn't find a entry, then block until woken up 2000 * again and then look through the queues again. 2001 */ 2002 while ((sr = snfq->snfq_req_head) == NULL) { 2003 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2004 if (time_left <= 0) { 2005 snfq->snfq_svc_threads--; 2006 CALLB_CPR_EXIT(&cprinfo); 2007 thread_exit(); 2008 /* NOTREACHED */ 2009 } 2010 snfq->snfq_idle_cnt++; 2011 2012 time_to_wait(&now, snfq_timeout); 2013 time_left = cv_timedwait(&snfq->snfq_cv, 2014 &snfq->snfq_lock, now); 2015 snfq->snfq_idle_cnt--; 2016 2017 CALLB_CPR_SAFE_END(&cprinfo, &snfq->snfq_lock); 2018 } 2019 snfq->snfq_req_head = sr->sr_next; 2020 snfq->snfq_req_cnt--; 2021 mutex_exit(&snfq->snfq_lock); 2022 snf_async_read(sr); 2023 mutex_enter(&snfq->snfq_lock); 2024 } 2025 } 2026 2027 2028 snf_req_t * 2029 create_thread(int operation, struct vnode *vp, file_t *fp, 2030 u_offset_t fileoff, u_offset_t size) 2031 { 2032 snf_req_t *sr; 2033 stdata_t *stp; 2034 2035 sr = (snf_req_t *)kmem_zalloc(sizeof (snf_req_t), KM_SLEEP); 2036 2037 sr->sr_vp = vp; 2038 sr->sr_fp = fp; 2039 stp = vp->v_stream; 2040 2041 /* 2042 * store sd_qn_maxpsz into sr_maxpsz while we have stream head. 2043 * stream might be closed before thread returns from snf_async_read. 
2044 */ 2045 if (stp->sd_qn_maxpsz > 0) { 2046 sr->sr_maxpsz = MIN(MAXBSIZE, stp->sd_qn_maxpsz); 2047 } else { 2048 sr->sr_maxpsz = MAXBSIZE; 2049 } 2050 2051 sr->sr_operation = operation; 2052 sr->sr_file_off = fileoff; 2053 sr->sr_file_size = size; 2054 sr->sr_hiwat = sendfile_req_hiwat; 2055 sr->sr_lowat = sendfile_req_lowat; 2056 mutex_init(&sr->sr_lock, NULL, MUTEX_DEFAULT, NULL); 2057 cv_init(&sr->sr_cv, NULL, CV_DEFAULT, NULL); 2058 /* 2059 * See whether we need another thread for servicing this 2060 * request. If there are already enough requests queued 2061 * for the threads, create one if not exceeding 2062 * snfq_max_threads. 2063 */ 2064 mutex_enter(&snfq->snfq_lock); 2065 if (snfq->snfq_req_cnt >= snfq->snfq_idle_cnt && 2066 snfq->snfq_svc_threads < snfq->snfq_max_threads) { 2067 (void) thread_create(NULL, 0, &snf_async_thread, 0, 0, &p0, 2068 TS_RUN, minclsyspri); 2069 snfq->snfq_svc_threads++; 2070 } 2071 if (snfq->snfq_req_head == NULL) { 2072 snfq->snfq_req_head = snfq->snfq_req_tail = sr; 2073 cv_signal(&snfq->snfq_cv); 2074 } else { 2075 snfq->snfq_req_tail->sr_next = sr; 2076 snfq->snfq_req_tail = sr; 2077 } 2078 snfq->snfq_req_cnt++; 2079 mutex_exit(&snfq->snfq_lock); 2080 return (sr); 2081 } 2082 2083 int 2084 snf_direct_io(file_t *fp, file_t *rfp, u_offset_t fileoff, u_offset_t size, 2085 ssize_t *count) 2086 { 2087 snf_req_t *sr; 2088 mblk_t *mp; 2089 int iosize; 2090 int error = 0; 2091 short fflag; 2092 struct vnode *vp; 2093 int ksize; 2094 2095 ksize = 0; 2096 *count = 0; 2097 2098 vp = fp->f_vnode; 2099 fflag = fp->f_flag; 2100 if ((sr = create_thread(READ_OP, vp, rfp, fileoff, size)) == NULL) 2101 return (EAGAIN); 2102 2103 /* 2104 * We check for read error in snf_deque. It has to check 2105 * for successful READ_DONE and return NULL, and we might 2106 * as well make an additional check there. 
2107 */ 2108 while ((mp = snf_deque(sr)) != NULL) { 2109 2110 if (ISSIG(curthread, JUSTLOOKING)) { 2111 freeb(mp); 2112 error = EINTR; 2113 break; 2114 } 2115 iosize = MBLKL(mp); 2116 2117 if ((error = kstrwritemp(vp, mp, fflag)) != 0) { 2118 freeb(mp); 2119 break; 2120 } 2121 ksize += iosize; 2122 } 2123 *count = ksize; 2124 2125 mutex_enter(&sr->sr_lock); 2126 sr->sr_write_error = error; 2127 /* Look at the big comments on why we cv_signal here. */ 2128 cv_signal(&sr->sr_cv); 2129 2130 /* Wait for the reader to complete always. */ 2131 while (!(sr->sr_read_error & SR_READ_DONE)) { 2132 cv_wait(&sr->sr_cv, &sr->sr_lock); 2133 } 2134 /* If there is no write error, check for read error. */ 2135 if (error == 0) 2136 error = (sr->sr_read_error & ~SR_READ_DONE); 2137 2138 if (error != 0) { 2139 mblk_t *next_mp; 2140 2141 mp = sr->sr_mp_head; 2142 while (mp != NULL) { 2143 next_mp = mp->b_next; 2144 mp->b_next = NULL; 2145 freeb(mp); 2146 mp = next_mp; 2147 } 2148 } 2149 mutex_exit(&sr->sr_lock); 2150 kmem_free(sr, sizeof (snf_req_t)); 2151 return (error); 2152 } 2153 2154 typedef struct { 2155 frtn_t snfi_frtn; 2156 caddr_t snfi_base; 2157 uint_t snfi_mapoff; 2158 size_t snfi_len; 2159 vnode_t *snfi_vp; 2160 } snf_smap_desbinfo; 2161 2162 /* 2163 * The callback function when the last ref of the mblk is dropped, 2164 * normally occurs when TCP receives the ack. But it can be the driver 2165 * too due to lazy reclaim. 2166 */ 2167 void 2168 snf_smap_desbfree(snf_smap_desbinfo *snfi) 2169 { 2170 if (!segmap_kpm) { 2171 /* 2172 * We don't need to call segmap_fault(F_SOFTUNLOCK) for 2173 * segmap_kpm as long as the latter never falls back to 2174 * "use_segmap_range". (See segmap_getmapflt().) 
2175 * 2176 * Using S_OTHER saves an redundant hat_setref() in 2177 * segmap_unlock() 2178 */ 2179 (void) segmap_fault(kas.a_hat, segkmap, 2180 (caddr_t)(uintptr_t)(((uintptr_t)snfi->snfi_base + 2181 snfi->snfi_mapoff) & PAGEMASK), snfi->snfi_len, 2182 F_SOFTUNLOCK, S_OTHER); 2183 } 2184 (void) segmap_release(segkmap, snfi->snfi_base, SM_DONTNEED); 2185 VN_RELE(snfi->snfi_vp); 2186 kmem_free(snfi, sizeof (*snfi)); 2187 } 2188 2189 /* 2190 * Use segmap instead of bcopy to send down a chain of desballoca'ed, mblks. 2191 * Each mblk contains a segmap slot of no more than MAXBSIZE. The total 2192 * length of a chain is no more than sd_qn_maxpsz. 2193 * 2194 * At the end of the whole sendfile() operation, we wait till the data from 2195 * the last mblk is ack'ed by the transport before returning so that the 2196 * caller of sendfile() can safely modify the file content. 2197 */ 2198 int 2199 snf_segmap(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size, 2200 uint_t maxpsz, ssize_t *count, boolean_t nowait) 2201 { 2202 caddr_t base; 2203 int mapoff; 2204 vnode_t *vp; 2205 mblk_t *mp, *mp1; 2206 int iosize, iosize1; 2207 int error; 2208 short fflag; 2209 int ksize; 2210 snf_smap_desbinfo *snfi; 2211 struct vattr va; 2212 boolean_t dowait = B_FALSE; 2213 2214 vp = fp->f_vnode; 2215 fflag = fp->f_flag; 2216 ksize = 0; 2217 for (;;) { 2218 if (ISSIG(curthread, JUSTLOOKING)) { 2219 error = EINTR; 2220 break; 2221 } 2222 iosize = 0; 2223 mp = NULL; 2224 do { 2225 mapoff = fileoff & MAXBOFFSET; 2226 iosize1 = MAXBSIZE - mapoff; 2227 if (iosize1 > size) 2228 iosize1 = size; 2229 /* 2230 * we don't forcefault because we'll call 2231 * segmap_fault(F_SOFTLOCK) next. 2232 * 2233 * S_READ will get the ref bit set (by either 2234 * segmap_getmapflt() or segmap_fault()) and page 2235 * shared locked. 2236 */ 2237 base = segmap_getmapflt(segkmap, fvp, fileoff, iosize1, 2238 segmap_kpm ? 
SM_FAULT : 0, S_READ); 2239 2240 snfi = kmem_alloc(sizeof (*snfi), KM_SLEEP); 2241 snfi->snfi_len = (size_t)roundup(mapoff+iosize1, 2242 PAGESIZE)- (mapoff & PAGEMASK); 2243 /* 2244 * We must call segmap_fault() even for segmap_kpm 2245 * because that's how error gets returned. 2246 * (segmap_getmapflt() never fails but segmap_fault() 2247 * does.) 2248 */ 2249 if (segmap_fault(kas.a_hat, segkmap, 2250 (caddr_t)(uintptr_t)(((uintptr_t)base + mapoff) & 2251 PAGEMASK), snfi->snfi_len, F_SOFTLOCK, 2252 S_READ) != 0) { 2253 (void) segmap_release(segkmap, base, 0); 2254 kmem_free(snfi, sizeof (*snfi)); 2255 freemsg(mp); 2256 error = EIO; 2257 goto out; 2258 } 2259 snfi->snfi_frtn.free_func = snf_smap_desbfree; 2260 snfi->snfi_frtn.free_arg = (caddr_t)snfi; 2261 snfi->snfi_base = base; 2262 snfi->snfi_mapoff = mapoff; 2263 mp1 = desballoca((uchar_t *)base + mapoff, 2264 iosize1, BPRI_HI, &snfi->snfi_frtn); 2265 2266 if (mp1 == NULL) { 2267 (void) segmap_fault(kas.a_hat, segkmap, 2268 (caddr_t)(uintptr_t)(((uintptr_t)base + 2269 mapoff) & PAGEMASK), snfi->snfi_len, 2270 F_SOFTUNLOCK, S_OTHER); 2271 (void) segmap_release(segkmap, base, 0); 2272 kmem_free(snfi, sizeof (*snfi)); 2273 freemsg(mp); 2274 error = EAGAIN; 2275 goto out; 2276 } 2277 VN_HOLD(fvp); 2278 snfi->snfi_vp = fvp; 2279 mp1->b_wptr += iosize1; 2280 2281 /* Mark this dblk with the zero-copy flag */ 2282 mp1->b_datap->db_struioflag |= STRUIO_ZC; 2283 if (mp == NULL) 2284 mp = mp1; 2285 else 2286 linkb(mp, mp1); 2287 iosize += iosize1; 2288 fileoff += iosize1; 2289 size -= iosize1; 2290 } while (iosize < maxpsz && size != 0); 2291 2292 if (size == 0 && !nowait) { 2293 ASSERT(!dowait); 2294 dowait = B_TRUE; 2295 mp1->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; 2296 } 2297 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL); 2298 if ((error = kstrwritemp(vp, mp, fflag)) != 0) { 2299 *count = ksize; 2300 freemsg(mp); 2301 return (error); 2302 } 2303 ksize += iosize; 2304 if (size == 0) 2305 goto done; 2306 2307 (void) 
VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL); 2308 va.va_mask = AT_SIZE; 2309 error = VOP_GETATTR(fvp, &va, 0, kcred); 2310 if (error) 2311 break; 2312 /* Read as much as possible. */ 2313 if (fileoff >= va.va_size) 2314 break; 2315 if (size + fileoff > va.va_size) 2316 size = va.va_size - fileoff; 2317 } 2318 out: 2319 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL); 2320 done: 2321 *count = ksize; 2322 if (dowait) { 2323 stdata_t *stp; 2324 2325 stp = vp->v_stream; 2326 mutex_enter(&stp->sd_lock); 2327 while (!(stp->sd_flag & STZCNOTIFY)) { 2328 (void) cv_wait_sig(&stp->sd_zcopy_wait, 2329 &stp->sd_lock); 2330 } 2331 stp->sd_flag &= ~STZCNOTIFY; 2332 mutex_exit(&stp->sd_lock); 2333 } 2334 return (error); 2335 } 2336 2337 int 2338 snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size, 2339 uint_t maxpsz, ssize_t *count) 2340 { 2341 struct vnode *vp; 2342 mblk_t *mp; 2343 int iosize; 2344 int error; 2345 short fflag; 2346 int ksize; 2347 int ioflag; 2348 struct uio auio; 2349 struct iovec aiov; 2350 struct vattr va; 2351 2352 vp = fp->f_vnode; 2353 fflag = fp->f_flag; 2354 ksize = 0; 2355 auio.uio_iov = &aiov; 2356 auio.uio_iovcnt = 1; 2357 auio.uio_segflg = UIO_SYSSPACE; 2358 auio.uio_llimit = MAXOFFSET_T; 2359 auio.uio_fmode = fflag; 2360 auio.uio_extflg = UIO_COPY_CACHED; 2361 ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC); 2362 /* If read sync is not asked for, filter sync flags */ 2363 if ((ioflag & FRSYNC) == 0) 2364 ioflag &= ~(FSYNC|FDSYNC); 2365 for (;;) { 2366 if (ISSIG(curthread, JUSTLOOKING)) { 2367 error = EINTR; 2368 break; 2369 } 2370 iosize = (int)MIN(maxpsz, size); 2371 if ((mp = allocb(iosize, BPRI_MED)) == NULL) { 2372 error = EAGAIN; 2373 break; 2374 } 2375 aiov.iov_base = (caddr_t)mp->b_rptr; 2376 aiov.iov_len = iosize; 2377 auio.uio_loffset = fileoff; 2378 auio.uio_resid = iosize; 2379 2380 error = VOP_READ(fvp, &auio, ioflag, fp->f_cred, NULL); 2381 iosize -= auio.uio_resid; 2382 2383 if (error == EINTR && iosize != 0) 2384 error = 
0; 2385 2386 if (error != 0 || iosize == 0) { 2387 freeb(mp); 2388 break; 2389 } 2390 mp->b_wptr = mp->b_rptr + iosize; 2391 2392 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL); 2393 if ((error = kstrwritemp(vp, mp, fflag)) != 0) { 2394 *count = ksize; 2395 freeb(mp); 2396 return (error); 2397 } 2398 ksize += iosize; 2399 size -= iosize; 2400 if (size == 0) 2401 goto done; 2402 2403 fileoff += iosize; 2404 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL); 2405 va.va_mask = AT_SIZE; 2406 error = VOP_GETATTR(fvp, &va, 0, kcred); 2407 if (error) 2408 break; 2409 /* Read as much as possible. */ 2410 if (fileoff >= va.va_size) 2411 size = 0; 2412 else if (size + fileoff > va.va_size) 2413 size = va.va_size - fileoff; 2414 } 2415 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL); 2416 done: 2417 *count = ksize; 2418 return (error); 2419 } 2420 2421 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 2422 /* 2423 * Largefile support for 32 bit applications only. 2424 */ 2425 int 2426 sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv, 2427 ssize32_t *count32) 2428 { 2429 ssize32_t sfv_len; 2430 u_offset_t sfv_off, va_size; 2431 struct vnode *vp, *fvp, *realvp; 2432 struct vattr va; 2433 stdata_t *stp; 2434 ssize_t count = 0; 2435 int error = 0; 2436 boolean_t dozcopy = B_FALSE; 2437 uint_t maxpsz; 2438 2439 sfv_len = (ssize32_t)sfv->sfv_len; 2440 if (sfv_len < 0) { 2441 error = EINVAL; 2442 goto out; 2443 } 2444 2445 if (sfv_len == 0) goto out; 2446 2447 sfv_off = (u_offset_t)sfv->sfv_off; 2448 2449 /* Same checks as in pread */ 2450 if (sfv_off > MAXOFFSET_T) { 2451 error = EINVAL; 2452 goto out; 2453 } 2454 if (sfv_off + sfv_len > MAXOFFSET_T) 2455 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off); 2456 2457 /* 2458 * There are no more checks on sfv_len. So, we cast it to 2459 * u_offset_t and share the snf_direct_io/snf_cache code between 2460 * 32 bit and 64 bit. 2461 * 2462 * TODO: should do nbl_need_check() like read()? 
2463 */ 2464 if (sfv_len > sendfile_max_size) { 2465 sf_stats.ss_file_not_cached++; 2466 error = snf_direct_io(fp, rfp, sfv_off, (u_offset_t)sfv_len, 2467 &count); 2468 goto out; 2469 } 2470 fvp = rfp->f_vnode; 2471 if (VOP_REALVP(fvp, &realvp) == 0) 2472 fvp = realvp; 2473 /* 2474 * Grab the lock as a reader to prevent the file size 2475 * from changing underneath. 2476 */ 2477 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL); 2478 va.va_mask = AT_SIZE; 2479 error = VOP_GETATTR(fvp, &va, 0, kcred); 2480 va_size = va.va_size; 2481 if ((error != 0) || (va_size == 0) || (sfv_off >= va_size)) { 2482 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL); 2483 goto out; 2484 } 2485 /* Read as much as possible. */ 2486 if (sfv_off + sfv_len > va_size) 2487 sfv_len = va_size - sfv_off; 2488 2489 vp = fp->f_vnode; 2490 stp = vp->v_stream; 2491 if (stp->sd_qn_maxpsz == INFPSZ) 2492 maxpsz = MAXOFF32_T; 2493 else 2494 maxpsz = roundup(stp->sd_qn_maxpsz, MAXBSIZE); 2495 /* 2496 * When the NOWAIT flag is not set, we enable zero-copy only if the 2497 * transfer size is large enough. This prevents performance loss 2498 * when the caller sends the file piece by piece. 
2499 */ 2500 if (sfv_len >= MAXBSIZE && (sfv_len >= (va_size >> 1) || 2501 (sfv->sfv_flag & SFV_NOWAIT) || sfv_len >= 0x1000000) && 2502 !vn_has_flocks(fvp)) { 2503 if ((stp->sd_copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) { 2504 int on = 1; 2505 2506 if (SOP_SETSOCKOPT(VTOSO(vp), SOL_SOCKET, 2507 SO_SND_COPYAVOID, &on, sizeof (on)) == 0) 2508 dozcopy = B_TRUE; 2509 } else { 2510 dozcopy = (stp->sd_copyflag & STZCVMSAFE); 2511 } 2512 } 2513 if (dozcopy) { 2514 sf_stats.ss_file_segmap++; 2515 error = snf_segmap(fp, fvp, sfv_off, (u_offset_t)sfv_len, 2516 maxpsz, &count, ((sfv->sfv_flag & SFV_NOWAIT) != 0)); 2517 } else { 2518 sf_stats.ss_file_cached++; 2519 error = snf_cache(fp, fvp, sfv_off, (u_offset_t)sfv_len, 2520 maxpsz, &count); 2521 } 2522 out: 2523 releasef(sfv->sfv_fd); 2524 *count32 = (ssize32_t)count; 2525 return (error); 2526 } 2527 #endif 2528 2529 #ifdef _SYSCALL32_IMPL 2530 /* 2531 * recv32(), recvfrom32(), send32(), sendto32(): intentionally return a 2532 * ssize_t rather than ssize32_t; see the comments above read32 for details. 
2533 */ 2534 2535 ssize_t 2536 recv32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags) 2537 { 2538 return (recv(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags)); 2539 } 2540 2541 ssize_t 2542 recvfrom32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags, 2543 caddr32_t name, caddr32_t namelenp) 2544 { 2545 return (recvfrom(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags, 2546 (void *)(uintptr_t)name, (void *)(uintptr_t)namelenp)); 2547 } 2548 2549 ssize_t 2550 send32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags) 2551 { 2552 return (send(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags)); 2553 } 2554 2555 ssize_t 2556 sendto32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags, 2557 caddr32_t name, socklen_t namelen) 2558 { 2559 return (sendto(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags, 2560 (void *)(uintptr_t)name, namelen)); 2561 } 2562 #endif /* _SYSCALL32_IMPL */ 2563 2564 /* 2565 * Function wrappers (mostly arround the sonode switch) for 2566 * backward compatibility. 
2567 */ 2568 2569 int 2570 soaccept(struct sonode *so, int fflag, struct sonode **nsop) 2571 { 2572 return (SOP_ACCEPT(so, fflag, nsop)); 2573 } 2574 2575 int 2576 sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen, 2577 int backlog, int flags) 2578 { 2579 int error; 2580 2581 error = SOP_BIND(so, name, namelen, flags); 2582 if (error == 0 && backlog != 0) 2583 return (SOP_LISTEN(so, backlog)); 2584 2585 return (error); 2586 } 2587 2588 int 2589 solisten(struct sonode *so, int backlog) 2590 { 2591 return (SOP_LISTEN(so, backlog)); 2592 } 2593 2594 int 2595 soconnect(struct sonode *so, const struct sockaddr *name, socklen_t namelen, 2596 int fflag, int flags) 2597 { 2598 return (SOP_CONNECT(so, name, namelen, fflag, flags)); 2599 } 2600 2601 int 2602 sorecvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) 2603 { 2604 return (SOP_RECVMSG(so, msg, uiop)); 2605 } 2606 2607 int 2608 sosendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) 2609 { 2610 return (SOP_SENDMSG(so, msg, uiop)); 2611 } 2612 2613 int 2614 sogetpeername(struct sonode *so) 2615 { 2616 return (SOP_GETPEERNAME(so)); 2617 } 2618 2619 int 2620 sogetsockname(struct sonode *so) 2621 { 2622 return (SOP_GETSOCKNAME(so)); 2623 } 2624 2625 int 2626 soshutdown(struct sonode *so, int how) 2627 { 2628 return (SOP_SHUTDOWN(so, how)); 2629 } 2630 2631 int 2632 sogetsockopt(struct sonode *so, int level, int option_name, void *optval, 2633 socklen_t *optlenp, int flags) 2634 { 2635 return (SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, 2636 flags)); 2637 } 2638 2639 int 2640 sosetsockopt(struct sonode *so, int level, int option_name, const void *optval, 2641 t_uscalar_t optlen) 2642 { 2643 return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen)); 2644 } 2645 2646 /* 2647 * Because this is backward compatibility interface it only needs to be 2648 * able to handle the creation of TPI sockfs sockets. 
2649 */ 2650 struct sonode * 2651 socreate(vnode_t *accessvp, int domain, int type, int protocol, int version, 2652 struct sonode *tso, int *errorp) 2653 { 2654 return (sotpi_create(accessvp, domain, type, protocol, version, tso, 2655 errorp)); 2656 } 2657