1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 25 * Copyright 2015, Joyent, Inc. All rights reserved. 26 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. 
27 * Copyright 2022 Garrett D'Amore 28 * Copyright 2024 Oxide Computer Company 29 */ 30 31 #include <sys/types.h> 32 #include <sys/t_lock.h> 33 #include <sys/param.h> 34 #include <sys/systm.h> 35 #include <sys/buf.h> 36 #include <sys/conf.h> 37 #include <sys/cred.h> 38 #include <sys/kmem.h> 39 #include <sys/sysmacros.h> 40 #include <sys/vfs.h> 41 #include <sys/vfs_opreg.h> 42 #include <sys/vnode.h> 43 #include <sys/debug.h> 44 #include <sys/errno.h> 45 #include <sys/time.h> 46 #include <sys/file.h> 47 #include <sys/open.h> 48 #include <sys/user.h> 49 #include <sys/termios.h> 50 #include <sys/stream.h> 51 #include <sys/strsubr.h> 52 #include <sys/strsun.h> 53 #include <sys/esunddi.h> 54 #include <sys/flock.h> 55 #include <sys/modctl.h> 56 #include <sys/cmn_err.h> 57 #include <sys/mkdev.h> 58 #include <sys/pathname.h> 59 #include <sys/ddi.h> 60 #include <sys/stat.h> 61 #include <sys/fs/snode.h> 62 #include <sys/fs/dv_node.h> 63 #include <fs/fs_subr.h> 64 #include <sys/zone.h> 65 66 #include <sys/socket.h> 67 #include <sys/socketvar.h> 68 #include <netinet/in.h> 69 #include <sys/un.h> 70 #include <sys/ucred.h> 71 72 #include <sys/tiuser.h> 73 #define _SUN_TPI_VERSION 2 74 #include <sys/tihdr.h> 75 76 #include <c2/audit.h> 77 78 #include <fs/sockfs/sockcommon.h> 79 #include <fs/sockfs/sockfilter_impl.h> 80 #include <fs/sockfs/socktpi.h> 81 #include <fs/sockfs/socktpi_impl.h> 82 #include <fs/sockfs/sodirect.h> 83 84 /* 85 * Macros that operate on struct cmsghdr. 86 * The CMSG_VALID macro does not assume that the last option buffer is padded. 
87 */ 88 #define CMSG_CONTENT(cmsg) (&((cmsg)[1])) 89 #define CMSG_CONTENTLEN(cmsg) ((cmsg)->cmsg_len - sizeof (struct cmsghdr)) 90 #define CMSG_VALID(cmsg, start, end) \ 91 (ISALIGNED_cmsghdr(cmsg) && \ 92 ((uintptr_t)(cmsg) >= (uintptr_t)(start)) && \ 93 ((uintptr_t)(cmsg) < (uintptr_t)(end)) && \ 94 ((ssize_t)(cmsg)->cmsg_len >= sizeof (struct cmsghdr)) && \ 95 ((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end))) 96 #define SO_LOCK_WAKEUP_TIME 3000 /* Wakeup time in milliseconds */ 97 98 dev_t sockdev; /* For fsid in getattr */ 99 100 struct socklist socklist; 101 102 struct kmem_cache *socket_cache; 103 104 /* 105 * This is a global vfs_t that we have to maintain as the solitary vfs_t that is 106 * used across all sockfs vnodes. This ensures that we have a reasonable vfs_t 107 * present that points to our ops vectors. 108 */ 109 vfs_t *sock_vfsp; 110 static struct vfsops *sockfs_vfsops; 111 112 /* 113 * sockconf_lock protects the socket configuration (socket types and 114 * socket filters) which is changed via the sockconfig system call. 115 */ 116 krwlock_t sockconf_lock; 117 118 static int sockfs_update(kstat_t *, int); 119 static int sockfs_snapshot(kstat_t *, void *, int); 120 extern smod_info_t *sotpi_smod_create(void); 121 122 extern void sendfile_init(); 123 124 extern int modrootloaded; 125 126 /* 127 * Translate from a device pathname (e.g. "/dev/tcp") to a vnode. 128 * Returns with the vnode held. 129 */ 130 int 131 sogetvp(char *devpath, vnode_t **vpp, int uioflag) 132 { 133 struct snode *csp; 134 vnode_t *vp, *dvp; 135 major_t maj; 136 int error; 137 138 ASSERT(uioflag == UIO_SYSSPACE || uioflag == UIO_USERSPACE); 139 140 /* 141 * Lookup the underlying filesystem vnode. 
142 */ 143 error = lookupname(devpath, uioflag, FOLLOW, NULLVPP, &vp); 144 if (error) 145 return (error); 146 147 /* Check that it is the correct vnode */ 148 if (vp->v_type != VCHR) { 149 VN_RELE(vp); 150 return (ENOTSOCK); 151 } 152 153 /* 154 * If devpath went through devfs, the device should already 155 * be configured. If devpath is a mknod file, however, we 156 * need to make sure the device is properly configured. 157 * To do this, we do something similar to spec_open() 158 * except that we resolve to the minor/leaf level since 159 * we need to return a vnode. 160 */ 161 csp = VTOS(VTOS(vp)->s_commonvp); 162 if (!(csp->s_flag & SDIPSET)) { 163 char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 164 error = ddi_dev_pathname(vp->v_rdev, S_IFCHR, pathname); 165 if (error == 0) 166 error = devfs_lookupname(pathname, NULLVPP, &dvp); 167 VN_RELE(vp); 168 kmem_free(pathname, MAXPATHLEN); 169 if (error != 0) 170 return (ENXIO); 171 vp = dvp; /* use the devfs vp */ 172 } 173 174 /* device is configured at this point */ 175 maj = getmajor(vp->v_rdev); 176 if (!STREAMSTAB(maj)) { 177 VN_RELE(vp); 178 return (ENOSTR); 179 } 180 181 *vpp = vp; 182 return (0); 183 } 184 185 /* 186 * Update the accessed, updated, or changed times in an sonode 187 * with the current time. 188 * 189 * Note that both SunOS 4.X and 4.4BSD sockets do not present reasonable 190 * attributes in a fstat call. (They return the current time and 0 for 191 * all timestamps, respectively.) We maintain the current timestamps 192 * here primarily so that should sockmod be popped the resulting 193 * file descriptor will behave like a stream w.r.t. the timestamps. 
194 */ 195 void 196 so_update_attrs(struct sonode *so, int flag) 197 { 198 time_t now = gethrestime_sec(); 199 200 if (SOCK_IS_NONSTR(so)) 201 return; 202 203 mutex_enter(&so->so_lock); 204 so->so_flag |= flag; 205 if (flag & SOACC) 206 SOTOTPI(so)->sti_atime = now; 207 if (flag & SOMOD) 208 SOTOTPI(so)->sti_mtime = now; 209 mutex_exit(&so->so_lock); 210 } 211 212 extern so_create_func_t sock_comm_create_function; 213 extern so_destroy_func_t sock_comm_destroy_function; 214 215 /* 216 * Init function called when sockfs is loaded. 217 */ 218 int 219 sockinit(int fstype, char *name) 220 { 221 static const fs_operation_def_t sock_vfsops_template[] = { 222 { VFSNAME_STATVFS, { .vfs_statvfs = sockfs_statvfs } }, 223 { NULL, NULL } 224 }; 225 int error; 226 major_t dev; 227 char *err_str; 228 229 error = vfs_setfsops(fstype, sock_vfsops_template, &sockfs_vfsops); 230 if (error != 0) { 231 zcmn_err(GLOBAL_ZONEID, CE_WARN, 232 "sockinit: bad vfs ops template"); 233 return (error); 234 } 235 236 error = vn_make_ops(name, socket_vnodeops_template, 237 &socket_vnodeops); 238 if (error != 0) { 239 err_str = "sockinit: bad socket vnode ops template"; 240 /* vn_make_ops() does not reset socktpi_vnodeops on failure. */ 241 socket_vnodeops = NULL; 242 goto failure; 243 } 244 245 socket_cache = kmem_cache_create("socket_cache", 246 sizeof (struct sonode), 0, sonode_constructor, 247 sonode_destructor, NULL, NULL, NULL, 0); 248 249 rw_init(&sockconf_lock, NULL, RW_DEFAULT, NULL); 250 251 error = socktpi_init(); 252 if (error != 0) { 253 err_str = NULL; 254 goto failure; 255 } 256 257 error = sod_init(); 258 if (error != 0) { 259 err_str = NULL; 260 goto failure; 261 } 262 263 /* 264 * Set up the default create and destroy functions 265 */ 266 sock_comm_create_function = socket_sonode_create; 267 sock_comm_destroy_function = socket_sonode_destroy; 268 269 /* 270 * Build initial list mapping socket parameters to vnode. 
271 */ 272 smod_init(); 273 smod_add(sotpi_smod_create()); 274 275 sockparams_init(); 276 277 /* 278 * If sockets are needed before init runs /sbin/soconfig 279 * it is possible to preload the sockparams list here using 280 * calls like: 281 * sockconfig(1,2,3, "/dev/tcp", 0); 282 */ 283 284 /* 285 * Create a unique dev_t for use in so_fsid. 286 */ 287 288 if ((dev = getudev()) == (major_t)-1) 289 dev = 0; 290 sockdev = makedevice(dev, 0); 291 292 mutex_init(&socklist.sl_lock, NULL, MUTEX_DEFAULT, NULL); 293 sendfile_init(); 294 295 /* Initialize socket filters */ 296 sof_init(); 297 298 sock_vfsp = fs_vfsp_global(sockfs_vfsops, sockdev, fstype, 299 PAGESIZE); 300 301 return (0); 302 303 failure: 304 (void) vfs_freevfsops_by_type(fstype); 305 if (socket_vnodeops != NULL) 306 vn_freevnodeops(socket_vnodeops); 307 if (err_str != NULL) 308 zcmn_err(GLOBAL_ZONEID, CE_WARN, err_str); 309 return (error); 310 } 311 312 /* 313 * Caller must hold the mutex. Used to set SOLOCKED. 314 */ 315 void 316 so_lock_single(struct sonode *so) 317 { 318 ASSERT(MUTEX_HELD(&so->so_lock)); 319 320 while (so->so_flag & (SOLOCKED | SOASYNC_UNBIND)) { 321 cv_wait_stop(&so->so_single_cv, &so->so_lock, 322 SO_LOCK_WAKEUP_TIME); 323 } 324 so->so_flag |= SOLOCKED; 325 } 326 327 /* 328 * Caller must hold the mutex and pass in SOLOCKED or SOASYNC_UNBIND. 329 * Used to clear SOLOCKED or SOASYNC_UNBIND. 330 */ 331 void 332 so_unlock_single(struct sonode *so, int flag) 333 { 334 ASSERT(MUTEX_HELD(&so->so_lock)); 335 ASSERT(flag & (SOLOCKED|SOASYNC_UNBIND)); 336 ASSERT((flag & ~(SOLOCKED|SOASYNC_UNBIND)) == 0); 337 ASSERT(so->so_flag & flag); 338 /* 339 * Process the T_DISCON_IND on sti_discon_ind_mp. 340 * 341 * Call to so_drain_discon_ind will result in so_lock 342 * being dropped and re-acquired later. 
343 */ 344 if (!SOCK_IS_NONSTR(so)) { 345 sotpi_info_t *sti = SOTOTPI(so); 346 347 if (sti->sti_discon_ind_mp != NULL) 348 so_drain_discon_ind(so); 349 } 350 351 cv_signal(&so->so_single_cv); 352 so->so_flag &= ~flag; 353 } 354 355 /* 356 * Caller must hold the mutex. Used to set SOREADLOCKED. 357 * If the caller wants nonblocking behavior it should set fmode. 358 */ 359 int 360 so_lock_read(struct sonode *so, int fmode) 361 { 362 ASSERT(MUTEX_HELD(&so->so_lock)); 363 364 while (so->so_flag & SOREADLOCKED) { 365 if (fmode & (FNDELAY|FNONBLOCK)) 366 return (EWOULDBLOCK); 367 cv_wait_stop(&so->so_read_cv, &so->so_lock, 368 SO_LOCK_WAKEUP_TIME); 369 } 370 so->so_flag |= SOREADLOCKED; 371 return (0); 372 } 373 374 /* 375 * Like so_lock_read above but allows signals. 376 */ 377 int 378 so_lock_read_intr(struct sonode *so, int fmode) 379 { 380 ASSERT(MUTEX_HELD(&so->so_lock)); 381 382 while (so->so_flag & SOREADLOCKED) { 383 if (fmode & (FNDELAY|FNONBLOCK)) 384 return (EWOULDBLOCK); 385 if (!cv_wait_sig(&so->so_read_cv, &so->so_lock)) 386 return (EINTR); 387 } 388 so->so_flag |= SOREADLOCKED; 389 return (0); 390 } 391 392 /* 393 * Caller must hold the mutex. Used to clear SOREADLOCKED, 394 * set in so_lock_read() or so_lock_read_intr(). 395 */ 396 void 397 so_unlock_read(struct sonode *so) 398 { 399 ASSERT(MUTEX_HELD(&so->so_lock)); 400 ASSERT(so->so_flag & SOREADLOCKED); 401 402 cv_signal(&so->so_read_cv); 403 so->so_flag &= ~SOREADLOCKED; 404 } 405 406 /* 407 * Verify that the specified offset falls within the mblk and 408 * that the resulting pointer is aligned. 409 * Returns NULL if not. 
410 */ 411 void * 412 sogetoff(mblk_t *mp, t_uscalar_t offset, 413 t_uscalar_t length, uint_t align_size) 414 { 415 uintptr_t ptr1, ptr2; 416 417 ASSERT(mp && mp->b_wptr >= mp->b_rptr); 418 ptr1 = (uintptr_t)mp->b_rptr + offset; 419 ptr2 = (uintptr_t)ptr1 + length; 420 if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) { 421 eprintline(0); 422 return (NULL); 423 } 424 if ((ptr1 & (align_size - 1)) != 0) { 425 eprintline(0); 426 return (NULL); 427 } 428 return ((void *)ptr1); 429 } 430 431 /* 432 * Return the AF_UNIX underlying filesystem vnode matching a given name. 433 * Makes sure the sending and the destination sonodes are compatible. 434 * The vnode is returned held. 435 * 436 * The underlying filesystem VSOCK vnode has a v_stream pointer that 437 * references the actual stream head (hence indirectly the actual sonode). 438 */ 439 static int 440 so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess, 441 vnode_t **vpp) 442 { 443 vnode_t *vp; /* Underlying filesystem vnode */ 444 vnode_t *rvp; /* real vnode */ 445 vnode_t *svp; /* sockfs vnode */ 446 struct sonode *so2; 447 int error; 448 449 dprintso(so, 1, ("so_ux_lookup(%p) name <%s>\n", (void *)so, 450 soun->sun_path)); 451 452 error = lookupname(soun->sun_path, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); 453 if (error) { 454 eprintsoline(so, error); 455 return (error); 456 } 457 458 /* 459 * Traverse lofs mounts get the real vnode 460 */ 461 if (VOP_REALVP(vp, &rvp, NULL) == 0) { 462 VN_HOLD(rvp); /* hold the real vnode */ 463 VN_RELE(vp); /* release hold from lookup */ 464 vp = rvp; 465 } 466 467 if (vp->v_type != VSOCK) { 468 error = ENOTSOCK; 469 eprintsoline(so, error); 470 goto done2; 471 } 472 473 if (checkaccess) { 474 /* 475 * Check that we have permissions to access the destination 476 * vnode. This check is not done in BSD but it is required 477 * by X/Open. 
478 */ 479 error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL); 480 if (error != 0) { 481 eprintsoline(so, error); 482 goto done2; 483 } 484 } 485 486 /* 487 * Check if the remote socket has been closed. 488 * 489 * Synchronize with vn_rele_stream by holding v_lock while traversing 490 * v_stream->sd_vnode. 491 */ 492 mutex_enter(&vp->v_lock); 493 if (vp->v_stream == NULL) { 494 mutex_exit(&vp->v_lock); 495 if (so->so_type == SOCK_DGRAM) 496 error = EDESTADDRREQ; 497 else 498 error = ECONNREFUSED; 499 500 eprintsoline(so, error); 501 goto done2; 502 } 503 ASSERT(vp->v_stream->sd_vnode); 504 svp = vp->v_stream->sd_vnode; 505 /* 506 * holding v_lock on underlying filesystem vnode and acquiring 507 * it on sockfs vnode. Assumes that no code ever attempts to 508 * acquire these locks in the reverse order. 509 */ 510 VN_HOLD(svp); 511 mutex_exit(&vp->v_lock); 512 513 if (svp->v_type != VSOCK) { 514 error = ENOTSOCK; 515 eprintsoline(so, error); 516 goto done; 517 } 518 519 so2 = VTOSO(svp); 520 521 if (so->so_type != so2->so_type) { 522 error = EPROTOTYPE; 523 eprintsoline(so, error); 524 goto done; 525 } 526 527 VN_RELE(svp); 528 *vpp = vp; 529 return (0); 530 531 done: 532 VN_RELE(svp); 533 done2: 534 VN_RELE(vp); 535 return (error); 536 } 537 538 /* 539 * Verify peer address for connect and sendto/sendmsg. 540 * Since sendto/sendmsg would not get synchronous errors from the transport 541 * provider we have to do these ugly checks in the socket layer to 542 * preserve compatibility with SunOS 4.X. 
543 */ 544 int 545 so_addr_verify(struct sonode *so, const struct sockaddr *name, 546 socklen_t namelen) 547 { 548 int family; 549 550 dprintso(so, 1, ("so_addr_verify(%p, %p, %d)\n", 551 (void *)so, (void *)name, namelen)); 552 553 ASSERT(name != NULL); 554 555 family = so->so_family; 556 switch (family) { 557 case AF_INET: 558 if (name->sa_family != family) { 559 eprintsoline(so, EAFNOSUPPORT); 560 return (EAFNOSUPPORT); 561 } 562 if (namelen != (socklen_t)sizeof (struct sockaddr_in)) { 563 eprintsoline(so, EINVAL); 564 return (EINVAL); 565 } 566 break; 567 case AF_INET6: { 568 #ifdef DEBUG 569 struct sockaddr_in6 *sin6; 570 #endif /* DEBUG */ 571 572 if (name->sa_family != family) { 573 eprintsoline(so, EAFNOSUPPORT); 574 return (EAFNOSUPPORT); 575 } 576 if (namelen != (socklen_t)sizeof (struct sockaddr_in6)) { 577 eprintsoline(so, EINVAL); 578 return (EINVAL); 579 } 580 #ifdef DEBUG 581 /* Verify that apps don't forget to clear sin6_scope_id etc */ 582 sin6 = (struct sockaddr_in6 *)name; 583 if (sin6->sin6_scope_id != 0 && 584 !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) { 585 zcmn_err(getzoneid(), CE_WARN, 586 "connect/send* with uninitialized sin6_scope_id " 587 "(%d) on socket. Pid = %d\n", 588 (int)sin6->sin6_scope_id, (int)curproc->p_pid); 589 } 590 #endif /* DEBUG */ 591 break; 592 } 593 case AF_UNIX: 594 if (SOTOTPI(so)->sti_faddr_noxlate) { 595 return (0); 596 } 597 if (namelen < (socklen_t)sizeof (short)) { 598 eprintsoline(so, ENOENT); 599 return (ENOENT); 600 } 601 if (name->sa_family != family) { 602 eprintsoline(so, EAFNOSUPPORT); 603 return (EAFNOSUPPORT); 604 } 605 /* MAXPATHLEN + soun_family + nul termination */ 606 if (namelen > (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) { 607 eprintsoline(so, ENAMETOOLONG); 608 return (ENAMETOOLONG); 609 } 610 611 break; 612 613 default: 614 /* 615 * Default is don't do any length or sa_family check 616 * to allow non-sockaddr style addresses. 
617 */ 618 break; 619 } 620 621 return (0); 622 } 623 624 625 /* 626 * Translate an AF_UNIX sockaddr_un to the transport internal name. 627 * Assumes caller has called so_addr_verify first. The translated 628 * (internal form) address is stored in sti->sti_ux_taddr. 629 */ 630 /*ARGSUSED*/ 631 int 632 so_ux_addr_xlate(struct sonode *so, struct sockaddr *name, 633 socklen_t namelen, int checkaccess, 634 void **addrp, socklen_t *addrlenp) 635 { 636 int error; 637 struct sockaddr_un *soun; 638 vnode_t *vp; 639 void *addr; 640 socklen_t addrlen; 641 sotpi_info_t *sti = SOTOTPI(so); 642 643 dprintso(so, 1, ("so_ux_addr_xlate(%p, %p, %d, %d)\n", 644 (void *)so, (void *)name, namelen, checkaccess)); 645 646 ASSERT(name != NULL); 647 ASSERT(so->so_family == AF_UNIX); 648 ASSERT(!sti->sti_faddr_noxlate); 649 ASSERT(namelen >= (socklen_t)sizeof (short)); 650 ASSERT(name->sa_family == AF_UNIX); 651 soun = (struct sockaddr_un *)name; 652 /* 653 * Lookup vnode for the specified path name and verify that 654 * it is a socket. 655 */ 656 error = so_ux_lookup(so, soun, checkaccess, &vp); 657 if (error) { 658 eprintsoline(so, error); 659 return (error); 660 } 661 /* 662 * Use the address of the peer vnode as the address to send 663 * to. We release the peer vnode here. In case it has been 664 * closed by the time the T_CONN_REQ or T_UNITDATA_REQ reaches the 665 * transport the message will get an error or be dropped. 666 * Note that that soua_vp is never dereferenced; it's just a 667 * convenient value by which we can identify the peer. 668 */ 669 sti->sti_ux_taddr.soua_vp = vp; 670 sti->sti_ux_taddr.soua_magic = SOU_MAGIC_EXPLICIT; 671 addr = &sti->sti_ux_taddr; 672 addrlen = (socklen_t)sizeof (sti->sti_ux_taddr); 673 dprintso(so, 1, ("ux_xlate UNIX: addrlen %d, vp %p\n", 674 addrlen, (void *)vp)); 675 VN_RELE(vp); 676 *addrp = addr; 677 *addrlenp = (socklen_t)addrlen; 678 return (0); 679 } 680 681 /* 682 * Esballoc free function for messages that contain SO_FILEP option. 
683 * Decrement the reference count on the file pointers using closef. 684 */ 685 void 686 fdbuf_free(struct fdbuf *fdbuf) 687 { 688 int i; 689 struct file *fp; 690 691 dprint(1, ("fdbuf_free: %d fds\n", fdbuf->fd_numfd)); 692 for (i = 0; i < fdbuf->fd_numfd; i++) { 693 /* 694 * We need pointer size alignment for fd_fds. On a LP64 695 * kernel, the required alignment is 8 bytes while 696 * the option headers and values are only 4 bytes 697 * aligned. So its safer to do a bcopy compared to 698 * assigning fdbuf->fd_fds[i] to fp. 699 */ 700 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp)); 701 dprint(1, ("fdbuf_free: [%d] = %p\n", i, (void *)fp)); 702 (void) closef(fp); 703 } 704 if (fdbuf->fd_ebuf != NULL) 705 kmem_free(fdbuf->fd_ebuf, fdbuf->fd_ebuflen); 706 kmem_free(fdbuf, fdbuf->fd_size); 707 } 708 709 /* 710 * Allocate an esballoc'ed message for AF_UNIX file descriptor passing. 711 * Waits if memory is not available. 712 */ 713 mblk_t * 714 fdbuf_allocmsg(int size, struct fdbuf *fdbuf) 715 { 716 uchar_t *buf; 717 mblk_t *mp; 718 719 dprint(1, ("fdbuf_allocmsg: size %d, %d fds\n", size, fdbuf->fd_numfd)); 720 buf = kmem_alloc(size, KM_SLEEP); 721 fdbuf->fd_ebuf = (caddr_t)buf; 722 fdbuf->fd_ebuflen = size; 723 fdbuf->fd_frtn.free_func = fdbuf_free; 724 fdbuf->fd_frtn.free_arg = (caddr_t)fdbuf; 725 726 mp = esballoc_wait(buf, size, BPRI_MED, &fdbuf->fd_frtn); 727 mp->b_datap->db_type = M_PROTO; 728 return (mp); 729 } 730 731 /* 732 * Extract file descriptors from a fdbuf. 733 * Return list in rights/rightslen. 734 */ 735 /*ARGSUSED*/ 736 static int 737 fdbuf_extract(struct fdbuf *fdbuf, void *rights, int rightslen, int msg_flags) 738 { 739 int i, fd; 740 int *rp; 741 struct file *fp; 742 int numfd; 743 744 dprint(1, ("fdbuf_extract: %d fds, len %d\n", 745 fdbuf->fd_numfd, rightslen)); 746 747 numfd = fdbuf->fd_numfd; 748 ASSERT(rightslen == numfd * (int)sizeof (int)); 749 750 /* 751 * Allocate a file descriptor and increment the f_count. 
752 * The latter is needed since we always call fdbuf_free 753 * which performs a closef. 754 */ 755 rp = (int *)rights; 756 for (i = 0; i < numfd; i++) { 757 if ((fd = ufalloc(0)) == -1) 758 goto cleanup; 759 /* 760 * We need pointer size alignment for fd_fds. On a LP64 761 * kernel, the required alignment is 8 bytes while 762 * the option headers and values are only 4 bytes 763 * aligned. So its safer to do a bcopy compared to 764 * assigning fdbuf->fd_fds[i] to fp. 765 */ 766 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp)); 767 mutex_enter(&fp->f_tlock); 768 fp->f_count++; 769 mutex_exit(&fp->f_tlock); 770 setf(fd, fp); 771 if ((msg_flags & MSG_CMSG_CLOEXEC) != 0) { 772 f_setfd_or(fd, FD_CLOEXEC); 773 } 774 if ((msg_flags & MSG_CMSG_CLOFORK) != 0) { 775 f_setfd_or(fd, FD_CLOFORK); 776 } 777 *rp++ = fd; 778 if (AU_AUDITING()) 779 audit_fdrecv(fd, fp); 780 dprint(1, ("fdbuf_extract: [%d] = %d, %p refcnt %d\n", 781 i, fd, (void *)fp, fp->f_count)); 782 } 783 return (0); 784 785 cleanup: 786 /* 787 * Undo whatever partial work the loop above has done. 788 */ 789 { 790 int j; 791 792 rp = (int *)rights; 793 for (j = 0; j < i; j++) { 794 dprint(0, 795 ("fdbuf_extract: cleanup[%d] = %d\n", j, *rp)); 796 (void) closeandsetf(*rp++, NULL); 797 } 798 } 799 800 return (EMFILE); 801 } 802 803 /* 804 * Insert file descriptors into an fdbuf. 805 * Returns a kmem_alloc'ed fdbuf. The fdbuf should be freed 806 * by calling fdbuf_free(). 
807 */ 808 int 809 fdbuf_create(void *rights, int rightslen, struct fdbuf **fdbufp) 810 { 811 int numfd, i; 812 int *fds; 813 struct file *fp; 814 struct fdbuf *fdbuf; 815 int fdbufsize; 816 817 dprint(1, ("fdbuf_create: len %d\n", rightslen)); 818 819 numfd = rightslen / (int)sizeof (int); 820 821 fdbufsize = (int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *)); 822 fdbuf = kmem_alloc(fdbufsize, KM_SLEEP); 823 fdbuf->fd_size = fdbufsize; 824 fdbuf->fd_numfd = 0; 825 fdbuf->fd_ebuf = NULL; 826 fdbuf->fd_ebuflen = 0; 827 fds = (int *)rights; 828 for (i = 0; i < numfd; i++) { 829 if ((fp = getf(fds[i])) == NULL) { 830 fdbuf_free(fdbuf); 831 return (EBADF); 832 } 833 dprint(1, ("fdbuf_create: [%d] = %d, %p refcnt %d\n", 834 i, fds[i], (void *)fp, fp->f_count)); 835 mutex_enter(&fp->f_tlock); 836 fp->f_count++; 837 mutex_exit(&fp->f_tlock); 838 /* 839 * The maximum alignment for fdbuf (or any option header 840 * and its value) it 4 bytes. On a LP64 kernel, the alignment 841 * is not sufficient for pointers (fd_fds in this case). Since 842 * we just did a kmem_alloc (we get a double word alignment), 843 * we don't need to do anything on the send side (we loose 844 * the double word alignment because fdbuf goes after an 845 * option header (eg T_unitdata_req) which is only 4 byte 846 * aligned). We take care of this when we extract the file 847 * descriptor in fdbuf_extract or fdbuf_free. 
848 */ 849 fdbuf->fd_fds[i] = fp; 850 fdbuf->fd_numfd++; 851 releasef(fds[i]); 852 if (AU_AUDITING()) 853 audit_fdsend(fds[i], fp, 0); 854 } 855 *fdbufp = fdbuf; 856 return (0); 857 } 858 859 static int 860 fdbuf_optlen(int rightslen) 861 { 862 int numfd; 863 864 numfd = rightslen / (int)sizeof (int); 865 866 return ((int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *))); 867 } 868 869 static t_uscalar_t 870 fdbuf_cmsglen(int fdbuflen) 871 { 872 return (t_uscalar_t)((fdbuflen - FDBUF_HDRSIZE) / 873 (int)sizeof (struct file *) * (int)sizeof (int)); 874 } 875 876 877 /* 878 * Return non-zero if the mblk and fdbuf are consistent. 879 */ 880 static int 881 fdbuf_verify(mblk_t *mp, struct fdbuf *fdbuf, int fdbuflen) 882 { 883 if (fdbuflen >= FDBUF_HDRSIZE && 884 fdbuflen == fdbuf->fd_size) { 885 frtn_t *frp = mp->b_datap->db_frtnp; 886 /* 887 * Check that the SO_FILEP portion of the 888 * message has not been modified by 889 * the loopback transport. The sending sockfs generates 890 * a message that is esballoc'ed with the free function 891 * being fdbuf_free() and where free_arg contains the 892 * identical information as the SO_FILEP content. 893 * 894 * If any of these constraints are not satisfied we 895 * silently ignore the option. 896 */ 897 ASSERT(mp); 898 if (frp != NULL && 899 frp->free_func == fdbuf_free && 900 frp->free_arg != NULL && 901 bcmp(frp->free_arg, fdbuf, fdbuflen) == 0) { 902 dprint(1, ("fdbuf_verify: fdbuf %p len %d\n", 903 (void *)fdbuf, fdbuflen)); 904 return (1); 905 } else { 906 zcmn_err(getzoneid(), CE_WARN, 907 "sockfs: mismatched fdbuf content (%p)", 908 (void *)mp); 909 return (0); 910 } 911 } else { 912 zcmn_err(getzoneid(), CE_WARN, 913 "sockfs: mismatched fdbuf len %d, %d\n", 914 fdbuflen, fdbuf->fd_size); 915 return (0); 916 } 917 } 918 919 /* 920 * When the file descriptors returned by sorecvmsg can not be passed 921 * to the application this routine will cleanup the references on 922 * the files. 
Start at startoff bytes into the buffer. 923 */ 924 static void 925 close_fds(void *fdbuf, int fdbuflen, int startoff) 926 { 927 int *fds = (int *)fdbuf; 928 int numfd = fdbuflen / (int)sizeof (int); 929 int i; 930 931 dprint(1, ("close_fds(%p, %d, %d)\n", fdbuf, fdbuflen, startoff)); 932 933 for (i = 0; i < numfd; i++) { 934 if (startoff < 0) 935 startoff = 0; 936 if (startoff < (int)sizeof (int)) { 937 /* 938 * This file descriptor is partially or fully after 939 * the offset 940 */ 941 dprint(0, 942 ("close_fds: cleanup[%d] = %d\n", i, fds[i])); 943 (void) closeandsetf(fds[i], NULL); 944 } 945 startoff -= (int)sizeof (int); 946 } 947 } 948 949 /* 950 * Close all file descriptors contained in the control part starting at 951 * the startoffset. 952 */ 953 void 954 so_closefds(void *control, t_uscalar_t controllen, int oldflg, 955 int startoff) 956 { 957 struct cmsghdr *cmsg; 958 959 if (control == NULL) 960 return; 961 962 if (oldflg) { 963 close_fds(control, controllen, startoff); 964 return; 965 } 966 /* Scan control part for file descriptors. */ 967 for (cmsg = (struct cmsghdr *)control; 968 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 969 cmsg = CMSG_NEXT(cmsg)) { 970 if (cmsg->cmsg_level == SOL_SOCKET && 971 cmsg->cmsg_type == SCM_RIGHTS) { 972 close_fds(CMSG_CONTENT(cmsg), 973 (int)CMSG_CONTENTLEN(cmsg), 974 startoff - (int)sizeof (struct cmsghdr)); 975 } 976 startoff -= ROUNDUP_cmsglen(cmsg->cmsg_len); 977 } 978 } 979 980 /* 981 * Handle truncation of a cmsg when the receive buffer is not big enough. 982 * Adjust the cmsg_len header field in the last cmsg that will be included in 983 * the buffer to reflect the number of bytes included. 
984 */ 985 void 986 so_truncatecmsg(void *control, t_uscalar_t controllen, uint_t maxlen) 987 { 988 struct cmsghdr *cmsg; 989 uint_t len = 0; 990 991 if (control == NULL) 992 return; 993 994 for (cmsg = control; 995 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 996 cmsg = CMSG_NEXT(cmsg)) { 997 998 len += ROUNDUP_cmsglen(cmsg->cmsg_len); 999 1000 if (len > maxlen) { 1001 /* 1002 * This cmsg is the last one that will be included in 1003 * the truncated buffer. 1004 */ 1005 socklen_t diff = len - maxlen; 1006 1007 if (diff < CMSG_CONTENTLEN(cmsg)) { 1008 dprint(1, ("so_truncatecmsg: %d -> %d\n", 1009 cmsg->cmsg_len, cmsg->cmsg_len - diff)); 1010 cmsg->cmsg_len -= diff; 1011 } else { 1012 cmsg->cmsg_len = sizeof (struct cmsghdr); 1013 } 1014 break; 1015 } 1016 } 1017 } 1018 1019 /* 1020 * Returns a pointer/length for the file descriptors contained 1021 * in the control buffer. Returns with *fdlenp == -1 if there are no 1022 * file descriptor options present. This is different than there being 1023 * a zero-length file descriptor option. 1024 * Fail if there are multiple SCM_RIGHT cmsgs. 
1025 */ 1026 int 1027 so_getfdopt(void *control, t_uscalar_t controllen, int oldflg, 1028 void **fdsp, int *fdlenp) 1029 { 1030 struct cmsghdr *cmsg; 1031 void *fds; 1032 int fdlen; 1033 1034 if (control == NULL) { 1035 *fdsp = NULL; 1036 *fdlenp = -1; 1037 return (0); 1038 } 1039 1040 if (oldflg) { 1041 *fdsp = control; 1042 if (controllen == 0) 1043 *fdlenp = -1; 1044 else 1045 *fdlenp = controllen; 1046 dprint(1, ("so_getfdopt: old %d\n", *fdlenp)); 1047 return (0); 1048 } 1049 1050 fds = NULL; 1051 fdlen = 0; 1052 1053 for (cmsg = (struct cmsghdr *)control; 1054 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 1055 cmsg = CMSG_NEXT(cmsg)) { 1056 if (cmsg->cmsg_level == SOL_SOCKET && 1057 cmsg->cmsg_type == SCM_RIGHTS) { 1058 if (fds != NULL) 1059 return (EINVAL); 1060 fds = CMSG_CONTENT(cmsg); 1061 fdlen = (int)CMSG_CONTENTLEN(cmsg); 1062 dprint(1, ("so_getfdopt: new %lu\n", 1063 (size_t)CMSG_CONTENTLEN(cmsg))); 1064 } 1065 } 1066 if (fds == NULL) { 1067 dprint(1, ("so_getfdopt: NONE\n")); 1068 *fdlenp = -1; 1069 } else 1070 *fdlenp = fdlen; 1071 *fdsp = fds; 1072 return (0); 1073 } 1074 1075 /* 1076 * Return the length of the options including any file descriptor options. 
 */
t_uscalar_t
so_optlen(void *control, t_uscalar_t controllen, int oldflg)
{
	struct cmsghdr *cmsg;
	t_uscalar_t optlen = 0;
	t_uscalar_t len;

	if (control == NULL)
		return (0);

	/* Old style: the buffer is a bare descriptor array, i.e. one option. */
	if (oldflg)
		return ((t_uscalar_t)(sizeof (struct T_opthdr) +
		    fdbuf_optlen(controllen)));

	for (cmsg = (struct cmsghdr *)control;
	    CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
	    cmsg = CMSG_NEXT(cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS) {
			/* Descriptors are sized as the fdbuf that carries them. */
			len = fdbuf_optlen((int)CMSG_CONTENTLEN(cmsg));
		} else {
			len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
		}
		/* Each option is its aligned content plus a T_opthdr. */
		optlen += (t_uscalar_t)(_TPI_ALIGN_TOPT(len) +
		    sizeof (struct T_opthdr));
	}
	dprint(1, ("so_optlen: controllen %d, flg %d -> optlen %d\n",
	    controllen, oldflg, optlen));
	return (optlen);
}

/*
 * Copy options from control to the mblk. Skip any file descriptor options.
 *
 * Each remaining cmsg is appended to mp as a T_opthdr followed by its
 * content, and b_wptr is advanced past the alignment padding.
 */
void
so_cmsg2opt(void *control, t_uscalar_t controllen, int oldflg, mblk_t *mp)
{
	struct T_opthdr toh;
	struct cmsghdr *cmsg;

	if (control == NULL)
		return;

	if (oldflg) {
		/* No real options - caller has handled file descriptors */
		return;
	}
	for (cmsg = (struct cmsghdr *)control;
	    CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
	    cmsg = CMSG_NEXT(cmsg)) {
		/*
		 * Note: The caller handles file descriptors prior
		 * to calling this function.
		 */
		t_uscalar_t len;

		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS)
			continue;

		len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
		toh.level = cmsg->cmsg_level;
		toh.name = cmsg->cmsg_type;
		/* toh.len covers the header plus the unpadded content. */
		toh.len = len + (t_uscalar_t)sizeof (struct T_opthdr);
		toh.status = 0;

		soappendmsg(mp, &toh, sizeof (toh));
		soappendmsg(mp, CMSG_CONTENT(cmsg), len);
		/* Skip over the padding that aligns the next option. */
		mp->b_wptr += _TPI_ALIGN_TOPT(len) - len;
		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
	}
}

/*
 * Return the length of the control message derived from the options.
 * Exclude SO_SRCADDR and SO_UNIX_CLOSE options. Include SO_FILEP.
 * When oldflg is set only include SO_FILEP.
 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen
 * allocates the space that so_opt2cmsg fills. If one changes, the other should
 * also be checked for any possible impacts.
 */
t_uscalar_t
so_cmsglen(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg)
{
	t_uscalar_t cmsglen = 0;
	struct T_opthdr *tohp;
	t_uscalar_t len;
	/* Padding added for the most recently counted cmsg; removed at the end. */
	t_uscalar_t last_roundup = 0;

	ASSERT(__TPI_TOPT_ISALIGNED(opt));

	for (tohp = (struct T_opthdr *)opt;
	    tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
	    tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
		dprint(1, ("so_cmsglen: level 0x%x, name %d, len %d\n",
		    tohp->level, tohp->name, tohp->len));
		if (tohp->level == SOL_SOCKET &&
		    (tohp->name == SO_SRCADDR ||
		    tohp->name == SO_UNIX_CLOSE)) {
			continue;
		}
		if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) {
			struct fdbuf *fdbuf;
			int fdbuflen;

			fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp);
			fdbuflen = (int)_TPI_TOPT_DATALEN(tohp);

			/* An fdbuf that fails verification contributes nothing. */
			if (!fdbuf_verify(mp, fdbuf, fdbuflen))
				continue;
			if (oldflg) {
				/* Old style: bare descriptors, no cmsghdr. */
				cmsglen += fdbuf_cmsglen(fdbuflen);
				continue;
			}
			len = fdbuf_cmsglen(fdbuflen);
		} else if (tohp->level == SOL_SOCKET &&
		    tohp->name == SCM_TIMESTAMP) {
			if (oldflg)
				continue;

			/* Size depends on the caller's data model. */
			if (get_udatamodel() == DATAMODEL_NATIVE) {
				len = sizeof (struct timeval);
			} else {
				len = sizeof (struct timeval32);
			}
		} else {
			if (oldflg)
				continue;
			len = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp);
		}
		/*
		 * Exclude roundup for last option to not set
		 * MSG_CTRUNC when the cmsg fits but the padding doesn't fit.
		 */
		last_roundup = (t_uscalar_t)
		    (ROUNDUP_cmsglen(len + (int)sizeof (struct cmsghdr)) -
		    (len + (int)sizeof (struct cmsghdr)));
		cmsglen += (t_uscalar_t)(len + (int)sizeof (struct cmsghdr)) +
		    last_roundup;
	}
	/* Drop the padding of the final cmsg (zero if none was counted). */
	cmsglen -= last_roundup;
	dprint(1, ("so_cmsglen: optlen %d, flg %d -> cmsglen %d\n",
	    optlen, oldflg, cmsglen));
	return (cmsglen);
}

/*
 * Copy options from options to the control. Convert SO_FILEP to
 * file descriptors.
 * Returns errno or zero.
 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen
 * allocates the space that so_opt2cmsg fills. If one changes, the other should
 * also be checked for any possible impacts.
1231 */ 1232 int 1233 so_opt2cmsg(mblk_t *mp, void *opt, t_uscalar_t optlen, int msg_flags, 1234 void *control, t_uscalar_t controllen) 1235 { 1236 struct T_opthdr *tohp; 1237 struct cmsghdr *cmsg; 1238 struct fdbuf *fdbuf; 1239 int fdbuflen; 1240 int error; 1241 int oldflg = (msg_flags & MSG_XPG4_2) == 0; 1242 #if defined(DEBUG) || defined(__lint) 1243 struct cmsghdr *cend = (struct cmsghdr *) 1244 (((uint8_t *)control) + ROUNDUP_cmsglen(controllen)); 1245 #endif 1246 cmsg = (struct cmsghdr *)control; 1247 1248 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1249 1250 for (tohp = (struct T_opthdr *)opt; 1251 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1252 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1253 dprint(1, ("so_opt2cmsg: level 0x%x, name %d, len %d\n", 1254 tohp->level, tohp->name, tohp->len)); 1255 1256 if (tohp->level == SOL_SOCKET && 1257 (tohp->name == SO_SRCADDR || 1258 tohp->name == SO_UNIX_CLOSE)) { 1259 continue; 1260 } 1261 ASSERT((uintptr_t)cmsg <= (uintptr_t)control + controllen); 1262 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) { 1263 fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp); 1264 fdbuflen = (int)_TPI_TOPT_DATALEN(tohp); 1265 1266 if (!fdbuf_verify(mp, fdbuf, fdbuflen)) 1267 return (EPROTO); 1268 if (oldflg) { 1269 error = fdbuf_extract(fdbuf, control, 1270 (int)controllen, msg_flags); 1271 if (error != 0) 1272 return (error); 1273 continue; 1274 } else { 1275 int fdlen; 1276 1277 fdlen = (int)fdbuf_cmsglen( 1278 (int)_TPI_TOPT_DATALEN(tohp)); 1279 1280 cmsg->cmsg_level = tohp->level; 1281 cmsg->cmsg_type = SCM_RIGHTS; 1282 cmsg->cmsg_len = (socklen_t)(fdlen + 1283 sizeof (struct cmsghdr)); 1284 1285 error = fdbuf_extract(fdbuf, 1286 CMSG_CONTENT(cmsg), fdlen, msg_flags); 1287 if (error != 0) 1288 return (error); 1289 } 1290 } else if (tohp->level == SOL_SOCKET && 1291 tohp->name == SCM_TIMESTAMP) { 1292 timestruc_t *timestamp; 1293 1294 if (oldflg) 1295 continue; 1296 1297 cmsg->cmsg_level = tohp->level; 1298 
cmsg->cmsg_type = tohp->name; 1299 1300 timestamp = 1301 (timestruc_t *)P2ROUNDUP((intptr_t)&tohp[1], 1302 sizeof (intptr_t)); 1303 1304 if (get_udatamodel() == DATAMODEL_NATIVE) { 1305 struct timeval tv; 1306 1307 cmsg->cmsg_len = sizeof (struct timeval) + 1308 sizeof (struct cmsghdr); 1309 tv.tv_sec = timestamp->tv_sec; 1310 tv.tv_usec = timestamp->tv_nsec / 1311 (NANOSEC / MICROSEC); 1312 /* 1313 * on LP64 systems, the struct timeval in 1314 * the destination will not be 8-byte aligned, 1315 * so use bcopy to avoid alignment trouble 1316 */ 1317 bcopy(&tv, CMSG_CONTENT(cmsg), sizeof (tv)); 1318 } else { 1319 struct timeval32 *time32; 1320 1321 cmsg->cmsg_len = sizeof (struct timeval32) + 1322 sizeof (struct cmsghdr); 1323 time32 = (struct timeval32 *)CMSG_CONTENT(cmsg); 1324 time32->tv_sec = (time32_t)timestamp->tv_sec; 1325 time32->tv_usec = 1326 (int32_t)(timestamp->tv_nsec / 1327 (NANOSEC / MICROSEC)); 1328 } 1329 1330 } else { 1331 if (oldflg) 1332 continue; 1333 1334 cmsg->cmsg_level = tohp->level; 1335 cmsg->cmsg_type = tohp->name; 1336 cmsg->cmsg_len = (socklen_t)sizeof (struct cmsghdr); 1337 if (tohp->level == IPPROTO_IP && 1338 (tohp->name == IP_RECVTOS || 1339 tohp->name == IP_RECVTTL)) { 1340 /* 1341 * The data for these is a uint8_t but, in 1342 * order to maintain alignment for any 1343 * following TPI primitives in the message, 1344 * there will be some trailing padding bytes 1345 * which are included in the TPI_TOPT_DATALEN. 1346 * For these types, we set the cmsg_len 1347 * explicitly to the correct value. 1348 */ 1349 cmsg->cmsg_len += (socklen_t)sizeof (uint8_t); 1350 } else { 1351 cmsg->cmsg_len += 1352 (socklen_t)(_TPI_TOPT_DATALEN(tohp)); 1353 } 1354 1355 /* copy content to control data part */ 1356 bcopy(&tohp[1], CMSG_CONTENT(cmsg), 1357 CMSG_CONTENTLEN(cmsg)); 1358 } 1359 /* move to next CMSG structure! 
*/ 1360 cmsg = CMSG_NEXT(cmsg); 1361 } 1362 dprint(1, ("so_opt2cmsg: buf %p len %d; cend %p; final cmsg %p\n", 1363 control, controllen, (void *)cend, (void *)cmsg)); 1364 ASSERT(cmsg <= cend); 1365 return (0); 1366 } 1367 1368 /* 1369 * Extract the SO_SRCADDR option value if present. 1370 */ 1371 void 1372 so_getopt_srcaddr(void *opt, t_uscalar_t optlen, void **srcp, 1373 t_uscalar_t *srclenp) 1374 { 1375 struct T_opthdr *tohp; 1376 1377 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1378 1379 ASSERT(srcp != NULL && srclenp != NULL); 1380 *srcp = NULL; 1381 *srclenp = 0; 1382 1383 for (tohp = (struct T_opthdr *)opt; 1384 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1385 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1386 dprint(1, ("so_getopt_srcaddr: level 0x%x, name %d, len %d\n", 1387 tohp->level, tohp->name, tohp->len)); 1388 if (tohp->level == SOL_SOCKET && 1389 tohp->name == SO_SRCADDR) { 1390 *srcp = _TPI_TOPT_DATA(tohp); 1391 *srclenp = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp); 1392 } 1393 } 1394 } 1395 1396 /* 1397 * Verify if the SO_UNIX_CLOSE option is present. 1398 */ 1399 int 1400 so_getopt_unix_close(void *opt, t_uscalar_t optlen) 1401 { 1402 struct T_opthdr *tohp; 1403 1404 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1405 1406 for (tohp = (struct T_opthdr *)opt; 1407 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1408 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1409 dprint(1, 1410 ("so_getopt_unix_close: level 0x%x, name %d, len %d\n", 1411 tohp->level, tohp->name, tohp->len)); 1412 if (tohp->level == SOL_SOCKET && 1413 tohp->name == SO_UNIX_CLOSE) 1414 return (1); 1415 } 1416 return (0); 1417 } 1418 1419 /* 1420 * Allocate an M_PROTO message. 1421 * 1422 * If allocation fails the behavior depends on sleepflg: 1423 * _ALLOC_NOSLEEP fail immediately 1424 * _ALLOC_INTR sleep for memory until a signal is caught 1425 * _ALLOC_SLEEP sleep forever. Don't return NULL. 
1426 */ 1427 mblk_t * 1428 soallocproto(size_t size, int sleepflg, cred_t *cr) 1429 { 1430 mblk_t *mp; 1431 1432 /* Round up size for reuse */ 1433 size = MAX(size, 64); 1434 if (cr != NULL) 1435 mp = allocb_cred(size, cr, curproc->p_pid); 1436 else 1437 mp = allocb(size, BPRI_MED); 1438 1439 if (mp == NULL) { 1440 int error; /* Dummy - error not returned to caller */ 1441 1442 switch (sleepflg) { 1443 case _ALLOC_SLEEP: 1444 if (cr != NULL) { 1445 mp = allocb_cred_wait(size, STR_NOSIG, &error, 1446 cr, curproc->p_pid); 1447 } else { 1448 mp = allocb_wait(size, BPRI_MED, STR_NOSIG, 1449 &error); 1450 } 1451 ASSERT(mp); 1452 break; 1453 case _ALLOC_INTR: 1454 if (cr != NULL) { 1455 mp = allocb_cred_wait(size, 0, &error, cr, 1456 curproc->p_pid); 1457 } else { 1458 mp = allocb_wait(size, BPRI_MED, 0, &error); 1459 } 1460 if (mp == NULL) { 1461 /* Caught signal while sleeping for memory */ 1462 eprintline(ENOBUFS); 1463 return (NULL); 1464 } 1465 break; 1466 case _ALLOC_NOSLEEP: 1467 default: 1468 eprintline(ENOBUFS); 1469 return (NULL); 1470 } 1471 } 1472 DB_TYPE(mp) = M_PROTO; 1473 return (mp); 1474 } 1475 1476 /* 1477 * Allocate an M_PROTO message with a single component. 1478 * len is the length of buf. size is the amount to allocate. 1479 * 1480 * buf can be NULL with a non-zero len. 1481 * This results in a bzero'ed chunk being placed the message. 1482 */ 1483 mblk_t * 1484 soallocproto1(const void *buf, ssize_t len, ssize_t size, int sleepflg, 1485 cred_t *cr) 1486 { 1487 mblk_t *mp; 1488 1489 if (size == 0) 1490 size = len; 1491 1492 ASSERT(size >= len); 1493 /* Round up size for reuse */ 1494 size = MAX(size, 64); 1495 mp = soallocproto(size, sleepflg, cr); 1496 if (mp == NULL) 1497 return (NULL); 1498 mp->b_datap->db_type = M_PROTO; 1499 if (len != 0) { 1500 if (buf != NULL) 1501 bcopy(buf, mp->b_wptr, len); 1502 else 1503 bzero(mp->b_wptr, len); 1504 mp->b_wptr += len; 1505 } 1506 return (mp); 1507 } 1508 1509 /* 1510 * Append buf/len to mp. 
1511 * The caller has to ensure that there is enough room in the mblk. 1512 * 1513 * buf can be NULL with a non-zero len. 1514 * This results in a bzero'ed chunk being placed the message. 1515 */ 1516 void 1517 soappendmsg(mblk_t *mp, const void *buf, ssize_t len) 1518 { 1519 ASSERT(mp); 1520 1521 if (len != 0) { 1522 /* Assert for room left */ 1523 ASSERT(mp->b_datap->db_lim - mp->b_wptr >= len); 1524 if (buf != NULL) 1525 bcopy(buf, mp->b_wptr, len); 1526 else 1527 bzero(mp->b_wptr, len); 1528 } 1529 mp->b_wptr += len; 1530 } 1531 1532 /* 1533 * Create a message using two kernel buffers. 1534 * If size is set that will determine the allocation size (e.g. for future 1535 * soappendmsg calls). If size is zero it is derived from the buffer 1536 * lengths. 1537 */ 1538 mblk_t * 1539 soallocproto2(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2, 1540 ssize_t size, int sleepflg, cred_t *cr) 1541 { 1542 mblk_t *mp; 1543 1544 if (size == 0) 1545 size = len1 + len2; 1546 ASSERT(size >= len1 + len2); 1547 1548 mp = soallocproto1(buf1, len1, size, sleepflg, cr); 1549 if (mp) 1550 soappendmsg(mp, buf2, len2); 1551 return (mp); 1552 } 1553 1554 /* 1555 * Create a message using three kernel buffers. 1556 * If size is set that will determine the allocation size (for future 1557 * soappendmsg calls). If size is zero it is derived from the buffer 1558 * lengths. 
1559 */ 1560 mblk_t * 1561 soallocproto3(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2, 1562 const void *buf3, ssize_t len3, ssize_t size, int sleepflg, cred_t *cr) 1563 { 1564 mblk_t *mp; 1565 1566 if (size == 0) 1567 size = len1 + len2 +len3; 1568 ASSERT(size >= len1 + len2 + len3); 1569 1570 mp = soallocproto1(buf1, len1, size, sleepflg, cr); 1571 if (mp != NULL) { 1572 soappendmsg(mp, buf2, len2); 1573 soappendmsg(mp, buf3, len3); 1574 } 1575 return (mp); 1576 } 1577 1578 #ifdef DEBUG 1579 char * 1580 pr_state(uint_t state, uint_t mode) 1581 { 1582 static char buf[1024]; 1583 1584 buf[0] = 0; 1585 if (state & SS_ISCONNECTED) 1586 (void) strcat(buf, "ISCONNECTED "); 1587 if (state & SS_ISCONNECTING) 1588 (void) strcat(buf, "ISCONNECTING "); 1589 if (state & SS_ISDISCONNECTING) 1590 (void) strcat(buf, "ISDISCONNECTING "); 1591 if (state & SS_CANTSENDMORE) 1592 (void) strcat(buf, "CANTSENDMORE "); 1593 1594 if (state & SS_CANTRCVMORE) 1595 (void) strcat(buf, "CANTRCVMORE "); 1596 if (state & SS_ISBOUND) 1597 (void) strcat(buf, "ISBOUND "); 1598 if (state & SS_NDELAY) 1599 (void) strcat(buf, "NDELAY "); 1600 if (state & SS_NONBLOCK) 1601 (void) strcat(buf, "NONBLOCK "); 1602 1603 if (state & SS_ASYNC) 1604 (void) strcat(buf, "ASYNC "); 1605 if (state & SS_ACCEPTCONN) 1606 (void) strcat(buf, "ACCEPTCONN "); 1607 if (state & SS_SAVEDEOR) 1608 (void) strcat(buf, "SAVEDEOR "); 1609 1610 if (state & SS_RCVATMARK) 1611 (void) strcat(buf, "RCVATMARK "); 1612 if (state & SS_OOBPEND) 1613 (void) strcat(buf, "OOBPEND "); 1614 if (state & SS_HAVEOOBDATA) 1615 (void) strcat(buf, "HAVEOOBDATA "); 1616 if (state & SS_HADOOBDATA) 1617 (void) strcat(buf, "HADOOBDATA "); 1618 1619 if (mode & SM_PRIV) 1620 (void) strcat(buf, "PRIV "); 1621 if (mode & SM_ATOMIC) 1622 (void) strcat(buf, "ATOMIC "); 1623 if (mode & SM_ADDR) 1624 (void) strcat(buf, "ADDR "); 1625 if (mode & SM_CONNREQUIRED) 1626 (void) strcat(buf, "CONNREQUIRED "); 1627 1628 if (mode & SM_FDPASSING) 
1629 (void) strcat(buf, "FDPASSING "); 1630 if (mode & SM_EXDATA) 1631 (void) strcat(buf, "EXDATA "); 1632 if (mode & SM_OPTDATA) 1633 (void) strcat(buf, "OPTDATA "); 1634 if (mode & SM_BYTESTREAM) 1635 (void) strcat(buf, "BYTESTREAM "); 1636 return (buf); 1637 } 1638 1639 char * 1640 pr_addr(int family, struct sockaddr *addr, t_uscalar_t addrlen) 1641 { 1642 static char buf[1024]; 1643 1644 if (addr == NULL || addrlen == 0) { 1645 (void) sprintf(buf, "(len %d) %p", addrlen, (void *)addr); 1646 return (buf); 1647 } 1648 switch (family) { 1649 case AF_INET: { 1650 struct sockaddr_in sin; 1651 1652 bcopy(addr, &sin, sizeof (sin)); 1653 1654 (void) sprintf(buf, "(len %d) %x/%d", 1655 addrlen, ntohl(sin.sin_addr.s_addr), ntohs(sin.sin_port)); 1656 break; 1657 } 1658 case AF_INET6: { 1659 struct sockaddr_in6 sin6; 1660 uint16_t *piece = (uint16_t *)&sin6.sin6_addr; 1661 1662 bcopy((char *)addr, (char *)&sin6, sizeof (sin6)); 1663 (void) sprintf(buf, "(len %d) %x:%x:%x:%x:%x:%x:%x:%x/%d", 1664 addrlen, 1665 ntohs(piece[0]), ntohs(piece[1]), 1666 ntohs(piece[2]), ntohs(piece[3]), 1667 ntohs(piece[4]), ntohs(piece[5]), 1668 ntohs(piece[6]), ntohs(piece[7]), 1669 ntohs(sin6.sin6_port)); 1670 break; 1671 } 1672 case AF_UNIX: { 1673 struct sockaddr_un *soun = (struct sockaddr_un *)addr; 1674 1675 (void) sprintf(buf, "(len %d) %s", addrlen, 1676 (soun == NULL) ? "(none)" : soun->sun_path); 1677 break; 1678 } 1679 default: 1680 (void) sprintf(buf, "(unknown af %d)", family); 1681 break; 1682 } 1683 return (buf); 1684 } 1685 1686 /* The logical equivalence operator (a if-and-only-if b) */ 1687 #define EQUIVALENT(a, b) (((a) && (b)) || (!(a) && (!(b)))) 1688 1689 /* 1690 * Verify limitations and invariants on oob state. 
1691 * Return 1 if OK, otherwise 0 so that it can be used as 1692 * ASSERT(verify_oobstate(so)); 1693 */ 1694 int 1695 so_verify_oobstate(struct sonode *so) 1696 { 1697 boolean_t havemark; 1698 1699 ASSERT(MUTEX_HELD(&so->so_lock)); 1700 1701 /* 1702 * The possible state combinations are: 1703 * 0 1704 * SS_OOBPEND 1705 * SS_OOBPEND|SS_HAVEOOBDATA 1706 * SS_OOBPEND|SS_HADOOBDATA 1707 * SS_HADOOBDATA 1708 */ 1709 switch (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA)) { 1710 case 0: 1711 case SS_OOBPEND: 1712 case SS_OOBPEND|SS_HAVEOOBDATA: 1713 case SS_OOBPEND|SS_HADOOBDATA: 1714 case SS_HADOOBDATA: 1715 break; 1716 default: 1717 printf("Bad oob state 1 (%p): state %s\n", 1718 (void *)so, pr_state(so->so_state, so->so_mode)); 1719 return (0); 1720 } 1721 1722 /* SS_RCVATMARK should only be set when SS_OOBPEND is set */ 1723 if ((so->so_state & (SS_RCVATMARK|SS_OOBPEND)) == SS_RCVATMARK) { 1724 printf("Bad oob state 2 (%p): state %s\n", 1725 (void *)so, pr_state(so->so_state, so->so_mode)); 1726 return (0); 1727 } 1728 1729 /* 1730 * (havemark != 0 or SS_RCVATMARK) iff SS_OOBPEND 1731 * For TPI, the presence of a "mark" is indicated by sti_oobsigcnt. 1732 */ 1733 havemark = (SOCK_IS_NONSTR(so)) ? 
so->so_oobmark > 0 : 1734 SOTOTPI(so)->sti_oobsigcnt > 0; 1735 1736 if (!EQUIVALENT(havemark || (so->so_state & SS_RCVATMARK), 1737 so->so_state & SS_OOBPEND)) { 1738 printf("Bad oob state 3 (%p): state %s\n", 1739 (void *)so, pr_state(so->so_state, so->so_mode)); 1740 return (0); 1741 } 1742 1743 /* 1744 * Unless SO_OOBINLINE we have so_oobmsg != NULL iff SS_HAVEOOBDATA 1745 */ 1746 if (!(so->so_options & SO_OOBINLINE) && 1747 !EQUIVALENT(so->so_oobmsg != NULL, so->so_state & SS_HAVEOOBDATA)) { 1748 printf("Bad oob state 4 (%p): state %s\n", 1749 (void *)so, pr_state(so->so_state, so->so_mode)); 1750 return (0); 1751 } 1752 1753 if (!SOCK_IS_NONSTR(so) && 1754 SOTOTPI(so)->sti_oobsigcnt < SOTOTPI(so)->sti_oobcnt) { 1755 printf("Bad oob state 5 (%p): counts %d/%d state %s\n", 1756 (void *)so, SOTOTPI(so)->sti_oobsigcnt, 1757 SOTOTPI(so)->sti_oobcnt, 1758 pr_state(so->so_state, so->so_mode)); 1759 return (0); 1760 } 1761 1762 return (1); 1763 } 1764 #undef EQUIVALENT 1765 #endif /* DEBUG */ 1766 1767 /* initialize sockfs zone specific kstat related items */ 1768 void * 1769 sock_kstat_init(zoneid_t zoneid) 1770 { 1771 kstat_t *ksp; 1772 1773 ksp = kstat_create_zone("sockfs", 0, "sock_unix_list", "misc", 1774 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE|KSTAT_FLAG_VIRTUAL, zoneid); 1775 1776 if (ksp != NULL) { 1777 ksp->ks_update = sockfs_update; 1778 ksp->ks_snapshot = sockfs_snapshot; 1779 ksp->ks_lock = &socklist.sl_lock; 1780 ksp->ks_private = (void *)(uintptr_t)zoneid; 1781 kstat_install(ksp); 1782 } 1783 1784 return (ksp); 1785 } 1786 1787 /* tear down sockfs zone specific kstat related items */ 1788 /*ARGSUSED*/ 1789 void 1790 sock_kstat_fini(zoneid_t zoneid, void *arg) 1791 { 1792 kstat_t *ksp = (kstat_t *)arg; 1793 1794 if (ksp != NULL) { 1795 ASSERT(zoneid == (zoneid_t)(uintptr_t)ksp->ks_private); 1796 kstat_delete(ksp); 1797 } 1798 } 1799 1800 /* 1801 * Zones: 1802 * Note that nactive is going to be different for each zone. 
1803 * This means we require kstat to call sockfs_update and then sockfs_snapshot 1804 * for the same zone, or sockfs_snapshot will be taken into the wrong size 1805 * buffer. This is safe, but if the buffer is too small, user will not be 1806 * given details of all sockets. However, as this kstat has a ks_lock, kstat 1807 * driver will keep it locked between the update and the snapshot, so no 1808 * other process (zone) can currently get inbetween resulting in a wrong size 1809 * buffer allocation. 1810 */ 1811 static int 1812 sockfs_update(kstat_t *ksp, int rw) 1813 { 1814 uint_t nactive = 0; /* # of active AF_UNIX sockets */ 1815 struct sonode *so; /* current sonode on socklist */ 1816 zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private; 1817 1818 ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid()); 1819 1820 if (rw == KSTAT_WRITE) { /* bounce all writes */ 1821 return (EACCES); 1822 } 1823 1824 for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) { 1825 if (so->so_count != 0 && so->so_zoneid == myzoneid) { 1826 nactive++; 1827 } 1828 } 1829 ksp->ks_ndata = nactive; 1830 ksp->ks_data_size = nactive * sizeof (struct sockinfo); 1831 1832 return (0); 1833 } 1834 1835 static int 1836 sockfs_snapshot(kstat_t *ksp, void *buf, int rw) 1837 { 1838 int ns; /* # of sonodes we've copied */ 1839 struct sonode *so; /* current sonode on socklist */ 1840 struct sockinfo *psi; /* where we put sockinfo data */ 1841 t_uscalar_t sn_len; /* soa_len */ 1842 zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private; 1843 sotpi_info_t *sti; 1844 1845 ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid()); 1846 1847 ksp->ks_snaptime = gethrtime(); 1848 1849 if (rw == KSTAT_WRITE) { /* bounce all writes */ 1850 return (EACCES); 1851 } 1852 1853 /* 1854 * For each sonode on the socklist, we massage the important 1855 * info into buf, in sockinfo format. 
1856 */ 1857 psi = (struct sockinfo *)buf; 1858 ns = 0; 1859 for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) { 1860 vattr_t attr; 1861 1862 /* only stuff active sonodes and the same zone: */ 1863 if (so->so_count == 0 || so->so_zoneid != myzoneid) { 1864 continue; 1865 } 1866 1867 /* 1868 * If the sonode was activated between the update and the 1869 * snapshot, we're done - as this is only a snapshot. 1870 */ 1871 if ((caddr_t)(psi) >= (caddr_t)buf + ksp->ks_data_size) { 1872 break; 1873 } 1874 1875 sti = SOTOTPI(so); 1876 /* copy important info into buf: */ 1877 psi->si_size = sizeof (struct sockinfo); 1878 psi->si_family = so->so_family; 1879 psi->si_type = so->so_type; 1880 psi->si_flag = so->so_flag; 1881 psi->si_state = so->so_state; 1882 psi->si_serv_type = sti->sti_serv_type; 1883 psi->si_ux_laddr_sou_magic = sti->sti_ux_laddr.soua_magic; 1884 psi->si_ux_faddr_sou_magic = sti->sti_ux_faddr.soua_magic; 1885 psi->si_laddr_soa_len = sti->sti_laddr.soa_len; 1886 psi->si_faddr_soa_len = sti->sti_faddr.soa_len; 1887 psi->si_szoneid = so->so_zoneid; 1888 psi->si_faddr_noxlate = sti->sti_faddr_noxlate; 1889 1890 /* 1891 * Grab the inode, if possible. 1892 * This must be done before entering so_lock as VOP_GETATTR 1893 * will acquire it. 
1894 */ 1895 if (so->so_vnode == NULL || 1896 VOP_GETATTR(so->so_vnode, &attr, 0, CRED(), NULL) != 0) 1897 attr.va_nodeid = 0; 1898 1899 psi->si_inode = attr.va_nodeid; 1900 1901 mutex_enter(&so->so_lock); 1902 1903 if (sti->sti_laddr_sa != NULL) { 1904 ASSERT(sti->sti_laddr_sa->sa_data != NULL); 1905 sn_len = sti->sti_laddr_len; 1906 ASSERT(sn_len <= sizeof (short) + 1907 sizeof (psi->si_laddr_sun_path)); 1908 1909 psi->si_laddr_family = 1910 sti->sti_laddr_sa->sa_family; 1911 if (sn_len != 0) { 1912 /* AF_UNIX socket names are NULL terminated */ 1913 (void) strncpy(psi->si_laddr_sun_path, 1914 sti->sti_laddr_sa->sa_data, 1915 sizeof (psi->si_laddr_sun_path)); 1916 sn_len = strlen(psi->si_laddr_sun_path); 1917 } 1918 psi->si_laddr_sun_path[sn_len] = 0; 1919 } 1920 1921 if (sti->sti_faddr_sa != NULL) { 1922 ASSERT(sti->sti_faddr_sa->sa_data != NULL); 1923 sn_len = sti->sti_faddr_len; 1924 ASSERT(sn_len <= sizeof (short) + 1925 sizeof (psi->si_faddr_sun_path)); 1926 1927 psi->si_faddr_family = 1928 sti->sti_faddr_sa->sa_family; 1929 if (sn_len != 0) { 1930 (void) strncpy(psi->si_faddr_sun_path, 1931 sti->sti_faddr_sa->sa_data, 1932 sizeof (psi->si_faddr_sun_path)); 1933 sn_len = strlen(psi->si_faddr_sun_path); 1934 } 1935 psi->si_faddr_sun_path[sn_len] = 0; 1936 } 1937 1938 mutex_exit(&so->so_lock); 1939 1940 (void) snprintf(psi->si_son_straddr, 1941 sizeof (psi->si_son_straddr), "%p", (void *)so); 1942 (void) snprintf(psi->si_lvn_straddr, 1943 sizeof (psi->si_lvn_straddr), "%p", 1944 (void *)sti->sti_ux_laddr.soua_vp); 1945 (void) snprintf(psi->si_fvn_straddr, 1946 sizeof (psi->si_fvn_straddr), "%p", 1947 (void *)sti->sti_ux_faddr.soua_vp); 1948 1949 ns++; 1950 psi++; 1951 } 1952 1953 ksp->ks_ndata = ns; 1954 return (0); 1955 } 1956 1957 ssize_t 1958 soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size) 1959 { 1960 struct uio auio; 1961 struct iovec aiov[1]; 1962 register vnode_t *vp; 1963 int ioflag, rwflag; 1964 ssize_t cnt; 1965 int 
error = 0; 1966 int iovcnt = 0; 1967 short fflag; 1968 1969 vp = fp->f_vnode; 1970 fflag = fp->f_flag; 1971 1972 rwflag = 0; 1973 aiov[0].iov_base = (caddr_t)buf; 1974 aiov[0].iov_len = size; 1975 iovcnt = 1; 1976 cnt = (ssize_t)size; 1977 (void) VOP_RWLOCK(vp, rwflag, NULL); 1978 1979 auio.uio_loffset = fileoff; 1980 auio.uio_iov = aiov; 1981 auio.uio_iovcnt = iovcnt; 1982 auio.uio_resid = cnt; 1983 auio.uio_segflg = UIO_SYSSPACE; 1984 auio.uio_llimit = MAXOFFSET_T; 1985 auio.uio_fmode = fflag; 1986 auio.uio_extflg = UIO_COPY_CACHED; 1987 1988 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1989 1990 /* If read sync is not asked for, filter sync flags */ 1991 if ((ioflag & FRSYNC) == 0) 1992 ioflag &= ~(FSYNC|FDSYNC); 1993 error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL); 1994 cnt -= auio.uio_resid; 1995 1996 VOP_RWUNLOCK(vp, rwflag, NULL); 1997 1998 if (error == EINTR && cnt != 0) 1999 error = 0; 2000 2001 if (error != 0) { 2002 *err = error; 2003 return (0); 2004 } else { 2005 *err = 0; 2006 return (cnt); 2007 } 2008 } 2009 2010 int 2011 so_copyin(const void *from, void *to, size_t size, int fromkernel) 2012 { 2013 if (fromkernel) { 2014 bcopy(from, to, size); 2015 return (0); 2016 } 2017 return (xcopyin(from, to, size)); 2018 } 2019 2020 int 2021 so_copyout(const void *from, void *to, size_t size, int tokernel) 2022 { 2023 if (tokernel) { 2024 bcopy(from, to, size); 2025 return (0); 2026 } 2027 return (xcopyout(from, to, size)); 2028 } 2029