1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/t_lock.h> 29 #include <sys/param.h> 30 #include <sys/systm.h> 31 #include <sys/buf.h> 32 #include <sys/conf.h> 33 #include <sys/cred.h> 34 #include <sys/kmem.h> 35 #include <sys/sysmacros.h> 36 #include <sys/vfs.h> 37 #include <sys/vfs_opreg.h> 38 #include <sys/vnode.h> 39 #include <sys/debug.h> 40 #include <sys/errno.h> 41 #include <sys/time.h> 42 #include <sys/file.h> 43 #include <sys/open.h> 44 #include <sys/user.h> 45 #include <sys/uio.h> 46 #include <sys/termios.h> 47 #include <sys/stream.h> 48 #include <sys/strsubr.h> 49 #include <sys/strsun.h> 50 #include <sys/esunddi.h> 51 #include <sys/flock.h> 52 #include <sys/modctl.h> 53 #include <sys/cmn_err.h> 54 #include <sys/mkdev.h> 55 #include <sys/pathname.h> 56 #include <sys/ddi.h> 57 #include <sys/stat.h> 58 #include <sys/fs/snode.h> 59 #include <sys/fs/dv_node.h> 60 #include <sys/zone.h> 61 62 #include <sys/socket.h> 63 #include <sys/socketvar.h> 64 #include <netinet/in.h> 65 #include <sys/un.h> 66 67 #include <sys/ucred.h> 68 69 #include <sys/tiuser.h> 70 #define _SUN_TPI_VERSION 2 71 #include <sys/tihdr.h> 72 73 #include <c2/audit.h> 74 75 #include <fs/sockfs/nl7c.h> 76 #include <fs/sockfs/sockcommon.h> 77 #include <fs/sockfs/socktpi.h> 78 #include <fs/sockfs/socktpi_impl.h> 79 80 /* 81 * Macros that operate on struct cmsghdr. 82 * The CMSG_VALID macro does not assume that the last option buffer is padded. 83 */ 84 #define CMSG_CONTENT(cmsg) (&((cmsg)[1])) 85 #define CMSG_CONTENTLEN(cmsg) ((cmsg)->cmsg_len - sizeof (struct cmsghdr)) 86 #define CMSG_VALID(cmsg, start, end) \ 87 (ISALIGNED_cmsghdr(cmsg) && \ 88 ((uintptr_t)(cmsg) >= (uintptr_t)(start)) && \ 89 ((uintptr_t)(cmsg) < (uintptr_t)(end)) && \ 90 ((ssize_t)(cmsg)->cmsg_len >= sizeof (struct cmsghdr)) && \ 91 ((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end))) 92 #define SO_LOCK_WAKEUP_TIME 3000 /* Wakeup time in milliseconds */ 93 94 dev_t sockdev; /* For fsid in getattr */ 95 int sockfs_defer_nl7c_init = 0; 96 97 struct socklist socklist; 98 99 struct kmem_cache *socket_cache; 100 101 static int sockfs_update(kstat_t *, int); 102 static int sockfs_snapshot(kstat_t *, void *, int); 103 extern smod_info_t *sotpi_smod_create(void); 104 105 extern void sendfile_init(); 106 107 extern void nl7c_init(void); 108 109 extern int sostr_init(); 110 111 extern int modrootloaded; 112 113 #define ADRSTRLEN (2 * sizeof (void *) + 1) 114 /* 115 * kernel structure for passing the sockinfo data back up to the user. 116 * the strings array allows us to convert AF_UNIX addresses into strings 117 * with a common method regardless of which n-bit kernel we're running. 118 */ 119 struct k_sockinfo { 120 struct sockinfo ks_si; 121 char ks_straddr[3][ADRSTRLEN]; 122 }; 123 124 /* 125 * Translate from a device pathname (e.g. "/dev/tcp") to a vnode. 126 * Returns with the vnode held. 127 */ 128 int 129 sogetvp(char *devpath, vnode_t **vpp, int uioflag) 130 { 131 struct snode *csp; 132 vnode_t *vp, *dvp; 133 major_t maj; 134 int error; 135 136 ASSERT(uioflag == UIO_SYSSPACE || uioflag == UIO_USERSPACE); 137 138 /* 139 * Lookup the underlying filesystem vnode. 140 */ 141 error = lookupname(devpath, uioflag, FOLLOW, NULLVPP, &vp); 142 if (error) 143 return (error); 144 145 /* Check that it is the correct vnode */ 146 if (vp->v_type != VCHR) { 147 VN_RELE(vp); 148 return (ENOTSOCK); 149 } 150 151 /* 152 * If devpath went through devfs, the device should already 153 * be configured. If devpath is a mknod file, however, we 154 * need to make sure the device is properly configured. 155 * To do this, we do something similar to spec_open() 156 * except that we resolve to the minor/leaf level since 157 * we need to return a vnode. 158 */ 159 csp = VTOS(VTOS(vp)->s_commonvp); 160 if (!(csp->s_flag & SDIPSET)) { 161 char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 162 error = ddi_dev_pathname(vp->v_rdev, S_IFCHR, pathname); 163 if (error == 0) 164 error = devfs_lookupname(pathname, NULLVPP, &dvp); 165 VN_RELE(vp); 166 kmem_free(pathname, MAXPATHLEN); 167 if (error != 0) 168 return (ENXIO); 169 vp = dvp; /* use the devfs vp */ 170 } 171 172 /* device is configured at this point */ 173 maj = getmajor(vp->v_rdev); 174 if (!STREAMSTAB(maj)) { 175 VN_RELE(vp); 176 return (ENOSTR); 177 } 178 179 *vpp = vp; 180 return (0); 181 } 182 183 /* 184 * Update the accessed, updated, or changed times in an sonode 185 * with the current time. 186 * 187 * Note that both SunOS 4.X and 4.4BSD sockets do not present reasonable 188 * attributes in a fstat call. (They return the current time and 0 for 189 * all timestamps, respectively.) We maintain the current timestamps 190 * here primarily so that should sockmod be popped the resulting 191 * file descriptor will behave like a stream w.r.t. the timestamps. 192 */ 193 void 194 so_update_attrs(struct sonode *so, int flag) 195 { 196 time_t now = gethrestime_sec(); 197 198 if (SOCK_IS_NONSTR(so)) 199 return; 200 201 mutex_enter(&so->so_lock); 202 so->so_flag |= flag; 203 if (flag & SOACC) 204 SOTOTPI(so)->sti_atime = now; 205 if (flag & SOMOD) 206 SOTOTPI(so)->sti_mtime = now; 207 mutex_exit(&so->so_lock); 208 } 209 210 extern so_create_func_t sock_comm_create_function; 211 extern so_destroy_func_t sock_comm_destroy_function; 212 /* 213 * Init function called when sockfs is loaded. 214 */ 215 int 216 sockinit(int fstype, char *name) 217 { 218 static const fs_operation_def_t sock_vfsops_template[] = { 219 NULL, NULL 220 }; 221 int error; 222 major_t dev; 223 char *err_str; 224 225 error = vfs_setfsops(fstype, sock_vfsops_template, NULL); 226 if (error != 0) { 227 zcmn_err(GLOBAL_ZONEID, CE_WARN, 228 "sockinit: bad vfs ops template"); 229 return (error); 230 } 231 232 error = vn_make_ops(name, socket_vnodeops_template, 233 &socket_vnodeops); 234 if (error != 0) { 235 err_str = "sockinit: bad socket vnode ops template"; 236 /* vn_make_ops() does not reset socktpi_vnodeops on failure. */ 237 socket_vnodeops = NULL; 238 goto failure; 239 } 240 241 socket_cache = kmem_cache_create("socket_cache", 242 sizeof (struct sonode), 0, sonode_constructor, 243 sonode_destructor, NULL, NULL, NULL, 0); 244 245 error = socktpi_init(); 246 if (error != 0) { 247 err_str = NULL; 248 goto failure; 249 } 250 251 error = sostr_init(); 252 if (error != 0) { 253 err_str = NULL; 254 goto failure; 255 } 256 257 /* 258 * Set up the default create and destroy functions 259 */ 260 sock_comm_create_function = socket_sonode_create; 261 sock_comm_destroy_function = socket_sonode_destroy; 262 263 /* 264 * Build initial list mapping socket parameters to vnode. 265 */ 266 smod_init(); 267 smod_add(sotpi_smod_create()); 268 269 sockparams_init(); 270 271 /* 272 * If sockets are needed before init runs /sbin/soconfig 273 * it is possible to preload the sockparams list here using 274 * calls like: 275 * sockconfig(1,2,3, "/dev/tcp", 0); 276 */ 277 278 /* 279 * Create a unique dev_t for use in so_fsid. 280 */ 281 282 if ((dev = getudev()) == (major_t)-1) 283 dev = 0; 284 sockdev = makedevice(dev, 0); 285 286 mutex_init(&socklist.sl_lock, NULL, MUTEX_DEFAULT, NULL); 287 sendfile_init(); 288 if (!modrootloaded) { 289 sockfs_defer_nl7c_init = 1; 290 } else { 291 nl7c_init(); 292 } 293 294 return (0); 295 296 failure: 297 (void) vfs_freevfsops_by_type(fstype); 298 if (socket_vnodeops != NULL) 299 vn_freevnodeops(socket_vnodeops); 300 if (err_str != NULL) 301 zcmn_err(GLOBAL_ZONEID, CE_WARN, err_str); 302 return (error); 303 } 304 305 /* 306 * Caller must hold the mutex. Used to set SOLOCKED. 307 */ 308 void 309 so_lock_single(struct sonode *so) 310 { 311 ASSERT(MUTEX_HELD(&so->so_lock)); 312 313 while (so->so_flag & (SOLOCKED | SOASYNC_UNBIND)) { 314 so->so_flag |= SOWANT; 315 cv_wait_stop(&so->so_want_cv, &so->so_lock, 316 SO_LOCK_WAKEUP_TIME); 317 } 318 so->so_flag |= SOLOCKED; 319 } 320 321 /* 322 * Caller must hold the mutex and pass in SOLOCKED or SOASYNC_UNBIND. 323 * Used to clear SOLOCKED or SOASYNC_UNBIND. 324 */ 325 void 326 so_unlock_single(struct sonode *so, int flag) 327 { 328 ASSERT(MUTEX_HELD(&so->so_lock)); 329 ASSERT(flag & (SOLOCKED|SOASYNC_UNBIND)); 330 ASSERT((flag & ~(SOLOCKED|SOASYNC_UNBIND)) == 0); 331 ASSERT(so->so_flag & flag); 332 /* 333 * Process the T_DISCON_IND on sti_discon_ind_mp. 334 * 335 * Call to so_drain_discon_ind will result in so_lock 336 * being dropped and re-acquired later. 337 */ 338 if (!SOCK_IS_NONSTR(so)) { 339 sotpi_info_t *sti = SOTOTPI(so); 340 341 if (sti->sti_discon_ind_mp != NULL) 342 so_drain_discon_ind(so); 343 } 344 345 if (so->so_flag & SOWANT) 346 cv_broadcast(&so->so_want_cv); 347 so->so_flag &= ~(SOWANT|flag); 348 } 349 350 /* 351 * Caller must hold the mutex. Used to set SOREADLOCKED. 352 * If the caller wants nonblocking behavior it should set fmode. 353 */ 354 int 355 so_lock_read(struct sonode *so, int fmode) 356 { 357 ASSERT(MUTEX_HELD(&so->so_lock)); 358 359 while (so->so_flag & SOREADLOCKED) { 360 if (fmode & (FNDELAY|FNONBLOCK)) 361 return (EWOULDBLOCK); 362 so->so_flag |= SOWANT; 363 cv_wait_stop(&so->so_want_cv, &so->so_lock, 364 SO_LOCK_WAKEUP_TIME); 365 } 366 so->so_flag |= SOREADLOCKED; 367 return (0); 368 } 369 370 /* 371 * Like so_lock_read above but allows signals. 372 */ 373 int 374 so_lock_read_intr(struct sonode *so, int fmode) 375 { 376 ASSERT(MUTEX_HELD(&so->so_lock)); 377 378 while (so->so_flag & SOREADLOCKED) { 379 if (fmode & (FNDELAY|FNONBLOCK)) 380 return (EWOULDBLOCK); 381 so->so_flag |= SOWANT; 382 if (!cv_wait_sig(&so->so_want_cv, &so->so_lock)) 383 return (EINTR); 384 } 385 so->so_flag |= SOREADLOCKED; 386 return (0); 387 } 388 389 /* 390 * Caller must hold the mutex. Used to clear SOREADLOCKED, 391 * set in so_lock_read() or so_lock_read_intr(). 392 */ 393 void 394 so_unlock_read(struct sonode *so) 395 { 396 ASSERT(MUTEX_HELD(&so->so_lock)); 397 ASSERT(so->so_flag & SOREADLOCKED); 398 399 if (so->so_flag & SOWANT) 400 cv_broadcast(&so->so_want_cv); 401 so->so_flag &= ~(SOWANT|SOREADLOCKED); 402 } 403 404 /* 405 * Verify that the specified offset falls within the mblk and 406 * that the resulting pointer is aligned. 407 * Returns NULL if not. 408 */ 409 void * 410 sogetoff(mblk_t *mp, t_uscalar_t offset, 411 t_uscalar_t length, uint_t align_size) 412 { 413 uintptr_t ptr1, ptr2; 414 415 ASSERT(mp && mp->b_wptr >= mp->b_rptr); 416 ptr1 = (uintptr_t)mp->b_rptr + offset; 417 ptr2 = (uintptr_t)ptr1 + length; 418 if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) { 419 eprintline(0); 420 return (NULL); 421 } 422 if ((ptr1 & (align_size - 1)) != 0) { 423 eprintline(0); 424 return (NULL); 425 } 426 return ((void *)ptr1); 427 } 428 429 /* 430 * Return the AF_UNIX underlying filesystem vnode matching a given name. 431 * Makes sure the sending and the destination sonodes are compatible. 432 * The vnode is returned held. 433 * 434 * The underlying filesystem VSOCK vnode has a v_stream pointer that 435 * references the actual stream head (hence indirectly the actual sonode). 436 */ 437 static int 438 so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess, 439 vnode_t **vpp) 440 { 441 vnode_t *vp; /* Underlying filesystem vnode */ 442 vnode_t *rvp; /* real vnode */ 443 vnode_t *svp; /* sockfs vnode */ 444 struct sonode *so2; 445 int error; 446 447 dprintso(so, 1, ("so_ux_lookup(%p) name <%s>\n", (void *)so, 448 soun->sun_path)); 449 450 error = lookupname(soun->sun_path, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); 451 if (error) { 452 eprintsoline(so, error); 453 return (error); 454 } 455 456 /* 457 * Traverse lofs mounts get the real vnode 458 */ 459 if (VOP_REALVP(vp, &rvp, NULL) == 0) { 460 VN_HOLD(rvp); /* hold the real vnode */ 461 VN_RELE(vp); /* release hold from lookup */ 462 vp = rvp; 463 } 464 465 if (vp->v_type != VSOCK) { 466 error = ENOTSOCK; 467 eprintsoline(so, error); 468 goto done2; 469 } 470 471 if (checkaccess) { 472 /* 473 * Check that we have permissions to access the destination 474 * vnode. This check is not done in BSD but it is required 475 * by X/Open. 476 */ 477 if (error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL)) { 478 eprintsoline(so, error); 479 goto done2; 480 } 481 } 482 483 /* 484 * Check if the remote socket has been closed. 485 * 486 * Synchronize with vn_rele_stream by holding v_lock while traversing 487 * v_stream->sd_vnode. 488 */ 489 mutex_enter(&vp->v_lock); 490 if (vp->v_stream == NULL) { 491 mutex_exit(&vp->v_lock); 492 if (so->so_type == SOCK_DGRAM) 493 error = EDESTADDRREQ; 494 else 495 error = ECONNREFUSED; 496 497 eprintsoline(so, error); 498 goto done2; 499 } 500 ASSERT(vp->v_stream->sd_vnode); 501 svp = vp->v_stream->sd_vnode; 502 /* 503 * holding v_lock on underlying filesystem vnode and acquiring 504 * it on sockfs vnode. Assumes that no code ever attempts to 505 * acquire these locks in the reverse order. 506 */ 507 VN_HOLD(svp); 508 mutex_exit(&vp->v_lock); 509 510 if (svp->v_type != VSOCK) { 511 error = ENOTSOCK; 512 eprintsoline(so, error); 513 goto done; 514 } 515 516 so2 = VTOSO(svp); 517 518 if (so->so_type != so2->so_type) { 519 error = EPROTOTYPE; 520 eprintsoline(so, error); 521 goto done; 522 } 523 524 VN_RELE(svp); 525 *vpp = vp; 526 return (0); 527 528 done: 529 VN_RELE(svp); 530 done2: 531 VN_RELE(vp); 532 return (error); 533 } 534 535 /* 536 * Verify peer address for connect and sendto/sendmsg. 537 * Since sendto/sendmsg would not get synchronous errors from the transport 538 * provider we have to do these ugly checks in the socket layer to 539 * preserve compatibility with SunOS 4.X. 540 */ 541 int 542 so_addr_verify(struct sonode *so, const struct sockaddr *name, 543 socklen_t namelen) 544 { 545 int family; 546 547 dprintso(so, 1, ("so_addr_verify(%p, %p, %d)\n", 548 (void *)so, (void *)name, namelen)); 549 550 ASSERT(name != NULL); 551 552 family = so->so_family; 553 switch (family) { 554 case AF_INET: 555 if (name->sa_family != family) { 556 eprintsoline(so, EAFNOSUPPORT); 557 return (EAFNOSUPPORT); 558 } 559 if (namelen != (socklen_t)sizeof (struct sockaddr_in)) { 560 eprintsoline(so, EINVAL); 561 return (EINVAL); 562 } 563 break; 564 case AF_INET6: { 565 #ifdef DEBUG 566 struct sockaddr_in6 *sin6; 567 #endif /* DEBUG */ 568 569 if (name->sa_family != family) { 570 eprintsoline(so, EAFNOSUPPORT); 571 return (EAFNOSUPPORT); 572 } 573 if (namelen != (socklen_t)sizeof (struct sockaddr_in6)) { 574 eprintsoline(so, EINVAL); 575 return (EINVAL); 576 } 577 #ifdef DEBUG 578 /* Verify that apps don't forget to clear sin6_scope_id etc */ 579 sin6 = (struct sockaddr_in6 *)name; 580 if (sin6->sin6_scope_id != 0 && 581 !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) { 582 zcmn_err(getzoneid(), CE_WARN, 583 "connect/send* with uninitialized sin6_scope_id " 584 "(%d) on socket. Pid = %d\n", 585 (int)sin6->sin6_scope_id, (int)curproc->p_pid); 586 } 587 #endif /* DEBUG */ 588 break; 589 } 590 case AF_UNIX: 591 if (SOTOTPI(so)->sti_faddr_noxlate) { 592 return (0); 593 } 594 if (namelen < (socklen_t)sizeof (short)) { 595 eprintsoline(so, ENOENT); 596 return (ENOENT); 597 } 598 if (name->sa_family != family) { 599 eprintsoline(so, EAFNOSUPPORT); 600 return (EAFNOSUPPORT); 601 } 602 /* MAXPATHLEN + soun_family + nul termination */ 603 if (namelen > (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) { 604 eprintsoline(so, ENAMETOOLONG); 605 return (ENAMETOOLONG); 606 } 607 608 break; 609 610 default: 611 /* 612 * Default is don't do any length or sa_family check 613 * to allow non-sockaddr style addresses. 614 */ 615 break; 616 } 617 618 return (0); 619 } 620 621 622 /* 623 * Translate an AF_UNIX sockaddr_un to the transport internal name. 624 * Assumes caller has called so_addr_verify first. 625 */ 626 /*ARGSUSED*/ 627 int 628 so_ux_addr_xlate(struct sonode *so, struct sockaddr *name, 629 socklen_t namelen, int checkaccess, 630 void **addrp, socklen_t *addrlenp) 631 { 632 int error; 633 struct sockaddr_un *soun; 634 vnode_t *vp; 635 void *addr; 636 socklen_t addrlen; 637 sotpi_info_t *sti = SOTOTPI(so); 638 639 dprintso(so, 1, ("so_ux_addr_xlate(%p, %p, %d, %d)\n", 640 (void *)so, (void *)name, namelen, checkaccess)); 641 642 ASSERT(name != NULL); 643 ASSERT(so->so_family == AF_UNIX); 644 ASSERT(!sti->sti_faddr_noxlate); 645 ASSERT(namelen >= (socklen_t)sizeof (short)); 646 ASSERT(name->sa_family == AF_UNIX); 647 soun = (struct sockaddr_un *)name; 648 /* 649 * Lookup vnode for the specified path name and verify that 650 * it is a socket. 651 */ 652 error = so_ux_lookup(so, soun, checkaccess, &vp); 653 if (error) { 654 eprintsoline(so, error); 655 return (error); 656 } 657 /* 658 * Use the address of the peer vnode as the address to send 659 * to. We release the peer vnode here. In case it has been 660 * closed by the time the T_CONN_REQ or T_UNIDATA_REQ reaches the 661 * transport the message will get an error or be dropped. 662 */ 663 sti->sti_ux_faddr.soua_vp = vp; 664 sti->sti_ux_faddr.soua_magic = SOU_MAGIC_EXPLICIT; 665 addr = &sti->sti_ux_faddr; 666 addrlen = (socklen_t)sizeof (sti->sti_ux_faddr); 667 dprintso(so, 1, ("ux_xlate UNIX: addrlen %d, vp %p\n", 668 addrlen, (void *)vp)); 669 VN_RELE(vp); 670 *addrp = addr; 671 *addrlenp = (socklen_t)addrlen; 672 return (0); 673 } 674 675 /* 676 * Esballoc free function for messages that contain SO_FILEP option. 677 * Decrement the reference count on the file pointers using closef. 678 */ 679 void 680 fdbuf_free(struct fdbuf *fdbuf) 681 { 682 int i; 683 struct file *fp; 684 685 dprint(1, ("fdbuf_free: %d fds\n", fdbuf->fd_numfd)); 686 for (i = 0; i < fdbuf->fd_numfd; i++) { 687 /* 688 * We need pointer size alignment for fd_fds. On a LP64 689 * kernel, the required alignment is 8 bytes while 690 * the option headers and values are only 4 bytes 691 * aligned. So its safer to do a bcopy compared to 692 * assigning fdbuf->fd_fds[i] to fp. 693 */ 694 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp)); 695 dprint(1, ("fdbuf_free: [%d] = %p\n", i, (void *)fp)); 696 (void) closef(fp); 697 } 698 if (fdbuf->fd_ebuf != NULL) 699 kmem_free(fdbuf->fd_ebuf, fdbuf->fd_ebuflen); 700 kmem_free(fdbuf, fdbuf->fd_size); 701 } 702 703 /* 704 * Allocate an esballoc'ed message for AF_UNIX file descriptor passing. 705 * Waits if memory is not available. 706 */ 707 mblk_t * 708 fdbuf_allocmsg(int size, struct fdbuf *fdbuf) 709 { 710 uchar_t *buf; 711 mblk_t *mp; 712 713 dprint(1, ("fdbuf_allocmsg: size %d, %d fds\n", size, fdbuf->fd_numfd)); 714 buf = kmem_alloc(size, KM_SLEEP); 715 fdbuf->fd_ebuf = (caddr_t)buf; 716 fdbuf->fd_ebuflen = size; 717 fdbuf->fd_frtn.free_func = fdbuf_free; 718 fdbuf->fd_frtn.free_arg = (caddr_t)fdbuf; 719 720 mp = esballoc_wait(buf, size, BPRI_MED, &fdbuf->fd_frtn); 721 mp->b_datap->db_type = M_PROTO; 722 return (mp); 723 } 724 725 /* 726 * Extract file descriptors from a fdbuf. 727 * Return list in rights/rightslen. 728 */ 729 /*ARGSUSED*/ 730 static int 731 fdbuf_extract(struct fdbuf *fdbuf, void *rights, int rightslen) 732 { 733 int i, fd; 734 int *rp; 735 struct file *fp; 736 int numfd; 737 738 dprint(1, ("fdbuf_extract: %d fds, len %d\n", 739 fdbuf->fd_numfd, rightslen)); 740 741 numfd = fdbuf->fd_numfd; 742 ASSERT(rightslen == numfd * (int)sizeof (int)); 743 744 /* 745 * Allocate a file descriptor and increment the f_count. 746 * The latter is needed since we always call fdbuf_free 747 * which performs a closef. 748 */ 749 rp = (int *)rights; 750 for (i = 0; i < numfd; i++) { 751 if ((fd = ufalloc(0)) == -1) 752 goto cleanup; 753 /* 754 * We need pointer size alignment for fd_fds. On a LP64 755 * kernel, the required alignment is 8 bytes while 756 * the option headers and values are only 4 bytes 757 * aligned. So its safer to do a bcopy compared to 758 * assigning fdbuf->fd_fds[i] to fp. 759 */ 760 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp)); 761 mutex_enter(&fp->f_tlock); 762 fp->f_count++; 763 mutex_exit(&fp->f_tlock); 764 setf(fd, fp); 765 *rp++ = fd; 766 if (audit_active) 767 audit_fdrecv(fd, fp); 768 dprint(1, ("fdbuf_extract: [%d] = %d, %p refcnt %d\n", 769 i, fd, (void *)fp, fp->f_count)); 770 } 771 return (0); 772 773 cleanup: 774 /* 775 * Undo whatever partial work the loop above has done. 776 */ 777 { 778 int j; 779 780 rp = (int *)rights; 781 for (j = 0; j < i; j++) { 782 dprint(0, 783 ("fdbuf_extract: cleanup[%d] = %d\n", j, *rp)); 784 (void) closeandsetf(*rp++, NULL); 785 } 786 } 787 788 return (EMFILE); 789 } 790 791 /* 792 * Insert file descriptors into an fdbuf. 793 * Returns a kmem_alloc'ed fdbuf. The fdbuf should be freed 794 * by calling fdbuf_free(). 795 */ 796 int 797 fdbuf_create(void *rights, int rightslen, struct fdbuf **fdbufp) 798 { 799 int numfd, i; 800 int *fds; 801 struct file *fp; 802 struct fdbuf *fdbuf; 803 int fdbufsize; 804 805 dprint(1, ("fdbuf_create: len %d\n", rightslen)); 806 807 numfd = rightslen / (int)sizeof (int); 808 809 fdbufsize = (int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *)); 810 fdbuf = kmem_alloc(fdbufsize, KM_SLEEP); 811 fdbuf->fd_size = fdbufsize; 812 fdbuf->fd_numfd = 0; 813 fdbuf->fd_ebuf = NULL; 814 fdbuf->fd_ebuflen = 0; 815 fds = (int *)rights; 816 for (i = 0; i < numfd; i++) { 817 if ((fp = getf(fds[i])) == NULL) { 818 fdbuf_free(fdbuf); 819 return (EBADF); 820 } 821 dprint(1, ("fdbuf_create: [%d] = %d, %p refcnt %d\n", 822 i, fds[i], (void *)fp, fp->f_count)); 823 mutex_enter(&fp->f_tlock); 824 fp->f_count++; 825 mutex_exit(&fp->f_tlock); 826 /* 827 * The maximum alignment for fdbuf (or any option header 828 * and its value) it 4 bytes. On a LP64 kernel, the alignment 829 * is not sufficient for pointers (fd_fds in this case). Since 830 * we just did a kmem_alloc (we get a double word alignment), 831 * we don't need to do anything on the send side (we loose 832 * the double word alignment because fdbuf goes after an 833 * option header (eg T_unitdata_req) which is only 4 byte 834 * aligned). We take care of this when we extract the file 835 * descriptor in fdbuf_extract or fdbuf_free. 836 */ 837 fdbuf->fd_fds[i] = fp; 838 fdbuf->fd_numfd++; 839 releasef(fds[i]); 840 if (audit_active) 841 audit_fdsend(fds[i], fp, 0); 842 } 843 *fdbufp = fdbuf; 844 return (0); 845 } 846 847 static int 848 fdbuf_optlen(int rightslen) 849 { 850 int numfd; 851 852 numfd = rightslen / (int)sizeof (int); 853 854 return ((int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *))); 855 } 856 857 static t_uscalar_t 858 fdbuf_cmsglen(int fdbuflen) 859 { 860 return (t_uscalar_t)((fdbuflen - FDBUF_HDRSIZE) / 861 (int)sizeof (struct file *) * (int)sizeof (int)); 862 } 863 864 865 /* 866 * Return non-zero if the mblk and fdbuf are consistent. 867 */ 868 static int 869 fdbuf_verify(mblk_t *mp, struct fdbuf *fdbuf, int fdbuflen) 870 { 871 if (fdbuflen >= FDBUF_HDRSIZE && 872 fdbuflen == fdbuf->fd_size) { 873 frtn_t *frp = mp->b_datap->db_frtnp; 874 /* 875 * Check that the SO_FILEP portion of the 876 * message has not been modified by 877 * the loopback transport. The sending sockfs generates 878 * a message that is esballoc'ed with the free function 879 * being fdbuf_free() and where free_arg contains the 880 * identical information as the SO_FILEP content. 881 * 882 * If any of these constraints are not satisfied we 883 * silently ignore the option. 884 */ 885 ASSERT(mp); 886 if (frp != NULL && 887 frp->free_func == fdbuf_free && 888 frp->free_arg != NULL && 889 bcmp(frp->free_arg, fdbuf, fdbuflen) == 0) { 890 dprint(1, ("fdbuf_verify: fdbuf %p len %d\n", 891 (void *)fdbuf, fdbuflen)); 892 return (1); 893 } else { 894 zcmn_err(getzoneid(), CE_WARN, 895 "sockfs: mismatched fdbuf content (%p)", 896 (void *)mp); 897 return (0); 898 } 899 } else { 900 zcmn_err(getzoneid(), CE_WARN, 901 "sockfs: mismatched fdbuf len %d, %d\n", 902 fdbuflen, fdbuf->fd_size); 903 return (0); 904 } 905 } 906 907 /* 908 * When the file descriptors returned by sorecvmsg can not be passed 909 * to the application this routine will cleanup the references on 910 * the files. Start at startoff bytes into the buffer. 911 */ 912 static void 913 close_fds(void *fdbuf, int fdbuflen, int startoff) 914 { 915 int *fds = (int *)fdbuf; 916 int numfd = fdbuflen / (int)sizeof (int); 917 int i; 918 919 dprint(1, ("close_fds(%p, %d, %d)\n", fdbuf, fdbuflen, startoff)); 920 921 for (i = 0; i < numfd; i++) { 922 if (startoff < 0) 923 startoff = 0; 924 if (startoff < (int)sizeof (int)) { 925 /* 926 * This file descriptor is partially or fully after 927 * the offset 928 */ 929 dprint(0, 930 ("close_fds: cleanup[%d] = %d\n", i, fds[i])); 931 (void) closeandsetf(fds[i], NULL); 932 } 933 startoff -= (int)sizeof (int); 934 } 935 } 936 937 /* 938 * Close all file descriptors contained in the control part starting at 939 * the startoffset. 940 */ 941 void 942 so_closefds(void *control, t_uscalar_t controllen, int oldflg, 943 int startoff) 944 { 945 struct cmsghdr *cmsg; 946 947 if (control == NULL) 948 return; 949 950 if (oldflg) { 951 close_fds(control, controllen, startoff); 952 return; 953 } 954 /* Scan control part for file descriptors. */ 955 for (cmsg = (struct cmsghdr *)control; 956 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 957 cmsg = CMSG_NEXT(cmsg)) { 958 if (cmsg->cmsg_level == SOL_SOCKET && 959 cmsg->cmsg_type == SCM_RIGHTS) { 960 close_fds(CMSG_CONTENT(cmsg), 961 (int)CMSG_CONTENTLEN(cmsg), 962 startoff - (int)sizeof (struct cmsghdr)); 963 } 964 startoff -= cmsg->cmsg_len; 965 } 966 } 967 968 /* 969 * Returns a pointer/length for the file descriptors contained 970 * in the control buffer. Returns with *fdlenp == -1 if there are no 971 * file descriptor options present. This is different than there being 972 * a zero-length file descriptor option. 973 * Fail if there are multiple SCM_RIGHT cmsgs. 974 */ 975 int 976 so_getfdopt(void *control, t_uscalar_t controllen, int oldflg, 977 void **fdsp, int *fdlenp) 978 { 979 struct cmsghdr *cmsg; 980 void *fds; 981 int fdlen; 982 983 if (control == NULL) { 984 *fdsp = NULL; 985 *fdlenp = -1; 986 return (0); 987 } 988 989 if (oldflg) { 990 *fdsp = control; 991 if (controllen == 0) 992 *fdlenp = -1; 993 else 994 *fdlenp = controllen; 995 dprint(1, ("so_getfdopt: old %d\n", *fdlenp)); 996 return (0); 997 } 998 999 fds = NULL; 1000 fdlen = 0; 1001 1002 for (cmsg = (struct cmsghdr *)control; 1003 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 1004 cmsg = CMSG_NEXT(cmsg)) { 1005 if (cmsg->cmsg_level == SOL_SOCKET && 1006 cmsg->cmsg_type == SCM_RIGHTS) { 1007 if (fds != NULL) 1008 return (EINVAL); 1009 fds = CMSG_CONTENT(cmsg); 1010 fdlen = (int)CMSG_CONTENTLEN(cmsg); 1011 dprint(1, ("so_getfdopt: new %lu\n", 1012 (size_t)CMSG_CONTENTLEN(cmsg))); 1013 } 1014 } 1015 if (fds == NULL) { 1016 dprint(1, ("so_getfdopt: NONE\n")); 1017 *fdlenp = -1; 1018 } else 1019 *fdlenp = fdlen; 1020 *fdsp = fds; 1021 return (0); 1022 } 1023 1024 /* 1025 * Return the length of the options including any file descriptor options. 1026 */ 1027 t_uscalar_t 1028 so_optlen(void *control, t_uscalar_t controllen, int oldflg) 1029 { 1030 struct cmsghdr *cmsg; 1031 t_uscalar_t optlen = 0; 1032 t_uscalar_t len; 1033 1034 if (control == NULL) 1035 return (0); 1036 1037 if (oldflg) 1038 return ((t_uscalar_t)(sizeof (struct T_opthdr) + 1039 fdbuf_optlen(controllen))); 1040 1041 for (cmsg = (struct cmsghdr *)control; 1042 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 1043 cmsg = CMSG_NEXT(cmsg)) { 1044 if (cmsg->cmsg_level == SOL_SOCKET && 1045 cmsg->cmsg_type == SCM_RIGHTS) { 1046 len = fdbuf_optlen((int)CMSG_CONTENTLEN(cmsg)); 1047 } else { 1048 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg); 1049 } 1050 optlen += (t_uscalar_t)(_TPI_ALIGN_TOPT(len) + 1051 sizeof (struct T_opthdr)); 1052 } 1053 dprint(1, ("so_optlen: controllen %d, flg %d -> optlen %d\n", 1054 controllen, oldflg, optlen)); 1055 return (optlen); 1056 } 1057 1058 /* 1059 * Copy options from control to the mblk. Skip any file descriptor options. 1060 */ 1061 void 1062 so_cmsg2opt(void *control, t_uscalar_t controllen, int oldflg, mblk_t *mp) 1063 { 1064 struct T_opthdr toh; 1065 struct cmsghdr *cmsg; 1066 1067 if (control == NULL) 1068 return; 1069 1070 if (oldflg) { 1071 /* No real options - caller has handled file descriptors */ 1072 return; 1073 } 1074 for (cmsg = (struct cmsghdr *)control; 1075 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 1076 cmsg = CMSG_NEXT(cmsg)) { 1077 /* 1078 * Note: The caller handles file descriptors prior 1079 * to calling this function. 1080 */ 1081 t_uscalar_t len; 1082 1083 if (cmsg->cmsg_level == SOL_SOCKET && 1084 cmsg->cmsg_type == SCM_RIGHTS) 1085 continue; 1086 1087 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg); 1088 toh.level = cmsg->cmsg_level; 1089 toh.name = cmsg->cmsg_type; 1090 toh.len = len + (t_uscalar_t)sizeof (struct T_opthdr); 1091 toh.status = 0; 1092 1093 soappendmsg(mp, &toh, sizeof (toh)); 1094 soappendmsg(mp, CMSG_CONTENT(cmsg), len); 1095 mp->b_wptr += _TPI_ALIGN_TOPT(len) - len; 1096 ASSERT(mp->b_wptr <= mp->b_datap->db_lim); 1097 } 1098 } 1099 1100 /* 1101 * Return the length of the control message derived from the options. 1102 * Exclude SO_SRCADDR and SO_UNIX_CLOSE options. Include SO_FILEP. 1103 * When oldflg is set only include SO_FILEP. 1104 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen 1105 * allocates the space that so_opt2cmsg fills. If one changes, the other should 1106 * also be checked for any possible impacts. 1107 */ 1108 t_uscalar_t 1109 so_cmsglen(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg) 1110 { 1111 t_uscalar_t cmsglen = 0; 1112 struct T_opthdr *tohp; 1113 t_uscalar_t len; 1114 t_uscalar_t last_roundup = 0; 1115 1116 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1117 1118 for (tohp = (struct T_opthdr *)opt; 1119 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1120 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1121 dprint(1, ("so_cmsglen: level 0x%x, name %d, len %d\n", 1122 tohp->level, tohp->name, tohp->len)); 1123 if (tohp->level == SOL_SOCKET && 1124 (tohp->name == SO_SRCADDR || 1125 tohp->name == SO_UNIX_CLOSE)) { 1126 continue; 1127 } 1128 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) { 1129 struct fdbuf *fdbuf; 1130 int fdbuflen; 1131 1132 fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp); 1133 fdbuflen = (int)_TPI_TOPT_DATALEN(tohp); 1134 1135 if (!fdbuf_verify(mp, fdbuf, fdbuflen)) 1136 continue; 1137 if (oldflg) { 1138 cmsglen += fdbuf_cmsglen(fdbuflen); 1139 continue; 1140 } 1141 len = fdbuf_cmsglen(fdbuflen); 1142 } else if (tohp->level == SOL_SOCKET && 1143 tohp->name == SCM_TIMESTAMP) { 1144 if (oldflg) 1145 continue; 1146 1147 if (get_udatamodel() == DATAMODEL_NATIVE) { 1148 len = sizeof (struct timeval); 1149 } else { 1150 len = sizeof (struct timeval32); 1151 } 1152 } else { 1153 if (oldflg) 1154 continue; 1155 len = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp); 1156 } 1157 /* 1158 * Exclude roundup for last option to not set 1159 * MSG_CTRUNC when the cmsg fits but the padding doesn't fit. 1160 */ 1161 last_roundup = (t_uscalar_t) 1162 (ROUNDUP_cmsglen(len + (int)sizeof (struct cmsghdr)) - 1163 (len + (int)sizeof (struct cmsghdr))); 1164 cmsglen += (t_uscalar_t)(len + (int)sizeof (struct cmsghdr)) + 1165 last_roundup; 1166 } 1167 cmsglen -= last_roundup; 1168 dprint(1, ("so_cmsglen: optlen %d, flg %d -> cmsglen %d\n", 1169 optlen, oldflg, cmsglen)); 1170 return (cmsglen); 1171 } 1172 1173 /* 1174 * Copy options from options to the control. Convert SO_FILEP to 1175 * file descriptors. 1176 * Returns errno or zero. 1177 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen 1178 * allocates the space that so_opt2cmsg fills. If one changes, the other should 1179 * also be checked for any possible impacts. 1180 */ 1181 int 1182 so_opt2cmsg(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg, 1183 void *control, t_uscalar_t controllen) 1184 { 1185 struct T_opthdr *tohp; 1186 struct cmsghdr *cmsg; 1187 struct fdbuf *fdbuf; 1188 int fdbuflen; 1189 int error; 1190 #if defined(DEBUG) || defined(__lint) 1191 struct cmsghdr *cend = (struct cmsghdr *) 1192 (((uint8_t *)control) + ROUNDUP_cmsglen(controllen)); 1193 #endif 1194 cmsg = (struct cmsghdr *)control; 1195 1196 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1197 1198 for (tohp = (struct T_opthdr *)opt; 1199 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1200 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1201 dprint(1, ("so_opt2cmsg: level 0x%x, name %d, len %d\n", 1202 tohp->level, tohp->name, tohp->len)); 1203 1204 if (tohp->level == SOL_SOCKET && 1205 (tohp->name == SO_SRCADDR || 1206 tohp->name == SO_UNIX_CLOSE)) { 1207 continue; 1208 } 1209 ASSERT((uintptr_t)cmsg <= (uintptr_t)control + controllen); 1210 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) { 1211 fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp); 1212 fdbuflen = (int)_TPI_TOPT_DATALEN(tohp); 1213 1214 if (!fdbuf_verify(mp, fdbuf, fdbuflen)) 1215 return (EPROTO); 1216 if (oldflg) { 1217 error = fdbuf_extract(fdbuf, control, 1218 (int)controllen); 1219 if (error != 0) 1220 return (error); 1221 continue; 1222 } else { 1223 int fdlen; 1224 1225 fdlen = (int)fdbuf_cmsglen( 1226 (int)_TPI_TOPT_DATALEN(tohp)); 1227 1228 cmsg->cmsg_level = tohp->level; 1229 cmsg->cmsg_type = SCM_RIGHTS; 1230 cmsg->cmsg_len = (socklen_t)(fdlen + 1231 sizeof (struct cmsghdr)); 1232 1233 error = fdbuf_extract(fdbuf, 1234 CMSG_CONTENT(cmsg), fdlen); 1235 if (error != 0) 1236 return (error); 1237 } 1238 } else if (tohp->level == SOL_SOCKET && 1239 tohp->name == SCM_TIMESTAMP) { 1240 timestruc_t *timestamp; 1241 1242 if (oldflg) 1243 continue; 1244 1245 cmsg->cmsg_level = tohp->level; 1246 cmsg->cmsg_type = tohp->name; 1247 1248 timestamp = 1249 (timestruc_t *)P2ROUNDUP((intptr_t)&tohp[1], 1250 sizeof (intptr_t)); 1251 1252 if (get_udatamodel() == DATAMODEL_NATIVE) { 1253 struct timeval tv; 1254 1255 cmsg->cmsg_len = sizeof (struct timeval) + 1256 sizeof (struct cmsghdr); 1257 tv.tv_sec = timestamp->tv_sec; 1258 tv.tv_usec = timestamp->tv_nsec / 1259 (NANOSEC / MICROSEC); 1260 /* 1261 * on LP64 systems, the struct timeval in 1262 * the destination will not be 8-byte aligned, 1263 * so use bcopy to avoid alignment trouble 1264 */ 1265 bcopy(&tv, CMSG_CONTENT(cmsg), sizeof (tv)); 1266 } else { 1267 struct timeval32 *time32; 1268 1269 cmsg->cmsg_len = sizeof (struct timeval32) + 1270 sizeof (struct cmsghdr); 1271 time32 = (struct timeval32 *)CMSG_CONTENT(cmsg); 1272 time32->tv_sec = (time32_t)timestamp->tv_sec; 1273 time32->tv_usec = 1274 (int32_t)(timestamp->tv_nsec / 1275 (NANOSEC / MICROSEC)); 1276 } 1277 1278 } else { 1279 if (oldflg) 1280 continue; 1281 1282 cmsg->cmsg_level = tohp->level; 1283 cmsg->cmsg_type = tohp->name; 1284 cmsg->cmsg_len = (socklen_t)(_TPI_TOPT_DATALEN(tohp) + 1285 sizeof (struct cmsghdr)); 1286 1287 /* copy content to control data part */ 1288 bcopy(&tohp[1], CMSG_CONTENT(cmsg), 1289 CMSG_CONTENTLEN(cmsg)); 1290 } 1291 /* move to next CMSG structure! */ 1292 cmsg = CMSG_NEXT(cmsg); 1293 } 1294 dprint(1, ("so_opt2cmsg: buf %p len %d; cend %p; final cmsg %p\n", 1295 control, controllen, (void *)cend, (void *)cmsg)); 1296 ASSERT(cmsg <= cend); 1297 return (0); 1298 } 1299 1300 /* 1301 * Extract the SO_SRCADDR option value if present. 1302 */ 1303 void 1304 so_getopt_srcaddr(void *opt, t_uscalar_t optlen, void **srcp, 1305 t_uscalar_t *srclenp) 1306 { 1307 struct T_opthdr *tohp; 1308 1309 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1310 1311 ASSERT(srcp != NULL && srclenp != NULL); 1312 *srcp = NULL; 1313 *srclenp = 0; 1314 1315 for (tohp = (struct T_opthdr *)opt; 1316 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1317 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1318 dprint(1, ("so_getopt_srcaddr: level 0x%x, name %d, len %d\n", 1319 tohp->level, tohp->name, tohp->len)); 1320 if (tohp->level == SOL_SOCKET && 1321 tohp->name == SO_SRCADDR) { 1322 *srcp = _TPI_TOPT_DATA(tohp); 1323 *srclenp = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp); 1324 } 1325 } 1326 } 1327 1328 /* 1329 * Verify if the SO_UNIX_CLOSE option is present. 1330 */ 1331 int 1332 so_getopt_unix_close(void *opt, t_uscalar_t optlen) 1333 { 1334 struct T_opthdr *tohp; 1335 1336 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1337 1338 for (tohp = (struct T_opthdr *)opt; 1339 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1340 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1341 dprint(1, 1342 ("so_getopt_unix_close: level 0x%x, name %d, len %d\n", 1343 tohp->level, tohp->name, tohp->len)); 1344 if (tohp->level == SOL_SOCKET && 1345 tohp->name == SO_UNIX_CLOSE) 1346 return (1); 1347 } 1348 return (0); 1349 } 1350 1351 /* 1352 * Allocate an M_PROTO message. 1353 * 1354 * If allocation fails the behavior depends on sleepflg: 1355 * _ALLOC_NOSLEEP fail immediately 1356 * _ALLOC_INTR sleep for memory until a signal is caught 1357 * _ALLOC_SLEEP sleep forever. Don't return NULL. 1358 */ 1359 mblk_t * 1360 soallocproto(size_t size, int sleepflg) 1361 { 1362 mblk_t *mp; 1363 1364 /* Round up size for reuse */ 1365 size = MAX(size, 64); 1366 mp = allocb(size, BPRI_MED); 1367 if (mp == NULL) { 1368 int error; /* Dummy - error not returned to caller */ 1369 1370 switch (sleepflg) { 1371 case _ALLOC_SLEEP: 1372 mp = allocb_wait(size, BPRI_MED, STR_NOSIG, &error); 1373 ASSERT(mp); 1374 break; 1375 case _ALLOC_INTR: 1376 mp = allocb_wait(size, BPRI_MED, 0, &error); 1377 if (mp == NULL) { 1378 /* Caught signal while sleeping for memory */ 1379 eprintline(ENOBUFS); 1380 return (NULL); 1381 } 1382 break; 1383 case _ALLOC_NOSLEEP: 1384 default: 1385 eprintline(ENOBUFS); 1386 return (NULL); 1387 } 1388 } 1389 DB_TYPE(mp) = M_PROTO; 1390 return (mp); 1391 } 1392 1393 /* 1394 * Allocate an M_PROTO message with a single component. 1395 * len is the length of buf. size is the amount to allocate. 1396 * 1397 * buf can be NULL with a non-zero len. 1398 * This results in a bzero'ed chunk being placed the message. 1399 */ 1400 mblk_t * 1401 soallocproto1(const void *buf, ssize_t len, ssize_t size, int sleepflg) 1402 { 1403 mblk_t *mp; 1404 1405 if (size == 0) 1406 size = len; 1407 1408 ASSERT(size >= len); 1409 /* Round up size for reuse */ 1410 size = MAX(size, 64); 1411 mp = soallocproto(size, sleepflg); 1412 if (mp == NULL) 1413 return (NULL); 1414 mp->b_datap->db_type = M_PROTO; 1415 if (len != 0) { 1416 if (buf != NULL) 1417 bcopy(buf, mp->b_wptr, len); 1418 else 1419 bzero(mp->b_wptr, len); 1420 mp->b_wptr += len; 1421 } 1422 return (mp); 1423 } 1424 1425 /* 1426 * Append buf/len to mp. 1427 * The caller has to ensure that there is enough room in the mblk. 1428 * 1429 * buf can be NULL with a non-zero len. 1430 * This results in a bzero'ed chunk being placed the message. 1431 */ 1432 void 1433 soappendmsg(mblk_t *mp, const void *buf, ssize_t len) 1434 { 1435 ASSERT(mp); 1436 1437 if (len != 0) { 1438 /* Assert for room left */ 1439 ASSERT(mp->b_datap->db_lim - mp->b_wptr >= len); 1440 if (buf != NULL) 1441 bcopy(buf, mp->b_wptr, len); 1442 else 1443 bzero(mp->b_wptr, len); 1444 } 1445 mp->b_wptr += len; 1446 } 1447 1448 /* 1449 * Create a message using two kernel buffers. 1450 * If size is set that will determine the allocation size (e.g. for future 1451 * soappendmsg calls). If size is zero it is derived from the buffer 1452 * lengths. 1453 */ 1454 mblk_t * 1455 soallocproto2(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2, 1456 ssize_t size, int sleepflg) 1457 { 1458 mblk_t *mp; 1459 1460 if (size == 0) 1461 size = len1 + len2; 1462 ASSERT(size >= len1 + len2); 1463 1464 mp = soallocproto1(buf1, len1, size, sleepflg); 1465 if (mp) 1466 soappendmsg(mp, buf2, len2); 1467 return (mp); 1468 } 1469 1470 /* 1471 * Create a message using three kernel buffers. 1472 * If size is set that will determine the allocation size (for future 1473 * soappendmsg calls). If size is zero it is derived from the buffer 1474 * lengths. 1475 */ 1476 mblk_t * 1477 soallocproto3(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2, 1478 const void *buf3, ssize_t len3, ssize_t size, int sleepflg) 1479 { 1480 mblk_t *mp; 1481 1482 if (size == 0) 1483 size = len1 + len2 +len3; 1484 ASSERT(size >= len1 + len2 + len3); 1485 1486 mp = soallocproto1(buf1, len1, size, sleepflg); 1487 if (mp != NULL) { 1488 soappendmsg(mp, buf2, len2); 1489 soappendmsg(mp, buf3, len3); 1490 } 1491 return (mp); 1492 } 1493 1494 #ifdef DEBUG 1495 char * 1496 pr_state(uint_t state, uint_t mode) 1497 { 1498 static char buf[1024]; 1499 1500 buf[0] = 0; 1501 if (state & SS_ISCONNECTED) 1502 (void) strcat(buf, "ISCONNECTED "); 1503 if (state & SS_ISCONNECTING) 1504 (void) strcat(buf, "ISCONNECTING "); 1505 if (state & SS_ISDISCONNECTING) 1506 (void) strcat(buf, "ISDISCONNECTING "); 1507 if (state & SS_CANTSENDMORE) 1508 (void) strcat(buf, "CANTSENDMORE "); 1509 1510 if (state & SS_CANTRCVMORE) 1511 (void) strcat(buf, "CANTRCVMORE "); 1512 if (state & SS_ISBOUND) 1513 (void) strcat(buf, "ISBOUND "); 1514 if (state & SS_NDELAY) 1515 (void) strcat(buf, "NDELAY "); 1516 if (state & SS_NONBLOCK) 1517 (void) strcat(buf, "NONBLOCK "); 1518 1519 if (state & SS_ASYNC) 1520 (void) strcat(buf, "ASYNC "); 1521 if (state & SS_ACCEPTCONN) 1522 (void) strcat(buf, "ACCEPTCONN "); 1523 if (state & SS_SAVEDEOR) 1524 (void) strcat(buf, "SAVEDEOR "); 1525 1526 if (state & SS_RCVATMARK) 1527 (void) strcat(buf, "RCVATMARK "); 1528 if (state & SS_OOBPEND) 1529 (void) strcat(buf, "OOBPEND "); 1530 if (state & SS_HAVEOOBDATA) 1531 (void) strcat(buf, "HAVEOOBDATA "); 1532 if (state & SS_HADOOBDATA) 1533 (void) strcat(buf, "HADOOBDATA "); 1534 1535 if (mode & SM_PRIV) 1536 (void) strcat(buf, "PRIV "); 1537 if (mode & SM_ATOMIC) 1538 (void) strcat(buf, "ATOMIC "); 1539 if (mode & SM_ADDR) 1540 (void) strcat(buf, "ADDR "); 1541 if (mode & SM_CONNREQUIRED) 1542 (void) strcat(buf, "CONNREQUIRED "); 1543 1544 if (mode & SM_FDPASSING) 1545 (void) strcat(buf, "FDPASSING "); 1546 if (mode & SM_EXDATA) 1547 (void) strcat(buf, "EXDATA "); 1548 if (mode & SM_OPTDATA) 1549 (void) strcat(buf, "OPTDATA "); 1550 if (mode & SM_BYTESTREAM) 1551 (void) strcat(buf, "BYTESTREAM "); 1552 return (buf); 1553 } 1554 1555 char * 1556 pr_addr(int family, struct sockaddr *addr, t_uscalar_t addrlen) 1557 { 1558 static char buf[1024]; 1559 1560 if (addr == NULL || addrlen == 0) { 1561 (void) sprintf(buf, "(len %d) %p", addrlen, (void *)addr); 1562 return (buf); 1563 } 1564 switch (family) { 1565 case AF_INET: { 1566 struct sockaddr_in sin; 1567 1568 bcopy(addr, &sin, sizeof (sin)); 1569 1570 (void) sprintf(buf, "(len %d) %x/%d", 1571 addrlen, ntohl(sin.sin_addr.s_addr), ntohs(sin.sin_port)); 1572 break; 1573 } 1574 case AF_INET6: { 1575 struct sockaddr_in6 sin6; 1576 uint16_t *piece = (uint16_t *)&sin6.sin6_addr; 1577 1578 bcopy((char *)addr, (char *)&sin6, sizeof (sin6)); 1579 (void) sprintf(buf, "(len %d) %x:%x:%x:%x:%x:%x:%x:%x/%d", 1580 addrlen, 1581 ntohs(piece[0]), ntohs(piece[1]), 1582 ntohs(piece[2]), ntohs(piece[3]), 1583 ntohs(piece[4]), ntohs(piece[5]), 1584 ntohs(piece[6]), ntohs(piece[7]), 1585 ntohs(sin6.sin6_port)); 1586 break; 1587 } 1588 case AF_UNIX: { 1589 struct sockaddr_un *soun = (struct sockaddr_un *)addr; 1590 1591 (void) sprintf(buf, "(len %d) %s", addrlen, 1592 (soun == NULL) ? "(none)" : soun->sun_path); 1593 break; 1594 } 1595 default: 1596 (void) sprintf(buf, "(unknown af %d)", family); 1597 break; 1598 } 1599 return (buf); 1600 } 1601 1602 /* The logical equivalence operator (a if-and-only-if b) */ 1603 #define EQUIV(a, b) (((a) && (b)) || (!(a) && (!(b)))) 1604 1605 /* 1606 * Verify limitations and invariants on oob state. 1607 * Return 1 if OK, otherwise 0 so that it can be used as 1608 * ASSERT(verify_oobstate(so)); 1609 */ 1610 int 1611 so_verify_oobstate(struct sonode *so) 1612 { 1613 boolean_t havemark; 1614 1615 ASSERT(MUTEX_HELD(&so->so_lock)); 1616 1617 /* 1618 * The possible state combinations are: 1619 * 0 1620 * SS_OOBPEND 1621 * SS_OOBPEND|SS_HAVEOOBDATA 1622 * SS_OOBPEND|SS_HADOOBDATA 1623 * SS_HADOOBDATA 1624 */ 1625 switch (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA)) { 1626 case 0: 1627 case SS_OOBPEND: 1628 case SS_OOBPEND|SS_HAVEOOBDATA: 1629 case SS_OOBPEND|SS_HADOOBDATA: 1630 case SS_HADOOBDATA: 1631 break; 1632 default: 1633 printf("Bad oob state 1 (%p): state %s\n", 1634 (void *)so, pr_state(so->so_state, so->so_mode)); 1635 return (0); 1636 } 1637 1638 /* SS_RCVATMARK should only be set when SS_OOBPEND is set */ 1639 if ((so->so_state & (SS_RCVATMARK|SS_OOBPEND)) == SS_RCVATMARK) { 1640 printf("Bad oob state 2 (%p): state %s\n", 1641 (void *)so, pr_state(so->so_state, so->so_mode)); 1642 return (0); 1643 } 1644 1645 /* 1646 * (havemark != 0 or SS_RCVATMARK) iff SS_OOBPEND 1647 * For TPI, the presence of a "mark" is indicated by sti_oobsigcnt. 1648 */ 1649 havemark = (SOCK_IS_NONSTR(so)) ? so->so_oobmark > 0 : 1650 SOTOTPI(so)->sti_oobsigcnt > 0; 1651 1652 if (!EQUIV(havemark || (so->so_state & SS_RCVATMARK), 1653 so->so_state & SS_OOBPEND)) { 1654 printf("Bad oob state 3 (%p): state %s\n", 1655 (void *)so, pr_state(so->so_state, so->so_mode)); 1656 return (0); 1657 } 1658 1659 /* 1660 * Unless SO_OOBINLINE we have so_oobmsg != NULL iff SS_HAVEOOBDATA 1661 */ 1662 if (!(so->so_options & SO_OOBINLINE) && 1663 !EQUIV(so->so_oobmsg != NULL, so->so_state & SS_HAVEOOBDATA)) { 1664 printf("Bad oob state 4 (%p): state %s\n", 1665 (void *)so, pr_state(so->so_state, so->so_mode)); 1666 return (0); 1667 } 1668 1669 if (!SOCK_IS_NONSTR(so) && 1670 SOTOTPI(so)->sti_oobsigcnt < SOTOTPI(so)->sti_oobcnt) { 1671 printf("Bad oob state 5 (%p): counts %d/%d state %s\n", 1672 (void *)so, SOTOTPI(so)->sti_oobsigcnt, 1673 SOTOTPI(so)->sti_oobcnt, 1674 pr_state(so->so_state, so->so_mode)); 1675 return (0); 1676 } 1677 1678 return (1); 1679 } 1680 #undef EQUIV 1681 #endif /* DEBUG */ 1682 1683 /* initialize sockfs zone specific kstat related items */ 1684 void * 1685 sock_kstat_init(zoneid_t zoneid) 1686 { 1687 kstat_t *ksp; 1688 1689 ksp = kstat_create_zone("sockfs", 0, "sock_unix_list", "misc", 1690 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE|KSTAT_FLAG_VIRTUAL, zoneid); 1691 1692 if (ksp != NULL) { 1693 ksp->ks_update = sockfs_update; 1694 ksp->ks_snapshot = sockfs_snapshot; 1695 ksp->ks_lock = &socklist.sl_lock; 1696 ksp->ks_private = (void *)(uintptr_t)zoneid; 1697 kstat_install(ksp); 1698 } 1699 1700 return (ksp); 1701 } 1702 1703 /* tear down sockfs zone specific kstat related items */ 1704 /*ARGSUSED*/ 1705 void 1706 sock_kstat_fini(zoneid_t zoneid, void *arg) 1707 { 1708 kstat_t *ksp = (kstat_t *)arg; 1709 1710 if (ksp != NULL) { 1711 ASSERT(zoneid == (zoneid_t)(uintptr_t)ksp->ks_private); 1712 kstat_delete(ksp); 1713 } 1714 } 1715 1716 /* 1717 * Zones: 1718 * Note that nactive is going to be different for each zone. 1719 * This means we require kstat to call sockfs_update and then sockfs_snapshot 1720 * for the same zone, or sockfs_snapshot will be taken into the wrong size 1721 * buffer. This is safe, but if the buffer is too small, user will not be 1722 * given details of all sockets. However, as this kstat has a ks_lock, kstat 1723 * driver will keep it locked between the update and the snapshot, so no 1724 * other process (zone) can currently get inbetween resulting in a wrong size 1725 * buffer allocation. 1726 */ 1727 static int 1728 sockfs_update(kstat_t *ksp, int rw) 1729 { 1730 uint_t nactive = 0; /* # of active AF_UNIX sockets */ 1731 struct sonode *so; /* current sonode on socklist */ 1732 zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private; 1733 1734 ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid()); 1735 1736 if (rw == KSTAT_WRITE) { /* bounce all writes */ 1737 return (EACCES); 1738 } 1739 1740 for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) { 1741 if (so->so_count != 0 && so->so_zoneid == myzoneid) { 1742 nactive++; 1743 } 1744 } 1745 ksp->ks_ndata = nactive; 1746 ksp->ks_data_size = nactive * sizeof (struct k_sockinfo); 1747 1748 return (0); 1749 } 1750 1751 static int 1752 sockfs_snapshot(kstat_t *ksp, void *buf, int rw) 1753 { 1754 int ns; /* # of sonodes we've copied */ 1755 struct sonode *so; /* current sonode on socklist */ 1756 struct k_sockinfo *pksi; /* where we put sockinfo data */ 1757 t_uscalar_t sn_len; /* soa_len */ 1758 zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private; 1759 sotpi_info_t *sti; 1760 1761 ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid()); 1762 1763 ksp->ks_snaptime = gethrtime(); 1764 1765 if (rw == KSTAT_WRITE) { /* bounce all writes */ 1766 return (EACCES); 1767 } 1768 1769 /* 1770 * for each sonode on the socklist, we massage the important 1771 * info into buf, in k_sockinfo format. 1772 */ 1773 pksi = (struct k_sockinfo *)buf; 1774 ns = 0; 1775 for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) { 1776 /* only stuff active sonodes and the same zone: */ 1777 if (so->so_count == 0 || so->so_zoneid != myzoneid) { 1778 continue; 1779 } 1780 1781 /* 1782 * If the sonode was activated between the update and the 1783 * snapshot, we're done - as this is only a snapshot. 1784 */ 1785 if ((caddr_t)(pksi) >= (caddr_t)buf + ksp->ks_data_size) { 1786 break; 1787 } 1788 1789 sti = SOTOTPI(so); 1790 /* copy important info into buf: */ 1791 pksi->ks_si.si_size = sizeof (struct k_sockinfo); 1792 pksi->ks_si.si_family = so->so_family; 1793 pksi->ks_si.si_type = so->so_type; 1794 pksi->ks_si.si_flag = so->so_flag; 1795 pksi->ks_si.si_state = so->so_state; 1796 pksi->ks_si.si_serv_type = sti->sti_serv_type; 1797 pksi->ks_si.si_ux_laddr_sou_magic = 1798 sti->sti_ux_laddr.soua_magic; 1799 pksi->ks_si.si_ux_faddr_sou_magic = 1800 sti->sti_ux_faddr.soua_magic; 1801 pksi->ks_si.si_laddr_soa_len = sti->sti_laddr.soa_len; 1802 pksi->ks_si.si_faddr_soa_len = sti->sti_faddr.soa_len; 1803 pksi->ks_si.si_szoneid = so->so_zoneid; 1804 pksi->ks_si.si_faddr_noxlate = sti->sti_faddr_noxlate; 1805 1806 mutex_enter(&so->so_lock); 1807 1808 if (sti->sti_laddr_sa != NULL) { 1809 ASSERT(sti->sti_laddr_sa->sa_data != NULL); 1810 sn_len = sti->sti_laddr_len; 1811 ASSERT(sn_len <= sizeof (short) + 1812 sizeof (pksi->ks_si.si_laddr_sun_path)); 1813 1814 pksi->ks_si.si_laddr_family = 1815 sti->sti_laddr_sa->sa_family; 1816 if (sn_len != 0) { 1817 /* AF_UNIX socket names are NULL terminated */ 1818 (void) strncpy(pksi->ks_si.si_laddr_sun_path, 1819 sti->sti_laddr_sa->sa_data, 1820 sizeof (pksi->ks_si.si_laddr_sun_path)); 1821 sn_len = strlen(pksi->ks_si.si_laddr_sun_path); 1822 } 1823 pksi->ks_si.si_laddr_sun_path[sn_len] = 0; 1824 } 1825 1826 if (sti->sti_faddr_sa != NULL) { 1827 ASSERT(sti->sti_faddr_sa->sa_data != NULL); 1828 sn_len = sti->sti_faddr_len; 1829 ASSERT(sn_len <= sizeof (short) + 1830 sizeof (pksi->ks_si.si_faddr_sun_path)); 1831 1832 pksi->ks_si.si_faddr_family = 1833 sti->sti_faddr_sa->sa_family; 1834 if (sn_len != 0) { 1835 (void) strncpy(pksi->ks_si.si_faddr_sun_path, 1836 sti->sti_faddr_sa->sa_data, 1837 sizeof (pksi->ks_si.si_faddr_sun_path)); 1838 sn_len = strlen(pksi->ks_si.si_faddr_sun_path); 1839 } 1840 pksi->ks_si.si_faddr_sun_path[sn_len] = 0; 1841 } 1842 1843 mutex_exit(&so->so_lock); 1844 1845 (void) sprintf(pksi->ks_straddr[0], "%p", (void *)so); 1846 (void) sprintf(pksi->ks_straddr[1], "%p", 1847 (void *)sti->sti_ux_laddr.soua_vp); 1848 (void) sprintf(pksi->ks_straddr[2], "%p", 1849 (void *)sti->sti_ux_faddr.soua_vp); 1850 1851 ns++; 1852 pksi++; 1853 } 1854 1855 ksp->ks_ndata = ns; 1856 return (0); 1857 } 1858 1859 ssize_t 1860 soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size) 1861 { 1862 struct uio auio; 1863 struct iovec aiov[MSG_MAXIOVLEN]; 1864 register vnode_t *vp; 1865 int ioflag, rwflag; 1866 ssize_t cnt; 1867 int error = 0; 1868 int iovcnt = 0; 1869 short fflag; 1870 1871 vp = fp->f_vnode; 1872 fflag = fp->f_flag; 1873 1874 rwflag = 0; 1875 aiov[0].iov_base = (caddr_t)buf; 1876 aiov[0].iov_len = size; 1877 iovcnt = 1; 1878 cnt = (ssize_t)size; 1879 (void) VOP_RWLOCK(vp, rwflag, NULL); 1880 1881 auio.uio_loffset = fileoff; 1882 auio.uio_iov = aiov; 1883 auio.uio_iovcnt = iovcnt; 1884 auio.uio_resid = cnt; 1885 auio.uio_segflg = UIO_SYSSPACE; 1886 auio.uio_llimit = MAXOFFSET_T; 1887 auio.uio_fmode = fflag; 1888 auio.uio_extflg = UIO_COPY_CACHED; 1889 1890 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1891 1892 /* If read sync is not asked for, filter sync flags */ 1893 if ((ioflag & FRSYNC) == 0) 1894 ioflag &= ~(FSYNC|FDSYNC); 1895 error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL); 1896 cnt -= auio.uio_resid; 1897 1898 VOP_RWUNLOCK(vp, rwflag, NULL); 1899 1900 if (error == EINTR && cnt != 0) 1901 error = 0; 1902 out: 1903 if (error != 0) { 1904 *err = error; 1905 return (0); 1906 } else { 1907 *err = 0; 1908 return (cnt); 1909 } 1910 } 1911 1912 int 1913 so_copyin(const void *from, void *to, size_t size, int fromkernel) 1914 { 1915 if (fromkernel) { 1916 bcopy(from, to, size); 1917 return (0); 1918 } 1919 return (xcopyin(from, to, size)); 1920 } 1921 1922 int 1923 so_copyout(const void *from, void *to, size_t size, int tokernel) 1924 { 1925 if (tokernel) { 1926 bcopy(from, to, size); 1927 return (0); 1928 } 1929 return (xcopyout(from, to, size)); 1930 } 1931