/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/user.h>
#include <sys/termios.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/esunddi.h>
#include <sys/flock.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/mkdev.h>
#include <sys/pathname.h>
#include <sys/ddi.h>
#include <sys/stat.h>
#include <sys/fs/snode.h>
#include <sys/fs/dv_node.h>
#include <sys/zone.h>

#include <sys/socket.h>
#include <sys/socketvar.h>
#include <netinet/in.h>
#include <sys/un.h>

#include <sys/ucred.h>

#include <sys/tiuser.h>
/*
 * Defined immediately ahead of <sys/tihdr.h>.
 * NOTE(review): presumably selects which TPI interface version that
 * header exposes - confirm against <sys/tihdr.h>.
 */
#define	_SUN_TPI_VERSION	2
#include <sys/tihdr.h>

#include <c2/audit.h>

#include <fs/sockfs/nl7c.h>
#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/socktpi.h>
#include <fs/sockfs/socktpi_impl.h>
#include <fs/sockfs/sodirect.h>

/*
 * Macros that operate on struct cmsghdr.
 * The CMSG_VALID macro does not assume that the last option buffer is padded.
 *
 * CMSG_CONTENT(cmsg)
 *	Address of the first byte following the cmsghdr, i.e. the payload.
 * CMSG_CONTENTLEN(cmsg)
 *	Payload length: cmsg_len minus the size of the header itself.
 * CMSG_VALID(cmsg, start, end)
 *	True when cmsg is aligned for a cmsghdr, lies within [start, end),
 *	has a cmsg_len of at least one header, and cmsg + cmsg_len does not
 *	extend past end.  Note that the final bound uses the unpadded
 *	cmsg_len, which is why trailing padding is not required.
 */
#define	CMSG_CONTENT(cmsg)	(&((cmsg)[1]))
#define	CMSG_CONTENTLEN(cmsg)	((cmsg)->cmsg_len - sizeof (struct cmsghdr))
#define	CMSG_VALID(cmsg, start, end)					\
	(ISALIGNED_cmsghdr(cmsg) &&					\
	((uintptr_t)(cmsg) >= (uintptr_t)(start)) &&			\
	((uintptr_t)(cmsg) < (uintptr_t)(end)) &&			\
	((ssize_t)(cmsg)->cmsg_len >= sizeof (struct cmsghdr)) &&	\
	((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end)))
/* Passed as the time argument to cv_wait_stop() by the so_lock_* routines. */
#define	SO_LOCK_WAKEUP_TIME	3000	/* Wakeup time in milliseconds */

/* Assigned in sockinit() from makedevice(getudev(), 0). */
dev_t sockdev;	/* For fsid in getattr */
/*
 * Set to 1 by sockinit() when modrootloaded is zero; in that case
 * sockinit() does not call nl7c_init() itself.
 */
int sockfs_defer_nl7c_init = 0;

/* Its sl_lock mutex is initialized in sockinit(). */
struct socklist socklist;

/* kmem cache of struct sonode, created in sockinit(). */
struct kmem_cache *socket_cache;

static int sockfs_update(kstat_t *, int);
static int sockfs_snapshot(kstat_t *, void *, int);
extern smod_info_t *sotpi_smod_create(void);

extern void sendfile_init();

extern void nl7c_init(void);

extern int modrootloaded;

/*
 * Size of each address string in struct k_sockinfo.
 * NOTE(review): 2 * sizeof (void *) + 1 looks like "two hex digits per
 * pointer byte plus a terminating NUL" - confirm against the code that
 * fills ks_straddr.
 */
#define	ADRSTRLEN (2 * sizeof (void *) + 1)
/*
 * kernel structure for passing the sockinfo data back up to the user.
 * the strings array allows us to convert AF_UNIX addresses into strings
 * with a common method regardless of which n-bit kernel we're running.
 */
struct k_sockinfo {
	struct sockinfo	ks_si;
	char		ks_straddr[3][ADRSTRLEN];
};

/*
 * Translate from a device pathname (e.g. "/dev/tcp") to a vnode.
 * Returns with the vnode held.
125 */ 126 int 127 sogetvp(char *devpath, vnode_t **vpp, int uioflag) 128 { 129 struct snode *csp; 130 vnode_t *vp, *dvp; 131 major_t maj; 132 int error; 133 134 ASSERT(uioflag == UIO_SYSSPACE || uioflag == UIO_USERSPACE); 135 136 /* 137 * Lookup the underlying filesystem vnode. 138 */ 139 error = lookupname(devpath, uioflag, FOLLOW, NULLVPP, &vp); 140 if (error) 141 return (error); 142 143 /* Check that it is the correct vnode */ 144 if (vp->v_type != VCHR) { 145 VN_RELE(vp); 146 return (ENOTSOCK); 147 } 148 149 /* 150 * If devpath went through devfs, the device should already 151 * be configured. If devpath is a mknod file, however, we 152 * need to make sure the device is properly configured. 153 * To do this, we do something similar to spec_open() 154 * except that we resolve to the minor/leaf level since 155 * we need to return a vnode. 156 */ 157 csp = VTOS(VTOS(vp)->s_commonvp); 158 if (!(csp->s_flag & SDIPSET)) { 159 char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 160 error = ddi_dev_pathname(vp->v_rdev, S_IFCHR, pathname); 161 if (error == 0) 162 error = devfs_lookupname(pathname, NULLVPP, &dvp); 163 VN_RELE(vp); 164 kmem_free(pathname, MAXPATHLEN); 165 if (error != 0) 166 return (ENXIO); 167 vp = dvp; /* use the devfs vp */ 168 } 169 170 /* device is configured at this point */ 171 maj = getmajor(vp->v_rdev); 172 if (!STREAMSTAB(maj)) { 173 VN_RELE(vp); 174 return (ENOSTR); 175 } 176 177 *vpp = vp; 178 return (0); 179 } 180 181 /* 182 * Update the accessed, updated, or changed times in an sonode 183 * with the current time. 184 * 185 * Note that both SunOS 4.X and 4.4BSD sockets do not present reasonable 186 * attributes in a fstat call. (They return the current time and 0 for 187 * all timestamps, respectively.) We maintain the current timestamps 188 * here primarily so that should sockmod be popped the resulting 189 * file descriptor will behave like a stream w.r.t. the timestamps. 
190 */ 191 void 192 so_update_attrs(struct sonode *so, int flag) 193 { 194 time_t now = gethrestime_sec(); 195 196 if (SOCK_IS_NONSTR(so)) 197 return; 198 199 mutex_enter(&so->so_lock); 200 so->so_flag |= flag; 201 if (flag & SOACC) 202 SOTOTPI(so)->sti_atime = now; 203 if (flag & SOMOD) 204 SOTOTPI(so)->sti_mtime = now; 205 mutex_exit(&so->so_lock); 206 } 207 208 extern so_create_func_t sock_comm_create_function; 209 extern so_destroy_func_t sock_comm_destroy_function; 210 /* 211 * Init function called when sockfs is loaded. 212 */ 213 int 214 sockinit(int fstype, char *name) 215 { 216 static const fs_operation_def_t sock_vfsops_template[] = { 217 NULL, NULL 218 }; 219 int error; 220 major_t dev; 221 char *err_str; 222 223 error = vfs_setfsops(fstype, sock_vfsops_template, NULL); 224 if (error != 0) { 225 zcmn_err(GLOBAL_ZONEID, CE_WARN, 226 "sockinit: bad vfs ops template"); 227 return (error); 228 } 229 230 error = vn_make_ops(name, socket_vnodeops_template, 231 &socket_vnodeops); 232 if (error != 0) { 233 err_str = "sockinit: bad socket vnode ops template"; 234 /* vn_make_ops() does not reset socktpi_vnodeops on failure. */ 235 socket_vnodeops = NULL; 236 goto failure; 237 } 238 239 socket_cache = kmem_cache_create("socket_cache", 240 sizeof (struct sonode), 0, sonode_constructor, 241 sonode_destructor, NULL, NULL, NULL, 0); 242 243 error = socktpi_init(); 244 if (error != 0) { 245 err_str = NULL; 246 goto failure; 247 } 248 249 error = sod_init(); 250 if (error != 0) { 251 err_str = NULL; 252 goto failure; 253 } 254 255 /* 256 * Set up the default create and destroy functions 257 */ 258 sock_comm_create_function = socket_sonode_create; 259 sock_comm_destroy_function = socket_sonode_destroy; 260 261 /* 262 * Build initial list mapping socket parameters to vnode. 
263 */ 264 smod_init(); 265 smod_add(sotpi_smod_create()); 266 267 sockparams_init(); 268 269 /* 270 * If sockets are needed before init runs /sbin/soconfig 271 * it is possible to preload the sockparams list here using 272 * calls like: 273 * sockconfig(1,2,3, "/dev/tcp", 0); 274 */ 275 276 /* 277 * Create a unique dev_t for use in so_fsid. 278 */ 279 280 if ((dev = getudev()) == (major_t)-1) 281 dev = 0; 282 sockdev = makedevice(dev, 0); 283 284 mutex_init(&socklist.sl_lock, NULL, MUTEX_DEFAULT, NULL); 285 sendfile_init(); 286 if (!modrootloaded) { 287 sockfs_defer_nl7c_init = 1; 288 } else { 289 nl7c_init(); 290 } 291 292 return (0); 293 294 failure: 295 (void) vfs_freevfsops_by_type(fstype); 296 if (socket_vnodeops != NULL) 297 vn_freevnodeops(socket_vnodeops); 298 if (err_str != NULL) 299 zcmn_err(GLOBAL_ZONEID, CE_WARN, err_str); 300 return (error); 301 } 302 303 /* 304 * Caller must hold the mutex. Used to set SOLOCKED. 305 */ 306 void 307 so_lock_single(struct sonode *so) 308 { 309 ASSERT(MUTEX_HELD(&so->so_lock)); 310 311 while (so->so_flag & (SOLOCKED | SOASYNC_UNBIND)) { 312 so->so_flag |= SOWANT; 313 cv_wait_stop(&so->so_want_cv, &so->so_lock, 314 SO_LOCK_WAKEUP_TIME); 315 } 316 so->so_flag |= SOLOCKED; 317 } 318 319 /* 320 * Caller must hold the mutex and pass in SOLOCKED or SOASYNC_UNBIND. 321 * Used to clear SOLOCKED or SOASYNC_UNBIND. 322 */ 323 void 324 so_unlock_single(struct sonode *so, int flag) 325 { 326 ASSERT(MUTEX_HELD(&so->so_lock)); 327 ASSERT(flag & (SOLOCKED|SOASYNC_UNBIND)); 328 ASSERT((flag & ~(SOLOCKED|SOASYNC_UNBIND)) == 0); 329 ASSERT(so->so_flag & flag); 330 /* 331 * Process the T_DISCON_IND on sti_discon_ind_mp. 332 * 333 * Call to so_drain_discon_ind will result in so_lock 334 * being dropped and re-acquired later. 
335 */ 336 if (!SOCK_IS_NONSTR(so)) { 337 sotpi_info_t *sti = SOTOTPI(so); 338 339 if (sti->sti_discon_ind_mp != NULL) 340 so_drain_discon_ind(so); 341 } 342 343 if (so->so_flag & SOWANT) 344 cv_broadcast(&so->so_want_cv); 345 so->so_flag &= ~(SOWANT|flag); 346 } 347 348 /* 349 * Caller must hold the mutex. Used to set SOREADLOCKED. 350 * If the caller wants nonblocking behavior it should set fmode. 351 */ 352 int 353 so_lock_read(struct sonode *so, int fmode) 354 { 355 ASSERT(MUTEX_HELD(&so->so_lock)); 356 357 while (so->so_flag & SOREADLOCKED) { 358 if (fmode & (FNDELAY|FNONBLOCK)) 359 return (EWOULDBLOCK); 360 so->so_flag |= SOWANT; 361 cv_wait_stop(&so->so_want_cv, &so->so_lock, 362 SO_LOCK_WAKEUP_TIME); 363 } 364 so->so_flag |= SOREADLOCKED; 365 return (0); 366 } 367 368 /* 369 * Like so_lock_read above but allows signals. 370 */ 371 int 372 so_lock_read_intr(struct sonode *so, int fmode) 373 { 374 ASSERT(MUTEX_HELD(&so->so_lock)); 375 376 while (so->so_flag & SOREADLOCKED) { 377 if (fmode & (FNDELAY|FNONBLOCK)) 378 return (EWOULDBLOCK); 379 so->so_flag |= SOWANT; 380 if (!cv_wait_sig(&so->so_want_cv, &so->so_lock)) 381 return (EINTR); 382 } 383 so->so_flag |= SOREADLOCKED; 384 return (0); 385 } 386 387 /* 388 * Caller must hold the mutex. Used to clear SOREADLOCKED, 389 * set in so_lock_read() or so_lock_read_intr(). 390 */ 391 void 392 so_unlock_read(struct sonode *so) 393 { 394 ASSERT(MUTEX_HELD(&so->so_lock)); 395 ASSERT(so->so_flag & SOREADLOCKED); 396 397 if (so->so_flag & SOWANT) 398 cv_broadcast(&so->so_want_cv); 399 so->so_flag &= ~(SOWANT|SOREADLOCKED); 400 } 401 402 /* 403 * Verify that the specified offset falls within the mblk and 404 * that the resulting pointer is aligned. 405 * Returns NULL if not. 
406 */ 407 void * 408 sogetoff(mblk_t *mp, t_uscalar_t offset, 409 t_uscalar_t length, uint_t align_size) 410 { 411 uintptr_t ptr1, ptr2; 412 413 ASSERT(mp && mp->b_wptr >= mp->b_rptr); 414 ptr1 = (uintptr_t)mp->b_rptr + offset; 415 ptr2 = (uintptr_t)ptr1 + length; 416 if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) { 417 eprintline(0); 418 return (NULL); 419 } 420 if ((ptr1 & (align_size - 1)) != 0) { 421 eprintline(0); 422 return (NULL); 423 } 424 return ((void *)ptr1); 425 } 426 427 /* 428 * Return the AF_UNIX underlying filesystem vnode matching a given name. 429 * Makes sure the sending and the destination sonodes are compatible. 430 * The vnode is returned held. 431 * 432 * The underlying filesystem VSOCK vnode has a v_stream pointer that 433 * references the actual stream head (hence indirectly the actual sonode). 434 */ 435 static int 436 so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess, 437 vnode_t **vpp) 438 { 439 vnode_t *vp; /* Underlying filesystem vnode */ 440 vnode_t *rvp; /* real vnode */ 441 vnode_t *svp; /* sockfs vnode */ 442 struct sonode *so2; 443 int error; 444 445 dprintso(so, 1, ("so_ux_lookup(%p) name <%s>\n", (void *)so, 446 soun->sun_path)); 447 448 error = lookupname(soun->sun_path, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); 449 if (error) { 450 eprintsoline(so, error); 451 return (error); 452 } 453 454 /* 455 * Traverse lofs mounts get the real vnode 456 */ 457 if (VOP_REALVP(vp, &rvp, NULL) == 0) { 458 VN_HOLD(rvp); /* hold the real vnode */ 459 VN_RELE(vp); /* release hold from lookup */ 460 vp = rvp; 461 } 462 463 if (vp->v_type != VSOCK) { 464 error = ENOTSOCK; 465 eprintsoline(so, error); 466 goto done2; 467 } 468 469 if (checkaccess) { 470 /* 471 * Check that we have permissions to access the destination 472 * vnode. This check is not done in BSD but it is required 473 * by X/Open. 
474 */ 475 if (error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL)) { 476 eprintsoline(so, error); 477 goto done2; 478 } 479 } 480 481 /* 482 * Check if the remote socket has been closed. 483 * 484 * Synchronize with vn_rele_stream by holding v_lock while traversing 485 * v_stream->sd_vnode. 486 */ 487 mutex_enter(&vp->v_lock); 488 if (vp->v_stream == NULL) { 489 mutex_exit(&vp->v_lock); 490 if (so->so_type == SOCK_DGRAM) 491 error = EDESTADDRREQ; 492 else 493 error = ECONNREFUSED; 494 495 eprintsoline(so, error); 496 goto done2; 497 } 498 ASSERT(vp->v_stream->sd_vnode); 499 svp = vp->v_stream->sd_vnode; 500 /* 501 * holding v_lock on underlying filesystem vnode and acquiring 502 * it on sockfs vnode. Assumes that no code ever attempts to 503 * acquire these locks in the reverse order. 504 */ 505 VN_HOLD(svp); 506 mutex_exit(&vp->v_lock); 507 508 if (svp->v_type != VSOCK) { 509 error = ENOTSOCK; 510 eprintsoline(so, error); 511 goto done; 512 } 513 514 so2 = VTOSO(svp); 515 516 if (so->so_type != so2->so_type) { 517 error = EPROTOTYPE; 518 eprintsoline(so, error); 519 goto done; 520 } 521 522 VN_RELE(svp); 523 *vpp = vp; 524 return (0); 525 526 done: 527 VN_RELE(svp); 528 done2: 529 VN_RELE(vp); 530 return (error); 531 } 532 533 /* 534 * Verify peer address for connect and sendto/sendmsg. 535 * Since sendto/sendmsg would not get synchronous errors from the transport 536 * provider we have to do these ugly checks in the socket layer to 537 * preserve compatibility with SunOS 4.X. 
538 */ 539 int 540 so_addr_verify(struct sonode *so, const struct sockaddr *name, 541 socklen_t namelen) 542 { 543 int family; 544 545 dprintso(so, 1, ("so_addr_verify(%p, %p, %d)\n", 546 (void *)so, (void *)name, namelen)); 547 548 ASSERT(name != NULL); 549 550 family = so->so_family; 551 switch (family) { 552 case AF_INET: 553 if (name->sa_family != family) { 554 eprintsoline(so, EAFNOSUPPORT); 555 return (EAFNOSUPPORT); 556 } 557 if (namelen != (socklen_t)sizeof (struct sockaddr_in)) { 558 eprintsoline(so, EINVAL); 559 return (EINVAL); 560 } 561 break; 562 case AF_INET6: { 563 #ifdef DEBUG 564 struct sockaddr_in6 *sin6; 565 #endif /* DEBUG */ 566 567 if (name->sa_family != family) { 568 eprintsoline(so, EAFNOSUPPORT); 569 return (EAFNOSUPPORT); 570 } 571 if (namelen != (socklen_t)sizeof (struct sockaddr_in6)) { 572 eprintsoline(so, EINVAL); 573 return (EINVAL); 574 } 575 #ifdef DEBUG 576 /* Verify that apps don't forget to clear sin6_scope_id etc */ 577 sin6 = (struct sockaddr_in6 *)name; 578 if (sin6->sin6_scope_id != 0 && 579 !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) { 580 zcmn_err(getzoneid(), CE_WARN, 581 "connect/send* with uninitialized sin6_scope_id " 582 "(%d) on socket. Pid = %d\n", 583 (int)sin6->sin6_scope_id, (int)curproc->p_pid); 584 } 585 #endif /* DEBUG */ 586 break; 587 } 588 case AF_UNIX: 589 if (SOTOTPI(so)->sti_faddr_noxlate) { 590 return (0); 591 } 592 if (namelen < (socklen_t)sizeof (short)) { 593 eprintsoline(so, ENOENT); 594 return (ENOENT); 595 } 596 if (name->sa_family != family) { 597 eprintsoline(so, EAFNOSUPPORT); 598 return (EAFNOSUPPORT); 599 } 600 /* MAXPATHLEN + soun_family + nul termination */ 601 if (namelen > (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) { 602 eprintsoline(so, ENAMETOOLONG); 603 return (ENAMETOOLONG); 604 } 605 606 break; 607 608 default: 609 /* 610 * Default is don't do any length or sa_family check 611 * to allow non-sockaddr style addresses. 
612 */ 613 break; 614 } 615 616 return (0); 617 } 618 619 620 /* 621 * Translate an AF_UNIX sockaddr_un to the transport internal name. 622 * Assumes caller has called so_addr_verify first. 623 */ 624 /*ARGSUSED*/ 625 int 626 so_ux_addr_xlate(struct sonode *so, struct sockaddr *name, 627 socklen_t namelen, int checkaccess, 628 void **addrp, socklen_t *addrlenp) 629 { 630 int error; 631 struct sockaddr_un *soun; 632 vnode_t *vp; 633 void *addr; 634 socklen_t addrlen; 635 sotpi_info_t *sti = SOTOTPI(so); 636 637 dprintso(so, 1, ("so_ux_addr_xlate(%p, %p, %d, %d)\n", 638 (void *)so, (void *)name, namelen, checkaccess)); 639 640 ASSERT(name != NULL); 641 ASSERT(so->so_family == AF_UNIX); 642 ASSERT(!sti->sti_faddr_noxlate); 643 ASSERT(namelen >= (socklen_t)sizeof (short)); 644 ASSERT(name->sa_family == AF_UNIX); 645 soun = (struct sockaddr_un *)name; 646 /* 647 * Lookup vnode for the specified path name and verify that 648 * it is a socket. 649 */ 650 error = so_ux_lookup(so, soun, checkaccess, &vp); 651 if (error) { 652 eprintsoline(so, error); 653 return (error); 654 } 655 /* 656 * Use the address of the peer vnode as the address to send 657 * to. We release the peer vnode here. In case it has been 658 * closed by the time the T_CONN_REQ or T_UNIDATA_REQ reaches the 659 * transport the message will get an error or be dropped. 660 */ 661 sti->sti_ux_faddr.soua_vp = vp; 662 sti->sti_ux_faddr.soua_magic = SOU_MAGIC_EXPLICIT; 663 addr = &sti->sti_ux_faddr; 664 addrlen = (socklen_t)sizeof (sti->sti_ux_faddr); 665 dprintso(so, 1, ("ux_xlate UNIX: addrlen %d, vp %p\n", 666 addrlen, (void *)vp)); 667 VN_RELE(vp); 668 *addrp = addr; 669 *addrlenp = (socklen_t)addrlen; 670 return (0); 671 } 672 673 /* 674 * Esballoc free function for messages that contain SO_FILEP option. 675 * Decrement the reference count on the file pointers using closef. 
676 */ 677 void 678 fdbuf_free(struct fdbuf *fdbuf) 679 { 680 int i; 681 struct file *fp; 682 683 dprint(1, ("fdbuf_free: %d fds\n", fdbuf->fd_numfd)); 684 for (i = 0; i < fdbuf->fd_numfd; i++) { 685 /* 686 * We need pointer size alignment for fd_fds. On a LP64 687 * kernel, the required alignment is 8 bytes while 688 * the option headers and values are only 4 bytes 689 * aligned. So its safer to do a bcopy compared to 690 * assigning fdbuf->fd_fds[i] to fp. 691 */ 692 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp)); 693 dprint(1, ("fdbuf_free: [%d] = %p\n", i, (void *)fp)); 694 (void) closef(fp); 695 } 696 if (fdbuf->fd_ebuf != NULL) 697 kmem_free(fdbuf->fd_ebuf, fdbuf->fd_ebuflen); 698 kmem_free(fdbuf, fdbuf->fd_size); 699 } 700 701 /* 702 * Allocate an esballoc'ed message for AF_UNIX file descriptor passing. 703 * Waits if memory is not available. 704 */ 705 mblk_t * 706 fdbuf_allocmsg(int size, struct fdbuf *fdbuf) 707 { 708 uchar_t *buf; 709 mblk_t *mp; 710 711 dprint(1, ("fdbuf_allocmsg: size %d, %d fds\n", size, fdbuf->fd_numfd)); 712 buf = kmem_alloc(size, KM_SLEEP); 713 fdbuf->fd_ebuf = (caddr_t)buf; 714 fdbuf->fd_ebuflen = size; 715 fdbuf->fd_frtn.free_func = fdbuf_free; 716 fdbuf->fd_frtn.free_arg = (caddr_t)fdbuf; 717 718 mp = esballoc_wait(buf, size, BPRI_MED, &fdbuf->fd_frtn); 719 mp->b_datap->db_type = M_PROTO; 720 return (mp); 721 } 722 723 /* 724 * Extract file descriptors from a fdbuf. 725 * Return list in rights/rightslen. 726 */ 727 /*ARGSUSED*/ 728 static int 729 fdbuf_extract(struct fdbuf *fdbuf, void *rights, int rightslen) 730 { 731 int i, fd; 732 int *rp; 733 struct file *fp; 734 int numfd; 735 736 dprint(1, ("fdbuf_extract: %d fds, len %d\n", 737 fdbuf->fd_numfd, rightslen)); 738 739 numfd = fdbuf->fd_numfd; 740 ASSERT(rightslen == numfd * (int)sizeof (int)); 741 742 /* 743 * Allocate a file descriptor and increment the f_count. 744 * The latter is needed since we always call fdbuf_free 745 * which performs a closef. 
746 */ 747 rp = (int *)rights; 748 for (i = 0; i < numfd; i++) { 749 if ((fd = ufalloc(0)) == -1) 750 goto cleanup; 751 /* 752 * We need pointer size alignment for fd_fds. On a LP64 753 * kernel, the required alignment is 8 bytes while 754 * the option headers and values are only 4 bytes 755 * aligned. So its safer to do a bcopy compared to 756 * assigning fdbuf->fd_fds[i] to fp. 757 */ 758 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp)); 759 mutex_enter(&fp->f_tlock); 760 fp->f_count++; 761 mutex_exit(&fp->f_tlock); 762 setf(fd, fp); 763 *rp++ = fd; 764 if (audit_active) 765 audit_fdrecv(fd, fp); 766 dprint(1, ("fdbuf_extract: [%d] = %d, %p refcnt %d\n", 767 i, fd, (void *)fp, fp->f_count)); 768 } 769 return (0); 770 771 cleanup: 772 /* 773 * Undo whatever partial work the loop above has done. 774 */ 775 { 776 int j; 777 778 rp = (int *)rights; 779 for (j = 0; j < i; j++) { 780 dprint(0, 781 ("fdbuf_extract: cleanup[%d] = %d\n", j, *rp)); 782 (void) closeandsetf(*rp++, NULL); 783 } 784 } 785 786 return (EMFILE); 787 } 788 789 /* 790 * Insert file descriptors into an fdbuf. 791 * Returns a kmem_alloc'ed fdbuf. The fdbuf should be freed 792 * by calling fdbuf_free(). 
793 */ 794 int 795 fdbuf_create(void *rights, int rightslen, struct fdbuf **fdbufp) 796 { 797 int numfd, i; 798 int *fds; 799 struct file *fp; 800 struct fdbuf *fdbuf; 801 int fdbufsize; 802 803 dprint(1, ("fdbuf_create: len %d\n", rightslen)); 804 805 numfd = rightslen / (int)sizeof (int); 806 807 fdbufsize = (int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *)); 808 fdbuf = kmem_alloc(fdbufsize, KM_SLEEP); 809 fdbuf->fd_size = fdbufsize; 810 fdbuf->fd_numfd = 0; 811 fdbuf->fd_ebuf = NULL; 812 fdbuf->fd_ebuflen = 0; 813 fds = (int *)rights; 814 for (i = 0; i < numfd; i++) { 815 if ((fp = getf(fds[i])) == NULL) { 816 fdbuf_free(fdbuf); 817 return (EBADF); 818 } 819 dprint(1, ("fdbuf_create: [%d] = %d, %p refcnt %d\n", 820 i, fds[i], (void *)fp, fp->f_count)); 821 mutex_enter(&fp->f_tlock); 822 fp->f_count++; 823 mutex_exit(&fp->f_tlock); 824 /* 825 * The maximum alignment for fdbuf (or any option header 826 * and its value) it 4 bytes. On a LP64 kernel, the alignment 827 * is not sufficient for pointers (fd_fds in this case). Since 828 * we just did a kmem_alloc (we get a double word alignment), 829 * we don't need to do anything on the send side (we loose 830 * the double word alignment because fdbuf goes after an 831 * option header (eg T_unitdata_req) which is only 4 byte 832 * aligned). We take care of this when we extract the file 833 * descriptor in fdbuf_extract or fdbuf_free. 
834 */ 835 fdbuf->fd_fds[i] = fp; 836 fdbuf->fd_numfd++; 837 releasef(fds[i]); 838 if (audit_active) 839 audit_fdsend(fds[i], fp, 0); 840 } 841 *fdbufp = fdbuf; 842 return (0); 843 } 844 845 static int 846 fdbuf_optlen(int rightslen) 847 { 848 int numfd; 849 850 numfd = rightslen / (int)sizeof (int); 851 852 return ((int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *))); 853 } 854 855 static t_uscalar_t 856 fdbuf_cmsglen(int fdbuflen) 857 { 858 return (t_uscalar_t)((fdbuflen - FDBUF_HDRSIZE) / 859 (int)sizeof (struct file *) * (int)sizeof (int)); 860 } 861 862 863 /* 864 * Return non-zero if the mblk and fdbuf are consistent. 865 */ 866 static int 867 fdbuf_verify(mblk_t *mp, struct fdbuf *fdbuf, int fdbuflen) 868 { 869 if (fdbuflen >= FDBUF_HDRSIZE && 870 fdbuflen == fdbuf->fd_size) { 871 frtn_t *frp = mp->b_datap->db_frtnp; 872 /* 873 * Check that the SO_FILEP portion of the 874 * message has not been modified by 875 * the loopback transport. The sending sockfs generates 876 * a message that is esballoc'ed with the free function 877 * being fdbuf_free() and where free_arg contains the 878 * identical information as the SO_FILEP content. 879 * 880 * If any of these constraints are not satisfied we 881 * silently ignore the option. 882 */ 883 ASSERT(mp); 884 if (frp != NULL && 885 frp->free_func == fdbuf_free && 886 frp->free_arg != NULL && 887 bcmp(frp->free_arg, fdbuf, fdbuflen) == 0) { 888 dprint(1, ("fdbuf_verify: fdbuf %p len %d\n", 889 (void *)fdbuf, fdbuflen)); 890 return (1); 891 } else { 892 zcmn_err(getzoneid(), CE_WARN, 893 "sockfs: mismatched fdbuf content (%p)", 894 (void *)mp); 895 return (0); 896 } 897 } else { 898 zcmn_err(getzoneid(), CE_WARN, 899 "sockfs: mismatched fdbuf len %d, %d\n", 900 fdbuflen, fdbuf->fd_size); 901 return (0); 902 } 903 } 904 905 /* 906 * When the file descriptors returned by sorecvmsg can not be passed 907 * to the application this routine will cleanup the references on 908 * the files. 
Start at startoff bytes into the buffer. 909 */ 910 static void 911 close_fds(void *fdbuf, int fdbuflen, int startoff) 912 { 913 int *fds = (int *)fdbuf; 914 int numfd = fdbuflen / (int)sizeof (int); 915 int i; 916 917 dprint(1, ("close_fds(%p, %d, %d)\n", fdbuf, fdbuflen, startoff)); 918 919 for (i = 0; i < numfd; i++) { 920 if (startoff < 0) 921 startoff = 0; 922 if (startoff < (int)sizeof (int)) { 923 /* 924 * This file descriptor is partially or fully after 925 * the offset 926 */ 927 dprint(0, 928 ("close_fds: cleanup[%d] = %d\n", i, fds[i])); 929 (void) closeandsetf(fds[i], NULL); 930 } 931 startoff -= (int)sizeof (int); 932 } 933 } 934 935 /* 936 * Close all file descriptors contained in the control part starting at 937 * the startoffset. 938 */ 939 void 940 so_closefds(void *control, t_uscalar_t controllen, int oldflg, 941 int startoff) 942 { 943 struct cmsghdr *cmsg; 944 945 if (control == NULL) 946 return; 947 948 if (oldflg) { 949 close_fds(control, controllen, startoff); 950 return; 951 } 952 /* Scan control part for file descriptors. */ 953 for (cmsg = (struct cmsghdr *)control; 954 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 955 cmsg = CMSG_NEXT(cmsg)) { 956 if (cmsg->cmsg_level == SOL_SOCKET && 957 cmsg->cmsg_type == SCM_RIGHTS) { 958 close_fds(CMSG_CONTENT(cmsg), 959 (int)CMSG_CONTENTLEN(cmsg), 960 startoff - (int)sizeof (struct cmsghdr)); 961 } 962 startoff -= cmsg->cmsg_len; 963 } 964 } 965 966 /* 967 * Returns a pointer/length for the file descriptors contained 968 * in the control buffer. Returns with *fdlenp == -1 if there are no 969 * file descriptor options present. This is different than there being 970 * a zero-length file descriptor option. 971 * Fail if there are multiple SCM_RIGHT cmsgs. 
972 */ 973 int 974 so_getfdopt(void *control, t_uscalar_t controllen, int oldflg, 975 void **fdsp, int *fdlenp) 976 { 977 struct cmsghdr *cmsg; 978 void *fds; 979 int fdlen; 980 981 if (control == NULL) { 982 *fdsp = NULL; 983 *fdlenp = -1; 984 return (0); 985 } 986 987 if (oldflg) { 988 *fdsp = control; 989 if (controllen == 0) 990 *fdlenp = -1; 991 else 992 *fdlenp = controllen; 993 dprint(1, ("so_getfdopt: old %d\n", *fdlenp)); 994 return (0); 995 } 996 997 fds = NULL; 998 fdlen = 0; 999 1000 for (cmsg = (struct cmsghdr *)control; 1001 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 1002 cmsg = CMSG_NEXT(cmsg)) { 1003 if (cmsg->cmsg_level == SOL_SOCKET && 1004 cmsg->cmsg_type == SCM_RIGHTS) { 1005 if (fds != NULL) 1006 return (EINVAL); 1007 fds = CMSG_CONTENT(cmsg); 1008 fdlen = (int)CMSG_CONTENTLEN(cmsg); 1009 dprint(1, ("so_getfdopt: new %lu\n", 1010 (size_t)CMSG_CONTENTLEN(cmsg))); 1011 } 1012 } 1013 if (fds == NULL) { 1014 dprint(1, ("so_getfdopt: NONE\n")); 1015 *fdlenp = -1; 1016 } else 1017 *fdlenp = fdlen; 1018 *fdsp = fds; 1019 return (0); 1020 } 1021 1022 /* 1023 * Return the length of the options including any file descriptor options. 
1024 */ 1025 t_uscalar_t 1026 so_optlen(void *control, t_uscalar_t controllen, int oldflg) 1027 { 1028 struct cmsghdr *cmsg; 1029 t_uscalar_t optlen = 0; 1030 t_uscalar_t len; 1031 1032 if (control == NULL) 1033 return (0); 1034 1035 if (oldflg) 1036 return ((t_uscalar_t)(sizeof (struct T_opthdr) + 1037 fdbuf_optlen(controllen))); 1038 1039 for (cmsg = (struct cmsghdr *)control; 1040 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 1041 cmsg = CMSG_NEXT(cmsg)) { 1042 if (cmsg->cmsg_level == SOL_SOCKET && 1043 cmsg->cmsg_type == SCM_RIGHTS) { 1044 len = fdbuf_optlen((int)CMSG_CONTENTLEN(cmsg)); 1045 } else { 1046 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg); 1047 } 1048 optlen += (t_uscalar_t)(_TPI_ALIGN_TOPT(len) + 1049 sizeof (struct T_opthdr)); 1050 } 1051 dprint(1, ("so_optlen: controllen %d, flg %d -> optlen %d\n", 1052 controllen, oldflg, optlen)); 1053 return (optlen); 1054 } 1055 1056 /* 1057 * Copy options from control to the mblk. Skip any file descriptor options. 1058 */ 1059 void 1060 so_cmsg2opt(void *control, t_uscalar_t controllen, int oldflg, mblk_t *mp) 1061 { 1062 struct T_opthdr toh; 1063 struct cmsghdr *cmsg; 1064 1065 if (control == NULL) 1066 return; 1067 1068 if (oldflg) { 1069 /* No real options - caller has handled file descriptors */ 1070 return; 1071 } 1072 for (cmsg = (struct cmsghdr *)control; 1073 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 1074 cmsg = CMSG_NEXT(cmsg)) { 1075 /* 1076 * Note: The caller handles file descriptors prior 1077 * to calling this function. 
1078 */ 1079 t_uscalar_t len; 1080 1081 if (cmsg->cmsg_level == SOL_SOCKET && 1082 cmsg->cmsg_type == SCM_RIGHTS) 1083 continue; 1084 1085 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg); 1086 toh.level = cmsg->cmsg_level; 1087 toh.name = cmsg->cmsg_type; 1088 toh.len = len + (t_uscalar_t)sizeof (struct T_opthdr); 1089 toh.status = 0; 1090 1091 soappendmsg(mp, &toh, sizeof (toh)); 1092 soappendmsg(mp, CMSG_CONTENT(cmsg), len); 1093 mp->b_wptr += _TPI_ALIGN_TOPT(len) - len; 1094 ASSERT(mp->b_wptr <= mp->b_datap->db_lim); 1095 } 1096 } 1097 1098 /* 1099 * Return the length of the control message derived from the options. 1100 * Exclude SO_SRCADDR and SO_UNIX_CLOSE options. Include SO_FILEP. 1101 * When oldflg is set only include SO_FILEP. 1102 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen 1103 * allocates the space that so_opt2cmsg fills. If one changes, the other should 1104 * also be checked for any possible impacts. 1105 */ 1106 t_uscalar_t 1107 so_cmsglen(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg) 1108 { 1109 t_uscalar_t cmsglen = 0; 1110 struct T_opthdr *tohp; 1111 t_uscalar_t len; 1112 t_uscalar_t last_roundup = 0; 1113 1114 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1115 1116 for (tohp = (struct T_opthdr *)opt; 1117 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1118 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1119 dprint(1, ("so_cmsglen: level 0x%x, name %d, len %d\n", 1120 tohp->level, tohp->name, tohp->len)); 1121 if (tohp->level == SOL_SOCKET && 1122 (tohp->name == SO_SRCADDR || 1123 tohp->name == SO_UNIX_CLOSE)) { 1124 continue; 1125 } 1126 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) { 1127 struct fdbuf *fdbuf; 1128 int fdbuflen; 1129 1130 fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp); 1131 fdbuflen = (int)_TPI_TOPT_DATALEN(tohp); 1132 1133 if (!fdbuf_verify(mp, fdbuf, fdbuflen)) 1134 continue; 1135 if (oldflg) { 1136 cmsglen += fdbuf_cmsglen(fdbuflen); 1137 continue; 1138 } 1139 len = fdbuf_cmsglen(fdbuflen); 
1140 } else if (tohp->level == SOL_SOCKET && 1141 tohp->name == SCM_TIMESTAMP) { 1142 if (oldflg) 1143 continue; 1144 1145 if (get_udatamodel() == DATAMODEL_NATIVE) { 1146 len = sizeof (struct timeval); 1147 } else { 1148 len = sizeof (struct timeval32); 1149 } 1150 } else { 1151 if (oldflg) 1152 continue; 1153 len = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp); 1154 } 1155 /* 1156 * Exclude roundup for last option to not set 1157 * MSG_CTRUNC when the cmsg fits but the padding doesn't fit. 1158 */ 1159 last_roundup = (t_uscalar_t) 1160 (ROUNDUP_cmsglen(len + (int)sizeof (struct cmsghdr)) - 1161 (len + (int)sizeof (struct cmsghdr))); 1162 cmsglen += (t_uscalar_t)(len + (int)sizeof (struct cmsghdr)) + 1163 last_roundup; 1164 } 1165 cmsglen -= last_roundup; 1166 dprint(1, ("so_cmsglen: optlen %d, flg %d -> cmsglen %d\n", 1167 optlen, oldflg, cmsglen)); 1168 return (cmsglen); 1169 } 1170 1171 /* 1172 * Copy options from options to the control. Convert SO_FILEP to 1173 * file descriptors. 1174 * Returns errno or zero. 1175 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen 1176 * allocates the space that so_opt2cmsg fills. If one changes, the other should 1177 * also be checked for any possible impacts. 
1178 */ 1179 int 1180 so_opt2cmsg(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg, 1181 void *control, t_uscalar_t controllen) 1182 { 1183 struct T_opthdr *tohp; 1184 struct cmsghdr *cmsg; 1185 struct fdbuf *fdbuf; 1186 int fdbuflen; 1187 int error; 1188 #if defined(DEBUG) || defined(__lint) 1189 struct cmsghdr *cend = (struct cmsghdr *) 1190 (((uint8_t *)control) + ROUNDUP_cmsglen(controllen)); 1191 #endif 1192 cmsg = (struct cmsghdr *)control; 1193 1194 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1195 1196 for (tohp = (struct T_opthdr *)opt; 1197 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1198 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1199 dprint(1, ("so_opt2cmsg: level 0x%x, name %d, len %d\n", 1200 tohp->level, tohp->name, tohp->len)); 1201 1202 if (tohp->level == SOL_SOCKET && 1203 (tohp->name == SO_SRCADDR || 1204 tohp->name == SO_UNIX_CLOSE)) { 1205 continue; 1206 } 1207 ASSERT((uintptr_t)cmsg <= (uintptr_t)control + controllen); 1208 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) { 1209 fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp); 1210 fdbuflen = (int)_TPI_TOPT_DATALEN(tohp); 1211 1212 if (!fdbuf_verify(mp, fdbuf, fdbuflen)) 1213 return (EPROTO); 1214 if (oldflg) { 1215 error = fdbuf_extract(fdbuf, control, 1216 (int)controllen); 1217 if (error != 0) 1218 return (error); 1219 continue; 1220 } else { 1221 int fdlen; 1222 1223 fdlen = (int)fdbuf_cmsglen( 1224 (int)_TPI_TOPT_DATALEN(tohp)); 1225 1226 cmsg->cmsg_level = tohp->level; 1227 cmsg->cmsg_type = SCM_RIGHTS; 1228 cmsg->cmsg_len = (socklen_t)(fdlen + 1229 sizeof (struct cmsghdr)); 1230 1231 error = fdbuf_extract(fdbuf, 1232 CMSG_CONTENT(cmsg), fdlen); 1233 if (error != 0) 1234 return (error); 1235 } 1236 } else if (tohp->level == SOL_SOCKET && 1237 tohp->name == SCM_TIMESTAMP) { 1238 timestruc_t *timestamp; 1239 1240 if (oldflg) 1241 continue; 1242 1243 cmsg->cmsg_level = tohp->level; 1244 cmsg->cmsg_type = tohp->name; 1245 1246 timestamp = 1247 (timestruc_t 
*)P2ROUNDUP((intptr_t)&tohp[1], 1248 sizeof (intptr_t)); 1249 1250 if (get_udatamodel() == DATAMODEL_NATIVE) { 1251 struct timeval tv; 1252 1253 cmsg->cmsg_len = sizeof (struct timeval) + 1254 sizeof (struct cmsghdr); 1255 tv.tv_sec = timestamp->tv_sec; 1256 tv.tv_usec = timestamp->tv_nsec / 1257 (NANOSEC / MICROSEC); 1258 /* 1259 * on LP64 systems, the struct timeval in 1260 * the destination will not be 8-byte aligned, 1261 * so use bcopy to avoid alignment trouble 1262 */ 1263 bcopy(&tv, CMSG_CONTENT(cmsg), sizeof (tv)); 1264 } else { 1265 struct timeval32 *time32; 1266 1267 cmsg->cmsg_len = sizeof (struct timeval32) + 1268 sizeof (struct cmsghdr); 1269 time32 = (struct timeval32 *)CMSG_CONTENT(cmsg); 1270 time32->tv_sec = (time32_t)timestamp->tv_sec; 1271 time32->tv_usec = 1272 (int32_t)(timestamp->tv_nsec / 1273 (NANOSEC / MICROSEC)); 1274 } 1275 1276 } else { 1277 if (oldflg) 1278 continue; 1279 1280 cmsg->cmsg_level = tohp->level; 1281 cmsg->cmsg_type = tohp->name; 1282 cmsg->cmsg_len = (socklen_t)(_TPI_TOPT_DATALEN(tohp) + 1283 sizeof (struct cmsghdr)); 1284 1285 /* copy content to control data part */ 1286 bcopy(&tohp[1], CMSG_CONTENT(cmsg), 1287 CMSG_CONTENTLEN(cmsg)); 1288 } 1289 /* move to next CMSG structure! */ 1290 cmsg = CMSG_NEXT(cmsg); 1291 } 1292 dprint(1, ("so_opt2cmsg: buf %p len %d; cend %p; final cmsg %p\n", 1293 control, controllen, (void *)cend, (void *)cmsg)); 1294 ASSERT(cmsg <= cend); 1295 return (0); 1296 } 1297 1298 /* 1299 * Extract the SO_SRCADDR option value if present. 
1300 */ 1301 void 1302 so_getopt_srcaddr(void *opt, t_uscalar_t optlen, void **srcp, 1303 t_uscalar_t *srclenp) 1304 { 1305 struct T_opthdr *tohp; 1306 1307 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1308 1309 ASSERT(srcp != NULL && srclenp != NULL); 1310 *srcp = NULL; 1311 *srclenp = 0; 1312 1313 for (tohp = (struct T_opthdr *)opt; 1314 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1315 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1316 dprint(1, ("so_getopt_srcaddr: level 0x%x, name %d, len %d\n", 1317 tohp->level, tohp->name, tohp->len)); 1318 if (tohp->level == SOL_SOCKET && 1319 tohp->name == SO_SRCADDR) { 1320 *srcp = _TPI_TOPT_DATA(tohp); 1321 *srclenp = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp); 1322 } 1323 } 1324 } 1325 1326 /* 1327 * Verify if the SO_UNIX_CLOSE option is present. 1328 */ 1329 int 1330 so_getopt_unix_close(void *opt, t_uscalar_t optlen) 1331 { 1332 struct T_opthdr *tohp; 1333 1334 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1335 1336 for (tohp = (struct T_opthdr *)opt; 1337 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1338 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1339 dprint(1, 1340 ("so_getopt_unix_close: level 0x%x, name %d, len %d\n", 1341 tohp->level, tohp->name, tohp->len)); 1342 if (tohp->level == SOL_SOCKET && 1343 tohp->name == SO_UNIX_CLOSE) 1344 return (1); 1345 } 1346 return (0); 1347 } 1348 1349 /* 1350 * Allocate an M_PROTO message. 1351 * 1352 * If allocation fails the behavior depends on sleepflg: 1353 * _ALLOC_NOSLEEP fail immediately 1354 * _ALLOC_INTR sleep for memory until a signal is caught 1355 * _ALLOC_SLEEP sleep forever. Don't return NULL. 
1356 */ 1357 mblk_t * 1358 soallocproto(size_t size, int sleepflg, cred_t *cr) 1359 { 1360 mblk_t *mp; 1361 1362 /* Round up size for reuse */ 1363 size = MAX(size, 64); 1364 if (cr != NULL) 1365 mp = allocb_cred(size, cr, curproc->p_pid); 1366 else 1367 mp = allocb(size, BPRI_MED); 1368 1369 if (mp == NULL) { 1370 int error; /* Dummy - error not returned to caller */ 1371 1372 switch (sleepflg) { 1373 case _ALLOC_SLEEP: 1374 if (cr != NULL) { 1375 mp = allocb_cred_wait(size, STR_NOSIG, &error, 1376 cr, curproc->p_pid); 1377 } else { 1378 mp = allocb_wait(size, BPRI_MED, STR_NOSIG, 1379 &error); 1380 } 1381 ASSERT(mp); 1382 break; 1383 case _ALLOC_INTR: 1384 if (cr != NULL) { 1385 mp = allocb_cred_wait(size, 0, &error, cr, 1386 curproc->p_pid); 1387 } else { 1388 mp = allocb_wait(size, BPRI_MED, 0, &error); 1389 } 1390 if (mp == NULL) { 1391 /* Caught signal while sleeping for memory */ 1392 eprintline(ENOBUFS); 1393 return (NULL); 1394 } 1395 break; 1396 case _ALLOC_NOSLEEP: 1397 default: 1398 eprintline(ENOBUFS); 1399 return (NULL); 1400 } 1401 } 1402 DB_TYPE(mp) = M_PROTO; 1403 return (mp); 1404 } 1405 1406 /* 1407 * Allocate an M_PROTO message with a single component. 1408 * len is the length of buf. size is the amount to allocate. 1409 * 1410 * buf can be NULL with a non-zero len. 1411 * This results in a bzero'ed chunk being placed the message. 1412 */ 1413 mblk_t * 1414 soallocproto1(const void *buf, ssize_t len, ssize_t size, int sleepflg, 1415 cred_t *cr) 1416 { 1417 mblk_t *mp; 1418 1419 if (size == 0) 1420 size = len; 1421 1422 ASSERT(size >= len); 1423 /* Round up size for reuse */ 1424 size = MAX(size, 64); 1425 mp = soallocproto(size, sleepflg, cr); 1426 if (mp == NULL) 1427 return (NULL); 1428 mp->b_datap->db_type = M_PROTO; 1429 if (len != 0) { 1430 if (buf != NULL) 1431 bcopy(buf, mp->b_wptr, len); 1432 else 1433 bzero(mp->b_wptr, len); 1434 mp->b_wptr += len; 1435 } 1436 return (mp); 1437 } 1438 1439 /* 1440 * Append buf/len to mp. 
1441 * The caller has to ensure that there is enough room in the mblk. 1442 * 1443 * buf can be NULL with a non-zero len. 1444 * This results in a bzero'ed chunk being placed the message. 1445 */ 1446 void 1447 soappendmsg(mblk_t *mp, const void *buf, ssize_t len) 1448 { 1449 ASSERT(mp); 1450 1451 if (len != 0) { 1452 /* Assert for room left */ 1453 ASSERT(mp->b_datap->db_lim - mp->b_wptr >= len); 1454 if (buf != NULL) 1455 bcopy(buf, mp->b_wptr, len); 1456 else 1457 bzero(mp->b_wptr, len); 1458 } 1459 mp->b_wptr += len; 1460 } 1461 1462 /* 1463 * Create a message using two kernel buffers. 1464 * If size is set that will determine the allocation size (e.g. for future 1465 * soappendmsg calls). If size is zero it is derived from the buffer 1466 * lengths. 1467 */ 1468 mblk_t * 1469 soallocproto2(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2, 1470 ssize_t size, int sleepflg, cred_t *cr) 1471 { 1472 mblk_t *mp; 1473 1474 if (size == 0) 1475 size = len1 + len2; 1476 ASSERT(size >= len1 + len2); 1477 1478 mp = soallocproto1(buf1, len1, size, sleepflg, cr); 1479 if (mp) 1480 soappendmsg(mp, buf2, len2); 1481 return (mp); 1482 } 1483 1484 /* 1485 * Create a message using three kernel buffers. 1486 * If size is set that will determine the allocation size (for future 1487 * soappendmsg calls). If size is zero it is derived from the buffer 1488 * lengths. 
1489 */ 1490 mblk_t * 1491 soallocproto3(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2, 1492 const void *buf3, ssize_t len3, ssize_t size, int sleepflg, cred_t *cr) 1493 { 1494 mblk_t *mp; 1495 1496 if (size == 0) 1497 size = len1 + len2 +len3; 1498 ASSERT(size >= len1 + len2 + len3); 1499 1500 mp = soallocproto1(buf1, len1, size, sleepflg, cr); 1501 if (mp != NULL) { 1502 soappendmsg(mp, buf2, len2); 1503 soappendmsg(mp, buf3, len3); 1504 } 1505 return (mp); 1506 } 1507 1508 #ifdef DEBUG 1509 char * 1510 pr_state(uint_t state, uint_t mode) 1511 { 1512 static char buf[1024]; 1513 1514 buf[0] = 0; 1515 if (state & SS_ISCONNECTED) 1516 (void) strcat(buf, "ISCONNECTED "); 1517 if (state & SS_ISCONNECTING) 1518 (void) strcat(buf, "ISCONNECTING "); 1519 if (state & SS_ISDISCONNECTING) 1520 (void) strcat(buf, "ISDISCONNECTING "); 1521 if (state & SS_CANTSENDMORE) 1522 (void) strcat(buf, "CANTSENDMORE "); 1523 1524 if (state & SS_CANTRCVMORE) 1525 (void) strcat(buf, "CANTRCVMORE "); 1526 if (state & SS_ISBOUND) 1527 (void) strcat(buf, "ISBOUND "); 1528 if (state & SS_NDELAY) 1529 (void) strcat(buf, "NDELAY "); 1530 if (state & SS_NONBLOCK) 1531 (void) strcat(buf, "NONBLOCK "); 1532 1533 if (state & SS_ASYNC) 1534 (void) strcat(buf, "ASYNC "); 1535 if (state & SS_ACCEPTCONN) 1536 (void) strcat(buf, "ACCEPTCONN "); 1537 if (state & SS_SAVEDEOR) 1538 (void) strcat(buf, "SAVEDEOR "); 1539 1540 if (state & SS_RCVATMARK) 1541 (void) strcat(buf, "RCVATMARK "); 1542 if (state & SS_OOBPEND) 1543 (void) strcat(buf, "OOBPEND "); 1544 if (state & SS_HAVEOOBDATA) 1545 (void) strcat(buf, "HAVEOOBDATA "); 1546 if (state & SS_HADOOBDATA) 1547 (void) strcat(buf, "HADOOBDATA "); 1548 1549 if (mode & SM_PRIV) 1550 (void) strcat(buf, "PRIV "); 1551 if (mode & SM_ATOMIC) 1552 (void) strcat(buf, "ATOMIC "); 1553 if (mode & SM_ADDR) 1554 (void) strcat(buf, "ADDR "); 1555 if (mode & SM_CONNREQUIRED) 1556 (void) strcat(buf, "CONNREQUIRED "); 1557 1558 if (mode & SM_FDPASSING) 
1559 (void) strcat(buf, "FDPASSING "); 1560 if (mode & SM_EXDATA) 1561 (void) strcat(buf, "EXDATA "); 1562 if (mode & SM_OPTDATA) 1563 (void) strcat(buf, "OPTDATA "); 1564 if (mode & SM_BYTESTREAM) 1565 (void) strcat(buf, "BYTESTREAM "); 1566 return (buf); 1567 } 1568 1569 char * 1570 pr_addr(int family, struct sockaddr *addr, t_uscalar_t addrlen) 1571 { 1572 static char buf[1024]; 1573 1574 if (addr == NULL || addrlen == 0) { 1575 (void) sprintf(buf, "(len %d) %p", addrlen, (void *)addr); 1576 return (buf); 1577 } 1578 switch (family) { 1579 case AF_INET: { 1580 struct sockaddr_in sin; 1581 1582 bcopy(addr, &sin, sizeof (sin)); 1583 1584 (void) sprintf(buf, "(len %d) %x/%d", 1585 addrlen, ntohl(sin.sin_addr.s_addr), ntohs(sin.sin_port)); 1586 break; 1587 } 1588 case AF_INET6: { 1589 struct sockaddr_in6 sin6; 1590 uint16_t *piece = (uint16_t *)&sin6.sin6_addr; 1591 1592 bcopy((char *)addr, (char *)&sin6, sizeof (sin6)); 1593 (void) sprintf(buf, "(len %d) %x:%x:%x:%x:%x:%x:%x:%x/%d", 1594 addrlen, 1595 ntohs(piece[0]), ntohs(piece[1]), 1596 ntohs(piece[2]), ntohs(piece[3]), 1597 ntohs(piece[4]), ntohs(piece[5]), 1598 ntohs(piece[6]), ntohs(piece[7]), 1599 ntohs(sin6.sin6_port)); 1600 break; 1601 } 1602 case AF_UNIX: { 1603 struct sockaddr_un *soun = (struct sockaddr_un *)addr; 1604 1605 (void) sprintf(buf, "(len %d) %s", addrlen, 1606 (soun == NULL) ? "(none)" : soun->sun_path); 1607 break; 1608 } 1609 default: 1610 (void) sprintf(buf, "(unknown af %d)", family); 1611 break; 1612 } 1613 return (buf); 1614 } 1615 1616 /* The logical equivalence operator (a if-and-only-if b) */ 1617 #define EQUIV(a, b) (((a) && (b)) || (!(a) && (!(b)))) 1618 1619 /* 1620 * Verify limitations and invariants on oob state. 
1621 * Return 1 if OK, otherwise 0 so that it can be used as 1622 * ASSERT(verify_oobstate(so)); 1623 */ 1624 int 1625 so_verify_oobstate(struct sonode *so) 1626 { 1627 boolean_t havemark; 1628 1629 ASSERT(MUTEX_HELD(&so->so_lock)); 1630 1631 /* 1632 * The possible state combinations are: 1633 * 0 1634 * SS_OOBPEND 1635 * SS_OOBPEND|SS_HAVEOOBDATA 1636 * SS_OOBPEND|SS_HADOOBDATA 1637 * SS_HADOOBDATA 1638 */ 1639 switch (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA)) { 1640 case 0: 1641 case SS_OOBPEND: 1642 case SS_OOBPEND|SS_HAVEOOBDATA: 1643 case SS_OOBPEND|SS_HADOOBDATA: 1644 case SS_HADOOBDATA: 1645 break; 1646 default: 1647 printf("Bad oob state 1 (%p): state %s\n", 1648 (void *)so, pr_state(so->so_state, so->so_mode)); 1649 return (0); 1650 } 1651 1652 /* SS_RCVATMARK should only be set when SS_OOBPEND is set */ 1653 if ((so->so_state & (SS_RCVATMARK|SS_OOBPEND)) == SS_RCVATMARK) { 1654 printf("Bad oob state 2 (%p): state %s\n", 1655 (void *)so, pr_state(so->so_state, so->so_mode)); 1656 return (0); 1657 } 1658 1659 /* 1660 * (havemark != 0 or SS_RCVATMARK) iff SS_OOBPEND 1661 * For TPI, the presence of a "mark" is indicated by sti_oobsigcnt. 1662 */ 1663 havemark = (SOCK_IS_NONSTR(so)) ? 
so->so_oobmark > 0 : 1664 SOTOTPI(so)->sti_oobsigcnt > 0; 1665 1666 if (!EQUIV(havemark || (so->so_state & SS_RCVATMARK), 1667 so->so_state & SS_OOBPEND)) { 1668 printf("Bad oob state 3 (%p): state %s\n", 1669 (void *)so, pr_state(so->so_state, so->so_mode)); 1670 return (0); 1671 } 1672 1673 /* 1674 * Unless SO_OOBINLINE we have so_oobmsg != NULL iff SS_HAVEOOBDATA 1675 */ 1676 if (!(so->so_options & SO_OOBINLINE) && 1677 !EQUIV(so->so_oobmsg != NULL, so->so_state & SS_HAVEOOBDATA)) { 1678 printf("Bad oob state 4 (%p): state %s\n", 1679 (void *)so, pr_state(so->so_state, so->so_mode)); 1680 return (0); 1681 } 1682 1683 if (!SOCK_IS_NONSTR(so) && 1684 SOTOTPI(so)->sti_oobsigcnt < SOTOTPI(so)->sti_oobcnt) { 1685 printf("Bad oob state 5 (%p): counts %d/%d state %s\n", 1686 (void *)so, SOTOTPI(so)->sti_oobsigcnt, 1687 SOTOTPI(so)->sti_oobcnt, 1688 pr_state(so->so_state, so->so_mode)); 1689 return (0); 1690 } 1691 1692 return (1); 1693 } 1694 #undef EQUIV 1695 #endif /* DEBUG */ 1696 1697 /* initialize sockfs zone specific kstat related items */ 1698 void * 1699 sock_kstat_init(zoneid_t zoneid) 1700 { 1701 kstat_t *ksp; 1702 1703 ksp = kstat_create_zone("sockfs", 0, "sock_unix_list", "misc", 1704 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE|KSTAT_FLAG_VIRTUAL, zoneid); 1705 1706 if (ksp != NULL) { 1707 ksp->ks_update = sockfs_update; 1708 ksp->ks_snapshot = sockfs_snapshot; 1709 ksp->ks_lock = &socklist.sl_lock; 1710 ksp->ks_private = (void *)(uintptr_t)zoneid; 1711 kstat_install(ksp); 1712 } 1713 1714 return (ksp); 1715 } 1716 1717 /* tear down sockfs zone specific kstat related items */ 1718 /*ARGSUSED*/ 1719 void 1720 sock_kstat_fini(zoneid_t zoneid, void *arg) 1721 { 1722 kstat_t *ksp = (kstat_t *)arg; 1723 1724 if (ksp != NULL) { 1725 ASSERT(zoneid == (zoneid_t)(uintptr_t)ksp->ks_private); 1726 kstat_delete(ksp); 1727 } 1728 } 1729 1730 /* 1731 * Zones: 1732 * Note that nactive is going to be different for each zone. 
1733 * This means we require kstat to call sockfs_update and then sockfs_snapshot 1734 * for the same zone, or sockfs_snapshot will be taken into the wrong size 1735 * buffer. This is safe, but if the buffer is too small, user will not be 1736 * given details of all sockets. However, as this kstat has a ks_lock, kstat 1737 * driver will keep it locked between the update and the snapshot, so no 1738 * other process (zone) can currently get inbetween resulting in a wrong size 1739 * buffer allocation. 1740 */ 1741 static int 1742 sockfs_update(kstat_t *ksp, int rw) 1743 { 1744 uint_t nactive = 0; /* # of active AF_UNIX sockets */ 1745 struct sonode *so; /* current sonode on socklist */ 1746 zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private; 1747 1748 ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid()); 1749 1750 if (rw == KSTAT_WRITE) { /* bounce all writes */ 1751 return (EACCES); 1752 } 1753 1754 for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) { 1755 if (so->so_count != 0 && so->so_zoneid == myzoneid) { 1756 nactive++; 1757 } 1758 } 1759 ksp->ks_ndata = nactive; 1760 ksp->ks_data_size = nactive * sizeof (struct k_sockinfo); 1761 1762 return (0); 1763 } 1764 1765 static int 1766 sockfs_snapshot(kstat_t *ksp, void *buf, int rw) 1767 { 1768 int ns; /* # of sonodes we've copied */ 1769 struct sonode *so; /* current sonode on socklist */ 1770 struct k_sockinfo *pksi; /* where we put sockinfo data */ 1771 t_uscalar_t sn_len; /* soa_len */ 1772 zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private; 1773 sotpi_info_t *sti; 1774 1775 ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid()); 1776 1777 ksp->ks_snaptime = gethrtime(); 1778 1779 if (rw == KSTAT_WRITE) { /* bounce all writes */ 1780 return (EACCES); 1781 } 1782 1783 /* 1784 * for each sonode on the socklist, we massage the important 1785 * info into buf, in k_sockinfo format. 
1786 */ 1787 pksi = (struct k_sockinfo *)buf; 1788 ns = 0; 1789 for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) { 1790 /* only stuff active sonodes and the same zone: */ 1791 if (so->so_count == 0 || so->so_zoneid != myzoneid) { 1792 continue; 1793 } 1794 1795 /* 1796 * If the sonode was activated between the update and the 1797 * snapshot, we're done - as this is only a snapshot. 1798 */ 1799 if ((caddr_t)(pksi) >= (caddr_t)buf + ksp->ks_data_size) { 1800 break; 1801 } 1802 1803 sti = SOTOTPI(so); 1804 /* copy important info into buf: */ 1805 pksi->ks_si.si_size = sizeof (struct k_sockinfo); 1806 pksi->ks_si.si_family = so->so_family; 1807 pksi->ks_si.si_type = so->so_type; 1808 pksi->ks_si.si_flag = so->so_flag; 1809 pksi->ks_si.si_state = so->so_state; 1810 pksi->ks_si.si_serv_type = sti->sti_serv_type; 1811 pksi->ks_si.si_ux_laddr_sou_magic = 1812 sti->sti_ux_laddr.soua_magic; 1813 pksi->ks_si.si_ux_faddr_sou_magic = 1814 sti->sti_ux_faddr.soua_magic; 1815 pksi->ks_si.si_laddr_soa_len = sti->sti_laddr.soa_len; 1816 pksi->ks_si.si_faddr_soa_len = sti->sti_faddr.soa_len; 1817 pksi->ks_si.si_szoneid = so->so_zoneid; 1818 pksi->ks_si.si_faddr_noxlate = sti->sti_faddr_noxlate; 1819 1820 mutex_enter(&so->so_lock); 1821 1822 if (sti->sti_laddr_sa != NULL) { 1823 ASSERT(sti->sti_laddr_sa->sa_data != NULL); 1824 sn_len = sti->sti_laddr_len; 1825 ASSERT(sn_len <= sizeof (short) + 1826 sizeof (pksi->ks_si.si_laddr_sun_path)); 1827 1828 pksi->ks_si.si_laddr_family = 1829 sti->sti_laddr_sa->sa_family; 1830 if (sn_len != 0) { 1831 /* AF_UNIX socket names are NULL terminated */ 1832 (void) strncpy(pksi->ks_si.si_laddr_sun_path, 1833 sti->sti_laddr_sa->sa_data, 1834 sizeof (pksi->ks_si.si_laddr_sun_path)); 1835 sn_len = strlen(pksi->ks_si.si_laddr_sun_path); 1836 } 1837 pksi->ks_si.si_laddr_sun_path[sn_len] = 0; 1838 } 1839 1840 if (sti->sti_faddr_sa != NULL) { 1841 ASSERT(sti->sti_faddr_sa->sa_data != NULL); 1842 sn_len = sti->sti_faddr_len; 1843 
ASSERT(sn_len <= sizeof (short) + 1844 sizeof (pksi->ks_si.si_faddr_sun_path)); 1845 1846 pksi->ks_si.si_faddr_family = 1847 sti->sti_faddr_sa->sa_family; 1848 if (sn_len != 0) { 1849 (void) strncpy(pksi->ks_si.si_faddr_sun_path, 1850 sti->sti_faddr_sa->sa_data, 1851 sizeof (pksi->ks_si.si_faddr_sun_path)); 1852 sn_len = strlen(pksi->ks_si.si_faddr_sun_path); 1853 } 1854 pksi->ks_si.si_faddr_sun_path[sn_len] = 0; 1855 } 1856 1857 mutex_exit(&so->so_lock); 1858 1859 (void) sprintf(pksi->ks_straddr[0], "%p", (void *)so); 1860 (void) sprintf(pksi->ks_straddr[1], "%p", 1861 (void *)sti->sti_ux_laddr.soua_vp); 1862 (void) sprintf(pksi->ks_straddr[2], "%p", 1863 (void *)sti->sti_ux_faddr.soua_vp); 1864 1865 ns++; 1866 pksi++; 1867 } 1868 1869 ksp->ks_ndata = ns; 1870 return (0); 1871 } 1872 1873 ssize_t 1874 soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size) 1875 { 1876 struct uio auio; 1877 struct iovec aiov[MSG_MAXIOVLEN]; 1878 register vnode_t *vp; 1879 int ioflag, rwflag; 1880 ssize_t cnt; 1881 int error = 0; 1882 int iovcnt = 0; 1883 short fflag; 1884 1885 vp = fp->f_vnode; 1886 fflag = fp->f_flag; 1887 1888 rwflag = 0; 1889 aiov[0].iov_base = (caddr_t)buf; 1890 aiov[0].iov_len = size; 1891 iovcnt = 1; 1892 cnt = (ssize_t)size; 1893 (void) VOP_RWLOCK(vp, rwflag, NULL); 1894 1895 auio.uio_loffset = fileoff; 1896 auio.uio_iov = aiov; 1897 auio.uio_iovcnt = iovcnt; 1898 auio.uio_resid = cnt; 1899 auio.uio_segflg = UIO_SYSSPACE; 1900 auio.uio_llimit = MAXOFFSET_T; 1901 auio.uio_fmode = fflag; 1902 auio.uio_extflg = UIO_COPY_CACHED; 1903 1904 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1905 1906 /* If read sync is not asked for, filter sync flags */ 1907 if ((ioflag & FRSYNC) == 0) 1908 ioflag &= ~(FSYNC|FDSYNC); 1909 error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL); 1910 cnt -= auio.uio_resid; 1911 1912 VOP_RWUNLOCK(vp, rwflag, NULL); 1913 1914 if (error == EINTR && cnt != 0) 1915 error = 0; 1916 out: 1917 if (error != 0) { 
1918 *err = error; 1919 return (0); 1920 } else { 1921 *err = 0; 1922 return (cnt); 1923 } 1924 } 1925 1926 int 1927 so_copyin(const void *from, void *to, size_t size, int fromkernel) 1928 { 1929 if (fromkernel) { 1930 bcopy(from, to, size); 1931 return (0); 1932 } 1933 return (xcopyin(from, to, size)); 1934 } 1935 1936 int 1937 so_copyout(const void *from, void *to, size_t size, int tokernel) 1938 { 1939 if (tokernel) { 1940 bcopy(from, to, size); 1941 return (0); 1942 } 1943 return (xcopyout(from, to, size)); 1944 } 1945