1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 25 * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. 26 */ 27 28 #include <sys/types.h> 29 #include <sys/t_lock.h> 30 #include <sys/param.h> 31 #include <sys/systm.h> 32 #include <sys/buf.h> 33 #include <sys/conf.h> 34 #include <sys/cred.h> 35 #include <sys/kmem.h> 36 #include <sys/sysmacros.h> 37 #include <sys/vfs.h> 38 #include <sys/vfs_opreg.h> 39 #include <sys/vnode.h> 40 #include <sys/debug.h> 41 #include <sys/errno.h> 42 #include <sys/time.h> 43 #include <sys/file.h> 44 #include <sys/open.h> 45 #include <sys/user.h> 46 #include <sys/termios.h> 47 #include <sys/stream.h> 48 #include <sys/strsubr.h> 49 #include <sys/strsun.h> 50 #include <sys/esunddi.h> 51 #include <sys/flock.h> 52 #include <sys/modctl.h> 53 #include <sys/cmn_err.h> 54 #include <sys/mkdev.h> 55 #include <sys/pathname.h> 56 #include <sys/ddi.h> 57 #include <sys/stat.h> 58 #include <sys/fs/snode.h> 59 #include <sys/fs/dv_node.h> 60 #include <sys/zone.h> 61 62 #include <sys/socket.h> 63 #include <sys/socketvar.h> 64 #include <netinet/in.h> 65 #include <sys/un.h> 66 #include <sys/ucred.h> 67 68 #include <sys/tiuser.h> 69 #define _SUN_TPI_VERSION 2 70 #include <sys/tihdr.h> 71 72 #include <c2/audit.h> 73 74 #include <fs/sockfs/nl7c.h> 75 #include <fs/sockfs/sockcommon.h> 76 #include <fs/sockfs/sockfilter_impl.h> 77 #include <fs/sockfs/socktpi.h> 78 #include <fs/sockfs/socktpi_impl.h> 79 #include <fs/sockfs/sodirect.h> 80 81 /* 82 * Macros that operate on struct cmsghdr. 83 * The CMSG_VALID macro does not assume that the last option buffer is padded. 84 */ 85 #define CMSG_CONTENT(cmsg) (&((cmsg)[1])) 86 #define CMSG_CONTENTLEN(cmsg) ((cmsg)->cmsg_len - sizeof (struct cmsghdr)) 87 #define CMSG_VALID(cmsg, start, end) \ 88 (ISALIGNED_cmsghdr(cmsg) && \ 89 ((uintptr_t)(cmsg) >= (uintptr_t)(start)) && \ 90 ((uintptr_t)(cmsg) < (uintptr_t)(end)) && \ 91 ((ssize_t)(cmsg)->cmsg_len >= sizeof (struct cmsghdr)) && \ 92 ((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end))) 93 #define SO_LOCK_WAKEUP_TIME 3000 /* Wakeup time in milliseconds */ 94 95 dev_t sockdev; /* For fsid in getattr */ 96 int sockfs_defer_nl7c_init = 0; 97 98 struct socklist socklist; 99 100 struct kmem_cache *socket_cache; 101 102 /* 103 * sockconf_lock protects the socket configuration (socket types and 104 * socket filters) which is changed via the sockconfig system call. 105 */ 106 krwlock_t sockconf_lock; 107 108 static int sockfs_update(kstat_t *, int); 109 static int sockfs_snapshot(kstat_t *, void *, int); 110 extern smod_info_t *sotpi_smod_create(void); 111 112 extern void sendfile_init(); 113 114 extern void nl7c_init(void); 115 116 extern int modrootloaded; 117 118 /* 119 * Translate from a device pathname (e.g. "/dev/tcp") to a vnode. 120 * Returns with the vnode held. 121 */ 122 int 123 sogetvp(char *devpath, vnode_t **vpp, int uioflag) 124 { 125 struct snode *csp; 126 vnode_t *vp, *dvp; 127 major_t maj; 128 int error; 129 130 ASSERT(uioflag == UIO_SYSSPACE || uioflag == UIO_USERSPACE); 131 132 /* 133 * Lookup the underlying filesystem vnode. 134 */ 135 error = lookupname(devpath, uioflag, FOLLOW, NULLVPP, &vp); 136 if (error) 137 return (error); 138 139 /* Check that it is the correct vnode */ 140 if (vp->v_type != VCHR) { 141 VN_RELE(vp); 142 return (ENOTSOCK); 143 } 144 145 /* 146 * If devpath went through devfs, the device should already 147 * be configured. If devpath is a mknod file, however, we 148 * need to make sure the device is properly configured. 149 * To do this, we do something similar to spec_open() 150 * except that we resolve to the minor/leaf level since 151 * we need to return a vnode. 152 */ 153 csp = VTOS(VTOS(vp)->s_commonvp); 154 if (!(csp->s_flag & SDIPSET)) { 155 char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 156 error = ddi_dev_pathname(vp->v_rdev, S_IFCHR, pathname); 157 if (error == 0) 158 error = devfs_lookupname(pathname, NULLVPP, &dvp); 159 VN_RELE(vp); 160 kmem_free(pathname, MAXPATHLEN); 161 if (error != 0) 162 return (ENXIO); 163 vp = dvp; /* use the devfs vp */ 164 } 165 166 /* device is configured at this point */ 167 maj = getmajor(vp->v_rdev); 168 if (!STREAMSTAB(maj)) { 169 VN_RELE(vp); 170 return (ENOSTR); 171 } 172 173 *vpp = vp; 174 return (0); 175 } 176 177 /* 178 * Update the accessed, updated, or changed times in an sonode 179 * with the current time. 180 * 181 * Note that both SunOS 4.X and 4.4BSD sockets do not present reasonable 182 * attributes in a fstat call. (They return the current time and 0 for 183 * all timestamps, respectively.) We maintain the current timestamps 184 * here primarily so that should sockmod be popped the resulting 185 * file descriptor will behave like a stream w.r.t. the timestamps. 186 */ 187 void 188 so_update_attrs(struct sonode *so, int flag) 189 { 190 time_t now = gethrestime_sec(); 191 192 if (SOCK_IS_NONSTR(so)) 193 return; 194 195 mutex_enter(&so->so_lock); 196 so->so_flag |= flag; 197 if (flag & SOACC) 198 SOTOTPI(so)->sti_atime = now; 199 if (flag & SOMOD) 200 SOTOTPI(so)->sti_mtime = now; 201 mutex_exit(&so->so_lock); 202 } 203 204 extern so_create_func_t sock_comm_create_function; 205 extern so_destroy_func_t sock_comm_destroy_function; 206 /* 207 * Init function called when sockfs is loaded. 208 */ 209 int 210 sockinit(int fstype, char *name) 211 { 212 static const fs_operation_def_t sock_vfsops_template[] = { 213 NULL, NULL 214 }; 215 int error; 216 major_t dev; 217 char *err_str; 218 219 error = vfs_setfsops(fstype, sock_vfsops_template, NULL); 220 if (error != 0) { 221 zcmn_err(GLOBAL_ZONEID, CE_WARN, 222 "sockinit: bad vfs ops template"); 223 return (error); 224 } 225 226 error = vn_make_ops(name, socket_vnodeops_template, 227 &socket_vnodeops); 228 if (error != 0) { 229 err_str = "sockinit: bad socket vnode ops template"; 230 /* vn_make_ops() does not reset socktpi_vnodeops on failure. */ 231 socket_vnodeops = NULL; 232 goto failure; 233 } 234 235 socket_cache = kmem_cache_create("socket_cache", 236 sizeof (struct sonode), 0, sonode_constructor, 237 sonode_destructor, NULL, NULL, NULL, 0); 238 239 rw_init(&sockconf_lock, NULL, RW_DEFAULT, NULL); 240 241 error = socktpi_init(); 242 if (error != 0) { 243 err_str = NULL; 244 goto failure; 245 } 246 247 error = sod_init(); 248 if (error != 0) { 249 err_str = NULL; 250 goto failure; 251 } 252 253 /* 254 * Set up the default create and destroy functions 255 */ 256 sock_comm_create_function = socket_sonode_create; 257 sock_comm_destroy_function = socket_sonode_destroy; 258 259 /* 260 * Build initial list mapping socket parameters to vnode. 261 */ 262 smod_init(); 263 smod_add(sotpi_smod_create()); 264 265 sockparams_init(); 266 267 /* 268 * If sockets are needed before init runs /sbin/soconfig 269 * it is possible to preload the sockparams list here using 270 * calls like: 271 * sockconfig(1,2,3, "/dev/tcp", 0); 272 */ 273 274 /* 275 * Create a unique dev_t for use in so_fsid. 276 */ 277 278 if ((dev = getudev()) == (major_t)-1) 279 dev = 0; 280 sockdev = makedevice(dev, 0); 281 282 mutex_init(&socklist.sl_lock, NULL, MUTEX_DEFAULT, NULL); 283 sendfile_init(); 284 if (!modrootloaded) { 285 sockfs_defer_nl7c_init = 1; 286 } else { 287 nl7c_init(); 288 } 289 290 /* Initialize socket filters */ 291 sof_init(); 292 293 return (0); 294 295 failure: 296 (void) vfs_freevfsops_by_type(fstype); 297 if (socket_vnodeops != NULL) 298 vn_freevnodeops(socket_vnodeops); 299 if (err_str != NULL) 300 zcmn_err(GLOBAL_ZONEID, CE_WARN, err_str); 301 return (error); 302 } 303 304 /* 305 * Caller must hold the mutex. Used to set SOLOCKED. 306 */ 307 void 308 so_lock_single(struct sonode *so) 309 { 310 ASSERT(MUTEX_HELD(&so->so_lock)); 311 312 while (so->so_flag & (SOLOCKED | SOASYNC_UNBIND)) { 313 cv_wait_stop(&so->so_single_cv, &so->so_lock, 314 SO_LOCK_WAKEUP_TIME); 315 } 316 so->so_flag |= SOLOCKED; 317 } 318 319 /* 320 * Caller must hold the mutex and pass in SOLOCKED or SOASYNC_UNBIND. 321 * Used to clear SOLOCKED or SOASYNC_UNBIND. 322 */ 323 void 324 so_unlock_single(struct sonode *so, int flag) 325 { 326 ASSERT(MUTEX_HELD(&so->so_lock)); 327 ASSERT(flag & (SOLOCKED|SOASYNC_UNBIND)); 328 ASSERT((flag & ~(SOLOCKED|SOASYNC_UNBIND)) == 0); 329 ASSERT(so->so_flag & flag); 330 /* 331 * Process the T_DISCON_IND on sti_discon_ind_mp. 332 * 333 * Call to so_drain_discon_ind will result in so_lock 334 * being dropped and re-acquired later. 335 */ 336 if (!SOCK_IS_NONSTR(so)) { 337 sotpi_info_t *sti = SOTOTPI(so); 338 339 if (sti->sti_discon_ind_mp != NULL) 340 so_drain_discon_ind(so); 341 } 342 343 cv_signal(&so->so_single_cv); 344 so->so_flag &= ~flag; 345 } 346 347 /* 348 * Caller must hold the mutex. Used to set SOREADLOCKED. 349 * If the caller wants nonblocking behavior it should set fmode. 350 */ 351 int 352 so_lock_read(struct sonode *so, int fmode) 353 { 354 ASSERT(MUTEX_HELD(&so->so_lock)); 355 356 while (so->so_flag & SOREADLOCKED) { 357 if (fmode & (FNDELAY|FNONBLOCK)) 358 return (EWOULDBLOCK); 359 cv_wait_stop(&so->so_read_cv, &so->so_lock, 360 SO_LOCK_WAKEUP_TIME); 361 } 362 so->so_flag |= SOREADLOCKED; 363 return (0); 364 } 365 366 /* 367 * Like so_lock_read above but allows signals. 368 */ 369 int 370 so_lock_read_intr(struct sonode *so, int fmode) 371 { 372 ASSERT(MUTEX_HELD(&so->so_lock)); 373 374 while (so->so_flag & SOREADLOCKED) { 375 if (fmode & (FNDELAY|FNONBLOCK)) 376 return (EWOULDBLOCK); 377 if (!cv_wait_sig(&so->so_read_cv, &so->so_lock)) 378 return (EINTR); 379 } 380 so->so_flag |= SOREADLOCKED; 381 return (0); 382 } 383 384 /* 385 * Caller must hold the mutex. Used to clear SOREADLOCKED, 386 * set in so_lock_read() or so_lock_read_intr(). 387 */ 388 void 389 so_unlock_read(struct sonode *so) 390 { 391 ASSERT(MUTEX_HELD(&so->so_lock)); 392 ASSERT(so->so_flag & SOREADLOCKED); 393 394 cv_signal(&so->so_read_cv); 395 so->so_flag &= ~SOREADLOCKED; 396 } 397 398 /* 399 * Verify that the specified offset falls within the mblk and 400 * that the resulting pointer is aligned. 401 * Returns NULL if not. 402 */ 403 void * 404 sogetoff(mblk_t *mp, t_uscalar_t offset, 405 t_uscalar_t length, uint_t align_size) 406 { 407 uintptr_t ptr1, ptr2; 408 409 ASSERT(mp && mp->b_wptr >= mp->b_rptr); 410 ptr1 = (uintptr_t)mp->b_rptr + offset; 411 ptr2 = (uintptr_t)ptr1 + length; 412 if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) { 413 eprintline(0); 414 return (NULL); 415 } 416 if ((ptr1 & (align_size - 1)) != 0) { 417 eprintline(0); 418 return (NULL); 419 } 420 return ((void *)ptr1); 421 } 422 423 /* 424 * Return the AF_UNIX underlying filesystem vnode matching a given name. 425 * Makes sure the sending and the destination sonodes are compatible. 426 * The vnode is returned held. 427 * 428 * The underlying filesystem VSOCK vnode has a v_stream pointer that 429 * references the actual stream head (hence indirectly the actual sonode). 430 */ 431 static int 432 so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess, 433 vnode_t **vpp) 434 { 435 vnode_t *vp; /* Underlying filesystem vnode */ 436 vnode_t *rvp; /* real vnode */ 437 vnode_t *svp; /* sockfs vnode */ 438 struct sonode *so2; 439 int error; 440 441 dprintso(so, 1, ("so_ux_lookup(%p) name <%s>\n", (void *)so, 442 soun->sun_path)); 443 444 error = lookupname(soun->sun_path, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); 445 if (error) { 446 eprintsoline(so, error); 447 return (error); 448 } 449 450 /* 451 * Traverse lofs mounts get the real vnode 452 */ 453 if (VOP_REALVP(vp, &rvp, NULL) == 0) { 454 VN_HOLD(rvp); /* hold the real vnode */ 455 VN_RELE(vp); /* release hold from lookup */ 456 vp = rvp; 457 } 458 459 if (vp->v_type != VSOCK) { 460 error = ENOTSOCK; 461 eprintsoline(so, error); 462 goto done2; 463 } 464 465 if (checkaccess) { 466 /* 467 * Check that we have permissions to access the destination 468 * vnode. This check is not done in BSD but it is required 469 * by X/Open. 470 */ 471 if (error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL)) { 472 eprintsoline(so, error); 473 goto done2; 474 } 475 } 476 477 /* 478 * Check if the remote socket has been closed. 479 * 480 * Synchronize with vn_rele_stream by holding v_lock while traversing 481 * v_stream->sd_vnode. 482 */ 483 mutex_enter(&vp->v_lock); 484 if (vp->v_stream == NULL) { 485 mutex_exit(&vp->v_lock); 486 if (so->so_type == SOCK_DGRAM) 487 error = EDESTADDRREQ; 488 else 489 error = ECONNREFUSED; 490 491 eprintsoline(so, error); 492 goto done2; 493 } 494 ASSERT(vp->v_stream->sd_vnode); 495 svp = vp->v_stream->sd_vnode; 496 /* 497 * holding v_lock on underlying filesystem vnode and acquiring 498 * it on sockfs vnode. Assumes that no code ever attempts to 499 * acquire these locks in the reverse order. 500 */ 501 VN_HOLD(svp); 502 mutex_exit(&vp->v_lock); 503 504 if (svp->v_type != VSOCK) { 505 error = ENOTSOCK; 506 eprintsoline(so, error); 507 goto done; 508 } 509 510 so2 = VTOSO(svp); 511 512 if (so->so_type != so2->so_type) { 513 error = EPROTOTYPE; 514 eprintsoline(so, error); 515 goto done; 516 } 517 518 VN_RELE(svp); 519 *vpp = vp; 520 return (0); 521 522 done: 523 VN_RELE(svp); 524 done2: 525 VN_RELE(vp); 526 return (error); 527 } 528 529 /* 530 * Verify peer address for connect and sendto/sendmsg. 531 * Since sendto/sendmsg would not get synchronous errors from the transport 532 * provider we have to do these ugly checks in the socket layer to 533 * preserve compatibility with SunOS 4.X. 534 */ 535 int 536 so_addr_verify(struct sonode *so, const struct sockaddr *name, 537 socklen_t namelen) 538 { 539 int family; 540 541 dprintso(so, 1, ("so_addr_verify(%p, %p, %d)\n", 542 (void *)so, (void *)name, namelen)); 543 544 ASSERT(name != NULL); 545 546 family = so->so_family; 547 switch (family) { 548 case AF_INET: 549 if (name->sa_family != family) { 550 eprintsoline(so, EAFNOSUPPORT); 551 return (EAFNOSUPPORT); 552 } 553 if (namelen != (socklen_t)sizeof (struct sockaddr_in)) { 554 eprintsoline(so, EINVAL); 555 return (EINVAL); 556 } 557 break; 558 case AF_INET6: { 559 #ifdef DEBUG 560 struct sockaddr_in6 *sin6; 561 #endif /* DEBUG */ 562 563 if (name->sa_family != family) { 564 eprintsoline(so, EAFNOSUPPORT); 565 return (EAFNOSUPPORT); 566 } 567 if (namelen != (socklen_t)sizeof (struct sockaddr_in6)) { 568 eprintsoline(so, EINVAL); 569 return (EINVAL); 570 } 571 #ifdef DEBUG 572 /* Verify that apps don't forget to clear sin6_scope_id etc */ 573 sin6 = (struct sockaddr_in6 *)name; 574 if (sin6->sin6_scope_id != 0 && 575 !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) { 576 zcmn_err(getzoneid(), CE_WARN, 577 "connect/send* with uninitialized sin6_scope_id " 578 "(%d) on socket. Pid = %d\n", 579 (int)sin6->sin6_scope_id, (int)curproc->p_pid); 580 } 581 #endif /* DEBUG */ 582 break; 583 } 584 case AF_UNIX: 585 if (SOTOTPI(so)->sti_faddr_noxlate) { 586 return (0); 587 } 588 if (namelen < (socklen_t)sizeof (short)) { 589 eprintsoline(so, ENOENT); 590 return (ENOENT); 591 } 592 if (name->sa_family != family) { 593 eprintsoline(so, EAFNOSUPPORT); 594 return (EAFNOSUPPORT); 595 } 596 /* MAXPATHLEN + soun_family + nul termination */ 597 if (namelen > (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) { 598 eprintsoline(so, ENAMETOOLONG); 599 return (ENAMETOOLONG); 600 } 601 602 break; 603 604 default: 605 /* 606 * Default is don't do any length or sa_family check 607 * to allow non-sockaddr style addresses. 608 */ 609 break; 610 } 611 612 return (0); 613 } 614 615 616 /* 617 * Translate an AF_UNIX sockaddr_un to the transport internal name. 618 * Assumes caller has called so_addr_verify first. The translated 619 * (internal form) address is stored in sti->sti_ux_taddr. 620 */ 621 /*ARGSUSED*/ 622 int 623 so_ux_addr_xlate(struct sonode *so, struct sockaddr *name, 624 socklen_t namelen, int checkaccess, 625 void **addrp, socklen_t *addrlenp) 626 { 627 int error; 628 struct sockaddr_un *soun; 629 vnode_t *vp; 630 void *addr; 631 socklen_t addrlen; 632 sotpi_info_t *sti = SOTOTPI(so); 633 634 dprintso(so, 1, ("so_ux_addr_xlate(%p, %p, %d, %d)\n", 635 (void *)so, (void *)name, namelen, checkaccess)); 636 637 ASSERT(name != NULL); 638 ASSERT(so->so_family == AF_UNIX); 639 ASSERT(!sti->sti_faddr_noxlate); 640 ASSERT(namelen >= (socklen_t)sizeof (short)); 641 ASSERT(name->sa_family == AF_UNIX); 642 soun = (struct sockaddr_un *)name; 643 /* 644 * Lookup vnode for the specified path name and verify that 645 * it is a socket. 646 */ 647 error = so_ux_lookup(so, soun, checkaccess, &vp); 648 if (error) { 649 eprintsoline(so, error); 650 return (error); 651 } 652 /* 653 * Use the address of the peer vnode as the address to send 654 * to. We release the peer vnode here. In case it has been 655 * closed by the time the T_CONN_REQ or T_UNITDATA_REQ reaches the 656 * transport the message will get an error or be dropped. 657 * Note that that soua_vp is never dereferenced; it's just a 658 * convenient value by which we can identify the peer. 659 */ 660 sti->sti_ux_taddr.soua_vp = vp; 661 sti->sti_ux_taddr.soua_magic = SOU_MAGIC_EXPLICIT; 662 addr = &sti->sti_ux_taddr; 663 addrlen = (socklen_t)sizeof (sti->sti_ux_taddr); 664 dprintso(so, 1, ("ux_xlate UNIX: addrlen %d, vp %p\n", 665 addrlen, (void *)vp)); 666 VN_RELE(vp); 667 *addrp = addr; 668 *addrlenp = (socklen_t)addrlen; 669 return (0); 670 } 671 672 /* 673 * Esballoc free function for messages that contain SO_FILEP option. 674 * Decrement the reference count on the file pointers using closef. 675 */ 676 void 677 fdbuf_free(struct fdbuf *fdbuf) 678 { 679 int i; 680 struct file *fp; 681 682 dprint(1, ("fdbuf_free: %d fds\n", fdbuf->fd_numfd)); 683 for (i = 0; i < fdbuf->fd_numfd; i++) { 684 /* 685 * We need pointer size alignment for fd_fds. On a LP64 686 * kernel, the required alignment is 8 bytes while 687 * the option headers and values are only 4 bytes 688 * aligned. So its safer to do a bcopy compared to 689 * assigning fdbuf->fd_fds[i] to fp. 690 */ 691 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp)); 692 dprint(1, ("fdbuf_free: [%d] = %p\n", i, (void *)fp)); 693 (void) closef(fp); 694 } 695 if (fdbuf->fd_ebuf != NULL) 696 kmem_free(fdbuf->fd_ebuf, fdbuf->fd_ebuflen); 697 kmem_free(fdbuf, fdbuf->fd_size); 698 } 699 700 /* 701 * Allocate an esballoc'ed message for AF_UNIX file descriptor passing. 702 * Waits if memory is not available. 703 */ 704 mblk_t * 705 fdbuf_allocmsg(int size, struct fdbuf *fdbuf) 706 { 707 uchar_t *buf; 708 mblk_t *mp; 709 710 dprint(1, ("fdbuf_allocmsg: size %d, %d fds\n", size, fdbuf->fd_numfd)); 711 buf = kmem_alloc(size, KM_SLEEP); 712 fdbuf->fd_ebuf = (caddr_t)buf; 713 fdbuf->fd_ebuflen = size; 714 fdbuf->fd_frtn.free_func = fdbuf_free; 715 fdbuf->fd_frtn.free_arg = (caddr_t)fdbuf; 716 717 mp = esballoc_wait(buf, size, BPRI_MED, &fdbuf->fd_frtn); 718 mp->b_datap->db_type = M_PROTO; 719 return (mp); 720 } 721 722 /* 723 * Extract file descriptors from a fdbuf. 724 * Return list in rights/rightslen. 725 */ 726 /*ARGSUSED*/ 727 static int 728 fdbuf_extract(struct fdbuf *fdbuf, void *rights, int rightslen) 729 { 730 int i, fd; 731 int *rp; 732 struct file *fp; 733 int numfd; 734 735 dprint(1, ("fdbuf_extract: %d fds, len %d\n", 736 fdbuf->fd_numfd, rightslen)); 737 738 numfd = fdbuf->fd_numfd; 739 ASSERT(rightslen == numfd * (int)sizeof (int)); 740 741 /* 742 * Allocate a file descriptor and increment the f_count. 743 * The latter is needed since we always call fdbuf_free 744 * which performs a closef. 745 */ 746 rp = (int *)rights; 747 for (i = 0; i < numfd; i++) { 748 if ((fd = ufalloc(0)) == -1) 749 goto cleanup; 750 /* 751 * We need pointer size alignment for fd_fds. On a LP64 752 * kernel, the required alignment is 8 bytes while 753 * the option headers and values are only 4 bytes 754 * aligned. So its safer to do a bcopy compared to 755 * assigning fdbuf->fd_fds[i] to fp. 756 */ 757 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp)); 758 mutex_enter(&fp->f_tlock); 759 fp->f_count++; 760 mutex_exit(&fp->f_tlock); 761 setf(fd, fp); 762 *rp++ = fd; 763 if (AU_AUDITING()) 764 audit_fdrecv(fd, fp); 765 dprint(1, ("fdbuf_extract: [%d] = %d, %p refcnt %d\n", 766 i, fd, (void *)fp, fp->f_count)); 767 } 768 return (0); 769 770 cleanup: 771 /* 772 * Undo whatever partial work the loop above has done. 773 */ 774 { 775 int j; 776 777 rp = (int *)rights; 778 for (j = 0; j < i; j++) { 779 dprint(0, 780 ("fdbuf_extract: cleanup[%d] = %d\n", j, *rp)); 781 (void) closeandsetf(*rp++, NULL); 782 } 783 } 784 785 return (EMFILE); 786 } 787 788 /* 789 * Insert file descriptors into an fdbuf. 790 * Returns a kmem_alloc'ed fdbuf. The fdbuf should be freed 791 * by calling fdbuf_free(). 792 */ 793 int 794 fdbuf_create(void *rights, int rightslen, struct fdbuf **fdbufp) 795 { 796 int numfd, i; 797 int *fds; 798 struct file *fp; 799 struct fdbuf *fdbuf; 800 int fdbufsize; 801 802 dprint(1, ("fdbuf_create: len %d\n", rightslen)); 803 804 numfd = rightslen / (int)sizeof (int); 805 806 fdbufsize = (int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *)); 807 fdbuf = kmem_alloc(fdbufsize, KM_SLEEP); 808 fdbuf->fd_size = fdbufsize; 809 fdbuf->fd_numfd = 0; 810 fdbuf->fd_ebuf = NULL; 811 fdbuf->fd_ebuflen = 0; 812 fds = (int *)rights; 813 for (i = 0; i < numfd; i++) { 814 if ((fp = getf(fds[i])) == NULL) { 815 fdbuf_free(fdbuf); 816 return (EBADF); 817 } 818 dprint(1, ("fdbuf_create: [%d] = %d, %p refcnt %d\n", 819 i, fds[i], (void *)fp, fp->f_count)); 820 mutex_enter(&fp->f_tlock); 821 fp->f_count++; 822 mutex_exit(&fp->f_tlock); 823 /* 824 * The maximum alignment for fdbuf (or any option header 825 * and its value) it 4 bytes. On a LP64 kernel, the alignment 826 * is not sufficient for pointers (fd_fds in this case). Since 827 * we just did a kmem_alloc (we get a double word alignment), 828 * we don't need to do anything on the send side (we loose 829 * the double word alignment because fdbuf goes after an 830 * option header (eg T_unitdata_req) which is only 4 byte 831 * aligned). We take care of this when we extract the file 832 * descriptor in fdbuf_extract or fdbuf_free. 833 */ 834 fdbuf->fd_fds[i] = fp; 835 fdbuf->fd_numfd++; 836 releasef(fds[i]); 837 if (AU_AUDITING()) 838 audit_fdsend(fds[i], fp, 0); 839 } 840 *fdbufp = fdbuf; 841 return (0); 842 } 843 844 static int 845 fdbuf_optlen(int rightslen) 846 { 847 int numfd; 848 849 numfd = rightslen / (int)sizeof (int); 850 851 return ((int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *))); 852 } 853 854 static t_uscalar_t 855 fdbuf_cmsglen(int fdbuflen) 856 { 857 return (t_uscalar_t)((fdbuflen - FDBUF_HDRSIZE) / 858 (int)sizeof (struct file *) * (int)sizeof (int)); 859 } 860 861 862 /* 863 * Return non-zero if the mblk and fdbuf are consistent. 864 */ 865 static int 866 fdbuf_verify(mblk_t *mp, struct fdbuf *fdbuf, int fdbuflen) 867 { 868 if (fdbuflen >= FDBUF_HDRSIZE && 869 fdbuflen == fdbuf->fd_size) { 870 frtn_t *frp = mp->b_datap->db_frtnp; 871 /* 872 * Check that the SO_FILEP portion of the 873 * message has not been modified by 874 * the loopback transport. The sending sockfs generates 875 * a message that is esballoc'ed with the free function 876 * being fdbuf_free() and where free_arg contains the 877 * identical information as the SO_FILEP content. 878 * 879 * If any of these constraints are not satisfied we 880 * silently ignore the option. 881 */ 882 ASSERT(mp); 883 if (frp != NULL && 884 frp->free_func == fdbuf_free && 885 frp->free_arg != NULL && 886 bcmp(frp->free_arg, fdbuf, fdbuflen) == 0) { 887 dprint(1, ("fdbuf_verify: fdbuf %p len %d\n", 888 (void *)fdbuf, fdbuflen)); 889 return (1); 890 } else { 891 zcmn_err(getzoneid(), CE_WARN, 892 "sockfs: mismatched fdbuf content (%p)", 893 (void *)mp); 894 return (0); 895 } 896 } else { 897 zcmn_err(getzoneid(), CE_WARN, 898 "sockfs: mismatched fdbuf len %d, %d\n", 899 fdbuflen, fdbuf->fd_size); 900 return (0); 901 } 902 } 903 904 /* 905 * When the file descriptors returned by sorecvmsg can not be passed 906 * to the application this routine will cleanup the references on 907 * the files. Start at startoff bytes into the buffer. 908 */ 909 static void 910 close_fds(void *fdbuf, int fdbuflen, int startoff) 911 { 912 int *fds = (int *)fdbuf; 913 int numfd = fdbuflen / (int)sizeof (int); 914 int i; 915 916 dprint(1, ("close_fds(%p, %d, %d)\n", fdbuf, fdbuflen, startoff)); 917 918 for (i = 0; i < numfd; i++) { 919 if (startoff < 0) 920 startoff = 0; 921 if (startoff < (int)sizeof (int)) { 922 /* 923 * This file descriptor is partially or fully after 924 * the offset 925 */ 926 dprint(0, 927 ("close_fds: cleanup[%d] = %d\n", i, fds[i])); 928 (void) closeandsetf(fds[i], NULL); 929 } 930 startoff -= (int)sizeof (int); 931 } 932 } 933 934 /* 935 * Close all file descriptors contained in the control part starting at 936 * the startoffset. 937 */ 938 void 939 so_closefds(void *control, t_uscalar_t controllen, int oldflg, 940 int startoff) 941 { 942 struct cmsghdr *cmsg; 943 944 if (control == NULL) 945 return; 946 947 if (oldflg) { 948 close_fds(control, controllen, startoff); 949 return; 950 } 951 /* Scan control part for file descriptors. */ 952 for (cmsg = (struct cmsghdr *)control; 953 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 954 cmsg = CMSG_NEXT(cmsg)) { 955 if (cmsg->cmsg_level == SOL_SOCKET && 956 cmsg->cmsg_type == SCM_RIGHTS) { 957 close_fds(CMSG_CONTENT(cmsg), 958 (int)CMSG_CONTENTLEN(cmsg), 959 startoff - (int)sizeof (struct cmsghdr)); 960 } 961 startoff -= cmsg->cmsg_len; 962 } 963 } 964 965 /* 966 * Returns a pointer/length for the file descriptors contained 967 * in the control buffer. Returns with *fdlenp == -1 if there are no 968 * file descriptor options present. This is different than there being 969 * a zero-length file descriptor option. 970 * Fail if there are multiple SCM_RIGHT cmsgs. 971 */ 972 int 973 so_getfdopt(void *control, t_uscalar_t controllen, int oldflg, 974 void **fdsp, int *fdlenp) 975 { 976 struct cmsghdr *cmsg; 977 void *fds; 978 int fdlen; 979 980 if (control == NULL) { 981 *fdsp = NULL; 982 *fdlenp = -1; 983 return (0); 984 } 985 986 if (oldflg) { 987 *fdsp = control; 988 if (controllen == 0) 989 *fdlenp = -1; 990 else 991 *fdlenp = controllen; 992 dprint(1, ("so_getfdopt: old %d\n", *fdlenp)); 993 return (0); 994 } 995 996 fds = NULL; 997 fdlen = 0; 998 999 for (cmsg = (struct cmsghdr *)control; 1000 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 1001 cmsg = CMSG_NEXT(cmsg)) { 1002 if (cmsg->cmsg_level == SOL_SOCKET && 1003 cmsg->cmsg_type == SCM_RIGHTS) { 1004 if (fds != NULL) 1005 return (EINVAL); 1006 fds = CMSG_CONTENT(cmsg); 1007 fdlen = (int)CMSG_CONTENTLEN(cmsg); 1008 dprint(1, ("so_getfdopt: new %lu\n", 1009 (size_t)CMSG_CONTENTLEN(cmsg))); 1010 } 1011 } 1012 if (fds == NULL) { 1013 dprint(1, ("so_getfdopt: NONE\n")); 1014 *fdlenp = -1; 1015 } else 1016 *fdlenp = fdlen; 1017 *fdsp = fds; 1018 return (0); 1019 } 1020 1021 /* 1022 * Return the length of the options including any file descriptor options. 1023 */ 1024 t_uscalar_t 1025 so_optlen(void *control, t_uscalar_t controllen, int oldflg) 1026 { 1027 struct cmsghdr *cmsg; 1028 t_uscalar_t optlen = 0; 1029 t_uscalar_t len; 1030 1031 if (control == NULL) 1032 return (0); 1033 1034 if (oldflg) 1035 return ((t_uscalar_t)(sizeof (struct T_opthdr) + 1036 fdbuf_optlen(controllen))); 1037 1038 for (cmsg = (struct cmsghdr *)control; 1039 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 1040 cmsg = CMSG_NEXT(cmsg)) { 1041 if (cmsg->cmsg_level == SOL_SOCKET && 1042 cmsg->cmsg_type == SCM_RIGHTS) { 1043 len = fdbuf_optlen((int)CMSG_CONTENTLEN(cmsg)); 1044 } else { 1045 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg); 1046 } 1047 optlen += (t_uscalar_t)(_TPI_ALIGN_TOPT(len) + 1048 sizeof (struct T_opthdr)); 1049 } 1050 dprint(1, ("so_optlen: controllen %d, flg %d -> optlen %d\n", 1051 controllen, oldflg, optlen)); 1052 return (optlen); 1053 } 1054 1055 /* 1056 * Copy options from control to the mblk. Skip any file descriptor options. 1057 */ 1058 void 1059 so_cmsg2opt(void *control, t_uscalar_t controllen, int oldflg, mblk_t *mp) 1060 { 1061 struct T_opthdr toh; 1062 struct cmsghdr *cmsg; 1063 1064 if (control == NULL) 1065 return; 1066 1067 if (oldflg) { 1068 /* No real options - caller has handled file descriptors */ 1069 return; 1070 } 1071 for (cmsg = (struct cmsghdr *)control; 1072 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 1073 cmsg = CMSG_NEXT(cmsg)) { 1074 /* 1075 * Note: The caller handles file descriptors prior 1076 * to calling this function. 1077 */ 1078 t_uscalar_t len; 1079 1080 if (cmsg->cmsg_level == SOL_SOCKET && 1081 cmsg->cmsg_type == SCM_RIGHTS) 1082 continue; 1083 1084 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg); 1085 toh.level = cmsg->cmsg_level; 1086 toh.name = cmsg->cmsg_type; 1087 toh.len = len + (t_uscalar_t)sizeof (struct T_opthdr); 1088 toh.status = 0; 1089 1090 soappendmsg(mp, &toh, sizeof (toh)); 1091 soappendmsg(mp, CMSG_CONTENT(cmsg), len); 1092 mp->b_wptr += _TPI_ALIGN_TOPT(len) - len; 1093 ASSERT(mp->b_wptr <= mp->b_datap->db_lim); 1094 } 1095 } 1096 1097 /* 1098 * Return the length of the control message derived from the options. 1099 * Exclude SO_SRCADDR and SO_UNIX_CLOSE options. Include SO_FILEP. 1100 * When oldflg is set only include SO_FILEP. 1101 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen 1102 * allocates the space that so_opt2cmsg fills. If one changes, the other should 1103 * also be checked for any possible impacts. 1104 */ 1105 t_uscalar_t 1106 so_cmsglen(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg) 1107 { 1108 t_uscalar_t cmsglen = 0; 1109 struct T_opthdr *tohp; 1110 t_uscalar_t len; 1111 t_uscalar_t last_roundup = 0; 1112 1113 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1114 1115 for (tohp = (struct T_opthdr *)opt; 1116 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1117 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1118 dprint(1, ("so_cmsglen: level 0x%x, name %d, len %d\n", 1119 tohp->level, tohp->name, tohp->len)); 1120 if (tohp->level == SOL_SOCKET && 1121 (tohp->name == SO_SRCADDR || 1122 tohp->name == SO_UNIX_CLOSE)) { 1123 continue; 1124 } 1125 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) { 1126 struct fdbuf *fdbuf; 1127 int fdbuflen; 1128 1129 fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp); 1130 fdbuflen = (int)_TPI_TOPT_DATALEN(tohp); 1131 1132 if (!fdbuf_verify(mp, fdbuf, fdbuflen)) 1133 continue; 1134 if (oldflg) { 1135 cmsglen += fdbuf_cmsglen(fdbuflen); 1136 continue; 1137 } 1138 len = fdbuf_cmsglen(fdbuflen); 1139 } else if (tohp->level == SOL_SOCKET && 1140 tohp->name == SCM_TIMESTAMP) { 1141 if (oldflg) 1142 continue; 1143 1144 if (get_udatamodel() == DATAMODEL_NATIVE) { 1145 len = sizeof (struct timeval); 1146 } else { 1147 len = sizeof (struct timeval32); 1148 } 1149 } else { 1150 if (oldflg) 1151 continue; 1152 len = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp); 1153 } 1154 /* 1155 * Exclude roundup for last option to not set 1156 * MSG_CTRUNC when the cmsg fits but the padding doesn't fit. 1157 */ 1158 last_roundup = (t_uscalar_t) 1159 (ROUNDUP_cmsglen(len + (int)sizeof (struct cmsghdr)) - 1160 (len + (int)sizeof (struct cmsghdr))); 1161 cmsglen += (t_uscalar_t)(len + (int)sizeof (struct cmsghdr)) + 1162 last_roundup; 1163 } 1164 cmsglen -= last_roundup; 1165 dprint(1, ("so_cmsglen: optlen %d, flg %d -> cmsglen %d\n", 1166 optlen, oldflg, cmsglen)); 1167 return (cmsglen); 1168 } 1169 1170 /* 1171 * Copy options from options to the control. Convert SO_FILEP to 1172 * file descriptors. 1173 * Returns errno or zero. 1174 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen 1175 * allocates the space that so_opt2cmsg fills. If one changes, the other should 1176 * also be checked for any possible impacts. 1177 */ 1178 int 1179 so_opt2cmsg(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg, 1180 void *control, t_uscalar_t controllen) 1181 { 1182 struct T_opthdr *tohp; 1183 struct cmsghdr *cmsg; 1184 struct fdbuf *fdbuf; 1185 int fdbuflen; 1186 int error; 1187 #if defined(DEBUG) || defined(__lint) 1188 struct cmsghdr *cend = (struct cmsghdr *) 1189 (((uint8_t *)control) + ROUNDUP_cmsglen(controllen)); 1190 #endif 1191 cmsg = (struct cmsghdr *)control; 1192 1193 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1194 1195 for (tohp = (struct T_opthdr *)opt; 1196 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1197 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1198 dprint(1, ("so_opt2cmsg: level 0x%x, name %d, len %d\n", 1199 tohp->level, tohp->name, tohp->len)); 1200 1201 if (tohp->level == SOL_SOCKET && 1202 (tohp->name == SO_SRCADDR || 1203 tohp->name == SO_UNIX_CLOSE)) { 1204 continue; 1205 } 1206 ASSERT((uintptr_t)cmsg <= (uintptr_t)control + controllen); 1207 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) { 1208 fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp); 1209 fdbuflen = (int)_TPI_TOPT_DATALEN(tohp); 1210 1211 if (!fdbuf_verify(mp, fdbuf, fdbuflen)) 1212 return (EPROTO); 1213 if (oldflg) { 1214 error = fdbuf_extract(fdbuf, control, 1215 (int)controllen); 1216 if (error != 0) 1217 return (error); 1218 continue; 1219 } else { 1220 int fdlen; 1221 1222 fdlen = (int)fdbuf_cmsglen( 1223 (int)_TPI_TOPT_DATALEN(tohp)); 1224 1225 cmsg->cmsg_level = tohp->level; 1226 cmsg->cmsg_type = SCM_RIGHTS; 1227 cmsg->cmsg_len = (socklen_t)(fdlen + 1228 sizeof (struct cmsghdr)); 1229 1230 error = fdbuf_extract(fdbuf, 1231 CMSG_CONTENT(cmsg), fdlen); 1232 if (error != 0) 1233 return (error); 1234 } 1235 } else if (tohp->level == SOL_SOCKET && 1236 tohp->name == SCM_TIMESTAMP) { 1237 timestruc_t *timestamp; 1238 1239 if (oldflg) 1240 continue; 1241 1242 cmsg->cmsg_level = tohp->level; 1243 cmsg->cmsg_type = tohp->name; 1244 1245 timestamp = 1246 (timestruc_t *)P2ROUNDUP((intptr_t)&tohp[1], 1247 sizeof (intptr_t)); 1248 1249 if (get_udatamodel() == DATAMODEL_NATIVE) { 1250 struct timeval tv; 1251 1252 cmsg->cmsg_len = sizeof (struct timeval) + 1253 sizeof (struct cmsghdr); 1254 tv.tv_sec = timestamp->tv_sec; 1255 tv.tv_usec = timestamp->tv_nsec / 1256 (NANOSEC / MICROSEC); 1257 /* 1258 * on LP64 systems, the struct timeval in 1259 * the destination will not be 8-byte aligned, 1260 * so use bcopy to avoid alignment trouble 1261 */ 1262 bcopy(&tv, CMSG_CONTENT(cmsg), sizeof (tv)); 1263 } else { 1264 struct timeval32 *time32; 1265 1266 cmsg->cmsg_len = sizeof (struct timeval32) + 1267 sizeof (struct cmsghdr); 1268 time32 = (struct timeval32 *)CMSG_CONTENT(cmsg); 1269 time32->tv_sec = (time32_t)timestamp->tv_sec; 1270 time32->tv_usec = 1271 (int32_t)(timestamp->tv_nsec / 1272 (NANOSEC / MICROSEC)); 1273 } 1274 1275 } else { 1276 if (oldflg) 1277 continue; 1278 1279 cmsg->cmsg_level = tohp->level; 1280 cmsg->cmsg_type = tohp->name; 1281 cmsg->cmsg_len = (socklen_t)(_TPI_TOPT_DATALEN(tohp) + 1282 sizeof (struct cmsghdr)); 1283 1284 /* copy content to control data part */ 1285 bcopy(&tohp[1], CMSG_CONTENT(cmsg), 1286 CMSG_CONTENTLEN(cmsg)); 1287 } 1288 /* move to next CMSG structure! */ 1289 cmsg = CMSG_NEXT(cmsg); 1290 } 1291 dprint(1, ("so_opt2cmsg: buf %p len %d; cend %p; final cmsg %p\n", 1292 control, controllen, (void *)cend, (void *)cmsg)); 1293 ASSERT(cmsg <= cend); 1294 return (0); 1295 } 1296 1297 /* 1298 * Extract the SO_SRCADDR option value if present. 1299 */ 1300 void 1301 so_getopt_srcaddr(void *opt, t_uscalar_t optlen, void **srcp, 1302 t_uscalar_t *srclenp) 1303 { 1304 struct T_opthdr *tohp; 1305 1306 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1307 1308 ASSERT(srcp != NULL && srclenp != NULL); 1309 *srcp = NULL; 1310 *srclenp = 0; 1311 1312 for (tohp = (struct T_opthdr *)opt; 1313 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1314 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1315 dprint(1, ("so_getopt_srcaddr: level 0x%x, name %d, len %d\n", 1316 tohp->level, tohp->name, tohp->len)); 1317 if (tohp->level == SOL_SOCKET && 1318 tohp->name == SO_SRCADDR) { 1319 *srcp = _TPI_TOPT_DATA(tohp); 1320 *srclenp = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp); 1321 } 1322 } 1323 } 1324 1325 /* 1326 * Verify if the SO_UNIX_CLOSE option is present. 1327 */ 1328 int 1329 so_getopt_unix_close(void *opt, t_uscalar_t optlen) 1330 { 1331 struct T_opthdr *tohp; 1332 1333 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1334 1335 for (tohp = (struct T_opthdr *)opt; 1336 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1337 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1338 dprint(1, 1339 ("so_getopt_unix_close: level 0x%x, name %d, len %d\n", 1340 tohp->level, tohp->name, tohp->len)); 1341 if (tohp->level == SOL_SOCKET && 1342 tohp->name == SO_UNIX_CLOSE) 1343 return (1); 1344 } 1345 return (0); 1346 } 1347 1348 /* 1349 * Allocate an M_PROTO message. 1350 * 1351 * If allocation fails the behavior depends on sleepflg: 1352 * _ALLOC_NOSLEEP fail immediately 1353 * _ALLOC_INTR sleep for memory until a signal is caught 1354 * _ALLOC_SLEEP sleep forever. Don't return NULL. 1355 */ 1356 mblk_t * 1357 soallocproto(size_t size, int sleepflg, cred_t *cr) 1358 { 1359 mblk_t *mp; 1360 1361 /* Round up size for reuse */ 1362 size = MAX(size, 64); 1363 if (cr != NULL) 1364 mp = allocb_cred(size, cr, curproc->p_pid); 1365 else 1366 mp = allocb(size, BPRI_MED); 1367 1368 if (mp == NULL) { 1369 int error; /* Dummy - error not returned to caller */ 1370 1371 switch (sleepflg) { 1372 case _ALLOC_SLEEP: 1373 if (cr != NULL) { 1374 mp = allocb_cred_wait(size, STR_NOSIG, &error, 1375 cr, curproc->p_pid); 1376 } else { 1377 mp = allocb_wait(size, BPRI_MED, STR_NOSIG, 1378 &error); 1379 } 1380 ASSERT(mp); 1381 break; 1382 case _ALLOC_INTR: 1383 if (cr != NULL) { 1384 mp = allocb_cred_wait(size, 0, &error, cr, 1385 curproc->p_pid); 1386 } else { 1387 mp = allocb_wait(size, BPRI_MED, 0, &error); 1388 } 1389 if (mp == NULL) { 1390 /* Caught signal while sleeping for memory */ 1391 eprintline(ENOBUFS); 1392 return (NULL); 1393 } 1394 break; 1395 case _ALLOC_NOSLEEP: 1396 default: 1397 eprintline(ENOBUFS); 1398 return (NULL); 1399 } 1400 } 1401 DB_TYPE(mp) = M_PROTO; 1402 return (mp); 1403 } 1404 1405 /* 1406 * Allocate an M_PROTO message with a single component. 1407 * len is the length of buf. size is the amount to allocate. 1408 * 1409 * buf can be NULL with a non-zero len. 1410 * This results in a bzero'ed chunk being placed the message. 1411 */ 1412 mblk_t * 1413 soallocproto1(const void *buf, ssize_t len, ssize_t size, int sleepflg, 1414 cred_t *cr) 1415 { 1416 mblk_t *mp; 1417 1418 if (size == 0) 1419 size = len; 1420 1421 ASSERT(size >= len); 1422 /* Round up size for reuse */ 1423 size = MAX(size, 64); 1424 mp = soallocproto(size, sleepflg, cr); 1425 if (mp == NULL) 1426 return (NULL); 1427 mp->b_datap->db_type = M_PROTO; 1428 if (len != 0) { 1429 if (buf != NULL) 1430 bcopy(buf, mp->b_wptr, len); 1431 else 1432 bzero(mp->b_wptr, len); 1433 mp->b_wptr += len; 1434 } 1435 return (mp); 1436 } 1437 1438 /* 1439 * Append buf/len to mp. 1440 * The caller has to ensure that there is enough room in the mblk. 1441 * 1442 * buf can be NULL with a non-zero len. 1443 * This results in a bzero'ed chunk being placed the message. 1444 */ 1445 void 1446 soappendmsg(mblk_t *mp, const void *buf, ssize_t len) 1447 { 1448 ASSERT(mp); 1449 1450 if (len != 0) { 1451 /* Assert for room left */ 1452 ASSERT(mp->b_datap->db_lim - mp->b_wptr >= len); 1453 if (buf != NULL) 1454 bcopy(buf, mp->b_wptr, len); 1455 else 1456 bzero(mp->b_wptr, len); 1457 } 1458 mp->b_wptr += len; 1459 } 1460 1461 /* 1462 * Create a message using two kernel buffers. 1463 * If size is set that will determine the allocation size (e.g. for future 1464 * soappendmsg calls). If size is zero it is derived from the buffer 1465 * lengths. 1466 */ 1467 mblk_t * 1468 soallocproto2(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2, 1469 ssize_t size, int sleepflg, cred_t *cr) 1470 { 1471 mblk_t *mp; 1472 1473 if (size == 0) 1474 size = len1 + len2; 1475 ASSERT(size >= len1 + len2); 1476 1477 mp = soallocproto1(buf1, len1, size, sleepflg, cr); 1478 if (mp) 1479 soappendmsg(mp, buf2, len2); 1480 return (mp); 1481 } 1482 1483 /* 1484 * Create a message using three kernel buffers. 1485 * If size is set that will determine the allocation size (for future 1486 * soappendmsg calls). If size is zero it is derived from the buffer 1487 * lengths. 1488 */ 1489 mblk_t * 1490 soallocproto3(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2, 1491 const void *buf3, ssize_t len3, ssize_t size, int sleepflg, cred_t *cr) 1492 { 1493 mblk_t *mp; 1494 1495 if (size == 0) 1496 size = len1 + len2 +len3; 1497 ASSERT(size >= len1 + len2 + len3); 1498 1499 mp = soallocproto1(buf1, len1, size, sleepflg, cr); 1500 if (mp != NULL) { 1501 soappendmsg(mp, buf2, len2); 1502 soappendmsg(mp, buf3, len3); 1503 } 1504 return (mp); 1505 } 1506 1507 #ifdef DEBUG 1508 char * 1509 pr_state(uint_t state, uint_t mode) 1510 { 1511 static char buf[1024]; 1512 1513 buf[0] = 0; 1514 if (state & SS_ISCONNECTED) 1515 (void) strcat(buf, "ISCONNECTED "); 1516 if (state & SS_ISCONNECTING) 1517 (void) strcat(buf, "ISCONNECTING "); 1518 if (state & SS_ISDISCONNECTING) 1519 (void) strcat(buf, "ISDISCONNECTING "); 1520 if (state & SS_CANTSENDMORE) 1521 (void) strcat(buf, "CANTSENDMORE "); 1522 1523 if (state & SS_CANTRCVMORE) 1524 (void) strcat(buf, "CANTRCVMORE "); 1525 if (state & SS_ISBOUND) 1526 (void) strcat(buf, "ISBOUND "); 1527 if (state & SS_NDELAY) 1528 (void) strcat(buf, "NDELAY "); 1529 if (state & SS_NONBLOCK) 1530 (void) strcat(buf, "NONBLOCK "); 1531 1532 if (state & SS_ASYNC) 1533 (void) strcat(buf, "ASYNC "); 1534 if (state & SS_ACCEPTCONN) 1535 (void) strcat(buf, "ACCEPTCONN "); 1536 if (state & SS_SAVEDEOR) 1537 (void) strcat(buf, "SAVEDEOR "); 1538 1539 if (state & SS_RCVATMARK) 1540 (void) strcat(buf, "RCVATMARK "); 1541 if (state & SS_OOBPEND) 1542 (void) strcat(buf, "OOBPEND "); 1543 if (state & SS_HAVEOOBDATA) 1544 (void) strcat(buf, "HAVEOOBDATA "); 1545 if (state & SS_HADOOBDATA) 1546 (void) strcat(buf, "HADOOBDATA "); 1547 1548 if (mode & SM_PRIV) 1549 (void) strcat(buf, "PRIV "); 1550 if (mode & SM_ATOMIC) 1551 (void) strcat(buf, "ATOMIC "); 1552 if (mode & SM_ADDR) 1553 (void) strcat(buf, "ADDR "); 1554 if (mode & SM_CONNREQUIRED) 1555 (void) strcat(buf, "CONNREQUIRED "); 1556 1557 if (mode & SM_FDPASSING) 1558 (void) strcat(buf, "FDPASSING "); 1559 if (mode & SM_EXDATA) 1560 (void) strcat(buf, "EXDATA "); 1561 if (mode & SM_OPTDATA) 1562 (void) strcat(buf, "OPTDATA "); 1563 if (mode & SM_BYTESTREAM) 1564 (void) strcat(buf, "BYTESTREAM "); 1565 return (buf); 1566 } 1567 1568 char * 1569 pr_addr(int family, struct sockaddr *addr, t_uscalar_t addrlen) 1570 { 1571 static char buf[1024]; 1572 1573 if (addr == NULL || addrlen == 0) { 1574 (void) sprintf(buf, "(len %d) %p", addrlen, (void *)addr); 1575 return (buf); 1576 } 1577 switch (family) { 1578 case AF_INET: { 1579 struct sockaddr_in sin; 1580 1581 bcopy(addr, &sin, sizeof (sin)); 1582 1583 (void) sprintf(buf, "(len %d) %x/%d", 1584 addrlen, ntohl(sin.sin_addr.s_addr), ntohs(sin.sin_port)); 1585 break; 1586 } 1587 case AF_INET6: { 1588 struct sockaddr_in6 sin6; 1589 uint16_t *piece = (uint16_t *)&sin6.sin6_addr; 1590 1591 bcopy((char *)addr, (char *)&sin6, sizeof (sin6)); 1592 (void) sprintf(buf, "(len %d) %x:%x:%x:%x:%x:%x:%x:%x/%d", 1593 addrlen, 1594 ntohs(piece[0]), ntohs(piece[1]), 1595 ntohs(piece[2]), ntohs(piece[3]), 1596 ntohs(piece[4]), ntohs(piece[5]), 1597 ntohs(piece[6]), ntohs(piece[7]), 1598 ntohs(sin6.sin6_port)); 1599 break; 1600 } 1601 case AF_UNIX: { 1602 struct sockaddr_un *soun = (struct sockaddr_un *)addr; 1603 1604 (void) sprintf(buf, "(len %d) %s", addrlen, 1605 (soun == NULL) ? "(none)" : soun->sun_path); 1606 break; 1607 } 1608 default: 1609 (void) sprintf(buf, "(unknown af %d)", family); 1610 break; 1611 } 1612 return (buf); 1613 } 1614 1615 /* The logical equivalence operator (a if-and-only-if b) */ 1616 #define EQUIVALENT(a, b) (((a) && (b)) || (!(a) && (!(b)))) 1617 1618 /* 1619 * Verify limitations and invariants on oob state. 1620 * Return 1 if OK, otherwise 0 so that it can be used as 1621 * ASSERT(verify_oobstate(so)); 1622 */ 1623 int 1624 so_verify_oobstate(struct sonode *so) 1625 { 1626 boolean_t havemark; 1627 1628 ASSERT(MUTEX_HELD(&so->so_lock)); 1629 1630 /* 1631 * The possible state combinations are: 1632 * 0 1633 * SS_OOBPEND 1634 * SS_OOBPEND|SS_HAVEOOBDATA 1635 * SS_OOBPEND|SS_HADOOBDATA 1636 * SS_HADOOBDATA 1637 */ 1638 switch (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA)) { 1639 case 0: 1640 case SS_OOBPEND: 1641 case SS_OOBPEND|SS_HAVEOOBDATA: 1642 case SS_OOBPEND|SS_HADOOBDATA: 1643 case SS_HADOOBDATA: 1644 break; 1645 default: 1646 printf("Bad oob state 1 (%p): state %s\n", 1647 (void *)so, pr_state(so->so_state, so->so_mode)); 1648 return (0); 1649 } 1650 1651 /* SS_RCVATMARK should only be set when SS_OOBPEND is set */ 1652 if ((so->so_state & (SS_RCVATMARK|SS_OOBPEND)) == SS_RCVATMARK) { 1653 printf("Bad oob state 2 (%p): state %s\n", 1654 (void *)so, pr_state(so->so_state, so->so_mode)); 1655 return (0); 1656 } 1657 1658 /* 1659 * (havemark != 0 or SS_RCVATMARK) iff SS_OOBPEND 1660 * For TPI, the presence of a "mark" is indicated by sti_oobsigcnt. 1661 */ 1662 havemark = (SOCK_IS_NONSTR(so)) ? so->so_oobmark > 0 : 1663 SOTOTPI(so)->sti_oobsigcnt > 0; 1664 1665 if (!EQUIVALENT(havemark || (so->so_state & SS_RCVATMARK), 1666 so->so_state & SS_OOBPEND)) { 1667 printf("Bad oob state 3 (%p): state %s\n", 1668 (void *)so, pr_state(so->so_state, so->so_mode)); 1669 return (0); 1670 } 1671 1672 /* 1673 * Unless SO_OOBINLINE we have so_oobmsg != NULL iff SS_HAVEOOBDATA 1674 */ 1675 if (!(so->so_options & SO_OOBINLINE) && 1676 !EQUIVALENT(so->so_oobmsg != NULL, so->so_state & SS_HAVEOOBDATA)) { 1677 printf("Bad oob state 4 (%p): state %s\n", 1678 (void *)so, pr_state(so->so_state, so->so_mode)); 1679 return (0); 1680 } 1681 1682 if (!SOCK_IS_NONSTR(so) && 1683 SOTOTPI(so)->sti_oobsigcnt < SOTOTPI(so)->sti_oobcnt) { 1684 printf("Bad oob state 5 (%p): counts %d/%d state %s\n", 1685 (void *)so, SOTOTPI(so)->sti_oobsigcnt, 1686 SOTOTPI(so)->sti_oobcnt, 1687 pr_state(so->so_state, so->so_mode)); 1688 return (0); 1689 } 1690 1691 return (1); 1692 } 1693 #undef EQUIVALENT 1694 #endif /* DEBUG */ 1695 1696 /* initialize sockfs zone specific kstat related items */ 1697 void * 1698 sock_kstat_init(zoneid_t zoneid) 1699 { 1700 kstat_t *ksp; 1701 1702 ksp = kstat_create_zone("sockfs", 0, "sock_unix_list", "misc", 1703 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE|KSTAT_FLAG_VIRTUAL, zoneid); 1704 1705 if (ksp != NULL) { 1706 ksp->ks_update = sockfs_update; 1707 ksp->ks_snapshot = sockfs_snapshot; 1708 ksp->ks_lock = &socklist.sl_lock; 1709 ksp->ks_private = (void *)(uintptr_t)zoneid; 1710 kstat_install(ksp); 1711 } 1712 1713 return (ksp); 1714 } 1715 1716 /* tear down sockfs zone specific kstat related items */ 1717 /*ARGSUSED*/ 1718 void 1719 sock_kstat_fini(zoneid_t zoneid, void *arg) 1720 { 1721 kstat_t *ksp = (kstat_t *)arg; 1722 1723 if (ksp != NULL) { 1724 ASSERT(zoneid == (zoneid_t)(uintptr_t)ksp->ks_private); 1725 kstat_delete(ksp); 1726 } 1727 } 1728 1729 /* 1730 * Zones: 1731 * Note that nactive is going to be different for each zone. 1732 * This means we require kstat to call sockfs_update and then sockfs_snapshot 1733 * for the same zone, or sockfs_snapshot will be taken into the wrong size 1734 * buffer. This is safe, but if the buffer is too small, user will not be 1735 * given details of all sockets. However, as this kstat has a ks_lock, kstat 1736 * driver will keep it locked between the update and the snapshot, so no 1737 * other process (zone) can currently get inbetween resulting in a wrong size 1738 * buffer allocation. 1739 */ 1740 static int 1741 sockfs_update(kstat_t *ksp, int rw) 1742 { 1743 uint_t nactive = 0; /* # of active AF_UNIX sockets */ 1744 struct sonode *so; /* current sonode on socklist */ 1745 zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private; 1746 1747 ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid()); 1748 1749 if (rw == KSTAT_WRITE) { /* bounce all writes */ 1750 return (EACCES); 1751 } 1752 1753 for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) { 1754 if (so->so_count != 0 && so->so_zoneid == myzoneid) { 1755 nactive++; 1756 } 1757 } 1758 ksp->ks_ndata = nactive; 1759 ksp->ks_data_size = nactive * sizeof (struct sockinfo); 1760 1761 return (0); 1762 } 1763 1764 static int 1765 sockfs_snapshot(kstat_t *ksp, void *buf, int rw) 1766 { 1767 int ns; /* # of sonodes we've copied */ 1768 struct sonode *so; /* current sonode on socklist */ 1769 struct sockinfo *psi; /* where we put sockinfo data */ 1770 t_uscalar_t sn_len; /* soa_len */ 1771 zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private; 1772 sotpi_info_t *sti; 1773 1774 ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid()); 1775 1776 ksp->ks_snaptime = gethrtime(); 1777 1778 if (rw == KSTAT_WRITE) { /* bounce all writes */ 1779 return (EACCES); 1780 } 1781 1782 /* 1783 * For each sonode on the socklist, we massage the important 1784 * info into buf, in sockinfo format. 1785 */ 1786 psi = (struct sockinfo *)buf; 1787 ns = 0; 1788 for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) { 1789 vattr_t attr; 1790 1791 /* only stuff active sonodes and the same zone: */ 1792 if (so->so_count == 0 || so->so_zoneid != myzoneid) { 1793 continue; 1794 } 1795 1796 /* 1797 * If the sonode was activated between the update and the 1798 * snapshot, we're done - as this is only a snapshot. 1799 */ 1800 if ((caddr_t)(psi) >= (caddr_t)buf + ksp->ks_data_size) { 1801 break; 1802 } 1803 1804 sti = SOTOTPI(so); 1805 /* copy important info into buf: */ 1806 psi->si_size = sizeof (struct sockinfo); 1807 psi->si_family = so->so_family; 1808 psi->si_type = so->so_type; 1809 psi->si_flag = so->so_flag; 1810 psi->si_state = so->so_state; 1811 psi->si_serv_type = sti->sti_serv_type; 1812 psi->si_ux_laddr_sou_magic = sti->sti_ux_laddr.soua_magic; 1813 psi->si_ux_faddr_sou_magic = sti->sti_ux_faddr.soua_magic; 1814 psi->si_laddr_soa_len = sti->sti_laddr.soa_len; 1815 psi->si_faddr_soa_len = sti->sti_faddr.soa_len; 1816 psi->si_szoneid = so->so_zoneid; 1817 psi->si_faddr_noxlate = sti->sti_faddr_noxlate; 1818 1819 /* 1820 * Grab the inode, if possible. 1821 * This must be done before entering so_lock as VOP_GETATTR 1822 * will acquire it. 1823 */ 1824 if (so->so_vnode == NULL || 1825 VOP_GETATTR(so->so_vnode, &attr, 0, CRED(), NULL) != 0) 1826 attr.va_nodeid = 0; 1827 1828 psi->si_inode = attr.va_nodeid; 1829 1830 mutex_enter(&so->so_lock); 1831 1832 if (sti->sti_laddr_sa != NULL) { 1833 ASSERT(sti->sti_laddr_sa->sa_data != NULL); 1834 sn_len = sti->sti_laddr_len; 1835 ASSERT(sn_len <= sizeof (short) + 1836 sizeof (psi->si_laddr_sun_path)); 1837 1838 psi->si_laddr_family = 1839 sti->sti_laddr_sa->sa_family; 1840 if (sn_len != 0) { 1841 /* AF_UNIX socket names are NULL terminated */ 1842 (void) strncpy(psi->si_laddr_sun_path, 1843 sti->sti_laddr_sa->sa_data, 1844 sizeof (psi->si_laddr_sun_path)); 1845 sn_len = strlen(psi->si_laddr_sun_path); 1846 } 1847 psi->si_laddr_sun_path[sn_len] = 0; 1848 } 1849 1850 if (sti->sti_faddr_sa != NULL) { 1851 ASSERT(sti->sti_faddr_sa->sa_data != NULL); 1852 sn_len = sti->sti_faddr_len; 1853 ASSERT(sn_len <= sizeof (short) + 1854 sizeof (psi->si_faddr_sun_path)); 1855 1856 psi->si_faddr_family = 1857 sti->sti_faddr_sa->sa_family; 1858 if (sn_len != 0) { 1859 (void) strncpy(psi->si_faddr_sun_path, 1860 sti->sti_faddr_sa->sa_data, 1861 sizeof (psi->si_faddr_sun_path)); 1862 sn_len = strlen(psi->si_faddr_sun_path); 1863 } 1864 psi->si_faddr_sun_path[sn_len] = 0; 1865 } 1866 1867 mutex_exit(&so->so_lock); 1868 1869 (void) snprintf(psi->si_son_straddr, 1870 sizeof (psi->si_son_straddr), "%p", (void *)so); 1871 (void) snprintf(psi->si_lvn_straddr, 1872 sizeof (psi->si_lvn_straddr), "%p", 1873 (void *)sti->sti_ux_laddr.soua_vp); 1874 (void) snprintf(psi->si_fvn_straddr, 1875 sizeof (psi->si_fvn_straddr), "%p", 1876 (void *)sti->sti_ux_faddr.soua_vp); 1877 1878 ns++; 1879 psi++; 1880 } 1881 1882 ksp->ks_ndata = ns; 1883 return (0); 1884 } 1885 1886 ssize_t 1887 soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size) 1888 { 1889 struct uio auio; 1890 struct iovec aiov[MSG_MAXIOVLEN]; 1891 register vnode_t *vp; 1892 int ioflag, rwflag; 1893 ssize_t cnt; 1894 int error = 0; 1895 int iovcnt = 0; 1896 short fflag; 1897 1898 vp = fp->f_vnode; 1899 fflag = fp->f_flag; 1900 1901 rwflag = 0; 1902 aiov[0].iov_base = (caddr_t)buf; 1903 aiov[0].iov_len = size; 1904 iovcnt = 1; 1905 cnt = (ssize_t)size; 1906 (void) VOP_RWLOCK(vp, rwflag, NULL); 1907 1908 auio.uio_loffset = fileoff; 1909 auio.uio_iov = aiov; 1910 auio.uio_iovcnt = iovcnt; 1911 auio.uio_resid = cnt; 1912 auio.uio_segflg = UIO_SYSSPACE; 1913 auio.uio_llimit = MAXOFFSET_T; 1914 auio.uio_fmode = fflag; 1915 auio.uio_extflg = UIO_COPY_CACHED; 1916 1917 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1918 1919 /* If read sync is not asked for, filter sync flags */ 1920 if ((ioflag & FRSYNC) == 0) 1921 ioflag &= ~(FSYNC|FDSYNC); 1922 error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL); 1923 cnt -= auio.uio_resid; 1924 1925 VOP_RWUNLOCK(vp, rwflag, NULL); 1926 1927 if (error == EINTR && cnt != 0) 1928 error = 0; 1929 out: 1930 if (error != 0) { 1931 *err = error; 1932 return (0); 1933 } else { 1934 *err = 0; 1935 return (cnt); 1936 } 1937 } 1938 1939 int 1940 so_copyin(const void *from, void *to, size_t size, int fromkernel) 1941 { 1942 if (fromkernel) { 1943 bcopy(from, to, size); 1944 return (0); 1945 } 1946 return (xcopyin(from, to, size)); 1947 } 1948 1949 int 1950 so_copyout(const void *from, void *to, size_t size, int tokernel) 1951 { 1952 if (tokernel) { 1953 bcopy(from, to, size); 1954 return (0); 1955 } 1956 return (xcopyout(from, to, size)); 1957 } 1958