1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/t_lock.h> 28 #include <sys/param.h> 29 #include <sys/systm.h> 30 #include <sys/buf.h> 31 #include <sys/conf.h> 32 #include <sys/cred.h> 33 #include <sys/kmem.h> 34 #include <sys/sysmacros.h> 35 #include <sys/vfs.h> 36 #include <sys/vfs_opreg.h> 37 #include <sys/vnode.h> 38 #include <sys/debug.h> 39 #include <sys/errno.h> 40 #include <sys/time.h> 41 #include <sys/file.h> 42 #include <sys/open.h> 43 #include <sys/user.h> 44 #include <sys/termios.h> 45 #include <sys/stream.h> 46 #include <sys/strsubr.h> 47 #include <sys/strsun.h> 48 #include <sys/esunddi.h> 49 #include <sys/flock.h> 50 #include <sys/modctl.h> 51 #include <sys/cmn_err.h> 52 #include <sys/mkdev.h> 53 #include <sys/pathname.h> 54 #include <sys/ddi.h> 55 #include <sys/stat.h> 56 #include <sys/fs/snode.h> 57 #include <sys/fs/dv_node.h> 58 #include <sys/zone.h> 59 60 #include <sys/socket.h> 61 #include <sys/socketvar.h> 62 #include <netinet/in.h> 63 #include <sys/un.h> 64 #include <sys/ucred.h> 65 66 #include <sys/tiuser.h> 67 #define _SUN_TPI_VERSION 2 68 #include <sys/tihdr.h> 69 70 #include <c2/audit.h> 71 72 #include <fs/sockfs/nl7c.h> 73 #include <fs/sockfs/sockcommon.h> 74 #include <fs/sockfs/sockfilter_impl.h> 75 #include <fs/sockfs/socktpi.h> 76 #include <fs/sockfs/socktpi_impl.h> 77 #include <fs/sockfs/sodirect.h> 78 79 /* 80 * Macros that operate on struct cmsghdr. 81 * The CMSG_VALID macro does not assume that the last option buffer is padded. 82 */ 83 #define CMSG_CONTENT(cmsg) (&((cmsg)[1])) 84 #define CMSG_CONTENTLEN(cmsg) ((cmsg)->cmsg_len - sizeof (struct cmsghdr)) 85 #define CMSG_VALID(cmsg, start, end) \ 86 (ISALIGNED_cmsghdr(cmsg) && \ 87 ((uintptr_t)(cmsg) >= (uintptr_t)(start)) && \ 88 ((uintptr_t)(cmsg) < (uintptr_t)(end)) && \ 89 ((ssize_t)(cmsg)->cmsg_len >= sizeof (struct cmsghdr)) && \ 90 ((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end))) 91 #define SO_LOCK_WAKEUP_TIME 3000 /* Wakeup time in milliseconds */ 92 93 dev_t sockdev; /* For fsid in getattr */ 94 int sockfs_defer_nl7c_init = 0; 95 96 struct socklist socklist; 97 98 struct kmem_cache *socket_cache; 99 100 /* 101 * sockconf_lock protects the socket configuration (socket types and 102 * socket filters) which is changed via the sockconfig system call. 103 */ 104 krwlock_t sockconf_lock; 105 106 static int sockfs_update(kstat_t *, int); 107 static int sockfs_snapshot(kstat_t *, void *, int); 108 extern smod_info_t *sotpi_smod_create(void); 109 110 extern void sendfile_init(); 111 112 extern void nl7c_init(void); 113 114 extern int modrootloaded; 115 116 #define ADRSTRLEN (2 * sizeof (void *) + 1) 117 /* 118 * kernel structure for passing the sockinfo data back up to the user. 119 * the strings array allows us to convert AF_UNIX addresses into strings 120 * with a common method regardless of which n-bit kernel we're running. 121 */ 122 struct k_sockinfo { 123 struct sockinfo ks_si; 124 char ks_straddr[3][ADRSTRLEN]; 125 }; 126 127 /* 128 * Translate from a device pathname (e.g. "/dev/tcp") to a vnode. 129 * Returns with the vnode held. 130 */ 131 int 132 sogetvp(char *devpath, vnode_t **vpp, int uioflag) 133 { 134 struct snode *csp; 135 vnode_t *vp, *dvp; 136 major_t maj; 137 int error; 138 139 ASSERT(uioflag == UIO_SYSSPACE || uioflag == UIO_USERSPACE); 140 141 /* 142 * Lookup the underlying filesystem vnode. 143 */ 144 error = lookupname(devpath, uioflag, FOLLOW, NULLVPP, &vp); 145 if (error) 146 return (error); 147 148 /* Check that it is the correct vnode */ 149 if (vp->v_type != VCHR) { 150 VN_RELE(vp); 151 return (ENOTSOCK); 152 } 153 154 /* 155 * If devpath went through devfs, the device should already 156 * be configured. If devpath is a mknod file, however, we 157 * need to make sure the device is properly configured. 158 * To do this, we do something similar to spec_open() 159 * except that we resolve to the minor/leaf level since 160 * we need to return a vnode. 161 */ 162 csp = VTOS(VTOS(vp)->s_commonvp); 163 if (!(csp->s_flag & SDIPSET)) { 164 char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 165 error = ddi_dev_pathname(vp->v_rdev, S_IFCHR, pathname); 166 if (error == 0) 167 error = devfs_lookupname(pathname, NULLVPP, &dvp); 168 VN_RELE(vp); 169 kmem_free(pathname, MAXPATHLEN); 170 if (error != 0) 171 return (ENXIO); 172 vp = dvp; /* use the devfs vp */ 173 } 174 175 /* device is configured at this point */ 176 maj = getmajor(vp->v_rdev); 177 if (!STREAMSTAB(maj)) { 178 VN_RELE(vp); 179 return (ENOSTR); 180 } 181 182 *vpp = vp; 183 return (0); 184 } 185 186 /* 187 * Update the accessed, updated, or changed times in an sonode 188 * with the current time. 189 * 190 * Note that both SunOS 4.X and 4.4BSD sockets do not present reasonable 191 * attributes in a fstat call. (They return the current time and 0 for 192 * all timestamps, respectively.) We maintain the current timestamps 193 * here primarily so that should sockmod be popped the resulting 194 * file descriptor will behave like a stream w.r.t. the timestamps. 195 */ 196 void 197 so_update_attrs(struct sonode *so, int flag) 198 { 199 time_t now = gethrestime_sec(); 200 201 if (SOCK_IS_NONSTR(so)) 202 return; 203 204 mutex_enter(&so->so_lock); 205 so->so_flag |= flag; 206 if (flag & SOACC) 207 SOTOTPI(so)->sti_atime = now; 208 if (flag & SOMOD) 209 SOTOTPI(so)->sti_mtime = now; 210 mutex_exit(&so->so_lock); 211 } 212 213 extern so_create_func_t sock_comm_create_function; 214 extern so_destroy_func_t sock_comm_destroy_function; 215 /* 216 * Init function called when sockfs is loaded. 217 */ 218 int 219 sockinit(int fstype, char *name) 220 { 221 static const fs_operation_def_t sock_vfsops_template[] = { 222 NULL, NULL 223 }; 224 int error; 225 major_t dev; 226 char *err_str; 227 228 error = vfs_setfsops(fstype, sock_vfsops_template, NULL); 229 if (error != 0) { 230 zcmn_err(GLOBAL_ZONEID, CE_WARN, 231 "sockinit: bad vfs ops template"); 232 return (error); 233 } 234 235 error = vn_make_ops(name, socket_vnodeops_template, 236 &socket_vnodeops); 237 if (error != 0) { 238 err_str = "sockinit: bad socket vnode ops template"; 239 /* vn_make_ops() does not reset socktpi_vnodeops on failure. */ 240 socket_vnodeops = NULL; 241 goto failure; 242 } 243 244 socket_cache = kmem_cache_create("socket_cache", 245 sizeof (struct sonode), 0, sonode_constructor, 246 sonode_destructor, NULL, NULL, NULL, 0); 247 248 rw_init(&sockconf_lock, NULL, RW_DEFAULT, NULL); 249 250 error = socktpi_init(); 251 if (error != 0) { 252 err_str = NULL; 253 goto failure; 254 } 255 256 error = sod_init(); 257 if (error != 0) { 258 err_str = NULL; 259 goto failure; 260 } 261 262 /* 263 * Set up the default create and destroy functions 264 */ 265 sock_comm_create_function = socket_sonode_create; 266 sock_comm_destroy_function = socket_sonode_destroy; 267 268 /* 269 * Build initial list mapping socket parameters to vnode. 270 */ 271 smod_init(); 272 smod_add(sotpi_smod_create()); 273 274 sockparams_init(); 275 276 /* 277 * If sockets are needed before init runs /sbin/soconfig 278 * it is possible to preload the sockparams list here using 279 * calls like: 280 * sockconfig(1,2,3, "/dev/tcp", 0); 281 */ 282 283 /* 284 * Create a unique dev_t for use in so_fsid. 285 */ 286 287 if ((dev = getudev()) == (major_t)-1) 288 dev = 0; 289 sockdev = makedevice(dev, 0); 290 291 mutex_init(&socklist.sl_lock, NULL, MUTEX_DEFAULT, NULL); 292 sendfile_init(); 293 if (!modrootloaded) { 294 sockfs_defer_nl7c_init = 1; 295 } else { 296 nl7c_init(); 297 } 298 299 /* Initialize socket filters */ 300 sof_init(); 301 302 return (0); 303 304 failure: 305 (void) vfs_freevfsops_by_type(fstype); 306 if (socket_vnodeops != NULL) 307 vn_freevnodeops(socket_vnodeops); 308 if (err_str != NULL) 309 zcmn_err(GLOBAL_ZONEID, CE_WARN, err_str); 310 return (error); 311 } 312 313 /* 314 * Caller must hold the mutex. Used to set SOLOCKED. 315 */ 316 void 317 so_lock_single(struct sonode *so) 318 { 319 ASSERT(MUTEX_HELD(&so->so_lock)); 320 321 while (so->so_flag & (SOLOCKED | SOASYNC_UNBIND)) { 322 cv_wait_stop(&so->so_single_cv, &so->so_lock, 323 SO_LOCK_WAKEUP_TIME); 324 } 325 so->so_flag |= SOLOCKED; 326 } 327 328 /* 329 * Caller must hold the mutex and pass in SOLOCKED or SOASYNC_UNBIND. 330 * Used to clear SOLOCKED or SOASYNC_UNBIND. 331 */ 332 void 333 so_unlock_single(struct sonode *so, int flag) 334 { 335 ASSERT(MUTEX_HELD(&so->so_lock)); 336 ASSERT(flag & (SOLOCKED|SOASYNC_UNBIND)); 337 ASSERT((flag & ~(SOLOCKED|SOASYNC_UNBIND)) == 0); 338 ASSERT(so->so_flag & flag); 339 /* 340 * Process the T_DISCON_IND on sti_discon_ind_mp. 341 * 342 * Call to so_drain_discon_ind will result in so_lock 343 * being dropped and re-acquired later. 344 */ 345 if (!SOCK_IS_NONSTR(so)) { 346 sotpi_info_t *sti = SOTOTPI(so); 347 348 if (sti->sti_discon_ind_mp != NULL) 349 so_drain_discon_ind(so); 350 } 351 352 cv_signal(&so->so_single_cv); 353 so->so_flag &= ~flag; 354 } 355 356 /* 357 * Caller must hold the mutex. Used to set SOREADLOCKED. 358 * If the caller wants nonblocking behavior it should set fmode. 359 */ 360 int 361 so_lock_read(struct sonode *so, int fmode) 362 { 363 ASSERT(MUTEX_HELD(&so->so_lock)); 364 365 while (so->so_flag & SOREADLOCKED) { 366 if (fmode & (FNDELAY|FNONBLOCK)) 367 return (EWOULDBLOCK); 368 cv_wait_stop(&so->so_read_cv, &so->so_lock, 369 SO_LOCK_WAKEUP_TIME); 370 } 371 so->so_flag |= SOREADLOCKED; 372 return (0); 373 } 374 375 /* 376 * Like so_lock_read above but allows signals. 377 */ 378 int 379 so_lock_read_intr(struct sonode *so, int fmode) 380 { 381 ASSERT(MUTEX_HELD(&so->so_lock)); 382 383 while (so->so_flag & SOREADLOCKED) { 384 if (fmode & (FNDELAY|FNONBLOCK)) 385 return (EWOULDBLOCK); 386 if (!cv_wait_sig(&so->so_read_cv, &so->so_lock)) 387 return (EINTR); 388 } 389 so->so_flag |= SOREADLOCKED; 390 return (0); 391 } 392 393 /* 394 * Caller must hold the mutex. Used to clear SOREADLOCKED, 395 * set in so_lock_read() or so_lock_read_intr(). 396 */ 397 void 398 so_unlock_read(struct sonode *so) 399 { 400 ASSERT(MUTEX_HELD(&so->so_lock)); 401 ASSERT(so->so_flag & SOREADLOCKED); 402 403 cv_signal(&so->so_read_cv); 404 so->so_flag &= ~SOREADLOCKED; 405 } 406 407 /* 408 * Verify that the specified offset falls within the mblk and 409 * that the resulting pointer is aligned. 410 * Returns NULL if not. 411 */ 412 void * 413 sogetoff(mblk_t *mp, t_uscalar_t offset, 414 t_uscalar_t length, uint_t align_size) 415 { 416 uintptr_t ptr1, ptr2; 417 418 ASSERT(mp && mp->b_wptr >= mp->b_rptr); 419 ptr1 = (uintptr_t)mp->b_rptr + offset; 420 ptr2 = (uintptr_t)ptr1 + length; 421 if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) { 422 eprintline(0); 423 return (NULL); 424 } 425 if ((ptr1 & (align_size - 1)) != 0) { 426 eprintline(0); 427 return (NULL); 428 } 429 return ((void *)ptr1); 430 } 431 432 /* 433 * Return the AF_UNIX underlying filesystem vnode matching a given name. 434 * Makes sure the sending and the destination sonodes are compatible. 435 * The vnode is returned held. 436 * 437 * The underlying filesystem VSOCK vnode has a v_stream pointer that 438 * references the actual stream head (hence indirectly the actual sonode). 439 */ 440 static int 441 so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess, 442 vnode_t **vpp) 443 { 444 vnode_t *vp; /* Underlying filesystem vnode */ 445 vnode_t *rvp; /* real vnode */ 446 vnode_t *svp; /* sockfs vnode */ 447 struct sonode *so2; 448 int error; 449 450 dprintso(so, 1, ("so_ux_lookup(%p) name <%s>\n", (void *)so, 451 soun->sun_path)); 452 453 error = lookupname(soun->sun_path, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); 454 if (error) { 455 eprintsoline(so, error); 456 return (error); 457 } 458 459 /* 460 * Traverse lofs mounts get the real vnode 461 */ 462 if (VOP_REALVP(vp, &rvp, NULL) == 0) { 463 VN_HOLD(rvp); /* hold the real vnode */ 464 VN_RELE(vp); /* release hold from lookup */ 465 vp = rvp; 466 } 467 468 if (vp->v_type != VSOCK) { 469 error = ENOTSOCK; 470 eprintsoline(so, error); 471 goto done2; 472 } 473 474 if (checkaccess) { 475 /* 476 * Check that we have permissions to access the destination 477 * vnode. This check is not done in BSD but it is required 478 * by X/Open. 479 */ 480 if (error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL)) { 481 eprintsoline(so, error); 482 goto done2; 483 } 484 } 485 486 /* 487 * Check if the remote socket has been closed. 488 * 489 * Synchronize with vn_rele_stream by holding v_lock while traversing 490 * v_stream->sd_vnode. 491 */ 492 mutex_enter(&vp->v_lock); 493 if (vp->v_stream == NULL) { 494 mutex_exit(&vp->v_lock); 495 if (so->so_type == SOCK_DGRAM) 496 error = EDESTADDRREQ; 497 else 498 error = ECONNREFUSED; 499 500 eprintsoline(so, error); 501 goto done2; 502 } 503 ASSERT(vp->v_stream->sd_vnode); 504 svp = vp->v_stream->sd_vnode; 505 /* 506 * holding v_lock on underlying filesystem vnode and acquiring 507 * it on sockfs vnode. Assumes that no code ever attempts to 508 * acquire these locks in the reverse order. 509 */ 510 VN_HOLD(svp); 511 mutex_exit(&vp->v_lock); 512 513 if (svp->v_type != VSOCK) { 514 error = ENOTSOCK; 515 eprintsoline(so, error); 516 goto done; 517 } 518 519 so2 = VTOSO(svp); 520 521 if (so->so_type != so2->so_type) { 522 error = EPROTOTYPE; 523 eprintsoline(so, error); 524 goto done; 525 } 526 527 VN_RELE(svp); 528 *vpp = vp; 529 return (0); 530 531 done: 532 VN_RELE(svp); 533 done2: 534 VN_RELE(vp); 535 return (error); 536 } 537 538 /* 539 * Verify peer address for connect and sendto/sendmsg. 540 * Since sendto/sendmsg would not get synchronous errors from the transport 541 * provider we have to do these ugly checks in the socket layer to 542 * preserve compatibility with SunOS 4.X. 543 */ 544 int 545 so_addr_verify(struct sonode *so, const struct sockaddr *name, 546 socklen_t namelen) 547 { 548 int family; 549 550 dprintso(so, 1, ("so_addr_verify(%p, %p, %d)\n", 551 (void *)so, (void *)name, namelen)); 552 553 ASSERT(name != NULL); 554 555 family = so->so_family; 556 switch (family) { 557 case AF_INET: 558 if (name->sa_family != family) { 559 eprintsoline(so, EAFNOSUPPORT); 560 return (EAFNOSUPPORT); 561 } 562 if (namelen != (socklen_t)sizeof (struct sockaddr_in)) { 563 eprintsoline(so, EINVAL); 564 return (EINVAL); 565 } 566 break; 567 case AF_INET6: { 568 #ifdef DEBUG 569 struct sockaddr_in6 *sin6; 570 #endif /* DEBUG */ 571 572 if (name->sa_family != family) { 573 eprintsoline(so, EAFNOSUPPORT); 574 return (EAFNOSUPPORT); 575 } 576 if (namelen != (socklen_t)sizeof (struct sockaddr_in6)) { 577 eprintsoline(so, EINVAL); 578 return (EINVAL); 579 } 580 #ifdef DEBUG 581 /* Verify that apps don't forget to clear sin6_scope_id etc */ 582 sin6 = (struct sockaddr_in6 *)name; 583 if (sin6->sin6_scope_id != 0 && 584 !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) { 585 zcmn_err(getzoneid(), CE_WARN, 586 "connect/send* with uninitialized sin6_scope_id " 587 "(%d) on socket. Pid = %d\n", 588 (int)sin6->sin6_scope_id, (int)curproc->p_pid); 589 } 590 #endif /* DEBUG */ 591 break; 592 } 593 case AF_UNIX: 594 if (SOTOTPI(so)->sti_faddr_noxlate) { 595 return (0); 596 } 597 if (namelen < (socklen_t)sizeof (short)) { 598 eprintsoline(so, ENOENT); 599 return (ENOENT); 600 } 601 if (name->sa_family != family) { 602 eprintsoline(so, EAFNOSUPPORT); 603 return (EAFNOSUPPORT); 604 } 605 /* MAXPATHLEN + soun_family + nul termination */ 606 if (namelen > (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) { 607 eprintsoline(so, ENAMETOOLONG); 608 return (ENAMETOOLONG); 609 } 610 611 break; 612 613 default: 614 /* 615 * Default is don't do any length or sa_family check 616 * to allow non-sockaddr style addresses. 617 */ 618 break; 619 } 620 621 return (0); 622 } 623 624 625 /* 626 * Translate an AF_UNIX sockaddr_un to the transport internal name. 627 * Assumes caller has called so_addr_verify first. 628 */ 629 /*ARGSUSED*/ 630 int 631 so_ux_addr_xlate(struct sonode *so, struct sockaddr *name, 632 socklen_t namelen, int checkaccess, 633 void **addrp, socklen_t *addrlenp) 634 { 635 int error; 636 struct sockaddr_un *soun; 637 vnode_t *vp; 638 void *addr; 639 socklen_t addrlen; 640 sotpi_info_t *sti = SOTOTPI(so); 641 642 dprintso(so, 1, ("so_ux_addr_xlate(%p, %p, %d, %d)\n", 643 (void *)so, (void *)name, namelen, checkaccess)); 644 645 ASSERT(name != NULL); 646 ASSERT(so->so_family == AF_UNIX); 647 ASSERT(!sti->sti_faddr_noxlate); 648 ASSERT(namelen >= (socklen_t)sizeof (short)); 649 ASSERT(name->sa_family == AF_UNIX); 650 soun = (struct sockaddr_un *)name; 651 /* 652 * Lookup vnode for the specified path name and verify that 653 * it is a socket. 654 */ 655 error = so_ux_lookup(so, soun, checkaccess, &vp); 656 if (error) { 657 eprintsoline(so, error); 658 return (error); 659 } 660 /* 661 * Use the address of the peer vnode as the address to send 662 * to. We release the peer vnode here. In case it has been 663 * closed by the time the T_CONN_REQ or T_UNIDATA_REQ reaches the 664 * transport the message will get an error or be dropped. 665 */ 666 sti->sti_ux_faddr.soua_vp = vp; 667 sti->sti_ux_faddr.soua_magic = SOU_MAGIC_EXPLICIT; 668 addr = &sti->sti_ux_faddr; 669 addrlen = (socklen_t)sizeof (sti->sti_ux_faddr); 670 dprintso(so, 1, ("ux_xlate UNIX: addrlen %d, vp %p\n", 671 addrlen, (void *)vp)); 672 VN_RELE(vp); 673 *addrp = addr; 674 *addrlenp = (socklen_t)addrlen; 675 return (0); 676 } 677 678 /* 679 * Esballoc free function for messages that contain SO_FILEP option. 680 * Decrement the reference count on the file pointers using closef. 681 */ 682 void 683 fdbuf_free(struct fdbuf *fdbuf) 684 { 685 int i; 686 struct file *fp; 687 688 dprint(1, ("fdbuf_free: %d fds\n", fdbuf->fd_numfd)); 689 for (i = 0; i < fdbuf->fd_numfd; i++) { 690 /* 691 * We need pointer size alignment for fd_fds. On a LP64 692 * kernel, the required alignment is 8 bytes while 693 * the option headers and values are only 4 bytes 694 * aligned. So its safer to do a bcopy compared to 695 * assigning fdbuf->fd_fds[i] to fp. 696 */ 697 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp)); 698 dprint(1, ("fdbuf_free: [%d] = %p\n", i, (void *)fp)); 699 (void) closef(fp); 700 } 701 if (fdbuf->fd_ebuf != NULL) 702 kmem_free(fdbuf->fd_ebuf, fdbuf->fd_ebuflen); 703 kmem_free(fdbuf, fdbuf->fd_size); 704 } 705 706 /* 707 * Allocate an esballoc'ed message for AF_UNIX file descriptor passing. 708 * Waits if memory is not available. 709 */ 710 mblk_t * 711 fdbuf_allocmsg(int size, struct fdbuf *fdbuf) 712 { 713 uchar_t *buf; 714 mblk_t *mp; 715 716 dprint(1, ("fdbuf_allocmsg: size %d, %d fds\n", size, fdbuf->fd_numfd)); 717 buf = kmem_alloc(size, KM_SLEEP); 718 fdbuf->fd_ebuf = (caddr_t)buf; 719 fdbuf->fd_ebuflen = size; 720 fdbuf->fd_frtn.free_func = fdbuf_free; 721 fdbuf->fd_frtn.free_arg = (caddr_t)fdbuf; 722 723 mp = esballoc_wait(buf, size, BPRI_MED, &fdbuf->fd_frtn); 724 mp->b_datap->db_type = M_PROTO; 725 return (mp); 726 } 727 728 /* 729 * Extract file descriptors from a fdbuf. 730 * Return list in rights/rightslen. 731 */ 732 /*ARGSUSED*/ 733 static int 734 fdbuf_extract(struct fdbuf *fdbuf, void *rights, int rightslen) 735 { 736 int i, fd; 737 int *rp; 738 struct file *fp; 739 int numfd; 740 741 dprint(1, ("fdbuf_extract: %d fds, len %d\n", 742 fdbuf->fd_numfd, rightslen)); 743 744 numfd = fdbuf->fd_numfd; 745 ASSERT(rightslen == numfd * (int)sizeof (int)); 746 747 /* 748 * Allocate a file descriptor and increment the f_count. 749 * The latter is needed since we always call fdbuf_free 750 * which performs a closef. 751 */ 752 rp = (int *)rights; 753 for (i = 0; i < numfd; i++) { 754 if ((fd = ufalloc(0)) == -1) 755 goto cleanup; 756 /* 757 * We need pointer size alignment for fd_fds. On a LP64 758 * kernel, the required alignment is 8 bytes while 759 * the option headers and values are only 4 bytes 760 * aligned. So its safer to do a bcopy compared to 761 * assigning fdbuf->fd_fds[i] to fp. 762 */ 763 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp)); 764 mutex_enter(&fp->f_tlock); 765 fp->f_count++; 766 mutex_exit(&fp->f_tlock); 767 setf(fd, fp); 768 *rp++ = fd; 769 if (AU_AUDITING()) 770 audit_fdrecv(fd, fp); 771 dprint(1, ("fdbuf_extract: [%d] = %d, %p refcnt %d\n", 772 i, fd, (void *)fp, fp->f_count)); 773 } 774 return (0); 775 776 cleanup: 777 /* 778 * Undo whatever partial work the loop above has done. 779 */ 780 { 781 int j; 782 783 rp = (int *)rights; 784 for (j = 0; j < i; j++) { 785 dprint(0, 786 ("fdbuf_extract: cleanup[%d] = %d\n", j, *rp)); 787 (void) closeandsetf(*rp++, NULL); 788 } 789 } 790 791 return (EMFILE); 792 } 793 794 /* 795 * Insert file descriptors into an fdbuf. 796 * Returns a kmem_alloc'ed fdbuf. The fdbuf should be freed 797 * by calling fdbuf_free(). 798 */ 799 int 800 fdbuf_create(void *rights, int rightslen, struct fdbuf **fdbufp) 801 { 802 int numfd, i; 803 int *fds; 804 struct file *fp; 805 struct fdbuf *fdbuf; 806 int fdbufsize; 807 808 dprint(1, ("fdbuf_create: len %d\n", rightslen)); 809 810 numfd = rightslen / (int)sizeof (int); 811 812 fdbufsize = (int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *)); 813 fdbuf = kmem_alloc(fdbufsize, KM_SLEEP); 814 fdbuf->fd_size = fdbufsize; 815 fdbuf->fd_numfd = 0; 816 fdbuf->fd_ebuf = NULL; 817 fdbuf->fd_ebuflen = 0; 818 fds = (int *)rights; 819 for (i = 0; i < numfd; i++) { 820 if ((fp = getf(fds[i])) == NULL) { 821 fdbuf_free(fdbuf); 822 return (EBADF); 823 } 824 dprint(1, ("fdbuf_create: [%d] = %d, %p refcnt %d\n", 825 i, fds[i], (void *)fp, fp->f_count)); 826 mutex_enter(&fp->f_tlock); 827 fp->f_count++; 828 mutex_exit(&fp->f_tlock); 829 /* 830 * The maximum alignment for fdbuf (or any option header 831 * and its value) it 4 bytes. On a LP64 kernel, the alignment 832 * is not sufficient for pointers (fd_fds in this case). Since 833 * we just did a kmem_alloc (we get a double word alignment), 834 * we don't need to do anything on the send side (we loose 835 * the double word alignment because fdbuf goes after an 836 * option header (eg T_unitdata_req) which is only 4 byte 837 * aligned). We take care of this when we extract the file 838 * descriptor in fdbuf_extract or fdbuf_free. 839 */ 840 fdbuf->fd_fds[i] = fp; 841 fdbuf->fd_numfd++; 842 releasef(fds[i]); 843 if (AU_AUDITING()) 844 audit_fdsend(fds[i], fp, 0); 845 } 846 *fdbufp = fdbuf; 847 return (0); 848 } 849 850 static int 851 fdbuf_optlen(int rightslen) 852 { 853 int numfd; 854 855 numfd = rightslen / (int)sizeof (int); 856 857 return ((int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *))); 858 } 859 860 static t_uscalar_t 861 fdbuf_cmsglen(int fdbuflen) 862 { 863 return (t_uscalar_t)((fdbuflen - FDBUF_HDRSIZE) / 864 (int)sizeof (struct file *) * (int)sizeof (int)); 865 } 866 867 868 /* 869 * Return non-zero if the mblk and fdbuf are consistent. 870 */ 871 static int 872 fdbuf_verify(mblk_t *mp, struct fdbuf *fdbuf, int fdbuflen) 873 { 874 if (fdbuflen >= FDBUF_HDRSIZE && 875 fdbuflen == fdbuf->fd_size) { 876 frtn_t *frp = mp->b_datap->db_frtnp; 877 /* 878 * Check that the SO_FILEP portion of the 879 * message has not been modified by 880 * the loopback transport. The sending sockfs generates 881 * a message that is esballoc'ed with the free function 882 * being fdbuf_free() and where free_arg contains the 883 * identical information as the SO_FILEP content. 884 * 885 * If any of these constraints are not satisfied we 886 * silently ignore the option. 887 */ 888 ASSERT(mp); 889 if (frp != NULL && 890 frp->free_func == fdbuf_free && 891 frp->free_arg != NULL && 892 bcmp(frp->free_arg, fdbuf, fdbuflen) == 0) { 893 dprint(1, ("fdbuf_verify: fdbuf %p len %d\n", 894 (void *)fdbuf, fdbuflen)); 895 return (1); 896 } else { 897 zcmn_err(getzoneid(), CE_WARN, 898 "sockfs: mismatched fdbuf content (%p)", 899 (void *)mp); 900 return (0); 901 } 902 } else { 903 zcmn_err(getzoneid(), CE_WARN, 904 "sockfs: mismatched fdbuf len %d, %d\n", 905 fdbuflen, fdbuf->fd_size); 906 return (0); 907 } 908 } 909 910 /* 911 * When the file descriptors returned by sorecvmsg can not be passed 912 * to the application this routine will cleanup the references on 913 * the files. Start at startoff bytes into the buffer. 914 */ 915 static void 916 close_fds(void *fdbuf, int fdbuflen, int startoff) 917 { 918 int *fds = (int *)fdbuf; 919 int numfd = fdbuflen / (int)sizeof (int); 920 int i; 921 922 dprint(1, ("close_fds(%p, %d, %d)\n", fdbuf, fdbuflen, startoff)); 923 924 for (i = 0; i < numfd; i++) { 925 if (startoff < 0) 926 startoff = 0; 927 if (startoff < (int)sizeof (int)) { 928 /* 929 * This file descriptor is partially or fully after 930 * the offset 931 */ 932 dprint(0, 933 ("close_fds: cleanup[%d] = %d\n", i, fds[i])); 934 (void) closeandsetf(fds[i], NULL); 935 } 936 startoff -= (int)sizeof (int); 937 } 938 } 939 940 /* 941 * Close all file descriptors contained in the control part starting at 942 * the startoffset. 943 */ 944 void 945 so_closefds(void *control, t_uscalar_t controllen, int oldflg, 946 int startoff) 947 { 948 struct cmsghdr *cmsg; 949 950 if (control == NULL) 951 return; 952 953 if (oldflg) { 954 close_fds(control, controllen, startoff); 955 return; 956 } 957 /* Scan control part for file descriptors. */ 958 for (cmsg = (struct cmsghdr *)control; 959 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 960 cmsg = CMSG_NEXT(cmsg)) { 961 if (cmsg->cmsg_level == SOL_SOCKET && 962 cmsg->cmsg_type == SCM_RIGHTS) { 963 close_fds(CMSG_CONTENT(cmsg), 964 (int)CMSG_CONTENTLEN(cmsg), 965 startoff - (int)sizeof (struct cmsghdr)); 966 } 967 startoff -= cmsg->cmsg_len; 968 } 969 } 970 971 /* 972 * Returns a pointer/length for the file descriptors contained 973 * in the control buffer. Returns with *fdlenp == -1 if there are no 974 * file descriptor options present. This is different than there being 975 * a zero-length file descriptor option. 976 * Fail if there are multiple SCM_RIGHT cmsgs. 977 */ 978 int 979 so_getfdopt(void *control, t_uscalar_t controllen, int oldflg, 980 void **fdsp, int *fdlenp) 981 { 982 struct cmsghdr *cmsg; 983 void *fds; 984 int fdlen; 985 986 if (control == NULL) { 987 *fdsp = NULL; 988 *fdlenp = -1; 989 return (0); 990 } 991 992 if (oldflg) { 993 *fdsp = control; 994 if (controllen == 0) 995 *fdlenp = -1; 996 else 997 *fdlenp = controllen; 998 dprint(1, ("so_getfdopt: old %d\n", *fdlenp)); 999 return (0); 1000 } 1001 1002 fds = NULL; 1003 fdlen = 0; 1004 1005 for (cmsg = (struct cmsghdr *)control; 1006 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 1007 cmsg = CMSG_NEXT(cmsg)) { 1008 if (cmsg->cmsg_level == SOL_SOCKET && 1009 cmsg->cmsg_type == SCM_RIGHTS) { 1010 if (fds != NULL) 1011 return (EINVAL); 1012 fds = CMSG_CONTENT(cmsg); 1013 fdlen = (int)CMSG_CONTENTLEN(cmsg); 1014 dprint(1, ("so_getfdopt: new %lu\n", 1015 (size_t)CMSG_CONTENTLEN(cmsg))); 1016 } 1017 } 1018 if (fds == NULL) { 1019 dprint(1, ("so_getfdopt: NONE\n")); 1020 *fdlenp = -1; 1021 } else 1022 *fdlenp = fdlen; 1023 *fdsp = fds; 1024 return (0); 1025 } 1026 1027 /* 1028 * Return the length of the options including any file descriptor options. 1029 */ 1030 t_uscalar_t 1031 so_optlen(void *control, t_uscalar_t controllen, int oldflg) 1032 { 1033 struct cmsghdr *cmsg; 1034 t_uscalar_t optlen = 0; 1035 t_uscalar_t len; 1036 1037 if (control == NULL) 1038 return (0); 1039 1040 if (oldflg) 1041 return ((t_uscalar_t)(sizeof (struct T_opthdr) + 1042 fdbuf_optlen(controllen))); 1043 1044 for (cmsg = (struct cmsghdr *)control; 1045 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 1046 cmsg = CMSG_NEXT(cmsg)) { 1047 if (cmsg->cmsg_level == SOL_SOCKET && 1048 cmsg->cmsg_type == SCM_RIGHTS) { 1049 len = fdbuf_optlen((int)CMSG_CONTENTLEN(cmsg)); 1050 } else { 1051 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg); 1052 } 1053 optlen += (t_uscalar_t)(_TPI_ALIGN_TOPT(len) + 1054 sizeof (struct T_opthdr)); 1055 } 1056 dprint(1, ("so_optlen: controllen %d, flg %d -> optlen %d\n", 1057 controllen, oldflg, optlen)); 1058 return (optlen); 1059 } 1060 1061 /* 1062 * Copy options from control to the mblk. Skip any file descriptor options. 1063 */ 1064 void 1065 so_cmsg2opt(void *control, t_uscalar_t controllen, int oldflg, mblk_t *mp) 1066 { 1067 struct T_opthdr toh; 1068 struct cmsghdr *cmsg; 1069 1070 if (control == NULL) 1071 return; 1072 1073 if (oldflg) { 1074 /* No real options - caller has handled file descriptors */ 1075 return; 1076 } 1077 for (cmsg = (struct cmsghdr *)control; 1078 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 1079 cmsg = CMSG_NEXT(cmsg)) { 1080 /* 1081 * Note: The caller handles file descriptors prior 1082 * to calling this function. 1083 */ 1084 t_uscalar_t len; 1085 1086 if (cmsg->cmsg_level == SOL_SOCKET && 1087 cmsg->cmsg_type == SCM_RIGHTS) 1088 continue; 1089 1090 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg); 1091 toh.level = cmsg->cmsg_level; 1092 toh.name = cmsg->cmsg_type; 1093 toh.len = len + (t_uscalar_t)sizeof (struct T_opthdr); 1094 toh.status = 0; 1095 1096 soappendmsg(mp, &toh, sizeof (toh)); 1097 soappendmsg(mp, CMSG_CONTENT(cmsg), len); 1098 mp->b_wptr += _TPI_ALIGN_TOPT(len) - len; 1099 ASSERT(mp->b_wptr <= mp->b_datap->db_lim); 1100 } 1101 } 1102 1103 /* 1104 * Return the length of the control message derived from the options. 1105 * Exclude SO_SRCADDR and SO_UNIX_CLOSE options. Include SO_FILEP. 1106 * When oldflg is set only include SO_FILEP. 1107 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen 1108 * allocates the space that so_opt2cmsg fills. If one changes, the other should 1109 * also be checked for any possible impacts. 1110 */ 1111 t_uscalar_t 1112 so_cmsglen(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg) 1113 { 1114 t_uscalar_t cmsglen = 0; 1115 struct T_opthdr *tohp; 1116 t_uscalar_t len; 1117 t_uscalar_t last_roundup = 0; 1118 1119 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1120 1121 for (tohp = (struct T_opthdr *)opt; 1122 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1123 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1124 dprint(1, ("so_cmsglen: level 0x%x, name %d, len %d\n", 1125 tohp->level, tohp->name, tohp->len)); 1126 if (tohp->level == SOL_SOCKET && 1127 (tohp->name == SO_SRCADDR || 1128 tohp->name == SO_UNIX_CLOSE)) { 1129 continue; 1130 } 1131 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) { 1132 struct fdbuf *fdbuf; 1133 int fdbuflen; 1134 1135 fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp); 1136 fdbuflen = (int)_TPI_TOPT_DATALEN(tohp); 1137 1138 if (!fdbuf_verify(mp, fdbuf, fdbuflen)) 1139 continue; 1140 if (oldflg) { 1141 cmsglen += fdbuf_cmsglen(fdbuflen); 1142 continue; 1143 } 1144 len = fdbuf_cmsglen(fdbuflen); 1145 } else if (tohp->level == SOL_SOCKET && 1146 tohp->name == SCM_TIMESTAMP) { 1147 if (oldflg) 1148 continue; 1149 1150 if (get_udatamodel() == DATAMODEL_NATIVE) { 1151 len = sizeof (struct timeval); 1152 } else { 1153 len = sizeof (struct timeval32); 1154 } 1155 } else { 1156 if (oldflg) 1157 continue; 1158 len = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp); 1159 } 1160 /* 1161 * Exclude roundup for last option to not set 1162 * MSG_CTRUNC when the cmsg fits but the padding doesn't fit. 1163 */ 1164 last_roundup = (t_uscalar_t) 1165 (ROUNDUP_cmsglen(len + (int)sizeof (struct cmsghdr)) - 1166 (len + (int)sizeof (struct cmsghdr))); 1167 cmsglen += (t_uscalar_t)(len + (int)sizeof (struct cmsghdr)) + 1168 last_roundup; 1169 } 1170 cmsglen -= last_roundup; 1171 dprint(1, ("so_cmsglen: optlen %d, flg %d -> cmsglen %d\n", 1172 optlen, oldflg, cmsglen)); 1173 return (cmsglen); 1174 } 1175 1176 /* 1177 * Copy options from options to the control. Convert SO_FILEP to 1178 * file descriptors. 1179 * Returns errno or zero. 1180 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen 1181 * allocates the space that so_opt2cmsg fills. If one changes, the other should 1182 * also be checked for any possible impacts. 1183 */ 1184 int 1185 so_opt2cmsg(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg, 1186 void *control, t_uscalar_t controllen) 1187 { 1188 struct T_opthdr *tohp; 1189 struct cmsghdr *cmsg; 1190 struct fdbuf *fdbuf; 1191 int fdbuflen; 1192 int error; 1193 #if defined(DEBUG) || defined(__lint) 1194 struct cmsghdr *cend = (struct cmsghdr *) 1195 (((uint8_t *)control) + ROUNDUP_cmsglen(controllen)); 1196 #endif 1197 cmsg = (struct cmsghdr *)control; 1198 1199 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1200 1201 for (tohp = (struct T_opthdr *)opt; 1202 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1203 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1204 dprint(1, ("so_opt2cmsg: level 0x%x, name %d, len %d\n", 1205 tohp->level, tohp->name, tohp->len)); 1206 1207 if (tohp->level == SOL_SOCKET && 1208 (tohp->name == SO_SRCADDR || 1209 tohp->name == SO_UNIX_CLOSE)) { 1210 continue; 1211 } 1212 ASSERT((uintptr_t)cmsg <= (uintptr_t)control + controllen); 1213 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) { 1214 fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp); 1215 fdbuflen = (int)_TPI_TOPT_DATALEN(tohp); 1216 1217 if (!fdbuf_verify(mp, fdbuf, fdbuflen)) 1218 return (EPROTO); 1219 if (oldflg) { 1220 error = fdbuf_extract(fdbuf, control, 1221 (int)controllen); 1222 if (error != 0) 1223 return (error); 1224 continue; 1225 } else { 1226 int fdlen; 1227 1228 fdlen = (int)fdbuf_cmsglen( 1229 (int)_TPI_TOPT_DATALEN(tohp)); 1230 1231 cmsg->cmsg_level = tohp->level; 1232 cmsg->cmsg_type = SCM_RIGHTS; 1233 cmsg->cmsg_len = (socklen_t)(fdlen + 1234 sizeof (struct cmsghdr)); 1235 1236 error = fdbuf_extract(fdbuf, 1237 CMSG_CONTENT(cmsg), fdlen); 1238 if (error != 0) 1239 return (error); 1240 } 1241 } else if (tohp->level == SOL_SOCKET && 1242 tohp->name == SCM_TIMESTAMP) { 1243 timestruc_t *timestamp; 1244 1245 if (oldflg) 1246 continue; 1247 1248 cmsg->cmsg_level = tohp->level; 1249 cmsg->cmsg_type = tohp->name; 1250 1251 timestamp = 1252 (timestruc_t *)P2ROUNDUP((intptr_t)&tohp[1], 1253 sizeof (intptr_t)); 1254 1255 if (get_udatamodel() == DATAMODEL_NATIVE) { 1256 struct timeval tv; 1257 1258 cmsg->cmsg_len = sizeof (struct timeval) + 1259 sizeof (struct cmsghdr); 1260 tv.tv_sec = timestamp->tv_sec; 1261 tv.tv_usec = timestamp->tv_nsec / 1262 (NANOSEC / MICROSEC); 1263 /* 1264 * on LP64 systems, the struct timeval in 1265 * the destination will not be 8-byte aligned, 1266 * so use bcopy to avoid alignment trouble 1267 */ 1268 bcopy(&tv, CMSG_CONTENT(cmsg), sizeof (tv)); 1269 } else { 1270 struct timeval32 *time32; 1271 1272 cmsg->cmsg_len = sizeof (struct timeval32) + 1273 sizeof (struct cmsghdr); 1274 time32 = (struct timeval32 *)CMSG_CONTENT(cmsg); 1275 time32->tv_sec = (time32_t)timestamp->tv_sec; 1276 time32->tv_usec = 1277 (int32_t)(timestamp->tv_nsec / 1278 (NANOSEC / MICROSEC)); 1279 } 1280 1281 } else { 1282 if (oldflg) 1283 continue; 1284 1285 cmsg->cmsg_level = tohp->level; 1286 cmsg->cmsg_type = tohp->name; 1287 cmsg->cmsg_len = (socklen_t)(_TPI_TOPT_DATALEN(tohp) + 1288 sizeof (struct cmsghdr)); 1289 1290 /* copy content to control data part */ 1291 bcopy(&tohp[1], CMSG_CONTENT(cmsg), 1292 CMSG_CONTENTLEN(cmsg)); 1293 } 1294 /* move to next CMSG structure! */ 1295 cmsg = CMSG_NEXT(cmsg); 1296 } 1297 dprint(1, ("so_opt2cmsg: buf %p len %d; cend %p; final cmsg %p\n", 1298 control, controllen, (void *)cend, (void *)cmsg)); 1299 ASSERT(cmsg <= cend); 1300 return (0); 1301 } 1302 1303 /* 1304 * Extract the SO_SRCADDR option value if present. 1305 */ 1306 void 1307 so_getopt_srcaddr(void *opt, t_uscalar_t optlen, void **srcp, 1308 t_uscalar_t *srclenp) 1309 { 1310 struct T_opthdr *tohp; 1311 1312 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1313 1314 ASSERT(srcp != NULL && srclenp != NULL); 1315 *srcp = NULL; 1316 *srclenp = 0; 1317 1318 for (tohp = (struct T_opthdr *)opt; 1319 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1320 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1321 dprint(1, ("so_getopt_srcaddr: level 0x%x, name %d, len %d\n", 1322 tohp->level, tohp->name, tohp->len)); 1323 if (tohp->level == SOL_SOCKET && 1324 tohp->name == SO_SRCADDR) { 1325 *srcp = _TPI_TOPT_DATA(tohp); 1326 *srclenp = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp); 1327 } 1328 } 1329 } 1330 1331 /* 1332 * Verify if the SO_UNIX_CLOSE option is present. 1333 */ 1334 int 1335 so_getopt_unix_close(void *opt, t_uscalar_t optlen) 1336 { 1337 struct T_opthdr *tohp; 1338 1339 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1340 1341 for (tohp = (struct T_opthdr *)opt; 1342 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1343 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1344 dprint(1, 1345 ("so_getopt_unix_close: level 0x%x, name %d, len %d\n", 1346 tohp->level, tohp->name, tohp->len)); 1347 if (tohp->level == SOL_SOCKET && 1348 tohp->name == SO_UNIX_CLOSE) 1349 return (1); 1350 } 1351 return (0); 1352 } 1353 1354 /* 1355 * Allocate an M_PROTO message. 1356 * 1357 * If allocation fails the behavior depends on sleepflg: 1358 * _ALLOC_NOSLEEP fail immediately 1359 * _ALLOC_INTR sleep for memory until a signal is caught 1360 * _ALLOC_SLEEP sleep forever. Don't return NULL. 1361 */ 1362 mblk_t * 1363 soallocproto(size_t size, int sleepflg, cred_t *cr) 1364 { 1365 mblk_t *mp; 1366 1367 /* Round up size for reuse */ 1368 size = MAX(size, 64); 1369 if (cr != NULL) 1370 mp = allocb_cred(size, cr, curproc->p_pid); 1371 else 1372 mp = allocb(size, BPRI_MED); 1373 1374 if (mp == NULL) { 1375 int error; /* Dummy - error not returned to caller */ 1376 1377 switch (sleepflg) { 1378 case _ALLOC_SLEEP: 1379 if (cr != NULL) { 1380 mp = allocb_cred_wait(size, STR_NOSIG, &error, 1381 cr, curproc->p_pid); 1382 } else { 1383 mp = allocb_wait(size, BPRI_MED, STR_NOSIG, 1384 &error); 1385 } 1386 ASSERT(mp); 1387 break; 1388 case _ALLOC_INTR: 1389 if (cr != NULL) { 1390 mp = allocb_cred_wait(size, 0, &error, cr, 1391 curproc->p_pid); 1392 } else { 1393 mp = allocb_wait(size, BPRI_MED, 0, &error); 1394 } 1395 if (mp == NULL) { 1396 /* Caught signal while sleeping for memory */ 1397 eprintline(ENOBUFS); 1398 return (NULL); 1399 } 1400 break; 1401 case _ALLOC_NOSLEEP: 1402 default: 1403 eprintline(ENOBUFS); 1404 return (NULL); 1405 } 1406 } 1407 DB_TYPE(mp) = M_PROTO; 1408 return (mp); 1409 } 1410 1411 /* 1412 * Allocate an M_PROTO message with a single component. 1413 * len is the length of buf. size is the amount to allocate. 1414 * 1415 * buf can be NULL with a non-zero len. 1416 * This results in a bzero'ed chunk being placed the message. 1417 */ 1418 mblk_t * 1419 soallocproto1(const void *buf, ssize_t len, ssize_t size, int sleepflg, 1420 cred_t *cr) 1421 { 1422 mblk_t *mp; 1423 1424 if (size == 0) 1425 size = len; 1426 1427 ASSERT(size >= len); 1428 /* Round up size for reuse */ 1429 size = MAX(size, 64); 1430 mp = soallocproto(size, sleepflg, cr); 1431 if (mp == NULL) 1432 return (NULL); 1433 mp->b_datap->db_type = M_PROTO; 1434 if (len != 0) { 1435 if (buf != NULL) 1436 bcopy(buf, mp->b_wptr, len); 1437 else 1438 bzero(mp->b_wptr, len); 1439 mp->b_wptr += len; 1440 } 1441 return (mp); 1442 } 1443 1444 /* 1445 * Append buf/len to mp. 1446 * The caller has to ensure that there is enough room in the mblk. 1447 * 1448 * buf can be NULL with a non-zero len. 1449 * This results in a bzero'ed chunk being placed the message. 1450 */ 1451 void 1452 soappendmsg(mblk_t *mp, const void *buf, ssize_t len) 1453 { 1454 ASSERT(mp); 1455 1456 if (len != 0) { 1457 /* Assert for room left */ 1458 ASSERT(mp->b_datap->db_lim - mp->b_wptr >= len); 1459 if (buf != NULL) 1460 bcopy(buf, mp->b_wptr, len); 1461 else 1462 bzero(mp->b_wptr, len); 1463 } 1464 mp->b_wptr += len; 1465 } 1466 1467 /* 1468 * Create a message using two kernel buffers. 1469 * If size is set that will determine the allocation size (e.g. for future 1470 * soappendmsg calls). If size is zero it is derived from the buffer 1471 * lengths. 1472 */ 1473 mblk_t * 1474 soallocproto2(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2, 1475 ssize_t size, int sleepflg, cred_t *cr) 1476 { 1477 mblk_t *mp; 1478 1479 if (size == 0) 1480 size = len1 + len2; 1481 ASSERT(size >= len1 + len2); 1482 1483 mp = soallocproto1(buf1, len1, size, sleepflg, cr); 1484 if (mp) 1485 soappendmsg(mp, buf2, len2); 1486 return (mp); 1487 } 1488 1489 /* 1490 * Create a message using three kernel buffers. 1491 * If size is set that will determine the allocation size (for future 1492 * soappendmsg calls). If size is zero it is derived from the buffer 1493 * lengths. 1494 */ 1495 mblk_t * 1496 soallocproto3(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2, 1497 const void *buf3, ssize_t len3, ssize_t size, int sleepflg, cred_t *cr) 1498 { 1499 mblk_t *mp; 1500 1501 if (size == 0) 1502 size = len1 + len2 +len3; 1503 ASSERT(size >= len1 + len2 + len3); 1504 1505 mp = soallocproto1(buf1, len1, size, sleepflg, cr); 1506 if (mp != NULL) { 1507 soappendmsg(mp, buf2, len2); 1508 soappendmsg(mp, buf3, len3); 1509 } 1510 return (mp); 1511 } 1512 1513 #ifdef DEBUG 1514 char * 1515 pr_state(uint_t state, uint_t mode) 1516 { 1517 static char buf[1024]; 1518 1519 buf[0] = 0; 1520 if (state & SS_ISCONNECTED) 1521 (void) strcat(buf, "ISCONNECTED "); 1522 if (state & SS_ISCONNECTING) 1523 (void) strcat(buf, "ISCONNECTING "); 1524 if (state & SS_ISDISCONNECTING) 1525 (void) strcat(buf, "ISDISCONNECTING "); 1526 if (state & SS_CANTSENDMORE) 1527 (void) strcat(buf, "CANTSENDMORE "); 1528 1529 if (state & SS_CANTRCVMORE) 1530 (void) strcat(buf, "CANTRCVMORE "); 1531 if (state & SS_ISBOUND) 1532 (void) strcat(buf, "ISBOUND "); 1533 if (state & SS_NDELAY) 1534 (void) strcat(buf, "NDELAY "); 1535 if (state & SS_NONBLOCK) 1536 (void) strcat(buf, "NONBLOCK "); 1537 1538 if (state & SS_ASYNC) 1539 (void) strcat(buf, "ASYNC "); 1540 if (state & SS_ACCEPTCONN) 1541 (void) strcat(buf, "ACCEPTCONN "); 1542 if (state & SS_SAVEDEOR) 1543 (void) strcat(buf, "SAVEDEOR "); 1544 1545 if (state & SS_RCVATMARK) 1546 (void) strcat(buf, "RCVATMARK "); 1547 if (state & SS_OOBPEND) 1548 (void) strcat(buf, "OOBPEND "); 1549 if (state & SS_HAVEOOBDATA) 1550 (void) strcat(buf, "HAVEOOBDATA "); 1551 if (state & SS_HADOOBDATA) 1552 (void) strcat(buf, "HADOOBDATA "); 1553 1554 if (mode & SM_PRIV) 1555 (void) strcat(buf, "PRIV "); 1556 if (mode & SM_ATOMIC) 1557 (void) strcat(buf, "ATOMIC "); 1558 if (mode & SM_ADDR) 1559 (void) strcat(buf, "ADDR "); 1560 if (mode & SM_CONNREQUIRED) 1561 (void) strcat(buf, "CONNREQUIRED "); 1562 1563 if (mode & SM_FDPASSING) 1564 (void) strcat(buf, "FDPASSING "); 1565 if (mode & SM_EXDATA) 1566 (void) strcat(buf, "EXDATA "); 1567 if (mode & SM_OPTDATA) 1568 (void) strcat(buf, "OPTDATA "); 1569 if (mode & SM_BYTESTREAM) 1570 (void) strcat(buf, "BYTESTREAM "); 1571 return (buf); 1572 } 1573 1574 char * 1575 pr_addr(int family, struct sockaddr *addr, t_uscalar_t addrlen) 1576 { 1577 static char buf[1024]; 1578 1579 if (addr == NULL || addrlen == 0) { 1580 (void) sprintf(buf, "(len %d) %p", addrlen, (void *)addr); 1581 return (buf); 1582 } 1583 switch (family) { 1584 case AF_INET: { 1585 struct sockaddr_in sin; 1586 1587 bcopy(addr, &sin, sizeof (sin)); 1588 1589 (void) sprintf(buf, "(len %d) %x/%d", 1590 addrlen, ntohl(sin.sin_addr.s_addr), ntohs(sin.sin_port)); 1591 break; 1592 } 1593 case AF_INET6: { 1594 struct sockaddr_in6 sin6; 1595 uint16_t *piece = (uint16_t *)&sin6.sin6_addr; 1596 1597 bcopy((char *)addr, (char *)&sin6, sizeof (sin6)); 1598 (void) sprintf(buf, "(len %d) %x:%x:%x:%x:%x:%x:%x:%x/%d", 1599 addrlen, 1600 ntohs(piece[0]), ntohs(piece[1]), 1601 ntohs(piece[2]), ntohs(piece[3]), 1602 ntohs(piece[4]), ntohs(piece[5]), 1603 ntohs(piece[6]), ntohs(piece[7]), 1604 ntohs(sin6.sin6_port)); 1605 break; 1606 } 1607 case AF_UNIX: { 1608 struct sockaddr_un *soun = (struct sockaddr_un *)addr; 1609 1610 (void) sprintf(buf, "(len %d) %s", addrlen, 1611 (soun == NULL) ? "(none)" : soun->sun_path); 1612 break; 1613 } 1614 default: 1615 (void) sprintf(buf, "(unknown af %d)", family); 1616 break; 1617 } 1618 return (buf); 1619 } 1620 1621 /* The logical equivalence operator (a if-and-only-if b) */ 1622 #define EQUIVALENT(a, b) (((a) && (b)) || (!(a) && (!(b)))) 1623 1624 /* 1625 * Verify limitations and invariants on oob state. 1626 * Return 1 if OK, otherwise 0 so that it can be used as 1627 * ASSERT(verify_oobstate(so)); 1628 */ 1629 int 1630 so_verify_oobstate(struct sonode *so) 1631 { 1632 boolean_t havemark; 1633 1634 ASSERT(MUTEX_HELD(&so->so_lock)); 1635 1636 /* 1637 * The possible state combinations are: 1638 * 0 1639 * SS_OOBPEND 1640 * SS_OOBPEND|SS_HAVEOOBDATA 1641 * SS_OOBPEND|SS_HADOOBDATA 1642 * SS_HADOOBDATA 1643 */ 1644 switch (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA)) { 1645 case 0: 1646 case SS_OOBPEND: 1647 case SS_OOBPEND|SS_HAVEOOBDATA: 1648 case SS_OOBPEND|SS_HADOOBDATA: 1649 case SS_HADOOBDATA: 1650 break; 1651 default: 1652 printf("Bad oob state 1 (%p): state %s\n", 1653 (void *)so, pr_state(so->so_state, so->so_mode)); 1654 return (0); 1655 } 1656 1657 /* SS_RCVATMARK should only be set when SS_OOBPEND is set */ 1658 if ((so->so_state & (SS_RCVATMARK|SS_OOBPEND)) == SS_RCVATMARK) { 1659 printf("Bad oob state 2 (%p): state %s\n", 1660 (void *)so, pr_state(so->so_state, so->so_mode)); 1661 return (0); 1662 } 1663 1664 /* 1665 * (havemark != 0 or SS_RCVATMARK) iff SS_OOBPEND 1666 * For TPI, the presence of a "mark" is indicated by sti_oobsigcnt. 1667 */ 1668 havemark = (SOCK_IS_NONSTR(so)) ? so->so_oobmark > 0 : 1669 SOTOTPI(so)->sti_oobsigcnt > 0; 1670 1671 if (!EQUIVALENT(havemark || (so->so_state & SS_RCVATMARK), 1672 so->so_state & SS_OOBPEND)) { 1673 printf("Bad oob state 3 (%p): state %s\n", 1674 (void *)so, pr_state(so->so_state, so->so_mode)); 1675 return (0); 1676 } 1677 1678 /* 1679 * Unless SO_OOBINLINE we have so_oobmsg != NULL iff SS_HAVEOOBDATA 1680 */ 1681 if (!(so->so_options & SO_OOBINLINE) && 1682 !EQUIVALENT(so->so_oobmsg != NULL, so->so_state & SS_HAVEOOBDATA)) { 1683 printf("Bad oob state 4 (%p): state %s\n", 1684 (void *)so, pr_state(so->so_state, so->so_mode)); 1685 return (0); 1686 } 1687 1688 if (!SOCK_IS_NONSTR(so) && 1689 SOTOTPI(so)->sti_oobsigcnt < SOTOTPI(so)->sti_oobcnt) { 1690 printf("Bad oob state 5 (%p): counts %d/%d state %s\n", 1691 (void *)so, SOTOTPI(so)->sti_oobsigcnt, 1692 SOTOTPI(so)->sti_oobcnt, 1693 pr_state(so->so_state, so->so_mode)); 1694 return (0); 1695 } 1696 1697 return (1); 1698 } 1699 #undef EQUIVALENT 1700 #endif /* DEBUG */ 1701 1702 /* initialize sockfs zone specific kstat related items */ 1703 void * 1704 sock_kstat_init(zoneid_t zoneid) 1705 { 1706 kstat_t *ksp; 1707 1708 ksp = kstat_create_zone("sockfs", 0, "sock_unix_list", "misc", 1709 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE|KSTAT_FLAG_VIRTUAL, zoneid); 1710 1711 if (ksp != NULL) { 1712 ksp->ks_update = sockfs_update; 1713 ksp->ks_snapshot = sockfs_snapshot; 1714 ksp->ks_lock = &socklist.sl_lock; 1715 ksp->ks_private = (void *)(uintptr_t)zoneid; 1716 kstat_install(ksp); 1717 } 1718 1719 return (ksp); 1720 } 1721 1722 /* tear down sockfs zone specific kstat related items */ 1723 /*ARGSUSED*/ 1724 void 1725 sock_kstat_fini(zoneid_t zoneid, void *arg) 1726 { 1727 kstat_t *ksp = (kstat_t *)arg; 1728 1729 if (ksp != NULL) { 1730 ASSERT(zoneid == (zoneid_t)(uintptr_t)ksp->ks_private); 1731 kstat_delete(ksp); 1732 } 1733 } 1734 1735 /* 1736 * Zones: 1737 * Note that nactive is going to be different for each zone. 1738 * This means we require kstat to call sockfs_update and then sockfs_snapshot 1739 * for the same zone, or sockfs_snapshot will be taken into the wrong size 1740 * buffer. This is safe, but if the buffer is too small, user will not be 1741 * given details of all sockets. However, as this kstat has a ks_lock, kstat 1742 * driver will keep it locked between the update and the snapshot, so no 1743 * other process (zone) can currently get inbetween resulting in a wrong size 1744 * buffer allocation. 1745 */ 1746 static int 1747 sockfs_update(kstat_t *ksp, int rw) 1748 { 1749 uint_t nactive = 0; /* # of active AF_UNIX sockets */ 1750 struct sonode *so; /* current sonode on socklist */ 1751 zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private; 1752 1753 ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid()); 1754 1755 if (rw == KSTAT_WRITE) { /* bounce all writes */ 1756 return (EACCES); 1757 } 1758 1759 for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) { 1760 if (so->so_count != 0 && so->so_zoneid == myzoneid) { 1761 nactive++; 1762 } 1763 } 1764 ksp->ks_ndata = nactive; 1765 ksp->ks_data_size = nactive * sizeof (struct k_sockinfo); 1766 1767 return (0); 1768 } 1769 1770 static int 1771 sockfs_snapshot(kstat_t *ksp, void *buf, int rw) 1772 { 1773 int ns; /* # of sonodes we've copied */ 1774 struct sonode *so; /* current sonode on socklist */ 1775 struct k_sockinfo *pksi; /* where we put sockinfo data */ 1776 t_uscalar_t sn_len; /* soa_len */ 1777 zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private; 1778 sotpi_info_t *sti; 1779 1780 ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid()); 1781 1782 ksp->ks_snaptime = gethrtime(); 1783 1784 if (rw == KSTAT_WRITE) { /* bounce all writes */ 1785 return (EACCES); 1786 } 1787 1788 /* 1789 * for each sonode on the socklist, we massage the important 1790 * info into buf, in k_sockinfo format. 1791 */ 1792 pksi = (struct k_sockinfo *)buf; 1793 ns = 0; 1794 for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) { 1795 /* only stuff active sonodes and the same zone: */ 1796 if (so->so_count == 0 || so->so_zoneid != myzoneid) { 1797 continue; 1798 } 1799 1800 /* 1801 * If the sonode was activated between the update and the 1802 * snapshot, we're done - as this is only a snapshot. 1803 */ 1804 if ((caddr_t)(pksi) >= (caddr_t)buf + ksp->ks_data_size) { 1805 break; 1806 } 1807 1808 sti = SOTOTPI(so); 1809 /* copy important info into buf: */ 1810 pksi->ks_si.si_size = sizeof (struct k_sockinfo); 1811 pksi->ks_si.si_family = so->so_family; 1812 pksi->ks_si.si_type = so->so_type; 1813 pksi->ks_si.si_flag = so->so_flag; 1814 pksi->ks_si.si_state = so->so_state; 1815 pksi->ks_si.si_serv_type = sti->sti_serv_type; 1816 pksi->ks_si.si_ux_laddr_sou_magic = 1817 sti->sti_ux_laddr.soua_magic; 1818 pksi->ks_si.si_ux_faddr_sou_magic = 1819 sti->sti_ux_faddr.soua_magic; 1820 pksi->ks_si.si_laddr_soa_len = sti->sti_laddr.soa_len; 1821 pksi->ks_si.si_faddr_soa_len = sti->sti_faddr.soa_len; 1822 pksi->ks_si.si_szoneid = so->so_zoneid; 1823 pksi->ks_si.si_faddr_noxlate = sti->sti_faddr_noxlate; 1824 1825 mutex_enter(&so->so_lock); 1826 1827 if (sti->sti_laddr_sa != NULL) { 1828 ASSERT(sti->sti_laddr_sa->sa_data != NULL); 1829 sn_len = sti->sti_laddr_len; 1830 ASSERT(sn_len <= sizeof (short) + 1831 sizeof (pksi->ks_si.si_laddr_sun_path)); 1832 1833 pksi->ks_si.si_laddr_family = 1834 sti->sti_laddr_sa->sa_family; 1835 if (sn_len != 0) { 1836 /* AF_UNIX socket names are NULL terminated */ 1837 (void) strncpy(pksi->ks_si.si_laddr_sun_path, 1838 sti->sti_laddr_sa->sa_data, 1839 sizeof (pksi->ks_si.si_laddr_sun_path)); 1840 sn_len = strlen(pksi->ks_si.si_laddr_sun_path); 1841 } 1842 pksi->ks_si.si_laddr_sun_path[sn_len] = 0; 1843 } 1844 1845 if (sti->sti_faddr_sa != NULL) { 1846 ASSERT(sti->sti_faddr_sa->sa_data != NULL); 1847 sn_len = sti->sti_faddr_len; 1848 ASSERT(sn_len <= sizeof (short) + 1849 sizeof (pksi->ks_si.si_faddr_sun_path)); 1850 1851 pksi->ks_si.si_faddr_family = 1852 sti->sti_faddr_sa->sa_family; 1853 if (sn_len != 0) { 1854 (void) strncpy(pksi->ks_si.si_faddr_sun_path, 1855 sti->sti_faddr_sa->sa_data, 1856 sizeof (pksi->ks_si.si_faddr_sun_path)); 1857 sn_len = strlen(pksi->ks_si.si_faddr_sun_path); 1858 } 1859 pksi->ks_si.si_faddr_sun_path[sn_len] = 0; 1860 } 1861 1862 mutex_exit(&so->so_lock); 1863 1864 (void) sprintf(pksi->ks_straddr[0], "%p", (void *)so); 1865 (void) sprintf(pksi->ks_straddr[1], "%p", 1866 (void *)sti->sti_ux_laddr.soua_vp); 1867 (void) sprintf(pksi->ks_straddr[2], "%p", 1868 (void *)sti->sti_ux_faddr.soua_vp); 1869 1870 ns++; 1871 pksi++; 1872 } 1873 1874 ksp->ks_ndata = ns; 1875 return (0); 1876 } 1877 1878 ssize_t 1879 soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size) 1880 { 1881 struct uio auio; 1882 struct iovec aiov[MSG_MAXIOVLEN]; 1883 register vnode_t *vp; 1884 int ioflag, rwflag; 1885 ssize_t cnt; 1886 int error = 0; 1887 int iovcnt = 0; 1888 short fflag; 1889 1890 vp = fp->f_vnode; 1891 fflag = fp->f_flag; 1892 1893 rwflag = 0; 1894 aiov[0].iov_base = (caddr_t)buf; 1895 aiov[0].iov_len = size; 1896 iovcnt = 1; 1897 cnt = (ssize_t)size; 1898 (void) VOP_RWLOCK(vp, rwflag, NULL); 1899 1900 auio.uio_loffset = fileoff; 1901 auio.uio_iov = aiov; 1902 auio.uio_iovcnt = iovcnt; 1903 auio.uio_resid = cnt; 1904 auio.uio_segflg = UIO_SYSSPACE; 1905 auio.uio_llimit = MAXOFFSET_T; 1906 auio.uio_fmode = fflag; 1907 auio.uio_extflg = UIO_COPY_CACHED; 1908 1909 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1910 1911 /* If read sync is not asked for, filter sync flags */ 1912 if ((ioflag & FRSYNC) == 0) 1913 ioflag &= ~(FSYNC|FDSYNC); 1914 error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL); 1915 cnt -= auio.uio_resid; 1916 1917 VOP_RWUNLOCK(vp, rwflag, NULL); 1918 1919 if (error == EINTR && cnt != 0) 1920 error = 0; 1921 out: 1922 if (error != 0) { 1923 *err = error; 1924 return (0); 1925 } else { 1926 *err = 0; 1927 return (cnt); 1928 } 1929 } 1930 1931 int 1932 so_copyin(const void *from, void *to, size_t size, int fromkernel) 1933 { 1934 if (fromkernel) { 1935 bcopy(from, to, size); 1936 return (0); 1937 } 1938 return (xcopyin(from, to, size)); 1939 } 1940 1941 int 1942 so_copyout(const void *from, void *to, size_t size, int tokernel) 1943 { 1944 if (tokernel) { 1945 bcopy(from, to, size); 1946 return (0); 1947 } 1948 return (xcopyout(from, to, size)); 1949 } 1950