1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/t_lock.h> 29 #include <sys/param.h> 30 #include <sys/systm.h> 31 #include <sys/buf.h> 32 #include <sys/conf.h> 33 #include <sys/cred.h> 34 #include <sys/kmem.h> 35 #include <sys/sysmacros.h> 36 #include <sys/vfs.h> 37 #include <sys/vfs_opreg.h> 38 #include <sys/vnode.h> 39 #include <sys/debug.h> 40 #include <sys/errno.h> 41 #include <sys/time.h> 42 #include <sys/file.h> 43 #include <sys/open.h> 44 #include <sys/user.h> 45 #include <sys/termios.h> 46 #include <sys/stream.h> 47 #include <sys/strsubr.h> 48 #include <sys/strsun.h> 49 #include <sys/esunddi.h> 50 #include <sys/flock.h> 51 #include <sys/modctl.h> 52 #include <sys/cmn_err.h> 53 #include <sys/mkdev.h> 54 #include <sys/pathname.h> 55 #include <sys/ddi.h> 56 #include <sys/stat.h> 57 #include <sys/fs/snode.h> 58 #include <sys/fs/dv_node.h> 59 #include <sys/zone.h> 60 61 #include <sys/socket.h> 62 #include <sys/socketvar.h> 63 #include <netinet/in.h> 64 #include <sys/un.h> 65 66 #include <sys/ucred.h> 67 68 #include <sys/tiuser.h> 69 #define _SUN_TPI_VERSION 2 70 #include <sys/tihdr.h> 71 72 #include <c2/audit.h> 73 74 #include <fs/sockfs/nl7c.h> 75 #include <fs/sockfs/sockcommon.h> 76 #include <fs/sockfs/socktpi.h> 77 #include <fs/sockfs/socktpi_impl.h> 78 #include <fs/sockfs/sodirect.h> 79 80 /* 81 * Macros that operate on struct cmsghdr. 82 * The CMSG_VALID macro does not assume that the last option buffer is padded. 83 */ 84 #define CMSG_CONTENT(cmsg) (&((cmsg)[1])) 85 #define CMSG_CONTENTLEN(cmsg) ((cmsg)->cmsg_len - sizeof (struct cmsghdr)) 86 #define CMSG_VALID(cmsg, start, end) \ 87 (ISALIGNED_cmsghdr(cmsg) && \ 88 ((uintptr_t)(cmsg) >= (uintptr_t)(start)) && \ 89 ((uintptr_t)(cmsg) < (uintptr_t)(end)) && \ 90 ((ssize_t)(cmsg)->cmsg_len >= sizeof (struct cmsghdr)) && \ 91 ((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end))) 92 #define SO_LOCK_WAKEUP_TIME 3000 /* Wakeup time in milliseconds */ 93 94 dev_t sockdev; /* For fsid in getattr */ 95 int sockfs_defer_nl7c_init = 0; 96 97 struct socklist socklist; 98 99 struct kmem_cache *socket_cache; 100 101 static int sockfs_update(kstat_t *, int); 102 static int sockfs_snapshot(kstat_t *, void *, int); 103 extern smod_info_t *sotpi_smod_create(void); 104 105 extern void sendfile_init(); 106 107 extern void nl7c_init(void); 108 109 extern int modrootloaded; 110 111 #define ADRSTRLEN (2 * sizeof (void *) + 1) 112 /* 113 * kernel structure for passing the sockinfo data back up to the user. 114 * the strings array allows us to convert AF_UNIX addresses into strings 115 * with a common method regardless of which n-bit kernel we're running. 116 */ 117 struct k_sockinfo { 118 struct sockinfo ks_si; 119 char ks_straddr[3][ADRSTRLEN]; 120 }; 121 122 /* 123 * Translate from a device pathname (e.g. "/dev/tcp") to a vnode. 124 * Returns with the vnode held. 125 */ 126 int 127 sogetvp(char *devpath, vnode_t **vpp, int uioflag) 128 { 129 struct snode *csp; 130 vnode_t *vp, *dvp; 131 major_t maj; 132 int error; 133 134 ASSERT(uioflag == UIO_SYSSPACE || uioflag == UIO_USERSPACE); 135 136 /* 137 * Lookup the underlying filesystem vnode. 138 */ 139 error = lookupname(devpath, uioflag, FOLLOW, NULLVPP, &vp); 140 if (error) 141 return (error); 142 143 /* Check that it is the correct vnode */ 144 if (vp->v_type != VCHR) { 145 VN_RELE(vp); 146 return (ENOTSOCK); 147 } 148 149 /* 150 * If devpath went through devfs, the device should already 151 * be configured. If devpath is a mknod file, however, we 152 * need to make sure the device is properly configured. 153 * To do this, we do something similar to spec_open() 154 * except that we resolve to the minor/leaf level since 155 * we need to return a vnode. 156 */ 157 csp = VTOS(VTOS(vp)->s_commonvp); 158 if (!(csp->s_flag & SDIPSET)) { 159 char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 160 error = ddi_dev_pathname(vp->v_rdev, S_IFCHR, pathname); 161 if (error == 0) 162 error = devfs_lookupname(pathname, NULLVPP, &dvp); 163 VN_RELE(vp); 164 kmem_free(pathname, MAXPATHLEN); 165 if (error != 0) 166 return (ENXIO); 167 vp = dvp; /* use the devfs vp */ 168 } 169 170 /* device is configured at this point */ 171 maj = getmajor(vp->v_rdev); 172 if (!STREAMSTAB(maj)) { 173 VN_RELE(vp); 174 return (ENOSTR); 175 } 176 177 *vpp = vp; 178 return (0); 179 } 180 181 /* 182 * Update the accessed, updated, or changed times in an sonode 183 * with the current time. 184 * 185 * Note that both SunOS 4.X and 4.4BSD sockets do not present reasonable 186 * attributes in a fstat call. (They return the current time and 0 for 187 * all timestamps, respectively.) We maintain the current timestamps 188 * here primarily so that should sockmod be popped the resulting 189 * file descriptor will behave like a stream w.r.t. the timestamps. 190 */ 191 void 192 so_update_attrs(struct sonode *so, int flag) 193 { 194 time_t now = gethrestime_sec(); 195 196 if (SOCK_IS_NONSTR(so)) 197 return; 198 199 mutex_enter(&so->so_lock); 200 so->so_flag |= flag; 201 if (flag & SOACC) 202 SOTOTPI(so)->sti_atime = now; 203 if (flag & SOMOD) 204 SOTOTPI(so)->sti_mtime = now; 205 mutex_exit(&so->so_lock); 206 } 207 208 extern so_create_func_t sock_comm_create_function; 209 extern so_destroy_func_t sock_comm_destroy_function; 210 /* 211 * Init function called when sockfs is loaded. 212 */ 213 int 214 sockinit(int fstype, char *name) 215 { 216 static const fs_operation_def_t sock_vfsops_template[] = { 217 NULL, NULL 218 }; 219 int error; 220 major_t dev; 221 char *err_str; 222 223 error = vfs_setfsops(fstype, sock_vfsops_template, NULL); 224 if (error != 0) { 225 zcmn_err(GLOBAL_ZONEID, CE_WARN, 226 "sockinit: bad vfs ops template"); 227 return (error); 228 } 229 230 error = vn_make_ops(name, socket_vnodeops_template, 231 &socket_vnodeops); 232 if (error != 0) { 233 err_str = "sockinit: bad socket vnode ops template"; 234 /* vn_make_ops() does not reset socktpi_vnodeops on failure. */ 235 socket_vnodeops = NULL; 236 goto failure; 237 } 238 239 socket_cache = kmem_cache_create("socket_cache", 240 sizeof (struct sonode), 0, sonode_constructor, 241 sonode_destructor, NULL, NULL, NULL, 0); 242 243 error = socktpi_init(); 244 if (error != 0) { 245 err_str = NULL; 246 goto failure; 247 } 248 249 error = sod_init(); 250 if (error != 0) { 251 err_str = NULL; 252 goto failure; 253 } 254 255 /* 256 * Set up the default create and destroy functions 257 */ 258 sock_comm_create_function = socket_sonode_create; 259 sock_comm_destroy_function = socket_sonode_destroy; 260 261 /* 262 * Build initial list mapping socket parameters to vnode. 263 */ 264 smod_init(); 265 smod_add(sotpi_smod_create()); 266 267 sockparams_init(); 268 269 /* 270 * If sockets are needed before init runs /sbin/soconfig 271 * it is possible to preload the sockparams list here using 272 * calls like: 273 * sockconfig(1,2,3, "/dev/tcp", 0); 274 */ 275 276 /* 277 * Create a unique dev_t for use in so_fsid. 278 */ 279 280 if ((dev = getudev()) == (major_t)-1) 281 dev = 0; 282 sockdev = makedevice(dev, 0); 283 284 mutex_init(&socklist.sl_lock, NULL, MUTEX_DEFAULT, NULL); 285 sendfile_init(); 286 if (!modrootloaded) { 287 sockfs_defer_nl7c_init = 1; 288 } else { 289 nl7c_init(); 290 } 291 292 return (0); 293 294 failure: 295 (void) vfs_freevfsops_by_type(fstype); 296 if (socket_vnodeops != NULL) 297 vn_freevnodeops(socket_vnodeops); 298 if (err_str != NULL) 299 zcmn_err(GLOBAL_ZONEID, CE_WARN, err_str); 300 return (error); 301 } 302 303 /* 304 * Caller must hold the mutex. Used to set SOLOCKED. 305 */ 306 void 307 so_lock_single(struct sonode *so) 308 { 309 ASSERT(MUTEX_HELD(&so->so_lock)); 310 311 while (so->so_flag & (SOLOCKED | SOASYNC_UNBIND)) { 312 cv_wait_stop(&so->so_single_cv, &so->so_lock, 313 SO_LOCK_WAKEUP_TIME); 314 } 315 so->so_flag |= SOLOCKED; 316 } 317 318 /* 319 * Caller must hold the mutex and pass in SOLOCKED or SOASYNC_UNBIND. 320 * Used to clear SOLOCKED or SOASYNC_UNBIND. 321 */ 322 void 323 so_unlock_single(struct sonode *so, int flag) 324 { 325 ASSERT(MUTEX_HELD(&so->so_lock)); 326 ASSERT(flag & (SOLOCKED|SOASYNC_UNBIND)); 327 ASSERT((flag & ~(SOLOCKED|SOASYNC_UNBIND)) == 0); 328 ASSERT(so->so_flag & flag); 329 /* 330 * Process the T_DISCON_IND on sti_discon_ind_mp. 331 * 332 * Call to so_drain_discon_ind will result in so_lock 333 * being dropped and re-acquired later. 334 */ 335 if (!SOCK_IS_NONSTR(so)) { 336 sotpi_info_t *sti = SOTOTPI(so); 337 338 if (sti->sti_discon_ind_mp != NULL) 339 so_drain_discon_ind(so); 340 } 341 342 cv_signal(&so->so_single_cv); 343 so->so_flag &= ~flag; 344 } 345 346 /* 347 * Caller must hold the mutex. Used to set SOREADLOCKED. 348 * If the caller wants nonblocking behavior it should set fmode. 349 */ 350 int 351 so_lock_read(struct sonode *so, int fmode) 352 { 353 ASSERT(MUTEX_HELD(&so->so_lock)); 354 355 while (so->so_flag & SOREADLOCKED) { 356 if (fmode & (FNDELAY|FNONBLOCK)) 357 return (EWOULDBLOCK); 358 cv_wait_stop(&so->so_read_cv, &so->so_lock, 359 SO_LOCK_WAKEUP_TIME); 360 } 361 so->so_flag |= SOREADLOCKED; 362 return (0); 363 } 364 365 /* 366 * Like so_lock_read above but allows signals. 367 */ 368 int 369 so_lock_read_intr(struct sonode *so, int fmode) 370 { 371 ASSERT(MUTEX_HELD(&so->so_lock)); 372 373 while (so->so_flag & SOREADLOCKED) { 374 if (fmode & (FNDELAY|FNONBLOCK)) 375 return (EWOULDBLOCK); 376 if (!cv_wait_sig(&so->so_read_cv, &so->so_lock)) 377 return (EINTR); 378 } 379 so->so_flag |= SOREADLOCKED; 380 return (0); 381 } 382 383 /* 384 * Caller must hold the mutex. Used to clear SOREADLOCKED, 385 * set in so_lock_read() or so_lock_read_intr(). 386 */ 387 void 388 so_unlock_read(struct sonode *so) 389 { 390 ASSERT(MUTEX_HELD(&so->so_lock)); 391 ASSERT(so->so_flag & SOREADLOCKED); 392 393 cv_signal(&so->so_read_cv); 394 so->so_flag &= ~SOREADLOCKED; 395 } 396 397 /* 398 * Verify that the specified offset falls within the mblk and 399 * that the resulting pointer is aligned. 400 * Returns NULL if not. 401 */ 402 void * 403 sogetoff(mblk_t *mp, t_uscalar_t offset, 404 t_uscalar_t length, uint_t align_size) 405 { 406 uintptr_t ptr1, ptr2; 407 408 ASSERT(mp && mp->b_wptr >= mp->b_rptr); 409 ptr1 = (uintptr_t)mp->b_rptr + offset; 410 ptr2 = (uintptr_t)ptr1 + length; 411 if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) { 412 eprintline(0); 413 return (NULL); 414 } 415 if ((ptr1 & (align_size - 1)) != 0) { 416 eprintline(0); 417 return (NULL); 418 } 419 return ((void *)ptr1); 420 } 421 422 /* 423 * Return the AF_UNIX underlying filesystem vnode matching a given name. 424 * Makes sure the sending and the destination sonodes are compatible. 425 * The vnode is returned held. 426 * 427 * The underlying filesystem VSOCK vnode has a v_stream pointer that 428 * references the actual stream head (hence indirectly the actual sonode). 429 */ 430 static int 431 so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess, 432 vnode_t **vpp) 433 { 434 vnode_t *vp; /* Underlying filesystem vnode */ 435 vnode_t *rvp; /* real vnode */ 436 vnode_t *svp; /* sockfs vnode */ 437 struct sonode *so2; 438 int error; 439 440 dprintso(so, 1, ("so_ux_lookup(%p) name <%s>\n", (void *)so, 441 soun->sun_path)); 442 443 error = lookupname(soun->sun_path, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); 444 if (error) { 445 eprintsoline(so, error); 446 return (error); 447 } 448 449 /* 450 * Traverse lofs mounts get the real vnode 451 */ 452 if (VOP_REALVP(vp, &rvp, NULL) == 0) { 453 VN_HOLD(rvp); /* hold the real vnode */ 454 VN_RELE(vp); /* release hold from lookup */ 455 vp = rvp; 456 } 457 458 if (vp->v_type != VSOCK) { 459 error = ENOTSOCK; 460 eprintsoline(so, error); 461 goto done2; 462 } 463 464 if (checkaccess) { 465 /* 466 * Check that we have permissions to access the destination 467 * vnode. This check is not done in BSD but it is required 468 * by X/Open. 469 */ 470 if (error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL)) { 471 eprintsoline(so, error); 472 goto done2; 473 } 474 } 475 476 /* 477 * Check if the remote socket has been closed. 478 * 479 * Synchronize with vn_rele_stream by holding v_lock while traversing 480 * v_stream->sd_vnode. 481 */ 482 mutex_enter(&vp->v_lock); 483 if (vp->v_stream == NULL) { 484 mutex_exit(&vp->v_lock); 485 if (so->so_type == SOCK_DGRAM) 486 error = EDESTADDRREQ; 487 else 488 error = ECONNREFUSED; 489 490 eprintsoline(so, error); 491 goto done2; 492 } 493 ASSERT(vp->v_stream->sd_vnode); 494 svp = vp->v_stream->sd_vnode; 495 /* 496 * holding v_lock on underlying filesystem vnode and acquiring 497 * it on sockfs vnode. Assumes that no code ever attempts to 498 * acquire these locks in the reverse order. 499 */ 500 VN_HOLD(svp); 501 mutex_exit(&vp->v_lock); 502 503 if (svp->v_type != VSOCK) { 504 error = ENOTSOCK; 505 eprintsoline(so, error); 506 goto done; 507 } 508 509 so2 = VTOSO(svp); 510 511 if (so->so_type != so2->so_type) { 512 error = EPROTOTYPE; 513 eprintsoline(so, error); 514 goto done; 515 } 516 517 VN_RELE(svp); 518 *vpp = vp; 519 return (0); 520 521 done: 522 VN_RELE(svp); 523 done2: 524 VN_RELE(vp); 525 return (error); 526 } 527 528 /* 529 * Verify peer address for connect and sendto/sendmsg. 530 * Since sendto/sendmsg would not get synchronous errors from the transport 531 * provider we have to do these ugly checks in the socket layer to 532 * preserve compatibility with SunOS 4.X. 533 */ 534 int 535 so_addr_verify(struct sonode *so, const struct sockaddr *name, 536 socklen_t namelen) 537 { 538 int family; 539 540 dprintso(so, 1, ("so_addr_verify(%p, %p, %d)\n", 541 (void *)so, (void *)name, namelen)); 542 543 ASSERT(name != NULL); 544 545 family = so->so_family; 546 switch (family) { 547 case AF_INET: 548 if (name->sa_family != family) { 549 eprintsoline(so, EAFNOSUPPORT); 550 return (EAFNOSUPPORT); 551 } 552 if (namelen != (socklen_t)sizeof (struct sockaddr_in)) { 553 eprintsoline(so, EINVAL); 554 return (EINVAL); 555 } 556 break; 557 case AF_INET6: { 558 #ifdef DEBUG 559 struct sockaddr_in6 *sin6; 560 #endif /* DEBUG */ 561 562 if (name->sa_family != family) { 563 eprintsoline(so, EAFNOSUPPORT); 564 return (EAFNOSUPPORT); 565 } 566 if (namelen != (socklen_t)sizeof (struct sockaddr_in6)) { 567 eprintsoline(so, EINVAL); 568 return (EINVAL); 569 } 570 #ifdef DEBUG 571 /* Verify that apps don't forget to clear sin6_scope_id etc */ 572 sin6 = (struct sockaddr_in6 *)name; 573 if (sin6->sin6_scope_id != 0 && 574 !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) { 575 zcmn_err(getzoneid(), CE_WARN, 576 "connect/send* with uninitialized sin6_scope_id " 577 "(%d) on socket. Pid = %d\n", 578 (int)sin6->sin6_scope_id, (int)curproc->p_pid); 579 } 580 #endif /* DEBUG */ 581 break; 582 } 583 case AF_UNIX: 584 if (SOTOTPI(so)->sti_faddr_noxlate) { 585 return (0); 586 } 587 if (namelen < (socklen_t)sizeof (short)) { 588 eprintsoline(so, ENOENT); 589 return (ENOENT); 590 } 591 if (name->sa_family != family) { 592 eprintsoline(so, EAFNOSUPPORT); 593 return (EAFNOSUPPORT); 594 } 595 /* MAXPATHLEN + soun_family + nul termination */ 596 if (namelen > (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) { 597 eprintsoline(so, ENAMETOOLONG); 598 return (ENAMETOOLONG); 599 } 600 601 break; 602 603 default: 604 /* 605 * Default is don't do any length or sa_family check 606 * to allow non-sockaddr style addresses. 607 */ 608 break; 609 } 610 611 return (0); 612 } 613 614 615 /* 616 * Translate an AF_UNIX sockaddr_un to the transport internal name. 617 * Assumes caller has called so_addr_verify first. 618 */ 619 /*ARGSUSED*/ 620 int 621 so_ux_addr_xlate(struct sonode *so, struct sockaddr *name, 622 socklen_t namelen, int checkaccess, 623 void **addrp, socklen_t *addrlenp) 624 { 625 int error; 626 struct sockaddr_un *soun; 627 vnode_t *vp; 628 void *addr; 629 socklen_t addrlen; 630 sotpi_info_t *sti = SOTOTPI(so); 631 632 dprintso(so, 1, ("so_ux_addr_xlate(%p, %p, %d, %d)\n", 633 (void *)so, (void *)name, namelen, checkaccess)); 634 635 ASSERT(name != NULL); 636 ASSERT(so->so_family == AF_UNIX); 637 ASSERT(!sti->sti_faddr_noxlate); 638 ASSERT(namelen >= (socklen_t)sizeof (short)); 639 ASSERT(name->sa_family == AF_UNIX); 640 soun = (struct sockaddr_un *)name; 641 /* 642 * Lookup vnode for the specified path name and verify that 643 * it is a socket. 644 */ 645 error = so_ux_lookup(so, soun, checkaccess, &vp); 646 if (error) { 647 eprintsoline(so, error); 648 return (error); 649 } 650 /* 651 * Use the address of the peer vnode as the address to send 652 * to. We release the peer vnode here. In case it has been 653 * closed by the time the T_CONN_REQ or T_UNIDATA_REQ reaches the 654 * transport the message will get an error or be dropped. 655 */ 656 sti->sti_ux_faddr.soua_vp = vp; 657 sti->sti_ux_faddr.soua_magic = SOU_MAGIC_EXPLICIT; 658 addr = &sti->sti_ux_faddr; 659 addrlen = (socklen_t)sizeof (sti->sti_ux_faddr); 660 dprintso(so, 1, ("ux_xlate UNIX: addrlen %d, vp %p\n", 661 addrlen, (void *)vp)); 662 VN_RELE(vp); 663 *addrp = addr; 664 *addrlenp = (socklen_t)addrlen; 665 return (0); 666 } 667 668 /* 669 * Esballoc free function for messages that contain SO_FILEP option. 670 * Decrement the reference count on the file pointers using closef. 671 */ 672 void 673 fdbuf_free(struct fdbuf *fdbuf) 674 { 675 int i; 676 struct file *fp; 677 678 dprint(1, ("fdbuf_free: %d fds\n", fdbuf->fd_numfd)); 679 for (i = 0; i < fdbuf->fd_numfd; i++) { 680 /* 681 * We need pointer size alignment for fd_fds. On a LP64 682 * kernel, the required alignment is 8 bytes while 683 * the option headers and values are only 4 bytes 684 * aligned. So its safer to do a bcopy compared to 685 * assigning fdbuf->fd_fds[i] to fp. 686 */ 687 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp)); 688 dprint(1, ("fdbuf_free: [%d] = %p\n", i, (void *)fp)); 689 (void) closef(fp); 690 } 691 if (fdbuf->fd_ebuf != NULL) 692 kmem_free(fdbuf->fd_ebuf, fdbuf->fd_ebuflen); 693 kmem_free(fdbuf, fdbuf->fd_size); 694 } 695 696 /* 697 * Allocate an esballoc'ed message for AF_UNIX file descriptor passing. 698 * Waits if memory is not available. 699 */ 700 mblk_t * 701 fdbuf_allocmsg(int size, struct fdbuf *fdbuf) 702 { 703 uchar_t *buf; 704 mblk_t *mp; 705 706 dprint(1, ("fdbuf_allocmsg: size %d, %d fds\n", size, fdbuf->fd_numfd)); 707 buf = kmem_alloc(size, KM_SLEEP); 708 fdbuf->fd_ebuf = (caddr_t)buf; 709 fdbuf->fd_ebuflen = size; 710 fdbuf->fd_frtn.free_func = fdbuf_free; 711 fdbuf->fd_frtn.free_arg = (caddr_t)fdbuf; 712 713 mp = esballoc_wait(buf, size, BPRI_MED, &fdbuf->fd_frtn); 714 mp->b_datap->db_type = M_PROTO; 715 return (mp); 716 } 717 718 /* 719 * Extract file descriptors from a fdbuf. 720 * Return list in rights/rightslen. 721 */ 722 /*ARGSUSED*/ 723 static int 724 fdbuf_extract(struct fdbuf *fdbuf, void *rights, int rightslen) 725 { 726 int i, fd; 727 int *rp; 728 struct file *fp; 729 int numfd; 730 731 dprint(1, ("fdbuf_extract: %d fds, len %d\n", 732 fdbuf->fd_numfd, rightslen)); 733 734 numfd = fdbuf->fd_numfd; 735 ASSERT(rightslen == numfd * (int)sizeof (int)); 736 737 /* 738 * Allocate a file descriptor and increment the f_count. 739 * The latter is needed since we always call fdbuf_free 740 * which performs a closef. 741 */ 742 rp = (int *)rights; 743 for (i = 0; i < numfd; i++) { 744 if ((fd = ufalloc(0)) == -1) 745 goto cleanup; 746 /* 747 * We need pointer size alignment for fd_fds. On a LP64 748 * kernel, the required alignment is 8 bytes while 749 * the option headers and values are only 4 bytes 750 * aligned. So its safer to do a bcopy compared to 751 * assigning fdbuf->fd_fds[i] to fp. 752 */ 753 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp)); 754 mutex_enter(&fp->f_tlock); 755 fp->f_count++; 756 mutex_exit(&fp->f_tlock); 757 setf(fd, fp); 758 *rp++ = fd; 759 if (audit_active) 760 audit_fdrecv(fd, fp); 761 dprint(1, ("fdbuf_extract: [%d] = %d, %p refcnt %d\n", 762 i, fd, (void *)fp, fp->f_count)); 763 } 764 return (0); 765 766 cleanup: 767 /* 768 * Undo whatever partial work the loop above has done. 769 */ 770 { 771 int j; 772 773 rp = (int *)rights; 774 for (j = 0; j < i; j++) { 775 dprint(0, 776 ("fdbuf_extract: cleanup[%d] = %d\n", j, *rp)); 777 (void) closeandsetf(*rp++, NULL); 778 } 779 } 780 781 return (EMFILE); 782 } 783 784 /* 785 * Insert file descriptors into an fdbuf. 786 * Returns a kmem_alloc'ed fdbuf. The fdbuf should be freed 787 * by calling fdbuf_free(). 788 */ 789 int 790 fdbuf_create(void *rights, int rightslen, struct fdbuf **fdbufp) 791 { 792 int numfd, i; 793 int *fds; 794 struct file *fp; 795 struct fdbuf *fdbuf; 796 int fdbufsize; 797 798 dprint(1, ("fdbuf_create: len %d\n", rightslen)); 799 800 numfd = rightslen / (int)sizeof (int); 801 802 fdbufsize = (int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *)); 803 fdbuf = kmem_alloc(fdbufsize, KM_SLEEP); 804 fdbuf->fd_size = fdbufsize; 805 fdbuf->fd_numfd = 0; 806 fdbuf->fd_ebuf = NULL; 807 fdbuf->fd_ebuflen = 0; 808 fds = (int *)rights; 809 for (i = 0; i < numfd; i++) { 810 if ((fp = getf(fds[i])) == NULL) { 811 fdbuf_free(fdbuf); 812 return (EBADF); 813 } 814 dprint(1, ("fdbuf_create: [%d] = %d, %p refcnt %d\n", 815 i, fds[i], (void *)fp, fp->f_count)); 816 mutex_enter(&fp->f_tlock); 817 fp->f_count++; 818 mutex_exit(&fp->f_tlock); 819 /* 820 * The maximum alignment for fdbuf (or any option header 821 * and its value) it 4 bytes. On a LP64 kernel, the alignment 822 * is not sufficient for pointers (fd_fds in this case). Since 823 * we just did a kmem_alloc (we get a double word alignment), 824 * we don't need to do anything on the send side (we loose 825 * the double word alignment because fdbuf goes after an 826 * option header (eg T_unitdata_req) which is only 4 byte 827 * aligned). We take care of this when we extract the file 828 * descriptor in fdbuf_extract or fdbuf_free. 829 */ 830 fdbuf->fd_fds[i] = fp; 831 fdbuf->fd_numfd++; 832 releasef(fds[i]); 833 if (audit_active) 834 audit_fdsend(fds[i], fp, 0); 835 } 836 *fdbufp = fdbuf; 837 return (0); 838 } 839 840 static int 841 fdbuf_optlen(int rightslen) 842 { 843 int numfd; 844 845 numfd = rightslen / (int)sizeof (int); 846 847 return ((int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *))); 848 } 849 850 static t_uscalar_t 851 fdbuf_cmsglen(int fdbuflen) 852 { 853 return (t_uscalar_t)((fdbuflen - FDBUF_HDRSIZE) / 854 (int)sizeof (struct file *) * (int)sizeof (int)); 855 } 856 857 858 /* 859 * Return non-zero if the mblk and fdbuf are consistent. 860 */ 861 static int 862 fdbuf_verify(mblk_t *mp, struct fdbuf *fdbuf, int fdbuflen) 863 { 864 if (fdbuflen >= FDBUF_HDRSIZE && 865 fdbuflen == fdbuf->fd_size) { 866 frtn_t *frp = mp->b_datap->db_frtnp; 867 /* 868 * Check that the SO_FILEP portion of the 869 * message has not been modified by 870 * the loopback transport. The sending sockfs generates 871 * a message that is esballoc'ed with the free function 872 * being fdbuf_free() and where free_arg contains the 873 * identical information as the SO_FILEP content. 874 * 875 * If any of these constraints are not satisfied we 876 * silently ignore the option. 877 */ 878 ASSERT(mp); 879 if (frp != NULL && 880 frp->free_func == fdbuf_free && 881 frp->free_arg != NULL && 882 bcmp(frp->free_arg, fdbuf, fdbuflen) == 0) { 883 dprint(1, ("fdbuf_verify: fdbuf %p len %d\n", 884 (void *)fdbuf, fdbuflen)); 885 return (1); 886 } else { 887 zcmn_err(getzoneid(), CE_WARN, 888 "sockfs: mismatched fdbuf content (%p)", 889 (void *)mp); 890 return (0); 891 } 892 } else { 893 zcmn_err(getzoneid(), CE_WARN, 894 "sockfs: mismatched fdbuf len %d, %d\n", 895 fdbuflen, fdbuf->fd_size); 896 return (0); 897 } 898 } 899 900 /* 901 * When the file descriptors returned by sorecvmsg can not be passed 902 * to the application this routine will cleanup the references on 903 * the files. Start at startoff bytes into the buffer. 904 */ 905 static void 906 close_fds(void *fdbuf, int fdbuflen, int startoff) 907 { 908 int *fds = (int *)fdbuf; 909 int numfd = fdbuflen / (int)sizeof (int); 910 int i; 911 912 dprint(1, ("close_fds(%p, %d, %d)\n", fdbuf, fdbuflen, startoff)); 913 914 for (i = 0; i < numfd; i++) { 915 if (startoff < 0) 916 startoff = 0; 917 if (startoff < (int)sizeof (int)) { 918 /* 919 * This file descriptor is partially or fully after 920 * the offset 921 */ 922 dprint(0, 923 ("close_fds: cleanup[%d] = %d\n", i, fds[i])); 924 (void) closeandsetf(fds[i], NULL); 925 } 926 startoff -= (int)sizeof (int); 927 } 928 } 929 930 /* 931 * Close all file descriptors contained in the control part starting at 932 * the startoffset. 933 */ 934 void 935 so_closefds(void *control, t_uscalar_t controllen, int oldflg, 936 int startoff) 937 { 938 struct cmsghdr *cmsg; 939 940 if (control == NULL) 941 return; 942 943 if (oldflg) { 944 close_fds(control, controllen, startoff); 945 return; 946 } 947 /* Scan control part for file descriptors. */ 948 for (cmsg = (struct cmsghdr *)control; 949 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 950 cmsg = CMSG_NEXT(cmsg)) { 951 if (cmsg->cmsg_level == SOL_SOCKET && 952 cmsg->cmsg_type == SCM_RIGHTS) { 953 close_fds(CMSG_CONTENT(cmsg), 954 (int)CMSG_CONTENTLEN(cmsg), 955 startoff - (int)sizeof (struct cmsghdr)); 956 } 957 startoff -= cmsg->cmsg_len; 958 } 959 } 960 961 /* 962 * Returns a pointer/length for the file descriptors contained 963 * in the control buffer. Returns with *fdlenp == -1 if there are no 964 * file descriptor options present. This is different than there being 965 * a zero-length file descriptor option. 966 * Fail if there are multiple SCM_RIGHT cmsgs. 967 */ 968 int 969 so_getfdopt(void *control, t_uscalar_t controllen, int oldflg, 970 void **fdsp, int *fdlenp) 971 { 972 struct cmsghdr *cmsg; 973 void *fds; 974 int fdlen; 975 976 if (control == NULL) { 977 *fdsp = NULL; 978 *fdlenp = -1; 979 return (0); 980 } 981 982 if (oldflg) { 983 *fdsp = control; 984 if (controllen == 0) 985 *fdlenp = -1; 986 else 987 *fdlenp = controllen; 988 dprint(1, ("so_getfdopt: old %d\n", *fdlenp)); 989 return (0); 990 } 991 992 fds = NULL; 993 fdlen = 0; 994 995 for (cmsg = (struct cmsghdr *)control; 996 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 997 cmsg = CMSG_NEXT(cmsg)) { 998 if (cmsg->cmsg_level == SOL_SOCKET && 999 cmsg->cmsg_type == SCM_RIGHTS) { 1000 if (fds != NULL) 1001 return (EINVAL); 1002 fds = CMSG_CONTENT(cmsg); 1003 fdlen = (int)CMSG_CONTENTLEN(cmsg); 1004 dprint(1, ("so_getfdopt: new %lu\n", 1005 (size_t)CMSG_CONTENTLEN(cmsg))); 1006 } 1007 } 1008 if (fds == NULL) { 1009 dprint(1, ("so_getfdopt: NONE\n")); 1010 *fdlenp = -1; 1011 } else 1012 *fdlenp = fdlen; 1013 *fdsp = fds; 1014 return (0); 1015 } 1016 1017 /* 1018 * Return the length of the options including any file descriptor options. 1019 */ 1020 t_uscalar_t 1021 so_optlen(void *control, t_uscalar_t controllen, int oldflg) 1022 { 1023 struct cmsghdr *cmsg; 1024 t_uscalar_t optlen = 0; 1025 t_uscalar_t len; 1026 1027 if (control == NULL) 1028 return (0); 1029 1030 if (oldflg) 1031 return ((t_uscalar_t)(sizeof (struct T_opthdr) + 1032 fdbuf_optlen(controllen))); 1033 1034 for (cmsg = (struct cmsghdr *)control; 1035 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 1036 cmsg = CMSG_NEXT(cmsg)) { 1037 if (cmsg->cmsg_level == SOL_SOCKET && 1038 cmsg->cmsg_type == SCM_RIGHTS) { 1039 len = fdbuf_optlen((int)CMSG_CONTENTLEN(cmsg)); 1040 } else { 1041 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg); 1042 } 1043 optlen += (t_uscalar_t)(_TPI_ALIGN_TOPT(len) + 1044 sizeof (struct T_opthdr)); 1045 } 1046 dprint(1, ("so_optlen: controllen %d, flg %d -> optlen %d\n", 1047 controllen, oldflg, optlen)); 1048 return (optlen); 1049 } 1050 1051 /* 1052 * Copy options from control to the mblk. Skip any file descriptor options. 1053 */ 1054 void 1055 so_cmsg2opt(void *control, t_uscalar_t controllen, int oldflg, mblk_t *mp) 1056 { 1057 struct T_opthdr toh; 1058 struct cmsghdr *cmsg; 1059 1060 if (control == NULL) 1061 return; 1062 1063 if (oldflg) { 1064 /* No real options - caller has handled file descriptors */ 1065 return; 1066 } 1067 for (cmsg = (struct cmsghdr *)control; 1068 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 1069 cmsg = CMSG_NEXT(cmsg)) { 1070 /* 1071 * Note: The caller handles file descriptors prior 1072 * to calling this function. 1073 */ 1074 t_uscalar_t len; 1075 1076 if (cmsg->cmsg_level == SOL_SOCKET && 1077 cmsg->cmsg_type == SCM_RIGHTS) 1078 continue; 1079 1080 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg); 1081 toh.level = cmsg->cmsg_level; 1082 toh.name = cmsg->cmsg_type; 1083 toh.len = len + (t_uscalar_t)sizeof (struct T_opthdr); 1084 toh.status = 0; 1085 1086 soappendmsg(mp, &toh, sizeof (toh)); 1087 soappendmsg(mp, CMSG_CONTENT(cmsg), len); 1088 mp->b_wptr += _TPI_ALIGN_TOPT(len) - len; 1089 ASSERT(mp->b_wptr <= mp->b_datap->db_lim); 1090 } 1091 } 1092 1093 /* 1094 * Return the length of the control message derived from the options. 1095 * Exclude SO_SRCADDR and SO_UNIX_CLOSE options. Include SO_FILEP. 1096 * When oldflg is set only include SO_FILEP. 1097 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen 1098 * allocates the space that so_opt2cmsg fills. If one changes, the other should 1099 * also be checked for any possible impacts. 1100 */ 1101 t_uscalar_t 1102 so_cmsglen(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg) 1103 { 1104 t_uscalar_t cmsglen = 0; 1105 struct T_opthdr *tohp; 1106 t_uscalar_t len; 1107 t_uscalar_t last_roundup = 0; 1108 1109 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1110 1111 for (tohp = (struct T_opthdr *)opt; 1112 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1113 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1114 dprint(1, ("so_cmsglen: level 0x%x, name %d, len %d\n", 1115 tohp->level, tohp->name, tohp->len)); 1116 if (tohp->level == SOL_SOCKET && 1117 (tohp->name == SO_SRCADDR || 1118 tohp->name == SO_UNIX_CLOSE)) { 1119 continue; 1120 } 1121 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) { 1122 struct fdbuf *fdbuf; 1123 int fdbuflen; 1124 1125 fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp); 1126 fdbuflen = (int)_TPI_TOPT_DATALEN(tohp); 1127 1128 if (!fdbuf_verify(mp, fdbuf, fdbuflen)) 1129 continue; 1130 if (oldflg) { 1131 cmsglen += fdbuf_cmsglen(fdbuflen); 1132 continue; 1133 } 1134 len = fdbuf_cmsglen(fdbuflen); 1135 } else if (tohp->level == SOL_SOCKET && 1136 tohp->name == SCM_TIMESTAMP) { 1137 if (oldflg) 1138 continue; 1139 1140 if (get_udatamodel() == DATAMODEL_NATIVE) { 1141 len = sizeof (struct timeval); 1142 } else { 1143 len = sizeof (struct timeval32); 1144 } 1145 } else { 1146 if (oldflg) 1147 continue; 1148 len = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp); 1149 } 1150 /* 1151 * Exclude roundup for last option to not set 1152 * MSG_CTRUNC when the cmsg fits but the padding doesn't fit. 1153 */ 1154 last_roundup = (t_uscalar_t) 1155 (ROUNDUP_cmsglen(len + (int)sizeof (struct cmsghdr)) - 1156 (len + (int)sizeof (struct cmsghdr))); 1157 cmsglen += (t_uscalar_t)(len + (int)sizeof (struct cmsghdr)) + 1158 last_roundup; 1159 } 1160 cmsglen -= last_roundup; 1161 dprint(1, ("so_cmsglen: optlen %d, flg %d -> cmsglen %d\n", 1162 optlen, oldflg, cmsglen)); 1163 return (cmsglen); 1164 } 1165 1166 /* 1167 * Copy options from options to the control. Convert SO_FILEP to 1168 * file descriptors. 1169 * Returns errno or zero. 1170 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen 1171 * allocates the space that so_opt2cmsg fills. If one changes, the other should 1172 * also be checked for any possible impacts. 1173 */ 1174 int 1175 so_opt2cmsg(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg, 1176 void *control, t_uscalar_t controllen) 1177 { 1178 struct T_opthdr *tohp; 1179 struct cmsghdr *cmsg; 1180 struct fdbuf *fdbuf; 1181 int fdbuflen; 1182 int error; 1183 #if defined(DEBUG) || defined(__lint) 1184 struct cmsghdr *cend = (struct cmsghdr *) 1185 (((uint8_t *)control) + ROUNDUP_cmsglen(controllen)); 1186 #endif 1187 cmsg = (struct cmsghdr *)control; 1188 1189 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1190 1191 for (tohp = (struct T_opthdr *)opt; 1192 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1193 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1194 dprint(1, ("so_opt2cmsg: level 0x%x, name %d, len %d\n", 1195 tohp->level, tohp->name, tohp->len)); 1196 1197 if (tohp->level == SOL_SOCKET && 1198 (tohp->name == SO_SRCADDR || 1199 tohp->name == SO_UNIX_CLOSE)) { 1200 continue; 1201 } 1202 ASSERT((uintptr_t)cmsg <= (uintptr_t)control + controllen); 1203 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) { 1204 fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp); 1205 fdbuflen = (int)_TPI_TOPT_DATALEN(tohp); 1206 1207 if (!fdbuf_verify(mp, fdbuf, fdbuflen)) 1208 return (EPROTO); 1209 if (oldflg) { 1210 error = fdbuf_extract(fdbuf, control, 1211 (int)controllen); 1212 if (error != 0) 1213 return (error); 1214 continue; 1215 } else { 1216 int fdlen; 1217 1218 fdlen = (int)fdbuf_cmsglen( 1219 (int)_TPI_TOPT_DATALEN(tohp)); 1220 1221 cmsg->cmsg_level = tohp->level; 1222 cmsg->cmsg_type = SCM_RIGHTS; 1223 cmsg->cmsg_len = (socklen_t)(fdlen + 1224 sizeof (struct cmsghdr)); 1225 1226 error = fdbuf_extract(fdbuf, 1227 CMSG_CONTENT(cmsg), fdlen); 1228 if (error != 0) 1229 return (error); 1230 } 1231 } else if (tohp->level == SOL_SOCKET && 1232 tohp->name == SCM_TIMESTAMP) { 1233 timestruc_t *timestamp; 1234 1235 if (oldflg) 1236 continue; 1237 1238 cmsg->cmsg_level = tohp->level; 1239 cmsg->cmsg_type = tohp->name; 1240 1241 timestamp = 1242 (timestruc_t *)P2ROUNDUP((intptr_t)&tohp[1], 1243 sizeof (intptr_t)); 1244 1245 if (get_udatamodel() == DATAMODEL_NATIVE) { 1246 struct timeval tv; 1247 1248 cmsg->cmsg_len = sizeof (struct timeval) + 1249 sizeof (struct cmsghdr); 1250 tv.tv_sec = timestamp->tv_sec; 1251 tv.tv_usec = timestamp->tv_nsec / 1252 (NANOSEC / MICROSEC); 1253 /* 1254 * on LP64 systems, the struct timeval in 1255 * the destination will not be 8-byte aligned, 1256 * so use bcopy to avoid alignment trouble 1257 */ 1258 bcopy(&tv, CMSG_CONTENT(cmsg), sizeof (tv)); 1259 } else { 1260 struct timeval32 *time32; 1261 1262 cmsg->cmsg_len = sizeof (struct timeval32) + 1263 sizeof (struct cmsghdr); 1264 time32 = (struct timeval32 *)CMSG_CONTENT(cmsg); 1265 time32->tv_sec = (time32_t)timestamp->tv_sec; 1266 time32->tv_usec = 1267 (int32_t)(timestamp->tv_nsec / 1268 (NANOSEC / MICROSEC)); 1269 } 1270 1271 } else { 1272 if (oldflg) 1273 continue; 1274 1275 cmsg->cmsg_level = tohp->level; 1276 cmsg->cmsg_type = tohp->name; 1277 cmsg->cmsg_len = (socklen_t)(_TPI_TOPT_DATALEN(tohp) + 1278 sizeof (struct cmsghdr)); 1279 1280 /* copy content to control data part */ 1281 bcopy(&tohp[1], CMSG_CONTENT(cmsg), 1282 CMSG_CONTENTLEN(cmsg)); 1283 } 1284 /* move to next CMSG structure! */ 1285 cmsg = CMSG_NEXT(cmsg); 1286 } 1287 dprint(1, ("so_opt2cmsg: buf %p len %d; cend %p; final cmsg %p\n", 1288 control, controllen, (void *)cend, (void *)cmsg)); 1289 ASSERT(cmsg <= cend); 1290 return (0); 1291 } 1292 1293 /* 1294 * Extract the SO_SRCADDR option value if present. 1295 */ 1296 void 1297 so_getopt_srcaddr(void *opt, t_uscalar_t optlen, void **srcp, 1298 t_uscalar_t *srclenp) 1299 { 1300 struct T_opthdr *tohp; 1301 1302 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1303 1304 ASSERT(srcp != NULL && srclenp != NULL); 1305 *srcp = NULL; 1306 *srclenp = 0; 1307 1308 for (tohp = (struct T_opthdr *)opt; 1309 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1310 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1311 dprint(1, ("so_getopt_srcaddr: level 0x%x, name %d, len %d\n", 1312 tohp->level, tohp->name, tohp->len)); 1313 if (tohp->level == SOL_SOCKET && 1314 tohp->name == SO_SRCADDR) { 1315 *srcp = _TPI_TOPT_DATA(tohp); 1316 *srclenp = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp); 1317 } 1318 } 1319 } 1320 1321 /* 1322 * Verify if the SO_UNIX_CLOSE option is present. 1323 */ 1324 int 1325 so_getopt_unix_close(void *opt, t_uscalar_t optlen) 1326 { 1327 struct T_opthdr *tohp; 1328 1329 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1330 1331 for (tohp = (struct T_opthdr *)opt; 1332 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1333 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1334 dprint(1, 1335 ("so_getopt_unix_close: level 0x%x, name %d, len %d\n", 1336 tohp->level, tohp->name, tohp->len)); 1337 if (tohp->level == SOL_SOCKET && 1338 tohp->name == SO_UNIX_CLOSE) 1339 return (1); 1340 } 1341 return (0); 1342 } 1343 1344 /* 1345 * Allocate an M_PROTO message. 1346 * 1347 * If allocation fails the behavior depends on sleepflg: 1348 * _ALLOC_NOSLEEP fail immediately 1349 * _ALLOC_INTR sleep for memory until a signal is caught 1350 * _ALLOC_SLEEP sleep forever. Don't return NULL. 1351 */ 1352 mblk_t * 1353 soallocproto(size_t size, int sleepflg, cred_t *cr) 1354 { 1355 mblk_t *mp; 1356 1357 /* Round up size for reuse */ 1358 size = MAX(size, 64); 1359 if (cr != NULL) 1360 mp = allocb_cred(size, cr, curproc->p_pid); 1361 else 1362 mp = allocb(size, BPRI_MED); 1363 1364 if (mp == NULL) { 1365 int error; /* Dummy - error not returned to caller */ 1366 1367 switch (sleepflg) { 1368 case _ALLOC_SLEEP: 1369 if (cr != NULL) { 1370 mp = allocb_cred_wait(size, STR_NOSIG, &error, 1371 cr, curproc->p_pid); 1372 } else { 1373 mp = allocb_wait(size, BPRI_MED, STR_NOSIG, 1374 &error); 1375 } 1376 ASSERT(mp); 1377 break; 1378 case _ALLOC_INTR: 1379 if (cr != NULL) { 1380 mp = allocb_cred_wait(size, 0, &error, cr, 1381 curproc->p_pid); 1382 } else { 1383 mp = allocb_wait(size, BPRI_MED, 0, &error); 1384 } 1385 if (mp == NULL) { 1386 /* Caught signal while sleeping for memory */ 1387 eprintline(ENOBUFS); 1388 return (NULL); 1389 } 1390 break; 1391 case _ALLOC_NOSLEEP: 1392 default: 1393 eprintline(ENOBUFS); 1394 return (NULL); 1395 } 1396 } 1397 DB_TYPE(mp) = M_PROTO; 1398 return (mp); 1399 } 1400 1401 /* 1402 * Allocate an M_PROTO message with a single component. 1403 * len is the length of buf. size is the amount to allocate. 1404 * 1405 * buf can be NULL with a non-zero len. 1406 * This results in a bzero'ed chunk being placed the message. 1407 */ 1408 mblk_t * 1409 soallocproto1(const void *buf, ssize_t len, ssize_t size, int sleepflg, 1410 cred_t *cr) 1411 { 1412 mblk_t *mp; 1413 1414 if (size == 0) 1415 size = len; 1416 1417 ASSERT(size >= len); 1418 /* Round up size for reuse */ 1419 size = MAX(size, 64); 1420 mp = soallocproto(size, sleepflg, cr); 1421 if (mp == NULL) 1422 return (NULL); 1423 mp->b_datap->db_type = M_PROTO; 1424 if (len != 0) { 1425 if (buf != NULL) 1426 bcopy(buf, mp->b_wptr, len); 1427 else 1428 bzero(mp->b_wptr, len); 1429 mp->b_wptr += len; 1430 } 1431 return (mp); 1432 } 1433 1434 /* 1435 * Append buf/len to mp. 1436 * The caller has to ensure that there is enough room in the mblk. 1437 * 1438 * buf can be NULL with a non-zero len. 1439 * This results in a bzero'ed chunk being placed the message. 1440 */ 1441 void 1442 soappendmsg(mblk_t *mp, const void *buf, ssize_t len) 1443 { 1444 ASSERT(mp); 1445 1446 if (len != 0) { 1447 /* Assert for room left */ 1448 ASSERT(mp->b_datap->db_lim - mp->b_wptr >= len); 1449 if (buf != NULL) 1450 bcopy(buf, mp->b_wptr, len); 1451 else 1452 bzero(mp->b_wptr, len); 1453 } 1454 mp->b_wptr += len; 1455 } 1456 1457 /* 1458 * Create a message using two kernel buffers. 1459 * If size is set that will determine the allocation size (e.g. for future 1460 * soappendmsg calls). If size is zero it is derived from the buffer 1461 * lengths. 1462 */ 1463 mblk_t * 1464 soallocproto2(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2, 1465 ssize_t size, int sleepflg, cred_t *cr) 1466 { 1467 mblk_t *mp; 1468 1469 if (size == 0) 1470 size = len1 + len2; 1471 ASSERT(size >= len1 + len2); 1472 1473 mp = soallocproto1(buf1, len1, size, sleepflg, cr); 1474 if (mp) 1475 soappendmsg(mp, buf2, len2); 1476 return (mp); 1477 } 1478 1479 /* 1480 * Create a message using three kernel buffers. 1481 * If size is set that will determine the allocation size (for future 1482 * soappendmsg calls). If size is zero it is derived from the buffer 1483 * lengths. 1484 */ 1485 mblk_t * 1486 soallocproto3(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2, 1487 const void *buf3, ssize_t len3, ssize_t size, int sleepflg, cred_t *cr) 1488 { 1489 mblk_t *mp; 1490 1491 if (size == 0) 1492 size = len1 + len2 +len3; 1493 ASSERT(size >= len1 + len2 + len3); 1494 1495 mp = soallocproto1(buf1, len1, size, sleepflg, cr); 1496 if (mp != NULL) { 1497 soappendmsg(mp, buf2, len2); 1498 soappendmsg(mp, buf3, len3); 1499 } 1500 return (mp); 1501 } 1502 1503 #ifdef DEBUG 1504 char * 1505 pr_state(uint_t state, uint_t mode) 1506 { 1507 static char buf[1024]; 1508 1509 buf[0] = 0; 1510 if (state & SS_ISCONNECTED) 1511 (void) strcat(buf, "ISCONNECTED "); 1512 if (state & SS_ISCONNECTING) 1513 (void) strcat(buf, "ISCONNECTING "); 1514 if (state & SS_ISDISCONNECTING) 1515 (void) strcat(buf, "ISDISCONNECTING "); 1516 if (state & SS_CANTSENDMORE) 1517 (void) strcat(buf, "CANTSENDMORE "); 1518 1519 if (state & SS_CANTRCVMORE) 1520 (void) strcat(buf, "CANTRCVMORE "); 1521 if (state & SS_ISBOUND) 1522 (void) strcat(buf, "ISBOUND "); 1523 if (state & SS_NDELAY) 1524 (void) strcat(buf, "NDELAY "); 1525 if (state & SS_NONBLOCK) 1526 (void) strcat(buf, "NONBLOCK "); 1527 1528 if (state & SS_ASYNC) 1529 (void) strcat(buf, "ASYNC "); 1530 if (state & SS_ACCEPTCONN) 1531 (void) strcat(buf, "ACCEPTCONN "); 1532 if (state & SS_SAVEDEOR) 1533 (void) strcat(buf, "SAVEDEOR "); 1534 1535 if (state & SS_RCVATMARK) 1536 (void) strcat(buf, "RCVATMARK "); 1537 if (state & SS_OOBPEND) 1538 (void) strcat(buf, "OOBPEND "); 1539 if (state & SS_HAVEOOBDATA) 1540 (void) strcat(buf, "HAVEOOBDATA "); 1541 if (state & SS_HADOOBDATA) 1542 (void) strcat(buf, "HADOOBDATA "); 1543 1544 if (mode & SM_PRIV) 1545 (void) strcat(buf, "PRIV "); 1546 if (mode & SM_ATOMIC) 1547 (void) strcat(buf, "ATOMIC "); 1548 if (mode & SM_ADDR) 1549 (void) strcat(buf, "ADDR "); 1550 if (mode & SM_CONNREQUIRED) 1551 (void) strcat(buf, "CONNREQUIRED "); 1552 1553 if (mode & SM_FDPASSING) 1554 (void) strcat(buf, "FDPASSING "); 1555 if (mode & SM_EXDATA) 1556 (void) strcat(buf, "EXDATA "); 1557 if (mode & SM_OPTDATA) 1558 (void) strcat(buf, "OPTDATA "); 1559 if (mode & SM_BYTESTREAM) 1560 (void) strcat(buf, "BYTESTREAM "); 1561 return (buf); 1562 } 1563 1564 char * 1565 pr_addr(int family, struct sockaddr *addr, t_uscalar_t addrlen) 1566 { 1567 static char buf[1024]; 1568 1569 if (addr == NULL || addrlen == 0) { 1570 (void) sprintf(buf, "(len %d) %p", addrlen, (void *)addr); 1571 return (buf); 1572 } 1573 switch (family) { 1574 case AF_INET: { 1575 struct sockaddr_in sin; 1576 1577 bcopy(addr, &sin, sizeof (sin)); 1578 1579 (void) sprintf(buf, "(len %d) %x/%d", 1580 addrlen, ntohl(sin.sin_addr.s_addr), ntohs(sin.sin_port)); 1581 break; 1582 } 1583 case AF_INET6: { 1584 struct sockaddr_in6 sin6; 1585 uint16_t *piece = (uint16_t *)&sin6.sin6_addr; 1586 1587 bcopy((char *)addr, (char *)&sin6, sizeof (sin6)); 1588 (void) sprintf(buf, "(len %d) %x:%x:%x:%x:%x:%x:%x:%x/%d", 1589 addrlen, 1590 ntohs(piece[0]), ntohs(piece[1]), 1591 ntohs(piece[2]), ntohs(piece[3]), 1592 ntohs(piece[4]), ntohs(piece[5]), 1593 ntohs(piece[6]), ntohs(piece[7]), 1594 ntohs(sin6.sin6_port)); 1595 break; 1596 } 1597 case AF_UNIX: { 1598 struct sockaddr_un *soun = (struct sockaddr_un *)addr; 1599 1600 (void) sprintf(buf, "(len %d) %s", addrlen, 1601 (soun == NULL) ? "(none)" : soun->sun_path); 1602 break; 1603 } 1604 default: 1605 (void) sprintf(buf, "(unknown af %d)", family); 1606 break; 1607 } 1608 return (buf); 1609 } 1610 1611 /* The logical equivalence operator (a if-and-only-if b) */ 1612 #define EQUIVALENT(a, b) (((a) && (b)) || (!(a) && (!(b)))) 1613 1614 /* 1615 * Verify limitations and invariants on oob state. 1616 * Return 1 if OK, otherwise 0 so that it can be used as 1617 * ASSERT(verify_oobstate(so)); 1618 */ 1619 int 1620 so_verify_oobstate(struct sonode *so) 1621 { 1622 boolean_t havemark; 1623 1624 ASSERT(MUTEX_HELD(&so->so_lock)); 1625 1626 /* 1627 * The possible state combinations are: 1628 * 0 1629 * SS_OOBPEND 1630 * SS_OOBPEND|SS_HAVEOOBDATA 1631 * SS_OOBPEND|SS_HADOOBDATA 1632 * SS_HADOOBDATA 1633 */ 1634 switch (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA)) { 1635 case 0: 1636 case SS_OOBPEND: 1637 case SS_OOBPEND|SS_HAVEOOBDATA: 1638 case SS_OOBPEND|SS_HADOOBDATA: 1639 case SS_HADOOBDATA: 1640 break; 1641 default: 1642 printf("Bad oob state 1 (%p): state %s\n", 1643 (void *)so, pr_state(so->so_state, so->so_mode)); 1644 return (0); 1645 } 1646 1647 /* SS_RCVATMARK should only be set when SS_OOBPEND is set */ 1648 if ((so->so_state & (SS_RCVATMARK|SS_OOBPEND)) == SS_RCVATMARK) { 1649 printf("Bad oob state 2 (%p): state %s\n", 1650 (void *)so, pr_state(so->so_state, so->so_mode)); 1651 return (0); 1652 } 1653 1654 /* 1655 * (havemark != 0 or SS_RCVATMARK) iff SS_OOBPEND 1656 * For TPI, the presence of a "mark" is indicated by sti_oobsigcnt. 1657 */ 1658 havemark = (SOCK_IS_NONSTR(so)) ? so->so_oobmark > 0 : 1659 SOTOTPI(so)->sti_oobsigcnt > 0; 1660 1661 if (!EQUIVALENT(havemark || (so->so_state & SS_RCVATMARK), 1662 so->so_state & SS_OOBPEND)) { 1663 printf("Bad oob state 3 (%p): state %s\n", 1664 (void *)so, pr_state(so->so_state, so->so_mode)); 1665 return (0); 1666 } 1667 1668 /* 1669 * Unless SO_OOBINLINE we have so_oobmsg != NULL iff SS_HAVEOOBDATA 1670 */ 1671 if (!(so->so_options & SO_OOBINLINE) && 1672 !EQUIVALENT(so->so_oobmsg != NULL, so->so_state & SS_HAVEOOBDATA)) { 1673 printf("Bad oob state 4 (%p): state %s\n", 1674 (void *)so, pr_state(so->so_state, so->so_mode)); 1675 return (0); 1676 } 1677 1678 if (!SOCK_IS_NONSTR(so) && 1679 SOTOTPI(so)->sti_oobsigcnt < SOTOTPI(so)->sti_oobcnt) { 1680 printf("Bad oob state 5 (%p): counts %d/%d state %s\n", 1681 (void *)so, SOTOTPI(so)->sti_oobsigcnt, 1682 SOTOTPI(so)->sti_oobcnt, 1683 pr_state(so->so_state, so->so_mode)); 1684 return (0); 1685 } 1686 1687 return (1); 1688 } 1689 #undef EQUIVALENT 1690 #endif /* DEBUG */ 1691 1692 /* initialize sockfs zone specific kstat related items */ 1693 void * 1694 sock_kstat_init(zoneid_t zoneid) 1695 { 1696 kstat_t *ksp; 1697 1698 ksp = kstat_create_zone("sockfs", 0, "sock_unix_list", "misc", 1699 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE|KSTAT_FLAG_VIRTUAL, zoneid); 1700 1701 if (ksp != NULL) { 1702 ksp->ks_update = sockfs_update; 1703 ksp->ks_snapshot = sockfs_snapshot; 1704 ksp->ks_lock = &socklist.sl_lock; 1705 ksp->ks_private = (void *)(uintptr_t)zoneid; 1706 kstat_install(ksp); 1707 } 1708 1709 return (ksp); 1710 } 1711 1712 /* tear down sockfs zone specific kstat related items */ 1713 /*ARGSUSED*/ 1714 void 1715 sock_kstat_fini(zoneid_t zoneid, void *arg) 1716 { 1717 kstat_t *ksp = (kstat_t *)arg; 1718 1719 if (ksp != NULL) { 1720 ASSERT(zoneid == (zoneid_t)(uintptr_t)ksp->ks_private); 1721 kstat_delete(ksp); 1722 } 1723 } 1724 1725 /* 1726 * Zones: 1727 * Note that nactive is going to be different for each zone. 1728 * This means we require kstat to call sockfs_update and then sockfs_snapshot 1729 * for the same zone, or sockfs_snapshot will be taken into the wrong size 1730 * buffer. This is safe, but if the buffer is too small, user will not be 1731 * given details of all sockets. However, as this kstat has a ks_lock, kstat 1732 * driver will keep it locked between the update and the snapshot, so no 1733 * other process (zone) can currently get inbetween resulting in a wrong size 1734 * buffer allocation. 1735 */ 1736 static int 1737 sockfs_update(kstat_t *ksp, int rw) 1738 { 1739 uint_t nactive = 0; /* # of active AF_UNIX sockets */ 1740 struct sonode *so; /* current sonode on socklist */ 1741 zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private; 1742 1743 ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid()); 1744 1745 if (rw == KSTAT_WRITE) { /* bounce all writes */ 1746 return (EACCES); 1747 } 1748 1749 for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) { 1750 if (so->so_count != 0 && so->so_zoneid == myzoneid) { 1751 nactive++; 1752 } 1753 } 1754 ksp->ks_ndata = nactive; 1755 ksp->ks_data_size = nactive * sizeof (struct k_sockinfo); 1756 1757 return (0); 1758 } 1759 1760 static int 1761 sockfs_snapshot(kstat_t *ksp, void *buf, int rw) 1762 { 1763 int ns; /* # of sonodes we've copied */ 1764 struct sonode *so; /* current sonode on socklist */ 1765 struct k_sockinfo *pksi; /* where we put sockinfo data */ 1766 t_uscalar_t sn_len; /* soa_len */ 1767 zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private; 1768 sotpi_info_t *sti; 1769 1770 ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid()); 1771 1772 ksp->ks_snaptime = gethrtime(); 1773 1774 if (rw == KSTAT_WRITE) { /* bounce all writes */ 1775 return (EACCES); 1776 } 1777 1778 /* 1779 * for each sonode on the socklist, we massage the important 1780 * info into buf, in k_sockinfo format. 1781 */ 1782 pksi = (struct k_sockinfo *)buf; 1783 ns = 0; 1784 for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) { 1785 /* only stuff active sonodes and the same zone: */ 1786 if (so->so_count == 0 || so->so_zoneid != myzoneid) { 1787 continue; 1788 } 1789 1790 /* 1791 * If the sonode was activated between the update and the 1792 * snapshot, we're done - as this is only a snapshot. 1793 */ 1794 if ((caddr_t)(pksi) >= (caddr_t)buf + ksp->ks_data_size) { 1795 break; 1796 } 1797 1798 sti = SOTOTPI(so); 1799 /* copy important info into buf: */ 1800 pksi->ks_si.si_size = sizeof (struct k_sockinfo); 1801 pksi->ks_si.si_family = so->so_family; 1802 pksi->ks_si.si_type = so->so_type; 1803 pksi->ks_si.si_flag = so->so_flag; 1804 pksi->ks_si.si_state = so->so_state; 1805 pksi->ks_si.si_serv_type = sti->sti_serv_type; 1806 pksi->ks_si.si_ux_laddr_sou_magic = 1807 sti->sti_ux_laddr.soua_magic; 1808 pksi->ks_si.si_ux_faddr_sou_magic = 1809 sti->sti_ux_faddr.soua_magic; 1810 pksi->ks_si.si_laddr_soa_len = sti->sti_laddr.soa_len; 1811 pksi->ks_si.si_faddr_soa_len = sti->sti_faddr.soa_len; 1812 pksi->ks_si.si_szoneid = so->so_zoneid; 1813 pksi->ks_si.si_faddr_noxlate = sti->sti_faddr_noxlate; 1814 1815 mutex_enter(&so->so_lock); 1816 1817 if (sti->sti_laddr_sa != NULL) { 1818 ASSERT(sti->sti_laddr_sa->sa_data != NULL); 1819 sn_len = sti->sti_laddr_len; 1820 ASSERT(sn_len <= sizeof (short) + 1821 sizeof (pksi->ks_si.si_laddr_sun_path)); 1822 1823 pksi->ks_si.si_laddr_family = 1824 sti->sti_laddr_sa->sa_family; 1825 if (sn_len != 0) { 1826 /* AF_UNIX socket names are NULL terminated */ 1827 (void) strncpy(pksi->ks_si.si_laddr_sun_path, 1828 sti->sti_laddr_sa->sa_data, 1829 sizeof (pksi->ks_si.si_laddr_sun_path)); 1830 sn_len = strlen(pksi->ks_si.si_laddr_sun_path); 1831 } 1832 pksi->ks_si.si_laddr_sun_path[sn_len] = 0; 1833 } 1834 1835 if (sti->sti_faddr_sa != NULL) { 1836 ASSERT(sti->sti_faddr_sa->sa_data != NULL); 1837 sn_len = sti->sti_faddr_len; 1838 ASSERT(sn_len <= sizeof (short) + 1839 sizeof (pksi->ks_si.si_faddr_sun_path)); 1840 1841 pksi->ks_si.si_faddr_family = 1842 sti->sti_faddr_sa->sa_family; 1843 if (sn_len != 0) { 1844 (void) strncpy(pksi->ks_si.si_faddr_sun_path, 1845 sti->sti_faddr_sa->sa_data, 1846 sizeof (pksi->ks_si.si_faddr_sun_path)); 1847 sn_len = strlen(pksi->ks_si.si_faddr_sun_path); 1848 } 1849 pksi->ks_si.si_faddr_sun_path[sn_len] = 0; 1850 } 1851 1852 mutex_exit(&so->so_lock); 1853 1854 (void) sprintf(pksi->ks_straddr[0], "%p", (void *)so); 1855 (void) sprintf(pksi->ks_straddr[1], "%p", 1856 (void *)sti->sti_ux_laddr.soua_vp); 1857 (void) sprintf(pksi->ks_straddr[2], "%p", 1858 (void *)sti->sti_ux_faddr.soua_vp); 1859 1860 ns++; 1861 pksi++; 1862 } 1863 1864 ksp->ks_ndata = ns; 1865 return (0); 1866 } 1867 1868 ssize_t 1869 soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size) 1870 { 1871 struct uio auio; 1872 struct iovec aiov[MSG_MAXIOVLEN]; 1873 register vnode_t *vp; 1874 int ioflag, rwflag; 1875 ssize_t cnt; 1876 int error = 0; 1877 int iovcnt = 0; 1878 short fflag; 1879 1880 vp = fp->f_vnode; 1881 fflag = fp->f_flag; 1882 1883 rwflag = 0; 1884 aiov[0].iov_base = (caddr_t)buf; 1885 aiov[0].iov_len = size; 1886 iovcnt = 1; 1887 cnt = (ssize_t)size; 1888 (void) VOP_RWLOCK(vp, rwflag, NULL); 1889 1890 auio.uio_loffset = fileoff; 1891 auio.uio_iov = aiov; 1892 auio.uio_iovcnt = iovcnt; 1893 auio.uio_resid = cnt; 1894 auio.uio_segflg = UIO_SYSSPACE; 1895 auio.uio_llimit = MAXOFFSET_T; 1896 auio.uio_fmode = fflag; 1897 auio.uio_extflg = UIO_COPY_CACHED; 1898 1899 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1900 1901 /* If read sync is not asked for, filter sync flags */ 1902 if ((ioflag & FRSYNC) == 0) 1903 ioflag &= ~(FSYNC|FDSYNC); 1904 error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL); 1905 cnt -= auio.uio_resid; 1906 1907 VOP_RWUNLOCK(vp, rwflag, NULL); 1908 1909 if (error == EINTR && cnt != 0) 1910 error = 0; 1911 out: 1912 if (error != 0) { 1913 *err = error; 1914 return (0); 1915 } else { 1916 *err = 0; 1917 return (cnt); 1918 } 1919 } 1920 1921 int 1922 so_copyin(const void *from, void *to, size_t size, int fromkernel) 1923 { 1924 if (fromkernel) { 1925 bcopy(from, to, size); 1926 return (0); 1927 } 1928 return (xcopyin(from, to, size)); 1929 } 1930 1931 int 1932 so_copyout(const void *from, void *to, size_t size, int tokernel) 1933 { 1934 if (tokernel) { 1935 bcopy(from, to, size); 1936 return (0); 1937 } 1938 return (xcopyout(from, to, size)); 1939 } 1940