1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/t_lock.h> 29 #include <sys/param.h> 30 #include <sys/systm.h> 31 #include <sys/buf.h> 32 #include <sys/conf.h> 33 #include <sys/cred.h> 34 #include <sys/kmem.h> 35 #include <sys/sysmacros.h> 36 #include <sys/vfs.h> 37 #include <sys/vfs_opreg.h> 38 #include <sys/vnode.h> 39 #include <sys/debug.h> 40 #include <sys/errno.h> 41 #include <sys/time.h> 42 #include <sys/file.h> 43 #include <sys/open.h> 44 #include <sys/user.h> 45 #include <sys/termios.h> 46 #include <sys/stream.h> 47 #include <sys/strsubr.h> 48 #include <sys/strsun.h> 49 #include <sys/esunddi.h> 50 #include <sys/flock.h> 51 #include <sys/modctl.h> 52 #include <sys/cmn_err.h> 53 #include <sys/mkdev.h> 54 #include <sys/pathname.h> 55 #include <sys/ddi.h> 56 #include <sys/stat.h> 57 #include <sys/fs/snode.h> 58 #include <sys/fs/dv_node.h> 59 #include <sys/zone.h> 60 61 #include <sys/socket.h> 62 #include <sys/socketvar.h> 63 #include <netinet/in.h> 64 #include <sys/un.h> 65 #include <sys/ucred.h> 66 67 #include <sys/tiuser.h> 68 #define _SUN_TPI_VERSION 2 69 #include <sys/tihdr.h> 70 71 #include <c2/audit.h> 72 73 #include <fs/sockfs/nl7c.h> 74 #include <fs/sockfs/sockcommon.h> 75 #include <fs/sockfs/sockfilter_impl.h> 76 #include <fs/sockfs/socktpi.h> 77 #include <fs/sockfs/socktpi_impl.h> 78 #include <fs/sockfs/sodirect.h> 79 80 /* 81 * Macros that operate on struct cmsghdr. 82 * The CMSG_VALID macro does not assume that the last option buffer is padded. 83 */ 84 #define CMSG_CONTENT(cmsg) (&((cmsg)[1])) 85 #define CMSG_CONTENTLEN(cmsg) ((cmsg)->cmsg_len - sizeof (struct cmsghdr)) 86 #define CMSG_VALID(cmsg, start, end) \ 87 (ISALIGNED_cmsghdr(cmsg) && \ 88 ((uintptr_t)(cmsg) >= (uintptr_t)(start)) && \ 89 ((uintptr_t)(cmsg) < (uintptr_t)(end)) && \ 90 ((ssize_t)(cmsg)->cmsg_len >= sizeof (struct cmsghdr)) && \ 91 ((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end))) 92 #define SO_LOCK_WAKEUP_TIME 3000 /* Wakeup time in milliseconds */ 93 94 dev_t sockdev; /* For fsid in getattr */ 95 int sockfs_defer_nl7c_init = 0; 96 97 struct socklist socklist; 98 99 struct kmem_cache *socket_cache; 100 101 /* 102 * sockconf_lock protects the socket configuration (socket types and 103 * socket filters) which is changed via the sockconfig system call. 104 */ 105 krwlock_t sockconf_lock; 106 107 static int sockfs_update(kstat_t *, int); 108 static int sockfs_snapshot(kstat_t *, void *, int); 109 extern smod_info_t *sotpi_smod_create(void); 110 111 extern void sendfile_init(); 112 113 extern void nl7c_init(void); 114 115 extern int modrootloaded; 116 117 #define ADRSTRLEN (2 * sizeof (void *) + 1) 118 /* 119 * kernel structure for passing the sockinfo data back up to the user. 120 * the strings array allows us to convert AF_UNIX addresses into strings 121 * with a common method regardless of which n-bit kernel we're running. 122 */ 123 struct k_sockinfo { 124 struct sockinfo ks_si; 125 char ks_straddr[3][ADRSTRLEN]; 126 }; 127 128 /* 129 * Translate from a device pathname (e.g. "/dev/tcp") to a vnode. 130 * Returns with the vnode held. 131 */ 132 int 133 sogetvp(char *devpath, vnode_t **vpp, int uioflag) 134 { 135 struct snode *csp; 136 vnode_t *vp, *dvp; 137 major_t maj; 138 int error; 139 140 ASSERT(uioflag == UIO_SYSSPACE || uioflag == UIO_USERSPACE); 141 142 /* 143 * Lookup the underlying filesystem vnode. 144 */ 145 error = lookupname(devpath, uioflag, FOLLOW, NULLVPP, &vp); 146 if (error) 147 return (error); 148 149 /* Check that it is the correct vnode */ 150 if (vp->v_type != VCHR) { 151 VN_RELE(vp); 152 return (ENOTSOCK); 153 } 154 155 /* 156 * If devpath went through devfs, the device should already 157 * be configured. If devpath is a mknod file, however, we 158 * need to make sure the device is properly configured. 159 * To do this, we do something similar to spec_open() 160 * except that we resolve to the minor/leaf level since 161 * we need to return a vnode. 162 */ 163 csp = VTOS(VTOS(vp)->s_commonvp); 164 if (!(csp->s_flag & SDIPSET)) { 165 char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 166 error = ddi_dev_pathname(vp->v_rdev, S_IFCHR, pathname); 167 if (error == 0) 168 error = devfs_lookupname(pathname, NULLVPP, &dvp); 169 VN_RELE(vp); 170 kmem_free(pathname, MAXPATHLEN); 171 if (error != 0) 172 return (ENXIO); 173 vp = dvp; /* use the devfs vp */ 174 } 175 176 /* device is configured at this point */ 177 maj = getmajor(vp->v_rdev); 178 if (!STREAMSTAB(maj)) { 179 VN_RELE(vp); 180 return (ENOSTR); 181 } 182 183 *vpp = vp; 184 return (0); 185 } 186 187 /* 188 * Update the accessed, updated, or changed times in an sonode 189 * with the current time. 190 * 191 * Note that both SunOS 4.X and 4.4BSD sockets do not present reasonable 192 * attributes in a fstat call. (They return the current time and 0 for 193 * all timestamps, respectively.) We maintain the current timestamps 194 * here primarily so that should sockmod be popped the resulting 195 * file descriptor will behave like a stream w.r.t. the timestamps. 196 */ 197 void 198 so_update_attrs(struct sonode *so, int flag) 199 { 200 time_t now = gethrestime_sec(); 201 202 if (SOCK_IS_NONSTR(so)) 203 return; 204 205 mutex_enter(&so->so_lock); 206 so->so_flag |= flag; 207 if (flag & SOACC) 208 SOTOTPI(so)->sti_atime = now; 209 if (flag & SOMOD) 210 SOTOTPI(so)->sti_mtime = now; 211 mutex_exit(&so->so_lock); 212 } 213 214 extern so_create_func_t sock_comm_create_function; 215 extern so_destroy_func_t sock_comm_destroy_function; 216 /* 217 * Init function called when sockfs is loaded. 218 */ 219 int 220 sockinit(int fstype, char *name) 221 { 222 static const fs_operation_def_t sock_vfsops_template[] = { 223 NULL, NULL 224 }; 225 int error; 226 major_t dev; 227 char *err_str; 228 229 error = vfs_setfsops(fstype, sock_vfsops_template, NULL); 230 if (error != 0) { 231 zcmn_err(GLOBAL_ZONEID, CE_WARN, 232 "sockinit: bad vfs ops template"); 233 return (error); 234 } 235 236 error = vn_make_ops(name, socket_vnodeops_template, 237 &socket_vnodeops); 238 if (error != 0) { 239 err_str = "sockinit: bad socket vnode ops template"; 240 /* vn_make_ops() does not reset socktpi_vnodeops on failure. */ 241 socket_vnodeops = NULL; 242 goto failure; 243 } 244 245 socket_cache = kmem_cache_create("socket_cache", 246 sizeof (struct sonode), 0, sonode_constructor, 247 sonode_destructor, NULL, NULL, NULL, 0); 248 249 rw_init(&sockconf_lock, NULL, RW_DEFAULT, NULL); 250 251 error = socktpi_init(); 252 if (error != 0) { 253 err_str = NULL; 254 goto failure; 255 } 256 257 error = sod_init(); 258 if (error != 0) { 259 err_str = NULL; 260 goto failure; 261 } 262 263 /* 264 * Set up the default create and destroy functions 265 */ 266 sock_comm_create_function = socket_sonode_create; 267 sock_comm_destroy_function = socket_sonode_destroy; 268 269 /* 270 * Build initial list mapping socket parameters to vnode. 271 */ 272 smod_init(); 273 smod_add(sotpi_smod_create()); 274 275 sockparams_init(); 276 277 /* 278 * If sockets are needed before init runs /sbin/soconfig 279 * it is possible to preload the sockparams list here using 280 * calls like: 281 * sockconfig(1,2,3, "/dev/tcp", 0); 282 */ 283 284 /* 285 * Create a unique dev_t for use in so_fsid. 286 */ 287 288 if ((dev = getudev()) == (major_t)-1) 289 dev = 0; 290 sockdev = makedevice(dev, 0); 291 292 mutex_init(&socklist.sl_lock, NULL, MUTEX_DEFAULT, NULL); 293 sendfile_init(); 294 if (!modrootloaded) { 295 sockfs_defer_nl7c_init = 1; 296 } else { 297 nl7c_init(); 298 } 299 300 /* Initialize socket filters */ 301 sof_init(); 302 303 return (0); 304 305 failure: 306 (void) vfs_freevfsops_by_type(fstype); 307 if (socket_vnodeops != NULL) 308 vn_freevnodeops(socket_vnodeops); 309 if (err_str != NULL) 310 zcmn_err(GLOBAL_ZONEID, CE_WARN, err_str); 311 return (error); 312 } 313 314 /* 315 * Caller must hold the mutex. Used to set SOLOCKED. 316 */ 317 void 318 so_lock_single(struct sonode *so) 319 { 320 ASSERT(MUTEX_HELD(&so->so_lock)); 321 322 while (so->so_flag & (SOLOCKED | SOASYNC_UNBIND)) { 323 cv_wait_stop(&so->so_single_cv, &so->so_lock, 324 SO_LOCK_WAKEUP_TIME); 325 } 326 so->so_flag |= SOLOCKED; 327 } 328 329 /* 330 * Caller must hold the mutex and pass in SOLOCKED or SOASYNC_UNBIND. 331 * Used to clear SOLOCKED or SOASYNC_UNBIND. 332 */ 333 void 334 so_unlock_single(struct sonode *so, int flag) 335 { 336 ASSERT(MUTEX_HELD(&so->so_lock)); 337 ASSERT(flag & (SOLOCKED|SOASYNC_UNBIND)); 338 ASSERT((flag & ~(SOLOCKED|SOASYNC_UNBIND)) == 0); 339 ASSERT(so->so_flag & flag); 340 /* 341 * Process the T_DISCON_IND on sti_discon_ind_mp. 342 * 343 * Call to so_drain_discon_ind will result in so_lock 344 * being dropped and re-acquired later. 345 */ 346 if (!SOCK_IS_NONSTR(so)) { 347 sotpi_info_t *sti = SOTOTPI(so); 348 349 if (sti->sti_discon_ind_mp != NULL) 350 so_drain_discon_ind(so); 351 } 352 353 cv_signal(&so->so_single_cv); 354 so->so_flag &= ~flag; 355 } 356 357 /* 358 * Caller must hold the mutex. Used to set SOREADLOCKED. 359 * If the caller wants nonblocking behavior it should set fmode. 360 */ 361 int 362 so_lock_read(struct sonode *so, int fmode) 363 { 364 ASSERT(MUTEX_HELD(&so->so_lock)); 365 366 while (so->so_flag & SOREADLOCKED) { 367 if (fmode & (FNDELAY|FNONBLOCK)) 368 return (EWOULDBLOCK); 369 cv_wait_stop(&so->so_read_cv, &so->so_lock, 370 SO_LOCK_WAKEUP_TIME); 371 } 372 so->so_flag |= SOREADLOCKED; 373 return (0); 374 } 375 376 /* 377 * Like so_lock_read above but allows signals. 378 */ 379 int 380 so_lock_read_intr(struct sonode *so, int fmode) 381 { 382 ASSERT(MUTEX_HELD(&so->so_lock)); 383 384 while (so->so_flag & SOREADLOCKED) { 385 if (fmode & (FNDELAY|FNONBLOCK)) 386 return (EWOULDBLOCK); 387 if (!cv_wait_sig(&so->so_read_cv, &so->so_lock)) 388 return (EINTR); 389 } 390 so->so_flag |= SOREADLOCKED; 391 return (0); 392 } 393 394 /* 395 * Caller must hold the mutex. Used to clear SOREADLOCKED, 396 * set in so_lock_read() or so_lock_read_intr(). 397 */ 398 void 399 so_unlock_read(struct sonode *so) 400 { 401 ASSERT(MUTEX_HELD(&so->so_lock)); 402 ASSERT(so->so_flag & SOREADLOCKED); 403 404 cv_signal(&so->so_read_cv); 405 so->so_flag &= ~SOREADLOCKED; 406 } 407 408 /* 409 * Verify that the specified offset falls within the mblk and 410 * that the resulting pointer is aligned. 411 * Returns NULL if not. 412 */ 413 void * 414 sogetoff(mblk_t *mp, t_uscalar_t offset, 415 t_uscalar_t length, uint_t align_size) 416 { 417 uintptr_t ptr1, ptr2; 418 419 ASSERT(mp && mp->b_wptr >= mp->b_rptr); 420 ptr1 = (uintptr_t)mp->b_rptr + offset; 421 ptr2 = (uintptr_t)ptr1 + length; 422 if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) { 423 eprintline(0); 424 return (NULL); 425 } 426 if ((ptr1 & (align_size - 1)) != 0) { 427 eprintline(0); 428 return (NULL); 429 } 430 return ((void *)ptr1); 431 } 432 433 /* 434 * Return the AF_UNIX underlying filesystem vnode matching a given name. 435 * Makes sure the sending and the destination sonodes are compatible. 436 * The vnode is returned held. 437 * 438 * The underlying filesystem VSOCK vnode has a v_stream pointer that 439 * references the actual stream head (hence indirectly the actual sonode). 440 */ 441 static int 442 so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess, 443 vnode_t **vpp) 444 { 445 vnode_t *vp; /* Underlying filesystem vnode */ 446 vnode_t *rvp; /* real vnode */ 447 vnode_t *svp; /* sockfs vnode */ 448 struct sonode *so2; 449 int error; 450 451 dprintso(so, 1, ("so_ux_lookup(%p) name <%s>\n", (void *)so, 452 soun->sun_path)); 453 454 error = lookupname(soun->sun_path, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); 455 if (error) { 456 eprintsoline(so, error); 457 return (error); 458 } 459 460 /* 461 * Traverse lofs mounts get the real vnode 462 */ 463 if (VOP_REALVP(vp, &rvp, NULL) == 0) { 464 VN_HOLD(rvp); /* hold the real vnode */ 465 VN_RELE(vp); /* release hold from lookup */ 466 vp = rvp; 467 } 468 469 if (vp->v_type != VSOCK) { 470 error = ENOTSOCK; 471 eprintsoline(so, error); 472 goto done2; 473 } 474 475 if (checkaccess) { 476 /* 477 * Check that we have permissions to access the destination 478 * vnode. This check is not done in BSD but it is required 479 * by X/Open. 480 */ 481 if (error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL)) { 482 eprintsoline(so, error); 483 goto done2; 484 } 485 } 486 487 /* 488 * Check if the remote socket has been closed. 489 * 490 * Synchronize with vn_rele_stream by holding v_lock while traversing 491 * v_stream->sd_vnode. 492 */ 493 mutex_enter(&vp->v_lock); 494 if (vp->v_stream == NULL) { 495 mutex_exit(&vp->v_lock); 496 if (so->so_type == SOCK_DGRAM) 497 error = EDESTADDRREQ; 498 else 499 error = ECONNREFUSED; 500 501 eprintsoline(so, error); 502 goto done2; 503 } 504 ASSERT(vp->v_stream->sd_vnode); 505 svp = vp->v_stream->sd_vnode; 506 /* 507 * holding v_lock on underlying filesystem vnode and acquiring 508 * it on sockfs vnode. Assumes that no code ever attempts to 509 * acquire these locks in the reverse order. 510 */ 511 VN_HOLD(svp); 512 mutex_exit(&vp->v_lock); 513 514 if (svp->v_type != VSOCK) { 515 error = ENOTSOCK; 516 eprintsoline(so, error); 517 goto done; 518 } 519 520 so2 = VTOSO(svp); 521 522 if (so->so_type != so2->so_type) { 523 error = EPROTOTYPE; 524 eprintsoline(so, error); 525 goto done; 526 } 527 528 VN_RELE(svp); 529 *vpp = vp; 530 return (0); 531 532 done: 533 VN_RELE(svp); 534 done2: 535 VN_RELE(vp); 536 return (error); 537 } 538 539 /* 540 * Verify peer address for connect and sendto/sendmsg. 541 * Since sendto/sendmsg would not get synchronous errors from the transport 542 * provider we have to do these ugly checks in the socket layer to 543 * preserve compatibility with SunOS 4.X. 544 */ 545 int 546 so_addr_verify(struct sonode *so, const struct sockaddr *name, 547 socklen_t namelen) 548 { 549 int family; 550 551 dprintso(so, 1, ("so_addr_verify(%p, %p, %d)\n", 552 (void *)so, (void *)name, namelen)); 553 554 ASSERT(name != NULL); 555 556 family = so->so_family; 557 switch (family) { 558 case AF_INET: 559 if (name->sa_family != family) { 560 eprintsoline(so, EAFNOSUPPORT); 561 return (EAFNOSUPPORT); 562 } 563 if (namelen != (socklen_t)sizeof (struct sockaddr_in)) { 564 eprintsoline(so, EINVAL); 565 return (EINVAL); 566 } 567 break; 568 case AF_INET6: { 569 #ifdef DEBUG 570 struct sockaddr_in6 *sin6; 571 #endif /* DEBUG */ 572 573 if (name->sa_family != family) { 574 eprintsoline(so, EAFNOSUPPORT); 575 return (EAFNOSUPPORT); 576 } 577 if (namelen != (socklen_t)sizeof (struct sockaddr_in6)) { 578 eprintsoline(so, EINVAL); 579 return (EINVAL); 580 } 581 #ifdef DEBUG 582 /* Verify that apps don't forget to clear sin6_scope_id etc */ 583 sin6 = (struct sockaddr_in6 *)name; 584 if (sin6->sin6_scope_id != 0 && 585 !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) { 586 zcmn_err(getzoneid(), CE_WARN, 587 "connect/send* with uninitialized sin6_scope_id " 588 "(%d) on socket. Pid = %d\n", 589 (int)sin6->sin6_scope_id, (int)curproc->p_pid); 590 } 591 #endif /* DEBUG */ 592 break; 593 } 594 case AF_UNIX: 595 if (SOTOTPI(so)->sti_faddr_noxlate) { 596 return (0); 597 } 598 if (namelen < (socklen_t)sizeof (short)) { 599 eprintsoline(so, ENOENT); 600 return (ENOENT); 601 } 602 if (name->sa_family != family) { 603 eprintsoline(so, EAFNOSUPPORT); 604 return (EAFNOSUPPORT); 605 } 606 /* MAXPATHLEN + soun_family + nul termination */ 607 if (namelen > (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) { 608 eprintsoline(so, ENAMETOOLONG); 609 return (ENAMETOOLONG); 610 } 611 612 break; 613 614 default: 615 /* 616 * Default is don't do any length or sa_family check 617 * to allow non-sockaddr style addresses. 618 */ 619 break; 620 } 621 622 return (0); 623 } 624 625 626 /* 627 * Translate an AF_UNIX sockaddr_un to the transport internal name. 628 * Assumes caller has called so_addr_verify first. The translated 629 * (internal form) address is stored in sti->sti_ux_taddr. 630 */ 631 /*ARGSUSED*/ 632 int 633 so_ux_addr_xlate(struct sonode *so, struct sockaddr *name, 634 socklen_t namelen, int checkaccess, 635 void **addrp, socklen_t *addrlenp) 636 { 637 int error; 638 struct sockaddr_un *soun; 639 vnode_t *vp; 640 void *addr; 641 socklen_t addrlen; 642 sotpi_info_t *sti = SOTOTPI(so); 643 644 dprintso(so, 1, ("so_ux_addr_xlate(%p, %p, %d, %d)\n", 645 (void *)so, (void *)name, namelen, checkaccess)); 646 647 ASSERT(name != NULL); 648 ASSERT(so->so_family == AF_UNIX); 649 ASSERT(!sti->sti_faddr_noxlate); 650 ASSERT(namelen >= (socklen_t)sizeof (short)); 651 ASSERT(name->sa_family == AF_UNIX); 652 soun = (struct sockaddr_un *)name; 653 /* 654 * Lookup vnode for the specified path name and verify that 655 * it is a socket. 656 */ 657 error = so_ux_lookup(so, soun, checkaccess, &vp); 658 if (error) { 659 eprintsoline(so, error); 660 return (error); 661 } 662 /* 663 * Use the address of the peer vnode as the address to send 664 * to. We release the peer vnode here. In case it has been 665 * closed by the time the T_CONN_REQ or T_UNITDATA_REQ reaches the 666 * transport the message will get an error or be dropped. 667 * Note that that soua_vp is never dereferenced; it's just a 668 * convenient value by which we can identify the peer. 669 */ 670 sti->sti_ux_taddr.soua_vp = vp; 671 sti->sti_ux_taddr.soua_magic = SOU_MAGIC_EXPLICIT; 672 addr = &sti->sti_ux_taddr; 673 addrlen = (socklen_t)sizeof (sti->sti_ux_taddr); 674 dprintso(so, 1, ("ux_xlate UNIX: addrlen %d, vp %p\n", 675 addrlen, (void *)vp)); 676 VN_RELE(vp); 677 *addrp = addr; 678 *addrlenp = (socklen_t)addrlen; 679 return (0); 680 } 681 682 /* 683 * Esballoc free function for messages that contain SO_FILEP option. 684 * Decrement the reference count on the file pointers using closef. 685 */ 686 void 687 fdbuf_free(struct fdbuf *fdbuf) 688 { 689 int i; 690 struct file *fp; 691 692 dprint(1, ("fdbuf_free: %d fds\n", fdbuf->fd_numfd)); 693 for (i = 0; i < fdbuf->fd_numfd; i++) { 694 /* 695 * We need pointer size alignment for fd_fds. On a LP64 696 * kernel, the required alignment is 8 bytes while 697 * the option headers and values are only 4 bytes 698 * aligned. So its safer to do a bcopy compared to 699 * assigning fdbuf->fd_fds[i] to fp. 700 */ 701 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp)); 702 dprint(1, ("fdbuf_free: [%d] = %p\n", i, (void *)fp)); 703 (void) closef(fp); 704 } 705 if (fdbuf->fd_ebuf != NULL) 706 kmem_free(fdbuf->fd_ebuf, fdbuf->fd_ebuflen); 707 kmem_free(fdbuf, fdbuf->fd_size); 708 } 709 710 /* 711 * Allocate an esballoc'ed message for AF_UNIX file descriptor passing. 712 * Waits if memory is not available. 713 */ 714 mblk_t * 715 fdbuf_allocmsg(int size, struct fdbuf *fdbuf) 716 { 717 uchar_t *buf; 718 mblk_t *mp; 719 720 dprint(1, ("fdbuf_allocmsg: size %d, %d fds\n", size, fdbuf->fd_numfd)); 721 buf = kmem_alloc(size, KM_SLEEP); 722 fdbuf->fd_ebuf = (caddr_t)buf; 723 fdbuf->fd_ebuflen = size; 724 fdbuf->fd_frtn.free_func = fdbuf_free; 725 fdbuf->fd_frtn.free_arg = (caddr_t)fdbuf; 726 727 mp = esballoc_wait(buf, size, BPRI_MED, &fdbuf->fd_frtn); 728 mp->b_datap->db_type = M_PROTO; 729 return (mp); 730 } 731 732 /* 733 * Extract file descriptors from a fdbuf. 734 * Return list in rights/rightslen. 735 */ 736 /*ARGSUSED*/ 737 static int 738 fdbuf_extract(struct fdbuf *fdbuf, void *rights, int rightslen) 739 { 740 int i, fd; 741 int *rp; 742 struct file *fp; 743 int numfd; 744 745 dprint(1, ("fdbuf_extract: %d fds, len %d\n", 746 fdbuf->fd_numfd, rightslen)); 747 748 numfd = fdbuf->fd_numfd; 749 ASSERT(rightslen == numfd * (int)sizeof (int)); 750 751 /* 752 * Allocate a file descriptor and increment the f_count. 753 * The latter is needed since we always call fdbuf_free 754 * which performs a closef. 755 */ 756 rp = (int *)rights; 757 for (i = 0; i < numfd; i++) { 758 if ((fd = ufalloc(0)) == -1) 759 goto cleanup; 760 /* 761 * We need pointer size alignment for fd_fds. On a LP64 762 * kernel, the required alignment is 8 bytes while 763 * the option headers and values are only 4 bytes 764 * aligned. So its safer to do a bcopy compared to 765 * assigning fdbuf->fd_fds[i] to fp. 766 */ 767 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp)); 768 mutex_enter(&fp->f_tlock); 769 fp->f_count++; 770 mutex_exit(&fp->f_tlock); 771 setf(fd, fp); 772 *rp++ = fd; 773 if (AU_AUDITING()) 774 audit_fdrecv(fd, fp); 775 dprint(1, ("fdbuf_extract: [%d] = %d, %p refcnt %d\n", 776 i, fd, (void *)fp, fp->f_count)); 777 } 778 return (0); 779 780 cleanup: 781 /* 782 * Undo whatever partial work the loop above has done. 783 */ 784 { 785 int j; 786 787 rp = (int *)rights; 788 for (j = 0; j < i; j++) { 789 dprint(0, 790 ("fdbuf_extract: cleanup[%d] = %d\n", j, *rp)); 791 (void) closeandsetf(*rp++, NULL); 792 } 793 } 794 795 return (EMFILE); 796 } 797 798 /* 799 * Insert file descriptors into an fdbuf. 800 * Returns a kmem_alloc'ed fdbuf. The fdbuf should be freed 801 * by calling fdbuf_free(). 802 */ 803 int 804 fdbuf_create(void *rights, int rightslen, struct fdbuf **fdbufp) 805 { 806 int numfd, i; 807 int *fds; 808 struct file *fp; 809 struct fdbuf *fdbuf; 810 int fdbufsize; 811 812 dprint(1, ("fdbuf_create: len %d\n", rightslen)); 813 814 numfd = rightslen / (int)sizeof (int); 815 816 fdbufsize = (int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *)); 817 fdbuf = kmem_alloc(fdbufsize, KM_SLEEP); 818 fdbuf->fd_size = fdbufsize; 819 fdbuf->fd_numfd = 0; 820 fdbuf->fd_ebuf = NULL; 821 fdbuf->fd_ebuflen = 0; 822 fds = (int *)rights; 823 for (i = 0; i < numfd; i++) { 824 if ((fp = getf(fds[i])) == NULL) { 825 fdbuf_free(fdbuf); 826 return (EBADF); 827 } 828 dprint(1, ("fdbuf_create: [%d] = %d, %p refcnt %d\n", 829 i, fds[i], (void *)fp, fp->f_count)); 830 mutex_enter(&fp->f_tlock); 831 fp->f_count++; 832 mutex_exit(&fp->f_tlock); 833 /* 834 * The maximum alignment for fdbuf (or any option header 835 * and its value) it 4 bytes. On a LP64 kernel, the alignment 836 * is not sufficient for pointers (fd_fds in this case). Since 837 * we just did a kmem_alloc (we get a double word alignment), 838 * we don't need to do anything on the send side (we loose 839 * the double word alignment because fdbuf goes after an 840 * option header (eg T_unitdata_req) which is only 4 byte 841 * aligned). We take care of this when we extract the file 842 * descriptor in fdbuf_extract or fdbuf_free. 843 */ 844 fdbuf->fd_fds[i] = fp; 845 fdbuf->fd_numfd++; 846 releasef(fds[i]); 847 if (AU_AUDITING()) 848 audit_fdsend(fds[i], fp, 0); 849 } 850 *fdbufp = fdbuf; 851 return (0); 852 } 853 854 static int 855 fdbuf_optlen(int rightslen) 856 { 857 int numfd; 858 859 numfd = rightslen / (int)sizeof (int); 860 861 return ((int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *))); 862 } 863 864 static t_uscalar_t 865 fdbuf_cmsglen(int fdbuflen) 866 { 867 return (t_uscalar_t)((fdbuflen - FDBUF_HDRSIZE) / 868 (int)sizeof (struct file *) * (int)sizeof (int)); 869 } 870 871 872 /* 873 * Return non-zero if the mblk and fdbuf are consistent. 874 */ 875 static int 876 fdbuf_verify(mblk_t *mp, struct fdbuf *fdbuf, int fdbuflen) 877 { 878 if (fdbuflen >= FDBUF_HDRSIZE && 879 fdbuflen == fdbuf->fd_size) { 880 frtn_t *frp = mp->b_datap->db_frtnp; 881 /* 882 * Check that the SO_FILEP portion of the 883 * message has not been modified by 884 * the loopback transport. The sending sockfs generates 885 * a message that is esballoc'ed with the free function 886 * being fdbuf_free() and where free_arg contains the 887 * identical information as the SO_FILEP content. 888 * 889 * If any of these constraints are not satisfied we 890 * silently ignore the option. 891 */ 892 ASSERT(mp); 893 if (frp != NULL && 894 frp->free_func == fdbuf_free && 895 frp->free_arg != NULL && 896 bcmp(frp->free_arg, fdbuf, fdbuflen) == 0) { 897 dprint(1, ("fdbuf_verify: fdbuf %p len %d\n", 898 (void *)fdbuf, fdbuflen)); 899 return (1); 900 } else { 901 zcmn_err(getzoneid(), CE_WARN, 902 "sockfs: mismatched fdbuf content (%p)", 903 (void *)mp); 904 return (0); 905 } 906 } else { 907 zcmn_err(getzoneid(), CE_WARN, 908 "sockfs: mismatched fdbuf len %d, %d\n", 909 fdbuflen, fdbuf->fd_size); 910 return (0); 911 } 912 } 913 914 /* 915 * When the file descriptors returned by sorecvmsg can not be passed 916 * to the application this routine will cleanup the references on 917 * the files. Start at startoff bytes into the buffer. 918 */ 919 static void 920 close_fds(void *fdbuf, int fdbuflen, int startoff) 921 { 922 int *fds = (int *)fdbuf; 923 int numfd = fdbuflen / (int)sizeof (int); 924 int i; 925 926 dprint(1, ("close_fds(%p, %d, %d)\n", fdbuf, fdbuflen, startoff)); 927 928 for (i = 0; i < numfd; i++) { 929 if (startoff < 0) 930 startoff = 0; 931 if (startoff < (int)sizeof (int)) { 932 /* 933 * This file descriptor is partially or fully after 934 * the offset 935 */ 936 dprint(0, 937 ("close_fds: cleanup[%d] = %d\n", i, fds[i])); 938 (void) closeandsetf(fds[i], NULL); 939 } 940 startoff -= (int)sizeof (int); 941 } 942 } 943 944 /* 945 * Close all file descriptors contained in the control part starting at 946 * the startoffset. 947 */ 948 void 949 so_closefds(void *control, t_uscalar_t controllen, int oldflg, 950 int startoff) 951 { 952 struct cmsghdr *cmsg; 953 954 if (control == NULL) 955 return; 956 957 if (oldflg) { 958 close_fds(control, controllen, startoff); 959 return; 960 } 961 /* Scan control part for file descriptors. */ 962 for (cmsg = (struct cmsghdr *)control; 963 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 964 cmsg = CMSG_NEXT(cmsg)) { 965 if (cmsg->cmsg_level == SOL_SOCKET && 966 cmsg->cmsg_type == SCM_RIGHTS) { 967 close_fds(CMSG_CONTENT(cmsg), 968 (int)CMSG_CONTENTLEN(cmsg), 969 startoff - (int)sizeof (struct cmsghdr)); 970 } 971 startoff -= cmsg->cmsg_len; 972 } 973 } 974 975 /* 976 * Returns a pointer/length for the file descriptors contained 977 * in the control buffer. Returns with *fdlenp == -1 if there are no 978 * file descriptor options present. This is different than there being 979 * a zero-length file descriptor option. 980 * Fail if there are multiple SCM_RIGHT cmsgs. 981 */ 982 int 983 so_getfdopt(void *control, t_uscalar_t controllen, int oldflg, 984 void **fdsp, int *fdlenp) 985 { 986 struct cmsghdr *cmsg; 987 void *fds; 988 int fdlen; 989 990 if (control == NULL) { 991 *fdsp = NULL; 992 *fdlenp = -1; 993 return (0); 994 } 995 996 if (oldflg) { 997 *fdsp = control; 998 if (controllen == 0) 999 *fdlenp = -1; 1000 else 1001 *fdlenp = controllen; 1002 dprint(1, ("so_getfdopt: old %d\n", *fdlenp)); 1003 return (0); 1004 } 1005 1006 fds = NULL; 1007 fdlen = 0; 1008 1009 for (cmsg = (struct cmsghdr *)control; 1010 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 1011 cmsg = CMSG_NEXT(cmsg)) { 1012 if (cmsg->cmsg_level == SOL_SOCKET && 1013 cmsg->cmsg_type == SCM_RIGHTS) { 1014 if (fds != NULL) 1015 return (EINVAL); 1016 fds = CMSG_CONTENT(cmsg); 1017 fdlen = (int)CMSG_CONTENTLEN(cmsg); 1018 dprint(1, ("so_getfdopt: new %lu\n", 1019 (size_t)CMSG_CONTENTLEN(cmsg))); 1020 } 1021 } 1022 if (fds == NULL) { 1023 dprint(1, ("so_getfdopt: NONE\n")); 1024 *fdlenp = -1; 1025 } else 1026 *fdlenp = fdlen; 1027 *fdsp = fds; 1028 return (0); 1029 } 1030 1031 /* 1032 * Return the length of the options including any file descriptor options. 1033 */ 1034 t_uscalar_t 1035 so_optlen(void *control, t_uscalar_t controllen, int oldflg) 1036 { 1037 struct cmsghdr *cmsg; 1038 t_uscalar_t optlen = 0; 1039 t_uscalar_t len; 1040 1041 if (control == NULL) 1042 return (0); 1043 1044 if (oldflg) 1045 return ((t_uscalar_t)(sizeof (struct T_opthdr) + 1046 fdbuf_optlen(controllen))); 1047 1048 for (cmsg = (struct cmsghdr *)control; 1049 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 1050 cmsg = CMSG_NEXT(cmsg)) { 1051 if (cmsg->cmsg_level == SOL_SOCKET && 1052 cmsg->cmsg_type == SCM_RIGHTS) { 1053 len = fdbuf_optlen((int)CMSG_CONTENTLEN(cmsg)); 1054 } else { 1055 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg); 1056 } 1057 optlen += (t_uscalar_t)(_TPI_ALIGN_TOPT(len) + 1058 sizeof (struct T_opthdr)); 1059 } 1060 dprint(1, ("so_optlen: controllen %d, flg %d -> optlen %d\n", 1061 controllen, oldflg, optlen)); 1062 return (optlen); 1063 } 1064 1065 /* 1066 * Copy options from control to the mblk. Skip any file descriptor options. 1067 */ 1068 void 1069 so_cmsg2opt(void *control, t_uscalar_t controllen, int oldflg, mblk_t *mp) 1070 { 1071 struct T_opthdr toh; 1072 struct cmsghdr *cmsg; 1073 1074 if (control == NULL) 1075 return; 1076 1077 if (oldflg) { 1078 /* No real options - caller has handled file descriptors */ 1079 return; 1080 } 1081 for (cmsg = (struct cmsghdr *)control; 1082 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); 1083 cmsg = CMSG_NEXT(cmsg)) { 1084 /* 1085 * Note: The caller handles file descriptors prior 1086 * to calling this function. 1087 */ 1088 t_uscalar_t len; 1089 1090 if (cmsg->cmsg_level == SOL_SOCKET && 1091 cmsg->cmsg_type == SCM_RIGHTS) 1092 continue; 1093 1094 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg); 1095 toh.level = cmsg->cmsg_level; 1096 toh.name = cmsg->cmsg_type; 1097 toh.len = len + (t_uscalar_t)sizeof (struct T_opthdr); 1098 toh.status = 0; 1099 1100 soappendmsg(mp, &toh, sizeof (toh)); 1101 soappendmsg(mp, CMSG_CONTENT(cmsg), len); 1102 mp->b_wptr += _TPI_ALIGN_TOPT(len) - len; 1103 ASSERT(mp->b_wptr <= mp->b_datap->db_lim); 1104 } 1105 } 1106 1107 /* 1108 * Return the length of the control message derived from the options. 1109 * Exclude SO_SRCADDR and SO_UNIX_CLOSE options. Include SO_FILEP. 1110 * When oldflg is set only include SO_FILEP. 1111 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen 1112 * allocates the space that so_opt2cmsg fills. If one changes, the other should 1113 * also be checked for any possible impacts. 1114 */ 1115 t_uscalar_t 1116 so_cmsglen(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg) 1117 { 1118 t_uscalar_t cmsglen = 0; 1119 struct T_opthdr *tohp; 1120 t_uscalar_t len; 1121 t_uscalar_t last_roundup = 0; 1122 1123 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1124 1125 for (tohp = (struct T_opthdr *)opt; 1126 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1127 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1128 dprint(1, ("so_cmsglen: level 0x%x, name %d, len %d\n", 1129 tohp->level, tohp->name, tohp->len)); 1130 if (tohp->level == SOL_SOCKET && 1131 (tohp->name == SO_SRCADDR || 1132 tohp->name == SO_UNIX_CLOSE)) { 1133 continue; 1134 } 1135 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) { 1136 struct fdbuf *fdbuf; 1137 int fdbuflen; 1138 1139 fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp); 1140 fdbuflen = (int)_TPI_TOPT_DATALEN(tohp); 1141 1142 if (!fdbuf_verify(mp, fdbuf, fdbuflen)) 1143 continue; 1144 if (oldflg) { 1145 cmsglen += fdbuf_cmsglen(fdbuflen); 1146 continue; 1147 } 1148 len = fdbuf_cmsglen(fdbuflen); 1149 } else if (tohp->level == SOL_SOCKET && 1150 tohp->name == SCM_TIMESTAMP) { 1151 if (oldflg) 1152 continue; 1153 1154 if (get_udatamodel() == DATAMODEL_NATIVE) { 1155 len = sizeof (struct timeval); 1156 } else { 1157 len = sizeof (struct timeval32); 1158 } 1159 } else { 1160 if (oldflg) 1161 continue; 1162 len = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp); 1163 } 1164 /* 1165 * Exclude roundup for last option to not set 1166 * MSG_CTRUNC when the cmsg fits but the padding doesn't fit. 1167 */ 1168 last_roundup = (t_uscalar_t) 1169 (ROUNDUP_cmsglen(len + (int)sizeof (struct cmsghdr)) - 1170 (len + (int)sizeof (struct cmsghdr))); 1171 cmsglen += (t_uscalar_t)(len + (int)sizeof (struct cmsghdr)) + 1172 last_roundup; 1173 } 1174 cmsglen -= last_roundup; 1175 dprint(1, ("so_cmsglen: optlen %d, flg %d -> cmsglen %d\n", 1176 optlen, oldflg, cmsglen)); 1177 return (cmsglen); 1178 } 1179 1180 /* 1181 * Copy options from options to the control. Convert SO_FILEP to 1182 * file descriptors. 1183 * Returns errno or zero. 1184 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen 1185 * allocates the space that so_opt2cmsg fills. If one changes, the other should 1186 * also be checked for any possible impacts. 1187 */ 1188 int 1189 so_opt2cmsg(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg, 1190 void *control, t_uscalar_t controllen) 1191 { 1192 struct T_opthdr *tohp; 1193 struct cmsghdr *cmsg; 1194 struct fdbuf *fdbuf; 1195 int fdbuflen; 1196 int error; 1197 #if defined(DEBUG) || defined(__lint) 1198 struct cmsghdr *cend = (struct cmsghdr *) 1199 (((uint8_t *)control) + ROUNDUP_cmsglen(controllen)); 1200 #endif 1201 cmsg = (struct cmsghdr *)control; 1202 1203 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1204 1205 for (tohp = (struct T_opthdr *)opt; 1206 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1207 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1208 dprint(1, ("so_opt2cmsg: level 0x%x, name %d, len %d\n", 1209 tohp->level, tohp->name, tohp->len)); 1210 1211 if (tohp->level == SOL_SOCKET && 1212 (tohp->name == SO_SRCADDR || 1213 tohp->name == SO_UNIX_CLOSE)) { 1214 continue; 1215 } 1216 ASSERT((uintptr_t)cmsg <= (uintptr_t)control + controllen); 1217 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) { 1218 fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp); 1219 fdbuflen = (int)_TPI_TOPT_DATALEN(tohp); 1220 1221 if (!fdbuf_verify(mp, fdbuf, fdbuflen)) 1222 return (EPROTO); 1223 if (oldflg) { 1224 error = fdbuf_extract(fdbuf, control, 1225 (int)controllen); 1226 if (error != 0) 1227 return (error); 1228 continue; 1229 } else { 1230 int fdlen; 1231 1232 fdlen = (int)fdbuf_cmsglen( 1233 (int)_TPI_TOPT_DATALEN(tohp)); 1234 1235 cmsg->cmsg_level = tohp->level; 1236 cmsg->cmsg_type = SCM_RIGHTS; 1237 cmsg->cmsg_len = (socklen_t)(fdlen + 1238 sizeof (struct cmsghdr)); 1239 1240 error = fdbuf_extract(fdbuf, 1241 CMSG_CONTENT(cmsg), fdlen); 1242 if (error != 0) 1243 return (error); 1244 } 1245 } else if (tohp->level == SOL_SOCKET && 1246 tohp->name == SCM_TIMESTAMP) { 1247 timestruc_t *timestamp; 1248 1249 if (oldflg) 1250 continue; 1251 1252 cmsg->cmsg_level = tohp->level; 1253 cmsg->cmsg_type = tohp->name; 1254 1255 timestamp = 1256 (timestruc_t *)P2ROUNDUP((intptr_t)&tohp[1], 1257 sizeof (intptr_t)); 1258 1259 if (get_udatamodel() == DATAMODEL_NATIVE) { 1260 struct timeval tv; 1261 1262 cmsg->cmsg_len = sizeof (struct timeval) + 1263 sizeof (struct cmsghdr); 1264 tv.tv_sec = timestamp->tv_sec; 1265 tv.tv_usec = timestamp->tv_nsec / 1266 (NANOSEC / MICROSEC); 1267 /* 1268 * on LP64 systems, the struct timeval in 1269 * the destination will not be 8-byte aligned, 1270 * so use bcopy to avoid alignment trouble 1271 */ 1272 bcopy(&tv, CMSG_CONTENT(cmsg), sizeof (tv)); 1273 } else { 1274 struct timeval32 *time32; 1275 1276 cmsg->cmsg_len = sizeof (struct timeval32) + 1277 sizeof (struct cmsghdr); 1278 time32 = (struct timeval32 *)CMSG_CONTENT(cmsg); 1279 time32->tv_sec = (time32_t)timestamp->tv_sec; 1280 time32->tv_usec = 1281 (int32_t)(timestamp->tv_nsec / 1282 (NANOSEC / MICROSEC)); 1283 } 1284 1285 } else { 1286 if (oldflg) 1287 continue; 1288 1289 cmsg->cmsg_level = tohp->level; 1290 cmsg->cmsg_type = tohp->name; 1291 cmsg->cmsg_len = (socklen_t)(_TPI_TOPT_DATALEN(tohp) + 1292 sizeof (struct cmsghdr)); 1293 1294 /* copy content to control data part */ 1295 bcopy(&tohp[1], CMSG_CONTENT(cmsg), 1296 CMSG_CONTENTLEN(cmsg)); 1297 } 1298 /* move to next CMSG structure! */ 1299 cmsg = CMSG_NEXT(cmsg); 1300 } 1301 dprint(1, ("so_opt2cmsg: buf %p len %d; cend %p; final cmsg %p\n", 1302 control, controllen, (void *)cend, (void *)cmsg)); 1303 ASSERT(cmsg <= cend); 1304 return (0); 1305 } 1306 1307 /* 1308 * Extract the SO_SRCADDR option value if present. 1309 */ 1310 void 1311 so_getopt_srcaddr(void *opt, t_uscalar_t optlen, void **srcp, 1312 t_uscalar_t *srclenp) 1313 { 1314 struct T_opthdr *tohp; 1315 1316 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1317 1318 ASSERT(srcp != NULL && srclenp != NULL); 1319 *srcp = NULL; 1320 *srclenp = 0; 1321 1322 for (tohp = (struct T_opthdr *)opt; 1323 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1324 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1325 dprint(1, ("so_getopt_srcaddr: level 0x%x, name %d, len %d\n", 1326 tohp->level, tohp->name, tohp->len)); 1327 if (tohp->level == SOL_SOCKET && 1328 tohp->name == SO_SRCADDR) { 1329 *srcp = _TPI_TOPT_DATA(tohp); 1330 *srclenp = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp); 1331 } 1332 } 1333 } 1334 1335 /* 1336 * Verify if the SO_UNIX_CLOSE option is present. 1337 */ 1338 int 1339 so_getopt_unix_close(void *opt, t_uscalar_t optlen) 1340 { 1341 struct T_opthdr *tohp; 1342 1343 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1344 1345 for (tohp = (struct T_opthdr *)opt; 1346 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1347 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1348 dprint(1, 1349 ("so_getopt_unix_close: level 0x%x, name %d, len %d\n", 1350 tohp->level, tohp->name, tohp->len)); 1351 if (tohp->level == SOL_SOCKET && 1352 tohp->name == SO_UNIX_CLOSE) 1353 return (1); 1354 } 1355 return (0); 1356 } 1357 1358 /* 1359 * Allocate an M_PROTO message. 1360 * 1361 * If allocation fails the behavior depends on sleepflg: 1362 * _ALLOC_NOSLEEP fail immediately 1363 * _ALLOC_INTR sleep for memory until a signal is caught 1364 * _ALLOC_SLEEP sleep forever. Don't return NULL. 1365 */ 1366 mblk_t * 1367 soallocproto(size_t size, int sleepflg, cred_t *cr) 1368 { 1369 mblk_t *mp; 1370 1371 /* Round up size for reuse */ 1372 size = MAX(size, 64); 1373 if (cr != NULL) 1374 mp = allocb_cred(size, cr, curproc->p_pid); 1375 else 1376 mp = allocb(size, BPRI_MED); 1377 1378 if (mp == NULL) { 1379 int error; /* Dummy - error not returned to caller */ 1380 1381 switch (sleepflg) { 1382 case _ALLOC_SLEEP: 1383 if (cr != NULL) { 1384 mp = allocb_cred_wait(size, STR_NOSIG, &error, 1385 cr, curproc->p_pid); 1386 } else { 1387 mp = allocb_wait(size, BPRI_MED, STR_NOSIG, 1388 &error); 1389 } 1390 ASSERT(mp); 1391 break; 1392 case _ALLOC_INTR: 1393 if (cr != NULL) { 1394 mp = allocb_cred_wait(size, 0, &error, cr, 1395 curproc->p_pid); 1396 } else { 1397 mp = allocb_wait(size, BPRI_MED, 0, &error); 1398 } 1399 if (mp == NULL) { 1400 /* Caught signal while sleeping for memory */ 1401 eprintline(ENOBUFS); 1402 return (NULL); 1403 } 1404 break; 1405 case _ALLOC_NOSLEEP: 1406 default: 1407 eprintline(ENOBUFS); 1408 return (NULL); 1409 } 1410 } 1411 DB_TYPE(mp) = M_PROTO; 1412 return (mp); 1413 } 1414 1415 /* 1416 * Allocate an M_PROTO message with a single component. 1417 * len is the length of buf. size is the amount to allocate. 1418 * 1419 * buf can be NULL with a non-zero len. 1420 * This results in a bzero'ed chunk being placed the message. 1421 */ 1422 mblk_t * 1423 soallocproto1(const void *buf, ssize_t len, ssize_t size, int sleepflg, 1424 cred_t *cr) 1425 { 1426 mblk_t *mp; 1427 1428 if (size == 0) 1429 size = len; 1430 1431 ASSERT(size >= len); 1432 /* Round up size for reuse */ 1433 size = MAX(size, 64); 1434 mp = soallocproto(size, sleepflg, cr); 1435 if (mp == NULL) 1436 return (NULL); 1437 mp->b_datap->db_type = M_PROTO; 1438 if (len != 0) { 1439 if (buf != NULL) 1440 bcopy(buf, mp->b_wptr, len); 1441 else 1442 bzero(mp->b_wptr, len); 1443 mp->b_wptr += len; 1444 } 1445 return (mp); 1446 } 1447 1448 /* 1449 * Append buf/len to mp. 1450 * The caller has to ensure that there is enough room in the mblk. 1451 * 1452 * buf can be NULL with a non-zero len. 1453 * This results in a bzero'ed chunk being placed the message. 1454 */ 1455 void 1456 soappendmsg(mblk_t *mp, const void *buf, ssize_t len) 1457 { 1458 ASSERT(mp); 1459 1460 if (len != 0) { 1461 /* Assert for room left */ 1462 ASSERT(mp->b_datap->db_lim - mp->b_wptr >= len); 1463 if (buf != NULL) 1464 bcopy(buf, mp->b_wptr, len); 1465 else 1466 bzero(mp->b_wptr, len); 1467 } 1468 mp->b_wptr += len; 1469 } 1470 1471 /* 1472 * Create a message using two kernel buffers. 1473 * If size is set that will determine the allocation size (e.g. for future 1474 * soappendmsg calls). If size is zero it is derived from the buffer 1475 * lengths. 1476 */ 1477 mblk_t * 1478 soallocproto2(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2, 1479 ssize_t size, int sleepflg, cred_t *cr) 1480 { 1481 mblk_t *mp; 1482 1483 if (size == 0) 1484 size = len1 + len2; 1485 ASSERT(size >= len1 + len2); 1486 1487 mp = soallocproto1(buf1, len1, size, sleepflg, cr); 1488 if (mp) 1489 soappendmsg(mp, buf2, len2); 1490 return (mp); 1491 } 1492 1493 /* 1494 * Create a message using three kernel buffers. 1495 * If size is set that will determine the allocation size (for future 1496 * soappendmsg calls). If size is zero it is derived from the buffer 1497 * lengths. 1498 */ 1499 mblk_t * 1500 soallocproto3(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2, 1501 const void *buf3, ssize_t len3, ssize_t size, int sleepflg, cred_t *cr) 1502 { 1503 mblk_t *mp; 1504 1505 if (size == 0) 1506 size = len1 + len2 +len3; 1507 ASSERT(size >= len1 + len2 + len3); 1508 1509 mp = soallocproto1(buf1, len1, size, sleepflg, cr); 1510 if (mp != NULL) { 1511 soappendmsg(mp, buf2, len2); 1512 soappendmsg(mp, buf3, len3); 1513 } 1514 return (mp); 1515 } 1516 1517 #ifdef DEBUG 1518 char * 1519 pr_state(uint_t state, uint_t mode) 1520 { 1521 static char buf[1024]; 1522 1523 buf[0] = 0; 1524 if (state & SS_ISCONNECTED) 1525 (void) strcat(buf, "ISCONNECTED "); 1526 if (state & SS_ISCONNECTING) 1527 (void) strcat(buf, "ISCONNECTING "); 1528 if (state & SS_ISDISCONNECTING) 1529 (void) strcat(buf, "ISDISCONNECTING "); 1530 if (state & SS_CANTSENDMORE) 1531 (void) strcat(buf, "CANTSENDMORE "); 1532 1533 if (state & SS_CANTRCVMORE) 1534 (void) strcat(buf, "CANTRCVMORE "); 1535 if (state & SS_ISBOUND) 1536 (void) strcat(buf, "ISBOUND "); 1537 if (state & SS_NDELAY) 1538 (void) strcat(buf, "NDELAY "); 1539 if (state & SS_NONBLOCK) 1540 (void) strcat(buf, "NONBLOCK "); 1541 1542 if (state & SS_ASYNC) 1543 (void) strcat(buf, "ASYNC "); 1544 if (state & SS_ACCEPTCONN) 1545 (void) strcat(buf, "ACCEPTCONN "); 1546 if (state & SS_SAVEDEOR) 1547 (void) strcat(buf, "SAVEDEOR "); 1548 1549 if (state & SS_RCVATMARK) 1550 (void) strcat(buf, "RCVATMARK "); 1551 if (state & SS_OOBPEND) 1552 (void) strcat(buf, "OOBPEND "); 1553 if (state & SS_HAVEOOBDATA) 1554 (void) strcat(buf, "HAVEOOBDATA "); 1555 if (state & SS_HADOOBDATA) 1556 (void) strcat(buf, "HADOOBDATA "); 1557 1558 if (mode & SM_PRIV) 1559 (void) strcat(buf, "PRIV "); 1560 if (mode & SM_ATOMIC) 1561 (void) strcat(buf, "ATOMIC "); 1562 if (mode & SM_ADDR) 1563 (void) strcat(buf, "ADDR "); 1564 if (mode & SM_CONNREQUIRED) 1565 (void) strcat(buf, "CONNREQUIRED "); 1566 1567 if (mode & SM_FDPASSING) 1568 (void) strcat(buf, "FDPASSING "); 1569 if (mode & SM_EXDATA) 1570 (void) strcat(buf, "EXDATA "); 1571 if (mode & SM_OPTDATA) 1572 (void) strcat(buf, "OPTDATA "); 1573 if (mode & SM_BYTESTREAM) 1574 (void) strcat(buf, "BYTESTREAM "); 1575 return (buf); 1576 } 1577 1578 char * 1579 pr_addr(int family, struct sockaddr *addr, t_uscalar_t addrlen) 1580 { 1581 static char buf[1024]; 1582 1583 if (addr == NULL || addrlen == 0) { 1584 (void) sprintf(buf, "(len %d) %p", addrlen, (void *)addr); 1585 return (buf); 1586 } 1587 switch (family) { 1588 case AF_INET: { 1589 struct sockaddr_in sin; 1590 1591 bcopy(addr, &sin, sizeof (sin)); 1592 1593 (void) sprintf(buf, "(len %d) %x/%d", 1594 addrlen, ntohl(sin.sin_addr.s_addr), ntohs(sin.sin_port)); 1595 break; 1596 } 1597 case AF_INET6: { 1598 struct sockaddr_in6 sin6; 1599 uint16_t *piece = (uint16_t *)&sin6.sin6_addr; 1600 1601 bcopy((char *)addr, (char *)&sin6, sizeof (sin6)); 1602 (void) sprintf(buf, "(len %d) %x:%x:%x:%x:%x:%x:%x:%x/%d", 1603 addrlen, 1604 ntohs(piece[0]), ntohs(piece[1]), 1605 ntohs(piece[2]), ntohs(piece[3]), 1606 ntohs(piece[4]), ntohs(piece[5]), 1607 ntohs(piece[6]), ntohs(piece[7]), 1608 ntohs(sin6.sin6_port)); 1609 break; 1610 } 1611 case AF_UNIX: { 1612 struct sockaddr_un *soun = (struct sockaddr_un *)addr; 1613 1614 (void) sprintf(buf, "(len %d) %s", addrlen, 1615 (soun == NULL) ? "(none)" : soun->sun_path); 1616 break; 1617 } 1618 default: 1619 (void) sprintf(buf, "(unknown af %d)", family); 1620 break; 1621 } 1622 return (buf); 1623 } 1624 1625 /* The logical equivalence operator (a if-and-only-if b) */ 1626 #define EQUIVALENT(a, b) (((a) && (b)) || (!(a) && (!(b)))) 1627 1628 /* 1629 * Verify limitations and invariants on oob state. 1630 * Return 1 if OK, otherwise 0 so that it can be used as 1631 * ASSERT(verify_oobstate(so)); 1632 */ 1633 int 1634 so_verify_oobstate(struct sonode *so) 1635 { 1636 boolean_t havemark; 1637 1638 ASSERT(MUTEX_HELD(&so->so_lock)); 1639 1640 /* 1641 * The possible state combinations are: 1642 * 0 1643 * SS_OOBPEND 1644 * SS_OOBPEND|SS_HAVEOOBDATA 1645 * SS_OOBPEND|SS_HADOOBDATA 1646 * SS_HADOOBDATA 1647 */ 1648 switch (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA)) { 1649 case 0: 1650 case SS_OOBPEND: 1651 case SS_OOBPEND|SS_HAVEOOBDATA: 1652 case SS_OOBPEND|SS_HADOOBDATA: 1653 case SS_HADOOBDATA: 1654 break; 1655 default: 1656 printf("Bad oob state 1 (%p): state %s\n", 1657 (void *)so, pr_state(so->so_state, so->so_mode)); 1658 return (0); 1659 } 1660 1661 /* SS_RCVATMARK should only be set when SS_OOBPEND is set */ 1662 if ((so->so_state & (SS_RCVATMARK|SS_OOBPEND)) == SS_RCVATMARK) { 1663 printf("Bad oob state 2 (%p): state %s\n", 1664 (void *)so, pr_state(so->so_state, so->so_mode)); 1665 return (0); 1666 } 1667 1668 /* 1669 * (havemark != 0 or SS_RCVATMARK) iff SS_OOBPEND 1670 * For TPI, the presence of a "mark" is indicated by sti_oobsigcnt. 1671 */ 1672 havemark = (SOCK_IS_NONSTR(so)) ? so->so_oobmark > 0 : 1673 SOTOTPI(so)->sti_oobsigcnt > 0; 1674 1675 if (!EQUIVALENT(havemark || (so->so_state & SS_RCVATMARK), 1676 so->so_state & SS_OOBPEND)) { 1677 printf("Bad oob state 3 (%p): state %s\n", 1678 (void *)so, pr_state(so->so_state, so->so_mode)); 1679 return (0); 1680 } 1681 1682 /* 1683 * Unless SO_OOBINLINE we have so_oobmsg != NULL iff SS_HAVEOOBDATA 1684 */ 1685 if (!(so->so_options & SO_OOBINLINE) && 1686 !EQUIVALENT(so->so_oobmsg != NULL, so->so_state & SS_HAVEOOBDATA)) { 1687 printf("Bad oob state 4 (%p): state %s\n", 1688 (void *)so, pr_state(so->so_state, so->so_mode)); 1689 return (0); 1690 } 1691 1692 if (!SOCK_IS_NONSTR(so) && 1693 SOTOTPI(so)->sti_oobsigcnt < SOTOTPI(so)->sti_oobcnt) { 1694 printf("Bad oob state 5 (%p): counts %d/%d state %s\n", 1695 (void *)so, SOTOTPI(so)->sti_oobsigcnt, 1696 SOTOTPI(so)->sti_oobcnt, 1697 pr_state(so->so_state, so->so_mode)); 1698 return (0); 1699 } 1700 1701 return (1); 1702 } 1703 #undef EQUIVALENT 1704 #endif /* DEBUG */ 1705 1706 /* initialize sockfs zone specific kstat related items */ 1707 void * 1708 sock_kstat_init(zoneid_t zoneid) 1709 { 1710 kstat_t *ksp; 1711 1712 ksp = kstat_create_zone("sockfs", 0, "sock_unix_list", "misc", 1713 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE|KSTAT_FLAG_VIRTUAL, zoneid); 1714 1715 if (ksp != NULL) { 1716 ksp->ks_update = sockfs_update; 1717 ksp->ks_snapshot = sockfs_snapshot; 1718 ksp->ks_lock = &socklist.sl_lock; 1719 ksp->ks_private = (void *)(uintptr_t)zoneid; 1720 kstat_install(ksp); 1721 } 1722 1723 return (ksp); 1724 } 1725 1726 /* tear down sockfs zone specific kstat related items */ 1727 /*ARGSUSED*/ 1728 void 1729 sock_kstat_fini(zoneid_t zoneid, void *arg) 1730 { 1731 kstat_t *ksp = (kstat_t *)arg; 1732 1733 if (ksp != NULL) { 1734 ASSERT(zoneid == (zoneid_t)(uintptr_t)ksp->ks_private); 1735 kstat_delete(ksp); 1736 } 1737 } 1738 1739 /* 1740 * Zones: 1741 * Note that nactive is going to be different for each zone. 1742 * This means we require kstat to call sockfs_update and then sockfs_snapshot 1743 * for the same zone, or sockfs_snapshot will be taken into the wrong size 1744 * buffer. This is safe, but if the buffer is too small, user will not be 1745 * given details of all sockets. However, as this kstat has a ks_lock, kstat 1746 * driver will keep it locked between the update and the snapshot, so no 1747 * other process (zone) can currently get inbetween resulting in a wrong size 1748 * buffer allocation. 1749 */ 1750 static int 1751 sockfs_update(kstat_t *ksp, int rw) 1752 { 1753 uint_t nactive = 0; /* # of active AF_UNIX sockets */ 1754 struct sonode *so; /* current sonode on socklist */ 1755 zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private; 1756 1757 ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid()); 1758 1759 if (rw == KSTAT_WRITE) { /* bounce all writes */ 1760 return (EACCES); 1761 } 1762 1763 for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) { 1764 if (so->so_count != 0 && so->so_zoneid == myzoneid) { 1765 nactive++; 1766 } 1767 } 1768 ksp->ks_ndata = nactive; 1769 ksp->ks_data_size = nactive * sizeof (struct k_sockinfo); 1770 1771 return (0); 1772 } 1773 1774 static int 1775 sockfs_snapshot(kstat_t *ksp, void *buf, int rw) 1776 { 1777 int ns; /* # of sonodes we've copied */ 1778 struct sonode *so; /* current sonode on socklist */ 1779 struct k_sockinfo *pksi; /* where we put sockinfo data */ 1780 t_uscalar_t sn_len; /* soa_len */ 1781 zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private; 1782 sotpi_info_t *sti; 1783 1784 ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid()); 1785 1786 ksp->ks_snaptime = gethrtime(); 1787 1788 if (rw == KSTAT_WRITE) { /* bounce all writes */ 1789 return (EACCES); 1790 } 1791 1792 /* 1793 * for each sonode on the socklist, we massage the important 1794 * info into buf, in k_sockinfo format. 1795 */ 1796 pksi = (struct k_sockinfo *)buf; 1797 ns = 0; 1798 for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) { 1799 /* only stuff active sonodes and the same zone: */ 1800 if (so->so_count == 0 || so->so_zoneid != myzoneid) { 1801 continue; 1802 } 1803 1804 /* 1805 * If the sonode was activated between the update and the 1806 * snapshot, we're done - as this is only a snapshot. 1807 */ 1808 if ((caddr_t)(pksi) >= (caddr_t)buf + ksp->ks_data_size) { 1809 break; 1810 } 1811 1812 sti = SOTOTPI(so); 1813 /* copy important info into buf: */ 1814 pksi->ks_si.si_size = sizeof (struct k_sockinfo); 1815 pksi->ks_si.si_family = so->so_family; 1816 pksi->ks_si.si_type = so->so_type; 1817 pksi->ks_si.si_flag = so->so_flag; 1818 pksi->ks_si.si_state = so->so_state; 1819 pksi->ks_si.si_serv_type = sti->sti_serv_type; 1820 pksi->ks_si.si_ux_laddr_sou_magic = 1821 sti->sti_ux_laddr.soua_magic; 1822 pksi->ks_si.si_ux_faddr_sou_magic = 1823 sti->sti_ux_faddr.soua_magic; 1824 pksi->ks_si.si_laddr_soa_len = sti->sti_laddr.soa_len; 1825 pksi->ks_si.si_faddr_soa_len = sti->sti_faddr.soa_len; 1826 pksi->ks_si.si_szoneid = so->so_zoneid; 1827 pksi->ks_si.si_faddr_noxlate = sti->sti_faddr_noxlate; 1828 1829 mutex_enter(&so->so_lock); 1830 1831 if (sti->sti_laddr_sa != NULL) { 1832 ASSERT(sti->sti_laddr_sa->sa_data != NULL); 1833 sn_len = sti->sti_laddr_len; 1834 ASSERT(sn_len <= sizeof (short) + 1835 sizeof (pksi->ks_si.si_laddr_sun_path)); 1836 1837 pksi->ks_si.si_laddr_family = 1838 sti->sti_laddr_sa->sa_family; 1839 if (sn_len != 0) { 1840 /* AF_UNIX socket names are NULL terminated */ 1841 (void) strncpy(pksi->ks_si.si_laddr_sun_path, 1842 sti->sti_laddr_sa->sa_data, 1843 sizeof (pksi->ks_si.si_laddr_sun_path)); 1844 sn_len = strlen(pksi->ks_si.si_laddr_sun_path); 1845 } 1846 pksi->ks_si.si_laddr_sun_path[sn_len] = 0; 1847 } 1848 1849 if (sti->sti_faddr_sa != NULL) { 1850 ASSERT(sti->sti_faddr_sa->sa_data != NULL); 1851 sn_len = sti->sti_faddr_len; 1852 ASSERT(sn_len <= sizeof (short) + 1853 sizeof (pksi->ks_si.si_faddr_sun_path)); 1854 1855 pksi->ks_si.si_faddr_family = 1856 sti->sti_faddr_sa->sa_family; 1857 if (sn_len != 0) { 1858 (void) strncpy(pksi->ks_si.si_faddr_sun_path, 1859 sti->sti_faddr_sa->sa_data, 1860 sizeof (pksi->ks_si.si_faddr_sun_path)); 1861 sn_len = strlen(pksi->ks_si.si_faddr_sun_path); 1862 } 1863 pksi->ks_si.si_faddr_sun_path[sn_len] = 0; 1864 } 1865 1866 mutex_exit(&so->so_lock); 1867 1868 (void) sprintf(pksi->ks_straddr[0], "%p", (void *)so); 1869 (void) sprintf(pksi->ks_straddr[1], "%p", 1870 (void *)sti->sti_ux_laddr.soua_vp); 1871 (void) sprintf(pksi->ks_straddr[2], "%p", 1872 (void *)sti->sti_ux_faddr.soua_vp); 1873 1874 ns++; 1875 pksi++; 1876 } 1877 1878 ksp->ks_ndata = ns; 1879 return (0); 1880 } 1881 1882 ssize_t 1883 soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size) 1884 { 1885 struct uio auio; 1886 struct iovec aiov[MSG_MAXIOVLEN]; 1887 register vnode_t *vp; 1888 int ioflag, rwflag; 1889 ssize_t cnt; 1890 int error = 0; 1891 int iovcnt = 0; 1892 short fflag; 1893 1894 vp = fp->f_vnode; 1895 fflag = fp->f_flag; 1896 1897 rwflag = 0; 1898 aiov[0].iov_base = (caddr_t)buf; 1899 aiov[0].iov_len = size; 1900 iovcnt = 1; 1901 cnt = (ssize_t)size; 1902 (void) VOP_RWLOCK(vp, rwflag, NULL); 1903 1904 auio.uio_loffset = fileoff; 1905 auio.uio_iov = aiov; 1906 auio.uio_iovcnt = iovcnt; 1907 auio.uio_resid = cnt; 1908 auio.uio_segflg = UIO_SYSSPACE; 1909 auio.uio_llimit = MAXOFFSET_T; 1910 auio.uio_fmode = fflag; 1911 auio.uio_extflg = UIO_COPY_CACHED; 1912 1913 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1914 1915 /* If read sync is not asked for, filter sync flags */ 1916 if ((ioflag & FRSYNC) == 0) 1917 ioflag &= ~(FSYNC|FDSYNC); 1918 error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL); 1919 cnt -= auio.uio_resid; 1920 1921 VOP_RWUNLOCK(vp, rwflag, NULL); 1922 1923 if (error == EINTR && cnt != 0) 1924 error = 0; 1925 out: 1926 if (error != 0) { 1927 *err = error; 1928 return (0); 1929 } else { 1930 *err = 0; 1931 return (cnt); 1932 } 1933 } 1934 1935 int 1936 so_copyin(const void *from, void *to, size_t size, int fromkernel) 1937 { 1938 if (fromkernel) { 1939 bcopy(from, to, size); 1940 return (0); 1941 } 1942 return (xcopyin(from, to, size)); 1943 } 1944 1945 int 1946 so_copyout(const void *from, void *to, size_t size, int tokernel) 1947 { 1948 if (tokernel) { 1949 bcopy(from, to, size); 1950 return (0); 1951 } 1952 return (xcopyout(from, to, size)); 1953 } 1954