/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/termios.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/esunddi.h>
#include <sys/flock.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/mkdev.h>
#include <sys/pathname.h>
#include <sys/ddi.h>
#include <sys/stat.h>
#include <sys/fs/snode.h>
#include <sys/fs/dv_node.h>
#include <sys/zone.h>

#include <sys/socket.h>
#include <sys/socketvar.h>
#include <netinet/in.h>
#include <sys/un.h>

#include <sys/ucred.h>

#include <sys/tiuser.h>
#define	_SUN_TPI_VERSION	2
#include <sys/tihdr.h>

#include <c2/audit.h>

#include <fs/sockfs/nl7c.h>

/*
 * Macros that operate on struct cmsghdr.
 * The CMSG_VALID macro does not assume that the last option buffer is padded.
 */
#define	CMSG_CONTENT(cmsg)	(&((cmsg)[1]))
#define	CMSG_CONTENTLEN(cmsg)	((cmsg)->cmsg_len - sizeof (struct cmsghdr))
#define	CMSG_VALID(cmsg, start, end)					\
	(ISALIGNED_cmsghdr(cmsg) &&					\
	((uintptr_t)(cmsg) >= (uintptr_t)(start)) &&			\
	((uintptr_t)(cmsg) < (uintptr_t)(end)) &&			\
	((ssize_t)(cmsg)->cmsg_len >= sizeof (struct cmsghdr)) &&	\
	((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end)))
#define	SO_LOCK_WAKEUP_TIME	3000	/* Wakeup time in milliseconds */

static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
struct kmem_cache *socktpi_sod_cache;

dev_t sockdev;	/* For fsid in getattr */
int sockfs_defer_nl7c_init = 0;
struct sockparams *sphead;
krwlock_t splist_lock;

struct socklist socklist;

static int sockfs_update(kstat_t *, int);
static int sockfs_snapshot(kstat_t *, void *, int);

extern void sendfile_init();

extern void nl7c_init(void);

extern int sostr_init();

extern int modrootloaded;

#define	ADRSTRLEN (2 * sizeof (void *) + 1)
/*
 * kernel structure for passing the sockinfo data back up to the user.
 * the strings array allows us to convert AF_UNIX addresses into strings
 * with a common method regardless of which n-bit kernel we're running.
 */
struct k_sockinfo {
	struct sockinfo	ks_si;
	char		ks_straddr[3][ADRSTRLEN];
};

/*
 * Translate from a device pathname (e.g. "/dev/tcp") to a vnode.
 * Returns with the vnode held.
 */
static int
sogetvp(char *devpath, vnode_t **vpp, int uioflag)
{
	struct snode *csp;
	vnode_t *vp, *dvp;
	major_t maj;
	int error;

	ASSERT(uioflag == UIO_SYSSPACE || uioflag == UIO_USERSPACE);
	/*
	 * Lookup the underlying filesystem vnode.
	 */
	error = lookupname(devpath, uioflag, FOLLOW, NULLVPP, &vp);
	if (error)
		return (error);

	/* Check that it is the correct vnode */
	if (vp->v_type != VCHR) {
		VN_RELE(vp);
		return (ENOTSOCK);
	}

	/*
	 * If devpath went through devfs, the device should already
	 * be configured. If devpath is a mknod file, however, we
	 * need to make sure the device is properly configured.
	 * To do this, we do something similar to spec_open()
	 * except that we resolve to the minor/leaf level since
	 * we need to return a vnode.
	 */
	csp = VTOS(VTOS(vp)->s_commonvp);
	if (!(csp->s_flag & SDIPSET)) {
		char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		error = ddi_dev_pathname(vp->v_rdev, S_IFCHR, pathname);
		if (error == 0)
			error = devfs_lookupname(pathname, NULLVPP, &dvp);
		VN_RELE(vp);
		kmem_free(pathname, MAXPATHLEN);
		if (error != 0)
			return (ENXIO);
		vp = dvp;	/* use the devfs vp */
	}

	/* device is configured at this point */
	maj = getmajor(vp->v_rdev);
	if (!STREAMSTAB(maj)) {
		VN_RELE(vp);
		return (ENOSTR);
	}

	*vpp = vp;
	return (0);
}

/*
 * Add or delete (the latter if devpath is NULL) an entry in the sockparams
 * table. If devpathlen is zero the devpath will not be kmem_freed. Otherwise
 * this routine assumes that the caller has kmem_alloced devpath/devpathlen
 * for this routine to consume.
 * The zero devpathlen could be used if the kernel wants to create entries
 * itself by calling sockconfig(1,2,3, "/dev/tcp", 0);
 */
int
soconfig(int domain, int type, int protocol,
    char *devpath, int devpathlen)
{
	struct sockparams **spp;
	struct sockparams *sp;
	int error = 0;

	dprint(0, ("soconfig(%d,%d,%d,%s,%d)\n",
	    domain, type, protocol, devpath, devpathlen));

	if (sockfs_defer_nl7c_init) {
		nl7c_init();
		sockfs_defer_nl7c_init = 0;
	}

	/*
	 * Look for an existing match.
207 */ 208 rw_enter(&splist_lock, RW_WRITER); 209 for (spp = &sphead; (sp = *spp) != NULL; spp = &sp->sp_next) { 210 if (sp->sp_domain == domain && 211 sp->sp_type == type && 212 sp->sp_protocol == protocol) { 213 break; 214 } 215 } 216 if (devpath == NULL) { 217 ASSERT(devpathlen == 0); 218 219 /* Delete existing entry */ 220 if (sp == NULL) { 221 error = ENXIO; 222 goto done; 223 } 224 /* Unlink and free existing entry */ 225 *spp = sp->sp_next; 226 ASSERT(sp->sp_vnode); 227 VN_RELE(sp->sp_vnode); 228 if (sp->sp_devpathlen != 0) 229 kmem_free(sp->sp_devpath, sp->sp_devpathlen); 230 kmem_free(sp, sizeof (*sp)); 231 } else { 232 vnode_t *vp; 233 234 /* Add new entry */ 235 if (sp != NULL) { 236 error = EEXIST; 237 goto done; 238 } 239 240 error = sogetvp(devpath, &vp, UIO_SYSSPACE); 241 if (error) { 242 dprint(0, ("soconfig: vp %s failed with %d\n", 243 devpath, error)); 244 goto done; 245 } 246 247 dprint(0, ("soconfig: %s => vp %p, dev 0x%lx\n", 248 devpath, (void *)vp, vp->v_rdev)); 249 250 sp = kmem_alloc(sizeof (*sp), KM_SLEEP); 251 sp->sp_domain = domain; 252 sp->sp_type = type; 253 sp->sp_protocol = protocol; 254 sp->sp_devpath = devpath; 255 sp->sp_devpathlen = devpathlen; 256 sp->sp_vnode = vp; 257 sp->sp_next = NULL; 258 *spp = sp; 259 } 260 done: 261 rw_exit(&splist_lock); 262 if (error) { 263 if (devpath != NULL) 264 kmem_free(devpath, devpathlen); 265 #ifdef SOCK_DEBUG 266 eprintline(error); 267 #endif /* SOCK_DEBUG */ 268 } 269 return (error); 270 } 271 272 /* 273 * Lookup an entry in the sockparams list based on the triple. 274 * If no entry is found and devpath is not NULL translate devpath to a 275 * vnode. Note that devpath is a pointer to a user address! 276 * Returns with the vnode held. 277 * 278 * When this routine uses devpath it does not create an entry in the sockparams 279 * list since this routine can run on behalf of any user and one user 280 * should not be able to effect the transport used by another user. 281 * 282 * In order to return the correct error this routine has to do wildcard scans 283 * of the list. The errors are (in decreasing precedence): 284 * EAFNOSUPPORT - address family not in list 285 * EPROTONOSUPPORT - address family supported but not protocol. 286 * EPROTOTYPE - address family and protocol supported but not socket type. 287 */ 288 vnode_t * 289 solookup(int domain, int type, int protocol, char *devpath, int *errorp) 290 { 291 struct sockparams *sp; 292 int error; 293 vnode_t *vp; 294 295 rw_enter(&splist_lock, RW_READER); 296 for (sp = sphead; sp != NULL; sp = sp->sp_next) { 297 if (sp->sp_domain == domain && 298 sp->sp_type == type && 299 sp->sp_protocol == protocol) { 300 break; 301 } 302 } 303 if (sp == NULL) { 304 dprint(0, ("solookup(%d,%d,%d) not found\n", 305 domain, type, protocol)); 306 if (devpath == NULL) { 307 /* Determine correct error code */ 308 int found = 0; 309 310 for (sp = sphead; sp != NULL; sp = sp->sp_next) { 311 if (sp->sp_domain == domain && found < 1) 312 found = 1; 313 if (sp->sp_domain == domain && 314 sp->sp_protocol == protocol && found < 2) 315 found = 2; 316 } 317 rw_exit(&splist_lock); 318 switch (found) { 319 case 0: 320 *errorp = EAFNOSUPPORT; 321 break; 322 case 1: 323 *errorp = EPROTONOSUPPORT; 324 break; 325 case 2: 326 *errorp = EPROTOTYPE; 327 break; 328 } 329 return (NULL); 330 } 331 rw_exit(&splist_lock); 332 333 /* 334 * Return vp based on devpath. 335 * Do not enter into table to avoid random users 336 * modifying the sockparams list. 
337 */ 338 error = sogetvp(devpath, &vp, UIO_USERSPACE); 339 if (error) { 340 dprint(0, ("solookup: vp %p failed with %d\n", 341 (void *)devpath, error)); 342 *errorp = EPROTONOSUPPORT; 343 return (NULL); 344 } 345 dprint(0, ("solookup: %p => vp %p, dev 0x%lx\n", 346 (void *)devpath, (void *)vp, vp->v_rdev)); 347 348 return (vp); 349 } 350 dprint(0, ("solookup(%d,%d,%d) vp %p devpath %s\n", 351 domain, type, protocol, (void *)sp->sp_vnode, sp->sp_devpath)); 352 353 vp = sp->sp_vnode; 354 VN_HOLD(vp); 355 rw_exit(&splist_lock); 356 return (vp); 357 } 358 359 /* 360 * Return a socket vnode. 361 * 362 * Assumes that the caller is "passing" an VN_HOLD for accessvp i.e. 363 * when the socket is freed a VN_RELE will take place. 364 * 365 * Note that sockets assume that the driver will clone (either itself 366 * or by using the clone driver) i.e. a socket() call will always 367 * result in a new vnode being created. 368 */ 369 struct vnode * 370 makesockvp(struct vnode *accessvp, int domain, int type, int protocol) 371 { 372 kmem_cache_t *cp; 373 struct sonode *so; 374 struct vnode *vp; 375 time_t now; 376 dev_t dev; 377 378 cp = (domain == AF_UNIX) ? socktpi_unix_cache : socktpi_cache; 379 so = kmem_cache_alloc(cp, KM_SLEEP); 380 so->so_cache = cp; 381 so->so_obj = so; 382 vp = SOTOV(so); 383 now = gethrestime_sec(); 384 385 so->so_flag = 0; 386 ASSERT(so->so_accessvp == NULL); 387 so->so_accessvp = accessvp; 388 dev = accessvp->v_rdev; 389 390 /* 391 * Record in so_flag that it is a clone. 392 */ 393 if (getmajor(dev) == clone_major) { 394 so->so_flag |= SOCLONE; 395 } 396 so->so_dev = dev; 397 398 so->so_state = 0; 399 so->so_mode = 0; 400 401 so->so_fsid = sockdev; 402 so->so_atime = now; 403 so->so_mtime = now; 404 so->so_ctime = now; /* Never modified */ 405 so->so_count = 0; 406 407 so->so_family = (short)domain; 408 so->so_type = (short)type; 409 so->so_protocol = (short)protocol; 410 so->so_pushcnt = 0; 411 412 so->so_options = 0; 413 so->so_linger.l_onoff = 0; 414 so->so_linger.l_linger = 0; 415 so->so_sndbuf = 0; 416 so->so_rcvbuf = 0; 417 so->so_sndlowat = 0; 418 so->so_rcvlowat = 0; 419 #ifdef notyet 420 so->so_sndtimeo = 0; 421 so->so_rcvtimeo = 0; 422 #endif /* notyet */ 423 so->so_error = 0; 424 so->so_delayed_error = 0; 425 426 ASSERT(so->so_oobmsg == NULL); 427 so->so_oobcnt = 0; 428 so->so_oobsigcnt = 0; 429 so->so_pgrp = 0; 430 so->so_provinfo = NULL; 431 432 ASSERT(so->so_laddr_sa == NULL && so->so_faddr_sa == NULL); 433 so->so_laddr_len = so->so_faddr_len = 0; 434 so->so_laddr_maxlen = so->so_faddr_maxlen = 0; 435 so->so_eaddr_mp = NULL; 436 so->so_priv = NULL; 437 438 so->so_peercred = NULL; 439 440 ASSERT(so->so_ack_mp == NULL); 441 ASSERT(so->so_conn_ind_head == NULL); 442 ASSERT(so->so_conn_ind_tail == NULL); 443 ASSERT(so->so_ux_bound_vp == NULL); 444 ASSERT(so->so_unbind_mp == NULL); 445 446 vn_reinit(vp); 447 vp->v_vfsp = rootvfs; 448 vp->v_type = VSOCK; 449 vp->v_rdev = so->so_dev; 450 vn_exists(vp); 451 452 return (vp); 453 } 454 455 void 456 sockfree(struct sonode *so) 457 { 458 mblk_t *mp; 459 vnode_t *vp; 460 461 ASSERT(so->so_count == 0); 462 ASSERT(so->so_accessvp); 463 ASSERT(so->so_discon_ind_mp == NULL); 464 465 vp = so->so_accessvp; 466 VN_RELE(vp); 467 468 /* 469 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely 470 * indirect them. It also uses so_accessvp as a validity test. 
471 */ 472 mutex_enter(&so->so_lock); 473 474 so->so_accessvp = NULL; 475 476 if (so->so_laddr_sa) { 477 ASSERT((caddr_t)so->so_faddr_sa == 478 (caddr_t)so->so_laddr_sa + so->so_laddr_maxlen); 479 ASSERT(so->so_faddr_maxlen == so->so_laddr_maxlen); 480 so->so_state &= ~(SS_LADDR_VALID | SS_FADDR_VALID); 481 kmem_free(so->so_laddr_sa, so->so_laddr_maxlen * 2); 482 so->so_laddr_sa = NULL; 483 so->so_laddr_len = so->so_laddr_maxlen = 0; 484 so->so_faddr_sa = NULL; 485 so->so_faddr_len = so->so_faddr_maxlen = 0; 486 } 487 488 mutex_exit(&so->so_lock); 489 490 if ((mp = so->so_eaddr_mp) != NULL) { 491 freemsg(mp); 492 so->so_eaddr_mp = NULL; 493 so->so_delayed_error = 0; 494 } 495 if ((mp = so->so_ack_mp) != NULL) { 496 freemsg(mp); 497 so->so_ack_mp = NULL; 498 } 499 if ((mp = so->so_conn_ind_head) != NULL) { 500 mblk_t *mp1; 501 502 while (mp) { 503 mp1 = mp->b_next; 504 mp->b_next = NULL; 505 freemsg(mp); 506 mp = mp1; 507 } 508 so->so_conn_ind_head = so->so_conn_ind_tail = NULL; 509 so->so_state &= ~SS_HASCONNIND; 510 } 511 #ifdef DEBUG 512 mutex_enter(&so->so_lock); 513 ASSERT(so_verify_oobstate(so)); 514 mutex_exit(&so->so_lock); 515 #endif /* DEBUG */ 516 if ((mp = so->so_oobmsg) != NULL) { 517 freemsg(mp); 518 so->so_oobmsg = NULL; 519 so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA); 520 } 521 522 if ((mp = so->so_nl7c_rcv_mp) != NULL) { 523 so->so_nl7c_rcv_mp = NULL; 524 freemsg(mp); 525 } 526 so->so_nl7c_rcv_rval = 0; 527 if (so->so_nl7c_uri != NULL) { 528 nl7c_urifree(so); 529 /* urifree() cleared nl7c_uri */ 530 } 531 if (so->so_nl7c_flags) { 532 so->so_nl7c_flags = 0; 533 } 534 535 if (so->so_direct != NULL) { 536 sodirect_t *sodp = so->so_direct; 537 538 ASSERT(sodp->sod_uioafh == NULL); 539 540 so->so_direct = NULL; 541 kmem_cache_free(socktpi_sod_cache, sodp); 542 } 543 544 ASSERT(so->so_ux_bound_vp == NULL); 545 if ((mp = so->so_unbind_mp) != NULL) { 546 freemsg(mp); 547 so->so_unbind_mp = NULL; 548 } 549 vn_invalid(SOTOV(so)); 550 551 if (so->so_peercred != NULL) 552 crfree(so->so_peercred); 553 554 kmem_cache_free(so->so_cache, so->so_obj); 555 } 556 557 /* 558 * Update the accessed, updated, or changed times in an sonode 559 * with the current time. 560 * 561 * Note that both SunOS 4.X and 4.4BSD sockets do not present reasonable 562 * attributes in a fstat call. (They return the current time and 0 for 563 * all timestamps, respectively.) We maintain the current timestamps 564 * here primarily so that should sockmod be popped the resulting 565 * file descriptor will behave like a stream w.r.t. the timestamps. 
566 */ 567 void 568 so_update_attrs(struct sonode *so, int flag) 569 { 570 time_t now = gethrestime_sec(); 571 572 mutex_enter(&so->so_lock); 573 so->so_flag |= flag; 574 if (flag & SOACC) 575 so->so_atime = now; 576 if (flag & SOMOD) 577 so->so_mtime = now; 578 mutex_exit(&so->so_lock); 579 } 580 581 /*ARGSUSED*/ 582 static int 583 socktpi_constructor(void *buf, void *cdrarg, int kmflags) 584 { 585 struct sonode *so = buf; 586 struct vnode *vp; 587 588 vp = so->so_vnode = vn_alloc(kmflags); 589 if (vp == NULL) { 590 return (-1); 591 } 592 vn_setops(vp, socktpi_vnodeops); 593 vp->v_data = so; 594 595 so->so_direct = NULL; 596 597 so->so_nl7c_flags = 0; 598 so->so_nl7c_uri = NULL; 599 so->so_nl7c_rcv_mp = NULL; 600 601 so->so_oobmsg = NULL; 602 so->so_ack_mp = NULL; 603 so->so_conn_ind_head = NULL; 604 so->so_conn_ind_tail = NULL; 605 so->so_discon_ind_mp = NULL; 606 so->so_ux_bound_vp = NULL; 607 so->so_unbind_mp = NULL; 608 so->so_accessvp = NULL; 609 so->so_laddr_sa = NULL; 610 so->so_faddr_sa = NULL; 611 so->so_ops = &sotpi_sonodeops; 612 613 mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL); 614 mutex_init(&so->so_plumb_lock, NULL, MUTEX_DEFAULT, NULL); 615 cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL); 616 cv_init(&so->so_ack_cv, NULL, CV_DEFAULT, NULL); 617 cv_init(&so->so_connind_cv, NULL, CV_DEFAULT, NULL); 618 cv_init(&so->so_want_cv, NULL, CV_DEFAULT, NULL); 619 620 return (0); 621 } 622 623 /*ARGSUSED1*/ 624 static void 625 socktpi_destructor(void *buf, void *cdrarg) 626 { 627 struct sonode *so = buf; 628 struct vnode *vp = SOTOV(so); 629 630 ASSERT(so->so_direct == NULL); 631 632 ASSERT(so->so_nl7c_flags == 0); 633 ASSERT(so->so_nl7c_uri == NULL); 634 ASSERT(so->so_nl7c_rcv_mp == NULL); 635 636 ASSERT(so->so_oobmsg == NULL); 637 ASSERT(so->so_ack_mp == NULL); 638 ASSERT(so->so_conn_ind_head == NULL); 639 ASSERT(so->so_conn_ind_tail == NULL); 640 ASSERT(so->so_discon_ind_mp == NULL); 641 ASSERT(so->so_ux_bound_vp == NULL); 642 ASSERT(so->so_unbind_mp == NULL); 643 ASSERT(so->so_ops == &sotpi_sonodeops); 644 645 ASSERT(vn_matchops(vp, socktpi_vnodeops)); 646 ASSERT(vp->v_data == so); 647 648 vn_free(vp); 649 650 mutex_destroy(&so->so_lock); 651 mutex_destroy(&so->so_plumb_lock); 652 cv_destroy(&so->so_state_cv); 653 cv_destroy(&so->so_ack_cv); 654 cv_destroy(&so->so_connind_cv); 655 cv_destroy(&so->so_want_cv); 656 } 657 658 static int 659 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags) 660 { 661 int retval; 662 663 if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) { 664 struct sonode *so = (struct sonode *)buf; 665 666 mutex_enter(&socklist.sl_lock); 667 668 so->so_next = socklist.sl_list; 669 so->so_prev = NULL; 670 if (so->so_next != NULL) 671 so->so_next->so_prev = so; 672 socklist.sl_list = so; 673 674 mutex_exit(&socklist.sl_lock); 675 676 } 677 return (retval); 678 } 679 680 static void 681 socktpi_unix_destructor(void *buf, void *cdrarg) 682 { 683 struct sonode *so = (struct sonode *)buf; 684 685 mutex_enter(&socklist.sl_lock); 686 687 if (so->so_next != NULL) 688 so->so_next->so_prev = so->so_prev; 689 if (so->so_prev != NULL) 690 so->so_prev->so_next = so->so_next; 691 else 692 socklist.sl_list = so->so_next; 693 694 mutex_exit(&socklist.sl_lock); 695 696 socktpi_destructor(buf, cdrarg); 697 } 698 699 /* 700 * Init function called when sockfs is loaded. 
701 */ 702 int 703 sockinit(int fstype, char *name) 704 { 705 static const fs_operation_def_t sock_vfsops_template[] = { 706 NULL, NULL 707 }; 708 int error; 709 major_t dev; 710 char *err_str; 711 712 error = vfs_setfsops(fstype, sock_vfsops_template, NULL); 713 if (error != 0) { 714 zcmn_err(GLOBAL_ZONEID, CE_WARN, 715 "sockinit: bad vfs ops template"); 716 return (error); 717 } 718 719 error = vn_make_ops(name, socktpi_vnodeops_template, &socktpi_vnodeops); 720 if (error != 0) { 721 err_str = "sockinit: bad sock vnode ops template"; 722 /* vn_make_ops() does not reset socktpi_vnodeops on failure. */ 723 socktpi_vnodeops = NULL; 724 goto failure; 725 } 726 727 error = sosctp_init(); 728 if (error != 0) { 729 err_str = NULL; 730 goto failure; 731 } 732 733 error = sosdp_init(); 734 if (error != 0) { 735 err_str = NULL; 736 goto failure; 737 } 738 739 error = sostr_init(); 740 if (error != 0) { 741 err_str = NULL; 742 goto failure; 743 } 744 745 /* 746 * Create sonode caches. We create a special one for AF_UNIX so 747 * that we can track them for netstat(1m). 748 */ 749 socktpi_cache = kmem_cache_create("socktpi_cache", 750 sizeof (struct sonode), 0, socktpi_constructor, 751 socktpi_destructor, NULL, NULL, NULL, 0); 752 753 socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache", 754 sizeof (struct sonode), 0, socktpi_unix_constructor, 755 socktpi_unix_destructor, NULL, NULL, NULL, 0); 756 757 /* 758 * Build initial list mapping socket parameters to vnode. 759 */ 760 rw_init(&splist_lock, NULL, RW_DEFAULT, NULL); 761 762 /* 763 * If sockets are needed before init runs /sbin/soconfig 764 * it is possible to preload the sockparams list here using 765 * calls like: 766 * sockconfig(1,2,3, "/dev/tcp", 0); 767 */ 768 769 /* 770 * Create a unique dev_t for use in so_fsid. 771 */ 772 773 if ((dev = getudev()) == (major_t)-1) 774 dev = 0; 775 sockdev = makedevice(dev, 0); 776 777 mutex_init(&socklist.sl_lock, NULL, MUTEX_DEFAULT, NULL); 778 sendfile_init(); 779 if (!modrootloaded) { 780 sockfs_defer_nl7c_init = 1; 781 } else { 782 nl7c_init(); 783 } 784 785 return (0); 786 787 failure: 788 (void) vfs_freevfsops_by_type(fstype); 789 if (socktpi_vnodeops != NULL) 790 vn_freevnodeops(socktpi_vnodeops); 791 if (err_str != NULL) 792 zcmn_err(GLOBAL_ZONEID, CE_WARN, err_str); 793 return (error); 794 } 795 796 /* 797 * Caller must hold the mutex. Used to set SOLOCKED. 798 */ 799 void 800 so_lock_single(struct sonode *so) 801 { 802 ASSERT(MUTEX_HELD(&so->so_lock)); 803 804 while (so->so_flag & (SOLOCKED | SOASYNC_UNBIND)) { 805 so->so_flag |= SOWANT; 806 cv_wait_stop(&so->so_want_cv, &so->so_lock, 807 SO_LOCK_WAKEUP_TIME); 808 } 809 so->so_flag |= SOLOCKED; 810 } 811 812 /* 813 * Caller must hold the mutex and pass in SOLOCKED or SOASYNC_UNBIND. 814 * Used to clear SOLOCKED or SOASYNC_UNBIND. 815 */ 816 void 817 so_unlock_single(struct sonode *so, int flag) 818 { 819 ASSERT(MUTEX_HELD(&so->so_lock)); 820 ASSERT(flag & (SOLOCKED|SOASYNC_UNBIND)); 821 ASSERT((flag & ~(SOLOCKED|SOASYNC_UNBIND)) == 0); 822 ASSERT(so->so_flag & flag); 823 824 /* 825 * Process the T_DISCON_IND on so_discon_ind_mp. 826 * 827 * Call to so_drain_discon_ind will result in so_lock 828 * being dropped and re-acquired later. 829 */ 830 if (so->so_discon_ind_mp != NULL) 831 so_drain_discon_ind(so); 832 833 if (so->so_flag & SOWANT) 834 cv_broadcast(&so->so_want_cv); 835 so->so_flag &= ~(SOWANT|flag); 836 } 837 838 /* 839 * Caller must hold the mutex. Used to set SOREADLOCKED. 
840 * If the caller wants nonblocking behavior it should set fmode. 841 */ 842 int 843 so_lock_read(struct sonode *so, int fmode) 844 { 845 ASSERT(MUTEX_HELD(&so->so_lock)); 846 847 while (so->so_flag & SOREADLOCKED) { 848 if (fmode & (FNDELAY|FNONBLOCK)) 849 return (EWOULDBLOCK); 850 so->so_flag |= SOWANT; 851 cv_wait_stop(&so->so_want_cv, &so->so_lock, 852 SO_LOCK_WAKEUP_TIME); 853 } 854 so->so_flag |= SOREADLOCKED; 855 return (0); 856 } 857 858 /* 859 * Like so_lock_read above but allows signals. 860 */ 861 int 862 so_lock_read_intr(struct sonode *so, int fmode) 863 { 864 ASSERT(MUTEX_HELD(&so->so_lock)); 865 866 while (so->so_flag & SOREADLOCKED) { 867 if (fmode & (FNDELAY|FNONBLOCK)) 868 return (EWOULDBLOCK); 869 so->so_flag |= SOWANT; 870 if (!cv_wait_sig(&so->so_want_cv, &so->so_lock)) 871 return (EINTR); 872 } 873 so->so_flag |= SOREADLOCKED; 874 return (0); 875 } 876 877 /* 878 * Caller must hold the mutex. Used to clear SOREADLOCKED, 879 * set in so_lock_read() or so_lock_read_intr(). 880 */ 881 void 882 so_unlock_read(struct sonode *so) 883 { 884 ASSERT(MUTEX_HELD(&so->so_lock)); 885 ASSERT(so->so_flag & SOREADLOCKED); 886 887 if (so->so_flag & SOWANT) 888 cv_broadcast(&so->so_want_cv); 889 so->so_flag &= ~(SOWANT|SOREADLOCKED); 890 } 891 892 /* 893 * Verify that the specified offset falls within the mblk and 894 * that the resulting pointer is aligned. 895 * Returns NULL if not. 896 */ 897 void * 898 sogetoff(mblk_t *mp, t_uscalar_t offset, 899 t_uscalar_t length, uint_t align_size) 900 { 901 uintptr_t ptr1, ptr2; 902 903 ASSERT(mp && mp->b_wptr >= mp->b_rptr); 904 ptr1 = (uintptr_t)mp->b_rptr + offset; 905 ptr2 = (uintptr_t)ptr1 + length; 906 if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) { 907 eprintline(0); 908 return (NULL); 909 } 910 if ((ptr1 & (align_size - 1)) != 0) { 911 eprintline(0); 912 return (NULL); 913 } 914 return ((void *)ptr1); 915 } 916 917 /* 918 * Return the AF_UNIX underlying filesystem vnode matching a given name. 919 * Makes sure the sending and the destination sonodes are compatible. 920 * The vnode is returned held. 921 * 922 * The underlying filesystem VSOCK vnode has a v_stream pointer that 923 * references the actual stream head (hence indirectly the actual sonode). 924 */ 925 static int 926 so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess, 927 vnode_t **vpp) 928 { 929 vnode_t *vp; /* Underlying filesystem vnode */ 930 vnode_t *rvp; /* real vnode */ 931 vnode_t *svp; /* sockfs vnode */ 932 struct sonode *so2; 933 int error; 934 935 dprintso(so, 1, ("so_ux_lookup(%p) name <%s>\n", (void *)so, 936 soun->sun_path)); 937 938 error = lookupname(soun->sun_path, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); 939 if (error) { 940 eprintsoline(so, error); 941 return (error); 942 } 943 944 /* 945 * Traverse lofs mounts get the real vnode 946 */ 947 if (VOP_REALVP(vp, &rvp, NULL) == 0) { 948 VN_HOLD(rvp); /* hold the real vnode */ 949 VN_RELE(vp); /* release hold from lookup */ 950 vp = rvp; 951 } 952 953 if (vp->v_type != VSOCK) { 954 error = ENOTSOCK; 955 eprintsoline(so, error); 956 goto done2; 957 } 958 959 if (checkaccess) { 960 /* 961 * Check that we have permissions to access the destination 962 * vnode. This check is not done in BSD but it is required 963 * by X/Open. 964 */ 965 if (error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL)) { 966 eprintsoline(so, error); 967 goto done2; 968 } 969 } 970 971 /* 972 * Check if the remote socket has been closed. 
973 * 974 * Synchronize with vn_rele_stream by holding v_lock while traversing 975 * v_stream->sd_vnode. 976 */ 977 mutex_enter(&vp->v_lock); 978 if (vp->v_stream == NULL) { 979 mutex_exit(&vp->v_lock); 980 if (so->so_type == SOCK_DGRAM) 981 error = EDESTADDRREQ; 982 else 983 error = ECONNREFUSED; 984 985 eprintsoline(so, error); 986 goto done2; 987 } 988 ASSERT(vp->v_stream->sd_vnode); 989 svp = vp->v_stream->sd_vnode; 990 /* 991 * holding v_lock on underlying filesystem vnode and acquiring 992 * it on sockfs vnode. Assumes that no code ever attempts to 993 * acquire these locks in the reverse order. 994 */ 995 VN_HOLD(svp); 996 mutex_exit(&vp->v_lock); 997 998 if (svp->v_type != VSOCK) { 999 error = ENOTSOCK; 1000 eprintsoline(so, error); 1001 goto done; 1002 } 1003 1004 so2 = VTOSO(svp); 1005 1006 if (so->so_type != so2->so_type) { 1007 error = EPROTOTYPE; 1008 eprintsoline(so, error); 1009 goto done; 1010 } 1011 1012 VN_RELE(svp); 1013 *vpp = vp; 1014 return (0); 1015 1016 done: 1017 VN_RELE(svp); 1018 done2: 1019 VN_RELE(vp); 1020 return (error); 1021 } 1022 1023 /* 1024 * Verify peer address for connect and sendto/sendmsg. 1025 * Since sendto/sendmsg would not get synchronous errors from the transport 1026 * provider we have to do these ugly checks in the socket layer to 1027 * preserve compatibility with SunOS 4.X. 1028 */ 1029 int 1030 so_addr_verify(struct sonode *so, const struct sockaddr *name, 1031 socklen_t namelen) 1032 { 1033 int family; 1034 1035 dprintso(so, 1, ("so_addr_verify(%p, %p, %d)\n", 1036 (void *)so, (void *)name, namelen)); 1037 1038 ASSERT(name != NULL); 1039 1040 family = so->so_family; 1041 switch (family) { 1042 case AF_INET: 1043 if (name->sa_family != family) { 1044 eprintsoline(so, EAFNOSUPPORT); 1045 return (EAFNOSUPPORT); 1046 } 1047 if (namelen != (socklen_t)sizeof (struct sockaddr_in)) { 1048 eprintsoline(so, EINVAL); 1049 return (EINVAL); 1050 } 1051 break; 1052 case AF_INET6: { 1053 #ifdef DEBUG 1054 struct sockaddr_in6 *sin6; 1055 #endif /* DEBUG */ 1056 1057 if (name->sa_family != family) { 1058 eprintsoline(so, EAFNOSUPPORT); 1059 return (EAFNOSUPPORT); 1060 } 1061 if (namelen != (socklen_t)sizeof (struct sockaddr_in6)) { 1062 eprintsoline(so, EINVAL); 1063 return (EINVAL); 1064 } 1065 #ifdef DEBUG 1066 /* Verify that apps don't forget to clear sin6_scope_id etc */ 1067 sin6 = (struct sockaddr_in6 *)name; 1068 if (sin6->sin6_scope_id != 0 && 1069 !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) { 1070 zcmn_err(getzoneid(), CE_WARN, 1071 "connect/send* with uninitialized sin6_scope_id " 1072 "(%d) on socket. Pid = %d\n", 1073 (int)sin6->sin6_scope_id, (int)curproc->p_pid); 1074 } 1075 #endif /* DEBUG */ 1076 break; 1077 } 1078 case AF_UNIX: 1079 if (so->so_state & SS_FADDR_NOXLATE) { 1080 return (0); 1081 } 1082 if (namelen < (socklen_t)sizeof (short)) { 1083 eprintsoline(so, ENOENT); 1084 return (ENOENT); 1085 } 1086 if (name->sa_family != family) { 1087 eprintsoline(so, EAFNOSUPPORT); 1088 return (EAFNOSUPPORT); 1089 } 1090 /* MAXPATHLEN + soun_family + nul termination */ 1091 if (namelen > (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) { 1092 eprintsoline(so, ENAMETOOLONG); 1093 return (ENAMETOOLONG); 1094 } 1095 1096 break; 1097 1098 default: 1099 /* 1100 * Default is don't do any length or sa_family check 1101 * to allow non-sockaddr style addresses. 1102 */ 1103 break; 1104 } 1105 1106 return (0); 1107 } 1108 1109 1110 /* 1111 * Translate an AF_UNIX sockaddr_un to the transport internal name. 1112 * Assumes caller has called so_addr_verify first. 
1113 */ 1114 /*ARGSUSED*/ 1115 int 1116 so_ux_addr_xlate(struct sonode *so, struct sockaddr *name, 1117 socklen_t namelen, int checkaccess, 1118 void **addrp, socklen_t *addrlenp) 1119 { 1120 int error; 1121 struct sockaddr_un *soun; 1122 vnode_t *vp; 1123 void *addr; 1124 socklen_t addrlen; 1125 1126 dprintso(so, 1, ("so_ux_addr_xlate(%p, %p, %d, %d)\n", 1127 (void *)so, (void *)name, namelen, checkaccess)); 1128 1129 ASSERT(name != NULL); 1130 ASSERT(so->so_family == AF_UNIX); 1131 ASSERT(!(so->so_state & SS_FADDR_NOXLATE)); 1132 ASSERT(namelen >= (socklen_t)sizeof (short)); 1133 ASSERT(name->sa_family == AF_UNIX); 1134 soun = (struct sockaddr_un *)name; 1135 /* 1136 * Lookup vnode for the specified path name and verify that 1137 * it is a socket. 1138 */ 1139 error = so_ux_lookup(so, soun, checkaccess, &vp); 1140 if (error) { 1141 eprintsoline(so, error); 1142 return (error); 1143 } 1144 /* 1145 * Use the address of the peer vnode as the address to send 1146 * to. We release the peer vnode here. In case it has been 1147 * closed by the time the T_CONN_REQ or T_UNIDATA_REQ reaches the 1148 * transport the message will get an error or be dropped. 1149 */ 1150 so->so_ux_faddr.soua_vp = vp; 1151 so->so_ux_faddr.soua_magic = SOU_MAGIC_EXPLICIT; 1152 addr = &so->so_ux_faddr; 1153 addrlen = (socklen_t)sizeof (so->so_ux_faddr); 1154 dprintso(so, 1, ("ux_xlate UNIX: addrlen %d, vp %p\n", 1155 addrlen, (void *)vp)); 1156 VN_RELE(vp); 1157 *addrp = addr; 1158 *addrlenp = (socklen_t)addrlen; 1159 return (0); 1160 } 1161 1162 /* 1163 * Esballoc free function for messages that contain SO_FILEP option. 1164 * Decrement the reference count on the file pointers using closef. 1165 */ 1166 void 1167 fdbuf_free(struct fdbuf *fdbuf) 1168 { 1169 int i; 1170 struct file *fp; 1171 1172 dprint(1, ("fdbuf_free: %d fds\n", fdbuf->fd_numfd)); 1173 for (i = 0; i < fdbuf->fd_numfd; i++) { 1174 /* 1175 * We need pointer size alignment for fd_fds. On a LP64 1176 * kernel, the required alignment is 8 bytes while 1177 * the option headers and values are only 4 bytes 1178 * aligned. So its safer to do a bcopy compared to 1179 * assigning fdbuf->fd_fds[i] to fp. 1180 */ 1181 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp)); 1182 dprint(1, ("fdbuf_free: [%d] = %p\n", i, (void *)fp)); 1183 (void) closef(fp); 1184 } 1185 if (fdbuf->fd_ebuf != NULL) 1186 kmem_free(fdbuf->fd_ebuf, fdbuf->fd_ebuflen); 1187 kmem_free(fdbuf, fdbuf->fd_size); 1188 } 1189 1190 /* 1191 * Allocate an esballoc'ed message for AF_UNIX file descriptor passing. 1192 * Waits if memory is not available. 1193 */ 1194 mblk_t * 1195 fdbuf_allocmsg(int size, struct fdbuf *fdbuf) 1196 { 1197 uchar_t *buf; 1198 mblk_t *mp; 1199 1200 dprint(1, ("fdbuf_allocmsg: size %d, %d fds\n", size, fdbuf->fd_numfd)); 1201 buf = kmem_alloc(size, KM_SLEEP); 1202 fdbuf->fd_ebuf = (caddr_t)buf; 1203 fdbuf->fd_ebuflen = size; 1204 fdbuf->fd_frtn.free_func = fdbuf_free; 1205 fdbuf->fd_frtn.free_arg = (caddr_t)fdbuf; 1206 1207 mp = esballoc_wait(buf, size, BPRI_MED, &fdbuf->fd_frtn); 1208 mp->b_datap->db_type = M_PROTO; 1209 return (mp); 1210 } 1211 1212 /* 1213 * Extract file descriptors from a fdbuf. 1214 * Return list in rights/rightslen. 
1215 */ 1216 /*ARGSUSED*/ 1217 static int 1218 fdbuf_extract(struct fdbuf *fdbuf, void *rights, int rightslen) 1219 { 1220 int i, fd; 1221 int *rp; 1222 struct file *fp; 1223 int numfd; 1224 1225 dprint(1, ("fdbuf_extract: %d fds, len %d\n", 1226 fdbuf->fd_numfd, rightslen)); 1227 1228 numfd = fdbuf->fd_numfd; 1229 ASSERT(rightslen == numfd * (int)sizeof (int)); 1230 1231 /* 1232 * Allocate a file descriptor and increment the f_count. 1233 * The latter is needed since we always call fdbuf_free 1234 * which performs a closef. 1235 */ 1236 rp = (int *)rights; 1237 for (i = 0; i < numfd; i++) { 1238 if ((fd = ufalloc(0)) == -1) 1239 goto cleanup; 1240 /* 1241 * We need pointer size alignment for fd_fds. On a LP64 1242 * kernel, the required alignment is 8 bytes while 1243 * the option headers and values are only 4 bytes 1244 * aligned. So its safer to do a bcopy compared to 1245 * assigning fdbuf->fd_fds[i] to fp. 1246 */ 1247 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp)); 1248 mutex_enter(&fp->f_tlock); 1249 fp->f_count++; 1250 mutex_exit(&fp->f_tlock); 1251 setf(fd, fp); 1252 *rp++ = fd; 1253 if (audit_active) 1254 audit_fdrecv(fd, fp); 1255 dprint(1, ("fdbuf_extract: [%d] = %d, %p refcnt %d\n", 1256 i, fd, (void *)fp, fp->f_count)); 1257 } 1258 return (0); 1259 1260 cleanup: 1261 /* 1262 * Undo whatever partial work the loop above has done. 1263 */ 1264 { 1265 int j; 1266 1267 rp = (int *)rights; 1268 for (j = 0; j < i; j++) { 1269 dprint(0, 1270 ("fdbuf_extract: cleanup[%d] = %d\n", j, *rp)); 1271 (void) closeandsetf(*rp++, NULL); 1272 } 1273 } 1274 1275 return (EMFILE); 1276 } 1277 1278 /* 1279 * Insert file descriptors into an fdbuf. 1280 * Returns a kmem_alloc'ed fdbuf. The fdbuf should be freed 1281 * by calling fdbuf_free(). 1282 */ 1283 int 1284 fdbuf_create(void *rights, int rightslen, struct fdbuf **fdbufp) 1285 { 1286 int numfd, i; 1287 int *fds; 1288 struct file *fp; 1289 struct fdbuf *fdbuf; 1290 int fdbufsize; 1291 1292 dprint(1, ("fdbuf_create: len %d\n", rightslen)); 1293 1294 numfd = rightslen / (int)sizeof (int); 1295 1296 fdbufsize = (int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *)); 1297 fdbuf = kmem_alloc(fdbufsize, KM_SLEEP); 1298 fdbuf->fd_size = fdbufsize; 1299 fdbuf->fd_numfd = 0; 1300 fdbuf->fd_ebuf = NULL; 1301 fdbuf->fd_ebuflen = 0; 1302 fds = (int *)rights; 1303 for (i = 0; i < numfd; i++) { 1304 if ((fp = getf(fds[i])) == NULL) { 1305 fdbuf_free(fdbuf); 1306 return (EBADF); 1307 } 1308 dprint(1, ("fdbuf_create: [%d] = %d, %p refcnt %d\n", 1309 i, fds[i], (void *)fp, fp->f_count)); 1310 mutex_enter(&fp->f_tlock); 1311 fp->f_count++; 1312 mutex_exit(&fp->f_tlock); 1313 /* 1314 * The maximum alignment for fdbuf (or any option header 1315 * and its value) it 4 bytes. On a LP64 kernel, the alignment 1316 * is not sufficient for pointers (fd_fds in this case). Since 1317 * we just did a kmem_alloc (we get a double word alignment), 1318 * we don't need to do anything on the send side (we loose 1319 * the double word alignment because fdbuf goes after an 1320 * option header (eg T_unitdata_req) which is only 4 byte 1321 * aligned). We take care of this when we extract the file 1322 * descriptor in fdbuf_extract or fdbuf_free. 
1323 */ 1324 fdbuf->fd_fds[i] = fp; 1325 fdbuf->fd_numfd++; 1326 releasef(fds[i]); 1327 if (audit_active) 1328 audit_fdsend(fds[i], fp, 0); 1329 } 1330 *fdbufp = fdbuf; 1331 return (0); 1332 } 1333 1334 static int 1335 fdbuf_optlen(int rightslen) 1336 { 1337 int numfd; 1338 1339 numfd = rightslen / (int)sizeof (int); 1340 1341 return ((int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *))); 1342 } 1343 1344 static t_uscalar_t 1345 fdbuf_cmsglen(int fdbuflen) 1346 { 1347 return (t_uscalar_t)((fdbuflen - FDBUF_HDRSIZE) / 1348 (int)sizeof (struct file *) * (int)sizeof (int)); 1349 } 1350 1351 1352 /* 1353 * Return non-zero if the mblk and fdbuf are consistent. 1354 */ 1355 static int 1356 fdbuf_verify(mblk_t *mp, struct fdbuf *fdbuf, int fdbuflen) 1357 { 1358 if (fdbuflen >= FDBUF_HDRSIZE && 1359 fdbuflen == fdbuf->fd_size) { 1360 frtn_t *frp = mp->b_datap->db_frtnp; 1361 /* 1362 * Check that the SO_FILEP portion of the 1363 * message has not been modified by 1364 * the loopback transport. The sending sockfs generates 1365 * a message that is esballoc'ed with the free function 1366 * being fdbuf_free() and where free_arg contains the 1367 * identical information as the SO_FILEP content. 1368 * 1369 * If any of these constraints are not satisfied we 1370 * silently ignore the option. 1371 */ 1372 ASSERT(mp); 1373 if (frp != NULL && 1374 frp->free_func == fdbuf_free && 1375 frp->free_arg != NULL && 1376 bcmp(frp->free_arg, fdbuf, fdbuflen) == 0) { 1377 dprint(1, ("fdbuf_verify: fdbuf %p len %d\n", 1378 (void *)fdbuf, fdbuflen)); 1379 return (1); 1380 } else { 1381 zcmn_err(getzoneid(), CE_WARN, 1382 "sockfs: mismatched fdbuf content (%p)", 1383 (void *)mp); 1384 return (0); 1385 } 1386 } else { 1387 zcmn_err(getzoneid(), CE_WARN, 1388 "sockfs: mismatched fdbuf len %d, %d\n", 1389 fdbuflen, fdbuf->fd_size); 1390 return (0); 1391 } 1392 } 1393 1394 /* 1395 * When the file descriptors returned by sorecvmsg can not be passed 1396 * to the application this routine will cleanup the references on 1397 * the files. Start at startoff bytes into the buffer. 1398 */ 1399 static void 1400 close_fds(void *fdbuf, int fdbuflen, int startoff) 1401 { 1402 int *fds = (int *)fdbuf; 1403 int numfd = fdbuflen / (int)sizeof (int); 1404 int i; 1405 1406 dprint(1, ("close_fds(%p, %d, %d)\n", fdbuf, fdbuflen, startoff)); 1407 1408 for (i = 0; i < numfd; i++) { 1409 if (startoff < 0) 1410 startoff = 0; 1411 if (startoff < (int)sizeof (int)) { 1412 /* 1413 * This file descriptor is partially or fully after 1414 * the offset 1415 */ 1416 dprint(0, 1417 ("close_fds: cleanup[%d] = %d\n", i, fds[i])); 1418 (void) closeandsetf(fds[i], NULL); 1419 } 1420 startoff -= (int)sizeof (int); 1421 } 1422 } 1423 1424 /* 1425 * Close all file descriptors contained in the control part starting at 1426 * the startoffset. 1427 */ 1428 void 1429 so_closefds(void *control, t_uscalar_t controllen, int oldflg, 1430 int startoff) 1431 { 1432 struct cmsghdr *cmsg; 1433 1434 if (control == NULL) 1435 return; 1436 1437 if (oldflg) { 1438 close_fds(control, controllen, startoff); 1439 return; 1440 } 1441 /* Scan control part for file descriptors. 
	 */
	for (cmsg = (struct cmsghdr *)control;
	    CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
	    cmsg = CMSG_NEXT(cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS) {
			close_fds(CMSG_CONTENT(cmsg),
			    (int)CMSG_CONTENTLEN(cmsg),
			    startoff - (int)sizeof (struct cmsghdr));
		}
		startoff -= cmsg->cmsg_len;
	}
}

/*
 * Returns a pointer/length for the file descriptors contained
 * in the control buffer. Returns with *fdlenp == -1 if there are no
 * file descriptor options present. This is different from there being
 * a zero-length file descriptor option.
 * Fail if there are multiple SCM_RIGHTS cmsgs.
 */
int
so_getfdopt(void *control, t_uscalar_t controllen, int oldflg,
    void **fdsp, int *fdlenp)
{
	struct cmsghdr *cmsg;
	void *fds;
	int fdlen;

	if (control == NULL) {
		*fdsp = NULL;
		*fdlenp = -1;
		return (0);
	}

	if (oldflg) {
		*fdsp = control;
		if (controllen == 0)
			*fdlenp = -1;
		else
			*fdlenp = controllen;
		dprint(1, ("so_getfdopt: old %d\n", *fdlenp));
		return (0);
	}

	fds = NULL;
	fdlen = 0;

	for (cmsg = (struct cmsghdr *)control;
	    CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
	    cmsg = CMSG_NEXT(cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS) {
			if (fds != NULL)
				return (EINVAL);
			fds = CMSG_CONTENT(cmsg);
			fdlen = (int)CMSG_CONTENTLEN(cmsg);
			dprint(1, ("so_getfdopt: new %lu\n",
			    (size_t)CMSG_CONTENTLEN(cmsg)));
		}
	}
	if (fds == NULL) {
		dprint(1, ("so_getfdopt: NONE\n"));
		*fdlenp = -1;
	} else
		*fdlenp = fdlen;
	*fdsp = fds;
	return (0);
}

/*
 * Return the length of the options including any file descriptor options.
 */
t_uscalar_t
so_optlen(void *control, t_uscalar_t controllen, int oldflg)
{
	struct cmsghdr *cmsg;
	t_uscalar_t optlen = 0;
	t_uscalar_t len;

	if (control == NULL)
		return (0);

	if (oldflg)
		return ((t_uscalar_t)(sizeof (struct T_opthdr) +
		    fdbuf_optlen(controllen)));

	for (cmsg = (struct cmsghdr *)control;
	    CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
	    cmsg = CMSG_NEXT(cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS) {
			len = fdbuf_optlen((int)CMSG_CONTENTLEN(cmsg));
		} else {
			len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
		}
		optlen += (t_uscalar_t)(_TPI_ALIGN_TOPT(len) +
		    sizeof (struct T_opthdr));
	}
	dprint(1, ("so_optlen: controllen %d, flg %d -> optlen %d\n",
	    controllen, oldflg, optlen));
	return (optlen);
}

/*
 * Copy options from control to the mblk. Skip any file descriptor options.
 */
void
so_cmsg2opt(void *control, t_uscalar_t controllen, int oldflg, mblk_t *mp)
{
	struct T_opthdr toh;
	struct cmsghdr *cmsg;

	if (control == NULL)
		return;

	if (oldflg) {
		/* No real options - caller has handled file descriptors */
		return;
	}
	for (cmsg = (struct cmsghdr *)control;
	    CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
	    cmsg = CMSG_NEXT(cmsg)) {
		/*
		 * Note: The caller handles file descriptors prior
		 * to calling this function.
		 */
		t_uscalar_t len;

		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS)
			continue;

		len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
		toh.level = cmsg->cmsg_level;
		toh.name = cmsg->cmsg_type;
		toh.len = len + (t_uscalar_t)sizeof (struct T_opthdr);
		toh.status = 0;

		soappendmsg(mp, &toh, sizeof (toh));
		soappendmsg(mp, CMSG_CONTENT(cmsg), len);
		mp->b_wptr += _TPI_ALIGN_TOPT(len) - len;
		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
	}
}

/*
 * Return the length of the control message derived from the options.
 * Exclude SO_SRCADDR and SO_UNIX_CLOSE options. Include SO_FILEP.
 * When oldflg is set only include SO_FILEP.
 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen
 * allocates the space that so_opt2cmsg fills. If one changes, the other should
 * also be checked for any possible impacts.
 */
t_uscalar_t
so_cmsglen(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg)
{
	t_uscalar_t cmsglen = 0;
	struct T_opthdr *tohp;
	t_uscalar_t len;
	t_uscalar_t last_roundup = 0;

	ASSERT(__TPI_TOPT_ISALIGNED(opt));

	for (tohp = (struct T_opthdr *)opt;
	    tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
	    tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
		dprint(1, ("so_cmsglen: level 0x%x, name %d, len %d\n",
		    tohp->level, tohp->name, tohp->len));
		if (tohp->level == SOL_SOCKET &&
		    (tohp->name == SO_SRCADDR ||
		    tohp->name == SO_UNIX_CLOSE)) {
			continue;
		}
		if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) {
			struct fdbuf *fdbuf;
			int fdbuflen;

			fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp);
			fdbuflen = (int)_TPI_TOPT_DATALEN(tohp);

			if (!fdbuf_verify(mp, fdbuf, fdbuflen))
				continue;
			if (oldflg) {
				cmsglen += fdbuf_cmsglen(fdbuflen);
				continue;
			}
			len = fdbuf_cmsglen(fdbuflen);
		} else if (tohp->level == SOL_SOCKET &&
		    tohp->name == SCM_TIMESTAMP) {
			if (oldflg)
				continue;

			if (get_udatamodel() == DATAMODEL_NATIVE) {
				len = sizeof (struct timeval);
			} else {
				len = sizeof (struct timeval32);
			}
		} else {
			if (oldflg)
				continue;
			len = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp);
		}
		/*
		 * Exclude roundup for last option to not set
		 * MSG_CTRUNC when the cmsg fits but the padding doesn't fit.
		 */
		last_roundup = (t_uscalar_t)
		    (ROUNDUP_cmsglen(len + (int)sizeof (struct cmsghdr)) -
		    (len + (int)sizeof (struct cmsghdr)));
		cmsglen += (t_uscalar_t)(len + (int)sizeof (struct cmsghdr)) +
		    last_roundup;
	}
	cmsglen -= last_roundup;
	dprint(1, ("so_cmsglen: optlen %d, flg %d -> cmsglen %d\n",
	    optlen, oldflg, cmsglen));
	return (cmsglen);
}

/*
 * Copy options from options to the control. Convert SO_FILEP to
 * file descriptors.
 * Returns errno or zero.
 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen
 * allocates the space that so_opt2cmsg fills. If one changes, the other should
 * also be checked for any possible impacts.
1667 */ 1668 int 1669 so_opt2cmsg(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg, 1670 void *control, t_uscalar_t controllen) 1671 { 1672 struct T_opthdr *tohp; 1673 struct cmsghdr *cmsg; 1674 struct fdbuf *fdbuf; 1675 int fdbuflen; 1676 int error; 1677 #if defined(DEBUG) || defined(__lint) 1678 struct cmsghdr *cend = (struct cmsghdr *) 1679 (((uint8_t *)control) + ROUNDUP_cmsglen(controllen)); 1680 #endif 1681 cmsg = (struct cmsghdr *)control; 1682 1683 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1684 1685 for (tohp = (struct T_opthdr *)opt; 1686 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1687 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1688 dprint(1, ("so_opt2cmsg: level 0x%x, name %d, len %d\n", 1689 tohp->level, tohp->name, tohp->len)); 1690 1691 if (tohp->level == SOL_SOCKET && 1692 (tohp->name == SO_SRCADDR || 1693 tohp->name == SO_UNIX_CLOSE)) { 1694 continue; 1695 } 1696 ASSERT((uintptr_t)cmsg <= (uintptr_t)control + controllen); 1697 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) { 1698 fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp); 1699 fdbuflen = (int)_TPI_TOPT_DATALEN(tohp); 1700 1701 if (!fdbuf_verify(mp, fdbuf, fdbuflen)) 1702 return (EPROTO); 1703 if (oldflg) { 1704 error = fdbuf_extract(fdbuf, control, 1705 (int)controllen); 1706 if (error != 0) 1707 return (error); 1708 continue; 1709 } else { 1710 int fdlen; 1711 1712 fdlen = (int)fdbuf_cmsglen( 1713 (int)_TPI_TOPT_DATALEN(tohp)); 1714 1715 cmsg->cmsg_level = tohp->level; 1716 cmsg->cmsg_type = SCM_RIGHTS; 1717 cmsg->cmsg_len = (socklen_t)(fdlen + 1718 sizeof (struct cmsghdr)); 1719 1720 error = fdbuf_extract(fdbuf, 1721 CMSG_CONTENT(cmsg), fdlen); 1722 if (error != 0) 1723 return (error); 1724 } 1725 } else if (tohp->level == SOL_SOCKET && 1726 tohp->name == SCM_TIMESTAMP) { 1727 timestruc_t *timestamp; 1728 1729 if (oldflg) 1730 continue; 1731 1732 cmsg->cmsg_level = tohp->level; 1733 cmsg->cmsg_type = tohp->name; 1734 1735 timestamp = 1736 (timestruc_t *)P2ROUNDUP((intptr_t)&tohp[1], 1737 sizeof (intptr_t)); 1738 1739 if (get_udatamodel() == DATAMODEL_NATIVE) { 1740 struct timeval tv; 1741 1742 cmsg->cmsg_len = sizeof (struct timeval) + 1743 sizeof (struct cmsghdr); 1744 tv.tv_sec = timestamp->tv_sec; 1745 tv.tv_usec = timestamp->tv_nsec / 1746 (NANOSEC / MICROSEC); 1747 /* 1748 * on LP64 systems, the struct timeval in 1749 * the destination will not be 8-byte aligned, 1750 * so use bcopy to avoid alignment trouble 1751 */ 1752 bcopy(&tv, CMSG_CONTENT(cmsg), sizeof (tv)); 1753 } else { 1754 struct timeval32 *time32; 1755 1756 cmsg->cmsg_len = sizeof (struct timeval32) + 1757 sizeof (struct cmsghdr); 1758 time32 = (struct timeval32 *)CMSG_CONTENT(cmsg); 1759 time32->tv_sec = (time32_t)timestamp->tv_sec; 1760 time32->tv_usec = 1761 (int32_t)(timestamp->tv_nsec / 1762 (NANOSEC / MICROSEC)); 1763 } 1764 1765 } else { 1766 if (oldflg) 1767 continue; 1768 1769 cmsg->cmsg_level = tohp->level; 1770 cmsg->cmsg_type = tohp->name; 1771 cmsg->cmsg_len = (socklen_t)(_TPI_TOPT_DATALEN(tohp) + 1772 sizeof (struct cmsghdr)); 1773 1774 /* copy content to control data part */ 1775 bcopy(&tohp[1], CMSG_CONTENT(cmsg), 1776 CMSG_CONTENTLEN(cmsg)); 1777 } 1778 /* move to next CMSG structure! */ 1779 cmsg = CMSG_NEXT(cmsg); 1780 } 1781 dprint(1, ("so_opt2cmsg: buf %p len %d; cend %p; final cmsg %p\n", 1782 control, controllen, (void *)cend, (void *)cmsg)); 1783 ASSERT(cmsg <= cend); 1784 return (0); 1785 } 1786 1787 /* 1788 * Extract the SO_SRCADDR option value if present. 
1789 */ 1790 void 1791 so_getopt_srcaddr(void *opt, t_uscalar_t optlen, void **srcp, 1792 t_uscalar_t *srclenp) 1793 { 1794 struct T_opthdr *tohp; 1795 1796 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1797 1798 ASSERT(srcp != NULL && srclenp != NULL); 1799 *srcp = NULL; 1800 *srclenp = 0; 1801 1802 for (tohp = (struct T_opthdr *)opt; 1803 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1804 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1805 dprint(1, ("so_getopt_srcaddr: level 0x%x, name %d, len %d\n", 1806 tohp->level, tohp->name, tohp->len)); 1807 if (tohp->level == SOL_SOCKET && 1808 tohp->name == SO_SRCADDR) { 1809 *srcp = _TPI_TOPT_DATA(tohp); 1810 *srclenp = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp); 1811 } 1812 } 1813 } 1814 1815 /* 1816 * Verify if the SO_UNIX_CLOSE option is present. 1817 */ 1818 int 1819 so_getopt_unix_close(void *opt, t_uscalar_t optlen) 1820 { 1821 struct T_opthdr *tohp; 1822 1823 ASSERT(__TPI_TOPT_ISALIGNED(opt)); 1824 1825 for (tohp = (struct T_opthdr *)opt; 1826 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen); 1827 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) { 1828 dprint(1, 1829 ("so_getopt_unix_close: level 0x%x, name %d, len %d\n", 1830 tohp->level, tohp->name, tohp->len)); 1831 if (tohp->level == SOL_SOCKET && 1832 tohp->name == SO_UNIX_CLOSE) 1833 return (1); 1834 } 1835 return (0); 1836 } 1837 1838 /* 1839 * Allocate an M_PROTO message. 1840 * 1841 * If allocation fails the behavior depends on sleepflg: 1842 * _ALLOC_NOSLEEP fail immediately 1843 * _ALLOC_INTR sleep for memory until a signal is caught 1844 * _ALLOC_SLEEP sleep forever. Don't return NULL. 1845 */ 1846 mblk_t * 1847 soallocproto(size_t size, int sleepflg) 1848 { 1849 mblk_t *mp; 1850 1851 /* Round up size for reuse */ 1852 size = MAX(size, 64); 1853 mp = allocb(size, BPRI_MED); 1854 if (mp == NULL) { 1855 int error; /* Dummy - error not returned to caller */ 1856 1857 switch (sleepflg) { 1858 case _ALLOC_SLEEP: 1859 mp = allocb_wait(size, BPRI_MED, STR_NOSIG, &error); 1860 ASSERT(mp); 1861 break; 1862 case _ALLOC_INTR: 1863 mp = allocb_wait(size, BPRI_MED, 0, &error); 1864 if (mp == NULL) { 1865 /* Caught signal while sleeping for memory */ 1866 eprintline(ENOBUFS); 1867 return (NULL); 1868 } 1869 break; 1870 case _ALLOC_NOSLEEP: 1871 default: 1872 eprintline(ENOBUFS); 1873 return (NULL); 1874 } 1875 } 1876 DB_TYPE(mp) = M_PROTO; 1877 return (mp); 1878 } 1879 1880 /* 1881 * Allocate an M_PROTO message with a single component. 1882 * len is the length of buf. size is the amount to allocate. 1883 * 1884 * buf can be NULL with a non-zero len. 1885 * This results in a bzero'ed chunk being placed the message. 1886 */ 1887 mblk_t * 1888 soallocproto1(const void *buf, ssize_t len, ssize_t size, int sleepflg) 1889 { 1890 mblk_t *mp; 1891 1892 if (size == 0) 1893 size = len; 1894 1895 ASSERT(size >= len); 1896 /* Round up size for reuse */ 1897 size = MAX(size, 64); 1898 mp = soallocproto(size, sleepflg); 1899 if (mp == NULL) 1900 return (NULL); 1901 mp->b_datap->db_type = M_PROTO; 1902 if (len != 0) { 1903 if (buf != NULL) 1904 bcopy(buf, mp->b_wptr, len); 1905 else 1906 bzero(mp->b_wptr, len); 1907 mp->b_wptr += len; 1908 } 1909 return (mp); 1910 } 1911 1912 /* 1913 * Append buf/len to mp. 1914 * The caller has to ensure that there is enough room in the mblk. 1915 * 1916 * buf can be NULL with a non-zero len. 1917 * This results in a bzero'ed chunk being placed the message. 
1918 */ 1919 void 1920 soappendmsg(mblk_t *mp, const void *buf, ssize_t len) 1921 { 1922 ASSERT(mp); 1923 1924 if (len != 0) { 1925 /* Assert for room left */ 1926 ASSERT(mp->b_datap->db_lim - mp->b_wptr >= len); 1927 if (buf != NULL) 1928 bcopy(buf, mp->b_wptr, len); 1929 else 1930 bzero(mp->b_wptr, len); 1931 } 1932 mp->b_wptr += len; 1933 } 1934 1935 /* 1936 * Create a message using two kernel buffers. 1937 * If size is set that will determine the allocation size (e.g. for future 1938 * soappendmsg calls). If size is zero it is derived from the buffer 1939 * lengths. 1940 */ 1941 mblk_t * 1942 soallocproto2(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2, 1943 ssize_t size, int sleepflg) 1944 { 1945 mblk_t *mp; 1946 1947 if (size == 0) 1948 size = len1 + len2; 1949 ASSERT(size >= len1 + len2); 1950 1951 mp = soallocproto1(buf1, len1, size, sleepflg); 1952 if (mp) 1953 soappendmsg(mp, buf2, len2); 1954 return (mp); 1955 } 1956 1957 /* 1958 * Create a message using three kernel buffers. 1959 * If size is set that will determine the allocation size (for future 1960 * soappendmsg calls). If size is zero it is derived from the buffer 1961 * lengths. 1962 */ 1963 mblk_t * 1964 soallocproto3(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2, 1965 const void *buf3, ssize_t len3, ssize_t size, int sleepflg) 1966 { 1967 mblk_t *mp; 1968 1969 if (size == 0) 1970 size = len1 + len2 +len3; 1971 ASSERT(size >= len1 + len2 + len3); 1972 1973 mp = soallocproto1(buf1, len1, size, sleepflg); 1974 if (mp != NULL) { 1975 soappendmsg(mp, buf2, len2); 1976 soappendmsg(mp, buf3, len3); 1977 } 1978 return (mp); 1979 } 1980 1981 #ifdef DEBUG 1982 char * 1983 pr_state(uint_t state, uint_t mode) 1984 { 1985 static char buf[1024]; 1986 1987 buf[0] = 0; 1988 if (state & SS_ISCONNECTED) 1989 (void) strcat(buf, "ISCONNECTED "); 1990 if (state & SS_ISCONNECTING) 1991 (void) strcat(buf, "ISCONNECTING "); 1992 if (state & SS_ISDISCONNECTING) 1993 (void) strcat(buf, "ISDISCONNECTING "); 1994 if (state & SS_CANTSENDMORE) 1995 (void) strcat(buf, "CANTSENDMORE "); 1996 1997 if (state & SS_CANTRCVMORE) 1998 (void) strcat(buf, "CANTRCVMORE "); 1999 if (state & SS_ISBOUND) 2000 (void) strcat(buf, "ISBOUND "); 2001 if (state & SS_NDELAY) 2002 (void) strcat(buf, "NDELAY "); 2003 if (state & SS_NONBLOCK) 2004 (void) strcat(buf, "NONBLOCK "); 2005 2006 if (state & SS_ASYNC) 2007 (void) strcat(buf, "ASYNC "); 2008 if (state & SS_ACCEPTCONN) 2009 (void) strcat(buf, "ACCEPTCONN "); 2010 if (state & SS_HASCONNIND) 2011 (void) strcat(buf, "HASCONNIND "); 2012 if (state & SS_SAVEDEOR) 2013 (void) strcat(buf, "SAVEDEOR "); 2014 2015 if (state & SS_RCVATMARK) 2016 (void) strcat(buf, "RCVATMARK "); 2017 if (state & SS_OOBPEND) 2018 (void) strcat(buf, "OOBPEND "); 2019 if (state & SS_HAVEOOBDATA) 2020 (void) strcat(buf, "HAVEOOBDATA "); 2021 if (state & SS_HADOOBDATA) 2022 (void) strcat(buf, "HADOOBDATA "); 2023 2024 if (state & SS_FADDR_NOXLATE) 2025 (void) strcat(buf, "FADDR_NOXLATE "); 2026 2027 if (mode & SM_PRIV) 2028 (void) strcat(buf, "PRIV "); 2029 if (mode & SM_ATOMIC) 2030 (void) strcat(buf, "ATOMIC "); 2031 if (mode & SM_ADDR) 2032 (void) strcat(buf, "ADDR "); 2033 if (mode & SM_CONNREQUIRED) 2034 (void) strcat(buf, "CONNREQUIRED "); 2035 2036 if (mode & SM_FDPASSING) 2037 (void) strcat(buf, "FDPASSING "); 2038 if (mode & SM_EXDATA) 2039 (void) strcat(buf, "EXDATA "); 2040 if (mode & SM_OPTDATA) 2041 (void) strcat(buf, "OPTDATA "); 2042 if (mode & SM_BYTESTREAM) 2043 (void) strcat(buf, "BYTESTREAM "); 
	return (buf);
}

char *
pr_addr(int family, struct sockaddr *addr, t_uscalar_t addrlen)
{
	static char buf[1024];

	if (addr == NULL || addrlen == 0) {
		(void) sprintf(buf, "(len %d) %p", addrlen, (void *)addr);
		return (buf);
	}
	switch (family) {
	case AF_INET: {
		struct sockaddr_in sin;

		bcopy(addr, &sin, sizeof (sin));

		(void) sprintf(buf, "(len %d) %x/%d",
		    addrlen, ntohl(sin.sin_addr.s_addr), ntohs(sin.sin_port));
		break;
	}
	case AF_INET6: {
		struct sockaddr_in6 sin6;
		uint16_t *piece = (uint16_t *)&sin6.sin6_addr;

		bcopy((char *)addr, (char *)&sin6, sizeof (sin6));
		(void) sprintf(buf, "(len %d) %x:%x:%x:%x:%x:%x:%x:%x/%d",
		    addrlen,
		    ntohs(piece[0]), ntohs(piece[1]),
		    ntohs(piece[2]), ntohs(piece[3]),
		    ntohs(piece[4]), ntohs(piece[5]),
		    ntohs(piece[6]), ntohs(piece[7]),
		    ntohs(sin6.sin6_port));
		break;
	}
	case AF_UNIX: {
		struct sockaddr_un *soun = (struct sockaddr_un *)addr;

		(void) sprintf(buf, "(len %d) %s", addrlen,
		    (soun == NULL) ? "(none)" : soun->sun_path);
		break;
	}
	default:
		(void) sprintf(buf, "(unknown af %d)", family);
		break;
	}
	return (buf);
}

/* The logical equivalence operator (a if-and-only-if b) */
#define	EQUIV(a, b)	(((a) && (b)) || (!(a) && (!(b))))

/*
 * Verify limitations and invariants on oob state.
 * Return 1 if OK, otherwise 0 so that it can be used as
 *	ASSERT(so_verify_oobstate(so));
 */
int
so_verify_oobstate(struct sonode *so)
{
	ASSERT(MUTEX_HELD(&so->so_lock));

	/*
	 * The possible state combinations are:
	 *	0
	 *	SS_OOBPEND
	 *	SS_OOBPEND|SS_HAVEOOBDATA
	 *	SS_OOBPEND|SS_HADOOBDATA
	 *	SS_HADOOBDATA
	 */
	switch (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA)) {
	case 0:
	case SS_OOBPEND:
	case SS_OOBPEND|SS_HAVEOOBDATA:
	case SS_OOBPEND|SS_HADOOBDATA:
	case SS_HADOOBDATA:
		break;
	default:
		printf("Bad oob state 1 (%p): counts %d/%d state %s\n",
		    (void *)so, so->so_oobsigcnt,
		    so->so_oobcnt, pr_state(so->so_state, so->so_mode));
		return (0);
	}

	/* SS_RCVATMARK should only be set when SS_OOBPEND is set */
	if ((so->so_state & (SS_RCVATMARK|SS_OOBPEND)) == SS_RCVATMARK) {
		printf("Bad oob state 2 (%p): counts %d/%d state %s\n",
		    (void *)so, so->so_oobsigcnt,
		    so->so_oobcnt, pr_state(so->so_state, so->so_mode));
		return (0);
	}

	/*
	 * (so_oobsigcnt != 0 or SS_RCVATMARK) iff SS_OOBPEND
	 */
	if (!EQUIV((so->so_oobsigcnt != 0) || (so->so_state & SS_RCVATMARK),
	    so->so_state & SS_OOBPEND)) {
		printf("Bad oob state 3 (%p): counts %d/%d state %s\n",
		    (void *)so, so->so_oobsigcnt,
		    so->so_oobcnt, pr_state(so->so_state, so->so_mode));
		return (0);
	}

	/*
	 * Unless SO_OOBINLINE we have so_oobmsg != NULL iff SS_HAVEOOBDATA
	 */
	if (!(so->so_options & SO_OOBINLINE) &&
	    !EQUIV(so->so_oobmsg != NULL, so->so_state & SS_HAVEOOBDATA)) {
		printf("Bad oob state 4 (%p): counts %d/%d state %s\n",
		    (void *)so, so->so_oobsigcnt,
		    so->so_oobcnt, pr_state(so->so_state, so->so_mode));
		return (0);
	}
	if (so->so_oobsigcnt < so->so_oobcnt) {
		printf("Bad oob state 5 (%p): counts %d/%d state %s\n",
		    (void *)so, so->so_oobsigcnt,
		    so->so_oobcnt, pr_state(so->so_state, so->so_mode));
                return (0);
        }
        return (1);
}
#undef EQUIV

#endif /* DEBUG */

/* initialize sockfs zone specific kstat related items */
void *
sock_kstat_init(zoneid_t zoneid)
{
        kstat_t *ksp;

        ksp = kstat_create_zone("sockfs", 0, "sock_unix_list", "misc",
            KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE|KSTAT_FLAG_VIRTUAL, zoneid);

        if (ksp != NULL) {
                ksp->ks_update = sockfs_update;
                ksp->ks_snapshot = sockfs_snapshot;
                ksp->ks_lock = &socklist.sl_lock;
                ksp->ks_private = (void *)(uintptr_t)zoneid;
                kstat_install(ksp);
        }

        return (ksp);
}

/* tear down sockfs zone specific kstat related items */
/*ARGSUSED*/
void
sock_kstat_fini(zoneid_t zoneid, void *arg)
{
        kstat_t *ksp = (kstat_t *)arg;

        if (ksp != NULL) {
                ASSERT(zoneid == (zoneid_t)(uintptr_t)ksp->ks_private);
                kstat_delete(ksp);
        }
}

/*
 * Zones:
 * Note that nactive is going to be different for each zone.
 * This means we require kstat to call sockfs_update and then sockfs_snapshot
 * for the same zone, or sockfs_snapshot would be taken into a buffer of the
 * wrong size. This is safe, but if the buffer is too small the user will not
 * be given details of all sockets. However, as this kstat has a ks_lock, the
 * kstat driver keeps it locked between the update and the snapshot, so no
 * other process (zone) can currently get in between and cause a wrongly
 * sized buffer to be allocated.
 */
static int
sockfs_update(kstat_t *ksp, int rw)
{
        uint_t nactive = 0;             /* # of active AF_UNIX sockets */
        struct sonode *so;              /* current sonode on socklist */
        zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private;

        ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid());

        if (rw == KSTAT_WRITE) {        /* bounce all writes */
                return (EACCES);
        }

        for (so = socklist.sl_list; so != NULL; so = so->so_next) {
                if (so->so_accessvp != NULL && so->so_zoneid == myzoneid) {
                        nactive++;
                }
        }
        ksp->ks_ndata = nactive;
        ksp->ks_data_size = nactive * sizeof (struct k_sockinfo);

        return (0);
}

static int
sockfs_snapshot(kstat_t *ksp, void *buf, int rw)
{
        int ns;                         /* # of sonodes we've copied */
        struct sonode *so;              /* current sonode on socklist */
        struct k_sockinfo *pksi;        /* where we put sockinfo data */
        t_uscalar_t sn_len;             /* soa_len */
        zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private;

        ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid());

        ksp->ks_snaptime = gethrtime();

        if (rw == KSTAT_WRITE) {        /* bounce all writes */
                return (EACCES);
        }

        /*
         * for each sonode on the socklist, we massage the important
         * info into buf, in k_sockinfo format.
         */
        pksi = (struct k_sockinfo *)buf;
        for (ns = 0, so = socklist.sl_list; so != NULL; so = so->so_next) {
                /* only copy active sonodes belonging to the same zone: */
                if (so->so_accessvp == NULL || so->so_zoneid != myzoneid) {
                        continue;
                }

                /*
                 * If the sonode was activated between the update and the
                 * snapshot, we're done - as this is only a snapshot.
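                 * (The buffer was sized by sockfs_update() for the sonodes
                 * that were active at update time, so anything activated
                 * since then simply does not fit and is left out.)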
                 */
                if ((caddr_t)(pksi) >= (caddr_t)buf + ksp->ks_data_size) {
                        break;
                }

                /* copy important info into buf: */
                pksi->ks_si.si_size = sizeof (struct k_sockinfo);
                pksi->ks_si.si_family = so->so_family;
                pksi->ks_si.si_type = so->so_type;
                pksi->ks_si.si_flag = so->so_flag;
                pksi->ks_si.si_state = so->so_state;
                pksi->ks_si.si_serv_type = so->so_serv_type;
                pksi->ks_si.si_ux_laddr_sou_magic = so->so_ux_laddr.soua_magic;
                pksi->ks_si.si_ux_faddr_sou_magic = so->so_ux_faddr.soua_magic;
                pksi->ks_si.si_laddr_soa_len = so->so_laddr.soa_len;
                pksi->ks_si.si_faddr_soa_len = so->so_faddr.soa_len;
                pksi->ks_si.si_szoneid = so->so_zoneid;

                mutex_enter(&so->so_lock);

                if (so->so_laddr_sa != NULL) {
                        ASSERT(so->so_laddr_sa->sa_data != NULL);
                        sn_len = so->so_laddr_len;
                        ASSERT(sn_len <= sizeof (short) +
                            sizeof (pksi->ks_si.si_laddr_sun_path));

                        pksi->ks_si.si_laddr_family =
                            so->so_laddr_sa->sa_family;
                        if (sn_len != 0) {
                                /* AF_UNIX socket names are NULL terminated */
                                (void) strncpy(pksi->ks_si.si_laddr_sun_path,
                                    so->so_laddr_sa->sa_data,
                                    sizeof (pksi->ks_si.si_laddr_sun_path));
                                sn_len = strlen(pksi->ks_si.si_laddr_sun_path);
                        }
                        pksi->ks_si.si_laddr_sun_path[sn_len] = 0;
                }

                if (so->so_faddr_sa != NULL) {
                        ASSERT(so->so_faddr_sa->sa_data != NULL);
                        sn_len = so->so_faddr_len;
                        ASSERT(sn_len <= sizeof (short) +
                            sizeof (pksi->ks_si.si_faddr_sun_path));

                        pksi->ks_si.si_faddr_family =
                            so->so_faddr_sa->sa_family;
                        if (sn_len != 0) {
                                (void) strncpy(pksi->ks_si.si_faddr_sun_path,
                                    so->so_faddr_sa->sa_data,
                                    sizeof (pksi->ks_si.si_faddr_sun_path));
                                sn_len = strlen(pksi->ks_si.si_faddr_sun_path);
                        }
                        pksi->ks_si.si_faddr_sun_path[sn_len] = 0;
                }

                mutex_exit(&so->so_lock);

                (void) sprintf(pksi->ks_straddr[0], "%p", (void *)so);
                (void) sprintf(pksi->ks_straddr[1], "%p",
                    (void *)so->so_ux_laddr.soua_vp);
                (void) sprintf(pksi->ks_straddr[2], "%p",
                    (void *)so->so_ux_faddr.soua_vp);

                ns++;
                pksi++;
        }

        ksp->ks_ndata = ns;
        return (0);
}

ssize_t
soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size)
{
        struct uio auio;
        struct iovec aiov[MSG_MAXIOVLEN];
        register vnode_t *vp;
        int ioflag, rwflag;
        ssize_t cnt;
        int error = 0;
        int iovcnt = 0;
        short fflag;

        vp = fp->f_vnode;
        fflag = fp->f_flag;

        rwflag = 0;
        aiov[0].iov_base = (caddr_t)buf;
        aiov[0].iov_len = size;
        iovcnt = 1;
        cnt = (ssize_t)size;
        (void) VOP_RWLOCK(vp, rwflag, NULL);

        auio.uio_loffset = fileoff;
        auio.uio_iov = aiov;
        auio.uio_iovcnt = iovcnt;
        auio.uio_resid = cnt;
        auio.uio_segflg = UIO_SYSSPACE;
        auio.uio_llimit = MAXOFFSET_T;
        auio.uio_fmode = fflag;
        auio.uio_extflg = UIO_COPY_CACHED;

        ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);

        /* If read sync is not asked for, filter sync flags */
        if ((ioflag & FRSYNC) == 0)
                ioflag &= ~(FSYNC|FDSYNC);
        error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
        cnt -= auio.uio_resid;

        VOP_RWUNLOCK(vp, rwflag, NULL);

        if (error == EINTR && cnt != 0)
                error = 0;
out:
        if (error != 0) {
                *err = error;
                return (0);
        } else {
                *err = 0;
                return (cnt);
        }
}
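
/*
 * For reference only (an illustrative sketch, not compiled as part of this
 * file): the sock_unix_list kstat created by sock_kstat_init() above is a
 * raw, variable-size kstat. Each record begins with a struct sockinfo (see
 * <sys/socketvar.h>) whose si_size field gives the full record size, so a
 * consumer must step by si_size rather than sizeof (struct sockinfo) -
 * sockfs_snapshot() appends the formatted pointer strings after the
 * sockinfo. A userland reader using libkstat would look roughly like this:
 *
 *	#include <kstat.h>
 *	#include <sys/socketvar.h>
 *
 *	kstat_ctl_t *kc;
 *	kstat_t *ksp;
 *
 *	if ((kc = kstat_open()) != NULL &&
 *	    (ksp = kstat_lookup(kc, "sockfs", 0, "sock_unix_list")) != NULL &&
 *	    kstat_read(kc, ksp, NULL) != -1) {
 *		char *ptr = ksp->ks_data;
 *		uint_t i;
 *
 *		for (i = 0; i < ksp->ks_ndata; i++) {
 *			struct sockinfo *psi = (struct sockinfo *)ptr;
 *
 *			(void) printf("family %d %s\n",
 *			    (int)psi->si_family, psi->si_laddr_sun_path);
 *			ptr += psi->si_size;
 *		}
 *	}
 *	if (kc != NULL)
 *		(void) kstat_close(kc);
 */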