1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/file.h> 27 #include <sys/stat.h> 28 #include <sys/atomic.h> 29 #include <sys/mntio.h> 30 #include <sys/mnttab.h> 31 #include <sys/mount.h> 32 #include <sys/sunddi.h> 33 #include <sys/sysmacros.h> 34 #include <sys/systm.h> 35 #include <sys/vfs.h> 36 #include <sys/vfs_opreg.h> 37 #include <sys/fs/mntdata.h> 38 #include <fs/fs_subr.h> 39 #include <sys/vmsystm.h> 40 #include <vm/seg_vn.h> 41 #include <sys/time.h> 42 #include <sys/ksynch.h> 43 #include <sys/sdt.h> 44 45 #define MNTROOTINO 2 46 47 static mntnode_t *mntgetnode(vnode_t *); 48 49 vnodeops_t *mntvnodeops; 50 extern void vfs_mnttab_readop(void); 51 52 /* 53 * Design of kernel mnttab accounting. 54 * 55 * mntfs provides two methods of reading the in-kernel mnttab, i.e. the state of 56 * the mounted resources: the read-only file /etc/mnttab, and a collection of 57 * ioctl() commands. Most of these interfaces are public and are described in 58 * mnttab(4). Three private ioctl() commands, MNTIOC_GETMNTENT, 59 * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY, provide for the getmntent(3C) 60 * family of functions, allowing them to support white space in mount names. 61 * 62 * A significant feature of mntfs is that it provides a file descriptor with a 63 * snapshot once it begins to consume mnttab data. Thus, as the process 64 * continues to consume data, its view of the in-kernel mnttab does not change 65 * even if resources are mounted or unmounted. The intent is to ensure that 66 * processes are guaranteed to read self-consistent data even as the system 67 * changes. 68 * 69 * The snapshot is implemented by a "database", unique to each zone, that 70 * comprises a linked list of mntelem_ts. The database is identified by 71 * zone_mntfs_db and is protected by zone_mntfs_db_lock. Each element contains 72 * the text entry in /etc/mnttab for a mounted resource, i.e. a vfs_t, and is 73 * marked with its time of "birth", i.e. creation. An element is "killed", and 74 * marked with its time of death, when it is found to be out of date, e.g. when 75 * the corresponding resource has been unmounted. 76 * 77 * When a process performs the first read() or ioctl() for a file descriptor for 78 * /etc/mnttab, the database is updated by a call to mntfs_snapshot() to ensure 79 * that an element exists for each currently mounted resource. Following this, 80 * the current time is written into a snapshot structure, a mntsnap_t, embedded 81 * in the descriptor's mntnode_t. 82 * 83 * mntfs is able to enumerate the /etc/mnttab entries corresponding to a 84 * particular file descriptor by searching the database for entries that were 85 * born before the appropriate snapshot and that either are still alive or died 86 * after the snapshot was created. Consumers use the iterator function 87 * mntfs_get_next_elem() to identify the next suitable element in the database. 88 * 89 * Each snapshot has a hold on its corresponding database elements, effected by 90 * a per-element reference count. At last close(), a snapshot is destroyed in 91 * mntfs_freesnap() by releasing all of its holds; an element is destroyed if 92 * its reference count becomes zero. Therefore the database never exists unless 93 * there is at least one active consumer of /etc/mnttab. 94 * 95 * getmntent(3C) et al. "do not open, close or rewind the file." This implies 96 * that getmntent() and read() must be able to operate without interaction on 97 * the same file descriptor; this is accomplished by the use of separate 98 * mntsnap_ts for both read() and ioctl(). 99 * 100 * NOTE: The following variable enables the generation of the "dev=xxx" 101 * in the option string for a mounted file system. Really this should 102 * be gotten rid of altogether, but for the sake of backwards compatibility 103 * we had to leave it in. It is defined as a 32-bit device number. This 104 * means that when 64-bit device numbers are in use, if either the major or 105 * minor part of the device number will not fit in a 16 bit quantity, the 106 * "dev=" will be set to NODEV (0x7fffffff). See PSARC 1999/566 and 107 * 1999/131 for details. The cmpldev() function used to generate the 32-bit 108 * device number handles this check and assigns the proper value. 109 */ 110 int mntfs_enabledev = 1; /* enable old "dev=xxx" option */ 111 112 extern void vfs_mono_time(timespec_t *); 113 enum { MNTFS_FIRST, MNTFS_SECOND, MNTFS_NEITHER }; 114 115 /* 116 * Determine whether a field within a line from /etc/mnttab contains actual 117 * content or simply the marker string "-". This never applies to the time, 118 * therefore the delimiter must be a tab. 119 */ 120 #define MNTFS_REAL_FIELD(x) (*(x) != '-' || *((x) + 1) != '\t') 121 122 static int 123 mntfs_devsize(struct vfs *vfsp) 124 { 125 dev32_t odev; 126 127 (void) cmpldev(&odev, vfsp->vfs_dev); 128 return (snprintf(NULL, 0, "dev=%x", odev)); 129 } 130 131 static int 132 mntfs_devprint(struct vfs *vfsp, char *buf) 133 { 134 dev32_t odev; 135 136 (void) cmpldev(&odev, vfsp->vfs_dev); 137 return (snprintf(buf, MAX_MNTOPT_STR, "dev=%x", odev)); 138 } 139 140 /* Identify which, if either, of two supplied timespec structs is newer. */ 141 static int 142 mntfs_newest(timespec_t *a, timespec_t *b) 143 { 144 if (a->tv_sec == b->tv_sec && 145 a->tv_nsec == b->tv_nsec) { 146 return (MNTFS_NEITHER); 147 } else if (b->tv_sec > a->tv_sec || 148 (b->tv_sec == a->tv_sec && 149 b->tv_nsec > a->tv_nsec)) { 150 return (MNTFS_SECOND); 151 } else { 152 return (MNTFS_FIRST); 153 } 154 } 155 156 static int 157 mntfs_optsize(struct vfs *vfsp) 158 { 159 int i, size = 0; 160 mntopt_t *mop; 161 162 for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) { 163 mop = &vfsp->vfs_mntopts.mo_list[i]; 164 if (mop->mo_flags & MO_NODISPLAY) 165 continue; 166 if (mop->mo_flags & MO_SET) { 167 if (size) 168 size++; /* space for comma */ 169 size += strlen(mop->mo_name); 170 /* 171 * count option value if there is one 172 */ 173 if (mop->mo_arg != NULL) { 174 size += strlen(mop->mo_arg) + 1; 175 } 176 } 177 } 178 if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) { 179 /* 180 * Add space for "zone=<zone_name>" if required. 181 */ 182 if (size) 183 size++; /* space for comma */ 184 size += sizeof ("zone=") - 1; 185 size += strlen(vfsp->vfs_zone->zone_name); 186 } 187 if (mntfs_enabledev) { 188 if (size != 0) 189 size++; /* space for comma */ 190 size += mntfs_devsize(vfsp); 191 } 192 if (size == 0) 193 size = strlen("-"); 194 return (size); 195 } 196 197 static int 198 mntfs_optprint(struct vfs *vfsp, char *buf) 199 { 200 int i, optinbuf = 0; 201 mntopt_t *mop; 202 char *origbuf = buf; 203 204 for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) { 205 mop = &vfsp->vfs_mntopts.mo_list[i]; 206 if (mop->mo_flags & MO_NODISPLAY) 207 continue; 208 if (mop->mo_flags & MO_SET) { 209 if (optinbuf) 210 *buf++ = ','; 211 else 212 optinbuf = 1; 213 buf += snprintf(buf, MAX_MNTOPT_STR, 214 "%s", mop->mo_name); 215 /* 216 * print option value if there is one 217 */ 218 if (mop->mo_arg != NULL) { 219 buf += snprintf(buf, MAX_MNTOPT_STR, "=%s", 220 mop->mo_arg); 221 } 222 } 223 } 224 if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) { 225 if (optinbuf) 226 *buf++ = ','; 227 else 228 optinbuf = 1; 229 buf += snprintf(buf, MAX_MNTOPT_STR, "zone=%s", 230 vfsp->vfs_zone->zone_name); 231 } 232 if (mntfs_enabledev) { 233 if (optinbuf++) 234 *buf++ = ','; 235 buf += mntfs_devprint(vfsp, buf); 236 } 237 if (!optinbuf) { 238 buf += snprintf(buf, MAX_MNTOPT_STR, "-"); 239 } 240 return (buf - origbuf); 241 } 242 243 void 244 mntfs_populate_text(vfs_t *vfsp, zone_t *zonep, mntelem_t *elemp) 245 { 246 struct extmnttab *tabp = &elemp->mnte_tab; 247 const char *resource, *mntpt; 248 char *cp = elemp->mnte_text; 249 mntpt = refstr_value(vfsp->vfs_mntpt); 250 resource = refstr_value(vfsp->vfs_resource); 251 252 tabp->mnt_special = 0; 253 if (resource != NULL && resource[0] != '\0') { 254 if (resource[0] != '/') { 255 cp += snprintf(cp, MAXPATHLEN, "%s\t", resource); 256 } else if (!ZONE_PATH_VISIBLE(resource, zonep)) { 257 /* 258 * Use the mount point as the resource. 259 */ 260 cp += snprintf(cp, MAXPATHLEN, "%s\t", 261 ZONE_PATH_TRANSLATE(mntpt, zonep)); 262 } else { 263 cp += snprintf(cp, MAXPATHLEN, "%s\t", 264 ZONE_PATH_TRANSLATE(resource, zonep)); 265 } 266 } else { 267 cp += snprintf(cp, MAXPATHLEN, "-\t"); 268 } 269 270 tabp->mnt_mountp = (char *)(cp - elemp->mnte_text); 271 if (mntpt != NULL && mntpt[0] != '\0') { 272 /* 273 * We know the mount point is visible from within the zone, 274 * otherwise it wouldn't be on the zone's vfs list. 275 */ 276 cp += snprintf(cp, MAXPATHLEN, "%s\t", 277 ZONE_PATH_TRANSLATE(mntpt, zonep)); 278 } else { 279 cp += snprintf(cp, MAXPATHLEN, "-\t"); 280 } 281 282 tabp->mnt_fstype = (char *)(cp - elemp->mnte_text); 283 cp += snprintf(cp, MAXPATHLEN, "%s\t", 284 vfssw[vfsp->vfs_fstype].vsw_name); 285 286 tabp->mnt_mntopts = (char *)(cp - elemp->mnte_text); 287 cp += mntfs_optprint(vfsp, cp); 288 *cp++ = '\t'; 289 290 tabp->mnt_time = (char *)(cp - elemp->mnte_text); 291 cp += snprintf(cp, MAX_MNTOPT_STR, "%ld", vfsp->vfs_mtime); 292 *cp++ = '\n'; /* over-write snprintf's trailing null-byte */ 293 294 tabp->mnt_major = getmajor(vfsp->vfs_dev); 295 tabp->mnt_minor = getminor(vfsp->vfs_dev); 296 297 elemp->mnte_text_size = cp - elemp->mnte_text; 298 elemp->mnte_vfs_ctime = vfsp->vfs_hrctime; 299 elemp->mnte_hidden = vfsp->vfs_flag & VFS_NOMNTTAB; 300 } 301 302 /* Determine the length of the /etc/mnttab entry for this vfs_t. */ 303 static size_t 304 mntfs_text_len(vfs_t *vfsp, zone_t *zone) 305 { 306 size_t size = 0; 307 const char *resource, *mntpt; 308 size_t mntsize; 309 310 mntpt = refstr_value(vfsp->vfs_mntpt); 311 if (mntpt != NULL && mntpt[0] != '\0') { 312 mntsize = strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1; 313 } else { 314 mntsize = 2; /* "-\t" */ 315 } 316 size += mntsize; 317 318 resource = refstr_value(vfsp->vfs_resource); 319 if (resource != NULL && resource[0] != '\0') { 320 if (resource[0] != '/') { 321 size += strlen(resource) + 1; 322 } else if (!ZONE_PATH_VISIBLE(resource, zone)) { 323 /* 324 * Same as the zone's view of the mount point. 325 */ 326 size += mntsize; 327 } else { 328 size += strlen(ZONE_PATH_TRANSLATE(resource, zone)) + 1; 329 } 330 } else { 331 size += 2; /* "-\t" */ 332 } 333 size += strlen(vfssw[vfsp->vfs_fstype].vsw_name) + 1; 334 size += mntfs_optsize(vfsp); 335 size += snprintf(NULL, 0, "\t%ld\n", vfsp->vfs_mtime); 336 return (size); 337 } 338 339 /* Destroy the resources associated with a snapshot element. */ 340 static void 341 mntfs_destroy_elem(mntelem_t *elemp) 342 { 343 kmem_free(elemp->mnte_text, elemp->mnte_text_size); 344 kmem_free(elemp, sizeof (mntelem_t)); 345 } 346 347 /* 348 * Return 1 if the given snapshot is in the range of the given element; return 349 * 0 otherwise. 350 */ 351 static int 352 mntfs_elem_in_range(mntsnap_t *snapp, mntelem_t *elemp) 353 { 354 timespec_t *stimep = &snapp->mnts_time; 355 timespec_t *btimep = &elemp->mnte_birth; 356 timespec_t *dtimep = &elemp->mnte_death; 357 358 /* 359 * If a snapshot is in range of an element then the snapshot must have 360 * been created after the birth of the element, and either the element 361 * is still alive or it died after the snapshot was created. 362 */ 363 if (mntfs_newest(btimep, stimep) == MNTFS_SECOND && 364 (MNTFS_ELEM_IS_ALIVE(elemp) || 365 mntfs_newest(stimep, dtimep) == MNTFS_SECOND)) 366 return (1); 367 else 368 return (0); 369 } 370 371 /* 372 * Return the next valid database element, after the one provided, for a given 373 * snapshot; return NULL if none exists. The caller must hold the zone's 374 * database lock as a reader before calling this function. 375 */ 376 static mntelem_t * 377 mntfs_get_next_elem(mntsnap_t *snapp, mntelem_t *elemp) 378 { 379 int show_hidden = snapp->mnts_flags & MNTS_SHOWHIDDEN; 380 381 do { 382 elemp = elemp->mnte_next; 383 } while (elemp && 384 (!mntfs_elem_in_range(snapp, elemp) || 385 (!show_hidden && elemp->mnte_hidden))); 386 return (elemp); 387 } 388 389 /* 390 * This function frees the resources associated with a mntsnap_t. It walks 391 * through the database, decrementing the reference count of any element that 392 * satisfies the snapshot. If the reference count of an element becomes zero 393 * then it is removed from the database. 394 */ 395 static void 396 mntfs_freesnap(mntnode_t *mnp, mntsnap_t *snapp) 397 { 398 zone_t *zonep = MTOD(mnp)->mnt_zone; 399 krwlock_t *dblockp = &zonep->zone_mntfs_db_lock; 400 mntelem_t **elempp = &zonep->zone_mntfs_db; 401 mntelem_t *elemp; 402 int show_hidden = snapp->mnts_flags & MNTS_SHOWHIDDEN; 403 size_t number_decremented = 0; 404 405 ASSERT(RW_WRITE_HELD(&mnp->mnt_contents)); 406 407 /* Ignore an uninitialised snapshot. */ 408 if (snapp->mnts_nmnts == 0) 409 return; 410 411 /* Drop the holds on any matching database elements. */ 412 rw_enter(dblockp, RW_WRITER); 413 while ((elemp = *elempp) != NULL) { 414 if (mntfs_elem_in_range(snapp, elemp) && 415 (!elemp->mnte_hidden || show_hidden) && 416 ++number_decremented && --elemp->mnte_refcnt == 0) { 417 if ((*elempp = elemp->mnte_next) != NULL) 418 (*elempp)->mnte_prev = elemp->mnte_prev; 419 mntfs_destroy_elem(elemp); 420 } else { 421 elempp = &elemp->mnte_next; 422 } 423 } 424 rw_exit(dblockp); 425 ASSERT(number_decremented == snapp->mnts_nmnts); 426 427 /* Clear the snapshot data. */ 428 bzero(snapp, sizeof (mntsnap_t)); 429 } 430 431 /* Insert the new database element newp after the existing element prevp. */ 432 static void 433 mntfs_insert_after(mntelem_t *newp, mntelem_t *prevp) 434 { 435 newp->mnte_prev = prevp; 436 newp->mnte_next = prevp->mnte_next; 437 prevp->mnte_next = newp; 438 if (newp->mnte_next != NULL) 439 newp->mnte_next->mnte_prev = newp; 440 } 441 442 /* Create and return a copy of a given database element. */ 443 static mntelem_t * 444 mntfs_copy(mntelem_t *origp) 445 { 446 mntelem_t *copyp; 447 448 copyp = kmem_zalloc(sizeof (mntelem_t), KM_SLEEP); 449 copyp->mnte_vfs_ctime = origp->mnte_vfs_ctime; 450 copyp->mnte_text_size = origp->mnte_text_size; 451 copyp->mnte_text = kmem_alloc(copyp->mnte_text_size, KM_SLEEP); 452 bcopy(origp->mnte_text, copyp->mnte_text, copyp->mnte_text_size); 453 copyp->mnte_tab = origp->mnte_tab; 454 copyp->mnte_hidden = origp->mnte_hidden; 455 456 return (copyp); 457 } 458 459 /* 460 * Compare two database elements and determine whether or not the vfs_t payload 461 * data of each are the same. Return 1 if so and 0 otherwise. 462 */ 463 static int 464 mntfs_is_same_element(mntelem_t *a, mntelem_t *b) 465 { 466 if (a->mnte_hidden == b->mnte_hidden && 467 a->mnte_text_size == b->mnte_text_size && 468 bcmp(a->mnte_text, b->mnte_text, a->mnte_text_size) == 0 && 469 bcmp(&a->mnte_tab, &b->mnte_tab, sizeof (struct extmnttab)) == 0) 470 return (1); 471 else 472 return (0); 473 } 474 475 /* 476 * mntfs_snapshot() updates the database, creating it if necessary, so that it 477 * accurately reflects the state of the in-kernel mnttab. It also increments 478 * the reference count on all database elements that correspond to currently- 479 * mounted resources. Finally, it initialises the appropriate snapshot 480 * structure. 481 * 482 * Each vfs_t is given a high-resolution time stamp, for the benefit of mntfs, 483 * when it is inserted into the in-kernel mnttab. This time stamp is copied into 484 * the corresponding database element when it is created, allowing the element 485 * and the vfs_t to be identified as a pair. It is possible that some file 486 * systems may make unadvertised changes to, for example, a resource's mount 487 * options. Therefore, in order to determine whether a database element is an 488 * up-to-date representation of a given vfs_t, it is compared with a temporary 489 * element generated for this purpose. Although less efficient, this is safer 490 * than implementing an mtime for a vfs_t. 491 * 492 * Some mounted resources are marked as "hidden" with a VFS_NOMNTTAB flag. These 493 * are considered invisible unless the user has already set the MNT_SHOWHIDDEN 494 * flag in the vnode using the MNTIOC_SHOWHIDDEN ioctl. 495 */ 496 static void 497 mntfs_snapshot(mntnode_t *mnp, mntsnap_t *snapp) 498 { 499 zone_t *zonep = MTOD(mnp)->mnt_zone; 500 int is_global_zone = (zonep == global_zone); 501 int show_hidden = mnp->mnt_flags & MNT_SHOWHIDDEN; 502 vfs_t *vfsp, *firstvfsp, *lastvfsp; 503 vfs_t dummyvfs; 504 vfs_t *dummyvfsp = NULL; 505 krwlock_t *dblockp = &zonep->zone_mntfs_db_lock; 506 mntelem_t **headpp = &zonep->zone_mntfs_db; 507 mntelem_t *elemp; 508 mntelem_t *prevp = NULL; 509 int order; 510 mntelem_t *tempelemp; 511 mntelem_t *newp; 512 mntelem_t *firstp = NULL; 513 size_t nmnts = 0; 514 size_t text_size = 0; 515 int insert_before; 516 timespec_t last_mtime; 517 size_t entry_length, new_entry_length; 518 519 520 ASSERT(RW_WRITE_HELD(&mnp->mnt_contents)); 521 vfs_list_read_lock(); 522 vfs_mnttab_modtime(&last_mtime); 523 524 /* 525 * If this snapshot already exists then we must have been asked to 526 * rewind the file, i.e. discard the snapshot and create a new one in 527 * its place. In this case we first see if the in-kernel mnttab has 528 * advertised a change; if not then we simply reinitialise the metadata. 529 */ 530 if (snapp->mnts_nmnts) { 531 if (mntfs_newest(&last_mtime, &snapp->mnts_last_mtime) == 532 MNTFS_NEITHER) { 533 /* 534 * An unchanged mtime is no guarantee that the 535 * in-kernel mnttab is unchanged; for example, a 536 * concurrent remount may be between calls to 537 * vfs_setmntopt_nolock() and vfs_mnttab_modtimeupd(). 538 * It follows that the database may have changed, and 539 * in particular that some elements in this snapshot 540 * may have been killed by another call to 541 * mntfs_snapshot(). It is therefore not merely 542 * unnecessary to update the snapshot's time but in 543 * fact dangerous; it needs to be left alone. 544 */ 545 snapp->mnts_next = snapp->mnts_first; 546 snapp->mnts_flags &= ~MNTS_REWIND; 547 snapp->mnts_foffset = snapp->mnts_ieoffset = 0; 548 vfs_list_unlock(); 549 return; 550 } else { 551 mntfs_freesnap(mnp, snapp); 552 } 553 } 554 555 /* 556 * Create a temporary database element. For each vfs_t, the temporary 557 * element will be populated with the corresponding text. If the vfs_t 558 * does not have a corresponding element within the database, or if 559 * there is such an element but it is stale, a copy of the temporary 560 * element is inserted into the database at the appropriate location. 561 */ 562 tempelemp = kmem_alloc(sizeof (mntelem_t), KM_SLEEP); 563 entry_length = MNT_LINE_MAX; 564 tempelemp->mnte_text = kmem_alloc(entry_length, KM_SLEEP); 565 566 /* Find the first and last vfs_t for the given zone. */ 567 if (is_global_zone) { 568 firstvfsp = rootvfs; 569 lastvfsp = firstvfsp->vfs_prev; 570 } else { 571 firstvfsp = zonep->zone_vfslist; 572 /* 573 * If there isn't already a vfs_t for root then we create a 574 * dummy which will be used as the head of the list (which will 575 * therefore no longer be circular). 576 */ 577 if (firstvfsp == NULL || 578 strcmp(refstr_value(firstvfsp->vfs_mntpt), 579 zonep->zone_rootpath) != 0) { 580 /* 581 * The zone's vfs_ts will have mount points relative to 582 * the zone's root path. The vfs_t for the zone's 583 * root file system would therefore have a mount point 584 * equal to the zone's root path. Since the zone's root 585 * path isn't a mount point, we copy the vfs_t of the 586 * zone's root vnode, and provide it with a fake mount 587 * point and resource. 588 * 589 * Note that by cloning another vfs_t we also acquire 590 * its high-resolution ctime. This might appear to 591 * violate the requirement that the ctimes in the list 592 * of vfs_ts are unique and monotonically increasing; 593 * this is not the case. The dummy vfs_t appears in only 594 * a non-global zone's vfs_t list, where the cloned 595 * vfs_t would not ordinarily be visible; the ctimes are 596 * therefore unique. The zone's root path must be 597 * available before the zone boots, and so its root 598 * vnode's vfs_t's ctime must be lower than those of any 599 * resources subsequently mounted by the zone. The 600 * ctimes are therefore monotonically increasing. 601 */ 602 dummyvfs = *zonep->zone_rootvp->v_vfsp; 603 dummyvfs.vfs_mntpt = refstr_alloc(zonep->zone_rootpath); 604 dummyvfs.vfs_resource = dummyvfs.vfs_mntpt; 605 dummyvfsp = &dummyvfs; 606 if (firstvfsp == NULL) { 607 lastvfsp = dummyvfsp; 608 } else { 609 lastvfsp = firstvfsp->vfs_zone_prev; 610 dummyvfsp->vfs_zone_next = firstvfsp; 611 } 612 firstvfsp = dummyvfsp; 613 } else { 614 lastvfsp = firstvfsp->vfs_zone_prev; 615 } 616 } 617 618 /* 619 * Now walk through all the vfs_ts for this zone. For each one, find the 620 * corresponding database element, creating it first if necessary, and 621 * increment its reference count. 622 */ 623 rw_enter(dblockp, RW_WRITER); 624 elemp = zonep->zone_mntfs_db; 625 /* CSTYLED */ 626 for (vfsp = firstvfsp;; 627 vfsp = is_global_zone ? vfsp->vfs_next : vfsp->vfs_zone_next) { 628 DTRACE_PROBE1(new__vfs, vfs_t *, vfsp); 629 /* Consider only visible entries. */ 630 if ((vfsp->vfs_flag & VFS_NOMNTTAB) == 0 || show_hidden) { 631 /* 632 * Walk through the existing database looking for either 633 * an element that matches the current vfs_t, or for the 634 * correct place in which to insert a new element. 635 */ 636 insert_before = 0; 637 for (; elemp; prevp = elemp, elemp = elemp->mnte_next) { 638 DTRACE_PROBE1(considering__elem, mntelem_t *, 639 elemp); 640 641 /* Compare the vfs_t with the element. */ 642 order = mntfs_newest(&elemp->mnte_vfs_ctime, 643 &vfsp->vfs_hrctime); 644 645 /* 646 * If we encounter a database element newer than 647 * this vfs_t then we've stepped over a gap 648 * where the element for this vfs_t must be 649 * inserted. 650 */ 651 if (order == MNTFS_FIRST) { 652 insert_before = 1; 653 break; 654 } 655 656 /* Dead elements no longer interest us. */ 657 if (MNTFS_ELEM_IS_DEAD(elemp)) 658 continue; 659 660 /* 661 * If the time stamps are the same then the 662 * element is potential match for the vfs_t, 663 * although it may later prove to be stale. 664 */ 665 if (order == MNTFS_NEITHER) 666 break; 667 668 /* 669 * This element must be older than the vfs_t. 670 * It must, therefore, correspond to a vfs_t 671 * that has been unmounted. Since the element is 672 * still alive, we kill it if it is visible. 673 */ 674 if (!elemp->mnte_hidden || show_hidden) 675 vfs_mono_time(&elemp->mnte_death); 676 } 677 DTRACE_PROBE2(possible__match, vfs_t *, vfsp, 678 mntelem_t *, elemp); 679 680 /* Create a new database element if required. */ 681 new_entry_length = mntfs_text_len(vfsp, zonep); 682 if (new_entry_length > entry_length) { 683 kmem_free(tempelemp->mnte_text, entry_length); 684 tempelemp->mnte_text = 685 kmem_alloc(new_entry_length, KM_SLEEP); 686 entry_length = new_entry_length; 687 } 688 mntfs_populate_text(vfsp, zonep, tempelemp); 689 ASSERT(tempelemp->mnte_text_size == new_entry_length); 690 if (elemp == NULL) { 691 /* 692 * We ran off the end of the database. Insert a 693 * new element at the end. 694 */ 695 newp = mntfs_copy(tempelemp); 696 vfs_mono_time(&newp->mnte_birth); 697 if (prevp) { 698 mntfs_insert_after(newp, prevp); 699 } else { 700 newp->mnte_next = NULL; 701 newp->mnte_prev = NULL; 702 ASSERT(*headpp == NULL); 703 *headpp = newp; 704 } 705 elemp = newp; 706 } else if (insert_before) { 707 /* 708 * Insert a new element before the current one. 709 */ 710 newp = mntfs_copy(tempelemp); 711 vfs_mono_time(&newp->mnte_birth); 712 if (prevp) { 713 mntfs_insert_after(newp, prevp); 714 } else { 715 newp->mnte_next = elemp; 716 newp->mnte_prev = NULL; 717 elemp->mnte_prev = newp; 718 ASSERT(*headpp == elemp); 719 *headpp = newp; 720 } 721 elemp = newp; 722 } else if (!mntfs_is_same_element(elemp, tempelemp)) { 723 /* 724 * The element corresponds to the vfs_t, but the 725 * vfs_t has changed; it must have been 726 * remounted. Kill the old element and insert a 727 * new one after it. 728 */ 729 vfs_mono_time(&elemp->mnte_death); 730 newp = mntfs_copy(tempelemp); 731 vfs_mono_time(&newp->mnte_birth); 732 mntfs_insert_after(newp, elemp); 733 elemp = newp; 734 } 735 736 /* We've found the corresponding element. Hold it. */ 737 DTRACE_PROBE1(incrementing, mntelem_t *, elemp); 738 elemp->mnte_refcnt++; 739 740 /* 741 * Update the parameters used to initialise the 742 * snapshot. 743 */ 744 nmnts++; 745 text_size += elemp->mnte_text_size; 746 if (!firstp) 747 firstp = elemp; 748 749 prevp = elemp; 750 elemp = elemp->mnte_next; 751 } 752 753 if (vfsp == lastvfsp) 754 break; 755 } 756 757 /* 758 * Any remaining visible database elements that are still alive must be 759 * killed now, because their corresponding vfs_ts must have been 760 * unmounted. 761 */ 762 for (; elemp; elemp = elemp->mnte_next) { 763 if (MNTFS_ELEM_IS_ALIVE(elemp) && 764 (!elemp->mnte_hidden || show_hidden)) 765 vfs_mono_time(&elemp->mnte_death); 766 } 767 768 /* Initialise the snapshot. */ 769 vfs_mono_time(&snapp->mnts_time); 770 snapp->mnts_last_mtime = last_mtime; 771 snapp->mnts_first = snapp->mnts_next = firstp; 772 snapp->mnts_flags = show_hidden ? MNTS_SHOWHIDDEN : 0; 773 snapp->mnts_nmnts = nmnts; 774 snapp->mnts_text_size = MTOD(mnp)->mnt_size = text_size; 775 snapp->mnts_foffset = snapp->mnts_ieoffset = 0; 776 777 /* Clean up. */ 778 rw_exit(dblockp); 779 vfs_list_unlock(); 780 if (dummyvfsp != NULL) 781 refstr_rele(dummyvfsp->vfs_mntpt); 782 kmem_free(tempelemp->mnte_text, entry_length); 783 kmem_free(tempelemp, sizeof (mntelem_t)); 784 } 785 786 /* 787 * Public function to convert vfs_mntopts into a string. 788 * A buffer of sufficient size is allocated, which is returned via bufp, 789 * and whose length is returned via lenp. 790 */ 791 void 792 mntfs_getmntopts(struct vfs *vfsp, char **bufp, size_t *lenp) 793 { 794 size_t len; 795 char *buf; 796 797 vfs_list_read_lock(); 798 799 len = mntfs_optsize(vfsp) + 1; 800 buf = kmem_alloc(len, KM_NOSLEEP); 801 if (buf == NULL) { 802 *bufp = NULL; 803 vfs_list_unlock(); 804 return; 805 } 806 buf[len - 1] = '\0'; 807 (void) mntfs_optprint(vfsp, buf); 808 ASSERT(buf[len - 1] == '\0'); 809 810 vfs_list_unlock(); 811 *bufp = buf; 812 *lenp = len; 813 } 814 815 /* ARGSUSED */ 816 static int 817 mntopen(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 818 { 819 vnode_t *vp = *vpp; 820 mntnode_t *nmnp; 821 822 /* 823 * Not allowed to open for writing, return error. 824 */ 825 if (flag & FWRITE) 826 return (EPERM); 827 /* 828 * Create a new mnt/vnode for each open, this will give us a handle to 829 * hang the snapshot on. 830 */ 831 nmnp = mntgetnode(vp); 832 833 *vpp = MTOV(nmnp); 834 atomic_add_32(&MTOD(nmnp)->mnt_nopen, 1); 835 VN_RELE(vp); 836 return (0); 837 } 838 839 /* ARGSUSED */ 840 static int 841 mntclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 842 caller_context_t *ct) 843 { 844 mntnode_t *mnp = VTOM(vp); 845 846 /* Clean up any locks or shares held by the current process */ 847 cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 848 cleanshares(vp, ttoproc(curthread)->p_pid); 849 850 if (count > 1) 851 return (0); 852 if (vp->v_count == 1) { 853 rw_enter(&mnp->mnt_contents, RW_WRITER); 854 mntfs_freesnap(mnp, &mnp->mnt_read); 855 mntfs_freesnap(mnp, &mnp->mnt_ioctl); 856 rw_exit(&mnp->mnt_contents); 857 atomic_add_32(&MTOD(mnp)->mnt_nopen, -1); 858 } 859 return (0); 860 } 861 862 /* ARGSUSED */ 863 static int 864 mntread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, caller_context_t *ct) 865 { 866 mntnode_t *mnp = VTOM(vp); 867 zone_t *zonep = MTOD(mnp)->mnt_zone; 868 mntsnap_t *snapp = &mnp->mnt_read; 869 off_t off = uio->uio_offset; 870 size_t len = uio->uio_resid; 871 char *bufferp; 872 size_t available, copylen; 873 size_t written = 0; 874 mntelem_t *elemp; 875 krwlock_t *dblockp = &zonep->zone_mntfs_db_lock; 876 int error = 0; 877 off_t ieoffset; 878 879 rw_enter(&mnp->mnt_contents, RW_WRITER); 880 if (snapp->mnts_nmnts == 0 || (off == (off_t)0)) 881 mntfs_snapshot(mnp, snapp); 882 883 if ((size_t)(off + len) > snapp->mnts_text_size) 884 len = snapp->mnts_text_size - off; 885 886 if (off < 0 || len > snapp->mnts_text_size) { 887 rw_exit(&mnp->mnt_contents); 888 return (EFAULT); 889 } 890 891 if (len == 0) { 892 rw_exit(&mnp->mnt_contents); 893 return (0); 894 } 895 896 /* 897 * For the file offset provided, locate the corresponding database 898 * element and calculate the corresponding offset within its text. If 899 * the file offset is the same as that reached during the last read(2) 900 * then use the saved element and intra-element offset. 901 */ 902 rw_enter(dblockp, RW_READER); 903 if (off == 0 || (off == snapp->mnts_foffset)) { 904 elemp = snapp->mnts_next; 905 ieoffset = snapp->mnts_ieoffset; 906 } else { 907 off_t total_off; 908 /* 909 * Find the element corresponding to the requested file offset 910 * by walking through the database and summing the text sizes 911 * of the individual elements. If the requested file offset is 912 * greater than that reached on the last visit then we can start 913 * at the last seen element; otherwise, we have to start at the 914 * beginning. 915 */ 916 if (off > snapp->mnts_foffset) { 917 elemp = snapp->mnts_next; 918 total_off = snapp->mnts_foffset - snapp->mnts_ieoffset; 919 } else { 920 elemp = snapp->mnts_first; 921 total_off = 0; 922 } 923 while (off > total_off + elemp->mnte_text_size) { 924 total_off += elemp->mnte_text_size; 925 elemp = mntfs_get_next_elem(snapp, elemp); 926 ASSERT(elemp != NULL); 927 } 928 /* Calculate the intra-element offset. */ 929 if (off > total_off) 930 ieoffset = off - total_off; 931 else 932 ieoffset = 0; 933 } 934 935 /* 936 * Create a buffer and populate it with the text from successive 937 * database elements until it is full. 938 */ 939 bufferp = kmem_alloc(len, KM_SLEEP); 940 while (written < len) { 941 available = elemp->mnte_text_size - ieoffset; 942 copylen = MIN(len - written, available); 943 bcopy(elemp->mnte_text + ieoffset, bufferp + written, copylen); 944 written += copylen; 945 if (copylen == available) { 946 elemp = mntfs_get_next_elem(snapp, elemp); 947 ASSERT(elemp != NULL || written == len); 948 ieoffset = 0; 949 } else { 950 ieoffset += copylen; 951 } 952 } 953 rw_exit(dblockp); 954 955 /* 956 * Write the populated buffer, update the snapshot's state if 957 * successful and then advertise our read. 958 */ 959 error = uiomove(bufferp, len, UIO_READ, uio); 960 if (error == 0) { 961 snapp->mnts_next = elemp; 962 snapp->mnts_foffset = off + len; 963 snapp->mnts_ieoffset = ieoffset; 964 } 965 vfs_mnttab_readop(); 966 rw_exit(&mnp->mnt_contents); 967 968 /* Clean up. */ 969 kmem_free(bufferp, len); 970 return (error); 971 } 972 973 static int 974 mntgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 975 caller_context_t *ct) 976 { 977 mntnode_t *mnp = VTOM(vp); 978 int error; 979 vnode_t *rvp; 980 extern timespec_t vfs_mnttab_ctime; 981 mntdata_t *mntdata = MTOD(VTOM(vp)); 982 mntsnap_t *snap; 983 984 rw_enter(&mnp->mnt_contents, RW_READER); 985 snap = mnp->mnt_read.mnts_nmnts ? &mnp->mnt_read : &mnp->mnt_ioctl; 986 /* 987 * Return all the attributes. Should be refined 988 * so that it returns only those asked for. 989 * Most of this is complete fakery anyway. 990 */ 991 rvp = mnp->mnt_mountvp; 992 /* 993 * Attributes are same as underlying file with modifications 994 */ 995 if (error = VOP_GETATTR(rvp, vap, flags, cr, ct)) { 996 rw_exit(&mnp->mnt_contents); 997 return (error); 998 } 999 1000 /* 1001 * We always look like a regular file 1002 */ 1003 vap->va_type = VREG; 1004 /* 1005 * mode should basically be read only 1006 */ 1007 vap->va_mode &= 07444; 1008 vap->va_fsid = vp->v_vfsp->vfs_dev; 1009 vap->va_blksize = DEV_BSIZE; 1010 vap->va_rdev = 0; 1011 vap->va_seq = 0; 1012 /* 1013 * Set nlink to the number of open vnodes for mnttab info 1014 * plus one for existing. 1015 */ 1016 vap->va_nlink = mntdata->mnt_nopen + 1; 1017 /* 1018 * If we haven't taken a snapshot yet, set the 1019 * size to the size of the latest snapshot. 1020 */ 1021 vap->va_size = snap->mnts_text_size ? snap->mnts_text_size : 1022 mntdata->mnt_size; 1023 rw_exit(&mnp->mnt_contents); 1024 /* 1025 * Fetch mtime from the vfs mnttab timestamp 1026 */ 1027 vap->va_ctime = vfs_mnttab_ctime; 1028 vfs_list_read_lock(); 1029 vfs_mnttab_modtime(&vap->va_mtime); 1030 vap->va_atime = vap->va_mtime; 1031 vfs_list_unlock(); 1032 /* 1033 * Nodeid is always ROOTINO; 1034 */ 1035 vap->va_nodeid = (ino64_t)MNTROOTINO; 1036 vap->va_nblocks = btod(vap->va_size); 1037 return (0); 1038 } 1039 1040 1041 static int 1042 mntaccess(vnode_t *vp, int mode, int flags, cred_t *cr, 1043 caller_context_t *ct) 1044 { 1045 mntnode_t *mnp = VTOM(vp); 1046 1047 if (mode & (VWRITE|VEXEC)) 1048 return (EROFS); 1049 1050 /* 1051 * Do access check on the underlying directory vnode. 1052 */ 1053 return (VOP_ACCESS(mnp->mnt_mountvp, mode, flags, cr, ct)); 1054 } 1055 1056 1057 /* 1058 * New /mntfs vnode required; allocate it and fill in most of the fields. 1059 */ 1060 static mntnode_t * 1061 mntgetnode(vnode_t *dp) 1062 { 1063 mntnode_t *mnp; 1064 vnode_t *vp; 1065 1066 mnp = kmem_zalloc(sizeof (mntnode_t), KM_SLEEP); 1067 mnp->mnt_vnode = vn_alloc(KM_SLEEP); 1068 mnp->mnt_mountvp = VTOM(dp)->mnt_mountvp; 1069 rw_init(&mnp->mnt_contents, NULL, RW_DEFAULT, NULL); 1070 vp = MTOV(mnp); 1071 vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT; 1072 vn_setops(vp, mntvnodeops); 1073 vp->v_vfsp = dp->v_vfsp; 1074 vp->v_type = VREG; 1075 vp->v_data = (caddr_t)mnp; 1076 1077 return (mnp); 1078 } 1079 1080 /* 1081 * Free the storage obtained from mntgetnode(). 1082 */ 1083 static void 1084 mntfreenode(mntnode_t *mnp) 1085 { 1086 vnode_t *vp = MTOV(mnp); 1087 1088 rw_destroy(&mnp->mnt_contents); 1089 vn_invalid(vp); 1090 vn_free(vp); 1091 kmem_free(mnp, sizeof (*mnp)); 1092 } 1093 1094 1095 /* ARGSUSED */ 1096 static int 1097 mntfsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 1098 { 1099 return (0); 1100 } 1101 1102 /* ARGSUSED */ 1103 static void 1104 mntinactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 1105 { 1106 mntnode_t *mnp = VTOM(vp); 1107 1108 mntfreenode(mnp); 1109 } 1110 1111 /* 1112 * lseek(2) is supported only to rewind the file. Rewinding has a special 1113 * meaning for /etc/mnttab: it forces mntfs to refresh the snapshot at the next 1114 * read() or ioctl(). 1115 * 1116 * The generic lseek() code will have already changed the file offset. Therefore 1117 * mntread() can detect a rewind simply by looking for a zero offset. For the 1118 * benefit of mntioctl() we advertise a rewind with a specific flag. 1119 */ 1120 /* ARGSUSED */ 1121 static int 1122 mntseek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct) 1123 { 1124 mntnode_t *mnp = VTOM(vp); 1125 1126 if (*noffp == 0) { 1127 rw_enter(&mnp->mnt_contents, RW_WRITER); 1128 mnp->mnt_ioctl.mnts_flags |= MNTS_REWIND; 1129 rw_exit(&mnp->mnt_contents); 1130 } 1131 1132 return (0); 1133 } 1134 1135 /* 1136 * Return the answer requested to poll(). 1137 * POLLRDBAND will return when the mtime of the mnttab 1138 * information is newer than the latest one read for this open. 1139 */ 1140 /* ARGSUSED */ 1141 static int 1142 mntpoll(vnode_t *vp, short ev, int any, short *revp, pollhead_t **phpp, 1143 caller_context_t *ct) 1144 { 1145 mntnode_t *mnp = VTOM(vp); 1146 mntsnap_t *snapp; 1147 1148 rw_enter(&mnp->mnt_contents, RW_READER); 1149 if (mntfs_newest(&mnp->mnt_ioctl.mnts_last_mtime, 1150 &mnp->mnt_read.mnts_last_mtime) == MNTFS_FIRST) 1151 snapp = &mnp->mnt_ioctl; 1152 else 1153 snapp = &mnp->mnt_read; 1154 1155 *revp = 0; 1156 *phpp = (pollhead_t *)NULL; 1157 if (ev & POLLIN) 1158 *revp |= POLLIN; 1159 1160 if (ev & POLLRDNORM) 1161 *revp |= POLLRDNORM; 1162 1163 if (ev & POLLRDBAND) { 1164 vfs_mnttab_poll(&snapp->mnts_last_mtime, phpp); 1165 if (*phpp == (pollhead_t *)NULL) 1166 *revp |= POLLRDBAND; 1167 } 1168 rw_exit(&mnp->mnt_contents); 1169 1170 if (*revp || *phpp != NULL || any) { 1171 return (0); 1172 } 1173 /* 1174 * If someone is polling an unsupported poll events (e.g. 1175 * POLLOUT, POLLPRI, etc.), just return POLLERR revents. 1176 * That way we will ensure that we don't return a 0 1177 * revents with a NULL pollhead pointer. 1178 */ 1179 *revp = POLLERR; 1180 return (0); 1181 } 1182 1183 /* 1184 * mntfs_same_word() returns 1 if two words are the same in the context of 1185 * MNTIOC_GETMNTANY and 0 otherwise. 1186 * 1187 * worda is a memory address that lies somewhere in the buffer bufa; it cannot 1188 * be NULL since this is used to indicate to getmntany(3C) that the user does 1189 * not wish to match a particular field. The text to which worda points is 1190 * supplied by the user; if it is not null-terminated then it cannot match. 1191 * 1192 * Buffer bufb contains a line from /etc/mnttab, in which the fields are 1193 * delimited by tab or new-line characters. offb is the offset of the second 1194 * word within this buffer. 1195 * 1196 * mntfs_same_word() returns 1 if the words are the same and 0 otherwise. 1197 */ 1198 int 1199 mntfs_same_word(char *worda, char *bufa, size_t sizea, off_t offb, char *bufb, 1200 size_t sizeb) 1201 { 1202 char *wordb = bufb + offb; 1203 int bytes_remaining; 1204 1205 ASSERT(worda != NULL); 1206 1207 bytes_remaining = MIN(((bufa + sizea) - worda), 1208 ((bufb + sizeb) - wordb)); 1209 while (bytes_remaining && *worda == *wordb) { 1210 worda++; 1211 wordb++; 1212 bytes_remaining--; 1213 } 1214 if (bytes_remaining && 1215 *worda == '\0' && (*wordb == '\t' || *wordb == '\n')) 1216 return (1); 1217 else 1218 return (0); 1219 } 1220 1221 /* 1222 * mntfs_special_info_string() returns which, if either, of VBLK or VCHR 1223 * corresponds to a supplied path. If the path is a special device then the 1224 * function optionally sets the major and minor numbers. 1225 */ 1226 vtype_t 1227 mntfs_special_info_string(char *path, uint_t *major, uint_t *minor, cred_t *cr) 1228 { 1229 vattr_t vattr; 1230 vnode_t *vp; 1231 vtype_t type; 1232 int error; 1233 1234 if (path == NULL || *path != '/' || 1235 lookupnameat(path + 1, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp, rootdir)) 1236 return (0); 1237 1238 vattr.va_mask = AT_TYPE | AT_RDEV; 1239 error = VOP_GETATTR(vp, &vattr, ATTR_REAL, cr, NULL); 1240 VN_RELE(vp); 1241 1242 if (error == 0 && ((type = vattr.va_type) == VBLK || type == VCHR)) { 1243 if (major && minor) { 1244 *major = getmajor(vattr.va_rdev); 1245 *minor = getminor(vattr.va_rdev); 1246 } 1247 return (type); 1248 } else { 1249 return (0); 1250 } 1251 } 1252 1253 /* 1254 * mntfs_special_info_element() extracts the name of the mounted resource 1255 * for a given element and copies it into a null-terminated string, which it 1256 * then passes to mntfs_special_info_string(). 1257 */ 1258 vtype_t 1259 mntfs_special_info_element(mntelem_t *elemp, cred_t *cr) 1260 { 1261 char *newpath; 1262 vtype_t type; 1263 1264 newpath = kmem_alloc(elemp->mnte_text_size, KM_SLEEP); 1265 bcopy(elemp->mnte_text, newpath, (off_t)(elemp->mnte_tab.mnt_mountp)); 1266 *(newpath + (off_t)elemp->mnte_tab.mnt_mountp - 1) = '\0'; 1267 type = mntfs_special_info_string(newpath, NULL, NULL, cr); 1268 kmem_free(newpath, elemp->mnte_text_size); 1269 1270 return (type); 1271 } 1272 1273 /* 1274 * Convert an address that points to a byte within a user buffer into an 1275 * address that points to the corresponding offset within a kernel buffer. If 1276 * the user address is NULL then make no conversion. If the address does not 1277 * lie within the buffer then reset it to NULL. 1278 */ 1279 char * 1280 mntfs_import_addr(char *uaddr, char *ubufp, char *kbufp, size_t bufsize) 1281 { 1282 if (uaddr < ubufp || uaddr >= ubufp + bufsize) 1283 return (NULL); 1284 else 1285 return (kbufp + (uaddr - ubufp)); 1286 } 1287 1288 /* 1289 * These 32-bit versions are to support STRUCT_DECL(9F) etc. in 1290 * mntfs_copyout_element() and mntioctl(). 1291 */ 1292 #ifdef _SYSCALL32_IMPL 1293 typedef struct extmnttab32 { 1294 uint32_t mnt_special; 1295 uint32_t mnt_mountp; 1296 uint32_t mnt_fstype; 1297 uint32_t mnt_mntopts; 1298 uint32_t mnt_time; 1299 uint_t mnt_major; 1300 uint_t mnt_minor; 1301 } extmnttab32_t; 1302 1303 typedef struct mnttab32 { 1304 uint32_t mnt_special; 1305 uint32_t mnt_mountp; 1306 uint32_t mnt_fstype; 1307 uint32_t mnt_mntopts; 1308 uint32_t mnt_time; 1309 } mnttab32_t; 1310 1311 struct mntentbuf32 { 1312 uint32_t mbuf_emp; 1313 uint_t mbuf_bufsize; 1314 uint32_t mbuf_buf; 1315 }; 1316 #endif 1317 1318 /* 1319 * mntfs_copyout_element() is common code for the MNTIOC_GETMNTENT, 1320 * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY ioctls. Having identifed the 1321 * database element desired by the user, this function copies out the text and 1322 * the pointers to the relevant userland addresses. It returns 0 on success 1323 * and non-zero otherwise. 1324 */ 1325 int 1326 mntfs_copyout_elem(mntelem_t *elemp, struct extmnttab *uemp, 1327 char *ubufp, int cmd, int datamodel) 1328 { 1329 STRUCT_DECL(extmnttab, ktab); 1330 char *dbbufp = elemp->mnte_text; 1331 size_t dbbufsize = elemp->mnte_text_size; 1332 struct extmnttab *dbtabp = &elemp->mnte_tab; 1333 size_t ssize; 1334 char *kbufp; 1335 int error = 0; 1336 1337 1338 /* 1339 * We create a struct extmnttab within the kernel of the size 1340 * determined by the user's data model. We then populate its 1341 * fields by combining the start address of the text buffer 1342 * supplied by the user, ubufp, with the offsets stored for 1343 * this database element within dbtabp, a pointer to a struct 1344 * extmnttab. 1345 * 1346 * Note that if the corresponding field is "-" this signifies 1347 * no real content, and we set the address to NULL. This does 1348 * not apply to mnt_time. 1349 */ 1350 STRUCT_INIT(ktab, datamodel); 1351 STRUCT_FSETP(ktab, mnt_special, 1352 MNTFS_REAL_FIELD(dbbufp) ? ubufp : NULL); 1353 STRUCT_FSETP(ktab, mnt_mountp, 1354 MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_mountp) ? 1355 ubufp + (off_t)dbtabp->mnt_mountp : NULL); 1356 STRUCT_FSETP(ktab, mnt_fstype, 1357 MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_fstype) ? 1358 ubufp + (off_t)dbtabp->mnt_fstype : NULL); 1359 STRUCT_FSETP(ktab, mnt_mntopts, 1360 MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_mntopts) ? 1361 ubufp + (off_t)dbtabp->mnt_mntopts : NULL); 1362 STRUCT_FSETP(ktab, mnt_time, 1363 ubufp + (off_t)dbtabp->mnt_time); 1364 if (cmd == MNTIOC_GETEXTMNTENT) { 1365 STRUCT_FSETP(ktab, mnt_major, dbtabp->mnt_major); 1366 STRUCT_FSETP(ktab, mnt_minor, dbtabp->mnt_minor); 1367 ssize = SIZEOF_STRUCT(extmnttab, datamodel); 1368 } else { 1369 ssize = SIZEOF_STRUCT(mnttab, datamodel); 1370 } 1371 if (copyout(STRUCT_BUF(ktab), uemp, ssize)) 1372 return (EFAULT); 1373 1374 /* 1375 * We create a text buffer in the kernel into which we copy the 1376 * /etc/mnttab entry for this element. We change the tab and 1377 * new-line delimiters to null bytes before copying out the 1378 * buffer. 1379 */ 1380 kbufp = kmem_alloc(dbbufsize, KM_SLEEP); 1381 bcopy(elemp->mnte_text, kbufp, dbbufsize); 1382 *(kbufp + (off_t)dbtabp->mnt_mountp - 1) = 1383 *(kbufp + (off_t)dbtabp->mnt_fstype - 1) = 1384 *(kbufp + (off_t)dbtabp->mnt_mntopts - 1) = 1385 *(kbufp + (off_t)dbtabp->mnt_time - 1) = 1386 *(kbufp + dbbufsize - 1) = '\0'; 1387 if (copyout(kbufp, ubufp, dbbufsize)) 1388 error = EFAULT; 1389 1390 kmem_free(kbufp, dbbufsize); 1391 return (error); 1392 } 1393 1394 /* ARGSUSED */ 1395 static int 1396 mntioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, cred_t *cr, 1397 int *rvalp, caller_context_t *ct) 1398 { 1399 uint_t *up = (uint_t *)arg; 1400 mntnode_t *mnp = VTOM(vp); 1401 mntsnap_t *snapp = &mnp->mnt_ioctl; 1402 int error = 0; 1403 zone_t *zonep = MTOD(mnp)->mnt_zone; 1404 krwlock_t *dblockp = &zonep->zone_mntfs_db_lock; 1405 model_t datamodel = flag & DATAMODEL_MASK; 1406 1407 switch (cmd) { 1408 1409 case MNTIOC_NMNTS: /* get no. of mounted resources */ 1410 { 1411 rw_enter(&mnp->mnt_contents, RW_READER); 1412 if (snapp->mnts_nmnts == 0 || 1413 (snapp->mnts_flags & MNTS_REWIND)) { 1414 if (!rw_tryupgrade(&mnp->mnt_contents)) { 1415 rw_exit(&mnp->mnt_contents); 1416 rw_enter(&mnp->mnt_contents, RW_WRITER); 1417 } 1418 if (snapp->mnts_nmnts == 0 || 1419 (snapp->mnts_flags & MNTS_REWIND)) 1420 mntfs_snapshot(mnp, snapp); 1421 } 1422 rw_exit(&mnp->mnt_contents); 1423 1424 if (suword32(up, snapp->mnts_nmnts) != 0) 1425 error = EFAULT; 1426 break; 1427 } 1428 1429 case MNTIOC_GETDEVLIST: /* get mounted device major/minor nos */ 1430 { 1431 size_t len; 1432 uint_t *devlist; 1433 mntelem_t *elemp; 1434 int i = 0; 1435 1436 rw_enter(&mnp->mnt_contents, RW_READER); 1437 if (snapp->mnts_nmnts == 0 || 1438 (snapp->mnts_flags & MNTS_REWIND)) { 1439 if (!rw_tryupgrade(&mnp->mnt_contents)) { 1440 rw_exit(&mnp->mnt_contents); 1441 rw_enter(&mnp->mnt_contents, RW_WRITER); 1442 } 1443 if (snapp->mnts_nmnts == 0 || 1444 (snapp->mnts_flags & MNTS_REWIND)) 1445 mntfs_snapshot(mnp, snapp); 1446 rw_downgrade(&mnp->mnt_contents); 1447 } 1448 1449 /* Create a local buffer to hold the device numbers. */ 1450 len = 2 * snapp->mnts_nmnts * sizeof (uint_t); 1451 devlist = kmem_alloc(len, KM_SLEEP); 1452 1453 /* 1454 * Walk the database elements for this snapshot and add their 1455 * major and minor numbers. 1456 */ 1457 rw_enter(dblockp, RW_READER); 1458 for (elemp = snapp->mnts_first; elemp; 1459 elemp = mntfs_get_next_elem(snapp, elemp)) { 1460 devlist[2 * i] = elemp->mnte_tab.mnt_major; 1461 devlist[2 * i + 1] = elemp->mnte_tab.mnt_minor; 1462 i++; 1463 } 1464 rw_exit(dblockp); 1465 ASSERT(i == snapp->mnts_nmnts); 1466 rw_exit(&mnp->mnt_contents); 1467 1468 error = xcopyout(devlist, up, len); 1469 kmem_free(devlist, len); 1470 break; 1471 } 1472 1473 case MNTIOC_SETTAG: /* set tag on mounted file system */ 1474 case MNTIOC_CLRTAG: /* clear tag on mounted file system */ 1475 { 1476 struct mnttagdesc *dp = (struct mnttagdesc *)arg; 1477 STRUCT_DECL(mnttagdesc, tagdesc); 1478 char *cptr; 1479 uint32_t major, minor; 1480 char tagbuf[MAX_MNTOPT_TAG]; 1481 char *pbuf; 1482 size_t len; 1483 uint_t start = 0; 1484 mntdata_t *mntdata = MTOD(mnp); 1485 zone_t *zone = mntdata->mnt_zone; 1486 1487 STRUCT_INIT(tagdesc, flag & DATAMODEL_MASK); 1488 if (copyin(dp, STRUCT_BUF(tagdesc), STRUCT_SIZE(tagdesc))) { 1489 error = EFAULT; 1490 break; 1491 } 1492 pbuf = kmem_alloc(MAXPATHLEN, KM_SLEEP); 1493 if (zone != global_zone) { 1494 (void) strcpy(pbuf, zone->zone_rootpath); 1495 /* truncate "/" and nul */ 1496 start = zone->zone_rootpathlen - 2; 1497 ASSERT(pbuf[start] == '/'); 1498 } 1499 cptr = STRUCT_FGETP(tagdesc, mtd_mntpt); 1500 error = copyinstr(cptr, pbuf + start, MAXPATHLEN - start, &len); 1501 if (error) { 1502 kmem_free(pbuf, MAXPATHLEN); 1503 break; 1504 } 1505 if (start != 0 && pbuf[start] != '/') { 1506 kmem_free(pbuf, MAXPATHLEN); 1507 error = EINVAL; 1508 break; 1509 } 1510 cptr = STRUCT_FGETP(tagdesc, mtd_tag); 1511 if ((error = copyinstr(cptr, tagbuf, MAX_MNTOPT_TAG, &len))) { 1512 kmem_free(pbuf, MAXPATHLEN); 1513 break; 1514 } 1515 major = STRUCT_FGET(tagdesc, mtd_major); 1516 minor = STRUCT_FGET(tagdesc, mtd_minor); 1517 if (cmd == MNTIOC_SETTAG) 1518 error = vfs_settag(major, minor, pbuf, tagbuf, cr); 1519 else 1520 error = vfs_clrtag(major, minor, pbuf, tagbuf, cr); 1521 kmem_free(pbuf, MAXPATHLEN); 1522 break; 1523 } 1524 1525 case MNTIOC_SHOWHIDDEN: 1526 { 1527 mutex_enter(&vp->v_lock); 1528 mnp->mnt_flags |= MNT_SHOWHIDDEN; 1529 mutex_exit(&vp->v_lock); 1530 break; 1531 } 1532 1533 case MNTIOC_GETMNTANY: 1534 { 1535 STRUCT_DECL(mntentbuf, embuf); /* Our copy of user's embuf */ 1536 STRUCT_DECL(extmnttab, ktab); /* Out copy of user's emp */ 1537 struct extmnttab *uemp; /* uaddr of user's emp */ 1538 char *ubufp; /* uaddr of user's text buf */ 1539 size_t ubufsize; /* size of the above */ 1540 struct extmnttab preftab; /* our version of user's emp */ 1541 char *prefbuf; /* our copy of user's text */ 1542 mntelem_t *elemp; /* a database element */ 1543 struct extmnttab *dbtabp; /* element's extmnttab */ 1544 char *dbbufp; /* element's text buf */ 1545 size_t dbbufsize; /* size of the above */ 1546 vtype_t type; /* type, if any, of special */ 1547 1548 1549 /* 1550 * embuf is a struct embuf within the kernel. We copy into it 1551 * the struct embuf supplied by the user. 1552 */ 1553 STRUCT_INIT(embuf, datamodel); 1554 if (copyin((void *) arg, STRUCT_BUF(embuf), 1555 STRUCT_SIZE(embuf))) { 1556 error = EFAULT; 1557 break; 1558 } 1559 uemp = STRUCT_FGETP(embuf, mbuf_emp); 1560 ubufp = STRUCT_FGETP(embuf, mbuf_buf); 1561 ubufsize = STRUCT_FGET(embuf, mbuf_bufsize); 1562 1563 /* 1564 * Check that the text buffer offered by the user is the 1565 * agreed size. 1566 */ 1567 if (ubufsize != MNT_LINE_MAX) { 1568 error = EINVAL; 1569 break; 1570 } 1571 1572 /* Copy the user-supplied entry into a local buffer. */ 1573 prefbuf = kmem_alloc(MNT_LINE_MAX, KM_SLEEP); 1574 if (copyin(ubufp, prefbuf, MNT_LINE_MAX)) { 1575 kmem_free(prefbuf, MNT_LINE_MAX); 1576 error = EFAULT; 1577 break; 1578 } 1579 1580 /* Ensure that any string within it is null-terminated. */ 1581 *(prefbuf + MNT_LINE_MAX - 1) = 0; 1582 1583 /* Copy in the user-supplied mpref */ 1584 STRUCT_INIT(ktab, datamodel); 1585 if (copyin(uemp, STRUCT_BUF(ktab), 1586 SIZEOF_STRUCT(mnttab, datamodel))) { 1587 kmem_free(prefbuf, MNT_LINE_MAX); 1588 error = EFAULT; 1589 break; 1590 } 1591 1592 /* 1593 * Copy the members of the user's pref struct into a local 1594 * struct. The pointers need to be offset and verified to 1595 * ensure that they lie within the bounds of the buffer. 1596 */ 1597 preftab.mnt_special = mntfs_import_addr(STRUCT_FGETP(ktab, 1598 mnt_special), ubufp, prefbuf, MNT_LINE_MAX); 1599 preftab.mnt_mountp = mntfs_import_addr(STRUCT_FGETP(ktab, 1600 mnt_mountp), ubufp, prefbuf, MNT_LINE_MAX); 1601 preftab.mnt_fstype = mntfs_import_addr(STRUCT_FGETP(ktab, 1602 mnt_fstype), ubufp, prefbuf, MNT_LINE_MAX); 1603 preftab.mnt_mntopts = mntfs_import_addr(STRUCT_FGETP(ktab, 1604 mnt_mntopts), ubufp, prefbuf, MNT_LINE_MAX); 1605 preftab.mnt_time = mntfs_import_addr(STRUCT_FGETP(ktab, 1606 mnt_time), ubufp, prefbuf, MNT_LINE_MAX); 1607 1608 /* 1609 * If the user specifies a mounted resource that is a special 1610 * device then we capture its mode and major and minor numbers; 1611 * c.f. the block comment below. 1612 */ 1613 type = mntfs_special_info_string(preftab.mnt_special, 1614 &preftab.mnt_major, &preftab.mnt_minor, cr); 1615 1616 rw_enter(&mnp->mnt_contents, RW_WRITER); 1617 if (snapp->mnts_nmnts == 0 || 1618 (snapp->mnts_flags & MNTS_REWIND)) 1619 mntfs_snapshot(mnp, snapp); 1620 1621 /* 1622 * This is the core functionality that implements getmntany(). 1623 * We walk through the mntfs database until we find an element 1624 * matching the user's preferences that are contained in 1625 * preftab. Typically, this means checking that the text 1626 * matches. However, the mounted resource is special: if the 1627 * user is looking for a special device then we must find a 1628 * database element with the same major and minor numbers and 1629 * the same type, i.e. VBLK or VCHR. The type is not recorded 1630 * in the element because it cannot be inferred from the vfs_t. 1631 * We therefore check the type of suitable candidates via 1632 * mntfs_special_info_element(); since this calls into the 1633 * underlying file system we make sure to drop the database lock 1634 * first. 1635 */ 1636 elemp = snapp->mnts_next; 1637 rw_enter(dblockp, RW_READER); 1638 for (;;) { 1639 for (; elemp; elemp = mntfs_get_next_elem(snapp, 1640 elemp)) { 1641 dbtabp = &elemp->mnte_tab; 1642 dbbufp = elemp->mnte_text; 1643 dbbufsize = elemp->mnte_text_size; 1644 1645 if (((type && 1646 dbtabp->mnt_major == preftab.mnt_major && 1647 dbtabp->mnt_minor == preftab.mnt_minor && 1648 MNTFS_REAL_FIELD(dbbufp)) || 1649 (!type && (!preftab.mnt_special || 1650 mntfs_same_word(preftab.mnt_special, 1651 prefbuf, MNT_LINE_MAX, (off_t)0, dbbufp, 1652 dbbufsize)))) && 1653 1654 (!preftab.mnt_mountp || mntfs_same_word( 1655 preftab.mnt_mountp, prefbuf, MNT_LINE_MAX, 1656 (off_t)dbtabp->mnt_mountp, dbbufp, 1657 dbbufsize)) && 1658 1659 (!preftab.mnt_fstype || mntfs_same_word( 1660 preftab.mnt_fstype, prefbuf, MNT_LINE_MAX, 1661 (off_t)dbtabp->mnt_fstype, dbbufp, 1662 dbbufsize)) && 1663 1664 (!preftab.mnt_mntopts || mntfs_same_word( 1665 preftab.mnt_mntopts, prefbuf, MNT_LINE_MAX, 1666 (off_t)dbtabp->mnt_mntopts, dbbufp, 1667 dbbufsize)) && 1668 1669 (!preftab.mnt_time || mntfs_same_word( 1670 preftab.mnt_time, prefbuf, MNT_LINE_MAX, 1671 (off_t)dbtabp->mnt_time, dbbufp, 1672 dbbufsize))) 1673 break; 1674 } 1675 rw_exit(dblockp); 1676 1677 if (elemp == NULL || type == 0 || 1678 type == mntfs_special_info_element(elemp, cr)) 1679 break; 1680 1681 rw_enter(dblockp, RW_READER); 1682 elemp = mntfs_get_next_elem(snapp, elemp); 1683 } 1684 1685 kmem_free(prefbuf, MNT_LINE_MAX); 1686 1687 /* If we failed to find a match then return EOF. */ 1688 if (elemp == NULL) { 1689 rw_exit(&mnp->mnt_contents); 1690 *rvalp = MNTFS_EOF; 1691 break; 1692 } 1693 1694 /* 1695 * Check that the text buffer offered by the user will be large 1696 * enough to accommodate the text for this entry. 1697 */ 1698 if (elemp->mnte_text_size > MNT_LINE_MAX) { 1699 rw_exit(&mnp->mnt_contents); 1700 *rvalp = MNTFS_TOOLONG; 1701 break; 1702 } 1703 1704 /* 1705 * Populate the user's struct mnttab and text buffer using the 1706 * element's contents. 1707 */ 1708 if (mntfs_copyout_elem(elemp, uemp, ubufp, cmd, datamodel)) { 1709 error = EFAULT; 1710 } else { 1711 rw_enter(dblockp, RW_READER); 1712 elemp = mntfs_get_next_elem(snapp, elemp); 1713 rw_exit(dblockp); 1714 snapp->mnts_next = elemp; 1715 } 1716 rw_exit(&mnp->mnt_contents); 1717 break; 1718 } 1719 1720 case MNTIOC_GETMNTENT: 1721 case MNTIOC_GETEXTMNTENT: 1722 { 1723 STRUCT_DECL(mntentbuf, embuf); /* Our copy of user's embuf */ 1724 struct extmnttab *uemp; /* uaddr of user's emp */ 1725 char *ubufp; /* uaddr of user's text buf */ 1726 size_t ubufsize; /* size of the above */ 1727 mntelem_t *elemp; /* a database element */ 1728 1729 1730 rw_enter(&mnp->mnt_contents, RW_WRITER); 1731 if (snapp->mnts_nmnts == 0 || 1732 (snapp->mnts_flags & MNTS_REWIND)) 1733 mntfs_snapshot(mnp, snapp); 1734 if ((elemp = snapp->mnts_next) == NULL) { 1735 rw_exit(&mnp->mnt_contents); 1736 *rvalp = MNTFS_EOF; 1737 break; 1738 } 1739 1740 /* 1741 * embuf is a struct embuf within the kernel. We copy into it 1742 * the struct embuf supplied by the user. 1743 */ 1744 STRUCT_INIT(embuf, datamodel); 1745 if (copyin((void *) arg, STRUCT_BUF(embuf), 1746 STRUCT_SIZE(embuf))) { 1747 rw_exit(&mnp->mnt_contents); 1748 error = EFAULT; 1749 break; 1750 } 1751 uemp = STRUCT_FGETP(embuf, mbuf_emp); 1752 ubufp = STRUCT_FGETP(embuf, mbuf_buf); 1753 ubufsize = STRUCT_FGET(embuf, mbuf_bufsize); 1754 1755 /* 1756 * Check that the text buffer offered by the user will be large 1757 * enough to accommodate the text for this entry. 1758 */ 1759 if (elemp->mnte_text_size > ubufsize) { 1760 rw_exit(&mnp->mnt_contents); 1761 *rvalp = MNTFS_TOOLONG; 1762 break; 1763 } 1764 1765 /* 1766 * Populate the user's struct mnttab and text buffer using the 1767 * element's contents. 1768 */ 1769 if (mntfs_copyout_elem(elemp, uemp, ubufp, cmd, datamodel)) { 1770 error = EFAULT; 1771 } else { 1772 rw_enter(dblockp, RW_READER); 1773 elemp = mntfs_get_next_elem(snapp, elemp); 1774 rw_exit(dblockp); 1775 snapp->mnts_next = elemp; 1776 } 1777 rw_exit(&mnp->mnt_contents); 1778 break; 1779 } 1780 1781 default: 1782 error = EINVAL; 1783 break; 1784 } 1785 1786 return (error); 1787 } 1788 1789 /* 1790 * /mntfs vnode operations vector 1791 */ 1792 const fs_operation_def_t mnt_vnodeops_template[] = { 1793 VOPNAME_OPEN, { .vop_open = mntopen }, 1794 VOPNAME_CLOSE, { .vop_close = mntclose }, 1795 VOPNAME_READ, { .vop_read = mntread }, 1796 VOPNAME_IOCTL, { .vop_ioctl = mntioctl }, 1797 VOPNAME_GETATTR, { .vop_getattr = mntgetattr }, 1798 VOPNAME_ACCESS, { .vop_access = mntaccess }, 1799 VOPNAME_FSYNC, { .vop_fsync = mntfsync }, 1800 VOPNAME_INACTIVE, { .vop_inactive = mntinactive }, 1801 VOPNAME_SEEK, { .vop_seek = mntseek }, 1802 VOPNAME_POLL, { .vop_poll = mntpoll }, 1803 VOPNAME_DISPOSE, { .error = fs_error }, 1804 VOPNAME_SHRLOCK, { .error = fs_error }, 1805 NULL, NULL 1806 }; 1807