xref: /titanic_52/usr/src/uts/common/fs/mntfs/mntvnops.c (revision 1e49577a7fcde812700ded04431b49d67cc57d6d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/file.h>
27 #include <sys/stat.h>
28 #include <sys/atomic.h>
29 #include <sys/mntio.h>
30 #include <sys/mnttab.h>
31 #include <sys/mount.h>
32 #include <sys/sunddi.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/vfs.h>
36 #include <sys/vfs_opreg.h>
37 #include <sys/fs/mntdata.h>
38 #include <fs/fs_subr.h>
39 #include <sys/vmsystm.h>
40 #include <vm/seg_vn.h>
41 #include <sys/time.h>
42 #include <sys/ksynch.h>
43 #include <sys/sdt.h>
44 
45 #define	MNTROOTINO	2
46 
47 static mntnode_t *mntgetnode(vnode_t *);
48 
49 vnodeops_t *mntvnodeops;
50 extern void vfs_mnttab_readop(void);
51 
52 /*
53  * Design of kernel mnttab accounting.
54  *
55  * mntfs provides two methods of reading the in-kernel mnttab, i.e. the state of
56  * the mounted resources: the read-only file /etc/mnttab, and a collection of
57  * ioctl() commands. Most of these interfaces are public and are described in
58  * mnttab(4). Three private ioctl() commands, MNTIOC_GETMNTENT,
59  * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY, provide for the getmntent(3C)
60  * family of functions, allowing them to support white space in mount names.
61  *
62  * A significant feature of mntfs is that it provides a file descriptor with a
63  * snapshot once it begins to consume mnttab data. Thus, as the process
64  * continues to consume data, its view of the in-kernel mnttab does not change
65  * even if resources are mounted or unmounted. The intent is to ensure that
66  * processes are guaranteed to read self-consistent data even as the system
67  * changes.
68  *
69  * The snapshot is implemented by a "database", unique to each zone, that
70  * comprises a linked list of mntelem_ts. The database is identified by
71  * zone_mntfs_db and is protected by zone_mntfs_db_lock. Each element contains
72  * the text entry in /etc/mnttab for a mounted resource, i.e. a vfs_t, and is
73  * marked with its time of "birth", i.e. creation. An element is "killed", and
74  * marked with its time of death, when it is found to be out of date, e.g. when
75  * the corresponding resource has been unmounted.
76  *
77  * When a process performs the first read() or ioctl() for a file descriptor for
78  * /etc/mnttab, the database is updated by a call to mntfs_snapshot() to ensure
79  * that an element exists for each currently mounted resource. Following this,
80  * the current time is written into a snapshot structure, a mntsnap_t, embedded
81  * in the descriptor's mntnode_t.
82  *
83  * mntfs is able to enumerate the /etc/mnttab entries corresponding to a
84  * particular file descriptor by searching the database for entries that were
85  * born before the appropriate snapshot and that either are still alive or died
86  * after the snapshot was created. Consumers use the iterator function
87  * mntfs_get_next_elem() to identify the next suitable element in the database.
88  *
89  * Each snapshot has a hold on its corresponding database elements, effected by
90  * a per-element reference count. At last close(), a snapshot is destroyed in
91  * mntfs_freesnap() by releasing all of its holds; an element is destroyed if
92  * its reference count becomes zero. Therefore the database never exists unless
93  * there is at least one active consumer of /etc/mnttab.
94  *
95  * getmntent(3C) et al. "do not open, close or rewind the file." This implies
96  * that getmntent() and read() must be able to operate without interaction on
97  * the same file descriptor; this is accomplished by the use of separate
98  * mntsnap_ts for both read() and ioctl().
99  *
100  * mntfs observes the following lock-ordering:
101  *
102  *	mnp->mnt_contents -> vfslist -> zonep->zone_mntfs_db_lock
103  *
104  * NOTE: The following variable enables the generation of the "dev=xxx"
105  * in the option string for a mounted file system.  Really this should
106  * be gotten rid of altogether, but for the sake of backwards compatibility
107  * we had to leave it in.  It is defined as a 32-bit device number.  This
108  * means that when 64-bit device numbers are in use, if either the major or
109  * minor part of the device number will not fit in a 16 bit quantity, the
110  * "dev=" will be set to NODEV (0x7fffffff).  See PSARC 1999/566 and
111  * 1999/131 for details.  The cmpldev() function used to generate the 32-bit
112  * device number handles this check and assigns the proper value.
113  */
114 int mntfs_enabledev = 1;	/* enable old "dev=xxx" option */
115 
116 extern void vfs_mono_time(timespec_t *);
117 enum { MNTFS_FIRST, MNTFS_SECOND, MNTFS_NEITHER };
118 
/*
 * Evaluate to non-zero if the field starting at x within a line of
 * /etc/mnttab holds real content, and to zero if it is merely the placeholder
 * "-". The time field is never a placeholder, so the character following the
 * "-" is only ever compared against a tab. Note that x may be evaluated twice.
 */
#define	MNTFS_REAL_FIELD(x)	((x)[0] != '-' || (x)[1] != '\t')
125 
126 static int
127 mntfs_devsize(struct vfs *vfsp)
128 {
129 	dev32_t odev;
130 
131 	(void) cmpldev(&odev, vfsp->vfs_dev);
132 	return (snprintf(NULL, 0, "dev=%x", odev));
133 }
134 
135 static int
136 mntfs_devprint(struct vfs *vfsp, char *buf)
137 {
138 	dev32_t odev;
139 
140 	(void) cmpldev(&odev, vfsp->vfs_dev);
141 	return (snprintf(buf, MAX_MNTOPT_STR, "dev=%x", odev));
142 }
143 
144 /* Identify which, if either, of two supplied timespec structs is newer. */
145 static int
146 mntfs_newest(timespec_t *a, timespec_t *b)
147 {
148 	if (a->tv_sec == b->tv_sec &&
149 	    a->tv_nsec == b->tv_nsec) {
150 		return (MNTFS_NEITHER);
151 	} else if (b->tv_sec > a->tv_sec ||
152 	    (b->tv_sec == a->tv_sec &&
153 	    b->tv_nsec > a->tv_nsec)) {
154 		return (MNTFS_SECOND);
155 	} else {
156 		return (MNTFS_FIRST);
157 	}
158 }
159 
160 static int
161 mntfs_optsize(struct vfs *vfsp)
162 {
163 	int i, size = 0;
164 	mntopt_t *mop;
165 
166 	for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
167 		mop = &vfsp->vfs_mntopts.mo_list[i];
168 		if (mop->mo_flags & MO_NODISPLAY)
169 			continue;
170 		if (mop->mo_flags & MO_SET) {
171 			if (size)
172 				size++; /* space for comma */
173 			size += strlen(mop->mo_name);
174 			/*
175 			 * count option value if there is one
176 			 */
177 			if (mop->mo_arg != NULL) {
178 				size += strlen(mop->mo_arg) + 1;
179 			}
180 		}
181 	}
182 	if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
183 		/*
184 		 * Add space for "zone=<zone_name>" if required.
185 		 */
186 		if (size)
187 			size++;	/* space for comma */
188 		size += sizeof ("zone=") - 1;
189 		size += strlen(vfsp->vfs_zone->zone_name);
190 	}
191 	if (mntfs_enabledev) {
192 		if (size != 0)
193 			size++; /* space for comma */
194 		size += mntfs_devsize(vfsp);
195 	}
196 	if (size == 0)
197 		size = strlen("-");
198 	return (size);
199 }
200 
201 static int
202 mntfs_optprint(struct vfs *vfsp, char *buf)
203 {
204 	int i, optinbuf = 0;
205 	mntopt_t *mop;
206 	char *origbuf = buf;
207 
208 	for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
209 		mop = &vfsp->vfs_mntopts.mo_list[i];
210 		if (mop->mo_flags & MO_NODISPLAY)
211 			continue;
212 		if (mop->mo_flags & MO_SET) {
213 			if (optinbuf)
214 				*buf++ = ',';
215 			else
216 				optinbuf = 1;
217 			buf += snprintf(buf, MAX_MNTOPT_STR,
218 			    "%s", mop->mo_name);
219 			/*
220 			 * print option value if there is one
221 			 */
222 			if (mop->mo_arg != NULL) {
223 				buf += snprintf(buf, MAX_MNTOPT_STR, "=%s",
224 				    mop->mo_arg);
225 			}
226 		}
227 	}
228 	if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
229 		if (optinbuf)
230 			*buf++ = ',';
231 		else
232 			optinbuf = 1;
233 		buf += snprintf(buf, MAX_MNTOPT_STR, "zone=%s",
234 		    vfsp->vfs_zone->zone_name);
235 	}
236 	if (mntfs_enabledev) {
237 		if (optinbuf++)
238 			*buf++ = ',';
239 		buf += mntfs_devprint(vfsp, buf);
240 	}
241 	if (!optinbuf) {
242 		buf += snprintf(buf, MAX_MNTOPT_STR, "-");
243 	}
244 	return (buf - origbuf);
245 }
246 
247 void
248 mntfs_populate_text(vfs_t *vfsp, zone_t *zonep, mntelem_t *elemp)
249 {
250 	struct extmnttab *tabp = &elemp->mnte_tab;
251 	const char *resource, *mntpt;
252 	char *cp = elemp->mnte_text;
253 	mntpt = refstr_value(vfsp->vfs_mntpt);
254 	resource = refstr_value(vfsp->vfs_resource);
255 
256 	tabp->mnt_special = 0;
257 	if (resource != NULL && resource[0] != '\0') {
258 		if (resource[0] != '/') {
259 			cp += snprintf(cp, MAXPATHLEN, "%s\t", resource);
260 		} else if (!ZONE_PATH_VISIBLE(resource, zonep)) {
261 			/*
262 			 * Use the mount point as the resource.
263 			 */
264 			cp += snprintf(cp, MAXPATHLEN, "%s\t",
265 			    ZONE_PATH_TRANSLATE(mntpt, zonep));
266 		} else {
267 			cp += snprintf(cp, MAXPATHLEN, "%s\t",
268 			    ZONE_PATH_TRANSLATE(resource, zonep));
269 		}
270 	} else {
271 		cp += snprintf(cp, MAXPATHLEN, "-\t");
272 	}
273 
274 	tabp->mnt_mountp = (char *)(cp - elemp->mnte_text);
275 	if (mntpt != NULL && mntpt[0] != '\0') {
276 		/*
277 		 * We know the mount point is visible from within the zone,
278 		 * otherwise it wouldn't be on the zone's vfs list.
279 		 */
280 		cp += snprintf(cp, MAXPATHLEN, "%s\t",
281 		    ZONE_PATH_TRANSLATE(mntpt, zonep));
282 	} else {
283 		cp += snprintf(cp, MAXPATHLEN, "-\t");
284 	}
285 
286 	tabp->mnt_fstype = (char *)(cp - elemp->mnte_text);
287 	cp += snprintf(cp, MAXPATHLEN, "%s\t",
288 	    vfssw[vfsp->vfs_fstype].vsw_name);
289 
290 	tabp->mnt_mntopts = (char *)(cp - elemp->mnte_text);
291 	cp += mntfs_optprint(vfsp, cp);
292 	*cp++ = '\t';
293 
294 	tabp->mnt_time = (char *)(cp - elemp->mnte_text);
295 	cp += snprintf(cp, MAX_MNTOPT_STR, "%ld", vfsp->vfs_mtime);
296 	*cp++ = '\n'; /* over-write snprintf's trailing null-byte */
297 
298 	tabp->mnt_major = getmajor(vfsp->vfs_dev);
299 	tabp->mnt_minor = getminor(vfsp->vfs_dev);
300 
301 	elemp->mnte_text_size = cp - elemp->mnte_text;
302 	elemp->mnte_vfs_ctime = vfsp->vfs_hrctime;
303 	elemp->mnte_hidden = vfsp->vfs_flag & VFS_NOMNTTAB;
304 }
305 
306 /* Determine the length of the /etc/mnttab entry for this vfs_t. */
307 static size_t
308 mntfs_text_len(vfs_t *vfsp, zone_t *zone)
309 {
310 	size_t size = 0;
311 	const char *resource, *mntpt;
312 	size_t mntsize;
313 
314 	mntpt = refstr_value(vfsp->vfs_mntpt);
315 	if (mntpt != NULL && mntpt[0] != '\0') {
316 		mntsize = strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
317 	} else {
318 		mntsize = 2;	/* "-\t" */
319 	}
320 	size += mntsize;
321 
322 	resource = refstr_value(vfsp->vfs_resource);
323 	if (resource != NULL && resource[0] != '\0') {
324 		if (resource[0] != '/') {
325 			size += strlen(resource) + 1;
326 		} else if (!ZONE_PATH_VISIBLE(resource, zone)) {
327 			/*
328 			 * Same as the zone's view of the mount point.
329 			 */
330 			size += mntsize;
331 		} else {
332 			size += strlen(ZONE_PATH_TRANSLATE(resource, zone)) + 1;
333 		}
334 	} else {
335 		size += 2;	/* "-\t" */
336 	}
337 	size += strlen(vfssw[vfsp->vfs_fstype].vsw_name) + 1;
338 	size += mntfs_optsize(vfsp);
339 	size += snprintf(NULL, 0, "\t%ld\n", vfsp->vfs_mtime);
340 	return (size);
341 }
342 
343 /* Destroy the resources associated with a snapshot element. */
344 static void
345 mntfs_destroy_elem(mntelem_t *elemp)
346 {
347 	kmem_free(elemp->mnte_text, elemp->mnte_text_size);
348 	kmem_free(elemp, sizeof (mntelem_t));
349 }
350 
351 /*
352  * Return 1 if the given snapshot is in the range of the given element; return
353  * 0 otherwise.
354  */
355 static int
356 mntfs_elem_in_range(mntsnap_t *snapp, mntelem_t *elemp)
357 {
358 	timespec_t	*stimep = &snapp->mnts_time;
359 	timespec_t	*btimep = &elemp->mnte_birth;
360 	timespec_t	*dtimep = &elemp->mnte_death;
361 
362 	/*
363 	 * If a snapshot is in range of an element then the snapshot must have
364 	 * been created after the birth of the element, and either the element
365 	 * is still alive or it died after the snapshot was created.
366 	 */
367 	if (mntfs_newest(btimep, stimep) == MNTFS_SECOND &&
368 	    (MNTFS_ELEM_IS_ALIVE(elemp) ||
369 	    mntfs_newest(stimep, dtimep) == MNTFS_SECOND))
370 		return (1);
371 	else
372 		return (0);
373 }
374 
375 /*
376  * Return the next valid database element, after the one provided, for a given
377  * snapshot; return NULL if none exists. The caller must hold the zone's
378  * database lock as a reader before calling this function.
379  */
380 static mntelem_t *
381 mntfs_get_next_elem(mntsnap_t *snapp, mntelem_t *elemp)
382 {
383 	int show_hidden = snapp->mnts_flags & MNTS_SHOWHIDDEN;
384 
385 	do {
386 		elemp = elemp->mnte_next;
387 	} while (elemp &&
388 	    (!mntfs_elem_in_range(snapp, elemp) ||
389 	    (!show_hidden && elemp->mnte_hidden)));
390 	return (elemp);
391 }
392 
393 /*
394  * This function frees the resources associated with a mntsnap_t. It walks
395  * through the database, decrementing the reference count of any element that
396  * satisfies the snapshot. If the reference count of an element becomes zero
397  * then it is removed from the database.
398  */
399 static void
400 mntfs_freesnap(mntnode_t *mnp, mntsnap_t *snapp)
401 {
402 	zone_t *zonep = MTOD(mnp)->mnt_zone;
403 	krwlock_t *dblockp = &zonep->zone_mntfs_db_lock;
404 	mntelem_t **elempp = &zonep->zone_mntfs_db;
405 	mntelem_t *elemp;
406 	int show_hidden = snapp->mnts_flags & MNTS_SHOWHIDDEN;
407 	size_t number_decremented = 0;
408 
409 	ASSERT(RW_WRITE_HELD(&mnp->mnt_contents));
410 
411 	/* Ignore an uninitialised snapshot. */
412 	if (snapp->mnts_nmnts == 0)
413 		return;
414 
415 	/* Drop the holds on any matching database elements. */
416 	rw_enter(dblockp, RW_WRITER);
417 	while ((elemp = *elempp) != NULL) {
418 		if (mntfs_elem_in_range(snapp, elemp) &&
419 		    (!elemp->mnte_hidden || show_hidden) &&
420 		    ++number_decremented && --elemp->mnte_refcnt == 0) {
421 			if ((*elempp = elemp->mnte_next) != NULL)
422 				(*elempp)->mnte_prev = elemp->mnte_prev;
423 			mntfs_destroy_elem(elemp);
424 		} else {
425 			elempp = &elemp->mnte_next;
426 		}
427 	}
428 	rw_exit(dblockp);
429 	ASSERT(number_decremented == snapp->mnts_nmnts);
430 
431 	/* Clear the snapshot data. */
432 	bzero(snapp, sizeof (mntsnap_t));
433 }
434 
435 /* Insert the new database element newp after the existing element prevp. */
436 static void
437 mntfs_insert_after(mntelem_t *newp, mntelem_t *prevp)
438 {
439 	newp->mnte_prev = prevp;
440 	newp->mnte_next = prevp->mnte_next;
441 	prevp->mnte_next = newp;
442 	if (newp->mnte_next != NULL)
443 		newp->mnte_next->mnte_prev = newp;
444 }
445 
446 /* Create and return a copy of a given database element. */
447 static mntelem_t *
448 mntfs_copy(mntelem_t *origp)
449 {
450 	mntelem_t *copyp;
451 
452 	copyp = kmem_zalloc(sizeof (mntelem_t), KM_SLEEP);
453 	copyp->mnte_vfs_ctime = origp->mnte_vfs_ctime;
454 	copyp->mnte_text_size = origp->mnte_text_size;
455 	copyp->mnte_text = kmem_alloc(copyp->mnte_text_size, KM_SLEEP);
456 	bcopy(origp->mnte_text, copyp->mnte_text, copyp->mnte_text_size);
457 	copyp->mnte_tab = origp->mnte_tab;
458 	copyp->mnte_hidden = origp->mnte_hidden;
459 
460 	return (copyp);
461 }
462 
463 /*
464  * Compare two database elements and determine whether or not the vfs_t payload
465  * data of each are the same. Return 1 if so and 0 otherwise.
466  */
467 static int
468 mntfs_is_same_element(mntelem_t *a, mntelem_t *b)
469 {
470 	if (a->mnte_hidden == b->mnte_hidden &&
471 	    a->mnte_text_size == b->mnte_text_size &&
472 	    bcmp(a->mnte_text, b->mnte_text, a->mnte_text_size) == 0 &&
473 	    bcmp(&a->mnte_tab, &b->mnte_tab, sizeof (struct extmnttab)) == 0)
474 		return (1);
475 	else
476 		return (0);
477 }
478 
479 /*
480  * mntfs_snapshot() updates the database, creating it if necessary, so that it
481  * accurately reflects the state of the in-kernel mnttab. It also increments
482  * the reference count on all database elements that correspond to currently-
483  * mounted resources. Finally, it initialises the appropriate snapshot
484  * structure.
485  *
486  * Each vfs_t is given a high-resolution time stamp, for the benefit of mntfs,
487  * when it is inserted into the in-kernel mnttab. This time stamp is copied into
488  * the corresponding database element when it is created, allowing the element
489  * and the vfs_t to be identified as a pair. It is possible that some file
490  * systems may make unadvertised changes to, for example, a resource's mount
491  * options. Therefore, in order to determine whether a database element is an
492  * up-to-date representation of a given vfs_t, it is compared with a temporary
493  * element generated for this purpose. Although less efficient, this is safer
494  * than implementing an mtime for a vfs_t.
495  *
496  * Some mounted resources are marked as "hidden" with a VFS_NOMNTTAB flag. These
497  * are considered invisible unless the user has already set the MNT_SHOWHIDDEN
498  * flag in the vnode using the MNTIOC_SHOWHIDDEN ioctl.
499  */
500 static void
501 mntfs_snapshot(mntnode_t *mnp, mntsnap_t *snapp)
502 {
503 	mntdata_t	*mnd = MTOD(mnp);
504 	zone_t		*zonep = mnd->mnt_zone;
505 	int		is_global_zone = (zonep == global_zone);
506 	int		show_hidden = mnp->mnt_flags & MNT_SHOWHIDDEN;
507 	vfs_t		*vfsp, *firstvfsp, *lastvfsp;
508 	vfs_t		dummyvfs;
509 	vfs_t		*dummyvfsp = NULL;
510 	krwlock_t	*dblockp = &zonep->zone_mntfs_db_lock;
511 	mntelem_t	**headpp = &zonep->zone_mntfs_db;
512 	mntelem_t	*elemp;
513 	mntelem_t	*prevp = NULL;
514 	int		order;
515 	mntelem_t	*tempelemp;
516 	mntelem_t	*newp;
517 	mntelem_t	*firstp = NULL;
518 	size_t		nmnts = 0;
519 	size_t		total_text_size = 0;
520 	size_t		normal_text_size = 0;
521 	int		insert_before;
522 	timespec_t	last_mtime;
523 	size_t		entry_length, new_entry_length;
524 
525 
526 	ASSERT(RW_WRITE_HELD(&mnp->mnt_contents));
527 	vfs_list_read_lock();
528 	vfs_mnttab_modtime(&last_mtime);
529 
530 	/*
531 	 * If this snapshot already exists then we must have been asked to
532 	 * rewind the file, i.e. discard the snapshot and create a new one in
533 	 * its place. In this case we first see if the in-kernel mnttab has
534 	 * advertised a change; if not then we simply reinitialise the metadata.
535 	 */
536 	if (snapp->mnts_nmnts) {
537 		if (mntfs_newest(&last_mtime, &snapp->mnts_last_mtime) ==
538 		    MNTFS_NEITHER) {
539 			/*
540 			 * An unchanged mtime is no guarantee that the
541 			 * in-kernel mnttab is unchanged; for example, a
542 			 * concurrent remount may be between calls to
543 			 * vfs_setmntopt_nolock() and vfs_mnttab_modtimeupd().
544 			 * It follows that the database may have changed, and
545 			 * in particular that some elements in this snapshot
546 			 * may have been killed by another call to
547 			 * mntfs_snapshot(). It is therefore not merely
548 			 * unnecessary to update the snapshot's time but in
549 			 * fact dangerous; it needs to be left alone.
550 			 */
551 			snapp->mnts_next = snapp->mnts_first;
552 			snapp->mnts_flags &= ~MNTS_REWIND;
553 			snapp->mnts_foffset = snapp->mnts_ieoffset = 0;
554 			vfs_list_unlock();
555 			return;
556 		} else {
557 			mntfs_freesnap(mnp, snapp);
558 		}
559 	}
560 
561 	/*
562 	 * Create a temporary database element. For each vfs_t, the temporary
563 	 * element will be populated with the corresponding text. If the vfs_t
564 	 * does not have a corresponding element within the database, or if
565 	 * there is such an element but it is stale, a copy of the temporary
566 	 * element is inserted into the database at the appropriate location.
567 	 */
568 	tempelemp = kmem_alloc(sizeof (mntelem_t), KM_SLEEP);
569 	entry_length = MNT_LINE_MAX;
570 	tempelemp->mnte_text = kmem_alloc(entry_length, KM_SLEEP);
571 
572 	/* Find the first and last vfs_t for the given zone. */
573 	if (is_global_zone) {
574 		firstvfsp = rootvfs;
575 		lastvfsp = firstvfsp->vfs_prev;
576 	} else {
577 		firstvfsp = zonep->zone_vfslist;
578 		/*
579 		 * If there isn't already a vfs_t for root then we create a
580 		 * dummy which will be used as the head of the list (which will
581 		 * therefore no longer be circular).
582 		 */
583 		if (firstvfsp == NULL ||
584 		    strcmp(refstr_value(firstvfsp->vfs_mntpt),
585 		    zonep->zone_rootpath) != 0) {
586 			/*
587 			 * The zone's vfs_ts will have mount points relative to
588 			 * the zone's root path. The vfs_t for the zone's
589 			 * root file system would therefore have a mount point
590 			 * equal to the zone's root path. Since the zone's root
591 			 * path isn't a mount point, we copy the vfs_t of the
592 			 * zone's root vnode, and provide it with a fake mount
593 			 * point and resource.
594 			 *
595 			 * Note that by cloning another vfs_t we also acquire
596 			 * its high-resolution ctime. This might appear to
597 			 * violate the requirement that the ctimes in the list
598 			 * of vfs_ts are unique and monotonically increasing;
599 			 * this is not the case. The dummy vfs_t appears in only
600 			 * a non-global zone's vfs_t list, where the cloned
601 			 * vfs_t would not ordinarily be visible; the ctimes are
602 			 * therefore unique. The zone's root path must be
603 			 * available before the zone boots, and so its root
604 			 * vnode's vfs_t's ctime must be lower than those of any
605 			 * resources subsequently mounted by the zone. The
606 			 * ctimes are therefore monotonically increasing.
607 			 */
608 			dummyvfs = *zonep->zone_rootvp->v_vfsp;
609 			dummyvfs.vfs_mntpt = refstr_alloc(zonep->zone_rootpath);
610 			dummyvfs.vfs_resource = dummyvfs.vfs_mntpt;
611 			dummyvfsp = &dummyvfs;
612 			if (firstvfsp == NULL) {
613 				lastvfsp = dummyvfsp;
614 			} else {
615 				lastvfsp = firstvfsp->vfs_zone_prev;
616 				dummyvfsp->vfs_zone_next = firstvfsp;
617 			}
618 			firstvfsp = dummyvfsp;
619 		} else {
620 			lastvfsp = firstvfsp->vfs_zone_prev;
621 		}
622 	}
623 
624 	/*
625 	 * Now walk through all the vfs_ts for this zone. For each one, find the
626 	 * corresponding database element, creating it first if necessary, and
627 	 * increment its reference count.
628 	 */
629 	rw_enter(dblockp, RW_WRITER);
630 	elemp = zonep->zone_mntfs_db;
631 	/* CSTYLED */
632 	for (vfsp = firstvfsp;;
633 	    vfsp = is_global_zone ? vfsp->vfs_next : vfsp->vfs_zone_next) {
634 		DTRACE_PROBE1(new__vfs, vfs_t *, vfsp);
635 		/* Consider only visible entries. */
636 		if ((vfsp->vfs_flag & VFS_NOMNTTAB) == 0 || show_hidden) {
637 			/*
638 			 * Walk through the existing database looking for either
639 			 * an element that matches the current vfs_t, or for the
640 			 * correct place in which to insert a new element.
641 			 */
642 			insert_before = 0;
643 			for (; elemp; prevp = elemp, elemp = elemp->mnte_next) {
644 				DTRACE_PROBE1(considering__elem, mntelem_t *,
645 				    elemp);
646 
647 				/* Compare the vfs_t with the element. */
648 				order = mntfs_newest(&elemp->mnte_vfs_ctime,
649 				    &vfsp->vfs_hrctime);
650 
651 				/*
652 				 * If we encounter a database element newer than
653 				 * this vfs_t then we've stepped over a gap
654 				 * where the element for this vfs_t must be
655 				 * inserted.
656 				 */
657 				if (order == MNTFS_FIRST) {
658 					insert_before = 1;
659 					break;
660 				}
661 
662 				/* Dead elements no longer interest us. */
663 				if (MNTFS_ELEM_IS_DEAD(elemp))
664 					continue;
665 
666 				/*
667 				 * If the time stamps are the same then the
668 				 * element is potential match for the vfs_t,
669 				 * although it may later prove to be stale.
670 				 */
671 				if (order == MNTFS_NEITHER)
672 					break;
673 
674 				/*
675 				 * This element must be older than the vfs_t.
676 				 * It must, therefore, correspond to a vfs_t
677 				 * that has been unmounted. Since the element is
678 				 * still alive, we kill it if it is visible.
679 				 */
680 				if (!elemp->mnte_hidden || show_hidden)
681 					vfs_mono_time(&elemp->mnte_death);
682 			}
683 			DTRACE_PROBE2(possible__match, vfs_t *, vfsp,
684 			    mntelem_t *, elemp);
685 
686 			/* Create a new database element if required. */
687 			new_entry_length = mntfs_text_len(vfsp, zonep);
688 			if (new_entry_length > entry_length) {
689 				kmem_free(tempelemp->mnte_text, entry_length);
690 				tempelemp->mnte_text =
691 				    kmem_alloc(new_entry_length, KM_SLEEP);
692 				entry_length = new_entry_length;
693 			}
694 			mntfs_populate_text(vfsp, zonep, tempelemp);
695 			ASSERT(tempelemp->mnte_text_size == new_entry_length);
696 			if (elemp == NULL) {
697 				/*
698 				 * We ran off the end of the database. Insert a
699 				 * new element at the end.
700 				 */
701 				newp = mntfs_copy(tempelemp);
702 				vfs_mono_time(&newp->mnte_birth);
703 				if (prevp) {
704 					mntfs_insert_after(newp, prevp);
705 				} else {
706 					newp->mnte_next = NULL;
707 					newp->mnte_prev = NULL;
708 					ASSERT(*headpp == NULL);
709 					*headpp = newp;
710 				}
711 				elemp = newp;
712 			} else if (insert_before) {
713 				/*
714 				 * Insert a new element before the current one.
715 				 */
716 				newp = mntfs_copy(tempelemp);
717 				vfs_mono_time(&newp->mnte_birth);
718 				if (prevp) {
719 					mntfs_insert_after(newp, prevp);
720 				} else {
721 					newp->mnte_next = elemp;
722 					newp->mnte_prev = NULL;
723 					elemp->mnte_prev = newp;
724 					ASSERT(*headpp == elemp);
725 					*headpp = newp;
726 				}
727 				elemp = newp;
728 			} else if (!mntfs_is_same_element(elemp, tempelemp)) {
729 				/*
730 				 * The element corresponds to the vfs_t, but the
731 				 * vfs_t has changed; it must have been
732 				 * remounted. Kill the old element and insert a
733 				 * new one after it.
734 				 */
735 				vfs_mono_time(&elemp->mnte_death);
736 				newp = mntfs_copy(tempelemp);
737 				vfs_mono_time(&newp->mnte_birth);
738 				mntfs_insert_after(newp, elemp);
739 				elemp = newp;
740 			}
741 
742 			/* We've found the corresponding element. Hold it. */
743 			DTRACE_PROBE1(incrementing, mntelem_t *, elemp);
744 			elemp->mnte_refcnt++;
745 
746 			/*
747 			 * Update the parameters used to initialise the
748 			 * snapshot.
749 			 */
750 			nmnts++;
751 			total_text_size += elemp->mnte_text_size;
752 			if (!elemp->mnte_hidden)
753 				normal_text_size += elemp->mnte_text_size;
754 			if (!firstp)
755 				firstp = elemp;
756 
757 			prevp = elemp;
758 			elemp = elemp->mnte_next;
759 		}
760 
761 		if (vfsp == lastvfsp)
762 			break;
763 	}
764 
765 	/*
766 	 * Any remaining visible database elements that are still alive must be
767 	 * killed now, because their corresponding vfs_ts must have been
768 	 * unmounted.
769 	 */
770 	for (; elemp; elemp = elemp->mnte_next) {
771 		if (MNTFS_ELEM_IS_ALIVE(elemp) &&
772 		    (!elemp->mnte_hidden || show_hidden))
773 			vfs_mono_time(&elemp->mnte_death);
774 	}
775 
776 	/* Initialise the snapshot. */
777 	vfs_mono_time(&snapp->mnts_time);
778 	snapp->mnts_last_mtime = last_mtime;
779 	snapp->mnts_first = snapp->mnts_next = firstp;
780 	snapp->mnts_flags = show_hidden ? MNTS_SHOWHIDDEN : 0;
781 	snapp->mnts_nmnts = nmnts;
782 	snapp->mnts_text_size = total_text_size;
783 	snapp->mnts_foffset = snapp->mnts_ieoffset = 0;
784 
785 	/*
786 	 * Record /etc/mnttab's current size and mtime for possible future use
787 	 * by mntgetattr().
788 	 */
789 	mnd->mnt_size = normal_text_size;
790 	mnd->mnt_mtime = last_mtime;
791 	if (show_hidden) {
792 		mnd->mnt_hidden_size = total_text_size;
793 		mnd->mnt_hidden_mtime = last_mtime;
794 	}
795 
796 	/* Clean up. */
797 	rw_exit(dblockp);
798 	vfs_list_unlock();
799 	if (dummyvfsp != NULL)
800 		refstr_rele(dummyvfsp->vfs_mntpt);
801 	kmem_free(tempelemp->mnte_text, entry_length);
802 	kmem_free(tempelemp, sizeof (mntelem_t));
803 }
804 
805 /*
806  * Public function to convert vfs_mntopts into a string.
807  * A buffer of sufficient size is allocated, which is returned via bufp,
808  * and whose length is returned via lenp.
809  */
810 void
811 mntfs_getmntopts(struct vfs *vfsp, char **bufp, size_t *lenp)
812 {
813 	size_t len;
814 	char *buf;
815 
816 	vfs_list_read_lock();
817 
818 	len = mntfs_optsize(vfsp) + 1;
819 	buf = kmem_alloc(len, KM_NOSLEEP);
820 	if (buf == NULL) {
821 		*bufp = NULL;
822 		vfs_list_unlock();
823 		return;
824 	}
825 	buf[len - 1] = '\0';
826 	(void) mntfs_optprint(vfsp, buf);
827 	ASSERT(buf[len - 1] == '\0');
828 
829 	vfs_list_unlock();
830 	*bufp = buf;
831 	*lenp = len;
832 }
833 
834 /* ARGSUSED */
835 static int
836 mntopen(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
837 {
838 	vnode_t *vp = *vpp;
839 	mntnode_t *nmnp;
840 
841 	/*
842 	 * Not allowed to open for writing, return error.
843 	 */
844 	if (flag & FWRITE)
845 		return (EPERM);
846 	/*
847 	 * Create a new mnt/vnode for each open, this will give us a handle to
848 	 * hang the snapshot on.
849 	 */
850 	nmnp = mntgetnode(vp);
851 
852 	*vpp = MTOV(nmnp);
853 	atomic_add_32(&MTOD(nmnp)->mnt_nopen, 1);
854 	VN_RELE(vp);
855 	return (0);
856 }
857 
858 /* ARGSUSED */
859 static int
860 mntclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
861 	caller_context_t *ct)
862 {
863 	mntnode_t *mnp = VTOM(vp);
864 
865 	/* Clean up any locks or shares held by the current process */
866 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
867 	cleanshares(vp, ttoproc(curthread)->p_pid);
868 
869 	if (count > 1)
870 		return (0);
871 	if (vp->v_count == 1) {
872 		rw_enter(&mnp->mnt_contents, RW_WRITER);
873 		mntfs_freesnap(mnp, &mnp->mnt_read);
874 		mntfs_freesnap(mnp, &mnp->mnt_ioctl);
875 		rw_exit(&mnp->mnt_contents);
876 		atomic_add_32(&MTOD(mnp)->mnt_nopen, -1);
877 	}
878 	return (0);
879 }
880 
/*
 * mntread() implements read(2) for /etc/mnttab.
 *
 * The text returned is taken from the read snapshot (mnt_read) of this
 * mntnode. A fresh snapshot is taken whenever the current one records no
 * mounts or the read starts at offset 0. The requested range is assembled in
 * a temporary kernel buffer from the text of successive database elements
 * and is then handed to the caller with uiomove(). If uiomove() succeeds, the
 * snapshot records the next element, the file offset reached and the offset
 * within that element so that a read at the same file offset can resume
 * without walking the database again.
 *
 * mnt_contents is held as writer for the whole operation; the zone's
 * database lock is held as reader only while elements are being examined.
 *
 * Returns 0 on success (including a zero-length read), EFAULT if the
 * requested range is rejected, or the error returned by uiomove().
 */
/* ARGSUSED */
static int
mntread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, caller_context_t *ct)
{
	mntnode_t *mnp = VTOM(vp);
	zone_t *zonep = MTOD(mnp)->mnt_zone;
	mntsnap_t *snapp = &mnp->mnt_read;
	off_t off = uio->uio_offset;
	size_t len = uio->uio_resid;
	char *bufferp;
	size_t available, copylen;
	size_t written = 0;
	mntelem_t *elemp;
	krwlock_t *dblockp = &zonep->zone_mntfs_db_lock;
	int error = 0;
	off_t	ieoffset;

	rw_enter(&mnp->mnt_contents, RW_WRITER);
	/* (Re)generate the snapshot if there is none or we are at offset 0. */
	if (snapp->mnts_nmnts == 0 || (off == (off_t)0))
		mntfs_snapshot(mnp, snapp);

	/* Trim the request so that it does not run past the end of the text. */
	if ((size_t)(off + len) > snapp->mnts_text_size)
		len = snapp->mnts_text_size - off;

	/*
	 * NOTE(review): if off lies beyond the end of the text then the
	 * subtraction above presumably wraps (len is a size_t), in which case
	 * this check rejects the read with EFAULT rather than reporting
	 * end-of-file -- confirm that this is intended.
	 */
	if (off < 0 || len > snapp->mnts_text_size) {
		rw_exit(&mnp->mnt_contents);
		return (EFAULT);
	}

	/* Nothing left to read. */
	if (len == 0) {
		rw_exit(&mnp->mnt_contents);
		return (0);
	}

	/*
	 * For the file offset provided, locate the corresponding database
	 * element and calculate the corresponding offset within its text. If
	 * the file offset is the same as that reached during the last read(2)
	 * then use the saved element and intra-element offset.
	 */
	rw_enter(dblockp, RW_READER);
	if (off == 0 || (off == snapp->mnts_foffset)) {
		elemp = snapp->mnts_next;
		ieoffset = snapp->mnts_ieoffset;
	} else {
		off_t total_off;
		/*
		 * Find the element corresponding to the requested file offset
		 * by walking through the database and summing the text sizes
		 * of the individual elements. If the requested file offset is
		 * greater than that reached on the last visit then we can start
		 * at the last seen element; otherwise, we have to start at the
		 * beginning.
		 */
		if (off > snapp->mnts_foffset) {
			elemp = snapp->mnts_next;
			/* File offset at which the saved element begins. */
			total_off = snapp->mnts_foffset - snapp->mnts_ieoffset;
		} else {
			elemp = snapp->mnts_first;
			total_off = 0;
		}
		/* Skip every element that ends before the requested offset. */
		while (off > total_off + elemp->mnte_text_size) {
			total_off += elemp->mnte_text_size;
			elemp = mntfs_get_next_elem(snapp, elemp);
			ASSERT(elemp != NULL);
		}
		/* Calculate the intra-element offset. */
		if (off > total_off)
			ieoffset = off - total_off;
		else
			ieoffset = 0;
	}

	/*
	 * Create a buffer and populate it with the text from successive
	 * database elements until it is full.
	 */
	bufferp = kmem_alloc(len, KM_SLEEP);
	while (written < len) {
		/* Bytes of this element's text that have not yet been read. */
		available = elemp->mnte_text_size - ieoffset;
		copylen = MIN(len - written, available);
		bcopy(elemp->mnte_text + ieoffset, bufferp + written, copylen);
		written += copylen;
		if (copylen == available) {
			/* This element is exhausted; move on to the next. */
			elemp = mntfs_get_next_elem(snapp, elemp);
			ASSERT(elemp != NULL || written == len);
			ieoffset = 0;
		} else {
			/* The buffer filled up part-way through this element. */
			ieoffset += copylen;
		}
	}
	rw_exit(dblockp);

	/*
	 * Write the populated buffer, update the snapshot's state if
	 * successful and then advertise our read.
	 */
	error = uiomove(bufferp, len, UIO_READ, uio);
	if (error == 0) {
		snapp->mnts_next = elemp;
		snapp->mnts_foffset = off + len;
		snapp->mnts_ieoffset = ieoffset;
	}
	vfs_mnttab_readop();
	rw_exit(&mnp->mnt_contents);

	/* Clean up. */
	kmem_free(bufferp, len);
	return (error);
}
991 
992 static int
993 mntgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
994 	caller_context_t *ct)
995 {
996 	int mask = vap->va_mask;
997 	int error;
998 	mntnode_t *mnp = VTOM(vp);
999 	timespec_t mtime, old_mtime;
1000 	size_t size, old_size;
1001 	mntdata_t *mntdata = MTOD(VTOM(vp));
1002 	mntsnap_t *rsnapp, *isnapp;
1003 	extern timespec_t vfs_mnttab_ctime;
1004 
1005 
1006 	/* AT_MODE, AT_UID and AT_GID are derived from the underlying file. */
1007 	if (mask & AT_MODE|AT_UID|AT_GID) {
1008 		if (error = VOP_GETATTR(mnp->mnt_mountvp, vap, flags, cr, ct))
1009 			return (error);
1010 	}
1011 
1012 	/*
1013 	 * There are some minor subtleties in the determination of
1014 	 * /etc/mnttab's size and mtime. We wish to avoid any condition in
1015 	 * which, in the vicinity of a change to the in-kernel mnttab, we
1016 	 * return an old value for one but a new value for the other. We cannot
1017 	 * simply hold vfslist for the entire calculation because we might need
1018 	 * to call mntfs_snapshot(), which calls vfs_list_read_lock().
1019 	 */
1020 	if (mask & AT_SIZE|AT_NBLOCKS) {
1021 		rw_enter(&mnp->mnt_contents, RW_WRITER);
1022 
1023 		vfs_list_read_lock();
1024 		vfs_mnttab_modtime(&mtime);
1025 		if (mnp->mnt_flags & MNT_SHOWHIDDEN) {
1026 			old_mtime = mntdata->mnt_hidden_mtime;
1027 			old_size = mntdata->mnt_hidden_size;
1028 		} else {
1029 			old_mtime = mntdata->mnt_mtime;
1030 			old_size = mntdata->mnt_size;
1031 		}
1032 		vfs_list_unlock();
1033 
1034 		rsnapp = &mnp->mnt_read;
1035 		isnapp = &mnp->mnt_ioctl;
1036 		if (rsnapp->mnts_nmnts || isnapp->mnts_nmnts) {
1037 			/*
1038 			 * The mntnode already has at least one snapshot from
1039 			 * which to take the size; the user will understand from
1040 			 * mnttab(4) that the current size of the in-kernel
1041 			 * mnttab is irrelevant.
1042 			 */
1043 			size = rsnapp->mnts_nmnts ? rsnapp->mnts_text_size :
1044 			    isnapp->mnts_text_size;
1045 		} else if (mntfs_newest(&mtime, &old_mtime) == MNTFS_NEITHER) {
1046 			/*
1047 			 * There is no existing valid snapshot but the in-kernel
1048 			 * mnttab has not changed since the time that the last
1049 			 * one was generated. Use the old file size; note that
1050 			 * it is guaranteed to be consistent with mtime, which
1051 			 * may be returned to the user later.
1052 			 */
1053 			size = old_size;
1054 		} else {
1055 			/*
1056 			 * There is no snapshot and the in-kernel mnttab has
1057 			 * changed since the last one was created. We generate a
1058 			 * new snapshot which we use for not only the size but
1059 			 * also the mtime, thereby ensuring that the two are
1060 			 * consistent.
1061 			 */
1062 			mntfs_snapshot(mnp, rsnapp);
1063 			size = rsnapp->mnts_text_size;
1064 			mtime = rsnapp->mnts_last_mtime;
1065 			mntfs_freesnap(mnp, rsnapp);
1066 		}
1067 
1068 		rw_exit(&mnp->mnt_contents);
1069 	} else if (mask & AT_ATIME|AT_MTIME) {
1070 		vfs_list_read_lock();
1071 		vfs_mnttab_modtime(&mtime);
1072 		vfs_list_unlock();
1073 	}
1074 
1075 	/* Always look like a regular file. */
1076 	if (mask & AT_TYPE)
1077 		vap->va_type = VREG;
1078 	/* Mode should basically be read only. */
1079 	if (mask & AT_MODE)
1080 		vap->va_mode &= 07444;
1081 	if (mask & AT_FSID)
1082 		vap->va_fsid = vp->v_vfsp->vfs_dev;
1083 	/* Nodeid is always ROOTINO. */
1084 	if (mask & AT_NODEID)
1085 		vap->va_nodeid = (ino64_t)MNTROOTINO;
1086 	/*
1087 	 * Set nlink to the number of open vnodes for mnttab info
1088 	 * plus one for existing.
1089 	 */
1090 	if (mask & AT_NLINK)
1091 		vap->va_nlink = mntdata->mnt_nopen + 1;
1092 	if (mask & AT_SIZE)
1093 		vap->va_size = size;
1094 	if (mask & AT_ATIME)
1095 		vap->va_atime = mtime;
1096 	if (mask & AT_MTIME)
1097 		vap->va_mtime = mtime;
1098 	if (mask & AT_CTIME)
1099 		vap->va_ctime = vfs_mnttab_ctime;
1100 	if (mask & AT_RDEV)
1101 		vap->va_rdev = 0;
1102 	if (mask & AT_BLKSIZE)
1103 		vap->va_blksize = DEV_BSIZE;
1104 	if (mask & AT_NBLOCKS)
1105 		vap->va_nblocks = btod(size);
1106 	if (mask & AT_SEQ)
1107 		vap->va_seq = 0;
1108 
1109 	return (0);
1110 }
1111 
1112 static int
1113 mntaccess(vnode_t *vp, int mode, int flags, cred_t *cr,
1114 	caller_context_t *ct)
1115 {
1116 	mntnode_t *mnp = VTOM(vp);
1117 
1118 	if (mode & (VWRITE|VEXEC))
1119 		return (EROFS);
1120 
1121 	/*
1122 	 * Do access check on the underlying directory vnode.
1123 	 */
1124 	return (VOP_ACCESS(mnp->mnt_mountvp, mode, flags, cr, ct));
1125 }
1126 
1127 
1128 /*
1129  * New /mntfs vnode required; allocate it and fill in most of the fields.
1130  */
1131 static mntnode_t *
1132 mntgetnode(vnode_t *dp)
1133 {
1134 	mntnode_t *mnp;
1135 	vnode_t *vp;
1136 
1137 	mnp = kmem_zalloc(sizeof (mntnode_t), KM_SLEEP);
1138 	mnp->mnt_vnode = vn_alloc(KM_SLEEP);
1139 	mnp->mnt_mountvp = VTOM(dp)->mnt_mountvp;
1140 	rw_init(&mnp->mnt_contents, NULL, RW_DEFAULT, NULL);
1141 	vp = MTOV(mnp);
1142 	vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
1143 	vn_setops(vp, mntvnodeops);
1144 	vp->v_vfsp = dp->v_vfsp;
1145 	vp->v_type = VREG;
1146 	vp->v_data = (caddr_t)mnp;
1147 
1148 	return (mnp);
1149 }
1150 
1151 /*
1152  * Free the storage obtained from mntgetnode().
1153  */
1154 static void
1155 mntfreenode(mntnode_t *mnp)
1156 {
1157 	vnode_t *vp = MTOV(mnp);
1158 
1159 	rw_destroy(&mnp->mnt_contents);
1160 	vn_invalid(vp);
1161 	vn_free(vp);
1162 	kmem_free(mnp, sizeof (*mnp));
1163 }
1164 
1165 
/*
 * mntfsync() is the fsync vnode operation for /etc/mnttab. It does no work
 * and unconditionally reports success.
 */
/* ARGSUSED */
static int
mntfsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
{
	return (0);
}
1172 
1173 /* ARGSUSED */
1174 static void
1175 mntinactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
1176 {
1177 	mntnode_t *mnp = VTOM(vp);
1178 
1179 	mntfreenode(mnp);
1180 }
1181 
1182 /*
1183  * lseek(2) is supported only to rewind the file by resetmnttab(3C). Rewinding
1184  * has a special meaning for /etc/mnttab: it forces mntfs to refresh the
1185  * snapshot at the next ioctl().
1186  *
1187  * mnttab(4) explains that "the snapshot...is taken any time a read(2) is
1188  * performed at offset 0". We therefore ignore the read snapshot here.
1189  */
1190 /* ARGSUSED */
1191 static int
1192 mntseek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
1193 {
1194 	mntnode_t *mnp = VTOM(vp);
1195 
1196 	if (*noffp == 0) {
1197 		rw_enter(&mnp->mnt_contents, RW_WRITER);
1198 		mnp->mnt_ioctl.mnts_flags |= MNTS_REWIND;
1199 		rw_exit(&mnp->mnt_contents);
1200 	}
1201 
1202 	return (0);
1203 }
1204 
1205 /*
1206  * Return the answer requested to poll().
1207  * POLLRDBAND will return when the mtime of the mnttab
1208  * information is newer than the latest one read for this open.
1209  */
1210 /* ARGSUSED */
1211 static int
1212 mntpoll(vnode_t *vp, short ev, int any, short *revp, pollhead_t **phpp,
1213 	caller_context_t *ct)
1214 {
1215 	mntnode_t *mnp = VTOM(vp);
1216 	mntsnap_t *snapp;
1217 
1218 	rw_enter(&mnp->mnt_contents, RW_READER);
1219 	if (mntfs_newest(&mnp->mnt_ioctl.mnts_last_mtime,
1220 	    &mnp->mnt_read.mnts_last_mtime) == MNTFS_FIRST)
1221 		snapp = &mnp->mnt_ioctl;
1222 	else
1223 		snapp = &mnp->mnt_read;
1224 
1225 	*revp = 0;
1226 	*phpp = (pollhead_t *)NULL;
1227 	if (ev & POLLIN)
1228 		*revp |= POLLIN;
1229 
1230 	if (ev & POLLRDNORM)
1231 		*revp |= POLLRDNORM;
1232 
1233 	if (ev & POLLRDBAND) {
1234 		vfs_mnttab_poll(&snapp->mnts_last_mtime, phpp);
1235 		if (*phpp == (pollhead_t *)NULL)
1236 			*revp |= POLLRDBAND;
1237 	}
1238 	rw_exit(&mnp->mnt_contents);
1239 
1240 	if (*revp || *phpp != NULL || any) {
1241 		return (0);
1242 	}
1243 	/*
1244 	 * If someone is polling an unsupported poll events (e.g.
1245 	 * POLLOUT, POLLPRI, etc.), just return POLLERR revents.
1246 	 * That way we will ensure that we don't return a 0
1247 	 * revents with a NULL pollhead pointer.
1248 	 */
1249 	*revp = POLLERR;
1250 	return (0);
1251 }
1252 
/*
 * mntfs_same_word() compares a word supplied by the user with a field of an
 * /etc/mnttab line, in the context of MNTIOC_GETMNTANY. It returns 1 if the
 * two are the same and 0 otherwise.
 *
 * worda is a memory address that lies somewhere in the buffer bufa, which is
 * sizea bytes long; it cannot be NULL since NULL is used to indicate to
 * getmntany(3C) that the user does not wish to match a particular field. The
 * text to which worda points is supplied by the user; if it is not
 * null-terminated within bufa then it cannot match.
 *
 * Buffer bufb, which is sizeb bytes long, contains a line from /etc/mnttab,
 * in which the fields are delimited by tab or new-line characters. offb is
 * the offset within this buffer of the field to be compared.
 *
 * The words are the same only if the user's word ends, with a null byte, at
 * exactly the point where the field ends, with a tab or a new-line.
 */
int
mntfs_same_word(char *worda, char *bufa, size_t sizea, off_t offb, char *bufb,
    size_t sizeb)
{
	char *fieldb = bufb + offb;
	int left;

	ASSERT(worda != NULL);

	/* Never look beyond the end of either buffer. */
	left = MIN(((bufa + sizea) - worda), ((bufb + sizeb) - fieldb));

	/* Step over the characters that the two words have in common. */
	for (; left != 0 && *worda == *fieldb; left--) {
		worda++;
		fieldb++;
	}

	/* We ran out of buffer before either word came to an end. */
	if (left == 0)
		return (0);

	/* The user's word must finish here ... */
	if (*worda != '\0')
		return (0);

	/* ... and so must the mnttab field. */
	return (*fieldb == '\t' || *fieldb == '\n');
}
1290 
1291 /*
1292  * mntfs_special_info_string() returns which, if either, of VBLK or VCHR
1293  * corresponds to a supplied path. If the path is a special device then the
1294  * function optionally sets the major and minor numbers.
1295  */
1296 vtype_t
1297 mntfs_special_info_string(char *path, uint_t *major, uint_t *minor, cred_t *cr)
1298 {
1299 	vattr_t vattr;
1300 	vnode_t *vp;
1301 	vtype_t type;
1302 	int error;
1303 
1304 	if (path == NULL || *path != '/' ||
1305 	    lookupnameat(path + 1, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp, rootdir))
1306 		return (0);
1307 
1308 	vattr.va_mask = AT_TYPE | AT_RDEV;
1309 	error = VOP_GETATTR(vp, &vattr, ATTR_REAL, cr, NULL);
1310 	VN_RELE(vp);
1311 
1312 	if (error == 0 && ((type = vattr.va_type) == VBLK || type == VCHR)) {
1313 		if (major && minor) {
1314 			*major = getmajor(vattr.va_rdev);
1315 			*minor = getminor(vattr.va_rdev);
1316 		}
1317 		return (type);
1318 	} else {
1319 		return (0);
1320 	}
1321 }
1322 
1323 /*
1324  * mntfs_special_info_element() extracts the name of the mounted resource
1325  * for a given element and copies it into a null-terminated string, which it
1326  * then passes to mntfs_special_info_string().
1327  */
1328 vtype_t
1329 mntfs_special_info_element(mntelem_t *elemp, cred_t *cr)
1330 {
1331 	char *newpath;
1332 	vtype_t type;
1333 
1334 	newpath = kmem_alloc(elemp->mnte_text_size, KM_SLEEP);
1335 	bcopy(elemp->mnte_text, newpath, (off_t)(elemp->mnte_tab.mnt_mountp));
1336 	*(newpath + (off_t)elemp->mnte_tab.mnt_mountp - 1) = '\0';
1337 	type = mntfs_special_info_string(newpath, NULL, NULL, cr);
1338 	kmem_free(newpath, elemp->mnte_text_size);
1339 
1340 	return (type);
1341 }
1342 
/*
 * mntfs_import_addr() converts an address that points to a byte within a
 * user buffer into an address that points to the corresponding offset within
 * a kernel copy of that buffer.
 *
 * uaddr is the user address to convert, ubufp is the user address at which
 * the buffer starts, kbufp is the start of the kernel copy and bufsize is
 * the size of the buffer in bytes. If uaddr does not lie within the user
 * buffer then NULL is returned; a NULL uaddr is expected to fall into this
 * category and so to be passed through as NULL.
 */
char *
mntfs_import_addr(char *uaddr, char *ubufp, char *kbufp, size_t bufsize)
{
	char *uend = ubufp + bufsize;

	/* Addresses inside [ubufp, uend) are translated by their offset. */
	if (uaddr >= ubufp && uaddr < uend)
		return (kbufp + (uaddr - ubufp));

	return (NULL);
}
1357 
/*
 * These 32-bit versions are to support STRUCT_DECL(9F) etc. in
 * mntfs_copyout_elem() and mntioctl(). They are compiled only when
 * _SYSCALL32_IMPL is defined. The fields that hold addresses in the native
 * structures are declared here as uint32_t.
 */
#ifdef _SYSCALL32_IMPL
/* 32-bit counterpart of struct extmnttab. */
typedef struct extmnttab32 {
	uint32_t	mnt_special;	/* user address of the resource name */
	uint32_t	mnt_mountp;	/* user address of the mount point */
	uint32_t	mnt_fstype;	/* user address of the fs type */
	uint32_t	mnt_mntopts;	/* user address of the mount options */
	uint32_t	mnt_time;	/* user address of the mount time */
	uint_t		mnt_major;	/* major number */
	uint_t		mnt_minor;	/* minor number */
} extmnttab32_t;

/* 32-bit counterpart of struct mnttab: as above, minus the device numbers. */
typedef struct mnttab32 {
	uint32_t	mnt_special;
	uint32_t	mnt_mountp;
	uint32_t	mnt_fstype;
	uint32_t	mnt_mntopts;
	uint32_t	mnt_time;
} mnttab32_t;

/* 32-bit counterpart of struct mntentbuf, the argument of the ioctls. */
struct mntentbuf32 {
	uint32_t	mbuf_emp;	/* user address of the mnttab struct */
	uint_t		mbuf_bufsize;	/* size of the text buffer */
	uint32_t	mbuf_buf;	/* user address of the text buffer */
};
#endif
1387 
1388 /*
1389  * mntfs_copyout_element() is common code for the MNTIOC_GETMNTENT,
1390  * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY ioctls. Having identifed the
1391  * database element desired by the user, this function copies out the text and
1392  * the pointers to the relevant userland addresses. It returns 0 on success
1393  * and non-zero otherwise.
1394  */
1395 int
1396 mntfs_copyout_elem(mntelem_t *elemp, struct extmnttab *uemp,
1397     char *ubufp, int cmd, int datamodel)
1398 {
1399 		STRUCT_DECL(extmnttab, ktab);
1400 		char *dbbufp = elemp->mnte_text;
1401 		size_t dbbufsize = elemp->mnte_text_size;
1402 		struct extmnttab *dbtabp = &elemp->mnte_tab;
1403 		size_t ssize;
1404 		char *kbufp;
1405 		int error = 0;
1406 
1407 
1408 		/*
1409 		 * We create a struct extmnttab within the kernel of the size
1410 		 * determined by the user's data model. We then populate its
1411 		 * fields by combining the start address of the text buffer
1412 		 * supplied by the user, ubufp, with the offsets stored for
1413 		 * this database element within dbtabp, a pointer to a struct
1414 		 * extmnttab.
1415 		 *
1416 		 * Note that if the corresponding field is "-" this signifies
1417 		 * no real content, and we set the address to NULL. This does
1418 		 * not apply to mnt_time.
1419 		 */
1420 		STRUCT_INIT(ktab, datamodel);
1421 		STRUCT_FSETP(ktab, mnt_special,
1422 		    MNTFS_REAL_FIELD(dbbufp) ? ubufp : NULL);
1423 		STRUCT_FSETP(ktab, mnt_mountp,
1424 		    MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_mountp) ?
1425 		    ubufp + (off_t)dbtabp->mnt_mountp : NULL);
1426 		STRUCT_FSETP(ktab, mnt_fstype,
1427 		    MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_fstype) ?
1428 		    ubufp + (off_t)dbtabp->mnt_fstype : NULL);
1429 		STRUCT_FSETP(ktab, mnt_mntopts,
1430 		    MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_mntopts) ?
1431 		    ubufp + (off_t)dbtabp->mnt_mntopts : NULL);
1432 		STRUCT_FSETP(ktab, mnt_time,
1433 		    ubufp + (off_t)dbtabp->mnt_time);
1434 		if (cmd == MNTIOC_GETEXTMNTENT) {
1435 			STRUCT_FSETP(ktab, mnt_major, dbtabp->mnt_major);
1436 			STRUCT_FSETP(ktab, mnt_minor, dbtabp->mnt_minor);
1437 			ssize = SIZEOF_STRUCT(extmnttab, datamodel);
1438 		} else {
1439 			ssize = SIZEOF_STRUCT(mnttab, datamodel);
1440 		}
1441 		if (copyout(STRUCT_BUF(ktab), uemp, ssize))
1442 			return (EFAULT);
1443 
1444 		/*
1445 		 * We create a text buffer in the kernel into which we copy the
1446 		 * /etc/mnttab entry for this element. We change the tab and
1447 		 * new-line delimiters to null bytes before copying out the
1448 		 * buffer.
1449 		 */
1450 		kbufp = kmem_alloc(dbbufsize, KM_SLEEP);
1451 		bcopy(elemp->mnte_text, kbufp, dbbufsize);
1452 		*(kbufp + (off_t)dbtabp->mnt_mountp - 1) =
1453 		    *(kbufp + (off_t)dbtabp->mnt_fstype - 1) =
1454 		    *(kbufp + (off_t)dbtabp->mnt_mntopts - 1) =
1455 		    *(kbufp + (off_t)dbtabp->mnt_time - 1) =
1456 		    *(kbufp + dbbufsize - 1) = '\0';
1457 		if (copyout(kbufp, ubufp, dbbufsize))
1458 			error = EFAULT;
1459 
1460 		kmem_free(kbufp, dbbufsize);
1461 		return (error);
1462 }
1463 
1464 /* ARGSUSED */
1465 static int
1466 mntioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
1467     int *rvalp, caller_context_t *ct)
1468 {
1469 	uint_t *up = (uint_t *)arg;
1470 	mntnode_t *mnp = VTOM(vp);
1471 	mntsnap_t *snapp = &mnp->mnt_ioctl;
1472 	int error = 0;
1473 	zone_t *zonep = MTOD(mnp)->mnt_zone;
1474 	krwlock_t *dblockp = &zonep->zone_mntfs_db_lock;
1475 	model_t datamodel = flag & DATAMODEL_MASK;
1476 
1477 	switch (cmd) {
1478 
1479 	case MNTIOC_NMNTS:  		/* get no. of mounted resources */
1480 	{
1481 		rw_enter(&mnp->mnt_contents, RW_READER);
1482 		if (snapp->mnts_nmnts == 0 ||
1483 		    (snapp->mnts_flags & MNTS_REWIND)) {
1484 			if (!rw_tryupgrade(&mnp->mnt_contents)) {
1485 				rw_exit(&mnp->mnt_contents);
1486 				rw_enter(&mnp->mnt_contents, RW_WRITER);
1487 			}
1488 			if (snapp->mnts_nmnts == 0 ||
1489 			    (snapp->mnts_flags & MNTS_REWIND))
1490 				mntfs_snapshot(mnp, snapp);
1491 		}
1492 		rw_exit(&mnp->mnt_contents);
1493 
1494 		if (suword32(up, snapp->mnts_nmnts) != 0)
1495 			error = EFAULT;
1496 		break;
1497 	}
1498 
1499 	case MNTIOC_GETDEVLIST:  	/* get mounted device major/minor nos */
1500 	{
1501 		size_t len;
1502 		uint_t *devlist;
1503 		mntelem_t *elemp;
1504 		int i = 0;
1505 
1506 		rw_enter(&mnp->mnt_contents, RW_READER);
1507 		if (snapp->mnts_nmnts == 0 ||
1508 		    (snapp->mnts_flags & MNTS_REWIND)) {
1509 			if (!rw_tryupgrade(&mnp->mnt_contents)) {
1510 				rw_exit(&mnp->mnt_contents);
1511 				rw_enter(&mnp->mnt_contents, RW_WRITER);
1512 			}
1513 			if (snapp->mnts_nmnts == 0 ||
1514 			    (snapp->mnts_flags & MNTS_REWIND))
1515 				mntfs_snapshot(mnp, snapp);
1516 			rw_downgrade(&mnp->mnt_contents);
1517 		}
1518 
1519 		/* Create a local buffer to hold the device numbers. */
1520 		len = 2 * snapp->mnts_nmnts * sizeof (uint_t);
1521 		devlist = kmem_alloc(len, KM_SLEEP);
1522 
1523 		/*
1524 		 * Walk the database elements for this snapshot and add their
1525 		 * major and minor numbers.
1526 		 */
1527 		rw_enter(dblockp, RW_READER);
1528 		for (elemp = snapp->mnts_first; elemp;
1529 		    elemp = mntfs_get_next_elem(snapp, elemp)) {
1530 				devlist[2 * i] = elemp->mnte_tab.mnt_major;
1531 				devlist[2 * i + 1] = elemp->mnte_tab.mnt_minor;
1532 				i++;
1533 		}
1534 		rw_exit(dblockp);
1535 		ASSERT(i == snapp->mnts_nmnts);
1536 		rw_exit(&mnp->mnt_contents);
1537 
1538 		error = xcopyout(devlist, up, len);
1539 		kmem_free(devlist, len);
1540 		break;
1541 	}
1542 
1543 	case MNTIOC_SETTAG:		/* set tag on mounted file system */
1544 	case MNTIOC_CLRTAG:		/* clear tag on mounted file system */
1545 	{
1546 		struct mnttagdesc *dp = (struct mnttagdesc *)arg;
1547 		STRUCT_DECL(mnttagdesc, tagdesc);
1548 		char *cptr;
1549 		uint32_t major, minor;
1550 		char tagbuf[MAX_MNTOPT_TAG];
1551 		char *pbuf;
1552 		size_t len;
1553 		uint_t start = 0;
1554 		mntdata_t *mntdata = MTOD(mnp);
1555 		zone_t *zone = mntdata->mnt_zone;
1556 
1557 		STRUCT_INIT(tagdesc, flag & DATAMODEL_MASK);
1558 		if (copyin(dp, STRUCT_BUF(tagdesc), STRUCT_SIZE(tagdesc))) {
1559 			error = EFAULT;
1560 			break;
1561 		}
1562 		pbuf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1563 		if (zone != global_zone) {
1564 			(void) strcpy(pbuf, zone->zone_rootpath);
1565 			/* truncate "/" and nul */
1566 			start = zone->zone_rootpathlen - 2;
1567 			ASSERT(pbuf[start] == '/');
1568 		}
1569 		cptr = STRUCT_FGETP(tagdesc, mtd_mntpt);
1570 		error = copyinstr(cptr, pbuf + start, MAXPATHLEN - start, &len);
1571 		if (error) {
1572 			kmem_free(pbuf, MAXPATHLEN);
1573 			break;
1574 		}
1575 		if (start != 0 && pbuf[start] != '/') {
1576 			kmem_free(pbuf, MAXPATHLEN);
1577 			error = EINVAL;
1578 			break;
1579 		}
1580 		cptr = STRUCT_FGETP(tagdesc, mtd_tag);
1581 		if ((error = copyinstr(cptr, tagbuf, MAX_MNTOPT_TAG, &len))) {
1582 			kmem_free(pbuf, MAXPATHLEN);
1583 			break;
1584 		}
1585 		major = STRUCT_FGET(tagdesc, mtd_major);
1586 		minor = STRUCT_FGET(tagdesc, mtd_minor);
1587 		if (cmd == MNTIOC_SETTAG)
1588 			error = vfs_settag(major, minor, pbuf, tagbuf, cr);
1589 		else
1590 			error = vfs_clrtag(major, minor, pbuf, tagbuf, cr);
1591 		kmem_free(pbuf, MAXPATHLEN);
1592 		break;
1593 	}
1594 
1595 	case MNTIOC_SHOWHIDDEN:
1596 	{
1597 		rw_enter(&mnp->mnt_contents, RW_WRITER);
1598 		mnp->mnt_flags |= MNT_SHOWHIDDEN;
1599 		rw_exit(&mnp->mnt_contents);
1600 		break;
1601 	}
1602 
1603 	case MNTIOC_GETMNTANY:
1604 	{
1605 		STRUCT_DECL(mntentbuf, embuf);	/* Our copy of user's embuf */
1606 		STRUCT_DECL(extmnttab, ktab);	/* Out copy of user's emp */
1607 		struct extmnttab *uemp;		/* uaddr of user's emp */
1608 		char *ubufp;			/* uaddr of user's text buf */
1609 		size_t ubufsize;		/* size of the above */
1610 		struct extmnttab preftab;	/* our version of user's emp */
1611 		char *prefbuf;			/* our copy of user's text */
1612 		mntelem_t *elemp;		/* a database element */
1613 		struct extmnttab *dbtabp;	/* element's extmnttab */
1614 		char *dbbufp;			/* element's text buf */
1615 		size_t dbbufsize;		/* size of the above */
1616 		vtype_t type;			/* type, if any, of special */
1617 
1618 
1619 		/*
1620 		 * embuf is a struct embuf within the kernel. We copy into it
1621 		 * the struct embuf supplied by the user.
1622 		 */
1623 		STRUCT_INIT(embuf, datamodel);
1624 		if (copyin((void *) arg, STRUCT_BUF(embuf),
1625 		    STRUCT_SIZE(embuf))) {
1626 			error = EFAULT;
1627 			break;
1628 		}
1629 		uemp = STRUCT_FGETP(embuf, mbuf_emp);
1630 		ubufp = STRUCT_FGETP(embuf, mbuf_buf);
1631 		ubufsize = STRUCT_FGET(embuf, mbuf_bufsize);
1632 
1633 		/*
1634 		 * Check that the text buffer offered by the user is the
1635 		 * agreed size.
1636 		 */
1637 		if (ubufsize != MNT_LINE_MAX) {
1638 			error = EINVAL;
1639 			break;
1640 		}
1641 
1642 		/* Copy the user-supplied entry into a local buffer. */
1643 		prefbuf = kmem_alloc(MNT_LINE_MAX, KM_SLEEP);
1644 		if (copyin(ubufp, prefbuf, MNT_LINE_MAX)) {
1645 			kmem_free(prefbuf, MNT_LINE_MAX);
1646 			error = EFAULT;
1647 			break;
1648 		}
1649 
1650 		/* Ensure that any string within it is null-terminated. */
1651 		*(prefbuf + MNT_LINE_MAX - 1) = 0;
1652 
1653 		/* Copy in the user-supplied mpref */
1654 		STRUCT_INIT(ktab, datamodel);
1655 		if (copyin(uemp, STRUCT_BUF(ktab),
1656 		    SIZEOF_STRUCT(mnttab, datamodel))) {
1657 			kmem_free(prefbuf, MNT_LINE_MAX);
1658 			error = EFAULT;
1659 			break;
1660 		}
1661 
1662 		/*
1663 		 * Copy the members of the user's pref struct into a local
1664 		 * struct. The pointers need to be offset and verified to
1665 		 * ensure that they lie within the bounds of the buffer.
1666 		 */
1667 		preftab.mnt_special = mntfs_import_addr(STRUCT_FGETP(ktab,
1668 		    mnt_special), ubufp, prefbuf, MNT_LINE_MAX);
1669 		preftab.mnt_mountp = mntfs_import_addr(STRUCT_FGETP(ktab,
1670 		    mnt_mountp), ubufp, prefbuf, MNT_LINE_MAX);
1671 		preftab.mnt_fstype = mntfs_import_addr(STRUCT_FGETP(ktab,
1672 		    mnt_fstype), ubufp, prefbuf, MNT_LINE_MAX);
1673 		preftab.mnt_mntopts = mntfs_import_addr(STRUCT_FGETP(ktab,
1674 		    mnt_mntopts), ubufp, prefbuf, MNT_LINE_MAX);
1675 		preftab.mnt_time = mntfs_import_addr(STRUCT_FGETP(ktab,
1676 		    mnt_time), ubufp, prefbuf, MNT_LINE_MAX);
1677 
1678 		/*
1679 		 * If the user specifies a mounted resource that is a special
1680 		 * device then we capture its mode and major and minor numbers;
1681 		 * cf. the block comment below.
1682 		 */
1683 		type = mntfs_special_info_string(preftab.mnt_special,
1684 		    &preftab.mnt_major, &preftab.mnt_minor, cr);
1685 
1686 		rw_enter(&mnp->mnt_contents, RW_WRITER);
1687 		if (snapp->mnts_nmnts == 0 ||
1688 		    (snapp->mnts_flags & MNTS_REWIND))
1689 			mntfs_snapshot(mnp, snapp);
1690 
1691 		/*
1692 		 * This is the core functionality that implements getmntany().
1693 		 * We walk through the mntfs database until we find an element
1694 		 * matching the user's preferences that are contained in
1695 		 * preftab. Typically, this means checking that the text
1696 		 * matches. However, the mounted resource is special: if the
1697 		 * user is looking for a special device then we must find a
1698 		 * database element with the same major and minor numbers and
1699 		 * the same type, i.e. VBLK or VCHR. The type is not recorded
1700 		 * in the element because it cannot be inferred from the vfs_t.
1701 		 * We therefore check the type of suitable candidates via
1702 		 * mntfs_special_info_element(); since this calls into the
1703 		 * underlying file system we make sure to drop the database lock
1704 		 * first.
1705 		 */
1706 		elemp = snapp->mnts_next;
1707 		rw_enter(dblockp, RW_READER);
1708 		for (;;) {
1709 			for (; elemp; elemp = mntfs_get_next_elem(snapp,
1710 			    elemp)) {
1711 				dbtabp = &elemp->mnte_tab;
1712 				dbbufp = elemp->mnte_text;
1713 				dbbufsize = elemp->mnte_text_size;
1714 
1715 				if (((type &&
1716 				    dbtabp->mnt_major == preftab.mnt_major &&
1717 				    dbtabp->mnt_minor == preftab.mnt_minor &&
1718 				    MNTFS_REAL_FIELD(dbbufp)) ||
1719 				    (!type && (!preftab.mnt_special ||
1720 				    mntfs_same_word(preftab.mnt_special,
1721 				    prefbuf, MNT_LINE_MAX, (off_t)0, dbbufp,
1722 				    dbbufsize)))) &&
1723 
1724 				    (!preftab.mnt_mountp || mntfs_same_word(
1725 				    preftab.mnt_mountp, prefbuf, MNT_LINE_MAX,
1726 				    (off_t)dbtabp->mnt_mountp, dbbufp,
1727 				    dbbufsize)) &&
1728 
1729 				    (!preftab.mnt_fstype || mntfs_same_word(
1730 				    preftab.mnt_fstype, prefbuf, MNT_LINE_MAX,
1731 				    (off_t)dbtabp->mnt_fstype, dbbufp,
1732 				    dbbufsize)) &&
1733 
1734 				    (!preftab.mnt_mntopts || mntfs_same_word(
1735 				    preftab.mnt_mntopts, prefbuf, MNT_LINE_MAX,
1736 				    (off_t)dbtabp->mnt_mntopts, dbbufp,
1737 				    dbbufsize)) &&
1738 
1739 				    (!preftab.mnt_time || mntfs_same_word(
1740 				    preftab.mnt_time, prefbuf, MNT_LINE_MAX,
1741 				    (off_t)dbtabp->mnt_time, dbbufp,
1742 				    dbbufsize)))
1743 					break;
1744 			}
1745 			rw_exit(dblockp);
1746 
1747 			if (elemp == NULL || type == 0 ||
1748 			    type == mntfs_special_info_element(elemp, cr))
1749 				break;
1750 
1751 			rw_enter(dblockp, RW_READER);
1752 			elemp = mntfs_get_next_elem(snapp, elemp);
1753 		}
1754 
1755 		kmem_free(prefbuf, MNT_LINE_MAX);
1756 
1757 		/* If we failed to find a match then return EOF. */
1758 		if (elemp == NULL) {
1759 			rw_exit(&mnp->mnt_contents);
1760 			*rvalp = MNTFS_EOF;
1761 			break;
1762 		}
1763 
1764 		/*
1765 		 * Check that the text buffer offered by the user will be large
1766 		 * enough to accommodate the text for this entry.
1767 		 */
1768 		if (elemp->mnte_text_size > MNT_LINE_MAX) {
1769 			rw_exit(&mnp->mnt_contents);
1770 			*rvalp = MNTFS_TOOLONG;
1771 			break;
1772 		}
1773 
1774 		/*
1775 		 * Populate the user's struct mnttab and text buffer using the
1776 		 * element's contents.
1777 		 */
1778 		if (mntfs_copyout_elem(elemp, uemp, ubufp, cmd, datamodel)) {
1779 			error = EFAULT;
1780 		} else {
1781 			rw_enter(dblockp, RW_READER);
1782 			elemp = mntfs_get_next_elem(snapp, elemp);
1783 			rw_exit(dblockp);
1784 			snapp->mnts_next = elemp;
1785 		}
1786 		rw_exit(&mnp->mnt_contents);
1787 		break;
1788 	}
1789 
1790 	case MNTIOC_GETMNTENT:
1791 	case MNTIOC_GETEXTMNTENT:
1792 	{
1793 		STRUCT_DECL(mntentbuf, embuf);	/* Our copy of user's embuf */
1794 		struct extmnttab *uemp;		/* uaddr of user's emp */
1795 		char *ubufp;			/* uaddr of user's text buf */
1796 		size_t ubufsize;		/* size of the above */
1797 		mntelem_t *elemp;		/* a database element */
1798 
1799 
1800 		rw_enter(&mnp->mnt_contents, RW_WRITER);
1801 		if (snapp->mnts_nmnts == 0 ||
1802 		    (snapp->mnts_flags & MNTS_REWIND))
1803 			mntfs_snapshot(mnp, snapp);
1804 		if ((elemp = snapp->mnts_next) == NULL) {
1805 			rw_exit(&mnp->mnt_contents);
1806 			*rvalp = MNTFS_EOF;
1807 			break;
1808 		}
1809 
1810 		/*
1811 		 * embuf is a struct embuf within the kernel. We copy into it
1812 		 * the struct embuf supplied by the user.
1813 		 */
1814 		STRUCT_INIT(embuf, datamodel);
1815 		if (copyin((void *) arg, STRUCT_BUF(embuf),
1816 		    STRUCT_SIZE(embuf))) {
1817 			rw_exit(&mnp->mnt_contents);
1818 			error = EFAULT;
1819 			break;
1820 		}
1821 		uemp = STRUCT_FGETP(embuf, mbuf_emp);
1822 		ubufp = STRUCT_FGETP(embuf, mbuf_buf);
1823 		ubufsize = STRUCT_FGET(embuf, mbuf_bufsize);
1824 
1825 		/*
1826 		 * Check that the text buffer offered by the user will be large
1827 		 * enough to accommodate the text for this entry.
1828 		 */
1829 		if (elemp->mnte_text_size > ubufsize) {
1830 			rw_exit(&mnp->mnt_contents);
1831 			*rvalp = MNTFS_TOOLONG;
1832 			break;
1833 		}
1834 
1835 		/*
1836 		 * Populate the user's struct mnttab and text buffer using the
1837 		 * element's contents.
1838 		 */
1839 		if (mntfs_copyout_elem(elemp, uemp, ubufp, cmd, datamodel)) {
1840 			error = EFAULT;
1841 		} else {
1842 			rw_enter(dblockp, RW_READER);
1843 			elemp = mntfs_get_next_elem(snapp, elemp);
1844 			rw_exit(dblockp);
1845 			snapp->mnts_next = elemp;
1846 		}
1847 		rw_exit(&mnp->mnt_contents);
1848 		break;
1849 	}
1850 
1851 	default:
1852 		error = EINVAL;
1853 		break;
1854 	}
1855 
1856 	return (error);
1857 }
1858 
1859 /*
1860  * mntfs provides a new vnode for each open(2). Two vnodes will represent the
1861  * same instance of /etc/mnttab if they share the same (zone-specific) vfs.
1862  */
1863 /* ARGSUSED */
1864 int
1865 mntcmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
1866 {
1867 	return (vp1 != NULL && vp2 != NULL && vp1->v_vfsp == vp2->v_vfsp);
1868 }
1869 
/*
 * /mntfs vnode operations vector
 *
 * Each entry pairs the name of a vnode operation with the mntfs function
 * that implements it. VOPNAME_DISPOSE and VOPNAME_SHRLOCK are explicitly
 * mapped to fs_error. The table is terminated by a NULL entry.
 */
const fs_operation_def_t mnt_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = mntopen },
	VOPNAME_CLOSE,		{ .vop_close = mntclose },
	VOPNAME_READ,		{ .vop_read = mntread },
	VOPNAME_IOCTL,		{ .vop_ioctl = mntioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = mntgetattr },
	VOPNAME_ACCESS,		{ .vop_access = mntaccess },
	VOPNAME_FSYNC,		{ .vop_fsync = mntfsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = mntinactive },
	VOPNAME_SEEK,		{ .vop_seek = mntseek },
	VOPNAME_POLL,		{ .vop_poll = mntpoll },
	VOPNAME_CMP,		{ .vop_cmp = mntcmp },
	/* Unsupported operations. */
	VOPNAME_DISPOSE,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	/* End of table. */
	NULL,			NULL
};
1889