xref: /illumos-gate/usr/src/uts/common/fs/mntfs/mntvnops.c (revision ce17336ed725d3b7fdff67bf0a0ee2b55018fec6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/file.h>
26 #include <sys/stat.h>
27 #include <sys/atomic.h>
28 #include <sys/mntio.h>
29 #include <sys/mnttab.h>
30 #include <sys/mount.h>
31 #include <sys/sunddi.h>
32 #include <sys/sysmacros.h>
33 #include <sys/systm.h>
34 #include <sys/vfs.h>
35 #include <sys/vfs_opreg.h>
36 #include <sys/fs/mntdata.h>
37 #include <fs/fs_subr.h>
38 #include <sys/vmsystm.h>
39 #include <vm/seg_vn.h>
40 #include <sys/time.h>
41 #include <sys/ksynch.h>
42 #include <sys/sdt.h>
43 
/*
 * NOTE(review): presumably the inode number reported for the root of a mntfs
 * mount; the constant is not referenced in the visible part of this file, so
 * confirm against its users.
 */
44 #define	MNTROOTINO	2
45 
/*
 * Forward declaration. mntopen() calls this to obtain a fresh mntnode_t for
 * every open of the file.
 */
46 static mntnode_t *mntgetnode(vnode_t *);
47 
/*
 * NOTE(review): presumably the vnode operations vector for mntfs; it is not
 * assigned in the visible part of this file.
 */
48 vnodeops_t *mntvnodeops;
/* Declared extern here; the definition is not in this file. */
49 extern void vfs_mnttab_readop(void);
50 
51 /*
52  * Design of kernel mnttab accounting.
53  *
54  * mntfs provides two methods of reading the in-kernel mnttab, i.e. the state of
55  * the mounted resources: the read-only file /etc/mnttab, and a collection of
56  * ioctl() commands. Most of these interfaces are public and are described in
57  * mnttab(5). Three private ioctl() commands, MNTIOC_GETMNTENT,
58  * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY, provide for the getmntent(3C)
59  * family of functions, allowing them to support white space in mount names.
60  *
61  * A significant feature of mntfs is that it provides a file descriptor with a
62  * snapshot once it begins to consume mnttab data. Thus, as the process
63  * continues to consume data, its view of the in-kernel mnttab does not change
64  * even if resources are mounted or unmounted. The intent is to ensure that
65  * processes are guaranteed to read self-consistent data even as the system
66  * changes.
67  *
68  * The snapshot is implemented by a "database", unique to each zone, that
69  * comprises a linked list of mntelem_ts. The database is identified by
70  * zone_mntfs_db and is protected by zone_mntfs_db_lock. Each element contains
71  * the text entry in /etc/mnttab for a mounted resource, i.e. a vfs_t, and is
72  * marked with its time of "birth", i.e. creation. An element is "killed", and
73  * marked with its time of death, when it is found to be out of date, e.g. when
74  * the corresponding resource has been unmounted.
75  *
76  * When a process performs the first read() or ioctl() for a file descriptor for
77  * /etc/mnttab, the database is updated by a call to mntfs_snapshot() to ensure
78  * that an element exists for each currently mounted resource. Following this,
79  * the current time is written into a snapshot structure, a mntsnap_t, embedded
80  * in the descriptor's mntnode_t.
81  *
82  * mntfs is able to enumerate the /etc/mnttab entries corresponding to a
83  * particular file descriptor by searching the database for entries that were
84  * born before the appropriate snapshot and that either are still alive or died
85  * after the snapshot was created. Consumers use the iterator function
86  * mntfs_get_next_elem() to identify the next suitable element in the database.
87  *
88  * Each snapshot has a hold on its corresponding database elements, effected by
89  * a per-element reference count. At last close(), a snapshot is destroyed in
90  * mntfs_freesnap() by releasing all of its holds; an element is destroyed if
91  * its reference count becomes zero. Therefore the database never exists unless
92  * there is at least one active consumer of /etc/mnttab.
93  *
94  * getmntent(3C) et al. "do not open, close or rewind the file." This implies
95  * that getmntent() and read() must be able to operate without interaction on
96  * the same file descriptor; this is accomplished by the use of separate
97  * mntsnap_ts for both read() and ioctl().
98  *
99  * mntfs observes the following lock-ordering:
100  *
101  *	mnp->mnt_contents -> vfslist -> zonep->zone_mntfs_db_lock
102  *
103  * NOTE: The following variable enables the generation of the "dev=xxx"
104  * in the option string for a mounted file system.  Really this should
105  * be gotten rid of altogether, but for the sake of backwards compatibility
106  * we had to leave it in.  It is defined as a 32-bit device number.  This
107  * means that when 64-bit device numbers are in use, if either the major or
108  * minor part of the device number will not fit in a 16 bit quantity, the
109  * "dev=" will be set to NODEV (0x7fffffff).  See PSARC 1999/566 and
110  * 1999/131 for details.  The cmpldev() function used to generate the 32-bit
111  * device number handles this check and assigns the proper value.
112  */
/*
 * When non-zero, mntfs_optsize() and mntfs_optprint() account for and emit a
 * trailing "dev=<hex>" option in every entry's option string.
 */
113 int mntfs_enabledev = 1;	/* enable old "dev=xxx" option */
114 
/*
 * Declared extern here. Within this file it is used to stamp the birth and
 * death times of database elements and the creation time of a snapshot.
 */
115 extern void vfs_mono_time(timespec_t *);
/*
 * Results of mntfs_newest(): the first argument is newer, the second argument
 * is newer, or the two are identical.
 */
116 enum { MNTFS_FIRST, MNTFS_SECOND, MNTFS_NEITHER };
117 
118 /*
119  * Determine whether a field within a line from /etc/mnttab contains actual
120  * content or simply the marker string "-". This never applies to the time,
121  * therefore the delimiter must be a tab.
 *
 * The expression is false only when x points at the two characters "-\t".
 * The argument is expanded twice, so it must not have side effects; the
 * second character is examined only when the first is '-'.
122  */
123 #define	MNTFS_REAL_FIELD(x)	(*(x) != '-' || *((x) + 1) != '\t')
124 
125 static int
126 mntfs_devsize(struct vfs *vfsp)
127 {
128 	dev32_t odev;
129 
130 	(void) cmpldev(&odev, vfsp->vfs_dev);
131 	return (snprintf(NULL, 0, "dev=%x", odev));
132 }
133 
134 static int
135 mntfs_devprint(struct vfs *vfsp, char *buf)
136 {
137 	dev32_t odev;
138 
139 	(void) cmpldev(&odev, vfsp->vfs_dev);
140 	return (snprintf(buf, MAX_MNTOPT_STR, "dev=%x", odev));
141 }
142 
143 /* Identify which, if either, of two supplied timespec structs is newer. */
144 static int
145 mntfs_newest(timespec_t *a, timespec_t *b)
146 {
147 	if (a->tv_sec == b->tv_sec &&
148 	    a->tv_nsec == b->tv_nsec) {
149 		return (MNTFS_NEITHER);
150 	} else if (b->tv_sec > a->tv_sec ||
151 	    (b->tv_sec == a->tv_sec &&
152 	    b->tv_nsec > a->tv_nsec)) {
153 		return (MNTFS_SECOND);
154 	} else {
155 		return (MNTFS_FIRST);
156 	}
157 }
158 
159 static int
160 mntfs_optsize(struct vfs *vfsp)
161 {
162 	int i, size = 0;
163 	mntopt_t *mop;
164 
165 	for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
166 		mop = &vfsp->vfs_mntopts.mo_list[i];
167 		if (mop->mo_flags & MO_NODISPLAY)
168 			continue;
169 		if (mop->mo_flags & MO_SET) {
170 			if (size)
171 				size++; /* space for comma */
172 			size += strlen(mop->mo_name);
173 			/*
174 			 * count option value if there is one
175 			 */
176 			if (mop->mo_arg != NULL) {
177 				size += strlen(mop->mo_arg) + 1;
178 			}
179 		}
180 	}
181 	if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
182 		/*
183 		 * Add space for "zone=<zone_name>" if required.
184 		 */
185 		if (size)
186 			size++;	/* space for comma */
187 		size += sizeof ("zone=") - 1;
188 		size += strlen(vfsp->vfs_zone->zone_name);
189 	}
190 	if (mntfs_enabledev) {
191 		if (size != 0)
192 			size++; /* space for comma */
193 		size += mntfs_devsize(vfsp);
194 	}
195 	if (size == 0)
196 		size = strlen("-");
197 	return (size);
198 }
199 
200 static int
201 mntfs_optprint(struct vfs *vfsp, char *buf)
202 {
203 	int i, optinbuf = 0;
204 	mntopt_t *mop;
205 	char *origbuf = buf;
206 
207 	for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
208 		mop = &vfsp->vfs_mntopts.mo_list[i];
209 		if (mop->mo_flags & MO_NODISPLAY)
210 			continue;
211 		if (mop->mo_flags & MO_SET) {
212 			if (optinbuf)
213 				*buf++ = ',';
214 			else
215 				optinbuf = 1;
216 			buf += snprintf(buf, MAX_MNTOPT_STR,
217 			    "%s", mop->mo_name);
218 			/*
219 			 * print option value if there is one
220 			 */
221 			if (mop->mo_arg != NULL) {
222 				buf += snprintf(buf, MAX_MNTOPT_STR, "=%s",
223 				    mop->mo_arg);
224 			}
225 		}
226 	}
227 	if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
228 		if (optinbuf)
229 			*buf++ = ',';
230 		else
231 			optinbuf = 1;
232 		buf += snprintf(buf, MAX_MNTOPT_STR, "zone=%s",
233 		    vfsp->vfs_zone->zone_name);
234 	}
235 	if (mntfs_enabledev) {
236 		if (optinbuf++)
237 			*buf++ = ',';
238 		buf += mntfs_devprint(vfsp, buf);
239 	}
240 	if (!optinbuf) {
241 		buf += snprintf(buf, MAX_MNTOPT_STR, "-");
242 	}
243 	return (buf - origbuf);
244 }
245 
246 void
247 mntfs_populate_text(vfs_t *vfsp, zone_t *zonep, mntelem_t *elemp)
248 {
249 	struct extmnttab *tabp = &elemp->mnte_tab;
250 	const char *resource, *mntpt;
251 	char *cp = elemp->mnte_text;
252 	mntpt = refstr_value(vfsp->vfs_mntpt);
253 	resource = refstr_value(vfsp->vfs_resource);
254 
255 	tabp->mnt_special = 0;
256 	if (resource != NULL && resource[0] != '\0') {
257 		if (resource[0] != '/') {
258 			cp += snprintf(cp, MAXPATHLEN, "%s\t", resource);
259 		} else if (!ZONE_PATH_VISIBLE(resource, zonep)) {
260 			/*
261 			 * Use the mount point as the resource.
262 			 */
263 			cp += snprintf(cp, MAXPATHLEN, "%s\t",
264 			    ZONE_PATH_TRANSLATE(mntpt, zonep));
265 		} else {
266 			cp += snprintf(cp, MAXPATHLEN, "%s\t",
267 			    ZONE_PATH_TRANSLATE(resource, zonep));
268 		}
269 	} else {
270 		cp += snprintf(cp, MAXPATHLEN, "-\t");
271 	}
272 
273 	tabp->mnt_mountp = (char *)(cp - elemp->mnte_text);
274 	if (mntpt != NULL && mntpt[0] != '\0') {
275 		/*
276 		 * We know the mount point is visible from within the zone,
277 		 * otherwise it wouldn't be on the zone's vfs list.
278 		 */
279 		cp += snprintf(cp, MAXPATHLEN, "%s\t",
280 		    ZONE_PATH_TRANSLATE(mntpt, zonep));
281 	} else {
282 		cp += snprintf(cp, MAXPATHLEN, "-\t");
283 	}
284 
285 	tabp->mnt_fstype = (char *)(cp - elemp->mnte_text);
286 	cp += snprintf(cp, MAXPATHLEN, "%s\t",
287 	    vfssw[vfsp->vfs_fstype].vsw_name);
288 
289 	tabp->mnt_mntopts = (char *)(cp - elemp->mnte_text);
290 	cp += mntfs_optprint(vfsp, cp);
291 	*cp++ = '\t';
292 
293 	tabp->mnt_time = (char *)(cp - elemp->mnte_text);
294 	cp += snprintf(cp, MAX_MNTOPT_STR, "%ld", vfsp->vfs_mtime);
295 	*cp++ = '\n'; /* over-write snprintf's trailing null-byte */
296 
297 	tabp->mnt_major = getmajor(vfsp->vfs_dev);
298 	tabp->mnt_minor = getminor(vfsp->vfs_dev);
299 
300 	elemp->mnte_text_size = cp - elemp->mnte_text;
301 	elemp->mnte_vfs_ctime = vfsp->vfs_hrctime;
302 	elemp->mnte_hidden = vfsp->vfs_flag & VFS_NOMNTTAB;
303 }
304 
305 /* Determine the length of the /etc/mnttab entry for this vfs_t. */
306 static size_t
307 mntfs_text_len(vfs_t *vfsp, zone_t *zone)
308 {
309 	size_t size = 0;
310 	const char *resource, *mntpt;
311 	size_t mntsize;
312 
313 	mntpt = refstr_value(vfsp->vfs_mntpt);
314 	if (mntpt != NULL && mntpt[0] != '\0') {
315 		mntsize = strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
316 	} else {
317 		mntsize = 2;	/* "-\t" */
318 	}
319 	size += mntsize;
320 
321 	resource = refstr_value(vfsp->vfs_resource);
322 	if (resource != NULL && resource[0] != '\0') {
323 		if (resource[0] != '/') {
324 			size += strlen(resource) + 1;
325 		} else if (!ZONE_PATH_VISIBLE(resource, zone)) {
326 			/*
327 			 * Same as the zone's view of the mount point.
328 			 */
329 			size += mntsize;
330 		} else {
331 			size += strlen(ZONE_PATH_TRANSLATE(resource, zone)) + 1;
332 		}
333 	} else {
334 		size += 2;	/* "-\t" */
335 	}
336 	size += strlen(vfssw[vfsp->vfs_fstype].vsw_name) + 1;
337 	size += mntfs_optsize(vfsp);
338 	size += snprintf(NULL, 0, "\t%ld\n", vfsp->vfs_mtime);
339 	return (size);
340 }
341 
342 /* Destroy the resources associated with a snapshot element. */
343 static void
344 mntfs_destroy_elem(mntelem_t *elemp)
345 {
346 	kmem_free(elemp->mnte_text, elemp->mnte_text_size);
347 	kmem_free(elemp, sizeof (mntelem_t));
348 }
349 
350 /*
351  * Return 1 if the given snapshot is in the range of the given element; return
352  * 0 otherwise.
353  */
354 static int
355 mntfs_elem_in_range(mntsnap_t *snapp, mntelem_t *elemp)
356 {
357 	timespec_t	*stimep = &snapp->mnts_time;
358 	timespec_t	*btimep = &elemp->mnte_birth;
359 	timespec_t	*dtimep = &elemp->mnte_death;
360 
361 	/*
362 	 * If a snapshot is in range of an element then the snapshot must have
363 	 * been created after the birth of the element, and either the element
364 	 * is still alive or it died after the snapshot was created.
365 	 */
366 	if (mntfs_newest(btimep, stimep) == MNTFS_SECOND &&
367 	    (MNTFS_ELEM_IS_ALIVE(elemp) ||
368 	    mntfs_newest(stimep, dtimep) == MNTFS_SECOND))
369 		return (1);
370 	else
371 		return (0);
372 }
373 
374 /*
375  * Return the next valid database element, after the one provided, for a given
376  * snapshot; return NULL if none exists. The caller must hold the zone's
377  * database lock as a reader before calling this function.
378  */
379 static mntelem_t *
380 mntfs_get_next_elem(mntsnap_t *snapp, mntelem_t *elemp)
381 {
382 	int show_hidden = snapp->mnts_flags & MNTS_SHOWHIDDEN;
383 
384 	do {
385 		elemp = elemp->mnte_next;
386 	} while (elemp &&
387 	    (!mntfs_elem_in_range(snapp, elemp) ||
388 	    (!show_hidden && elemp->mnte_hidden)));
389 	return (elemp);
390 }
391 
392 /*
393  * This function frees the resources associated with a mntsnap_t. It walks
394  * through the database, decrementing the reference count of any element that
395  * satisfies the snapshot. If the reference count of an element becomes zero
396  * then it is removed from the database.
397  */
398 static void
399 mntfs_freesnap(mntnode_t *mnp, mntsnap_t *snapp)
400 {
401 	zone_t *zonep = MTOD(mnp)->mnt_zone_ref.zref_zone;
402 	krwlock_t *dblockp = &zonep->zone_mntfs_db_lock;
403 	mntelem_t **elempp = &zonep->zone_mntfs_db;
404 	mntelem_t *elemp;
405 	int show_hidden = snapp->mnts_flags & MNTS_SHOWHIDDEN;
406 	size_t number_decremented = 0;
407 
408 	ASSERT(RW_WRITE_HELD(&mnp->mnt_contents));
409 
410 	/* Ignore an uninitialised snapshot. */
411 	if (snapp->mnts_nmnts == 0)
412 		return;
413 
414 	/* Drop the holds on any matching database elements. */
415 	rw_enter(dblockp, RW_WRITER);
416 	while ((elemp = *elempp) != NULL) {
417 		if (mntfs_elem_in_range(snapp, elemp) &&
418 		    (!elemp->mnte_hidden || show_hidden) &&
419 		    ++number_decremented && --elemp->mnte_refcnt == 0) {
420 			if ((*elempp = elemp->mnte_next) != NULL)
421 				(*elempp)->mnte_prev = elemp->mnte_prev;
422 			mntfs_destroy_elem(elemp);
423 		} else {
424 			elempp = &elemp->mnte_next;
425 		}
426 	}
427 	rw_exit(dblockp);
428 	ASSERT(number_decremented == snapp->mnts_nmnts);
429 
430 	/* Clear the snapshot data. */
431 	bzero(snapp, sizeof (mntsnap_t));
432 }
433 
434 /* Insert the new database element newp after the existing element prevp. */
435 static void
436 mntfs_insert_after(mntelem_t *newp, mntelem_t *prevp)
437 {
438 	newp->mnte_prev = prevp;
439 	newp->mnte_next = prevp->mnte_next;
440 	prevp->mnte_next = newp;
441 	if (newp->mnte_next != NULL)
442 		newp->mnte_next->mnte_prev = newp;
443 }
444 
445 /* Create and return a copy of a given database element. */
446 static mntelem_t *
447 mntfs_copy(mntelem_t *origp)
448 {
449 	mntelem_t *copyp;
450 
451 	copyp = kmem_zalloc(sizeof (mntelem_t), KM_SLEEP);
452 	copyp->mnte_vfs_ctime = origp->mnte_vfs_ctime;
453 	copyp->mnte_text_size = origp->mnte_text_size;
454 	copyp->mnte_text = kmem_alloc(copyp->mnte_text_size, KM_SLEEP);
455 	bcopy(origp->mnte_text, copyp->mnte_text, copyp->mnte_text_size);
456 	copyp->mnte_tab = origp->mnte_tab;
457 	copyp->mnte_hidden = origp->mnte_hidden;
458 
459 	return (copyp);
460 }
461 
462 /*
463  * Compare two database elements and determine whether or not the vfs_t payload
464  * data of each are the same. Return 1 if so and 0 otherwise.
465  */
466 static int
467 mntfs_is_same_element(mntelem_t *a, mntelem_t *b)
468 {
469 	if (a->mnte_hidden == b->mnte_hidden &&
470 	    a->mnte_text_size == b->mnte_text_size &&
471 	    bcmp(a->mnte_text, b->mnte_text, a->mnte_text_size) == 0 &&
472 	    bcmp(&a->mnte_tab, &b->mnte_tab, sizeof (struct extmnttab)) == 0)
473 		return (1);
474 	else
475 		return (0);
476 }
477 
478 /*
479  * mntfs_snapshot() updates the database, creating it if necessary, so that it
480  * accurately reflects the state of the in-kernel mnttab. It also increments
481  * the reference count on all database elements that correspond to currently-
482  * mounted resources. Finally, it initialises the appropriate snapshot
483  * structure.
484  *
485  * Each vfs_t is given a high-resolution time stamp, for the benefit of mntfs,
486  * when it is inserted into the in-kernel mnttab. This time stamp is copied into
487  * the corresponding database element when it is created, allowing the element
488  * and the vfs_t to be identified as a pair. It is possible that some file
489  * systems may make unadvertised changes to, for example, a resource's mount
490  * options. Therefore, in order to determine whether a database element is an
491  * up-to-date representation of a given vfs_t, it is compared with a temporary
492  * element generated for this purpose. Although less efficient, this is safer
493  * than implementing an mtime for a vfs_t.
494  *
495  * Some mounted resources are marked as "hidden" with a VFS_NOMNTTAB flag. These
496  * are considered invisible unless the user has already set the MNT_SHOWHIDDEN
497  * flag in the vnode using the MNTIOC_SHOWHIDDEN ioctl.
498  */
/*
 * Arguments:
 *	mnp	the mntnode_t that owns the snapshot. The caller must hold
 *		mnp->mnt_contents as a writer. Its mnt_flags determine whether
 *		hidden resources are included.
 *	snapp	the snapshot to initialise. If it is already in use and the
 *		in-kernel mnttab's modification time is unchanged since it was
 *		taken, only its iteration state is reset; otherwise it is
 *		released with mntfs_freesnap() and rebuilt from scratch.
 *
 * Locking: the vfs list read lock is taken first, then the zone's database
 * lock as a writer; both are dropped before returning.
 *
 * Side effects: besides the database and the snapshot, the mount's recorded
 * mnttab size and mtime (and, when hidden resources are shown, the hidden
 * size and mtime) are updated.
 */
499 static void
500 mntfs_snapshot(mntnode_t *mnp, mntsnap_t *snapp)
501 {
502 	mntdata_t	*mnd = MTOD(mnp);
503 	zone_t		*zonep = mnd->mnt_zone_ref.zref_zone;
504 	int		is_global_zone = (zonep == global_zone);
505 	int		show_hidden = mnp->mnt_flags & MNT_SHOWHIDDEN;
506 	vfs_t		*vfsp, *firstvfsp, *lastvfsp;
507 	vfs_t		dummyvfs;
508 	vfs_t		*dummyvfsp = NULL;
509 	krwlock_t	*dblockp = &zonep->zone_mntfs_db_lock;
510 	mntelem_t	**headpp = &zonep->zone_mntfs_db;
511 	mntelem_t	*elemp;
512 	mntelem_t	*prevp = NULL;
513 	int		order;
514 	mntelem_t	*tempelemp;
515 	mntelem_t	*newp;
516 	mntelem_t	*firstp = NULL;
517 	size_t		nmnts = 0;
518 	size_t		total_text_size = 0;
519 	size_t		normal_text_size = 0;
520 	int		insert_before;
521 	timespec_t	last_mtime;
522 	size_t		entry_length, new_entry_length;
523 
524 
525 	ASSERT(RW_WRITE_HELD(&mnp->mnt_contents));
	/*
	 * The vfs list lock is held for the whole walk; the mnttab
	 * modification time is sampled while it is held.
	 */
526 	vfs_list_read_lock();
527 	vfs_mnttab_modtime(&last_mtime);
528 
529 	/*
530 	 * If this snapshot already exists then we must have been asked to
531 	 * rewind the file, i.e. discard the snapshot and create a new one in
532 	 * its place. In this case we first see if the in-kernel mnttab has
533 	 * advertised a change; if not then we simply reinitialise the metadata.
534 	 */
535 	if (snapp->mnts_nmnts) {
536 		if (mntfs_newest(&last_mtime, &snapp->mnts_last_mtime) ==
537 		    MNTFS_NEITHER) {
538 			/*
539 			 * An unchanged mtime is no guarantee that the
540 			 * in-kernel mnttab is unchanged; for example, a
541 			 * concurrent remount may be between calls to
542 			 * vfs_setmntopt_nolock() and vfs_mnttab_modtimeupd().
543 			 * It follows that the database may have changed, and
544 			 * in particular that some elements in this snapshot
545 			 * may have been killed by another call to
546 			 * mntfs_snapshot(). It is therefore not merely
547 			 * unnecessary to update the snapshot's time but in
548 			 * fact dangerous; it needs to be left alone.
549 			 */
550 			snapp->mnts_next = snapp->mnts_first;
551 			snapp->mnts_flags &= ~MNTS_REWIND;
552 			snapp->mnts_foffset = snapp->mnts_ieoffset = 0;
553 			vfs_list_unlock();
554 			return;
555 		} else {
556 			mntfs_freesnap(mnp, snapp);
557 		}
558 	}
559 
560 	/*
561 	 * Create a temporary database element. For each vfs_t, the temporary
562 	 * element will be populated with the corresponding text. If the vfs_t
563 	 * does not have a corresponding element within the database, or if
564 	 * there is such an element but it is stale, a copy of the temporary
565 	 * element is inserted into the database at the appropriate location.
566 	 */
567 	tempelemp = kmem_alloc(sizeof (mntelem_t), KM_SLEEP);
	/*
	 * Start with MNT_LINE_MAX bytes of text; the buffer is replaced by a
	 * larger one below whenever an entry needs more.
	 */
568 	entry_length = MNT_LINE_MAX;
569 	tempelemp->mnte_text = kmem_alloc(entry_length, KM_SLEEP);
570 
571 	/* Find the first and last vfs_t for the given zone. */
572 	if (is_global_zone) {
573 		firstvfsp = rootvfs;
574 		lastvfsp = firstvfsp->vfs_prev;
575 	} else {
576 		firstvfsp = zonep->zone_vfslist;
577 		/*
578 		 * If there isn't already a vfs_t for root then we create a
579 		 * dummy which will be used as the head of the list (which will
580 		 * therefore no longer be circular).
581 		 */
582 		if (firstvfsp == NULL ||
583 		    strcmp(refstr_value(firstvfsp->vfs_mntpt),
584 		    zonep->zone_rootpath) != 0) {
585 			/*
586 			 * The zone's vfs_ts will have mount points relative to
587 			 * the zone's root path. The vfs_t for the zone's
588 			 * root file system would therefore have a mount point
589 			 * equal to the zone's root path. Since the zone's root
590 			 * path isn't a mount point, we copy the vfs_t of the
591 			 * zone's root vnode, and provide it with a fake mount
592 			 * and resource. However, if the zone's root is a
593 			 * zfs dataset, use the dataset name as the resource.
594 			 *
595 			 * Note that by cloning another vfs_t we also acquire
596 			 * its high-resolution ctime. This might appear to
597 			 * violate the requirement that the ctimes in the list
598 			 * of vfs_ts are unique and monotonically increasing;
599 			 * this is not the case. The dummy vfs_t appears in only
600 			 * a non-global zone's vfs_t list, where the cloned
601 			 * vfs_t would not ordinarily be visible; the ctimes are
602 			 * therefore unique. The zone's root path must be
603 			 * available before the zone boots, and so its root
604 			 * vnode's vfs_t's ctime must be lower than those of any
605 			 * resources subsequently mounted by the zone. The
606 			 * ctimes are therefore monotonically increasing.
607 			 */
608 			dummyvfs = *zonep->zone_rootvp->v_vfsp;
			/* This reference is dropped in the clean-up below. */
609 			dummyvfs.vfs_mntpt = refstr_alloc(zonep->zone_rootpath);
610 			if (strcmp(vfssw[dummyvfs.vfs_fstype].vsw_name, "zfs")
611 			    != 0)
612 				dummyvfs.vfs_resource = dummyvfs.vfs_mntpt;
613 			dummyvfsp = &dummyvfs;
614 			if (firstvfsp == NULL) {
615 				lastvfsp = dummyvfsp;
616 			} else {
617 				lastvfsp = firstvfsp->vfs_zone_prev;
618 				dummyvfsp->vfs_zone_next = firstvfsp;
619 			}
620 			firstvfsp = dummyvfsp;
621 		} else {
622 			lastvfsp = firstvfsp->vfs_zone_prev;
623 		}
624 	}
625 
626 	/*
627 	 * Now walk through all the vfs_ts for this zone. For each one, find the
628 	 * corresponding database element, creating it first if necessary, and
629 	 * increment its reference count.
630 	 */
631 	rw_enter(dblockp, RW_WRITER);
632 	elemp = zonep->zone_mntfs_db;
633 	/* CSTYLED */
634 	for (vfsp = firstvfsp;;
635 	    vfsp = is_global_zone ? vfsp->vfs_next : vfsp->vfs_zone_next) {
636 		DTRACE_PROBE1(new__vfs, vfs_t *, vfsp);
637 		/* Consider only visible entries. */
638 		if ((vfsp->vfs_flag & VFS_NOMNTTAB) == 0 || show_hidden) {
639 			/*
640 			 * Walk through the existing database looking for either
641 			 * an element that matches the current vfs_t, or for the
642 			 * correct place in which to insert a new element.
643 			 */
644 			insert_before = 0;
645 			for (; elemp; prevp = elemp, elemp = elemp->mnte_next) {
646 				DTRACE_PROBE1(considering__elem, mntelem_t *,
647 				    elemp);
648 
649 				/* Compare the vfs_t with the element. */
650 				order = mntfs_newest(&elemp->mnte_vfs_ctime,
651 				    &vfsp->vfs_hrctime);
652 
653 				/*
654 				 * If we encounter a database element newer than
655 				 * this vfs_t then we've stepped over a gap
656 				 * where the element for this vfs_t must be
657 				 * inserted.
658 				 */
659 				if (order == MNTFS_FIRST) {
660 					insert_before = 1;
661 					break;
662 				}
663 
664 				/* Dead elements no longer interest us. */
665 				if (MNTFS_ELEM_IS_DEAD(elemp))
666 					continue;
667 
668 				/*
669 				 * If the time stamps are the same then the
670 				 * element is a potential match for the vfs_t,
671 				 * although it may later prove to be stale.
672 				 */
673 				if (order == MNTFS_NEITHER)
674 					break;
675 
676 				/*
677 				 * This element must be older than the vfs_t.
678 				 * It must, therefore, correspond to a vfs_t
679 				 * that has been unmounted. Since the element is
680 				 * still alive, we kill it if it is visible.
681 				 */
682 				if (!elemp->mnte_hidden || show_hidden)
683 					vfs_mono_time(&elemp->mnte_death);
684 			}
685 			DTRACE_PROBE2(possible__match, vfs_t *, vfsp,
686 			    mntelem_t *, elemp);
687 
688 			/* Create a new database element if required. */
689 			new_entry_length = mntfs_text_len(vfsp, zonep);
690 			if (new_entry_length > entry_length) {
691 				kmem_free(tempelemp->mnte_text, entry_length);
692 				tempelemp->mnte_text =
693 				    kmem_alloc(new_entry_length, KM_SLEEP);
694 				entry_length = new_entry_length;
695 			}
696 			mntfs_populate_text(vfsp, zonep, tempelemp);
			/* The predicted and the generated lengths must agree. */
697 			ASSERT(tempelemp->mnte_text_size == new_entry_length);
698 			if (elemp == NULL) {
699 				/*
700 				 * We ran off the end of the database. Insert a
701 				 * new element at the end.
702 				 */
703 				newp = mntfs_copy(tempelemp);
704 				vfs_mono_time(&newp->mnte_birth);
705 				if (prevp) {
706 					mntfs_insert_after(newp, prevp);
707 				} else {
708 					newp->mnte_next = NULL;
709 					newp->mnte_prev = NULL;
710 					ASSERT(*headpp == NULL);
711 					*headpp = newp;
712 				}
713 				elemp = newp;
714 			} else if (insert_before) {
715 				/*
716 				 * Insert a new element before the current one.
717 				 */
718 				newp = mntfs_copy(tempelemp);
719 				vfs_mono_time(&newp->mnte_birth);
720 				if (prevp) {
721 					mntfs_insert_after(newp, prevp);
722 				} else {
723 					newp->mnte_next = elemp;
724 					newp->mnte_prev = NULL;
725 					elemp->mnte_prev = newp;
726 					ASSERT(*headpp == elemp);
727 					*headpp = newp;
728 				}
729 				elemp = newp;
730 			} else if (!mntfs_is_same_element(elemp, tempelemp)) {
731 				/*
732 				 * The element corresponds to the vfs_t, but the
733 				 * vfs_t has changed; it must have been
734 				 * remounted. Kill the old element and insert a
735 				 * new one after it.
736 				 */
737 				vfs_mono_time(&elemp->mnte_death);
738 				newp = mntfs_copy(tempelemp);
739 				vfs_mono_time(&newp->mnte_birth);
740 				mntfs_insert_after(newp, elemp);
741 				elemp = newp;
742 			}
743 
744 			/* We've found the corresponding element. Hold it. */
745 			DTRACE_PROBE1(incrementing, mntelem_t *, elemp);
746 			elemp->mnte_refcnt++;
747 
748 			/*
749 			 * Update the parameters used to initialise the
750 			 * snapshot.
751 			 */
752 			nmnts++;
753 			total_text_size += elemp->mnte_text_size;
754 			if (!elemp->mnte_hidden)
755 				normal_text_size += elemp->mnte_text_size;
756 			if (!firstp)
757 				firstp = elemp;
758 
			/*
			 * Resume the database walk, for the next vfs_t, at the
			 * element following the one just held.
			 */
759 			prevp = elemp;
760 			elemp = elemp->mnte_next;
761 		}
762 
		/*
		 * The for statement has no condition of its own; the walk
		 * ends here once the last vfs_t has been processed.
		 */
763 		if (vfsp == lastvfsp)
764 			break;
765 	}
766 
767 	/*
768 	 * Any remaining visible database elements that are still alive must be
769 	 * killed now, because their corresponding vfs_ts must have been
770 	 * unmounted.
771 	 */
772 	for (; elemp; elemp = elemp->mnte_next) {
773 		if (MNTFS_ELEM_IS_ALIVE(elemp) &&
774 		    (!elemp->mnte_hidden || show_hidden))
775 			vfs_mono_time(&elemp->mnte_death);
776 	}
777 
778 	/* Initialise the snapshot. */
779 	vfs_mono_time(&snapp->mnts_time);
780 	snapp->mnts_last_mtime = last_mtime;
781 	snapp->mnts_first = snapp->mnts_next = firstp;
782 	snapp->mnts_flags = show_hidden ? MNTS_SHOWHIDDEN : 0;
783 	snapp->mnts_nmnts = nmnts;
784 	snapp->mnts_text_size = total_text_size;
785 	snapp->mnts_foffset = snapp->mnts_ieoffset = 0;
786 
787 	/*
788 	 * Record /etc/mnttab's current size and mtime for possible future use
789 	 * by mntgetattr().
790 	 */
791 	mnd->mnt_size = normal_text_size;
792 	mnd->mnt_mtime = last_mtime;
793 	if (show_hidden) {
794 		mnd->mnt_hidden_size = total_text_size;
795 		mnd->mnt_hidden_mtime = last_mtime;
796 	}
797 
798 	/* Clean up. */
799 	rw_exit(dblockp);
800 	vfs_list_unlock();
801 	if (dummyvfsp != NULL)
802 		refstr_rele(dummyvfsp->vfs_mntpt);
803 	kmem_free(tempelemp->mnte_text, entry_length);
804 	kmem_free(tempelemp, sizeof (mntelem_t));
805 }
806 
807 /*
808  * Public function to convert vfs_mntopts into a string.
809  * A buffer of sufficient size is allocated, which is returned via bufp,
810  * and whose length is returned via lenp.
811  */
812 void
813 mntfs_getmntopts(struct vfs *vfsp, char **bufp, size_t *lenp)
814 {
815 	size_t len;
816 	char *buf;
817 
818 	vfs_list_read_lock();
819 
820 	len = mntfs_optsize(vfsp) + 1;
821 	buf = kmem_alloc(len, KM_NOSLEEP);
822 	if (buf == NULL) {
823 		*bufp = NULL;
824 		vfs_list_unlock();
825 		return;
826 	}
827 	buf[len - 1] = '\0';
828 	(void) mntfs_optprint(vfsp, buf);
829 	ASSERT(buf[len - 1] == '\0');
830 
831 	vfs_list_unlock();
832 	*bufp = buf;
833 	*lenp = len;
834 }
835 
836 /* ARGSUSED */
837 static int
838 mntopen(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
839 {
840 	vnode_t *vp = *vpp;
841 	mntnode_t *nmnp;
842 
843 	/*
844 	 * Not allowed to open for writing, return error.
845 	 */
846 	if (flag & FWRITE)
847 		return (EPERM);
848 	/*
849 	 * Create a new mnt/vnode for each open, this will give us a handle to
850 	 * hang the snapshot on.
851 	 */
852 	nmnp = mntgetnode(vp);
853 
854 	*vpp = MTOV(nmnp);
855 	atomic_inc_32(&MTOD(nmnp)->mnt_nopen);
856 	VN_RELE(vp);
857 	return (0);
858 }
859 
860 /* ARGSUSED */
861 static int
862 mntclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
863 	caller_context_t *ct)
864 {
865 	mntnode_t *mnp = VTOM(vp);
866 
867 	/* Clean up any locks or shares held by the current process */
868 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
869 	cleanshares(vp, ttoproc(curthread)->p_pid);
870 
871 	if (count > 1)
872 		return (0);
873 	if (vp->v_count == 1) {
874 		rw_enter(&mnp->mnt_contents, RW_WRITER);
875 		mntfs_freesnap(mnp, &mnp->mnt_read);
876 		mntfs_freesnap(mnp, &mnp->mnt_ioctl);
877 		rw_exit(&mnp->mnt_contents);
878 		atomic_dec_32(&MTOD(mnp)->mnt_nopen);
879 	}
880 	return (0);
881 }
882 
883 /* ARGSUSED */
884 static int
885 mntread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, caller_context_t *ct)
886 {
887 	mntnode_t *mnp = VTOM(vp);
888 	zone_t *zonep = MTOD(mnp)->mnt_zone_ref.zref_zone;
889 	mntsnap_t *snapp = &mnp->mnt_read;
890 	off_t off = uio->uio_offset;
891 	size_t len = uio->uio_resid;
892 	char *bufferp;
893 	size_t available, copylen;
894 	size_t written = 0;
895 	mntelem_t *elemp;
896 	krwlock_t *dblockp = &zonep->zone_mntfs_db_lock;
897 	int error = 0;
898 	off_t	ieoffset;
899 
900 	rw_enter(&mnp->mnt_contents, RW_WRITER);
901 	if (snapp->mnts_nmnts == 0 || (off == (off_t)0))
902 		mntfs_snapshot(mnp, snapp);
903 
904 	if ((size_t)(off + len) > snapp->mnts_text_size)
905 		len = snapp->mnts_text_size - off;
906 
907 	if (off < 0 || len > snapp->mnts_text_size) {
908 		rw_exit(&mnp->mnt_contents);
909 		return (EFAULT);
910 	}
911 
912 	if (len == 0) {
913 		rw_exit(&mnp->mnt_contents);
914 		return (0);
915 	}
916 
917 	/*
918 	 * For the file offset provided, locate the corresponding database
919 	 * element and calculate the corresponding offset within its text. If
920 	 * the file offset is the same as that reached during the last read(2)
921 	 * then use the saved element and intra-element offset.
922 	 */
923 	rw_enter(dblockp, RW_READER);
924 	if (off == 0 || (off == snapp->mnts_foffset)) {
925 		elemp = snapp->mnts_next;
926 		ieoffset = snapp->mnts_ieoffset;
927 	} else {
928 		off_t total_off;
929 		/*
930 		 * Find the element corresponding to the requested file offset
931 		 * by walking through the database and summing the text sizes
932 		 * of the individual elements. If the requested file offset is
933 		 * greater than that reached on the last visit then we can start
934 		 * at the last seen element; otherwise, we have to start at the
935 		 * beginning.
936 		 */
937 		if (off > snapp->mnts_foffset) {
938 			elemp = snapp->mnts_next;
939 			total_off = snapp->mnts_foffset - snapp->mnts_ieoffset;
940 		} else {
941 			elemp = snapp->mnts_first;
942 			total_off = 0;
943 		}
944 		while (off > total_off + elemp->mnte_text_size) {
945 			total_off += elemp->mnte_text_size;
946 			elemp = mntfs_get_next_elem(snapp, elemp);
947 			ASSERT(elemp != NULL);
948 		}
949 		/* Calculate the intra-element offset. */
950 		if (off > total_off)
951 			ieoffset = off - total_off;
952 		else
953 			ieoffset = 0;
954 	}
955 
956 	/*
957 	 * Create a buffer and populate it with the text from successive
958 	 * database elements until it is full.
959 	 */
960 	bufferp = kmem_alloc(len, KM_SLEEP);
961 	while (written < len) {
962 		available = elemp->mnte_text_size - ieoffset;
963 		copylen = MIN(len - written, available);
964 		bcopy(elemp->mnte_text + ieoffset, bufferp + written, copylen);
965 		written += copylen;
966 		if (copylen == available) {
967 			elemp = mntfs_get_next_elem(snapp, elemp);
968 			ASSERT(elemp != NULL || written == len);
969 			ieoffset = 0;
970 		} else {
971 			ieoffset += copylen;
972 		}
973 	}
974 	rw_exit(dblockp);
975 
976 	/*
977 	 * Write the populated buffer, update the snapshot's state if
978 	 * successful and then advertise our read.
979 	 */
980 	error = uiomove(bufferp, len, UIO_READ, uio);
981 	if (error == 0) {
982 		snapp->mnts_next = elemp;
983 		snapp->mnts_foffset = off + len;
984 		snapp->mnts_ieoffset = ieoffset;
985 	}
986 	vfs_mnttab_readop();
987 	rw_exit(&mnp->mnt_contents);
988 
989 	/* Clean up. */
990 	kmem_free(bufferp, len);
991 	return (error);
992 }
993 
994 static int
995 mntgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
996 	caller_context_t *ct)
997 {
998 	int mask = vap->va_mask;
999 	int error;
1000 	mntnode_t *mnp = VTOM(vp);
1001 	timespec_t mtime, old_mtime;
1002 	size_t size, old_size;
1003 	mntdata_t *mntdata = MTOD(VTOM(vp));
1004 	mntsnap_t *rsnapp, *isnapp;
1005 	extern timespec_t vfs_mnttab_ctime;
1006 
1007 
1008 	/* AT_MODE, AT_UID and AT_GID are derived from the underlying file. */
1009 	if (mask & AT_MODE|AT_UID|AT_GID) {
1010 		if (error = VOP_GETATTR(mnp->mnt_mountvp, vap, flags, cr, ct))
1011 			return (error);
1012 	}
1013 
1014 	/*
1015 	 * There are some minor subtleties in the determination of
1016 	 * /etc/mnttab's size and mtime. We wish to avoid any condition in
1017 	 * which, in the vicinity of a change to the in-kernel mnttab, we
1018 	 * return an old value for one but a new value for the other. We cannot
1019 	 * simply hold vfslist for the entire calculation because we might need
1020 	 * to call mntfs_snapshot(), which calls vfs_list_read_lock().
1021 	 */
1022 	if (mask & AT_SIZE|AT_NBLOCKS) {
1023 		rw_enter(&mnp->mnt_contents, RW_WRITER);
1024 
1025 		vfs_list_read_lock();
1026 		vfs_mnttab_modtime(&mtime);
1027 		if (mnp->mnt_flags & MNT_SHOWHIDDEN) {
1028 			old_mtime = mntdata->mnt_hidden_mtime;
1029 			old_size = mntdata->mnt_hidden_size;
1030 		} else {
1031 			old_mtime = mntdata->mnt_mtime;
1032 			old_size = mntdata->mnt_size;
1033 		}
1034 		vfs_list_unlock();
1035 
1036 		rsnapp = &mnp->mnt_read;
1037 		isnapp = &mnp->mnt_ioctl;
1038 		if (rsnapp->mnts_nmnts || isnapp->mnts_nmnts) {
1039 			/*
1040 			 * The mntnode already has at least one snapshot from
1041 			 * which to take the size; the user will understand from
1042 			 * mnttab(5) that the current size of the in-kernel
1043 			 * mnttab is irrelevant.
1044 			 */
1045 			size = rsnapp->mnts_nmnts ? rsnapp->mnts_text_size :
1046 			    isnapp->mnts_text_size;
1047 		} else if (mntfs_newest(&mtime, &old_mtime) == MNTFS_NEITHER) {
1048 			/*
1049 			 * There is no existing valid snapshot but the in-kernel
1050 			 * mnttab has not changed since the time that the last
1051 			 * one was generated. Use the old file size; note that
1052 			 * it is guaranteed to be consistent with mtime, which
1053 			 * may be returned to the user later.
1054 			 */
1055 			size = old_size;
1056 		} else {
1057 			/*
1058 			 * There is no snapshot and the in-kernel mnttab has
1059 			 * changed since the last one was created. We generate a
1060 			 * new snapshot which we use for not only the size but
1061 			 * also the mtime, thereby ensuring that the two are
1062 			 * consistent.
1063 			 */
1064 			mntfs_snapshot(mnp, rsnapp);
1065 			size = rsnapp->mnts_text_size;
1066 			mtime = rsnapp->mnts_last_mtime;
1067 			mntfs_freesnap(mnp, rsnapp);
1068 		}
1069 
1070 		rw_exit(&mnp->mnt_contents);
1071 	} else if (mask & AT_ATIME|AT_MTIME) {
1072 		vfs_list_read_lock();
1073 		vfs_mnttab_modtime(&mtime);
1074 		vfs_list_unlock();
1075 	}
1076 
1077 	/* Always look like a regular file. */
1078 	if (mask & AT_TYPE)
1079 		vap->va_type = VREG;
1080 	/* Mode should basically be read only. */
1081 	if (mask & AT_MODE)
1082 		vap->va_mode &= 07444;
1083 	if (mask & AT_FSID)
1084 		vap->va_fsid = vp->v_vfsp->vfs_dev;
1085 	/* Nodeid is always ROOTINO. */
1086 	if (mask & AT_NODEID)
1087 		vap->va_nodeid = (ino64_t)MNTROOTINO;
1088 	/*
1089 	 * Set nlink to the number of open vnodes for mnttab info
1090 	 * plus one for existing.
1091 	 */
1092 	if (mask & AT_NLINK)
1093 		vap->va_nlink = mntdata->mnt_nopen + 1;
1094 	if (mask & AT_SIZE)
1095 		vap->va_size = size;
1096 	if (mask & AT_ATIME)
1097 		vap->va_atime = mtime;
1098 	if (mask & AT_MTIME)
1099 		vap->va_mtime = mtime;
1100 	if (mask & AT_CTIME)
1101 		vap->va_ctime = vfs_mnttab_ctime;
1102 	if (mask & AT_RDEV)
1103 		vap->va_rdev = 0;
1104 	if (mask & AT_BLKSIZE)
1105 		vap->va_blksize = DEV_BSIZE;
1106 	if (mask & AT_NBLOCKS)
1107 		vap->va_nblocks = btod(size);
1108 	if (mask & AT_SEQ)
1109 		vap->va_seq = 0;
1110 
1111 	return (0);
1112 }
1113 
1114 static int
1115 mntaccess(vnode_t *vp, int mode, int flags, cred_t *cr,
1116 	caller_context_t *ct)
1117 {
1118 	mntnode_t *mnp = VTOM(vp);
1119 
1120 	if (mode & (VWRITE|VEXEC))
1121 		return (EROFS);
1122 
1123 	/*
1124 	 * Do access check on the underlying directory vnode.
1125 	 */
1126 	return (VOP_ACCESS(mnp->mnt_mountvp, mode, flags, cr, ct));
1127 }
1128 
1129 
1130 /*
1131  * New /mntfs vnode required; allocate it and fill in most of the fields.
1132  */
1133 static mntnode_t *
1134 mntgetnode(vnode_t *dp)
1135 {
1136 	mntnode_t *mnp;
1137 	vnode_t *vp;
1138 
1139 	mnp = kmem_zalloc(sizeof (mntnode_t), KM_SLEEP);
1140 	mnp->mnt_vnode = vn_alloc(KM_SLEEP);
1141 	mnp->mnt_mountvp = VTOM(dp)->mnt_mountvp;
1142 	rw_init(&mnp->mnt_contents, NULL, RW_DEFAULT, NULL);
1143 	vp = MTOV(mnp);
1144 	vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
1145 	vn_setops(vp, mntvnodeops);
1146 	vp->v_vfsp = dp->v_vfsp;
1147 	vp->v_type = VREG;
1148 	vp->v_data = (caddr_t)mnp;
1149 
1150 	return (mnp);
1151 }
1152 
1153 /*
1154  * Free the storage obtained from mntgetnode().
1155  */
1156 static void
1157 mntfreenode(mntnode_t *mnp)
1158 {
1159 	vnode_t *vp = MTOV(mnp);
1160 
1161 	rw_destroy(&mnp->mnt_contents);
1162 	vn_invalid(vp);
1163 	vn_free(vp);
1164 	kmem_free(mnp, sizeof (*mnp));
1165 }
1166 
1167 
1168 /* ARGSUSED */
1169 static int
1170 mntfsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
1171 {
1172 	return (0);
1173 }
1174 
1175 /* ARGSUSED */
1176 static void
1177 mntinactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
1178 {
1179 	mntnode_t *mnp = VTOM(vp);
1180 
1181 	mntfreenode(mnp);
1182 }
1183 
1184 /*
1185  * lseek(2) is supported only to rewind the file by resetmnttab(3C). Rewinding
1186  * has a special meaning for /etc/mnttab: it forces mntfs to refresh the
1187  * snapshot at the next ioctl().
1188  *
1189  * mnttab(5) explains that "the snapshot...is taken any time a read(2) is
1190  * performed at offset 0". We therefore ignore the read snapshot here.
1191  */
1192 /* ARGSUSED */
1193 static int
1194 mntseek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
1195 {
1196 	mntnode_t *mnp = VTOM(vp);
1197 
1198 	if (*noffp == 0) {
1199 		rw_enter(&mnp->mnt_contents, RW_WRITER);
1200 		mnp->mnt_ioctl.mnts_flags |= MNTS_REWIND;
1201 		rw_exit(&mnp->mnt_contents);
1202 	}
1203 
1204 	return (0);
1205 }
1206 
1207 /*
1208  * Return the answer requested to poll().
1209  * POLLRDBAND will return when the mtime of the mnttab
1210  * information is newer than the latest one read for this open.
1211  */
1212 /* ARGSUSED */
1213 static int
1214 mntpoll(vnode_t *vp, short ev, int any, short *revp, pollhead_t **phpp,
1215 	caller_context_t *ct)
1216 {
1217 	mntnode_t *mnp = VTOM(vp);
1218 	mntsnap_t *snapp;
1219 
1220 	rw_enter(&mnp->mnt_contents, RW_READER);
1221 	if (mntfs_newest(&mnp->mnt_ioctl.mnts_last_mtime,
1222 	    &mnp->mnt_read.mnts_last_mtime) == MNTFS_FIRST)
1223 		snapp = &mnp->mnt_ioctl;
1224 	else
1225 		snapp = &mnp->mnt_read;
1226 
1227 	*revp = 0;
1228 	*phpp = (pollhead_t *)NULL;
1229 	if (ev & POLLIN)
1230 		*revp |= POLLIN;
1231 
1232 	if (ev & POLLRDNORM)
1233 		*revp |= POLLRDNORM;
1234 
1235 	if (ev & POLLRDBAND) {
1236 		vfs_mnttab_poll(&snapp->mnts_last_mtime, phpp);
1237 		if (*phpp == (pollhead_t *)NULL)
1238 			*revp |= POLLRDBAND;
1239 	}
1240 	rw_exit(&mnp->mnt_contents);
1241 
1242 	if (*revp || *phpp != NULL || any) {
1243 		return (0);
1244 	}
1245 	/*
1246 	 * If someone is polling an unsupported poll events (e.g.
1247 	 * POLLOUT, POLLPRI, etc.), just return POLLERR revents.
1248 	 * That way we will ensure that we don't return a 0
1249 	 * revents with a NULL pollhead pointer.
1250 	 */
1251 	*revp = POLLERR;
1252 	return (0);
1253 }
1254 
1255 /*
1256  * mntfs_same_word() returns 1 if two words are the same in the context of
1257  * MNTIOC_GETMNTANY and 0 otherwise.
1258  *
1259  * worda is a memory address that lies somewhere in the buffer bufa; it cannot
1260  * be NULL since this is used to indicate to getmntany(3C) that the user does
1261  * not wish to match a particular field. The text to which worda points is
1262  * supplied by the user; if it is not null-terminated then it cannot match.
1263  *
1264  * Buffer bufb contains a line from /etc/mnttab, in which the fields are
1265  * delimited by tab or new-line characters. offb is the offset of the second
1266  * word within this buffer.
1267  *
1268  * mntfs_same_word() returns 1 if the words are the same and 0 otherwise.
1269  */
1270 int
1271 mntfs_same_word(char *worda, char *bufa, size_t sizea, off_t offb, char *bufb,
1272     size_t sizeb)
1273 {
1274 	char *wordb = bufb + offb;
1275 	int bytes_remaining;
1276 
1277 	ASSERT(worda != NULL);
1278 
1279 	bytes_remaining = MIN(((bufa + sizea) - worda),
1280 	    ((bufb + sizeb) - wordb));
1281 	while (bytes_remaining && *worda == *wordb) {
1282 		worda++;
1283 		wordb++;
1284 		bytes_remaining--;
1285 	}
1286 	if (bytes_remaining &&
1287 	    *worda == '\0' && (*wordb == '\t' || *wordb == '\n'))
1288 		return (1);
1289 	else
1290 		return (0);
1291 }
1292 
1293 /*
1294  * mntfs_special_info_string() returns which, if either, of VBLK or VCHR
1295  * corresponds to a supplied path. If the path is a special device then the
1296  * function optionally sets the major and minor numbers.
1297  */
1298 vtype_t
1299 mntfs_special_info_string(char *path, uint_t *major, uint_t *minor, cred_t *cr)
1300 {
1301 	vattr_t vattr;
1302 	vnode_t *vp;
1303 	vtype_t type;
1304 	int error;
1305 
1306 	if (path == NULL || *path != '/' ||
1307 	    lookupnameat(path + 1, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp, rootdir))
1308 		return (0);
1309 
1310 	vattr.va_mask = AT_TYPE | AT_RDEV;
1311 	error = VOP_GETATTR(vp, &vattr, ATTR_REAL, cr, NULL);
1312 	VN_RELE(vp);
1313 
1314 	if (error == 0 && ((type = vattr.va_type) == VBLK || type == VCHR)) {
1315 		if (major && minor) {
1316 			*major = getmajor(vattr.va_rdev);
1317 			*minor = getminor(vattr.va_rdev);
1318 		}
1319 		return (type);
1320 	} else {
1321 		return (0);
1322 	}
1323 }
1324 
1325 /*
1326  * mntfs_special_info_element() extracts the name of the mounted resource
1327  * for a given element and copies it into a null-terminated string, which it
1328  * then passes to mntfs_special_info_string().
1329  */
1330 vtype_t
1331 mntfs_special_info_element(mntelem_t *elemp, cred_t *cr)
1332 {
1333 	char *newpath;
1334 	vtype_t type;
1335 
1336 	newpath = kmem_alloc(elemp->mnte_text_size, KM_SLEEP);
1337 	bcopy(elemp->mnte_text, newpath, (off_t)(elemp->mnte_tab.mnt_mountp));
1338 	*(newpath + (off_t)elemp->mnte_tab.mnt_mountp - 1) = '\0';
1339 	type = mntfs_special_info_string(newpath, NULL, NULL, cr);
1340 	kmem_free(newpath, elemp->mnte_text_size);
1341 
1342 	return (type);
1343 }
1344 
/*
 * Translate an address inside a user buffer into the address at the same
 * offset inside a kernel copy of that buffer.
 *
 * uaddr is the user address, ubufp the start of the user buffer, kbufp the
 * start of the kernel buffer and bufsize the length of both. Any uaddr that
 * does not lie within [ubufp, ubufp + bufsize) yields NULL; this is also what
 * a NULL uaddr produces when the buffer itself is not at address zero.
 */
char *
mntfs_import_addr(char *uaddr, char *ubufp, char *kbufp, size_t bufsize)
{
	char *uendp = ubufp + bufsize;

	if (uaddr >= ubufp && uaddr < uendp)
		return (kbufp + (uaddr - ubufp));

	return (NULL);
}
1359 
/*
 * 32-bit layouts of the structures exchanged with userland. They exist so
 * that STRUCT_DECL(9F) and its companion macros can be used in
 * mntfs_copyout_elem() and mntioctl(). The members that those functions
 * treat as pointers are declared here as uint32_t.
 */
#ifdef _SYSCALL32_IMPL
/* 32-bit form of struct extmnttab. */
typedef struct extmnttab32 {
	uint32_t	mnt_special;	/* mounted resource */
	uint32_t	mnt_mountp;	/* mount point */
	uint32_t	mnt_fstype;	/* file system type */
	uint32_t	mnt_mntopts;	/* mount options */
	uint32_t	mnt_time;	/* mount time */
	uint_t		mnt_major;	/* major device number */
	uint_t		mnt_minor;	/* minor device number */
} extmnttab32_t;

/* 32-bit form of struct mnttab: as above but without device numbers. */
typedef struct mnttab32 {
	uint32_t	mnt_special;	/* mounted resource */
	uint32_t	mnt_mountp;	/* mount point */
	uint32_t	mnt_fstype;	/* file system type */
	uint32_t	mnt_mntopts;	/* mount options */
	uint32_t	mnt_time;	/* mount time */
} mnttab32_t;

/* 32-bit form of struct mntentbuf. */
struct mntentbuf32 {
	uint32_t	mbuf_emp;	/* address of the user's entry */
	uint_t		mbuf_bufsize;	/* size of the text buffer */
	uint32_t	mbuf_buf;	/* address of the text buffer */
};
#endif
1389 
1390 /*
1391  * mntfs_copyout_element() is common code for the MNTIOC_GETMNTENT,
1392  * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY ioctls. Having identifed the
1393  * database element desired by the user, this function copies out the text and
1394  * the pointers to the relevant userland addresses. It returns 0 on success
1395  * and non-zero otherwise.
1396  */
1397 int
1398 mntfs_copyout_elem(mntelem_t *elemp, struct extmnttab *uemp,
1399     char *ubufp, int cmd, int datamodel)
1400 {
1401 		STRUCT_DECL(extmnttab, ktab);
1402 		char *dbbufp = elemp->mnte_text;
1403 		size_t dbbufsize = elemp->mnte_text_size;
1404 		struct extmnttab *dbtabp = &elemp->mnte_tab;
1405 		size_t ssize;
1406 		char *kbufp;
1407 		int error = 0;
1408 
1409 
1410 		/*
1411 		 * We create a struct extmnttab within the kernel of the size
1412 		 * determined by the user's data model. We then populate its
1413 		 * fields by combining the start address of the text buffer
1414 		 * supplied by the user, ubufp, with the offsets stored for
1415 		 * this database element within dbtabp, a pointer to a struct
1416 		 * extmnttab.
1417 		 *
1418 		 * Note that if the corresponding field is "-" this signifies
1419 		 * no real content, and we set the address to NULL. This does
1420 		 * not apply to mnt_time.
1421 		 */
1422 		STRUCT_INIT(ktab, datamodel);
1423 		STRUCT_FSETP(ktab, mnt_special,
1424 		    MNTFS_REAL_FIELD(dbbufp) ? ubufp : NULL);
1425 		STRUCT_FSETP(ktab, mnt_mountp,
1426 		    MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_mountp) ?
1427 		    ubufp + (off_t)dbtabp->mnt_mountp : NULL);
1428 		STRUCT_FSETP(ktab, mnt_fstype,
1429 		    MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_fstype) ?
1430 		    ubufp + (off_t)dbtabp->mnt_fstype : NULL);
1431 		STRUCT_FSETP(ktab, mnt_mntopts,
1432 		    MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_mntopts) ?
1433 		    ubufp + (off_t)dbtabp->mnt_mntopts : NULL);
1434 		STRUCT_FSETP(ktab, mnt_time,
1435 		    ubufp + (off_t)dbtabp->mnt_time);
1436 		if (cmd == MNTIOC_GETEXTMNTENT) {
1437 			STRUCT_FSETP(ktab, mnt_major, dbtabp->mnt_major);
1438 			STRUCT_FSETP(ktab, mnt_minor, dbtabp->mnt_minor);
1439 			ssize = SIZEOF_STRUCT(extmnttab, datamodel);
1440 		} else {
1441 			ssize = SIZEOF_STRUCT(mnttab, datamodel);
1442 		}
1443 		if (copyout(STRUCT_BUF(ktab), uemp, ssize))
1444 			return (EFAULT);
1445 
1446 		/*
1447 		 * We create a text buffer in the kernel into which we copy the
1448 		 * /etc/mnttab entry for this element. We change the tab and
1449 		 * new-line delimiters to null bytes before copying out the
1450 		 * buffer.
1451 		 */
1452 		kbufp = kmem_alloc(dbbufsize, KM_SLEEP);
1453 		bcopy(elemp->mnte_text, kbufp, dbbufsize);
1454 		*(kbufp + (off_t)dbtabp->mnt_mountp - 1) =
1455 		    *(kbufp + (off_t)dbtabp->mnt_fstype - 1) =
1456 		    *(kbufp + (off_t)dbtabp->mnt_mntopts - 1) =
1457 		    *(kbufp + (off_t)dbtabp->mnt_time - 1) =
1458 		    *(kbufp + dbbufsize - 1) = '\0';
1459 		if (copyout(kbufp, ubufp, dbbufsize))
1460 			error = EFAULT;
1461 
1462 		kmem_free(kbufp, dbbufsize);
1463 		return (error);
1464 }
1465 
1466 /* ARGSUSED */
1467 static int
1468 mntioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
1469     int *rvalp, caller_context_t *ct)
1470 {
1471 	uint_t *up = (uint_t *)arg;
1472 	mntnode_t *mnp = VTOM(vp);
1473 	mntsnap_t *snapp = &mnp->mnt_ioctl;
1474 	int error = 0;
1475 	zone_t *zonep = MTOD(mnp)->mnt_zone_ref.zref_zone;
1476 	krwlock_t *dblockp = &zonep->zone_mntfs_db_lock;
1477 	model_t datamodel = flag & DATAMODEL_MASK;
1478 
1479 	switch (cmd) {
1480 
1481 	case MNTIOC_NMNTS:  		/* get no. of mounted resources */
1482 	{
1483 		rw_enter(&mnp->mnt_contents, RW_READER);
1484 		if (snapp->mnts_nmnts == 0 ||
1485 		    (snapp->mnts_flags & MNTS_REWIND)) {
1486 			if (!rw_tryupgrade(&mnp->mnt_contents)) {
1487 				rw_exit(&mnp->mnt_contents);
1488 				rw_enter(&mnp->mnt_contents, RW_WRITER);
1489 			}
1490 			if (snapp->mnts_nmnts == 0 ||
1491 			    (snapp->mnts_flags & MNTS_REWIND))
1492 				mntfs_snapshot(mnp, snapp);
1493 		}
1494 		rw_exit(&mnp->mnt_contents);
1495 
1496 		if (suword32(up, snapp->mnts_nmnts) != 0)
1497 			error = EFAULT;
1498 		break;
1499 	}
1500 
1501 	case MNTIOC_GETDEVLIST:  	/* get mounted device major/minor nos */
1502 	{
1503 		size_t len;
1504 		uint_t *devlist;
1505 		mntelem_t *elemp;
1506 		int i = 0;
1507 
1508 		rw_enter(&mnp->mnt_contents, RW_READER);
1509 		if (snapp->mnts_nmnts == 0 ||
1510 		    (snapp->mnts_flags & MNTS_REWIND)) {
1511 			if (!rw_tryupgrade(&mnp->mnt_contents)) {
1512 				rw_exit(&mnp->mnt_contents);
1513 				rw_enter(&mnp->mnt_contents, RW_WRITER);
1514 			}
1515 			if (snapp->mnts_nmnts == 0 ||
1516 			    (snapp->mnts_flags & MNTS_REWIND))
1517 				mntfs_snapshot(mnp, snapp);
1518 			rw_downgrade(&mnp->mnt_contents);
1519 		}
1520 
1521 		/* Create a local buffer to hold the device numbers. */
1522 		len = 2 * snapp->mnts_nmnts * sizeof (uint_t);
1523 		devlist = kmem_alloc(len, KM_SLEEP);
1524 
1525 		/*
1526 		 * Walk the database elements for this snapshot and add their
1527 		 * major and minor numbers.
1528 		 */
1529 		rw_enter(dblockp, RW_READER);
1530 		for (elemp = snapp->mnts_first; elemp;
1531 		    elemp = mntfs_get_next_elem(snapp, elemp)) {
1532 				devlist[2 * i] = elemp->mnte_tab.mnt_major;
1533 				devlist[2 * i + 1] = elemp->mnte_tab.mnt_minor;
1534 				i++;
1535 		}
1536 		rw_exit(dblockp);
1537 		ASSERT(i == snapp->mnts_nmnts);
1538 		rw_exit(&mnp->mnt_contents);
1539 
1540 		error = xcopyout(devlist, up, len);
1541 		kmem_free(devlist, len);
1542 		break;
1543 	}
1544 
1545 	case MNTIOC_SETTAG:		/* set tag on mounted file system */
1546 	case MNTIOC_CLRTAG:		/* clear tag on mounted file system */
1547 	{
1548 		struct mnttagdesc *dp = (struct mnttagdesc *)arg;
1549 		STRUCT_DECL(mnttagdesc, tagdesc);
1550 		char *cptr;
1551 		uint32_t major, minor;
1552 		char tagbuf[MAX_MNTOPT_TAG];
1553 		char *pbuf;
1554 		size_t len;
1555 		uint_t start = 0;
1556 		mntdata_t *mntdata = MTOD(mnp);
1557 		zone_t *zone = mntdata->mnt_zone_ref.zref_zone;
1558 
1559 		STRUCT_INIT(tagdesc, flag & DATAMODEL_MASK);
1560 		if (copyin(dp, STRUCT_BUF(tagdesc), STRUCT_SIZE(tagdesc))) {
1561 			error = EFAULT;
1562 			break;
1563 		}
1564 		pbuf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1565 		if (zone != global_zone) {
1566 			(void) strcpy(pbuf, zone->zone_rootpath);
1567 			/* truncate "/" and nul */
1568 			start = zone->zone_rootpathlen - 2;
1569 			ASSERT(pbuf[start] == '/');
1570 		}
1571 		cptr = STRUCT_FGETP(tagdesc, mtd_mntpt);
1572 		error = copyinstr(cptr, pbuf + start, MAXPATHLEN - start, &len);
1573 		if (error) {
1574 			kmem_free(pbuf, MAXPATHLEN);
1575 			break;
1576 		}
1577 		if (start != 0 && pbuf[start] != '/') {
1578 			kmem_free(pbuf, MAXPATHLEN);
1579 			error = EINVAL;
1580 			break;
1581 		}
1582 		cptr = STRUCT_FGETP(tagdesc, mtd_tag);
1583 		if ((error = copyinstr(cptr, tagbuf, MAX_MNTOPT_TAG, &len))) {
1584 			kmem_free(pbuf, MAXPATHLEN);
1585 			break;
1586 		}
1587 		major = STRUCT_FGET(tagdesc, mtd_major);
1588 		minor = STRUCT_FGET(tagdesc, mtd_minor);
1589 		if (cmd == MNTIOC_SETTAG)
1590 			error = vfs_settag(major, minor, pbuf, tagbuf, cr);
1591 		else
1592 			error = vfs_clrtag(major, minor, pbuf, tagbuf, cr);
1593 		kmem_free(pbuf, MAXPATHLEN);
1594 		break;
1595 	}
1596 
1597 	case MNTIOC_SHOWHIDDEN:
1598 	{
1599 		rw_enter(&mnp->mnt_contents, RW_WRITER);
1600 		mnp->mnt_flags |= MNT_SHOWHIDDEN;
1601 		rw_exit(&mnp->mnt_contents);
1602 		break;
1603 	}
1604 
1605 	case MNTIOC_GETMNTANY:
1606 	{
1607 		STRUCT_DECL(mntentbuf, embuf);	/* Our copy of user's embuf */
1608 		STRUCT_DECL(extmnttab, ktab);	/* Out copy of user's emp */
1609 		struct extmnttab *uemp;		/* uaddr of user's emp */
1610 		char *ubufp;			/* uaddr of user's text buf */
1611 		size_t ubufsize;		/* size of the above */
1612 		struct extmnttab preftab;	/* our version of user's emp */
1613 		char *prefbuf;			/* our copy of user's text */
1614 		mntelem_t *elemp;		/* a database element */
1615 		struct extmnttab *dbtabp;	/* element's extmnttab */
1616 		char *dbbufp;			/* element's text buf */
1617 		size_t dbbufsize;		/* size of the above */
1618 		vtype_t type;			/* type, if any, of special */
1619 
1620 
1621 		/*
1622 		 * embuf is a struct embuf within the kernel. We copy into it
1623 		 * the struct embuf supplied by the user.
1624 		 */
1625 		STRUCT_INIT(embuf, datamodel);
1626 		if (copyin((void *) arg, STRUCT_BUF(embuf),
1627 		    STRUCT_SIZE(embuf))) {
1628 			error = EFAULT;
1629 			break;
1630 		}
1631 		uemp = STRUCT_FGETP(embuf, mbuf_emp);
1632 		ubufp = STRUCT_FGETP(embuf, mbuf_buf);
1633 		ubufsize = STRUCT_FGET(embuf, mbuf_bufsize);
1634 
1635 		/*
1636 		 * Check that the text buffer offered by the user is the
1637 		 * agreed size.
1638 		 */
1639 		if (ubufsize != MNT_LINE_MAX) {
1640 			error = EINVAL;
1641 			break;
1642 		}
1643 
1644 		/* Copy the user-supplied entry into a local buffer. */
1645 		prefbuf = kmem_alloc(MNT_LINE_MAX, KM_SLEEP);
1646 		if (copyin(ubufp, prefbuf, MNT_LINE_MAX)) {
1647 			kmem_free(prefbuf, MNT_LINE_MAX);
1648 			error = EFAULT;
1649 			break;
1650 		}
1651 
1652 		/* Ensure that any string within it is null-terminated. */
1653 		*(prefbuf + MNT_LINE_MAX - 1) = 0;
1654 
1655 		/* Copy in the user-supplied mpref */
1656 		STRUCT_INIT(ktab, datamodel);
1657 		if (copyin(uemp, STRUCT_BUF(ktab),
1658 		    SIZEOF_STRUCT(mnttab, datamodel))) {
1659 			kmem_free(prefbuf, MNT_LINE_MAX);
1660 			error = EFAULT;
1661 			break;
1662 		}
1663 
1664 		/*
1665 		 * Copy the members of the user's pref struct into a local
1666 		 * struct. The pointers need to be offset and verified to
1667 		 * ensure that they lie within the bounds of the buffer.
1668 		 */
1669 		preftab.mnt_special = mntfs_import_addr(STRUCT_FGETP(ktab,
1670 		    mnt_special), ubufp, prefbuf, MNT_LINE_MAX);
1671 		preftab.mnt_mountp = mntfs_import_addr(STRUCT_FGETP(ktab,
1672 		    mnt_mountp), ubufp, prefbuf, MNT_LINE_MAX);
1673 		preftab.mnt_fstype = mntfs_import_addr(STRUCT_FGETP(ktab,
1674 		    mnt_fstype), ubufp, prefbuf, MNT_LINE_MAX);
1675 		preftab.mnt_mntopts = mntfs_import_addr(STRUCT_FGETP(ktab,
1676 		    mnt_mntopts), ubufp, prefbuf, MNT_LINE_MAX);
1677 		preftab.mnt_time = mntfs_import_addr(STRUCT_FGETP(ktab,
1678 		    mnt_time), ubufp, prefbuf, MNT_LINE_MAX);
1679 
1680 		/*
1681 		 * If the user specifies a mounted resource that is a special
1682 		 * device then we capture its mode and major and minor numbers;
1683 		 * cf. the block comment below.
1684 		 */
1685 		type = mntfs_special_info_string(preftab.mnt_special,
1686 		    &preftab.mnt_major, &preftab.mnt_minor, cr);
1687 
1688 		rw_enter(&mnp->mnt_contents, RW_WRITER);
1689 		if (snapp->mnts_nmnts == 0 ||
1690 		    (snapp->mnts_flags & MNTS_REWIND))
1691 			mntfs_snapshot(mnp, snapp);
1692 
1693 		/*
1694 		 * This is the core functionality that implements getmntany().
1695 		 * We walk through the mntfs database until we find an element
1696 		 * matching the user's preferences that are contained in
1697 		 * preftab. Typically, this means checking that the text
1698 		 * matches. However, the mounted resource is special: if the
1699 		 * user is looking for a special device then we must find a
1700 		 * database element with the same major and minor numbers and
1701 		 * the same type, i.e. VBLK or VCHR. The type is not recorded
1702 		 * in the element because it cannot be inferred from the vfs_t.
1703 		 * We therefore check the type of suitable candidates via
1704 		 * mntfs_special_info_element(); since this calls into the
1705 		 * underlying file system we make sure to drop the database lock
1706 		 * first.
1707 		 */
1708 		elemp = snapp->mnts_next;
1709 		rw_enter(dblockp, RW_READER);
1710 		for (;;) {
1711 			for (; elemp; elemp = mntfs_get_next_elem(snapp,
1712 			    elemp)) {
1713 				dbtabp = &elemp->mnte_tab;
1714 				dbbufp = elemp->mnte_text;
1715 				dbbufsize = elemp->mnte_text_size;
1716 
1717 				if (((type &&
1718 				    dbtabp->mnt_major == preftab.mnt_major &&
1719 				    dbtabp->mnt_minor == preftab.mnt_minor &&
1720 				    MNTFS_REAL_FIELD(dbbufp)) ||
1721 				    (!type && (!preftab.mnt_special ||
1722 				    mntfs_same_word(preftab.mnt_special,
1723 				    prefbuf, MNT_LINE_MAX, (off_t)0, dbbufp,
1724 				    dbbufsize)))) &&
1725 
1726 				    (!preftab.mnt_mountp || mntfs_same_word(
1727 				    preftab.mnt_mountp, prefbuf, MNT_LINE_MAX,
1728 				    (off_t)dbtabp->mnt_mountp, dbbufp,
1729 				    dbbufsize)) &&
1730 
1731 				    (!preftab.mnt_fstype || mntfs_same_word(
1732 				    preftab.mnt_fstype, prefbuf, MNT_LINE_MAX,
1733 				    (off_t)dbtabp->mnt_fstype, dbbufp,
1734 				    dbbufsize)) &&
1735 
1736 				    (!preftab.mnt_mntopts || mntfs_same_word(
1737 				    preftab.mnt_mntopts, prefbuf, MNT_LINE_MAX,
1738 				    (off_t)dbtabp->mnt_mntopts, dbbufp,
1739 				    dbbufsize)) &&
1740 
1741 				    (!preftab.mnt_time || mntfs_same_word(
1742 				    preftab.mnt_time, prefbuf, MNT_LINE_MAX,
1743 				    (off_t)dbtabp->mnt_time, dbbufp,
1744 				    dbbufsize)))
1745 					break;
1746 			}
1747 			rw_exit(dblockp);
1748 
1749 			if (elemp == NULL || type == 0 ||
1750 			    type == mntfs_special_info_element(elemp, cr))
1751 				break;
1752 
1753 			rw_enter(dblockp, RW_READER);
1754 			elemp = mntfs_get_next_elem(snapp, elemp);
1755 		}
1756 
1757 		kmem_free(prefbuf, MNT_LINE_MAX);
1758 
1759 		/* If we failed to find a match then return EOF. */
1760 		if (elemp == NULL) {
1761 			rw_exit(&mnp->mnt_contents);
1762 			*rvalp = MNTFS_EOF;
1763 			break;
1764 		}
1765 
1766 		/*
1767 		 * Check that the text buffer offered by the user will be large
1768 		 * enough to accommodate the text for this entry.
1769 		 */
1770 		if (elemp->mnte_text_size > MNT_LINE_MAX) {
1771 			rw_exit(&mnp->mnt_contents);
1772 			*rvalp = MNTFS_TOOLONG;
1773 			break;
1774 		}
1775 
1776 		/*
1777 		 * Populate the user's struct mnttab and text buffer using the
1778 		 * element's contents.
1779 		 */
1780 		if (mntfs_copyout_elem(elemp, uemp, ubufp, cmd, datamodel)) {
1781 			error = EFAULT;
1782 		} else {
1783 			rw_enter(dblockp, RW_READER);
1784 			elemp = mntfs_get_next_elem(snapp, elemp);
1785 			rw_exit(dblockp);
1786 			snapp->mnts_next = elemp;
1787 		}
1788 		rw_exit(&mnp->mnt_contents);
1789 		break;
1790 	}
1791 
1792 	case MNTIOC_GETMNTENT:
1793 	case MNTIOC_GETEXTMNTENT:
1794 	{
1795 		STRUCT_DECL(mntentbuf, embuf);	/* Our copy of user's embuf */
1796 		struct extmnttab *uemp;		/* uaddr of user's emp */
1797 		char *ubufp;			/* uaddr of user's text buf */
1798 		size_t ubufsize;		/* size of the above */
1799 		mntelem_t *elemp;		/* a database element */
1800 
1801 
1802 		rw_enter(&mnp->mnt_contents, RW_WRITER);
1803 		if (snapp->mnts_nmnts == 0 ||
1804 		    (snapp->mnts_flags & MNTS_REWIND))
1805 			mntfs_snapshot(mnp, snapp);
1806 		if ((elemp = snapp->mnts_next) == NULL) {
1807 			rw_exit(&mnp->mnt_contents);
1808 			*rvalp = MNTFS_EOF;
1809 			break;
1810 		}
1811 
1812 		/*
1813 		 * embuf is a struct embuf within the kernel. We copy into it
1814 		 * the struct embuf supplied by the user.
1815 		 */
1816 		STRUCT_INIT(embuf, datamodel);
1817 		if (copyin((void *) arg, STRUCT_BUF(embuf),
1818 		    STRUCT_SIZE(embuf))) {
1819 			rw_exit(&mnp->mnt_contents);
1820 			error = EFAULT;
1821 			break;
1822 		}
1823 		uemp = STRUCT_FGETP(embuf, mbuf_emp);
1824 		ubufp = STRUCT_FGETP(embuf, mbuf_buf);
1825 		ubufsize = STRUCT_FGET(embuf, mbuf_bufsize);
1826 
1827 		/*
1828 		 * Check that the text buffer offered by the user will be large
1829 		 * enough to accommodate the text for this entry.
1830 		 */
1831 		if (elemp->mnte_text_size > ubufsize) {
1832 			rw_exit(&mnp->mnt_contents);
1833 			*rvalp = MNTFS_TOOLONG;
1834 			break;
1835 		}
1836 
1837 		/*
1838 		 * Populate the user's struct mnttab and text buffer using the
1839 		 * element's contents.
1840 		 */
1841 		if (mntfs_copyout_elem(elemp, uemp, ubufp, cmd, datamodel)) {
1842 			error = EFAULT;
1843 		} else {
1844 			rw_enter(dblockp, RW_READER);
1845 			elemp = mntfs_get_next_elem(snapp, elemp);
1846 			rw_exit(dblockp);
1847 			snapp->mnts_next = elemp;
1848 		}
1849 		rw_exit(&mnp->mnt_contents);
1850 		break;
1851 	}
1852 
1853 	default:
1854 		error = EINVAL;
1855 		break;
1856 	}
1857 
1858 	return (error);
1859 }
1860 
1861 /*
1862  * mntfs provides a new vnode for each open(2). Two vnodes will represent the
1863  * same instance of /etc/mnttab if they share the same (zone-specific) vfs.
1864  */
1865 /* ARGSUSED */
1866 int
1867 mntcmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
1868 {
1869 	return (vp1 != NULL && vp2 != NULL && vp1->v_vfsp == vp2->v_vfsp);
1870 }
1871 
1872 /*
1873  * /mntfs vnode operations vector
1874  */
1875 const fs_operation_def_t mnt_vnodeops_template[] = {
1876 	VOPNAME_OPEN,		{ .vop_open = mntopen },
1877 	VOPNAME_CLOSE,		{ .vop_close = mntclose },
1878 	VOPNAME_READ,		{ .vop_read = mntread },
1879 	VOPNAME_IOCTL,		{ .vop_ioctl = mntioctl },
1880 	VOPNAME_GETATTR,	{ .vop_getattr = mntgetattr },
1881 	VOPNAME_ACCESS,		{ .vop_access = mntaccess },
1882 	VOPNAME_FSYNC,		{ .vop_fsync = mntfsync },
1883 	VOPNAME_INACTIVE,	{ .vop_inactive = mntinactive },
1884 	VOPNAME_SEEK,		{ .vop_seek = mntseek },
1885 	VOPNAME_POLL,		{ .vop_poll = mntpoll },
1886 	VOPNAME_CMP,		{ .vop_cmp = mntcmp },
1887 	VOPNAME_DISPOSE,	{ .error = fs_error },
1888 	VOPNAME_SHRLOCK,	{ .error = fs_error },
1889 	NULL,			NULL
1890 };
1891