xref: /titanic_41/usr/src/uts/common/fs/mntfs/mntvnops.c (revision 8c06a49046077b6920ed1f299752f78e91f34c3f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/file.h>
29 #include <sys/stat.h>
30 #include <sys/atomic.h>
31 #include <sys/mntio.h>
32 #include <sys/mnttab.h>
33 #include <sys/mount.h>
34 #include <sys/sunddi.h>
35 #include <sys/sysmacros.h>
36 #include <sys/systm.h>
37 #include <sys/vfs.h>
38 #include <sys/vfs_opreg.h>
39 #include <sys/fs/mntdata.h>
40 #include <fs/fs_subr.h>
41 #include <sys/vmsystm.h>
42 #include <vm/seg_vn.h>
43 
/* Node id that mntgetattr() reports for the mnttab file. */
#define	MNTROOTINO	2

/* Allocates the per-open mntnode/vnode pair; defined later in this file. */
static mntnode_t *mntgetnode(vnode_t *);

/* Operations vector that mntgetnode() installs on each per-open vnode. */
vnodeops_t *mntvnodeops;
/* NOTE(review): presumably the ops vector for mntdummyvp; not set here. */
vnodeops_t *mntdummyvnodeops;
/* Vnode that mntdummyreadop() issues its notification read against. */
extern struct vnode *mntdummyvp;
51 
52 /*
53  * Design of kernel mnttab accounting.
54  *
55  * To support whitespace in mount names, we implement an ioctl
56  * (MNTIOC_GETMNTENT) which allows a programmatic interface to the data in
57  * /etc/mnttab.  The libc functions getmntent() and getextmntent() are built
58  * atop this interface.
59  *
60  * To minimize the amount of memory used in the kernel, we keep all the
61  * necessary information in the user's address space.  Large server
62  * configurations can have /etc/mnttab files in excess of 64k.
63  *
64  * To support both vanilla read() calls as well as ioctl() calls, we have two
65  * different snapshots of the kernel data structures, mnt_read and mnt_ioctl.
66  * These snapshots include the base location in user memory, the number of
67  * mounts in the snapshot, and any metadata associated with it.  The metadata is
68  * used only to support the ioctl() interface, and is a series of extmnttab
69  * structures.  When the user issues an ioctl(), we simply copyout a pointer to
70  * that structure, and the rest is handled in userland.
71  */
72 
/*
 * NOTE: The following variable enables the generation of the "dev=xxx"
 * in the option string for a mounted file system.  Really this should
 * be gotten rid of altogether, but for the sake of backwards compatibility
 * we had to leave it in.  It is defined as a 32-bit device number.  This
 * means that when 64-bit device numbers are in use, if either the major or
 * minor part of the device number will not fit in a 16 bit quantity, the
 * "dev=" will be set to NODEV (0x7fffffff).  See PSARC 1999/566 and
 * 1999/131 for details.  The cmpldev() function used to generate the 32-bit
 * device number handles this check and assigns the proper value.
 *
 * The flag is consulted by both mntfs_optsize() and mntfs_optprint(), which
 * must agree on whether the "dev=" option is present.
 */
int mntfs_enabledev = 1;	/* enable old "dev=xxx" option */
85 
86 static int
87 mntfs_devsize(struct vfs *vfsp)
88 {
89 	dev32_t odev;
90 
91 	(void) cmpldev(&odev, vfsp->vfs_dev);
92 	return (snprintf(NULL, 0, "dev=%x", odev));
93 }
94 
95 static int
96 mntfs_devprint(struct vfs *vfsp, char *buf)
97 {
98 	dev32_t odev;
99 
100 	(void) cmpldev(&odev, vfsp->vfs_dev);
101 	return (snprintf(buf, MAX_MNTOPT_STR, "dev=%x", odev));
102 }
103 
104 static int
105 mntfs_optsize(struct vfs *vfsp)
106 {
107 	int i, size = 0;
108 	mntopt_t *mop;
109 
110 	for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
111 		mop = &vfsp->vfs_mntopts.mo_list[i];
112 		if (mop->mo_flags & MO_NODISPLAY)
113 			continue;
114 		if (mop->mo_flags & MO_SET) {
115 			if (size)
116 				size++; /* space for comma */
117 			size += strlen(mop->mo_name);
118 			/*
119 			 * count option value if there is one
120 			 */
121 			if (mop->mo_arg != NULL) {
122 				size += strlen(mop->mo_arg) + 1;
123 			}
124 		}
125 	}
126 	if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
127 		/*
128 		 * Add space for "zone=<zone_name>" if required.
129 		 */
130 		if (size)
131 			size++;	/* space for comma */
132 		size += sizeof ("zone=") - 1;
133 		size += strlen(vfsp->vfs_zone->zone_name);
134 	}
135 	if (mntfs_enabledev) {
136 		if (size != 0)
137 			size++; /* space for comma */
138 		size += mntfs_devsize(vfsp);
139 	}
140 	if (size == 0)
141 		size = strlen("-");
142 	return (size);
143 }
144 
145 static int
146 mntfs_optprint(struct vfs *vfsp, char *buf)
147 {
148 	int i, optinbuf = 0;
149 	mntopt_t *mop;
150 	char *origbuf = buf;
151 
152 	for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
153 		mop = &vfsp->vfs_mntopts.mo_list[i];
154 		if (mop->mo_flags & MO_NODISPLAY)
155 			continue;
156 		if (mop->mo_flags & MO_SET) {
157 			if (optinbuf)
158 				*buf++ = ',';
159 			else
160 				optinbuf = 1;
161 			buf += snprintf(buf, MAX_MNTOPT_STR,
162 				"%s", mop->mo_name);
163 			/*
164 			 * print option value if there is one
165 			 */
166 			if (mop->mo_arg != NULL) {
167 				buf += snprintf(buf, MAX_MNTOPT_STR, "=%s",
168 					mop->mo_arg);
169 			}
170 		}
171 	}
172 	if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
173 		if (optinbuf)
174 			*buf++ = ',';
175 		else
176 			optinbuf = 1;
177 		buf += snprintf(buf, MAX_MNTOPT_STR, "zone=%s",
178 		    vfsp->vfs_zone->zone_name);
179 	}
180 	if (mntfs_enabledev) {
181 		if (optinbuf++)
182 			*buf++ = ',';
183 		buf += mntfs_devprint(vfsp, buf);
184 	}
185 	if (!optinbuf) {
186 		buf += snprintf(buf, MAX_MNTOPT_STR, "-");
187 	}
188 	return (buf - origbuf);
189 }
190 
191 static size_t
192 mntfs_vfs_len(vfs_t *vfsp, zone_t *zone)
193 {
194 	size_t size = 0;
195 	const char *resource, *mntpt;
196 
197 	mntpt = refstr_value(vfsp->vfs_mntpt);
198 	if (mntpt != NULL && mntpt[0] != '\0') {
199 		size += strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
200 	} else {
201 		size += strlen("-") + 1;
202 	}
203 
204 	resource = refstr_value(vfsp->vfs_resource);
205 	if (resource != NULL && resource[0] != '\0') {
206 		if (resource[0] != '/') {
207 			size += strlen(resource) + 1;
208 		} else if (!ZONE_PATH_VISIBLE(resource, zone)) {
209 			/*
210 			 * Same as the zone's view of the mount point.
211 			 */
212 			size += strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
213 		} else {
214 			size += strlen(ZONE_PATH_TRANSLATE(resource, zone)) + 1;
215 		}
216 	} else {
217 		size += strlen("-") + 1;
218 	}
219 	size += strlen(vfssw[vfsp->vfs_fstype].vsw_name) + 1;
220 	size += mntfs_optsize(vfsp);
221 	size += snprintf(NULL, 0, "\t%ld\n", vfsp->vfs_mtime);
222 	return (size);
223 }
224 
225 static void
226 mntfs_zonerootvfs(zone_t *zone, vfs_t *rootvfsp)
227 {
228 	/*
229 	 * Basically copy over the real vfs_t on which the root vnode is
230 	 * located, changing its mountpoint and resource to match those of
231 	 * the zone's rootpath.
232 	 */
233 	*rootvfsp = *zone->zone_rootvp->v_vfsp;
234 	rootvfsp->vfs_mntpt = refstr_alloc(zone->zone_rootpath);
235 	rootvfsp->vfs_resource = rootvfsp->vfs_mntpt;
236 }
237 
238 static size_t
239 mntfs_zone_len(uint_t *nent_ptr, zone_t *zone, int showhidden)
240 {
241 	struct vfs *zonelist;
242 	struct vfs *vfsp;
243 	size_t size = 0;
244 	uint_t cnt = 0;
245 
246 	ASSERT(zone->zone_rootpath != NULL);
247 
248 	/*
249 	 * If the zone has a root entry, it will be the first in the list.  If
250 	 * it doesn't, we conjure one up.
251 	 */
252 	vfsp = zonelist = zone->zone_vfslist;
253 	if (zonelist == NULL ||
254 	    strcmp(refstr_value(vfsp->vfs_mntpt), zone->zone_rootpath) != 0) {
255 		vfs_t tvfs;
256 		/*
257 		 * The root of the zone is not a mount point.  The vfs we want
258 		 * to report is that of the zone's root vnode.
259 		 */
260 		ASSERT(zone != global_zone);
261 		mntfs_zonerootvfs(zone, &tvfs);
262 		size += mntfs_vfs_len(&tvfs, zone);
263 		refstr_rele(tvfs.vfs_mntpt);
264 		cnt++;
265 	}
266 	if (zonelist == NULL)
267 		goto out;
268 	do {
269 		/*
270 		 * Skip mounts that should not show up in mnttab
271 		 */
272 		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
273 			vfsp = vfsp->vfs_zone_next;
274 			continue;
275 		}
276 		cnt++;
277 		size += mntfs_vfs_len(vfsp, zone);
278 		vfsp = vfsp->vfs_zone_next;
279 	} while (vfsp != zonelist);
280 out:
281 	*nent_ptr = cnt;
282 	return (size);
283 }
284 
285 static size_t
286 mntfs_global_len(uint_t *nent_ptr, int showhidden)
287 {
288 	struct vfs *vfsp;
289 	size_t size = 0;
290 	uint_t cnt = 0;
291 
292 	vfsp = rootvfs;
293 	do {
294 		/*
295 		 * Skip mounts that should not show up in mnttab
296 		 */
297 		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
298 			vfsp = vfsp->vfs_next;
299 			continue;
300 		}
301 		cnt++;
302 		size += mntfs_vfs_len(vfsp, global_zone);
303 		vfsp = vfsp->vfs_next;
304 	} while (vfsp != rootvfs);
305 	*nent_ptr = cnt;
306 	return (size);
307 }
308 
309 static void
310 mntfs_vfs_generate(vfs_t *vfsp, zone_t *zone, struct extmnttab *tab,
311     char **basep, int forread)
312 {
313 	const char *resource, *mntpt;
314 	char *cp = *basep;
315 
316 	mntpt = refstr_value(vfsp->vfs_mntpt);
317 	resource = refstr_value(vfsp->vfs_resource);
318 
319 	if (tab)
320 		tab->mnt_special = cp;
321 	if (resource != NULL && resource[0] != '\0') {
322 		if (resource[0] != '/') {
323 			cp += snprintf(cp, MAXPATHLEN, "%s", resource);
324 		} else if (!ZONE_PATH_VISIBLE(resource, zone)) {
325 			/*
326 			 * Use the mount point as the resource.
327 			 */
328 			cp += snprintf(cp, MAXPATHLEN, "%s",
329 			    ZONE_PATH_TRANSLATE(mntpt, zone));
330 		} else {
331 			cp += snprintf(cp, MAXPATHLEN, "%s",
332 			    ZONE_PATH_TRANSLATE(resource, zone));
333 		}
334 	} else {
335 		cp += snprintf(cp, MAXPATHLEN, "-");
336 	}
337 	*cp++ = forread ? '\t' : '\0';
338 
339 	if (tab)
340 		tab->mnt_mountp = cp;
341 	if (mntpt != NULL && mntpt[0] != '\0') {
342 		/*
343 		 * We know the mount point is visible from within the zone,
344 		 * otherwise it wouldn't be on the zone's vfs list.
345 		 */
346 		cp += snprintf(cp, MAXPATHLEN, "%s",
347 		    ZONE_PATH_TRANSLATE(mntpt, zone));
348 	} else {
349 		cp += snprintf(cp, MAXPATHLEN, "-");
350 	}
351 	*cp++ = forread ? '\t' : '\0';
352 
353 	if (tab)
354 		tab->mnt_fstype = cp;
355 	cp += snprintf(cp, MAXPATHLEN, "%s",
356 	    vfssw[vfsp->vfs_fstype].vsw_name);
357 	*cp++ = forread ? '\t' : '\0';
358 
359 	if (tab)
360 		tab->mnt_mntopts = cp;
361 	cp += mntfs_optprint(vfsp, cp);
362 	*cp++ = forread ? '\t' : '\0';
363 
364 	if (tab)
365 		tab->mnt_time = cp;
366 	cp += snprintf(cp, MAX_MNTOPT_STR, "%ld", vfsp->vfs_mtime);
367 	*cp++ = forread ? '\n' : '\0';
368 
369 	if (tab) {
370 		tab->mnt_major = getmajor(vfsp->vfs_dev);
371 		tab->mnt_minor = getminor(vfsp->vfs_dev);
372 	}
373 
374 	*basep = cp;
375 }
376 
377 static void
378 mntfs_zone_generate(zone_t *zone, int showhidden, struct extmnttab *tab,
379     char *basep, int forread)
380 {
381 	vfs_t *zonelist;
382 	vfs_t *vfsp;
383 	char *cp = basep;
384 
385 	/*
386 	 * If the zone has a root entry, it will be the first in the list.  If
387 	 * it doesn't, we conjure one up.
388 	 */
389 	vfsp = zonelist = zone->zone_vfslist;
390 	if (zonelist == NULL ||
391 	    strcmp(refstr_value(vfsp->vfs_mntpt), zone->zone_rootpath) != 0) {
392 		vfs_t tvfs;
393 		/*
394 		 * The root of the zone is not a mount point.  The vfs we want
395 		 * to report is that of the zone's root vnode.
396 		 */
397 		ASSERT(zone != global_zone);
398 		mntfs_zonerootvfs(zone, &tvfs);
399 		mntfs_vfs_generate(&tvfs, zone, tab, &cp, forread);
400 		refstr_rele(tvfs.vfs_mntpt);
401 		if (tab)
402 			tab++;
403 	}
404 	if (zonelist == NULL)
405 		return;
406 	do {
407 		/*
408 		 * Skip mounts that should not show up in mnttab
409 		 */
410 		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
411 			vfsp = vfsp->vfs_zone_next;
412 			continue;
413 		}
414 		mntfs_vfs_generate(vfsp, zone, tab, &cp, forread);
415 		if (tab)
416 			tab++;
417 		vfsp = vfsp->vfs_zone_next;
418 	} while (vfsp != zonelist);
419 }
420 
421 static void
422 mntfs_global_generate(int showhidden, struct extmnttab *tab, char *basep,
423     int forread)
424 {
425 	vfs_t *vfsp;
426 	char *cp = basep;
427 
428 	vfsp = rootvfs;
429 	do {
430 		/*
431 		 * Skip mounts that should not show up in mnttab
432 		 */
433 		if (!showhidden && vfsp->vfs_flag & VFS_NOMNTTAB) {
434 			vfsp = vfsp->vfs_next;
435 			continue;
436 		}
437 		mntfs_vfs_generate(vfsp, global_zone, tab, &cp, forread);
438 		if (tab)
439 			tab++;
440 		vfsp = vfsp->vfs_next;
441 	} while (vfsp != rootvfs);
442 }
443 
444 static char *
445 mntfs_mapin(char *base, size_t size)
446 {
447 	size_t rlen = roundup(size, PAGESIZE);
448 	struct as *as = curproc->p_as;
449 	char *addr;
450 
451 	as_rangelock(as);
452 	map_addr(&addr, rlen, 0, 1, 0);
453 	if (addr == NULL || as_map(as, addr, rlen, segvn_create, zfod_argsp)) {
454 		as_rangeunlock(as);
455 		return (NULL);
456 	}
457 	as_rangeunlock(as);
458 	if (copyout(base, addr, size)) {
459 		(void) as_unmap(as, addr, rlen);
460 		return (NULL);
461 	}
462 	return (addr);
463 }
464 
465 static void
466 mntfs_freesnap(mntsnap_t *snap)
467 {
468 	if (snap->mnts_text != NULL)
469 		(void) as_unmap(curproc->p_as, snap->mnts_text,
470 			roundup(snap->mnts_textsize, PAGESIZE));
471 	snap->mnts_textsize = snap->mnts_count = 0;
472 	if (snap->mnts_metadata != NULL)
473 		(void) as_unmap(curproc->p_as, snap->mnts_metadata,
474 			roundup(snap->mnts_metasize, PAGESIZE));
475 	snap->mnts_metasize = 0;
476 }
477 
#ifdef _SYSCALL32_IMPL

/*
 * Layout of an extmnttab record as handed to 32-bit callers.  The five
 * string fields are held as 32-bit values; mntfs_snapshot() stores user
 * addresses within the mapped text in them.
 */
typedef struct extmnttab32 {
	uint32_t	mnt_special;	/* address of the resource string */
	uint32_t	mnt_mountp;	/* address of the mount point string */
	uint32_t	mnt_fstype;	/* address of the fs type string */
	uint32_t	mnt_mntopts;	/* address of the options string */
	uint32_t	mnt_time;	/* address of the mount time string */
	uint_t		mnt_major;	/* major number of the device */
	uint_t		mnt_minor;	/* minor number of the device */
} extmnttab32_t;

#endif
491 
492 /*
493  * called to generate a dummy read vop call so that
494  * any module monitoring /etc/mnttab for access gets notified.
495  */
496 static void
497 mntdummyreadop()
498 {
499 	struct uio	uio;
500 	struct iovec	iov;
501 	char		tbuf[1];
502 
503 	if (mntdummyvp == NULL)
504 		return;
505 
506 	/*
507 	 * Make a VOP_READ call on the dummy vnode so that any
508 	 * module interested in mnttab getting modified could
509 	 * intercept this vnode and capture the event.
510 	 *
511 	 * Pass a dummy uio struct. Nobody should reference the buffer.
512 	 * We need to pass a valid uio struct pointer to take care of
513 	 * any module intercepting this vnode which could attempt to
514 	 * look at it. Currently only the file events notification
515 	 * module intercepts this vnode.
516 	 */
517 	bzero(&uio, sizeof (uio));
518 	bzero(&iov, sizeof (iov));
519 	iov.iov_base = tbuf;
520 	iov.iov_len = 0;
521 	uio.uio_iov = &iov;
522 	uio.uio_iovcnt = 1;
523 	uio.uio_loffset = 0;
524 	uio.uio_segflg = UIO_SYSSPACE;
525 	uio.uio_resid = 0;
526 	(void) VOP_READ(mntdummyvp, &uio, 0, kcred, NULL);
527 }
528 
/*
 * Snapshot the latest version of the kernel mounted resource information
 *
 * There are two types of snapshots: one destined for reading, and one destined
 * for ioctl().  The difference is that the ioctl() interface is delimited by
 * NUL characters, while the read() interface is delimited by tabs and
 * newlines.
 *
 * mnp is the per-open node whose mnt_read (forread != 0) or mnt_ioctl
 * (forread == 0) snapshot is refreshed.  datamodel selects the layout of
 * the metadata records for ioctl() snapshots.
 *
 * Returns 0 if the existing snapshot is still current or a new one was
 * installed, and ENOMEM if the text or metadata could not be mapped into
 * the caller's address space.
 */
/* ARGSUSED */
static int
mntfs_snapshot(mntnode_t *mnp, int forread, int datamodel)
{
	size_t size;
	timespec_t lastmodt;
	mntdata_t *mntdata = MTOD(mnp);
	zone_t *zone = mntdata->mnt_zone;
	boolean_t global_view = (MTOD(mnp)->mnt_zone == global_zone);
	boolean_t showhidden = ((mnp->mnt_flags & MNT_SHOWHIDDEN) != 0);
	struct extmnttab *metadata_baseaddr;
	char *text_baseaddr;
	int i;
	mntsnap_t *snap;

	if (forread)
		snap = &mnp->mnt_read;
	else
		snap = &mnp->mnt_ioctl;

	/*
	 * The vfs list stays read-locked from the staleness check until the
	 * text has been generated, so the sizing pass and the generation
	 * pass see the same set of mounts.
	 */
	vfs_list_read_lock();
	/*
	 * Check if the mnttab info has changed since the last snapshot
	 */
	vfs_mnttab_modtime(&lastmodt);
	if (snap->mnts_count &&
	    lastmodt.tv_sec == snap->mnts_time.tv_sec &&
	    lastmodt.tv_nsec == snap->mnts_time.tv_nsec) {
		vfs_list_unlock();
		return (0);
	}


	/* Drop the stale snapshot, then size the new one. */
	if (snap->mnts_count != 0)
		mntfs_freesnap(snap);
	if (global_view)
		size = mntfs_global_len(&snap->mnts_count, showhidden);
	else
		size = mntfs_zone_len(&snap->mnts_count, zone, showhidden);
	ASSERT(size != 0);

	/* Only ioctl() snapshots carry an array of extmnttab records. */
	if (!forread)
		metadata_baseaddr = kmem_alloc(
		    snap->mnts_count * sizeof (struct extmnttab), KM_SLEEP);
	else
		metadata_baseaddr = NULL;

	text_baseaddr = kmem_alloc(size, KM_SLEEP);

	if (global_view)
		mntfs_global_generate(showhidden, metadata_baseaddr,
		    text_baseaddr, forread);
	else
		mntfs_zone_generate(zone, showhidden,
		    metadata_baseaddr, text_baseaddr, forread);

	vfs_mnttab_modtime(&snap->mnts_time);
	vfs_list_unlock();

	/*
	 * Copy the text out to a fresh user mapping.  The kernel copy is
	 * freed here; text_baseaddr is used below only as a base value for
	 * offset arithmetic and is never dereferenced again.
	 */
	snap->mnts_text = mntfs_mapin(text_baseaddr, size);
	snap->mnts_textsize = size;
	kmem_free(text_baseaddr, size);

	/*
	 * The pointers in the metadata refer to addresses in the range
	 * [base_addr, base_addr + size].  Now that we have mapped the text into
	 * the user's address space, we have to convert these addresses into the
	 * new (user) range.  We also handle the conversion for 32-bit and
	 * 64-bit applications here.
	 */
	if (!forread) {
		struct extmnttab *tab;
#ifdef _SYSCALL32_IMPL
		struct extmnttab32 *tab32;

		if (datamodel == DATAMODEL_ILP32) {
			/*
			 * Convert in place: tab and tab32 view the same
			 * buffer.  NOTE(review): this relies on each
			 * extmnttab32 field landing at or below the address
			 * of the extmnttab field it is derived from, so no
			 * unread data is overwritten -- confirm against the
			 * struct layouts.
			 */
			tab = (struct extmnttab *)metadata_baseaddr;
			tab32 = (struct extmnttab32 *)metadata_baseaddr;

			for (i = 0; i < snap->mnts_count; i++) {
				tab32[i].mnt_special =
				    (uintptr_t)snap->mnts_text +
				    (tab[i].mnt_special - text_baseaddr);
				tab32[i].mnt_mountp =
				    (uintptr_t)snap->mnts_text +
				    (tab[i].mnt_mountp - text_baseaddr);
				tab32[i].mnt_fstype =
				    (uintptr_t)snap->mnts_text +
				    (tab[i].mnt_fstype - text_baseaddr);
				tab32[i].mnt_mntopts =
				    (uintptr_t)snap->mnts_text +
				    (tab[i].mnt_mntopts - text_baseaddr);
				tab32[i].mnt_time = (uintptr_t)snap->mnts_text +
				    (tab[i].mnt_time - text_baseaddr);
				tab32[i].mnt_major = tab[i].mnt_major;
				tab32[i].mnt_minor = tab[i].mnt_minor;
			}

			snap->mnts_metasize =
			    snap->mnts_count * sizeof (struct extmnttab32);
			snap->mnts_metadata = mntfs_mapin(
			    (char *)metadata_baseaddr,
			    snap->mnts_metasize);

		} else {
#endif
			/* Native layout: rebase each pointer onto the map. */
			tab = (struct extmnttab *)metadata_baseaddr;
			for (i = 0; i < snap->mnts_count; i++) {
				tab[i].mnt_special = snap->mnts_text +
				    (tab[i].mnt_special - text_baseaddr);
				tab[i].mnt_mountp = snap->mnts_text +
				    (tab[i].mnt_mountp - text_baseaddr);
				tab[i].mnt_fstype = snap->mnts_text +
				    (tab[i].mnt_fstype - text_baseaddr);
				tab[i].mnt_mntopts = snap->mnts_text +
				    (tab[i].mnt_mntopts - text_baseaddr);
				tab[i].mnt_time = snap->mnts_text +
				    (tab[i].mnt_time - text_baseaddr);
			}

			snap->mnts_metasize =
			    snap->mnts_count * sizeof (struct extmnttab);
			snap->mnts_metadata = mntfs_mapin(
			    (char *)metadata_baseaddr, snap->mnts_metasize);
#ifdef _SYSCALL32_IMPL
		}
#endif

		/*
		 * Free with the native record size: that is what was
		 * allocated above, regardless of the data model used for
		 * the user copy.
		 */
		kmem_free(metadata_baseaddr,
		    snap->mnts_count * sizeof (struct extmnttab));
	}

	/* Remembered so mntgetattr() can report a size before any read. */
	mntdata->mnt_size = size;

	/* A failed mapin of either region invalidates the whole snapshot. */
	if (snap->mnts_text == NULL ||
	    (!forread && snap->mnts_metadata == NULL)) {
		mntfs_freesnap(snap);
		return (ENOMEM);
	}
	mntdummyreadop();
	return (0);
}
678 
679 /*
680  * Public function to convert vfs_mntopts into a string.
681  * A buffer of sufficient size is allocated, which is returned via bufp,
682  * and whose length is returned via lenp.
683  */
684 void
685 mntfs_getmntopts(struct vfs *vfsp, char **bufp, size_t *lenp)
686 {
687 	size_t len;
688 	char *buf;
689 
690 	vfs_list_read_lock();
691 
692 	len = mntfs_optsize(vfsp) + 1;
693 	buf = kmem_alloc(len, KM_NOSLEEP);
694 	if (buf == NULL) {
695 		*bufp = NULL;
696 		vfs_list_unlock();
697 		return;
698 	}
699 	buf[len - 1] = '\0';
700 	(void) mntfs_optprint(vfsp, buf);
701 	ASSERT(buf[len - 1] == '\0');
702 
703 	vfs_list_unlock();
704 	*bufp = buf;
705 	*lenp = len;
706 }
707 
708 
709 /* ARGSUSED */
710 static int
711 mntopen(vnode_t **vpp, int flag, cred_t *cr)
712 {
713 	vnode_t *vp = *vpp;
714 	mntnode_t *nmnp;
715 
716 	/*
717 	 * Not allowed to open for writing, return error.
718 	 */
719 	if (flag & FWRITE)
720 		return (EPERM);
721 	/*
722 	 * Create a new mnt/vnode for each open, this will give us a handle to
723 	 * hang the snapshot on.
724 	 */
725 	nmnp = mntgetnode(vp);
726 
727 	*vpp = MTOV(nmnp);
728 	atomic_add_32(&MTOD(nmnp)->mnt_nopen, 1);
729 	VN_RELE(vp);
730 	return (0);
731 }
732 
733 /* ARGSUSED */
734 static int
735 mntclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
736 {
737 	mntnode_t *mnp = VTOM(vp);
738 
739 	/* Clean up any locks or shares held by the current process */
740 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
741 	cleanshares(vp, ttoproc(curthread)->p_pid);
742 
743 	if (count > 1)
744 		return (0);
745 	if (vp->v_count == 1) {
746 		mntfs_freesnap(&mnp->mnt_read);
747 		mntfs_freesnap(&mnp->mnt_ioctl);
748 		atomic_add_32(&MTOD(mnp)->mnt_nopen, -1);
749 	}
750 	return (0);
751 }
752 
753 /* ARGSUSED */
754 static int
755 mntread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, caller_context_t *ct)
756 {
757 	int error = 0;
758 	off_t off = uio->uio_offset;
759 	size_t len = uio->uio_resid;
760 	mntnode_t *mnp = VTOM(vp);
761 	char *buf;
762 	mntsnap_t *snap = &mnp->mnt_read;
763 	int datamodel;
764 
765 	if (off == (off_t)0 || snap->mnts_count == 0) {
766 		/*
767 		 * It is assumed that any kernel callers wishing
768 		 * to read mnttab will be using extmnttab entries
769 		 * and not extmnttab32 entries, whether or not
770 		 * the kernel is LP64 or ILP32.  Thus, force the
771 		 * datamodel that mntfs_snapshot uses to be
772 		 * DATAMODEL_LP64.
773 		 */
774 		if (uio->uio_segflg == UIO_SYSSPACE)
775 			datamodel = DATAMODEL_LP64;
776 		else
777 			datamodel = get_udatamodel();
778 		if ((error = mntfs_snapshot(mnp, 1, datamodel)) != 0)
779 			return (error);
780 	}
781 	if ((size_t)(off + len) > snap->mnts_textsize)
782 		len = snap->mnts_textsize - off;
783 
784 	if (off < 0 || len > snap->mnts_textsize)
785 		return (EFAULT);
786 
787 	if (len == 0)
788 		return (0);
789 
790 	/*
791 	 * The mnttab image is stored in the user's address space,
792 	 * so we have to copy it into the kernel from userland,
793 	 * then copy it back out to the specified address.
794 	 */
795 	buf = kmem_alloc(len, KM_SLEEP);
796 	if (copyin(snap->mnts_text + off, buf, len))
797 		error = EFAULT;
798 	else {
799 		error = uiomove(buf, len, UIO_READ, uio);
800 	}
801 	kmem_free(buf, len);
802 	mntdummyreadop();
803 	return (error);
804 }
805 
806 
807 static int
808 mntgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
809 {
810 	mntnode_t *mnp = VTOM(vp);
811 	int error;
812 	vnode_t *rvp;
813 	extern timespec_t vfs_mnttab_ctime;
814 	mntdata_t *mntdata = MTOD(VTOM(vp));
815 	mntsnap_t *snap = mnp->mnt_read.mnts_count ?
816 	    &mnp->mnt_read : &mnp->mnt_ioctl;
817 
818 	/*
819 	 * Return all the attributes.  Should be refined
820 	 * so that it returns only those asked for.
821 	 * Most of this is complete fakery anyway.
822 	 */
823 	rvp = mnp->mnt_mountvp;
824 	/*
825 	 * Attributes are same as underlying file with modifications
826 	 */
827 	if (error = VOP_GETATTR(rvp, vap, flags, cr))
828 		return (error);
829 
830 	/*
831 	 * We always look like a regular file
832 	 */
833 	vap->va_type = VREG;
834 	/*
835 	 * mode should basically be read only
836 	 */
837 	vap->va_mode &= 07444;
838 	vap->va_fsid = vp->v_vfsp->vfs_dev;
839 	vap->va_blksize = DEV_BSIZE;
840 	vap->va_rdev = 0;
841 	vap->va_seq = 0;
842 	/*
843 	 * Set nlink to the number of open vnodes for mnttab info
844 	 * plus one for existing.
845 	 */
846 	vap->va_nlink = mntdata->mnt_nopen + 1;
847 	/*
848 	 * If we haven't taken a snapshot yet, set the
849 	 * size to the size of the latest snapshot.
850 	 */
851 	vap->va_size = snap->mnts_textsize ? snap->mnts_textsize :
852 	    mntdata->mnt_size;
853 	/*
854 	 * Fetch mtime from the vfs mnttab timestamp
855 	 */
856 	vap->va_ctime = vfs_mnttab_ctime;
857 	vfs_list_read_lock();
858 	vfs_mnttab_modtime(&vap->va_mtime);
859 	vap->va_atime = vap->va_mtime;
860 	vfs_list_unlock();
861 	/*
862 	 * Nodeid is always ROOTINO;
863 	 */
864 	vap->va_nodeid = (ino64_t)MNTROOTINO;
865 	vap->va_nblocks = btod(vap->va_size);
866 	return (0);
867 }
868 
869 
870 static int
871 mntaccess(vnode_t *vp, int mode, int flags, cred_t *cr)
872 {
873 	mntnode_t *mnp = VTOM(vp);
874 
875 	if (mode & (VWRITE|VEXEC))
876 		return (EROFS);
877 
878 	/*
879 	 * Do access check on the underlying directory vnode.
880 	 */
881 	return (VOP_ACCESS(mnp->mnt_mountvp, mode, flags, cr));
882 }
883 
884 
885 /*
886  * New /mntfs vnode required; allocate it and fill in most of the fields.
887  */
888 static mntnode_t *
889 mntgetnode(vnode_t *dp)
890 {
891 	mntnode_t *mnp;
892 	vnode_t *vp;
893 
894 	mnp = kmem_zalloc(sizeof (mntnode_t), KM_SLEEP);
895 	mnp->mnt_vnode = vn_alloc(KM_SLEEP);
896 	mnp->mnt_mountvp = VTOM(dp)->mnt_mountvp;
897 	vp = MTOV(mnp);
898 	vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
899 	vn_setops(vp, mntvnodeops);
900 	vp->v_vfsp = dp->v_vfsp;
901 	vp->v_type = VREG;
902 	vp->v_data = (caddr_t)mnp;
903 
904 	return (mnp);
905 }
906 
907 /*
908  * Free the storage obtained from mntgetnode().
909  */
910 static void
911 mntfreenode(mntnode_t *mnp)
912 {
913 	vnode_t *vp = MTOV(mnp);
914 
915 	vn_invalid(vp);
916 	vn_free(vp);
917 	kmem_free(mnp, sizeof (*mnp));
918 }
919 
920 
921 /* ARGSUSED */
922 static int
923 mntfsync(vnode_t *vp, int syncflag, cred_t *cr)
924 {
925 	return (0);
926 }
927 
928 /* ARGSUSED */
929 static void
930 mntinactive(vnode_t *vp, cred_t *cr)
931 {
932 	mntnode_t *mnp = VTOM(vp);
933 
934 	mntfreenode(mnp);
935 }
936 
937 /* ARGSUSED */
938 static int
939 mntseek(vnode_t *vp, offset_t ooff, offset_t *noffp)
940 {
941 	if (*noffp == 0)
942 		VTOM(vp)->mnt_offset = 0;
943 
944 	return (0);
945 }
946 
947 /*
948  * Return the answer requested to poll().
949  * POLLRDBAND will return when the mtime of the mnttab
950  * information is newer than the latest one read for this open.
951  */
952 /* ARGSUSED */
953 static int
954 mntpoll(vnode_t *vp, short ev, int any, short *revp, pollhead_t **phpp)
955 {
956 	mntnode_t *mnp = VTOM(vp);
957 	mntsnap_t *snap = &mnp->mnt_read;
958 
959 	if (mnp->mnt_ioctl.mnts_time.tv_sec > snap->mnts_time.tv_sec ||
960 	    (mnp->mnt_ioctl.mnts_time.tv_sec == snap->mnts_time.tv_sec &&
961 	    mnp->mnt_ioctl.mnts_time.tv_nsec > snap->mnts_time.tv_nsec))
962 		snap = &mnp->mnt_ioctl;
963 
964 	*revp = 0;
965 	*phpp = (pollhead_t *)NULL;
966 	if (ev & POLLIN)
967 		*revp |= POLLIN;
968 
969 	if (ev & POLLRDNORM)
970 		*revp |= POLLRDNORM;
971 
972 	if (ev & POLLRDBAND) {
973 		vfs_mnttab_poll(&snap->mnts_time, phpp);
974 		if (*phpp == (pollhead_t *)NULL)
975 			*revp |= POLLRDBAND;
976 	}
977 	if (*revp || *phpp != NULL || any) {
978 		return (0);
979 	}
980 	/*
981 	 * If someone is polling an unsupported poll events (e.g.
982 	 * POLLOUT, POLLPRI, etc.), just return POLLERR revents.
983 	 * That way we will ensure that we don't return a 0
984 	 * revents with a NULL pollhead pointer.
985 	 */
986 	*revp = POLLERR;
987 	return (0);
988 }
989 /* ARGSUSED */
990 static int
991 mntioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
992 	cred_t *cr, int *rvalp)
993 {
994 	uint_t *up = (uint_t *)arg;
995 	mntnode_t *mnp = VTOM(vp);
996 	mntsnap_t *snap = &mnp->mnt_ioctl;
997 	int error;
998 
999 	error = 0;
1000 	switch (cmd) {
1001 
1002 	case MNTIOC_NMNTS: {		/* get no. of mounted resources */
1003 		if (snap->mnts_count == 0) {
1004 			if ((error =
1005 			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK)) != 0)
1006 				return (error);
1007 		}
1008 		if (suword32(up, snap->mnts_count) != 0)
1009 			error = EFAULT;
1010 		break;
1011 	}
1012 
1013 	case MNTIOC_GETDEVLIST: {	/* get mounted device major/minor nos */
1014 		uint_t *devlist;
1015 		int i;
1016 		size_t len;
1017 
1018 		if (snap->mnts_count == 0) {
1019 			if ((error =
1020 			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK)) != 0)
1021 				return (error);
1022 		}
1023 
1024 		len = 2 * snap->mnts_count * sizeof (uint_t);
1025 		devlist = kmem_alloc(len, KM_SLEEP);
1026 		for (i = 0; i < snap->mnts_count; i++) {
1027 
1028 #ifdef _SYSCALL32_IMPL
1029 			if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) {
1030 				struct extmnttab32 tab;
1031 
1032 				if ((error = xcopyin(snap->mnts_text +
1033 				    i * sizeof (struct extmnttab32), &tab,
1034 				    sizeof (tab))) != 0)
1035 					break;
1036 
1037 				devlist[i*2] = tab.mnt_major;
1038 				devlist[i*2+1] = tab.mnt_minor;
1039 			} else {
1040 #endif
1041 				struct extmnttab tab;
1042 
1043 				if ((error = xcopyin(snap->mnts_text +
1044 				    i * sizeof (struct extmnttab), &tab,
1045 				    sizeof (tab))) != 0)
1046 					break;
1047 
1048 				devlist[i*2] = tab.mnt_major;
1049 				devlist[i*2+1] = tab.mnt_minor;
1050 #ifdef _SYSCALL32_IMPL
1051 			}
1052 #endif
1053 		}
1054 
1055 		if (error == 0)
1056 			error = xcopyout(devlist, up, len);
1057 		kmem_free(devlist, len);
1058 		break;
1059 	}
1060 
1061 	case MNTIOC_SETTAG:		/* set tag on mounted file system */
1062 	case MNTIOC_CLRTAG:		/* clear tag on mounted file system */
1063 	{
1064 		struct mnttagdesc *dp = (struct mnttagdesc *)arg;
1065 		STRUCT_DECL(mnttagdesc, tagdesc);
1066 		char *cptr;
1067 		uint32_t major, minor;
1068 		char tagbuf[MAX_MNTOPT_TAG];
1069 		char *pbuf;
1070 		size_t len;
1071 		uint_t start = 0;
1072 		mntdata_t *mntdata = MTOD(mnp);
1073 		zone_t *zone = mntdata->mnt_zone;
1074 
1075 		STRUCT_INIT(tagdesc, flag & DATAMODEL_MASK);
1076 		if (copyin(dp, STRUCT_BUF(tagdesc), STRUCT_SIZE(tagdesc))) {
1077 			error = EFAULT;
1078 			break;
1079 		}
1080 		pbuf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1081 		if (zone != global_zone) {
1082 			(void) strcpy(pbuf, zone->zone_rootpath);
1083 			/* truncate "/" and nul */
1084 			start = zone->zone_rootpathlen - 2;
1085 			ASSERT(pbuf[start] == '/');
1086 		}
1087 		cptr = STRUCT_FGETP(tagdesc, mtd_mntpt);
1088 		error = copyinstr(cptr, pbuf + start, MAXPATHLEN - start, &len);
1089 		if (error) {
1090 			kmem_free(pbuf, MAXPATHLEN);
1091 			break;
1092 		}
1093 		if (start != 0 && pbuf[start] != '/') {
1094 			kmem_free(pbuf, MAXPATHLEN);
1095 			error = EINVAL;
1096 			break;
1097 		}
1098 		cptr = STRUCT_FGETP(tagdesc, mtd_tag);
1099 		if ((error = copyinstr(cptr, tagbuf, MAX_MNTOPT_TAG, &len))) {
1100 			kmem_free(pbuf, MAXPATHLEN);
1101 			break;
1102 		}
1103 		major = STRUCT_FGET(tagdesc, mtd_major);
1104 		minor = STRUCT_FGET(tagdesc, mtd_minor);
1105 		if (cmd == MNTIOC_SETTAG)
1106 			error = vfs_settag(major, minor, pbuf, tagbuf, cr);
1107 		else
1108 			error = vfs_clrtag(major, minor, pbuf, tagbuf, cr);
1109 		kmem_free(pbuf, MAXPATHLEN);
1110 		break;
1111 	}
1112 
1113 	case MNTIOC_SHOWHIDDEN:
1114 	{
1115 		mutex_enter(&vp->v_lock);
1116 		mnp->mnt_flags |= MNT_SHOWHIDDEN;
1117 		mutex_exit(&vp->v_lock);
1118 		break;
1119 	}
1120 
1121 	case MNTIOC_GETMNTENT:
1122 	{
1123 		size_t idx;
1124 		uintptr_t addr;
1125 
1126 		idx = mnp->mnt_offset;
1127 		if (snap->mnts_count == 0 || idx == 0) {
1128 			if ((error =
1129 			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK)) != 0)
1130 				return (error);
1131 		}
1132 		/*
1133 		 * If the next index is beyond the end of the current mnttab,
1134 		 * return EOF
1135 		 */
1136 		if (idx >= snap->mnts_count) {
1137 			*rvalp = 1;
1138 			return (0);
1139 		}
1140 
1141 #ifdef _SYSCALL32_IMPL
1142 		if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) {
1143 			addr = (uintptr_t)(snap->mnts_metadata + idx *
1144 			    sizeof (struct extmnttab32));
1145 			error = suword32((void *)arg, addr);
1146 		} else {
1147 #endif
1148 			addr = (uintptr_t)(snap->mnts_metadata + idx *
1149 			    sizeof (struct extmnttab));
1150 			error = sulword((void *)arg, addr);
1151 #ifdef _SYSCALL32_IMPL
1152 		}
1153 #endif
1154 
1155 		if (error != 0)
1156 			return (error);
1157 
1158 		mnp->mnt_offset++;
1159 		break;
1160 	}
1161 
1162 	default:
1163 		error = EINVAL;
1164 		break;
1165 	}
1166 
1167 	return (error);
1168 }
1169 
1170 /* ARGSUSED */
1171 static int
1172 mntdummyread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
1173 	caller_context_t *ct)
1174 {
1175 	return (0);
1176 }
1177 
1178 /* ARGSUSED */
1179 static int
1180 mntdummywrite(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
1181 	caller_context_t *ct)
1182 {
1183 	return (0);
1184 }
1185 
1186 
/*
 * /mntfs vnode operations vector
 *
 * Maps each supported vnode operation name to its mntfs implementation in
 * this file.  Dispose and shrlock are explicitly routed to fs_error.  The
 * table is terminated by a NULL/NULL pair.
 */
const fs_operation_def_t mnt_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = mntopen },
	VOPNAME_CLOSE,		{ .vop_close = mntclose },
	VOPNAME_READ,		{ .vop_read = mntread },
	VOPNAME_IOCTL,		{ .vop_ioctl = mntioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = mntgetattr },
	VOPNAME_ACCESS,		{ .vop_access = mntaccess },
	VOPNAME_FSYNC,		{ .vop_fsync = mntfsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = mntinactive },
	VOPNAME_SEEK,		{ .vop_seek = mntseek },
	VOPNAME_POLL,		{ .vop_poll = mntpoll },
	VOPNAME_DISPOSE,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	NULL,			NULL
};
1205 
/*
 * Operations vector template for the dummy vnode: no-op read and write
 * routines from this file, plus fs_vnevent_support for vnode events.
 * NOTE(review): presumably used to build mntdummyvnodeops; that setup is
 * not in this file.
 */
const fs_operation_def_t mnt_dummyvnodeops_template[] = {
	VOPNAME_READ, 		{ .vop_read = mntdummyread },
	VOPNAME_WRITE, 		{ .vop_write = mntdummywrite },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL, NULL
};
1212