xref: /titanic_44/usr/src/uts/common/fs/mntfs/mntvnops.c (revision fe598cdcd847f8359013532d5c691bb6190378c0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/file.h>
29 #include <sys/stat.h>
30 #include <sys/atomic.h>
31 #include <sys/mntio.h>
32 #include <sys/mnttab.h>
33 #include <sys/mount.h>
34 #include <sys/sunddi.h>
35 #include <sys/sysmacros.h>
36 #include <sys/systm.h>
37 #include <sys/vfs.h>
38 #include <sys/vfs_opreg.h>
39 #include <sys/fs/mntdata.h>
40 #include <fs/fs_subr.h>
41 #include <sys/vmsystm.h>
42 #include <vm/seg_vn.h>
43 
44 #define	MNTROOTINO	2
45 
46 static mntnode_t *mntgetnode(vnode_t *);
47 
48 vnodeops_t *mntvnodeops;
49 vnodeops_t *mntdummyvnodeops;
50 extern struct vnode *mntdummyvp;
51 
52 /*
53  * Design of kernel mnttab accounting.
54  *
55  * To support whitespace in mount names, we implement an ioctl
56  * (MNTIOC_GETMNTENT) which allows a programmatic interface to the data in
57  * /etc/mnttab.  The libc functions getmntent() and getextmntent() are built
58  * atop this interface.
59  *
60  * To minimize the amount of memory used in the kernel, we keep all the
61  * necessary information in the user's address space.  Large server
62  * configurations can have /etc/mnttab files in excess of 64k.
63  *
64  * To support both vanilla read() calls as well as ioctl() calls, we have two
65  * different snapshots of the kernel data structures, mnt_read and mnt_ioctl.
66  * These snapshots include the base location in user memory, the number of
67  * mounts in the snapshot, and any metadata associated with it.  The metadata is
68  * used only to support the ioctl() interface, and is a series of extmnttab
69  * structures.  When the user issues an ioctl(), we simply copyout a pointer to
70  * that structure, and the rest is handled in userland.
71  */
72 
/*
 * NOTE: mntfs_enabledev controls whether a "dev=xxx" entry is generated
 * in the option string of every mounted file system.  It really ought to
 * go away, but it is kept for backwards compatibility.  The value printed
 * is a 32-bit device number: when 64-bit device numbers are in use and
 * either the major or the minor part does not fit in 16 bits, "dev=" is
 * set to NODEV (0x7fffffff).  See PSARC 1999/566 and 1999/131 for
 * details.  cmpldev(), which produces the 32-bit device number, performs
 * this check and assigns the proper value.
 */
int mntfs_enabledev = 1;	/* non-zero: emit the old "dev=xxx" option */
85 
86 static int
87 mntfs_devsize(struct vfs *vfsp)
88 {
89 	dev32_t odev;
90 
91 	(void) cmpldev(&odev, vfsp->vfs_dev);
92 	return (snprintf(NULL, 0, "dev=%x", odev));
93 }
94 
95 static int
96 mntfs_devprint(struct vfs *vfsp, char *buf)
97 {
98 	dev32_t odev;
99 
100 	(void) cmpldev(&odev, vfsp->vfs_dev);
101 	return (snprintf(buf, MAX_MNTOPT_STR, "dev=%x", odev));
102 }
103 
104 static int
105 mntfs_optsize(struct vfs *vfsp)
106 {
107 	int i, size = 0;
108 	mntopt_t *mop;
109 
110 	for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
111 		mop = &vfsp->vfs_mntopts.mo_list[i];
112 		if (mop->mo_flags & MO_NODISPLAY)
113 			continue;
114 		if (mop->mo_flags & MO_SET) {
115 			if (size)
116 				size++; /* space for comma */
117 			size += strlen(mop->mo_name);
118 			/*
119 			 * count option value if there is one
120 			 */
121 			if (mop->mo_arg != NULL) {
122 				size += strlen(mop->mo_arg) + 1;
123 			}
124 		}
125 	}
126 	if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
127 		/*
128 		 * Add space for "zone=<zone_name>" if required.
129 		 */
130 		if (size)
131 			size++;	/* space for comma */
132 		size += sizeof ("zone=") - 1;
133 		size += strlen(vfsp->vfs_zone->zone_name);
134 	}
135 	if (mntfs_enabledev) {
136 		if (size != 0)
137 			size++; /* space for comma */
138 		size += mntfs_devsize(vfsp);
139 	}
140 	if (size == 0)
141 		size = strlen("-");
142 	return (size);
143 }
144 
145 static int
146 mntfs_optprint(struct vfs *vfsp, char *buf)
147 {
148 	int i, optinbuf = 0;
149 	mntopt_t *mop;
150 	char *origbuf = buf;
151 
152 	for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
153 		mop = &vfsp->vfs_mntopts.mo_list[i];
154 		if (mop->mo_flags & MO_NODISPLAY)
155 			continue;
156 		if (mop->mo_flags & MO_SET) {
157 			if (optinbuf)
158 				*buf++ = ',';
159 			else
160 				optinbuf = 1;
161 			buf += snprintf(buf, MAX_MNTOPT_STR,
162 				"%s", mop->mo_name);
163 			/*
164 			 * print option value if there is one
165 			 */
166 			if (mop->mo_arg != NULL) {
167 				buf += snprintf(buf, MAX_MNTOPT_STR, "=%s",
168 					mop->mo_arg);
169 			}
170 		}
171 	}
172 	if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
173 		if (optinbuf)
174 			*buf++ = ',';
175 		else
176 			optinbuf = 1;
177 		buf += snprintf(buf, MAX_MNTOPT_STR, "zone=%s",
178 		    vfsp->vfs_zone->zone_name);
179 	}
180 	if (mntfs_enabledev) {
181 		if (optinbuf++)
182 			*buf++ = ',';
183 		buf += mntfs_devprint(vfsp, buf);
184 	}
185 	if (!optinbuf) {
186 		buf += snprintf(buf, MAX_MNTOPT_STR, "-");
187 	}
188 	return (buf - origbuf);
189 }
190 
191 static size_t
192 mntfs_vfs_len(vfs_t *vfsp, zone_t *zone)
193 {
194 	size_t size = 0;
195 	const char *resource, *mntpt;
196 
197 	mntpt = refstr_value(vfsp->vfs_mntpt);
198 	if (mntpt != NULL && mntpt[0] != '\0') {
199 		size += strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
200 	} else {
201 		size += strlen("-") + 1;
202 	}
203 
204 	resource = refstr_value(vfsp->vfs_resource);
205 	if (resource != NULL && resource[0] != '\0') {
206 		if (resource[0] != '/') {
207 			size += strlen(resource) + 1;
208 		} else if (!ZONE_PATH_VISIBLE(resource, zone)) {
209 			/*
210 			 * Same as the zone's view of the mount point.
211 			 */
212 			size += strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
213 		} else {
214 			size += strlen(ZONE_PATH_TRANSLATE(resource, zone)) + 1;
215 		}
216 	} else {
217 		size += strlen("-") + 1;
218 	}
219 	size += strlen(vfssw[vfsp->vfs_fstype].vsw_name) + 1;
220 	size += mntfs_optsize(vfsp);
221 	size += snprintf(NULL, 0, "\t%ld\n", vfsp->vfs_mtime);
222 	return (size);
223 }
224 
225 static void
226 mntfs_zonerootvfs(zone_t *zone, vfs_t *rootvfsp)
227 {
228 	/*
229 	 * Basically copy over the real vfs_t on which the root vnode is
230 	 * located, changing its mountpoint and resource to match those of
231 	 * the zone's rootpath.
232 	 */
233 	*rootvfsp = *zone->zone_rootvp->v_vfsp;
234 	rootvfsp->vfs_mntpt = refstr_alloc(zone->zone_rootpath);
235 	rootvfsp->vfs_resource = rootvfsp->vfs_mntpt;
236 }
237 
238 static size_t
239 mntfs_zone_len(uint_t *nent_ptr, zone_t *zone, int showhidden)
240 {
241 	struct vfs *zonelist;
242 	struct vfs *vfsp;
243 	size_t size = 0;
244 	uint_t cnt = 0;
245 
246 	ASSERT(zone->zone_rootpath != NULL);
247 
248 	/*
249 	 * If the zone has a root entry, it will be the first in the list.  If
250 	 * it doesn't, we conjure one up.
251 	 */
252 	vfsp = zonelist = zone->zone_vfslist;
253 	if (zonelist == NULL ||
254 	    strcmp(refstr_value(vfsp->vfs_mntpt), zone->zone_rootpath) != 0) {
255 		vfs_t tvfs;
256 		/*
257 		 * The root of the zone is not a mount point.  The vfs we want
258 		 * to report is that of the zone's root vnode.
259 		 */
260 		ASSERT(zone != global_zone);
261 		mntfs_zonerootvfs(zone, &tvfs);
262 		size += mntfs_vfs_len(&tvfs, zone);
263 		refstr_rele(tvfs.vfs_mntpt);
264 		cnt++;
265 	}
266 	if (zonelist == NULL)
267 		goto out;
268 	do {
269 		/*
270 		 * Skip mounts that should not show up in mnttab
271 		 */
272 		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
273 			vfsp = vfsp->vfs_zone_next;
274 			continue;
275 		}
276 		cnt++;
277 		size += mntfs_vfs_len(vfsp, zone);
278 		vfsp = vfsp->vfs_zone_next;
279 	} while (vfsp != zonelist);
280 out:
281 	*nent_ptr = cnt;
282 	return (size);
283 }
284 
285 static size_t
286 mntfs_global_len(uint_t *nent_ptr, int showhidden)
287 {
288 	struct vfs *vfsp;
289 	size_t size = 0;
290 	uint_t cnt = 0;
291 
292 	vfsp = rootvfs;
293 	do {
294 		/*
295 		 * Skip mounts that should not show up in mnttab
296 		 */
297 		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
298 			vfsp = vfsp->vfs_next;
299 			continue;
300 		}
301 		cnt++;
302 		size += mntfs_vfs_len(vfsp, global_zone);
303 		vfsp = vfsp->vfs_next;
304 	} while (vfsp != rootvfs);
305 	*nent_ptr = cnt;
306 	return (size);
307 }
308 
309 static void
310 mntfs_vfs_generate(vfs_t *vfsp, zone_t *zone, struct extmnttab *tab,
311     char **basep, int forread)
312 {
313 	const char *resource, *mntpt;
314 	char *cp = *basep;
315 
316 	mntpt = refstr_value(vfsp->vfs_mntpt);
317 	resource = refstr_value(vfsp->vfs_resource);
318 
319 	if (tab)
320 		tab->mnt_special = cp;
321 	if (resource != NULL && resource[0] != '\0') {
322 		if (resource[0] != '/') {
323 			cp += snprintf(cp, MAXPATHLEN, "%s", resource);
324 		} else if (!ZONE_PATH_VISIBLE(resource, zone)) {
325 			/*
326 			 * Use the mount point as the resource.
327 			 */
328 			cp += snprintf(cp, MAXPATHLEN, "%s",
329 			    ZONE_PATH_TRANSLATE(mntpt, zone));
330 		} else {
331 			cp += snprintf(cp, MAXPATHLEN, "%s",
332 			    ZONE_PATH_TRANSLATE(resource, zone));
333 		}
334 	} else {
335 		cp += snprintf(cp, MAXPATHLEN, "-");
336 	}
337 	*cp++ = forread ? '\t' : '\0';
338 
339 	if (tab)
340 		tab->mnt_mountp = cp;
341 	if (mntpt != NULL && mntpt[0] != '\0') {
342 		/*
343 		 * We know the mount point is visible from within the zone,
344 		 * otherwise it wouldn't be on the zone's vfs list.
345 		 */
346 		cp += snprintf(cp, MAXPATHLEN, "%s",
347 		    ZONE_PATH_TRANSLATE(mntpt, zone));
348 	} else {
349 		cp += snprintf(cp, MAXPATHLEN, "-");
350 	}
351 	*cp++ = forread ? '\t' : '\0';
352 
353 	if (tab)
354 		tab->mnt_fstype = cp;
355 	cp += snprintf(cp, MAXPATHLEN, "%s",
356 	    vfssw[vfsp->vfs_fstype].vsw_name);
357 	*cp++ = forread ? '\t' : '\0';
358 
359 	if (tab)
360 		tab->mnt_mntopts = cp;
361 	cp += mntfs_optprint(vfsp, cp);
362 	*cp++ = forread ? '\t' : '\0';
363 
364 	if (tab)
365 		tab->mnt_time = cp;
366 	cp += snprintf(cp, MAX_MNTOPT_STR, "%ld", vfsp->vfs_mtime);
367 	*cp++ = forread ? '\n' : '\0';
368 
369 	if (tab) {
370 		tab->mnt_major = getmajor(vfsp->vfs_dev);
371 		tab->mnt_minor = getminor(vfsp->vfs_dev);
372 	}
373 
374 	*basep = cp;
375 }
376 
377 static void
378 mntfs_zone_generate(zone_t *zone, int showhidden, struct extmnttab *tab,
379     char *basep, int forread)
380 {
381 	vfs_t *zonelist;
382 	vfs_t *vfsp;
383 	char *cp = basep;
384 
385 	/*
386 	 * If the zone has a root entry, it will be the first in the list.  If
387 	 * it doesn't, we conjure one up.
388 	 */
389 	vfsp = zonelist = zone->zone_vfslist;
390 	if (zonelist == NULL ||
391 	    strcmp(refstr_value(vfsp->vfs_mntpt), zone->zone_rootpath) != 0) {
392 		vfs_t tvfs;
393 		/*
394 		 * The root of the zone is not a mount point.  The vfs we want
395 		 * to report is that of the zone's root vnode.
396 		 */
397 		ASSERT(zone != global_zone);
398 		mntfs_zonerootvfs(zone, &tvfs);
399 		mntfs_vfs_generate(&tvfs, zone, tab, &cp, forread);
400 		refstr_rele(tvfs.vfs_mntpt);
401 		if (tab)
402 			tab++;
403 	}
404 	if (zonelist == NULL)
405 		return;
406 	do {
407 		/*
408 		 * Skip mounts that should not show up in mnttab
409 		 */
410 		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
411 			vfsp = vfsp->vfs_zone_next;
412 			continue;
413 		}
414 		mntfs_vfs_generate(vfsp, zone, tab, &cp, forread);
415 		if (tab)
416 			tab++;
417 		vfsp = vfsp->vfs_zone_next;
418 	} while (vfsp != zonelist);
419 }
420 
421 static void
422 mntfs_global_generate(int showhidden, struct extmnttab *tab, char *basep,
423     int forread)
424 {
425 	vfs_t *vfsp;
426 	char *cp = basep;
427 
428 	vfsp = rootvfs;
429 	do {
430 		/*
431 		 * Skip mounts that should not show up in mnttab
432 		 */
433 		if (!showhidden && vfsp->vfs_flag & VFS_NOMNTTAB) {
434 			vfsp = vfsp->vfs_next;
435 			continue;
436 		}
437 		mntfs_vfs_generate(vfsp, global_zone, tab, &cp, forread);
438 		if (tab)
439 			tab++;
440 		vfsp = vfsp->vfs_next;
441 	} while (vfsp != rootvfs);
442 }
443 
444 static char *
445 mntfs_mapin(char *base, size_t size)
446 {
447 	size_t rlen = roundup(size, PAGESIZE);
448 	struct as *as = curproc->p_as;
449 	char *addr;
450 
451 	as_rangelock(as);
452 	map_addr(&addr, rlen, 0, 1, 0);
453 	if (addr == NULL || as_map(as, addr, rlen, segvn_create, zfod_argsp)) {
454 		as_rangeunlock(as);
455 		return (NULL);
456 	}
457 	as_rangeunlock(as);
458 	if (copyout(base, addr, size)) {
459 		(void) as_unmap(as, addr, rlen);
460 		return (NULL);
461 	}
462 	return (addr);
463 }
464 
465 static void
466 mntfs_freesnap(mntsnap_t *snap)
467 {
468 	if (snap->mnts_text != NULL)
469 		(void) as_unmap(curproc->p_as, snap->mnts_text,
470 			roundup(snap->mnts_textsize, PAGESIZE));
471 	snap->mnts_textsize = snap->mnts_count = 0;
472 	if (snap->mnts_metadata != NULL)
473 		(void) as_unmap(curproc->p_as, snap->mnts_metadata,
474 			roundup(snap->mnts_metasize, PAGESIZE));
475 	snap->mnts_metasize = 0;
476 }
477 
#ifdef _SYSCALL32_IMPL

/*
 * Layout of an extmnttab record as handed to ILP32 consumers.  The five
 * string members hold user addresses narrowed to 32 bits; see the
 * conversion in mntfs_snapshot().
 */
typedef struct extmnttab32 {
	uint32_t	mnt_special;	/* address of resource string */
	uint32_t	mnt_mountp;	/* address of mount point string */
	uint32_t	mnt_fstype;	/* address of fs type string */
	uint32_t	mnt_mntopts;	/* address of option string */
	uint32_t	mnt_time;	/* address of mount time string */
	uint_t		mnt_major;	/* major number of vfs_dev */
	uint_t		mnt_minor;	/* minor number of vfs_dev */
} extmnttab32_t;

#endif
491 
492 /*
493  * called to generate a dummy read vop call so that
494  * any module monitoring /etc/mnttab for access gets notified.
495  */
496 static void
497 mntdummyreadop()
498 {
499 	struct uio	uio;
500 	struct iovec	iov;
501 	char		tbuf[1];
502 
503 	/*
504 	 * Make a VOP_READ call on the dummy vnode so that any
505 	 * module interested in mnttab getting modified could
506 	 * intercept this vnode and capture the event.
507 	 *
508 	 * Pass a dummy uio struct. Nobody should reference the buffer.
509 	 * We need to pass a valid uio struct pointer to take care of
510 	 * any module intercepting this vnode which could attempt to
511 	 * look at it. Currently only the file events notification
512 	 * module intercepts this vnode.
513 	 */
514 	bzero(&uio, sizeof (uio));
515 	bzero(&iov, sizeof (iov));
516 	iov.iov_base = tbuf;
517 	iov.iov_len = 0;
518 	uio.uio_iov = &iov;
519 	uio.uio_iovcnt = 1;
520 	uio.uio_loffset = 0;
521 	uio.uio_segflg = UIO_SYSSPACE;
522 	uio.uio_resid = 0;
523 	(void) VOP_READ(mntdummyvp, &uio, 0, kcred, NULL);
524 }
525 
526 /*
527  * Snapshot the latest version of the kernel mounted resource information
528  *
529  * There are two types of snapshots: one destined for reading, and one destined
530  * for ioctl().  The difference is that the ioctl() interface is delimited by
531  * NULLs, while the read() interface is delimited by tabs and newlines.
532  */
533 /* ARGSUSED */
534 static int
535 mntfs_snapshot(mntnode_t *mnp, int forread, int datamodel)
536 {
537 	size_t size;
538 	timespec_t lastmodt;
539 	mntdata_t *mntdata = MTOD(mnp);
540 	zone_t *zone = mntdata->mnt_zone;
541 	boolean_t global_view = (MTOD(mnp)->mnt_zone == global_zone);
542 	boolean_t showhidden = ((mnp->mnt_flags & MNT_SHOWHIDDEN) != 0);
543 	struct extmnttab *metadata_baseaddr;
544 	char *text_baseaddr;
545 	int i;
546 	mntsnap_t *snap;
547 
548 	if (forread)
549 		snap = &mnp->mnt_read;
550 	else
551 		snap = &mnp->mnt_ioctl;
552 
553 	vfs_list_read_lock();
554 	/*
555 	 * Check if the mnttab info has changed since the last snapshot
556 	 */
557 	vfs_mnttab_modtime(&lastmodt);
558 	if (snap->mnts_count &&
559 	    lastmodt.tv_sec == snap->mnts_time.tv_sec &&
560 	    lastmodt.tv_nsec == snap->mnts_time.tv_nsec) {
561 		vfs_list_unlock();
562 		return (0);
563 	}
564 
565 
566 	if (snap->mnts_count != 0)
567 		mntfs_freesnap(snap);
568 	if (global_view)
569 		size = mntfs_global_len(&snap->mnts_count, showhidden);
570 	else
571 		size = mntfs_zone_len(&snap->mnts_count, zone, showhidden);
572 	ASSERT(size != 0);
573 
574 	if (!forread)
575 		metadata_baseaddr = kmem_alloc(
576 		    snap->mnts_count * sizeof (struct extmnttab), KM_SLEEP);
577 	else
578 		metadata_baseaddr = NULL;
579 
580 	text_baseaddr = kmem_alloc(size, KM_SLEEP);
581 
582 	if (global_view)
583 		mntfs_global_generate(showhidden, metadata_baseaddr,
584 		    text_baseaddr, forread);
585 	else
586 		mntfs_zone_generate(zone, showhidden,
587 		    metadata_baseaddr, text_baseaddr, forread);
588 
589 	vfs_mnttab_modtime(&snap->mnts_time);
590 	vfs_list_unlock();
591 
592 	snap->mnts_text = mntfs_mapin(text_baseaddr, size);
593 	snap->mnts_textsize = size;
594 	kmem_free(text_baseaddr, size);
595 
596 	/*
597 	 * The pointers in the metadata refer to addreesses in the range
598 	 * [base_addr, base_addr + size].  Now that we have mapped the text into
599 	 * the user's address space, we have to convert these addresses into the
600 	 * new (user) range.  We also handle the conversion for 32-bit and
601 	 * 32-bit applications here.
602 	 */
603 	if (!forread) {
604 		struct extmnttab *tab;
605 #ifdef _SYSCALL32_IMPL
606 		struct extmnttab32 *tab32;
607 
608 		if (datamodel == DATAMODEL_ILP32) {
609 			tab = (struct extmnttab *)metadata_baseaddr;
610 			tab32 = (struct extmnttab32 *)metadata_baseaddr;
611 
612 			for (i = 0; i < snap->mnts_count; i++) {
613 				tab32[i].mnt_special =
614 				    (uintptr_t)snap->mnts_text +
615 				    (tab[i].mnt_special - text_baseaddr);
616 				tab32[i].mnt_mountp =
617 				    (uintptr_t)snap->mnts_text +
618 				    (tab[i].mnt_mountp - text_baseaddr);
619 				tab32[i].mnt_fstype =
620 				    (uintptr_t)snap->mnts_text +
621 				    (tab[i].mnt_fstype - text_baseaddr);
622 				tab32[i].mnt_mntopts =
623 				    (uintptr_t)snap->mnts_text +
624 				    (tab[i].mnt_mntopts - text_baseaddr);
625 				tab32[i].mnt_time = (uintptr_t)snap->mnts_text +
626 				    (tab[i].mnt_time - text_baseaddr);
627 				tab32[i].mnt_major = tab[i].mnt_major;
628 				tab32[i].mnt_minor = tab[i].mnt_minor;
629 			}
630 
631 			snap->mnts_metasize =
632 			    snap->mnts_count * sizeof (struct extmnttab32);
633 			snap->mnts_metadata = mntfs_mapin(
634 			    (char *)metadata_baseaddr,
635 			    snap->mnts_metasize);
636 
637 		} else {
638 #endif
639 			tab = (struct extmnttab *)metadata_baseaddr;
640 			for (i = 0; i < snap->mnts_count; i++) {
641 				tab[i].mnt_special = snap->mnts_text +
642 				    (tab[i].mnt_special - text_baseaddr);
643 				tab[i].mnt_mountp = snap->mnts_text +
644 				    (tab[i].mnt_mountp - text_baseaddr);
645 				tab[i].mnt_fstype = snap->mnts_text +
646 				    (tab[i].mnt_fstype - text_baseaddr);
647 				tab[i].mnt_mntopts = snap->mnts_text +
648 				    (tab[i].mnt_mntopts - text_baseaddr);
649 				tab[i].mnt_time = snap->mnts_text +
650 				    (tab[i].mnt_time - text_baseaddr);
651 			}
652 
653 			snap->mnts_metasize =
654 			    snap->mnts_count * sizeof (struct extmnttab);
655 			snap->mnts_metadata = mntfs_mapin(
656 			    (char *)metadata_baseaddr, snap->mnts_metasize);
657 #ifdef _SYSCALL32_IMPL
658 		}
659 #endif
660 
661 		kmem_free(metadata_baseaddr,
662 		    snap->mnts_count * sizeof (struct extmnttab));
663 	}
664 
665 	mntdata->mnt_size = size;
666 
667 	if (snap->mnts_text == NULL ||
668 	    (!forread && snap->mnts_metadata == NULL)) {
669 		mntfs_freesnap(snap);
670 		return (ENOMEM);
671 	}
672 	mntdummyreadop();
673 	return (0);
674 }
675 
676 /*
677  * Public function to convert vfs_mntopts into a string.
678  * A buffer of sufficient size is allocated, which is returned via bufp,
679  * and whose length is returned via lenp.
680  */
681 void
682 mntfs_getmntopts(struct vfs *vfsp, char **bufp, size_t *lenp)
683 {
684 	size_t len;
685 	char *buf;
686 
687 	vfs_list_read_lock();
688 
689 	len = mntfs_optsize(vfsp) + 1;
690 	buf = kmem_alloc(len, KM_NOSLEEP);
691 	if (buf == NULL) {
692 		*bufp = NULL;
693 		vfs_list_unlock();
694 		return;
695 	}
696 	buf[len - 1] = '\0';
697 	(void) mntfs_optprint(vfsp, buf);
698 	ASSERT(buf[len - 1] == '\0');
699 
700 	vfs_list_unlock();
701 	*bufp = buf;
702 	*lenp = len;
703 }
704 
705 
706 /* ARGSUSED */
707 static int
708 mntopen(vnode_t **vpp, int flag, cred_t *cr)
709 {
710 	vnode_t *vp = *vpp;
711 	mntnode_t *nmnp;
712 
713 	/*
714 	 * Not allowed to open for writing, return error.
715 	 */
716 	if (flag & FWRITE)
717 		return (EPERM);
718 	/*
719 	 * Create a new mnt/vnode for each open, this will give us a handle to
720 	 * hang the snapshot on.
721 	 */
722 	nmnp = mntgetnode(vp);
723 
724 	*vpp = MTOV(nmnp);
725 	atomic_add_32(&MTOD(nmnp)->mnt_nopen, 1);
726 	VN_RELE(vp);
727 	return (0);
728 }
729 
730 /* ARGSUSED */
731 static int
732 mntclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
733 {
734 	mntnode_t *mnp = VTOM(vp);
735 
736 	/* Clean up any locks or shares held by the current process */
737 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
738 	cleanshares(vp, ttoproc(curthread)->p_pid);
739 
740 	if (count > 1)
741 		return (0);
742 	if (vp->v_count == 1) {
743 		mntfs_freesnap(&mnp->mnt_read);
744 		mntfs_freesnap(&mnp->mnt_ioctl);
745 		atomic_add_32(&MTOD(mnp)->mnt_nopen, -1);
746 	}
747 	return (0);
748 }
749 
750 /* ARGSUSED */
751 static int
752 mntread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, caller_context_t *ct)
753 {
754 	int error = 0;
755 	off_t off = uio->uio_offset;
756 	size_t len = uio->uio_resid;
757 	mntnode_t *mnp = VTOM(vp);
758 	char *buf;
759 	mntsnap_t *snap = &mnp->mnt_read;
760 	int datamodel;
761 
762 	if (off == (off_t)0 || snap->mnts_count == 0) {
763 		/*
764 		 * It is assumed that any kernel callers wishing
765 		 * to read mnttab will be using extmnttab entries
766 		 * and not extmnttab32 entries, whether or not
767 		 * the kernel is LP64 or ILP32.  Thus, force the
768 		 * datamodel that mntfs_snapshot uses to be
769 		 * DATAMODEL_LP64.
770 		 */
771 		if (uio->uio_segflg == UIO_SYSSPACE)
772 			datamodel = DATAMODEL_LP64;
773 		else
774 			datamodel = get_udatamodel();
775 		if ((error = mntfs_snapshot(mnp, 1, datamodel)) != 0)
776 			return (error);
777 	}
778 	if ((size_t)(off + len) > snap->mnts_textsize)
779 		len = snap->mnts_textsize - off;
780 
781 	if (off < 0 || len > snap->mnts_textsize)
782 		return (EFAULT);
783 
784 	if (len == 0)
785 		return (0);
786 
787 	/*
788 	 * The mnttab image is stored in the user's address space,
789 	 * so we have to copy it into the kernel from userland,
790 	 * then copy it back out to the specified address.
791 	 */
792 	buf = kmem_alloc(len, KM_SLEEP);
793 	if (copyin(snap->mnts_text + off, buf, len))
794 		error = EFAULT;
795 	else {
796 		error = uiomove(buf, len, UIO_READ, uio);
797 	}
798 	kmem_free(buf, len);
799 	mntdummyreadop();
800 	return (error);
801 }
802 
803 
804 static int
805 mntgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
806 {
807 	mntnode_t *mnp = VTOM(vp);
808 	int error;
809 	vnode_t *rvp;
810 	extern timespec_t vfs_mnttab_ctime;
811 	mntdata_t *mntdata = MTOD(VTOM(vp));
812 	mntsnap_t *snap = mnp->mnt_read.mnts_count ?
813 	    &mnp->mnt_read : &mnp->mnt_ioctl;
814 
815 	/*
816 	 * Return all the attributes.  Should be refined
817 	 * so that it returns only those asked for.
818 	 * Most of this is complete fakery anyway.
819 	 */
820 	rvp = mnp->mnt_mountvp;
821 	/*
822 	 * Attributes are same as underlying file with modifications
823 	 */
824 	if (error = VOP_GETATTR(rvp, vap, flags, cr))
825 		return (error);
826 
827 	/*
828 	 * We always look like a regular file
829 	 */
830 	vap->va_type = VREG;
831 	/*
832 	 * mode should basically be read only
833 	 */
834 	vap->va_mode &= 07444;
835 	vap->va_fsid = vp->v_vfsp->vfs_dev;
836 	vap->va_blksize = DEV_BSIZE;
837 	vap->va_rdev = 0;
838 	vap->va_seq = 0;
839 	/*
840 	 * Set nlink to the number of open vnodes for mnttab info
841 	 * plus one for existing.
842 	 */
843 	vap->va_nlink = mntdata->mnt_nopen + 1;
844 	/*
845 	 * If we haven't taken a snapshot yet, set the
846 	 * size to the size of the latest snapshot.
847 	 */
848 	vap->va_size = snap->mnts_textsize ? snap->mnts_textsize :
849 	    mntdata->mnt_size;
850 	/*
851 	 * Fetch mtime from the vfs mnttab timestamp
852 	 */
853 	vap->va_ctime = vfs_mnttab_ctime;
854 	vfs_list_read_lock();
855 	vfs_mnttab_modtime(&vap->va_mtime);
856 	vap->va_atime = vap->va_mtime;
857 	vfs_list_unlock();
858 	/*
859 	 * Nodeid is always ROOTINO;
860 	 */
861 	vap->va_nodeid = (ino64_t)MNTROOTINO;
862 	vap->va_nblocks = btod(vap->va_size);
863 	return (0);
864 }
865 
866 
867 static int
868 mntaccess(vnode_t *vp, int mode, int flags, cred_t *cr)
869 {
870 	mntnode_t *mnp = VTOM(vp);
871 
872 	if (mode & (VWRITE|VEXEC))
873 		return (EROFS);
874 
875 	/*
876 	 * Do access check on the underlying directory vnode.
877 	 */
878 	return (VOP_ACCESS(mnp->mnt_mountvp, mode, flags, cr));
879 }
880 
881 
882 /*
883  * New /mntfs vnode required; allocate it and fill in most of the fields.
884  */
885 static mntnode_t *
886 mntgetnode(vnode_t *dp)
887 {
888 	mntnode_t *mnp;
889 	vnode_t *vp;
890 
891 	mnp = kmem_zalloc(sizeof (mntnode_t), KM_SLEEP);
892 	mnp->mnt_vnode = vn_alloc(KM_SLEEP);
893 	mnp->mnt_mountvp = VTOM(dp)->mnt_mountvp;
894 	vp = MTOV(mnp);
895 	vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
896 	vn_setops(vp, mntvnodeops);
897 	vp->v_vfsp = dp->v_vfsp;
898 	vp->v_type = VREG;
899 	vp->v_data = (caddr_t)mnp;
900 
901 	return (mnp);
902 }
903 
904 /*
905  * Free the storage obtained from mntgetnode().
906  */
907 static void
908 mntfreenode(mntnode_t *mnp)
909 {
910 	vnode_t *vp = MTOV(mnp);
911 
912 	vn_invalid(vp);
913 	vn_free(vp);
914 	kmem_free(mnp, sizeof (*mnp));
915 }
916 
917 
918 /* ARGSUSED */
919 static int
920 mntfsync(vnode_t *vp, int syncflag, cred_t *cr)
921 {
922 	return (0);
923 }
924 
925 /* ARGSUSED */
926 static void
927 mntinactive(vnode_t *vp, cred_t *cr)
928 {
929 	mntnode_t *mnp = VTOM(vp);
930 
931 	mntfreenode(mnp);
932 }
933 
934 /* ARGSUSED */
935 static int
936 mntseek(vnode_t *vp, offset_t ooff, offset_t *noffp)
937 {
938 	if (*noffp == 0)
939 		VTOM(vp)->mnt_offset = 0;
940 
941 	return (0);
942 }
943 
944 /*
945  * Return the answer requested to poll().
946  * POLLRDBAND will return when the mtime of the mnttab
947  * information is newer than the latest one read for this open.
948  */
949 /* ARGSUSED */
950 static int
951 mntpoll(vnode_t *vp, short ev, int any, short *revp, pollhead_t **phpp)
952 {
953 	mntnode_t *mnp = VTOM(vp);
954 	mntsnap_t *snap = &mnp->mnt_read;
955 
956 	if (mnp->mnt_ioctl.mnts_time.tv_sec > snap->mnts_time.tv_sec ||
957 	    (mnp->mnt_ioctl.mnts_time.tv_sec == snap->mnts_time.tv_sec &&
958 	    mnp->mnt_ioctl.mnts_time.tv_nsec > snap->mnts_time.tv_nsec))
959 		snap = &mnp->mnt_ioctl;
960 
961 	*revp = 0;
962 	*phpp = (pollhead_t *)NULL;
963 	if (ev & POLLIN)
964 		*revp |= POLLIN;
965 
966 	if (ev & POLLRDNORM)
967 		*revp |= POLLRDNORM;
968 
969 	if (ev & POLLRDBAND) {
970 		vfs_mnttab_poll(&snap->mnts_time, phpp);
971 		if (*phpp == (pollhead_t *)NULL)
972 			*revp |= POLLRDBAND;
973 	}
974 	if (*revp || *phpp != NULL || any) {
975 		return (0);
976 	}
977 	/*
978 	 * If someone is polling an unsupported poll events (e.g.
979 	 * POLLOUT, POLLPRI, etc.), just return POLLERR revents.
980 	 * That way we will ensure that we don't return a 0
981 	 * revents with a NULL pollhead pointer.
982 	 */
983 	*revp = POLLERR;
984 	return (0);
985 }
986 /* ARGSUSED */
987 static int
988 mntioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
989 	cred_t *cr, int *rvalp)
990 {
991 	uint_t *up = (uint_t *)arg;
992 	mntnode_t *mnp = VTOM(vp);
993 	mntsnap_t *snap = &mnp->mnt_ioctl;
994 	int error;
995 
996 	error = 0;
997 	switch (cmd) {
998 
999 	case MNTIOC_NMNTS: {		/* get no. of mounted resources */
1000 		if (snap->mnts_count == 0) {
1001 			if ((error =
1002 			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK)) != 0)
1003 				return (error);
1004 		}
1005 		if (suword32(up, snap->mnts_count) != 0)
1006 			error = EFAULT;
1007 		break;
1008 	}
1009 
1010 	case MNTIOC_GETDEVLIST: {	/* get mounted device major/minor nos */
1011 		uint_t *devlist;
1012 		int i;
1013 		size_t len;
1014 
1015 		if (snap->mnts_count == 0) {
1016 			if ((error =
1017 			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK)) != 0)
1018 				return (error);
1019 		}
1020 
1021 		len = 2 * snap->mnts_count * sizeof (uint_t);
1022 		devlist = kmem_alloc(len, KM_SLEEP);
1023 		for (i = 0; i < snap->mnts_count; i++) {
1024 
1025 #ifdef _SYSCALL32_IMPL
1026 			if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) {
1027 				struct extmnttab32 tab;
1028 
1029 				if ((error = xcopyin(snap->mnts_text +
1030 				    i * sizeof (struct extmnttab32), &tab,
1031 				    sizeof (tab))) != 0)
1032 					break;
1033 
1034 				devlist[i*2] = tab.mnt_major;
1035 				devlist[i*2+1] = tab.mnt_minor;
1036 			} else {
1037 #endif
1038 				struct extmnttab tab;
1039 
1040 				if ((error = xcopyin(snap->mnts_text +
1041 				    i * sizeof (struct extmnttab), &tab,
1042 				    sizeof (tab))) != 0)
1043 					break;
1044 
1045 				devlist[i*2] = tab.mnt_major;
1046 				devlist[i*2+1] = tab.mnt_minor;
1047 #ifdef _SYSCALL32_IMPL
1048 			}
1049 #endif
1050 		}
1051 
1052 		if (error == 0)
1053 			error = xcopyout(devlist, up, len);
1054 		kmem_free(devlist, len);
1055 		break;
1056 	}
1057 
1058 	case MNTIOC_SETTAG:		/* set tag on mounted file system */
1059 	case MNTIOC_CLRTAG:		/* clear tag on mounted file system */
1060 	{
1061 		struct mnttagdesc *dp = (struct mnttagdesc *)arg;
1062 		STRUCT_DECL(mnttagdesc, tagdesc);
1063 		char *cptr;
1064 		uint32_t major, minor;
1065 		char tagbuf[MAX_MNTOPT_TAG];
1066 		char *pbuf;
1067 		size_t len;
1068 		uint_t start = 0;
1069 		mntdata_t *mntdata = MTOD(mnp);
1070 		zone_t *zone = mntdata->mnt_zone;
1071 
1072 		STRUCT_INIT(tagdesc, flag & DATAMODEL_MASK);
1073 		if (copyin(dp, STRUCT_BUF(tagdesc), STRUCT_SIZE(tagdesc))) {
1074 			error = EFAULT;
1075 			break;
1076 		}
1077 		pbuf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1078 		if (zone != global_zone) {
1079 			(void) strcpy(pbuf, zone->zone_rootpath);
1080 			/* truncate "/" and nul */
1081 			start = zone->zone_rootpathlen - 2;
1082 			ASSERT(pbuf[start] == '/');
1083 		}
1084 		cptr = STRUCT_FGETP(tagdesc, mtd_mntpt);
1085 		error = copyinstr(cptr, pbuf + start, MAXPATHLEN - start, &len);
1086 		if (error) {
1087 			kmem_free(pbuf, MAXPATHLEN);
1088 			break;
1089 		}
1090 		if (start != 0 && pbuf[start] != '/') {
1091 			kmem_free(pbuf, MAXPATHLEN);
1092 			error = EINVAL;
1093 			break;
1094 		}
1095 		cptr = STRUCT_FGETP(tagdesc, mtd_tag);
1096 		if ((error = copyinstr(cptr, tagbuf, MAX_MNTOPT_TAG, &len))) {
1097 			kmem_free(pbuf, MAXPATHLEN);
1098 			break;
1099 		}
1100 		major = STRUCT_FGET(tagdesc, mtd_major);
1101 		minor = STRUCT_FGET(tagdesc, mtd_minor);
1102 		if (cmd == MNTIOC_SETTAG)
1103 			error = vfs_settag(major, minor, pbuf, tagbuf, cr);
1104 		else
1105 			error = vfs_clrtag(major, minor, pbuf, tagbuf, cr);
1106 		kmem_free(pbuf, MAXPATHLEN);
1107 		break;
1108 	}
1109 
1110 	case MNTIOC_SHOWHIDDEN:
1111 	{
1112 		mutex_enter(&vp->v_lock);
1113 		mnp->mnt_flags |= MNT_SHOWHIDDEN;
1114 		mutex_exit(&vp->v_lock);
1115 		break;
1116 	}
1117 
1118 	case MNTIOC_GETMNTENT:
1119 	{
1120 		size_t idx;
1121 		uintptr_t addr;
1122 
1123 		idx = mnp->mnt_offset;
1124 		if (snap->mnts_count == 0 || idx == 0) {
1125 			if ((error =
1126 			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK)) != 0)
1127 				return (error);
1128 		}
1129 		/*
1130 		 * If the next index is beyond the end of the current mnttab,
1131 		 * return EOF
1132 		 */
1133 		if (idx >= snap->mnts_count) {
1134 			*rvalp = 1;
1135 			return (0);
1136 		}
1137 
1138 #ifdef _SYSCALL32_IMPL
1139 		if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) {
1140 			addr = (uintptr_t)(snap->mnts_metadata + idx *
1141 			    sizeof (struct extmnttab32));
1142 			error = suword32((void *)arg, addr);
1143 		} else {
1144 #endif
1145 			addr = (uintptr_t)(snap->mnts_metadata + idx *
1146 			    sizeof (struct extmnttab));
1147 			error = sulword((void *)arg, addr);
1148 #ifdef _SYSCALL32_IMPL
1149 		}
1150 #endif
1151 
1152 		if (error != 0)
1153 			return (error);
1154 
1155 		mnp->mnt_offset++;
1156 		break;
1157 	}
1158 
1159 	default:
1160 		error = EINVAL;
1161 		break;
1162 	}
1163 
1164 	return (error);
1165 }
1166 
1167 /* ARGSUSED */
1168 static int
1169 mntdummyread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
1170 	caller_context_t *ct)
1171 {
1172 	return (0);
1173 }
1174 
1175 /* ARGSUSED */
1176 static int
1177 mntdummywrite(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
1178 	caller_context_t *ct)
1179 {
1180 	return (0);
1181 }
1182 
1183 
1184 /*
1185  * /mntfs vnode operations vector
1186  */
1187 const fs_operation_def_t mnt_vnodeops_template[] = {
1188 	VOPNAME_OPEN,		{ .vop_open = mntopen },
1189 	VOPNAME_CLOSE,		{ .vop_close = mntclose },
1190 	VOPNAME_READ,		{ .vop_read = mntread },
1191 	VOPNAME_IOCTL,		{ .vop_ioctl = mntioctl },
1192 	VOPNAME_GETATTR,	{ .vop_getattr = mntgetattr },
1193 	VOPNAME_ACCESS,		{ .vop_access = mntaccess },
1194 	VOPNAME_FSYNC,		{ .vop_fsync = mntfsync },
1195 	VOPNAME_INACTIVE,	{ .vop_inactive = mntinactive },
1196 	VOPNAME_SEEK,		{ .vop_seek = mntseek },
1197 	VOPNAME_POLL,		{ .vop_poll = mntpoll },
1198 	VOPNAME_DISPOSE,	{ .error = fs_error },
1199 	VOPNAME_SHRLOCK,	{ .error = fs_error },
1200 	NULL,			NULL
1201 };
1202 
1203 const fs_operation_def_t mnt_dummyvnodeops_template[] = {
1204 	VOPNAME_READ, 		{ .vop_read = mntdummyread },
1205 	VOPNAME_WRITE, 		{ .vop_write = mntdummywrite },
1206 	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
1207 	NULL, NULL
1208 };
1209