xref: /titanic_51/usr/src/uts/common/fs/mntfs/mntvnops.c (revision 261a51afbf7133d9f7c89f1388050677f56b7d1a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/file.h>
29 #include <sys/stat.h>
30 #include <sys/atomic.h>
31 #include <sys/mntio.h>
32 #include <sys/mnttab.h>
33 #include <sys/mount.h>
34 #include <sys/sunddi.h>
35 #include <sys/sysmacros.h>
36 #include <sys/systm.h>
37 #include <sys/vfs.h>
38 #include <sys/vfs_opreg.h>
39 #include <sys/fs/mntdata.h>
40 #include <fs/fs_subr.h>
41 #include <sys/vmsystm.h>
42 #include <vm/seg_vn.h>
43 
/* Node id reported for the mnttab file by mntgetattr(). */
#define	MNTROOTINO	2

/* Forward declaration; mntopen() calls this before its definition. */
static mntnode_t *mntgetnode(vnode_t *);

/* Vnode operations vector that mntgetnode() installs on each new vnode. */
vnodeops_t *mntvnodeops;
49 
50 /*
51  * Design of kernel mnttab accounting.
52  *
53  * To support whitespace in mount names, we implement an ioctl
54  * (MNTIOC_GETMNTENT) which allows a programmatic interface to the data in
55  * /etc/mnttab.  The libc functions getmntent() and getextmntent() are built
56  * atop this interface.
57  *
58  * To minimize the amount of memory used in the kernel, we keep all the
59  * necessary information in the user's address space.  Large server
60  * configurations can have /etc/mnttab files in excess of 64k.
61  *
62  * To support both vanilla read() calls as well as ioctl() calls, we have two
63  * different snapshots of the kernel data structures, mnt_read and mnt_ioctl.
64  * These snapshots include the base location in user memory, the number of
65  * mounts in the snapshot, and any metadata associated with it.  The metadata is
66  * used only to support the ioctl() interface, and is a series of extmnttab
67  * structures.  When the user issues an ioctl(), we simply copyout a pointer to
68  * that structure, and the rest is handled in userland.
69  */
70 
/*
 * NOTE: The following variable enables the generation of the "dev=xxx"
 * in the option string for a mounted file system.  Really this should
 * be gotten rid of altogether, but for the sake of backwards compatibility
 * we had to leave it in.  It is defined as a 32-bit device number.  This
 * means that when 64-bit device numbers are in use, if either the major or
 * minor part of the device number will not fit in a 16 bit quantity, the
 * "dev=" will be set to NODEV (0x7fffffff).  See PSARC 1999/566 and
 * 1999/131 for details.  The cmpldev() function used to generate the 32-bit
 * device number handles this check and assigns the proper value.
 *
 * The flag is consulted by both mntfs_optsize() and mntfs_optprint(), which
 * must agree on whether the "dev=" option is present.
 */
int mntfs_enabledev = 1;	/* enable old "dev=xxx" option */
83 
84 static int
85 mntfs_devsize(struct vfs *vfsp)
86 {
87 	dev32_t odev;
88 
89 	(void) cmpldev(&odev, vfsp->vfs_dev);
90 	return (snprintf(NULL, 0, "dev=%x", odev));
91 }
92 
93 static int
94 mntfs_devprint(struct vfs *vfsp, char *buf)
95 {
96 	dev32_t odev;
97 
98 	(void) cmpldev(&odev, vfsp->vfs_dev);
99 	return (snprintf(buf, MAX_MNTOPT_STR, "dev=%x", odev));
100 }
101 
102 static int
103 mntfs_optsize(struct vfs *vfsp)
104 {
105 	int i, size = 0;
106 	mntopt_t *mop;
107 
108 	for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
109 		mop = &vfsp->vfs_mntopts.mo_list[i];
110 		if (mop->mo_flags & MO_NODISPLAY)
111 			continue;
112 		if (mop->mo_flags & MO_SET) {
113 			if (size)
114 				size++; /* space for comma */
115 			size += strlen(mop->mo_name);
116 			/*
117 			 * count option value if there is one
118 			 */
119 			if (mop->mo_arg != NULL) {
120 				size += strlen(mop->mo_arg) + 1;
121 			}
122 		}
123 	}
124 	if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
125 		/*
126 		 * Add space for "zone=<zone_name>" if required.
127 		 */
128 		if (size)
129 			size++;	/* space for comma */
130 		size += sizeof ("zone=") - 1;
131 		size += strlen(vfsp->vfs_zone->zone_name);
132 	}
133 	if (mntfs_enabledev) {
134 		if (size != 0)
135 			size++; /* space for comma */
136 		size += mntfs_devsize(vfsp);
137 	}
138 	if (size == 0)
139 		size = strlen("-");
140 	return (size);
141 }
142 
143 static int
144 mntfs_optprint(struct vfs *vfsp, char *buf)
145 {
146 	int i, optinbuf = 0;
147 	mntopt_t *mop;
148 	char *origbuf = buf;
149 
150 	for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
151 		mop = &vfsp->vfs_mntopts.mo_list[i];
152 		if (mop->mo_flags & MO_NODISPLAY)
153 			continue;
154 		if (mop->mo_flags & MO_SET) {
155 			if (optinbuf)
156 				*buf++ = ',';
157 			else
158 				optinbuf = 1;
159 			buf += snprintf(buf, MAX_MNTOPT_STR,
160 				"%s", mop->mo_name);
161 			/*
162 			 * print option value if there is one
163 			 */
164 			if (mop->mo_arg != NULL) {
165 				buf += snprintf(buf, MAX_MNTOPT_STR, "=%s",
166 					mop->mo_arg);
167 			}
168 		}
169 	}
170 	if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
171 		if (optinbuf)
172 			*buf++ = ',';
173 		else
174 			optinbuf = 1;
175 		buf += snprintf(buf, MAX_MNTOPT_STR, "zone=%s",
176 		    vfsp->vfs_zone->zone_name);
177 	}
178 	if (mntfs_enabledev) {
179 		if (optinbuf++)
180 			*buf++ = ',';
181 		buf += mntfs_devprint(vfsp, buf);
182 	}
183 	if (!optinbuf) {
184 		buf += snprintf(buf, MAX_MNTOPT_STR, "-");
185 	}
186 	return (buf - origbuf);
187 }
188 
189 static size_t
190 mntfs_vfs_len(vfs_t *vfsp, zone_t *zone)
191 {
192 	size_t size = 0;
193 	const char *resource, *mntpt;
194 
195 	mntpt = refstr_value(vfsp->vfs_mntpt);
196 	if (mntpt != NULL && mntpt[0] != '\0') {
197 		size += strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
198 	} else {
199 		size += strlen("-") + 1;
200 	}
201 
202 	resource = refstr_value(vfsp->vfs_resource);
203 	if (resource != NULL && resource[0] != '\0') {
204 		if (resource[0] != '/') {
205 			size += strlen(resource) + 1;
206 		} else if (!ZONE_PATH_VISIBLE(resource, zone)) {
207 			/*
208 			 * Same as the zone's view of the mount point.
209 			 */
210 			size += strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
211 		} else {
212 			size += strlen(ZONE_PATH_TRANSLATE(resource, zone)) + 1;
213 		}
214 	} else {
215 		size += strlen("-") + 1;
216 	}
217 	size += strlen(vfssw[vfsp->vfs_fstype].vsw_name) + 1;
218 	size += mntfs_optsize(vfsp);
219 	size += snprintf(NULL, 0, "\t%ld\n", vfsp->vfs_mtime);
220 	return (size);
221 }
222 
223 static void
224 mntfs_zonerootvfs(zone_t *zone, vfs_t *rootvfsp)
225 {
226 	/*
227 	 * Basically copy over the real vfs_t on which the root vnode is
228 	 * located, changing its mountpoint and resource to match those of
229 	 * the zone's rootpath.
230 	 */
231 	*rootvfsp = *zone->zone_rootvp->v_vfsp;
232 	rootvfsp->vfs_mntpt = refstr_alloc(zone->zone_rootpath);
233 	rootvfsp->vfs_resource = rootvfsp->vfs_mntpt;
234 }
235 
236 static size_t
237 mntfs_zone_len(uint_t *nent_ptr, zone_t *zone, int showhidden)
238 {
239 	struct vfs *zonelist;
240 	struct vfs *vfsp;
241 	size_t size = 0;
242 	uint_t cnt = 0;
243 
244 	ASSERT(zone->zone_rootpath != NULL);
245 
246 	/*
247 	 * If the zone has a root entry, it will be the first in the list.  If
248 	 * it doesn't, we conjure one up.
249 	 */
250 	vfsp = zonelist = zone->zone_vfslist;
251 	if (zonelist == NULL ||
252 	    strcmp(refstr_value(vfsp->vfs_mntpt), zone->zone_rootpath) != 0) {
253 		vfs_t tvfs;
254 		/*
255 		 * The root of the zone is not a mount point.  The vfs we want
256 		 * to report is that of the zone's root vnode.
257 		 */
258 		ASSERT(zone != global_zone);
259 		mntfs_zonerootvfs(zone, &tvfs);
260 		size += mntfs_vfs_len(&tvfs, zone);
261 		refstr_rele(tvfs.vfs_mntpt);
262 		cnt++;
263 	}
264 	if (zonelist == NULL)
265 		goto out;
266 	do {
267 		/*
268 		 * Skip mounts that should not show up in mnttab
269 		 */
270 		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
271 			vfsp = vfsp->vfs_zone_next;
272 			continue;
273 		}
274 		cnt++;
275 		size += mntfs_vfs_len(vfsp, zone);
276 		vfsp = vfsp->vfs_zone_next;
277 	} while (vfsp != zonelist);
278 out:
279 	*nent_ptr = cnt;
280 	return (size);
281 }
282 
283 static size_t
284 mntfs_global_len(uint_t *nent_ptr, int showhidden)
285 {
286 	struct vfs *vfsp;
287 	size_t size = 0;
288 	uint_t cnt = 0;
289 
290 	vfsp = rootvfs;
291 	do {
292 		/*
293 		 * Skip mounts that should not show up in mnttab
294 		 */
295 		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
296 			vfsp = vfsp->vfs_next;
297 			continue;
298 		}
299 		cnt++;
300 		size += mntfs_vfs_len(vfsp, global_zone);
301 		vfsp = vfsp->vfs_next;
302 	} while (vfsp != rootvfs);
303 	*nent_ptr = cnt;
304 	return (size);
305 }
306 
307 static void
308 mntfs_vfs_generate(vfs_t *vfsp, zone_t *zone, struct extmnttab *tab,
309     char **basep, int forread)
310 {
311 	const char *resource, *mntpt;
312 	char *cp = *basep;
313 
314 	mntpt = refstr_value(vfsp->vfs_mntpt);
315 	resource = refstr_value(vfsp->vfs_resource);
316 
317 	if (tab)
318 		tab->mnt_special = cp;
319 	if (resource != NULL && resource[0] != '\0') {
320 		if (resource[0] != '/') {
321 			cp += snprintf(cp, MAXPATHLEN, "%s", resource);
322 		} else if (!ZONE_PATH_VISIBLE(resource, zone)) {
323 			/*
324 			 * Use the mount point as the resource.
325 			 */
326 			cp += snprintf(cp, MAXPATHLEN, "%s",
327 			    ZONE_PATH_TRANSLATE(mntpt, zone));
328 		} else {
329 			cp += snprintf(cp, MAXPATHLEN, "%s",
330 			    ZONE_PATH_TRANSLATE(resource, zone));
331 		}
332 	} else {
333 		cp += snprintf(cp, MAXPATHLEN, "-");
334 	}
335 	*cp++ = forread ? '\t' : '\0';
336 
337 	if (tab)
338 		tab->mnt_mountp = cp;
339 	if (mntpt != NULL && mntpt[0] != '\0') {
340 		/*
341 		 * We know the mount point is visible from within the zone,
342 		 * otherwise it wouldn't be on the zone's vfs list.
343 		 */
344 		cp += snprintf(cp, MAXPATHLEN, "%s",
345 		    ZONE_PATH_TRANSLATE(mntpt, zone));
346 	} else {
347 		cp += snprintf(cp, MAXPATHLEN, "-");
348 	}
349 	*cp++ = forread ? '\t' : '\0';
350 
351 	if (tab)
352 		tab->mnt_fstype = cp;
353 	cp += snprintf(cp, MAXPATHLEN, "%s",
354 	    vfssw[vfsp->vfs_fstype].vsw_name);
355 	*cp++ = forread ? '\t' : '\0';
356 
357 	if (tab)
358 		tab->mnt_mntopts = cp;
359 	cp += mntfs_optprint(vfsp, cp);
360 	*cp++ = forread ? '\t' : '\0';
361 
362 	if (tab)
363 		tab->mnt_time = cp;
364 	cp += snprintf(cp, MAX_MNTOPT_STR, "%ld", vfsp->vfs_mtime);
365 	*cp++ = forread ? '\n' : '\0';
366 
367 	if (tab) {
368 		tab->mnt_major = getmajor(vfsp->vfs_dev);
369 		tab->mnt_minor = getminor(vfsp->vfs_dev);
370 	}
371 
372 	*basep = cp;
373 }
374 
375 static void
376 mntfs_zone_generate(zone_t *zone, int showhidden, struct extmnttab *tab,
377     char *basep, int forread)
378 {
379 	vfs_t *zonelist;
380 	vfs_t *vfsp;
381 	char *cp = basep;
382 
383 	/*
384 	 * If the zone has a root entry, it will be the first in the list.  If
385 	 * it doesn't, we conjure one up.
386 	 */
387 	vfsp = zonelist = zone->zone_vfslist;
388 	if (zonelist == NULL ||
389 	    strcmp(refstr_value(vfsp->vfs_mntpt), zone->zone_rootpath) != 0) {
390 		vfs_t tvfs;
391 		/*
392 		 * The root of the zone is not a mount point.  The vfs we want
393 		 * to report is that of the zone's root vnode.
394 		 */
395 		ASSERT(zone != global_zone);
396 		mntfs_zonerootvfs(zone, &tvfs);
397 		mntfs_vfs_generate(&tvfs, zone, tab, &cp, forread);
398 		refstr_rele(tvfs.vfs_mntpt);
399 		if (tab)
400 			tab++;
401 	}
402 	if (zonelist == NULL)
403 		return;
404 	do {
405 		/*
406 		 * Skip mounts that should not show up in mnttab
407 		 */
408 		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
409 			vfsp = vfsp->vfs_zone_next;
410 			continue;
411 		}
412 		mntfs_vfs_generate(vfsp, zone, tab, &cp, forread);
413 		if (tab)
414 			tab++;
415 		vfsp = vfsp->vfs_zone_next;
416 	} while (vfsp != zonelist);
417 }
418 
419 static void
420 mntfs_global_generate(int showhidden, struct extmnttab *tab, char *basep,
421     int forread)
422 {
423 	vfs_t *vfsp;
424 	char *cp = basep;
425 
426 	vfsp = rootvfs;
427 	do {
428 		/*
429 		 * Skip mounts that should not show up in mnttab
430 		 */
431 		if (!showhidden && vfsp->vfs_flag & VFS_NOMNTTAB) {
432 			vfsp = vfsp->vfs_next;
433 			continue;
434 		}
435 		mntfs_vfs_generate(vfsp, global_zone, tab, &cp, forread);
436 		if (tab)
437 			tab++;
438 		vfsp = vfsp->vfs_next;
439 	} while (vfsp != rootvfs);
440 }
441 
442 static char *
443 mntfs_mapin(char *base, size_t size)
444 {
445 	size_t rlen = roundup(size, PAGESIZE);
446 	struct as *as = curproc->p_as;
447 	char *addr;
448 
449 	as_rangelock(as);
450 	map_addr(&addr, rlen, 0, 1, 0);
451 	if (addr == NULL || as_map(as, addr, rlen, segvn_create, zfod_argsp)) {
452 		as_rangeunlock(as);
453 		return (NULL);
454 	}
455 	as_rangeunlock(as);
456 	if (copyout(base, addr, size)) {
457 		(void) as_unmap(as, addr, rlen);
458 		return (NULL);
459 	}
460 	return (addr);
461 }
462 
463 static void
464 mntfs_freesnap(mntsnap_t *snap)
465 {
466 	if (snap->mnts_text != NULL)
467 		(void) as_unmap(curproc->p_as, snap->mnts_text,
468 			roundup(snap->mnts_textsize, PAGESIZE));
469 	snap->mnts_textsize = snap->mnts_count = 0;
470 	if (snap->mnts_metadata != NULL)
471 		(void) as_unmap(curproc->p_as, snap->mnts_metadata,
472 			roundup(snap->mnts_metasize, PAGESIZE));
473 	snap->mnts_metasize = 0;
474 }
475 
#ifdef _SYSCALL32_IMPL

/*
 * Layout of an extmnttab record as handed to ILP32 callers.  The five
 * string fields hold 32-bit user addresses instead of native pointers;
 * mntfs_snapshot() converts native records into this form in place.
 */
typedef struct extmnttab32 {
	uint32_t	mnt_special;
	uint32_t	mnt_mountp;
	uint32_t	mnt_fstype;
	uint32_t	mnt_mntopts;
	uint32_t	mnt_time;
	uint_t		mnt_major;
	uint_t		mnt_minor;
} extmnttab32_t;

#endif
489 
/*
 * Snapshot the latest version of the kernel mounted resource information
 *
 * There are two types of snapshots: one destined for reading, and one destined
 * for ioctl().  The difference is that the ioctl() interface is delimited by
 * NUL characters, while the read() interface is delimited by tabs and
 * newlines.
 *
 * forread selects which of the node's two snapshots (mnt_read or mnt_ioctl)
 * is refreshed.  datamodel selects the metadata layout for an ioctl()
 * snapshot when _SYSCALL32_IMPL is defined; it is unused otherwise.
 *
 * Returns 0 if the existing snapshot is still current or a new one was
 * installed, and ENOMEM if the text or metadata could not be mapped into
 * the caller's address space.
 */
/* ARGSUSED */
static int
mntfs_snapshot(mntnode_t *mnp, int forread, int datamodel)
{
	size_t size;
	timespec_t lastmodt;
	mntdata_t *mntdata = MTOD(mnp);
	zone_t *zone = mntdata->mnt_zone;
	boolean_t global_view = (MTOD(mnp)->mnt_zone == global_zone);
	boolean_t showhidden = ((mnp->mnt_flags & MNT_SHOWHIDDEN) != 0);
	struct extmnttab *metadata_baseaddr;
	char *text_baseaddr;
	int i;
	mntsnap_t *snap;

	if (forread)
		snap = &mnp->mnt_read;
	else
		snap = &mnp->mnt_ioctl;

	/*
	 * The vfs list lock is held across both the sizing pass and the
	 * generating pass so that they see the same set of mounts.
	 */
	vfs_list_read_lock();
	/*
	 * Check if the mnttab info has changed since the last snapshot
	 */
	vfs_mnttab_modtime(&lastmodt);
	if (snap->mnts_count &&
	    lastmodt.tv_sec == snap->mnts_time.tv_sec &&
	    lastmodt.tv_nsec == snap->mnts_time.tv_nsec) {
		vfs_list_unlock();
		return (0);
	}


	/* Discard the stale snapshot, then size the new one. */
	if (snap->mnts_count != 0)
		mntfs_freesnap(snap);
	if (global_view)
		size = mntfs_global_len(&snap->mnts_count, showhidden);
	else
		size = mntfs_zone_len(&snap->mnts_count, zone, showhidden);
	ASSERT(size != 0);

	/* Only ioctl() snapshots carry per-entry extmnttab metadata. */
	if (!forread)
		metadata_baseaddr = kmem_alloc(
		    snap->mnts_count * sizeof (struct extmnttab), KM_SLEEP);
	else
		metadata_baseaddr = NULL;

	text_baseaddr = kmem_alloc(size, KM_SLEEP);

	if (global_view)
		mntfs_global_generate(showhidden, metadata_baseaddr,
		    text_baseaddr, forread);
	else
		mntfs_zone_generate(zone, showhidden,
		    metadata_baseaddr, text_baseaddr, forread);

	/* Record the modification time this snapshot corresponds to. */
	vfs_mnttab_modtime(&snap->mnts_time);
	vfs_list_unlock();

	/*
	 * Copy the text out to user space.  text_baseaddr is freed here, but
	 * its value is still used below purely for offset arithmetic; it is
	 * never dereferenced again.
	 */
	snap->mnts_text = mntfs_mapin(text_baseaddr, size);
	snap->mnts_textsize = size;
	kmem_free(text_baseaddr, size);

	/*
	 * The pointers in the metadata refer to addresses in the range
	 * [base_addr, base_addr + size].  Now that we have mapped the text into
	 * the user's address space, we have to convert these addresses into the
	 * new (user) range.  We also handle the conversion for 32-bit and
	 * 64-bit applications here.
	 */
	if (!forread) {
		struct extmnttab *tab;
#ifdef _SYSCALL32_IMPL
		struct extmnttab32 *tab32;

		if (datamodel == DATAMODEL_ILP32) {
			/*
			 * Repack the native records into extmnttab32 records
			 * in the same buffer.  Both arrays start at the same
			 * address, so each field of tab[i] is read in the
			 * same statement that stores the matching field of
			 * tab32[i].
			 */
			tab = (struct extmnttab *)metadata_baseaddr;
			tab32 = (struct extmnttab32 *)metadata_baseaddr;

			for (i = 0; i < snap->mnts_count; i++) {
				tab32[i].mnt_special =
				    (uintptr_t)snap->mnts_text +
				    (tab[i].mnt_special - text_baseaddr);
				tab32[i].mnt_mountp =
				    (uintptr_t)snap->mnts_text +
				    (tab[i].mnt_mountp - text_baseaddr);
				tab32[i].mnt_fstype =
				    (uintptr_t)snap->mnts_text +
				    (tab[i].mnt_fstype - text_baseaddr);
				tab32[i].mnt_mntopts =
				    (uintptr_t)snap->mnts_text +
				    (tab[i].mnt_mntopts - text_baseaddr);
				tab32[i].mnt_time = (uintptr_t)snap->mnts_text +
				    (tab[i].mnt_time - text_baseaddr);
				tab32[i].mnt_major = tab[i].mnt_major;
				tab32[i].mnt_minor = tab[i].mnt_minor;
			}

			snap->mnts_metasize =
			    snap->mnts_count * sizeof (struct extmnttab32);
			snap->mnts_metadata = mntfs_mapin(
			    (char *)metadata_baseaddr,
			    snap->mnts_metasize);

		} else {
#endif
			/* Native layout: rebase the pointers in place. */
			tab = (struct extmnttab *)metadata_baseaddr;
			for (i = 0; i < snap->mnts_count; i++) {
				tab[i].mnt_special = snap->mnts_text +
				    (tab[i].mnt_special - text_baseaddr);
				tab[i].mnt_mountp = snap->mnts_text +
				    (tab[i].mnt_mountp - text_baseaddr);
				tab[i].mnt_fstype = snap->mnts_text +
				    (tab[i].mnt_fstype - text_baseaddr);
				tab[i].mnt_mntopts = snap->mnts_text +
				    (tab[i].mnt_mntopts - text_baseaddr);
				tab[i].mnt_time = snap->mnts_text +
				    (tab[i].mnt_time - text_baseaddr);
			}

			snap->mnts_metasize =
			    snap->mnts_count * sizeof (struct extmnttab);
			snap->mnts_metadata = mntfs_mapin(
			    (char *)metadata_baseaddr, snap->mnts_metasize);
#ifdef _SYSCALL32_IMPL
		}
#endif

		/* Freed with the size it was allocated with (native records). */
		kmem_free(metadata_baseaddr,
		    snap->mnts_count * sizeof (struct extmnttab));
	}

	/* Remember the size; mntgetattr() falls back to it. */
	mntdata->mnt_size = size;

	/* Either mapping failing invalidates the whole snapshot. */
	if (snap->mnts_text == NULL ||
	    (!forread && snap->mnts_metadata == NULL)) {
		mntfs_freesnap(snap);
		return (ENOMEM);
	}

	return (0);
}
639 
640 /*
641  * Public function to convert vfs_mntopts into a string.
642  * A buffer of sufficient size is allocated, which is returned via bufp,
643  * and whose length is returned via lenp.
644  */
645 void
646 mntfs_getmntopts(struct vfs *vfsp, char **bufp, size_t *lenp)
647 {
648 	size_t len;
649 	char *buf;
650 
651 	vfs_list_read_lock();
652 
653 	len = mntfs_optsize(vfsp) + 1;
654 	buf = kmem_alloc(len, KM_NOSLEEP);
655 	if (buf == NULL) {
656 		*bufp = NULL;
657 		vfs_list_unlock();
658 		return;
659 	}
660 	buf[len - 1] = '\0';
661 	(void) mntfs_optprint(vfsp, buf);
662 	ASSERT(buf[len - 1] == '\0');
663 
664 	vfs_list_unlock();
665 	*bufp = buf;
666 	*lenp = len;
667 }
668 
669 
670 /* ARGSUSED */
671 static int
672 mntopen(vnode_t **vpp, int flag, cred_t *cr)
673 {
674 	vnode_t *vp = *vpp;
675 	mntnode_t *nmnp;
676 
677 	/*
678 	 * Not allowed to open for writing, return error.
679 	 */
680 	if (flag & FWRITE)
681 		return (EPERM);
682 	/*
683 	 * Create a new mnt/vnode for each open, this will give us a handle to
684 	 * hang the snapshot on.
685 	 */
686 	nmnp = mntgetnode(vp);
687 
688 	*vpp = MTOV(nmnp);
689 	atomic_add_32(&MTOD(nmnp)->mnt_nopen, 1);
690 	VN_RELE(vp);
691 	return (0);
692 }
693 
694 /* ARGSUSED */
695 static int
696 mntclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
697 {
698 	mntnode_t *mnp = VTOM(vp);
699 
700 	/* Clean up any locks or shares held by the current process */
701 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
702 	cleanshares(vp, ttoproc(curthread)->p_pid);
703 
704 	if (count > 1)
705 		return (0);
706 	if (vp->v_count == 1) {
707 		mntfs_freesnap(&mnp->mnt_read);
708 		mntfs_freesnap(&mnp->mnt_ioctl);
709 		atomic_add_32(&MTOD(mnp)->mnt_nopen, -1);
710 	}
711 	return (0);
712 }
713 
714 /* ARGSUSED */
715 static int
716 mntread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, caller_context_t *ct)
717 {
718 	int error = 0;
719 	off_t off = uio->uio_offset;
720 	size_t len = uio->uio_resid;
721 	mntnode_t *mnp = VTOM(vp);
722 	char *buf;
723 	mntsnap_t *snap = &mnp->mnt_read;
724 	int datamodel;
725 
726 	if (off == (off_t)0 || snap->mnts_count == 0) {
727 		/*
728 		 * It is assumed that any kernel callers wishing
729 		 * to read mnttab will be using extmnttab entries
730 		 * and not extmnttab32 entries, whether or not
731 		 * the kernel is LP64 or ILP32.  Thus, force the
732 		 * datamodel that mntfs_snapshot uses to be
733 		 * DATAMODEL_LP64.
734 		 */
735 		if (uio->uio_segflg == UIO_SYSSPACE)
736 			datamodel = DATAMODEL_LP64;
737 		else
738 			datamodel = get_udatamodel();
739 		if ((error = mntfs_snapshot(mnp, 1, datamodel)) != 0)
740 			return (error);
741 	}
742 	if ((size_t)(off + len) > snap->mnts_textsize)
743 		len = snap->mnts_textsize - off;
744 
745 	if (off < 0 || len > snap->mnts_textsize)
746 		return (EFAULT);
747 
748 	if (len == 0)
749 		return (0);
750 
751 	/*
752 	 * The mnttab image is stored in the user's address space,
753 	 * so we have to copy it into the kernel from userland,
754 	 * then copy it back out to the specified address.
755 	 */
756 	buf = kmem_alloc(len, KM_SLEEP);
757 	if (copyin(snap->mnts_text + off, buf, len))
758 		error = EFAULT;
759 	else {
760 		error = uiomove(buf, len, UIO_READ, uio);
761 	}
762 	kmem_free(buf, len);
763 
764 	return (error);
765 }
766 
767 
768 static int
769 mntgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
770 {
771 	mntnode_t *mnp = VTOM(vp);
772 	int error;
773 	vnode_t *rvp;
774 	extern timespec_t vfs_mnttab_ctime;
775 	mntdata_t *mntdata = MTOD(VTOM(vp));
776 	mntsnap_t *snap = mnp->mnt_read.mnts_count ?
777 	    &mnp->mnt_read : &mnp->mnt_ioctl;
778 
779 	/*
780 	 * Return all the attributes.  Should be refined
781 	 * so that it returns only those asked for.
782 	 * Most of this is complete fakery anyway.
783 	 */
784 	rvp = mnp->mnt_mountvp;
785 	/*
786 	 * Attributes are same as underlying file with modifications
787 	 */
788 	if (error = VOP_GETATTR(rvp, vap, flags, cr))
789 		return (error);
790 
791 	/*
792 	 * We always look like a regular file
793 	 */
794 	vap->va_type = VREG;
795 	/*
796 	 * mode should basically be read only
797 	 */
798 	vap->va_mode &= 07444;
799 	vap->va_fsid = vp->v_vfsp->vfs_dev;
800 	vap->va_blksize = DEV_BSIZE;
801 	vap->va_rdev = 0;
802 	vap->va_seq = 0;
803 	/*
804 	 * Set nlink to the number of open vnodes for mnttab info
805 	 * plus one for existing.
806 	 */
807 	vap->va_nlink = mntdata->mnt_nopen + 1;
808 	/*
809 	 * If we haven't taken a snapshot yet, set the
810 	 * size to the size of the latest snapshot.
811 	 */
812 	vap->va_size = snap->mnts_textsize ? snap->mnts_textsize :
813 	    mntdata->mnt_size;
814 	/*
815 	 * Fetch mtime from the vfs mnttab timestamp
816 	 */
817 	vap->va_ctime = vfs_mnttab_ctime;
818 	vfs_list_read_lock();
819 	vfs_mnttab_modtime(&vap->va_mtime);
820 	vap->va_atime = vap->va_mtime;
821 	vfs_list_unlock();
822 	/*
823 	 * Nodeid is always ROOTINO;
824 	 */
825 	vap->va_nodeid = (ino64_t)MNTROOTINO;
826 	vap->va_nblocks = btod(vap->va_size);
827 	return (0);
828 }
829 
830 
831 static int
832 mntaccess(vnode_t *vp, int mode, int flags, cred_t *cr)
833 {
834 	mntnode_t *mnp = VTOM(vp);
835 
836 	if (mode & (VWRITE|VEXEC))
837 		return (EROFS);
838 
839 	/*
840 	 * Do access check on the underlying directory vnode.
841 	 */
842 	return (VOP_ACCESS(mnp->mnt_mountvp, mode, flags, cr));
843 }
844 
845 
846 /*
847  * New /mntfs vnode required; allocate it and fill in most of the fields.
848  */
849 static mntnode_t *
850 mntgetnode(vnode_t *dp)
851 {
852 	mntnode_t *mnp;
853 	vnode_t *vp;
854 
855 	mnp = kmem_zalloc(sizeof (mntnode_t), KM_SLEEP);
856 	mnp->mnt_vnode = vn_alloc(KM_SLEEP);
857 	mnp->mnt_mountvp = VTOM(dp)->mnt_mountvp;
858 	vp = MTOV(mnp);
859 	vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
860 	vn_setops(vp, mntvnodeops);
861 	vp->v_vfsp = dp->v_vfsp;
862 	vp->v_type = VREG;
863 	vp->v_data = (caddr_t)mnp;
864 
865 	return (mnp);
866 }
867 
868 /*
869  * Free the storage obtained from mntgetnode().
870  */
871 static void
872 mntfreenode(mntnode_t *mnp)
873 {
874 	vnode_t *vp = MTOV(mnp);
875 
876 	vn_invalid(vp);
877 	vn_free(vp);
878 	kmem_free(mnp, sizeof (*mnp));
879 }
880 
881 
882 /* ARGSUSED */
883 static int
884 mntfsync(vnode_t *vp, int syncflag, cred_t *cr)
885 {
886 	return (0);
887 }
888 
889 /* ARGSUSED */
890 static void
891 mntinactive(vnode_t *vp, cred_t *cr)
892 {
893 	mntnode_t *mnp = VTOM(vp);
894 
895 	mntfreenode(mnp);
896 }
897 
898 /* ARGSUSED */
899 static int
900 mntseek(vnode_t *vp, offset_t ooff, offset_t *noffp)
901 {
902 	if (*noffp == 0)
903 		VTOM(vp)->mnt_offset = 0;
904 
905 	return (0);
906 }
907 
908 /*
909  * Return the answer requested to poll().
910  * POLLRDBAND will return when the mtime of the mnttab
911  * information is newer than the latest one read for this open.
912  */
913 /* ARGSUSED */
914 static int
915 mntpoll(vnode_t *vp, short ev, int any, short *revp, pollhead_t **phpp)
916 {
917 	mntnode_t *mnp = VTOM(vp);
918 	mntsnap_t *snap = &mnp->mnt_read;
919 
920 	if (mnp->mnt_ioctl.mnts_time.tv_sec > snap->mnts_time.tv_sec ||
921 	    (mnp->mnt_ioctl.mnts_time.tv_sec == snap->mnts_time.tv_sec &&
922 	    mnp->mnt_ioctl.mnts_time.tv_nsec > snap->mnts_time.tv_nsec))
923 		snap = &mnp->mnt_ioctl;
924 
925 	*revp = 0;
926 	*phpp = (pollhead_t *)NULL;
927 	if (ev & POLLIN)
928 		*revp |= POLLIN;
929 
930 	if (ev & POLLRDNORM)
931 		*revp |= POLLRDNORM;
932 
933 	if (ev & POLLRDBAND) {
934 		vfs_mnttab_poll(&snap->mnts_time, phpp);
935 		if (*phpp == (pollhead_t *)NULL)
936 			*revp |= POLLRDBAND;
937 	}
938 	if (*revp || *phpp != NULL || any) {
939 		return (0);
940 	}
941 	/*
942 	 * If someone is polling an unsupported poll events (e.g.
943 	 * POLLOUT, POLLPRI, etc.), just return POLLERR revents.
944 	 * That way we will ensure that we don't return a 0
945 	 * revents with a NULL pollhead pointer.
946 	 */
947 	*revp = POLLERR;
948 	return (0);
949 }
950 /* ARGSUSED */
951 static int
952 mntioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
953 	cred_t *cr, int *rvalp)
954 {
955 	uint_t *up = (uint_t *)arg;
956 	mntnode_t *mnp = VTOM(vp);
957 	mntsnap_t *snap = &mnp->mnt_ioctl;
958 	int error;
959 
960 	error = 0;
961 	switch (cmd) {
962 
963 	case MNTIOC_NMNTS: {		/* get no. of mounted resources */
964 		if (snap->mnts_count == 0) {
965 			if ((error =
966 			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK)) != 0)
967 				return (error);
968 		}
969 		if (suword32(up, snap->mnts_count) != 0)
970 			error = EFAULT;
971 		break;
972 	}
973 
974 	case MNTIOC_GETDEVLIST: {	/* get mounted device major/minor nos */
975 		uint_t *devlist;
976 		int i;
977 		size_t len;
978 
979 		if (snap->mnts_count == 0) {
980 			if ((error =
981 			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK)) != 0)
982 				return (error);
983 		}
984 
985 		len = 2 * snap->mnts_count * sizeof (uint_t);
986 		devlist = kmem_alloc(len, KM_SLEEP);
987 		for (i = 0; i < snap->mnts_count; i++) {
988 
989 #ifdef _SYSCALL32_IMPL
990 			if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) {
991 				struct extmnttab32 tab;
992 
993 				if ((error = xcopyin(snap->mnts_text +
994 				    i * sizeof (struct extmnttab32), &tab,
995 				    sizeof (tab))) != 0)
996 					break;
997 
998 				devlist[i*2] = tab.mnt_major;
999 				devlist[i*2+1] = tab.mnt_minor;
1000 			} else {
1001 #endif
1002 				struct extmnttab tab;
1003 
1004 				if ((error = xcopyin(snap->mnts_text +
1005 				    i * sizeof (struct extmnttab), &tab,
1006 				    sizeof (tab))) != 0)
1007 					break;
1008 
1009 				devlist[i*2] = tab.mnt_major;
1010 				devlist[i*2+1] = tab.mnt_minor;
1011 #ifdef _SYSCALL32_IMPL
1012 			}
1013 #endif
1014 		}
1015 
1016 		if (error == 0)
1017 			error = xcopyout(devlist, up, len);
1018 		kmem_free(devlist, len);
1019 		break;
1020 	}
1021 
1022 	case MNTIOC_SETTAG:		/* set tag on mounted file system */
1023 	case MNTIOC_CLRTAG:		/* clear tag on mounted file system */
1024 	{
1025 		struct mnttagdesc *dp = (struct mnttagdesc *)arg;
1026 		STRUCT_DECL(mnttagdesc, tagdesc);
1027 		char *cptr;
1028 		uint32_t major, minor;
1029 		char tagbuf[MAX_MNTOPT_TAG];
1030 		char *pbuf;
1031 		size_t len;
1032 		uint_t start = 0;
1033 		mntdata_t *mntdata = MTOD(mnp);
1034 		zone_t *zone = mntdata->mnt_zone;
1035 
1036 		STRUCT_INIT(tagdesc, flag & DATAMODEL_MASK);
1037 		if (copyin(dp, STRUCT_BUF(tagdesc), STRUCT_SIZE(tagdesc))) {
1038 			error = EFAULT;
1039 			break;
1040 		}
1041 		pbuf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1042 		if (zone != global_zone) {
1043 			(void) strcpy(pbuf, zone->zone_rootpath);
1044 			/* truncate "/" and nul */
1045 			start = zone->zone_rootpathlen - 2;
1046 			ASSERT(pbuf[start] == '/');
1047 		}
1048 		cptr = STRUCT_FGETP(tagdesc, mtd_mntpt);
1049 		error = copyinstr(cptr, pbuf + start, MAXPATHLEN - start, &len);
1050 		if (error) {
1051 			kmem_free(pbuf, MAXPATHLEN);
1052 			break;
1053 		}
1054 		if (start != 0 && pbuf[start] != '/') {
1055 			kmem_free(pbuf, MAXPATHLEN);
1056 			error = EINVAL;
1057 			break;
1058 		}
1059 		cptr = STRUCT_FGETP(tagdesc, mtd_tag);
1060 		if ((error = copyinstr(cptr, tagbuf, MAX_MNTOPT_TAG, &len))) {
1061 			kmem_free(pbuf, MAXPATHLEN);
1062 			break;
1063 		}
1064 		major = STRUCT_FGET(tagdesc, mtd_major);
1065 		minor = STRUCT_FGET(tagdesc, mtd_minor);
1066 		if (cmd == MNTIOC_SETTAG)
1067 			error = vfs_settag(major, minor, pbuf, tagbuf, cr);
1068 		else
1069 			error = vfs_clrtag(major, minor, pbuf, tagbuf, cr);
1070 		kmem_free(pbuf, MAXPATHLEN);
1071 		break;
1072 	}
1073 
1074 	case MNTIOC_SHOWHIDDEN:
1075 	{
1076 		mutex_enter(&vp->v_lock);
1077 		mnp->mnt_flags |= MNT_SHOWHIDDEN;
1078 		mutex_exit(&vp->v_lock);
1079 		break;
1080 	}
1081 
1082 	case MNTIOC_GETMNTENT:
1083 	{
1084 		size_t idx;
1085 		uintptr_t addr;
1086 
1087 		idx = mnp->mnt_offset;
1088 		if (snap->mnts_count == 0 || idx == 0) {
1089 			if ((error =
1090 			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK)) != 0)
1091 				return (error);
1092 		}
1093 		/*
1094 		 * If the next index is beyond the end of the current mnttab,
1095 		 * return EOF
1096 		 */
1097 		if (idx >= snap->mnts_count) {
1098 			*rvalp = 1;
1099 			return (0);
1100 		}
1101 
1102 #ifdef _SYSCALL32_IMPL
1103 		if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) {
1104 			addr = (uintptr_t)(snap->mnts_metadata + idx *
1105 			    sizeof (struct extmnttab32));
1106 			error = suword32((void *)arg, addr);
1107 		} else {
1108 #endif
1109 			addr = (uintptr_t)(snap->mnts_metadata + idx *
1110 			    sizeof (struct extmnttab));
1111 			error = sulword((void *)arg, addr);
1112 #ifdef _SYSCALL32_IMPL
1113 		}
1114 #endif
1115 
1116 		if (error != 0)
1117 			return (error);
1118 
1119 		mnp->mnt_offset++;
1120 		break;
1121 	}
1122 
1123 	default:
1124 		error = EINVAL;
1125 		break;
1126 	}
1127 
1128 	return (error);
1129 }
1130 
1131 
/*
 * /mntfs vnode operations vector
 *
 * Pairs each vnode operation name with the mntfs routine that implements
 * it.  DISPOSE and SHRLOCK are routed to fs_error.  The table is terminated
 * by a NULL, NULL pair.
 */
const fs_operation_def_t mnt_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = mntopen },
	VOPNAME_CLOSE,		{ .vop_close = mntclose },
	VOPNAME_READ,		{ .vop_read = mntread },
	VOPNAME_IOCTL,		{ .vop_ioctl = mntioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = mntgetattr },
	VOPNAME_ACCESS,		{ .vop_access = mntaccess },
	VOPNAME_FSYNC,		{ .vop_fsync = mntfsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = mntinactive },
	VOPNAME_SEEK,		{ .vop_seek = mntseek },
	VOPNAME_POLL,		{ .vop_poll = mntpoll },
	VOPNAME_DISPOSE,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	NULL,			NULL
};
1150