xref: /titanic_44/usr/src/uts/common/fs/mntfs/mntvnops.c (revision b695575577bae0337af339d76949713bfe1c9013)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/file.h>
27 #include <sys/stat.h>
28 #include <sys/atomic.h>
29 #include <sys/mntio.h>
30 #include <sys/mnttab.h>
31 #include <sys/mount.h>
32 #include <sys/sunddi.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/vfs.h>
36 #include <sys/vfs_opreg.h>
37 #include <sys/fs/mntdata.h>
38 #include <fs/fs_subr.h>
39 #include <sys/vmsystm.h>
40 #include <vm/seg_vn.h>
41 
/* Node id reported in va_nodeid for every mntfs vnode (see mntgetattr()). */
#define	MNTROOTINO	2

/* Allocates the per-open mntnode/vnode pair; defined later in this file. */
static mntnode_t *mntgetnode(vnode_t *);

/* Operations vector installed on each vnode created by mntgetnode(). */
vnodeops_t *mntvnodeops;
/* Defined elsewhere; called here after a snapshot is taken or read. */
extern void vfs_mnttab_readop(void);
48 
49 /*
50  * Design of kernel mnttab accounting.
51  *
52  * To support whitespace in mount names, we implement an ioctl
53  * (MNTIOC_GETMNTENT) which allows a programmatic interface to the data in
54  * /etc/mnttab.  The libc functions getmntent() and getextmntent() are built
55  * atop this interface.
56  *
57  * To minimize the amount of memory used in the kernel, we keep all the
58  * necessary information in the user's address space.  Large server
59  * configurations can have /etc/mnttab files in excess of 64k.
60  *
61  * To support both vanilla read() calls as well as ioctl() calls, we have two
62  * different snapshots of the kernel data structures, mnt_read and mnt_ioctl.
63  * These snapshots include the base location in user memory, the number of
64  * mounts in the snapshot, and any metadata associated with it.  The metadata is
65  * used only to support the ioctl() interface, and is a series of extmnttab
66  * structures.  When the user issues an ioctl(), we simply copyout a pointer to
67  * that structure, and the rest is handled in userland.
68  */
69 
/*
 * NOTE: The following variable enables the generation of the "dev=xxx"
 * entry in the option string for a mounted file system.  Ideally this would
 * be removed altogether, but for the sake of backwards compatibility
 * it has been left in.  It is defined as a 32-bit device number.  This
 * means that when 64-bit device numbers are in use, if either the major or
 * minor part of the device number will not fit in a 16 bit quantity, the
 * "dev=" will be set to NODEV (0x7fffffff).  See PSARC 1999/566 and
 * 1999/131 for details.  The cmpldev() function used to generate the 32-bit
 * device number handles this check and assigns the proper value.
 */
int mntfs_enabledev = 1;	/* enable old "dev=xxx" option */
82 
83 static int
84 mntfs_devsize(struct vfs *vfsp)
85 {
86 	dev32_t odev;
87 
88 	(void) cmpldev(&odev, vfsp->vfs_dev);
89 	return (snprintf(NULL, 0, "dev=%x", odev));
90 }
91 
92 static int
93 mntfs_devprint(struct vfs *vfsp, char *buf)
94 {
95 	dev32_t odev;
96 
97 	(void) cmpldev(&odev, vfsp->vfs_dev);
98 	return (snprintf(buf, MAX_MNTOPT_STR, "dev=%x", odev));
99 }
100 
101 static int
102 mntfs_optsize(struct vfs *vfsp)
103 {
104 	int i, size = 0;
105 	mntopt_t *mop;
106 
107 	for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
108 		mop = &vfsp->vfs_mntopts.mo_list[i];
109 		if (mop->mo_flags & MO_NODISPLAY)
110 			continue;
111 		if (mop->mo_flags & MO_SET) {
112 			if (size)
113 				size++; /* space for comma */
114 			size += strlen(mop->mo_name);
115 			/*
116 			 * count option value if there is one
117 			 */
118 			if (mop->mo_arg != NULL) {
119 				size += strlen(mop->mo_arg) + 1;
120 			}
121 		}
122 	}
123 	if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
124 		/*
125 		 * Add space for "zone=<zone_name>" if required.
126 		 */
127 		if (size)
128 			size++;	/* space for comma */
129 		size += sizeof ("zone=") - 1;
130 		size += strlen(vfsp->vfs_zone->zone_name);
131 	}
132 	if (mntfs_enabledev) {
133 		if (size != 0)
134 			size++; /* space for comma */
135 		size += mntfs_devsize(vfsp);
136 	}
137 	if (size == 0)
138 		size = strlen("-");
139 	return (size);
140 }
141 
142 static int
143 mntfs_optprint(struct vfs *vfsp, char *buf)
144 {
145 	int i, optinbuf = 0;
146 	mntopt_t *mop;
147 	char *origbuf = buf;
148 
149 	for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
150 		mop = &vfsp->vfs_mntopts.mo_list[i];
151 		if (mop->mo_flags & MO_NODISPLAY)
152 			continue;
153 		if (mop->mo_flags & MO_SET) {
154 			if (optinbuf)
155 				*buf++ = ',';
156 			else
157 				optinbuf = 1;
158 			buf += snprintf(buf, MAX_MNTOPT_STR,
159 			    "%s", mop->mo_name);
160 			/*
161 			 * print option value if there is one
162 			 */
163 			if (mop->mo_arg != NULL) {
164 				buf += snprintf(buf, MAX_MNTOPT_STR, "=%s",
165 				    mop->mo_arg);
166 			}
167 		}
168 	}
169 	if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
170 		if (optinbuf)
171 			*buf++ = ',';
172 		else
173 			optinbuf = 1;
174 		buf += snprintf(buf, MAX_MNTOPT_STR, "zone=%s",
175 		    vfsp->vfs_zone->zone_name);
176 	}
177 	if (mntfs_enabledev) {
178 		if (optinbuf++)
179 			*buf++ = ',';
180 		buf += mntfs_devprint(vfsp, buf);
181 	}
182 	if (!optinbuf) {
183 		buf += snprintf(buf, MAX_MNTOPT_STR, "-");
184 	}
185 	return (buf - origbuf);
186 }
187 
188 static size_t
189 mntfs_vfs_len(vfs_t *vfsp, zone_t *zone)
190 {
191 	size_t size = 0;
192 	const char *resource, *mntpt;
193 
194 	mntpt = refstr_value(vfsp->vfs_mntpt);
195 	if (mntpt != NULL && mntpt[0] != '\0') {
196 		size += strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
197 	} else {
198 		size += strlen("-") + 1;
199 	}
200 
201 	resource = refstr_value(vfsp->vfs_resource);
202 	if (resource != NULL && resource[0] != '\0') {
203 		if (resource[0] != '/') {
204 			size += strlen(resource) + 1;
205 		} else if (!ZONE_PATH_VISIBLE(resource, zone)) {
206 			/*
207 			 * Same as the zone's view of the mount point.
208 			 */
209 			size += strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
210 		} else {
211 			size += strlen(ZONE_PATH_TRANSLATE(resource, zone)) + 1;
212 		}
213 	} else {
214 		size += strlen("-") + 1;
215 	}
216 	size += strlen(vfssw[vfsp->vfs_fstype].vsw_name) + 1;
217 	size += mntfs_optsize(vfsp);
218 	size += snprintf(NULL, 0, "\t%ld\n", vfsp->vfs_mtime);
219 	return (size);
220 }
221 
222 static void
223 mntfs_zonerootvfs(zone_t *zone, vfs_t *rootvfsp)
224 {
225 	/*
226 	 * Basically copy over the real vfs_t on which the root vnode is
227 	 * located, changing its mountpoint and resource to match those of
228 	 * the zone's rootpath.
229 	 */
230 	*rootvfsp = *zone->zone_rootvp->v_vfsp;
231 	rootvfsp->vfs_mntpt = refstr_alloc(zone->zone_rootpath);
232 	rootvfsp->vfs_resource = rootvfsp->vfs_mntpt;
233 }
234 
235 static size_t
236 mntfs_zone_len(uint_t *nent_ptr, zone_t *zone, int showhidden)
237 {
238 	struct vfs *zonelist;
239 	struct vfs *vfsp;
240 	size_t size = 0;
241 	uint_t cnt = 0;
242 
243 	ASSERT(zone->zone_rootpath != NULL);
244 
245 	/*
246 	 * If the zone has a root entry, it will be the first in the list.  If
247 	 * it doesn't, we conjure one up.
248 	 */
249 	vfsp = zonelist = zone->zone_vfslist;
250 	if (zonelist == NULL ||
251 	    strcmp(refstr_value(vfsp->vfs_mntpt), zone->zone_rootpath) != 0) {
252 		vfs_t tvfs;
253 		/*
254 		 * The root of the zone is not a mount point.  The vfs we want
255 		 * to report is that of the zone's root vnode.
256 		 */
257 		ASSERT(zone != global_zone);
258 		mntfs_zonerootvfs(zone, &tvfs);
259 		size += mntfs_vfs_len(&tvfs, zone);
260 		refstr_rele(tvfs.vfs_mntpt);
261 		cnt++;
262 	}
263 	if (zonelist == NULL)
264 		goto out;
265 	do {
266 		/*
267 		 * Skip mounts that should not show up in mnttab
268 		 */
269 		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
270 			vfsp = vfsp->vfs_zone_next;
271 			continue;
272 		}
273 		cnt++;
274 		size += mntfs_vfs_len(vfsp, zone);
275 		vfsp = vfsp->vfs_zone_next;
276 	} while (vfsp != zonelist);
277 out:
278 	*nent_ptr = cnt;
279 	return (size);
280 }
281 
282 static size_t
283 mntfs_global_len(uint_t *nent_ptr, int showhidden)
284 {
285 	struct vfs *vfsp;
286 	size_t size = 0;
287 	uint_t cnt = 0;
288 
289 	vfsp = rootvfs;
290 	do {
291 		/*
292 		 * Skip mounts that should not show up in mnttab
293 		 */
294 		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
295 			vfsp = vfsp->vfs_next;
296 			continue;
297 		}
298 		cnt++;
299 		size += mntfs_vfs_len(vfsp, global_zone);
300 		vfsp = vfsp->vfs_next;
301 	} while (vfsp != rootvfs);
302 	*nent_ptr = cnt;
303 	return (size);
304 }
305 
306 static void
307 mntfs_vfs_generate(vfs_t *vfsp, zone_t *zone, struct extmnttab *tab,
308     char **basep, int forread)
309 {
310 	const char *resource, *mntpt;
311 	char *cp = *basep;
312 
313 	mntpt = refstr_value(vfsp->vfs_mntpt);
314 	resource = refstr_value(vfsp->vfs_resource);
315 
316 	if (tab)
317 		tab->mnt_special = cp;
318 	if (resource != NULL && resource[0] != '\0') {
319 		if (resource[0] != '/') {
320 			cp += snprintf(cp, MAXPATHLEN, "%s", resource);
321 		} else if (!ZONE_PATH_VISIBLE(resource, zone)) {
322 			/*
323 			 * Use the mount point as the resource.
324 			 */
325 			cp += snprintf(cp, MAXPATHLEN, "%s",
326 			    ZONE_PATH_TRANSLATE(mntpt, zone));
327 		} else {
328 			cp += snprintf(cp, MAXPATHLEN, "%s",
329 			    ZONE_PATH_TRANSLATE(resource, zone));
330 		}
331 	} else {
332 		cp += snprintf(cp, MAXPATHLEN, "-");
333 	}
334 	*cp++ = forread ? '\t' : '\0';
335 
336 	if (tab)
337 		tab->mnt_mountp = cp;
338 	if (mntpt != NULL && mntpt[0] != '\0') {
339 		/*
340 		 * We know the mount point is visible from within the zone,
341 		 * otherwise it wouldn't be on the zone's vfs list.
342 		 */
343 		cp += snprintf(cp, MAXPATHLEN, "%s",
344 		    ZONE_PATH_TRANSLATE(mntpt, zone));
345 	} else {
346 		cp += snprintf(cp, MAXPATHLEN, "-");
347 	}
348 	*cp++ = forread ? '\t' : '\0';
349 
350 	if (tab)
351 		tab->mnt_fstype = cp;
352 	cp += snprintf(cp, MAXPATHLEN, "%s",
353 	    vfssw[vfsp->vfs_fstype].vsw_name);
354 	*cp++ = forread ? '\t' : '\0';
355 
356 	if (tab)
357 		tab->mnt_mntopts = cp;
358 	cp += mntfs_optprint(vfsp, cp);
359 	*cp++ = forread ? '\t' : '\0';
360 
361 	if (tab)
362 		tab->mnt_time = cp;
363 	cp += snprintf(cp, MAX_MNTOPT_STR, "%ld", vfsp->vfs_mtime);
364 	*cp++ = forread ? '\n' : '\0';
365 
366 	if (tab) {
367 		tab->mnt_major = getmajor(vfsp->vfs_dev);
368 		tab->mnt_minor = getminor(vfsp->vfs_dev);
369 	}
370 
371 	*basep = cp;
372 }
373 
374 static void
375 mntfs_zone_generate(zone_t *zone, int showhidden, struct extmnttab *tab,
376     char *basep, int forread)
377 {
378 	vfs_t *zonelist;
379 	vfs_t *vfsp;
380 	char *cp = basep;
381 
382 	/*
383 	 * If the zone has a root entry, it will be the first in the list.  If
384 	 * it doesn't, we conjure one up.
385 	 */
386 	vfsp = zonelist = zone->zone_vfslist;
387 	if (zonelist == NULL ||
388 	    strcmp(refstr_value(vfsp->vfs_mntpt), zone->zone_rootpath) != 0) {
389 		vfs_t tvfs;
390 		/*
391 		 * The root of the zone is not a mount point.  The vfs we want
392 		 * to report is that of the zone's root vnode.
393 		 */
394 		ASSERT(zone != global_zone);
395 		mntfs_zonerootvfs(zone, &tvfs);
396 		mntfs_vfs_generate(&tvfs, zone, tab, &cp, forread);
397 		refstr_rele(tvfs.vfs_mntpt);
398 		if (tab)
399 			tab++;
400 	}
401 	if (zonelist == NULL)
402 		return;
403 	do {
404 		/*
405 		 * Skip mounts that should not show up in mnttab
406 		 */
407 		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
408 			vfsp = vfsp->vfs_zone_next;
409 			continue;
410 		}
411 		mntfs_vfs_generate(vfsp, zone, tab, &cp, forread);
412 		if (tab)
413 			tab++;
414 		vfsp = vfsp->vfs_zone_next;
415 	} while (vfsp != zonelist);
416 }
417 
418 static void
419 mntfs_global_generate(int showhidden, struct extmnttab *tab, char *basep,
420     int forread)
421 {
422 	vfs_t *vfsp;
423 	char *cp = basep;
424 
425 	vfsp = rootvfs;
426 	do {
427 		/*
428 		 * Skip mounts that should not show up in mnttab
429 		 */
430 		if (!showhidden && vfsp->vfs_flag & VFS_NOMNTTAB) {
431 			vfsp = vfsp->vfs_next;
432 			continue;
433 		}
434 		mntfs_vfs_generate(vfsp, global_zone, tab, &cp, forread);
435 		if (tab)
436 			tab++;
437 		vfsp = vfsp->vfs_next;
438 	} while (vfsp != rootvfs);
439 }
440 
441 static char *
442 mntfs_mapin(char *base, size_t size)
443 {
444 	size_t rlen = roundup(size, PAGESIZE);
445 	struct as *as = curproc->p_as;
446 	char *addr = NULL;
447 
448 	as_rangelock(as);
449 	map_addr(&addr, rlen, 0, 1, 0);
450 	if (addr == NULL || as_map(as, addr, rlen, segvn_create, zfod_argsp)) {
451 		as_rangeunlock(as);
452 		return (NULL);
453 	}
454 	as_rangeunlock(as);
455 	if (copyout(base, addr, size)) {
456 		(void) as_unmap(as, addr, rlen);
457 		return (NULL);
458 	}
459 	return (addr);
460 }
461 
462 static void
463 mntfs_freesnap(mntsnap_t *snap)
464 {
465 	if (snap->mnts_text != NULL)
466 		(void) as_unmap(curproc->p_as, snap->mnts_text,
467 		    roundup(snap->mnts_textsize, PAGESIZE));
468 	snap->mnts_textsize = snap->mnts_count = 0;
469 	if (snap->mnts_metadata != NULL)
470 		(void) as_unmap(curproc->p_as, snap->mnts_metadata,
471 		    roundup(snap->mnts_metasize, PAGESIZE));
472 	snap->mnts_metasize = 0;
473 }
474 
#ifdef _SYSCALL32_IMPL

/*
 * Layout of a metadata record handed to ILP32 callers: the five string
 * pointers are carried as 32-bit user addresses.  mntfs_snapshot() converts
 * its struct extmnttab records into this form before mapping them into a
 * 32-bit process.
 */
typedef struct extmnttab32 {
	uint32_t	mnt_special;	/* user address of resource string */
	uint32_t	mnt_mountp;	/* user address of mount point string */
	uint32_t	mnt_fstype;	/* user address of fs type string */
	uint32_t	mnt_mntopts;	/* user address of option string */
	uint32_t	mnt_time;	/* user address of mount time string */
	uint_t		mnt_major;	/* major number of the mounted device */
	uint_t		mnt_minor;	/* minor number of the mounted device */
} extmnttab32_t;

#endif
488 
/*
 * Snapshot the latest version of the kernel mounted resource information
 *
 * There are two types of snapshots: one destined for reading, and one destined
 * for ioctl().  The difference is that the ioctl() interface is delimited by
 * NUL characters, while the read() interface is delimited by tabs and
 * newlines.
 *
 * mnp		node whose mnt_read (forread != 0) or mnt_ioctl (forread == 0)
 *		snapshot is to be refreshed.
 * forread	selects the snapshot and the text format as described above.
 * datamodel	only consulted for ioctl snapshots when _SYSCALL32_IMPL is
 *		defined: DATAMODEL_ILP32 makes the metadata use the
 *		extmnttab32 layout.
 *
 * Returns 0 on success, including the case where the existing snapshot is
 * still current, or ENOMEM if the text or metadata could not be mapped into
 * the caller's address space.  The callers in this file hold mnt_contents as
 * writer around this call.
 */
/* ARGSUSED */
static int
mntfs_snapshot(mntnode_t *mnp, int forread, int datamodel)
{
	size_t size;
	timespec_t lastmodt;
	mntdata_t *mntdata = MTOD(mnp);
	zone_t *zone = mntdata->mnt_zone;
	boolean_t global_view = (MTOD(mnp)->mnt_zone == global_zone);
	boolean_t showhidden = ((mnp->mnt_flags & MNT_SHOWHIDDEN) != 0);
	struct extmnttab *metadata_baseaddr;
	char *text_baseaddr;
	int i;
	mntsnap_t *snap;

	if (forread)
		snap = &mnp->mnt_read;
	else
		snap = &mnp->mnt_ioctl;

	vfs_list_read_lock();
	/*
	 * Check if the mnttab info has changed since the last snapshot
	 */
	vfs_mnttab_modtime(&lastmodt);
	if (snap->mnts_count &&
	    lastmodt.tv_sec == snap->mnts_time.tv_sec &&
	    lastmodt.tv_nsec == snap->mnts_time.tv_nsec) {
		vfs_list_unlock();
		return (0);
	}


	/*
	 * Discard the stale snapshot, then size the new one.  The length
	 * pass and the generate pass below both run under the vfs list read
	 * lock taken above.
	 */
	if (snap->mnts_count != 0)
		mntfs_freesnap(snap);
	if (global_view)
		size = mntfs_global_len(&snap->mnts_count, showhidden);
	else
		size = mntfs_zone_len(&snap->mnts_count, zone, showhidden);
	ASSERT(size != 0);

	/* Only ioctl snapshots carry per-entry metadata. */
	if (!forread)
		metadata_baseaddr = kmem_alloc(
		    snap->mnts_count * sizeof (struct extmnttab), KM_SLEEP);
	else
		metadata_baseaddr = NULL;

	text_baseaddr = kmem_alloc(size, KM_SLEEP);

	if (global_view)
		mntfs_global_generate(showhidden, metadata_baseaddr,
		    text_baseaddr, forread);
	else
		mntfs_zone_generate(zone, showhidden,
		    metadata_baseaddr, text_baseaddr, forread);

	vfs_mnttab_modtime(&snap->mnts_time);
	vfs_list_unlock();

	/*
	 * Copy the text into the user's address space.  text_baseaddr is
	 * freed here but its value is still used below, purely as the base
	 * for offset arithmetic; it is never dereferenced again.
	 */
	snap->mnts_text = mntfs_mapin(text_baseaddr, size);
	snap->mnts_textsize = size;
	kmem_free(text_baseaddr, size);

	/*
	 * The pointers in the metadata refer to addresses in the range
	 * [base_addr, base_addr + size].  Now that we have mapped the text into
	 * the user's address space, we have to convert these addresses into the
	 * new (user) range.  We also handle the conversion for 32-bit and
	 * 64-bit applications here.
	 */
	if (!forread) {
		struct extmnttab *tab;
#ifdef _SYSCALL32_IMPL
		struct extmnttab32 *tab32;

		if (datamodel == DATAMODEL_ILP32) {
			/*
			 * tab and tab32 alias the same buffer: each record is
			 * rewritten in place into the 32-bit layout, reading
			 * every field of tab[i] before storing the matching
			 * field of tab32[i].
			 * NOTE(review): this relies on extmnttab32 records
			 * being no larger than extmnttab records, so a store
			 * never lands on a field that has not been read yet
			 * -- confirm against the extmnttab definition.
			 */
			tab = (struct extmnttab *)metadata_baseaddr;
			tab32 = (struct extmnttab32 *)metadata_baseaddr;

			for (i = 0; i < snap->mnts_count; i++) {
				tab32[i].mnt_special =
				    (uintptr_t)snap->mnts_text +
				    (tab[i].mnt_special - text_baseaddr);
				tab32[i].mnt_mountp =
				    (uintptr_t)snap->mnts_text +
				    (tab[i].mnt_mountp - text_baseaddr);
				tab32[i].mnt_fstype =
				    (uintptr_t)snap->mnts_text +
				    (tab[i].mnt_fstype - text_baseaddr);
				tab32[i].mnt_mntopts =
				    (uintptr_t)snap->mnts_text +
				    (tab[i].mnt_mntopts - text_baseaddr);
				tab32[i].mnt_time = (uintptr_t)snap->mnts_text +
				    (tab[i].mnt_time - text_baseaddr);
				tab32[i].mnt_major = tab[i].mnt_major;
				tab32[i].mnt_minor = tab[i].mnt_minor;
			}

			snap->mnts_metasize =
			    snap->mnts_count * sizeof (struct extmnttab32);
			snap->mnts_metadata = mntfs_mapin(
			    (char *)metadata_baseaddr,
			    snap->mnts_metasize);

		} else {
#endif
			/* Native layout: rebase each pointer in place. */
			tab = (struct extmnttab *)metadata_baseaddr;
			for (i = 0; i < snap->mnts_count; i++) {
				tab[i].mnt_special = snap->mnts_text +
				    (tab[i].mnt_special - text_baseaddr);
				tab[i].mnt_mountp = snap->mnts_text +
				    (tab[i].mnt_mountp - text_baseaddr);
				tab[i].mnt_fstype = snap->mnts_text +
				    (tab[i].mnt_fstype - text_baseaddr);
				tab[i].mnt_mntopts = snap->mnts_text +
				    (tab[i].mnt_mntopts - text_baseaddr);
				tab[i].mnt_time = snap->mnts_text +
				    (tab[i].mnt_time - text_baseaddr);
			}

			snap->mnts_metasize =
			    snap->mnts_count * sizeof (struct extmnttab);
			snap->mnts_metadata = mntfs_mapin(
			    (char *)metadata_baseaddr, snap->mnts_metasize);
#ifdef _SYSCALL32_IMPL
		}
#endif

		/*
		 * The kernel buffer was allocated with the native record
		 * size, so it is freed with that size for either data model.
		 */
		kmem_free(metadata_baseaddr,
		    snap->mnts_count * sizeof (struct extmnttab));
	}

	/* Remembered so mntgetattr() can report a size before any read. */
	mntdata->mnt_size = size;

	/*
	 * A failed mapin leaves a NULL pointer behind; release whatever was
	 * mapped and report the failure.
	 */
	if (snap->mnts_text == NULL ||
	    (!forread && snap->mnts_metadata == NULL)) {
		mntfs_freesnap(snap);
		return (ENOMEM);
	}
	vfs_mnttab_readop();
	return (0);
}
638 
639 /*
640  * Public function to convert vfs_mntopts into a string.
641  * A buffer of sufficient size is allocated, which is returned via bufp,
642  * and whose length is returned via lenp.
643  */
644 void
645 mntfs_getmntopts(struct vfs *vfsp, char **bufp, size_t *lenp)
646 {
647 	size_t len;
648 	char *buf;
649 
650 	vfs_list_read_lock();
651 
652 	len = mntfs_optsize(vfsp) + 1;
653 	buf = kmem_alloc(len, KM_NOSLEEP);
654 	if (buf == NULL) {
655 		*bufp = NULL;
656 		vfs_list_unlock();
657 		return;
658 	}
659 	buf[len - 1] = '\0';
660 	(void) mntfs_optprint(vfsp, buf);
661 	ASSERT(buf[len - 1] == '\0');
662 
663 	vfs_list_unlock();
664 	*bufp = buf;
665 	*lenp = len;
666 }
667 
668 
669 /* ARGSUSED */
670 static int
671 mntopen(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
672 {
673 	vnode_t *vp = *vpp;
674 	mntnode_t *nmnp;
675 
676 	/*
677 	 * Not allowed to open for writing, return error.
678 	 */
679 	if (flag & FWRITE)
680 		return (EPERM);
681 	/*
682 	 * Create a new mnt/vnode for each open, this will give us a handle to
683 	 * hang the snapshot on.
684 	 */
685 	nmnp = mntgetnode(vp);
686 
687 	*vpp = MTOV(nmnp);
688 	atomic_add_32(&MTOD(nmnp)->mnt_nopen, 1);
689 	VN_RELE(vp);
690 	return (0);
691 }
692 
693 /* ARGSUSED */
694 static int
695 mntclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
696 	caller_context_t *ct)
697 {
698 	mntnode_t *mnp = VTOM(vp);
699 
700 	/* Clean up any locks or shares held by the current process */
701 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
702 	cleanshares(vp, ttoproc(curthread)->p_pid);
703 
704 	if (count > 1)
705 		return (0);
706 	if (vp->v_count == 1) {
707 		mntfs_freesnap(&mnp->mnt_read);
708 		mntfs_freesnap(&mnp->mnt_ioctl);
709 		atomic_add_32(&MTOD(mnp)->mnt_nopen, -1);
710 	}
711 	return (0);
712 }
713 
714 /* ARGSUSED */
715 static int
716 mntread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, caller_context_t *ct)
717 {
718 	int error = 0;
719 	off_t off = uio->uio_offset;
720 	size_t len = uio->uio_resid;
721 	mntnode_t *mnp = VTOM(vp);
722 	char *buf;
723 	mntsnap_t *snap;
724 	int datamodel;
725 
726 	rw_enter(&mnp->mnt_contents, RW_READER);
727 	snap = &mnp->mnt_read;
728 	if (off == (off_t)0 || snap->mnts_count == 0) {
729 		/*
730 		 * It is assumed that any kernel callers wishing
731 		 * to read mnttab will be using extmnttab entries
732 		 * and not extmnttab32 entries, whether or not
733 		 * the kernel is LP64 or ILP32.  Thus, force the
734 		 * datamodel that mntfs_snapshot uses to be
735 		 * DATAMODEL_LP64.
736 		 */
737 		if (uio->uio_segflg == UIO_SYSSPACE)
738 			datamodel = DATAMODEL_LP64;
739 		else
740 			datamodel = get_udatamodel();
741 		if (!rw_tryupgrade(&mnp->mnt_contents)) {
742 			rw_exit(&mnp->mnt_contents);
743 			rw_enter(&mnp->mnt_contents, RW_WRITER);
744 		}
745 		if ((error = mntfs_snapshot(mnp, 1, datamodel)) != 0) {
746 			rw_exit(&mnp->mnt_contents);
747 			return (error);
748 		}
749 		rw_downgrade(&mnp->mnt_contents);
750 	}
751 	if ((size_t)(off + len) > snap->mnts_textsize)
752 		len = snap->mnts_textsize - off;
753 
754 	if (off < 0 || len > snap->mnts_textsize) {
755 		rw_exit(&mnp->mnt_contents);
756 		return (EFAULT);
757 	}
758 
759 	if (len == 0) {
760 		rw_exit(&mnp->mnt_contents);
761 		return (0);
762 	}
763 
764 	/*
765 	 * The mnttab image is stored in the user's address space,
766 	 * so we have to copy it into the kernel from userland,
767 	 * then copy it back out to the specified address.
768 	 */
769 	buf = kmem_alloc(len, KM_SLEEP);
770 	if (copyin(snap->mnts_text + off, buf, len))
771 		error = EFAULT;
772 	else {
773 		error = uiomove(buf, len, UIO_READ, uio);
774 	}
775 	kmem_free(buf, len);
776 	vfs_mnttab_readop();
777 	rw_exit(&mnp->mnt_contents);
778 	return (error);
779 }
780 
781 
782 static int
783 mntgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
784 	caller_context_t *ct)
785 {
786 	mntnode_t *mnp = VTOM(vp);
787 	int error;
788 	vnode_t *rvp;
789 	extern timespec_t vfs_mnttab_ctime;
790 	mntdata_t *mntdata = MTOD(VTOM(vp));
791 	mntsnap_t *snap;
792 
793 	rw_enter(&mnp->mnt_contents, RW_READER);
794 	snap = mnp->mnt_read.mnts_count ? &mnp->mnt_read : &mnp->mnt_ioctl;
795 	/*
796 	 * Return all the attributes.  Should be refined
797 	 * so that it returns only those asked for.
798 	 * Most of this is complete fakery anyway.
799 	 */
800 	rvp = mnp->mnt_mountvp;
801 	/*
802 	 * Attributes are same as underlying file with modifications
803 	 */
804 	if (error = VOP_GETATTR(rvp, vap, flags, cr, ct))
805 		return (error);
806 
807 	/*
808 	 * We always look like a regular file
809 	 */
810 	vap->va_type = VREG;
811 	/*
812 	 * mode should basically be read only
813 	 */
814 	vap->va_mode &= 07444;
815 	vap->va_fsid = vp->v_vfsp->vfs_dev;
816 	vap->va_blksize = DEV_BSIZE;
817 	vap->va_rdev = 0;
818 	vap->va_seq = 0;
819 	/*
820 	 * Set nlink to the number of open vnodes for mnttab info
821 	 * plus one for existing.
822 	 */
823 	vap->va_nlink = mntdata->mnt_nopen + 1;
824 	/*
825 	 * If we haven't taken a snapshot yet, set the
826 	 * size to the size of the latest snapshot.
827 	 */
828 	vap->va_size = snap->mnts_textsize ? snap->mnts_textsize :
829 	    mntdata->mnt_size;
830 	rw_exit(&mnp->mnt_contents);
831 	/*
832 	 * Fetch mtime from the vfs mnttab timestamp
833 	 */
834 	vap->va_ctime = vfs_mnttab_ctime;
835 	vfs_list_read_lock();
836 	vfs_mnttab_modtime(&vap->va_mtime);
837 	vap->va_atime = vap->va_mtime;
838 	vfs_list_unlock();
839 	/*
840 	 * Nodeid is always ROOTINO;
841 	 */
842 	vap->va_nodeid = (ino64_t)MNTROOTINO;
843 	vap->va_nblocks = btod(vap->va_size);
844 	return (0);
845 }
846 
847 
848 static int
849 mntaccess(vnode_t *vp, int mode, int flags, cred_t *cr,
850 	caller_context_t *ct)
851 {
852 	mntnode_t *mnp = VTOM(vp);
853 
854 	if (mode & (VWRITE|VEXEC))
855 		return (EROFS);
856 
857 	/*
858 	 * Do access check on the underlying directory vnode.
859 	 */
860 	return (VOP_ACCESS(mnp->mnt_mountvp, mode, flags, cr, ct));
861 }
862 
863 
864 /*
865  * New /mntfs vnode required; allocate it and fill in most of the fields.
866  */
867 static mntnode_t *
868 mntgetnode(vnode_t *dp)
869 {
870 	mntnode_t *mnp;
871 	vnode_t *vp;
872 
873 	mnp = kmem_zalloc(sizeof (mntnode_t), KM_SLEEP);
874 	mnp->mnt_vnode = vn_alloc(KM_SLEEP);
875 	mnp->mnt_mountvp = VTOM(dp)->mnt_mountvp;
876 	rw_init(&mnp->mnt_contents, NULL, RW_DEFAULT, NULL);
877 	vp = MTOV(mnp);
878 	vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
879 	vn_setops(vp, mntvnodeops);
880 	vp->v_vfsp = dp->v_vfsp;
881 	vp->v_type = VREG;
882 	vp->v_data = (caddr_t)mnp;
883 
884 	return (mnp);
885 }
886 
887 /*
888  * Free the storage obtained from mntgetnode().
889  */
890 static void
891 mntfreenode(mntnode_t *mnp)
892 {
893 	vnode_t *vp = MTOV(mnp);
894 
895 	rw_destroy(&mnp->mnt_contents);
896 	vn_invalid(vp);
897 	vn_free(vp);
898 	kmem_free(mnp, sizeof (*mnp));
899 }
900 
901 
902 /* ARGSUSED */
903 static int
904 mntfsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
905 {
906 	return (0);
907 }
908 
909 /* ARGSUSED */
910 static void
911 mntinactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
912 {
913 	mntnode_t *mnp = VTOM(vp);
914 
915 	mntfreenode(mnp);
916 }
917 
918 /* ARGSUSED */
919 static int
920 mntseek(vnode_t *vp, offset_t ooff, offset_t *noffp,
921 	caller_context_t *ct)
922 {
923 	mntnode_t *mnp = VTOM(vp);
924 
925 	if (*noffp == 0) {
926 		rw_enter(&mnp->mnt_contents, RW_WRITER);
927 		VTOM(vp)->mnt_offset = 0;
928 		rw_exit(&mnp->mnt_contents);
929 	}
930 
931 	return (0);
932 }
933 
934 /*
935  * Return the answer requested to poll().
936  * POLLRDBAND will return when the mtime of the mnttab
937  * information is newer than the latest one read for this open.
938  */
939 /* ARGSUSED */
940 static int
941 mntpoll(vnode_t *vp, short ev, int any, short *revp, pollhead_t **phpp,
942 	caller_context_t *ct)
943 {
944 	mntnode_t *mnp = VTOM(vp);
945 	mntsnap_t *snap;
946 
947 	rw_enter(&mnp->mnt_contents, RW_READER);
948 	snap = &mnp->mnt_read;
949 	if (mnp->mnt_ioctl.mnts_time.tv_sec > snap->mnts_time.tv_sec ||
950 	    (mnp->mnt_ioctl.mnts_time.tv_sec == snap->mnts_time.tv_sec &&
951 	    mnp->mnt_ioctl.mnts_time.tv_nsec > snap->mnts_time.tv_nsec))
952 		snap = &mnp->mnt_ioctl;
953 
954 	*revp = 0;
955 	*phpp = (pollhead_t *)NULL;
956 	if (ev & POLLIN)
957 		*revp |= POLLIN;
958 
959 	if (ev & POLLRDNORM)
960 		*revp |= POLLRDNORM;
961 
962 	if (ev & POLLRDBAND) {
963 		vfs_mnttab_poll(&snap->mnts_time, phpp);
964 		if (*phpp == (pollhead_t *)NULL)
965 			*revp |= POLLRDBAND;
966 	}
967 	rw_exit(&mnp->mnt_contents);
968 
969 	if (*revp || *phpp != NULL || any) {
970 		return (0);
971 	}
972 	/*
973 	 * If someone is polling an unsupported poll events (e.g.
974 	 * POLLOUT, POLLPRI, etc.), just return POLLERR revents.
975 	 * That way we will ensure that we don't return a 0
976 	 * revents with a NULL pollhead pointer.
977 	 */
978 	*revp = POLLERR;
979 	return (0);
980 }
981 /* ARGSUSED */
982 static int
983 mntioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
984 	cred_t *cr, int *rvalp, caller_context_t *ct)
985 {
986 	uint_t *up = (uint_t *)arg;
987 	mntnode_t *mnp = VTOM(vp);
988 	mntsnap_t *snap;
989 	int error;
990 
991 	error = 0;
992 	rw_enter(&mnp->mnt_contents, RW_READER);
993 	snap = &mnp->mnt_ioctl;
994 	switch (cmd) {
995 
996 	case MNTIOC_NMNTS: {		/* get no. of mounted resources */
997 		if (snap->mnts_count == 0) {
998 			if (!rw_tryupgrade(&mnp->mnt_contents)) {
999 				rw_exit(&mnp->mnt_contents);
1000 				rw_enter(&mnp->mnt_contents, RW_WRITER);
1001 			}
1002 			if ((error =
1003 			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK))
1004 			    != 0) {
1005 				rw_exit(&mnp->mnt_contents);
1006 				return (error);
1007 			}
1008 			rw_downgrade(&mnp->mnt_contents);
1009 		}
1010 		if (suword32(up, snap->mnts_count) != 0)
1011 			error = EFAULT;
1012 		break;
1013 	}
1014 
1015 	case MNTIOC_GETDEVLIST: {	/* get mounted device major/minor nos */
1016 		uint_t *devlist;
1017 		int i;
1018 		size_t len;
1019 
1020 		if (snap->mnts_count == 0) {
1021 			if (!rw_tryupgrade(&mnp->mnt_contents)) {
1022 				rw_exit(&mnp->mnt_contents);
1023 				rw_enter(&mnp->mnt_contents, RW_WRITER);
1024 			}
1025 			if ((error =
1026 			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK))
1027 			    != 0) {
1028 				rw_exit(&mnp->mnt_contents);
1029 				return (error);
1030 			}
1031 			rw_downgrade(&mnp->mnt_contents);
1032 		}
1033 
1034 		len = 2 * snap->mnts_count * sizeof (uint_t);
1035 		devlist = kmem_alloc(len, KM_SLEEP);
1036 		for (i = 0; i < snap->mnts_count; i++) {
1037 
1038 #ifdef _SYSCALL32_IMPL
1039 			if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) {
1040 				struct extmnttab32 tab;
1041 
1042 				if ((error = xcopyin(snap->mnts_text +
1043 				    i * sizeof (struct extmnttab32), &tab,
1044 				    sizeof (tab))) != 0)
1045 					break;
1046 
1047 				devlist[i*2] = tab.mnt_major;
1048 				devlist[i*2+1] = tab.mnt_minor;
1049 			} else {
1050 #endif
1051 				struct extmnttab tab;
1052 
1053 				if ((error = xcopyin(snap->mnts_text +
1054 				    i * sizeof (struct extmnttab), &tab,
1055 				    sizeof (tab))) != 0)
1056 					break;
1057 
1058 				devlist[i*2] = tab.mnt_major;
1059 				devlist[i*2+1] = tab.mnt_minor;
1060 #ifdef _SYSCALL32_IMPL
1061 			}
1062 #endif
1063 		}
1064 
1065 		if (error == 0)
1066 			error = xcopyout(devlist, up, len);
1067 		kmem_free(devlist, len);
1068 		break;
1069 	}
1070 
1071 	case MNTIOC_SETTAG:		/* set tag on mounted file system */
1072 	case MNTIOC_CLRTAG:		/* clear tag on mounted file system */
1073 	{
1074 		struct mnttagdesc *dp = (struct mnttagdesc *)arg;
1075 		STRUCT_DECL(mnttagdesc, tagdesc);
1076 		char *cptr;
1077 		uint32_t major, minor;
1078 		char tagbuf[MAX_MNTOPT_TAG];
1079 		char *pbuf;
1080 		size_t len;
1081 		uint_t start = 0;
1082 		mntdata_t *mntdata = MTOD(mnp);
1083 		zone_t *zone = mntdata->mnt_zone;
1084 
1085 		STRUCT_INIT(tagdesc, flag & DATAMODEL_MASK);
1086 		if (copyin(dp, STRUCT_BUF(tagdesc), STRUCT_SIZE(tagdesc))) {
1087 			error = EFAULT;
1088 			break;
1089 		}
1090 		pbuf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1091 		if (zone != global_zone) {
1092 			(void) strcpy(pbuf, zone->zone_rootpath);
1093 			/* truncate "/" and nul */
1094 			start = zone->zone_rootpathlen - 2;
1095 			ASSERT(pbuf[start] == '/');
1096 		}
1097 		cptr = STRUCT_FGETP(tagdesc, mtd_mntpt);
1098 		error = copyinstr(cptr, pbuf + start, MAXPATHLEN - start, &len);
1099 		if (error) {
1100 			kmem_free(pbuf, MAXPATHLEN);
1101 			break;
1102 		}
1103 		if (start != 0 && pbuf[start] != '/') {
1104 			kmem_free(pbuf, MAXPATHLEN);
1105 			error = EINVAL;
1106 			break;
1107 		}
1108 		cptr = STRUCT_FGETP(tagdesc, mtd_tag);
1109 		if ((error = copyinstr(cptr, tagbuf, MAX_MNTOPT_TAG, &len))) {
1110 			kmem_free(pbuf, MAXPATHLEN);
1111 			break;
1112 		}
1113 		major = STRUCT_FGET(tagdesc, mtd_major);
1114 		minor = STRUCT_FGET(tagdesc, mtd_minor);
1115 		if (cmd == MNTIOC_SETTAG)
1116 			error = vfs_settag(major, minor, pbuf, tagbuf, cr);
1117 		else
1118 			error = vfs_clrtag(major, minor, pbuf, tagbuf, cr);
1119 		kmem_free(pbuf, MAXPATHLEN);
1120 		break;
1121 	}
1122 
1123 	case MNTIOC_SHOWHIDDEN:
1124 	{
1125 		mutex_enter(&vp->v_lock);
1126 		mnp->mnt_flags |= MNT_SHOWHIDDEN;
1127 		mutex_exit(&vp->v_lock);
1128 		break;
1129 	}
1130 
1131 	case MNTIOC_GETMNTENT:
1132 	{
1133 		size_t idx;
1134 		uintptr_t addr;
1135 
1136 		if (!rw_tryupgrade(&mnp->mnt_contents)) {
1137 			rw_exit(&mnp->mnt_contents);
1138 			rw_enter(&mnp->mnt_contents, RW_WRITER);
1139 		}
1140 		idx = mnp->mnt_offset;
1141 		if (snap->mnts_count == 0 || idx == 0) {
1142 			if ((error =
1143 			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK))
1144 			    != 0) {
1145 				rw_exit(&mnp->mnt_contents);
1146 				return (error);
1147 			}
1148 		}
1149 		/*
1150 		 * If the next index is beyond the end of the current mnttab,
1151 		 * return EOF
1152 		 */
1153 		if (idx >= snap->mnts_count) {
1154 			*rvalp = 1;
1155 			rw_exit(&mnp->mnt_contents);
1156 			return (0);
1157 		}
1158 
1159 #ifdef _SYSCALL32_IMPL
1160 		if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) {
1161 			addr = (uintptr_t)(snap->mnts_metadata + idx *
1162 			    sizeof (struct extmnttab32));
1163 			error = suword32((void *)arg, addr);
1164 		} else {
1165 #endif
1166 			addr = (uintptr_t)(snap->mnts_metadata + idx *
1167 			    sizeof (struct extmnttab));
1168 			error = sulword((void *)arg, addr);
1169 #ifdef _SYSCALL32_IMPL
1170 		}
1171 #endif
1172 
1173 		if (error != 0) {
1174 			rw_exit(&mnp->mnt_contents);
1175 			return (error);
1176 		}
1177 
1178 		mnp->mnt_offset++;
1179 		break;
1180 	}
1181 
1182 	default:
1183 		error = EINVAL;
1184 		break;
1185 	}
1186 
1187 	rw_exit(&mnp->mnt_contents);
1188 	return (error);
1189 }
1190 
/*
 * /mntfs vnode operations vector
 *
 * Each entry pairs an operation name with the routine in this file that
 * implements it.  Dispose and shrlock are routed to fs_error.  The table is
 * terminated by a NULL name.
 */
const fs_operation_def_t mnt_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = mntopen },
	VOPNAME_CLOSE,		{ .vop_close = mntclose },
	VOPNAME_READ,		{ .vop_read = mntread },
	VOPNAME_IOCTL,		{ .vop_ioctl = mntioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = mntgetattr },
	VOPNAME_ACCESS,		{ .vop_access = mntaccess },
	VOPNAME_FSYNC,		{ .vop_fsync = mntfsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = mntinactive },
	VOPNAME_SEEK,		{ .vop_seek = mntseek },
	VOPNAME_POLL,		{ .vop_poll = mntpoll },
	VOPNAME_DISPOSE,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	NULL,			NULL
};
1209