xref: /illumos-gate/usr/src/uts/common/fs/mntfs/mntvnops.c (revision bea83d026ee1bd1b2a2419e1d0232f107a5d7d9b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/file.h>
29 #include <sys/stat.h>
30 #include <sys/atomic.h>
31 #include <sys/mntio.h>
32 #include <sys/mnttab.h>
33 #include <sys/mount.h>
34 #include <sys/sunddi.h>
35 #include <sys/sysmacros.h>
36 #include <sys/systm.h>
37 #include <sys/vfs.h>
38 #include <sys/vfs_opreg.h>
39 #include <sys/fs/mntdata.h>
40 #include <fs/fs_subr.h>
41 #include <sys/vmsystm.h>
42 #include <vm/seg_vn.h>
43 
44 #define	MNTROOTINO	2
45 
46 static mntnode_t *mntgetnode(vnode_t *);
47 
48 vnodeops_t *mntvnodeops;
49 extern void vfs_mnttab_readop(void);
50 
51 /*
52  * Design of kernel mnttab accounting.
53  *
54  * To support whitespace in mount names, we implement an ioctl
55  * (MNTIOC_GETMNTENT) which allows a programmatic interface to the data in
56  * /etc/mnttab.  The libc functions getmntent() and getextmntent() are built
57  * atop this interface.
58  *
59  * To minimize the amount of memory used in the kernel, we keep all the
60  * necessary information in the user's address space.  Large server
61  * configurations can have /etc/mnttab files in excess of 64k.
62  *
63  * To support both vanilla read() calls as well as ioctl() calls, we have two
64  * different snapshots of the kernel data structures, mnt_read and mnt_ioctl.
65  * These snapshots include the base location in user memory, the number of
66  * mounts in the snapshot, and any metadata associated with it.  The metadata is
67  * used only to support the ioctl() interface, and is a series of extmnttab
68  * structures.  When the user issues an ioctl(), we simply copyout a pointer to
69  * that structure, and the rest is handled in userland.
70  */
71 
/*
 * NOTE: mntfs_enabledev controls whether a "dev=xxx" entry is generated in
 * the option string of each mounted file system.  It is retained purely for
 * backwards compatibility and would ideally be removed altogether.  The value
 * printed is a 32-bit device number: when 64-bit device numbers are in use
 * and either the major or the minor part does not fit in a 16 bit quantity,
 * "dev=" is set to NODEV (0x7fffffff).  cmpldev(), which produces the 32-bit
 * device number, performs that check and assigns the proper value.  See
 * PSARC 1999/566 and 1999/131 for details.
 */
int mntfs_enabledev = 1;	/* non-zero: emit the old "dev=xxx" option */
84 
85 static int
86 mntfs_devsize(struct vfs *vfsp)
87 {
88 	dev32_t odev;
89 
90 	(void) cmpldev(&odev, vfsp->vfs_dev);
91 	return (snprintf(NULL, 0, "dev=%x", odev));
92 }
93 
94 static int
95 mntfs_devprint(struct vfs *vfsp, char *buf)
96 {
97 	dev32_t odev;
98 
99 	(void) cmpldev(&odev, vfsp->vfs_dev);
100 	return (snprintf(buf, MAX_MNTOPT_STR, "dev=%x", odev));
101 }
102 
103 static int
104 mntfs_optsize(struct vfs *vfsp)
105 {
106 	int i, size = 0;
107 	mntopt_t *mop;
108 
109 	for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
110 		mop = &vfsp->vfs_mntopts.mo_list[i];
111 		if (mop->mo_flags & MO_NODISPLAY)
112 			continue;
113 		if (mop->mo_flags & MO_SET) {
114 			if (size)
115 				size++; /* space for comma */
116 			size += strlen(mop->mo_name);
117 			/*
118 			 * count option value if there is one
119 			 */
120 			if (mop->mo_arg != NULL) {
121 				size += strlen(mop->mo_arg) + 1;
122 			}
123 		}
124 	}
125 	if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
126 		/*
127 		 * Add space for "zone=<zone_name>" if required.
128 		 */
129 		if (size)
130 			size++;	/* space for comma */
131 		size += sizeof ("zone=") - 1;
132 		size += strlen(vfsp->vfs_zone->zone_name);
133 	}
134 	if (mntfs_enabledev) {
135 		if (size != 0)
136 			size++; /* space for comma */
137 		size += mntfs_devsize(vfsp);
138 	}
139 	if (size == 0)
140 		size = strlen("-");
141 	return (size);
142 }
143 
144 static int
145 mntfs_optprint(struct vfs *vfsp, char *buf)
146 {
147 	int i, optinbuf = 0;
148 	mntopt_t *mop;
149 	char *origbuf = buf;
150 
151 	for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
152 		mop = &vfsp->vfs_mntopts.mo_list[i];
153 		if (mop->mo_flags & MO_NODISPLAY)
154 			continue;
155 		if (mop->mo_flags & MO_SET) {
156 			if (optinbuf)
157 				*buf++ = ',';
158 			else
159 				optinbuf = 1;
160 			buf += snprintf(buf, MAX_MNTOPT_STR,
161 				"%s", mop->mo_name);
162 			/*
163 			 * print option value if there is one
164 			 */
165 			if (mop->mo_arg != NULL) {
166 				buf += snprintf(buf, MAX_MNTOPT_STR, "=%s",
167 					mop->mo_arg);
168 			}
169 		}
170 	}
171 	if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
172 		if (optinbuf)
173 			*buf++ = ',';
174 		else
175 			optinbuf = 1;
176 		buf += snprintf(buf, MAX_MNTOPT_STR, "zone=%s",
177 		    vfsp->vfs_zone->zone_name);
178 	}
179 	if (mntfs_enabledev) {
180 		if (optinbuf++)
181 			*buf++ = ',';
182 		buf += mntfs_devprint(vfsp, buf);
183 	}
184 	if (!optinbuf) {
185 		buf += snprintf(buf, MAX_MNTOPT_STR, "-");
186 	}
187 	return (buf - origbuf);
188 }
189 
190 static size_t
191 mntfs_vfs_len(vfs_t *vfsp, zone_t *zone)
192 {
193 	size_t size = 0;
194 	const char *resource, *mntpt;
195 
196 	mntpt = refstr_value(vfsp->vfs_mntpt);
197 	if (mntpt != NULL && mntpt[0] != '\0') {
198 		size += strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
199 	} else {
200 		size += strlen("-") + 1;
201 	}
202 
203 	resource = refstr_value(vfsp->vfs_resource);
204 	if (resource != NULL && resource[0] != '\0') {
205 		if (resource[0] != '/') {
206 			size += strlen(resource) + 1;
207 		} else if (!ZONE_PATH_VISIBLE(resource, zone)) {
208 			/*
209 			 * Same as the zone's view of the mount point.
210 			 */
211 			size += strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
212 		} else {
213 			size += strlen(ZONE_PATH_TRANSLATE(resource, zone)) + 1;
214 		}
215 	} else {
216 		size += strlen("-") + 1;
217 	}
218 	size += strlen(vfssw[vfsp->vfs_fstype].vsw_name) + 1;
219 	size += mntfs_optsize(vfsp);
220 	size += snprintf(NULL, 0, "\t%ld\n", vfsp->vfs_mtime);
221 	return (size);
222 }
223 
224 static void
225 mntfs_zonerootvfs(zone_t *zone, vfs_t *rootvfsp)
226 {
227 	/*
228 	 * Basically copy over the real vfs_t on which the root vnode is
229 	 * located, changing its mountpoint and resource to match those of
230 	 * the zone's rootpath.
231 	 */
232 	*rootvfsp = *zone->zone_rootvp->v_vfsp;
233 	rootvfsp->vfs_mntpt = refstr_alloc(zone->zone_rootpath);
234 	rootvfsp->vfs_resource = rootvfsp->vfs_mntpt;
235 }
236 
237 static size_t
238 mntfs_zone_len(uint_t *nent_ptr, zone_t *zone, int showhidden)
239 {
240 	struct vfs *zonelist;
241 	struct vfs *vfsp;
242 	size_t size = 0;
243 	uint_t cnt = 0;
244 
245 	ASSERT(zone->zone_rootpath != NULL);
246 
247 	/*
248 	 * If the zone has a root entry, it will be the first in the list.  If
249 	 * it doesn't, we conjure one up.
250 	 */
251 	vfsp = zonelist = zone->zone_vfslist;
252 	if (zonelist == NULL ||
253 	    strcmp(refstr_value(vfsp->vfs_mntpt), zone->zone_rootpath) != 0) {
254 		vfs_t tvfs;
255 		/*
256 		 * The root of the zone is not a mount point.  The vfs we want
257 		 * to report is that of the zone's root vnode.
258 		 */
259 		ASSERT(zone != global_zone);
260 		mntfs_zonerootvfs(zone, &tvfs);
261 		size += mntfs_vfs_len(&tvfs, zone);
262 		refstr_rele(tvfs.vfs_mntpt);
263 		cnt++;
264 	}
265 	if (zonelist == NULL)
266 		goto out;
267 	do {
268 		/*
269 		 * Skip mounts that should not show up in mnttab
270 		 */
271 		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
272 			vfsp = vfsp->vfs_zone_next;
273 			continue;
274 		}
275 		cnt++;
276 		size += mntfs_vfs_len(vfsp, zone);
277 		vfsp = vfsp->vfs_zone_next;
278 	} while (vfsp != zonelist);
279 out:
280 	*nent_ptr = cnt;
281 	return (size);
282 }
283 
284 static size_t
285 mntfs_global_len(uint_t *nent_ptr, int showhidden)
286 {
287 	struct vfs *vfsp;
288 	size_t size = 0;
289 	uint_t cnt = 0;
290 
291 	vfsp = rootvfs;
292 	do {
293 		/*
294 		 * Skip mounts that should not show up in mnttab
295 		 */
296 		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
297 			vfsp = vfsp->vfs_next;
298 			continue;
299 		}
300 		cnt++;
301 		size += mntfs_vfs_len(vfsp, global_zone);
302 		vfsp = vfsp->vfs_next;
303 	} while (vfsp != rootvfs);
304 	*nent_ptr = cnt;
305 	return (size);
306 }
307 
308 static void
309 mntfs_vfs_generate(vfs_t *vfsp, zone_t *zone, struct extmnttab *tab,
310     char **basep, int forread)
311 {
312 	const char *resource, *mntpt;
313 	char *cp = *basep;
314 
315 	mntpt = refstr_value(vfsp->vfs_mntpt);
316 	resource = refstr_value(vfsp->vfs_resource);
317 
318 	if (tab)
319 		tab->mnt_special = cp;
320 	if (resource != NULL && resource[0] != '\0') {
321 		if (resource[0] != '/') {
322 			cp += snprintf(cp, MAXPATHLEN, "%s", resource);
323 		} else if (!ZONE_PATH_VISIBLE(resource, zone)) {
324 			/*
325 			 * Use the mount point as the resource.
326 			 */
327 			cp += snprintf(cp, MAXPATHLEN, "%s",
328 			    ZONE_PATH_TRANSLATE(mntpt, zone));
329 		} else {
330 			cp += snprintf(cp, MAXPATHLEN, "%s",
331 			    ZONE_PATH_TRANSLATE(resource, zone));
332 		}
333 	} else {
334 		cp += snprintf(cp, MAXPATHLEN, "-");
335 	}
336 	*cp++ = forread ? '\t' : '\0';
337 
338 	if (tab)
339 		tab->mnt_mountp = cp;
340 	if (mntpt != NULL && mntpt[0] != '\0') {
341 		/*
342 		 * We know the mount point is visible from within the zone,
343 		 * otherwise it wouldn't be on the zone's vfs list.
344 		 */
345 		cp += snprintf(cp, MAXPATHLEN, "%s",
346 		    ZONE_PATH_TRANSLATE(mntpt, zone));
347 	} else {
348 		cp += snprintf(cp, MAXPATHLEN, "-");
349 	}
350 	*cp++ = forread ? '\t' : '\0';
351 
352 	if (tab)
353 		tab->mnt_fstype = cp;
354 	cp += snprintf(cp, MAXPATHLEN, "%s",
355 	    vfssw[vfsp->vfs_fstype].vsw_name);
356 	*cp++ = forread ? '\t' : '\0';
357 
358 	if (tab)
359 		tab->mnt_mntopts = cp;
360 	cp += mntfs_optprint(vfsp, cp);
361 	*cp++ = forread ? '\t' : '\0';
362 
363 	if (tab)
364 		tab->mnt_time = cp;
365 	cp += snprintf(cp, MAX_MNTOPT_STR, "%ld", vfsp->vfs_mtime);
366 	*cp++ = forread ? '\n' : '\0';
367 
368 	if (tab) {
369 		tab->mnt_major = getmajor(vfsp->vfs_dev);
370 		tab->mnt_minor = getminor(vfsp->vfs_dev);
371 	}
372 
373 	*basep = cp;
374 }
375 
376 static void
377 mntfs_zone_generate(zone_t *zone, int showhidden, struct extmnttab *tab,
378     char *basep, int forread)
379 {
380 	vfs_t *zonelist;
381 	vfs_t *vfsp;
382 	char *cp = basep;
383 
384 	/*
385 	 * If the zone has a root entry, it will be the first in the list.  If
386 	 * it doesn't, we conjure one up.
387 	 */
388 	vfsp = zonelist = zone->zone_vfslist;
389 	if (zonelist == NULL ||
390 	    strcmp(refstr_value(vfsp->vfs_mntpt), zone->zone_rootpath) != 0) {
391 		vfs_t tvfs;
392 		/*
393 		 * The root of the zone is not a mount point.  The vfs we want
394 		 * to report is that of the zone's root vnode.
395 		 */
396 		ASSERT(zone != global_zone);
397 		mntfs_zonerootvfs(zone, &tvfs);
398 		mntfs_vfs_generate(&tvfs, zone, tab, &cp, forread);
399 		refstr_rele(tvfs.vfs_mntpt);
400 		if (tab)
401 			tab++;
402 	}
403 	if (zonelist == NULL)
404 		return;
405 	do {
406 		/*
407 		 * Skip mounts that should not show up in mnttab
408 		 */
409 		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
410 			vfsp = vfsp->vfs_zone_next;
411 			continue;
412 		}
413 		mntfs_vfs_generate(vfsp, zone, tab, &cp, forread);
414 		if (tab)
415 			tab++;
416 		vfsp = vfsp->vfs_zone_next;
417 	} while (vfsp != zonelist);
418 }
419 
420 static void
421 mntfs_global_generate(int showhidden, struct extmnttab *tab, char *basep,
422     int forread)
423 {
424 	vfs_t *vfsp;
425 	char *cp = basep;
426 
427 	vfsp = rootvfs;
428 	do {
429 		/*
430 		 * Skip mounts that should not show up in mnttab
431 		 */
432 		if (!showhidden && vfsp->vfs_flag & VFS_NOMNTTAB) {
433 			vfsp = vfsp->vfs_next;
434 			continue;
435 		}
436 		mntfs_vfs_generate(vfsp, global_zone, tab, &cp, forread);
437 		if (tab)
438 			tab++;
439 		vfsp = vfsp->vfs_next;
440 	} while (vfsp != rootvfs);
441 }
442 
443 static char *
444 mntfs_mapin(char *base, size_t size)
445 {
446 	size_t rlen = roundup(size, PAGESIZE);
447 	struct as *as = curproc->p_as;
448 	char *addr;
449 
450 	as_rangelock(as);
451 	map_addr(&addr, rlen, 0, 1, 0);
452 	if (addr == NULL || as_map(as, addr, rlen, segvn_create, zfod_argsp)) {
453 		as_rangeunlock(as);
454 		return (NULL);
455 	}
456 	as_rangeunlock(as);
457 	if (copyout(base, addr, size)) {
458 		(void) as_unmap(as, addr, rlen);
459 		return (NULL);
460 	}
461 	return (addr);
462 }
463 
464 static void
465 mntfs_freesnap(mntsnap_t *snap)
466 {
467 	if (snap->mnts_text != NULL)
468 		(void) as_unmap(curproc->p_as, snap->mnts_text,
469 			roundup(snap->mnts_textsize, PAGESIZE));
470 	snap->mnts_textsize = snap->mnts_count = 0;
471 	if (snap->mnts_metadata != NULL)
472 		(void) as_unmap(curproc->p_as, snap->mnts_metadata,
473 			roundup(snap->mnts_metasize, PAGESIZE));
474 	snap->mnts_metasize = 0;
475 }
476 
#ifdef _SYSCALL32_IMPL

/*
 * Form of an extmnttab entry handed to ILP32 callers.  The string fields
 * are user addresses held in 32 bits; mntfs_snapshot() fills them in when
 * it converts the metadata for a 32-bit caller.
 */
typedef struct extmnttab32 {
	uint32_t	mnt_special;	/* address of the resource string */
	uint32_t	mnt_mountp;	/* address of the mount point string */
	uint32_t	mnt_fstype;	/* address of the fs type string */
	uint32_t	mnt_mntopts;	/* address of the option string */
	uint32_t	mnt_time;	/* address of the mount time string */
	uint_t		mnt_major;	/* major number of the device */
	uint_t		mnt_minor;	/* minor number of the device */
} extmnttab32_t;

#endif	/* _SYSCALL32_IMPL */
490 
491 /*
492  * Snapshot the latest version of the kernel mounted resource information
493  *
494  * There are two types of snapshots: one destined for reading, and one destined
495  * for ioctl().  The difference is that the ioctl() interface is delimited by
496  * NULLs, while the read() interface is delimited by tabs and newlines.
497  */
498 /* ARGSUSED */
499 static int
500 mntfs_snapshot(mntnode_t *mnp, int forread, int datamodel)
501 {
502 	size_t size;
503 	timespec_t lastmodt;
504 	mntdata_t *mntdata = MTOD(mnp);
505 	zone_t *zone = mntdata->mnt_zone;
506 	boolean_t global_view = (MTOD(mnp)->mnt_zone == global_zone);
507 	boolean_t showhidden = ((mnp->mnt_flags & MNT_SHOWHIDDEN) != 0);
508 	struct extmnttab *metadata_baseaddr;
509 	char *text_baseaddr;
510 	int i;
511 	mntsnap_t *snap;
512 
513 	if (forread)
514 		snap = &mnp->mnt_read;
515 	else
516 		snap = &mnp->mnt_ioctl;
517 
518 	vfs_list_read_lock();
519 	/*
520 	 * Check if the mnttab info has changed since the last snapshot
521 	 */
522 	vfs_mnttab_modtime(&lastmodt);
523 	if (snap->mnts_count &&
524 	    lastmodt.tv_sec == snap->mnts_time.tv_sec &&
525 	    lastmodt.tv_nsec == snap->mnts_time.tv_nsec) {
526 		vfs_list_unlock();
527 		return (0);
528 	}
529 
530 
531 	if (snap->mnts_count != 0)
532 		mntfs_freesnap(snap);
533 	if (global_view)
534 		size = mntfs_global_len(&snap->mnts_count, showhidden);
535 	else
536 		size = mntfs_zone_len(&snap->mnts_count, zone, showhidden);
537 	ASSERT(size != 0);
538 
539 	if (!forread)
540 		metadata_baseaddr = kmem_alloc(
541 		    snap->mnts_count * sizeof (struct extmnttab), KM_SLEEP);
542 	else
543 		metadata_baseaddr = NULL;
544 
545 	text_baseaddr = kmem_alloc(size, KM_SLEEP);
546 
547 	if (global_view)
548 		mntfs_global_generate(showhidden, metadata_baseaddr,
549 		    text_baseaddr, forread);
550 	else
551 		mntfs_zone_generate(zone, showhidden,
552 		    metadata_baseaddr, text_baseaddr, forread);
553 
554 	vfs_mnttab_modtime(&snap->mnts_time);
555 	vfs_list_unlock();
556 
557 	snap->mnts_text = mntfs_mapin(text_baseaddr, size);
558 	snap->mnts_textsize = size;
559 	kmem_free(text_baseaddr, size);
560 
561 	/*
562 	 * The pointers in the metadata refer to addreesses in the range
563 	 * [base_addr, base_addr + size].  Now that we have mapped the text into
564 	 * the user's address space, we have to convert these addresses into the
565 	 * new (user) range.  We also handle the conversion for 32-bit and
566 	 * 32-bit applications here.
567 	 */
568 	if (!forread) {
569 		struct extmnttab *tab;
570 #ifdef _SYSCALL32_IMPL
571 		struct extmnttab32 *tab32;
572 
573 		if (datamodel == DATAMODEL_ILP32) {
574 			tab = (struct extmnttab *)metadata_baseaddr;
575 			tab32 = (struct extmnttab32 *)metadata_baseaddr;
576 
577 			for (i = 0; i < snap->mnts_count; i++) {
578 				tab32[i].mnt_special =
579 				    (uintptr_t)snap->mnts_text +
580 				    (tab[i].mnt_special - text_baseaddr);
581 				tab32[i].mnt_mountp =
582 				    (uintptr_t)snap->mnts_text +
583 				    (tab[i].mnt_mountp - text_baseaddr);
584 				tab32[i].mnt_fstype =
585 				    (uintptr_t)snap->mnts_text +
586 				    (tab[i].mnt_fstype - text_baseaddr);
587 				tab32[i].mnt_mntopts =
588 				    (uintptr_t)snap->mnts_text +
589 				    (tab[i].mnt_mntopts - text_baseaddr);
590 				tab32[i].mnt_time = (uintptr_t)snap->mnts_text +
591 				    (tab[i].mnt_time - text_baseaddr);
592 				tab32[i].mnt_major = tab[i].mnt_major;
593 				tab32[i].mnt_minor = tab[i].mnt_minor;
594 			}
595 
596 			snap->mnts_metasize =
597 			    snap->mnts_count * sizeof (struct extmnttab32);
598 			snap->mnts_metadata = mntfs_mapin(
599 			    (char *)metadata_baseaddr,
600 			    snap->mnts_metasize);
601 
602 		} else {
603 #endif
604 			tab = (struct extmnttab *)metadata_baseaddr;
605 			for (i = 0; i < snap->mnts_count; i++) {
606 				tab[i].mnt_special = snap->mnts_text +
607 				    (tab[i].mnt_special - text_baseaddr);
608 				tab[i].mnt_mountp = snap->mnts_text +
609 				    (tab[i].mnt_mountp - text_baseaddr);
610 				tab[i].mnt_fstype = snap->mnts_text +
611 				    (tab[i].mnt_fstype - text_baseaddr);
612 				tab[i].mnt_mntopts = snap->mnts_text +
613 				    (tab[i].mnt_mntopts - text_baseaddr);
614 				tab[i].mnt_time = snap->mnts_text +
615 				    (tab[i].mnt_time - text_baseaddr);
616 			}
617 
618 			snap->mnts_metasize =
619 			    snap->mnts_count * sizeof (struct extmnttab);
620 			snap->mnts_metadata = mntfs_mapin(
621 			    (char *)metadata_baseaddr, snap->mnts_metasize);
622 #ifdef _SYSCALL32_IMPL
623 		}
624 #endif
625 
626 		kmem_free(metadata_baseaddr,
627 		    snap->mnts_count * sizeof (struct extmnttab));
628 	}
629 
630 	mntdata->mnt_size = size;
631 
632 	if (snap->mnts_text == NULL ||
633 	    (!forread && snap->mnts_metadata == NULL)) {
634 		mntfs_freesnap(snap);
635 		return (ENOMEM);
636 	}
637 	vfs_mnttab_readop();
638 	return (0);
639 }
640 
641 /*
642  * Public function to convert vfs_mntopts into a string.
643  * A buffer of sufficient size is allocated, which is returned via bufp,
644  * and whose length is returned via lenp.
645  */
646 void
647 mntfs_getmntopts(struct vfs *vfsp, char **bufp, size_t *lenp)
648 {
649 	size_t len;
650 	char *buf;
651 
652 	vfs_list_read_lock();
653 
654 	len = mntfs_optsize(vfsp) + 1;
655 	buf = kmem_alloc(len, KM_NOSLEEP);
656 	if (buf == NULL) {
657 		*bufp = NULL;
658 		vfs_list_unlock();
659 		return;
660 	}
661 	buf[len - 1] = '\0';
662 	(void) mntfs_optprint(vfsp, buf);
663 	ASSERT(buf[len - 1] == '\0');
664 
665 	vfs_list_unlock();
666 	*bufp = buf;
667 	*lenp = len;
668 }
669 
670 
671 /* ARGSUSED */
672 static int
673 mntopen(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
674 {
675 	vnode_t *vp = *vpp;
676 	mntnode_t *nmnp;
677 
678 	/*
679 	 * Not allowed to open for writing, return error.
680 	 */
681 	if (flag & FWRITE)
682 		return (EPERM);
683 	/*
684 	 * Create a new mnt/vnode for each open, this will give us a handle to
685 	 * hang the snapshot on.
686 	 */
687 	nmnp = mntgetnode(vp);
688 
689 	*vpp = MTOV(nmnp);
690 	atomic_add_32(&MTOD(nmnp)->mnt_nopen, 1);
691 	VN_RELE(vp);
692 	return (0);
693 }
694 
695 /* ARGSUSED */
696 static int
697 mntclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
698 	caller_context_t *ct)
699 {
700 	mntnode_t *mnp = VTOM(vp);
701 
702 	/* Clean up any locks or shares held by the current process */
703 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
704 	cleanshares(vp, ttoproc(curthread)->p_pid);
705 
706 	if (count > 1)
707 		return (0);
708 	if (vp->v_count == 1) {
709 		mntfs_freesnap(&mnp->mnt_read);
710 		mntfs_freesnap(&mnp->mnt_ioctl);
711 		atomic_add_32(&MTOD(mnp)->mnt_nopen, -1);
712 	}
713 	return (0);
714 }
715 
716 /* ARGSUSED */
717 static int
718 mntread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, caller_context_t *ct)
719 {
720 	int error = 0;
721 	off_t off = uio->uio_offset;
722 	size_t len = uio->uio_resid;
723 	mntnode_t *mnp = VTOM(vp);
724 	char *buf;
725 	mntsnap_t *snap = &mnp->mnt_read;
726 	int datamodel;
727 
728 	if (off == (off_t)0 || snap->mnts_count == 0) {
729 		/*
730 		 * It is assumed that any kernel callers wishing
731 		 * to read mnttab will be using extmnttab entries
732 		 * and not extmnttab32 entries, whether or not
733 		 * the kernel is LP64 or ILP32.  Thus, force the
734 		 * datamodel that mntfs_snapshot uses to be
735 		 * DATAMODEL_LP64.
736 		 */
737 		if (uio->uio_segflg == UIO_SYSSPACE)
738 			datamodel = DATAMODEL_LP64;
739 		else
740 			datamodel = get_udatamodel();
741 		if ((error = mntfs_snapshot(mnp, 1, datamodel)) != 0)
742 			return (error);
743 	}
744 	if ((size_t)(off + len) > snap->mnts_textsize)
745 		len = snap->mnts_textsize - off;
746 
747 	if (off < 0 || len > snap->mnts_textsize)
748 		return (EFAULT);
749 
750 	if (len == 0)
751 		return (0);
752 
753 	/*
754 	 * The mnttab image is stored in the user's address space,
755 	 * so we have to copy it into the kernel from userland,
756 	 * then copy it back out to the specified address.
757 	 */
758 	buf = kmem_alloc(len, KM_SLEEP);
759 	if (copyin(snap->mnts_text + off, buf, len))
760 		error = EFAULT;
761 	else {
762 		error = uiomove(buf, len, UIO_READ, uio);
763 	}
764 	kmem_free(buf, len);
765 	vfs_mnttab_readop();
766 	return (error);
767 }
768 
769 
770 static int
771 mntgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
772 	caller_context_t *ct)
773 {
774 	mntnode_t *mnp = VTOM(vp);
775 	int error;
776 	vnode_t *rvp;
777 	extern timespec_t vfs_mnttab_ctime;
778 	mntdata_t *mntdata = MTOD(VTOM(vp));
779 	mntsnap_t *snap = mnp->mnt_read.mnts_count ?
780 	    &mnp->mnt_read : &mnp->mnt_ioctl;
781 
782 	/*
783 	 * Return all the attributes.  Should be refined
784 	 * so that it returns only those asked for.
785 	 * Most of this is complete fakery anyway.
786 	 */
787 	rvp = mnp->mnt_mountvp;
788 	/*
789 	 * Attributes are same as underlying file with modifications
790 	 */
791 	if (error = VOP_GETATTR(rvp, vap, flags, cr, ct))
792 		return (error);
793 
794 	/*
795 	 * We always look like a regular file
796 	 */
797 	vap->va_type = VREG;
798 	/*
799 	 * mode should basically be read only
800 	 */
801 	vap->va_mode &= 07444;
802 	vap->va_fsid = vp->v_vfsp->vfs_dev;
803 	vap->va_blksize = DEV_BSIZE;
804 	vap->va_rdev = 0;
805 	vap->va_seq = 0;
806 	/*
807 	 * Set nlink to the number of open vnodes for mnttab info
808 	 * plus one for existing.
809 	 */
810 	vap->va_nlink = mntdata->mnt_nopen + 1;
811 	/*
812 	 * If we haven't taken a snapshot yet, set the
813 	 * size to the size of the latest snapshot.
814 	 */
815 	vap->va_size = snap->mnts_textsize ? snap->mnts_textsize :
816 	    mntdata->mnt_size;
817 	/*
818 	 * Fetch mtime from the vfs mnttab timestamp
819 	 */
820 	vap->va_ctime = vfs_mnttab_ctime;
821 	vfs_list_read_lock();
822 	vfs_mnttab_modtime(&vap->va_mtime);
823 	vap->va_atime = vap->va_mtime;
824 	vfs_list_unlock();
825 	/*
826 	 * Nodeid is always ROOTINO;
827 	 */
828 	vap->va_nodeid = (ino64_t)MNTROOTINO;
829 	vap->va_nblocks = btod(vap->va_size);
830 	return (0);
831 }
832 
833 
834 static int
835 mntaccess(vnode_t *vp, int mode, int flags, cred_t *cr,
836 	caller_context_t *ct)
837 {
838 	mntnode_t *mnp = VTOM(vp);
839 
840 	if (mode & (VWRITE|VEXEC))
841 		return (EROFS);
842 
843 	/*
844 	 * Do access check on the underlying directory vnode.
845 	 */
846 	return (VOP_ACCESS(mnp->mnt_mountvp, mode, flags, cr, ct));
847 }
848 
849 
850 /*
851  * New /mntfs vnode required; allocate it and fill in most of the fields.
852  */
853 static mntnode_t *
854 mntgetnode(vnode_t *dp)
855 {
856 	mntnode_t *mnp;
857 	vnode_t *vp;
858 
859 	mnp = kmem_zalloc(sizeof (mntnode_t), KM_SLEEP);
860 	mnp->mnt_vnode = vn_alloc(KM_SLEEP);
861 	mnp->mnt_mountvp = VTOM(dp)->mnt_mountvp;
862 	vp = MTOV(mnp);
863 	vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
864 	vn_setops(vp, mntvnodeops);
865 	vp->v_vfsp = dp->v_vfsp;
866 	vp->v_type = VREG;
867 	vp->v_data = (caddr_t)mnp;
868 
869 	return (mnp);
870 }
871 
872 /*
873  * Free the storage obtained from mntgetnode().
874  */
875 static void
876 mntfreenode(mntnode_t *mnp)
877 {
878 	vnode_t *vp = MTOV(mnp);
879 
880 	vn_invalid(vp);
881 	vn_free(vp);
882 	kmem_free(mnp, sizeof (*mnp));
883 }
884 
885 
886 /* ARGSUSED */
887 static int
888 mntfsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
889 {
890 	return (0);
891 }
892 
893 /* ARGSUSED */
894 static void
895 mntinactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
896 {
897 	mntnode_t *mnp = VTOM(vp);
898 
899 	mntfreenode(mnp);
900 }
901 
902 /* ARGSUSED */
903 static int
904 mntseek(vnode_t *vp, offset_t ooff, offset_t *noffp,
905 	caller_context_t *ct)
906 {
907 	if (*noffp == 0)
908 		VTOM(vp)->mnt_offset = 0;
909 
910 	return (0);
911 }
912 
913 /*
914  * Return the answer requested to poll().
915  * POLLRDBAND will return when the mtime of the mnttab
916  * information is newer than the latest one read for this open.
917  */
918 /* ARGSUSED */
919 static int
920 mntpoll(vnode_t *vp, short ev, int any, short *revp, pollhead_t **phpp,
921 	caller_context_t *ct)
922 {
923 	mntnode_t *mnp = VTOM(vp);
924 	mntsnap_t *snap = &mnp->mnt_read;
925 
926 	if (mnp->mnt_ioctl.mnts_time.tv_sec > snap->mnts_time.tv_sec ||
927 	    (mnp->mnt_ioctl.mnts_time.tv_sec == snap->mnts_time.tv_sec &&
928 	    mnp->mnt_ioctl.mnts_time.tv_nsec > snap->mnts_time.tv_nsec))
929 		snap = &mnp->mnt_ioctl;
930 
931 	*revp = 0;
932 	*phpp = (pollhead_t *)NULL;
933 	if (ev & POLLIN)
934 		*revp |= POLLIN;
935 
936 	if (ev & POLLRDNORM)
937 		*revp |= POLLRDNORM;
938 
939 	if (ev & POLLRDBAND) {
940 		vfs_mnttab_poll(&snap->mnts_time, phpp);
941 		if (*phpp == (pollhead_t *)NULL)
942 			*revp |= POLLRDBAND;
943 	}
944 	if (*revp || *phpp != NULL || any) {
945 		return (0);
946 	}
947 	/*
948 	 * If someone is polling an unsupported poll events (e.g.
949 	 * POLLOUT, POLLPRI, etc.), just return POLLERR revents.
950 	 * That way we will ensure that we don't return a 0
951 	 * revents with a NULL pollhead pointer.
952 	 */
953 	*revp = POLLERR;
954 	return (0);
955 }
956 /* ARGSUSED */
957 static int
958 mntioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
959 	cred_t *cr, int *rvalp, caller_context_t *ct)
960 {
961 	uint_t *up = (uint_t *)arg;
962 	mntnode_t *mnp = VTOM(vp);
963 	mntsnap_t *snap = &mnp->mnt_ioctl;
964 	int error;
965 
966 	error = 0;
967 	switch (cmd) {
968 
969 	case MNTIOC_NMNTS: {		/* get no. of mounted resources */
970 		if (snap->mnts_count == 0) {
971 			if ((error =
972 			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK)) != 0)
973 				return (error);
974 		}
975 		if (suword32(up, snap->mnts_count) != 0)
976 			error = EFAULT;
977 		break;
978 	}
979 
980 	case MNTIOC_GETDEVLIST: {	/* get mounted device major/minor nos */
981 		uint_t *devlist;
982 		int i;
983 		size_t len;
984 
985 		if (snap->mnts_count == 0) {
986 			if ((error =
987 			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK)) != 0)
988 				return (error);
989 		}
990 
991 		len = 2 * snap->mnts_count * sizeof (uint_t);
992 		devlist = kmem_alloc(len, KM_SLEEP);
993 		for (i = 0; i < snap->mnts_count; i++) {
994 
995 #ifdef _SYSCALL32_IMPL
996 			if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) {
997 				struct extmnttab32 tab;
998 
999 				if ((error = xcopyin(snap->mnts_text +
1000 				    i * sizeof (struct extmnttab32), &tab,
1001 				    sizeof (tab))) != 0)
1002 					break;
1003 
1004 				devlist[i*2] = tab.mnt_major;
1005 				devlist[i*2+1] = tab.mnt_minor;
1006 			} else {
1007 #endif
1008 				struct extmnttab tab;
1009 
1010 				if ((error = xcopyin(snap->mnts_text +
1011 				    i * sizeof (struct extmnttab), &tab,
1012 				    sizeof (tab))) != 0)
1013 					break;
1014 
1015 				devlist[i*2] = tab.mnt_major;
1016 				devlist[i*2+1] = tab.mnt_minor;
1017 #ifdef _SYSCALL32_IMPL
1018 			}
1019 #endif
1020 		}
1021 
1022 		if (error == 0)
1023 			error = xcopyout(devlist, up, len);
1024 		kmem_free(devlist, len);
1025 		break;
1026 	}
1027 
1028 	case MNTIOC_SETTAG:		/* set tag on mounted file system */
1029 	case MNTIOC_CLRTAG:		/* clear tag on mounted file system */
1030 	{
1031 		struct mnttagdesc *dp = (struct mnttagdesc *)arg;
1032 		STRUCT_DECL(mnttagdesc, tagdesc);
1033 		char *cptr;
1034 		uint32_t major, minor;
1035 		char tagbuf[MAX_MNTOPT_TAG];
1036 		char *pbuf;
1037 		size_t len;
1038 		uint_t start = 0;
1039 		mntdata_t *mntdata = MTOD(mnp);
1040 		zone_t *zone = mntdata->mnt_zone;
1041 
1042 		STRUCT_INIT(tagdesc, flag & DATAMODEL_MASK);
1043 		if (copyin(dp, STRUCT_BUF(tagdesc), STRUCT_SIZE(tagdesc))) {
1044 			error = EFAULT;
1045 			break;
1046 		}
1047 		pbuf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1048 		if (zone != global_zone) {
1049 			(void) strcpy(pbuf, zone->zone_rootpath);
1050 			/* truncate "/" and nul */
1051 			start = zone->zone_rootpathlen - 2;
1052 			ASSERT(pbuf[start] == '/');
1053 		}
1054 		cptr = STRUCT_FGETP(tagdesc, mtd_mntpt);
1055 		error = copyinstr(cptr, pbuf + start, MAXPATHLEN - start, &len);
1056 		if (error) {
1057 			kmem_free(pbuf, MAXPATHLEN);
1058 			break;
1059 		}
1060 		if (start != 0 && pbuf[start] != '/') {
1061 			kmem_free(pbuf, MAXPATHLEN);
1062 			error = EINVAL;
1063 			break;
1064 		}
1065 		cptr = STRUCT_FGETP(tagdesc, mtd_tag);
1066 		if ((error = copyinstr(cptr, tagbuf, MAX_MNTOPT_TAG, &len))) {
1067 			kmem_free(pbuf, MAXPATHLEN);
1068 			break;
1069 		}
1070 		major = STRUCT_FGET(tagdesc, mtd_major);
1071 		minor = STRUCT_FGET(tagdesc, mtd_minor);
1072 		if (cmd == MNTIOC_SETTAG)
1073 			error = vfs_settag(major, minor, pbuf, tagbuf, cr);
1074 		else
1075 			error = vfs_clrtag(major, minor, pbuf, tagbuf, cr);
1076 		kmem_free(pbuf, MAXPATHLEN);
1077 		break;
1078 	}
1079 
1080 	case MNTIOC_SHOWHIDDEN:
1081 	{
1082 		mutex_enter(&vp->v_lock);
1083 		mnp->mnt_flags |= MNT_SHOWHIDDEN;
1084 		mutex_exit(&vp->v_lock);
1085 		break;
1086 	}
1087 
1088 	case MNTIOC_GETMNTENT:
1089 	{
1090 		size_t idx;
1091 		uintptr_t addr;
1092 
1093 		idx = mnp->mnt_offset;
1094 		if (snap->mnts_count == 0 || idx == 0) {
1095 			if ((error =
1096 			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK)) != 0)
1097 				return (error);
1098 		}
1099 		/*
1100 		 * If the next index is beyond the end of the current mnttab,
1101 		 * return EOF
1102 		 */
1103 		if (idx >= snap->mnts_count) {
1104 			*rvalp = 1;
1105 			return (0);
1106 		}
1107 
1108 #ifdef _SYSCALL32_IMPL
1109 		if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) {
1110 			addr = (uintptr_t)(snap->mnts_metadata + idx *
1111 			    sizeof (struct extmnttab32));
1112 			error = suword32((void *)arg, addr);
1113 		} else {
1114 #endif
1115 			addr = (uintptr_t)(snap->mnts_metadata + idx *
1116 			    sizeof (struct extmnttab));
1117 			error = sulword((void *)arg, addr);
1118 #ifdef _SYSCALL32_IMPL
1119 		}
1120 #endif
1121 
1122 		if (error != 0)
1123 			return (error);
1124 
1125 		mnp->mnt_offset++;
1126 		break;
1127 	}
1128 
1129 	default:
1130 		error = EINVAL;
1131 		break;
1132 	}
1133 
1134 	return (error);
1135 }
1136 
1137 /*
1138  * /mntfs vnode operations vector
1139  */
1140 const fs_operation_def_t mnt_vnodeops_template[] = {
1141 	VOPNAME_OPEN,		{ .vop_open = mntopen },
1142 	VOPNAME_CLOSE,		{ .vop_close = mntclose },
1143 	VOPNAME_READ,		{ .vop_read = mntread },
1144 	VOPNAME_IOCTL,		{ .vop_ioctl = mntioctl },
1145 	VOPNAME_GETATTR,	{ .vop_getattr = mntgetattr },
1146 	VOPNAME_ACCESS,		{ .vop_access = mntaccess },
1147 	VOPNAME_FSYNC,		{ .vop_fsync = mntfsync },
1148 	VOPNAME_INACTIVE,	{ .vop_inactive = mntinactive },
1149 	VOPNAME_SEEK,		{ .vop_seek = mntseek },
1150 	VOPNAME_POLL,		{ .vop_poll = mntpoll },
1151 	VOPNAME_DISPOSE,	{ .error = fs_error },
1152 	VOPNAME_SHRLOCK,	{ .error = fs_error },
1153 	NULL,			NULL
1154 };
1155