xref: /titanic_41/usr/src/uts/common/fs/mntfs/mntvnops.c (revision d29b2c4438482eb00488be49a1f5d6835f455546)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/file.h>
29 #include <sys/stat.h>
30 #include <sys/atomic.h>
31 #include <sys/mntio.h>
32 #include <sys/mnttab.h>
33 #include <sys/mount.h>
34 #include <sys/sunddi.h>
35 #include <sys/sysmacros.h>
36 #include <sys/systm.h>
37 #include <sys/vfs.h>
38 #include <sys/vfs_opreg.h>
39 #include <sys/fs/mntdata.h>
40 #include <fs/fs_subr.h>
41 #include <sys/vmsystm.h>
42 #include <vm/seg_vn.h>
43 
/* Node id that mntgetattr() reports for every mnttab vnode. */
#define	MNTROOTINO	2

/* Forward declaration: mntopen() calls mntgetnode() before its definition. */
static mntnode_t *mntgetnode(vnode_t *);

/* Vnode operations vector that mntgetnode() installs on each new vnode. */
vnodeops_t *mntvnodeops;
/* Defined outside this file; called here after a snapshot or a read. */
extern void vfs_mnttab_readop(void);
50 
51 /*
52  * Design of kernel mnttab accounting.
53  *
54  * To support whitespace in mount names, we implement an ioctl
55  * (MNTIOC_GETMNTENT) which allows a programmatic interface to the data in
56  * /etc/mnttab.  The libc functions getmntent() and getextmntent() are built
57  * atop this interface.
58  *
59  * To minimize the amount of memory used in the kernel, we keep all the
60  * necessary information in the user's address space.  Large server
61  * configurations can have /etc/mnttab files in excess of 64k.
62  *
 * To support both vanilla read() calls and ioctl() calls, we keep two
 * different snapshots of the kernel data structures, mnt_read and mnt_ioctl.
 * Each snapshot records the base location in user memory, the number of
 * mounts in the snapshot, and any metadata associated with it.  The metadata is
 * used only to support the ioctl() interface, and is a series of extmnttab
 * structures.  When the user issues an ioctl(), we simply copy out a pointer to
 * the relevant structure, and the rest is handled in userland.
70  */
71 
/*
 * NOTE: The following variable enables the generation of the "dev=xxx"
 * in the option string for a mounted file system.  Really this should
 * be gotten rid of altogether, but for the sake of backwards compatibility
 * we had to leave it in.  It is defined as a 32-bit device number.  This
 * means that when 64-bit device numbers are in use, if either the major or
 * minor part of the device number will not fit in a 16 bit quantity, the
 * "dev=" will be set to NODEV (0x7fffffff).  See PSARC 1999/566 and
 * 1999/131 for details.  The cmpldev() function used to generate the 32-bit
 * device number handles this check and assigns the proper value.
 *
 * When this is non-zero, mntfs_optsize() accounts for the "dev=xxx" option
 * and mntfs_optprint() emits it as the last option in the string.
 */
int mntfs_enabledev = 1;	/* enable old "dev=xxx" option */
84 
85 static int
86 mntfs_devsize(struct vfs *vfsp)
87 {
88 	dev32_t odev;
89 
90 	(void) cmpldev(&odev, vfsp->vfs_dev);
91 	return (snprintf(NULL, 0, "dev=%x", odev));
92 }
93 
94 static int
95 mntfs_devprint(struct vfs *vfsp, char *buf)
96 {
97 	dev32_t odev;
98 
99 	(void) cmpldev(&odev, vfsp->vfs_dev);
100 	return (snprintf(buf, MAX_MNTOPT_STR, "dev=%x", odev));
101 }
102 
103 static int
104 mntfs_optsize(struct vfs *vfsp)
105 {
106 	int i, size = 0;
107 	mntopt_t *mop;
108 
109 	for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
110 		mop = &vfsp->vfs_mntopts.mo_list[i];
111 		if (mop->mo_flags & MO_NODISPLAY)
112 			continue;
113 		if (mop->mo_flags & MO_SET) {
114 			if (size)
115 				size++; /* space for comma */
116 			size += strlen(mop->mo_name);
117 			/*
118 			 * count option value if there is one
119 			 */
120 			if (mop->mo_arg != NULL) {
121 				size += strlen(mop->mo_arg) + 1;
122 			}
123 		}
124 	}
125 	if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
126 		/*
127 		 * Add space for "zone=<zone_name>" if required.
128 		 */
129 		if (size)
130 			size++;	/* space for comma */
131 		size += sizeof ("zone=") - 1;
132 		size += strlen(vfsp->vfs_zone->zone_name);
133 	}
134 	if (mntfs_enabledev) {
135 		if (size != 0)
136 			size++; /* space for comma */
137 		size += mntfs_devsize(vfsp);
138 	}
139 	if (size == 0)
140 		size = strlen("-");
141 	return (size);
142 }
143 
144 static int
145 mntfs_optprint(struct vfs *vfsp, char *buf)
146 {
147 	int i, optinbuf = 0;
148 	mntopt_t *mop;
149 	char *origbuf = buf;
150 
151 	for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) {
152 		mop = &vfsp->vfs_mntopts.mo_list[i];
153 		if (mop->mo_flags & MO_NODISPLAY)
154 			continue;
155 		if (mop->mo_flags & MO_SET) {
156 			if (optinbuf)
157 				*buf++ = ',';
158 			else
159 				optinbuf = 1;
160 			buf += snprintf(buf, MAX_MNTOPT_STR,
161 				"%s", mop->mo_name);
162 			/*
163 			 * print option value if there is one
164 			 */
165 			if (mop->mo_arg != NULL) {
166 				buf += snprintf(buf, MAX_MNTOPT_STR, "=%s",
167 					mop->mo_arg);
168 			}
169 		}
170 	}
171 	if (vfsp->vfs_zone != NULL && vfsp->vfs_zone != global_zone) {
172 		if (optinbuf)
173 			*buf++ = ',';
174 		else
175 			optinbuf = 1;
176 		buf += snprintf(buf, MAX_MNTOPT_STR, "zone=%s",
177 		    vfsp->vfs_zone->zone_name);
178 	}
179 	if (mntfs_enabledev) {
180 		if (optinbuf++)
181 			*buf++ = ',';
182 		buf += mntfs_devprint(vfsp, buf);
183 	}
184 	if (!optinbuf) {
185 		buf += snprintf(buf, MAX_MNTOPT_STR, "-");
186 	}
187 	return (buf - origbuf);
188 }
189 
190 static size_t
191 mntfs_vfs_len(vfs_t *vfsp, zone_t *zone)
192 {
193 	size_t size = 0;
194 	const char *resource, *mntpt;
195 
196 	mntpt = refstr_value(vfsp->vfs_mntpt);
197 	if (mntpt != NULL && mntpt[0] != '\0') {
198 		size += strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
199 	} else {
200 		size += strlen("-") + 1;
201 	}
202 
203 	resource = refstr_value(vfsp->vfs_resource);
204 	if (resource != NULL && resource[0] != '\0') {
205 		if (resource[0] != '/') {
206 			size += strlen(resource) + 1;
207 		} else if (!ZONE_PATH_VISIBLE(resource, zone)) {
208 			/*
209 			 * Same as the zone's view of the mount point.
210 			 */
211 			size += strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
212 		} else {
213 			size += strlen(ZONE_PATH_TRANSLATE(resource, zone)) + 1;
214 		}
215 	} else {
216 		size += strlen("-") + 1;
217 	}
218 	size += strlen(vfssw[vfsp->vfs_fstype].vsw_name) + 1;
219 	size += mntfs_optsize(vfsp);
220 	size += snprintf(NULL, 0, "\t%ld\n", vfsp->vfs_mtime);
221 	return (size);
222 }
223 
224 static void
225 mntfs_zonerootvfs(zone_t *zone, vfs_t *rootvfsp)
226 {
227 	/*
228 	 * Basically copy over the real vfs_t on which the root vnode is
229 	 * located, changing its mountpoint and resource to match those of
230 	 * the zone's rootpath.
231 	 */
232 	*rootvfsp = *zone->zone_rootvp->v_vfsp;
233 	rootvfsp->vfs_mntpt = refstr_alloc(zone->zone_rootpath);
234 	rootvfsp->vfs_resource = rootvfsp->vfs_mntpt;
235 }
236 
237 static size_t
238 mntfs_zone_len(uint_t *nent_ptr, zone_t *zone, int showhidden)
239 {
240 	struct vfs *zonelist;
241 	struct vfs *vfsp;
242 	size_t size = 0;
243 	uint_t cnt = 0;
244 
245 	ASSERT(zone->zone_rootpath != NULL);
246 
247 	/*
248 	 * If the zone has a root entry, it will be the first in the list.  If
249 	 * it doesn't, we conjure one up.
250 	 */
251 	vfsp = zonelist = zone->zone_vfslist;
252 	if (zonelist == NULL ||
253 	    strcmp(refstr_value(vfsp->vfs_mntpt), zone->zone_rootpath) != 0) {
254 		vfs_t tvfs;
255 		/*
256 		 * The root of the zone is not a mount point.  The vfs we want
257 		 * to report is that of the zone's root vnode.
258 		 */
259 		ASSERT(zone != global_zone);
260 		mntfs_zonerootvfs(zone, &tvfs);
261 		size += mntfs_vfs_len(&tvfs, zone);
262 		refstr_rele(tvfs.vfs_mntpt);
263 		cnt++;
264 	}
265 	if (zonelist == NULL)
266 		goto out;
267 	do {
268 		/*
269 		 * Skip mounts that should not show up in mnttab
270 		 */
271 		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
272 			vfsp = vfsp->vfs_zone_next;
273 			continue;
274 		}
275 		cnt++;
276 		size += mntfs_vfs_len(vfsp, zone);
277 		vfsp = vfsp->vfs_zone_next;
278 	} while (vfsp != zonelist);
279 out:
280 	*nent_ptr = cnt;
281 	return (size);
282 }
283 
284 static size_t
285 mntfs_global_len(uint_t *nent_ptr, int showhidden)
286 {
287 	struct vfs *vfsp;
288 	size_t size = 0;
289 	uint_t cnt = 0;
290 
291 	vfsp = rootvfs;
292 	do {
293 		/*
294 		 * Skip mounts that should not show up in mnttab
295 		 */
296 		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
297 			vfsp = vfsp->vfs_next;
298 			continue;
299 		}
300 		cnt++;
301 		size += mntfs_vfs_len(vfsp, global_zone);
302 		vfsp = vfsp->vfs_next;
303 	} while (vfsp != rootvfs);
304 	*nent_ptr = cnt;
305 	return (size);
306 }
307 
308 static void
309 mntfs_vfs_generate(vfs_t *vfsp, zone_t *zone, struct extmnttab *tab,
310     char **basep, int forread)
311 {
312 	const char *resource, *mntpt;
313 	char *cp = *basep;
314 
315 	mntpt = refstr_value(vfsp->vfs_mntpt);
316 	resource = refstr_value(vfsp->vfs_resource);
317 
318 	if (tab)
319 		tab->mnt_special = cp;
320 	if (resource != NULL && resource[0] != '\0') {
321 		if (resource[0] != '/') {
322 			cp += snprintf(cp, MAXPATHLEN, "%s", resource);
323 		} else if (!ZONE_PATH_VISIBLE(resource, zone)) {
324 			/*
325 			 * Use the mount point as the resource.
326 			 */
327 			cp += snprintf(cp, MAXPATHLEN, "%s",
328 			    ZONE_PATH_TRANSLATE(mntpt, zone));
329 		} else {
330 			cp += snprintf(cp, MAXPATHLEN, "%s",
331 			    ZONE_PATH_TRANSLATE(resource, zone));
332 		}
333 	} else {
334 		cp += snprintf(cp, MAXPATHLEN, "-");
335 	}
336 	*cp++ = forread ? '\t' : '\0';
337 
338 	if (tab)
339 		tab->mnt_mountp = cp;
340 	if (mntpt != NULL && mntpt[0] != '\0') {
341 		/*
342 		 * We know the mount point is visible from within the zone,
343 		 * otherwise it wouldn't be on the zone's vfs list.
344 		 */
345 		cp += snprintf(cp, MAXPATHLEN, "%s",
346 		    ZONE_PATH_TRANSLATE(mntpt, zone));
347 	} else {
348 		cp += snprintf(cp, MAXPATHLEN, "-");
349 	}
350 	*cp++ = forread ? '\t' : '\0';
351 
352 	if (tab)
353 		tab->mnt_fstype = cp;
354 	cp += snprintf(cp, MAXPATHLEN, "%s",
355 	    vfssw[vfsp->vfs_fstype].vsw_name);
356 	*cp++ = forread ? '\t' : '\0';
357 
358 	if (tab)
359 		tab->mnt_mntopts = cp;
360 	cp += mntfs_optprint(vfsp, cp);
361 	*cp++ = forread ? '\t' : '\0';
362 
363 	if (tab)
364 		tab->mnt_time = cp;
365 	cp += snprintf(cp, MAX_MNTOPT_STR, "%ld", vfsp->vfs_mtime);
366 	*cp++ = forread ? '\n' : '\0';
367 
368 	if (tab) {
369 		tab->mnt_major = getmajor(vfsp->vfs_dev);
370 		tab->mnt_minor = getminor(vfsp->vfs_dev);
371 	}
372 
373 	*basep = cp;
374 }
375 
376 static void
377 mntfs_zone_generate(zone_t *zone, int showhidden, struct extmnttab *tab,
378     char *basep, int forread)
379 {
380 	vfs_t *zonelist;
381 	vfs_t *vfsp;
382 	char *cp = basep;
383 
384 	/*
385 	 * If the zone has a root entry, it will be the first in the list.  If
386 	 * it doesn't, we conjure one up.
387 	 */
388 	vfsp = zonelist = zone->zone_vfslist;
389 	if (zonelist == NULL ||
390 	    strcmp(refstr_value(vfsp->vfs_mntpt), zone->zone_rootpath) != 0) {
391 		vfs_t tvfs;
392 		/*
393 		 * The root of the zone is not a mount point.  The vfs we want
394 		 * to report is that of the zone's root vnode.
395 		 */
396 		ASSERT(zone != global_zone);
397 		mntfs_zonerootvfs(zone, &tvfs);
398 		mntfs_vfs_generate(&tvfs, zone, tab, &cp, forread);
399 		refstr_rele(tvfs.vfs_mntpt);
400 		if (tab)
401 			tab++;
402 	}
403 	if (zonelist == NULL)
404 		return;
405 	do {
406 		/*
407 		 * Skip mounts that should not show up in mnttab
408 		 */
409 		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
410 			vfsp = vfsp->vfs_zone_next;
411 			continue;
412 		}
413 		mntfs_vfs_generate(vfsp, zone, tab, &cp, forread);
414 		if (tab)
415 			tab++;
416 		vfsp = vfsp->vfs_zone_next;
417 	} while (vfsp != zonelist);
418 }
419 
420 static void
421 mntfs_global_generate(int showhidden, struct extmnttab *tab, char *basep,
422     int forread)
423 {
424 	vfs_t *vfsp;
425 	char *cp = basep;
426 
427 	vfsp = rootvfs;
428 	do {
429 		/*
430 		 * Skip mounts that should not show up in mnttab
431 		 */
432 		if (!showhidden && vfsp->vfs_flag & VFS_NOMNTTAB) {
433 			vfsp = vfsp->vfs_next;
434 			continue;
435 		}
436 		mntfs_vfs_generate(vfsp, global_zone, tab, &cp, forread);
437 		if (tab)
438 			tab++;
439 		vfsp = vfsp->vfs_next;
440 	} while (vfsp != rootvfs);
441 }
442 
443 static char *
444 mntfs_mapin(char *base, size_t size)
445 {
446 	size_t rlen = roundup(size, PAGESIZE);
447 	struct as *as = curproc->p_as;
448 	char *addr;
449 
450 	as_rangelock(as);
451 	map_addr(&addr, rlen, 0, 1, 0);
452 	if (addr == NULL || as_map(as, addr, rlen, segvn_create, zfod_argsp)) {
453 		as_rangeunlock(as);
454 		return (NULL);
455 	}
456 	as_rangeunlock(as);
457 	if (copyout(base, addr, size)) {
458 		(void) as_unmap(as, addr, rlen);
459 		return (NULL);
460 	}
461 	return (addr);
462 }
463 
464 static void
465 mntfs_freesnap(mntsnap_t *snap)
466 {
467 	if (snap->mnts_text != NULL)
468 		(void) as_unmap(curproc->p_as, snap->mnts_text,
469 			roundup(snap->mnts_textsize, PAGESIZE));
470 	snap->mnts_textsize = snap->mnts_count = 0;
471 	if (snap->mnts_metadata != NULL)
472 		(void) as_unmap(curproc->p_as, snap->mnts_metadata,
473 			roundup(snap->mnts_metasize, PAGESIZE));
474 	snap->mnts_metasize = 0;
475 }
476 
#ifdef _SYSCALL32_IMPL

/*
 * Layout of an extmnttab entry as handed to a DATAMODEL_ILP32 caller.  The
 * five string members carry user addresses stored as 32-bit values;
 * mntfs_snapshot() fills them in by rebasing the field pointers recorded by
 * mntfs_vfs_generate() onto the user mapping of the text.
 */
typedef struct extmnttab32 {
	uint32_t	mnt_special;	/* address of the resource string */
	uint32_t	mnt_mountp;	/* address of the mount point string */
	uint32_t	mnt_fstype;	/* address of the fs type string */
	uint32_t	mnt_mntopts;	/* address of the options string */
	uint32_t	mnt_time;	/* address of the mount time string */
	uint_t		mnt_major;	/* major number of the vfs device */
	uint_t		mnt_minor;	/* minor number of the vfs device */
} extmnttab32_t;

#endif
490 
/*
 * Snapshot the latest version of the kernel mounted resource information
 *
 * There are two types of snapshots: one destined for reading, and one destined
 * for ioctl().  The difference is that the ioctl() interface is delimited by
 * NUL characters, while the read() interface is delimited by tabs and
 * newlines.
 *
 * mnp		node whose mnt_read (forread != 0) or mnt_ioctl (forread == 0)
 *		snapshot is to be brought up to date
 * forread	selects the snapshot and the text delimiters; metadata is
 *		built only when this is zero
 * datamodel	when _SYSCALL32_IMPL is defined, DATAMODEL_ILP32 selects the
 *		extmnttab32 metadata layout; otherwise unused
 *
 * Returns 0 if the existing snapshot is still current or a new one was
 * built, and ENOMEM if the text or metadata could not be mapped into the
 * calling process.
 */
/* ARGSUSED */
static int
mntfs_snapshot(mntnode_t *mnp, int forread, int datamodel)
{
	size_t size;
	timespec_t lastmodt;
	mntdata_t *mntdata = MTOD(mnp);
	zone_t *zone = mntdata->mnt_zone;
	boolean_t global_view = (MTOD(mnp)->mnt_zone == global_zone);
	boolean_t showhidden = ((mnp->mnt_flags & MNT_SHOWHIDDEN) != 0);
	struct extmnttab *metadata_baseaddr;
	char *text_baseaddr;
	int i;
	mntsnap_t *snap;

	if (forread)
		snap = &mnp->mnt_read;
	else
		snap = &mnp->mnt_ioctl;

	vfs_list_read_lock();
	/*
	 * Check if the mnttab info has changed since the last snapshot
	 */
	vfs_mnttab_modtime(&lastmodt);
	if (snap->mnts_count &&
	    lastmodt.tv_sec == snap->mnts_time.tv_sec &&
	    lastmodt.tv_nsec == snap->mnts_time.tv_nsec) {
		vfs_list_unlock();
		return (0);
	}


	/*
	 * The snapshot is stale (or was never taken): discard it, then size
	 * and generate a new one.  Sizing and generation both happen before
	 * the vfs list lock is dropped.
	 */
	if (snap->mnts_count != 0)
		mntfs_freesnap(snap);
	if (global_view)
		size = mntfs_global_len(&snap->mnts_count, showhidden);
	else
		size = mntfs_zone_len(&snap->mnts_count, zone, showhidden);
	ASSERT(size != 0);

	/* Metadata is only needed for the ioctl() flavor of snapshot. */
	if (!forread)
		metadata_baseaddr = kmem_alloc(
		    snap->mnts_count * sizeof (struct extmnttab), KM_SLEEP);
	else
		metadata_baseaddr = NULL;

	text_baseaddr = kmem_alloc(size, KM_SLEEP);

	if (global_view)
		mntfs_global_generate(showhidden, metadata_baseaddr,
		    text_baseaddr, forread);
	else
		mntfs_zone_generate(zone, showhidden,
		    metadata_baseaddr, text_baseaddr, forread);

	vfs_mnttab_modtime(&snap->mnts_time);
	vfs_list_unlock();

	/*
	 * Copy the text into a new user mapping and release the kernel copy.
	 * A mapping failure leaves mnts_text NULL; it is detected at the end
	 * of the function.
	 */
	snap->mnts_text = mntfs_mapin(text_baseaddr, size);
	snap->mnts_textsize = size;
	kmem_free(text_baseaddr, size);

	/*
	 * The pointers in the metadata refer to addresses in the range
	 * [base_addr, base_addr + size].  Now that we have mapped the text into
	 * the user's address space, we have to convert these addresses into the
	 * new (user) range.  We also handle the conversion for 32-bit and
	 * 64-bit applications here.
	 *
	 * text_baseaddr has been freed by this point; below it is used only
	 * as a value in pointer arithmetic and is never dereferenced.
	 */
	if (!forread) {
		struct extmnttab *tab;
#ifdef _SYSCALL32_IMPL
		struct extmnttab32 *tab32;

		if (datamodel == DATAMODEL_ILP32) {
			/*
			 * tab and tab32 alias the same buffer: each entry is
			 * converted in place to the extmnttab32 layout, in
			 * increasing index order.
			 * NOTE(review): this appears to rely on extmnttab32
			 * being no larger than extmnttab, so that a write
			 * never lands on a field not yet read -- confirm.
			 */
			tab = (struct extmnttab *)metadata_baseaddr;
			tab32 = (struct extmnttab32 *)metadata_baseaddr;

			for (i = 0; i < snap->mnts_count; i++) {
				tab32[i].mnt_special =
				    (uintptr_t)snap->mnts_text +
				    (tab[i].mnt_special - text_baseaddr);
				tab32[i].mnt_mountp =
				    (uintptr_t)snap->mnts_text +
				    (tab[i].mnt_mountp - text_baseaddr);
				tab32[i].mnt_fstype =
				    (uintptr_t)snap->mnts_text +
				    (tab[i].mnt_fstype - text_baseaddr);
				tab32[i].mnt_mntopts =
				    (uintptr_t)snap->mnts_text +
				    (tab[i].mnt_mntopts - text_baseaddr);
				tab32[i].mnt_time = (uintptr_t)snap->mnts_text +
				    (tab[i].mnt_time - text_baseaddr);
				tab32[i].mnt_major = tab[i].mnt_major;
				tab32[i].mnt_minor = tab[i].mnt_minor;
			}

			/* Only the converted (32-bit) entries are mapped. */
			snap->mnts_metasize =
			    snap->mnts_count * sizeof (struct extmnttab32);
			snap->mnts_metadata = mntfs_mapin(
			    (char *)metadata_baseaddr,
			    snap->mnts_metasize);

		} else {
#endif
			/* Native layout: rebase each field pointer in place. */
			tab = (struct extmnttab *)metadata_baseaddr;
			for (i = 0; i < snap->mnts_count; i++) {
				tab[i].mnt_special = snap->mnts_text +
				    (tab[i].mnt_special - text_baseaddr);
				tab[i].mnt_mountp = snap->mnts_text +
				    (tab[i].mnt_mountp - text_baseaddr);
				tab[i].mnt_fstype = snap->mnts_text +
				    (tab[i].mnt_fstype - text_baseaddr);
				tab[i].mnt_mntopts = snap->mnts_text +
				    (tab[i].mnt_mntopts - text_baseaddr);
				tab[i].mnt_time = snap->mnts_text +
				    (tab[i].mnt_time - text_baseaddr);
			}

			snap->mnts_metasize =
			    snap->mnts_count * sizeof (struct extmnttab);
			snap->mnts_metadata = mntfs_mapin(
			    (char *)metadata_baseaddr, snap->mnts_metasize);
#ifdef _SYSCALL32_IMPL
		}
#endif

		/*
		 * The kernel buffer was allocated with the native entry size,
		 * so it is freed with that size regardless of data model.
		 */
		kmem_free(metadata_baseaddr,
		    snap->mnts_count * sizeof (struct extmnttab));
	}

	/* Record the text size; mntgetattr() falls back to this value. */
	mntdata->mnt_size = size;

	/* Either mapping may have failed; if so, drop whatever was mapped. */
	if (snap->mnts_text == NULL ||
	    (!forread && snap->mnts_metadata == NULL)) {
		mntfs_freesnap(snap);
		return (ENOMEM);
	}
	vfs_mnttab_readop();
	return (0);
}
640 
641 /*
642  * Public function to convert vfs_mntopts into a string.
643  * A buffer of sufficient size is allocated, which is returned via bufp,
644  * and whose length is returned via lenp.
645  */
646 void
647 mntfs_getmntopts(struct vfs *vfsp, char **bufp, size_t *lenp)
648 {
649 	size_t len;
650 	char *buf;
651 
652 	vfs_list_read_lock();
653 
654 	len = mntfs_optsize(vfsp) + 1;
655 	buf = kmem_alloc(len, KM_NOSLEEP);
656 	if (buf == NULL) {
657 		*bufp = NULL;
658 		vfs_list_unlock();
659 		return;
660 	}
661 	buf[len - 1] = '\0';
662 	(void) mntfs_optprint(vfsp, buf);
663 	ASSERT(buf[len - 1] == '\0');
664 
665 	vfs_list_unlock();
666 	*bufp = buf;
667 	*lenp = len;
668 }
669 
670 
671 /* ARGSUSED */
672 static int
673 mntopen(vnode_t **vpp, int flag, cred_t *cr)
674 {
675 	vnode_t *vp = *vpp;
676 	mntnode_t *nmnp;
677 
678 	/*
679 	 * Not allowed to open for writing, return error.
680 	 */
681 	if (flag & FWRITE)
682 		return (EPERM);
683 	/*
684 	 * Create a new mnt/vnode for each open, this will give us a handle to
685 	 * hang the snapshot on.
686 	 */
687 	nmnp = mntgetnode(vp);
688 
689 	*vpp = MTOV(nmnp);
690 	atomic_add_32(&MTOD(nmnp)->mnt_nopen, 1);
691 	VN_RELE(vp);
692 	return (0);
693 }
694 
695 /* ARGSUSED */
696 static int
697 mntclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
698 {
699 	mntnode_t *mnp = VTOM(vp);
700 
701 	/* Clean up any locks or shares held by the current process */
702 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
703 	cleanshares(vp, ttoproc(curthread)->p_pid);
704 
705 	if (count > 1)
706 		return (0);
707 	if (vp->v_count == 1) {
708 		mntfs_freesnap(&mnp->mnt_read);
709 		mntfs_freesnap(&mnp->mnt_ioctl);
710 		atomic_add_32(&MTOD(mnp)->mnt_nopen, -1);
711 	}
712 	return (0);
713 }
714 
715 /* ARGSUSED */
716 static int
717 mntread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, caller_context_t *ct)
718 {
719 	int error = 0;
720 	off_t off = uio->uio_offset;
721 	size_t len = uio->uio_resid;
722 	mntnode_t *mnp = VTOM(vp);
723 	char *buf;
724 	mntsnap_t *snap = &mnp->mnt_read;
725 	int datamodel;
726 
727 	if (off == (off_t)0 || snap->mnts_count == 0) {
728 		/*
729 		 * It is assumed that any kernel callers wishing
730 		 * to read mnttab will be using extmnttab entries
731 		 * and not extmnttab32 entries, whether or not
732 		 * the kernel is LP64 or ILP32.  Thus, force the
733 		 * datamodel that mntfs_snapshot uses to be
734 		 * DATAMODEL_LP64.
735 		 */
736 		if (uio->uio_segflg == UIO_SYSSPACE)
737 			datamodel = DATAMODEL_LP64;
738 		else
739 			datamodel = get_udatamodel();
740 		if ((error = mntfs_snapshot(mnp, 1, datamodel)) != 0)
741 			return (error);
742 	}
743 	if ((size_t)(off + len) > snap->mnts_textsize)
744 		len = snap->mnts_textsize - off;
745 
746 	if (off < 0 || len > snap->mnts_textsize)
747 		return (EFAULT);
748 
749 	if (len == 0)
750 		return (0);
751 
752 	/*
753 	 * The mnttab image is stored in the user's address space,
754 	 * so we have to copy it into the kernel from userland,
755 	 * then copy it back out to the specified address.
756 	 */
757 	buf = kmem_alloc(len, KM_SLEEP);
758 	if (copyin(snap->mnts_text + off, buf, len))
759 		error = EFAULT;
760 	else {
761 		error = uiomove(buf, len, UIO_READ, uio);
762 	}
763 	kmem_free(buf, len);
764 	vfs_mnttab_readop();
765 	return (error);
766 }
767 
768 
769 static int
770 mntgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
771 {
772 	mntnode_t *mnp = VTOM(vp);
773 	int error;
774 	vnode_t *rvp;
775 	extern timespec_t vfs_mnttab_ctime;
776 	mntdata_t *mntdata = MTOD(VTOM(vp));
777 	mntsnap_t *snap = mnp->mnt_read.mnts_count ?
778 	    &mnp->mnt_read : &mnp->mnt_ioctl;
779 
780 	/*
781 	 * Return all the attributes.  Should be refined
782 	 * so that it returns only those asked for.
783 	 * Most of this is complete fakery anyway.
784 	 */
785 	rvp = mnp->mnt_mountvp;
786 	/*
787 	 * Attributes are same as underlying file with modifications
788 	 */
789 	if (error = VOP_GETATTR(rvp, vap, flags, cr))
790 		return (error);
791 
792 	/*
793 	 * We always look like a regular file
794 	 */
795 	vap->va_type = VREG;
796 	/*
797 	 * mode should basically be read only
798 	 */
799 	vap->va_mode &= 07444;
800 	vap->va_fsid = vp->v_vfsp->vfs_dev;
801 	vap->va_blksize = DEV_BSIZE;
802 	vap->va_rdev = 0;
803 	vap->va_seq = 0;
804 	/*
805 	 * Set nlink to the number of open vnodes for mnttab info
806 	 * plus one for existing.
807 	 */
808 	vap->va_nlink = mntdata->mnt_nopen + 1;
809 	/*
810 	 * If we haven't taken a snapshot yet, set the
811 	 * size to the size of the latest snapshot.
812 	 */
813 	vap->va_size = snap->mnts_textsize ? snap->mnts_textsize :
814 	    mntdata->mnt_size;
815 	/*
816 	 * Fetch mtime from the vfs mnttab timestamp
817 	 */
818 	vap->va_ctime = vfs_mnttab_ctime;
819 	vfs_list_read_lock();
820 	vfs_mnttab_modtime(&vap->va_mtime);
821 	vap->va_atime = vap->va_mtime;
822 	vfs_list_unlock();
823 	/*
824 	 * Nodeid is always ROOTINO;
825 	 */
826 	vap->va_nodeid = (ino64_t)MNTROOTINO;
827 	vap->va_nblocks = btod(vap->va_size);
828 	return (0);
829 }
830 
831 
832 static int
833 mntaccess(vnode_t *vp, int mode, int flags, cred_t *cr)
834 {
835 	mntnode_t *mnp = VTOM(vp);
836 
837 	if (mode & (VWRITE|VEXEC))
838 		return (EROFS);
839 
840 	/*
841 	 * Do access check on the underlying directory vnode.
842 	 */
843 	return (VOP_ACCESS(mnp->mnt_mountvp, mode, flags, cr));
844 }
845 
846 
847 /*
848  * New /mntfs vnode required; allocate it and fill in most of the fields.
849  */
850 static mntnode_t *
851 mntgetnode(vnode_t *dp)
852 {
853 	mntnode_t *mnp;
854 	vnode_t *vp;
855 
856 	mnp = kmem_zalloc(sizeof (mntnode_t), KM_SLEEP);
857 	mnp->mnt_vnode = vn_alloc(KM_SLEEP);
858 	mnp->mnt_mountvp = VTOM(dp)->mnt_mountvp;
859 	vp = MTOV(mnp);
860 	vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
861 	vn_setops(vp, mntvnodeops);
862 	vp->v_vfsp = dp->v_vfsp;
863 	vp->v_type = VREG;
864 	vp->v_data = (caddr_t)mnp;
865 
866 	return (mnp);
867 }
868 
869 /*
870  * Free the storage obtained from mntgetnode().
871  */
872 static void
873 mntfreenode(mntnode_t *mnp)
874 {
875 	vnode_t *vp = MTOV(mnp);
876 
877 	vn_invalid(vp);
878 	vn_free(vp);
879 	kmem_free(mnp, sizeof (*mnp));
880 }
881 
882 
883 /* ARGSUSED */
884 static int
885 mntfsync(vnode_t *vp, int syncflag, cred_t *cr)
886 {
887 	return (0);
888 }
889 
890 /* ARGSUSED */
891 static void
892 mntinactive(vnode_t *vp, cred_t *cr)
893 {
894 	mntnode_t *mnp = VTOM(vp);
895 
896 	mntfreenode(mnp);
897 }
898 
899 /* ARGSUSED */
900 static int
901 mntseek(vnode_t *vp, offset_t ooff, offset_t *noffp)
902 {
903 	if (*noffp == 0)
904 		VTOM(vp)->mnt_offset = 0;
905 
906 	return (0);
907 }
908 
909 /*
910  * Return the answer requested to poll().
911  * POLLRDBAND will return when the mtime of the mnttab
912  * information is newer than the latest one read for this open.
913  */
914 /* ARGSUSED */
915 static int
916 mntpoll(vnode_t *vp, short ev, int any, short *revp, pollhead_t **phpp)
917 {
918 	mntnode_t *mnp = VTOM(vp);
919 	mntsnap_t *snap = &mnp->mnt_read;
920 
921 	if (mnp->mnt_ioctl.mnts_time.tv_sec > snap->mnts_time.tv_sec ||
922 	    (mnp->mnt_ioctl.mnts_time.tv_sec == snap->mnts_time.tv_sec &&
923 	    mnp->mnt_ioctl.mnts_time.tv_nsec > snap->mnts_time.tv_nsec))
924 		snap = &mnp->mnt_ioctl;
925 
926 	*revp = 0;
927 	*phpp = (pollhead_t *)NULL;
928 	if (ev & POLLIN)
929 		*revp |= POLLIN;
930 
931 	if (ev & POLLRDNORM)
932 		*revp |= POLLRDNORM;
933 
934 	if (ev & POLLRDBAND) {
935 		vfs_mnttab_poll(&snap->mnts_time, phpp);
936 		if (*phpp == (pollhead_t *)NULL)
937 			*revp |= POLLRDBAND;
938 	}
939 	if (*revp || *phpp != NULL || any) {
940 		return (0);
941 	}
942 	/*
943 	 * If someone is polling an unsupported poll events (e.g.
944 	 * POLLOUT, POLLPRI, etc.), just return POLLERR revents.
945 	 * That way we will ensure that we don't return a 0
946 	 * revents with a NULL pollhead pointer.
947 	 */
948 	*revp = POLLERR;
949 	return (0);
950 }
951 /* ARGSUSED */
952 static int
953 mntioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
954 	cred_t *cr, int *rvalp)
955 {
956 	uint_t *up = (uint_t *)arg;
957 	mntnode_t *mnp = VTOM(vp);
958 	mntsnap_t *snap = &mnp->mnt_ioctl;
959 	int error;
960 
961 	error = 0;
962 	switch (cmd) {
963 
964 	case MNTIOC_NMNTS: {		/* get no. of mounted resources */
965 		if (snap->mnts_count == 0) {
966 			if ((error =
967 			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK)) != 0)
968 				return (error);
969 		}
970 		if (suword32(up, snap->mnts_count) != 0)
971 			error = EFAULT;
972 		break;
973 	}
974 
975 	case MNTIOC_GETDEVLIST: {	/* get mounted device major/minor nos */
976 		uint_t *devlist;
977 		int i;
978 		size_t len;
979 
980 		if (snap->mnts_count == 0) {
981 			if ((error =
982 			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK)) != 0)
983 				return (error);
984 		}
985 
986 		len = 2 * snap->mnts_count * sizeof (uint_t);
987 		devlist = kmem_alloc(len, KM_SLEEP);
988 		for (i = 0; i < snap->mnts_count; i++) {
989 
990 #ifdef _SYSCALL32_IMPL
991 			if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) {
992 				struct extmnttab32 tab;
993 
994 				if ((error = xcopyin(snap->mnts_text +
995 				    i * sizeof (struct extmnttab32), &tab,
996 				    sizeof (tab))) != 0)
997 					break;
998 
999 				devlist[i*2] = tab.mnt_major;
1000 				devlist[i*2+1] = tab.mnt_minor;
1001 			} else {
1002 #endif
1003 				struct extmnttab tab;
1004 
1005 				if ((error = xcopyin(snap->mnts_text +
1006 				    i * sizeof (struct extmnttab), &tab,
1007 				    sizeof (tab))) != 0)
1008 					break;
1009 
1010 				devlist[i*2] = tab.mnt_major;
1011 				devlist[i*2+1] = tab.mnt_minor;
1012 #ifdef _SYSCALL32_IMPL
1013 			}
1014 #endif
1015 		}
1016 
1017 		if (error == 0)
1018 			error = xcopyout(devlist, up, len);
1019 		kmem_free(devlist, len);
1020 		break;
1021 	}
1022 
1023 	case MNTIOC_SETTAG:		/* set tag on mounted file system */
1024 	case MNTIOC_CLRTAG:		/* clear tag on mounted file system */
1025 	{
1026 		struct mnttagdesc *dp = (struct mnttagdesc *)arg;
1027 		STRUCT_DECL(mnttagdesc, tagdesc);
1028 		char *cptr;
1029 		uint32_t major, minor;
1030 		char tagbuf[MAX_MNTOPT_TAG];
1031 		char *pbuf;
1032 		size_t len;
1033 		uint_t start = 0;
1034 		mntdata_t *mntdata = MTOD(mnp);
1035 		zone_t *zone = mntdata->mnt_zone;
1036 
1037 		STRUCT_INIT(tagdesc, flag & DATAMODEL_MASK);
1038 		if (copyin(dp, STRUCT_BUF(tagdesc), STRUCT_SIZE(tagdesc))) {
1039 			error = EFAULT;
1040 			break;
1041 		}
1042 		pbuf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1043 		if (zone != global_zone) {
1044 			(void) strcpy(pbuf, zone->zone_rootpath);
1045 			/* truncate "/" and nul */
1046 			start = zone->zone_rootpathlen - 2;
1047 			ASSERT(pbuf[start] == '/');
1048 		}
1049 		cptr = STRUCT_FGETP(tagdesc, mtd_mntpt);
1050 		error = copyinstr(cptr, pbuf + start, MAXPATHLEN - start, &len);
1051 		if (error) {
1052 			kmem_free(pbuf, MAXPATHLEN);
1053 			break;
1054 		}
1055 		if (start != 0 && pbuf[start] != '/') {
1056 			kmem_free(pbuf, MAXPATHLEN);
1057 			error = EINVAL;
1058 			break;
1059 		}
1060 		cptr = STRUCT_FGETP(tagdesc, mtd_tag);
1061 		if ((error = copyinstr(cptr, tagbuf, MAX_MNTOPT_TAG, &len))) {
1062 			kmem_free(pbuf, MAXPATHLEN);
1063 			break;
1064 		}
1065 		major = STRUCT_FGET(tagdesc, mtd_major);
1066 		minor = STRUCT_FGET(tagdesc, mtd_minor);
1067 		if (cmd == MNTIOC_SETTAG)
1068 			error = vfs_settag(major, minor, pbuf, tagbuf, cr);
1069 		else
1070 			error = vfs_clrtag(major, minor, pbuf, tagbuf, cr);
1071 		kmem_free(pbuf, MAXPATHLEN);
1072 		break;
1073 	}
1074 
1075 	case MNTIOC_SHOWHIDDEN:
1076 	{
1077 		mutex_enter(&vp->v_lock);
1078 		mnp->mnt_flags |= MNT_SHOWHIDDEN;
1079 		mutex_exit(&vp->v_lock);
1080 		break;
1081 	}
1082 
1083 	case MNTIOC_GETMNTENT:
1084 	{
1085 		size_t idx;
1086 		uintptr_t addr;
1087 
1088 		idx = mnp->mnt_offset;
1089 		if (snap->mnts_count == 0 || idx == 0) {
1090 			if ((error =
1091 			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK)) != 0)
1092 				return (error);
1093 		}
1094 		/*
1095 		 * If the next index is beyond the end of the current mnttab,
1096 		 * return EOF
1097 		 */
1098 		if (idx >= snap->mnts_count) {
1099 			*rvalp = 1;
1100 			return (0);
1101 		}
1102 
1103 #ifdef _SYSCALL32_IMPL
1104 		if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) {
1105 			addr = (uintptr_t)(snap->mnts_metadata + idx *
1106 			    sizeof (struct extmnttab32));
1107 			error = suword32((void *)arg, addr);
1108 		} else {
1109 #endif
1110 			addr = (uintptr_t)(snap->mnts_metadata + idx *
1111 			    sizeof (struct extmnttab));
1112 			error = sulword((void *)arg, addr);
1113 #ifdef _SYSCALL32_IMPL
1114 		}
1115 #endif
1116 
1117 		if (error != 0)
1118 			return (error);
1119 
1120 		mnp->mnt_offset++;
1121 		break;
1122 	}
1123 
1124 	default:
1125 		error = EINVAL;
1126 		break;
1127 	}
1128 
1129 	return (error);
1130 }
1131 
/*
 * /mntfs vnode operations vector
 *
 * Each entry pairs an operation name with its handler in this file.  The
 * dispose and shrlock operations are routed to fs_error.  The table is
 * terminated by a NULL/NULL entry.
 */
const fs_operation_def_t mnt_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = mntopen },
	VOPNAME_CLOSE,		{ .vop_close = mntclose },
	VOPNAME_READ,		{ .vop_read = mntread },
	VOPNAME_IOCTL,		{ .vop_ioctl = mntioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = mntgetattr },
	VOPNAME_ACCESS,		{ .vop_access = mntaccess },
	VOPNAME_FSYNC,		{ .vop_fsync = mntfsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = mntinactive },
	VOPNAME_SEEK,		{ .vop_seek = mntseek },
	VOPNAME_POLL,		{ .vop_poll = mntpoll },
	VOPNAME_DISPOSE,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	NULL,			NULL	/* end-of-table marker */
};
1150