xref: /titanic_52/usr/src/uts/common/fs/zfs/zfs_ctldir.c (revision 35497fcdac20037e6061dd2eb9250b6f12a36644)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * ZFS control directory (a.k.a. ".zfs")
30  *
31  * This directory provides a common location for all ZFS meta-objects.
32  * Currently, this is only the 'snapshot' directory, but this may expand in the
33  * future.  The elements are built using the GFS primitives, as the hierarchy
34  * does not actually exist on disk.
35  *
36  * For 'snapshot', we don't want to have all snapshots always mounted, because
37  * this would take up a huge amount of space in /etc/mnttab.  We have three
38  * types of objects:
39  *
40  * 	ctldir ------> snapshotdir -------> snapshot
41  *                                             |
42  *                                             |
43  *                                             V
44  *                                         mounted fs
45  *
46  * The 'snapshot' node contains just enough information to lookup '..' and act
47  * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
48  * perform an automount of the underlying filesystem and return the
49  * corresponding vnode.
50  *
51  * All mounts are handled automatically by the kernel, but unmounts are
52  * (currently) handled from user land.  The main reason is that there is no
53  * reliable way to auto-unmount the filesystem when it's "no longer in use".
54  * When the user unmounts a filesystem, we call zfsctl_umount_snapshots(),
55  * which unmounts any snapshots within the snapshot directory.
56  */
57 
58 #include <fs/fs_subr.h>
59 #include <sys/zfs_ctldir.h>
60 #include <sys/zfs_ioctl.h>
61 #include <sys/zfs_vfsops.h>
62 #include <sys/gfs.h>
63 #include <sys/stat.h>
64 #include <sys/dmu.h>
65 #include <sys/mount.h>
66 
/*
 * One entry in a snapshot directory's AVL tree (sd_snaps).  Entries are
 * keyed and ordered by se_name; see snapentry_compare().
 */
typedef struct {
	/* snapshot name; a kmem_alloc()ed copy of strlen(name) + 1 bytes */
	char		*se_name;
	/* GFS vnode created for this snapshot; used as its mount point */
	vnode_t		*se_root;
	/* linkage into the sd_snaps tree */
	avl_node_t	se_node;
} zfs_snapentry_t;
72 
73 static int
74 snapentry_compare(const void *a, const void *b)
75 {
76 	const zfs_snapentry_t *sa = a;
77 	const zfs_snapentry_t *sb = b;
78 	int ret = strcmp(sa->se_name, sb->se_name);
79 
80 	if (ret < 0)
81 		return (-1);
82 	else if (ret > 0)
83 		return (1);
84 	else
85 		return (0);
86 }
87 
/*
 * Vnode operation vectors for the three kinds of .zfs nodes.  They are
 * registered in zfsctl_opsvec, which zfsctl_init() hands to
 * gfs_make_opsvec(); zfsctl_fini() frees them and resets them to NULL.
 */
vnodeops_t *zfsctl_ops_root;
vnodeops_t *zfsctl_ops_snapdir;
vnodeops_t *zfsctl_ops_snapshot;

/* Operation templates; the definitions appear later in this file. */
static const fs_operation_def_t zfsctl_tops_root[];
static const fs_operation_def_t zfsctl_tops_snapdir[];
static const fs_operation_def_t zfsctl_tops_snapshot[];

/* Node constructors that are referenced before they are defined. */
static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
98 
/*
 * Table passed to gfs_make_opsvec() by zfsctl_init().  Each row pairs a
 * name and an operation template with the vnodeops_t pointer associated
 * with it.  The table is terminated by a NULL entry.
 */
static gfs_opsvec_t zfsctl_opsvec[] = {
	{ ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
	{ ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
	{ ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
	{ NULL }
};
105 
/*
 * Private data (v_data) of a .zfs vnode.
 */
typedef struct zfsctl_node {
	/* GFS directory state; presumably must stay first for GFS - confirm */
	gfs_dir_t	zc_gfs_private;
	/*
	 * Identifier encoded into file IDs by zfsctl_common_fid():
	 * ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, or a snapshot's objset id.
	 */
	uint64_t	zc_id;
} zfsctl_node_t;
110 
/*
 * Private data (v_data) of the '.zfs/snapshot' directory vnode.
 */
typedef struct zfsctl_snapdir {
	/* common .zfs node state */
	zfsctl_node_t	sd_node;
	/* held while sd_snaps is searched or modified */
	kmutex_t	sd_lock;
	/* tree of zfs_snapentry_t, ordered by snapshot name */
	avl_tree_t	sd_snaps;
} zfsctl_snapdir_t;
116 
/*
 * Root directory elements.  We have only a single static entry, 'snapshot'.
 * Its vnode is built by zfsctl_mknode_snapdir(), and the entry is flagged
 * GFS_CACHE_VNODE.  The array is terminated by a NULL entry and is passed
 * to gfs_root_create() by zfsctl_create().
 */
static gfs_dirent_t zfsctl_root_entries[] = {
	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
	{ NULL }
};
124 
/*
 * Number of entries reported for the '.zfs' directory, including '.' and
 * '..' in the calculation.  The array element count already includes the
 * NULL terminator, so adding one more accounts for both '.' and '..' on top
 * of the real entries.
 */
#define	NROOT_ENTRIES	((sizeof (zfsctl_root_entries) / \
    sizeof (gfs_dirent_t)) + 1)
128 
129 
/*
 * Initialize the various GFS pieces we'll need to create and manipulate .zfs
 * directories.  This is called from the ZFS init routine, and initializes the
 * vnode ops vectors that we'll be using.
 *
 * The call is wrapped in VERIFY(), so a non-zero return from
 * gfs_make_opsvec() is treated as fatal.
 */
void
zfsctl_init(void)
{
	VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
}
140 
141 void
142 zfsctl_fini(void)
143 {
144 	/*
145 	 * Remove vfsctl vnode ops
146 	 */
147 	if (zfsctl_ops_root)
148 		vn_freevnodeops(zfsctl_ops_root);
149 	if (zfsctl_ops_snapdir)
150 		vn_freevnodeops(zfsctl_ops_snapdir);
151 	if (zfsctl_ops_snapshot)
152 		vn_freevnodeops(zfsctl_ops_snapshot);
153 
154 	zfsctl_ops_root = NULL;
155 	zfsctl_ops_snapdir = NULL;
156 	zfsctl_ops_snapshot = NULL;
157 }
158 
/*
 * Return the inode number associated with the 'snapshot' directory.
 *
 * zfsctl_create() passes this routine to gfs_root_create() as the inode
 * callback for the static root entries.  'snapshot' is the only such entry,
 * so the index is asserted to be 0.
 */
/* ARGSUSED */
static ino64_t
zfsctl_root_inode_cb(vnode_t *vp, int index)
{
	ASSERT(index == 0);
	return (ZFSCTL_INO_SNAPDIR);
}
169 
170 /*
171  * Create the '.zfs' directory.  This directory is cached as part of the VFS
172  * structure.  This results in a hold on the vfs_t.  The code in zfs_umount()
173  * therefore checks against a vfs_count of 2 instead of 1.  This reference
174  * is removed when the ctldir is destroyed in the unmount.
175  */
176 void
177 zfsctl_create(zfsvfs_t *zfsvfs)
178 {
179 	vnode_t *vp;
180 	zfsctl_node_t *zcp;
181 
182 	ASSERT(zfsvfs->z_ctldir == NULL);
183 
184 	vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
185 	    zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
186 	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
187 	zcp = vp->v_data;
188 	zcp->zc_id = ZFSCTL_INO_ROOT;
189 
190 	/*
191 	 * We're only faking the fact that we have a root of a filesystem for
192 	 * the sake of the GFS interfaces.  Undo the flag manipulation it did
193 	 * for us.
194 	 */
195 	vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT);
196 
197 	zfsvfs->z_ctldir = vp;
198 }
199 
/*
 * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
 * There might still be more references if we were force unmounted, but only
 * new zfs_inactive() calls can occur and they don't reference .zfs
 *
 * Releases the hold on the cached vnode and clears zfsvfs->z_ctldir.
 */
void
zfsctl_destroy(zfsvfs_t *zfsvfs)
{
	VN_RELE(zfsvfs->z_ctldir);
	zfsvfs->z_ctldir = NULL;
}
211 
212 /*
213  * Given a root znode, retrieve the associated .zfs directory.
214  * Add a hold to the vnode and return it.
215  */
216 vnode_t *
217 zfsctl_root(znode_t *zp)
218 {
219 	ASSERT(zfs_has_ctldir(zp));
220 	VN_HOLD(zp->z_zfsvfs->z_ctldir);
221 	return (zp->z_zfsvfs->z_ctldir);
222 }
223 
224 /*
225  * Common open routine.  Disallow any write access.
226  */
227 /* ARGSUSED */
228 static int
229 zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr)
230 {
231 	if (flags & FWRITE)
232 		return (EACCES);
233 
234 	return (0);
235 }
236 
/*
 * Common close routine.  Nothing to do here: all arguments are ignored and
 * the call always succeeds.
 */
/* ARGSUSED */
static int
zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off,
    cred_t *cr)
{
	return (0);
}
247 
248 /*
249  * Common access routine.  Disallow writes.
250  */
251 /* ARGSUSED */
252 static int
253 zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr)
254 {
255 	if (mode & VWRITE)
256 		return (EACCES);
257 
258 	return (0);
259 }
260 
261 /*
262  * Common getattr function.  Fill in basic information.
263  */
264 static void
265 zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
266 {
267 	timestruc_t now;
268 
269 	vap->va_uid = 0;
270 	vap->va_gid = 0;
271 	vap->va_rdev = 0;
272 	/*
273 	 * We are a purly virtual object, so we have no
274 	 * blocksize or allocated blocks.
275 	 */
276 	vap->va_blksize = 0;
277 	vap->va_nblocks = 0;
278 	vap->va_seq = 0;
279 	vap->va_fsid = vp->v_vfsp->vfs_dev;
280 	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
281 	    S_IROTH | S_IXOTH;
282 	vap->va_type = VDIR;
283 	/*
284 	 * We live in the now.
285 	 */
286 	gethrestime(&now);
287 	vap->va_mtime = vap->va_ctime = vap->va_atime = now;
288 }
289 
290 static int
291 zfsctl_common_fid(vnode_t *vp, fid_t *fidp)
292 {
293 	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
294 	zfsctl_node_t	*zcp = vp->v_data;
295 	uint64_t	object = zcp->zc_id;
296 	zfid_short_t	*zfid;
297 	int		i;
298 
299 	ZFS_ENTER(zfsvfs);
300 
301 	if (fidp->fid_len < SHORT_FID_LEN) {
302 		fidp->fid_len = SHORT_FID_LEN;
303 		ZFS_EXIT(zfsvfs);
304 		return (ENOSPC);
305 	}
306 
307 	zfid = (zfid_short_t *)fidp;
308 
309 	zfid->zf_len = SHORT_FID_LEN;
310 
311 	for (i = 0; i < sizeof (zfid->zf_object); i++)
312 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
313 
314 	/* .zfs znodes always have a generation number of 0 */
315 	for (i = 0; i < sizeof (zfid->zf_gen); i++)
316 		zfid->zf_gen[i] = 0;
317 
318 	ZFS_EXIT(zfsvfs);
319 	return (0);
320 }
321 
/*
 * .zfs inode namespace
 *
 * We need to generate unique inode numbers for all files and directories
 * within the .zfs pseudo-filesystem.  We use the following scheme:
 *
 * 	ENTRY			ZFSCTL_INODE
 * 	.zfs			1
 * 	.zfs/snapshot		2
 * 	.zfs/snapshot/<snap>	objectid(snap)
 */

/* Inode number of a snapshot entry: the snapshot's id, used unchanged. */
#define	ZFSCTL_INO_SNAP(id)	(id)
335 
336 /*
337  * Get root directory attributes.
338  */
339 /* ARGSUSED */
340 static int
341 zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
342 {
343 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
344 
345 	ZFS_ENTER(zfsvfs);
346 	vap->va_nodeid = ZFSCTL_INO_ROOT;
347 	vap->va_nlink = vap->va_size = NROOT_ENTRIES;
348 
349 	zfsctl_common_getattr(vp, vap);
350 	ZFS_EXIT(zfsvfs);
351 
352 	return (0);
353 }
354 
355 /*
356  * Special case the handling of "..".
357  */
358 /* ARGSUSED */
359 int
360 zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
361     int flags, vnode_t *rdir, cred_t *cr)
362 {
363 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
364 	int err;
365 
366 	ZFS_ENTER(zfsvfs);
367 
368 	if (strcmp(nm, "..") == 0) {
369 		err = VFS_ROOT(dvp->v_vfsp, vpp);
370 	} else {
371 		err = gfs_dir_lookup(dvp, nm, vpp);
372 	}
373 
374 	ZFS_EXIT(zfsvfs);
375 
376 	return (err);
377 }
378 
/*
 * Vnode operation template for the '.zfs' directory; registered through
 * zfsctl_opsvec.  Lookup and getattr are specific to the root, while open,
 * close, access and fid use the common .zfs routines.
 */
static const fs_operation_def_t zfsctl_tops_root[] = {
	{ VOPNAME_OPEN,		zfsctl_common_open			},
	{ VOPNAME_CLOSE,	zfsctl_common_close			},
	{ VOPNAME_IOCTL,	fs_inval				},
	{ VOPNAME_GETATTR,	zfsctl_root_getattr			},
	{ VOPNAME_ACCESS,	zfsctl_common_access			},
	{ VOPNAME_READDIR,	gfs_vop_readdir				},
	{ VOPNAME_LOOKUP,	zfsctl_root_lookup			},
	{ VOPNAME_SEEK,		fs_seek					},
	{ VOPNAME_INACTIVE,	(fs_generic_func_p) gfs_vop_inactive	},
	{ VOPNAME_FID,		zfsctl_common_fid			},
	{ NULL }
};
392 
393 static int
394 zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
395 {
396 	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
397 
398 	dmu_objset_name(os, zname);
399 	if (strlen(zname) + 1 + strlen(name) >= len)
400 		return (ENAMETOOLONG);
401 	(void) strcat(zname, "@");
402 	(void) strcat(zname, name);
403 	return (0);
404 }
405 
406 static int
407 zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr)
408 {
409 	zfsctl_snapdir_t *sdp = dvp->v_data;
410 	zfs_snapentry_t search, *sep;
411 	avl_index_t where;
412 	int err;
413 
414 	ASSERT(MUTEX_HELD(&sdp->sd_lock));
415 
416 	search.se_name = (char *)name;
417 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL)
418 		return (ENOENT);
419 
420 	ASSERT(vn_ismntpt(sep->se_root));
421 
422 	/* this will be dropped by dounmount() */
423 	if ((err = vn_vfswlock(sep->se_root)) != 0)
424 		return (err);
425 
426 	VN_HOLD(sep->se_root);
427 	if ((err = dounmount(vn_mountedvfs(sep->se_root), force, kcred)) != 0)
428 		return (err);
429 	ASSERT(sep->se_root->v_count == 1);
430 	gfs_vop_inactive(sep->se_root, cr);
431 
432 	avl_remove(&sdp->sd_snaps, sep);
433 	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
434 	kmem_free(sep, sizeof (zfs_snapentry_t));
435 
436 	return (0);
437 }
438 
439 
440 static void
441 zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
442 {
443 	avl_index_t where;
444 	vfs_t *vfsp;
445 	refstr_t *pathref;
446 	char newpath[MAXNAMELEN];
447 	const char *oldpath;
448 	char *tail;
449 
450 	ASSERT(MUTEX_HELD(&sdp->sd_lock));
451 	ASSERT(sep != NULL);
452 
453 	vfsp = vn_mountedvfs(sep->se_root);
454 	ASSERT(vfsp != NULL);
455 
456 	vfs_lock_wait(vfsp);
457 
458 	/*
459 	 * Change the name in the AVL tree.
460 	 */
461 	avl_remove(&sdp->sd_snaps, sep);
462 	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
463 	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
464 	(void) strcpy(sep->se_name, nm);
465 	VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
466 	avl_insert(&sdp->sd_snaps, sep, where);
467 
468 	/*
469 	 * Change the current mountpoint info:
470 	 * 	- update the tail of the mntpoint path
471 	 *	- update the tail of the resource path
472 	 */
473 	pathref = vfs_getmntpoint(vfsp);
474 	oldpath = refstr_value(pathref);
475 	VERIFY((tail = strrchr(oldpath, '/')) != NULL);
476 	ASSERT((tail - oldpath) + strlen(nm) + 2 < MAXNAMELEN);
477 	(void) strncpy(newpath, oldpath, tail - oldpath + 1);
478 	(void) strcat(newpath, nm);
479 	refstr_rele(pathref);
480 	vfs_setmntpoint(vfsp, newpath);
481 
482 	pathref = vfs_getresource(vfsp);
483 	oldpath = refstr_value(pathref);
484 	VERIFY((tail = strrchr(oldpath, '@')) != NULL);
485 	ASSERT((tail - oldpath) + strlen(nm) + 2 < MAXNAMELEN);
486 	(void) strncpy(newpath, oldpath, tail - oldpath + 1);
487 	(void) strcat(newpath, nm);
488 	refstr_rele(pathref);
489 	vfs_setresource(vfsp, newpath);
490 
491 	vfs_unlock(vfsp);
492 }
493 
494 static int
495 zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
496     cred_t *cr)
497 {
498 	zfsctl_snapdir_t *sdp = sdvp->v_data;
499 	zfs_snapentry_t search, *sep;
500 	avl_index_t where;
501 	char from[MAXNAMELEN], to[MAXNAMELEN];
502 	int err;
503 
504 	err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
505 	if (err)
506 		return (err);
507 	err = zfs_secpolicy_write(from, NULL, cr);
508 	if (err)
509 		return (err);
510 
511 	/*
512 	 * Cannot move snapshots out of the snapdir.
513 	 */
514 	if (sdvp != tdvp)
515 		return (EINVAL);
516 
517 	if (strcmp(snm, tnm) == 0)
518 		return (0);
519 
520 	err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
521 	if (err)
522 		return (err);
523 
524 	mutex_enter(&sdp->sd_lock);
525 
526 	search.se_name = (char *)snm;
527 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
528 		mutex_exit(&sdp->sd_lock);
529 		return (ENOENT);
530 	}
531 
532 	err = dmu_objset_rename(from, to);
533 	if (err == 0)
534 		zfsctl_rename_snap(sdp, sep, tnm);
535 
536 	mutex_exit(&sdp->sd_lock);
537 
538 	return (err);
539 }
540 
541 /* ARGSUSED */
542 static int
543 zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
544 {
545 	zfsctl_snapdir_t *sdp = dvp->v_data;
546 	char snapname[MAXNAMELEN];
547 	int err;
548 
549 	err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
550 	if (err)
551 		return (err);
552 	err = zfs_secpolicy_write(snapname, NULL, cr);
553 	if (err)
554 		return (err);
555 
556 	mutex_enter(&sdp->sd_lock);
557 
558 	err = zfsctl_unmount_snap(dvp, name, 0, cr);
559 	if (err) {
560 		mutex_exit(&sdp->sd_lock);
561 		return (err);
562 	}
563 
564 	err = dmu_objset_destroy(snapname);
565 
566 	mutex_exit(&sdp->sd_lock);
567 
568 	return (err);
569 }
570 
571 /*
572  * Lookup entry point for the 'snapshot' directory.  Try to open the
573  * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
574  * Perform a mount of the associated dataset on top of the vnode.
575  */
576 /* ARGSUSED */
577 static int
578 zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
579     int flags, vnode_t *rdir, cred_t *cr)
580 {
581 	zfsctl_snapdir_t *sdp = dvp->v_data;
582 	objset_t *snap;
583 	char snapname[MAXNAMELEN];
584 	char *mountpoint;
585 	zfs_snapentry_t *sep, search;
586 	struct mounta margs;
587 	vfs_t *vfsp;
588 	size_t mountpoint_len;
589 	avl_index_t where;
590 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
591 	int err;
592 
593 	ASSERT(dvp->v_type == VDIR);
594 
595 	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
596 		return (0);
597 
598 	/*
599 	 * If we get a recursive call, that means we got called
600 	 * from the domount() code while it was trying to look up the
601 	 * spec (which looks like a local path for zfs).  We need to
602 	 * add some flag to domount() to tell it not to do this lookup.
603 	 */
604 	if (MUTEX_HELD(&sdp->sd_lock))
605 		return (ENOENT);
606 
607 	ZFS_ENTER(zfsvfs);
608 
609 	mutex_enter(&sdp->sd_lock);
610 	search.se_name = (char *)nm;
611 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
612 		*vpp = sep->se_root;
613 		VN_HOLD(*vpp);
614 		/*
615 		 * If the snapshot was unmounted behind our backs,
616 		 * try to remount it.
617 		 */
618 		if (traverse(vpp) != 0) {
619 			ASSERT(!vn_ismntpt(*vpp));
620 			goto domount;
621 		}
622 		mutex_exit(&sdp->sd_lock);
623 		ZFS_EXIT(zfsvfs);
624 		return (0);
625 	}
626 
627 	/*
628 	 * The requested snapshot is not currently mounted, look it up.
629 	 */
630 	err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
631 	if (err) {
632 		mutex_exit(&sdp->sd_lock);
633 		ZFS_EXIT(zfsvfs);
634 		return (err);
635 	}
636 	if (dmu_objset_open(snapname, DMU_OST_ZFS,
637 	    DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) {
638 		mutex_exit(&sdp->sd_lock);
639 		ZFS_EXIT(zfsvfs);
640 		return (ENOENT);
641 	}
642 
643 	sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
644 	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
645 	(void) strcpy(sep->se_name, nm);
646 	*vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
647 	avl_insert(&sdp->sd_snaps, sep, where);
648 
649 	dmu_objset_close(snap);
650 domount:
651 	mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) +
652 	    strlen("/.zfs/snapshot/") + strlen(nm) + 1;
653 	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
654 	(void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
655 	    refstr_value(dvp->v_vfsp->vfs_mntpt), nm);
656 
657 	margs.spec = snapname;
658 	margs.dir = mountpoint;
659 	margs.flags = MS_SYSSPACE | MS_NOMNTTAB;
660 	margs.fstype = "zfs";
661 	margs.dataptr = NULL;
662 	margs.datalen = 0;
663 	margs.optptr = NULL;
664 	margs.optlen = 0;
665 
666 	err = domount("zfs", &margs, *vpp, kcred, &vfsp);
667 	kmem_free(mountpoint, mountpoint_len);
668 
669 	if (err == 0) {
670 		/*
671 		 * Return the mounted root rather than the covered mount point.
672 		 */
673 		VFS_RELE(vfsp);
674 		err = traverse(vpp);
675 	}
676 
677 	if (err == 0) {
678 		/*
679 		 * Fix up the root vnode.
680 		 */
681 		ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
682 		VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
683 		(*vpp)->v_vfsp = zfsvfs->z_vfs;
684 		(*vpp)->v_flag &= ~VROOT;
685 	}
686 	mutex_exit(&sdp->sd_lock);
687 	ZFS_EXIT(zfsvfs);
688 
689 	/*
690 	 * If we had an error, drop our hold on the vnode and
691 	 * zfsctl_snapshot_inactive() will clean up.
692 	 */
693 	if (err) {
694 		VN_RELE(*vpp);
695 		*vpp = NULL;
696 	}
697 	return (err);
698 }
699 
700 /* ARGSUSED */
701 static int
702 zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
703     offset_t *offp, offset_t *nextp, void *data)
704 {
705 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
706 	char snapname[MAXNAMELEN];
707 	uint64_t id, cookie;
708 
709 	ZFS_ENTER(zfsvfs);
710 
711 	cookie = *offp;
712 	if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
713 	    &cookie) == ENOENT) {
714 		*eofp = 1;
715 		ZFS_EXIT(zfsvfs);
716 		return (0);
717 	}
718 
719 	(void) strcpy(dp->d_name, snapname);
720 	dp->d_ino = ZFSCTL_INO_SNAP(id);
721 	*nextp = cookie;
722 
723 	ZFS_EXIT(zfsvfs);
724 
725 	return (0);
726 }
727 
728 vnode_t *
729 zfsctl_mknode_snapdir(vnode_t *pvp)
730 {
731 	vnode_t *vp;
732 	zfsctl_snapdir_t *sdp;
733 
734 	vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp,
735 	    zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
736 	    zfsctl_snapdir_readdir_cb, NULL);
737 	sdp = vp->v_data;
738 	sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
739 	mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
740 	avl_create(&sdp->sd_snaps, snapentry_compare,
741 	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
742 	return (vp);
743 }
744 
745 /* ARGSUSED */
746 static int
747 zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
748 {
749 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
750 	zfsctl_snapdir_t *sdp = vp->v_data;
751 
752 	ZFS_ENTER(zfsvfs);
753 	zfsctl_common_getattr(vp, vap);
754 	vap->va_nodeid = gfs_file_inode(vp);
755 	vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
756 	ZFS_EXIT(zfsvfs);
757 
758 	return (0);
759 }
760 
761 /* ARGSUSED */
762 static void
763 zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr)
764 {
765 	zfsctl_snapdir_t *sdp = vp->v_data;
766 	void *private;
767 
768 	private = gfs_dir_inactive(vp);
769 	if (private != NULL) {
770 		ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
771 		mutex_destroy(&sdp->sd_lock);
772 		avl_destroy(&sdp->sd_snaps);
773 		kmem_free(private, sizeof (zfsctl_snapdir_t));
774 	}
775 }
776 
/*
 * Vnode operation template for the '.zfs/snapshot' directory; registered
 * through zfsctl_opsvec.  In addition to the operations the root directory
 * has, it supports rename and rmdir, which rename and destroy snapshots.
 */
static const fs_operation_def_t zfsctl_tops_snapdir[] = {
	{ VOPNAME_OPEN,		zfsctl_common_open			},
	{ VOPNAME_CLOSE,	zfsctl_common_close			},
	{ VOPNAME_IOCTL,	fs_inval				},
	{ VOPNAME_GETATTR,	zfsctl_snapdir_getattr			},
	{ VOPNAME_ACCESS,	zfsctl_common_access			},
	{ VOPNAME_RENAME,	zfsctl_snapdir_rename			},
	{ VOPNAME_RMDIR,	zfsctl_snapdir_remove			},
	{ VOPNAME_READDIR,	gfs_vop_readdir				},
	{ VOPNAME_LOOKUP,	zfsctl_snapdir_lookup			},
	{ VOPNAME_SEEK,		fs_seek					},
	{ VOPNAME_INACTIVE,	(fs_generic_func_p) zfsctl_snapdir_inactive },
	{ VOPNAME_FID,		zfsctl_common_fid			},
	{ NULL }
};
792 
793 static vnode_t *
794 zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
795 {
796 	vnode_t *vp;
797 	zfsctl_node_t *zcp;
798 
799 	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
800 	    zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
801 	zcp = vp->v_data;
802 	zcp->zc_id = objset;
803 
804 	return (vp);
805 }
806 
807 static void
808 zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr)
809 {
810 	zfsctl_snapdir_t *sdp;
811 	zfs_snapentry_t *sep, *next;
812 	vnode_t *dvp;
813 
814 	VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0);
815 	sdp = dvp->v_data;
816 
817 	mutex_enter(&sdp->sd_lock);
818 
819 	if (vp->v_count > 1) {
820 		mutex_exit(&sdp->sd_lock);
821 		return;
822 	}
823 	ASSERT(!vn_ismntpt(vp));
824 
825 	sep = avl_first(&sdp->sd_snaps);
826 	while (sep != NULL) {
827 		next = AVL_NEXT(&sdp->sd_snaps, sep);
828 
829 		if (sep->se_root == vp) {
830 			avl_remove(&sdp->sd_snaps, sep);
831 			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
832 			kmem_free(sep, sizeof (zfs_snapentry_t));
833 			break;
834 		}
835 		sep = next;
836 	}
837 	ASSERT(sep != NULL);
838 
839 	mutex_exit(&sdp->sd_lock);
840 	VN_RELE(dvp);
841 
842 	/*
843 	 * Dispose of the vnode for the snapshot mount point.
844 	 * This is safe to do because once this entry has been removed
845 	 * from the AVL tree, it can't be found again, so cannot become
846 	 * "active".  If we lookup the same name again we will end up
847 	 * creating a new vnode.
848 	 */
849 	gfs_vop_inactive(vp, cr);
850 }
851 
852 
853 /*
854  * These VP's should never see the light of day.  They should always
855  * be covered.
856  */
857 static const fs_operation_def_t zfsctl_tops_snapshot[] = {
858 	VOPNAME_INACTIVE, (fs_generic_func_p) zfsctl_snapshot_inactive,
859 	NULL, NULL
860 };
861 
862 int
863 zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
864 {
865 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
866 	vnode_t *dvp, *vp;
867 	zfsctl_snapdir_t *sdp;
868 	zfsctl_node_t *zcp;
869 	zfs_snapentry_t *sep;
870 	int error;
871 
872 	ASSERT(zfsvfs->z_ctldir != NULL);
873 	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
874 	    NULL, 0, NULL, kcred);
875 	if (error != 0)
876 		return (error);
877 	sdp = dvp->v_data;
878 
879 	mutex_enter(&sdp->sd_lock);
880 	sep = avl_first(&sdp->sd_snaps);
881 	while (sep != NULL) {
882 		vp = sep->se_root;
883 		zcp = vp->v_data;
884 		if (zcp->zc_id == objsetid)
885 			break;
886 
887 		sep = AVL_NEXT(&sdp->sd_snaps, sep);
888 	}
889 
890 	if (sep != NULL) {
891 		VN_HOLD(vp);
892 		error = traverse(&vp);
893 		if (error == 0)
894 			*zfsvfsp = VTOZ(vp)->z_zfsvfs;
895 		VN_RELE(vp);
896 	} else {
897 		error = EINVAL;
898 	}
899 
900 	mutex_exit(&sdp->sd_lock);
901 	VN_RELE(dvp);
902 
903 	return (error);
904 }
905 
906 /*
907  * Unmount any snapshots for the given filesystem.  This is called from
908  * zfs_umount() - if we have a ctldir, then go through and unmount all the
909  * snapshots.
910  */
911 int
912 zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
913 {
914 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
915 	vnode_t *dvp, *svp;
916 	zfsctl_snapdir_t *sdp;
917 	zfs_snapentry_t *sep, *next;
918 	int error;
919 
920 	ASSERT(zfsvfs->z_ctldir != NULL);
921 	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
922 	    NULL, 0, NULL, cr);
923 	if (error != 0)
924 		return (error);
925 	sdp = dvp->v_data;
926 
927 	mutex_enter(&sdp->sd_lock);
928 
929 	sep = avl_first(&sdp->sd_snaps);
930 	while (sep != NULL) {
931 		svp = sep->se_root;
932 		next = AVL_NEXT(&sdp->sd_snaps, sep);
933 
934 		/*
935 		 * If this snapshot is not mounted, then it must
936 		 * have just been unmounted by somebody else, and
937 		 * will be cleaned up by zfsctl_snapdir_inactive().
938 		 */
939 		if (vn_ismntpt(svp)) {
940 			if ((error = vn_vfswlock(svp)) != 0)
941 				goto out;
942 
943 			VN_HOLD(svp);
944 			error = dounmount(vn_mountedvfs(svp), fflags, cr);
945 			if (error) {
946 				VN_RELE(svp);
947 				goto out;
948 			}
949 
950 			avl_remove(&sdp->sd_snaps, sep);
951 			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
952 			kmem_free(sep, sizeof (zfs_snapentry_t));
953 
954 			/*
955 			 * We can't use VN_RELE(), as that will try to
956 			 * invoke zfsctl_snapdir_inactive(), and that
957 			 * would lead to an attempt to re-grab the sd_lock.
958 			 */
959 			ASSERT3U(svp->v_count, ==, 1);
960 			gfs_vop_inactive(svp, cr);
961 		}
962 		sep = next;
963 	}
964 out:
965 	mutex_exit(&sdp->sd_lock);
966 	VN_RELE(dvp);
967 
968 	return (error);
969 }
970