xref: /titanic_50/usr/src/uts/common/fs/zfs/zfs_ctldir.c (revision 76939ce0e89c177cb48bf98208fd3d831eb283d5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * ZFS control directory (a.k.a. ".zfs")
30  *
31  * This directory provides a common location for all ZFS meta-objects.
32  * Currently, this is only the 'snapshot' directory, but this may expand in the
33  * future.  The elements are built using the GFS primitives, as the hierarchy
34  * does not actually exist on disk.
35  *
36  * For 'snapshot', we don't want to have all snapshots always mounted, because
37  * this would take up a huge amount of space in /etc/mnttab.  We have three
38  * types of objects:
39  *
40  * 	ctldir ------> snapshotdir -------> snapshot
41  *                                             |
42  *                                             |
43  *                                             V
44  *                                         mounted fs
45  *
 * The 'snapshot' node contains just enough information to look up '..' and
 * act as a mountpoint for the snapshot.  Whenever we look up a specific
 * snapshot, we perform an automount of the underlying filesystem and return
 * the corresponding vnode.
50  *
 * All mounts are handled automatically by the kernel, but unmounts are
 * (currently) handled from user land.  The main reason is that there is no
 * reliable way to auto-unmount the filesystem when it's "no longer in use".
 * When the user unmounts a filesystem, zfs_umount() calls
 * zfsctl_umount_snapshots(), which unmounts any snapshots mounted within
 * the snapshot directory.
56  */
57 
58 #include <fs/fs_subr.h>
59 #include <sys/zfs_ctldir.h>
60 #include <sys/zfs_ioctl.h>
61 #include <sys/zfs_vfsops.h>
62 #include <sys/vfs_opreg.h>
63 #include <sys/gfs.h>
64 #include <sys/stat.h>
65 #include <sys/dmu.h>
66 #include <sys/dsl_deleg.h>
67 #include <sys/mount.h>
68 
69 typedef struct {
70 	char		*se_name;
71 	vnode_t		*se_root;
72 	avl_node_t	se_node;
73 } zfs_snapentry_t;
74 
75 static int
76 snapentry_compare(const void *a, const void *b)
77 {
78 	const zfs_snapentry_t *sa = a;
79 	const zfs_snapentry_t *sb = b;
80 	int ret = strcmp(sa->se_name, sb->se_name);
81 
82 	if (ret < 0)
83 		return (-1);
84 	else if (ret > 0)
85 		return (1);
86 	else
87 		return (0);
88 }
89 
90 vnodeops_t *zfsctl_ops_root;
91 vnodeops_t *zfsctl_ops_snapdir;
92 vnodeops_t *zfsctl_ops_snapshot;
93 
94 static const fs_operation_def_t zfsctl_tops_root[];
95 static const fs_operation_def_t zfsctl_tops_snapdir[];
96 static const fs_operation_def_t zfsctl_tops_snapshot[];
97 
98 static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
99 static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
100 
101 static gfs_opsvec_t zfsctl_opsvec[] = {
102 	{ ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
103 	{ ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
104 	{ ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
105 	{ NULL }
106 };
107 
108 typedef struct zfsctl_node {
109 	gfs_dir_t	zc_gfs_private;
110 	uint64_t	zc_id;
111 	timestruc_t	zc_cmtime;	/* ctime and mtime, always the same */
112 } zfsctl_node_t;
113 
114 typedef struct zfsctl_snapdir {
115 	zfsctl_node_t	sd_node;
116 	kmutex_t	sd_lock;
117 	avl_tree_t	sd_snaps;
118 } zfsctl_snapdir_t;
119 
120 /*
121  * Root directory elements.  We have only a single static entry, 'snapshot'.
122  */
123 static gfs_dirent_t zfsctl_root_entries[] = {
124 	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
125 	{ NULL }
126 };
127 
128 /* include . and .. in the calculation */
129 #define	NROOT_ENTRIES	((sizeof (zfsctl_root_entries) / \
130     sizeof (gfs_dirent_t)) + 1)
131 
132 
133 /*
134  * Initialize the various GFS pieces we'll need to create and manipulate .zfs
135  * directories.  This is called from the ZFS init routine, and initializes the
136  * vnode ops vectors that we'll be using.
137  */
138 void
139 zfsctl_init(void)
140 {
141 	VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
142 }
143 
144 void
145 zfsctl_fini(void)
146 {
147 	/*
148 	 * Remove vfsctl vnode ops
149 	 */
150 	if (zfsctl_ops_root)
151 		vn_freevnodeops(zfsctl_ops_root);
152 	if (zfsctl_ops_snapdir)
153 		vn_freevnodeops(zfsctl_ops_snapdir);
154 	if (zfsctl_ops_snapshot)
155 		vn_freevnodeops(zfsctl_ops_snapshot);
156 
157 	zfsctl_ops_root = NULL;
158 	zfsctl_ops_snapdir = NULL;
159 	zfsctl_ops_snapshot = NULL;
160 }
161 
162 /*
163  * Return the inode number associated with the 'snapshot' directory.
164  */
165 /* ARGSUSED */
166 static ino64_t
167 zfsctl_root_inode_cb(vnode_t *vp, int index)
168 {
169 	ASSERT(index == 0);
170 	return (ZFSCTL_INO_SNAPDIR);
171 }
172 
173 /*
174  * Create the '.zfs' directory.  This directory is cached as part of the VFS
175  * structure.  This results in a hold on the vfs_t.  The code in zfs_umount()
176  * therefore checks against a vfs_count of 2 instead of 1.  This reference
177  * is removed when the ctldir is destroyed in the unmount.
178  */
179 void
180 zfsctl_create(zfsvfs_t *zfsvfs)
181 {
182 	vnode_t *vp, *rvp;
183 	zfsctl_node_t *zcp;
184 
185 	ASSERT(zfsvfs->z_ctldir == NULL);
186 
187 	vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
188 	    zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
189 	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
190 	zcp = vp->v_data;
191 	zcp->zc_id = ZFSCTL_INO_ROOT;
192 
193 	VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0);
194 	ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
195 	VN_RELE(rvp);
196 
197 	/*
198 	 * We're only faking the fact that we have a root of a filesystem for
199 	 * the sake of the GFS interfaces.  Undo the flag manipulation it did
200 	 * for us.
201 	 */
202 	vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT);
203 
204 	zfsvfs->z_ctldir = vp;
205 }
206 
207 /*
208  * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
209  * There might still be more references if we were force unmounted, but only
210  * new zfs_inactive() calls can occur and they don't reference .zfs
211  */
212 void
213 zfsctl_destroy(zfsvfs_t *zfsvfs)
214 {
215 	VN_RELE(zfsvfs->z_ctldir);
216 	zfsvfs->z_ctldir = NULL;
217 }
218 
219 /*
220  * Given a root znode, retrieve the associated .zfs directory.
221  * Add a hold to the vnode and return it.
222  */
223 vnode_t *
224 zfsctl_root(znode_t *zp)
225 {
226 	ASSERT(zfs_has_ctldir(zp));
227 	VN_HOLD(zp->z_zfsvfs->z_ctldir);
228 	return (zp->z_zfsvfs->z_ctldir);
229 }
230 
231 /*
232  * Common open routine.  Disallow any write access.
233  */
234 /* ARGSUSED */
235 static int
236 zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr)
237 {
238 	if (flags & FWRITE)
239 		return (EACCES);
240 
241 	return (0);
242 }
243 
244 /*
245  * Common close routine.  Nothing to do here.
246  */
247 /* ARGSUSED */
248 static int
249 zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off,
250     cred_t *cr)
251 {
252 	return (0);
253 }
254 
255 /*
256  * Common access routine.  Disallow writes.
257  */
258 /* ARGSUSED */
259 static int
260 zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr)
261 {
262 	if (mode & VWRITE)
263 		return (EACCES);
264 
265 	return (0);
266 }
267 
268 /*
269  * Common getattr function.  Fill in basic information.
270  */
271 static void
272 zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
273 {
274 	zfsctl_node_t	*zcp = vp->v_data;
275 	timestruc_t	now;
276 
277 	vap->va_uid = 0;
278 	vap->va_gid = 0;
279 	vap->va_rdev = 0;
280 	/*
281 	 * We are a purly virtual object, so we have no
282 	 * blocksize or allocated blocks.
283 	 */
284 	vap->va_blksize = 0;
285 	vap->va_nblocks = 0;
286 	vap->va_seq = 0;
287 	vap->va_fsid = vp->v_vfsp->vfs_dev;
288 	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
289 	    S_IROTH | S_IXOTH;
290 	vap->va_type = VDIR;
291 	/*
292 	 * We live in the now (for atime).
293 	 */
294 	gethrestime(&now);
295 	vap->va_atime = now;
296 	vap->va_mtime = vap->va_ctime = zcp->zc_cmtime;
297 }
298 
299 static int
300 zfsctl_common_fid(vnode_t *vp, fid_t *fidp)
301 {
302 	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
303 	zfsctl_node_t	*zcp = vp->v_data;
304 	uint64_t	object = zcp->zc_id;
305 	zfid_short_t	*zfid;
306 	int		i;
307 
308 	ZFS_ENTER(zfsvfs);
309 
310 	if (fidp->fid_len < SHORT_FID_LEN) {
311 		fidp->fid_len = SHORT_FID_LEN;
312 		ZFS_EXIT(zfsvfs);
313 		return (ENOSPC);
314 	}
315 
316 	zfid = (zfid_short_t *)fidp;
317 
318 	zfid->zf_len = SHORT_FID_LEN;
319 
320 	for (i = 0; i < sizeof (zfid->zf_object); i++)
321 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
322 
323 	/* .zfs znodes always have a generation number of 0 */
324 	for (i = 0; i < sizeof (zfid->zf_gen); i++)
325 		zfid->zf_gen[i] = 0;
326 
327 	ZFS_EXIT(zfsvfs);
328 	return (0);
329 }
330 
331 /*
332  * .zfs inode namespace
333  *
334  * We need to generate unique inode numbers for all files and directories
335  * within the .zfs pseudo-filesystem.  We use the following scheme:
336  *
337  * 	ENTRY			ZFSCTL_INODE
338  * 	.zfs			1
339  * 	.zfs/snapshot		2
340  * 	.zfs/snapshot/<snap>	objectid(snap)
341  */
342 
/* Inode number of a snapshot node: the snapshot's object id, unchanged. */
#define	ZFSCTL_INO_SNAP(id)	(id)
344 
345 /*
346  * Get root directory attributes.
347  */
348 /* ARGSUSED */
349 static int
350 zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
351 {
352 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
353 
354 	ZFS_ENTER(zfsvfs);
355 	vap->va_nodeid = ZFSCTL_INO_ROOT;
356 	vap->va_nlink = vap->va_size = NROOT_ENTRIES;
357 
358 	zfsctl_common_getattr(vp, vap);
359 	ZFS_EXIT(zfsvfs);
360 
361 	return (0);
362 }
363 
364 /*
365  * Special case the handling of "..".
366  */
367 /* ARGSUSED */
368 int
369 zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
370     int flags, vnode_t *rdir, cred_t *cr)
371 {
372 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
373 	int err;
374 
375 	ZFS_ENTER(zfsvfs);
376 
377 	if (strcmp(nm, "..") == 0) {
378 		err = VFS_ROOT(dvp->v_vfsp, vpp);
379 	} else {
380 		err = gfs_dir_lookup(dvp, nm, vpp);
381 	}
382 
383 	ZFS_EXIT(zfsvfs);
384 
385 	return (err);
386 }
387 
388 static const fs_operation_def_t zfsctl_tops_root[] = {
389 	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
390 	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
391 	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
392 	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_root_getattr }	},
393 	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
394 	{ VOPNAME_READDIR,	{ .vop_readdir = gfs_vop_readdir } 	},
395 	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_root_lookup }	},
396 	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
397 	{ VOPNAME_INACTIVE,	{ .vop_inactive = gfs_vop_inactive }	},
398 	{ VOPNAME_FID,		{ .vop_fid = zfsctl_common_fid	}	},
399 	{ NULL }
400 };
401 
402 static int
403 zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
404 {
405 	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
406 
407 	dmu_objset_name(os, zname);
408 	if (strlen(zname) + 1 + strlen(name) >= len)
409 		return (ENAMETOOLONG);
410 	(void) strcat(zname, "@");
411 	(void) strcat(zname, name);
412 	return (0);
413 }
414 
415 int
416 zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr)
417 {
418 	zfsctl_snapdir_t *sdp = dvp->v_data;
419 	zfs_snapentry_t search, *sep;
420 	avl_index_t where;
421 	int err;
422 
423 	ASSERT(MUTEX_HELD(&sdp->sd_lock));
424 
425 	search.se_name = (char *)name;
426 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL)
427 		return (ENOENT);
428 
429 	ASSERT(vn_ismntpt(sep->se_root));
430 
431 	/* this will be dropped by dounmount() */
432 	if ((err = vn_vfswlock(sep->se_root)) != 0)
433 		return (err);
434 
435 	VN_HOLD(sep->se_root);
436 	err = dounmount(vn_mountedvfs(sep->se_root), force, kcred);
437 	if (err) {
438 		VN_RELE(sep->se_root);
439 		return (err);
440 	}
441 	ASSERT(sep->se_root->v_count == 1);
442 	gfs_vop_inactive(sep->se_root, cr);
443 
444 	avl_remove(&sdp->sd_snaps, sep);
445 	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
446 	kmem_free(sep, sizeof (zfs_snapentry_t));
447 
448 	return (0);
449 }
450 
451 
452 static void
453 zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
454 {
455 	avl_index_t where;
456 	vfs_t *vfsp;
457 	refstr_t *pathref;
458 	char newpath[MAXNAMELEN];
459 	char *tail;
460 
461 	ASSERT(MUTEX_HELD(&sdp->sd_lock));
462 	ASSERT(sep != NULL);
463 
464 	vfsp = vn_mountedvfs(sep->se_root);
465 	ASSERT(vfsp != NULL);
466 
467 	vfs_lock_wait(vfsp);
468 
469 	/*
470 	 * Change the name in the AVL tree.
471 	 */
472 	avl_remove(&sdp->sd_snaps, sep);
473 	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
474 	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
475 	(void) strcpy(sep->se_name, nm);
476 	VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
477 	avl_insert(&sdp->sd_snaps, sep, where);
478 
479 	/*
480 	 * Change the current mountpoint info:
481 	 * 	- update the tail of the mntpoint path
482 	 *	- update the tail of the resource path
483 	 */
484 	pathref = vfs_getmntpoint(vfsp);
485 	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
486 	VERIFY((tail = strrchr(newpath, '/')) != NULL);
487 	*(tail+1) = '\0';
488 	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
489 	(void) strcat(newpath, nm);
490 	refstr_rele(pathref);
491 	vfs_setmntpoint(vfsp, newpath);
492 
493 	pathref = vfs_getresource(vfsp);
494 	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
495 	VERIFY((tail = strrchr(newpath, '@')) != NULL);
496 	*(tail+1) = '\0';
497 	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
498 	(void) strcat(newpath, nm);
499 	refstr_rele(pathref);
500 	vfs_setresource(vfsp, newpath);
501 
502 	vfs_unlock(vfsp);
503 }
504 
505 static int
506 zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
507     cred_t *cr)
508 {
509 	zfsctl_snapdir_t *sdp = sdvp->v_data;
510 	zfs_snapentry_t search, *sep;
511 	avl_index_t where;
512 	char from[MAXNAMELEN], to[MAXNAMELEN];
513 	int err;
514 
515 	err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
516 	if (err)
517 		return (err);
518 
519 	err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
520 	if (err)
521 		return (err);
522 
523 	if (err = zfs_secpolicy_rename_perms(from, to, cr))
524 		return (err);
525 	/*
526 	 * Cannot move snapshots out of the snapdir.
527 	 */
528 	if (sdvp != tdvp)
529 		return (EINVAL);
530 
531 	if (strcmp(snm, tnm) == 0)
532 		return (0);
533 
534 	mutex_enter(&sdp->sd_lock);
535 
536 	search.se_name = (char *)snm;
537 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
538 		mutex_exit(&sdp->sd_lock);
539 		return (ENOENT);
540 	}
541 
542 	err = dmu_objset_rename(from, to, B_FALSE);
543 	if (err == 0)
544 		zfsctl_rename_snap(sdp, sep, tnm);
545 
546 	mutex_exit(&sdp->sd_lock);
547 
548 	return (err);
549 }
550 
551 /* ARGSUSED */
552 static int
553 zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
554 {
555 	zfsctl_snapdir_t *sdp = dvp->v_data;
556 	char snapname[MAXNAMELEN];
557 	int err;
558 
559 	err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
560 	if (err)
561 		return (err);
562 
563 	if (err = zfs_secpolicy_destroy_perms(snapname, cr))
564 		return (err);
565 
566 	mutex_enter(&sdp->sd_lock);
567 
568 	err = zfsctl_unmount_snap(dvp, name, MS_FORCE, cr);
569 	if (err) {
570 		mutex_exit(&sdp->sd_lock);
571 		return (err);
572 	}
573 
574 	err = dmu_objset_destroy(snapname);
575 
576 	mutex_exit(&sdp->sd_lock);
577 
578 	return (err);
579 }
580 
581 /* ARGSUSED */
582 static int
583 zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t  **vpp,
584     cred_t *cr)
585 {
586 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
587 	char name[MAXNAMELEN];
588 	int err;
589 	static enum symfollow follow = NO_FOLLOW;
590 	static enum uio_seg seg = UIO_SYSSPACE;
591 
592 	dmu_objset_name(zfsvfs->z_os, name);
593 
594 	*vpp = NULL;
595 
596 	err = zfs_secpolicy_snapshot_perms(name, cr);
597 	if (err)
598 		return (err);
599 
600 	if (err == 0) {
601 		err = dmu_objset_snapshot(name, dirname, B_FALSE);
602 		if (err)
603 			return (err);
604 		err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp);
605 	}
606 
607 	return (err);
608 }
609 
610 /*
611  * Lookup entry point for the 'snapshot' directory.  Try to open the
612  * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
613  * Perform a mount of the associated dataset on top of the vnode.
614  */
615 /* ARGSUSED */
616 static int
617 zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
618     int flags, vnode_t *rdir, cred_t *cr)
619 {
620 	zfsctl_snapdir_t *sdp = dvp->v_data;
621 	objset_t *snap;
622 	char snapname[MAXNAMELEN];
623 	char *mountpoint;
624 	zfs_snapentry_t *sep, search;
625 	struct mounta margs;
626 	vfs_t *vfsp;
627 	size_t mountpoint_len;
628 	avl_index_t where;
629 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
630 	int err;
631 
632 	ASSERT(dvp->v_type == VDIR);
633 
634 	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
635 		return (0);
636 
637 	/*
638 	 * If we get a recursive call, that means we got called
639 	 * from the domount() code while it was trying to look up the
640 	 * spec (which looks like a local path for zfs).  We need to
641 	 * add some flag to domount() to tell it not to do this lookup.
642 	 */
643 	if (MUTEX_HELD(&sdp->sd_lock))
644 		return (ENOENT);
645 
646 	ZFS_ENTER(zfsvfs);
647 
648 	mutex_enter(&sdp->sd_lock);
649 	search.se_name = (char *)nm;
650 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
651 		*vpp = sep->se_root;
652 		VN_HOLD(*vpp);
653 		err = traverse(vpp);
654 		if (err) {
655 			VN_RELE(*vpp);
656 			*vpp = NULL;
657 		} else if (*vpp == sep->se_root) {
658 			/*
659 			 * The snapshot was unmounted behind our backs,
660 			 * try to remount it.
661 			 */
662 			goto domount;
663 		}
664 		mutex_exit(&sdp->sd_lock);
665 		ZFS_EXIT(zfsvfs);
666 		return (err);
667 	}
668 
669 	/*
670 	 * The requested snapshot is not currently mounted, look it up.
671 	 */
672 	err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
673 	if (err) {
674 		mutex_exit(&sdp->sd_lock);
675 		ZFS_EXIT(zfsvfs);
676 		return (err);
677 	}
678 	if (dmu_objset_open(snapname, DMU_OST_ZFS,
679 	    DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) {
680 		mutex_exit(&sdp->sd_lock);
681 		ZFS_EXIT(zfsvfs);
682 		return (ENOENT);
683 	}
684 
685 	sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
686 	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
687 	(void) strcpy(sep->se_name, nm);
688 	*vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
689 	avl_insert(&sdp->sd_snaps, sep, where);
690 
691 	dmu_objset_close(snap);
692 domount:
693 	mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) +
694 	    strlen("/.zfs/snapshot/") + strlen(nm) + 1;
695 	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
696 	(void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
697 	    refstr_value(dvp->v_vfsp->vfs_mntpt), nm);
698 
699 	margs.spec = snapname;
700 	margs.dir = mountpoint;
701 	margs.flags = MS_SYSSPACE | MS_NOMNTTAB;
702 	margs.fstype = "zfs";
703 	margs.dataptr = NULL;
704 	margs.datalen = 0;
705 	margs.optptr = NULL;
706 	margs.optlen = 0;
707 
708 	err = domount("zfs", &margs, *vpp, kcred, &vfsp);
709 	kmem_free(mountpoint, mountpoint_len);
710 
711 	if (err == 0) {
712 		/*
713 		 * Return the mounted root rather than the covered mount point.
714 		 */
715 		VFS_RELE(vfsp);
716 		err = traverse(vpp);
717 	}
718 
719 	if (err == 0) {
720 		/*
721 		 * Fix up the root vnode.
722 		 *
723 		 * This is where we lie about our v_vfsp in order to
724 		 * make .zfs/snapshot/<snapdir> accessible over NFS
725 		 * without requiring manual mounts of <snapdir>.
726 		 */
727 		ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
728 		VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
729 		(*vpp)->v_vfsp = zfsvfs->z_vfs;
730 		(*vpp)->v_flag &= ~VROOT;
731 	}
732 	mutex_exit(&sdp->sd_lock);
733 	ZFS_EXIT(zfsvfs);
734 
735 	/*
736 	 * If we had an error, drop our hold on the vnode and
737 	 * zfsctl_snapshot_inactive() will clean up.
738 	 */
739 	if (err) {
740 		VN_RELE(*vpp);
741 		*vpp = NULL;
742 	}
743 	return (err);
744 }
745 
746 /* ARGSUSED */
747 static int
748 zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
749     offset_t *offp, offset_t *nextp, void *data)
750 {
751 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
752 	char snapname[MAXNAMELEN];
753 	uint64_t id, cookie;
754 
755 	ZFS_ENTER(zfsvfs);
756 
757 	cookie = *offp;
758 	if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
759 	    &cookie) == ENOENT) {
760 		*eofp = 1;
761 		ZFS_EXIT(zfsvfs);
762 		return (0);
763 	}
764 
765 	(void) strcpy(dp->d_name, snapname);
766 	dp->d_ino = ZFSCTL_INO_SNAP(id);
767 	*nextp = cookie;
768 
769 	ZFS_EXIT(zfsvfs);
770 
771 	return (0);
772 }
773 
774 vnode_t *
775 zfsctl_mknode_snapdir(vnode_t *pvp)
776 {
777 	vnode_t *vp;
778 	zfsctl_snapdir_t *sdp;
779 
780 	vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp,
781 	    zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
782 	    zfsctl_snapdir_readdir_cb, NULL);
783 	sdp = vp->v_data;
784 	sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
785 	sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
786 	mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
787 	avl_create(&sdp->sd_snaps, snapentry_compare,
788 	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
789 	return (vp);
790 }
791 
792 /* ARGSUSED */
793 static int
794 zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
795 {
796 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
797 	zfsctl_snapdir_t *sdp = vp->v_data;
798 
799 	ZFS_ENTER(zfsvfs);
800 	zfsctl_common_getattr(vp, vap);
801 	vap->va_nodeid = gfs_file_inode(vp);
802 	vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
803 	ZFS_EXIT(zfsvfs);
804 
805 	return (0);
806 }
807 
808 /* ARGSUSED */
809 static void
810 zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr)
811 {
812 	zfsctl_snapdir_t *sdp = vp->v_data;
813 	void *private;
814 
815 	private = gfs_dir_inactive(vp);
816 	if (private != NULL) {
817 		ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
818 		mutex_destroy(&sdp->sd_lock);
819 		avl_destroy(&sdp->sd_snaps);
820 		kmem_free(private, sizeof (zfsctl_snapdir_t));
821 	}
822 }
823 
824 static const fs_operation_def_t zfsctl_tops_snapdir[] = {
825 	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
826 	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
827 	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
828 	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_snapdir_getattr } },
829 	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
830 	{ VOPNAME_RENAME,	{ .vop_rename = zfsctl_snapdir_rename }	},
831 	{ VOPNAME_RMDIR,	{ .vop_rmdir = zfsctl_snapdir_remove }	},
832 	{ VOPNAME_MKDIR,	{ .vop_mkdir = zfsctl_snapdir_mkdir }	},
833 	{ VOPNAME_READDIR,	{ .vop_readdir = gfs_vop_readdir }	},
834 	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_snapdir_lookup }	},
835 	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
836 	{ VOPNAME_INACTIVE,	{ .vop_inactive = zfsctl_snapdir_inactive } },
837 	{ VOPNAME_FID,		{ .vop_fid = zfsctl_common_fid }	},
838 	{ NULL }
839 };
840 
841 static vnode_t *
842 zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
843 {
844 	vnode_t *vp;
845 	zfsctl_node_t *zcp;
846 
847 	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
848 	    zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
849 	zcp = vp->v_data;
850 	zcp->zc_id = objset;
851 
852 	return (vp);
853 }
854 
855 static void
856 zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr)
857 {
858 	zfsctl_snapdir_t *sdp;
859 	zfs_snapentry_t *sep, *next;
860 	vnode_t *dvp;
861 
862 	VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0);
863 	sdp = dvp->v_data;
864 
865 	mutex_enter(&sdp->sd_lock);
866 
867 	if (vp->v_count > 1) {
868 		mutex_exit(&sdp->sd_lock);
869 		return;
870 	}
871 	ASSERT(!vn_ismntpt(vp));
872 
873 	sep = avl_first(&sdp->sd_snaps);
874 	while (sep != NULL) {
875 		next = AVL_NEXT(&sdp->sd_snaps, sep);
876 
877 		if (sep->se_root == vp) {
878 			avl_remove(&sdp->sd_snaps, sep);
879 			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
880 			kmem_free(sep, sizeof (zfs_snapentry_t));
881 			break;
882 		}
883 		sep = next;
884 	}
885 	ASSERT(sep != NULL);
886 
887 	mutex_exit(&sdp->sd_lock);
888 	VN_RELE(dvp);
889 
890 	/*
891 	 * Dispose of the vnode for the snapshot mount point.
892 	 * This is safe to do because once this entry has been removed
893 	 * from the AVL tree, it can't be found again, so cannot become
894 	 * "active".  If we lookup the same name again we will end up
895 	 * creating a new vnode.
896 	 */
897 	gfs_vop_inactive(vp, cr);
898 }
899 
900 
901 /*
902  * These VP's should never see the light of day.  They should always
903  * be covered.
904  */
905 static const fs_operation_def_t zfsctl_tops_snapshot[] = {
906 	VOPNAME_INACTIVE, { .vop_inactive =  zfsctl_snapshot_inactive },
907 	NULL, NULL
908 };
909 
910 int
911 zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
912 {
913 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
914 	vnode_t *dvp, *vp;
915 	zfsctl_snapdir_t *sdp;
916 	zfsctl_node_t *zcp;
917 	zfs_snapentry_t *sep;
918 	int error;
919 
920 	ASSERT(zfsvfs->z_ctldir != NULL);
921 	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
922 	    NULL, 0, NULL, kcred);
923 	if (error != 0)
924 		return (error);
925 	sdp = dvp->v_data;
926 
927 	mutex_enter(&sdp->sd_lock);
928 	sep = avl_first(&sdp->sd_snaps);
929 	while (sep != NULL) {
930 		vp = sep->se_root;
931 		zcp = vp->v_data;
932 		if (zcp->zc_id == objsetid)
933 			break;
934 
935 		sep = AVL_NEXT(&sdp->sd_snaps, sep);
936 	}
937 
938 	if (sep != NULL) {
939 		VN_HOLD(vp);
940 		error = traverse(&vp);
941 		if (error == 0) {
942 			if (vp == sep->se_root)
943 				error = EINVAL;
944 			else
945 				*zfsvfsp = VTOZ(vp)->z_zfsvfs;
946 		}
947 		mutex_exit(&sdp->sd_lock);
948 		VN_RELE(vp);
949 	} else {
950 		error = EINVAL;
951 		mutex_exit(&sdp->sd_lock);
952 	}
953 
954 	VN_RELE(dvp);
955 
956 	return (error);
957 }
958 
959 /*
960  * Unmount any snapshots for the given filesystem.  This is called from
961  * zfs_umount() - if we have a ctldir, then go through and unmount all the
962  * snapshots.
963  */
964 int
965 zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
966 {
967 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
968 	vnode_t *dvp, *svp;
969 	zfsctl_snapdir_t *sdp;
970 	zfs_snapentry_t *sep, *next;
971 	int error;
972 
973 	ASSERT(zfsvfs->z_ctldir != NULL);
974 	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
975 	    NULL, 0, NULL, cr);
976 	if (error != 0)
977 		return (error);
978 	sdp = dvp->v_data;
979 
980 	mutex_enter(&sdp->sd_lock);
981 
982 	sep = avl_first(&sdp->sd_snaps);
983 	while (sep != NULL) {
984 		svp = sep->se_root;
985 		next = AVL_NEXT(&sdp->sd_snaps, sep);
986 
987 		/*
988 		 * If this snapshot is not mounted, then it must
989 		 * have just been unmounted by somebody else, and
990 		 * will be cleaned up by zfsctl_snapdir_inactive().
991 		 */
992 		if (vn_ismntpt(svp)) {
993 			if ((error = vn_vfswlock(svp)) != 0)
994 				goto out;
995 
996 			VN_HOLD(svp);
997 			error = dounmount(vn_mountedvfs(svp), fflags, cr);
998 			if (error) {
999 				VN_RELE(svp);
1000 				goto out;
1001 			}
1002 
1003 			avl_remove(&sdp->sd_snaps, sep);
1004 			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
1005 			kmem_free(sep, sizeof (zfs_snapentry_t));
1006 
1007 			/*
1008 			 * We can't use VN_RELE(), as that will try to
1009 			 * invoke zfsctl_snapdir_inactive(), and that
1010 			 * would lead to an attempt to re-grab the sd_lock.
1011 			 */
1012 			ASSERT3U(svp->v_count, ==, 1);
1013 			gfs_vop_inactive(svp, cr);
1014 		}
1015 		sep = next;
1016 	}
1017 out:
1018 	mutex_exit(&sdp->sd_lock);
1019 	VN_RELE(dvp);
1020 
1021 	return (error);
1022 }
1023