xref: /titanic_51/usr/src/uts/common/fs/zfs/zfs_ctldir.c (revision 942214a9d3873106f26cc86dd4aef6ac6176b830)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * ZFS control directory (a.k.a. ".zfs")
30  *
31  * This directory provides a common location for all ZFS meta-objects.
32  * Currently, this is only the 'snapshot' directory, but this may expand in the
33  * future.  The elements are built using the GFS primitives, as the hierarchy
34  * does not actually exist on disk.
35  *
36  * For 'snapshot', we don't want to have all snapshots always mounted, because
37  * this would take up a huge amount of space in /etc/mnttab.  We have three
38  * types of objects:
39  *
40  * 	ctldir ------> snapshotdir -------> snapshot
41  *                                             |
42  *                                             |
43  *                                             V
44  *                                         mounted fs
45  *
46  * The 'snapshot' node contains just enough information to lookup '..' and act
47  * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
48  * perform an automount of the underlying filesystem and return the
49  * corresponding vnode.
50  *
51  * All mounts are handled automatically by the kernel, but unmounts are
52  * (currently) handled from user land.  The main reason is that there is no
53  * reliable way to auto-unmount the filesystem when it's "no longer in use".
54  * When the user unmounts a filesystem, we call zfsctl_umount_snapshots(),
55  * which unmounts any snapshots within the snapshot directory.
56  */
57 
58 #include <fs/fs_subr.h>
59 #include <sys/zfs_ctldir.h>
60 #include <sys/zfs_ioctl.h>
61 #include <sys/zfs_vfsops.h>
62 #include <sys/gfs.h>
63 #include <sys/stat.h>
64 #include <sys/dmu.h>
65 #include <sys/mount.h>
66 
67 typedef struct {
68 	char		*se_name;
69 	vnode_t		*se_root;
70 	avl_node_t	se_node;
71 } zfs_snapentry_t;
72 
73 static int
74 snapentry_compare(const void *a, const void *b)
75 {
76 	const zfs_snapentry_t *sa = a;
77 	const zfs_snapentry_t *sb = b;
78 	int ret = strcmp(sa->se_name, sb->se_name);
79 
80 	if (ret < 0)
81 		return (-1);
82 	else if (ret > 0)
83 		return (1);
84 	else
85 		return (0);
86 }
87 
88 vnodeops_t *zfsctl_ops_root;
89 vnodeops_t *zfsctl_ops_snapdir;
90 vnodeops_t *zfsctl_ops_snapshot;
91 
92 static const fs_operation_def_t zfsctl_tops_root[];
93 static const fs_operation_def_t zfsctl_tops_snapdir[];
94 static const fs_operation_def_t zfsctl_tops_snapshot[];
95 
96 static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
97 static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
98 
99 static gfs_opsvec_t zfsctl_opsvec[] = {
100 	{ ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
101 	{ ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
102 	{ ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
103 	{ NULL }
104 };
105 
106 typedef struct zfsctl_node {
107 	gfs_dir_t	zc_gfs_private;
108 	uint64_t	zc_id;
109 	timestruc_t	zc_cmtime;	/* ctime and mtime, always the same */
110 } zfsctl_node_t;
111 
112 typedef struct zfsctl_snapdir {
113 	zfsctl_node_t	sd_node;
114 	kmutex_t	sd_lock;
115 	avl_tree_t	sd_snaps;
116 } zfsctl_snapdir_t;
117 
118 /*
119  * Root directory elements.  We have only a single static entry, 'snapshot'.
120  */
121 static gfs_dirent_t zfsctl_root_entries[] = {
122 	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
123 	{ NULL }
124 };
125 
126 /* include . and .. in the calculation */
127 #define	NROOT_ENTRIES	((sizeof (zfsctl_root_entries) / \
128     sizeof (gfs_dirent_t)) + 1)
129 
130 
131 /*
132  * Initialize the various GFS pieces we'll need to create and manipulate .zfs
133  * directories.  This is called from the ZFS init routine, and initializes the
134  * vnode ops vectors that we'll be using.
135  */
136 void
137 zfsctl_init(void)
138 {
139 	VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
140 }
141 
142 void
143 zfsctl_fini(void)
144 {
145 	/*
146 	 * Remove vfsctl vnode ops
147 	 */
148 	if (zfsctl_ops_root)
149 		vn_freevnodeops(zfsctl_ops_root);
150 	if (zfsctl_ops_snapdir)
151 		vn_freevnodeops(zfsctl_ops_snapdir);
152 	if (zfsctl_ops_snapshot)
153 		vn_freevnodeops(zfsctl_ops_snapshot);
154 
155 	zfsctl_ops_root = NULL;
156 	zfsctl_ops_snapdir = NULL;
157 	zfsctl_ops_snapshot = NULL;
158 }
159 
160 /*
161  * Return the inode number associated with the 'snapshot' directory.
162  */
163 /* ARGSUSED */
164 static ino64_t
165 zfsctl_root_inode_cb(vnode_t *vp, int index)
166 {
167 	ASSERT(index == 0);
168 	return (ZFSCTL_INO_SNAPDIR);
169 }
170 
171 /*
172  * Create the '.zfs' directory.  This directory is cached as part of the VFS
173  * structure.  This results in a hold on the vfs_t.  The code in zfs_umount()
174  * therefore checks against a vfs_count of 2 instead of 1.  This reference
175  * is removed when the ctldir is destroyed in the unmount.
176  */
177 void
178 zfsctl_create(zfsvfs_t *zfsvfs)
179 {
180 	vnode_t *vp, *rvp;
181 	zfsctl_node_t *zcp;
182 
183 	ASSERT(zfsvfs->z_ctldir == NULL);
184 
185 	vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
186 	    zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
187 	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
188 	zcp = vp->v_data;
189 	zcp->zc_id = ZFSCTL_INO_ROOT;
190 
191 	VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0);
192 	ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
193 	VN_RELE(rvp);
194 
195 	/*
196 	 * We're only faking the fact that we have a root of a filesystem for
197 	 * the sake of the GFS interfaces.  Undo the flag manipulation it did
198 	 * for us.
199 	 */
200 	vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT);
201 
202 	zfsvfs->z_ctldir = vp;
203 }
204 
205 /*
206  * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
207  * There might still be more references if we were force unmounted, but only
208  * new zfs_inactive() calls can occur and they don't reference .zfs
209  */
210 void
211 zfsctl_destroy(zfsvfs_t *zfsvfs)
212 {
213 	VN_RELE(zfsvfs->z_ctldir);
214 	zfsvfs->z_ctldir = NULL;
215 }
216 
217 /*
218  * Given a root znode, retrieve the associated .zfs directory.
219  * Add a hold to the vnode and return it.
220  */
221 vnode_t *
222 zfsctl_root(znode_t *zp)
223 {
224 	ASSERT(zfs_has_ctldir(zp));
225 	VN_HOLD(zp->z_zfsvfs->z_ctldir);
226 	return (zp->z_zfsvfs->z_ctldir);
227 }
228 
229 /*
230  * Common open routine.  Disallow any write access.
231  */
232 /* ARGSUSED */
233 static int
234 zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr)
235 {
236 	if (flags & FWRITE)
237 		return (EACCES);
238 
239 	return (0);
240 }
241 
242 /*
243  * Common close routine.  Nothing to do here.
244  */
245 /* ARGSUSED */
246 static int
247 zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off,
248     cred_t *cr)
249 {
250 	return (0);
251 }
252 
253 /*
254  * Common access routine.  Disallow writes.
255  */
256 /* ARGSUSED */
257 static int
258 zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr)
259 {
260 	if (mode & VWRITE)
261 		return (EACCES);
262 
263 	return (0);
264 }
265 
266 /*
267  * Common getattr function.  Fill in basic information.
268  */
269 static void
270 zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
271 {
272 	zfsctl_node_t	*zcp = vp->v_data;
273 	timestruc_t	now;
274 
275 	vap->va_uid = 0;
276 	vap->va_gid = 0;
277 	vap->va_rdev = 0;
278 	/*
279 	 * We are a purly virtual object, so we have no
280 	 * blocksize or allocated blocks.
281 	 */
282 	vap->va_blksize = 0;
283 	vap->va_nblocks = 0;
284 	vap->va_seq = 0;
285 	vap->va_fsid = vp->v_vfsp->vfs_dev;
286 	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
287 	    S_IROTH | S_IXOTH;
288 	vap->va_type = VDIR;
289 	/*
290 	 * We live in the now (for atime).
291 	 */
292 	gethrestime(&now);
293 	vap->va_atime = now;
294 	vap->va_mtime = vap->va_ctime = zcp->zc_cmtime;
295 }
296 
297 static int
298 zfsctl_common_fid(vnode_t *vp, fid_t *fidp)
299 {
300 	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
301 	zfsctl_node_t	*zcp = vp->v_data;
302 	uint64_t	object = zcp->zc_id;
303 	zfid_short_t	*zfid;
304 	int		i;
305 
306 	ZFS_ENTER(zfsvfs);
307 
308 	if (fidp->fid_len < SHORT_FID_LEN) {
309 		fidp->fid_len = SHORT_FID_LEN;
310 		ZFS_EXIT(zfsvfs);
311 		return (ENOSPC);
312 	}
313 
314 	zfid = (zfid_short_t *)fidp;
315 
316 	zfid->zf_len = SHORT_FID_LEN;
317 
318 	for (i = 0; i < sizeof (zfid->zf_object); i++)
319 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
320 
321 	/* .zfs znodes always have a generation number of 0 */
322 	for (i = 0; i < sizeof (zfid->zf_gen); i++)
323 		zfid->zf_gen[i] = 0;
324 
325 	ZFS_EXIT(zfsvfs);
326 	return (0);
327 }
328 
/*
 * .zfs inode namespace
 *
 * Every file and directory inside the .zfs pseudo-filesystem needs a unique
 * inode number.  The numbers are assigned as follows:
 *
 * 	ENTRY			ZFSCTL_INODE
 * 	.zfs			1
 * 	.zfs/snapshot		2
 * 	.zfs/snapshot/<snap>	objectid(snap)
 */

/* Inode number for a snapshot entry: the snapshot's object id, unchanged. */
#define	ZFSCTL_INO_SNAP(id)	(id)
342 
343 /*
344  * Get root directory attributes.
345  */
346 /* ARGSUSED */
347 static int
348 zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
349 {
350 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
351 
352 	ZFS_ENTER(zfsvfs);
353 	vap->va_nodeid = ZFSCTL_INO_ROOT;
354 	vap->va_nlink = vap->va_size = NROOT_ENTRIES;
355 
356 	zfsctl_common_getattr(vp, vap);
357 	ZFS_EXIT(zfsvfs);
358 
359 	return (0);
360 }
361 
362 /*
363  * Special case the handling of "..".
364  */
365 /* ARGSUSED */
366 int
367 zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
368     int flags, vnode_t *rdir, cred_t *cr)
369 {
370 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
371 	int err;
372 
373 	ZFS_ENTER(zfsvfs);
374 
375 	if (strcmp(nm, "..") == 0) {
376 		err = VFS_ROOT(dvp->v_vfsp, vpp);
377 	} else {
378 		err = gfs_dir_lookup(dvp, nm, vpp);
379 	}
380 
381 	ZFS_EXIT(zfsvfs);
382 
383 	return (err);
384 }
385 
386 static const fs_operation_def_t zfsctl_tops_root[] = {
387 	{ VOPNAME_OPEN,		zfsctl_common_open			},
388 	{ VOPNAME_CLOSE,	zfsctl_common_close			},
389 	{ VOPNAME_IOCTL,	fs_inval				},
390 	{ VOPNAME_GETATTR,	zfsctl_root_getattr			},
391 	{ VOPNAME_ACCESS,	zfsctl_common_access			},
392 	{ VOPNAME_READDIR,	gfs_vop_readdir				},
393 	{ VOPNAME_LOOKUP,	zfsctl_root_lookup			},
394 	{ VOPNAME_SEEK,		fs_seek					},
395 	{ VOPNAME_INACTIVE,	(fs_generic_func_p) gfs_vop_inactive	},
396 	{ VOPNAME_FID,		zfsctl_common_fid			},
397 	{ NULL }
398 };
399 
400 static int
401 zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
402 {
403 	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
404 
405 	dmu_objset_name(os, zname);
406 	if (strlen(zname) + 1 + strlen(name) >= len)
407 		return (ENAMETOOLONG);
408 	(void) strcat(zname, "@");
409 	(void) strcat(zname, name);
410 	return (0);
411 }
412 
413 static int
414 zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr)
415 {
416 	zfsctl_snapdir_t *sdp = dvp->v_data;
417 	zfs_snapentry_t search, *sep;
418 	avl_index_t where;
419 	int err;
420 
421 	ASSERT(MUTEX_HELD(&sdp->sd_lock));
422 
423 	search.se_name = (char *)name;
424 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL)
425 		return (ENOENT);
426 
427 	ASSERT(vn_ismntpt(sep->se_root));
428 
429 	/* this will be dropped by dounmount() */
430 	if ((err = vn_vfswlock(sep->se_root)) != 0)
431 		return (err);
432 
433 	VN_HOLD(sep->se_root);
434 	err = dounmount(vn_mountedvfs(sep->se_root), force, kcred);
435 	if (err) {
436 		VN_RELE(sep->se_root);
437 		return (err);
438 	}
439 	ASSERT(sep->se_root->v_count == 1);
440 	gfs_vop_inactive(sep->se_root, cr);
441 
442 	avl_remove(&sdp->sd_snaps, sep);
443 	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
444 	kmem_free(sep, sizeof (zfs_snapentry_t));
445 
446 	return (0);
447 }
448 
449 
450 static void
451 zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
452 {
453 	avl_index_t where;
454 	vfs_t *vfsp;
455 	refstr_t *pathref;
456 	char newpath[MAXNAMELEN];
457 	char *tail;
458 
459 	ASSERT(MUTEX_HELD(&sdp->sd_lock));
460 	ASSERT(sep != NULL);
461 
462 	vfsp = vn_mountedvfs(sep->se_root);
463 	ASSERT(vfsp != NULL);
464 
465 	vfs_lock_wait(vfsp);
466 
467 	/*
468 	 * Change the name in the AVL tree.
469 	 */
470 	avl_remove(&sdp->sd_snaps, sep);
471 	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
472 	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
473 	(void) strcpy(sep->se_name, nm);
474 	VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
475 	avl_insert(&sdp->sd_snaps, sep, where);
476 
477 	/*
478 	 * Change the current mountpoint info:
479 	 * 	- update the tail of the mntpoint path
480 	 *	- update the tail of the resource path
481 	 */
482 	pathref = vfs_getmntpoint(vfsp);
483 	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
484 	VERIFY((tail = strrchr(newpath, '/')) != NULL);
485 	*(tail+1) = '\0';
486 	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
487 	(void) strcat(newpath, nm);
488 	refstr_rele(pathref);
489 	vfs_setmntpoint(vfsp, newpath);
490 
491 	pathref = vfs_getresource(vfsp);
492 	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
493 	VERIFY((tail = strrchr(newpath, '@')) != NULL);
494 	*(tail+1) = '\0';
495 	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
496 	(void) strcat(newpath, nm);
497 	refstr_rele(pathref);
498 	vfs_setresource(vfsp, newpath);
499 
500 	vfs_unlock(vfsp);
501 }
502 
503 static int
504 zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
505     cred_t *cr)
506 {
507 	zfsctl_snapdir_t *sdp = sdvp->v_data;
508 	zfs_snapentry_t search, *sep;
509 	avl_index_t where;
510 	char from[MAXNAMELEN], to[MAXNAMELEN];
511 	int err;
512 
513 	err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
514 	if (err)
515 		return (err);
516 	err = zfs_secpolicy_write(from, cr);
517 	if (err)
518 		return (err);
519 
520 	/*
521 	 * Cannot move snapshots out of the snapdir.
522 	 */
523 	if (sdvp != tdvp)
524 		return (EINVAL);
525 
526 	if (strcmp(snm, tnm) == 0)
527 		return (0);
528 
529 	err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
530 	if (err)
531 		return (err);
532 
533 	mutex_enter(&sdp->sd_lock);
534 
535 	search.se_name = (char *)snm;
536 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
537 		mutex_exit(&sdp->sd_lock);
538 		return (ENOENT);
539 	}
540 
541 	err = dmu_objset_rename(from, to);
542 	if (err == 0)
543 		zfsctl_rename_snap(sdp, sep, tnm);
544 
545 	mutex_exit(&sdp->sd_lock);
546 
547 	return (err);
548 }
549 
550 /* ARGSUSED */
551 static int
552 zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
553 {
554 	zfsctl_snapdir_t *sdp = dvp->v_data;
555 	char snapname[MAXNAMELEN];
556 	int err;
557 
558 	err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
559 	if (err)
560 		return (err);
561 	err = zfs_secpolicy_write(snapname, cr);
562 	if (err)
563 		return (err);
564 
565 	mutex_enter(&sdp->sd_lock);
566 
567 	err = zfsctl_unmount_snap(dvp, name, 0, cr);
568 	if (err) {
569 		mutex_exit(&sdp->sd_lock);
570 		return (err);
571 	}
572 
573 	err = dmu_objset_destroy(snapname);
574 
575 	mutex_exit(&sdp->sd_lock);
576 
577 	return (err);
578 }
579 
580 /*
581  * Lookup entry point for the 'snapshot' directory.  Try to open the
582  * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
583  * Perform a mount of the associated dataset on top of the vnode.
584  */
585 /* ARGSUSED */
586 static int
587 zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
588     int flags, vnode_t *rdir, cred_t *cr)
589 {
590 	zfsctl_snapdir_t *sdp = dvp->v_data;
591 	objset_t *snap;
592 	char snapname[MAXNAMELEN];
593 	char *mountpoint;
594 	zfs_snapentry_t *sep, search;
595 	struct mounta margs;
596 	vfs_t *vfsp;
597 	size_t mountpoint_len;
598 	avl_index_t where;
599 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
600 	int err;
601 
602 	ASSERT(dvp->v_type == VDIR);
603 
604 	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
605 		return (0);
606 
607 	/*
608 	 * If we get a recursive call, that means we got called
609 	 * from the domount() code while it was trying to look up the
610 	 * spec (which looks like a local path for zfs).  We need to
611 	 * add some flag to domount() to tell it not to do this lookup.
612 	 */
613 	if (MUTEX_HELD(&sdp->sd_lock))
614 		return (ENOENT);
615 
616 	ZFS_ENTER(zfsvfs);
617 
618 	mutex_enter(&sdp->sd_lock);
619 	search.se_name = (char *)nm;
620 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
621 		*vpp = sep->se_root;
622 		VN_HOLD(*vpp);
623 		err = traverse(vpp);
624 		if (err) {
625 			VN_RELE(*vpp);
626 			*vpp = NULL;
627 		} else if (*vpp == sep->se_root) {
628 			/*
629 			 * The snapshot was unmounted behind our backs,
630 			 * try to remount it.
631 			 */
632 			goto domount;
633 		}
634 		mutex_exit(&sdp->sd_lock);
635 		ZFS_EXIT(zfsvfs);
636 		return (err);
637 	}
638 
639 	/*
640 	 * The requested snapshot is not currently mounted, look it up.
641 	 */
642 	err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
643 	if (err) {
644 		mutex_exit(&sdp->sd_lock);
645 		ZFS_EXIT(zfsvfs);
646 		return (err);
647 	}
648 	if (dmu_objset_open(snapname, DMU_OST_ZFS,
649 	    DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) {
650 		mutex_exit(&sdp->sd_lock);
651 		ZFS_EXIT(zfsvfs);
652 		return (ENOENT);
653 	}
654 
655 	sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
656 	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
657 	(void) strcpy(sep->se_name, nm);
658 	*vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
659 	avl_insert(&sdp->sd_snaps, sep, where);
660 
661 	dmu_objset_close(snap);
662 domount:
663 	mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) +
664 	    strlen("/.zfs/snapshot/") + strlen(nm) + 1;
665 	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
666 	(void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
667 	    refstr_value(dvp->v_vfsp->vfs_mntpt), nm);
668 
669 	margs.spec = snapname;
670 	margs.dir = mountpoint;
671 	margs.flags = MS_SYSSPACE | MS_NOMNTTAB;
672 	margs.fstype = "zfs";
673 	margs.dataptr = NULL;
674 	margs.datalen = 0;
675 	margs.optptr = NULL;
676 	margs.optlen = 0;
677 
678 	err = domount("zfs", &margs, *vpp, kcred, &vfsp);
679 	kmem_free(mountpoint, mountpoint_len);
680 
681 	if (err == 0) {
682 		/*
683 		 * Return the mounted root rather than the covered mount point.
684 		 */
685 		VFS_RELE(vfsp);
686 		err = traverse(vpp);
687 	}
688 
689 	if (err == 0) {
690 		/*
691 		 * Fix up the root vnode.
692 		 */
693 		ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
694 		VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
695 		(*vpp)->v_vfsp = zfsvfs->z_vfs;
696 		(*vpp)->v_flag &= ~VROOT;
697 	}
698 	mutex_exit(&sdp->sd_lock);
699 	ZFS_EXIT(zfsvfs);
700 
701 	/*
702 	 * If we had an error, drop our hold on the vnode and
703 	 * zfsctl_snapshot_inactive() will clean up.
704 	 */
705 	if (err) {
706 		VN_RELE(*vpp);
707 		*vpp = NULL;
708 	}
709 	return (err);
710 }
711 
712 /* ARGSUSED */
713 static int
714 zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
715     offset_t *offp, offset_t *nextp, void *data)
716 {
717 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
718 	char snapname[MAXNAMELEN];
719 	uint64_t id, cookie;
720 
721 	ZFS_ENTER(zfsvfs);
722 
723 	cookie = *offp;
724 	if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
725 	    &cookie) == ENOENT) {
726 		*eofp = 1;
727 		ZFS_EXIT(zfsvfs);
728 		return (0);
729 	}
730 
731 	(void) strcpy(dp->d_name, snapname);
732 	dp->d_ino = ZFSCTL_INO_SNAP(id);
733 	*nextp = cookie;
734 
735 	ZFS_EXIT(zfsvfs);
736 
737 	return (0);
738 }
739 
740 vnode_t *
741 zfsctl_mknode_snapdir(vnode_t *pvp)
742 {
743 	vnode_t *vp;
744 	zfsctl_snapdir_t *sdp;
745 
746 	vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp,
747 	    zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
748 	    zfsctl_snapdir_readdir_cb, NULL);
749 	sdp = vp->v_data;
750 	sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
751 	sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
752 	mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
753 	avl_create(&sdp->sd_snaps, snapentry_compare,
754 	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
755 	return (vp);
756 }
757 
758 /* ARGSUSED */
759 static int
760 zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
761 {
762 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
763 	zfsctl_snapdir_t *sdp = vp->v_data;
764 
765 	ZFS_ENTER(zfsvfs);
766 	zfsctl_common_getattr(vp, vap);
767 	vap->va_nodeid = gfs_file_inode(vp);
768 	vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
769 	ZFS_EXIT(zfsvfs);
770 
771 	return (0);
772 }
773 
774 /* ARGSUSED */
775 static void
776 zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr)
777 {
778 	zfsctl_snapdir_t *sdp = vp->v_data;
779 	void *private;
780 
781 	private = gfs_dir_inactive(vp);
782 	if (private != NULL) {
783 		ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
784 		mutex_destroy(&sdp->sd_lock);
785 		avl_destroy(&sdp->sd_snaps);
786 		kmem_free(private, sizeof (zfsctl_snapdir_t));
787 	}
788 }
789 
790 static const fs_operation_def_t zfsctl_tops_snapdir[] = {
791 	{ VOPNAME_OPEN,		zfsctl_common_open			},
792 	{ VOPNAME_CLOSE,	zfsctl_common_close			},
793 	{ VOPNAME_IOCTL,	fs_inval				},
794 	{ VOPNAME_GETATTR,	zfsctl_snapdir_getattr			},
795 	{ VOPNAME_ACCESS,	zfsctl_common_access			},
796 	{ VOPNAME_RENAME,	zfsctl_snapdir_rename			},
797 	{ VOPNAME_RMDIR,	zfsctl_snapdir_remove			},
798 	{ VOPNAME_READDIR,	gfs_vop_readdir				},
799 	{ VOPNAME_LOOKUP,	zfsctl_snapdir_lookup			},
800 	{ VOPNAME_SEEK,		fs_seek					},
801 	{ VOPNAME_INACTIVE,	(fs_generic_func_p) zfsctl_snapdir_inactive },
802 	{ VOPNAME_FID,		zfsctl_common_fid			},
803 	{ NULL }
804 };
805 
806 static vnode_t *
807 zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
808 {
809 	vnode_t *vp;
810 	zfsctl_node_t *zcp;
811 
812 	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
813 	    zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
814 	zcp = vp->v_data;
815 	zcp->zc_id = objset;
816 
817 	return (vp);
818 }
819 
820 static void
821 zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr)
822 {
823 	zfsctl_snapdir_t *sdp;
824 	zfs_snapentry_t *sep, *next;
825 	vnode_t *dvp;
826 
827 	VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0);
828 	sdp = dvp->v_data;
829 
830 	mutex_enter(&sdp->sd_lock);
831 
832 	if (vp->v_count > 1) {
833 		mutex_exit(&sdp->sd_lock);
834 		return;
835 	}
836 	ASSERT(!vn_ismntpt(vp));
837 
838 	sep = avl_first(&sdp->sd_snaps);
839 	while (sep != NULL) {
840 		next = AVL_NEXT(&sdp->sd_snaps, sep);
841 
842 		if (sep->se_root == vp) {
843 			avl_remove(&sdp->sd_snaps, sep);
844 			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
845 			kmem_free(sep, sizeof (zfs_snapentry_t));
846 			break;
847 		}
848 		sep = next;
849 	}
850 	ASSERT(sep != NULL);
851 
852 	mutex_exit(&sdp->sd_lock);
853 	VN_RELE(dvp);
854 
855 	/*
856 	 * Dispose of the vnode for the snapshot mount point.
857 	 * This is safe to do because once this entry has been removed
858 	 * from the AVL tree, it can't be found again, so cannot become
859 	 * "active".  If we lookup the same name again we will end up
860 	 * creating a new vnode.
861 	 */
862 	gfs_vop_inactive(vp, cr);
863 }
864 
865 
866 /*
867  * These VP's should never see the light of day.  They should always
868  * be covered.
869  */
870 static const fs_operation_def_t zfsctl_tops_snapshot[] = {
871 	VOPNAME_INACTIVE, (fs_generic_func_p) zfsctl_snapshot_inactive,
872 	NULL, NULL
873 };
874 
875 int
876 zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
877 {
878 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
879 	vnode_t *dvp, *vp;
880 	zfsctl_snapdir_t *sdp;
881 	zfsctl_node_t *zcp;
882 	zfs_snapentry_t *sep;
883 	int error;
884 
885 	ASSERT(zfsvfs->z_ctldir != NULL);
886 	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
887 	    NULL, 0, NULL, kcred);
888 	if (error != 0)
889 		return (error);
890 	sdp = dvp->v_data;
891 
892 	mutex_enter(&sdp->sd_lock);
893 	sep = avl_first(&sdp->sd_snaps);
894 	while (sep != NULL) {
895 		vp = sep->se_root;
896 		zcp = vp->v_data;
897 		if (zcp->zc_id == objsetid)
898 			break;
899 
900 		sep = AVL_NEXT(&sdp->sd_snaps, sep);
901 	}
902 
903 	if (sep != NULL) {
904 		VN_HOLD(vp);
905 		error = traverse(&vp);
906 		if (error == 0) {
907 			if (vp == sep->se_root)
908 				error = EINVAL;
909 			else
910 				*zfsvfsp = VTOZ(vp)->z_zfsvfs;
911 		}
912 		mutex_exit(&sdp->sd_lock);
913 		VN_RELE(vp);
914 	} else {
915 		error = EINVAL;
916 		mutex_exit(&sdp->sd_lock);
917 	}
918 
919 	VN_RELE(dvp);
920 
921 	return (error);
922 }
923 
924 /*
925  * Unmount any snapshots for the given filesystem.  This is called from
926  * zfs_umount() - if we have a ctldir, then go through and unmount all the
927  * snapshots.
928  */
929 int
930 zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
931 {
932 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
933 	vnode_t *dvp, *svp;
934 	zfsctl_snapdir_t *sdp;
935 	zfs_snapentry_t *sep, *next;
936 	int error;
937 
938 	ASSERT(zfsvfs->z_ctldir != NULL);
939 	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
940 	    NULL, 0, NULL, cr);
941 	if (error != 0)
942 		return (error);
943 	sdp = dvp->v_data;
944 
945 	mutex_enter(&sdp->sd_lock);
946 
947 	sep = avl_first(&sdp->sd_snaps);
948 	while (sep != NULL) {
949 		svp = sep->se_root;
950 		next = AVL_NEXT(&sdp->sd_snaps, sep);
951 
952 		/*
953 		 * If this snapshot is not mounted, then it must
954 		 * have just been unmounted by somebody else, and
955 		 * will be cleaned up by zfsctl_snapdir_inactive().
956 		 */
957 		if (vn_ismntpt(svp)) {
958 			if ((error = vn_vfswlock(svp)) != 0)
959 				goto out;
960 
961 			VN_HOLD(svp);
962 			error = dounmount(vn_mountedvfs(svp), fflags, cr);
963 			if (error) {
964 				VN_RELE(svp);
965 				goto out;
966 			}
967 
968 			avl_remove(&sdp->sd_snaps, sep);
969 			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
970 			kmem_free(sep, sizeof (zfs_snapentry_t));
971 
972 			/*
973 			 * We can't use VN_RELE(), as that will try to
974 			 * invoke zfsctl_snapdir_inactive(), and that
975 			 * would lead to an attempt to re-grab the sd_lock.
976 			 */
977 			ASSERT3U(svp->v_count, ==, 1);
978 			gfs_vop_inactive(svp, cr);
979 		}
980 		sep = next;
981 	}
982 out:
983 	mutex_exit(&sdp->sd_lock);
984 	VN_RELE(dvp);
985 
986 	return (error);
987 }
988