/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * ZFS control directory (a.k.a. ".zfs")
 *
 * This directory provides a common location for all ZFS meta-objects.
 * Currently, this is only the 'snapshot' directory, but this may expand in the
 * future.  The elements are built using the GFS primitives, as the hierarchy
 * does not actually exist on disk.
 *
 * For 'snapshot', we don't want to have all snapshots always mounted, because
 * this would take up a huge amount of space in /etc/mnttab.  We have three
 * types of objects:
 *
 * 	ctldir ------> snapshotdir -------> snapshot
 *                                             |
 *                                             |
 *                                             V
 *                                         mounted fs
 *
 * The 'snapshot' node contains just enough information to look up '..' and
 * act as a mountpoint for the snapshot.  Whenever we look up a specific
 * snapshot, we perform an automount of the underlying filesystem and return
 * the corresponding vnode.
 *
 * All mounts are handled automatically by the kernel, but unmounts are
 * (currently) handled from user land.  The main reason is that there is no
 * reliable way to auto-unmount the filesystem when it's "no longer in use".
 * When the user unmounts a filesystem, we call zfsctl_umount_snapshots(),
 * which unmounts any snapshots within the snapshot directory.
 */

#include <fs/fs_subr.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_vfsops.h>
#include <sys/vfs_opreg.h>
#include <sys/gfs.h>
#include <sys/stat.h>
#include <sys/dmu.h>
#include <sys/mount.h>

typedef struct {
	char		*se_name;
	vnode_t		*se_root;
	avl_node_t	se_node;
} zfs_snapentry_t;

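/*
 * AVL comparison callback for snapshot entries, ordered by name.  AVL
 * requires a strict -1/0/1 result, so normalize whatever strcmp() returns.
 */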
static int
snapentry_compare(const void *a, const void *b)
{
	const zfs_snapentry_t *sa = a;
	const zfs_snapentry_t *sb = b;
	int ret = strcmp(sa->se_name, sb->se_name);

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

vnodeops_t *zfsctl_ops_root;
vnodeops_t *zfsctl_ops_snapdir;
vnodeops_t *zfsctl_ops_snapshot;

static const fs_operation_def_t zfsctl_tops_root[];
static const fs_operation_def_t zfsctl_tops_snapdir[];
static const fs_operation_def_t zfsctl_tops_snapshot[];

static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);

static gfs_opsvec_t zfsctl_opsvec[] = {
	{ ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
	{ ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
	{ ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
	{ NULL }
};

typedef struct zfsctl_node {
	gfs_dir_t	zc_gfs_private;
	uint64_t	zc_id;
	timestruc_t	zc_cmtime;	/* ctime and mtime, always the same */
} zfsctl_node_t;

typedef struct zfsctl_snapdir {
	zfsctl_node_t	sd_node;
	kmutex_t	sd_lock;
	avl_tree_t	sd_snaps;
} zfsctl_snapdir_t;

/*
 * Root directory elements.  We have only a single static entry, 'snapshot'.
 */
static gfs_dirent_t zfsctl_root_entries[] = {
	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
	{ NULL }
};

/* include . and .. in the calculation */
#define	NROOT_ENTRIES	((sizeof (zfsctl_root_entries) / \
    sizeof (gfs_dirent_t)) + 1)


/*
 * Initialize the various GFS pieces we'll need to create and manipulate .zfs
 * directories.  This is called from the ZFS init routine, and initializes the
 * vnode ops vectors that we'll be using.
 */
void
zfsctl_init(void)
{
	VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
}

void
zfsctl_fini(void)
{
	/*
	 * Remove zfsctl vnode ops.
	 */
	if (zfsctl_ops_root)
		vn_freevnodeops(zfsctl_ops_root);
	if (zfsctl_ops_snapdir)
		vn_freevnodeops(zfsctl_ops_snapdir);
	if (zfsctl_ops_snapshot)
		vn_freevnodeops(zfsctl_ops_snapshot);

	zfsctl_ops_root = NULL;
	zfsctl_ops_snapdir = NULL;
	zfsctl_ops_snapshot = NULL;
}

/*
 * Return the inode number associated with the 'snapshot' directory.
 */
/* ARGSUSED */
static ino64_t
zfsctl_root_inode_cb(vnode_t *vp, int index)
{
	ASSERT(index == 0);
	return (ZFSCTL_INO_SNAPDIR);
}

/*
 * Create the '.zfs' directory.  This directory is cached as part of the VFS
 * structure.  This results in a hold on the vfs_t.  The code in zfs_umount()
 * therefore checks against a vfs_count of 2 instead of 1.  This reference
 * is removed when the ctldir is destroyed in the unmount.
 */
void
zfsctl_create(zfsvfs_t *zfsvfs)
{
	vnode_t *vp, *rvp;
	zfsctl_node_t *zcp;

	ASSERT(zfsvfs->z_ctldir == NULL);

	vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
	    zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
	zcp = vp->v_data;
	zcp->zc_id = ZFSCTL_INO_ROOT;

	VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0);
	ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
	VN_RELE(rvp);

	/*
	 * We're only faking the fact that we have a root of a filesystem for
	 * the sake of the GFS interfaces.  Undo the flag manipulation it did
	 * for us.
	 */
	vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT);

	zfsvfs->z_ctldir = vp;
}

/*
 * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
 * There might still be more references if we were force unmounted, but only
 * new zfs_inactive() calls can occur and they don't reference .zfs.
 */
void
zfsctl_destroy(zfsvfs_t *zfsvfs)
{
	VN_RELE(zfsvfs->z_ctldir);
	zfsvfs->z_ctldir = NULL;
}

/*
 * Given a root znode, retrieve the associated .zfs directory.
 * Add a hold to the vnode and return it.
 */
vnode_t *
zfsctl_root(znode_t *zp)
{
	ASSERT(zfs_has_ctldir(zp));
	VN_HOLD(zp->z_zfsvfs->z_ctldir);
	return (zp->z_zfsvfs->z_ctldir);
}

/*
 * Common open routine.  Disallow any write access.
 */
/* ARGSUSED */
static int
zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr)
{
	if (flags & FWRITE)
		return (EACCES);

	return (0);
}

/*
 * Common close routine.  Nothing to do here.
 */
/* ARGSUSED */
static int
zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off,
    cred_t *cr)
{
	return (0);
}

/*
 * Common access routine.  Disallow writes.
 */
/* ARGSUSED */
static int
zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr)
{
	if (mode & VWRITE)
		return (EACCES);

	return (0);
}

/*
 * Common getattr function.  Fill in basic information.
 */
static void
zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
{
	zfsctl_node_t	*zcp = vp->v_data;
	timestruc_t	now;

	vap->va_uid = 0;
	vap->va_gid = 0;
	vap->va_rdev = 0;
	/*
	 * We are a purely virtual object, so we have no
	 * blocksize or allocated blocks.
	 */
	vap->va_blksize = 0;
	vap->va_nblocks = 0;
	vap->va_seq = 0;
	vap->va_fsid = vp->v_vfsp->vfs_dev;
	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
	    S_IROTH | S_IXOTH;
	vap->va_type = VDIR;
	/*
	 * We live in the now (for atime).
	 */
	gethrestime(&now);
	vap->va_atime = now;
	vap->va_mtime = vap->va_ctime = zcp->zc_cmtime;
}

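/*
 * Common fid routine.  Encode the node's object id in a short-form ZFS
 * fid with a generation number of zero.
 */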
static int
zfsctl_common_fid(vnode_t *vp, fid_t *fidp)
{
	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
	zfsctl_node_t	*zcp = vp->v_data;
	uint64_t	object = zcp->zc_id;
	zfid_short_t	*zfid;
	int		i;

	ZFS_ENTER(zfsvfs);

	if (fidp->fid_len < SHORT_FID_LEN) {
		fidp->fid_len = SHORT_FID_LEN;
		ZFS_EXIT(zfsvfs);
		return (ENOSPC);
	}

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = SHORT_FID_LEN;

	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* .zfs znodes always have a generation number of 0 */
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = 0;

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * .zfs inode namespace
 *
 * We need to generate unique inode numbers for all files and directories
 * within the .zfs pseudo-filesystem.  We use the following scheme:
 *
 * 	ENTRY			ZFSCTL_INODE
 * 	.zfs			1
 * 	.zfs/snapshot		2
 * 	.zfs/snapshot/<snap>	objectid(snap)
 */

#define	ZFSCTL_INO_SNAP(id)	(id)

/*
 * Get root directory attributes.
 */
/* ARGSUSED */
static int
zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
{
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;

	ZFS_ENTER(zfsvfs);
	vap->va_nodeid = ZFSCTL_INO_ROOT;
	vap->va_nlink = vap->va_size = NROOT_ENTRIES;

	zfsctl_common_getattr(vp, vap);
	ZFS_EXIT(zfsvfs);

	return (0);
}

/*
 * Special case the handling of "..".
 */
/* ARGSUSED */
int
zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
    int flags, vnode_t *rdir, cred_t *cr)
{
	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
	int err;

	ZFS_ENTER(zfsvfs);

	if (strcmp(nm, "..") == 0) {
		err = VFS_ROOT(dvp->v_vfsp, vpp);
	} else {
		err = gfs_dir_lookup(dvp, nm, vpp);
	}

	ZFS_EXIT(zfsvfs);

	return (err);
}

static const fs_operation_def_t zfsctl_tops_root[] = {
	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_root_getattr }	},
	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
	{ VOPNAME_READDIR,	{ .vop_readdir = gfs_vop_readdir }	},
	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_root_lookup }	},
	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
	{ VOPNAME_INACTIVE,	{ .vop_inactive = gfs_vop_inactive }	},
	{ VOPNAME_FID,		{ .vop_fid = zfsctl_common_fid }	},
	{ NULL }
};

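/*
 * Construct the full "dataset@snapshot" name for the snapshot 'name' of
 * the objset backing 'vp', failing if it does not fit in 'len' bytes.
 */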
static int
zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
{
	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;

	dmu_objset_name(os, zname);
	if (strlen(zname) + 1 + strlen(name) >= len)
		return (ENAMETOOLONG);
	(void) strcat(zname, "@");
	(void) strcat(zname, name);
	return (0);
}

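/*
 * Unmount a mounted snapshot and free its entry in the snapshot directory's
 * AVL tree.  The caller must already hold sd_lock.
 */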
static int
zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr)
{
	zfsctl_snapdir_t *sdp = dvp->v_data;
	zfs_snapentry_t search, *sep;
	avl_index_t where;
	int err;

	ASSERT(MUTEX_HELD(&sdp->sd_lock));

	search.se_name = (char *)name;
	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL)
		return (ENOENT);

	ASSERT(vn_ismntpt(sep->se_root));

	/* this will be dropped by dounmount() */
	if ((err = vn_vfswlock(sep->se_root)) != 0)
		return (err);

	VN_HOLD(sep->se_root);
	err = dounmount(vn_mountedvfs(sep->se_root), force, kcred);
	if (err) {
		VN_RELE(sep->se_root);
		return (err);
	}
	ASSERT(sep->se_root->v_count == 1);
	gfs_vop_inactive(sep->se_root, cr);

	avl_remove(&sdp->sd_snaps, sep);
	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
	kmem_free(sep, sizeof (zfs_snapentry_t));

	return (0);
}


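/*
 * Rename a snapshot entry in place: move it to its new name in the AVL
 * tree and rewrite the tails of the mounted filesystem's mountpoint and
 * resource strings to match.  The caller must already hold sd_lock.
 */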
static void
zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
{
	avl_index_t where;
	vfs_t *vfsp;
	refstr_t *pathref;
	char newpath[MAXNAMELEN];
	char *tail;

	ASSERT(MUTEX_HELD(&sdp->sd_lock));
	ASSERT(sep != NULL);

	vfsp = vn_mountedvfs(sep->se_root);
	ASSERT(vfsp != NULL);

	vfs_lock_wait(vfsp);

	/*
	 * Change the name in the AVL tree.
	 */
	avl_remove(&sdp->sd_snaps, sep);
	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
	(void) strcpy(sep->se_name, nm);
	VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
	avl_insert(&sdp->sd_snaps, sep, where);

	/*
	 * Change the current mountpoint info:
	 * 	- update the tail of the mntpoint path
	 *	- update the tail of the resource path
	 */
	pathref = vfs_getmntpoint(vfsp);
	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
	VERIFY((tail = strrchr(newpath, '/')) != NULL);
	*(tail+1) = '\0';
	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
	(void) strcat(newpath, nm);
	refstr_rele(pathref);
	vfs_setmntpoint(vfsp, newpath);

	pathref = vfs_getresource(vfsp);
	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
	VERIFY((tail = strrchr(newpath, '@')) != NULL);
	*(tail+1) = '\0';
	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
	(void) strcat(newpath, nm);
	refstr_rele(pathref);
	vfs_setresource(vfsp, newpath);

	vfs_unlock(vfsp);
}

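/*
 * Rename entry point for the 'snapshot' directory.  Check privileges,
 * rename the underlying dataset, then update the in-memory state via
 * zfsctl_rename_snap().
 */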
static int
zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
    cred_t *cr)
{
	zfsctl_snapdir_t *sdp = sdvp->v_data;
	zfs_snapentry_t search, *sep;
	avl_index_t where;
	char from[MAXNAMELEN], to[MAXNAMELEN];
	int err;

	err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
	if (err)
		return (err);
	err = zfs_secpolicy_write(from, cr);
	if (err)
		return (err);

	/*
	 * Cannot move snapshots out of the snapdir.
	 */
	if (sdvp != tdvp)
		return (EINVAL);

	if (strcmp(snm, tnm) == 0)
		return (0);

	err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
	if (err)
		return (err);

	mutex_enter(&sdp->sd_lock);

	search.se_name = (char *)snm;
	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
		mutex_exit(&sdp->sd_lock);
		return (ENOENT);
	}

	err = dmu_objset_rename(from, to, B_FALSE);
	if (err == 0)
		zfsctl_rename_snap(sdp, sep, tnm);

	mutex_exit(&sdp->sd_lock);

	return (err);
}

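/*
 * Remove (destroy) a snapshot via the 'snapshot' directory.  Check
 * privileges, unmount the snapshot, then destroy the underlying dataset.
 */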
/* ARGSUSED */
static int
zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
{
	zfsctl_snapdir_t *sdp = dvp->v_data;
	char snapname[MAXNAMELEN];
	int err;

	err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
	if (err)
		return (err);
	err = zfs_secpolicy_write(snapname, cr);
	if (err)
		return (err);

	mutex_enter(&sdp->sd_lock);

	err = zfsctl_unmount_snap(dvp, name, 0, cr);
	if (err) {
		mutex_exit(&sdp->sd_lock);
		return (err);
	}

	err = dmu_objset_destroy(snapname);

	mutex_exit(&sdp->sd_lock);

	return (err);
}

/*
 * Lookup entry point for the 'snapshot' directory.  Try to open the
 * snapshot if it exists, creating the pseudo filesystem vnode as necessary.
 * Perform a mount of the associated dataset on top of the vnode.
 */
/* ARGSUSED */
static int
zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
    int flags, vnode_t *rdir, cred_t *cr)
{
	zfsctl_snapdir_t *sdp = dvp->v_data;
	objset_t *snap;
	char snapname[MAXNAMELEN];
	char *mountpoint;
	zfs_snapentry_t *sep, search;
	struct mounta margs;
	vfs_t *vfsp;
	size_t mountpoint_len;
	avl_index_t where;
	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
	int err;

	ASSERT(dvp->v_type == VDIR);

	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
		return (0);

	/*
	 * If we get a recursive call, that means we got called
	 * from the domount() code while it was trying to look up the
	 * spec (which looks like a local path for zfs).  We need to
	 * add some flag to domount() to tell it not to do this lookup.
	 */
	if (MUTEX_HELD(&sdp->sd_lock))
		return (ENOENT);

	ZFS_ENTER(zfsvfs);

	mutex_enter(&sdp->sd_lock);
	search.se_name = (char *)nm;
	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
		*vpp = sep->se_root;
		VN_HOLD(*vpp);
		err = traverse(vpp);
		if (err) {
			VN_RELE(*vpp);
			*vpp = NULL;
		} else if (*vpp == sep->se_root) {
			/*
			 * The snapshot was unmounted behind our backs,
			 * try to remount it.
			 */
			goto domount;
		}
		mutex_exit(&sdp->sd_lock);
		ZFS_EXIT(zfsvfs);
		return (err);
	}

	/*
	 * The requested snapshot is not currently mounted, look it up.
	 */
	err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
	if (err) {
		mutex_exit(&sdp->sd_lock);
		ZFS_EXIT(zfsvfs);
		return (err);
	}
	if (dmu_objset_open(snapname, DMU_OST_ZFS,
	    DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) {
		mutex_exit(&sdp->sd_lock);
		ZFS_EXIT(zfsvfs);
		return (ENOENT);
	}

	sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
	(void) strcpy(sep->se_name, nm);
	*vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
	avl_insert(&sdp->sd_snaps, sep, where);

	dmu_objset_close(snap);
domount:
	mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) +
	    strlen("/.zfs/snapshot/") + strlen(nm) + 1;
	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
	(void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
	    refstr_value(dvp->v_vfsp->vfs_mntpt), nm);

	margs.spec = snapname;
	margs.dir = mountpoint;
	margs.flags = MS_SYSSPACE | MS_NOMNTTAB;
	margs.fstype = "zfs";
	margs.dataptr = NULL;
	margs.datalen = 0;
	margs.optptr = NULL;
	margs.optlen = 0;

	err = domount("zfs", &margs, *vpp, kcred, &vfsp);
	kmem_free(mountpoint, mountpoint_len);

	if (err == 0) {
		/*
		 * Return the mounted root rather than the covered mount point.
		 */
		VFS_RELE(vfsp);
		err = traverse(vpp);
	}

	if (err == 0) {
		/*
		 * Fix up the root vnode.
		 */
		ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
		VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
		(*vpp)->v_vfsp = zfsvfs->z_vfs;
		(*vpp)->v_flag &= ~VROOT;
	}
	mutex_exit(&sdp->sd_lock);
	ZFS_EXIT(zfsvfs);

	/*
	 * If we had an error, drop our hold on the vnode and
	 * zfsctl_snapshot_inactive() will clean up.
	 */
	if (err) {
		VN_RELE(*vpp);
		*vpp = NULL;
	}
	return (err);
}

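/*
 * Readdir callback for the 'snapshot' directory.  Each directory entry is
 * the name of the next snapshot of this objset, with the snapshot's object
 * id as its inode number.
 */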
/* ARGSUSED */
static int
zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
    offset_t *offp, offset_t *nextp, void *data)
{
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
	char snapname[MAXNAMELEN];
	uint64_t id, cookie;

	ZFS_ENTER(zfsvfs);

	cookie = *offp;
	if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
	    &cookie) == ENOENT) {
		*eofp = 1;
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	(void) strcpy(dp->d_name, snapname);
	dp->d_ino = ZFSCTL_INO_SNAP(id);
	*nextp = cookie;

	ZFS_EXIT(zfsvfs);

	return (0);
}

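/*
 * Create the GFS directory node for '.zfs/snapshot'.  Mounted snapshots are
 * tracked in the sd_snaps AVL tree, protected by sd_lock.
 */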
vnode_t *
zfsctl_mknode_snapdir(vnode_t *pvp)
{
	vnode_t *vp;
	zfsctl_snapdir_t *sdp;

	vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp,
	    zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
	    zfsctl_snapdir_readdir_cb, NULL);
	sdp = vp->v_data;
	sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
	sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
	mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&sdp->sd_snaps, snapentry_compare,
	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
	return (vp);
}

/* ARGSUSED */
static int
zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
{
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
	zfsctl_snapdir_t *sdp = vp->v_data;

	ZFS_ENTER(zfsvfs);
	zfsctl_common_getattr(vp, vap);
	vap->va_nodeid = gfs_file_inode(vp);
	vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
	ZFS_EXIT(zfsvfs);

	return (0);
}

/* ARGSUSED */
static void
zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr)
{
	zfsctl_snapdir_t *sdp = vp->v_data;
	void *private;

	private = gfs_dir_inactive(vp);
	if (private != NULL) {
		ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
		mutex_destroy(&sdp->sd_lock);
		avl_destroy(&sdp->sd_snaps);
		kmem_free(private, sizeof (zfsctl_snapdir_t));
	}
}

static const fs_operation_def_t zfsctl_tops_snapdir[] = {
	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_snapdir_getattr } },
	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
	{ VOPNAME_RENAME,	{ .vop_rename = zfsctl_snapdir_rename }	},
	{ VOPNAME_RMDIR,	{ .vop_rmdir = zfsctl_snapdir_remove }	},
	{ VOPNAME_READDIR,	{ .vop_readdir = gfs_vop_readdir }	},
	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_snapdir_lookup }	},
	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
	{ VOPNAME_INACTIVE,	{ .vop_inactive = zfsctl_snapdir_inactive } },
	{ VOPNAME_FID,		{ .vop_fid = zfsctl_common_fid }	},
	{ NULL }
};

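/*
 * Create the GFS node for an individual snapshot.  This vnode acts only as
 * a mountpoint and is normally covered by the mounted snapshot filesystem.
 */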
static vnode_t *
zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
{
	vnode_t *vp;
	zfsctl_node_t *zcp;

	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
	    zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
	zcp = vp->v_data;
	zcp->zc_id = objset;

	return (vp);
}

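/*
 * Inactive entry point for snapshot mountpoint vnodes.  On the last
 * reference, remove the corresponding entry from the snapshot directory's
 * AVL tree and dispose of the vnode.
 */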
static void
zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr)
{
	zfsctl_snapdir_t *sdp;
	zfs_snapentry_t *sep, *next;
	vnode_t *dvp;

	VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0);
	sdp = dvp->v_data;

	mutex_enter(&sdp->sd_lock);

	if (vp->v_count > 1) {
		mutex_exit(&sdp->sd_lock);
		return;
	}
	ASSERT(!vn_ismntpt(vp));

	sep = avl_first(&sdp->sd_snaps);
	while (sep != NULL) {
		next = AVL_NEXT(&sdp->sd_snaps, sep);

		if (sep->se_root == vp) {
			avl_remove(&sdp->sd_snaps, sep);
			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
			kmem_free(sep, sizeof (zfs_snapentry_t));
			break;
		}
		sep = next;
	}
	ASSERT(sep != NULL);

	mutex_exit(&sdp->sd_lock);
	VN_RELE(dvp);

	/*
	 * Dispose of the vnode for the snapshot mount point.
	 * This is safe to do because once this entry has been removed
	 * from the AVL tree, it can't be found again, so cannot become
	 * "active".  If we look up the same name again we will end up
	 * creating a new vnode.
	 */
	gfs_vop_inactive(vp, cr);
}


/*
 * These VPs should never see the light of day.  They should always
 * be covered.
 */
static const fs_operation_def_t zfsctl_tops_snapshot[] = {
	{ VOPNAME_INACTIVE,	{ .vop_inactive = zfsctl_snapshot_inactive } },
	{ NULL }
};

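/*
 * Given an object set id, find the corresponding mounted snapshot under
 * this filesystem's '.zfs/snapshot' directory and return its zfsvfs_t.
 */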
int
zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	vnode_t *dvp, *vp;
	zfsctl_snapdir_t *sdp;
	zfsctl_node_t *zcp;
	zfs_snapentry_t *sep;
	int error;

	ASSERT(zfsvfs->z_ctldir != NULL);
	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
	    NULL, 0, NULL, kcred);
	if (error != 0)
		return (error);
	sdp = dvp->v_data;

	mutex_enter(&sdp->sd_lock);
	sep = avl_first(&sdp->sd_snaps);
	while (sep != NULL) {
		vp = sep->se_root;
		zcp = vp->v_data;
		if (zcp->zc_id == objsetid)
			break;

		sep = AVL_NEXT(&sdp->sd_snaps, sep);
	}

	if (sep != NULL) {
		VN_HOLD(vp);
		error = traverse(&vp);
		if (error == 0) {
			if (vp == sep->se_root)
				error = EINVAL;
			else
				*zfsvfsp = VTOZ(vp)->z_zfsvfs;
		}
		mutex_exit(&sdp->sd_lock);
		VN_RELE(vp);
	} else {
		error = EINVAL;
		mutex_exit(&sdp->sd_lock);
	}

	VN_RELE(dvp);

	return (error);
}

/*
 * Unmount any snapshots for the given filesystem.  This is called from
 * zfs_umount() - if we have a ctldir, then go through and unmount all the
 * snapshots.
 */
int
zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	vnode_t *dvp, *svp;
	zfsctl_snapdir_t *sdp;
	zfs_snapentry_t *sep, *next;
	int error;

	ASSERT(zfsvfs->z_ctldir != NULL);
	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
	    NULL, 0, NULL, cr);
	if (error != 0)
		return (error);
	sdp = dvp->v_data;

	mutex_enter(&sdp->sd_lock);

	sep = avl_first(&sdp->sd_snaps);
	while (sep != NULL) {
		svp = sep->se_root;
		next = AVL_NEXT(&sdp->sd_snaps, sep);

		/*
		 * If this snapshot is not mounted, then it must
		 * have just been unmounted by somebody else, and
		 * will be cleaned up by zfsctl_snapshot_inactive().
		 */
		if (vn_ismntpt(svp)) {
			if ((error = vn_vfswlock(svp)) != 0)
				goto out;

			VN_HOLD(svp);
			error = dounmount(vn_mountedvfs(svp), fflags, cr);
			if (error) {
				VN_RELE(svp);
				goto out;
			}

			avl_remove(&sdp->sd_snaps, sep);
			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
			kmem_free(sep, sizeof (zfs_snapentry_t));

			/*
			 * We can't use VN_RELE(), as that will try to
			 * invoke zfsctl_snapshot_inactive(), and that
			 * would lead to an attempt to re-grab the sd_lock.
			 */
			ASSERT3U(svp->v_count, ==, 1);
			gfs_vop_inactive(svp, cr);
		}
		sep = next;
	}
out:
	mutex_exit(&sdp->sd_lock);
	VN_RELE(dvp);

	return (error);
}
989