xref: /titanic_51/usr/src/uts/common/fs/zfs/zfs_ctldir.c (revision c73ac1a68a2b22039517eb1e61ac88cf026cc3f6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * ZFS control directory (a.k.a. ".zfs")
31  *
32  * This directory provides a common location for all ZFS meta-objects.
33  * Currently, this is only the 'snapshot' directory, but this may expand in the
34  * future.  The elements are built using the GFS primitives, as the hierarchy
35  * does not actually exist on disk.
36  *
37  * For 'snapshot', we don't want to have all snapshots always mounted, because
38  * this would take up a huge amount of space in /etc/mnttab.  We have three
39  * types of objects:
40  *
41  * 	ctldir ------> snapshotdir -------> snapshot
42  *                                             |
43  *                                             |
44  *                                             V
45  *                                         mounted fs
46  *
47  * The 'snapshot' node contains just enough information to lookup '..' and act
48  * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
49  * perform an automount of the underlying filesystem and return the
50  * corresponding vnode.
51  *
52  * All mounts are handled automatically by the kernel, but unmounts are
53  * (currently) handled from user land.  The main reason is that there is no
54  * reliable way to auto-unmount the filesystem when it's "no longer in use".
55  * When the user unmounts a filesystem, we call zfsctl_umount_snapshots(),
56  * which unmounts any snapshots within the snapshot directory.
57  */
58 
59 #include <fs/fs_subr.h>
60 #include <sys/zfs_ctldir.h>
61 #include <sys/zfs_ioctl.h>
62 #include <sys/zfs_vfsops.h>
63 #include <sys/gfs.h>
64 #include <sys/stat.h>
65 #include <sys/dmu.h>
66 #include <sys/mount.h>
67 
/*
 * One entry in a snapshot directory's AVL tree (sd_snaps), keyed by
 * snapshot name.
 */
typedef struct {
	char		*se_name;	/* snapshot name; the AVL sort key */
	vnode_t		*se_root;	/* GFS vnode the snapshot mounts on */
	avl_node_t	se_node;	/* linkage into sd_snaps */
} zfs_snapentry_t;
73 
74 static int
75 snapentry_compare(const void *a, const void *b)
76 {
77 	const zfs_snapentry_t *sa = a;
78 	const zfs_snapentry_t *sb = b;
79 	int ret = strcmp(sa->se_name, sb->se_name);
80 
81 	if (ret < 0)
82 		return (-1);
83 	else if (ret > 0)
84 		return (1);
85 	else
86 		return (0);
87 }
88 
/* Vnode operations vectors, one per node type in the .zfs hierarchy. */
vnodeops_t *zfsctl_ops_root;
vnodeops_t *zfsctl_ops_snapdir;
vnodeops_t *zfsctl_ops_snapshot;

/* Operation templates; defined further down, after the routines they name. */
static const fs_operation_def_t zfsctl_tops_root[];
static const fs_operation_def_t zfsctl_tops_snapdir[];
static const fs_operation_def_t zfsctl_tops_snapshot[];

static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);

/*
 * Table handed to gfs_make_opsvec() by zfsctl_init().  Each row names a node
 * type and pairs its operation template with the address of the matching
 * vnodeops pointer declared above.
 */
static gfs_opsvec_t zfsctl_opsvec[] = {
	{ ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
	{ ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
	{ ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
	{ NULL }
};
106 
/*
 * Private data (v_data) of every .zfs vnode.
 */
typedef struct zfsctl_node {
	/* NOTE(review): presumably must stay first for GFS -- confirm */
	gfs_dir_t	zc_gfs_private;
	/* object id encoded into the file id by zfsctl_common_fid() */
	uint64_t	zc_id;
} zfsctl_node_t;
111 
/*
 * Private data (v_data) of the '.zfs/snapshot' directory vnode.
 */
typedef struct zfsctl_snapdir {
	zfsctl_node_t	sd_node;	/* common .zfs node data */
	kmutex_t	sd_lock;	/* held while sd_snaps is used */
	avl_tree_t	sd_snaps;	/* zfs_snapentry_t's, by name */
} zfsctl_snapdir_t;
117 
/*
 * Root directory elements.  We have only a single static entry, 'snapshot'.
 */
static gfs_dirent_t zfsctl_root_entries[] = {
	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
	{ NULL }
};

/*
 * Number of entries in the root directory, including '.' and '..'.  The
 * array's NULL terminator and the extra 1 stand in for those two, so with
 * one real entry this evaluates to 3.
 */
#define	NROOT_ENTRIES	((sizeof (zfsctl_root_entries) / \
    sizeof (gfs_dirent_t)) + 1)
129 
130 
/*
 * Initialize the various GFS pieces we'll need to create and manipulate .zfs
 * directories.  This is called from the ZFS init routine, and initializes the
 * vnode ops vectors that we'll be using.
 */
void
zfsctl_init(void)
{
	/* Failing to build the ops vectors is fatal. */
	VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
}
141 
142 void
143 zfsctl_fini(void)
144 {
145 	/*
146 	 * Remove vfsctl vnode ops
147 	 */
148 	if (zfsctl_ops_root)
149 		vn_freevnodeops(zfsctl_ops_root);
150 	if (zfsctl_ops_snapdir)
151 		vn_freevnodeops(zfsctl_ops_snapdir);
152 	if (zfsctl_ops_snapshot)
153 		vn_freevnodeops(zfsctl_ops_snapshot);
154 
155 	zfsctl_ops_root = NULL;
156 	zfsctl_ops_snapdir = NULL;
157 	zfsctl_ops_snapshot = NULL;
158 }
159 
/*
 * Return the inode number associated with the 'snapshot' directory.
 *
 * Passed to gfs_root_create() together with zfsctl_root_entries.  That table
 * has a single entry, so the only index accepted here is 0.
 */
/* ARGSUSED */
static ino64_t
zfsctl_root_inode_cb(vnode_t *vp, int index)
{
	ASSERT(index == 0);
	return (ZFSCTL_INO_SNAPDIR);
}
170 
171 /*
172  * Create the '.zfs' directory.  This directory is cached as part of the VFS
173  * structure.  This results in a hold on the vfs_t.  The code in zfs_umount()
174  * therefore checks against a vfs_count of 2 instead of 1.  This reference
175  * is removed when the ctldir is destroyed in the unmount.
176  */
177 void
178 zfsctl_create(zfsvfs_t *zfsvfs)
179 {
180 	vnode_t *vp;
181 	zfsctl_node_t *zcp;
182 
183 	ASSERT(zfsvfs->z_ctldir == NULL);
184 
185 	vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
186 	    zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
187 	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
188 	zcp = vp->v_data;
189 	zcp->zc_id = ZFSCTL_INO_ROOT;
190 
191 	/*
192 	 * We're only faking the fact that we have a root of a filesystem for
193 	 * the sake of the GFS interfaces.  Undo the flag manipulation it did
194 	 * for us.
195 	 */
196 	vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT);
197 
198 	zfsvfs->z_ctldir = vp;
199 }
200 
/*
 * Destroy the '.zfs' directory.  Only called when the filesystem is
 * unmounted, and there are no more references.  Release the vnode,
 * which will release the hold on the vfs structure.
 */
void
zfsctl_destroy(zfsvfs_t *zfsvfs)
{
	/* The cached reference in z_ctldir must be the only one left. */
	ASSERT(zfsvfs->z_ctldir->v_count == 1);
	VN_RELE(zfsvfs->z_ctldir);
	/* Forget the vnode so zfsctl_create() can be called again. */
	zfsvfs->z_ctldir = NULL;
}
213 
214 /*
215  * Given a root znode, retrieve the associated .zfs directory.
216  * Add a hold to the vnode and return it.
217  */
218 vnode_t *
219 zfsctl_root(znode_t *zp)
220 {
221 	ASSERT(zfs_has_ctldir(zp));
222 	VN_HOLD(zp->z_zfsvfs->z_ctldir);
223 	return (zp->z_zfsvfs->z_ctldir);
224 }
225 
226 /*
227  * Common open routine.  Disallow any write access.
228  */
229 /* ARGSUSED */
230 static int
231 zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr)
232 {
233 	if (flags & FWRITE)
234 		return (EACCES);
235 
236 	return (0);
237 }
238 
/*
 * Common close routine.  Nothing to do here; always returns 0.
 *
 * Note that, despite its name, 'vpp' is a plain vnode pointer.
 */
/* ARGSUSED */
static int
zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off,
    cred_t *cr)
{
	return (0);
}
249 
250 /*
251  * Common access routine.  Disallow writes.
252  */
253 /* ARGSUSED */
254 static int
255 zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr)
256 {
257 	if (mode & VWRITE)
258 		return (EACCES);
259 
260 	return (0);
261 }
262 
263 /*
264  * Common getattr function.  Fill in basic information.
265  */
266 static void
267 zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
268 {
269 	timestruc_t now;
270 
271 	vap->va_uid = 0;
272 	vap->va_gid = 0;
273 	vap->va_rdev = 0;
274 	/*
275 	 * We are a purly virtual object, so we have no
276 	 * blocksize or allocated blocks.
277 	 */
278 	vap->va_blksize = 0;
279 	vap->va_nblocks = 0;
280 	vap->va_seq = 0;
281 	vap->va_fsid = vp->v_vfsp->vfs_dev;
282 	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
283 	    S_IROTH | S_IXOTH;
284 	vap->va_type = VDIR;
285 	/*
286 	 * We live in the now.
287 	 */
288 	gethrestime(&now);
289 	vap->va_mtime = vap->va_ctime = vap->va_atime = now;
290 }
291 
292 static int
293 zfsctl_common_fid(vnode_t *vp, fid_t *fidp)
294 {
295 	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
296 	zfsctl_node_t	*zcp = vp->v_data;
297 	uint64_t	object = zcp->zc_id;
298 	zfid_short_t	*zfid;
299 	int		i;
300 
301 	ZFS_ENTER(zfsvfs);
302 
303 	if (fidp->fid_len < SHORT_FID_LEN) {
304 		fidp->fid_len = SHORT_FID_LEN;
305 		return (ENOSPC);
306 	}
307 
308 	zfid = (zfid_short_t *)fidp;
309 
310 	zfid->zf_len = SHORT_FID_LEN;
311 
312 	for (i = 0; i < sizeof (zfid->zf_object); i++)
313 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
314 
315 	/* .zfs znodes always have a generation number of 0 */
316 	for (i = 0; i < sizeof (zfid->zf_gen); i++)
317 		zfid->zf_gen[i] = 0;
318 
319 	ZFS_EXIT(zfsvfs);
320 	return (0);
321 }
322 
/*
 * .zfs inode namespace
 *
 * We need to generate unique inode numbers for all files and directories
 * within the .zfs pseudo-filesystem.  We use the following scheme:
 *
 * 	ENTRY			ZFSCTL_INODE
 * 	.zfs			1
 * 	.zfs/snapshot		2
 * 	.zfs/snapshot/<snap>	objectid(snap)
 */

/* A snapshot's inode number is its object id, unchanged. */
#define	ZFSCTL_INO_SNAP(id)	(id)
336 
337 /*
338  * Get root directory attributes.
339  */
340 /* ARGSUSED */
341 static int
342 zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
343 {
344 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
345 
346 	ZFS_ENTER(zfsvfs);
347 	vap->va_nodeid = ZFSCTL_INO_ROOT;
348 	vap->va_nlink = vap->va_size = NROOT_ENTRIES;
349 
350 	zfsctl_common_getattr(vp, vap);
351 	ZFS_EXIT(zfsvfs);
352 
353 	return (0);
354 }
355 
356 /*
357  * Special case the handling of "..".
358  */
359 /* ARGSUSED */
360 int
361 zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
362     int flags, vnode_t *rdir, cred_t *cr)
363 {
364 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
365 	int err;
366 
367 	ZFS_ENTER(zfsvfs);
368 
369 	if (strcmp(nm, "..") == 0) {
370 		err = VFS_ROOT(dvp->v_vfsp, vpp);
371 	} else {
372 		err = gfs_dir_lookup(dvp, nm, vpp);
373 	}
374 
375 	ZFS_EXIT(zfsvfs);
376 
377 	return (err);
378 }
379 
/*
 * Operation template for the '.zfs' directory, referenced from
 * zfsctl_opsvec.
 */
static const fs_operation_def_t zfsctl_tops_root[] = {
	{ VOPNAME_OPEN,		zfsctl_common_open			},
	{ VOPNAME_CLOSE,	zfsctl_common_close			},
	{ VOPNAME_IOCTL,	fs_inval				},
	{ VOPNAME_GETATTR,	zfsctl_root_getattr			},
	{ VOPNAME_ACCESS,	zfsctl_common_access			},
	{ VOPNAME_READDIR,	gfs_vop_readdir				},
	{ VOPNAME_LOOKUP,	zfsctl_root_lookup			},
	{ VOPNAME_SEEK,		fs_seek					},
	{ VOPNAME_INACTIVE,	(fs_generic_func_p) gfs_vop_inactive	},
	{ VOPNAME_FID,		zfsctl_common_fid			},
	{ NULL }
};
393 
394 static int
395 zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
396 {
397 	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
398 
399 	dmu_objset_name(os, zname);
400 	if (strlen(zname) + 1 + strlen(name) >= len)
401 		return (ENAMETOOLONG);
402 	(void) strcat(zname, "@");
403 	(void) strcat(zname, name);
404 	return (0);
405 }
406 
407 static int
408 zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr)
409 {
410 	zfsctl_snapdir_t *sdp = dvp->v_data;
411 	zfs_snapentry_t search, *sep;
412 	avl_index_t where;
413 	int err;
414 
415 	ASSERT(MUTEX_HELD(&sdp->sd_lock));
416 
417 	search.se_name = (char *)name;
418 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL)
419 		return (ENOENT);
420 
421 	ASSERT(vn_ismntpt(sep->se_root));
422 
423 	/* this will be dropped by dounmount() */
424 	if ((err = vn_vfswlock(sep->se_root)) != 0)
425 		return (err);
426 
427 	VN_HOLD(sep->se_root);
428 	if ((err = dounmount(vn_mountedvfs(sep->se_root), force, kcred)) != 0)
429 		return (err);
430 	ASSERT(sep->se_root->v_count == 1);
431 	gfs_vop_inactive(sep->se_root, cr);
432 
433 	avl_remove(&sdp->sd_snaps, sep);
434 	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
435 	kmem_free(sep, sizeof (zfs_snapentry_t));
436 
437 	return (0);
438 }
439 
440 
441 static void
442 zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
443 {
444 	avl_index_t where;
445 	vfs_t *vfsp;
446 	refstr_t *pathref;
447 	char newpath[MAXNAMELEN];
448 	const char *oldpath;
449 	char *tail;
450 
451 	ASSERT(MUTEX_HELD(&sdp->sd_lock));
452 	ASSERT(sep != NULL);
453 
454 	vfsp = vn_mountedvfs(sep->se_root);
455 	ASSERT(vfsp != NULL);
456 
457 	vfs_lock_wait(vfsp);
458 
459 	/*
460 	 * Change the name in the AVL tree.
461 	 */
462 	avl_remove(&sdp->sd_snaps, sep);
463 	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
464 	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
465 	(void) strcpy(sep->se_name, nm);
466 	VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
467 	avl_insert(&sdp->sd_snaps, sep, where);
468 
469 	/*
470 	 * Change the current mountpoint info:
471 	 * 	- update the tail of the mntpoint path
472 	 *	- update the tail of the resource path
473 	 */
474 	pathref = vfs_getmntpoint(vfsp);
475 	oldpath = refstr_value(pathref);
476 	VERIFY((tail = strrchr(oldpath, '/')) != NULL);
477 	ASSERT((tail - oldpath) + strlen(nm) + 2 < MAXNAMELEN);
478 	(void) strncpy(newpath, oldpath, tail - oldpath + 1);
479 	(void) strcat(newpath, nm);
480 	refstr_rele(pathref);
481 	vfs_setmntpoint(vfsp, newpath);
482 
483 	pathref = vfs_getresource(vfsp);
484 	oldpath = refstr_value(pathref);
485 	VERIFY((tail = strrchr(oldpath, '@')) != NULL);
486 	ASSERT((tail - oldpath) + strlen(nm) + 2 < MAXNAMELEN);
487 	(void) strncpy(newpath, oldpath, tail - oldpath + 1);
488 	(void) strcat(newpath, nm);
489 	refstr_rele(pathref);
490 	vfs_setresource(vfsp, newpath);
491 
492 	vfs_unlock(vfsp);
493 }
494 
495 static int
496 zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
497     cred_t *cr)
498 {
499 	zfsctl_snapdir_t *sdp = sdvp->v_data;
500 	zfs_snapentry_t search, *sep;
501 	avl_index_t where;
502 	char from[MAXNAMELEN], to[MAXNAMELEN];
503 	int err;
504 
505 	err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
506 	if (err)
507 		return (err);
508 	err = zfs_secpolicy_write(from, NULL, cr);
509 	if (err)
510 		return (err);
511 
512 	/*
513 	 * Cannot move snapshots out of the snapdir.
514 	 */
515 	if (sdvp != tdvp)
516 		return (EINVAL);
517 
518 	if (strcmp(snm, tnm) == 0)
519 		return (0);
520 
521 	err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
522 	if (err)
523 		return (err);
524 
525 	mutex_enter(&sdp->sd_lock);
526 
527 	search.se_name = (char *)snm;
528 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
529 		mutex_exit(&sdp->sd_lock);
530 		return (ENOENT);
531 	}
532 
533 	err = dmu_objset_rename(from, to);
534 	if (err == 0)
535 		zfsctl_rename_snap(sdp, sep, tnm);
536 
537 	mutex_exit(&sdp->sd_lock);
538 
539 	return (err);
540 }
541 
542 /* ARGSUSED */
543 static int
544 zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
545 {
546 	zfsctl_snapdir_t *sdp = dvp->v_data;
547 	char snapname[MAXNAMELEN];
548 	int err;
549 
550 	err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
551 	if (err)
552 		return (err);
553 	err = zfs_secpolicy_write(snapname, NULL, cr);
554 	if (err)
555 		return (err);
556 
557 	mutex_enter(&sdp->sd_lock);
558 
559 	err = zfsctl_unmount_snap(dvp, name, 0, cr);
560 	if (err) {
561 		mutex_exit(&sdp->sd_lock);
562 		return (err);
563 	}
564 
565 	err = dmu_objset_destroy(snapname);
566 
567 	mutex_exit(&sdp->sd_lock);
568 
569 	return (err);
570 }
571 
572 /*
573  * Lookup entry point for the 'snapshot' directory.  Try to open the
574  * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
575  * Perform a mount of the associated dataset on top of the vnode.
576  */
577 /* ARGSUSED */
578 static int
579 zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
580     int flags, vnode_t *rdir, cred_t *cr)
581 {
582 	zfsctl_snapdir_t *sdp = dvp->v_data;
583 	objset_t *snap;
584 	char snapname[MAXNAMELEN];
585 	char *mountpoint;
586 	zfs_snapentry_t *sep, search;
587 	struct mounta margs;
588 	vfs_t *vfsp;
589 	size_t mountpoint_len;
590 	avl_index_t where;
591 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
592 	int err;
593 
594 	ASSERT(dvp->v_type == VDIR);
595 
596 	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
597 		return (0);
598 
599 	/*
600 	 * If we get a recursive call, that means we got called
601 	 * from the domount() code while it was trying to look up the
602 	 * spec (which looks like a local path for zfs).  We need to
603 	 * add some flag to domount() to tell it not to do this lookup.
604 	 */
605 	if (MUTEX_HELD(&sdp->sd_lock))
606 		return (ENOENT);
607 
608 	ZFS_ENTER(zfsvfs);
609 
610 	mutex_enter(&sdp->sd_lock);
611 	search.se_name = (char *)nm;
612 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
613 		*vpp = sep->se_root;
614 		VN_HOLD(*vpp);
615 		/*
616 		 * If the snapshot was unmounted behind our backs, remount it.
617 		 */
618 		if (!vn_ismntpt(*vpp))
619 			goto domount;
620 		VERIFY(traverse(vpp) == 0);
621 		mutex_exit(&sdp->sd_lock);
622 		ZFS_EXIT(zfsvfs);
623 		return (0);
624 	}
625 
626 	/*
627 	 * The requested snapshot is not currently mounted, look it up.
628 	 */
629 	err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
630 	if (err) {
631 		mutex_exit(&sdp->sd_lock);
632 		ZFS_EXIT(zfsvfs);
633 		return (err);
634 	}
635 	if (dmu_objset_open(snapname, DMU_OST_ZFS,
636 	    DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) {
637 		mutex_exit(&sdp->sd_lock);
638 		ZFS_EXIT(zfsvfs);
639 		return (ENOENT);
640 	}
641 
642 	sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
643 	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
644 	(void) strcpy(sep->se_name, nm);
645 	*vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
646 	avl_insert(&sdp->sd_snaps, sep, where);
647 
648 	dmu_objset_close(snap);
649 domount:
650 	mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) +
651 	    strlen("/.zfs/snapshot/") + strlen(nm) + 1;
652 	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
653 	(void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
654 	    refstr_value(dvp->v_vfsp->vfs_mntpt), nm);
655 
656 	margs.spec = snapname;
657 	margs.dir = mountpoint;
658 	margs.flags = MS_SYSSPACE | MS_NOMNTTAB;
659 	margs.fstype = "zfs";
660 	margs.dataptr = NULL;
661 	margs.datalen = 0;
662 	margs.optptr = NULL;
663 	margs.optlen = 0;
664 
665 	err = domount("zfs", &margs, *vpp, kcred, &vfsp);
666 	kmem_free(mountpoint, mountpoint_len);
667 
668 	if (err == 0) {
669 		/*
670 		 * Return the mounted root rather than the covered mount point.
671 		 */
672 		VFS_RELE(vfsp);
673 		err = traverse(vpp);
674 	}
675 
676 	if (err == 0) {
677 		/*
678 		 * Fix up the root vnode.
679 		 */
680 		ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
681 		VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
682 		(*vpp)->v_vfsp = zfsvfs->z_vfs;
683 		(*vpp)->v_flag &= ~VROOT;
684 	}
685 	mutex_exit(&sdp->sd_lock);
686 	ZFS_EXIT(zfsvfs);
687 
688 	if (err)
689 		VN_RELE(*vpp);
690 	return (err);
691 }
692 
693 /* ARGSUSED */
694 static int
695 zfsctl_snapdir_readdir_cb(vnode_t *vp, struct dirent64 *dp, int *eofp,
696     offset_t *offp, offset_t *nextp, void *data)
697 {
698 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
699 	char snapname[MAXNAMELEN];
700 	uint64_t id, cookie;
701 
702 	ZFS_ENTER(zfsvfs);
703 
704 	cookie = *offp;
705 	if (dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
706 	    &cookie) == ENOENT) {
707 		*eofp = 1;
708 		ZFS_EXIT(zfsvfs);
709 		return (0);
710 	}
711 
712 	(void) strcpy(dp->d_name, snapname);
713 	dp->d_ino = ZFSCTL_INO_SNAP(id);
714 	*nextp = cookie;
715 
716 	ZFS_EXIT(zfsvfs);
717 
718 	return (0);
719 }
720 
721 vnode_t *
722 zfsctl_mknode_snapdir(vnode_t *pvp)
723 {
724 	vnode_t *vp;
725 	zfsctl_snapdir_t *sdp;
726 
727 	vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp,
728 	    zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
729 	    zfsctl_snapdir_readdir_cb, NULL);
730 	sdp = vp->v_data;
731 	sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
732 	mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
733 	avl_create(&sdp->sd_snaps, snapentry_compare,
734 	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
735 	return (vp);
736 }
737 
738 /* ARGSUSED */
739 static int
740 zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
741 {
742 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
743 	zfsctl_snapdir_t *sdp = vp->v_data;
744 
745 	ZFS_ENTER(zfsvfs);
746 	zfsctl_common_getattr(vp, vap);
747 	vap->va_nodeid = gfs_file_inode(vp);
748 	vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
749 	ZFS_EXIT(zfsvfs);
750 
751 	return (0);
752 }
753 
/*
 * Inactive routine for the 'snapshot' directory.  Tears down the lock and
 * the AVL tree set up by zfsctl_mknode_snapdir(), then hands the vnode to
 * gfs_vop_inactive().  The tree is expected to be empty by now.
 */
static void
zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr)
{
	zfsctl_snapdir_t *sdp = vp->v_data;

	ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
	mutex_destroy(&sdp->sd_lock);
	avl_destroy(&sdp->sd_snaps);
	gfs_vop_inactive(vp, cr);
}
764 
/*
 * Operation template for the '.zfs/snapshot' directory, referenced from
 * zfsctl_opsvec.  Unlike the root template it also provides rename and
 * rmdir, and uses its own getattr, lookup and inactive routines.
 */
static const fs_operation_def_t zfsctl_tops_snapdir[] = {
	{ VOPNAME_OPEN,		zfsctl_common_open			},
	{ VOPNAME_CLOSE,	zfsctl_common_close			},
	{ VOPNAME_IOCTL,	fs_inval				},
	{ VOPNAME_GETATTR,	zfsctl_snapdir_getattr			},
	{ VOPNAME_ACCESS,	zfsctl_common_access			},
	{ VOPNAME_RENAME,	zfsctl_snapdir_rename			},
	{ VOPNAME_RMDIR,	zfsctl_snapdir_remove			},
	{ VOPNAME_READDIR,	gfs_vop_readdir				},
	{ VOPNAME_LOOKUP,	zfsctl_snapdir_lookup			},
	{ VOPNAME_SEEK,		fs_seek					},
	{ VOPNAME_INACTIVE,	(fs_generic_func_p) zfsctl_snapdir_inactive },
	{ VOPNAME_FID,		zfsctl_common_fid			},
	{ NULL }
};
780 
781 static vnode_t *
782 zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
783 {
784 	vnode_t *vp;
785 	zfsctl_node_t *zcp;
786 
787 	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
788 	    zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
789 	zcp = vp->v_data;
790 	zcp->zc_id = objset;
791 
792 	return (vp);
793 }
794 
795 static void
796 zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr)
797 {
798 	zfsctl_snapdir_t *sdp;
799 	zfs_snapentry_t *sep, *next;
800 	vnode_t *dvp;
801 
802 	VERIFY(gfs_dir_lookup(vp, "..", &dvp) == 0);
803 	sdp = dvp->v_data;
804 
805 	mutex_enter(&sdp->sd_lock);
806 
807 	if (vp->v_count > 1) {
808 		mutex_exit(&sdp->sd_lock);
809 		return;
810 	}
811 	ASSERT(!vn_ismntpt(vp));
812 
813 	sep = avl_first(&sdp->sd_snaps);
814 	while (sep != NULL) {
815 		next = AVL_NEXT(&sdp->sd_snaps, sep);
816 
817 		if (sep->se_root == vp) {
818 			avl_remove(&sdp->sd_snaps, sep);
819 			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
820 			kmem_free(sep, sizeof (zfs_snapentry_t));
821 			break;
822 		}
823 		sep = next;
824 	}
825 	ASSERT(sep != NULL);
826 
827 	mutex_exit(&sdp->sd_lock);
828 	VN_RELE(dvp);
829 
830 	gfs_vop_inactive(vp, cr);
831 }
832 
833 
834 /*
835  * These VP's should never see the light of day.  They should always
836  * be covered.
837  */
838 static const fs_operation_def_t zfsctl_tops_snapshot[] = {
839 	VOPNAME_INACTIVE, (fs_generic_func_p) zfsctl_snapshot_inactive,
840 	NULL, NULL
841 };
842 
843 int
844 zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
845 {
846 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
847 	vnode_t *dvp, *vp;
848 	zfsctl_snapdir_t *sdp;
849 	zfsctl_node_t *zcp;
850 	zfs_snapentry_t *sep;
851 	int error;
852 
853 	ASSERT(zfsvfs->z_ctldir != NULL);
854 	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
855 	    NULL, 0, NULL, kcred);
856 	if (error != 0)
857 		return (error);
858 	sdp = dvp->v_data;
859 
860 	mutex_enter(&sdp->sd_lock);
861 	sep = avl_first(&sdp->sd_snaps);
862 	while (sep != NULL) {
863 		vp = sep->se_root;
864 		zcp = vp->v_data;
865 		if (zcp->zc_id == objsetid)
866 			break;
867 
868 		sep = AVL_NEXT(&sdp->sd_snaps, sep);
869 	}
870 
871 	if (sep != NULL) {
872 		VN_HOLD(vp);
873 		error = traverse(&vp);
874 		if (error == 0)
875 			*zfsvfsp = VTOZ(vp)->z_zfsvfs;
876 		VN_RELE(vp);
877 	} else {
878 		error = EINVAL;
879 	}
880 
881 	mutex_exit(&sdp->sd_lock);
882 	VN_RELE(dvp);
883 
884 	return (error);
885 }
886 
887 /*
888  * Unmount any snapshots for the given filesystem.  This is called from
889  * zfs_umount() - if we have a ctldir, then go through and unmount all the
890  * snapshots.
891  */
892 int
893 zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
894 {
895 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
896 	vnode_t *dvp, *svp;
897 	zfsctl_snapdir_t *sdp;
898 	zfs_snapentry_t *sep, *next;
899 	int error;
900 
901 	ASSERT(zfsvfs->z_ctldir != NULL);
902 	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
903 	    NULL, 0, NULL, cr);
904 	if (error != 0)
905 		return (error);
906 	sdp = dvp->v_data;
907 
908 	mutex_enter(&sdp->sd_lock);
909 
910 	sep = avl_first(&sdp->sd_snaps);
911 	while (sep != NULL) {
912 		svp = sep->se_root;
913 		next = AVL_NEXT(&sdp->sd_snaps, sep);
914 
915 		/*
916 		 * If this snapshot is not mounted, then it must
917 		 * have just been unmounted by somebody else, and
918 		 * will be cleaned up by zfsctl_snapdir_inactive().
919 		 */
920 		if (vn_ismntpt(svp)) {
921 			if ((error = vn_vfswlock(svp)) != 0)
922 				goto out;
923 
924 			VN_HOLD(svp);
925 			error = dounmount(vn_mountedvfs(svp), fflags, cr);
926 			if (error) {
927 				VN_RELE(svp);
928 				goto out;
929 			}
930 
931 			avl_remove(&sdp->sd_snaps, sep);
932 			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
933 			kmem_free(sep, sizeof (zfs_snapentry_t));
934 
935 			/*
936 			 * We can't use VN_RELE(), as that will try to
937 			 * invoke zfsctl_snapdir_inactive(), and that
938 			 * would lead to an attempt to re-grab the sd_lock.
939 			 */
940 			ASSERT3U(svp->v_count, ==, 1);
941 			gfs_vop_inactive(svp, cr);
942 		}
943 		sep = next;
944 	}
945 out:
946 	mutex_exit(&sdp->sd_lock);
947 	VN_RELE(dvp);
948 
949 	return (error);
950 }
951