xref: /titanic_41/usr/src/uts/common/fs/zfs/zfs_vfsops.c (revision fa9e4066f08beec538e775443c5be79dd423fcab)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/sysmacros.h>
33 #include <sys/kmem.h>
34 #include <sys/pathname.h>
35 #include <sys/acl.h>
36 #include <sys/vnode.h>
37 #include <sys/vfs.h>
38 #include <sys/mntent.h>
39 #include <sys/mount.h>
40 #include <sys/cmn_err.h>
41 #include "fs/fs_subr.h"
42 #include <sys/zfs_znode.h>
43 #include <sys/zil.h>
44 #include <sys/fs/zfs.h>
45 #include <sys/dmu.h>
46 #include <sys/dsl_prop.h>
47 #include <sys/spa.h>
48 #include <sys/zap.h>
49 #include <sys/varargs.h>
50 #include <sys/policy.h>
51 #include <sys/atomic.h>
52 #include <sys/mkdev.h>
53 #include <sys/modctl.h>
54 #include <sys/zfs_ioctl.h>
55 #include <sys/zfs_ctldir.h>
56 
/* Filesystem type index for ZFS; recorded by zfs_vfsinit(). */
int zfsfstype;
/* VFS operations vector filled in by vfs_setfsops() in zfs_vfsinit(). */
vfsops_t *zfs_vfsops = NULL;
/*
 * Major/minor pair from which zfs_mount() builds a unique dev_t for each
 * mount.  zfs_dev_mtx is held while the pair is advanced.
 */
static major_t	zfs_major;
static minor_t zfs_minor;
static kmutex_t	zfs_dev_mtx;
62 
/*
 * Forward declarations: the VFS entry points referenced by the operation
 * templates below, plus zfs_objset_close(), the teardown helper used by
 * zfs_umount().
 */
static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
static void zfs_freevfs(vfs_t *vfsp);
static void zfs_objset_close(zfsvfs_t *zfsvfs);
70 
/*
 * Operation name / handler pairs passed to vfs_setfsops() by zfs_vfsinit()
 * to build zfs_vfsops.  The list is terminated by a NULL, NULL pair.
 */
static const fs_operation_def_t zfs_vfsops_template[] = {
	VFSNAME_MOUNT, zfs_mount,
	VFSNAME_UNMOUNT, zfs_umount,
	VFSNAME_ROOT, zfs_root,
	VFSNAME_STATVFS, zfs_statvfs,
	VFSNAME_SYNC, (fs_generic_func_p) zfs_sync,
	VFSNAME_VGET, zfs_vget,
	VFSNAME_FREEVFS, (fs_generic_func_p) zfs_freevfs,
	NULL, NULL
};

/*
 * Reduced template that only supplies the FREEVFS handler.
 * NOTE(review): nothing in this file installs it; presumably intended for
 * filesystems that have been forcibly unmounted -- confirm against callers.
 */
static const fs_operation_def_t zfs_vfsops_eio_template[] = {
	VFSNAME_FREEVFS, (fs_generic_func_p) zfs_freevfs,
	NULL, NULL
};
86 
/*
 * Count of active ZFS filesystems.  It is incremented when zfs_mount()
 * succeeds, decremented in zfs_freevfs(), and reported by zfs_busy().
 * This is necessary to prevent our module from being unloaded after a
 * umount -f.
 */
static uint32_t	zfs_active_fs_count = 0;
93 
/* Options cancelled by "noatime" and by "atime" respectively. */
static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };

/*
 * Mount option prototype table for ZFS: xattr, noatime and atime, with the
 * last two listed as cancelling each other.
 */
static mntopt_t mntopts[] = {
	{ MNTOPT_XATTR, NULL, NULL, MO_NODISPLAY|MO_DEFAULT, NULL },
	{ MNTOPT_NOATIME, noatime_cancel, NULL, MO_DEFAULT, NULL },
	{ MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
};

/* Entry count plus table pointer; referenced from the vfsdef_t below. */
static mntopts_t zfs_mntopts = {
	sizeof (mntopts) / sizeof (mntopt_t),
	mntopts
};
107 
108 /*ARGSUSED*/
109 int
110 zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
111 {
112 	/*
113 	 * Data integrity is job one.  We don't want a compromised kernel
114 	 * writing to the storage pool, so we never sync during panic.
115 	 */
116 	if (panicstr)
117 		return (0);
118 
119 	/*
120 	 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
121 	 * to sync metadata, which they would otherwise cache indefinitely.
122 	 * Semantically, the only requirement is that the sync be initiated.
123 	 * The DMU syncs out txgs frequently, so there's nothing to do.
124 	 */
125 	if (flag & SYNC_ATTR)
126 		return (0);
127 
128 	if (vfsp != NULL) {
129 		/*
130 		 * Sync a specific filesystem.
131 		 */
132 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
133 
134 		ZFS_ENTER(zfsvfs);
135 		if (zfsvfs->z_log != NULL)
136 			zil_commit(zfsvfs->z_log, UINT64_MAX, FSYNC);
137 		else
138 			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
139 		ZFS_EXIT(zfsvfs);
140 	} else {
141 		/*
142 		 * Sync all ZFS filesystems.  This is what happens when you
143 		 * run sync(1M).  Unlike other filesystems, ZFS honors the
144 		 * request by waiting for all pools to commit all dirty data.
145 		 */
146 		spa_sync_allpools();
147 	}
148 
149 	return (0);
150 }
151 
152 static void
153 atime_changed_cb(void *arg, uint64_t newval)
154 {
155 	zfsvfs_t *zfsvfs = arg;
156 
157 	if (newval == TRUE) {
158 		zfsvfs->z_atime = TRUE;
159 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
160 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
161 	} else {
162 		zfsvfs->z_atime = FALSE;
163 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
164 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
165 	}
166 }
167 
168 static void
169 blksz_changed_cb(void *arg, uint64_t newval)
170 {
171 	zfsvfs_t *zfsvfs = arg;
172 
173 	if (newval < SPA_MINBLOCKSIZE ||
174 	    newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
175 		newval = SPA_MAXBLOCKSIZE;
176 
177 	zfsvfs->z_max_blksz = newval;
178 	zfsvfs->z_vfs->vfs_bsize = newval;
179 }
180 
181 static void
182 readonly_changed_cb(void *arg, uint64_t newval)
183 {
184 	zfsvfs_t *zfsvfs = arg;
185 
186 	if (newval) {
187 		/* XXX locking on vfs_flag? */
188 		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
189 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
190 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
191 		(void) zfs_delete_thread_target(zfsvfs, 0);
192 	} else {
193 		/* XXX locking on vfs_flag? */
194 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
195 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
196 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
197 		(void) zfs_delete_thread_target(zfsvfs, 1);
198 	}
199 }
200 
201 static void
202 devices_changed_cb(void *arg, uint64_t newval)
203 {
204 	zfsvfs_t *zfsvfs = arg;
205 
206 	if (newval == FALSE) {
207 		zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
208 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
209 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
210 	} else {
211 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
212 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
213 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
214 	}
215 }
216 
217 static void
218 setuid_changed_cb(void *arg, uint64_t newval)
219 {
220 	zfsvfs_t *zfsvfs = arg;
221 
222 	if (newval == FALSE) {
223 		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
224 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
225 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
226 	} else {
227 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
228 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
229 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
230 	}
231 }
232 
233 static void
234 exec_changed_cb(void *arg, uint64_t newval)
235 {
236 	zfsvfs_t *zfsvfs = arg;
237 
238 	if (newval == FALSE) {
239 		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
240 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
241 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
242 	} else {
243 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
244 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
245 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
246 	}
247 }
248 
249 static void
250 snapdir_changed_cb(void *arg, uint64_t newval)
251 {
252 	zfsvfs_t *zfsvfs = arg;
253 
254 	zfsvfs->z_show_ctldir = newval;
255 }
256 
257 static void
258 acl_mode_changed_cb(void *arg, uint64_t newval)
259 {
260 	zfsvfs_t *zfsvfs = arg;
261 
262 	zfsvfs->z_acl_mode = newval;
263 }
264 
265 static void
266 acl_inherit_changed_cb(void *arg, uint64_t newval)
267 {
268 	zfsvfs_t *zfsvfs = arg;
269 
270 	zfsvfs->z_acl_inherit = newval;
271 }
272 
273 /*ARGSUSED*/
274 static int
275 zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
276 {
277 	zfsvfs_t	*zfsvfs = NULL;
278 	znode_t		*zp = NULL;
279 	vnode_t		*vp = NULL;
280 	objset_t	*os = NULL;
281 	struct dsl_dataset *ds;
282 	char		*osname;
283 	uint64_t	readonly, recordsize;
284 	pathname_t	spn;
285 	dev_t		mount_dev;
286 	major_t		new_major;
287 	int		mode;
288 	int		error = 0;
289 	uio_seg_t	fromspace = (uap->flags & MS_SYSSPACE) ?
290 				UIO_SYSSPACE : UIO_USERSPACE;
291 	int		canwrite;
292 
293 	if (mvp->v_type != VDIR)
294 		return (ENOTDIR);
295 
296 	mutex_enter(&mvp->v_lock);
297 	if ((uap->flags & MS_REMOUNT) == 0 &&
298 	    (uap->flags & MS_OVERLAY) == 0 &&
299 	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
300 		mutex_exit(&mvp->v_lock);
301 		return (EBUSY);
302 	}
303 	mutex_exit(&mvp->v_lock);
304 
305 	/*
306 	 * ZFS does not support passing unparsed data in via MS_DATA.
307 	 * Users should use the MS_OPTIONSTR interface; this means
308 	 * that all option parsing is already done and the options struct
309 	 * can be interrogated.
310 	 */
311 	if ((uap->flags & MS_DATA) && uap->datalen > 0)
312 		return (EINVAL);
313 
314 	/*
315 	 * When doing a remount, we simply refresh our temporary properties
316 	 * according to those options set in the current VFS options.
317 	 */
318 	if (uap->flags & MS_REMOUNT) {
319 		zfsvfs = vfsp->vfs_data;
320 
321 		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
322 			readonly_changed_cb(zfsvfs, B_TRUE);
323 		else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
324 			if (dmu_objset_is_snapshot(zfsvfs->z_os))
325 				return (EROFS);
326 			readonly_changed_cb(zfsvfs, B_FALSE);
327 		}
328 
329 		if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
330 			devices_changed_cb(zfsvfs, B_FALSE);
331 			setuid_changed_cb(zfsvfs, B_FALSE);
332 		} else {
333 			if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
334 				devices_changed_cb(zfsvfs, B_FALSE);
335 			else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
336 				devices_changed_cb(zfsvfs, B_TRUE);
337 
338 			if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
339 				setuid_changed_cb(zfsvfs, B_FALSE);
340 			else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
341 				setuid_changed_cb(zfsvfs, B_TRUE);
342 		}
343 
344 		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
345 			exec_changed_cb(zfsvfs, B_FALSE);
346 		else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
347 			exec_changed_cb(zfsvfs, B_TRUE);
348 
349 		return (0);
350 	}
351 
352 	/*
353 	 * Get the objset name (the "special" mount argument).
354 	 */
355 	if (error = pn_get(uap->spec, fromspace, &spn))
356 		return (error);
357 
358 	osname = spn.pn_path;
359 
360 	if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
361 		goto out;
362 
363 	/*
364 	 * Refuse to mount a filesystem if we are in a local zone and the
365 	 * dataset is not visible.
366 	 */
367 	if (!INGLOBALZONE(curproc) &&
368 	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
369 		error = EPERM;
370 		goto out;
371 	}
372 
373 	/*
374 	 * Initialize the zfs-specific filesystem structure.
375 	 * Should probably make this a kmem cache, shuffle fields,
376 	 * and just bzero upto z_hold_mtx[].
377 	 */
378 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
379 	zfsvfs->z_vfs = vfsp;
380 	zfsvfs->z_parent = zfsvfs;
381 	zfsvfs->z_assign = TXG_NOWAIT;
382 	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
383 	zfsvfs->z_show_ctldir = VISIBLE;
384 
385 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
386 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
387 	    offsetof(znode_t, z_link_node));
388 	rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL);
389 
390 	/*
391 	 * Initialize the generic filesystem structure.
392 	 */
393 	vfsp->vfs_bcount = 0;
394 	vfsp->vfs_data = NULL;
395 
396 	/*
397 	 * Create a unique device for the mount.
398 	 */
399 	do {
400 		ASSERT3U(zfs_minor, <=, MAXMIN32);
401 		int start = zfs_minor;
402 		do {
403 			mutex_enter(&zfs_dev_mtx);
404 			zfs_minor++;
405 			if (zfs_minor > MAXMIN32)
406 				zfs_minor = 0;
407 			mount_dev = makedevice(zfs_major, zfs_minor);
408 			mutex_exit(&zfs_dev_mtx);
409 		} while (vfs_devismounted(mount_dev) && zfs_minor != start);
410 		if (zfs_minor == start) {
411 			/*
412 			 * We are using all ~262,000 minor numbers
413 			 * for the current major number.  Create a
414 			 * new major number.
415 			 */
416 			if ((new_major = getudev()) == (major_t)-1) {
417 				cmn_err(CE_WARN,
418 				    "zfs_mount: Can't get unique"
419 				    " major device number.");
420 				goto out;
421 			}
422 			mutex_enter(&zfs_dev_mtx);
423 			zfs_major = new_major;
424 			zfs_minor = 0;
425 			mutex_exit(&zfs_dev_mtx);
426 		} else {
427 			break;
428 		}
429 		/* CONSTANTCONDITION */
430 	} while (1);
431 
432 	ASSERT(vfs_devismounted(mount_dev) == 0);
433 
434 	if (dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL) != 0)
435 		recordsize = SPA_MAXBLOCKSIZE;
436 
437 	vfsp->vfs_dev = mount_dev;
438 	vfsp->vfs_fstype = zfsfstype;
439 	vfsp->vfs_bsize = recordsize;
440 	vfsp->vfs_flag |= VFS_NOTRUNC;
441 	vfsp->vfs_data = zfsvfs;
442 
443 	error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL);
444 	if (error)
445 		goto out;
446 
447 	if (readonly)
448 		mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
449 	else
450 		mode = DS_MODE_PRIMARY;
451 
452 	error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
453 	if (error == EROFS) {
454 		mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
455 		error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
456 		    &zfsvfs->z_os);
457 	}
458 	os = zfsvfs->z_os;
459 
460 	if (error)
461 		goto out;
462 
463 	if (error = zfs_init_fs(zfsvfs, &zp, cr))
464 		goto out;
465 
466 	if (dmu_objset_is_snapshot(os)) {
467 		ASSERT(mode & DS_MODE_READONLY);
468 		atime_changed_cb(zfsvfs, B_FALSE);
469 		readonly_changed_cb(zfsvfs, B_TRUE);
470 		zfsvfs->z_issnap = B_TRUE;
471 	} else {
472 		int do_readonly = FALSE, readonly;
473 		int do_setuid = FALSE, setuid;
474 		int do_exec = FALSE, exec;
475 		int do_devices = FALSE, devices;
476 
477 		/*
478 		 * Start a delete thread running.
479 		 */
480 		(void) zfs_delete_thread_target(zfsvfs, 1);
481 
482 		/*
483 		 * Parse and replay the intent log.
484 		 */
485 		zil_replay(os, zfsvfs, &zfsvfs->z_assign, zfs_replay_vector,
486 		    (void (*)(void *))zfs_delete_wait_empty);
487 
488 		if (!zil_disable)
489 			zfsvfs->z_log = zil_open(os, zfs_get_data);
490 
491 		/*
492 		 * The act of registering our callbacks will destroy any mount
493 		 * options we may have.  In order to enable temporary overrides
494 		 * of mount options, we stash away the current values and
495 		 * restore them after we register the callbacks.
496 		 */
497 		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
498 			readonly = B_TRUE;
499 			do_readonly = B_TRUE;
500 		} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
501 			readonly = B_FALSE;
502 			do_readonly = B_TRUE;
503 		}
504 		if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
505 			devices = B_FALSE;
506 			setuid = B_FALSE;
507 			do_devices = B_TRUE;
508 			do_setuid = B_TRUE;
509 		} else {
510 			if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
511 				devices = B_FALSE;
512 				do_devices = B_TRUE;
513 			} else if (vfs_optionisset(vfsp,
514 			    MNTOPT_DEVICES, NULL)) {
515 				devices = B_TRUE;
516 				do_devices = B_TRUE;
517 			}
518 
519 			if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
520 				setuid = B_FALSE;
521 				do_setuid = B_TRUE;
522 			} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
523 				setuid = B_TRUE;
524 				do_setuid = B_TRUE;
525 			}
526 		}
527 		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
528 			exec = B_FALSE;
529 			do_exec = B_TRUE;
530 		} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
531 			exec = B_TRUE;
532 			do_exec = B_TRUE;
533 		}
534 
535 		/*
536 		 * Register property callbacks.
537 		 */
538 		ds = dmu_objset_ds(os);
539 		VERIFY(dsl_prop_register(ds, "atime", atime_changed_cb,
540 		    zfsvfs) == 0);
541 
542 		VERIFY(dsl_prop_register(ds, "recordsize", blksz_changed_cb,
543 		    zfsvfs) == 0);
544 
545 		VERIFY(dsl_prop_register(ds, "readonly", readonly_changed_cb,
546 		    zfsvfs) == 0);
547 
548 		VERIFY(dsl_prop_register(ds, "devices", devices_changed_cb,
549 		    zfsvfs) == 0);
550 
551 		VERIFY(dsl_prop_register(ds, "setuid", setuid_changed_cb,
552 		    zfsvfs) == 0);
553 
554 		VERIFY(dsl_prop_register(ds, "exec", exec_changed_cb,
555 		    zfsvfs) == 0);
556 
557 		VERIFY(dsl_prop_register(ds, "snapdir", snapdir_changed_cb,
558 		    zfsvfs) == 0);
559 
560 		VERIFY(dsl_prop_register(ds, "aclmode", acl_mode_changed_cb,
561 		    zfsvfs) == 0);
562 
563 		VERIFY(dsl_prop_register(ds, "aclinherit",
564 		    acl_inherit_changed_cb, zfsvfs) == 0);
565 
566 
567 		/*
568 		 * Invoke our callbacks to restore temporary mount options.
569 		 */
570 		if (do_readonly)
571 			readonly_changed_cb(zfsvfs, readonly);
572 		if (do_setuid)
573 			setuid_changed_cb(zfsvfs, setuid);
574 		if (do_exec)
575 			exec_changed_cb(zfsvfs, exec);
576 		if (do_devices)
577 			devices_changed_cb(zfsvfs, devices);
578 	}
579 
580 	vp = ZTOV(zp);
581 	if (!zfsvfs->z_issnap)
582 		zfsctl_create(zfsvfs);
583 out:
584 	if (error) {
585 		if (zp)
586 			VN_RELE(vp);
587 
588 		if (zfsvfs) {
589 			if (os)
590 				dmu_objset_close(os);
591 			kmem_free(zfsvfs, sizeof (zfsvfs_t));
592 		}
593 	} else {
594 		atomic_add_32(&zfs_active_fs_count, 1);
595 		VN_RELE(vp);
596 	}
597 
598 	pn_free(&spn);
599 	return (error);
600 }
601 
602 static int
603 zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp)
604 {
605 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
606 	dmu_objset_stats_t dstats;
607 	dev32_t d32;
608 
609 	ZFS_ENTER(zfsvfs);
610 
611 	dmu_objset_stats(zfsvfs->z_os, &dstats);
612 
613 	/*
614 	 * The underlying storage pool actually uses multiple block sizes.
615 	 * We report the fragsize as the smallest block size we support,
616 	 * and we report our blocksize as the filesystem's maximum blocksize.
617 	 */
618 	statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
619 	statp->f_bsize = zfsvfs->z_max_blksz;
620 
621 	/*
622 	 * The following report "total" blocks of various kinds in the
623 	 * file system, but reported in terms of f_frsize - the
624 	 * "fragment" size.
625 	 */
626 
627 	statp->f_blocks =
628 	    (dstats.dds_space_refd + dstats.dds_available) >> SPA_MINBLOCKSHIFT;
629 	statp->f_bfree = dstats.dds_available >> SPA_MINBLOCKSHIFT;
630 	statp->f_bavail = statp->f_bfree; /* no root reservation */
631 
632 	/*
633 	 * statvfs() should really be called statufs(), because it assumes
634 	 * static metadata.  ZFS doesn't preallocate files, so the best
635 	 * we can do is report the max that could possibly fit in f_files,
636 	 * and that minus the number actually used in f_ffree.
637 	 * For f_ffree, report the smaller of the number of object available
638 	 * and the number of blocks (each object will take at least a block).
639 	 */
640 	statp->f_ffree = MIN(dstats.dds_objects_avail, statp->f_bfree);
641 	statp->f_favail = statp->f_ffree;	/* no "root reservation" */
642 	statp->f_files = statp->f_ffree + dstats.dds_objects_used;
643 
644 	(void) cmpldev(&d32, vfsp->vfs_dev);
645 	statp->f_fsid = d32;
646 
647 	/*
648 	 * We're a zfs filesystem.
649 	 */
650 	(void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);
651 
652 	statp->f_flag = 0;
653 
654 	statp->f_namemax = ZFS_MAXNAMELEN;
655 
656 	/*
657 	 * We have all of 32 characters to stuff a string here.
658 	 * Is there anything useful we could/should provide?
659 	 */
660 	bzero(statp->f_fstr, sizeof (statp->f_fstr));
661 
662 	ZFS_EXIT(zfsvfs);
663 	return (0);
664 }
665 
666 static int
667 zfs_root(vfs_t *vfsp, vnode_t **vpp)
668 {
669 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
670 	znode_t *rootzp;
671 	int error;
672 
673 	ZFS_ENTER(zfsvfs);
674 
675 	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
676 	if (error == 0)
677 		*vpp = ZTOV(rootzp);
678 
679 	ZFS_EXIT(zfsvfs);
680 	return (error);
681 }
682 
683 /*ARGSUSED*/
684 static int
685 zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
686 {
687 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
688 	int ret;
689 
690 	if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0)
691 		return (ret);
692 
693 	/*
694 	 * Unmount any snapshots mounted under .zfs before unmounting the
695 	 * dataset itself.
696 	 */
697 	if (zfsvfs->z_ctldir != NULL &&
698 	    (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
699 		return (ret);
700 
701 	if (fflag & MS_FORCE) {
702 		vfsp->vfs_flag |= VFS_UNMOUNTED;
703 		zfsvfs->z_unmounted1 = B_TRUE;
704 
705 		/*
706 		 * Wait for all zfs threads to leave zfs.
707 		 * Grabbing a rwlock as reader in all vops and
708 		 * as writer here doesn't work because it too easy to get
709 		 * multiple reader enters as zfs can re-enter itself.
710 		 * This can lead to deadlock if there is an intervening
711 		 * rw_enter as writer.
712 		 * So a file system threads ref count (z_op_cnt) is used.
713 		 * A polling loop on z_op_cnt may seem inefficient, but
714 		 * - this saves all threads on exit from having to grab a
715 		 *   mutex in order to cv_signal
716 		 * - only occurs on forced unmount in the rare case when
717 		 *   there are outstanding threads within the file system.
718 		 */
719 		while (zfsvfs->z_op_cnt) {
720 			delay(1);
721 		}
722 
723 		zfs_objset_close(zfsvfs);
724 
725 		return (0);
726 	}
727 
728 	zfs_zcache_flush(zfsvfs);
729 
730 	/*
731 	 * Stop all delete threads.
732 	 */
733 	(void) zfs_delete_thread_target(zfsvfs, 0);
734 
735 	/*
736 	 * Check the number of active vnodes in the file system.
737 	 * Our count is maintained in the vfs structure, but the number
738 	 * is off by 1 to indicate a hold on the vfs structure itself.
739 	 *
740 	 * The '.zfs' directory maintains a reference of its own, and any active
741 	 * references underneath are reflected in the vnode count.
742 	 */
743 	if (zfsvfs->z_ctldir == NULL) {
744 		if (vfsp->vfs_count > 1) {
745 			if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0)
746 				(void) zfs_delete_thread_target(zfsvfs, 1);
747 			return (EBUSY);
748 		}
749 	} else {
750 		if (vfsp->vfs_count > 2 ||
751 		    (zfsvfs->z_ctldir->v_count > 1 && !(fflag & MS_FORCE))) {
752 			if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0)
753 				(void) zfs_delete_thread_target(zfsvfs, 1);
754 			return (EBUSY);
755 		}
756 	}
757 
758 	vfsp->vfs_flag |= VFS_UNMOUNTED;
759 	zfs_objset_close(zfsvfs);
760 
761 	/*
762 	 * We can now safely destroy the '.zfs' directory node, which will
763 	 * release its hold on the vfs_t.
764 	 */
765 	if (zfsvfs->z_ctldir != NULL)
766 		zfsctl_destroy(zfsvfs);
767 
768 	return (0);
769 }
770 
771 static int
772 zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
773 {
774 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
775 	znode_t		*zp;
776 	uint64_t	object = 0;
777 	uint64_t	fid_gen = 0;
778 	uint64_t	gen_mask;
779 	uint64_t	zp_gen;
780 	int 		i, err;
781 
782 	*vpp = NULL;
783 
784 	ZFS_ENTER(zfsvfs);
785 
786 	if (fidp->fid_len == LONG_FID_LEN) {
787 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
788 		uint64_t	objsetid = 0;
789 		uint64_t	setgen = 0;
790 
791 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
792 			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
793 
794 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
795 			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
796 
797 		ZFS_EXIT(zfsvfs);
798 
799 		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
800 		if (err)
801 			return (EINVAL);
802 		ZFS_ENTER(zfsvfs);
803 	}
804 
805 	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
806 		zfid_short_t	*zfid = (zfid_short_t *)fidp;
807 
808 		for (i = 0; i < sizeof (zfid->zf_object); i++)
809 			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
810 
811 		for (i = 0; i < sizeof (zfid->zf_gen); i++)
812 			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
813 	} else {
814 		ZFS_EXIT(zfsvfs);
815 		return (EINVAL);
816 	}
817 
818 	/* A zero fid_gen means we are in the .zfs control directories */
819 	if (fid_gen == 0 &&
820 	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
821 		*vpp = zfsvfs->z_ctldir;
822 		ASSERT(*vpp != NULL);
823 		if (object == ZFSCTL_INO_SNAPDIR) {
824 			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
825 			    0, NULL, NULL) == 0);
826 		} else {
827 			VN_HOLD(*vpp);
828 		}
829 		ZFS_EXIT(zfsvfs);
830 		return (0);
831 	}
832 
833 	gen_mask = -1ULL >> (64 - 8 * i);
834 
835 	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
836 	if (err = zfs_zget(zfsvfs, object, &zp)) {
837 		ZFS_EXIT(zfsvfs);
838 		return (err);
839 	}
840 	zp_gen = zp->z_phys->zp_gen & gen_mask;
841 	if (zp_gen == 0)
842 		zp_gen = 1;
843 	if (zp->z_reap || zp_gen != fid_gen) {
844 		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
845 		VN_RELE(ZTOV(zp));
846 		ZFS_EXIT(zfsvfs);
847 		return (EINVAL);
848 	}
849 
850 	*vpp = ZTOV(zp);
851 	ZFS_EXIT(zfsvfs);
852 	return (0);
853 }
854 
/*
 * Tear down the objset behind a filesystem that is being unmounted.
 * Called from zfs_umount() on both the forced and the normal path.
 *
 * In order, this stops the delete threads, blocks zfs_inactive via
 * z_um_lock, flushes the znode cache, releases dbuf holds, unregisters
 * the property callbacks (non-snapshots only), waits for the pool to
 * sync, sets z_unmounted2, closes the intent log and finally closes the
 * objset.  The ordering is significant; see the comments at each step.
 */
static void
zfs_objset_close(zfsvfs_t *zfsvfs)
{
	zfs_delete_t	*zd = &zfsvfs->z_delete_head;
	znode_t		*zp, *nextzp;
	objset_t	*os = zfsvfs->z_os;
	struct dsl_dataset *ds;

	/*
	 * Stop all delete threads.
	 */
	(void) zfs_delete_thread_target(zfsvfs, 0);

	/*
	 * For forced unmount, at this point all vops except zfs_inactive
	 * are erroring EIO. We need to now suspend zfs_inactive threads
	 * while we are freeing dbufs before switching zfs_inactive
	 * to use behaviour without a objset.
	 */
	rw_enter(&zfsvfs->z_um_lock, RW_WRITER);

	zfs_zcache_flush(zfsvfs);

	/*
	 * Release all delete in progress znodes
	 * They will be processed when the file system remounts.
	 */
	mutex_enter(&zd->z_mutex);
	while (zp = list_head(&zd->z_znodes)) {
		list_remove(&zd->z_znodes, zp);
		zp->z_dbuf_held = 0;
		dmu_buf_rele(zp->z_dbuf);
	}
	mutex_exit(&zd->z_mutex);

	/*
	 * Release all holds on dbufs
	 * Note, although we have stopped all other vop threads and
	 * zfs_inactive(), the dmu can callback via znode_pageout_func()
	 * which can zfs_znode_free() the znode.
	 * So we lock z_all_znodes; search the list for a held
	 * dbuf; drop the lock (we know zp can't disappear if we hold
	 * a dbuf lock; then regrab the lock and restart.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = nextzp) {
		nextzp = list_next(&zfsvfs->z_all_znodes, zp);
		if (zp->z_dbuf_held) {
			/* dbufs should only be held when force unmounting */
			zp->z_dbuf_held = 0;
			mutex_exit(&zfsvfs->z_znodes_lock);
			dmu_buf_rele(zp->z_dbuf);
			/*
			 * Start again: the list may have changed while the
			 * lock was dropped, so rescan from the head.
			 */
			mutex_enter(&zfsvfs->z_znodes_lock);
			nextzp = list_head(&zfsvfs->z_all_znodes);
		}
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * Unregister properties.  This mirrors the registrations made in
	 * zfs_mount(), which are likewise skipped for snapshots.
	 */
	if (!dmu_objset_is_snapshot(os)) {
		ds = dmu_objset_ds(os);

		VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "aclinherit",
		    acl_inherit_changed_cb, zfsvfs) == 0);
	}

	/*
	 * Make the dmu drop all its dbuf holds so that zfs_inactive
	 * can then safely free znode/vnodes.
	 */
	txg_wait_synced(dmu_objset_pool(os), 0);

	/*
	 * Switch zfs_inactive to behaviour without an objset.
	 * It just tosses cached pages and frees the znode & vnode.
	 * Then re-enable zfs_inactive threads in that new behaviour.
	 */
	zfsvfs->z_unmounted2 = B_TRUE;
	rw_exit(&zfsvfs->z_um_lock); /* re-enable any zfs_inactive threads */

	/*
	 * Close the zil. Can't close the zil while zfs_inactive
	 * threads are blocked as zil_close can call zfs_inactive.
	 */
	if (zfsvfs->z_log) {
		zil_close(zfsvfs->z_log);
		zfsvfs->z_log = NULL;
	}

	/*
	 * Finally close the objset
	 */
	dmu_objset_close(os);

}
977 
978 static void
979 zfs_freevfs(vfs_t *vfsp)
980 {
981 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
982 
983 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
984 
985 	atomic_add_32(&zfs_active_fs_count, -1);
986 }
987 
988 /*
989  * VFS_INIT() initialization.  Note that there is no VFS_FINI(),
990  * so we can't safely do any non-idempotent initialization here.
991  * Leave that to zfs_init() and zfs_fini(), which are called
992  * from the module's _init() and _fini() entry points.
993  */
994 /*ARGSUSED*/
995 static int
996 zfs_vfsinit(int fstype, char *name)
997 {
998 	int error;
999 
1000 	zfsfstype = fstype;
1001 
1002 	/*
1003 	 * Setup vfsops and vnodeops tables.
1004 	 */
1005 	error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
1006 	if (error != 0) {
1007 		cmn_err(CE_WARN, "zfs: bad vfs ops template");
1008 	}
1009 
1010 	error = zfs_create_op_tables();
1011 	if (error) {
1012 		zfs_remove_op_tables();
1013 		cmn_err(CE_WARN, "zfs: bad vnode ops template");
1014 		(void) vfs_freevfsops_by_type(zfsfstype);
1015 		return (error);
1016 	}
1017 
1018 	mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
1019 
1020 	/*
1021 	 * unique major number for all zfs mounts
1022 	 */
1023 	if ((zfs_major = getudev()) == (major_t)-1) {
1024 		cmn_err(CE_WARN,
1025 		    "zfs_vfsinit: Can't get unique device number.");
1026 		zfs_remove_op_tables();
1027 		(void) vfs_freevfsops_by_type(zfsfstype);
1028 		return (error);
1029 	}
1030 	zfs_minor = 0;
1031 
1032 	return (0);
1033 }
1034 
/*
 * Module-level setup: bring up the .zfs control directory structures
 * first, then the znode layer (znode cache, vnode ops, etc.).
 */
void
zfs_init(void)
{
	zfsctl_init();		/* .zfs directory structures */
	zfs_znode_init();	/* znode cache, vnode ops, ... */
}
1048 
/*
 * Module-level teardown: undo zfs_init() by shutting down the .zfs
 * control directory structures and then the znode layer.
 */
void
zfs_fini(void)
{
	zfsctl_fini();		/* .zfs directory structures */
	zfs_znode_fini();	/* znode layer */
}
1055 
1056 int
1057 zfs_busy(void)
1058 {
1059 	return (zfs_active_fs_count != 0);
1060 }
1061 
/*
 * Filesystem definition for ZFS: names the type (MNTTYPE_ZFS), the init
 * routine (zfs_vfsinit), the VSW_* capability flags and the mount option
 * table (zfs_mntopts).
 */
static vfsdef_t vfw = {
	VFSDEF_VERSION,
	MNTTYPE_ZFS,
	zfs_vfsinit,
	VSW_HASPROTO | VSW_CANRWRO | VSW_CANREMOUNT | VSW_VOLATILEDEV,
	&zfs_mntopts
};

/*
 * Module linkage structure for the filesystem, tying mod_fsops and a
 * description string to the definition above.
 */
struct modlfs zfs_modlfs = {
	&mod_fsops, "ZFS filesystem version 1", &vfw
};
1073