xref: /freebsd/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c (revision d0abb9a6399accc9053e2808052be00a6754ecef)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
25  * All rights reserved.
26  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
27  * Copyright (c) 2014 Integros [integros.com]
28  * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
29  */
30 
31 /* Portions Copyright 2010 Robert Milkowski */
32 
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/sysmacros.h>
38 #include <sys/kmem.h>
39 #include <sys/acl.h>
40 #include <sys/vnode.h>
41 #include <sys/vfs.h>
42 #include <sys/mntent.h>
43 #include <sys/mount.h>
44 #include <sys/cmn_err.h>
45 #include <sys/zfs_znode.h>
46 #include <sys/zfs_vnops.h>
47 #include <sys/zfs_dir.h>
48 #include <sys/zil.h>
49 #include <sys/fs/zfs.h>
50 #include <sys/dmu.h>
51 #include <sys/dsl_prop.h>
52 #include <sys/dsl_dataset.h>
53 #include <sys/dsl_deleg.h>
54 #include <sys/spa.h>
55 #include <sys/zap.h>
56 #include <sys/sa.h>
57 #include <sys/sa_impl.h>
58 #include <sys/policy.h>
59 #include <sys/atomic.h>
60 #include <sys/zfs_ioctl.h>
61 #include <sys/zfs_ctldir.h>
62 #include <sys/zfs_fuid.h>
63 #include <sys/sunddi.h>
64 #include <sys/dmu_objset.h>
65 #include <sys/dsl_dir.h>
66 #include <sys/jail.h>
67 #include <sys/osd.h>
68 #include <ufs/ufs/quota.h>
69 #include <sys/zfs_quota.h>
70 
71 #include "zfs_comutil.h"
72 
73 #ifndef	MNTK_VMSETSIZE_BUG
74 #define	MNTK_VMSETSIZE_BUG	0
75 #endif
76 #ifndef	MNTK_NOMSYNC
77 #define	MNTK_NOMSYNC	8
78 #endif
79 
/* Mutex guarding ZFS debug output (see zfs_debug_level below). */
struct mtx zfs_debug_mtx;
MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);

/* Root of the vfs.zfs sysctl tree. */
SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");

int zfs_super_owner;
SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
	"File system owners can perform privileged operation on file systems");

int zfs_debug_level;
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
	"Debug level");

/* Per-jail ZFS tunables. */
struct zfs_jailparam {
	int mount_snapshot;	/* allow mounting snapshots under .zfs */
};

/* Defaults for jails that have not set any zfs.* jail parameter. */
static struct zfs_jailparam zfs_jailparam0 = {
	.mount_snapshot = 0,
};

/*
 * Slot index for per-jail data; presumably an OSD slot allocated at
 * module init (allocation not visible in this chunk — confirm).
 */
static int zfs_jailparam_slot;

SYSCTL_JAIL_PARAM_SYS_NODE(zfs, CTLFLAG_RW, "Jail ZFS parameters");
SYSCTL_JAIL_PARAM(_zfs, mount_snapshot, CTLTYPE_INT | CTLFLAG_RW, "I",
	"Allow mounting snapshots in the .zfs directory for unjailed datasets");

/* Read-only version reporting under vfs.zfs.version.*. */
SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
static int zfs_version_acl = ZFS_ACL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
	"ZFS_ACL_VERSION");
static int zfs_version_spa = SPA_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
	"SPA_VERSION");
static int zfs_version_zpl = ZPL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
	"ZPL_VERSION");
117 
/*
 * Forward declarations of the VFS entry points implemented in this
 * file.  The quotactl signature gained an mp_busy argument in
 * FreeBSD __FreeBSD_version 1400018.
 */
#if __FreeBSD_version >= 1400018
static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg,
    bool *mp_busy);
#else
static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
#endif
static int zfs_mount(vfs_t *vfsp);
static int zfs_umount(vfs_t *vfsp, int fflag);
static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
static int zfs_sync(vfs_t *vfsp, int waitfor);
static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int *secflavors);
static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
static void zfs_freevfs(vfs_t *vfsp);

/*
 * VFS operations vector.  Root lookups go through the generic
 * vfs_cache_root(); zfs_root() backs the cache-miss path.
 */
struct vfsops zfs_vfsops = {
	.vfs_mount =		zfs_mount,
	.vfs_unmount =		zfs_umount,
	.vfs_root =		vfs_cache_root,
	.vfs_cachedroot =	zfs_root,
	.vfs_statfs =		zfs_statfs,
	.vfs_vget =		zfs_vget,
	.vfs_sync =		zfs_sync,
	.vfs_checkexp =		zfs_checkexp,
	.vfs_fhtovp =		zfs_fhtovp,
	.vfs_quotactl =		zfs_quotactl,
};

/* Optional VFCF_* capability flags may not exist on older kernels. */
VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL
#ifdef VFCF_CROSS_COPY_FILE_RANGE
	| VFCF_CROSS_COPY_FILE_RANGE
#endif
#ifdef VFCF_FILEREVINC
	| VFCF_FILEREVINC
#endif
);

/*
 * We need to keep a count of active fs's.
 * This is necessary to prevent our module
 * from being unloaded after a umount -f
 */
static uint32_t	zfs_active_fs_count = 0;
163 
164 int
zfs_get_temporary_prop(dsl_dataset_t * ds,zfs_prop_t zfs_prop,uint64_t * val,char * setpoint)165 zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
166     char *setpoint)
167 {
168 	int error;
169 	zfsvfs_t *zfvp;
170 	vfs_t *vfsp;
171 	objset_t *os;
172 	uint64_t tmp = *val;
173 
174 	error = dmu_objset_from_ds(ds, &os);
175 	if (error != 0)
176 		return (error);
177 
178 	error = getzfsvfs_impl(os, &zfvp);
179 	if (error != 0)
180 		return (error);
181 	if (zfvp == NULL)
182 		return (ENOENT);
183 	vfsp = zfvp->z_vfs;
184 	switch (zfs_prop) {
185 	case ZFS_PROP_ATIME:
186 		if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
187 			tmp = 0;
188 		if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
189 			tmp = 1;
190 		break;
191 	case ZFS_PROP_DEVICES:
192 		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
193 			tmp = 0;
194 		if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
195 			tmp = 1;
196 		break;
197 	case ZFS_PROP_EXEC:
198 		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
199 			tmp = 0;
200 		if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
201 			tmp = 1;
202 		break;
203 	case ZFS_PROP_SETUID:
204 		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
205 			tmp = 0;
206 		if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
207 			tmp = 1;
208 		break;
209 	case ZFS_PROP_READONLY:
210 		if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
211 			tmp = 0;
212 		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
213 			tmp = 1;
214 		break;
215 	case ZFS_PROP_XATTR:
216 		if (zfvp->z_flags & ZSB_XATTR)
217 			tmp = zfvp->z_xattr;
218 		break;
219 	case ZFS_PROP_NBMAND:
220 		if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
221 			tmp = 0;
222 		if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
223 			tmp = 1;
224 		break;
225 	default:
226 		vfs_unbusy(vfsp);
227 		return (ENOENT);
228 	}
229 
230 	vfs_unbusy(vfsp);
231 	if (tmp != *val) {
232 		if (setpoint)
233 			(void) strcpy(setpoint, "temporary");
234 		*val = tmp;
235 	}
236 	return (0);
237 }
238 
239 static int
zfs_getquota(zfsvfs_t * zfsvfs,uid_t id,int isgroup,struct dqblk64 * dqp)240 zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
241 {
242 	int error = 0;
243 	char buf[32];
244 	uint64_t usedobj, quotaobj, defaultquota;
245 	uint64_t quota, used = 0;
246 	timespec_t now;
247 
248 	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
249 	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
250 	defaultquota = isgroup ? zfsvfs->z_defaultgroupquota :
251 	    zfsvfs->z_defaultuserquota;
252 
253 	if (zfsvfs->z_replay)
254 		return (ENOENT);
255 
256 	(void) sprintf(buf, "%llx", (longlong_t)id);
257 	if (quotaobj == 0) {
258 		if (defaultquota == 0)
259 			return (ENOENT);
260 		quota = defaultquota;
261 	} else {
262 		error = zap_lookup(zfsvfs->z_os, quotaobj, buf, sizeof (quota),
263 		    1, &quota);
264 		if (error && (quota = defaultquota) == 0)
265 			return (error);
266 	}
267 
268 	/*
269 	 * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit".
270 	 * So we set them to be the same.
271 	 */
272 	dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
273 	error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
274 	if (error == ENOENT)
275 		error = 0;
276 	if (error)
277 		return (error);
278 	dqp->dqb_curblocks = btodb(used);
279 	dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
280 	vfs_timestamp(&now);
281 	/*
282 	 * Setting this to 0 causes FreeBSD quota(8) to print
283 	 * the number of days since the epoch, which isn't
284 	 * particularly useful.
285 	 */
286 	dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
287 	return (error);
288 }
289 
290 static int
291 #if __FreeBSD_version >= 1400018
zfs_quotactl(vfs_t * vfsp,int cmds,uid_t id,void * arg,bool * mp_busy)292 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy)
293 #else
294 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
295 #endif
296 {
297 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
298 	struct thread *td;
299 	int cmd, type, error = 0;
300 	int bitsize;
301 	zfs_userquota_prop_t quota_type;
302 	struct dqblk64 dqblk = { 0 };
303 
304 	td = curthread;
305 	cmd = cmds >> SUBCMDSHIFT;
306 	type = cmds & SUBCMDMASK;
307 
308 	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
309 		return (error);
310 	if (id == -1) {
311 		switch (type) {
312 		case USRQUOTA:
313 			id = td->td_ucred->cr_ruid;
314 			break;
315 		case GRPQUOTA:
316 			id = td->td_ucred->cr_rgid;
317 			break;
318 		default:
319 			error = EINVAL;
320 #if __FreeBSD_version < 1400018
321 			if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
322 				vfs_unbusy(vfsp);
323 #endif
324 			goto done;
325 		}
326 	}
327 	/*
328 	 * Map BSD type to:
329 	 * ZFS_PROP_USERUSED,
330 	 * ZFS_PROP_USERQUOTA,
331 	 * ZFS_PROP_GROUPUSED,
332 	 * ZFS_PROP_GROUPQUOTA
333 	 */
334 	switch (cmd) {
335 	case Q_SETQUOTA:
336 	case Q_SETQUOTA32:
337 		if (type == USRQUOTA)
338 			quota_type = ZFS_PROP_USERQUOTA;
339 		else if (type == GRPQUOTA)
340 			quota_type = ZFS_PROP_GROUPQUOTA;
341 		else
342 			error = EINVAL;
343 		break;
344 	case Q_GETQUOTA:
345 	case Q_GETQUOTA32:
346 		if (type == USRQUOTA)
347 			quota_type = ZFS_PROP_USERUSED;
348 		else if (type == GRPQUOTA)
349 			quota_type = ZFS_PROP_GROUPUSED;
350 		else
351 			error = EINVAL;
352 		break;
353 	}
354 
355 	/*
356 	 * Depending on the cmd, we may need to get
357 	 * the ruid and domain (see fuidstr_to_sid?),
358 	 * the fuid (how?), or other information.
359 	 * Create fuid using zfs_fuid_create(zfsvfs, id,
360 	 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
361 	 * I think I can use just the id?
362 	 *
363 	 * Look at zfs_id_overquota() to look up a quota.
364 	 * zap_lookup(something, quotaobj, fuidstring,
365 	 *     sizeof (long long), 1, &quota)
366 	 *
367 	 * See zfs_set_userquota() to set a quota.
368 	 */
369 	if ((uint32_t)type >= MAXQUOTAS) {
370 		error = EINVAL;
371 		goto done;
372 	}
373 
374 	switch (cmd) {
375 	case Q_GETQUOTASIZE:
376 		bitsize = 64;
377 		error = copyout(&bitsize, arg, sizeof (int));
378 		break;
379 	case Q_QUOTAON:
380 		// As far as I can tell, you can't turn quotas on or off on zfs
381 		error = 0;
382 #if __FreeBSD_version < 1400018
383 		vfs_unbusy(vfsp);
384 #endif
385 		break;
386 	case Q_QUOTAOFF:
387 		error = ENOTSUP;
388 #if __FreeBSD_version < 1400018
389 		vfs_unbusy(vfsp);
390 #endif
391 		break;
392 	case Q_SETQUOTA:
393 		error = copyin(arg, &dqblk, sizeof (dqblk));
394 		if (error == 0)
395 			error = zfs_set_userquota(zfsvfs, quota_type,
396 			    "", id, dbtob(dqblk.dqb_bhardlimit));
397 		break;
398 	case Q_GETQUOTA:
399 		error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
400 		if (error == 0)
401 			error = copyout(&dqblk, arg, sizeof (dqblk));
402 		break;
403 	default:
404 		error = EINVAL;
405 		break;
406 	}
407 done:
408 	zfs_exit(zfsvfs, FTAG);
409 	return (error);
410 }
411 
412 
413 boolean_t
zfs_is_readonly(zfsvfs_t * zfsvfs)414 zfs_is_readonly(zfsvfs_t *zfsvfs)
415 {
416 	return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY));
417 }
418 
419 static int
zfs_sync(vfs_t * vfsp,int waitfor)420 zfs_sync(vfs_t *vfsp, int waitfor)
421 {
422 
423 	/*
424 	 * Data integrity is job one.  We don't want a compromised kernel
425 	 * writing to the storage pool, so we never sync during panic.
426 	 */
427 	if (panicstr)
428 		return (0);
429 
430 	/*
431 	 * Ignore the system syncher.  ZFS already commits async data
432 	 * at zfs_txg_timeout intervals.
433 	 */
434 	if (waitfor == MNT_LAZY)
435 		return (0);
436 
437 	if (vfsp != NULL) {
438 		/*
439 		 * Sync a specific filesystem.
440 		 */
441 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
442 		dsl_pool_t *dp;
443 		int error;
444 
445 		if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
446 			return (error);
447 		dp = dmu_objset_pool(zfsvfs->z_os);
448 
449 		/*
450 		 * If the system is shutting down, then skip any
451 		 * filesystems which may exist on a suspended pool.
452 		 */
453 		if (rebooting && spa_suspended(dp->dp_spa)) {
454 			zfs_exit(zfsvfs, FTAG);
455 			return (0);
456 		}
457 
458 		if (zfsvfs->z_log != NULL) {
459 			error = zil_commit(zfsvfs->z_log, 0);
460 			if (error != 0) {
461 				zfs_exit(zfsvfs, FTAG);
462 				return (error);
463 			}
464 		}
465 
466 		zfs_exit(zfsvfs, FTAG);
467 	} else {
468 		/*
469 		 * Sync all ZFS filesystems.  This is what happens when you
470 		 * run sync(8).  Unlike other filesystems, ZFS honors the
471 		 * request by waiting for all pools to commit all dirty data.
472 		 */
473 		spa_sync_allpools();
474 	}
475 
476 	return (0);
477 }
478 
479 static void
atime_changed_cb(void * arg,uint64_t newval)480 atime_changed_cb(void *arg, uint64_t newval)
481 {
482 	zfsvfs_t *zfsvfs = arg;
483 
484 	if (newval == TRUE) {
485 		zfsvfs->z_atime = TRUE;
486 		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
487 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
488 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
489 	} else {
490 		zfsvfs->z_atime = FALSE;
491 		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
492 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
493 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
494 	}
495 }
496 
497 static void
xattr_changed_cb(void * arg,uint64_t newval)498 xattr_changed_cb(void *arg, uint64_t newval)
499 {
500 	zfsvfs_t *zfsvfs = arg;
501 
502 	if (newval == ZFS_XATTR_OFF) {
503 		zfsvfs->z_flags &= ~ZSB_XATTR;
504 	} else {
505 		zfsvfs->z_flags |= ZSB_XATTR;
506 
507 		if (newval == ZFS_XATTR_SA)
508 			zfsvfs->z_xattr_sa = B_TRUE;
509 		else
510 			zfsvfs->z_xattr_sa = B_FALSE;
511 	}
512 }
513 
514 static void
blksz_changed_cb(void * arg,uint64_t newval)515 blksz_changed_cb(void *arg, uint64_t newval)
516 {
517 	zfsvfs_t *zfsvfs = arg;
518 	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
519 	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
520 	ASSERT(ISP2(newval));
521 
522 	zfsvfs->z_max_blksz = newval;
523 	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
524 }
525 
526 static void
readonly_changed_cb(void * arg,uint64_t newval)527 readonly_changed_cb(void *arg, uint64_t newval)
528 {
529 	zfsvfs_t *zfsvfs = arg;
530 
531 	if (newval) {
532 		/* XXX locking on vfs_flag? */
533 		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
534 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
535 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
536 	} else {
537 		/* XXX locking on vfs_flag? */
538 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
539 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
540 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
541 	}
542 }
543 
544 static void
setuid_changed_cb(void * arg,uint64_t newval)545 setuid_changed_cb(void *arg, uint64_t newval)
546 {
547 	zfsvfs_t *zfsvfs = arg;
548 
549 	if (newval == FALSE) {
550 		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
551 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
552 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
553 	} else {
554 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
555 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
556 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
557 	}
558 }
559 
560 static void
exec_changed_cb(void * arg,uint64_t newval)561 exec_changed_cb(void *arg, uint64_t newval)
562 {
563 	zfsvfs_t *zfsvfs = arg;
564 
565 	if (newval == FALSE) {
566 		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
567 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
568 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
569 	} else {
570 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
571 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
572 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
573 	}
574 }
575 
576 /*
577  * The nbmand mount option can be changed at mount time.
578  * We can't allow it to be toggled on live file systems or incorrect
579  * behavior may be seen from cifs clients
580  *
581  * This property isn't registered via dsl_prop_register(), but this callback
582  * will be called when a file system is first mounted
583  */
584 static void
nbmand_changed_cb(void * arg,uint64_t newval)585 nbmand_changed_cb(void *arg, uint64_t newval)
586 {
587 	zfsvfs_t *zfsvfs = arg;
588 	if (newval == FALSE) {
589 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
590 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
591 	} else {
592 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
593 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
594 	}
595 }
596 
597 static void
snapdir_changed_cb(void * arg,uint64_t newval)598 snapdir_changed_cb(void *arg, uint64_t newval)
599 {
600 	zfsvfs_t *zfsvfs = arg;
601 
602 	zfsvfs->z_show_ctldir = newval;
603 }
604 
605 static void
acl_mode_changed_cb(void * arg,uint64_t newval)606 acl_mode_changed_cb(void *arg, uint64_t newval)
607 {
608 	zfsvfs_t *zfsvfs = arg;
609 
610 	zfsvfs->z_acl_mode = newval;
611 }
612 
613 static void
acl_inherit_changed_cb(void * arg,uint64_t newval)614 acl_inherit_changed_cb(void *arg, uint64_t newval)
615 {
616 	zfsvfs_t *zfsvfs = arg;
617 
618 	zfsvfs->z_acl_inherit = newval;
619 }
620 
621 static void
acl_type_changed_cb(void * arg,uint64_t newval)622 acl_type_changed_cb(void *arg, uint64_t newval)
623 {
624 	zfsvfs_t *zfsvfs = arg;
625 
626 	zfsvfs->z_acl_type = newval;
627 }
628 
629 static void
longname_changed_cb(void * arg,uint64_t newval)630 longname_changed_cb(void *arg, uint64_t newval)
631 {
632 	zfsvfs_t *zfsvfs = arg;
633 
634 	zfsvfs->z_longname = newval;
635 }
636 
637 static int
zfs_register_callbacks(vfs_t * vfsp)638 zfs_register_callbacks(vfs_t *vfsp)
639 {
640 	struct dsl_dataset *ds = NULL;
641 	objset_t *os = NULL;
642 	zfsvfs_t *zfsvfs = NULL;
643 	uint64_t nbmand;
644 	boolean_t readonly = B_FALSE;
645 	boolean_t do_readonly = B_FALSE;
646 	boolean_t setuid = B_FALSE;
647 	boolean_t do_setuid = B_FALSE;
648 	boolean_t exec = B_FALSE;
649 	boolean_t do_exec = B_FALSE;
650 	boolean_t xattr = B_FALSE;
651 	boolean_t atime = B_FALSE;
652 	boolean_t do_atime = B_FALSE;
653 	boolean_t do_xattr = B_FALSE;
654 	int error = 0;
655 
656 	ASSERT3P(vfsp, !=, NULL);
657 	zfsvfs = vfsp->vfs_data;
658 	ASSERT3P(zfsvfs, !=, NULL);
659 	os = zfsvfs->z_os;
660 
661 	/*
662 	 * This function can be called for a snapshot when we update snapshot's
663 	 * mount point, which isn't really supported.
664 	 */
665 	if (dmu_objset_is_snapshot(os))
666 		return (EOPNOTSUPP);
667 
668 	/*
669 	 * The act of registering our callbacks will destroy any mount
670 	 * options we may have.  In order to enable temporary overrides
671 	 * of mount options, we stash away the current values and
672 	 * restore them after we register the callbacks.
673 	 */
674 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
675 	    !spa_writeable(dmu_objset_spa(os))) {
676 		readonly = B_TRUE;
677 		do_readonly = B_TRUE;
678 	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
679 		readonly = B_FALSE;
680 		do_readonly = B_TRUE;
681 	}
682 	if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
683 		setuid = B_FALSE;
684 		do_setuid = B_TRUE;
685 	} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
686 		setuid = B_TRUE;
687 		do_setuid = B_TRUE;
688 	}
689 	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
690 		exec = B_FALSE;
691 		do_exec = B_TRUE;
692 	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
693 		exec = B_TRUE;
694 		do_exec = B_TRUE;
695 	}
696 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
697 		zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF;
698 		do_xattr = B_TRUE;
699 	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
700 		zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
701 		do_xattr = B_TRUE;
702 	} else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) {
703 		zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
704 		do_xattr = B_TRUE;
705 	} else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) {
706 		zfsvfs->z_xattr = xattr = ZFS_XATTR_SA;
707 		do_xattr = B_TRUE;
708 	}
709 	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
710 		atime = B_FALSE;
711 		do_atime = B_TRUE;
712 	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
713 		atime = B_TRUE;
714 		do_atime = B_TRUE;
715 	}
716 
717 	/*
718 	 * We need to enter pool configuration here, so that we can use
719 	 * dsl_prop_get_int_ds() to handle the special nbmand property below.
720 	 * dsl_prop_get_integer() can not be used, because it has to acquire
721 	 * spa_namespace_lock and we can not do that because we already hold
722 	 * z_teardown_lock.  The problem is that spa_write_cachefile() is called
723 	 * with spa_namespace_lock held and the function calls ZFS vnode
724 	 * operations to write the cache file and thus z_teardown_lock is
725 	 * acquired after spa_namespace_lock.
726 	 */
727 	ds = dmu_objset_ds(os);
728 	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
729 
730 	/*
731 	 * nbmand is a special property.  It can only be changed at
732 	 * mount time.
733 	 *
734 	 * This is weird, but it is documented to only be changeable
735 	 * at mount time.
736 	 */
737 	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
738 		nbmand = B_FALSE;
739 	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
740 		nbmand = B_TRUE;
741 	} else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand)) != 0) {
742 		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
743 		return (error);
744 	}
745 
746 	/*
747 	 * Register property callbacks.
748 	 *
749 	 * It would probably be fine to just check for i/o error from
750 	 * the first prop_register(), but I guess I like to go
751 	 * overboard...
752 	 */
753 	error = dsl_prop_register(ds,
754 	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
755 	error = error ? error : dsl_prop_register(ds,
756 	    zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
757 	error = error ? error : dsl_prop_register(ds,
758 	    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
759 	error = error ? error : dsl_prop_register(ds,
760 	    zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
761 	error = error ? error : dsl_prop_register(ds,
762 	    zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
763 	error = error ? error : dsl_prop_register(ds,
764 	    zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
765 	error = error ? error : dsl_prop_register(ds,
766 	    zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
767 	error = error ? error : dsl_prop_register(ds,
768 	    zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs);
769 	error = error ? error : dsl_prop_register(ds,
770 	    zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
771 	error = error ? error : dsl_prop_register(ds,
772 	    zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
773 	    zfsvfs);
774 	error = error ? error : dsl_prop_register(ds,
775 	    zfs_prop_to_name(ZFS_PROP_LONGNAME), longname_changed_cb, zfsvfs);
776 	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
777 	if (error)
778 		goto unregister;
779 
780 	/*
781 	 * Invoke our callbacks to restore temporary mount options.
782 	 */
783 	if (do_readonly)
784 		readonly_changed_cb(zfsvfs, readonly);
785 	if (do_setuid)
786 		setuid_changed_cb(zfsvfs, setuid);
787 	if (do_exec)
788 		exec_changed_cb(zfsvfs, exec);
789 	if (do_xattr)
790 		xattr_changed_cb(zfsvfs, xattr);
791 	if (do_atime)
792 		atime_changed_cb(zfsvfs, atime);
793 
794 	nbmand_changed_cb(zfsvfs, nbmand);
795 
796 	return (0);
797 
798 unregister:
799 	dsl_prop_unregister_all(ds, zfsvfs);
800 	return (error);
801 }
802 
803 /*
804  * Associate this zfsvfs with the given objset, which must be owned.
805  * This will cache a bunch of on-disk state from the objset in the
806  * zfsvfs.
807  */
808 static int
zfsvfs_init(zfsvfs_t * zfsvfs,objset_t * os)809 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
810 {
811 	int error;
812 	uint64_t val;
813 
814 	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
815 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
816 	zfsvfs->z_os = os;
817 
818 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
819 	if (error != 0)
820 		return (error);
821 	if (zfsvfs->z_version >
822 	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
823 		(void) printf("Can't mount a version %lld file system "
824 		    "on a version %lld pool\n. Pool must be upgraded to mount "
825 		    "this file system.", (u_longlong_t)zfsvfs->z_version,
826 		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
827 		return (SET_ERROR(ENOTSUP));
828 	}
829 	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
830 	if (error != 0)
831 		return (error);
832 	zfsvfs->z_norm = (int)val;
833 
834 	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
835 	if (error != 0)
836 		return (error);
837 	zfsvfs->z_utf8 = (val != 0);
838 
839 	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
840 	if (error != 0)
841 		return (error);
842 	zfsvfs->z_case = (uint_t)val;
843 
844 	error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val);
845 	if (error != 0)
846 		return (error);
847 	zfsvfs->z_acl_type = (uint_t)val;
848 
849 	/*
850 	 * Fold case on file systems that are always or sometimes case
851 	 * insensitive.
852 	 */
853 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
854 	    zfsvfs->z_case == ZFS_CASE_MIXED)
855 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
856 
857 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
858 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
859 
860 	uint64_t sa_obj = 0;
861 	if (zfsvfs->z_use_sa) {
862 		/* should either have both of these objects or none */
863 		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
864 		    &sa_obj);
865 		if (error != 0)
866 			return (error);
867 
868 		error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val);
869 		if (error == 0 && val == ZFS_XATTR_SA)
870 			zfsvfs->z_xattr_sa = B_TRUE;
871 	}
872 
873 	error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTUSERQUOTA,
874 	    &zfsvfs->z_defaultuserquota);
875 	if (error != 0)
876 		return (error);
877 
878 	error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTGROUPQUOTA,
879 	    &zfsvfs->z_defaultgroupquota);
880 	if (error != 0)
881 		return (error);
882 
883 	error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTPROJECTQUOTA,
884 	    &zfsvfs->z_defaultprojectquota);
885 	if (error != 0)
886 		return (error);
887 
888 	error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTUSEROBJQUOTA,
889 	    &zfsvfs->z_defaultuserobjquota);
890 	if (error != 0)
891 		return (error);
892 
893 	error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTGROUPOBJQUOTA,
894 	    &zfsvfs->z_defaultgroupobjquota);
895 	if (error != 0)
896 		return (error);
897 
898 	error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTPROJECTOBJQUOTA,
899 	    &zfsvfs->z_defaultprojectobjquota);
900 	if (error != 0)
901 		return (error);
902 
903 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
904 	    &zfsvfs->z_attr_table);
905 	if (error != 0)
906 		return (error);
907 
908 	if (zfsvfs->z_version >= ZPL_VERSION_SA)
909 		sa_register_update_callback(os, zfs_sa_upgrade);
910 
911 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
912 	    &zfsvfs->z_root);
913 	if (error != 0)
914 		return (error);
915 	ASSERT3U(zfsvfs->z_root, !=, 0);
916 
917 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
918 	    &zfsvfs->z_unlinkedobj);
919 	if (error != 0)
920 		return (error);
921 
922 	error = zap_lookup(os, MASTER_NODE_OBJ,
923 	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
924 	    8, 1, &zfsvfs->z_userquota_obj);
925 	if (error == ENOENT)
926 		zfsvfs->z_userquota_obj = 0;
927 	else if (error != 0)
928 		return (error);
929 
930 	error = zap_lookup(os, MASTER_NODE_OBJ,
931 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
932 	    8, 1, &zfsvfs->z_groupquota_obj);
933 	if (error == ENOENT)
934 		zfsvfs->z_groupquota_obj = 0;
935 	else if (error != 0)
936 		return (error);
937 
938 	error = zap_lookup(os, MASTER_NODE_OBJ,
939 	    zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
940 	    8, 1, &zfsvfs->z_projectquota_obj);
941 	if (error == ENOENT)
942 		zfsvfs->z_projectquota_obj = 0;
943 	else if (error != 0)
944 		return (error);
945 
946 	error = zap_lookup(os, MASTER_NODE_OBJ,
947 	    zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
948 	    8, 1, &zfsvfs->z_userobjquota_obj);
949 	if (error == ENOENT)
950 		zfsvfs->z_userobjquota_obj = 0;
951 	else if (error != 0)
952 		return (error);
953 
954 	error = zap_lookup(os, MASTER_NODE_OBJ,
955 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
956 	    8, 1, &zfsvfs->z_groupobjquota_obj);
957 	if (error == ENOENT)
958 		zfsvfs->z_groupobjquota_obj = 0;
959 	else if (error != 0)
960 		return (error);
961 
962 	error = zap_lookup(os, MASTER_NODE_OBJ,
963 	    zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
964 	    8, 1, &zfsvfs->z_projectobjquota_obj);
965 	if (error == ENOENT)
966 		zfsvfs->z_projectobjquota_obj = 0;
967 	else if (error != 0)
968 		return (error);
969 
970 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
971 	    &zfsvfs->z_fuid_obj);
972 	if (error == ENOENT)
973 		zfsvfs->z_fuid_obj = 0;
974 	else if (error != 0)
975 		return (error);
976 
977 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
978 	    &zfsvfs->z_shares_dir);
979 	if (error == ENOENT)
980 		zfsvfs->z_shares_dir = 0;
981 	else if (error != 0)
982 		return (error);
983 
984 	/*
985 	 * Only use the name cache if we are looking for a
986 	 * name on a file system that does not require normalization
987 	 * or case folding.  We can also look there if we happen to be
988 	 * on a non-normalizing, mixed sensitivity file system IF we
989 	 * are looking for the exact name (which is always the case on
990 	 * FreeBSD).
991 	 */
992 	zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
993 	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
994 	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
995 
996 	return (0);
997 }
998 
/*
 * Taskq for deferred zfsvfs work such as draining the unlinked set;
 * presumably created at module load — not visible in this chunk.
 */
taskq_t *zfsvfs_taskq;

/* Task wrapper: drain the unlinked set of the given zfsvfs. */
static void
zfsvfs_task_unlinked_drain(void *context, int pending __unused)
{

	zfs_unlinked_drain((zfsvfs_t *)context);
}
1007 
1008 int
zfsvfs_create(const char * osname,boolean_t readonly,zfsvfs_t ** zfvp)1009 zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
1010 {
1011 	objset_t *os;
1012 	zfsvfs_t *zfsvfs;
1013 	int error;
1014 	boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
1015 
1016 	/*
1017 	 * XXX: Fix struct statfs so this isn't necessary!
1018 	 *
1019 	 * The 'osname' is used as the filesystem's special node, which means
1020 	 * it must fit in statfs.f_mntfromname, or else it can't be
1021 	 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
1022 	 * 'zfs unmount' to think it's not mounted when it is.
1023 	 */
1024 	if (strlen(osname) >= MNAMELEN)
1025 		return (SET_ERROR(ENAMETOOLONG));
1026 
1027 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
1028 
1029 	error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs,
1030 	    &os);
1031 	if (error != 0) {
1032 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
1033 		return (error);
1034 	}
1035 
1036 	error = zfsvfs_create_impl(zfvp, zfsvfs, os);
1037 
1038 	return (error);
1039 }
1040 
1041 
/*
 * Second-stage zfsvfs construction: initialize all locks, lists and the
 * deferred unlinked-drain task, then read the on-disk state via
 * zfsvfs_init().  On failure the objset is disowned and the zfsvfs_t is
 * freed; on success ownership is handed to the caller through *zfvp.
 */
int
zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
{
	int error;

	/* Not attached to a vfs_t yet; 'z_parent' is self for non-snapshots. */
	zfsvfs->z_vfs = NULL;
	zfsvfs->z_parent = zfsvfs;

	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));
	TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
	    zfsvfs_task_unlinked_drain, zfsvfs);
	ZFS_TEARDOWN_INIT(zfsvfs);
	ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs);
	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
	for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);

	error = zfsvfs_init(zfsvfs, os);
	if (error != 0) {
		/*
		 * NOTE(review): the locks initialized above are not
		 * mutex_destroy()'d on this path (zfsvfs_free() is not
		 * called) — presumably harmless on FreeBSD, but confirm.
		 */
		dmu_objset_disown(os, B_TRUE, zfsvfs);
		*zfvp = NULL;
		kmem_free(zfsvfs, sizeof (zfsvfs_t));
		return (error);
	}

	*zfvp = zfsvfs;
	return (0);
}
1073 
/*
 * Finish attaching a zfsvfs to its vfs_t: register property callbacks,
 * open the ZIL (replaying it when needed) and publish the zfsvfs as the
 * objset's user pointer.
 *
 * 'mounting' is B_TRUE for a real mount.  It is B_FALSE when resuming
 * after an online recv/rollback (called from zfs_resume_fs()), in which
 * case the kstats already exist and no replay is necessary.
 *
 * Returns 0 or an errno-style error.
 */
static int
zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
{
	int error;

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
	    dmu_objset_incompatible_encryption_version(zfsvfs->z_os))
		return (SET_ERROR(EROFS));

	error = zfs_register_callbacks(zfsvfs->z_vfs);
	if (error)
		return (error);

	/*
	 * If we are not mounting (ie: online recv), then we don't
	 * have to worry about replaying the log as we blocked all
	 * operations out since we closed the ZIL.
	 */
	if (mounting) {
		boolean_t readonly;

		/* A fresh mount must not have kstats yet. */
		ASSERT0P(zfsvfs->z_kstat.dk_kstats);
		error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
		if (error)
			return (error);
		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
		    &zfsvfs->z_kstat.dk_zil_sums);

		/*
		 * During replay we remove the read only flag to
		 * allow replays to succeed.
		 */
		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
		if (readonly != 0) {
			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
		} else {
			dsl_dir_t *dd;
			zap_stats_t zs;

			/* Publish the unlinked-set size via kstats. */
			if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
			    &zs) == 0) {
				dataset_kstats_update_nunlinks_kstat(
				    &zfsvfs->z_kstat, zs.zs_num_entries);
				dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
				    "num_entries in unlinked set: %llu",
				    (u_longlong_t)zs.zs_num_entries);
			}

			zfs_unlinked_drain(zfsvfs);
			dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
			dd->dd_activity_cancelled = B_FALSE;
		}

		/*
		 * Parse and replay the intent log.
		 *
		 * Because of ziltest, this must be done after
		 * zfs_unlinked_drain().  (Further note: ziltest
		 * doesn't use readonly mounts, where
		 * zfs_unlinked_drain() isn't called.)  This is because
		 * ziltest causes spa_sync() to think it's committed,
		 * but actually it is not, so the intent log contains
		 * many txg's worth of changes.
		 *
		 * In particular, if object N is in the unlinked set in
		 * the last txg to actually sync, then it could be
		 * actually freed in a later txg and then reallocated
		 * in a yet later txg.  This would write a "create
		 * object N" record to the intent log.  Normally, this
		 * would be fine because the spa_sync() would have
		 * written out the fact that object N is free, before
		 * we could write the "create object N" intent log
		 * record.
		 *
		 * But when we are in ziltest mode, we advance the "open
		 * txg" without actually spa_sync()-ing the changes to
		 * disk.  So we would see that object N is still
		 * allocated and in the unlinked set, and there is an
		 * intent log record saying to allocate it.
		 */
		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
			if (zil_replay_disable) {
				zil_destroy(zfsvfs->z_log, B_FALSE);
			} else {
				/* Name cache is disabled during replay. */
				boolean_t use_nc = zfsvfs->z_use_namecache;
				zfsvfs->z_use_namecache = B_FALSE;
				zfsvfs->z_replay = B_TRUE;
				zil_replay(zfsvfs->z_os, zfsvfs,
				    zfs_replay_vector);
				zfsvfs->z_replay = B_FALSE;
				zfsvfs->z_use_namecache = use_nc;
			}
		}

		/* restore readonly bit */
		if (readonly != 0)
			zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
	} else {
		/* Online recv: kstats survive from the original mount. */
		ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL);
		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
		    &zfsvfs->z_kstat.dk_zil_sums);
	}

	/*
	 * Set the objset user_ptr to track its zfsvfs.
	 */
	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);

	return (0);
}
1190 
1191 void
zfsvfs_free(zfsvfs_t * zfsvfs)1192 zfsvfs_free(zfsvfs_t *zfsvfs)
1193 {
1194 	int i;
1195 
1196 	zfs_fuid_destroy(zfsvfs);
1197 
1198 	mutex_destroy(&zfsvfs->z_znodes_lock);
1199 	mutex_destroy(&zfsvfs->z_lock);
1200 	list_destroy(&zfsvfs->z_all_znodes);
1201 	ZFS_TEARDOWN_DESTROY(zfsvfs);
1202 	ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs);
1203 	rw_destroy(&zfsvfs->z_fuid_lock);
1204 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1205 		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1206 	dataset_kstats_destroy(&zfsvfs->z_kstat);
1207 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
1208 }
1209 
/*
 * Latch whether FUIDs and system attributes may be used, based on the
 * filesystem version (USE_FUIDS/USE_SA macros) and the objset.
 */
static void
zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
{
	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
}
1216 
1217 extern int zfs_xattr_compat;
1218 
/*
 * Core of the mount path: create the zfsvfs for 'osname', wire it into
 * 'vfsp' (block sizes, mount flags, fsid), then either mark a snapshot
 * mount read-only or run full zfsvfs_setup().  On failure the objset is
 * disowned and the zfsvfs freed; on success zfs_active_fs_count is
 * incremented.
 */
static int
zfs_domount(vfs_t *vfsp, char *osname)
{
	uint64_t recordsize, fsid_guid;
	int error = 0;
	zfsvfs_t *zfsvfs;

	ASSERT3P(vfsp, !=, NULL);
	ASSERT3P(osname, !=, NULL);

	error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs);
	if (error)
		return (error);
	zfsvfs->z_vfs = vfsp;

	/* Advertise recordsize as the preferred I/O size. */
	if ((error = dsl_prop_get_integer(osname,
	    "recordsize", &recordsize, NULL)))
		goto out;
	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;

	vfsp->vfs_data = zfsvfs;
	vfsp->mnt_flag |= MNT_LOCAL;
	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
	/*
	 * This can cause a loss of coherence between ARC and page cache
	 * on ZoF - unclear if the problem is in FreeBSD or ZoF
	 */
	vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */
	vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
	vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;

#if defined(_KERNEL) && !defined(KMEM_DEBUG)
	vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
#endif
	/*
	 * The fsid is 64 bits, composed of an 8-bit fs type, which
	 * separates our fsid from any other filesystem types, and a
	 * 56-bit objset unique ID.  The objset unique ID is unique to
	 * all objsets open on this system, provided by unique_create().
	 * The 8-bit fs type must be put in the low bits of fsid[1]
	 * because that's where other Solaris filesystems put it.
	 */
	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
	ASSERT3U((fsid_guid & ~((1ULL << 56) - 1)), ==, 0);
	vfsp->vfs_fsid.val[0] = fsid_guid;
	vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) |
	    (vfsp->mnt_vfc->vfc_typenum & 0xFF);

	/*
	 * Set features for file system.
	 */
	zfs_set_fuid_feature(zfsvfs);

	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
		uint64_t pval;

		/* Snapshots are read-only, noatime, and never synced. */
		atime_changed_cb(zfsvfs, B_FALSE);
		readonly_changed_cb(zfsvfs, B_TRUE);
		if ((error = dsl_prop_get_integer(osname,
		    "xattr", &pval, NULL)))
			goto out;
		xattr_changed_cb(zfsvfs, pval);
		if ((error = dsl_prop_get_integer(osname,
		    "acltype", &pval, NULL)))
			goto out;
		acl_type_changed_cb(zfsvfs, pval);
		zfsvfs->z_issnap = B_TRUE;
		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;

		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
	} else {
		if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
			goto out;
	}

#if __FreeBSD_version >= 1500040
	/*
	 * Named attributes can only work if the xattr property is set to
	 * on/dir and not sa.  Also, zfs_xattr_compat must be set.
	 */
	if ((zfsvfs->z_flags & ZSB_XATTR) != 0 && !zfsvfs->z_xattr_sa &&
	    zfs_xattr_compat)
		vfsp->mnt_flag |= MNT_NAMEDATTR;
#endif

	vfs_mountedfrom(vfsp, osname);

	if (!zfsvfs->z_issnap)
		zfsctl_create(zfsvfs);
out:
	if (error) {
		dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
		zfsvfs_free(zfsvfs);
	} else {
		atomic_inc_32(&zfs_active_fs_count);
	}

	return (error);
}
1323 
1324 static void
zfs_unregister_callbacks(zfsvfs_t * zfsvfs)1325 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1326 {
1327 	objset_t *os = zfsvfs->z_os;
1328 
1329 	if (!dmu_objset_is_snapshot(os))
1330 		dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
1331 }
1332 
1333 static int
getpoolname(const char * osname,char * poolname)1334 getpoolname(const char *osname, char *poolname)
1335 {
1336 	char *p;
1337 
1338 	p = strchr(osname, '/');
1339 	if (p == NULL) {
1340 		if (strlen(osname) >= MAXNAMELEN)
1341 			return (ENAMETOOLONG);
1342 		(void) strcpy(poolname, osname);
1343 	} else {
1344 		if (p - osname >= MAXNAMELEN)
1345 			return (ENAMETOOLONG);
1346 		(void) strlcpy(poolname, osname, p - osname + 1);
1347 	}
1348 	return (0);
1349 }
1350 
/*
 * Consume a leading '!' on the dataset name, which requests a
 * checkpoint-rewind import of the pool; the remainder of the name
 * (including its NUL terminator) is shifted down one byte in place.
 */
static void
fetch_osname_options(char *name, bool *checkpointrewind)
{
	*checkpointrewind = (name[0] == '!');
	if (*checkpointrewind)
		memmove(name, name + 1, strlen(name));
}
1362 
/*
 * VFS mount entry point.  Resolves the "from" option to a dataset name,
 * performs privilege/delegation/jail checks, then either refreshes
 * properties (remount), imports the root pool (initial root mount), or
 * calls zfs_domount() for a normal mount.
 */
static int
zfs_mount(vfs_t *vfsp)
{
	kthread_t	*td = curthread;
	vnode_t		*mvp = vfsp->mnt_vnodecovered;
	cred_t		*cr = td->td_ucred;
	char		*osname;
	int		error = 0;
	int		canwrite;
	bool		checkpointrewind, isctlsnap = false;

	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
		return (SET_ERROR(EINVAL));

	/*
	 * If full-owner-access is enabled and delegated administration is
	 * turned on, we must set nosuid.
	 */
	if (zfs_super_owner &&
	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	/* Strip a leading '!' (checkpoint rewind) before using the name. */
	fetch_osname_options(osname, &checkpointrewind);
	/* A snapshot mounted under a .zfs control directory node. */
	isctlsnap = (mvp != NULL && zfsctl_is_node(mvp) &&
	    strchr(osname, '@') != NULL);

	/*
	 * Check for mount privilege?
	 *
	 * If we don't have privilege then see if
	 * we have local permission to allow it
	 */
	error = secpolicy_fs_mount(cr, mvp, vfsp);
	if (error && isctlsnap) {
		secpolicy_fs_mount_clearopts(cr, vfsp);
	} else if (error) {
		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
			goto out;

		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
			vattr_t		vattr;

			/*
			 * Make sure user is the owner of the mount point
			 * or has sufficient privileges.
			 */

			vattr.va_mask = AT_UID;

			vn_lock(mvp, LK_SHARED | LK_RETRY);
			if (VOP_GETATTR(mvp, &vattr, cr)) {
				VOP_UNLOCK(mvp);
				goto out;
			}

			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
				VOP_UNLOCK(mvp);
				goto out;
			}
			VOP_UNLOCK(mvp);
		}

		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	/*
	 * Refuse to mount a filesystem if we are in a local zone and the
	 * dataset is not visible.
	 */
	if (!INGLOBALZONE(curproc) &&
	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
		boolean_t mount_snapshot = B_FALSE;

		/*
		 * Snapshots may be mounted in .zfs for unjailed datasets
		 * if allowed by the jail param zfs.mount_snapshot.
		 */
		if (isctlsnap) {
			struct prison *pr;
			struct zfs_jailparam *zjp;

			pr = curthread->td_ucred->cr_prison;
			mtx_lock(&pr->pr_mtx);
			zjp = osd_jail_get(pr, zfs_jailparam_slot);
			mtx_unlock(&pr->pr_mtx);
			if (zjp && zjp->mount_snapshot)
				mount_snapshot = B_TRUE;
		}
		if (!mount_snapshot) {
			error = SET_ERROR(EPERM);
			goto out;
		}
	}

	vfsp->vfs_flag |= MNT_NFS4ACLS;

	/*
	 * When doing a remount, we simply refresh our temporary properties
	 * according to those options set in the current VFS options.
	 */
	if (vfsp->vfs_flag & MS_REMOUNT) {
		zfsvfs_t *zfsvfs = vfsp->vfs_data;

		/*
		 * Refresh mount options with z_teardown_lock blocking I/O while
		 * the filesystem is in an inconsistent state.
		 * The lock also serializes this code with filesystem
		 * manipulations between entry to zfs_suspend_fs() and return
		 * from zfs_resume_fs().
		 */
		ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
		zfs_unregister_callbacks(zfsvfs);
		error = zfs_register_callbacks(vfsp);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		goto out;
	}

	/* Initial root mount: try hard to import the requested root pool. */
	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
		char pname[MAXNAMELEN];

		error = getpoolname(osname, pname);
		if (error == 0)
			error = spa_import_rootpool(pname, checkpointrewind);
		if (error)
			goto out;
	}
	DROP_GIANT();
	error = zfs_domount(vfsp, osname);
	PICKUP_GIANT();

out:
	return (error);
}
1500 
/*
 * VFS statfs entry point: report space and object counts for the
 * dataset backing 'vfsp', in units of SPA_MINBLOCKSIZE fragments.
 */
static int
zfs_statfs(vfs_t *vfsp, struct statfs *statp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	uint64_t refdbytes, availbytes, usedobjs, availobjs;
	int error;

	statp->f_version = STATFS_VERSION;

	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
		return (error);

	dmu_objset_space(zfsvfs->z_os,
	    &refdbytes, &availbytes, &usedobjs, &availobjs);

	/*
	 * The underlying storage pool actually uses multiple block sizes.
	 * We report the fragsize as the smallest block size we support,
	 * and we report our blocksize as the filesystem's maximum blocksize.
	 */
	statp->f_bsize = SPA_MINBLOCKSIZE;
	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;

	/*
	 * The following report "total" blocks of various kinds in the
	 * file system, but reported in terms of f_frsize - the
	 * "fragment" size.
	 */

	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
	statp->f_bfree = availbytes / statp->f_bsize;
	statp->f_bavail = statp->f_bfree; /* no root reservation */

	/*
	 * statvfs() should really be called statufs(), because it assumes
	 * static metadata.  ZFS doesn't preallocate files, so the best
	 * we can do is report the max that could possibly fit in f_files,
	 * and that minus the number actually used in f_ffree.
	 * For f_ffree, report the smaller of the number of object available
	 * and the number of blocks (each object will take at least a block).
	 */
	statp->f_ffree = MIN(availobjs, statp->f_bfree);
	statp->f_files = statp->f_ffree + usedobjs;

	/*
	 * We're a zfs filesystem.
	 */
	strlcpy(statp->f_fstypename, "zfs",
	    sizeof (statp->f_fstypename));

	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
	    sizeof (statp->f_mntfromname));
	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
	    sizeof (statp->f_mntonname));

	/* Long names require the large ZAP name limit. */
	statp->f_namemax =
	    zfsvfs->z_longname ? (ZAP_MAXNAMELEN_NEW - 1) : (MAXNAMELEN - 1);

	zfs_exit(zfsvfs, FTAG);
	return (0);
}
1562 
1563 static int
zfs_root(vfs_t * vfsp,int flags,vnode_t ** vpp)1564 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
1565 {
1566 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1567 	znode_t *rootzp;
1568 	int error;
1569 
1570 	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
1571 		return (error);
1572 
1573 	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1574 	if (error == 0)
1575 		*vpp = ZTOV(rootzp);
1576 
1577 	zfs_exit(zfsvfs, FTAG);
1578 
1579 	if (error == 0) {
1580 		error = vn_lock(*vpp, flags);
1581 		if (error != 0) {
1582 			VN_RELE(*vpp);
1583 			*vpp = NULL;
1584 		}
1585 	}
1586 	return (error);
1587 }
1588 
1589 /*
1590  * Teardown the zfsvfs::z_os.
1591  *
1592  * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
1593  * and 'z_teardown_inactive_lock' held.
1594  */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
	znode_t	*zp;
	dsl_dir_t *dd;

	/*
	 * If someone has not already unmounted this file system,
	 * drain the zrele_taskq to ensure all active references to the
	 * zfsvfs_t have been handled only then can it be safely destroyed.
	 */
	if (zfsvfs->z_os) {
		/*
		 * If we're unmounting we have to wait for the list to
		 * drain completely.
		 *
		 * If we're not unmounting there's no guarantee the list
		 * will drain completely, but zreles run from the taskq
		 * may add the parents of dir-based xattrs to the taskq
		 * so we want to wait for these.
		 *
		 * We can safely check z_all_znodes for being empty because the
		 * VFS has already blocked operations which add to it.
		 */
		int round = 0;
		while (!list_is_empty(&zfsvfs->z_all_znodes)) {
			taskq_wait_outstanding(dsl_pool_zrele_taskq(
			    dmu_objset_pool(zfsvfs->z_os)), 0);
			/* At most two rounds when not unmounting. */
			if (++round > 1 && !unmounting)
				break;
		}
	}
	ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);

	if (!unmounting) {
		/*
		 * We purge the parent filesystem's vfsp as the parent
		 * filesystem and all of its snapshots have their vnode's
		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
		 * 'z_parent' is self referential for non-snapshots.
		 */
#ifdef FREEBSD_NAMECACHE
		cache_purgevfs(zfsvfs->z_parent->z_vfs);
#endif
	}

	/*
	 * Close the zil. NB: Can't close the zil while zfs_inactive
	 * threads are blocked as zil_close can call zfs_inactive.
	 */
	if (zfsvfs->z_log) {
		zil_close(zfsvfs->z_log);
		zfsvfs->z_log = NULL;
	}

	ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs);

	/*
	 * If we are not unmounting (ie: online recv) and someone already
	 * unmounted this file system while we were doing the switcheroo,
	 * or a reopen of z_os failed then just bail out now.
	 */
	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		return (SET_ERROR(EIO));
	}

	/*
	 * At this point there are no vops active, and any new vops will
	 * fail with EIO since we have z_teardown_lock for writer (only
	 * relevant for forced unmount).
	 *
	 * Release all holds on dbufs.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		if (zp->z_sa_hdl != NULL) {
			zfs_znode_dmu_fini(zp);
		}
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * If we are unmounting, set the unmounted flag and let new vops
	 * unblock.  zfs_inactive will have the unmounted behavior, and all
	 * other vops will fail with EIO.
	 */
	if (unmounting) {
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * z_os will be NULL if there was an error in attempting to reopen
	 * zfsvfs, so just return as the properties had already been
	 * unregistered and cached data had been evicted before.
	 */
	if (zfsvfs->z_os == NULL)
		return (0);

	/*
	 * Unregister properties.
	 */
	zfs_unregister_callbacks(zfsvfs);

	/*
	 * Evict cached data. We must write out any dirty data before
	 * disowning the dataset.
	 */
	objset_t *os = zfsvfs->z_os;
	boolean_t os_dirty = B_FALSE;
	for (int t = 0; t < TXG_SIZE; t++) {
		if (dmu_objset_is_dirty(os, t)) {
			os_dirty = B_TRUE;
			break;
		}
	}
	if (!zfs_is_readonly(zfsvfs) && os_dirty)
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
	dmu_objset_evict_dbufs(zfsvfs->z_os);
	dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
	dsl_dir_cancel_waiters(dd);

	return (0);
}
1723 
/*
 * VFS unmount entry point: check unmount permission (or the mount
 * delegation), unmount .zfs snapshots, flush vnodes, cancel the
 * deferred unlinked-drain task, tear down the zfsvfs, and disown the
 * objset.
 */
static int
zfs_umount(vfs_t *vfsp, int fflag)
{
	kthread_t *td = curthread;
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	objset_t *os;
	cred_t *cr = td->td_ucred;
	int ret;

	/* Lack of privilege is excused by the mount delegation. */
	ret = secpolicy_fs_unmount(cr, vfsp);
	if (ret) {
		if (dsl_deleg_access((char *)vfsp->vfs_resource,
		    ZFS_DELEG_PERM_MOUNT, cr))
			return (ret);
	}

	/*
	 * Unmount any snapshots mounted under .zfs before unmounting the
	 * dataset itself.
	 */
	if (zfsvfs->z_ctldir != NULL) {
		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
			return (ret);
	}

	if (fflag & MS_FORCE) {
		/*
		 * Mark file system as unmounted before calling
		 * vflush(FORCECLOSE). This way we ensure no future vnops
		 * will be called and risk operating on DOOMED vnodes.
		 */
		ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * Flush all the files.
	 */
	ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
	if (ret != 0)
		return (ret);
	/* Cancel the pending drain task, or wait out a running one. */
	while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
	    &zfsvfs->z_unlinked_drain_task, NULL) != 0)
		taskqueue_drain(zfsvfs_taskq->tq_queue,
		    &zfsvfs->z_unlinked_drain_task);

	VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE));
	os = zfsvfs->z_os;

	/*
	 * z_os will be NULL if there was an error in
	 * attempting to reopen zfsvfs.
	 */
	if (os != NULL) {
		/*
		 * Unset the objset user_ptr.
		 */
		mutex_enter(&os->os_user_ptr_lock);
		dmu_objset_set_user(os, NULL);
		mutex_exit(&os->os_user_ptr_lock);

		/*
		 * Finally release the objset
		 */
		dmu_objset_disown(os, B_TRUE, zfsvfs);
	}

	/*
	 * We can now safely destroy the '.zfs' directory node.
	 */
	if (zfsvfs->z_ctldir != NULL)
		zfsctl_destroy(zfsvfs);
	zfs_freevfs(vfsp);

	return (0);
}
1801 
/*
 * VFS vget entry point: translate an inode number into a locked vnode.
 * Virtual .zfs entries get EOPNOTSUPP so NFS falls back to LOOKUP.
 */
static int
zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
{
	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
	znode_t		*zp;
	int 		err;

	/*
	 * zfs_zget() can't operate on virtual entries like .zfs/ or
	 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
	 * This will make NFS to switch to LOOKUP instead of using VGET.
	 */
	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
	    (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
		return (EOPNOTSUPP);

	if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
		return (err);
	err = zfs_zget(zfsvfs, ino, &zp);
	/* Unlinked-but-referenced znodes must not be handed out. */
	if (err == 0 && zp->z_unlinked) {
		vrele(ZTOV(zp));
		err = EINVAL;
	}
	if (err == 0)
		*vpp = ZTOV(zp);
	/* Drop the teardown lock before the potentially sleeping vn_lock(). */
	zfs_exit(zfsvfs, FTAG);
	if (err == 0) {
		err = vn_lock(*vpp, flags);
		if (err != 0)
			vrele(*vpp);
#if __FreeBSD_version >= 1500040
		else if ((zp->z_pflags & ZFS_XATTR) != 0) {
			/* Tag extended-attribute vnodes for named attrs. */
			if ((*vpp)->v_type == VDIR)
				vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
			else
				vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR);
		}
#endif
	}
	if (err != 0)
		*vpp = NULL;
	return (err);
}
1845 
1846 static int
zfs_checkexp(vfs_t * vfsp,struct sockaddr * nam,uint64_t * extflagsp,struct ucred ** credanonp,int * numsecflavors,int * secflavors)1847 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
1848     struct ucred **credanonp, int *numsecflavors, int *secflavors)
1849 {
1850 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1851 
1852 	/*
1853 	 * If this is regular file system vfsp is the same as
1854 	 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
1855 	 * zfsvfs->z_parent->z_vfs represents parent file system
1856 	 * which we have to use here, because only this file system
1857 	 * has mnt_export configured.
1858 	 */
1859 	return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
1860 	    credanonp, numsecflavors, secflavors));
1861 }
1862 
1863 _Static_assert(sizeof (struct fid) >= SHORT_FID_LEN,
1864 	"struct fid bigger than SHORT_FID_LEN");
1865 _Static_assert(sizeof (struct fid) >= LONG_FID_LEN,
1866 	"struct fid bigger than LONG_FID_LEN");
1867 
/*
 * VFS fhtovp entry point: translate an NFS file handle into a locked
 * vnode.  Long FIDs carry an objset id and may redirect us to a
 * snapshot's zfsvfs; FIDs with generation 0 refer to the .zfs control
 * tree.  Regular objects are checked against the stored generation.
 */
static int
zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
{
	struct componentname cn;
	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
	znode_t		*zp;
	vnode_t		*dvp;
	uint64_t	object = 0;
	uint64_t	fid_gen = 0;
	uint64_t	setgen = 0;
	uint64_t	gen_mask;
	uint64_t	zp_gen;
	int 		i, err;

	*vpp = NULL;

	if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
		return (err);

	/*
	 * On FreeBSD we can get snapshot's mount point or its parent file
	 * system mount point depending if snapshot is already mounted or not.
	 */
	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
		uint64_t	objsetid = 0;

		/* Decode the little-endian objset id and generation. */
		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);

		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);

		zfs_exit(zfsvfs, FTAG);

		/* May replace zfsvfs with the snapshot's own zfsvfs. */
		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
		if (err)
			return (SET_ERROR(EINVAL));
		if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
			return (err);
	}

	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
		zfid_short_t	*zfid = (zfid_short_t *)fidp;

		for (i = 0; i < sizeof (zfid->zf_object); i++)
			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);

		for (i = 0; i < sizeof (zfid->zf_gen); i++)
			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
	} else {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	if (fidp->fid_len == LONG_FID_LEN && setgen != 0) {
		zfs_exit(zfsvfs, FTAG);
		dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n",
		    (u_longlong_t)fid_gen, (u_longlong_t)setgen);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
	 * directory tree. If the object == zfsvfs->z_shares_dir, then
	 * we are in the .zfs/shares directory tree.
	 */
	if ((fid_gen == 0 &&
	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
	    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
		zfs_exit(zfsvfs, FTAG);
		VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
		if (object == ZFSCTL_INO_SNAPDIR) {
			cn.cn_nameptr = "snapshot";
			cn.cn_namelen = strlen(cn.cn_nameptr);
			cn.cn_nameiop = LOOKUP;
			cn.cn_flags = ISLASTCN | LOCKLEAF;
			cn.cn_lkflags = flags;
			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
			vput(dvp);
		} else if (object == zfsvfs->z_shares_dir) {
			/*
			 * XXX This branch must not be taken,
			 * if it is, then the lookup below will
			 * explode.
			 */
			cn.cn_nameptr = "shares";
			cn.cn_namelen = strlen(cn.cn_nameptr);
			cn.cn_nameiop = LOOKUP;
			cn.cn_flags = ISLASTCN;
			cn.cn_lkflags = flags;
			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
			vput(dvp);
		} else {
			*vpp = dvp;
		}
		/* err is 0 on every path reaching here. */
		return (err);
	}

	/* 'i' is still sizeof (zf_gen) from the decode loop above. */
	gen_mask = -1ULL >> (64 - 8 * i);

	dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object,
	    (u_longlong_t)fid_gen,
	    (u_longlong_t)gen_mask);
	if ((err = zfs_zget(zfsvfs, object, &zp))) {
		zfs_exit(zfsvfs, FTAG);
		return (err);
	}
	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
	    sizeof (uint64_t));
	zp_gen = zp_gen & gen_mask;
	/* Generation 0 is stored as 1 in the FID. */
	if (zp_gen == 0)
		zp_gen = 1;
	if (zp->z_unlinked || zp_gen != fid_gen) {
		dprintf("znode gen (%llu) != fid gen (%llu)\n",
		    (u_longlong_t)zp_gen, (u_longlong_t)fid_gen);
		vrele(ZTOV(zp));
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	*vpp = ZTOV(zp);
	/* Drop the teardown lock before the potentially sleeping vn_lock(). */
	zfs_exit(zfsvfs, FTAG);
	err = vn_lock(*vpp, flags);
	if (err == 0) {
		vnode_create_vobject(*vpp, zp->z_size, curthread);
#if __FreeBSD_version >= 1500040
		/* Tag extended-attribute vnodes for named attrs. */
		if ((zp->z_pflags & ZFS_XATTR) != 0) {
			if ((*vpp)->v_type == VDIR)
				vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
			else
				vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR);
		}
#endif
	} else
		*vpp = NULL;
	return (err);
}
2006 
2007 /*
2008  * Block out VOPs and close zfsvfs_t::z_os
2009  *
2010  * Note, if successful, then we return with the 'z_teardown_lock' and
2011  * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
2012  * dataset and objset intact so that they can be atomically handed off during
2013  * a subsequent rollback or recv operation and the resume thereafter.
2014  */
2015 int
zfs_suspend_fs(zfsvfs_t * zfsvfs)2016 zfs_suspend_fs(zfsvfs_t *zfsvfs)
2017 {
2018 	int error;
2019 
2020 	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
2021 		return (error);
2022 
2023 	return (0);
2024 }
2025 
2026 /*
2027  * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
2028  * is an invariant across any of the operations that can be performed while the
2029  * filesystem was suspended.  Whether it succeeded or failed, the preconditions
2030  * are the same: the relevant objset and associated dataset are owned by
2031  * zfsvfs, held, and long held on entry.
2032  */
int
zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
	int err;
	znode_t *zp;

	/* Caller (zfs_suspend_fs) left both teardown locks write-held. */
	ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
	ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));

	/*
	 * We already own this, so just update the objset_t, as the one we
	 * had before may have been evicted.
	 */
	objset_t *os;
	VERIFY3P(ds->ds_owner, ==, zfsvfs);
	VERIFY(dsl_dataset_long_held(ds));
	/* Pool config lock must be held to map the dataset to its objset. */
	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
	dsl_pool_config_enter(dp, FTAG);
	VERIFY0(dmu_objset_from_ds(ds, &os));
	dsl_pool_config_exit(dp, FTAG);

	err = zfsvfs_init(zfsvfs, os);
	if (err != 0)
		goto bail;

	ds->ds_dir->dd_activity_cancelled = B_FALSE;
	VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE));

	zfs_set_fuid_feature(zfsvfs);

	/*
	 * Attempt to re-establish all the active znodes with
	 * their dbufs.  If a zfs_rezget() fails, then we'll let
	 * any potential callers discover that via zfs_enter_verify_zp
	 * when they try to use their znode.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		(void) zfs_rezget(zp);
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

bail:
	/* release the VOPs */
	ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
	ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);

	if (err) {
		/*
		 * Since we couldn't setup the sa framework, try to force
		 * unmount this file system.
		 */
		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
			vfs_ref(zfsvfs->z_vfs);
			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
		}
	}
	return (err);
}
2093 
2094 static void
zfs_freevfs(vfs_t * vfsp)2095 zfs_freevfs(vfs_t *vfsp)
2096 {
2097 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
2098 
2099 	zfsvfs_free(zfsvfs);
2100 
2101 	atomic_dec_32(&zfs_active_fs_count);
2102 }
2103 
2104 #ifdef __i386__
2105 static int desiredvnodes_backup;
2106 #include <sys/vmmeter.h>
2107 
2108 
2109 #include <vm/vm_page.h>
2110 #include <vm/vm_object.h>
2111 #include <vm/vm_kern.h>
2112 #include <vm/vm_map.h>
2113 #endif
2114 
/*
 * On i386, shrink the kernel's desiredvnodes: the default is sized with
 * UFS inodes in mind and is too large for ZFS.  No-op elsewhere.
 */
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
	int computed;

	/* Remember the current value so zfs_vnodes_adjust_back() can restore it. */
	desiredvnodes_backup = desiredvnodes;

	/*
	 * Recompute the value the same way vntblinit() does.  If it still
	 * equals desiredvnodes, the administrator has not tuned it and it
	 * is safe to scale it down to 3/4.
	 */
	computed = min(maxproc + vm_cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof (struct vm_object) +
	    sizeof (struct vnode))));
	if (computed == desiredvnodes)
		desiredvnodes = (3 * computed) / 4;
#endif
}
2135 
/* Restore desiredvnodes to the value saved by zfs_vnodes_adjust(). */
static void
zfs_vnodes_adjust_back(void)
{
#ifdef __i386__
	desiredvnodes = desiredvnodes_backup;
#endif
}
2144 
2145 static struct sx zfs_vnlru_lock;
2146 static struct vnode *zfs_vnlru_marker;
2147 static arc_prune_t *zfs_prune;
2148 
2149 static void
zfs_prune_task(uint64_t nr_to_scan,void * arg __unused)2150 zfs_prune_task(uint64_t nr_to_scan, void *arg __unused)
2151 {
2152 	if (nr_to_scan > INT_MAX)
2153 		nr_to_scan = INT_MAX;
2154 	sx_xlock(&zfs_vnlru_lock);
2155 	vnlru_free_vfsops(nr_to_scan, &zfs_vfsops, zfs_vnlru_marker);
2156 	sx_xunlock(&zfs_vnlru_lock);
2157 }
2158 
/* Module-load initialization for the ZPL layer; order matters. */
void
zfs_init(void)
{

	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");

	/*
	 * Initialize .zfs directory structures
	 */
	zfsctl_init();

	/*
	 * Initialize znode cache, vnode ops, etc...
	 */
	zfs_znode_init();

	/*
	 * Reduce number of vnodes. Originally number of vnodes is calculated
	 * with UFS inode in mind. We reduce it here, because it's too big for
	 * ZFS/i386.
	 */
	zfs_vnodes_adjust();

	dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);

	/* Single-threaded taskq used by the zfsvfs code. */
	zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);

	/*
	 * Set up the vnlru marker and lock used by zfs_prune_task(), then
	 * register that task with the ARC so memory pressure can reclaim
	 * ZFS vnodes.
	 */
	zfs_vnlru_marker = vnlru_alloc_marker();
	sx_init(&zfs_vnlru_lock, "zfs vnlru lock");
	zfs_prune = arc_add_prune_callback(zfs_prune_task, NULL);
}
2190 
/* Module-unload teardown; undoes zfs_init() in reverse order. */
void
zfs_fini(void)
{
	/* Stop ARC-driven pruning before destroying the vnlru state. */
	arc_remove_prune_callback(zfs_prune);
	vnlru_free_marker(zfs_vnlru_marker);
	sx_destroy(&zfs_vnlru_lock);

	taskq_destroy(zfsvfs_taskq);
	zfsctl_fini();
	zfs_znode_fini();
	zfs_vnodes_adjust_back();
}
2203 
2204 int
zfs_busy(void)2205 zfs_busy(void)
2206 {
2207 	return (zfs_active_fs_count != 0);
2208 }
2209 
2210 /*
2211  * Release VOPs and unmount a suspended filesystem.
2212  */
int
zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
	/* Caller (zfs_suspend_fs) left both teardown locks write-held. */
	ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
	ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));

	/*
	 * We already own this, so just hold and rele it to update the
	 * objset_t, as the one we had before may have been evicted.
	 */
	objset_t *os;
	VERIFY3P(ds->ds_owner, ==, zfsvfs);
	VERIFY(dsl_dataset_long_held(ds));
	/* Pool config lock must be held to map the dataset to its objset. */
	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
	dsl_pool_config_enter(dp, FTAG);
	VERIFY0(dmu_objset_from_ds(ds, &os));
	dsl_pool_config_exit(dp, FTAG);
	zfsvfs->z_os = os;

	/* release the VOPs */
	ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
	ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);

	/*
	 * Try to force unmount this file system.
	 */
	(void) zfs_umount(zfsvfs->z_vfs, 0);
	zfsvfs->z_unmounted = B_TRUE;
	return (0);
}
2243 
2244 int
zfs_set_version(zfsvfs_t * zfsvfs,uint64_t newvers)2245 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2246 {
2247 	int error;
2248 	objset_t *os = zfsvfs->z_os;
2249 	dmu_tx_t *tx;
2250 
2251 	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2252 		return (SET_ERROR(EINVAL));
2253 
2254 	if (newvers < zfsvfs->z_version)
2255 		return (SET_ERROR(EINVAL));
2256 
2257 	if (zfs_spa_version_map(newvers) >
2258 	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
2259 		return (SET_ERROR(ENOTSUP));
2260 
2261 	tx = dmu_tx_create(os);
2262 	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2263 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2264 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2265 		    ZFS_SA_ATTRS);
2266 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2267 	}
2268 	error = dmu_tx_assign(tx, DMU_TX_WAIT);
2269 	if (error) {
2270 		dmu_tx_abort(tx);
2271 		return (error);
2272 	}
2273 
2274 	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2275 	    8, 1, &newvers, tx);
2276 
2277 	if (error) {
2278 		dmu_tx_commit(tx);
2279 		return (error);
2280 	}
2281 
2282 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2283 		uint64_t sa_obj;
2284 
2285 		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2286 		    SPA_VERSION_SA);
2287 		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2288 		    DMU_OT_NONE, 0, tx);
2289 
2290 		error = zap_add(os, MASTER_NODE_OBJ,
2291 		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2292 		ASSERT0(error);
2293 
2294 		VERIFY0(sa_set_sa_object(os, sa_obj));
2295 		sa_register_update_callback(os, zfs_sa_upgrade);
2296 	}
2297 
2298 	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
2299 	    "from %ju to %ju", (uintmax_t)zfsvfs->z_version,
2300 	    (uintmax_t)newvers);
2301 	dmu_tx_commit(tx);
2302 
2303 	zfsvfs->z_version = newvers;
2304 	os->os_version = newvers;
2305 
2306 	zfs_set_fuid_feature(zfsvfs);
2307 
2308 	return (0);
2309 }
2310 
/*
 * Persist a default quota property (user/group/project, byte or object
 * counts) in the master node ZAP and mirror it into the zfsvfs cache.
 * A quota of 0 clears the property.
 */
int
zfs_set_default_quota(zfsvfs_t *zfsvfs, zfs_prop_t prop, uint64_t quota)
{
	int error;
	objset_t *os = zfsvfs->z_os;
	const char *propstr = zfs_prop_to_name(prop);
	dmu_tx_t *tx;

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, propstr);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}

	if (quota == 0) {
		/* Clearing a quota that was never set is not an error. */
		error = zap_remove(os, MASTER_NODE_OBJ, propstr, tx);
		if (error == ENOENT)
			error = 0;
	} else {
		error = zap_update(os, MASTER_NODE_OBJ, propstr, 8, 1,
		    &quota, tx);
	}

	if (error)
		goto out;

	/* Update the in-core copy so readers see the new default at once. */
	switch (prop) {
	case ZFS_PROP_DEFAULTUSERQUOTA:
		zfsvfs->z_defaultuserquota = quota;
		break;
	case ZFS_PROP_DEFAULTGROUPQUOTA:
		zfsvfs->z_defaultgroupquota = quota;
		break;
	case ZFS_PROP_DEFAULTPROJECTQUOTA:
		zfsvfs->z_defaultprojectquota = quota;
		break;
	case ZFS_PROP_DEFAULTUSEROBJQUOTA:
		zfsvfs->z_defaultuserobjquota = quota;
		break;
	case ZFS_PROP_DEFAULTGROUPOBJQUOTA:
		zfsvfs->z_defaultgroupobjquota = quota;
		break;
	case ZFS_PROP_DEFAULTPROJECTOBJQUOTA:
		zfsvfs->z_defaultprojectobjquota = quota;
		break;
	default:
		break;
	}

out:
	/* The tx was assigned, so it must be committed even on error. */
	dmu_tx_commit(tx);
	return (error);
}
2366 
2367 /*
2368  * Return true if the corresponding vfs's unmounted flag is set.
2369  * Otherwise return false.
2370  * If this function returns true we know VFS unmount has been initiated.
2371  */
2372 boolean_t
zfs_get_vfs_flag_unmounted(objset_t * os)2373 zfs_get_vfs_flag_unmounted(objset_t *os)
2374 {
2375 	zfsvfs_t *zfvp;
2376 	boolean_t unmounted = B_FALSE;
2377 
2378 	ASSERT3U(dmu_objset_type(os), ==, DMU_OST_ZFS);
2379 
2380 	mutex_enter(&os->os_user_ptr_lock);
2381 	zfvp = dmu_objset_get_user(os);
2382 	if (zfvp != NULL && zfvp->z_vfs != NULL &&
2383 	    (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
2384 		unmounted = B_TRUE;
2385 	mutex_exit(&os->os_user_ptr_lock);
2386 
2387 	return (unmounted);
2388 }
2389 
2390 #ifdef _KERNEL
2391 void
zfsvfs_update_fromname(const char * oldname,const char * newname)2392 zfsvfs_update_fromname(const char *oldname, const char *newname)
2393 {
2394 	char tmpbuf[MAXPATHLEN];
2395 	struct mount *mp;
2396 	char *fromname;
2397 	size_t oldlen;
2398 
2399 	oldlen = strlen(oldname);
2400 
2401 	mtx_lock(&mountlist_mtx);
2402 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2403 		fromname = mp->mnt_stat.f_mntfromname;
2404 		if (strcmp(fromname, oldname) == 0) {
2405 			(void) strlcpy(fromname, newname,
2406 			    sizeof (mp->mnt_stat.f_mntfromname));
2407 			continue;
2408 		}
2409 		if (strncmp(fromname, oldname, oldlen) == 0 &&
2410 		    (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
2411 			(void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s",
2412 			    newname, fromname + oldlen);
2413 			(void) strlcpy(fromname, tmpbuf,
2414 			    sizeof (mp->mnt_stat.f_mntfromname));
2415 			continue;
2416 		}
2417 	}
2418 	mtx_unlock(&mountlist_mtx);
2419 }
2420 #endif
2421 
2422 /*
2423  * Find a prison with ZFS info.
2424  * Return the ZFS info and the (locked) prison.
2425  */
static struct zfs_jailparam *
zfs_jailparam_find(struct prison *spr, struct prison **prp)
{
	struct prison *pr;
	struct zfs_jailparam *zjp;

	/*
	 * Walk up the prison hierarchy until a prison with ZFS info is
	 * found; prison0 always terminates the walk with the global
	 * defaults.  Each prison is locked before inspection, and the
	 * one we return in *prp stays locked for the caller.
	 */
	for (pr = spr; ; pr = pr->pr_parent) {
		mtx_lock(&pr->pr_mtx);
		if (pr == &prison0) {
			zjp = &zfs_jailparam0;
			break;
		}
		zjp = osd_jail_get(pr, zfs_jailparam_slot);
		if (zjp != NULL)
			break;
		/* No info here; drop the lock and try the parent. */
		mtx_unlock(&pr->pr_mtx);
	}
	*prp = pr;

	return (zjp);
}
2447 
2448 /*
2449  * Ensure a prison has its own ZFS info.  If zjpp is non-null, point it to the
2450  * ZFS info and lock the prison.
2451  */
static void
zfs_jailparam_alloc(struct prison *pr, struct zfs_jailparam **zjpp)
{
	struct prison *ppr;
	struct zfs_jailparam *zjp, *nzjp;
	void **rsv;

	/* If this prison already has ZFS info, return that. */
	zjp = zfs_jailparam_find(pr, &ppr);
	if (ppr == pr)
		goto done;

	/*
	 * Allocate a new info record.  Then check again, in case something
	 * changed during the allocation.
	 */
	mtx_unlock(&ppr->pr_mtx);
	nzjp = malloc(sizeof (struct zfs_jailparam), M_PRISON, M_WAITOK);
	rsv = osd_reserve(zfs_jailparam_slot);
	zjp = zfs_jailparam_find(pr, &ppr);
	if (ppr == pr) {
		/* Lost the race: pr gained its own info; discard ours. */
		free(nzjp, M_PRISON);
		osd_free_reserved(rsv);
		goto done;
	}
	/* Inherit the initial values from the ancestor. */
	mtx_lock(&pr->pr_mtx);
	(void) osd_jail_set_reserved(pr, zfs_jailparam_slot, rsv, nzjp);
	(void) memcpy(nzjp, zjp, sizeof (*zjp));
	zjp = nzjp;
	mtx_unlock(&ppr->pr_mtx);
done:
	/* Per the contract above: return locked iff zjpp is non-null. */
	if (zjpp != NULL)
		*zjpp = zjp;
	else
		mtx_unlock(&pr->pr_mtx);
}
2489 
2490 /*
2491  * Jail OSD methods for ZFS VFS info.
2492  */
2493 static int
zfs_jailparam_create(void * obj,void * data)2494 zfs_jailparam_create(void *obj, void *data)
2495 {
2496 	struct prison *pr = obj;
2497 	struct vfsoptlist *opts = data;
2498 	int jsys;
2499 
2500 	if (vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)) == 0 &&
2501 	    jsys == JAIL_SYS_INHERIT)
2502 		return (0);
2503 	/*
2504 	 * Inherit a prison's initial values from its parent
2505 	 * (different from JAIL_SYS_INHERIT which also inherits changes).
2506 	 */
2507 	zfs_jailparam_alloc(pr, NULL);
2508 	return (0);
2509 }
2510 
static int
zfs_jailparam_get(void *obj, void *data)
{
	struct prison *ppr, *pr = obj;
	struct vfsoptlist *opts = data;
	struct zfs_jailparam *zjp;
	int jsys, error;

	/* Find the effective ZFS info; ppr comes back locked. */
	zjp = zfs_jailparam_find(pr, &ppr);
	/* The prison owns its info iff the find stopped at pr itself. */
	jsys = (ppr == pr) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT;
	/* ENOENT from vfs_setopt means the option was not requested. */
	error = vfs_setopt(opts, "zfs", &jsys, sizeof (jsys));
	if (error != 0 && error != ENOENT)
		goto done;
	if (jsys == JAIL_SYS_NEW) {
		error = vfs_setopt(opts, "zfs.mount_snapshot",
		    &zjp->mount_snapshot, sizeof (zjp->mount_snapshot));
		if (error != 0 && error != ENOENT)
			goto done;
	} else {
		/*
		 * If this prison is inheriting its ZFS info, report
		 * empty/zero parameters.
		 */
		static int mount_snapshot = 0;

		error = vfs_setopt(opts, "zfs.mount_snapshot",
		    &mount_snapshot, sizeof (mount_snapshot));
		if (error != 0 && error != ENOENT)
			goto done;
	}
	error = 0;
done:
	mtx_unlock(&ppr->pr_mtx);
	return (error);
}
2546 
2547 static int
zfs_jailparam_set(void * obj,void * data)2548 zfs_jailparam_set(void *obj, void *data)
2549 {
2550 	struct prison *pr = obj;
2551 	struct prison *ppr;
2552 	struct vfsoptlist *opts = data;
2553 	int error, jsys, mount_snapshot;
2554 
2555 	/* Set the parameters, which should be correct. */
2556 	error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
2557 	if (error == ENOENT)
2558 		jsys = -1;
2559 	error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
2560 	    sizeof (mount_snapshot));
2561 	if (error == ENOENT)
2562 		mount_snapshot = -1;
2563 	else
2564 		jsys = JAIL_SYS_NEW;
2565 	switch (jsys) {
2566 	case JAIL_SYS_NEW:
2567 	{
2568 		/* "zfs=new" or "zfs.*": the prison gets its own ZFS info. */
2569 		struct zfs_jailparam *zjp;
2570 
2571 		/*
2572 		 * A child jail cannot have more permissions than its parent
2573 		 */
2574 		if (pr->pr_parent != &prison0) {
2575 			zjp = zfs_jailparam_find(pr->pr_parent, &ppr);
2576 			mtx_unlock(&ppr->pr_mtx);
2577 			if (zjp->mount_snapshot < mount_snapshot) {
2578 				return (EPERM);
2579 			}
2580 		}
2581 		zfs_jailparam_alloc(pr, &zjp);
2582 		if (mount_snapshot != -1)
2583 			zjp->mount_snapshot = mount_snapshot;
2584 		mtx_unlock(&pr->pr_mtx);
2585 		break;
2586 	}
2587 	case JAIL_SYS_INHERIT:
2588 		/* "zfs=inherit": inherit the parent's ZFS info. */
2589 		mtx_lock(&pr->pr_mtx);
2590 		osd_jail_del(pr, zfs_jailparam_slot);
2591 		mtx_unlock(&pr->pr_mtx);
2592 		break;
2593 	case -1:
2594 		/*
2595 		 * If the setting being changed is not ZFS related
2596 		 * then do nothing.
2597 		 */
2598 		break;
2599 	}
2600 
2601 	return (0);
2602 }
2603 
2604 static int
zfs_jailparam_check(void * obj __unused,void * data)2605 zfs_jailparam_check(void *obj __unused, void *data)
2606 {
2607 	struct vfsoptlist *opts = data;
2608 	int error, jsys, mount_snapshot;
2609 
2610 	/* Check that the parameters are correct. */
2611 	error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
2612 	if (error != ENOENT) {
2613 		if (error != 0)
2614 			return (error);
2615 		if (jsys != JAIL_SYS_NEW && jsys != JAIL_SYS_INHERIT)
2616 			return (EINVAL);
2617 	}
2618 	error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
2619 	    sizeof (mount_snapshot));
2620 	if (error != ENOENT) {
2621 		if (error != 0)
2622 			return (error);
2623 		if (mount_snapshot != 0 && mount_snapshot != 1)
2624 			return (EINVAL);
2625 	}
2626 	return (0);
2627 }
2628 
2629 static void
zfs_jailparam_destroy(void * data)2630 zfs_jailparam_destroy(void *data)
2631 {
2632 
2633 	free(data, M_PRISON);
2634 }
2635 
/*
 * Register the jail OSD slot and methods for ZFS parameters, and seed
 * every already-existing prison with its own defaults.
 */
static void
zfs_jailparam_sysinit(void *arg __unused)
{
	struct prison *pr;
	osd_method_t  methods[PR_MAXMETHOD] = {
		[PR_METHOD_CREATE] = zfs_jailparam_create,
		[PR_METHOD_GET] = zfs_jailparam_get,
		[PR_METHOD_SET] = zfs_jailparam_set,
		[PR_METHOD_CHECK] = zfs_jailparam_check,
	};

	zfs_jailparam_slot = osd_jail_register(zfs_jailparam_destroy, methods);
	/* Copy the defaults to any existing prisons. */
	sx_slock(&allprison_lock);
	TAILQ_FOREACH(pr, &allprison, pr_list)
		zfs_jailparam_alloc(pr, NULL);
	sx_sunlock(&allprison_lock);
}
2654 
2655 static void
zfs_jailparam_sysuninit(void * arg __unused)2656 zfs_jailparam_sysuninit(void *arg __unused)
2657 {
2658 
2659 	osd_jail_deregister(zfs_jailparam_slot);
2660 }
2661 
2662 SYSINIT(zfs_jailparam_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY,
2663 	zfs_jailparam_sysinit, NULL);
2664 SYSUNINIT(zfs_jailparam_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY,
2665 	zfs_jailparam_sysuninit, NULL);
2666