1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
25 * All rights reserved.
26 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
27 * Copyright (c) 2014 Integros [integros.com]
28 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
29 */
30
31 /* Portions Copyright 2010 Robert Milkowski */
32
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/sysmacros.h>
38 #include <sys/kmem.h>
39 #include <sys/acl.h>
40 #include <sys/vnode.h>
41 #include <sys/vfs.h>
42 #include <sys/mntent.h>
43 #include <sys/mount.h>
44 #include <sys/cmn_err.h>
45 #include <sys/zfs_znode.h>
46 #include <sys/zfs_vnops.h>
47 #include <sys/zfs_dir.h>
48 #include <sys/zil.h>
49 #include <sys/fs/zfs.h>
50 #include <sys/dmu.h>
51 #include <sys/dsl_prop.h>
52 #include <sys/dsl_dataset.h>
53 #include <sys/dsl_deleg.h>
54 #include <sys/spa.h>
55 #include <sys/zap.h>
56 #include <sys/sa.h>
57 #include <sys/sa_impl.h>
58 #include <sys/policy.h>
59 #include <sys/atomic.h>
60 #include <sys/zfs_ioctl.h>
61 #include <sys/zfs_ctldir.h>
62 #include <sys/zfs_fuid.h>
63 #include <sys/sunddi.h>
64 #include <sys/dmu_objset.h>
65 #include <sys/dsl_dir.h>
66 #include <sys/jail.h>
67 #include <sys/osd.h>
68 #include <ufs/ufs/quota.h>
69 #include <sys/zfs_quota.h>
70
71 #include "zfs_comutil.h"
72
/*
 * Fallback values for mount kernel flags, used only when the headers
 * included above do not already define them.
 */
#if !defined(MNTK_VMSETSIZE_BUG)
#define	MNTK_VMSETSIZE_BUG	0
#endif

#if !defined(MNTK_NOMSYNC)
#define	MNTK_NOMSYNC	8
#endif
79
80 struct mtx zfs_debug_mtx;
81 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
82
83 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
84
85 int zfs_super_owner;
86 SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
87 "File system owners can perform privileged operation on file systems");
88
89 int zfs_debug_level;
90 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
91 "Debug level");
92
93 struct zfs_jailparam {
94 int mount_snapshot;
95 };
96
97 static struct zfs_jailparam zfs_jailparam0 = {
98 .mount_snapshot = 0,
99 };
100
101 static int zfs_jailparam_slot;
102
103 SYSCTL_JAIL_PARAM_SYS_NODE(zfs, CTLFLAG_RW, "Jail ZFS parameters");
104 SYSCTL_JAIL_PARAM(_zfs, mount_snapshot, CTLTYPE_INT | CTLFLAG_RW, "I",
105 "Allow mounting snapshots in the .zfs directory for unjailed datasets");
106
107 SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
108 static int zfs_version_acl = ZFS_ACL_VERSION;
109 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
110 "ZFS_ACL_VERSION");
111 static int zfs_version_spa = SPA_VERSION;
112 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
113 "SPA_VERSION");
114 static int zfs_version_zpl = ZPL_VERSION;
115 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
116 "ZPL_VERSION");
117
118 #if __FreeBSD_version >= 1400018
119 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg,
120 bool *mp_busy);
121 #else
122 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
123 #endif
124 static int zfs_mount(vfs_t *vfsp);
125 static int zfs_umount(vfs_t *vfsp, int fflag);
126 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
127 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
128 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
129 static int zfs_sync(vfs_t *vfsp, int waitfor);
130 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
131 struct ucred **credanonp, int *numsecflavors, int *secflavors);
132 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
133 static void zfs_freevfs(vfs_t *vfsp);
134
135 struct vfsops zfs_vfsops = {
136 .vfs_mount = zfs_mount,
137 .vfs_unmount = zfs_umount,
138 .vfs_root = vfs_cache_root,
139 .vfs_cachedroot = zfs_root,
140 .vfs_statfs = zfs_statfs,
141 .vfs_vget = zfs_vget,
142 .vfs_sync = zfs_sync,
143 .vfs_checkexp = zfs_checkexp,
144 .vfs_fhtovp = zfs_fhtovp,
145 .vfs_quotactl = zfs_quotactl,
146 };
147
148 VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL
149 #ifdef VFCF_CROSS_COPY_FILE_RANGE
150 | VFCF_CROSS_COPY_FILE_RANGE
151 #endif
152 #ifdef VFCF_FILEREVINC
153 | VFCF_FILEREVINC
154 #endif
155 );
156
157 /*
158 * We need to keep a count of active fs's.
159 * This is necessary to prevent our module
160 * from being unloaded after a umount -f
161 */
162 static uint32_t zfs_active_fs_count = 0;
163
164 int
zfs_get_temporary_prop(dsl_dataset_t * ds,zfs_prop_t zfs_prop,uint64_t * val,char * setpoint)165 zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
166 char *setpoint)
167 {
168 int error;
169 zfsvfs_t *zfvp;
170 vfs_t *vfsp;
171 objset_t *os;
172 uint64_t tmp = *val;
173
174 error = dmu_objset_from_ds(ds, &os);
175 if (error != 0)
176 return (error);
177
178 error = getzfsvfs_impl(os, &zfvp);
179 if (error != 0)
180 return (error);
181 if (zfvp == NULL)
182 return (ENOENT);
183 vfsp = zfvp->z_vfs;
184 switch (zfs_prop) {
185 case ZFS_PROP_ATIME:
186 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
187 tmp = 0;
188 if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
189 tmp = 1;
190 break;
191 case ZFS_PROP_DEVICES:
192 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
193 tmp = 0;
194 if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
195 tmp = 1;
196 break;
197 case ZFS_PROP_EXEC:
198 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
199 tmp = 0;
200 if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
201 tmp = 1;
202 break;
203 case ZFS_PROP_SETUID:
204 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
205 tmp = 0;
206 if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
207 tmp = 1;
208 break;
209 case ZFS_PROP_READONLY:
210 if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
211 tmp = 0;
212 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
213 tmp = 1;
214 break;
215 case ZFS_PROP_XATTR:
216 if (zfvp->z_flags & ZSB_XATTR)
217 tmp = zfvp->z_xattr;
218 break;
219 case ZFS_PROP_NBMAND:
220 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
221 tmp = 0;
222 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
223 tmp = 1;
224 break;
225 default:
226 vfs_unbusy(vfsp);
227 return (ENOENT);
228 }
229
230 vfs_unbusy(vfsp);
231 if (tmp != *val) {
232 if (setpoint)
233 (void) strcpy(setpoint, "temporary");
234 *val = tmp;
235 }
236 return (0);
237 }
238
239 static int
zfs_getquota(zfsvfs_t * zfsvfs,uid_t id,int isgroup,struct dqblk64 * dqp)240 zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
241 {
242 int error = 0;
243 char buf[32];
244 uint64_t usedobj, quotaobj, defaultquota;
245 uint64_t quota, used = 0;
246 timespec_t now;
247
248 usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
249 quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
250 defaultquota = isgroup ? zfsvfs->z_defaultgroupquota :
251 zfsvfs->z_defaultuserquota;
252
253 if (zfsvfs->z_replay)
254 return (ENOENT);
255
256 (void) sprintf(buf, "%llx", (longlong_t)id);
257 if (quotaobj == 0) {
258 if (defaultquota == 0)
259 return (ENOENT);
260 quota = defaultquota;
261 } else {
262 error = zap_lookup(zfsvfs->z_os, quotaobj, buf, sizeof (quota),
263 1, "a);
264 if (error && (quota = defaultquota) == 0)
265 return (error);
266 }
267
268 /*
269 * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit".
270 * So we set them to be the same.
271 */
272 dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
273 error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
274 if (error == ENOENT)
275 error = 0;
276 if (error)
277 return (error);
278 dqp->dqb_curblocks = btodb(used);
279 dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
280 vfs_timestamp(&now);
281 /*
282 * Setting this to 0 causes FreeBSD quota(8) to print
283 * the number of days since the epoch, which isn't
284 * particularly useful.
285 */
286 dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
287 return (error);
288 }
289
290 static int
291 #if __FreeBSD_version >= 1400018
zfs_quotactl(vfs_t * vfsp,int cmds,uid_t id,void * arg,bool * mp_busy)292 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy)
293 #else
294 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
295 #endif
296 {
297 zfsvfs_t *zfsvfs = vfsp->vfs_data;
298 struct thread *td;
299 int cmd, type, error = 0;
300 int bitsize;
301 zfs_userquota_prop_t quota_type;
302 struct dqblk64 dqblk = { 0 };
303
304 td = curthread;
305 cmd = cmds >> SUBCMDSHIFT;
306 type = cmds & SUBCMDMASK;
307
308 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
309 return (error);
310 if (id == -1) {
311 switch (type) {
312 case USRQUOTA:
313 id = td->td_ucred->cr_ruid;
314 break;
315 case GRPQUOTA:
316 id = td->td_ucred->cr_rgid;
317 break;
318 default:
319 error = EINVAL;
320 #if __FreeBSD_version < 1400018
321 if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
322 vfs_unbusy(vfsp);
323 #endif
324 goto done;
325 }
326 }
327 /*
328 * Map BSD type to:
329 * ZFS_PROP_USERUSED,
330 * ZFS_PROP_USERQUOTA,
331 * ZFS_PROP_GROUPUSED,
332 * ZFS_PROP_GROUPQUOTA
333 */
334 switch (cmd) {
335 case Q_SETQUOTA:
336 case Q_SETQUOTA32:
337 if (type == USRQUOTA)
338 quota_type = ZFS_PROP_USERQUOTA;
339 else if (type == GRPQUOTA)
340 quota_type = ZFS_PROP_GROUPQUOTA;
341 else
342 error = EINVAL;
343 break;
344 case Q_GETQUOTA:
345 case Q_GETQUOTA32:
346 if (type == USRQUOTA)
347 quota_type = ZFS_PROP_USERUSED;
348 else if (type == GRPQUOTA)
349 quota_type = ZFS_PROP_GROUPUSED;
350 else
351 error = EINVAL;
352 break;
353 }
354
355 /*
356 * Depending on the cmd, we may need to get
357 * the ruid and domain (see fuidstr_to_sid?),
358 * the fuid (how?), or other information.
359 * Create fuid using zfs_fuid_create(zfsvfs, id,
360 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
361 * I think I can use just the id?
362 *
363 * Look at zfs_id_overquota() to look up a quota.
364 * zap_lookup(something, quotaobj, fuidstring,
365 * sizeof (long long), 1, "a)
366 *
367 * See zfs_set_userquota() to set a quota.
368 */
369 if ((uint32_t)type >= MAXQUOTAS) {
370 error = EINVAL;
371 goto done;
372 }
373
374 switch (cmd) {
375 case Q_GETQUOTASIZE:
376 bitsize = 64;
377 error = copyout(&bitsize, arg, sizeof (int));
378 break;
379 case Q_QUOTAON:
380 // As far as I can tell, you can't turn quotas on or off on zfs
381 error = 0;
382 #if __FreeBSD_version < 1400018
383 vfs_unbusy(vfsp);
384 #endif
385 break;
386 case Q_QUOTAOFF:
387 error = ENOTSUP;
388 #if __FreeBSD_version < 1400018
389 vfs_unbusy(vfsp);
390 #endif
391 break;
392 case Q_SETQUOTA:
393 error = copyin(arg, &dqblk, sizeof (dqblk));
394 if (error == 0)
395 error = zfs_set_userquota(zfsvfs, quota_type,
396 "", id, dbtob(dqblk.dqb_bhardlimit));
397 break;
398 case Q_GETQUOTA:
399 error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
400 if (error == 0)
401 error = copyout(&dqblk, arg, sizeof (dqblk));
402 break;
403 default:
404 error = EINVAL;
405 break;
406 }
407 done:
408 zfs_exit(zfsvfs, FTAG);
409 return (error);
410 }
411
412
413 boolean_t
zfs_is_readonly(zfsvfs_t * zfsvfs)414 zfs_is_readonly(zfsvfs_t *zfsvfs)
415 {
416 return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY));
417 }
418
419 static int
zfs_sync(vfs_t * vfsp,int waitfor)420 zfs_sync(vfs_t *vfsp, int waitfor)
421 {
422
423 /*
424 * Data integrity is job one. We don't want a compromised kernel
425 * writing to the storage pool, so we never sync during panic.
426 */
427 if (panicstr)
428 return (0);
429
430 /*
431 * Ignore the system syncher. ZFS already commits async data
432 * at zfs_txg_timeout intervals.
433 */
434 if (waitfor == MNT_LAZY)
435 return (0);
436
437 if (vfsp != NULL) {
438 /*
439 * Sync a specific filesystem.
440 */
441 zfsvfs_t *zfsvfs = vfsp->vfs_data;
442 dsl_pool_t *dp;
443 int error;
444
445 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
446 return (error);
447 dp = dmu_objset_pool(zfsvfs->z_os);
448
449 /*
450 * If the system is shutting down, then skip any
451 * filesystems which may exist on a suspended pool.
452 */
453 if (rebooting && spa_suspended(dp->dp_spa)) {
454 zfs_exit(zfsvfs, FTAG);
455 return (0);
456 }
457
458 if (zfsvfs->z_log != NULL) {
459 error = zil_commit(zfsvfs->z_log, 0);
460 if (error != 0) {
461 zfs_exit(zfsvfs, FTAG);
462 return (error);
463 }
464 }
465
466 zfs_exit(zfsvfs, FTAG);
467 } else {
468 /*
469 * Sync all ZFS filesystems. This is what happens when you
470 * run sync(8). Unlike other filesystems, ZFS honors the
471 * request by waiting for all pools to commit all dirty data.
472 */
473 spa_sync_allpools();
474 }
475
476 return (0);
477 }
478
479 static void
atime_changed_cb(void * arg,uint64_t newval)480 atime_changed_cb(void *arg, uint64_t newval)
481 {
482 zfsvfs_t *zfsvfs = arg;
483
484 if (newval == TRUE) {
485 zfsvfs->z_atime = TRUE;
486 zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
487 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
488 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
489 } else {
490 zfsvfs->z_atime = FALSE;
491 zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
492 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
493 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
494 }
495 }
496
497 static void
relatime_changed_cb(void * arg,uint64_t newval)498 relatime_changed_cb(void *arg, uint64_t newval)
499 {
500 ((zfsvfs_t *)arg)->z_relatime = (newval != 0);
501 }
502
503 static void
xattr_changed_cb(void * arg,uint64_t newval)504 xattr_changed_cb(void *arg, uint64_t newval)
505 {
506 zfsvfs_t *zfsvfs = arg;
507
508 if (newval == ZFS_XATTR_OFF) {
509 zfsvfs->z_flags &= ~ZSB_XATTR;
510 } else {
511 zfsvfs->z_flags |= ZSB_XATTR;
512
513 if (newval == ZFS_XATTR_SA)
514 zfsvfs->z_xattr_sa = B_TRUE;
515 else
516 zfsvfs->z_xattr_sa = B_FALSE;
517 }
518 }
519
520 static void
blksz_changed_cb(void * arg,uint64_t newval)521 blksz_changed_cb(void *arg, uint64_t newval)
522 {
523 zfsvfs_t *zfsvfs = arg;
524 ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
525 ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
526 ASSERT(ISP2(newval));
527
528 zfsvfs->z_max_blksz = newval;
529 zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
530 }
531
532 static void
readonly_changed_cb(void * arg,uint64_t newval)533 readonly_changed_cb(void *arg, uint64_t newval)
534 {
535 zfsvfs_t *zfsvfs = arg;
536
537 if (newval) {
538 /* XXX locking on vfs_flag? */
539 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
540 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
541 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
542 } else {
543 /* XXX locking on vfs_flag? */
544 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
545 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
546 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
547 }
548 }
549
550 static void
setuid_changed_cb(void * arg,uint64_t newval)551 setuid_changed_cb(void *arg, uint64_t newval)
552 {
553 zfsvfs_t *zfsvfs = arg;
554
555 if (newval == FALSE) {
556 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
557 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
558 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
559 } else {
560 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
561 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
562 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
563 }
564 }
565
566 static void
exec_changed_cb(void * arg,uint64_t newval)567 exec_changed_cb(void *arg, uint64_t newval)
568 {
569 zfsvfs_t *zfsvfs = arg;
570
571 if (newval == FALSE) {
572 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
573 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
574 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
575 } else {
576 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
577 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
578 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
579 }
580 }
581
582 /*
583 * The nbmand mount option can be changed at mount time.
584 * We can't allow it to be toggled on live file systems or incorrect
585 * behavior may be seen from cifs clients
586 *
587 * This property isn't registered via dsl_prop_register(), but this callback
588 * will be called when a file system is first mounted
589 */
590 static void
nbmand_changed_cb(void * arg,uint64_t newval)591 nbmand_changed_cb(void *arg, uint64_t newval)
592 {
593 zfsvfs_t *zfsvfs = arg;
594 if (newval == FALSE) {
595 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
596 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
597 } else {
598 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
599 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
600 }
601 }
602
603 static void
snapdir_changed_cb(void * arg,uint64_t newval)604 snapdir_changed_cb(void *arg, uint64_t newval)
605 {
606 zfsvfs_t *zfsvfs = arg;
607
608 zfsvfs->z_show_ctldir = newval;
609 }
610
611 static void
acl_mode_changed_cb(void * arg,uint64_t newval)612 acl_mode_changed_cb(void *arg, uint64_t newval)
613 {
614 zfsvfs_t *zfsvfs = arg;
615
616 zfsvfs->z_acl_mode = newval;
617 }
618
619 static void
acl_inherit_changed_cb(void * arg,uint64_t newval)620 acl_inherit_changed_cb(void *arg, uint64_t newval)
621 {
622 zfsvfs_t *zfsvfs = arg;
623
624 zfsvfs->z_acl_inherit = newval;
625 }
626
627 static void
acl_type_changed_cb(void * arg,uint64_t newval)628 acl_type_changed_cb(void *arg, uint64_t newval)
629 {
630 zfsvfs_t *zfsvfs = arg;
631
632 zfsvfs->z_acl_type = newval;
633 }
634
635 static void
longname_changed_cb(void * arg,uint64_t newval)636 longname_changed_cb(void *arg, uint64_t newval)
637 {
638 zfsvfs_t *zfsvfs = arg;
639
640 zfsvfs->z_longname = newval;
641 }
642
643 static int
zfs_register_callbacks(vfs_t * vfsp)644 zfs_register_callbacks(vfs_t *vfsp)
645 {
646 struct dsl_dataset *ds = NULL;
647 objset_t *os = NULL;
648 zfsvfs_t *zfsvfs = NULL;
649 uint64_t nbmand;
650 boolean_t readonly = B_FALSE;
651 boolean_t do_readonly = B_FALSE;
652 boolean_t setuid = B_FALSE;
653 boolean_t do_setuid = B_FALSE;
654 boolean_t exec = B_FALSE;
655 boolean_t do_exec = B_FALSE;
656 boolean_t xattr = B_FALSE;
657 boolean_t atime = B_FALSE;
658 boolean_t do_atime = B_FALSE;
659 boolean_t do_xattr = B_FALSE;
660 int error = 0;
661
662 ASSERT3P(vfsp, !=, NULL);
663 zfsvfs = vfsp->vfs_data;
664 ASSERT3P(zfsvfs, !=, NULL);
665 os = zfsvfs->z_os;
666
667 /*
668 * This function can be called for a snapshot when we update snapshot's
669 * mount point, which isn't really supported.
670 */
671 if (dmu_objset_is_snapshot(os))
672 return (EOPNOTSUPP);
673
674 /*
675 * The act of registering our callbacks will destroy any mount
676 * options we may have. In order to enable temporary overrides
677 * of mount options, we stash away the current values and
678 * restore them after we register the callbacks.
679 */
680 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
681 !spa_writeable(dmu_objset_spa(os))) {
682 readonly = B_TRUE;
683 do_readonly = B_TRUE;
684 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
685 readonly = B_FALSE;
686 do_readonly = B_TRUE;
687 }
688 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
689 setuid = B_FALSE;
690 do_setuid = B_TRUE;
691 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
692 setuid = B_TRUE;
693 do_setuid = B_TRUE;
694 }
695 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
696 exec = B_FALSE;
697 do_exec = B_TRUE;
698 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
699 exec = B_TRUE;
700 do_exec = B_TRUE;
701 }
702 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
703 zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF;
704 do_xattr = B_TRUE;
705 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
706 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
707 do_xattr = B_TRUE;
708 } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) {
709 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
710 do_xattr = B_TRUE;
711 } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) {
712 zfsvfs->z_xattr = xattr = ZFS_XATTR_SA;
713 do_xattr = B_TRUE;
714 }
715 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
716 atime = B_FALSE;
717 do_atime = B_TRUE;
718 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
719 atime = B_TRUE;
720 do_atime = B_TRUE;
721 }
722
723 /*
724 * We need to enter pool configuration here, so that we can use
725 * dsl_prop_get_int_ds() to handle the special nbmand property below.
726 * dsl_prop_get_integer() can not be used, because it has to acquire
727 * spa_namespace_lock and we can not do that because we already hold
728 * z_teardown_lock. The problem is that spa_write_cachefile() is called
729 * with spa_namespace_lock held and the function calls ZFS vnode
730 * operations to write the cache file and thus z_teardown_lock is
731 * acquired after spa_namespace_lock.
732 */
733 ds = dmu_objset_ds(os);
734 dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
735
736 /*
737 * nbmand is a special property. It can only be changed at
738 * mount time.
739 *
740 * This is weird, but it is documented to only be changeable
741 * at mount time.
742 */
743 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
744 nbmand = B_FALSE;
745 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
746 nbmand = B_TRUE;
747 } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand)) != 0) {
748 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
749 return (error);
750 }
751
752 /*
753 * Register property callbacks.
754 *
755 * It would probably be fine to just check for i/o error from
756 * the first prop_register(), but I guess I like to go
757 * overboard...
758 */
759 error = dsl_prop_register(ds,
760 zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
761 error = error ? error : dsl_prop_register(ds,
762 zfs_prop_to_name(ZFS_PROP_RELATIME), relatime_changed_cb, zfsvfs);
763 error = error ? error : dsl_prop_register(ds,
764 zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
765 error = error ? error : dsl_prop_register(ds,
766 zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
767 error = error ? error : dsl_prop_register(ds,
768 zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
769 error = error ? error : dsl_prop_register(ds,
770 zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
771 error = error ? error : dsl_prop_register(ds,
772 zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
773 error = error ? error : dsl_prop_register(ds,
774 zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
775 error = error ? error : dsl_prop_register(ds,
776 zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs);
777 error = error ? error : dsl_prop_register(ds,
778 zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
779 error = error ? error : dsl_prop_register(ds,
780 zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
781 zfsvfs);
782 error = error ? error : dsl_prop_register(ds,
783 zfs_prop_to_name(ZFS_PROP_LONGNAME), longname_changed_cb, zfsvfs);
784 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
785 if (error)
786 goto unregister;
787
788 /*
789 * Invoke our callbacks to restore temporary mount options.
790 */
791 if (do_readonly)
792 readonly_changed_cb(zfsvfs, readonly);
793 if (do_setuid)
794 setuid_changed_cb(zfsvfs, setuid);
795 if (do_exec)
796 exec_changed_cb(zfsvfs, exec);
797 if (do_xattr)
798 xattr_changed_cb(zfsvfs, xattr);
799 if (do_atime)
800 atime_changed_cb(zfsvfs, atime);
801
802 nbmand_changed_cb(zfsvfs, nbmand);
803
804 return (0);
805
806 unregister:
807 dsl_prop_unregister_all(ds, zfsvfs);
808 return (error);
809 }
810
811 /*
812 * Associate this zfsvfs with the given objset, which must be owned.
813 * This will cache a bunch of on-disk state from the objset in the
814 * zfsvfs.
815 */
816 static int
zfsvfs_init(zfsvfs_t * zfsvfs,objset_t * os)817 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
818 {
819 int error;
820 uint64_t val;
821
822 zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
823 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
824 zfsvfs->z_os = os;
825
826 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
827 if (error != 0)
828 return (error);
829 if (zfsvfs->z_version >
830 zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
831 (void) printf("Can't mount a version %lld file system "
832 "on a version %lld pool\n. Pool must be upgraded to mount "
833 "this file system.", (u_longlong_t)zfsvfs->z_version,
834 (u_longlong_t)spa_version(dmu_objset_spa(os)));
835 return (SET_ERROR(ENOTSUP));
836 }
837 error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
838 if (error != 0)
839 return (error);
840 zfsvfs->z_norm = (int)val;
841
842 error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
843 if (error != 0)
844 return (error);
845 zfsvfs->z_utf8 = (val != 0);
846
847 error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
848 if (error != 0)
849 return (error);
850 zfsvfs->z_case = (uint_t)val;
851
852 error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val);
853 if (error != 0)
854 return (error);
855 zfsvfs->z_acl_type = (uint_t)val;
856
857 /*
858 * Fold case on file systems that are always or sometimes case
859 * insensitive.
860 */
861 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
862 zfsvfs->z_case == ZFS_CASE_MIXED)
863 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
864
865 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
866 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
867
868 uint64_t sa_obj = 0;
869 if (zfsvfs->z_use_sa) {
870 /* should either have both of these objects or none */
871 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
872 &sa_obj);
873 if (error != 0)
874 return (error);
875
876 error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val);
877 if (error == 0 && val == ZFS_XATTR_SA)
878 zfsvfs->z_xattr_sa = B_TRUE;
879 }
880
881 error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTUSERQUOTA,
882 &zfsvfs->z_defaultuserquota);
883 if (error != 0)
884 return (error);
885
886 error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTGROUPQUOTA,
887 &zfsvfs->z_defaultgroupquota);
888 if (error != 0)
889 return (error);
890
891 error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTPROJECTQUOTA,
892 &zfsvfs->z_defaultprojectquota);
893 if (error != 0)
894 return (error);
895
896 error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTUSEROBJQUOTA,
897 &zfsvfs->z_defaultuserobjquota);
898 if (error != 0)
899 return (error);
900
901 error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTGROUPOBJQUOTA,
902 &zfsvfs->z_defaultgroupobjquota);
903 if (error != 0)
904 return (error);
905
906 error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTPROJECTOBJQUOTA,
907 &zfsvfs->z_defaultprojectobjquota);
908 if (error != 0)
909 return (error);
910
911 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
912 &zfsvfs->z_attr_table);
913 if (error != 0)
914 return (error);
915
916 if (zfsvfs->z_version >= ZPL_VERSION_SA)
917 sa_register_update_callback(os, zfs_sa_upgrade);
918
919 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
920 &zfsvfs->z_root);
921 if (error != 0)
922 return (error);
923 ASSERT3U(zfsvfs->z_root, !=, 0);
924
925 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
926 &zfsvfs->z_unlinkedobj);
927 if (error != 0)
928 return (error);
929
930 error = zap_lookup(os, MASTER_NODE_OBJ,
931 zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
932 8, 1, &zfsvfs->z_userquota_obj);
933 if (error == ENOENT)
934 zfsvfs->z_userquota_obj = 0;
935 else if (error != 0)
936 return (error);
937
938 error = zap_lookup(os, MASTER_NODE_OBJ,
939 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
940 8, 1, &zfsvfs->z_groupquota_obj);
941 if (error == ENOENT)
942 zfsvfs->z_groupquota_obj = 0;
943 else if (error != 0)
944 return (error);
945
946 error = zap_lookup(os, MASTER_NODE_OBJ,
947 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
948 8, 1, &zfsvfs->z_projectquota_obj);
949 if (error == ENOENT)
950 zfsvfs->z_projectquota_obj = 0;
951 else if (error != 0)
952 return (error);
953
954 error = zap_lookup(os, MASTER_NODE_OBJ,
955 zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
956 8, 1, &zfsvfs->z_userobjquota_obj);
957 if (error == ENOENT)
958 zfsvfs->z_userobjquota_obj = 0;
959 else if (error != 0)
960 return (error);
961
962 error = zap_lookup(os, MASTER_NODE_OBJ,
963 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
964 8, 1, &zfsvfs->z_groupobjquota_obj);
965 if (error == ENOENT)
966 zfsvfs->z_groupobjquota_obj = 0;
967 else if (error != 0)
968 return (error);
969
970 error = zap_lookup(os, MASTER_NODE_OBJ,
971 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
972 8, 1, &zfsvfs->z_projectobjquota_obj);
973 if (error == ENOENT)
974 zfsvfs->z_projectobjquota_obj = 0;
975 else if (error != 0)
976 return (error);
977
978 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
979 &zfsvfs->z_fuid_obj);
980 if (error == ENOENT)
981 zfsvfs->z_fuid_obj = 0;
982 else if (error != 0)
983 return (error);
984
985 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
986 &zfsvfs->z_shares_dir);
987 if (error == ENOENT)
988 zfsvfs->z_shares_dir = 0;
989 else if (error != 0)
990 return (error);
991
992 /*
993 * Only use the name cache if we are looking for a
994 * name on a file system that does not require normalization
995 * or case folding. We can also look there if we happen to be
996 * on a non-normalizing, mixed sensitivity file system IF we
997 * are looking for the exact name (which is always the case on
998 * FreeBSD).
999 */
1000 zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
1001 ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
1002 !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
1003
1004 return (0);
1005 }
1006
1007 taskq_t *zfsvfs_taskq;
1008
1009 static void
zfsvfs_task_unlinked_drain(void * context,int pending __unused)1010 zfsvfs_task_unlinked_drain(void *context, int pending __unused)
1011 {
1012
1013 zfs_unlinked_drain((zfsvfs_t *)context);
1014 }
1015
1016 int
zfsvfs_create(const char * osname,boolean_t readonly,zfsvfs_t ** zfvp)1017 zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
1018 {
1019 objset_t *os;
1020 zfsvfs_t *zfsvfs;
1021 int error;
1022 boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
1023
1024 /*
1025 * XXX: Fix struct statfs so this isn't necessary!
1026 *
1027 * The 'osname' is used as the filesystem's special node, which means
1028 * it must fit in statfs.f_mntfromname, or else it can't be
1029 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
1030 * 'zfs unmount' to think it's not mounted when it is.
1031 */
1032 if (strlen(osname) >= MNAMELEN)
1033 return (SET_ERROR(ENAMETOOLONG));
1034
1035 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
1036
1037 error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs,
1038 &os);
1039 if (error != 0) {
1040 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1041 return (error);
1042 }
1043
1044 error = zfsvfs_create_impl(zfvp, zfsvfs, os);
1045
1046 return (error);
1047 }
1048
1049
1050 int
zfsvfs_create_impl(zfsvfs_t ** zfvp,zfsvfs_t * zfsvfs,objset_t * os)1051 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
1052 {
1053 int error;
1054
1055 zfsvfs->z_vfs = NULL;
1056 zfsvfs->z_parent = zfsvfs;
1057
1058 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1059 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
1060 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1061 offsetof(znode_t, z_link_node));
1062 TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
1063 zfsvfs_task_unlinked_drain, zfsvfs);
1064 ZFS_TEARDOWN_INIT(zfsvfs);
1065 ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs);
1066 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
1067 for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1068 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1069
1070 error = zfsvfs_init(zfsvfs, os);
1071 if (error != 0) {
1072 dmu_objset_disown(os, B_TRUE, zfsvfs);
1073 *zfvp = NULL;
1074 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1075 return (error);
1076 }
1077
1078 *zfvp = zfsvfs;
1079 return (0);
1080 }
1081
/*
 * Finish bringing a zfsvfs_t online: register the property callbacks,
 * open the ZIL and record the zfsvfs as the objset's user pointer.
 *
 * zfsvfs->z_vfs must already be set; it is dereferenced immediately.
 *
 * With 'mounting' set, the dataset kstats are created, the unlinked set
 * is drained (only when the mount is not read-only) and, if the pool is
 * writable, the intent log is replayed or destroyed.  With 'mounting'
 * clear, only the ZIL is reopened and the kstats are asserted to exist.
 *
 * Returns 0 on success, EROFS when a writable mount is requested on an
 * objset with an incompatible encryption version, or the error from
 * zfs_register_callbacks() / dataset_kstats_create().
 */
static int
zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
{
	int error;

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
	    dmu_objset_incompatible_encryption_version(zfsvfs->z_os))
		return (SET_ERROR(EROFS));

	error = zfs_register_callbacks(zfsvfs->z_vfs);
	if (error)
		return (error);

	/*
	 * If we are not mounting (ie: online recv), then we don't
	 * have to worry about replaying the log as we blocked all
	 * operations out since we closed the ZIL.
	 */
	if (mounting) {
		boolean_t readonly;

		ASSERT0P(zfsvfs->z_kstat.dk_kstats);
		error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
		/*
		 * NOTE(review): the callbacks registered above are not
		 * unregistered on this return; presumably the caller's
		 * error path is expected to cope - confirm.
		 */
		if (error)
			return (error);
		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
		    &zfsvfs->z_kstat.dk_zil_sums);

		/*
		 * During replay we remove the read only flag to
		 * allow replays to succeed.
		 */
		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
		if (readonly != 0) {
			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
		} else {
			dsl_dir_t *dd;
			zap_stats_t zs;

			/*
			 * Publish the size of the unlinked set in the
			 * kstats; a zap_get_stats() failure is ignored.
			 */
			if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
			    &zs) == 0) {
				dataset_kstats_update_nunlinks_kstat(
				    &zfsvfs->z_kstat, zs.zs_num_entries);
				dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
				    "num_entries in unlinked set: %llu",
				    (u_longlong_t)zs.zs_num_entries);
			}

			zfs_unlinked_drain(zfsvfs);
			dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
			dd->dd_activity_cancelled = B_FALSE;
		}

		/*
		 * Parse and replay the intent log.
		 *
		 * Because of ziltest, this must be done after
		 * zfs_unlinked_drain().  (Further note: ziltest
		 * doesn't use readonly mounts, where
		 * zfs_unlinked_drain() isn't called.)  This is because
		 * ziltest causes spa_sync() to think it's committed,
		 * but actually it is not, so the intent log contains
		 * many txg's worth of changes.
		 *
		 * In particular, if object N is in the unlinked set in
		 * the last txg to actually sync, then it could be
		 * actually freed in a later txg and then reallocated
		 * in a yet later txg.  This would write a "create
		 * object N" record to the intent log.  Normally, this
		 * would be fine because the spa_sync() would have
		 * written out the fact that object N is free, before
		 * we could write the "create object N" intent log
		 * record.
		 *
		 * But when we are in ziltest mode, we advance the "open
		 * txg" without actually spa_sync()-ing the changes to
		 * disk.  So we would see that object N is still
		 * allocated and in the unlinked set, and there is an
		 * intent log record saying to allocate it.
		 */
		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
			if (zil_replay_disable) {
				zil_destroy(zfsvfs->z_log, B_FALSE);
			} else {
				/*
				 * The name cache is switched off for the
				 * duration of the replay and the previous
				 * setting is put back afterwards.
				 */
				boolean_t use_nc = zfsvfs->z_use_namecache;
				zfsvfs->z_use_namecache = B_FALSE;
				zfsvfs->z_replay = B_TRUE;
				zil_replay(zfsvfs->z_os, zfsvfs,
				    zfs_replay_vector);
				zfsvfs->z_replay = B_FALSE;
				zfsvfs->z_use_namecache = use_nc;
			}
		}

		/* restore readonly bit */
		if (readonly != 0)
			zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
	} else {
		/* Resume path: kstats were created by the original mount. */
		ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL);
		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
		    &zfsvfs->z_kstat.dk_zil_sums);
	}

	/*
	 * Set the objset user_ptr to track its zfsvfs.
	 */
	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);

	return (0);
}
1198
1199 void
zfsvfs_free(zfsvfs_t * zfsvfs)1200 zfsvfs_free(zfsvfs_t *zfsvfs)
1201 {
1202 int i;
1203
1204 zfs_fuid_destroy(zfsvfs);
1205
1206 mutex_destroy(&zfsvfs->z_znodes_lock);
1207 mutex_destroy(&zfsvfs->z_lock);
1208 list_destroy(&zfsvfs->z_all_znodes);
1209 ZFS_TEARDOWN_DESTROY(zfsvfs);
1210 ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs);
1211 rw_destroy(&zfsvfs->z_fuid_lock);
1212 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1213 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1214 dataset_kstats_destroy(&zfsvfs->z_kstat);
1215 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1216 }
1217
1218 static void
zfs_set_fuid_feature(zfsvfs_t * zfsvfs)1219 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1220 {
1221 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1222 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1223 }
1224
/*
 * Defined elsewhere; zfs_domount() requires it to be non-zero before
 * advertising MNT_NAMEDATTR on a mount.
 */
extern int zfs_xattr_compat;
1226
/*
 * Perform the actual mount of dataset 'osname' on 'vfsp'.
 *
 * Creates the zfsvfs_t (owning the objset), fills in the mount's block
 * size, flags and fsid, applies snapshot-specific or regular setup, and
 * creates the .zfs control directory for non-snapshots.
 *
 * Returns 0 and increments zfs_active_fs_count on success.  If anything
 * fails after zfsvfs_create() succeeded, the objset is disowned and the
 * zfsvfs_t is freed before the error is returned.
 */
static int
zfs_domount(vfs_t *vfsp, char *osname)
{
	uint64_t recordsize, fsid_guid;
	int error = 0;
	zfsvfs_t *zfsvfs;

	ASSERT3P(vfsp, !=, NULL);
	ASSERT3P(osname, !=, NULL);

	error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs);
	if (error)
		return (error);
	zfsvfs->z_vfs = vfsp;

	/* The dataset's recordsize is reported as the preferred I/O size. */
	if ((error = dsl_prop_get_integer(osname,
	    "recordsize", &recordsize, NULL)))
		goto out;
	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;

	vfsp->vfs_data = zfsvfs;
	vfsp->mnt_flag |= MNT_LOCAL;
	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
	/*
	 * This can cause a loss of coherence between ARC and page cache
	 * on ZoF - unclear if the problem is in FreeBSD or ZoF
	 */
	vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */
	vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
	vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;

#if defined(_KERNEL) && !defined(KMEM_DEBUG)
	vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
#endif
	/*
	 * The fsid is 64 bits, composed of an 8-bit fs type, which
	 * separates our fsid from any other filesystem types, and a
	 * 56-bit objset unique ID.  The objset unique ID is unique to
	 * all objsets open on this system, provided by unique_create().
	 * The 8-bit fs type must be put in the low bits of fsid[1]
	 * because that's where other Solaris filesystems put it.
	 */
	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
	ASSERT3U((fsid_guid & ~((1ULL << 56) - 1)), ==, 0);
	vfsp->vfs_fsid.val[0] = fsid_guid;
	vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) |
	    (vfsp->mnt_vfc->vfc_typenum & 0xFF);

	/*
	 * Set features for file system.
	 */
	zfs_set_fuid_feature(zfsvfs);

	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
		uint64_t pval;

		/*
		 * Snapshots do not go through zfsvfs_setup(): the property
		 * callbacks are invoked directly (atime off, readonly on,
		 * xattr and acltype from the dataset properties) and the
		 * objset user pointer is set by hand below.
		 */
		atime_changed_cb(zfsvfs, B_FALSE);
		readonly_changed_cb(zfsvfs, B_TRUE);
		if ((error = dsl_prop_get_integer(osname,
		    "xattr", &pval, NULL)))
			goto out;
		xattr_changed_cb(zfsvfs, pval);
		if ((error = dsl_prop_get_integer(osname,
		    "acltype", &pval, NULL)))
			goto out;
		acl_type_changed_cb(zfsvfs, pval);
		zfsvfs->z_issnap = B_TRUE;
		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;

		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
	} else {
		if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
			goto out;
	}

#if __FreeBSD_version >= 1500040
	/*
	 * Named attributes can only work if the xattr property is set to
	 * on/dir and not sa.  Also, zfs_xattr_compat must be set.
	 */
	if ((zfsvfs->z_flags & ZSB_XATTR) != 0 && !zfsvfs->z_xattr_sa &&
	    zfs_xattr_compat)
		vfsp->mnt_flag |= MNT_NAMEDATTR;
#endif

	vfs_mountedfrom(vfsp, osname);

	if (!zfsvfs->z_issnap)
		zfsctl_create(zfsvfs);
out:
	/* Shared exit: undo ownership on failure, count the mount on success. */
	if (error) {
		dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
		zfsvfs_free(zfsvfs);
	} else {
		atomic_inc_32(&zfs_active_fs_count);
	}

	return (error);
}
1331
1332 static void
zfs_unregister_callbacks(zfsvfs_t * zfsvfs)1333 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1334 {
1335 objset_t *os = zfsvfs->z_os;
1336
1337 if (!dmu_objset_is_snapshot(os))
1338 dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
1339 }
1340
/*
 * Copy the pool component of dataset name 'osname' (everything before the
 * first '/', or the whole string when there is no '/') into 'poolname'.
 *
 * 'poolname' must have room for at least MAXNAMELEN bytes.
 *
 * Returns 0 on success, or ENAMETOOLONG when the pool component plus its
 * terminating NUL would not fit in MAXNAMELEN bytes; 'poolname' is not
 * written in that case.
 */
static int
getpoolname(const char *osname, char *poolname)
{
	const char *slash = strchr(osname, '/');
	size_t len;

	/* Length of the pool component, excluding any '/' separator. */
	if (slash != NULL)
		len = (size_t)(slash - osname);
	else
		len = strlen(osname);

	if (len >= MAXNAMELEN)
		return (ENAMETOOLONG);

	/* Single bounded copy; the length was validated just above. */
	memcpy(poolname, osname, len);
	poolname[len] = '\0';
	return (0);
}
1358
/*
 * Parse the option prefix of a mount source name.  A leading '!' requests
 * a checkpoint rewind: *checkpointrewind is set to true and the '!' is
 * removed from 'name' in place.  Otherwise *checkpointrewind is set to
 * false and 'name' is left unchanged.
 */
static void
fetch_osname_options(char *name, bool *checkpointrewind)
{
	const bool has_prefix = (name[0] == '!');

	*checkpointrewind = has_prefix;
	if (has_prefix) {
		/*
		 * strlen(name) still counts the '!', so this moves the
		 * remaining characters together with the terminating NUL.
		 */
		memmove(name, name + 1, strlen(name));
	}
}
1370
/*
 * VFS mount entry point.
 *
 * Reads the dataset name from the "from" mount option, checks that the
 * caller may mount it (system privilege, delegated 'mount' permission
 * plus ownership/write access to the mount point, or a .zfs snapshot
 * automount), enforces jail visibility, and then either refreshes the
 * property callbacks (remount) or performs a fresh mount through
 * zfs_domount().  For the initial root mount the pool is imported first.
 *
 * Returns 0 on success or an errno value; EINVAL when the "from" option
 * is missing.
 */
static int
zfs_mount(vfs_t *vfsp)
{
	kthread_t *td = curthread;
	vnode_t *mvp = vfsp->mnt_vnodecovered;
	cred_t *cr = td->td_ucred;
	char *osname;
	int error = 0;
	int canwrite;
	bool checkpointrewind, isctlsnap = false;

	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
		return (SET_ERROR(EINVAL));

	/*
	 * If full-owner-access is enabled and delegated administration is
	 * turned on, we must set nosuid.
	 *
	 * NOTE(review): this lookup runs before fetch_osname_options() has
	 * stripped a leading '!' from osname - confirm that is intended.
	 */
	if (zfs_super_owner &&
	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	fetch_osname_options(osname, &checkpointrewind);
	/* A snapshot being mounted on top of a .zfs control node. */
	isctlsnap = (mvp != NULL && zfsctl_is_node(mvp) &&
	    strchr(osname, '@') != NULL);

	/*
	 * Check for mount privilege?
	 *
	 * If we don't have privilege then see if
	 * we have local permission to allow it
	 */
	error = secpolicy_fs_mount(cr, mvp, vfsp);
	if (error && isctlsnap) {
		secpolicy_fs_mount_clearopts(cr, vfsp);
	} else if (error) {
		/*
		 * Every 'goto out' in this branch returns the error from
		 * secpolicy_fs_mount() above; 'error' is not reassigned.
		 */
		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
			goto out;

		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
			vattr_t vattr;

			/*
			 * Make sure user is the owner of the mount point
			 * or has sufficient privileges.
			 */

			vattr.va_mask = AT_UID;

			vn_lock(mvp, LK_SHARED | LK_RETRY);
			if (VOP_GETATTR(mvp, &vattr, cr)) {
				VOP_UNLOCK(mvp);
				goto out;
			}

			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
				VOP_UNLOCK(mvp);
				goto out;
			}
			VOP_UNLOCK(mvp);
		}

		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	/*
	 * Refuse to mount a filesystem if we are in a local zone and the
	 * dataset is not visible.
	 */
	if (!INGLOBALZONE(curproc) &&
	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
		boolean_t mount_snapshot = B_FALSE;

		/*
		 * Snapshots may be mounted in .zfs for unjailed datasets
		 * if allowed by the jail param zfs.mount_snapshot.
		 */
		if (isctlsnap) {
			struct prison *pr;
			struct zfs_jailparam *zjp;

			pr = curthread->td_ucred->cr_prison;
			mtx_lock(&pr->pr_mtx);
			zjp = osd_jail_get(pr, zfs_jailparam_slot);
			mtx_unlock(&pr->pr_mtx);
			if (zjp && zjp->mount_snapshot)
				mount_snapshot = B_TRUE;
		}
		if (!mount_snapshot) {
			error = SET_ERROR(EPERM);
			goto out;
		}
	}

	vfsp->vfs_flag |= MNT_NFS4ACLS;

	/*
	 * When doing a remount, we simply refresh our temporary properties
	 * according to those options set in the current VFS options.
	 */
	if (vfsp->vfs_flag & MS_REMOUNT) {
		zfsvfs_t *zfsvfs = vfsp->vfs_data;

		/*
		 * Refresh mount options with z_teardown_lock blocking I/O while
		 * the filesystem is in an inconsistent state.
		 * The lock also serializes this code with filesystem
		 * manipulations between entry to zfs_suspend_fs() and return
		 * from zfs_resume_fs().
		 */
		ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
		zfs_unregister_callbacks(zfsvfs);
		error = zfs_register_callbacks(vfsp);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		goto out;
	}

	/* Initial root mount: try hard to import the requested root pool. */
	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
		char pname[MAXNAMELEN];

		error = getpoolname(osname, pname);
		if (error == 0)
			error = spa_import_rootpool(pname, checkpointrewind);
		if (error)
			goto out;
	}
	/* The mount itself runs with Giant dropped. */
	DROP_GIANT();
	error = zfs_domount(vfsp, osname);
	PICKUP_GIANT();

out:
	return (error);
}
1508
1509 static int
zfs_statfs(vfs_t * vfsp,struct statfs * statp)1510 zfs_statfs(vfs_t *vfsp, struct statfs *statp)
1511 {
1512 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1513 uint64_t refdbytes, availbytes, usedobjs, availobjs;
1514 int error;
1515
1516 statp->f_version = STATFS_VERSION;
1517
1518 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
1519 return (error);
1520
1521 dmu_objset_space(zfsvfs->z_os,
1522 &refdbytes, &availbytes, &usedobjs, &availobjs);
1523
1524 /*
1525 * The underlying storage pool actually uses multiple block sizes.
1526 * We report the fragsize as the smallest block size we support,
1527 * and we report our blocksize as the filesystem's maximum blocksize.
1528 */
1529 statp->f_bsize = SPA_MINBLOCKSIZE;
1530 statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
1531
1532 /*
1533 * The following report "total" blocks of various kinds in the
1534 * file system, but reported in terms of f_frsize - the
1535 * "fragment" size.
1536 */
1537
1538 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1539 statp->f_bfree = availbytes / statp->f_bsize;
1540 statp->f_bavail = statp->f_bfree; /* no root reservation */
1541
1542 /*
1543 * statvfs() should really be called statufs(), because it assumes
1544 * static metadata. ZFS doesn't preallocate files, so the best
1545 * we can do is report the max that could possibly fit in f_files,
1546 * and that minus the number actually used in f_ffree.
1547 * For f_ffree, report the smaller of the number of object available
1548 * and the number of blocks (each object will take at least a block).
1549 */
1550 statp->f_ffree = MIN(availobjs, statp->f_bfree);
1551 statp->f_files = statp->f_ffree + usedobjs;
1552
1553 /*
1554 * We're a zfs filesystem.
1555 */
1556 strlcpy(statp->f_fstypename, "zfs",
1557 sizeof (statp->f_fstypename));
1558
1559 strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
1560 sizeof (statp->f_mntfromname));
1561 strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
1562 sizeof (statp->f_mntonname));
1563
1564 statp->f_namemax =
1565 zfsvfs->z_longname ? (ZAP_MAXNAMELEN_NEW - 1) : (MAXNAMELEN - 1);
1566
1567 zfs_exit(zfsvfs, FTAG);
1568 return (0);
1569 }
1570
1571 static int
zfs_root(vfs_t * vfsp,int flags,vnode_t ** vpp)1572 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
1573 {
1574 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1575 znode_t *rootzp;
1576 int error;
1577
1578 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
1579 return (error);
1580
1581 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1582 if (error == 0)
1583 *vpp = ZTOV(rootzp);
1584
1585 zfs_exit(zfsvfs, FTAG);
1586
1587 if (error == 0) {
1588 error = vn_lock(*vpp, flags);
1589 if (error != 0) {
1590 VN_RELE(*vpp);
1591 *vpp = NULL;
1592 }
1593 }
1594 return (error);
1595 }
1596
/*
 * Teardown the zfsvfs::z_os.
 *
 * Waits for outstanding zrele work, takes the teardown locks, closes the
 * ZIL, releases every znode's dbuf hold, unregisters the property
 * callbacks, syncs dirty data (writable filesystems only) and evicts the
 * objset's dbufs.
 *
 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
 * and 'z_teardown_inactive_lock' held.  If 'unmounting' is TRUE both
 * locks are dropped before returning and z_unmounted is set.
 *
 * Returns 0 on success.  Returns EIO, with both locks released, when
 * 'unmounting' is FALSE and the filesystem was already unmounted or has
 * no objset.
 */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
	znode_t *zp;
	dsl_dir_t *dd;

	/*
	 * If someone has not already unmounted this file system,
	 * drain the zrele_taskq to ensure all active references to the
	 * zfsvfs_t have been handled; only then can it be safely destroyed.
	 */
	if (zfsvfs->z_os) {
		/*
		 * If we're unmounting we have to wait for the list to
		 * drain completely.
		 *
		 * If we're not unmounting there's no guarantee the list
		 * will drain completely, but zreles run from the taskq
		 * may add the parents of dir-based xattrs to the taskq
		 * so we want to wait for these.
		 *
		 * We can safely check z_all_znodes for being empty because the
		 * VFS has already blocked operations which add to it.
		 */
		int round = 0;
		while (!list_is_empty(&zfsvfs->z_all_znodes)) {
			taskq_wait_outstanding(dsl_pool_zrele_taskq(
			    dmu_objset_pool(zfsvfs->z_os)), 0);
			/* Not unmounting: give up after the second wait. */
			if (++round > 1 && !unmounting)
				break;
		}
	}
	ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);

	if (!unmounting) {
		/*
		 * We purge the parent filesystem's vfsp as the parent
		 * filesystem and all of its snapshots have their vnode's
		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
		 * 'z_parent' is self referential for non-snapshots.
		 */
#ifdef FREEBSD_NAMECACHE
		cache_purgevfs(zfsvfs->z_parent->z_vfs);
#endif
	}

	/*
	 * Close the zil. NB: Can't close the zil while zfs_inactive
	 * threads are blocked as zil_close can call zfs_inactive.
	 */
	if (zfsvfs->z_log) {
		zil_close(zfsvfs->z_log);
		zfsvfs->z_log = NULL;
	}

	ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs);

	/*
	 * If we are not unmounting (ie: online recv) and someone already
	 * unmounted this file system while we were doing the switcheroo,
	 * or a reopen of z_os failed then just bail out now.
	 */
	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		return (SET_ERROR(EIO));
	}

	/*
	 * At this point there are no vops active, and any new vops will
	 * fail with EIO since we have z_teardown_lock for writer (only
	 * relevant for forced unmount).
	 *
	 * Release all holds on dbufs.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		if (zp->z_sa_hdl != NULL) {
			zfs_znode_dmu_fini(zp);
		}
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * If we are unmounting, set the unmounted flag and let new vops
	 * unblock.  zfs_inactive will have the unmounted behavior, and all
	 * other vops will fail with EIO.
	 */
	if (unmounting) {
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * z_os will be NULL if there was an error in attempting to reopen
	 * zfsvfs, so just return as the properties had already been
	 * unregistered and cached data had been evicted before.
	 *
	 * (Only reachable when unmounting: the non-unmounting case with a
	 * NULL z_os already returned EIO above.)
	 */
	if (zfsvfs->z_os == NULL)
		return (0);

	/*
	 * Unregister properties.
	 */
	zfs_unregister_callbacks(zfsvfs);

	/*
	 * Evict cached data.  We must write out any dirty data before
	 * disowning the dataset.
	 */
	objset_t *os = zfsvfs->z_os;
	boolean_t os_dirty = B_FALSE;
	/* The objset counts as dirty if any of the TXG_SIZE slots is. */
	for (int t = 0; t < TXG_SIZE; t++) {
		if (dmu_objset_is_dirty(os, t)) {
			os_dirty = B_TRUE;
			break;
		}
	}
	if (!zfs_is_readonly(zfsvfs) && os_dirty)
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
	dmu_objset_evict_dbufs(zfsvfs->z_os);
	dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
	dsl_dir_cancel_waiters(dd);

	return (0);
}
1731
/*
 * VFS unmount entry point.
 *
 * Checks permission (system policy, falling back to the delegated
 * 'mount' permission), unmounts snapshots mounted under .zfs, flushes
 * the vnodes, stops the unlinked-drain task, tears the filesystem down,
 * releases the objset and finally frees the zfsvfs_t.
 *
 * 'fflag' may contain MS_FORCE to request a forced unmount.
 *
 * Returns 0 on success, or the error from the permission check,
 * zfsctl_umount_snapshots() or vflush(); nothing has been torn down
 * when an error is returned.
 */
static int
zfs_umount(vfs_t *vfsp, int fflag)
{
	kthread_t *td = curthread;
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	objset_t *os;
	cred_t *cr = td->td_ucred;
	int ret;

	ret = secpolicy_fs_unmount(cr, vfsp);
	if (ret) {
		/* On denial the original policy error is what is returned. */
		if (dsl_deleg_access((char *)vfsp->vfs_resource,
		    ZFS_DELEG_PERM_MOUNT, cr))
			return (ret);
	}

	/*
	 * Unmount any snapshots mounted under .zfs before unmounting the
	 * dataset itself.
	 */
	if (zfsvfs->z_ctldir != NULL) {
		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
			return (ret);
	}

	if (fflag & MS_FORCE) {
		/*
		 * Mark file system as unmounted before calling
		 * vflush(FORCECLOSE).  This way we ensure no future vnops
		 * will be called and risk operating on DOOMED vnodes.
		 */
		ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * Flush all the files.
	 */
	ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
	if (ret != 0)
		return (ret);
	/*
	 * Cancel the unlinked-drain task; whenever the cancel call reports
	 * failure, drain the task and try again.
	 */
	while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
	    &zfsvfs->z_unlinked_drain_task, NULL) != 0)
		taskqueue_drain(zfsvfs_taskq->tq_queue,
		    &zfsvfs->z_unlinked_drain_task);

	VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE));
	os = zfsvfs->z_os;

	/*
	 * z_os will be NULL if there was an error in
	 * attempting to reopen zfsvfs.
	 */
	if (os != NULL) {
		/*
		 * Unset the objset user_ptr.
		 */
		mutex_enter(&os->os_user_ptr_lock);
		dmu_objset_set_user(os, NULL);
		mutex_exit(&os->os_user_ptr_lock);

		/*
		 * Finally release the objset
		 */
		dmu_objset_disown(os, B_TRUE, zfsvfs);
	}

	/*
	 * We can now safely destroy the '.zfs' directory node.
	 */
	if (zfsvfs->z_ctldir != NULL)
		zfsctl_destroy(zfsvfs);
	/* Frees the zfsvfs_t and drops the active filesystem count. */
	zfs_freevfs(vfsp);

	return (0);
}
1809
1810 static int
zfs_vget(vfs_t * vfsp,ino_t ino,int flags,vnode_t ** vpp)1811 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
1812 {
1813 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1814 znode_t *zp;
1815 int err;
1816
1817 /*
1818 * zfs_zget() can't operate on virtual entries like .zfs/ or
1819 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
1820 * This will make NFS to switch to LOOKUP instead of using VGET.
1821 */
1822 if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
1823 (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
1824 return (EOPNOTSUPP);
1825
1826 if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1827 return (err);
1828 err = zfs_zget(zfsvfs, ino, &zp);
1829 if (err == 0 && zp->z_unlinked) {
1830 vrele(ZTOV(zp));
1831 err = EINVAL;
1832 }
1833 if (err == 0)
1834 *vpp = ZTOV(zp);
1835 zfs_exit(zfsvfs, FTAG);
1836 if (err == 0) {
1837 err = vn_lock(*vpp, flags);
1838 if (err != 0)
1839 vrele(*vpp);
1840 #if __FreeBSD_version >= 1500040
1841 else if ((zp->z_pflags & ZFS_XATTR) != 0) {
1842 if ((*vpp)->v_type == VDIR)
1843 vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
1844 else
1845 vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR);
1846 }
1847 #endif
1848 }
1849 if (err != 0)
1850 *vpp = NULL;
1851 return (err);
1852 }
1853
1854 static int
zfs_checkexp(vfs_t * vfsp,struct sockaddr * nam,uint64_t * extflagsp,struct ucred ** credanonp,int * numsecflavors,int * secflavors)1855 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
1856 struct ucred **credanonp, int *numsecflavors, int *secflavors)
1857 {
1858 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1859
1860 /*
1861 * If this is regular file system vfsp is the same as
1862 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
1863 * zfsvfs->z_parent->z_vfs represents parent file system
1864 * which we have to use here, because only this file system
1865 * has mnt_export configured.
1866 */
1867 return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
1868 credanonp, numsecflavors, secflavors));
1869 }
1870
1871 _Static_assert(sizeof (struct fid) >= SHORT_FID_LEN,
1872 "struct fid bigger than SHORT_FID_LEN");
1873 _Static_assert(sizeof (struct fid) >= LONG_FID_LEN,
1874 "struct fid bigger than LONG_FID_LEN");
1875
/*
 * VFS fhtovp entry point: translate the file handle 'fidp' into a vnode
 * returned in *vpp, locked according to 'flags'.
 *
 * A short FID carries an object number and generation.  A long FID
 * additionally carries an objset id and set generation; when it is
 * presented to a non-snapshot filesystem, the lookup is redirected to
 * the filesystem found by zfsctl_lookup_objset().  Handles that name the
 * .zfs control entries are resolved through zfsctl_root().
 *
 * Returns 0 on success.  Returns EINVAL for an unsupported FID length, a
 * non-zero set generation, a failed objset lookup, an unlinked object or
 * a generation mismatch; other errors come from zfs_enter(), zfs_zget()
 * or vn_lock().  *vpp is set to NULL at entry and again if vn_lock()
 * fails.
 */
static int
zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
{
	struct componentname cn;
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	znode_t *zp;
	vnode_t *dvp;
	uint64_t object = 0;
	uint64_t fid_gen = 0;
	uint64_t setgen = 0;
	uint64_t gen_mask;
	uint64_t zp_gen;
	int i, err;

	*vpp = NULL;

	if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
		return (err);

	/*
	 * On FreeBSD we can get snapshot's mount point or its parent file
	 * system mount point depending if snapshot is already mounted or not.
	 */
	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
		zfid_long_t *zlfid = (zfid_long_t *)fidp;
		uint64_t objsetid = 0;

		/* Both fields are assembled least-significant byte first. */
		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);

		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);

		zfs_exit(zfsvfs, FTAG);

		/* From here on 'zfsvfs' is the filesystem found by objset id. */
		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
		if (err)
			return (SET_ERROR(EINVAL));
		if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
			return (err);
	}

	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
		zfid_short_t *zfid = (zfid_short_t *)fidp;

		for (i = 0; i < sizeof (zfid->zf_object); i++)
			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);

		/*
		 * This loop leaves i == sizeof (zfid->zf_gen); gen_mask
		 * below is derived from that final value.
		 */
		for (i = 0; i < sizeof (zfid->zf_gen); i++)
			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
	} else {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	if (fidp->fid_len == LONG_FID_LEN && setgen != 0) {
		zfs_exit(zfsvfs, FTAG);
		dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n",
		    (u_longlong_t)fid_gen, (u_longlong_t)setgen);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
	 * directory tree.  If the object == zfsvfs->z_shares_dir, then
	 * we are in the .zfs/shares directory tree.
	 */
	if ((fid_gen == 0 &&
	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
	    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
		zfs_exit(zfsvfs, FTAG);
		VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
		if (object == ZFSCTL_INO_SNAPDIR) {
			cn.cn_nameptr = "snapshot";
			cn.cn_namelen = strlen(cn.cn_nameptr);
			cn.cn_nameiop = LOOKUP;
			cn.cn_flags = ISLASTCN | LOCKLEAF;
			cn.cn_lkflags = flags;
			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
			vput(dvp);
		} else if (object == zfsvfs->z_shares_dir) {
			/*
			 * XXX This branch must not be taken,
			 * if it is, then the lookup below will
			 * explode.
			 */
			cn.cn_nameptr = "shares";
			cn.cn_namelen = strlen(cn.cn_nameptr);
			cn.cn_nameiop = LOOKUP;
			cn.cn_flags = ISLASTCN;
			cn.cn_lkflags = flags;
			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
			vput(dvp);
		} else {
			/* The .zfs root itself: hand back the control root. */
			*vpp = dvp;
		}
		/* err is 0 here: the last zfs_enter() above succeeded. */
		return (err);
	}

	/* Mask covering as many low-order bytes as zf_gen holds. */
	gen_mask = -1ULL >> (64 - 8 * i);

	dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object,
	    (u_longlong_t)fid_gen,
	    (u_longlong_t)gen_mask);
	if ((err = zfs_zget(zfsvfs, object, &zp))) {
		zfs_exit(zfsvfs, FTAG);
		return (err);
	}
	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
	    sizeof (uint64_t));
	zp_gen = zp_gen & gen_mask;
	/* A masked generation of 0 is compared as 1. */
	if (zp_gen == 0)
		zp_gen = 1;
	if (zp->z_unlinked || zp_gen != fid_gen) {
		dprintf("znode gen (%llu) != fid gen (%llu)\n",
		    (u_longlong_t)zp_gen, (u_longlong_t)fid_gen);
		vrele(ZTOV(zp));
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	*vpp = ZTOV(zp);
	zfs_exit(zfsvfs, FTAG);
	err = vn_lock(*vpp, flags);
	if (err == 0) {
		vnode_create_vobject(*vpp, zp->z_size, curthread);
#if __FreeBSD_version >= 1500040
		if ((zp->z_pflags & ZFS_XATTR) != 0) {
			if ((*vpp)->v_type == VDIR)
				vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR);
			else
				vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR);
		}
#endif
	} else
		*vpp = NULL;
	return (err);
}
2014
2015 /*
2016 * Block out VOPs and close zfsvfs_t::z_os
2017 *
2018 * Note, if successful, then we return with the 'z_teardown_lock' and
2019 * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
2020 * dataset and objset intact so that they can be atomically handed off during
2021 * a subsequent rollback or recv operation and the resume thereafter.
2022 */
2023 int
zfs_suspend_fs(zfsvfs_t * zfsvfs)2024 zfs_suspend_fs(zfsvfs_t *zfsvfs)
2025 {
2026 int error;
2027
2028 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
2029 return (error);
2030
2031 return (0);
2032 }
2033
/*
 * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
 * is an invariant across any of the operations that can be performed while the
 * filesystem was suspended.  Whether it succeeded or failed, the preconditions
 * are the same: the relevant objset and associated dataset are owned by
 * zfsvfs, held, and long held on entry.
 *
 * Both teardown locks must be write held on entry; they are released
 * before returning, on success and on failure alike.
 *
 * Returns 0 on success.  If zfsvfs_init() fails, its error is returned
 * after a forced unmount of the filesystem has been attempted.
 */
int
zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
	int err;
	znode_t *zp;

	ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
	ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));

	/*
	 * We already own this, so just update the objset_t, as the one we
	 * had before may have been evicted.
	 */
	objset_t *os;
	VERIFY3P(ds->ds_owner, ==, zfsvfs);
	VERIFY(dsl_dataset_long_held(ds));
	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
	dsl_pool_config_enter(dp, FTAG);
	VERIFY0(dmu_objset_from_ds(ds, &os));
	dsl_pool_config_exit(dp, FTAG);

	err = zfsvfs_init(zfsvfs, os);
	if (err != 0)
		goto bail;

	ds->ds_dir->dd_activity_cancelled = B_FALSE;
	/* Non-mounting setup: reopens the ZIL without replaying it. */
	VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE));

	zfs_set_fuid_feature(zfsvfs);

	/*
	 * Attempt to re-establish all the active znodes with
	 * their dbufs.  If a zfs_rezget() fails, then we'll let
	 * any potential callers discover that via zfs_enter_verify_zp
	 * when they try to use their znode.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		(void) zfs_rezget(zp);
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

bail:
	/* release the VOPs */
	ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
	ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);

	if (err) {
		/*
		 * Since we couldn't setup the sa framework, try to force
		 * unmount this file system.
		 */
		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
			vfs_ref(zfsvfs->z_vfs);
			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
		}
	}
	return (err);
}
2101
2102 static void
zfs_freevfs(vfs_t * vfsp)2103 zfs_freevfs(vfs_t *vfsp)
2104 {
2105 zfsvfs_t *zfsvfs = vfsp->vfs_data;
2106
2107 zfsvfs_free(zfsvfs);
2108
2109 atomic_dec_32(&zfs_active_fs_count);
2110 }
2111
#ifdef __i386__
/*
 * Value of desiredvnodes saved by zfs_vnodes_adjust() and written back by
 * zfs_vnodes_adjust_back().  Only present on i386 builds.
 */
static int desiredvnodes_backup;
#include <sys/vmmeter.h>


#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#endif
2122
/*
 * On i386, lower desiredvnodes to three quarters of its computed default,
 * unless it appears to have been tuned.  A no-op on other architectures.
 */
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
	int untuned;

	/* Keep the current value so zfs_vnodes_adjust_back() can restore it. */
	desiredvnodes_backup = desiredvnodes;

	/*
	 * Recompute the limit the same way vntblinit() does.  A mismatch
	 * with desiredvnodes means the administrator tuned it, in which
	 * case it is left alone.
	 */
	untuned = min(maxproc + vm_cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof (struct vm_object) +
	    sizeof (struct vnode))));
	if (untuned != desiredvnodes)
		return;

	desiredvnodes = (3 * untuned) / 4;
#endif
}
2143
/*
 * On i386, restore desiredvnodes to the value saved by
 * zfs_vnodes_adjust().  A no-op on other architectures.
 */
static void
zfs_vnodes_adjust_back(void)
{
#ifdef __i386__
	desiredvnodes = desiredvnodes_backup;
#endif
}
2152
/* Held exclusively by zfs_prune_task() around vnlru_free_vfsops(). */
static struct sx zfs_vnlru_lock;
/* Marker handed to vnlru_free_vfsops(); allocated in zfs_init(). */
static struct vnode *zfs_vnlru_marker;
/* Handle from arc_add_prune_callback(); removed again in zfs_fini(). */
static arc_prune_t *zfs_prune;
2156
2157 static void
zfs_prune_task(uint64_t nr_to_scan,void * arg __unused)2158 zfs_prune_task(uint64_t nr_to_scan, void *arg __unused)
2159 {
2160 if (nr_to_scan > INT_MAX)
2161 nr_to_scan = INT_MAX;
2162 sx_xlock(&zfs_vnlru_lock);
2163 vnlru_free_vfsops(nr_to_scan, &zfs_vfsops, zfs_vnlru_marker);
2164 sx_xunlock(&zfs_vnlru_lock);
2165 }
2166
/*
 * Module-wide ZPL initialization: prints the ZPL version, initializes the
 * .zfs and znode subsystems, adjusts the vnode limit, registers the
 * DMU_OST_ZFS objset type, creates the zfsvfs task queue and installs the
 * ARC prune callback.  zfs_fini() tears these down again.
 */
void
zfs_init(void)
{

	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");

	/*
	 * Initialize .zfs directory structures
	 */
	zfsctl_init();

	/*
	 * Initialize znode cache, vnode ops, etc...
	 */
	zfs_znode_init();

	/*
	 * Reduce number of vnodes. Originally number of vnodes is calculated
	 * with UFS inode in mind. We reduce it here, because it's too big for
	 * ZFS/i386.
	 */
	zfs_vnodes_adjust();

	/* Associate zpl_get_file_info with objsets of type DMU_OST_ZFS. */
	dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);

	zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);

	/*
	 * The marker and lock must exist before zfs_prune_task() can be
	 * invoked, so they are set up before the callback is registered.
	 */
	zfs_vnlru_marker = vnlru_alloc_marker();
	sx_init(&zfs_vnlru_lock, "zfs vnlru lock");
	zfs_prune = arc_add_prune_callback(zfs_prune_task, NULL);
}
2198
/*
 * Module-wide ZPL teardown; releases what zfs_init() set up.
 */
void
zfs_fini(void)
{
	/*
	 * Unregister the prune callback first, then free the marker and
	 * lock that zfs_prune_task() uses.
	 */
	arc_remove_prune_callback(zfs_prune);
	vnlru_free_marker(zfs_vnlru_marker);
	sx_destroy(&zfs_vnlru_lock);

	taskq_destroy(zfsvfs_taskq);
	zfsctl_fini();
	zfs_znode_fini();
	/* Restore the vnode limit changed by zfs_vnodes_adjust(). */
	zfs_vnodes_adjust_back();
}
2211
2212 int
zfs_busy(void)2213 zfs_busy(void)
2214 {
2215 return (zfs_active_fs_count != 0);
2216 }
2217
2218 /*
2219 * Release VOPs and unmount a suspended filesystem.
2220 */
2221 int
zfs_end_fs(zfsvfs_t * zfsvfs,dsl_dataset_t * ds)2222 zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
2223 {
2224 ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
2225 ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));
2226
2227 /*
2228 * We already own this, so just hold and rele it to update the
2229 * objset_t, as the one we had before may have been evicted.
2230 */
2231 objset_t *os;
2232 VERIFY3P(ds->ds_owner, ==, zfsvfs);
2233 VERIFY(dsl_dataset_long_held(ds));
2234 dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
2235 dsl_pool_config_enter(dp, FTAG);
2236 VERIFY0(dmu_objset_from_ds(ds, &os));
2237 dsl_pool_config_exit(dp, FTAG);
2238 zfsvfs->z_os = os;
2239
2240 /* release the VOPs */
2241 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
2242 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
2243
2244 /*
2245 * Try to force unmount this file system.
2246 */
2247 (void) zfs_umount(zfsvfs->z_vfs, 0);
2248 zfsvfs->z_unmounted = B_TRUE;
2249 return (0);
2250 }
2251
2252 int
zfs_set_version(zfsvfs_t * zfsvfs,uint64_t newvers)2253 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2254 {
2255 int error;
2256 objset_t *os = zfsvfs->z_os;
2257 dmu_tx_t *tx;
2258
2259 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2260 return (SET_ERROR(EINVAL));
2261
2262 if (newvers < zfsvfs->z_version)
2263 return (SET_ERROR(EINVAL));
2264
2265 if (zfs_spa_version_map(newvers) >
2266 spa_version(dmu_objset_spa(zfsvfs->z_os)))
2267 return (SET_ERROR(ENOTSUP));
2268
2269 tx = dmu_tx_create(os);
2270 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2271 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2272 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2273 ZFS_SA_ATTRS);
2274 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2275 }
2276 error = dmu_tx_assign(tx, DMU_TX_WAIT);
2277 if (error) {
2278 dmu_tx_abort(tx);
2279 return (error);
2280 }
2281
2282 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2283 8, 1, &newvers, tx);
2284
2285 if (error) {
2286 dmu_tx_commit(tx);
2287 return (error);
2288 }
2289
2290 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2291 uint64_t sa_obj;
2292
2293 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2294 SPA_VERSION_SA);
2295 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2296 DMU_OT_NONE, 0, tx);
2297
2298 error = zap_add(os, MASTER_NODE_OBJ,
2299 ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2300 ASSERT0(error);
2301
2302 VERIFY0(sa_set_sa_object(os, sa_obj));
2303 sa_register_update_callback(os, zfs_sa_upgrade);
2304 }
2305
2306 spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
2307 "from %ju to %ju", (uintmax_t)zfsvfs->z_version,
2308 (uintmax_t)newvers);
2309 dmu_tx_commit(tx);
2310
2311 zfsvfs->z_version = newvers;
2312 os->os_version = newvers;
2313
2314 zfs_set_fuid_feature(zfsvfs);
2315
2316 return (0);
2317 }
2318
2319 int
zfs_set_default_quota(zfsvfs_t * zfsvfs,zfs_prop_t prop,uint64_t quota)2320 zfs_set_default_quota(zfsvfs_t *zfsvfs, zfs_prop_t prop, uint64_t quota)
2321 {
2322 int error;
2323 objset_t *os = zfsvfs->z_os;
2324 const char *propstr = zfs_prop_to_name(prop);
2325 dmu_tx_t *tx;
2326
2327 tx = dmu_tx_create(os);
2328 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, propstr);
2329 error = dmu_tx_assign(tx, DMU_TX_WAIT);
2330 if (error) {
2331 dmu_tx_abort(tx);
2332 return (error);
2333 }
2334
2335 if (quota == 0) {
2336 error = zap_remove(os, MASTER_NODE_OBJ, propstr, tx);
2337 if (error == ENOENT)
2338 error = 0;
2339 } else {
2340 error = zap_update(os, MASTER_NODE_OBJ, propstr, 8, 1,
2341 "a, tx);
2342 }
2343
2344 if (error)
2345 goto out;
2346
2347 switch (prop) {
2348 case ZFS_PROP_DEFAULTUSERQUOTA:
2349 zfsvfs->z_defaultuserquota = quota;
2350 break;
2351 case ZFS_PROP_DEFAULTGROUPQUOTA:
2352 zfsvfs->z_defaultgroupquota = quota;
2353 break;
2354 case ZFS_PROP_DEFAULTPROJECTQUOTA:
2355 zfsvfs->z_defaultprojectquota = quota;
2356 break;
2357 case ZFS_PROP_DEFAULTUSEROBJQUOTA:
2358 zfsvfs->z_defaultuserobjquota = quota;
2359 break;
2360 case ZFS_PROP_DEFAULTGROUPOBJQUOTA:
2361 zfsvfs->z_defaultgroupobjquota = quota;
2362 break;
2363 case ZFS_PROP_DEFAULTPROJECTOBJQUOTA:
2364 zfsvfs->z_defaultprojectobjquota = quota;
2365 break;
2366 default:
2367 break;
2368 }
2369
2370 out:
2371 dmu_tx_commit(tx);
2372 return (error);
2373 }
2374
2375 /*
2376 * Return true if the corresponding vfs's unmounted flag is set.
2377 * Otherwise return false.
2378 * If this function returns true we know VFS unmount has been initiated.
2379 */
2380 boolean_t
zfs_get_vfs_flag_unmounted(objset_t * os)2381 zfs_get_vfs_flag_unmounted(objset_t *os)
2382 {
2383 zfsvfs_t *zfvp;
2384 boolean_t unmounted = B_FALSE;
2385
2386 ASSERT3U(dmu_objset_type(os), ==, DMU_OST_ZFS);
2387
2388 mutex_enter(&os->os_user_ptr_lock);
2389 zfvp = dmu_objset_get_user(os);
2390 if (zfvp != NULL && zfvp->z_vfs != NULL &&
2391 (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
2392 unmounted = B_TRUE;
2393 mutex_exit(&os->os_user_ptr_lock);
2394
2395 return (unmounted);
2396 }
2397
#ifdef _KERNEL
/*
 * Rewrite the f_mntfromname of every mount in mountlist that refers to
 * oldname so that it refers to newname instead.
 *
 * A mount is updated when its f_mntfromname equals oldname, or starts with
 * oldname immediately followed by '/' or '@'; in the latter case the
 * remainder of the name is kept.  The list is walked under mountlist_mtx.
 */
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
	const size_t oldlen = strlen(oldname);
	char tmpbuf[MAXPATHLEN];
	struct mount *mp;

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		char *fromname = mp->mnt_stat.f_mntfromname;

		if (strcmp(fromname, oldname) == 0) {
			/* Exact match: replace the whole name. */
			(void) strlcpy(fromname, newname,
			    sizeof (mp->mnt_stat.f_mntfromname));
		} else if (strncmp(fromname, oldname, oldlen) == 0 &&
		    (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
			/*
			 * Prefix match: swap the prefix and keep the suffix.
			 * The result is built in tmpbuf because the suffix
			 * lives inside the destination buffer.
			 */
			(void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s",
			    newname, fromname + oldlen);
			(void) strlcpy(fromname, tmpbuf,
			    sizeof (mp->mnt_stat.f_mntfromname));
		}
	}
	mtx_unlock(&mountlist_mtx);
}
#endif
2429
2430 /*
2431 * Find a prison with ZFS info.
2432 * Return the ZFS info and the (locked) prison.
2433 */
2434 static struct zfs_jailparam *
zfs_jailparam_find(struct prison * spr,struct prison ** prp)2435 zfs_jailparam_find(struct prison *spr, struct prison **prp)
2436 {
2437 struct prison *pr;
2438 struct zfs_jailparam *zjp;
2439
2440 for (pr = spr; ; pr = pr->pr_parent) {
2441 mtx_lock(&pr->pr_mtx);
2442 if (pr == &prison0) {
2443 zjp = &zfs_jailparam0;
2444 break;
2445 }
2446 zjp = osd_jail_get(pr, zfs_jailparam_slot);
2447 if (zjp != NULL)
2448 break;
2449 mtx_unlock(&pr->pr_mtx);
2450 }
2451 *prp = pr;
2452
2453 return (zjp);
2454 }
2455
/*
 * Ensure a prison has its own ZFS info.  If zjpp is non-null, point it to the
 * ZFS info and lock the prison.
 *
 * When zjpp is non-null the function returns with pr->pr_mtx held and the
 * caller must unlock it; when zjpp is null the mutex is released before
 * returning.  A newly created record starts as a copy of the nearest
 * ancestor's ZFS info.
 */
static void
zfs_jailparam_alloc(struct prison *pr, struct zfs_jailparam **zjpp)
{
	struct prison *ppr;
	struct zfs_jailparam *zjp, *nzjp;
	void **rsv;

	/* If this prison already has ZFS info, return that. */
	zjp = zfs_jailparam_find(pr, &ppr);
	if (ppr == pr)
		goto done;

	/*
	 * Allocate a new info record.  Then check again, in case something
	 * changed during the allocation.
	 */
	/* The ancestor's lock is dropped before allocating. */
	mtx_unlock(&ppr->pr_mtx);
	nzjp = malloc(sizeof (struct zfs_jailparam), M_PRISON, M_WAITOK);
	rsv = osd_reserve(zfs_jailparam_slot);
	zjp = zfs_jailparam_find(pr, &ppr);
	if (ppr == pr) {
		/* The prison gained its own info meanwhile; discard ours. */
		free(nzjp, M_PRISON);
		osd_free_reserved(rsv);
		goto done;
	}
	/* Inherit the initial values from the ancestor. */
	/*
	 * ppr (the ancestor) is still locked from the find above; pr is
	 * locked here as well and stays locked until the done: label.
	 */
	mtx_lock(&pr->pr_mtx);
	(void) osd_jail_set_reserved(pr, zfs_jailparam_slot, rsv, nzjp);
	(void) memcpy(nzjp, zjp, sizeof (*zjp));
	zjp = nzjp;
	mtx_unlock(&ppr->pr_mtx);
done:
	/* On every path reaching here, pr->pr_mtx is held. */
	if (zjpp != NULL)
		*zjpp = zjp;
	else
		mtx_unlock(&pr->pr_mtx);
}
2497
2498 /*
2499 * Jail OSD methods for ZFS VFS info.
2500 */
2501 static int
zfs_jailparam_create(void * obj,void * data)2502 zfs_jailparam_create(void *obj, void *data)
2503 {
2504 struct prison *pr = obj;
2505 struct vfsoptlist *opts = data;
2506 int jsys;
2507
2508 if (vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)) == 0 &&
2509 jsys == JAIL_SYS_INHERIT)
2510 return (0);
2511 /*
2512 * Inherit a prison's initial values from its parent
2513 * (different from JAIL_SYS_INHERIT which also inherits changes).
2514 */
2515 zfs_jailparam_alloc(pr, NULL);
2516 return (0);
2517 }
2518
2519 static int
zfs_jailparam_get(void * obj,void * data)2520 zfs_jailparam_get(void *obj, void *data)
2521 {
2522 struct prison *ppr, *pr = obj;
2523 struct vfsoptlist *opts = data;
2524 struct zfs_jailparam *zjp;
2525 int jsys, error;
2526
2527 zjp = zfs_jailparam_find(pr, &ppr);
2528 jsys = (ppr == pr) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT;
2529 error = vfs_setopt(opts, "zfs", &jsys, sizeof (jsys));
2530 if (error != 0 && error != ENOENT)
2531 goto done;
2532 if (jsys == JAIL_SYS_NEW) {
2533 error = vfs_setopt(opts, "zfs.mount_snapshot",
2534 &zjp->mount_snapshot, sizeof (zjp->mount_snapshot));
2535 if (error != 0 && error != ENOENT)
2536 goto done;
2537 } else {
2538 /*
2539 * If this prison is inheriting its ZFS info, report
2540 * empty/zero parameters.
2541 */
2542 static int mount_snapshot = 0;
2543
2544 error = vfs_setopt(opts, "zfs.mount_snapshot",
2545 &mount_snapshot, sizeof (mount_snapshot));
2546 if (error != 0 && error != ENOENT)
2547 goto done;
2548 }
2549 error = 0;
2550 done:
2551 mtx_unlock(&ppr->pr_mtx);
2552 return (error);
2553 }
2554
2555 static int
zfs_jailparam_set(void * obj,void * data)2556 zfs_jailparam_set(void *obj, void *data)
2557 {
2558 struct prison *pr = obj;
2559 struct prison *ppr;
2560 struct vfsoptlist *opts = data;
2561 int error, jsys, mount_snapshot;
2562
2563 /* Set the parameters, which should be correct. */
2564 error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
2565 if (error == ENOENT)
2566 jsys = -1;
2567 error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
2568 sizeof (mount_snapshot));
2569 if (error == ENOENT)
2570 mount_snapshot = -1;
2571 else
2572 jsys = JAIL_SYS_NEW;
2573 switch (jsys) {
2574 case JAIL_SYS_NEW:
2575 {
2576 /* "zfs=new" or "zfs.*": the prison gets its own ZFS info. */
2577 struct zfs_jailparam *zjp;
2578
2579 /*
2580 * A child jail cannot have more permissions than its parent
2581 */
2582 if (pr->pr_parent != &prison0) {
2583 zjp = zfs_jailparam_find(pr->pr_parent, &ppr);
2584 mtx_unlock(&ppr->pr_mtx);
2585 if (zjp->mount_snapshot < mount_snapshot) {
2586 return (EPERM);
2587 }
2588 }
2589 zfs_jailparam_alloc(pr, &zjp);
2590 if (mount_snapshot != -1)
2591 zjp->mount_snapshot = mount_snapshot;
2592 mtx_unlock(&pr->pr_mtx);
2593 break;
2594 }
2595 case JAIL_SYS_INHERIT:
2596 /* "zfs=inherit": inherit the parent's ZFS info. */
2597 mtx_lock(&pr->pr_mtx);
2598 osd_jail_del(pr, zfs_jailparam_slot);
2599 mtx_unlock(&pr->pr_mtx);
2600 break;
2601 case -1:
2602 /*
2603 * If the setting being changed is not ZFS related
2604 * then do nothing.
2605 */
2606 break;
2607 }
2608
2609 return (0);
2610 }
2611
2612 static int
zfs_jailparam_check(void * obj __unused,void * data)2613 zfs_jailparam_check(void *obj __unused, void *data)
2614 {
2615 struct vfsoptlist *opts = data;
2616 int error, jsys, mount_snapshot;
2617
2618 /* Check that the parameters are correct. */
2619 error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
2620 if (error != ENOENT) {
2621 if (error != 0)
2622 return (error);
2623 if (jsys != JAIL_SYS_NEW && jsys != JAIL_SYS_INHERIT)
2624 return (EINVAL);
2625 }
2626 error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
2627 sizeof (mount_snapshot));
2628 if (error != ENOENT) {
2629 if (error != 0)
2630 return (error);
2631 if (mount_snapshot != 0 && mount_snapshot != 1)
2632 return (EINVAL);
2633 }
2634 return (0);
2635 }
2636
2637 static void
zfs_jailparam_destroy(void * data)2638 zfs_jailparam_destroy(void *data)
2639 {
2640
2641 free(data, M_PRISON);
2642 }
2643
2644 static void
zfs_jailparam_sysinit(void * arg __unused)2645 zfs_jailparam_sysinit(void *arg __unused)
2646 {
2647 struct prison *pr;
2648 osd_method_t methods[PR_MAXMETHOD] = {
2649 [PR_METHOD_CREATE] = zfs_jailparam_create,
2650 [PR_METHOD_GET] = zfs_jailparam_get,
2651 [PR_METHOD_SET] = zfs_jailparam_set,
2652 [PR_METHOD_CHECK] = zfs_jailparam_check,
2653 };
2654
2655 zfs_jailparam_slot = osd_jail_register(zfs_jailparam_destroy, methods);
2656 /* Copy the defaults to any existing prisons. */
2657 sx_slock(&allprison_lock);
2658 TAILQ_FOREACH(pr, &allprison, pr_list)
2659 zfs_jailparam_alloc(pr, NULL);
2660 sx_sunlock(&allprison_lock);
2661 }
2662
2663 static void
zfs_jailparam_sysuninit(void * arg __unused)2664 zfs_jailparam_sysuninit(void *arg __unused)
2665 {
2666
2667 osd_jail_deregister(zfs_jailparam_slot);
2668 }
2669
/*
 * Hook the jail parameter setup and teardown functions into SI_SUB_DRIVERS
 * with order SI_ORDER_ANY; both receive a NULL argument.
 */
SYSINIT(zfs_jailparam_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY,
    zfs_jailparam_sysinit, NULL);
SYSUNINIT(zfs_jailparam_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY,
    zfs_jailparam_sysuninit, NULL);
2674