1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
24 * All rights reserved.
25 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
27 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
28 */
29
30 /* Portions Copyright 2010 Robert Milkowski */
31
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/sysmacros.h>
37 #include <sys/kmem.h>
38 #include <sys/acl.h>
39 #include <sys/vnode.h>
40 #include <sys/vfs.h>
41 #include <sys/mntent.h>
42 #include <sys/mount.h>
43 #include <sys/cmn_err.h>
44 #include <sys/zfs_znode.h>
45 #include <sys/zfs_vnops.h>
46 #include <sys/zfs_dir.h>
47 #include <sys/zil.h>
48 #include <sys/fs/zfs.h>
49 #include <sys/dmu.h>
50 #include <sys/dsl_prop.h>
51 #include <sys/dsl_dataset.h>
52 #include <sys/dsl_deleg.h>
53 #include <sys/spa.h>
54 #include <sys/zap.h>
55 #include <sys/sa.h>
56 #include <sys/sa_impl.h>
57 #include <sys/policy.h>
58 #include <sys/atomic.h>
59 #include <sys/zfs_ioctl.h>
60 #include <sys/zfs_ctldir.h>
61 #include <sys/zfs_fuid.h>
62 #include <sys/sunddi.h>
63 #include <sys/dmu_objset.h>
64 #include <sys/dsl_dir.h>
65 #include <sys/jail.h>
66 #include <sys/osd.h>
67 #include <ufs/ufs/quota.h>
68 #include <sys/zfs_quota.h>
69
70 #include "zfs_comutil.h"
71
/*
 * Fallback values for mount kernel flags, used only when the included
 * system headers do not already define them.
 */
#ifndef MNTK_VMSETSIZE_BUG
#define MNTK_VMSETSIZE_BUG 0
#endif
#ifndef MNTK_NOMSYNC
#define MNTK_NOMSYNC 8
#endif
78
/*
 * Mutex labelled "zfs_debug".  MTX_SYSINIT arranges for it to be set up as
 * a default (MTX_DEF) mutex during system initialization.
 */
struct mtx zfs_debug_mtx;
MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
81
/* Root of the vfs.zfs sysctl tree. */
SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");

/* vfs.zfs.super_owner: read-write integer knob. */
int zfs_super_owner;
SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
    "File system owners can perform privileged operation on file systems");

/* vfs.zfs.debug: read-write integer, also settable as a tunable (RWTUN). */
int zfs_debug_level;
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
    "Debug level");
91
/* ZFS settings kept per jail. */
struct zfs_jailparam {
	int mount_snapshot;
};

/* Default parameter set: snapshot mounting disabled. */
static struct zfs_jailparam zfs_jailparam0 = {
	.mount_snapshot = 0,
};

/*
 * NOTE(review): presumably the per-jail storage slot for struct
 * zfs_jailparam; it is not assigned in this part of the file.
 */
static int zfs_jailparam_slot;

/* Declares the "zfs" jail parameter node and its integer "mount_snapshot". */
SYSCTL_JAIL_PARAM_SYS_NODE(zfs, CTLFLAG_RW, "Jail ZFS parameters");
SYSCTL_JAIL_PARAM(_zfs, mount_snapshot, CTLTYPE_INT | CTLFLAG_RW, "I",
    "Allow mounting snapshots in the .zfs directory for unjailed datasets");
105
/*
 * vfs.zfs.version.*: read-only sysctls exposing the ACL, SPA and ZPL
 * version constants this module was compiled with.
 */
SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
static int zfs_version_acl = ZFS_ACL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
    "ZFS_ACL_VERSION");
static int zfs_version_spa = SPA_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
    "SPA_VERSION");
static int zfs_version_zpl = ZPL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
    "ZPL_VERSION");
116
/*
 * Forward declarations of the file-local VFS entry points.  From
 * __FreeBSD_version 1400018 on, zfs_quotactl() takes an additional
 * mp_busy argument.
 */
#if __FreeBSD_version >= 1400018
static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg,
    bool *mp_busy);
#else
static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
#endif
static int zfs_mount(vfs_t *vfsp);
static int zfs_umount(vfs_t *vfsp, int fflag);
static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
static int zfs_sync(vfs_t *vfsp, int waitfor);
static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int *secflavors);
static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
static void zfs_freevfs(vfs_t *vfsp);
133
/*
 * VFS operations vector for ZFS.  Root vnode requests go through the
 * generic vfs_cache_root(), with zfs_root() supplied as the cachedroot hook.
 */
struct vfsops zfs_vfsops = {
	.vfs_mount =		zfs_mount,
	.vfs_unmount =		zfs_umount,
	.vfs_root =		vfs_cache_root,
	.vfs_cachedroot =	zfs_root,
	.vfs_statfs =		zfs_statfs,
	.vfs_vget =		zfs_vget,
	.vfs_sync =		zfs_sync,
	.vfs_checkexp =		zfs_checkexp,
	.vfs_fhtovp =		zfs_fhtovp,
	.vfs_quotactl =		zfs_quotactl,
};

/*
 * Register the "zfs" filesystem type.  VFCF_CROSS_COPY_FILE_RANGE is added
 * to the flags only when the kernel headers define it.
 */
#ifdef VFCF_CROSS_COPY_FILE_RANGE
VFS_SET(zfs_vfsops, zfs,
    VFCF_DELEGADMIN | VFCF_JAIL | VFCF_CROSS_COPY_FILE_RANGE);
#else
VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL);
#endif
153
/*
 * We need to keep a count of active file systems.
 * This is necessary to prevent our module
 * from being unloaded after a umount -f.
 */
static uint32_t zfs_active_fs_count = 0;
160
161 int
zfs_get_temporary_prop(dsl_dataset_t * ds,zfs_prop_t zfs_prop,uint64_t * val,char * setpoint)162 zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
163 char *setpoint)
164 {
165 int error;
166 zfsvfs_t *zfvp;
167 vfs_t *vfsp;
168 objset_t *os;
169 uint64_t tmp = *val;
170
171 error = dmu_objset_from_ds(ds, &os);
172 if (error != 0)
173 return (error);
174
175 error = getzfsvfs_impl(os, &zfvp);
176 if (error != 0)
177 return (error);
178 if (zfvp == NULL)
179 return (ENOENT);
180 vfsp = zfvp->z_vfs;
181 switch (zfs_prop) {
182 case ZFS_PROP_ATIME:
183 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
184 tmp = 0;
185 if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
186 tmp = 1;
187 break;
188 case ZFS_PROP_DEVICES:
189 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
190 tmp = 0;
191 if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
192 tmp = 1;
193 break;
194 case ZFS_PROP_EXEC:
195 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
196 tmp = 0;
197 if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
198 tmp = 1;
199 break;
200 case ZFS_PROP_SETUID:
201 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
202 tmp = 0;
203 if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
204 tmp = 1;
205 break;
206 case ZFS_PROP_READONLY:
207 if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
208 tmp = 0;
209 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
210 tmp = 1;
211 break;
212 case ZFS_PROP_XATTR:
213 if (zfvp->z_flags & ZSB_XATTR)
214 tmp = zfvp->z_xattr;
215 break;
216 case ZFS_PROP_NBMAND:
217 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
218 tmp = 0;
219 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
220 tmp = 1;
221 break;
222 default:
223 vfs_unbusy(vfsp);
224 return (ENOENT);
225 }
226
227 vfs_unbusy(vfsp);
228 if (tmp != *val) {
229 if (setpoint)
230 (void) strcpy(setpoint, "temporary");
231 *val = tmp;
232 }
233 return (0);
234 }
235
236 static int
zfs_getquota(zfsvfs_t * zfsvfs,uid_t id,int isgroup,struct dqblk64 * dqp)237 zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
238 {
239 int error = 0;
240 char buf[32];
241 uint64_t usedobj, quotaobj;
242 uint64_t quota, used = 0;
243 timespec_t now;
244
245 usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
246 quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
247
248 if (quotaobj == 0 || zfsvfs->z_replay) {
249 error = ENOENT;
250 goto done;
251 }
252 (void) sprintf(buf, "%llx", (longlong_t)id);
253 if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
254 buf, sizeof (quota), 1, "a)) != 0) {
255 dprintf("%s(%d): quotaobj lookup failed\n",
256 __FUNCTION__, __LINE__);
257 goto done;
258 }
259 /*
260 * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit".
261 * So we set them to be the same.
262 */
263 dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
264 error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
265 if (error && error != ENOENT) {
266 dprintf("%s(%d): usedobj failed; %d\n",
267 __FUNCTION__, __LINE__, error);
268 goto done;
269 }
270 dqp->dqb_curblocks = btodb(used);
271 dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
272 vfs_timestamp(&now);
273 /*
274 * Setting this to 0 causes FreeBSD quota(8) to print
275 * the number of days since the epoch, which isn't
276 * particularly useful.
277 */
278 dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
279 done:
280 return (error);
281 }
282
283 static int
284 #if __FreeBSD_version >= 1400018
zfs_quotactl(vfs_t * vfsp,int cmds,uid_t id,void * arg,bool * mp_busy)285 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy)
286 #else
287 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
288 #endif
289 {
290 zfsvfs_t *zfsvfs = vfsp->vfs_data;
291 struct thread *td;
292 int cmd, type, error = 0;
293 int bitsize;
294 zfs_userquota_prop_t quota_type;
295 struct dqblk64 dqblk = { 0 };
296
297 td = curthread;
298 cmd = cmds >> SUBCMDSHIFT;
299 type = cmds & SUBCMDMASK;
300
301 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
302 return (error);
303 if (id == -1) {
304 switch (type) {
305 case USRQUOTA:
306 id = td->td_ucred->cr_ruid;
307 break;
308 case GRPQUOTA:
309 id = td->td_ucred->cr_rgid;
310 break;
311 default:
312 error = EINVAL;
313 #if __FreeBSD_version < 1400018
314 if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
315 vfs_unbusy(vfsp);
316 #endif
317 goto done;
318 }
319 }
320 /*
321 * Map BSD type to:
322 * ZFS_PROP_USERUSED,
323 * ZFS_PROP_USERQUOTA,
324 * ZFS_PROP_GROUPUSED,
325 * ZFS_PROP_GROUPQUOTA
326 */
327 switch (cmd) {
328 case Q_SETQUOTA:
329 case Q_SETQUOTA32:
330 if (type == USRQUOTA)
331 quota_type = ZFS_PROP_USERQUOTA;
332 else if (type == GRPQUOTA)
333 quota_type = ZFS_PROP_GROUPQUOTA;
334 else
335 error = EINVAL;
336 break;
337 case Q_GETQUOTA:
338 case Q_GETQUOTA32:
339 if (type == USRQUOTA)
340 quota_type = ZFS_PROP_USERUSED;
341 else if (type == GRPQUOTA)
342 quota_type = ZFS_PROP_GROUPUSED;
343 else
344 error = EINVAL;
345 break;
346 }
347
348 /*
349 * Depending on the cmd, we may need to get
350 * the ruid and domain (see fuidstr_to_sid?),
351 * the fuid (how?), or other information.
352 * Create fuid using zfs_fuid_create(zfsvfs, id,
353 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
354 * I think I can use just the id?
355 *
356 * Look at zfs_id_overquota() to look up a quota.
357 * zap_lookup(something, quotaobj, fuidstring,
358 * sizeof (long long), 1, "a)
359 *
360 * See zfs_set_userquota() to set a quota.
361 */
362 if ((uint32_t)type >= MAXQUOTAS) {
363 error = EINVAL;
364 goto done;
365 }
366
367 switch (cmd) {
368 case Q_GETQUOTASIZE:
369 bitsize = 64;
370 error = copyout(&bitsize, arg, sizeof (int));
371 break;
372 case Q_QUOTAON:
373 // As far as I can tell, you can't turn quotas on or off on zfs
374 error = 0;
375 #if __FreeBSD_version < 1400018
376 vfs_unbusy(vfsp);
377 #endif
378 break;
379 case Q_QUOTAOFF:
380 error = ENOTSUP;
381 #if __FreeBSD_version < 1400018
382 vfs_unbusy(vfsp);
383 #endif
384 break;
385 case Q_SETQUOTA:
386 error = copyin(arg, &dqblk, sizeof (dqblk));
387 if (error == 0)
388 error = zfs_set_userquota(zfsvfs, quota_type,
389 "", id, dbtob(dqblk.dqb_bhardlimit));
390 break;
391 case Q_GETQUOTA:
392 error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
393 if (error == 0)
394 error = copyout(&dqblk, arg, sizeof (dqblk));
395 break;
396 default:
397 error = EINVAL;
398 break;
399 }
400 done:
401 zfs_exit(zfsvfs, FTAG);
402 return (error);
403 }
404
405
406 boolean_t
zfs_is_readonly(zfsvfs_t * zfsvfs)407 zfs_is_readonly(zfsvfs_t *zfsvfs)
408 {
409 return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY));
410 }
411
412 static int
zfs_sync(vfs_t * vfsp,int waitfor)413 zfs_sync(vfs_t *vfsp, int waitfor)
414 {
415
416 /*
417 * Data integrity is job one. We don't want a compromised kernel
418 * writing to the storage pool, so we never sync during panic.
419 */
420 if (panicstr)
421 return (0);
422
423 /*
424 * Ignore the system syncher. ZFS already commits async data
425 * at zfs_txg_timeout intervals.
426 */
427 if (waitfor == MNT_LAZY)
428 return (0);
429
430 if (vfsp != NULL) {
431 /*
432 * Sync a specific filesystem.
433 */
434 zfsvfs_t *zfsvfs = vfsp->vfs_data;
435 dsl_pool_t *dp;
436 int error;
437
438 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
439 return (error);
440 dp = dmu_objset_pool(zfsvfs->z_os);
441
442 /*
443 * If the system is shutting down, then skip any
444 * filesystems which may exist on a suspended pool.
445 */
446 if (rebooting && spa_suspended(dp->dp_spa)) {
447 zfs_exit(zfsvfs, FTAG);
448 return (0);
449 }
450
451 if (zfsvfs->z_log != NULL)
452 zil_commit(zfsvfs->z_log, 0);
453
454 zfs_exit(zfsvfs, FTAG);
455 } else {
456 /*
457 * Sync all ZFS filesystems. This is what happens when you
458 * run sync(8). Unlike other filesystems, ZFS honors the
459 * request by waiting for all pools to commit all dirty data.
460 */
461 spa_sync_allpools();
462 }
463
464 return (0);
465 }
466
467 static void
atime_changed_cb(void * arg,uint64_t newval)468 atime_changed_cb(void *arg, uint64_t newval)
469 {
470 zfsvfs_t *zfsvfs = arg;
471
472 if (newval == TRUE) {
473 zfsvfs->z_atime = TRUE;
474 zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
475 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
476 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
477 } else {
478 zfsvfs->z_atime = FALSE;
479 zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
480 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
481 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
482 }
483 }
484
485 static void
xattr_changed_cb(void * arg,uint64_t newval)486 xattr_changed_cb(void *arg, uint64_t newval)
487 {
488 zfsvfs_t *zfsvfs = arg;
489
490 if (newval == ZFS_XATTR_OFF) {
491 zfsvfs->z_flags &= ~ZSB_XATTR;
492 } else {
493 zfsvfs->z_flags |= ZSB_XATTR;
494
495 if (newval == ZFS_XATTR_SA)
496 zfsvfs->z_xattr_sa = B_TRUE;
497 else
498 zfsvfs->z_xattr_sa = B_FALSE;
499 }
500 }
501
502 static void
blksz_changed_cb(void * arg,uint64_t newval)503 blksz_changed_cb(void *arg, uint64_t newval)
504 {
505 zfsvfs_t *zfsvfs = arg;
506 ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
507 ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
508 ASSERT(ISP2(newval));
509
510 zfsvfs->z_max_blksz = newval;
511 zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
512 }
513
514 static void
readonly_changed_cb(void * arg,uint64_t newval)515 readonly_changed_cb(void *arg, uint64_t newval)
516 {
517 zfsvfs_t *zfsvfs = arg;
518
519 if (newval) {
520 /* XXX locking on vfs_flag? */
521 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
522 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
523 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
524 } else {
525 /* XXX locking on vfs_flag? */
526 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
527 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
528 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
529 }
530 }
531
532 static void
setuid_changed_cb(void * arg,uint64_t newval)533 setuid_changed_cb(void *arg, uint64_t newval)
534 {
535 zfsvfs_t *zfsvfs = arg;
536
537 if (newval == FALSE) {
538 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
539 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
540 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
541 } else {
542 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
543 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
544 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
545 }
546 }
547
548 static void
exec_changed_cb(void * arg,uint64_t newval)549 exec_changed_cb(void *arg, uint64_t newval)
550 {
551 zfsvfs_t *zfsvfs = arg;
552
553 if (newval == FALSE) {
554 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
555 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
556 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
557 } else {
558 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
559 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
560 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
561 }
562 }
563
564 /*
565 * The nbmand mount option can be changed at mount time.
566 * We can't allow it to be toggled on live file systems or incorrect
567 * behavior may be seen from cifs clients
568 *
569 * This property isn't registered via dsl_prop_register(), but this callback
570 * will be called when a file system is first mounted
571 */
572 static void
nbmand_changed_cb(void * arg,uint64_t newval)573 nbmand_changed_cb(void *arg, uint64_t newval)
574 {
575 zfsvfs_t *zfsvfs = arg;
576 if (newval == FALSE) {
577 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
578 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
579 } else {
580 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
581 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
582 }
583 }
584
585 static void
snapdir_changed_cb(void * arg,uint64_t newval)586 snapdir_changed_cb(void *arg, uint64_t newval)
587 {
588 zfsvfs_t *zfsvfs = arg;
589
590 zfsvfs->z_show_ctldir = newval;
591 }
592
593 static void
acl_mode_changed_cb(void * arg,uint64_t newval)594 acl_mode_changed_cb(void *arg, uint64_t newval)
595 {
596 zfsvfs_t *zfsvfs = arg;
597
598 zfsvfs->z_acl_mode = newval;
599 }
600
601 static void
acl_inherit_changed_cb(void * arg,uint64_t newval)602 acl_inherit_changed_cb(void *arg, uint64_t newval)
603 {
604 zfsvfs_t *zfsvfs = arg;
605
606 zfsvfs->z_acl_inherit = newval;
607 }
608
609 static void
acl_type_changed_cb(void * arg,uint64_t newval)610 acl_type_changed_cb(void *arg, uint64_t newval)
611 {
612 zfsvfs_t *zfsvfs = arg;
613
614 zfsvfs->z_acl_type = newval;
615 }
616
617 static void
longname_changed_cb(void * arg,uint64_t newval)618 longname_changed_cb(void *arg, uint64_t newval)
619 {
620 zfsvfs_t *zfsvfs = arg;
621
622 zfsvfs->z_longname = newval;
623 }
624
625 static int
zfs_register_callbacks(vfs_t * vfsp)626 zfs_register_callbacks(vfs_t *vfsp)
627 {
628 struct dsl_dataset *ds = NULL;
629 objset_t *os = NULL;
630 zfsvfs_t *zfsvfs = NULL;
631 uint64_t nbmand;
632 boolean_t readonly = B_FALSE;
633 boolean_t do_readonly = B_FALSE;
634 boolean_t setuid = B_FALSE;
635 boolean_t do_setuid = B_FALSE;
636 boolean_t exec = B_FALSE;
637 boolean_t do_exec = B_FALSE;
638 boolean_t xattr = B_FALSE;
639 boolean_t atime = B_FALSE;
640 boolean_t do_atime = B_FALSE;
641 boolean_t do_xattr = B_FALSE;
642 int error = 0;
643
644 ASSERT3P(vfsp, !=, NULL);
645 zfsvfs = vfsp->vfs_data;
646 ASSERT3P(zfsvfs, !=, NULL);
647 os = zfsvfs->z_os;
648
649 /*
650 * This function can be called for a snapshot when we update snapshot's
651 * mount point, which isn't really supported.
652 */
653 if (dmu_objset_is_snapshot(os))
654 return (EOPNOTSUPP);
655
656 /*
657 * The act of registering our callbacks will destroy any mount
658 * options we may have. In order to enable temporary overrides
659 * of mount options, we stash away the current values and
660 * restore them after we register the callbacks.
661 */
662 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
663 !spa_writeable(dmu_objset_spa(os))) {
664 readonly = B_TRUE;
665 do_readonly = B_TRUE;
666 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
667 readonly = B_FALSE;
668 do_readonly = B_TRUE;
669 }
670 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
671 setuid = B_FALSE;
672 do_setuid = B_TRUE;
673 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
674 setuid = B_TRUE;
675 do_setuid = B_TRUE;
676 }
677 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
678 exec = B_FALSE;
679 do_exec = B_TRUE;
680 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
681 exec = B_TRUE;
682 do_exec = B_TRUE;
683 }
684 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
685 zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF;
686 do_xattr = B_TRUE;
687 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
688 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
689 do_xattr = B_TRUE;
690 } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) {
691 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
692 do_xattr = B_TRUE;
693 } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) {
694 zfsvfs->z_xattr = xattr = ZFS_XATTR_SA;
695 do_xattr = B_TRUE;
696 }
697 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
698 atime = B_FALSE;
699 do_atime = B_TRUE;
700 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
701 atime = B_TRUE;
702 do_atime = B_TRUE;
703 }
704
705 /*
706 * We need to enter pool configuration here, so that we can use
707 * dsl_prop_get_int_ds() to handle the special nbmand property below.
708 * dsl_prop_get_integer() can not be used, because it has to acquire
709 * spa_namespace_lock and we can not do that because we already hold
710 * z_teardown_lock. The problem is that spa_write_cachefile() is called
711 * with spa_namespace_lock held and the function calls ZFS vnode
712 * operations to write the cache file and thus z_teardown_lock is
713 * acquired after spa_namespace_lock.
714 */
715 ds = dmu_objset_ds(os);
716 dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
717
718 /*
719 * nbmand is a special property. It can only be changed at
720 * mount time.
721 *
722 * This is weird, but it is documented to only be changeable
723 * at mount time.
724 */
725 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
726 nbmand = B_FALSE;
727 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
728 nbmand = B_TRUE;
729 } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand)) != 0) {
730 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
731 return (error);
732 }
733
734 /*
735 * Register property callbacks.
736 *
737 * It would probably be fine to just check for i/o error from
738 * the first prop_register(), but I guess I like to go
739 * overboard...
740 */
741 error = dsl_prop_register(ds,
742 zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
743 error = error ? error : dsl_prop_register(ds,
744 zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
745 error = error ? error : dsl_prop_register(ds,
746 zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
747 error = error ? error : dsl_prop_register(ds,
748 zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
749 error = error ? error : dsl_prop_register(ds,
750 zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
751 error = error ? error : dsl_prop_register(ds,
752 zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
753 error = error ? error : dsl_prop_register(ds,
754 zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
755 error = error ? error : dsl_prop_register(ds,
756 zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs);
757 error = error ? error : dsl_prop_register(ds,
758 zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
759 error = error ? error : dsl_prop_register(ds,
760 zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
761 zfsvfs);
762 error = error ? error : dsl_prop_register(ds,
763 zfs_prop_to_name(ZFS_PROP_LONGNAME), longname_changed_cb, zfsvfs);
764 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
765 if (error)
766 goto unregister;
767
768 /*
769 * Invoke our callbacks to restore temporary mount options.
770 */
771 if (do_readonly)
772 readonly_changed_cb(zfsvfs, readonly);
773 if (do_setuid)
774 setuid_changed_cb(zfsvfs, setuid);
775 if (do_exec)
776 exec_changed_cb(zfsvfs, exec);
777 if (do_xattr)
778 xattr_changed_cb(zfsvfs, xattr);
779 if (do_atime)
780 atime_changed_cb(zfsvfs, atime);
781
782 nbmand_changed_cb(zfsvfs, nbmand);
783
784 return (0);
785
786 unregister:
787 dsl_prop_unregister_all(ds, zfsvfs);
788 return (error);
789 }
790
791 /*
792 * Associate this zfsvfs with the given objset, which must be owned.
793 * This will cache a bunch of on-disk state from the objset in the
794 * zfsvfs.
795 */
796 static int
zfsvfs_init(zfsvfs_t * zfsvfs,objset_t * os)797 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
798 {
799 int error;
800 uint64_t val;
801
802 zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
803 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
804 zfsvfs->z_os = os;
805
806 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
807 if (error != 0)
808 return (error);
809 if (zfsvfs->z_version >
810 zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
811 (void) printf("Can't mount a version %lld file system "
812 "on a version %lld pool\n. Pool must be upgraded to mount "
813 "this file system.", (u_longlong_t)zfsvfs->z_version,
814 (u_longlong_t)spa_version(dmu_objset_spa(os)));
815 return (SET_ERROR(ENOTSUP));
816 }
817 error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
818 if (error != 0)
819 return (error);
820 zfsvfs->z_norm = (int)val;
821
822 error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
823 if (error != 0)
824 return (error);
825 zfsvfs->z_utf8 = (val != 0);
826
827 error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
828 if (error != 0)
829 return (error);
830 zfsvfs->z_case = (uint_t)val;
831
832 error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val);
833 if (error != 0)
834 return (error);
835 zfsvfs->z_acl_type = (uint_t)val;
836
837 /*
838 * Fold case on file systems that are always or sometimes case
839 * insensitive.
840 */
841 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
842 zfsvfs->z_case == ZFS_CASE_MIXED)
843 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
844
845 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
846 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
847
848 uint64_t sa_obj = 0;
849 if (zfsvfs->z_use_sa) {
850 /* should either have both of these objects or none */
851 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
852 &sa_obj);
853 if (error != 0)
854 return (error);
855
856 error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val);
857 if (error == 0 && val == ZFS_XATTR_SA)
858 zfsvfs->z_xattr_sa = B_TRUE;
859 }
860
861 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
862 &zfsvfs->z_attr_table);
863 if (error != 0)
864 return (error);
865
866 if (zfsvfs->z_version >= ZPL_VERSION_SA)
867 sa_register_update_callback(os, zfs_sa_upgrade);
868
869 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
870 &zfsvfs->z_root);
871 if (error != 0)
872 return (error);
873 ASSERT3U(zfsvfs->z_root, !=, 0);
874
875 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
876 &zfsvfs->z_unlinkedobj);
877 if (error != 0)
878 return (error);
879
880 error = zap_lookup(os, MASTER_NODE_OBJ,
881 zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
882 8, 1, &zfsvfs->z_userquota_obj);
883 if (error == ENOENT)
884 zfsvfs->z_userquota_obj = 0;
885 else if (error != 0)
886 return (error);
887
888 error = zap_lookup(os, MASTER_NODE_OBJ,
889 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
890 8, 1, &zfsvfs->z_groupquota_obj);
891 if (error == ENOENT)
892 zfsvfs->z_groupquota_obj = 0;
893 else if (error != 0)
894 return (error);
895
896 error = zap_lookup(os, MASTER_NODE_OBJ,
897 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
898 8, 1, &zfsvfs->z_projectquota_obj);
899 if (error == ENOENT)
900 zfsvfs->z_projectquota_obj = 0;
901 else if (error != 0)
902 return (error);
903
904 error = zap_lookup(os, MASTER_NODE_OBJ,
905 zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
906 8, 1, &zfsvfs->z_userobjquota_obj);
907 if (error == ENOENT)
908 zfsvfs->z_userobjquota_obj = 0;
909 else if (error != 0)
910 return (error);
911
912 error = zap_lookup(os, MASTER_NODE_OBJ,
913 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
914 8, 1, &zfsvfs->z_groupobjquota_obj);
915 if (error == ENOENT)
916 zfsvfs->z_groupobjquota_obj = 0;
917 else if (error != 0)
918 return (error);
919
920 error = zap_lookup(os, MASTER_NODE_OBJ,
921 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
922 8, 1, &zfsvfs->z_projectobjquota_obj);
923 if (error == ENOENT)
924 zfsvfs->z_projectobjquota_obj = 0;
925 else if (error != 0)
926 return (error);
927
928 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
929 &zfsvfs->z_fuid_obj);
930 if (error == ENOENT)
931 zfsvfs->z_fuid_obj = 0;
932 else if (error != 0)
933 return (error);
934
935 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
936 &zfsvfs->z_shares_dir);
937 if (error == ENOENT)
938 zfsvfs->z_shares_dir = 0;
939 else if (error != 0)
940 return (error);
941
942 /*
943 * Only use the name cache if we are looking for a
944 * name on a file system that does not require normalization
945 * or case folding. We can also look there if we happen to be
946 * on a non-normalizing, mixed sensitivity file system IF we
947 * are looking for the exact name (which is always the case on
948 * FreeBSD).
949 */
950 zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
951 ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
952 !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
953
954 return (0);
955 }
956
/*
 * Global task queue handle for zfsvfs work.
 * NOTE(review): it is neither created nor used in this part of the file;
 * confirm where it is set up before relying on it.
 */
taskq_t *zfsvfs_taskq;
958
959 static void
zfsvfs_task_unlinked_drain(void * context,int pending __unused)960 zfsvfs_task_unlinked_drain(void *context, int pending __unused)
961 {
962
963 zfs_unlinked_drain((zfsvfs_t *)context);
964 }
965
966 int
zfsvfs_create(const char * osname,boolean_t readonly,zfsvfs_t ** zfvp)967 zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
968 {
969 objset_t *os;
970 zfsvfs_t *zfsvfs;
971 int error;
972 boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
973
974 /*
975 * XXX: Fix struct statfs so this isn't necessary!
976 *
977 * The 'osname' is used as the filesystem's special node, which means
978 * it must fit in statfs.f_mntfromname, or else it can't be
979 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
980 * 'zfs unmount' to think it's not mounted when it is.
981 */
982 if (strlen(osname) >= MNAMELEN)
983 return (SET_ERROR(ENAMETOOLONG));
984
985 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
986
987 error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs,
988 &os);
989 if (error != 0) {
990 kmem_free(zfsvfs, sizeof (zfsvfs_t));
991 return (error);
992 }
993
994 error = zfsvfs_create_impl(zfvp, zfsvfs, os);
995
996 return (error);
997 }
998
999
1000 int
zfsvfs_create_impl(zfsvfs_t ** zfvp,zfsvfs_t * zfsvfs,objset_t * os)1001 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
1002 {
1003 int error;
1004
1005 zfsvfs->z_vfs = NULL;
1006 zfsvfs->z_parent = zfsvfs;
1007
1008 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1009 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
1010 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1011 offsetof(znode_t, z_link_node));
1012 TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
1013 zfsvfs_task_unlinked_drain, zfsvfs);
1014 ZFS_TEARDOWN_INIT(zfsvfs);
1015 ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs);
1016 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
1017 for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1018 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1019
1020 error = zfsvfs_init(zfsvfs, os);
1021 if (error != 0) {
1022 dmu_objset_disown(os, B_TRUE, zfsvfs);
1023 *zfvp = NULL;
1024 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1025 return (error);
1026 }
1027
1028 *zfvp = zfsvfs;
1029 return (0);
1030 }
1031
/*
 * Finish bringing a zfsvfs online once z_vfs has been attached: register
 * the property callbacks, open the ZIL and publish the zfsvfs as the
 * objset's user pointer.
 *
 * When 'mounting' is true the dataset kstats are created as well and, for
 * a mount that is not read-only, the unlinked set is drained.  If the pool
 * is writeable the intent log is then either replayed or, when
 * zil_replay_disable is set, destroyed.  When 'mounting' is false only the
 * ZIL is reopened; the kstats are asserted to exist already.
 *
 * Returns 0 on success, EROFS when a writable mount is attempted on an
 * objset with an incompatible encryption version, or the error returned by
 * zfs_register_callbacks() or dataset_kstats_create().
 */
static int
zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
{
	int error;

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
	    dmu_objset_incompatible_encryption_version(zfsvfs->z_os))
		return (SET_ERROR(EROFS));

	error = zfs_register_callbacks(zfsvfs->z_vfs);
	if (error)
		return (error);

	/*
	 * If we are not mounting (ie: online recv), then we don't
	 * have to worry about replaying the log as we blocked all
	 * operations out since we closed the ZIL.
	 */
	if (mounting) {
		boolean_t readonly;

		ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
		error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
		if (error)
			return (error);
		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
		    &zfsvfs->z_kstat.dk_zil_sums);

		/*
		 * During replay we remove the read only flag to
		 * allow replays to succeed.
		 */
		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
		if (readonly != 0) {
			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
		} else {
			dsl_dir_t *dd;
			zap_stats_t zs;

			/*
			 * Best effort: publish the size of the unlinked set
			 * only if its ZAP statistics can be read.
			 */
			if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
			    &zs) == 0) {
				dataset_kstats_update_nunlinks_kstat(
				    &zfsvfs->z_kstat, zs.zs_num_entries);
				dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
				    "num_entries in unlinked set: %llu",
				    (u_longlong_t)zs.zs_num_entries);
			}

			zfs_unlinked_drain(zfsvfs);
			dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
			dd->dd_activity_cancelled = B_FALSE;
		}

		/*
		 * Parse and replay the intent log.
		 *
		 * Because of ziltest, this must be done after
		 * zfs_unlinked_drain(). (Further note: ziltest
		 * doesn't use readonly mounts, where
		 * zfs_unlinked_drain() isn't called.) This is because
		 * ziltest causes spa_sync() to think it's committed,
		 * but actually it is not, so the intent log contains
		 * many txg's worth of changes.
		 *
		 * In particular, if object N is in the unlinked set in
		 * the last txg to actually sync, then it could be
		 * actually freed in a later txg and then reallocated
		 * in a yet later txg. This would write a "create
		 * object N" record to the intent log. Normally, this
		 * would be fine because the spa_sync() would have
		 * written out the fact that object N is free, before
		 * we could write the "create object N" intent log
		 * record.
		 *
		 * But when we are in ziltest mode, we advance the "open
		 * txg" without actually spa_sync()-ing the changes to
		 * disk. So we would see that object N is still
		 * allocated and in the unlinked set, and there is an
		 * intent log record saying to allocate it.
		 */
		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
			if (zil_replay_disable) {
				zil_destroy(zfsvfs->z_log, B_FALSE);
			} else {
				/*
				 * The namecache is switched off and z_replay
				 * is set for the duration of the replay; both
				 * are put back afterwards.
				 */
				boolean_t use_nc = zfsvfs->z_use_namecache;
				zfsvfs->z_use_namecache = B_FALSE;
				zfsvfs->z_replay = B_TRUE;
				zil_replay(zfsvfs->z_os, zfsvfs,
				    zfs_replay_vector);
				zfsvfs->z_replay = B_FALSE;
				zfsvfs->z_use_namecache = use_nc;
			}
		}

		/* restore readonly bit */
		if (readonly != 0)
			zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
	} else {
		ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL);
		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
		    &zfsvfs->z_kstat.dk_zil_sums);
	}

	/*
	 * Set the objset user_ptr to track its zfsvfs.
	 */
	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);

	return (0);
}
1148
1149 void
zfsvfs_free(zfsvfs_t * zfsvfs)1150 zfsvfs_free(zfsvfs_t *zfsvfs)
1151 {
1152 int i;
1153
1154 zfs_fuid_destroy(zfsvfs);
1155
1156 mutex_destroy(&zfsvfs->z_znodes_lock);
1157 mutex_destroy(&zfsvfs->z_lock);
1158 list_destroy(&zfsvfs->z_all_znodes);
1159 ZFS_TEARDOWN_DESTROY(zfsvfs);
1160 ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs);
1161 rw_destroy(&zfsvfs->z_fuid_lock);
1162 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1163 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1164 dataset_kstats_destroy(&zfsvfs->z_kstat);
1165 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1166 }
1167
1168 static void
zfs_set_fuid_feature(zfsvfs_t * zfsvfs)1169 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1170 {
1171 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1172 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1173 }
1174
/*
 * Mount the dataset named 'osname' on 'vfsp'.
 *
 * Creates the zfsvfs with zfsvfs_create(), fills in the mount's block
 * sizes, kernel flags and fsid, and then either configures the mount as a
 * snapshot (atime_changed_cb(B_FALSE), readonly_changed_cb(B_TRUE), xattr
 * and acltype taken from the dataset properties) or runs zfsvfs_setup()
 * for a regular filesystem.  The .zfs control directory is created for
 * non-snapshot mounts only.
 *
 * Returns 0 on success, in which case zfs_active_fs_count is incremented.
 * If zfsvfs_create() fails its error is returned directly; on any later
 * failure the objset is disowned and the zfsvfs freed before returning.
 */
static int
zfs_domount(vfs_t *vfsp, char *osname)
{
	uint64_t recordsize, fsid_guid;
	int error = 0;
	zfsvfs_t *zfsvfs;

	ASSERT3P(vfsp, !=, NULL);
	ASSERT3P(osname, !=, NULL);

	error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs);
	if (error)
		return (error);
	zfsvfs->z_vfs = vfsp;

	if ((error = dsl_prop_get_integer(osname,
	    "recordsize", &recordsize, NULL)))
		goto out;
	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;

	/*
	 * NOTE(review): vfs_data is set here and is not cleared by the
	 * error path below, which frees the zfsvfs -- confirm callers do
	 * not look at vfs_data after a failed mount.
	 */
	vfsp->vfs_data = zfsvfs;
	vfsp->mnt_flag |= MNT_LOCAL;
	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
	/*
	 * This can cause a loss of coherence between ARC and page cache
	 * on ZoF - unclear if the problem is in FreeBSD or ZoF
	 */
	vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */
	vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
	vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;

#if defined(_KERNEL) && !defined(KMEM_DEBUG)
	vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
#endif
	/*
	 * The fsid is 64 bits, composed of an 8-bit fs type, which
	 * separates our fsid from any other filesystem types, and a
	 * 56-bit objset unique ID. The objset unique ID is unique to
	 * all objsets open on this system, provided by unique_create().
	 * The 8-bit fs type must be put in the low bits of fsid[1]
	 * because that's where other Solaris filesystems put it.
	 */
	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
	ASSERT3U((fsid_guid & ~((1ULL << 56) - 1)), ==, 0);
	vfsp->vfs_fsid.val[0] = fsid_guid;
	vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) |
	    (vfsp->mnt_vfc->vfc_typenum & 0xFF);

	/*
	 * Set features for file system.
	 */
	zfs_set_fuid_feature(zfsvfs);

	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
		uint64_t pval;

		/*
		 * Snapshots do not go through zfsvfs_setup(); the property
		 * callbacks are invoked directly with fixed or looked-up
		 * values instead.
		 */
		atime_changed_cb(zfsvfs, B_FALSE);
		readonly_changed_cb(zfsvfs, B_TRUE);
		if ((error = dsl_prop_get_integer(osname,
		    "xattr", &pval, NULL)))
			goto out;
		xattr_changed_cb(zfsvfs, pval);
		if ((error = dsl_prop_get_integer(osname,
		    "acltype", &pval, NULL)))
			goto out;
		acl_type_changed_cb(zfsvfs, pval);
		zfsvfs->z_issnap = B_TRUE;
		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;

		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
	} else {
		if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
			goto out;
	}

	vfs_mountedfrom(vfsp, osname);

	if (!zfsvfs->z_issnap)
		zfsctl_create(zfsvfs);
out:
	/* Reached on success as well; 'error' selects cleanup vs. accounting. */
	if (error) {
		dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
		zfsvfs_free(zfsvfs);
	} else {
		atomic_inc_32(&zfs_active_fs_count);
	}

	return (error);
}
1269
1270 static void
zfs_unregister_callbacks(zfsvfs_t * zfsvfs)1271 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1272 {
1273 objset_t *os = zfsvfs->z_os;
1274
1275 if (!dmu_objset_is_snapshot(os))
1276 dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
1277 }
1278
/*
 * Copy the pool component of a dataset name into 'poolname'.
 *
 * The pool component is everything before the first '/' in 'osname', or
 * the whole string when there is no '/'.  'poolname' must have room for
 * at least MAXNAMELEN bytes.
 *
 * Returns 0 on success, or ENAMETOOLONG (leaving 'poolname' untouched)
 * when the pool component would not fit in MAXNAMELEN bytes including the
 * terminating NUL.
 */
static int
getpoolname(const char *osname, char *poolname)
{
	const char *slash = strchr(osname, '/');
	size_t len;

	if (slash != NULL)
		len = (size_t)(slash - osname);
	else
		len = strlen(osname);

	if (len >= MAXNAMELEN)
		return (ENAMETOOLONG);

	memcpy(poolname, osname, len);
	poolname[len] = '\0';
	return (0);
}
1296
/*
 * Strip an optional leading '!' from the dataset name in place.
 *
 * '*checkpointrewind' is set to true when the marker was present and to
 * false otherwise.  'name' must be a writable NUL-terminated string.
 */
static void
fetch_osname_options(char *name, bool *checkpointrewind)
{
	bool has_marker = (name[0] == '!');

	*checkpointrewind = has_marker;
	if (!has_marker)
		return;

	/*
	 * strlen(name) still counts the '!', so moving that many bytes from
	 * name + 1 carries the terminating NUL along with the text.
	 */
	memmove(name, name + 1, strlen(name));
}
1308
/*
 * VFS mount entry point.
 *
 * Reads the dataset name from the "from" mount option and performs the
 * privilege, delegation and jail visibility checks.  It then either
 * refreshes the property callbacks (remount) or mounts the dataset through
 * zfs_domount().  For the initial root mount the pool named by the dataset
 * is imported first with spa_import_rootpool().
 *
 * A leading '!' on the dataset name is stripped by fetch_osname_options()
 * and passed to spa_import_rootpool() as its 'checkpointrewind' argument.
 *
 * Returns 0 on success or an errno value.
 */
static int
zfs_mount(vfs_t *vfsp)
{
	kthread_t *td = curthread;
	vnode_t *mvp = vfsp->mnt_vnodecovered;
	cred_t *cr = td->td_ucred;
	char *osname;
	int error = 0;
	int canwrite;
	bool checkpointrewind, isctlsnap = false;

	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
		return (SET_ERROR(EINVAL));

	/*
	 * If full-owner-access is enabled and delegated administration is
	 * turned on, we must set nosuid.
	 */
	if (zfs_super_owner &&
	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	fetch_osname_options(osname, &checkpointrewind);
	/*
	 * A snapshot name (contains '@') being mounted on top of a .zfs
	 * control node.
	 */
	isctlsnap = (mvp != NULL && zfsctl_is_node(mvp) &&
	    strchr(osname, '@') != NULL);

	/*
	 * Check for mount privilege?
	 *
	 * If we don't have privilege then see if
	 * we have local permission to allow it
	 */
	error = secpolicy_fs_mount(cr, mvp, vfsp);
	if (error && isctlsnap) {
		secpolicy_fs_mount_clearopts(cr, vfsp);
	} else if (error) {
		/*
		 * Every "goto out" in this branch returns the
		 * secpolicy_fs_mount() error, since 'error' is not
		 * reassigned here.
		 */
		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
			goto out;

		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
			vattr_t vattr;

			/*
			 * Make sure user is the owner of the mount point
			 * or has sufficient privileges.
			 */

			vattr.va_mask = AT_UID;

			vn_lock(mvp, LK_SHARED | LK_RETRY);
			if (VOP_GETATTR(mvp, &vattr, cr)) {
				VOP_UNLOCK(mvp);
				goto out;
			}

			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
				VOP_UNLOCK(mvp);
				goto out;
			}
			VOP_UNLOCK(mvp);
		}

		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	/*
	 * Refuse to mount a filesystem if we are in a local zone and the
	 * dataset is not visible.
	 */
	if (!INGLOBALZONE(curproc) &&
	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
		boolean_t mount_snapshot = B_FALSE;

		/*
		 * Snapshots may be mounted in .zfs for unjailed datasets
		 * if allowed by the jail param zfs.mount_snapshot.
		 */
		if (isctlsnap) {
			struct prison *pr;
			struct zfs_jailparam *zjp;

			pr = curthread->td_ucred->cr_prison;
			mtx_lock(&pr->pr_mtx);
			zjp = osd_jail_get(pr, zfs_jailparam_slot);
			mtx_unlock(&pr->pr_mtx);
			if (zjp && zjp->mount_snapshot)
				mount_snapshot = B_TRUE;
		}
		if (!mount_snapshot) {
			error = SET_ERROR(EPERM);
			goto out;
		}
	}

	vfsp->vfs_flag |= MNT_NFS4ACLS;

	/*
	 * When doing a remount, we simply refresh our temporary properties
	 * according to those options set in the current VFS options.
	 */
	if (vfsp->vfs_flag & MS_REMOUNT) {
		zfsvfs_t *zfsvfs = vfsp->vfs_data;

		/*
		 * Refresh mount options with z_teardown_lock blocking I/O while
		 * the filesystem is in an inconsistent state.
		 * The lock also serializes this code with filesystem
		 * manipulations between entry to zfs_suspend_fs() and return
		 * from zfs_resume_fs().
		 */
		ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
		zfs_unregister_callbacks(zfsvfs);
		error = zfs_register_callbacks(vfsp);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		goto out;
	}

	/* Initial root mount: try hard to import the requested root pool. */
	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
		char pname[MAXNAMELEN];

		error = getpoolname(osname, pname);
		if (error == 0)
			error = spa_import_rootpool(pname, checkpointrewind);
		if (error)
			goto out;
	}
	DROP_GIANT();
	error = zfs_domount(vfsp, osname);
	PICKUP_GIANT();

out:
	return (error);
}
1446
1447 static int
zfs_statfs(vfs_t * vfsp,struct statfs * statp)1448 zfs_statfs(vfs_t *vfsp, struct statfs *statp)
1449 {
1450 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1451 uint64_t refdbytes, availbytes, usedobjs, availobjs;
1452 int error;
1453
1454 statp->f_version = STATFS_VERSION;
1455
1456 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
1457 return (error);
1458
1459 dmu_objset_space(zfsvfs->z_os,
1460 &refdbytes, &availbytes, &usedobjs, &availobjs);
1461
1462 /*
1463 * The underlying storage pool actually uses multiple block sizes.
1464 * We report the fragsize as the smallest block size we support,
1465 * and we report our blocksize as the filesystem's maximum blocksize.
1466 */
1467 statp->f_bsize = SPA_MINBLOCKSIZE;
1468 statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
1469
1470 /*
1471 * The following report "total" blocks of various kinds in the
1472 * file system, but reported in terms of f_frsize - the
1473 * "fragment" size.
1474 */
1475
1476 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1477 statp->f_bfree = availbytes / statp->f_bsize;
1478 statp->f_bavail = statp->f_bfree; /* no root reservation */
1479
1480 /*
1481 * statvfs() should really be called statufs(), because it assumes
1482 * static metadata. ZFS doesn't preallocate files, so the best
1483 * we can do is report the max that could possibly fit in f_files,
1484 * and that minus the number actually used in f_ffree.
1485 * For f_ffree, report the smaller of the number of object available
1486 * and the number of blocks (each object will take at least a block).
1487 */
1488 statp->f_ffree = MIN(availobjs, statp->f_bfree);
1489 statp->f_files = statp->f_ffree + usedobjs;
1490
1491 /*
1492 * We're a zfs filesystem.
1493 */
1494 strlcpy(statp->f_fstypename, "zfs",
1495 sizeof (statp->f_fstypename));
1496
1497 strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
1498 sizeof (statp->f_mntfromname));
1499 strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
1500 sizeof (statp->f_mntonname));
1501
1502 statp->f_namemax =
1503 zfsvfs->z_longname ? (ZAP_MAXNAMELEN_NEW - 1) : (MAXNAMELEN - 1);
1504
1505 zfs_exit(zfsvfs, FTAG);
1506 return (0);
1507 }
1508
1509 static int
zfs_root(vfs_t * vfsp,int flags,vnode_t ** vpp)1510 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
1511 {
1512 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1513 znode_t *rootzp;
1514 int error;
1515
1516 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
1517 return (error);
1518
1519 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1520 if (error == 0)
1521 *vpp = ZTOV(rootzp);
1522
1523 zfs_exit(zfsvfs, FTAG);
1524
1525 if (error == 0) {
1526 error = vn_lock(*vpp, flags);
1527 if (error != 0) {
1528 VN_RELE(*vpp);
1529 *vpp = NULL;
1530 }
1531 }
1532 return (error);
1533 }
1534
/*
 * Teardown the zfsvfs::z_os.
 *
 * Waits for pending zrele work, takes both teardown locks for writing,
 * closes the ZIL and calls zfs_znode_dmu_fini() on every znode that still
 * has an SA handle.  It then unregisters the property callbacks, waits for
 * dirty data to sync (unless the filesystem is read-only), evicts the
 * objset's dbufs and cancels waiters on the dataset's dsl_dir.
 *
 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
 * and 'z_teardown_inactive_lock' held.
 *
 * Returns 0 on success.  Returns EIO, with both locks released, when
 * 'unmounting' is FALSE and the filesystem has already been unmounted or
 * has no z_os.
 */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
	znode_t *zp;
	dsl_dir_t *dd;

	/*
	 * If someone has not already unmounted this file system,
	 * drain the zrele_taskq to ensure all active references to the
	 * zfsvfs_t have been handled only then can it be safely destroyed.
	 */
	if (zfsvfs->z_os) {
		/*
		 * If we're unmounting we have to wait for the list to
		 * drain completely.
		 *
		 * If we're not unmounting there's no guarantee the list
		 * will drain completely, but zreles run from the taskq
		 * may add the parents of dir-based xattrs to the taskq
		 * so we want to wait for these.
		 *
		 * We can safely check z_all_znodes for being empty because the
		 * VFS has already blocked operations which add to it.
		 */
		int round = 0;
		while (!list_is_empty(&zfsvfs->z_all_znodes)) {
			taskq_wait_outstanding(dsl_pool_zrele_taskq(
			    dmu_objset_pool(zfsvfs->z_os)), 0);
			/* When not unmounting, give up after two passes. */
			if (++round > 1 && !unmounting)
				break;
		}
	}
	ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);

	if (!unmounting) {
		/*
		 * We purge the parent filesystem's vfsp as the parent
		 * filesystem and all of its snapshots have their vnode's
		 * v_vfsp set to the parent's filesystem's vfsp. Note,
		 * 'z_parent' is self referential for non-snapshots.
		 */
#ifdef FREEBSD_NAMECACHE
		cache_purgevfs(zfsvfs->z_parent->z_vfs);
#endif
	}

	/*
	 * Close the zil. NB: Can't close the zil while zfs_inactive
	 * threads are blocked as zil_close can call zfs_inactive.
	 */
	if (zfsvfs->z_log) {
		zil_close(zfsvfs->z_log);
		zfsvfs->z_log = NULL;
	}

	ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs);

	/*
	 * If we are not unmounting (ie: online recv) and someone already
	 * unmounted this file system while we were doing the switcheroo,
	 * or a reopen of z_os failed then just bail out now.
	 */
	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		return (SET_ERROR(EIO));
	}

	/*
	 * At this point there are no vops active, and any new vops will
	 * fail with EIO since we have z_teardown_lock for writer (only
	 * relevant for forced unmount).
	 *
	 * Release all holds on dbufs.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		if (zp->z_sa_hdl != NULL) {
			zfs_znode_dmu_fini(zp);
		}
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * If we are unmounting, set the unmounted flag and let new vops
	 * unblock. zfs_inactive will have the unmounted behavior, and all
	 * other vops will fail with EIO.
	 */
	if (unmounting) {
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * z_os will be NULL if there was an error in attempting to reopen
	 * zfsvfs, so just return as the properties had already been
	 * unregistered and cached data had been evicted before.
	 */
	if (zfsvfs->z_os == NULL)
		return (0);

	/*
	 * Unregister properties.
	 */
	zfs_unregister_callbacks(zfsvfs);

	/*
	 * Evict cached data. We must write out any dirty data before
	 * disowning the dataset.
	 */
	objset_t *os = zfsvfs->z_os;
	boolean_t os_dirty = B_FALSE;
	/* The objset counts as dirty if any of the TXG_SIZE slots is. */
	for (int t = 0; t < TXG_SIZE; t++) {
		if (dmu_objset_is_dirty(os, t)) {
			os_dirty = B_TRUE;
			break;
		}
	}
	if (!zfs_is_readonly(zfsvfs) && os_dirty)
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
	dmu_objset_evict_dbufs(zfsvfs->z_os);
	dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
	dsl_dir_cancel_waiters(dd);

	return (0);
}
1669
/*
 * VFS unmount entry point.
 *
 * After the permission check, snapshots mounted under .zfs are unmounted,
 * all vnodes are flushed, the unlinked-drain task is cancelled or drained,
 * and the zfsvfs is torn down.  The objset (if still present) is then
 * disowned, the .zfs directory node destroyed and the zfsvfs freed through
 * zfs_freevfs().
 *
 * If 'fflag' contains MS_FORCE the filesystem is marked unmounted before
 * vflush() is called with FORCECLOSE.
 *
 * Returns 0 on success, after which the zfsvfs no longer exists.  Returns
 * an error without tearing anything down when the caller lacks permission,
 * a snapshot cannot be unmounted, or vflush() fails.  In the MS_FORCE case
 * z_unmounted has already been set by the time vflush() can fail.
 */
static int
zfs_umount(vfs_t *vfsp, int fflag)
{
	kthread_t *td = curthread;
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	objset_t *os;
	cred_t *cr = td->td_ucred;
	int ret;

	/*
	 * Lacking the unmount privilege is only fatal if the caller also
	 * has no delegated mount permission on the dataset.
	 */
	ret = secpolicy_fs_unmount(cr, vfsp);
	if (ret) {
		if (dsl_deleg_access((char *)vfsp->vfs_resource,
		    ZFS_DELEG_PERM_MOUNT, cr))
			return (ret);
	}

	/*
	 * Unmount any snapshots mounted under .zfs before unmounting the
	 * dataset itself.
	 */
	if (zfsvfs->z_ctldir != NULL) {
		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
			return (ret);
	}

	if (fflag & MS_FORCE) {
		/*
		 * Mark file system as unmounted before calling
		 * vflush(FORCECLOSE). This way we ensure no future vnops
		 * will be called and risk operating on DOOMED vnodes.
		 */
		ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * Flush all the files.
	 */
	ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
	if (ret != 0)
		return (ret);
	/*
	 * Keep draining until taskqueue_cancel() reports that the
	 * unlinked-drain task was cancelled cleanly.
	 */
	while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
	    &zfsvfs->z_unlinked_drain_task, NULL) != 0)
		taskqueue_drain(zfsvfs_taskq->tq_queue,
		    &zfsvfs->z_unlinked_drain_task);

	VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE));
	os = zfsvfs->z_os;

	/*
	 * z_os will be NULL if there was an error in
	 * attempting to reopen zfsvfs.
	 */
	if (os != NULL) {
		/*
		 * Unset the objset user_ptr.
		 */
		mutex_enter(&os->os_user_ptr_lock);
		dmu_objset_set_user(os, NULL);
		mutex_exit(&os->os_user_ptr_lock);

		/*
		 * Finally release the objset
		 */
		dmu_objset_disown(os, B_TRUE, zfsvfs);
	}

	/*
	 * We can now safely destroy the '.zfs' directory node.
	 */
	if (zfsvfs->z_ctldir != NULL)
		zfsctl_destroy(zfsvfs);
	/* Frees the zfsvfs; it must not be referenced past this point. */
	zfs_freevfs(vfsp);

	return (0);
}
1747
1748 static int
zfs_vget(vfs_t * vfsp,ino_t ino,int flags,vnode_t ** vpp)1749 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
1750 {
1751 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1752 znode_t *zp;
1753 int err;
1754
1755 /*
1756 * zfs_zget() can't operate on virtual entries like .zfs/ or
1757 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
1758 * This will make NFS to switch to LOOKUP instead of using VGET.
1759 */
1760 if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
1761 (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
1762 return (EOPNOTSUPP);
1763
1764 if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1765 return (err);
1766 err = zfs_zget(zfsvfs, ino, &zp);
1767 if (err == 0 && zp->z_unlinked) {
1768 vrele(ZTOV(zp));
1769 err = EINVAL;
1770 }
1771 if (err == 0)
1772 *vpp = ZTOV(zp);
1773 zfs_exit(zfsvfs, FTAG);
1774 if (err == 0) {
1775 err = vn_lock(*vpp, flags);
1776 if (err != 0)
1777 vrele(*vpp);
1778 }
1779 if (err != 0)
1780 *vpp = NULL;
1781 return (err);
1782 }
1783
1784 static int
zfs_checkexp(vfs_t * vfsp,struct sockaddr * nam,uint64_t * extflagsp,struct ucred ** credanonp,int * numsecflavors,int * secflavors)1785 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
1786 struct ucred **credanonp, int *numsecflavors, int *secflavors)
1787 {
1788 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1789
1790 /*
1791 * If this is regular file system vfsp is the same as
1792 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
1793 * zfsvfs->z_parent->z_vfs represents parent file system
1794 * which we have to use here, because only this file system
1795 * has mnt_export configured.
1796 */
1797 return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
1798 credanonp, numsecflavors, secflavors));
1799 }
1800
1801 _Static_assert(sizeof (struct fid) >= SHORT_FID_LEN,
1802 "struct fid bigger than SHORT_FID_LEN");
1803 _Static_assert(sizeof (struct fid) >= LONG_FID_LEN,
1804 "struct fid bigger than LONG_FID_LEN");
1805
1806 static int
zfs_fhtovp(vfs_t * vfsp,fid_t * fidp,int flags,vnode_t ** vpp)1807 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
1808 {
1809 struct componentname cn;
1810 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1811 znode_t *zp;
1812 vnode_t *dvp;
1813 uint64_t object = 0;
1814 uint64_t fid_gen = 0;
1815 uint64_t setgen = 0;
1816 uint64_t gen_mask;
1817 uint64_t zp_gen;
1818 int i, err;
1819
1820 *vpp = NULL;
1821
1822 if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1823 return (err);
1824
1825 /*
1826 * On FreeBSD we can get snapshot's mount point or its parent file
1827 * system mount point depending if snapshot is already mounted or not.
1828 */
1829 if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
1830 zfid_long_t *zlfid = (zfid_long_t *)fidp;
1831 uint64_t objsetid = 0;
1832
1833 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1834 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1835
1836 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1837 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1838
1839 zfs_exit(zfsvfs, FTAG);
1840
1841 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1842 if (err)
1843 return (SET_ERROR(EINVAL));
1844 if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1845 return (err);
1846 }
1847
1848 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
1849 zfid_short_t *zfid = (zfid_short_t *)fidp;
1850
1851 for (i = 0; i < sizeof (zfid->zf_object); i++)
1852 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
1853
1854 for (i = 0; i < sizeof (zfid->zf_gen); i++)
1855 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
1856 } else {
1857 zfs_exit(zfsvfs, FTAG);
1858 return (SET_ERROR(EINVAL));
1859 }
1860
1861 if (fidp->fid_len == LONG_FID_LEN && setgen != 0) {
1862 zfs_exit(zfsvfs, FTAG);
1863 dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n",
1864 (u_longlong_t)fid_gen, (u_longlong_t)setgen);
1865 return (SET_ERROR(EINVAL));
1866 }
1867
1868 /*
1869 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
1870 * directory tree. If the object == zfsvfs->z_shares_dir, then
1871 * we are in the .zfs/shares directory tree.
1872 */
1873 if ((fid_gen == 0 &&
1874 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
1875 (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
1876 zfs_exit(zfsvfs, FTAG);
1877 VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
1878 if (object == ZFSCTL_INO_SNAPDIR) {
1879 cn.cn_nameptr = "snapshot";
1880 cn.cn_namelen = strlen(cn.cn_nameptr);
1881 cn.cn_nameiop = LOOKUP;
1882 cn.cn_flags = ISLASTCN | LOCKLEAF;
1883 cn.cn_lkflags = flags;
1884 VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
1885 vput(dvp);
1886 } else if (object == zfsvfs->z_shares_dir) {
1887 /*
1888 * XXX This branch must not be taken,
1889 * if it is, then the lookup below will
1890 * explode.
1891 */
1892 cn.cn_nameptr = "shares";
1893 cn.cn_namelen = strlen(cn.cn_nameptr);
1894 cn.cn_nameiop = LOOKUP;
1895 cn.cn_flags = ISLASTCN;
1896 cn.cn_lkflags = flags;
1897 VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
1898 vput(dvp);
1899 } else {
1900 *vpp = dvp;
1901 }
1902 return (err);
1903 }
1904
1905 gen_mask = -1ULL >> (64 - 8 * i);
1906
1907 dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object,
1908 (u_longlong_t)fid_gen,
1909 (u_longlong_t)gen_mask);
1910 if ((err = zfs_zget(zfsvfs, object, &zp))) {
1911 zfs_exit(zfsvfs, FTAG);
1912 return (err);
1913 }
1914 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
1915 sizeof (uint64_t));
1916 zp_gen = zp_gen & gen_mask;
1917 if (zp_gen == 0)
1918 zp_gen = 1;
1919 if (zp->z_unlinked || zp_gen != fid_gen) {
1920 dprintf("znode gen (%llu) != fid gen (%llu)\n",
1921 (u_longlong_t)zp_gen, (u_longlong_t)fid_gen);
1922 vrele(ZTOV(zp));
1923 zfs_exit(zfsvfs, FTAG);
1924 return (SET_ERROR(EINVAL));
1925 }
1926
1927 *vpp = ZTOV(zp);
1928 zfs_exit(zfsvfs, FTAG);
1929 err = vn_lock(*vpp, flags);
1930 if (err == 0)
1931 vnode_create_vobject(*vpp, zp->z_size, curthread);
1932 else
1933 *vpp = NULL;
1934 return (err);
1935 }
1936
1937 /*
1938 * Block out VOPs and close zfsvfs_t::z_os
1939 *
1940 * Note, if successful, then we return with the 'z_teardown_lock' and
1941 * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
1942 * dataset and objset intact so that they can be atomically handed off during
1943 * a subsequent rollback or recv operation and the resume thereafter.
1944 */
1945 int
zfs_suspend_fs(zfsvfs_t * zfsvfs)1946 zfs_suspend_fs(zfsvfs_t *zfsvfs)
1947 {
1948 int error;
1949
1950 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
1951 return (error);
1952
1953 return (0);
1954 }
1955
/*
 * Rebuild SA and release VOPs. Note that ownership of the underlying dataset
 * is an invariant across any of the operations that can be performed while the
 * filesystem was suspended. Whether it succeeded or failed, the preconditions
 * are the same: the relevant objset and associated dataset are owned by
 * zfsvfs, held, and long held on entry.
 *
 * Both teardown locks must be write held on entry; they are released
 * before returning, whether or not the resume succeeded.
 *
 * Returns 0 on success.  If zfsvfs_init() fails, its error is returned
 * after a forced unmount of the filesystem has been attempted.
 */
int
zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
	int err;
	znode_t *zp;

	ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
	ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));

	/*
	 * We already own this, so just update the objset_t, as the one we
	 * had before may have been evicted.
	 */
	objset_t *os;
	VERIFY3P(ds->ds_owner, ==, zfsvfs);
	VERIFY(dsl_dataset_long_held(ds));
	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
	dsl_pool_config_enter(dp, FTAG);
	VERIFY0(dmu_objset_from_ds(ds, &os));
	dsl_pool_config_exit(dp, FTAG);

	err = zfsvfs_init(zfsvfs, os);
	if (err != 0)
		goto bail;

	ds->ds_dir->dd_activity_cancelled = B_FALSE;
	/* Not mounting: zfsvfs_setup() only reopens the ZIL, no replay. */
	VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE));

	zfs_set_fuid_feature(zfsvfs);

	/*
	 * Attempt to re-establish all the active znodes with
	 * their dbufs. If a zfs_rezget() fails, then we'll let
	 * any potential callers discover that via zfs_enter_verify_zp
	 * when they try to use their znode.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		(void) zfs_rezget(zp);
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

bail:
	/* release the VOPs */
	ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
	ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);

	if (err) {
		/*
		 * Since we couldn't setup the sa framework, try to force
		 * unmount this file system.
		 */
		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
			vfs_ref(zfsvfs->z_vfs);
			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
		}
	}
	return (err);
}
2023
2024 static void
zfs_freevfs(vfs_t * vfsp)2025 zfs_freevfs(vfs_t *vfsp)
2026 {
2027 zfsvfs_t *zfsvfs = vfsp->vfs_data;
2028
2029 zfsvfs_free(zfsvfs);
2030
2031 atomic_dec_32(&zfs_active_fs_count);
2032 }
2033
#ifdef __i386__
/*
 * Value of desiredvnodes saved by zfs_vnodes_adjust() and put back by
 * zfs_vnodes_adjust_back().  Only used on i386.
 */
static int desiredvnodes_backup;
#include <sys/vmmeter.h>


#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#endif
2044
/*
 * On i386 only, save the current desiredvnodes in desiredvnodes_backup
 * and, if it still equals the default computed below, lower it to three
 * quarters of that default.  A value that differs from the default is left
 * alone.  On every other architecture this function does nothing.
 */
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
	int newdesiredvnodes;

	desiredvnodes_backup = desiredvnodes;

	/*
	 * We calculate newdesiredvnodes the same way it is done in
	 * vntblinit(). If it is equal to desiredvnodes, it means that
	 * it wasn't tuned by the administrator and we can tune it down.
	 */
	newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof (struct vm_object) +
	    sizeof (struct vnode))));
	if (newdesiredvnodes == desiredvnodes)
		desiredvnodes = (3 * newdesiredvnodes) / 4;
#endif
}
2065
/*
 * Undo zfs_vnodes_adjust(): on i386, put back the desiredvnodes value that
 * was saved in desiredvnodes_backup.  Does nothing on other architectures.
 */
static void
zfs_vnodes_adjust_back(void)
{
#ifdef __i386__
	desiredvnodes = desiredvnodes_backup;
#endif
}
2074
/* Held exclusively by zfs_prune_task() around vnlru_free_vfsops(). */
static struct sx zfs_vnlru_lock;
/* Marker passed to vnlru_free_vfsops(); allocated in zfs_init(). */
static struct vnode *zfs_vnlru_marker;
/* Handle from arc_add_prune_callback(); removed again in zfs_fini(). */
static arc_prune_t *zfs_prune;
2078
2079 static void
zfs_prune_task(uint64_t nr_to_scan,void * arg __unused)2080 zfs_prune_task(uint64_t nr_to_scan, void *arg __unused)
2081 {
2082 if (nr_to_scan > INT_MAX)
2083 nr_to_scan = INT_MAX;
2084 sx_xlock(&zfs_vnlru_lock);
2085 vnlru_free_vfsops(nr_to_scan, &zfs_vfsops, zfs_vnlru_marker);
2086 sx_xunlock(&zfs_vnlru_lock);
2087 }
2088
/*
 * One-time initialization of the ZFS VFS layer.  Prints the ZPL version
 * and sets up the .zfs control directory code, the znode layer, the zfsvfs
 * taskq and the ARC prune callback with its vnode LRU marker and lock.
 * zfs_fini() undoes this.
 */
void
zfs_init(void)
{

	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");

	/*
	 * Initialize .zfs directory structures
	 */
	zfsctl_init();

	/*
	 * Initialize znode cache, vnode ops, etc...
	 */
	zfs_znode_init();

	/*
	 * Reduce number of vnodes. Originally number of vnodes is calculated
	 * with UFS inode in mind. We reduce it here, because it's too big for
	 * ZFS/i386.
	 */
	zfs_vnodes_adjust();

	dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);

	zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);

	/*
	 * The marker and lock are set up before the prune callback is
	 * registered, because zfs_prune_task() uses both.
	 */
	zfs_vnlru_marker = vnlru_alloc_marker();
	sx_init(&zfs_vnlru_lock, "zfs vnlru lock");
	zfs_prune = arc_add_prune_callback(zfs_prune_task, NULL);
}
2120
/*
 * Tear down what zfs_init() set up.
 */
void
zfs_fini(void)
{
	/*
	 * Unregister the prune callback first, then release the marker and
	 * the lock that zfs_prune_task() uses.
	 */
	arc_remove_prune_callback(zfs_prune);
	vnlru_free_marker(zfs_vnlru_marker);
	sx_destroy(&zfs_vnlru_lock);

	taskq_destroy(zfsvfs_taskq);
	zfsctl_fini();
	zfs_znode_fini();
	/* Restore the desiredvnodes value saved by zfs_vnodes_adjust(). */
	zfs_vnodes_adjust_back();
}
2133
2134 int
zfs_busy(void)2135 zfs_busy(void)
2136 {
2137 return (zfs_active_fs_count != 0);
2138 }
2139
/*
 * Release VOPs and unmount a suspended filesystem.
 *
 * The caller must hold both the teardown lock and the teardown-inactive lock
 * for writing (asserted below); both are released here.  ds must be owned by
 * zfsvfs and long-held (verified below).  The result of zfs_umount() is
 * ignored and the function always returns 0.
 */
int
zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
	ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
	ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));

	/*
	 * We already own this, so just hold and rele it to update the
	 * objset_t, as the one we had before may have been evicted.
	 */
	objset_t *os;
	VERIFY3P(ds->ds_owner, ==, zfsvfs);
	VERIFY(dsl_dataset_long_held(ds));
	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
	/* The objset lookup is done inside the pool config lock. */
	dsl_pool_config_enter(dp, FTAG);
	VERIFY0(dmu_objset_from_ds(ds, &os));
	dsl_pool_config_exit(dp, FTAG);
	zfsvfs->z_os = os;

	/* release the VOPs */
	ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
	ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);

	/*
	 * Try to force unmount this file system.
	 */
	(void) zfs_umount(zfsvfs->z_vfs, 0);
	/* Marked unmounted regardless of what zfs_umount() returned. */
	zfsvfs->z_unmounted = B_TRUE;
	return (0);
}
2173
/*
 * Raise the ZPL version of a mounted filesystem to newvers.
 *
 * Returns EINVAL if newvers is outside [ZPL_VERSION_INITIAL, ZPL_VERSION] or
 * lower than the current z_version, ENOTSUP if the pool's SPA version is
 * below zfs_spa_version_map(newvers), the error from dmu_tx_assign() or
 * zap_update() if either fails, and 0 on success.
 */
int
zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
{
	int error;
	objset_t *os = zfsvfs->z_os;
	dmu_tx_t *tx;

	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
		return (SET_ERROR(EINVAL));

	/* Downgrades are refused. */
	if (newvers < zfsvfs->z_version)
		return (SET_ERROR(EINVAL));

	if (zfs_spa_version_map(newvers) >
	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
		return (SET_ERROR(ENOTSUP));

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
	/*
	 * Crossing into an SA-capable version on a filesystem that does not
	 * use SA yet: also hold the ZFS_SA_ATTRS master node entry and a new
	 * object, both of which are created further down.
	 */
	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
		    ZFS_SA_ATTRS);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	}
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}

	/* Store the new version number in the master node. */
	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
	    8, 1, &newvers, tx);

	if (error) {
		/* The tx was assigned above, so it is ended with a commit. */
		dmu_tx_commit(tx);
		return (error);
	}

	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
		uint64_t sa_obj;

		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
		    SPA_VERSION_SA);
		/* Create the SA master node and record it under ZFS_SA_ATTRS. */
		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
		    DMU_OT_NONE, 0, tx);

		error = zap_add(os, MASTER_NODE_OBJ,
		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
		/* NOTE(review): a zap_add() failure is checked only by ASSERT0. */
		ASSERT0(error);

		VERIFY0(sa_set_sa_object(os, sa_obj));
		sa_register_update_callback(os, zfs_sa_upgrade);
	}

	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
	    "from %ju to %ju", (uintmax_t)zfsvfs->z_version,
	    (uintmax_t)newvers);
	dmu_tx_commit(tx);

	/* Update the cached copies only after the change is committed. */
	zfsvfs->z_version = newvers;
	os->os_version = newvers;

	zfs_set_fuid_feature(zfsvfs);

	return (0);
}
2240
2241 /*
2242 * Return true if the corresponding vfs's unmounted flag is set.
2243 * Otherwise return false.
2244 * If this function returns true we know VFS unmount has been initiated.
2245 */
2246 boolean_t
zfs_get_vfs_flag_unmounted(objset_t * os)2247 zfs_get_vfs_flag_unmounted(objset_t *os)
2248 {
2249 zfsvfs_t *zfvp;
2250 boolean_t unmounted = B_FALSE;
2251
2252 ASSERT3U(dmu_objset_type(os), ==, DMU_OST_ZFS);
2253
2254 mutex_enter(&os->os_user_ptr_lock);
2255 zfvp = dmu_objset_get_user(os);
2256 if (zfvp != NULL && zfvp->z_vfs != NULL &&
2257 (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
2258 unmounted = B_TRUE;
2259 mutex_exit(&os->os_user_ptr_lock);
2260
2261 return (unmounted);
2262 }
2263
#ifdef _KERNEL
/*
 * Rewrite f_mntfromname of every entry on the mount list after a rename from
 * oldname to newname.
 *
 * An entry equal to oldname becomes newname.  An entry that starts with
 * oldname followed by '/' or '@' has that leading oldname replaced by
 * newname and keeps the rest.  All other entries are left alone.  The walk
 * is done with mountlist_mtx held.
 */
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
	char renamed[MAXPATHLEN];
	const size_t prefixlen = strlen(oldname);
	struct mount *mp;

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		char *from = mp->mnt_stat.f_mntfromname;
		const size_t fromsz = sizeof (mp->mnt_stat.f_mntfromname);

		if (strcmp(from, oldname) == 0) {
			/* Exact match: the renamed dataset itself. */
			(void) strlcpy(from, newname, fromsz);
		} else if (strncmp(from, oldname, prefixlen) == 0 &&
		    (from[prefixlen] == '/' || from[prefixlen] == '@')) {
			/*
			 * Build the new name in a scratch buffer first,
			 * because the suffix being appended lives in the
			 * destination string.
			 */
			(void) snprintf(renamed, sizeof (renamed), "%s%s",
			    newname, from + prefixlen);
			(void) strlcpy(from, renamed, fromsz);
		}
	}
	mtx_unlock(&mountlist_mtx);
}
#endif
2295
2296 /*
2297 * Find a prison with ZFS info.
2298 * Return the ZFS info and the (locked) prison.
2299 */
2300 static struct zfs_jailparam *
zfs_jailparam_find(struct prison * spr,struct prison ** prp)2301 zfs_jailparam_find(struct prison *spr, struct prison **prp)
2302 {
2303 struct prison *pr;
2304 struct zfs_jailparam *zjp;
2305
2306 for (pr = spr; ; pr = pr->pr_parent) {
2307 mtx_lock(&pr->pr_mtx);
2308 if (pr == &prison0) {
2309 zjp = &zfs_jailparam0;
2310 break;
2311 }
2312 zjp = osd_jail_get(pr, zfs_jailparam_slot);
2313 if (zjp != NULL)
2314 break;
2315 mtx_unlock(&pr->pr_mtx);
2316 }
2317 *prp = pr;
2318
2319 return (zjp);
2320 }
2321
/*
 * Ensure a prison has its own ZFS info.  If zjpp is non-null, point it to the
 * ZFS info and lock the prison.
 *
 * In other words: when zjpp is non-NULL the function returns with pr->pr_mtx
 * held and *zjpp set; when zjpp is NULL pr->pr_mtx is released before
 * returning.
 */
static void
zfs_jailparam_alloc(struct prison *pr, struct zfs_jailparam **zjpp)
{
	struct prison *ppr;
	struct zfs_jailparam *zjp, *nzjp;
	void **rsv;

	/* If this prison already has ZFS info, return that. */
	zjp = zfs_jailparam_find(pr, &ppr);
	if (ppr == pr)
		goto done;

	/*
	 * Allocate a new info record.  Then check again, in case something
	 * changed during the allocation.
	 */
	/* The ancestor's lock from the lookup is dropped before allocating. */
	mtx_unlock(&ppr->pr_mtx);
	nzjp = malloc(sizeof (struct zfs_jailparam), M_PRISON, M_WAITOK);
	rsv = osd_reserve(zfs_jailparam_slot);
	zjp = zfs_jailparam_find(pr, &ppr);
	if (ppr == pr) {
		/* pr gained its own info meanwhile; discard ours. */
		free(nzjp, M_PRISON);
		osd_free_reserved(rsv);
		goto done;
	}
	/* Inherit the initial values from the ancestor. */
	/*
	 * Here the ancestor (ppr) is still locked from the second lookup;
	 * pr is locked as well while the new record is attached and filled.
	 */
	mtx_lock(&pr->pr_mtx);
	(void) osd_jail_set_reserved(pr, zfs_jailparam_slot, rsv, nzjp);
	(void) memcpy(nzjp, zjp, sizeof (*zjp));
	zjp = nzjp;
	mtx_unlock(&ppr->pr_mtx);
done:
	/* On every path reaching this label, pr->pr_mtx is held. */
	if (zjpp != NULL)
		*zjpp = zjp;
	else
		mtx_unlock(&pr->pr_mtx);
}
2363
2364 /*
2365 * Jail OSD methods for ZFS VFS info.
2366 */
2367 static int
zfs_jailparam_create(void * obj,void * data)2368 zfs_jailparam_create(void *obj, void *data)
2369 {
2370 struct prison *pr = obj;
2371 struct vfsoptlist *opts = data;
2372 int jsys;
2373
2374 if (vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)) == 0 &&
2375 jsys == JAIL_SYS_INHERIT)
2376 return (0);
2377 /*
2378 * Inherit a prison's initial values from its parent
2379 * (different from JAIL_SYS_INHERIT which also inherits changes).
2380 */
2381 zfs_jailparam_alloc(pr, NULL);
2382 return (0);
2383 }
2384
2385 static int
zfs_jailparam_get(void * obj,void * data)2386 zfs_jailparam_get(void *obj, void *data)
2387 {
2388 struct prison *ppr, *pr = obj;
2389 struct vfsoptlist *opts = data;
2390 struct zfs_jailparam *zjp;
2391 int jsys, error;
2392
2393 zjp = zfs_jailparam_find(pr, &ppr);
2394 jsys = (ppr == pr) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT;
2395 error = vfs_setopt(opts, "zfs", &jsys, sizeof (jsys));
2396 if (error != 0 && error != ENOENT)
2397 goto done;
2398 if (jsys == JAIL_SYS_NEW) {
2399 error = vfs_setopt(opts, "zfs.mount_snapshot",
2400 &zjp->mount_snapshot, sizeof (zjp->mount_snapshot));
2401 if (error != 0 && error != ENOENT)
2402 goto done;
2403 } else {
2404 /*
2405 * If this prison is inheriting its ZFS info, report
2406 * empty/zero parameters.
2407 */
2408 static int mount_snapshot = 0;
2409
2410 error = vfs_setopt(opts, "zfs.mount_snapshot",
2411 &mount_snapshot, sizeof (mount_snapshot));
2412 if (error != 0 && error != ENOENT)
2413 goto done;
2414 }
2415 error = 0;
2416 done:
2417 mtx_unlock(&ppr->pr_mtx);
2418 return (error);
2419 }
2420
2421 static int
zfs_jailparam_set(void * obj,void * data)2422 zfs_jailparam_set(void *obj, void *data)
2423 {
2424 struct prison *pr = obj;
2425 struct prison *ppr;
2426 struct vfsoptlist *opts = data;
2427 int error, jsys, mount_snapshot;
2428
2429 /* Set the parameters, which should be correct. */
2430 error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
2431 if (error == ENOENT)
2432 jsys = -1;
2433 error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
2434 sizeof (mount_snapshot));
2435 if (error == ENOENT)
2436 mount_snapshot = -1;
2437 else
2438 jsys = JAIL_SYS_NEW;
2439 switch (jsys) {
2440 case JAIL_SYS_NEW:
2441 {
2442 /* "zfs=new" or "zfs.*": the prison gets its own ZFS info. */
2443 struct zfs_jailparam *zjp;
2444
2445 /*
2446 * A child jail cannot have more permissions than its parent
2447 */
2448 if (pr->pr_parent != &prison0) {
2449 zjp = zfs_jailparam_find(pr->pr_parent, &ppr);
2450 mtx_unlock(&ppr->pr_mtx);
2451 if (zjp->mount_snapshot < mount_snapshot) {
2452 return (EPERM);
2453 }
2454 }
2455 zfs_jailparam_alloc(pr, &zjp);
2456 if (mount_snapshot != -1)
2457 zjp->mount_snapshot = mount_snapshot;
2458 mtx_unlock(&pr->pr_mtx);
2459 break;
2460 }
2461 case JAIL_SYS_INHERIT:
2462 /* "zfs=inherit": inherit the parent's ZFS info. */
2463 mtx_lock(&pr->pr_mtx);
2464 osd_jail_del(pr, zfs_jailparam_slot);
2465 mtx_unlock(&pr->pr_mtx);
2466 break;
2467 case -1:
2468 /*
2469 * If the setting being changed is not ZFS related
2470 * then do nothing.
2471 */
2472 break;
2473 }
2474
2475 return (0);
2476 }
2477
2478 static int
zfs_jailparam_check(void * obj __unused,void * data)2479 zfs_jailparam_check(void *obj __unused, void *data)
2480 {
2481 struct vfsoptlist *opts = data;
2482 int error, jsys, mount_snapshot;
2483
2484 /* Check that the parameters are correct. */
2485 error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
2486 if (error != ENOENT) {
2487 if (error != 0)
2488 return (error);
2489 if (jsys != JAIL_SYS_NEW && jsys != JAIL_SYS_INHERIT)
2490 return (EINVAL);
2491 }
2492 error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
2493 sizeof (mount_snapshot));
2494 if (error != ENOENT) {
2495 if (error != 0)
2496 return (error);
2497 if (mount_snapshot != 0 && mount_snapshot != 1)
2498 return (EINVAL);
2499 }
2500 return (0);
2501 }
2502
2503 static void
zfs_jailparam_destroy(void * data)2504 zfs_jailparam_destroy(void *data)
2505 {
2506
2507 free(data, M_PRISON);
2508 }
2509
2510 static void
zfs_jailparam_sysinit(void * arg __unused)2511 zfs_jailparam_sysinit(void *arg __unused)
2512 {
2513 struct prison *pr;
2514 osd_method_t methods[PR_MAXMETHOD] = {
2515 [PR_METHOD_CREATE] = zfs_jailparam_create,
2516 [PR_METHOD_GET] = zfs_jailparam_get,
2517 [PR_METHOD_SET] = zfs_jailparam_set,
2518 [PR_METHOD_CHECK] = zfs_jailparam_check,
2519 };
2520
2521 zfs_jailparam_slot = osd_jail_register(zfs_jailparam_destroy, methods);
2522 /* Copy the defaults to any existing prisons. */
2523 sx_slock(&allprison_lock);
2524 TAILQ_FOREACH(pr, &allprison, pr_list)
2525 zfs_jailparam_alloc(pr, NULL);
2526 sx_sunlock(&allprison_lock);
2527 }
2528
2529 static void
zfs_jailparam_sysuninit(void * arg __unused)2530 zfs_jailparam_sysuninit(void *arg __unused)
2531 {
2532
2533 osd_jail_deregister(zfs_jailparam_slot);
2534 }
2535
/*
 * Hook the jail parameter setup and teardown routines into SYSINIT and
 * SYSUNINIT at SI_SUB_DRIVERS / SI_ORDER_ANY.
 */
SYSINIT(zfs_jailparam_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY,
    zfs_jailparam_sysinit, NULL);
SYSUNINIT(zfs_jailparam_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY,
    zfs_jailparam_sysuninit, NULL);
2540