1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24 * Copyright (c) 2014 Integros [integros.com]
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
26 * Copyright 2019 Joyent, Inc.
27 * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org>
28 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
29 * Copyright 2024 Oxide Computer Company
30 * Copyright 2025 MNX Cloud, Inc.
31 */
32
33 /* Portions Copyright 2010 Robert Milkowski */
34
35 #include <sys/types.h>
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/sysmacros.h>
39 #include <sys/kmem.h>
40 #include <sys/pathname.h>
41 #include <sys/vnode.h>
42 #include <sys/vfs.h>
43 #include <sys/vfs_opreg.h>
44 #include <sys/mntent.h>
45 #include <sys/mount.h>
46 #include <sys/cmn_err.h>
47 #include "fs/fs_subr.h"
48 #include <sys/zfs_znode.h>
49 #include <sys/zfs_dir.h>
50 #include <sys/zil.h>
51 #include <sys/fs/zfs.h>
52 #include <sys/dmu.h>
53 #include <sys/dsl_prop.h>
54 #include <sys/dsl_dataset.h>
55 #include <sys/dsl_deleg.h>
56 #include <sys/spa.h>
57 #include <sys/zap.h>
58 #include <sys/sa.h>
59 #include <sys/sa_impl.h>
60 #include <sys/varargs.h>
61 #include <sys/policy.h>
62 #include <sys/atomic.h>
63 #include <sys/mkdev.h>
64 #include <sys/modctl.h>
65 #include <sys/refstr.h>
66 #include <sys/zfs_ioctl.h>
67 #include <sys/zfs_ctldir.h>
68 #include <sys/zfs_fuid.h>
69 #include <sys/bootconf.h>
70 #include <sys/ddi.h>
71 #include <sys/sunddi.h>
72 #include <sys/dnlc.h>
73 #include <sys/dmu_objset.h>
74 #include <sys/spa_boot.h>
75 #include <sys/vdev_impl.h>
76 #include <sys/ilstr.h>
77 #include "zfs_comutil.h"
78
79 int zfsfstype;
80 vfsops_t *zfs_vfsops = NULL;
81 static major_t zfs_major;
82 static minor_t zfs_minor;
83 static kmutex_t zfs_dev_mtx;
84
85 extern int sys_shutdown;
86
87 static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
88 static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
89 static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
90 static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
91 static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
92 static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
93 static void zfs_freevfs(vfs_t *vfsp);
94 static int zfs_syncfs(vfs_t *vfsp, uint64_t flags, cred_t *cr);
95
96 static const fs_operation_def_t zfs_vfsops_template[] = {
97 VFSNAME_MOUNT, { .vfs_mount = zfs_mount },
98 VFSNAME_MOUNTROOT, { .vfs_mountroot = zfs_mountroot },
99 VFSNAME_UNMOUNT, { .vfs_unmount = zfs_umount },
100 VFSNAME_ROOT, { .vfs_root = zfs_root },
101 VFSNAME_STATVFS, { .vfs_statvfs = zfs_statvfs },
102 VFSNAME_SYNC, { .vfs_sync = zfs_sync },
103 VFSNAME_VGET, { .vfs_vget = zfs_vget },
104 VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs },
105 VFSNAME_SYNCFS, { .vfs_syncfs = zfs_syncfs },
106 NULL, NULL
107 };
108
109 /*
110 * We need to keep a count of active fs's.
111 * This is necessary to prevent our module
112 * from being unloaded after a umount -f
113 */
114 static uint32_t zfs_active_fs_count = 0;
115
116 static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
117 static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
118 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
119 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
120
121 /*
122 * MO_DEFAULT is not used since the default value is determined
123 * by the equivalent property.
124 */
125 static mntopt_t mntopts[] = {
126 { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
127 { MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
128 { MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
129 { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
130 };
131
132 static mntopts_t zfs_mntopts = {
133 sizeof (mntopts) / sizeof (mntopt_t),
134 mntopts
135 };
136
137 /*ARGSUSED*/
138 int
zfs_sync(vfs_t * vfsp,short flag,cred_t * cr)139 zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
140 {
141 /*
142 * Data integrity is job one. We don't want a compromised kernel
143 * writing to the storage pool, so we never sync during panic.
144 */
145 if (panicstr)
146 return (0);
147
148 /*
149 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
150 * to sync metadata, which they would otherwise cache indefinitely.
151 * Semantically, the only requirement is that the sync be initiated.
152 * The DMU syncs out txgs frequently, so there's nothing to do.
153 */
154 if (flag & SYNC_ATTR)
155 return (0);
156
157 if (vfsp != NULL) {
158 /*
159 * Sync a specific filesystem.
160 */
161 zfsvfs_t *zfsvfs = vfsp->vfs_data;
162 dsl_pool_t *dp;
163
164 ZFS_ENTER(zfsvfs);
165 dp = dmu_objset_pool(zfsvfs->z_os);
166
167 /*
168 * If the system is shutting down, then skip any
169 * filesystems which may exist on a suspended pool.
170 */
171 if (sys_shutdown && spa_suspended(dp->dp_spa)) {
172 ZFS_EXIT(zfsvfs);
173 return (0);
174 }
175
176 if (zfsvfs->z_log != NULL)
177 zil_commit(zfsvfs->z_log, 0);
178
179 ZFS_EXIT(zfsvfs);
180 } else {
181 /*
182 * Sync all ZFS filesystems. This is what happens when you
183 * run sync(8). Unlike other filesystems, ZFS honors the
184 * request by waiting for all pools to commit all dirty data.
185 */
186 spa_sync_allpools();
187 }
188
189 return (0);
190 }
191
192 /*
193 * This is a synchronous request to sync all file system data out.
194 */
195 static int
zfs_syncfs(vfs_t * vfsp,uint64_t flags,cred_t * cr)196 zfs_syncfs(vfs_t *vfsp, uint64_t flags, cred_t *cr)
197 {
198 if (flags != 0) {
199 return (ENOTSUP);
200 }
201
202 return (zfs_sync(vfsp, 0, cr));
203 }
204
205 static int
zfs_create_unique_device(dev_t * dev)206 zfs_create_unique_device(dev_t *dev)
207 {
208 major_t new_major;
209
210 do {
211 ASSERT3U(zfs_minor, <=, MAXMIN32);
212 minor_t start = zfs_minor;
213 do {
214 mutex_enter(&zfs_dev_mtx);
215 if (zfs_minor >= MAXMIN32) {
216 /*
217 * If we're still using the real major
218 * keep out of /dev/zfs and /dev/zvol minor
219 * number space. If we're using a getudev()'ed
220 * major number, we can use all of its minors.
221 */
222 if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
223 zfs_minor = ZFS_MIN_MINOR;
224 else
225 zfs_minor = 0;
226 } else {
227 zfs_minor++;
228 }
229 *dev = makedevice(zfs_major, zfs_minor);
230 mutex_exit(&zfs_dev_mtx);
231 } while (vfs_devismounted(*dev) && zfs_minor != start);
232 if (zfs_minor == start) {
233 /*
234 * We are using all ~262,000 minor numbers for the
235 * current major number. Create a new major number.
236 */
237 if ((new_major = getudev()) == (major_t)-1) {
238 cmn_err(CE_WARN,
239 "zfs_mount: Can't get unique major "
240 "device number.");
241 return (-1);
242 }
243 mutex_enter(&zfs_dev_mtx);
244 zfs_major = new_major;
245 zfs_minor = 0;
246
247 mutex_exit(&zfs_dev_mtx);
248 } else {
249 break;
250 }
251 /* CONSTANTCONDITION */
252 } while (1);
253
254 return (0);
255 }
256
257 static void
atime_changed_cb(void * arg,uint64_t newval)258 atime_changed_cb(void *arg, uint64_t newval)
259 {
260 zfsvfs_t *zfsvfs = arg;
261
262 if (newval == TRUE) {
263 zfsvfs->z_atime = TRUE;
264 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
265 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
266 } else {
267 zfsvfs->z_atime = FALSE;
268 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
269 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
270 }
271 }
272
273 static void
xattr_changed_cb(void * arg,uint64_t newval)274 xattr_changed_cb(void *arg, uint64_t newval)
275 {
276 zfsvfs_t *zfsvfs = arg;
277
278 if (newval == TRUE) {
279 /* XXX locking on vfs_flag? */
280 zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
281 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
282 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
283 } else {
284 /* XXX locking on vfs_flag? */
285 zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
286 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
287 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
288 }
289 }
290
291 static void
blksz_changed_cb(void * arg,uint64_t newval)292 blksz_changed_cb(void *arg, uint64_t newval)
293 {
294 zfsvfs_t *zfsvfs = arg;
295 ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
296 ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
297 ASSERT(ISP2(newval));
298
299 zfsvfs->z_max_blksz = newval;
300 zfsvfs->z_vfs->vfs_bsize = newval;
301 }
302
303 static void
readonly_changed_cb(void * arg,uint64_t newval)304 readonly_changed_cb(void *arg, uint64_t newval)
305 {
306 zfsvfs_t *zfsvfs = arg;
307
308 if (newval) {
309 /* XXX locking on vfs_flag? */
310 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
311 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
312 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
313 } else {
314 /* XXX locking on vfs_flag? */
315 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
316 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
317 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
318 }
319 }
320
321 static void
devices_changed_cb(void * arg,uint64_t newval)322 devices_changed_cb(void *arg, uint64_t newval)
323 {
324 zfsvfs_t *zfsvfs = arg;
325
326 if (newval == FALSE) {
327 zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
328 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
329 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
330 } else {
331 zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
332 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
333 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
334 }
335 }
336
337 static void
setuid_changed_cb(void * arg,uint64_t newval)338 setuid_changed_cb(void *arg, uint64_t newval)
339 {
340 zfsvfs_t *zfsvfs = arg;
341
342 if (newval == FALSE) {
343 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
344 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
345 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
346 } else {
347 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
348 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
349 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
350 }
351 }
352
353 static void
exec_changed_cb(void * arg,uint64_t newval)354 exec_changed_cb(void *arg, uint64_t newval)
355 {
356 zfsvfs_t *zfsvfs = arg;
357
358 if (newval == FALSE) {
359 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
360 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
361 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
362 } else {
363 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
364 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
365 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
366 }
367 }
368
369 /*
370 * The nbmand mount option can be changed at mount time.
371 * We can't allow it to be toggled on live file systems or incorrect
372 * behavior may be seen from cifs clients
373 *
374 * This property isn't registered via dsl_prop_register(), but this callback
375 * will be called when a file system is first mounted
376 */
377 static void
nbmand_changed_cb(void * arg,uint64_t newval)378 nbmand_changed_cb(void *arg, uint64_t newval)
379 {
380 zfsvfs_t *zfsvfs = arg;
381 if (newval == FALSE) {
382 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
383 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
384 } else {
385 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
386 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
387 }
388 }
389
390 static void
snapdir_changed_cb(void * arg,uint64_t newval)391 snapdir_changed_cb(void *arg, uint64_t newval)
392 {
393 zfsvfs_t *zfsvfs = arg;
394
395 zfsvfs->z_show_ctldir = newval;
396 }
397
398 static void
vscan_changed_cb(void * arg,uint64_t newval)399 vscan_changed_cb(void *arg, uint64_t newval)
400 {
401 zfsvfs_t *zfsvfs = arg;
402
403 zfsvfs->z_vscan = newval;
404 }
405
406 static void
acl_mode_changed_cb(void * arg,uint64_t newval)407 acl_mode_changed_cb(void *arg, uint64_t newval)
408 {
409 zfsvfs_t *zfsvfs = arg;
410
411 zfsvfs->z_acl_mode = newval;
412 }
413
414 static void
acl_inherit_changed_cb(void * arg,uint64_t newval)415 acl_inherit_changed_cb(void *arg, uint64_t newval)
416 {
417 zfsvfs_t *zfsvfs = arg;
418
419 zfsvfs->z_acl_inherit = newval;
420 }
421
422 static void
acl_implicit_changed_cb(void * arg,uint64_t newval)423 acl_implicit_changed_cb(void *arg, uint64_t newval)
424 {
425 zfsvfs_t *zfsvfs = arg;
426
427 zfsvfs->z_acl_implicit = (boolean_t)newval;
428 }
429
430 static int
zfs_register_callbacks(vfs_t * vfsp)431 zfs_register_callbacks(vfs_t *vfsp)
432 {
433 struct dsl_dataset *ds = NULL;
434 objset_t *os = NULL;
435 zfsvfs_t *zfsvfs = NULL;
436 uint64_t nbmand;
437 boolean_t readonly = B_FALSE;
438 boolean_t do_readonly = B_FALSE;
439 boolean_t setuid = B_FALSE;
440 boolean_t do_setuid = B_FALSE;
441 boolean_t exec = B_FALSE;
442 boolean_t do_exec = B_FALSE;
443 boolean_t devices = B_FALSE;
444 boolean_t do_devices = B_FALSE;
445 boolean_t xattr = B_FALSE;
446 boolean_t do_xattr = B_FALSE;
447 boolean_t atime = B_FALSE;
448 boolean_t do_atime = B_FALSE;
449 int error = 0;
450
451 ASSERT(vfsp);
452 zfsvfs = vfsp->vfs_data;
453 ASSERT(zfsvfs);
454 os = zfsvfs->z_os;
455
456 /*
457 * The act of registering our callbacks will destroy any mount
458 * options we may have. In order to enable temporary overrides
459 * of mount options, we stash away the current values and
460 * restore them after we register the callbacks.
461 */
462 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
463 !spa_writeable(dmu_objset_spa(os))) {
464 readonly = B_TRUE;
465 do_readonly = B_TRUE;
466 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
467 readonly = B_FALSE;
468 do_readonly = B_TRUE;
469 }
470 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
471 devices = B_FALSE;
472 setuid = B_FALSE;
473 do_devices = B_TRUE;
474 do_setuid = B_TRUE;
475 } else {
476 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
477 devices = B_FALSE;
478 do_devices = B_TRUE;
479 } else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
480 devices = B_TRUE;
481 do_devices = B_TRUE;
482 }
483
484 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
485 setuid = B_FALSE;
486 do_setuid = B_TRUE;
487 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
488 setuid = B_TRUE;
489 do_setuid = B_TRUE;
490 }
491 }
492 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
493 exec = B_FALSE;
494 do_exec = B_TRUE;
495 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
496 exec = B_TRUE;
497 do_exec = B_TRUE;
498 }
499 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
500 xattr = B_FALSE;
501 do_xattr = B_TRUE;
502 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
503 xattr = B_TRUE;
504 do_xattr = B_TRUE;
505 }
506 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
507 atime = B_FALSE;
508 do_atime = B_TRUE;
509 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
510 atime = B_TRUE;
511 do_atime = B_TRUE;
512 }
513
514 /*
515 * nbmand is a special property. It can only be changed at
516 * mount time.
517 *
518 * This is weird, but it is documented to only be changeable
519 * at mount time.
520 */
521 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
522 nbmand = B_FALSE;
523 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
524 nbmand = B_TRUE;
525 } else {
526 char osname[ZFS_MAX_DATASET_NAME_LEN];
527
528 dmu_objset_name(os, osname);
529 if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
530 NULL)) {
531 return (error);
532 }
533 }
534
535 /*
536 * Register property callbacks.
537 *
538 * It would probably be fine to just check for i/o error from
539 * the first prop_register(), but I guess I like to go
540 * overboard...
541 */
542 ds = dmu_objset_ds(os);
543 dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
544 error = dsl_prop_register(ds,
545 zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
546 error = error ? error : dsl_prop_register(ds,
547 zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
548 error = error ? error : dsl_prop_register(ds,
549 zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
550 error = error ? error : dsl_prop_register(ds,
551 zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
552 error = error ? error : dsl_prop_register(ds,
553 zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
554 error = error ? error : dsl_prop_register(ds,
555 zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
556 error = error ? error : dsl_prop_register(ds,
557 zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
558 error = error ? error : dsl_prop_register(ds,
559 zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
560 error = error ? error : dsl_prop_register(ds,
561 zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
562 error = error ? error : dsl_prop_register(ds,
563 zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
564 zfsvfs);
565 error = error ? error : dsl_prop_register(ds,
566 zfs_prop_to_name(ZFS_PROP_ACLIMPLICIT),
567 acl_implicit_changed_cb, zfsvfs);
568 error = error ? error : dsl_prop_register(ds,
569 zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
570 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
571 if (error)
572 goto unregister;
573
574 /*
575 * Invoke our callbacks to restore temporary mount options.
576 */
577 if (do_readonly)
578 readonly_changed_cb(zfsvfs, readonly);
579 if (do_setuid)
580 setuid_changed_cb(zfsvfs, setuid);
581 if (do_exec)
582 exec_changed_cb(zfsvfs, exec);
583 if (do_devices)
584 devices_changed_cb(zfsvfs, devices);
585 if (do_xattr)
586 xattr_changed_cb(zfsvfs, xattr);
587 if (do_atime)
588 atime_changed_cb(zfsvfs, atime);
589
590 nbmand_changed_cb(zfsvfs, nbmand);
591
592 return (0);
593
594 unregister:
595 dsl_prop_unregister_all(ds, zfsvfs);
596 return (error);
597 }
598
599 static int
zfs_space_delta_cb(dmu_object_type_t bonustype,void * data,uint64_t * userp,uint64_t * groupp,uint64_t * projectp)600 zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
601 uint64_t *userp, uint64_t *groupp, uint64_t *projectp)
602 {
603 sa_hdr_phys_t sa;
604 sa_hdr_phys_t *sap = data;
605 uint64_t flags;
606 int hdrsize;
607 boolean_t swap = B_FALSE;
608
609 /*
610 * Is it a valid type of object to track?
611 */
612 if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
613 return (SET_ERROR(ENOENT));
614
615 /*
616 * If we have a NULL data pointer
617 * then assume the id's aren't changing and
618 * return EEXIST to the dmu to let it know to
619 * use the same ids
620 */
621 if (data == NULL)
622 return (SET_ERROR(EEXIST));
623
624 if (bonustype == DMU_OT_ZNODE) {
625 znode_phys_t *znp = data;
626 *userp = znp->zp_uid;
627 *groupp = znp->zp_gid;
628 *projectp = ZFS_DEFAULT_PROJID;
629 return (0);
630 }
631
632 if (sap->sa_magic == 0) {
633 /*
634 * This should only happen for newly created files
635 * that haven't had the znode data filled in yet.
636 */
637 *userp = 0;
638 *groupp = 0;
639 *projectp = ZFS_DEFAULT_PROJID;
640 return (0);
641 }
642
643 sa = *sap;
644 if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
645 sa.sa_magic = SA_MAGIC;
646 sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
647 swap = B_TRUE;
648 } else {
649 VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
650 }
651
652 hdrsize = sa_hdrsize(&sa);
653 VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
654
655 *userp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_UID_OFFSET));
656 *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_GID_OFFSET));
657 flags = *((uint64_t *)((uintptr_t)data + hdrsize + SA_FLAGS_OFFSET));
658 if (swap)
659 flags = BSWAP_64(flags);
660
661 if (flags & ZFS_PROJID)
662 *projectp = *((uint64_t *)((uintptr_t)data + hdrsize +
663 SA_PROJID_OFFSET));
664 else
665 *projectp = ZFS_DEFAULT_PROJID;
666
667 if (swap) {
668 *userp = BSWAP_64(*userp);
669 *groupp = BSWAP_64(*groupp);
670 *projectp = BSWAP_64(*projectp);
671 }
672 return (0);
673 }
674
675 static void
fuidstr_to_sid(zfsvfs_t * zfsvfs,const char * fuidstr,char * domainbuf,int buflen,uid_t * ridp)676 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
677 char *domainbuf, int buflen, uid_t *ridp)
678 {
679 uint64_t fuid;
680 const char *domain;
681
682 fuid = zfs_strtonum(fuidstr, NULL);
683
684 domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
685 if (domain)
686 (void) strlcpy(domainbuf, domain, buflen);
687 else
688 domainbuf[0] = '\0';
689 *ridp = FUID_RID(fuid);
690 }
691
692 static uint64_t
zfs_userquota_prop_to_obj(zfsvfs_t * zfsvfs,zfs_userquota_prop_t type)693 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
694 {
695 switch (type) {
696 case ZFS_PROP_USERUSED:
697 case ZFS_PROP_USEROBJUSED:
698 return (DMU_USERUSED_OBJECT);
699 case ZFS_PROP_GROUPUSED:
700 case ZFS_PROP_GROUPOBJUSED:
701 return (DMU_GROUPUSED_OBJECT);
702 case ZFS_PROP_PROJECTUSED:
703 case ZFS_PROP_PROJECTOBJUSED:
704 return (DMU_PROJECTUSED_OBJECT);
705 case ZFS_PROP_USERQUOTA:
706 return (zfsvfs->z_userquota_obj);
707 case ZFS_PROP_GROUPQUOTA:
708 return (zfsvfs->z_groupquota_obj);
709 case ZFS_PROP_USEROBJQUOTA:
710 return (zfsvfs->z_userobjquota_obj);
711 case ZFS_PROP_GROUPOBJQUOTA:
712 return (zfsvfs->z_groupobjquota_obj);
713 case ZFS_PROP_PROJECTQUOTA:
714 return (zfsvfs->z_projectquota_obj);
715 case ZFS_PROP_PROJECTOBJQUOTA:
716 return (zfsvfs->z_projectobjquota_obj);
717 default:
718 return (ZFS_NO_OBJECT);
719 }
720 }
721
722 int
zfs_userspace_many(zfsvfs_t * zfsvfs,zfs_userquota_prop_t type,uint64_t * cookiep,void * vbuf,uint64_t * bufsizep)723 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
724 uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
725 {
726 int error;
727 zap_cursor_t zc;
728 zap_attribute_t za;
729 zfs_useracct_t *buf = vbuf;
730 uint64_t obj;
731 int offset = 0;
732
733 if (!dmu_objset_userspace_present(zfsvfs->z_os))
734 return (SET_ERROR(ENOTSUP));
735
736 if ((type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED ||
737 type == ZFS_PROP_PROJECTOBJQUOTA ||
738 type == ZFS_PROP_PROJECTOBJUSED) &&
739 !dmu_objset_projectquota_present(zfsvfs->z_os))
740 return (SET_ERROR(ENOTSUP));
741
742 if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
743 type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA ||
744 type == ZFS_PROP_PROJECTOBJUSED ||
745 type == ZFS_PROP_PROJECTOBJQUOTA) &&
746 !dmu_objset_userobjspace_present(zfsvfs->z_os))
747 return (SET_ERROR(ENOTSUP));
748
749 obj = zfs_userquota_prop_to_obj(zfsvfs, type);
750 if (obj == ZFS_NO_OBJECT) {
751 *bufsizep = 0;
752 return (0);
753 }
754
755 if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
756 type == ZFS_PROP_PROJECTOBJUSED)
757 offset = DMU_OBJACCT_PREFIX_LEN;
758
759 for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
760 (error = zap_cursor_retrieve(&zc, &za)) == 0;
761 zap_cursor_advance(&zc)) {
762 if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
763 *bufsizep)
764 break;
765
766 /*
767 * skip object quota (with zap name prefix DMU_OBJACCT_PREFIX)
768 * when dealing with block quota and vice versa.
769 */
770 if ((offset > 0) != (strncmp(za.za_name, DMU_OBJACCT_PREFIX,
771 DMU_OBJACCT_PREFIX_LEN) == 0))
772 continue;
773
774 fuidstr_to_sid(zfsvfs, za.za_name + offset,
775 buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
776
777 buf->zu_space = za.za_first_integer;
778 buf++;
779 }
780 if (error == ENOENT)
781 error = 0;
782
783 ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
784 *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
785 *cookiep = zap_cursor_serialize(&zc);
786 zap_cursor_fini(&zc);
787 return (error);
788 }
789
790 /*
791 * buf must be big enough (eg, 16+1 bytes)
792 */
793 static int
id_to_fuidstr(zfsvfs_t * zfsvfs,const char * domain,uid_t rid,ilstr_t * ils,boolean_t addok)794 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
795 ilstr_t *ils, boolean_t addok)
796 {
797 uint64_t fuid;
798 int domainid = 0;
799
800 if (domain && domain[0]) {
801 domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
802 if (domainid == -1)
803 return (SET_ERROR(ENOENT));
804 }
805 fuid = FUID_ENCODE(domainid, rid);
806 ilstr_aprintf(ils, "%llx", (longlong_t)fuid);
807 return (0);
808 }
809
810 int
zfs_userspace_one(zfsvfs_t * zfsvfs,zfs_userquota_prop_t type,const char * domain,uint64_t rid,uint64_t * valp)811 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
812 const char *domain, uint64_t rid, uint64_t *valp)
813 {
814 ilstr_t ils;
815 char buf[20 + DMU_OBJACCT_PREFIX_LEN];
816 int err;
817 uint64_t obj;
818
819 ilstr_init_prealloc(&ils, buf, sizeof (buf));
820 *valp = 0;
821
822 if (!dmu_objset_userspace_present(zfsvfs->z_os))
823 return (SET_ERROR(ENOTSUP));
824
825 if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
826 type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA ||
827 type == ZFS_PROP_PROJECTOBJUSED ||
828 type == ZFS_PROP_PROJECTOBJQUOTA) &&
829 !dmu_objset_userobjspace_present(zfsvfs->z_os))
830 return (SET_ERROR(ENOTSUP));
831
832 if (type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED ||
833 type == ZFS_PROP_PROJECTOBJQUOTA ||
834 type == ZFS_PROP_PROJECTOBJUSED) {
835 if (!dmu_objset_projectquota_present(zfsvfs->z_os))
836 return (SET_ERROR(ENOTSUP));
837 if (!zpl_is_valid_projid(rid))
838 return (SET_ERROR(EINVAL));
839 }
840
841 obj = zfs_userquota_prop_to_obj(zfsvfs, type);
842 if (obj == ZFS_NO_OBJECT)
843 return (0);
844
845 if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
846 type == ZFS_PROP_PROJECTOBJUSED) {
847 ilstr_append_str(&ils, DMU_OBJACCT_PREFIX);
848 }
849
850 err = id_to_fuidstr(zfsvfs, domain, rid, &ils, B_FALSE);
851 if (err)
852 return (err);
853
854 VERIFY3S(ilstr_errno(&ils), ==, ILSTR_ERROR_OK);
855 err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
856 if (err == ENOENT)
857 err = 0;
858 return (err);
859 }
860
861 int
zfs_set_userquota(zfsvfs_t * zfsvfs,zfs_userquota_prop_t type,const char * domain,uint64_t rid,uint64_t quota)862 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
863 const char *domain, uint64_t rid, uint64_t quota)
864 {
865 char buf[32];
866 int err;
867 dmu_tx_t *tx;
868 uint64_t *objp;
869 boolean_t fuid_dirtied;
870
871 if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
872 return (SET_ERROR(ENOTSUP));
873
874 switch (type) {
875 case ZFS_PROP_USERQUOTA:
876 objp = &zfsvfs->z_userquota_obj;
877 break;
878 case ZFS_PROP_GROUPQUOTA:
879 objp = &zfsvfs->z_groupquota_obj;
880 break;
881 case ZFS_PROP_USEROBJQUOTA:
882 objp = &zfsvfs->z_userobjquota_obj;
883 break;
884 case ZFS_PROP_GROUPOBJQUOTA:
885 objp = &zfsvfs->z_groupobjquota_obj;
886 break;
887 case ZFS_PROP_PROJECTQUOTA:
888 if (!dmu_objset_projectquota_enabled(zfsvfs->z_os))
889 return (SET_ERROR(ENOTSUP));
890 if (!zpl_is_valid_projid(rid))
891 return (SET_ERROR(EINVAL));
892
893 objp = &zfsvfs->z_projectquota_obj;
894 break;
895 case ZFS_PROP_PROJECTOBJQUOTA:
896 if (!dmu_objset_projectquota_enabled(zfsvfs->z_os))
897 return (SET_ERROR(ENOTSUP));
898 if (!zpl_is_valid_projid(rid))
899 return (SET_ERROR(EINVAL));
900
901 objp = &zfsvfs->z_projectobjquota_obj;
902 break;
903 default:
904 return (SET_ERROR(EINVAL));
905 }
906
907 ilstr_t ils;
908 ilstr_init_prealloc(&ils, buf, sizeof (buf));
909 err = id_to_fuidstr(zfsvfs, domain, rid, &ils, B_TRUE);
910 if (err)
911 return (err);
912 VERIFY3S(ilstr_errno(&ils), ==, ILSTR_ERROR_OK);
913 fuid_dirtied = zfsvfs->z_fuid_dirty;
914
915 tx = dmu_tx_create(zfsvfs->z_os);
916 dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
917 if (*objp == 0) {
918 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
919 zfs_userquota_prop_prefixes[type]);
920 }
921 if (fuid_dirtied)
922 zfs_fuid_txhold(zfsvfs, tx);
923 err = dmu_tx_assign(tx, TXG_WAIT);
924 if (err) {
925 dmu_tx_abort(tx);
926 return (err);
927 }
928
929 mutex_enter(&zfsvfs->z_lock);
930 if (*objp == 0) {
931 *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
932 DMU_OT_NONE, 0, tx);
933 VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
934 zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
935 }
936 mutex_exit(&zfsvfs->z_lock);
937
938 if (quota == 0) {
939 err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
940 if (err == ENOENT)
941 err = 0;
942 } else {
943 err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx);
944 }
945 ASSERT(err == 0);
946 if (fuid_dirtied)
947 zfs_fuid_sync(zfsvfs, tx);
948 dmu_tx_commit(tx);
949 return (err);
950 }
951
952 boolean_t
zfs_id_overobjquota(zfsvfs_t * zfsvfs,uint64_t usedobj,uint64_t id)953 zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
954 {
955 char buf[20 + DMU_OBJACCT_PREFIX_LEN];
956 uint64_t used, quota, quotaobj;
957 int err;
958
959 if (!dmu_objset_userobjspace_present(zfsvfs->z_os)) {
960 if (dmu_objset_userobjspace_upgradable(zfsvfs->z_os)) {
961 dsl_pool_config_enter(
962 dmu_objset_pool(zfsvfs->z_os), FTAG);
963 dmu_objset_id_quota_upgrade(zfsvfs->z_os);
964 dsl_pool_config_exit(
965 dmu_objset_pool(zfsvfs->z_os), FTAG);
966 }
967 return (B_FALSE);
968 }
969
970 if (usedobj == DMU_PROJECTUSED_OBJECT) {
971 if (!dmu_objset_projectquota_present(zfsvfs->z_os)) {
972 if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) {
973 dsl_pool_config_enter(
974 dmu_objset_pool(zfsvfs->z_os), FTAG);
975 dmu_objset_id_quota_upgrade(zfsvfs->z_os);
976 dsl_pool_config_exit(
977 dmu_objset_pool(zfsvfs->z_os), FTAG);
978 }
979 return (B_FALSE);
980 }
981 quotaobj = zfsvfs->z_projectobjquota_obj;
982 } else if (usedobj == DMU_USERUSED_OBJECT) {
983 quotaobj = zfsvfs->z_userobjquota_obj;
984 } else if (usedobj == DMU_GROUPUSED_OBJECT) {
985 quotaobj = zfsvfs->z_groupobjquota_obj;
986 } else {
987 return (B_FALSE);
988 }
989 if (quotaobj == 0 || zfsvfs->z_replay)
990 return (B_FALSE);
991
992 (void) sprintf(buf, "%llx", (longlong_t)id);
993 err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a);
994 if (err != 0)
995 return (B_FALSE);
996
997 (void) sprintf(buf, DMU_OBJACCT_PREFIX "%llx", (longlong_t)id);
998 err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
999 if (err != 0)
1000 return (B_FALSE);
1001 return (used >= quota);
1002 }
1003
1004 boolean_t
zfs_id_overblockquota(zfsvfs_t * zfsvfs,uint64_t usedobj,uint64_t id)1005 zfs_id_overblockquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
1006 {
1007 char buf[20];
1008 uint64_t used, quota, quotaobj;
1009 int err;
1010
1011 if (usedobj == DMU_PROJECTUSED_OBJECT) {
1012 if (!dmu_objset_projectquota_present(zfsvfs->z_os)) {
1013 if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) {
1014 dsl_pool_config_enter(
1015 dmu_objset_pool(zfsvfs->z_os), FTAG);
1016 dmu_objset_id_quota_upgrade(zfsvfs->z_os);
1017 dsl_pool_config_exit(
1018 dmu_objset_pool(zfsvfs->z_os), FTAG);
1019 }
1020 return (B_FALSE);
1021 }
1022 quotaobj = zfsvfs->z_projectquota_obj;
1023 } else if (usedobj == DMU_USERUSED_OBJECT) {
1024 quotaobj = zfsvfs->z_userquota_obj;
1025 } else if (usedobj == DMU_GROUPUSED_OBJECT) {
1026 quotaobj = zfsvfs->z_groupquota_obj;
1027 } else {
1028 return (B_FALSE);
1029 }
1030 if (quotaobj == 0 || zfsvfs->z_replay)
1031 return (B_FALSE);
1032
1033 (void) sprintf(buf, "%llx", (longlong_t)id);
1034 err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a);
1035 if (err != 0)
1036 return (B_FALSE);
1037
1038 err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
1039 if (err != 0)
1040 return (B_FALSE);
1041 return (used >= quota);
1042 }
1043
1044 boolean_t
zfs_id_overquota(zfsvfs_t * zfsvfs,uint64_t usedobj,uint64_t id)1045 zfs_id_overquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
1046 {
1047 return (zfs_id_overblockquota(zfsvfs, usedobj, id) ||
1048 zfs_id_overobjquota(zfsvfs, usedobj, id));
1049 }
1050
1051 /*
1052 * Associate this zfsvfs with the given objset, which must be owned.
1053 * This will cache a bunch of on-disk state from the objset in the
1054 * zfsvfs.
1055 */
1056 static int
zfsvfs_init(zfsvfs_t * zfsvfs,objset_t * os)1057 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
1058 {
1059 int error;
1060 uint64_t val;
1061
1062 zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
1063 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
1064 zfsvfs->z_os = os;
1065
1066 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
1067 if (error != 0)
1068 return (error);
1069 if (zfsvfs->z_version >
1070 zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
1071 (void) printf("Can't mount a version %lld file system "
1072 "on a version %lld pool\n. Pool must be upgraded to mount "
1073 "this file system.", (u_longlong_t)zfsvfs->z_version,
1074 (u_longlong_t)spa_version(dmu_objset_spa(os)));
1075 return (SET_ERROR(ENOTSUP));
1076 }
1077 error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
1078 if (error != 0)
1079 return (error);
1080 zfsvfs->z_norm = (int)val;
1081
1082 error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
1083 if (error != 0)
1084 return (error);
1085 zfsvfs->z_utf8 = (val != 0);
1086
1087 error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
1088 if (error != 0)
1089 return (error);
1090 zfsvfs->z_case = (uint_t)val;
1091
1092 /*
1093 * Fold case on file systems that are always or sometimes case
1094 * insensitive.
1095 */
1096 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
1097 zfsvfs->z_case == ZFS_CASE_MIXED)
1098 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
1099
1100 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1101 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1102
1103 uint64_t sa_obj = 0;
1104 if (zfsvfs->z_use_sa) {
1105 /* should either have both of these objects or none */
1106 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
1107 &sa_obj);
1108 if (error != 0)
1109 return (error);
1110 }
1111
1112 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
1113 &zfsvfs->z_attr_table);
1114 if (error != 0)
1115 return (error);
1116
1117 if (zfsvfs->z_version >= ZPL_VERSION_SA)
1118 sa_register_update_callback(os, zfs_sa_upgrade);
1119
1120 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
1121 &zfsvfs->z_root);
1122 if (error != 0)
1123 return (error);
1124 ASSERT(zfsvfs->z_root != 0);
1125
1126 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
1127 &zfsvfs->z_unlinkedobj);
1128 if (error != 0)
1129 return (error);
1130
1131 error = zap_lookup(os, MASTER_NODE_OBJ,
1132 zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
1133 8, 1, &zfsvfs->z_userquota_obj);
1134 if (error == ENOENT)
1135 zfsvfs->z_userquota_obj = 0;
1136 else if (error != 0)
1137 return (error);
1138
1139 error = zap_lookup(os, MASTER_NODE_OBJ,
1140 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
1141 8, 1, &zfsvfs->z_groupquota_obj);
1142 if (error == ENOENT)
1143 zfsvfs->z_groupquota_obj = 0;
1144 else if (error != 0)
1145 return (error);
1146
1147 error = zap_lookup(os, MASTER_NODE_OBJ,
1148 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
1149 8, 1, &zfsvfs->z_projectquota_obj);
1150 if (error == ENOENT)
1151 zfsvfs->z_projectquota_obj = 0;
1152 else if (error != 0)
1153 return (error);
1154
1155 error = zap_lookup(os, MASTER_NODE_OBJ,
1156 zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
1157 8, 1, &zfsvfs->z_userobjquota_obj);
1158 if (error == ENOENT)
1159 zfsvfs->z_userobjquota_obj = 0;
1160 else if (error != 0)
1161 return (error);
1162
1163 error = zap_lookup(os, MASTER_NODE_OBJ,
1164 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
1165 8, 1, &zfsvfs->z_groupobjquota_obj);
1166 if (error == ENOENT)
1167 zfsvfs->z_groupobjquota_obj = 0;
1168 else if (error != 0)
1169 return (error);
1170
1171 error = zap_lookup(os, MASTER_NODE_OBJ,
1172 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
1173 8, 1, &zfsvfs->z_projectobjquota_obj);
1174 if (error == ENOENT)
1175 zfsvfs->z_projectobjquota_obj = 0;
1176 else if (error != 0)
1177 return (error);
1178
1179 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
1180 &zfsvfs->z_fuid_obj);
1181 if (error == ENOENT)
1182 zfsvfs->z_fuid_obj = 0;
1183 else if (error != 0)
1184 return (error);
1185
1186 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
1187 &zfsvfs->z_shares_dir);
1188 if (error == ENOENT)
1189 zfsvfs->z_shares_dir = 0;
1190 else if (error != 0)
1191 return (error);
1192
1193 return (0);
1194 }
1195
1196 int
zfsvfs_create(const char * osname,boolean_t readonly,zfsvfs_t ** zfvp)1197 zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
1198 {
1199 objset_t *os;
1200 zfsvfs_t *zfsvfs;
1201 int error;
1202 boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
1203
1204 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
1205
1206 error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os);
1207 if (error != 0) {
1208 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1209 return (error);
1210 }
1211
1212 error = zfsvfs_create_impl(zfvp, zfsvfs, os);
1213 if (error != 0) {
1214 dmu_objset_disown(os, B_TRUE, zfsvfs);
1215 }
1216 return (error);
1217 }
1218
1219
1220 int
zfsvfs_create_impl(zfsvfs_t ** zfvp,zfsvfs_t * zfsvfs,objset_t * os)1221 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
1222 {
1223 int error;
1224
1225 zfsvfs->z_vfs = NULL;
1226 zfsvfs->z_parent = zfsvfs;
1227
1228 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1229 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
1230 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1231 offsetof(znode_t, z_link_node));
1232 rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
1233 rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
1234 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
1235 for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1236 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1237
1238 error = zfsvfs_init(zfsvfs, os);
1239 if (error != 0) {
1240 *zfvp = NULL;
1241 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1242 return (error);
1243 }
1244
1245 zfsvfs->z_drain_task = TASKQID_INVALID;
1246 zfsvfs->z_draining = B_FALSE;
1247 zfsvfs->z_drain_cancel = B_TRUE;
1248
1249 *zfvp = zfsvfs;
1250 return (0);
1251 }
1252
1253 static int
zfsvfs_setup(zfsvfs_t * zfsvfs,boolean_t mounting)1254 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
1255 {
1256 int error;
1257
1258 error = zfs_register_callbacks(zfsvfs->z_vfs);
1259 if (error)
1260 return (error);
1261
1262 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
1263
1264 /*
1265 * If we are not mounting (ie: online recv), then we don't
1266 * have to worry about replaying the log as we blocked all
1267 * operations out since we closed the ZIL.
1268 */
1269 if (mounting) {
1270 boolean_t readonly;
1271
1272 /*
1273 * During replay we remove the read only flag to
1274 * allow replays to succeed.
1275 */
1276 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
1277 if (readonly != 0) {
1278 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
1279 } else {
1280 zfs_unlinked_drain(zfsvfs);
1281 }
1282
1283 /*
1284 * Parse and replay the intent log.
1285 *
1286 * Because of ziltest, this must be done after
1287 * zfs_unlinked_drain(). (Further note: ziltest
1288 * doesn't use readonly mounts, where
1289 * zfs_unlinked_drain() isn't called.) This is because
1290 * ziltest causes spa_sync() to think it's committed,
1291 * but actually it is not, so the intent log contains
1292 * many txg's worth of changes.
1293 *
1294 * In particular, if object N is in the unlinked set in
1295 * the last txg to actually sync, then it could be
1296 * actually freed in a later txg and then reallocated
1297 * in a yet later txg. This would write a "create
1298 * object N" record to the intent log. Normally, this
1299 * would be fine because the spa_sync() would have
1300 * written out the fact that object N is free, before
1301 * we could write the "create object N" intent log
1302 * record.
1303 *
1304 * But when we are in ziltest mode, we advance the "open
1305 * txg" without actually spa_sync()-ing the changes to
1306 * disk. So we would see that object N is still
1307 * allocated and in the unlinked set, and there is an
1308 * intent log record saying to allocate it.
1309 */
1310 if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
1311 if (zil_replay_disable) {
1312 zil_destroy(zfsvfs->z_log, B_FALSE);
1313 } else {
1314 zfsvfs->z_replay = B_TRUE;
1315 zil_replay(zfsvfs->z_os, zfsvfs,
1316 zfs_replay_vector);
1317 zfsvfs->z_replay = B_FALSE;
1318 }
1319 }
1320
1321 /* restore readonly bit */
1322 if (readonly != 0)
1323 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
1324 }
1325
1326 /*
1327 * Set the objset user_ptr to track its zfsvfs.
1328 */
1329 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1330 dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1331 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1332
1333 return (0);
1334 }
1335
1336 void
zfsvfs_free(zfsvfs_t * zfsvfs)1337 zfsvfs_free(zfsvfs_t *zfsvfs)
1338 {
1339 int i;
1340 extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
1341
1342 /*
1343 * This is a barrier to prevent the filesystem from going away in
1344 * zfs_znode_move() until we can safely ensure that the filesystem is
1345 * not unmounted. We consider the filesystem valid before the barrier
1346 * and invalid after the barrier.
1347 */
1348 rw_enter(&zfsvfs_lock, RW_READER);
1349 rw_exit(&zfsvfs_lock);
1350
1351 zfs_fuid_destroy(zfsvfs);
1352
1353 mutex_destroy(&zfsvfs->z_znodes_lock);
1354 mutex_destroy(&zfsvfs->z_lock);
1355 list_destroy(&zfsvfs->z_all_znodes);
1356 rrm_destroy(&zfsvfs->z_teardown_lock);
1357 rw_destroy(&zfsvfs->z_teardown_inactive_lock);
1358 rw_destroy(&zfsvfs->z_fuid_lock);
1359 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1360 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1361 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1362 }
1363
1364 static void
zfs_set_fuid_feature(zfsvfs_t * zfsvfs)1365 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1366 {
1367 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1368 if (zfsvfs->z_vfs) {
1369 if (zfsvfs->z_use_fuids) {
1370 vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1371 vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1372 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1373 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1374 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1375 vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1376 } else {
1377 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1378 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1379 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1380 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1381 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1382 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1383 }
1384 }
1385 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1386 }
1387
1388 static int
zfs_domount(vfs_t * vfsp,char * osname)1389 zfs_domount(vfs_t *vfsp, char *osname)
1390 {
1391 dev_t mount_dev;
1392 uint64_t recordsize, fsid_guid;
1393 int error = 0;
1394 zfsvfs_t *zfsvfs;
1395 boolean_t readonly = vfsp->vfs_flag & VFS_RDONLY ? B_TRUE : B_FALSE;
1396
1397 ASSERT(vfsp);
1398 ASSERT(osname);
1399
1400 error = zfsvfs_create(osname, readonly, &zfsvfs);
1401 if (error)
1402 return (error);
1403 zfsvfs->z_vfs = vfsp;
1404
1405 /* Initialize the generic filesystem structure. */
1406 vfsp->vfs_bcount = 0;
1407 vfsp->vfs_data = NULL;
1408
1409 if (zfs_create_unique_device(&mount_dev) == -1) {
1410 error = SET_ERROR(ENODEV);
1411 goto out;
1412 }
1413 ASSERT(vfs_devismounted(mount_dev) == 0);
1414
1415 if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
1416 NULL))
1417 goto out;
1418
1419 vfsp->vfs_dev = mount_dev;
1420 vfsp->vfs_fstype = zfsfstype;
1421 vfsp->vfs_bsize = recordsize;
1422 vfsp->vfs_flag |= VFS_NOTRUNC;
1423 vfsp->vfs_data = zfsvfs;
1424
1425 /*
1426 * The fsid is 64 bits, composed of an 8-bit fs type, which
1427 * separates our fsid from any other filesystem types, and a
1428 * 56-bit objset unique ID. The objset unique ID is unique to
1429 * all objsets open on this system, provided by unique_create().
1430 * The 8-bit fs type must be put in the low bits of fsid[1]
1431 * because that's where other Solaris filesystems put it.
1432 */
1433 fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
1434 ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
1435 vfsp->vfs_fsid.val[0] = fsid_guid;
1436 vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
1437 zfsfstype & 0xFF;
1438
1439 /*
1440 * Set features for file system.
1441 */
1442 zfs_set_fuid_feature(zfsvfs);
1443 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
1444 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1445 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1446 vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
1447 } else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
1448 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1449 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1450 }
1451 vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
1452
1453 if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
1454 uint64_t pval;
1455
1456 atime_changed_cb(zfsvfs, B_FALSE);
1457 readonly_changed_cb(zfsvfs, B_TRUE);
1458 if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
1459 goto out;
1460 xattr_changed_cb(zfsvfs, pval);
1461 zfsvfs->z_issnap = B_TRUE;
1462 zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
1463
1464 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1465 dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1466 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1467 } else {
1468 error = zfsvfs_setup(zfsvfs, B_TRUE);
1469 }
1470
1471 /* cache the root vnode for this mount */
1472 znode_t *rootzp;
1473 if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp)) {
1474 goto out;
1475 }
1476 zfsvfs->z_rootdir = ZTOV(rootzp);
1477
1478 if (!zfsvfs->z_issnap)
1479 zfsctl_create(zfsvfs);
1480 out:
1481 if (error) {
1482 dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
1483 zfsvfs_free(zfsvfs);
1484 } else {
1485 atomic_inc_32(&zfs_active_fs_count);
1486 }
1487
1488 return (error);
1489 }
1490
1491 void
zfs_unregister_callbacks(zfsvfs_t * zfsvfs)1492 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1493 {
1494 objset_t *os = zfsvfs->z_os;
1495
1496 if (!dmu_objset_is_snapshot(os))
1497 dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
1498 }
1499
/*
 * Convert a decimal digit string to a uint64_t integer.
 *
 * On success the value is stored in *objnum and 0 is returned.  EINVAL is
 * returned, and *objnum left untouched, when the string contains a
 * character other than '0'-'9' or when the value does not fit in 64 bits.
 * An empty string converts to 0.
 */
static int
str_to_uint64(char *str, uint64_t *objnum)
{
	const uint64_t maxval = ~(uint64_t)0;
	uint64_t num = 0;

	for (; *str != '\0'; str++) {
		uint64_t digit;

		if (*str < '0' || *str > '9')
			return (SET_ERROR(EINVAL));
		digit = (uint64_t)(*str - '0');

		/*
		 * Refuse to wrap: an overlong digit string must not be
		 * turned silently into some unrelated object number.
		 */
		if (num > (maxval - digit) / 10)
			return (SET_ERROR(EINVAL));

		num = num * 10 + digit;
	}

	*objnum = num;
	return (0);
}
1518
/*
 * The boot path handed over by the boot loader has the form
 * "rootpool-name/root-filesystem-object-number".  Convert it into a
 * dataset name of the form "rootpool-name/root-filesystem-name".
 *
 * bpath is copied to outpath first, so outpath already holds the answer
 * when bpath has no '/' (pool name only) or when the part after the
 * first '/' is not a number (already a dataset name).  Otherwise the
 * object number is resolved through dsl_dsobj_to_dsname() and its result
 * is returned.  An empty bpath or one starting with '/' yields EINVAL.
 *
 * bpath is modified temporarily but restored before returning.
 */
static int
zfs_parse_bootfs(char *bpath, char *outpath)
{
	char *sep;
	uint64_t dsobj;
	int error;

	if (bpath[0] == '\0' || bpath[0] == '/')
		return (SET_ERROR(EINVAL));

	(void) strcpy(outpath, bpath);

	/* No '/', or a non-numeric tail: the copy above is the result. */
	sep = strchr(bpath, '/');
	if (sep == NULL || str_to_uint64(sep + 1, &dsobj) != 0)
		return (0);

	/* Cut bpath down to the pool name for the duration of the lookup. */
	*sep = '\0';
	error = dsl_dsobj_to_dsname(bpath, dsobj, outpath);
	*sep = '/';

	return (error);
}
1554
1555 /*
1556 * Check that the hex label string is appropriate for the dataset being
1557 * mounted into the global_zone proper.
1558 *
1559 * Return an error if the hex label string is not default or
1560 * admin_low/admin_high. For admin_low labels, the corresponding
1561 * dataset must be readonly.
1562 */
1563 int
zfs_check_global_label(const char * dsname,const char * hexsl)1564 zfs_check_global_label(const char *dsname, const char *hexsl)
1565 {
1566 if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
1567 return (0);
1568 if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
1569 return (0);
1570 if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
1571 /* must be readonly */
1572 uint64_t rdonly;
1573
1574 if (dsl_prop_get_integer(dsname,
1575 zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
1576 return (SET_ERROR(EACCES));
1577 return (rdonly ? 0 : EACCES);
1578 }
1579 return (SET_ERROR(EACCES));
1580 }
1581
1582 static int
zfs_statfs_project(zfsvfs_t * zfsvfs,znode_t * zp,struct statvfs64 * statp,uint32_t bshift)1583 zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct statvfs64 *statp,
1584 uint32_t bshift)
1585 {
1586 ilstr_t ils;
1587 char buf[20 + DMU_OBJACCT_PREFIX_LEN];
1588 uint64_t offset = DMU_OBJACCT_PREFIX_LEN;
1589 uint64_t quota;
1590 uint64_t used;
1591 int err;
1592
1593 ilstr_init_prealloc(&ils, buf, sizeof (buf));
1594 ilstr_append_str(&ils, DMU_OBJACCT_PREFIX);
1595 err = id_to_fuidstr(zfsvfs, NULL, zp->z_projid, &ils, B_FALSE);
1596 if (err)
1597 return (err);
1598
1599 VERIFY3S(ilstr_errno(&ils), ==, ILSTR_ERROR_OK);
1600 if (zfsvfs->z_projectquota_obj == 0)
1601 goto objs;
1602
1603 err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj,
1604 buf + offset, 8, 1, "a);
1605 if (err == ENOENT)
1606 goto objs;
1607 else if (err)
1608 return (err);
1609
1610 err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT,
1611 buf + offset, 8, 1, &used);
1612 if (unlikely(err == ENOENT)) {
1613 uint32_t blksize;
1614 u_longlong_t nblocks;
1615
1616 /*
1617 * Quota accounting is async, so it is possible race case.
1618 * There is at least one object with the given project ID.
1619 */
1620 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
1621 if (unlikely(zp->z_blksz == 0))
1622 blksize = zfsvfs->z_max_blksz;
1623
1624 used = blksize * nblocks;
1625 } else if (err) {
1626 return (err);
1627 }
1628
1629 statp->f_blocks = quota >> bshift;
1630 statp->f_bfree = (quota > used) ? ((quota - used) >> bshift) : 0;
1631 statp->f_bavail = statp->f_bfree;
1632
1633 objs:
1634 if (zfsvfs->z_projectobjquota_obj == 0)
1635 return (0);
1636
1637 err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj,
1638 buf + offset, 8, 1, "a);
1639 if (err == ENOENT)
1640 return (0);
1641 else if (err)
1642 return (err);
1643
1644 err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT,
1645 buf, 8, 1, &used);
1646 if (unlikely(err == ENOENT)) {
1647 /*
1648 * Quota accounting is async, so it is possible race case.
1649 * There is at least one object with the given project ID.
1650 */
1651 used = 1;
1652 } else if (err) {
1653 return (err);
1654 }
1655
1656 statp->f_files = quota;
1657 statp->f_ffree = (quota > used) ? (quota - used) : 0;
1658
1659 return (0);
1660 }
1661
1662 /*
1663 * Determine whether the mount is allowed according to MAC check.
1664 * by comparing (where appropriate) label of the dataset against
1665 * the label of the zone being mounted into. If the dataset has
1666 * no label, create one.
1667 *
1668 * Returns 0 if access allowed, error otherwise (e.g. EACCES)
1669 */
1670 static int
zfs_mount_label_policy(vfs_t * vfsp,char * osname)1671 zfs_mount_label_policy(vfs_t *vfsp, char *osname)
1672 {
1673 int error, retv;
1674 zone_t *mntzone = NULL;
1675 ts_label_t *mnt_tsl;
1676 bslabel_t *mnt_sl;
1677 bslabel_t ds_sl;
1678 char ds_hexsl[MAXNAMELEN];
1679
1680 retv = EACCES; /* assume the worst */
1681
1682 /*
1683 * Start by getting the dataset label if it exists.
1684 */
1685 error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1686 1, sizeof (ds_hexsl), &ds_hexsl, NULL);
1687 if (error)
1688 return (SET_ERROR(EACCES));
1689
1690 /*
1691 * If labeling is NOT enabled, then disallow the mount of datasets
1692 * which have a non-default label already. No other label checks
1693 * are needed.
1694 */
1695 if (!is_system_labeled()) {
1696 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
1697 return (0);
1698 return (SET_ERROR(EACCES));
1699 }
1700
1701 /*
1702 * Get the label of the mountpoint. If mounting into the global
1703 * zone (i.e. mountpoint is not within an active zone and the
1704 * zoned property is off), the label must be default or
1705 * admin_low/admin_high only; no other checks are needed.
1706 */
1707 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
1708 if (mntzone->zone_id == GLOBAL_ZONEID) {
1709 uint64_t zoned;
1710
1711 zone_rele(mntzone);
1712
1713 if (dsl_prop_get_integer(osname,
1714 zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
1715 return (SET_ERROR(EACCES));
1716 if (!zoned)
1717 return (zfs_check_global_label(osname, ds_hexsl));
1718 else
1719 /*
1720 * This is the case of a zone dataset being mounted
1721 * initially, before the zone has been fully created;
1722 * allow this mount into global zone.
1723 */
1724 return (0);
1725 }
1726
1727 mnt_tsl = mntzone->zone_slabel;
1728 ASSERT(mnt_tsl != NULL);
1729 label_hold(mnt_tsl);
1730 mnt_sl = label2bslabel(mnt_tsl);
1731
1732 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
1733 /*
1734 * The dataset doesn't have a real label, so fabricate one.
1735 */
1736 char *str = NULL;
1737
1738 if (l_to_str_internal(mnt_sl, &str) == 0 &&
1739 dsl_prop_set_string(osname,
1740 zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1741 ZPROP_SRC_LOCAL, str) == 0)
1742 retv = 0;
1743 if (str != NULL)
1744 kmem_free(str, strlen(str) + 1);
1745 } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
1746 /*
1747 * Now compare labels to complete the MAC check. If the
1748 * labels are equal then allow access. If the mountpoint
1749 * label dominates the dataset label, allow readonly access.
1750 * Otherwise, access is denied.
1751 */
1752 if (blequal(mnt_sl, &ds_sl))
1753 retv = 0;
1754 else if (bldominates(mnt_sl, &ds_sl)) {
1755 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1756 retv = 0;
1757 }
1758 }
1759
1760 label_rele(mnt_tsl);
1761 zone_rele(mntzone);
1762 return (retv);
1763 }
1764
1765 /*
1766 * Load a string-valued boot property and attempt to convert it to a 64-bit
1767 * unsigned integer. If the value is not present, or the conversion fails,
1768 * return the provided default value.
1769 */
1770 static uint64_t
spa_get_bootprop_uint64(const char * name,uint64_t defval)1771 spa_get_bootprop_uint64(const char *name, uint64_t defval)
1772 {
1773 char *propval;
1774 u_longlong_t r;
1775 int e;
1776
1777 if ((propval = spa_get_bootprop(name)) == NULL) {
1778 /*
1779 * The property does not exist.
1780 */
1781 return (defval);
1782 }
1783
1784 e = ddi_strtoull(propval, NULL, 10, &r);
1785
1786 spa_free_bootprop(propval);
1787
1788 /*
1789 * If the conversion succeeded, return the value. If there was any
1790 * kind of failure, just return the default value.
1791 */
1792 return (e == 0 ? r : defval);
1793 }
1794
/*
 * Root file system entry point; the action is selected by `why'.
 *
 * ROOT_INIT: import the root pool, translate the "zfs-bootfs" boot
 * property into a dataset name, mount it on vfsp, mark its root vnode
 * VROOT, publish it as rootvp and add the vfs.  Only the first ROOT_INIT
 * call proceeds; later ones fail with EBUSY.
 *
 * ROOT_REMOUNT: make the mount writable, set VFS_REMOUNT and re-register
 * the property callbacks.
 *
 * ROOT_UNMOUNT: unregister the property callbacks and sync.
 *
 * Any other value fails with ENOTSUP.
 */
static int
zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
{
	int error = 0;
	static int zfsrootdone = 0;	/* nonzero once ROOT_INIT was tried */
	zfsvfs_t *zfsvfs = NULL;
	znode_t *zp = NULL;
	vnode_t *vp = NULL;
	char *zfs_bootfs;
	char *zfs_devid;
	char *zfs_rootdisk_path;
	uint64_t zfs_bootpool;
	uint64_t zfs_bootvdev;

	ASSERT(vfsp);

	/*
	 * The filesystem that we mount as root is defined in the
	 * boot property "zfs-bootfs" with a format of
	 * "poolname/root-dataset-objnum".
	 */
	if (why == ROOT_INIT) {
		if (zfsrootdone++)
			return (SET_ERROR(EBUSY));

		/*
		 * the process of doing a spa_load will require the
		 * clock to be set before we could (for example) do
		 * something better by looking at the timestamp on
		 * an uberblock, so just set it to -1.
		 */
		clkset(-1);

		if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
			cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
			    "bootfs name");
			return (SET_ERROR(EINVAL));
		}
		zfs_devid = spa_get_bootprop("diskdevid");

		/*
		 * The boot loader may also provide us with the GUID for both
		 * the pool and the nominated boot vdev.  A GUID value of 0 is
		 * explicitly invalid (see "spa_change_guid()"), so we use this
		 * as a sentinel value when no GUID is present.
		 */
		zfs_bootpool = spa_get_bootprop_uint64("zfs-bootpool", 0);
		zfs_bootvdev = spa_get_bootprop_uint64("zfs-bootvdev", 0);

		/*
		 * If we have been given a root disk override path, we want to
		 * ignore device paths from the pool configuration and use only
		 * the specific path we were given in the boot properties.
		 */
		zfs_rootdisk_path = spa_get_bootprop("zfs-rootdisk-path");

		/*
		 * Initialise the early boot device rescan mechanism.  A scan
		 * will not actually be performed unless we need to do so in
		 * order to find the correct /devices path for a relocated
		 * device.
		 */
		vdev_disk_preroot_init(zfs_rootdisk_path);

		error = spa_import_rootpool(rootfs.bo_name, zfs_devid,
		    zfs_bootpool, zfs_bootvdev);

		/* The devid string is not used past the import. */
		spa_free_bootprop(zfs_devid);

		/*
		 * From here on every early return has to release the
		 * remaining boot property strings and shut the preroot
		 * rescan mechanism down again.
		 */
		if (error != 0) {
			spa_free_bootprop(zfs_bootfs);
			spa_free_bootprop(zfs_rootdisk_path);
			vdev_disk_preroot_fini();
			cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
			    error);
			return (error);
		}

		/* Rewrite rootfs.bo_name as the dataset name to mount. */
		if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
			spa_free_bootprop(zfs_bootfs);
			spa_free_bootprop(zfs_rootdisk_path);
			vdev_disk_preroot_fini();
			cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
			    error);
			return (error);
		}

		spa_free_bootprop(zfs_bootfs);
		spa_free_bootprop(zfs_rootdisk_path);

		if ((error = vfs_lock(vfsp)) != 0) {
			vdev_disk_preroot_fini();
			return (error);
		}

		if (error = zfs_domount(vfsp, rootfs.bo_name)) {
			cmn_err(CE_NOTE, "zfs_domount: error %d", error);
			goto out;
		}

		/* zfs_domount has already cached the root vnode for us */
		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
		ASSERT(zfsvfs);
		ASSERT(zfsvfs->z_rootdir);

		vp = zfsvfs->z_rootdir;
		mutex_enter(&vp->v_lock);
		vp->v_flag |= VROOT;
		mutex_exit(&vp->v_lock);

		/*
		 * Leave rootvp held.  The root file system is never unmounted.
		 */
		VN_HOLD(vp);
		rootvp = vp;

		vfs_add((struct vnode *)0, vfsp,
		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
out:
		/* Reached on success and on zfs_domount() failure alike. */
		vdev_disk_preroot_fini();
		vfs_unlock(vfsp);
		return (error);
	} else if (why == ROOT_REMOUNT) {
		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
		vfsp->vfs_flag |= VFS_REMOUNT;

		/* refresh mount options */
		zfs_unregister_callbacks(vfsp->vfs_data);
		return (zfs_register_callbacks(vfsp));

	} else if (why == ROOT_UNMOUNT) {
		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
		(void) zfs_sync(vfsp, 0, 0);
		return (0);
	}

	/*
	 * if "why" is equal to anything else other than ROOT_INIT,
	 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
	 */
	return (SET_ERROR(ENOTSUP));
}
1937
1938 /*ARGSUSED*/
1939 static int
zfs_mount(vfs_t * vfsp,vnode_t * mvp,struct mounta * uap,cred_t * cr)1940 zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
1941 {
1942 char *osname;
1943 pathname_t spn;
1944 int error = 0;
1945 uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ?
1946 UIO_SYSSPACE : UIO_USERSPACE;
1947 int canwrite;
1948
1949 if (mvp->v_type != VDIR)
1950 return (SET_ERROR(ENOTDIR));
1951
1952 mutex_enter(&mvp->v_lock);
1953 if ((uap->flags & MS_REMOUNT) == 0 &&
1954 (uap->flags & MS_OVERLAY) == 0 &&
1955 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
1956 mutex_exit(&mvp->v_lock);
1957 return (SET_ERROR(EBUSY));
1958 }
1959 mutex_exit(&mvp->v_lock);
1960
1961 /*
1962 * ZFS does not support passing unparsed data in via MS_DATA.
1963 * Users should use the MS_OPTIONSTR interface; this means
1964 * that all option parsing is already done and the options struct
1965 * can be interrogated.
1966 */
1967 if ((uap->flags & MS_DATA) && uap->datalen > 0)
1968 return (SET_ERROR(EINVAL));
1969
1970 /*
1971 * Get the objset name (the "special" mount argument).
1972 */
1973 if (error = pn_get(uap->spec, fromspace, &spn))
1974 return (error);
1975
1976 osname = spn.pn_path;
1977
1978 /*
1979 * Check for mount privilege?
1980 *
1981 * If we don't have privilege then see if
1982 * we have local permission to allow it
1983 */
1984 error = secpolicy_fs_mount(cr, mvp, vfsp);
1985 if (error) {
1986 if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) == 0) {
1987 vattr_t vattr;
1988
1989 /*
1990 * Make sure user is the owner of the mount point
1991 * or has sufficient privileges.
1992 */
1993
1994 vattr.va_mask = AT_UID;
1995
1996 if (VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
1997 goto out;
1998 }
1999
2000 if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 &&
2001 VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) {
2002 goto out;
2003 }
2004 secpolicy_fs_mount_clearopts(cr, vfsp);
2005 } else {
2006 goto out;
2007 }
2008 }
2009
2010 /*
2011 * Refuse to mount a filesystem if we are in a local zone and the
2012 * dataset is not visible.
2013 */
2014 if (!INGLOBALZONE(curproc) &&
2015 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
2016 error = SET_ERROR(EPERM);
2017 goto out;
2018 }
2019
2020 error = zfs_mount_label_policy(vfsp, osname);
2021 if (error)
2022 goto out;
2023
2024 /*
2025 * When doing a remount, we simply refresh our temporary properties
2026 * according to those options set in the current VFS options.
2027 */
2028 if (uap->flags & MS_REMOUNT) {
2029 /* refresh mount options */
2030 zfs_unregister_callbacks(vfsp->vfs_data);
2031 error = zfs_register_callbacks(vfsp);
2032 goto out;
2033 }
2034
2035 error = zfs_domount(vfsp, osname);
2036
2037 /*
2038 * Add an extra VFS_HOLD on our parent vfs so that it can't
2039 * disappear due to a forced unmount.
2040 */
2041 if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
2042 VFS_HOLD(mvp->v_vfsp);
2043
2044 out:
2045 pn_free(&spn);
2046 return (error);
2047 }
2048
2049 static int
zfs_statvfs(vfs_t * vfsp,struct statvfs64 * statp)2050 zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp)
2051 {
2052 zfsvfs_t *zfsvfs = vfsp->vfs_data;
2053 dev32_t d32;
2054 uint64_t refdbytes, availbytes, usedobjs, availobjs;
2055 int err = 0;
2056
2057 ZFS_ENTER(zfsvfs);
2058
2059 dmu_objset_space(zfsvfs->z_os,
2060 &refdbytes, &availbytes, &usedobjs, &availobjs);
2061
2062 /*
2063 * The underlying storage pool actually uses multiple block sizes.
2064 * We report the fragsize as the smallest block size we support,
2065 * and we report our blocksize as the filesystem's maximum blocksize.
2066 */
2067 statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
2068 statp->f_bsize = zfsvfs->z_max_blksz;
2069
2070 /*
2071 * The following report "total" blocks of various kinds in the
2072 * file system, but reported in terms of f_frsize - the
2073 * "fragment" size.
2074 */
2075
2076 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
2077 statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT;
2078 statp->f_bavail = statp->f_bfree; /* no root reservation */
2079
2080 /*
2081 * statvfs() should really be called statufs(), because it assumes
2082 * static metadata. ZFS doesn't preallocate files, so the best
2083 * we can do is report the max that could possibly fit in f_files,
2084 * and that minus the number actually used in f_ffree.
2085 * For f_ffree, report the smaller of the number of object available
2086 * and the number of blocks (each object will take at least a block).
2087 */
2088 statp->f_ffree = MIN(availobjs, statp->f_bfree);
2089 statp->f_favail = statp->f_ffree; /* no "root reservation" */
2090 statp->f_files = statp->f_ffree + usedobjs;
2091
2092 (void) cmpldev(&d32, vfsp->vfs_dev);
2093 statp->f_fsid = d32;
2094
2095 /*
2096 * We're a zfs filesystem.
2097 */
2098 (void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);
2099
2100 statp->f_flag = vf_to_stf(vfsp->vfs_flag);
2101
2102 statp->f_namemax = MAXNAMELEN - 1;
2103
2104 /*
2105 * We have all of 32 characters to stuff a string here.
2106 * Is there anything useful we could/should provide?
2107 */
2108 bzero(statp->f_fstr, sizeof (statp->f_fstr));
2109
2110 if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
2111 dmu_objset_projectquota_present(zfsvfs->z_os)) {
2112 znode_t *zp;
2113
2114 /*
2115 * In ZoL, zfs_statvfs is passed a Linux dentry (directory
2116 * entry), instead of a vfsp. The ZoL code uses the dentry
2117 * to get the znode from the dentry's inode. This represents
2118 * whatever filename was passed to the user-level statvfs
2119 * syscall.
2120 *
2121 * We're using the VFS root znode here, so this represents a
2122 * potential difference from ZoL.
2123 */
2124 if (zfs_zget(zfsvfs, zfsvfs->z_root, &zp) == 0) {
2125 uint32_t bshift = ddi_fls(statp->f_bsize) - 1;
2126
2127 if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid &&
2128 zpl_is_valid_projid(zp->z_projid))
2129 err = zfs_statfs_project(zfsvfs, zp, statp,
2130 bshift);
2131 VN_RELE(ZTOV(zp));
2132 }
2133 }
2134
2135 ZFS_EXIT(zfsvfs);
2136 return (err);
2137 }
2138
2139 static int
zfs_root(vfs_t * vfsp,vnode_t ** vpp)2140 zfs_root(vfs_t *vfsp, vnode_t **vpp)
2141 {
2142 zfsvfs_t *zfsvfs = vfsp->vfs_data;
2143 struct vnode *vp;
2144 int error;
2145
2146 ZFS_ENTER(zfsvfs);
2147
2148 vp = zfsvfs->z_rootdir;
2149 if (vp != NULL) {
2150 VN_HOLD(vp);
2151 error = 0;
2152 } else {
2153 /* forced unmount */
2154 error = EIO;
2155 }
2156 *vpp = vp;
2157
2158 ZFS_EXIT(zfsvfs);
2159 return (error);
2160
2161 }
2162
2163 /*
2164 * Teardown the zfsvfs::z_os.
2165 *
2166 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
2167 * and 'z_teardown_inactive_lock' held.
2168 */
2169 static int
zfsvfs_teardown(zfsvfs_t * zfsvfs,boolean_t unmounting)2170 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
2171 {
2172 znode_t *zp;
2173
2174 zfs_unlinked_drain_stop_wait(zfsvfs);
2175
2176 rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
2177
2178 if (!unmounting) {
2179 /*
2180 * We purge the parent filesystem's vfsp as the parent
2181 * filesystem and all of its snapshots have their vnode's
2182 * v_vfsp set to the parent's filesystem's vfsp. Note,
2183 * 'z_parent' is self referential for non-snapshots.
2184 */
2185 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
2186 }
2187
2188 /*
2189 * Close the zil. NB: Can't close the zil while zfs_inactive
2190 * threads are blocked as zil_close can call zfs_inactive.
2191 */
2192 if (zfsvfs->z_log) {
2193 zil_close(zfsvfs->z_log);
2194 zfsvfs->z_log = NULL;
2195 }
2196
2197 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
2198
2199 /*
2200 * If we are not unmounting (ie: online recv) and someone already
2201 * unmounted this file system while we were doing the switcheroo,
2202 * or a reopen of z_os failed then just bail out now.
2203 */
2204 if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
2205 rw_exit(&zfsvfs->z_teardown_inactive_lock);
2206 rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
2207 return (SET_ERROR(EIO));
2208 }
2209
2210 /*
2211 * At this point there are no vops active, and any new vops will
2212 * fail with EIO since we have z_teardown_lock for writer (only
2213 * relavent for forced unmount).
2214 *
2215 * Release all holds on dbufs.
2216 */
2217 mutex_enter(&zfsvfs->z_znodes_lock);
2218 for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
2219 zp = list_next(&zfsvfs->z_all_znodes, zp))
2220 if (zp->z_sa_hdl) {
2221 ASSERT(ZTOV(zp)->v_count > 0);
2222 zfs_znode_dmu_fini(zp);
2223 }
2224 mutex_exit(&zfsvfs->z_znodes_lock);
2225
2226 /*
2227 * If we are unmounting, set the unmounted flag and let new vops
2228 * unblock. zfs_inactive will have the unmounted behavior, and all
2229 * other vops will fail with EIO.
2230 */
2231 if (unmounting) {
2232 /*
2233 * Clear the cached root vnode now that we are unmounted.
2234 * Its release must be performed outside the teardown locks to
2235 * avoid recursive lock entry via zfs_inactive().
2236 */
2237 vnode_t *vp = zfsvfs->z_rootdir;
2238 zfsvfs->z_rootdir = NULL;
2239
2240 zfsvfs->z_unmounted = B_TRUE;
2241 rw_exit(&zfsvfs->z_teardown_inactive_lock);
2242 rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
2243
2244 /* Drop the cached root vp now that it is safe */
2245 VN_RELE(vp);
2246 }
2247
2248 /*
2249 * z_os will be NULL if there was an error in attempting to reopen
2250 * zfsvfs, so just return as the properties had already been
2251 * unregistered and cached data had been evicted before.
2252 */
2253 if (zfsvfs->z_os == NULL)
2254 return (0);
2255
2256 /*
2257 * Unregister properties.
2258 */
2259 zfs_unregister_callbacks(zfsvfs);
2260
2261 /*
2262 * Evict cached data
2263 */
2264 if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
2265 !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
2266 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
2267 dmu_objset_evict_dbufs(zfsvfs->z_os);
2268
2269 return (0);
2270 }
2271
2272 /*ARGSUSED*/
2273 static int
zfs_umount(vfs_t * vfsp,int fflag,cred_t * cr)2274 zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
2275 {
2276 zfsvfs_t *zfsvfs = vfsp->vfs_data;
2277 objset_t *os;
2278 int ret;
2279
2280 ret = secpolicy_fs_unmount(cr, vfsp);
2281 if (ret) {
2282 if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
2283 ZFS_DELEG_PERM_MOUNT, cr))
2284 return (ret);
2285 }
2286
2287 /*
2288 * We purge the parent filesystem's vfsp as the parent filesystem
2289 * and all of its snapshots have their vnode's v_vfsp set to the
2290 * parent's filesystem's vfsp. Note, 'z_parent' is self
2291 * referential for non-snapshots.
2292 */
2293 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
2294
2295 /*
2296 * Unmount any snapshots mounted under .zfs before unmounting the
2297 * dataset itself.
2298 */
2299 if (zfsvfs->z_ctldir != NULL &&
2300 (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
2301 return (ret);
2302 }
2303
2304 if (!(fflag & MS_FORCE)) {
2305 /*
2306 * Check the number of active vnodes in the file system.
2307 * Our count is maintained in the vfs structure, but the
2308 * number is off by 1 to indicate a hold on the vfs
2309 * structure itself.
2310 */
2311 boolean_t draining;
2312 uint_t thresh = 1;
2313 vnode_t *ctlvp, *rvp;
2314
2315 /*
2316 * The cached vnode for the root directory of the mount also
2317 * maintains a hold on the vfs structure.
2318 */
2319 rvp = zfsvfs->z_rootdir;
2320 thresh++;
2321
2322 /*
2323 * The '.zfs' directory maintains a reference of its own, and
2324 * any active references underneath are reflected in the vnode
2325 * count. Allow one additional reference for it.
2326 */
2327 ctlvp = zfsvfs->z_ctldir;
2328 if (ctlvp != NULL) {
2329 thresh++;
2330 }
2331
2332 /*
2333 * If it's running, the asynchronous unlinked drain task needs
2334 * to be stopped before the number of active vnodes can be
2335 * reliably checked.
2336 */
2337 draining = zfsvfs->z_draining;
2338 if (draining)
2339 zfs_unlinked_drain_stop_wait(zfsvfs);
2340
2341 if (vfsp->vfs_count > thresh || rvp->v_count > 1 ||
2342 (ctlvp != NULL && ctlvp->v_count > 1)) {
2343 if (draining) {
2344 /* If it was draining, restart the task */
2345 zfs_unlinked_drain(zfsvfs);
2346 }
2347 return (SET_ERROR(EBUSY));
2348 }
2349 }
2350
2351 vfsp->vfs_flag |= VFS_UNMOUNTED;
2352
2353 VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
2354 os = zfsvfs->z_os;
2355
2356 /*
2357 * z_os will be NULL if there was an error in
2358 * attempting to reopen zfsvfs.
2359 */
2360 if (os != NULL) {
2361 /*
2362 * Unset the objset user_ptr.
2363 */
2364 mutex_enter(&os->os_user_ptr_lock);
2365 dmu_objset_set_user(os, NULL);
2366 mutex_exit(&os->os_user_ptr_lock);
2367
2368 /*
2369 * Finally release the objset
2370 */
2371 dmu_objset_disown(os, B_TRUE, zfsvfs);
2372 }
2373
2374 /*
2375 * We can now safely destroy the '.zfs' directory node.
2376 */
2377 if (zfsvfs->z_ctldir != NULL)
2378 zfsctl_destroy(zfsvfs);
2379
2380 return (0);
2381 }
2382
2383 static int
zfs_vget(vfs_t * vfsp,vnode_t ** vpp,fid_t * fidp)2384 zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
2385 {
2386 zfsvfs_t *zfsvfs = vfsp->vfs_data;
2387 znode_t *zp;
2388 uint64_t object = 0;
2389 uint64_t fid_gen = 0;
2390 uint64_t gen_mask;
2391 uint64_t zp_gen;
2392 int i, err;
2393
2394 *vpp = NULL;
2395
2396 ZFS_ENTER(zfsvfs);
2397
2398 if (fidp->fid_len == LONG_FID_LEN) {
2399 zfid_long_t *zlfid = (zfid_long_t *)fidp;
2400 uint64_t objsetid = 0;
2401 uint64_t setgen = 0;
2402
2403 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
2404 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
2405
2406 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
2407 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
2408
2409 ZFS_EXIT(zfsvfs);
2410
2411 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
2412 if (err)
2413 return (SET_ERROR(EINVAL));
2414 ZFS_ENTER(zfsvfs);
2415 }
2416
2417 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
2418 zfid_short_t *zfid = (zfid_short_t *)fidp;
2419
2420 for (i = 0; i < sizeof (zfid->zf_object); i++)
2421 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
2422
2423 for (i = 0; i < sizeof (zfid->zf_gen); i++)
2424 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
2425 } else {
2426 ZFS_EXIT(zfsvfs);
2427 return (SET_ERROR(EINVAL));
2428 }
2429
2430 /* A zero fid_gen means we are in the .zfs control directories */
2431 if (fid_gen == 0 &&
2432 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
2433 *vpp = zfsvfs->z_ctldir;
2434 ASSERT(*vpp != NULL);
2435 if (object == ZFSCTL_INO_SNAPDIR) {
2436 VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
2437 0, NULL, NULL, NULL, NULL, NULL) == 0);
2438 } else {
2439 VN_HOLD(*vpp);
2440 }
2441 ZFS_EXIT(zfsvfs);
2442 return (0);
2443 }
2444
2445 gen_mask = -1ULL >> (64 - 8 * i);
2446
2447 dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
2448 if (err = zfs_zget(zfsvfs, object, &zp)) {
2449 ZFS_EXIT(zfsvfs);
2450 return (err);
2451 }
2452 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
2453 sizeof (uint64_t));
2454 zp_gen = zp_gen & gen_mask;
2455 if (zp_gen == 0)
2456 zp_gen = 1;
2457 if (zp->z_unlinked || zp_gen != fid_gen) {
2458 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
2459 VN_RELE(ZTOV(zp));
2460 ZFS_EXIT(zfsvfs);
2461 return (SET_ERROR(EINVAL));
2462 }
2463
2464 *vpp = ZTOV(zp);
2465 ZFS_EXIT(zfsvfs);
2466 return (0);
2467 }
2468
2469 /*
2470 * Block out VOPs and close zfsvfs_t::z_os
2471 *
2472 * Note, if successful, then we return with the 'z_teardown_lock' and
2473 * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
2474 * dataset and objset intact so that they can be atomically handed off during
2475 * a subsequent rollback or recv operation and the resume thereafter.
2476 */
2477 int
zfs_suspend_fs(zfsvfs_t * zfsvfs)2478 zfs_suspend_fs(zfsvfs_t *zfsvfs)
2479 {
2480 int error;
2481
2482 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
2483 return (error);
2484
2485 return (0);
2486 }
2487
2488 /*
2489 * Rebuild SA and release VOPs. Note that ownership of the underlying dataset
2490 * is an invariant across any of the operations that can be performed while the
2491 * filesystem was suspended. Whether it succeeded or failed, the preconditions
2492 * are the same: the relevant objset and associated dataset are owned by
2493 * zfsvfs, held, and long held on entry.
2494 */
2495 int
zfs_resume_fs(zfsvfs_t * zfsvfs,dsl_dataset_t * ds)2496 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
2497 {
2498 int err;
2499 znode_t *zp;
2500
2501 ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
2502 ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
2503
2504 /*
2505 * We already own this, so just update the objset_t, as the one we
2506 * had before may have been evicted.
2507 */
2508 objset_t *os;
2509 VERIFY3P(ds->ds_owner, ==, zfsvfs);
2510 VERIFY(dsl_dataset_long_held(ds));
2511 VERIFY0(dmu_objset_from_ds(ds, &os));
2512
2513 err = zfsvfs_init(zfsvfs, os);
2514 if (err != 0)
2515 goto bail;
2516
2517 VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
2518
2519 zfs_set_fuid_feature(zfsvfs);
2520
2521 /*
2522 * Attempt to re-establish all the active znodes with
2523 * their dbufs. If a zfs_rezget() fails, then we'll let
2524 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
2525 * when they try to use their znode.
2526 */
2527 mutex_enter(&zfsvfs->z_znodes_lock);
2528 for (zp = list_head(&zfsvfs->z_all_znodes); zp;
2529 zp = list_next(&zfsvfs->z_all_znodes, zp)) {
2530 (void) zfs_rezget(zp);
2531 }
2532 mutex_exit(&zfsvfs->z_znodes_lock);
2533
2534 if (((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) &&
2535 !zfsvfs->z_unmounted) {
2536 /*
2537 * zfs_suspend_fs() could have interrupted freeing
2538 * of dnodes. We need to restart this freeing so
2539 * that we don't "leak" the space.
2540 */
2541 zfs_unlinked_drain(zfsvfs);
2542 }
2543
2544 bail:
2545 /* release the VOPs */
2546 rw_exit(&zfsvfs->z_teardown_inactive_lock);
2547 rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
2548
2549 if (err) {
2550 /*
2551 * Since we couldn't setup the sa framework, try to force
2552 * unmount this file system.
2553 */
2554 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
2555 (void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
2556 }
2557 return (err);
2558 }
2559
2560 static void
zfs_freevfs(vfs_t * vfsp)2561 zfs_freevfs(vfs_t *vfsp)
2562 {
2563 zfsvfs_t *zfsvfs = vfsp->vfs_data;
2564
2565 /*
2566 * If this is a snapshot, we have an extra VFS_HOLD on our parent
2567 * from zfs_mount(). Release it here. If we came through
2568 * zfs_mountroot() instead, we didn't grab an extra hold, so
2569 * skip the VFS_RELE for rootvfs.
2570 */
2571 if (zfsvfs->z_issnap && (vfsp != rootvfs))
2572 VFS_RELE(zfsvfs->z_parent->z_vfs);
2573
2574 zfsvfs_free(zfsvfs);
2575
2576 atomic_dec_32(&zfs_active_fs_count);
2577 }
2578
2579 /*
2580 * VFS_INIT() initialization. Note that there is no VFS_FINI(),
2581 * so we can't safely do any non-idempotent initialization here.
2582 * Leave that to zfs_init() and zfs_fini(), which are called
2583 * from the module's _init() and _fini() entry points.
2584 */
2585 /*ARGSUSED*/
2586 static int
zfs_vfsinit(int fstype,char * name)2587 zfs_vfsinit(int fstype, char *name)
2588 {
2589 int error;
2590
2591 zfsfstype = fstype;
2592
2593 /*
2594 * Setup vfsops and vnodeops tables.
2595 */
2596 error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
2597 if (error != 0) {
2598 cmn_err(CE_WARN, "zfs: bad vfs ops template");
2599 }
2600
2601 error = zfs_create_op_tables();
2602 if (error) {
2603 zfs_remove_op_tables();
2604 cmn_err(CE_WARN, "zfs: bad vnode ops template");
2605 (void) vfs_freevfsops_by_type(zfsfstype);
2606 return (error);
2607 }
2608
2609 mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
2610
2611 /*
2612 * Unique major number for all zfs mounts.
2613 * If we run out of 32-bit minors, we'll getudev() another major.
2614 */
2615 zfs_major = ddi_name_to_major(ZFS_DRIVER);
2616 zfs_minor = ZFS_MIN_MINOR;
2617
2618 return (0);
2619 }
2620
2621 void
zfs_init(void)2622 zfs_init(void)
2623 {
2624 /*
2625 * Initialize .zfs directory structures
2626 */
2627 zfsctl_init();
2628
2629 /*
2630 * Initialize znode cache, vnode ops, etc...
2631 */
2632 zfs_znode_init();
2633
2634 dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
2635 }
2636
/*
 * Counterpart of zfs_init(): finalize the '.zfs' control directory support
 * and the znode layer.
 */
void
zfs_fini(void)
{
	zfsctl_fini();
	zfs_znode_fini();
}
2643
2644 int
zfs_busy(void)2645 zfs_busy(void)
2646 {
2647 return (zfs_active_fs_count != 0);
2648 }
2649
2650 int
zfs_set_version(zfsvfs_t * zfsvfs,uint64_t newvers)2651 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2652 {
2653 int error;
2654 objset_t *os = zfsvfs->z_os;
2655 dmu_tx_t *tx;
2656
2657 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2658 return (SET_ERROR(EINVAL));
2659
2660 if (newvers < zfsvfs->z_version)
2661 return (SET_ERROR(EINVAL));
2662
2663 if (zfs_spa_version_map(newvers) >
2664 spa_version(dmu_objset_spa(zfsvfs->z_os)))
2665 return (SET_ERROR(ENOTSUP));
2666
2667 tx = dmu_tx_create(os);
2668 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2669 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2670 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2671 ZFS_SA_ATTRS);
2672 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2673 }
2674 error = dmu_tx_assign(tx, TXG_WAIT);
2675 if (error) {
2676 dmu_tx_abort(tx);
2677 return (error);
2678 }
2679
2680 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2681 8, 1, &newvers, tx);
2682
2683 if (error) {
2684 dmu_tx_commit(tx);
2685 return (error);
2686 }
2687
2688 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2689 uint64_t sa_obj;
2690
2691 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2692 SPA_VERSION_SA);
2693 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2694 DMU_OT_NONE, 0, tx);
2695
2696 error = zap_add(os, MASTER_NODE_OBJ,
2697 ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2698 ASSERT0(error);
2699
2700 VERIFY(0 == sa_set_sa_object(os, sa_obj));
2701 sa_register_update_callback(os, zfs_sa_upgrade);
2702 }
2703
2704 spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
2705 "from %llu to %llu", zfsvfs->z_version, newvers);
2706
2707 dmu_tx_commit(tx);
2708
2709 zfsvfs->z_version = newvers;
2710 os->os_version = newvers;
2711
2712 zfs_set_fuid_feature(zfsvfs);
2713
2714 return (0);
2715 }
2716
2717 /*
2718 * Read a property stored within the master node.
2719 */
2720 int
zfs_get_zplprop(objset_t * os,zfs_prop_t prop,uint64_t * value)2721 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2722 {
2723 uint64_t *cached_copy = NULL;
2724
2725 /*
2726 * Figure out where in the objset_t the cached copy would live, if it
2727 * is available for the requested property.
2728 */
2729 if (os != NULL) {
2730 switch (prop) {
2731 case ZFS_PROP_VERSION:
2732 cached_copy = &os->os_version;
2733 break;
2734 case ZFS_PROP_NORMALIZE:
2735 cached_copy = &os->os_normalization;
2736 break;
2737 case ZFS_PROP_UTF8ONLY:
2738 cached_copy = &os->os_utf8only;
2739 break;
2740 case ZFS_PROP_CASE:
2741 cached_copy = &os->os_casesensitivity;
2742 break;
2743 default:
2744 break;
2745 }
2746 }
2747 if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
2748 *value = *cached_copy;
2749 return (0);
2750 }
2751
2752 /*
2753 * If the property wasn't cached, look up the file system's value for
2754 * the property. For the version property, we look up a slightly
2755 * different string.
2756 */
2757 const char *pname;
2758 int error = ENOENT;
2759 if (prop == ZFS_PROP_VERSION) {
2760 pname = ZPL_VERSION_STR;
2761 } else {
2762 pname = zfs_prop_to_name(prop);
2763 }
2764
2765 if (os != NULL) {
2766 ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
2767 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2768 }
2769
2770 if (error == ENOENT) {
2771 /* No value set, use the default value */
2772 switch (prop) {
2773 case ZFS_PROP_VERSION:
2774 *value = ZPL_VERSION;
2775 break;
2776 case ZFS_PROP_NORMALIZE:
2777 case ZFS_PROP_UTF8ONLY:
2778 *value = 0;
2779 break;
2780 case ZFS_PROP_CASE:
2781 *value = ZFS_CASE_SENSITIVE;
2782 break;
2783 default:
2784 return (error);
2785 }
2786 error = 0;
2787 }
2788
2789 /*
2790 * If one of the methods for getting the property value above worked,
2791 * copy it into the objset_t's cache.
2792 */
2793 if (error == 0 && cached_copy != NULL) {
2794 *cached_copy = *value;
2795 }
2796
2797 return (error);
2798 }
2799
2800 /*
2801 * Return true if the coresponding vfs's unmounted flag is set.
2802 * Otherwise return false.
2803 * If this function returns true we know VFS unmount has been initiated.
2804 */
2805 boolean_t
zfs_get_vfs_flag_unmounted(objset_t * os)2806 zfs_get_vfs_flag_unmounted(objset_t *os)
2807 {
2808 zfsvfs_t *zfvp;
2809 boolean_t unmounted = B_FALSE;
2810
2811 ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
2812
2813 mutex_enter(&os->os_user_ptr_lock);
2814 zfvp = dmu_objset_get_user(os);
2815 if (zfvp != NULL && zfvp->z_vfs != NULL &&
2816 (zfvp->z_vfs->vfs_flag & VFS_UNMOUNTED))
2817 unmounted = B_TRUE;
2818 mutex_exit(&os->os_user_ptr_lock);
2819
2820 return (unmounted);
2821 }
2822
2823 /*
2824 * Takes a dataset, a property, a value and that value's setpoint as
2825 * found in the ZAP. Checks if the property has been changed in the vfs.
2826 * If so, val and setpoint will be overwritten with updated content.
2827 * Otherwise, they are left unchanged.
2828 *
2829 * OpenZFS moved it to os specific zfs_vfsops.c, we keep it here for now.
2830 */
2831 int
zfs_get_temporary_prop(dsl_dataset_t * ds,zfs_prop_t zfs_prop,uint64_t * val,char * setpoint)2832 zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
2833 char *setpoint)
2834 {
2835 int error;
2836 zfsvfs_t *zfvp;
2837 vfs_t *vfsp;
2838 objset_t *os;
2839 uint64_t tmp = *val;
2840
2841 error = dmu_objset_from_ds(ds, &os);
2842 if (error != 0)
2843 return (error);
2844
2845 error = getzfsvfs_impl(os, &zfvp);
2846 if (error != 0)
2847 return (error);
2848
2849 vfsp = zfvp->z_vfs;
2850
2851 switch (zfs_prop) {
2852 case ZFS_PROP_ATIME:
2853 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
2854 tmp = 0;
2855 if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
2856 tmp = 1;
2857 break;
2858 case ZFS_PROP_DEVICES:
2859 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
2860 tmp = 0;
2861 if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
2862 tmp = 1;
2863 break;
2864 case ZFS_PROP_EXEC:
2865 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
2866 tmp = 0;
2867 if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
2868 tmp = 1;
2869 break;
2870 case ZFS_PROP_SETUID:
2871 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
2872 tmp = 0;
2873 if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
2874 tmp = 1;
2875 break;
2876 case ZFS_PROP_READONLY:
2877 if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
2878 tmp = 0;
2879 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
2880 tmp = 1;
2881 break;
2882 case ZFS_PROP_XATTR:
2883 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL))
2884 tmp = 0;
2885 if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
2886 tmp = 1;
2887 break;
2888 case ZFS_PROP_NBMAND:
2889 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
2890 tmp = 0;
2891 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
2892 tmp = 1;
2893 break;
2894 default:
2895 VFS_RELE(vfsp);
2896 return (ENOENT);
2897 }
2898
2899 VFS_RELE(vfsp);
2900 if (tmp != *val) {
2901 if (setpoint != NULL)
2902 (void) strcpy(setpoint, "temporary");
2903 *val = tmp;
2904 }
2905 return (0);
2906 }
2907
2908 static vfsdef_t vfw = {
2909 VFSDEF_VERSION,
2910 MNTTYPE_ZFS,
2911 zfs_vfsinit,
2912 VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS|
2913 VSW_XID|VSW_ZMOUNT,
2914 &zfs_mntopts
2915 };
2916
2917 struct modlfs zfs_modlfs = {
2918 &mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw
2919 };
2920