1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 24 * Copyright (c) 2014 Integros [integros.com] 25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 26 * Copyright 2019 Joyent, Inc. 27 * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org> 28 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. 29 * Copyright 2024 Oxide Computer Company 30 * Copyright 2025 MNX Cloud, Inc. 31 */ 32 33 /* Portions Copyright 2010 Robert Milkowski */ 34 35 #include <sys/types.h> 36 #include <sys/param.h> 37 #include <sys/systm.h> 38 #include <sys/sysmacros.h> 39 #include <sys/kmem.h> 40 #include <sys/pathname.h> 41 #include <sys/vnode.h> 42 #include <sys/vfs.h> 43 #include <sys/vfs_opreg.h> 44 #include <sys/mntent.h> 45 #include <sys/mount.h> 46 #include <sys/cmn_err.h> 47 #include "fs/fs_subr.h" 48 #include <sys/zfs_znode.h> 49 #include <sys/zfs_dir.h> 50 #include <sys/zil.h> 51 #include <sys/fs/zfs.h> 52 #include <sys/dmu.h> 53 #include <sys/dsl_prop.h> 54 #include <sys/dsl_dataset.h> 55 #include <sys/dsl_deleg.h> 56 #include <sys/spa.h> 57 #include <sys/zap.h> 58 #include <sys/sa.h> 59 #include <sys/sa_impl.h> 60 #include <sys/varargs.h> 61 #include <sys/policy.h> 62 #include <sys/atomic.h> 63 #include <sys/mkdev.h> 64 #include <sys/modctl.h> 65 #include <sys/refstr.h> 66 #include <sys/zfs_ioctl.h> 67 #include <sys/zfs_ctldir.h> 68 #include <sys/zfs_fuid.h> 69 #include <sys/bootconf.h> 70 #include <sys/ddi.h> 71 #include <sys/sunddi.h> 72 #include <sys/dnlc.h> 73 #include <sys/dmu_objset.h> 74 #include <sys/spa_boot.h> 75 #include <sys/vdev_impl.h> 76 #include <sys/ilstr.h> 77 #include "zfs_comutil.h" 78 79 int zfsfstype; 80 vfsops_t *zfs_vfsops = NULL; 81 static major_t zfs_major; 82 static minor_t zfs_minor; 83 static kmutex_t zfs_dev_mtx; 84 85 extern int sys_shutdown; 86 87 static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr); 88 static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr); 89 static int zfs_mountroot(vfs_t *vfsp, enum whymountroot); 90 static int zfs_root(vfs_t *vfsp, vnode_t **vpp); 91 static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp); 92 static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp); 93 static void zfs_freevfs(vfs_t *vfsp); 94 static int zfs_syncfs(vfs_t *vfsp, uint64_t flags, cred_t *cr); 95 96 static const fs_operation_def_t zfs_vfsops_template[] = { 97 VFSNAME_MOUNT, { .vfs_mount = zfs_mount }, 98 VFSNAME_MOUNTROOT, { .vfs_mountroot = zfs_mountroot }, 99 VFSNAME_UNMOUNT, { .vfs_unmount = zfs_umount }, 100 VFSNAME_ROOT, { .vfs_root = zfs_root }, 101 VFSNAME_STATVFS, { .vfs_statvfs = zfs_statvfs }, 102 VFSNAME_SYNC, { .vfs_sync = zfs_sync }, 103 VFSNAME_VGET, { .vfs_vget = zfs_vget }, 104 VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs }, 105 VFSNAME_SYNCFS, { .vfs_syncfs = zfs_syncfs }, 106 NULL, NULL 107 }; 108 109 /* 110 * We need to keep a count of active fs's. 111 * This is necessary to prevent our module 112 * from being unloaded after a umount -f 113 */ 114 static uint32_t zfs_active_fs_count = 0; 115 116 static char *noatime_cancel[] = { MNTOPT_ATIME, NULL }; 117 static char *atime_cancel[] = { MNTOPT_NOATIME, NULL }; 118 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL }; 119 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL }; 120 121 /* 122 * MO_DEFAULT is not used since the default value is determined 123 * by the equivalent property. 124 */ 125 static mntopt_t mntopts[] = { 126 { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL }, 127 { MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL }, 128 { MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL }, 129 { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL } 130 }; 131 132 static mntopts_t zfs_mntopts = { 133 sizeof (mntopts) / sizeof (mntopt_t), 134 mntopts 135 }; 136 137 /*ARGSUSED*/ 138 int 139 zfs_sync(vfs_t *vfsp, short flag, cred_t *cr) 140 { 141 /* 142 * Data integrity is job one. We don't want a compromised kernel 143 * writing to the storage pool, so we never sync during panic. 144 */ 145 if (panicstr) 146 return (0); 147 148 /* 149 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS 150 * to sync metadata, which they would otherwise cache indefinitely. 151 * Semantically, the only requirement is that the sync be initiated. 152 * The DMU syncs out txgs frequently, so there's nothing to do. 153 */ 154 if (flag & SYNC_ATTR) 155 return (0); 156 157 if (vfsp != NULL) { 158 /* 159 * Sync a specific filesystem. 160 */ 161 zfsvfs_t *zfsvfs = vfsp->vfs_data; 162 dsl_pool_t *dp; 163 164 ZFS_ENTER(zfsvfs); 165 dp = dmu_objset_pool(zfsvfs->z_os); 166 167 /* 168 * If the system is shutting down, then skip any 169 * filesystems which may exist on a suspended pool. 170 */ 171 if (sys_shutdown && spa_suspended(dp->dp_spa)) { 172 ZFS_EXIT(zfsvfs); 173 return (0); 174 } 175 176 if (zfsvfs->z_log != NULL) 177 zil_commit(zfsvfs->z_log, 0); 178 179 ZFS_EXIT(zfsvfs); 180 } else { 181 /* 182 * Sync all ZFS filesystems. This is what happens when you 183 * run sync(8). Unlike other filesystems, ZFS honors the 184 * request by waiting for all pools to commit all dirty data. 185 */ 186 spa_sync_allpools(); 187 } 188 189 return (0); 190 } 191 192 /* 193 * This is a synchronous request to sync all file system data out. 194 */ 195 static int 196 zfs_syncfs(vfs_t *vfsp, uint64_t flags, cred_t *cr) 197 { 198 if (flags != 0) { 199 return (ENOTSUP); 200 } 201 202 return (zfs_sync(vfsp, 0, cr)); 203 } 204 205 static int 206 zfs_create_unique_device(dev_t *dev) 207 { 208 major_t new_major; 209 210 do { 211 ASSERT3U(zfs_minor, <=, MAXMIN32); 212 minor_t start = zfs_minor; 213 do { 214 mutex_enter(&zfs_dev_mtx); 215 if (zfs_minor >= MAXMIN32) { 216 /* 217 * If we're still using the real major 218 * keep out of /dev/zfs and /dev/zvol minor 219 * number space. If we're using a getudev()'ed 220 * major number, we can use all of its minors. 221 */ 222 if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) 223 zfs_minor = ZFS_MIN_MINOR; 224 else 225 zfs_minor = 0; 226 } else { 227 zfs_minor++; 228 } 229 *dev = makedevice(zfs_major, zfs_minor); 230 mutex_exit(&zfs_dev_mtx); 231 } while (vfs_devismounted(*dev) && zfs_minor != start); 232 if (zfs_minor == start) { 233 /* 234 * We are using all ~262,000 minor numbers for the 235 * current major number. Create a new major number. 236 */ 237 if ((new_major = getudev()) == (major_t)-1) { 238 cmn_err(CE_WARN, 239 "zfs_mount: Can't get unique major " 240 "device number."); 241 return (-1); 242 } 243 mutex_enter(&zfs_dev_mtx); 244 zfs_major = new_major; 245 zfs_minor = 0; 246 247 mutex_exit(&zfs_dev_mtx); 248 } else { 249 break; 250 } 251 /* CONSTANTCONDITION */ 252 } while (1); 253 254 return (0); 255 } 256 257 static void 258 atime_changed_cb(void *arg, uint64_t newval) 259 { 260 zfsvfs_t *zfsvfs = arg; 261 262 if (newval == TRUE) { 263 zfsvfs->z_atime = TRUE; 264 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); 265 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); 266 } else { 267 zfsvfs->z_atime = FALSE; 268 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); 269 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); 270 } 271 } 272 273 static void 274 xattr_changed_cb(void *arg, uint64_t newval) 275 { 276 zfsvfs_t *zfsvfs = arg; 277 278 if (newval == TRUE) { 279 /* XXX locking on vfs_flag? */ 280 zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; 281 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); 282 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); 283 } else { 284 /* XXX locking on vfs_flag? */ 285 zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; 286 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); 287 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); 288 } 289 } 290 291 static void 292 blksz_changed_cb(void *arg, uint64_t newval) 293 { 294 zfsvfs_t *zfsvfs = arg; 295 ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); 296 ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); 297 ASSERT(ISP2(newval)); 298 299 zfsvfs->z_max_blksz = newval; 300 zfsvfs->z_vfs->vfs_bsize = newval; 301 } 302 303 static void 304 readonly_changed_cb(void *arg, uint64_t newval) 305 { 306 zfsvfs_t *zfsvfs = arg; 307 308 if (newval) { 309 /* XXX locking on vfs_flag? */ 310 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; 311 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); 312 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); 313 } else { 314 /* XXX locking on vfs_flag? */ 315 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 316 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); 317 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); 318 } 319 } 320 321 static void 322 devices_changed_cb(void *arg, uint64_t newval) 323 { 324 zfsvfs_t *zfsvfs = arg; 325 326 if (newval == FALSE) { 327 zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES; 328 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES); 329 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0); 330 } else { 331 zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES; 332 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES); 333 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0); 334 } 335 } 336 337 static void 338 setuid_changed_cb(void *arg, uint64_t newval) 339 { 340 zfsvfs_t *zfsvfs = arg; 341 342 if (newval == FALSE) { 343 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; 344 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); 345 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); 346 } else { 347 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; 348 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); 349 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); 350 } 351 } 352 353 static void 354 exec_changed_cb(void *arg, uint64_t newval) 355 { 356 zfsvfs_t *zfsvfs = arg; 357 358 if (newval == FALSE) { 359 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; 360 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); 361 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); 362 } else { 363 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; 364 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); 365 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); 366 } 367 } 368 369 /* 370 * The nbmand mount option can be changed at mount time. 371 * We can't allow it to be toggled on live file systems or incorrect 372 * behavior may be seen from cifs clients 373 * 374 * This property isn't registered via dsl_prop_register(), but this callback 375 * will be called when a file system is first mounted 376 */ 377 static void 378 nbmand_changed_cb(void *arg, uint64_t newval) 379 { 380 zfsvfs_t *zfsvfs = arg; 381 if (newval == FALSE) { 382 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); 383 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); 384 } else { 385 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); 386 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); 387 } 388 } 389 390 static void 391 snapdir_changed_cb(void *arg, uint64_t newval) 392 { 393 zfsvfs_t *zfsvfs = arg; 394 395 zfsvfs->z_show_ctldir = newval; 396 } 397 398 static void 399 vscan_changed_cb(void *arg, uint64_t newval) 400 { 401 zfsvfs_t *zfsvfs = arg; 402 403 zfsvfs->z_vscan = newval; 404 } 405 406 static void 407 acl_mode_changed_cb(void *arg, uint64_t newval) 408 { 409 zfsvfs_t *zfsvfs = arg; 410 411 zfsvfs->z_acl_mode = newval; 412 } 413 414 static void 415 acl_inherit_changed_cb(void *arg, uint64_t newval) 416 { 417 zfsvfs_t *zfsvfs = arg; 418 419 zfsvfs->z_acl_inherit = newval; 420 } 421 422 static void 423 acl_implicit_changed_cb(void *arg, uint64_t newval) 424 { 425 zfsvfs_t *zfsvfs = arg; 426 427 zfsvfs->z_acl_implicit = (boolean_t)newval; 428 } 429 430 static int 431 zfs_register_callbacks(vfs_t *vfsp) 432 { 433 struct dsl_dataset *ds = NULL; 434 objset_t *os = NULL; 435 zfsvfs_t *zfsvfs = NULL; 436 uint64_t nbmand; 437 boolean_t readonly = B_FALSE; 438 boolean_t do_readonly = B_FALSE; 439 boolean_t setuid = B_FALSE; 440 boolean_t do_setuid = B_FALSE; 441 boolean_t exec = B_FALSE; 442 boolean_t do_exec = B_FALSE; 443 boolean_t devices = B_FALSE; 444 boolean_t do_devices = B_FALSE; 445 boolean_t xattr = B_FALSE; 446 boolean_t do_xattr = B_FALSE; 447 boolean_t atime = B_FALSE; 448 boolean_t do_atime = B_FALSE; 449 int error = 0; 450 451 ASSERT(vfsp); 452 zfsvfs = vfsp->vfs_data; 453 ASSERT(zfsvfs); 454 os = zfsvfs->z_os; 455 456 /* 457 * The act of registering our callbacks will destroy any mount 458 * options we may have. In order to enable temporary overrides 459 * of mount options, we stash away the current values and 460 * restore them after we register the callbacks. 461 */ 462 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || 463 !spa_writeable(dmu_objset_spa(os))) { 464 readonly = B_TRUE; 465 do_readonly = B_TRUE; 466 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { 467 readonly = B_FALSE; 468 do_readonly = B_TRUE; 469 } 470 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { 471 devices = B_FALSE; 472 setuid = B_FALSE; 473 do_devices = B_TRUE; 474 do_setuid = B_TRUE; 475 } else { 476 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) { 477 devices = B_FALSE; 478 do_devices = B_TRUE; 479 } else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) { 480 devices = B_TRUE; 481 do_devices = B_TRUE; 482 } 483 484 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { 485 setuid = B_FALSE; 486 do_setuid = B_TRUE; 487 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { 488 setuid = B_TRUE; 489 do_setuid = B_TRUE; 490 } 491 } 492 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { 493 exec = B_FALSE; 494 do_exec = B_TRUE; 495 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { 496 exec = B_TRUE; 497 do_exec = B_TRUE; 498 } 499 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { 500 xattr = B_FALSE; 501 do_xattr = B_TRUE; 502 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { 503 xattr = B_TRUE; 504 do_xattr = B_TRUE; 505 } 506 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { 507 atime = B_FALSE; 508 do_atime = B_TRUE; 509 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { 510 atime = B_TRUE; 511 do_atime = B_TRUE; 512 } 513 514 /* 515 * nbmand is a special property. It can only be changed at 516 * mount time. 517 * 518 * This is weird, but it is documented to only be changeable 519 * at mount time. 520 */ 521 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { 522 nbmand = B_FALSE; 523 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { 524 nbmand = B_TRUE; 525 } else { 526 char osname[ZFS_MAX_DATASET_NAME_LEN]; 527 528 dmu_objset_name(os, osname); 529 if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand, 530 NULL)) { 531 return (error); 532 } 533 } 534 535 /* 536 * Register property callbacks. 537 * 538 * It would probably be fine to just check for i/o error from 539 * the first prop_register(), but I guess I like to go 540 * overboard... 541 */ 542 ds = dmu_objset_ds(os); 543 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 544 error = dsl_prop_register(ds, 545 zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); 546 error = error ? error : dsl_prop_register(ds, 547 zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); 548 error = error ? error : dsl_prop_register(ds, 549 zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); 550 error = error ? error : dsl_prop_register(ds, 551 zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); 552 error = error ? error : dsl_prop_register(ds, 553 zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); 554 error = error ? error : dsl_prop_register(ds, 555 zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); 556 error = error ? error : dsl_prop_register(ds, 557 zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); 558 error = error ? error : dsl_prop_register(ds, 559 zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); 560 error = error ? error : dsl_prop_register(ds, 561 zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); 562 error = error ? error : dsl_prop_register(ds, 563 zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, 564 zfsvfs); 565 error = error ? error : dsl_prop_register(ds, 566 zfs_prop_to_name(ZFS_PROP_ACLIMPLICIT), 567 acl_implicit_changed_cb, zfsvfs); 568 error = error ? error : dsl_prop_register(ds, 569 zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); 570 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 571 if (error) 572 goto unregister; 573 574 /* 575 * Invoke our callbacks to restore temporary mount options. 576 */ 577 if (do_readonly) 578 readonly_changed_cb(zfsvfs, readonly); 579 if (do_setuid) 580 setuid_changed_cb(zfsvfs, setuid); 581 if (do_exec) 582 exec_changed_cb(zfsvfs, exec); 583 if (do_devices) 584 devices_changed_cb(zfsvfs, devices); 585 if (do_xattr) 586 xattr_changed_cb(zfsvfs, xattr); 587 if (do_atime) 588 atime_changed_cb(zfsvfs, atime); 589 590 nbmand_changed_cb(zfsvfs, nbmand); 591 592 return (0); 593 594 unregister: 595 dsl_prop_unregister_all(ds, zfsvfs); 596 return (error); 597 } 598 599 static int 600 zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, 601 uint64_t *userp, uint64_t *groupp, uint64_t *projectp) 602 { 603 sa_hdr_phys_t sa; 604 sa_hdr_phys_t *sap = data; 605 uint64_t flags; 606 int hdrsize; 607 boolean_t swap = B_FALSE; 608 609 /* 610 * Is it a valid type of object to track? 611 */ 612 if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) 613 return (SET_ERROR(ENOENT)); 614 615 /* 616 * If we have a NULL data pointer 617 * then assume the id's aren't changing and 618 * return EEXIST to the dmu to let it know to 619 * use the same ids 620 */ 621 if (data == NULL) 622 return (SET_ERROR(EEXIST)); 623 624 if (bonustype == DMU_OT_ZNODE) { 625 znode_phys_t *znp = data; 626 *userp = znp->zp_uid; 627 *groupp = znp->zp_gid; 628 *projectp = ZFS_DEFAULT_PROJID; 629 return (0); 630 } 631 632 if (sap->sa_magic == 0) { 633 /* 634 * This should only happen for newly created files 635 * that haven't had the znode data filled in yet. 636 */ 637 *userp = 0; 638 *groupp = 0; 639 *projectp = ZFS_DEFAULT_PROJID; 640 return (0); 641 } 642 643 sa = *sap; 644 if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { 645 sa.sa_magic = SA_MAGIC; 646 sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); 647 swap = B_TRUE; 648 } else { 649 VERIFY3U(sa.sa_magic, ==, SA_MAGIC); 650 } 651 652 hdrsize = sa_hdrsize(&sa); 653 VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); 654 655 *userp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_UID_OFFSET)); 656 *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_GID_OFFSET)); 657 flags = *((uint64_t *)((uintptr_t)data + hdrsize + SA_FLAGS_OFFSET)); 658 if (swap) 659 flags = BSWAP_64(flags); 660 661 if (flags & ZFS_PROJID) 662 *projectp = *((uint64_t *)((uintptr_t)data + hdrsize + 663 SA_PROJID_OFFSET)); 664 else 665 *projectp = ZFS_DEFAULT_PROJID; 666 667 if (swap) { 668 *userp = BSWAP_64(*userp); 669 *groupp = BSWAP_64(*groupp); 670 *projectp = BSWAP_64(*projectp); 671 } 672 return (0); 673 } 674 675 static void 676 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, 677 char *domainbuf, int buflen, uid_t *ridp) 678 { 679 uint64_t fuid; 680 const char *domain; 681 682 fuid = zfs_strtonum(fuidstr, NULL); 683 684 domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); 685 if (domain) 686 (void) strlcpy(domainbuf, domain, buflen); 687 else 688 domainbuf[0] = '\0'; 689 *ridp = FUID_RID(fuid); 690 } 691 692 static uint64_t 693 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) 694 { 695 switch (type) { 696 case ZFS_PROP_USERUSED: 697 case ZFS_PROP_USEROBJUSED: 698 return (DMU_USERUSED_OBJECT); 699 case ZFS_PROP_GROUPUSED: 700 case ZFS_PROP_GROUPOBJUSED: 701 return (DMU_GROUPUSED_OBJECT); 702 case ZFS_PROP_PROJECTUSED: 703 case ZFS_PROP_PROJECTOBJUSED: 704 return (DMU_PROJECTUSED_OBJECT); 705 case ZFS_PROP_USERQUOTA: 706 return (zfsvfs->z_userquota_obj); 707 case ZFS_PROP_GROUPQUOTA: 708 return (zfsvfs->z_groupquota_obj); 709 case ZFS_PROP_USEROBJQUOTA: 710 return (zfsvfs->z_userobjquota_obj); 711 case ZFS_PROP_GROUPOBJQUOTA: 712 return (zfsvfs->z_groupobjquota_obj); 713 case ZFS_PROP_PROJECTQUOTA: 714 return (zfsvfs->z_projectquota_obj); 715 case ZFS_PROP_PROJECTOBJQUOTA: 716 return (zfsvfs->z_projectobjquota_obj); 717 default: 718 return (ZFS_NO_OBJECT); 719 } 720 } 721 722 int 723 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 724 uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) 725 { 726 int error; 727 zap_cursor_t zc; 728 zap_attribute_t za; 729 zfs_useracct_t *buf = vbuf; 730 uint64_t obj; 731 int offset = 0; 732 733 if (!dmu_objset_userspace_present(zfsvfs->z_os)) 734 return (SET_ERROR(ENOTSUP)); 735 736 if ((type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED || 737 type == ZFS_PROP_PROJECTOBJQUOTA || 738 type == ZFS_PROP_PROJECTOBJUSED) && 739 !dmu_objset_projectquota_present(zfsvfs->z_os)) 740 return (SET_ERROR(ENOTSUP)); 741 742 if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || 743 type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || 744 type == ZFS_PROP_PROJECTOBJUSED || 745 type == ZFS_PROP_PROJECTOBJQUOTA) && 746 !dmu_objset_userobjspace_present(zfsvfs->z_os)) 747 return (SET_ERROR(ENOTSUP)); 748 749 obj = zfs_userquota_prop_to_obj(zfsvfs, type); 750 if (obj == ZFS_NO_OBJECT) { 751 *bufsizep = 0; 752 return (0); 753 } 754 755 if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || 756 type == ZFS_PROP_PROJECTOBJUSED) 757 offset = DMU_OBJACCT_PREFIX_LEN; 758 759 for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); 760 (error = zap_cursor_retrieve(&zc, &za)) == 0; 761 zap_cursor_advance(&zc)) { 762 if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > 763 *bufsizep) 764 break; 765 766 /* 767 * skip object quota (with zap name prefix DMU_OBJACCT_PREFIX) 768 * when dealing with block quota and vice versa. 769 */ 770 if ((offset > 0) != (strncmp(za.za_name, DMU_OBJACCT_PREFIX, 771 DMU_OBJACCT_PREFIX_LEN) == 0)) 772 continue; 773 774 fuidstr_to_sid(zfsvfs, za.za_name + offset, 775 buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); 776 777 buf->zu_space = za.za_first_integer; 778 buf++; 779 } 780 if (error == ENOENT) 781 error = 0; 782 783 ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); 784 *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; 785 *cookiep = zap_cursor_serialize(&zc); 786 zap_cursor_fini(&zc); 787 return (error); 788 } 789 790 /* 791 * buf must be big enough (eg, 16+1 bytes) 792 */ 793 static int 794 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, 795 ilstr_t *ils, boolean_t addok) 796 { 797 uint64_t fuid; 798 int domainid = 0; 799 800 if (domain && domain[0]) { 801 domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); 802 if (domainid == -1) 803 return (SET_ERROR(ENOENT)); 804 } 805 fuid = FUID_ENCODE(domainid, rid); 806 ilstr_aprintf(ils, "%llx", (longlong_t)fuid); 807 return (0); 808 } 809 810 int 811 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 812 const char *domain, uint64_t rid, uint64_t *valp) 813 { 814 ilstr_t ils; 815 char buf[20 + DMU_OBJACCT_PREFIX_LEN]; 816 int err; 817 uint64_t obj; 818 819 ilstr_init_prealloc(&ils, buf, sizeof (buf)); 820 *valp = 0; 821 822 if (!dmu_objset_userspace_present(zfsvfs->z_os)) 823 return (SET_ERROR(ENOTSUP)); 824 825 if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || 826 type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || 827 type == ZFS_PROP_PROJECTOBJUSED || 828 type == ZFS_PROP_PROJECTOBJQUOTA) && 829 !dmu_objset_userobjspace_present(zfsvfs->z_os)) 830 return (SET_ERROR(ENOTSUP)); 831 832 if (type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED || 833 type == ZFS_PROP_PROJECTOBJQUOTA || 834 type == ZFS_PROP_PROJECTOBJUSED) { 835 if (!dmu_objset_projectquota_present(zfsvfs->z_os)) 836 return (SET_ERROR(ENOTSUP)); 837 if (!zpl_is_valid_projid(rid)) 838 return (SET_ERROR(EINVAL)); 839 } 840 841 obj = zfs_userquota_prop_to_obj(zfsvfs, type); 842 if (obj == ZFS_NO_OBJECT) 843 return (0); 844 845 if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || 846 type == ZFS_PROP_PROJECTOBJUSED) { 847 ilstr_append_str(&ils, DMU_OBJACCT_PREFIX); 848 } 849 850 err = id_to_fuidstr(zfsvfs, domain, rid, &ils, B_FALSE); 851 if (err) 852 return (err); 853 854 VERIFY3S(ilstr_errno(&ils), ==, ILSTR_ERROR_OK); 855 err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); 856 if (err == ENOENT) 857 err = 0; 858 return (err); 859 } 860 861 int 862 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 863 const char *domain, uint64_t rid, uint64_t quota) 864 { 865 char buf[32]; 866 int err; 867 dmu_tx_t *tx; 868 uint64_t *objp; 869 boolean_t fuid_dirtied; 870 871 if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) 872 return (SET_ERROR(ENOTSUP)); 873 874 switch (type) { 875 case ZFS_PROP_USERQUOTA: 876 objp = &zfsvfs->z_userquota_obj; 877 break; 878 case ZFS_PROP_GROUPQUOTA: 879 objp = &zfsvfs->z_groupquota_obj; 880 break; 881 case ZFS_PROP_USEROBJQUOTA: 882 objp = &zfsvfs->z_userobjquota_obj; 883 break; 884 case ZFS_PROP_GROUPOBJQUOTA: 885 objp = &zfsvfs->z_groupobjquota_obj; 886 break; 887 case ZFS_PROP_PROJECTQUOTA: 888 if (!dmu_objset_projectquota_enabled(zfsvfs->z_os)) 889 return (SET_ERROR(ENOTSUP)); 890 if (!zpl_is_valid_projid(rid)) 891 return (SET_ERROR(EINVAL)); 892 893 objp = &zfsvfs->z_projectquota_obj; 894 break; 895 case ZFS_PROP_PROJECTOBJQUOTA: 896 if (!dmu_objset_projectquota_enabled(zfsvfs->z_os)) 897 return (SET_ERROR(ENOTSUP)); 898 if (!zpl_is_valid_projid(rid)) 899 return (SET_ERROR(EINVAL)); 900 901 objp = &zfsvfs->z_projectobjquota_obj; 902 break; 903 default: 904 return (SET_ERROR(EINVAL)); 905 } 906 907 ilstr_t ils; 908 ilstr_init_prealloc(&ils, buf, sizeof (buf)); 909 err = id_to_fuidstr(zfsvfs, domain, rid, &ils, B_TRUE); 910 if (err) 911 return (err); 912 VERIFY3S(ilstr_errno(&ils), ==, ILSTR_ERROR_OK); 913 fuid_dirtied = zfsvfs->z_fuid_dirty; 914 915 tx = dmu_tx_create(zfsvfs->z_os); 916 dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); 917 if (*objp == 0) { 918 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, 919 zfs_userquota_prop_prefixes[type]); 920 } 921 if (fuid_dirtied) 922 zfs_fuid_txhold(zfsvfs, tx); 923 err = dmu_tx_assign(tx, TXG_WAIT); 924 if (err) { 925 dmu_tx_abort(tx); 926 return (err); 927 } 928 929 mutex_enter(&zfsvfs->z_lock); 930 if (*objp == 0) { 931 *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, 932 DMU_OT_NONE, 0, tx); 933 VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, 934 zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); 935 } 936 mutex_exit(&zfsvfs->z_lock); 937 938 if (quota == 0) { 939 err = zap_remove(zfsvfs->z_os, *objp, buf, tx); 940 if (err == ENOENT) 941 err = 0; 942 } else { 943 err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); 944 } 945 ASSERT(err == 0); 946 if (fuid_dirtied) 947 zfs_fuid_sync(zfsvfs, tx); 948 dmu_tx_commit(tx); 949 return (err); 950 } 951 952 boolean_t 953 zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) 954 { 955 char buf[20 + DMU_OBJACCT_PREFIX_LEN]; 956 uint64_t used, quota, quotaobj; 957 int err; 958 959 if (!dmu_objset_userobjspace_present(zfsvfs->z_os)) { 960 if (dmu_objset_userobjspace_upgradable(zfsvfs->z_os)) { 961 dsl_pool_config_enter( 962 dmu_objset_pool(zfsvfs->z_os), FTAG); 963 dmu_objset_id_quota_upgrade(zfsvfs->z_os); 964 dsl_pool_config_exit( 965 dmu_objset_pool(zfsvfs->z_os), FTAG); 966 } 967 return (B_FALSE); 968 } 969 970 if (usedobj == DMU_PROJECTUSED_OBJECT) { 971 if (!dmu_objset_projectquota_present(zfsvfs->z_os)) { 972 if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) { 973 dsl_pool_config_enter( 974 dmu_objset_pool(zfsvfs->z_os), FTAG); 975 dmu_objset_id_quota_upgrade(zfsvfs->z_os); 976 dsl_pool_config_exit( 977 dmu_objset_pool(zfsvfs->z_os), FTAG); 978 } 979 return (B_FALSE); 980 } 981 quotaobj = zfsvfs->z_projectobjquota_obj; 982 } else if (usedobj == DMU_USERUSED_OBJECT) { 983 quotaobj = zfsvfs->z_userobjquota_obj; 984 } else if (usedobj == DMU_GROUPUSED_OBJECT) { 985 quotaobj = zfsvfs->z_groupobjquota_obj; 986 } else { 987 return (B_FALSE); 988 } 989 if (quotaobj == 0 || zfsvfs->z_replay) 990 return (B_FALSE); 991 992 (void) sprintf(buf, "%llx", (longlong_t)id); 993 err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); 994 if (err != 0) 995 return (B_FALSE); 996 997 (void) sprintf(buf, DMU_OBJACCT_PREFIX "%llx", (longlong_t)id); 998 err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); 999 if (err != 0) 1000 return (B_FALSE); 1001 return (used >= quota); 1002 } 1003 1004 boolean_t 1005 zfs_id_overblockquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) 1006 { 1007 char buf[20]; 1008 uint64_t used, quota, quotaobj; 1009 int err; 1010 1011 if (usedobj == DMU_PROJECTUSED_OBJECT) { 1012 if (!dmu_objset_projectquota_present(zfsvfs->z_os)) { 1013 if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) { 1014 dsl_pool_config_enter( 1015 dmu_objset_pool(zfsvfs->z_os), FTAG); 1016 dmu_objset_id_quota_upgrade(zfsvfs->z_os); 1017 dsl_pool_config_exit( 1018 dmu_objset_pool(zfsvfs->z_os), FTAG); 1019 } 1020 return (B_FALSE); 1021 } 1022 quotaobj = zfsvfs->z_projectquota_obj; 1023 } else if (usedobj == DMU_USERUSED_OBJECT) { 1024 quotaobj = zfsvfs->z_userquota_obj; 1025 } else if (usedobj == DMU_GROUPUSED_OBJECT) { 1026 quotaobj = zfsvfs->z_groupquota_obj; 1027 } else { 1028 return (B_FALSE); 1029 } 1030 if (quotaobj == 0 || zfsvfs->z_replay) 1031 return (B_FALSE); 1032 1033 (void) sprintf(buf, "%llx", (longlong_t)id); 1034 err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); 1035 if (err != 0) 1036 return (B_FALSE); 1037 1038 err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); 1039 if (err != 0) 1040 return (B_FALSE); 1041 return (used >= quota); 1042 } 1043 1044 boolean_t 1045 zfs_id_overquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) 1046 { 1047 return (zfs_id_overblockquota(zfsvfs, usedobj, id) || 1048 zfs_id_overobjquota(zfsvfs, usedobj, id)); 1049 } 1050 1051 /* 1052 * Associate this zfsvfs with the given objset, which must be owned. 1053 * This will cache a bunch of on-disk state from the objset in the 1054 * zfsvfs. 1055 */ 1056 static int 1057 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) 1058 { 1059 int error; 1060 uint64_t val; 1061 1062 zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; 1063 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; 1064 zfsvfs->z_os = os; 1065 1066 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); 1067 if (error != 0) 1068 return (error); 1069 if (zfsvfs->z_version > 1070 zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { 1071 (void) printf("Can't mount a version %lld file system " 1072 "on a version %lld pool\n. Pool must be upgraded to mount " 1073 "this file system.", (u_longlong_t)zfsvfs->z_version, 1074 (u_longlong_t)spa_version(dmu_objset_spa(os))); 1075 return (SET_ERROR(ENOTSUP)); 1076 } 1077 error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); 1078 if (error != 0) 1079 return (error); 1080 zfsvfs->z_norm = (int)val; 1081 1082 error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); 1083 if (error != 0) 1084 return (error); 1085 zfsvfs->z_utf8 = (val != 0); 1086 1087 error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); 1088 if (error != 0) 1089 return (error); 1090 zfsvfs->z_case = (uint_t)val; 1091 1092 /* 1093 * Fold case on file systems that are always or sometimes case 1094 * insensitive. 1095 */ 1096 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 1097 zfsvfs->z_case == ZFS_CASE_MIXED) 1098 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 1099 1100 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 1101 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 1102 1103 uint64_t sa_obj = 0; 1104 if (zfsvfs->z_use_sa) { 1105 /* should either have both of these objects or none */ 1106 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, 1107 &sa_obj); 1108 if (error != 0) 1109 return (error); 1110 } 1111 1112 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, 1113 &zfsvfs->z_attr_table); 1114 if (error != 0) 1115 return (error); 1116 1117 if (zfsvfs->z_version >= ZPL_VERSION_SA) 1118 sa_register_update_callback(os, zfs_sa_upgrade); 1119 1120 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, 1121 &zfsvfs->z_root); 1122 if (error != 0) 1123 return (error); 1124 ASSERT(zfsvfs->z_root != 0); 1125 1126 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, 1127 &zfsvfs->z_unlinkedobj); 1128 if (error != 0) 1129 return (error); 1130 1131 error = zap_lookup(os, MASTER_NODE_OBJ, 1132 zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 1133 8, 1, &zfsvfs->z_userquota_obj); 1134 if (error == ENOENT) 1135 zfsvfs->z_userquota_obj = 0; 1136 else if (error != 0) 1137 return (error); 1138 1139 error = zap_lookup(os, MASTER_NODE_OBJ, 1140 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 1141 8, 1, &zfsvfs->z_groupquota_obj); 1142 if (error == ENOENT) 1143 zfsvfs->z_groupquota_obj = 0; 1144 else if (error != 0) 1145 return (error); 1146 1147 error = zap_lookup(os, MASTER_NODE_OBJ, 1148 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], 1149 8, 1, &zfsvfs->z_projectquota_obj); 1150 if (error == ENOENT) 1151 zfsvfs->z_projectquota_obj = 0; 1152 else if (error != 0) 1153 return (error); 1154 1155 error = zap_lookup(os, MASTER_NODE_OBJ, 1156 zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], 1157 8, 1, &zfsvfs->z_userobjquota_obj); 1158 if (error == ENOENT) 1159 zfsvfs->z_userobjquota_obj = 0; 1160 else if (error != 0) 1161 return (error); 1162 1163 error = zap_lookup(os, MASTER_NODE_OBJ, 1164 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], 1165 8, 1, &zfsvfs->z_groupobjquota_obj); 1166 if (error == ENOENT) 1167 zfsvfs->z_groupobjquota_obj = 0; 1168 else if (error != 0) 1169 return (error); 1170 1171 error = zap_lookup(os, MASTER_NODE_OBJ, 1172 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], 1173 8, 1, &zfsvfs->z_projectobjquota_obj); 1174 if (error == ENOENT) 1175 zfsvfs->z_projectobjquota_obj = 0; 1176 else if (error != 0) 1177 return (error); 1178 1179 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, 1180 &zfsvfs->z_fuid_obj); 1181 if (error == ENOENT) 1182 zfsvfs->z_fuid_obj = 0; 1183 else if (error != 0) 1184 return (error); 1185 1186 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, 1187 &zfsvfs->z_shares_dir); 1188 if (error == ENOENT) 1189 zfsvfs->z_shares_dir = 0; 1190 else if (error != 0) 1191 return (error); 1192 1193 return (0); 1194 } 1195 1196 int 1197 zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) 1198 { 1199 objset_t *os; 1200 zfsvfs_t *zfsvfs; 1201 int error; 1202 boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); 1203 1204 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); 1205 1206 error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os); 1207 if (error != 0) { 1208 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1209 return (error); 1210 } 1211 1212 error = zfsvfs_create_impl(zfvp, zfsvfs, os); 1213 if (error != 0) { 1214 dmu_objset_disown(os, B_TRUE, zfsvfs); 1215 } 1216 return (error); 1217 } 1218 1219 1220 int 1221 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) 1222 { 1223 int error; 1224 1225 zfsvfs->z_vfs = NULL; 1226 zfsvfs->z_parent = zfsvfs; 1227 1228 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1229 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); 1230 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), 1231 offsetof(znode_t, z_link_node)); 1232 rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); 1233 rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); 1234 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); 1235 for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1236 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 1237 1238 error = zfsvfs_init(zfsvfs, os); 1239 if (error != 0) { 1240 *zfvp = NULL; 1241 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1242 return (error); 1243 } 1244 1245 zfsvfs->z_drain_task = TASKQID_INVALID; 1246 zfsvfs->z_draining = B_FALSE; 1247 zfsvfs->z_drain_cancel = B_TRUE; 1248 1249 *zfvp = zfsvfs; 1250 return (0); 1251 } 1252 1253 static int 1254 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) 1255 { 1256 int error; 1257 1258 error = zfs_register_callbacks(zfsvfs->z_vfs); 1259 if (error) 1260 return (error); 1261 1262 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); 1263 1264 /* 1265 * If we are not mounting (ie: online recv), then we don't 1266 * have to worry about replaying the log as we blocked all 1267 * operations out since we closed the ZIL. 1268 */ 1269 if (mounting) { 1270 boolean_t readonly; 1271 1272 /* 1273 * During replay we remove the read only flag to 1274 * allow replays to succeed. 1275 */ 1276 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; 1277 if (readonly != 0) { 1278 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 1279 } else { 1280 zfs_unlinked_drain(zfsvfs); 1281 } 1282 1283 /* 1284 * Parse and replay the intent log. 1285 * 1286 * Because of ziltest, this must be done after 1287 * zfs_unlinked_drain(). (Further note: ziltest 1288 * doesn't use readonly mounts, where 1289 * zfs_unlinked_drain() isn't called.) This is because 1290 * ziltest causes spa_sync() to think it's committed, 1291 * but actually it is not, so the intent log contains 1292 * many txg's worth of changes. 1293 * 1294 * In particular, if object N is in the unlinked set in 1295 * the last txg to actually sync, then it could be 1296 * actually freed in a later txg and then reallocated 1297 * in a yet later txg. This would write a "create 1298 * object N" record to the intent log. Normally, this 1299 * would be fine because the spa_sync() would have 1300 * written out the fact that object N is free, before 1301 * we could write the "create object N" intent log 1302 * record. 1303 * 1304 * But when we are in ziltest mode, we advance the "open 1305 * txg" without actually spa_sync()-ing the changes to 1306 * disk. So we would see that object N is still 1307 * allocated and in the unlinked set, and there is an 1308 * intent log record saying to allocate it. 1309 */ 1310 if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { 1311 if (zil_replay_disable) { 1312 zil_destroy(zfsvfs->z_log, B_FALSE); 1313 } else { 1314 zfsvfs->z_replay = B_TRUE; 1315 zil_replay(zfsvfs->z_os, zfsvfs, 1316 zfs_replay_vector); 1317 zfsvfs->z_replay = B_FALSE; 1318 } 1319 } 1320 1321 /* restore readonly bit */ 1322 if (readonly != 0) 1323 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; 1324 } 1325 1326 /* 1327 * Set the objset user_ptr to track its zfsvfs. 1328 */ 1329 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1330 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1331 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1332 1333 return (0); 1334 } 1335 1336 void 1337 zfsvfs_free(zfsvfs_t *zfsvfs) 1338 { 1339 int i; 1340 extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ 1341 1342 /* 1343 * This is a barrier to prevent the filesystem from going away in 1344 * zfs_znode_move() until we can safely ensure that the filesystem is 1345 * not unmounted. We consider the filesystem valid before the barrier 1346 * and invalid after the barrier. 1347 */ 1348 rw_enter(&zfsvfs_lock, RW_READER); 1349 rw_exit(&zfsvfs_lock); 1350 1351 zfs_fuid_destroy(zfsvfs); 1352 1353 mutex_destroy(&zfsvfs->z_znodes_lock); 1354 mutex_destroy(&zfsvfs->z_lock); 1355 list_destroy(&zfsvfs->z_all_znodes); 1356 rrm_destroy(&zfsvfs->z_teardown_lock); 1357 rw_destroy(&zfsvfs->z_teardown_inactive_lock); 1358 rw_destroy(&zfsvfs->z_fuid_lock); 1359 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1360 mutex_destroy(&zfsvfs->z_hold_mtx[i]); 1361 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1362 } 1363 1364 static void 1365 zfs_set_fuid_feature(zfsvfs_t *zfsvfs) 1366 { 1367 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 1368 if (zfsvfs->z_vfs) { 1369 if (zfsvfs->z_use_fuids) { 1370 vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); 1371 vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); 1372 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); 1373 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); 1374 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); 1375 vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); 1376 } else { 1377 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); 1378 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); 1379 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); 1380 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); 1381 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); 1382 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); 1383 } 1384 } 1385 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 1386 } 1387 1388 static int 1389 zfs_domount(vfs_t *vfsp, char *osname) 1390 { 1391 dev_t mount_dev; 1392 uint64_t recordsize, fsid_guid; 1393 int error = 0; 1394 zfsvfs_t *zfsvfs; 1395 boolean_t readonly = vfsp->vfs_flag & VFS_RDONLY ? B_TRUE : B_FALSE; 1396 1397 ASSERT(vfsp); 1398 ASSERT(osname); 1399 1400 error = zfsvfs_create(osname, readonly, &zfsvfs); 1401 if (error) 1402 return (error); 1403 zfsvfs->z_vfs = vfsp; 1404 1405 /* Initialize the generic filesystem structure. */ 1406 vfsp->vfs_bcount = 0; 1407 vfsp->vfs_data = NULL; 1408 1409 if (zfs_create_unique_device(&mount_dev) == -1) { 1410 error = SET_ERROR(ENODEV); 1411 goto out; 1412 } 1413 ASSERT(vfs_devismounted(mount_dev) == 0); 1414 1415 if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, 1416 NULL)) 1417 goto out; 1418 1419 vfsp->vfs_dev = mount_dev; 1420 vfsp->vfs_fstype = zfsfstype; 1421 vfsp->vfs_bsize = recordsize; 1422 vfsp->vfs_flag |= VFS_NOTRUNC; 1423 vfsp->vfs_data = zfsvfs; 1424 1425 /* 1426 * The fsid is 64 bits, composed of an 8-bit fs type, which 1427 * separates our fsid from any other filesystem types, and a 1428 * 56-bit objset unique ID. The objset unique ID is unique to 1429 * all objsets open on this system, provided by unique_create(). 1430 * The 8-bit fs type must be put in the low bits of fsid[1] 1431 * because that's where other Solaris filesystems put it. 1432 */ 1433 fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); 1434 ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); 1435 vfsp->vfs_fsid.val[0] = fsid_guid; 1436 vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | 1437 zfsfstype & 0xFF; 1438 1439 /* 1440 * Set features for file system. 1441 */ 1442 zfs_set_fuid_feature(zfsvfs); 1443 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { 1444 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 1445 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 1446 vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); 1447 } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { 1448 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 1449 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 1450 } 1451 vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); 1452 1453 if (dmu_objset_is_snapshot(zfsvfs->z_os)) { 1454 uint64_t pval; 1455 1456 atime_changed_cb(zfsvfs, B_FALSE); 1457 readonly_changed_cb(zfsvfs, B_TRUE); 1458 if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) 1459 goto out; 1460 xattr_changed_cb(zfsvfs, pval); 1461 zfsvfs->z_issnap = B_TRUE; 1462 zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; 1463 1464 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1465 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1466 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1467 } else { 1468 error = zfsvfs_setup(zfsvfs, B_TRUE); 1469 } 1470 1471 /* cache the root vnode for this mount */ 1472 znode_t *rootzp; 1473 if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp)) { 1474 goto out; 1475 } 1476 zfsvfs->z_rootdir = ZTOV(rootzp); 1477 1478 if (!zfsvfs->z_issnap) 1479 zfsctl_create(zfsvfs); 1480 out: 1481 if (error) { 1482 dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); 1483 zfsvfs_free(zfsvfs); 1484 } else { 1485 atomic_inc_32(&zfs_active_fs_count); 1486 } 1487 1488 return (error); 1489 } 1490 1491 void 1492 zfs_unregister_callbacks(zfsvfs_t *zfsvfs) 1493 { 1494 objset_t *os = zfsvfs->z_os; 1495 1496 if (!dmu_objset_is_snapshot(os)) 1497 dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); 1498 } 1499 1500 /* 1501 * Convert a decimal digit string to a uint64_t integer. 1502 */ 1503 static int 1504 str_to_uint64(char *str, uint64_t *objnum) 1505 { 1506 uint64_t num = 0; 1507 1508 while (*str) { 1509 if (*str < '0' || *str > '9') 1510 return (SET_ERROR(EINVAL)); 1511 1512 num = num*10 + *str++ - '0'; 1513 } 1514 1515 *objnum = num; 1516 return (0); 1517 } 1518 1519 /* 1520 * The boot path passed from the boot loader is in the form of 1521 * "rootpool-name/root-filesystem-object-number'. Convert this 1522 * string to a dataset name: "rootpool-name/root-filesystem-name". 1523 */ 1524 static int 1525 zfs_parse_bootfs(char *bpath, char *outpath) 1526 { 1527 char *slashp; 1528 uint64_t objnum; 1529 int error; 1530 1531 if (*bpath == 0 || *bpath == '/') 1532 return (SET_ERROR(EINVAL)); 1533 1534 (void) strcpy(outpath, bpath); 1535 1536 slashp = strchr(bpath, '/'); 1537 1538 /* if no '/', just return the pool name */ 1539 if (slashp == NULL) { 1540 return (0); 1541 } 1542 1543 /* if not a number, just return the root dataset name */ 1544 if (str_to_uint64(slashp+1, &objnum)) { 1545 return (0); 1546 } 1547 1548 *slashp = '\0'; 1549 error = dsl_dsobj_to_dsname(bpath, objnum, outpath); 1550 *slashp = '/'; 1551 1552 return (error); 1553 } 1554 1555 /* 1556 * Check that the hex label string is appropriate for the dataset being 1557 * mounted into the global_zone proper. 1558 * 1559 * Return an error if the hex label string is not default or 1560 * admin_low/admin_high. For admin_low labels, the corresponding 1561 * dataset must be readonly. 1562 */ 1563 int 1564 zfs_check_global_label(const char *dsname, const char *hexsl) 1565 { 1566 if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) 1567 return (0); 1568 if (strcasecmp(hexsl, ADMIN_HIGH) == 0) 1569 return (0); 1570 if (strcasecmp(hexsl, ADMIN_LOW) == 0) { 1571 /* must be readonly */ 1572 uint64_t rdonly; 1573 1574 if (dsl_prop_get_integer(dsname, 1575 zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) 1576 return (SET_ERROR(EACCES)); 1577 return (rdonly ? 0 : EACCES); 1578 } 1579 return (SET_ERROR(EACCES)); 1580 } 1581 1582 static int 1583 zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct statvfs64 *statp, 1584 uint32_t bshift) 1585 { 1586 ilstr_t ils; 1587 char buf[20 + DMU_OBJACCT_PREFIX_LEN]; 1588 uint64_t offset = DMU_OBJACCT_PREFIX_LEN; 1589 uint64_t quota; 1590 uint64_t used; 1591 int err; 1592 1593 ilstr_init_prealloc(&ils, buf, sizeof (buf)); 1594 ilstr_append_str(&ils, DMU_OBJACCT_PREFIX); 1595 err = id_to_fuidstr(zfsvfs, NULL, zp->z_projid, &ils, B_FALSE); 1596 if (err) 1597 return (err); 1598 1599 VERIFY3S(ilstr_errno(&ils), ==, ILSTR_ERROR_OK); 1600 if (zfsvfs->z_projectquota_obj == 0) 1601 goto objs; 1602 1603 err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj, 1604 buf + offset, 8, 1, "a); 1605 if (err == ENOENT) 1606 goto objs; 1607 else if (err) 1608 return (err); 1609 1610 err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT, 1611 buf + offset, 8, 1, &used); 1612 if (unlikely(err == ENOENT)) { 1613 uint32_t blksize; 1614 u_longlong_t nblocks; 1615 1616 /* 1617 * Quota accounting is async, so it is possible race case. 1618 * There is at least one object with the given project ID. 1619 */ 1620 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); 1621 if (unlikely(zp->z_blksz == 0)) 1622 blksize = zfsvfs->z_max_blksz; 1623 1624 used = blksize * nblocks; 1625 } else if (err) { 1626 return (err); 1627 } 1628 1629 statp->f_blocks = quota >> bshift; 1630 statp->f_bfree = (quota > used) ? ((quota - used) >> bshift) : 0; 1631 statp->f_bavail = statp->f_bfree; 1632 1633 objs: 1634 if (zfsvfs->z_projectobjquota_obj == 0) 1635 return (0); 1636 1637 err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj, 1638 buf + offset, 8, 1, "a); 1639 if (err == ENOENT) 1640 return (0); 1641 else if (err) 1642 return (err); 1643 1644 err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT, 1645 buf, 8, 1, &used); 1646 if (unlikely(err == ENOENT)) { 1647 /* 1648 * Quota accounting is async, so it is possible race case. 1649 * There is at least one object with the given project ID. 1650 */ 1651 used = 1; 1652 } else if (err) { 1653 return (err); 1654 } 1655 1656 statp->f_files = quota; 1657 statp->f_ffree = (quota > used) ? (quota - used) : 0; 1658 1659 return (0); 1660 } 1661 1662 /* 1663 * Determine whether the mount is allowed according to MAC check. 1664 * by comparing (where appropriate) label of the dataset against 1665 * the label of the zone being mounted into. If the dataset has 1666 * no label, create one. 1667 * 1668 * Returns 0 if access allowed, error otherwise (e.g. EACCES) 1669 */ 1670 static int 1671 zfs_mount_label_policy(vfs_t *vfsp, char *osname) 1672 { 1673 int error, retv; 1674 zone_t *mntzone = NULL; 1675 ts_label_t *mnt_tsl; 1676 bslabel_t *mnt_sl; 1677 bslabel_t ds_sl; 1678 char ds_hexsl[MAXNAMELEN]; 1679 1680 retv = EACCES; /* assume the worst */ 1681 1682 /* 1683 * Start by getting the dataset label if it exists. 1684 */ 1685 error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1686 1, sizeof (ds_hexsl), &ds_hexsl, NULL); 1687 if (error) 1688 return (SET_ERROR(EACCES)); 1689 1690 /* 1691 * If labeling is NOT enabled, then disallow the mount of datasets 1692 * which have a non-default label already. No other label checks 1693 * are needed. 1694 */ 1695 if (!is_system_labeled()) { 1696 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) 1697 return (0); 1698 return (SET_ERROR(EACCES)); 1699 } 1700 1701 /* 1702 * Get the label of the mountpoint. If mounting into the global 1703 * zone (i.e. mountpoint is not within an active zone and the 1704 * zoned property is off), the label must be default or 1705 * admin_low/admin_high only; no other checks are needed. 1706 */ 1707 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); 1708 if (mntzone->zone_id == GLOBAL_ZONEID) { 1709 uint64_t zoned; 1710 1711 zone_rele(mntzone); 1712 1713 if (dsl_prop_get_integer(osname, 1714 zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) 1715 return (SET_ERROR(EACCES)); 1716 if (!zoned) 1717 return (zfs_check_global_label(osname, ds_hexsl)); 1718 else 1719 /* 1720 * This is the case of a zone dataset being mounted 1721 * initially, before the zone has been fully created; 1722 * allow this mount into global zone. 1723 */ 1724 return (0); 1725 } 1726 1727 mnt_tsl = mntzone->zone_slabel; 1728 ASSERT(mnt_tsl != NULL); 1729 label_hold(mnt_tsl); 1730 mnt_sl = label2bslabel(mnt_tsl); 1731 1732 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { 1733 /* 1734 * The dataset doesn't have a real label, so fabricate one. 1735 */ 1736 char *str = NULL; 1737 1738 if (l_to_str_internal(mnt_sl, &str) == 0 && 1739 dsl_prop_set_string(osname, 1740 zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1741 ZPROP_SRC_LOCAL, str) == 0) 1742 retv = 0; 1743 if (str != NULL) 1744 kmem_free(str, strlen(str) + 1); 1745 } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { 1746 /* 1747 * Now compare labels to complete the MAC check. If the 1748 * labels are equal then allow access. If the mountpoint 1749 * label dominates the dataset label, allow readonly access. 1750 * Otherwise, access is denied. 1751 */ 1752 if (blequal(mnt_sl, &ds_sl)) 1753 retv = 0; 1754 else if (bldominates(mnt_sl, &ds_sl)) { 1755 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 1756 retv = 0; 1757 } 1758 } 1759 1760 label_rele(mnt_tsl); 1761 zone_rele(mntzone); 1762 return (retv); 1763 } 1764 1765 /* 1766 * Load a string-valued boot property and attempt to convert it to a 64-bit 1767 * unsigned integer. If the value is not present, or the conversion fails, 1768 * return the provided default value. 1769 */ 1770 static uint64_t 1771 spa_get_bootprop_uint64(const char *name, uint64_t defval) 1772 { 1773 char *propval; 1774 u_longlong_t r; 1775 int e; 1776 1777 if ((propval = spa_get_bootprop(name)) == NULL) { 1778 /* 1779 * The property does not exist. 1780 */ 1781 return (defval); 1782 } 1783 1784 e = ddi_strtoull(propval, NULL, 10, &r); 1785 1786 spa_free_bootprop(propval); 1787 1788 /* 1789 * If the conversion succeeded, return the value. If there was any 1790 * kind of failure, just return the default value. 1791 */ 1792 return (e == 0 ? r : defval); 1793 } 1794 1795 static int 1796 zfs_mountroot(vfs_t *vfsp, enum whymountroot why) 1797 { 1798 int error = 0; 1799 static int zfsrootdone = 0; 1800 zfsvfs_t *zfsvfs = NULL; 1801 znode_t *zp = NULL; 1802 vnode_t *vp = NULL; 1803 char *zfs_bootfs; 1804 char *zfs_devid; 1805 char *zfs_rootdisk_path; 1806 uint64_t zfs_bootpool; 1807 uint64_t zfs_bootvdev; 1808 1809 ASSERT(vfsp); 1810 1811 /* 1812 * The filesystem that we mount as root is defined in the 1813 * boot property "zfs-bootfs" with a format of 1814 * "poolname/root-dataset-objnum". 1815 */ 1816 if (why == ROOT_INIT) { 1817 if (zfsrootdone++) 1818 return (SET_ERROR(EBUSY)); 1819 1820 /* 1821 * the process of doing a spa_load will require the 1822 * clock to be set before we could (for example) do 1823 * something better by looking at the timestamp on 1824 * an uberblock, so just set it to -1. 1825 */ 1826 clkset(-1); 1827 1828 if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { 1829 cmn_err(CE_NOTE, "spa_get_bootfs: can not get " 1830 "bootfs name"); 1831 return (SET_ERROR(EINVAL)); 1832 } 1833 zfs_devid = spa_get_bootprop("diskdevid"); 1834 1835 /* 1836 * The boot loader may also provide us with the GUID for both 1837 * the pool and the nominated boot vdev. A GUID value of 0 is 1838 * explicitly invalid (see "spa_change_guid()"), so we use this 1839 * as a sentinel value when no GUID is present. 1840 */ 1841 zfs_bootpool = spa_get_bootprop_uint64("zfs-bootpool", 0); 1842 zfs_bootvdev = spa_get_bootprop_uint64("zfs-bootvdev", 0); 1843 1844 /* 1845 * If we have been given a root disk override path, we want to 1846 * ignore device paths from the pool configuration and use only 1847 * the specific path we were given in the boot properties. 1848 */ 1849 zfs_rootdisk_path = spa_get_bootprop("zfs-rootdisk-path"); 1850 1851 /* 1852 * Initialise the early boot device rescan mechanism. A scan 1853 * will not actually be performed unless we need to do so in 1854 * order to find the correct /devices path for a relocated 1855 * device. 1856 */ 1857 vdev_disk_preroot_init(zfs_rootdisk_path); 1858 1859 error = spa_import_rootpool(rootfs.bo_name, zfs_devid, 1860 zfs_bootpool, zfs_bootvdev); 1861 1862 spa_free_bootprop(zfs_devid); 1863 1864 if (error != 0) { 1865 spa_free_bootprop(zfs_bootfs); 1866 spa_free_bootprop(zfs_rootdisk_path); 1867 vdev_disk_preroot_fini(); 1868 cmn_err(CE_NOTE, "spa_import_rootpool: error %d", 1869 error); 1870 return (error); 1871 } 1872 1873 if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { 1874 spa_free_bootprop(zfs_bootfs); 1875 spa_free_bootprop(zfs_rootdisk_path); 1876 vdev_disk_preroot_fini(); 1877 cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", 1878 error); 1879 return (error); 1880 } 1881 1882 spa_free_bootprop(zfs_bootfs); 1883 spa_free_bootprop(zfs_rootdisk_path); 1884 1885 if ((error = vfs_lock(vfsp)) != 0) { 1886 vdev_disk_preroot_fini(); 1887 return (error); 1888 } 1889 1890 if (error = zfs_domount(vfsp, rootfs.bo_name)) { 1891 cmn_err(CE_NOTE, "zfs_domount: error %d", error); 1892 goto out; 1893 } 1894 1895 /* zfs_domount has already cached the root vnode for us */ 1896 zfsvfs = (zfsvfs_t *)vfsp->vfs_data; 1897 ASSERT(zfsvfs); 1898 ASSERT(zfsvfs->z_rootdir); 1899 1900 vp = zfsvfs->z_rootdir; 1901 mutex_enter(&vp->v_lock); 1902 vp->v_flag |= VROOT; 1903 mutex_exit(&vp->v_lock); 1904 1905 /* 1906 * Leave rootvp held. The root file system is never unmounted. 1907 */ 1908 VN_HOLD(vp); 1909 rootvp = vp; 1910 1911 vfs_add((struct vnode *)0, vfsp, 1912 (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); 1913 out: 1914 vdev_disk_preroot_fini(); 1915 vfs_unlock(vfsp); 1916 return (error); 1917 } else if (why == ROOT_REMOUNT) { 1918 readonly_changed_cb(vfsp->vfs_data, B_FALSE); 1919 vfsp->vfs_flag |= VFS_REMOUNT; 1920 1921 /* refresh mount options */ 1922 zfs_unregister_callbacks(vfsp->vfs_data); 1923 return (zfs_register_callbacks(vfsp)); 1924 1925 } else if (why == ROOT_UNMOUNT) { 1926 zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); 1927 (void) zfs_sync(vfsp, 0, 0); 1928 return (0); 1929 } 1930 1931 /* 1932 * if "why" is equal to anything else other than ROOT_INIT, 1933 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. 1934 */ 1935 return (SET_ERROR(ENOTSUP)); 1936 } 1937 1938 /*ARGSUSED*/ 1939 static int 1940 zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) 1941 { 1942 char *osname; 1943 pathname_t spn; 1944 int error = 0; 1945 uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ? 1946 UIO_SYSSPACE : UIO_USERSPACE; 1947 int canwrite; 1948 1949 if (mvp->v_type != VDIR) 1950 return (SET_ERROR(ENOTDIR)); 1951 1952 mutex_enter(&mvp->v_lock); 1953 if ((uap->flags & MS_REMOUNT) == 0 && 1954 (uap->flags & MS_OVERLAY) == 0 && 1955 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { 1956 mutex_exit(&mvp->v_lock); 1957 return (SET_ERROR(EBUSY)); 1958 } 1959 mutex_exit(&mvp->v_lock); 1960 1961 /* 1962 * ZFS does not support passing unparsed data in via MS_DATA. 1963 * Users should use the MS_OPTIONSTR interface; this means 1964 * that all option parsing is already done and the options struct 1965 * can be interrogated. 1966 */ 1967 if ((uap->flags & MS_DATA) && uap->datalen > 0) 1968 return (SET_ERROR(EINVAL)); 1969 1970 /* 1971 * Get the objset name (the "special" mount argument). 1972 */ 1973 if (error = pn_get(uap->spec, fromspace, &spn)) 1974 return (error); 1975 1976 osname = spn.pn_path; 1977 1978 /* 1979 * Check for mount privilege? 1980 * 1981 * If we don't have privilege then see if 1982 * we have local permission to allow it 1983 */ 1984 error = secpolicy_fs_mount(cr, mvp, vfsp); 1985 if (error) { 1986 if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) == 0) { 1987 vattr_t vattr; 1988 1989 /* 1990 * Make sure user is the owner of the mount point 1991 * or has sufficient privileges. 1992 */ 1993 1994 vattr.va_mask = AT_UID; 1995 1996 if (VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) { 1997 goto out; 1998 } 1999 2000 if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 && 2001 VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) { 2002 goto out; 2003 } 2004 secpolicy_fs_mount_clearopts(cr, vfsp); 2005 } else { 2006 goto out; 2007 } 2008 } 2009 2010 /* 2011 * Refuse to mount a filesystem if we are in a local zone and the 2012 * dataset is not visible. 2013 */ 2014 if (!INGLOBALZONE(curproc) && 2015 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { 2016 error = SET_ERROR(EPERM); 2017 goto out; 2018 } 2019 2020 error = zfs_mount_label_policy(vfsp, osname); 2021 if (error) 2022 goto out; 2023 2024 /* 2025 * When doing a remount, we simply refresh our temporary properties 2026 * according to those options set in the current VFS options. 2027 */ 2028 if (uap->flags & MS_REMOUNT) { 2029 /* refresh mount options */ 2030 zfs_unregister_callbacks(vfsp->vfs_data); 2031 error = zfs_register_callbacks(vfsp); 2032 goto out; 2033 } 2034 2035 error = zfs_domount(vfsp, osname); 2036 2037 /* 2038 * Add an extra VFS_HOLD on our parent vfs so that it can't 2039 * disappear due to a forced unmount. 2040 */ 2041 if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) 2042 VFS_HOLD(mvp->v_vfsp); 2043 2044 out: 2045 pn_free(&spn); 2046 return (error); 2047 } 2048 2049 static int 2050 zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp) 2051 { 2052 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2053 dev32_t d32; 2054 uint64_t refdbytes, availbytes, usedobjs, availobjs; 2055 int err = 0; 2056 2057 ZFS_ENTER(zfsvfs); 2058 2059 dmu_objset_space(zfsvfs->z_os, 2060 &refdbytes, &availbytes, &usedobjs, &availobjs); 2061 2062 /* 2063 * The underlying storage pool actually uses multiple block sizes. 2064 * We report the fragsize as the smallest block size we support, 2065 * and we report our blocksize as the filesystem's maximum blocksize. 2066 */ 2067 statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT; 2068 statp->f_bsize = zfsvfs->z_max_blksz; 2069 2070 /* 2071 * The following report "total" blocks of various kinds in the 2072 * file system, but reported in terms of f_frsize - the 2073 * "fragment" size. 2074 */ 2075 2076 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; 2077 statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT; 2078 statp->f_bavail = statp->f_bfree; /* no root reservation */ 2079 2080 /* 2081 * statvfs() should really be called statufs(), because it assumes 2082 * static metadata. ZFS doesn't preallocate files, so the best 2083 * we can do is report the max that could possibly fit in f_files, 2084 * and that minus the number actually used in f_ffree. 2085 * For f_ffree, report the smaller of the number of object available 2086 * and the number of blocks (each object will take at least a block). 2087 */ 2088 statp->f_ffree = MIN(availobjs, statp->f_bfree); 2089 statp->f_favail = statp->f_ffree; /* no "root reservation" */ 2090 statp->f_files = statp->f_ffree + usedobjs; 2091 2092 (void) cmpldev(&d32, vfsp->vfs_dev); 2093 statp->f_fsid = d32; 2094 2095 /* 2096 * We're a zfs filesystem. 2097 */ 2098 (void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name); 2099 2100 statp->f_flag = vf_to_stf(vfsp->vfs_flag); 2101 2102 statp->f_namemax = MAXNAMELEN - 1; 2103 2104 /* 2105 * We have all of 32 characters to stuff a string here. 2106 * Is there anything useful we could/should provide? 2107 */ 2108 bzero(statp->f_fstr, sizeof (statp->f_fstr)); 2109 2110 if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && 2111 dmu_objset_projectquota_present(zfsvfs->z_os)) { 2112 znode_t *zp; 2113 2114 /* 2115 * In ZoL, zfs_statvfs is passed a Linux dentry (directory 2116 * entry), instead of a vfsp. The ZoL code uses the dentry 2117 * to get the znode from the dentry's inode. This represents 2118 * whatever filename was passed to the user-level statvfs 2119 * syscall. 2120 * 2121 * We're using the VFS root znode here, so this represents a 2122 * potential difference from ZoL. 2123 */ 2124 if (zfs_zget(zfsvfs, zfsvfs->z_root, &zp) == 0) { 2125 uint32_t bshift = ddi_fls(statp->f_bsize) - 1; 2126 2127 if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid && 2128 zpl_is_valid_projid(zp->z_projid)) 2129 err = zfs_statfs_project(zfsvfs, zp, statp, 2130 bshift); 2131 VN_RELE(ZTOV(zp)); 2132 } 2133 } 2134 2135 ZFS_EXIT(zfsvfs); 2136 return (err); 2137 } 2138 2139 static int 2140 zfs_root(vfs_t *vfsp, vnode_t **vpp) 2141 { 2142 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2143 struct vnode *vp; 2144 int error; 2145 2146 ZFS_ENTER(zfsvfs); 2147 2148 vp = zfsvfs->z_rootdir; 2149 if (vp != NULL) { 2150 VN_HOLD(vp); 2151 error = 0; 2152 } else { 2153 /* forced unmount */ 2154 error = EIO; 2155 } 2156 *vpp = vp; 2157 2158 ZFS_EXIT(zfsvfs); 2159 return (error); 2160 2161 } 2162 2163 /* 2164 * Teardown the zfsvfs::z_os. 2165 * 2166 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' 2167 * and 'z_teardown_inactive_lock' held. 2168 */ 2169 static int 2170 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) 2171 { 2172 znode_t *zp; 2173 2174 zfs_unlinked_drain_stop_wait(zfsvfs); 2175 2176 rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); 2177 2178 if (!unmounting) { 2179 /* 2180 * We purge the parent filesystem's vfsp as the parent 2181 * filesystem and all of its snapshots have their vnode's 2182 * v_vfsp set to the parent's filesystem's vfsp. Note, 2183 * 'z_parent' is self referential for non-snapshots. 2184 */ 2185 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); 2186 } 2187 2188 /* 2189 * Close the zil. NB: Can't close the zil while zfs_inactive 2190 * threads are blocked as zil_close can call zfs_inactive. 2191 */ 2192 if (zfsvfs->z_log) { 2193 zil_close(zfsvfs->z_log); 2194 zfsvfs->z_log = NULL; 2195 } 2196 2197 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); 2198 2199 /* 2200 * If we are not unmounting (ie: online recv) and someone already 2201 * unmounted this file system while we were doing the switcheroo, 2202 * or a reopen of z_os failed then just bail out now. 2203 */ 2204 if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { 2205 rw_exit(&zfsvfs->z_teardown_inactive_lock); 2206 rrm_exit(&zfsvfs->z_teardown_lock, FTAG); 2207 return (SET_ERROR(EIO)); 2208 } 2209 2210 /* 2211 * At this point there are no vops active, and any new vops will 2212 * fail with EIO since we have z_teardown_lock for writer (only 2213 * relavent for forced unmount). 2214 * 2215 * Release all holds on dbufs. 2216 */ 2217 mutex_enter(&zfsvfs->z_znodes_lock); 2218 for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; 2219 zp = list_next(&zfsvfs->z_all_znodes, zp)) 2220 if (zp->z_sa_hdl) { 2221 ASSERT(ZTOV(zp)->v_count > 0); 2222 zfs_znode_dmu_fini(zp); 2223 } 2224 mutex_exit(&zfsvfs->z_znodes_lock); 2225 2226 /* 2227 * If we are unmounting, set the unmounted flag and let new vops 2228 * unblock. zfs_inactive will have the unmounted behavior, and all 2229 * other vops will fail with EIO. 2230 */ 2231 if (unmounting) { 2232 /* 2233 * Clear the cached root vnode now that we are unmounted. 2234 * Its release must be performed outside the teardown locks to 2235 * avoid recursive lock entry via zfs_inactive(). 2236 */ 2237 vnode_t *vp = zfsvfs->z_rootdir; 2238 zfsvfs->z_rootdir = NULL; 2239 2240 zfsvfs->z_unmounted = B_TRUE; 2241 rw_exit(&zfsvfs->z_teardown_inactive_lock); 2242 rrm_exit(&zfsvfs->z_teardown_lock, FTAG); 2243 2244 /* Drop the cached root vp now that it is safe */ 2245 VN_RELE(vp); 2246 } 2247 2248 /* 2249 * z_os will be NULL if there was an error in attempting to reopen 2250 * zfsvfs, so just return as the properties had already been 2251 * unregistered and cached data had been evicted before. 2252 */ 2253 if (zfsvfs->z_os == NULL) 2254 return (0); 2255 2256 /* 2257 * Unregister properties. 2258 */ 2259 zfs_unregister_callbacks(zfsvfs); 2260 2261 /* 2262 * Evict cached data 2263 */ 2264 if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) && 2265 !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)) 2266 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); 2267 dmu_objset_evict_dbufs(zfsvfs->z_os); 2268 2269 return (0); 2270 } 2271 2272 /*ARGSUSED*/ 2273 static int 2274 zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) 2275 { 2276 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2277 objset_t *os; 2278 int ret; 2279 2280 ret = secpolicy_fs_unmount(cr, vfsp); 2281 if (ret) { 2282 if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), 2283 ZFS_DELEG_PERM_MOUNT, cr)) 2284 return (ret); 2285 } 2286 2287 /* 2288 * We purge the parent filesystem's vfsp as the parent filesystem 2289 * and all of its snapshots have their vnode's v_vfsp set to the 2290 * parent's filesystem's vfsp. Note, 'z_parent' is self 2291 * referential for non-snapshots. 2292 */ 2293 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); 2294 2295 /* 2296 * Unmount any snapshots mounted under .zfs before unmounting the 2297 * dataset itself. 2298 */ 2299 if (zfsvfs->z_ctldir != NULL && 2300 (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) { 2301 return (ret); 2302 } 2303 2304 if (!(fflag & MS_FORCE)) { 2305 /* 2306 * Check the number of active vnodes in the file system. 2307 * Our count is maintained in the vfs structure, but the 2308 * number is off by 1 to indicate a hold on the vfs 2309 * structure itself. 2310 */ 2311 boolean_t draining; 2312 uint_t thresh = 1; 2313 vnode_t *ctlvp, *rvp; 2314 2315 /* 2316 * The cached vnode for the root directory of the mount also 2317 * maintains a hold on the vfs structure. 2318 */ 2319 rvp = zfsvfs->z_rootdir; 2320 thresh++; 2321 2322 /* 2323 * The '.zfs' directory maintains a reference of its own, and 2324 * any active references underneath are reflected in the vnode 2325 * count. Allow one additional reference for it. 2326 */ 2327 ctlvp = zfsvfs->z_ctldir; 2328 if (ctlvp != NULL) { 2329 thresh++; 2330 } 2331 2332 /* 2333 * If it's running, the asynchronous unlinked drain task needs 2334 * to be stopped before the number of active vnodes can be 2335 * reliably checked. 2336 */ 2337 draining = zfsvfs->z_draining; 2338 if (draining) 2339 zfs_unlinked_drain_stop_wait(zfsvfs); 2340 2341 if (vfsp->vfs_count > thresh || rvp->v_count > 1 || 2342 (ctlvp != NULL && ctlvp->v_count > 1)) { 2343 if (draining) { 2344 /* If it was draining, restart the task */ 2345 zfs_unlinked_drain(zfsvfs); 2346 } 2347 return (SET_ERROR(EBUSY)); 2348 } 2349 } 2350 2351 vfsp->vfs_flag |= VFS_UNMOUNTED; 2352 2353 VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); 2354 os = zfsvfs->z_os; 2355 2356 /* 2357 * z_os will be NULL if there was an error in 2358 * attempting to reopen zfsvfs. 2359 */ 2360 if (os != NULL) { 2361 /* 2362 * Unset the objset user_ptr. 2363 */ 2364 mutex_enter(&os->os_user_ptr_lock); 2365 dmu_objset_set_user(os, NULL); 2366 mutex_exit(&os->os_user_ptr_lock); 2367 2368 /* 2369 * Finally release the objset 2370 */ 2371 dmu_objset_disown(os, B_TRUE, zfsvfs); 2372 } 2373 2374 /* 2375 * We can now safely destroy the '.zfs' directory node. 2376 */ 2377 if (zfsvfs->z_ctldir != NULL) 2378 zfsctl_destroy(zfsvfs); 2379 2380 return (0); 2381 } 2382 2383 static int 2384 zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) 2385 { 2386 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2387 znode_t *zp; 2388 uint64_t object = 0; 2389 uint64_t fid_gen = 0; 2390 uint64_t gen_mask; 2391 uint64_t zp_gen; 2392 int i, err; 2393 2394 *vpp = NULL; 2395 2396 ZFS_ENTER(zfsvfs); 2397 2398 if (fidp->fid_len == LONG_FID_LEN) { 2399 zfid_long_t *zlfid = (zfid_long_t *)fidp; 2400 uint64_t objsetid = 0; 2401 uint64_t setgen = 0; 2402 2403 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 2404 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); 2405 2406 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 2407 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); 2408 2409 ZFS_EXIT(zfsvfs); 2410 2411 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); 2412 if (err) 2413 return (SET_ERROR(EINVAL)); 2414 ZFS_ENTER(zfsvfs); 2415 } 2416 2417 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { 2418 zfid_short_t *zfid = (zfid_short_t *)fidp; 2419 2420 for (i = 0; i < sizeof (zfid->zf_object); i++) 2421 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); 2422 2423 for (i = 0; i < sizeof (zfid->zf_gen); i++) 2424 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); 2425 } else { 2426 ZFS_EXIT(zfsvfs); 2427 return (SET_ERROR(EINVAL)); 2428 } 2429 2430 /* A zero fid_gen means we are in the .zfs control directories */ 2431 if (fid_gen == 0 && 2432 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { 2433 *vpp = zfsvfs->z_ctldir; 2434 ASSERT(*vpp != NULL); 2435 if (object == ZFSCTL_INO_SNAPDIR) { 2436 VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, 2437 0, NULL, NULL, NULL, NULL, NULL) == 0); 2438 } else { 2439 VN_HOLD(*vpp); 2440 } 2441 ZFS_EXIT(zfsvfs); 2442 return (0); 2443 } 2444 2445 gen_mask = -1ULL >> (64 - 8 * i); 2446 2447 dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); 2448 if (err = zfs_zget(zfsvfs, object, &zp)) { 2449 ZFS_EXIT(zfsvfs); 2450 return (err); 2451 } 2452 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, 2453 sizeof (uint64_t)); 2454 zp_gen = zp_gen & gen_mask; 2455 if (zp_gen == 0) 2456 zp_gen = 1; 2457 if (zp->z_unlinked || zp_gen != fid_gen) { 2458 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); 2459 VN_RELE(ZTOV(zp)); 2460 ZFS_EXIT(zfsvfs); 2461 return (SET_ERROR(EINVAL)); 2462 } 2463 2464 *vpp = ZTOV(zp); 2465 ZFS_EXIT(zfsvfs); 2466 return (0); 2467 } 2468 2469 /* 2470 * Block out VOPs and close zfsvfs_t::z_os 2471 * 2472 * Note, if successful, then we return with the 'z_teardown_lock' and 2473 * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying 2474 * dataset and objset intact so that they can be atomically handed off during 2475 * a subsequent rollback or recv operation and the resume thereafter. 2476 */ 2477 int 2478 zfs_suspend_fs(zfsvfs_t *zfsvfs) 2479 { 2480 int error; 2481 2482 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) 2483 return (error); 2484 2485 return (0); 2486 } 2487 2488 /* 2489 * Rebuild SA and release VOPs. Note that ownership of the underlying dataset 2490 * is an invariant across any of the operations that can be performed while the 2491 * filesystem was suspended. Whether it succeeded or failed, the preconditions 2492 * are the same: the relevant objset and associated dataset are owned by 2493 * zfsvfs, held, and long held on entry. 2494 */ 2495 int 2496 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) 2497 { 2498 int err; 2499 znode_t *zp; 2500 2501 ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); 2502 ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); 2503 2504 /* 2505 * We already own this, so just update the objset_t, as the one we 2506 * had before may have been evicted. 2507 */ 2508 objset_t *os; 2509 VERIFY3P(ds->ds_owner, ==, zfsvfs); 2510 VERIFY(dsl_dataset_long_held(ds)); 2511 VERIFY0(dmu_objset_from_ds(ds, &os)); 2512 2513 err = zfsvfs_init(zfsvfs, os); 2514 if (err != 0) 2515 goto bail; 2516 2517 VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); 2518 2519 zfs_set_fuid_feature(zfsvfs); 2520 2521 /* 2522 * Attempt to re-establish all the active znodes with 2523 * their dbufs. If a zfs_rezget() fails, then we'll let 2524 * any potential callers discover that via ZFS_ENTER_VERIFY_VP 2525 * when they try to use their znode. 2526 */ 2527 mutex_enter(&zfsvfs->z_znodes_lock); 2528 for (zp = list_head(&zfsvfs->z_all_znodes); zp; 2529 zp = list_next(&zfsvfs->z_all_znodes, zp)) { 2530 (void) zfs_rezget(zp); 2531 } 2532 mutex_exit(&zfsvfs->z_znodes_lock); 2533 2534 if (((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) && 2535 !zfsvfs->z_unmounted) { 2536 /* 2537 * zfs_suspend_fs() could have interrupted freeing 2538 * of dnodes. We need to restart this freeing so 2539 * that we don't "leak" the space. 2540 */ 2541 zfs_unlinked_drain(zfsvfs); 2542 } 2543 2544 bail: 2545 /* release the VOPs */ 2546 rw_exit(&zfsvfs->z_teardown_inactive_lock); 2547 rrm_exit(&zfsvfs->z_teardown_lock, FTAG); 2548 2549 if (err) { 2550 /* 2551 * Since we couldn't setup the sa framework, try to force 2552 * unmount this file system. 2553 */ 2554 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) 2555 (void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED()); 2556 } 2557 return (err); 2558 } 2559 2560 static void 2561 zfs_freevfs(vfs_t *vfsp) 2562 { 2563 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2564 2565 /* 2566 * If this is a snapshot, we have an extra VFS_HOLD on our parent 2567 * from zfs_mount(). Release it here. If we came through 2568 * zfs_mountroot() instead, we didn't grab an extra hold, so 2569 * skip the VFS_RELE for rootvfs. 2570 */ 2571 if (zfsvfs->z_issnap && (vfsp != rootvfs)) 2572 VFS_RELE(zfsvfs->z_parent->z_vfs); 2573 2574 zfsvfs_free(zfsvfs); 2575 2576 atomic_dec_32(&zfs_active_fs_count); 2577 } 2578 2579 /* 2580 * VFS_INIT() initialization. Note that there is no VFS_FINI(), 2581 * so we can't safely do any non-idempotent initialization here. 2582 * Leave that to zfs_init() and zfs_fini(), which are called 2583 * from the module's _init() and _fini() entry points. 2584 */ 2585 /*ARGSUSED*/ 2586 static int 2587 zfs_vfsinit(int fstype, char *name) 2588 { 2589 int error; 2590 2591 zfsfstype = fstype; 2592 2593 /* 2594 * Setup vfsops and vnodeops tables. 2595 */ 2596 error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops); 2597 if (error != 0) { 2598 cmn_err(CE_WARN, "zfs: bad vfs ops template"); 2599 } 2600 2601 error = zfs_create_op_tables(); 2602 if (error) { 2603 zfs_remove_op_tables(); 2604 cmn_err(CE_WARN, "zfs: bad vnode ops template"); 2605 (void) vfs_freevfsops_by_type(zfsfstype); 2606 return (error); 2607 } 2608 2609 mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 2610 2611 /* 2612 * Unique major number for all zfs mounts. 2613 * If we run out of 32-bit minors, we'll getudev() another major. 2614 */ 2615 zfs_major = ddi_name_to_major(ZFS_DRIVER); 2616 zfs_minor = ZFS_MIN_MINOR; 2617 2618 return (0); 2619 } 2620 2621 void 2622 zfs_init(void) 2623 { 2624 /* 2625 * Initialize .zfs directory structures 2626 */ 2627 zfsctl_init(); 2628 2629 /* 2630 * Initialize znode cache, vnode ops, etc... 2631 */ 2632 zfs_znode_init(); 2633 2634 dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); 2635 } 2636 2637 void 2638 zfs_fini(void) 2639 { 2640 zfsctl_fini(); 2641 zfs_znode_fini(); 2642 } 2643 2644 int 2645 zfs_busy(void) 2646 { 2647 return (zfs_active_fs_count != 0); 2648 } 2649 2650 int 2651 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) 2652 { 2653 int error; 2654 objset_t *os = zfsvfs->z_os; 2655 dmu_tx_t *tx; 2656 2657 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) 2658 return (SET_ERROR(EINVAL)); 2659 2660 if (newvers < zfsvfs->z_version) 2661 return (SET_ERROR(EINVAL)); 2662 2663 if (zfs_spa_version_map(newvers) > 2664 spa_version(dmu_objset_spa(zfsvfs->z_os))) 2665 return (SET_ERROR(ENOTSUP)); 2666 2667 tx = dmu_tx_create(os); 2668 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); 2669 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { 2670 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, 2671 ZFS_SA_ATTRS); 2672 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 2673 } 2674 error = dmu_tx_assign(tx, TXG_WAIT); 2675 if (error) { 2676 dmu_tx_abort(tx); 2677 return (error); 2678 } 2679 2680 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 2681 8, 1, &newvers, tx); 2682 2683 if (error) { 2684 dmu_tx_commit(tx); 2685 return (error); 2686 } 2687 2688 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { 2689 uint64_t sa_obj; 2690 2691 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, 2692 SPA_VERSION_SA); 2693 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, 2694 DMU_OT_NONE, 0, tx); 2695 2696 error = zap_add(os, MASTER_NODE_OBJ, 2697 ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); 2698 ASSERT0(error); 2699 2700 VERIFY(0 == sa_set_sa_object(os, sa_obj)); 2701 sa_register_update_callback(os, zfs_sa_upgrade); 2702 } 2703 2704 spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, 2705 "from %llu to %llu", zfsvfs->z_version, newvers); 2706 2707 dmu_tx_commit(tx); 2708 2709 zfsvfs->z_version = newvers; 2710 os->os_version = newvers; 2711 2712 zfs_set_fuid_feature(zfsvfs); 2713 2714 return (0); 2715 } 2716 2717 /* 2718 * Read a property stored within the master node. 2719 */ 2720 int 2721 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) 2722 { 2723 uint64_t *cached_copy = NULL; 2724 2725 /* 2726 * Figure out where in the objset_t the cached copy would live, if it 2727 * is available for the requested property. 2728 */ 2729 if (os != NULL) { 2730 switch (prop) { 2731 case ZFS_PROP_VERSION: 2732 cached_copy = &os->os_version; 2733 break; 2734 case ZFS_PROP_NORMALIZE: 2735 cached_copy = &os->os_normalization; 2736 break; 2737 case ZFS_PROP_UTF8ONLY: 2738 cached_copy = &os->os_utf8only; 2739 break; 2740 case ZFS_PROP_CASE: 2741 cached_copy = &os->os_casesensitivity; 2742 break; 2743 default: 2744 break; 2745 } 2746 } 2747 if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { 2748 *value = *cached_copy; 2749 return (0); 2750 } 2751 2752 /* 2753 * If the property wasn't cached, look up the file system's value for 2754 * the property. For the version property, we look up a slightly 2755 * different string. 2756 */ 2757 const char *pname; 2758 int error = ENOENT; 2759 if (prop == ZFS_PROP_VERSION) { 2760 pname = ZPL_VERSION_STR; 2761 } else { 2762 pname = zfs_prop_to_name(prop); 2763 } 2764 2765 if (os != NULL) { 2766 ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); 2767 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); 2768 } 2769 2770 if (error == ENOENT) { 2771 /* No value set, use the default value */ 2772 switch (prop) { 2773 case ZFS_PROP_VERSION: 2774 *value = ZPL_VERSION; 2775 break; 2776 case ZFS_PROP_NORMALIZE: 2777 case ZFS_PROP_UTF8ONLY: 2778 *value = 0; 2779 break; 2780 case ZFS_PROP_CASE: 2781 *value = ZFS_CASE_SENSITIVE; 2782 break; 2783 default: 2784 return (error); 2785 } 2786 error = 0; 2787 } 2788 2789 /* 2790 * If one of the methods for getting the property value above worked, 2791 * copy it into the objset_t's cache. 2792 */ 2793 if (error == 0 && cached_copy != NULL) { 2794 *cached_copy = *value; 2795 } 2796 2797 return (error); 2798 } 2799 2800 /* 2801 * Return true if the coresponding vfs's unmounted flag is set. 2802 * Otherwise return false. 2803 * If this function returns true we know VFS unmount has been initiated. 2804 */ 2805 boolean_t 2806 zfs_get_vfs_flag_unmounted(objset_t *os) 2807 { 2808 zfsvfs_t *zfvp; 2809 boolean_t unmounted = B_FALSE; 2810 2811 ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); 2812 2813 mutex_enter(&os->os_user_ptr_lock); 2814 zfvp = dmu_objset_get_user(os); 2815 if (zfvp != NULL && zfvp->z_vfs != NULL && 2816 (zfvp->z_vfs->vfs_flag & VFS_UNMOUNTED)) 2817 unmounted = B_TRUE; 2818 mutex_exit(&os->os_user_ptr_lock); 2819 2820 return (unmounted); 2821 } 2822 2823 /* 2824 * Takes a dataset, a property, a value and that value's setpoint as 2825 * found in the ZAP. Checks if the property has been changed in the vfs. 2826 * If so, val and setpoint will be overwritten with updated content. 2827 * Otherwise, they are left unchanged. 2828 * 2829 * OpenZFS moved it to os specific zfs_vfsops.c, we keep it here for now. 2830 */ 2831 int 2832 zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, 2833 char *setpoint) 2834 { 2835 int error; 2836 zfsvfs_t *zfvp; 2837 vfs_t *vfsp; 2838 objset_t *os; 2839 uint64_t tmp = *val; 2840 2841 error = dmu_objset_from_ds(ds, &os); 2842 if (error != 0) 2843 return (error); 2844 2845 error = getzfsvfs_impl(os, &zfvp); 2846 if (error != 0) 2847 return (error); 2848 2849 vfsp = zfvp->z_vfs; 2850 2851 switch (zfs_prop) { 2852 case ZFS_PROP_ATIME: 2853 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) 2854 tmp = 0; 2855 if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) 2856 tmp = 1; 2857 break; 2858 case ZFS_PROP_DEVICES: 2859 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) 2860 tmp = 0; 2861 if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) 2862 tmp = 1; 2863 break; 2864 case ZFS_PROP_EXEC: 2865 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) 2866 tmp = 0; 2867 if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) 2868 tmp = 1; 2869 break; 2870 case ZFS_PROP_SETUID: 2871 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) 2872 tmp = 0; 2873 if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) 2874 tmp = 1; 2875 break; 2876 case ZFS_PROP_READONLY: 2877 if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) 2878 tmp = 0; 2879 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) 2880 tmp = 1; 2881 break; 2882 case ZFS_PROP_XATTR: 2883 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) 2884 tmp = 0; 2885 if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) 2886 tmp = 1; 2887 break; 2888 case ZFS_PROP_NBMAND: 2889 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) 2890 tmp = 0; 2891 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) 2892 tmp = 1; 2893 break; 2894 default: 2895 VFS_RELE(vfsp); 2896 return (ENOENT); 2897 } 2898 2899 VFS_RELE(vfsp); 2900 if (tmp != *val) { 2901 if (setpoint != NULL) 2902 (void) strcpy(setpoint, "temporary"); 2903 *val = tmp; 2904 } 2905 return (0); 2906 } 2907 2908 static vfsdef_t vfw = { 2909 VFSDEF_VERSION, 2910 MNTTYPE_ZFS, 2911 zfs_vfsinit, 2912 VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS| 2913 VSW_XID|VSW_ZMOUNT, 2914 &zfs_mntopts 2915 }; 2916 2917 struct modlfs zfs_modlfs = { 2918 &mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw 2919 }; 2920