1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 24 * Copyright (c) 2014 Integros [integros.com] 25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 26 * Copyright 2019 Joyent, Inc. 27 * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org> 28 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. 29 * Copyright 2022 Oxide Computer Company 30 */ 31 32 /* Portions Copyright 2010 Robert Milkowski */ 33 34 #include <sys/types.h> 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/sysmacros.h> 38 #include <sys/kmem.h> 39 #include <sys/pathname.h> 40 #include <sys/vnode.h> 41 #include <sys/vfs.h> 42 #include <sys/vfs_opreg.h> 43 #include <sys/mntent.h> 44 #include <sys/mount.h> 45 #include <sys/cmn_err.h> 46 #include "fs/fs_subr.h" 47 #include <sys/zfs_znode.h> 48 #include <sys/zfs_dir.h> 49 #include <sys/zil.h> 50 #include <sys/fs/zfs.h> 51 #include <sys/dmu.h> 52 #include <sys/dsl_prop.h> 53 #include <sys/dsl_dataset.h> 54 #include <sys/dsl_deleg.h> 55 #include <sys/spa.h> 56 #include <sys/zap.h> 57 #include <sys/sa.h> 58 #include <sys/sa_impl.h> 59 #include <sys/varargs.h> 60 #include <sys/policy.h> 61 #include <sys/atomic.h> 62 #include <sys/mkdev.h> 63 #include <sys/modctl.h> 64 #include <sys/refstr.h> 65 #include <sys/zfs_ioctl.h> 66 #include <sys/zfs_ctldir.h> 67 #include <sys/zfs_fuid.h> 68 #include <sys/bootconf.h> 69 #include <sys/ddi.h> 70 #include <sys/sunddi.h> 71 #include <sys/dnlc.h> 72 #include <sys/dmu_objset.h> 73 #include <sys/spa_boot.h> 74 #include <sys/vdev_impl.h> 75 #include "zfs_comutil.h" 76 77 int zfsfstype; 78 vfsops_t *zfs_vfsops = NULL; 79 static major_t zfs_major; 80 static minor_t zfs_minor; 81 static kmutex_t zfs_dev_mtx; 82 83 extern int sys_shutdown; 84 85 static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr); 86 static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr); 87 static int zfs_mountroot(vfs_t *vfsp, enum whymountroot); 88 static int zfs_root(vfs_t *vfsp, vnode_t **vpp); 89 static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp); 90 static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp); 91 static void zfs_freevfs(vfs_t *vfsp); 92 93 static const fs_operation_def_t zfs_vfsops_template[] = { 94 VFSNAME_MOUNT, { .vfs_mount = zfs_mount }, 95 VFSNAME_MOUNTROOT, { .vfs_mountroot = zfs_mountroot }, 96 VFSNAME_UNMOUNT, { .vfs_unmount = zfs_umount }, 97 VFSNAME_ROOT, { .vfs_root = zfs_root }, 98 VFSNAME_STATVFS, { .vfs_statvfs = zfs_statvfs }, 99 VFSNAME_SYNC, { .vfs_sync = zfs_sync }, 100 VFSNAME_VGET, { .vfs_vget = zfs_vget }, 101 VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs }, 102 NULL, NULL 103 }; 104 105 /* 106 * We need to keep a count of active fs's. 107 * This is necessary to prevent our module 108 * from being unloaded after a umount -f 109 */ 110 static uint32_t zfs_active_fs_count = 0; 111 112 static char *noatime_cancel[] = { MNTOPT_ATIME, NULL }; 113 static char *atime_cancel[] = { MNTOPT_NOATIME, NULL }; 114 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL }; 115 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL }; 116 117 /* 118 * MO_DEFAULT is not used since the default value is determined 119 * by the equivalent property. 120 */ 121 static mntopt_t mntopts[] = { 122 { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL }, 123 { MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL }, 124 { MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL }, 125 { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL } 126 }; 127 128 static mntopts_t zfs_mntopts = { 129 sizeof (mntopts) / sizeof (mntopt_t), 130 mntopts 131 }; 132 133 /*ARGSUSED*/ 134 int 135 zfs_sync(vfs_t *vfsp, short flag, cred_t *cr) 136 { 137 /* 138 * Data integrity is job one. We don't want a compromised kernel 139 * writing to the storage pool, so we never sync during panic. 140 */ 141 if (panicstr) 142 return (0); 143 144 /* 145 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS 146 * to sync metadata, which they would otherwise cache indefinitely. 147 * Semantically, the only requirement is that the sync be initiated. 148 * The DMU syncs out txgs frequently, so there's nothing to do. 149 */ 150 if (flag & SYNC_ATTR) 151 return (0); 152 153 if (vfsp != NULL) { 154 /* 155 * Sync a specific filesystem. 156 */ 157 zfsvfs_t *zfsvfs = vfsp->vfs_data; 158 dsl_pool_t *dp; 159 160 ZFS_ENTER(zfsvfs); 161 dp = dmu_objset_pool(zfsvfs->z_os); 162 163 /* 164 * If the system is shutting down, then skip any 165 * filesystems which may exist on a suspended pool. 166 */ 167 if (sys_shutdown && spa_suspended(dp->dp_spa)) { 168 ZFS_EXIT(zfsvfs); 169 return (0); 170 } 171 172 if (zfsvfs->z_log != NULL) 173 zil_commit(zfsvfs->z_log, 0); 174 175 ZFS_EXIT(zfsvfs); 176 } else { 177 /* 178 * Sync all ZFS filesystems. This is what happens when you 179 * run sync(8). Unlike other filesystems, ZFS honors the 180 * request by waiting for all pools to commit all dirty data. 181 */ 182 spa_sync_allpools(); 183 } 184 185 return (0); 186 } 187 188 static int 189 zfs_create_unique_device(dev_t *dev) 190 { 191 major_t new_major; 192 193 do { 194 ASSERT3U(zfs_minor, <=, MAXMIN32); 195 minor_t start = zfs_minor; 196 do { 197 mutex_enter(&zfs_dev_mtx); 198 if (zfs_minor >= MAXMIN32) { 199 /* 200 * If we're still using the real major 201 * keep out of /dev/zfs and /dev/zvol minor 202 * number space. If we're using a getudev()'ed 203 * major number, we can use all of its minors. 204 */ 205 if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) 206 zfs_minor = ZFS_MIN_MINOR; 207 else 208 zfs_minor = 0; 209 } else { 210 zfs_minor++; 211 } 212 *dev = makedevice(zfs_major, zfs_minor); 213 mutex_exit(&zfs_dev_mtx); 214 } while (vfs_devismounted(*dev) && zfs_minor != start); 215 if (zfs_minor == start) { 216 /* 217 * We are using all ~262,000 minor numbers for the 218 * current major number. Create a new major number. 219 */ 220 if ((new_major = getudev()) == (major_t)-1) { 221 cmn_err(CE_WARN, 222 "zfs_mount: Can't get unique major " 223 "device number."); 224 return (-1); 225 } 226 mutex_enter(&zfs_dev_mtx); 227 zfs_major = new_major; 228 zfs_minor = 0; 229 230 mutex_exit(&zfs_dev_mtx); 231 } else { 232 break; 233 } 234 /* CONSTANTCONDITION */ 235 } while (1); 236 237 return (0); 238 } 239 240 static void 241 atime_changed_cb(void *arg, uint64_t newval) 242 { 243 zfsvfs_t *zfsvfs = arg; 244 245 if (newval == TRUE) { 246 zfsvfs->z_atime = TRUE; 247 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); 248 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); 249 } else { 250 zfsvfs->z_atime = FALSE; 251 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); 252 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); 253 } 254 } 255 256 static void 257 xattr_changed_cb(void *arg, uint64_t newval) 258 { 259 zfsvfs_t *zfsvfs = arg; 260 261 if (newval == TRUE) { 262 /* XXX locking on vfs_flag? */ 263 zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; 264 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); 265 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); 266 } else { 267 /* XXX locking on vfs_flag? */ 268 zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; 269 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); 270 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); 271 } 272 } 273 274 static void 275 blksz_changed_cb(void *arg, uint64_t newval) 276 { 277 zfsvfs_t *zfsvfs = arg; 278 ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); 279 ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); 280 ASSERT(ISP2(newval)); 281 282 zfsvfs->z_max_blksz = newval; 283 zfsvfs->z_vfs->vfs_bsize = newval; 284 } 285 286 static void 287 readonly_changed_cb(void *arg, uint64_t newval) 288 { 289 zfsvfs_t *zfsvfs = arg; 290 291 if (newval) { 292 /* XXX locking on vfs_flag? */ 293 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; 294 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); 295 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); 296 } else { 297 /* XXX locking on vfs_flag? */ 298 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 299 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); 300 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); 301 } 302 } 303 304 static void 305 devices_changed_cb(void *arg, uint64_t newval) 306 { 307 zfsvfs_t *zfsvfs = arg; 308 309 if (newval == FALSE) { 310 zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES; 311 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES); 312 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0); 313 } else { 314 zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES; 315 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES); 316 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0); 317 } 318 } 319 320 static void 321 setuid_changed_cb(void *arg, uint64_t newval) 322 { 323 zfsvfs_t *zfsvfs = arg; 324 325 if (newval == FALSE) { 326 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; 327 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); 328 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); 329 } else { 330 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; 331 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); 332 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); 333 } 334 } 335 336 static void 337 exec_changed_cb(void *arg, uint64_t newval) 338 { 339 zfsvfs_t *zfsvfs = arg; 340 341 if (newval == FALSE) { 342 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; 343 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); 344 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); 345 } else { 346 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; 347 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); 348 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); 349 } 350 } 351 352 /* 353 * The nbmand mount option can be changed at mount time. 354 * We can't allow it to be toggled on live file systems or incorrect 355 * behavior may be seen from cifs clients 356 * 357 * This property isn't registered via dsl_prop_register(), but this callback 358 * will be called when a file system is first mounted 359 */ 360 static void 361 nbmand_changed_cb(void *arg, uint64_t newval) 362 { 363 zfsvfs_t *zfsvfs = arg; 364 if (newval == FALSE) { 365 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); 366 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); 367 } else { 368 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); 369 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); 370 } 371 } 372 373 static void 374 snapdir_changed_cb(void *arg, uint64_t newval) 375 { 376 zfsvfs_t *zfsvfs = arg; 377 378 zfsvfs->z_show_ctldir = newval; 379 } 380 381 static void 382 vscan_changed_cb(void *arg, uint64_t newval) 383 { 384 zfsvfs_t *zfsvfs = arg; 385 386 zfsvfs->z_vscan = newval; 387 } 388 389 static void 390 acl_mode_changed_cb(void *arg, uint64_t newval) 391 { 392 zfsvfs_t *zfsvfs = arg; 393 394 zfsvfs->z_acl_mode = newval; 395 } 396 397 static void 398 acl_inherit_changed_cb(void *arg, uint64_t newval) 399 { 400 zfsvfs_t *zfsvfs = arg; 401 402 zfsvfs->z_acl_inherit = newval; 403 } 404 405 static int 406 zfs_register_callbacks(vfs_t *vfsp) 407 { 408 struct dsl_dataset *ds = NULL; 409 objset_t *os = NULL; 410 zfsvfs_t *zfsvfs = NULL; 411 uint64_t nbmand; 412 boolean_t readonly = B_FALSE; 413 boolean_t do_readonly = B_FALSE; 414 boolean_t setuid = B_FALSE; 415 boolean_t do_setuid = B_FALSE; 416 boolean_t exec = B_FALSE; 417 boolean_t do_exec = B_FALSE; 418 boolean_t devices = B_FALSE; 419 boolean_t do_devices = B_FALSE; 420 boolean_t xattr = B_FALSE; 421 boolean_t do_xattr = B_FALSE; 422 boolean_t atime = B_FALSE; 423 boolean_t do_atime = B_FALSE; 424 int error = 0; 425 426 ASSERT(vfsp); 427 zfsvfs = vfsp->vfs_data; 428 ASSERT(zfsvfs); 429 os = zfsvfs->z_os; 430 431 /* 432 * The act of registering our callbacks will destroy any mount 433 * options we may have. In order to enable temporary overrides 434 * of mount options, we stash away the current values and 435 * restore them after we register the callbacks. 436 */ 437 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || 438 !spa_writeable(dmu_objset_spa(os))) { 439 readonly = B_TRUE; 440 do_readonly = B_TRUE; 441 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { 442 readonly = B_FALSE; 443 do_readonly = B_TRUE; 444 } 445 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { 446 devices = B_FALSE; 447 setuid = B_FALSE; 448 do_devices = B_TRUE; 449 do_setuid = B_TRUE; 450 } else { 451 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) { 452 devices = B_FALSE; 453 do_devices = B_TRUE; 454 } else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) { 455 devices = B_TRUE; 456 do_devices = B_TRUE; 457 } 458 459 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { 460 setuid = B_FALSE; 461 do_setuid = B_TRUE; 462 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { 463 setuid = B_TRUE; 464 do_setuid = B_TRUE; 465 } 466 } 467 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { 468 exec = B_FALSE; 469 do_exec = B_TRUE; 470 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { 471 exec = B_TRUE; 472 do_exec = B_TRUE; 473 } 474 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { 475 xattr = B_FALSE; 476 do_xattr = B_TRUE; 477 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { 478 xattr = B_TRUE; 479 do_xattr = B_TRUE; 480 } 481 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { 482 atime = B_FALSE; 483 do_atime = B_TRUE; 484 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { 485 atime = B_TRUE; 486 do_atime = B_TRUE; 487 } 488 489 /* 490 * nbmand is a special property. It can only be changed at 491 * mount time. 492 * 493 * This is weird, but it is documented to only be changeable 494 * at mount time. 495 */ 496 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { 497 nbmand = B_FALSE; 498 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { 499 nbmand = B_TRUE; 500 } else { 501 char osname[ZFS_MAX_DATASET_NAME_LEN]; 502 503 dmu_objset_name(os, osname); 504 if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand, 505 NULL)) { 506 return (error); 507 } 508 } 509 510 /* 511 * Register property callbacks. 512 * 513 * It would probably be fine to just check for i/o error from 514 * the first prop_register(), but I guess I like to go 515 * overboard... 516 */ 517 ds = dmu_objset_ds(os); 518 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 519 error = dsl_prop_register(ds, 520 zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); 521 error = error ? error : dsl_prop_register(ds, 522 zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); 523 error = error ? error : dsl_prop_register(ds, 524 zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); 525 error = error ? error : dsl_prop_register(ds, 526 zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); 527 error = error ? error : dsl_prop_register(ds, 528 zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); 529 error = error ? error : dsl_prop_register(ds, 530 zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); 531 error = error ? error : dsl_prop_register(ds, 532 zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); 533 error = error ? error : dsl_prop_register(ds, 534 zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); 535 error = error ? error : dsl_prop_register(ds, 536 zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); 537 error = error ? error : dsl_prop_register(ds, 538 zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, 539 zfsvfs); 540 error = error ? error : dsl_prop_register(ds, 541 zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); 542 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 543 if (error) 544 goto unregister; 545 546 /* 547 * Invoke our callbacks to restore temporary mount options. 548 */ 549 if (do_readonly) 550 readonly_changed_cb(zfsvfs, readonly); 551 if (do_setuid) 552 setuid_changed_cb(zfsvfs, setuid); 553 if (do_exec) 554 exec_changed_cb(zfsvfs, exec); 555 if (do_devices) 556 devices_changed_cb(zfsvfs, devices); 557 if (do_xattr) 558 xattr_changed_cb(zfsvfs, xattr); 559 if (do_atime) 560 atime_changed_cb(zfsvfs, atime); 561 562 nbmand_changed_cb(zfsvfs, nbmand); 563 564 return (0); 565 566 unregister: 567 dsl_prop_unregister_all(ds, zfsvfs); 568 return (error); 569 } 570 571 static int 572 zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, 573 uint64_t *userp, uint64_t *groupp, uint64_t *projectp) 574 { 575 sa_hdr_phys_t sa; 576 sa_hdr_phys_t *sap = data; 577 uint64_t flags; 578 int hdrsize; 579 boolean_t swap = B_FALSE; 580 581 /* 582 * Is it a valid type of object to track? 583 */ 584 if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) 585 return (SET_ERROR(ENOENT)); 586 587 /* 588 * If we have a NULL data pointer 589 * then assume the id's aren't changing and 590 * return EEXIST to the dmu to let it know to 591 * use the same ids 592 */ 593 if (data == NULL) 594 return (SET_ERROR(EEXIST)); 595 596 if (bonustype == DMU_OT_ZNODE) { 597 znode_phys_t *znp = data; 598 *userp = znp->zp_uid; 599 *groupp = znp->zp_gid; 600 *projectp = ZFS_DEFAULT_PROJID; 601 return (0); 602 } 603 604 if (sap->sa_magic == 0) { 605 /* 606 * This should only happen for newly created files 607 * that haven't had the znode data filled in yet. 608 */ 609 *userp = 0; 610 *groupp = 0; 611 *projectp = ZFS_DEFAULT_PROJID; 612 return (0); 613 } 614 615 sa = *sap; 616 if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { 617 sa.sa_magic = SA_MAGIC; 618 sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); 619 swap = B_TRUE; 620 } else { 621 VERIFY3U(sa.sa_magic, ==, SA_MAGIC); 622 } 623 624 hdrsize = sa_hdrsize(&sa); 625 VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); 626 627 *userp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_UID_OFFSET)); 628 *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_GID_OFFSET)); 629 flags = *((uint64_t *)((uintptr_t)data + hdrsize + SA_FLAGS_OFFSET)); 630 if (swap) 631 flags = BSWAP_64(flags); 632 633 if (flags & ZFS_PROJID) 634 *projectp = *((uint64_t *)((uintptr_t)data + hdrsize + 635 SA_PROJID_OFFSET)); 636 else 637 *projectp = ZFS_DEFAULT_PROJID; 638 639 if (swap) { 640 *userp = BSWAP_64(*userp); 641 *groupp = BSWAP_64(*groupp); 642 *projectp = BSWAP_64(*projectp); 643 } 644 return (0); 645 } 646 647 static void 648 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, 649 char *domainbuf, int buflen, uid_t *ridp) 650 { 651 uint64_t fuid; 652 const char *domain; 653 654 fuid = zfs_strtonum(fuidstr, NULL); 655 656 domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); 657 if (domain) 658 (void) strlcpy(domainbuf, domain, buflen); 659 else 660 domainbuf[0] = '\0'; 661 *ridp = FUID_RID(fuid); 662 } 663 664 static uint64_t 665 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) 666 { 667 switch (type) { 668 case ZFS_PROP_USERUSED: 669 case ZFS_PROP_USEROBJUSED: 670 return (DMU_USERUSED_OBJECT); 671 case ZFS_PROP_GROUPUSED: 672 case ZFS_PROP_GROUPOBJUSED: 673 return (DMU_GROUPUSED_OBJECT); 674 case ZFS_PROP_PROJECTUSED: 675 case ZFS_PROP_PROJECTOBJUSED: 676 return (DMU_PROJECTUSED_OBJECT); 677 case ZFS_PROP_USERQUOTA: 678 return (zfsvfs->z_userquota_obj); 679 case ZFS_PROP_GROUPQUOTA: 680 return (zfsvfs->z_groupquota_obj); 681 case ZFS_PROP_USEROBJQUOTA: 682 return (zfsvfs->z_userobjquota_obj); 683 case ZFS_PROP_GROUPOBJQUOTA: 684 return (zfsvfs->z_groupobjquota_obj); 685 case ZFS_PROP_PROJECTQUOTA: 686 return (zfsvfs->z_projectquota_obj); 687 case ZFS_PROP_PROJECTOBJQUOTA: 688 return (zfsvfs->z_projectobjquota_obj); 689 default: 690 return (ZFS_NO_OBJECT); 691 } 692 } 693 694 int 695 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 696 uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) 697 { 698 int error; 699 zap_cursor_t zc; 700 zap_attribute_t za; 701 zfs_useracct_t *buf = vbuf; 702 uint64_t obj; 703 int offset = 0; 704 705 if (!dmu_objset_userspace_present(zfsvfs->z_os)) 706 return (SET_ERROR(ENOTSUP)); 707 708 if ((type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED || 709 type == ZFS_PROP_PROJECTOBJQUOTA || 710 type == ZFS_PROP_PROJECTOBJUSED) && 711 !dmu_objset_projectquota_present(zfsvfs->z_os)) 712 return (SET_ERROR(ENOTSUP)); 713 714 if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || 715 type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || 716 type == ZFS_PROP_PROJECTOBJUSED || 717 type == ZFS_PROP_PROJECTOBJQUOTA) && 718 !dmu_objset_userobjspace_present(zfsvfs->z_os)) 719 return (SET_ERROR(ENOTSUP)); 720 721 obj = zfs_userquota_prop_to_obj(zfsvfs, type); 722 if (obj == ZFS_NO_OBJECT) { 723 *bufsizep = 0; 724 return (0); 725 } 726 727 if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || 728 type == ZFS_PROP_PROJECTOBJUSED) 729 offset = DMU_OBJACCT_PREFIX_LEN; 730 731 for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); 732 (error = zap_cursor_retrieve(&zc, &za)) == 0; 733 zap_cursor_advance(&zc)) { 734 if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > 735 *bufsizep) 736 break; 737 738 /* 739 * skip object quota (with zap name prefix DMU_OBJACCT_PREFIX) 740 * when dealing with block quota and vice versa. 741 */ 742 if ((offset > 0) != (strncmp(za.za_name, DMU_OBJACCT_PREFIX, 743 DMU_OBJACCT_PREFIX_LEN) == 0)) 744 continue; 745 746 fuidstr_to_sid(zfsvfs, za.za_name + offset, 747 buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); 748 749 buf->zu_space = za.za_first_integer; 750 buf++; 751 } 752 if (error == ENOENT) 753 error = 0; 754 755 ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); 756 *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; 757 *cookiep = zap_cursor_serialize(&zc); 758 zap_cursor_fini(&zc); 759 return (error); 760 } 761 762 /* 763 * buf must be big enough (eg, 32 bytes) 764 */ 765 static int 766 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, 767 char *buf, boolean_t addok) 768 { 769 uint64_t fuid; 770 int domainid = 0; 771 772 if (domain && domain[0]) { 773 domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); 774 if (domainid == -1) 775 return (SET_ERROR(ENOENT)); 776 } 777 fuid = FUID_ENCODE(domainid, rid); 778 (void) sprintf(buf, "%llx", (longlong_t)fuid); 779 return (0); 780 } 781 782 int 783 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 784 const char *domain, uint64_t rid, uint64_t *valp) 785 { 786 char buf[20 + DMU_OBJACCT_PREFIX_LEN]; 787 int offset = 0; 788 int err; 789 uint64_t obj; 790 791 *valp = 0; 792 793 if (!dmu_objset_userspace_present(zfsvfs->z_os)) 794 return (SET_ERROR(ENOTSUP)); 795 796 if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || 797 type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || 798 type == ZFS_PROP_PROJECTOBJUSED || 799 type == ZFS_PROP_PROJECTOBJQUOTA) && 800 !dmu_objset_userobjspace_present(zfsvfs->z_os)) 801 return (SET_ERROR(ENOTSUP)); 802 803 if (type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED || 804 type == ZFS_PROP_PROJECTOBJQUOTA || 805 type == ZFS_PROP_PROJECTOBJUSED) { 806 if (!dmu_objset_projectquota_present(zfsvfs->z_os)) 807 return (SET_ERROR(ENOTSUP)); 808 if (!zpl_is_valid_projid(rid)) 809 return (SET_ERROR(EINVAL)); 810 } 811 812 obj = zfs_userquota_prop_to_obj(zfsvfs, type); 813 if (obj == ZFS_NO_OBJECT) 814 return (0); 815 816 if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || 817 type == ZFS_PROP_PROJECTOBJUSED) { 818 strncpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN); 819 offset = DMU_OBJACCT_PREFIX_LEN; 820 } 821 822 err = id_to_fuidstr(zfsvfs, domain, rid, buf + offset, B_FALSE); 823 if (err) 824 return (err); 825 826 err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); 827 if (err == ENOENT) 828 err = 0; 829 return (err); 830 } 831 832 int 833 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 834 const char *domain, uint64_t rid, uint64_t quota) 835 { 836 char buf[32]; 837 int err; 838 dmu_tx_t *tx; 839 uint64_t *objp; 840 boolean_t fuid_dirtied; 841 842 if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) 843 return (SET_ERROR(ENOTSUP)); 844 845 switch (type) { 846 case ZFS_PROP_USERQUOTA: 847 objp = &zfsvfs->z_userquota_obj; 848 break; 849 case ZFS_PROP_GROUPQUOTA: 850 objp = &zfsvfs->z_groupquota_obj; 851 break; 852 case ZFS_PROP_USEROBJQUOTA: 853 objp = &zfsvfs->z_userobjquota_obj; 854 break; 855 case ZFS_PROP_GROUPOBJQUOTA: 856 objp = &zfsvfs->z_groupobjquota_obj; 857 break; 858 case ZFS_PROP_PROJECTQUOTA: 859 if (!dmu_objset_projectquota_enabled(zfsvfs->z_os)) 860 return (SET_ERROR(ENOTSUP)); 861 if (!zpl_is_valid_projid(rid)) 862 return (SET_ERROR(EINVAL)); 863 864 objp = &zfsvfs->z_projectquota_obj; 865 break; 866 case ZFS_PROP_PROJECTOBJQUOTA: 867 if (!dmu_objset_projectquota_enabled(zfsvfs->z_os)) 868 return (SET_ERROR(ENOTSUP)); 869 if (!zpl_is_valid_projid(rid)) 870 return (SET_ERROR(EINVAL)); 871 872 objp = &zfsvfs->z_projectobjquota_obj; 873 break; 874 default: 875 return (SET_ERROR(EINVAL)); 876 } 877 878 err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); 879 if (err) 880 return (err); 881 fuid_dirtied = zfsvfs->z_fuid_dirty; 882 883 tx = dmu_tx_create(zfsvfs->z_os); 884 dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); 885 if (*objp == 0) { 886 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, 887 zfs_userquota_prop_prefixes[type]); 888 } 889 if (fuid_dirtied) 890 zfs_fuid_txhold(zfsvfs, tx); 891 err = dmu_tx_assign(tx, TXG_WAIT); 892 if (err) { 893 dmu_tx_abort(tx); 894 return (err); 895 } 896 897 mutex_enter(&zfsvfs->z_lock); 898 if (*objp == 0) { 899 *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, 900 DMU_OT_NONE, 0, tx); 901 VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, 902 zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); 903 } 904 mutex_exit(&zfsvfs->z_lock); 905 906 if (quota == 0) { 907 err = zap_remove(zfsvfs->z_os, *objp, buf, tx); 908 if (err == ENOENT) 909 err = 0; 910 } else { 911 err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); 912 } 913 ASSERT(err == 0); 914 if (fuid_dirtied) 915 zfs_fuid_sync(zfsvfs, tx); 916 dmu_tx_commit(tx); 917 return (err); 918 } 919 920 boolean_t 921 zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) 922 { 923 char buf[20 + DMU_OBJACCT_PREFIX_LEN]; 924 uint64_t used, quota, quotaobj; 925 int err; 926 927 if (!dmu_objset_userobjspace_present(zfsvfs->z_os)) { 928 if (dmu_objset_userobjspace_upgradable(zfsvfs->z_os)) { 929 dsl_pool_config_enter( 930 dmu_objset_pool(zfsvfs->z_os), FTAG); 931 dmu_objset_id_quota_upgrade(zfsvfs->z_os); 932 dsl_pool_config_exit( 933 dmu_objset_pool(zfsvfs->z_os), FTAG); 934 } 935 return (B_FALSE); 936 } 937 938 if (usedobj == DMU_PROJECTUSED_OBJECT) { 939 if (!dmu_objset_projectquota_present(zfsvfs->z_os)) { 940 if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) { 941 dsl_pool_config_enter( 942 dmu_objset_pool(zfsvfs->z_os), FTAG); 943 dmu_objset_id_quota_upgrade(zfsvfs->z_os); 944 dsl_pool_config_exit( 945 dmu_objset_pool(zfsvfs->z_os), FTAG); 946 } 947 return (B_FALSE); 948 } 949 quotaobj = zfsvfs->z_projectobjquota_obj; 950 } else if (usedobj == DMU_USERUSED_OBJECT) { 951 quotaobj = zfsvfs->z_userobjquota_obj; 952 } else if (usedobj == DMU_GROUPUSED_OBJECT) { 953 quotaobj = zfsvfs->z_groupobjquota_obj; 954 } else { 955 return (B_FALSE); 956 } 957 if (quotaobj == 0 || zfsvfs->z_replay) 958 return (B_FALSE); 959 960 (void) sprintf(buf, "%llx", (longlong_t)id); 961 err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); 962 if (err != 0) 963 return (B_FALSE); 964 965 (void) sprintf(buf, DMU_OBJACCT_PREFIX "%llx", (longlong_t)id); 966 err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); 967 if (err != 0) 968 return (B_FALSE); 969 return (used >= quota); 970 } 971 972 boolean_t 973 zfs_id_overblockquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) 974 { 975 char buf[20]; 976 uint64_t used, quota, quotaobj; 977 int err; 978 979 if (usedobj == DMU_PROJECTUSED_OBJECT) { 980 if (!dmu_objset_projectquota_present(zfsvfs->z_os)) { 981 if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) { 982 dsl_pool_config_enter( 983 dmu_objset_pool(zfsvfs->z_os), FTAG); 984 dmu_objset_id_quota_upgrade(zfsvfs->z_os); 985 dsl_pool_config_exit( 986 dmu_objset_pool(zfsvfs->z_os), FTAG); 987 } 988 return (B_FALSE); 989 } 990 quotaobj = zfsvfs->z_projectquota_obj; 991 } else if (usedobj == DMU_USERUSED_OBJECT) { 992 quotaobj = zfsvfs->z_userquota_obj; 993 } else if (usedobj == DMU_GROUPUSED_OBJECT) { 994 quotaobj = zfsvfs->z_groupquota_obj; 995 } else { 996 return (B_FALSE); 997 } 998 if (quotaobj == 0 || zfsvfs->z_replay) 999 return (B_FALSE); 1000 1001 (void) sprintf(buf, "%llx", (longlong_t)id); 1002 err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); 1003 if (err != 0) 1004 return (B_FALSE); 1005 1006 err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); 1007 if (err != 0) 1008 return (B_FALSE); 1009 return (used >= quota); 1010 } 1011 1012 boolean_t 1013 zfs_id_overquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) 1014 { 1015 return (zfs_id_overblockquota(zfsvfs, usedobj, id) || 1016 zfs_id_overobjquota(zfsvfs, usedobj, id)); 1017 } 1018 1019 /* 1020 * Associate this zfsvfs with the given objset, which must be owned. 1021 * This will cache a bunch of on-disk state from the objset in the 1022 * zfsvfs. 1023 */ 1024 static int 1025 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) 1026 { 1027 int error; 1028 uint64_t val; 1029 1030 zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; 1031 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; 1032 zfsvfs->z_os = os; 1033 1034 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); 1035 if (error != 0) 1036 return (error); 1037 if (zfsvfs->z_version > 1038 zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { 1039 (void) printf("Can't mount a version %lld file system " 1040 "on a version %lld pool\n. Pool must be upgraded to mount " 1041 "this file system.", (u_longlong_t)zfsvfs->z_version, 1042 (u_longlong_t)spa_version(dmu_objset_spa(os))); 1043 return (SET_ERROR(ENOTSUP)); 1044 } 1045 error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); 1046 if (error != 0) 1047 return (error); 1048 zfsvfs->z_norm = (int)val; 1049 1050 error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); 1051 if (error != 0) 1052 return (error); 1053 zfsvfs->z_utf8 = (val != 0); 1054 1055 error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); 1056 if (error != 0) 1057 return (error); 1058 zfsvfs->z_case = (uint_t)val; 1059 1060 /* 1061 * Fold case on file systems that are always or sometimes case 1062 * insensitive. 1063 */ 1064 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 1065 zfsvfs->z_case == ZFS_CASE_MIXED) 1066 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 1067 1068 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 1069 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 1070 1071 uint64_t sa_obj = 0; 1072 if (zfsvfs->z_use_sa) { 1073 /* should either have both of these objects or none */ 1074 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, 1075 &sa_obj); 1076 if (error != 0) 1077 return (error); 1078 } 1079 1080 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, 1081 &zfsvfs->z_attr_table); 1082 if (error != 0) 1083 return (error); 1084 1085 if (zfsvfs->z_version >= ZPL_VERSION_SA) 1086 sa_register_update_callback(os, zfs_sa_upgrade); 1087 1088 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, 1089 &zfsvfs->z_root); 1090 if (error != 0) 1091 return (error); 1092 ASSERT(zfsvfs->z_root != 0); 1093 1094 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, 1095 &zfsvfs->z_unlinkedobj); 1096 if (error != 0) 1097 return (error); 1098 1099 error = zap_lookup(os, MASTER_NODE_OBJ, 1100 zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 1101 8, 1, &zfsvfs->z_userquota_obj); 1102 if (error == ENOENT) 1103 zfsvfs->z_userquota_obj = 0; 1104 else if (error != 0) 1105 return (error); 1106 1107 error = zap_lookup(os, MASTER_NODE_OBJ, 1108 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 1109 8, 1, &zfsvfs->z_groupquota_obj); 1110 if (error == ENOENT) 1111 zfsvfs->z_groupquota_obj = 0; 1112 else if (error != 0) 1113 return (error); 1114 1115 error = zap_lookup(os, MASTER_NODE_OBJ, 1116 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], 1117 8, 1, &zfsvfs->z_projectquota_obj); 1118 if (error == ENOENT) 1119 zfsvfs->z_projectquota_obj = 0; 1120 else if (error != 0) 1121 return (error); 1122 1123 error = zap_lookup(os, MASTER_NODE_OBJ, 1124 zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], 1125 8, 1, &zfsvfs->z_userobjquota_obj); 1126 if (error == ENOENT) 1127 zfsvfs->z_userobjquota_obj = 0; 1128 else if (error != 0) 1129 return (error); 1130 1131 error = zap_lookup(os, MASTER_NODE_OBJ, 1132 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], 1133 8, 1, &zfsvfs->z_groupobjquota_obj); 1134 if (error == ENOENT) 1135 zfsvfs->z_groupobjquota_obj = 0; 1136 else if (error != 0) 1137 return (error); 1138 1139 error = zap_lookup(os, MASTER_NODE_OBJ, 1140 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], 1141 8, 1, &zfsvfs->z_projectobjquota_obj); 1142 if (error == ENOENT) 1143 zfsvfs->z_projectobjquota_obj = 0; 1144 else if (error != 0) 1145 return (error); 1146 1147 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, 1148 &zfsvfs->z_fuid_obj); 1149 if (error == ENOENT) 1150 zfsvfs->z_fuid_obj = 0; 1151 else if (error != 0) 1152 return (error); 1153 1154 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, 1155 &zfsvfs->z_shares_dir); 1156 if (error == ENOENT) 1157 zfsvfs->z_shares_dir = 0; 1158 else if (error != 0) 1159 return (error); 1160 1161 return (0); 1162 } 1163 1164 int 1165 zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) 1166 { 1167 objset_t *os; 1168 zfsvfs_t *zfsvfs; 1169 int error; 1170 boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); 1171 1172 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); 1173 1174 error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os); 1175 if (error != 0) { 1176 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1177 return (error); 1178 } 1179 1180 error = zfsvfs_create_impl(zfvp, zfsvfs, os); 1181 if (error != 0) { 1182 dmu_objset_disown(os, B_TRUE, zfsvfs); 1183 } 1184 return (error); 1185 } 1186 1187 1188 int 1189 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) 1190 { 1191 int error; 1192 1193 zfsvfs->z_vfs = NULL; 1194 zfsvfs->z_parent = zfsvfs; 1195 1196 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1197 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); 1198 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), 1199 offsetof(znode_t, z_link_node)); 1200 rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); 1201 rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); 1202 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); 1203 for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1204 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 1205 1206 error = zfsvfs_init(zfsvfs, os); 1207 if (error != 0) { 1208 *zfvp = NULL; 1209 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1210 return (error); 1211 } 1212 1213 zfsvfs->z_drain_task = TASKQID_INVALID; 1214 zfsvfs->z_draining = B_FALSE; 1215 zfsvfs->z_drain_cancel = B_TRUE; 1216 1217 *zfvp = zfsvfs; 1218 return (0); 1219 } 1220 1221 static int 1222 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) 1223 { 1224 int error; 1225 1226 error = zfs_register_callbacks(zfsvfs->z_vfs); 1227 if (error) 1228 return (error); 1229 1230 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); 1231 1232 /* 1233 * If we are not mounting (ie: online recv), then we don't 1234 * have to worry about replaying the log as we blocked all 1235 * operations out since we closed the ZIL. 1236 */ 1237 if (mounting) { 1238 boolean_t readonly; 1239 1240 /* 1241 * During replay we remove the read only flag to 1242 * allow replays to succeed. 1243 */ 1244 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; 1245 if (readonly != 0) { 1246 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 1247 } else { 1248 zfs_unlinked_drain(zfsvfs); 1249 } 1250 1251 /* 1252 * Parse and replay the intent log. 1253 * 1254 * Because of ziltest, this must be done after 1255 * zfs_unlinked_drain(). (Further note: ziltest 1256 * doesn't use readonly mounts, where 1257 * zfs_unlinked_drain() isn't called.) This is because 1258 * ziltest causes spa_sync() to think it's committed, 1259 * but actually it is not, so the intent log contains 1260 * many txg's worth of changes. 1261 * 1262 * In particular, if object N is in the unlinked set in 1263 * the last txg to actually sync, then it could be 1264 * actually freed in a later txg and then reallocated 1265 * in a yet later txg. This would write a "create 1266 * object N" record to the intent log. Normally, this 1267 * would be fine because the spa_sync() would have 1268 * written out the fact that object N is free, before 1269 * we could write the "create object N" intent log 1270 * record. 1271 * 1272 * But when we are in ziltest mode, we advance the "open 1273 * txg" without actually spa_sync()-ing the changes to 1274 * disk. So we would see that object N is still 1275 * allocated and in the unlinked set, and there is an 1276 * intent log record saying to allocate it. 1277 */ 1278 if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { 1279 if (zil_replay_disable) { 1280 zil_destroy(zfsvfs->z_log, B_FALSE); 1281 } else { 1282 zfsvfs->z_replay = B_TRUE; 1283 zil_replay(zfsvfs->z_os, zfsvfs, 1284 zfs_replay_vector); 1285 zfsvfs->z_replay = B_FALSE; 1286 } 1287 } 1288 1289 /* restore readonly bit */ 1290 if (readonly != 0) 1291 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; 1292 } 1293 1294 /* 1295 * Set the objset user_ptr to track its zfsvfs. 1296 */ 1297 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1298 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1299 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1300 1301 return (0); 1302 } 1303 1304 void 1305 zfsvfs_free(zfsvfs_t *zfsvfs) 1306 { 1307 int i; 1308 extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ 1309 1310 /* 1311 * This is a barrier to prevent the filesystem from going away in 1312 * zfs_znode_move() until we can safely ensure that the filesystem is 1313 * not unmounted. We consider the filesystem valid before the barrier 1314 * and invalid after the barrier. 1315 */ 1316 rw_enter(&zfsvfs_lock, RW_READER); 1317 rw_exit(&zfsvfs_lock); 1318 1319 zfs_fuid_destroy(zfsvfs); 1320 1321 mutex_destroy(&zfsvfs->z_znodes_lock); 1322 mutex_destroy(&zfsvfs->z_lock); 1323 list_destroy(&zfsvfs->z_all_znodes); 1324 rrm_destroy(&zfsvfs->z_teardown_lock); 1325 rw_destroy(&zfsvfs->z_teardown_inactive_lock); 1326 rw_destroy(&zfsvfs->z_fuid_lock); 1327 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1328 mutex_destroy(&zfsvfs->z_hold_mtx[i]); 1329 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1330 } 1331 1332 static void 1333 zfs_set_fuid_feature(zfsvfs_t *zfsvfs) 1334 { 1335 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 1336 if (zfsvfs->z_vfs) { 1337 if (zfsvfs->z_use_fuids) { 1338 vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); 1339 vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); 1340 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); 1341 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); 1342 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); 1343 vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); 1344 } else { 1345 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); 1346 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); 1347 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); 1348 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); 1349 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); 1350 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); 1351 } 1352 } 1353 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 1354 } 1355 1356 static int 1357 zfs_domount(vfs_t *vfsp, char *osname) 1358 { 1359 dev_t mount_dev; 1360 uint64_t recordsize, fsid_guid; 1361 int error = 0; 1362 zfsvfs_t *zfsvfs; 1363 boolean_t readonly = vfsp->vfs_flag & VFS_RDONLY ? B_TRUE : B_FALSE; 1364 1365 ASSERT(vfsp); 1366 ASSERT(osname); 1367 1368 error = zfsvfs_create(osname, readonly, &zfsvfs); 1369 if (error) 1370 return (error); 1371 zfsvfs->z_vfs = vfsp; 1372 1373 /* Initialize the generic filesystem structure. */ 1374 vfsp->vfs_bcount = 0; 1375 vfsp->vfs_data = NULL; 1376 1377 if (zfs_create_unique_device(&mount_dev) == -1) { 1378 error = SET_ERROR(ENODEV); 1379 goto out; 1380 } 1381 ASSERT(vfs_devismounted(mount_dev) == 0); 1382 1383 if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, 1384 NULL)) 1385 goto out; 1386 1387 vfsp->vfs_dev = mount_dev; 1388 vfsp->vfs_fstype = zfsfstype; 1389 vfsp->vfs_bsize = recordsize; 1390 vfsp->vfs_flag |= VFS_NOTRUNC; 1391 vfsp->vfs_data = zfsvfs; 1392 1393 /* 1394 * The fsid is 64 bits, composed of an 8-bit fs type, which 1395 * separates our fsid from any other filesystem types, and a 1396 * 56-bit objset unique ID. The objset unique ID is unique to 1397 * all objsets open on this system, provided by unique_create(). 1398 * The 8-bit fs type must be put in the low bits of fsid[1] 1399 * because that's where other Solaris filesystems put it. 1400 */ 1401 fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); 1402 ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); 1403 vfsp->vfs_fsid.val[0] = fsid_guid; 1404 vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | 1405 zfsfstype & 0xFF; 1406 1407 /* 1408 * Set features for file system. 1409 */ 1410 zfs_set_fuid_feature(zfsvfs); 1411 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { 1412 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 1413 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 1414 vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); 1415 } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { 1416 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 1417 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 1418 } 1419 vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); 1420 1421 if (dmu_objset_is_snapshot(zfsvfs->z_os)) { 1422 uint64_t pval; 1423 1424 atime_changed_cb(zfsvfs, B_FALSE); 1425 readonly_changed_cb(zfsvfs, B_TRUE); 1426 if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) 1427 goto out; 1428 xattr_changed_cb(zfsvfs, pval); 1429 zfsvfs->z_issnap = B_TRUE; 1430 zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; 1431 1432 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1433 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1434 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1435 } else { 1436 error = zfsvfs_setup(zfsvfs, B_TRUE); 1437 } 1438 1439 /* cache the root vnode for this mount */ 1440 znode_t *rootzp; 1441 if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp)) { 1442 goto out; 1443 } 1444 zfsvfs->z_rootdir = ZTOV(rootzp); 1445 1446 if (!zfsvfs->z_issnap) 1447 zfsctl_create(zfsvfs); 1448 out: 1449 if (error) { 1450 dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); 1451 zfsvfs_free(zfsvfs); 1452 } else { 1453 atomic_inc_32(&zfs_active_fs_count); 1454 } 1455 1456 return (error); 1457 } 1458 1459 void 1460 zfs_unregister_callbacks(zfsvfs_t *zfsvfs) 1461 { 1462 objset_t *os = zfsvfs->z_os; 1463 1464 if (!dmu_objset_is_snapshot(os)) 1465 dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); 1466 } 1467 1468 /* 1469 * Convert a decimal digit string to a uint64_t integer. 1470 */ 1471 static int 1472 str_to_uint64(char *str, uint64_t *objnum) 1473 { 1474 uint64_t num = 0; 1475 1476 while (*str) { 1477 if (*str < '0' || *str > '9') 1478 return (SET_ERROR(EINVAL)); 1479 1480 num = num*10 + *str++ - '0'; 1481 } 1482 1483 *objnum = num; 1484 return (0); 1485 } 1486 1487 /* 1488 * The boot path passed from the boot loader is in the form of 1489 * "rootpool-name/root-filesystem-object-number'. Convert this 1490 * string to a dataset name: "rootpool-name/root-filesystem-name". 1491 */ 1492 static int 1493 zfs_parse_bootfs(char *bpath, char *outpath) 1494 { 1495 char *slashp; 1496 uint64_t objnum; 1497 int error; 1498 1499 if (*bpath == 0 || *bpath == '/') 1500 return (SET_ERROR(EINVAL)); 1501 1502 (void) strcpy(outpath, bpath); 1503 1504 slashp = strchr(bpath, '/'); 1505 1506 /* if no '/', just return the pool name */ 1507 if (slashp == NULL) { 1508 return (0); 1509 } 1510 1511 /* if not a number, just return the root dataset name */ 1512 if (str_to_uint64(slashp+1, &objnum)) { 1513 return (0); 1514 } 1515 1516 *slashp = '\0'; 1517 error = dsl_dsobj_to_dsname(bpath, objnum, outpath); 1518 *slashp = '/'; 1519 1520 return (error); 1521 } 1522 1523 /* 1524 * Check that the hex label string is appropriate for the dataset being 1525 * mounted into the global_zone proper. 1526 * 1527 * Return an error if the hex label string is not default or 1528 * admin_low/admin_high. For admin_low labels, the corresponding 1529 * dataset must be readonly. 1530 */ 1531 int 1532 zfs_check_global_label(const char *dsname, const char *hexsl) 1533 { 1534 if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) 1535 return (0); 1536 if (strcasecmp(hexsl, ADMIN_HIGH) == 0) 1537 return (0); 1538 if (strcasecmp(hexsl, ADMIN_LOW) == 0) { 1539 /* must be readonly */ 1540 uint64_t rdonly; 1541 1542 if (dsl_prop_get_integer(dsname, 1543 zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) 1544 return (SET_ERROR(EACCES)); 1545 return (rdonly ? 0 : EACCES); 1546 } 1547 return (SET_ERROR(EACCES)); 1548 } 1549 1550 static int 1551 zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct statvfs64 *statp, 1552 uint32_t bshift) 1553 { 1554 char buf[20 + DMU_OBJACCT_PREFIX_LEN]; 1555 uint64_t offset = DMU_OBJACCT_PREFIX_LEN; 1556 uint64_t quota; 1557 uint64_t used; 1558 int err; 1559 1560 strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1); 1561 err = id_to_fuidstr(zfsvfs, NULL, zp->z_projid, buf + offset, B_FALSE); 1562 if (err) 1563 return (err); 1564 1565 if (zfsvfs->z_projectquota_obj == 0) 1566 goto objs; 1567 1568 err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj, 1569 buf + offset, 8, 1, "a); 1570 if (err == ENOENT) 1571 goto objs; 1572 else if (err) 1573 return (err); 1574 1575 err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT, 1576 buf + offset, 8, 1, &used); 1577 if (unlikely(err == ENOENT)) { 1578 uint32_t blksize; 1579 u_longlong_t nblocks; 1580 1581 /* 1582 * Quota accounting is async, so it is possible race case. 1583 * There is at least one object with the given project ID. 1584 */ 1585 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); 1586 if (unlikely(zp->z_blksz == 0)) 1587 blksize = zfsvfs->z_max_blksz; 1588 1589 used = blksize * nblocks; 1590 } else if (err) { 1591 return (err); 1592 } 1593 1594 statp->f_blocks = quota >> bshift; 1595 statp->f_bfree = (quota > used) ? ((quota - used) >> bshift) : 0; 1596 statp->f_bavail = statp->f_bfree; 1597 1598 objs: 1599 if (zfsvfs->z_projectobjquota_obj == 0) 1600 return (0); 1601 1602 err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj, 1603 buf + offset, 8, 1, "a); 1604 if (err == ENOENT) 1605 return (0); 1606 else if (err) 1607 return (err); 1608 1609 err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT, 1610 buf, 8, 1, &used); 1611 if (unlikely(err == ENOENT)) { 1612 /* 1613 * Quota accounting is async, so it is possible race case. 1614 * There is at least one object with the given project ID. 1615 */ 1616 used = 1; 1617 } else if (err) { 1618 return (err); 1619 } 1620 1621 statp->f_files = quota; 1622 statp->f_ffree = (quota > used) ? (quota - used) : 0; 1623 1624 return (0); 1625 } 1626 1627 /* 1628 * Determine whether the mount is allowed according to MAC check. 1629 * by comparing (where appropriate) label of the dataset against 1630 * the label of the zone being mounted into. If the dataset has 1631 * no label, create one. 1632 * 1633 * Returns 0 if access allowed, error otherwise (e.g. EACCES) 1634 */ 1635 static int 1636 zfs_mount_label_policy(vfs_t *vfsp, char *osname) 1637 { 1638 int error, retv; 1639 zone_t *mntzone = NULL; 1640 ts_label_t *mnt_tsl; 1641 bslabel_t *mnt_sl; 1642 bslabel_t ds_sl; 1643 char ds_hexsl[MAXNAMELEN]; 1644 1645 retv = EACCES; /* assume the worst */ 1646 1647 /* 1648 * Start by getting the dataset label if it exists. 1649 */ 1650 error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1651 1, sizeof (ds_hexsl), &ds_hexsl, NULL); 1652 if (error) 1653 return (SET_ERROR(EACCES)); 1654 1655 /* 1656 * If labeling is NOT enabled, then disallow the mount of datasets 1657 * which have a non-default label already. No other label checks 1658 * are needed. 1659 */ 1660 if (!is_system_labeled()) { 1661 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) 1662 return (0); 1663 return (SET_ERROR(EACCES)); 1664 } 1665 1666 /* 1667 * Get the label of the mountpoint. If mounting into the global 1668 * zone (i.e. mountpoint is not within an active zone and the 1669 * zoned property is off), the label must be default or 1670 * admin_low/admin_high only; no other checks are needed. 1671 */ 1672 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); 1673 if (mntzone->zone_id == GLOBAL_ZONEID) { 1674 uint64_t zoned; 1675 1676 zone_rele(mntzone); 1677 1678 if (dsl_prop_get_integer(osname, 1679 zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) 1680 return (SET_ERROR(EACCES)); 1681 if (!zoned) 1682 return (zfs_check_global_label(osname, ds_hexsl)); 1683 else 1684 /* 1685 * This is the case of a zone dataset being mounted 1686 * initially, before the zone has been fully created; 1687 * allow this mount into global zone. 1688 */ 1689 return (0); 1690 } 1691 1692 mnt_tsl = mntzone->zone_slabel; 1693 ASSERT(mnt_tsl != NULL); 1694 label_hold(mnt_tsl); 1695 mnt_sl = label2bslabel(mnt_tsl); 1696 1697 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { 1698 /* 1699 * The dataset doesn't have a real label, so fabricate one. 1700 */ 1701 char *str = NULL; 1702 1703 if (l_to_str_internal(mnt_sl, &str) == 0 && 1704 dsl_prop_set_string(osname, 1705 zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1706 ZPROP_SRC_LOCAL, str) == 0) 1707 retv = 0; 1708 if (str != NULL) 1709 kmem_free(str, strlen(str) + 1); 1710 } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { 1711 /* 1712 * Now compare labels to complete the MAC check. If the 1713 * labels are equal then allow access. If the mountpoint 1714 * label dominates the dataset label, allow readonly access. 1715 * Otherwise, access is denied. 1716 */ 1717 if (blequal(mnt_sl, &ds_sl)) 1718 retv = 0; 1719 else if (bldominates(mnt_sl, &ds_sl)) { 1720 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 1721 retv = 0; 1722 } 1723 } 1724 1725 label_rele(mnt_tsl); 1726 zone_rele(mntzone); 1727 return (retv); 1728 } 1729 1730 /* 1731 * Load a string-valued boot property and attempt to convert it to a 64-bit 1732 * unsigned integer. If the value is not present, or the conversion fails, 1733 * return the provided default value. 1734 */ 1735 static uint64_t 1736 spa_get_bootprop_uint64(const char *name, uint64_t defval) 1737 { 1738 char *propval; 1739 u_longlong_t r; 1740 int e; 1741 1742 if ((propval = spa_get_bootprop(name)) == NULL) { 1743 /* 1744 * The property does not exist. 1745 */ 1746 return (defval); 1747 } 1748 1749 e = ddi_strtoull(propval, NULL, 10, &r); 1750 1751 spa_free_bootprop(propval); 1752 1753 /* 1754 * If the conversion succeeded, return the value. If there was any 1755 * kind of failure, just return the default value. 1756 */ 1757 return (e == 0 ? r : defval); 1758 } 1759 1760 static int 1761 zfs_mountroot(vfs_t *vfsp, enum whymountroot why) 1762 { 1763 int error = 0; 1764 static int zfsrootdone = 0; 1765 zfsvfs_t *zfsvfs = NULL; 1766 znode_t *zp = NULL; 1767 vnode_t *vp = NULL; 1768 char *zfs_bootfs; 1769 char *zfs_devid; 1770 uint64_t zfs_bootpool; 1771 uint64_t zfs_bootvdev; 1772 1773 ASSERT(vfsp); 1774 1775 /* 1776 * The filesystem that we mount as root is defined in the 1777 * boot property "zfs-bootfs" with a format of 1778 * "poolname/root-dataset-objnum". 1779 */ 1780 if (why == ROOT_INIT) { 1781 if (zfsrootdone++) 1782 return (SET_ERROR(EBUSY)); 1783 1784 /* 1785 * the process of doing a spa_load will require the 1786 * clock to be set before we could (for example) do 1787 * something better by looking at the timestamp on 1788 * an uberblock, so just set it to -1. 1789 */ 1790 clkset(-1); 1791 1792 if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { 1793 cmn_err(CE_NOTE, "spa_get_bootfs: can not get " 1794 "bootfs name"); 1795 return (SET_ERROR(EINVAL)); 1796 } 1797 zfs_devid = spa_get_bootprop("diskdevid"); 1798 1799 /* 1800 * The boot loader may also provide us with the GUID for both 1801 * the pool and the nominated boot vdev. A GUID value of 0 is 1802 * explicitly invalid (see "spa_change_guid()"), so we use this 1803 * as a sentinel value when no GUID is present. 1804 */ 1805 zfs_bootpool = spa_get_bootprop_uint64("zfs-bootpool", 0); 1806 zfs_bootvdev = spa_get_bootprop_uint64("zfs-bootvdev", 0); 1807 1808 /* 1809 * Initialise the early boot device rescan mechanism. A scan 1810 * will not actually be performed unless we need to do so in 1811 * order to find the correct /devices path for a relocated 1812 * device. 1813 */ 1814 vdev_disk_preroot_init(); 1815 1816 error = spa_import_rootpool(rootfs.bo_name, zfs_devid, 1817 zfs_bootpool, zfs_bootvdev); 1818 1819 spa_free_bootprop(zfs_devid); 1820 1821 if (error != 0) { 1822 spa_free_bootprop(zfs_bootfs); 1823 vdev_disk_preroot_fini(); 1824 cmn_err(CE_NOTE, "spa_import_rootpool: error %d", 1825 error); 1826 return (error); 1827 } 1828 1829 if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { 1830 spa_free_bootprop(zfs_bootfs); 1831 vdev_disk_preroot_fini(); 1832 cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", 1833 error); 1834 return (error); 1835 } 1836 1837 spa_free_bootprop(zfs_bootfs); 1838 vdev_disk_preroot_fini(); 1839 1840 if (error = vfs_lock(vfsp)) 1841 return (error); 1842 1843 if (error = zfs_domount(vfsp, rootfs.bo_name)) { 1844 cmn_err(CE_NOTE, "zfs_domount: error %d", error); 1845 goto out; 1846 } 1847 1848 /* zfs_domount has already cached the root vnode for us */ 1849 zfsvfs = (zfsvfs_t *)vfsp->vfs_data; 1850 ASSERT(zfsvfs); 1851 ASSERT(zfsvfs->z_rootdir); 1852 1853 vp = zfsvfs->z_rootdir; 1854 mutex_enter(&vp->v_lock); 1855 vp->v_flag |= VROOT; 1856 mutex_exit(&vp->v_lock); 1857 1858 /* 1859 * Leave rootvp held. The root file system is never unmounted. 1860 */ 1861 VN_HOLD(vp); 1862 rootvp = vp; 1863 1864 vfs_add((struct vnode *)0, vfsp, 1865 (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); 1866 out: 1867 vfs_unlock(vfsp); 1868 return (error); 1869 } else if (why == ROOT_REMOUNT) { 1870 readonly_changed_cb(vfsp->vfs_data, B_FALSE); 1871 vfsp->vfs_flag |= VFS_REMOUNT; 1872 1873 /* refresh mount options */ 1874 zfs_unregister_callbacks(vfsp->vfs_data); 1875 return (zfs_register_callbacks(vfsp)); 1876 1877 } else if (why == ROOT_UNMOUNT) { 1878 zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); 1879 (void) zfs_sync(vfsp, 0, 0); 1880 return (0); 1881 } 1882 1883 /* 1884 * if "why" is equal to anything else other than ROOT_INIT, 1885 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. 1886 */ 1887 return (SET_ERROR(ENOTSUP)); 1888 } 1889 1890 /*ARGSUSED*/ 1891 static int 1892 zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) 1893 { 1894 char *osname; 1895 pathname_t spn; 1896 int error = 0; 1897 uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ? 1898 UIO_SYSSPACE : UIO_USERSPACE; 1899 int canwrite; 1900 1901 if (mvp->v_type != VDIR) 1902 return (SET_ERROR(ENOTDIR)); 1903 1904 mutex_enter(&mvp->v_lock); 1905 if ((uap->flags & MS_REMOUNT) == 0 && 1906 (uap->flags & MS_OVERLAY) == 0 && 1907 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { 1908 mutex_exit(&mvp->v_lock); 1909 return (SET_ERROR(EBUSY)); 1910 } 1911 mutex_exit(&mvp->v_lock); 1912 1913 /* 1914 * ZFS does not support passing unparsed data in via MS_DATA. 1915 * Users should use the MS_OPTIONSTR interface; this means 1916 * that all option parsing is already done and the options struct 1917 * can be interrogated. 1918 */ 1919 if ((uap->flags & MS_DATA) && uap->datalen > 0) 1920 return (SET_ERROR(EINVAL)); 1921 1922 /* 1923 * Get the objset name (the "special" mount argument). 1924 */ 1925 if (error = pn_get(uap->spec, fromspace, &spn)) 1926 return (error); 1927 1928 osname = spn.pn_path; 1929 1930 /* 1931 * Check for mount privilege? 1932 * 1933 * If we don't have privilege then see if 1934 * we have local permission to allow it 1935 */ 1936 error = secpolicy_fs_mount(cr, mvp, vfsp); 1937 if (error) { 1938 if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) == 0) { 1939 vattr_t vattr; 1940 1941 /* 1942 * Make sure user is the owner of the mount point 1943 * or has sufficient privileges. 1944 */ 1945 1946 vattr.va_mask = AT_UID; 1947 1948 if (VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) { 1949 goto out; 1950 } 1951 1952 if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 && 1953 VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) { 1954 goto out; 1955 } 1956 secpolicy_fs_mount_clearopts(cr, vfsp); 1957 } else { 1958 goto out; 1959 } 1960 } 1961 1962 /* 1963 * Refuse to mount a filesystem if we are in a local zone and the 1964 * dataset is not visible. 1965 */ 1966 if (!INGLOBALZONE(curproc) && 1967 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { 1968 error = SET_ERROR(EPERM); 1969 goto out; 1970 } 1971 1972 error = zfs_mount_label_policy(vfsp, osname); 1973 if (error) 1974 goto out; 1975 1976 /* 1977 * When doing a remount, we simply refresh our temporary properties 1978 * according to those options set in the current VFS options. 1979 */ 1980 if (uap->flags & MS_REMOUNT) { 1981 /* refresh mount options */ 1982 zfs_unregister_callbacks(vfsp->vfs_data); 1983 error = zfs_register_callbacks(vfsp); 1984 goto out; 1985 } 1986 1987 error = zfs_domount(vfsp, osname); 1988 1989 /* 1990 * Add an extra VFS_HOLD on our parent vfs so that it can't 1991 * disappear due to a forced unmount. 1992 */ 1993 if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) 1994 VFS_HOLD(mvp->v_vfsp); 1995 1996 out: 1997 pn_free(&spn); 1998 return (error); 1999 } 2000 2001 static int 2002 zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp) 2003 { 2004 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2005 dev32_t d32; 2006 uint64_t refdbytes, availbytes, usedobjs, availobjs; 2007 int err = 0; 2008 2009 ZFS_ENTER(zfsvfs); 2010 2011 dmu_objset_space(zfsvfs->z_os, 2012 &refdbytes, &availbytes, &usedobjs, &availobjs); 2013 2014 /* 2015 * The underlying storage pool actually uses multiple block sizes. 2016 * We report the fragsize as the smallest block size we support, 2017 * and we report our blocksize as the filesystem's maximum blocksize. 2018 */ 2019 statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT; 2020 statp->f_bsize = zfsvfs->z_max_blksz; 2021 2022 /* 2023 * The following report "total" blocks of various kinds in the 2024 * file system, but reported in terms of f_frsize - the 2025 * "fragment" size. 2026 */ 2027 2028 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; 2029 statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT; 2030 statp->f_bavail = statp->f_bfree; /* no root reservation */ 2031 2032 /* 2033 * statvfs() should really be called statufs(), because it assumes 2034 * static metadata. ZFS doesn't preallocate files, so the best 2035 * we can do is report the max that could possibly fit in f_files, 2036 * and that minus the number actually used in f_ffree. 2037 * For f_ffree, report the smaller of the number of object available 2038 * and the number of blocks (each object will take at least a block). 2039 */ 2040 statp->f_ffree = MIN(availobjs, statp->f_bfree); 2041 statp->f_favail = statp->f_ffree; /* no "root reservation" */ 2042 statp->f_files = statp->f_ffree + usedobjs; 2043 2044 (void) cmpldev(&d32, vfsp->vfs_dev); 2045 statp->f_fsid = d32; 2046 2047 /* 2048 * We're a zfs filesystem. 2049 */ 2050 (void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name); 2051 2052 statp->f_flag = vf_to_stf(vfsp->vfs_flag); 2053 2054 statp->f_namemax = MAXNAMELEN - 1; 2055 2056 /* 2057 * We have all of 32 characters to stuff a string here. 2058 * Is there anything useful we could/should provide? 2059 */ 2060 bzero(statp->f_fstr, sizeof (statp->f_fstr)); 2061 2062 if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && 2063 dmu_objset_projectquota_present(zfsvfs->z_os)) { 2064 znode_t *zp; 2065 2066 /* 2067 * In ZoL, zfs_statvfs is passed a Linux dentry (directory 2068 * entry), instead of a vfsp. The ZoL code uses the dentry 2069 * to get the znode from the dentry's inode. This represents 2070 * whatever filename was passed to the user-level statvfs 2071 * syscall. 2072 * 2073 * We're using the VFS root znode here, so this represents a 2074 * potential difference from ZoL. 2075 */ 2076 if (zfs_zget(zfsvfs, zfsvfs->z_root, &zp) == 0) { 2077 uint32_t bshift = ddi_fls(statp->f_bsize) - 1; 2078 2079 if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid && 2080 zpl_is_valid_projid(zp->z_projid)) 2081 err = zfs_statfs_project(zfsvfs, zp, statp, 2082 bshift); 2083 VN_RELE(ZTOV(zp)); 2084 } 2085 } 2086 2087 ZFS_EXIT(zfsvfs); 2088 return (err); 2089 } 2090 2091 static int 2092 zfs_root(vfs_t *vfsp, vnode_t **vpp) 2093 { 2094 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2095 struct vnode *vp; 2096 int error; 2097 2098 ZFS_ENTER(zfsvfs); 2099 2100 vp = zfsvfs->z_rootdir; 2101 if (vp != NULL) { 2102 VN_HOLD(vp); 2103 error = 0; 2104 } else { 2105 /* forced unmount */ 2106 error = EIO; 2107 } 2108 *vpp = vp; 2109 2110 ZFS_EXIT(zfsvfs); 2111 return (error); 2112 2113 } 2114 2115 /* 2116 * Teardown the zfsvfs::z_os. 2117 * 2118 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' 2119 * and 'z_teardown_inactive_lock' held. 2120 */ 2121 static int 2122 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) 2123 { 2124 znode_t *zp; 2125 2126 zfs_unlinked_drain_stop_wait(zfsvfs); 2127 2128 rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); 2129 2130 if (!unmounting) { 2131 /* 2132 * We purge the parent filesystem's vfsp as the parent 2133 * filesystem and all of its snapshots have their vnode's 2134 * v_vfsp set to the parent's filesystem's vfsp. Note, 2135 * 'z_parent' is self referential for non-snapshots. 2136 */ 2137 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); 2138 } 2139 2140 /* 2141 * Close the zil. NB: Can't close the zil while zfs_inactive 2142 * threads are blocked as zil_close can call zfs_inactive. 2143 */ 2144 if (zfsvfs->z_log) { 2145 zil_close(zfsvfs->z_log); 2146 zfsvfs->z_log = NULL; 2147 } 2148 2149 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); 2150 2151 /* 2152 * If we are not unmounting (ie: online recv) and someone already 2153 * unmounted this file system while we were doing the switcheroo, 2154 * or a reopen of z_os failed then just bail out now. 2155 */ 2156 if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { 2157 rw_exit(&zfsvfs->z_teardown_inactive_lock); 2158 rrm_exit(&zfsvfs->z_teardown_lock, FTAG); 2159 return (SET_ERROR(EIO)); 2160 } 2161 2162 /* 2163 * At this point there are no vops active, and any new vops will 2164 * fail with EIO since we have z_teardown_lock for writer (only 2165 * relavent for forced unmount). 2166 * 2167 * Release all holds on dbufs. 2168 */ 2169 mutex_enter(&zfsvfs->z_znodes_lock); 2170 for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; 2171 zp = list_next(&zfsvfs->z_all_znodes, zp)) 2172 if (zp->z_sa_hdl) { 2173 ASSERT(ZTOV(zp)->v_count > 0); 2174 zfs_znode_dmu_fini(zp); 2175 } 2176 mutex_exit(&zfsvfs->z_znodes_lock); 2177 2178 /* 2179 * If we are unmounting, set the unmounted flag and let new vops 2180 * unblock. zfs_inactive will have the unmounted behavior, and all 2181 * other vops will fail with EIO. 2182 */ 2183 if (unmounting) { 2184 /* 2185 * Clear the cached root vnode now that we are unmounted. 2186 * Its release must be performed outside the teardown locks to 2187 * avoid recursive lock entry via zfs_inactive(). 2188 */ 2189 vnode_t *vp = zfsvfs->z_rootdir; 2190 zfsvfs->z_rootdir = NULL; 2191 2192 zfsvfs->z_unmounted = B_TRUE; 2193 rw_exit(&zfsvfs->z_teardown_inactive_lock); 2194 rrm_exit(&zfsvfs->z_teardown_lock, FTAG); 2195 2196 /* Drop the cached root vp now that it is safe */ 2197 VN_RELE(vp); 2198 } 2199 2200 /* 2201 * z_os will be NULL if there was an error in attempting to reopen 2202 * zfsvfs, so just return as the properties had already been 2203 * unregistered and cached data had been evicted before. 2204 */ 2205 if (zfsvfs->z_os == NULL) 2206 return (0); 2207 2208 /* 2209 * Unregister properties. 2210 */ 2211 zfs_unregister_callbacks(zfsvfs); 2212 2213 /* 2214 * Evict cached data 2215 */ 2216 if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) && 2217 !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)) 2218 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); 2219 dmu_objset_evict_dbufs(zfsvfs->z_os); 2220 2221 return (0); 2222 } 2223 2224 /*ARGSUSED*/ 2225 static int 2226 zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) 2227 { 2228 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2229 objset_t *os; 2230 int ret; 2231 2232 ret = secpolicy_fs_unmount(cr, vfsp); 2233 if (ret) { 2234 if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), 2235 ZFS_DELEG_PERM_MOUNT, cr)) 2236 return (ret); 2237 } 2238 2239 /* 2240 * We purge the parent filesystem's vfsp as the parent filesystem 2241 * and all of its snapshots have their vnode's v_vfsp set to the 2242 * parent's filesystem's vfsp. Note, 'z_parent' is self 2243 * referential for non-snapshots. 2244 */ 2245 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); 2246 2247 /* 2248 * Unmount any snapshots mounted under .zfs before unmounting the 2249 * dataset itself. 2250 */ 2251 if (zfsvfs->z_ctldir != NULL && 2252 (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) { 2253 return (ret); 2254 } 2255 2256 if (!(fflag & MS_FORCE)) { 2257 /* 2258 * Check the number of active vnodes in the file system. 2259 * Our count is maintained in the vfs structure, but the 2260 * number is off by 1 to indicate a hold on the vfs 2261 * structure itself. 2262 */ 2263 boolean_t draining; 2264 uint_t thresh = 1; 2265 vnode_t *ctlvp, *rvp; 2266 2267 /* 2268 * The cached vnode for the root directory of the mount also 2269 * maintains a hold on the vfs structure. 2270 */ 2271 rvp = zfsvfs->z_rootdir; 2272 thresh++; 2273 2274 /* 2275 * The '.zfs' directory maintains a reference of its own, and 2276 * any active references underneath are reflected in the vnode 2277 * count. Allow one additional reference for it. 2278 */ 2279 ctlvp = zfsvfs->z_ctldir; 2280 if (ctlvp != NULL) { 2281 thresh++; 2282 } 2283 2284 /* 2285 * If it's running, the asynchronous unlinked drain task needs 2286 * to be stopped before the number of active vnodes can be 2287 * reliably checked. 2288 */ 2289 draining = zfsvfs->z_draining; 2290 if (draining) 2291 zfs_unlinked_drain_stop_wait(zfsvfs); 2292 2293 if (vfsp->vfs_count > thresh || rvp->v_count > 1 || 2294 (ctlvp != NULL && ctlvp->v_count > 1)) { 2295 if (draining) { 2296 /* If it was draining, restart the task */ 2297 zfs_unlinked_drain(zfsvfs); 2298 } 2299 return (SET_ERROR(EBUSY)); 2300 } 2301 } 2302 2303 vfsp->vfs_flag |= VFS_UNMOUNTED; 2304 2305 VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); 2306 os = zfsvfs->z_os; 2307 2308 /* 2309 * z_os will be NULL if there was an error in 2310 * attempting to reopen zfsvfs. 2311 */ 2312 if (os != NULL) { 2313 /* 2314 * Unset the objset user_ptr. 2315 */ 2316 mutex_enter(&os->os_user_ptr_lock); 2317 dmu_objset_set_user(os, NULL); 2318 mutex_exit(&os->os_user_ptr_lock); 2319 2320 /* 2321 * Finally release the objset 2322 */ 2323 dmu_objset_disown(os, B_TRUE, zfsvfs); 2324 } 2325 2326 /* 2327 * We can now safely destroy the '.zfs' directory node. 2328 */ 2329 if (zfsvfs->z_ctldir != NULL) 2330 zfsctl_destroy(zfsvfs); 2331 2332 return (0); 2333 } 2334 2335 static int 2336 zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) 2337 { 2338 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2339 znode_t *zp; 2340 uint64_t object = 0; 2341 uint64_t fid_gen = 0; 2342 uint64_t gen_mask; 2343 uint64_t zp_gen; 2344 int i, err; 2345 2346 *vpp = NULL; 2347 2348 ZFS_ENTER(zfsvfs); 2349 2350 if (fidp->fid_len == LONG_FID_LEN) { 2351 zfid_long_t *zlfid = (zfid_long_t *)fidp; 2352 uint64_t objsetid = 0; 2353 uint64_t setgen = 0; 2354 2355 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 2356 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); 2357 2358 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 2359 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); 2360 2361 ZFS_EXIT(zfsvfs); 2362 2363 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); 2364 if (err) 2365 return (SET_ERROR(EINVAL)); 2366 ZFS_ENTER(zfsvfs); 2367 } 2368 2369 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { 2370 zfid_short_t *zfid = (zfid_short_t *)fidp; 2371 2372 for (i = 0; i < sizeof (zfid->zf_object); i++) 2373 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); 2374 2375 for (i = 0; i < sizeof (zfid->zf_gen); i++) 2376 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); 2377 } else { 2378 ZFS_EXIT(zfsvfs); 2379 return (SET_ERROR(EINVAL)); 2380 } 2381 2382 /* A zero fid_gen means we are in the .zfs control directories */ 2383 if (fid_gen == 0 && 2384 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { 2385 *vpp = zfsvfs->z_ctldir; 2386 ASSERT(*vpp != NULL); 2387 if (object == ZFSCTL_INO_SNAPDIR) { 2388 VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, 2389 0, NULL, NULL, NULL, NULL, NULL) == 0); 2390 } else { 2391 VN_HOLD(*vpp); 2392 } 2393 ZFS_EXIT(zfsvfs); 2394 return (0); 2395 } 2396 2397 gen_mask = -1ULL >> (64 - 8 * i); 2398 2399 dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); 2400 if (err = zfs_zget(zfsvfs, object, &zp)) { 2401 ZFS_EXIT(zfsvfs); 2402 return (err); 2403 } 2404 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, 2405 sizeof (uint64_t)); 2406 zp_gen = zp_gen & gen_mask; 2407 if (zp_gen == 0) 2408 zp_gen = 1; 2409 if (zp->z_unlinked || zp_gen != fid_gen) { 2410 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); 2411 VN_RELE(ZTOV(zp)); 2412 ZFS_EXIT(zfsvfs); 2413 return (SET_ERROR(EINVAL)); 2414 } 2415 2416 *vpp = ZTOV(zp); 2417 ZFS_EXIT(zfsvfs); 2418 return (0); 2419 } 2420 2421 /* 2422 * Block out VOPs and close zfsvfs_t::z_os 2423 * 2424 * Note, if successful, then we return with the 'z_teardown_lock' and 2425 * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying 2426 * dataset and objset intact so that they can be atomically handed off during 2427 * a subsequent rollback or recv operation and the resume thereafter. 2428 */ 2429 int 2430 zfs_suspend_fs(zfsvfs_t *zfsvfs) 2431 { 2432 int error; 2433 2434 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) 2435 return (error); 2436 2437 return (0); 2438 } 2439 2440 /* 2441 * Rebuild SA and release VOPs. Note that ownership of the underlying dataset 2442 * is an invariant across any of the operations that can be performed while the 2443 * filesystem was suspended. Whether it succeeded or failed, the preconditions 2444 * are the same: the relevant objset and associated dataset are owned by 2445 * zfsvfs, held, and long held on entry. 2446 */ 2447 int 2448 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) 2449 { 2450 int err; 2451 znode_t *zp; 2452 2453 ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); 2454 ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); 2455 2456 /* 2457 * We already own this, so just update the objset_t, as the one we 2458 * had before may have been evicted. 2459 */ 2460 objset_t *os; 2461 VERIFY3P(ds->ds_owner, ==, zfsvfs); 2462 VERIFY(dsl_dataset_long_held(ds)); 2463 VERIFY0(dmu_objset_from_ds(ds, &os)); 2464 2465 err = zfsvfs_init(zfsvfs, os); 2466 if (err != 0) 2467 goto bail; 2468 2469 VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); 2470 2471 zfs_set_fuid_feature(zfsvfs); 2472 2473 /* 2474 * Attempt to re-establish all the active znodes with 2475 * their dbufs. If a zfs_rezget() fails, then we'll let 2476 * any potential callers discover that via ZFS_ENTER_VERIFY_VP 2477 * when they try to use their znode. 2478 */ 2479 mutex_enter(&zfsvfs->z_znodes_lock); 2480 for (zp = list_head(&zfsvfs->z_all_znodes); zp; 2481 zp = list_next(&zfsvfs->z_all_znodes, zp)) { 2482 (void) zfs_rezget(zp); 2483 } 2484 mutex_exit(&zfsvfs->z_znodes_lock); 2485 2486 if (((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) && 2487 !zfsvfs->z_unmounted) { 2488 /* 2489 * zfs_suspend_fs() could have interrupted freeing 2490 * of dnodes. We need to restart this freeing so 2491 * that we don't "leak" the space. 2492 */ 2493 zfs_unlinked_drain(zfsvfs); 2494 } 2495 2496 bail: 2497 /* release the VOPs */ 2498 rw_exit(&zfsvfs->z_teardown_inactive_lock); 2499 rrm_exit(&zfsvfs->z_teardown_lock, FTAG); 2500 2501 if (err) { 2502 /* 2503 * Since we couldn't setup the sa framework, try to force 2504 * unmount this file system. 2505 */ 2506 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) 2507 (void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED()); 2508 } 2509 return (err); 2510 } 2511 2512 static void 2513 zfs_freevfs(vfs_t *vfsp) 2514 { 2515 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2516 2517 /* 2518 * If this is a snapshot, we have an extra VFS_HOLD on our parent 2519 * from zfs_mount(). Release it here. If we came through 2520 * zfs_mountroot() instead, we didn't grab an extra hold, so 2521 * skip the VFS_RELE for rootvfs. 2522 */ 2523 if (zfsvfs->z_issnap && (vfsp != rootvfs)) 2524 VFS_RELE(zfsvfs->z_parent->z_vfs); 2525 2526 zfsvfs_free(zfsvfs); 2527 2528 atomic_dec_32(&zfs_active_fs_count); 2529 } 2530 2531 /* 2532 * VFS_INIT() initialization. Note that there is no VFS_FINI(), 2533 * so we can't safely do any non-idempotent initialization here. 2534 * Leave that to zfs_init() and zfs_fini(), which are called 2535 * from the module's _init() and _fini() entry points. 2536 */ 2537 /*ARGSUSED*/ 2538 static int 2539 zfs_vfsinit(int fstype, char *name) 2540 { 2541 int error; 2542 2543 zfsfstype = fstype; 2544 2545 /* 2546 * Setup vfsops and vnodeops tables. 2547 */ 2548 error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops); 2549 if (error != 0) { 2550 cmn_err(CE_WARN, "zfs: bad vfs ops template"); 2551 } 2552 2553 error = zfs_create_op_tables(); 2554 if (error) { 2555 zfs_remove_op_tables(); 2556 cmn_err(CE_WARN, "zfs: bad vnode ops template"); 2557 (void) vfs_freevfsops_by_type(zfsfstype); 2558 return (error); 2559 } 2560 2561 mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 2562 2563 /* 2564 * Unique major number for all zfs mounts. 2565 * If we run out of 32-bit minors, we'll getudev() another major. 2566 */ 2567 zfs_major = ddi_name_to_major(ZFS_DRIVER); 2568 zfs_minor = ZFS_MIN_MINOR; 2569 2570 return (0); 2571 } 2572 2573 void 2574 zfs_init(void) 2575 { 2576 /* 2577 * Initialize .zfs directory structures 2578 */ 2579 zfsctl_init(); 2580 2581 /* 2582 * Initialize znode cache, vnode ops, etc... 2583 */ 2584 zfs_znode_init(); 2585 2586 dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); 2587 } 2588 2589 void 2590 zfs_fini(void) 2591 { 2592 zfsctl_fini(); 2593 zfs_znode_fini(); 2594 } 2595 2596 int 2597 zfs_busy(void) 2598 { 2599 return (zfs_active_fs_count != 0); 2600 } 2601 2602 int 2603 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) 2604 { 2605 int error; 2606 objset_t *os = zfsvfs->z_os; 2607 dmu_tx_t *tx; 2608 2609 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) 2610 return (SET_ERROR(EINVAL)); 2611 2612 if (newvers < zfsvfs->z_version) 2613 return (SET_ERROR(EINVAL)); 2614 2615 if (zfs_spa_version_map(newvers) > 2616 spa_version(dmu_objset_spa(zfsvfs->z_os))) 2617 return (SET_ERROR(ENOTSUP)); 2618 2619 tx = dmu_tx_create(os); 2620 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); 2621 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { 2622 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, 2623 ZFS_SA_ATTRS); 2624 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 2625 } 2626 error = dmu_tx_assign(tx, TXG_WAIT); 2627 if (error) { 2628 dmu_tx_abort(tx); 2629 return (error); 2630 } 2631 2632 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 2633 8, 1, &newvers, tx); 2634 2635 if (error) { 2636 dmu_tx_commit(tx); 2637 return (error); 2638 } 2639 2640 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { 2641 uint64_t sa_obj; 2642 2643 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, 2644 SPA_VERSION_SA); 2645 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, 2646 DMU_OT_NONE, 0, tx); 2647 2648 error = zap_add(os, MASTER_NODE_OBJ, 2649 ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); 2650 ASSERT0(error); 2651 2652 VERIFY(0 == sa_set_sa_object(os, sa_obj)); 2653 sa_register_update_callback(os, zfs_sa_upgrade); 2654 } 2655 2656 spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, 2657 "from %llu to %llu", zfsvfs->z_version, newvers); 2658 2659 dmu_tx_commit(tx); 2660 2661 zfsvfs->z_version = newvers; 2662 os->os_version = newvers; 2663 2664 zfs_set_fuid_feature(zfsvfs); 2665 2666 return (0); 2667 } 2668 2669 /* 2670 * Read a property stored within the master node. 2671 */ 2672 int 2673 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) 2674 { 2675 uint64_t *cached_copy = NULL; 2676 2677 /* 2678 * Figure out where in the objset_t the cached copy would live, if it 2679 * is available for the requested property. 2680 */ 2681 if (os != NULL) { 2682 switch (prop) { 2683 case ZFS_PROP_VERSION: 2684 cached_copy = &os->os_version; 2685 break; 2686 case ZFS_PROP_NORMALIZE: 2687 cached_copy = &os->os_normalization; 2688 break; 2689 case ZFS_PROP_UTF8ONLY: 2690 cached_copy = &os->os_utf8only; 2691 break; 2692 case ZFS_PROP_CASE: 2693 cached_copy = &os->os_casesensitivity; 2694 break; 2695 default: 2696 break; 2697 } 2698 } 2699 if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { 2700 *value = *cached_copy; 2701 return (0); 2702 } 2703 2704 /* 2705 * If the property wasn't cached, look up the file system's value for 2706 * the property. For the version property, we look up a slightly 2707 * different string. 2708 */ 2709 const char *pname; 2710 int error = ENOENT; 2711 if (prop == ZFS_PROP_VERSION) { 2712 pname = ZPL_VERSION_STR; 2713 } else { 2714 pname = zfs_prop_to_name(prop); 2715 } 2716 2717 if (os != NULL) { 2718 ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); 2719 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); 2720 } 2721 2722 if (error == ENOENT) { 2723 /* No value set, use the default value */ 2724 switch (prop) { 2725 case ZFS_PROP_VERSION: 2726 *value = ZPL_VERSION; 2727 break; 2728 case ZFS_PROP_NORMALIZE: 2729 case ZFS_PROP_UTF8ONLY: 2730 *value = 0; 2731 break; 2732 case ZFS_PROP_CASE: 2733 *value = ZFS_CASE_SENSITIVE; 2734 break; 2735 default: 2736 return (error); 2737 } 2738 error = 0; 2739 } 2740 2741 /* 2742 * If one of the methods for getting the property value above worked, 2743 * copy it into the objset_t's cache. 2744 */ 2745 if (error == 0 && cached_copy != NULL) { 2746 *cached_copy = *value; 2747 } 2748 2749 return (error); 2750 } 2751 2752 /* 2753 * Return true if the coresponding vfs's unmounted flag is set. 2754 * Otherwise return false. 2755 * If this function returns true we know VFS unmount has been initiated. 2756 */ 2757 boolean_t 2758 zfs_get_vfs_flag_unmounted(objset_t *os) 2759 { 2760 zfsvfs_t *zfvp; 2761 boolean_t unmounted = B_FALSE; 2762 2763 ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); 2764 2765 mutex_enter(&os->os_user_ptr_lock); 2766 zfvp = dmu_objset_get_user(os); 2767 if (zfvp != NULL && zfvp->z_vfs != NULL && 2768 (zfvp->z_vfs->vfs_flag & VFS_UNMOUNTED)) 2769 unmounted = B_TRUE; 2770 mutex_exit(&os->os_user_ptr_lock); 2771 2772 return (unmounted); 2773 } 2774 2775 static vfsdef_t vfw = { 2776 VFSDEF_VERSION, 2777 MNTTYPE_ZFS, 2778 zfs_vfsinit, 2779 VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS| 2780 VSW_XID|VSW_ZMOUNT, 2781 &zfs_mntopts 2782 }; 2783 2784 struct modlfs zfs_modlfs = { 2785 &mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw 2786 }; 2787