1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 24 * Copyright (c) 2014 Integros [integros.com] 25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 26 * Copyright 2019 Joyent, Inc. 27 * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org> 28 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. 
29 * Copyright 2022 Oxide Computer Company 30 */ 31 32 /* Portions Copyright 2010 Robert Milkowski */ 33 34 #include <sys/types.h> 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/sysmacros.h> 38 #include <sys/kmem.h> 39 #include <sys/pathname.h> 40 #include <sys/vnode.h> 41 #include <sys/vfs.h> 42 #include <sys/vfs_opreg.h> 43 #include <sys/mntent.h> 44 #include <sys/mount.h> 45 #include <sys/cmn_err.h> 46 #include "fs/fs_subr.h" 47 #include <sys/zfs_znode.h> 48 #include <sys/zfs_dir.h> 49 #include <sys/zil.h> 50 #include <sys/fs/zfs.h> 51 #include <sys/dmu.h> 52 #include <sys/dsl_prop.h> 53 #include <sys/dsl_dataset.h> 54 #include <sys/dsl_deleg.h> 55 #include <sys/spa.h> 56 #include <sys/zap.h> 57 #include <sys/sa.h> 58 #include <sys/sa_impl.h> 59 #include <sys/varargs.h> 60 #include <sys/policy.h> 61 #include <sys/atomic.h> 62 #include <sys/mkdev.h> 63 #include <sys/modctl.h> 64 #include <sys/refstr.h> 65 #include <sys/zfs_ioctl.h> 66 #include <sys/zfs_ctldir.h> 67 #include <sys/zfs_fuid.h> 68 #include <sys/bootconf.h> 69 #include <sys/ddi.h> 70 #include <sys/sunddi.h> 71 #include <sys/dnlc.h> 72 #include <sys/dmu_objset.h> 73 #include <sys/spa_boot.h> 74 #include <sys/vdev_impl.h> 75 #include "zfs_comutil.h" 76 77 int zfsfstype; 78 vfsops_t *zfs_vfsops = NULL; 79 static major_t zfs_major; 80 static minor_t zfs_minor; 81 static kmutex_t zfs_dev_mtx; 82 83 extern int sys_shutdown; 84 85 static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr); 86 static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr); 87 static int zfs_mountroot(vfs_t *vfsp, enum whymountroot); 88 static int zfs_root(vfs_t *vfsp, vnode_t **vpp); 89 static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp); 90 static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp); 91 static void zfs_freevfs(vfs_t *vfsp); 92 93 static const fs_operation_def_t zfs_vfsops_template[] = { 94 VFSNAME_MOUNT, { .vfs_mount = zfs_mount }, 95 VFSNAME_MOUNTROOT, { 
.vfs_mountroot = zfs_mountroot }, 96 VFSNAME_UNMOUNT, { .vfs_unmount = zfs_umount }, 97 VFSNAME_ROOT, { .vfs_root = zfs_root }, 98 VFSNAME_STATVFS, { .vfs_statvfs = zfs_statvfs }, 99 VFSNAME_SYNC, { .vfs_sync = zfs_sync }, 100 VFSNAME_VGET, { .vfs_vget = zfs_vget }, 101 VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs }, 102 NULL, NULL 103 }; 104 105 /* 106 * We need to keep a count of active fs's. 107 * This is necessary to prevent our module 108 * from being unloaded after a umount -f 109 */ 110 static uint32_t zfs_active_fs_count = 0; 111 112 static char *noatime_cancel[] = { MNTOPT_ATIME, NULL }; 113 static char *atime_cancel[] = { MNTOPT_NOATIME, NULL }; 114 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL }; 115 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL }; 116 117 /* 118 * MO_DEFAULT is not used since the default value is determined 119 * by the equivalent property. 120 */ 121 static mntopt_t mntopts[] = { 122 { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL }, 123 { MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL }, 124 { MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL }, 125 { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL } 126 }; 127 128 static mntopts_t zfs_mntopts = { 129 sizeof (mntopts) / sizeof (mntopt_t), 130 mntopts 131 }; 132 133 /*ARGSUSED*/ 134 int 135 zfs_sync(vfs_t *vfsp, short flag, cred_t *cr) 136 { 137 /* 138 * Data integrity is job one. We don't want a compromised kernel 139 * writing to the storage pool, so we never sync during panic. 140 */ 141 if (panicstr) 142 return (0); 143 144 /* 145 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS 146 * to sync metadata, which they would otherwise cache indefinitely. 147 * Semantically, the only requirement is that the sync be initiated. 148 * The DMU syncs out txgs frequently, so there's nothing to do. 149 */ 150 if (flag & SYNC_ATTR) 151 return (0); 152 153 if (vfsp != NULL) { 154 /* 155 * Sync a specific filesystem. 
156 */ 157 zfsvfs_t *zfsvfs = vfsp->vfs_data; 158 dsl_pool_t *dp; 159 160 ZFS_ENTER(zfsvfs); 161 dp = dmu_objset_pool(zfsvfs->z_os); 162 163 /* 164 * If the system is shutting down, then skip any 165 * filesystems which may exist on a suspended pool. 166 */ 167 if (sys_shutdown && spa_suspended(dp->dp_spa)) { 168 ZFS_EXIT(zfsvfs); 169 return (0); 170 } 171 172 if (zfsvfs->z_log != NULL) 173 zil_commit(zfsvfs->z_log, 0); 174 175 ZFS_EXIT(zfsvfs); 176 } else { 177 /* 178 * Sync all ZFS filesystems. This is what happens when you 179 * run sync(8). Unlike other filesystems, ZFS honors the 180 * request by waiting for all pools to commit all dirty data. 181 */ 182 spa_sync_allpools(); 183 } 184 185 return (0); 186 } 187 188 static int 189 zfs_create_unique_device(dev_t *dev) 190 { 191 major_t new_major; 192 193 do { 194 ASSERT3U(zfs_minor, <=, MAXMIN32); 195 minor_t start = zfs_minor; 196 do { 197 mutex_enter(&zfs_dev_mtx); 198 if (zfs_minor >= MAXMIN32) { 199 /* 200 * If we're still using the real major 201 * keep out of /dev/zfs and /dev/zvol minor 202 * number space. If we're using a getudev()'ed 203 * major number, we can use all of its minors. 204 */ 205 if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) 206 zfs_minor = ZFS_MIN_MINOR; 207 else 208 zfs_minor = 0; 209 } else { 210 zfs_minor++; 211 } 212 *dev = makedevice(zfs_major, zfs_minor); 213 mutex_exit(&zfs_dev_mtx); 214 } while (vfs_devismounted(*dev) && zfs_minor != start); 215 if (zfs_minor == start) { 216 /* 217 * We are using all ~262,000 minor numbers for the 218 * current major number. Create a new major number. 
219 */ 220 if ((new_major = getudev()) == (major_t)-1) { 221 cmn_err(CE_WARN, 222 "zfs_mount: Can't get unique major " 223 "device number."); 224 return (-1); 225 } 226 mutex_enter(&zfs_dev_mtx); 227 zfs_major = new_major; 228 zfs_minor = 0; 229 230 mutex_exit(&zfs_dev_mtx); 231 } else { 232 break; 233 } 234 /* CONSTANTCONDITION */ 235 } while (1); 236 237 return (0); 238 } 239 240 static void 241 atime_changed_cb(void *arg, uint64_t newval) 242 { 243 zfsvfs_t *zfsvfs = arg; 244 245 if (newval == TRUE) { 246 zfsvfs->z_atime = TRUE; 247 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); 248 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); 249 } else { 250 zfsvfs->z_atime = FALSE; 251 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); 252 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); 253 } 254 } 255 256 static void 257 xattr_changed_cb(void *arg, uint64_t newval) 258 { 259 zfsvfs_t *zfsvfs = arg; 260 261 if (newval == TRUE) { 262 /* XXX locking on vfs_flag? */ 263 zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; 264 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); 265 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); 266 } else { 267 /* XXX locking on vfs_flag? */ 268 zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; 269 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); 270 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); 271 } 272 } 273 274 static void 275 blksz_changed_cb(void *arg, uint64_t newval) 276 { 277 zfsvfs_t *zfsvfs = arg; 278 ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); 279 ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); 280 ASSERT(ISP2(newval)); 281 282 zfsvfs->z_max_blksz = newval; 283 zfsvfs->z_vfs->vfs_bsize = newval; 284 } 285 286 static void 287 readonly_changed_cb(void *arg, uint64_t newval) 288 { 289 zfsvfs_t *zfsvfs = arg; 290 291 if (newval) { 292 /* XXX locking on vfs_flag? 
*/ 293 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; 294 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); 295 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); 296 } else { 297 /* XXX locking on vfs_flag? */ 298 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 299 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); 300 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); 301 } 302 } 303 304 static void 305 devices_changed_cb(void *arg, uint64_t newval) 306 { 307 zfsvfs_t *zfsvfs = arg; 308 309 if (newval == FALSE) { 310 zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES; 311 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES); 312 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0); 313 } else { 314 zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES; 315 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES); 316 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0); 317 } 318 } 319 320 static void 321 setuid_changed_cb(void *arg, uint64_t newval) 322 { 323 zfsvfs_t *zfsvfs = arg; 324 325 if (newval == FALSE) { 326 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; 327 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); 328 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); 329 } else { 330 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; 331 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); 332 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); 333 } 334 } 335 336 static void 337 exec_changed_cb(void *arg, uint64_t newval) 338 { 339 zfsvfs_t *zfsvfs = arg; 340 341 if (newval == FALSE) { 342 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; 343 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); 344 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); 345 } else { 346 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; 347 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); 348 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); 349 } 350 } 351 352 /* 353 * The nbmand mount option can be changed at mount time. 
354 * We can't allow it to be toggled on live file systems or incorrect 355 * behavior may be seen from cifs clients 356 * 357 * This property isn't registered via dsl_prop_register(), but this callback 358 * will be called when a file system is first mounted 359 */ 360 static void 361 nbmand_changed_cb(void *arg, uint64_t newval) 362 { 363 zfsvfs_t *zfsvfs = arg; 364 if (newval == FALSE) { 365 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); 366 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); 367 } else { 368 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); 369 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); 370 } 371 } 372 373 static void 374 snapdir_changed_cb(void *arg, uint64_t newval) 375 { 376 zfsvfs_t *zfsvfs = arg; 377 378 zfsvfs->z_show_ctldir = newval; 379 } 380 381 static void 382 vscan_changed_cb(void *arg, uint64_t newval) 383 { 384 zfsvfs_t *zfsvfs = arg; 385 386 zfsvfs->z_vscan = newval; 387 } 388 389 static void 390 acl_mode_changed_cb(void *arg, uint64_t newval) 391 { 392 zfsvfs_t *zfsvfs = arg; 393 394 zfsvfs->z_acl_mode = newval; 395 } 396 397 static void 398 acl_inherit_changed_cb(void *arg, uint64_t newval) 399 { 400 zfsvfs_t *zfsvfs = arg; 401 402 zfsvfs->z_acl_inherit = newval; 403 } 404 405 static int 406 zfs_register_callbacks(vfs_t *vfsp) 407 { 408 struct dsl_dataset *ds = NULL; 409 objset_t *os = NULL; 410 zfsvfs_t *zfsvfs = NULL; 411 uint64_t nbmand; 412 boolean_t readonly = B_FALSE; 413 boolean_t do_readonly = B_FALSE; 414 boolean_t setuid = B_FALSE; 415 boolean_t do_setuid = B_FALSE; 416 boolean_t exec = B_FALSE; 417 boolean_t do_exec = B_FALSE; 418 boolean_t devices = B_FALSE; 419 boolean_t do_devices = B_FALSE; 420 boolean_t xattr = B_FALSE; 421 boolean_t do_xattr = B_FALSE; 422 boolean_t atime = B_FALSE; 423 boolean_t do_atime = B_FALSE; 424 int error = 0; 425 426 ASSERT(vfsp); 427 zfsvfs = vfsp->vfs_data; 428 ASSERT(zfsvfs); 429 os = zfsvfs->z_os; 430 431 /* 432 * The act of registering our callbacks will destroy any 
mount 433 * options we may have. In order to enable temporary overrides 434 * of mount options, we stash away the current values and 435 * restore them after we register the callbacks. 436 */ 437 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || 438 !spa_writeable(dmu_objset_spa(os))) { 439 readonly = B_TRUE; 440 do_readonly = B_TRUE; 441 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { 442 readonly = B_FALSE; 443 do_readonly = B_TRUE; 444 } 445 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { 446 devices = B_FALSE; 447 setuid = B_FALSE; 448 do_devices = B_TRUE; 449 do_setuid = B_TRUE; 450 } else { 451 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) { 452 devices = B_FALSE; 453 do_devices = B_TRUE; 454 } else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) { 455 devices = B_TRUE; 456 do_devices = B_TRUE; 457 } 458 459 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { 460 setuid = B_FALSE; 461 do_setuid = B_TRUE; 462 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { 463 setuid = B_TRUE; 464 do_setuid = B_TRUE; 465 } 466 } 467 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { 468 exec = B_FALSE; 469 do_exec = B_TRUE; 470 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { 471 exec = B_TRUE; 472 do_exec = B_TRUE; 473 } 474 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { 475 xattr = B_FALSE; 476 do_xattr = B_TRUE; 477 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { 478 xattr = B_TRUE; 479 do_xattr = B_TRUE; 480 } 481 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { 482 atime = B_FALSE; 483 do_atime = B_TRUE; 484 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { 485 atime = B_TRUE; 486 do_atime = B_TRUE; 487 } 488 489 /* 490 * nbmand is a special property. It can only be changed at 491 * mount time. 492 * 493 * This is weird, but it is documented to only be changeable 494 * at mount time. 
495 */ 496 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { 497 nbmand = B_FALSE; 498 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { 499 nbmand = B_TRUE; 500 } else { 501 char osname[ZFS_MAX_DATASET_NAME_LEN]; 502 503 dmu_objset_name(os, osname); 504 if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand, 505 NULL)) { 506 return (error); 507 } 508 } 509 510 /* 511 * Register property callbacks. 512 * 513 * It would probably be fine to just check for i/o error from 514 * the first prop_register(), but I guess I like to go 515 * overboard... 516 */ 517 ds = dmu_objset_ds(os); 518 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 519 error = dsl_prop_register(ds, 520 zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); 521 error = error ? error : dsl_prop_register(ds, 522 zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); 523 error = error ? error : dsl_prop_register(ds, 524 zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); 525 error = error ? error : dsl_prop_register(ds, 526 zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); 527 error = error ? error : dsl_prop_register(ds, 528 zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); 529 error = error ? error : dsl_prop_register(ds, 530 zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); 531 error = error ? error : dsl_prop_register(ds, 532 zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); 533 error = error ? error : dsl_prop_register(ds, 534 zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); 535 error = error ? error : dsl_prop_register(ds, 536 zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); 537 error = error ? error : dsl_prop_register(ds, 538 zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, 539 zfsvfs); 540 error = error ? 
error : dsl_prop_register(ds, 541 zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); 542 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 543 if (error) 544 goto unregister; 545 546 /* 547 * Invoke our callbacks to restore temporary mount options. 548 */ 549 if (do_readonly) 550 readonly_changed_cb(zfsvfs, readonly); 551 if (do_setuid) 552 setuid_changed_cb(zfsvfs, setuid); 553 if (do_exec) 554 exec_changed_cb(zfsvfs, exec); 555 if (do_devices) 556 devices_changed_cb(zfsvfs, devices); 557 if (do_xattr) 558 xattr_changed_cb(zfsvfs, xattr); 559 if (do_atime) 560 atime_changed_cb(zfsvfs, atime); 561 562 nbmand_changed_cb(zfsvfs, nbmand); 563 564 return (0); 565 566 unregister: 567 dsl_prop_unregister_all(ds, zfsvfs); 568 return (error); 569 } 570 571 static int 572 zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, 573 uint64_t *userp, uint64_t *groupp, uint64_t *projectp) 574 { 575 sa_hdr_phys_t sa; 576 sa_hdr_phys_t *sap = data; 577 uint64_t flags; 578 int hdrsize; 579 boolean_t swap = B_FALSE; 580 581 /* 582 * Is it a valid type of object to track? 583 */ 584 if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) 585 return (SET_ERROR(ENOENT)); 586 587 /* 588 * If we have a NULL data pointer 589 * then assume the id's aren't changing and 590 * return EEXIST to the dmu to let it know to 591 * use the same ids 592 */ 593 if (data == NULL) 594 return (SET_ERROR(EEXIST)); 595 596 if (bonustype == DMU_OT_ZNODE) { 597 znode_phys_t *znp = data; 598 *userp = znp->zp_uid; 599 *groupp = znp->zp_gid; 600 *projectp = ZFS_DEFAULT_PROJID; 601 return (0); 602 } 603 604 if (sap->sa_magic == 0) { 605 /* 606 * This should only happen for newly created files 607 * that haven't had the znode data filled in yet. 
608 */ 609 *userp = 0; 610 *groupp = 0; 611 *projectp = ZFS_DEFAULT_PROJID; 612 return (0); 613 } 614 615 sa = *sap; 616 if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { 617 sa.sa_magic = SA_MAGIC; 618 sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); 619 swap = B_TRUE; 620 } else { 621 VERIFY3U(sa.sa_magic, ==, SA_MAGIC); 622 } 623 624 hdrsize = sa_hdrsize(&sa); 625 VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); 626 627 *userp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_UID_OFFSET)); 628 *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_GID_OFFSET)); 629 flags = *((uint64_t *)((uintptr_t)data + hdrsize + SA_FLAGS_OFFSET)); 630 if (swap) 631 flags = BSWAP_64(flags); 632 633 if (flags & ZFS_PROJID) 634 *projectp = *((uint64_t *)((uintptr_t)data + hdrsize + 635 SA_PROJID_OFFSET)); 636 else 637 *projectp = ZFS_DEFAULT_PROJID; 638 639 if (swap) { 640 *userp = BSWAP_64(*userp); 641 *groupp = BSWAP_64(*groupp); 642 *projectp = BSWAP_64(*projectp); 643 } 644 return (0); 645 } 646 647 static void 648 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, 649 char *domainbuf, int buflen, uid_t *ridp) 650 { 651 uint64_t fuid; 652 const char *domain; 653 654 fuid = zfs_strtonum(fuidstr, NULL); 655 656 domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); 657 if (domain) 658 (void) strlcpy(domainbuf, domain, buflen); 659 else 660 domainbuf[0] = '\0'; 661 *ridp = FUID_RID(fuid); 662 } 663 664 static uint64_t 665 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) 666 { 667 switch (type) { 668 case ZFS_PROP_USERUSED: 669 case ZFS_PROP_USEROBJUSED: 670 return (DMU_USERUSED_OBJECT); 671 case ZFS_PROP_GROUPUSED: 672 case ZFS_PROP_GROUPOBJUSED: 673 return (DMU_GROUPUSED_OBJECT); 674 case ZFS_PROP_PROJECTUSED: 675 case ZFS_PROP_PROJECTOBJUSED: 676 return (DMU_PROJECTUSED_OBJECT); 677 case ZFS_PROP_USERQUOTA: 678 return (zfsvfs->z_userquota_obj); 679 case ZFS_PROP_GROUPQUOTA: 680 return (zfsvfs->z_groupquota_obj); 681 case ZFS_PROP_USEROBJQUOTA: 682 return 
(zfsvfs->z_userobjquota_obj); 683 case ZFS_PROP_GROUPOBJQUOTA: 684 return (zfsvfs->z_groupobjquota_obj); 685 case ZFS_PROP_PROJECTQUOTA: 686 return (zfsvfs->z_projectquota_obj); 687 case ZFS_PROP_PROJECTOBJQUOTA: 688 return (zfsvfs->z_projectobjquota_obj); 689 default: 690 return (ZFS_NO_OBJECT); 691 } 692 } 693 694 int 695 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 696 uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) 697 { 698 int error; 699 zap_cursor_t zc; 700 zap_attribute_t za; 701 zfs_useracct_t *buf = vbuf; 702 uint64_t obj; 703 int offset = 0; 704 705 if (!dmu_objset_userspace_present(zfsvfs->z_os)) 706 return (SET_ERROR(ENOTSUP)); 707 708 if ((type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED || 709 type == ZFS_PROP_PROJECTOBJQUOTA || 710 type == ZFS_PROP_PROJECTOBJUSED) && 711 !dmu_objset_projectquota_present(zfsvfs->z_os)) 712 return (SET_ERROR(ENOTSUP)); 713 714 if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || 715 type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || 716 type == ZFS_PROP_PROJECTOBJUSED || 717 type == ZFS_PROP_PROJECTOBJQUOTA) && 718 !dmu_objset_userobjspace_present(zfsvfs->z_os)) 719 return (SET_ERROR(ENOTSUP)); 720 721 obj = zfs_userquota_prop_to_obj(zfsvfs, type); 722 if (obj == ZFS_NO_OBJECT) { 723 *bufsizep = 0; 724 return (0); 725 } 726 727 if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || 728 type == ZFS_PROP_PROJECTOBJUSED) 729 offset = DMU_OBJACCT_PREFIX_LEN; 730 731 for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); 732 (error = zap_cursor_retrieve(&zc, &za)) == 0; 733 zap_cursor_advance(&zc)) { 734 if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > 735 *bufsizep) 736 break; 737 738 /* 739 * skip object quota (with zap name prefix DMU_OBJACCT_PREFIX) 740 * when dealing with block quota and vice versa. 
741 */ 742 if ((offset > 0) != (strncmp(za.za_name, DMU_OBJACCT_PREFIX, 743 DMU_OBJACCT_PREFIX_LEN) == 0)) 744 continue; 745 746 fuidstr_to_sid(zfsvfs, za.za_name + offset, 747 buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); 748 749 buf->zu_space = za.za_first_integer; 750 buf++; 751 } 752 if (error == ENOENT) 753 error = 0; 754 755 ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); 756 *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; 757 *cookiep = zap_cursor_serialize(&zc); 758 zap_cursor_fini(&zc); 759 return (error); 760 } 761 762 /* 763 * buf must be big enough (eg, 32 bytes) 764 */ 765 static int 766 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, 767 char *buf, boolean_t addok) 768 { 769 uint64_t fuid; 770 int domainid = 0; 771 772 if (domain && domain[0]) { 773 domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); 774 if (domainid == -1) 775 return (SET_ERROR(ENOENT)); 776 } 777 fuid = FUID_ENCODE(domainid, rid); 778 (void) sprintf(buf, "%llx", (longlong_t)fuid); 779 return (0); 780 } 781 782 int 783 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 784 const char *domain, uint64_t rid, uint64_t *valp) 785 { 786 char buf[20 + DMU_OBJACCT_PREFIX_LEN]; 787 int offset = 0; 788 int err; 789 uint64_t obj; 790 791 *valp = 0; 792 793 if (!dmu_objset_userspace_present(zfsvfs->z_os)) 794 return (SET_ERROR(ENOTSUP)); 795 796 if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || 797 type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || 798 type == ZFS_PROP_PROJECTOBJUSED || 799 type == ZFS_PROP_PROJECTOBJQUOTA) && 800 !dmu_objset_userobjspace_present(zfsvfs->z_os)) 801 return (SET_ERROR(ENOTSUP)); 802 803 if (type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED || 804 type == ZFS_PROP_PROJECTOBJQUOTA || 805 type == ZFS_PROP_PROJECTOBJUSED) { 806 if (!dmu_objset_projectquota_present(zfsvfs->z_os)) 807 return (SET_ERROR(ENOTSUP)); 808 if (!zpl_is_valid_projid(rid)) 809 return 
(SET_ERROR(EINVAL)); 810 } 811 812 obj = zfs_userquota_prop_to_obj(zfsvfs, type); 813 if (obj == ZFS_NO_OBJECT) 814 return (0); 815 816 if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || 817 type == ZFS_PROP_PROJECTOBJUSED) { 818 strncpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN); 819 offset = DMU_OBJACCT_PREFIX_LEN; 820 } 821 822 err = id_to_fuidstr(zfsvfs, domain, rid, buf + offset, B_FALSE); 823 if (err) 824 return (err); 825 826 err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); 827 if (err == ENOENT) 828 err = 0; 829 return (err); 830 } 831 832 int 833 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, 834 const char *domain, uint64_t rid, uint64_t quota) 835 { 836 char buf[32]; 837 int err; 838 dmu_tx_t *tx; 839 uint64_t *objp; 840 boolean_t fuid_dirtied; 841 842 if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) 843 return (SET_ERROR(ENOTSUP)); 844 845 switch (type) { 846 case ZFS_PROP_USERQUOTA: 847 objp = &zfsvfs->z_userquota_obj; 848 break; 849 case ZFS_PROP_GROUPQUOTA: 850 objp = &zfsvfs->z_groupquota_obj; 851 break; 852 case ZFS_PROP_USEROBJQUOTA: 853 objp = &zfsvfs->z_userobjquota_obj; 854 break; 855 case ZFS_PROP_GROUPOBJQUOTA: 856 objp = &zfsvfs->z_groupobjquota_obj; 857 break; 858 case ZFS_PROP_PROJECTQUOTA: 859 if (!dmu_objset_projectquota_enabled(zfsvfs->z_os)) 860 return (SET_ERROR(ENOTSUP)); 861 if (!zpl_is_valid_projid(rid)) 862 return (SET_ERROR(EINVAL)); 863 864 objp = &zfsvfs->z_projectquota_obj; 865 break; 866 case ZFS_PROP_PROJECTOBJQUOTA: 867 if (!dmu_objset_projectquota_enabled(zfsvfs->z_os)) 868 return (SET_ERROR(ENOTSUP)); 869 if (!zpl_is_valid_projid(rid)) 870 return (SET_ERROR(EINVAL)); 871 872 objp = &zfsvfs->z_projectobjquota_obj; 873 break; 874 default: 875 return (SET_ERROR(EINVAL)); 876 } 877 878 err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); 879 if (err) 880 return (err); 881 fuid_dirtied = zfsvfs->z_fuid_dirty; 882 883 tx = dmu_tx_create(zfsvfs->z_os); 884 dmu_tx_hold_zap(tx, *objp ? 
*objp : DMU_NEW_OBJECT, B_TRUE, NULL); 885 if (*objp == 0) { 886 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, 887 zfs_userquota_prop_prefixes[type]); 888 } 889 if (fuid_dirtied) 890 zfs_fuid_txhold(zfsvfs, tx); 891 err = dmu_tx_assign(tx, TXG_WAIT); 892 if (err) { 893 dmu_tx_abort(tx); 894 return (err); 895 } 896 897 mutex_enter(&zfsvfs->z_lock); 898 if (*objp == 0) { 899 *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, 900 DMU_OT_NONE, 0, tx); 901 VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, 902 zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); 903 } 904 mutex_exit(&zfsvfs->z_lock); 905 906 if (quota == 0) { 907 err = zap_remove(zfsvfs->z_os, *objp, buf, tx); 908 if (err == ENOENT) 909 err = 0; 910 } else { 911 err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); 912 } 913 ASSERT(err == 0); 914 if (fuid_dirtied) 915 zfs_fuid_sync(zfsvfs, tx); 916 dmu_tx_commit(tx); 917 return (err); 918 } 919 920 boolean_t 921 zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) 922 { 923 char buf[20 + DMU_OBJACCT_PREFIX_LEN]; 924 uint64_t used, quota, quotaobj; 925 int err; 926 927 if (!dmu_objset_userobjspace_present(zfsvfs->z_os)) { 928 if (dmu_objset_userobjspace_upgradable(zfsvfs->z_os)) { 929 dsl_pool_config_enter( 930 dmu_objset_pool(zfsvfs->z_os), FTAG); 931 dmu_objset_id_quota_upgrade(zfsvfs->z_os); 932 dsl_pool_config_exit( 933 dmu_objset_pool(zfsvfs->z_os), FTAG); 934 } 935 return (B_FALSE); 936 } 937 938 if (usedobj == DMU_PROJECTUSED_OBJECT) { 939 if (!dmu_objset_projectquota_present(zfsvfs->z_os)) { 940 if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) { 941 dsl_pool_config_enter( 942 dmu_objset_pool(zfsvfs->z_os), FTAG); 943 dmu_objset_id_quota_upgrade(zfsvfs->z_os); 944 dsl_pool_config_exit( 945 dmu_objset_pool(zfsvfs->z_os), FTAG); 946 } 947 return (B_FALSE); 948 } 949 quotaobj = zfsvfs->z_projectobjquota_obj; 950 } else if (usedobj == DMU_USERUSED_OBJECT) { 951 quotaobj = zfsvfs->z_userobjquota_obj; 952 } else if (usedobj 
== DMU_GROUPUSED_OBJECT) { 953 quotaobj = zfsvfs->z_groupobjquota_obj; 954 } else { 955 return (B_FALSE); 956 } 957 if (quotaobj == 0 || zfsvfs->z_replay) 958 return (B_FALSE); 959 960 (void) sprintf(buf, "%llx", (longlong_t)id); 961 err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); 962 if (err != 0) 963 return (B_FALSE); 964 965 (void) sprintf(buf, DMU_OBJACCT_PREFIX "%llx", (longlong_t)id); 966 err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); 967 if (err != 0) 968 return (B_FALSE); 969 return (used >= quota); 970 } 971 972 boolean_t 973 zfs_id_overblockquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) 974 { 975 char buf[20]; 976 uint64_t used, quota, quotaobj; 977 int err; 978 979 if (usedobj == DMU_PROJECTUSED_OBJECT) { 980 if (!dmu_objset_projectquota_present(zfsvfs->z_os)) { 981 if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) { 982 dsl_pool_config_enter( 983 dmu_objset_pool(zfsvfs->z_os), FTAG); 984 dmu_objset_id_quota_upgrade(zfsvfs->z_os); 985 dsl_pool_config_exit( 986 dmu_objset_pool(zfsvfs->z_os), FTAG); 987 } 988 return (B_FALSE); 989 } 990 quotaobj = zfsvfs->z_projectquota_obj; 991 } else if (usedobj == DMU_USERUSED_OBJECT) { 992 quotaobj = zfsvfs->z_userquota_obj; 993 } else if (usedobj == DMU_GROUPUSED_OBJECT) { 994 quotaobj = zfsvfs->z_groupquota_obj; 995 } else { 996 return (B_FALSE); 997 } 998 if (quotaobj == 0 || zfsvfs->z_replay) 999 return (B_FALSE); 1000 1001 (void) sprintf(buf, "%llx", (longlong_t)id); 1002 err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); 1003 if (err != 0) 1004 return (B_FALSE); 1005 1006 err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); 1007 if (err != 0) 1008 return (B_FALSE); 1009 return (used >= quota); 1010 } 1011 1012 boolean_t 1013 zfs_id_overquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) 1014 { 1015 return (zfs_id_overblockquota(zfsvfs, usedobj, id) || 1016 zfs_id_overobjquota(zfsvfs, usedobj, id)); 1017 } 1018 1019 /* 1020 * Associate this zfsvfs with the given 
objset, which must be owned. 1021 * This will cache a bunch of on-disk state from the objset in the 1022 * zfsvfs. 1023 */ 1024 static int 1025 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) 1026 { 1027 int error; 1028 uint64_t val; 1029 1030 zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; 1031 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; 1032 zfsvfs->z_os = os; 1033 1034 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); 1035 if (error != 0) 1036 return (error); 1037 if (zfsvfs->z_version > 1038 zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { 1039 (void) printf("Can't mount a version %lld file system " 1040 "on a version %lld pool\n. Pool must be upgraded to mount " 1041 "this file system.", (u_longlong_t)zfsvfs->z_version, 1042 (u_longlong_t)spa_version(dmu_objset_spa(os))); 1043 return (SET_ERROR(ENOTSUP)); 1044 } 1045 error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); 1046 if (error != 0) 1047 return (error); 1048 zfsvfs->z_norm = (int)val; 1049 1050 error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); 1051 if (error != 0) 1052 return (error); 1053 zfsvfs->z_utf8 = (val != 0); 1054 1055 error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); 1056 if (error != 0) 1057 return (error); 1058 zfsvfs->z_case = (uint_t)val; 1059 1060 /* 1061 * Fold case on file systems that are always or sometimes case 1062 * insensitive. 
1063 */ 1064 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 1065 zfsvfs->z_case == ZFS_CASE_MIXED) 1066 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 1067 1068 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 1069 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 1070 1071 uint64_t sa_obj = 0; 1072 if (zfsvfs->z_use_sa) { 1073 /* should either have both of these objects or none */ 1074 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, 1075 &sa_obj); 1076 if (error != 0) 1077 return (error); 1078 } 1079 1080 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, 1081 &zfsvfs->z_attr_table); 1082 if (error != 0) 1083 return (error); 1084 1085 if (zfsvfs->z_version >= ZPL_VERSION_SA) 1086 sa_register_update_callback(os, zfs_sa_upgrade); 1087 1088 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, 1089 &zfsvfs->z_root); 1090 if (error != 0) 1091 return (error); 1092 ASSERT(zfsvfs->z_root != 0); 1093 1094 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, 1095 &zfsvfs->z_unlinkedobj); 1096 if (error != 0) 1097 return (error); 1098 1099 error = zap_lookup(os, MASTER_NODE_OBJ, 1100 zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 1101 8, 1, &zfsvfs->z_userquota_obj); 1102 if (error == ENOENT) 1103 zfsvfs->z_userquota_obj = 0; 1104 else if (error != 0) 1105 return (error); 1106 1107 error = zap_lookup(os, MASTER_NODE_OBJ, 1108 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 1109 8, 1, &zfsvfs->z_groupquota_obj); 1110 if (error == ENOENT) 1111 zfsvfs->z_groupquota_obj = 0; 1112 else if (error != 0) 1113 return (error); 1114 1115 error = zap_lookup(os, MASTER_NODE_OBJ, 1116 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], 1117 8, 1, &zfsvfs->z_projectquota_obj); 1118 if (error == ENOENT) 1119 zfsvfs->z_projectquota_obj = 0; 1120 else if (error != 0) 1121 return (error); 1122 1123 error = zap_lookup(os, MASTER_NODE_OBJ, 1124 zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], 1125 8, 1, &zfsvfs->z_userobjquota_obj); 1126 
if (error == ENOENT) 1127 zfsvfs->z_userobjquota_obj = 0; 1128 else if (error != 0) 1129 return (error); 1130 1131 error = zap_lookup(os, MASTER_NODE_OBJ, 1132 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], 1133 8, 1, &zfsvfs->z_groupobjquota_obj); 1134 if (error == ENOENT) 1135 zfsvfs->z_groupobjquota_obj = 0; 1136 else if (error != 0) 1137 return (error); 1138 1139 error = zap_lookup(os, MASTER_NODE_OBJ, 1140 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], 1141 8, 1, &zfsvfs->z_projectobjquota_obj); 1142 if (error == ENOENT) 1143 zfsvfs->z_projectobjquota_obj = 0; 1144 else if (error != 0) 1145 return (error); 1146 1147 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, 1148 &zfsvfs->z_fuid_obj); 1149 if (error == ENOENT) 1150 zfsvfs->z_fuid_obj = 0; 1151 else if (error != 0) 1152 return (error); 1153 1154 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, 1155 &zfsvfs->z_shares_dir); 1156 if (error == ENOENT) 1157 zfsvfs->z_shares_dir = 0; 1158 else if (error != 0) 1159 return (error); 1160 1161 return (0); 1162 } 1163 1164 int 1165 zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) 1166 { 1167 objset_t *os; 1168 zfsvfs_t *zfsvfs; 1169 int error; 1170 boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); 1171 1172 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); 1173 1174 error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os); 1175 if (error != 0) { 1176 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1177 return (error); 1178 } 1179 1180 error = zfsvfs_create_impl(zfvp, zfsvfs, os); 1181 if (error != 0) { 1182 dmu_objset_disown(os, B_TRUE, zfsvfs); 1183 } 1184 return (error); 1185 } 1186 1187 1188 int 1189 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) 1190 { 1191 int error; 1192 1193 zfsvfs->z_vfs = NULL; 1194 zfsvfs->z_parent = zfsvfs; 1195 1196 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1197 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); 1198 
list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), 1199 offsetof(znode_t, z_link_node)); 1200 rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); 1201 rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); 1202 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); 1203 for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1204 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 1205 1206 error = zfsvfs_init(zfsvfs, os); 1207 if (error != 0) { 1208 *zfvp = NULL; 1209 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1210 return (error); 1211 } 1212 1213 zfsvfs->z_drain_task = TASKQID_INVALID; 1214 zfsvfs->z_draining = B_FALSE; 1215 zfsvfs->z_drain_cancel = B_TRUE; 1216 1217 *zfvp = zfsvfs; 1218 return (0); 1219 } 1220 1221 static int 1222 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) 1223 { 1224 int error; 1225 1226 error = zfs_register_callbacks(zfsvfs->z_vfs); 1227 if (error) 1228 return (error); 1229 1230 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); 1231 1232 /* 1233 * If we are not mounting (ie: online recv), then we don't 1234 * have to worry about replaying the log as we blocked all 1235 * operations out since we closed the ZIL. 1236 */ 1237 if (mounting) { 1238 boolean_t readonly; 1239 1240 /* 1241 * During replay we remove the read only flag to 1242 * allow replays to succeed. 1243 */ 1244 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; 1245 if (readonly != 0) { 1246 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; 1247 } else { 1248 zfs_unlinked_drain(zfsvfs); 1249 } 1250 1251 /* 1252 * Parse and replay the intent log. 1253 * 1254 * Because of ziltest, this must be done after 1255 * zfs_unlinked_drain(). (Further note: ziltest 1256 * doesn't use readonly mounts, where 1257 * zfs_unlinked_drain() isn't called.) This is because 1258 * ziltest causes spa_sync() to think it's committed, 1259 * but actually it is not, so the intent log contains 1260 * many txg's worth of changes. 
1261 * 1262 * In particular, if object N is in the unlinked set in 1263 * the last txg to actually sync, then it could be 1264 * actually freed in a later txg and then reallocated 1265 * in a yet later txg. This would write a "create 1266 * object N" record to the intent log. Normally, this 1267 * would be fine because the spa_sync() would have 1268 * written out the fact that object N is free, before 1269 * we could write the "create object N" intent log 1270 * record. 1271 * 1272 * But when we are in ziltest mode, we advance the "open 1273 * txg" without actually spa_sync()-ing the changes to 1274 * disk. So we would see that object N is still 1275 * allocated and in the unlinked set, and there is an 1276 * intent log record saying to allocate it. 1277 */ 1278 if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { 1279 if (zil_replay_disable) { 1280 zil_destroy(zfsvfs->z_log, B_FALSE); 1281 } else { 1282 zfsvfs->z_replay = B_TRUE; 1283 zil_replay(zfsvfs->z_os, zfsvfs, 1284 zfs_replay_vector); 1285 zfsvfs->z_replay = B_FALSE; 1286 } 1287 } 1288 1289 /* restore readonly bit */ 1290 if (readonly != 0) 1291 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; 1292 } 1293 1294 /* 1295 * Set the objset user_ptr to track its zfsvfs. 1296 */ 1297 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1298 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1299 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1300 1301 return (0); 1302 } 1303 1304 void 1305 zfsvfs_free(zfsvfs_t *zfsvfs) 1306 { 1307 int i; 1308 extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ 1309 1310 /* 1311 * This is a barrier to prevent the filesystem from going away in 1312 * zfs_znode_move() until we can safely ensure that the filesystem is 1313 * not unmounted. We consider the filesystem valid before the barrier 1314 * and invalid after the barrier. 
1315 */ 1316 rw_enter(&zfsvfs_lock, RW_READER); 1317 rw_exit(&zfsvfs_lock); 1318 1319 zfs_fuid_destroy(zfsvfs); 1320 1321 mutex_destroy(&zfsvfs->z_znodes_lock); 1322 mutex_destroy(&zfsvfs->z_lock); 1323 list_destroy(&zfsvfs->z_all_znodes); 1324 rrm_destroy(&zfsvfs->z_teardown_lock); 1325 rw_destroy(&zfsvfs->z_teardown_inactive_lock); 1326 rw_destroy(&zfsvfs->z_fuid_lock); 1327 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1328 mutex_destroy(&zfsvfs->z_hold_mtx[i]); 1329 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1330 } 1331 1332 static void 1333 zfs_set_fuid_feature(zfsvfs_t *zfsvfs) 1334 { 1335 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); 1336 if (zfsvfs->z_vfs) { 1337 if (zfsvfs->z_use_fuids) { 1338 vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); 1339 vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); 1340 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); 1341 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); 1342 vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); 1343 vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); 1344 } else { 1345 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); 1346 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); 1347 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); 1348 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); 1349 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); 1350 vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); 1351 } 1352 } 1353 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); 1354 } 1355 1356 static int 1357 zfs_domount(vfs_t *vfsp, char *osname) 1358 { 1359 dev_t mount_dev; 1360 uint64_t recordsize, fsid_guid; 1361 int error = 0; 1362 zfsvfs_t *zfsvfs; 1363 boolean_t readonly = vfsp->vfs_flag & VFS_RDONLY ? B_TRUE : B_FALSE; 1364 1365 ASSERT(vfsp); 1366 ASSERT(osname); 1367 1368 error = zfsvfs_create(osname, readonly, &zfsvfs); 1369 if (error) 1370 return (error); 1371 zfsvfs->z_vfs = vfsp; 1372 1373 /* Initialize the generic filesystem structure. 
*/ 1374 vfsp->vfs_bcount = 0; 1375 vfsp->vfs_data = NULL; 1376 1377 if (zfs_create_unique_device(&mount_dev) == -1) { 1378 error = SET_ERROR(ENODEV); 1379 goto out; 1380 } 1381 ASSERT(vfs_devismounted(mount_dev) == 0); 1382 1383 if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, 1384 NULL)) 1385 goto out; 1386 1387 vfsp->vfs_dev = mount_dev; 1388 vfsp->vfs_fstype = zfsfstype; 1389 vfsp->vfs_bsize = recordsize; 1390 vfsp->vfs_flag |= VFS_NOTRUNC; 1391 vfsp->vfs_data = zfsvfs; 1392 1393 /* 1394 * The fsid is 64 bits, composed of an 8-bit fs type, which 1395 * separates our fsid from any other filesystem types, and a 1396 * 56-bit objset unique ID. The objset unique ID is unique to 1397 * all objsets open on this system, provided by unique_create(). 1398 * The 8-bit fs type must be put in the low bits of fsid[1] 1399 * because that's where other Solaris filesystems put it. 1400 */ 1401 fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); 1402 ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); 1403 vfsp->vfs_fsid.val[0] = fsid_guid; 1404 vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | 1405 zfsfstype & 0xFF; 1406 1407 /* 1408 * Set features for file system. 
1409 */ 1410 zfs_set_fuid_feature(zfsvfs); 1411 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { 1412 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 1413 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 1414 vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); 1415 } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { 1416 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); 1417 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); 1418 } 1419 vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); 1420 1421 if (dmu_objset_is_snapshot(zfsvfs->z_os)) { 1422 uint64_t pval; 1423 1424 atime_changed_cb(zfsvfs, B_FALSE); 1425 readonly_changed_cb(zfsvfs, B_TRUE); 1426 if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) 1427 goto out; 1428 xattr_changed_cb(zfsvfs, pval); 1429 zfsvfs->z_issnap = B_TRUE; 1430 zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; 1431 1432 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); 1433 dmu_objset_set_user(zfsvfs->z_os, zfsvfs); 1434 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); 1435 } else { 1436 error = zfsvfs_setup(zfsvfs, B_TRUE); 1437 } 1438 1439 /* cache the root vnode for this mount */ 1440 znode_t *rootzp; 1441 if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp)) { 1442 goto out; 1443 } 1444 zfsvfs->z_rootdir = ZTOV(rootzp); 1445 1446 if (!zfsvfs->z_issnap) 1447 zfsctl_create(zfsvfs); 1448 out: 1449 if (error) { 1450 dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); 1451 zfsvfs_free(zfsvfs); 1452 } else { 1453 atomic_inc_32(&zfs_active_fs_count); 1454 } 1455 1456 return (error); 1457 } 1458 1459 void 1460 zfs_unregister_callbacks(zfsvfs_t *zfsvfs) 1461 { 1462 objset_t *os = zfsvfs->z_os; 1463 1464 if (!dmu_objset_is_snapshot(os)) 1465 dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); 1466 } 1467 1468 /* 1469 * Convert a decimal digit string to a uint64_t integer. 
1470 */ 1471 static int 1472 str_to_uint64(char *str, uint64_t *objnum) 1473 { 1474 uint64_t num = 0; 1475 1476 while (*str) { 1477 if (*str < '0' || *str > '9') 1478 return (SET_ERROR(EINVAL)); 1479 1480 num = num*10 + *str++ - '0'; 1481 } 1482 1483 *objnum = num; 1484 return (0); 1485 } 1486 1487 /* 1488 * The boot path passed from the boot loader is in the form of 1489 * "rootpool-name/root-filesystem-object-number'. Convert this 1490 * string to a dataset name: "rootpool-name/root-filesystem-name". 1491 */ 1492 static int 1493 zfs_parse_bootfs(char *bpath, char *outpath) 1494 { 1495 char *slashp; 1496 uint64_t objnum; 1497 int error; 1498 1499 if (*bpath == 0 || *bpath == '/') 1500 return (SET_ERROR(EINVAL)); 1501 1502 (void) strcpy(outpath, bpath); 1503 1504 slashp = strchr(bpath, '/'); 1505 1506 /* if no '/', just return the pool name */ 1507 if (slashp == NULL) { 1508 return (0); 1509 } 1510 1511 /* if not a number, just return the root dataset name */ 1512 if (str_to_uint64(slashp+1, &objnum)) { 1513 return (0); 1514 } 1515 1516 *slashp = '\0'; 1517 error = dsl_dsobj_to_dsname(bpath, objnum, outpath); 1518 *slashp = '/'; 1519 1520 return (error); 1521 } 1522 1523 /* 1524 * Check that the hex label string is appropriate for the dataset being 1525 * mounted into the global_zone proper. 1526 * 1527 * Return an error if the hex label string is not default or 1528 * admin_low/admin_high. For admin_low labels, the corresponding 1529 * dataset must be readonly. 1530 */ 1531 int 1532 zfs_check_global_label(const char *dsname, const char *hexsl) 1533 { 1534 if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) 1535 return (0); 1536 if (strcasecmp(hexsl, ADMIN_HIGH) == 0) 1537 return (0); 1538 if (strcasecmp(hexsl, ADMIN_LOW) == 0) { 1539 /* must be readonly */ 1540 uint64_t rdonly; 1541 1542 if (dsl_prop_get_integer(dsname, 1543 zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) 1544 return (SET_ERROR(EACCES)); 1545 return (rdonly ? 
0 : EACCES); 1546 } 1547 return (SET_ERROR(EACCES)); 1548 } 1549 1550 static int 1551 zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct statvfs64 *statp, 1552 uint32_t bshift) 1553 { 1554 char buf[20 + DMU_OBJACCT_PREFIX_LEN]; 1555 uint64_t offset = DMU_OBJACCT_PREFIX_LEN; 1556 uint64_t quota; 1557 uint64_t used; 1558 int err; 1559 1560 strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1); 1561 err = id_to_fuidstr(zfsvfs, NULL, zp->z_projid, buf + offset, B_FALSE); 1562 if (err) 1563 return (err); 1564 1565 if (zfsvfs->z_projectquota_obj == 0) 1566 goto objs; 1567 1568 err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj, 1569 buf + offset, 8, 1, "a); 1570 if (err == ENOENT) 1571 goto objs; 1572 else if (err) 1573 return (err); 1574 1575 err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT, 1576 buf + offset, 8, 1, &used); 1577 if (unlikely(err == ENOENT)) { 1578 uint32_t blksize; 1579 u_longlong_t nblocks; 1580 1581 /* 1582 * Quota accounting is async, so it is possible race case. 1583 * There is at least one object with the given project ID. 1584 */ 1585 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); 1586 if (unlikely(zp->z_blksz == 0)) 1587 blksize = zfsvfs->z_max_blksz; 1588 1589 used = blksize * nblocks; 1590 } else if (err) { 1591 return (err); 1592 } 1593 1594 statp->f_blocks = quota >> bshift; 1595 statp->f_bfree = (quota > used) ? ((quota - used) >> bshift) : 0; 1596 statp->f_bavail = statp->f_bfree; 1597 1598 objs: 1599 if (zfsvfs->z_projectobjquota_obj == 0) 1600 return (0); 1601 1602 err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj, 1603 buf + offset, 8, 1, "a); 1604 if (err == ENOENT) 1605 return (0); 1606 else if (err) 1607 return (err); 1608 1609 err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT, 1610 buf, 8, 1, &used); 1611 if (unlikely(err == ENOENT)) { 1612 /* 1613 * Quota accounting is async, so it is possible race case. 1614 * There is at least one object with the given project ID. 
1615 */ 1616 used = 1; 1617 } else if (err) { 1618 return (err); 1619 } 1620 1621 statp->f_files = quota; 1622 statp->f_ffree = (quota > used) ? (quota - used) : 0; 1623 1624 return (0); 1625 } 1626 1627 /* 1628 * Determine whether the mount is allowed according to MAC check. 1629 * by comparing (where appropriate) label of the dataset against 1630 * the label of the zone being mounted into. If the dataset has 1631 * no label, create one. 1632 * 1633 * Returns 0 if access allowed, error otherwise (e.g. EACCES) 1634 */ 1635 static int 1636 zfs_mount_label_policy(vfs_t *vfsp, char *osname) 1637 { 1638 int error, retv; 1639 zone_t *mntzone = NULL; 1640 ts_label_t *mnt_tsl; 1641 bslabel_t *mnt_sl; 1642 bslabel_t ds_sl; 1643 char ds_hexsl[MAXNAMELEN]; 1644 1645 retv = EACCES; /* assume the worst */ 1646 1647 /* 1648 * Start by getting the dataset label if it exists. 1649 */ 1650 error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1651 1, sizeof (ds_hexsl), &ds_hexsl, NULL); 1652 if (error) 1653 return (SET_ERROR(EACCES)); 1654 1655 /* 1656 * If labeling is NOT enabled, then disallow the mount of datasets 1657 * which have a non-default label already. No other label checks 1658 * are needed. 1659 */ 1660 if (!is_system_labeled()) { 1661 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) 1662 return (0); 1663 return (SET_ERROR(EACCES)); 1664 } 1665 1666 /* 1667 * Get the label of the mountpoint. If mounting into the global 1668 * zone (i.e. mountpoint is not within an active zone and the 1669 * zoned property is off), the label must be default or 1670 * admin_low/admin_high only; no other checks are needed. 
1671 */ 1672 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); 1673 if (mntzone->zone_id == GLOBAL_ZONEID) { 1674 uint64_t zoned; 1675 1676 zone_rele(mntzone); 1677 1678 if (dsl_prop_get_integer(osname, 1679 zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) 1680 return (SET_ERROR(EACCES)); 1681 if (!zoned) 1682 return (zfs_check_global_label(osname, ds_hexsl)); 1683 else 1684 /* 1685 * This is the case of a zone dataset being mounted 1686 * initially, before the zone has been fully created; 1687 * allow this mount into global zone. 1688 */ 1689 return (0); 1690 } 1691 1692 mnt_tsl = mntzone->zone_slabel; 1693 ASSERT(mnt_tsl != NULL); 1694 label_hold(mnt_tsl); 1695 mnt_sl = label2bslabel(mnt_tsl); 1696 1697 if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { 1698 /* 1699 * The dataset doesn't have a real label, so fabricate one. 1700 */ 1701 char *str = NULL; 1702 1703 if (l_to_str_internal(mnt_sl, &str) == 0 && 1704 dsl_prop_set_string(osname, 1705 zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1706 ZPROP_SRC_LOCAL, str) == 0) 1707 retv = 0; 1708 if (str != NULL) 1709 kmem_free(str, strlen(str) + 1); 1710 } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { 1711 /* 1712 * Now compare labels to complete the MAC check. If the 1713 * labels are equal then allow access. If the mountpoint 1714 * label dominates the dataset label, allow readonly access. 1715 * Otherwise, access is denied. 1716 */ 1717 if (blequal(mnt_sl, &ds_sl)) 1718 retv = 0; 1719 else if (bldominates(mnt_sl, &ds_sl)) { 1720 vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); 1721 retv = 0; 1722 } 1723 } 1724 1725 label_rele(mnt_tsl); 1726 zone_rele(mntzone); 1727 return (retv); 1728 } 1729 1730 /* 1731 * Load a string-valued boot property and attempt to convert it to a 64-bit 1732 * unsigned integer. If the value is not present, or the conversion fails, 1733 * return the provided default value. 
1734 */ 1735 static uint64_t 1736 spa_get_bootprop_uint64(const char *name, uint64_t defval) 1737 { 1738 char *propval; 1739 u_longlong_t r; 1740 int e; 1741 1742 if ((propval = spa_get_bootprop(name)) == NULL) { 1743 /* 1744 * The property does not exist. 1745 */ 1746 return (defval); 1747 } 1748 1749 e = ddi_strtoull(propval, NULL, 10, &r); 1750 1751 spa_free_bootprop(propval); 1752 1753 /* 1754 * If the conversion succeeded, return the value. If there was any 1755 * kind of failure, just return the default value. 1756 */ 1757 return (e == 0 ? r : defval); 1758 } 1759 1760 static int 1761 zfs_mountroot(vfs_t *vfsp, enum whymountroot why) 1762 { 1763 int error = 0; 1764 static int zfsrootdone = 0; 1765 zfsvfs_t *zfsvfs = NULL; 1766 znode_t *zp = NULL; 1767 vnode_t *vp = NULL; 1768 char *zfs_bootfs; 1769 char *zfs_devid; 1770 char *zfs_rootdisk_path; 1771 uint64_t zfs_bootpool; 1772 uint64_t zfs_bootvdev; 1773 1774 ASSERT(vfsp); 1775 1776 /* 1777 * The filesystem that we mount as root is defined in the 1778 * boot property "zfs-bootfs" with a format of 1779 * "poolname/root-dataset-objnum". 1780 */ 1781 if (why == ROOT_INIT) { 1782 if (zfsrootdone++) 1783 return (SET_ERROR(EBUSY)); 1784 1785 /* 1786 * the process of doing a spa_load will require the 1787 * clock to be set before we could (for example) do 1788 * something better by looking at the timestamp on 1789 * an uberblock, so just set it to -1. 1790 */ 1791 clkset(-1); 1792 1793 if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { 1794 cmn_err(CE_NOTE, "spa_get_bootfs: can not get " 1795 "bootfs name"); 1796 return (SET_ERROR(EINVAL)); 1797 } 1798 zfs_devid = spa_get_bootprop("diskdevid"); 1799 1800 /* 1801 * The boot loader may also provide us with the GUID for both 1802 * the pool and the nominated boot vdev. A GUID value of 0 is 1803 * explicitly invalid (see "spa_change_guid()"), so we use this 1804 * as a sentinel value when no GUID is present. 
1805 */ 1806 zfs_bootpool = spa_get_bootprop_uint64("zfs-bootpool", 0); 1807 zfs_bootvdev = spa_get_bootprop_uint64("zfs-bootvdev", 0); 1808 1809 /* 1810 * If we have been given a root disk override path, we want to 1811 * ignore device paths from the pool configuration and use only 1812 * the specific path we were given in the boot properties. 1813 */ 1814 zfs_rootdisk_path = spa_get_bootprop("zfs-rootdisk-path"); 1815 1816 /* 1817 * Initialise the early boot device rescan mechanism. A scan 1818 * will not actually be performed unless we need to do so in 1819 * order to find the correct /devices path for a relocated 1820 * device. 1821 */ 1822 vdev_disk_preroot_init(zfs_rootdisk_path); 1823 1824 error = spa_import_rootpool(rootfs.bo_name, zfs_devid, 1825 zfs_bootpool, zfs_bootvdev); 1826 1827 spa_free_bootprop(zfs_devid); 1828 1829 if (error != 0) { 1830 spa_free_bootprop(zfs_bootfs); 1831 spa_free_bootprop(zfs_rootdisk_path); 1832 vdev_disk_preroot_fini(); 1833 cmn_err(CE_NOTE, "spa_import_rootpool: error %d", 1834 error); 1835 return (error); 1836 } 1837 1838 if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { 1839 spa_free_bootprop(zfs_bootfs); 1840 spa_free_bootprop(zfs_rootdisk_path); 1841 vdev_disk_preroot_fini(); 1842 cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", 1843 error); 1844 return (error); 1845 } 1846 1847 spa_free_bootprop(zfs_bootfs); 1848 spa_free_bootprop(zfs_rootdisk_path); 1849 1850 if ((error = vfs_lock(vfsp)) != 0) { 1851 vdev_disk_preroot_fini(); 1852 return (error); 1853 } 1854 1855 if (error = zfs_domount(vfsp, rootfs.bo_name)) { 1856 cmn_err(CE_NOTE, "zfs_domount: error %d", error); 1857 goto out; 1858 } 1859 1860 /* zfs_domount has already cached the root vnode for us */ 1861 zfsvfs = (zfsvfs_t *)vfsp->vfs_data; 1862 ASSERT(zfsvfs); 1863 ASSERT(zfsvfs->z_rootdir); 1864 1865 vp = zfsvfs->z_rootdir; 1866 mutex_enter(&vp->v_lock); 1867 vp->v_flag |= VROOT; 1868 mutex_exit(&vp->v_lock); 1869 1870 /* 1871 * Leave rootvp held. 
The root file system is never unmounted. 1872 */ 1873 VN_HOLD(vp); 1874 rootvp = vp; 1875 1876 vfs_add((struct vnode *)0, vfsp, 1877 (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); 1878 out: 1879 vdev_disk_preroot_fini(); 1880 vfs_unlock(vfsp); 1881 return (error); 1882 } else if (why == ROOT_REMOUNT) { 1883 readonly_changed_cb(vfsp->vfs_data, B_FALSE); 1884 vfsp->vfs_flag |= VFS_REMOUNT; 1885 1886 /* refresh mount options */ 1887 zfs_unregister_callbacks(vfsp->vfs_data); 1888 return (zfs_register_callbacks(vfsp)); 1889 1890 } else if (why == ROOT_UNMOUNT) { 1891 zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); 1892 (void) zfs_sync(vfsp, 0, 0); 1893 return (0); 1894 } 1895 1896 /* 1897 * if "why" is equal to anything else other than ROOT_INIT, 1898 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. 1899 */ 1900 return (SET_ERROR(ENOTSUP)); 1901 } 1902 1903 /*ARGSUSED*/ 1904 static int 1905 zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) 1906 { 1907 char *osname; 1908 pathname_t spn; 1909 int error = 0; 1910 uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ? 1911 UIO_SYSSPACE : UIO_USERSPACE; 1912 int canwrite; 1913 1914 if (mvp->v_type != VDIR) 1915 return (SET_ERROR(ENOTDIR)); 1916 1917 mutex_enter(&mvp->v_lock); 1918 if ((uap->flags & MS_REMOUNT) == 0 && 1919 (uap->flags & MS_OVERLAY) == 0 && 1920 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { 1921 mutex_exit(&mvp->v_lock); 1922 return (SET_ERROR(EBUSY)); 1923 } 1924 mutex_exit(&mvp->v_lock); 1925 1926 /* 1927 * ZFS does not support passing unparsed data in via MS_DATA. 1928 * Users should use the MS_OPTIONSTR interface; this means 1929 * that all option parsing is already done and the options struct 1930 * can be interrogated. 1931 */ 1932 if ((uap->flags & MS_DATA) && uap->datalen > 0) 1933 return (SET_ERROR(EINVAL)); 1934 1935 /* 1936 * Get the objset name (the "special" mount argument). 
1937 */ 1938 if (error = pn_get(uap->spec, fromspace, &spn)) 1939 return (error); 1940 1941 osname = spn.pn_path; 1942 1943 /* 1944 * Check for mount privilege? 1945 * 1946 * If we don't have privilege then see if 1947 * we have local permission to allow it 1948 */ 1949 error = secpolicy_fs_mount(cr, mvp, vfsp); 1950 if (error) { 1951 if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) == 0) { 1952 vattr_t vattr; 1953 1954 /* 1955 * Make sure user is the owner of the mount point 1956 * or has sufficient privileges. 1957 */ 1958 1959 vattr.va_mask = AT_UID; 1960 1961 if (VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) { 1962 goto out; 1963 } 1964 1965 if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 && 1966 VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) { 1967 goto out; 1968 } 1969 secpolicy_fs_mount_clearopts(cr, vfsp); 1970 } else { 1971 goto out; 1972 } 1973 } 1974 1975 /* 1976 * Refuse to mount a filesystem if we are in a local zone and the 1977 * dataset is not visible. 1978 */ 1979 if (!INGLOBALZONE(curproc) && 1980 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { 1981 error = SET_ERROR(EPERM); 1982 goto out; 1983 } 1984 1985 error = zfs_mount_label_policy(vfsp, osname); 1986 if (error) 1987 goto out; 1988 1989 /* 1990 * When doing a remount, we simply refresh our temporary properties 1991 * according to those options set in the current VFS options. 1992 */ 1993 if (uap->flags & MS_REMOUNT) { 1994 /* refresh mount options */ 1995 zfs_unregister_callbacks(vfsp->vfs_data); 1996 error = zfs_register_callbacks(vfsp); 1997 goto out; 1998 } 1999 2000 error = zfs_domount(vfsp, osname); 2001 2002 /* 2003 * Add an extra VFS_HOLD on our parent vfs so that it can't 2004 * disappear due to a forced unmount. 
2005 */ 2006 if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) 2007 VFS_HOLD(mvp->v_vfsp); 2008 2009 out: 2010 pn_free(&spn); 2011 return (error); 2012 } 2013 2014 static int 2015 zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp) 2016 { 2017 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2018 dev32_t d32; 2019 uint64_t refdbytes, availbytes, usedobjs, availobjs; 2020 int err = 0; 2021 2022 ZFS_ENTER(zfsvfs); 2023 2024 dmu_objset_space(zfsvfs->z_os, 2025 &refdbytes, &availbytes, &usedobjs, &availobjs); 2026 2027 /* 2028 * The underlying storage pool actually uses multiple block sizes. 2029 * We report the fragsize as the smallest block size we support, 2030 * and we report our blocksize as the filesystem's maximum blocksize. 2031 */ 2032 statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT; 2033 statp->f_bsize = zfsvfs->z_max_blksz; 2034 2035 /* 2036 * The following report "total" blocks of various kinds in the 2037 * file system, but reported in terms of f_frsize - the 2038 * "fragment" size. 2039 */ 2040 2041 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; 2042 statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT; 2043 statp->f_bavail = statp->f_bfree; /* no root reservation */ 2044 2045 /* 2046 * statvfs() should really be called statufs(), because it assumes 2047 * static metadata. ZFS doesn't preallocate files, so the best 2048 * we can do is report the max that could possibly fit in f_files, 2049 * and that minus the number actually used in f_ffree. 2050 * For f_ffree, report the smaller of the number of object available 2051 * and the number of blocks (each object will take at least a block). 2052 */ 2053 statp->f_ffree = MIN(availobjs, statp->f_bfree); 2054 statp->f_favail = statp->f_ffree; /* no "root reservation" */ 2055 statp->f_files = statp->f_ffree + usedobjs; 2056 2057 (void) cmpldev(&d32, vfsp->vfs_dev); 2058 statp->f_fsid = d32; 2059 2060 /* 2061 * We're a zfs filesystem. 
2062 */ 2063 (void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name); 2064 2065 statp->f_flag = vf_to_stf(vfsp->vfs_flag); 2066 2067 statp->f_namemax = MAXNAMELEN - 1; 2068 2069 /* 2070 * We have all of 32 characters to stuff a string here. 2071 * Is there anything useful we could/should provide? 2072 */ 2073 bzero(statp->f_fstr, sizeof (statp->f_fstr)); 2074 2075 if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && 2076 dmu_objset_projectquota_present(zfsvfs->z_os)) { 2077 znode_t *zp; 2078 2079 /* 2080 * In ZoL, zfs_statvfs is passed a Linux dentry (directory 2081 * entry), instead of a vfsp. The ZoL code uses the dentry 2082 * to get the znode from the dentry's inode. This represents 2083 * whatever filename was passed to the user-level statvfs 2084 * syscall. 2085 * 2086 * We're using the VFS root znode here, so this represents a 2087 * potential difference from ZoL. 2088 */ 2089 if (zfs_zget(zfsvfs, zfsvfs->z_root, &zp) == 0) { 2090 uint32_t bshift = ddi_fls(statp->f_bsize) - 1; 2091 2092 if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid && 2093 zpl_is_valid_projid(zp->z_projid)) 2094 err = zfs_statfs_project(zfsvfs, zp, statp, 2095 bshift); 2096 VN_RELE(ZTOV(zp)); 2097 } 2098 } 2099 2100 ZFS_EXIT(zfsvfs); 2101 return (err); 2102 } 2103 2104 static int 2105 zfs_root(vfs_t *vfsp, vnode_t **vpp) 2106 { 2107 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2108 struct vnode *vp; 2109 int error; 2110 2111 ZFS_ENTER(zfsvfs); 2112 2113 vp = zfsvfs->z_rootdir; 2114 if (vp != NULL) { 2115 VN_HOLD(vp); 2116 error = 0; 2117 } else { 2118 /* forced unmount */ 2119 error = EIO; 2120 } 2121 *vpp = vp; 2122 2123 ZFS_EXIT(zfsvfs); 2124 return (error); 2125 2126 } 2127 2128 /* 2129 * Teardown the zfsvfs::z_os. 2130 * 2131 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' 2132 * and 'z_teardown_inactive_lock' held. 
*/
/*
 * Returns 0 on success.  Returns EIO, with both locks dropped again, when
 * 'unmounting' is FALSE and the file system was already unmounted or has
 * no objset (z_os == NULL).
 */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
	znode_t	*zp;

	/* Stop the asynchronous unlinked drain before taking the locks. */
	zfs_unlinked_drain_stop_wait(zfsvfs);

	rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);

	if (!unmounting) {
		/*
		 * We purge the parent filesystem's vfsp as the parent
		 * filesystem and all of its snapshots have their vnode's
		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
		 * 'z_parent' is self referential for non-snapshots.
		 */
		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
	}

	/*
	 * Close the zil. NB: Can't close the zil while zfs_inactive
	 * threads are blocked as zil_close can call zfs_inactive.
	 */
	if (zfsvfs->z_log) {
		zil_close(zfsvfs->z_log);
		zfsvfs->z_log = NULL;
	}

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);

	/*
	 * If we are not unmounting (ie: online recv) and someone already
	 * unmounted this file system while we were doing the switcheroo,
	 * or a reopen of z_os failed then just bail out now.
	 */
	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
		return (SET_ERROR(EIO));
	}

	/*
	 * At this point there are no vops active, and any new vops will
	 * fail with EIO since we have z_teardown_lock for writer (only
	 * relevant for forced unmount).
	 *
	 * Release all holds on dbufs.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
	    zp = list_next(&zfsvfs->z_all_znodes, zp))
		if (zp->z_sa_hdl) {
			ASSERT(ZTOV(zp)->v_count > 0);
			zfs_znode_dmu_fini(zp);
		}
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * If we are unmounting, set the unmounted flag and let new vops
	 * unblock.  zfs_inactive will have the unmounted behavior, and all
	 * other vops will fail with EIO.
	 */
	if (unmounting) {
		/*
		 * Clear the cached root vnode now that we are unmounted.
		 * Its release must be performed outside the teardown locks to
		 * avoid recursive lock entry via zfs_inactive().
		 */
		vnode_t *vp = zfsvfs->z_rootdir;
		zfsvfs->z_rootdir = NULL;

		zfsvfs->z_unmounted = B_TRUE;
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);

		/* Drop the cached root vp now that it is safe */
		VN_RELE(vp);
	}

	/*
	 * z_os will be NULL if there was an error in attempting to reopen
	 * zfsvfs, so just return as the properties had already been
	 * unregistered and cached data had been evicted before.
	 */
	if (zfsvfs->z_os == NULL)
		return (0);

	/*
	 * Unregister properties.
	 */
	zfs_unregister_callbacks(zfsvfs);

	/*
	 * Evict cached data
	 */
	if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
	    !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
	dmu_objset_evict_dbufs(zfsvfs->z_os);

	return (0);
}

/*
 * VFS unmount entry point.
 *
 * Checks unmount privilege (falling back to the delegated "mount"
 * permission on the dataset), purges the DNLC, unmounts any snapshots
 * mounted under the control directory and, unless MS_FORCE is given,
 * returns EBUSY while vnodes other than the cached root and control
 * directory are still referenced.  It then tears the zfsvfs down,
 * disowns the objset and destroys the control directory node.
 */
/*ARGSUSED*/
static int
zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	objset_t *os;
	int ret;

	ret = secpolicy_fs_unmount(cr, vfsp);
	if (ret) {
		/* No privilege: allow only with delegated permission. */
		if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
		    ZFS_DELEG_PERM_MOUNT, cr))
			return (ret);
	}

	/*
	 * We purge the parent filesystem's vfsp as the parent filesystem
	 * and all of its snapshots have their vnode's v_vfsp set to the
	 * parent's filesystem's vfsp.  Note, 'z_parent' is self
	 * referential for non-snapshots.
	 */
	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);

	/*
	 * Unmount any snapshots mounted under .zfs before unmounting the
	 * dataset itself.
	 */
	if (zfsvfs->z_ctldir != NULL &&
	    (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
		return (ret);
	}

	if (!(fflag & MS_FORCE)) {
		/*
		 * Check the number of active vnodes in the file system.
		 * Our count is maintained in the vfs structure, but the
		 * number is off by 1 to indicate a hold on the vfs
		 * structure itself.
		 */
		boolean_t draining;
		uint_t thresh = 1;
		vnode_t *ctlvp, *rvp;

		/*
		 * The cached vnode for the root directory of the mount also
		 * maintains a hold on the vfs structure.
		 */
		rvp = zfsvfs->z_rootdir;
		thresh++;

		/*
		 * The '.zfs' directory maintains a reference of its own, and
		 * any active references underneath are reflected in the vnode
		 * count. Allow one additional reference for it.
		 */
		ctlvp = zfsvfs->z_ctldir;
		if (ctlvp != NULL) {
			thresh++;
		}

		/*
		 * If it's running, the asynchronous unlinked drain task needs
		 * to be stopped before the number of active vnodes can be
		 * reliably checked.
		 */
		draining = zfsvfs->z_draining;
		if (draining)
			zfs_unlinked_drain_stop_wait(zfsvfs);

		if (vfsp->vfs_count > thresh || rvp->v_count > 1 ||
		    (ctlvp != NULL && ctlvp->v_count > 1)) {
			if (draining) {
				/* If it was draining, restart the task */
				zfs_unlinked_drain(zfsvfs);
			}
			return (SET_ERROR(EBUSY));
		}
	}

	vfsp->vfs_flag |= VFS_UNMOUNTED;

	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
	os = zfsvfs->z_os;

	/*
	 * z_os will be NULL if there was an error in
	 * attempting to reopen zfsvfs.
	 */
	if (os != NULL) {
		/*
		 * Unset the objset user_ptr.
		 */
		mutex_enter(&os->os_user_ptr_lock);
		dmu_objset_set_user(os, NULL);
		mutex_exit(&os->os_user_ptr_lock);

		/*
		 * Finally release the objset
		 */
		dmu_objset_disown(os, B_TRUE, zfsvfs);
	}

	/*
	 * We can now safely destroy the '.zfs' directory node.
	 */
	if (zfsvfs->z_ctldir != NULL)
		zfsctl_destroy(zfsvfs);

	return (0);
}

static int
zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
{
	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
	znode_t		*zp;
	uint64_t	object = 0;
	uint64_t	fid_gen = 0;
	uint64_t	gen_mask;
	uint64_t	zp_gen;
	int		i, err;

	*vpp = NULL;

	ZFS_ENTER(zfsvfs);

	if (fidp->fid_len == LONG_FID_LEN) {
		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
		uint64_t	objsetid = 0;
		uint64_t	setgen = 0;

		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);

		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);

		ZFS_EXIT(zfsvfs);

		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
		if (err)
			return (SET_ERROR(EINVAL));
		ZFS_ENTER(zfsvfs);
	}

	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
		zfid_short_t	*zfid = (zfid_short_t *)fidp;

		for (i = 0; i < sizeof (zfid->zf_object); i++)
			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);

		for (i = 0; i < sizeof (zfid->zf_gen); i++)
			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
	} else {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/* A zero fid_gen means we are in the .zfs control directories */
	if (fid_gen == 0 &&
	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
		*vpp = zfsvfs->z_ctldir;
		ASSERT(*vpp != NULL);
		if (object == ZFSCTL_INO_SNAPDIR) {
			VERIFY(zfsctl_root_lookup(*vpp,
"snapshot", vpp, NULL, 2402 0, NULL, NULL, NULL, NULL, NULL) == 0); 2403 } else { 2404 VN_HOLD(*vpp); 2405 } 2406 ZFS_EXIT(zfsvfs); 2407 return (0); 2408 } 2409 2410 gen_mask = -1ULL >> (64 - 8 * i); 2411 2412 dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); 2413 if (err = zfs_zget(zfsvfs, object, &zp)) { 2414 ZFS_EXIT(zfsvfs); 2415 return (err); 2416 } 2417 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, 2418 sizeof (uint64_t)); 2419 zp_gen = zp_gen & gen_mask; 2420 if (zp_gen == 0) 2421 zp_gen = 1; 2422 if (zp->z_unlinked || zp_gen != fid_gen) { 2423 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); 2424 VN_RELE(ZTOV(zp)); 2425 ZFS_EXIT(zfsvfs); 2426 return (SET_ERROR(EINVAL)); 2427 } 2428 2429 *vpp = ZTOV(zp); 2430 ZFS_EXIT(zfsvfs); 2431 return (0); 2432 } 2433 2434 /* 2435 * Block out VOPs and close zfsvfs_t::z_os 2436 * 2437 * Note, if successful, then we return with the 'z_teardown_lock' and 2438 * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying 2439 * dataset and objset intact so that they can be atomically handed off during 2440 * a subsequent rollback or recv operation and the resume thereafter. 2441 */ 2442 int 2443 zfs_suspend_fs(zfsvfs_t *zfsvfs) 2444 { 2445 int error; 2446 2447 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) 2448 return (error); 2449 2450 return (0); 2451 } 2452 2453 /* 2454 * Rebuild SA and release VOPs. Note that ownership of the underlying dataset 2455 * is an invariant across any of the operations that can be performed while the 2456 * filesystem was suspended. Whether it succeeded or failed, the preconditions 2457 * are the same: the relevant objset and associated dataset are owned by 2458 * zfsvfs, held, and long held on entry. 
2459 */ 2460 int 2461 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) 2462 { 2463 int err; 2464 znode_t *zp; 2465 2466 ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); 2467 ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); 2468 2469 /* 2470 * We already own this, so just update the objset_t, as the one we 2471 * had before may have been evicted. 2472 */ 2473 objset_t *os; 2474 VERIFY3P(ds->ds_owner, ==, zfsvfs); 2475 VERIFY(dsl_dataset_long_held(ds)); 2476 VERIFY0(dmu_objset_from_ds(ds, &os)); 2477 2478 err = zfsvfs_init(zfsvfs, os); 2479 if (err != 0) 2480 goto bail; 2481 2482 VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); 2483 2484 zfs_set_fuid_feature(zfsvfs); 2485 2486 /* 2487 * Attempt to re-establish all the active znodes with 2488 * their dbufs. If a zfs_rezget() fails, then we'll let 2489 * any potential callers discover that via ZFS_ENTER_VERIFY_VP 2490 * when they try to use their znode. 2491 */ 2492 mutex_enter(&zfsvfs->z_znodes_lock); 2493 for (zp = list_head(&zfsvfs->z_all_znodes); zp; 2494 zp = list_next(&zfsvfs->z_all_znodes, zp)) { 2495 (void) zfs_rezget(zp); 2496 } 2497 mutex_exit(&zfsvfs->z_znodes_lock); 2498 2499 if (((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) && 2500 !zfsvfs->z_unmounted) { 2501 /* 2502 * zfs_suspend_fs() could have interrupted freeing 2503 * of dnodes. We need to restart this freeing so 2504 * that we don't "leak" the space. 2505 */ 2506 zfs_unlinked_drain(zfsvfs); 2507 } 2508 2509 bail: 2510 /* release the VOPs */ 2511 rw_exit(&zfsvfs->z_teardown_inactive_lock); 2512 rrm_exit(&zfsvfs->z_teardown_lock, FTAG); 2513 2514 if (err) { 2515 /* 2516 * Since we couldn't setup the sa framework, try to force 2517 * unmount this file system. 
2518 */ 2519 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) 2520 (void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED()); 2521 } 2522 return (err); 2523 } 2524 2525 static void 2526 zfs_freevfs(vfs_t *vfsp) 2527 { 2528 zfsvfs_t *zfsvfs = vfsp->vfs_data; 2529 2530 /* 2531 * If this is a snapshot, we have an extra VFS_HOLD on our parent 2532 * from zfs_mount(). Release it here. If we came through 2533 * zfs_mountroot() instead, we didn't grab an extra hold, so 2534 * skip the VFS_RELE for rootvfs. 2535 */ 2536 if (zfsvfs->z_issnap && (vfsp != rootvfs)) 2537 VFS_RELE(zfsvfs->z_parent->z_vfs); 2538 2539 zfsvfs_free(zfsvfs); 2540 2541 atomic_dec_32(&zfs_active_fs_count); 2542 } 2543 2544 /* 2545 * VFS_INIT() initialization. Note that there is no VFS_FINI(), 2546 * so we can't safely do any non-idempotent initialization here. 2547 * Leave that to zfs_init() and zfs_fini(), which are called 2548 * from the module's _init() and _fini() entry points. 2549 */ 2550 /*ARGSUSED*/ 2551 static int 2552 zfs_vfsinit(int fstype, char *name) 2553 { 2554 int error; 2555 2556 zfsfstype = fstype; 2557 2558 /* 2559 * Setup vfsops and vnodeops tables. 2560 */ 2561 error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops); 2562 if (error != 0) { 2563 cmn_err(CE_WARN, "zfs: bad vfs ops template"); 2564 } 2565 2566 error = zfs_create_op_tables(); 2567 if (error) { 2568 zfs_remove_op_tables(); 2569 cmn_err(CE_WARN, "zfs: bad vnode ops template"); 2570 (void) vfs_freevfsops_by_type(zfsfstype); 2571 return (error); 2572 } 2573 2574 mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 2575 2576 /* 2577 * Unique major number for all zfs mounts. 2578 * If we run out of 32-bit minors, we'll getudev() another major. 
2579 */ 2580 zfs_major = ddi_name_to_major(ZFS_DRIVER); 2581 zfs_minor = ZFS_MIN_MINOR; 2582 2583 return (0); 2584 } 2585 2586 void 2587 zfs_init(void) 2588 { 2589 /* 2590 * Initialize .zfs directory structures 2591 */ 2592 zfsctl_init(); 2593 2594 /* 2595 * Initialize znode cache, vnode ops, etc... 2596 */ 2597 zfs_znode_init(); 2598 2599 dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); 2600 } 2601 2602 void 2603 zfs_fini(void) 2604 { 2605 zfsctl_fini(); 2606 zfs_znode_fini(); 2607 } 2608 2609 int 2610 zfs_busy(void) 2611 { 2612 return (zfs_active_fs_count != 0); 2613 } 2614 2615 int 2616 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) 2617 { 2618 int error; 2619 objset_t *os = zfsvfs->z_os; 2620 dmu_tx_t *tx; 2621 2622 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) 2623 return (SET_ERROR(EINVAL)); 2624 2625 if (newvers < zfsvfs->z_version) 2626 return (SET_ERROR(EINVAL)); 2627 2628 if (zfs_spa_version_map(newvers) > 2629 spa_version(dmu_objset_spa(zfsvfs->z_os))) 2630 return (SET_ERROR(ENOTSUP)); 2631 2632 tx = dmu_tx_create(os); 2633 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); 2634 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { 2635 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, 2636 ZFS_SA_ATTRS); 2637 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 2638 } 2639 error = dmu_tx_assign(tx, TXG_WAIT); 2640 if (error) { 2641 dmu_tx_abort(tx); 2642 return (error); 2643 } 2644 2645 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 2646 8, 1, &newvers, tx); 2647 2648 if (error) { 2649 dmu_tx_commit(tx); 2650 return (error); 2651 } 2652 2653 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { 2654 uint64_t sa_obj; 2655 2656 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, 2657 SPA_VERSION_SA); 2658 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, 2659 DMU_OT_NONE, 0, tx); 2660 2661 error = zap_add(os, MASTER_NODE_OBJ, 2662 ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); 2663 ASSERT0(error); 2664 2665 VERIFY(0 == 
sa_set_sa_object(os, sa_obj)); 2666 sa_register_update_callback(os, zfs_sa_upgrade); 2667 } 2668 2669 spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, 2670 "from %llu to %llu", zfsvfs->z_version, newvers); 2671 2672 dmu_tx_commit(tx); 2673 2674 zfsvfs->z_version = newvers; 2675 os->os_version = newvers; 2676 2677 zfs_set_fuid_feature(zfsvfs); 2678 2679 return (0); 2680 } 2681 2682 /* 2683 * Read a property stored within the master node. 2684 */ 2685 int 2686 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) 2687 { 2688 uint64_t *cached_copy = NULL; 2689 2690 /* 2691 * Figure out where in the objset_t the cached copy would live, if it 2692 * is available for the requested property. 2693 */ 2694 if (os != NULL) { 2695 switch (prop) { 2696 case ZFS_PROP_VERSION: 2697 cached_copy = &os->os_version; 2698 break; 2699 case ZFS_PROP_NORMALIZE: 2700 cached_copy = &os->os_normalization; 2701 break; 2702 case ZFS_PROP_UTF8ONLY: 2703 cached_copy = &os->os_utf8only; 2704 break; 2705 case ZFS_PROP_CASE: 2706 cached_copy = &os->os_casesensitivity; 2707 break; 2708 default: 2709 break; 2710 } 2711 } 2712 if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { 2713 *value = *cached_copy; 2714 return (0); 2715 } 2716 2717 /* 2718 * If the property wasn't cached, look up the file system's value for 2719 * the property. For the version property, we look up a slightly 2720 * different string. 
2721 */ 2722 const char *pname; 2723 int error = ENOENT; 2724 if (prop == ZFS_PROP_VERSION) { 2725 pname = ZPL_VERSION_STR; 2726 } else { 2727 pname = zfs_prop_to_name(prop); 2728 } 2729 2730 if (os != NULL) { 2731 ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); 2732 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); 2733 } 2734 2735 if (error == ENOENT) { 2736 /* No value set, use the default value */ 2737 switch (prop) { 2738 case ZFS_PROP_VERSION: 2739 *value = ZPL_VERSION; 2740 break; 2741 case ZFS_PROP_NORMALIZE: 2742 case ZFS_PROP_UTF8ONLY: 2743 *value = 0; 2744 break; 2745 case ZFS_PROP_CASE: 2746 *value = ZFS_CASE_SENSITIVE; 2747 break; 2748 default: 2749 return (error); 2750 } 2751 error = 0; 2752 } 2753 2754 /* 2755 * If one of the methods for getting the property value above worked, 2756 * copy it into the objset_t's cache. 2757 */ 2758 if (error == 0 && cached_copy != NULL) { 2759 *cached_copy = *value; 2760 } 2761 2762 return (error); 2763 } 2764 2765 /* 2766 * Return true if the coresponding vfs's unmounted flag is set. 2767 * Otherwise return false. 2768 * If this function returns true we know VFS unmount has been initiated. 2769 */ 2770 boolean_t 2771 zfs_get_vfs_flag_unmounted(objset_t *os) 2772 { 2773 zfsvfs_t *zfvp; 2774 boolean_t unmounted = B_FALSE; 2775 2776 ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); 2777 2778 mutex_enter(&os->os_user_ptr_lock); 2779 zfvp = dmu_objset_get_user(os); 2780 if (zfvp != NULL && zfvp->z_vfs != NULL && 2781 (zfvp->z_vfs->vfs_flag & VFS_UNMOUNTED)) 2782 unmounted = B_TRUE; 2783 mutex_exit(&os->os_user_ptr_lock); 2784 2785 return (unmounted); 2786 } 2787 2788 static vfsdef_t vfw = { 2789 VFSDEF_VERSION, 2790 MNTTYPE_ZFS, 2791 zfs_vfsinit, 2792 VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS| 2793 VSW_XID|VSW_ZMOUNT, 2794 &zfs_mntopts 2795 }; 2796 2797 struct modlfs zfs_modlfs = { 2798 &mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw 2799 }; 2800