/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/mntent.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/stat.h>
#include <sys/mode.h>
#include <sys/atomic.h>
#include <vm/pvn.h>
#include "fs/fs_subr.h"
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_znode.h>
#include <sys/zap.h>
#include <sys/dmu.h>
#include <sys/fs/zfs.h>

/*
 * kmem cache for znode_t structures; created in zfs_znode_init() and
 * destroyed in zfs_znode_fini().
 */
struct kmem_cache *znode_cache = NULL;

/*
 * Callback registered on a znode's bonus dbuf via dmu_buf_set_user()
 * (see zfs_znode_dmu_init()).  Invoked by the DMU when the buffer's
 * user data is being released.  If nobody holds the vnode, tear the
 * znode down immediately; otherwise just clear z_dbuf so that a forced
 * unmount knows the znode can be freed once its holds drain.
 */
/*ARGSUSED*/
static void
znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
{
        znode_t *zp = user_ptr;
        vnode_t *vp = ZTOV(zp);

        mutex_enter(&zp->z_lock);
        if (vp->v_count == 0) {
                mutex_exit(&zp->z_lock);
                vn_invalid(vp);
                zfs_znode_free(zp);
        } else {
                /* signal force unmount that this znode can be freed */
                zp->z_dbuf = NULL;
                mutex_exit(&zp->z_lock);
        }
}

/*
 * kmem cache constructor: allocate the embedded vnode and initialize
 * the locks of a znode_t coming out of znode_cache.  The matching
 * state is torn down in zfs_znode_cache_destructor().
 */
/*ARGSUSED*/
static int
zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
        znode_t *zp = buf;

        zp->z_vnode = vn_alloc(KM_SLEEP);
        zp->z_vnode->v_data = (caddr_t)zp;
        mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
        rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
        rw_init(&zp->z_grow_lock, NULL, RW_DEFAULT, NULL);
        rw_init(&zp->z_append_lock, NULL, RW_DEFAULT, NULL);
        mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
        zp->z_dbuf_held = 0;
        zp->z_dirlocks = 0;
        return (0);
}

/*
 * kmem cache destructor: release the vnode and destroy the locks set
 * up by the constructor.  The asserts verify the znode went fully
 * inactive before being returned to the cache.
 */
/*ARGSUSED*/
static void
zfs_znode_cache_destructor(void *buf, void *cdarg)
{
        znode_t *zp = buf;

        ASSERT(zp->z_dirlocks == 0);
        mutex_destroy(&zp->z_lock);
        rw_destroy(&zp->z_map_lock);
        rw_destroy(&zp->z_grow_lock);
        rw_destroy(&zp->z_append_lock);
        mutex_destroy(&zp->z_acl_lock);

        ASSERT(zp->z_dbuf_held == 0);
        ASSERT(ZTOV(zp)->v_count == 0);
        vn_free(ZTOV(zp));
}

/*
 * Module initialization: create the znode kmem cache.  Must be called
 * exactly once (asserted) before any znodes are allocated.
 */
void
zfs_znode_init(void)
{
        /*
         * Initialize zcache
         */
        ASSERT(znode_cache == NULL);
        znode_cache = kmem_cache_create("zfs_znode_cache",
            sizeof (znode_t), 0, zfs_znode_cache_constructor,
            zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
}

/*
 * Module teardown: remove the vfs/vnode operation tables and destroy
 * the znode kmem cache (inverse of zfs_znode_init() plus
 * zfs_create_op_tables()).
 */
void
zfs_znode_fini(void)
{
        /*
         * Cleanup vfs & vnode ops
         */
        zfs_remove_op_tables();

        /*
         * Cleanup zcache
         */
        if (znode_cache)
                kmem_cache_destroy(znode_cache);
        znode_cache = NULL;
}

/*
 * Per-vnode-type operation vectors, populated by zfs_create_op_tables():
 * directories, regular files, symlinks, extended-attribute directories,
 * and a catch-all "error" vector for the remaining types.
 */
struct vnodeops *zfs_dvnodeops;
struct vnodeops *zfs_fvnodeops;
struct vnodeops *zfs_symvnodeops;
struct vnodeops *zfs_xdvnodeops;
struct vnodeops *zfs_evnodeops;

/*
 * Free the vfs ops and every vnode ops vector that was created, then
 * NULL the globals so a later zfs_create_op_tables() starts clean.
 */
void
zfs_remove_op_tables()
{
        /*
         * Remove vfs ops
         */
        ASSERT(zfsfstype);
        (void) vfs_freevfsops_by_type(zfsfstype);
        zfsfstype = 0;

        /*
         * Remove vnode ops
         */
        if (zfs_dvnodeops)
                vn_freevnodeops(zfs_dvnodeops);
        if (zfs_fvnodeops)
                vn_freevnodeops(zfs_fvnodeops);
        if (zfs_symvnodeops)
                vn_freevnodeops(zfs_symvnodeops);
        if (zfs_xdvnodeops)
                vn_freevnodeops(zfs_xdvnodeops);
        if (zfs_evnodeops)
                vn_freevnodeops(zfs_evnodeops);

        zfs_dvnodeops = NULL;
        zfs_fvnodeops = NULL;
        zfs_symvnodeops = NULL;
        zfs_xdvnodeops = NULL;
        zfs_evnodeops = NULL;
}

/* Op templates are defined in the zfs vnops source (not in this file). */
extern const fs_operation_def_t zfs_dvnodeops_template[];
extern const fs_operation_def_t zfs_fvnodeops_template[];
extern const fs_operation_def_t zfs_xdvnodeops_template[];
extern const fs_operation_def_t zfs_symvnodeops_template[];
extern const fs_operation_def_t zfs_evnodeops_template[];

/*
 * Build the per-type vnode operation vectors from their templates.
 * Returns 0 on success or the first vn_make_ops() error; on partial
 * failure the vectors already built are left for zfs_remove_op_tables()
 * to clean up.
 */
int
zfs_create_op_tables()
{
        int error;

        /*
         * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
         * due to a failure to remove the 2nd modlinkage (zfs_modldrv).
         * In this case we just return as the ops vectors are already set up.
         */
        if (zfs_dvnodeops)
                return (0);

        error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
            &zfs_dvnodeops);
        if (error)
                return (error);

        error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
            &zfs_fvnodeops);
        if (error)
                return (error);

        error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
            &zfs_symvnodeops);
        if (error)
                return (error);

        error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
            &zfs_xdvnodeops);
        if (error)
                return (error);

        error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
            &zfs_evnodeops);

        return (error);
}

/*
 * zfs_init_fs - Initialize the zfsvfs struct and the file system
 * incore "master" object. Verify version compatibility.
225 */ 226 int 227 zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) 228 { 229 extern int zfsfstype; 230 231 objset_t *os = zfsvfs->z_os; 232 uint64_t zoid; 233 uint64_t version = ZFS_VERSION; 234 int i, error; 235 dmu_object_info_t doi; 236 dmu_objset_stats_t *stats; 237 238 *zpp = NULL; 239 240 /* 241 * XXX - hack to auto-create the pool root filesystem at 242 * the first attempted mount. 243 */ 244 if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) { 245 dmu_tx_t *tx = dmu_tx_create(os); 246 247 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */ 248 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */ 249 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */ 250 error = dmu_tx_assign(tx, TXG_WAIT); 251 ASSERT3U(error, ==, 0); 252 zfs_create_fs(os, cr, tx); 253 dmu_tx_commit(tx); 254 } 255 256 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_VERSION_OBJ, 8, 1, 257 &version); 258 if (error) { 259 return (error); 260 } else if (version != ZFS_VERSION) { 261 (void) printf("Mismatched versions: File system " 262 "is version %lld on-disk format, which is " 263 "incompatible with this software version %lld!", 264 (u_longlong_t)version, ZFS_VERSION); 265 return (ENOTSUP); 266 } 267 268 /* 269 * The fsid is 64 bits, composed of an 8-bit fs type, which 270 * separates our fsid from any other filesystem types, and a 271 * 56-bit objset unique ID. The objset unique ID is unique to 272 * all objsets open on this system, provided by unique_create(). 273 * The 8-bit fs type must be put in the low bits of fsid[1] 274 * because that's where other Solaris filesystems put it. 
275 */ 276 stats = kmem_alloc(sizeof (dmu_objset_stats_t), KM_SLEEP); 277 dmu_objset_stats(os, stats); 278 ASSERT((stats->dds_fsid_guid & ~((1ULL<<56)-1)) == 0); 279 zfsvfs->z_vfs->vfs_fsid.val[0] = stats->dds_fsid_guid; 280 zfsvfs->z_vfs->vfs_fsid.val[1] = ((stats->dds_fsid_guid>>32) << 8) | 281 zfsfstype & 0xFF; 282 kmem_free(stats, sizeof (dmu_objset_stats_t)); 283 stats = NULL; 284 285 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zoid); 286 if (error) 287 return (error); 288 ASSERT(zoid != 0); 289 zfsvfs->z_root = zoid; 290 291 /* 292 * Create the per mount vop tables. 293 */ 294 295 /* 296 * Initialize zget mutex's 297 */ 298 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 299 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 300 301 error = zfs_zget(zfsvfs, zoid, zpp); 302 if (error) 303 return (error); 304 ASSERT3U((*zpp)->z_id, ==, zoid); 305 306 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_DELETE_QUEUE, 8, 1, &zoid); 307 if (error) 308 return (error); 309 310 zfsvfs->z_dqueue = zoid; 311 312 /* 313 * Initialize delete head structure 314 * Thread(s) will be started/stopped via 315 * readonly_changed_cb() depending 316 * on whether this is rw/ro mount. 317 */ 318 list_create(&zfsvfs->z_delete_head.z_znodes, 319 sizeof (znode_t), offsetof(znode_t, z_list_node)); 320 321 return (0); 322 } 323 324 /* 325 * Construct a new znode/vnode and intialize. 
 * This does not do a call to dmu_set_user() that is
 * up to the caller to do, in case you don't want to
 * return the znode
 */
static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
{
        znode_t *zp;
        vnode_t *vp;

        zp = kmem_cache_alloc(znode_cache, KM_SLEEP);

        ASSERT(zp->z_dirlocks == NULL);

        /* Point the in-core znode at the bonus buffer's znode_phys_t. */
        zp->z_phys = db->db_data;
        zp->z_zfsvfs = zfsvfs;
        zp->z_active = 1;
        zp->z_reap = 0;
        zp->z_atime_dirty = 0;
        zp->z_dbuf_held = 0;
        zp->z_mapcnt = 0;
        zp->z_last_itx = 0;
        zp->z_dbuf = db;
        zp->z_id = obj_num;
        zp->z_blksz = blksz;
        zp->z_seq = 0x7A4653;   /* arbitrary starting sequence ("zFS") */

        /* Link onto the per-filesystem list of all znodes. */
        mutex_enter(&zfsvfs->z_znodes_lock);
        list_insert_tail(&zfsvfs->z_all_znodes, zp);
        mutex_exit(&zfsvfs->z_znodes_lock);

        vp = ZTOV(zp);
        vn_reinit(vp);

        vp->v_vfsp = zfsvfs->z_parent->z_vfs;
        vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);

        /* Select the vnode ops vector appropriate to the vnode type. */
        switch (vp->v_type) {
        case VDIR:
                if (zp->z_phys->zp_flags & ZFS_XATTR) {
                        vn_setops(vp, zfs_xdvnodeops);
                        vp->v_flag |= V_XATTRDIR;
                } else
                        vn_setops(vp, zfs_dvnodeops);
                zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
                break;
        case VBLK:
        case VCHR:
                vp->v_rdev = (dev_t)zp->z_phys->zp_rdev;
                /*FALLTHROUGH*/
        case VFIFO:
        case VSOCK:
        case VDOOR:
                vn_setops(vp, zfs_fvnodeops);
                break;
        case VREG:
                vp->v_flag |= VMODSORT;
                vn_setops(vp, zfs_fvnodeops);
                break;
        case VLNK:
                vn_setops(vp, zfs_symvnodeops);
                break;
        default:
                vn_setops(vp, zfs_evnodeops);
                break;
        }

        return (zp);
}

/*
 * Attach a freshly constructed znode to the DMU: register it as the
 * bonus buffer's user data (with znode_pageout_func as the eviction
 * callback), flag the root vnode, and take the dbuf/VFS holds that are
 * released by zfs_zinactive()/zfs_znode_delete().
 * Caller must hold the ZFS_OBJ_MUTEX for this object.
 */
static void
zfs_znode_dmu_init(znode_t *zp)
{
        znode_t *nzp;
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        dmu_buf_t *db = zp->z_dbuf;

        mutex_enter(&zp->z_lock);

        nzp = dmu_buf_set_user(db, zp, &zp->z_phys, znode_pageout_func);

        /*
         * there should be no
         * concurrent zgets on this object.
         */
        ASSERT3P(nzp, ==, NULL);

        /*
         * Slap on VROOT if we are the root znode
         */
        if (zp->z_id == zfsvfs->z_root) {
                ZTOV(zp)->v_flag |= VROOT;
        }

        ASSERT(zp->z_dbuf_held == 0);
        zp->z_dbuf_held = 1;
        VFS_HOLD(zfsvfs->z_vfs);
        mutex_exit(&zp->z_lock);
        vn_exists(ZTOV(zp));
}

/*
 * Create a new DMU object to hold a zfs znode.
 *
 *      IN:     dzp     - parent directory for new znode
 *              vap     - file attributes for new znode
 *              tx      - dmu transaction id for zap operations
 *              cr      - credentials of caller
 *              flag    - flags:
 *                        IS_ROOT_NODE  - new object will be root
 *                        IS_XATTR      - new object is an attribute
 *                        IS_REPLAY     - intent log replay
 *
 *      OUT:    oid     - ID of created object
 *              zpp     - allocated znode (or NULL if the caller passed
 *                        a NULL zpp, in which case the znode is freed
 *                        again before return)
 */
void
zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
        uint_t flag, znode_t **zpp, int bonuslen)
{
        dmu_buf_t *dbp;
        znode_phys_t *pzp;
        znode_t *zp;
        zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
        timestruc_t now;
        uint64_t gen;
        int err;

        ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

        if (zfsvfs->z_assign >= TXG_INITIAL) {  /* ZIL replay */
                /* Replay must recreate the object with its original id. */
                *oid = vap->va_nodeid;
                flag |= IS_REPLAY;
                now = vap->va_ctime;            /* see zfs_replay_create() */
                gen = vap->va_nblocks;          /* ditto */
        } else {
                *oid = 0;
                gethrestime(&now);
                gen = dmu_tx_get_txg(tx);
        }

        /*
         * Create a new DMU object.
         */
        /*
         * There's currently no mechanism for pre-reading the blocks that will
         * be needed to allocate a new object, so we accept the small chance
         * that there will be an i/o error and we will fail one of the
         * assertions below.
         */
        if (vap->va_type == VDIR) {
                if (flag & IS_REPLAY) {
                        err = zap_create_claim(zfsvfs->z_os, *oid,
                            DMU_OT_DIRECTORY_CONTENTS,
                            DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
                        ASSERT3U(err, ==, 0);
                } else {
                        *oid = zap_create(zfsvfs->z_os,
                            DMU_OT_DIRECTORY_CONTENTS,
                            DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
                }
        } else {
                if (flag & IS_REPLAY) {
                        err = dmu_object_claim(zfsvfs->z_os, *oid,
                            DMU_OT_PLAIN_FILE_CONTENTS, 0,
                            DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
                        ASSERT3U(err, ==, 0);
                } else {
                        *oid = dmu_object_alloc(zfsvfs->z_os,
                            DMU_OT_PLAIN_FILE_CONTENTS, 0,
                            DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
                }
        }
        VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp));
        dmu_buf_will_dirty(dbp, tx);

        /*
         * Initialize the znode physical data to zero.
         */
        ASSERT(dbp->db_size >= sizeof (znode_phys_t));
        bzero(dbp->db_data, dbp->db_size);
        pzp = dbp->db_data;

        /*
         * If this is the root, fix up the half-initialized parent pointer
         * to reference the just-allocated physical data area.
         */
        if (flag & IS_ROOT_NODE) {
                dzp->z_phys = pzp;
                dzp->z_id = *oid;
        }

        /*
         * If parent is an xattr, so am I.
         */
        if (dzp->z_phys->zp_flags & ZFS_XATTR)
                flag |= IS_XATTR;

        if (vap->va_type == VBLK || vap->va_type == VCHR) {
                pzp->zp_rdev = vap->va_rdev;
        }

        if (vap->va_type == VDIR) {
                pzp->zp_size = 2;               /* contents ("." and "..") */
                /* root and xattr dirs have no name entry in a parent */
                pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
        }

        pzp->zp_parent = dzp->z_id;
        if (flag & IS_XATTR)
                pzp->zp_flags |= ZFS_XATTR;

        pzp->zp_gen = gen;

        ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
        ZFS_TIME_ENCODE(&now, pzp->zp_ctime);

        if (vap->va_mask & AT_ATIME) {
                ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
        } else {
                ZFS_TIME_ENCODE(&now, pzp->zp_atime);
        }

        if (vap->va_mask & AT_MTIME) {
                ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
        } else {
                ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
        }

        pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
        zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0);

        zfs_perm_init(zp, dzp, flag, vap, tx, cr);

        if (zpp) {
                kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp);

                mutex_enter(hash_mtx);
                zfs_znode_dmu_init(zp);
                mutex_exit(hash_mtx);

                *zpp = zp;
        } else {
                /* Caller doesn't want the znode back; undo the allocation. */
                ZTOV(zp)->v_count = 0;
                dmu_buf_rele(dbp, NULL);
                zfs_znode_free(zp);
        }
}

/*
 * zfs_zget - look up (or construct) the in-core znode for object
 * obj_num and return it held.
 *
 *      IN:     zfsvfs  - filesystem owning the object
 *              obj_num - object number to look up
 *
 *      OUT:    zpp     - held znode, or NULL on error
 *
 *      RETURN: 0 on success; ENOENT if the znode is marked for reaping;
 *              EINVAL if the object's bonus is not a znode; or the
 *              error from dmu_bonus_hold().
 */
int
zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
{
        dmu_object_info_t doi;
        dmu_buf_t *db;
        znode_t *zp;
        int err;

        *zpp = NULL;

        /* Serialize against concurrent zgets/zinactives on this object. */
        ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

        err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
        if (err) {
                ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
                return (err);
        }

        /* Sanity-check that the object really contains a znode. */
        dmu_object_info_from_db(db, &doi);
        if (doi.doi_bonus_type != DMU_OT_ZNODE ||
            doi.doi_bonus_size < sizeof (znode_phys_t)) {
                dmu_buf_rele(db, NULL);
                ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
                return (EINVAL);
        }

        ASSERT(db->db_object == obj_num);
        ASSERT(db->db_offset == -1);
        ASSERT(db->db_data != NULL);

        zp = dmu_buf_get_user(db);

        if (zp != NULL) {
                /* Znode already in core; reuse it. */
                mutex_enter(&zp->z_lock);

                ASSERT3U(zp->z_id, ==, obj_num);
                if (zp->z_reap) {
                        /* Being deleted; don't hand it out. */
                        dmu_buf_rele(db, NULL);
                        mutex_exit(&zp->z_lock);
                        ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
                        return (ENOENT);
                } else if (zp->z_dbuf_held) {
                        /* Znode already owns a dbuf hold; drop ours. */
                        dmu_buf_rele(db, NULL);
                } else {
                        /* Transfer our dbuf hold to the znode. */
                        zp->z_dbuf_held = 1;
                        VFS_HOLD(zfsvfs->z_vfs);
                }

                if (zp->z_active == 0)
                        zp->z_active = 1;

                VN_HOLD(ZTOV(zp));
                mutex_exit(&zp->z_lock);
                ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
                *zpp = zp;
                return (0);
        }

        /*
         * Not found create new znode/vnode
         */
        zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size);
        ASSERT3U(zp->z_id, ==, obj_num);
        zfs_znode_dmu_init(zp);
        ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
        *zpp = zp;
        return (0);
}

/*
 * Free the DMU object backing a znode (and its external ACL object, if
 * any) within transaction tx, and drop the znode's dbuf hold.
 */
void
zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
{
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        int error;

        ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
        if (zp->z_phys->zp_acl.z_acl_extern_obj) {
                error = dmu_object_free(zfsvfs->z_os,
                    zp->z_phys->zp_acl.z_acl_extern_obj, tx);
                ASSERT3U(error, ==, 0);
        }
        error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx);
        ASSERT3U(error, ==, 0);
        zp->z_dbuf_held = 0;
        ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
        dmu_buf_rele(zp->z_dbuf, NULL);
}

/*
 * Called on the last vnode reference drop: either requeue the znode
 * for deletion (if it was unlinked), or release its dbuf hold so the
 * pageout callback can eventually free it.
 */
void
zfs_zinactive(znode_t *zp)
{
        vnode_t *vp = ZTOV(zp);
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        uint64_t z_id = zp->z_id;

        ASSERT(zp->z_dbuf_held && zp->z_phys);

        /*
         * Don't allow a zfs_zget() while we're trying to release this znode
         */
        ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);

        mutex_enter(&zp->z_lock);
        mutex_enter(&vp->v_lock);
        vp->v_count--;
        if (vp->v_count > 0 || vn_has_cached_data(vp)) {
                /*
                 * If the hold count is greater than zero, somebody has
                 * obtained a new reference on this znode while we were
                 * processing it here, so we are done. If we still have
                 * mapped pages then we are also done, since we don't
                 * want to inactivate the znode until the pages get pushed.
                 *
                 * XXX - if vn_has_cached_data(vp) is true, but count == 0,
                 * this seems like it would leave the znode hanging with
                 * no chance to go inactive...
                 */
                mutex_exit(&vp->v_lock);
                mutex_exit(&zp->z_lock);
                ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
                return;
        }
        mutex_exit(&vp->v_lock);
        zp->z_active = 0;

        /*
         * If this was the last reference to a file with no links,
         * remove the file from the file system.
         */
        if (zp->z_reap) {
                mutex_exit(&zp->z_lock);
                ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
                /* XATTR files are not put on the delete queue */
                if (zp->z_phys->zp_flags & ZFS_XATTR) {
                        zfs_rmnode(zp);
                } else {
                        /* Hand off to the delete thread(s). */
                        mutex_enter(&zfsvfs->z_delete_head.z_mutex);
                        list_insert_tail(&zfsvfs->z_delete_head.z_znodes, zp);
                        zfsvfs->z_delete_head.z_znode_count++;
                        cv_broadcast(&zfsvfs->z_delete_head.z_cv);
                        mutex_exit(&zfsvfs->z_delete_head.z_mutex);
                }
                VFS_RELE(zfsvfs->z_vfs);
                return;
        }
        ASSERT(zp->z_phys);
        ASSERT(zp->z_dbuf_held);

        /* Drop the dbuf hold; znode_pageout_func() may now free us. */
        zp->z_dbuf_held = 0;
        mutex_exit(&zp->z_lock);
        dmu_buf_rele(zp->z_dbuf, NULL);
        ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
        VFS_RELE(zfsvfs->z_vfs);
}

/*
 * Unlink the znode from the per-filesystem list and return it to the
 * kmem cache.  Final step of znode teardown.
 */
void
zfs_znode_free(znode_t *zp)
{
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;

        mutex_enter(&zfsvfs->z_znodes_lock);
        list_remove(&zfsvfs->z_all_znodes, zp);
        mutex_exit(&zfsvfs->z_znodes_lock);

        kmem_cache_free(znode_cache, zp);
}

/*
 * Stamp the timestamps selected by flag (AT_ATIME/AT_MTIME/AT_CTIME)
 * with the current time.  If tx is non-NULL the znode is dirtied in
 * that transaction; otherwise only z_atime_dirty is set.
 * Caller must hold zp->z_lock (asserted).
 */
void
zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
{
        timestruc_t now;

        ASSERT(MUTEX_HELD(&zp->z_lock));

        gethrestime(&now);

        if (tx) {
                dmu_buf_will_dirty(zp->z_dbuf, tx);
                zp->z_atime_dirty = 0;
                zp->z_seq++;
        } else {
                zp->z_atime_dirty = 1;
        }

        if (flag & AT_ATIME)
                ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);

        if (flag & AT_MTIME)
                ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);

        if (flag & AT_CTIME)
                ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
}

/*
 * Update the requested znode timestamps with the current time.
 * If we are in a transaction, then go ahead and mark the znode
 * dirty in the transaction so the timestamps will go to disk.
 * Otherwise, we will get pushed next time the znode is updated
 * in a transaction, or when this znode eventually goes inactive.
 *
 * Why is this OK?
 *  1 - Only the ACCESS time is ever updated outside of a transaction.
 *  2 - Multiple consecutive updates will be collapsed into a single
 *      znode update by the transaction grouping semantics of the DMU.
 */
void
zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
{
        mutex_enter(&zp->z_lock);
        zfs_time_stamper_locked(zp, flag, tx);
        mutex_exit(&zp->z_lock);
}

/*
 * Grow the block size for a file. This may involve migrating data
 * from the bonus buffer into a data block (when we grow beyond the
 * bonus buffer data area).
 *
 *      IN:     zp      - znode of file to free data in.
 *              size    - requested block size
 *              tx      - open transaction.
 *
 *      RETURN: 0 if success
 *              error code if failure
 *
 * NOTE: this function assumes that the znode is write locked.
 */
int
zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
{
        int error;
        u_longlong_t dummy;

        ASSERT(rw_write_held(&zp->z_grow_lock));

        if (size <= zp->z_blksz)
                return (0);
        /*
         * If the file size is already greater than the current blocksize,
         * we will not grow. If there is more than one block in a file,
         * the blocksize cannot change.
         */
        if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
                return (0);

        error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
            size, 0, tx);
        /* ENOTSUP means the DMU can't change the blocksize; not an error. */
        if (error == ENOTSUP)
                return (0);
        ASSERT3U(error, ==, 0);

        /* What blocksize did we actually get? */
        dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);

        return (0);
}

/*
 * This is a dummy interface used when pvn_vplist_dirty() should *not*
 * be calling back into the fs for a putpage(). E.g.: when truncating
 * a file, the pages being "thrown away" don't need to be written out.
 */
/* ARGSUSED */
static int
zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
        int flags, cred_t *cr)
{
        /* Should never be reached; pvn_vplist_dirty() gets B_INVAL|B_TRUNC. */
        ASSERT(0);
        return (0);
}

/*
 * Free space in a file. Currently, this function only
 * supports freeing space at the end of the file.
 *
 *      IN:     zp      - znode of file to free data in.
 *              from    - start of section to free.
 *              len     - length of section to free (0 => to EOF).
 *              flag    - current file open mode flags.
 *              tx      - open transaction.
 *
 *      RETURN: 0 if success
 *              error code if failure
 */
int
zfs_freesp(znode_t *zp, uint64_t from, uint64_t len, int flag, dmu_tx_t *tx,
        cred_t *cr)
{
        vnode_t *vp = ZTOV(zp);
        uint64_t size = zp->z_phys->zp_size;
        uint64_t end = from + len;
        int have_grow_lock, error;

        if (ZTOV(zp)->v_type == VFIFO)
                return (0);

        /* Remember whether the caller already write-holds z_grow_lock. */
        have_grow_lock = RW_WRITE_HELD(&zp->z_grow_lock);

        /*
         * Nothing to do if file already at desired length.
         */
        if (len == 0 && size == from) {
                return (0);
        }

        /*
         * Check for any locks in the region to be freed.
         */
        if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) {
                uint64_t start;

                if (size > from)
                        start = from;
                else
                        start = size;
                if (error = chklock(vp, FWRITE, start, 0, flag, NULL))
                        return (error);
        }

        if (end > zp->z_blksz && (!ISP2(zp->z_blksz) ||
            zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
                uint64_t new_blksz;
                /*
                 * We are growing the file past the current block size.
                 */
                if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
                        ASSERT(!ISP2(zp->z_blksz));
                        new_blksz = MIN(end, SPA_MAXBLOCKSIZE);
                } else {
                        new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
                }
                error = zfs_grow_blocksize(zp, new_blksz, tx);
                ASSERT(error == 0);
        }
        if (end > size || len == 0)
                zp->z_phys->zp_size = end;
        if (from > size)
                return (0);

        /*
         * Downgrade the caller's write hold so page operations below
         * can proceed under a reader hold.
         */
        if (have_grow_lock)
                rw_downgrade(&zp->z_grow_lock);
        /*
         * Clear any mapped pages in the truncated region.
         */
        rw_enter(&zp->z_map_lock, RW_WRITER);
        if (vn_has_cached_data(vp)) {
                page_t *pp;
                uint64_t start = from & PAGEMASK;
                int off = from & PAGEOFFSET;

                if (off != 0 && (pp = page_lookup(vp, start, SE_SHARED))) {
                        /*
                         * We need to zero a partial page.
                         */
                        pagezero(pp, off, PAGESIZE - off);
                        start += PAGESIZE;
                        page_unlock(pp);
                }
                error = pvn_vplist_dirty(vp, start, zfs_no_putpage,
                    B_INVAL | B_TRUNC, cr);
                ASSERT(error == 0);
        }
        rw_exit(&zp->z_map_lock);

        if (!have_grow_lock)
                rw_enter(&zp->z_grow_lock, RW_READER);

        /* len == 0 means free to EOF: pass -1 (all) to the DMU. */
        if (len == 0)
                len = -1;
        else if (end > size)
                len = size - from;
        VERIFY(0 == dmu_free_range(zp->z_zfsvfs->z_os,
            zp->z_id, from, len, tx));

        if (!have_grow_lock)
                rw_exit(&zp->z_grow_lock);

        return (0);
}

/*
 * Bootstrap an empty ZFS filesystem inside transaction tx: create the
 * master node, record the on-disk version, create the delete queue,
 * and create the root directory znode.  A minimal stack-local zfsvfs
 * and a bare cache-allocated znode stand in for the real mount state
 * so zfs_mknode() can run.
 */
void
zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx)
{
        zfsvfs_t zfsvfs;
        uint64_t moid, doid, roid = 0;
        uint64_t version = ZFS_VERSION;
        int error;
        znode_t *rootzp = NULL;
        vnode_t *vp;
        vattr_t vattr;

        /*
         * First attempt to create master node.
         */
        /*
         * In an empty objset, there are no blocks to read and thus
         * there can be no i/o errors (which we assert below).
         */
        moid = MASTER_NODE_OBJ;
        error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
            DMU_OT_NONE, 0, tx);
        ASSERT(error == 0);

        /*
         * Set starting attributes.
         */

        error = zap_update(os, moid, ZFS_VERSION_OBJ, 8, 1, &version, tx);
        ASSERT(error == 0);

        /*
         * Create a delete queue.
         */
        doid = zap_create(os, DMU_OT_DELETE_QUEUE, DMU_OT_NONE, 0, tx);

        error = zap_add(os, moid, ZFS_DELETE_QUEUE, 8, 1, &doid, tx);
        ASSERT(error == 0);

        /*
         * Create root znode. Create minimal znode/vnode/zfsvfs
         * to allow zfs_mknode to work.
         */
        vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
        vattr.va_type = VDIR;
        vattr.va_mode = S_IFDIR|0755;
        vattr.va_uid = 0;
        vattr.va_gid = 3;       /* NOTE(review): presumably GID_SYS — confirm */

        rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
        rootzp->z_zfsvfs = &zfsvfs;
        rootzp->z_active = 1;
        rootzp->z_reap = 0;
        rootzp->z_atime_dirty = 0;
        rootzp->z_dbuf_held = 0;

        vp = ZTOV(rootzp);
        vn_reinit(vp);
        vp->v_type = VDIR;

        bzero(&zfsvfs, sizeof (zfsvfs_t));

        zfsvfs.z_os = os;
        zfsvfs.z_assign = TXG_NOWAIT;
        zfsvfs.z_parent = &zfsvfs;

        mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
        list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
            offsetof(znode_t, z_link_node));

        /* IS_ROOT_NODE makes zfs_mknode() fix up rootzp's id/phys. */
        zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0);
        ASSERT3U(rootzp->z_id, ==, roid);
        error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx);
        ASSERT(error == 0);

        /* Return the bootstrap znode to the cache; it was never zgot. */
        ZTOV(rootzp)->v_count = 0;
        kmem_cache_free(znode_cache, rootzp);
}