/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/mntent.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/stat.h>
#include <sys/mode.h>
#include <sys/atomic.h>
#include <vm/pvn.h>
#include "fs/fs_subr.h"
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_znode.h>
#include <sys/zap.h>
#include <sys/dmu.h>
#include <sys/fs/zfs.h>

struct kmem_cache *znode_cache = NULL;

/*
 * Note that znodes can be in one of 2 states:
 *	ZCACHE_mru	- recently used, currently cached
 *	ZCACHE_mfu	- frequently used, currently cached
 * When there are no active references to a znode, it is
 * linked onto one of the lists in zcache.  These are the
 * only znodes that can be evicted.
 */

typedef struct zcache_state {
	list_t		list;	/* linked list of evictable znodes in state */
	uint64_t	lcnt;	/* total number of znodes in the linked list */
	uint64_t	cnt;	/* total number of all znodes in this state */
	uint64_t	hits;
	kmutex_t	mtx;
} zcache_state_t;

/* The 2 states: */
static zcache_state_t ZCACHE_mru;
static zcache_state_t ZCACHE_mfu;

static struct zcache {
	zcache_state_t	*mru;
	zcache_state_t	*mfu;
	uint64_t	p;	/* Target size of mru */
	uint64_t	c;	/* Target size of cache */
	uint64_t	c_max;	/* Maximum target cache size */

	/* performance stats */
	uint64_t	missed;
	uint64_t	evicted;
	uint64_t	skipped;
} zcache;

void zcache_kmem_reclaim(void);

#define	ZCACHE_MINTIME	(hz>>4)	/* 62 ms */

/*
 * Move the supplied znode to the indicated state.  The mutex
 * for the znode must be held by the caller.
 */
static void
zcache_change_state(zcache_state_t *new_state, znode_t *zp)
{
	/* ASSERT(MUTEX_HELD(hash_mtx)); */
	ASSERT(zp->z_active);

	if (zp->z_zcache_state) {
		ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
		atomic_add_64(&zp->z_zcache_state->cnt, -1);
	}
	atomic_add_64(&new_state->cnt, 1);
	zp->z_zcache_state = new_state;
}
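
/*
 * Drop the last hold on an evictable znode: release its dbuf and the
 * per-znode hold on the vfs.  The caller must hold both z_lock and
 * hash_mtx; both are dropped before returning.
 */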
static void
zfs_zcache_evict(znode_t *zp, kmutex_t *hash_mtx)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ASSERT(zp->z_phys);
	ASSERT(zp->z_dbuf_held);

	zp->z_dbuf_held = 0;
	mutex_exit(&zp->z_lock);
	dmu_buf_rele(zp->z_dbuf);
	mutex_exit(hash_mtx);
	VFS_RELE(zfsvfs->z_vfs);
}

/*
 * Evict znodes from the list until we've removed the specified number.
 */
static void
zcache_evict_state(zcache_state_t *state, int64_t cnt, zfsvfs_t *zfsvfs)
{
	int znodes_evicted = 0;
	znode_t *zp, *zp_prev;
	kmutex_t *hash_mtx;

	ASSERT(state == zcache.mru || state == zcache.mfu);

	mutex_enter(&state->mtx);

	for (zp = list_tail(&state->list); zp; zp = zp_prev) {
		zp_prev = list_prev(&state->list, zp);
		if (zfsvfs && zp->z_zfsvfs != zfsvfs)
			continue;
		hash_mtx = ZFS_OBJ_MUTEX(zp);
		if (mutex_tryenter(hash_mtx)) {
			mutex_enter(&zp->z_lock);
			list_remove(&zp->z_zcache_state->list, zp);
			zp->z_zcache_state->lcnt -= 1;
			ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
			atomic_add_64(&zp->z_zcache_state->cnt, -1);
			zp->z_zcache_state = NULL;
			zp->z_zcache_access = 0;
			/* drops z_lock and hash_mtx */
			zfs_zcache_evict(zp, hash_mtx);
			znodes_evicted += 1;
			atomic_add_64(&zcache.evicted, 1);
			if (znodes_evicted >= cnt)
				break;
		} else {
			atomic_add_64(&zcache.skipped, 1);
		}
	}
	mutex_exit(&state->mtx);

	if (znodes_evicted < cnt)
		dprintf("only evicted %lld znodes from %x",
		    (longlong_t)znodes_evicted, state);
}
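
/*
 * Shrink the lists back toward their targets: evict from the MRU list
 * until it holds no more than zcache.p znodes, and from the MFU list
 * until the two lists combined hold no more than zcache.c.
 */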
static void
zcache_adjust(void)
{
	uint64_t mrucnt = zcache.mru->lcnt;
	uint64_t mfucnt = zcache.mfu->lcnt;
	uint64_t p = zcache.p;
	uint64_t c = zcache.c;

	if (mrucnt > p)
		zcache_evict_state(zcache.mru, mrucnt - p, NULL);

	if (mfucnt > 0 && mrucnt + mfucnt > c) {
		int64_t toevict = MIN(mfucnt, mrucnt + mfucnt - c);
		zcache_evict_state(zcache.mfu, toevict, NULL);
	}
}

/*
 * Flush all *evictable* data from the cache.
 * NOTE: this will not touch "active" (i.e. referenced) data.
 */
void
zfs_zcache_flush(zfsvfs_t *zfsvfs)
{
	zcache_evict_state(zcache.mru, zcache.mru->lcnt, zfsvfs);
	zcache_evict_state(zcache.mfu, zcache.mfu->lcnt, zfsvfs);
}

static void
zcache_try_grow(int64_t cnt)
{
	int64_t size;
	/*
	 * If we're almost to the current target cache size,
	 * increment the target cache size.
	 */
	size = zcache.mru->lcnt + zcache.mfu->lcnt;
	if ((zcache.c - size) <= 1) {
		atomic_add_64(&zcache.c, cnt);
		if (zcache.c > zcache.c_max)
			zcache.c = zcache.c_max;
		else if (zcache.p + cnt < zcache.c)
			atomic_add_64(&zcache.p, cnt);
	}
}

/*
 * This routine is called whenever a znode is accessed.
 */
static void
zcache_access(znode_t *zp, kmutex_t *hash_mtx)
{
	ASSERT(MUTEX_HELD(hash_mtx));

	if (zp->z_zcache_state == NULL) {
		/*
		 * This znode is not in the cache.
		 * Add the new znode to the MRU state.
		 */

		zcache_try_grow(1);

		ASSERT(zp->z_zcache_access == 0);
		zp->z_zcache_access = lbolt;
		zcache_change_state(zcache.mru, zp);
		mutex_exit(hash_mtx);

		/*
		 * If we are using less than 2/3 of our total target
		 * cache size, bump up the target size for the MRU
		 * list.
		 */
		if (zcache.mru->lcnt + zcache.mfu->lcnt < zcache.c*2/3) {
			zcache.p = zcache.mru->lcnt + zcache.c/6;
		}

		zcache_adjust();

		atomic_add_64(&zcache.missed, 1);
	} else if (zp->z_zcache_state == zcache.mru) {
		/*
		 * This znode has been "accessed" only once so far;
		 * move it to the MFU state once it has aged enough.
		 */
		if (lbolt > zp->z_zcache_access + ZCACHE_MINTIME) {
			/*
			 * More than ZCACHE_MINTIME (~62 ms) has passed
			 * since we instantiated this buffer.  Move it
			 * to the most frequently used state.
			 */
			zp->z_zcache_access = lbolt;
			zcache_change_state(zcache.mfu, zp);
		}
		atomic_add_64(&zcache.mru->hits, 1);
		mutex_exit(hash_mtx);
	} else {
		ASSERT(zp->z_zcache_state == zcache.mfu);
		/*
		 * This buffer has been accessed more than once.
		 * Keep it in the MFU state.
		 */
		atomic_add_64(&zcache.mfu->hits, 1);
		mutex_exit(hash_mtx);
	}
}

static void
zcache_init(void)
{
	zcache.c = 20;
	zcache.c_max = 50;

	zcache.mru = &ZCACHE_mru;
	zcache.mfu = &ZCACHE_mfu;

	list_create(&zcache.mru->list, sizeof (znode_t),
	    offsetof(znode_t, z_zcache_node));
	list_create(&zcache.mfu->list, sizeof (znode_t),
	    offsetof(znode_t, z_zcache_node));
}

static void
zcache_fini(void)
{
	zfs_zcache_flush(NULL);

	list_destroy(&zcache.mru->list);
	list_destroy(&zcache.mfu->list);
}
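
/*
 * Pageout callback registered with the DMU via dmu_buf_set_user():
 * invoked when the znode's backing bonus buffer is being evicted.
 * If the vnode has no remaining holds, tear it down and free the znode.
 */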
/*ARGSUSED*/
static void
znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
{
	znode_t *zp = user_ptr;
	vnode_t *vp = ZTOV(zp);

	if (vp->v_count == 0) {
		vn_invalid(vp);
		zfs_znode_free(zp);
	}
}

/*ARGSUSED*/
static int
zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	znode_t *zp = buf;

	zp->z_vnode = vn_alloc(KM_SLEEP);
	zp->z_vnode->v_data = (caddr_t)zp;
	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zp->z_grow_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zp->z_append_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
	zp->z_dbuf_held = 0;
	zp->z_dirlocks = 0;
	return (0);
}

/*ARGSUSED*/
static void
zfs_znode_cache_destructor(void *buf, void *cdarg)
{
	znode_t *zp = buf;

	ASSERT(zp->z_dirlocks == 0);
	mutex_destroy(&zp->z_lock);
	rw_destroy(&zp->z_map_lock);
	rw_destroy(&zp->z_grow_lock);
	rw_destroy(&zp->z_append_lock);
	mutex_destroy(&zp->z_acl_lock);

	ASSERT(zp->z_dbuf_held == 0);
	ASSERT(ZTOV(zp)->v_count == 0);
	vn_free(ZTOV(zp));
}

void
zfs_znode_init(void)
{
	/*
	 * Initialize zcache
	 */
	ASSERT(znode_cache == NULL);
	znode_cache = kmem_cache_create("zfs_znode_cache",
	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);

	zcache_init();
}

void
zfs_znode_fini(void)
{
	zcache_fini();

	/*
	 * Cleanup vfs & vnode ops
	 */
	zfs_remove_op_tables();

	/*
	 * Cleanup zcache
	 */
	if (znode_cache)
		kmem_cache_destroy(znode_cache);
	znode_cache = NULL;
}

struct vnodeops *zfs_dvnodeops;
struct vnodeops *zfs_fvnodeops;
struct vnodeops *zfs_symvnodeops;
struct vnodeops *zfs_xdvnodeops;
struct vnodeops *zfs_evnodeops;

void
zfs_remove_op_tables()
{
	/*
	 * Remove vfs ops
	 */
	ASSERT(zfsfstype);
	(void) vfs_freevfsops_by_type(zfsfstype);
	zfsfstype = 0;

	/*
	 * Remove vnode ops
	 */
	if (zfs_dvnodeops)
		vn_freevnodeops(zfs_dvnodeops);
	if (zfs_fvnodeops)
		vn_freevnodeops(zfs_fvnodeops);
	if (zfs_symvnodeops)
		vn_freevnodeops(zfs_symvnodeops);
	if (zfs_xdvnodeops)
		vn_freevnodeops(zfs_xdvnodeops);
	if (zfs_evnodeops)
		vn_freevnodeops(zfs_evnodeops);

	zfs_dvnodeops = NULL;
	zfs_fvnodeops = NULL;
	zfs_symvnodeops = NULL;
	zfs_xdvnodeops = NULL;
	zfs_evnodeops = NULL;
}

extern const fs_operation_def_t zfs_dvnodeops_template[];
extern const fs_operation_def_t zfs_fvnodeops_template[];
extern const fs_operation_def_t zfs_xdvnodeops_template[];
extern const fs_operation_def_t zfs_symvnodeops_template[];
extern const fs_operation_def_t zfs_evnodeops_template[];

int
zfs_create_op_tables()
{
	int error;

	/*
	 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
	 * due to a failure to remove the 2nd modlinkage (zfs_modldrv).
	 * In this case we just return as the ops vectors are already set up.
	 */
	if (zfs_dvnodeops)
		return (0);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
	    &zfs_dvnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
	    &zfs_fvnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
	    &zfs_symvnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
	    &zfs_xdvnodeops);
	if (error)
		return (error);

	error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
	    &zfs_evnodeops);

	return (error);
}

/*
 * zfs_init_fs - Initialize the zfsvfs struct and the file system
 *	incore "master" object.  Verify version compatibility.
 */
int
zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
{
	extern int zfsfstype;

	objset_t	*os = zfsvfs->z_os;
	uint64_t	zoid;
	uint64_t	version = ZFS_VERSION;
	int		i, error;
	dmu_object_info_t doi;
	dmu_objset_stats_t *stats;

	*zpp = NULL;

	/*
	 * XXX - hack to auto-create the pool root filesystem at
	 * the first attempted mount.
	 */
	if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) {
		dmu_tx_t *tx = dmu_tx_create(os);

		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 3);	/* master node */
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 1);	/* delete queue */
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);	/* root node */
		error = dmu_tx_assign(tx, TXG_WAIT);
		ASSERT3U(error, ==, 0);
		zfs_create_fs(os, cr, tx);
		dmu_tx_commit(tx);
	}

	if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_VERSION_OBJ, 8, 1, &version)) {
		return (EINVAL);
	} else if (version != ZFS_VERSION) {
		(void) printf("Mismatched versions:  File system "
		    "is version %lld on-disk format, which is "
		    "incompatible with this software version %lld!",
		    (u_longlong_t)version, ZFS_VERSION);
		return (ENOTSUP);
	}

	/*
	 * The fsid is 64 bits, composed of an 8-bit fs type, which
	 * separates our fsid from any other filesystem types, and a
	 * 56-bit objset unique ID.  The objset unique ID is unique to
	 * all objsets open on this system, provided by unique_create().
	 * The 8-bit fs type must be put in the low bits of fsid[1]
	 * because that's where other Solaris filesystems put it.
	 */
	stats = kmem_alloc(sizeof (dmu_objset_stats_t), KM_SLEEP);
	dmu_objset_stats(os, stats);
	ASSERT((stats->dds_fsid_guid & ~((1ULL<<56)-1)) == 0);
	zfsvfs->z_vfs->vfs_fsid.val[0] = stats->dds_fsid_guid;
	zfsvfs->z_vfs->vfs_fsid.val[1] = ((stats->dds_fsid_guid>>32) << 8) |
	    zfsfstype & 0xFF;
	kmem_free(stats, sizeof (dmu_objset_stats_t));
	stats = NULL;

	if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zoid)) {
		return (EINVAL);
	}
	ASSERT(zoid != 0);
	zfsvfs->z_root = zoid;

	/*
	 * Create the per mount vop tables.
	 */

	/*
	 * Initialize zget mutexes
	 */
	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);

	error = zfs_zget(zfsvfs, zoid, zpp);
	if (error)
		return (error);
	ASSERT3U((*zpp)->z_id, ==, zoid);

	if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_DELETE_QUEUE, 8, 1, &zoid)) {
		return (EINVAL);
	}

	zfsvfs->z_dqueue = zoid;

	/*
	 * Initialize delete head structure
	 * Thread(s) will be started/stopped via
	 * readonly_changed_cb() depending
	 * on whether this is rw/ro mount.
	 */
	list_create(&zfsvfs->z_delete_head.z_znodes,
	    sizeof (znode_t), offsetof(znode_t, z_list_node));

	return (0);
}

/*
 * Construct a new znode/vnode and initialize.
 *
 * This does not do a dmu_set_user() call; that is left to the
 * caller, in case you don't want to return the znode.
 */
znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
{
	znode_t	*zp;
	vnode_t	*vp;

	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);

	ASSERT(zp->z_dirlocks == NULL);

	zp->z_phys = db->db_data;
	zp->z_zfsvfs = zfsvfs;
	zp->z_active = 1;
	zp->z_reap = 0;
	zp->z_atime_dirty = 0;
	zp->z_dbuf_held = 0;
	zp->z_mapcnt = 0;
	zp->z_last_itx = 0;
	zp->z_dbuf = db;
	zp->z_id = obj_num;
	zp->z_blksz = blksz;
	zp->z_seq = 0x7A4653;

	bzero(&zp->z_zcache_node, sizeof (list_node_t));

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	mutex_exit(&zfsvfs->z_znodes_lock);

	vp = ZTOV(zp);
	vn_reinit(vp);

	vp->v_vfsp = zfsvfs->z_parent->z_vfs;
	vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);

	switch (vp->v_type) {
	case VDIR:
		if (zp->z_phys->zp_flags & ZFS_XATTR) {
			vn_setops(vp, zfs_xdvnodeops);
			vp->v_flag |= V_XATTRDIR;
		} else
			vn_setops(vp, zfs_dvnodeops);
		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
		break;
	case VBLK:
	case VCHR:
		vp->v_rdev = (dev_t)zp->z_phys->zp_rdev;
		/*FALLTHROUGH*/
	case VFIFO:
	case VSOCK:
	case VDOOR:
		vn_setops(vp, zfs_fvnodeops);
		break;
	case VREG:
		vp->v_flag |= VMODSORT;
		vn_setops(vp, zfs_fvnodeops);
		break;
	case VLNK:
		vn_setops(vp, zfs_symvnodeops);
		break;
	default:
		vn_setops(vp, zfs_evnodeops);
		break;
	}

	return (zp);
}
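
/*
 * Bind a freshly allocated znode to its DMU bonus buffer: register the
 * pageout callback, mark the root vnode with VROOT, and take a hold on
 * the vfs for as long as the dbuf is held.
 */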
static void
zfs_znode_dmu_init(znode_t *zp)
{
	znode_t		*nzp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	dmu_buf_t	*db = zp->z_dbuf;

	mutex_enter(&zp->z_lock);

	nzp = dmu_buf_set_user(db, zp, &zp->z_phys, znode_pageout_func);

	/*
	 * There should be no concurrent zgets on this object.
	 */
	ASSERT3P(nzp, ==, NULL);

	/*
	 * Slap on VROOT if we are the root znode
	 */
	if (zp->z_id == zfsvfs->z_root) {
		ZTOV(zp)->v_flag |= VROOT;
	}

	zp->z_zcache_state = NULL;
	zp->z_zcache_access = 0;

	ASSERT(zp->z_dbuf_held == 0);
	zp->z_dbuf_held = 1;
	VFS_HOLD(zfsvfs->z_vfs);
	mutex_exit(&zp->z_lock);
	vn_exists(ZTOV(zp));
}

/*
 * Create a new DMU object to hold a zfs znode.
 *
 *	IN:	dzp	- parent directory for new znode
 *		vap	- file attributes for new znode
 *		tx	- dmu transaction id for zap operations
 *		cr	- credentials of caller
 *		flag	- flags:
 *			  IS_ROOT_NODE	- new object will be root
 *			  IS_XATTR	- new object is an attribute
 *			  IS_REPLAY	- intent log replay
 *
 *	OUT:	oid	- ID of created object
 *
 */
void
zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
	uint_t flag, znode_t **zpp, int bonuslen)
{
	dmu_buf_t	*dbp;
	znode_phys_t	*pzp;
	znode_t		*zp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	timestruc_t	now;
	uint64_t	gen;
	int		err;

	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	if (zfsvfs->z_assign >= TXG_INITIAL) {		/* ZIL replay */
		*oid = vap->va_nodeid;
		flag |= IS_REPLAY;
		now = vap->va_ctime;		/* see zfs_replay_create() */
		gen = vap->va_nblocks;		/* ditto */
	} else {
		*oid = 0;
		gethrestime(&now);
		gen = dmu_tx_get_txg(tx);
	}

	/*
	 * Create a new DMU object.
	 */
	if (vap->va_type == VDIR) {
		if (flag & IS_REPLAY) {
			err = zap_create_claim(zfsvfs->z_os, *oid,
			    DMU_OT_DIRECTORY_CONTENTS,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
			ASSERT3U(err, ==, 0);
		} else {
			*oid = zap_create(zfsvfs->z_os,
			    DMU_OT_DIRECTORY_CONTENTS,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
		}
	} else {
		if (flag & IS_REPLAY) {
			err = dmu_object_claim(zfsvfs->z_os, *oid,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
			ASSERT3U(err, ==, 0);
		} else {
			*oid = dmu_object_alloc(zfsvfs->z_os,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
		}
	}
	dbp = dmu_bonus_hold(zfsvfs->z_os, *oid);
	dmu_buf_will_dirty(dbp, tx);

	/*
	 * Initialize the znode physical data to zero.
	 */
	ASSERT(dbp->db_size >= sizeof (znode_phys_t));
	bzero(dbp->db_data, dbp->db_size);
	pzp = dbp->db_data;

	/*
	 * If this is the root, fix up the half-initialized parent pointer
	 * to reference the just-allocated physical data area.
	 */
	if (flag & IS_ROOT_NODE) {
		dzp->z_phys = pzp;
		dzp->z_id = *oid;
	}

	/*
	 * If parent is an xattr, so am I.
	 */
	if (dzp->z_phys->zp_flags & ZFS_XATTR)
		flag |= IS_XATTR;

	if (vap->va_type == VBLK || vap->va_type == VCHR) {
		pzp->zp_rdev = vap->va_rdev;
	}

	if (vap->va_type == VDIR) {
		pzp->zp_size = 2;		/* contents ("." and "..") */
		pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
	}

	pzp->zp_parent = dzp->z_id;
	if (flag & IS_XATTR)
		pzp->zp_flags |= ZFS_XATTR;

	pzp->zp_gen = gen;

	ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
	ZFS_TIME_ENCODE(&now, pzp->zp_ctime);

	if (vap->va_mask & AT_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
	} else {
		ZFS_TIME_ENCODE(&now, pzp->zp_atime);
	}

	if (vap->va_mask & AT_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
	} else {
		ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
	}

	pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
	zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0);

	zfs_perm_init(zp, dzp, flag, vap, tx, cr);

	if (zpp) {
		kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp);

		mutex_enter(hash_mtx);
		zfs_znode_dmu_init(zp);
		zcache_access(zp, hash_mtx);
		*zpp = zp;
	} else {
		ZTOV(zp)->v_count = 0;
		dmu_buf_rele(dbp);
		zfs_znode_free(zp);
	}
}
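
/*
 * Look up a znode by object number.  If the znode is already
 * instantiated, take a new hold on it; otherwise construct it from its
 * DMU bonus buffer.  On success, *zpp holds a referenced znode.
 */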
int
zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
{
	dmu_object_info_t doi;
	dmu_buf_t	*db;
	znode_t		*zp;

	*zpp = NULL;

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

	db = dmu_bonus_hold(zfsvfs->z_os, obj_num);
	if (db == NULL) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (ENOENT);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
		dmu_buf_rele(db);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (EINVAL);
	}
	dmu_buf_read(db);

	ASSERT(db->db_object == obj_num);
	ASSERT(db->db_offset == -1);
	ASSERT(db->db_data != NULL);

	zp = dmu_buf_get_user(db);

	if (zp != NULL) {
		mutex_enter(&zp->z_lock);

		ASSERT3U(zp->z_id, ==, obj_num);
		if (zp->z_reap) {
			dmu_buf_rele(db);
			mutex_exit(&zp->z_lock);
			ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
			return (ENOENT);
		} else if (zp->z_dbuf_held) {
			dmu_buf_rele(db);
		} else {
			zp->z_dbuf_held = 1;
			VFS_HOLD(zfsvfs->z_vfs);
		}

		if (zp->z_active == 0) {
			zp->z_active = 1;
			if (list_link_active(&zp->z_zcache_node)) {
				mutex_enter(&zp->z_zcache_state->mtx);
				list_remove(&zp->z_zcache_state->list, zp);
				zp->z_zcache_state->lcnt -= 1;
				mutex_exit(&zp->z_zcache_state->mtx);
			}
		}
		VN_HOLD(ZTOV(zp));
		mutex_exit(&zp->z_lock);
		zcache_access(zp, ZFS_OBJ_MUTEX(zp));
		*zpp = zp;
		return (0);
	}

	/*
	 * Not found; create new znode/vnode.
	 */
	zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size);
	ASSERT3U(zp->z_id, ==, obj_num);
	zfs_znode_dmu_init(zp);
	zcache_access(zp, ZFS_OBJ_MUTEX(zp));
	*zpp = zp;
	return (0);
}
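
/*
 * Free the on-disk object backing a znode, including any external ACL
 * object, in the context of the given transaction.
 */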
void
zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
	if (zp->z_phys->zp_acl.z_acl_extern_obj) {
		error = dmu_object_free(zfsvfs->z_os,
		    zp->z_phys->zp_acl.z_acl_extern_obj, tx);
		ASSERT3U(error, ==, 0);
	}
	if (zp->z_zcache_state) {
		ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
		atomic_add_64(&zp->z_zcache_state->cnt, -1);
	}
	error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx);
	ASSERT3U(error, ==, 0);
	zp->z_dbuf_held = 0;
	ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
	dmu_buf_rele(zp->z_dbuf);
}
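
/*
 * Called when the last vnode hold is being released: either tear the
 * znode down now (unlinked files, unmounted filesystems) or place it
 * on its state's evictable list so zfs_zget() may revive it later.
 */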
void
zfs_zinactive(znode_t *zp)
{
	vnode_t	*vp = ZTOV(zp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	uint64_t z_id = zp->z_id;

	ASSERT(zp->z_dbuf_held && zp->z_phys);

	/*
	 * Don't allow a zfs_zget() while we're trying to release this znode
	 */
	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);

	mutex_enter(&zp->z_lock);
	mutex_enter(&vp->v_lock);
	vp->v_count--;
	if (vp->v_count > 0 || vn_has_cached_data(vp)) {
		/*
		 * If the hold count is greater than zero, somebody has
		 * obtained a new reference on this znode while we were
		 * processing it here, so we are done.  If we still have
		 * mapped pages then we are also done, since we don't
		 * want to inactivate the znode until the pages get pushed.
		 *
		 * XXX - if vn_has_cached_data(vp) is true, but count == 0,
		 * this seems like it would leave the znode hanging with
		 * no chance to go inactive...
		 */
		mutex_exit(&vp->v_lock);
		mutex_exit(&zp->z_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
		return;
	}
	mutex_exit(&vp->v_lock);
	zp->z_active = 0;

	/*
	 * If this was the last reference to a file with no links,
	 * remove the file from the file system.
	 */
	if (zp->z_reap) {
		mutex_exit(&zp->z_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
		ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
		atomic_add_64(&zp->z_zcache_state->cnt, -1);
		zp->z_zcache_state = NULL;
		/* XATTR files are not put on the delete queue */
		if (zp->z_phys->zp_flags & ZFS_XATTR) {
			zfs_rmnode(zp);
		} else {
			mutex_enter(&zfsvfs->z_delete_head.z_mutex);
			list_insert_tail(&zfsvfs->z_delete_head.z_znodes, zp);
			zfsvfs->z_delete_head.z_znode_count++;
			cv_broadcast(&zfsvfs->z_delete_head.z_cv);
			mutex_exit(&zfsvfs->z_delete_head.z_mutex);
		}
		VFS_RELE(zfsvfs->z_vfs);
		return;
	}

	/*
	 * If the file system for this znode is no longer mounted,
	 * evict the znode now; don't put it in the cache.
	 */
	if (zfsvfs->z_unmounted1) {
		zfs_zcache_evict(zp, ZFS_OBJ_MUTEX(zp));
		return;
	}

	/* put znode on evictable list */
	mutex_enter(&zp->z_zcache_state->mtx);
	list_insert_head(&zp->z_zcache_state->list, zp);
	zp->z_zcache_state->lcnt += 1;
	mutex_exit(&zp->z_zcache_state->mtx);
	mutex_exit(&zp->z_lock);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
}

void
zfs_znode_free(znode_t *zp)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_remove(&zfsvfs->z_all_znodes, zp);
	mutex_exit(&zfsvfs->z_znodes_lock);

	kmem_cache_free(znode_cache, zp);
}
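
/*
 * Locked variant of zfs_time_stamper(); see the comment above that
 * function below.  The caller must already hold zp->z_lock.
 */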
void
zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
{
	timestruc_t	now;

	ASSERT(MUTEX_HELD(&zp->z_lock));

	gethrestime(&now);

	if (tx) {
		dmu_buf_will_dirty(zp->z_dbuf, tx);
		zp->z_atime_dirty = 0;
		zp->z_seq++;
	} else {
		zp->z_atime_dirty = 1;
	}

	if (flag & AT_ATIME)
		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);

	if (flag & AT_MTIME)
		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);

	if (flag & AT_CTIME)
		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
}

/*
 * Update the requested znode timestamps with the current time.
 * If we are in a transaction, then go ahead and mark the znode
 * dirty in the transaction so the timestamps will go to disk.
 * Otherwise, we will get pushed next time the znode is updated
 * in a transaction, or when this znode eventually goes inactive.
 *
 * Why is this OK?
 *  1 - Only the ACCESS time is ever updated outside of a transaction.
 *  2 - Multiple consecutive updates will be collapsed into a single
 *	znode update by the transaction grouping semantics of the DMU.
 */
void
zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
{
	mutex_enter(&zp->z_lock);
	zfs_time_stamper_locked(zp, flag, tx);
	mutex_exit(&zp->z_lock);
}

/*
 * Grow the block size for a file.  This may involve migrating data
 * from the bonus buffer into a data block (when we grow beyond the
 * bonus buffer data area).
 *
 *	IN:	zp	- znode of file whose block size is to grow.
 *		size	- requested block size
 *		tx	- open transaction.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * NOTE: this function assumes that the znode is write locked.
 */
int
zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
{
	int		error;
	u_longlong_t	dummy;

	ASSERT(rw_write_held(&zp->z_grow_lock));

	if (size <= zp->z_blksz)
		return (0);
	/*
	 * If the file size is already greater than the current blocksize,
	 * we will not grow.  If there is more than one block in a file,
	 * the blocksize cannot change.
	 */
	if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
		return (0);

	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
	    size, 0, tx);
	if (error == ENOTSUP)
		return (0);
	ASSERT3U(error, ==, 0);

	/* What blocksize did we actually get? */
	dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);

	return (0);
}

/*
 * This is a dummy interface used when pvn_vplist_dirty() should *not*
 * be calling back into the fs for a putpage().  E.g.: when truncating
 * a file, the pages being "thrown away" don't need to be written out.
 */
/* ARGSUSED */
static int
zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
    int flags, cred_t *cr)
{
	ASSERT(0);
	return (0);
}

/*
 * Free space in a file.  Currently, this function only
 * supports freeing space at the end of the file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		from	- start of section to free.
 *		len	- length of section to free (0 => to EOF).
 *		flag	- current file open mode flags.
 *		tx	- open transaction.
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
int
zfs_freesp(znode_t *zp, uint64_t from, uint64_t len, int flag, dmu_tx_t *tx,
	cred_t *cr)
{
	vnode_t *vp = ZTOV(zp);
	uint64_t size = zp->z_phys->zp_size;
	uint64_t end = from + len;
	int have_grow_lock, error;

	if (ZTOV(zp)->v_type == VFIFO)
		return (0);

	have_grow_lock = RW_WRITE_HELD(&zp->z_grow_lock);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (len == 0 && size == from) {
		return (0);
	}

	/*
	 * Check for any locks in the region to be freed.
	 */
	if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) {
		uint64_t start;

		if (size > from)
			start = from;
		else
			start = size;
		if (error = chklock(vp, FWRITE, start, 0, flag, NULL))
			return (error);
	}

	if (end > zp->z_blksz && (!ISP2(zp->z_blksz) ||
	    zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
		uint64_t new_blksz;
		/*
		 * We are growing the file past the current block size.
		 */
		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
			ASSERT(!ISP2(zp->z_blksz));
			new_blksz = MIN(end, SPA_MAXBLOCKSIZE);
		} else {
			new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
		}
		error = zfs_grow_blocksize(zp, new_blksz, tx);
		ASSERT(error == 0);
	}
	if (end > size || len == 0)
		zp->z_phys->zp_size = end;
	if (from > size)
		return (0);

	if (have_grow_lock)
		rw_downgrade(&zp->z_grow_lock);
	/*
	 * Clear any mapped pages in the truncated region.
	 */
	rw_enter(&zp->z_map_lock, RW_WRITER);
	if (vn_has_cached_data(vp)) {
		page_t *pp;
		uint64_t start = from & PAGEMASK;
		int off = from & PAGEOFFSET;

		if (off != 0 && (pp = page_lookup(vp, start, SE_SHARED))) {
			/*
			 * We need to zero a partial page.
			 */
			pagezero(pp, off, PAGESIZE - off);
			start += PAGESIZE;
			page_unlock(pp);
		}
		error = pvn_vplist_dirty(vp, start, zfs_no_putpage,
		    B_INVAL | B_TRUNC, cr);
		ASSERT(error == 0);
	}
	rw_exit(&zp->z_map_lock);

	if (!have_grow_lock)
		rw_enter(&zp->z_grow_lock, RW_READER);

	if (len == 0)
		len = -1;
	else if (end > size)
		len = size - from;
	dmu_free_range(zp->z_zfsvfs->z_os, zp->z_id, from, len, tx);

	if (!have_grow_lock)
		rw_exit(&zp->z_grow_lock);

	return (0);
}
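
/*
 * Create a brand-new filesystem in the given objset: the master node,
 * version attribute, delete queue, and root directory znode (see the
 * auto-create hack in zfs_init_fs()).
 */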
void
zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx)
{
	zfsvfs_t	zfsvfs;
	uint64_t	moid, doid, roid = 0;
	uint64_t	version = ZFS_VERSION;
	int		error;
	znode_t		*rootzp = NULL;
	vnode_t		*vp;
	vattr_t		vattr;

	/*
	 * First attempt to create master node.
	 */
	moid = MASTER_NODE_OBJ;
	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	/*
	 * Set starting attributes.
	 */

	error = zap_update(os, moid, ZFS_VERSION_OBJ, 8, 1, &version, tx);
	ASSERT(error == 0);

	/*
	 * Create a delete queue.
	 */
	doid = zap_create(os, DMU_OT_DELETE_QUEUE, DMU_OT_NONE, 0, tx);

	error = zap_add(os, moid, ZFS_DELETE_QUEUE, 8, 1, &doid, tx);
	ASSERT(error == 0);

	/*
	 * Create root znode.  Create minimal znode/vnode/zfsvfs
	 * to allow zfs_mknode to work.
	 */
	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
	vattr.va_type = VDIR;
	vattr.va_mode = S_IFDIR|0755;
	vattr.va_uid = 0;
	vattr.va_gid = 3;

	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	rootzp->z_zfsvfs = &zfsvfs;
	rootzp->z_active = 1;
	rootzp->z_reap = 0;
	rootzp->z_atime_dirty = 0;
	rootzp->z_dbuf_held = 0;

	vp = ZTOV(rootzp);
	vn_reinit(vp);
	vp->v_type = VDIR;

	bzero(&zfsvfs, sizeof (zfsvfs_t));

	zfsvfs.z_os = os;
	zfsvfs.z_assign = TXG_NOWAIT;
	zfsvfs.z_parent = &zfsvfs;

	mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));

	zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0);
	ASSERT3U(rootzp->z_id, ==, roid);
	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx);
	ASSERT(error == 0);

	ZTOV(rootzp)->v_count = 0;
	kmem_cache_free(znode_cache, rootzp);
}