/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/mntent.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/stat.h>
#include <sys/mode.h>
#include <sys/atomic.h>
#include <vm/pvn.h>
#include "fs/fs_subr.h"
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_znode.h>
#include <sys/zap.h>
#include <sys/dmu.h>
#include <sys/fs/zfs.h>

struct kmem_cache *znode_cache = NULL;

/*
 * Note that znodes can be in one of 2 states:
 *	ZCACHE_mru	- recently used, currently cached
 *	ZCACHE_mfu	- frequently used, currently cached
 * When there are no active references to the znode, they
 * are linked onto one of the lists in zcache.  These are the
 * only znodes that can be evicted.
 */

typedef struct zcache_state {
        list_t   list;  /* linked list of evictable znodes in state */
        uint64_t lcnt;  /* total number of znodes in the linked list */
        uint64_t cnt;   /* total number of all znodes in this state */
        uint64_t hits;
        kmutex_t mtx;
} zcache_state_t;

/* The 2 states: */
static zcache_state_t ZCACHE_mru;
static zcache_state_t ZCACHE_mfu;

static struct zcache {
        zcache_state_t  *mru;
        zcache_state_t  *mfu;
        uint64_t        p;      /* Target size of mru */
        uint64_t        c;      /* Target size of cache */
        uint64_t        c_max;  /* Maximum target cache size */

        /* performance stats */
        uint64_t        missed;
        uint64_t        evicted;
        uint64_t        skipped;
} zcache;

void zcache_kmem_reclaim(void);

#define	ZCACHE_MINTIME (hz>>4) /* 62 ms */
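
/*
 * Editorial note on the counters above: for each state, cnt tracks
 * every znode currently assigned to that state (active or not), while
 * lcnt tracks only the inactive znodes actually linked on the state's
 * evictable list, so cnt >= lcnt always holds.  The adaptive targets
 * p and c play roles analogous to the ARC's MRU target and overall
 * target size, scaled down to znode counts.
 */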

/*
 * Move the supplied znode to the indicated state.  The mutex
 * for the znode must be held by the caller.
 */
static void
zcache_change_state(zcache_state_t *new_state, znode_t *zp)
{
        /* ASSERT(MUTEX_HELD(hash_mtx)); */
        ASSERT(zp->z_active);

        if (zp->z_zcache_state) {
                ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
                atomic_add_64(&zp->z_zcache_state->cnt, -1);
        }
        atomic_add_64(&new_state->cnt, 1);
        zp->z_zcache_state = new_state;
}

static void
zfs_zcache_evict(znode_t *zp, kmutex_t *hash_mtx)
{
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;

        ASSERT(zp->z_phys);
        ASSERT(zp->z_dbuf_held);

        zp->z_dbuf_held = 0;
        mutex_exit(&zp->z_lock);
        dmu_buf_rele(zp->z_dbuf);
        mutex_exit(hash_mtx);
        VFS_RELE(zfsvfs->z_vfs);
}

/*
 * Evict znodes from list until we've removed the specified number
 */
static void
zcache_evict_state(zcache_state_t *state, int64_t cnt, zfsvfs_t *zfsvfs)
{
        int znodes_evicted = 0;
        znode_t *zp, *zp_prev;
        kmutex_t *hash_mtx;

        ASSERT(state == zcache.mru || state == zcache.mfu);

        mutex_enter(&state->mtx);

        for (zp = list_tail(&state->list); zp; zp = zp_prev) {
                zp_prev = list_prev(&state->list, zp);
                if (zfsvfs && zp->z_zfsvfs != zfsvfs)
                        continue;
                hash_mtx = ZFS_OBJ_MUTEX(zp);
                if (mutex_tryenter(hash_mtx)) {
                        mutex_enter(&zp->z_lock);
                        list_remove(&zp->z_zcache_state->list, zp);
                        zp->z_zcache_state->lcnt -= 1;
                        ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
                        atomic_add_64(&zp->z_zcache_state->cnt, -1);
                        zp->z_zcache_state = NULL;
                        zp->z_zcache_access = 0;
                        /* drops z_lock and hash_mtx */
                        zfs_zcache_evict(zp, hash_mtx);
                        znodes_evicted += 1;
                        atomic_add_64(&zcache.evicted, 1);
                        if (znodes_evicted >= cnt)
                                break;
                } else {
                        atomic_add_64(&zcache.skipped, 1);
                }
        }
        mutex_exit(&state->mtx);

        if (znodes_evicted < cnt)
                dprintf("only evicted %lld znodes from %p",
                    (longlong_t)znodes_evicted, (void *)state);
}

static void
zcache_adjust(void)
{
        uint64_t mrucnt = zcache.mru->lcnt;
        uint64_t mfucnt = zcache.mfu->lcnt;
        uint64_t p = zcache.p;
        uint64_t c = zcache.c;

        if (mrucnt > p)
                zcache_evict_state(zcache.mru, mrucnt - p, NULL);

        if (mfucnt > 0 && mrucnt + mfucnt > c) {
                int64_t toevict = MIN(mfucnt, mrucnt + mfucnt - c);
                zcache_evict_state(zcache.mfu, toevict, NULL);
        }
}

/*
 * Flush all *evictable* data from the cache.
 * NOTE: this will not touch "active" (i.e. referenced) data.
 */
void
zfs_zcache_flush(zfsvfs_t *zfsvfs)
{
        zcache_evict_state(zcache.mru, zcache.mru->lcnt, zfsvfs);
        zcache_evict_state(zcache.mfu, zcache.mfu->lcnt, zfsvfs);
}

static void
zcache_try_grow(int64_t cnt)
{
        int64_t size;
        /*
         * If we're almost to the current target cache size,
         * increment the target cache size
         */
        size = zcache.mru->lcnt + zcache.mfu->lcnt;
        if ((zcache.c - size) <= 1) {
                atomic_add_64(&zcache.c, cnt);
                if (zcache.c > zcache.c_max)
                        zcache.c = zcache.c_max;
                else if (zcache.p + cnt < zcache.c)
                        atomic_add_64(&zcache.p, cnt);
        }
}
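
/*
 * Illustrative trace of zcache_adjust() (editorial sketch, not part
 * of the original code): assume p = 10, c = 20, and evictable counts
 * mru->lcnt = 15, mfu->lcnt = 10.  The first test evicts
 * mrucnt - p = 5 znodes from the MRU list.  The second test computes
 * toevict = MIN(10, (15 + 10) - 20) = 5 and evicts 5 more from the
 * MFU list, leaving at most p MRU znodes and c znodes total, assuming
 * every candidate's hash mutex can be acquired.  Note that both
 * counts are snapshots taken before any eviction runs.
 */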

/*
 * This routine is called whenever a znode is accessed.
 */
static void
zcache_access(znode_t *zp, kmutex_t *hash_mtx)
{
        ASSERT(MUTEX_HELD(hash_mtx));

        if (zp->z_zcache_state == NULL) {
                /*
                 * This znode is not in the cache.
                 * Add the new znode to the MRU state.
                 */

                zcache_try_grow(1);

                ASSERT(zp->z_zcache_access == 0);
                zp->z_zcache_access = lbolt;
                zcache_change_state(zcache.mru, zp);
                mutex_exit(hash_mtx);

                /*
                 * If we are using less than 2/3 of our total target
                 * cache size, bump up the target size for the MRU
                 * list.
                 */
                if (zcache.mru->lcnt + zcache.mfu->lcnt < zcache.c * 2 / 3) {
                        zcache.p = zcache.mru->lcnt + zcache.c / 6;
                }

                zcache_adjust();

                atomic_add_64(&zcache.missed, 1);
        } else if (zp->z_zcache_state == zcache.mru) {
                /*
                 * This znode has been "accessed" only once so far;
                 * move it to the MFU state.
                 */
                if (lbolt > zp->z_zcache_access + ZCACHE_MINTIME) {
                        /*
                         * More than ZCACHE_MINTIME ticks have passed
                         * since we instantiated this buffer.  Move it
                         * to the most frequently used state.
                         */
                        zp->z_zcache_access = lbolt;
                        zcache_change_state(zcache.mfu, zp);
                }
                atomic_add_64(&zcache.mru->hits, 1);
                mutex_exit(hash_mtx);
        } else {
                ASSERT(zp->z_zcache_state == zcache.mfu);
                /*
                 * This buffer has been accessed more than once.
                 * Keep it in the MFU state.
                 */
                atomic_add_64(&zcache.mfu->hits, 1);
                mutex_exit(hash_mtx);
        }
}

static void
zcache_init(void)
{
        zcache.c = 20;
        zcache.c_max = 50;

        zcache.mru = &ZCACHE_mru;
        zcache.mfu = &ZCACHE_mfu;

        list_create(&zcache.mru->list, sizeof (znode_t),
            offsetof(znode_t, z_zcache_node));
        list_create(&zcache.mfu->list, sizeof (znode_t),
            offsetof(znode_t, z_zcache_node));
}

static void
zcache_fini(void)
{
        zfs_zcache_flush(NULL);

        list_destroy(&zcache.mru->list);
        list_destroy(&zcache.mfu->list);
}

/*ARGSUSED*/
static void
znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
{
        znode_t *zp = user_ptr;
        vnode_t *vp = ZTOV(zp);

        if (vp->v_count == 0) {
                vn_invalid(vp);
                zfs_znode_free(zp);
        }
}

/*ARGSUSED*/
static int
zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
        znode_t *zp = buf;

        zp->z_vnode = vn_alloc(KM_SLEEP);
        zp->z_vnode->v_data = (caddr_t)zp;
        mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
        rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
        rw_init(&zp->z_grow_lock, NULL, RW_DEFAULT, NULL);
        rw_init(&zp->z_append_lock, NULL, RW_DEFAULT, NULL);
        mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
        zp->z_dbuf_held = 0;
        zp->z_dirlocks = 0;
        return (0);
}

/*ARGSUSED*/
static void
zfs_znode_cache_destructor(void *buf, void *cdarg)
{
        znode_t *zp = buf;

        ASSERT(zp->z_dirlocks == 0);
        mutex_destroy(&zp->z_lock);
        rw_destroy(&zp->z_map_lock);
        rw_destroy(&zp->z_grow_lock);
        rw_destroy(&zp->z_append_lock);
        mutex_destroy(&zp->z_acl_lock);

        ASSERT(zp->z_dbuf_held == 0);
        ASSERT(ZTOV(zp)->v_count == 0);
        vn_free(ZTOV(zp));
}

void
zfs_znode_init(void)
{
        /*
         * Initialize zcache
         */
        ASSERT(znode_cache == NULL);
        znode_cache = kmem_cache_create("zfs_znode_cache",
            sizeof (znode_t), 0, zfs_znode_cache_constructor,
            zfs_znode_cache_destructor, NULL, NULL, NULL, 0);

        zcache_init();
}

void
zfs_znode_fini(void)
{
        zcache_fini();

        /*
         * Cleanup vfs & vnode ops
         */
        zfs_remove_op_tables();

        /*
         * Cleanup zcache
         */
        if (znode_cache)
                kmem_cache_destroy(znode_cache);
        znode_cache = NULL;
}
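
/*
 * Editorial note on the znode kmem cache above: the constructor
 * allocates the embedded vnode and initializes the locks once per
 * cache object, and that state survives kmem_cache_free()/
 * kmem_cache_alloc() cycles; callers such as zfs_znode_alloc() only
 * vn_reinit() the recycled vnode.  The destructor asserts the object
 * is quiescent (no dirlocks, no held dbuf, v_count == 0) before
 * vn_free().
 */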

struct vnodeops *zfs_dvnodeops;
struct vnodeops *zfs_fvnodeops;
struct vnodeops *zfs_symvnodeops;
struct vnodeops *zfs_xdvnodeops;
struct vnodeops *zfs_evnodeops;

void
zfs_remove_op_tables()
{
        /*
         * Remove vfs ops
         */
        ASSERT(zfsfstype);
        (void) vfs_freevfsops_by_type(zfsfstype);
        zfsfstype = 0;

        /*
         * Remove vnode ops
         */
        if (zfs_dvnodeops)
                vn_freevnodeops(zfs_dvnodeops);
        if (zfs_fvnodeops)
                vn_freevnodeops(zfs_fvnodeops);
        if (zfs_symvnodeops)
                vn_freevnodeops(zfs_symvnodeops);
        if (zfs_xdvnodeops)
                vn_freevnodeops(zfs_xdvnodeops);
        if (zfs_evnodeops)
                vn_freevnodeops(zfs_evnodeops);

        zfs_dvnodeops = NULL;
        zfs_fvnodeops = NULL;
        zfs_symvnodeops = NULL;
        zfs_xdvnodeops = NULL;
        zfs_evnodeops = NULL;
}

extern const fs_operation_def_t zfs_dvnodeops_template[];
extern const fs_operation_def_t zfs_fvnodeops_template[];
extern const fs_operation_def_t zfs_xdvnodeops_template[];
extern const fs_operation_def_t zfs_symvnodeops_template[];
extern const fs_operation_def_t zfs_evnodeops_template[];

int
zfs_create_op_tables()
{
        int error;

        /*
         * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
         * due to a failure to remove the 2nd modlinkage (zfs_modldrv).
         * In this case we just return as the ops vectors are already set up.
         */
        if (zfs_dvnodeops)
                return (0);

        error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
            &zfs_dvnodeops);
        if (error)
                return (error);

        error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
            &zfs_fvnodeops);
        if (error)
                return (error);

        error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
            &zfs_symvnodeops);
        if (error)
                return (error);

        error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
            &zfs_xdvnodeops);
        if (error)
                return (error);

        error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
            &zfs_evnodeops);

        return (error);
}
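
/*
 * Editorial summary of the on-disk layout consumed below (taken from
 * the zap_lookup() calls in zfs_init_fs() and the creation code in
 * zfs_create_fs()): the master node is a ZAP object at the fixed id
 * MASTER_NODE_OBJ whose entries map names to single 8-byte values:
 *
 *	ZFS_VERSION_OBJ		-> on-disk format version (ZFS_VERSION)
 *	ZFS_ROOT_OBJ		-> object id of the root directory znode
 *	ZFS_DELETE_QUEUE	-> object id of the delete-queue ZAP
 */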

/*
 * zfs_init_fs - Initialize the zfsvfs struct and the file system
 *	incore "master" object.  Verify version compatibility.
 */
int
zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
{
        extern int zfsfstype;

        objset_t        *os = zfsvfs->z_os;
        uint64_t        zoid;
        uint64_t        version = ZFS_VERSION;
        int             i, error;
        dmu_object_info_t doi;
        dmu_objset_stats_t *stats;

        *zpp = NULL;

        /*
         * XXX - hack to auto-create the pool root filesystem at
         * the first attempted mount.
         */
        if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) {
                dmu_tx_t *tx = dmu_tx_create(os);

                dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 3); /* master node */
                dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 1); /* delete queue */
                dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */
                error = dmu_tx_assign(tx, TXG_WAIT);
                ASSERT3U(error, ==, 0);
                zfs_create_fs(os, cr, tx);
                dmu_tx_commit(tx);
        }

        if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_VERSION_OBJ, 8, 1, &version)) {
                return (EINVAL);
        } else if (version != ZFS_VERSION) {
                (void) printf("Mismatched versions: File system "
                    "is version %lld on-disk format, which is "
                    "incompatible with this software version %lld!",
                    (u_longlong_t)version, ZFS_VERSION);
                return (ENOTSUP);
        }

        /*
         * The fsid is 64 bits, composed of an 8-bit fs type, which
         * separates our fsid from any other filesystem types, and a
         * 56-bit objset unique ID.  The objset unique ID is unique to
         * all objsets open on this system, provided by unique_create().
         * The 8-bit fs type must be put in the low bits of fsid[1]
         * because that's where other Solaris filesystems put it.
         */
        stats = kmem_alloc(sizeof (dmu_objset_stats_t), KM_SLEEP);
        dmu_objset_stats(os, stats);
        ASSERT((stats->dds_fsid_guid & ~((1ULL<<56)-1)) == 0);
        zfsvfs->z_vfs->vfs_fsid.val[0] = stats->dds_fsid_guid;
        zfsvfs->z_vfs->vfs_fsid.val[1] = ((stats->dds_fsid_guid>>32) << 8) |
            zfsfstype & 0xFF;
        kmem_free(stats, sizeof (dmu_objset_stats_t));
        stats = NULL;

        if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zoid)) {
                return (EINVAL);
        }
        ASSERT(zoid != 0);
        zfsvfs->z_root = zoid;

        /*
         * Create the per mount vop tables.
         */

        /*
         * Initialize zget mutexes
         */
        for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
                mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);

        error = zfs_zget(zfsvfs, zoid, zpp);
        if (error)
                return (error);
        ASSERT3U((*zpp)->z_id, ==, zoid);

        if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_DELETE_QUEUE, 8, 1, &zoid)) {
                return (EINVAL);
        }

        zfsvfs->z_dqueue = zoid;

        /*
         * Initialize the delete head structure.  Thread(s) will be
         * started/stopped via readonly_changed_cb() depending on
         * whether this is a rw/ro mount.
         */
        list_create(&zfsvfs->z_delete_head.z_znodes,
            sizeof (znode_t), offsetof(znode_t, z_list_node));

        return (0);
}
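
/*
 * Illustrative fsid layout (editorial sketch): suppose the low 56
 * bits of dds_fsid_guid are 0x00AABBCCDDEEFF11 and zfsfstype is 0x1A.
 * Then val[0] holds the low 32 bits of the guid (0xDDEEFF11), and
 * val[1] is (0x00AABBCC << 8) | 0x1A = 0xAABBCC1A: bits 32-55 of the
 * guid in the upper bytes and the 8-bit fs type in the low byte,
 * where other Solaris filesystems put it.
 */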

/*
 * Construct a new znode/vnode and initialize.
 *
 * This does not call dmu_set_user(); that is up to the
 * caller to do, in case you don't want to return the znode.
 */
znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
{
        znode_t *zp;
        vnode_t *vp;

        zp = kmem_cache_alloc(znode_cache, KM_SLEEP);

        ASSERT(zp->z_dirlocks == NULL);

        zp->z_phys = db->db_data;
        zp->z_zfsvfs = zfsvfs;
        zp->z_active = 1;
        zp->z_reap = 0;
        zp->z_atime_dirty = 0;
        zp->z_dbuf_held = 0;
        zp->z_mapcnt = 0;
        zp->z_last_itx = 0;
        zp->z_dbuf = db;
        zp->z_id = obj_num;
        zp->z_blksz = blksz;
        zp->z_seq = 0x7A4653;

        bzero(&zp->z_zcache_node, sizeof (list_node_t));

        mutex_enter(&zfsvfs->z_znodes_lock);
        list_insert_tail(&zfsvfs->z_all_znodes, zp);
        mutex_exit(&zfsvfs->z_znodes_lock);

        vp = ZTOV(zp);
        vn_reinit(vp);

        vp->v_vfsp = zfsvfs->z_parent->z_vfs;
        vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);

        switch (vp->v_type) {
        case VDIR:
                if (zp->z_phys->zp_flags & ZFS_XATTR) {
                        vn_setops(vp, zfs_xdvnodeops);
                        vp->v_flag |= V_XATTRDIR;
                } else
                        vn_setops(vp, zfs_dvnodeops);
                break;
        case VBLK:
        case VCHR:
                vp->v_rdev = (dev_t)zp->z_phys->zp_rdev;
                /*FALLTHROUGH*/
        case VFIFO:
        case VSOCK:
        case VDOOR:
                vn_setops(vp, zfs_fvnodeops);
                break;
        case VREG:
                vp->v_flag |= VMODSORT;
                vn_setops(vp, zfs_fvnodeops);
                break;
        case VLNK:
                vn_setops(vp, zfs_symvnodeops);
                break;
        default:
                vn_setops(vp, zfs_evnodeops);
                break;
        }

        return (zp);
}

static void
zfs_znode_dmu_init(znode_t *zp)
{
        znode_t         *nzp;
        zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
        dmu_buf_t       *db = zp->z_dbuf;

        mutex_enter(&zp->z_lock);

        nzp = dmu_buf_set_user(db, zp, &zp->z_phys, znode_pageout_func);

        /*
         * There should be no concurrent zgets on this object.
         */
        ASSERT3P(nzp, ==, NULL);

        /*
         * Slap on VROOT if we are the root znode
         */
        if (zp->z_id == zfsvfs->z_root) {
                ZTOV(zp)->v_flag |= VROOT;
        }

        zp->z_zcache_state = NULL;
        zp->z_zcache_access = 0;

        ASSERT(zp->z_dbuf_held == 0);
        zp->z_dbuf_held = 1;
        VFS_HOLD(zfsvfs->z_vfs);
        mutex_exit(&zp->z_lock);
        vn_exists(ZTOV(zp));
}
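
/*
 * Illustrative dispatch example for zfs_znode_alloc() (editorial
 * sketch): a znode whose zp_mode is S_IFDIR|0755 yields
 * IFTOVT(mode) == VDIR, so the vnode gets zfs_dvnodeops (or
 * zfs_xdvnodeops plus V_XATTRDIR if ZFS_XATTR is set in zp_flags);
 * S_IFREG|0644 yields VREG, which gets zfs_fvnodeops and VMODSORT;
 * VBLK/VCHR record v_rdev and fall through to zfs_fvnodeops; anything
 * unrecognized lands on zfs_evnodeops.
 */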

/*
 * Create a new DMU object to hold a zfs znode.
 *
 *	IN:	dzp	- parent directory for new znode
 *		vap	- file attributes for new znode
 *		tx	- dmu transaction id for zap operations
 *		cr	- credentials of caller
 *		flag	- flags:
 *			  IS_ROOT_NODE	- new object will be root
 *			  IS_XATTR	- new object is an attribute
 *			  IS_REPLAY	- intent log replay
 *
 *	OUT:	oid	- ID of created object
 *
 */
void
zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
    uint_t flag, znode_t **zpp, int bonuslen)
{
        dmu_buf_t       *dbp;
        znode_phys_t    *pzp;
        znode_t         *zp;
        zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
        timestruc_t     now;
        uint64_t        gen;
        int             err;

        ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

        if (zfsvfs->z_assign >= TXG_INITIAL) {          /* ZIL replay */
                *oid = vap->va_nodeid;
                flag |= IS_REPLAY;
                now = vap->va_ctime;            /* see zfs_replay_create() */
                gen = vap->va_nblocks;          /* ditto */
        } else {
                *oid = 0;
                gethrestime(&now);
                gen = dmu_tx_get_txg(tx);
        }

        /*
         * Create a new DMU object.
         */
        if (vap->va_type == VDIR) {
                if (flag & IS_REPLAY) {
                        err = zap_create_claim(zfsvfs->z_os, *oid,
                            DMU_OT_DIRECTORY_CONTENTS,
                            DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen,
                            tx);
                        ASSERT3U(err, ==, 0);
                } else {
                        *oid = zap_create(zfsvfs->z_os,
                            DMU_OT_DIRECTORY_CONTENTS,
                            DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen,
                            tx);
                }
        } else {
                if (flag & IS_REPLAY) {
                        err = dmu_object_claim(zfsvfs->z_os, *oid,
                            DMU_OT_PLAIN_FILE_CONTENTS, 0,
                            DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen,
                            tx);
                        ASSERT3U(err, ==, 0);
                } else {
                        *oid = dmu_object_alloc(zfsvfs->z_os,
                            DMU_OT_PLAIN_FILE_CONTENTS, 0,
                            DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen,
                            tx);
                }
        }
        dbp = dmu_bonus_hold(zfsvfs->z_os, *oid);
        dmu_buf_will_dirty(dbp, tx);

        /*
         * Initialize the znode physical data to zero.
         */
        ASSERT(dbp->db_size >= sizeof (znode_phys_t));
        bzero(dbp->db_data, dbp->db_size);
        pzp = dbp->db_data;

        /*
         * If this is the root, fix up the half-initialized parent pointer
         * to reference the just-allocated physical data area.
         */
        if (flag & IS_ROOT_NODE) {
                dzp->z_phys = pzp;
                dzp->z_id = *oid;
        }

        /*
         * If parent is an xattr, so am I.
         */
        if (dzp->z_phys->zp_flags & ZFS_XATTR)
                flag |= IS_XATTR;

        if (vap->va_type == VBLK || vap->va_type == VCHR) {
                pzp->zp_rdev = vap->va_rdev;
        }

        if (vap->va_type == VDIR) {
                pzp->zp_size = 2;               /* contents ("." and "..") */
                pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
        }

        pzp->zp_parent = dzp->z_id;
        if (flag & IS_XATTR)
                pzp->zp_flags |= ZFS_XATTR;

        pzp->zp_gen = gen;

        ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
        ZFS_TIME_ENCODE(&now, pzp->zp_ctime);

        if (vap->va_mask & AT_ATIME) {
                ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
        } else {
                ZFS_TIME_ENCODE(&now, pzp->zp_atime);
        }

        if (vap->va_mask & AT_MTIME) {
                ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
        } else {
                ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
        }

        pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
        zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0);

        zfs_perm_init(zp, dzp, flag, vap, tx, cr);

        if (zpp) {
                kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp);

                mutex_enter(hash_mtx);
                zfs_znode_dmu_init(zp);
                zcache_access(zp, hash_mtx);
                *zpp = zp;
        } else {
                ZTOV(zp)->v_count = 0;
                dmu_buf_rele(dbp);
                zfs_znode_free(zp);
        }
}
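
/*
 * Editorial note on the replay branches above: during intent log
 * replay (zfsvfs->z_assign >= TXG_INITIAL) the object number recorded
 * in the log must be reproduced exactly, so zfs_mknode() *claims*
 * that specific id with zap_create_claim()/dmu_object_claim(); on the
 * normal path it lets zap_create()/dmu_object_alloc() pick the id.
 * Likewise the timestamps and generation come from the replayed vattr
 * (va_ctime/va_nblocks) rather than the current time and txg.
 */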

int
zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
{
        dmu_object_info_t doi;
        dmu_buf_t       *db;
        znode_t         *zp;

        *zpp = NULL;

        ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

        db = dmu_bonus_hold(zfsvfs->z_os, obj_num);
        if (db == NULL) {
                ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
                return (ENOENT);
        }

        dmu_object_info_from_db(db, &doi);
        if (doi.doi_bonus_type != DMU_OT_ZNODE ||
            doi.doi_bonus_size < sizeof (znode_phys_t)) {
                dmu_buf_rele(db);
                ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
                return (EINVAL);
        }
        dmu_buf_read(db);

        ASSERT(db->db_object == obj_num);
        ASSERT(db->db_offset == -1);
        ASSERT(db->db_data != NULL);

        zp = dmu_buf_get_user(db);

        if (zp != NULL) {
                mutex_enter(&zp->z_lock);

                ASSERT3U(zp->z_id, ==, obj_num);
                if (zp->z_reap) {
                        dmu_buf_rele(db);
                        mutex_exit(&zp->z_lock);
                        ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
                        return (ENOENT);
                } else if (zp->z_dbuf_held) {
                        dmu_buf_rele(db);
                } else {
                        zp->z_dbuf_held = 1;
                        VFS_HOLD(zfsvfs->z_vfs);
                }

                if (zp->z_active == 0) {
                        zp->z_active = 1;
                        if (list_link_active(&zp->z_zcache_node)) {
                                mutex_enter(&zp->z_zcache_state->mtx);
                                list_remove(&zp->z_zcache_state->list, zp);
                                zp->z_zcache_state->lcnt -= 1;
                                mutex_exit(&zp->z_zcache_state->mtx);
                        }
                }
                VN_HOLD(ZTOV(zp));
                mutex_exit(&zp->z_lock);
                zcache_access(zp, ZFS_OBJ_MUTEX(zp));
                *zpp = zp;
                return (0);
        }

        /*
         * Not found; create a new znode/vnode.
         */
        zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size);
        ASSERT3U(zp->z_id, ==, obj_num);
        zfs_znode_dmu_init(zp);
        zcache_access(zp, ZFS_OBJ_MUTEX(zp));
        *zpp = zp;
        return (0);
}

void
zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
{
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        int error;

        ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
        if (zp->z_phys->zp_acl.z_acl_extern_obj) {
                error = dmu_object_free(zfsvfs->z_os,
                    zp->z_phys->zp_acl.z_acl_extern_obj, tx);
                ASSERT3U(error, ==, 0);
        }
        if (zp->z_zcache_state) {
                ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
                atomic_add_64(&zp->z_zcache_state->cnt, -1);
        }
        error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx);
        ASSERT3U(error, ==, 0);
        zp->z_dbuf_held = 0;
        ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
        dmu_buf_rele(zp->z_dbuf);
}
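
/*
 * Editorial note on lock ordering in zfs_zinactive() below: the
 * object hash mutex (ZFS_OBJ_HOLD_ENTER) is taken first to fence out
 * a concurrent zfs_zget() on the same object id, then z_lock, then
 * v_lock for the v_count decrement; the locks are dropped in the
 * reverse order on every exit path.
 */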

void
zfs_zinactive(znode_t *zp)
{
        vnode_t *vp = ZTOV(zp);
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        uint64_t z_id = zp->z_id;

        ASSERT(zp->z_dbuf_held && zp->z_phys);

        /*
         * Don't allow a zfs_zget() while we're trying to release this znode
         */
        ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);

        mutex_enter(&zp->z_lock);
        mutex_enter(&vp->v_lock);
        vp->v_count--;
        if (vp->v_count > 0 || vn_has_cached_data(vp)) {
                /*
                 * If the hold count is greater than zero, somebody has
                 * obtained a new reference on this znode while we were
                 * processing it here, so we are done.  If we still have
                 * mapped pages then we are also done, since we don't
                 * want to inactivate the znode until the pages get pushed.
                 *
                 * XXX - if vn_has_cached_data(vp) is true, but count == 0,
                 * this seems like it would leave the znode hanging with
                 * no chance to go inactive...
                 */
                mutex_exit(&vp->v_lock);
                mutex_exit(&zp->z_lock);
                ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
                return;
        }
        mutex_exit(&vp->v_lock);
        zp->z_active = 0;

        /*
         * If this was the last reference to a file with no links,
         * remove the file from the file system.
         */
        if (zp->z_reap) {
                mutex_exit(&zp->z_lock);
                ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
                ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
                atomic_add_64(&zp->z_zcache_state->cnt, -1);
                zp->z_zcache_state = NULL;
                /* XATTR files are not put on the delete queue */
                if (zp->z_phys->zp_flags & ZFS_XATTR) {
                        zfs_rmnode(zp);
                } else {
                        mutex_enter(&zfsvfs->z_delete_head.z_mutex);
                        list_insert_tail(&zfsvfs->z_delete_head.z_znodes, zp);
                        zfsvfs->z_delete_head.z_znode_count++;
                        cv_broadcast(&zfsvfs->z_delete_head.z_cv);
                        mutex_exit(&zfsvfs->z_delete_head.z_mutex);
                }
                VFS_RELE(zfsvfs->z_vfs);
                return;
        }

        /*
         * If the file system for this znode is no longer mounted,
         * evict the znode now; don't put it in the cache.
         */
        if (zfsvfs->z_unmounted1) {
                zfs_zcache_evict(zp, ZFS_OBJ_MUTEX(zp));
                return;
        }

        /* put znode on evictable list */
        mutex_enter(&zp->z_zcache_state->mtx);
        list_insert_head(&zp->z_zcache_state->list, zp);
        zp->z_zcache_state->lcnt += 1;
        mutex_exit(&zp->z_zcache_state->mtx);
        mutex_exit(&zp->z_lock);
        ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
}

void
zfs_znode_free(znode_t *zp)
{
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;

        mutex_enter(&zfsvfs->z_znodes_lock);
        list_remove(&zfsvfs->z_all_znodes, zp);
        mutex_exit(&zfsvfs->z_znodes_lock);

        kmem_cache_free(znode_cache, zp);
}

void
zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
{
        timestruc_t     now;

        ASSERT(MUTEX_HELD(&zp->z_lock));

        gethrestime(&now);

        if (tx) {
                dmu_buf_will_dirty(zp->z_dbuf, tx);
                zp->z_atime_dirty = 0;
                zp->z_seq++;
        } else {
                zp->z_atime_dirty = 1;
        }

        if (flag & AT_ATIME)
                ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);

        if (flag & AT_MTIME)
                ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);

        if (flag & AT_CTIME)
                ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
}

/*
 * Update the requested znode timestamps with the current time.
 * If we are in a transaction, then go ahead and mark the znode
 * dirty in the transaction so the timestamps will go to disk.
 * Otherwise, we will get pushed next time the znode is updated
 * in a transaction, or when this znode eventually goes inactive.
 *
 * Why is this OK?
 *  1 - Only the ACCESS time is ever updated outside of a transaction.
 *  2 - Multiple consecutive updates will be collapsed into a single
 *	znode update by the transaction grouping semantics of the DMU.
 */
void
zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
{
        mutex_enter(&zp->z_lock);
        zfs_time_stamper_locked(zp, flag, tx);
        mutex_exit(&zp->z_lock);
}

/*
 * Grow the block size for a file.  This may involve migrating data
 * from the bonus buffer into a data block (when we grow beyond the
 * bonus buffer data area).
 *
 *	IN:	zp	- znode of file whose block size is to be grown.
 *		size	- requested block size
 *		tx	- open transaction.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * NOTE: this function assumes that the znode is write locked.
 */
int
zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
{
        int             error;
        u_longlong_t    dummy;

        ASSERT(rw_write_held(&zp->z_grow_lock));

        if (size <= zp->z_blksz)
                return (0);
        /*
         * If the file size is already greater than the current blocksize,
         * we will not grow.  If there is more than one block in a file,
         * the blocksize cannot change.
         */
        if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
                return (0);

        error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
            size, 0, tx);
        if (error == ENOTSUP)
                return (0);
        ASSERT3U(error, ==, 0);

        /* What blocksize did we actually get? */
        dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);

        return (0);
}
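
/*
 * Illustrative example for zfs_grow_blocksize() (editorial sketch):
 * with z_blksz = 4K and zp_size = 3K (a single-block file), a request
 * for size = 7K calls dmu_object_set_blocksize() and then reads the
 * blocksize the DMU actually chose back into z_blksz via
 * dmu_object_size_from_db(); the DMU may adjust or refuse the odd
 * size (ENOTSUP is treated as success).  With z_blksz = 4K and
 * zp_size = 6K the file already spans multiple blocks, so the
 * blocksize is frozen and the function returns 0 without changing
 * anything; a request of size <= z_blksz is likewise a no-op.
 */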

/*
 * This is a dummy interface used when pvn_vplist_dirty() should *not*
 * be calling back into the fs for a putpage().  E.g.: when truncating
 * a file, the pages being "thrown away" don't need to be written out.
 */
/* ARGSUSED */
static int
zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
    int flags, cred_t *cr)
{
        ASSERT(0);
        return (0);
}

/*
 * Free space in a file.  Currently, this function only
 * supports freeing space at the end of the file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		from	- start of section to free.
 *		len	- length of section to free (0 => to EOF).
 *		flag	- current file open mode flags.
 *		tx	- open transaction.
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
int
zfs_freesp(znode_t *zp, uint64_t from, uint64_t len, int flag, dmu_tx_t *tx,
    cred_t *cr)
{
        vnode_t *vp = ZTOV(zp);
        uint64_t size = zp->z_phys->zp_size;
        uint64_t end = from + len;
        int have_grow_lock, error;

        have_grow_lock = RW_WRITE_HELD(&zp->z_grow_lock);

        /*
         * Nothing to do if file already at desired length.
         */
        if (len == 0 && size == from) {
                return (0);
        }

        /*
         * Check for any locks in the region to be freed.
         */
        if (MANDLOCK(vp, (mode_t)zp->z_phys->zp_mode)) {
                uint64_t start;

                if (size > from)
                        start = from;
                else
                        start = size;
                if (error = chklock(vp, FWRITE, start, 0, flag, NULL))
                        return (error);
        }

        if (end > zp->z_blksz && (!ISP2(zp->z_blksz) ||
            zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
                uint64_t new_blksz;
                /*
                 * We are growing the file past the current block size.
                 */
                if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
                        ASSERT(!ISP2(zp->z_blksz));
                        new_blksz = MIN(end, SPA_MAXBLOCKSIZE);
                } else {
                        new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
                }
                error = zfs_grow_blocksize(zp, new_blksz, tx);
                ASSERT(error == 0);
        }
        if (end > size || len == 0)
                zp->z_phys->zp_size = end;
        if (from > size)
                return (0);

        if (have_grow_lock)
                rw_downgrade(&zp->z_grow_lock);
        /*
         * Clear any mapped pages in the truncated region.
         */
        rw_enter(&zp->z_map_lock, RW_WRITER);
        if (vn_has_cached_data(vp)) {
                page_t *pp;
                uint64_t start = from & PAGEMASK;
                int off = from & PAGEOFFSET;

                if (off != 0 && (pp = page_lookup(vp, start, SE_SHARED))) {
                        /*
                         * We need to zero a partial page.
                         */
                        pagezero(pp, off, PAGESIZE - off);
                        start += PAGESIZE;
                        page_unlock(pp);
                }
                error = pvn_vplist_dirty(vp, start, zfs_no_putpage,
                    B_INVAL | B_TRUNC, cr);
                ASSERT(error == 0);
        }
        rw_exit(&zp->z_map_lock);

        if (!have_grow_lock)
                rw_enter(&zp->z_grow_lock, RW_READER);

        if (len == 0)
                len = -1;
        else if (end > size)
                len = size - from;
        dmu_free_range(zp->z_zfsvfs->z_os, zp->z_id, from, len, tx);

        if (!have_grow_lock)
                rw_exit(&zp->z_grow_lock);

        return (0);
}
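
/*
 * Illustrative page-clearing arithmetic for zfs_freesp() (editorial
 * sketch): truncating at from = 0x2A00 with PAGESIZE = 0x1000 gives
 * start = from & PAGEMASK = 0x2000 and off = from & PAGEOFFSET =
 * 0xA00.  The partial page at 0x2000 has bytes 0xA00-0xFFF zeroed in
 * place, start advances to 0x3000, and pvn_vplist_dirty() then
 * invalidates every cached page from 0x3000 onward via the
 * zfs_no_putpage() dummy (B_INVAL | B_TRUNC).
 */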

void
zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx)
{
        zfsvfs_t        zfsvfs;
        uint64_t        moid, doid, roid = 0;
        uint64_t        version = ZFS_VERSION;
        int             error;
        znode_t         *rootzp = NULL;
        vnode_t         *vp;
        vattr_t         vattr;

        /*
         * First attempt to create master node.
         */
        moid = MASTER_NODE_OBJ;
        error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
            DMU_OT_NONE, 0, tx);
        ASSERT(error == 0);

        /*
         * Set starting attributes.
         */

        error = zap_update(os, moid, ZFS_VERSION_OBJ, 8, 1, &version, tx);
        ASSERT(error == 0);

        /*
         * Create a delete queue.
         */
        doid = zap_create(os, DMU_OT_DELETE_QUEUE, DMU_OT_NONE, 0, tx);

        error = zap_add(os, moid, ZFS_DELETE_QUEUE, 8, 1, &doid, tx);
        ASSERT(error == 0);

        /*
         * Create root znode.  Create minimal znode/vnode/zfsvfs
         * to allow zfs_mknode to work.
         */
        vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
        vattr.va_type = VDIR;
        vattr.va_mode = S_IFDIR|0755;
        vattr.va_uid = 0;
        vattr.va_gid = 3;

        rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
        rootzp->z_zfsvfs = &zfsvfs;
        rootzp->z_active = 1;
        rootzp->z_reap = 0;
        rootzp->z_atime_dirty = 0;
        rootzp->z_dbuf_held = 0;

        vp = ZTOV(rootzp);
        vn_reinit(vp);
        vp->v_type = VDIR;

        bzero(&zfsvfs, sizeof (zfsvfs_t));

        zfsvfs.z_os = os;
        zfsvfs.z_assign = TXG_NOWAIT;
        zfsvfs.z_parent = &zfsvfs;

        mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
        list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
            offsetof(znode_t, z_link_node));

        zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0);
        ASSERT3U(rootzp->z_id, ==, roid);
        error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx);
        ASSERT(error == 0);

        ZTOV(rootzp)->v_count = 0;
        kmem_cache_free(znode_cache, rootzp);
}