1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
24 * Copyright (c) 2014 Integros [integros.com] 25 */ 26 27 /* Portions Copyright 2007 Jeremy Teo */ 28 29 #ifdef _KERNEL 30 #include <sys/types.h> 31 #include <sys/param.h> 32 #include <sys/time.h> 33 #include <sys/systm.h> 34 #include <sys/sysmacros.h> 35 #include <sys/resource.h> 36 #include <sys/mntent.h> 37 #include <sys/mkdev.h> 38 #include <sys/u8_textprep.h> 39 #include <sys/dsl_dataset.h> 40 #include <sys/vfs.h> 41 #include <sys/vfs_opreg.h> 42 #include <sys/vnode.h> 43 #include <sys/file.h> 44 #include <sys/kmem.h> 45 #include <sys/errno.h> 46 #include <sys/unistd.h> 47 #include <sys/mode.h> 48 #include <sys/atomic.h> 49 #include <vm/pvn.h> 50 #include "fs/fs_subr.h" 51 #include <sys/zfs_dir.h> 52 #include <sys/zfs_acl.h> 53 #include <sys/zfs_ioctl.h> 54 #include <sys/zfs_rlock.h> 55 #include <sys/zfs_fuid.h> 56 #include <sys/dnode.h> 57 #include <sys/fs/zfs.h> 58 #include <sys/kidmap.h> 59 #endif /* _KERNEL */ 60 61 #include <sys/dmu.h> 62 #include <sys/dmu_objset.h> 63 #include <sys/dmu_tx.h> 64 #include <sys/refcount.h> 65 #include <sys/stat.h> 66 #include <sys/zap.h> 67 #include <sys/zfs_znode.h> 68 #include <sys/sa.h> 69 #include <sys/zfs_sa.h> 70 #include <sys/zfs_stat.h> 71 72 #include "zfs_prop.h" 73 #include "zfs_comutil.h" 74 75 /* 76 * Define ZNODE_STATS to turn on statistic gathering. By default, it is only 77 * turned on when DEBUG is also defined. 78 */ 79 #ifdef DEBUG 80 #define ZNODE_STATS 81 #endif /* DEBUG */ 82 83 #ifdef ZNODE_STATS 84 #define ZNODE_STAT_ADD(stat) ((stat)++) 85 #else 86 #define ZNODE_STAT_ADD(stat) /* nothing */ 87 #endif /* ZNODE_STATS */ 88 89 /* 90 * Functions needed for userland (ie: libzpool) are not put under 91 * #ifdef_KERNEL; the rest of the functions have dependencies 92 * (such as VFS logic) that will not compile easily in userland. 93 */ 94 #ifdef _KERNEL 95 /* 96 * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to 97 * be freed before it can be safely accessed. 
 */
krwlock_t zfsvfs_lock;

/* Cache of preconstructed znodes; created in zfs_znode_init(). */
static kmem_cache_t *znode_cache = NULL;

/*ARGSUSED*/
static void
znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
{
	/*
	 * We should never drop all dbuf refs without first clearing
	 * the eviction callback.
	 */
	panic("evicting znode %p\n", user_ptr);
}

/*
 * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
 * z_rangelock. It will modify the offset and length of the lock to reflect
 * znode-specific information, and convert RL_APPEND to RL_WRITER. This is
 * called with the rangelock_t's rl_lock held, which avoids races.
 */
static void
zfs_rangelock_cb(locked_range_t *new, void *arg)
{
	znode_t *zp = arg;

	/*
	 * If in append mode, convert to writer and lock starting at the
	 * current end of file.
	 */
	if (new->lr_type == RL_APPEND) {
		new->lr_offset = zp->z_size;
		new->lr_type = RL_WRITER;
	}

	/*
	 * If we need to grow the block size then lock the whole file range.
	 */
	uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
	if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
	    zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
		new->lr_offset = 0;
		new->lr_length = UINT64_MAX;
	}
}

/*
 * kmem cache constructor: preallocate the vnode and initialize the locks
 * that live for the lifetime of the cached znode. Returns -1 (constructor
 * failure) if no vnode can be allocated under the given kmflags.
 */
/*ARGSUSED*/
static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
{
	znode_t *zp = buf;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));

	zp->z_vnode = vn_alloc(kmflags);
	if (zp->z_vnode == NULL) {
		return (-1);
	}
	ZTOV(zp)->v_data = zp;

	list_link_init(&zp->z_link_node);

	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);

	rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);

	zp->z_dirlocks = NULL;
	zp->z_acl_cached = NULL;
	zp->z_moved = 0;
	return (0);
}

/*
 * kmem cache destructor: undo everything the constructor set up. The
 * ASSERTs verify the znode was quiesced (no dirlocks, no cached ACL)
 * before being returned to the cache.
 */
/*ARGSUSED*/
static void
zfs_znode_cache_destructor(void *buf, void *arg)
{
	znode_t *zp = buf;

	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
	ASSERT(ZTOV(zp)->v_data == zp);
	vn_free(ZTOV(zp));
	ASSERT(!list_link_active(&zp->z_link_node));
	mutex_destroy(&zp->z_lock);
	rw_destroy(&zp->z_parent_lock);
	rw_destroy(&zp->z_name_lock);
	mutex_destroy(&zp->z_acl_lock);
	rangelock_fini(&zp->z_rangelock);

	ASSERT(zp->z_dirlocks == NULL);
	ASSERT(zp->z_acl_cached == NULL);
}

#ifdef ZNODE_STATS
/* Counters for the possible outcomes of zfs_znode_move(). */
static struct {
	uint64_t zms_zfsvfs_invalid;
	uint64_t zms_zfsvfs_recheck1;
	uint64_t zms_zfsvfs_unmounted;
	uint64_t zms_zfsvfs_recheck2;
	uint64_t zms_obj_held;
	uint64_t zms_vnode_locked;
	uint64_t zms_not_only_dnlc;
} znode_move_stats;
#endif	/* ZNODE_STATS */

/*
 * Copy all live state from ozp into the relocated buffer nzp and
 * invalidate ozp. Helper for the zfs_znode_move() kmem callback; the
 * caller holds all locks needed to make this transfer safe.
 */
static void
zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
{
	vnode_t *vp;

	/* Copy fields. */
	nzp->z_zfsvfs = ozp->z_zfsvfs;

	/* Swap vnodes. */
	vp = nzp->z_vnode;
	nzp->z_vnode = ozp->z_vnode;
	ozp->z_vnode = vp;	/* let destructor free the overwritten vnode */
	ZTOV(ozp)->v_data = ozp;
	ZTOV(nzp)->v_data = nzp;

	nzp->z_id = ozp->z_id;
	ASSERT(ozp->z_dirlocks == NULL);	/* znode not in use */
	nzp->z_unlinked = ozp->z_unlinked;
	nzp->z_atime_dirty = ozp->z_atime_dirty;
	nzp->z_zn_prefetch = ozp->z_zn_prefetch;
	nzp->z_blksz = ozp->z_blksz;
	nzp->z_seq = ozp->z_seq;
	nzp->z_mapcnt = ozp->z_mapcnt;
	nzp->z_gen = ozp->z_gen;
	nzp->z_sync_cnt = ozp->z_sync_cnt;
	nzp->z_is_sa = ozp->z_is_sa;
	nzp->z_sa_hdl = ozp->z_sa_hdl;
	bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2);
	nzp->z_links = ozp->z_links;
	nzp->z_size = ozp->z_size;
	nzp->z_pflags = ozp->z_pflags;
	nzp->z_uid = ozp->z_uid;
	nzp->z_gid = ozp->z_gid;
	nzp->z_mode = ozp->z_mode;

	/*
	 * Since this is just an idle znode and kmem is already dealing with
	 * memory pressure, release any cached ACL.
	 */
	if (ozp->z_acl_cached) {
		zfs_acl_free(ozp->z_acl_cached);
		ozp->z_acl_cached = NULL;
	}

	sa_set_userp(nzp->z_sa_hdl, nzp);

	/*
	 * Invalidate the original znode by clearing fields that provide a
	 * pointer back to the znode. Set the low bit of the vfs pointer to
	 * ensure that zfs_znode_move() recognizes the znode as invalid in any
	 * subsequent callback.
	 */
	ozp->z_sa_hdl = NULL;
	POINTER_INVALIDATE(&ozp->z_zfsvfs);

	/*
	 * Mark the znode.
	 */
	nzp->z_moved = 1;
	ozp->z_moved = (uint8_t)-1;
}

/*
 * kmem_cache move callback: relocate a cached znode from buf to newbuf so
 * the allocator can defragment the cache. Returns KMEM_CBRC_YES on success,
 * DONT_KNOW when the znode cannot be identified, or LATER when it is
 * temporarily busy.
 */
/*ARGSUSED*/
static kmem_cbrc_t
zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
{
	znode_t *ozp = buf, *nzp = newbuf;
	zfsvfs_t *zfsvfs;
	vnode_t *vp;

	/*
	 * The znode is on the file system's list of known znodes if the vfs
	 * pointer is valid. We set the low bit of the vfs pointer when freeing
	 * the znode to invalidate it, and the memory patterns written by kmem
	 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
	 * created znode sets the vfs pointer last of all to indicate that the
	 * znode is known and in a valid state to be moved by this function.
	 */
	zfsvfs = ozp->z_zfsvfs;
	if (!POINTER_IS_VALID(zfsvfs)) {
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * Close a small window in which it's possible that the filesystem could
	 * be unmounted and freed, and zfsvfs, though valid in the previous
	 * statement, could point to unrelated memory by the time we try to
	 * prevent the filesystem from being unmounted.
	 */
	rw_enter(&zfsvfs_lock, RW_WRITER);
	if (zfsvfs != ozp->z_zfsvfs) {
		rw_exit(&zfsvfs_lock);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * If the znode is still valid, then so is the file system. We know that
	 * no valid file system can be freed while we hold zfsvfs_lock, so we
	 * can safely ensure that the filesystem is not and will not be
	 * unmounted. The next statement is equivalent to ZFS_ENTER().
	 */
	rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
	if (zfsvfs->z_unmounted) {
		ZFS_EXIT(zfsvfs);
		rw_exit(&zfsvfs_lock);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
		return (KMEM_CBRC_DONT_KNOW);
	}
	rw_exit(&zfsvfs_lock);

	mutex_enter(&zfsvfs->z_znodes_lock);
	/*
	 * Recheck the vfs pointer in case the znode was removed just before
	 * acquiring the lock.
	 */
	if (zfsvfs != ozp->z_zfsvfs) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
		return (KMEM_CBRC_DONT_KNOW);
	}

	/*
	 * At this point we know that as long as we hold z_znodes_lock, the
	 * znode cannot be freed and fields within the znode can be safely
	 * accessed. Now, prevent a race with zfs_zget().
	 */
	if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
		return (KMEM_CBRC_LATER);
	}

	vp = ZTOV(ozp);
	if (mutex_tryenter(&vp->v_lock) == 0) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
		return (KMEM_CBRC_LATER);
	}

	/* Only move znodes that are referenced _only_ by the DNLC. */
	if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
		mutex_exit(&vp->v_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
		mutex_exit(&zfsvfs->z_znodes_lock);
		ZFS_EXIT(zfsvfs);
		ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
		return (KMEM_CBRC_LATER);
	}

	/*
	 * The znode is known and in a valid state to move. We're holding the
	 * locks needed to execute the critical section.
364 */ 365 zfs_znode_move_impl(ozp, nzp); 366 mutex_exit(&vp->v_lock); 367 ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 368 369 list_link_replace(&ozp->z_link_node, &nzp->z_link_node); 370 mutex_exit(&zfsvfs->z_znodes_lock); 371 ZFS_EXIT(zfsvfs); 372 373 return (KMEM_CBRC_YES); 374 } 375 376 void 377 zfs_znode_init(void) 378 { 379 /* 380 * Initialize zcache 381 */ 382 rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL); 383 ASSERT(znode_cache == NULL); 384 znode_cache = kmem_cache_create("zfs_znode_cache", 385 sizeof (znode_t), 0, zfs_znode_cache_constructor, 386 zfs_znode_cache_destructor, NULL, NULL, NULL, 0); 387 kmem_cache_set_move(znode_cache, zfs_znode_move); 388 } 389 390 void 391 zfs_znode_fini(void) 392 { 393 /* 394 * Cleanup vfs & vnode ops 395 */ 396 zfs_remove_op_tables(); 397 398 /* 399 * Cleanup zcache 400 */ 401 if (znode_cache) 402 kmem_cache_destroy(znode_cache); 403 znode_cache = NULL; 404 rw_destroy(&zfsvfs_lock); 405 } 406 407 struct vnodeops *zfs_dvnodeops; 408 struct vnodeops *zfs_fvnodeops; 409 struct vnodeops *zfs_symvnodeops; 410 struct vnodeops *zfs_xdvnodeops; 411 struct vnodeops *zfs_evnodeops; 412 struct vnodeops *zfs_sharevnodeops; 413 414 void 415 zfs_remove_op_tables() 416 { 417 /* 418 * Remove vfs ops 419 */ 420 ASSERT(zfsfstype); 421 (void) vfs_freevfsops_by_type(zfsfstype); 422 zfsfstype = 0; 423 424 /* 425 * Remove vnode ops 426 */ 427 if (zfs_dvnodeops) 428 vn_freevnodeops(zfs_dvnodeops); 429 if (zfs_fvnodeops) 430 vn_freevnodeops(zfs_fvnodeops); 431 if (zfs_symvnodeops) 432 vn_freevnodeops(zfs_symvnodeops); 433 if (zfs_xdvnodeops) 434 vn_freevnodeops(zfs_xdvnodeops); 435 if (zfs_evnodeops) 436 vn_freevnodeops(zfs_evnodeops); 437 if (zfs_sharevnodeops) 438 vn_freevnodeops(zfs_sharevnodeops); 439 440 zfs_dvnodeops = NULL; 441 zfs_fvnodeops = NULL; 442 zfs_symvnodeops = NULL; 443 zfs_xdvnodeops = NULL; 444 zfs_evnodeops = NULL; 445 zfs_sharevnodeops = NULL; 446 } 447 448 extern const fs_operation_def_t zfs_dvnodeops_template[]; 449 extern 
const fs_operation_def_t zfs_fvnodeops_template[]; 450 extern const fs_operation_def_t zfs_xdvnodeops_template[]; 451 extern const fs_operation_def_t zfs_symvnodeops_template[]; 452 extern const fs_operation_def_t zfs_evnodeops_template[]; 453 extern const fs_operation_def_t zfs_sharevnodeops_template[]; 454 455 int 456 zfs_create_op_tables() 457 { 458 int error; 459 460 /* 461 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs() 462 * due to a failure to remove the the 2nd modlinkage (zfs_modldrv). 463 * In this case we just return as the ops vectors are already set up. 464 */ 465 if (zfs_dvnodeops) 466 return (0); 467 468 error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template, 469 &zfs_dvnodeops); 470 if (error) 471 return (error); 472 473 error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template, 474 &zfs_fvnodeops); 475 if (error) 476 return (error); 477 478 error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template, 479 &zfs_symvnodeops); 480 if (error) 481 return (error); 482 483 error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template, 484 &zfs_xdvnodeops); 485 if (error) 486 return (error); 487 488 error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template, 489 &zfs_evnodeops); 490 if (error) 491 return (error); 492 493 error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template, 494 &zfs_sharevnodeops); 495 496 return (error); 497 } 498 499 int 500 zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) 501 { 502 zfs_acl_ids_t acl_ids; 503 vattr_t vattr; 504 znode_t *sharezp; 505 vnode_t *vp; 506 znode_t *zp; 507 int error; 508 509 vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; 510 vattr.va_type = VDIR; 511 vattr.va_mode = S_IFDIR|0555; 512 vattr.va_uid = crgetuid(kcred); 513 vattr.va_gid = crgetgid(kcred); 514 515 sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP); 516 ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs)); 517 sharezp->z_moved = 0; 518 sharezp->z_unlinked = 0; 519 sharezp->z_atime_dirty = 0; 520 sharezp->z_zfsvfs = zfsvfs; 521 sharezp->z_is_sa 
= zfsvfs->z_use_sa; 522 523 vp = ZTOV(sharezp); 524 vn_reinit(vp); 525 vp->v_type = VDIR; 526 527 VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, 528 kcred, NULL, &acl_ids)); 529 zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids); 530 ASSERT3P(zp, ==, sharezp); 531 ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */ 532 POINTER_INVALIDATE(&sharezp->z_zfsvfs); 533 error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, 534 ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); 535 zfsvfs->z_shares_dir = sharezp->z_id; 536 537 zfs_acl_ids_free(&acl_ids); 538 ZTOV(sharezp)->v_count = 0; 539 sa_handle_destroy(sharezp->z_sa_hdl); 540 kmem_cache_free(znode_cache, sharezp); 541 542 return (error); 543 } 544 545 /* 546 * define a couple of values we need available 547 * for both 64 and 32 bit environments. 548 */ 549 #ifndef NBITSMINOR64 550 #define NBITSMINOR64 32 551 #endif 552 #ifndef MAXMAJ64 553 #define MAXMAJ64 0xffffffffUL 554 #endif 555 #ifndef MAXMIN64 556 #define MAXMIN64 0xffffffffUL 557 #endif 558 559 /* 560 * Create special expldev for ZFS private use. 561 * Can't use standard expldev since it doesn't do 562 * what we want. The standard expldev() takes a 563 * dev32_t in LP64 and expands it to a long dev_t. 564 * We need an interface that takes a dev32_t in ILP32 565 * and expands it to a long dev_t. 566 */ 567 static uint64_t 568 zfs_expldev(dev_t dev) 569 { 570 #ifndef _LP64 571 major_t major = (major_t)dev >> NBITSMINOR32 & MAXMAJ32; 572 return (((uint64_t)major << NBITSMINOR64) | 573 ((minor_t)dev & MAXMIN32)); 574 #else 575 return (dev); 576 #endif 577 } 578 579 /* 580 * Special cmpldev for ZFS private use. 581 * Can't use standard cmpldev since it takes 582 * a long dev_t and compresses it to dev32_t in 583 * LP64. We need to do a compaction of a long dev_t 584 * to a dev32_t in ILP32. 
585 */ 586 dev_t 587 zfs_cmpldev(uint64_t dev) 588 { 589 #ifndef _LP64 590 minor_t minor = (minor_t)dev & MAXMIN64; 591 major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64; 592 593 if (major > MAXMAJ32 || minor > MAXMIN32) 594 return (NODEV32); 595 596 return (((dev32_t)major << NBITSMINOR32) | minor); 597 #else 598 return (dev); 599 #endif 600 } 601 602 static void 603 zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, 604 dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) 605 { 606 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); 607 ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); 608 609 mutex_enter(&zp->z_lock); 610 611 ASSERT(zp->z_sa_hdl == NULL); 612 ASSERT(zp->z_acl_cached == NULL); 613 if (sa_hdl == NULL) { 614 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, 615 SA_HDL_SHARED, &zp->z_sa_hdl)); 616 } else { 617 zp->z_sa_hdl = sa_hdl; 618 sa_set_userp(sa_hdl, zp); 619 } 620 621 zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; 622 623 /* 624 * Slap on VROOT if we are the root znode 625 */ 626 if (zp->z_id == zfsvfs->z_root) 627 ZTOV(zp)->v_flag |= VROOT; 628 629 mutex_exit(&zp->z_lock); 630 vn_exists(ZTOV(zp)); 631 } 632 633 void 634 zfs_znode_dmu_fini(znode_t *zp) 635 { 636 ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || 637 zp->z_unlinked || 638 RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); 639 640 sa_handle_destroy(zp->z_sa_hdl); 641 zp->z_sa_hdl = NULL; 642 } 643 644 /* 645 * Construct a new znode/vnode and intialize. 
646 * 647 * This does not do a call to dmu_set_user() that is 648 * up to the caller to do, in case you don't want to 649 * return the znode 650 */ 651 static znode_t * 652 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, 653 dmu_object_type_t obj_type, sa_handle_t *hdl) 654 { 655 znode_t *zp; 656 vnode_t *vp; 657 uint64_t mode; 658 uint64_t parent; 659 sa_bulk_attr_t bulk[9]; 660 int count = 0; 661 662 zp = kmem_cache_alloc(znode_cache, KM_SLEEP); 663 664 ASSERT(zp->z_dirlocks == NULL); 665 ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 666 zp->z_moved = 0; 667 668 /* 669 * Defer setting z_zfsvfs until the znode is ready to be a candidate for 670 * the zfs_znode_move() callback. 671 */ 672 zp->z_sa_hdl = NULL; 673 zp->z_unlinked = 0; 674 zp->z_atime_dirty = 0; 675 zp->z_mapcnt = 0; 676 zp->z_id = db->db_object; 677 zp->z_blksz = blksz; 678 zp->z_seq = 0x7A4653; 679 zp->z_sync_cnt = 0; 680 681 vp = ZTOV(zp); 682 vn_reinit(vp); 683 684 zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); 685 686 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); 687 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8); 688 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 689 &zp->z_size, 8); 690 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, 691 &zp->z_links, 8); 692 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 693 &zp->z_pflags, 8); 694 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); 695 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 696 &zp->z_atime, 16); 697 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 698 &zp->z_uid, 8); 699 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, 700 &zp->z_gid, 8); 701 702 if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) { 703 if (hdl == NULL) 704 sa_handle_destroy(zp->z_sa_hdl); 705 kmem_cache_free(znode_cache, zp); 706 return (NULL); 707 } 708 709 zp->z_mode = mode; 710 vp->v_vfsp = zfsvfs->z_parent->z_vfs; 
711 712 vp->v_type = IFTOVT((mode_t)mode); 713 714 switch (vp->v_type) { 715 case VDIR: 716 if (zp->z_pflags & ZFS_XATTR) { 717 vn_setops(vp, zfs_xdvnodeops); 718 vp->v_flag |= V_XATTRDIR; 719 } else { 720 vn_setops(vp, zfs_dvnodeops); 721 } 722 zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ 723 break; 724 case VBLK: 725 case VCHR: 726 { 727 uint64_t rdev; 728 VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), 729 &rdev, sizeof (rdev)) == 0); 730 731 vp->v_rdev = zfs_cmpldev(rdev); 732 } 733 /*FALLTHROUGH*/ 734 case VFIFO: 735 case VSOCK: 736 case VDOOR: 737 vn_setops(vp, zfs_fvnodeops); 738 break; 739 case VREG: 740 vp->v_flag |= VMODSORT; 741 if (parent == zfsvfs->z_shares_dir) { 742 ASSERT(zp->z_uid == 0 && zp->z_gid == 0); 743 vn_setops(vp, zfs_sharevnodeops); 744 } else { 745 vn_setops(vp, zfs_fvnodeops); 746 } 747 break; 748 case VLNK: 749 vn_setops(vp, zfs_symvnodeops); 750 break; 751 default: 752 vn_setops(vp, zfs_evnodeops); 753 break; 754 } 755 756 mutex_enter(&zfsvfs->z_znodes_lock); 757 list_insert_tail(&zfsvfs->z_all_znodes, zp); 758 membar_producer(); 759 /* 760 * Everything else must be valid before assigning z_zfsvfs makes the 761 * znode eligible for zfs_znode_move(). 762 */ 763 zp->z_zfsvfs = zfsvfs; 764 mutex_exit(&zfsvfs->z_znodes_lock); 765 766 VFS_HOLD(zfsvfs->z_vfs); 767 return (zp); 768 } 769 770 static uint64_t empty_xattr; 771 static uint64_t pad[4]; 772 static zfs_acl_phys_t acl_phys; 773 /* 774 * Create a new DMU object to hold a zfs znode. 775 * 776 * IN: dzp - parent directory for new znode 777 * vap - file attributes for new znode 778 * tx - dmu transaction id for zap operations 779 * cr - credentials of caller 780 * flag - flags: 781 * IS_ROOT_NODE - new object will be root 782 * IS_XATTR - new object is an attribute 783 * bonuslen - length of bonus buffer 784 * setaclp - File/Dir initial ACL 785 * fuidp - Tracks fuid allocation. 
786 * 787 * OUT: zpp - allocated znode 788 * 789 */ 790 void 791 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, 792 uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) 793 { 794 uint64_t crtime[2], atime[2], mtime[2], ctime[2]; 795 uint64_t mode, size, links, parent, pflags; 796 uint64_t dzp_pflags = 0; 797 uint64_t rdev = 0; 798 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 799 dmu_buf_t *db; 800 timestruc_t now; 801 uint64_t gen, obj; 802 int bonuslen; 803 int dnodesize; 804 sa_handle_t *sa_hdl; 805 dmu_object_type_t obj_type; 806 sa_bulk_attr_t *sa_attrs; 807 int cnt = 0; 808 zfs_acl_locator_cb_t locate = { 0 }; 809 810 ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 811 812 if (zfsvfs->z_replay) { 813 obj = vap->va_nodeid; 814 now = vap->va_ctime; /* see zfs_replay_create() */ 815 gen = vap->va_nblocks; /* ditto */ 816 dnodesize = vap->va_fsid; /* ditto */ 817 } else { 818 obj = 0; 819 gethrestime(&now); 820 gen = dmu_tx_get_txg(tx); 821 dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); 822 } 823 824 if (dnodesize == 0) 825 dnodesize = DNODE_MIN_SIZE; 826 827 obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; 828 bonuslen = (obj_type == DMU_OT_SA) ? 829 DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; 830 831 /* 832 * Create a new DMU object. 833 */ 834 /* 835 * There's currently no mechanism for pre-reading the blocks that will 836 * be needed to allocate a new object, so we accept the small chance 837 * that there will be an i/o error and we will fail one of the 838 * assertions below. 
839 */ 840 if (vap->va_type == VDIR) { 841 if (zfsvfs->z_replay) { 842 VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, 843 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 844 obj_type, bonuslen, dnodesize, tx)); 845 } else { 846 obj = zap_create_norm_dnsize(zfsvfs->z_os, 847 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 848 obj_type, bonuslen, dnodesize, tx); 849 } 850 } else { 851 if (zfsvfs->z_replay) { 852 VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, 853 DMU_OT_PLAIN_FILE_CONTENTS, 0, 854 obj_type, bonuslen, dnodesize, tx)); 855 } else { 856 obj = dmu_object_alloc_dnsize(zfsvfs->z_os, 857 DMU_OT_PLAIN_FILE_CONTENTS, 0, 858 obj_type, bonuslen, dnodesize, tx); 859 } 860 } 861 862 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); 863 VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); 864 865 /* 866 * If this is the root, fix up the half-initialized parent pointer 867 * to reference the just-allocated physical data area. 868 */ 869 if (flag & IS_ROOT_NODE) { 870 dzp->z_id = obj; 871 } else { 872 dzp_pflags = dzp->z_pflags; 873 } 874 875 /* 876 * If parent is an xattr, so am I. 877 */ 878 if (dzp_pflags & ZFS_XATTR) { 879 flag |= IS_XATTR; 880 } 881 882 if (zfsvfs->z_use_fuids) 883 pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; 884 else 885 pflags = 0; 886 887 if (vap->va_type == VDIR) { 888 size = 2; /* contents ("." and "..") */ 889 links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; 890 } else { 891 size = links = 0; 892 } 893 894 if (vap->va_type == VBLK || vap->va_type == VCHR) { 895 rdev = zfs_expldev(vap->va_rdev); 896 } 897 898 parent = dzp->z_id; 899 mode = acl_ids->z_mode; 900 if (flag & IS_XATTR) 901 pflags |= ZFS_XATTR; 902 903 /* 904 * No execs denied will be deterimed when zfs_mode_compute() is called. 
905 */ 906 pflags |= acl_ids->z_aclp->z_hints & 907 (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| 908 ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); 909 910 ZFS_TIME_ENCODE(&now, crtime); 911 ZFS_TIME_ENCODE(&now, ctime); 912 913 if (vap->va_mask & AT_ATIME) { 914 ZFS_TIME_ENCODE(&vap->va_atime, atime); 915 } else { 916 ZFS_TIME_ENCODE(&now, atime); 917 } 918 919 if (vap->va_mask & AT_MTIME) { 920 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 921 } else { 922 ZFS_TIME_ENCODE(&now, mtime); 923 } 924 925 /* Now add in all of the "SA" attributes */ 926 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, 927 &sa_hdl)); 928 929 /* 930 * Setup the array of attributes to be replaced/set on the new file 931 * 932 * order for DMU_OT_ZNODE is critical since it needs to be constructed 933 * in the old znode_phys_t format. Don't change this ordering 934 */ 935 sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); 936 937 if (obj_type == DMU_OT_ZNODE) { 938 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), 939 NULL, &atime, 16); 940 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), 941 NULL, &mtime, 16); 942 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), 943 NULL, &ctime, 16); 944 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), 945 NULL, &crtime, 16); 946 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), 947 NULL, &gen, 8); 948 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), 949 NULL, &mode, 8); 950 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), 951 NULL, &size, 8); 952 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), 953 NULL, &parent, 8); 954 } else { 955 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), 956 NULL, &mode, 8); 957 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), 958 NULL, &size, 8); 959 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), 960 NULL, &gen, 8); 961 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), 962 NULL, &acl_ids->z_fuid, 8); 963 SA_ADD_BULK_ATTR(sa_attrs, cnt, 
SA_ZPL_GID(zfsvfs), 964 NULL, &acl_ids->z_fgid, 8); 965 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), 966 NULL, &parent, 8); 967 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), 968 NULL, &pflags, 8); 969 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), 970 NULL, &atime, 16); 971 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), 972 NULL, &mtime, 16); 973 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), 974 NULL, &ctime, 16); 975 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), 976 NULL, &crtime, 16); 977 } 978 979 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); 980 981 if (obj_type == DMU_OT_ZNODE) { 982 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, 983 &empty_xattr, 8); 984 } 985 if (obj_type == DMU_OT_ZNODE || 986 (vap->va_type == VBLK || vap->va_type == VCHR)) { 987 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), 988 NULL, &rdev, 8); 989 990 } 991 if (obj_type == DMU_OT_ZNODE) { 992 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), 993 NULL, &pflags, 8); 994 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, 995 &acl_ids->z_fuid, 8); 996 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, 997 &acl_ids->z_fgid, 8); 998 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, 999 sizeof (uint64_t) * 4); 1000 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, 1001 &acl_phys, sizeof (zfs_acl_phys_t)); 1002 } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { 1003 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, 1004 &acl_ids->z_aclp->z_acl_count, 8); 1005 locate.cb_aclp = acl_ids->z_aclp; 1006 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), 1007 zfs_acl_data_locator, &locate, 1008 acl_ids->z_aclp->z_acl_bytes); 1009 mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, 1010 acl_ids->z_fuid, acl_ids->z_fgid); 1011 } 1012 1013 VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); 1014 1015 if (!(flag & 
IS_ROOT_NODE)) { 1016 *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); 1017 ASSERT(*zpp != NULL); 1018 } else { 1019 /* 1020 * If we are creating the root node, the "parent" we 1021 * passed in is the znode for the root. 1022 */ 1023 *zpp = dzp; 1024 1025 (*zpp)->z_sa_hdl = sa_hdl; 1026 } 1027 1028 (*zpp)->z_pflags = pflags; 1029 (*zpp)->z_mode = mode; 1030 (*zpp)->z_dnodesize = dnodesize; 1031 1032 if (vap->va_mask & AT_XVATTR) 1033 zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); 1034 1035 if (obj_type == DMU_OT_ZNODE || 1036 acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { 1037 VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); 1038 } 1039 kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); 1040 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); 1041 } 1042 1043 /* 1044 * Update in-core attributes. It is assumed the caller will be doing an 1045 * sa_bulk_update to push the changes out. 1046 */ 1047 void 1048 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) 1049 { 1050 xoptattr_t *xoap; 1051 1052 xoap = xva_getxoptattr(xvap); 1053 ASSERT(xoap); 1054 1055 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { 1056 uint64_t times[2]; 1057 ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); 1058 (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), 1059 ×, sizeof (times), tx); 1060 XVA_SET_RTN(xvap, XAT_CREATETIME); 1061 } 1062 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 1063 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, 1064 zp->z_pflags, tx); 1065 XVA_SET_RTN(xvap, XAT_READONLY); 1066 } 1067 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 1068 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, 1069 zp->z_pflags, tx); 1070 XVA_SET_RTN(xvap, XAT_HIDDEN); 1071 } 1072 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 1073 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, 1074 zp->z_pflags, tx); 1075 XVA_SET_RTN(xvap, XAT_SYSTEM); 1076 } 1077 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 1078 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, 1079 zp->z_pflags, tx); 1080 XVA_SET_RTN(xvap, XAT_ARCHIVE); 
	}
	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_NOUNLINK);
	}
	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_APPENDONLY);
	}
	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_NODUMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_OPAQUE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
		zfs_sa_set_scanstamp(zp, xvap, tx);
		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_REPARSE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_OFFLINE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_SPARSE);
	}
}

/*
 * Look up the znode for object obj_num, creating the in-core znode from
 * its SA/bonus data if one does not already exist. On success *zpp holds
 * a referenced znode (VN_HOLD taken for the existing-znode path); returns
 * ENOENT for unlinked or not-yet-created objects, EINVAL for objects that
 * are not ZPL znodes.
 */
int
zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
{
	dmu_object_info_t doi;
	dmu_buf_t *db;
	znode_t *zp;
	int err;
	sa_handle_t *hdl;

	*zpp = NULL;

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);

	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	/* Reject objects whose bonus buffer is not a ZPL znode/SA layout. */
	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_SA &&
	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
		sa_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (SET_ERROR(EINVAL));
	}

	hdl = dmu_buf_get_user(db);
	if (hdl != NULL) {
		zp = sa_get_userdata(hdl);


		/*
		 * Since "SA" does immediate eviction we
		 * should never find a sa handle that doesn't
		 * know about the znode.
		 */

		ASSERT3P(zp, !=, NULL);

		mutex_enter(&zp->z_lock);
		ASSERT3U(zp->z_id, ==, obj_num);
		if (zp->z_unlinked) {
			err = SET_ERROR(ENOENT);
		} else {
			VN_HOLD(ZTOV(zp));
			*zpp = zp;
			err = 0;
		}
		mutex_exit(&zp->z_lock);
		sa_buf_rele(db, NULL);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (err);
	}

	/*
	 * Not found create new znode/vnode
	 * but only if file exists.
	 *
	 * There is a small window where zfs_vget() could
	 * find this object while a file create is still in
	 * progress. This is checked for in zfs_znode_alloc()
	 *
	 * if zfs_znode_alloc() fails it will drop the hold on the
	 * bonus buffer.
1205 */ 1206 zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, 1207 doi.doi_bonus_type, NULL); 1208 if (zp == NULL) { 1209 err = SET_ERROR(ENOENT); 1210 } else { 1211 *zpp = zp; 1212 } 1213 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 1214 return (err); 1215 } 1216 1217 int 1218 zfs_rezget(znode_t *zp) 1219 { 1220 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1221 dmu_object_info_t doi; 1222 dmu_buf_t *db; 1223 uint64_t obj_num = zp->z_id; 1224 uint64_t mode; 1225 sa_bulk_attr_t bulk[8]; 1226 int err; 1227 int count = 0; 1228 uint64_t gen; 1229 1230 ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); 1231 1232 mutex_enter(&zp->z_acl_lock); 1233 if (zp->z_acl_cached) { 1234 zfs_acl_free(zp->z_acl_cached); 1235 zp->z_acl_cached = NULL; 1236 } 1237 1238 mutex_exit(&zp->z_acl_lock); 1239 ASSERT(zp->z_sa_hdl == NULL); 1240 err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); 1241 if (err) { 1242 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 1243 return (err); 1244 } 1245 1246 dmu_object_info_from_db(db, &doi); 1247 if (doi.doi_bonus_type != DMU_OT_SA && 1248 (doi.doi_bonus_type != DMU_OT_ZNODE || 1249 (doi.doi_bonus_type == DMU_OT_ZNODE && 1250 doi.doi_bonus_size < sizeof (znode_phys_t)))) { 1251 sa_buf_rele(db, NULL); 1252 ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 1253 return (SET_ERROR(EINVAL)); 1254 } 1255 1256 zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); 1257 1258 /* reload cached values */ 1259 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, 1260 &gen, sizeof (gen)); 1261 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 1262 &zp->z_size, sizeof (zp->z_size)); 1263 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, 1264 &zp->z_links, sizeof (zp->z_links)); 1265 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 1266 &zp->z_pflags, sizeof (zp->z_pflags)); 1267 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 1268 &zp->z_atime, sizeof (zp->z_atime)); 1269 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 1270 &zp->z_uid, sizeof (zp->z_uid)); 1271 
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
	    &zp->z_gid, sizeof (zp->z_gid));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
	    &mode, sizeof (mode));

	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
		zfs_znode_dmu_fini(zp);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (SET_ERROR(EIO));
	}

	zp->z_mode = mode;

	/*
	 * A generation mismatch means the object number was reused for a
	 * different file; this znode must not be re-attached to it.
	 */
	if (gen != zp->z_gen) {
		zfs_znode_dmu_fini(zp);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
		return (SET_ERROR(EIO));
	}

	zp->z_blksz = doi.doi_data_block_size;

	/*
	 * If the file has zero links, then it has been unlinked on the send
	 * side and it must be in the received unlinked set.
	 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
	 * stale data and to prevent automatic removal of the file in
	 * zfs_zinactive().  The file will be removed either when it is
	 * removed on the send side and the next incremental stream is
	 * received or when the unlinked set gets processed.
	 */
	zp->z_unlinked = (zp->z_links == 0);
	if (zp->z_unlinked)
		zfs_znode_dmu_fini(zp);

	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);

	return (0);
}

/*
 * Free the on-disk object(s) backing 'zp' (the znode object and, for
 * non-SA znodes, any external ACL object), then free the in-core znode.
 * Must be called with an assigned transaction 'tx'.
 */
void
zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	objset_t *os = zfsvfs->z_os;
	uint64_t obj = zp->z_id;
	uint64_t acl_obj = zfs_external_acl(zp);

	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
	if (acl_obj) {
		/* SA znodes keep their ACL in the SA; no external object. */
		VERIFY(!zp->z_is_sa);
		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
	}
	VERIFY(0 == dmu_object_free(os, obj, tx));
	zfs_znode_dmu_fini(zp);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
	zfs_znode_free(zp);
}

/*
 * Called when the vnode's last reference is being released.  Either
 * tears the znode down (removing the file if it was unlinked and the
 * filesystem is writable) or bails out if a new hold or cached pages
 * showed up in the meantime.
 */
void
zfs_zinactive(znode_t *zp)
{
	vnode_t *vp = ZTOV(zp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	uint64_t z_id = zp->z_id;

	ASSERT(zp->z_sa_hdl);

	/*
	 * Don't allow a zfs_zget() while we're trying to release this znode.
	 */
	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);

	mutex_enter(&zp->z_lock);
	mutex_enter(&vp->v_lock);
	VN_RELE_LOCKED(vp);
	if (vp->v_count > 0 || vn_has_cached_data(vp)) {
		/*
		 * If the hold count is greater than zero, somebody has
		 * obtained a new reference on this znode while we were
		 * processing it here, so we are done.  If we still have
		 * mapped pages then we are also done, since we don't
		 * want to inactivate the znode until the pages get pushed.
		 *
		 * XXX - if vn_has_cached_data(vp) is true, but count == 0,
		 * this seems like it would leave the znode hanging with
		 * no chance to go inactive...
		 */
		mutex_exit(&vp->v_lock);
		mutex_exit(&zp->z_lock);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * If this was the last reference to a file with no links, remove
	 * the file from the file system unless the file system is mounted
	 * read-only.  That can happen, for example, if the file system was
	 * originally read-write, the file was opened, then unlinked and
	 * the file system was made read-only before the file was finally
	 * closed.  The file will remain in the unlinked set.
	 */
	if (zp->z_unlinked) {
		ASSERT(!zfsvfs->z_issnap);
		if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) {
			mutex_exit(&zp->z_lock);
			ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
			/* zfs_rmnode() frees the znode itself. */
			zfs_rmnode(zp);
			return;
		}
	}

	mutex_exit(&zp->z_lock);
	zfs_znode_dmu_fini(zp);
	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
	zfs_znode_free(zp);
}

/*
 * Release the in-core znode: unlink it from the per-fs znode list, drop
 * any cached ACL, return it to the znode kmem cache, and release the
 * VFS hold it contributed.  The vnode must have no remaining holds.
 */
void
zfs_znode_free(znode_t *zp)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	vn_invalid(ZTOV(zp));

	ASSERT(ZTOV(zp)->v_count == 0);

	mutex_enter(&zfsvfs->z_znodes_lock);
	POINTER_INVALIDATE(&zp->z_zfsvfs);
	list_remove(&zfsvfs->z_all_znodes, zp);
	mutex_exit(&zfsvfs->z_znodes_lock);

	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}

	kmem_cache_free(znode_cache, zp);

	VFS_RELE(zfsvfs->z_vfs);
}

/*
 * Encode the current time into the caller-supplied mtime/ctime arrays
 * (and the in-core atime) according to 'flag' (AT_ATIME/AT_MTIME/AT_CTIME).
 * 'have_tx' indicates a sa_bulk_update will follow shortly, so atime
 * need not be marked dirty.  On fuids-capable filesystems the mtime/ctime
 * updates also set the ARCHIVE (and for mtime, AV_MODIFIED) pflags.
 */
void
zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
    uint64_t ctime[2], boolean_t have_tx)
{
	timestruc_t now;

	gethrestime(&now);

	if (have_tx) {	/* will sa_bulk_update happen really soon? */
		zp->z_atime_dirty = 0;
		zp->z_seq++;
	} else {
		zp->z_atime_dirty = 1;
	}

	if (flag & AT_ATIME) {
		ZFS_TIME_ENCODE(&now, zp->z_atime);
	}

	if (flag & AT_MTIME) {
		ZFS_TIME_ENCODE(&now, mtime);
		if (zp->z_zfsvfs->z_use_fuids) {
			zp->z_pflags |= (ZFS_ARCHIVE |
			    ZFS_AV_MODIFIED);
		}
	}

	if (flag & AT_CTIME) {
		ZFS_TIME_ENCODE(&now, ctime);
		if (zp->z_zfsvfs->z_use_fuids)
			zp->z_pflags |= ZFS_ARCHIVE;
	}
}

/*
 * Grow the block size for a file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		size	- requested block size
 *		tx	- open transaction.
 *
 * NOTE: this function assumes that the znode is write locked.
 */
void
zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
{
	int error;
	u_longlong_t dummy;

	if (size <= zp->z_blksz)
		return;
	/*
	 * If the file size is already greater than the current blocksize,
	 * we will not grow.  If there is more than one block in a file,
	 * the blocksize cannot change.
	 */
	if (zp->z_blksz && zp->z_size > zp->z_blksz)
		return;

	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
	    size, 0, tx);

	/* ENOTSUP simply means the DMU declined; keep the current size. */
	if (error == ENOTSUP)
		return;
	ASSERT0(error);

	/* What blocksize did we actually get? */
	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
}

/*
 * This is a dummy interface used when pvn_vplist_dirty() should *not*
 * be calling back into the fs for a putpage().  E.g.: when truncating
 * a file, the pages being "thrown away" don't need to be written out.
 */
/* ARGSUSED */
static int
zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
    int flags, cred_t *cr)
{
	/* Should never be reached. */
	ASSERT(0);
	return (0);
}

/*
 * Increase the file length
 *
 *	IN:	zp	- znode of file to free data in.
 *		end	- new end-of-file
 *
 *	RETURN:	0 on success, error code on failure
 */
static int
zfs_extend(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	dmu_tx_t *tx;
	locked_range_t *lr;
	uint64_t newblksz;
	int error;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end <= zp->z_size) {
		rangelock_exit(lr);
		return (0);
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	if (end > zp->z_blksz &&
	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
		/*
		 * We are growing the file past the current block size.
		 */
		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
			/*
			 * File's blocksize is already larger than the
			 * "recordsize" property.  Only let it grow to
			 * the next power of 2.
			 */
			ASSERT(!ISP2(zp->z_blksz));
			newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
		} else {
			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
		}
		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
	} else {
		newblksz = 0;
	}

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		rangelock_exit(lr);
		return (error);
	}

	if (newblksz)
		zfs_grow_blocksize(zp, newblksz, tx);

	zp->z_size = end;

	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
	    &zp->z_size, sizeof (zp->z_size), tx));

	rangelock_exit(lr);

	dmu_tx_commit(tx);

	return (0);
}

/*
 * Free space in a file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of section to free.
 *		len	- length of section to free.
 *
 *	RETURN:	0 on success, error code on failure
 */
static int
zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	locked_range_t *lr;
	int error;

	/*
	 * Lock the range being freed.
	 */
	lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (off >= zp->z_size) {
		rangelock_exit(lr);
		return (0);
	}

	/* Clamp the range so we never free past EOF. */
	if (off + len > zp->z_size)
		len = zp->z_size - off;

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);

	rangelock_exit(lr);

	return (error);
}

/*
 * Truncate a file
 *
 *	IN:	zp	- znode of file to free data in.
 *		end	- new end-of-file.
 *
 *	RETURN:	0 on success, error code on failure
 */
static int
zfs_trunc(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	vnode_t *vp = ZTOV(zp);
	dmu_tx_t *tx;
	locked_range_t *lr;
	int error;
	sa_bulk_attr_t bulk[2];
	int count = 0;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end >= zp->z_size) {
		rangelock_exit(lr);
		return (0);
	}

	/* Free everything from the new EOF to the end of the object. */
	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
	    DMU_OBJECT_END);
	if (error) {
		rangelock_exit(lr);
		return (error);
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		rangelock_exit(lr);
		return (error);
	}

	zp->z_size = end;
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
	    NULL, &zp->z_size, sizeof (zp->z_size));

	if (end == 0) {
		/* An empty file cannot be sparse. */
		zp->z_pflags &= ~ZFS_SPARSE;
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &zp->z_pflags, 8);
	}
	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);

	dmu_tx_commit(tx);

	/*
	 * Clear any mapped pages in the truncated region.  This has to
	 * happen outside of the transaction to avoid the possibility of
	 * a deadlock with someone trying to push a page that we are
	 * about to invalidate.
	 */
	if (vn_has_cached_data(vp)) {
		page_t *pp;
		uint64_t start = end & PAGEMASK;
		int poff = end & PAGEOFFSET;

		if (poff != 0 && (pp = page_lookup(vp, start, SE_SHARED))) {
			/*
			 * We need to zero a partial page.
			 */
			pagezero(pp, poff, PAGESIZE - poff);
			start += PAGESIZE;
			page_unlock(pp);
		}
		error = pvn_vplist_dirty(vp, start, zfs_no_putpage,
		    B_INVAL | B_TRUNC, NULL);
		ASSERT(error == 0);
	}

	rangelock_exit(lr);

	return (0);
}

/*
 * Free space in a file
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of range
 *		len	- end of range (0 => EOF)
 *		flag	- current file open mode flags.
 *		log	- TRUE if this action should be logged
 *
 *	RETURN:	0 on success, error code on failure
 */
int
zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
{
	vnode_t *vp = ZTOV(zp);
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	zilog_t *zilog = zfsvfs->z_log;
	uint64_t mode;
	uint64_t mtime[2], ctime[2];
	sa_bulk_attr_t bulk[3];
	int count = 0;
	int error;

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
	    sizeof (mode))) != 0)
		return (error);

	/* Starting past EOF is a pure extend; nothing to free. */
	if (off > zp->z_size) {
		error = zfs_extend(zp, off+len);
		if (error == 0 && log)
			goto log;
		else
			return (error);
	}

	/*
	 * Check for any locks in the region to be freed.
	 */

	if (MANDLOCK(vp, (mode_t)mode)) {
		uint64_t length = (len ?
len : zp->z_size - off);
		if (error = chklock(vp, FWRITE, off, length, flag, NULL))
			return (error);
	}

	if (len == 0) {
		/* len == 0 means truncate to 'off'. */
		error = zfs_trunc(zp, off);
	} else {
		if ((error = zfs_free_range(zp, off, len)) == 0 &&
		    off + len > zp->z_size)
			error = zfs_extend(zp, off+len);
	}
	if (error || !log)
		return (error);
log:
	/* Update timestamps/flags and log the truncate in the ZIL. */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
	    NULL, &zp->z_pflags, 8);
	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
	ASSERT(error == 0);

	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);

	dmu_tx_commit(tx);
	return (0);
}

/*
 * Create the on-disk ZPL structure for a brand-new objset: master node,
 * SA registry (for SA-capable versions), unlinked set, root directory and
 * shares directory.  'zplprops' supplies the initial ZPL properties.
 * Runs entirely inside the caller's transaction 'tx'.
 */
void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
	uint64_t moid, obj, sa_obj, version;
	uint64_t sense = ZFS_CASE_SENSITIVE;
	uint64_t norm = 0;
	nvpair_t *elem;
	int error;
	int i;
	znode_t *rootzp = NULL;
	zfsvfs_t *zfsvfs;
	vnode_t *vp;
	vattr_t vattr;
	znode_t *zp;
	zfs_acl_ids_t acl_ids;

	/*
	 * First attempt to create master node.
	 */
	/*
	 * In an empty objset, there are no blocks to read and thus
	 * there can be no i/o errors (which we assert below).
	 */
	moid = MASTER_NODE_OBJ;
	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	/*
	 * Set starting attributes.
	 */
	version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
	elem = NULL;
	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
		/* For the moment we expect all zpl props to be uint64_ts */
		uint64_t val;
		char *name;

		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
		VERIFY(nvpair_value_uint64(elem, &val) == 0);
		name = nvpair_name(elem);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
			/* The requested version may cap the default. */
			if (val < version)
				version = val;
		} else {
			error = zap_update(os, moid, name, 8, 1, &val, tx);
		}
		ASSERT(error == 0);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
			norm = val;
		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
			sense = val;
	}
	ASSERT(version != 0);
	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);

	/*
	 * Create zap object used for SA attribute registration
	 */

	if (version >= ZPL_VERSION_SA) {
		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
		    DMU_OT_NONE, 0, tx);
		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
		ASSERT(error == 0);
	} else {
		sa_obj = 0;
	}
	/*
	 * Create a delete queue.
	 */
	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);

	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
	ASSERT(error == 0);

	/*
	 * Create root znode.  Create minimal znode/vnode/zfsvfs
	 * to allow zfs_mknode to work.
	 */
	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
	vattr.va_type = VDIR;
	vattr.va_mode = S_IFDIR|0755;
	vattr.va_uid = crgetuid(cr);
	vattr.va_gid = crgetgid(cr);

	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
	rootzp->z_moved = 0;
	rootzp->z_unlinked = 0;
	rootzp->z_atime_dirty = 0;
	rootzp->z_is_sa = USE_SA(version, os);

	vp = ZTOV(rootzp);
	vn_reinit(vp);
	vp->v_type = VDIR;

	/* Temporary, minimal zfsvfs — freed again at the end. */
	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
	zfsvfs->z_os = os;
	zfsvfs->z_parent = zfsvfs;
	zfsvfs->z_version = version;
	zfsvfs->z_use_fuids = USE_FUIDS(version, os);
	zfsvfs->z_use_sa = USE_SA(version, os);
	zfsvfs->z_norm = norm;

	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
	    &zfsvfs->z_attr_table);

	ASSERT(error == 0);

	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
1894 */ 1895 if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) 1896 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 1897 1898 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1899 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), 1900 offsetof(znode_t, z_link_node)); 1901 1902 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1903 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 1904 1905 rootzp->z_zfsvfs = zfsvfs; 1906 VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, 1907 cr, NULL, &acl_ids)); 1908 zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); 1909 ASSERT3P(zp, ==, rootzp); 1910 ASSERT(!vn_in_dnlc(ZTOV(rootzp))); /* not valid to move */ 1911 error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); 1912 ASSERT(error == 0); 1913 zfs_acl_ids_free(&acl_ids); 1914 POINTER_INVALIDATE(&rootzp->z_zfsvfs); 1915 1916 ZTOV(rootzp)->v_count = 0; 1917 sa_handle_destroy(rootzp->z_sa_hdl); 1918 kmem_cache_free(znode_cache, rootzp); 1919 1920 /* 1921 * Create shares directory 1922 */ 1923 1924 error = zfs_create_share_dir(zfsvfs, tx); 1925 1926 ASSERT(error == 0); 1927 1928 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 1929 mutex_destroy(&zfsvfs->z_hold_mtx[i]); 1930 kmem_free(zfsvfs, sizeof (zfsvfs_t)); 1931 } 1932 1933 #endif /* _KERNEL */ 1934 1935 static int 1936 zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) 1937 { 1938 uint64_t sa_obj = 0; 1939 int error; 1940 1941 error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); 1942 if (error != 0 && error != ENOENT) 1943 return (error); 1944 1945 error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); 1946 return (error); 1947 } 1948 1949 static int 1950 zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, 1951 dmu_buf_t **db, void *tag) 1952 { 1953 dmu_object_info_t doi; 1954 int error; 1955 1956 if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) 1957 return (error); 1958 1959 dmu_object_info_from_db(*db, &doi); 1960 if 
((doi.doi_bonus_type != DMU_OT_SA && 1961 doi.doi_bonus_type != DMU_OT_ZNODE) || 1962 doi.doi_bonus_type == DMU_OT_ZNODE && 1963 doi.doi_bonus_size < sizeof (znode_phys_t)) { 1964 sa_buf_rele(*db, tag); 1965 return (SET_ERROR(ENOTSUP)); 1966 } 1967 1968 error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); 1969 if (error != 0) { 1970 sa_buf_rele(*db, tag); 1971 return (error); 1972 } 1973 1974 return (0); 1975 } 1976 1977 void 1978 zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) 1979 { 1980 sa_handle_destroy(hdl); 1981 sa_buf_rele(db, tag); 1982 } 1983 1984 /* 1985 * Given an object number, return its parent object number and whether 1986 * or not the object is an extended attribute directory. 1987 */ 1988 static int 1989 zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, 1990 uint64_t *pobjp, int *is_xattrdir) 1991 { 1992 uint64_t parent; 1993 uint64_t pflags; 1994 uint64_t mode; 1995 uint64_t parent_mode; 1996 sa_bulk_attr_t bulk[3]; 1997 sa_handle_t *sa_hdl; 1998 dmu_buf_t *sa_db; 1999 int count = 0; 2000 int error; 2001 2002 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, 2003 &parent, sizeof (parent)); 2004 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, 2005 &pflags, sizeof (pflags)); 2006 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, 2007 &mode, sizeof (mode)); 2008 2009 if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) 2010 return (error); 2011 2012 /* 2013 * When a link is removed its parent pointer is not changed and will 2014 * be invalid. There are two cases where a link is removed but the 2015 * file stays around, when it goes to the delete queue and when there 2016 * are additional links. 
2017 */ 2018 error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); 2019 if (error != 0) 2020 return (error); 2021 2022 error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); 2023 zfs_release_sa_handle(sa_hdl, sa_db, FTAG); 2024 if (error != 0) 2025 return (error); 2026 2027 *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); 2028 2029 /* 2030 * Extended attributes can be applied to files, directories, etc. 2031 * Otherwise the parent must be a directory. 2032 */ 2033 if (!*is_xattrdir && !S_ISDIR(parent_mode)) 2034 return (SET_ERROR(EINVAL)); 2035 2036 *pobjp = parent; 2037 2038 return (0); 2039 } 2040 2041 /* 2042 * Given an object number, return some zpl level statistics 2043 */ 2044 static int 2045 zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, 2046 zfs_stat_t *sb) 2047 { 2048 sa_bulk_attr_t bulk[4]; 2049 int count = 0; 2050 2051 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, 2052 &sb->zs_mode, sizeof (sb->zs_mode)); 2053 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, 2054 &sb->zs_gen, sizeof (sb->zs_gen)); 2055 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, 2056 &sb->zs_links, sizeof (sb->zs_links)); 2057 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, 2058 &sb->zs_ctime, sizeof (sb->zs_ctime)); 2059 2060 return (sa_bulk_lookup(hdl, bulk, count)); 2061 } 2062 2063 static int 2064 zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, 2065 sa_attr_type_t *sa_table, char *buf, int len) 2066 { 2067 sa_handle_t *sa_hdl; 2068 sa_handle_t *prevhdl = NULL; 2069 dmu_buf_t *prevdb = NULL; 2070 dmu_buf_t *sa_db = NULL; 2071 char *path = buf + len - 1; 2072 int error; 2073 2074 *path = '\0'; 2075 sa_hdl = hdl; 2076 2077 uint64_t deleteq_obj; 2078 VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, 2079 ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); 2080 error = zap_lookup_int(osp, deleteq_obj, obj); 2081 if (error == 0) { 2082 return (ESTALE); 2083 } else if (error != 
ENOENT) { 2084 return (error); 2085 } 2086 error = 0; 2087 2088 for (;;) { 2089 uint64_t pobj; 2090 char component[MAXNAMELEN + 2]; 2091 size_t complen; 2092 int is_xattrdir; 2093 2094 if (prevdb) 2095 zfs_release_sa_handle(prevhdl, prevdb, FTAG); 2096 2097 if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, 2098 &is_xattrdir)) != 0) 2099 break; 2100 2101 if (pobj == obj) { 2102 if (path[0] != '/') 2103 *--path = '/'; 2104 break; 2105 } 2106 2107 component[0] = '/'; 2108 if (is_xattrdir) { 2109 (void) sprintf(component + 1, "<xattrdir>"); 2110 } else { 2111 error = zap_value_search(osp, pobj, obj, 2112 ZFS_DIRENT_OBJ(-1ULL), component + 1); 2113 if (error != 0) 2114 break; 2115 } 2116 2117 complen = strlen(component); 2118 path -= complen; 2119 ASSERT(path >= buf); 2120 bcopy(component, path, complen); 2121 obj = pobj; 2122 2123 if (sa_hdl != hdl) { 2124 prevhdl = sa_hdl; 2125 prevdb = sa_db; 2126 } 2127 error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); 2128 if (error != 0) { 2129 sa_hdl = prevhdl; 2130 sa_db = prevdb; 2131 break; 2132 } 2133 } 2134 2135 if (sa_hdl != NULL && sa_hdl != hdl) { 2136 ASSERT(sa_db != NULL); 2137 zfs_release_sa_handle(sa_hdl, sa_db, FTAG); 2138 } 2139 2140 if (error == 0) 2141 (void) memmove(buf, path, buf + len - path); 2142 2143 return (error); 2144 } 2145 2146 int 2147 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) 2148 { 2149 sa_attr_type_t *sa_table; 2150 sa_handle_t *hdl; 2151 dmu_buf_t *db; 2152 int error; 2153 2154 error = zfs_sa_setup(osp, &sa_table); 2155 if (error != 0) 2156 return (error); 2157 2158 error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); 2159 if (error != 0) 2160 return (error); 2161 2162 error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); 2163 2164 zfs_release_sa_handle(hdl, db, FTAG); 2165 return (error); 2166 } 2167 2168 int 2169 zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, 2170 char *buf, int len) 2171 { 2172 char *path = buf + len - 1; 
	sa_attr_type_t *sa_table;
	sa_handle_t *hdl;
	dmu_buf_t *db;
	int error;

	*path = '\0';

	error = zfs_sa_setup(osp, &sa_table);
	if (error != 0)
		return (error);

	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
	if (error != 0)
		return (error);

	/* Gather the stats first, then resolve the path with the same handle. */
	error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
	if (error != 0) {
		zfs_release_sa_handle(hdl, db, FTAG);
		return (error);
	}

	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);

	zfs_release_sa_handle(hdl, db, FTAG);
	return (error);
}