1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2013, 2016 by Delphix. All rights reserved. 25 * Copyright 2017 Nexenta Systems, Inc. 26 */ 27 28 #include <sys/types.h> 29 #include <sys/param.h> 30 #include <sys/time.h> 31 #include <sys/sysmacros.h> 32 #include <sys/vfs.h> 33 #include <sys/vnode.h> 34 #include <sys/file.h> 35 #include <sys/kmem.h> 36 #include <sys/uio.h> 37 #include <sys/pathname.h> 38 #include <sys/cmn_err.h> 39 #include <sys/errno.h> 40 #include <sys/stat.h> 41 #include <sys/sunddi.h> 42 #include <sys/random.h> 43 #include <sys/policy.h> 44 #include <sys/zfs_dir.h> 45 #include <sys/zfs_acl.h> 46 #include <sys/zfs_vnops.h> 47 #include <sys/fs/zfs.h> 48 #include <sys/zap.h> 49 #include <sys/dmu.h> 50 #include <sys/atomic.h> 51 #include <sys/zfs_ctldir.h> 52 #include <sys/zfs_fuid.h> 53 #include <sys/sa.h> 54 #include <sys/zfs_sa.h> 55 #include <sys/dmu_objset.h> 56 #include <sys/dsl_dir.h> 57 58 /* 59 * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups 60 * of names after deciding which is the appropriate lookup interface. 61 */ 62 static int 63 zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name, 64 matchtype_t mt, boolean_t update, int *deflags, pathname_t *rpnp, 65 uint64_t *zoid) 66 { 67 boolean_t conflict = B_FALSE; 68 int error; 69 70 if (zfsvfs->z_norm) { 71 size_t bufsz = 0; 72 char *buf = NULL; 73 74 if (rpnp) { 75 buf = rpnp->pn_buf; 76 bufsz = rpnp->pn_bufsize; 77 } 78 79 /* 80 * In the non-mixed case we only expect there would ever 81 * be one match, but we need to use the normalizing lookup. 82 */ 83 error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, 84 zoid, mt, buf, bufsz, &conflict); 85 } else { 86 error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); 87 } 88 89 /* 90 * Allow multiple entries provided the first entry is 91 * the object id. Non-zpl consumers may safely make 92 * use of the additional space. 93 * 94 * XXX: This should be a feature flag for compatibility 95 */ 96 if (error == EOVERFLOW) 97 error = 0; 98 99 if (zfsvfs->z_norm && !error && deflags) 100 *deflags = conflict ? ED_CASE_CONFLICT : 0; 101 102 *zoid = ZFS_DIRENT_OBJ(*zoid); 103 104 return (error); 105 } 106 107 /* 108 * Lock a directory entry. A dirlock on <dzp, name> protects that name 109 * in dzp's directory zap object. As long as you hold a dirlock, you can 110 * assume two things: (1) dzp cannot be reaped, and (2) no other thread 111 * can change the zap entry for (i.e. link or unlink) this name. 112 * 113 * Input arguments: 114 * dzp - znode for directory 115 * name - name of entry to lock 116 * flag - ZNEW: if the entry already exists, fail with EEXIST. 117 * ZEXISTS: if the entry does not exist, fail with ENOENT. 118 * ZSHARED: allow concurrent access with other ZSHARED callers. 119 * ZXATTR: we want dzp's xattr directory 120 * ZCILOOK: On a mixed sensitivity file system, 121 * this lookup should be case-insensitive. 122 * ZCIEXACT: On a purely case-insensitive file system, 123 * this lookup should be case-sensitive. 124 * ZRENAMING: we are locking for renaming, force narrow locks 125 * ZHAVELOCK: Don't grab the z_name_lock for this call. The 126 * current thread already holds it. 127 * 128 * Output arguments: 129 * zpp - pointer to the znode for the entry (NULL if there isn't one) 130 * dlpp - pointer to the dirlock for this entry (NULL on error) 131 * direntflags - (case-insensitive lookup only) 132 * flags if multiple case-sensitive matches exist in directory 133 * realpnp - (case-insensitive lookup only) 134 * actual name matched within the directory 135 * 136 * Return value: 0 on success or errno on failure. 137 * 138 * NOTE: Always checks for, and rejects, '.' and '..'. 139 * NOTE: For case-insensitive file systems we take wide locks (see below), 140 * but return znode pointers to a single match. 141 */ 142 int 143 zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, 144 znode_t **zpp, int flag, int *direntflags, pathname_t *realpnp) 145 { 146 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 147 zfs_dirlock_t *dl; 148 boolean_t update; 149 matchtype_t mt = 0; 150 uint64_t zoid; 151 int error = 0; 152 int cmpflags; 153 154 *zpp = NULL; 155 *dlpp = NULL; 156 157 /* 158 * Verify that we are not trying to lock '.', '..', or '.zfs' 159 */ 160 if ((name[0] == '.' && 161 (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) || 162 (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)) 163 return (SET_ERROR(EEXIST)); 164 165 /* 166 * Case sensitivity and normalization preferences are set when 167 * the file system is created. These are stored in the 168 * zfsvfs->z_case and zfsvfs->z_norm fields. These choices 169 * affect what vnodes can be cached in the DNLC, how we 170 * perform zap lookups, and the "width" of our dirlocks. 171 * 172 * A normal dirlock locks a single name. Note that with 173 * normalization a name can be composed multiple ways, but 174 * when normalized, these names all compare equal. A wide 175 * dirlock locks multiple names. We need these when the file 176 * system is supporting mixed-mode access. It is sometimes 177 * necessary to lock all case permutations of file name at 178 * once so that simultaneous case-insensitive/case-sensitive 179 * behaves as rationally as possible. 180 */ 181 182 /* 183 * When matching we may need to normalize & change case according to 184 * FS settings. 185 * 186 * Note that a normalized match is necessary for a case insensitive 187 * filesystem when the lookup request is not exact because normalization 188 * can fold case independent of normalizing code point sequences. 189 * 190 * See the table above zfs_dropname(). 191 */ 192 if (zfsvfs->z_norm != 0) { 193 mt = MT_NORMALIZE; 194 195 /* 196 * Determine if the match needs to honor the case specified in 197 * lookup, and if so keep track of that so that during 198 * normalization we don't fold case. 199 */ 200 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE && 201 (flag & ZCIEXACT)) || 202 (zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) { 203 mt |= MT_MATCH_CASE; 204 } 205 } 206 207 /* 208 * Only look in or update the DNLC if we are looking for the 209 * name on a file system that does not require normalization 210 * or case folding. We can also look there if we happen to be 211 * on a non-normalizing, mixed sensitivity file system IF we 212 * are looking for the exact name. 213 * 214 * Maybe can add TO-UPPERed version of name to dnlc in ci-only 215 * case for performance improvement? 216 */ 217 update = !zfsvfs->z_norm || 218 (zfsvfs->z_case == ZFS_CASE_MIXED && 219 !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK)); 220 221 /* 222 * ZRENAMING indicates we are in a situation where we should 223 * take narrow locks regardless of the file system's 224 * preferences for normalizing and case folding. This will 225 * prevent us deadlocking trying to grab the same wide lock 226 * twice if the two names happen to be case-insensitive 227 * matches. 228 */ 229 if (flag & ZRENAMING) 230 cmpflags = 0; 231 else 232 cmpflags = zfsvfs->z_norm; 233 234 /* 235 * Wait until there are no locks on this name. 236 * 237 * Don't grab the lock if it is already held. However, cannot 238 * have both ZSHARED and ZHAVELOCK together. 239 */ 240 ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK)); 241 if (!(flag & ZHAVELOCK)) 242 rw_enter(&dzp->z_name_lock, RW_READER); 243 244 mutex_enter(&dzp->z_lock); 245 for (;;) { 246 if (dzp->z_unlinked && !(flag & ZXATTR)) { 247 mutex_exit(&dzp->z_lock); 248 if (!(flag & ZHAVELOCK)) 249 rw_exit(&dzp->z_name_lock); 250 return (SET_ERROR(ENOENT)); 251 } 252 for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) { 253 if ((u8_strcmp(name, dl->dl_name, 0, cmpflags, 254 U8_UNICODE_LATEST, &error) == 0) || error != 0) 255 break; 256 } 257 if (error != 0) { 258 mutex_exit(&dzp->z_lock); 259 if (!(flag & ZHAVELOCK)) 260 rw_exit(&dzp->z_name_lock); 261 return (SET_ERROR(ENOENT)); 262 } 263 if (dl == NULL) { 264 /* 265 * Allocate a new dirlock and add it to the list. 266 */ 267 dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP); 268 cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); 269 dl->dl_name = name; 270 dl->dl_sharecnt = 0; 271 dl->dl_namelock = 0; 272 dl->dl_namesize = 0; 273 dl->dl_dzp = dzp; 274 dl->dl_next = dzp->z_dirlocks; 275 dzp->z_dirlocks = dl; 276 break; 277 } 278 if ((flag & ZSHARED) && dl->dl_sharecnt != 0) 279 break; 280 cv_wait(&dl->dl_cv, &dzp->z_lock); 281 } 282 283 /* 284 * If the z_name_lock was NOT held for this dirlock record it. 285 */ 286 if (flag & ZHAVELOCK) 287 dl->dl_namelock = 1; 288 289 if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) { 290 /* 291 * We're the second shared reference to dl. Make a copy of 292 * dl_name in case the first thread goes away before we do. 293 * Note that we initialize the new name before storing its 294 * pointer into dl_name, because the first thread may load 295 * dl->dl_name at any time. It'll either see the old value, 296 * which belongs to it, or the new shared copy; either is OK. 297 */ 298 dl->dl_namesize = strlen(dl->dl_name) + 1; 299 name = kmem_alloc(dl->dl_namesize, KM_SLEEP); 300 memcpy(name, dl->dl_name, dl->dl_namesize); 301 dl->dl_name = name; 302 } 303 304 mutex_exit(&dzp->z_lock); 305 306 /* 307 * We have a dirlock on the name. (Note that it is the dirlock, 308 * not the dzp's z_lock, that protects the name in the zap object.) 309 * See if there's an object by this name; if so, put a hold on it. 310 */ 311 if (flag & ZXATTR) { 312 error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, 313 sizeof (zoid)); 314 if (error == 0) 315 error = (zoid == 0 ? SET_ERROR(ENOENT) : 0); 316 } else { 317 error = zfs_match_find(zfsvfs, dzp, name, mt, 318 update, direntflags, realpnp, &zoid); 319 } 320 if (error) { 321 if (error != ENOENT || (flag & ZEXISTS)) { 322 zfs_dirent_unlock(dl); 323 return (error); 324 } 325 } else { 326 if (flag & ZNEW) { 327 zfs_dirent_unlock(dl); 328 return (SET_ERROR(EEXIST)); 329 } 330 error = zfs_zget(zfsvfs, zoid, zpp); 331 if (error) { 332 zfs_dirent_unlock(dl); 333 return (error); 334 } 335 } 336 337 *dlpp = dl; 338 339 return (0); 340 } 341 342 /* 343 * Unlock this directory entry and wake anyone who was waiting for it. 344 */ 345 void 346 zfs_dirent_unlock(zfs_dirlock_t *dl) 347 { 348 znode_t *dzp = dl->dl_dzp; 349 zfs_dirlock_t **prev_dl, *cur_dl; 350 351 mutex_enter(&dzp->z_lock); 352 353 if (!dl->dl_namelock) 354 rw_exit(&dzp->z_name_lock); 355 356 if (dl->dl_sharecnt > 1) { 357 dl->dl_sharecnt--; 358 mutex_exit(&dzp->z_lock); 359 return; 360 } 361 prev_dl = &dzp->z_dirlocks; 362 while ((cur_dl = *prev_dl) != dl) 363 prev_dl = &cur_dl->dl_next; 364 *prev_dl = dl->dl_next; 365 cv_broadcast(&dl->dl_cv); 366 mutex_exit(&dzp->z_lock); 367 368 if (dl->dl_namesize != 0) 369 kmem_free(dl->dl_name, dl->dl_namesize); 370 cv_destroy(&dl->dl_cv); 371 kmem_free(dl, sizeof (*dl)); 372 } 373 374 /* 375 * Look up an entry in a directory. 376 * 377 * NOTE: '.' and '..' are handled as special cases because 378 * no directory entries are actually stored for them. If this is 379 * the root of a filesystem, then '.zfs' is also treated as a 380 * special pseudo-directory. 381 */ 382 int 383 zfs_dirlook(znode_t *dzp, char *name, znode_t **zpp, int flags, 384 int *deflg, pathname_t *rpnp) 385 { 386 zfs_dirlock_t *dl; 387 znode_t *zp; 388 struct inode *ip; 389 int error = 0; 390 uint64_t parent; 391 392 if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { 393 *zpp = dzp; 394 zhold(*zpp); 395 } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { 396 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 397 398 /* 399 * If we are a snapshot mounted under .zfs, return 400 * the inode pointer for the snapshot directory. 401 */ 402 if ((error = sa_lookup(dzp->z_sa_hdl, 403 SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) 404 return (error); 405 406 if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) { 407 error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, 408 "snapshot", &ip, 0, kcred, NULL, NULL); 409 *zpp = ITOZ(ip); 410 return (error); 411 } 412 rw_enter(&dzp->z_parent_lock, RW_READER); 413 error = zfs_zget(zfsvfs, parent, &zp); 414 if (error == 0) 415 *zpp = zp; 416 rw_exit(&dzp->z_parent_lock); 417 } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { 418 if (ZTOZSB(dzp)->z_show_ctldir == ZFS_SNAPDIR_DISABLED) { 419 return (SET_ERROR(ENOENT)); 420 } 421 ip = zfsctl_root(dzp); 422 *zpp = ITOZ(ip); 423 } else { 424 int zf; 425 426 zf = ZEXISTS | ZSHARED; 427 if (flags & FIGNORECASE) 428 zf |= ZCILOOK; 429 430 error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp); 431 if (error == 0) { 432 *zpp = zp; 433 zfs_dirent_unlock(dl); 434 dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ 435 } 436 rpnp = NULL; 437 } 438 439 if ((flags & FIGNORECASE) && rpnp && !error) 440 (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize); 441 442 return (error); 443 } 444 445 /* 446 * unlinked Set (formerly known as the "delete queue") Error Handling 447 * 448 * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we 449 * don't specify the name of the entry that we will be manipulating. We 450 * also fib and say that we won't be adding any new entries to the 451 * unlinked set, even though we might (this is to lower the minimum file 452 * size that can be deleted in a full filesystem). So on the small 453 * chance that the nlink list is using a fat zap (ie. has more than 454 * 2000 entries), we *may* not pre-read a block that's needed. 455 * Therefore it is remotely possible for some of the assertions 456 * regarding the unlinked set below to fail due to i/o error. On a 457 * nondebug system, this will result in the space being leaked. 458 */ 459 void 460 zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) 461 { 462 zfsvfs_t *zfsvfs = ZTOZSB(zp); 463 464 ASSERT(zp->z_unlinked); 465 ASSERT(ZTOI(zp)->i_nlink == 0); 466 467 VERIFY3U(0, ==, 468 zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); 469 470 dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1); 471 } 472 473 /* 474 * Clean up any znodes that had no links when we either crashed or 475 * (force) umounted the file system. 476 */ 477 static void 478 zfs_unlinked_drain_task(void *arg) 479 { 480 zfsvfs_t *zfsvfs = arg; 481 zap_cursor_t zc; 482 zap_attribute_t *zap = zap_attribute_alloc(); 483 dmu_object_info_t doi; 484 znode_t *zp; 485 int error; 486 487 ASSERT3B(zfsvfs->z_draining, ==, B_TRUE); 488 489 /* 490 * Iterate over the contents of the unlinked set. 491 */ 492 for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); 493 zap_cursor_retrieve(&zc, zap) == 0 && !zfsvfs->z_drain_cancel; 494 zap_cursor_advance(&zc)) { 495 496 /* 497 * See what kind of object we have in list 498 */ 499 500 error = dmu_object_info(zfsvfs->z_os, 501 zap->za_first_integer, &doi); 502 if (error != 0) 503 continue; 504 505 ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || 506 (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); 507 /* 508 * We need to re-mark these list entries for deletion, 509 * so we pull them back into core and set zp->z_unlinked. 510 */ 511 error = zfs_zget(zfsvfs, zap->za_first_integer, &zp); 512 513 /* 514 * We may pick up znodes that are already marked for deletion. 515 * This could happen during the purge of an extended attribute 516 * directory. All we need to do is skip over them, since they 517 * are already in the system marked z_unlinked. 518 */ 519 if (error != 0) 520 continue; 521 522 zp->z_unlinked = B_TRUE; 523 524 /* 525 * zrele() decrements the znode's ref count and may cause 526 * it to be synchronously freed. We interrupt freeing 527 * of this znode by checking the return value of 528 * dmu_objset_zfs_unmounting() in dmu_free_long_range() 529 * when an unmount is requested. 530 */ 531 zrele(zp); 532 ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); 533 } 534 zap_cursor_fini(&zc); 535 536 zfsvfs->z_draining = B_FALSE; 537 zfsvfs->z_drain_task = TASKQID_INVALID; 538 zap_attribute_free(zap); 539 } 540 541 /* 542 * Sets z_draining then tries to dispatch async unlinked drain. 543 * If that fails executes synchronous unlinked drain. 544 */ 545 void 546 zfs_unlinked_drain(zfsvfs_t *zfsvfs) 547 { 548 ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); 549 ASSERT3B(zfsvfs->z_draining, ==, B_FALSE); 550 551 zfsvfs->z_draining = B_TRUE; 552 zfsvfs->z_drain_cancel = B_FALSE; 553 554 zfsvfs->z_drain_task = taskq_dispatch( 555 dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)), 556 zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP); 557 if (zfsvfs->z_drain_task == TASKQID_INVALID) { 558 zfs_dbgmsg("async zfs_unlinked_drain dispatch failed"); 559 zfs_unlinked_drain_task(zfsvfs); 560 } 561 } 562 563 /* 564 * Wait for the unlinked drain taskq task to stop. This will interrupt the 565 * unlinked set processing if it is in progress. 566 */ 567 void 568 zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs) 569 { 570 ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); 571 572 if (zfsvfs->z_draining) { 573 zfsvfs->z_drain_cancel = B_TRUE; 574 taskq_cancel_id(dsl_pool_unlinked_drain_taskq( 575 dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task); 576 zfsvfs->z_drain_task = TASKQID_INVALID; 577 zfsvfs->z_draining = B_FALSE; 578 } 579 } 580 581 /* 582 * Delete the entire contents of a directory. Return a count 583 * of the number of entries that could not be deleted. If we encounter 584 * an error, return a count of at least one so that the directory stays 585 * in the unlinked set. 586 * 587 * NOTE: this function assumes that the directory is inactive, 588 * so there is no need to lock its entries before deletion. 589 * Also, it assumes the directory contents is *only* regular 590 * files. 591 */ 592 static int 593 zfs_purgedir(znode_t *dzp) 594 { 595 zap_cursor_t zc; 596 zap_attribute_t *zap = zap_attribute_alloc(); 597 znode_t *xzp; 598 dmu_tx_t *tx; 599 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 600 zfs_dirlock_t dl; 601 int skipped = 0; 602 int error; 603 604 for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); 605 (error = zap_cursor_retrieve(&zc, zap)) == 0; 606 zap_cursor_advance(&zc)) { 607 error = zfs_zget(zfsvfs, 608 ZFS_DIRENT_OBJ(zap->za_first_integer), &xzp); 609 if (error) { 610 skipped += 1; 611 continue; 612 } 613 614 ASSERT(S_ISREG(ZTOI(xzp)->i_mode) || 615 S_ISLNK(ZTOI(xzp)->i_mode)); 616 617 tx = dmu_tx_create(zfsvfs->z_os); 618 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 619 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap->za_name); 620 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); 621 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 622 /* Is this really needed ? */ 623 zfs_sa_upgrade_txholds(tx, xzp); 624 dmu_tx_mark_netfree(tx); 625 error = dmu_tx_assign(tx, TXG_WAIT); 626 if (error) { 627 dmu_tx_abort(tx); 628 zfs_zrele_async(xzp); 629 skipped += 1; 630 continue; 631 } 632 memset(&dl, 0, sizeof (dl)); 633 dl.dl_dzp = dzp; 634 dl.dl_name = zap->za_name; 635 636 error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); 637 if (error) 638 skipped += 1; 639 dmu_tx_commit(tx); 640 641 zfs_zrele_async(xzp); 642 } 643 zap_cursor_fini(&zc); 644 zap_attribute_free(zap); 645 if (error != ENOENT) 646 skipped += 1; 647 return (skipped); 648 } 649 650 void 651 zfs_rmnode(znode_t *zp) 652 { 653 zfsvfs_t *zfsvfs = ZTOZSB(zp); 654 objset_t *os = zfsvfs->z_os; 655 znode_t *xzp = NULL; 656 dmu_tx_t *tx; 657 znode_hold_t *zh; 658 uint64_t z_id = zp->z_id; 659 uint64_t acl_obj; 660 uint64_t xattr_obj; 661 uint64_t links; 662 int error; 663 664 ASSERT(ZTOI(zp)->i_nlink == 0); 665 ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0); 666 667 /* 668 * If this is an attribute directory, purge its contents. 669 */ 670 if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) { 671 if (zfs_purgedir(zp) != 0) { 672 /* 673 * Not enough space to delete some xattrs. 674 * Leave it in the unlinked set. 675 */ 676 zh = zfs_znode_hold_enter(zfsvfs, z_id); 677 zfs_znode_dmu_fini(zp); 678 zfs_znode_hold_exit(zfsvfs, zh); 679 return; 680 } 681 } 682 683 /* 684 * Free up all the data in the file. We don't do this for directories 685 * because we need truncate and remove to be in the same tx, like in 686 * zfs_znode_delete(). Otherwise, if we crash here we'll end up with 687 * an inconsistent truncated zap object in the delete queue. Note a 688 * truncated file is harmless since it only contains user data. 689 */ 690 if (S_ISREG(ZTOI(zp)->i_mode)) { 691 error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); 692 if (error) { 693 /* 694 * Not enough space or we were interrupted by unmount. 695 * Leave the file in the unlinked set. 696 */ 697 zh = zfs_znode_hold_enter(zfsvfs, z_id); 698 zfs_znode_dmu_fini(zp); 699 zfs_znode_hold_exit(zfsvfs, zh); 700 return; 701 } 702 } 703 704 /* 705 * If the file has extended attributes, we're going to unlink 706 * the xattr dir. 707 */ 708 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 709 &xattr_obj, sizeof (xattr_obj)); 710 if (error == 0 && xattr_obj) { 711 error = zfs_zget(zfsvfs, xattr_obj, &xzp); 712 ASSERT(error == 0); 713 } 714 715 acl_obj = zfs_external_acl(zp); 716 717 /* 718 * Set up the final transaction. 719 */ 720 tx = dmu_tx_create(os); 721 dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); 722 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 723 if (xzp) { 724 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); 725 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); 726 } 727 if (acl_obj) 728 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); 729 730 zfs_sa_upgrade_txholds(tx, zp); 731 error = dmu_tx_assign(tx, TXG_WAIT); 732 if (error) { 733 /* 734 * Not enough space to delete the file. Leave it in the 735 * unlinked set, leaking it until the fs is remounted (at 736 * which point we'll call zfs_unlinked_drain() to process it). 737 */ 738 dmu_tx_abort(tx); 739 zh = zfs_znode_hold_enter(zfsvfs, z_id); 740 zfs_znode_dmu_fini(zp); 741 zfs_znode_hold_exit(zfsvfs, zh); 742 goto out; 743 } 744 745 if (xzp) { 746 ASSERT(error == 0); 747 mutex_enter(&xzp->z_lock); 748 xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ 749 clear_nlink(ZTOI(xzp)); /* no more links to it */ 750 links = 0; 751 VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), 752 &links, sizeof (links), tx)); 753 mutex_exit(&xzp->z_lock); 754 zfs_unlinked_add(xzp, tx); 755 } 756 757 mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock); 758 759 /* 760 * Remove this znode from the unlinked set. If a has rollback has 761 * occurred while a file is open and unlinked. Then when the file 762 * is closed post rollback it will not exist in the rolled back 763 * version of the unlinked object. 764 */ 765 error = zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, 766 zp->z_id, tx); 767 VERIFY(error == 0 || error == ENOENT); 768 769 uint64_t count; 770 if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) { 771 cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv); 772 } 773 774 mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock); 775 776 dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1); 777 778 zfs_znode_delete(zp, tx); 779 780 dmu_tx_commit(tx); 781 out: 782 if (xzp) 783 zfs_zrele_async(xzp); 784 } 785 786 static uint64_t 787 zfs_dirent(znode_t *zp, uint64_t mode) 788 { 789 uint64_t de = zp->z_id; 790 791 if (ZTOZSB(zp)->z_version >= ZPL_VERSION_DIRENT_TYPE) 792 de |= IFTODT(mode) << 60; 793 return (de); 794 } 795 796 /* 797 * Link zp into dl. Can fail in the following cases : 798 * - if zp has been unlinked. 799 * - if the number of entries with the same hash (aka. colliding entries) 800 * exceed the capacity of a leaf-block of fatzap and splitting of the 801 * leaf-block does not help. 802 */ 803 int 804 zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) 805 { 806 znode_t *dzp = dl->dl_dzp; 807 zfsvfs_t *zfsvfs = ZTOZSB(zp); 808 uint64_t value; 809 int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); 810 sa_bulk_attr_t bulk[5]; 811 uint64_t mtime[2], ctime[2]; 812 uint64_t links; 813 int count = 0; 814 int error; 815 816 mutex_enter(&zp->z_lock); 817 818 if (!(flag & ZRENAMING)) { 819 if (zp->z_unlinked) { /* no new links to unlinked zp */ 820 ASSERT(!(flag & (ZNEW | ZEXISTS))); 821 mutex_exit(&zp->z_lock); 822 return (SET_ERROR(ENOENT)); 823 } 824 if (!(flag & ZNEW)) { 825 /* 826 * ZNEW nodes come from zfs_mknode() where the link 827 * count has already been initialised 828 */ 829 inc_nlink(ZTOI(zp)); 830 links = ZTOI(zp)->i_nlink; 831 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), 832 NULL, &links, sizeof (links)); 833 } 834 } 835 836 value = zfs_dirent(zp, zp->z_mode); 837 error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1, 838 &value, tx); 839 840 /* 841 * zap_add could fail to add the entry if it exceeds the capacity of the 842 * leaf-block and zap_leaf_split() failed to help. 843 * The caller of this routine is responsible for failing the transaction 844 * which will rollback the SA updates done above. 845 */ 846 if (error != 0) { 847 if (!(flag & ZRENAMING) && !(flag & ZNEW)) 848 drop_nlink(ZTOI(zp)); 849 mutex_exit(&zp->z_lock); 850 return (error); 851 } 852 853 /* 854 * If we added a longname activate the SPA_FEATURE_LONGNAME. 855 */ 856 if (strlen(dl->dl_name) >= ZAP_MAXNAMELEN) { 857 dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os); 858 ds->ds_feature_activation[SPA_FEATURE_LONGNAME] = 859 (void *)B_TRUE; 860 } 861 862 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, 863 &dzp->z_id, sizeof (dzp->z_id)); 864 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 865 &zp->z_pflags, sizeof (zp->z_pflags)); 866 867 if (!(flag & ZNEW)) { 868 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 869 ctime, sizeof (ctime)); 870 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, 871 ctime); 872 } 873 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 874 ASSERT(error == 0); 875 876 mutex_exit(&zp->z_lock); 877 878 mutex_enter(&dzp->z_lock); 879 dzp->z_size++; 880 if (zp_is_dir) 881 inc_nlink(ZTOI(dzp)); 882 links = ZTOI(dzp)->i_nlink; 883 count = 0; 884 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 885 &dzp->z_size, sizeof (dzp->z_size)); 886 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, 887 &links, sizeof (links)); 888 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 889 mtime, sizeof (mtime)); 890 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 891 ctime, sizeof (ctime)); 892 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 893 &dzp->z_pflags, sizeof (dzp->z_pflags)); 894 zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); 895 error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); 896 ASSERT(error == 0); 897 mutex_exit(&dzp->z_lock); 898 899 return (0); 900 } 901 902 /* 903 * The match type in the code for this function should conform to: 904 * 905 * ------------------------------------------------------------------------ 906 * fs type | z_norm | lookup type | match type 907 * ---------|-------------|-------------|---------------------------------- 908 * CS !norm | 0 | 0 | 0 (exact) 909 * CS norm | formX | 0 | MT_NORMALIZE 910 * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE 911 * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE 912 * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE 913 * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE 914 * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE 915 * CM !norm | upper | ZCILOOK | MT_NORMALIZE 916 * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE 917 * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE 918 * 919 * Abbreviations: 920 * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed 921 * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER) 922 * formX = unicode normalization form set on fs creation 923 */ 924 static int 925 zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx, 926 int flag) 927 { 928 int error; 929 930 if (ZTOZSB(zp)->z_norm) { 931 matchtype_t mt = MT_NORMALIZE; 932 933 if ((ZTOZSB(zp)->z_case == ZFS_CASE_INSENSITIVE && 934 (flag & ZCIEXACT)) || 935 (ZTOZSB(zp)->z_case == ZFS_CASE_MIXED && 936 !(flag & ZCILOOK))) { 937 mt |= MT_MATCH_CASE; 938 } 939 940 error = zap_remove_norm(ZTOZSB(zp)->z_os, dzp->z_id, 941 dl->dl_name, mt, tx); 942 } else { 943 error = zap_remove(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 944 tx); 945 } 946 947 return (error); 948 } 949 950 static int 951 zfs_drop_nlink_locked(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp) 952 { 953 zfsvfs_t *zfsvfs = ZTOZSB(zp); 954 int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); 955 boolean_t unlinked = B_FALSE; 956 sa_bulk_attr_t bulk[3]; 957 uint64_t mtime[2], ctime[2]; 958 uint64_t links; 959 int count = 0; 960 int error; 961 962 if (zp_is_dir && !zfs_dirempty(zp)) 963 return (SET_ERROR(ENOTEMPTY)); 964 965 if (ZTOI(zp)->i_nlink <= zp_is_dir) { 966 zfs_panic_recover("zfs: link count on %lu is %u, " 967 "should be at least %u", zp->z_id, 968 (int)ZTOI(zp)->i_nlink, zp_is_dir + 1); 969 set_nlink(ZTOI(zp), zp_is_dir + 1); 970 } 971 drop_nlink(ZTOI(zp)); 972 if (ZTOI(zp)->i_nlink == zp_is_dir) { 973 zp->z_unlinked = B_TRUE; 974 clear_nlink(ZTOI(zp)); 975 unlinked = B_TRUE; 976 } else { 977 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), 978 NULL, &ctime, sizeof (ctime)); 979 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), 980 NULL, &zp->z_pflags, sizeof (zp->z_pflags)); 981 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, 982 ctime); 983 } 984 links = ZTOI(zp)->i_nlink; 985 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), 986 NULL, &links, sizeof (links)); 987 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 988 ASSERT3U(error, ==, 0); 989 990 if (unlinkedp != NULL) 991 *unlinkedp = unlinked; 992 else if (unlinked) 993 zfs_unlinked_add(zp, tx); 994 995 return (0); 996 } 997 998 /* 999 * Forcefully drop an nlink reference from (zp) and mark it for deletion if it 1000 * was the last link. This *must* only be done to znodes which have already 1001 * been zfs_link_destroy()'d with ZRENAMING. This is explicitly only used in 1002 * the error path of zfs_rename(), where we have to correct the nlink count if 1003 * we failed to link the target as well as failing to re-link the original 1004 * znodes. 1005 */ 1006 int 1007 zfs_drop_nlink(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp) 1008 { 1009 int error; 1010 1011 mutex_enter(&zp->z_lock); 1012 error = zfs_drop_nlink_locked(zp, tx, unlinkedp); 1013 mutex_exit(&zp->z_lock); 1014 1015 return (error); 1016 } 1017 1018 /* 1019 * Unlink zp from dl, and mark zp for deletion if this was the last link. Can 1020 * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY). 1021 * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. 1022 * If it's non-NULL, we use it to indicate whether the znode needs deletion, 1023 * and it's the caller's job to do it. 1024 */ 1025 int 1026 zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, 1027 boolean_t *unlinkedp) 1028 { 1029 znode_t *dzp = dl->dl_dzp; 1030 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1031 int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); 1032 boolean_t unlinked = B_FALSE; 1033 sa_bulk_attr_t bulk[5]; 1034 uint64_t mtime[2], ctime[2]; 1035 uint64_t links; 1036 int count = 0; 1037 int error; 1038 1039 if (!(flag & ZRENAMING)) { 1040 mutex_enter(&zp->z_lock); 1041 1042 if (zp_is_dir && !zfs_dirempty(zp)) { 1043 mutex_exit(&zp->z_lock); 1044 return (SET_ERROR(ENOTEMPTY)); 1045 } 1046 1047 /* 1048 * If we get here, we are going to try to remove the object. 1049 * First try removing the name from the directory; if that 1050 * fails, return the error. 1051 */ 1052 error = zfs_dropname(dl, zp, dzp, tx, flag); 1053 if (error != 0) { 1054 mutex_exit(&zp->z_lock); 1055 return (error); 1056 } 1057 1058 /* The only error is !zfs_dirempty() and we checked earlier. */ 1059 error = zfs_drop_nlink_locked(zp, tx, &unlinked); 1060 ASSERT3U(error, ==, 0); 1061 mutex_exit(&zp->z_lock); 1062 } else { 1063 error = zfs_dropname(dl, zp, dzp, tx, flag); 1064 if (error != 0) 1065 return (error); 1066 } 1067 1068 mutex_enter(&dzp->z_lock); 1069 dzp->z_size--; /* one dirent removed */ 1070 if (zp_is_dir) 1071 drop_nlink(ZTOI(dzp)); /* ".." link from zp */ 1072 links = ZTOI(dzp)->i_nlink; 1073 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), 1074 NULL, &links, sizeof (links)); 1075 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), 1076 NULL, &dzp->z_size, sizeof (dzp->z_size)); 1077 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), 1078 NULL, ctime, sizeof (ctime)); 1079 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), 1080 NULL, mtime, sizeof (mtime)); 1081 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), 1082 NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); 1083 zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); 1084 error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); 1085 ASSERT(error == 0); 1086 mutex_exit(&dzp->z_lock); 1087 1088 if (unlinkedp != NULL) 1089 *unlinkedp = unlinked; 1090 else if (unlinked) 1091 zfs_unlinked_add(zp, tx); 1092 1093 return (0); 1094 } 1095 1096 /* 1097 * Indicate whether the directory is empty. Works with or without z_lock 1098 * held, but can only be consider a hint in the latter case. Returns true 1099 * if only "." and ".." remain and there's no work in progress. 1100 * 1101 * The internal ZAP size, rather than zp->z_size, needs to be checked since 1102 * some consumers (Lustre) do not strictly maintain an accurate SA_ZPL_SIZE. 1103 */ 1104 boolean_t 1105 zfs_dirempty(znode_t *dzp) 1106 { 1107 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1108 uint64_t count; 1109 int error; 1110 1111 if (dzp->z_dirlocks != NULL) 1112 return (B_FALSE); 1113 1114 error = zap_count(zfsvfs->z_os, dzp->z_id, &count); 1115 if (error != 0 || count != 0) 1116 return (B_FALSE); 1117 1118 return (B_TRUE); 1119 } 1120 1121 int 1122 zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr) 1123 { 1124 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1125 znode_t *xzp; 1126 dmu_tx_t *tx; 1127 int error; 1128 zfs_acl_ids_t acl_ids; 1129 boolean_t fuid_dirtied; 1130 #ifdef ZFS_DEBUG 1131 uint64_t parent; 1132 #endif 1133 1134 *xzpp = NULL; 1135 1136 if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, 1137 &acl_ids, zfs_init_idmap)) != 0) 1138 return (error); 1139 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) { 1140 zfs_acl_ids_free(&acl_ids); 1141 return (SET_ERROR(EDQUOT)); 1142 } 1143 1144 tx = dmu_tx_create(zfsvfs->z_os); 1145 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 1146 ZFS_SA_BASE_ATTR_SIZE); 1147 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 1148 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 1149 fuid_dirtied = zfsvfs->z_fuid_dirty; 1150 if (fuid_dirtied) 1151 zfs_fuid_txhold(zfsvfs, tx); 1152 error = dmu_tx_assign(tx, TXG_WAIT); 1153 if (error) { 1154 zfs_acl_ids_free(&acl_ids); 1155 dmu_tx_abort(tx); 1156 return (error); 1157 } 1158 zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); 1159 1160 if (fuid_dirtied) 1161 zfs_fuid_sync(zfsvfs, tx); 1162 1163 #ifdef ZFS_DEBUG 1164 error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 1165 &parent, sizeof (parent)); 1166 ASSERT(error == 0 && parent == zp->z_id); 1167 #endif 1168 1169 VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, 1170 sizeof (xzp->z_id), tx)); 1171 1172 if (!zp->z_unlinked) 1173 zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "", NULL, 1174 acl_ids.z_fuidp, vap); 1175 1176 zfs_acl_ids_free(&acl_ids); 1177 dmu_tx_commit(tx); 1178 1179 *xzpp = xzp; 1180 1181 return (0); 1182 } 1183 1184 /* 1185 * Return a znode for the extended attribute directory for zp. 1186 * ** If the directory does not already exist, it is created ** 1187 * 1188 * IN: zp - znode to obtain attribute directory from 1189 * cr - credentials of caller 1190 * flags - flags from the VOP_LOOKUP call 1191 * 1192 * OUT: xipp - pointer to extended attribute znode 1193 * 1194 * RETURN: 0 on success 1195 * error number on failure 1196 */ 1197 int 1198 zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags) 1199 { 1200 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1201 znode_t *xzp; 1202 zfs_dirlock_t *dl; 1203 vattr_t va; 1204 int error; 1205 top: 1206 error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL); 1207 if (error) 1208 return (error); 1209 1210 if (xzp != NULL) { 1211 *xzpp = xzp; 1212 zfs_dirent_unlock(dl); 1213 return (0); 1214 } 1215 1216 if (!(flags & CREATE_XATTR_DIR)) { 1217 zfs_dirent_unlock(dl); 1218 return (SET_ERROR(ENOENT)); 1219 } 1220 1221 if (zfs_is_readonly(zfsvfs)) { 1222 zfs_dirent_unlock(dl); 1223 return (SET_ERROR(EROFS)); 1224 } 1225 1226 /* 1227 * The ability to 'create' files in an attribute 1228 * directory comes from the write_xattr permission on the base file. 1229 * 1230 * The ability to 'search' an attribute directory requires 1231 * read_xattr permission on the base file. 1232 * 1233 * Once in a directory the ability to read/write attributes 1234 * is controlled by the permissions on the attribute file. 1235 */ 1236 va.va_mask = ATTR_MODE | ATTR_UID | ATTR_GID; 1237 va.va_mode = S_IFDIR | S_ISVTX | 0777; 1238 zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); 1239 1240 va.va_dentry = NULL; 1241 error = zfs_make_xattrdir(zp, &va, xzpp, cr); 1242 zfs_dirent_unlock(dl); 1243 1244 if (error == ERESTART) { 1245 /* NB: we already did dmu_tx_wait() if necessary */ 1246 goto top; 1247 } 1248 1249 return (error); 1250 } 1251 1252 /* 1253 * Decide whether it is okay to remove within a sticky directory. 1254 * 1255 * In sticky directories, write access is not sufficient; 1256 * you can remove entries from a directory only if: 1257 * 1258 * you own the directory, 1259 * you own the entry, 1260 * you have write access to the entry, 1261 * or you are privileged (checked in secpolicy...). 1262 * 1263 * The function returns 0 if remove access is granted. 1264 */ 1265 int 1266 zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) 1267 { 1268 uid_t uid; 1269 uid_t downer; 1270 uid_t fowner; 1271 zfsvfs_t *zfsvfs = ZTOZSB(zdp); 1272 1273 if (zfsvfs->z_replay) 1274 return (0); 1275 1276 if ((zdp->z_mode & S_ISVTX) == 0) 1277 return (0); 1278 1279 downer = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zdp)->i_uid), 1280 cr, ZFS_OWNER); 1281 fowner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zp)->i_uid), 1282 cr, ZFS_OWNER); 1283 1284 if ((uid = crgetuid(cr)) == downer || uid == fowner || 1285 zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, 1286 zfs_init_idmap) == 0) 1287 return (0); 1288 else 1289 return (secpolicy_vnode_remove(cr)); 1290 } 1291