1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2013, 2016 by Delphix. All rights reserved. 25 * Copyright 2017 Nexenta Systems, Inc. 26 */ 27 28 #include <sys/types.h> 29 #include <sys/param.h> 30 #include <sys/time.h> 31 #include <sys/sysmacros.h> 32 #include <sys/vfs.h> 33 #include <sys/vnode.h> 34 #include <sys/file.h> 35 #include <sys/kmem.h> 36 #include <sys/uio.h> 37 #include <sys/pathname.h> 38 #include <sys/cmn_err.h> 39 #include <sys/errno.h> 40 #include <sys/stat.h> 41 #include <sys/sunddi.h> 42 #include <sys/random.h> 43 #include <sys/policy.h> 44 #include <sys/zfs_dir.h> 45 #include <sys/zfs_acl.h> 46 #include <sys/zfs_vnops.h> 47 #include <sys/fs/zfs.h> 48 #include <sys/zap.h> 49 #include <sys/dmu.h> 50 #include <sys/atomic.h> 51 #include <sys/zfs_ctldir.h> 52 #include <sys/zfs_fuid.h> 53 #include <sys/sa.h> 54 #include <sys/zfs_sa.h> 55 #include <sys/dmu_objset.h> 56 #include <sys/dsl_dir.h> 57 58 /* 59 * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups 60 * of names after deciding which is the appropriate lookup interface. 
61 */ 62 static int 63 zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name, 64 matchtype_t mt, boolean_t update, int *deflags, pathname_t *rpnp, 65 uint64_t *zoid) 66 { 67 boolean_t conflict = B_FALSE; 68 int error; 69 70 if (zfsvfs->z_norm) { 71 size_t bufsz = 0; 72 char *buf = NULL; 73 74 if (rpnp) { 75 buf = rpnp->pn_buf; 76 bufsz = rpnp->pn_bufsize; 77 } 78 79 /* 80 * In the non-mixed case we only expect there would ever 81 * be one match, but we need to use the normalizing lookup. 82 */ 83 error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, 84 zoid, mt, buf, bufsz, &conflict); 85 } else { 86 error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); 87 } 88 89 /* 90 * Allow multiple entries provided the first entry is 91 * the object id. Non-zpl consumers may safely make 92 * use of the additional space. 93 * 94 * XXX: This should be a feature flag for compatibility 95 */ 96 if (error == EOVERFLOW) 97 error = 0; 98 99 if (zfsvfs->z_norm && !error && deflags) 100 *deflags = conflict ? ED_CASE_CONFLICT : 0; 101 102 *zoid = ZFS_DIRENT_OBJ(*zoid); 103 104 return (error); 105 } 106 107 /* 108 * Lock a directory entry. A dirlock on <dzp, name> protects that name 109 * in dzp's directory zap object. As long as you hold a dirlock, you can 110 * assume two things: (1) dzp cannot be reaped, and (2) no other thread 111 * can change the zap entry for (i.e. link or unlink) this name. 112 * 113 * Input arguments: 114 * dzp - znode for directory 115 * name - name of entry to lock 116 * flag - ZNEW: if the entry already exists, fail with EEXIST. 117 * ZEXISTS: if the entry does not exist, fail with ENOENT. 118 * ZSHARED: allow concurrent access with other ZSHARED callers. 119 * ZXATTR: we want dzp's xattr directory 120 * ZCILOOK: On a mixed sensitivity file system, 121 * this lookup should be case-insensitive. 122 * ZCIEXACT: On a purely case-insensitive file system, 123 * this lookup should be case-sensitive. 
*		  ZRENAMING: we are locking for renaming, force narrow locks
 *		  ZHAVELOCK: Don't grab the z_name_lock for this call. The
 *			     current thread already holds it.
 *
 * Output arguments:
 *	zpp	- pointer to the znode for the entry (NULL if there isn't one)
 *	dlpp	- pointer to the dirlock for this entry (NULL on error)
 *	direntflags - (case-insensitive lookup only)
 *		flags if multiple case-sensitive matches exist in directory
 *	realpnp     - (case-insensitive lookup only)
 *		actual name matched within the directory
 *
 * Return value: 0 on success or errno on failure.
 *
 * NOTE: Always checks for, and rejects, '.' and '..' (and '.zfs' when
 *	 zfs_has_ctldir(dzp) is true); these fail with EEXIST.
 * NOTE: For case-insensitive file systems we take wide locks (see below),
 *	 but return znode pointers to a single match.
 * NOTE: When the name does not exist and ZEXISTS was not requested, the
 *	 call still succeeds: the dirlock is returned and *zpp stays NULL.
 */
int
zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name,
    znode_t **zpp, int flag, int *direntflags, pathname_t *realpnp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	zfs_dirlock_t *dl;
	boolean_t update;
	matchtype_t mt = 0;
	uint64_t zoid;
	int error = 0;
	int cmpflags;

	/* Outputs are NULL on every failure path. */
	*zpp = NULL;
	*dlpp = NULL;

	/*
	 * Verify that we are not trying to lock '.', '..', or '.zfs'
	 */
	if ((name[0] == '.' &&
	    (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) ||
	    (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0))
		return (SET_ERROR(EEXIST));

	/*
	 * Case sensitivity and normalization preferences are set when
	 * the file system is created.  These are stored in the
	 * zfsvfs->z_case and zfsvfs->z_norm fields.  These choices
	 * affect what vnodes can be cached in the DNLC, how we
	 * perform zap lookups, and the "width" of our dirlocks.
	 *
	 * A normal dirlock locks a single name.  Note that with
	 * normalization a name can be composed multiple ways, but
	 * when normalized, these names all compare equal.  A wide
	 * dirlock locks multiple names.  We need these when the file
	 * system is supporting mixed-mode access.  It is sometimes
	 * necessary to lock all case permutations of file name at
	 * once so that simultaneous case-insensitive/case-sensitive
	 * behaves as rationally as possible.
	 */

	/*
	 * When matching we may need to normalize & change case according to
	 * FS settings.
	 *
	 * Note that a normalized match is necessary for a case insensitive
	 * filesystem when the lookup request is not exact because normalization
	 * can fold case independent of normalizing code point sequences.
	 *
	 * See the table above zfs_dropname().
	 */
	if (zfsvfs->z_norm != 0) {
		mt = MT_NORMALIZE;

		/*
		 * Determine if the match needs to honor the case specified in
		 * lookup, and if so keep track of that so that during
		 * normalization we don't fold case.
		 */
		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE &&
		    (flag & ZCIEXACT)) ||
		    (zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) {
			mt |= MT_MATCH_CASE;
		}
	}

	/*
	 * Only look in or update the DNLC if we are looking for the
	 * name on a file system that does not require normalization
	 * or case folding.  We can also look there if we happen to be
	 * on a non-normalizing, mixed sensitivity file system IF we
	 * are looking for the exact name.
	 *
	 * Maybe can add TO-UPPERed version of name to dnlc in ci-only
	 * case for performance improvement?
	 *
	 * In this function 'update' is only computed here and handed to
	 * zfs_match_find() below.
	 */
	update = !zfsvfs->z_norm ||
	    (zfsvfs->z_case == ZFS_CASE_MIXED &&
	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));

	/*
	 * ZRENAMING indicates we are in a situation where we should
	 * take narrow locks regardless of the file system's
	 * preferences for normalizing and case folding.  This will
	 * prevent us deadlocking trying to grab the same wide lock
	 * twice if the two names happen to be case-insensitive
	 * matches.
	 */
	if (flag & ZRENAMING)
		cmpflags = 0;
	else
		cmpflags = zfsvfs->z_norm;

	/*
	 * Wait until there are no locks on this name.
	 *
	 * Don't grab the lock if it is already held. However, cannot
	 * have both ZSHARED and ZHAVELOCK together.
	 */
	ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));
	if (!(flag & ZHAVELOCK))
		rw_enter(&dzp->z_name_lock, RW_READER);

	mutex_enter(&dzp->z_lock);
	for (;;) {
		/*
		 * The directory may have been unlinked while we slept in
		 * cv_wait(); re-check on every pass.  ZXATTR callers are
		 * exempt from this check.
		 */
		if (dzp->z_unlinked && !(flag & ZXATTR)) {
			mutex_exit(&dzp->z_lock);
			if (!(flag & ZHAVELOCK))
				rw_exit(&dzp->z_name_lock);
			return (SET_ERROR(ENOENT));
		}
		/*
		 * Look for an existing dirlock whose name compares equal
		 * under cmpflags.  The scan also stops as soon as
		 * u8_strcmp() reports an error.
		 */
		for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
			if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
			    U8_UNICODE_LATEST, &error) == 0) || error != 0)
				break;
		}
		/* A comparison error is reported to the caller as ENOENT. */
		if (error != 0) {
			mutex_exit(&dzp->z_lock);
			if (!(flag & ZHAVELOCK))
				rw_exit(&dzp->z_name_lock);
			return (SET_ERROR(ENOENT));
		}
		if (dl == NULL) {
			/*
			 * Allocate a new dirlock and add it to the list.
			 * dl_name borrows the caller's buffer for now
			 * (dl_namesize == 0 marks it as not owned).
			 */
			dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
			cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
			dl->dl_name = name;
			dl->dl_sharecnt = 0;
			dl->dl_namelock = 0;
			dl->dl_namesize = 0;
			dl->dl_dzp = dzp;
			dl->dl_next = dzp->z_dirlocks;
			dzp->z_dirlocks = dl;
			break;
		}
		/* An existing lock can be joined only if both sides share. */
		if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
			break;
		/* Otherwise sleep until the holder broadcasts, then rescan. */
		cv_wait(&dl->dl_cv, &dzp->z_lock);
	}

	/*
	 * With ZHAVELOCK we did not take z_name_lock ourselves (the caller
	 * already holds it).  Record that in the dirlock so that
	 * zfs_dirent_unlock() does not release it.
	 */
	if (flag & ZHAVELOCK)
		dl->dl_namelock = 1;

	if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
		/*
		 * We're the second shared reference to dl.  Make a copy of
		 * dl_name in case the first thread goes away before we do.
		 * Note that we initialize the new name before storing its
		 * pointer into dl_name, because the first thread may load
		 * dl->dl_name at any time.  It'll either see the old value,
		 * which belongs to it, or the new shared copy; either is OK.
		 */
		dl->dl_namesize = strlen(dl->dl_name) + 1;
		name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
		memcpy(name, dl->dl_name, dl->dl_namesize);
		dl->dl_name = name;
	}

	mutex_exit(&dzp->z_lock);

	/*
	 * We have a dirlock on the name.  (Note that it is the dirlock,
	 * not the dzp's z_lock, that protects the name in the zap object.)
	 * See if there's an object by this name; if so, put a hold on it.
	 */
	if (flag & ZXATTR) {
		/* The xattr directory is an SA attribute; 0 means "none". */
		error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
		    sizeof (zoid));
		if (error == 0)
			error = (zoid == 0 ? SET_ERROR(ENOENT) : 0);
	} else {
		error = zfs_match_find(zfsvfs, dzp, name, mt,
		    update, direntflags, realpnp, &zoid);
	}
	if (error) {
		/*
		 * A missing entry is only a failure when the caller asked
		 * for ZEXISTS; any other error always is.
		 */
		if (error != ENOENT || (flag & ZEXISTS)) {
			zfs_dirent_unlock(dl);
			return (error);
		}
	} else {
		/* The entry exists, which ZNEW callers must not see. */
		if (flag & ZNEW) {
			zfs_dirent_unlock(dl);
			return (SET_ERROR(EEXIST));
		}
		error = zfs_zget(zfsvfs, zoid, zpp);
		if (error) {
			zfs_dirent_unlock(dl);
			return (error);
		}
	}

	*dlpp = dl;

	return (0);
}

/*
 * Unlock this directory entry and wake anyone who was waiting for it.
344 */ 345 void 346 zfs_dirent_unlock(zfs_dirlock_t *dl) 347 { 348 znode_t *dzp = dl->dl_dzp; 349 zfs_dirlock_t **prev_dl, *cur_dl; 350 351 mutex_enter(&dzp->z_lock); 352 353 if (!dl->dl_namelock) 354 rw_exit(&dzp->z_name_lock); 355 356 if (dl->dl_sharecnt > 1) { 357 dl->dl_sharecnt--; 358 mutex_exit(&dzp->z_lock); 359 return; 360 } 361 prev_dl = &dzp->z_dirlocks; 362 while ((cur_dl = *prev_dl) != dl) 363 prev_dl = &cur_dl->dl_next; 364 *prev_dl = dl->dl_next; 365 cv_broadcast(&dl->dl_cv); 366 mutex_exit(&dzp->z_lock); 367 368 if (dl->dl_namesize != 0) 369 kmem_free(dl->dl_name, dl->dl_namesize); 370 cv_destroy(&dl->dl_cv); 371 kmem_free(dl, sizeof (*dl)); 372 } 373 374 /* 375 * Look up an entry in a directory. 376 * 377 * NOTE: '.' and '..' are handled as special cases because 378 * no directory entries are actually stored for them. If this is 379 * the root of a filesystem, then '.zfs' is also treated as a 380 * special pseudo-directory. 381 */ 382 int 383 zfs_dirlook(znode_t *dzp, char *name, znode_t **zpp, int flags, 384 int *deflg, pathname_t *rpnp) 385 { 386 zfs_dirlock_t *dl; 387 znode_t *zp; 388 struct inode *ip; 389 int error = 0; 390 uint64_t parent; 391 392 if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { 393 *zpp = dzp; 394 zhold(*zpp); 395 } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { 396 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 397 398 /* 399 * If we are a snapshot mounted under .zfs, return 400 * the inode pointer for the snapshot directory. 
401 */ 402 if ((error = sa_lookup(dzp->z_sa_hdl, 403 SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) 404 return (error); 405 406 if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) { 407 error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, 408 "snapshot", &ip, 0, kcred, NULL, NULL); 409 *zpp = ITOZ(ip); 410 return (error); 411 } 412 rw_enter(&dzp->z_parent_lock, RW_READER); 413 error = zfs_zget(zfsvfs, parent, &zp); 414 if (error == 0) 415 *zpp = zp; 416 rw_exit(&dzp->z_parent_lock); 417 } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { 418 ip = zfsctl_root(dzp); 419 *zpp = ITOZ(ip); 420 } else { 421 int zf; 422 423 zf = ZEXISTS | ZSHARED; 424 if (flags & FIGNORECASE) 425 zf |= ZCILOOK; 426 427 error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp); 428 if (error == 0) { 429 *zpp = zp; 430 zfs_dirent_unlock(dl); 431 dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ 432 } 433 rpnp = NULL; 434 } 435 436 if ((flags & FIGNORECASE) && rpnp && !error) 437 (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize); 438 439 return (error); 440 } 441 442 /* 443 * unlinked Set (formerly known as the "delete queue") Error Handling 444 * 445 * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we 446 * don't specify the name of the entry that we will be manipulating. We 447 * also fib and say that we won't be adding any new entries to the 448 * unlinked set, even though we might (this is to lower the minimum file 449 * size that can be deleted in a full filesystem). So on the small 450 * chance that the nlink list is using a fat zap (ie. has more than 451 * 2000 entries), we *may* not pre-read a block that's needed. 452 * Therefore it is remotely possible for some of the assertions 453 * regarding the unlinked set below to fail due to i/o error. On a 454 * nondebug system, this will result in the space being leaked. 
455 */ 456 void 457 zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) 458 { 459 zfsvfs_t *zfsvfs = ZTOZSB(zp); 460 461 ASSERT(zp->z_unlinked); 462 ASSERT(ZTOI(zp)->i_nlink == 0); 463 464 VERIFY3U(0, ==, 465 zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); 466 467 dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1); 468 } 469 470 /* 471 * Clean up any znodes that had no links when we either crashed or 472 * (force) umounted the file system. 473 */ 474 static void 475 zfs_unlinked_drain_task(void *arg) 476 { 477 zfsvfs_t *zfsvfs = arg; 478 zap_cursor_t zc; 479 zap_attribute_t zap; 480 dmu_object_info_t doi; 481 znode_t *zp; 482 int error; 483 484 ASSERT3B(zfsvfs->z_draining, ==, B_TRUE); 485 486 /* 487 * Iterate over the contents of the unlinked set. 488 */ 489 for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); 490 zap_cursor_retrieve(&zc, &zap) == 0 && !zfsvfs->z_drain_cancel; 491 zap_cursor_advance(&zc)) { 492 493 /* 494 * See what kind of object we have in list 495 */ 496 497 error = dmu_object_info(zfsvfs->z_os, 498 zap.za_first_integer, &doi); 499 if (error != 0) 500 continue; 501 502 ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || 503 (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); 504 /* 505 * We need to re-mark these list entries for deletion, 506 * so we pull them back into core and set zp->z_unlinked. 507 */ 508 error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); 509 510 /* 511 * We may pick up znodes that are already marked for deletion. 512 * This could happen during the purge of an extended attribute 513 * directory. All we need to do is skip over them, since they 514 * are already in the system marked z_unlinked. 515 */ 516 if (error != 0) 517 continue; 518 519 zp->z_unlinked = B_TRUE; 520 521 /* 522 * zrele() decrements the znode's ref count and may cause 523 * it to be synchronously freed. 
We interrupt freeing 524 * of this znode by checking the return value of 525 * dmu_objset_zfs_unmounting() in dmu_free_long_range() 526 * when an unmount is requested. 527 */ 528 zrele(zp); 529 ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); 530 } 531 zap_cursor_fini(&zc); 532 533 zfsvfs->z_draining = B_FALSE; 534 zfsvfs->z_drain_task = TASKQID_INVALID; 535 } 536 537 /* 538 * Sets z_draining then tries to dispatch async unlinked drain. 539 * If that fails executes synchronous unlinked drain. 540 */ 541 void 542 zfs_unlinked_drain(zfsvfs_t *zfsvfs) 543 { 544 ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); 545 ASSERT3B(zfsvfs->z_draining, ==, B_FALSE); 546 547 zfsvfs->z_draining = B_TRUE; 548 zfsvfs->z_drain_cancel = B_FALSE; 549 550 zfsvfs->z_drain_task = taskq_dispatch( 551 dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)), 552 zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP); 553 if (zfsvfs->z_drain_task == TASKQID_INVALID) { 554 zfs_dbgmsg("async zfs_unlinked_drain dispatch failed"); 555 zfs_unlinked_drain_task(zfsvfs); 556 } 557 } 558 559 /* 560 * Wait for the unlinked drain taskq task to stop. This will interrupt the 561 * unlinked set processing if it is in progress. 562 */ 563 void 564 zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs) 565 { 566 ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); 567 568 if (zfsvfs->z_draining) { 569 zfsvfs->z_drain_cancel = B_TRUE; 570 taskq_cancel_id(dsl_pool_unlinked_drain_taskq( 571 dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task); 572 zfsvfs->z_drain_task = TASKQID_INVALID; 573 zfsvfs->z_draining = B_FALSE; 574 } 575 } 576 577 /* 578 * Delete the entire contents of a directory. Return a count 579 * of the number of entries that could not be deleted. If we encounter 580 * an error, return a count of at least one so that the directory stays 581 * in the unlinked set. 582 * 583 * NOTE: this function assumes that the directory is inactive, 584 * so there is no need to lock its entries before deletion. 
*	Also, it assumes the directory contents is *only* regular
 *	files.
 */
static int
zfs_purgedir(znode_t *dzp)
{
	zap_cursor_t zc;
	zap_attribute_t zap;
	znode_t *xzp;
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	zfs_dirlock_t dl;
	int skipped = 0;
	int error;

	/* Each entry is removed in its own transaction. */
	for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
	    (error = zap_cursor_retrieve(&zc, &zap)) == 0;
	    zap_cursor_advance(&zc)) {
		error = zfs_zget(zfsvfs,
		    ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
		if (error) {
			skipped += 1;
			continue;
		}

		/* Debug builds check for regular files or symlinks only. */
		ASSERT(S_ISREG(ZTOI(xzp)->i_mode) ||
		    S_ISLNK(ZTOI(xzp)->i_mode));

		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
		/* Is this really needed ? */
		zfs_sa_upgrade_txholds(tx, xzp);
		dmu_tx_mark_netfree(tx);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			zfs_zrele_async(xzp);
			skipped += 1;
			continue;
		}
		/*
		 * Build a throw-away dirlock on the stack instead of calling
		 * zfs_dirent_lock(); see the function comment about the
		 * directory being inactive.
		 */
		memset(&dl, 0, sizeof (dl));
		dl.dl_dzp = dzp;
		dl.dl_name = zap.za_name;

		error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
		if (error)
			skipped += 1;
		dmu_tx_commit(tx);

		zfs_zrele_async(xzp);
	}
	zap_cursor_fini(&zc);
	/* Anything other than ENOENT from the cursor counts as a failure. */
	if (error != ENOENT)
		skipped += 1;
	return (skipped);
}

/*
 * Destroy a znode that has no remaining links and no remaining inode
 * references (both are asserted below): purge an xattr directory's
 * contents, free a regular file's data, queue any xattr directory for
 * deletion, remove zp from the unlinked set and delete the object.
 *
 * On any failure along the way the function returns early, leaving zp in
 * the unlinked set.
 */
void
zfs_rmnode(znode_t *zp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	objset_t *os = zfsvfs->z_os;
	znode_t *xzp = NULL;
	dmu_tx_t *tx;
	uint64_t acl_obj;
	uint64_t xattr_obj;
	uint64_t links;
	int error;

	ASSERT(ZTOI(zp)->i_nlink == 0);
	ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0);

	/*
	 * If this is an attribute directory, purge its contents.
	 */
	if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) {
		if (zfs_purgedir(zp) != 0) {
			/*
			 * Not enough space to delete some xattrs.
			 * Leave it in the unlinked set.
			 */
			zfs_znode_dmu_fini(zp);

			return;
		}
	}

	/*
	 * Free up all the data in the file.  We don't do this for directories
	 * because we need truncate and remove to be in the same tx, like in
	 * zfs_znode_delete(). Otherwise, if we crash here we'll end up with
	 * an inconsistent truncated zap object in the delete queue.  Note a
	 * truncated file is harmless since it only contains user data.
	 */
	if (S_ISREG(ZTOI(zp)->i_mode)) {
		error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
		if (error) {
			/*
			 * Not enough space or we were interrupted by unmount.
			 * Leave the file in the unlinked set.
			 */
			zfs_znode_dmu_fini(zp);
			return;
		}
	}

	/*
	 * If the file has extended attributes, we're going to unlink
	 * the xattr dir.
	 */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT(error == 0);
	}

	acl_obj = zfs_external_acl(zp);

	/*
	 * Set up the final transaction.
	 */
	tx = dmu_tx_create(os);
	dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	if (xzp) {
		/* xzp will be added to the unlinked set in this tx as well. */
		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}
	if (acl_obj)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);

	zfs_sa_upgrade_txholds(tx, zp);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		/*
		 * Not enough space to delete the file.  Leave it in the
		 * unlinked set, leaking it until the fs is remounted (at
		 * which point we'll call zfs_unlinked_drain() to process it).
		 */
		dmu_tx_abort(tx);
		zfs_znode_dmu_fini(zp);
		goto out;
	}

	if (xzp) {
		ASSERT(error == 0);
		mutex_enter(&xzp->z_lock);
		xzp->z_unlinked = B_TRUE;	/* mark xzp for deletion */
		clear_nlink(ZTOI(xzp));		/* no more links to it */
		links = 0;
		VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
		    &links, sizeof (links), tx));
		mutex_exit(&xzp->z_lock);
		zfs_unlinked_add(xzp, tx);
	}

	mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock);

	/*
	 * Remove this znode from the unlinked set.  If a rollback has
	 * occurred while a file is open and unlinked, then when the file
	 * is closed post rollback it will not exist in the rolled back
	 * version of the unlinked object.  ENOENT is therefore tolerated.
	 */
	error = zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
	    zp->z_id, tx);
	VERIFY(error == 0 || error == ENOENT);

	/* Signal dd_activity_cv once the unlinked set is empty. */
	uint64_t count;
	if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) {
		cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv);
	}

	mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock);

	dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);

	zfs_znode_delete(zp, tx);

	dmu_tx_commit(tx);
out:
	/* Drop the hold zfs_zget() took on the xattr directory, if any. */
	if (xzp)
		zfs_zrele_async(xzp);
}

/*
 * Build the 64-bit value stored in a directory entry for zp: the object
 * id, with IFTODT(mode) shifted into bits 60 and up when the file system
 * version is at least ZPL_VERSION_DIRENT_TYPE.
 */
static uint64_t
zfs_dirent(znode_t *zp, uint64_t mode)
{
	uint64_t de = zp->z_id;

	if (ZTOZSB(zp)->z_version >= ZPL_VERSION_DIRENT_TYPE)
		de |= IFTODT(mode) << 60;
	return (de);
}

/*
 * Link zp into dl.  Can fail in the following cases :
 * - if zp has been unlinked.
 * - if the number of entries with the same hash (aka. colliding entries)
 *    exceed the capacity of a leaf-block of fatzap and splitting of the
 *    leaf-block does not help.
790 */ 791 int 792 zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) 793 { 794 znode_t *dzp = dl->dl_dzp; 795 zfsvfs_t *zfsvfs = ZTOZSB(zp); 796 uint64_t value; 797 int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); 798 sa_bulk_attr_t bulk[5]; 799 uint64_t mtime[2], ctime[2]; 800 uint64_t links; 801 int count = 0; 802 int error; 803 804 mutex_enter(&zp->z_lock); 805 806 if (!(flag & ZRENAMING)) { 807 if (zp->z_unlinked) { /* no new links to unlinked zp */ 808 ASSERT(!(flag & (ZNEW | ZEXISTS))); 809 mutex_exit(&zp->z_lock); 810 return (SET_ERROR(ENOENT)); 811 } 812 if (!(flag & ZNEW)) { 813 /* 814 * ZNEW nodes come from zfs_mknode() where the link 815 * count has already been initialised 816 */ 817 inc_nlink(ZTOI(zp)); 818 links = ZTOI(zp)->i_nlink; 819 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), 820 NULL, &links, sizeof (links)); 821 } 822 } 823 824 value = zfs_dirent(zp, zp->z_mode); 825 error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1, 826 &value, tx); 827 828 /* 829 * zap_add could fail to add the entry if it exceeds the capacity of the 830 * leaf-block and zap_leaf_split() failed to help. 831 * The caller of this routine is responsible for failing the transaction 832 * which will rollback the SA updates done above. 
833 */ 834 if (error != 0) { 835 if (!(flag & ZRENAMING) && !(flag & ZNEW)) 836 drop_nlink(ZTOI(zp)); 837 mutex_exit(&zp->z_lock); 838 return (error); 839 } 840 841 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, 842 &dzp->z_id, sizeof (dzp->z_id)); 843 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 844 &zp->z_pflags, sizeof (zp->z_pflags)); 845 846 if (!(flag & ZNEW)) { 847 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 848 ctime, sizeof (ctime)); 849 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, 850 ctime); 851 } 852 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 853 ASSERT(error == 0); 854 855 mutex_exit(&zp->z_lock); 856 857 mutex_enter(&dzp->z_lock); 858 dzp->z_size++; 859 if (zp_is_dir) 860 inc_nlink(ZTOI(dzp)); 861 links = ZTOI(dzp)->i_nlink; 862 count = 0; 863 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 864 &dzp->z_size, sizeof (dzp->z_size)); 865 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, 866 &links, sizeof (links)); 867 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 868 mtime, sizeof (mtime)); 869 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 870 ctime, sizeof (ctime)); 871 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 872 &dzp->z_pflags, sizeof (dzp->z_pflags)); 873 zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); 874 error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); 875 ASSERT(error == 0); 876 mutex_exit(&dzp->z_lock); 877 878 return (0); 879 } 880 881 /* 882 * The match type in the code for this function should conform to: 883 * 884 * ------------------------------------------------------------------------ 885 * fs type | z_norm | lookup type | match type 886 * ---------|-------------|-------------|---------------------------------- 887 * CS !norm | 0 | 0 | 0 (exact) 888 * CS norm | formX | 0 | MT_NORMALIZE 889 * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE 890 * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE 891 * 
CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE 892 * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE 893 * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE 894 * CM !norm | upper | ZCILOOK | MT_NORMALIZE 895 * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE 896 * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE 897 * 898 * Abbreviations: 899 * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed 900 * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER) 901 * formX = unicode normalization form set on fs creation 902 */ 903 static int 904 zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx, 905 int flag) 906 { 907 int error; 908 909 if (ZTOZSB(zp)->z_norm) { 910 matchtype_t mt = MT_NORMALIZE; 911 912 if ((ZTOZSB(zp)->z_case == ZFS_CASE_INSENSITIVE && 913 (flag & ZCIEXACT)) || 914 (ZTOZSB(zp)->z_case == ZFS_CASE_MIXED && 915 !(flag & ZCILOOK))) { 916 mt |= MT_MATCH_CASE; 917 } 918 919 error = zap_remove_norm(ZTOZSB(zp)->z_os, dzp->z_id, 920 dl->dl_name, mt, tx); 921 } else { 922 error = zap_remove(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 923 tx); 924 } 925 926 return (error); 927 } 928 929 static int 930 zfs_drop_nlink_locked(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp) 931 { 932 zfsvfs_t *zfsvfs = ZTOZSB(zp); 933 int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); 934 boolean_t unlinked = B_FALSE; 935 sa_bulk_attr_t bulk[3]; 936 uint64_t mtime[2], ctime[2]; 937 uint64_t links; 938 int count = 0; 939 int error; 940 941 if (zp_is_dir && !zfs_dirempty(zp)) 942 return (SET_ERROR(ENOTEMPTY)); 943 944 if (ZTOI(zp)->i_nlink <= zp_is_dir) { 945 zfs_panic_recover("zfs: link count on %lu is %u, " 946 "should be at least %u", zp->z_id, 947 (int)ZTOI(zp)->i_nlink, zp_is_dir + 1); 948 set_nlink(ZTOI(zp), zp_is_dir + 1); 949 } 950 drop_nlink(ZTOI(zp)); 951 if (ZTOI(zp)->i_nlink == zp_is_dir) { 952 zp->z_unlinked = B_TRUE; 953 clear_nlink(ZTOI(zp)); 954 unlinked = B_TRUE; 955 } else { 956 
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), 957 NULL, &ctime, sizeof (ctime)); 958 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), 959 NULL, &zp->z_pflags, sizeof (zp->z_pflags)); 960 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, 961 ctime); 962 } 963 links = ZTOI(zp)->i_nlink; 964 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), 965 NULL, &links, sizeof (links)); 966 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 967 ASSERT3U(error, ==, 0); 968 969 if (unlinkedp != NULL) 970 *unlinkedp = unlinked; 971 else if (unlinked) 972 zfs_unlinked_add(zp, tx); 973 974 return (0); 975 } 976 977 /* 978 * Forcefully drop an nlink reference from (zp) and mark it for deletion if it 979 * was the last link. This *must* only be done to znodes which have already 980 * been zfs_link_destroy()'d with ZRENAMING. This is explicitly only used in 981 * the error path of zfs_rename(), where we have to correct the nlink count if 982 * we failed to link the target as well as failing to re-link the original 983 * znodes. 984 */ 985 int 986 zfs_drop_nlink(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp) 987 { 988 int error; 989 990 mutex_enter(&zp->z_lock); 991 error = zfs_drop_nlink_locked(zp, tx, unlinkedp); 992 mutex_exit(&zp->z_lock); 993 994 return (error); 995 } 996 997 /* 998 * Unlink zp from dl, and mark zp for deletion if this was the last link. Can 999 * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY). 1000 * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. 1001 * If it's non-NULL, we use it to indicate whether the znode needs deletion, 1002 * and it's the caller's job to do it. 
1003 */ 1004 int 1005 zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, 1006 boolean_t *unlinkedp) 1007 { 1008 znode_t *dzp = dl->dl_dzp; 1009 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1010 int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); 1011 boolean_t unlinked = B_FALSE; 1012 sa_bulk_attr_t bulk[5]; 1013 uint64_t mtime[2], ctime[2]; 1014 uint64_t links; 1015 int count = 0; 1016 int error; 1017 1018 if (!(flag & ZRENAMING)) { 1019 mutex_enter(&zp->z_lock); 1020 1021 if (zp_is_dir && !zfs_dirempty(zp)) { 1022 mutex_exit(&zp->z_lock); 1023 return (SET_ERROR(ENOTEMPTY)); 1024 } 1025 1026 /* 1027 * If we get here, we are going to try to remove the object. 1028 * First try removing the name from the directory; if that 1029 * fails, return the error. 1030 */ 1031 error = zfs_dropname(dl, zp, dzp, tx, flag); 1032 if (error != 0) { 1033 mutex_exit(&zp->z_lock); 1034 return (error); 1035 } 1036 1037 /* The only error is !zfs_dirempty() and we checked earlier. */ 1038 error = zfs_drop_nlink_locked(zp, tx, &unlinked); 1039 ASSERT3U(error, ==, 0); 1040 mutex_exit(&zp->z_lock); 1041 } else { 1042 error = zfs_dropname(dl, zp, dzp, tx, flag); 1043 if (error != 0) 1044 return (error); 1045 } 1046 1047 mutex_enter(&dzp->z_lock); 1048 dzp->z_size--; /* one dirent removed */ 1049 if (zp_is_dir) 1050 drop_nlink(ZTOI(dzp)); /* ".." 
link from zp */ 1051 links = ZTOI(dzp)->i_nlink; 1052 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), 1053 NULL, &links, sizeof (links)); 1054 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), 1055 NULL, &dzp->z_size, sizeof (dzp->z_size)); 1056 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), 1057 NULL, ctime, sizeof (ctime)); 1058 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), 1059 NULL, mtime, sizeof (mtime)); 1060 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), 1061 NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); 1062 zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); 1063 error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); 1064 ASSERT(error == 0); 1065 mutex_exit(&dzp->z_lock); 1066 1067 if (unlinkedp != NULL) 1068 *unlinkedp = unlinked; 1069 else if (unlinked) 1070 zfs_unlinked_add(zp, tx); 1071 1072 return (0); 1073 } 1074 1075 /* 1076 * Indicate whether the directory is empty. Works with or without z_lock 1077 * held, but can only be consider a hint in the latter case. Returns true 1078 * if only "." and ".." remain and there's no work in progress. 1079 * 1080 * The internal ZAP size, rather than zp->z_size, needs to be checked since 1081 * some consumers (Lustre) do not strictly maintain an accurate SA_ZPL_SIZE. 
1082 */ 1083 boolean_t 1084 zfs_dirempty(znode_t *dzp) 1085 { 1086 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1087 uint64_t count; 1088 int error; 1089 1090 if (dzp->z_dirlocks != NULL) 1091 return (B_FALSE); 1092 1093 error = zap_count(zfsvfs->z_os, dzp->z_id, &count); 1094 if (error != 0 || count != 0) 1095 return (B_FALSE); 1096 1097 return (B_TRUE); 1098 } 1099 1100 int 1101 zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr) 1102 { 1103 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1104 znode_t *xzp; 1105 dmu_tx_t *tx; 1106 int error; 1107 zfs_acl_ids_t acl_ids; 1108 boolean_t fuid_dirtied; 1109 #ifdef ZFS_DEBUG 1110 uint64_t parent; 1111 #endif 1112 1113 *xzpp = NULL; 1114 1115 if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr, 1116 kcred->user_ns))) 1117 return (error); 1118 1119 if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, 1120 &acl_ids, kcred->user_ns)) != 0) 1121 return (error); 1122 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) { 1123 zfs_acl_ids_free(&acl_ids); 1124 return (SET_ERROR(EDQUOT)); 1125 } 1126 1127 tx = dmu_tx_create(zfsvfs->z_os); 1128 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 1129 ZFS_SA_BASE_ATTR_SIZE); 1130 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 1131 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 1132 fuid_dirtied = zfsvfs->z_fuid_dirty; 1133 if (fuid_dirtied) 1134 zfs_fuid_txhold(zfsvfs, tx); 1135 error = dmu_tx_assign(tx, TXG_WAIT); 1136 if (error) { 1137 zfs_acl_ids_free(&acl_ids); 1138 dmu_tx_abort(tx); 1139 return (error); 1140 } 1141 zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); 1142 1143 if (fuid_dirtied) 1144 zfs_fuid_sync(zfsvfs, tx); 1145 1146 #ifdef ZFS_DEBUG 1147 error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 1148 &parent, sizeof (parent)); 1149 ASSERT(error == 0 && parent == zp->z_id); 1150 #endif 1151 1152 VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, 1153 sizeof (xzp->z_id), tx)); 1154 1155 if (!zp->z_unlinked) 1156 
zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "", NULL, 1157 acl_ids.z_fuidp, vap); 1158 1159 zfs_acl_ids_free(&acl_ids); 1160 dmu_tx_commit(tx); 1161 1162 *xzpp = xzp; 1163 1164 return (0); 1165 } 1166 1167 /* 1168 * Return a znode for the extended attribute directory for zp. 1169 * ** If the directory does not already exist, it is created ** 1170 * 1171 * IN: zp - znode to obtain attribute directory from 1172 * cr - credentials of caller 1173 * flags - flags from the VOP_LOOKUP call 1174 * 1175 * OUT: xipp - pointer to extended attribute znode 1176 * 1177 * RETURN: 0 on success 1178 * error number on failure 1179 */ 1180 int 1181 zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags) 1182 { 1183 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1184 znode_t *xzp; 1185 zfs_dirlock_t *dl; 1186 vattr_t va; 1187 int error; 1188 top: 1189 error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL); 1190 if (error) 1191 return (error); 1192 1193 if (xzp != NULL) { 1194 *xzpp = xzp; 1195 zfs_dirent_unlock(dl); 1196 return (0); 1197 } 1198 1199 if (!(flags & CREATE_XATTR_DIR)) { 1200 zfs_dirent_unlock(dl); 1201 return (SET_ERROR(ENOENT)); 1202 } 1203 1204 if (zfs_is_readonly(zfsvfs)) { 1205 zfs_dirent_unlock(dl); 1206 return (SET_ERROR(EROFS)); 1207 } 1208 1209 /* 1210 * The ability to 'create' files in an attribute 1211 * directory comes from the write_xattr permission on the base file. 1212 * 1213 * The ability to 'search' an attribute directory requires 1214 * read_xattr permission on the base file. 1215 * 1216 * Once in a directory the ability to read/write attributes 1217 * is controlled by the permissions on the attribute file. 
1218 */ 1219 va.va_mask = ATTR_MODE | ATTR_UID | ATTR_GID; 1220 va.va_mode = S_IFDIR | S_ISVTX | 0777; 1221 zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); 1222 1223 va.va_dentry = NULL; 1224 error = zfs_make_xattrdir(zp, &va, xzpp, cr); 1225 zfs_dirent_unlock(dl); 1226 1227 if (error == ERESTART) { 1228 /* NB: we already did dmu_tx_wait() if necessary */ 1229 goto top; 1230 } 1231 1232 return (error); 1233 } 1234 1235 /* 1236 * Decide whether it is okay to remove within a sticky directory. 1237 * 1238 * In sticky directories, write access is not sufficient; 1239 * you can remove entries from a directory only if: 1240 * 1241 * you own the directory, 1242 * you own the entry, 1243 * you have write access to the entry, 1244 * or you are privileged (checked in secpolicy...). 1245 * 1246 * The function returns 0 if remove access is granted. 1247 */ 1248 int 1249 zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) 1250 { 1251 uid_t uid; 1252 uid_t downer; 1253 uid_t fowner; 1254 zfsvfs_t *zfsvfs = ZTOZSB(zdp); 1255 1256 if (zfsvfs->z_replay) 1257 return (0); 1258 1259 if ((zdp->z_mode & S_ISVTX) == 0) 1260 return (0); 1261 1262 downer = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zdp)->i_uid), 1263 cr, ZFS_OWNER); 1264 fowner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zp)->i_uid), 1265 cr, ZFS_OWNER); 1266 1267 if ((uid = crgetuid(cr)) == downer || uid == fowner || 1268 zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, 1269 kcred->user_ns) == 0) 1270 return (0); 1271 else 1272 return (secpolicy_vnode_remove(cr)); 1273 } 1274