1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2013, 2016 by Delphix. All rights reserved. 25 * Copyright 2017 Nexenta Systems, Inc. 26 */ 27 28 #include <sys/types.h> 29 #include <sys/param.h> 30 #include <sys/time.h> 31 #include <sys/sysmacros.h> 32 #include <sys/vfs.h> 33 #include <sys/vnode.h> 34 #include <sys/file.h> 35 #include <sys/kmem.h> 36 #include <sys/uio.h> 37 #include <sys/pathname.h> 38 #include <sys/cmn_err.h> 39 #include <sys/errno.h> 40 #include <sys/stat.h> 41 #include <sys/sunddi.h> 42 #include <sys/random.h> 43 #include <sys/policy.h> 44 #include <sys/zfs_dir.h> 45 #include <sys/zfs_acl.h> 46 #include <sys/zfs_vnops.h> 47 #include <sys/fs/zfs.h> 48 #include <sys/zap.h> 49 #include <sys/dmu.h> 50 #include <sys/atomic.h> 51 #include <sys/zfs_ctldir.h> 52 #include <sys/zfs_fuid.h> 53 #include <sys/sa.h> 54 #include <sys/zfs_sa.h> 55 #include <sys/dmu_objset.h> 56 #include <sys/dsl_dir.h> 57 58 /* 59 * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups 60 * of names after deciding which is the appropriate lookup interface. 61 */ 62 static int 63 zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name, 64 matchtype_t mt, boolean_t update, int *deflags, pathname_t *rpnp, 65 uint64_t *zoid) 66 { 67 boolean_t conflict = B_FALSE; 68 int error; 69 70 if (zfsvfs->z_norm) { 71 size_t bufsz = 0; 72 char *buf = NULL; 73 74 if (rpnp) { 75 buf = rpnp->pn_buf; 76 bufsz = rpnp->pn_bufsize; 77 } 78 79 /* 80 * In the non-mixed case we only expect there would ever 81 * be one match, but we need to use the normalizing lookup. 82 */ 83 error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, 84 zoid, mt, buf, bufsz, &conflict); 85 } else { 86 error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); 87 } 88 89 /* 90 * Allow multiple entries provided the first entry is 91 * the object id. Non-zpl consumers may safely make 92 * use of the additional space. 93 * 94 * XXX: This should be a feature flag for compatibility 95 */ 96 if (error == EOVERFLOW) 97 error = 0; 98 99 if (zfsvfs->z_norm && !error && deflags) 100 *deflags = conflict ? ED_CASE_CONFLICT : 0; 101 102 *zoid = ZFS_DIRENT_OBJ(*zoid); 103 104 return (error); 105 } 106 107 /* 108 * Lock a directory entry. A dirlock on <dzp, name> protects that name 109 * in dzp's directory zap object. As long as you hold a dirlock, you can 110 * assume two things: (1) dzp cannot be reaped, and (2) no other thread 111 * can change the zap entry for (i.e. link or unlink) this name. 112 * 113 * Input arguments: 114 * dzp - znode for directory 115 * name - name of entry to lock 116 * flag - ZNEW: if the entry already exists, fail with EEXIST. 117 * ZEXISTS: if the entry does not exist, fail with ENOENT. 118 * ZSHARED: allow concurrent access with other ZSHARED callers. 119 * ZXATTR: we want dzp's xattr directory 120 * ZCILOOK: On a mixed sensitivity file system, 121 * this lookup should be case-insensitive. 122 * ZCIEXACT: On a purely case-insensitive file system, 123 * this lookup should be case-sensitive. 124 * ZRENAMING: we are locking for renaming, force narrow locks 125 * ZHAVELOCK: Don't grab the z_name_lock for this call. The 126 * current thread already holds it. 127 * 128 * Output arguments: 129 * zpp - pointer to the znode for the entry (NULL if there isn't one) 130 * dlpp - pointer to the dirlock for this entry (NULL on error) 131 * direntflags - (case-insensitive lookup only) 132 * flags if multiple case-sensitive matches exist in directory 133 * realpnp - (case-insensitive lookup only) 134 * actual name matched within the directory 135 * 136 * Return value: 0 on success or errno on failure. 137 * 138 * NOTE: Always checks for, and rejects, '.' and '..'. 139 * NOTE: For case-insensitive file systems we take wide locks (see below), 140 * but return znode pointers to a single match. 141 */ 142 int 143 zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, 144 znode_t **zpp, int flag, int *direntflags, pathname_t *realpnp) 145 { 146 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 147 zfs_dirlock_t *dl; 148 boolean_t update; 149 matchtype_t mt = 0; 150 uint64_t zoid; 151 int error = 0; 152 int cmpflags; 153 154 *zpp = NULL; 155 *dlpp = NULL; 156 157 /* 158 * Verify that we are not trying to lock '.', '..', or '.zfs' 159 */ 160 if ((name[0] == '.' && 161 (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) || 162 (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)) 163 return (SET_ERROR(EEXIST)); 164 165 /* 166 * Case sensitivity and normalization preferences are set when 167 * the file system is created. These are stored in the 168 * zfsvfs->z_case and zfsvfs->z_norm fields. These choices 169 * affect what vnodes can be cached in the DNLC, how we 170 * perform zap lookups, and the "width" of our dirlocks. 171 * 172 * A normal dirlock locks a single name. Note that with 173 * normalization a name can be composed multiple ways, but 174 * when normalized, these names all compare equal. A wide 175 * dirlock locks multiple names. We need these when the file 176 * system is supporting mixed-mode access. It is sometimes 177 * necessary to lock all case permutations of file name at 178 * once so that simultaneous case-insensitive/case-sensitive 179 * behaves as rationally as possible. 180 */ 181 182 /* 183 * When matching we may need to normalize & change case according to 184 * FS settings. 185 * 186 * Note that a normalized match is necessary for a case insensitive 187 * filesystem when the lookup request is not exact because normalization 188 * can fold case independent of normalizing code point sequences. 189 * 190 * See the table above zfs_dropname(). 191 */ 192 if (zfsvfs->z_norm != 0) { 193 mt = MT_NORMALIZE; 194 195 /* 196 * Determine if the match needs to honor the case specified in 197 * lookup, and if so keep track of that so that during 198 * normalization we don't fold case. 199 */ 200 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE && 201 (flag & ZCIEXACT)) || 202 (zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) { 203 mt |= MT_MATCH_CASE; 204 } 205 } 206 207 /* 208 * Only look in or update the DNLC if we are looking for the 209 * name on a file system that does not require normalization 210 * or case folding. We can also look there if we happen to be 211 * on a non-normalizing, mixed sensitivity file system IF we 212 * are looking for the exact name. 213 * 214 * Maybe can add TO-UPPERed version of name to dnlc in ci-only 215 * case for performance improvement? 216 */ 217 update = !zfsvfs->z_norm || 218 (zfsvfs->z_case == ZFS_CASE_MIXED && 219 !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK)); 220 221 /* 222 * ZRENAMING indicates we are in a situation where we should 223 * take narrow locks regardless of the file system's 224 * preferences for normalizing and case folding. This will 225 * prevent us deadlocking trying to grab the same wide lock 226 * twice if the two names happen to be case-insensitive 227 * matches. 228 */ 229 if (flag & ZRENAMING) 230 cmpflags = 0; 231 else 232 cmpflags = zfsvfs->z_norm; 233 234 /* 235 * Wait until there are no locks on this name. 236 * 237 * Don't grab the lock if it is already held. However, cannot 238 * have both ZSHARED and ZHAVELOCK together. 239 */ 240 ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK)); 241 if (!(flag & ZHAVELOCK)) 242 rw_enter(&dzp->z_name_lock, RW_READER); 243 244 mutex_enter(&dzp->z_lock); 245 for (;;) { 246 if (dzp->z_unlinked && !(flag & ZXATTR)) { 247 mutex_exit(&dzp->z_lock); 248 if (!(flag & ZHAVELOCK)) 249 rw_exit(&dzp->z_name_lock); 250 return (SET_ERROR(ENOENT)); 251 } 252 for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) { 253 if ((u8_strcmp(name, dl->dl_name, 0, cmpflags, 254 U8_UNICODE_LATEST, &error) == 0) || error != 0) 255 break; 256 } 257 if (error != 0) { 258 mutex_exit(&dzp->z_lock); 259 if (!(flag & ZHAVELOCK)) 260 rw_exit(&dzp->z_name_lock); 261 return (SET_ERROR(ENOENT)); 262 } 263 if (dl == NULL) { 264 /* 265 * Allocate a new dirlock and add it to the list. 266 */ 267 dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP); 268 cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); 269 dl->dl_name = name; 270 dl->dl_sharecnt = 0; 271 dl->dl_namelock = 0; 272 dl->dl_namesize = 0; 273 dl->dl_dzp = dzp; 274 dl->dl_next = dzp->z_dirlocks; 275 dzp->z_dirlocks = dl; 276 break; 277 } 278 if ((flag & ZSHARED) && dl->dl_sharecnt != 0) 279 break; 280 cv_wait(&dl->dl_cv, &dzp->z_lock); 281 } 282 283 /* 284 * If the z_name_lock was NOT held for this dirlock record it. 285 */ 286 if (flag & ZHAVELOCK) 287 dl->dl_namelock = 1; 288 289 if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) { 290 /* 291 * We're the second shared reference to dl. Make a copy of 292 * dl_name in case the first thread goes away before we do. 293 * Note that we initialize the new name before storing its 294 * pointer into dl_name, because the first thread may load 295 * dl->dl_name at any time. It'll either see the old value, 296 * which belongs to it, or the new shared copy; either is OK. 297 */ 298 dl->dl_namesize = strlen(dl->dl_name) + 1; 299 name = kmem_alloc(dl->dl_namesize, KM_SLEEP); 300 memcpy(name, dl->dl_name, dl->dl_namesize); 301 dl->dl_name = name; 302 } 303 304 mutex_exit(&dzp->z_lock); 305 306 /* 307 * We have a dirlock on the name. (Note that it is the dirlock, 308 * not the dzp's z_lock, that protects the name in the zap object.) 309 * See if there's an object by this name; if so, put a hold on it. 310 */ 311 if (flag & ZXATTR) { 312 error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, 313 sizeof (zoid)); 314 if (error == 0) 315 error = (zoid == 0 ? SET_ERROR(ENOENT) : 0); 316 } else { 317 error = zfs_match_find(zfsvfs, dzp, name, mt, 318 update, direntflags, realpnp, &zoid); 319 } 320 if (error) { 321 if (error != ENOENT || (flag & ZEXISTS)) { 322 zfs_dirent_unlock(dl); 323 return (error); 324 } 325 } else { 326 if (flag & ZNEW) { 327 zfs_dirent_unlock(dl); 328 return (SET_ERROR(EEXIST)); 329 } 330 error = zfs_zget(zfsvfs, zoid, zpp); 331 if (error) { 332 zfs_dirent_unlock(dl); 333 return (error); 334 } 335 } 336 337 *dlpp = dl; 338 339 return (0); 340 } 341 342 /* 343 * Unlock this directory entry and wake anyone who was waiting for it. 344 */ 345 void 346 zfs_dirent_unlock(zfs_dirlock_t *dl) 347 { 348 znode_t *dzp = dl->dl_dzp; 349 zfs_dirlock_t **prev_dl, *cur_dl; 350 351 mutex_enter(&dzp->z_lock); 352 353 if (!dl->dl_namelock) 354 rw_exit(&dzp->z_name_lock); 355 356 if (dl->dl_sharecnt > 1) { 357 dl->dl_sharecnt--; 358 mutex_exit(&dzp->z_lock); 359 return; 360 } 361 prev_dl = &dzp->z_dirlocks; 362 while ((cur_dl = *prev_dl) != dl) 363 prev_dl = &cur_dl->dl_next; 364 *prev_dl = dl->dl_next; 365 cv_broadcast(&dl->dl_cv); 366 mutex_exit(&dzp->z_lock); 367 368 if (dl->dl_namesize != 0) 369 kmem_free(dl->dl_name, dl->dl_namesize); 370 cv_destroy(&dl->dl_cv); 371 kmem_free(dl, sizeof (*dl)); 372 } 373 374 /* 375 * Look up an entry in a directory. 376 * 377 * NOTE: '.' and '..' are handled as special cases because 378 * no directory entries are actually stored for them. If this is 379 * the root of a filesystem, then '.zfs' is also treated as a 380 * special pseudo-directory. 381 */ 382 int 383 zfs_dirlook(znode_t *dzp, char *name, znode_t **zpp, int flags, 384 int *deflg, pathname_t *rpnp) 385 { 386 zfs_dirlock_t *dl; 387 znode_t *zp; 388 struct inode *ip; 389 int error = 0; 390 uint64_t parent; 391 392 if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { 393 *zpp = dzp; 394 zhold(*zpp); 395 } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { 396 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 397 398 /* 399 * If we are a snapshot mounted under .zfs, return 400 * the inode pointer for the snapshot directory. 401 */ 402 if ((error = sa_lookup(dzp->z_sa_hdl, 403 SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) 404 return (error); 405 406 if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) { 407 error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, 408 "snapshot", &ip, 0, kcred, NULL, NULL); 409 *zpp = ITOZ(ip); 410 return (error); 411 } 412 rw_enter(&dzp->z_parent_lock, RW_READER); 413 error = zfs_zget(zfsvfs, parent, &zp); 414 if (error == 0) 415 *zpp = zp; 416 rw_exit(&dzp->z_parent_lock); 417 } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { 418 ip = zfsctl_root(dzp); 419 *zpp = ITOZ(ip); 420 } else { 421 int zf; 422 423 zf = ZEXISTS | ZSHARED; 424 if (flags & FIGNORECASE) 425 zf |= ZCILOOK; 426 427 error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp); 428 if (error == 0) { 429 *zpp = zp; 430 zfs_dirent_unlock(dl); 431 dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ 432 } 433 rpnp = NULL; 434 } 435 436 if ((flags & FIGNORECASE) && rpnp && !error) 437 (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize); 438 439 return (error); 440 } 441 442 /* 443 * unlinked Set (formerly known as the "delete queue") Error Handling 444 * 445 * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we 446 * don't specify the name of the entry that we will be manipulating. We 447 * also fib and say that we won't be adding any new entries to the 448 * unlinked set, even though we might (this is to lower the minimum file 449 * size that can be deleted in a full filesystem). So on the small 450 * chance that the nlink list is using a fat zap (ie. has more than 451 * 2000 entries), we *may* not pre-read a block that's needed. 452 * Therefore it is remotely possible for some of the assertions 453 * regarding the unlinked set below to fail due to i/o error. On a 454 * nondebug system, this will result in the space being leaked. 455 */ 456 void 457 zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) 458 { 459 zfsvfs_t *zfsvfs = ZTOZSB(zp); 460 461 ASSERT(zp->z_unlinked); 462 ASSERT(ZTOI(zp)->i_nlink == 0); 463 464 VERIFY3U(0, ==, 465 zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); 466 467 dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1); 468 } 469 470 /* 471 * Clean up any znodes that had no links when we either crashed or 472 * (force) umounted the file system. 473 */ 474 static void 475 zfs_unlinked_drain_task(void *arg) 476 { 477 zfsvfs_t *zfsvfs = arg; 478 zap_cursor_t zc; 479 zap_attribute_t zap; 480 dmu_object_info_t doi; 481 znode_t *zp; 482 int error; 483 484 ASSERT3B(zfsvfs->z_draining, ==, B_TRUE); 485 486 /* 487 * Iterate over the contents of the unlinked set. 488 */ 489 for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); 490 zap_cursor_retrieve(&zc, &zap) == 0 && !zfsvfs->z_drain_cancel; 491 zap_cursor_advance(&zc)) { 492 493 /* 494 * See what kind of object we have in list 495 */ 496 497 error = dmu_object_info(zfsvfs->z_os, 498 zap.za_first_integer, &doi); 499 if (error != 0) 500 continue; 501 502 ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || 503 (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); 504 /* 505 * We need to re-mark these list entries for deletion, 506 * so we pull them back into core and set zp->z_unlinked. 507 */ 508 error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); 509 510 /* 511 * We may pick up znodes that are already marked for deletion. 512 * This could happen during the purge of an extended attribute 513 * directory. All we need to do is skip over them, since they 514 * are already in the system marked z_unlinked. 515 */ 516 if (error != 0) 517 continue; 518 519 zp->z_unlinked = B_TRUE; 520 521 /* 522 * zrele() decrements the znode's ref count and may cause 523 * it to be synchronously freed. We interrupt freeing 524 * of this znode by checking the return value of 525 * dmu_objset_zfs_unmounting() in dmu_free_long_range() 526 * when an unmount is requested. 527 */ 528 zrele(zp); 529 ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); 530 } 531 zap_cursor_fini(&zc); 532 533 zfsvfs->z_draining = B_FALSE; 534 zfsvfs->z_drain_task = TASKQID_INVALID; 535 } 536 537 /* 538 * Sets z_draining then tries to dispatch async unlinked drain. 539 * If that fails executes synchronous unlinked drain. 540 */ 541 void 542 zfs_unlinked_drain(zfsvfs_t *zfsvfs) 543 { 544 ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); 545 ASSERT3B(zfsvfs->z_draining, ==, B_FALSE); 546 547 zfsvfs->z_draining = B_TRUE; 548 zfsvfs->z_drain_cancel = B_FALSE; 549 550 zfsvfs->z_drain_task = taskq_dispatch( 551 dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)), 552 zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP); 553 if (zfsvfs->z_drain_task == TASKQID_INVALID) { 554 zfs_dbgmsg("async zfs_unlinked_drain dispatch failed"); 555 zfs_unlinked_drain_task(zfsvfs); 556 } 557 } 558 559 /* 560 * Wait for the unlinked drain taskq task to stop. This will interrupt the 561 * unlinked set processing if it is in progress. 562 */ 563 void 564 zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs) 565 { 566 ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); 567 568 if (zfsvfs->z_draining) { 569 zfsvfs->z_drain_cancel = B_TRUE; 570 taskq_cancel_id(dsl_pool_unlinked_drain_taskq( 571 dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task); 572 zfsvfs->z_drain_task = TASKQID_INVALID; 573 zfsvfs->z_draining = B_FALSE; 574 } 575 } 576 577 /* 578 * Delete the entire contents of a directory. Return a count 579 * of the number of entries that could not be deleted. If we encounter 580 * an error, return a count of at least one so that the directory stays 581 * in the unlinked set. 582 * 583 * NOTE: this function assumes that the directory is inactive, 584 * so there is no need to lock its entries before deletion. 585 * Also, it assumes the directory contents is *only* regular 586 * files. 587 */ 588 static int 589 zfs_purgedir(znode_t *dzp) 590 { 591 zap_cursor_t zc; 592 zap_attribute_t zap; 593 znode_t *xzp; 594 dmu_tx_t *tx; 595 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 596 zfs_dirlock_t dl; 597 int skipped = 0; 598 int error; 599 600 for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); 601 (error = zap_cursor_retrieve(&zc, &zap)) == 0; 602 zap_cursor_advance(&zc)) { 603 error = zfs_zget(zfsvfs, 604 ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); 605 if (error) { 606 skipped += 1; 607 continue; 608 } 609 610 ASSERT(S_ISREG(ZTOI(xzp)->i_mode) || 611 S_ISLNK(ZTOI(xzp)->i_mode)); 612 613 tx = dmu_tx_create(zfsvfs->z_os); 614 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 615 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); 616 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); 617 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 618 /* Is this really needed ? */ 619 zfs_sa_upgrade_txholds(tx, xzp); 620 dmu_tx_mark_netfree(tx); 621 error = dmu_tx_assign(tx, TXG_WAIT); 622 if (error) { 623 dmu_tx_abort(tx); 624 zfs_zrele_async(xzp); 625 skipped += 1; 626 continue; 627 } 628 memset(&dl, 0, sizeof (dl)); 629 dl.dl_dzp = dzp; 630 dl.dl_name = zap.za_name; 631 632 error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); 633 if (error) 634 skipped += 1; 635 dmu_tx_commit(tx); 636 637 zfs_zrele_async(xzp); 638 } 639 zap_cursor_fini(&zc); 640 if (error != ENOENT) 641 skipped += 1; 642 return (skipped); 643 } 644 645 void 646 zfs_rmnode(znode_t *zp) 647 { 648 zfsvfs_t *zfsvfs = ZTOZSB(zp); 649 objset_t *os = zfsvfs->z_os; 650 znode_t *xzp = NULL; 651 dmu_tx_t *tx; 652 znode_hold_t *zh; 653 uint64_t z_id = zp->z_id; 654 uint64_t acl_obj; 655 uint64_t xattr_obj; 656 uint64_t links; 657 int error; 658 659 ASSERT(ZTOI(zp)->i_nlink == 0); 660 ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0); 661 662 /* 663 * If this is an attribute directory, purge its contents. 664 */ 665 if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) { 666 if (zfs_purgedir(zp) != 0) { 667 /* 668 * Not enough space to delete some xattrs. 669 * Leave it in the unlinked set. 670 */ 671 zh = zfs_znode_hold_enter(zfsvfs, z_id); 672 zfs_znode_dmu_fini(zp); 673 zfs_znode_hold_exit(zfsvfs, zh); 674 return; 675 } 676 } 677 678 /* 679 * Free up all the data in the file. We don't do this for directories 680 * because we need truncate and remove to be in the same tx, like in 681 * zfs_znode_delete(). Otherwise, if we crash here we'll end up with 682 * an inconsistent truncated zap object in the delete queue. Note a 683 * truncated file is harmless since it only contains user data. 684 */ 685 if (S_ISREG(ZTOI(zp)->i_mode)) { 686 error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); 687 if (error) { 688 /* 689 * Not enough space or we were interrupted by unmount. 690 * Leave the file in the unlinked set. 691 */ 692 zh = zfs_znode_hold_enter(zfsvfs, z_id); 693 zfs_znode_dmu_fini(zp); 694 zfs_znode_hold_exit(zfsvfs, zh); 695 return; 696 } 697 } 698 699 /* 700 * If the file has extended attributes, we're going to unlink 701 * the xattr dir. 702 */ 703 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 704 &xattr_obj, sizeof (xattr_obj)); 705 if (error == 0 && xattr_obj) { 706 error = zfs_zget(zfsvfs, xattr_obj, &xzp); 707 ASSERT(error == 0); 708 } 709 710 acl_obj = zfs_external_acl(zp); 711 712 /* 713 * Set up the final transaction. 714 */ 715 tx = dmu_tx_create(os); 716 dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); 717 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 718 if (xzp) { 719 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); 720 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); 721 } 722 if (acl_obj) 723 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); 724 725 zfs_sa_upgrade_txholds(tx, zp); 726 error = dmu_tx_assign(tx, TXG_WAIT); 727 if (error) { 728 /* 729 * Not enough space to delete the file. Leave it in the 730 * unlinked set, leaking it until the fs is remounted (at 731 * which point we'll call zfs_unlinked_drain() to process it). 732 */ 733 dmu_tx_abort(tx); 734 zh = zfs_znode_hold_enter(zfsvfs, z_id); 735 zfs_znode_dmu_fini(zp); 736 zfs_znode_hold_exit(zfsvfs, zh); 737 goto out; 738 } 739 740 if (xzp) { 741 ASSERT(error == 0); 742 mutex_enter(&xzp->z_lock); 743 xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ 744 clear_nlink(ZTOI(xzp)); /* no more links to it */ 745 links = 0; 746 VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), 747 &links, sizeof (links), tx)); 748 mutex_exit(&xzp->z_lock); 749 zfs_unlinked_add(xzp, tx); 750 } 751 752 mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock); 753 754 /* 755 * Remove this znode from the unlinked set. If a has rollback has 756 * occurred while a file is open and unlinked. Then when the file 757 * is closed post rollback it will not exist in the rolled back 758 * version of the unlinked object. 759 */ 760 error = zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, 761 zp->z_id, tx); 762 VERIFY(error == 0 || error == ENOENT); 763 764 uint64_t count; 765 if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) { 766 cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv); 767 } 768 769 mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock); 770 771 dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1); 772 773 zfs_znode_delete(zp, tx); 774 775 dmu_tx_commit(tx); 776 out: 777 if (xzp) 778 zfs_zrele_async(xzp); 779 } 780 781 static uint64_t 782 zfs_dirent(znode_t *zp, uint64_t mode) 783 { 784 uint64_t de = zp->z_id; 785 786 if (ZTOZSB(zp)->z_version >= ZPL_VERSION_DIRENT_TYPE) 787 de |= IFTODT(mode) << 60; 788 return (de); 789 } 790 791 /* 792 * Link zp into dl. Can fail in the following cases : 793 * - if zp has been unlinked. 794 * - if the number of entries with the same hash (aka. colliding entries) 795 * exceed the capacity of a leaf-block of fatzap and splitting of the 796 * leaf-block does not help. 797 */ 798 int 799 zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) 800 { 801 znode_t *dzp = dl->dl_dzp; 802 zfsvfs_t *zfsvfs = ZTOZSB(zp); 803 uint64_t value; 804 int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); 805 sa_bulk_attr_t bulk[5]; 806 uint64_t mtime[2], ctime[2]; 807 uint64_t links; 808 int count = 0; 809 int error; 810 811 mutex_enter(&zp->z_lock); 812 813 if (!(flag & ZRENAMING)) { 814 if (zp->z_unlinked) { /* no new links to unlinked zp */ 815 ASSERT(!(flag & (ZNEW | ZEXISTS))); 816 mutex_exit(&zp->z_lock); 817 return (SET_ERROR(ENOENT)); 818 } 819 if (!(flag & ZNEW)) { 820 /* 821 * ZNEW nodes come from zfs_mknode() where the link 822 * count has already been initialised 823 */ 824 inc_nlink(ZTOI(zp)); 825 links = ZTOI(zp)->i_nlink; 826 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), 827 NULL, &links, sizeof (links)); 828 } 829 } 830 831 value = zfs_dirent(zp, zp->z_mode); 832 error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1, 833 &value, tx); 834 835 /* 836 * zap_add could fail to add the entry if it exceeds the capacity of the 837 * leaf-block and zap_leaf_split() failed to help. 838 * The caller of this routine is responsible for failing the transaction 839 * which will rollback the SA updates done above. 840 */ 841 if (error != 0) { 842 if (!(flag & ZRENAMING) && !(flag & ZNEW)) 843 drop_nlink(ZTOI(zp)); 844 mutex_exit(&zp->z_lock); 845 return (error); 846 } 847 848 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, 849 &dzp->z_id, sizeof (dzp->z_id)); 850 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 851 &zp->z_pflags, sizeof (zp->z_pflags)); 852 853 if (!(flag & ZNEW)) { 854 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 855 ctime, sizeof (ctime)); 856 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, 857 ctime); 858 } 859 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 860 ASSERT(error == 0); 861 862 mutex_exit(&zp->z_lock); 863 864 mutex_enter(&dzp->z_lock); 865 dzp->z_size++; 866 if (zp_is_dir) 867 inc_nlink(ZTOI(dzp)); 868 links = ZTOI(dzp)->i_nlink; 869 count = 0; 870 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 871 &dzp->z_size, sizeof (dzp->z_size)); 872 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, 873 &links, sizeof (links)); 874 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 875 mtime, sizeof (mtime)); 876 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 877 ctime, sizeof (ctime)); 878 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 879 &dzp->z_pflags, sizeof (dzp->z_pflags)); 880 zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); 881 error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); 882 ASSERT(error == 0); 883 mutex_exit(&dzp->z_lock); 884 885 return (0); 886 } 887 888 /* 889 * The match type in the code for this function should conform to: 890 * 891 * ------------------------------------------------------------------------ 892 * fs type | z_norm | lookup type | match type 893 * ---------|-------------|-------------|---------------------------------- 894 * CS !norm | 0 | 0 | 0 (exact) 895 * CS norm | formX | 0 | MT_NORMALIZE 896 * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE 897 * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE 898 * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE 899 * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE 900 * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE 901 * CM !norm | upper | ZCILOOK | MT_NORMALIZE 902 * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE 903 * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE 904 * 905 * Abbreviations: 906 * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed 907 * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER) 908 * formX = unicode normalization form set on fs creation 909 */ 910 static int 911 zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx, 912 int flag) 913 { 914 int error; 915 916 if (ZTOZSB(zp)->z_norm) { 917 matchtype_t mt = MT_NORMALIZE; 918 919 if ((ZTOZSB(zp)->z_case == ZFS_CASE_INSENSITIVE && 920 (flag & ZCIEXACT)) || 921 (ZTOZSB(zp)->z_case == ZFS_CASE_MIXED && 922 !(flag & ZCILOOK))) { 923 mt |= MT_MATCH_CASE; 924 } 925 926 error = zap_remove_norm(ZTOZSB(zp)->z_os, dzp->z_id, 927 dl->dl_name, mt, tx); 928 } else { 929 error = zap_remove(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 930 tx); 931 } 932 933 return (error); 934 } 935 936 static int 937 zfs_drop_nlink_locked(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp) 938 { 939 zfsvfs_t *zfsvfs = ZTOZSB(zp); 940 int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); 941 boolean_t unlinked = B_FALSE; 942 sa_bulk_attr_t bulk[3]; 943 uint64_t mtime[2], ctime[2]; 944 uint64_t links; 945 int count = 0; 946 int error; 947 948 if (zp_is_dir && !zfs_dirempty(zp)) 949 return (SET_ERROR(ENOTEMPTY)); 950 951 if (ZTOI(zp)->i_nlink <= zp_is_dir) { 952 zfs_panic_recover("zfs: link count on %lu is %u, " 953 "should be at least %u", zp->z_id, 954 (int)ZTOI(zp)->i_nlink, zp_is_dir + 1); 955 set_nlink(ZTOI(zp), zp_is_dir + 1); 956 } 957 drop_nlink(ZTOI(zp)); 958 if (ZTOI(zp)->i_nlink == zp_is_dir) { 959 zp->z_unlinked = B_TRUE; 960 clear_nlink(ZTOI(zp)); 961 unlinked = B_TRUE; 962 } else { 963 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), 964 NULL, &ctime, sizeof (ctime)); 965 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), 966 NULL, &zp->z_pflags, sizeof (zp->z_pflags)); 967 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, 968 ctime); 969 } 970 links = ZTOI(zp)->i_nlink; 971 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), 972 NULL, &links, sizeof (links)); 973 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 974 ASSERT3U(error, ==, 0); 975 976 if (unlinkedp != NULL) 977 *unlinkedp = unlinked; 978 else if (unlinked) 979 zfs_unlinked_add(zp, tx); 980 981 return (0); 982 } 983 984 /* 985 * Forcefully drop an nlink reference from (zp) and mark it for deletion if it 986 * was the last link. This *must* only be done to znodes which have already 987 * been zfs_link_destroy()'d with ZRENAMING. This is explicitly only used in 988 * the error path of zfs_rename(), where we have to correct the nlink count if 989 * we failed to link the target as well as failing to re-link the original 990 * znodes. 991 */ 992 int 993 zfs_drop_nlink(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp) 994 { 995 int error; 996 997 mutex_enter(&zp->z_lock); 998 error = zfs_drop_nlink_locked(zp, tx, unlinkedp); 999 mutex_exit(&zp->z_lock); 1000 1001 return (error); 1002 } 1003 1004 /* 1005 * Unlink zp from dl, and mark zp for deletion if this was the last link. Can 1006 * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY). 1007 * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. 1008 * If it's non-NULL, we use it to indicate whether the znode needs deletion, 1009 * and it's the caller's job to do it. 1010 */ 1011 int 1012 zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, 1013 boolean_t *unlinkedp) 1014 { 1015 znode_t *dzp = dl->dl_dzp; 1016 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1017 int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); 1018 boolean_t unlinked = B_FALSE; 1019 sa_bulk_attr_t bulk[5]; 1020 uint64_t mtime[2], ctime[2]; 1021 uint64_t links; 1022 int count = 0; 1023 int error; 1024 1025 if (!(flag & ZRENAMING)) { 1026 mutex_enter(&zp->z_lock); 1027 1028 if (zp_is_dir && !zfs_dirempty(zp)) { 1029 mutex_exit(&zp->z_lock); 1030 return (SET_ERROR(ENOTEMPTY)); 1031 } 1032 1033 /* 1034 * If we get here, we are going to try to remove the object. 1035 * First try removing the name from the directory; if that 1036 * fails, return the error. 1037 */ 1038 error = zfs_dropname(dl, zp, dzp, tx, flag); 1039 if (error != 0) { 1040 mutex_exit(&zp->z_lock); 1041 return (error); 1042 } 1043 1044 /* The only error is !zfs_dirempty() and we checked earlier. */ 1045 error = zfs_drop_nlink_locked(zp, tx, &unlinked); 1046 ASSERT3U(error, ==, 0); 1047 mutex_exit(&zp->z_lock); 1048 } else { 1049 error = zfs_dropname(dl, zp, dzp, tx, flag); 1050 if (error != 0) 1051 return (error); 1052 } 1053 1054 mutex_enter(&dzp->z_lock); 1055 dzp->z_size--; /* one dirent removed */ 1056 if (zp_is_dir) 1057 drop_nlink(ZTOI(dzp)); /* ".." link from zp */ 1058 links = ZTOI(dzp)->i_nlink; 1059 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), 1060 NULL, &links, sizeof (links)); 1061 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), 1062 NULL, &dzp->z_size, sizeof (dzp->z_size)); 1063 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), 1064 NULL, ctime, sizeof (ctime)); 1065 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), 1066 NULL, mtime, sizeof (mtime)); 1067 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), 1068 NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); 1069 zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); 1070 error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); 1071 ASSERT(error == 0); 1072 mutex_exit(&dzp->z_lock); 1073 1074 if (unlinkedp != NULL) 1075 *unlinkedp = unlinked; 1076 else if (unlinked) 1077 zfs_unlinked_add(zp, tx); 1078 1079 return (0); 1080 } 1081 1082 /* 1083 * Indicate whether the directory is empty. Works with or without z_lock 1084 * held, but can only be consider a hint in the latter case. Returns true 1085 * if only "." and ".." remain and there's no work in progress. 1086 * 1087 * The internal ZAP size, rather than zp->z_size, needs to be checked since 1088 * some consumers (Lustre) do not strictly maintain an accurate SA_ZPL_SIZE. 1089 */ 1090 boolean_t 1091 zfs_dirempty(znode_t *dzp) 1092 { 1093 zfsvfs_t *zfsvfs = ZTOZSB(dzp); 1094 uint64_t count; 1095 int error; 1096 1097 if (dzp->z_dirlocks != NULL) 1098 return (B_FALSE); 1099 1100 error = zap_count(zfsvfs->z_os, dzp->z_id, &count); 1101 if (error != 0 || count != 0) 1102 return (B_FALSE); 1103 1104 return (B_TRUE); 1105 } 1106 1107 int 1108 zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr) 1109 { 1110 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1111 znode_t *xzp; 1112 dmu_tx_t *tx; 1113 int error; 1114 zfs_acl_ids_t acl_ids; 1115 boolean_t fuid_dirtied; 1116 #ifdef ZFS_DEBUG 1117 uint64_t parent; 1118 #endif 1119 1120 *xzpp = NULL; 1121 1122 if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, 1123 &acl_ids, kcred->user_ns)) != 0) 1124 return (error); 1125 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) { 1126 zfs_acl_ids_free(&acl_ids); 1127 return (SET_ERROR(EDQUOT)); 1128 } 1129 1130 tx = dmu_tx_create(zfsvfs->z_os); 1131 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 1132 ZFS_SA_BASE_ATTR_SIZE); 1133 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 1134 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 1135 fuid_dirtied = zfsvfs->z_fuid_dirty; 1136 if (fuid_dirtied) 1137 zfs_fuid_txhold(zfsvfs, tx); 1138 error = dmu_tx_assign(tx, TXG_WAIT); 1139 if (error) { 1140 zfs_acl_ids_free(&acl_ids); 1141 dmu_tx_abort(tx); 1142 return (error); 1143 } 1144 zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); 1145 1146 if (fuid_dirtied) 1147 zfs_fuid_sync(zfsvfs, tx); 1148 1149 #ifdef ZFS_DEBUG 1150 error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 1151 &parent, sizeof (parent)); 1152 ASSERT(error == 0 && parent == zp->z_id); 1153 #endif 1154 1155 VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, 1156 sizeof (xzp->z_id), tx)); 1157 1158 if (!zp->z_unlinked) 1159 zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "", NULL, 1160 acl_ids.z_fuidp, vap); 1161 1162 zfs_acl_ids_free(&acl_ids); 1163 dmu_tx_commit(tx); 1164 1165 *xzpp = xzp; 1166 1167 return (0); 1168 } 1169 1170 /* 1171 * Return a znode for the extended attribute directory for zp. 1172 * ** If the directory does not already exist, it is created ** 1173 * 1174 * IN: zp - znode to obtain attribute directory from 1175 * cr - credentials of caller 1176 * flags - flags from the VOP_LOOKUP call 1177 * 1178 * OUT: xipp - pointer to extended attribute znode 1179 * 1180 * RETURN: 0 on success 1181 * error number on failure 1182 */ 1183 int 1184 zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags) 1185 { 1186 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1187 znode_t *xzp; 1188 zfs_dirlock_t *dl; 1189 vattr_t va; 1190 int error; 1191 top: 1192 error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL); 1193 if (error) 1194 return (error); 1195 1196 if (xzp != NULL) { 1197 *xzpp = xzp; 1198 zfs_dirent_unlock(dl); 1199 return (0); 1200 } 1201 1202 if (!(flags & CREATE_XATTR_DIR)) { 1203 zfs_dirent_unlock(dl); 1204 return (SET_ERROR(ENOENT)); 1205 } 1206 1207 if (zfs_is_readonly(zfsvfs)) { 1208 zfs_dirent_unlock(dl); 1209 return (SET_ERROR(EROFS)); 1210 } 1211 1212 /* 1213 * The ability to 'create' files in an attribute 1214 * directory comes from the write_xattr permission on the base file. 1215 * 1216 * The ability to 'search' an attribute directory requires 1217 * read_xattr permission on the base file. 1218 * 1219 * Once in a directory the ability to read/write attributes 1220 * is controlled by the permissions on the attribute file. 1221 */ 1222 va.va_mask = ATTR_MODE | ATTR_UID | ATTR_GID; 1223 va.va_mode = S_IFDIR | S_ISVTX | 0777; 1224 zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); 1225 1226 va.va_dentry = NULL; 1227 error = zfs_make_xattrdir(zp, &va, xzpp, cr); 1228 zfs_dirent_unlock(dl); 1229 1230 if (error == ERESTART) { 1231 /* NB: we already did dmu_tx_wait() if necessary */ 1232 goto top; 1233 } 1234 1235 return (error); 1236 } 1237 1238 /* 1239 * Decide whether it is okay to remove within a sticky directory. 1240 * 1241 * In sticky directories, write access is not sufficient; 1242 * you can remove entries from a directory only if: 1243 * 1244 * you own the directory, 1245 * you own the entry, 1246 * you have write access to the entry, 1247 * or you are privileged (checked in secpolicy...). 1248 * 1249 * The function returns 0 if remove access is granted. 1250 */ 1251 int 1252 zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) 1253 { 1254 uid_t uid; 1255 uid_t downer; 1256 uid_t fowner; 1257 zfsvfs_t *zfsvfs = ZTOZSB(zdp); 1258 1259 if (zfsvfs->z_replay) 1260 return (0); 1261 1262 if ((zdp->z_mode & S_ISVTX) == 0) 1263 return (0); 1264 1265 downer = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zdp)->i_uid), 1266 cr, ZFS_OWNER); 1267 fowner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zp)->i_uid), 1268 cr, ZFS_OWNER); 1269 1270 if ((uid = crgetuid(cr)) == downer || uid == fowner || 1271 zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, 1272 kcred->user_ns) == 0) 1273 return (0); 1274 else 1275 return (secpolicy_vnode_remove(cr)); 1276 } 1277