1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * University Copyright- Copyright (c) 1982, 1986, 1988 32 * The Regents of the University of California 33 * All Rights Reserved 34 * 35 * University Acknowledgment- Portions of this document are derived from 36 * software developed by the University of California, Berkeley, and its 37 * contributors. 38 */ 39 40 41 #pragma ident "%Z%%M% %I% %E% SMI" 42 43 /* 44 * Directory manipulation routines. 45 * 46 * When manipulating directories, the i_rwlock provides serialization 47 * since directories cannot be mmapped. The i_contents lock is redundant. 
48 */ 49 50 #include <sys/types.h> 51 #include <sys/t_lock.h> 52 #include <sys/param.h> 53 #include <sys/systm.h> 54 #include <sys/signal.h> 55 #include <sys/cred.h> 56 #include <sys/proc.h> 57 #include <sys/disp.h> 58 #include <sys/user.h> 59 #include <sys/vfs.h> 60 #include <sys/vnode.h> 61 #include <sys/stat.h> 62 #include <sys/mode.h> 63 #include <sys/buf.h> 64 #include <sys/uio.h> 65 #include <sys/dnlc.h> 66 #include <sys/fs/ufs_inode.h> 67 #include <sys/fs/ufs_fs.h> 68 #include <sys/mount.h> 69 #include <sys/fs/ufs_fsdir.h> 70 #include <sys/fs/ufs_trans.h> 71 #include <sys/fs/ufs_panic.h> 72 #include <sys/fs/ufs_quota.h> 73 #include <sys/errno.h> 74 #include <sys/debug.h> 75 #include <vm/seg.h> 76 #include <sys/sysmacros.h> 77 #include <sys/cmn_err.h> 78 #include <sys/cpuvar.h> 79 #include <sys/unistd.h> 80 #include <sys/policy.h> 81 82 /* 83 * This is required since we're using P2ROUNDUP_TYPED on DIRBLKSIZ 84 */ 85 #if !ISP2(DIRBLKSIZ) 86 #error "DIRBLKSIZ not a power of 2" 87 #endif 88 89 /* 90 * A virgin directory. 91 */ 92 static struct dirtemplate mastertemplate = { 93 0, 12, 1, ".", 94 0, DIRBLKSIZ - 12, 2, ".." 95 }; 96 97 #define LDIRSIZ(len) \ 98 ((sizeof (struct direct) - (MAXNAMLEN + 1)) + ((len + 1 + 3) &~ 3)) 99 #define MAX_DIR_NAME_LEN(len) \ 100 (((len) - (sizeof (struct direct) - (MAXNAMLEN + 1))) - 1) 101 102 /* 103 * The dnlc directory cache allows a 64 bit handle for directory entries. 104 * For ufs we squeeze both the 32 bit inumber and a 32 bit disk offset 105 * into the handle. Note, a 32 bit offset allows a 4GB directory, which 106 * is way beyond what could be cached in memory by the directory 107 * caching routines. So we are quite safe with this limit. 108 * The macros below pack and unpack the handle. 
109 */ 110 #define H_TO_INO(h) (uint32_t)((h) & UINT_MAX) 111 #define H_TO_OFF(h) (off_t)((h) >> 32) 112 #define INO_OFF_TO_H(ino, off) (uint64_t)(((uint64_t)(off) << 32) | (ino)) 113 114 /* 115 * The average size of a typical on disk directory entry is about 16 bytes 116 * and so defines AV_DIRECT_SHIFT : log2(16) 117 * This define is only used to approximate the number of entries 118 * is a directory. This is needed for dnlc_dir_start() which will immediately 119 * return an error if the value is not within its acceptable range of 120 * number of files in a directory. 121 */ 122 #define AV_DIRECT_SHIFT 4 123 /* 124 * If the directory size (from i_size) is greater than the ufs_min_dir_cache 125 * tunable then we request dnlc directory caching. 126 * This has found to be profitable after 1024 file names. 127 */ 128 int ufs_min_dir_cache = 1024 << AV_DIRECT_SHIFT; 129 130 /* The time point the dnlc directory caching was disabled */ 131 static hrtime_t ufs_dc_disable_at; 132 /* directory caching disable duration */ 133 static hrtime_t ufs_dc_disable_duration = (hrtime_t)NANOSEC * 5; 134 135 #ifdef DEBUG 136 int dirchk = 1; 137 #else /* !DEBUG */ 138 int dirchk = 0; 139 #endif /* DEBUG */ 140 int ufs_negative_cache = 1; 141 uint64_t ufs_dirremove_retry_cnt; 142 143 static void dirbad(); 144 static int ufs_dirrename(); 145 static int ufs_diraddentry(); 146 static int ufs_dirempty(); 147 static int ufs_dirscan(); 148 static int ufs_dirclrdotdot(); 149 static int ufs_dirfixdotdot(); 150 static int ufs_dirpurgedotdot(); 151 static int dirprepareentry(); 152 static int ufs_dirmakedirect(); 153 static int dirbadname(); 154 static int dirmangled(); 155 156 /* 157 * Look for a given name in a directory. On successful return, *ipp 158 * will point to the VN_HELD inode. 
159 */ 160 int 161 ufs_dirlook( 162 struct inode *dp, 163 char *namep, 164 struct inode **ipp, 165 struct cred *cr, 166 int skipdnlc) /* skip the 1st level dnlc */ 167 { 168 uint64_t handle; 169 struct fbuf *fbp; /* a buffer of directory entries */ 170 struct direct *ep; /* the current directory entry */ 171 struct vnode *vp; 172 struct vnode *dvp; /* directory vnode ptr */ 173 dcanchor_t *dcap; 174 off_t endsearch; /* offset to end directory search */ 175 off_t offset; 176 off_t start_off; /* starting offset from middle search */ 177 off_t last_offset; /* last offset */ 178 int entryoffsetinblock; /* offset of ep in addr's buffer */ 179 int numdirpasses; /* strategy for directory search */ 180 int namlen; /* length of name */ 181 int err; 182 int doingchk; 183 int i; 184 int caching; 185 ino_t ep_ino; /* entry i number */ 186 ino_t chkino; 187 ushort_t ep_reclen; /* direct local d_reclen */ 188 189 ASSERT(*namep != '\0'); /* All callers ensure *namep is non null */ 190 191 /* 192 * Check accessibility of directory. 193 */ 194 if (((dp->i_mode & IFMT) != IFDIR) && 195 ((dp->i_mode & IFMT) != IFATTRDIR)) 196 return (ENOTDIR); 197 198 if (err = ufs_iaccess(dp, IEXEC, cr)) 199 return (err); 200 201 /* 202 * Check the directory name lookup cache, first for individual files 203 * then for complete directories. 204 */ 205 dvp = ITOV(dp); 206 if (!skipdnlc && (vp = dnlc_lookup(dvp, namep))) { 207 /* vp is already held from dnlc_lookup */ 208 if (vp == DNLC_NO_VNODE) { 209 VN_RELE(vp); 210 return (ENOENT); 211 } 212 *ipp = VTOI(vp); 213 return (0); 214 } 215 216 dcap = &dp->i_danchor; 217 218 /* 219 * Grab the reader lock on the directory data before checking 220 * the dnlc to avoid a race with ufs_dirremove() & friends. 221 */ 222 rw_enter(&dp->i_rwlock, RW_READER); 223 224 switch (dnlc_dir_lookup(dcap, namep, &handle)) { 225 case DFOUND: 226 ep_ino = (ino_t)H_TO_INO(handle); 227 if (dp->i_number == ep_ino) { 228 VN_HOLD(dvp); /* want ourself, "." 
*/ 229 *ipp = dp; 230 rw_exit(&dp->i_rwlock); 231 return (0); 232 } 233 if (namep[0] == '.' && namep[1] == '.' && namep[2] == 0) { 234 uint64_t handle2; 235 /* 236 * release the lock on the dir we are searching 237 * to avoid a deadlock when grabbing the 238 * i_contents lock in ufs_iget_alloced(). 239 */ 240 rw_exit(&dp->i_rwlock); 241 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER); 242 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr); 243 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 244 /* 245 * must recheck as we dropped dp->i_rwlock 246 */ 247 rw_enter(&dp->i_rwlock, RW_READER); 248 if (!err && (dnlc_dir_lookup(dcap, namep, &handle2) 249 == DFOUND) && (handle == handle2)) { 250 dnlc_update(dvp, namep, ITOV(*ipp)); 251 rw_exit(&dp->i_rwlock); 252 return (0); 253 } 254 /* check failed, read the actual directory */ 255 if (!err) { 256 VN_RELE(ITOV(*ipp)); 257 } 258 goto restart; 259 } 260 /* usual case of not "." nor ".." */ 261 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER); 262 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr); 263 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 264 if (err) { 265 rw_exit(&dp->i_rwlock); 266 return (err); 267 } 268 dnlc_update(dvp, namep, ITOV(*ipp)); 269 rw_exit(&dp->i_rwlock); 270 return (0); 271 case DNOENT: 272 if (ufs_negative_cache && (dp->i_nlink > 0)) { 273 dnlc_enter(dvp, namep, DNLC_NO_VNODE); 274 } 275 rw_exit(&dp->i_rwlock); 276 return (ENOENT); 277 default: 278 break; 279 } 280 restart: 281 282 fbp = NULL; 283 doingchk = 0; 284 chkino = 0; 285 caching = 0; 286 287 /* 288 * Attempt to cache any directories greater than the tunable 289 * ufs_min_cache_dir. If it fails due to memory shortage (DNOMEM), 290 * disable caching for this directory and record the system time. 291 * Any attempt after the disable time has expired will enable 292 * the caching again. 293 */ 294 if (dp->i_size >= ufs_min_dir_cache) { 295 /* 296 * if the directory caching disable time has expired 297 * enable the caching again. 
298 */ 299 if (dp->i_cachedir == CD_DISABLED_NOMEM && 300 gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) { 301 ufs_dc_disable_at = 0; 302 dp->i_cachedir = CD_ENABLED; 303 } 304 if (dp->i_cachedir == CD_ENABLED) { 305 switch (dnlc_dir_start(dcap, dp->i_size >> 306 AV_DIRECT_SHIFT)) { 307 case DNOMEM: 308 dp->i_cachedir = CD_DISABLED_NOMEM; 309 ufs_dc_disable_at = gethrtime(); 310 break; 311 case DTOOBIG: 312 dp->i_cachedir = CD_DISABLED_TOOBIG; 313 break; 314 case DOK: 315 caching = 1; 316 break; 317 default: 318 break; 319 } 320 } 321 } 322 /* 323 * If caching we don't stop when the file has been 324 * found, but need to know later, so clear *ipp now 325 */ 326 *ipp = NULL; 327 328 recheck: 329 if (caching) { 330 offset = 0; 331 entryoffsetinblock = 0; 332 numdirpasses = 1; 333 } else { 334 /* 335 * Take care to look at dp->i_diroff only once, as it 336 * may be changing due to other threads/cpus. 337 */ 338 offset = dp->i_diroff; 339 if (offset > dp->i_size) { 340 offset = 0; 341 } 342 if (offset == 0) { 343 entryoffsetinblock = 0; 344 numdirpasses = 1; 345 } else { 346 start_off = offset; 347 348 entryoffsetinblock = blkoff(dp->i_fs, offset); 349 if (entryoffsetinblock != 0) { 350 err = blkatoff(dp, offset, (char **)0, &fbp); 351 if (err) 352 goto bad; 353 } 354 numdirpasses = 2; 355 } 356 } 357 endsearch = P2ROUNDUP_TYPED(dp->i_size, DIRBLKSIZ, u_offset_t); 358 namlen = strlen(namep); 359 last_offset = 0; 360 361 searchloop: 362 while (offset < endsearch) { 363 /* 364 * If offset is on a block boundary, 365 * read the next directory block. 366 * Release previous if it exists. 
367 */ 368 if (blkoff(dp->i_fs, offset) == 0) { 369 if (fbp != NULL) { 370 fbrelse(fbp, S_OTHER); 371 } 372 err = blkatoff(dp, offset, (char **)0, &fbp); 373 if (err) 374 goto bad; 375 entryoffsetinblock = 0; 376 } 377 378 /* 379 * If the offset to the next entry is invalid or if the 380 * next entry is a zero length record or if the record 381 * length is invalid, then skip to the next directory 382 * block. Complete validation checks are done if the 383 * record length is invalid. 384 * 385 * Full validation checks are slow so they are disabled 386 * by default. Complete checks can be run by patching 387 * "dirchk" to be true. 388 * 389 * We have to check the validity of entryoffsetinblock 390 * here because it can be set to i_diroff above. 391 */ 392 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblock); 393 if ((entryoffsetinblock & 0x3) || ep->d_reclen == 0 || 394 (dirchk || (ep->d_reclen & 0x3)) && 395 dirmangled(dp, ep, entryoffsetinblock, offset)) { 396 i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); 397 offset += i; 398 entryoffsetinblock += i; 399 if (caching) { 400 dnlc_dir_purge(dcap); 401 caching = 0; 402 } 403 continue; 404 } 405 406 ep_reclen = ep->d_reclen; 407 408 /* 409 * Add named entries and free space into the directory cache 410 */ 411 if (caching) { 412 ushort_t extra; 413 off_t off2; 414 415 if (ep->d_ino == 0) { 416 extra = ep_reclen; 417 if (offset & (DIRBLKSIZ - 1)) { 418 dnlc_dir_purge(dcap); 419 dp->i_cachedir = CD_DISABLED; 420 caching = 0; 421 } 422 } else { 423 /* 424 * entries hold the previous offset except the 425 * 1st which holds the offset + 1 426 */ 427 if (offset & (DIRBLKSIZ - 1)) { 428 off2 = last_offset; 429 } else { 430 off2 = offset + 1; 431 } 432 caching = (dnlc_dir_add_entry(dcap, ep->d_name, 433 INO_OFF_TO_H(ep->d_ino, off2)) == DOK); 434 extra = ep_reclen - DIRSIZ(ep); 435 } 436 if (caching && (extra >= LDIRSIZ(1))) { 437 caching = (dnlc_dir_add_space(dcap, extra, 438 (uint64_t)offset) == DOK); 439 } 440 
} 441 442 /* 443 * Check for a name match. 444 * We have the parent inode read locked with i_rwlock. 445 */ 446 if (ep->d_ino && ep->d_namlen == namlen && 447 *namep == *ep->d_name && /* fast chk 1st chr */ 448 bcmp(namep, ep->d_name, (int)ep->d_namlen) == 0) { 449 450 /* 451 * We have to release the fbp early here to avoid 452 * a possible deadlock situation where we have the 453 * fbp and want the directory inode and someone doing 454 * a ufs_direnter_* has the directory inode and wants 455 * the fbp. XXX - is this still needed? 456 */ 457 ep_ino = (ino_t)ep->d_ino; 458 ASSERT(fbp != NULL); 459 fbrelse(fbp, S_OTHER); 460 fbp = NULL; 461 462 /* 463 * Atomic update (read lock held) 464 */ 465 dp->i_diroff = offset; 466 467 if (namlen == 2 && namep[0] == '.' && namep[1] == '.') { 468 struct timeval32 omtime; 469 470 if (caching) { 471 dnlc_dir_purge(dcap); 472 caching = 0; 473 } 474 if (doingchk) { 475 /* 476 * if the inumber didn't change 477 * continue with already found inode. 478 */ 479 if (ep_ino == chkino) 480 goto checkok; 481 else { 482 VN_RELE(ITOV(*ipp)); 483 /* *ipp is nulled at restart */ 484 goto restart; 485 } 486 } 487 /* 488 * release the lock on the dir we are searching 489 * to avoid a deadlock when grabbing the 490 * i_contents lock in ufs_iget_alloced(). 491 */ 492 omtime = dp->i_mtime; 493 rw_exit(&dp->i_rwlock); 494 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, 495 RW_READER); 496 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, 497 cr); 498 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 499 rw_enter(&dp->i_rwlock, RW_READER); 500 if (err) 501 goto bad; 502 /* 503 * Since we released the lock on the directory, 504 * we must check that the same inode is still 505 * the ".." entry for this directory. 506 */ 507 /*CSTYLED*/ 508 if (timercmp(&omtime, &dp->i_mtime, !=)) { 509 /* 510 * Modification time changed on the 511 * directory, we must go check if 512 * the inumber changed for ".." 
513 */ 514 doingchk = 1; 515 chkino = ep_ino; 516 entryoffsetinblock = 0; 517 if (caching) { 518 /* 519 * Forget directory caching 520 * for this rare case 521 */ 522 dnlc_dir_purge(dcap); 523 caching = 0; 524 } 525 goto recheck; 526 } 527 } else if (dp->i_number == ep_ino) { 528 VN_HOLD(dvp); /* want ourself, "." */ 529 *ipp = dp; 530 if (caching) { 531 dnlc_dir_purge(dcap); 532 caching = 0; 533 } 534 } else { 535 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, 536 RW_READER); 537 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, 538 cr); 539 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 540 if (err) 541 goto bad; 542 } 543 checkok: 544 ASSERT(*ipp); 545 dnlc_update(dvp, namep, ITOV(*ipp)); 546 /* 547 * If we are not caching then just return the entry 548 * otherwise complete loading up the cache 549 */ 550 if (!caching) { 551 rw_exit(&dp->i_rwlock); 552 return (0); 553 } 554 err = blkatoff(dp, offset, (char **)0, &fbp); 555 if (err) 556 goto bad; 557 } 558 last_offset = offset; 559 offset += ep_reclen; 560 entryoffsetinblock += ep_reclen; 561 } 562 /* 563 * If we started in the middle of the directory and failed 564 * to find our target, we must check the beginning as well. 565 */ 566 if (numdirpasses == 2) { 567 numdirpasses--; 568 offset = 0; 569 endsearch = start_off; 570 goto searchloop; 571 } 572 573 /* 574 * If whole directory caching is on (or was originally on) then 575 * the entry may have been found. 576 */ 577 if (*ipp == NULL) { 578 err = ENOENT; 579 if (ufs_negative_cache && (dp->i_nlink > 0)) { 580 dnlc_enter(dvp, namep, DNLC_NO_VNODE); 581 } 582 } 583 if (caching) { 584 dnlc_dir_complete(dcap); 585 caching = 0; 586 } 587 588 bad: 589 if (err && *ipp) { 590 /* 591 * err and *ipp can both be set if we were attempting to 592 * cache the directory, and we found the entry, then later 593 * while trying to complete the directory cache encountered 594 * a error (eg reading a directory sector). 
595 */ 596 VN_RELE(ITOV(*ipp)); 597 *ipp = NULL; 598 } 599 600 if (fbp) 601 fbrelse(fbp, S_OTHER); 602 rw_exit(&dp->i_rwlock); 603 if (caching) 604 dnlc_dir_purge(dcap); 605 return (err); 606 } 607 608 /* 609 * Write a new directory entry for DE_CREATE or DE_MKDIR operations. 610 */ 611 int 612 ufs_direnter_cm( 613 struct inode *tdp, /* target directory to make entry in */ 614 char *namep, /* name of entry */ 615 enum de_op op, /* entry operation */ 616 struct vattr *vap, /* attributes if new inode needed */ 617 struct inode **ipp, /* return entered inode here */ 618 struct cred *cr, /* user credentials */ 619 int flags) /* no entry exists */ 620 { 621 struct inode *tip; /* inode of (existing) target file */ 622 char *s; 623 struct slot slot; /* slot info to pass around */ 624 int namlen; /* length of name */ 625 int err; /* error number */ 626 struct inode *nip; /* new inode */ 627 int do_rele_nip = 0; /* release nip */ 628 int noentry = flags & ~IQUIET; 629 int quiet = flags & IQUIET; /* Suppress out of inodes message */ 630 631 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 632 633 if (((tdp->i_mode & IFMT) == IFATTRDIR) && ((op == DE_MKDIR) || 634 ((vap->va_type == VCHR) || (vap->va_type == VBLK) || 635 (vap->va_type == VDOOR) || (vap->va_type == VSOCK) || 636 (vap->va_type == VFIFO)))) 637 return (EINVAL); 638 639 /* don't allow '/' characters in pathname component */ 640 for (s = namep, namlen = 0; *s; s++, namlen++) 641 if (*s == '/') 642 return (EACCES); 643 ASSERT(namlen); 644 645 /* 646 * If name is "." or ".." then if this is a create look it up 647 * and return EEXIST. 648 */ 649 if (namep[0] == '.' 
&& 650 (namlen == 1 || (namlen == 2 && namep[1] == '.'))) { 651 /* 652 * ufs_dirlook will acquire the i_rwlock 653 */ 654 rw_exit(&tdp->i_rwlock); 655 if (err = ufs_dirlook(tdp, namep, ipp, cr, 0)) { 656 rw_enter(&tdp->i_rwlock, RW_WRITER); 657 return (err); 658 } 659 rw_enter(&tdp->i_rwlock, RW_WRITER); 660 return (EEXIST); 661 } 662 663 /* 664 * If target directory has not been removed, then we can consider 665 * allowing file to be created. 666 */ 667 if (tdp->i_nlink <= 0) { 668 return (ENOENT); 669 } 670 671 /* 672 * Check accessibility of directory. 673 */ 674 if (((tdp->i_mode & IFMT) != IFDIR) && 675 ((tdp->i_mode & IFMT) != IFATTRDIR)) { 676 return (ENOTDIR); 677 } 678 679 /* 680 * Execute access is required to search the directory. 681 */ 682 if (err = ufs_iaccess(tdp, IEXEC, cr)) { 683 return (err); 684 } 685 686 /* 687 * Search for the entry. Return VN_HELD tip if found. 688 */ 689 tip = NULL; 690 slot.fbp = NULL; 691 slot.status = NONE; 692 rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER); 693 rw_enter(&tdp->i_contents, RW_WRITER); 694 err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, noentry); 695 if (err) 696 goto out; 697 if (tip) { 698 ASSERT(!noentry); 699 *ipp = tip; 700 err = EEXIST; 701 } else { 702 /* 703 * The entry does not exist. Check write permission in 704 * directory to see if entry can be created. 705 */ 706 if (err = ufs_iaccess(tdp, IWRITE, cr)) 707 goto out; 708 /* 709 * Make new inode and directory entry. 710 */ 711 tdp->i_flag |= quiet; 712 if (err = ufs_dirmakeinode(tdp, &nip, vap, op, cr)) { 713 if (nip != NULL) 714 do_rele_nip = 1; 715 goto out; 716 } 717 if (err = ufs_diraddentry(tdp, namep, op, 718 namlen, &slot, nip, NULL, cr)) { 719 /* 720 * Unmake the inode we just made. 
721 */ 722 rw_enter(&nip->i_contents, RW_WRITER); 723 if (((nip->i_mode & IFMT) == IFDIR) || 724 ((nip->i_mode & IFMT) == IFATTRDIR)) { 725 tdp->i_nlink--; 726 ufs_setreclaim(tdp); 727 tdp->i_flag |= ICHG; 728 tdp->i_seq++; 729 TRANS_INODE(tdp->i_ufsvfs, tdp); 730 ITIMES_NOLOCK(tdp); 731 } 732 nip->i_nlink = 0; 733 ufs_setreclaim(nip); 734 TRANS_INODE(nip->i_ufsvfs, nip); 735 nip->i_flag |= ICHG; 736 nip->i_seq++; 737 ITIMES_NOLOCK(nip); 738 rw_exit(&nip->i_contents); 739 do_rele_nip = 1; 740 } else { 741 *ipp = nip; 742 } 743 } 744 745 out: 746 if (slot.fbp) 747 fbrelse(slot.fbp, S_OTHER); 748 749 tdp->i_flag &= ~quiet; 750 rw_exit(&tdp->i_contents); 751 752 /* 753 * Drop vfs_dqrwlock before calling VN_RELE() on nip to 754 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader. 755 */ 756 rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock); 757 758 if (do_rele_nip) { 759 VN_RELE(ITOV(nip)); 760 } 761 762 return (err); 763 } 764 765 /* 766 * Write a new directory entry for DE_LINK, DE_SYMLINK or DE_RENAME operations. 767 * If tvpp is non-null, return with the pointer to the target vnode. 768 */ 769 int 770 ufs_direnter_lr( 771 struct inode *tdp, /* target directory to make entry in */ 772 char *namep, /* name of entry */ 773 enum de_op op, /* entry operation */ 774 struct inode *sdp, /* source inode parent if rename */ 775 struct inode *sip, /* source inode */ 776 struct cred *cr, /* user credentials */ 777 vnode_t **tvpp) /* Return: (held) vnode of (existing) target */ 778 { 779 struct inode *tip; /* inode of (existing) target file */ 780 char *s; 781 struct slot slot; /* slot info to pass around */ 782 int namlen; /* length of name */ 783 int err; /* error number */ 784 785 /* don't allow '/' characters in pathname component */ 786 for (s = namep, namlen = 0; *s; s++, namlen++) 787 if (*s == '/') 788 return (EACCES); 789 ASSERT(namlen); 790 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 791 792 /* 793 * If name is "." or ".." 
then if this is a create look it up 794 * and return EEXIST. Rename or link TO "." or ".." is forbidden. 795 */ 796 if (namep[0] == '.' && 797 (namlen == 1 || (namlen == 2 && namep[1] == '.'))) { 798 if (op == DE_RENAME) { 799 return (EINVAL); /* *SIGH* should be ENOTEMPTY */ 800 } 801 return (EEXIST); 802 } 803 /* 804 * For link and rename lock the source entry and check the link count 805 * to see if it has been removed while it was unlocked. If not, we 806 * increment the link count and force the inode to disk to make sure 807 * that it is there before any directory entry that points to it. 808 * 809 * In the case of a symbolic link, we are dealing with a new inode 810 * which does not yet have any links. We've created it with a link 811 * count of 1, and we don't want to increment it since this will be 812 * its first link. 813 * 814 * We are about to push the inode to disk. We make sure 815 * that the inode's data blocks are flushed first so the 816 * inode and it's data blocks are always in sync. This 817 * adds some robustness in in the event of a power failure 818 * or panic where sync fails. If we panic before the 819 * inode is updated, then the inode still refers to the 820 * old data blocks (or none for a new file). If we panic 821 * after the inode is updated, then the inode refers to 822 * the new data blocks. 823 * 824 * We do this before grabbing the i_contents lock because 825 * ufs_syncip() will want that lock. We could do the data 826 * syncing after the removal checks, but upon return from 827 * the data sync we would have to repeat the removal 828 * checks. 
829 */ 830 if (err = TRANS_SYNCIP(sip, 0, I_DSYNC, TOP_FSYNC)) { 831 return (err); 832 } 833 834 rw_enter(&sip->i_contents, RW_WRITER); 835 if (sip->i_nlink <= 0) { 836 rw_exit(&sip->i_contents); 837 return (ENOENT); 838 } 839 if (sip->i_nlink == MAXLINK) { 840 rw_exit(&sip->i_contents); 841 return (EMLINK); 842 } 843 844 /* 845 * Sync the indirect blocks associated with the file 846 * for the same reasons as described above. Since this 847 * call wants the i_contents lock held for it we can do 848 * this here with no extra work. 849 */ 850 if (err = ufs_sync_indir(sip)) { 851 rw_exit(&sip->i_contents); 852 return (err); 853 } 854 855 if (op != DE_SYMLINK) 856 sip->i_nlink++; 857 TRANS_INODE(sip->i_ufsvfs, sip); 858 sip->i_flag |= ICHG; 859 sip->i_seq++; 860 ufs_iupdat(sip, I_SYNC); 861 rw_exit(&sip->i_contents); 862 863 /* 864 * If target directory has not been removed, then we can consider 865 * allowing file to be created. 866 */ 867 if (tdp->i_nlink <= 0) { 868 err = ENOENT; 869 goto out2; 870 } 871 /* 872 * Check accessibility of directory. 873 */ 874 if (((tdp->i_mode & IFMT) != IFDIR) && 875 (tdp->i_mode & IFMT) != IFATTRDIR) { 876 err = ENOTDIR; 877 goto out2; 878 } 879 /* 880 * Execute access is required to search the directory. 881 */ 882 if (err = ufs_iaccess(tdp, IEXEC, cr)) { 883 goto out2; 884 } 885 886 /* 887 * Search for the entry. Return VN_HELD tip if found. 888 */ 889 tip = NULL; 890 slot.status = NONE; 891 slot.fbp = NULL; 892 rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER); 893 rw_enter(&tdp->i_contents, RW_WRITER); 894 err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, 0); 895 if (err) 896 goto out; 897 898 if (tip) { 899 switch (op) { 900 case DE_RENAME: 901 err = ufs_dirrename(sdp, sip, tdp, namep, 902 tip, &slot, cr); 903 break; 904 905 case DE_LINK: 906 case DE_SYMLINK: 907 /* 908 * Can't link to an existing file. 
909 */ 910 err = EEXIST; 911 break; 912 default: 913 break; 914 } 915 } else { 916 /* 917 * The entry does not exist. Check write permission in 918 * directory to see if entry can be created. 919 */ 920 if (err = ufs_iaccess(tdp, IWRITE, cr)) 921 goto out; 922 err = ufs_diraddentry(tdp, namep, op, namlen, &slot, sip, sdp, 923 cr); 924 } 925 926 out: 927 if (slot.fbp) 928 fbrelse(slot.fbp, S_OTHER); 929 930 rw_exit(&tdp->i_contents); 931 932 /* 933 * Drop vfs_dqrwlock before calling VN_RELE() on tip to 934 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader. 935 */ 936 rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock); 937 938 /* 939 * If we renamed a file over the top of an existing file, 940 * or linked a file to an existing file (or tried to), 941 * then set *tvpp to the target vnode, if tvpp is non-null 942 * otherwise, release and delete (or just release) the inode. 943 * 944 * N.B., by returning the target's vnode pointer to the caller, 945 * that caller becomes responsible for doing the VN_RELE. 946 */ 947 if (tip) { 948 if ((err == 0) && (tvpp != NULL)) { 949 *tvpp = ITOV(tip); 950 } else { 951 VN_RELE(ITOV(tip)); 952 } 953 } 954 955 out2: 956 if (err) { 957 /* 958 * Undo bumped link count. 959 */ 960 if (op != DE_SYMLINK) { 961 rw_enter(&sip->i_contents, RW_WRITER); 962 sip->i_nlink--; 963 ufs_setreclaim(sip); 964 TRANS_INODE(sip->i_ufsvfs, sip); 965 sip->i_flag |= ICHG; 966 sip->i_seq++; 967 ITIMES_NOLOCK(sip); 968 rw_exit(&sip->i_contents); 969 } 970 } 971 return (err); 972 } 973 974 /* 975 * Check for the existence of a name in a directory (unless noentry 976 * is set) , or else of an empty 977 * slot in which an entry may be made. If the requested name is found, 978 * then on return *ipp points at the inode and *offp contains 979 * its offset in the directory. 
If the name is not found, then *ipp 980 * will be NULL and *slotp will contain information about a directory slot in 981 * which an entry may be made (either an empty slot, or the first position 982 * past the end of the directory). 983 * The target directory inode (tdp) is supplied write locked (i_rwlock). 984 * 985 * This may not be used on "." or "..", but aliases of "." are ok. 986 */ 987 int 988 ufs_dircheckforname( 989 struct inode *tdp, /* inode of directory being checked */ 990 char *namep, /* name we're checking for */ 991 int namlen, /* length of name, excluding null */ 992 struct slot *slotp, /* slot structure */ 993 struct inode **ipp, /* return inode if we find one */ 994 struct cred *cr, 995 int noentry) /* noentry - just look for space */ 996 { 997 uint64_t handle; 998 struct fbuf *fbp; /* pointer to directory block */ 999 struct direct *ep; /* directory entry */ 1000 struct direct *nep; /* next directory entry */ 1001 dcanchor_t *dcap; 1002 vnode_t *dvp; /* directory vnode ptr */ 1003 off_t dirsize; /* size of the directory */ 1004 off_t offset; /* offset in the directory */ 1005 off_t last_offset; /* last offset */ 1006 off_t enduseful; /* pointer past last used dir slot */ 1007 int entryoffsetinblk; /* offset of ep in fbp's buffer */ 1008 int i; /* length of mangled entry */ 1009 int needed; 1010 int err; 1011 int first; 1012 int caching; 1013 int stat; 1014 ino_t ep_ino; 1015 slotstat_t initstat = slotp->status; 1016 1017 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 1018 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 1019 ASSERT(*ipp == NULL); 1020 fbp = NULL; 1021 1022 /* 1023 * First check if there is a complete cache of the directory. 1024 */ 1025 dvp = ITOV(tdp); 1026 1027 dcap = &tdp->i_danchor; 1028 if (noentry) { 1029 /* 1030 * We know from the 1st level dnlc cache that the entry 1031 * doesn't exist, so don't bother searching the directory 1032 * cache, but just look for space (possibly in the directory 1033 * cache). 
1034 */ 1035 stat = DNOENT; 1036 } else { 1037 stat = dnlc_dir_lookup(dcap, namep, &handle); 1038 } 1039 switch (stat) { 1040 case DFOUND: 1041 ep_ino = (ino_t)H_TO_INO(handle); 1042 if (tdp->i_number == ep_ino) { 1043 *ipp = tdp; /* we want ourself, ie "." */ 1044 VN_HOLD(dvp); 1045 } else { 1046 err = ufs_iget_alloced(tdp->i_vfs, ep_ino, ipp, cr); 1047 if (err) 1048 return (err); 1049 } 1050 offset = H_TO_OFF(handle); 1051 first = 0; 1052 if (offset & 1) { 1053 /* This is the first entry in the block */ 1054 first = 1; 1055 offset -= 1; 1056 ASSERT((offset & (DIRBLKSIZ - 1)) == 0); 1057 } 1058 err = blkatoff(tdp, offset, (char **)&ep, &fbp); 1059 if (err) { 1060 VN_RELE(ITOV(*ipp)); 1061 *ipp = NULL; 1062 return (err); 1063 } 1064 /* 1065 * Check the validity of the entry. 1066 * If it's bad, then throw away the cache and 1067 * continue without it. The dirmangled() routine 1068 * will then be called upon it. 1069 */ 1070 if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) { 1071 VN_RELE(ITOV(*ipp)); 1072 *ipp = NULL; 1073 dnlc_dir_purge(dcap); 1074 break; 1075 } 1076 /* 1077 * Remember the returned offset is the offset of the 1078 * preceding record (unless this is the 1st record 1079 * in the DIRBLKSIZ sized block (disk sector)), then it's 1080 * offset + 1. Note, no real offsets are on odd boundaries. 1081 */ 1082 if (first) { 1083 ASSERT((offset & (DIRBLKSIZ - 1)) == 0); 1084 slotp->offset = offset; 1085 slotp->size = 0; 1086 slotp->ep = ep; 1087 } else { 1088 /* get the next entry */ 1089 nep = (struct direct *)((char *)ep + ep->d_reclen); 1090 /* 1091 * Check the validity of this entry as well 1092 * If it's bad, then throw away the cache and 1093 * continue without it. The dirmangled() routine 1094 * will then be called upon it. 
1095 */ 1096 if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) || 1097 (nep->d_ino != ep_ino)) { 1098 VN_RELE(ITOV(*ipp)); 1099 *ipp = NULL; 1100 dnlc_dir_purge(dcap); 1101 break; 1102 } 1103 slotp->offset = offset + ep->d_reclen; 1104 slotp->size = ep->d_reclen; 1105 slotp->ep = nep; 1106 } 1107 slotp->status = EXIST; 1108 slotp->fbp = fbp; 1109 slotp->endoff = 0; 1110 slotp->cached = 1; 1111 dnlc_update(dvp, namep, ITOV(*ipp)); 1112 return (0); 1113 case DNOENT: 1114 /* 1115 * The caller gets to set the initial slot status to 1116 * indicate whether it's interested in getting a 1117 * empty slot. For example, the status can be set 1118 * to FOUND when an entry is being deleted. 1119 */ 1120 ASSERT(slotp->fbp == NULL); 1121 if (slotp->status == FOUND) { 1122 return (0); 1123 } 1124 switch (dnlc_dir_rem_space_by_len(dcap, LDIRSIZ(namlen), 1125 &handle)) { 1126 case DFOUND: 1127 offset = (off_t)handle; 1128 err = blkatoff(tdp, offset, (char **)&ep, &fbp); 1129 if (err) { 1130 dnlc_dir_purge(dcap); 1131 ASSERT(*ipp == NULL); 1132 return (err); 1133 } 1134 /* 1135 * Check the validity of the entry. 1136 * If it's bad, then throw away the cache and 1137 * continue without it. The dirmangled() routine 1138 * will then be called upon it. 1139 */ 1140 if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) { 1141 dnlc_dir_purge(dcap); 1142 break; 1143 } 1144 /* 1145 * Remember the returned offset is the offset of the 1146 * containing record. 1147 */ 1148 slotp->status = FOUND; 1149 slotp->ep = ep; 1150 slotp->offset = offset; 1151 slotp->fbp = fbp; 1152 slotp->size = ep->d_reclen; 1153 /* 1154 * Set end offset to 0. Truncation is handled 1155 * because the dnlc cache will blow away the 1156 * cached directory when an entry is removed 1157 * that drops the entries left to less than half 1158 * the minumum number (dnlc_min_dir_cache). 
1159 */ 1160 slotp->endoff = 0; 1161 slotp->cached = 1; 1162 return (0); 1163 case DNOENT: 1164 slotp->status = NONE; 1165 slotp->offset = P2ROUNDUP_TYPED(tdp->i_size, 1166 DIRBLKSIZ, u_offset_t); 1167 slotp->size = DIRBLKSIZ; 1168 slotp->endoff = 0; 1169 slotp->cached = 1; 1170 return (0); 1171 default: 1172 break; 1173 } 1174 break; 1175 } 1176 slotp->cached = 0; 1177 caching = NULL; 1178 if (!noentry && tdp->i_size >= ufs_min_dir_cache) { 1179 /* 1180 * if the directory caching disable time has expired 1181 * enable caching again. 1182 */ 1183 if (tdp->i_cachedir == CD_DISABLED_NOMEM && 1184 gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) { 1185 ufs_dc_disable_at = 0; 1186 tdp->i_cachedir = CD_ENABLED; 1187 } 1188 /* 1189 * Attempt to cache any directories greater than the tunable 1190 * ufs_min_cache_dir. If it fails due to memory shortage 1191 * (DNOMEM), disable caching for this directory and record 1192 * the system time. Any attempt after the disable time has 1193 * expired will enable the caching again. 1194 */ 1195 if (tdp->i_cachedir == CD_ENABLED) { 1196 switch (dnlc_dir_start(dcap, 1197 tdp->i_size >> AV_DIRECT_SHIFT)) { 1198 case DNOMEM: 1199 tdp->i_cachedir = CD_DISABLED_NOMEM; 1200 ufs_dc_disable_at = gethrtime(); 1201 break; 1202 case DTOOBIG: 1203 tdp->i_cachedir = CD_DISABLED_TOOBIG; 1204 break; 1205 case DOK: 1206 caching = 1; 1207 break; 1208 default: 1209 break; 1210 } 1211 } 1212 } 1213 1214 /* 1215 * No point in using i_diroff since we must search whole directory 1216 */ 1217 dirsize = P2ROUNDUP_TYPED(tdp->i_size, DIRBLKSIZ, u_offset_t); 1218 enduseful = 0; 1219 offset = last_offset = 0; 1220 entryoffsetinblk = 0; 1221 needed = (int)LDIRSIZ(namlen); 1222 while (offset < dirsize) { 1223 /* 1224 * If offset is on a block boundary, 1225 * read the next directory block. 1226 * Release previous if it exists. 
1227 */ 1228 if (blkoff(tdp->i_fs, offset) == 0) { 1229 if (fbp != NULL) 1230 fbrelse(fbp, S_OTHER); 1231 1232 err = blkatoff(tdp, offset, (char **)0, &fbp); 1233 if (err) { 1234 ASSERT(*ipp == NULL); 1235 if (caching) { 1236 dnlc_dir_purge(dcap); 1237 } 1238 return (err); 1239 } 1240 entryoffsetinblk = 0; 1241 } 1242 /* 1243 * If still looking for a slot, and at a DIRBLKSIZ 1244 * boundary, have to start looking for free space 1245 * again. 1246 */ 1247 if (slotp->status == NONE && 1248 (entryoffsetinblk & (DIRBLKSIZ - 1)) == 0) { 1249 slotp->offset = -1; 1250 } 1251 /* 1252 * If the next entry is a zero length record or if the 1253 * record length is invalid, then skip to the next 1254 * directory block. Complete validation checks are 1255 * done if the record length is invalid. 1256 * 1257 * Full validation checks are slow so they are disabled 1258 * by default. Complete checks can be run by patching 1259 * "dirchk" to be true. 1260 * 1261 * We do not have to check the validity of 1262 * entryoffsetinblk here because it starts out as zero 1263 * and is only incremented by d_reclen values that we 1264 * validate here. 
1265 */ 1266 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk); 1267 if (ep->d_reclen == 0 || 1268 (dirchk || (ep->d_reclen & 0x3)) && 1269 dirmangled(tdp, ep, entryoffsetinblk, offset)) { 1270 i = DIRBLKSIZ - (entryoffsetinblk & (DIRBLKSIZ - 1)); 1271 offset += i; 1272 entryoffsetinblk += i; 1273 if (caching) { 1274 dnlc_dir_purge(dcap); 1275 caching = 0; 1276 } 1277 continue; 1278 } 1279 1280 /* 1281 * Add named entries and free space into the directory cache 1282 */ 1283 if (caching) { 1284 ushort_t extra; 1285 off_t off2; 1286 1287 if (ep->d_ino == 0) { 1288 extra = ep->d_reclen; 1289 if (offset & (DIRBLKSIZ - 1)) { 1290 dnlc_dir_purge(dcap); 1291 caching = 0; 1292 } 1293 } else { 1294 /* 1295 * entries hold the previous offset if 1296 * not the 1st one 1297 */ 1298 if (offset & (DIRBLKSIZ - 1)) { 1299 off2 = last_offset; 1300 } else { 1301 off2 = offset + 1; 1302 } 1303 caching = (dnlc_dir_add_entry(dcap, ep->d_name, 1304 INO_OFF_TO_H(ep->d_ino, off2)) == DOK); 1305 extra = ep->d_reclen - DIRSIZ(ep); 1306 } 1307 if (caching && (extra >= LDIRSIZ(1))) { 1308 caching = (dnlc_dir_add_space(dcap, extra, 1309 (uint64_t)offset) == DOK); 1310 } 1311 } 1312 1313 /* 1314 * If an appropriate sized slot has not yet been found, 1315 * check to see if one is available. 1316 */ 1317 if ((slotp->status != FOUND) && (slotp->status != EXIST)) { 1318 int size = ep->d_reclen; 1319 1320 if (ep->d_ino != 0) 1321 size -= DIRSIZ(ep); 1322 if (size > 0) { 1323 if (size >= needed) { 1324 slotp->offset = offset; 1325 slotp->size = ep->d_reclen; 1326 if (noentry) { 1327 slotp->ep = ep; 1328 slotp->fbp = fbp; 1329 slotp->status = FOUND; 1330 slotp->endoff = 0; 1331 return (0); 1332 } 1333 slotp->status = FOUND; 1334 } else if (slotp->status == NONE) { 1335 if (slotp->offset == -1) 1336 slotp->offset = offset; 1337 } 1338 } 1339 } 1340 /* 1341 * Check for a name match. 
1342 */ 1343 if (ep->d_ino && ep->d_namlen == namlen && 1344 *namep == *ep->d_name && /* fast chk 1st char */ 1345 bcmp(namep, ep->d_name, namlen) == 0) { 1346 1347 tdp->i_diroff = offset; 1348 1349 if (tdp->i_number == ep->d_ino) { 1350 *ipp = tdp; /* we want ourself, ie "." */ 1351 VN_HOLD(dvp); 1352 } else { 1353 err = ufs_iget_alloced(tdp->i_vfs, 1354 (ino_t)ep->d_ino, ipp, cr); 1355 if (err) { 1356 fbrelse(fbp, S_OTHER); 1357 if (caching) 1358 dnlc_dir_purge(dcap); 1359 return (err); 1360 } 1361 } 1362 slotp->status = EXIST; 1363 slotp->offset = offset; 1364 slotp->size = (int)(offset - last_offset); 1365 slotp->fbp = fbp; 1366 slotp->ep = ep; 1367 slotp->endoff = 0; 1368 if (caching) 1369 dnlc_dir_purge(dcap); 1370 return (0); 1371 } 1372 last_offset = offset; 1373 offset += ep->d_reclen; 1374 entryoffsetinblk += ep->d_reclen; 1375 if (ep->d_ino) 1376 enduseful = offset; 1377 } 1378 if (fbp) { 1379 fbrelse(fbp, S_OTHER); 1380 } 1381 1382 if (caching) { 1383 dnlc_dir_complete(dcap); 1384 slotp->cached = 1; 1385 if (slotp->status == FOUND) { 1386 if (initstat == FOUND) { 1387 return (0); 1388 } 1389 (void) dnlc_dir_rem_space_by_handle(dcap, 1390 slotp->offset); 1391 slotp->endoff = 0; 1392 return (0); 1393 } 1394 } 1395 1396 if (slotp->status == NONE) { 1397 /* 1398 * We didn't find a slot; the new directory entry should be put 1399 * at the end of the directory. Return an indication of where 1400 * this is, and set "endoff" to zero; since we're going to have 1401 * to extend the directory, we're certainly not going to 1402 * truncate it. 1403 */ 1404 slotp->offset = dirsize; 1405 slotp->size = DIRBLKSIZ; 1406 slotp->endoff = 0; 1407 } else { 1408 /* 1409 * We found a slot, and will return an indication of where that 1410 * slot is, as any new directory entry will be put there. 1411 * Since that slot will become a useful entry, if the last 1412 * useful entry we found was before this one, update the offset 1413 * of the last useful entry. 
1414 */ 1415 if (enduseful < slotp->offset + slotp->size) 1416 enduseful = slotp->offset + slotp->size; 1417 slotp->endoff = P2ROUNDUP_TYPED(enduseful, DIRBLKSIZ, off_t); 1418 } 1419 *ipp = NULL; 1420 return (0); 1421 } 1422 1423 uint64_t ufs_dirrename_retry_cnt; 1424 1425 /* 1426 * Rename the entry in the directory tdp so that it points to 1427 * sip instead of tip. 1428 */ 1429 static int 1430 ufs_dirrename( 1431 struct inode *sdp, /* parent directory of source */ 1432 struct inode *sip, /* source inode */ 1433 struct inode *tdp, /* parent directory of target */ 1434 char *namep, /* entry we are trying to change */ 1435 struct inode *tip, /* target inode */ 1436 struct slot *slotp, /* slot for entry */ 1437 struct cred *cr) /* credentials */ 1438 { 1439 vnode_t *tdvp; 1440 off_t offset; 1441 int err; 1442 int doingdirectory; 1443 1444 ASSERT(sdp->i_ufsvfs != NULL); 1445 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 1446 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 1447 /* 1448 * Short circuit rename of something to itself. 1449 */ 1450 if (sip->i_number == tip->i_number) { 1451 return (ESAME); /* special KLUDGE error code */ 1452 } 1453 1454 /* 1455 * We're locking 2 peer level locks, so must use tryenter 1456 * on the 2nd to avoid deadlocks that would occur 1457 * if we renamed a->b and b->a concurrently. 1458 */ 1459 retry: 1460 rw_enter(&tip->i_contents, RW_WRITER); 1461 if (!rw_tryenter(&sip->i_contents, RW_READER)) { 1462 /* 1463 * drop tip and wait (sleep) until we stand a chance 1464 * of holding sip 1465 */ 1466 rw_exit(&tip->i_contents); 1467 rw_enter(&sip->i_contents, RW_READER); 1468 /* 1469 * Reverse the lock grabs in case we have heavy 1470 * contention on the 2nd lock. 1471 */ 1472 if (!rw_tryenter(&tip->i_contents, RW_WRITER)) { 1473 ufs_dirrename_retry_cnt++; 1474 rw_exit(&sip->i_contents); 1475 goto retry; 1476 } 1477 } 1478 1479 /* 1480 * Check that everything is on the same filesystem. 
1481 */ 1482 if ((ITOV(tip)->v_vfsp != ITOV(tdp)->v_vfsp) || 1483 (ITOV(tip)->v_vfsp != ITOV(sip)->v_vfsp)) { 1484 err = EXDEV; /* XXX archaic */ 1485 goto out; 1486 } 1487 /* 1488 * Must have write permission to rewrite target entry. 1489 * Perform additional checks for sticky directories. 1490 */ 1491 if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0 || 1492 (err = ufs_sticky_remove_access(tdp, tip, cr)) != 0) 1493 goto out; 1494 1495 /* 1496 * Ensure source and target are compatible (both directories 1497 * or both not directories). If target is a directory it must 1498 * be empty and have no links to it; in addition it must not 1499 * be a mount point, and both the source and target must be 1500 * writable. 1501 */ 1502 doingdirectory = (((sip->i_mode & IFMT) == IFDIR) || 1503 ((sip->i_mode & IFMT) == IFATTRDIR)); 1504 if (((tip->i_mode & IFMT) == IFDIR) || 1505 ((tip->i_mode & IFMT) == IFATTRDIR)) { 1506 if (!doingdirectory) { 1507 err = EISDIR; 1508 goto out; 1509 } 1510 /* 1511 * vn_vfswlock will prevent mounts from using the directory 1512 * until we are done. 1513 */ 1514 if (vn_vfswlock(ITOV(tip))) { 1515 err = EBUSY; 1516 goto out; 1517 } 1518 if (vn_mountedvfs(ITOV(tip)) != NULL) { 1519 vn_vfsunlock(ITOV(tip)); 1520 err = EBUSY; 1521 goto out; 1522 } 1523 if (!ufs_dirempty(tip, tdp->i_number, cr) || tip->i_nlink > 2) { 1524 vn_vfsunlock(ITOV(tip)); 1525 err = EEXIST; /* SIGH should be ENOTEMPTY */ 1526 goto out; 1527 } 1528 } else if (doingdirectory) { 1529 err = ENOTDIR; 1530 goto out; 1531 } 1532 1533 /* 1534 * Rewrite the inode pointer for target name entry 1535 * from the target inode (ip) to the source inode (sip). 1536 * This prevents the target entry from disappearing 1537 * during a crash. Mark the directory inode to reflect the changes. 
1538 */ 1539 tdvp = ITOV(tdp); 1540 slotp->ep->d_ino = (int32_t)sip->i_number; 1541 dnlc_update(tdvp, namep, ITOV(sip)); 1542 if (slotp->size) { 1543 offset = slotp->offset - slotp->size; 1544 } else { 1545 offset = slotp->offset + 1; 1546 } 1547 if (slotp->cached) { 1548 (void) dnlc_dir_update(&tdp->i_danchor, namep, 1549 INO_OFF_TO_H(slotp->ep->d_ino, offset)); 1550 } 1551 1552 err = TRANS_DIR(tdp, slotp->offset); 1553 if (err) 1554 fbrelse(slotp->fbp, S_OTHER); 1555 else 1556 err = ufs_fbwrite(slotp->fbp, tdp); 1557 1558 slotp->fbp = NULL; 1559 if (err) { 1560 if (doingdirectory) 1561 vn_vfsunlock(ITOV(tip)); 1562 goto out; 1563 } 1564 1565 TRANS_INODE(tdp->i_ufsvfs, tdp); 1566 tdp->i_flag |= IUPD|ICHG; 1567 tdp->i_seq++; 1568 ITIMES_NOLOCK(tdp); 1569 1570 /* 1571 * Decrement the link count of the target inode. 1572 * Fix the ".." entry in sip to point to dp. 1573 * This is done after the new entry is on the disk. 1574 */ 1575 tip->i_nlink--; 1576 TRANS_INODE(tip->i_ufsvfs, tip); 1577 tip->i_flag |= ICHG; 1578 tip->i_seq++; 1579 ITIMES_NOLOCK(tip); 1580 if (doingdirectory) { 1581 /* 1582 * The entry for tip no longer exists so I can unlock the 1583 * vfslock. 1584 */ 1585 vn_vfsunlock(ITOV(tip)); 1586 /* 1587 * Decrement target link count once more if it was a directory. 1588 */ 1589 if (--tip->i_nlink != 0) { 1590 err = ufs_fault(ITOV(tip), 1591 "ufs_dirrename: target directory link count != 0 (%s)", 1592 tip->i_fs->fs_fsmnt); 1593 rw_exit(&tip->i_contents); 1594 return (err); 1595 } 1596 TRANS_INODE(tip->i_ufsvfs, tip); 1597 ufs_setreclaim(tip); 1598 /* 1599 * Renaming a directory with the parent different 1600 * requires that ".." be rewritten. The window is 1601 * still there for ".." to be inconsistent, but this 1602 * is unavoidable, and a lot shorter than when it was 1603 * done in a user process. We decrement the link 1604 * count in the new parent as appropriate to reflect 1605 * the just-removed target. 
If the parent is the 1606 * same, this is appropriate since the original 1607 * directory is going away. If the new parent is 1608 * different, ufs_dirfixdotdot() will bump the link count 1609 * back. 1610 */ 1611 tdp->i_nlink--; 1612 ufs_setreclaim(tdp); 1613 TRANS_INODE(tdp->i_ufsvfs, tdp); 1614 tdp->i_flag |= ICHG; 1615 tdp->i_seq++; 1616 ITIMES_NOLOCK(tdp); 1617 if (sdp != tdp) { 1618 rw_exit(&tip->i_contents); 1619 rw_exit(&sip->i_contents); 1620 err = ufs_dirfixdotdot(sip, sdp, tdp); 1621 return (err); 1622 } 1623 } else 1624 ufs_setreclaim(tip); 1625 out: 1626 rw_exit(&tip->i_contents); 1627 rw_exit(&sip->i_contents); 1628 return (err); 1629 } 1630 1631 /* 1632 * Fix the ".." entry of the child directory so that it points 1633 * to the new parent directory instead of the old one. Routine 1634 * assumes that dp is a directory and that all the inodes are on 1635 * the same file system. 1636 */ 1637 static int 1638 ufs_dirfixdotdot( 1639 struct inode *dp, /* child directory */ 1640 struct inode *opdp, /* old parent directory */ 1641 struct inode *npdp) /* new parent directory */ 1642 { 1643 struct fbuf *fbp; 1644 struct dirtemplate *dirp; 1645 vnode_t *dvp; 1646 int err; 1647 1648 ASSERT(RW_WRITE_HELD(&npdp->i_rwlock)); 1649 ASSERT(RW_WRITE_HELD(&npdp->i_contents)); 1650 1651 /* 1652 * We hold the child directory's i_contents lock before calling 1653 * blkatoff so that we honor correct locking protocol which is 1654 * i_contents lock and then page lock. (blkatoff will call 1655 * ufs_getpage where we want the page lock) 1656 * We hold the child directory's i_rwlock before i_contents (as 1657 * per the locking protocol) since we are modifying the ".." entry 1658 * of the child directory. 1659 * We hold the i_rwlock and i_contents lock until we record 1660 * this directory delta to the log (via ufs_trans_dir) and have 1661 * done fbrelse. 
1662 */ 1663 rw_enter(&dp->i_rwlock, RW_WRITER); 1664 rw_enter(&dp->i_contents, RW_WRITER); 1665 err = blkatoff(dp, (off_t)0, (char **)&dirp, &fbp); 1666 if (err) 1667 goto bad; 1668 1669 if (dp->i_nlink <= 0 || 1670 dp->i_size < sizeof (struct dirtemplate)) { 1671 err = ENOENT; 1672 goto bad; 1673 } 1674 1675 if (dirp->dotdot_namlen != 2 || 1676 dirp->dotdot_name[0] != '.' || 1677 dirp->dotdot_name[1] != '.') { /* Sanity check. */ 1678 dirbad(dp, "mangled .. entry", (off_t)0); 1679 err = ENOTDIR; 1680 goto bad; 1681 } 1682 1683 /* 1684 * Increment the link count in the new parent inode and force it out. 1685 */ 1686 if (npdp->i_nlink == MAXLINK) { 1687 err = EMLINK; 1688 goto bad; 1689 } 1690 npdp->i_nlink++; 1691 TRANS_INODE(npdp->i_ufsvfs, npdp); 1692 npdp->i_flag |= ICHG; 1693 npdp->i_seq++; 1694 ufs_iupdat(npdp, I_SYNC); 1695 1696 /* 1697 * Rewrite the child ".." entry and force it out. 1698 */ 1699 dvp = ITOV(dp); 1700 dirp->dotdot_ino = (uint32_t)npdp->i_number; 1701 dnlc_update(dvp, "..", ITOV(npdp)); 1702 (void) dnlc_dir_update(&dp->i_danchor, "..", 1703 INO_OFF_TO_H(dirp->dotdot_ino, 0)); 1704 1705 err = TRANS_DIR(dp, 0); 1706 if (err) 1707 fbrelse(fbp, S_OTHER); 1708 else 1709 err = ufs_fbwrite(fbp, dp); 1710 1711 fbp = NULL; 1712 if (err) 1713 goto bad; 1714 1715 rw_exit(&dp->i_contents); 1716 rw_exit(&dp->i_rwlock); 1717 1718 /* 1719 * Decrement the link count of the old parent inode and force it out. 1720 */ 1721 ASSERT(opdp); 1722 rw_enter(&opdp->i_contents, RW_WRITER); 1723 ASSERT(opdp->i_nlink > 0); 1724 opdp->i_nlink--; 1725 ufs_setreclaim(opdp); 1726 TRANS_INODE(opdp->i_ufsvfs, opdp); 1727 opdp->i_flag |= ICHG; 1728 opdp->i_seq++; 1729 ufs_iupdat(opdp, I_SYNC); 1730 rw_exit(&opdp->i_contents); 1731 return (0); 1732 1733 bad: 1734 if (fbp) 1735 fbrelse(fbp, S_OTHER); 1736 rw_exit(&dp->i_contents); 1737 rw_exit(&dp->i_rwlock); 1738 return (err); 1739 } 1740 1741 /* 1742 * Enter the file sip in the directory tdp with name namep. 
1743 */ 1744 static int 1745 ufs_diraddentry( 1746 struct inode *tdp, 1747 char *namep, 1748 enum de_op op, 1749 int namlen, 1750 struct slot *slotp, 1751 struct inode *sip, 1752 struct inode *sdp, 1753 struct cred *cr) 1754 { 1755 struct direct *ep, *nep; 1756 vnode_t *tdvp; 1757 dcanchor_t *dcap = &tdp->i_danchor; 1758 off_t offset; 1759 int err; 1760 ushort_t extra; 1761 1762 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 1763 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 1764 /* 1765 * Prepare a new entry. If the caller has not supplied an 1766 * existing inode, make a new one. 1767 */ 1768 err = dirprepareentry(tdp, slotp, cr); 1769 if (err) { 1770 if (slotp->fbp) { 1771 fbrelse(slotp->fbp, S_OTHER); 1772 slotp->fbp = NULL; 1773 } 1774 return (err); 1775 } 1776 /* 1777 * Check inode to be linked to see if it is in the 1778 * same filesystem. 1779 */ 1780 if (ITOV(tdp)->v_vfsp != ITOV(sip)->v_vfsp) { 1781 err = EXDEV; 1782 goto bad; 1783 } 1784 1785 /* 1786 * If renaming a directory then fix up the ".." entry in the 1787 * directory to point to the new parent. 1788 */ 1789 if ((op == DE_RENAME) && (((sip->i_mode & IFMT) == IFDIR) || 1790 ((sip->i_mode & IFMT) == IFATTRDIR)) && (sdp != tdp)) { 1791 err = ufs_dirfixdotdot(sip, sdp, tdp); 1792 if (err) 1793 goto bad; 1794 } 1795 1796 /* 1797 * Fill in entry data. 1798 */ 1799 ep = slotp->ep; 1800 ep->d_namlen = (ushort_t)namlen; 1801 (void) strncpy(ep->d_name, namep, (size_t)((namlen + 4) & ~3)); 1802 ep->d_ino = (uint32_t)sip->i_number; 1803 tdvp = ITOV(tdp); 1804 dnlc_update(tdvp, namep, ITOV(sip)); 1805 /* 1806 * Note the offset supplied for any named entry is 1807 * the offset of the previous one, unless it's the 1st. 1808 * slotp->size is used to pass the length to 1809 * the previous entry. 
1810 */ 1811 if (slotp->size) { 1812 offset = slotp->offset - slotp->size; 1813 } else { 1814 offset = slotp->offset + 1; 1815 } 1816 1817 if (slotp->cached) { 1818 /* 1819 * Add back any usable unused space to the dnlc directory 1820 * cache. 1821 */ 1822 extra = ep->d_reclen - DIRSIZ(ep); 1823 if (extra >= LDIRSIZ(1)) { 1824 (void) dnlc_dir_add_space(dcap, extra, 1825 (uint64_t)slotp->offset); 1826 } 1827 1828 (void) dnlc_dir_add_entry(dcap, namep, 1829 INO_OFF_TO_H(ep->d_ino, offset)); 1830 1831 /* adjust the previous offset of the next entry */ 1832 nep = (struct direct *)((char *)ep + ep->d_reclen); 1833 if ((uintptr_t)nep & (DIRBLKSIZ - 1)) { 1834 /* 1835 * Not a new block. 1836 * 1837 * Check the validity of the next entry. 1838 * If it's bad, then throw away the cache, and 1839 * continue as before directory caching. 1840 */ 1841 if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) || 1842 dnlc_dir_update(dcap, nep->d_name, 1843 INO_OFF_TO_H(nep->d_ino, slotp->offset)) 1844 == DNOENT) { 1845 dnlc_dir_purge(dcap); 1846 slotp->cached = 0; 1847 } 1848 } 1849 } 1850 1851 /* 1852 * Write out the directory block. 1853 */ 1854 err = TRANS_DIR(tdp, slotp->offset); 1855 if (err) 1856 fbrelse(slotp->fbp, S_OTHER); 1857 else 1858 err = ufs_fbwrite(slotp->fbp, tdp); 1859 1860 slotp->fbp = NULL; 1861 /* 1862 * If this is a rename of a directory, then we have already 1863 * fixed the ".." entry to refer to the new parent. If err 1864 * is true at this point, we have failed to update the new 1865 * parent to refer to the renamed directory. 1866 * XXX - we need to unwind the ".." fix. 1867 */ 1868 if (err) 1869 return (err); 1870 1871 /* 1872 * Mark the directory inode to reflect the changes. 1873 * Truncate the directory to chop off blocks of empty entries. 
1874 */ 1875 1876 TRANS_INODE(tdp->i_ufsvfs, tdp); 1877 tdp->i_flag |= IUPD|ICHG; 1878 tdp->i_seq++; 1879 tdp->i_diroff = 0; 1880 ITIMES_NOLOCK(tdp); 1881 /* 1882 * If the directory grew then dirprepareentry() will have 1883 * set IATTCHG in tdp->i_flag, then the directory inode must 1884 * be flushed out. This is because if fsync() is used later 1885 * the directory size must be correct, otherwise a crash would 1886 * cause fsck to move the file to lost+found. Also because later 1887 * a file may be linked in more than one directory, then there 1888 * is no way to flush the original directory. So it must be 1889 * flushed out on creation. See bug 4293809. 1890 */ 1891 if (tdp->i_flag & IATTCHG) { 1892 ufs_iupdat(tdp, I_SYNC); 1893 } 1894 1895 if (slotp->endoff && (slotp->endoff < tdp->i_size)) { 1896 if (!TRANS_ISTRANS(tdp->i_ufsvfs)) { 1897 (void) ufs_itrunc(tdp, (u_offset_t)slotp->endoff, 0, 1898 cr); 1899 } 1900 } 1901 1902 1903 return (0); 1904 1905 bad: 1906 if (slotp->cached) { 1907 dnlc_dir_purge(dcap); 1908 fbrelse(slotp->fbp, S_OTHER); 1909 slotp->cached = 0; 1910 slotp->fbp = NULL; 1911 return (err); 1912 } 1913 1914 /* 1915 * Clear out entry prepared by dirprepareent. 1916 */ 1917 slotp->ep->d_ino = 0; 1918 slotp->ep->d_namlen = 0; 1919 1920 /* 1921 * Don't touch err so we don't clobber the real error that got us here. 1922 */ 1923 if (TRANS_DIR(tdp, slotp->offset)) 1924 fbrelse(slotp->fbp, S_OTHER); 1925 else 1926 (void) ufs_fbwrite(slotp->fbp, tdp); 1927 slotp->fbp = NULL; 1928 return (err); 1929 } 1930 1931 /* 1932 * Prepare a directory slot to receive an entry. 
1933 */ 1934 static int 1935 dirprepareentry( 1936 struct inode *dp, /* directory we are working in */ 1937 struct slot *slotp, /* available slot info */ 1938 struct cred *cr) 1939 { 1940 struct direct *ep, *nep; 1941 off_t entryend; 1942 int err; 1943 slotstat_t status = slotp->status; 1944 ushort_t dsize; 1945 1946 ASSERT((status == NONE) || (status == FOUND)); 1947 ASSERT(RW_WRITE_HELD(&dp->i_rwlock)); 1948 ASSERT(RW_WRITE_HELD(&dp->i_contents)); 1949 /* 1950 * If we didn't find a slot, then indicate that the 1951 * new slot belongs at the end of the directory. 1952 * If we found a slot, then the new entry can be 1953 * put at slotp->offset. 1954 */ 1955 entryend = slotp->offset + slotp->size; 1956 if (status == NONE) { 1957 ASSERT((slotp->offset & (DIRBLKSIZ - 1)) == 0); 1958 if (DIRBLKSIZ > dp->i_fs->fs_fsize) { 1959 err = ufs_fault(ITOV(dp), 1960 "dirprepareentry: bad fs_fsize, DIRBLKSIZ: %d" 1961 " > dp->i_fs->fs_fsize: %d (%s)", 1962 DIRBLKSIZ, dp->i_fs->fs_fsize, dp->i_fs->fs_fsmnt); 1963 return (err); 1964 } 1965 /* 1966 * Allocate the new block. 1967 */ 1968 err = BMAPALLOC(dp, (u_offset_t)slotp->offset, 1969 (int)(blkoff(dp->i_fs, slotp->offset) + DIRBLKSIZ), cr); 1970 if (err) { 1971 return (err); 1972 } 1973 dp->i_size = entryend; 1974 TRANS_INODE(dp->i_ufsvfs, dp); 1975 dp->i_flag |= IUPD|ICHG|IATTCHG; 1976 dp->i_seq++; 1977 ITIMES_NOLOCK(dp); 1978 } else if (entryend > dp->i_size) { 1979 /* 1980 * Adjust directory size, if needed. This should never 1981 * push the size past a new multiple of DIRBLKSIZ. 1982 * This is an artifact of the old (4.2BSD) way of initializing 1983 * directory sizes to be less than DIRBLKSIZ. 1984 */ 1985 dp->i_size = P2ROUNDUP_TYPED(entryend, DIRBLKSIZ, off_t); 1986 TRANS_INODE(dp->i_ufsvfs, dp); 1987 dp->i_flag |= IUPD|ICHG|IATTCHG; 1988 dp->i_seq++; 1989 ITIMES_NOLOCK(dp); 1990 } 1991 1992 /* 1993 * Get the block containing the space for the new directory entry. 
1994 */ 1995 if (slotp->fbp == NULL) { 1996 err = blkatoff(dp, slotp->offset, (char **)&slotp->ep, 1997 &slotp->fbp); 1998 if (err) { 1999 return (err); 2000 } 2001 } 2002 ep = slotp->ep; 2003 2004 switch (status) { 2005 case NONE: 2006 /* 2007 * No space in the directory. slotp->offset will be on a 2008 * directory block boundary and we will write the new entry 2009 * into a fresh block. 2010 */ 2011 ep->d_reclen = DIRBLKSIZ; 2012 slotp->size = 0; /* length of previous entry */ 2013 break; 2014 case FOUND: 2015 /* 2016 * An entry of the required size has been found. Use it. 2017 */ 2018 if (ep->d_ino == 0) { 2019 /* this is the 1st record in a block */ 2020 slotp->size = 0; /* length of previous entry */ 2021 } else { 2022 dsize = DIRSIZ(ep); 2023 nep = (struct direct *)((char *)ep + dsize); 2024 nep->d_reclen = ep->d_reclen - dsize; 2025 ep->d_reclen = dsize; 2026 slotp->ep = nep; 2027 slotp->offset += dsize; 2028 slotp->size = dsize; /* length of previous entry */ 2029 } 2030 break; 2031 default: 2032 break; 2033 } 2034 return (0); 2035 } 2036 2037 /* 2038 * Allocate and initialize a new inode that will go into directory tdp. 2039 * This routine is called from ufs_symlink(), as well as within this file. 2040 */ 2041 int 2042 ufs_dirmakeinode( 2043 struct inode *tdp, 2044 struct inode **ipp, 2045 struct vattr *vap, 2046 enum de_op op, 2047 struct cred *cr) 2048 { 2049 struct inode *ip; 2050 enum vtype type; 2051 int imode; /* mode and format as in inode */ 2052 ino_t ipref; 2053 int err; 2054 timestruc_t now; 2055 2056 ASSERT(vap != NULL); 2057 ASSERT(op == DE_CREATE || op == DE_MKDIR || op == DE_ATTRDIR || 2058 op == DE_SYMLINK); 2059 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 2060 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 2061 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 2062 /* 2063 * Allocate a new inode. 
2064 */ 2065 type = vap->va_type; 2066 if (type == VDIR) { 2067 ipref = dirpref(tdp); 2068 } else { 2069 ipref = tdp->i_number; 2070 } 2071 if (op == DE_ATTRDIR) 2072 imode = vap->va_mode; 2073 else 2074 imode = MAKEIMODE(type, vap->va_mode); 2075 *ipp = NULL; 2076 err = ufs_ialloc(tdp, ipref, imode, &ip, cr); 2077 if (err) 2078 return (err); 2079 2080 /* 2081 * We don't need to grab vfs_dqrwlock here because it is held 2082 * in ufs_direnter_*() above us. 2083 */ 2084 ASSERT(RW_READ_HELD(&ip->i_ufsvfs->vfs_dqrwlock)); 2085 rw_enter(&ip->i_contents, RW_WRITER); 2086 if (ip->i_dquot != NULL) { 2087 err = ufs_fault(ITOV(ip), 2088 "ufs_dirmakeinode, ip->i_dquot != NULL: dquot (%s)", 2089 tdp->i_fs->fs_fsmnt); 2090 rw_exit(&ip->i_contents); 2091 return (err); 2092 } 2093 *ipp = ip; 2094 ip->i_mode = (o_mode_t)imode; 2095 if (type == VBLK || type == VCHR) { 2096 dev_t d = vap->va_rdev; 2097 dev32_t dev32; 2098 2099 /* 2100 * Don't allow a special file to be created with a 2101 * dev_t that cannot be represented by this filesystem 2102 * format on disk. 
2103 */ 2104 if (!cmpldev(&dev32, d)) { 2105 err = EOVERFLOW; 2106 goto fail; 2107 } 2108 2109 ITOV(ip)->v_rdev = ip->i_rdev = d; 2110 2111 if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) { 2112 ip->i_ordev = dev32; /* can't use old format */ 2113 } else { 2114 ip->i_ordev = cmpdev(d); 2115 } 2116 } 2117 ITOV(ip)->v_type = type; 2118 ufs_reset_vnode(ip->i_vnode); 2119 if (type == VDIR) { 2120 ip->i_nlink = 2; /* anticipating a call to dirmakedirect */ 2121 } else { 2122 ip->i_nlink = 1; 2123 } 2124 2125 if (op == DE_ATTRDIR) { 2126 ip->i_uid = vap->va_uid; 2127 ip->i_gid = vap->va_gid; 2128 } else 2129 ip->i_uid = crgetuid(cr); 2130 /* 2131 * To determine the group-id of the created file: 2132 * 1) If the gid is set in the attribute list (non-Sun & pre-4.0 2133 * clients are not likely to set the gid), then use it if 2134 * the process is privileged, belongs to the target group, 2135 * or the group is the same as the parent directory. 2136 * 2) If the filesystem was not mounted with the Old-BSD-compatible 2137 * GRPID option, and the directory's set-gid bit is clear, 2138 * then use the process's gid. 2139 * 3) Otherwise, set the group-id to the gid of the parent directory. 2140 */ 2141 if (op != DE_ATTRDIR && (vap->va_mask & AT_GID) && 2142 ((vap->va_gid == tdp->i_gid) || groupmember(vap->va_gid, cr) || 2143 secpolicy_vnode_create_gid(cr) == 0)) { 2144 /* 2145 * XXX - is this only the case when a 4.0 NFS client, or a 2146 * client derived from that code, makes a call over the wire? 2147 */ 2148 ip->i_gid = vap->va_gid; 2149 } else 2150 ip->i_gid = (tdp->i_mode & ISGID) ? tdp->i_gid : crgetgid(cr); 2151 2152 /* 2153 * For SunOS 5.0->5.4, the lines below read: 2154 * 2155 * ip->i_suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid; 2156 * ip->i_sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid; 2157 * 2158 * where MAXUID was set to 60002. See notes on this in ufs_inode.c 2159 */ 2160 ip->i_suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ? 
2161 UID_LONG : ip->i_uid; 2162 ip->i_sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ? 2163 GID_LONG : ip->i_gid; 2164 2165 /* 2166 * If we're creating a directory, and the parent directory has the 2167 * set-GID bit set, set it on the new directory. 2168 * Otherwise, if the user is neither privileged nor a member of the 2169 * file's new group, clear the file's set-GID bit. 2170 */ 2171 if ((tdp->i_mode & ISGID) && (type == VDIR)) 2172 ip->i_mode |= ISGID; 2173 else { 2174 if ((ip->i_mode & ISGID) && 2175 secpolicy_vnode_setids_setgids(cr, ip->i_gid) != 0) 2176 ip->i_mode &= ~ISGID; 2177 } 2178 2179 if (((vap->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || 2180 ((vap->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { 2181 err = EOVERFLOW; 2182 goto fail; 2183 } 2184 2185 /* 2186 * Extended attribute directories are not subject to quotas. 2187 */ 2188 if (op != DE_ATTRDIR) 2189 ip->i_dquot = getinoquota(ip); 2190 else 2191 ip->i_dquot = NULL; 2192 2193 if (op == DE_MKDIR || op == DE_ATTRDIR) { 2194 err = ufs_dirmakedirect(ip, tdp, (op == DE_MKDIR) ? 
0 : 1, cr); 2195 if (err) 2196 goto fail; 2197 } 2198 2199 /* 2200 * generate the shadow inode and attach it to the new object 2201 */ 2202 ASSERT((tdp->i_shadow && tdp->i_ufs_acl) || 2203 (!tdp->i_shadow && !tdp->i_ufs_acl)); 2204 if (tdp->i_shadow && tdp->i_ufs_acl && 2205 (((tdp->i_mode & IFMT) == IFDIR) || 2206 ((tdp->i_mode & IFMT) == IFATTRDIR))) { 2207 err = ufs_si_inherit(ip, tdp, ip->i_mode, cr); 2208 if (err) { 2209 if (op == DE_MKDIR) { 2210 /* 2211 * clean up parent directory 2212 * 2213 * tdp->i_contents already locked from 2214 * ufs_direnter_*() 2215 */ 2216 tdp->i_nlink--; 2217 TRANS_INODE(tdp->i_ufsvfs, tdp); 2218 tdp->i_flag |= ICHG; 2219 tdp->i_seq++; 2220 ufs_iupdat(tdp, I_SYNC); 2221 } 2222 goto fail; 2223 } 2224 } 2225 2226 /* 2227 * If the passed in attributes contain atime and/or mtime 2228 * settings, then use them instead of using the current 2229 * high resolution time. 2230 */ 2231 if (vap->va_mask & (AT_MTIME|AT_ATIME)) { 2232 if (vap->va_mask & AT_ATIME) { 2233 ip->i_atime.tv_sec = vap->va_atime.tv_sec; 2234 ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000; 2235 ip->i_flag &= ~IACC; 2236 } else 2237 ip->i_flag |= IACC; 2238 if (vap->va_mask & AT_MTIME) { 2239 ip->i_mtime.tv_sec = vap->va_mtime.tv_sec; 2240 ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; 2241 gethrestime(&now); 2242 if (now.tv_sec > TIME32_MAX) { 2243 /* 2244 * In 2038, ctime sticks forever.. 2245 */ 2246 ip->i_ctime.tv_sec = TIME32_MAX; 2247 ip->i_ctime.tv_usec = 0; 2248 } else { 2249 ip->i_ctime.tv_sec = now.tv_sec; 2250 ip->i_ctime.tv_usec = now.tv_nsec / 1000; 2251 } 2252 ip->i_flag &= ~(IUPD|ICHG); 2253 ip->i_flag |= IMODTIME; 2254 } else 2255 ip->i_flag |= IUPD|ICHG; 2256 ip->i_flag |= IMOD; 2257 } else 2258 ip->i_flag |= IACC|IUPD|ICHG; 2259 ip->i_seq++; 2260 2261 /* 2262 * If this is an attribute tag it as one. 
2263 */ 2264 if ((tdp->i_mode & IFMT) == IFATTRDIR) { 2265 ip->i_cflags |= IXATTR; 2266 } 2267 2268 /* 2269 * push inode before it's name appears in a directory 2270 */ 2271 TRANS_INODE(ip->i_ufsvfs, ip); 2272 ufs_iupdat(ip, I_SYNC); 2273 rw_exit(&ip->i_contents); 2274 return (0); 2275 2276 fail: 2277 /* Throw away inode we just allocated. */ 2278 ip->i_nlink = 0; 2279 ufs_setreclaim(ip); 2280 TRANS_INODE(ip->i_ufsvfs, ip); 2281 ip->i_flag |= ICHG; 2282 ip->i_seq++; 2283 ITIMES_NOLOCK(ip); 2284 rw_exit(&ip->i_contents); 2285 return (err); 2286 } 2287 2288 /* 2289 * Write a prototype directory into the empty inode ip, whose parent is dp. 2290 */ 2291 static int 2292 ufs_dirmakedirect( 2293 struct inode *ip, /* new directory */ 2294 struct inode *dp, /* parent directory */ 2295 int attrdir, 2296 struct cred *cr) 2297 { 2298 struct dirtemplate *dirp; 2299 struct fbuf *fbp; 2300 int err; 2301 2302 ASSERT(RW_WRITE_HELD(&ip->i_contents)); 2303 ASSERT(RW_WRITE_HELD(&dp->i_rwlock)); 2304 ASSERT(RW_WRITE_HELD(&dp->i_contents)); 2305 /* 2306 * Allocate space for the directory we're creating. 2307 */ 2308 err = BMAPALLOC(ip, (u_offset_t)0, DIRBLKSIZ, cr); 2309 if (err) 2310 return (err); 2311 if (DIRBLKSIZ > dp->i_fs->fs_fsize) { 2312 err = ufs_fault(ITOV(dp), 2313 "ufs_dirmakedirect: bad fs_fsize, DIRBLKSIZ: %d > dp->i_fs->fs_fsize: %d (%s)", 2314 DIRBLKSIZ, dp->i_fs->fs_fsize, 2315 dp->i_fs->fs_fsmnt); 2316 return (err); 2317 } 2318 ip->i_size = DIRBLKSIZ; 2319 TRANS_INODE(ip->i_ufsvfs, ip); 2320 ip->i_flag |= IUPD|ICHG|IATTCHG; 2321 ip->i_seq++; 2322 ITIMES_NOLOCK(ip); 2323 /* 2324 * Update the tdp link count and write out the change. 2325 * This reflects the ".." entry we'll soon write. 2326 */ 2327 if (dp->i_nlink == MAXLINK) 2328 return (EMLINK); 2329 if (attrdir == 0) 2330 dp->i_nlink++; 2331 TRANS_INODE(dp->i_ufsvfs, dp); 2332 dp->i_flag |= ICHG; 2333 dp->i_seq++; 2334 ufs_iupdat(dp, I_SYNC); 2335 /* 2336 * Initialize directory with "." 2337 * and ".." 
from static template. 2338 * 2339 * Since the parent directory is locked, we don't have to 2340 * worry about anything changing when we drop the write 2341 * lock on (ip). 2342 * 2343 */ 2344 err = fbread(ITOV(ip), (offset_t)0, (uint_t)ip->i_fs->fs_fsize, 2345 S_READ, &fbp); 2346 2347 if (err) { 2348 goto fail; 2349 } 2350 dirp = (struct dirtemplate *)fbp->fb_addr; 2351 /* 2352 * Now initialize the directory we're creating 2353 * with the "." and ".." entries. 2354 */ 2355 *dirp = mastertemplate; /* structure assignment */ 2356 dirp->dot_ino = (uint32_t)ip->i_number; 2357 dirp->dotdot_ino = (uint32_t)dp->i_number; 2358 2359 err = TRANS_DIR(ip, 0); 2360 if (err) { 2361 fbrelse(fbp, S_OTHER); 2362 goto fail; 2363 } 2364 2365 err = ufs_fbwrite(fbp, ip); 2366 if (err) { 2367 goto fail; 2368 } 2369 2370 return (0); 2371 2372 fail: 2373 if (attrdir == 0) 2374 dp->i_nlink--; 2375 TRANS_INODE(dp->i_ufsvfs, dp); 2376 dp->i_flag |= ICHG; 2377 dp->i_seq++; 2378 ufs_iupdat(dp, I_SYNC); 2379 return (err); 2380 } 2381 2382 /* 2383 * Delete a directory entry. If oip is nonzero the entry is checked 2384 * to make sure it still reflects oip. 2385 * 2386 * If vpp is non-null, return the ptr of the (held) vnode associated with 2387 * the removed name. The caller is responsible for doing the VN_RELE(). 2388 */ 2389 int 2390 ufs_dirremove( 2391 struct inode *dp, 2392 char *namep, 2393 struct inode *oip, 2394 struct vnode *cdir, 2395 enum dr_op op, 2396 struct cred *cr, 2397 vnode_t **vpp) /* Return (held) vnode ptr of removed file/dir */ 2398 { 2399 struct direct *ep, *pep, *nep; 2400 struct inode *ip; 2401 vnode_t *dvp, *vp; 2402 struct slot slot; 2403 int namlen; 2404 int err; 2405 int mode; 2406 ushort_t extra; 2407 2408 namlen = (int)strlen(namep); 2409 if (namlen == 0) 2410 return (ufs_fault(ITOV(dp), "ufs_dirremove: namlen == 0")); 2411 /* 2412 * return error when removing . and .. 
2413 */ 2414 if (namep[0] == '.') { 2415 if (namlen == 1) 2416 return (EINVAL); 2417 else if (namlen == 2 && namep[1] == '.') { 2418 return (EEXIST); /* SIGH should be ENOTEMPTY */ 2419 } 2420 } 2421 2422 ASSERT(RW_WRITE_HELD(&dp->i_rwlock)); 2423 /* 2424 * Check accessibility of directory. 2425 */ 2426 retry: 2427 if (((dp->i_mode & IFMT) != IFDIR) && 2428 ((dp->i_mode & IFMT) != IFATTRDIR)) { 2429 return (ENOTDIR); 2430 } 2431 2432 /* 2433 * Execute access is required to search the directory. 2434 * Access for write is interpreted as allowing 2435 * deletion of files in the directory. 2436 */ 2437 if (err = ufs_iaccess(dp, IEXEC|IWRITE, cr)) { 2438 return (err); 2439 } 2440 2441 ip = NULL; 2442 slot.fbp = NULL; 2443 slot.status = FOUND; /* don't need to look for empty slot */ 2444 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER); 2445 rw_enter(&dp->i_contents, RW_WRITER); 2446 err = ufs_dircheckforname(dp, namep, namlen, &slot, &ip, cr, 0); 2447 if (err) 2448 goto out_novfs; 2449 if (ip == NULL) { 2450 err = ENOENT; 2451 goto out_novfs; 2452 } 2453 vp = ITOV(ip); 2454 if (oip && oip != ip) { 2455 err = ENOENT; 2456 goto out_novfs; 2457 } 2458 2459 mode = ip->i_mode & IFMT; 2460 if (mode == IFDIR || mode == IFATTRDIR) { 2461 2462 /* 2463 * vn_vfswlock() prevents races between mount and rmdir. 2464 */ 2465 if (vn_vfswlock(vp)) { 2466 err = EBUSY; 2467 goto out_novfs; 2468 } 2469 if (vn_mountedvfs(vp) != NULL && op != DR_RENAME) { 2470 err = EBUSY; 2471 goto out; 2472 } 2473 /* 2474 * If we are removing a directory, get a lock on it. 2475 * Taking a writer lock prevents a parallel ufs_dirlook from 2476 * incorrectly entering a negative cache vnode entry in the dnlc 2477 * If the directory is empty, it will stay empty until 2478 * we can remove it. 2479 */ 2480 if (!rw_tryenter(&ip->i_rwlock, RW_WRITER)) { 2481 /* 2482 * It is possible that a thread in rename would have 2483 * acquired this rwlock. To prevent a deadlock we 2484 * do a rw_tryenter. 
If we fail to get the lock 2485 * we drop all the locks we have acquired, wait 2486 * for 2 ticks and reacquire the 2487 * directory's (dp) i_rwlock and try again. 2488 * If we dont drop dp's i_rwlock then we will panic 2489 * with a "Deadlock: cycle in blocking chain" 2490 * since in ufs_dircheckpath we want dp's i_rwlock. 2491 * dp is guaranteed to exist since ufs_dirremove is 2492 * called after a VN_HOLD(dp) has been done. 2493 */ 2494 ufs_dirremove_retry_cnt++; 2495 vn_vfsunlock(vp); 2496 if (slot.fbp) 2497 fbrelse(slot.fbp, S_OTHER); 2498 rw_exit(&dp->i_contents); 2499 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 2500 rw_exit(&dp->i_rwlock); 2501 VN_RELE(vp); 2502 delay(2); 2503 rw_enter(&dp->i_rwlock, RW_WRITER); 2504 goto retry; 2505 } 2506 } 2507 rw_enter(&ip->i_contents, RW_READER); 2508 2509 /* 2510 * Now check the restrictions that apply on sticky directories. 2511 */ 2512 if ((err = ufs_sticky_remove_access(dp, ip, cr)) != 0) { 2513 rw_exit(&ip->i_contents); 2514 if (mode == IFDIR || mode == IFATTRDIR) 2515 rw_exit(&ip->i_rwlock); 2516 goto out; 2517 } 2518 2519 if (op == DR_RMDIR) { 2520 /* 2521 * For rmdir(2), some special checks are required. 2522 * (a) Don't remove any alias of the parent (e.g. "."). 2523 * (b) Don't remove the current directory. 2524 * (c) Make sure the entry is (still) a directory. 2525 * (d) Make sure the directory is empty. 2526 */ 2527 2528 if (dp == ip || vp == cdir) 2529 err = EINVAL; 2530 else if (((ip->i_mode & IFMT) != IFDIR) && 2531 ((ip->i_mode & IFMT) != IFATTRDIR)) 2532 err = ENOTDIR; 2533 else if ((ip->i_nlink > 2) || 2534 !ufs_dirempty(ip, dp->i_number, cr)) { 2535 err = EEXIST; /* SIGH should be ENOTEMPTY */ 2536 } 2537 2538 if (err) { 2539 rw_exit(&ip->i_contents); 2540 if (mode == IFDIR || mode == IFATTRDIR) 2541 rw_exit(&ip->i_rwlock); 2542 goto out; 2543 } 2544 } else if (op == DR_REMOVE) { 2545 /* 2546 * unlink(2) requires a different check: allow only 2547 * privileged users to unlink a directory. 
2548 */ 2549 if (vp->v_type == VDIR && 2550 secpolicy_fs_linkdir(cr, vp->v_vfsp)) { 2551 err = EPERM; 2552 rw_exit(&ip->i_contents); 2553 rw_exit(&ip->i_rwlock); 2554 goto out; 2555 } 2556 } 2557 2558 rw_exit(&ip->i_contents); 2559 2560 /* 2561 * Remove the cache'd entry, if any. 2562 */ 2563 dvp = ITOV(dp); 2564 dnlc_remove(dvp, namep); 2565 ep = slot.ep; 2566 ep->d_ino = 0; 2567 2568 if (slot.cached) { 2569 dcanchor_t *dcap = &dp->i_danchor; 2570 2571 (void) dnlc_dir_rem_entry(dcap, namep, NULL); 2572 if (((int)ep->d_reclen - (int)DIRSIZ(ep)) >= LDIRSIZ(1)) { 2573 (void) dnlc_dir_rem_space_by_handle(dcap, slot.offset); 2574 } 2575 if (slot.offset & (DIRBLKSIZ - 1)) { 2576 /* 2577 * Collapse new free space into previous entry. 2578 * Note, the previous entry has already been 2579 * validated in ufs_dircheckforname(). 2580 */ 2581 ASSERT(slot.size); 2582 pep = (struct direct *)((char *)ep - slot.size); 2583 if ((pep->d_ino == 0) && 2584 ((uintptr_t)pep & (DIRBLKSIZ - 1))) { 2585 dnlc_dir_purge(dcap); 2586 slot.cached = 0; 2587 goto nocache; 2588 } 2589 if (pep->d_ino) { 2590 extra = pep->d_reclen - DIRSIZ(pep); 2591 } else { 2592 extra = pep->d_reclen; 2593 } 2594 if (extra >= LDIRSIZ(1)) { 2595 (void) dnlc_dir_rem_space_by_handle(dcap, 2596 (uint64_t)(slot.offset - slot.size)); 2597 } 2598 pep->d_reclen += ep->d_reclen; 2599 (void) dnlc_dir_add_space(dcap, extra + ep->d_reclen, 2600 (uint64_t)(slot.offset - slot.size)); 2601 /* adjust the previous pointer in the next entry */ 2602 nep = (struct direct *)((char *)ep + ep->d_reclen); 2603 if ((uintptr_t)nep & (DIRBLKSIZ - 1)) { 2604 /* 2605 * Not a new block. 2606 * 2607 * Check the validity of the entry. 2608 * If it's bad, then throw away the cache and 2609 * continue. 
2610 */ 2611 if ((nep->d_reclen == 0) || 2612 (nep->d_reclen & 0x3) || 2613 (dnlc_dir_update(dcap, nep->d_name, 2614 INO_OFF_TO_H(nep->d_ino, 2615 slot.offset - slot.size)) == DNOENT)) { 2616 dnlc_dir_purge(dcap); 2617 slot.cached = 0; 2618 } 2619 } 2620 } else { 2621 (void) dnlc_dir_add_space(dcap, ep->d_reclen, 2622 (uint64_t)slot.offset); 2623 } 2624 } else { 2625 /* 2626 * If the entry isn't the first in the directory, we must 2627 * reclaim the space of the now empty record by adding 2628 * the record size to the size of the previous entry. 2629 */ 2630 if (slot.offset & (DIRBLKSIZ - 1)) { 2631 /* 2632 * Collapse new free space into previous entry. 2633 */ 2634 pep = (struct direct *)((char *)ep - slot.size); 2635 pep->d_reclen += ep->d_reclen; 2636 } 2637 } 2638 nocache: 2639 2640 2641 err = TRANS_DIR(dp, slot.offset); 2642 if (err) 2643 fbrelse(slot.fbp, S_OTHER); 2644 else 2645 err = ufs_fbwrite(slot.fbp, dp); 2646 slot.fbp = NULL; 2647 2648 /* 2649 * If we were removing a directory, it is 'gone' now, but we cannot 2650 * unlock it as a thread may be waiting for the lock in ufs_create. If 2651 * we did, it could then create a file in a deleted directory. 2652 */ 2653 2654 if (err) { 2655 if (mode == IFDIR || mode == IFATTRDIR) 2656 rw_exit(&ip->i_rwlock); 2657 goto out; 2658 } 2659 2660 rw_enter(&ip->i_contents, RW_WRITER); 2661 2662 dp->i_flag |= IUPD|ICHG; 2663 dp->i_seq++; 2664 ip->i_flag |= ICHG; 2665 ip->i_seq++; 2666 2667 TRANS_INODE(dp->i_ufsvfs, dp); 2668 TRANS_INODE(ip->i_ufsvfs, ip); 2669 /* 2670 * Now dispose of the inode. 2671 */ 2672 if (ip->i_nlink > 0) { 2673 /* 2674 * This is not done for IFATTRDIR's because they don't 2675 * have entries in the dnlc and the link counts are 2676 * not incremented when they are created. 2677 */ 2678 if (op == DR_RMDIR && (ip->i_mode & IFMT) == IFDIR) { 2679 /* 2680 * Decrement by 2 because we're trashing the "." 2681 * entry as well as removing the entry in dp. 
2682 * Clear the directory entry, but there may be 2683 * other hard links so don't free the inode. 2684 * Decrement the dp linkcount because we're 2685 * trashing the ".." entry. 2686 */ 2687 ip->i_nlink -= 2; 2688 dp->i_nlink--; 2689 ufs_setreclaim(dp); 2690 /* 2691 * XXX need to discard negative cache entries 2692 * for vp. See comment in ufs_delete(). 2693 */ 2694 dnlc_remove(vp, "."); 2695 dnlc_remove(vp, ".."); 2696 /* 2697 * The return value is ignored here bacause if 2698 * the directory purge fails we don't want to 2699 * stop the delete. If ufs_dirpurgedotdot fails 2700 * the delete will continue with the preexiting 2701 * behavior. 2702 */ 2703 (void) ufs_dirpurgedotdot(ip, dp->i_number, cr); 2704 } else { 2705 ip->i_nlink--; 2706 } 2707 ufs_setreclaim(ip); 2708 } 2709 ITIMES_NOLOCK(dp); 2710 ITIMES_NOLOCK(ip); 2711 2712 if (!TRANS_ISTRANS(dp->i_ufsvfs)) 2713 ufs_iupdat(dp, I_SYNC); 2714 if (!TRANS_ISTRANS(ip->i_ufsvfs)) 2715 ufs_iupdat(ip, I_SYNC); 2716 2717 rw_exit(&ip->i_contents); 2718 if (mode == IFDIR || mode == IFATTRDIR) 2719 rw_exit(&ip->i_rwlock); 2720 out: 2721 if (mode == IFDIR || mode == IFATTRDIR) { 2722 vn_vfsunlock(vp); 2723 } 2724 out_novfs: 2725 ASSERT(RW_WRITE_HELD(&dp->i_contents)); 2726 2727 if (slot.fbp) 2728 fbrelse(slot.fbp, S_OTHER); 2729 2730 rw_exit(&dp->i_contents); 2731 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 2732 2733 /* 2734 * If no error and vpp is non-NULL, return the vnode ptr to the caller. 2735 * The caller becomes responsible for the VN_RELE(). Otherwise, 2736 * Release (and delete) the inode after we drop vfs_dqrwlock to 2737 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader. 2738 */ 2739 if (ip) { 2740 if ((err == 0) && (vpp != NULL)) { 2741 *vpp = ITOV(ip); 2742 } else { 2743 VN_RELE(vp); 2744 } 2745 } 2746 2747 return (err); 2748 } 2749 2750 /* 2751 * Return buffer with contents of block "offset" 2752 * from the beginning of directory "ip". 
If "res" is non-zero, fill it in with a pointer to the
 * remaining space in the directory.
 *
 * On success *fbpp is the buffer obtained from fbread() and 0 is returned.
 * On failure *fbpp is set to NULL, "res" is left untouched and the
 * fbread() error is returned.
 */

int
blkatoff(
	struct inode *ip,
	off_t offset,
	char **res,
	struct fbuf **fbpp)
{
	struct fs *fsp;
	struct fbuf *bufp;
	daddr_t blkno;
	uint_t nbytes;
	int error;

	CPU_STATS_ADD_K(sys, ufsdirblk, 1);

	/* Work out which logical block holds "offset" and how big it is. */
	fsp = ip->i_fs;
	blkno = (daddr_t)lblkno(fsp, offset);
	nbytes = (uint_t)blksize(fsp, ip, blkno);

	/* Read the whole block, starting at its block-aligned offset. */
	error = fbread(ITOV(ip), (offset_t)(offset & fsp->fs_bmask),
	    nbytes, S_READ, &bufp);
	if (error != 0) {
		*fbpp = (struct fbuf *)NULL;
		return (error);
	}

	if (res != NULL) {
		/* Point at "offset" within the block just read. */
		*res = bufp->fb_addr + blkoff(fsp, offset);
	}
	*fbpp = bufp;
	return (0);
}

/*
 * Consistency checks applied to a directory entry:
 *	record length must be multiple of 4
 *	entry must fit in rest of its DIRBLKSIZ block
 *	record must be large enough to contain entry
 *	name is not longer than MAXNAMLEN
 *	name must be as long as advertised, and null terminated
 * NOTE: record length must not be zero (should be checked previously).
 * This routine is only called if dirchk is true.
 * It would be nice to set the FSBAD flag in the super-block when
 * this routine fails so that a fsck is forced on next reboot,
 * but locking is a problem.
2799 */ 2800 static int 2801 dirmangled( 2802 struct inode *dp, 2803 struct direct *ep, 2804 int entryoffsetinblock, 2805 off_t offset) 2806 { 2807 int i; 2808 2809 i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); 2810 if ((ep->d_reclen & 0x3) != 0 || (int)ep->d_reclen > i || 2811 (uint_t)ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN || 2812 ep->d_ino && dirbadname(ep->d_name, (int)ep->d_namlen)) { 2813 dirbad(dp, "mangled entry", offset); 2814 return (1); 2815 } 2816 return (0); 2817 } 2818 2819 static void 2820 dirbad(struct inode *ip, char *how, off_t offset) 2821 { 2822 cmn_err(CE_NOTE, "%s: bad dir ino %d at offset %ld: %s", 2823 ip->i_fs->fs_fsmnt, (int)ip->i_number, offset, how); 2824 } 2825 2826 static int 2827 dirbadname(char *sp, int l) 2828 { 2829 while (l--) { /* check for nulls */ 2830 if (*sp++ == '\0') { 2831 return (1); 2832 } 2833 } 2834 return (*sp); /* check for terminating null */ 2835 } 2836 2837 /* 2838 * Check if a directory is empty or not. 2839 */ 2840 static int 2841 ufs_dirempty( 2842 struct inode *ip, 2843 ino_t parentino, 2844 struct cred *cr) 2845 { 2846 return (ufs_dirscan(ip, parentino, cr, 0)); 2847 } 2848 2849 /* 2850 * clear the .. directory entry. 2851 */ 2852 static int 2853 ufs_dirpurgedotdot( 2854 struct inode *ip, 2855 ino_t parentino, 2856 struct cred *cr) 2857 { 2858 return (ufs_dirscan(ip, parentino, cr, 1)); 2859 } 2860 2861 /* 2862 * Scan the directoy. If clr_dotdot is true clear the .. 2863 * directory else check to see if the directory is empty. 2864 * 2865 * Using a struct dirtemplate here is not precisely 2866 * what we want, but better than using a struct direct. 2867 * 2868 * clr_dotdot is used as a flag to tell us if we need 2869 * to clear the dotdot entry 2870 * 2871 * N.B.: does not handle corrupted directories. 
2872 */ 2873 static int 2874 ufs_dirscan( 2875 struct inode *ip, 2876 ino_t parentino, 2877 struct cred *cr, 2878 int clr_dotdot) 2879 { 2880 offset_t off; 2881 struct dirtemplate dbuf; 2882 struct direct *dp = (struct direct *)&dbuf; 2883 int err, count; 2884 int empty = 1; /* Assume it's empty */ 2885 #define MINDIRSIZ (sizeof (struct dirtemplate) / 2) 2886 2887 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 2888 2889 ASSERT(ip->i_size <= (offset_t)MAXOFF_T); 2890 for (off = 0; off < ip->i_size; off += dp->d_reclen) { 2891 err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp, 2892 (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr); 2893 /* 2894 * Since we read MINDIRSIZ, residual must 2895 * be 0 unless we're at end of file. 2896 */ 2897 if (err || count != 0 || dp->d_reclen == 0) { 2898 empty = 0; 2899 break; 2900 } 2901 /* skip empty entries */ 2902 if (dp->d_ino == 0) 2903 continue; 2904 /* accept only "." and ".." */ 2905 if (dp->d_namlen > 2 || dp->d_name[0] != '.') { 2906 empty = 0; 2907 break; 2908 } 2909 /* 2910 * At this point d_namlen must be 1 or 2. 2911 * 1 implies ".", 2 implies ".." if second 2912 * char is also "." 2913 */ 2914 if (dp->d_namlen == 1) 2915 continue; 2916 if (dp->d_name[1] == '.' && 2917 (ino_t)dp->d_ino == parentino) { 2918 /* 2919 * If we're doing a purge we need to check for 2920 * the . and .. entries and clear the d_ino for .. 2921 * 2922 * if clr_dotdot is set ufs_dirscan does not 2923 * check for an empty directory. 2924 */ 2925 if (clr_dotdot) { 2926 /* 2927 * Have to actually zap the .. 2928 * entry in the directory, as 2929 * otherwise someone might have 2930 * dp as its cwd and try to 2931 * open .., which now points to 2932 * an unallocated inode. 
2933 */ 2934 empty = ufs_dirclrdotdot(ip, parentino); 2935 break; 2936 } else { 2937 continue; 2938 } 2939 } 2940 empty = 0; 2941 break; 2942 } 2943 return (empty); 2944 } 2945 2946 clock_t retry_backoff_delay = 1; /* delay before retrying the i_rwlock */ 2947 uint64_t dircheck_retry_cnt; 2948 /* 2949 * Check if source directory inode is in the path of the target directory. 2950 * Target is supplied locked. 2951 * 2952 * The source and target inode's should be different upon entry. 2953 */ 2954 int 2955 ufs_dircheckpath( 2956 ino_t source_ino, 2957 struct inode *target, 2958 struct inode *sdp, 2959 struct cred *cr) 2960 { 2961 struct fbuf *fbp; 2962 struct dirtemplate *dirp; 2963 struct inode *ip; 2964 struct ufsvfs *ufsvfsp; 2965 struct inode *tip; 2966 ino_t dotdotino; 2967 int err; 2968 2969 ASSERT(target->i_ufsvfs != NULL); 2970 ASSERT(RW_LOCK_HELD(&target->i_rwlock)); 2971 ASSERT(RW_LOCK_HELD(&sdp->i_rwlock)); 2972 2973 ip = target; 2974 if (ip->i_number == source_ino) { 2975 err = EINVAL; 2976 goto out; 2977 } 2978 if (ip->i_number == UFSROOTINO) { 2979 err = 0; 2980 goto out; 2981 } 2982 /* 2983 * Search back through the directory tree, using the ".." entries. 2984 * Fail any attempt to move a directory into an ancestor directory. 2985 */ 2986 fbp = NULL; 2987 for (;;) { 2988 struct vfs *vfs; 2989 2990 err = blkatoff(ip, (off_t)0, (char **)&dirp, &fbp); 2991 if (err) 2992 break; 2993 if (((ip->i_mode & IFMT) != IFDIR) || ip->i_nlink == 0 || 2994 ip->i_size < sizeof (struct dirtemplate)) { 2995 dirbad(ip, "bad size, unlinked or not dir", (off_t)0); 2996 err = ENOTDIR; 2997 break; 2998 } 2999 if (dirp->dotdot_namlen != 2 || 3000 dirp->dotdot_name[0] != '.' || 3001 dirp->dotdot_name[1] != '.') { 3002 dirbad(ip, "mangled .. 
entry", (off_t)0); 3003 err = ENOTDIR; /* Sanity check */ 3004 break; 3005 } 3006 dotdotino = (ino_t)dirp->dotdot_ino; 3007 if (dotdotino == source_ino) { 3008 err = EINVAL; 3009 break; 3010 } 3011 if (dotdotino == UFSROOTINO) 3012 break; 3013 if (fbp) { 3014 fbrelse(fbp, S_OTHER); 3015 fbp = NULL; 3016 } 3017 vfs = ip->i_vfs; 3018 ufsvfsp = ip->i_ufsvfs; 3019 3020 if (ip != target) { 3021 rw_exit(&ip->i_rwlock); 3022 VN_RELE(ITOV(ip)); 3023 } 3024 /* 3025 * Race to get the inode. 3026 */ 3027 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 3028 if (err = ufs_iget_alloced(vfs, dotdotino, &tip, cr)) { 3029 rw_exit(&ufsvfsp->vfs_dqrwlock); 3030 ip = NULL; 3031 break; 3032 } 3033 rw_exit(&ufsvfsp->vfs_dqrwlock); 3034 /* 3035 * If the directory of the source inode (also a directory) 3036 * is the same as this next entry up the chain, then 3037 * we know the source directory itself can't be in the 3038 * chain. This also prevents a panic because we already 3039 * have sdp->i_rwlock locked. 3040 */ 3041 if (tip == sdp) { 3042 VN_RELE(ITOV(tip)); 3043 ip = NULL; 3044 break; 3045 } 3046 ip = tip; 3047 3048 /* 3049 * If someone has set the WRITE_WANTED bit in this lock and if 3050 * this happens to be a sdp or tdp of another parallel rename 3051 * which is executing the same code and in similar situation 3052 * we end up in a 4 way deadlock. We need to make sure that 3053 * the WRITE_WANTED bit is not set. 3054 */ 3055 retry_lock: 3056 if (!rw_tryenter(&ip->i_rwlock, RW_READER)) { 3057 /* 3058 * If the lock held as WRITER thats fine but if it 3059 * has WRITE_WANTED bit set we might end up in a 3060 * deadlock. If WRITE_WANTED is set we return 3061 * with EAGAIN else we just go back and try. 3062 */ 3063 if (RW_ISWRITER(&ip->i_rwlock) && 3064 !(RW_WRITE_HELD(&ip->i_rwlock))) { 3065 err = EAGAIN; 3066 if (fbp) { 3067 fbrelse(fbp, S_OTHER); 3068 } 3069 VN_RELE(ITOV(ip)); 3070 return (err); 3071 } else { 3072 /* 3073 * The lock is being write held. 
We could 3074 * just do a rw_enter here but there is a 3075 * window between the check and now, where 3076 * the status could have changed, so to 3077 * avoid looping we backoff and go back to 3078 * try for the lock. 3079 */ 3080 delay(retry_backoff_delay); 3081 dircheck_retry_cnt++; 3082 goto retry_lock; 3083 } 3084 } 3085 } 3086 if (fbp) { 3087 fbrelse(fbp, S_OTHER); 3088 } 3089 out: 3090 if (ip) { 3091 if (ip != target) { 3092 rw_exit(&ip->i_rwlock); 3093 VN_RELE(ITOV(ip)); 3094 } 3095 } 3096 return (err); 3097 } 3098 3099 int 3100 ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr) 3101 { 3102 offset_t off; 3103 struct dirtemplate dbuf; 3104 struct direct *dp = (struct direct *)&dbuf; 3105 int err, count; 3106 int empty = 1; /* Assume it's empty */ 3107 #define MINDIRSIZ (sizeof (struct dirtemplate) / 2) 3108 3109 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 3110 3111 ASSERT(ip->i_size <= (offset_t)MAXOFF_T); 3112 for (off = 0; off < ip->i_size; off += dp->d_reclen) { 3113 err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp, 3114 (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr); 3115 /* 3116 * Since we read MINDIRSIZ, residual must 3117 * be 0 unless we're at end of file. 3118 */ 3119 3120 if (err || count != 0 || dp->d_reclen == 0) { 3121 empty = 0; 3122 break; 3123 } 3124 /* skip empty entries */ 3125 if (dp->d_ino == 0) 3126 continue; 3127 /* 3128 * At this point d_namlen must be 1 or 2. 3129 * 1 implies ".", 2 implies ".." if second 3130 * char is also "." 3131 */ 3132 3133 if (dp->d_namlen == 1 && dp->d_name[0] == '.' && 3134 (ino_t)dp->d_ino == parentino) 3135 continue; 3136 3137 if (dp->d_namlen == 2 && dp->d_name[0] == '.' && 3138 dp->d_name[1] == '.') { 3139 continue; 3140 } 3141 empty = 0; 3142 break; 3143 } 3144 return (empty); 3145 } 3146 3147 3148 /* 3149 * Allocate and initialize a new shadow inode to contain extended attributes. 
3150 */ 3151 int 3152 ufs_xattrmkdir( 3153 struct inode *tdp, 3154 struct inode **ipp, 3155 int flags, 3156 struct cred *cr) 3157 { 3158 struct inode *ip; 3159 struct vattr va; 3160 int err; 3161 int retry = 1; 3162 struct ufsvfs *ufsvfsp; 3163 struct ulockfs *ulp; 3164 int issync; 3165 int trans_size; 3166 int dorwlock; /* 0 = not yet taken, */ 3167 /* 1 = taken outside the transaction, */ 3168 /* 2 = taken inside the transaction */ 3169 3170 /* 3171 * Validate permission to create attribute directory 3172 */ 3173 3174 if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0) { 3175 return (err); 3176 } 3177 3178 if (vn_is_readonly(ITOV(tdp))) 3179 return (EROFS); 3180 3181 /* 3182 * No need to re-init err after again:, since it's set before 3183 * the next use of it. 3184 */ 3185 again: 3186 dorwlock = 0; 3187 va.va_type = VDIR; 3188 va.va_uid = tdp->i_uid; 3189 va.va_gid = tdp->i_gid; 3190 3191 if ((tdp->i_mode & IFMT) == IFDIR) { 3192 va.va_mode = (o_mode_t)IFATTRDIR; 3193 va.va_mode |= tdp->i_mode & 0777; 3194 } else { 3195 va.va_mode = (o_mode_t)IFATTRDIR|0700; 3196 if (tdp->i_mode & 0040) 3197 va.va_mode |= 0750; 3198 if (tdp->i_mode & 0004) 3199 va.va_mode |= 0705; 3200 } 3201 va.va_mask = AT_TYPE|AT_MODE; 3202 3203 ufsvfsp = tdp->i_ufsvfs; 3204 3205 err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK); 3206 if (err) 3207 return (err); 3208 3209 /* 3210 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file. 3211 * This follows the protocol for read()/write(). 3212 */ 3213 if (ITOV(tdp)->v_type != VDIR) { 3214 rw_enter(&tdp->i_rwlock, RW_WRITER); 3215 dorwlock = 1; 3216 } 3217 3218 if (ulp) { 3219 trans_size = (int)TOP_MKDIR_SIZE(tdp); 3220 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, trans_size); 3221 } 3222 3223 /* 3224 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory. 3225 * This follows the protocol established by 3226 * ufs_link/create/remove/rename/mkdir/rmdir/symlink. 
3227 */ 3228 if (dorwlock == 0) { 3229 rw_enter(&tdp->i_rwlock, RW_WRITER); 3230 dorwlock = 2; 3231 } 3232 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 3233 rw_enter(&tdp->i_contents, RW_WRITER); 3234 3235 /* 3236 * Suppress out of inodes messages if we will retry. 3237 */ 3238 if (retry) 3239 tdp->i_flag |= IQUIET; 3240 err = ufs_dirmakeinode(tdp, &ip, &va, DE_ATTRDIR, cr); 3241 tdp->i_flag &= ~IQUIET; 3242 3243 if (err) 3244 goto fail; 3245 3246 if (flags) { 3247 3248 /* 3249 * Now attach it to src file. 3250 */ 3251 3252 tdp->i_oeftflag = ip->i_number; 3253 } 3254 3255 ip->i_cflags |= IXATTR; 3256 ITOV(ip)->v_flag |= V_XATTRDIR; 3257 TRANS_INODE(ufsvfsp, tdp); 3258 tdp->i_flag |= ICHG | IUPD; 3259 tdp->i_seq++; 3260 ufs_iupdat(tdp, I_SYNC); 3261 rw_exit(&tdp->i_contents); 3262 rw_exit(&ufsvfsp->vfs_dqrwlock); 3263 3264 rw_enter(&ip->i_rwlock, RW_WRITER); 3265 rw_enter(&ip->i_contents, RW_WRITER); 3266 TRANS_INODE(ufsvfsp, ip); 3267 ip->i_flag |= ICHG| IUPD; 3268 ip->i_seq++; 3269 ufs_iupdat(ip, I_SYNC); 3270 rw_exit(&ip->i_contents); 3271 rw_exit(&ip->i_rwlock); 3272 if (dorwlock == 2) 3273 rw_exit(&tdp->i_rwlock); 3274 if (ulp) { 3275 int terr = 0; 3276 3277 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size); 3278 ufs_lockfs_end(ulp); 3279 if (err == 0) 3280 err = terr; 3281 } 3282 if (dorwlock == 1) 3283 rw_exit(&tdp->i_rwlock); 3284 *ipp = ip; 3285 return (err); 3286 3287 fail: 3288 rw_exit(&tdp->i_contents); 3289 rw_exit(&ufsvfsp->vfs_dqrwlock); 3290 if (dorwlock == 2) 3291 rw_exit(&tdp->i_rwlock); 3292 if (ulp) { 3293 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size); 3294 ufs_lockfs_end(ulp); 3295 } 3296 if (dorwlock == 1) 3297 rw_exit(&tdp->i_rwlock); 3298 if (ip != NULL) 3299 VN_RELE(ITOV(ip)); 3300 3301 /* 3302 * No inodes? See if any are tied up in pending deletions. 3303 * This has to be done outside of any of the above, because 3304 * the draining operation can't be done from inside a transaction. 
3305 */ 3306 if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 3307 ufs_delete_drain_wait(ufsvfsp, 1); 3308 retry = 0; 3309 goto again; 3310 } 3311 3312 return (err); 3313 } 3314 3315 /* 3316 * clear the dotdot directory entry. 3317 * Used by ufs_dirscan when clr_dotdot 3318 * flag is set and we're deleting a 3319 * directory. 3320 */ 3321 static int 3322 ufs_dirclrdotdot(struct inode *ip, ino_t parentino) 3323 { 3324 struct fbuf *fbp; 3325 struct direct *dotp, *dotdotp; 3326 int err = 0; 3327 3328 ASSERT(RW_WRITE_HELD(&ip->i_rwlock)); 3329 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 3330 err = blkatoff(ip, 0, NULL, &fbp); 3331 if (err) { 3332 return (err); 3333 } 3334 3335 dotp = (struct direct *)fbp->fb_addr; 3336 if ((dotp->d_namlen < (MAXNAMLEN + 1)) && 3337 ((DIRBLKSIZ - DIRSIZ(dotp)) >= (sizeof (struct dirtemplate) / 2))) { 3338 dotdotp = (struct direct *)((char *)dotp + dotp->d_reclen); 3339 if ((dotdotp->d_namlen < (MAXNAMLEN + 1)) && 3340 ((DIRBLKSIZ - DIRSIZ(dotp)) >= dotdotp->d_reclen)) { 3341 3342 dotp->d_reclen += dotdotp->d_reclen; 3343 if (parentino == dotdotp->d_ino) { 3344 dotdotp->d_ino = 0; 3345 dotdotp->d_namlen = 0; 3346 dotdotp->d_reclen = 0; 3347 } 3348 3349 err = TRANS_DIR(ip, 0); 3350 if (err) { 3351 fbrelse(fbp, S_OTHER); 3352 } else { 3353 err = ufs_fbwrite(fbp, ip); 3354 } 3355 } 3356 } else { 3357 err = -1; 3358 } 3359 return (err); 3360 } 3361