1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 40 #pragma ident "%Z%%M% %I% %E% SMI" 41 42 /* 43 * Directory manipulation routines. 44 * 45 * When manipulating directories, the i_rwlock provides serialization 46 * since directories cannot be mmapped. The i_contents lock is redundant. 
47 */ 48 49 #include <sys/types.h> 50 #include <sys/t_lock.h> 51 #include <sys/param.h> 52 #include <sys/systm.h> 53 #include <sys/signal.h> 54 #include <sys/cred.h> 55 #include <sys/proc.h> 56 #include <sys/disp.h> 57 #include <sys/user.h> 58 #include <sys/vfs.h> 59 #include <sys/vnode.h> 60 #include <sys/stat.h> 61 #include <sys/mode.h> 62 #include <sys/buf.h> 63 #include <sys/uio.h> 64 #include <sys/dnlc.h> 65 #include <sys/fs/ufs_inode.h> 66 #include <sys/fs/ufs_fs.h> 67 #include <sys/mount.h> 68 #include <sys/fs/ufs_fsdir.h> 69 #include <sys/fs/ufs_trans.h> 70 #include <sys/fs/ufs_panic.h> 71 #include <sys/fs/ufs_quota.h> 72 #include <sys/errno.h> 73 #include <sys/debug.h> 74 #include <vm/seg.h> 75 #include <sys/sysmacros.h> 76 #include <sys/cmn_err.h> 77 #include <sys/cpuvar.h> 78 #include <sys/unistd.h> 79 #include <sys/policy.h> 80 81 /* 82 * This is required since we're using P2ROUNDUP_TYPED on DIRBLKSIZ 83 */ 84 #if !ISP2(DIRBLKSIZ) 85 #error "DIRBLKSIZ not a power of 2" 86 #endif 87 88 /* 89 * A virgin directory. 90 */ 91 static struct dirtemplate mastertemplate = { 92 0, 12, 1, ".", 93 0, DIRBLKSIZ - 12, 2, ".." 94 }; 95 96 #define LDIRSIZ(len) \ 97 ((sizeof (struct direct) - (MAXNAMLEN + 1)) + ((len + 1 + 3) &~ 3)) 98 #define MAX_DIR_NAME_LEN(len) \ 99 (((len) - (sizeof (struct direct) - (MAXNAMLEN + 1))) - 1) 100 101 /* 102 * The dnlc directory cache allows a 64 bit handle for directory entries. 103 * For ufs we squeeze both the 32 bit inumber and a 32 bit disk offset 104 * into the handle. Note, a 32 bit offset allows a 4GB directory, which 105 * is way beyond what could be cached in memory by the directory 106 * caching routines. So we are quite safe with this limit. 107 * The macros below pack and unpack the handle. 
108 */ 109 #define H_TO_INO(h) (uint32_t)((h) & UINT_MAX) 110 #define H_TO_OFF(h) (off_t)((h) >> 32) 111 #define INO_OFF_TO_H(ino, off) (uint64_t)(((uint64_t)(off) << 32) | (ino)) 112 113 /* 114 * The average size of a typical on disk directory entry is about 16 bytes 115 * and so defines AV_DIRECT_SHIFT : log2(16) 116 * This define is only used to approximate the number of entries 117 * is a directory. This is needed for dnlc_dir_start() which will immediately 118 * return an error if the value is not within its acceptable range of 119 * number of files in a directory. 120 */ 121 #define AV_DIRECT_SHIFT 4 122 /* 123 * If the directory size (from i_size) is greater than the ufs_min_dir_cache 124 * tunable then we request dnlc directory caching. 125 * This has found to be profitable after 1024 file names. 126 */ 127 int ufs_min_dir_cache = 1024 << AV_DIRECT_SHIFT; 128 129 /* The time point the dnlc directory caching was disabled */ 130 static hrtime_t ufs_dc_disable_at; 131 /* directory caching disable duration */ 132 static hrtime_t ufs_dc_disable_duration = (hrtime_t)NANOSEC * 5; 133 134 #ifdef DEBUG 135 int dirchk = 1; 136 #else /* !DEBUG */ 137 int dirchk = 0; 138 #endif /* DEBUG */ 139 int ufs_negative_cache = 1; 140 uint64_t ufs_dirremove_retry_cnt; 141 142 static void dirbad(); 143 static int ufs_dirrename(); 144 static int ufs_diraddentry(); 145 static int ufs_dirempty(); 146 static int ufs_dirscan(); 147 static int ufs_dirclrdotdot(); 148 static int ufs_dirfixdotdot(); 149 static int ufs_dirpurgedotdot(); 150 static int dirprepareentry(); 151 static int ufs_dirmakedirect(); 152 static int dirbadname(); 153 static int dirmangled(); 154 155 /* 156 * Look for a given name in a directory. On successful return, *ipp 157 * will point to the VN_HELD inode. 
 *
 * dp is the directory inode to search, namep the (non-empty) component
 * name, cr the credentials used for the IEXEC access check, and skipdnlc
 * non-zero bypasses the per-name dnlc_lookup().
 *
 * Returns 0 on success, ENOTDIR if dp is neither IFDIR nor IFATTRDIR,
 * the error from ufs_iaccess(), ENOENT if the name is not found, or an
 * error from blkatoff()/ufs_iget_alloced().  dp->i_rwlock is taken as
 * reader internally and is always dropped again before returning.
 */
int
ufs_dirlook(
	struct inode *dp,
	char *namep,
	struct inode **ipp,
	struct cred *cr,
	int skipdnlc) /* skip the 1st level dnlc */
{
	uint64_t handle;
	struct fbuf *fbp;		/* a buffer of directory entries */
	struct direct *ep;		/* the current directory entry */
	struct vnode *vp;
	struct vnode *dvp;		/* directory vnode ptr */
	dcanchor_t *dcap;
	off_t endsearch;		/* offset to end directory search */
	off_t offset;
	off_t start_off;		/* starting offset from middle search */
	off_t last_offset;		/* last offset */
	int entryoffsetinblock;		/* offset of ep in addr's buffer */
	int numdirpasses;		/* strategy for directory search */
	int namlen;			/* length of name */
	int err;
	int doingchk;
	int i;
	int caching;
	ino_t ep_ino;			/* entry i number */
	ino_t chkino;
	ushort_t ep_reclen;		/* direct local d_reclen */

	ASSERT(*namep != '\0'); /* All callers ensure *namep is non null */

	/*
	 * Check accessibility of directory.
	 */
	if (((dp->i_mode & IFMT) != IFDIR) &&
	    ((dp->i_mode & IFMT) != IFATTRDIR))
		return (ENOTDIR);

	if (err = ufs_iaccess(dp, IEXEC, cr))
		return (err);

	/*
	 * Check the directory name lookup cache, first for individual files
	 * then for complete directories.
	 */
	dvp = ITOV(dp);
	if (!skipdnlc && (vp = dnlc_lookup(dvp, namep))) {
		/* vp is already held from dnlc_lookup */
		if (vp == DNLC_NO_VNODE) {
			/* negative cache hit: the name is known not to exist */
			VN_RELE(vp);
			return (ENOENT);
		}
		*ipp = VTOI(vp);
		return (0);
	}

	dcap = &dp->i_danchor;

	/*
	 * Grab the reader lock on the directory data before checking
	 * the dnlc to avoid a race with ufs_dirremove() & friends.
	 */
	rw_enter(&dp->i_rwlock, RW_READER);

	switch (dnlc_dir_lookup(dcap, namep, &handle)) {
	case DFOUND:
		ep_ino = (ino_t)H_TO_INO(handle);
		if (dp->i_number == ep_ino) {
			VN_HOLD(dvp);	/* want ourself, "." */
			*ipp = dp;
			rw_exit(&dp->i_rwlock);
			return (0);
		}
		if (namep[0] == '.' && namep[1] == '.' && namep[2] == 0) {
			uint64_t handle2;
			/*
			 * release the lock on the dir we are searching
			 * to avoid a deadlock when grabbing the
			 * i_contents lock in ufs_iget_alloced().
			 */
			rw_exit(&dp->i_rwlock);
			rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
			err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
			/*
			 * must recheck as we dropped dp->i_rwlock
			 */
			rw_enter(&dp->i_rwlock, RW_READER);
			if (!err && (dnlc_dir_lookup(dcap, namep, &handle2)
			    == DFOUND) && (handle == handle2)) {
				dnlc_update(dvp, namep, ITOV(*ipp));
				rw_exit(&dp->i_rwlock);
				return (0);
			}
			/* check failed, read the actual directory */
			if (!err) {
				VN_RELE(ITOV(*ipp));
			}
			/* i_rwlock is held as reader again at this point */
			goto restart;
		}
		/* usual case of not "." nor ".." */
		rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
		err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
		rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
		if (err) {
			rw_exit(&dp->i_rwlock);
			return (err);
		}
		dnlc_update(dvp, namep, ITOV(*ipp));
		rw_exit(&dp->i_rwlock);
		return (0);
	case DNOENT:
		if (ufs_negative_cache && (dp->i_nlink > 0)) {
			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
		}
		rw_exit(&dp->i_rwlock);
		return (ENOENT);
	default:
		/* no usable directory cache; scan the directory blocks */
		break;
	}
restart:

	fbp = NULL;
	doingchk = 0;
	chkino = 0;
	caching = 0;

	/*
	 * Attempt to cache any directories greater than the tunable
	 * ufs_min_dir_cache. If it fails due to memory shortage (DNOMEM),
	 * disable caching for this directory and record the system time.
	 * Any attempt after the disable time has expired will enable
	 * the caching again.
	 */
	if (dp->i_size >= ufs_min_dir_cache) {
		/*
		 * if the directory caching disable time has expired
		 * enable the caching again.
		 */
		if (dp->i_cachedir == CD_DISABLED_NOMEM &&
		    gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) {
			ufs_dc_disable_at = 0;
			dp->i_cachedir = CD_ENABLED;
		}
		if (dp->i_cachedir == CD_ENABLED) {
			switch (dnlc_dir_start(dcap, dp->i_size >>
			    AV_DIRECT_SHIFT)) {
			case DNOMEM:
				dp->i_cachedir = CD_DISABLED_NOMEM;
				ufs_dc_disable_at = gethrtime();
				break;
			case DTOOBIG:
				dp->i_cachedir = CD_DISABLED_TOOBIG;
				break;
			case DOK:
				caching = 1;
				break;
			default:
				break;
			}
		}
	}
	/*
	 * If caching we don't stop when the file has been
	 * found, but need to know later, so clear *ipp now
	 */
	*ipp = NULL;

recheck:
	if (caching) {
		/* building the cache requires a single pass from offset 0 */
		offset = 0;
		entryoffsetinblock = 0;
		numdirpasses = 1;
	} else {
		/*
		 * Take care to look at dp->i_diroff only once, as it
		 * may be changing due to other threads/cpus.
		 */
		offset = dp->i_diroff;
		if (offset > dp->i_size) {
			offset = 0;
		}
		if (offset == 0) {
			entryoffsetinblock = 0;
			numdirpasses = 1;
		} else {
			start_off = offset;

			entryoffsetinblock = blkoff(dp->i_fs, offset);
			if (entryoffsetinblock != 0) {
				err = blkatoff(dp, offset, (char **)0, &fbp);
				if (err)
					goto bad;
			}
			/* second pass covers [0, start_off) if needed */
			numdirpasses = 2;
		}
	}
	endsearch = P2ROUNDUP_TYPED(dp->i_size, DIRBLKSIZ, u_offset_t);
	namlen = strlen(namep);
	last_offset = 0;

searchloop:
	while (offset < endsearch) {
		/*
		 * If offset is on a block boundary,
		 * read the next directory block.
		 * Release previous if it exists.
		 */
		if (blkoff(dp->i_fs, offset) == 0) {
			if (fbp != NULL) {
				fbrelse(fbp, S_OTHER);
			}
			err = blkatoff(dp, offset, (char **)0, &fbp);
			if (err)
				goto bad;
			entryoffsetinblock = 0;
		}

		/*
		 * If the offset to the next entry is invalid or if the
		 * next entry is a zero length record or if the record
		 * length is invalid, then skip to the next directory
		 * block.  Complete validation checks are done if the
		 * record length is invalid.
		 *
		 * Full validation checks are slow so they are disabled
		 * by default.  Complete checks can be run by patching
		 * "dirchk" to be true.
		 *
		 * We have to check the validity of entryoffsetinblock
		 * here because it can be set to i_diroff above.
		 */
		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblock);
		if ((entryoffsetinblock & 0x3) || ep->d_reclen == 0 ||
		    (dirchk || (ep->d_reclen & 0x3)) &&
		    dirmangled(dp, ep, entryoffsetinblock, offset)) {
			/* i = bytes left up to the next DIRBLKSIZ boundary */
			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
			offset += i;
			entryoffsetinblock += i;
			if (caching) {
				dnlc_dir_purge(dcap);
				caching = 0;
			}
			continue;
		}

		ep_reclen = ep->d_reclen;

		/*
		 * Add named entries and free space into the directory cache
		 */
		if (caching) {
			ushort_t extra;
			off_t off2;

			if (ep->d_ino == 0) {
				extra = ep_reclen;
				/*
				 * An unused entry that is not at the start
				 * of a DIRBLKSIZ block: give up caching
				 * this directory.
				 */
				if (offset & (DIRBLKSIZ - 1)) {
					dnlc_dir_purge(dcap);
					dp->i_cachedir = CD_DISABLED;
					caching = 0;
				}
			} else {
				/*
				 * entries hold the previous offset except the
				 * 1st which holds the offset + 1
				 */
				if (offset & (DIRBLKSIZ - 1)) {
					off2 = last_offset;
				} else {
					off2 = offset + 1;
				}
				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
				extra = ep_reclen - DIRSIZ(ep);
			}
			/* only record free space that could hold an entry */
			if (caching && (extra >= LDIRSIZ(1))) {
				caching = (dnlc_dir_add_space(dcap, extra,
				    (uint64_t)offset) == DOK);
			}
		}

		/*
		 * Check for a name match.
		 * We have the parent inode read locked with i_rwlock.
		 */
		if (ep->d_ino && ep->d_namlen == namlen &&
		    *namep == *ep->d_name &&	/* fast chk 1st chr */
		    bcmp(namep, ep->d_name, (int)ep->d_namlen) == 0) {

			/*
			 * We have to release the fbp early here to avoid
			 * a possible deadlock situation where we have the
			 * fbp and want the directory inode and someone doing
			 * a ufs_direnter_* has the directory inode and wants
			 * the fbp.  XXX - is this still needed?
			 */
			ep_ino = (ino_t)ep->d_ino;
			ASSERT(fbp != NULL);
			fbrelse(fbp, S_OTHER);
			fbp = NULL;

			/*
			 * Atomic update (read lock held)
			 */
			dp->i_diroff = offset;

			if (namlen == 2 && namep[0] == '.' && namep[1] == '.') {
				struct timeval32 omtime;

				if (caching) {
					dnlc_dir_purge(dcap);
					caching = 0;
				}
				if (doingchk) {
					/*
					 * if the inumber didn't change
					 * continue with already found inode.
					 */
					if (ep_ino == chkino)
						goto checkok;
					else {
						VN_RELE(ITOV(*ipp));
						/* *ipp is nulled at restart */
						goto restart;
					}
				}
				/*
				 * release the lock on the dir we are searching
				 * to avoid a deadlock when grabbing the
				 * i_contents lock in ufs_iget_alloced().
				 */
				omtime = dp->i_mtime;
				rw_exit(&dp->i_rwlock);
				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
				    RW_READER);
				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
				    cr);
				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
				rw_enter(&dp->i_rwlock, RW_READER);
				if (err)
					goto bad;
				/*
				 * Since we released the lock on the directory,
				 * we must check that the same inode is still
				 * the ".." entry for this directory.
				 */
				/*CSTYLED*/
				if (timercmp(&omtime, &dp->i_mtime, !=)) {
					/*
					 * Modification time changed on the
					 * directory, we must go check if
					 * the inumber changed for ".."
					 */
					doingchk = 1;
					chkino = ep_ino;
					entryoffsetinblock = 0;
					if (caching) {
						/*
						 * Forget directory caching
						 * for this rare case
						 */
						dnlc_dir_purge(dcap);
						caching = 0;
					}
					goto recheck;
				}
			} else if (dp->i_number == ep_ino) {
				VN_HOLD(dvp);	/* want ourself, "." */
				*ipp = dp;
				if (caching) {
					dnlc_dir_purge(dcap);
					caching = 0;
				}
			} else {
				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
				    RW_READER);
				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
				    cr);
				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
				if (err)
					goto bad;
			}
checkok:
			ASSERT(*ipp);
			dnlc_update(dvp, namep, ITOV(*ipp));
			/*
			 * If we are not caching then just return the entry
			 * otherwise complete loading up the cache
			 */
			if (!caching) {
				rw_exit(&dp->i_rwlock);
				return (0);
			}
			/* fbp was released above; re-read it to keep scanning */
			err = blkatoff(dp, offset, (char **)0, &fbp);
			if (err)
				goto bad;
		}
		last_offset = offset;
		offset += ep_reclen;
		entryoffsetinblock += ep_reclen;
	}
	/*
	 * If we started in the middle of the directory and failed
	 * to find our target, we must check the beginning as well.
	 */
	if (numdirpasses == 2) {
		numdirpasses--;
		offset = 0;
		endsearch = start_off;
		goto searchloop;
	}

	/*
	 * If whole directory caching is on (or was originally on) then
	 * the entry may have been found.
	 */
	if (*ipp == NULL) {
		err = ENOENT;
		if (ufs_negative_cache && (dp->i_nlink > 0)) {
			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
		}
	}
	if (caching) {
		dnlc_dir_complete(dcap);
		caching = 0;
	}

bad:
	if (err && *ipp) {
		/*
		 * err and *ipp can both be set if we were attempting to
		 * cache the directory, and we found the entry, then later
		 * while trying to complete the directory cache encountered
		 * a error (eg reading a directory sector).
		 */
		VN_RELE(ITOV(*ipp));
		*ipp = NULL;
	}

	if (fbp)
		fbrelse(fbp, S_OTHER);
	rw_exit(&dp->i_rwlock);
	/* a partially built directory cache must not be left behind */
	if (caching)
		dnlc_dir_purge(dcap);
	return (err);
}

/*
 * Write a new directory entry for DE_CREATE or DE_MKDIR operations.
 *
 * The caller must hold tdp->i_rwlock as writer (asserted below).  The
 * IQUIET bit of flags is set in tdp->i_flag while the new inode is made
 * and cleared again on the way out; the remaining bits of flags are
 * passed to ufs_dircheckforname() as its "noentry" argument.
 *
 * Returns 0 with *ipp set to the new inode, EEXIST if the name already
 * exists, EINVAL for DE_MKDIR or a VCHR/VBLK/VDOOR/VSOCK/VFIFO create in
 * an attribute directory, EACCES if namep contains a '/', ENOENT if tdp
 * has no links, ENOTDIR if tdp is not a directory, or an error from one
 * of the helpers called below.
 */
int
ufs_direnter_cm(
	struct inode *tdp,	/* target directory to make entry in */
	char *namep,		/* name of entry */
	enum de_op op,		/* entry operation */
	struct vattr *vap,	/* attributes if new inode needed */
	struct inode **ipp,	/* return entered inode here */
	struct cred *cr,	/* user credentials */
	int flags)		/* no entry exists */
{
	struct inode *tip;	/* inode of (existing) target file */
	char *s;
	struct ufs_slot slot;	/* slot info to pass around */
	int namlen;		/* length of name */
	int err;		/* error number */
	struct inode *nip;	/* new inode */
	int do_rele_nip = 0;	/* release nip */
	int noentry = flags & ~IQUIET;
	int quiet = flags & IQUIET;	/* Suppress out of inodes message */

	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));

	if (((tdp->i_mode & IFMT) == IFATTRDIR) && ((op == DE_MKDIR) ||
	    ((vap->va_type == VCHR) || (vap->va_type == VBLK) ||
	    (vap->va_type == VDOOR) || (vap->va_type == VSOCK) ||
	    (vap->va_type == VFIFO))))
		return (EINVAL);

	/* don't allow '/' characters in pathname component */
	for (s = namep, namlen = 0; *s; s++, namlen++)
		if (*s == '/')
			return (EACCES);
	ASSERT(namlen);

	/*
	 * If name is "." or ".." then if this is a create look it up
	 * and return EEXIST.
	 */
	if (namep[0] == '.' &&
	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
		/*
		 * ufs_dirlook will acquire the i_rwlock
		 */
		rw_exit(&tdp->i_rwlock);
		if (err = ufs_dirlook(tdp, namep, ipp, cr, 0)) {
			rw_enter(&tdp->i_rwlock, RW_WRITER);
			return (err);
		}
		/* re-take the writer lock the caller expects to still hold */
		rw_enter(&tdp->i_rwlock, RW_WRITER);
		return (EEXIST);
	}

	/*
	 * If target directory has not been removed, then we can consider
	 * allowing file to be created.
	 */
	if (tdp->i_nlink <= 0) {
		return (ENOENT);
	}

	/*
	 * Check accessibility of directory.
	 */
	if (((tdp->i_mode & IFMT) != IFDIR) &&
	    ((tdp->i_mode & IFMT) != IFATTRDIR)) {
		return (ENOTDIR);
	}

	/*
	 * Execute access is required to search the directory.
	 */
	if (err = ufs_iaccess(tdp, IEXEC, cr)) {
		return (err);
	}

	/*
	 * Search for the entry. Return VN_HELD tip if found.
	 */
	tip = NULL;
	slot.fbp = NULL;
	slot.status = NONE;
	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
	rw_enter(&tdp->i_contents, RW_WRITER);
	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, noentry);
	if (err)
		goto out;
	if (tip) {
		ASSERT(!noentry);
		*ipp = tip;
		err = EEXIST;
	} else {
		/*
		 * The entry does not exist. Check write permission in
		 * directory to see if entry can be created.
		 */
		if (err = ufs_iaccess(tdp, IWRITE, cr))
			goto out;
		/*
		 * Make new inode and directory entry.
		 */
		tdp->i_flag |= quiet;
		if (err = ufs_dirmakeinode(tdp, &nip, vap, op, cr)) {
			if (nip != NULL)
				do_rele_nip = 1;
			goto out;
		}
		if (err = ufs_diraddentry(tdp, namep, op,
		    namlen, &slot, nip, NULL, cr)) {
			/*
			 * Unmake the inode we just made.
			 */
			rw_enter(&nip->i_contents, RW_WRITER);
			if (((nip->i_mode & IFMT) == IFDIR) ||
			    ((nip->i_mode & IFMT) == IFATTRDIR)) {
				tdp->i_nlink--;
				ufs_setreclaim(tdp);
				tdp->i_flag |= ICHG;
				tdp->i_seq++;
				TRANS_INODE(tdp->i_ufsvfs, tdp);
				ITIMES_NOLOCK(tdp);
			}
			nip->i_nlink = 0;
			ufs_setreclaim(nip);
			TRANS_INODE(nip->i_ufsvfs, nip);
			nip->i_flag |= ICHG;
			nip->i_seq++;
			ITIMES_NOLOCK(nip);
			rw_exit(&nip->i_contents);
			do_rele_nip = 1;
		} else {
			*ipp = nip;
		}
	}

out:
	if (slot.fbp)
		fbrelse(slot.fbp, S_OTHER);

	tdp->i_flag &= ~quiet;
	rw_exit(&tdp->i_contents);

	/*
	 * Drop vfs_dqrwlock before calling VN_RELE() on nip to
	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
	 */
	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);

	if (do_rele_nip) {
		VN_RELE(ITOV(nip));
	}

	return (err);
}

/*
 * Write a new directory entry for DE_LINK, DE_SYMLINK or DE_RENAME operations.
 * If tvpp is non-null, return with the pointer to the target vnode.
 *
 * The caller must hold tdp->i_rwlock as writer (asserted below).  For
 * operations other than DE_SYMLINK the link count of sip is bumped before
 * the entry is made and dropped again if the operation fails.
 *
 * Returns 0 on success, EACCES if namep contains a '/', EINVAL for a
 * rename to "." or "..", EEXIST for a link to "." or ".." or to an
 * existing name, ENOENT if sip or tdp has no links, EMLINK if sip is
 * already at MAXLINK, ENOTDIR if tdp is not a directory, or an error
 * from one of the helpers called below.
 */
int
ufs_direnter_lr(
	struct inode *tdp,	/* target directory to make entry in */
	char *namep,		/* name of entry */
	enum de_op op,		/* entry operation */
	struct inode *sdp,	/* source inode parent if rename */
	struct inode *sip,	/* source inode */
	struct cred *cr,	/* user credentials */
	vnode_t **tvpp)		/* Return: (held) vnode of (existing) target */
{
	struct inode *tip;	/* inode of (existing) target file */
	char *s;
	struct ufs_slot slot;	/* slot info to pass around */
	int namlen;		/* length of name */
	int err;		/* error number */

	/* don't allow '/' characters in pathname component */
	for (s = namep, namlen = 0; *s; s++, namlen++)
		if (*s == '/')
			return (EACCES);
	ASSERT(namlen);
	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));

	/*
	 * If name is "." or ".." then if this is a create look it up
	 * and return EEXIST.  Rename or link TO "." or ".." is forbidden.
	 */
	if (namep[0] == '.' &&
	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
		if (op == DE_RENAME) {
			return (EINVAL);	/* *SIGH* should be ENOTEMPTY */
		}
		return (EEXIST);
	}
	/*
	 * For link and rename lock the source entry and check the link count
	 * to see if it has been removed while it was unlocked.  If not, we
	 * increment the link count and force the inode to disk to make sure
	 * that it is there before any directory entry that points to it.
	 *
	 * In the case of a symbolic link, we are dealing with a new inode
	 * which does not yet have any links.  We've created it with a link
	 * count of 1, and we don't want to increment it since this will be
	 * its first link.
	 *
	 * We are about to push the inode to disk. We make sure
	 * that the inode's data blocks are flushed first so the
	 * inode and its data blocks are always in sync.  This
	 * adds some robustness in the event of a power failure
	 * or panic where sync fails. If we panic before the
	 * inode is updated, then the inode still refers to the
	 * old data blocks (or none for a new file). If we panic
	 * after the inode is updated, then the inode refers to
	 * the new data blocks.
	 *
	 * We do this before grabbing the i_contents lock because
	 * ufs_syncip() will want that lock. We could do the data
	 * syncing after the removal checks, but upon return from
	 * the data sync we would have to repeat the removal
	 * checks.
	 */
	if (err = TRANS_SYNCIP(sip, 0, I_DSYNC, TOP_FSYNC)) {
		return (err);
	}

	rw_enter(&sip->i_contents, RW_WRITER);
	if (sip->i_nlink <= 0) {
		rw_exit(&sip->i_contents);
		return (ENOENT);
	}
	if (sip->i_nlink == MAXLINK) {
		rw_exit(&sip->i_contents);
		return (EMLINK);
	}

	/*
	 * Sync the indirect blocks associated with the file
	 * for the same reasons as described above.  Since this
	 * call wants the i_contents lock held for it we can do
	 * this here with no extra work.
	 */
	if (err = ufs_sync_indir(sip)) {
		rw_exit(&sip->i_contents);
		return (err);
	}

	if (op != DE_SYMLINK)
		sip->i_nlink++;
	TRANS_INODE(sip->i_ufsvfs, sip);
	sip->i_flag |= ICHG;
	sip->i_seq++;
	ufs_iupdat(sip, I_SYNC);
	rw_exit(&sip->i_contents);

	/*
	 * From here on every failure must leave through out2 (or out) so
	 * that the link count bumped above is undone.
	 */

	/*
	 * If target directory has not been removed, then we can consider
	 * allowing file to be created.
	 */
	if (tdp->i_nlink <= 0) {
		err = ENOENT;
		goto out2;
	}
	/*
	 * Check accessibility of directory.
	 */
	if (((tdp->i_mode & IFMT) != IFDIR) &&
	    (tdp->i_mode & IFMT) != IFATTRDIR) {
		err = ENOTDIR;
		goto out2;
	}
	/*
	 * Execute access is required to search the directory.
	 */
	if (err = ufs_iaccess(tdp, IEXEC, cr)) {
		goto out2;
	}

	/*
	 * Search for the entry. Return VN_HELD tip if found.
	 */
	tip = NULL;
	slot.status = NONE;
	slot.fbp = NULL;
	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
	rw_enter(&tdp->i_contents, RW_WRITER);
	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, 0);
	if (err)
		goto out;

	if (tip) {
		switch (op) {
		case DE_RENAME:
			err = ufs_dirrename(sdp, sip, tdp, namep,
			    tip, &slot, cr);
			break;

		case DE_LINK:
		case DE_SYMLINK:
			/*
			 * Can't link to an existing file.
			 */
			err = EEXIST;
			break;
		default:
			break;
		}
	} else {
		/*
		 * The entry does not exist. Check write permission in
		 * directory to see if entry can be created.
		 */
		if (err = ufs_iaccess(tdp, IWRITE, cr))
			goto out;
		err = ufs_diraddentry(tdp, namep, op, namlen, &slot, sip, sdp,
		    cr);
	}

out:
	if (slot.fbp)
		fbrelse(slot.fbp, S_OTHER);

	rw_exit(&tdp->i_contents);

	/*
	 * Drop vfs_dqrwlock before calling VN_RELE() on tip to
	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
	 */
	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);

	/*
	 * If we renamed a file over the top of an existing file,
	 * or linked a file to an existing file (or tried to),
	 * then set *tvpp to the target vnode, if tvpp is non-null
	 * otherwise, release and delete (or just release) the inode.
	 *
	 * N.B., by returning the target's vnode pointer to the caller,
	 * that caller becomes responsible for doing the VN_RELE.
	 */
	if (tip) {
		if ((err == 0) && (tvpp != NULL)) {
			*tvpp = ITOV(tip);
		} else {
			VN_RELE(ITOV(tip));
		}
	}

out2:
	if (err) {
		/*
		 * Undo bumped link count.
		 */
		if (op != DE_SYMLINK) {
			rw_enter(&sip->i_contents, RW_WRITER);
			sip->i_nlink--;
			ufs_setreclaim(sip);
			TRANS_INODE(sip->i_ufsvfs, sip);
			sip->i_flag |= ICHG;
			sip->i_seq++;
			ITIMES_NOLOCK(sip);
			rw_exit(&sip->i_contents);
		}
	}
	return (err);
}

/*
 * Check for the existence of a name in a directory (unless noentry
 * is set) , or else of an empty
 * slot in which an entry may be made.  If the requested name is found,
 * then on return *ipp points at the inode and *offp contains
 * its offset in the directory.
If the name is not found, then *ipp 979 * will be NULL and *slotp will contain information about a directory slot in 980 * which an entry may be made (either an empty slot, or the first position 981 * past the end of the directory). 982 * The target directory inode (tdp) is supplied write locked (i_rwlock). 983 * 984 * This may not be used on "." or "..", but aliases of "." are ok. 985 */ 986 int 987 ufs_dircheckforname( 988 struct inode *tdp, /* inode of directory being checked */ 989 char *namep, /* name we're checking for */ 990 int namlen, /* length of name, excluding null */ 991 struct ufs_slot *slotp, /* slot structure */ 992 struct inode **ipp, /* return inode if we find one */ 993 struct cred *cr, 994 int noentry) /* noentry - just look for space */ 995 { 996 uint64_t handle; 997 struct fbuf *fbp; /* pointer to directory block */ 998 struct direct *ep; /* directory entry */ 999 struct direct *nep; /* next directory entry */ 1000 dcanchor_t *dcap; 1001 vnode_t *dvp; /* directory vnode ptr */ 1002 off_t dirsize; /* size of the directory */ 1003 off_t offset; /* offset in the directory */ 1004 off_t last_offset; /* last offset */ 1005 off_t enduseful; /* pointer past last used dir slot */ 1006 int entryoffsetinblk; /* offset of ep in fbp's buffer */ 1007 int i; /* length of mangled entry */ 1008 int needed; 1009 int err; 1010 int first; 1011 int caching; 1012 int stat; 1013 ino_t ep_ino; 1014 slotstat_t initstat = slotp->status; 1015 1016 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 1017 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 1018 ASSERT(*ipp == NULL); 1019 fbp = NULL; 1020 1021 /* 1022 * First check if there is a complete cache of the directory. 1023 */ 1024 dvp = ITOV(tdp); 1025 1026 dcap = &tdp->i_danchor; 1027 if (noentry) { 1028 /* 1029 * We know from the 1st level dnlc cache that the entry 1030 * doesn't exist, so don't bother searching the directory 1031 * cache, but just look for space (possibly in the directory 1032 * cache). 
1033 */ 1034 stat = DNOENT; 1035 } else { 1036 stat = dnlc_dir_lookup(dcap, namep, &handle); 1037 } 1038 switch (stat) { 1039 case DFOUND: 1040 ep_ino = (ino_t)H_TO_INO(handle); 1041 if (tdp->i_number == ep_ino) { 1042 *ipp = tdp; /* we want ourself, ie "." */ 1043 VN_HOLD(dvp); 1044 } else { 1045 err = ufs_iget_alloced(tdp->i_vfs, ep_ino, ipp, cr); 1046 if (err) 1047 return (err); 1048 } 1049 offset = H_TO_OFF(handle); 1050 first = 0; 1051 if (offset & 1) { 1052 /* This is the first entry in the block */ 1053 first = 1; 1054 offset -= 1; 1055 ASSERT((offset & (DIRBLKSIZ - 1)) == 0); 1056 } 1057 err = blkatoff(tdp, offset, (char **)&ep, &fbp); 1058 if (err) { 1059 VN_RELE(ITOV(*ipp)); 1060 *ipp = NULL; 1061 return (err); 1062 } 1063 /* 1064 * Check the validity of the entry. 1065 * If it's bad, then throw away the cache and 1066 * continue without it. The dirmangled() routine 1067 * will then be called upon it. 1068 */ 1069 if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) { 1070 VN_RELE(ITOV(*ipp)); 1071 *ipp = NULL; 1072 dnlc_dir_purge(dcap); 1073 break; 1074 } 1075 /* 1076 * Remember the returned offset is the offset of the 1077 * preceding record (unless this is the 1st record 1078 * in the DIRBLKSIZ sized block (disk sector)), then it's 1079 * offset + 1. Note, no real offsets are on odd boundaries. 1080 */ 1081 if (first) { 1082 ASSERT((offset & (DIRBLKSIZ - 1)) == 0); 1083 slotp->offset = offset; 1084 slotp->size = 0; 1085 slotp->ep = ep; 1086 } else { 1087 /* get the next entry */ 1088 nep = (struct direct *)((char *)ep + ep->d_reclen); 1089 /* 1090 * Check the validity of this entry as well 1091 * If it's bad, then throw away the cache and 1092 * continue without it. The dirmangled() routine 1093 * will then be called upon it. 
1094 */ 1095 if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) || 1096 (nep->d_ino != ep_ino)) { 1097 VN_RELE(ITOV(*ipp)); 1098 *ipp = NULL; 1099 dnlc_dir_purge(dcap); 1100 break; 1101 } 1102 slotp->offset = offset + ep->d_reclen; 1103 slotp->size = ep->d_reclen; 1104 slotp->ep = nep; 1105 } 1106 slotp->status = EXIST; 1107 slotp->fbp = fbp; 1108 slotp->endoff = 0; 1109 slotp->cached = 1; 1110 dnlc_update(dvp, namep, ITOV(*ipp)); 1111 return (0); 1112 case DNOENT: 1113 /* 1114 * The caller gets to set the initial slot status to 1115 * indicate whether it's interested in getting a 1116 * empty slot. For example, the status can be set 1117 * to FOUND when an entry is being deleted. 1118 */ 1119 ASSERT(slotp->fbp == NULL); 1120 if (slotp->status == FOUND) { 1121 return (0); 1122 } 1123 switch (dnlc_dir_rem_space_by_len(dcap, LDIRSIZ(namlen), 1124 &handle)) { 1125 case DFOUND: 1126 offset = (off_t)handle; 1127 err = blkatoff(tdp, offset, (char **)&ep, &fbp); 1128 if (err) { 1129 dnlc_dir_purge(dcap); 1130 ASSERT(*ipp == NULL); 1131 return (err); 1132 } 1133 /* 1134 * Check the validity of the entry. 1135 * If it's bad, then throw away the cache and 1136 * continue without it. The dirmangled() routine 1137 * will then be called upon it. 1138 */ 1139 if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) { 1140 dnlc_dir_purge(dcap); 1141 break; 1142 } 1143 /* 1144 * Remember the returned offset is the offset of the 1145 * containing record. 1146 */ 1147 slotp->status = FOUND; 1148 slotp->ep = ep; 1149 slotp->offset = offset; 1150 slotp->fbp = fbp; 1151 slotp->size = ep->d_reclen; 1152 /* 1153 * Set end offset to 0. Truncation is handled 1154 * because the dnlc cache will blow away the 1155 * cached directory when an entry is removed 1156 * that drops the entries left to less than half 1157 * the minumum number (dnlc_min_dir_cache). 
1158 */ 1159 slotp->endoff = 0; 1160 slotp->cached = 1; 1161 return (0); 1162 case DNOENT: 1163 slotp->status = NONE; 1164 slotp->offset = P2ROUNDUP_TYPED(tdp->i_size, 1165 DIRBLKSIZ, u_offset_t); 1166 slotp->size = DIRBLKSIZ; 1167 slotp->endoff = 0; 1168 slotp->cached = 1; 1169 return (0); 1170 default: 1171 break; 1172 } 1173 break; 1174 } 1175 slotp->cached = 0; 1176 caching = NULL; 1177 if (!noentry && tdp->i_size >= ufs_min_dir_cache) { 1178 /* 1179 * if the directory caching disable time has expired 1180 * enable caching again. 1181 */ 1182 if (tdp->i_cachedir == CD_DISABLED_NOMEM && 1183 gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) { 1184 ufs_dc_disable_at = 0; 1185 tdp->i_cachedir = CD_ENABLED; 1186 } 1187 /* 1188 * Attempt to cache any directories greater than the tunable 1189 * ufs_min_cache_dir. If it fails due to memory shortage 1190 * (DNOMEM), disable caching for this directory and record 1191 * the system time. Any attempt after the disable time has 1192 * expired will enable the caching again. 1193 */ 1194 if (tdp->i_cachedir == CD_ENABLED) { 1195 switch (dnlc_dir_start(dcap, 1196 tdp->i_size >> AV_DIRECT_SHIFT)) { 1197 case DNOMEM: 1198 tdp->i_cachedir = CD_DISABLED_NOMEM; 1199 ufs_dc_disable_at = gethrtime(); 1200 break; 1201 case DTOOBIG: 1202 tdp->i_cachedir = CD_DISABLED_TOOBIG; 1203 break; 1204 case DOK: 1205 caching = 1; 1206 break; 1207 default: 1208 break; 1209 } 1210 } 1211 } 1212 1213 /* 1214 * No point in using i_diroff since we must search whole directory 1215 */ 1216 dirsize = P2ROUNDUP_TYPED(tdp->i_size, DIRBLKSIZ, u_offset_t); 1217 enduseful = 0; 1218 offset = last_offset = 0; 1219 entryoffsetinblk = 0; 1220 needed = (int)LDIRSIZ(namlen); 1221 while (offset < dirsize) { 1222 /* 1223 * If offset is on a block boundary, 1224 * read the next directory block. 1225 * Release previous if it exists. 
1226 */ 1227 if (blkoff(tdp->i_fs, offset) == 0) { 1228 if (fbp != NULL) 1229 fbrelse(fbp, S_OTHER); 1230 1231 err = blkatoff(tdp, offset, (char **)0, &fbp); 1232 if (err) { 1233 ASSERT(*ipp == NULL); 1234 if (caching) { 1235 dnlc_dir_purge(dcap); 1236 } 1237 return (err); 1238 } 1239 entryoffsetinblk = 0; 1240 } 1241 /* 1242 * If still looking for a slot, and at a DIRBLKSIZ 1243 * boundary, have to start looking for free space 1244 * again. 1245 */ 1246 if (slotp->status == NONE && 1247 (entryoffsetinblk & (DIRBLKSIZ - 1)) == 0) { 1248 slotp->offset = -1; 1249 } 1250 /* 1251 * If the next entry is a zero length record or if the 1252 * record length is invalid, then skip to the next 1253 * directory block. Complete validation checks are 1254 * done if the record length is invalid. 1255 * 1256 * Full validation checks are slow so they are disabled 1257 * by default. Complete checks can be run by patching 1258 * "dirchk" to be true. 1259 * 1260 * We do not have to check the validity of 1261 * entryoffsetinblk here because it starts out as zero 1262 * and is only incremented by d_reclen values that we 1263 * validate here. 
1264 */ 1265 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk); 1266 if (ep->d_reclen == 0 || 1267 (dirchk || (ep->d_reclen & 0x3)) && 1268 dirmangled(tdp, ep, entryoffsetinblk, offset)) { 1269 i = DIRBLKSIZ - (entryoffsetinblk & (DIRBLKSIZ - 1)); 1270 offset += i; 1271 entryoffsetinblk += i; 1272 if (caching) { 1273 dnlc_dir_purge(dcap); 1274 caching = 0; 1275 } 1276 continue; 1277 } 1278 1279 /* 1280 * Add named entries and free space into the directory cache 1281 */ 1282 if (caching) { 1283 ushort_t extra; 1284 off_t off2; 1285 1286 if (ep->d_ino == 0) { 1287 extra = ep->d_reclen; 1288 if (offset & (DIRBLKSIZ - 1)) { 1289 dnlc_dir_purge(dcap); 1290 caching = 0; 1291 } 1292 } else { 1293 /* 1294 * entries hold the previous offset if 1295 * not the 1st one 1296 */ 1297 if (offset & (DIRBLKSIZ - 1)) { 1298 off2 = last_offset; 1299 } else { 1300 off2 = offset + 1; 1301 } 1302 caching = (dnlc_dir_add_entry(dcap, ep->d_name, 1303 INO_OFF_TO_H(ep->d_ino, off2)) == DOK); 1304 extra = ep->d_reclen - DIRSIZ(ep); 1305 } 1306 if (caching && (extra >= LDIRSIZ(1))) { 1307 caching = (dnlc_dir_add_space(dcap, extra, 1308 (uint64_t)offset) == DOK); 1309 } 1310 } 1311 1312 /* 1313 * If an appropriate sized slot has not yet been found, 1314 * check to see if one is available. 1315 */ 1316 if ((slotp->status != FOUND) && (slotp->status != EXIST)) { 1317 int size = ep->d_reclen; 1318 1319 if (ep->d_ino != 0) 1320 size -= DIRSIZ(ep); 1321 if (size > 0) { 1322 if (size >= needed) { 1323 slotp->offset = offset; 1324 slotp->size = ep->d_reclen; 1325 if (noentry) { 1326 slotp->ep = ep; 1327 slotp->fbp = fbp; 1328 slotp->status = FOUND; 1329 slotp->endoff = 0; 1330 return (0); 1331 } 1332 slotp->status = FOUND; 1333 } else if (slotp->status == NONE) { 1334 if (slotp->offset == -1) 1335 slotp->offset = offset; 1336 } 1337 } 1338 } 1339 /* 1340 * Check for a name match. 
1341 */ 1342 if (ep->d_ino && ep->d_namlen == namlen && 1343 *namep == *ep->d_name && /* fast chk 1st char */ 1344 bcmp(namep, ep->d_name, namlen) == 0) { 1345 1346 tdp->i_diroff = offset; 1347 1348 if (tdp->i_number == ep->d_ino) { 1349 *ipp = tdp; /* we want ourself, ie "." */ 1350 VN_HOLD(dvp); 1351 } else { 1352 err = ufs_iget_alloced(tdp->i_vfs, 1353 (ino_t)ep->d_ino, ipp, cr); 1354 if (err) { 1355 fbrelse(fbp, S_OTHER); 1356 if (caching) 1357 dnlc_dir_purge(dcap); 1358 return (err); 1359 } 1360 } 1361 slotp->status = EXIST; 1362 slotp->offset = offset; 1363 slotp->size = (int)(offset - last_offset); 1364 slotp->fbp = fbp; 1365 slotp->ep = ep; 1366 slotp->endoff = 0; 1367 if (caching) 1368 dnlc_dir_purge(dcap); 1369 return (0); 1370 } 1371 last_offset = offset; 1372 offset += ep->d_reclen; 1373 entryoffsetinblk += ep->d_reclen; 1374 if (ep->d_ino) 1375 enduseful = offset; 1376 } 1377 if (fbp) { 1378 fbrelse(fbp, S_OTHER); 1379 } 1380 1381 if (caching) { 1382 dnlc_dir_complete(dcap); 1383 slotp->cached = 1; 1384 if (slotp->status == FOUND) { 1385 if (initstat == FOUND) { 1386 return (0); 1387 } 1388 (void) dnlc_dir_rem_space_by_handle(dcap, 1389 slotp->offset); 1390 slotp->endoff = 0; 1391 return (0); 1392 } 1393 } 1394 1395 if (slotp->status == NONE) { 1396 /* 1397 * We didn't find a slot; the new directory entry should be put 1398 * at the end of the directory. Return an indication of where 1399 * this is, and set "endoff" to zero; since we're going to have 1400 * to extend the directory, we're certainly not going to 1401 * truncate it. 1402 */ 1403 slotp->offset = dirsize; 1404 slotp->size = DIRBLKSIZ; 1405 slotp->endoff = 0; 1406 } else { 1407 /* 1408 * We found a slot, and will return an indication of where that 1409 * slot is, as any new directory entry will be put there. 1410 * Since that slot will become a useful entry, if the last 1411 * useful entry we found was before this one, update the offset 1412 * of the last useful entry. 
1413 */ 1414 if (enduseful < slotp->offset + slotp->size) 1415 enduseful = slotp->offset + slotp->size; 1416 slotp->endoff = P2ROUNDUP_TYPED(enduseful, DIRBLKSIZ, off_t); 1417 } 1418 *ipp = NULL; 1419 return (0); 1420 } 1421 1422 uint64_t ufs_dirrename_retry_cnt; 1423 1424 /* 1425 * Rename the entry in the directory tdp so that it points to 1426 * sip instead of tip. 1427 */ 1428 static int 1429 ufs_dirrename( 1430 struct inode *sdp, /* parent directory of source */ 1431 struct inode *sip, /* source inode */ 1432 struct inode *tdp, /* parent directory of target */ 1433 char *namep, /* entry we are trying to change */ 1434 struct inode *tip, /* target inode */ 1435 struct ufs_slot *slotp, /* slot for entry */ 1436 struct cred *cr) /* credentials */ 1437 { 1438 vnode_t *tdvp; 1439 off_t offset; 1440 int err; 1441 int doingdirectory; 1442 1443 ASSERT(sdp->i_ufsvfs != NULL); 1444 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 1445 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 1446 /* 1447 * Short circuit rename of something to itself. 1448 */ 1449 if (sip->i_number == tip->i_number) { 1450 return (ESAME); /* special KLUDGE error code */ 1451 } 1452 1453 /* 1454 * We're locking 2 peer level locks, so must use tryenter 1455 * on the 2nd to avoid deadlocks that would occur 1456 * if we renamed a->b and b->a concurrently. 1457 */ 1458 retry: 1459 rw_enter(&tip->i_contents, RW_WRITER); 1460 if (!rw_tryenter(&sip->i_contents, RW_READER)) { 1461 /* 1462 * drop tip and wait (sleep) until we stand a chance 1463 * of holding sip 1464 */ 1465 rw_exit(&tip->i_contents); 1466 rw_enter(&sip->i_contents, RW_READER); 1467 /* 1468 * Reverse the lock grabs in case we have heavy 1469 * contention on the 2nd lock. 1470 */ 1471 if (!rw_tryenter(&tip->i_contents, RW_WRITER)) { 1472 ufs_dirrename_retry_cnt++; 1473 rw_exit(&sip->i_contents); 1474 goto retry; 1475 } 1476 } 1477 1478 /* 1479 * Check that everything is on the same filesystem. 
1480 */ 1481 if ((ITOV(tip)->v_vfsp != ITOV(tdp)->v_vfsp) || 1482 (ITOV(tip)->v_vfsp != ITOV(sip)->v_vfsp)) { 1483 err = EXDEV; /* XXX archaic */ 1484 goto out; 1485 } 1486 /* 1487 * Must have write permission to rewrite target entry. 1488 * Perform additional checks for sticky directories. 1489 */ 1490 if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0 || 1491 (err = ufs_sticky_remove_access(tdp, tip, cr)) != 0) 1492 goto out; 1493 1494 /* 1495 * Ensure source and target are compatible (both directories 1496 * or both not directories). If target is a directory it must 1497 * be empty and have no links to it; in addition it must not 1498 * be a mount point, and both the source and target must be 1499 * writable. 1500 */ 1501 doingdirectory = (((sip->i_mode & IFMT) == IFDIR) || 1502 ((sip->i_mode & IFMT) == IFATTRDIR)); 1503 if (((tip->i_mode & IFMT) == IFDIR) || 1504 ((tip->i_mode & IFMT) == IFATTRDIR)) { 1505 if (!doingdirectory) { 1506 err = EISDIR; 1507 goto out; 1508 } 1509 /* 1510 * vn_vfsrlock will prevent mounts from using the directory 1511 * until we are done. 1512 */ 1513 if (vn_vfsrlock(ITOV(tip))) { 1514 err = EBUSY; 1515 goto out; 1516 } 1517 if (vn_mountedvfs(ITOV(tip)) != NULL) { 1518 vn_vfsunlock(ITOV(tip)); 1519 err = EBUSY; 1520 goto out; 1521 } 1522 if (!ufs_dirempty(tip, tdp->i_number, cr) || tip->i_nlink > 2) { 1523 vn_vfsunlock(ITOV(tip)); 1524 err = EEXIST; /* SIGH should be ENOTEMPTY */ 1525 goto out; 1526 } 1527 } else if (doingdirectory) { 1528 err = ENOTDIR; 1529 goto out; 1530 } 1531 1532 /* 1533 * Rewrite the inode pointer for target name entry 1534 * from the target inode (ip) to the source inode (sip). 1535 * This prevents the target entry from disappearing 1536 * during a crash. Mark the directory inode to reflect the changes. 
1537 */ 1538 tdvp = ITOV(tdp); 1539 slotp->ep->d_ino = (int32_t)sip->i_number; 1540 dnlc_update(tdvp, namep, ITOV(sip)); 1541 if (slotp->size) { 1542 offset = slotp->offset - slotp->size; 1543 } else { 1544 offset = slotp->offset + 1; 1545 } 1546 if (slotp->cached) { 1547 (void) dnlc_dir_update(&tdp->i_danchor, namep, 1548 INO_OFF_TO_H(slotp->ep->d_ino, offset)); 1549 } 1550 1551 err = TRANS_DIR(tdp, slotp->offset); 1552 if (err) 1553 fbrelse(slotp->fbp, S_OTHER); 1554 else 1555 err = ufs_fbwrite(slotp->fbp, tdp); 1556 1557 slotp->fbp = NULL; 1558 if (err) { 1559 if (doingdirectory) 1560 vn_vfsunlock(ITOV(tip)); 1561 goto out; 1562 } 1563 1564 TRANS_INODE(tdp->i_ufsvfs, tdp); 1565 tdp->i_flag |= IUPD|ICHG; 1566 tdp->i_seq++; 1567 ITIMES_NOLOCK(tdp); 1568 1569 /* 1570 * Decrement the link count of the target inode. 1571 * Fix the ".." entry in sip to point to dp. 1572 * This is done after the new entry is on the disk. 1573 */ 1574 tip->i_nlink--; 1575 TRANS_INODE(tip->i_ufsvfs, tip); 1576 tip->i_flag |= ICHG; 1577 tip->i_seq++; 1578 ITIMES_NOLOCK(tip); 1579 if (doingdirectory) { 1580 /* 1581 * The entry for tip no longer exists so I can unlock the 1582 * vfslock. 1583 */ 1584 vn_vfsunlock(ITOV(tip)); 1585 /* 1586 * Decrement target link count once more if it was a directory. 1587 */ 1588 if (--tip->i_nlink != 0) { 1589 err = ufs_fault(ITOV(tip), 1590 "ufs_dirrename: target directory link count != 0 (%s)", 1591 tip->i_fs->fs_fsmnt); 1592 rw_exit(&tip->i_contents); 1593 return (err); 1594 } 1595 TRANS_INODE(tip->i_ufsvfs, tip); 1596 ufs_setreclaim(tip); 1597 /* 1598 * Renaming a directory with the parent different 1599 * requires that ".." be rewritten. The window is 1600 * still there for ".." to be inconsistent, but this 1601 * is unavoidable, and a lot shorter than when it was 1602 * done in a user process. We decrement the link 1603 * count in the new parent as appropriate to reflect 1604 * the just-removed target. 
If the parent is the 1605 * same, this is appropriate since the original 1606 * directory is going away. If the new parent is 1607 * different, ufs_dirfixdotdot() will bump the link count 1608 * back. 1609 */ 1610 tdp->i_nlink--; 1611 ufs_setreclaim(tdp); 1612 TRANS_INODE(tdp->i_ufsvfs, tdp); 1613 tdp->i_flag |= ICHG; 1614 tdp->i_seq++; 1615 ITIMES_NOLOCK(tdp); 1616 if (sdp != tdp) { 1617 rw_exit(&tip->i_contents); 1618 rw_exit(&sip->i_contents); 1619 err = ufs_dirfixdotdot(sip, sdp, tdp); 1620 return (err); 1621 } 1622 } else 1623 ufs_setreclaim(tip); 1624 out: 1625 rw_exit(&tip->i_contents); 1626 rw_exit(&sip->i_contents); 1627 return (err); 1628 } 1629 1630 /* 1631 * Fix the ".." entry of the child directory so that it points 1632 * to the new parent directory instead of the old one. Routine 1633 * assumes that dp is a directory and that all the inodes are on 1634 * the same file system. 1635 */ 1636 static int 1637 ufs_dirfixdotdot( 1638 struct inode *dp, /* child directory */ 1639 struct inode *opdp, /* old parent directory */ 1640 struct inode *npdp) /* new parent directory */ 1641 { 1642 struct fbuf *fbp; 1643 struct dirtemplate *dirp; 1644 vnode_t *dvp; 1645 int err; 1646 1647 ASSERT(RW_WRITE_HELD(&npdp->i_rwlock)); 1648 ASSERT(RW_WRITE_HELD(&npdp->i_contents)); 1649 1650 /* 1651 * We hold the child directory's i_contents lock before calling 1652 * blkatoff so that we honor correct locking protocol which is 1653 * i_contents lock and then page lock. (blkatoff will call 1654 * ufs_getpage where we want the page lock) 1655 * We hold the child directory's i_rwlock before i_contents (as 1656 * per the locking protocol) since we are modifying the ".." entry 1657 * of the child directory. 1658 * We hold the i_rwlock and i_contents lock until we record 1659 * this directory delta to the log (via ufs_trans_dir) and have 1660 * done fbrelse. 
1661 */ 1662 rw_enter(&dp->i_rwlock, RW_WRITER); 1663 rw_enter(&dp->i_contents, RW_WRITER); 1664 err = blkatoff(dp, (off_t)0, (char **)&dirp, &fbp); 1665 if (err) 1666 goto bad; 1667 1668 if (dp->i_nlink <= 0 || 1669 dp->i_size < sizeof (struct dirtemplate)) { 1670 err = ENOENT; 1671 goto bad; 1672 } 1673 1674 if (dirp->dotdot_namlen != 2 || 1675 dirp->dotdot_name[0] != '.' || 1676 dirp->dotdot_name[1] != '.') { /* Sanity check. */ 1677 dirbad(dp, "mangled .. entry", (off_t)0); 1678 err = ENOTDIR; 1679 goto bad; 1680 } 1681 1682 /* 1683 * Increment the link count in the new parent inode and force it out. 1684 */ 1685 if (npdp->i_nlink == MAXLINK) { 1686 err = EMLINK; 1687 goto bad; 1688 } 1689 npdp->i_nlink++; 1690 TRANS_INODE(npdp->i_ufsvfs, npdp); 1691 npdp->i_flag |= ICHG; 1692 npdp->i_seq++; 1693 ufs_iupdat(npdp, I_SYNC); 1694 1695 /* 1696 * Rewrite the child ".." entry and force it out. 1697 */ 1698 dvp = ITOV(dp); 1699 dirp->dotdot_ino = (uint32_t)npdp->i_number; 1700 dnlc_update(dvp, "..", ITOV(npdp)); 1701 (void) dnlc_dir_update(&dp->i_danchor, "..", 1702 INO_OFF_TO_H(dirp->dotdot_ino, 0)); 1703 1704 err = TRANS_DIR(dp, 0); 1705 if (err) 1706 fbrelse(fbp, S_OTHER); 1707 else 1708 err = ufs_fbwrite(fbp, dp); 1709 1710 fbp = NULL; 1711 if (err) 1712 goto bad; 1713 1714 rw_exit(&dp->i_contents); 1715 rw_exit(&dp->i_rwlock); 1716 1717 /* 1718 * Decrement the link count of the old parent inode and force it out. 1719 */ 1720 ASSERT(opdp); 1721 rw_enter(&opdp->i_contents, RW_WRITER); 1722 ASSERT(opdp->i_nlink > 0); 1723 opdp->i_nlink--; 1724 ufs_setreclaim(opdp); 1725 TRANS_INODE(opdp->i_ufsvfs, opdp); 1726 opdp->i_flag |= ICHG; 1727 opdp->i_seq++; 1728 ufs_iupdat(opdp, I_SYNC); 1729 rw_exit(&opdp->i_contents); 1730 return (0); 1731 1732 bad: 1733 if (fbp) 1734 fbrelse(fbp, S_OTHER); 1735 rw_exit(&dp->i_contents); 1736 rw_exit(&dp->i_rwlock); 1737 return (err); 1738 } 1739 1740 /* 1741 * Enter the file sip in the directory tdp with name namep. 
1742 */ 1743 static int 1744 ufs_diraddentry( 1745 struct inode *tdp, 1746 char *namep, 1747 enum de_op op, 1748 int namlen, 1749 struct ufs_slot *slotp, 1750 struct inode *sip, 1751 struct inode *sdp, 1752 struct cred *cr) 1753 { 1754 struct direct *ep, *nep; 1755 vnode_t *tdvp; 1756 dcanchor_t *dcap = &tdp->i_danchor; 1757 off_t offset; 1758 int err; 1759 ushort_t extra; 1760 1761 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 1762 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 1763 /* 1764 * Prepare a new entry. If the caller has not supplied an 1765 * existing inode, make a new one. 1766 */ 1767 err = dirprepareentry(tdp, slotp, cr); 1768 if (err) { 1769 if (slotp->fbp) { 1770 fbrelse(slotp->fbp, S_OTHER); 1771 slotp->fbp = NULL; 1772 } 1773 return (err); 1774 } 1775 /* 1776 * Check inode to be linked to see if it is in the 1777 * same filesystem. 1778 */ 1779 if (ITOV(tdp)->v_vfsp != ITOV(sip)->v_vfsp) { 1780 err = EXDEV; 1781 goto bad; 1782 } 1783 1784 /* 1785 * If renaming a directory then fix up the ".." entry in the 1786 * directory to point to the new parent. 1787 */ 1788 if ((op == DE_RENAME) && (((sip->i_mode & IFMT) == IFDIR) || 1789 ((sip->i_mode & IFMT) == IFATTRDIR)) && (sdp != tdp)) { 1790 err = ufs_dirfixdotdot(sip, sdp, tdp); 1791 if (err) 1792 goto bad; 1793 } 1794 1795 /* 1796 * Fill in entry data. 1797 */ 1798 ep = slotp->ep; 1799 ep->d_namlen = (ushort_t)namlen; 1800 (void) strncpy(ep->d_name, namep, (size_t)((namlen + 4) & ~3)); 1801 ep->d_ino = (uint32_t)sip->i_number; 1802 tdvp = ITOV(tdp); 1803 dnlc_update(tdvp, namep, ITOV(sip)); 1804 /* 1805 * Note the offset supplied for any named entry is 1806 * the offset of the previous one, unless it's the 1st. 1807 * slotp->size is used to pass the length to 1808 * the previous entry. 
1809 */ 1810 if (slotp->size) { 1811 offset = slotp->offset - slotp->size; 1812 } else { 1813 offset = slotp->offset + 1; 1814 } 1815 1816 if (slotp->cached) { 1817 /* 1818 * Add back any usable unused space to the dnlc directory 1819 * cache. 1820 */ 1821 extra = ep->d_reclen - DIRSIZ(ep); 1822 if (extra >= LDIRSIZ(1)) { 1823 (void) dnlc_dir_add_space(dcap, extra, 1824 (uint64_t)slotp->offset); 1825 } 1826 1827 (void) dnlc_dir_add_entry(dcap, namep, 1828 INO_OFF_TO_H(ep->d_ino, offset)); 1829 1830 /* adjust the previous offset of the next entry */ 1831 nep = (struct direct *)((char *)ep + ep->d_reclen); 1832 if ((uintptr_t)nep & (DIRBLKSIZ - 1)) { 1833 /* 1834 * Not a new block. 1835 * 1836 * Check the validity of the next entry. 1837 * If it's bad, then throw away the cache, and 1838 * continue as before directory caching. 1839 */ 1840 if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) || 1841 dnlc_dir_update(dcap, nep->d_name, 1842 INO_OFF_TO_H(nep->d_ino, slotp->offset)) 1843 == DNOENT) { 1844 dnlc_dir_purge(dcap); 1845 slotp->cached = 0; 1846 } 1847 } 1848 } 1849 1850 /* 1851 * Write out the directory block. 1852 */ 1853 err = TRANS_DIR(tdp, slotp->offset); 1854 if (err) 1855 fbrelse(slotp->fbp, S_OTHER); 1856 else 1857 err = ufs_fbwrite(slotp->fbp, tdp); 1858 1859 slotp->fbp = NULL; 1860 /* 1861 * If this is a rename of a directory, then we have already 1862 * fixed the ".." entry to refer to the new parent. If err 1863 * is true at this point, we have failed to update the new 1864 * parent to refer to the renamed directory. 1865 * XXX - we need to unwind the ".." fix. 1866 */ 1867 if (err) 1868 return (err); 1869 1870 /* 1871 * Mark the directory inode to reflect the changes. 1872 * Truncate the directory to chop off blocks of empty entries. 
1873 */ 1874 1875 TRANS_INODE(tdp->i_ufsvfs, tdp); 1876 tdp->i_flag |= IUPD|ICHG; 1877 tdp->i_seq++; 1878 tdp->i_diroff = 0; 1879 ITIMES_NOLOCK(tdp); 1880 /* 1881 * If the directory grew then dirprepareentry() will have 1882 * set IATTCHG in tdp->i_flag, then the directory inode must 1883 * be flushed out. This is because if fsync() is used later 1884 * the directory size must be correct, otherwise a crash would 1885 * cause fsck to move the file to lost+found. Also because later 1886 * a file may be linked in more than one directory, then there 1887 * is no way to flush the original directory. So it must be 1888 * flushed out on creation. See bug 4293809. 1889 */ 1890 if (tdp->i_flag & IATTCHG) { 1891 ufs_iupdat(tdp, I_SYNC); 1892 } 1893 1894 if (slotp->endoff && (slotp->endoff < tdp->i_size)) { 1895 if (!TRANS_ISTRANS(tdp->i_ufsvfs)) { 1896 (void) ufs_itrunc(tdp, (u_offset_t)slotp->endoff, 0, 1897 cr); 1898 } 1899 } 1900 1901 1902 return (0); 1903 1904 bad: 1905 if (slotp->cached) { 1906 dnlc_dir_purge(dcap); 1907 fbrelse(slotp->fbp, S_OTHER); 1908 slotp->cached = 0; 1909 slotp->fbp = NULL; 1910 return (err); 1911 } 1912 1913 /* 1914 * Clear out entry prepared by dirprepareent. 1915 */ 1916 slotp->ep->d_ino = 0; 1917 slotp->ep->d_namlen = 0; 1918 1919 /* 1920 * Don't touch err so we don't clobber the real error that got us here. 1921 */ 1922 if (TRANS_DIR(tdp, slotp->offset)) 1923 fbrelse(slotp->fbp, S_OTHER); 1924 else 1925 (void) ufs_fbwrite(slotp->fbp, tdp); 1926 slotp->fbp = NULL; 1927 return (err); 1928 } 1929 1930 /* 1931 * Prepare a directory slot to receive an entry. 
1932 */ 1933 static int 1934 dirprepareentry( 1935 struct inode *dp, /* directory we are working in */ 1936 struct ufs_slot *slotp, /* available slot info */ 1937 struct cred *cr) 1938 { 1939 struct direct *ep, *nep; 1940 off_t entryend; 1941 int err; 1942 slotstat_t status = slotp->status; 1943 ushort_t dsize; 1944 1945 ASSERT((status == NONE) || (status == FOUND)); 1946 ASSERT(RW_WRITE_HELD(&dp->i_rwlock)); 1947 ASSERT(RW_WRITE_HELD(&dp->i_contents)); 1948 /* 1949 * If we didn't find a slot, then indicate that the 1950 * new slot belongs at the end of the directory. 1951 * If we found a slot, then the new entry can be 1952 * put at slotp->offset. 1953 */ 1954 entryend = slotp->offset + slotp->size; 1955 if (status == NONE) { 1956 ASSERT((slotp->offset & (DIRBLKSIZ - 1)) == 0); 1957 if (DIRBLKSIZ > dp->i_fs->fs_fsize) { 1958 err = ufs_fault(ITOV(dp), 1959 "dirprepareentry: bad fs_fsize, DIRBLKSIZ: %d" 1960 " > dp->i_fs->fs_fsize: %d (%s)", 1961 DIRBLKSIZ, dp->i_fs->fs_fsize, dp->i_fs->fs_fsmnt); 1962 return (err); 1963 } 1964 /* 1965 * Allocate the new block. 1966 */ 1967 err = BMAPALLOC(dp, (u_offset_t)slotp->offset, 1968 (int)(blkoff(dp->i_fs, slotp->offset) + DIRBLKSIZ), cr); 1969 if (err) { 1970 return (err); 1971 } 1972 dp->i_size = entryend; 1973 TRANS_INODE(dp->i_ufsvfs, dp); 1974 dp->i_flag |= IUPD|ICHG|IATTCHG; 1975 dp->i_seq++; 1976 ITIMES_NOLOCK(dp); 1977 } else if (entryend > dp->i_size) { 1978 /* 1979 * Adjust directory size, if needed. This should never 1980 * push the size past a new multiple of DIRBLKSIZ. 1981 * This is an artifact of the old (4.2BSD) way of initializing 1982 * directory sizes to be less than DIRBLKSIZ. 1983 */ 1984 dp->i_size = P2ROUNDUP_TYPED(entryend, DIRBLKSIZ, off_t); 1985 TRANS_INODE(dp->i_ufsvfs, dp); 1986 dp->i_flag |= IUPD|ICHG|IATTCHG; 1987 dp->i_seq++; 1988 ITIMES_NOLOCK(dp); 1989 } 1990 1991 /* 1992 * Get the block containing the space for the new directory entry. 
1993 */ 1994 if (slotp->fbp == NULL) { 1995 err = blkatoff(dp, slotp->offset, (char **)&slotp->ep, 1996 &slotp->fbp); 1997 if (err) { 1998 return (err); 1999 } 2000 } 2001 ep = slotp->ep; 2002 2003 switch (status) { 2004 case NONE: 2005 /* 2006 * No space in the directory. slotp->offset will be on a 2007 * directory block boundary and we will write the new entry 2008 * into a fresh block. 2009 */ 2010 ep->d_reclen = DIRBLKSIZ; 2011 slotp->size = 0; /* length of previous entry */ 2012 break; 2013 case FOUND: 2014 /* 2015 * An entry of the required size has been found. Use it. 2016 */ 2017 if (ep->d_ino == 0) { 2018 /* this is the 1st record in a block */ 2019 slotp->size = 0; /* length of previous entry */ 2020 } else { 2021 dsize = DIRSIZ(ep); 2022 nep = (struct direct *)((char *)ep + dsize); 2023 nep->d_reclen = ep->d_reclen - dsize; 2024 ep->d_reclen = dsize; 2025 slotp->ep = nep; 2026 slotp->offset += dsize; 2027 slotp->size = dsize; /* length of previous entry */ 2028 } 2029 break; 2030 default: 2031 break; 2032 } 2033 return (0); 2034 } 2035 2036 /* 2037 * Allocate and initialize a new inode that will go into directory tdp. 2038 * This routine is called from ufs_symlink(), as well as within this file. 2039 */ 2040 int 2041 ufs_dirmakeinode( 2042 struct inode *tdp, 2043 struct inode **ipp, 2044 struct vattr *vap, 2045 enum de_op op, 2046 struct cred *cr) 2047 { 2048 struct inode *ip; 2049 enum vtype type; 2050 int imode; /* mode and format as in inode */ 2051 ino_t ipref; 2052 int err; 2053 timestruc_t now; 2054 2055 ASSERT(vap != NULL); 2056 ASSERT(op == DE_CREATE || op == DE_MKDIR || op == DE_ATTRDIR || 2057 op == DE_SYMLINK); 2058 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 2059 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 2060 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 2061 /* 2062 * Allocate a new inode. 
2063 */ 2064 type = vap->va_type; 2065 if (type == VDIR) { 2066 ipref = dirpref(tdp); 2067 } else { 2068 ipref = tdp->i_number; 2069 } 2070 if (op == DE_ATTRDIR) 2071 imode = vap->va_mode; 2072 else 2073 imode = MAKEIMODE(type, vap->va_mode); 2074 *ipp = NULL; 2075 err = ufs_ialloc(tdp, ipref, imode, &ip, cr); 2076 if (err) 2077 return (err); 2078 2079 /* 2080 * We don't need to grab vfs_dqrwlock here because it is held 2081 * in ufs_direnter_*() above us. 2082 */ 2083 ASSERT(RW_READ_HELD(&ip->i_ufsvfs->vfs_dqrwlock)); 2084 rw_enter(&ip->i_contents, RW_WRITER); 2085 if (ip->i_dquot != NULL) { 2086 err = ufs_fault(ITOV(ip), 2087 "ufs_dirmakeinode, ip->i_dquot != NULL: dquot (%s)", 2088 tdp->i_fs->fs_fsmnt); 2089 rw_exit(&ip->i_contents); 2090 return (err); 2091 } 2092 *ipp = ip; 2093 ip->i_mode = (o_mode_t)imode; 2094 if (type == VBLK || type == VCHR) { 2095 dev_t d = vap->va_rdev; 2096 dev32_t dev32; 2097 2098 /* 2099 * Don't allow a special file to be created with a 2100 * dev_t that cannot be represented by this filesystem 2101 * format on disk. 
2102 */ 2103 if (!cmpldev(&dev32, d)) { 2104 err = EOVERFLOW; 2105 goto fail; 2106 } 2107 2108 ITOV(ip)->v_rdev = ip->i_rdev = d; 2109 2110 if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) { 2111 ip->i_ordev = dev32; /* can't use old format */ 2112 } else { 2113 ip->i_ordev = cmpdev(d); 2114 } 2115 } 2116 ITOV(ip)->v_type = type; 2117 ufs_reset_vnode(ip->i_vnode); 2118 if (type == VDIR) { 2119 ip->i_nlink = 2; /* anticipating a call to dirmakedirect */ 2120 } else { 2121 ip->i_nlink = 1; 2122 } 2123 2124 if (op == DE_ATTRDIR) { 2125 ip->i_uid = vap->va_uid; 2126 ip->i_gid = vap->va_gid; 2127 } else 2128 ip->i_uid = crgetuid(cr); 2129 /* 2130 * To determine the group-id of the created file: 2131 * 1) If the gid is set in the attribute list (non-Sun & pre-4.0 2132 * clients are not likely to set the gid), then use it if 2133 * the process is privileged, belongs to the target group, 2134 * or the group is the same as the parent directory. 2135 * 2) If the filesystem was not mounted with the Old-BSD-compatible 2136 * GRPID option, and the directory's set-gid bit is clear, 2137 * then use the process's gid. 2138 * 3) Otherwise, set the group-id to the gid of the parent directory. 2139 */ 2140 if (op != DE_ATTRDIR && (vap->va_mask & AT_GID) && 2141 ((vap->va_gid == tdp->i_gid) || groupmember(vap->va_gid, cr) || 2142 secpolicy_vnode_create_gid(cr) == 0)) { 2143 /* 2144 * XXX - is this only the case when a 4.0 NFS client, or a 2145 * client derived from that code, makes a call over the wire? 2146 */ 2147 ip->i_gid = vap->va_gid; 2148 } else 2149 ip->i_gid = (tdp->i_mode & ISGID) ? tdp->i_gid : crgetgid(cr); 2150 2151 /* 2152 * For SunOS 5.0->5.4, the lines below read: 2153 * 2154 * ip->i_suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid; 2155 * ip->i_sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid; 2156 * 2157 * where MAXUID was set to 60002. See notes on this in ufs_inode.c 2158 */ 2159 ip->i_suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ? 
2160 UID_LONG : ip->i_uid; 2161 ip->i_sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ? 2162 GID_LONG : ip->i_gid; 2163 2164 /* 2165 * If we're creating a directory, and the parent directory has the 2166 * set-GID bit set, set it on the new directory. 2167 * Otherwise, if the user is neither privileged nor a member of the 2168 * file's new group, clear the file's set-GID bit. 2169 */ 2170 if ((tdp->i_mode & ISGID) && (type == VDIR)) 2171 ip->i_mode |= ISGID; 2172 else { 2173 if ((ip->i_mode & ISGID) && 2174 secpolicy_vnode_setids_setgids(cr, ip->i_gid) != 0) 2175 ip->i_mode &= ~ISGID; 2176 } 2177 2178 if (((vap->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || 2179 ((vap->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { 2180 err = EOVERFLOW; 2181 goto fail; 2182 } 2183 2184 /* 2185 * Extended attribute directories are not subject to quotas. 2186 */ 2187 if (op != DE_ATTRDIR) 2188 ip->i_dquot = getinoquota(ip); 2189 else 2190 ip->i_dquot = NULL; 2191 2192 if (op == DE_MKDIR || op == DE_ATTRDIR) { 2193 err = ufs_dirmakedirect(ip, tdp, (op == DE_MKDIR) ? 
0 : 1, cr); 2194 if (err) 2195 goto fail; 2196 } 2197 2198 /* 2199 * generate the shadow inode and attach it to the new object 2200 */ 2201 ASSERT((tdp->i_shadow && tdp->i_ufs_acl) || 2202 (!tdp->i_shadow && !tdp->i_ufs_acl)); 2203 if (tdp->i_shadow && tdp->i_ufs_acl && 2204 (((tdp->i_mode & IFMT) == IFDIR) || 2205 ((tdp->i_mode & IFMT) == IFATTRDIR))) { 2206 err = ufs_si_inherit(ip, tdp, ip->i_mode, cr); 2207 if (err) { 2208 if (op == DE_MKDIR) { 2209 /* 2210 * clean up parent directory 2211 * 2212 * tdp->i_contents already locked from 2213 * ufs_direnter_*() 2214 */ 2215 tdp->i_nlink--; 2216 TRANS_INODE(tdp->i_ufsvfs, tdp); 2217 tdp->i_flag |= ICHG; 2218 tdp->i_seq++; 2219 ufs_iupdat(tdp, I_SYNC); 2220 } 2221 goto fail; 2222 } 2223 } 2224 2225 /* 2226 * If the passed in attributes contain atime and/or mtime 2227 * settings, then use them instead of using the current 2228 * high resolution time. 2229 */ 2230 if (vap->va_mask & (AT_MTIME|AT_ATIME)) { 2231 if (vap->va_mask & AT_ATIME) { 2232 ip->i_atime.tv_sec = vap->va_atime.tv_sec; 2233 ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000; 2234 ip->i_flag &= ~IACC; 2235 } else 2236 ip->i_flag |= IACC; 2237 if (vap->va_mask & AT_MTIME) { 2238 ip->i_mtime.tv_sec = vap->va_mtime.tv_sec; 2239 ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; 2240 gethrestime(&now); 2241 if (now.tv_sec > TIME32_MAX) { 2242 /* 2243 * In 2038, ctime sticks forever.. 2244 */ 2245 ip->i_ctime.tv_sec = TIME32_MAX; 2246 ip->i_ctime.tv_usec = 0; 2247 } else { 2248 ip->i_ctime.tv_sec = now.tv_sec; 2249 ip->i_ctime.tv_usec = now.tv_nsec / 1000; 2250 } 2251 ip->i_flag &= ~(IUPD|ICHG); 2252 ip->i_flag |= IMODTIME; 2253 } else 2254 ip->i_flag |= IUPD|ICHG; 2255 ip->i_flag |= IMOD; 2256 } else 2257 ip->i_flag |= IACC|IUPD|ICHG; 2258 ip->i_seq++; 2259 2260 /* 2261 * If this is an attribute tag it as one. 
	 */
	if ((tdp->i_mode & IFMT) == IFATTRDIR) {
		ip->i_cflags |= IXATTR;
	}

	/*
	 * push inode before it's name appears in a directory
	 */
	TRANS_INODE(ip->i_ufsvfs, ip);
	ufs_iupdat(ip, I_SYNC);
	rw_exit(&ip->i_contents);
	return (0);

fail:
	/* Throw away inode we just allocated. */
	ip->i_nlink = 0;
	ufs_setreclaim(ip);
	TRANS_INODE(ip->i_ufsvfs, ip);
	ip->i_flag |= ICHG;
	ip->i_seq++;
	ITIMES_NOLOCK(ip);
	rw_exit(&ip->i_contents);
	return (err);
}

/*
 * Write a prototype directory into the empty inode ip, whose parent is dp.
 *
 * One DIRBLKSIZ block is allocated for ip and filled from mastertemplate
 * with "." pointing at ip and ".." pointing at dp.  Unless attrdir is
 * non-zero, the parent's link count is bumped to account for the new ".."
 * entry; that bump is undone again if the block cannot be read or written.
 *
 * Locking (see the ASSERTs): the caller holds ip->i_contents,
 * dp->i_rwlock and dp->i_contents as writer.
 *
 * Returns 0 on success, EMLINK if the parent already has MAXLINK links,
 * or the error from BMAPALLOC, ufs_fault, fbread, TRANS_DIR or
 * ufs_fbwrite.
 */
static int
ufs_dirmakedirect(
	struct inode *ip,		/* new directory */
	struct inode *dp,		/* parent directory */
	int attrdir,			/* non-zero: don't count ".." in dp */
	struct cred *cr)
{
	struct dirtemplate *dirp;
	struct fbuf *fbp;
	int err;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&dp->i_contents));
	/*
	 * Allocate space for the directory we're creating.
	 */
	err = BMAPALLOC(ip, (u_offset_t)0, DIRBLKSIZ, cr);
	if (err)
		return (err);
	/*
	 * A directory block must fit in a fragment; anything else means
	 * the superblock is inconsistent, which is reported via ufs_fault.
	 */
	if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
		err = ufs_fault(ITOV(dp),
"ufs_dirmakedirect: bad fs_fsize, DIRBLKSIZ: %d > dp->i_fs->fs_fsize: %d (%s)",
		    DIRBLKSIZ, dp->i_fs->fs_fsize,
		    dp->i_fs->fs_fsmnt);
		return (err);
	}
	ip->i_size = DIRBLKSIZ;
	TRANS_INODE(ip->i_ufsvfs, ip);
	ip->i_flag |= IUPD|ICHG|IATTCHG;
	ip->i_seq++;
	ITIMES_NOLOCK(ip);
	/*
	 * Update the parent (dp) link count and write out the change.
	 * This reflects the ".." entry we'll soon write.
	 *
	 * The EMLINK return happens before dp is modified, so it needs
	 * no rollback of the parent.
	 */
	if (dp->i_nlink == MAXLINK)
		return (EMLINK);
	if (attrdir == 0)
		dp->i_nlink++;
	TRANS_INODE(dp->i_ufsvfs, dp);
	dp->i_flag |= ICHG;
	dp->i_seq++;
	ufs_iupdat(dp, I_SYNC);
	/*
	 * Initialize directory with "."
	 * and ".." from static template.
	 *
	 * Since the parent directory is locked, we don't have to
	 * worry about anything changing when we drop the write
	 * lock on (ip).
	 *
	 */
	err = fbread(ITOV(ip), (offset_t)0, (uint_t)ip->i_fs->fs_fsize,
	    S_READ, &fbp);

	if (err) {
		goto fail;
	}
	dirp = (struct dirtemplate *)fbp->fb_addr;
	/*
	 * Now initialize the directory we're creating
	 * with the "." and ".." entries.
	 */
	*dirp = mastertemplate;			/* structure assignment */
	dirp->dot_ino = (uint32_t)ip->i_number;
	dirp->dotdot_ino = (uint32_t)dp->i_number;

	err = TRANS_DIR(ip, 0);
	if (err) {
		fbrelse(fbp, S_OTHER);
		goto fail;
	}

	/*
	 * NOTE(review): no fbrelse follows ufs_fbwrite on either outcome,
	 * here or elsewhere in this file, so it presumably disposes of the
	 * buffer itself -- confirm against ufs_fbwrite().
	 */
	err = ufs_fbwrite(fbp, ip);
	if (err) {
		goto fail;
	}

	return (0);

fail:
	/* Undo the link-count bump made for the ".." entry. */
	if (attrdir == 0)
		dp->i_nlink--;
	TRANS_INODE(dp->i_ufsvfs, dp);
	dp->i_flag |= ICHG;
	dp->i_seq++;
	ufs_iupdat(dp, I_SYNC);
	return (err);
}

/*
 * Delete a directory entry.  If oip is nonzero the entry is checked
 * to make sure it still reflects oip.
 *
 * If vpp is non-null, return the ptr of the (held) vnode associated with
 * the removed name.  The caller is responsible for doing the VN_RELE().
 *
 * Arguments:
 *	dp	directory to remove the entry from; its i_rwlock must be
 *		held as writer on entry (see ASSERT) and is held again on
 *		return, although it is dropped and re-taken on the retry
 *		path below
 *	namep	name of the entry; "." and ".." are rejected
 *	oip	if non-NULL, the inode the entry is expected to name
 *	cdir	vnode compared against the victim for DR_RMDIR
 *	op	DR_REMOVE, DR_RMDIR or DR_RENAME; selects the extra checks
 *	cr	credentials for the access checks
 *
 * Errors produced directly here: EINVAL ("." or, for DR_RMDIR, the
 * parent itself or cdir), EEXIST (".." or a non-empty directory),
 * ENOTDIR, ENOENT (no such entry, or it no longer names oip), EBUSY
 * (vn_vfsrlock failed or something is mounted on the directory) and
 * EPERM (unprivileged unlink of a directory).  Errors from the helpers
 * that are called are passed through.
 */
int
ufs_dirremove(
	struct inode *dp,
	char *namep,
	struct inode *oip,
	struct vnode *cdir,
	enum dr_op op,
	struct cred *cr,
	vnode_t **vpp)	/* Return (held) vnode ptr of removed file/dir */
{
	struct direct *ep, *pep, *nep;
	struct inode *ip;
	vnode_t *dvp, *vp;
	struct ufs_slot slot;
	int namlen;
	int err;
	int mode;
	ushort_t extra;

	namlen = (int)strlen(namep);
	if (namlen == 0)
		return (ufs_fault(ITOV(dp), "ufs_dirremove: namlen == 0"));
	/*
	 * return error when removing . and ..
	 */
	if (namep[0] == '.') {
		if (namlen == 1)
			return (EINVAL);
		else if (namlen == 2 && namep[1] == '.') {
			return (EEXIST);	/* SIGH should be ENOTEMPTY */
		}
	}

	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
	/*
	 * Check accessibility of directory.
	 */
retry:
	if (((dp->i_mode & IFMT) != IFDIR) &&
	    ((dp->i_mode & IFMT) != IFATTRDIR)) {
		return (ENOTDIR);
	}

	/*
	 * Execute access is required to search the directory.
	 * Access for write is interpreted as allowing
	 * deletion of files in the directory.
	 */
	if (err = ufs_iaccess(dp, IEXEC|IWRITE, cr)) {
		return (err);
	}

	/*
	 * ip and slot.fbp are reset on every pass so that the common exit
	 * code can tell what still has to be released.
	 */
	ip = NULL;
	slot.fbp = NULL;
	slot.status = FOUND;	/* don't need to look for empty slot */
	rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
	rw_enter(&dp->i_contents, RW_WRITER);
	err = ufs_dircheckforname(dp, namep, namlen, &slot, &ip, cr, 0);
	if (err)
		goto out_novfs;
	if (ip == NULL) {
		err = ENOENT;
		goto out_novfs;
	}
	vp = ITOV(ip);
	if (oip && oip != ip) {
		err = ENOENT;
		goto out_novfs;
	}

	/*
	 * mode is sampled once; every later "is this a directory" test
	 * that decides whether ip->i_rwlock / the vfs lock must be dropped
	 * uses this saved value.
	 */
	mode = ip->i_mode & IFMT;
	if (mode == IFDIR || mode == IFATTRDIR) {

		/*
		 * vn_vfsrlock() prevents races between mount and rmdir.
		 */
		if (vn_vfsrlock(vp)) {
			err = EBUSY;
			goto out_novfs;
		}
		if (vn_mountedvfs(vp) != NULL && op != DR_RENAME) {
			err = EBUSY;
			goto out;
		}
		/*
		 * If we are removing a directory, get a lock on it.
		 * Taking a writer lock prevents a parallel ufs_dirlook from
		 * incorrectly entering a negative cache vnode entry in the dnlc
		 * If the directory is empty, it will stay empty until
		 * we can remove it.
		 */
		if (!rw_tryenter(&ip->i_rwlock, RW_WRITER)) {
			/*
			 * It is possible that a thread in rename would have
			 * acquired this rwlock. To prevent a deadlock we
			 * do a rw_tryenter. If we fail to get the lock
			 * we drop all the locks we have acquired, wait
			 * for 2 ticks and reacquire the
			 * directory's (dp) i_rwlock and try again.
			 * If we don't drop dp's i_rwlock then we will panic
			 * with a "Deadlock: cycle in blocking chain"
			 * since in ufs_dircheckpath we want dp's i_rwlock.
			 * dp is guaranteed to exist since ufs_dirremove is
			 * called after a VN_HOLD(dp) has been done.
			 */
			ufs_dirremove_retry_cnt++;
			vn_vfsunlock(vp);
			if (slot.fbp)
				fbrelse(slot.fbp, S_OTHER);
			rw_exit(&dp->i_contents);
			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
			rw_exit(&dp->i_rwlock);
			VN_RELE(vp);
			delay(2);
			rw_enter(&dp->i_rwlock, RW_WRITER);
			goto retry;
		}
	}
	rw_enter(&ip->i_contents, RW_READER);

	/*
	 * Now check the restrictions that apply on sticky directories.
	 */
	if ((err = ufs_sticky_remove_access(dp, ip, cr)) != 0) {
		rw_exit(&ip->i_contents);
		if (mode == IFDIR || mode == IFATTRDIR)
			rw_exit(&ip->i_rwlock);
		goto out;
	}

	if (op == DR_RMDIR) {
		/*
		 * For rmdir(2), some special checks are required.
		 * (a) Don't remove any alias of the parent (e.g. ".").
		 * (b) Don't remove the current directory.
		 * (c) Make sure the entry is (still) a directory.
		 * (d) Make sure the directory is empty.
		 */

		if (dp == ip || vp == cdir)
			err = EINVAL;
		else if (((ip->i_mode & IFMT) != IFDIR) &&
		    ((ip->i_mode & IFMT) != IFATTRDIR))
			err = ENOTDIR;
		else if ((ip->i_nlink > 2) ||
		    !ufs_dirempty(ip, dp->i_number, cr)) {
			err = EEXIST;	/* SIGH should be ENOTEMPTY */
		}

		if (err) {
			rw_exit(&ip->i_contents);
			if (mode == IFDIR || mode == IFATTRDIR)
				rw_exit(&ip->i_rwlock);
			goto out;
		}
	} else if (op == DR_REMOVE) {
		/*
		 * unlink(2) requires a different check: allow only
		 * privileged users to unlink a directory.
		 *
		 * ip->i_rwlock is released unconditionally below; this
		 * branch is only entered for v_type == VDIR.
		 * NOTE(review): that relies on VDIR implying the
		 * IFDIR/IFATTRDIR case above that took the lock -- confirm.
		 */
		if (vp->v_type == VDIR &&
		    secpolicy_fs_linkdir(cr, vp->v_vfsp)) {
			err = EPERM;
			rw_exit(&ip->i_contents);
			rw_exit(&ip->i_rwlock);
			goto out;
		}
	}

	rw_exit(&ip->i_contents);

	/*
	 * Remove the cache'd entry, if any.
	 */
	dvp = ITOV(dp);
	dnlc_remove(dvp, namep);
	/* Mark the on-disk entry free in the buffered directory block. */
	ep = slot.ep;
	ep->d_ino = 0;

	if (slot.cached) {
		dcanchor_t *dcap = &dp->i_danchor;

		/*
		 * Keep the dnlc directory cache in step with the block:
		 * drop the name, and drop the free-space record for this
		 * entry if it had any slack worth recording.
		 */
		(void) dnlc_dir_rem_entry(dcap, namep, NULL);
		if (((int)ep->d_reclen - (int)DIRSIZ(ep)) >= LDIRSIZ(1)) {
			(void) dnlc_dir_rem_space_by_handle(dcap, slot.offset);
		}
		if (slot.offset & (DIRBLKSIZ - 1)) {
			/*
			 * Collapse new free space into previous entry.
			 * Note, the previous entry has already been
			 * validated in ufs_dircheckforname().
			 */
			ASSERT(slot.size);
			pep = (struct direct *)((char *)ep - slot.size);
			/*
			 * A free previous entry that is not at the start
			 * of a block is not expected; give up on the cache
			 * and write the block as it stands.
			 */
			if ((pep->d_ino == 0) &&
			    ((uintptr_t)pep & (DIRBLKSIZ - 1))) {
				dnlc_dir_purge(dcap);
				slot.cached = 0;
				goto nocache;
			}
			/* extra = free bytes the previous entry already had */
			if (pep->d_ino) {
				extra = pep->d_reclen - DIRSIZ(pep);
			} else {
				extra = pep->d_reclen;
			}
			if (extra >= LDIRSIZ(1)) {
				(void) dnlc_dir_rem_space_by_handle(dcap,
				    (uint64_t)(slot.offset - slot.size));
			}
			pep->d_reclen += ep->d_reclen;
			(void) dnlc_dir_add_space(dcap, extra + ep->d_reclen,
			    (uint64_t)(slot.offset - slot.size));
			/* adjust the previous pointer in the next entry */
			nep = (struct direct *)((char *)ep + ep->d_reclen);
			if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
				/*
				 * Not a new block.
				 *
				 * Check the validity of the entry.
				 * If it's bad, then throw away the cache and
				 * continue.
				 */
				if ((nep->d_reclen == 0) ||
				    (nep->d_reclen & 0x3) ||
				    (dnlc_dir_update(dcap, nep->d_name,
				    INO_OFF_TO_H(nep->d_ino,
				    slot.offset - slot.size)) == DNOENT)) {
					dnlc_dir_purge(dcap);
					slot.cached = 0;
				}
			}
		} else {
			(void) dnlc_dir_add_space(dcap, ep->d_reclen,
			    (uint64_t)slot.offset);
		}
	} else {
		/*
		 * If the entry isn't the first in the directory, we must
		 * reclaim the space of the now empty record by adding
		 * the record size to the size of the previous entry.
		 */
		if (slot.offset & (DIRBLKSIZ - 1)) {
			/*
			 * Collapse new free space into previous entry.
			 */
			pep = (struct direct *)((char *)ep - slot.size);
			pep->d_reclen += ep->d_reclen;
		}
	}
nocache:


	/*
	 * Write the modified directory block.  Either way slot.fbp is no
	 * longer ours afterwards, so clear it to keep the exit code from
	 * releasing it a second time.
	 */
	err = TRANS_DIR(dp, slot.offset);
	if (err)
		fbrelse(slot.fbp, S_OTHER);
	else
		err = ufs_fbwrite(slot.fbp, dp);
	slot.fbp = NULL;

	/*
	 * If we were removing a directory, it is 'gone' now, but we cannot
	 * unlock it as a thread may be waiting for the lock in ufs_create. If
	 * we did, it could then create a file in a deleted directory.
	 */

	if (err) {
		if (mode == IFDIR || mode == IFATTRDIR)
			rw_exit(&ip->i_rwlock);
		goto out;
	}

	rw_enter(&ip->i_contents, RW_WRITER);

	dp->i_flag |= IUPD|ICHG;
	dp->i_seq++;
	ip->i_flag |= ICHG;
	ip->i_seq++;

	TRANS_INODE(dp->i_ufsvfs, dp);
	TRANS_INODE(ip->i_ufsvfs, ip);
	/*
	 * Now dispose of the inode.
	 */
	if (ip->i_nlink > 0) {
		/*
		 * This is not done for IFATTRDIR's because they don't
		 * have entries in the dnlc and the link counts are
		 * not incremented when they are created.
		 */
		if (op == DR_RMDIR && (ip->i_mode & IFMT) == IFDIR) {
			/*
			 * Decrement by 2 because we're trashing the "."
			 * entry as well as removing the entry in dp.
			 * Clear the directory entry, but there may be
			 * other hard links so don't free the inode.
			 * Decrement the dp linkcount because we're
			 * trashing the ".." entry.
			 */
			ip->i_nlink -= 2;
			dp->i_nlink--;
			ufs_setreclaim(dp);
			/*
			 * XXX need to discard negative cache entries
			 * for vp.  See comment in ufs_delete().
			 */
			dnlc_remove(vp, ".");
			dnlc_remove(vp, "..");
			/*
			 * The return value is ignored here because if
			 * the directory purge fails we don't want to
			 * stop the delete. If ufs_dirpurgedotdot fails
			 * the delete will continue with the preexisting
			 * behavior.
			 */
			(void) ufs_dirpurgedotdot(ip, dp->i_number, cr);
		} else {
			ip->i_nlink--;
		}
		ufs_setreclaim(ip);
	}
	ITIMES_NOLOCK(dp);
	ITIMES_NOLOCK(ip);

	/* The synchronous inode push is only done when TRANS_ISTRANS is false. */
	if (!TRANS_ISTRANS(dp->i_ufsvfs))
		ufs_iupdat(dp, I_SYNC);
	if (!TRANS_ISTRANS(ip->i_ufsvfs))
		ufs_iupdat(ip, I_SYNC);

	rw_exit(&ip->i_contents);
	if (mode == IFDIR || mode == IFATTRDIR)
		rw_exit(&ip->i_rwlock);
out:
	/* Reached only after mode has been set and vn_vfsrlock succeeded. */
	if (mode == IFDIR || mode == IFATTRDIR) {
		vn_vfsunlock(vp);
	}
out_novfs:
	ASSERT(RW_WRITE_HELD(&dp->i_contents));

	if (slot.fbp)
		fbrelse(slot.fbp, S_OTHER);

	rw_exit(&dp->i_contents);
	rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);

	/*
	 * If no error and vpp is non-NULL, return the vnode ptr to the caller.
	 * The caller becomes responsible for the VN_RELE().  Otherwise,
	 * Release (and delete) the inode after we drop vfs_dqrwlock to
	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
	 */
	if (ip) {
		if ((err == 0) && (vpp != NULL)) {
			*vpp = ITOV(ip);
		} else {
			VN_RELE(vp);
		}
	}

	return (err);
}

/*
 * Return buffer with contents of block "offset"
 * from the beginning of directory "ip".
If "res"
 * is non-zero, fill it in with a pointer to the
 * remaining space in the directory.
 *
 */

int
blkatoff(
	struct inode *ip,
	off_t offset,
	char **res,
	struct fbuf **fbpp)
{
	struct fs *fsp;
	struct fbuf *bufp;
	daddr_t blk;		/* logical block holding "offset" */
	uint_t nbytes;		/* size of that block */
	int error;

	CPU_STATS_ADD_K(sys, ufsdirblk, 1);

	fsp = ip->i_fs;
	blk = (daddr_t)lblkno(fsp, offset);
	nbytes = (uint_t)blksize(fsp, ip, blk);

	/* Map the whole block, starting at its block-aligned offset. */
	error = fbread(ITOV(ip), (offset_t)(offset & fsp->fs_bmask),
	    nbytes, S_READ, &bufp);
	if (error != 0) {
		/* On failure the caller gets a NULL buffer and the error. */
		*fbpp = NULL;
		return (error);
	}

	*fbpp = bufp;
	if (res != NULL) {
		/* Point at "offset" itself within the mapped block. */
		*res = bufp->fb_addr + blkoff(fsp, offset);
	}
	return (0);
}

/*
 * Do consistency checking:
 *	record length must be multiple of 4
 *	entry must fit in rest of its DIRBLKSIZ block
 *	record must be large enough to contain entry
 *	name is not longer than MAXNAMLEN
 *	name must be as long as advertised, and null terminated
 * NOTE: record length must not be zero (should be checked previously).
 *	This routine is only called if dirchk is true.
 *	It would be nice to set the FSBAD flag in the super-block when
 *	this routine fails so that a fsck is forced on next reboot,
 *	but locking is a problem.
2798 */ 2799 static int 2800 dirmangled( 2801 struct inode *dp, 2802 struct direct *ep, 2803 int entryoffsetinblock, 2804 off_t offset) 2805 { 2806 int i; 2807 2808 i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); 2809 if ((ep->d_reclen & 0x3) != 0 || (int)ep->d_reclen > i || 2810 (uint_t)ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN || 2811 ep->d_ino && dirbadname(ep->d_name, (int)ep->d_namlen)) { 2812 dirbad(dp, "mangled entry", offset); 2813 return (1); 2814 } 2815 return (0); 2816 } 2817 2818 static void 2819 dirbad(struct inode *ip, char *how, off_t offset) 2820 { 2821 cmn_err(CE_NOTE, "%s: bad dir ino %d at offset %ld: %s", 2822 ip->i_fs->fs_fsmnt, (int)ip->i_number, offset, how); 2823 } 2824 2825 static int 2826 dirbadname(char *sp, int l) 2827 { 2828 while (l--) { /* check for nulls */ 2829 if (*sp++ == '\0') { 2830 return (1); 2831 } 2832 } 2833 return (*sp); /* check for terminating null */ 2834 } 2835 2836 /* 2837 * Check if a directory is empty or not. 2838 */ 2839 static int 2840 ufs_dirempty( 2841 struct inode *ip, 2842 ino_t parentino, 2843 struct cred *cr) 2844 { 2845 return (ufs_dirscan(ip, parentino, cr, 0)); 2846 } 2847 2848 /* 2849 * clear the .. directory entry. 2850 */ 2851 static int 2852 ufs_dirpurgedotdot( 2853 struct inode *ip, 2854 ino_t parentino, 2855 struct cred *cr) 2856 { 2857 return (ufs_dirscan(ip, parentino, cr, 1)); 2858 } 2859 2860 /* 2861 * Scan the directoy. If clr_dotdot is true clear the .. 2862 * directory else check to see if the directory is empty. 2863 * 2864 * Using a struct dirtemplate here is not precisely 2865 * what we want, but better than using a struct direct. 2866 * 2867 * clr_dotdot is used as a flag to tell us if we need 2868 * to clear the dotdot entry 2869 * 2870 * N.B.: does not handle corrupted directories. 
 */
/*
 * Return value of ufs_dirscan():
 *	clr_dotdot == 0: non-zero if every in-use entry is "." or a ".."
 *	that refers to parentino, zero otherwise (including on a read
 *	error, a short read or a zero-length record).
 *	clr_dotdot != 0: once a ".." referring to parentino is found, the
 *	result of ufs_dirclrdotdot() is returned instead; if no such entry
 *	is reached the value is the same as in the clr_dotdot == 0 case.
 *
 * The caller must hold ip->i_contents (see ASSERT).
 */
static int
ufs_dirscan(
	struct inode *ip,
	ino_t parentino,
	struct cred *cr,
	int clr_dotdot)
{
	offset_t off;
	struct dirtemplate dbuf;
	struct direct *dp = (struct direct *)&dbuf;
	int err, count;
	int empty = 1;	/* Assume it's empty */
	/* Only the leading half of a dirtemplate is read per entry. */
#define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)

	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
	/* Step from entry to entry using each entry's own record length. */
	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
		/*
		 * Since we read MINDIRSIZ, residual must
		 * be 0 unless we're at end of file.
		 */
		if (err || count != 0 || dp->d_reclen == 0) {
			empty = 0;
			break;
		}
		/* skip empty entries */
		if (dp->d_ino == 0)
			continue;
		/* accept only "." and ".." */
		if (dp->d_namlen > 2 || dp->d_name[0] != '.') {
			empty = 0;
			break;
		}
		/*
		 * At this point d_namlen must be 1 or 2.
		 * 1 implies ".", 2 implies ".." if second
		 * char is also "."
		 */
		if (dp->d_namlen == 1)
			continue;
		if (dp->d_name[1] == '.' &&
		    (ino_t)dp->d_ino == parentino) {
			/*
			 * If we're doing a purge we need to check for
			 * the . and .. entries and clear the d_ino for ..
			 *
			 * if clr_dotdot is set ufs_dirscan does not
			 * check for an empty directory.
			 */
			if (clr_dotdot) {
				/*
				 * Have to actually zap the ..
				 * entry in the directory, as
				 * otherwise someone might have
				 * dp as its cwd and try to
				 * open .., which now points to
				 * an unallocated inode.
				 */
				empty = ufs_dirclrdotdot(ip, parentino);
				break;
			} else {
				continue;
			}
		}
		/* A two-character name that is not ".." of parentino. */
		empty = 0;
		break;
	}
	return (empty);
}

clock_t retry_backoff_delay = 1;	/* delay before retrying the i_rwlock */
uint64_t dircheck_retry_cnt;	/* times ufs_dircheckpath backed off */
/*
 * Check if source directory inode is in the path of the target directory.
 * Target is supplied locked.
 *
 * The source and target inode's should be different upon entry.
 *
 * The walk starts at target and follows ".." entries upwards.  Each
 * directory visited other than target is held and read-locked via
 * i_rwlock only while it is being examined.
 *
 * Returns:
 *	0	the walk reached the root, or reached sdp, without meeting
 *		source_ino
 *	EINVAL	target is source_ino, or source_ino is an ancestor of it
 *	ENOTDIR	a directory on the way up looked damaged (also logged)
 *	EAGAIN	a parent's i_rwlock could not be taken and waiting for it
 *		might deadlock; the caller is expected to retry
 *	other	error from blkatoff() or ufs_iget_alloced()
 *
 * The caller holds i_rwlock on both target and sdp (see ASSERTs).
 */
int
ufs_dircheckpath(
	ino_t source_ino,
	struct inode *target,
	struct inode *sdp,
	struct cred *cr)
{
	struct fbuf *fbp;
	struct dirtemplate *dirp;
	struct inode *ip;
	struct ufsvfs *ufsvfsp;
	struct inode *tip;
	ino_t dotdotino;
	int err;

	ASSERT(target->i_ufsvfs != NULL);
	ASSERT(RW_LOCK_HELD(&target->i_rwlock));
	ASSERT(RW_LOCK_HELD(&sdp->i_rwlock));

	ip = target;
	if (ip->i_number == source_ino) {
		err = EINVAL;
		goto out;
	}
	if (ip->i_number == UFSROOTINO) {
		err = 0;
		goto out;
	}
	/*
	 * Search back through the directory tree, using the ".." entries.
	 * Fail any attempt to move a directory into an ancestor directory.
	 */
	fbp = NULL;
	for (;;) {
		struct vfs *vfs;

		/* Read the first block of ip; it holds "." and "..". */
		err = blkatoff(ip, (off_t)0, (char **)&dirp, &fbp);
		if (err)
			break;
		if (((ip->i_mode & IFMT) != IFDIR) || ip->i_nlink == 0 ||
		    ip->i_size < sizeof (struct dirtemplate)) {
			dirbad(ip, "bad size, unlinked or not dir", (off_t)0);
			err = ENOTDIR;
			break;
		}
		if (dirp->dotdot_namlen != 2 ||
		    dirp->dotdot_name[0] != '.' ||
		    dirp->dotdot_name[1] != '.') {
			dirbad(ip, "mangled .. entry", (off_t)0);
			err = ENOTDIR;		/* Sanity check */
			break;
		}
		dotdotino = (ino_t)dirp->dotdot_ino;
		if (dotdotino == source_ino) {
			err = EINVAL;
			break;
		}
		/* Reached the root without meeting source_ino; err is 0. */
		if (dotdotino == UFSROOTINO)
			break;
		if (fbp) {
			fbrelse(fbp, S_OTHER);
			fbp = NULL;
		}
		/* Save what is needed from ip before it may be released. */
		vfs = ip->i_vfs;
		ufsvfsp = ip->i_ufsvfs;

		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
		/*
		 * Race to get the inode.
		 */
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
		if (err = ufs_iget_alloced(vfs, dotdotino, &tip, cr)) {
			rw_exit(&ufsvfsp->vfs_dqrwlock);
			/* Nothing is held any more; skip the release at out. */
			ip = NULL;
			break;
		}
		rw_exit(&ufsvfsp->vfs_dqrwlock);
		/*
		 * If the directory of the source inode (also a directory)
		 * is the same as this next entry up the chain, then
		 * we know the source directory itself can't be in the
		 * chain. This also prevents a panic because we already
		 * have sdp->i_rwlock locked.
		 */
		if (tip == sdp) {
			VN_RELE(ITOV(tip));
			ip = NULL;
			break;
		}
		ip = tip;

		/*
		 * If someone has set the WRITE_WANTED bit in this lock and if
		 * this happens to be a sdp or tdp of another parallel rename
		 * which is executing the same code and in similar situation
		 * we end up in a 4 way deadlock. We need to make sure that
		 * the WRITE_WANTED bit is not set.
		 */
retry_lock:
		if (!rw_tryenter(&ip->i_rwlock, RW_READER)) {
			/*
			 * If the lock is held as WRITER that's fine but if it
			 * has WRITE_WANTED bit set we might end up in a
			 * deadlock. If WRITE_WANTED is set we return
			 * with EAGAIN else we just go back and try.
			 */
			if (RW_ISWRITER(&ip->i_rwlock) &&
			    !(RW_WRITE_HELD(&ip->i_rwlock))) {
				err = EAGAIN;
				if (fbp) {
					fbrelse(fbp, S_OTHER);
				}
				/*
				 * ip's i_rwlock was never obtained, so only
				 * the hold is dropped before returning.
				 */
				VN_RELE(ITOV(ip));
				return (err);
			} else {
				/*
				 * The lock is being write held. We could
				 * just do a rw_enter here but there is a
				 * window between the check and now, where
				 * the status could have changed, so to
				 * avoid looping we backoff and go back to
				 * try for the lock.
				 */
				delay(retry_backoff_delay);
				dircheck_retry_cnt++;
				goto retry_lock;
			}
		}
	}
	if (fbp) {
		fbrelse(fbp, S_OTHER);
	}
out:
	/* Release the last directory visited, unless it is target itself. */
	if (ip) {
		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
	}
	return (err);
}

/*
 * Report whether an extended attribute directory is empty.
 *
 * Every in-use entry must be either a "." entry whose inode number is
 * parentino or a ".." entry; anything else, a read error, a short read
 * or a zero-length record makes the result 0.  Returns non-zero when
 * the directory is considered empty.
 *
 * The caller must hold ip->i_contents (see ASSERT).
 */
int
ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr)
{
	offset_t off;
	struct dirtemplate dbuf;
	struct direct *dp = (struct direct *)&dbuf;
	int err, count;
	int empty = 1;	/* Assume it's empty */
	/* Same definition as the one in ufs_dirscan(). */
#define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)

	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
		/*
		 * Since we read MINDIRSIZ, residual must
		 * be 0 unless we're at end of file.
		 */

		if (err || count != 0 || dp->d_reclen == 0) {
			empty = 0;
			break;
		}
		/* skip empty entries */
		if (dp->d_ino == 0)
			continue;
		/*
		 * Unlike ufs_dirscan(), the name length is not limited
		 * beforehand.  A one-character "." entry is accepted only
		 * when its inode number is parentino; a two-character ".."
		 * entry is accepted whatever inode it refers to.
		 */

		if (dp->d_namlen == 1 && dp->d_name[0] == '.' &&
		    (ino_t)dp->d_ino == parentino)
			continue;

		if (dp->d_namlen == 2 && dp->d_name[0] == '.' &&
		    dp->d_name[1] == '.') {
			continue;
		}
		empty = 0;
		break;
	}
	return (empty);
}


/*
 * Allocate and initialize a new shadow inode to contain extended attributes.
3149 */ 3150 int 3151 ufs_xattrmkdir( 3152 struct inode *tdp, 3153 struct inode **ipp, 3154 int flags, 3155 struct cred *cr) 3156 { 3157 struct inode *ip; 3158 struct vattr va; 3159 int err; 3160 int retry = 1; 3161 struct ufsvfs *ufsvfsp; 3162 struct ulockfs *ulp; 3163 int issync; 3164 int trans_size; 3165 int dorwlock; /* 0 = not yet taken, */ 3166 /* 1 = taken outside the transaction, */ 3167 /* 2 = taken inside the transaction */ 3168 3169 /* 3170 * Validate permission to create attribute directory 3171 */ 3172 3173 if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0) { 3174 return (err); 3175 } 3176 3177 if (vn_is_readonly(ITOV(tdp))) 3178 return (EROFS); 3179 3180 /* 3181 * No need to re-init err after again:, since it's set before 3182 * the next use of it. 3183 */ 3184 again: 3185 dorwlock = 0; 3186 va.va_type = VDIR; 3187 va.va_uid = tdp->i_uid; 3188 va.va_gid = tdp->i_gid; 3189 3190 if ((tdp->i_mode & IFMT) == IFDIR) { 3191 va.va_mode = (o_mode_t)IFATTRDIR; 3192 va.va_mode |= tdp->i_mode & 0777; 3193 } else { 3194 va.va_mode = (o_mode_t)IFATTRDIR|0700; 3195 if (tdp->i_mode & 0040) 3196 va.va_mode |= 0750; 3197 if (tdp->i_mode & 0004) 3198 va.va_mode |= 0705; 3199 } 3200 va.va_mask = AT_TYPE|AT_MODE; 3201 3202 ufsvfsp = tdp->i_ufsvfs; 3203 3204 err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK); 3205 if (err) 3206 return (err); 3207 3208 /* 3209 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file. 3210 * This follows the protocol for read()/write(). 3211 */ 3212 if (ITOV(tdp)->v_type != VDIR) { 3213 rw_enter(&tdp->i_rwlock, RW_WRITER); 3214 dorwlock = 1; 3215 } 3216 3217 if (ulp) { 3218 trans_size = (int)TOP_MKDIR_SIZE(tdp); 3219 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, trans_size); 3220 } 3221 3222 /* 3223 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory. 3224 * This follows the protocol established by 3225 * ufs_link/create/remove/rename/mkdir/rmdir/symlink. 
3226 */ 3227 if (dorwlock == 0) { 3228 rw_enter(&tdp->i_rwlock, RW_WRITER); 3229 dorwlock = 2; 3230 } 3231 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 3232 rw_enter(&tdp->i_contents, RW_WRITER); 3233 3234 /* 3235 * Suppress out of inodes messages if we will retry. 3236 */ 3237 if (retry) 3238 tdp->i_flag |= IQUIET; 3239 err = ufs_dirmakeinode(tdp, &ip, &va, DE_ATTRDIR, cr); 3240 tdp->i_flag &= ~IQUIET; 3241 3242 if (err) 3243 goto fail; 3244 3245 if (flags) { 3246 3247 /* 3248 * Now attach it to src file. 3249 */ 3250 3251 tdp->i_oeftflag = ip->i_number; 3252 } 3253 3254 ip->i_cflags |= IXATTR; 3255 ITOV(ip)->v_flag |= V_XATTRDIR; 3256 TRANS_INODE(ufsvfsp, tdp); 3257 tdp->i_flag |= ICHG | IUPD; 3258 tdp->i_seq++; 3259 ufs_iupdat(tdp, I_SYNC); 3260 rw_exit(&tdp->i_contents); 3261 rw_exit(&ufsvfsp->vfs_dqrwlock); 3262 3263 rw_enter(&ip->i_rwlock, RW_WRITER); 3264 rw_enter(&ip->i_contents, RW_WRITER); 3265 TRANS_INODE(ufsvfsp, ip); 3266 ip->i_flag |= ICHG| IUPD; 3267 ip->i_seq++; 3268 ufs_iupdat(ip, I_SYNC); 3269 rw_exit(&ip->i_contents); 3270 rw_exit(&ip->i_rwlock); 3271 if (dorwlock == 2) 3272 rw_exit(&tdp->i_rwlock); 3273 if (ulp) { 3274 int terr = 0; 3275 3276 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size); 3277 ufs_lockfs_end(ulp); 3278 if (err == 0) 3279 err = terr; 3280 } 3281 if (dorwlock == 1) 3282 rw_exit(&tdp->i_rwlock); 3283 *ipp = ip; 3284 return (err); 3285 3286 fail: 3287 rw_exit(&tdp->i_contents); 3288 rw_exit(&ufsvfsp->vfs_dqrwlock); 3289 if (dorwlock == 2) 3290 rw_exit(&tdp->i_rwlock); 3291 if (ulp) { 3292 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size); 3293 ufs_lockfs_end(ulp); 3294 } 3295 if (dorwlock == 1) 3296 rw_exit(&tdp->i_rwlock); 3297 if (ip != NULL) 3298 VN_RELE(ITOV(ip)); 3299 3300 /* 3301 * No inodes? See if any are tied up in pending deletions. 3302 * This has to be done outside of any of the above, because 3303 * the draining operation can't be done from inside a transaction. 
3304 */ 3305 if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 3306 ufs_delete_drain_wait(ufsvfsp, 1); 3307 retry = 0; 3308 goto again; 3309 } 3310 3311 return (err); 3312 } 3313 3314 /* 3315 * clear the dotdot directory entry. 3316 * Used by ufs_dirscan when clr_dotdot 3317 * flag is set and we're deleting a 3318 * directory. 3319 */ 3320 static int 3321 ufs_dirclrdotdot(struct inode *ip, ino_t parentino) 3322 { 3323 struct fbuf *fbp; 3324 struct direct *dotp, *dotdotp; 3325 int err = 0; 3326 3327 ASSERT(RW_WRITE_HELD(&ip->i_rwlock)); 3328 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 3329 err = blkatoff(ip, 0, NULL, &fbp); 3330 if (err) { 3331 return (err); 3332 } 3333 3334 dotp = (struct direct *)fbp->fb_addr; 3335 if ((dotp->d_namlen < (MAXNAMLEN + 1)) && 3336 ((DIRBLKSIZ - DIRSIZ(dotp)) >= (sizeof (struct dirtemplate) / 2))) { 3337 dotdotp = (struct direct *)((char *)dotp + dotp->d_reclen); 3338 if ((dotdotp->d_namlen < (MAXNAMLEN + 1)) && 3339 ((DIRBLKSIZ - DIRSIZ(dotp)) >= dotdotp->d_reclen)) { 3340 3341 dotp->d_reclen += dotdotp->d_reclen; 3342 if (parentino == dotdotp->d_ino) { 3343 dotdotp->d_ino = 0; 3344 dotdotp->d_namlen = 0; 3345 dotdotp->d_reclen = 0; 3346 } 3347 3348 err = TRANS_DIR(ip, 0); 3349 if (err) { 3350 fbrelse(fbp, S_OTHER); 3351 } else { 3352 err = ufs_fbwrite(fbp, ip); 3353 } 3354 } 3355 } else { 3356 err = -1; 3357 } 3358 return (err); 3359 } 3360