1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 /* 40 * Directory manipulation routines. 41 * 42 * When manipulating directories, the i_rwlock provides serialization 43 * since directories cannot be mmapped. The i_contents lock is redundant. 
44 */ 45 46 #include <sys/types.h> 47 #include <sys/t_lock.h> 48 #include <sys/param.h> 49 #include <sys/systm.h> 50 #include <sys/signal.h> 51 #include <sys/cred.h> 52 #include <sys/proc.h> 53 #include <sys/disp.h> 54 #include <sys/user.h> 55 #include <sys/vfs.h> 56 #include <sys/vnode.h> 57 #include <sys/stat.h> 58 #include <sys/mode.h> 59 #include <sys/buf.h> 60 #include <sys/uio.h> 61 #include <sys/dnlc.h> 62 #include <sys/fs/ufs_inode.h> 63 #include <sys/fs/ufs_fs.h> 64 #include <sys/mount.h> 65 #include <sys/fs/ufs_fsdir.h> 66 #include <sys/fs/ufs_trans.h> 67 #include <sys/fs/ufs_panic.h> 68 #include <sys/fs/ufs_quota.h> 69 #include <sys/errno.h> 70 #include <sys/debug.h> 71 #include <vm/seg.h> 72 #include <sys/sysmacros.h> 73 #include <sys/cmn_err.h> 74 #include <sys/cpuvar.h> 75 #include <sys/unistd.h> 76 #include <sys/policy.h> 77 78 /* 79 * This is required since we're using P2ROUNDUP_TYPED on DIRBLKSIZ 80 */ 81 #if !ISP2(DIRBLKSIZ) 82 #error "DIRBLKSIZ not a power of 2" 83 #endif 84 85 /* 86 * A virgin directory. 87 */ 88 static struct dirtemplate mastertemplate = { 89 0, 12, 1, ".", 90 0, DIRBLKSIZ - 12, 2, ".." 91 }; 92 93 #define LDIRSIZ(len) \ 94 ((sizeof (struct direct) - (MAXNAMLEN + 1)) + ((len + 1 + 3) &~ 3)) 95 #define MAX_DIR_NAME_LEN(len) \ 96 (((len) - (sizeof (struct direct) - (MAXNAMLEN + 1))) - 1) 97 98 /* 99 * The dnlc directory cache allows a 64 bit handle for directory entries. 100 * For ufs we squeeze both the 32 bit inumber and a 32 bit disk offset 101 * into the handle. Note, a 32 bit offset allows a 4GB directory, which 102 * is way beyond what could be cached in memory by the directory 103 * caching routines. So we are quite safe with this limit. 104 * The macros below pack and unpack the handle. 
105 */ 106 #define H_TO_INO(h) (uint32_t)((h) & UINT_MAX) 107 #define H_TO_OFF(h) (off_t)((h) >> 32) 108 #define INO_OFF_TO_H(ino, off) (uint64_t)(((uint64_t)(off) << 32) | (ino)) 109 110 /* 111 * The average size of a typical on disk directory entry is about 16 bytes 112 * and so defines AV_DIRECT_SHIFT : log2(16) 113 * This define is only used to approximate the number of entries 114 * is a directory. This is needed for dnlc_dir_start() which will immediately 115 * return an error if the value is not within its acceptable range of 116 * number of files in a directory. 117 */ 118 #define AV_DIRECT_SHIFT 4 119 /* 120 * If the directory size (from i_size) is greater than the ufs_min_dir_cache 121 * tunable then we request dnlc directory caching. 122 * This has found to be profitable after 1024 file names. 123 */ 124 int ufs_min_dir_cache = 1024 << AV_DIRECT_SHIFT; 125 126 /* The time point the dnlc directory caching was disabled */ 127 static hrtime_t ufs_dc_disable_at; 128 /* directory caching disable duration */ 129 static hrtime_t ufs_dc_disable_duration = (hrtime_t)NANOSEC * 5; 130 131 #ifdef DEBUG 132 int dirchk = 1; 133 #else /* !DEBUG */ 134 int dirchk = 0; 135 #endif /* DEBUG */ 136 int ufs_negative_cache = 1; 137 uint64_t ufs_dirremove_retry_cnt; 138 139 static void dirbad(); 140 static int ufs_dirrename(); 141 static int ufs_diraddentry(); 142 static int ufs_dirempty(); 143 static int ufs_dirscan(); 144 static int ufs_dirclrdotdot(); 145 static int ufs_dirfixdotdot(); 146 static int ufs_dirpurgedotdot(); 147 static int dirprepareentry(); 148 static int ufs_dirmakedirect(); 149 static int dirbadname(); 150 static int dirmangled(); 151 152 /* 153 * Check accessibility of directory against inquired mode and type. 154 * Execute access is required to search the directory. 155 * Access for write is interpreted as allowing 156 * deletion of files in the directory. 157 * Note, the reader i_contents lock will be acquired in 158 * ufs_iaccess(). 
159 */ 160 int 161 ufs_diraccess(struct inode *ip, int mode, struct cred *cr) 162 { 163 if (((ip->i_mode & IFMT) != IFDIR) && 164 ((ip->i_mode & IFMT) != IFATTRDIR)) 165 return (ENOTDIR); 166 167 return (ufs_iaccess(ip, mode, cr, 1)); 168 } 169 170 /* 171 * Look for a given name in a directory. On successful return, *ipp 172 * will point to the VN_HELD inode. 173 * The caller is responsible for checking accessibility upfront 174 * via ufs_diraccess(). 175 */ 176 int 177 ufs_dirlook( 178 struct inode *dp, 179 char *namep, 180 struct inode **ipp, 181 struct cred *cr, 182 int skipdnlc) /* skip the 1st level dnlc */ 183 { 184 uint64_t handle; 185 struct fbuf *fbp; /* a buffer of directory entries */ 186 struct direct *ep; /* the current directory entry */ 187 struct vnode *vp; 188 struct vnode *dvp; /* directory vnode ptr */ 189 struct ulockfs *ulp; 190 dcanchor_t *dcap; 191 off_t endsearch; /* offset to end directory search */ 192 off_t offset; 193 off_t start_off; /* starting offset from middle search */ 194 off_t last_offset; /* last offset */ 195 int entryoffsetinblock; /* offset of ep in addr's buffer */ 196 int numdirpasses; /* strategy for directory search */ 197 int namlen; /* length of name */ 198 int err; 199 int doingchk; 200 int i; 201 int caching; 202 int indeadlock; 203 ino_t ep_ino; /* entry i number */ 204 ino_t chkino; 205 ushort_t ep_reclen; /* direct local d_reclen */ 206 207 ASSERT(*namep != '\0'); /* All callers ensure *namep is non null */ 208 209 if (dp->i_ufsvfs) 210 ulp = &dp->i_ufsvfs->vfs_ulockfs; 211 212 /* 213 * Check the directory name lookup cache, first for individual files 214 * then for complete directories. 
215 */ 216 dvp = ITOV(dp); 217 if (!skipdnlc && (vp = dnlc_lookup(dvp, namep))) { 218 /* vp is already held from dnlc_lookup */ 219 if (vp == DNLC_NO_VNODE) { 220 VN_RELE(vp); 221 return (ENOENT); 222 } 223 *ipp = VTOI(vp); 224 return (0); 225 } 226 227 dcap = &dp->i_danchor; 228 229 /* 230 * Grab the reader lock on the directory data before checking 231 * the dnlc to avoid a race with ufs_dirremove() & friends. 232 * 233 * ufs_tryirwlock uses rw_tryenter and checks for SLOCK to 234 * avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock 235 * possible, retries the operation. 236 */ 237 ufs_tryirwlock((&dp->i_rwlock), RW_READER, retry_dircache); 238 if (indeadlock) 239 return (EAGAIN); 240 241 switch (dnlc_dir_lookup(dcap, namep, &handle)) { 242 case DFOUND: 243 ep_ino = (ino_t)H_TO_INO(handle); 244 if (dp->i_number == ep_ino) { 245 VN_HOLD(dvp); /* want ourself, "." */ 246 *ipp = dp; 247 rw_exit(&dp->i_rwlock); 248 return (0); 249 } 250 if (namep[0] == '.' && namep[1] == '.' && namep[2] == 0) { 251 uint64_t handle2; 252 /* 253 * release the lock on the dir we are searching 254 * to avoid a deadlock when grabbing the 255 * i_contents lock in ufs_iget_alloced(). 256 */ 257 rw_exit(&dp->i_rwlock); 258 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER); 259 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr); 260 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 261 /* 262 * must recheck as we dropped dp->i_rwlock 263 */ 264 ufs_tryirwlock(&dp->i_rwlock, RW_READER, retry_parent); 265 if (indeadlock) { 266 if (!err) 267 VN_RELE(ITOV(*ipp)); 268 return (EAGAIN); 269 } 270 if (!err && (dnlc_dir_lookup(dcap, namep, &handle2) 271 == DFOUND) && (handle == handle2)) { 272 dnlc_update(dvp, namep, ITOV(*ipp)); 273 rw_exit(&dp->i_rwlock); 274 return (0); 275 } 276 /* check failed, read the actual directory */ 277 if (!err) { 278 VN_RELE(ITOV(*ipp)); 279 } 280 goto restart; 281 } 282 /* usual case of not "." nor ".." 
*/ 283 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER); 284 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr); 285 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 286 if (err) { 287 rw_exit(&dp->i_rwlock); 288 return (err); 289 } 290 dnlc_update(dvp, namep, ITOV(*ipp)); 291 rw_exit(&dp->i_rwlock); 292 return (0); 293 case DNOENT: 294 if (ufs_negative_cache && (dp->i_nlink > 0)) { 295 dnlc_enter(dvp, namep, DNLC_NO_VNODE); 296 } 297 rw_exit(&dp->i_rwlock); 298 return (ENOENT); 299 default: 300 break; 301 } 302 restart: 303 304 fbp = NULL; 305 doingchk = 0; 306 chkino = 0; 307 caching = 0; 308 309 /* 310 * Attempt to cache any directories greater than the tunable 311 * ufs_min_cache_dir. If it fails due to memory shortage (DNOMEM), 312 * disable caching for this directory and record the system time. 313 * Any attempt after the disable time has expired will enable 314 * the caching again. 315 */ 316 if (dp->i_size >= ufs_min_dir_cache) { 317 /* 318 * if the directory caching disable time has expired 319 * enable the caching again. 320 */ 321 if (dp->i_cachedir == CD_DISABLED_NOMEM && 322 gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) { 323 ufs_dc_disable_at = 0; 324 dp->i_cachedir = CD_ENABLED; 325 } 326 if (dp->i_cachedir == CD_ENABLED) { 327 switch (dnlc_dir_start(dcap, dp->i_size >> 328 AV_DIRECT_SHIFT)) { 329 case DNOMEM: 330 dp->i_cachedir = CD_DISABLED_NOMEM; 331 ufs_dc_disable_at = gethrtime(); 332 break; 333 case DTOOBIG: 334 dp->i_cachedir = CD_DISABLED_TOOBIG; 335 break; 336 case DOK: 337 caching = 1; 338 break; 339 default: 340 break; 341 } 342 } 343 } 344 /* 345 * If caching we don't stop when the file has been 346 * found, but need to know later, so clear *ipp now 347 */ 348 *ipp = NULL; 349 350 recheck: 351 if (caching) { 352 offset = 0; 353 entryoffsetinblock = 0; 354 numdirpasses = 1; 355 } else { 356 /* 357 * Take care to look at dp->i_diroff only once, as it 358 * may be changing due to other threads/cpus. 
359 */ 360 offset = dp->i_diroff; 361 if (offset > dp->i_size) { 362 offset = 0; 363 } 364 if (offset == 0) { 365 entryoffsetinblock = 0; 366 numdirpasses = 1; 367 } else { 368 start_off = offset; 369 370 entryoffsetinblock = blkoff(dp->i_fs, offset); 371 if (entryoffsetinblock != 0) { 372 err = blkatoff(dp, offset, (char **)0, &fbp); 373 if (err) 374 goto bad; 375 } 376 numdirpasses = 2; 377 } 378 } 379 endsearch = P2ROUNDUP_TYPED(dp->i_size, DIRBLKSIZ, u_offset_t); 380 namlen = strlen(namep); 381 last_offset = 0; 382 383 searchloop: 384 while (offset < endsearch) { 385 /* 386 * If offset is on a block boundary, 387 * read the next directory block. 388 * Release previous if it exists. 389 */ 390 if (blkoff(dp->i_fs, offset) == 0) { 391 if (fbp != NULL) { 392 fbrelse(fbp, S_OTHER); 393 } 394 err = blkatoff(dp, offset, (char **)0, &fbp); 395 if (err) 396 goto bad; 397 entryoffsetinblock = 0; 398 } 399 400 /* 401 * If the offset to the next entry is invalid or if the 402 * next entry is a zero length record or if the record 403 * length is invalid, then skip to the next directory 404 * block. Complete validation checks are done if the 405 * record length is invalid. 406 * 407 * Full validation checks are slow so they are disabled 408 * by default. Complete checks can be run by patching 409 * "dirchk" to be true. 410 * 411 * We have to check the validity of entryoffsetinblock 412 * here because it can be set to i_diroff above. 
413 */ 414 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblock); 415 if ((entryoffsetinblock & 0x3) || ep->d_reclen == 0 || 416 (dirchk || (ep->d_reclen & 0x3)) && 417 dirmangled(dp, ep, entryoffsetinblock, offset)) { 418 i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); 419 offset += i; 420 entryoffsetinblock += i; 421 if (caching) { 422 dnlc_dir_purge(dcap); 423 caching = 0; 424 } 425 continue; 426 } 427 428 ep_reclen = ep->d_reclen; 429 430 /* 431 * Add named entries and free space into the directory cache 432 */ 433 if (caching) { 434 ushort_t extra; 435 off_t off2; 436 437 if (ep->d_ino == 0) { 438 extra = ep_reclen; 439 if (offset & (DIRBLKSIZ - 1)) { 440 dnlc_dir_purge(dcap); 441 dp->i_cachedir = CD_DISABLED; 442 caching = 0; 443 } 444 } else { 445 /* 446 * entries hold the previous offset except the 447 * 1st which holds the offset + 1 448 */ 449 if (offset & (DIRBLKSIZ - 1)) { 450 off2 = last_offset; 451 } else { 452 off2 = offset + 1; 453 } 454 caching = (dnlc_dir_add_entry(dcap, ep->d_name, 455 INO_OFF_TO_H(ep->d_ino, off2)) == DOK); 456 extra = ep_reclen - DIRSIZ(ep); 457 } 458 if (caching && (extra >= LDIRSIZ(1))) { 459 caching = (dnlc_dir_add_space(dcap, extra, 460 (uint64_t)offset) == DOK); 461 } 462 } 463 464 /* 465 * Check for a name match. 466 * We have the parent inode read locked with i_rwlock. 467 */ 468 if (ep->d_ino && ep->d_namlen == namlen && 469 *namep == *ep->d_name && /* fast chk 1st chr */ 470 bcmp(namep, ep->d_name, (int)ep->d_namlen) == 0) { 471 472 /* 473 * We have to release the fbp early here to avoid 474 * a possible deadlock situation where we have the 475 * fbp and want the directory inode and someone doing 476 * a ufs_direnter_* has the directory inode and wants 477 * the fbp. XXX - is this still needed? 
478 */ 479 ep_ino = (ino_t)ep->d_ino; 480 ASSERT(fbp != NULL); 481 fbrelse(fbp, S_OTHER); 482 fbp = NULL; 483 484 /* 485 * Atomic update (read lock held) 486 */ 487 dp->i_diroff = offset; 488 489 if (namlen == 2 && namep[0] == '.' && namep[1] == '.') { 490 struct timeval32 omtime; 491 492 if (caching) { 493 dnlc_dir_purge(dcap); 494 caching = 0; 495 } 496 if (doingchk) { 497 /* 498 * if the inumber didn't change 499 * continue with already found inode. 500 */ 501 if (ep_ino == chkino) 502 goto checkok; 503 else { 504 VN_RELE(ITOV(*ipp)); 505 /* *ipp is nulled at restart */ 506 goto restart; 507 } 508 } 509 /* 510 * release the lock on the dir we are searching 511 * to avoid a deadlock when grabbing the 512 * i_contents lock in ufs_iget_alloced(). 513 */ 514 omtime = dp->i_mtime; 515 rw_exit(&dp->i_rwlock); 516 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, 517 RW_READER); 518 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, 519 cr); 520 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 521 ufs_tryirwlock(&dp->i_rwlock, RW_READER, 522 retry_disk); 523 if (indeadlock) { 524 if (!err) 525 VN_RELE(ITOV(*ipp)); 526 return (EAGAIN); 527 } 528 if (err) 529 goto bad; 530 /* 531 * Since we released the lock on the directory, 532 * we must check that the same inode is still 533 * the ".." entry for this directory. 534 */ 535 /*CSTYLED*/ 536 if (timercmp(&omtime, &dp->i_mtime, !=)) { 537 /* 538 * Modification time changed on the 539 * directory, we must go check if 540 * the inumber changed for ".." 541 */ 542 doingchk = 1; 543 chkino = ep_ino; 544 entryoffsetinblock = 0; 545 if (caching) { 546 /* 547 * Forget directory caching 548 * for this rare case 549 */ 550 dnlc_dir_purge(dcap); 551 caching = 0; 552 } 553 goto recheck; 554 } 555 } else if (dp->i_number == ep_ino) { 556 VN_HOLD(dvp); /* want ourself, "." 
*/ 557 *ipp = dp; 558 if (caching) { 559 dnlc_dir_purge(dcap); 560 caching = 0; 561 } 562 } else { 563 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, 564 RW_READER); 565 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, 566 cr); 567 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 568 if (err) 569 goto bad; 570 } 571 checkok: 572 ASSERT(*ipp); 573 dnlc_update(dvp, namep, ITOV(*ipp)); 574 /* 575 * If we are not caching then just return the entry 576 * otherwise complete loading up the cache 577 */ 578 if (!caching) { 579 rw_exit(&dp->i_rwlock); 580 return (0); 581 } 582 err = blkatoff(dp, offset, (char **)0, &fbp); 583 if (err) 584 goto bad; 585 } 586 last_offset = offset; 587 offset += ep_reclen; 588 entryoffsetinblock += ep_reclen; 589 } 590 /* 591 * If we started in the middle of the directory and failed 592 * to find our target, we must check the beginning as well. 593 */ 594 if (numdirpasses == 2) { 595 numdirpasses--; 596 offset = 0; 597 endsearch = start_off; 598 goto searchloop; 599 } 600 601 /* 602 * If whole directory caching is on (or was originally on) then 603 * the entry may have been found. 604 */ 605 if (*ipp == NULL) { 606 err = ENOENT; 607 if (ufs_negative_cache && (dp->i_nlink > 0)) { 608 dnlc_enter(dvp, namep, DNLC_NO_VNODE); 609 } 610 } 611 if (caching) { 612 dnlc_dir_complete(dcap); 613 caching = 0; 614 } 615 616 bad: 617 if (err && *ipp) { 618 /* 619 * err and *ipp can both be set if we were attempting to 620 * cache the directory, and we found the entry, then later 621 * while trying to complete the directory cache encountered 622 * a error (eg reading a directory sector). 623 */ 624 VN_RELE(ITOV(*ipp)); 625 *ipp = NULL; 626 } 627 628 if (fbp) 629 fbrelse(fbp, S_OTHER); 630 rw_exit(&dp->i_rwlock); 631 if (caching) 632 dnlc_dir_purge(dcap); 633 return (err); 634 } 635 636 /* 637 * Write a new directory entry for DE_CREATE or DE_MKDIR operations. 
638 */ 639 int 640 ufs_direnter_cm( 641 struct inode *tdp, /* target directory to make entry in */ 642 char *namep, /* name of entry */ 643 enum de_op op, /* entry operation */ 644 struct vattr *vap, /* attributes if new inode needed */ 645 struct inode **ipp, /* return entered inode here */ 646 struct cred *cr, /* user credentials */ 647 int flags) /* no entry exists */ 648 { 649 struct inode *tip; /* inode of (existing) target file */ 650 char *s; 651 struct ufs_slot slot; /* slot info to pass around */ 652 int namlen; /* length of name */ 653 int err; /* error number */ 654 struct inode *nip; /* new inode */ 655 int do_rele_nip = 0; /* release nip */ 656 int noentry = flags & ~IQUIET; 657 int quiet = flags & IQUIET; /* Suppress out of inodes message */ 658 int indeadlock; 659 struct ulockfs *ulp; 660 661 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 662 663 if (((tdp->i_mode & IFMT) == IFATTRDIR) && ((op == DE_MKDIR) || 664 ((vap->va_type == VCHR) || (vap->va_type == VBLK) || 665 (vap->va_type == VDOOR) || (vap->va_type == VSOCK) || 666 (vap->va_type == VFIFO)))) 667 return (EINVAL); 668 669 /* don't allow '/' characters in pathname component */ 670 for (s = namep, namlen = 0; *s; s++, namlen++) 671 if (*s == '/') 672 return (EACCES); 673 ASSERT(namlen); 674 675 /* 676 * Check accessibility of target directory. 677 */ 678 if (err = ufs_diraccess(tdp, IEXEC, cr)) 679 return (err); 680 681 /* 682 * If name is "." or ".." then if this is a create look it up 683 * and return EEXIST. 684 */ 685 if (namep[0] == '.' && 686 (namlen == 1 || (namlen == 2 && namep[1] == '.'))) { 687 /* 688 * ufs_dirlook will acquire the i_rwlock 689 */ 690 if (tdp->i_ufsvfs) 691 ulp = &tdp->i_ufsvfs->vfs_ulockfs; 692 rw_exit(&tdp->i_rwlock); 693 if (err = ufs_dirlook(tdp, namep, ipp, cr, 0)) { 694 if (err == EAGAIN) 695 return (err); 696 697 /* 698 * ufs_tryirwlock uses rw_tryenter and checks for 699 * SLOCK to avoid i_rwlock, ufs_lockfs_begin deadlock. 
700 * If deadlock possible, retries the operation. 701 */ 702 ufs_tryirwlock(&tdp->i_rwlock, RW_WRITER, retry_err); 703 if (indeadlock) 704 return (EAGAIN); 705 706 return (err); 707 } 708 ufs_tryirwlock(&tdp->i_rwlock, RW_WRITER, retry); 709 if (indeadlock) { 710 VN_RELE(ITOV(*ipp)); 711 return (EAGAIN); 712 } 713 return (EEXIST); 714 } 715 716 /* 717 * If target directory has not been removed, then we can consider 718 * allowing file to be created. 719 */ 720 if (tdp->i_nlink <= 0) { 721 return (ENOENT); 722 } 723 724 /* 725 * Search for the entry. Return VN_HELD tip if found. 726 */ 727 tip = NULL; 728 slot.fbp = NULL; 729 slot.status = NONE; 730 rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER); 731 rw_enter(&tdp->i_contents, RW_WRITER); 732 err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, noentry); 733 if (err) 734 goto out; 735 if (tip) { 736 ASSERT(!noentry); 737 *ipp = tip; 738 err = EEXIST; 739 } else { 740 /* 741 * The entry does not exist. Check write permission in 742 * directory to see if entry can be created. 743 */ 744 if (err = ufs_iaccess(tdp, IWRITE, cr, 0)) 745 goto out; 746 /* 747 * Make new inode and directory entry. 748 */ 749 tdp->i_flag |= quiet; 750 if (err = ufs_dirmakeinode(tdp, &nip, vap, op, cr)) { 751 if (nip != NULL) 752 do_rele_nip = 1; 753 goto out; 754 } 755 if (err = ufs_diraddentry(tdp, namep, op, 756 namlen, &slot, nip, NULL, cr)) { 757 /* 758 * Unmake the inode we just made. 
759 */ 760 rw_enter(&nip->i_contents, RW_WRITER); 761 if (((nip->i_mode & IFMT) == IFDIR) || 762 ((nip->i_mode & IFMT) == IFATTRDIR)) { 763 tdp->i_nlink--; 764 ufs_setreclaim(tdp); 765 tdp->i_flag |= ICHG; 766 tdp->i_seq++; 767 TRANS_INODE(tdp->i_ufsvfs, tdp); 768 ITIMES_NOLOCK(tdp); 769 } 770 nip->i_nlink = 0; 771 ufs_setreclaim(nip); 772 TRANS_INODE(nip->i_ufsvfs, nip); 773 nip->i_flag |= ICHG; 774 nip->i_seq++; 775 ITIMES_NOLOCK(nip); 776 rw_exit(&nip->i_contents); 777 do_rele_nip = 1; 778 } else { 779 *ipp = nip; 780 } 781 } 782 783 out: 784 if (slot.fbp) 785 fbrelse(slot.fbp, S_OTHER); 786 787 tdp->i_flag &= ~quiet; 788 rw_exit(&tdp->i_contents); 789 790 /* 791 * Drop vfs_dqrwlock before calling VN_RELE() on nip to 792 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader. 793 */ 794 rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock); 795 796 if (do_rele_nip) { 797 VN_RELE(ITOV(nip)); 798 } 799 800 return (err); 801 } 802 803 /* 804 * Write a new directory entry for DE_LINK, DE_SYMLINK or DE_RENAME operations. 805 * If tvpp is non-null, return with the pointer to the target vnode. 806 */ 807 int 808 ufs_direnter_lr( 809 struct inode *tdp, /* target directory to make entry in */ 810 char *namep, /* name of entry */ 811 enum de_op op, /* entry operation */ 812 struct inode *sdp, /* source inode parent if rename */ 813 struct inode *sip, /* source inode */ 814 struct cred *cr, /* user credentials */ 815 vnode_t **tvpp) /* Return: (held) vnode of (existing) target */ 816 { 817 struct inode *tip; /* inode of (existing) target file */ 818 char *s; 819 struct ufs_slot slot; /* slot info to pass around */ 820 int namlen; /* length of name */ 821 int err; /* error number */ 822 823 /* don't allow '/' characters in pathname component */ 824 for (s = namep, namlen = 0; *s; s++, namlen++) 825 if (*s == '/') 826 return (EACCES); 827 ASSERT(namlen); 828 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 829 830 /* 831 * If name is "." or ".." 
then if this is a create look it up 832 * and return EEXIST. Rename or link TO "." or ".." is forbidden. 833 */ 834 if (namep[0] == '.' && 835 (namlen == 1 || (namlen == 2 && namep[1] == '.'))) { 836 if (op == DE_RENAME) { 837 return (EINVAL); /* *SIGH* should be ENOTEMPTY */ 838 } 839 return (EEXIST); 840 } 841 /* 842 * For link and rename lock the source entry and check the link count 843 * to see if it has been removed while it was unlocked. If not, we 844 * increment the link count and force the inode to disk to make sure 845 * that it is there before any directory entry that points to it. 846 * 847 * In the case of a symbolic link, we are dealing with a new inode 848 * which does not yet have any links. We've created it with a link 849 * count of 1, and we don't want to increment it since this will be 850 * its first link. 851 * 852 * We are about to push the inode to disk. We make sure 853 * that the inode's data blocks are flushed first so the 854 * inode and it's data blocks are always in sync. This 855 * adds some robustness in in the event of a power failure 856 * or panic where sync fails. If we panic before the 857 * inode is updated, then the inode still refers to the 858 * old data blocks (or none for a new file). If we panic 859 * after the inode is updated, then the inode refers to 860 * the new data blocks. 861 * 862 * We do this before grabbing the i_contents lock because 863 * ufs_syncip() will want that lock. We could do the data 864 * syncing after the removal checks, but upon return from 865 * the data sync we would have to repeat the removal 866 * checks. 
867 */ 868 if (err = TRANS_SYNCIP(sip, 0, I_DSYNC, TOP_FSYNC)) { 869 return (err); 870 } 871 872 rw_enter(&sip->i_contents, RW_WRITER); 873 if (sip->i_nlink <= 0) { 874 rw_exit(&sip->i_contents); 875 return (ENOENT); 876 } 877 if (sip->i_nlink == MAXLINK) { 878 rw_exit(&sip->i_contents); 879 return (EMLINK); 880 } 881 882 /* 883 * Sync the indirect blocks associated with the file 884 * for the same reasons as described above. Since this 885 * call wants the i_contents lock held for it we can do 886 * this here with no extra work. 887 */ 888 if (err = ufs_sync_indir(sip)) { 889 rw_exit(&sip->i_contents); 890 return (err); 891 } 892 893 if (op != DE_SYMLINK) 894 sip->i_nlink++; 895 TRANS_INODE(sip->i_ufsvfs, sip); 896 sip->i_flag |= ICHG; 897 sip->i_seq++; 898 ufs_iupdat(sip, I_SYNC); 899 rw_exit(&sip->i_contents); 900 901 /* 902 * If target directory has not been removed, then we can consider 903 * allowing file to be created. 904 */ 905 if (tdp->i_nlink <= 0) { 906 err = ENOENT; 907 goto out2; 908 } 909 910 /* 911 * Check accessibility of target directory. 912 */ 913 if (err = ufs_diraccess(tdp, IEXEC, cr)) 914 goto out2; 915 916 /* 917 * Search for the entry. Return VN_HELD tip if found. 918 */ 919 tip = NULL; 920 slot.status = NONE; 921 slot.fbp = NULL; 922 rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER); 923 rw_enter(&tdp->i_contents, RW_WRITER); 924 err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, 0); 925 if (err) 926 goto out; 927 928 if (tip) { 929 switch (op) { 930 case DE_RENAME: 931 err = ufs_dirrename(sdp, sip, tdp, namep, 932 tip, &slot, cr); 933 break; 934 935 case DE_LINK: 936 case DE_SYMLINK: 937 /* 938 * Can't link to an existing file. 939 */ 940 err = EEXIST; 941 break; 942 default: 943 break; 944 } 945 } else { 946 /* 947 * The entry does not exist. Check write permission in 948 * directory to see if entry can be created. 
949 */ 950 if (err = ufs_iaccess(tdp, IWRITE, cr, 0)) 951 goto out; 952 err = ufs_diraddentry(tdp, namep, op, namlen, &slot, sip, sdp, 953 cr); 954 } 955 956 out: 957 if (slot.fbp) 958 fbrelse(slot.fbp, S_OTHER); 959 960 rw_exit(&tdp->i_contents); 961 962 /* 963 * Drop vfs_dqrwlock before calling VN_RELE() on tip to 964 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader. 965 */ 966 rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock); 967 968 /* 969 * If we renamed a file over the top of an existing file, 970 * or linked a file to an existing file (or tried to), 971 * then set *tvpp to the target vnode, if tvpp is non-null 972 * otherwise, release and delete (or just release) the inode. 973 * 974 * N.B., by returning the target's vnode pointer to the caller, 975 * that caller becomes responsible for doing the VN_RELE. 976 */ 977 if (tip) { 978 if ((err == 0) && (tvpp != NULL)) { 979 *tvpp = ITOV(tip); 980 } else { 981 VN_RELE(ITOV(tip)); 982 } 983 } 984 985 out2: 986 if (err) { 987 /* 988 * Undo bumped link count. 989 */ 990 if (op != DE_SYMLINK) { 991 rw_enter(&sip->i_contents, RW_WRITER); 992 sip->i_nlink--; 993 ufs_setreclaim(sip); 994 TRANS_INODE(sip->i_ufsvfs, sip); 995 sip->i_flag |= ICHG; 996 sip->i_seq++; 997 ITIMES_NOLOCK(sip); 998 rw_exit(&sip->i_contents); 999 } 1000 } 1001 return (err); 1002 } 1003 1004 /* 1005 * Check for the existence of a name in a directory (unless noentry 1006 * is set) , or else of an empty 1007 * slot in which an entry may be made. If the requested name is found, 1008 * then on return *ipp points at the inode and *offp contains 1009 * its offset in the directory. If the name is not found, then *ipp 1010 * will be NULL and *slotp will contain information about a directory slot in 1011 * which an entry may be made (either an empty slot, or the first position 1012 * past the end of the directory). 1013 * The target directory inode (tdp) is supplied write locked (i_rwlock). 1014 * 1015 * This may not be used on "." 
or "..", but aliases of "." are ok. 1016 */ 1017 int 1018 ufs_dircheckforname( 1019 struct inode *tdp, /* inode of directory being checked */ 1020 char *namep, /* name we're checking for */ 1021 int namlen, /* length of name, excluding null */ 1022 struct ufs_slot *slotp, /* slot structure */ 1023 struct inode **ipp, /* return inode if we find one */ 1024 struct cred *cr, 1025 int noentry) /* noentry - just look for space */ 1026 { 1027 uint64_t handle; 1028 struct fbuf *fbp; /* pointer to directory block */ 1029 struct direct *ep; /* directory entry */ 1030 struct direct *nep; /* next directory entry */ 1031 dcanchor_t *dcap; 1032 vnode_t *dvp; /* directory vnode ptr */ 1033 off_t dirsize; /* size of the directory */ 1034 off_t offset; /* offset in the directory */ 1035 off_t last_offset; /* last offset */ 1036 off_t enduseful; /* pointer past last used dir slot */ 1037 int entryoffsetinblk; /* offset of ep in fbp's buffer */ 1038 int i; /* length of mangled entry */ 1039 int needed; 1040 int err; 1041 int first; 1042 int caching; 1043 int stat; 1044 ino_t ep_ino; 1045 slotstat_t initstat = slotp->status; 1046 1047 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 1048 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 1049 ASSERT(*ipp == NULL); 1050 fbp = NULL; 1051 1052 /* 1053 * First check if there is a complete cache of the directory. 1054 */ 1055 dvp = ITOV(tdp); 1056 1057 dcap = &tdp->i_danchor; 1058 if (noentry) { 1059 /* 1060 * We know from the 1st level dnlc cache that the entry 1061 * doesn't exist, so don't bother searching the directory 1062 * cache, but just look for space (possibly in the directory 1063 * cache). 1064 */ 1065 stat = DNOENT; 1066 } else { 1067 stat = dnlc_dir_lookup(dcap, namep, &handle); 1068 } 1069 switch (stat) { 1070 case DFOUND: 1071 ep_ino = (ino_t)H_TO_INO(handle); 1072 if (tdp->i_number == ep_ino) { 1073 *ipp = tdp; /* we want ourself, ie "." 
*/ 1074 VN_HOLD(dvp); 1075 } else { 1076 err = ufs_iget_alloced(tdp->i_vfs, ep_ino, ipp, cr); 1077 if (err) 1078 return (err); 1079 } 1080 offset = H_TO_OFF(handle); 1081 first = 0; 1082 if (offset & 1) { 1083 /* This is the first entry in the block */ 1084 first = 1; 1085 offset -= 1; 1086 ASSERT((offset & (DIRBLKSIZ - 1)) == 0); 1087 } 1088 err = blkatoff(tdp, offset, (char **)&ep, &fbp); 1089 if (err) { 1090 VN_RELE(ITOV(*ipp)); 1091 *ipp = NULL; 1092 return (err); 1093 } 1094 /* 1095 * Check the validity of the entry. 1096 * If it's bad, then throw away the cache and 1097 * continue without it. The dirmangled() routine 1098 * will then be called upon it. 1099 */ 1100 if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) { 1101 VN_RELE(ITOV(*ipp)); 1102 *ipp = NULL; 1103 dnlc_dir_purge(dcap); 1104 break; 1105 } 1106 /* 1107 * Remember the returned offset is the offset of the 1108 * preceding record (unless this is the 1st record 1109 * in the DIRBLKSIZ sized block (disk sector)), then it's 1110 * offset + 1. Note, no real offsets are on odd boundaries. 1111 */ 1112 if (first) { 1113 ASSERT((offset & (DIRBLKSIZ - 1)) == 0); 1114 slotp->offset = offset; 1115 slotp->size = 0; 1116 slotp->ep = ep; 1117 } else { 1118 /* get the next entry */ 1119 nep = (struct direct *)((char *)ep + ep->d_reclen); 1120 /* 1121 * Check the validity of this entry as well 1122 * If it's bad, then throw away the cache and 1123 * continue without it. The dirmangled() routine 1124 * will then be called upon it. 
1125 */ 1126 if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) || 1127 (nep->d_ino != ep_ino)) { 1128 VN_RELE(ITOV(*ipp)); 1129 *ipp = NULL; 1130 dnlc_dir_purge(dcap); 1131 break; 1132 } 1133 slotp->offset = offset + ep->d_reclen; 1134 slotp->size = ep->d_reclen; 1135 slotp->ep = nep; 1136 } 1137 slotp->status = EXIST; 1138 slotp->fbp = fbp; 1139 slotp->endoff = 0; 1140 slotp->cached = 1; 1141 dnlc_update(dvp, namep, ITOV(*ipp)); 1142 return (0); 1143 case DNOENT: 1144 /* 1145 * The caller gets to set the initial slot status to 1146 * indicate whether it's interested in getting a 1147 * empty slot. For example, the status can be set 1148 * to FOUND when an entry is being deleted. 1149 */ 1150 ASSERT(slotp->fbp == NULL); 1151 if (slotp->status == FOUND) { 1152 return (0); 1153 } 1154 switch (dnlc_dir_rem_space_by_len(dcap, LDIRSIZ(namlen), 1155 &handle)) { 1156 case DFOUND: 1157 offset = (off_t)handle; 1158 err = blkatoff(tdp, offset, (char **)&ep, &fbp); 1159 if (err) { 1160 dnlc_dir_purge(dcap); 1161 ASSERT(*ipp == NULL); 1162 return (err); 1163 } 1164 /* 1165 * Check the validity of the entry. 1166 * If it's bad, then throw away the cache and 1167 * continue without it. The dirmangled() routine 1168 * will then be called upon it. 1169 */ 1170 if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) { 1171 dnlc_dir_purge(dcap); 1172 break; 1173 } 1174 /* 1175 * Remember the returned offset is the offset of the 1176 * containing record. 1177 */ 1178 slotp->status = FOUND; 1179 slotp->ep = ep; 1180 slotp->offset = offset; 1181 slotp->fbp = fbp; 1182 slotp->size = ep->d_reclen; 1183 /* 1184 * Set end offset to 0. Truncation is handled 1185 * because the dnlc cache will blow away the 1186 * cached directory when an entry is removed 1187 * that drops the entries left to less than half 1188 * the minumum number (dnlc_min_dir_cache). 
1189 */ 1190 slotp->endoff = 0; 1191 slotp->cached = 1; 1192 return (0); 1193 case DNOENT: 1194 slotp->status = NONE; 1195 slotp->offset = P2ROUNDUP_TYPED(tdp->i_size, 1196 DIRBLKSIZ, u_offset_t); 1197 slotp->size = DIRBLKSIZ; 1198 slotp->endoff = 0; 1199 slotp->cached = 1; 1200 return (0); 1201 default: 1202 break; 1203 } 1204 break; 1205 } 1206 slotp->cached = 0; 1207 caching = NULL; 1208 if (!noentry && tdp->i_size >= ufs_min_dir_cache) { 1209 /* 1210 * if the directory caching disable time has expired 1211 * enable caching again. 1212 */ 1213 if (tdp->i_cachedir == CD_DISABLED_NOMEM && 1214 gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) { 1215 ufs_dc_disable_at = 0; 1216 tdp->i_cachedir = CD_ENABLED; 1217 } 1218 /* 1219 * Attempt to cache any directories greater than the tunable 1220 * ufs_min_cache_dir. If it fails due to memory shortage 1221 * (DNOMEM), disable caching for this directory and record 1222 * the system time. Any attempt after the disable time has 1223 * expired will enable the caching again. 1224 */ 1225 if (tdp->i_cachedir == CD_ENABLED) { 1226 switch (dnlc_dir_start(dcap, 1227 tdp->i_size >> AV_DIRECT_SHIFT)) { 1228 case DNOMEM: 1229 tdp->i_cachedir = CD_DISABLED_NOMEM; 1230 ufs_dc_disable_at = gethrtime(); 1231 break; 1232 case DTOOBIG: 1233 tdp->i_cachedir = CD_DISABLED_TOOBIG; 1234 break; 1235 case DOK: 1236 caching = 1; 1237 break; 1238 default: 1239 break; 1240 } 1241 } 1242 } 1243 1244 /* 1245 * No point in using i_diroff since we must search whole directory 1246 */ 1247 dirsize = P2ROUNDUP_TYPED(tdp->i_size, DIRBLKSIZ, u_offset_t); 1248 enduseful = 0; 1249 offset = last_offset = 0; 1250 entryoffsetinblk = 0; 1251 needed = (int)LDIRSIZ(namlen); 1252 while (offset < dirsize) { 1253 /* 1254 * If offset is on a block boundary, 1255 * read the next directory block. 1256 * Release previous if it exists. 
1257 */ 1258 if (blkoff(tdp->i_fs, offset) == 0) { 1259 if (fbp != NULL) 1260 fbrelse(fbp, S_OTHER); 1261 1262 err = blkatoff(tdp, offset, (char **)0, &fbp); 1263 if (err) { 1264 ASSERT(*ipp == NULL); 1265 if (caching) { 1266 dnlc_dir_purge(dcap); 1267 } 1268 return (err); 1269 } 1270 entryoffsetinblk = 0; 1271 } 1272 /* 1273 * If still looking for a slot, and at a DIRBLKSIZ 1274 * boundary, have to start looking for free space 1275 * again. 1276 */ 1277 if (slotp->status == NONE && 1278 (entryoffsetinblk & (DIRBLKSIZ - 1)) == 0) { 1279 slotp->offset = -1; 1280 } 1281 /* 1282 * If the next entry is a zero length record or if the 1283 * record length is invalid, then skip to the next 1284 * directory block. Complete validation checks are 1285 * done if the record length is invalid. 1286 * 1287 * Full validation checks are slow so they are disabled 1288 * by default. Complete checks can be run by patching 1289 * "dirchk" to be true. 1290 * 1291 * We do not have to check the validity of 1292 * entryoffsetinblk here because it starts out as zero 1293 * and is only incremented by d_reclen values that we 1294 * validate here. 
1295 */ 1296 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk); 1297 if (ep->d_reclen == 0 || 1298 (dirchk || (ep->d_reclen & 0x3)) && 1299 dirmangled(tdp, ep, entryoffsetinblk, offset)) { 1300 i = DIRBLKSIZ - (entryoffsetinblk & (DIRBLKSIZ - 1)); 1301 offset += i; 1302 entryoffsetinblk += i; 1303 if (caching) { 1304 dnlc_dir_purge(dcap); 1305 caching = 0; 1306 } 1307 continue; 1308 } 1309 1310 /* 1311 * Add named entries and free space into the directory cache 1312 */ 1313 if (caching) { 1314 ushort_t extra; 1315 off_t off2; 1316 1317 if (ep->d_ino == 0) { 1318 extra = ep->d_reclen; 1319 if (offset & (DIRBLKSIZ - 1)) { 1320 dnlc_dir_purge(dcap); 1321 caching = 0; 1322 } 1323 } else { 1324 /* 1325 * entries hold the previous offset if 1326 * not the 1st one 1327 */ 1328 if (offset & (DIRBLKSIZ - 1)) { 1329 off2 = last_offset; 1330 } else { 1331 off2 = offset + 1; 1332 } 1333 caching = (dnlc_dir_add_entry(dcap, ep->d_name, 1334 INO_OFF_TO_H(ep->d_ino, off2)) == DOK); 1335 extra = ep->d_reclen - DIRSIZ(ep); 1336 } 1337 if (caching && (extra >= LDIRSIZ(1))) { 1338 caching = (dnlc_dir_add_space(dcap, extra, 1339 (uint64_t)offset) == DOK); 1340 } 1341 } 1342 1343 /* 1344 * If an appropriate sized slot has not yet been found, 1345 * check to see if one is available. 1346 */ 1347 if ((slotp->status != FOUND) && (slotp->status != EXIST)) { 1348 int size = ep->d_reclen; 1349 1350 if (ep->d_ino != 0) 1351 size -= DIRSIZ(ep); 1352 if (size > 0) { 1353 if (size >= needed) { 1354 slotp->offset = offset; 1355 slotp->size = ep->d_reclen; 1356 if (noentry) { 1357 slotp->ep = ep; 1358 slotp->fbp = fbp; 1359 slotp->status = FOUND; 1360 slotp->endoff = 0; 1361 return (0); 1362 } 1363 slotp->status = FOUND; 1364 } else if (slotp->status == NONE) { 1365 if (slotp->offset == -1) 1366 slotp->offset = offset; 1367 } 1368 } 1369 } 1370 /* 1371 * Check for a name match. 
1372 */ 1373 if (ep->d_ino && ep->d_namlen == namlen && 1374 *namep == *ep->d_name && /* fast chk 1st char */ 1375 bcmp(namep, ep->d_name, namlen) == 0) { 1376 1377 tdp->i_diroff = offset; 1378 1379 if (tdp->i_number == ep->d_ino) { 1380 *ipp = tdp; /* we want ourself, ie "." */ 1381 VN_HOLD(dvp); 1382 } else { 1383 err = ufs_iget_alloced(tdp->i_vfs, 1384 (ino_t)ep->d_ino, ipp, cr); 1385 if (err) { 1386 fbrelse(fbp, S_OTHER); 1387 if (caching) 1388 dnlc_dir_purge(dcap); 1389 return (err); 1390 } 1391 } 1392 slotp->status = EXIST; 1393 slotp->offset = offset; 1394 slotp->size = (int)(offset - last_offset); 1395 slotp->fbp = fbp; 1396 slotp->ep = ep; 1397 slotp->endoff = 0; 1398 if (caching) 1399 dnlc_dir_purge(dcap); 1400 return (0); 1401 } 1402 last_offset = offset; 1403 offset += ep->d_reclen; 1404 entryoffsetinblk += ep->d_reclen; 1405 if (ep->d_ino) 1406 enduseful = offset; 1407 } 1408 if (fbp) { 1409 fbrelse(fbp, S_OTHER); 1410 } 1411 1412 if (caching) { 1413 dnlc_dir_complete(dcap); 1414 slotp->cached = 1; 1415 if (slotp->status == FOUND) { 1416 if (initstat == FOUND) { 1417 return (0); 1418 } 1419 (void) dnlc_dir_rem_space_by_handle(dcap, 1420 slotp->offset); 1421 slotp->endoff = 0; 1422 return (0); 1423 } 1424 } 1425 1426 if (slotp->status == NONE) { 1427 /* 1428 * We didn't find a slot; the new directory entry should be put 1429 * at the end of the directory. Return an indication of where 1430 * this is, and set "endoff" to zero; since we're going to have 1431 * to extend the directory, we're certainly not going to 1432 * truncate it. 1433 */ 1434 slotp->offset = dirsize; 1435 slotp->size = DIRBLKSIZ; 1436 slotp->endoff = 0; 1437 } else { 1438 /* 1439 * We found a slot, and will return an indication of where that 1440 * slot is, as any new directory entry will be put there. 1441 * Since that slot will become a useful entry, if the last 1442 * useful entry we found was before this one, update the offset 1443 * of the last useful entry. 
1444 */ 1445 if (enduseful < slotp->offset + slotp->size) 1446 enduseful = slotp->offset + slotp->size; 1447 slotp->endoff = P2ROUNDUP_TYPED(enduseful, DIRBLKSIZ, off_t); 1448 } 1449 *ipp = NULL; 1450 return (0); 1451 } 1452 1453 uint64_t ufs_dirrename_retry_cnt; 1454 1455 /* 1456 * Rename the entry in the directory tdp so that it points to 1457 * sip instead of tip. 1458 */ 1459 static int 1460 ufs_dirrename( 1461 struct inode *sdp, /* parent directory of source */ 1462 struct inode *sip, /* source inode */ 1463 struct inode *tdp, /* parent directory of target */ 1464 char *namep, /* entry we are trying to change */ 1465 struct inode *tip, /* target inode */ 1466 struct ufs_slot *slotp, /* slot for entry */ 1467 struct cred *cr) /* credentials */ 1468 { 1469 vnode_t *tdvp; 1470 off_t offset; 1471 int err; 1472 int doingdirectory; 1473 1474 ASSERT(sdp->i_ufsvfs != NULL); 1475 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 1476 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 1477 /* 1478 * Short circuit rename of something to itself. 1479 */ 1480 if (sip->i_number == tip->i_number) { 1481 return (ESAME); /* special KLUDGE error code */ 1482 } 1483 1484 /* 1485 * We're locking 2 peer level locks, so must use tryenter 1486 * on the 2nd to avoid deadlocks that would occur 1487 * if we renamed a->b and b->a concurrently. 1488 */ 1489 retry: 1490 rw_enter(&tip->i_contents, RW_WRITER); 1491 if (!rw_tryenter(&sip->i_contents, RW_READER)) { 1492 /* 1493 * drop tip and wait (sleep) until we stand a chance 1494 * of holding sip 1495 */ 1496 rw_exit(&tip->i_contents); 1497 rw_enter(&sip->i_contents, RW_READER); 1498 /* 1499 * Reverse the lock grabs in case we have heavy 1500 * contention on the 2nd lock. 1501 */ 1502 if (!rw_tryenter(&tip->i_contents, RW_WRITER)) { 1503 ufs_dirrename_retry_cnt++; 1504 rw_exit(&sip->i_contents); 1505 goto retry; 1506 } 1507 } 1508 1509 /* 1510 * Check that everything is on the same filesystem. 
1511 */ 1512 if ((ITOV(tip)->v_vfsp != ITOV(tdp)->v_vfsp) || 1513 (ITOV(tip)->v_vfsp != ITOV(sip)->v_vfsp)) { 1514 err = EXDEV; /* XXX archaic */ 1515 goto out; 1516 } 1517 /* 1518 * Must have write permission to rewrite target entry. 1519 * Perform additional checks for sticky directories. 1520 */ 1521 if ((err = ufs_iaccess(tdp, IWRITE, cr, 0)) != 0 || 1522 (err = ufs_sticky_remove_access(tdp, tip, cr)) != 0) 1523 goto out; 1524 1525 /* 1526 * Ensure source and target are compatible (both directories 1527 * or both not directories). If target is a directory it must 1528 * be empty and have no links to it; in addition it must not 1529 * be a mount point, and both the source and target must be 1530 * writable. 1531 */ 1532 doingdirectory = (((sip->i_mode & IFMT) == IFDIR) || 1533 ((sip->i_mode & IFMT) == IFATTRDIR)); 1534 if (((tip->i_mode & IFMT) == IFDIR) || 1535 ((tip->i_mode & IFMT) == IFATTRDIR)) { 1536 if (!doingdirectory) { 1537 err = EISDIR; 1538 goto out; 1539 } 1540 /* 1541 * vn_vfsrlock will prevent mounts from using the directory 1542 * until we are done. 1543 */ 1544 if (vn_vfsrlock(ITOV(tip))) { 1545 err = EBUSY; 1546 goto out; 1547 } 1548 if (vn_mountedvfs(ITOV(tip)) != NULL) { 1549 vn_vfsunlock(ITOV(tip)); 1550 err = EBUSY; 1551 goto out; 1552 } 1553 if (!ufs_dirempty(tip, tdp->i_number, cr) || tip->i_nlink > 2) { 1554 vn_vfsunlock(ITOV(tip)); 1555 err = EEXIST; /* SIGH should be ENOTEMPTY */ 1556 goto out; 1557 } 1558 } else if (doingdirectory) { 1559 err = ENOTDIR; 1560 goto out; 1561 } 1562 1563 /* 1564 * Rewrite the inode pointer for target name entry 1565 * from the target inode (ip) to the source inode (sip). 1566 * This prevents the target entry from disappearing 1567 * during a crash. Mark the directory inode to reflect the changes. 
1568 */ 1569 tdvp = ITOV(tdp); 1570 slotp->ep->d_ino = (int32_t)sip->i_number; 1571 dnlc_update(tdvp, namep, ITOV(sip)); 1572 if (slotp->size) { 1573 offset = slotp->offset - slotp->size; 1574 } else { 1575 offset = slotp->offset + 1; 1576 } 1577 if (slotp->cached) { 1578 (void) dnlc_dir_update(&tdp->i_danchor, namep, 1579 INO_OFF_TO_H(slotp->ep->d_ino, offset)); 1580 } 1581 1582 err = TRANS_DIR(tdp, slotp->offset); 1583 if (err) 1584 fbrelse(slotp->fbp, S_OTHER); 1585 else 1586 err = ufs_fbwrite(slotp->fbp, tdp); 1587 1588 slotp->fbp = NULL; 1589 if (err) { 1590 if (doingdirectory) 1591 vn_vfsunlock(ITOV(tip)); 1592 goto out; 1593 } 1594 1595 TRANS_INODE(tdp->i_ufsvfs, tdp); 1596 tdp->i_flag |= IUPD|ICHG; 1597 tdp->i_seq++; 1598 ITIMES_NOLOCK(tdp); 1599 1600 /* 1601 * Decrement the link count of the target inode. 1602 * Fix the ".." entry in sip to point to dp. 1603 * This is done after the new entry is on the disk. 1604 */ 1605 tip->i_nlink--; 1606 TRANS_INODE(tip->i_ufsvfs, tip); 1607 tip->i_flag |= ICHG; 1608 tip->i_seq++; 1609 ITIMES_NOLOCK(tip); 1610 if (doingdirectory) { 1611 /* 1612 * The entry for tip no longer exists so I can unlock the 1613 * vfslock. 1614 */ 1615 vn_vfsunlock(ITOV(tip)); 1616 /* 1617 * Decrement target link count once more if it was a directory. 1618 */ 1619 if (--tip->i_nlink != 0) { 1620 err = ufs_fault(ITOV(tip), 1621 "ufs_dirrename: target directory link count != 0 (%s)", 1622 tip->i_fs->fs_fsmnt); 1623 rw_exit(&tip->i_contents); 1624 return (err); 1625 } 1626 TRANS_INODE(tip->i_ufsvfs, tip); 1627 ufs_setreclaim(tip); 1628 /* 1629 * Renaming a directory with the parent different 1630 * requires that ".." be rewritten. The window is 1631 * still there for ".." to be inconsistent, but this 1632 * is unavoidable, and a lot shorter than when it was 1633 * done in a user process. We decrement the link 1634 * count in the new parent as appropriate to reflect 1635 * the just-removed target. 
If the parent is the 1636 * same, this is appropriate since the original 1637 * directory is going away. If the new parent is 1638 * different, ufs_dirfixdotdot() will bump the link count 1639 * back. 1640 */ 1641 tdp->i_nlink--; 1642 ufs_setreclaim(tdp); 1643 TRANS_INODE(tdp->i_ufsvfs, tdp); 1644 tdp->i_flag |= ICHG; 1645 tdp->i_seq++; 1646 ITIMES_NOLOCK(tdp); 1647 if (sdp != tdp) { 1648 rw_exit(&tip->i_contents); 1649 rw_exit(&sip->i_contents); 1650 err = ufs_dirfixdotdot(sip, sdp, tdp); 1651 return (err); 1652 } 1653 } else 1654 ufs_setreclaim(tip); 1655 out: 1656 rw_exit(&tip->i_contents); 1657 rw_exit(&sip->i_contents); 1658 return (err); 1659 } 1660 1661 /* 1662 * Fix the ".." entry of the child directory so that it points 1663 * to the new parent directory instead of the old one. Routine 1664 * assumes that dp is a directory and that all the inodes are on 1665 * the same file system. 1666 */ 1667 static int 1668 ufs_dirfixdotdot( 1669 struct inode *dp, /* child directory */ 1670 struct inode *opdp, /* old parent directory */ 1671 struct inode *npdp) /* new parent directory */ 1672 { 1673 struct fbuf *fbp; 1674 struct dirtemplate *dirp; 1675 vnode_t *dvp; 1676 int err; 1677 1678 ASSERT(RW_WRITE_HELD(&npdp->i_rwlock)); 1679 ASSERT(RW_WRITE_HELD(&npdp->i_contents)); 1680 1681 /* 1682 * We hold the child directory's i_contents lock before calling 1683 * blkatoff so that we honor correct locking protocol which is 1684 * i_contents lock and then page lock. (blkatoff will call 1685 * ufs_getpage where we want the page lock) 1686 * We hold the child directory's i_rwlock before i_contents (as 1687 * per the locking protocol) since we are modifying the ".." entry 1688 * of the child directory. 1689 * We hold the i_rwlock and i_contents lock until we record 1690 * this directory delta to the log (via ufs_trans_dir) and have 1691 * done fbrelse. 
1692 */ 1693 rw_enter(&dp->i_rwlock, RW_WRITER); 1694 rw_enter(&dp->i_contents, RW_WRITER); 1695 err = blkatoff(dp, (off_t)0, (char **)&dirp, &fbp); 1696 if (err) 1697 goto bad; 1698 1699 if (dp->i_nlink <= 0 || 1700 dp->i_size < sizeof (struct dirtemplate)) { 1701 err = ENOENT; 1702 goto bad; 1703 } 1704 1705 if (dirp->dotdot_namlen != 2 || 1706 dirp->dotdot_name[0] != '.' || 1707 dirp->dotdot_name[1] != '.') { /* Sanity check. */ 1708 dirbad(dp, "mangled .. entry", (off_t)0); 1709 err = ENOTDIR; 1710 goto bad; 1711 } 1712 1713 /* 1714 * Increment the link count in the new parent inode and force it out. 1715 */ 1716 if (npdp->i_nlink == MAXLINK) { 1717 err = EMLINK; 1718 goto bad; 1719 } 1720 npdp->i_nlink++; 1721 TRANS_INODE(npdp->i_ufsvfs, npdp); 1722 npdp->i_flag |= ICHG; 1723 npdp->i_seq++; 1724 ufs_iupdat(npdp, I_SYNC); 1725 1726 /* 1727 * Rewrite the child ".." entry and force it out. 1728 */ 1729 dvp = ITOV(dp); 1730 dirp->dotdot_ino = (uint32_t)npdp->i_number; 1731 dnlc_update(dvp, "..", ITOV(npdp)); 1732 (void) dnlc_dir_update(&dp->i_danchor, "..", 1733 INO_OFF_TO_H(dirp->dotdot_ino, 0)); 1734 1735 err = TRANS_DIR(dp, 0); 1736 if (err) 1737 fbrelse(fbp, S_OTHER); 1738 else 1739 err = ufs_fbwrite(fbp, dp); 1740 1741 fbp = NULL; 1742 if (err) 1743 goto bad; 1744 1745 rw_exit(&dp->i_contents); 1746 rw_exit(&dp->i_rwlock); 1747 1748 /* 1749 * Decrement the link count of the old parent inode and force it out. 1750 */ 1751 ASSERT(opdp); 1752 rw_enter(&opdp->i_contents, RW_WRITER); 1753 ASSERT(opdp->i_nlink > 0); 1754 opdp->i_nlink--; 1755 ufs_setreclaim(opdp); 1756 TRANS_INODE(opdp->i_ufsvfs, opdp); 1757 opdp->i_flag |= ICHG; 1758 opdp->i_seq++; 1759 ufs_iupdat(opdp, I_SYNC); 1760 rw_exit(&opdp->i_contents); 1761 return (0); 1762 1763 bad: 1764 if (fbp) 1765 fbrelse(fbp, S_OTHER); 1766 rw_exit(&dp->i_contents); 1767 rw_exit(&dp->i_rwlock); 1768 return (err); 1769 } 1770 1771 /* 1772 * Enter the file sip in the directory tdp with name namep. 
 *
 * The caller holds tdp's i_rwlock and i_contents as writer (asserted).
 * slotp is the slot description for the new entry; it is first passed to
 * dirprepareentry(), which supplies slotp->ep and slotp->fbp.
 *
 * For op == DE_RENAME of a directory (IFDIR or IFATTRDIR) whose old parent
 * sdp differs from tdp, the child's ".." is fixed first through
 * ufs_dirfixdotdot().
 *
 * Returns 0 on success.  Returns EXDEV if sip is on a different vfs than
 * tdp, otherwise the error from dirprepareentry(), ufs_dirfixdotdot(),
 * TRANS_DIR() or ufs_fbwrite().  slotp->fbp is NULL on every return.
 */
static int
ufs_diraddentry(
	struct inode *tdp,
	char *namep,
	enum de_op op,
	int namlen,
	struct ufs_slot *slotp,
	struct inode *sip,
	struct inode *sdp,
	struct cred *cr)
{
	struct direct *ep, *nep;
	vnode_t *tdvp;
	dcanchor_t *dcap = &tdp->i_danchor;
	off_t offset;
	int err;
	ushort_t extra;

	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
	/*
	 * Prepare a new entry.  If the caller has not supplied an
	 * existing inode, make a new one.
	 */
	err = dirprepareentry(tdp, slotp, cr);
	if (err) {
		if (slotp->fbp) {
			fbrelse(slotp->fbp, S_OTHER);
			slotp->fbp = NULL;
		}
		return (err);
	}
	/*
	 * Check inode to be linked to see if it is in the
	 * same filesystem.
	 */
	if (ITOV(tdp)->v_vfsp != ITOV(sip)->v_vfsp) {
		err = EXDEV;
		goto bad;
	}

	/*
	 * If renaming a directory then fix up the ".." entry in the
	 * directory to point to the new parent.
	 */
	if ((op == DE_RENAME) && (((sip->i_mode & IFMT) == IFDIR) ||
	    ((sip->i_mode & IFMT) == IFATTRDIR)) && (sdp != tdp)) {
		err = ufs_dirfixdotdot(sip, sdp, tdp);
		if (err)
			goto bad;
	}

	/*
	 * Fill in entry data.
	 */
	ep = slotp->ep;
	ep->d_namlen = (ushort_t)namlen;
	/*
	 * The copy length is namlen + 1 rounded up to a multiple of 4;
	 * strncpy() fills whatever follows the name with NUL bytes.
	 */
	(void) strncpy(ep->d_name, namep, (size_t)((namlen + 4) & ~3));
	ep->d_ino = (uint32_t)sip->i_number;
	tdvp = ITOV(tdp);
	dnlc_update(tdvp, namep, ITOV(sip));
	/*
	 * Note the offset supplied for any named entry is
	 * the offset of the previous one, unless it's the 1st.
	 * slotp->size is used to pass the length to
	 * the previous entry.
	 */
	if (slotp->size) {
		offset = slotp->offset - slotp->size;
	} else {
		offset = slotp->offset + 1;
	}

	if (slotp->cached) {
		/*
		 * Add back any usable unused space to the dnlc directory
		 * cache.
		 */
		extra = ep->d_reclen - DIRSIZ(ep);
		if (extra >= LDIRSIZ(1)) {
			(void) dnlc_dir_add_space(dcap, extra,
			    (uint64_t)slotp->offset);
		}

		(void) dnlc_dir_add_entry(dcap, namep,
		    INO_OFF_TO_H(ep->d_ino, offset));

		/* adjust the previous offset of the next entry */
		nep = (struct direct *)((char *)ep + ep->d_reclen);
		if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
			/*
			 * Not a new block.
			 *
			 * Check the validity of the next entry.
			 * If it's bad, then throw away the cache, and
			 * continue as before directory caching.
			 */
			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
			    dnlc_dir_update(dcap, nep->d_name,
			    INO_OFF_TO_H(nep->d_ino, slotp->offset))
			    == DNOENT) {
				dnlc_dir_purge(dcap);
				slotp->cached = 0;
			}
		}
	}

	/*
	 * Write out the directory block.
	 */
	err = TRANS_DIR(tdp, slotp->offset);
	if (err)
		fbrelse(slotp->fbp, S_OTHER);
	else
		err = ufs_fbwrite(slotp->fbp, tdp);

	slotp->fbp = NULL;
	/*
	 * If this is a rename of a directory, then we have already
	 * fixed the ".." entry to refer to the new parent. If err
	 * is true at this point, we have failed to update the new
	 * parent to refer to the renamed directory.
	 * XXX - we need to unwind the ".." fix.
	 */
	if (err)
		return (err);

	/*
	 * Mark the directory inode to reflect the changes.
	 * Truncate the directory to chop off blocks of empty entries.
	 */

	TRANS_INODE(tdp->i_ufsvfs, tdp);
	tdp->i_flag |= IUPD|ICHG;
	tdp->i_seq++;
	tdp->i_diroff = 0;
	ITIMES_NOLOCK(tdp);
	/*
	 * If the directory grew then dirprepareentry() will have
	 * set IATTCHG in tdp->i_flag, then the directory inode must
	 * be flushed out. This is because if fsync() is used later
	 * the directory size must be correct, otherwise a crash would
	 * cause fsck to move the file to lost+found. Also because later
	 * a file may be linked in more than one directory, then there
	 * is no way to flush the original directory. So it must be
	 * flushed out on creation. See bug 4293809.
	 */
	if (tdp->i_flag & IATTCHG) {
		ufs_iupdat(tdp, I_SYNC);
	}

	/*
	 * An endoff of zero means "do not truncate"; the truncation is
	 * also skipped whenever TRANS_ISTRANS() is true for this file system.
	 */
	if (slotp->endoff && (slotp->endoff < tdp->i_size)) {
		if (!TRANS_ISTRANS(tdp->i_ufsvfs)) {
			(void) ufs_itrunc(tdp, (u_offset_t)slotp->endoff, 0,
			    cr);
		}
	}


	return (0);

bad:
	/*
	 * When the slot came from the directory cache, the cache is purged
	 * and the block is released without being rewritten.
	 */
	if (slotp->cached) {
		dnlc_dir_purge(dcap);
		fbrelse(slotp->fbp, S_OTHER);
		slotp->cached = 0;
		slotp->fbp = NULL;
		return (err);
	}

	/*
	 * Clear out entry prepared by dirprepareent.
	 */
	slotp->ep->d_ino = 0;
	slotp->ep->d_namlen = 0;

	/*
	 * Don't touch err so we don't clobber the real error that got us here.
	 */
	if (TRANS_DIR(tdp, slotp->offset))
		fbrelse(slotp->fbp, S_OTHER);
	else
		(void) ufs_fbwrite(slotp->fbp, tdp);
	slotp->fbp = NULL;
	return (err);
}

/*
 * Prepare a directory slot to receive an entry.
 *
 * slotp->status must be NONE or FOUND on entry (asserted):
 *   NONE  - a new DIRBLKSIZ block is allocated at slotp->offset, the
 *           directory size is extended, and the new record spans the
 *           whole block.
 *   FOUND - the record at slotp->offset is reused; if it is in use
 *           (d_ino != 0) it is split and the tail becomes the new record.
 * On success slotp->ep points at the record to fill in, slotp->fbp maps its
 * block, and slotp->size holds the length of the preceding record (0 when
 * the new record is used from its start).
 *
 * Returns 0, or the error from ufs_fault(), BMAPALLOC() or blkatoff().
 */
static int
dirprepareentry(
	struct inode *dp,	/* directory we are working in */
	struct ufs_slot *slotp,	/* available slot info */
	struct cred *cr)
{
	struct direct *ep, *nep;
	off_t entryend;
	int err;
	slotstat_t status = slotp->status;
	ushort_t dsize;

	ASSERT((status == NONE) || (status == FOUND));
	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&dp->i_contents));
	/*
	 * If we didn't find a slot, then indicate that the
	 * new slot belongs at the end of the directory.
	 * If we found a slot, then the new entry can be
	 * put at slotp->offset.
	 */
	entryend = slotp->offset + slotp->size;
	if (status == NONE) {
		ASSERT((slotp->offset & (DIRBLKSIZ - 1)) == 0);
		if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
			err = ufs_fault(ITOV(dp),
			    "dirprepareentry: bad fs_fsize, DIRBLKSIZ: %d"
			    " > dp->i_fs->fs_fsize: %d (%s)",
			    DIRBLKSIZ, dp->i_fs->fs_fsize, dp->i_fs->fs_fsmnt);
			return (err);
		}
		/*
		 * Allocate the new block.
		 */
		err = BMAPALLOC(dp, (u_offset_t)slotp->offset,
		    (int)(blkoff(dp->i_fs, slotp->offset) + DIRBLKSIZ), cr);
		if (err) {
			return (err);
		}
		dp->i_size = entryend;
		TRANS_INODE(dp->i_ufsvfs, dp);
		dp->i_flag |= IUPD|ICHG|IATTCHG;
		dp->i_seq++;
		ITIMES_NOLOCK(dp);
	} else if (entryend > dp->i_size) {
		/*
		 * Adjust directory size, if needed. This should never
		 * push the size past a new multiple of DIRBLKSIZ.
		 * This is an artifact of the old (4.2BSD) way of initializing
		 * directory sizes to be less than DIRBLKSIZ.
		 */
		dp->i_size = P2ROUNDUP_TYPED(entryend, DIRBLKSIZ, off_t);
		TRANS_INODE(dp->i_ufsvfs, dp);
		dp->i_flag |= IUPD|ICHG|IATTCHG;
		dp->i_seq++;
		ITIMES_NOLOCK(dp);
	}

	/*
	 * Get the block containing the space for the new directory entry.
	 */
	if (slotp->fbp == NULL) {
		err = blkatoff(dp, slotp->offset, (char **)&slotp->ep,
		    &slotp->fbp);
		if (err) {
			return (err);
		}
	}
	ep = slotp->ep;

	switch (status) {
	case NONE:
		/*
		 * No space in the directory. slotp->offset will be on a
		 * directory block boundary and we will write the new entry
		 * into a fresh block.
		 */
		ep->d_reclen = DIRBLKSIZ;
		slotp->size = 0; /* length of previous entry */
		break;
	case FOUND:
		/*
		 * An entry of the required size has been found. Use it.
		 */
		if (ep->d_ino == 0) {
			/* this is the 1st record in a block */
			slotp->size = 0; /* length of previous entry */
		} else {
			/*
			 * Split the in-use record: ep shrinks to DIRSIZ(ep)
			 * and nep takes over the rest of its old d_reclen.
			 */
			dsize = DIRSIZ(ep);
			nep = (struct direct *)((char *)ep + dsize);
			nep->d_reclen = ep->d_reclen - dsize;
			ep->d_reclen = dsize;
			slotp->ep = nep;
			slotp->offset += dsize;
			slotp->size = dsize; /* length of previous entry */
		}
		break;
	default:
		break;
	}
	return (0);
}

/*
 * Allocate and initialize a new inode that will go into directory tdp.
 * This routine is called from ufs_symlink(), as well as within this file.
 *
 * vap must carry AT_TYPE and AT_MODE, and op must be one of DE_CREATE,
 * DE_MKDIR, DE_ATTRDIR or DE_SYMLINK (both asserted).  The caller holds
 * tdp's i_rwlock and i_contents as writer (asserted).
 *
 * Returns 0 with *ipp set to the new inode, whose i_contents lock has been
 * dropped again.  On error:
 *   - if ufs_ialloc() fails, or the new inode unexpectedly has a dquot
 *     attached, *ipp is NULL;
 *   - on the "fail" exit *ipp has already been set to the new inode, whose
 *     link count is zeroed before returning.
 */
int
ufs_dirmakeinode(
	struct inode *tdp,
	struct inode **ipp,
	struct vattr *vap,
	enum de_op op,
	struct cred *cr)
{
	struct inode *ip;
	enum vtype type;
	int imode;			/* mode and format as in inode */
	ino_t ipref;
	int err;
	timestruc_t now;

	ASSERT(vap != NULL);
	ASSERT(op == DE_CREATE || op == DE_MKDIR || op == DE_ATTRDIR ||
	    op == DE_SYMLINK);
	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
	/*
	 * Allocate a new inode.
	 */
	type = vap->va_type;
	if (type == VDIR) {
		ipref = dirpref(tdp);
	} else {
		ipref = tdp->i_number;
	}
	/* For DE_ATTRDIR va_mode is used as is; otherwise type is merged in. */
	if (op == DE_ATTRDIR)
		imode = vap->va_mode;
	else
		imode = MAKEIMODE(type, vap->va_mode);
	*ipp = NULL;
	err = ufs_ialloc(tdp, ipref, imode, &ip, cr);
	if (err)
		return (err);

	/*
	 * We don't need to grab vfs_dqrwlock here because it is held
	 * in ufs_direnter_*() above us.
	 */
	ASSERT(RW_READ_HELD(&ip->i_ufsvfs->vfs_dqrwlock));
	rw_enter(&ip->i_contents, RW_WRITER);
	if (ip->i_dquot != NULL) {
		err = ufs_fault(ITOV(ip),
		    "ufs_dirmakeinode, ip->i_dquot != NULL: dquot (%s)",
		    tdp->i_fs->fs_fsmnt);
		rw_exit(&ip->i_contents);
		return (err);
	}
	*ipp = ip;
	ip->i_mode = (o_mode_t)imode;
	if (type == VBLK || type == VCHR) {
		dev_t d = vap->va_rdev;
		dev32_t dev32;

		/*
		 * Don't allow a special file to be created with a
		 * dev_t that cannot be represented by this filesystem
		 * format on disk.
		 */
		if (!cmpldev(&dev32, d)) {
			err = EOVERFLOW;
			goto fail;
		}

		ITOV(ip)->v_rdev = ip->i_rdev = d;

		if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
			ip->i_ordev = dev32; /* can't use old format */
		} else {
			ip->i_ordev = cmpdev(d);
		}
	}
	ITOV(ip)->v_type = type;
	ufs_reset_vnode(ip->i_vnode);
	if (type == VDIR) {
		ip->i_nlink = 2; /* anticipating a call to dirmakedirect */
	} else {
		ip->i_nlink = 1;
	}

	if (op == DE_ATTRDIR) {
		ip->i_uid = vap->va_uid;
		ip->i_gid = vap->va_gid;
	} else
		ip->i_uid = crgetuid(cr);
	/*
	 * To determine the group-id of the created file:
	 *   1) If the gid is set in the attribute list (non-Sun & pre-4.0
	 *	clients are not likely to set the gid), then use it if
	 *	the process is privileged, belongs to the target group,
	 *	or the group is the same as the parent directory.
	 *   2) If the filesystem was not mounted with the Old-BSD-compatible
	 *	GRPID option, and the directory's set-gid bit is clear,
	 *	then use the process's gid.
	 *   3) Otherwise, set the group-id to the gid of the parent directory.
	 */
	if (op != DE_ATTRDIR && (vap->va_mask & AT_GID) &&
	    ((vap->va_gid == tdp->i_gid) || groupmember(vap->va_gid, cr) ||
	    secpolicy_vnode_create_gid(cr) == 0)) {
		/*
		 * XXX - is this only the case when a 4.0 NFS client, or a
		 * client derived from that code, makes a call over the wire?
		 */
		ip->i_gid = vap->va_gid;
	} else
		ip->i_gid = (tdp->i_mode & ISGID) ? tdp->i_gid : crgetgid(cr);

	/*
	 * For SunOS 5.0->5.4, the lines below read:
	 *
	 * ip->i_suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
	 * ip->i_sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
	 *
	 * where MAXUID was set to 60002.  See notes on this in ufs_inode.c
	 */
	ip->i_suid =
	    (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ? UID_LONG : ip->i_uid;
	ip->i_sgid =
	    (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ? GID_LONG : ip->i_gid;

	/*
	 * If we're creating a directory, and the parent directory has the
	 * set-GID bit set, set it on the new directory.
	 * Otherwise, if the user is neither privileged nor a member of the
	 * file's new group, clear the file's set-GID bit.
	 */
	if ((tdp->i_mode & ISGID) && (type == VDIR))
		ip->i_mode |= ISGID;
	else {
		if ((ip->i_mode & ISGID) &&
		    secpolicy_vnode_setids_setgids(cr, ip->i_gid) != 0)
			ip->i_mode &= ~ISGID;
	}

	if (((vap->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
	    ((vap->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
		err = EOVERFLOW;
		goto fail;
	}

	/*
	 * Extended attribute directories are not subject to quotas.
	 */
	if (op != DE_ATTRDIR)
		ip->i_dquot = getinoquota(ip);
	else
		ip->i_dquot = NULL;

	if (op == DE_MKDIR || op == DE_ATTRDIR) {
		err = ufs_dirmakedirect(ip, tdp, (op == DE_MKDIR) ? 0 : 1, cr);
		if (err)
			goto fail;
	}

	/*
	 * generate the shadow inode and attach it to the new object
	 */
	ASSERT((tdp->i_shadow && tdp->i_ufs_acl) ||
	    (!tdp->i_shadow && !tdp->i_ufs_acl));
	if (tdp->i_shadow && tdp->i_ufs_acl &&
	    (((tdp->i_mode & IFMT) == IFDIR) ||
	    ((tdp->i_mode & IFMT) == IFATTRDIR))) {
		err = ufs_si_inherit(ip, tdp, ip->i_mode, cr);
		if (err) {
			if (op == DE_MKDIR) {
				/*
				 * clean up parent directory
				 *
				 * tdp->i_contents already locked from
				 * ufs_direnter_*()
				 */
				tdp->i_nlink--;
				TRANS_INODE(tdp->i_ufsvfs, tdp);
				tdp->i_flag |= ICHG;
				tdp->i_seq++;
				ufs_iupdat(tdp, I_SYNC);
			}
			goto fail;
		}
	}

	/*
	 * If the passed in attributes contain atime and/or mtime
	 * settings, then use them instead of using the current
	 * high resolution time.
	 */
	if (vap->va_mask & (AT_MTIME|AT_ATIME)) {
		if (vap->va_mask & AT_ATIME) {
			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
			ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
			ip->i_flag &= ~IACC;
		} else
			ip->i_flag |= IACC;
		if (vap->va_mask & AT_MTIME) {
			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
			ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
			gethrestime(&now);
			if (now.tv_sec > TIME32_MAX) {
				/*
				 * In 2038, ctime sticks forever..
				 */
				ip->i_ctime.tv_sec = TIME32_MAX;
				ip->i_ctime.tv_usec = 0;
			} else {
				ip->i_ctime.tv_sec = now.tv_sec;
				ip->i_ctime.tv_usec = now.tv_nsec / 1000;
			}
			ip->i_flag &= ~(IUPD|ICHG);
			ip->i_flag |= IMODTIME;
		} else
			ip->i_flag |= IUPD|ICHG;
		ip->i_flag |= IMOD;
	} else
		ip->i_flag |= IACC|IUPD|ICHG;
	ip->i_seq++;

	/*
	 * If this is an attribute, tag it as one.
	 */
	if ((tdp->i_mode & IFMT) == IFATTRDIR) {
		ip->i_cflags |= IXATTR;
	}

	/*
	 * push inode before its name appears in a directory
	 */
	TRANS_INODE(ip->i_ufsvfs, ip);
	ufs_iupdat(ip, I_SYNC);
	rw_exit(&ip->i_contents);
	return (0);

fail:
	/* Throw away inode we just allocated. */
	ip->i_nlink = 0;
	ufs_setreclaim(ip);
	TRANS_INODE(ip->i_ufsvfs, ip);
	ip->i_flag |= ICHG;
	ip->i_seq++;
	ITIMES_NOLOCK(ip);
	rw_exit(&ip->i_contents);
	return (err);
}

/*
 * Write a prototype directory into the empty inode ip, whose parent is dp.
 *
 * Allocates the first DIRBLKSIZ bytes of ip, accounts for the new ".."
 * reference in the parent's link count (skipped when attrdir is non-zero)
 * and writes the "." / ".." template into the new block.
 *
 * The caller must hold ip->i_contents, dp->i_rwlock and dp->i_contents
 * as writer (asserted below).
 *
 * Returns 0 on success or an error number.  Failures that occur after the
 * parent has been updated go through "fail", which undoes the link count
 * bump and pushes the parent inode again.
 */
static int
ufs_dirmakedirect(
	struct inode *ip,		/* new directory */
	struct inode *dp,		/* parent directory */
	int attrdir,
	struct cred *cr)
{
	struct dirtemplate *dirp;
	struct fbuf *fbp;
	int err;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&dp->i_contents));
	/*
	 * Allocate space for the directory we're creating.
	 */
	err = BMAPALLOC(ip, (u_offset_t)0, DIRBLKSIZ, cr);
	if (err)
		return (err);
	/*
	 * The fbread() below maps fs_fsize bytes, so a directory block
	 * must fit inside a single fragment.
	 */
	if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
		err = ufs_fault(ITOV(dp),
		    "ufs_dirmakedirect: bad fs_fsize, DIRBLKSIZ: %d > dp->i_fs->fs_fsize: %d (%s)",
		    DIRBLKSIZ, dp->i_fs->fs_fsize,
		    dp->i_fs->fs_fsmnt);
		return (err);
	}
	ip->i_size = DIRBLKSIZ;
	TRANS_INODE(ip->i_ufsvfs, ip);
	ip->i_flag |= IUPD|ICHG|IATTCHG;
	ip->i_seq++;
	ITIMES_NOLOCK(ip);
	/*
	 * Update the tdp link count and write out the change.
	 * This reflects the ".." entry we'll soon write.
	 *
	 * Note that the MAXLINK check is applied even when attrdir is
	 * non-zero, although the count is only incremented for regular
	 * directories.
	 */
	if (dp->i_nlink == MAXLINK)
		return (EMLINK);
	if (attrdir == 0)
		dp->i_nlink++;
	TRANS_INODE(dp->i_ufsvfs, dp);
	dp->i_flag |= ICHG;
	dp->i_seq++;
	ufs_iupdat(dp, I_SYNC);
	/*
	 * Initialize directory with "."
	 * and ".." from static template.
	 *
	 * Since the parent directory is locked, we don't have to
	 * worry about anything changing when we drop the write
	 * lock on (ip).
	 *
	 */
	err = fbread(ITOV(ip), (offset_t)0, (uint_t)ip->i_fs->fs_fsize,
	    S_READ, &fbp);

	if (err) {
		goto fail;
	}
	dirp = (struct dirtemplate *)fbp->fb_addr;
	/*
	 * Now initialize the directory we're creating
	 * with the "." and ".." entries.
	 */
	*dirp = mastertemplate;			/* structure assignment */
	dirp->dot_ino = (uint32_t)ip->i_number;
	dirp->dotdot_ino = (uint32_t)dp->i_number;

	err = TRANS_DIR(ip, 0);
	if (err) {
		/* the block will not be written, so just drop the mapping */
		fbrelse(fbp, S_OTHER);
		goto fail;
	}

	/*
	 * No fbrelse() follows ufs_fbwrite() on either outcome here;
	 * presumably ufs_fbwrite() disposes of fbp itself -- confirm
	 * against its definition.
	 */
	err = ufs_fbwrite(fbp, ip);
	if (err) {
		goto fail;
	}

	return (0);

fail:
	/* Undo the parent link count bump made above and push it out. */
	if (attrdir == 0)
		dp->i_nlink--;
	TRANS_INODE(dp->i_ufsvfs, dp);
	dp->i_flag |= ICHG;
	dp->i_seq++;
	ufs_iupdat(dp, I_SYNC);
	return (err);
}

/*
 * Delete a directory entry.  If oip is nonzero the entry is checked
 * to make sure it still reflects oip.
 *
 * If vpp is non-null, return the ptr of the (held) vnode associated with
 * the removed name.  The caller is responsible for doing the VN_RELE().
 *
 * The caller must hold dp->i_rwlock as writer.  The lock is still held on
 * return, but it may have been dropped and re-acquired in between if the
 * victim directory's i_rwlock could not be taken immediately (see the
 * retry path below).
 *
 * op selects the extra checks applied: DR_RMDIR requires an empty
 * directory that is neither dp itself nor cdir, DR_REMOVE requires
 * privilege to unlink a directory, and DR_RENAME is exempt from the
 * "is a mount point" check.
 *
 * Returns 0 on success or an error number.
 */
int
ufs_dirremove(
	struct inode *dp,
	char *namep,
	struct inode *oip,
	struct vnode *cdir,
	enum dr_op op,
	struct cred *cr,
	vnode_t **vpp)	/* Return (held) vnode ptr of removed file/dir */
{
	struct direct *ep, *pep, *nep;
	struct inode *ip;
	vnode_t *dvp, *vp;
	struct ufs_slot slot;
	int namlen;
	int err;
	int mode;
	ushort_t extra;

	namlen = (int)strlen(namep);
	if (namlen == 0) {
		struct fs *fs = dp->i_fs;

		cmn_err(CE_WARN, "%s: ufs_dirremove: attempted to remove"
		    " nameless file in directory (directory inode %llu)",
		    fs->fs_fsmnt, (u_longlong_t)dp->i_number);
		ASSERT(namlen != 0);

		return (ENOENT);
	}

	/*
	 * return error when removing . and ..
	 */
	if (namep[0] == '.') {
		if (namlen == 1)
			return (EINVAL);
		else if (namlen == 2 && namep[1] == '.') {
			return (EEXIST);	/* SIGH should be ENOTEMPTY */
		}
	}

	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));

retry:
	/*
	 * Check accessibility of directory.
	 */
	if (err = ufs_diraccess(dp, IEXEC|IWRITE, cr))
		return (err);

	ip = NULL;
	slot.fbp = NULL;
	slot.status = FOUND;	/* don't need to look for empty slot */
	rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
	rw_enter(&dp->i_contents, RW_WRITER);

	err = ufs_dircheckforname(dp, namep, namlen, &slot, &ip, cr, 0);
	if (err)
		goto out_novfs;
	if (ip == NULL) {
		err = ENOENT;
		goto out_novfs;
	}
	vp = ITOV(ip);
	if (oip && oip != ip) {
		err = ENOENT;
		goto out_novfs;
	}

	/*
	 * "mode" is latched here and decides, on every later exit path,
	 * whether ip->i_rwlock and the vfs read lock are held.
	 */
	mode = ip->i_mode & IFMT;
	if (mode == IFDIR || mode == IFATTRDIR) {

		/*
		 * vn_vfsrlock() prevents races between mount and rmdir.
		 */
		if (vn_vfsrlock(vp)) {
			err = EBUSY;
			goto out_novfs;
		}
		if (vn_mountedvfs(vp) != NULL && op != DR_RENAME) {
			err = EBUSY;
			goto out;
		}
		/*
		 * If we are removing a directory, get a lock on it.
		 * Taking a writer lock prevents a parallel ufs_dirlook from
		 * incorrectly entering a negative cache vnode entry in the dnlc
		 * If the directory is empty, it will stay empty until
		 * we can remove it.
		 */
		if (!rw_tryenter(&ip->i_rwlock, RW_WRITER)) {
			/*
			 * It is possible that a thread in rename would have
			 * acquired this rwlock. To prevent a deadlock we
			 * do a rw_tryenter. If we fail to get the lock
			 * we drop all the locks we have acquired, wait
			 * for 2 ticks and reacquire the
			 * directory's (dp) i_rwlock and try again.
			 * If we don't drop dp's i_rwlock then we will panic
			 * with a "Deadlock: cycle in blocking chain"
			 * since in ufs_dircheckpath we want dp's i_rwlock.
			 * dp is guaranteed to exist since ufs_dirremove is
			 * called after a VN_HOLD(dp) has been done.
			 */
			ufs_dirremove_retry_cnt++;
			vn_vfsunlock(vp);
			if (slot.fbp)
				fbrelse(slot.fbp, S_OTHER);
			rw_exit(&dp->i_contents);
			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
			rw_exit(&dp->i_rwlock);
			VN_RELE(vp);
			delay(2);
			rw_enter(&dp->i_rwlock, RW_WRITER);
			goto retry;
		}
	}
	rw_enter(&ip->i_contents, RW_READER);

	/*
	 * Now check the restrictions that apply on sticky directories.
	 */
	if ((err = ufs_sticky_remove_access(dp, ip, cr)) != 0) {
		rw_exit(&ip->i_contents);
		if (mode == IFDIR || mode == IFATTRDIR)
			rw_exit(&ip->i_rwlock);
		goto out;
	}

	if (op == DR_RMDIR) {
		/*
		 * For rmdir(2), some special checks are required.
		 * (a) Don't remove any alias of the parent (e.g. ".").
		 * (b) Don't remove the current directory.
		 * (c) Make sure the entry is (still) a directory.
		 * (d) Make sure the directory is empty.
		 */

		if (dp == ip || vp == cdir)
			err = EINVAL;
		else if (((ip->i_mode & IFMT) != IFDIR) &&
		    ((ip->i_mode & IFMT) != IFATTRDIR))
			err = ENOTDIR;
		else if ((ip->i_nlink > 2) ||
		    !ufs_dirempty(ip, dp->i_number, cr)) {
			err = EEXIST;	/* SIGH should be ENOTEMPTY */
		}

		if (err) {
			rw_exit(&ip->i_contents);
			if (mode == IFDIR || mode == IFATTRDIR)
				rw_exit(&ip->i_rwlock);
			goto out;
		}
	} else if (op == DR_REMOVE)  {
		/*
		 * unlink(2) requires a different check: allow only
		 * privileged users to unlink a directory.
		 */
		if (vp->v_type == VDIR &&
		    secpolicy_fs_linkdir(cr, vp->v_vfsp)) {
			err = EPERM;
			rw_exit(&ip->i_contents);
			/*
			 * Unconditional, unlike the other error paths:
			 * this relies on v_type == VDIR implying that
			 * i_rwlock was taken above.
			 */
			rw_exit(&ip->i_rwlock);
			goto out;
		}
	}

	rw_exit(&ip->i_contents);

	/*
	 * Remove the cache'd entry, if any.
	 */
	dvp = ITOV(dp);
	dnlc_remove(dvp, namep);
	ep = slot.ep;
	ep->d_ino = 0;

	if (slot.cached) {
		dcanchor_t *dcap = &dp->i_danchor;

		(void) dnlc_dir_rem_entry(dcap, namep, NULL);
		if (((int)ep->d_reclen - (int)DIRSIZ(ep)) >= LDIRSIZ(1)) {
			(void) dnlc_dir_rem_space_by_handle(dcap, slot.offset);
		}
		if (slot.offset & (DIRBLKSIZ - 1)) {
			/*
			 * Collapse new free space into previous entry.
			 * Note, the previous entry has already been
			 * validated in ufs_dircheckforname().
			 */
			ASSERT(slot.size);
			pep = (struct direct *)((char *)ep - slot.size);
			if ((pep->d_ino == 0) &&
			    ((uintptr_t)pep & (DIRBLKSIZ - 1))) {
				/*
				 * NOTE(review): this jump skips the
				 * "pep->d_reclen += ep->d_reclen" collapse
				 * performed on the other paths -- confirm
				 * that is intended.
				 */
				dnlc_dir_purge(dcap);
				slot.cached = 0;
				goto nocache;
			}
			if (pep->d_ino) {
				extra = pep->d_reclen - DIRSIZ(pep);
			} else {
				extra = pep->d_reclen;
			}
			if (extra >= LDIRSIZ(1)) {
				(void) dnlc_dir_rem_space_by_handle(dcap,
				    (uint64_t)(slot.offset - slot.size));
			}
			pep->d_reclen += ep->d_reclen;
			(void) dnlc_dir_add_space(dcap, extra + ep->d_reclen,
			    (uint64_t)(slot.offset - slot.size));
			/* adjust the previous pointer in the next entry */
			nep = (struct direct *)((char *)ep + ep->d_reclen);
			if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
				/*
				 * Not a new block.
				 *
				 * Check the validity of the entry.
				 * If it's bad, then throw away the cache and
				 * continue.
				 */
				if ((nep->d_reclen == 0) ||
				    (nep->d_reclen & 0x3) ||
				    (dnlc_dir_update(dcap, nep->d_name,
				    INO_OFF_TO_H(nep->d_ino,
				    slot.offset - slot.size)) == DNOENT)) {
					dnlc_dir_purge(dcap);
					slot.cached = 0;
				}
			}
		} else {
			(void) dnlc_dir_add_space(dcap, ep->d_reclen,
			    (uint64_t)slot.offset);
		}
	} else {
		/*
		 * If the entry isn't the first in the directory, we must
		 * reclaim the space of the now empty record by adding
		 * the record size to the size of the previous entry.
		 */
		if (slot.offset & (DIRBLKSIZ - 1)) {
			/*
			 * Collapse new free space into previous entry.
			 */
			pep = (struct direct *)((char *)ep - slot.size);
			pep->d_reclen += ep->d_reclen;
		}
	}
nocache:


	/*
	 * Write the modified directory block.  Either way slot.fbp is no
	 * longer ours afterwards, so clear it to keep the common exit
	 * path from releasing it a second time.
	 */
	err = TRANS_DIR(dp, slot.offset);
	if (err)
		fbrelse(slot.fbp, S_OTHER);
	else
		err = ufs_fbwrite(slot.fbp, dp);
	slot.fbp = NULL;

	/*
	 * If we were removing a directory, it is 'gone' now, but we cannot
	 * unlock it as a thread may be waiting for the lock in ufs_create. If
	 * we did, it could then create a file in a deleted directory.
	 */

	if (err) {
		if (mode == IFDIR || mode == IFATTRDIR)
			rw_exit(&ip->i_rwlock);
		goto out;
	}

	rw_enter(&ip->i_contents, RW_WRITER);

	dp->i_flag |= IUPD|ICHG;
	dp->i_seq++;
	ip->i_flag |= ICHG;
	ip->i_seq++;

	TRANS_INODE(dp->i_ufsvfs, dp);
	TRANS_INODE(ip->i_ufsvfs, ip);
	/*
	 * Now dispose of the inode.
	 */
	if (ip->i_nlink > 0) {
		/*
		 * This is not done for IFATTRDIR's because they don't
		 * have entries in the dnlc and the link counts are
		 * not incremented when they are created.
		 */
		if (op == DR_RMDIR && (ip->i_mode & IFMT) == IFDIR) {
			/*
			 * Decrement by 2 because we're trashing the "."
			 * entry as well as removing the entry in dp.
			 * Clear the directory entry, but there may be
			 * other hard links so don't free the inode.
			 * Decrement the dp linkcount because we're
			 * trashing the ".." entry.
			 */
			ip->i_nlink -= 2;
			dp->i_nlink--;
			ufs_setreclaim(dp);
			/*
			 * XXX need to discard negative cache entries
			 * for vp.  See comment in ufs_delete().
			 */
			dnlc_remove(vp, ".");
			dnlc_remove(vp, "..");
			/*
			 * The return value is ignored here because if
			 * the directory purge fails we don't want to
			 * stop the delete. If ufs_dirpurgedotdot fails
			 * the delete will continue with the preexisting
			 * behavior.
			 */
			(void) ufs_dirpurgedotdot(ip, dp->i_number, cr);
		} else {
			ip->i_nlink--;
		}
		ufs_setreclaim(ip);
	}
	ITIMES_NOLOCK(dp);
	ITIMES_NOLOCK(ip);

	if (!TRANS_ISTRANS(dp->i_ufsvfs))
		ufs_iupdat(dp, I_SYNC);
	if (!TRANS_ISTRANS(ip->i_ufsvfs))
		ufs_iupdat(ip, I_SYNC);

	rw_exit(&ip->i_contents);
	if (mode == IFDIR || mode == IFATTRDIR)
		rw_exit(&ip->i_rwlock);
out:
	/* Reached only after "mode" was set and vn_vfsrlock() succeeded. */
	if (mode == IFDIR || mode == IFATTRDIR) {
		vn_vfsunlock(vp);
	}
out_novfs:
	ASSERT(RW_WRITE_HELD(&dp->i_contents));

	if (slot.fbp)
		fbrelse(slot.fbp, S_OTHER);

	rw_exit(&dp->i_contents);
	rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);

	/*
	 * If no error and vpp is non-NULL, return the vnode ptr to the caller.
	 * The caller becomes responsible for the VN_RELE().  Otherwise,
	 * Release (and delete) the inode after we drop vfs_dqrwlock to
	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
	 *
	 * NOTE(review): vp is only assigned once ip is known to be non-NULL
	 * after a successful ufs_dircheckforname(); this assumes that
	 * routine never leaves ip set when it returns an error -- confirm.
	 */
	if (ip) {
		if ((err == 0) && (vpp != NULL)) {
			*vpp = ITOV(ip);
		} else {
			VN_RELE(vp);
		}
	}

	return (err);
}

/*
 * Return buffer with contents of block "offset"
 * from the beginning of directory "ip".
If "res" 2783 * is non-zero, fill it in with a pointer to the 2784 * remaining space in the directory. 2785 * 2786 */ 2787 2788 int 2789 blkatoff( 2790 struct inode *ip, 2791 off_t offset, 2792 char **res, 2793 struct fbuf **fbpp) 2794 { 2795 struct fs *fs; 2796 struct fbuf *fbp; 2797 daddr_t lbn; 2798 uint_t bsize; 2799 int err; 2800 2801 CPU_STATS_ADD_K(sys, ufsdirblk, 1); 2802 fs = ip->i_fs; 2803 lbn = (daddr_t)lblkno(fs, offset); 2804 bsize = (uint_t)blksize(fs, ip, lbn); 2805 err = fbread(ITOV(ip), (offset_t)(offset & fs->fs_bmask), 2806 bsize, S_READ, &fbp); 2807 if (err) { 2808 *fbpp = (struct fbuf *)NULL; 2809 return (err); 2810 } 2811 if (res) 2812 *res = fbp->fb_addr + blkoff(fs, offset); 2813 *fbpp = fbp; 2814 return (0); 2815 } 2816 2817 /* 2818 * Do consistency checking: 2819 * record length must be multiple of 4 2820 * entry must fit in rest of its DIRBLKSIZ block 2821 * record must be large enough to contain entry 2822 * name is not longer than MAXNAMLEN 2823 * name must be as long as advertised, and null terminated 2824 * NOTE: record length must not be zero (should be checked previously). 2825 * This routine is only called if dirchk is true. 2826 * It would be nice to set the FSBAD flag in the super-block when 2827 * this routine fails so that a fsck is forced on next reboot, 2828 * but locking is a problem. 
2829 */ 2830 static int 2831 dirmangled( 2832 struct inode *dp, 2833 struct direct *ep, 2834 int entryoffsetinblock, 2835 off_t offset) 2836 { 2837 int i; 2838 2839 i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); 2840 if ((ep->d_reclen & 0x3) != 0 || (int)ep->d_reclen > i || 2841 (uint_t)ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN || 2842 ep->d_ino && dirbadname(ep->d_name, (int)ep->d_namlen)) { 2843 dirbad(dp, "mangled entry", offset); 2844 return (1); 2845 } 2846 return (0); 2847 } 2848 2849 static void 2850 dirbad(struct inode *ip, char *how, off_t offset) 2851 { 2852 cmn_err(CE_NOTE, "%s: bad dir ino %d at offset %ld: %s", 2853 ip->i_fs->fs_fsmnt, (int)ip->i_number, offset, how); 2854 } 2855 2856 static int 2857 dirbadname(char *sp, int l) 2858 { 2859 while (l--) { /* check for nulls */ 2860 if (*sp++ == '\0') { 2861 return (1); 2862 } 2863 } 2864 return (*sp); /* check for terminating null */ 2865 } 2866 2867 /* 2868 * Check if a directory is empty or not. 2869 */ 2870 static int 2871 ufs_dirempty( 2872 struct inode *ip, 2873 ino_t parentino, 2874 struct cred *cr) 2875 { 2876 return (ufs_dirscan(ip, parentino, cr, 0)); 2877 } 2878 2879 /* 2880 * clear the .. directory entry. 2881 */ 2882 static int 2883 ufs_dirpurgedotdot( 2884 struct inode *ip, 2885 ino_t parentino, 2886 struct cred *cr) 2887 { 2888 return (ufs_dirscan(ip, parentino, cr, 1)); 2889 } 2890 2891 /* 2892 * Scan the directoy. If clr_dotdot is true clear the .. 2893 * directory else check to see if the directory is empty. 2894 * 2895 * Using a struct dirtemplate here is not precisely 2896 * what we want, but better than using a struct direct. 2897 * 2898 * clr_dotdot is used as a flag to tell us if we need 2899 * to clear the dotdot entry 2900 * 2901 * N.B.: does not handle corrupted directories. 
 *
 * Return value: with clr_dotdot clear, 1 if the directory holds nothing
 * but unused entries, "." and a ".." that refers to parentino, and 0
 * otherwise (including on a read error or a zero-length record).  With
 * clr_dotdot set, the scan stops at the first matching ".." and returns
 * whatever ufs_dirclrdotdot() returned.
 *
 * The caller must hold ip->i_contents.
 */
static int
ufs_dirscan(
	struct inode *ip,
	ino_t parentino,
	struct cred *cr,
	int clr_dotdot)
{
	offset_t off;
	struct dirtemplate dbuf;
	struct direct *dp = (struct direct *)&dbuf;
	int err, count;
	int empty = 1;	/* Assume it's empty */
	/* Only the leading part of each entry is read: half a dirtemplate. */
#define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)

	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
		/*
		 * Since we read MINDIRSIZ, residual must
		 * be 0 unless we're at end of file.
		 */
		if (err || count != 0 || dp->d_reclen == 0) {
			empty = 0;
			break;
		}
		/* skip empty entries */
		if (dp->d_ino == 0)
			continue;
		/* accept only "." and ".." */
		if (dp->d_namlen > 2 || dp->d_name[0] != '.') {
			empty = 0;
			break;
		}
		/*
		 * At this point d_namlen must be 1 or 2.
		 * 1 implies ".", 2 implies ".." if second
		 * char is also "."
		 */
		if (dp->d_namlen == 1)
			continue;
		if (dp->d_name[1] == '.' &&
		    (ino_t)dp->d_ino == parentino) {
			/*
			 * If we're doing a purge we need to check for
			 * the . and .. entries and clear the d_ino for ..
			 *
			 * if clr_dotdot is set ufs_dirscan does not
			 * check for an empty directory.
			 */
			if (clr_dotdot) {
				/*
				 * Have to actually zap the ..
				 * entry in the directory, as
				 * otherwise someone might have
				 * dp as its cwd and try to
				 * open .., which now points to
				 * an unallocated inode.
				 */
				empty = ufs_dirclrdotdot(ip, parentino);
				break;
			} else {
				continue;
			}
		}
		empty = 0;
		break;
	}
	return (empty);
}

clock_t retry_backoff_delay = 1;	/* delay before retrying the i_rwlock */
uint64_t dircheck_retry_cnt;	/* times ufs_dircheckpath backed off */
/*
 * Check if source directory inode is in the path of the target directory.
 * Target is supplied locked.
 *
 * The source and target inode's should be different upon entry.
 *
 * Walks from target towards the root by following each directory's ".."
 * entry.  Every intermediate directory is held and read-locked (i_rwlock)
 * while it is examined and released before moving on; target itself is
 * never unlocked or released here.
 *
 * Returns 0 if source_ino is not an ancestor of target, EINVAL if it is
 * (or if target is the source), ENOTDIR if a damaged directory is met,
 * EAGAIN if an intermediate lock is contended in a way that could
 * deadlock, or the error from blkatoff() / ufs_iget_alloced().
 */
int
ufs_dircheckpath(
	ino_t source_ino,
	struct inode *target,
	struct inode *sdp,
	struct cred *cr)
{
	struct fbuf *fbp;
	struct dirtemplate *dirp;
	struct inode *ip;
	struct ufsvfs *ufsvfsp;
	struct inode *tip;
	ino_t dotdotino;
	int err;

	ASSERT(target->i_ufsvfs != NULL);
	ASSERT(RW_LOCK_HELD(&target->i_rwlock));
	ASSERT(RW_LOCK_HELD(&sdp->i_rwlock));

	ip = target;
	if (ip->i_number == source_ino) {
		err = EINVAL;
		goto out;
	}
	if (ip->i_number == UFSROOTINO) {
		err = 0;
		goto out;
	}
	/*
	 * Search back through the directory tree, using the ".." entries.
	 * Fail any attempt to move a directory into an ancestor directory.
	 */
	fbp = NULL;
	for (;;) {
		struct vfs *vfs;

		err = blkatoff(ip, (off_t)0, (char **)&dirp, &fbp);
		if (err)
			break;
		if (((ip->i_mode & IFMT) != IFDIR) || ip->i_nlink == 0 ||
		    ip->i_size < sizeof (struct dirtemplate)) {
			dirbad(ip, "bad size, unlinked or not dir", (off_t)0);
			err = ENOTDIR;
			break;
		}
		if (dirp->dotdot_namlen != 2 ||
		    dirp->dotdot_name[0] != '.' ||
		    dirp->dotdot_name[1] != '.') {
			dirbad(ip, "mangled .. entry", (off_t)0);
			err = ENOTDIR;		/* Sanity check */
			break;
		}
		dotdotino = (ino_t)dirp->dotdot_ino;
		if (dotdotino == source_ino) {
			err = EINVAL;
			break;
		}
		/* Reached the root without meeting the source: err is 0. */
		if (dotdotino == UFSROOTINO)
			break;
		if (fbp) {
			fbrelse(fbp, S_OTHER);
			fbp = NULL;
		}
		/* Save these before ip is (possibly) released below. */
		vfs = ip->i_vfs;
		ufsvfsp = ip->i_ufsvfs;

		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
		/*
		 * Race to get the inode.
		 */
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
		if (err = ufs_iget_alloced(vfs, dotdotino, &tip, cr)) {
			rw_exit(&ufsvfsp->vfs_dqrwlock);
			/* nothing left for the exit path to unlock */
			ip = NULL;
			break;
		}
		rw_exit(&ufsvfsp->vfs_dqrwlock);
		/*
		 * If the directory of the source inode (also a directory)
		 * is the same as this next entry up the chain, then
		 * we know the source directory itself can't be in the
		 * chain.  This also prevents a panic because we already
		 * have sdp->i_rwlock locked.
		 */
		if (tip == sdp) {
			VN_RELE(ITOV(tip));
			ip = NULL;
			break;
		}
		ip = tip;

		/*
		 * If someone has set the WRITE_WANTED bit in this lock and if
		 * this happens to be a sdp or tdp of another parallel rename
		 * which is executing the same code and in similar situation
		 * we end up in a 4 way deadlock. We need to make sure that
		 * the WRITE_WANTED bit is not set.
		 */
retry_lock:
		if (!rw_tryenter(&ip->i_rwlock, RW_READER)) {
			/*
			 * If the lock held as WRITER that's fine but if it
			 * has WRITE_WANTED bit set we might end up in a
			 * deadlock. If WRITE_WANTED is set we return
			 * with EAGAIN else we just go back and try.
			 */
			if (RW_ISWRITER(&ip->i_rwlock) &&
			    !(RW_WRITE_HELD(&ip->i_rwlock))) {
				err = EAGAIN;
				if (fbp) {
					fbrelse(fbp, S_OTHER);
				}
				/*
				 * ip's i_rwlock was never obtained, so only
				 * the hold is dropped; return directly
				 * instead of going through "out".
				 */
				VN_RELE(ITOV(ip));
				return (err);
			} else {
				/*
				 * The lock is being write held. We could
				 * just do a rw_enter here but there is a
				 * window between the check and now, where
				 * the status could have changed, so to
				 * avoid looping we backoff and go back to
				 * try for the lock.
				 */
				delay(retry_backoff_delay);
				dircheck_retry_cnt++;
				goto retry_lock;
			}
		}
	}
	if (fbp) {
		fbrelse(fbp, S_OTHER);
	}
out:
	/* Drop the lock and hold on an intermediate directory, if any. */
	if (ip) {
		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
	}
	return (err);
}

/*
 * Check whether an extended attribute directory is empty.
 *
 * Returns 1 if every entry in ip is either unused, a "." entry whose
 * inode number equals parentino, or a ".." entry; returns 0 as soon as
 * anything else is found, on a read error, or on a zero-length record.
 *
 * The caller must hold ip->i_contents.
 */
int
ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr)
{
	offset_t off;
	struct dirtemplate dbuf;
	struct direct *dp = (struct direct *)&dbuf;
	int err, count;
	int empty = 1;	/* Assume it's empty */
	/* Same definition as in ufs_dirscan(): half a dirtemplate. */
#define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)

	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
		/*
		 * Since we read MINDIRSIZ, residual must
		 * be 0 unless we're at end of file.
		 */

		if (err || count != 0 || dp->d_reclen == 0) {
			empty = 0;
			break;
		}
		/* skip empty entries */
		if (dp->d_ino == 0)
			continue;
		/*
		 * Unlike ufs_dirscan(), d_namlen has not been bounded at
		 * this point, so each test below checks it explicitly.
		 * "." is tolerated only when its inode number equals
		 * parentino; ".." is tolerated whatever it refers to.
		 */

		if (dp->d_namlen == 1 && dp->d_name[0] == '.' &&
		    (ino_t)dp->d_ino == parentino)
			continue;

		if (dp->d_namlen == 2 && dp->d_name[0] == '.' &&
		    dp->d_name[1] == '.') {
			continue;
		}
		empty = 0;
		break;
	}
	return (empty);
}


/*
 * Allocate and initialize a new shadow inode to contain extended attributes.
3180 */ 3181 int 3182 ufs_xattrmkdir( 3183 struct inode *tdp, 3184 struct inode **ipp, 3185 int flags, 3186 struct cred *cr) 3187 { 3188 struct inode *ip; 3189 struct vattr va; 3190 int err; 3191 int retry = 1; 3192 struct ufsvfs *ufsvfsp; 3193 struct ulockfs *ulp; 3194 int issync; 3195 int trans_size; 3196 int dorwlock; /* 0 = not yet taken, */ 3197 /* 1 = taken outside the transaction, */ 3198 /* 2 = taken inside the transaction */ 3199 3200 /* 3201 * Validate permission to create attribute directory 3202 */ 3203 3204 if ((err = ufs_iaccess(tdp, IWRITE, cr, 1)) != 0) { 3205 return (err); 3206 } 3207 3208 if (vn_is_readonly(ITOV(tdp))) 3209 return (EROFS); 3210 3211 /* 3212 * No need to re-init err after again:, since it's set before 3213 * the next use of it. 3214 */ 3215 again: 3216 dorwlock = 0; 3217 va.va_type = VDIR; 3218 va.va_uid = tdp->i_uid; 3219 va.va_gid = tdp->i_gid; 3220 3221 if ((tdp->i_mode & IFMT) == IFDIR) { 3222 va.va_mode = (o_mode_t)IFATTRDIR; 3223 va.va_mode |= tdp->i_mode & 0777; 3224 } else { 3225 va.va_mode = (o_mode_t)IFATTRDIR|0700; 3226 if (tdp->i_mode & 0040) 3227 va.va_mode |= 0750; 3228 if (tdp->i_mode & 0004) 3229 va.va_mode |= 0705; 3230 } 3231 va.va_mask = AT_TYPE|AT_MODE; 3232 3233 ufsvfsp = tdp->i_ufsvfs; 3234 3235 err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK); 3236 if (err) 3237 return (err); 3238 3239 /* 3240 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file. 3241 * This follows the protocol for read()/write(). 3242 */ 3243 if (ITOV(tdp)->v_type != VDIR) { 3244 rw_enter(&tdp->i_rwlock, RW_WRITER); 3245 dorwlock = 1; 3246 } 3247 3248 if (ulp) { 3249 trans_size = (int)TOP_MKDIR_SIZE(tdp); 3250 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, trans_size); 3251 } 3252 3253 /* 3254 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory. 3255 * This follows the protocol established by 3256 * ufs_link/create/remove/rename/mkdir/rmdir/symlink. 
3257 */ 3258 if (dorwlock == 0) { 3259 rw_enter(&tdp->i_rwlock, RW_WRITER); 3260 dorwlock = 2; 3261 } 3262 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 3263 rw_enter(&tdp->i_contents, RW_WRITER); 3264 3265 /* 3266 * Suppress out of inodes messages if we will retry. 3267 */ 3268 if (retry) 3269 tdp->i_flag |= IQUIET; 3270 err = ufs_dirmakeinode(tdp, &ip, &va, DE_ATTRDIR, cr); 3271 tdp->i_flag &= ~IQUIET; 3272 3273 if (err) 3274 goto fail; 3275 3276 if (flags) { 3277 3278 /* 3279 * Now attach it to src file. 3280 */ 3281 3282 tdp->i_oeftflag = ip->i_number; 3283 } 3284 3285 ip->i_cflags |= IXATTR; 3286 ITOV(ip)->v_flag |= V_XATTRDIR; 3287 TRANS_INODE(ufsvfsp, tdp); 3288 tdp->i_flag |= ICHG | IUPD; 3289 tdp->i_seq++; 3290 ufs_iupdat(tdp, I_SYNC); 3291 rw_exit(&tdp->i_contents); 3292 rw_exit(&ufsvfsp->vfs_dqrwlock); 3293 3294 rw_enter(&ip->i_rwlock, RW_WRITER); 3295 rw_enter(&ip->i_contents, RW_WRITER); 3296 TRANS_INODE(ufsvfsp, ip); 3297 ip->i_flag |= ICHG| IUPD; 3298 ip->i_seq++; 3299 ufs_iupdat(ip, I_SYNC); 3300 rw_exit(&ip->i_contents); 3301 rw_exit(&ip->i_rwlock); 3302 if (dorwlock == 2) 3303 rw_exit(&tdp->i_rwlock); 3304 if (ulp) { 3305 int terr = 0; 3306 3307 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size); 3308 ufs_lockfs_end(ulp); 3309 if (err == 0) 3310 err = terr; 3311 } 3312 if (dorwlock == 1) 3313 rw_exit(&tdp->i_rwlock); 3314 *ipp = ip; 3315 return (err); 3316 3317 fail: 3318 rw_exit(&tdp->i_contents); 3319 rw_exit(&ufsvfsp->vfs_dqrwlock); 3320 if (dorwlock == 2) 3321 rw_exit(&tdp->i_rwlock); 3322 if (ulp) { 3323 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size); 3324 ufs_lockfs_end(ulp); 3325 } 3326 if (dorwlock == 1) 3327 rw_exit(&tdp->i_rwlock); 3328 if (ip != NULL) 3329 VN_RELE(ITOV(ip)); 3330 3331 /* 3332 * No inodes? See if any are tied up in pending deletions. 3333 * This has to be done outside of any of the above, because 3334 * the draining operation can't be done from inside a transaction. 
3335 */ 3336 if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 3337 ufs_delete_drain_wait(ufsvfsp, 1); 3338 retry = 0; 3339 goto again; 3340 } 3341 3342 return (err); 3343 } 3344 3345 /* 3346 * clear the dotdot directory entry. 3347 * Used by ufs_dirscan when clr_dotdot 3348 * flag is set and we're deleting a 3349 * directory. 3350 */ 3351 static int 3352 ufs_dirclrdotdot(struct inode *ip, ino_t parentino) 3353 { 3354 struct fbuf *fbp; 3355 struct direct *dotp, *dotdotp; 3356 int err = 0; 3357 3358 ASSERT(RW_WRITE_HELD(&ip->i_rwlock)); 3359 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 3360 err = blkatoff(ip, 0, NULL, &fbp); 3361 if (err) { 3362 return (err); 3363 } 3364 3365 dotp = (struct direct *)fbp->fb_addr; 3366 if ((dotp->d_namlen < (MAXNAMLEN + 1)) && 3367 ((DIRBLKSIZ - DIRSIZ(dotp)) >= (sizeof (struct dirtemplate) / 2))) { 3368 dotdotp = (struct direct *)((char *)dotp + dotp->d_reclen); 3369 if ((dotdotp->d_namlen < (MAXNAMLEN + 1)) && 3370 ((DIRBLKSIZ - DIRSIZ(dotp)) >= dotdotp->d_reclen)) { 3371 3372 dotp->d_reclen += dotdotp->d_reclen; 3373 if (parentino == dotdotp->d_ino) { 3374 dotdotp->d_ino = 0; 3375 dotdotp->d_namlen = 0; 3376 dotdotp->d_reclen = 0; 3377 } 3378 3379 err = TRANS_DIR(ip, 0); 3380 if (err) { 3381 fbrelse(fbp, S_OTHER); 3382 } else { 3383 err = ufs_fbwrite(fbp, ip); 3384 } 3385 } 3386 } else { 3387 err = -1; 3388 } 3389 return (err); 3390 } 3391