/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1984, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * Directory manipulation routines.
 *
 * When manipulating directories, the i_rwlock provides serialization
 * since directories cannot be mmapped. The i_contents lock is redundant.
43 */ 44 45 #include <sys/types.h> 46 #include <sys/t_lock.h> 47 #include <sys/param.h> 48 #include <sys/systm.h> 49 #include <sys/signal.h> 50 #include <sys/cred.h> 51 #include <sys/proc.h> 52 #include <sys/disp.h> 53 #include <sys/user.h> 54 #include <sys/vfs.h> 55 #include <sys/vnode.h> 56 #include <sys/stat.h> 57 #include <sys/mode.h> 58 #include <sys/buf.h> 59 #include <sys/uio.h> 60 #include <sys/dnlc.h> 61 #include <sys/fs/ufs_inode.h> 62 #include <sys/fs/ufs_fs.h> 63 #include <sys/mount.h> 64 #include <sys/fs/ufs_fsdir.h> 65 #include <sys/fs/ufs_trans.h> 66 #include <sys/fs/ufs_panic.h> 67 #include <sys/fs/ufs_quota.h> 68 #include <sys/errno.h> 69 #include <sys/debug.h> 70 #include <vm/seg.h> 71 #include <sys/sysmacros.h> 72 #include <sys/cmn_err.h> 73 #include <sys/cpuvar.h> 74 #include <sys/unistd.h> 75 #include <sys/policy.h> 76 77 /* 78 * This is required since we're using P2ROUNDUP_TYPED on DIRBLKSIZ 79 */ 80 #if !ISP2(DIRBLKSIZ) 81 #error "DIRBLKSIZ not a power of 2" 82 #endif 83 84 /* 85 * A virgin directory. 86 */ 87 static struct dirtemplate mastertemplate = { 88 0, 12, 1, ".", 89 0, DIRBLKSIZ - 12, 2, ".." 90 }; 91 92 #define LDIRSIZ(len) \ 93 ((sizeof (struct direct) - (MAXNAMLEN + 1)) + ((len + 1 + 3) &~ 3)) 94 #define MAX_DIR_NAME_LEN(len) \ 95 (((len) - (sizeof (struct direct) - (MAXNAMLEN + 1))) - 1) 96 97 /* 98 * The dnlc directory cache allows a 64 bit handle for directory entries. 99 * For ufs we squeeze both the 32 bit inumber and a 32 bit disk offset 100 * into the handle. Note, a 32 bit offset allows a 4GB directory, which 101 * is way beyond what could be cached in memory by the directory 102 * caching routines. So we are quite safe with this limit. 103 * The macros below pack and unpack the handle. 
104 */ 105 #define H_TO_INO(h) (uint32_t)((h) & UINT_MAX) 106 #define H_TO_OFF(h) (off_t)((h) >> 32) 107 #define INO_OFF_TO_H(ino, off) (uint64_t)(((uint64_t)(off) << 32) | (ino)) 108 109 /* 110 * The average size of a typical on disk directory entry is about 16 bytes 111 * and so defines AV_DIRECT_SHIFT : log2(16) 112 * This define is only used to approximate the number of entries 113 * is a directory. This is needed for dnlc_dir_start() which will immediately 114 * return an error if the value is not within its acceptable range of 115 * number of files in a directory. 116 */ 117 #define AV_DIRECT_SHIFT 4 118 /* 119 * If the directory size (from i_size) is greater than the ufs_min_dir_cache 120 * tunable then we request dnlc directory caching. 121 * This has found to be profitable after 1024 file names. 122 */ 123 int ufs_min_dir_cache = 1024 << AV_DIRECT_SHIFT; 124 125 /* The time point the dnlc directory caching was disabled */ 126 static hrtime_t ufs_dc_disable_at; 127 /* directory caching disable duration */ 128 static hrtime_t ufs_dc_disable_duration = (hrtime_t)NANOSEC * 5; 129 130 #ifdef DEBUG 131 int dirchk = 1; 132 #else /* !DEBUG */ 133 int dirchk = 0; 134 #endif /* DEBUG */ 135 int ufs_negative_cache = 1; 136 uint64_t ufs_dirremove_retry_cnt; 137 138 static void dirbad(); 139 static int ufs_dirrename(); 140 static int ufs_diraddentry(); 141 static int ufs_dirempty(); 142 static int ufs_dirscan(); 143 static int ufs_dirclrdotdot(); 144 static int ufs_dirfixdotdot(); 145 static int ufs_dirpurgedotdot(); 146 static int dirprepareentry(); 147 static int ufs_dirmakedirect(); 148 static int dirbadname(); 149 static int dirmangled(); 150 151 /* 152 * Check accessibility of directory against inquired mode and type. 153 * Execute access is required to search the directory. 154 * Access for write is interpreted as allowing 155 * deletion of files in the directory. 156 * Note, the reader i_contents lock will be acquired in 157 * ufs_iaccess(). 
158 */ 159 int 160 ufs_diraccess(struct inode *ip, int mode, struct cred *cr) 161 { 162 if (((ip->i_mode & IFMT) != IFDIR) && 163 ((ip->i_mode & IFMT) != IFATTRDIR)) 164 return (ENOTDIR); 165 166 return (ufs_iaccess(ip, mode, cr, 1)); 167 } 168 169 /* 170 * Look for a given name in a directory. On successful return, *ipp 171 * will point to the VN_HELD inode. 172 * The caller is responsible for checking accessibility upfront 173 * via ufs_diraccess(). 174 */ 175 int 176 ufs_dirlook( 177 struct inode *dp, 178 char *namep, 179 struct inode **ipp, 180 struct cred *cr, 181 int skipdnlc, /* skip the 1st level dnlc */ 182 int skipcaching) /* force directory caching off */ 183 { 184 uint64_t handle; 185 struct fbuf *fbp; /* a buffer of directory entries */ 186 struct direct *ep; /* the current directory entry */ 187 struct vnode *vp; 188 struct vnode *dvp; /* directory vnode ptr */ 189 struct ulockfs *ulp; 190 dcanchor_t *dcap; 191 off_t endsearch; /* offset to end directory search */ 192 off_t offset; 193 off_t start_off; /* starting offset from middle search */ 194 off_t last_offset; /* last offset */ 195 int entryoffsetinblock; /* offset of ep in addr's buffer */ 196 int numdirpasses; /* strategy for directory search */ 197 int namlen; /* length of name */ 198 int err; 199 int doingchk; 200 int i; 201 int caching; 202 int indeadlock; 203 ino_t ep_ino; /* entry i number */ 204 ino_t chkino; 205 ushort_t ep_reclen; /* direct local d_reclen */ 206 207 ASSERT(*namep != '\0'); /* All callers ensure *namep is non null */ 208 209 if (dp->i_ufsvfs) 210 ulp = &dp->i_ufsvfs->vfs_ulockfs; 211 212 /* 213 * Check the directory name lookup cache, first for individual files 214 * then for complete directories. 
215 */ 216 dvp = ITOV(dp); 217 if (!skipdnlc && (vp = dnlc_lookup(dvp, namep))) { 218 /* vp is already held from dnlc_lookup */ 219 if (vp == DNLC_NO_VNODE) { 220 VN_RELE(vp); 221 return (ENOENT); 222 } 223 *ipp = VTOI(vp); 224 return (0); 225 } 226 227 dcap = &dp->i_danchor; 228 229 /* 230 * Grab the reader lock on the directory data before checking 231 * the dnlc to avoid a race with ufs_dirremove() & friends. 232 * 233 * ufs_tryirwlock uses rw_tryenter and checks for SLOCK to 234 * avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock 235 * possible, retries the operation. 236 */ 237 ufs_tryirwlock((&dp->i_rwlock), RW_READER, retry_dircache); 238 if (indeadlock) 239 return (EAGAIN); 240 241 switch (dnlc_dir_lookup(dcap, namep, &handle)) { 242 case DFOUND: 243 ep_ino = (ino_t)H_TO_INO(handle); 244 if (dp->i_number == ep_ino) { 245 VN_HOLD(dvp); /* want ourself, "." */ 246 *ipp = dp; 247 rw_exit(&dp->i_rwlock); 248 return (0); 249 } 250 if (namep[0] == '.' && namep[1] == '.' && namep[2] == 0) { 251 uint64_t handle2; 252 /* 253 * release the lock on the dir we are searching 254 * to avoid a deadlock when grabbing the 255 * i_contents lock in ufs_iget_alloced(). 256 */ 257 rw_exit(&dp->i_rwlock); 258 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER); 259 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr); 260 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 261 /* 262 * must recheck as we dropped dp->i_rwlock 263 */ 264 ufs_tryirwlock(&dp->i_rwlock, RW_READER, retry_parent); 265 if (indeadlock) { 266 if (!err) 267 VN_RELE(ITOV(*ipp)); 268 return (EAGAIN); 269 } 270 if (!err && (dnlc_dir_lookup(dcap, namep, &handle2) 271 == DFOUND) && (handle == handle2)) { 272 dnlc_update(dvp, namep, ITOV(*ipp)); 273 rw_exit(&dp->i_rwlock); 274 return (0); 275 } 276 /* check failed, read the actual directory */ 277 if (!err) { 278 VN_RELE(ITOV(*ipp)); 279 } 280 goto restart; 281 } 282 /* usual case of not "." nor ".." 
*/ 283 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER); 284 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr); 285 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 286 if (err) { 287 rw_exit(&dp->i_rwlock); 288 return (err); 289 } 290 dnlc_update(dvp, namep, ITOV(*ipp)); 291 rw_exit(&dp->i_rwlock); 292 return (0); 293 case DNOENT: 294 if (ufs_negative_cache && (dp->i_nlink > 0)) { 295 dnlc_enter(dvp, namep, DNLC_NO_VNODE); 296 } 297 rw_exit(&dp->i_rwlock); 298 return (ENOENT); 299 default: 300 break; 301 } 302 restart: 303 304 fbp = NULL; 305 doingchk = 0; 306 chkino = 0; 307 caching = 0; 308 309 /* 310 * Attempt to cache any directories greater than the tunable 311 * ufs_min_cache_dir. If it fails due to memory shortage (DNOMEM), 312 * disable caching for this directory and record the system time. 313 * Any attempt after the disable time has expired will enable 314 * the caching again. 315 */ 316 if (!skipcaching && (dp->i_size >= ufs_min_dir_cache)) { 317 /* 318 * if the directory caching disable time has expired 319 * enable the caching again. 320 */ 321 if (dp->i_cachedir == CD_DISABLED_NOMEM && 322 gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) { 323 ufs_dc_disable_at = 0; 324 dp->i_cachedir = CD_ENABLED; 325 } 326 if (dp->i_cachedir == CD_ENABLED) { 327 switch (dnlc_dir_start(dcap, dp->i_size >> 328 AV_DIRECT_SHIFT)) { 329 case DNOMEM: 330 dp->i_cachedir = CD_DISABLED_NOMEM; 331 ufs_dc_disable_at = gethrtime(); 332 break; 333 case DTOOBIG: 334 dp->i_cachedir = CD_DISABLED_TOOBIG; 335 break; 336 case DOK: 337 caching = 1; 338 break; 339 default: 340 break; 341 } 342 } 343 } 344 /* 345 * If caching we don't stop when the file has been 346 * found, but need to know later, so clear *ipp now 347 */ 348 *ipp = NULL; 349 350 recheck: 351 if (caching) { 352 offset = 0; 353 entryoffsetinblock = 0; 354 numdirpasses = 1; 355 } else { 356 /* 357 * Take care to look at dp->i_diroff only once, as it 358 * may be changing due to other threads/cpus. 
359 */ 360 offset = dp->i_diroff; 361 if (offset > dp->i_size) { 362 offset = 0; 363 } 364 if (offset == 0) { 365 entryoffsetinblock = 0; 366 numdirpasses = 1; 367 } else { 368 start_off = offset; 369 370 entryoffsetinblock = blkoff(dp->i_fs, offset); 371 if (entryoffsetinblock != 0) { 372 err = blkatoff(dp, offset, (char **)0, &fbp); 373 if (err) 374 goto bad; 375 } 376 numdirpasses = 2; 377 } 378 } 379 endsearch = P2ROUNDUP_TYPED(dp->i_size, DIRBLKSIZ, u_offset_t); 380 namlen = strlen(namep); 381 last_offset = 0; 382 383 searchloop: 384 while (offset < endsearch) { 385 /* 386 * If offset is on a block boundary, 387 * read the next directory block. 388 * Release previous if it exists. 389 */ 390 if (blkoff(dp->i_fs, offset) == 0) { 391 if (fbp != NULL) { 392 fbrelse(fbp, S_OTHER); 393 } 394 err = blkatoff(dp, offset, (char **)0, &fbp); 395 if (err) 396 goto bad; 397 entryoffsetinblock = 0; 398 } 399 400 /* 401 * If the offset to the next entry is invalid or if the 402 * next entry is a zero length record or if the record 403 * length is invalid, then skip to the next directory 404 * block. Complete validation checks are done if the 405 * record length is invalid. 406 * 407 * Full validation checks are slow so they are disabled 408 * by default. Complete checks can be run by patching 409 * "dirchk" to be true. 410 * 411 * We have to check the validity of entryoffsetinblock 412 * here because it can be set to i_diroff above. 
413 */ 414 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblock); 415 if ((entryoffsetinblock & 0x3) || ep->d_reclen == 0 || 416 (dirchk || (ep->d_reclen & 0x3)) && 417 dirmangled(dp, ep, entryoffsetinblock, offset)) { 418 i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); 419 offset += i; 420 entryoffsetinblock += i; 421 if (caching) { 422 dnlc_dir_purge(dcap); 423 caching = 0; 424 } 425 continue; 426 } 427 428 ep_reclen = ep->d_reclen; 429 430 /* 431 * Add named entries and free space into the directory cache 432 */ 433 if (caching) { 434 ushort_t extra; 435 off_t off2; 436 437 if (ep->d_ino == 0) { 438 extra = ep_reclen; 439 if (offset & (DIRBLKSIZ - 1)) { 440 dnlc_dir_purge(dcap); 441 dp->i_cachedir = CD_DISABLED; 442 caching = 0; 443 } 444 } else { 445 /* 446 * entries hold the previous offset except the 447 * 1st which holds the offset + 1 448 */ 449 if (offset & (DIRBLKSIZ - 1)) { 450 off2 = last_offset; 451 } else { 452 off2 = offset + 1; 453 } 454 caching = (dnlc_dir_add_entry(dcap, ep->d_name, 455 INO_OFF_TO_H(ep->d_ino, off2)) == DOK); 456 extra = ep_reclen - DIRSIZ(ep); 457 } 458 if (caching && (extra >= LDIRSIZ(1))) { 459 caching = (dnlc_dir_add_space(dcap, extra, 460 (uint64_t)offset) == DOK); 461 } 462 } 463 464 /* 465 * Check for a name match. 466 * We have the parent inode read locked with i_rwlock. 467 */ 468 if (ep->d_ino && ep->d_namlen == namlen && 469 *namep == *ep->d_name && /* fast chk 1st chr */ 470 bcmp(namep, ep->d_name, (int)ep->d_namlen) == 0) { 471 472 /* 473 * We have to release the fbp early here to avoid 474 * a possible deadlock situation where we have the 475 * fbp and want the directory inode and someone doing 476 * a ufs_direnter_* has the directory inode and wants 477 * the fbp. XXX - is this still needed? 
478 */ 479 ep_ino = (ino_t)ep->d_ino; 480 ASSERT(fbp != NULL); 481 fbrelse(fbp, S_OTHER); 482 fbp = NULL; 483 484 /* 485 * Atomic update (read lock held) 486 */ 487 dp->i_diroff = offset; 488 489 if (namlen == 2 && namep[0] == '.' && namep[1] == '.') { 490 struct timeval32 omtime; 491 492 if (caching) { 493 dnlc_dir_purge(dcap); 494 caching = 0; 495 } 496 if (doingchk) { 497 /* 498 * if the inumber didn't change 499 * continue with already found inode. 500 */ 501 if (ep_ino == chkino) 502 goto checkok; 503 else { 504 VN_RELE(ITOV(*ipp)); 505 /* *ipp is nulled at restart */ 506 goto restart; 507 } 508 } 509 /* 510 * release the lock on the dir we are searching 511 * to avoid a deadlock when grabbing the 512 * i_contents lock in ufs_iget_alloced(). 513 */ 514 omtime = dp->i_mtime; 515 rw_exit(&dp->i_rwlock); 516 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, 517 RW_READER); 518 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, 519 cr); 520 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 521 ufs_tryirwlock(&dp->i_rwlock, RW_READER, 522 retry_disk); 523 if (indeadlock) { 524 if (!err) 525 VN_RELE(ITOV(*ipp)); 526 return (EAGAIN); 527 } 528 if (err) 529 goto bad; 530 /* 531 * Since we released the lock on the directory, 532 * we must check that the same inode is still 533 * the ".." entry for this directory. 534 */ 535 /*CSTYLED*/ 536 if (timercmp(&omtime, &dp->i_mtime, !=)) { 537 /* 538 * Modification time changed on the 539 * directory, we must go check if 540 * the inumber changed for ".." 541 */ 542 doingchk = 1; 543 chkino = ep_ino; 544 entryoffsetinblock = 0; 545 if (caching) { 546 /* 547 * Forget directory caching 548 * for this rare case 549 */ 550 dnlc_dir_purge(dcap); 551 caching = 0; 552 } 553 goto recheck; 554 } 555 } else if (dp->i_number == ep_ino) { 556 VN_HOLD(dvp); /* want ourself, "." 
*/ 557 *ipp = dp; 558 if (caching) { 559 dnlc_dir_purge(dcap); 560 caching = 0; 561 } 562 } else { 563 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, 564 RW_READER); 565 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, 566 cr); 567 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 568 if (err) 569 goto bad; 570 } 571 checkok: 572 ASSERT(*ipp); 573 dnlc_update(dvp, namep, ITOV(*ipp)); 574 /* 575 * If we are not caching then just return the entry 576 * otherwise complete loading up the cache 577 */ 578 if (!caching) { 579 rw_exit(&dp->i_rwlock); 580 return (0); 581 } 582 err = blkatoff(dp, offset, (char **)0, &fbp); 583 if (err) 584 goto bad; 585 } 586 last_offset = offset; 587 offset += ep_reclen; 588 entryoffsetinblock += ep_reclen; 589 } 590 /* 591 * If we started in the middle of the directory and failed 592 * to find our target, we must check the beginning as well. 593 */ 594 if (numdirpasses == 2) { 595 numdirpasses--; 596 offset = 0; 597 endsearch = start_off; 598 goto searchloop; 599 } 600 601 /* 602 * If whole directory caching is on (or was originally on) then 603 * the entry may have been found. 604 */ 605 if (*ipp == NULL) { 606 err = ENOENT; 607 if (ufs_negative_cache && (dp->i_nlink > 0)) { 608 dnlc_enter(dvp, namep, DNLC_NO_VNODE); 609 } 610 } 611 if (caching) { 612 dnlc_dir_complete(dcap); 613 caching = 0; 614 } 615 616 bad: 617 if (err && *ipp) { 618 /* 619 * err and *ipp can both be set if we were attempting to 620 * cache the directory, and we found the entry, then later 621 * while trying to complete the directory cache encountered 622 * a error (eg reading a directory sector). 623 */ 624 VN_RELE(ITOV(*ipp)); 625 *ipp = NULL; 626 } 627 628 if (fbp) 629 fbrelse(fbp, S_OTHER); 630 rw_exit(&dp->i_rwlock); 631 if (caching) 632 dnlc_dir_purge(dcap); 633 return (err); 634 } 635 636 /* 637 * Write a new directory entry for DE_CREATE or DE_MKDIR operations. 
638 */ 639 int 640 ufs_direnter_cm( 641 struct inode *tdp, /* target directory to make entry in */ 642 char *namep, /* name of entry */ 643 enum de_op op, /* entry operation */ 644 struct vattr *vap, /* attributes if new inode needed */ 645 struct inode **ipp, /* return entered inode here */ 646 struct cred *cr, /* user credentials */ 647 int flags) /* no entry exists */ 648 { 649 struct inode *tip; /* inode of (existing) target file */ 650 char *s; 651 struct ufs_slot slot; /* slot info to pass around */ 652 int namlen; /* length of name */ 653 int err; /* error number */ 654 struct inode *nip; /* new inode */ 655 int do_rele_nip = 0; /* release nip */ 656 int noentry = flags & ~IQUIET; 657 int quiet = flags & IQUIET; /* Suppress out of inodes message */ 658 int indeadlock; 659 struct ulockfs *ulp; 660 661 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 662 663 if (((tdp->i_mode & IFMT) == IFATTRDIR) && ((op == DE_MKDIR) || 664 ((vap->va_type == VCHR) || (vap->va_type == VBLK) || 665 (vap->va_type == VDOOR) || (vap->va_type == VSOCK) || 666 (vap->va_type == VFIFO)))) 667 return (EINVAL); 668 669 /* don't allow '/' characters in pathname component */ 670 for (s = namep, namlen = 0; *s; s++, namlen++) 671 if (*s == '/') 672 return (EACCES); 673 ASSERT(namlen); 674 675 /* 676 * Check accessibility of target directory. 677 */ 678 if (err = ufs_diraccess(tdp, IEXEC, cr)) 679 return (err); 680 681 /* 682 * If name is "." or ".." then if this is a create look it up 683 * and return EEXIST. 684 */ 685 if (namep[0] == '.' && 686 (namlen == 1 || (namlen == 2 && namep[1] == '.'))) { 687 /* 688 * ufs_dirlook will acquire the i_rwlock 689 */ 690 if (tdp->i_ufsvfs) 691 ulp = &tdp->i_ufsvfs->vfs_ulockfs; 692 rw_exit(&tdp->i_rwlock); 693 if (err = ufs_dirlook(tdp, namep, ipp, cr, 0, 0)) { 694 if (err == EAGAIN) 695 return (err); 696 697 /* 698 * ufs_tryirwlock uses rw_tryenter and checks for 699 * SLOCK to avoid i_rwlock, ufs_lockfs_begin deadlock. 
700 * If deadlock possible, retries the operation. 701 */ 702 ufs_tryirwlock(&tdp->i_rwlock, RW_WRITER, retry_err); 703 if (indeadlock) 704 return (EAGAIN); 705 706 return (err); 707 } 708 ufs_tryirwlock(&tdp->i_rwlock, RW_WRITER, retry); 709 if (indeadlock) { 710 VN_RELE(ITOV(*ipp)); 711 return (EAGAIN); 712 } 713 return (EEXIST); 714 } 715 716 /* 717 * If target directory has not been removed, then we can consider 718 * allowing file to be created. 719 */ 720 if (tdp->i_nlink <= 0) { 721 return (ENOENT); 722 } 723 724 /* 725 * Search for the entry. Return VN_HELD tip if found. 726 */ 727 tip = NULL; 728 slot.fbp = NULL; 729 slot.status = NONE; 730 rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER); 731 rw_enter(&tdp->i_contents, RW_WRITER); 732 err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, noentry); 733 if (err) 734 goto out; 735 if (tip) { 736 ASSERT(!noentry); 737 *ipp = tip; 738 err = EEXIST; 739 } else { 740 /* 741 * The entry does not exist. Check write permission in 742 * directory to see if entry can be created. 743 */ 744 if (err = ufs_iaccess(tdp, IWRITE, cr, 0)) 745 goto out; 746 /* 747 * Make new inode and directory entry. 748 */ 749 tdp->i_flag |= quiet; 750 if (err = ufs_dirmakeinode(tdp, &nip, vap, op, cr)) { 751 if (nip != NULL) 752 do_rele_nip = 1; 753 goto out; 754 } 755 if (err = ufs_diraddentry(tdp, namep, op, 756 namlen, &slot, nip, NULL, cr)) { 757 /* 758 * Unmake the inode we just made. 
759 */ 760 rw_enter(&nip->i_contents, RW_WRITER); 761 if (((nip->i_mode & IFMT) == IFDIR) || 762 ((nip->i_mode & IFMT) == IFATTRDIR)) { 763 tdp->i_nlink--; 764 ufs_setreclaim(tdp); 765 tdp->i_flag |= ICHG; 766 tdp->i_seq++; 767 TRANS_INODE(tdp->i_ufsvfs, tdp); 768 ITIMES_NOLOCK(tdp); 769 } 770 nip->i_nlink = 0; 771 ufs_setreclaim(nip); 772 TRANS_INODE(nip->i_ufsvfs, nip); 773 nip->i_flag |= ICHG; 774 nip->i_seq++; 775 ITIMES_NOLOCK(nip); 776 rw_exit(&nip->i_contents); 777 do_rele_nip = 1; 778 } else { 779 *ipp = nip; 780 } 781 } 782 783 out: 784 if (slot.fbp) 785 fbrelse(slot.fbp, S_OTHER); 786 787 tdp->i_flag &= ~quiet; 788 rw_exit(&tdp->i_contents); 789 790 /* 791 * Drop vfs_dqrwlock before calling VN_RELE() on nip to 792 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader. 793 */ 794 rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock); 795 796 if (do_rele_nip) { 797 VN_RELE(ITOV(nip)); 798 } 799 800 return (err); 801 } 802 803 /* 804 * Write a new directory entry for DE_LINK, DE_SYMLINK or DE_RENAME operations. 805 */ 806 int 807 ufs_direnter_lr( 808 struct inode *tdp, /* target directory to make entry in */ 809 char *namep, /* name of entry */ 810 enum de_op op, /* entry operation */ 811 struct inode *sdp, /* source inode parent if rename */ 812 struct inode *sip, /* source inode */ 813 struct cred *cr) /* user credentials */ 814 { 815 struct inode *tip; /* inode of (existing) target file */ 816 char *s; 817 struct ufs_slot slot; /* slot info to pass around */ 818 int namlen; /* length of name */ 819 int err; /* error number */ 820 821 /* don't allow '/' characters in pathname component */ 822 for (s = namep, namlen = 0; *s; s++, namlen++) 823 if (*s == '/') 824 return (EACCES); 825 ASSERT(namlen); 826 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 827 828 /* 829 * If name is "." or ".." then if this is a create look it up 830 * and return EEXIST. Rename or link TO "." or ".." is forbidden. 831 */ 832 if (namep[0] == '.' 
&& 833 (namlen == 1 || (namlen == 2 && namep[1] == '.'))) { 834 if (op == DE_RENAME) { 835 return (EINVAL); /* *SIGH* should be ENOTEMPTY */ 836 } 837 return (EEXIST); 838 } 839 /* 840 * For link and rename lock the source entry and check the link count 841 * to see if it has been removed while it was unlocked. If not, we 842 * increment the link count and force the inode to disk to make sure 843 * that it is there before any directory entry that points to it. 844 * 845 * In the case of a symbolic link, we are dealing with a new inode 846 * which does not yet have any links. We've created it with a link 847 * count of 1, and we don't want to increment it since this will be 848 * its first link. 849 * 850 * We are about to push the inode to disk. We make sure 851 * that the inode's data blocks are flushed first so the 852 * inode and it's data blocks are always in sync. This 853 * adds some robustness in in the event of a power failure 854 * or panic where sync fails. If we panic before the 855 * inode is updated, then the inode still refers to the 856 * old data blocks (or none for a new file). If we panic 857 * after the inode is updated, then the inode refers to 858 * the new data blocks. 859 * 860 * We do this before grabbing the i_contents lock because 861 * ufs_syncip() will want that lock. We could do the data 862 * syncing after the removal checks, but upon return from 863 * the data sync we would have to repeat the removal 864 * checks. 865 */ 866 if (err = TRANS_SYNCIP(sip, 0, I_DSYNC, TOP_FSYNC)) { 867 return (err); 868 } 869 870 rw_enter(&sip->i_contents, RW_WRITER); 871 if (sip->i_nlink <= 0) { 872 rw_exit(&sip->i_contents); 873 return (ENOENT); 874 } 875 if (sip->i_nlink == MAXLINK) { 876 rw_exit(&sip->i_contents); 877 return (EMLINK); 878 } 879 880 /* 881 * Sync the indirect blocks associated with the file 882 * for the same reasons as described above. 
Since this 883 * call wants the i_contents lock held for it we can do 884 * this here with no extra work. 885 */ 886 if (err = ufs_sync_indir(sip)) { 887 rw_exit(&sip->i_contents); 888 return (err); 889 } 890 891 if (op != DE_SYMLINK) 892 sip->i_nlink++; 893 TRANS_INODE(sip->i_ufsvfs, sip); 894 sip->i_flag |= ICHG; 895 sip->i_seq++; 896 ufs_iupdat(sip, I_SYNC); 897 rw_exit(&sip->i_contents); 898 899 /* 900 * If target directory has not been removed, then we can consider 901 * allowing file to be created. 902 */ 903 if (tdp->i_nlink <= 0) { 904 err = ENOENT; 905 goto out2; 906 } 907 908 /* 909 * Check accessibility of target directory. 910 */ 911 if (err = ufs_diraccess(tdp, IEXEC, cr)) 912 goto out2; 913 914 /* 915 * Search for the entry. Return VN_HELD tip if found. 916 */ 917 tip = NULL; 918 slot.status = NONE; 919 slot.fbp = NULL; 920 rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER); 921 rw_enter(&tdp->i_contents, RW_WRITER); 922 err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, 0); 923 if (err) 924 goto out; 925 926 if (tip) { 927 switch (op) { 928 case DE_RENAME: 929 err = ufs_dirrename(sdp, sip, tdp, namep, 930 tip, &slot, cr); 931 break; 932 933 case DE_LINK: 934 case DE_SYMLINK: 935 /* 936 * Can't link to an existing file. 937 */ 938 err = EEXIST; 939 break; 940 default: 941 break; 942 } 943 } else { 944 /* 945 * The entry does not exist. Check write permission in 946 * directory to see if entry can be created. 947 */ 948 if (err = ufs_iaccess(tdp, IWRITE, cr, 0)) 949 goto out; 950 err = ufs_diraddentry(tdp, namep, op, namlen, &slot, sip, sdp, 951 cr); 952 } 953 954 out: 955 if (slot.fbp) 956 fbrelse(slot.fbp, S_OTHER); 957 958 rw_exit(&tdp->i_contents); 959 960 /* 961 * Drop vfs_dqrwlock before calling VN_RELE() on tip to 962 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader. 
963 */ 964 rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock); 965 966 /* 967 * If we renamed a file over the top of an existing file, 968 * or linked a file to an existing file (or tried to), 969 * then release and delete (or just release) the inode. 970 */ 971 if (tip) 972 VN_RELE(ITOV(tip)); 973 974 out2: 975 if (err) { 976 /* 977 * Undo bumped link count. 978 */ 979 if (op != DE_SYMLINK) { 980 rw_enter(&sip->i_contents, RW_WRITER); 981 sip->i_nlink--; 982 ufs_setreclaim(sip); 983 TRANS_INODE(sip->i_ufsvfs, sip); 984 sip->i_flag |= ICHG; 985 sip->i_seq++; 986 ITIMES_NOLOCK(sip); 987 rw_exit(&sip->i_contents); 988 } 989 } 990 return (err); 991 } 992 993 /* 994 * Check for the existence of a name in a directory (unless noentry 995 * is set) , or else of an empty 996 * slot in which an entry may be made. If the requested name is found, 997 * then on return *ipp points at the inode and *offp contains 998 * its offset in the directory. If the name is not found, then *ipp 999 * will be NULL and *slotp will contain information about a directory slot in 1000 * which an entry may be made (either an empty slot, or the first position 1001 * past the end of the directory). 1002 * The target directory inode (tdp) is supplied write locked (i_rwlock). 1003 * 1004 * This may not be used on "." or "..", but aliases of "." are ok. 
1005 */ 1006 int 1007 ufs_dircheckforname( 1008 struct inode *tdp, /* inode of directory being checked */ 1009 char *namep, /* name we're checking for */ 1010 int namlen, /* length of name, excluding null */ 1011 struct ufs_slot *slotp, /* slot structure */ 1012 struct inode **ipp, /* return inode if we find one */ 1013 struct cred *cr, 1014 int noentry) /* noentry - just look for space */ 1015 { 1016 uint64_t handle; 1017 struct fbuf *fbp; /* pointer to directory block */ 1018 struct direct *ep; /* directory entry */ 1019 struct direct *nep; /* next directory entry */ 1020 dcanchor_t *dcap; 1021 vnode_t *dvp; /* directory vnode ptr */ 1022 off_t dirsize; /* size of the directory */ 1023 off_t offset; /* offset in the directory */ 1024 off_t last_offset; /* last offset */ 1025 off_t enduseful; /* pointer past last used dir slot */ 1026 int entryoffsetinblk; /* offset of ep in fbp's buffer */ 1027 int i; /* length of mangled entry */ 1028 int needed; 1029 int err; 1030 int first; 1031 int caching; 1032 int stat; 1033 ino_t ep_ino; 1034 slotstat_t initstat = slotp->status; 1035 1036 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 1037 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 1038 ASSERT(*ipp == NULL); 1039 fbp = NULL; 1040 1041 /* 1042 * First check if there is a complete cache of the directory. 1043 */ 1044 dvp = ITOV(tdp); 1045 1046 dcap = &tdp->i_danchor; 1047 if (noentry) { 1048 /* 1049 * We know from the 1st level dnlc cache that the entry 1050 * doesn't exist, so don't bother searching the directory 1051 * cache, but just look for space (possibly in the directory 1052 * cache). 1053 */ 1054 stat = DNOENT; 1055 } else { 1056 stat = dnlc_dir_lookup(dcap, namep, &handle); 1057 } 1058 switch (stat) { 1059 case DFOUND: 1060 ep_ino = (ino_t)H_TO_INO(handle); 1061 if (tdp->i_number == ep_ino) { 1062 *ipp = tdp; /* we want ourself, ie "." 
*/ 1063 VN_HOLD(dvp); 1064 } else { 1065 err = ufs_iget_alloced(tdp->i_vfs, ep_ino, ipp, cr); 1066 if (err) 1067 return (err); 1068 } 1069 offset = H_TO_OFF(handle); 1070 first = 0; 1071 if (offset & 1) { 1072 /* This is the first entry in the block */ 1073 first = 1; 1074 offset -= 1; 1075 ASSERT((offset & (DIRBLKSIZ - 1)) == 0); 1076 } 1077 err = blkatoff(tdp, offset, (char **)&ep, &fbp); 1078 if (err) { 1079 VN_RELE(ITOV(*ipp)); 1080 *ipp = NULL; 1081 return (err); 1082 } 1083 /* 1084 * Check the validity of the entry. 1085 * If it's bad, then throw away the cache and 1086 * continue without it. The dirmangled() routine 1087 * will then be called upon it. 1088 */ 1089 if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) { 1090 VN_RELE(ITOV(*ipp)); 1091 *ipp = NULL; 1092 dnlc_dir_purge(dcap); 1093 break; 1094 } 1095 /* 1096 * Remember the returned offset is the offset of the 1097 * preceding record (unless this is the 1st record 1098 * in the DIRBLKSIZ sized block (disk sector)), then it's 1099 * offset + 1. Note, no real offsets are on odd boundaries. 1100 */ 1101 if (first) { 1102 ASSERT((offset & (DIRBLKSIZ - 1)) == 0); 1103 slotp->offset = offset; 1104 slotp->size = 0; 1105 slotp->ep = ep; 1106 } else { 1107 /* get the next entry */ 1108 nep = (struct direct *)((char *)ep + ep->d_reclen); 1109 /* 1110 * Check the validity of this entry as well 1111 * If it's bad, then throw away the cache and 1112 * continue without it. The dirmangled() routine 1113 * will then be called upon it. 
1114 */ 1115 if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) || 1116 (nep->d_ino != ep_ino)) { 1117 VN_RELE(ITOV(*ipp)); 1118 *ipp = NULL; 1119 dnlc_dir_purge(dcap); 1120 break; 1121 } 1122 slotp->offset = offset + ep->d_reclen; 1123 slotp->size = ep->d_reclen; 1124 slotp->ep = nep; 1125 } 1126 slotp->status = EXIST; 1127 slotp->fbp = fbp; 1128 slotp->endoff = 0; 1129 slotp->cached = 1; 1130 dnlc_update(dvp, namep, ITOV(*ipp)); 1131 return (0); 1132 case DNOENT: 1133 /* 1134 * The caller gets to set the initial slot status to 1135 * indicate whether it's interested in getting a 1136 * empty slot. For example, the status can be set 1137 * to FOUND when an entry is being deleted. 1138 */ 1139 ASSERT(slotp->fbp == NULL); 1140 if (slotp->status == FOUND) { 1141 return (0); 1142 } 1143 switch (dnlc_dir_rem_space_by_len(dcap, LDIRSIZ(namlen), 1144 &handle)) { 1145 case DFOUND: 1146 offset = (off_t)handle; 1147 err = blkatoff(tdp, offset, (char **)&ep, &fbp); 1148 if (err) { 1149 dnlc_dir_purge(dcap); 1150 ASSERT(*ipp == NULL); 1151 return (err); 1152 } 1153 /* 1154 * Check the validity of the entry. 1155 * If it's bad, then throw away the cache and 1156 * continue without it. The dirmangled() routine 1157 * will then be called upon it. 1158 */ 1159 if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) { 1160 dnlc_dir_purge(dcap); 1161 break; 1162 } 1163 /* 1164 * Remember the returned offset is the offset of the 1165 * containing record. 1166 */ 1167 slotp->status = FOUND; 1168 slotp->ep = ep; 1169 slotp->offset = offset; 1170 slotp->fbp = fbp; 1171 slotp->size = ep->d_reclen; 1172 /* 1173 * Set end offset to 0. Truncation is handled 1174 * because the dnlc cache will blow away the 1175 * cached directory when an entry is removed 1176 * that drops the entries left to less than half 1177 * the minumum number (dnlc_min_dir_cache). 
1178 */ 1179 slotp->endoff = 0; 1180 slotp->cached = 1; 1181 return (0); 1182 case DNOENT: 1183 slotp->status = NONE; 1184 slotp->offset = P2ROUNDUP_TYPED(tdp->i_size, 1185 DIRBLKSIZ, u_offset_t); 1186 slotp->size = DIRBLKSIZ; 1187 slotp->endoff = 0; 1188 slotp->cached = 1; 1189 return (0); 1190 default: 1191 break; 1192 } 1193 break; 1194 } 1195 slotp->cached = 0; 1196 caching = 0; 1197 if (!noentry && tdp->i_size >= ufs_min_dir_cache) { 1198 /* 1199 * if the directory caching disable time has expired 1200 * enable caching again. 1201 */ 1202 if (tdp->i_cachedir == CD_DISABLED_NOMEM && 1203 gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) { 1204 ufs_dc_disable_at = 0; 1205 tdp->i_cachedir = CD_ENABLED; 1206 } 1207 /* 1208 * Attempt to cache any directories greater than the tunable 1209 * ufs_min_cache_dir. If it fails due to memory shortage 1210 * (DNOMEM), disable caching for this directory and record 1211 * the system time. Any attempt after the disable time has 1212 * expired will enable the caching again. 1213 */ 1214 if (tdp->i_cachedir == CD_ENABLED) { 1215 switch (dnlc_dir_start(dcap, 1216 tdp->i_size >> AV_DIRECT_SHIFT)) { 1217 case DNOMEM: 1218 tdp->i_cachedir = CD_DISABLED_NOMEM; 1219 ufs_dc_disable_at = gethrtime(); 1220 break; 1221 case DTOOBIG: 1222 tdp->i_cachedir = CD_DISABLED_TOOBIG; 1223 break; 1224 case DOK: 1225 caching = 1; 1226 break; 1227 default: 1228 break; 1229 } 1230 } 1231 } 1232 1233 /* 1234 * No point in using i_diroff since we must search whole directory 1235 */ 1236 dirsize = P2ROUNDUP_TYPED(tdp->i_size, DIRBLKSIZ, u_offset_t); 1237 enduseful = 0; 1238 offset = last_offset = 0; 1239 entryoffsetinblk = 0; 1240 needed = (int)LDIRSIZ(namlen); 1241 while (offset < dirsize) { 1242 /* 1243 * If offset is on a block boundary, 1244 * read the next directory block. 1245 * Release previous if it exists. 
1246 */ 1247 if (blkoff(tdp->i_fs, offset) == 0) { 1248 if (fbp != NULL) 1249 fbrelse(fbp, S_OTHER); 1250 1251 err = blkatoff(tdp, offset, (char **)0, &fbp); 1252 if (err) { 1253 ASSERT(*ipp == NULL); 1254 if (caching) { 1255 dnlc_dir_purge(dcap); 1256 } 1257 return (err); 1258 } 1259 entryoffsetinblk = 0; 1260 } 1261 /* 1262 * If still looking for a slot, and at a DIRBLKSIZ 1263 * boundary, have to start looking for free space 1264 * again. 1265 */ 1266 if (slotp->status == NONE && 1267 (entryoffsetinblk & (DIRBLKSIZ - 1)) == 0) { 1268 slotp->offset = -1; 1269 } 1270 /* 1271 * If the next entry is a zero length record or if the 1272 * record length is invalid, then skip to the next 1273 * directory block. Complete validation checks are 1274 * done if the record length is invalid. 1275 * 1276 * Full validation checks are slow so they are disabled 1277 * by default. Complete checks can be run by patching 1278 * "dirchk" to be true. 1279 * 1280 * We do not have to check the validity of 1281 * entryoffsetinblk here because it starts out as zero 1282 * and is only incremented by d_reclen values that we 1283 * validate here. 
1284 */ 1285 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk); 1286 if (ep->d_reclen == 0 || 1287 (dirchk || (ep->d_reclen & 0x3)) && 1288 dirmangled(tdp, ep, entryoffsetinblk, offset)) { 1289 i = DIRBLKSIZ - (entryoffsetinblk & (DIRBLKSIZ - 1)); 1290 offset += i; 1291 entryoffsetinblk += i; 1292 if (caching) { 1293 dnlc_dir_purge(dcap); 1294 caching = 0; 1295 } 1296 continue; 1297 } 1298 1299 /* 1300 * Add named entries and free space into the directory cache 1301 */ 1302 if (caching) { 1303 ushort_t extra; 1304 off_t off2; 1305 1306 if (ep->d_ino == 0) { 1307 extra = ep->d_reclen; 1308 if (offset & (DIRBLKSIZ - 1)) { 1309 dnlc_dir_purge(dcap); 1310 caching = 0; 1311 } 1312 } else { 1313 /* 1314 * entries hold the previous offset if 1315 * not the 1st one 1316 */ 1317 if (offset & (DIRBLKSIZ - 1)) { 1318 off2 = last_offset; 1319 } else { 1320 off2 = offset + 1; 1321 } 1322 caching = (dnlc_dir_add_entry(dcap, ep->d_name, 1323 INO_OFF_TO_H(ep->d_ino, off2)) == DOK); 1324 extra = ep->d_reclen - DIRSIZ(ep); 1325 } 1326 if (caching && (extra >= LDIRSIZ(1))) { 1327 caching = (dnlc_dir_add_space(dcap, extra, 1328 (uint64_t)offset) == DOK); 1329 } 1330 } 1331 1332 /* 1333 * If an appropriate sized slot has not yet been found, 1334 * check to see if one is available. 1335 */ 1336 if ((slotp->status != FOUND) && (slotp->status != EXIST)) { 1337 int size = ep->d_reclen; 1338 1339 if (ep->d_ino != 0) 1340 size -= DIRSIZ(ep); 1341 if (size > 0) { 1342 if (size >= needed) { 1343 slotp->offset = offset; 1344 slotp->size = ep->d_reclen; 1345 if (noentry) { 1346 slotp->ep = ep; 1347 slotp->fbp = fbp; 1348 slotp->status = FOUND; 1349 slotp->endoff = 0; 1350 return (0); 1351 } 1352 slotp->status = FOUND; 1353 } else if (slotp->status == NONE) { 1354 if (slotp->offset == -1) 1355 slotp->offset = offset; 1356 } 1357 } 1358 } 1359 /* 1360 * Check for a name match. 
1361 */ 1362 if (ep->d_ino && ep->d_namlen == namlen && 1363 *namep == *ep->d_name && /* fast chk 1st char */ 1364 bcmp(namep, ep->d_name, namlen) == 0) { 1365 1366 tdp->i_diroff = offset; 1367 1368 if (tdp->i_number == ep->d_ino) { 1369 *ipp = tdp; /* we want ourself, ie "." */ 1370 VN_HOLD(dvp); 1371 } else { 1372 err = ufs_iget_alloced(tdp->i_vfs, 1373 (ino_t)ep->d_ino, ipp, cr); 1374 if (err) { 1375 fbrelse(fbp, S_OTHER); 1376 if (caching) 1377 dnlc_dir_purge(dcap); 1378 return (err); 1379 } 1380 } 1381 slotp->status = EXIST; 1382 slotp->offset = offset; 1383 slotp->size = (int)(offset - last_offset); 1384 slotp->fbp = fbp; 1385 slotp->ep = ep; 1386 slotp->endoff = 0; 1387 if (caching) 1388 dnlc_dir_purge(dcap); 1389 return (0); 1390 } 1391 last_offset = offset; 1392 offset += ep->d_reclen; 1393 entryoffsetinblk += ep->d_reclen; 1394 if (ep->d_ino) 1395 enduseful = offset; 1396 } 1397 if (fbp) { 1398 fbrelse(fbp, S_OTHER); 1399 } 1400 1401 if (caching) { 1402 dnlc_dir_complete(dcap); 1403 slotp->cached = 1; 1404 if (slotp->status == FOUND) { 1405 if (initstat == FOUND) { 1406 return (0); 1407 } 1408 (void) dnlc_dir_rem_space_by_handle(dcap, 1409 slotp->offset); 1410 slotp->endoff = 0; 1411 return (0); 1412 } 1413 } 1414 1415 if (slotp->status == NONE) { 1416 /* 1417 * We didn't find a slot; the new directory entry should be put 1418 * at the end of the directory. Return an indication of where 1419 * this is, and set "endoff" to zero; since we're going to have 1420 * to extend the directory, we're certainly not going to 1421 * truncate it. 1422 */ 1423 slotp->offset = dirsize; 1424 slotp->size = DIRBLKSIZ; 1425 slotp->endoff = 0; 1426 } else { 1427 /* 1428 * We found a slot, and will return an indication of where that 1429 * slot is, as any new directory entry will be put there. 1430 * Since that slot will become a useful entry, if the last 1431 * useful entry we found was before this one, update the offset 1432 * of the last useful entry. 
1433 */ 1434 if (enduseful < slotp->offset + slotp->size) 1435 enduseful = slotp->offset + slotp->size; 1436 slotp->endoff = P2ROUNDUP_TYPED(enduseful, DIRBLKSIZ, off_t); 1437 } 1438 *ipp = NULL; 1439 return (0); 1440 } 1441 1442 uint64_t ufs_dirrename_retry_cnt; 1443 1444 /* 1445 * Rename the entry in the directory tdp so that it points to 1446 * sip instead of tip. 1447 */ 1448 static int 1449 ufs_dirrename( 1450 struct inode *sdp, /* parent directory of source */ 1451 struct inode *sip, /* source inode */ 1452 struct inode *tdp, /* parent directory of target */ 1453 char *namep, /* entry we are trying to change */ 1454 struct inode *tip, /* target inode */ 1455 struct ufs_slot *slotp, /* slot for entry */ 1456 struct cred *cr) /* credentials */ 1457 { 1458 vnode_t *tdvp; 1459 off_t offset; 1460 int err; 1461 int doingdirectory; 1462 1463 ASSERT(sdp->i_ufsvfs != NULL); 1464 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 1465 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 1466 /* 1467 * Short circuit rename of something to itself. 1468 */ 1469 if (sip->i_number == tip->i_number) { 1470 return (ESAME); /* special KLUDGE error code */ 1471 } 1472 1473 /* 1474 * We're locking 2 peer level locks, so must use tryenter 1475 * on the 2nd to avoid deadlocks that would occur 1476 * if we renamed a->b and b->a concurrently. 1477 */ 1478 retry: 1479 rw_enter(&tip->i_contents, RW_WRITER); 1480 if (!rw_tryenter(&sip->i_contents, RW_READER)) { 1481 /* 1482 * drop tip and wait (sleep) until we stand a chance 1483 * of holding sip 1484 */ 1485 rw_exit(&tip->i_contents); 1486 rw_enter(&sip->i_contents, RW_READER); 1487 /* 1488 * Reverse the lock grabs in case we have heavy 1489 * contention on the 2nd lock. 1490 */ 1491 if (!rw_tryenter(&tip->i_contents, RW_WRITER)) { 1492 ufs_dirrename_retry_cnt++; 1493 rw_exit(&sip->i_contents); 1494 goto retry; 1495 } 1496 } 1497 1498 /* 1499 * Check that everything is on the same filesystem. 
1500 */ 1501 if ((ITOV(tip)->v_vfsp != ITOV(tdp)->v_vfsp) || 1502 (ITOV(tip)->v_vfsp != ITOV(sip)->v_vfsp)) { 1503 err = EXDEV; /* XXX archaic */ 1504 goto out; 1505 } 1506 /* 1507 * Must have write permission to rewrite target entry. 1508 * Perform additional checks for sticky directories. 1509 */ 1510 if ((err = ufs_iaccess(tdp, IWRITE, cr, 0)) != 0 || 1511 (err = ufs_sticky_remove_access(tdp, tip, cr)) != 0) 1512 goto out; 1513 1514 /* 1515 * Ensure source and target are compatible (both directories 1516 * or both not directories). If target is a directory it must 1517 * be empty and have no links to it; in addition it must not 1518 * be a mount point, and both the source and target must be 1519 * writable. 1520 */ 1521 doingdirectory = (((sip->i_mode & IFMT) == IFDIR) || 1522 ((sip->i_mode & IFMT) == IFATTRDIR)); 1523 if (((tip->i_mode & IFMT) == IFDIR) || 1524 ((tip->i_mode & IFMT) == IFATTRDIR)) { 1525 if (!doingdirectory) { 1526 err = EISDIR; 1527 goto out; 1528 } 1529 /* 1530 * vn_vfsrlock will prevent mounts from using the directory 1531 * until we are done. 1532 */ 1533 if (vn_vfsrlock(ITOV(tip))) { 1534 err = EBUSY; 1535 goto out; 1536 } 1537 if (vn_mountedvfs(ITOV(tip)) != NULL) { 1538 vn_vfsunlock(ITOV(tip)); 1539 err = EBUSY; 1540 goto out; 1541 } 1542 if (!ufs_dirempty(tip, tdp->i_number, cr) || tip->i_nlink > 2) { 1543 vn_vfsunlock(ITOV(tip)); 1544 err = EEXIST; /* SIGH should be ENOTEMPTY */ 1545 goto out; 1546 } 1547 } else if (doingdirectory) { 1548 err = ENOTDIR; 1549 goto out; 1550 } 1551 1552 /* 1553 * Rewrite the inode pointer for target name entry 1554 * from the target inode (ip) to the source inode (sip). 1555 * This prevents the target entry from disappearing 1556 * during a crash. Mark the directory inode to reflect the changes. 
1557 */ 1558 tdvp = ITOV(tdp); 1559 slotp->ep->d_ino = (int32_t)sip->i_number; 1560 dnlc_update(tdvp, namep, ITOV(sip)); 1561 if (slotp->size) { 1562 offset = slotp->offset - slotp->size; 1563 } else { 1564 offset = slotp->offset + 1; 1565 } 1566 if (slotp->cached) { 1567 (void) dnlc_dir_update(&tdp->i_danchor, namep, 1568 INO_OFF_TO_H(slotp->ep->d_ino, offset)); 1569 } 1570 1571 err = TRANS_DIR(tdp, slotp->offset); 1572 if (err) 1573 fbrelse(slotp->fbp, S_OTHER); 1574 else 1575 err = ufs_fbwrite(slotp->fbp, tdp); 1576 1577 slotp->fbp = NULL; 1578 if (err) { 1579 if (doingdirectory) 1580 vn_vfsunlock(ITOV(tip)); 1581 goto out; 1582 } 1583 1584 TRANS_INODE(tdp->i_ufsvfs, tdp); 1585 tdp->i_flag |= IUPD|ICHG; 1586 tdp->i_seq++; 1587 ITIMES_NOLOCK(tdp); 1588 1589 /* 1590 * Decrement the link count of the target inode. 1591 * Fix the ".." entry in sip to point to dp. 1592 * This is done after the new entry is on the disk. 1593 */ 1594 tip->i_nlink--; 1595 TRANS_INODE(tip->i_ufsvfs, tip); 1596 tip->i_flag |= ICHG; 1597 tip->i_seq++; 1598 ITIMES_NOLOCK(tip); 1599 if (doingdirectory) { 1600 /* 1601 * The entry for tip no longer exists so I can unlock the 1602 * vfslock. 1603 */ 1604 vn_vfsunlock(ITOV(tip)); 1605 /* 1606 * Decrement target link count once more if it was a directory. 1607 */ 1608 if (--tip->i_nlink != 0) { 1609 err = ufs_fault(ITOV(tip), 1610 "ufs_dirrename: target directory link count != 0 (%s)", 1611 tip->i_fs->fs_fsmnt); 1612 rw_exit(&tip->i_contents); 1613 return (err); 1614 } 1615 TRANS_INODE(tip->i_ufsvfs, tip); 1616 ufs_setreclaim(tip); 1617 /* 1618 * Renaming a directory with the parent different 1619 * requires that ".." be rewritten. The window is 1620 * still there for ".." to be inconsistent, but this 1621 * is unavoidable, and a lot shorter than when it was 1622 * done in a user process. We decrement the link 1623 * count in the new parent as appropriate to reflect 1624 * the just-removed target. 
If the parent is the 1625 * same, this is appropriate since the original 1626 * directory is going away. If the new parent is 1627 * different, ufs_dirfixdotdot() will bump the link count 1628 * back. 1629 */ 1630 tdp->i_nlink--; 1631 ufs_setreclaim(tdp); 1632 TRANS_INODE(tdp->i_ufsvfs, tdp); 1633 tdp->i_flag |= ICHG; 1634 tdp->i_seq++; 1635 ITIMES_NOLOCK(tdp); 1636 if (sdp != tdp) { 1637 rw_exit(&tip->i_contents); 1638 rw_exit(&sip->i_contents); 1639 err = ufs_dirfixdotdot(sip, sdp, tdp); 1640 return (err); 1641 } 1642 } else 1643 ufs_setreclaim(tip); 1644 out: 1645 rw_exit(&tip->i_contents); 1646 rw_exit(&sip->i_contents); 1647 return (err); 1648 } 1649 1650 /* 1651 * Fix the ".." entry of the child directory so that it points 1652 * to the new parent directory instead of the old one. Routine 1653 * assumes that dp is a directory and that all the inodes are on 1654 * the same file system. 1655 */ 1656 static int 1657 ufs_dirfixdotdot( 1658 struct inode *dp, /* child directory */ 1659 struct inode *opdp, /* old parent directory */ 1660 struct inode *npdp) /* new parent directory */ 1661 { 1662 struct fbuf *fbp; 1663 struct dirtemplate *dirp; 1664 vnode_t *dvp; 1665 int err; 1666 1667 ASSERT(RW_WRITE_HELD(&npdp->i_rwlock)); 1668 ASSERT(RW_WRITE_HELD(&npdp->i_contents)); 1669 1670 /* 1671 * We hold the child directory's i_contents lock before calling 1672 * blkatoff so that we honor correct locking protocol which is 1673 * i_contents lock and then page lock. (blkatoff will call 1674 * ufs_getpage where we want the page lock) 1675 * We hold the child directory's i_rwlock before i_contents (as 1676 * per the locking protocol) since we are modifying the ".." entry 1677 * of the child directory. 1678 * We hold the i_rwlock and i_contents lock until we record 1679 * this directory delta to the log (via ufs_trans_dir) and have 1680 * done fbrelse. 
1681 */ 1682 rw_enter(&dp->i_rwlock, RW_WRITER); 1683 rw_enter(&dp->i_contents, RW_WRITER); 1684 err = blkatoff(dp, (off_t)0, (char **)&dirp, &fbp); 1685 if (err) 1686 goto bad; 1687 1688 if (dp->i_nlink <= 0 || 1689 dp->i_size < sizeof (struct dirtemplate)) { 1690 err = ENOENT; 1691 goto bad; 1692 } 1693 1694 if (dirp->dotdot_namlen != 2 || 1695 dirp->dotdot_name[0] != '.' || 1696 dirp->dotdot_name[1] != '.') { /* Sanity check. */ 1697 dirbad(dp, "mangled .. entry", (off_t)0); 1698 err = ENOTDIR; 1699 goto bad; 1700 } 1701 1702 /* 1703 * Increment the link count in the new parent inode and force it out. 1704 */ 1705 if (npdp->i_nlink == MAXLINK) { 1706 err = EMLINK; 1707 goto bad; 1708 } 1709 npdp->i_nlink++; 1710 TRANS_INODE(npdp->i_ufsvfs, npdp); 1711 npdp->i_flag |= ICHG; 1712 npdp->i_seq++; 1713 ufs_iupdat(npdp, I_SYNC); 1714 1715 /* 1716 * Rewrite the child ".." entry and force it out. 1717 */ 1718 dvp = ITOV(dp); 1719 dirp->dotdot_ino = (uint32_t)npdp->i_number; 1720 dnlc_update(dvp, "..", ITOV(npdp)); 1721 (void) dnlc_dir_update(&dp->i_danchor, "..", 1722 INO_OFF_TO_H(dirp->dotdot_ino, 0)); 1723 1724 err = TRANS_DIR(dp, 0); 1725 if (err) 1726 fbrelse(fbp, S_OTHER); 1727 else 1728 err = ufs_fbwrite(fbp, dp); 1729 1730 fbp = NULL; 1731 if (err) 1732 goto bad; 1733 1734 rw_exit(&dp->i_contents); 1735 rw_exit(&dp->i_rwlock); 1736 1737 /* 1738 * Decrement the link count of the old parent inode and force it out. 1739 */ 1740 ASSERT(opdp); 1741 rw_enter(&opdp->i_contents, RW_WRITER); 1742 ASSERT(opdp->i_nlink > 0); 1743 opdp->i_nlink--; 1744 ufs_setreclaim(opdp); 1745 TRANS_INODE(opdp->i_ufsvfs, opdp); 1746 opdp->i_flag |= ICHG; 1747 opdp->i_seq++; 1748 ufs_iupdat(opdp, I_SYNC); 1749 rw_exit(&opdp->i_contents); 1750 return (0); 1751 1752 bad: 1753 if (fbp) 1754 fbrelse(fbp, S_OTHER); 1755 rw_exit(&dp->i_contents); 1756 rw_exit(&dp->i_rwlock); 1757 return (err); 1758 } 1759 1760 /* 1761 * Enter the file sip in the directory tdp with name namep. 
1762 */ 1763 static int 1764 ufs_diraddentry( 1765 struct inode *tdp, 1766 char *namep, 1767 enum de_op op, 1768 int namlen, 1769 struct ufs_slot *slotp, 1770 struct inode *sip, 1771 struct inode *sdp, 1772 struct cred *cr) 1773 { 1774 struct direct *ep, *nep; 1775 vnode_t *tdvp; 1776 dcanchor_t *dcap = &tdp->i_danchor; 1777 off_t offset; 1778 int err; 1779 ushort_t extra; 1780 1781 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 1782 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 1783 /* 1784 * Prepare a new entry. If the caller has not supplied an 1785 * existing inode, make a new one. 1786 */ 1787 err = dirprepareentry(tdp, slotp, cr); 1788 if (err) { 1789 if (slotp->fbp) { 1790 fbrelse(slotp->fbp, S_OTHER); 1791 slotp->fbp = NULL; 1792 } 1793 return (err); 1794 } 1795 /* 1796 * Check inode to be linked to see if it is in the 1797 * same filesystem. 1798 */ 1799 if (ITOV(tdp)->v_vfsp != ITOV(sip)->v_vfsp) { 1800 err = EXDEV; 1801 goto bad; 1802 } 1803 1804 /* 1805 * If renaming a directory then fix up the ".." entry in the 1806 * directory to point to the new parent. 1807 */ 1808 if ((op == DE_RENAME) && (((sip->i_mode & IFMT) == IFDIR) || 1809 ((sip->i_mode & IFMT) == IFATTRDIR)) && (sdp != tdp)) { 1810 err = ufs_dirfixdotdot(sip, sdp, tdp); 1811 if (err) 1812 goto bad; 1813 } 1814 1815 /* 1816 * Fill in entry data. 1817 */ 1818 ep = slotp->ep; 1819 ep->d_namlen = (ushort_t)namlen; 1820 (void) strncpy(ep->d_name, namep, (size_t)((namlen + 4) & ~3)); 1821 ep->d_ino = (uint32_t)sip->i_number; 1822 tdvp = ITOV(tdp); 1823 dnlc_update(tdvp, namep, ITOV(sip)); 1824 /* 1825 * Note the offset supplied for any named entry is 1826 * the offset of the previous one, unless it's the 1st. 1827 * slotp->size is used to pass the length to 1828 * the previous entry. 
1829 */ 1830 if (slotp->size) { 1831 offset = slotp->offset - slotp->size; 1832 } else { 1833 offset = slotp->offset + 1; 1834 } 1835 1836 if (slotp->cached) { 1837 /* 1838 * Add back any usable unused space to the dnlc directory 1839 * cache. 1840 */ 1841 extra = ep->d_reclen - DIRSIZ(ep); 1842 if (extra >= LDIRSIZ(1)) { 1843 (void) dnlc_dir_add_space(dcap, extra, 1844 (uint64_t)slotp->offset); 1845 } 1846 1847 (void) dnlc_dir_add_entry(dcap, namep, 1848 INO_OFF_TO_H(ep->d_ino, offset)); 1849 1850 /* adjust the previous offset of the next entry */ 1851 nep = (struct direct *)((char *)ep + ep->d_reclen); 1852 if ((uintptr_t)nep & (DIRBLKSIZ - 1)) { 1853 /* 1854 * Not a new block. 1855 * 1856 * Check the validity of the next entry. 1857 * If it's bad, then throw away the cache, and 1858 * continue as before directory caching. 1859 */ 1860 if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) || 1861 dnlc_dir_update(dcap, nep->d_name, 1862 INO_OFF_TO_H(nep->d_ino, slotp->offset)) 1863 == DNOENT) { 1864 dnlc_dir_purge(dcap); 1865 slotp->cached = 0; 1866 } 1867 } 1868 } 1869 1870 /* 1871 * Write out the directory block. 1872 */ 1873 err = TRANS_DIR(tdp, slotp->offset); 1874 if (err) 1875 fbrelse(slotp->fbp, S_OTHER); 1876 else 1877 err = ufs_fbwrite(slotp->fbp, tdp); 1878 1879 slotp->fbp = NULL; 1880 /* 1881 * If this is a rename of a directory, then we have already 1882 * fixed the ".." entry to refer to the new parent. If err 1883 * is true at this point, we have failed to update the new 1884 * parent to refer to the renamed directory. 1885 * XXX - we need to unwind the ".." fix. 1886 */ 1887 if (err) 1888 return (err); 1889 1890 /* 1891 * Mark the directory inode to reflect the changes. 1892 * Truncate the directory to chop off blocks of empty entries. 
1893 */ 1894 1895 TRANS_INODE(tdp->i_ufsvfs, tdp); 1896 tdp->i_flag |= IUPD|ICHG; 1897 tdp->i_seq++; 1898 tdp->i_diroff = 0; 1899 ITIMES_NOLOCK(tdp); 1900 /* 1901 * If the directory grew then dirprepareentry() will have 1902 * set IATTCHG in tdp->i_flag, then the directory inode must 1903 * be flushed out. This is because if fsync() is used later 1904 * the directory size must be correct, otherwise a crash would 1905 * cause fsck to move the file to lost+found. Also because later 1906 * a file may be linked in more than one directory, then there 1907 * is no way to flush the original directory. So it must be 1908 * flushed out on creation. See bug 4293809. 1909 */ 1910 if (tdp->i_flag & IATTCHG) { 1911 ufs_iupdat(tdp, I_SYNC); 1912 } 1913 1914 if (slotp->endoff && (slotp->endoff < tdp->i_size)) { 1915 if (!TRANS_ISTRANS(tdp->i_ufsvfs)) { 1916 (void) ufs_itrunc(tdp, (u_offset_t)slotp->endoff, 0, 1917 cr); 1918 } 1919 } 1920 1921 1922 return (0); 1923 1924 bad: 1925 if (slotp->cached) { 1926 dnlc_dir_purge(dcap); 1927 fbrelse(slotp->fbp, S_OTHER); 1928 slotp->cached = 0; 1929 slotp->fbp = NULL; 1930 return (err); 1931 } 1932 1933 /* 1934 * Clear out entry prepared by dirprepareent. 1935 */ 1936 slotp->ep->d_ino = 0; 1937 slotp->ep->d_namlen = 0; 1938 1939 /* 1940 * Don't touch err so we don't clobber the real error that got us here. 1941 */ 1942 if (TRANS_DIR(tdp, slotp->offset)) 1943 fbrelse(slotp->fbp, S_OTHER); 1944 else 1945 (void) ufs_fbwrite(slotp->fbp, tdp); 1946 slotp->fbp = NULL; 1947 return (err); 1948 } 1949 1950 /* 1951 * Prepare a directory slot to receive an entry. 
1952 */ 1953 static int 1954 dirprepareentry( 1955 struct inode *dp, /* directory we are working in */ 1956 struct ufs_slot *slotp, /* available slot info */ 1957 struct cred *cr) 1958 { 1959 struct direct *ep, *nep; 1960 off_t entryend; 1961 int err; 1962 slotstat_t status = slotp->status; 1963 ushort_t dsize; 1964 1965 ASSERT((status == NONE) || (status == FOUND)); 1966 ASSERT(RW_WRITE_HELD(&dp->i_rwlock)); 1967 ASSERT(RW_WRITE_HELD(&dp->i_contents)); 1968 /* 1969 * If we didn't find a slot, then indicate that the 1970 * new slot belongs at the end of the directory. 1971 * If we found a slot, then the new entry can be 1972 * put at slotp->offset. 1973 */ 1974 entryend = slotp->offset + slotp->size; 1975 if (status == NONE) { 1976 ASSERT((slotp->offset & (DIRBLKSIZ - 1)) == 0); 1977 if (DIRBLKSIZ > dp->i_fs->fs_fsize) { 1978 err = ufs_fault(ITOV(dp), 1979 "dirprepareentry: bad fs_fsize, DIRBLKSIZ: %d" 1980 " > dp->i_fs->fs_fsize: %d (%s)", 1981 DIRBLKSIZ, dp->i_fs->fs_fsize, dp->i_fs->fs_fsmnt); 1982 return (err); 1983 } 1984 /* 1985 * Allocate the new block. 1986 */ 1987 err = BMAPALLOC(dp, (u_offset_t)slotp->offset, 1988 (int)(blkoff(dp->i_fs, slotp->offset) + DIRBLKSIZ), cr); 1989 if (err) { 1990 return (err); 1991 } 1992 dp->i_size = entryend; 1993 TRANS_INODE(dp->i_ufsvfs, dp); 1994 dp->i_flag |= IUPD|ICHG|IATTCHG; 1995 dp->i_seq++; 1996 ITIMES_NOLOCK(dp); 1997 } else if (entryend > dp->i_size) { 1998 /* 1999 * Adjust directory size, if needed. This should never 2000 * push the size past a new multiple of DIRBLKSIZ. 2001 * This is an artifact of the old (4.2BSD) way of initializing 2002 * directory sizes to be less than DIRBLKSIZ. 2003 */ 2004 dp->i_size = P2ROUNDUP_TYPED(entryend, DIRBLKSIZ, off_t); 2005 TRANS_INODE(dp->i_ufsvfs, dp); 2006 dp->i_flag |= IUPD|ICHG|IATTCHG; 2007 dp->i_seq++; 2008 ITIMES_NOLOCK(dp); 2009 } 2010 2011 /* 2012 * Get the block containing the space for the new directory entry. 
2013 */ 2014 if (slotp->fbp == NULL) { 2015 err = blkatoff(dp, slotp->offset, (char **)&slotp->ep, 2016 &slotp->fbp); 2017 if (err) { 2018 return (err); 2019 } 2020 } 2021 ep = slotp->ep; 2022 2023 switch (status) { 2024 case NONE: 2025 /* 2026 * No space in the directory. slotp->offset will be on a 2027 * directory block boundary and we will write the new entry 2028 * into a fresh block. 2029 */ 2030 ep->d_reclen = DIRBLKSIZ; 2031 slotp->size = 0; /* length of previous entry */ 2032 break; 2033 case FOUND: 2034 /* 2035 * An entry of the required size has been found. Use it. 2036 */ 2037 if (ep->d_ino == 0) { 2038 /* this is the 1st record in a block */ 2039 slotp->size = 0; /* length of previous entry */ 2040 } else { 2041 dsize = DIRSIZ(ep); 2042 nep = (struct direct *)((char *)ep + dsize); 2043 nep->d_reclen = ep->d_reclen - dsize; 2044 ep->d_reclen = dsize; 2045 slotp->ep = nep; 2046 slotp->offset += dsize; 2047 slotp->size = dsize; /* length of previous entry */ 2048 } 2049 break; 2050 default: 2051 break; 2052 } 2053 return (0); 2054 } 2055 2056 /* 2057 * Allocate and initialize a new inode that will go into directory tdp. 2058 * This routine is called from ufs_symlink(), as well as within this file. 2059 */ 2060 int 2061 ufs_dirmakeinode( 2062 struct inode *tdp, 2063 struct inode **ipp, 2064 struct vattr *vap, 2065 enum de_op op, 2066 struct cred *cr) 2067 { 2068 struct inode *ip; 2069 enum vtype type; 2070 int imode; /* mode and format as in inode */ 2071 ino_t ipref; 2072 int err; 2073 timestruc_t now; 2074 2075 ASSERT(vap != NULL); 2076 ASSERT(op == DE_CREATE || op == DE_MKDIR || op == DE_ATTRDIR || 2077 op == DE_SYMLINK); 2078 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 2079 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 2080 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 2081 /* 2082 * Allocate a new inode. 
2083 */ 2084 type = vap->va_type; 2085 if (type == VDIR) { 2086 ipref = dirpref(tdp); 2087 } else { 2088 ipref = tdp->i_number; 2089 } 2090 if (op == DE_ATTRDIR) 2091 imode = vap->va_mode; 2092 else 2093 imode = MAKEIMODE(type, vap->va_mode); 2094 *ipp = NULL; 2095 err = ufs_ialloc(tdp, ipref, imode, &ip, cr); 2096 if (err) 2097 return (err); 2098 2099 /* 2100 * We don't need to grab vfs_dqrwlock here because it is held 2101 * in ufs_direnter_*() above us. 2102 */ 2103 ASSERT(RW_READ_HELD(&ip->i_ufsvfs->vfs_dqrwlock)); 2104 rw_enter(&ip->i_contents, RW_WRITER); 2105 if (ip->i_dquot != NULL) { 2106 err = ufs_fault(ITOV(ip), 2107 "ufs_dirmakeinode, ip->i_dquot != NULL: dquot (%s)", 2108 tdp->i_fs->fs_fsmnt); 2109 rw_exit(&ip->i_contents); 2110 return (err); 2111 } 2112 *ipp = ip; 2113 ip->i_mode = (o_mode_t)imode; 2114 if (type == VBLK || type == VCHR) { 2115 dev_t d = vap->va_rdev; 2116 dev32_t dev32; 2117 2118 /* 2119 * Don't allow a special file to be created with a 2120 * dev_t that cannot be represented by this filesystem 2121 * format on disk. 
2122 */ 2123 if (!cmpldev(&dev32, d)) { 2124 err = EOVERFLOW; 2125 goto fail; 2126 } 2127 2128 ITOV(ip)->v_rdev = ip->i_rdev = d; 2129 2130 if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) { 2131 ip->i_ordev = dev32; /* can't use old format */ 2132 } else { 2133 ip->i_ordev = cmpdev(d); 2134 } 2135 } 2136 ITOV(ip)->v_type = type; 2137 ufs_reset_vnode(ip->i_vnode); 2138 if (type == VDIR) { 2139 ip->i_nlink = 2; /* anticipating a call to dirmakedirect */ 2140 } else { 2141 ip->i_nlink = 1; 2142 } 2143 2144 if (op == DE_ATTRDIR) { 2145 ip->i_uid = vap->va_uid; 2146 ip->i_gid = vap->va_gid; 2147 } else 2148 ip->i_uid = crgetuid(cr); 2149 /* 2150 * To determine the group-id of the created file: 2151 * 1) If the gid is set in the attribute list (non-Sun & pre-4.0 2152 * clients are not likely to set the gid), then use it if 2153 * the process is privileged, belongs to the target group, 2154 * or the group is the same as the parent directory. 2155 * 2) If the filesystem was not mounted with the Old-BSD-compatible 2156 * GRPID option, and the directory's set-gid bit is clear, 2157 * then use the process's gid. 2158 * 3) Otherwise, set the group-id to the gid of the parent directory. 2159 */ 2160 if (op != DE_ATTRDIR && (vap->va_mask & AT_GID) && 2161 ((vap->va_gid == tdp->i_gid) || groupmember(vap->va_gid, cr) || 2162 secpolicy_vnode_create_gid(cr) == 0)) { 2163 /* 2164 * XXX - is this only the case when a 4.0 NFS client, or a 2165 * client derived from that code, makes a call over the wire? 2166 */ 2167 ip->i_gid = vap->va_gid; 2168 } else 2169 ip->i_gid = (tdp->i_mode & ISGID) ? tdp->i_gid : crgetgid(cr); 2170 2171 /* 2172 * For SunOS 5.0->5.4, the lines below read: 2173 * 2174 * ip->i_suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid; 2175 * ip->i_sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid; 2176 * 2177 * where MAXUID was set to 60002. See notes on this in ufs_inode.c 2178 */ 2179 ip->i_suid = 2180 (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ? 
UID_LONG : ip->i_uid; 2181 ip->i_sgid = 2182 (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ? GID_LONG : ip->i_gid; 2183 2184 /* 2185 * If we're creating a directory, and the parent directory has the 2186 * set-GID bit set, set it on the new directory. 2187 * Otherwise, if the user is neither privileged nor a member of the 2188 * file's new group, clear the file's set-GID bit. 2189 */ 2190 if ((tdp->i_mode & ISGID) && (type == VDIR)) 2191 ip->i_mode |= ISGID; 2192 else { 2193 if ((ip->i_mode & ISGID) && 2194 secpolicy_vnode_setids_setgids(cr, ip->i_gid) != 0) 2195 ip->i_mode &= ~ISGID; 2196 } 2197 2198 if (((vap->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || 2199 ((vap->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { 2200 err = EOVERFLOW; 2201 goto fail; 2202 } 2203 2204 /* 2205 * Extended attribute directories are not subject to quotas. 2206 */ 2207 if (op != DE_ATTRDIR) 2208 ip->i_dquot = getinoquota(ip); 2209 else 2210 ip->i_dquot = NULL; 2211 2212 if (op == DE_MKDIR || op == DE_ATTRDIR) { 2213 err = ufs_dirmakedirect(ip, tdp, (op == DE_MKDIR) ? 0 : 1, cr); 2214 if (err) 2215 goto fail; 2216 } 2217 2218 /* 2219 * generate the shadow inode and attach it to the new object 2220 */ 2221 ASSERT((tdp->i_shadow && tdp->i_ufs_acl) || 2222 (!tdp->i_shadow && !tdp->i_ufs_acl)); 2223 if (tdp->i_shadow && tdp->i_ufs_acl && 2224 (((tdp->i_mode & IFMT) == IFDIR) || 2225 ((tdp->i_mode & IFMT) == IFATTRDIR))) { 2226 err = ufs_si_inherit(ip, tdp, ip->i_mode, cr); 2227 if (err) { 2228 if (op == DE_MKDIR) { 2229 /* 2230 * clean up parent directory 2231 * 2232 * tdp->i_contents already locked from 2233 * ufs_direnter_*() 2234 */ 2235 tdp->i_nlink--; 2236 TRANS_INODE(tdp->i_ufsvfs, tdp); 2237 tdp->i_flag |= ICHG; 2238 tdp->i_seq++; 2239 ufs_iupdat(tdp, I_SYNC); 2240 } 2241 goto fail; 2242 } 2243 } 2244 2245 /* 2246 * If the passed in attributes contain atime and/or mtime 2247 * settings, then use them instead of using the current 2248 * high resolution time. 
2249 */ 2250 if (vap->va_mask & (AT_MTIME|AT_ATIME)) { 2251 if (vap->va_mask & AT_ATIME) { 2252 ip->i_atime.tv_sec = vap->va_atime.tv_sec; 2253 ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000; 2254 ip->i_flag &= ~IACC; 2255 } else 2256 ip->i_flag |= IACC; 2257 if (vap->va_mask & AT_MTIME) { 2258 ip->i_mtime.tv_sec = vap->va_mtime.tv_sec; 2259 ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; 2260 gethrestime(&now); 2261 if (now.tv_sec > TIME32_MAX) { 2262 /* 2263 * In 2038, ctime sticks forever.. 2264 */ 2265 ip->i_ctime.tv_sec = TIME32_MAX; 2266 ip->i_ctime.tv_usec = 0; 2267 } else { 2268 ip->i_ctime.tv_sec = now.tv_sec; 2269 ip->i_ctime.tv_usec = now.tv_nsec / 1000; 2270 } 2271 ip->i_flag &= ~(IUPD|ICHG); 2272 ip->i_flag |= IMODTIME; 2273 } else 2274 ip->i_flag |= IUPD|ICHG; 2275 ip->i_flag |= IMOD; 2276 } else 2277 ip->i_flag |= IACC|IUPD|ICHG; 2278 ip->i_seq++; 2279 2280 /* 2281 * If this is an attribute tag it as one. 2282 */ 2283 if ((tdp->i_mode & IFMT) == IFATTRDIR) { 2284 ip->i_cflags |= IXATTR; 2285 } 2286 2287 /* 2288 * push inode before it's name appears in a directory 2289 */ 2290 TRANS_INODE(ip->i_ufsvfs, ip); 2291 ufs_iupdat(ip, I_SYNC); 2292 rw_exit(&ip->i_contents); 2293 return (0); 2294 2295 fail: 2296 /* Throw away inode we just allocated. */ 2297 ip->i_nlink = 0; 2298 ufs_setreclaim(ip); 2299 TRANS_INODE(ip->i_ufsvfs, ip); 2300 ip->i_flag |= ICHG; 2301 ip->i_seq++; 2302 ITIMES_NOLOCK(ip); 2303 rw_exit(&ip->i_contents); 2304 return (err); 2305 } 2306 2307 /* 2308 * Write a prototype directory into the empty inode ip, whose parent is dp. 
2309 */ 2310 static int 2311 ufs_dirmakedirect( 2312 struct inode *ip, /* new directory */ 2313 struct inode *dp, /* parent directory */ 2314 int attrdir, 2315 struct cred *cr) 2316 { 2317 struct dirtemplate *dirp; 2318 struct fbuf *fbp; 2319 int err; 2320 2321 ASSERT(RW_WRITE_HELD(&ip->i_contents)); 2322 ASSERT(RW_WRITE_HELD(&dp->i_rwlock)); 2323 ASSERT(RW_WRITE_HELD(&dp->i_contents)); 2324 /* 2325 * Allocate space for the directory we're creating. 2326 */ 2327 err = BMAPALLOC(ip, (u_offset_t)0, DIRBLKSIZ, cr); 2328 if (err) 2329 return (err); 2330 if (DIRBLKSIZ > dp->i_fs->fs_fsize) { 2331 err = ufs_fault(ITOV(dp), 2332 "ufs_dirmakedirect: bad fs_fsize, DIRBLKSIZ: %d > dp->i_fs->fs_fsize: %d (%s)", 2333 DIRBLKSIZ, dp->i_fs->fs_fsize, 2334 dp->i_fs->fs_fsmnt); 2335 return (err); 2336 } 2337 ip->i_size = DIRBLKSIZ; 2338 TRANS_INODE(ip->i_ufsvfs, ip); 2339 ip->i_flag |= IUPD|ICHG|IATTCHG; 2340 ip->i_seq++; 2341 ITIMES_NOLOCK(ip); 2342 /* 2343 * Update the tdp link count and write out the change. 2344 * This reflects the ".." entry we'll soon write. 2345 */ 2346 if (dp->i_nlink == MAXLINK) 2347 return (EMLINK); 2348 if (attrdir == 0) 2349 dp->i_nlink++; 2350 TRANS_INODE(dp->i_ufsvfs, dp); 2351 dp->i_flag |= ICHG; 2352 dp->i_seq++; 2353 ufs_iupdat(dp, I_SYNC); 2354 /* 2355 * Initialize directory with "." 2356 * and ".." from static template. 2357 * 2358 * Since the parent directory is locked, we don't have to 2359 * worry about anything changing when we drop the write 2360 * lock on (ip). 2361 * 2362 */ 2363 err = fbread(ITOV(ip), (offset_t)0, (uint_t)ip->i_fs->fs_fsize, 2364 S_READ, &fbp); 2365 2366 if (err) { 2367 goto fail; 2368 } 2369 dirp = (struct dirtemplate *)fbp->fb_addr; 2370 /* 2371 * Now initialize the directory we're creating 2372 * with the "." and ".." entries. 
2373 */ 2374 *dirp = mastertemplate; /* structure assignment */ 2375 dirp->dot_ino = (uint32_t)ip->i_number; 2376 dirp->dotdot_ino = (uint32_t)dp->i_number; 2377 2378 err = TRANS_DIR(ip, 0); 2379 if (err) { 2380 fbrelse(fbp, S_OTHER); 2381 goto fail; 2382 } 2383 2384 err = ufs_fbwrite(fbp, ip); 2385 if (err) { 2386 goto fail; 2387 } 2388 2389 return (0); 2390 2391 fail: 2392 if (attrdir == 0) 2393 dp->i_nlink--; 2394 TRANS_INODE(dp->i_ufsvfs, dp); 2395 dp->i_flag |= ICHG; 2396 dp->i_seq++; 2397 ufs_iupdat(dp, I_SYNC); 2398 return (err); 2399 } 2400 2401 /* 2402 * Delete a directory entry. If oip is nonzero the entry is checked 2403 * to make sure it still reflects oip. 2404 */ 2405 int 2406 ufs_dirremove( 2407 struct inode *dp, 2408 char *namep, 2409 struct inode *oip, 2410 struct vnode *cdir, 2411 enum dr_op op, 2412 struct cred *cr) 2413 { 2414 struct direct *ep, *pep, *nep; 2415 struct inode *ip; 2416 vnode_t *dvp, *vp; 2417 struct ufs_slot slot; 2418 int namlen; 2419 int err; 2420 int mode; 2421 ushort_t extra; 2422 2423 namlen = (int)strlen(namep); 2424 if (namlen == 0) { 2425 struct fs *fs = dp->i_fs; 2426 2427 cmn_err(CE_WARN, "%s: ufs_dirremove: attempted to remove" 2428 " nameless file in directory (directory inode %llu)", 2429 fs->fs_fsmnt, (u_longlong_t)dp->i_number); 2430 ASSERT(namlen != 0); 2431 2432 return (ENOENT); 2433 } 2434 2435 /* 2436 * return error when removing . and .. 2437 */ 2438 if (namep[0] == '.') { 2439 if (namlen == 1) 2440 return (EINVAL); 2441 else if (namlen == 2 && namep[1] == '.') { 2442 return (EEXIST); /* SIGH should be ENOTEMPTY */ 2443 } 2444 } 2445 2446 ASSERT(RW_WRITE_HELD(&dp->i_rwlock)); 2447 2448 retry: 2449 /* 2450 * Check accessibility of directory. 
2451 */ 2452 if (err = ufs_diraccess(dp, IEXEC|IWRITE, cr)) 2453 return (err); 2454 2455 ip = NULL; 2456 slot.fbp = NULL; 2457 slot.status = FOUND; /* don't need to look for empty slot */ 2458 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER); 2459 rw_enter(&dp->i_contents, RW_WRITER); 2460 2461 err = ufs_dircheckforname(dp, namep, namlen, &slot, &ip, cr, 0); 2462 if (err) 2463 goto out_novfs; 2464 if (ip == NULL) { 2465 err = ENOENT; 2466 goto out_novfs; 2467 } 2468 vp = ITOV(ip); 2469 if (oip && oip != ip) { 2470 err = ENOENT; 2471 goto out_novfs; 2472 } 2473 2474 mode = ip->i_mode & IFMT; 2475 if (mode == IFDIR || mode == IFATTRDIR) { 2476 2477 /* 2478 * vn_vfsrlock() prevents races between mount and rmdir. 2479 */ 2480 if (vn_vfsrlock(vp)) { 2481 err = EBUSY; 2482 goto out_novfs; 2483 } 2484 if (vn_mountedvfs(vp) != NULL && op != DR_RENAME) { 2485 err = EBUSY; 2486 goto out; 2487 } 2488 /* 2489 * If we are removing a directory, get a lock on it. 2490 * Taking a writer lock prevents a parallel ufs_dirlook from 2491 * incorrectly entering a negative cache vnode entry in the dnlc 2492 * If the directory is empty, it will stay empty until 2493 * we can remove it. 2494 */ 2495 if (!rw_tryenter(&ip->i_rwlock, RW_WRITER)) { 2496 /* 2497 * It is possible that a thread in rename would have 2498 * acquired this rwlock. To prevent a deadlock we 2499 * do a rw_tryenter. If we fail to get the lock 2500 * we drop all the locks we have acquired, wait 2501 * for 2 ticks and reacquire the 2502 * directory's (dp) i_rwlock and try again. 2503 * If we dont drop dp's i_rwlock then we will panic 2504 * with a "Deadlock: cycle in blocking chain" 2505 * since in ufs_dircheckpath we want dp's i_rwlock. 2506 * dp is guaranteed to exist since ufs_dirremove is 2507 * called after a VN_HOLD(dp) has been done. 
2508 */ 2509 ufs_dirremove_retry_cnt++; 2510 vn_vfsunlock(vp); 2511 if (slot.fbp) 2512 fbrelse(slot.fbp, S_OTHER); 2513 rw_exit(&dp->i_contents); 2514 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 2515 rw_exit(&dp->i_rwlock); 2516 VN_RELE(vp); 2517 delay(2); 2518 rw_enter(&dp->i_rwlock, RW_WRITER); 2519 goto retry; 2520 } 2521 } 2522 rw_enter(&ip->i_contents, RW_READER); 2523 2524 /* 2525 * Now check the restrictions that apply on sticky directories. 2526 */ 2527 if ((err = ufs_sticky_remove_access(dp, ip, cr)) != 0) { 2528 rw_exit(&ip->i_contents); 2529 if (mode == IFDIR || mode == IFATTRDIR) 2530 rw_exit(&ip->i_rwlock); 2531 goto out; 2532 } 2533 2534 if (op == DR_RMDIR) { 2535 /* 2536 * For rmdir(2), some special checks are required. 2537 * (a) Don't remove any alias of the parent (e.g. "."). 2538 * (b) Don't remove the current directory. 2539 * (c) Make sure the entry is (still) a directory. 2540 * (d) Make sure the directory is empty. 2541 */ 2542 2543 if (dp == ip || vp == cdir) 2544 err = EINVAL; 2545 else if (((ip->i_mode & IFMT) != IFDIR) && 2546 ((ip->i_mode & IFMT) != IFATTRDIR)) 2547 err = ENOTDIR; 2548 else if ((ip->i_nlink > 2) || 2549 !ufs_dirempty(ip, dp->i_number, cr)) { 2550 err = EEXIST; /* SIGH should be ENOTEMPTY */ 2551 } 2552 2553 if (err) { 2554 rw_exit(&ip->i_contents); 2555 if (mode == IFDIR || mode == IFATTRDIR) 2556 rw_exit(&ip->i_rwlock); 2557 goto out; 2558 } 2559 } else if (op == DR_REMOVE) { 2560 /* 2561 * unlink(2) requires a different check: allow only 2562 * privileged users to unlink a directory. 2563 */ 2564 if (vp->v_type == VDIR && 2565 secpolicy_fs_linkdir(cr, vp->v_vfsp)) { 2566 err = EPERM; 2567 rw_exit(&ip->i_contents); 2568 rw_exit(&ip->i_rwlock); 2569 goto out; 2570 } 2571 } 2572 2573 rw_exit(&ip->i_contents); 2574 2575 /* 2576 * Remove the cache'd entry, if any. 
2577 */ 2578 dvp = ITOV(dp); 2579 dnlc_remove(dvp, namep); 2580 ep = slot.ep; 2581 ep->d_ino = 0; 2582 2583 if (slot.cached) { 2584 dcanchor_t *dcap = &dp->i_danchor; 2585 2586 (void) dnlc_dir_rem_entry(dcap, namep, NULL); 2587 if (((int)ep->d_reclen - (int)DIRSIZ(ep)) >= LDIRSIZ(1)) { 2588 (void) dnlc_dir_rem_space_by_handle(dcap, slot.offset); 2589 } 2590 if (slot.offset & (DIRBLKSIZ - 1)) { 2591 /* 2592 * Collapse new free space into previous entry. 2593 * Note, the previous entry has already been 2594 * validated in ufs_dircheckforname(). 2595 */ 2596 ASSERT(slot.size); 2597 pep = (struct direct *)((char *)ep - slot.size); 2598 if ((pep->d_ino == 0) && 2599 ((uintptr_t)pep & (DIRBLKSIZ - 1))) { 2600 dnlc_dir_purge(dcap); 2601 slot.cached = 0; 2602 goto nocache; 2603 } 2604 if (pep->d_ino) { 2605 extra = pep->d_reclen - DIRSIZ(pep); 2606 } else { 2607 extra = pep->d_reclen; 2608 } 2609 if (extra >= LDIRSIZ(1)) { 2610 (void) dnlc_dir_rem_space_by_handle(dcap, 2611 (uint64_t)(slot.offset - slot.size)); 2612 } 2613 pep->d_reclen += ep->d_reclen; 2614 (void) dnlc_dir_add_space(dcap, extra + ep->d_reclen, 2615 (uint64_t)(slot.offset - slot.size)); 2616 /* adjust the previous pointer in the next entry */ 2617 nep = (struct direct *)((char *)ep + ep->d_reclen); 2618 if ((uintptr_t)nep & (DIRBLKSIZ - 1)) { 2619 /* 2620 * Not a new block. 2621 * 2622 * Check the validity of the entry. 2623 * If it's bad, then throw away the cache and 2624 * continue. 
2625 */ 2626 if ((nep->d_reclen == 0) || 2627 (nep->d_reclen & 0x3) || 2628 (dnlc_dir_update(dcap, nep->d_name, 2629 INO_OFF_TO_H(nep->d_ino, 2630 slot.offset - slot.size)) == DNOENT)) { 2631 dnlc_dir_purge(dcap); 2632 slot.cached = 0; 2633 } 2634 } 2635 } else { 2636 (void) dnlc_dir_add_space(dcap, ep->d_reclen, 2637 (uint64_t)slot.offset); 2638 } 2639 } else { 2640 /* 2641 * If the entry isn't the first in the directory, we must 2642 * reclaim the space of the now empty record by adding 2643 * the record size to the size of the previous entry. 2644 */ 2645 if (slot.offset & (DIRBLKSIZ - 1)) { 2646 /* 2647 * Collapse new free space into previous entry. 2648 */ 2649 pep = (struct direct *)((char *)ep - slot.size); 2650 pep->d_reclen += ep->d_reclen; 2651 } 2652 } 2653 nocache: 2654 2655 2656 err = TRANS_DIR(dp, slot.offset); 2657 if (err) 2658 fbrelse(slot.fbp, S_OTHER); 2659 else 2660 err = ufs_fbwrite(slot.fbp, dp); 2661 slot.fbp = NULL; 2662 2663 /* 2664 * If we were removing a directory, it is 'gone' now, but we cannot 2665 * unlock it as a thread may be waiting for the lock in ufs_create. If 2666 * we did, it could then create a file in a deleted directory. 2667 */ 2668 2669 if (err) { 2670 if (mode == IFDIR || mode == IFATTRDIR) 2671 rw_exit(&ip->i_rwlock); 2672 goto out; 2673 } 2674 2675 rw_enter(&ip->i_contents, RW_WRITER); 2676 2677 dp->i_flag |= IUPD|ICHG; 2678 dp->i_seq++; 2679 ip->i_flag |= ICHG; 2680 ip->i_seq++; 2681 2682 TRANS_INODE(dp->i_ufsvfs, dp); 2683 TRANS_INODE(ip->i_ufsvfs, ip); 2684 /* 2685 * Now dispose of the inode. 2686 */ 2687 if (ip->i_nlink > 0) { 2688 /* 2689 * This is not done for IFATTRDIR's because they don't 2690 * have entries in the dnlc and the link counts are 2691 * not incremented when they are created. 2692 */ 2693 if (op == DR_RMDIR && (ip->i_mode & IFMT) == IFDIR) { 2694 /* 2695 * Decrement by 2 because we're trashing the "." 2696 * entry as well as removing the entry in dp. 
2697 * Clear the directory entry, but there may be 2698 * other hard links so don't free the inode. 2699 * Decrement the dp linkcount because we're 2700 * trashing the ".." entry. 2701 */ 2702 ip->i_nlink -= 2; 2703 dp->i_nlink--; 2704 ufs_setreclaim(dp); 2705 /* 2706 * XXX need to discard negative cache entries 2707 * for vp. See comment in ufs_delete(). 2708 */ 2709 dnlc_remove(vp, "."); 2710 dnlc_remove(vp, ".."); 2711 /* 2712 * The return value is ignored here bacause if 2713 * the directory purge fails we don't want to 2714 * stop the delete. If ufs_dirpurgedotdot fails 2715 * the delete will continue with the preexiting 2716 * behavior. 2717 */ 2718 (void) ufs_dirpurgedotdot(ip, dp->i_number, cr); 2719 } else { 2720 ip->i_nlink--; 2721 } 2722 ufs_setreclaim(ip); 2723 } 2724 ITIMES_NOLOCK(dp); 2725 ITIMES_NOLOCK(ip); 2726 2727 if (!TRANS_ISTRANS(dp->i_ufsvfs)) 2728 ufs_iupdat(dp, I_SYNC); 2729 if (!TRANS_ISTRANS(ip->i_ufsvfs)) 2730 ufs_iupdat(ip, I_SYNC); 2731 2732 rw_exit(&ip->i_contents); 2733 if (mode == IFDIR || mode == IFATTRDIR) 2734 rw_exit(&ip->i_rwlock); 2735 out: 2736 if (mode == IFDIR || mode == IFATTRDIR) { 2737 vn_vfsunlock(vp); 2738 } 2739 out_novfs: 2740 ASSERT(RW_WRITE_HELD(&dp->i_contents)); 2741 2742 if (slot.fbp) 2743 fbrelse(slot.fbp, S_OTHER); 2744 2745 rw_exit(&dp->i_contents); 2746 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 2747 2748 /* 2749 * Release (and delete) the inode after we drop vfs_dqrwlock to 2750 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader. 2751 */ 2752 if (ip) 2753 VN_RELE(vp); 2754 2755 return (err); 2756 } 2757 2758 /* 2759 * Return buffer with contents of block "offset" 2760 * from the beginning of directory "ip". If "res" 2761 * is non-zero, fill it in with a pointer to the 2762 * remaining space in the directory. 
2763 * 2764 */ 2765 2766 int 2767 blkatoff( 2768 struct inode *ip, 2769 off_t offset, 2770 char **res, 2771 struct fbuf **fbpp) 2772 { 2773 struct fs *fs; 2774 struct fbuf *fbp; 2775 daddr_t lbn; 2776 uint_t bsize; 2777 int err; 2778 2779 CPU_STATS_ADD_K(sys, ufsdirblk, 1); 2780 fs = ip->i_fs; 2781 lbn = (daddr_t)lblkno(fs, offset); 2782 bsize = (uint_t)blksize(fs, ip, lbn); 2783 err = fbread(ITOV(ip), (offset_t)(offset & fs->fs_bmask), 2784 bsize, S_READ, &fbp); 2785 if (err) { 2786 *fbpp = (struct fbuf *)NULL; 2787 return (err); 2788 } 2789 if (res) 2790 *res = fbp->fb_addr + blkoff(fs, offset); 2791 *fbpp = fbp; 2792 return (0); 2793 } 2794 2795 /* 2796 * Do consistency checking: 2797 * record length must be multiple of 4 2798 * entry must fit in rest of its DIRBLKSIZ block 2799 * record must be large enough to contain entry 2800 * name is not longer than MAXNAMLEN 2801 * name must be as long as advertised, and null terminated 2802 * NOTE: record length must not be zero (should be checked previously). 2803 * This routine is only called if dirchk is true. 2804 * It would be nice to set the FSBAD flag in the super-block when 2805 * this routine fails so that a fsck is forced on next reboot, 2806 * but locking is a problem. 
2807 */ 2808 static int 2809 dirmangled( 2810 struct inode *dp, 2811 struct direct *ep, 2812 int entryoffsetinblock, 2813 off_t offset) 2814 { 2815 int i; 2816 2817 i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); 2818 if ((ep->d_reclen & 0x3) != 0 || (int)ep->d_reclen > i || 2819 (uint_t)ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN || 2820 ep->d_ino && dirbadname(ep->d_name, (int)ep->d_namlen)) { 2821 dirbad(dp, "mangled entry", offset); 2822 return (1); 2823 } 2824 return (0); 2825 } 2826 2827 static void 2828 dirbad(struct inode *ip, char *how, off_t offset) 2829 { 2830 cmn_err(CE_NOTE, "%s: bad dir ino %d at offset %ld: %s", 2831 ip->i_fs->fs_fsmnt, (int)ip->i_number, offset, how); 2832 } 2833 2834 static int 2835 dirbadname(char *sp, int l) 2836 { 2837 while (l--) { /* check for nulls */ 2838 if (*sp++ == '\0') { 2839 return (1); 2840 } 2841 } 2842 return (*sp); /* check for terminating null */ 2843 } 2844 2845 /* 2846 * Check if a directory is empty or not. 2847 */ 2848 static int 2849 ufs_dirempty( 2850 struct inode *ip, 2851 ino_t parentino, 2852 struct cred *cr) 2853 { 2854 return (ufs_dirscan(ip, parentino, cr, 0)); 2855 } 2856 2857 /* 2858 * clear the .. directory entry. 2859 */ 2860 static int 2861 ufs_dirpurgedotdot( 2862 struct inode *ip, 2863 ino_t parentino, 2864 struct cred *cr) 2865 { 2866 return (ufs_dirscan(ip, parentino, cr, 1)); 2867 } 2868 2869 /* 2870 * Scan the directoy. If clr_dotdot is true clear the .. 2871 * directory else check to see if the directory is empty. 2872 * 2873 * Using a struct dirtemplate here is not precisely 2874 * what we want, but better than using a struct direct. 2875 * 2876 * clr_dotdot is used as a flag to tell us if we need 2877 * to clear the dotdot entry 2878 * 2879 * N.B.: does not handle corrupted directories. 
2880 */ 2881 static int 2882 ufs_dirscan( 2883 struct inode *ip, 2884 ino_t parentino, 2885 struct cred *cr, 2886 int clr_dotdot) 2887 { 2888 offset_t off; 2889 struct dirtemplate dbuf; 2890 struct direct *dp = (struct direct *)&dbuf; 2891 int err, count; 2892 int empty = 1; /* Assume it's empty */ 2893 #define MINDIRSIZ (sizeof (struct dirtemplate) / 2) 2894 2895 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 2896 2897 ASSERT(ip->i_size <= (offset_t)MAXOFF_T); 2898 for (off = 0; off < ip->i_size; off += dp->d_reclen) { 2899 err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp, 2900 (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr); 2901 /* 2902 * Since we read MINDIRSIZ, residual must 2903 * be 0 unless we're at end of file. 2904 */ 2905 if (err || count != 0 || dp->d_reclen == 0) { 2906 empty = 0; 2907 break; 2908 } 2909 /* skip empty entries */ 2910 if (dp->d_ino == 0) 2911 continue; 2912 /* accept only "." and ".." */ 2913 if (dp->d_namlen > 2 || dp->d_name[0] != '.') { 2914 empty = 0; 2915 break; 2916 } 2917 /* 2918 * At this point d_namlen must be 1 or 2. 2919 * 1 implies ".", 2 implies ".." if second 2920 * char is also "." 2921 */ 2922 if (dp->d_namlen == 1) 2923 continue; 2924 if (dp->d_name[1] == '.' && 2925 (ino_t)dp->d_ino == parentino) { 2926 /* 2927 * If we're doing a purge we need to check for 2928 * the . and .. entries and clear the d_ino for .. 2929 * 2930 * if clr_dotdot is set ufs_dirscan does not 2931 * check for an empty directory. 2932 */ 2933 if (clr_dotdot) { 2934 /* 2935 * Have to actually zap the .. 2936 * entry in the directory, as 2937 * otherwise someone might have 2938 * dp as its cwd and try to 2939 * open .., which now points to 2940 * an unallocated inode. 
2941 */ 2942 empty = ufs_dirclrdotdot(ip, parentino); 2943 break; 2944 } else { 2945 continue; 2946 } 2947 } 2948 empty = 0; 2949 break; 2950 } 2951 return (empty); 2952 } 2953 2954 clock_t retry_backoff_delay = 1; /* delay before retrying the i_rwlock */ 2955 uint64_t dircheck_retry_cnt; 2956 /* 2957 * Check if source directory inode is in the path of the target directory. 2958 * Target is supplied locked. 2959 * 2960 * The source and target inode's should be different upon entry. 2961 */ 2962 int 2963 ufs_dircheckpath( 2964 ino_t source_ino, 2965 struct inode *target, 2966 struct inode *sdp, 2967 struct cred *cr) 2968 { 2969 struct fbuf *fbp; 2970 struct dirtemplate *dirp; 2971 struct inode *ip; 2972 struct ufsvfs *ufsvfsp; 2973 struct inode *tip; 2974 ino_t dotdotino; 2975 int err; 2976 2977 ASSERT(target->i_ufsvfs != NULL); 2978 ASSERT(RW_LOCK_HELD(&target->i_rwlock)); 2979 ASSERT(RW_LOCK_HELD(&sdp->i_rwlock)); 2980 2981 ip = target; 2982 if (ip->i_number == source_ino) { 2983 err = EINVAL; 2984 goto out; 2985 } 2986 if (ip->i_number == UFSROOTINO) { 2987 err = 0; 2988 goto out; 2989 } 2990 /* 2991 * Search back through the directory tree, using the ".." entries. 2992 * Fail any attempt to move a directory into an ancestor directory. 2993 */ 2994 fbp = NULL; 2995 for (;;) { 2996 struct vfs *vfs; 2997 2998 err = blkatoff(ip, (off_t)0, (char **)&dirp, &fbp); 2999 if (err) 3000 break; 3001 if (((ip->i_mode & IFMT) != IFDIR) || ip->i_nlink == 0 || 3002 ip->i_size < sizeof (struct dirtemplate)) { 3003 dirbad(ip, "bad size, unlinked or not dir", (off_t)0); 3004 err = ENOTDIR; 3005 break; 3006 } 3007 if (dirp->dotdot_namlen != 2 || 3008 dirp->dotdot_name[0] != '.' || 3009 dirp->dotdot_name[1] != '.') { 3010 dirbad(ip, "mangled .. 
entry", (off_t)0); 3011 err = ENOTDIR; /* Sanity check */ 3012 break; 3013 } 3014 dotdotino = (ino_t)dirp->dotdot_ino; 3015 if (dotdotino == source_ino) { 3016 err = EINVAL; 3017 break; 3018 } 3019 if (dotdotino == UFSROOTINO) 3020 break; 3021 if (fbp) { 3022 fbrelse(fbp, S_OTHER); 3023 fbp = NULL; 3024 } 3025 vfs = ip->i_vfs; 3026 ufsvfsp = ip->i_ufsvfs; 3027 3028 if (ip != target) { 3029 rw_exit(&ip->i_rwlock); 3030 VN_RELE(ITOV(ip)); 3031 } 3032 /* 3033 * Race to get the inode. 3034 */ 3035 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 3036 if (err = ufs_iget_alloced(vfs, dotdotino, &tip, cr)) { 3037 rw_exit(&ufsvfsp->vfs_dqrwlock); 3038 ip = NULL; 3039 break; 3040 } 3041 rw_exit(&ufsvfsp->vfs_dqrwlock); 3042 /* 3043 * If the directory of the source inode (also a directory) 3044 * is the same as this next entry up the chain, then 3045 * we know the source directory itself can't be in the 3046 * chain. This also prevents a panic because we already 3047 * have sdp->i_rwlock locked. 3048 */ 3049 if (tip == sdp) { 3050 VN_RELE(ITOV(tip)); 3051 ip = NULL; 3052 break; 3053 } 3054 ip = tip; 3055 3056 /* 3057 * If someone has set the WRITE_WANTED bit in this lock and if 3058 * this happens to be a sdp or tdp of another parallel rename 3059 * which is executing the same code and in similar situation 3060 * we end up in a 4 way deadlock. We need to make sure that 3061 * the WRITE_WANTED bit is not set. 3062 */ 3063 retry_lock: 3064 if (!rw_tryenter(&ip->i_rwlock, RW_READER)) { 3065 /* 3066 * If the lock held as WRITER thats fine but if it 3067 * has WRITE_WANTED bit set we might end up in a 3068 * deadlock. If WRITE_WANTED is set we return 3069 * with EAGAIN else we just go back and try. 3070 */ 3071 if (RW_ISWRITER(&ip->i_rwlock) && 3072 !(RW_WRITE_HELD(&ip->i_rwlock))) { 3073 err = EAGAIN; 3074 if (fbp) { 3075 fbrelse(fbp, S_OTHER); 3076 } 3077 VN_RELE(ITOV(ip)); 3078 return (err); 3079 } else { 3080 /* 3081 * The lock is being write held. 
We could 3082 * just do a rw_enter here but there is a 3083 * window between the check and now, where 3084 * the status could have changed, so to 3085 * avoid looping we backoff and go back to 3086 * try for the lock. 3087 */ 3088 delay(retry_backoff_delay); 3089 dircheck_retry_cnt++; 3090 goto retry_lock; 3091 } 3092 } 3093 } 3094 if (fbp) { 3095 fbrelse(fbp, S_OTHER); 3096 } 3097 out: 3098 if (ip) { 3099 if (ip != target) { 3100 rw_exit(&ip->i_rwlock); 3101 VN_RELE(ITOV(ip)); 3102 } 3103 } 3104 return (err); 3105 } 3106 3107 int 3108 ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr) 3109 { 3110 offset_t off; 3111 struct dirtemplate dbuf; 3112 struct direct *dp = (struct direct *)&dbuf; 3113 int err, count; 3114 int empty = 1; /* Assume it's empty */ 3115 #define MINDIRSIZ (sizeof (struct dirtemplate) / 2) 3116 3117 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 3118 3119 ASSERT(ip->i_size <= (offset_t)MAXOFF_T); 3120 for (off = 0; off < ip->i_size; off += dp->d_reclen) { 3121 err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp, 3122 (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr); 3123 /* 3124 * Since we read MINDIRSIZ, residual must 3125 * be 0 unless we're at end of file. 3126 */ 3127 3128 if (err || count != 0 || dp->d_reclen == 0) { 3129 empty = 0; 3130 break; 3131 } 3132 /* skip empty entries */ 3133 if (dp->d_ino == 0) 3134 continue; 3135 /* 3136 * At this point d_namlen must be 1 or 2. 3137 * 1 implies ".", 2 implies ".." if second 3138 * char is also "." 3139 */ 3140 3141 if (dp->d_namlen == 1 && dp->d_name[0] == '.' && 3142 (ino_t)dp->d_ino == parentino) 3143 continue; 3144 3145 if (dp->d_namlen == 2 && dp->d_name[0] == '.' && 3146 dp->d_name[1] == '.') { 3147 continue; 3148 } 3149 empty = 0; 3150 break; 3151 } 3152 return (empty); 3153 } 3154 3155 3156 /* 3157 * Allocate and initialize a new shadow inode to contain extended attributes. 
3158 */ 3159 int 3160 ufs_xattrmkdir( 3161 struct inode *tdp, 3162 struct inode **ipp, 3163 int flags, 3164 struct cred *cr) 3165 { 3166 struct inode *ip; 3167 struct vattr va; 3168 int err; 3169 int retry = 1; 3170 struct ufsvfs *ufsvfsp; 3171 struct ulockfs *ulp; 3172 int issync; 3173 int trans_size; 3174 int dorwlock; /* 0 = not yet taken, */ 3175 /* 1 = taken outside the transaction, */ 3176 /* 2 = taken inside the transaction */ 3177 3178 /* 3179 * Validate permission to create attribute directory 3180 */ 3181 3182 if ((err = ufs_iaccess(tdp, IWRITE, cr, 1)) != 0) { 3183 return (err); 3184 } 3185 3186 if (vn_is_readonly(ITOV(tdp))) 3187 return (EROFS); 3188 3189 /* 3190 * No need to re-init err after again:, since it's set before 3191 * the next use of it. 3192 */ 3193 again: 3194 dorwlock = 0; 3195 va.va_type = VDIR; 3196 va.va_uid = tdp->i_uid; 3197 va.va_gid = tdp->i_gid; 3198 3199 if ((tdp->i_mode & IFMT) == IFDIR) { 3200 va.va_mode = (o_mode_t)IFATTRDIR; 3201 va.va_mode |= tdp->i_mode & 0777; 3202 } else { 3203 va.va_mode = (o_mode_t)IFATTRDIR|0700; 3204 if (tdp->i_mode & 0040) 3205 va.va_mode |= 0750; 3206 if (tdp->i_mode & 0004) 3207 va.va_mode |= 0705; 3208 } 3209 va.va_mask = AT_TYPE|AT_MODE; 3210 3211 ufsvfsp = tdp->i_ufsvfs; 3212 3213 err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK); 3214 if (err) 3215 return (err); 3216 3217 /* 3218 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file. 3219 * This follows the protocol for read()/write(). 3220 */ 3221 if (ITOV(tdp)->v_type != VDIR) { 3222 rw_enter(&tdp->i_rwlock, RW_WRITER); 3223 dorwlock = 1; 3224 } 3225 3226 if (ulp) { 3227 trans_size = (int)TOP_MKDIR_SIZE(tdp); 3228 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, trans_size); 3229 } 3230 3231 /* 3232 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory. 3233 * This follows the protocol established by 3234 * ufs_link/create/remove/rename/mkdir/rmdir/symlink. 
3235 */ 3236 if (dorwlock == 0) { 3237 rw_enter(&tdp->i_rwlock, RW_WRITER); 3238 dorwlock = 2; 3239 } 3240 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 3241 rw_enter(&tdp->i_contents, RW_WRITER); 3242 3243 /* 3244 * Suppress out of inodes messages if we will retry. 3245 */ 3246 if (retry) 3247 tdp->i_flag |= IQUIET; 3248 err = ufs_dirmakeinode(tdp, &ip, &va, DE_ATTRDIR, cr); 3249 tdp->i_flag &= ~IQUIET; 3250 3251 if (err) 3252 goto fail; 3253 3254 if (flags) { 3255 3256 /* 3257 * Now attach it to src file. 3258 */ 3259 3260 tdp->i_oeftflag = ip->i_number; 3261 } 3262 3263 ip->i_cflags |= IXATTR; 3264 ITOV(ip)->v_flag |= V_XATTRDIR; 3265 TRANS_INODE(ufsvfsp, tdp); 3266 tdp->i_flag |= ICHG | IUPD; 3267 tdp->i_seq++; 3268 ufs_iupdat(tdp, I_SYNC); 3269 rw_exit(&tdp->i_contents); 3270 rw_exit(&ufsvfsp->vfs_dqrwlock); 3271 3272 rw_enter(&ip->i_rwlock, RW_WRITER); 3273 rw_enter(&ip->i_contents, RW_WRITER); 3274 TRANS_INODE(ufsvfsp, ip); 3275 ip->i_flag |= ICHG| IUPD; 3276 ip->i_seq++; 3277 ufs_iupdat(ip, I_SYNC); 3278 rw_exit(&ip->i_contents); 3279 rw_exit(&ip->i_rwlock); 3280 if (dorwlock == 2) 3281 rw_exit(&tdp->i_rwlock); 3282 if (ulp) { 3283 int terr = 0; 3284 3285 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size); 3286 ufs_lockfs_end(ulp); 3287 if (err == 0) 3288 err = terr; 3289 } 3290 if (dorwlock == 1) 3291 rw_exit(&tdp->i_rwlock); 3292 *ipp = ip; 3293 return (err); 3294 3295 fail: 3296 rw_exit(&tdp->i_contents); 3297 rw_exit(&ufsvfsp->vfs_dqrwlock); 3298 if (dorwlock == 2) 3299 rw_exit(&tdp->i_rwlock); 3300 if (ulp) { 3301 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size); 3302 ufs_lockfs_end(ulp); 3303 } 3304 if (dorwlock == 1) 3305 rw_exit(&tdp->i_rwlock); 3306 if (ip != NULL) 3307 VN_RELE(ITOV(ip)); 3308 3309 /* 3310 * No inodes? See if any are tied up in pending deletions. 3311 * This has to be done outside of any of the above, because 3312 * the draining operation can't be done from inside a transaction. 
3313 */ 3314 if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 3315 ufs_delete_drain_wait(ufsvfsp, 1); 3316 retry = 0; 3317 goto again; 3318 } 3319 3320 return (err); 3321 } 3322 3323 /* 3324 * clear the dotdot directory entry. 3325 * Used by ufs_dirscan when clr_dotdot 3326 * flag is set and we're deleting a 3327 * directory. 3328 */ 3329 static int 3330 ufs_dirclrdotdot(struct inode *ip, ino_t parentino) 3331 { 3332 struct fbuf *fbp; 3333 struct direct *dotp, *dotdotp; 3334 int err = 0; 3335 3336 ASSERT(RW_WRITE_HELD(&ip->i_rwlock)); 3337 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 3338 err = blkatoff(ip, 0, NULL, &fbp); 3339 if (err) { 3340 return (err); 3341 } 3342 3343 dotp = (struct direct *)fbp->fb_addr; 3344 if ((dotp->d_namlen < (MAXNAMLEN + 1)) && 3345 ((DIRBLKSIZ - DIRSIZ(dotp)) >= (sizeof (struct dirtemplate) / 2))) { 3346 dotdotp = (struct direct *)((char *)dotp + dotp->d_reclen); 3347 if ((dotdotp->d_namlen < (MAXNAMLEN + 1)) && 3348 ((DIRBLKSIZ - DIRSIZ(dotp)) >= dotdotp->d_reclen)) { 3349 3350 dotp->d_reclen += dotdotp->d_reclen; 3351 if (parentino == dotdotp->d_ino) { 3352 dotdotp->d_ino = 0; 3353 dotdotp->d_namlen = 0; 3354 dotdotp->d_reclen = 0; 3355 } 3356 3357 err = TRANS_DIR(ip, 0); 3358 if (err) { 3359 fbrelse(fbp, S_OTHER); 3360 } else { 3361 err = ufs_fbwrite(fbp, ip); 3362 } 3363 } 3364 } else { 3365 err = -1; 3366 } 3367 return (err); 3368 } 3369